From c2eacb7da1d2d4129b19be89a2c07e91dbff3964 Mon Sep 17 00:00:00 2001
From: Michael Allman <michael@videoamp.com>
Date: Wed, 10 Aug 2016 12:07:34 -0700
Subject: [PATCH 01/99] [SPARK-16980][SQL] Load only catalog table partition
 metadata required to answer a query

---
 .../catalyst/catalog/ExternalCatalog.scala    |  14 ++
 .../catalyst/catalog/InMemoryCatalog.scala    |  10 +
 .../scala/org/apache/spark/sql/Dataset.scala  |   4 +-
 .../spark/sql/execution/CacheManager.scala    |   2 +-
 .../sql/execution/DataSourceScanExec.scala    |   2 +-
 .../command/createDataSourceTables.scala      |   2 +-
 .../execution/datasources/DataSource.scala    |   4 +-
 .../datasources/DataSourceStrategy.scala      |   8 +-
 .../datasources/ListingFileCatalog.scala      | 197 +--------------
 .../datasources/LogicalRelation.scala         |   2 +-
 .../PartitioningAwareFileCatalog.scala        |  25 +-
 .../datasources/SessionFileCatalog.scala      | 230 ++++++++++++++++++
 .../datasources/TableFileCatalog.scala        | 102 ++++++++
 .../datasources/fileSourceInterfaces.scala    |  54 ++--
 .../streaming/MetadataLogFileCatalog.scala    |   2 +-
 .../datasources/FileCatalogSuite.scala        |   5 +-
 .../datasources/FileSourceStrategySuite.scala |   2 +-
 ...te.scala => SessionFileCatalogSuite.scala} |  16 +-
 .../ParquetPartitionDiscoverySuite.scala      |   6 +-
 .../spark/sql/hive/HiveExternalCatalog.scala  |  43 +++-
 .../spark/sql/hive/HiveMetastoreCatalog.scala | 104 ++------
 21 files changed, 494 insertions(+), 340 deletions(-)
 create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala
 create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
 rename sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/{ListingFileCatalogSuite.scala => SessionFileCatalogSuite.scala} (66%)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala
index dd93b467eeeb2..e736bd2062fa5 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.catalyst.catalog
 
 import org.apache.spark.sql.catalyst.analysis.{FunctionAlreadyExistsException, NoSuchDatabaseException, NoSuchFunctionException}
+import org.apache.spark.sql.catalyst.expressions.Expression
 
 
 /**
@@ -196,6 +197,19 @@ abstract class ExternalCatalog {
       table: String,
       partialSpec: Option[TablePartitionSpec] = None): Seq[CatalogTablePartition]
 
+  /**
+   * List the metadata of partitions that belong to the specified table, assuming it exists, that
+   * satisfy the given partition-pruning predicate expressions.
+   *
+   * @param db database name
+   * @param table table name
+   * @param predicates partition-pruning predicate expressions
+   */
+  def listPartitionsByFilter(
+      db: String,
+      table: String,
+      predicates: Seq[Expression]): Seq[CatalogTablePartition]
+
   // --------------------------------------------------------------------------
   // Functions
   // --------------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala
index 3e31127118b44..8cc54d521ec02 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala
@@ -28,6 +28,7 @@ import org.apache.spark.{SparkConf, SparkException}
 import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier}
 import org.apache.spark.sql.catalyst.analysis._
+import org.apache.spark.sql.catalyst.expressions.Expression
 import org.apache.spark.sql.catalyst.util.StringUtils
 
 /**
@@ -477,6 +478,15 @@ class InMemoryCatalog(
     catalog(db).tables(table).partitions.values.toSeq
   }
 
+  override def listPartitionsByFilter(
+      db: String,
+      table: String,
+      predicates: Seq[Expression]): Seq[CatalogTablePartition] = {
+    // TODO: Provide an implementation
+    throw new UnsupportedOperationException(
+      "listPartitionsByFilter is not implemented")
+  }
+
   // --------------------------------------------------------------------------
   // Functions
   // --------------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index e59a483075c94..90897ac5d7b50 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -43,7 +43,7 @@ import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.util.usePrettyExpression
 import org.apache.spark.sql.execution.{FileRelation, LogicalRDD, QueryExecution, SQLExecution}
 import org.apache.spark.sql.execution.command.{CreateViewCommand, ExplainCommand, GlobalTempView, LocalTempView}
-import org.apache.spark.sql.execution.datasources.LogicalRelation
+import org.apache.spark.sql.execution.datasources.{FileCatalog, HadoopFsRelation, LogicalRelation}
 import org.apache.spark.sql.execution.datasources.json.JacksonGenerator
 import org.apache.spark.sql.execution.python.EvaluatePython
 import org.apache.spark.sql.streaming.{DataStreamWriter, StreamingQuery}
@@ -2603,6 +2603,8 @@ class Dataset[T] private[sql](
    */
   def inputFiles: Array[String] = {
     val files: Seq[String] = logicalPlan.collect {
+      case LogicalRelation(HadoopFsRelation(location: FileCatalog, _, _, _, _, _), _, _) =>
+        location.inputFiles
       case LogicalRelation(fsBasedRelation: FileRelation, _, _) =>
         fsBasedRelation.inputFiles
       case fr: FileRelation =>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala
index 83b7c779ab818..92fd366e101fd 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala
@@ -185,7 +185,7 @@ class CacheManager extends Logging {
     plan match {
       case lr: LogicalRelation => lr.relation match {
         case hr: HadoopFsRelation =>
-          val invalidate = hr.location.paths
+          val invalidate = hr.location.rootPaths
             .map(_.makeQualified(fs.getUri, fs.getWorkingDirectory))
             .contains(qualifiedPath)
           if (invalidate) hr.location.refresh()
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala
index 6cdba406937de..808f2052c48b3 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala
@@ -231,7 +231,7 @@ case class FileSourceScanExec(
     "Batched" -> supportsBatch.toString,
     "PartitionFilters" -> partitionFilters.mkString("[", ", ", "]"),
     "PushedFilters" -> dataFilters.mkString("[", ", ", "]"),
-    "InputPaths" -> relation.location.paths.mkString(", "))
+    "RootPaths" -> relation.location.rootPaths.mkString(", "))
 
   private lazy val inputRDD: RDD[InternalRow] = {
     val readFile: (PartitionedFile) => Iterator[InternalRow] =
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
index a04a13e698c43..a8c75a7f29cef 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
@@ -67,7 +67,7 @@ case class CreateDataSourceTableCommand(table: CatalogTable, ignoreIfExists: Boo
 
     dataSource match {
       case fs: HadoopFsRelation =>
-        if (table.tableType == CatalogTableType.EXTERNAL && fs.location.paths.isEmpty) {
+        if (table.tableType == CatalogTableType.EXTERNAL && fs.location.rootPaths.isEmpty) {
           throw new AnalysisException(
             "Cannot create a file-based external data source table without path")
         }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala
index e75e7d2770b4e..92b1fff7d8127 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala
@@ -471,9 +471,7 @@ case class DataSource(
           val existingPartitionColumns = Try {
             resolveRelation()
               .asInstanceOf[HadoopFsRelation]
-              .location
-              .partitionSpec()
-              .partitionColumns
+              .partitionSchema
               .fieldNames
               .toSeq
           }.getOrElse(Seq.empty[String])
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
index 6f9ed50a02b09..7d0abe86a44df 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
@@ -163,14 +163,14 @@ case class DataSourceAnalysis(conf: CatalystConf) extends Rule[LogicalPlan] {
         if query.resolved && t.schema.asNullable == query.schema.asNullable =>
 
       // Sanity checks
-      if (t.location.paths.size != 1) {
+      if (t.location.rootPaths.size != 1) {
         throw new AnalysisException(
           "Can only write data to relations with a single path.")
       }
 
-      val outputPath = t.location.paths.head
+      val outputPath = t.location.rootPaths.head
       val inputPaths = query.collect {
-        case LogicalRelation(r: HadoopFsRelation, _, _) => r.location.paths
+        case LogicalRelation(r: HadoopFsRelation, _, _) => r.location.rootPaths
       }.flatten
 
       val mode = if (overwrite) SaveMode.Overwrite else SaveMode.Append
@@ -184,7 +184,7 @@ case class DataSourceAnalysis(conf: CatalystConf) extends Rule[LogicalPlan] {
         query.resolve(t.partitionSchema, t.sparkSession.sessionState.analyzer.resolver),
         t.bucketSpec,
         t.fileFormat,
-        () => t.refresh(),
+        () => t.location.refresh(),
         t.options,
         query,
         mode)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala
index a68ae523e0faa..6d10501b7265d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala
@@ -17,32 +17,26 @@
 
 package org.apache.spark.sql.execution.datasources
 
-import java.io.FileNotFoundException
-
 import scala.collection.mutable
 
-import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs._
-import org.apache.hadoop.mapred.{FileInputFormat, JobConf}
 
-import org.apache.spark.internal.Logging
 import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.types.StructType
-import org.apache.spark.util.SerializableConfiguration
 
 
 /**
  * A [[FileCatalog]] that generates the list of files to process by recursively listing all the
  * files present in `paths`.
  *
+ * @param rootPaths the list of root table paths to scan
  * @param parameters as set of options to control discovery
- * @param paths a list of paths to scan
  * @param partitionSchema an optional partition schema that will be use to provide types for the
  *                        discovered partitions
  */
 class ListingFileCatalog(
     sparkSession: SparkSession,
-    override val paths: Seq[Path],
+    override val rootPaths: Seq[Path],
     parameters: Map[String, String],
     partitionSchema: Option[StructType])
   extends PartitioningAwareFileCatalog(sparkSession, parameters, partitionSchema) {
@@ -70,198 +64,17 @@ class ListingFileCatalog(
   }
 
   override def refresh(): Unit = {
-    val files = listLeafFiles(paths)
+    val files = listLeafFiles(rootPaths)
     cachedLeafFiles =
       new mutable.LinkedHashMap[Path, FileStatus]() ++= files.map(f => f.getPath -> f)
     cachedLeafDirToChildrenFiles = files.toArray.groupBy(_.getPath.getParent)
     cachedPartitionSpec = null
   }
 
-  /**
-   * List leaf files of given paths. This method will submit a Spark job to do parallel
-   * listing whenever there is a path having more files than the parallel partition discovery
-   * discovery threshold.
-   *
-   * This is publicly visible for testing.
-   */
-  def listLeafFiles(paths: Seq[Path]): mutable.LinkedHashSet[FileStatus] = {
-    val files =
-      if (paths.length >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) {
-        ListingFileCatalog.listLeafFilesInParallel(paths, hadoopConf, sparkSession)
-      } else {
-        ListingFileCatalog.listLeafFilesInSerial(paths, hadoopConf)
-      }
-
-    mutable.LinkedHashSet(files: _*)
-  }
-
   override def equals(other: Any): Boolean = other match {
-    case hdfs: ListingFileCatalog => paths.toSet == hdfs.paths.toSet
+    case hdfs: ListingFileCatalog => rootPaths.toSet == hdfs.rootPaths.toSet
     case _ => false
   }
 
-  override def hashCode(): Int = paths.toSet.hashCode()
-}
-
-
-object ListingFileCatalog extends Logging {
-
-  /** A serializable variant of HDFS's BlockLocation. */
-  private case class SerializableBlockLocation(
-      names: Array[String],
-      hosts: Array[String],
-      offset: Long,
-      length: Long)
-
-  /** A serializable variant of HDFS's FileStatus. */
-  private case class SerializableFileStatus(
-      path: String,
-      length: Long,
-      isDir: Boolean,
-      blockReplication: Short,
-      blockSize: Long,
-      modificationTime: Long,
-      accessTime: Long,
-      blockLocations: Array[SerializableBlockLocation])
-
-  /**
-   * List a collection of path recursively.
-   */
-  private def listLeafFilesInSerial(
-      paths: Seq[Path],
-      hadoopConf: Configuration): Seq[FileStatus] = {
-    // Dummy jobconf to get to the pathFilter defined in configuration
-    val jobConf = new JobConf(hadoopConf, this.getClass)
-    val filter = FileInputFormat.getInputPathFilter(jobConf)
-
-    paths.flatMap { path =>
-      val fs = path.getFileSystem(hadoopConf)
-      listLeafFiles0(fs, path, filter)
-    }
-  }
-
-  /**
-   * List a collection of path recursively in parallel (using Spark executors).
-   * Each task launched will use [[listLeafFilesInSerial]] to list.
-   */
-  private def listLeafFilesInParallel(
-      paths: Seq[Path],
-      hadoopConf: Configuration,
-      sparkSession: SparkSession): Seq[FileStatus] = {
-    assert(paths.size >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold)
-    logInfo(s"Listing leaf files and directories in parallel under: ${paths.mkString(", ")}")
-
-    val sparkContext = sparkSession.sparkContext
-    val serializableConfiguration = new SerializableConfiguration(hadoopConf)
-    val serializedPaths = paths.map(_.toString)
-
-    // Set the number of parallelism to prevent following file listing from generating many tasks
-    // in case of large #defaultParallelism.
-    val numParallelism = Math.min(paths.size, 10000)
-
-    val statuses = sparkContext
-      .parallelize(serializedPaths, numParallelism)
-      .mapPartitions { paths =>
-        val hadoopConf = serializableConfiguration.value
-        listLeafFilesInSerial(paths.map(new Path(_)).toSeq, hadoopConf).iterator
-      }.map { status =>
-        // Turn FileStatus into SerializableFileStatus so we can send it back to the driver
-        val blockLocations = status match {
-          case f: LocatedFileStatus =>
-            f.getBlockLocations.map { loc =>
-              SerializableBlockLocation(
-                loc.getNames,
-                loc.getHosts,
-                loc.getOffset,
-                loc.getLength)
-            }
-
-          case _ =>
-            Array.empty[SerializableBlockLocation]
-        }
-
-        SerializableFileStatus(
-          status.getPath.toString,
-          status.getLen,
-          status.isDirectory,
-          status.getReplication,
-          status.getBlockSize,
-          status.getModificationTime,
-          status.getAccessTime,
-          blockLocations)
-      }.collect()
-
-    // Turn SerializableFileStatus back to Status
-    statuses.map { f =>
-      val blockLocations = f.blockLocations.map { loc =>
-        new BlockLocation(loc.names, loc.hosts, loc.offset, loc.length)
-      }
-      new LocatedFileStatus(
-        new FileStatus(
-          f.length, f.isDir, f.blockReplication, f.blockSize, f.modificationTime, new Path(f.path)),
-        blockLocations)
-    }
-  }
-
-  /**
-   * List a single path, provided as a FileStatus, in serial.
-   */
-  private def listLeafFiles0(
-      fs: FileSystem, path: Path, filter: PathFilter): Seq[FileStatus] = {
-    logTrace(s"Listing $path")
-    val name = path.getName.toLowerCase
-    if (shouldFilterOut(name)) {
-      Seq.empty[FileStatus]
-    } else {
-      // [SPARK-17599] Prevent ListingFileCatalog from failing if path doesn't exist
-      // Note that statuses only include FileStatus for the files and dirs directly under path,
-      // and does not include anything else recursively.
-      val statuses = try fs.listStatus(path) catch {
-        case _: FileNotFoundException =>
-          logWarning(s"The directory $path was not found. Was it deleted very recently?")
-          Array.empty[FileStatus]
-      }
-
-      val allLeafStatuses = {
-        val (dirs, files) = statuses.partition(_.isDirectory)
-        val stats = files ++ dirs.flatMap(dir => listLeafFiles0(fs, dir.getPath, filter))
-        if (filter != null) stats.filter(f => filter.accept(f.getPath)) else stats
-      }
-
-      allLeafStatuses.filterNot(status => shouldFilterOut(status.getPath.getName)).map {
-        case f: LocatedFileStatus =>
-          f
-
-        // NOTE:
-        //
-        // - Although S3/S3A/S3N file system can be quite slow for remote file metadata
-        //   operations, calling `getFileBlockLocations` does no harm here since these file system
-        //   implementations don't actually issue RPC for this method.
-        //
-        // - Here we are calling `getFileBlockLocations` in a sequential manner, but it should not
-        //   be a big deal since we always use to `listLeafFilesInParallel` when the number of
-        //   paths exceeds threshold.
-        case f =>
-          // The other constructor of LocatedFileStatus will call FileStatus.getPermission(),
-          // which is very slow on some file system (RawLocalFileSystem, which is launch a
-          // subprocess and parse the stdout).
-          val locations = fs.getFileBlockLocations(f, 0, f.getLen)
-          val lfs = new LocatedFileStatus(f.getLen, f.isDirectory, f.getReplication, f.getBlockSize,
-            f.getModificationTime, 0, null, null, null, null, f.getPath, locations)
-          if (f.isSymlink) {
-            lfs.setSymlink(f.getSymlink)
-          }
-          lfs
-      }
-    }
-  }
-
-  /** Checks if we should filter out this path name. */
-  def shouldFilterOut(pathName: String): Boolean = {
-    // We filter everything that starts with _ and ., except _common_metadata and _metadata
-    // because Parquet needs to find those metadata files from leaf files returned by this method.
-    // We should refactor this logic to not mix metadata files with data files.
-    ((pathName.startsWith("_") && !pathName.contains("=")) || pathName.startsWith(".")) &&
-      !pathName.startsWith("_common_metadata") && !pathName.startsWith("_metadata")
-  }
+  override def hashCode(): Int = rootPaths.toSet.hashCode()
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala
index d9562fd32e87d..7c28d48f26416 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala
@@ -94,7 +94,7 @@ case class LogicalRelation(
   }
 
   override def refresh(): Unit = relation match {
-    case fs: HadoopFsRelation => fs.refresh()
+    case fs: HadoopFsRelation => fs.location.refresh()
     case _ =>  // Do nothing.
   }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
index 702ba97222e34..04d7d89250586 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
@@ -21,7 +21,6 @@ import scala.collection.mutable
 
 import org.apache.hadoop.fs.{FileStatus, Path}
 
-import org.apache.spark.internal.Logging
 import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.catalyst.{expressions, InternalRow}
 import org.apache.spark.sql.catalyst.expressions._
@@ -40,9 +39,10 @@ abstract class PartitioningAwareFileCatalog(
     sparkSession: SparkSession,
     parameters: Map[String, String],
     partitionSchema: Option[StructType])
-  extends FileCatalog with Logging {
+  extends SessionFileCatalog(sparkSession) with FileCatalog {
+  import PartitioningAwareFileCatalog.BASE_PATH_PARAM
 
-  protected val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions(parameters)
+  override protected val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions(parameters)
 
   protected def leafFiles: mutable.LinkedHashMap[Path, FileStatus]
 
@@ -72,8 +72,8 @@ abstract class PartitioningAwareFileCatalog(
 
   override def allFiles(): Seq[FileStatus] = {
     if (partitionSpec().partitionColumns.isEmpty) {
-      // For each of the input paths, get the list of files inside them
-      paths.flatMap { path =>
+      // For each of the root input paths, get the list of files inside them
+      rootPaths.flatMap { path =>
         // Make the path qualified (consistent with listLeafFiles and listLeafFilesInParallel).
         val fs = path.getFileSystem(hadoopConf)
         val qualifiedPathPre = fs.makeQualified(path)
@@ -105,8 +105,6 @@ abstract class PartitioningAwareFileCatalog(
   protected def inferPartitioning(): PartitionSpec = {
     // We use leaf dirs containing data files to discover the schema.
     val leafDirs = leafDirToChildrenFiles.filter { case (_, files) =>
-      // SPARK-15895: Metadata files (e.g. Parquet summary files) and temporary files should not be
-      // counted as data files, so that they shouldn't participate partition discovery.
       files.exists(f => isDataPath(f.getPath))
     }.keys.toSeq
     partitionSchema match {
@@ -194,24 +192,23 @@ abstract class PartitioningAwareFileCatalog(
    * and the returned DataFrame will have the column of `something`.
    */
   private def basePaths: Set[Path] = {
-    parameters.get("basePath").map(new Path(_)) match {
+    parameters.get(BASE_PATH_PARAM).map(new Path(_)) match {
       case Some(userDefinedBasePath) =>
         val fs = userDefinedBasePath.getFileSystem(hadoopConf)
         if (!fs.isDirectory(userDefinedBasePath)) {
-          throw new IllegalArgumentException("Option 'basePath' must be a directory")
+          throw new IllegalArgumentException(s"Option '$BASE_PATH_PARAM' must be a directory")
         }
         Set(fs.makeQualified(userDefinedBasePath))
 
       case None =>
-        paths.map { path =>
+        rootPaths.map { path =>
           // Make the path qualified (consistent with listLeafFiles and listLeafFilesInParallel).
           val qualifiedPath = path.getFileSystem(hadoopConf).makeQualified(path)
           if (leafFiles.contains(qualifiedPath)) qualifiedPath.getParent else qualifiedPath }.toSet
     }
   }
+}
 
-  private def isDataPath(path: Path): Boolean = {
-    val name = path.getName
-    !((name.startsWith("_") && !name.contains("=")) || name.startsWith("."))
-  }
+object PartitioningAwareFileCatalog {
+  val BASE_PATH_PARAM = "basePath"
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala
new file mode 100644
index 0000000000000..47b69eb721b29
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala
@@ -0,0 +1,230 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources
+
+import java.io.FileNotFoundException
+
+import scala.collection.mutable
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs._
+import org.apache.hadoop.mapred.{FileInputFormat, JobConf}
+
+import org.apache.spark.internal.Logging
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.util.SerializableConfiguration
+
+
+/**
+ * A base class for [[BasicFileCatalog]]s that need a [[SparkSession]] and the ability to find leaf
+ * files in a list of HDFS paths.
+ *
+ * @param sparkSession a [[SparkSession]], used to read the parallel partition discovery
+ *                     threshold and to run the parallel file listing job
+ */
+abstract class SessionFileCatalog(sparkSession: SparkSession)
+    extends BasicFileCatalog with Logging {
+  protected val hadoopConf: Configuration
+
+  /**
+   * List leaf files of given paths. This method will submit a Spark job to do parallel
+   * listing whenever the number of given paths reaches the parallel partition discovery
+   * threshold; otherwise the paths are listed serially.
+   *
+   * This is publicly visible for testing.
+   */
+  def listLeafFiles(paths: Seq[Path]): mutable.LinkedHashSet[FileStatus] = {
+    val files =
+      if (paths.length >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) {
+        SessionFileCatalog.listLeafFilesInParallel(paths, hadoopConf, sparkSession)
+      } else {
+        SessionFileCatalog.listLeafFilesInSerial(paths, hadoopConf)
+      }
+
+    mutable.LinkedHashSet(files: _*)
+  }
+
+  // SPARK-15895: Metadata files (e.g. Parquet summary files) and temporary files should not be
+  // counted as data files, so that they do not participate in partition discovery.
+  protected def isDataPath(path: Path): Boolean = {
+    val name = path.getName
+    !((name.startsWith("_") && !name.contains("=")) || name.startsWith("."))
+  }
+}
+
+object SessionFileCatalog extends Logging {
+
+  /** A serializable variant of HDFS's BlockLocation. */
+  private case class SerializableBlockLocation(
+      names: Array[String],
+      hosts: Array[String],
+      offset: Long,
+      length: Long)
+
+  /** A serializable variant of HDFS's FileStatus. */
+  private case class SerializableFileStatus(
+      path: String,
+      length: Long,
+      isDir: Boolean,
+      blockReplication: Short,
+      blockSize: Long,
+      modificationTime: Long,
+      accessTime: Long,
+      blockLocations: Array[SerializableBlockLocation])
+
+  /**
+   * List a collection of paths recursively.
+   */
+  private def listLeafFilesInSerial(
+      paths: Seq[Path],
+      hadoopConf: Configuration): Seq[FileStatus] = {
+    // Dummy jobconf to get to the pathFilter defined in configuration
+    val jobConf = new JobConf(hadoopConf, this.getClass)
+    val filter = FileInputFormat.getInputPathFilter(jobConf)
+
+    paths.flatMap { path =>
+      val fs = path.getFileSystem(hadoopConf)
+      listLeafFiles0(fs, path, filter)
+    }
+  }
+
+  /**
+   * List a collection of paths recursively in parallel (using Spark executors).
+   * Each task launched will use [[listLeafFilesInSerial]] to list.
+   */
+  private def listLeafFilesInParallel(
+      paths: Seq[Path],
+      hadoopConf: Configuration,
+      sparkSession: SparkSession): Seq[FileStatus] = {
+    assert(paths.size >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold)
+    logInfo(s"Listing leaf files and directories in parallel under: ${paths.mkString(", ")}")
+
+    val sparkContext = sparkSession.sparkContext
+    val serializableConfiguration = new SerializableConfiguration(hadoopConf)
+    val serializedPaths = paths.map(_.toString)
+
+    // Cap the parallelism to prevent the following file listing from generating many tasks
+    // in case of large #defaultParallelism.
+    val numParallelism = Math.min(paths.size, 10000)
+
+    val statuses = sparkContext
+      .parallelize(serializedPaths, numParallelism)
+      .mapPartitions { paths =>
+        val hadoopConf = serializableConfiguration.value
+        listLeafFilesInSerial(paths.map(new Path(_)).toSeq, hadoopConf).iterator
+      }.map { status =>
+        // Turn FileStatus into SerializableFileStatus so we can send it back to the driver
+        val blockLocations = status match {
+          case f: LocatedFileStatus =>
+            f.getBlockLocations.map { loc =>
+              SerializableBlockLocation(
+                loc.getNames,
+                loc.getHosts,
+                loc.getOffset,
+                loc.getLength)
+            }
+
+          case _ =>
+            Array.empty[SerializableBlockLocation]
+        }
+
+        SerializableFileStatus(
+          status.getPath.toString,
+          status.getLen,
+          status.isDirectory,
+          status.getReplication,
+          status.getBlockSize,
+          status.getModificationTime,
+          status.getAccessTime,
+          blockLocations)
+      }.collect()
+
+    // Turn SerializableFileStatus back to FileStatus
+    statuses.map { f =>
+      val blockLocations = f.blockLocations.map { loc =>
+        new BlockLocation(loc.names, loc.hosts, loc.offset, loc.length)
+      }
+      new LocatedFileStatus(
+        new FileStatus(
+          f.length, f.isDir, f.blockReplication, f.blockSize, f.modificationTime, new Path(f.path)),
+        blockLocations)
+    }
+  }
+
+  /**
+   * List the leaf files under a single path, recursively and in serial.
+   */
+  private def listLeafFiles0(
+      fs: FileSystem, path: Path, filter: PathFilter): Seq[FileStatus] = {
+    logTrace(s"Listing $path")
+    val name = path.getName.toLowerCase
+    if (shouldFilterOut(name)) {
+      Seq.empty[FileStatus]
+    } else {
+      // [SPARK-17599] Prevent file listing from failing if path doesn't exist
+      // Note that statuses only include FileStatus for the files and dirs directly under path,
+      // and do not include anything else recursively.
+      val statuses = try fs.listStatus(path) catch {
+        case _: FileNotFoundException =>
+          logWarning(s"The directory $path was not found. Was it deleted very recently?")
+          Array.empty[FileStatus]
+      }
+
+      val allLeafStatuses = {
+        val (dirs, files) = statuses.partition(_.isDirectory)
+        val stats = files ++ dirs.flatMap(dir => listLeafFiles0(fs, dir.getPath, filter))
+        if (filter != null) stats.filter(f => filter.accept(f.getPath)) else stats
+      }
+
+      allLeafStatuses.filterNot(status => shouldFilterOut(status.getPath.getName)).map {
+        case f: LocatedFileStatus =>
+          f
+
+        // NOTE:
+        //
+        // - Although S3/S3A/S3N file system can be quite slow for remote file metadata
+        //   operations, calling `getFileBlockLocations` does no harm here since these file system
+        //   implementations don't actually issue RPC for this method.
+        //
+        // - Here we are calling `getFileBlockLocations` in a sequential manner, but it should not
+        //   be a big deal since we always use `listLeafFilesInParallel` when the number of
+        //   paths reaches the threshold.
+        case f =>
+          // The other constructor of LocatedFileStatus will call FileStatus.getPermission(),
+          // which is very slow on some file systems (RawLocalFileSystem, which launches a
+          // subprocess and parses the stdout).
+          val locations = fs.getFileBlockLocations(f, 0, f.getLen)
+          val lfs = new LocatedFileStatus(f.getLen, f.isDirectory, f.getReplication, f.getBlockSize,
+            f.getModificationTime, 0, null, null, null, null, f.getPath, locations)
+          if (f.isSymlink) {
+            lfs.setSymlink(f.getSymlink)
+          }
+          lfs
+      }
+    }
+  }
+
+  /** Checks if we should filter out this path name. */
+  def shouldFilterOut(pathName: String): Boolean = {
+    // We filter everything that starts with _ and ., except _common_metadata and _metadata
+    // because Parquet needs to find those metadata files from leaf files returned by this method.
+    // We should refactor this logic to not mix metadata files with data files.
+    ((pathName.startsWith("_") && !pathName.contains("=")) || pathName.startsWith(".")) &&
+      !pathName.startsWith("_common_metadata") && !pathName.startsWith("_metadata")
+  }
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
new file mode 100644
index 0000000000000..d90ce19869e46
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources
+
+import org.apache.hadoop.fs.Path
+
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.catalog.CatalogTablePartition
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.types.{StructField, StructType}
+
+
+/**
+ * A [[BasicFileCatalog]] for a metastore catalog table. Partitions are looked up through the
+ * external catalog's `listPartitionsByFilter`.
+ *
+ * @param sparkSession a [[SparkSession]]
+ * @param db the table's database name
+ * @param table the table's (unqualified) name
+ * @param partitionSchema the schema of a partitioned table's partition columns
+ * @param sizeInBytes the table's data size in bytes
+ */
+class TableFileCatalog(
+    sparkSession: SparkSession,
+    db: String,
+    table: String,
+    partitionSchema: Option[StructType],
+    override val sizeInBytes: Long)
+  extends SessionFileCatalog(sparkSession) {
+
+  override protected val hadoopConf = sparkSession.sessionState.newHadoopConf
+
+  private val externalCatalog = sparkSession.sharedState.externalCatalog
+
+  private val catalogTable = externalCatalog.getTable(db, table)
+
+  private val baseLocation = catalogTable.storage.locationUri
+
+  override def rootPaths: Seq[Path] = baseLocation.map(new Path(_)).toSeq
+
+  override def listFiles(filters: Seq[Expression]): Seq[Partition] = partitionSchema match {
+    case None =>
+      // Not partitioned: one partition without values for the data files under the root paths.
+      Partition(InternalRow.empty, listDataLeafFiles(rootPaths).toSeq) :: Nil
+    case Some(schema) =>
+      externalCatalog.listPartitionsByFilter(db, table, filters).flatMap {
+        case CatalogTablePartition(spec, storage, _) =>
+          storage.locationUri.map { location =>
+            val files = listDataLeafFiles(new Path(location) :: Nil).toSeq
+            // Cast each value of the partition spec to the type of its partition column.
+            val values = schema.map { field =>
+              Cast(Literal(spec(field.name)), field.dataType).eval()
+            }
+            Partition(InternalRow.fromSeq(values), files)
+          }
+      }
+  }
+
+  override def refresh(): Unit = {}
+
+  /**
+   * Returns a [[ListingFileCatalog]] for this table restricted to the subset of partitions
+   * specified by the given partition-pruning filters.
+   *
+   * @param filters partition-pruning filters
+   */
+  def filterPartitions(filters: Seq[Expression]): ListingFileCatalog = {
+    val prunedRootPaths =
+      if (partitionSchema.isDefined) {
+        externalCatalog
+          .listPartitionsByFilter(db, table, filters)
+          .flatMap(_.storage.locationUri)
+          .map(new Path(_))
+      } else {
+        rootPaths
+      }
+    // The table's own location, when it has one, is handed over as the base path parameter.
+    val basePathParameter =
+      baseLocation.map(location => Map(PartitioningAwareFileCatalog.BASE_PATH_PARAM -> location))
+    new ListingFileCatalog(
+      sparkSession, prunedRootPaths, basePathParameter.getOrElse(Map.empty), partitionSchema)
+  }
+
+  private def listDataLeafFiles(paths: Seq[Path]) =
+    listLeafFiles(paths).filter(f => isDataPath(f.getPath))
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala
index 69dd622ce4a54..7f696dba557de 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala
@@ -28,7 +28,6 @@ import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
 import org.apache.spark.sql.catalyst.catalog.BucketSpec
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection
-import org.apache.spark.sql.execution.FileRelation
 import org.apache.spark.sql.sources.{BaseRelation, DataSourceRegister, Filter}
 import org.apache.spark.sql.types.StructType
 
@@ -129,13 +128,13 @@ abstract class OutputWriter {
  * @param options Configuration used when reading / writing data.
  */
 case class HadoopFsRelation(
-    location: FileCatalog,
+    location: BasicFileCatalog,
     partitionSchema: StructType,
     dataSchema: StructType,
     bucketSpec: Option[BucketSpec],
     fileFormat: FileFormat,
     options: Map[String, String])(val sparkSession: SparkSession)
-  extends BaseRelation with FileRelation {
+  extends BaseRelation {
 
   override def sqlContext: SQLContext = sparkSession.sqlContext
 
@@ -148,9 +147,6 @@ case class HadoopFsRelation(
 
   def partitionSchemaOption: Option[StructType] =
     if (partitionSchema.isEmpty) None else Some(partitionSchema)
-  def partitionSpec: PartitionSpec = location.partitionSpec()
-
-  def refresh(): Unit = location.refresh()
 
   override def toString: String = {
     fileFormat match {
@@ -159,11 +155,7 @@ case class HadoopFsRelation(
     }
   }
 
-  /** Returns the list of files that will be read when scanning this relation. */
-  override def inputFiles: Array[String] =
-    location.allFiles().map(_.getPath.toUri.toString).toArray
-
-  override def sizeInBytes: Long = location.allFiles().map(_.getLen).sum
+  override def sizeInBytes: Long = location.sizeInBytes
 }
 
 /**
@@ -319,16 +311,17 @@ abstract class TextBasedFileFormat extends FileFormat {
 case class Partition(values: InternalRow, files: Seq[FileStatus])
 
 /**
- * An interface for objects capable of enumerating the files that comprise a relation as well
- * as the partitioning characteristics of those files.
+ * An interface for objects capable of enumerating the root paths of a relation as well as the
+ * partitions of a relation subject to some pruning expressions.
  */
-trait FileCatalog {
+trait BasicFileCatalog {
 
-  /** Returns the list of input paths from which the catalog will get files. */
-  def paths: Seq[Path]
-
-  /** Returns the specification of the partitions inferred from the data. */
-  def partitionSpec(): PartitionSpec
+  /**
+   * Returns the list of root input paths from which the catalog will get files. These paths
+   * should *not* include any table partition directories. Partition directories are discovered or
+   * provided by a metastore catalog.
+   */
+  def rootPaths: Seq[Path]
 
   /**
    * Returns all valid files grouped into partitions when the data is partitioned. If the data is
@@ -341,9 +334,28 @@ trait FileCatalog {
    */
   def listFiles(filters: Seq[Expression]): Seq[Partition]
 
+  /** Refresh any cached file listings */
+  def refresh(): Unit
+
+  /** Sum of table file sizes, in bytes */
+  def sizeInBytes: Long
+}
+
+/**
+ * A [[BasicFileCatalog]] which can enumerate all of the files comprising a relation and, from
+ * those, infer the relation's partition specification.
+ */
+trait FileCatalog extends BasicFileCatalog {
+
+  /** Returns the specification of the partitions inferred from the data. */
+  def partitionSpec(): PartitionSpec
+
   /** Returns all the valid files. */
   def allFiles(): Seq[FileStatus]
 
-  /** Refresh the file listing */
-  def refresh(): Unit
+  /** Returns the list of files that will be read when scanning this relation. */
+  def inputFiles: Array[String] =
+    allFiles().map(_.getPath.toUri.toString).toArray
+
+  override def sizeInBytes: Long = allFiles().map(_.getLen).sum
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetadataLogFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetadataLogFileCatalog.scala
index a32c4671e3475..82b67cb1ca6ee 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetadataLogFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetadataLogFileCatalog.scala
@@ -47,7 +47,7 @@ class MetadataLogFileCatalog(sparkSession: SparkSession, path: Path)
     allFilesFromLog.toArray.groupBy(_.getPath.getParent)
   }
 
-  override def paths: Seq[Path] = path :: Nil
+  override def rootPaths: Seq[Path] = path :: Nil
 
   override def refresh(): Unit = { }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileCatalogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileCatalogSuite.scala
index fa3abd0098f5b..2695974b84b00 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileCatalogSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileCatalogSuite.scala
@@ -77,13 +77,14 @@ class FileCatalogSuite extends SharedSQLContext {
       val catalog1 = new ListingFileCatalog(
         spark, Seq(new Path(deletedFolder.getCanonicalPath)), Map.empty, None)
       // doesn't throw an exception
-      assert(catalog1.listLeafFiles(catalog1.paths).isEmpty)
+      assert(catalog1.listLeafFiles(catalog1.rootPaths).isEmpty)
     }
   }
 
   test("SPARK-17613 - PartitioningAwareFileCatalog: base path w/o '/' at end") {
     class MockCatalog(
-      override val paths: Seq[Path]) extends PartitioningAwareFileCatalog(spark, Map.empty, None) {
+      override val rootPaths: Seq[Path])
+      extends PartitioningAwareFileCatalog(spark, Map.empty, None) {
 
       override def refresh(): Unit = {}
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala
index c5deb31fec183..c32254d9dfde2 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala
@@ -395,7 +395,7 @@ class FileSourceStrategySuite extends QueryTest with SharedSQLContext with Predi
 
         val fileCatalog = new ListingFileCatalog(
           sparkSession = spark,
-          paths = Seq(new Path(tempDir)),
+          rootPaths = Seq(new Path(tempDir)),
           parameters = Map.empty[String, String],
           partitionSchema = None)
         // This should not fail.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalogSuite.scala
similarity index 66%
rename from sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalogSuite.scala
rename to sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalogSuite.scala
index f15730aeb11f2..df509583377ae 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalogSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalogSuite.scala
@@ -19,16 +19,16 @@ package org.apache.spark.sql.execution.datasources
 
 import org.apache.spark.SparkFunSuite
 
-class ListingFileCatalogSuite extends SparkFunSuite {
+class SessionFileCatalogSuite extends SparkFunSuite {
 
   test("file filtering") {
-    assert(!ListingFileCatalog.shouldFilterOut("abcd"))
-    assert(ListingFileCatalog.shouldFilterOut(".ab"))
-    assert(ListingFileCatalog.shouldFilterOut("_cd"))
+    assert(!SessionFileCatalog.shouldFilterOut("abcd"))
+    assert(SessionFileCatalog.shouldFilterOut(".ab"))
+    assert(SessionFileCatalog.shouldFilterOut("_cd"))
 
-    assert(!ListingFileCatalog.shouldFilterOut("_metadata"))
-    assert(!ListingFileCatalog.shouldFilterOut("_common_metadata"))
-    assert(ListingFileCatalog.shouldFilterOut("_ab_metadata"))
-    assert(ListingFileCatalog.shouldFilterOut("_cd_common_metadata"))
+    assert(!SessionFileCatalog.shouldFilterOut("_metadata"))
+    assert(!SessionFileCatalog.shouldFilterOut("_common_metadata"))
+    assert(SessionFileCatalog.shouldFilterOut("_ab_metadata"))
+    assert(SessionFileCatalog.shouldFilterOut("_cd_common_metadata"))
   }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala
index 8d18be9300f7e..43357c97c395a 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala
@@ -30,7 +30,7 @@ import org.apache.parquet.hadoop.ParquetOutputFormat
 import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.Literal
-import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation, PartitionDirectory => Partition, PartitioningUtils, PartitionSpec}
+import org.apache.spark.sql.execution.datasources.{FileCatalog, HadoopFsRelation, LogicalRelation, PartitionDirectory => Partition, PartitioningUtils, PartitionSpec}
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SharedSQLContext
@@ -626,8 +626,8 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with Sha
       (1 to 10).map(i => (i, i.toString)).toDF("a", "b").write.parquet(dir.getCanonicalPath)
       val queryExecution = spark.read.parquet(dir.getCanonicalPath).queryExecution
       queryExecution.analyzed.collectFirst {
-        case LogicalRelation(relation: HadoopFsRelation, _, _) =>
-          assert(relation.partitionSpec === PartitionSpec.emptySpec)
+        case LogicalRelation(HadoopFsRelation(location: FileCatalog, _, _, _, _, _), _, _) =>
+          assert(location.partitionSpec === PartitionSpec.emptySpec)
       }.getOrElse {
         fail(s"Expecting a ParquetRelation2, but got:\n$queryExecution")
       }
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
index 237b829da882f..115c0d9c7d576 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
@@ -29,16 +29,17 @@ import org.apache.thrift.TException
 import org.apache.spark.SparkConf
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.AnalysisException
-import org.apache.spark.sql.catalyst.TableIdentifier
+import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier}
 import org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException
 import org.apache.spark.sql.catalyst.catalog._
+import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, Statistics}
 import org.apache.spark.sql.execution.command.{ColumnStatStruct, DDLUtils}
 import org.apache.spark.sql.execution.datasources.CaseInsensitiveMap
 import org.apache.spark.sql.hive.client.HiveClient
 import org.apache.spark.sql.internal.HiveSerDe
 import org.apache.spark.sql.internal.StaticSQLConf._
-import org.apache.spark.sql.types.{DataType, StructType}
+import org.apache.spark.sql.types.{DataType, StructField, StructType}
 
 
 /**
@@ -626,6 +627,44 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
     client.getPartition(db, table, spec)
   }
 
+  override def listPartitionsByFilter(
+      db: String,
+      table: String,
+      predicates: Seq[Expression]): Seq[CatalogTablePartition] = withClient {
+    val catalogTable = client.getTable(db, table)
+    val partitionColumnNames = catalogTable.partitionColumnNames.toSet
+    // Fail if any predicate references something other than the table's partition columns.
+    val referencesOtherColumns = predicates.exists { predicate =>
+      !predicate.references.map(_.name).toSet.subsetOf(partitionColumnNames)
+    }
+    if (referencesOtherColumns) {
+      sys.error("Expected only partition pruning predicates: " + predicates.reduceLeft(And))
+    }
+
+    val partitionSchema = catalogTable.partitionSchema
+
+    if (predicates.isEmpty) {
+      client.getPartitions(catalogTable)
+    } else {
+      val clientPrunedPartitions = client.getPartitionsByFilter(catalogTable, predicates)
+      // Bind each attribute to the ordinal of the partition column with the same name, so that
+      // the conjunction of the predicates can be evaluated on a row of partition values.
+      val boundPredicate =
+        InterpretedPredicate.create(predicates.reduce(And).transform {
+          case attribute: AttributeReference =>
+            val ordinal = partitionSchema.indexWhere(_.name == attribute.name)
+            BoundReference(ordinal, partitionSchema(ordinal).dataType, nullable = true)
+        })
+      // Keep only the partitions returned by the client that also satisfy the bound predicate.
+      clientPrunedPartitions.filter { case CatalogTablePartition(spec, _, _) =>
+        val values = partitionSchema.map { field =>
+          Cast(Literal(spec(field.name)), field.dataType).eval()
+        }
+        boundPredicate(InternalRow.fromSeq(values))
+      }
+    }
+  }
+
   /**
    * Returns the specified partition or None if it does not exist.
    */
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index 8410a2e4a47ca..e7c14940b2158 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -136,16 +136,16 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
 
   private def getCached(
       tableIdentifier: QualifiedTableName,
-      pathsInMetastore: Seq[String],
       metastoreRelation: MetastoreRelation,
       schemaInMetastore: StructType,
       expectedFileFormat: Class[_ <: FileFormat],
       expectedBucketSpec: Option[BucketSpec],
-      partitionSpecInMetastore: Option[PartitionSpec]): Option[LogicalRelation] = {
+      partitionSchema: Option[StructType]): Option[LogicalRelation] = {
 
     cachedDataSourceTables.getIfPresent(tableIdentifier) match {
       case null => None // Cache miss
       case logical @ LogicalRelation(relation: HadoopFsRelation, _, _) =>
+        val metastoreRelationRootPath = metastoreRelation.hiveQlTable.getDataLocation
         val cachedRelationFileFormatClass = relation.fileFormat.getClass
 
         expectedFileFormat match {
@@ -153,12 +153,10 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
             // If we have the same paths, same schema, and same partition spec,
             // we will use the cached relation.
             val useCached =
-              relation.location.paths.map(_.toString).toSet == pathsInMetastore.toSet &&
+              relation.location.rootPaths.toSet == Set(metastoreRelationRootPath) &&
                 logical.schema.sameType(schemaInMetastore) &&
                 relation.bucketSpec == expectedBucketSpec &&
-                relation.partitionSpec == partitionSpecInMetastore.getOrElse {
-                  PartitionSpec(StructType(Nil), Array.empty[PartitionDirectory])
-                }
+                relation.partitionSchema == partitionSchema.getOrElse(StructType(Nil))
 
             if (useCached) {
               Some(logical)
@@ -199,59 +197,30 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
 
     val result = if (metastoreRelation.hiveQlTable.isPartitioned) {
       val partitionSchema = StructType.fromAttributes(metastoreRelation.partitionKeys)
-      val partitionColumnDataTypes = partitionSchema.map(_.dataType)
-      // We're converting the entire table into HadoopFsRelation, so predicates to Hive metastore
-      // are empty.
-      val partitions = metastoreRelation.getHiveQlPartitions().map { p =>
-        val location = p.getLocation
-        val values = InternalRow.fromSeq(p.getValues.asScala.zip(partitionColumnDataTypes).map {
-          case (rawValue, dataType) => Cast(Literal(rawValue), dataType).eval(null)
-        })
-        PartitionDirectory(values, location)
-      }
-      val partitionSpec = PartitionSpec(partitionSchema, partitions)
-      val partitionPaths = partitions.map(_.path.toString)
-
-      // By convention (for example, see MetaStorePartitionedTableFileCatalog), the definition of a
-      // partitioned table's paths depends on whether that table has any actual partitions.
-      // Partitioned tables without partitions use the location of the table's base path.
-      // Partitioned tables with partitions use the locations of those partitions' data locations,
-      // _omitting_ the table's base path.
-      val paths = if (partitionPaths.isEmpty) {
-        Seq(metastoreRelation.hiveQlTable.getDataLocation.toString)
-      } else {
-        partitionPaths
-      }
 
       val cached = getCached(
         tableIdentifier,
-        paths,
         metastoreRelation,
         metastoreSchema,
         fileFormatClass,
         bucketSpec,
-        Some(partitionSpec))
-
-      val hadoopFsRelation = cached.getOrElse {
-        val fileCatalog = new MetaStorePartitionedTableFileCatalog(
-          sparkSession,
-          new Path(metastoreRelation.catalogTable.storage.locationUri.get),
-          partitionSpec)
-
-        val inferredSchema = if (fileType.equals("parquet")) {
-          val inferredSchema =
-            defaultSource.inferSchema(sparkSession, options, fileCatalog.allFiles())
-          inferredSchema.map { inferred =>
-            ParquetFileFormat.mergeMetastoreParquetSchema(metastoreSchema, inferred)
-          }.getOrElse(metastoreSchema)
-        } else {
-          defaultSource.inferSchema(sparkSession, options, fileCatalog.allFiles()).get
-        }
+        Some(partitionSchema))
+
+      val logicalRelation = cached.getOrElse {
+        val db = metastoreRelation.databaseName
+        val table = metastoreRelation.tableName
+        val sizeInBytes = metastoreRelation.statistics.sizeInBytes.toLong
+        val fileCatalog =
+          new TableFileCatalog(sparkSession, db, table, Some(partitionSchema), sizeInBytes)
+        val partitionSchemaColumnNames = partitionSchema.map(_.name.toLowerCase).toSet
+        val dataSchema =
+          StructType(metastoreSchema
+            .filterNot(field => partitionSchemaColumnNames.contains(field.name.toLowerCase)))
 
         val relation = HadoopFsRelation(
           location = fileCatalog,
           partitionSchema = partitionSchema,
-          dataSchema = inferredSchema,
+          dataSchema = dataSchema,
           bucketSpec = bucketSpec,
           fileFormat = defaultSource,
           options = options)(sparkSession = sparkSession)
@@ -261,12 +230,11 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
         created
       }
 
-      hadoopFsRelation
+      logicalRelation
     } else {
-      val paths = Seq(metastoreRelation.hiveQlTable.getDataLocation.toString)
+      val rootPath = metastoreRelation.hiveQlTable.getDataLocation
 
       val cached = getCached(tableIdentifier,
-        paths,
         metastoreRelation,
         metastoreSchema,
         fileFormatClass,
@@ -277,14 +245,13 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
           LogicalRelation(
             DataSource(
               sparkSession = sparkSession,
-              paths = paths,
+              paths = rootPath.toString :: Nil,
               userSpecifiedSchema = Some(metastoreRelation.schema),
               bucketSpec = bucketSpec,
               options = options,
               className = fileType).resolveRelation(),
               catalogTable = Some(metastoreRelation.catalogTable))
 
-
         cachedDataSourceTables.put(tableIdentifier, created)
         created
       }
@@ -372,34 +339,3 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
     }
   }
 }
-
-/**
- * An override of the standard HDFS listing based catalog, that overrides the partition spec with
- * the information from the metastore.
- *
- * @param tableBasePath The default base path of the Hive metastore table
- * @param partitionSpec The partition specifications from Hive metastore
- */
-private[hive] class MetaStorePartitionedTableFileCatalog(
-    sparkSession: SparkSession,
-    tableBasePath: Path,
-    override val partitionSpec: PartitionSpec)
-  extends ListingFileCatalog(
-    sparkSession,
-    MetaStorePartitionedTableFileCatalog.getPaths(tableBasePath, partitionSpec),
-    Map.empty,
-    Some(partitionSpec.partitionColumns)) {
-}
-
-private[hive] object MetaStorePartitionedTableFileCatalog {
-  /** Get the list of paths to list files in the for a metastore table */
-  def getPaths(tableBasePath: Path, partitionSpec: PartitionSpec): Seq[Path] = {
-    // If there are no partitions currently specified then use base path,
-    // otherwise use the paths corresponding to the partitions.
-    if (partitionSpec.partitions.isEmpty) {
-      Seq(tableBasePath)
-    } else {
-      partitionSpec.partitions.map(_.path)
-    }
-  }
-}

From 1f611c4089102744242b73346d9724d248635cac Mon Sep 17 00:00:00 2001
From: Michael Allman <michael@videoamp.com>
Date: Mon, 12 Sep 2016 18:21:38 -0700
Subject: [PATCH 02/99] Add a new catalyst optimizer rule to SQL core for
 pruning unneeded partitions' files from a table file catalog

---
 .../spark/sql/execution/SparkOptimizer.scala  |  2 +
 .../PruneFileSourcePartitions.scala           | 72 +++++++++++++++++++
 2 files changed, 74 insertions(+)
 create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala
index 8b762b5d6c5f2..981728331d361 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql.execution
 import org.apache.spark.sql.ExperimentalMethods
 import org.apache.spark.sql.catalyst.catalog.SessionCatalog
 import org.apache.spark.sql.catalyst.optimizer.Optimizer
+import org.apache.spark.sql.execution.datasources.PruneFileSourcePartitions
 import org.apache.spark.sql.execution.python.ExtractPythonUDFFromAggregate
 import org.apache.spark.sql.internal.SQLConf
 
@@ -32,5 +33,6 @@ class SparkOptimizer(
   override def batches: Seq[Batch] = super.batches :+
     Batch("Optimize Metadata Only Query", Once, OptimizeMetadataOnlyQuery(catalog, conf)) :+
     Batch("Extract Python UDF from Aggregate", Once, ExtractPythonUDFFromAggregate) :+
+    Batch("Prune File Source Table Partitions", Once, PruneFileSourcePartitions) :+
     Batch("User Provided Optimizers", fixedPoint, experimentalMethods.extraOptimizations: _*)
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala
new file mode 100644
index 0000000000000..b8af0f53423bd
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources
+
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.planning.PhysicalOperation
+import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project}
+import org.apache.spark.sql.catalyst.rules.Rule
+
+/**
+ * Optimizer rule that swaps the [[TableFileCatalog]] of a filtered, partitioned
+ * [[HadoopFsRelation]] for a catalog restricted to the partitions selected by those filters
+ * that reference partition columns only. The plan is left unchanged when no filter references
+ * partition columns only.
+ */
+private[sql] object PruneFileSourcePartitions extends Rule[LogicalPlan] {
+  override def apply(plan: LogicalPlan): LogicalPlan = plan transformDown {
+    case op @ PhysicalOperation(projects, filters,
+        logicalRelation @ LogicalRelation(
+          fsRelation @ HadoopFsRelation(
+            tableFileCatalog: TableFileCatalog, partitionSchema, _, _, _, _),
+          _,
+          _))
+        if filters.nonEmpty && fsRelation.partitionSchemaOption.isDefined =>
+      // Attribute names in the predicates may differ from the names in the schema when name
+      // resolution is case insensitive. Rename them to the schema's names so that we do not
+      // need to worry about case sensitivity anymore.
+      val normalizedFilters = filters.map { predicate =>
+        predicate transform {
+          case a: AttributeReference =>
+            // NOTE(review): `.get` assumes `a` is part of the relation's output.
+            val schemaAttribute = logicalRelation.output.find(_.semanticEquals(a)).get
+            a.withName(schemaAttribute.name)
+        }
+      }
+
+      val sparkSession = fsRelation.sparkSession
+      val partitionColumns =
+        logicalRelation.resolve(partitionSchema, sparkSession.sessionState.analyzer.resolver)
+      val partitionAttributes = AttributeSet(partitionColumns)
+      val partitionKeyFilters =
+        ExpressionSet(normalizedFilters.filter(_.references.subsetOf(partitionAttributes)))
+
+      if (partitionKeyFilters.isEmpty) {
+        op
+      } else {
+        val prunedFileCatalog = tableFileCatalog.filterPartitions(partitionKeyFilters.toSeq)
+        val prunedFsRelation = fsRelation.copy(location = prunedFileCatalog)(sparkSession)
+        val prunedLogicalRelation = logicalRelation.copy(relation = prunedFsRelation)
+
+        // Keep partition-pruning predicates so that they are visible in physical planning
+        val filterExpression = filters.reduceLeft(And)
+        val filter = Filter(filterExpression, prunedLogicalRelation)
+        Project(projects, filter)
+      }
+  }
+}

From 8b24eada4a0b49f39d16570ee86f52ddc1682251 Mon Sep 17 00:00:00 2001
From: Michael Allman <michael@videoamp.com>
Date: Fri, 7 Oct 2016 17:15:11 -0700
Subject: [PATCH 03/99] Include the type of file catalog in the
 FileSourceScanExec metadata

---
 .../sql/execution/DataSourceScanExec.scala      | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala
index 808f2052c48b3..4065483262ec8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala
@@ -225,13 +225,16 @@ case class FileSourceScanExec(
   }
 
   // These metadata values make scan plans uniquely identifiable for equality checking.
-  override val metadata: Map[String, String] = Map(
-    "Format" -> relation.fileFormat.toString,
-    "ReadSchema" -> outputSchema.catalogString,
-    "Batched" -> supportsBatch.toString,
-    "PartitionFilters" -> partitionFilters.mkString("[", ", ", "]"),
-    "PushedFilters" -> dataFilters.mkString("[", ", ", "]"),
-    "RootPaths" -> relation.location.rootPaths.mkString(", "))
+  override val metadata: Map[String, String] = {
+    def seqToString(seq: Seq[Any]) = seq.mkString("[", ", ", "]")
+    val location = relation.location
+    Map("Format" -> relation.fileFormat.toString,
+      "ReadSchema" -> outputSchema.catalogString,
+      "Batched" -> supportsBatch.toString,
+      "PartitionFilters" -> seqToString(partitionFilters),
+      "PushedFilters" -> seqToString(dataFilters),
+      "Location" -> (location.getClass.getSimpleName + seqToString(location.rootPaths)))
+  }
 
   private lazy val inputRDD: RDD[InternalRow] = {
     val readFile: (PartitionedFile) => Iterator[InternalRow] =

From f82f0d228141dd026b0b631e8d984961ee8b827b Mon Sep 17 00:00:00 2001
From: Michael Allman <michael@videoamp.com>
Date: Fri, 7 Oct 2016 17:15:54 -0700
Subject: [PATCH 04/99] TODO: Consider renaming FileCatalog to better
 differentiate it from BasicFileCatalog (or vice-versa)

---
 .../spark/sql/execution/datasources/fileSourceInterfaces.scala  | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala
index 7f696dba557de..4c0943c1d5aef 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala
@@ -345,6 +345,8 @@ trait BasicFileCatalog {
  * A [[BasicFileCatalog]] which can enumerate all of the files comprising a relation and, from
  * those, infer the relation's partition specification.
  */
+// TODO: Consider a more descriptive, appropriate name which suggests this is a file catalog for
+// which it is safe to list all of its files?
 trait FileCatalog extends BasicFileCatalog {
 
   /** Returns the specification of the partitions inferred from the data. */

From 198dd9457fad08516f65ea1bcfa6edf4af17d948 Mon Sep 17 00:00:00 2001
From: Michael Allman <michael@videoamp.com>
Date: Tue, 11 Oct 2016 10:53:13 -0700
Subject: [PATCH 05/99] Refactor the FileSourceScanExec.metadata val to make it
 prettier

---
 .../apache/spark/sql/execution/DataSourceScanExec.scala    | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala
index 4065483262ec8..ee61f7f0413da 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala
@@ -228,12 +228,15 @@ case class FileSourceScanExec(
   override val metadata: Map[String, String] = {
     def seqToString(seq: Seq[Any]) = seq.mkString("[", ", ", "]")
     val location = relation.location
-    Map("Format" -> relation.fileFormat.toString,
+    val locationDesc =
+      location.getClass.getSimpleName + seqToString(location.rootPaths)
+    Map(
+      "Format" -> relation.fileFormat.toString,
       "ReadSchema" -> outputSchema.catalogString,
       "Batched" -> supportsBatch.toString,
       "PartitionFilters" -> seqToString(partitionFilters),
       "PushedFilters" -> seqToString(dataFilters),
-      "Location" -> (location.getClass.getSimpleName + seqToString(location.rootPaths)))
+      "Location" -> locationDesc)
   }
 
   private lazy val inputRDD: RDD[InternalRow] = {

From 1f0d5d88538da058e474098eabba53d387f70f53 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Mon, 10 Oct 2016 19:54:53 -0700
Subject: [PATCH 06/99] try out parquet case insensitive fallback

---
 .../parquet/ParquetReadSupport.scala          |  6 +++-
 .../parquet/ParquetSchemaSuite.scala          | 28 +++++++++++++++++++
 2 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala
index f1a35dd8a6200..4dea8cf29ec58 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala
@@ -269,11 +269,15 @@ private[parquet] object ParquetReadSupport {
    */
   private def clipParquetGroupFields(
       parquetRecord: GroupType, structType: StructType): Seq[Type] = {
-    val parquetFieldMap = parquetRecord.getFields.asScala.map(f => f.getName -> f).toMap
+    val parquetFieldMap = parquetRecord.getFields.asScala
+      .map(f => f.getName -> f).toMap
+    val caseInsensitiveParquetFieldMap = parquetRecord.getFields.asScala
+      .map(f => f.getName.toLowerCase -> f).toMap
     val toParquet = new ParquetSchemaConverter(writeLegacyParquetFormat = false)
     structType.map { f =>
       parquetFieldMap
         .get(f.name)
+        .orElse(caseInsensitiveParquetFieldMap.get(f.name.toLowerCase))
         .map(clipParquetType(_, f.dataType))
         .getOrElse(toParquet.convertField(f))
     }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala
index 8a980a7eb538f..c3d202ced24c8 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala
@@ -1080,6 +1080,34 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
     }
   }
 
+  testSchemaClipping(
+    "falls back to case insensitive resolution",
+
+    parquetSchema =
+      """message root {
+        |  required group A {
+        |    optional int32 B;
+        |  }
+        |  optional int32 c;
+        |}
+      """.stripMargin,
+
+    catalystSchema = {
+      val nestedType = new StructType().add("b", IntegerType, nullable = true)
+      new StructType()
+        .add("a", nestedType, nullable = true)
+        .add("c", IntegerType, nullable = true)
+    },
+
+    expectedSchema =
+      """message root {
+        |  required group A {
+        |    optional int32 B;
+        |  }
+        |  optional int32 c;
+        |}
+      """.stripMargin)
+
   testSchemaClipping(
     "simple nested struct",
 

From 59de5ca2c8b209a190dc0c6082fc6e2d2de0096b Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Tue, 11 Oct 2016 16:03:18 -0700
Subject: [PATCH 07/99] fix and add test for input files

---
 .../scala/org/apache/spark/sql/Dataset.scala  |  4 +--
 .../datasources/TableFileCatalog.scala        |  3 +-
 .../datasources/fileSourceInterfaces.scala    | 11 +++++--
 .../spark/sql/hive/HiveDataFrameSuite.scala   | 29 ++++++++++++++++++-
 4 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index 90897ac5d7b50..9b9f54e046fbf 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -2602,9 +2602,7 @@ class Dataset[T] private[sql](
    * @since 2.0.0
    */
   def inputFiles: Array[String] = {
-    val files: Seq[String] = logicalPlan.collect {
-      case LogicalRelation(HadoopFsRelation(location: FileCatalog, _, _, _, _, _), _, _) =>
-        location.inputFiles
+    val files: Seq[String] = queryExecution.optimizedPlan.collect {
       case LogicalRelation(fsBasedRelation: FileRelation, _, _) =>
         fsBasedRelation.inputFiles
       case fr: FileRelation =>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
index d90ce19869e46..2ca179f84e7f1 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
@@ -72,7 +72,6 @@ class TableFileCatalog(
 
   override def refresh(): Unit = {}
 
-
   /**
    * Returns a [[ListingFileCatalog]] for this table restricted to the subset of partitions
    * specified by the given partition-pruning filters.
@@ -97,6 +96,8 @@ class TableFileCatalog(
     new ListingFileCatalog(sparkSession, rootPaths, parameters, partitionSchema)
   }
 
+  override def inputFiles: Array[String] = filterPartitions(Nil).inputFiles
+
   private def listDataLeafFiles(paths: Seq[Path]) =
     listLeafFiles(paths).filter(f => isDataPath(f.getPath))
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala
index 4c0943c1d5aef..c72410815cc5c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala
@@ -28,6 +28,7 @@ import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
 import org.apache.spark.sql.catalyst.catalog.BucketSpec
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection
+import org.apache.spark.sql.execution.FileRelation
 import org.apache.spark.sql.sources.{BaseRelation, DataSourceRegister, Filter}
 import org.apache.spark.sql.types.StructType
 
@@ -134,7 +135,7 @@ case class HadoopFsRelation(
     bucketSpec: Option[BucketSpec],
     fileFormat: FileFormat,
     options: Map[String, String])(val sparkSession: SparkSession)
-  extends BaseRelation {
+  extends BaseRelation with FileRelation {
 
   override def sqlContext: SQLContext = sparkSession.sqlContext
 
@@ -156,6 +157,8 @@ case class HadoopFsRelation(
   }
 
   override def sizeInBytes: Long = location.sizeInBytes
+
+  override def inputFiles: Array[String] = location.inputFiles
 }
 
 /**
@@ -334,6 +337,9 @@ trait BasicFileCatalog {
    */
   def listFiles(filters: Seq[Expression]): Seq[Partition]
 
+  /** Returns the list of files that will be read when scanning this relation. */
+  def inputFiles: Array[String]
+
   /** Refresh any cached file listings */
   def refresh(): Unit
 
@@ -355,8 +361,7 @@ trait FileCatalog extends BasicFileCatalog {
   /** Returns all the valid files. */
   def allFiles(): Seq[FileStatus]
 
-  /** Returns the list of files that will be read when scanning this relation. */
-  def inputFiles: Array[String] =
+  override def inputFiles: Array[String] =
     allFiles().map(_.getPath.toUri.toString).toArray
 
   override def sizeInBytes: Long = allFiles().map(_.getLen).sum
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala
index 96e9054cd4876..3771a295ce940 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala
@@ -18,9 +18,10 @@
 package org.apache.spark.sql.hive
 
 import org.apache.spark.sql.hive.test.TestHiveSingleton
+import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.QueryTest
 
-class HiveDataFrameSuite extends QueryTest with TestHiveSingleton {
+class HiveDataFrameSuite extends QueryTest with TestHiveSingleton with SQLTestUtils {
   test("table name with schema") {
     // regression test for SPARK-11778
     spark.sql("create schema usrdb")
@@ -34,4 +35,30 @@ class HiveDataFrameSuite extends QueryTest with TestHiveSingleton {
     val hiveClient = spark.sharedState.externalCatalog.asInstanceOf[HiveExternalCatalog].client
     assert(hiveClient.getConf("hive.in.test", "") == "true")
   }
+
+  test("inputFiles of pruned and partitioned table") {
+    withTable("test") {
+      withTempDir { dir =>
+        spark.range(5).selectExpr("id", "id as f1", "id as f2").write
+          .partitionBy("f1", "f2")
+          .mode("overwrite")
+          .parquet(dir.getAbsolutePath)
+
+        spark.sql(s"""
+          |create external table test (id long)
+          |partitioned by (f1 int, f2 int)
+          |stored as parquet
+          |location "${dir.getAbsolutePath}"""".stripMargin)
+        spark.sql("msck repair table test")
+
+        val df = spark.sql("select * from test")
+        assert(df.count() == 5)
+        assert(df.inputFiles.length == 5)  // unpruned
+
+        val df2 = spark.sql("select * from test where f2 = 3 or f2 = 4")
+        assert(df2.count() == 2)
+        assert(df2.inputFiles.length == 2)  // pruned, so we have less files
+      }
+    }
+  }
 }

From 3b51624263cfcedd3e51b71342b940592a5f6118 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Tue, 11 Oct 2016 16:09:06 -0700
Subject: [PATCH 08/99] rename test

---
 .../scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala
index 3771a295ce940..c5d234bf20e31 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala
@@ -36,7 +36,7 @@ class HiveDataFrameSuite extends QueryTest with TestHiveSingleton with SQLTestUt
     assert(hiveClient.getConf("hive.in.test", "") == "true")
   }
 
-  test("inputFiles of pruned and partitioned table") {
+  test("partitioned pruned table reports only selected files") {
     withTable("test") {
       withTempDir { dir =>
         spark.range(5).selectExpr("id", "id as f1", "id as f2").write

From acc84f07f53d3c87c5637636e69b1c564421484a Mon Sep 17 00:00:00 2001
From: Michael Allman <michael@videoamp.com>
Date: Tue, 11 Oct 2016 12:00:43 -0700
Subject: [PATCH 09/99] Refactor `TableFileCatalog.listFiles` to call
 `listDataLeafFiles` once instead of once per partition

---
 .../sql/execution/datasources/TableFileCatalog.scala | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
index 2ca179f84e7f1..3e2950bffb4d6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
@@ -55,10 +55,16 @@ class TableFileCatalog(
 
   override def listFiles(filters: Seq[Expression]): Seq[Partition] = partitionSchema match {
     case Some(partitionSchema) =>
-      externalCatalog.listPartitionsByFilter(db, table, filters).flatMap {
+      val catalogTablePartitions = externalCatalog.listPartitionsByFilter(db, table, filters)
+      val partitionPaths = catalogTablePartitions.flatMap {
         case CatalogTablePartition(spec, storage, _) =>
-          storage.locationUri.map(new Path(_)).map { path =>
-            val files = listDataLeafFiles(path :: Nil).toSeq
+          storage.locationUri.map(new Path(_))
+      }
+      val dataLeafFiles = listDataLeafFiles(partitionPaths).toSeq
+      catalogTablePartitions.flatMap {
+        case CatalogTablePartition(spec, storage, _) =>
+          storage.locationUri.map(new Path(_)).map { partitionPath =>
+            val files = dataLeafFiles.filter(_.getPath.getParent == partitionPath)
             val values =
               InternalRow.fromSeq(partitionSchema.map { case StructField(name, dataType, _, _) =>
                 Cast(Literal(spec(name)), dataType).eval()

From f94863dd386a8654986a1fde09e5d87ded97a6e3 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Wed, 12 Oct 2016 18:09:02 -0700
Subject: [PATCH 10/99] Build pruned file catalog from metastore partition spec

---
 .../sql/catalyst/catalog/interface.scala      | 15 ++-
 .../datasources/TableFileCatalog.scala        | 93 ++++++++++++-------
 .../spark/sql/hive/HiveExternalCatalog.scala  |  8 +-
 .../spark/sql/hive/HiveDataFrameSuite.scala   | 12 ++-
 .../apache/spark/sql/hive/parquetSuites.scala |  3 +
 5 files changed, 83 insertions(+), 48 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
index 51326ca25e9cc..1a57a7707caa1 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
@@ -20,11 +20,11 @@ package org.apache.spark.sql.catalyst.catalog
 import java.util.Date
 
 import org.apache.spark.sql.AnalysisException
-import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier}
-import org.apache.spark.sql.catalyst.expressions.Attribute
+import org.apache.spark.sql.catalyst.{FunctionIdentifier, InternalRow, TableIdentifier}
+import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Literal}
 import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics}
 import org.apache.spark.sql.catalyst.util.quoteIdentifier
-import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.types.{StructField, StructType}
 
 
 /**
@@ -97,6 +97,15 @@ case class CatalogTablePartition(
 
     output.filter(_.nonEmpty).mkString("CatalogPartition(\n\t", "\n\t", ")")
   }
+
+  /**
+   * Given the partition schema, returns a row with that schema holding the partition values.
+   */
+  def toRow(partitionSchema: StructType): InternalRow = {
+    InternalRow.fromSeq(partitionSchema.map { case StructField(name, dataType, _, _) =>
+      Cast(Literal(spec(name)), dataType).eval()
+    })
+  }
 }
 
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
index 3e2950bffb4d6..ef531efd0f649 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
@@ -53,27 +53,8 @@ class TableFileCatalog(
 
   override def rootPaths: Seq[Path] = baseLocation.map(new Path(_)).toSeq
 
-  override def listFiles(filters: Seq[Expression]): Seq[Partition] = partitionSchema match {
-    case Some(partitionSchema) =>
-      val catalogTablePartitions = externalCatalog.listPartitionsByFilter(db, table, filters)
-      val partitionPaths = catalogTablePartitions.flatMap {
-        case CatalogTablePartition(spec, storage, _) =>
-          storage.locationUri.map(new Path(_))
-      }
-      val dataLeafFiles = listDataLeafFiles(partitionPaths).toSeq
-      catalogTablePartitions.flatMap {
-        case CatalogTablePartition(spec, storage, _) =>
-          storage.locationUri.map(new Path(_)).map { partitionPath =>
-            val files = dataLeafFiles.filter(_.getPath.getParent == partitionPath)
-            val values =
-              InternalRow.fromSeq(partitionSchema.map { case StructField(name, dataType, _, _) =>
-                Cast(Literal(spec(name)), dataType).eval()
-              })
-            Partition(values, files)
-          }
-      }
-    case None =>
-      Partition(InternalRow.empty, listDataLeafFiles(rootPaths).toSeq) :: Nil
+  override def listFiles(filters: Seq[Expression]): Seq[Partition] = {
+    filterPartitions(filters).listFiles(Nil)
   }
 
   override def refresh(): Unit = {}
@@ -85,25 +66,67 @@ class TableFileCatalog(
    * @param filters partition-pruning filters
    */
   def filterPartitions(filters: Seq[Expression]): ListingFileCatalog = {
-    val rootPaths = partitionSchema match {
-      case Some(_) =>
-        externalCatalog
-          .listPartitionsByFilter(db, table, filters)
-          .flatMap(_.storage.locationUri)
-          .map(new Path(_))
-      case None =>
-        this.rootPaths
+    if (filters.isEmpty) {
+      cachedAllPartitions
+    } else {
+      filterPartitions0(filters)
     }
-    val parameters =
-      baseLocation
-        .map(loc => Map(PartitioningAwareFileCatalog.BASE_PATH_PARAM -> loc))
-        .getOrElse(Map.empty)
+  }
 
-    new ListingFileCatalog(sparkSession, rootPaths, parameters, partitionSchema)
+  private def filterPartitions0(filters: Seq[Expression]): ListingFileCatalog = {
+    val parameters = baseLocation
+      .map(loc => Map(PartitioningAwareFileCatalog.BASE_PATH_PARAM -> loc))
+      .getOrElse(Map.empty)
+    partitionSchema match {
+      case Some(schema) =>
+        val selectedPartitions = externalCatalog.listPartitionsByFilter(db, table, filters)
+        val partitions = selectedPartitions.map { p =>
+          PartitionDirectory(p.toRow(schema), p.storage.locationUri.get)
+        }
+        val partitionSpec = PartitionSpec(schema, partitions)
+        new PrunedTableFileCatalog(
+          sparkSession, new Path(baseLocation.get), partitionSpec)
+      case None =>
+        new ListingFileCatalog(sparkSession, rootPaths, parameters, None)
+    }
   }
 
-  override def inputFiles: Array[String] = filterPartitions(Nil).inputFiles
+  // Not used in the hot path of queries when metastore partition pruning is enabled
+  lazy val cachedAllPartitions: ListingFileCatalog = filterPartitions0(Nil)
+
+  override def inputFiles: Array[String] = cachedAllPartitions.inputFiles
 
   private def listDataLeafFiles(paths: Seq[Path]) =
     listLeafFiles(paths).filter(f => isDataPath(f.getPath))
 }
+
+/**
+ * An override of the standard HDFS listing based catalog, that overrides the partition spec with
+ * the information from the metastore.
+ *
+ * @param tableBasePath The default base path of the Hive metastore table
+ * @param partitionSpec The partition specifications from Hive metastore
+ */
+private class PrunedTableFileCatalog(
+    sparkSession: SparkSession,
+    tableBasePath: Path,
+    override val partitionSpec: PartitionSpec)
+  extends ListingFileCatalog(
+    sparkSession,
+    PrunedTableFileCatalog.getPaths(tableBasePath, partitionSpec),
+    Map.empty,
+    Some(partitionSpec.partitionColumns)) {
+}
+
+object PrunedTableFileCatalog {
+  /** Get the list of paths to list files in the for a metastore table */
+  def getPaths(tableBasePath: Path, partitionSpec: PartitionSpec): Seq[Path] = {
+    // If there are no partitions currently specified then use base path,
+    // otherwise use the paths corresponding to the partitions.
+    if (partitionSpec.partitions.isEmpty) {
+      Seq(tableBasePath)
+    } else {
+      partitionSpec.partitions.map(_.path)
+    }
+  }
+}
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
index 115c0d9c7d576..61d110495a065 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
@@ -653,12 +653,8 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
             val index = partitionSchema.indexWhere(_.name == att.name)
             BoundReference(index, partitionSchema(index).dataType, nullable = true)
         })
-      clientPrunedPartitions.filter { case CatalogTablePartition(spec, _, _) =>
-        val row =
-          InternalRow.fromSeq(partitionSchema.map { case StructField(name, dataType, _, _) =>
-            Cast(Literal(spec(name)), dataType).eval()
-          })
-        boundPredicate(row)
+      clientPrunedPartitions.filter { case p: CatalogTablePartition =>
+        boundPredicate(p.toRow(partitionSchema))
       }
     } else {
       client.getPartitions(catalogTable)
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala
index c5d234bf20e31..8e49baf34064e 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala
@@ -39,14 +39,14 @@ class HiveDataFrameSuite extends QueryTest with TestHiveSingleton with SQLTestUt
   test("partitioned pruned table reports only selected files") {
     withTable("test") {
       withTempDir { dir =>
-        spark.range(5).selectExpr("id", "id as f1", "id as f2").write
-          .partitionBy("f1", "f2")
+        spark.range(5).selectExpr("id", "id as partCol1", "id as partCol2").write
+          .partitionBy("partCol1", "partCol2")
           .mode("overwrite")
           .parquet(dir.getAbsolutePath)
 
         spark.sql(s"""
           |create external table test (id long)
-          |partitioned by (f1 int, f2 int)
+          |partitioned by (partCol1 int, partCol2 int)
           |stored as parquet
           |location "${dir.getAbsolutePath}"""".stripMargin)
         spark.sql("msck repair table test")
@@ -55,9 +55,13 @@ class HiveDataFrameSuite extends QueryTest with TestHiveSingleton with SQLTestUt
         assert(df.count() == 5)
         assert(df.inputFiles.length == 5)  // unpruned
 
-        val df2 = spark.sql("select * from test where f2 = 3 or f2 = 4")
+        val df2 = spark.sql("select * from test where partCol1 = 3 or partCol2 = 4")
         assert(df2.count() == 2)
         assert(df2.inputFiles.length == 2)  // pruned, so we have less files
+
+        val df3 = spark.sql("select * from test where PARTCOL1 = 3 or partcol2 = 4")
+        assert(df3.count() == 2)
+        assert(df3.inputFiles.length == 2)
       }
     }
   }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
index 2f6d9fb96b825..8f21c3d38a32f 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
@@ -586,6 +586,9 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest {
         checkAnswer(
           sql("SELECT * FROM test_added_partitions"),
           Seq(("foo", 0), ("bar", 0), ("baz", 1)).toDF("a", "b"))
+
+        // Also verify the inputFiles implementation
+        assert(sql("select * from test_added_partitions").inputFiles.length == 2)
       }
     }
   }

From 022d5b9873018dad8ac08646704f567176977877 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Wed, 12 Oct 2016 18:26:23 -0700
Subject: [PATCH 11/99] Add pruning-predicate test cases for added partitions

---
 .../scala/org/apache/spark/sql/hive/parquetSuites.scala   | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
index 8f21c3d38a32f..6b66ea1d82607 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
@@ -587,6 +587,14 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest {
           sql("SELECT * FROM test_added_partitions"),
           Seq(("foo", 0), ("bar", 0), ("baz", 1)).toDF("a", "b"))
 
+        // Check it with pruning predicates
+        checkAnswer(
+          sql("SELECT * FROM test_added_partitions where b = 1"),
+          Seq(("baz", 1)).toDF("a", "b"))
+        checkAnswer(
+          sql("SELECT * FROM test_added_partitions where b = 0"),
+          Seq(("foo", 0), ("bar", 0)).toDF("a", "b"))
+
         // Also verify the inputFiles implementation
         assert(sql("select * from test_added_partitions").inputFiles.length == 2)
       }

From 8bd27be814f7721f3764364c72b33c7f67e0e9ff Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Wed, 12 Oct 2016 18:46:41 -0700
Subject: [PATCH 12/99] also fix a bug with zero partitions selected

---
 .../datasources/TableFileCatalog.scala         | 18 ++----------------
 .../spark/sql/hive/HiveDataFrameSuite.scala    |  4 ++++
 .../apache/spark/sql/hive/parquetSuites.scala  | 10 ++++++++--
 3 files changed, 14 insertions(+), 18 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
index ef531efd0f649..b66ad90354b7c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
@@ -113,20 +113,6 @@ private class PrunedTableFileCatalog(
     override val partitionSpec: PartitionSpec)
   extends ListingFileCatalog(
     sparkSession,
-    PrunedTableFileCatalog.getPaths(tableBasePath, partitionSpec),
+    partitionSpec.partitions.map(_.path),
     Map.empty,
-    Some(partitionSpec.partitionColumns)) {
-}
-
-object PrunedTableFileCatalog {
-  /** Get the list of paths to list files in the for a metastore table */
-  def getPaths(tableBasePath: Path, partitionSpec: PartitionSpec): Seq[Path] = {
-    // If there are no partitions currently specified then use base path,
-    // otherwise use the paths corresponding to the partitions.
-    if (partitionSpec.partitions.isEmpty) {
-      Seq(tableBasePath)
-    } else {
-      partitionSpec.partitions.map(_.path)
-    }
-  }
-}
+    Some(partitionSpec.partitionColumns))
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala
index 8e49baf34064e..6acbdbd25c4ee 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala
@@ -62,6 +62,10 @@ class HiveDataFrameSuite extends QueryTest with TestHiveSingleton with SQLTestUt
         val df3 = spark.sql("select * from test where PARTCOL1 = 3 or partcol2 = 4")
         assert(df3.count() == 2)
         assert(df3.inputFiles.length == 2)
+
+        val df4 = spark.sql("select * from test where partCol1 = 999")
+        assert(df4.count() == 0)
+        assert(df4.inputFiles.length == 0)
       }
     }
   }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
index 6b66ea1d82607..4b85aac851748 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
@@ -588,15 +588,21 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest {
           Seq(("foo", 0), ("bar", 0), ("baz", 1)).toDF("a", "b"))
 
         // Check it with pruning predicates
+        checkAnswer(
+          sql("SELECT * FROM test_added_partitions where b = 0"),
+          Seq(("foo", 0), ("bar", 0)).toDF("a", "b"))
         checkAnswer(
           sql("SELECT * FROM test_added_partitions where b = 1"),
           Seq(("baz", 1)).toDF("a", "b"))
         checkAnswer(
-          sql("SELECT * FROM test_added_partitions where b = 0"),
-          Seq(("foo", 0), ("bar", 0)).toDF("a", "b"))
+          sql("SELECT * FROM test_added_partitions where b = 2"),
+          Seq[(String, Int)]().toDF("a", "b"))
 
         // Also verify the inputFiles implementation
         assert(sql("select * from test_added_partitions").inputFiles.length == 2)
+        assert(sql("select * from test_added_partitions where b = 0").inputFiles.length == 1)
+        assert(sql("select * from test_added_partitions where b = 1").inputFiles.length == 1)
+        assert(sql("select * from test_added_partitions where b = 2").inputFiles.length == 0)
       }
     }
   }

From 0958bcd8f088d5641fc78952b8265ce05232c3f9 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Wed, 12 Oct 2016 13:20:11 -0700
Subject: [PATCH 13/99] Add spark.sql.hive.datasourcePartitionPruning feature flag

---
 .../PruneFileSourcePartitions.scala           | 16 ++++-----
 .../datasources/fileSourceInterfaces.scala    |  7 ++--
 .../apache/spark/sql/internal/SQLConf.scala   |  9 +++++
 .../spark/sql/hive/HiveMetastoreCatalog.scala | 35 +++++++++++++++++--
 .../sql/hive/HiveMetadataCacheSuite.scala     | 35 +++++++++++++++++++
 5 files changed, 88 insertions(+), 14 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala
index b8af0f53423bd..29121a47d92d1 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala
@@ -56,15 +56,15 @@ private[sql] object PruneFileSourcePartitions extends Rule[LogicalPlan] {
         ExpressionSet(normalizedFilters.filter(_.references.subsetOf(partitionSet)))
 
       if (partitionKeyFilters.nonEmpty) {
-          val prunedFileCatalog = tableFileCatalog.filterPartitions(partitionKeyFilters.toSeq)
-          val prunedFsRelation =
-            fsRelation.copy(location = prunedFileCatalog)(sparkSession)
-          val prunedLogicalRelation = logicalRelation.copy(relation = prunedFsRelation)
+        val prunedFileCatalog = tableFileCatalog.filterPartitions(partitionKeyFilters.toSeq)
+        val prunedFsRelation =
+          fsRelation.copy(location = prunedFileCatalog)(sparkSession)
+        val prunedLogicalRelation = logicalRelation.copy(relation = prunedFsRelation)
 
-          // Keep partition-pruning predicates so that they are visible in physical planning
-          val filterExpression = filters.reduceLeft(And)
-          val filter = Filter(filterExpression, prunedLogicalRelation)
-          Project(projects, filter)
+        // Keep partition-pruning predicates so that they are visible in physical planning
+        val filterExpression = filters.reduceLeft(And)
+        val filter = Filter(filterExpression, prunedLogicalRelation)
+        Project(projects, filter)
       } else {
         op
       }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala
index c72410815cc5c..648b638a4956e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala
@@ -320,9 +320,9 @@ case class Partition(values: InternalRow, files: Seq[FileStatus])
 trait BasicFileCatalog {
 
   /**
-   * Returns the list of root input paths from which the catalog will get files. These paths
-   * should *not* include any table partition directories. Partition directories are discovered or
-   * provided by a metastore catalog.
+   * Returns the list of root input paths from which the catalog will get files. There may be a
+   * single root path from which partitions are discovered, or individual partitions may be
+   * specified by each path.
    */
   def rootPaths: Seq[Path]
 
@@ -361,6 +361,7 @@ trait FileCatalog extends BasicFileCatalog {
   /** Returns all the valid files. */
   def allFiles(): Seq[FileStatus]
 
+  /** Returns the list of files that will be read when scanning this relation. */
   override def inputFiles: Array[String] =
     allFiles().map(_.getPath.toUri.toString).toArray
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index e671604c39855..b8a7034334c86 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -269,6 +269,13 @@ object SQLConf {
       .booleanConf
       .createWithDefault(false)
 
+  val HIVE_DATASOURCE_PARTITION_PRUNING =
+    SQLConfigBuilder("spark.sql.hive.datasourcePartitionPruning")
+      .doc("When true, enable metastore partition pruning for Datasource tables as well. " +
+           "This is currently implemented for converted Hive tables only.")
+      .booleanConf
+      .createWithDefault(true)
+
   val OPTIMIZER_METADATA_ONLY = SQLConfigBuilder("spark.sql.optimizer.metadataOnly")
     .doc("When true, enable the metadata-only query optimization that use the table's metadata " +
       "to produce the partition columns instead of table scans. It applies when all the columns " +
@@ -675,6 +682,8 @@ private[sql] class SQLConf extends Serializable with CatalystConf with Logging {
 
   def metastorePartitionPruning: Boolean = getConf(HIVE_METASTORE_PARTITION_PRUNING)
 
+  def datasourcePartitionPruning: Boolean = getConf(HIVE_DATASOURCE_PARTITION_PRUNING)
+
   def gatherFastStats: Boolean = getConf(GATHER_FASTSTAT)
 
   def optimizerMetadataOnly: Boolean = getConf(OPTIMIZER_METADATA_ONLY)
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index e7c14940b2158..eaeddcd3bea84 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -136,6 +136,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
 
   private def getCached(
       tableIdentifier: QualifiedTableName,
+      pathsInMetastore: Seq[String],
       metastoreRelation: MetastoreRelation,
       schemaInMetastore: StructType,
       expectedFileFormat: Class[_ <: FileFormat],
@@ -153,7 +154,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
             // If we have the same paths, same schema, and same partition spec,
             // we will use the cached relation.
             val useCached =
-              relation.location.rootPaths.toSet == Set(metastoreRelationRootPath) &&
+              relation.location.rootPaths.map(_.toString).toSet == pathsInMetastore.toSet &&
                 logical.schema.sameType(schemaInMetastore) &&
                 relation.bucketSpec == expectedBucketSpec &&
                 relation.partitionSchema == partitionSchema.getOrElse(StructType(Nil))
@@ -195,11 +196,31 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
       QualifiedTableName(metastoreRelation.databaseName, metastoreRelation.tableName)
     val bucketSpec = None  // We don't support hive bucketed tables, only ones we write out.
 
+    val lazyPruningEnabled = sparkSession.sqlContext.conf.datasourcePartitionPruning
     val result = if (metastoreRelation.hiveQlTable.isPartitioned) {
       val partitionSchema = StructType.fromAttributes(metastoreRelation.partitionKeys)
 
+      val rootPaths = if (lazyPruningEnabled) {
+        Seq(metastoreRelation.hiveQlTable.getDataLocation.toString)
+      } else {
+        // By convention (for example, see TableFileCatalog), the definition of a
+        // partitioned table's paths depends on whether that table has any actual partitions.
+        // Partitioned tables without partitions use the location of the table's base path.
+        // Partitioned tables with partitions use the locations of those partitions' data
+        // locations,_omitting_ the table's base path.
+        val paths = metastoreRelation.getHiveQlPartitions().map { p =>
+          p.getLocation.toString
+        }
+        if (paths.isEmpty) {
+          Seq(metastoreRelation.hiveQlTable.getDataLocation.toString)
+        } else {
+          paths
+        }
+      }
+
       val cached = getCached(
         tableIdentifier,
+        rootPaths,
         metastoreRelation,
         metastoreSchema,
         fileFormatClass,
@@ -210,8 +231,15 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
         val db = metastoreRelation.databaseName
         val table = metastoreRelation.tableName
         val sizeInBytes = metastoreRelation.statistics.sizeInBytes.toLong
-        val fileCatalog =
-          new TableFileCatalog(sparkSession, db, table, Some(partitionSchema), sizeInBytes)
+        val fileCatalog = {
+          val catalog = new TableFileCatalog(
+            sparkSession, db, table, Some(partitionSchema), sizeInBytes)
+          if (lazyPruningEnabled) {
+            catalog
+          } else {
+            catalog.cachedAllPartitions
+          }
+        }
         val partitionSchemaColumnNames = partitionSchema.map(_.name.toLowerCase).toSet
         val dataSchema =
           StructType(metastoreSchema
@@ -235,6 +263,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
       val rootPath = metastoreRelation.hiveQlTable.getDataLocation
 
       val cached = getCached(tableIdentifier,
+        Seq(rootPath.toString),
         metastoreRelation,
         metastoreSchema,
         fileFormatClass,
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala
index 3414f5e0409a1..8b56bbee56e1c 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala
@@ -59,4 +59,39 @@ class HiveMetadataCacheSuite extends QueryTest with SQLTestUtils with TestHiveSi
       }
     }
   }
+
+  test("partitioned table is cached when partition pruning is off") {
+    withSQLConf("spark.sql.hive.datasourcePartitionPruning" -> "false") {
+      withTable("test") {
+        withTempDir { dir =>
+          spark.range(5).selectExpr("id", "id as f1", "id as f2").write
+            .partitionBy("f1", "f2")
+            .mode("overwrite")
+            .parquet(dir.getAbsolutePath)
+
+          spark.sql(s"""
+            |create external table test (id long)
+            |partitioned by (f1 int, f2 int)
+            |stored as parquet
+            |location "${dir.getAbsolutePath}"""".stripMargin)
+          spark.sql("msck repair table test")
+
+          val df = spark.sql("select * from test")
+          assert(sql("select * from test").count() == 5)
+
+          // Delete a file, then assert that we tried to read it. This means the table was cached.
+          val p = new Path(spark.table("test").inputFiles.head)
+          assert(p.getFileSystem(hiveContext.sessionState.newHadoopConf()).delete(p, false))
+          val e = intercept[SparkException] {
+            sql("select * from test").count()
+          }
+          assert(e.getMessage.contains("FileNotFoundException"))
+
+          // Test refreshing the cache.
+          spark.catalog.refreshTable("test")
+          assert(sql("select * from test").count() == 4)
+        }
+      }
+    }
+  }
 }

From 291cee788e1bcc3ecbd7b1a4187f8eba58e134fb Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Wed, 12 Oct 2016 15:48:03 -0700
Subject: [PATCH 14/99] Rename pruning flag to filesourcePartitionPruning;
 compare root paths as Path

---
 .../org/apache/spark/sql/internal/SQLConf.scala |  8 ++++----
 .../spark/sql/hive/HiveMetastoreCatalog.scala   | 17 ++++++++---------
 .../spark/sql/hive/HiveMetadataCacheSuite.scala |  2 +-
 3 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index b8a7034334c86..306f277e1af1a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -269,9 +269,9 @@ object SQLConf {
       .booleanConf
       .createWithDefault(false)
 
-  val HIVE_DATASOURCE_PARTITION_PRUNING =
-    SQLConfigBuilder("spark.sql.hive.datasourcePartitionPruning")
-      .doc("When true, enable metastore partition pruning for Datasource tables as well. " +
+  val HIVE_FILESOURCE_PARTITION_PRUNING =
+    SQLConfigBuilder("spark.sql.hive.filesourcePartitionPruning")
+      .doc("When true, enable metastore partition pruning for file source tables as well. " +
            "This is currently implemented for converted Hive tables only.")
       .booleanConf
       .createWithDefault(true)
@@ -682,7 +682,7 @@ private[sql] class SQLConf extends Serializable with CatalystConf with Logging {
 
   def metastorePartitionPruning: Boolean = getConf(HIVE_METASTORE_PARTITION_PRUNING)
 
-  def datasourcePartitionPruning: Boolean = getConf(HIVE_DATASOURCE_PARTITION_PRUNING)
+  def filesourcePartitionPruning: Boolean = getConf(HIVE_FILESOURCE_PARTITION_PRUNING)
 
   def gatherFastStats: Boolean = getConf(GATHER_FASTSTAT)
 
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index eaeddcd3bea84..dd15d27508c5f 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -136,7 +136,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
 
   private def getCached(
       tableIdentifier: QualifiedTableName,
-      pathsInMetastore: Seq[String],
+      pathsInMetastore: Seq[Path],
       metastoreRelation: MetastoreRelation,
       schemaInMetastore: StructType,
       expectedFileFormat: Class[_ <: FileFormat],
@@ -146,7 +146,6 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
     cachedDataSourceTables.getIfPresent(tableIdentifier) match {
       case null => None // Cache miss
       case logical @ LogicalRelation(relation: HadoopFsRelation, _, _) =>
-        val metastoreRelationRootPath = metastoreRelation.hiveQlTable.getDataLocation
         val cachedRelationFileFormatClass = relation.fileFormat.getClass
 
         expectedFileFormat match {
@@ -154,7 +153,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
             // If we have the same paths, same schema, and same partition spec,
             // we will use the cached relation.
             val useCached =
-              relation.location.rootPaths.map(_.toString).toSet == pathsInMetastore.toSet &&
+              relation.location.rootPaths.toSet == pathsInMetastore.toSet &&
                 logical.schema.sameType(schemaInMetastore) &&
                 relation.bucketSpec == expectedBucketSpec &&
                 relation.partitionSchema == partitionSchema.getOrElse(StructType(Nil))
@@ -196,12 +195,12 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
       QualifiedTableName(metastoreRelation.databaseName, metastoreRelation.tableName)
     val bucketSpec = None  // We don't support hive bucketed tables, only ones we write out.
 
-    val lazyPruningEnabled = sparkSession.sqlContext.conf.datasourcePartitionPruning
+    val lazyPruningEnabled = sparkSession.sqlContext.conf.filesourcePartitionPruning
     val result = if (metastoreRelation.hiveQlTable.isPartitioned) {
       val partitionSchema = StructType.fromAttributes(metastoreRelation.partitionKeys)
 
-      val rootPaths = if (lazyPruningEnabled) {
-        Seq(metastoreRelation.hiveQlTable.getDataLocation.toString)
+      val rootPaths: Seq[Path] = if (lazyPruningEnabled) {
+        Seq(metastoreRelation.hiveQlTable.getDataLocation)
       } else {
         // By convention (for example, see TableFileCatalog), the definition of a
         // partitioned table's paths depends on whether that table has any actual partitions.
@@ -209,10 +208,10 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
         // Partitioned tables with partitions use the locations of those partitions' data
         // locations,_omitting_ the table's base path.
         val paths = metastoreRelation.getHiveQlPartitions().map { p =>
-          p.getLocation.toString
+          new Path(p.getLocation)
         }
         if (paths.isEmpty) {
-          Seq(metastoreRelation.hiveQlTable.getDataLocation.toString)
+          Seq(metastoreRelation.hiveQlTable.getDataLocation)
         } else {
           paths
         }
@@ -263,7 +262,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
       val rootPath = metastoreRelation.hiveQlTable.getDataLocation
 
       val cached = getCached(tableIdentifier,
-        Seq(rootPath.toString),
+        Seq(rootPath),
         metastoreRelation,
         metastoreSchema,
         fileFormatClass,
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala
index 8b56bbee56e1c..c5db16e3d7194 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala
@@ -61,7 +61,7 @@ class HiveMetadataCacheSuite extends QueryTest with SQLTestUtils with TestHiveSi
   }
 
   test("partitioned table is cached when partition pruning is off") {
-    withSQLConf("spark.sql.hive.datasourcePartitionPruning" -> "false") {
+    withSQLConf("spark.sql.hive.filesourcePartitionPruning" -> "false") {
       withTable("test") {
         withTempDir { dir =>
           spark.range(5).selectExpr("id", "id as f1", "id as f2").write

From 627572e0020d313a9c1378349e2ee4ab0d0e97f1 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Thu, 13 Oct 2016 10:30:48 -0700
Subject: [PATCH 15/99] extend and fix flakiness in test

---
 .../sql/hive/HiveMetadataCacheSuite.scala     | 58 ++++++++++---------
 1 file changed, 32 insertions(+), 26 deletions(-)

diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala
index c5db16e3d7194..7af81a3a90504 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala
@@ -60,38 +60,44 @@ class HiveMetadataCacheSuite extends QueryTest with SQLTestUtils with TestHiveSi
     }
   }
 
-  test("partitioned table is cached when partition pruning is off") {
-    withSQLConf("spark.sql.hive.filesourcePartitionPruning" -> "false") {
-      withTable("test") {
-        withTempDir { dir =>
-          spark.range(5).selectExpr("id", "id as f1", "id as f2").write
-            .partitionBy("f1", "f2")
-            .mode("overwrite")
-            .parquet(dir.getAbsolutePath)
+  def testCaching(pruningEnabled: Boolean): Unit = {
+    test(s"partitioned table is cached when partition pruning is $pruningEnabled") {
+      withSQLConf("spark.sql.hive.filesourcePartitionPruning" -> pruningEnabled.toString) {
+        withTable("test") {
+          withTempDir { dir =>
+            spark.range(5).selectExpr("id", "id as f1", "id as f2").write
+              .partitionBy("f1", "f2")
+              .mode("overwrite")
+              .parquet(dir.getAbsolutePath)
 
-          spark.sql(s"""
-            |create external table test (id long)
-            |partitioned by (f1 int, f2 int)
-            |stored as parquet
-            |location "${dir.getAbsolutePath}"""".stripMargin)
-          spark.sql("msck repair table test")
+            spark.sql(s"""
+              |create external table test (id long)
+              |partitioned by (f1 int, f2 int)
+              |stored as parquet
+              |location "${dir.getAbsolutePath}"""".stripMargin)
+            spark.sql("msck repair table test")
 
-          val df = spark.sql("select * from test")
-          assert(sql("select * from test").count() == 5)
+            val df = spark.sql("select * from test")
+            assert(sql("select * from test").count() == 5)
 
-          // Delete a file, then assert that we tried to read it. This means the table was cached.
-          val p = new Path(spark.table("test").inputFiles.head)
-          assert(p.getFileSystem(hiveContext.sessionState.newHadoopConf()).delete(p, false))
-          val e = intercept[SparkException] {
-            sql("select * from test").count()
-          }
-          assert(e.getMessage.contains("FileNotFoundException"))
+            // Delete a file, then assert that we tried to read it. This means the table was cached.
+            val p = new Path(spark.table("test").inputFiles.head)
+            assert(p.getFileSystem(hiveContext.sessionState.newHadoopConf()).delete(p, true))
+            val e = intercept[SparkException] {
+              sql("select * from test").count()
+            }
+            assert(e.getMessage.contains("FileNotFoundException"))
 
-          // Test refreshing the cache.
-          spark.catalog.refreshTable("test")
-          assert(sql("select * from test").count() == 4)
+            // Test refreshing the cache.
+            spark.catalog.refreshTable("test")
+            assert(sql("select * from test").count() == 4)
+          }
         }
       }
     }
   }
+
+  for (pruningEnabled <- Seq(true, false)) {
+    testCaching(pruningEnabled)
+  }
 }

From 6d8e7ea9f904e33af4ca7372f5b31379aede9308 Mon Sep 17 00:00:00 2001
From: Michael Allman <michael@videoamp.com>
Date: Thu, 13 Oct 2016 10:55:26 -0700
Subject: [PATCH 16/99] Enhance `ParquetMetastoreSuite` with mixed-case
 partition columns

---
 .../apache/spark/sql/hive/parquetSuites.scala | 46 +++++++++----------
 1 file changed, 23 insertions(+), 23 deletions(-)

diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
index 4b85aac851748..1068d3f6f27dc 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
@@ -75,7 +75,7 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest {
         intField INT,
         stringField STRING
       )
-      PARTITIONED BY (p int)
+      PARTITIONED BY (pQ int)
       ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
        STORED AS
        INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
@@ -89,7 +89,7 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest {
         intField INT,
         stringField STRING
       )
-      PARTITIONED BY (p int)
+      PARTITIONED BY (pQ int)
       ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
        STORED AS
        INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
@@ -118,7 +118,7 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest {
         structField STRUCT<intStructField: INT, stringStructField: STRING>,
         arrayField ARRAY<INT>
       )
-      PARTITIONED BY (p int)
+      PARTITIONED BY (pQ int)
       ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
        STORED AS
        INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
@@ -134,7 +134,7 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest {
         structField STRUCT<intStructField: INT, stringStructField: STRING>,
         arrayField ARRAY<INT>
       )
-      PARTITIONED BY (p int)
+      PARTITIONED BY (pQ int)
       ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
        STORED AS
        INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
@@ -156,19 +156,19 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest {
       """.stripMargin)
 
     (1 to 10).foreach { p =>
-      sql(s"ALTER TABLE partitioned_parquet ADD PARTITION (p=$p)")
+      sql(s"ALTER TABLE partitioned_parquet ADD PARTITION (pQ=$p)")
     }
 
     (1 to 10).foreach { p =>
-      sql(s"ALTER TABLE partitioned_parquet_with_key ADD PARTITION (p=$p)")
+      sql(s"ALTER TABLE partitioned_parquet_with_key ADD PARTITION (pQ=$p)")
     }
 
     (1 to 10).foreach { p =>
-      sql(s"ALTER TABLE partitioned_parquet_with_key_and_complextypes ADD PARTITION (p=$p)")
+      sql(s"ALTER TABLE partitioned_parquet_with_key_and_complextypes ADD PARTITION (pQ=$p)")
     }
 
     (1 to 10).foreach { p =>
-      sql(s"ALTER TABLE partitioned_parquet_with_complextypes ADD PARTITION (p=$p)")
+      sql(s"ALTER TABLE partitioned_parquet_with_complextypes ADD PARTITION (pQ=$p)")
     }
 
     (1 to 10).map(i => (i, s"str$i")).toDF("a", "b").createOrReplaceTempView("jt")
@@ -828,7 +828,7 @@ abstract class ParquetPartitioningTest extends QueryTest with SQLTestUtils with
     normalTableDir = Utils.createTempDir()
 
     (1 to 10).foreach { p =>
-      val partDir = new File(partitionedTableDir, s"p=$p")
+      val partDir = new File(partitionedTableDir, s"pQ=$p")
       sparkContext.makeRDD(1 to 10)
         .map(i => ParquetData(i, s"part-$p"))
         .toDF()
@@ -844,7 +844,7 @@ abstract class ParquetPartitioningTest extends QueryTest with SQLTestUtils with
     partitionedTableDirWithKey = Utils.createTempDir()
 
     (1 to 10).foreach { p =>
-      val partDir = new File(partitionedTableDirWithKey, s"p=$p")
+      val partDir = new File(partitionedTableDirWithKey, s"pQ=$p")
       sparkContext.makeRDD(1 to 10)
         .map(i => ParquetDataWithKey(p, i, s"part-$p"))
         .toDF()
@@ -854,7 +854,7 @@ abstract class ParquetPartitioningTest extends QueryTest with SQLTestUtils with
     partitionedTableDirWithKeyAndComplexTypes = Utils.createTempDir()
 
     (1 to 10).foreach { p =>
-      val partDir = new File(partitionedTableDirWithKeyAndComplexTypes, s"p=$p")
+      val partDir = new File(partitionedTableDirWithKeyAndComplexTypes, s"pQ=$p")
       sparkContext.makeRDD(1 to 10).map { i =>
         ParquetDataWithKeyAndComplexTypes(
           p, i, s"part-$p", StructContainer(i, f"${i}_string"), 1 to i)
@@ -864,7 +864,7 @@ abstract class ParquetPartitioningTest extends QueryTest with SQLTestUtils with
     partitionedTableDirWithComplexTypes = Utils.createTempDir()
 
     (1 to 10).foreach { p =>
-      val partDir = new File(partitionedTableDirWithComplexTypes, s"p=$p")
+      val partDir = new File(partitionedTableDirWithComplexTypes, s"pQ=$p")
       sparkContext.makeRDD(1 to 10).map { i =>
         ParquetDataWithComplexTypes(i, s"part-$p", StructContainer(i, f"${i}_string"), 1 to i)
       }.toDF().write.parquet(partDir.getCanonicalPath)
@@ -898,19 +898,19 @@ abstract class ParquetPartitioningTest extends QueryTest with SQLTestUtils with
 
     test(s"ordering of the partitioning columns $table") {
       checkAnswer(
-        sql(s"SELECT p, stringField FROM $table WHERE p = 1"),
+        sql(s"SELECT pQ, stringField FROM $table WHERE pQ = 1"),
         Seq.fill(10)(Row(1, "part-1"))
       )
 
       checkAnswer(
-        sql(s"SELECT stringField, p FROM $table WHERE p = 1"),
+        sql(s"SELECT stringField, pQ FROM $table WHERE pQ = 1"),
         Seq.fill(10)(Row("part-1", 1))
       )
     }
 
     test(s"project the partitioning column $table") {
       checkAnswer(
-        sql(s"SELECT p, count(*) FROM $table group by p"),
+        sql(s"SELECT pQ, count(*) FROM $table group by pQ"),
         Row(1, 10) ::
           Row(2, 10) ::
           Row(3, 10) ::
@@ -926,7 +926,7 @@ abstract class ParquetPartitioningTest extends QueryTest with SQLTestUtils with
 
     test(s"project partitioning and non-partitioning columns $table") {
       checkAnswer(
-        sql(s"SELECT stringField, p, count(intField) FROM $table GROUP BY p, stringField"),
+        sql(s"SELECT stringField, pQ, count(intField) FROM $table GROUP BY pQ, stringField"),
         Row("part-1", 1, 10) ::
           Row("part-2", 2, 10) ::
           Row("part-3", 3, 10) ::
@@ -948,19 +948,19 @@ abstract class ParquetPartitioningTest extends QueryTest with SQLTestUtils with
 
     test(s"pruned count $table") {
       checkAnswer(
-        sql(s"SELECT COUNT(*) FROM $table WHERE p = 1"),
+        sql(s"SELECT COUNT(*) FROM $table WHERE pQ = 1"),
         Row(10))
     }
 
     test(s"non-existent partition $table") {
       checkAnswer(
-        sql(s"SELECT COUNT(*) FROM $table WHERE p = 1000"),
+        sql(s"SELECT COUNT(*) FROM $table WHERE pQ = 1000"),
         Row(0))
     }
 
     test(s"multi-partition pruned count $table") {
       checkAnswer(
-        sql(s"SELECT COUNT(*) FROM $table WHERE p IN (1,2,3)"),
+        sql(s"SELECT COUNT(*) FROM $table WHERE pQ IN (1,2,3)"),
         Row(30))
     }
 
@@ -972,7 +972,7 @@ abstract class ParquetPartitioningTest extends QueryTest with SQLTestUtils with
 
     test(s"sum $table") {
       checkAnswer(
-        sql(s"SELECT SUM(intField) FROM $table WHERE intField IN (1,2,3) AND p = 1"),
+        sql(s"SELECT SUM(intField) FROM $table WHERE intField IN (1,2,3) AND pQ = 1"),
         Row(1 + 2 + 3))
     }
 
@@ -993,15 +993,15 @@ abstract class ParquetPartitioningTest extends QueryTest with SQLTestUtils with
       checkAnswer(
         sql(
           s"""
-             |SELECT p, structField.intStructField, structField.stringStructField
-             |FROM $table WHERE p = 1
+             |SELECT pQ, structField.intStructField, structField.stringStructField
+             |FROM $table WHERE pQ = 1
            """.stripMargin),
         (1 to 10).map(i => Row(1, i, f"${i}_string")))
     }
 
     test(s"SPARK-5775 read array from $table") {
       checkAnswer(
-        sql(s"SELECT arrayField, p FROM $table WHERE p = 1"),
+        sql(s"SELECT arrayField, pQ FROM $table WHERE pQ = 1"),
         (1 to 10).map(i => Row(1 to i, 1)))
     }
   }

From 21caa932a157ec3dd394829061b06bd3d857de0f Mon Sep 17 00:00:00 2001
From: Michael Allman <michael@videoamp.com>
Date: Thu, 13 Oct 2016 11:29:25 -0700
Subject: [PATCH 17/99] Tidy up a little by removing some unused imports, an
 unused method and moving a protected method down and making it private

---
 .../datasources/PartitioningAwareFileCatalog.scala         | 7 +++++++
 .../sql/execution/datasources/SessionFileCatalog.scala     | 7 -------
 .../spark/sql/execution/datasources/TableFileCatalog.scala | 7 +------
 3 files changed, 8 insertions(+), 13 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
index 04d7d89250586..b2508115c282f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
@@ -207,6 +207,13 @@ abstract class PartitioningAwareFileCatalog(
           if (leafFiles.contains(qualifiedPath)) qualifiedPath.getParent else qualifiedPath }.toSet
     }
   }
+
+  // SPARK-15895: Metadata files (e.g. Parquet summary files) and temporary files should not be
+  // counted as data files, so that they shouldn't participate partition discovery.
+  private def isDataPath(path: Path): Boolean = {
+    val name = path.getName
+    !((name.startsWith("_") && !name.contains("=")) || name.startsWith("."))
+  }
 }
 
 object PartitioningAwareFileCatalog {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala
index 47b69eb721b29..7e7fc18535749 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala
@@ -58,13 +58,6 @@ abstract class SessionFileCatalog(sparkSession: SparkSession)
 
     mutable.LinkedHashSet(files: _*)
   }
-
-  // SPARK-15895: Metadata files (e.g. Parquet summary files) and temporary files should not be
-  // counted as data files, so that they shouldn't participate partition discovery.
-  protected def isDataPath(path: Path): Boolean = {
-    val name = path.getName
-    !((name.startsWith("_") && !name.contains("=")) || name.startsWith("."))
-  }
 }
 
 object SessionFileCatalog extends Logging {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
index b66ad90354b7c..a5c41b244589b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
@@ -20,10 +20,8 @@ package org.apache.spark.sql.execution.datasources
 import org.apache.hadoop.fs.Path
 
 import org.apache.spark.sql.SparkSession
-import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.catalog.CatalogTablePartition
 import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.types.{StructField, StructType}
+import org.apache.spark.sql.types.StructType
 
 
 /**
@@ -95,9 +93,6 @@ class TableFileCatalog(
   lazy val cachedAllPartitions: ListingFileCatalog = filterPartitions0(Nil)
 
   override def inputFiles: Array[String] = cachedAllPartitions.inputFiles
-
-  private def listDataLeafFiles(paths: Seq[Path]) =
-    listLeafFiles(paths).filter(f => isDataPath(f.getPath))
 }
 
 /**

From d7795cd0f3bc517bdf278e626ca25ce08ea23bcb Mon Sep 17 00:00:00 2001
From: Michael Allman <michael@videoamp.com>
Date: Thu, 13 Oct 2016 11:44:15 -0700
Subject: [PATCH 18/99] Put partition count in `FileSourceScanExec.metadata`
 for partitioned tables

---
 .../sql/execution/DataSourceScanExec.scala    | 22 +++++++++++++------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala
index ee61f7f0413da..623d2be55dcec 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala
@@ -230,13 +230,21 @@ case class FileSourceScanExec(
     val location = relation.location
     val locationDesc =
       location.getClass.getSimpleName + seqToString(location.rootPaths)
-    Map(
-      "Format" -> relation.fileFormat.toString,
-      "ReadSchema" -> outputSchema.catalogString,
-      "Batched" -> supportsBatch.toString,
-      "PartitionFilters" -> seqToString(partitionFilters),
-      "PushedFilters" -> seqToString(dataFilters),
-      "Location" -> locationDesc)
+    val metadata =
+      Map(
+        "Format" -> relation.fileFormat.toString,
+        "ReadSchema" -> outputSchema.catalogString,
+        "Batched" -> supportsBatch.toString,
+        "PartitionFilters" -> seqToString(partitionFilters),
+        "PushedFilters" -> seqToString(dataFilters),
+        "Location" -> locationDesc)
+    val withOptPartitionCount =
+      relation.partitionSchemaOption.map { _ =>
+        metadata + ("PartitionCount" -> selectedPartitions.size.toString)
+      } getOrElse {
+        metadata
+      }
+    withOptPartitionCount
   }
 
   private lazy val inputRDD: RDD[InternalRow] = {

From 765f93ce664ef33c1c62bf80b678ff5ba2992b85 Mon Sep 17 00:00:00 2001
From: Michael Allman <michael@videoamp.com>
Date: Thu, 13 Oct 2016 13:48:33 -0700
Subject: [PATCH 19/99] Fix some errors in my revision of `ParquetSourceSuite`

---
 .../scala/org/apache/spark/sql/hive/parquetSuites.scala     | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
index 1068d3f6f27dc..c4344dd12780b 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
@@ -34,7 +34,7 @@ import org.apache.spark.util.Utils
 // The data where the partitioning key exists only in the directory structure.
 case class ParquetData(intField: Int, stringField: String)
 // The data that also includes the partitioning key
-case class ParquetDataWithKey(p: Int, intField: Int, stringField: String)
+case class ParquetDataWithKey(pQ: Int, intField: Int, stringField: String)
 
 case class StructContainer(intStructField: Int, stringStructField: String)
 
@@ -45,7 +45,7 @@ case class ParquetDataWithComplexTypes(
     arrayField: Seq[Int])
 
 case class ParquetDataWithKeyAndComplexTypes(
-    p: Int,
+    pQ: Int,
     intField: Int,
     stringField: String,
     structField: StructContainer,
@@ -650,7 +650,7 @@ class ParquetSourceSuite extends ParquetPartitioningTest {
       CREATE TEMPORARY VIEW normal_parquet
       USING org.apache.spark.sql.parquet
       OPTIONS (
-        path '${new File(partitionedTableDir, "p=1").getCanonicalPath}'
+        path '${new File(partitionedTableDir, "pQ=1").getCanonicalPath}'
       )
     """)
 

From e1635e4570c0e4b892b93d1ac1e71d52d5a4f66b Mon Sep 17 00:00:00 2001
From: Eric Liang <ekhliang@gmail.com>
Date: Thu, 13 Oct 2016 18:24:31 -0700
Subject: [PATCH 20/99] Add metrics and cost tests for partition pruning
 effectiveness (#5)

* [SPARK-16980][SQL] Load only catalog table partition metadata required
to answer a query

* Add a new catalyst optimizer rule to SQL core for pruning unneeded
partitions' files from a table file catalog

* Include the type of file catalog in the FileSourceScanExec metadata

* TODO: Consider renaming FileCatalog to better differentiate it from
BasicFileCatalog (or vice-versa)

* try out parquet case insensitive fallback

* Refactor the FileSourceScanExec.metadata val to make it prettier

* fix and add test for input files

* rename test

* Refactor `TableFileCatalog.listFiles` to call `listDataLeafFiles` once
instead of once per partition

* fix it

* more test cases

* also fix a bug with zero partitions selected

* feature flag

* add comments

* extend and fix flakiness in test

* Enhance `ParquetMetastoreSuite` with mixed-case partition columns

* Tidy up a little by removing some unused imports, an unused method and
moving a protected method down and making it private

* Put partition count in `FileSourceScanExec.metadata` for partitioned
tables

* Fix some errors in my revision of `ParquetSourceSuite`

* Thu Oct 13 17:18:14 PDT 2016

* more generic

* Thu Oct 13 18:09:42 PDT 2016

* Thu Oct 13 18:09:55 PDT 2016

* Thu Oct 13 18:22:31 PDT 2016
---
 .../spark/metrics/source/StaticSources.scala  | 29 ++++++
 .../datasources/SessionFileCatalog.scala      |  2 +
 .../sql/hive/client/HiveClientImpl.scala      |  9 +-
 .../spark/sql/hive/HiveDataFrameSuite.scala   | 96 ++++++++++++++++---
 .../apache/spark/sql/hive/parquetSuites.scala |  3 +-
 5 files changed, 123 insertions(+), 16 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala b/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala
index 6bba259acc391..4e88ae65bd89f 100644
--- a/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala
+++ b/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala
@@ -60,3 +60,32 @@ object CodegenMetrics extends Source {
   val METRIC_GENERATED_METHOD_BYTECODE_SIZE =
     metricRegistry.histogram(MetricRegistry.name("generatedMethodSize"))
 }
+
+/**
+ * :: Experimental ::
+ * Metrics for access to the hive external catalog.
+ */
+@Experimental
+object HiveCatalogMetrics extends Source {
+  override val sourceName: String = "HiveExternalCatalog"
+  override val metricRegistry: MetricRegistry = new MetricRegistry()
+
+  /**
+   * Tracks the total number of partition metadata entries fetched via the client api.
+   */
+  val METRIC_PARTITIONS_FETCHED = metricRegistry.counter(MetricRegistry.name("partitionsFetched"))
+
+  /**
+   * Tracks the total number of files discovered off of the filesystem by ListingFileCatalog.
+   */
+  val METRIC_FILES_DISCOVERED = metricRegistry.counter(MetricRegistry.name("filesDiscovered"))
+
+  def reset(): Unit = {
+    METRIC_PARTITIONS_FETCHED.dec(METRIC_PARTITIONS_FETCHED.getCount())
+    METRIC_FILES_DISCOVERED.dec(METRIC_FILES_DISCOVERED.getCount())
+  }
+
+  // clients can use these to avoid classloader issues with the codahale classes
+  def incrementFetchedPartitions(n: Int): Unit = METRIC_PARTITIONS_FETCHED.inc(n)
+  def incrementFilesDiscovered(n: Int): Unit = METRIC_FILES_DISCOVERED.inc(n)
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala
index 7e7fc18535749..4807a92c2e6b8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala
@@ -26,6 +26,7 @@ import org.apache.hadoop.fs._
 import org.apache.hadoop.mapred.{FileInputFormat, JobConf}
 
 import org.apache.spark.internal.Logging
+import org.apache.spark.metrics.source.HiveCatalogMetrics
 import org.apache.spark.sql.SparkSession
 import org.apache.spark.util.SerializableConfiguration
 
@@ -56,6 +57,7 @@ abstract class SessionFileCatalog(sparkSession: SparkSession)
         SessionFileCatalog.listLeafFilesInSerial(paths, hadoopConf)
       }
 
+    HiveCatalogMetrics.incrementFilesDiscovered(files.size)
     mutable.LinkedHashSet(files: _*)
   }
 }
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
index dd33d750a4d45..e745a8c5b3589 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
@@ -37,6 +37,7 @@ import org.apache.hadoop.security.UserGroupInformation
 
 import org.apache.spark.{SparkConf, SparkException}
 import org.apache.spark.internal.Logging
+import org.apache.spark.metrics.source.HiveCatalogMetrics
 import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.analysis.{NoSuchDatabaseException, NoSuchPartitionException}
@@ -528,17 +529,21 @@ private[hive] class HiveClientImpl(
       table: CatalogTable,
       spec: Option[TablePartitionSpec]): Seq[CatalogTablePartition] = withHiveState {
     val hiveTable = toHiveTable(table)
-    spec match {
+    val parts = spec match {
       case None => shim.getAllPartitions(client, hiveTable).map(fromHivePartition)
       case Some(s) => client.getPartitions(hiveTable, s.asJava).asScala.map(fromHivePartition)
     }
+    HiveCatalogMetrics.incrementFetchedPartitions(parts.length)
+    parts
   }
 
   override def getPartitionsByFilter(
       table: CatalogTable,
       predicates: Seq[Expression]): Seq[CatalogTablePartition] = withHiveState {
     val hiveTable = toHiveTable(table)
-    shim.getPartitionsByFilter(client, hiveTable, predicates).map(fromHivePartition)
+    val parts = shim.getPartitionsByFilter(client, hiveTable, predicates).map(fromHivePartition)
+    HiveCatalogMetrics.incrementFetchedPartitions(parts.length)
+    parts
   }
 
   override def listTables(dbName: String): Seq[String] = withHiveState {
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala
index 6acbdbd25c4ee..f65e74de87a57 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala
@@ -17,6 +17,9 @@
 
 package org.apache.spark.sql.hive
 
+import java.io.File
+
+import org.apache.spark.metrics.source.HiveCatalogMetrics
 import org.apache.spark.sql.hive.test.TestHiveSingleton
 import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.QueryTest
@@ -36,21 +39,25 @@ class HiveDataFrameSuite extends QueryTest with TestHiveSingleton with SQLTestUt
     assert(hiveClient.getConf("hive.in.test", "") == "true")
   }
 
+  private def setupPartitionedTable(tableName: String, dir: File): Unit = {
+    spark.range(5).selectExpr("id", "id as partCol1", "id as partCol2").write
+      .partitionBy("partCol1", "partCol2")
+      .mode("overwrite")
+      .parquet(dir.getAbsolutePath)
+
+    spark.sql(s"""
+      |create external table $tableName (id long)
+      |partitioned by (partCol1 int, partCol2 int)
+      |stored as parquet
+      |location "${dir.getAbsolutePath}"""".stripMargin)
+    spark.sql(s"msck repair table $tableName")
+  }
+
   test("partitioned pruned table reports only selected files") {
+    assert(spark.sqlContext.getConf(HiveUtils.CONVERT_METASTORE_PARQUET.key) == "true")
     withTable("test") {
       withTempDir { dir =>
-        spark.range(5).selectExpr("id", "id as partCol1", "id as partCol2").write
-          .partitionBy("partCol1", "partCol2")
-          .mode("overwrite")
-          .parquet(dir.getAbsolutePath)
-
-        spark.sql(s"""
-          |create external table test (id long)
-          |partitioned by (partCol1 int, partCol2 int)
-          |stored as parquet
-          |location "${dir.getAbsolutePath}"""".stripMargin)
-        spark.sql("msck repair table test")
-
+        setupPartitionedTable("test", dir)
         val df = spark.sql("select * from test")
         assert(df.count() == 5)
         assert(df.inputFiles.length == 5)  // unpruned
@@ -69,4 +76,69 @@ class HiveDataFrameSuite extends QueryTest with TestHiveSingleton with SQLTestUt
       }
     }
   }
+
+  test("lazy partition pruning reads only necessary partition data") {
+    withSQLConf("spark.sql.hive.filesourcePartitionPruning" -> "true") {
+      withTable("test") {
+        withTempDir { dir =>
+          setupPartitionedTable("test", dir)
+          HiveCatalogMetrics.reset()
+          spark.sql("select * from test where partCol1 = 999").count()
+          assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 0)
+          assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 0)
+
+          HiveCatalogMetrics.reset()
+          spark.sql("select * from test where partCol1 < 2").count()
+          assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 2)
+          assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 2)
+
+          HiveCatalogMetrics.reset()
+          spark.sql("select * from test where partCol1 < 3").count()
+          assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 3)
+          assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 3)
+
+          // should read all
+          HiveCatalogMetrics.reset()
+          spark.sql("select * from test").count()
+          assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 5)
+          assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 5)
+
+          // read all should be cached
+          HiveCatalogMetrics.reset()
+          spark.sql("select * from test").count()
+          assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 0)
+          assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 0)
+        }
+      }
+    }
+  }
+
+  test("all partitions read and cached when filesource partition pruning is off") {
+    withSQLConf("spark.sql.hive.filesourcePartitionPruning" -> "false") {
+      withTable("test") {
+        withTempDir { dir =>
+          setupPartitionedTable("test", dir)
+
+          // We actually query the partitions from hive each time the table is resolved in this
+          // mode. This is kind of terrible, but is needed to preserve the legacy behavior
+          // of doing plan cache validation based on the entire partition set.
+          HiveCatalogMetrics.reset()
+          spark.sql("select * from test where partCol1 = 999").count()
+          // 5 from table resolution, another 5 from ListingFileCatalog
+          assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 10)
+          assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 5)
+
+          HiveCatalogMetrics.reset()
+          spark.sql("select * from test where partCol1 < 2").count()
+          assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 5)
+          assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 0)
+
+          HiveCatalogMetrics.reset()
+          spark.sql("select * from test").count()
+          assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 5)
+          assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 0)
+        }
+      }
+    }
+  }
 }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
index c4344dd12780b..d39d64195e1cd 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
@@ -175,7 +175,7 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest {
     (1 to 10).map(i => Tuple1(Seq(new Integer(i), null))).toDF("a")
       .createOrReplaceTempView("jt_array")
 
-    setConf(HiveUtils.CONVERT_METASTORE_PARQUET, true)
+    assert(spark.sqlContext.getConf(HiveUtils.CONVERT_METASTORE_PARQUET.key) == "true")
   }
 
   override def afterAll(): Unit = {
@@ -187,7 +187,6 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest {
       "jt",
       "jt_array",
        "test_parquet")
-    setConf(HiveUtils.CONVERT_METASTORE_PARQUET, false)
   }
 
   test(s"conversion is working") {

From 71049d130e89aedba75e8875d8fde7620d6a55e2 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekhliang@gmail.com>
Date: Thu, 13 Oct 2016 19:27:01 -0700
Subject: [PATCH 21/99] Actually register the hive catalog metrics, also revert
 broken tests (#6)

* Thu Oct 13 19:02:36 PDT 2016

* Thu Oct 13 19:03:06 PDT 2016
---
 .../spark/metrics/source/StaticSources.scala  |  5 +-
 .../apache/spark/sql/hive/parquetSuites.scala | 52 +++++++++----------
 2 files changed, 30 insertions(+), 27 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala b/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala
index 4e88ae65bd89f..cf92a10deabd5 100644
--- a/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala
+++ b/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala
@@ -26,7 +26,7 @@ private[spark] object StaticSources {
    * The set of all static sources. These sources may be reported to from any class, including
    * static classes, without requiring reference to a SparkEnv.
    */
-  val allSources = Seq(CodegenMetrics)
+  val allSources = Seq(CodegenMetrics, HiveCatalogMetrics)
 }
 
 /**
@@ -80,6 +80,9 @@ object HiveCatalogMetrics extends Source {
    */
   val METRIC_FILES_DISCOVERED = metricRegistry.counter(MetricRegistry.name("filesDiscovered"))
 
+  /**
+   * Resets the values of all metrics to zero. This is useful in tests.
+   */
   def reset(): Unit = {
     METRIC_PARTITIONS_FETCHED.dec(METRIC_PARTITIONS_FETCHED.getCount())
     METRIC_FILES_DISCOVERED.dec(METRIC_FILES_DISCOVERED.getCount())
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
index d39d64195e1cd..9fc62a389db4d 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
@@ -34,7 +34,7 @@ import org.apache.spark.util.Utils
 // The data where the partitioning key exists only in the directory structure.
 case class ParquetData(intField: Int, stringField: String)
 // The data that also includes the partitioning key
-case class ParquetDataWithKey(pQ: Int, intField: Int, stringField: String)
+case class ParquetDataWithKey(p: Int, intField: Int, stringField: String)
 
 case class StructContainer(intStructField: Int, stringStructField: String)
 
@@ -45,7 +45,7 @@ case class ParquetDataWithComplexTypes(
     arrayField: Seq[Int])
 
 case class ParquetDataWithKeyAndComplexTypes(
-    pQ: Int,
+    p: Int,
     intField: Int,
     stringField: String,
     structField: StructContainer,
@@ -75,7 +75,7 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest {
         intField INT,
         stringField STRING
       )
-      PARTITIONED BY (pQ int)
+      PARTITIONED BY (p int)
       ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
        STORED AS
        INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
@@ -89,7 +89,7 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest {
         intField INT,
         stringField STRING
       )
-      PARTITIONED BY (pQ int)
+      PARTITIONED BY (p int)
       ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
        STORED AS
        INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
@@ -118,7 +118,7 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest {
         structField STRUCT<intStructField: INT, stringStructField: STRING>,
         arrayField ARRAY<INT>
       )
-      PARTITIONED BY (pQ int)
+      PARTITIONED BY (p int)
       ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
        STORED AS
        INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
@@ -134,7 +134,7 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest {
         structField STRUCT<intStructField: INT, stringStructField: STRING>,
         arrayField ARRAY<INT>
       )
-      PARTITIONED BY (pQ int)
+      PARTITIONED BY (p int)
       ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
        STORED AS
        INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
@@ -156,19 +156,19 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest {
       """.stripMargin)
 
     (1 to 10).foreach { p =>
-      sql(s"ALTER TABLE partitioned_parquet ADD PARTITION (pQ=$p)")
+      sql(s"ALTER TABLE partitioned_parquet ADD PARTITION (p=$p)")
     }
 
     (1 to 10).foreach { p =>
-      sql(s"ALTER TABLE partitioned_parquet_with_key ADD PARTITION (pQ=$p)")
+      sql(s"ALTER TABLE partitioned_parquet_with_key ADD PARTITION (p=$p)")
     }
 
     (1 to 10).foreach { p =>
-      sql(s"ALTER TABLE partitioned_parquet_with_key_and_complextypes ADD PARTITION (pQ=$p)")
+      sql(s"ALTER TABLE partitioned_parquet_with_key_and_complextypes ADD PARTITION (p=$p)")
     }
 
     (1 to 10).foreach { p =>
-      sql(s"ALTER TABLE partitioned_parquet_with_complextypes ADD PARTITION (pQ=$p)")
+      sql(s"ALTER TABLE partitioned_parquet_with_complextypes ADD PARTITION (p=$p)")
     }
 
     (1 to 10).map(i => (i, s"str$i")).toDF("a", "b").createOrReplaceTempView("jt")
@@ -649,7 +649,7 @@ class ParquetSourceSuite extends ParquetPartitioningTest {
       CREATE TEMPORARY VIEW normal_parquet
       USING org.apache.spark.sql.parquet
       OPTIONS (
-        path '${new File(partitionedTableDir, "pQ=1").getCanonicalPath}'
+        path '${new File(partitionedTableDir, "p=1").getCanonicalPath}'
       )
     """)
 
@@ -827,7 +827,7 @@ abstract class ParquetPartitioningTest extends QueryTest with SQLTestUtils with
     normalTableDir = Utils.createTempDir()
 
     (1 to 10).foreach { p =>
-      val partDir = new File(partitionedTableDir, s"pQ=$p")
+      val partDir = new File(partitionedTableDir, s"p=$p")
       sparkContext.makeRDD(1 to 10)
         .map(i => ParquetData(i, s"part-$p"))
         .toDF()
@@ -843,7 +843,7 @@ abstract class ParquetPartitioningTest extends QueryTest with SQLTestUtils with
     partitionedTableDirWithKey = Utils.createTempDir()
 
     (1 to 10).foreach { p =>
-      val partDir = new File(partitionedTableDirWithKey, s"pQ=$p")
+      val partDir = new File(partitionedTableDirWithKey, s"p=$p")
       sparkContext.makeRDD(1 to 10)
         .map(i => ParquetDataWithKey(p, i, s"part-$p"))
         .toDF()
@@ -853,7 +853,7 @@ abstract class ParquetPartitioningTest extends QueryTest with SQLTestUtils with
     partitionedTableDirWithKeyAndComplexTypes = Utils.createTempDir()
 
     (1 to 10).foreach { p =>
-      val partDir = new File(partitionedTableDirWithKeyAndComplexTypes, s"pQ=$p")
+      val partDir = new File(partitionedTableDirWithKeyAndComplexTypes, s"p=$p")
       sparkContext.makeRDD(1 to 10).map { i =>
         ParquetDataWithKeyAndComplexTypes(
           p, i, s"part-$p", StructContainer(i, f"${i}_string"), 1 to i)
@@ -863,7 +863,7 @@ abstract class ParquetPartitioningTest extends QueryTest with SQLTestUtils with
     partitionedTableDirWithComplexTypes = Utils.createTempDir()
 
     (1 to 10).foreach { p =>
-      val partDir = new File(partitionedTableDirWithComplexTypes, s"pQ=$p")
+      val partDir = new File(partitionedTableDirWithComplexTypes, s"p=$p")
       sparkContext.makeRDD(1 to 10).map { i =>
         ParquetDataWithComplexTypes(i, s"part-$p", StructContainer(i, f"${i}_string"), 1 to i)
       }.toDF().write.parquet(partDir.getCanonicalPath)
@@ -897,19 +897,19 @@ abstract class ParquetPartitioningTest extends QueryTest with SQLTestUtils with
 
     test(s"ordering of the partitioning columns $table") {
       checkAnswer(
-        sql(s"SELECT pQ, stringField FROM $table WHERE pQ = 1"),
+        sql(s"SELECT p, stringField FROM $table WHERE p = 1"),
         Seq.fill(10)(Row(1, "part-1"))
       )
 
       checkAnswer(
-        sql(s"SELECT stringField, pQ FROM $table WHERE pQ = 1"),
+        sql(s"SELECT stringField, p FROM $table WHERE p = 1"),
         Seq.fill(10)(Row("part-1", 1))
       )
     }
 
     test(s"project the partitioning column $table") {
       checkAnswer(
-        sql(s"SELECT pQ, count(*) FROM $table group by pQ"),
+        sql(s"SELECT p, count(*) FROM $table group by p"),
         Row(1, 10) ::
           Row(2, 10) ::
           Row(3, 10) ::
@@ -925,7 +925,7 @@ abstract class ParquetPartitioningTest extends QueryTest with SQLTestUtils with
 
     test(s"project partitioning and non-partitioning columns $table") {
       checkAnswer(
-        sql(s"SELECT stringField, pQ, count(intField) FROM $table GROUP BY pQ, stringField"),
+        sql(s"SELECT stringField, p, count(intField) FROM $table GROUP BY p, stringField"),
         Row("part-1", 1, 10) ::
           Row("part-2", 2, 10) ::
           Row("part-3", 3, 10) ::
@@ -947,19 +947,19 @@ abstract class ParquetPartitioningTest extends QueryTest with SQLTestUtils with
 
     test(s"pruned count $table") {
       checkAnswer(
-        sql(s"SELECT COUNT(*) FROM $table WHERE pQ = 1"),
+        sql(s"SELECT COUNT(*) FROM $table WHERE p = 1"),
         Row(10))
     }
 
     test(s"non-existent partition $table") {
       checkAnswer(
-        sql(s"SELECT COUNT(*) FROM $table WHERE pQ = 1000"),
+        sql(s"SELECT COUNT(*) FROM $table WHERE p = 1000"),
         Row(0))
     }
 
     test(s"multi-partition pruned count $table") {
       checkAnswer(
-        sql(s"SELECT COUNT(*) FROM $table WHERE pQ IN (1,2,3)"),
+        sql(s"SELECT COUNT(*) FROM $table WHERE p IN (1,2,3)"),
         Row(30))
     }
 
@@ -971,7 +971,7 @@ abstract class ParquetPartitioningTest extends QueryTest with SQLTestUtils with
 
     test(s"sum $table") {
       checkAnswer(
-        sql(s"SELECT SUM(intField) FROM $table WHERE intField IN (1,2,3) AND pQ = 1"),
+        sql(s"SELECT SUM(intField) FROM $table WHERE intField IN (1,2,3) AND p = 1"),
         Row(1 + 2 + 3))
     }
 
@@ -992,15 +992,15 @@ abstract class ParquetPartitioningTest extends QueryTest with SQLTestUtils with
       checkAnswer(
         sql(
           s"""
-             |SELECT pQ, structField.intStructField, structField.stringStructField
-             |FROM $table WHERE pQ = 1
+             |SELECT p, structField.intStructField, structField.stringStructField
+             |FROM $table WHERE p = 1
            """.stripMargin),
         (1 to 10).map(i => Row(1, i, f"${i}_string")))
     }
 
     test(s"SPARK-5775 read array from $table") {
       checkAnswer(
-        sql(s"SELECT arrayField, pQ FROM $table WHERE pQ = 1"),
+        sql(s"SELECT arrayField, p FROM $table WHERE p = 1"),
         (1 to 10).map(i => Row(1 to i, 1)))
     }
   }

From 6a63afd156d4806122b9ad0c2593de69a0ae790c Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Fri, 14 Oct 2016 14:04:01 -0700
Subject: [PATCH 22/99] Fall back to case-insensitive field matching when
 selecting ORC read columns

---
 .../spark/sql/hive/orc/OrcFileFormat.scala    | 12 +++++++++-
 .../spark/sql/hive/orc/OrcQuerySuite.scala    | 23 +++++++++++++++++++
 2 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala
index e94f49ea81177..1af3280e18a89 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala
@@ -313,7 +313,17 @@ private[orc] object OrcRelation extends HiveInspectors {
 
   def setRequiredColumns(
       conf: Configuration, physicalSchema: StructType, requestedSchema: StructType): Unit = {
-    val ids = requestedSchema.map(a => physicalSchema.fieldIndex(a.name): Integer)
+    // Resolve each requested column by exact name first, then by lower-cased name, so that a
+    // requested schema whose casing differs from the physical ORC schema still resolves.
+    val caseInsensitiveFieldMap: Map[String, Int] =
+      physicalSchema.fieldNames.zipWithIndex.map { case (n, i) => (n.toLowerCase, i) }.toMap
+    val ids = requestedSchema.map { a =>
+      val exactMatch: Option[Int] = physicalSchema.getFieldIndex(a.name)
+      val res = exactMatch.getOrElse(
+        caseInsensitiveFieldMap.getOrElse(a.name.toLowerCase,
+          throw new IllegalArgumentException(s"""Field "${a.name}" does not exist.""")))
+      res: Integer
+    }
     val (sortedIDs, sortedNames) = ids.zip(requestedSchema.fieldNames).sorted.unzip
     HiveShim.appendReadColumns(conf, sortedIDs, sortedNames)
   }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala
index b2ee49c441ef2..ddb2ae23d4ea1 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala
@@ -474,6 +474,29 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest {
     }
   }
 
+  test("converted ORC table supports resolving mixed case field") {
+    withSQLConf(HiveUtils.CONVERT_METASTORE_ORC.key -> "true") {
+      withTable("dummy_orc") {
+        withTempPath { dir =>
+          val df = spark.range(5).selectExpr("id", "id as valueField", "id as partitionValue")
+          df.write
+            .partitionBy("partitionValue")
+            .mode("overwrite")
+            .orc(dir.getAbsolutePath)
+
+          spark.sql(s"""
+            |create external table dummy_orc (id long, valueField long)
+            |partitioned by (partitionValue int)
+            |stored as orc
+            |location "${dir.getAbsolutePath}"""".stripMargin)
+          spark.sql("msck repair table dummy_orc")
+          // Reading back through the metastore table must return the rows that were written.
+          checkAnswer(spark.sql("select * from dummy_orc"), df)
+        }
+      }
+    }
+  }
+
   test("SPARK-14962 Produce correct results on array type with isnotnull") {
     withSQLConf(SQLConf.ORC_FILTER_PUSHDOWN_ENABLED.key -> "true") {
       val data = (0 until 10).map(i => Tuple1(Array(i)))

From 6b02b3c36b3c1f99695262f9d60fe2aaaf25c5bc Mon Sep 17 00:00:00 2001
From: Michael Allman <michael@videoamp.com>
Date: Fri, 14 Oct 2016 14:35:10 -0700
Subject: [PATCH 23/99] [SPARK-16980][SQL] Load only catalog table partition
 metadata required to answer a query

---
 .../catalyst/catalog/ExternalCatalog.scala    |   5 +-
 .../catalyst/catalog/InMemoryCatalog.scala    |   4 +-
 .../scala/org/apache/spark/sql/Dataset.scala  |   4 +-
 .../spark/sql/execution/CacheManager.scala    |   2 +-
 .../sql/execution/DataSourceScanExec.scala    |   2 +-
 .../command/createDataSourceTables.scala      |   2 +-
 .../execution/datasources/DataSource.scala    |   4 +-
 .../datasources/DataSourceStrategy.scala      |   8 +-
 .../execution/datasources/FileFormat.scala    |  42 +++-
 .../datasources/HadoopFsRelation.scala        |  18 +-
 .../datasources/ListingFileCatalog.scala      | 197 +--------------
 .../datasources/LogicalRelation.scala         |   2 +-
 .../PartitioningAwareFileCatalog.scala        |  25 +-
 .../datasources/SessionFileCatalog.scala      | 230 ++++++++++++++++++
 .../datasources/TableFileCatalog.scala        | 102 ++++++++
 .../streaming/MetadataLogFileCatalog.scala    |   2 +-
 .../datasources/FileCatalogSuite.scala        |   5 +-
 .../datasources/FileSourceStrategySuite.scala |   2 +-
 ...te.scala => SessionFileCatalogSuite.scala} |  16 +-
 .../ParquetPartitionDiscoverySuite.scala      |   6 +-
 .../spark/sql/hive/HiveExternalCatalog.scala  |  41 +++-
 .../spark/sql/hive/HiveMetastoreCatalog.scala | 104 ++------
 .../spark/sql/hive/client/HiveClient.scala    |  15 +-
 .../sql/hive/client/HiveClientImpl.scala      |  10 +-
 .../spark/sql/hive/client/VersionsSuite.scala |   4 +-
 25 files changed, 493 insertions(+), 359 deletions(-)
 create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala
 create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
 rename sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/{ListingFileCatalogSuite.scala => SessionFileCatalogSuite.scala} (66%)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala
index 348d3d0be2152..a5e02523d2889 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala
@@ -198,11 +198,12 @@ abstract class ExternalCatalog {
       partialSpec: Option[TablePartitionSpec] = None): Seq[CatalogTablePartition]
 
   /**
-   * List the metadata of selected partitions according to the given partition predicates.
+   * List the metadata of partitions that belong to the specified table, assuming it exists, that
+   * satisfy the given partition-pruning predicate expressions.
    *
    * @param db database name
    * @param table table name
-   * @param predicates partition predicated
+   * @param predicates partition-pruning predicates
    */
   def listPartitionsByFilter(
       db: String,
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala
index 49280f82e20be..f95c9f8cfa2d4 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala
@@ -482,7 +482,9 @@ class InMemoryCatalog(
       db: String,
       table: String,
       predicates: Seq[Expression]): Seq[CatalogTablePartition] = {
-    throw new UnsupportedOperationException("listPartitionsByFilter is not implemented.")
+    // TODO: Provide an implementation
+    throw new UnsupportedOperationException(
+      "listPartitionsByFilter is not implemented for InMemoryCatalog")
   }
 
   // --------------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index e59a483075c94..90897ac5d7b50 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -43,7 +43,7 @@ import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.util.usePrettyExpression
 import org.apache.spark.sql.execution.{FileRelation, LogicalRDD, QueryExecution, SQLExecution}
 import org.apache.spark.sql.execution.command.{CreateViewCommand, ExplainCommand, GlobalTempView, LocalTempView}
-import org.apache.spark.sql.execution.datasources.LogicalRelation
+import org.apache.spark.sql.execution.datasources.{FileCatalog, HadoopFsRelation, LogicalRelation}
 import org.apache.spark.sql.execution.datasources.json.JacksonGenerator
 import org.apache.spark.sql.execution.python.EvaluatePython
 import org.apache.spark.sql.streaming.{DataStreamWriter, StreamingQuery}
@@ -2603,6 +2603,8 @@ class Dataset[T] private[sql](
    */
   def inputFiles: Array[String] = {
     val files: Seq[String] = logicalPlan.collect {
+      case LogicalRelation(HadoopFsRelation(location: FileCatalog, _, _, _, _, _), _, _) =>
+        location.inputFiles
       case LogicalRelation(fsBasedRelation: FileRelation, _, _) =>
         fsBasedRelation.inputFiles
       case fr: FileRelation =>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala
index 83b7c779ab818..92fd366e101fd 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala
@@ -185,7 +185,7 @@ class CacheManager extends Logging {
     plan match {
       case lr: LogicalRelation => lr.relation match {
         case hr: HadoopFsRelation =>
-          val invalidate = hr.location.paths
+          val invalidate = hr.location.rootPaths
             .map(_.makeQualified(fs.getUri, fs.getWorkingDirectory))
             .contains(qualifiedPath)
           if (invalidate) hr.location.refresh()
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala
index 6cdba406937de..808f2052c48b3 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala
@@ -231,7 +231,7 @@ case class FileSourceScanExec(
     "Batched" -> supportsBatch.toString,
     "PartitionFilters" -> partitionFilters.mkString("[", ", ", "]"),
     "PushedFilters" -> dataFilters.mkString("[", ", ", "]"),
-    "InputPaths" -> relation.location.paths.mkString(", "))
+    "RootPaths" -> relation.location.rootPaths.mkString(", "))
 
   private lazy val inputRDD: RDD[InternalRow] = {
     val readFile: (PartitionedFile) => Iterator[InternalRow] =
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
index a04a13e698c43..a8c75a7f29cef 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
@@ -67,7 +67,7 @@ case class CreateDataSourceTableCommand(table: CatalogTable, ignoreIfExists: Boo
 
     dataSource match {
       case fs: HadoopFsRelation =>
-        if (table.tableType == CatalogTableType.EXTERNAL && fs.location.paths.isEmpty) {
+        if (table.tableType == CatalogTableType.EXTERNAL && fs.location.rootPaths.isEmpty) {
           throw new AnalysisException(
             "Cannot create a file-based external data source table without path")
         }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala
index e75e7d2770b4e..92b1fff7d8127 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala
@@ -471,9 +471,7 @@ case class DataSource(
           val existingPartitionColumns = Try {
             resolveRelation()
               .asInstanceOf[HadoopFsRelation]
-              .location
-              .partitionSpec()
-              .partitionColumns
+              .partitionSchema
               .fieldNames
               .toSeq
           }.getOrElse(Seq.empty[String])
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
index 6f9ed50a02b09..7d0abe86a44df 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
@@ -163,14 +163,14 @@ case class DataSourceAnalysis(conf: CatalystConf) extends Rule[LogicalPlan] {
         if query.resolved && t.schema.asNullable == query.schema.asNullable =>
 
       // Sanity checks
-      if (t.location.paths.size != 1) {
+      if (t.location.rootPaths.size != 1) {
         throw new AnalysisException(
           "Can only write data to relations with a single path.")
       }
 
-      val outputPath = t.location.paths.head
+      val outputPath = t.location.rootPaths.head
       val inputPaths = query.collect {
-        case LogicalRelation(r: HadoopFsRelation, _, _) => r.location.paths
+        case LogicalRelation(r: HadoopFsRelation, _, _) => r.location.rootPaths
       }.flatten
 
       val mode = if (overwrite) SaveMode.Overwrite else SaveMode.Append
@@ -184,7 +184,7 @@ case class DataSourceAnalysis(conf: CatalystConf) extends Rule[LogicalPlan] {
         query.resolve(t.partitionSchema, t.sparkSession.sessionState.analyzer.resolver),
         t.bucketSpec,
         t.fileFormat,
-        () => t.refresh(),
+        () => t.location.refresh(),
         t.options,
         query,
         mode)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala
index bde2d2b89d56f..01bceef2efc2e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala
@@ -182,16 +182,17 @@ abstract class TextBasedFileFormat extends FileFormat {
 case class Partition(values: InternalRow, files: Seq[FileStatus])
 
 /**
- * An interface for objects capable of enumerating the files that comprise a relation as well
- * as the partitioning characteristics of those files.
+ * An interface for objects capable of enumerating the root paths of a relation as well as the
+ * partitions of a relation subject to some pruning expressions.
  */
-trait FileCatalog {
+trait BasicFileCatalog {
 
-  /** Returns the list of input paths from which the catalog will get files. */
-  def paths: Seq[Path]
-
-  /** Returns the specification of the partitions inferred from the data. */
-  def partitionSpec(): PartitionSpec
+  /**
+   * Returns the list of root input paths from which the catalog will get files. These paths
+   * should *not* include any table partition directories. Partition directories are discovered or
+   * provided by a metastore catalog.
+   */
+  def rootPaths: Seq[Path]
 
   /**
    * Returns all valid files grouped into partitions when the data is partitioned. If the data is
@@ -204,9 +205,30 @@ trait FileCatalog {
    */
   def listFiles(filters: Seq[Expression]): Seq[Partition]
 
+  /** Refresh any cached file listings */
+  def refresh(): Unit
+
+  /** Sum of table file sizes, in bytes */
+  def sizeInBytes: Long
+}
+
+/**
+ * A [[BasicFileCatalog]] which can enumerate all of the files comprising a relation and, from
+ * those, infer the relation's partition specification.
+ */
+// TODO: Consider a more descriptive, appropriate name which suggests this is a file catalog for
+// which it is safe to list all of its files?
+trait FileCatalog extends BasicFileCatalog {
+
+  /** Returns the specification of the partitions inferred from the data. */
+  def partitionSpec(): PartitionSpec
+
   /** Returns all the valid files. */
   def allFiles(): Seq[FileStatus]
 
-  /** Refresh the file listing */
-  def refresh(): Unit
+  /** Returns the list of files that will be read when scanning this relation. */
+  def inputFiles: Array[String] =
+    allFiles().map(_.getPath.toUri.toString).toArray
+
+  override def sizeInBytes: Long = allFiles().map(_.getLen).sum
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala
index c7ebe0b76a150..c600d683cb695 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala
@@ -28,8 +28,8 @@ import org.apache.spark.sql.types.StructType
  * Acts as a container for all of the metadata required to read from a datasource. All discovery,
  * resolution and merging logic for schemas and partitions has been removed.
  *
- * @param location A [[FileCatalog]] that can enumerate the locations of all the files that comprise
- *                 this relation.
+ * @param location A [[BasicFileCatalog]] that can enumerate the locations of all the files that
+ *                 comprise this relation.
  * @param partitionSchema The schema of the columns (if any) that are used to partition the relation
  * @param dataSchema The schema of any remaining columns.  Note that if any partition columns are
  *                   present in the actual data files as well, they are preserved.
@@ -38,13 +38,13 @@ import org.apache.spark.sql.types.StructType
  * @param options Configuration used when reading / writing data.
  */
 case class HadoopFsRelation(
-    location: FileCatalog,
+    location: BasicFileCatalog,
     partitionSchema: StructType,
     dataSchema: StructType,
     bucketSpec: Option[BucketSpec],
     fileFormat: FileFormat,
     options: Map[String, String])(val sparkSession: SparkSession)
-  extends BaseRelation with FileRelation {
+  extends BaseRelation {
 
   override def sqlContext: SQLContext = sparkSession.sqlContext
 
@@ -58,10 +58,6 @@ case class HadoopFsRelation(
   def partitionSchemaOption: Option[StructType] =
     if (partitionSchema.isEmpty) None else Some(partitionSchema)
 
-  def partitionSpec: PartitionSpec = location.partitionSpec()
-
-  def refresh(): Unit = location.refresh()
-
   override def toString: String = {
     fileFormat match {
       case source: DataSourceRegister => source.shortName()
@@ -69,9 +65,5 @@ case class HadoopFsRelation(
     }
   }
 
-  /** Returns the list of files that will be read when scanning this relation. */
-  override def inputFiles: Array[String] =
-    location.allFiles().map(_.getPath.toUri.toString).toArray
-
-  override def sizeInBytes: Long = location.allFiles().map(_.getLen).sum
+  override def sizeInBytes: Long = location.sizeInBytes
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala
index a68ae523e0faa..6d10501b7265d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala
@@ -17,32 +17,26 @@
 
 package org.apache.spark.sql.execution.datasources
 
-import java.io.FileNotFoundException
-
 import scala.collection.mutable
 
-import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs._
-import org.apache.hadoop.mapred.{FileInputFormat, JobConf}
 
-import org.apache.spark.internal.Logging
 import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.types.StructType
-import org.apache.spark.util.SerializableConfiguration
 
 
 /**
  * A [[FileCatalog]] that generates the list of files to process by recursively listing all the
  * files present in `paths`.
  *
+ * @param rootPaths the list of root table paths to scan
  * @param parameters as set of options to control discovery
- * @param paths a list of paths to scan
  * @param partitionSchema an optional partition schema that will be use to provide types for the
  *                        discovered partitions
  */
 class ListingFileCatalog(
     sparkSession: SparkSession,
-    override val paths: Seq[Path],
+    override val rootPaths: Seq[Path],
     parameters: Map[String, String],
     partitionSchema: Option[StructType])
   extends PartitioningAwareFileCatalog(sparkSession, parameters, partitionSchema) {
@@ -70,198 +64,17 @@ class ListingFileCatalog(
   }
 
   override def refresh(): Unit = {
-    val files = listLeafFiles(paths)
+    val files = listLeafFiles(rootPaths)
     cachedLeafFiles =
       new mutable.LinkedHashMap[Path, FileStatus]() ++= files.map(f => f.getPath -> f)
     cachedLeafDirToChildrenFiles = files.toArray.groupBy(_.getPath.getParent)
     cachedPartitionSpec = null
   }
 
-  /**
-   * List leaf files of given paths. This method will submit a Spark job to do parallel
-   * listing whenever there is a path having more files than the parallel partition discovery
-   * discovery threshold.
-   *
-   * This is publicly visible for testing.
-   */
-  def listLeafFiles(paths: Seq[Path]): mutable.LinkedHashSet[FileStatus] = {
-    val files =
-      if (paths.length >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) {
-        ListingFileCatalog.listLeafFilesInParallel(paths, hadoopConf, sparkSession)
-      } else {
-        ListingFileCatalog.listLeafFilesInSerial(paths, hadoopConf)
-      }
-
-    mutable.LinkedHashSet(files: _*)
-  }
-
   override def equals(other: Any): Boolean = other match {
-    case hdfs: ListingFileCatalog => paths.toSet == hdfs.paths.toSet
+    case hdfs: ListingFileCatalog => rootPaths.toSet == hdfs.rootPaths.toSet
     case _ => false
   }
 
-  override def hashCode(): Int = paths.toSet.hashCode()
-}
-
-
-object ListingFileCatalog extends Logging {
-
-  /** A serializable variant of HDFS's BlockLocation. */
-  private case class SerializableBlockLocation(
-      names: Array[String],
-      hosts: Array[String],
-      offset: Long,
-      length: Long)
-
-  /** A serializable variant of HDFS's FileStatus. */
-  private case class SerializableFileStatus(
-      path: String,
-      length: Long,
-      isDir: Boolean,
-      blockReplication: Short,
-      blockSize: Long,
-      modificationTime: Long,
-      accessTime: Long,
-      blockLocations: Array[SerializableBlockLocation])
-
-  /**
-   * List a collection of path recursively.
-   */
-  private def listLeafFilesInSerial(
-      paths: Seq[Path],
-      hadoopConf: Configuration): Seq[FileStatus] = {
-    // Dummy jobconf to get to the pathFilter defined in configuration
-    val jobConf = new JobConf(hadoopConf, this.getClass)
-    val filter = FileInputFormat.getInputPathFilter(jobConf)
-
-    paths.flatMap { path =>
-      val fs = path.getFileSystem(hadoopConf)
-      listLeafFiles0(fs, path, filter)
-    }
-  }
-
-  /**
-   * List a collection of path recursively in parallel (using Spark executors).
-   * Each task launched will use [[listLeafFilesInSerial]] to list.
-   */
-  private def listLeafFilesInParallel(
-      paths: Seq[Path],
-      hadoopConf: Configuration,
-      sparkSession: SparkSession): Seq[FileStatus] = {
-    assert(paths.size >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold)
-    logInfo(s"Listing leaf files and directories in parallel under: ${paths.mkString(", ")}")
-
-    val sparkContext = sparkSession.sparkContext
-    val serializableConfiguration = new SerializableConfiguration(hadoopConf)
-    val serializedPaths = paths.map(_.toString)
-
-    // Set the number of parallelism to prevent following file listing from generating many tasks
-    // in case of large #defaultParallelism.
-    val numParallelism = Math.min(paths.size, 10000)
-
-    val statuses = sparkContext
-      .parallelize(serializedPaths, numParallelism)
-      .mapPartitions { paths =>
-        val hadoopConf = serializableConfiguration.value
-        listLeafFilesInSerial(paths.map(new Path(_)).toSeq, hadoopConf).iterator
-      }.map { status =>
-        // Turn FileStatus into SerializableFileStatus so we can send it back to the driver
-        val blockLocations = status match {
-          case f: LocatedFileStatus =>
-            f.getBlockLocations.map { loc =>
-              SerializableBlockLocation(
-                loc.getNames,
-                loc.getHosts,
-                loc.getOffset,
-                loc.getLength)
-            }
-
-          case _ =>
-            Array.empty[SerializableBlockLocation]
-        }
-
-        SerializableFileStatus(
-          status.getPath.toString,
-          status.getLen,
-          status.isDirectory,
-          status.getReplication,
-          status.getBlockSize,
-          status.getModificationTime,
-          status.getAccessTime,
-          blockLocations)
-      }.collect()
-
-    // Turn SerializableFileStatus back to Status
-    statuses.map { f =>
-      val blockLocations = f.blockLocations.map { loc =>
-        new BlockLocation(loc.names, loc.hosts, loc.offset, loc.length)
-      }
-      new LocatedFileStatus(
-        new FileStatus(
-          f.length, f.isDir, f.blockReplication, f.blockSize, f.modificationTime, new Path(f.path)),
-        blockLocations)
-    }
-  }
-
-  /**
-   * List a single path, provided as a FileStatus, in serial.
-   */
-  private def listLeafFiles0(
-      fs: FileSystem, path: Path, filter: PathFilter): Seq[FileStatus] = {
-    logTrace(s"Listing $path")
-    val name = path.getName.toLowerCase
-    if (shouldFilterOut(name)) {
-      Seq.empty[FileStatus]
-    } else {
-      // [SPARK-17599] Prevent ListingFileCatalog from failing if path doesn't exist
-      // Note that statuses only include FileStatus for the files and dirs directly under path,
-      // and does not include anything else recursively.
-      val statuses = try fs.listStatus(path) catch {
-        case _: FileNotFoundException =>
-          logWarning(s"The directory $path was not found. Was it deleted very recently?")
-          Array.empty[FileStatus]
-      }
-
-      val allLeafStatuses = {
-        val (dirs, files) = statuses.partition(_.isDirectory)
-        val stats = files ++ dirs.flatMap(dir => listLeafFiles0(fs, dir.getPath, filter))
-        if (filter != null) stats.filter(f => filter.accept(f.getPath)) else stats
-      }
-
-      allLeafStatuses.filterNot(status => shouldFilterOut(status.getPath.getName)).map {
-        case f: LocatedFileStatus =>
-          f
-
-        // NOTE:
-        //
-        // - Although S3/S3A/S3N file system can be quite slow for remote file metadata
-        //   operations, calling `getFileBlockLocations` does no harm here since these file system
-        //   implementations don't actually issue RPC for this method.
-        //
-        // - Here we are calling `getFileBlockLocations` in a sequential manner, but it should not
-        //   be a big deal since we always use to `listLeafFilesInParallel` when the number of
-        //   paths exceeds threshold.
-        case f =>
-          // The other constructor of LocatedFileStatus will call FileStatus.getPermission(),
-          // which is very slow on some file system (RawLocalFileSystem, which is launch a
-          // subprocess and parse the stdout).
-          val locations = fs.getFileBlockLocations(f, 0, f.getLen)
-          val lfs = new LocatedFileStatus(f.getLen, f.isDirectory, f.getReplication, f.getBlockSize,
-            f.getModificationTime, 0, null, null, null, null, f.getPath, locations)
-          if (f.isSymlink) {
-            lfs.setSymlink(f.getSymlink)
-          }
-          lfs
-      }
-    }
-  }
-
-  /** Checks if we should filter out this path name. */
-  def shouldFilterOut(pathName: String): Boolean = {
-    // We filter everything that starts with _ and ., except _common_metadata and _metadata
-    // because Parquet needs to find those metadata files from leaf files returned by this method.
-    // We should refactor this logic to not mix metadata files with data files.
-    ((pathName.startsWith("_") && !pathName.contains("=")) || pathName.startsWith(".")) &&
-      !pathName.startsWith("_common_metadata") && !pathName.startsWith("_metadata")
-  }
+  override def hashCode(): Int = rootPaths.toSet.hashCode()
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala
index d9562fd32e87d..7c28d48f26416 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala
@@ -94,7 +94,7 @@ case class LogicalRelation(
   }
 
   override def refresh(): Unit = relation match {
-    case fs: HadoopFsRelation => fs.refresh()
+    case fs: HadoopFsRelation => fs.location.refresh()
     case _ =>  // Do nothing.
   }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
index 702ba97222e34..04d7d89250586 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
@@ -21,7 +21,6 @@ import scala.collection.mutable
 
 import org.apache.hadoop.fs.{FileStatus, Path}
 
-import org.apache.spark.internal.Logging
 import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.catalyst.{expressions, InternalRow}
 import org.apache.spark.sql.catalyst.expressions._
@@ -40,9 +39,10 @@ abstract class PartitioningAwareFileCatalog(
     sparkSession: SparkSession,
     parameters: Map[String, String],
     partitionSchema: Option[StructType])
-  extends FileCatalog with Logging {
+  extends SessionFileCatalog(sparkSession) with FileCatalog {
+  import PartitioningAwareFileCatalog.BASE_PATH_PARAM
 
-  protected val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions(parameters)
+  override protected val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions(parameters)
 
   protected def leafFiles: mutable.LinkedHashMap[Path, FileStatus]
 
@@ -72,8 +72,8 @@ abstract class PartitioningAwareFileCatalog(
 
   override def allFiles(): Seq[FileStatus] = {
     if (partitionSpec().partitionColumns.isEmpty) {
-      // For each of the input paths, get the list of files inside them
-      paths.flatMap { path =>
+      // For each of the root input paths, get the list of files inside them
+      rootPaths.flatMap { path =>
         // Make the path qualified (consistent with listLeafFiles and listLeafFilesInParallel).
         val fs = path.getFileSystem(hadoopConf)
         val qualifiedPathPre = fs.makeQualified(path)
@@ -105,8 +105,6 @@ abstract class PartitioningAwareFileCatalog(
   protected def inferPartitioning(): PartitionSpec = {
     // We use leaf dirs containing data files to discover the schema.
     val leafDirs = leafDirToChildrenFiles.filter { case (_, files) =>
-      // SPARK-15895: Metadata files (e.g. Parquet summary files) and temporary files should not be
-      // counted as data files, so that they shouldn't participate partition discovery.
       files.exists(f => isDataPath(f.getPath))
     }.keys.toSeq
     partitionSchema match {
@@ -194,24 +192,23 @@ abstract class PartitioningAwareFileCatalog(
    * and the returned DataFrame will have the column of `something`.
    */
   private def basePaths: Set[Path] = {
-    parameters.get("basePath").map(new Path(_)) match {
+    parameters.get(BASE_PATH_PARAM).map(new Path(_)) match {
       case Some(userDefinedBasePath) =>
         val fs = userDefinedBasePath.getFileSystem(hadoopConf)
         if (!fs.isDirectory(userDefinedBasePath)) {
-          throw new IllegalArgumentException("Option 'basePath' must be a directory")
+          throw new IllegalArgumentException(s"Option '$BASE_PATH_PARAM' must be a directory")
         }
         Set(fs.makeQualified(userDefinedBasePath))
 
       case None =>
-        paths.map { path =>
+        rootPaths.map { path =>
           // Make the path qualified (consistent with listLeafFiles and listLeafFilesInParallel).
           val qualifiedPath = path.getFileSystem(hadoopConf).makeQualified(path)
           if (leafFiles.contains(qualifiedPath)) qualifiedPath.getParent else qualifiedPath }.toSet
     }
   }
+}
 
-  private def isDataPath(path: Path): Boolean = {
-    val name = path.getName
-    !((name.startsWith("_") && !name.contains("=")) || name.startsWith("."))
-  }
+object PartitioningAwareFileCatalog {
+  val BASE_PATH_PARAM = "basePath"
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala
new file mode 100644
index 0000000000000..47b69eb721b29
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala
@@ -0,0 +1,230 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources
+
+import java.io.FileNotFoundException
+
+import scala.collection.mutable
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs._
+import org.apache.hadoop.mapred.{FileInputFormat, JobConf}
+
+import org.apache.spark.internal.Logging
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.util.SerializableConfiguration
+
+
+/**
+ * A base class for [[BasicFileCatalog]]s that need a [[SparkSession]] and the ability to find leaf
+ * files in a list of HDFS paths.
+ *
+ * @param sparkSession a [[SparkSession]], used to read the parallel partition discovery
+ *                     threshold and to run the parallel listing job
+ */
+abstract class SessionFileCatalog(sparkSession: SparkSession)
+    extends BasicFileCatalog with Logging {
+  protected val hadoopConf: Configuration
+
+  /**
+   * List leaf files of given paths. This method will submit a Spark job to do parallel
+   * listing whenever the number of given paths reaches the parallel partition discovery
+   * threshold.
+   *
+   * This is publicly visible for testing.
+   */
+  def listLeafFiles(paths: Seq[Path]): mutable.LinkedHashSet[FileStatus] = {
+    val files =
+      if (paths.length >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) {
+        SessionFileCatalog.listLeafFilesInParallel(paths, hadoopConf, sparkSession)
+      } else {
+        SessionFileCatalog.listLeafFilesInSerial(paths, hadoopConf)
+      }
+
+    mutable.LinkedHashSet(files: _*)
+  }
+
+  // SPARK-15895: Metadata files (e.g. Parquet summary files) and temporary files should not be
+  // counted as data files, so that they shouldn't participate partition discovery.
+  protected def isDataPath(path: Path): Boolean = {
+    val name = path.getName
+    !((name.startsWith("_") && !name.contains("=")) || name.startsWith("."))
+  }
+}
+
+object SessionFileCatalog extends Logging {
+
+  /** A serializable variant of HDFS's BlockLocation. */
+  private case class SerializableBlockLocation(
+      names: Array[String],
+      hosts: Array[String],
+      offset: Long,
+      length: Long)
+
+  /** A serializable variant of HDFS's FileStatus. */
+  private case class SerializableFileStatus(
+      path: String,
+      length: Long,
+      isDir: Boolean,
+      blockReplication: Short,
+      blockSize: Long,
+      modificationTime: Long,
+      accessTime: Long,
+      blockLocations: Array[SerializableBlockLocation])
+
+  /**
+   * List a collection of paths recursively.
+   */
+  private def listLeafFilesInSerial(
+      paths: Seq[Path],
+      hadoopConf: Configuration): Seq[FileStatus] = {
+    // Dummy jobconf to get to the pathFilter defined in configuration
+    val jobConf = new JobConf(hadoopConf, this.getClass)
+    val filter = FileInputFormat.getInputPathFilter(jobConf)
+
+    paths.flatMap { path =>
+      val fs = path.getFileSystem(hadoopConf)
+      listLeafFiles0(fs, path, filter)
+    }
+  }
+
+  /**
+   * List a collection of paths recursively in parallel (using Spark executors).
+   * Each task launched will use [[listLeafFilesInSerial]] to list.
+   */
+  private def listLeafFilesInParallel(
+      paths: Seq[Path],
+      hadoopConf: Configuration,
+      sparkSession: SparkSession): Seq[FileStatus] = {
+    assert(paths.size >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold)
+    logInfo(s"Listing leaf files and directories in parallel under: ${paths.mkString(", ")}")
+
+    val sparkContext = sparkSession.sparkContext
+    val serializableConfiguration = new SerializableConfiguration(hadoopConf)
+    val serializedPaths = paths.map(_.toString)
+
+    // Cap the parallelism to prevent the following file listing from generating too many tasks
+    // in case of a large #defaultParallelism.
+    val numParallelism = Math.min(paths.size, 10000)
+
+    val statuses = sparkContext
+      .parallelize(serializedPaths, numParallelism)
+      .mapPartitions { paths =>
+        val hadoopConf = serializableConfiguration.value
+        listLeafFilesInSerial(paths.map(new Path(_)).toSeq, hadoopConf).iterator
+      }.map { status =>
+        // Turn FileStatus into SerializableFileStatus so we can send it back to the driver
+        val blockLocations = status match {
+          case f: LocatedFileStatus =>
+            f.getBlockLocations.map { loc =>
+              SerializableBlockLocation(
+                loc.getNames,
+                loc.getHosts,
+                loc.getOffset,
+                loc.getLength)
+            }
+
+          case _ =>
+            Array.empty[SerializableBlockLocation]
+        }
+
+        SerializableFileStatus(
+          status.getPath.toString,
+          status.getLen,
+          status.isDirectory,
+          status.getReplication,
+          status.getBlockSize,
+          status.getModificationTime,
+          status.getAccessTime,
+          blockLocations)
+      }.collect()
+
+    // Turn SerializableFileStatus back into LocatedFileStatus
+    statuses.map { f =>
+      val blockLocations = f.blockLocations.map { loc =>
+        new BlockLocation(loc.names, loc.hosts, loc.offset, loc.length)
+      }
+      new LocatedFileStatus(
+        new FileStatus(
+          f.length, f.isDir, f.blockReplication, f.blockSize, f.modificationTime, new Path(f.path)),
+        blockLocations)
+    }
+  }
+
+  /**
+   * List the leaf files under a single path, in serial.
+   */
+  private def listLeafFiles0(
+      fs: FileSystem, path: Path, filter: PathFilter): Seq[FileStatus] = {
+    logTrace(s"Listing $path")
+    val name = path.getName.toLowerCase
+    if (shouldFilterOut(name)) {
+      Seq.empty[FileStatus]
+    } else {
+      // [SPARK-17599] Prevent ListingFileCatalog from failing if path doesn't exist
+      // Note that statuses only include FileStatus for the files and dirs directly under path,
+      // and does not include anything else recursively.
+      val statuses = try fs.listStatus(path) catch {
+        case _: FileNotFoundException =>
+          logWarning(s"The directory $path was not found. Was it deleted very recently?")
+          Array.empty[FileStatus]
+      }
+
+      val allLeafStatuses = {
+        val (dirs, files) = statuses.partition(_.isDirectory)
+        val stats = files ++ dirs.flatMap(dir => listLeafFiles0(fs, dir.getPath, filter))
+        if (filter != null) stats.filter(f => filter.accept(f.getPath)) else stats
+      }
+
+      allLeafStatuses.filterNot(status => shouldFilterOut(status.getPath.getName)).map {
+        case f: LocatedFileStatus =>
+          f
+
+        // NOTE:
+        //
+        // - Although S3/S3A/S3N file system can be quite slow for remote file metadata
+        //   operations, calling `getFileBlockLocations` does no harm here since these file system
+        //   implementations don't actually issue RPC for this method.
+        //
+        // - Here we are calling `getFileBlockLocations` in a sequential manner, but it should not
+        //   be a big deal since we always use `listLeafFilesInParallel` when the number of
+        //   paths reaches the threshold.
+        case f =>
+          // The other constructor of LocatedFileStatus will call FileStatus.getPermission(),
+          // which is very slow on some file systems (RawLocalFileSystem, which launches a
+          // subprocess and parses the stdout).
+          val locations = fs.getFileBlockLocations(f, 0, f.getLen)
+          val lfs = new LocatedFileStatus(f.getLen, f.isDirectory, f.getReplication, f.getBlockSize,
+            f.getModificationTime, 0, null, null, null, null, f.getPath, locations)
+          if (f.isSymlink) {
+            lfs.setSymlink(f.getSymlink)
+          }
+          lfs
+      }
+    }
+  }
+
+  /** Checks if we should filter out this path name. */
+  def shouldFilterOut(pathName: String): Boolean = {
+    // We filter everything that starts with _ and ., except _common_metadata and _metadata
+    // because Parquet needs to find those metadata files from leaf files returned by this method.
+    // We should refactor this logic to not mix metadata files with data files.
+    ((pathName.startsWith("_") && !pathName.contains("=")) || pathName.startsWith(".")) &&
+      !pathName.startsWith("_common_metadata") && !pathName.startsWith("_metadata")
+  }
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
new file mode 100644
index 0000000000000..d90ce19869e46
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources
+
+import org.apache.hadoop.fs.Path
+
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.catalog.CatalogTablePartition
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.types.{StructField, StructType}
+
+
+/**
+ * A [[BasicFileCatalog]] for a metastore catalog table.
+ *
+ * @param sparkSession a [[SparkSession]]
+ * @param db the table's database name
+ * @param table the table's (unqualified) name
+ * @param partitionSchema the schema of a partitioned table's partition columns
+ * @param sizeInBytes the table's data size in bytes
+ */
+class TableFileCatalog(
+    sparkSession: SparkSession,
+    db: String,
+    table: String,
+    partitionSchema: Option[StructType],
+    override val sizeInBytes: Long)
+  extends SessionFileCatalog(sparkSession) {
+
+  override protected val hadoopConf = sparkSession.sessionState.newHadoopConf
+
+  private val externalCatalog = sparkSession.sharedState.externalCatalog
+
+  private val catalogTable = externalCatalog.getTable(db, table)
+
+  private val baseLocation = catalogTable.storage.locationUri
+
+  override def rootPaths: Seq[Path] = baseLocation.map(new Path(_)).toSeq
+
+  override def listFiles(filters: Seq[Expression]): Seq[Partition] = partitionSchema match {
+    case Some(partitionSchema) =>
+      externalCatalog.listPartitionsByFilter(db, table, filters).flatMap {
+        case CatalogTablePartition(spec, storage, _) =>
+          storage.locationUri.map(new Path(_)).map { path =>
+            val files = listDataLeafFiles(path :: Nil).toSeq
+            val values =
+              InternalRow.fromSeq(partitionSchema.map { case StructField(name, dataType, _, _) =>
+                Cast(Literal(spec(name)), dataType).eval()
+              })
+            Partition(values, files)
+          }
+      }
+    case None =>
+      Partition(InternalRow.empty, listDataLeafFiles(rootPaths).toSeq) :: Nil
+  }
+
+  override def refresh(): Unit = {}
+
+
+  /**
+   * Returns a [[ListingFileCatalog]] for this table restricted to the subset of partitions
+   * specified by the given partition-pruning filters.
+   *
+   * @param filters partition-pruning filters
+   */
+  def filterPartitions(filters: Seq[Expression]): ListingFileCatalog = {
+    val rootPaths = partitionSchema match {
+      case Some(_) =>
+        externalCatalog
+          .listPartitionsByFilter(db, table, filters)
+          .flatMap(_.storage.locationUri)
+          .map(new Path(_))
+      case None =>
+        this.rootPaths
+    }
+    val parameters =
+      baseLocation
+        .map(loc => Map(PartitioningAwareFileCatalog.BASE_PATH_PARAM -> loc))
+        .getOrElse(Map.empty)
+
+    new ListingFileCatalog(sparkSession, rootPaths, parameters, partitionSchema)
+  }
+
+  private def listDataLeafFiles(paths: Seq[Path]) =
+    listLeafFiles(paths).filter(f => isDataPath(f.getPath))
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetadataLogFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetadataLogFileCatalog.scala
index a32c4671e3475..82b67cb1ca6ee 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetadataLogFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetadataLogFileCatalog.scala
@@ -47,7 +47,7 @@ class MetadataLogFileCatalog(sparkSession: SparkSession, path: Path)
     allFilesFromLog.toArray.groupBy(_.getPath.getParent)
   }
 
-  override def paths: Seq[Path] = path :: Nil
+  override def rootPaths: Seq[Path] = path :: Nil
 
   override def refresh(): Unit = { }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileCatalogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileCatalogSuite.scala
index fa3abd0098f5b..2695974b84b00 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileCatalogSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileCatalogSuite.scala
@@ -77,13 +77,14 @@ class FileCatalogSuite extends SharedSQLContext {
       val catalog1 = new ListingFileCatalog(
         spark, Seq(new Path(deletedFolder.getCanonicalPath)), Map.empty, None)
       // doesn't throw an exception
-      assert(catalog1.listLeafFiles(catalog1.paths).isEmpty)
+      assert(catalog1.listLeafFiles(catalog1.rootPaths).isEmpty)
     }
   }
 
   test("SPARK-17613 - PartitioningAwareFileCatalog: base path w/o '/' at end") {
     class MockCatalog(
-      override val paths: Seq[Path]) extends PartitioningAwareFileCatalog(spark, Map.empty, None) {
+      override val rootPaths: Seq[Path])
+      extends PartitioningAwareFileCatalog(spark, Map.empty, None) {
 
       override def refresh(): Unit = {}
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala
index c5deb31fec183..c32254d9dfde2 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala
@@ -395,7 +395,7 @@ class FileSourceStrategySuite extends QueryTest with SharedSQLContext with Predi
 
         val fileCatalog = new ListingFileCatalog(
           sparkSession = spark,
-          paths = Seq(new Path(tempDir)),
+          rootPaths = Seq(new Path(tempDir)),
           parameters = Map.empty[String, String],
           partitionSchema = None)
         // This should not fail.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalogSuite.scala
similarity index 66%
rename from sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalogSuite.scala
rename to sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalogSuite.scala
index f15730aeb11f2..df509583377ae 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalogSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalogSuite.scala
@@ -19,16 +19,16 @@ package org.apache.spark.sql.execution.datasources
 
 import org.apache.spark.SparkFunSuite
 
-class ListingFileCatalogSuite extends SparkFunSuite {
+class SessionFileCatalogSuite extends SparkFunSuite {
 
   test("file filtering") {
-    assert(!ListingFileCatalog.shouldFilterOut("abcd"))
-    assert(ListingFileCatalog.shouldFilterOut(".ab"))
-    assert(ListingFileCatalog.shouldFilterOut("_cd"))
+    assert(!SessionFileCatalog.shouldFilterOut("abcd"))
+    assert(SessionFileCatalog.shouldFilterOut(".ab"))
+    assert(SessionFileCatalog.shouldFilterOut("_cd"))
 
-    assert(!ListingFileCatalog.shouldFilterOut("_metadata"))
-    assert(!ListingFileCatalog.shouldFilterOut("_common_metadata"))
-    assert(ListingFileCatalog.shouldFilterOut("_ab_metadata"))
-    assert(ListingFileCatalog.shouldFilterOut("_cd_common_metadata"))
+    assert(!SessionFileCatalog.shouldFilterOut("_metadata"))
+    assert(!SessionFileCatalog.shouldFilterOut("_common_metadata"))
+    assert(SessionFileCatalog.shouldFilterOut("_ab_metadata"))
+    assert(SessionFileCatalog.shouldFilterOut("_cd_common_metadata"))
   }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala
index 8d18be9300f7e..43357c97c395a 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala
@@ -30,7 +30,7 @@ import org.apache.parquet.hadoop.ParquetOutputFormat
 import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.Literal
-import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation, PartitionDirectory => Partition, PartitioningUtils, PartitionSpec}
+import org.apache.spark.sql.execution.datasources.{FileCatalog, HadoopFsRelation, LogicalRelation, PartitionDirectory => Partition, PartitioningUtils, PartitionSpec}
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SharedSQLContext
@@ -626,8 +626,8 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with Sha
       (1 to 10).map(i => (i, i.toString)).toDF("a", "b").write.parquet(dir.getCanonicalPath)
       val queryExecution = spark.read.parquet(dir.getCanonicalPath).queryExecution
       queryExecution.analyzed.collectFirst {
-        case LogicalRelation(relation: HadoopFsRelation, _, _) =>
-          assert(relation.partitionSpec === PartitionSpec.emptySpec)
+        case LogicalRelation(HadoopFsRelation(location: FileCatalog, _, _, _, _, _), _, _) =>
+          assert(location.partitionSpec === PartitionSpec.emptySpec)
       }.getOrElse {
         fail(s"Expecting a ParquetRelation2, but got:\n$queryExecution")
       }
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
index b5d93c3d7c804..a103f0363c615 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
@@ -29,17 +29,17 @@ import org.apache.thrift.TException
 import org.apache.spark.SparkConf
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.AnalysisException
-import org.apache.spark.sql.catalyst.TableIdentifier
+import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier}
 import org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException
 import org.apache.spark.sql.catalyst.catalog._
-import org.apache.spark.sql.catalyst.expressions.Expression
+import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, Statistics}
 import org.apache.spark.sql.execution.command.{ColumnStatStruct, DDLUtils}
 import org.apache.spark.sql.execution.datasources.CaseInsensitiveMap
 import org.apache.spark.sql.hive.client.HiveClient
 import org.apache.spark.sql.internal.HiveSerDe
 import org.apache.spark.sql.internal.StaticSQLConf._
-import org.apache.spark.sql.types.{DataType, StructType}
+import org.apache.spark.sql.types.{DataType, StructField, StructType}
 
 
 /**
@@ -650,8 +650,39 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
   override def listPartitionsByFilter(
       db: String,
       table: String,
-      predicates: Seq[Expression]): Seq[CatalogTablePartition] = {
-    client.getPartitionsByFilter(db, table, predicates)
+      predicates: Seq[Expression]): Seq[CatalogTablePartition] = withClient {
+    val catalogTable = client.getTable(db, table)
+    val partitionColumnNames = catalogTable.partitionColumnNames.toSet
+    val nonPartitionPruningPredicates = predicates.filterNot {
+      _.references.map(_.name).toSet.subsetOf(partitionColumnNames)
+    }
+
+    if (nonPartitionPruningPredicates.nonEmpty) {
+        sys.error("Expected only partition pruning predicates: " +
+          predicates.reduceLeft(And))
+    }
+
+    val partitionSchema = catalogTable.partitionSchema
+
+    if (predicates.nonEmpty) {
+      val clientPrunedPartitions =
+        client.getPartitionsByFilter(catalogTable, predicates)
+      val boundPredicate =
+        InterpretedPredicate.create(predicates.reduce(And).transform {
+          case att: AttributeReference =>
+            val index = partitionSchema.indexWhere(_.name == att.name)
+            BoundReference(index, partitionSchema(index).dataType, nullable = true)
+        })
+      clientPrunedPartitions.filter { case CatalogTablePartition(spec, _, _) =>
+        val row =
+          InternalRow.fromSeq(partitionSchema.map { case StructField(name, dataType, _, _) =>
+            Cast(Literal(spec(name)), dataType).eval()
+          })
+        boundPredicate(row)
+      }
+    } else {
+      client.getPartitions(catalogTable)
+    }
   }
 
   // --------------------------------------------------------------------------
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index c44f0adda44c0..f4494f564a689 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -135,16 +135,16 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
 
   private def getCached(
       tableIdentifier: QualifiedTableName,
-      pathsInMetastore: Seq[String],
       metastoreRelation: MetastoreRelation,
       schemaInMetastore: StructType,
       expectedFileFormat: Class[_ <: FileFormat],
       expectedBucketSpec: Option[BucketSpec],
-      partitionSpecInMetastore: Option[PartitionSpec]): Option[LogicalRelation] = {
+      partitionSchema: Option[StructType]): Option[LogicalRelation] = {
 
     cachedDataSourceTables.getIfPresent(tableIdentifier) match {
       case null => None // Cache miss
       case logical @ LogicalRelation(relation: HadoopFsRelation, _, _) =>
+        val metastoreRelationRootPath = metastoreRelation.hiveQlTable.getDataLocation
         val cachedRelationFileFormatClass = relation.fileFormat.getClass
 
         expectedFileFormat match {
@@ -152,12 +152,10 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
             // If we have the same paths, same schema, and same partition spec,
             // we will use the cached relation.
             val useCached =
-              relation.location.paths.map(_.toString).toSet == pathsInMetastore.toSet &&
+              relation.location.rootPaths.toSet == Set(metastoreRelationRootPath) &&
                 logical.schema.sameType(schemaInMetastore) &&
                 relation.bucketSpec == expectedBucketSpec &&
-                relation.partitionSpec == partitionSpecInMetastore.getOrElse {
-                  PartitionSpec(StructType(Nil), Array.empty[PartitionDirectory])
-                }
+                relation.partitionSchema == partitionSchema.getOrElse(StructType(Nil))
 
             if (useCached) {
               Some(logical)
@@ -198,59 +196,30 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
 
     val result = if (metastoreRelation.hiveQlTable.isPartitioned) {
       val partitionSchema = StructType.fromAttributes(metastoreRelation.partitionKeys)
-      val partitionColumnDataTypes = partitionSchema.map(_.dataType)
-      // We're converting the entire table into HadoopFsRelation, so predicates to Hive metastore
-      // are empty.
-      val partitions = metastoreRelation.getHiveQlPartitions().map { p =>
-        val location = p.getLocation
-        val values = InternalRow.fromSeq(p.getValues.asScala.zip(partitionColumnDataTypes).map {
-          case (rawValue, dataType) => Cast(Literal(rawValue), dataType).eval(null)
-        })
-        PartitionDirectory(values, location)
-      }
-      val partitionSpec = PartitionSpec(partitionSchema, partitions)
-      val partitionPaths = partitions.map(_.path.toString)
-
-      // By convention (for example, see MetaStorePartitionedTableFileCatalog), the definition of a
-      // partitioned table's paths depends on whether that table has any actual partitions.
-      // Partitioned tables without partitions use the location of the table's base path.
-      // Partitioned tables with partitions use the locations of those partitions' data locations,
-      // _omitting_ the table's base path.
-      val paths = if (partitionPaths.isEmpty) {
-        Seq(metastoreRelation.hiveQlTable.getDataLocation.toString)
-      } else {
-        partitionPaths
-      }
 
       val cached = getCached(
         tableIdentifier,
-        paths,
         metastoreRelation,
         metastoreSchema,
         fileFormatClass,
         bucketSpec,
-        Some(partitionSpec))
-
-      val hadoopFsRelation = cached.getOrElse {
-        val fileCatalog = new MetaStorePartitionedTableFileCatalog(
-          sparkSession,
-          new Path(metastoreRelation.catalogTable.storage.locationUri.get),
-          partitionSpec)
-
-        val inferredSchema = if (fileType.equals("parquet")) {
-          val inferredSchema =
-            defaultSource.inferSchema(sparkSession, options, fileCatalog.allFiles())
-          inferredSchema.map { inferred =>
-            ParquetFileFormat.mergeMetastoreParquetSchema(metastoreSchema, inferred)
-          }.getOrElse(metastoreSchema)
-        } else {
-          defaultSource.inferSchema(sparkSession, options, fileCatalog.allFiles()).get
-        }
+        Some(partitionSchema))
+
+      val logicalRelation = cached.getOrElse {
+        val db = metastoreRelation.databaseName
+        val table = metastoreRelation.tableName
+        val sizeInBytes = metastoreRelation.statistics.sizeInBytes.toLong
+        val fileCatalog =
+          new TableFileCatalog(sparkSession, db, table, Some(partitionSchema), sizeInBytes)
+        val partitionSchemaColumnNames = partitionSchema.map(_.name.toLowerCase).toSet
+        val dataSchema =
+          StructType(metastoreSchema
+            .filterNot(field => partitionSchemaColumnNames.contains(field.name.toLowerCase)))
 
         val relation = HadoopFsRelation(
           location = fileCatalog,
           partitionSchema = partitionSchema,
-          dataSchema = inferredSchema,
+          dataSchema = dataSchema,
           bucketSpec = bucketSpec,
           fileFormat = defaultSource,
           options = options)(sparkSession = sparkSession)
@@ -260,12 +229,11 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
         created
       }
 
-      hadoopFsRelation
+      logicalRelation
     } else {
-      val paths = Seq(metastoreRelation.hiveQlTable.getDataLocation.toString)
+      val rootPath = metastoreRelation.hiveQlTable.getDataLocation
 
       val cached = getCached(tableIdentifier,
-        paths,
         metastoreRelation,
         metastoreSchema,
         fileFormatClass,
@@ -276,14 +244,13 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
           LogicalRelation(
             DataSource(
               sparkSession = sparkSession,
-              paths = paths,
+              paths = rootPath.toString :: Nil,
               userSpecifiedSchema = Some(metastoreRelation.schema),
               bucketSpec = bucketSpec,
               options = options,
               className = fileType).resolveRelation(),
               catalogTable = Some(metastoreRelation.catalogTable))
 
-
         cachedDataSourceTables.put(tableIdentifier, created)
         created
       }
@@ -371,34 +338,3 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
     }
   }
 }
-
-/**
- * An override of the standard HDFS listing based catalog, that overrides the partition spec with
- * the information from the metastore.
- *
- * @param tableBasePath The default base path of the Hive metastore table
- * @param partitionSpec The partition specifications from Hive metastore
- */
-private[hive] class MetaStorePartitionedTableFileCatalog(
-    sparkSession: SparkSession,
-    tableBasePath: Path,
-    override val partitionSpec: PartitionSpec)
-  extends ListingFileCatalog(
-    sparkSession,
-    MetaStorePartitionedTableFileCatalog.getPaths(tableBasePath, partitionSpec),
-    Map.empty,
-    Some(partitionSpec.partitionColumns)) {
-}
-
-private[hive] object MetaStorePartitionedTableFileCatalog {
-  /** Get the list of paths to list files in the for a metastore table */
-  def getPaths(tableBasePath: Path, partitionSpec: PartitionSpec): Seq[Path] = {
-    // If there are no partitions currently specified then use base path,
-    // otherwise use the paths corresponding to the partitions.
-    if (partitionSpec.partitions.isEmpty) {
-      Seq(tableBasePath)
-    } else {
-      partitionSpec.partitions.map(_.path)
-    }
-  }
-}
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala
index 9ee3d629c9977..569a9c11398ea 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala
@@ -172,15 +172,24 @@ private[hive] trait HiveClient {
    * Returns the partitions for the given table that match the supplied partition spec.
    * If no partition spec is specified, all partitions are returned.
    */
-  def getPartitions(
+  final def getPartitions(
       db: String,
       table: String,
+      partialSpec: Option[TablePartitionSpec]): Seq[CatalogTablePartition] = {
+    getPartitions(getTable(db, table), partialSpec)
+  }
+
+  /**
+   * Returns the partitions for the given table that match the supplied partition spec.
+   * If no partition spec is specified, all partitions are returned.
+   */
+  def getPartitions(
+      catalogTable: CatalogTable,
       partialSpec: Option[TablePartitionSpec] = None): Seq[CatalogTablePartition]
 
   /** Returns partitions filtered by predicates for the given table. */
   def getPartitionsByFilter(
-      db: String,
-      table: String,
+      catalogTable: CatalogTable,
       predicates: Seq[Expression]): Seq[CatalogTablePartition]
 
   /** Loads a static partition into an existing table. */
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
index 5c8f7ff1af9fa..dd33d750a4d45 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
@@ -525,10 +525,9 @@ private[hive] class HiveClientImpl(
    * If no partition spec is specified, all partitions are returned.
    */
   override def getPartitions(
-      db: String,
-      table: String,
+      table: CatalogTable,
       spec: Option[TablePartitionSpec]): Seq[CatalogTablePartition] = withHiveState {
-    val hiveTable = toHiveTable(getTable(db, table))
+    val hiveTable = toHiveTable(table)
     spec match {
       case None => shim.getAllPartitions(client, hiveTable).map(fromHivePartition)
       case Some(s) => client.getPartitions(hiveTable, s.asJava).asScala.map(fromHivePartition)
@@ -536,10 +535,9 @@ private[hive] class HiveClientImpl(
   }
 
   override def getPartitionsByFilter(
-      db: String,
-      table: String,
+      table: CatalogTable,
       predicates: Seq[Expression]): Seq[CatalogTablePartition] = withHiveState {
-    val hiveTable = toHiveTable(getTable(db, table))
+    val hiveTable = toHiveTable(table)
     shim.getPartitionsByFilter(client, hiveTable, predicates).map(fromHivePartition)
   }
 
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala
index c158bf1ab09cb..9a10957c8efa5 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala
@@ -295,12 +295,12 @@ class VersionsSuite extends SparkFunSuite with Logging {
     }
 
     test(s"$version: getPartitions(catalogTable)") {
-      assert(2 == client.getPartitions("default", "src_part").size)
+      assert(2 == client.getPartitions(client.getTable("default", "src_part")).size)
     }
 
     test(s"$version: getPartitionsByFilter") {
       // Only one partition [1, 1] for key2 == 1
-      val result = client.getPartitionsByFilter("default", "src_part",
+      val result = client.getPartitionsByFilter(client.getTable("default", "src_part"),
         Seq(EqualTo(AttributeReference("key2", IntegerType)(), Literal(1))))
 
       // Hive 0.12 doesn't support getPartitionsByFilter, it ignores the filter condition.

From e816919fe8b4cd06cc91fb373e8e55f7c18e99b6 Mon Sep 17 00:00:00 2001
From: Michael Allman <michael@videoamp.com>
Date: Mon, 12 Sep 2016 18:21:38 -0700
Subject: [PATCH 24/99] Add a new catalyst optimizer rule to SQL core for
 pruning unnecessary partition data from a HadoopFsRelation's file catalog

---
 .../spark/sql/execution/SparkOptimizer.scala  |  2 +
 .../PruneFileSourcePartitions.scala           | 72 +++++++++++++++++++
 2 files changed, 74 insertions(+)
 create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala
index 8b762b5d6c5f2..981728331d361 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql.execution
 import org.apache.spark.sql.ExperimentalMethods
 import org.apache.spark.sql.catalyst.catalog.SessionCatalog
 import org.apache.spark.sql.catalyst.optimizer.Optimizer
+import org.apache.spark.sql.execution.datasources.PruneFileSourcePartitions
 import org.apache.spark.sql.execution.python.ExtractPythonUDFFromAggregate
 import org.apache.spark.sql.internal.SQLConf
 
@@ -32,5 +33,6 @@ class SparkOptimizer(
   override def batches: Seq[Batch] = super.batches :+
     Batch("Optimize Metadata Only Query", Once, OptimizeMetadataOnlyQuery(catalog, conf)) :+
     Batch("Extract Python UDF from Aggregate", Once, ExtractPythonUDFFromAggregate) :+
+    Batch("Prune File Source Table Partitions", Once, PruneFileSourcePartitions) :+
     Batch("User Provided Optimizers", fixedPoint, experimentalMethods.extraOptimizations: _*)
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala
new file mode 100644
index 0000000000000..b8af0f53423bd
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources
+
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.planning.PhysicalOperation
+import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project}
+import org.apache.spark.sql.catalyst.rules.Rule
+
+private[sql] object PruneFileSourcePartitions extends Rule[LogicalPlan] {
+  override def apply(plan: LogicalPlan): LogicalPlan = plan transformDown {
+    case op @ PhysicalOperation(projects, filters,
+        logicalRelation @
+          LogicalRelation(fsRelation @
+            HadoopFsRelation(
+              tableFileCatalog: TableFileCatalog,
+              partitionSchema,
+              _,
+              _,
+              _,
+              _),
+            _,
+            _))
+        if filters.nonEmpty && fsRelation.partitionSchemaOption.isDefined =>
+      // Under case-insensitive resolution, an attribute name in a predicate may differ in case
+      // from the name in the relation's schema. Rename each attribute to the schema's spelling
+      // so that we do not need to worry about case sensitivity from here on.
+      val normalizedFilters = filters.map { e =>
+        e transform {
+          case a: AttributeReference =>
+            a.withName(logicalRelation.output.find(_.semanticEquals(a)).get.name)
+        }
+      }
+
+      val sparkSession = fsRelation.sparkSession
+      val partitionColumns =
+        logicalRelation.resolve(
+          partitionSchema, sparkSession.sessionState.analyzer.resolver)
+      val partitionSet = AttributeSet(partitionColumns)
+      val partitionKeyFilters =
+        ExpressionSet(normalizedFilters.filter(_.references.subsetOf(partitionSet)))
+
+      if (partitionKeyFilters.nonEmpty) {
+          val prunedFileCatalog = tableFileCatalog.filterPartitions(partitionKeyFilters.toSeq)
+          val prunedFsRelation =
+            fsRelation.copy(location = prunedFileCatalog)(sparkSession)
+          val prunedLogicalRelation = logicalRelation.copy(relation = prunedFsRelation)
+
+          // Keep partition-pruning predicates so that they are visible in physical planning
+          val filterExpression = filters.reduceLeft(And)
+          val filter = Filter(filterExpression, prunedLogicalRelation)
+          Project(projects, filter)
+      } else {
+        op
+      }
+  }
+}

From 8cca6dc02847eb04740ec1ed5d29920b4f2f0030 Mon Sep 17 00:00:00 2001
From: Michael Allman <michael@videoamp.com>
Date: Fri, 7 Oct 2016 17:15:11 -0700
Subject: [PATCH 25/99] Include the type of file catalog in the
 FileSourceScanExec metadata

---
 .../sql/execution/DataSourceScanExec.scala      | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala
index 808f2052c48b3..4065483262ec8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala
@@ -225,13 +225,16 @@ case class FileSourceScanExec(
   }
 
   // These metadata values make scan plans uniquely identifiable for equality checking.
-  override val metadata: Map[String, String] = Map(
-    "Format" -> relation.fileFormat.toString,
-    "ReadSchema" -> outputSchema.catalogString,
-    "Batched" -> supportsBatch.toString,
-    "PartitionFilters" -> partitionFilters.mkString("[", ", ", "]"),
-    "PushedFilters" -> dataFilters.mkString("[", ", ", "]"),
-    "RootPaths" -> relation.location.rootPaths.mkString(", "))
+  override val metadata: Map[String, String] = {
+    def seqToString(seq: Seq[Any]) = seq.mkString("[", ", ", "]")
+    val location = relation.location
+    Map("Format" -> relation.fileFormat.toString,
+      "ReadSchema" -> outputSchema.catalogString,
+      "Batched" -> supportsBatch.toString,
+      "PartitionFilters" -> seqToString(partitionFilters),
+      "PushedFilters" -> seqToString(dataFilters),
+      "Location" -> (location.getClass.getSimpleName + seqToString(location.rootPaths)))
+  }
 
   private lazy val inputRDD: RDD[InternalRow] = {
     val readFile: (PartitionedFile) => Iterator[InternalRow] =

From 7acc3f1072ece6b2e5f5324ff84bbcbeae487ef2 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Mon, 10 Oct 2016 19:54:53 -0700
Subject: [PATCH 26/99] try out parquet case insensitive fallback

---
 .../parquet/ParquetReadSupport.scala          |  6 +++-
 .../parquet/ParquetSchemaSuite.scala          | 28 +++++++++++++++++++
 2 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala
index f1a35dd8a6200..4dea8cf29ec58 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala
@@ -269,11 +269,15 @@ private[parquet] object ParquetReadSupport {
    */
   private def clipParquetGroupFields(
       parquetRecord: GroupType, structType: StructType): Seq[Type] = {
-    val parquetFieldMap = parquetRecord.getFields.asScala.map(f => f.getName -> f).toMap
+    val parquetFieldMap = parquetRecord.getFields.asScala
+      .map(f => f.getName -> f).toMap
+    val caseInsensitiveParquetFieldMap = parquetRecord.getFields.asScala
+      .map(f => f.getName.toLowerCase -> f).toMap
     val toParquet = new ParquetSchemaConverter(writeLegacyParquetFormat = false)
     structType.map { f =>
       parquetFieldMap
         .get(f.name)
+        .orElse(caseInsensitiveParquetFieldMap.get(f.name.toLowerCase))
         .map(clipParquetType(_, f.dataType))
         .getOrElse(toParquet.convertField(f))
     }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala
index 8a980a7eb538f..c3d202ced24c8 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala
@@ -1080,6 +1080,34 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
     }
   }
 
+  testSchemaClipping(
+    "falls back to case insensitive resolution",
+
+    parquetSchema =
+      """message root {
+        |  required group A {
+        |    optional int32 B;
+        |  }
+        |  optional int32 c;
+        |}
+      """.stripMargin,
+
+    catalystSchema = {
+      val nestedType = new StructType().add("b", IntegerType, nullable = true)
+      new StructType()
+        .add("a", nestedType, nullable = true)
+        .add("c", IntegerType, nullable = true)
+    },
+
+    expectedSchema =
+      """message root {
+        |  required group A {
+        |    optional int32 B;
+        |  }
+        |  optional int32 c;
+        |}
+      """.stripMargin)
+
   testSchemaClipping(
     "simple nested struct",
 

From cf7d1f15e0045cbd12c81a39138e7c3439c611d7 Mon Sep 17 00:00:00 2001
From: Michael Allman <michael@videoamp.com>
Date: Tue, 11 Oct 2016 10:53:13 -0700
Subject: [PATCH 27/99] Refactor the FileSourceScanExec.metadata val to make it
 prettier

---
 .../apache/spark/sql/execution/DataSourceScanExec.scala    | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala
index 4065483262ec8..ee61f7f0413da 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala
@@ -228,12 +228,15 @@ case class FileSourceScanExec(
   override val metadata: Map[String, String] = {
     def seqToString(seq: Seq[Any]) = seq.mkString("[", ", ", "]")
     val location = relation.location
-    Map("Format" -> relation.fileFormat.toString,
+    val locationDesc =
+      location.getClass.getSimpleName + seqToString(location.rootPaths)
+    Map(
+      "Format" -> relation.fileFormat.toString,
       "ReadSchema" -> outputSchema.catalogString,
       "Batched" -> supportsBatch.toString,
       "PartitionFilters" -> seqToString(partitionFilters),
       "PushedFilters" -> seqToString(dataFilters),
-      "Location" -> (location.getClass.getSimpleName + seqToString(location.rootPaths)))
+      "Location" -> locationDesc)
   }
 
   private lazy val inputRDD: RDD[InternalRow] = {

From c75855c0615d88001a83c03a9515a9b1fff0b241 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Tue, 11 Oct 2016 16:03:18 -0700
Subject: [PATCH 28/99] fix and add test for input files

---
 .../scala/org/apache/spark/sql/Dataset.scala  |  4 +--
 .../execution/datasources/FileFormat.scala    |  7 +++--
 .../datasources/HadoopFsRelation.scala        |  4 ++-
 .../datasources/TableFileCatalog.scala        |  3 +-
 .../spark/sql/hive/HiveDataFrameSuite.scala   | 29 ++++++++++++++++++-
 5 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index 90897ac5d7b50..9b9f54e046fbf 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -2602,9 +2602,7 @@ class Dataset[T] private[sql](
    * @since 2.0.0
    */
   def inputFiles: Array[String] = {
-    val files: Seq[String] = logicalPlan.collect {
-      case LogicalRelation(HadoopFsRelation(location: FileCatalog, _, _, _, _, _), _, _) =>
-        location.inputFiles
+    val files: Seq[String] = queryExecution.optimizedPlan.collect {
       case LogicalRelation(fsBasedRelation: FileRelation, _, _) =>
         fsBasedRelation.inputFiles
       case fr: FileRelation =>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala
index 01bceef2efc2e..bd4b7add41c79 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala
@@ -29,6 +29,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjectio
 import org.apache.spark.sql.sources.Filter
 import org.apache.spark.sql.types.StructType
 
+
 /**
  * Used to read and write data stored in files to/from the [[InternalRow]] format.
  */
@@ -205,6 +206,9 @@ trait BasicFileCatalog {
    */
   def listFiles(filters: Seq[Expression]): Seq[Partition]
 
+  /** Returns the list of files that will be read when scanning this relation. */
+  def inputFiles: Array[String]
+
   /** Refresh any cached file listings */
   def refresh(): Unit
 
@@ -226,8 +230,7 @@ trait FileCatalog extends BasicFileCatalog {
   /** Returns all the valid files. */
   def allFiles(): Seq[FileStatus]
 
-  /** Returns the list of files that will be read when scanning this relation. */
-  def inputFiles: Array[String] =
+  override def inputFiles: Array[String] =
     allFiles().map(_.getPath.toUri.toString).toArray
 
   override def sizeInBytes: Long = allFiles().map(_.getLen).sum
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala
index c600d683cb695..db889edf032d6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala
@@ -44,7 +44,7 @@ case class HadoopFsRelation(
     bucketSpec: Option[BucketSpec],
     fileFormat: FileFormat,
     options: Map[String, String])(val sparkSession: SparkSession)
-  extends BaseRelation {
+  extends BaseRelation with FileRelation {
 
   override def sqlContext: SQLContext = sparkSession.sqlContext
 
@@ -66,4 +66,6 @@ case class HadoopFsRelation(
   }
 
   override def sizeInBytes: Long = location.sizeInBytes
+
+  override def inputFiles: Array[String] = location.inputFiles
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
index d90ce19869e46..2ca179f84e7f1 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
@@ -72,7 +72,6 @@ class TableFileCatalog(
 
   override def refresh(): Unit = {}
 
-
   /**
    * Returns a [[ListingFileCatalog]] for this table restricted to the subset of partitions
    * specified by the given partition-pruning filters.
@@ -97,6 +96,8 @@ class TableFileCatalog(
     new ListingFileCatalog(sparkSession, rootPaths, parameters, partitionSchema)
   }
 
+  override def inputFiles: Array[String] = filterPartitions(Nil).inputFiles
+
   private def listDataLeafFiles(paths: Seq[Path]) =
     listLeafFiles(paths).filter(f => isDataPath(f.getPath))
 }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala
index 96e9054cd4876..3771a295ce940 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala
@@ -18,9 +18,10 @@
 package org.apache.spark.sql.hive
 
 import org.apache.spark.sql.hive.test.TestHiveSingleton
+import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.QueryTest
 
-class HiveDataFrameSuite extends QueryTest with TestHiveSingleton {
+class HiveDataFrameSuite extends QueryTest with TestHiveSingleton with SQLTestUtils {
   test("table name with schema") {
     // regression test for SPARK-11778
     spark.sql("create schema usrdb")
@@ -34,4 +35,30 @@ class HiveDataFrameSuite extends QueryTest with TestHiveSingleton {
     val hiveClient = spark.sharedState.externalCatalog.asInstanceOf[HiveExternalCatalog].client
     assert(hiveClient.getConf("hive.in.test", "") == "true")
   }
+
+  test("inputFiles of pruned and partitioned table") {
+    withTable("test") {
+      withTempDir { dir =>
+        spark.range(5).selectExpr("id", "id as f1", "id as f2").write
+          .partitionBy("f1", "f2")
+          .mode("overwrite")
+          .parquet(dir.getAbsolutePath)
+
+        spark.sql(s"""
+          |create external table test (id long)
+          |partitioned by (f1 int, f2 int)
+          |stored as parquet
+          |location "${dir.getAbsolutePath}"""".stripMargin)
+        spark.sql("msck repair table test")
+
+        val df = spark.sql("select * from test")
+        assert(df.count() == 5)
+        assert(df.inputFiles.length == 5)  // unpruned
+
+        val df2 = spark.sql("select * from test where f2 = 3 or f2 = 4")
+        assert(df2.count() == 2)
+        assert(df2.inputFiles.length == 2)  // pruned, so we have fewer files
+      }
+    }
+  }
 }

From 821372f2fdc09ebd882bb6958bed24a42738235c Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Tue, 11 Oct 2016 16:09:06 -0700
Subject: [PATCH 29/99] rename test

---
 .../scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala
index 3771a295ce940..c5d234bf20e31 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala
@@ -36,7 +36,7 @@ class HiveDataFrameSuite extends QueryTest with TestHiveSingleton with SQLTestUt
     assert(hiveClient.getConf("hive.in.test", "") == "true")
   }
 
-  test("inputFiles of pruned and partitioned table") {
+  test("partitioned pruned table reports only selected files") {
     withTable("test") {
       withTempDir { dir =>
         spark.range(5).selectExpr("id", "id as f1", "id as f2").write

From d0b893ba5c45db32aad640ea6732a8803c054f07 Mon Sep 17 00:00:00 2001
From: Michael Allman <michael@videoamp.com>
Date: Tue, 11 Oct 2016 12:00:43 -0700
Subject: [PATCH 30/99] Refactor `TableFileCatalog.listFiles` to call
 `listDataLeafFiles` once instead of once per partition

---
 .../sql/execution/datasources/TableFileCatalog.scala | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
index 2ca179f84e7f1..3e2950bffb4d6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
@@ -55,10 +55,16 @@ class TableFileCatalog(
 
   override def listFiles(filters: Seq[Expression]): Seq[Partition] = partitionSchema match {
     case Some(partitionSchema) =>
-      externalCatalog.listPartitionsByFilter(db, table, filters).flatMap {
+      val catalogTablePartitions = externalCatalog.listPartitionsByFilter(db, table, filters)
+      val partitionPaths = catalogTablePartitions.flatMap {
         case CatalogTablePartition(spec, storage, _) =>
-          storage.locationUri.map(new Path(_)).map { path =>
-            val files = listDataLeafFiles(path :: Nil).toSeq
+          storage.locationUri.map(new Path(_))
+      }
+      val dataLeafFiles = listDataLeafFiles(partitionPaths).toSeq
+      catalogTablePartitions.flatMap {
+        case CatalogTablePartition(spec, storage, _) =>
+          storage.locationUri.map(new Path(_)).map { partitionPath =>
+            val files = dataLeafFiles.filter(_.getPath.getParent == partitionPath)
             val values =
               InternalRow.fromSeq(partitionSchema.map { case StructField(name, dataType, _, _) =>
                 Cast(Literal(spec(name)), dataType).eval()

From c47a2a3b38bb8ec62c3a6f91ae1dd90772396ba1 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Wed, 12 Oct 2016 13:20:11 -0700
Subject: [PATCH 31/99] feature flag

---
 .../execution/datasources/FileFormat.scala    |  7 ++--
 .../PruneFileSourcePartitions.scala           | 16 ++++-----
 .../datasources/TableFileCatalog.scala        |  5 ++-
 .../apache/spark/sql/internal/SQLConf.scala   |  9 +++++
 .../spark/sql/hive/HiveMetastoreCatalog.scala | 35 +++++++++++++++++--
 .../sql/hive/HiveMetadataCacheSuite.scala     | 35 +++++++++++++++++++
 6 files changed, 92 insertions(+), 15 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala
index bd4b7add41c79..e7239ef91b326 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala
@@ -189,9 +189,9 @@ case class Partition(values: InternalRow, files: Seq[FileStatus])
 trait BasicFileCatalog {
 
   /**
-   * Returns the list of root input paths from which the catalog will get files. These paths
-   * should *not* include any table partition directories. Partition directories are discovered or
-   * provided by a metastore catalog.
+   * Returns the list of root input paths from which the catalog will get files. There may be a
+   * single root path from which partitions are discovered, or individual partitions may be
+   * specified by each path.
    */
   def rootPaths: Seq[Path]
 
@@ -230,6 +230,7 @@ trait FileCatalog extends BasicFileCatalog {
   /** Returns all the valid files. */
   def allFiles(): Seq[FileStatus]
 
+  /** Returns the list of files that will be read when scanning this relation. */
   override def inputFiles: Array[String] =
     allFiles().map(_.getPath.toUri.toString).toArray
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala
index b8af0f53423bd..29121a47d92d1 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala
@@ -56,15 +56,15 @@ private[sql] object PruneFileSourcePartitions extends Rule[LogicalPlan] {
         ExpressionSet(normalizedFilters.filter(_.references.subsetOf(partitionSet)))
 
       if (partitionKeyFilters.nonEmpty) {
-          val prunedFileCatalog = tableFileCatalog.filterPartitions(partitionKeyFilters.toSeq)
-          val prunedFsRelation =
-            fsRelation.copy(location = prunedFileCatalog)(sparkSession)
-          val prunedLogicalRelation = logicalRelation.copy(relation = prunedFsRelation)
+        val prunedFileCatalog = tableFileCatalog.filterPartitions(partitionKeyFilters.toSeq)
+        val prunedFsRelation =
+          fsRelation.copy(location = prunedFileCatalog)(sparkSession)
+        val prunedLogicalRelation = logicalRelation.copy(relation = prunedFsRelation)
 
-          // Keep partition-pruning predicates so that they are visible in physical planning
-          val filterExpression = filters.reduceLeft(And)
-          val filter = Filter(filterExpression, prunedLogicalRelation)
-          Project(projects, filter)
+        // Keep partition-pruning predicates so that they are visible in physical planning
+        val filterExpression = filters.reduceLeft(And)
+        val filter = Filter(filterExpression, prunedLogicalRelation)
+        Project(projects, filter)
       } else {
         op
       }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
index 3e2950bffb4d6..b6431ca929f9c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
@@ -102,7 +102,10 @@ class TableFileCatalog(
     new ListingFileCatalog(sparkSession, rootPaths, parameters, partitionSchema)
   }
 
-  override def inputFiles: Array[String] = filterPartitions(Nil).inputFiles
+  // Not used in the hot path of queries when metastore partition pruning is enabled
+  lazy val cachedAllPartitions: ListingFileCatalog = filterPartitions(Nil)
+
+  override def inputFiles: Array[String] = cachedAllPartitions.inputFiles
 
   private def listDataLeafFiles(paths: Seq[Path]) =
     listLeafFiles(paths).filter(f => isDataPath(f.getPath))
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index c8447651dd672..cff49fbfdae3f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -269,6 +269,13 @@ object SQLConf {
       .booleanConf
       .createWithDefault(false)
 
+  val HIVE_DATASOURCE_PARTITION_PRUNING =
+    SQLConfigBuilder("spark.sql.hive.datasourcePartitionPruning")
+      .doc("When true, enable metastore partition pruning for Datasource tables as well. " +
+           "This is currently implemented for converted Hive tables only.")
+      .booleanConf
+      .createWithDefault(true)
+
   val OPTIMIZER_METADATA_ONLY = SQLConfigBuilder("spark.sql.optimizer.metadataOnly")
     .doc("When true, enable the metadata-only query optimization that use the table's metadata " +
       "to produce the partition columns instead of table scans. It applies when all the columns " +
@@ -676,6 +683,8 @@ private[sql] class SQLConf extends Serializable with CatalystConf with Logging {
 
   def metastorePartitionPruning: Boolean = getConf(HIVE_METASTORE_PARTITION_PRUNING)
 
+  def datasourcePartitionPruning: Boolean = getConf(HIVE_DATASOURCE_PARTITION_PRUNING)
+
   def gatherFastStats: Boolean = getConf(GATHER_FASTSTAT)
 
   def optimizerMetadataOnly: Boolean = getConf(OPTIMIZER_METADATA_ONLY)
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index f4494f564a689..cbaafae728926 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -135,6 +135,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
 
   private def getCached(
       tableIdentifier: QualifiedTableName,
+      pathsInMetastore: Seq[String],
       metastoreRelation: MetastoreRelation,
       schemaInMetastore: StructType,
       expectedFileFormat: Class[_ <: FileFormat],
@@ -152,7 +153,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
             // If we have the same paths, same schema, and same partition spec,
             // we will use the cached relation.
             val useCached =
-              relation.location.rootPaths.toSet == Set(metastoreRelationRootPath) &&
+              relation.location.rootPaths.map(_.toString).toSet == pathsInMetastore.toSet &&
                 logical.schema.sameType(schemaInMetastore) &&
                 relation.bucketSpec == expectedBucketSpec &&
                 relation.partitionSchema == partitionSchema.getOrElse(StructType(Nil))
@@ -194,11 +195,31 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
       QualifiedTableName(metastoreRelation.databaseName, metastoreRelation.tableName)
     val bucketSpec = None  // We don't support hive bucketed tables, only ones we write out.
 
+    val lazyPruningEnabled = sparkSession.sqlContext.conf.datasourcePartitionPruning
     val result = if (metastoreRelation.hiveQlTable.isPartitioned) {
       val partitionSchema = StructType.fromAttributes(metastoreRelation.partitionKeys)
 
+      val rootPaths = if (lazyPruningEnabled) {
+        Seq(metastoreRelation.hiveQlTable.getDataLocation.toString)
+      } else {
+        // By convention (for example, see TableFileCatalog), the definition of a
+        // partitioned table's paths depends on whether that table has any actual partitions.
+        // Partitioned tables without partitions use the location of the table's base path.
+        // Partitioned tables with partitions use the locations of those partitions' data
+        // locations,_omitting_ the table's base path.
+        val paths = metastoreRelation.getHiveQlPartitions().map { p =>
+          p.getLocation.toString
+        }
+        if (paths.isEmpty) {
+          Seq(metastoreRelation.hiveQlTable.getDataLocation.toString)
+        } else {
+          paths
+        }
+      }
+
       val cached = getCached(
         tableIdentifier,
+        rootPaths,
         metastoreRelation,
         metastoreSchema,
         fileFormatClass,
@@ -209,8 +230,15 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
         val db = metastoreRelation.databaseName
         val table = metastoreRelation.tableName
         val sizeInBytes = metastoreRelation.statistics.sizeInBytes.toLong
-        val fileCatalog =
-          new TableFileCatalog(sparkSession, db, table, Some(partitionSchema), sizeInBytes)
+        val fileCatalog = {
+          val catalog = new TableFileCatalog(
+            sparkSession, db, table, Some(partitionSchema), sizeInBytes)
+          if (lazyPruningEnabled) {
+            catalog
+          } else {
+            catalog.cachedAllPartitions
+          }
+        }
         val partitionSchemaColumnNames = partitionSchema.map(_.name.toLowerCase).toSet
         val dataSchema =
           StructType(metastoreSchema
@@ -234,6 +262,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
       val rootPath = metastoreRelation.hiveQlTable.getDataLocation
 
       val cached = getCached(tableIdentifier,
+        Seq(rootPath.toString),
         metastoreRelation,
         metastoreSchema,
         fileFormatClass,
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala
index 3414f5e0409a1..8b56bbee56e1c 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala
@@ -59,4 +59,39 @@ class HiveMetadataCacheSuite extends QueryTest with SQLTestUtils with TestHiveSi
       }
     }
   }
+
+  test("partitioned table is cached when partition pruning is off") {
+    withSQLConf("spark.sql.hive.datasourcePartitionPruning" -> "false") {
+      withTable("test") {
+        withTempDir { dir =>
+          spark.range(5).selectExpr("id", "id as f1", "id as f2").write
+            .partitionBy("f1", "f2")
+            .mode("overwrite")
+            .parquet(dir.getAbsolutePath)
+
+          spark.sql(s"""
+            |create external table test (id long)
+            |partitioned by (f1 int, f2 int)
+            |stored as parquet
+            |location "${dir.getAbsolutePath}"""".stripMargin)
+          spark.sql("msck repair table test")
+
+          val df = spark.sql("select * from test")
+          assert(sql("select * from test").count() == 5)
+
+          // Delete a file, then assert that we tried to read it. This means the table was cached.
+          val p = new Path(spark.table("test").inputFiles.head)
+          assert(p.getFileSystem(hiveContext.sessionState.newHadoopConf()).delete(p, false))
+          val e = intercept[SparkException] {
+            sql("select * from test").count()
+          }
+          assert(e.getMessage.contains("FileNotFoundException"))
+
+          // Test refreshing the cache.
+          spark.catalog.refreshTable("test")
+          assert(sql("select * from test").count() == 4)
+        }
+      }
+    }
+  }
 }

From ed7dd37ee55a92ff35e582415739f2d8b5c7a4eb Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Wed, 12 Oct 2016 15:48:03 -0700
Subject: [PATCH 32/99] Rename conf to filesourcePartitionPruning and
 compare root paths as Path

---
 .../org/apache/spark/sql/internal/SQLConf.scala |  8 ++++----
 .../spark/sql/hive/HiveMetastoreCatalog.scala   | 17 ++++++++---------
 .../spark/sql/hive/HiveMetadataCacheSuite.scala |  2 +-
 3 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index cff49fbfdae3f..e73d0187b584b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -269,9 +269,9 @@ object SQLConf {
       .booleanConf
       .createWithDefault(false)
 
-  val HIVE_DATASOURCE_PARTITION_PRUNING =
-    SQLConfigBuilder("spark.sql.hive.datasourcePartitionPruning")
-      .doc("When true, enable metastore partition pruning for Datasource tables as well. " +
+  val HIVE_FILESOURCE_PARTITION_PRUNING =
+    SQLConfigBuilder("spark.sql.hive.filesourcePartitionPruning")
+      .doc("When true, enable metastore partition pruning for file source tables as well. " +
            "This is currently implemented for converted Hive tables only.")
       .booleanConf
       .createWithDefault(true)
@@ -683,7 +683,7 @@ private[sql] class SQLConf extends Serializable with CatalystConf with Logging {
 
   def metastorePartitionPruning: Boolean = getConf(HIVE_METASTORE_PARTITION_PRUNING)
 
-  def datasourcePartitionPruning: Boolean = getConf(HIVE_DATASOURCE_PARTITION_PRUNING)
+  def filesourcePartitionPruning: Boolean = getConf(HIVE_FILESOURCE_PARTITION_PRUNING)
 
   def gatherFastStats: Boolean = getConf(GATHER_FASTSTAT)
 
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index cbaafae728926..4a2aaa7d4f6ca 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -135,7 +135,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
 
   private def getCached(
       tableIdentifier: QualifiedTableName,
-      pathsInMetastore: Seq[String],
+      pathsInMetastore: Seq[Path],
       metastoreRelation: MetastoreRelation,
       schemaInMetastore: StructType,
       expectedFileFormat: Class[_ <: FileFormat],
@@ -145,7 +145,6 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
     cachedDataSourceTables.getIfPresent(tableIdentifier) match {
       case null => None // Cache miss
       case logical @ LogicalRelation(relation: HadoopFsRelation, _, _) =>
-        val metastoreRelationRootPath = metastoreRelation.hiveQlTable.getDataLocation
         val cachedRelationFileFormatClass = relation.fileFormat.getClass
 
         expectedFileFormat match {
@@ -153,7 +152,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
             // If we have the same paths, same schema, and same partition spec,
             // we will use the cached relation.
             val useCached =
-              relation.location.rootPaths.map(_.toString).toSet == pathsInMetastore.toSet &&
+              relation.location.rootPaths.toSet == pathsInMetastore.toSet &&
                 logical.schema.sameType(schemaInMetastore) &&
                 relation.bucketSpec == expectedBucketSpec &&
                 relation.partitionSchema == partitionSchema.getOrElse(StructType(Nil))
@@ -195,12 +194,12 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
       QualifiedTableName(metastoreRelation.databaseName, metastoreRelation.tableName)
     val bucketSpec = None  // We don't support hive bucketed tables, only ones we write out.
 
-    val lazyPruningEnabled = sparkSession.sqlContext.conf.datasourcePartitionPruning
+    val lazyPruningEnabled = sparkSession.sqlContext.conf.filesourcePartitionPruning
     val result = if (metastoreRelation.hiveQlTable.isPartitioned) {
       val partitionSchema = StructType.fromAttributes(metastoreRelation.partitionKeys)
 
-      val rootPaths = if (lazyPruningEnabled) {
-        Seq(metastoreRelation.hiveQlTable.getDataLocation.toString)
+      val rootPaths: Seq[Path] = if (lazyPruningEnabled) {
+        Seq(metastoreRelation.hiveQlTable.getDataLocation)
       } else {
         // By convention (for example, see TableFileCatalog), the definition of a
         // partitioned table's paths depends on whether that table has any actual partitions.
@@ -208,10 +207,10 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
         // Partitioned tables with partitions use the locations of those partitions' data
         // locations,_omitting_ the table's base path.
         val paths = metastoreRelation.getHiveQlPartitions().map { p =>
-          p.getLocation.toString
+          new Path(p.getLocation)
         }
         if (paths.isEmpty) {
-          Seq(metastoreRelation.hiveQlTable.getDataLocation.toString)
+          Seq(metastoreRelation.hiveQlTable.getDataLocation)
         } else {
           paths
         }
@@ -262,7 +261,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
       val rootPath = metastoreRelation.hiveQlTable.getDataLocation
 
       val cached = getCached(tableIdentifier,
-        Seq(rootPath.toString),
+        Seq(rootPath),
         metastoreRelation,
         metastoreSchema,
         fileFormatClass,
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala
index 8b56bbee56e1c..c5db16e3d7194 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala
@@ -61,7 +61,7 @@ class HiveMetadataCacheSuite extends QueryTest with SQLTestUtils with TestHiveSi
   }
 
   test("partitioned table is cached when partition pruning is off") {
-    withSQLConf("spark.sql.hive.datasourcePartitionPruning" -> "false") {
+    withSQLConf("spark.sql.hive.filesourcePartitionPruning" -> "false") {
       withTable("test") {
         withTempDir { dir =>
           spark.range(5).selectExpr("id", "id as f1", "id as f2").write

From bdff4887890f324d918939c35d7e7a5c22b483a0 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Wed, 12 Oct 2016 18:09:02 -0700
Subject: [PATCH 33/99] Build pruned TableFileCatalog from metastore
 partition specs

---
 .../sql/catalyst/catalog/interface.scala      | 15 +++-
 .../datasources/TableFileCatalog.scala        | 90 +++++++++++--------
 .../spark/sql/hive/HiveExternalCatalog.scala  |  8 +-
 .../spark/sql/hive/HiveDataFrameSuite.scala   | 12 ++-
 .../apache/spark/sql/hive/parquetSuites.scala |  3 +
 5 files changed, 80 insertions(+), 48 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
index 51326ca25e9cc..1a57a7707caa1 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
@@ -20,11 +20,11 @@ package org.apache.spark.sql.catalyst.catalog
 import java.util.Date
 
 import org.apache.spark.sql.AnalysisException
-import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier}
-import org.apache.spark.sql.catalyst.expressions.Attribute
+import org.apache.spark.sql.catalyst.{FunctionIdentifier, InternalRow, TableIdentifier}
+import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Literal}
 import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics}
 import org.apache.spark.sql.catalyst.util.quoteIdentifier
-import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.types.{StructField, StructType}
 
 
 /**
@@ -97,6 +97,15 @@ case class CatalogTablePartition(
 
     output.filter(_.nonEmpty).mkString("CatalogPartition(\n\t", "\n\t", ")")
   }
+
+  /**
+   * Given the partition schema, returns a row with that schema holding the partition values.
+   */
+  def toRow(partitionSchema: StructType): InternalRow = {
+    InternalRow.fromSeq(partitionSchema.map { case StructField(name, dataType, _, _) =>
+      Cast(Literal(spec(name)), dataType).eval()
+    })
+  }
 }
 
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
index b6431ca929f9c..ef531efd0f649 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
@@ -53,27 +53,8 @@ class TableFileCatalog(
 
   override def rootPaths: Seq[Path] = baseLocation.map(new Path(_)).toSeq
 
-  override def listFiles(filters: Seq[Expression]): Seq[Partition] = partitionSchema match {
-    case Some(partitionSchema) =>
-      val catalogTablePartitions = externalCatalog.listPartitionsByFilter(db, table, filters)
-      val partitionPaths = catalogTablePartitions.flatMap {
-        case CatalogTablePartition(spec, storage, _) =>
-          storage.locationUri.map(new Path(_))
-      }
-      val dataLeafFiles = listDataLeafFiles(partitionPaths).toSeq
-      catalogTablePartitions.flatMap {
-        case CatalogTablePartition(spec, storage, _) =>
-          storage.locationUri.map(new Path(_)).map { partitionPath =>
-            val files = dataLeafFiles.filter(_.getPath.getParent == partitionPath)
-            val values =
-              InternalRow.fromSeq(partitionSchema.map { case StructField(name, dataType, _, _) =>
-                Cast(Literal(spec(name)), dataType).eval()
-              })
-            Partition(values, files)
-          }
-      }
-    case None =>
-      Partition(InternalRow.empty, listDataLeafFiles(rootPaths).toSeq) :: Nil
+  override def listFiles(filters: Seq[Expression]): Seq[Partition] = {
+    filterPartitions(filters).listFiles(Nil)
   }
 
   override def refresh(): Unit = {}
@@ -85,28 +66,67 @@ class TableFileCatalog(
    * @param filters partition-pruning filters
    */
   def filterPartitions(filters: Seq[Expression]): ListingFileCatalog = {
-    val rootPaths = partitionSchema match {
-      case Some(_) =>
-        externalCatalog
-          .listPartitionsByFilter(db, table, filters)
-          .flatMap(_.storage.locationUri)
-          .map(new Path(_))
-      case None =>
-        this.rootPaths
+    if (filters.isEmpty) {
+      cachedAllPartitions
+    } else {
+      filterPartitions0(filters)
     }
-    val parameters =
-      baseLocation
-        .map(loc => Map(PartitioningAwareFileCatalog.BASE_PATH_PARAM -> loc))
-        .getOrElse(Map.empty)
+  }
 
-    new ListingFileCatalog(sparkSession, rootPaths, parameters, partitionSchema)
+  private def filterPartitions0(filters: Seq[Expression]): ListingFileCatalog = {
+    val parameters = baseLocation
+      .map(loc => Map(PartitioningAwareFileCatalog.BASE_PATH_PARAM -> loc))
+      .getOrElse(Map.empty)
+    partitionSchema match {
+      case Some(schema) =>
+        val selectedPartitions = externalCatalog.listPartitionsByFilter(db, table, filters)
+        val partitions = selectedPartitions.map { p =>
+          PartitionDirectory(p.toRow(schema), p.storage.locationUri.get)
+        }
+        val partitionSpec = PartitionSpec(schema, partitions)
+        new PrunedTableFileCatalog(
+          sparkSession, new Path(baseLocation.get), partitionSpec)
+      case None =>
+        new ListingFileCatalog(sparkSession, rootPaths, parameters, None)
+    }
   }
 
   // Not used in the hot path of queries when metastore partition pruning is enabled
-  lazy val cachedAllPartitions: ListingFileCatalog = filterPartitions(Nil)
+  lazy val cachedAllPartitions: ListingFileCatalog = filterPartitions0(Nil)
 
   override def inputFiles: Array[String] = cachedAllPartitions.inputFiles
 
   private def listDataLeafFiles(paths: Seq[Path]) =
     listLeafFiles(paths).filter(f => isDataPath(f.getPath))
 }
+
+/**
+ * An override of the standard HDFS listing based catalog, that overrides the partition spec with
+ * the information from the metastore.
+ *
+ * @param tableBasePath The default base path of the Hive metastore table
+ * @param partitionSpec The partition specifications from Hive metastore
+ */
+private class PrunedTableFileCatalog(
+    sparkSession: SparkSession,
+    tableBasePath: Path,
+    override val partitionSpec: PartitionSpec)
+  extends ListingFileCatalog(
+    sparkSession,
+    PrunedTableFileCatalog.getPaths(tableBasePath, partitionSpec),
+    Map.empty,
+    Some(partitionSpec.partitionColumns)) {
+}
+
+object PrunedTableFileCatalog {
+  /** Get the list of paths to list files in the for a metastore table */
+  def getPaths(tableBasePath: Path, partitionSpec: PartitionSpec): Seq[Path] = {
+    // If there are no partitions currently specified then use base path,
+    // otherwise use the paths corresponding to the partitions.
+    if (partitionSpec.partitions.isEmpty) {
+      Seq(tableBasePath)
+    } else {
+      partitionSpec.partitions.map(_.path)
+    }
+  }
+}
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
index a103f0363c615..ff59b54f53909 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
@@ -673,12 +673,8 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
             val index = partitionSchema.indexWhere(_.name == att.name)
             BoundReference(index, partitionSchema(index).dataType, nullable = true)
         })
-      clientPrunedPartitions.filter { case CatalogTablePartition(spec, _, _) =>
-        val row =
-          InternalRow.fromSeq(partitionSchema.map { case StructField(name, dataType, _, _) =>
-            Cast(Literal(spec(name)), dataType).eval()
-          })
-        boundPredicate(row)
+      clientPrunedPartitions.filter { case p: CatalogTablePartition =>
+        boundPredicate(p.toRow(partitionSchema))
       }
     } else {
       client.getPartitions(catalogTable)
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala
index c5d234bf20e31..8e49baf34064e 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala
@@ -39,14 +39,14 @@ class HiveDataFrameSuite extends QueryTest with TestHiveSingleton with SQLTestUt
   test("partitioned pruned table reports only selected files") {
     withTable("test") {
       withTempDir { dir =>
-        spark.range(5).selectExpr("id", "id as f1", "id as f2").write
-          .partitionBy("f1", "f2")
+        spark.range(5).selectExpr("id", "id as partCol1", "id as partCol2").write
+          .partitionBy("partCol1", "partCol2")
           .mode("overwrite")
           .parquet(dir.getAbsolutePath)
 
         spark.sql(s"""
           |create external table test (id long)
-          |partitioned by (f1 int, f2 int)
+          |partitioned by (partCol1 int, partCol2 int)
           |stored as parquet
           |location "${dir.getAbsolutePath}"""".stripMargin)
         spark.sql("msck repair table test")
@@ -55,9 +55,13 @@ class HiveDataFrameSuite extends QueryTest with TestHiveSingleton with SQLTestUt
         assert(df.count() == 5)
         assert(df.inputFiles.length == 5)  // unpruned
 
-        val df2 = spark.sql("select * from test where f2 = 3 or f2 = 4")
+        val df2 = spark.sql("select * from test where partCol1 = 3 or partCol2 = 4")
         assert(df2.count() == 2)
         assert(df2.inputFiles.length == 2)  // pruned, so we have less files
+
+        val df3 = spark.sql("select * from test where PARTCOL1 = 3 or partcol2 = 4")
+        assert(df3.count() == 2)
+        assert(df3.inputFiles.length == 2)
       }
     }
   }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
index 2f6d9fb96b825..8f21c3d38a32f 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
@@ -586,6 +586,9 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest {
         checkAnswer(
           sql("SELECT * FROM test_added_partitions"),
           Seq(("foo", 0), ("bar", 0), ("baz", 1)).toDF("a", "b"))
+
+        // Also verify the inputFiles implementation
+        assert(sql("select * from test_added_partitions").inputFiles.length == 2)
       }
     }
   }

From 5ad4b256fff6fcca8dd493afe0e0a10261e06828 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Wed, 12 Oct 2016 18:26:23 -0700
Subject: [PATCH 34/99] more test cases

---
 .../scala/org/apache/spark/sql/hive/parquetSuites.scala   | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
index 8f21c3d38a32f..6b66ea1d82607 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
@@ -587,6 +587,14 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest {
           sql("SELECT * FROM test_added_partitions"),
           Seq(("foo", 0), ("bar", 0), ("baz", 1)).toDF("a", "b"))
 
+        // Check it with pruning predicates
+        checkAnswer(
+          sql("SELECT * FROM test_added_partitions where b = 1"),
+          Seq(("baz", 1)).toDF("a", "b"))
+        checkAnswer(
+          sql("SELECT * FROM test_added_partitions where b = 0"),
+          Seq(("foo", 0), ("bar", 0)).toDF("a", "b"))
+
         // Also verify the inputFiles implementation
         assert(sql("select * from test_added_partitions").inputFiles.length == 2)
       }

From fa19224b8f75ac1502ed6857bfa95f41aad77022 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Wed, 12 Oct 2016 18:46:41 -0700
Subject: [PATCH 35/99] also fix a bug with zero partitions selected

---
 .../datasources/TableFileCatalog.scala         | 18 ++----------------
 .../spark/sql/hive/HiveDataFrameSuite.scala    |  4 ++++
 .../apache/spark/sql/hive/parquetSuites.scala  | 10 ++++++++--
 3 files changed, 14 insertions(+), 18 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
index ef531efd0f649..b66ad90354b7c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
@@ -113,20 +113,6 @@ private class PrunedTableFileCatalog(
     override val partitionSpec: PartitionSpec)
   extends ListingFileCatalog(
     sparkSession,
-    PrunedTableFileCatalog.getPaths(tableBasePath, partitionSpec),
+    partitionSpec.partitions.map(_.path),
     Map.empty,
-    Some(partitionSpec.partitionColumns)) {
-}
-
-object PrunedTableFileCatalog {
-  /** Get the list of paths to list files in the for a metastore table */
-  def getPaths(tableBasePath: Path, partitionSpec: PartitionSpec): Seq[Path] = {
-    // If there are no partitions currently specified then use base path,
-    // otherwise use the paths corresponding to the partitions.
-    if (partitionSpec.partitions.isEmpty) {
-      Seq(tableBasePath)
-    } else {
-      partitionSpec.partitions.map(_.path)
-    }
-  }
-}
+    Some(partitionSpec.partitionColumns))
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala
index 8e49baf34064e..6acbdbd25c4ee 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala
@@ -62,6 +62,10 @@ class HiveDataFrameSuite extends QueryTest with TestHiveSingleton with SQLTestUt
         val df3 = spark.sql("select * from test where PARTCOL1 = 3 or partcol2 = 4")
         assert(df3.count() == 2)
         assert(df3.inputFiles.length == 2)
+
+        val df4 = spark.sql("select * from test where partCol1 = 999")
+        assert(df4.count() == 0)
+        assert(df4.inputFiles.length == 0)
       }
     }
   }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
index 6b66ea1d82607..4b85aac851748 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
@@ -588,15 +588,21 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest {
           Seq(("foo", 0), ("bar", 0), ("baz", 1)).toDF("a", "b"))
 
         // Check it with pruning predicates
+        checkAnswer(
+          sql("SELECT * FROM test_added_partitions where b = 0"),
+          Seq(("foo", 0), ("bar", 0)).toDF("a", "b"))
         checkAnswer(
           sql("SELECT * FROM test_added_partitions where b = 1"),
           Seq(("baz", 1)).toDF("a", "b"))
         checkAnswer(
-          sql("SELECT * FROM test_added_partitions where b = 0"),
-          Seq(("foo", 0), ("bar", 0)).toDF("a", "b"))
+          sql("SELECT * FROM test_added_partitions where b = 2"),
+          Seq[(String, Int)]().toDF("a", "b"))
 
         // Also verify the inputFiles implementation
         assert(sql("select * from test_added_partitions").inputFiles.length == 2)
+        assert(sql("select * from test_added_partitions where b = 0").inputFiles.length == 1)
+        assert(sql("select * from test_added_partitions where b = 1").inputFiles.length == 1)
+        assert(sql("select * from test_added_partitions where b = 2").inputFiles.length == 0)
       }
     }
   }

From 00bf91295267b0d1ca742399a14e00a59a9d783d Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Thu, 13 Oct 2016 10:30:48 -0700
Subject: [PATCH 36/99] extend and fix flakiness in test

---
 .../sql/hive/HiveMetadataCacheSuite.scala     | 58 ++++++++++---------
 1 file changed, 32 insertions(+), 26 deletions(-)

diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala
index c5db16e3d7194..7af81a3a90504 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala
@@ -60,38 +60,44 @@ class HiveMetadataCacheSuite extends QueryTest with SQLTestUtils with TestHiveSi
     }
   }
 
-  test("partitioned table is cached when partition pruning is off") {
-    withSQLConf("spark.sql.hive.filesourcePartitionPruning" -> "false") {
-      withTable("test") {
-        withTempDir { dir =>
-          spark.range(5).selectExpr("id", "id as f1", "id as f2").write
-            .partitionBy("f1", "f2")
-            .mode("overwrite")
-            .parquet(dir.getAbsolutePath)
+  def testCaching(pruningEnabled: Boolean): Unit = {
+    test(s"partitioned table is cached when partition pruning is $pruningEnabled") {
+      withSQLConf("spark.sql.hive.filesourcePartitionPruning" -> pruningEnabled.toString) {
+        withTable("test") {
+          withTempDir { dir =>
+            spark.range(5).selectExpr("id", "id as f1", "id as f2").write
+              .partitionBy("f1", "f2")
+              .mode("overwrite")
+              .parquet(dir.getAbsolutePath)
 
-          spark.sql(s"""
-            |create external table test (id long)
-            |partitioned by (f1 int, f2 int)
-            |stored as parquet
-            |location "${dir.getAbsolutePath}"""".stripMargin)
-          spark.sql("msck repair table test")
+            spark.sql(s"""
+              |create external table test (id long)
+              |partitioned by (f1 int, f2 int)
+              |stored as parquet
+              |location "${dir.getAbsolutePath}"""".stripMargin)
+            spark.sql("msck repair table test")
 
-          val df = spark.sql("select * from test")
-          assert(sql("select * from test").count() == 5)
+            val df = spark.sql("select * from test")
+            assert(sql("select * from test").count() == 5)
 
-          // Delete a file, then assert that we tried to read it. This means the table was cached.
-          val p = new Path(spark.table("test").inputFiles.head)
-          assert(p.getFileSystem(hiveContext.sessionState.newHadoopConf()).delete(p, false))
-          val e = intercept[SparkException] {
-            sql("select * from test").count()
-          }
-          assert(e.getMessage.contains("FileNotFoundException"))
+            // Delete a file, then assert that we tried to read it. This means the table was cached.
+            val p = new Path(spark.table("test").inputFiles.head)
+            assert(p.getFileSystem(hiveContext.sessionState.newHadoopConf()).delete(p, true))
+            val e = intercept[SparkException] {
+              sql("select * from test").count()
+            }
+            assert(e.getMessage.contains("FileNotFoundException"))
 
-          // Test refreshing the cache.
-          spark.catalog.refreshTable("test")
-          assert(sql("select * from test").count() == 4)
+            // Test refreshing the cache.
+            spark.catalog.refreshTable("test")
+            assert(sql("select * from test").count() == 4)
+          }
         }
       }
     }
   }
+
+  for (pruningEnabled <- Seq(true, false)) {
+    testCaching(pruningEnabled)
+  }
 }

From b5f7691b7d5401a1d4a6b4f98a2edb8475b9731e Mon Sep 17 00:00:00 2001
From: Michael Allman <michael@videoamp.com>
Date: Thu, 13 Oct 2016 10:55:26 -0700
Subject: [PATCH 37/99] Enhance `ParquetMetastoreSuite` with mixed-case
 partition columns

---
 .../apache/spark/sql/hive/parquetSuites.scala | 46 +++++++++----------
 1 file changed, 23 insertions(+), 23 deletions(-)

diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
index 4b85aac851748..1068d3f6f27dc 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
@@ -75,7 +75,7 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest {
         intField INT,
         stringField STRING
       )
-      PARTITIONED BY (p int)
+      PARTITIONED BY (pQ int)
       ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
        STORED AS
        INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
@@ -89,7 +89,7 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest {
         intField INT,
         stringField STRING
       )
-      PARTITIONED BY (p int)
+      PARTITIONED BY (pQ int)
       ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
        STORED AS
        INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
@@ -118,7 +118,7 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest {
         structField STRUCT<intStructField: INT, stringStructField: STRING>,
         arrayField ARRAY<INT>
       )
-      PARTITIONED BY (p int)
+      PARTITIONED BY (pQ int)
       ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
        STORED AS
        INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
@@ -134,7 +134,7 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest {
         structField STRUCT<intStructField: INT, stringStructField: STRING>,
         arrayField ARRAY<INT>
       )
-      PARTITIONED BY (p int)
+      PARTITIONED BY (pQ int)
       ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
        STORED AS
        INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
@@ -156,19 +156,19 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest {
       """.stripMargin)
 
     (1 to 10).foreach { p =>
-      sql(s"ALTER TABLE partitioned_parquet ADD PARTITION (p=$p)")
+      sql(s"ALTER TABLE partitioned_parquet ADD PARTITION (pQ=$p)")
     }
 
     (1 to 10).foreach { p =>
-      sql(s"ALTER TABLE partitioned_parquet_with_key ADD PARTITION (p=$p)")
+      sql(s"ALTER TABLE partitioned_parquet_with_key ADD PARTITION (pQ=$p)")
     }
 
     (1 to 10).foreach { p =>
-      sql(s"ALTER TABLE partitioned_parquet_with_key_and_complextypes ADD PARTITION (p=$p)")
+      sql(s"ALTER TABLE partitioned_parquet_with_key_and_complextypes ADD PARTITION (pQ=$p)")
     }
 
     (1 to 10).foreach { p =>
-      sql(s"ALTER TABLE partitioned_parquet_with_complextypes ADD PARTITION (p=$p)")
+      sql(s"ALTER TABLE partitioned_parquet_with_complextypes ADD PARTITION (pQ=$p)")
     }
 
     (1 to 10).map(i => (i, s"str$i")).toDF("a", "b").createOrReplaceTempView("jt")
@@ -828,7 +828,7 @@ abstract class ParquetPartitioningTest extends QueryTest with SQLTestUtils with
     normalTableDir = Utils.createTempDir()
 
     (1 to 10).foreach { p =>
-      val partDir = new File(partitionedTableDir, s"p=$p")
+      val partDir = new File(partitionedTableDir, s"pQ=$p")
       sparkContext.makeRDD(1 to 10)
         .map(i => ParquetData(i, s"part-$p"))
         .toDF()
@@ -844,7 +844,7 @@ abstract class ParquetPartitioningTest extends QueryTest with SQLTestUtils with
     partitionedTableDirWithKey = Utils.createTempDir()
 
     (1 to 10).foreach { p =>
-      val partDir = new File(partitionedTableDirWithKey, s"p=$p")
+      val partDir = new File(partitionedTableDirWithKey, s"pQ=$p")
       sparkContext.makeRDD(1 to 10)
         .map(i => ParquetDataWithKey(p, i, s"part-$p"))
         .toDF()
@@ -854,7 +854,7 @@ abstract class ParquetPartitioningTest extends QueryTest with SQLTestUtils with
     partitionedTableDirWithKeyAndComplexTypes = Utils.createTempDir()
 
     (1 to 10).foreach { p =>
-      val partDir = new File(partitionedTableDirWithKeyAndComplexTypes, s"p=$p")
+      val partDir = new File(partitionedTableDirWithKeyAndComplexTypes, s"pQ=$p")
       sparkContext.makeRDD(1 to 10).map { i =>
         ParquetDataWithKeyAndComplexTypes(
           p, i, s"part-$p", StructContainer(i, f"${i}_string"), 1 to i)
@@ -864,7 +864,7 @@ abstract class ParquetPartitioningTest extends QueryTest with SQLTestUtils with
     partitionedTableDirWithComplexTypes = Utils.createTempDir()
 
     (1 to 10).foreach { p =>
-      val partDir = new File(partitionedTableDirWithComplexTypes, s"p=$p")
+      val partDir = new File(partitionedTableDirWithComplexTypes, s"pQ=$p")
       sparkContext.makeRDD(1 to 10).map { i =>
         ParquetDataWithComplexTypes(i, s"part-$p", StructContainer(i, f"${i}_string"), 1 to i)
       }.toDF().write.parquet(partDir.getCanonicalPath)
@@ -898,19 +898,19 @@ abstract class ParquetPartitioningTest extends QueryTest with SQLTestUtils with
 
     test(s"ordering of the partitioning columns $table") {
       checkAnswer(
-        sql(s"SELECT p, stringField FROM $table WHERE p = 1"),
+        sql(s"SELECT pQ, stringField FROM $table WHERE pQ = 1"),
         Seq.fill(10)(Row(1, "part-1"))
       )
 
       checkAnswer(
-        sql(s"SELECT stringField, p FROM $table WHERE p = 1"),
+        sql(s"SELECT stringField, pQ FROM $table WHERE pQ = 1"),
         Seq.fill(10)(Row("part-1", 1))
       )
     }
 
     test(s"project the partitioning column $table") {
       checkAnswer(
-        sql(s"SELECT p, count(*) FROM $table group by p"),
+        sql(s"SELECT pQ, count(*) FROM $table group by pQ"),
         Row(1, 10) ::
           Row(2, 10) ::
           Row(3, 10) ::
@@ -926,7 +926,7 @@ abstract class ParquetPartitioningTest extends QueryTest with SQLTestUtils with
 
     test(s"project partitioning and non-partitioning columns $table") {
       checkAnswer(
-        sql(s"SELECT stringField, p, count(intField) FROM $table GROUP BY p, stringField"),
+        sql(s"SELECT stringField, pQ, count(intField) FROM $table GROUP BY pQ, stringField"),
         Row("part-1", 1, 10) ::
           Row("part-2", 2, 10) ::
           Row("part-3", 3, 10) ::
@@ -948,19 +948,19 @@ abstract class ParquetPartitioningTest extends QueryTest with SQLTestUtils with
 
     test(s"pruned count $table") {
       checkAnswer(
-        sql(s"SELECT COUNT(*) FROM $table WHERE p = 1"),
+        sql(s"SELECT COUNT(*) FROM $table WHERE pQ = 1"),
         Row(10))
     }
 
     test(s"non-existent partition $table") {
       checkAnswer(
-        sql(s"SELECT COUNT(*) FROM $table WHERE p = 1000"),
+        sql(s"SELECT COUNT(*) FROM $table WHERE pQ = 1000"),
         Row(0))
     }
 
     test(s"multi-partition pruned count $table") {
       checkAnswer(
-        sql(s"SELECT COUNT(*) FROM $table WHERE p IN (1,2,3)"),
+        sql(s"SELECT COUNT(*) FROM $table WHERE pQ IN (1,2,3)"),
         Row(30))
     }
 
@@ -972,7 +972,7 @@ abstract class ParquetPartitioningTest extends QueryTest with SQLTestUtils with
 
     test(s"sum $table") {
       checkAnswer(
-        sql(s"SELECT SUM(intField) FROM $table WHERE intField IN (1,2,3) AND p = 1"),
+        sql(s"SELECT SUM(intField) FROM $table WHERE intField IN (1,2,3) AND pQ = 1"),
         Row(1 + 2 + 3))
     }
 
@@ -993,15 +993,15 @@ abstract class ParquetPartitioningTest extends QueryTest with SQLTestUtils with
       checkAnswer(
         sql(
           s"""
-             |SELECT p, structField.intStructField, structField.stringStructField
-             |FROM $table WHERE p = 1
+             |SELECT pQ, structField.intStructField, structField.stringStructField
+             |FROM $table WHERE pQ = 1
            """.stripMargin),
         (1 to 10).map(i => Row(1, i, f"${i}_string")))
     }
 
     test(s"SPARK-5775 read array from $table") {
       checkAnswer(
-        sql(s"SELECT arrayField, p FROM $table WHERE p = 1"),
+        sql(s"SELECT arrayField, pQ FROM $table WHERE pQ = 1"),
         (1 to 10).map(i => Row(1 to i, 1)))
     }
   }

From 77932a1579cbeaa6e08191ed96dc5cd3853b5f11 Mon Sep 17 00:00:00 2001
From: Michael Allman <michael@videoamp.com>
Date: Thu, 13 Oct 2016 11:29:25 -0700
Subject: [PATCH 38/99] Tidy up a little by removing some unused imports and an
 unused method, and by moving a protected method down and making it private

---
 .../datasources/PartitioningAwareFileCatalog.scala         | 7 +++++++
 .../sql/execution/datasources/SessionFileCatalog.scala     | 7 -------
 .../spark/sql/execution/datasources/TableFileCatalog.scala | 7 +------
 3 files changed, 8 insertions(+), 13 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
index 04d7d89250586..b2508115c282f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
@@ -207,6 +207,13 @@ abstract class PartitioningAwareFileCatalog(
           if (leafFiles.contains(qualifiedPath)) qualifiedPath.getParent else qualifiedPath }.toSet
     }
   }
+
+  // SPARK-15895: Metadata files (e.g. Parquet summary files) and temporary files should not be
+  // counted as data files, so that they shouldn't participate partition discovery.
+  private def isDataPath(path: Path): Boolean = {
+    val name = path.getName
+    !((name.startsWith("_") && !name.contains("=")) || name.startsWith("."))
+  }
 }
 
 object PartitioningAwareFileCatalog {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala
index 47b69eb721b29..7e7fc18535749 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala
@@ -58,13 +58,6 @@ abstract class SessionFileCatalog(sparkSession: SparkSession)
 
     mutable.LinkedHashSet(files: _*)
   }
-
-  // SPARK-15895: Metadata files (e.g. Parquet summary files) and temporary files should not be
-  // counted as data files, so that they shouldn't participate partition discovery.
-  protected def isDataPath(path: Path): Boolean = {
-    val name = path.getName
-    !((name.startsWith("_") && !name.contains("=")) || name.startsWith("."))
-  }
 }
 
 object SessionFileCatalog extends Logging {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
index b66ad90354b7c..a5c41b244589b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
@@ -20,10 +20,8 @@ package org.apache.spark.sql.execution.datasources
 import org.apache.hadoop.fs.Path
 
 import org.apache.spark.sql.SparkSession
-import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.catalog.CatalogTablePartition
 import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.types.{StructField, StructType}
+import org.apache.spark.sql.types.StructType
 
 
 /**
@@ -95,9 +93,6 @@ class TableFileCatalog(
   lazy val cachedAllPartitions: ListingFileCatalog = filterPartitions0(Nil)
 
   override def inputFiles: Array[String] = cachedAllPartitions.inputFiles
-
-  private def listDataLeafFiles(paths: Seq[Path]) =
-    listLeafFiles(paths).filter(f => isDataPath(f.getPath))
 }
 
 /**

From 97cd27d8110dbdb2dcfd466bb990d298eaf25fa6 Mon Sep 17 00:00:00 2001
From: Michael Allman <michael@videoamp.com>
Date: Thu, 13 Oct 2016 11:44:15 -0700
Subject: [PATCH 39/99] Put partition count in `FileSourceScanExec.metadata`
 for partitioned tables

---
 .../sql/execution/DataSourceScanExec.scala    | 22 +++++++++++++------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala
index ee61f7f0413da..623d2be55dcec 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala
@@ -230,13 +230,21 @@ case class FileSourceScanExec(
     val location = relation.location
     val locationDesc =
       location.getClass.getSimpleName + seqToString(location.rootPaths)
-    Map(
-      "Format" -> relation.fileFormat.toString,
-      "ReadSchema" -> outputSchema.catalogString,
-      "Batched" -> supportsBatch.toString,
-      "PartitionFilters" -> seqToString(partitionFilters),
-      "PushedFilters" -> seqToString(dataFilters),
-      "Location" -> locationDesc)
+    val metadata =
+      Map(
+        "Format" -> relation.fileFormat.toString,
+        "ReadSchema" -> outputSchema.catalogString,
+        "Batched" -> supportsBatch.toString,
+        "PartitionFilters" -> seqToString(partitionFilters),
+        "PushedFilters" -> seqToString(dataFilters),
+        "Location" -> locationDesc)
+    val withOptPartitionCount =
+      relation.partitionSchemaOption.map { _ =>
+        metadata + ("PartitionCount" -> selectedPartitions.size.toString)
+      } getOrElse {
+        metadata
+      }
+    withOptPartitionCount
   }
 
   private lazy val inputRDD: RDD[InternalRow] = {

From 851d7f944c44172789edd6bfaa878a73c77e862e Mon Sep 17 00:00:00 2001
From: Michael Allman <michael@videoamp.com>
Date: Thu, 13 Oct 2016 13:48:33 -0700
Subject: [PATCH 40/99] Fix some errors in my revision of `ParquetSourceSuite`

---
 .../scala/org/apache/spark/sql/hive/parquetSuites.scala     | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
index 1068d3f6f27dc..c4344dd12780b 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
@@ -34,7 +34,7 @@ import org.apache.spark.util.Utils
 // The data where the partitioning key exists only in the directory structure.
 case class ParquetData(intField: Int, stringField: String)
 // The data that also includes the partitioning key
-case class ParquetDataWithKey(p: Int, intField: Int, stringField: String)
+case class ParquetDataWithKey(pQ: Int, intField: Int, stringField: String)
 
 case class StructContainer(intStructField: Int, stringStructField: String)
 
@@ -45,7 +45,7 @@ case class ParquetDataWithComplexTypes(
     arrayField: Seq[Int])
 
 case class ParquetDataWithKeyAndComplexTypes(
-    p: Int,
+    pQ: Int,
     intField: Int,
     stringField: String,
     structField: StructContainer,
@@ -650,7 +650,7 @@ class ParquetSourceSuite extends ParquetPartitioningTest {
       CREATE TEMPORARY VIEW normal_parquet
       USING org.apache.spark.sql.parquet
       OPTIONS (
-        path '${new File(partitionedTableDir, "p=1").getCanonicalPath}'
+        path '${new File(partitionedTableDir, "pQ=1").getCanonicalPath}'
       )
     """)
 

From 26e0d34d43edf8922185917b01388ff699f0a835 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekhliang@gmail.com>
Date: Thu, 13 Oct 2016 18:24:31 -0700
Subject: [PATCH 41/99] Add metrics and cost tests for partition pruning
 effectiveness (#5)

* [SPARK-16980][SQL] Load only catalog table partition metadata required
to answer a query

* Add a new catalyst optimizer rule to SQL core for pruning unneeded
partitions' files from a table file catalog

* Include the type of file catalog in the FileSourceScanExec metadata

* TODO: Consider renaming FileCatalog to better differentiate it from
BasicFileCatalog (or vice-versa)

* Try out Parquet case-insensitive fallback

* Refactor the FileSourceScanExec.metadata val to make it prettier

* fix and add test for input files

* rename test

* Refactor `TableFileCatalog.listFiles` to call `listDataLeafFiles` once
instead of once per partition

* fix it

* more test cases

* also fix a bug with zero partitions selected

* feature flag

* add comments

* extend and fix flakiness in test

* Enhance `ParquetMetastoreSuite` with mixed-case partition columns

* Tidy up a little by removing some unused imports and an unused method,
and by moving a protected method down and making it private

* Put partition count in `FileSourceScanExec.metadata` for partitioned
tables

* Fix some errors in my revision of `ParquetSourceSuite`

* Thu Oct 13 17:18:14 PDT 2016

* more generic

* Thu Oct 13 18:09:42 PDT 2016

* Thu Oct 13 18:09:55 PDT 2016

* Thu Oct 13 18:22:31 PDT 2016
---
 .../spark/metrics/source/StaticSources.scala  | 29 ++++++
 .../datasources/SessionFileCatalog.scala      |  2 +
 .../sql/hive/client/HiveClientImpl.scala      |  9 +-
 .../spark/sql/hive/HiveDataFrameSuite.scala   | 96 ++++++++++++++++---
 .../apache/spark/sql/hive/parquetSuites.scala |  3 +-
 5 files changed, 123 insertions(+), 16 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala b/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala
index 6bba259acc391..4e88ae65bd89f 100644
--- a/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala
+++ b/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala
@@ -60,3 +60,32 @@ object CodegenMetrics extends Source {
   val METRIC_GENERATED_METHOD_BYTECODE_SIZE =
     metricRegistry.histogram(MetricRegistry.name("generatedMethodSize"))
 }
+
+/**
+ * :: Experimental ::
+ * Metrics for access to the hive external catalog.
+ */
+@Experimental
+object HiveCatalogMetrics extends Source {
+  override val sourceName: String = "HiveExternalCatalog"
+  override val metricRegistry: MetricRegistry = new MetricRegistry()
+
+  /**
+   * Tracks the total number of partition metadata entries fetched via the client api.
+   */
+  val METRIC_PARTITIONS_FETCHED = metricRegistry.counter(MetricRegistry.name("partitionsFetched"))
+
+  /**
+   * Tracks the total number of files discovered off of the filesystem by ListingFileCatalog.
+   */
+  val METRIC_FILES_DISCOVERED = metricRegistry.counter(MetricRegistry.name("filesDiscovered"))
+
+  def reset(): Unit = {
+    METRIC_PARTITIONS_FETCHED.dec(METRIC_PARTITIONS_FETCHED.getCount())
+    METRIC_FILES_DISCOVERED.dec(METRIC_FILES_DISCOVERED.getCount())
+  }
+
+  // clients can use these to avoid classloader issues with the codahale classes
+  def incrementFetchedPartitions(n: Int): Unit = METRIC_PARTITIONS_FETCHED.inc(n)
+  def incrementFilesDiscovered(n: Int): Unit = METRIC_FILES_DISCOVERED.inc(n)
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala
index 7e7fc18535749..4807a92c2e6b8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala
@@ -26,6 +26,7 @@ import org.apache.hadoop.fs._
 import org.apache.hadoop.mapred.{FileInputFormat, JobConf}
 
 import org.apache.spark.internal.Logging
+import org.apache.spark.metrics.source.HiveCatalogMetrics
 import org.apache.spark.sql.SparkSession
 import org.apache.spark.util.SerializableConfiguration
 
@@ -56,6 +57,7 @@ abstract class SessionFileCatalog(sparkSession: SparkSession)
         SessionFileCatalog.listLeafFilesInSerial(paths, hadoopConf)
       }
 
+    HiveCatalogMetrics.incrementFilesDiscovered(files.size)
     mutable.LinkedHashSet(files: _*)
   }
 }
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
index dd33d750a4d45..e745a8c5b3589 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
@@ -37,6 +37,7 @@ import org.apache.hadoop.security.UserGroupInformation
 
 import org.apache.spark.{SparkConf, SparkException}
 import org.apache.spark.internal.Logging
+import org.apache.spark.metrics.source.HiveCatalogMetrics
 import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.analysis.{NoSuchDatabaseException, NoSuchPartitionException}
@@ -528,17 +529,21 @@ private[hive] class HiveClientImpl(
       table: CatalogTable,
       spec: Option[TablePartitionSpec]): Seq[CatalogTablePartition] = withHiveState {
     val hiveTable = toHiveTable(table)
-    spec match {
+    val parts = spec match {
       case None => shim.getAllPartitions(client, hiveTable).map(fromHivePartition)
       case Some(s) => client.getPartitions(hiveTable, s.asJava).asScala.map(fromHivePartition)
     }
+    HiveCatalogMetrics.incrementFetchedPartitions(parts.length)
+    parts
   }
 
   override def getPartitionsByFilter(
       table: CatalogTable,
       predicates: Seq[Expression]): Seq[CatalogTablePartition] = withHiveState {
     val hiveTable = toHiveTable(table)
-    shim.getPartitionsByFilter(client, hiveTable, predicates).map(fromHivePartition)
+    val parts = shim.getPartitionsByFilter(client, hiveTable, predicates).map(fromHivePartition)
+    HiveCatalogMetrics.incrementFetchedPartitions(parts.length)
+    parts
   }
 
   override def listTables(dbName: String): Seq[String] = withHiveState {
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala
index 6acbdbd25c4ee..f65e74de87a57 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala
@@ -17,6 +17,9 @@
 
 package org.apache.spark.sql.hive
 
+import java.io.File
+
+import org.apache.spark.metrics.source.HiveCatalogMetrics
 import org.apache.spark.sql.hive.test.TestHiveSingleton
 import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.QueryTest
@@ -36,21 +39,25 @@ class HiveDataFrameSuite extends QueryTest with TestHiveSingleton with SQLTestUt
     assert(hiveClient.getConf("hive.in.test", "") == "true")
   }
 
+  private def setupPartitionedTable(tableName: String, dir: File): Unit = {
+    spark.range(5).selectExpr("id", "id as partCol1", "id as partCol2").write
+      .partitionBy("partCol1", "partCol2")
+      .mode("overwrite")
+      .parquet(dir.getAbsolutePath)
+
+    spark.sql(s"""
+      |create external table $tableName (id long)
+      |partitioned by (partCol1 int, partCol2 int)
+      |stored as parquet
+      |location "${dir.getAbsolutePath}"""".stripMargin)
+    spark.sql(s"msck repair table $tableName")
+  }
+
   test("partitioned pruned table reports only selected files") {
+    assert(spark.sqlContext.getConf(HiveUtils.CONVERT_METASTORE_PARQUET.key) == "true")
     withTable("test") {
       withTempDir { dir =>
-        spark.range(5).selectExpr("id", "id as partCol1", "id as partCol2").write
-          .partitionBy("partCol1", "partCol2")
-          .mode("overwrite")
-          .parquet(dir.getAbsolutePath)
-
-        spark.sql(s"""
-          |create external table test (id long)
-          |partitioned by (partCol1 int, partCol2 int)
-          |stored as parquet
-          |location "${dir.getAbsolutePath}"""".stripMargin)
-        spark.sql("msck repair table test")
-
+        setupPartitionedTable("test", dir)
         val df = spark.sql("select * from test")
         assert(df.count() == 5)
         assert(df.inputFiles.length == 5)  // unpruned
@@ -69,4 +76,69 @@ class HiveDataFrameSuite extends QueryTest with TestHiveSingleton with SQLTestUt
       }
     }
   }
+
+  test("lazy partition pruning reads only necessary partition data") {
+    withSQLConf("spark.sql.hive.filesourcePartitionPruning" -> "true") {
+      withTable("test") {
+        withTempDir { dir =>
+          setupPartitionedTable("test", dir)
+          HiveCatalogMetrics.reset()
+          spark.sql("select * from test where partCol1 = 999").count()
+          assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 0)
+          assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 0)
+
+          HiveCatalogMetrics.reset()
+          spark.sql("select * from test where partCol1 < 2").count()
+          assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 2)
+          assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 2)
+
+          HiveCatalogMetrics.reset()
+          spark.sql("select * from test where partCol1 < 3").count()
+          assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 3)
+          assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 3)
+
+          // should read all
+          HiveCatalogMetrics.reset()
+          spark.sql("select * from test").count()
+          assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 5)
+          assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 5)
+
+          // read all should be cached
+          HiveCatalogMetrics.reset()
+          spark.sql("select * from test").count()
+          assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 0)
+          assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 0)
+        }
+      }
+    }
+  }
+
+  test("all partitions read and cached when filesource partition pruning is off") {
+    withSQLConf("spark.sql.hive.filesourcePartitionPruning" -> "false") {
+      withTable("test") {
+        withTempDir { dir =>
+          setupPartitionedTable("test", dir)
+
+          // We actually query the partitions from hive each time the table is resolved in this
+          // mode. This is kind of terrible, but is needed to preserve the legacy behavior
+          // of doing plan cache validation based on the entire partition set.
+          HiveCatalogMetrics.reset()
+          spark.sql("select * from test where partCol1 = 999").count()
+          // 5 from table resolution, another 5 from ListingFileCatalog
+          assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 10)
+          assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 5)
+
+          HiveCatalogMetrics.reset()
+          spark.sql("select * from test where partCol1 < 2").count()
+          assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 5)
+          assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 0)
+
+          HiveCatalogMetrics.reset()
+          spark.sql("select * from test").count()
+          assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 5)
+          assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 0)
+        }
+      }
+    }
+  }
 }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
index c4344dd12780b..d39d64195e1cd 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
@@ -175,7 +175,7 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest {
     (1 to 10).map(i => Tuple1(Seq(new Integer(i), null))).toDF("a")
       .createOrReplaceTempView("jt_array")
 
-    setConf(HiveUtils.CONVERT_METASTORE_PARQUET, true)
+    assert(spark.sqlContext.getConf(HiveUtils.CONVERT_METASTORE_PARQUET.key) == "true")
   }
 
   override def afterAll(): Unit = {
@@ -187,7 +187,6 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest {
       "jt",
       "jt_array",
        "test_parquet")
-    setConf(HiveUtils.CONVERT_METASTORE_PARQUET, false)
   }
 
   test(s"conversion is working") {

From 83a168ce92a65978d907f509109ce5c70cfb0c0f Mon Sep 17 00:00:00 2001
From: Eric Liang <ekhliang@gmail.com>
Date: Thu, 13 Oct 2016 19:27:01 -0700
Subject: [PATCH 42/99] Actually register the Hive catalog metrics and revert
 broken tests (#6)

* Thu Oct 13 19:02:36 PDT 2016

* Thu Oct 13 19:03:06 PDT 2016
---
 .../spark/metrics/source/StaticSources.scala  |  5 +-
 .../apache/spark/sql/hive/parquetSuites.scala | 52 +++++++++----------
 2 files changed, 30 insertions(+), 27 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala b/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala
index 4e88ae65bd89f..cf92a10deabd5 100644
--- a/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala
+++ b/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala
@@ -26,7 +26,7 @@ private[spark] object StaticSources {
    * The set of all static sources. These sources may be reported to from any class, including
    * static classes, without requiring reference to a SparkEnv.
    */
-  val allSources = Seq(CodegenMetrics)
+  val allSources = Seq(CodegenMetrics, HiveCatalogMetrics)
 }
 
 /**
@@ -80,6 +80,9 @@ object HiveCatalogMetrics extends Source {
    */
   val METRIC_FILES_DISCOVERED = metricRegistry.counter(MetricRegistry.name("filesDiscovered"))
 
+  /**
+   * Resets the values of all metrics to zero. This is useful in tests.
+   */
   def reset(): Unit = {
     METRIC_PARTITIONS_FETCHED.dec(METRIC_PARTITIONS_FETCHED.getCount())
     METRIC_FILES_DISCOVERED.dec(METRIC_FILES_DISCOVERED.getCount())
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
index d39d64195e1cd..9fc62a389db4d 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
@@ -34,7 +34,7 @@ import org.apache.spark.util.Utils
 // The data where the partitioning key exists only in the directory structure.
 case class ParquetData(intField: Int, stringField: String)
 // The data that also includes the partitioning key
-case class ParquetDataWithKey(pQ: Int, intField: Int, stringField: String)
+case class ParquetDataWithKey(p: Int, intField: Int, stringField: String)
 
 case class StructContainer(intStructField: Int, stringStructField: String)
 
@@ -45,7 +45,7 @@ case class ParquetDataWithComplexTypes(
     arrayField: Seq[Int])
 
 case class ParquetDataWithKeyAndComplexTypes(
-    pQ: Int,
+    p: Int,
     intField: Int,
     stringField: String,
     structField: StructContainer,
@@ -75,7 +75,7 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest {
         intField INT,
         stringField STRING
       )
-      PARTITIONED BY (pQ int)
+      PARTITIONED BY (p int)
       ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
        STORED AS
        INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
@@ -89,7 +89,7 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest {
         intField INT,
         stringField STRING
       )
-      PARTITIONED BY (pQ int)
+      PARTITIONED BY (p int)
       ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
        STORED AS
        INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
@@ -118,7 +118,7 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest {
         structField STRUCT<intStructField: INT, stringStructField: STRING>,
         arrayField ARRAY<INT>
       )
-      PARTITIONED BY (pQ int)
+      PARTITIONED BY (p int)
       ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
        STORED AS
        INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
@@ -134,7 +134,7 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest {
         structField STRUCT<intStructField: INT, stringStructField: STRING>,
         arrayField ARRAY<INT>
       )
-      PARTITIONED BY (pQ int)
+      PARTITIONED BY (p int)
       ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
        STORED AS
        INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
@@ -156,19 +156,19 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest {
       """.stripMargin)
 
     (1 to 10).foreach { p =>
-      sql(s"ALTER TABLE partitioned_parquet ADD PARTITION (pQ=$p)")
+      sql(s"ALTER TABLE partitioned_parquet ADD PARTITION (p=$p)")
     }
 
     (1 to 10).foreach { p =>
-      sql(s"ALTER TABLE partitioned_parquet_with_key ADD PARTITION (pQ=$p)")
+      sql(s"ALTER TABLE partitioned_parquet_with_key ADD PARTITION (p=$p)")
     }
 
     (1 to 10).foreach { p =>
-      sql(s"ALTER TABLE partitioned_parquet_with_key_and_complextypes ADD PARTITION (pQ=$p)")
+      sql(s"ALTER TABLE partitioned_parquet_with_key_and_complextypes ADD PARTITION (p=$p)")
     }
 
     (1 to 10).foreach { p =>
-      sql(s"ALTER TABLE partitioned_parquet_with_complextypes ADD PARTITION (pQ=$p)")
+      sql(s"ALTER TABLE partitioned_parquet_with_complextypes ADD PARTITION (p=$p)")
     }
 
     (1 to 10).map(i => (i, s"str$i")).toDF("a", "b").createOrReplaceTempView("jt")
@@ -649,7 +649,7 @@ class ParquetSourceSuite extends ParquetPartitioningTest {
       CREATE TEMPORARY VIEW normal_parquet
       USING org.apache.spark.sql.parquet
       OPTIONS (
-        path '${new File(partitionedTableDir, "pQ=1").getCanonicalPath}'
+        path '${new File(partitionedTableDir, "p=1").getCanonicalPath}'
       )
     """)
 
@@ -827,7 +827,7 @@ abstract class ParquetPartitioningTest extends QueryTest with SQLTestUtils with
     normalTableDir = Utils.createTempDir()
 
     (1 to 10).foreach { p =>
-      val partDir = new File(partitionedTableDir, s"pQ=$p")
+      val partDir = new File(partitionedTableDir, s"p=$p")
       sparkContext.makeRDD(1 to 10)
         .map(i => ParquetData(i, s"part-$p"))
         .toDF()
@@ -843,7 +843,7 @@ abstract class ParquetPartitioningTest extends QueryTest with SQLTestUtils with
     partitionedTableDirWithKey = Utils.createTempDir()
 
     (1 to 10).foreach { p =>
-      val partDir = new File(partitionedTableDirWithKey, s"pQ=$p")
+      val partDir = new File(partitionedTableDirWithKey, s"p=$p")
       sparkContext.makeRDD(1 to 10)
         .map(i => ParquetDataWithKey(p, i, s"part-$p"))
         .toDF()
@@ -853,7 +853,7 @@ abstract class ParquetPartitioningTest extends QueryTest with SQLTestUtils with
     partitionedTableDirWithKeyAndComplexTypes = Utils.createTempDir()
 
     (1 to 10).foreach { p =>
-      val partDir = new File(partitionedTableDirWithKeyAndComplexTypes, s"pQ=$p")
+      val partDir = new File(partitionedTableDirWithKeyAndComplexTypes, s"p=$p")
       sparkContext.makeRDD(1 to 10).map { i =>
         ParquetDataWithKeyAndComplexTypes(
           p, i, s"part-$p", StructContainer(i, f"${i}_string"), 1 to i)
@@ -863,7 +863,7 @@ abstract class ParquetPartitioningTest extends QueryTest with SQLTestUtils with
     partitionedTableDirWithComplexTypes = Utils.createTempDir()
 
     (1 to 10).foreach { p =>
-      val partDir = new File(partitionedTableDirWithComplexTypes, s"pQ=$p")
+      val partDir = new File(partitionedTableDirWithComplexTypes, s"p=$p")
       sparkContext.makeRDD(1 to 10).map { i =>
         ParquetDataWithComplexTypes(i, s"part-$p", StructContainer(i, f"${i}_string"), 1 to i)
       }.toDF().write.parquet(partDir.getCanonicalPath)
@@ -897,19 +897,19 @@ abstract class ParquetPartitioningTest extends QueryTest with SQLTestUtils with
 
     test(s"ordering of the partitioning columns $table") {
       checkAnswer(
-        sql(s"SELECT pQ, stringField FROM $table WHERE pQ = 1"),
+        sql(s"SELECT p, stringField FROM $table WHERE p = 1"),
         Seq.fill(10)(Row(1, "part-1"))
       )
 
       checkAnswer(
-        sql(s"SELECT stringField, pQ FROM $table WHERE pQ = 1"),
+        sql(s"SELECT stringField, p FROM $table WHERE p = 1"),
         Seq.fill(10)(Row("part-1", 1))
       )
     }
 
     test(s"project the partitioning column $table") {
       checkAnswer(
-        sql(s"SELECT pQ, count(*) FROM $table group by pQ"),
+        sql(s"SELECT p, count(*) FROM $table group by p"),
         Row(1, 10) ::
           Row(2, 10) ::
           Row(3, 10) ::
@@ -925,7 +925,7 @@ abstract class ParquetPartitioningTest extends QueryTest with SQLTestUtils with
 
     test(s"project partitioning and non-partitioning columns $table") {
       checkAnswer(
-        sql(s"SELECT stringField, pQ, count(intField) FROM $table GROUP BY pQ, stringField"),
+        sql(s"SELECT stringField, p, count(intField) FROM $table GROUP BY p, stringField"),
         Row("part-1", 1, 10) ::
           Row("part-2", 2, 10) ::
           Row("part-3", 3, 10) ::
@@ -947,19 +947,19 @@ abstract class ParquetPartitioningTest extends QueryTest with SQLTestUtils with
 
     test(s"pruned count $table") {
       checkAnswer(
-        sql(s"SELECT COUNT(*) FROM $table WHERE pQ = 1"),
+        sql(s"SELECT COUNT(*) FROM $table WHERE p = 1"),
         Row(10))
     }
 
     test(s"non-existent partition $table") {
       checkAnswer(
-        sql(s"SELECT COUNT(*) FROM $table WHERE pQ = 1000"),
+        sql(s"SELECT COUNT(*) FROM $table WHERE p = 1000"),
         Row(0))
     }
 
     test(s"multi-partition pruned count $table") {
       checkAnswer(
-        sql(s"SELECT COUNT(*) FROM $table WHERE pQ IN (1,2,3)"),
+        sql(s"SELECT COUNT(*) FROM $table WHERE p IN (1,2,3)"),
         Row(30))
     }
 
@@ -971,7 +971,7 @@ abstract class ParquetPartitioningTest extends QueryTest with SQLTestUtils with
 
     test(s"sum $table") {
       checkAnswer(
-        sql(s"SELECT SUM(intField) FROM $table WHERE intField IN (1,2,3) AND pQ = 1"),
+        sql(s"SELECT SUM(intField) FROM $table WHERE intField IN (1,2,3) AND p = 1"),
         Row(1 + 2 + 3))
     }
 
@@ -992,15 +992,15 @@ abstract class ParquetPartitioningTest extends QueryTest with SQLTestUtils with
       checkAnswer(
         sql(
           s"""
-             |SELECT pQ, structField.intStructField, structField.stringStructField
-             |FROM $table WHERE pQ = 1
+             |SELECT p, structField.intStructField, structField.stringStructField
+             |FROM $table WHERE p = 1
            """.stripMargin),
         (1 to 10).map(i => Row(1, i, f"${i}_string")))
     }
 
     test(s"SPARK-5775 read array from $table") {
       checkAnswer(
-        sql(s"SELECT arrayField, pQ FROM $table WHERE pQ = 1"),
+        sql(s"SELECT arrayField, p FROM $table WHERE p = 1"),
         (1 to 10).map(i => Row(1 to i, 1)))
     }
   }

From 8871e3a15ddedb6c80f70e807038fe3ae0bcc2cd Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Fri, 14 Oct 2016 14:48:49 -0700
Subject: [PATCH 43/99] stray println

---
 .../test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala
index ddb2ae23d4ea1..ecb5972984523 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala
@@ -490,7 +490,6 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest {
             |stored as orc
             |location "${dir.getAbsolutePath}"""".stripMargin)
           spark.sql(s"msck repair table dummy_orc")
-          println(spark.sql("select * from dummy_orc"))
           checkAnswer(spark.sql("select * from dummy_orc"), df)
         }
       }

From 151f7ac840015b53dab4d58160369cb20760bc6d Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Fri, 14 Oct 2016 14:49:31 -0700
Subject: [PATCH 44/99] Fri Oct 14 14:49:31 PDT 2016

---
 .../datasources/ListingFileCatalog.scala      |   3 +-
 .../PartitioningAwareFileCatalog.scala        | 194 ++++++++++++++-
 .../datasources/SessionFileCatalog.scala      | 225 ------------------
 .../datasources/TableFileCatalog.scala        |   5 +-
 4 files changed, 193 insertions(+), 234 deletions(-)
 delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala
index 6d10501b7265d..5a01a3601de89 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala
@@ -21,6 +21,7 @@ import scala.collection.mutable
 
 import org.apache.hadoop.fs._
 
+import org.apache.spark.internal.Logging
 import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.types.StructType
 
@@ -39,7 +40,7 @@ class ListingFileCatalog(
     override val rootPaths: Seq[Path],
     parameters: Map[String, String],
     partitionSchema: Option[StructType])
-  extends PartitioningAwareFileCatalog(sparkSession, parameters, partitionSchema) {
+  extends PartitioningAwareFileCatalog(sparkSession, parameters, partitionSchema) with Logging {
 
   @volatile private var cachedLeafFiles: mutable.LinkedHashMap[Path, FileStatus] = _
   @volatile private var cachedLeafDirToChildrenFiles: Map[Path, Array[FileStatus]] = _
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
index b2508115c282f..1265e20acfb10 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
@@ -17,14 +17,21 @@
 
 package org.apache.spark.sql.execution.datasources
 
+import java.io.FileNotFoundException
+
 import scala.collection.mutable
 
-import org.apache.hadoop.fs.{FileStatus, Path}
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs._
+import org.apache.hadoop.mapred.{FileInputFormat, JobConf}
 
+import org.apache.spark.internal.Logging
+import org.apache.spark.metrics.source.HiveCatalogMetrics
 import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.catalyst.{expressions, InternalRow}
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.types.{StringType, StructType}
+import org.apache.spark.util.SerializableConfiguration
 
 
 /**
@@ -38,11 +45,10 @@ import org.apache.spark.sql.types.{StringType, StructType}
 abstract class PartitioningAwareFileCatalog(
     sparkSession: SparkSession,
     parameters: Map[String, String],
-    partitionSchema: Option[StructType])
-  extends SessionFileCatalog(sparkSession) with FileCatalog {
+    partitionSchema: Option[StructType]) extends FileCatalog with Logging {
   import PartitioningAwareFileCatalog.BASE_PATH_PARAM
 
-  override protected val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions(parameters)
+  protected val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions(parameters)
 
   protected def leafFiles: mutable.LinkedHashMap[Path, FileStatus]
 
@@ -214,8 +220,186 @@ abstract class PartitioningAwareFileCatalog(
     val name = path.getName
     !((name.startsWith("_") && !name.contains("=")) || name.startsWith("."))
   }
+
+  /**
+   * List leaf files of given paths. This method will submit a Spark job to do parallel
+   * listing whenever the number of given paths reaches the parallel partition discovery
+   * threshold.
+   *
+   * This is publicly visible for testing.
+   */
+  def listLeafFiles(paths: Seq[Path]): mutable.LinkedHashSet[FileStatus] = {
+    val files =
+      if (paths.length >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) {
+        PartitioningAwareFileCatalog.listLeafFilesInParallel(paths, hadoopConf, sparkSession)
+      } else {
+        PartitioningAwareFileCatalog.listLeafFilesInSerial(paths, hadoopConf)
+      }
+
+    HiveCatalogMetrics.incrementFilesDiscovered(files.size)
+    mutable.LinkedHashSet(files: _*)
+  }
 }
 
-object PartitioningAwareFileCatalog {
+object PartitioningAwareFileCatalog extends Logging {
   val BASE_PATH_PARAM = "basePath"
+
+  /** A serializable variant of HDFS's BlockLocation. */
+  private case class SerializableBlockLocation(
+      names: Array[String],
+      hosts: Array[String],
+      offset: Long,
+      length: Long)
+
+  /** A serializable variant of HDFS's FileStatus. */
+  private case class SerializableFileStatus(
+      path: String,
+      length: Long,
+      isDir: Boolean,
+      blockReplication: Short,
+      blockSize: Long,
+      modificationTime: Long,
+      accessTime: Long,
+      blockLocations: Array[SerializableBlockLocation])
+
+  /**
+   * List a collection of paths recursively.
+   */
+  private def listLeafFilesInSerial(
+      paths: Seq[Path],
+      hadoopConf: Configuration): Seq[FileStatus] = {
+    // Dummy jobconf to get to the pathFilter defined in configuration
+    val jobConf = new JobConf(hadoopConf, this.getClass)
+    val filter = FileInputFormat.getInputPathFilter(jobConf)
+
+    paths.flatMap { path =>
+      val fs = path.getFileSystem(hadoopConf)
+      listLeafFiles0(fs, path, filter)
+    }
+  }
+
+  /**
+   * List a collection of paths recursively in parallel (using Spark executors).
+   * Each task launched will use [[listLeafFilesInSerial]] to list.
+   */
+  private def listLeafFilesInParallel(
+      paths: Seq[Path],
+      hadoopConf: Configuration,
+      sparkSession: SparkSession): Seq[FileStatus] = {
+    assert(paths.size >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold)
+    logInfo(s"Listing leaf files and directories in parallel under: ${paths.mkString(", ")}")
+
+    val sparkContext = sparkSession.sparkContext
+    val serializableConfiguration = new SerializableConfiguration(hadoopConf)
+    val serializedPaths = paths.map(_.toString)
+
+    // Cap the parallelism so that the following file listing does not generate too many tasks
+    // when #defaultParallelism is large.
+    val numParallelism = Math.min(paths.size, 10000)
+
+    val statuses = sparkContext
+      .parallelize(serializedPaths, numParallelism)
+      .mapPartitions { paths =>
+        val hadoopConf = serializableConfiguration.value
+        listLeafFilesInSerial(paths.map(new Path(_)).toSeq, hadoopConf).iterator
+      }.map { status =>
+        // Turn FileStatus into SerializableFileStatus so we can send it back to the driver
+        val blockLocations = status match {
+          case f: LocatedFileStatus =>
+            f.getBlockLocations.map { loc =>
+              SerializableBlockLocation(
+                loc.getNames,
+                loc.getHosts,
+                loc.getOffset,
+                loc.getLength)
+            }
+
+          case _ =>
+            Array.empty[SerializableBlockLocation]
+        }
+
+        SerializableFileStatus(
+          status.getPath.toString,
+          status.getLen,
+          status.isDirectory,
+          status.getReplication,
+          status.getBlockSize,
+          status.getModificationTime,
+          status.getAccessTime,
+          blockLocations)
+      }.collect()
+
+    // Turn SerializableFileStatus back into LocatedFileStatus
+    statuses.map { f =>
+      val blockLocations = f.blockLocations.map { loc =>
+        new BlockLocation(loc.names, loc.hosts, loc.offset, loc.length)
+      }
+      new LocatedFileStatus(
+        new FileStatus(
+          f.length, f.isDir, f.blockReplication, f.blockSize, f.modificationTime, new Path(f.path)),
+        blockLocations)
+    }
+  }
+
+  /**
+   * List the leaf files under a single path in serial, recursing into sub-directories.
+   */
+  private def listLeafFiles0(
+      fs: FileSystem, path: Path, filter: PathFilter): Seq[FileStatus] = {
+    logTrace(s"Listing $path")
+    val name = path.getName.toLowerCase
+    if (shouldFilterOut(name)) {
+      Seq.empty[FileStatus]
+    } else {
+      // [SPARK-17599] Prevent ListingFileCatalog from failing if path doesn't exist
+      // Note that statuses only include FileStatus for the files and dirs directly under path,
+      // and does not include anything else recursively.
+      val statuses = try fs.listStatus(path) catch {
+        case _: FileNotFoundException =>
+          logWarning(s"The directory $path was not found. Was it deleted very recently?")
+          Array.empty[FileStatus]
+      }
+
+      val allLeafStatuses = {
+        val (dirs, files) = statuses.partition(_.isDirectory)
+        val stats = files ++ dirs.flatMap(dir => listLeafFiles0(fs, dir.getPath, filter))
+        if (filter != null) stats.filter(f => filter.accept(f.getPath)) else stats
+      }
+
+      allLeafStatuses.filterNot(status => shouldFilterOut(status.getPath.getName)).map {
+        case f: LocatedFileStatus =>
+          f
+
+        // NOTE:
+        //
+        // - Although S3/S3A/S3N file system can be quite slow for remote file metadata
+        //   operations, calling `getFileBlockLocations` does no harm here since these file system
+        //   implementations don't actually issue RPC for this method.
+        //
+        // - Here we are calling `getFileBlockLocations` in a sequential manner, but it should not
+        //   be a big deal since we always use `listLeafFilesInParallel` when the number of
+        //   paths reaches the threshold.
+        case f =>
+          // The other constructor of LocatedFileStatus will call FileStatus.getPermission(),
+          // which is very slow on some file systems (RawLocalFileSystem, which launches a
+          // subprocess and parses the stdout).
+          val locations = fs.getFileBlockLocations(f, 0, f.getLen)
+          val lfs = new LocatedFileStatus(f.getLen, f.isDirectory, f.getReplication, f.getBlockSize,
+            f.getModificationTime, 0, null, null, null, null, f.getPath, locations)
+          if (f.isSymlink) {
+            lfs.setSymlink(f.getSymlink)
+          }
+          lfs
+      }
+    }
+  }
+
+  /** Checks if we should filter out this path name. */
+  def shouldFilterOut(pathName: String): Boolean = {
+    // We filter everything that starts with _ and ., except _common_metadata and _metadata
+    // because Parquet needs to find those metadata files from leaf files returned by this method.
+    // We should refactor this logic to not mix metadata files with data files.
+    ((pathName.startsWith("_") && !pathName.contains("=")) || pathName.startsWith(".")) &&
+      !pathName.startsWith("_common_metadata") && !pathName.startsWith("_metadata")
+  }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala
deleted file mode 100644
index 4807a92c2e6b8..0000000000000
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala
+++ /dev/null
@@ -1,225 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.execution.datasources
-
-import java.io.FileNotFoundException
-
-import scala.collection.mutable
-
-import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.fs._
-import org.apache.hadoop.mapred.{FileInputFormat, JobConf}
-
-import org.apache.spark.internal.Logging
-import org.apache.spark.metrics.source.HiveCatalogMetrics
-import org.apache.spark.sql.SparkSession
-import org.apache.spark.util.SerializableConfiguration
-
-
-/**
- * A base class for [[BasicFileCatalog]]s that need a [[SparkSession]] and the ability to find leaf
- * files in a list of HDFS paths.
- *
- * @param sparkSession a [[SparkSession]]
- * @param ignoreFileNotFound (see [[ListingFileCatalog]])
- */
-abstract class SessionFileCatalog(sparkSession: SparkSession)
-    extends BasicFileCatalog with Logging {
-  protected val hadoopConf: Configuration
-
-  /**
-   * List leaf files of given paths. This method will submit a Spark job to do parallel
-   * listing whenever there is a path having more files than the parallel partition discovery
-   * discovery threshold.
-   *
-   * This is publicly visible for testing.
-   */
-  def listLeafFiles(paths: Seq[Path]): mutable.LinkedHashSet[FileStatus] = {
-    val files =
-      if (paths.length >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) {
-        SessionFileCatalog.listLeafFilesInParallel(paths, hadoopConf, sparkSession)
-      } else {
-        SessionFileCatalog.listLeafFilesInSerial(paths, hadoopConf)
-      }
-
-    HiveCatalogMetrics.incrementFilesDiscovered(files.size)
-    mutable.LinkedHashSet(files: _*)
-  }
-}
-
-object SessionFileCatalog extends Logging {
-
-  /** A serializable variant of HDFS's BlockLocation. */
-  private case class SerializableBlockLocation(
-      names: Array[String],
-      hosts: Array[String],
-      offset: Long,
-      length: Long)
-
-  /** A serializable variant of HDFS's FileStatus. */
-  private case class SerializableFileStatus(
-      path: String,
-      length: Long,
-      isDir: Boolean,
-      blockReplication: Short,
-      blockSize: Long,
-      modificationTime: Long,
-      accessTime: Long,
-      blockLocations: Array[SerializableBlockLocation])
-
-  /**
-   * List a collection of path recursively.
-   */
-  private def listLeafFilesInSerial(
-      paths: Seq[Path],
-      hadoopConf: Configuration): Seq[FileStatus] = {
-    // Dummy jobconf to get to the pathFilter defined in configuration
-    val jobConf = new JobConf(hadoopConf, this.getClass)
-    val filter = FileInputFormat.getInputPathFilter(jobConf)
-
-    paths.flatMap { path =>
-      val fs = path.getFileSystem(hadoopConf)
-      listLeafFiles0(fs, path, filter)
-    }
-  }
-
-  /**
-   * List a collection of path recursively in parallel (using Spark executors).
-   * Each task launched will use [[listLeafFilesInSerial]] to list.
-   */
-  private def listLeafFilesInParallel(
-      paths: Seq[Path],
-      hadoopConf: Configuration,
-      sparkSession: SparkSession): Seq[FileStatus] = {
-    assert(paths.size >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold)
-    logInfo(s"Listing leaf files and directories in parallel under: ${paths.mkString(", ")}")
-
-    val sparkContext = sparkSession.sparkContext
-    val serializableConfiguration = new SerializableConfiguration(hadoopConf)
-    val serializedPaths = paths.map(_.toString)
-
-    // Set the number of parallelism to prevent following file listing from generating many tasks
-    // in case of large #defaultParallelism.
-    val numParallelism = Math.min(paths.size, 10000)
-
-    val statuses = sparkContext
-      .parallelize(serializedPaths, numParallelism)
-      .mapPartitions { paths =>
-        val hadoopConf = serializableConfiguration.value
-        listLeafFilesInSerial(paths.map(new Path(_)).toSeq, hadoopConf).iterator
-      }.map { status =>
-        // Turn FileStatus into SerializableFileStatus so we can send it back to the driver
-        val blockLocations = status match {
-          case f: LocatedFileStatus =>
-            f.getBlockLocations.map { loc =>
-              SerializableBlockLocation(
-                loc.getNames,
-                loc.getHosts,
-                loc.getOffset,
-                loc.getLength)
-            }
-
-          case _ =>
-            Array.empty[SerializableBlockLocation]
-        }
-
-        SerializableFileStatus(
-          status.getPath.toString,
-          status.getLen,
-          status.isDirectory,
-          status.getReplication,
-          status.getBlockSize,
-          status.getModificationTime,
-          status.getAccessTime,
-          blockLocations)
-      }.collect()
-
-    // Turn SerializableFileStatus back to Status
-    statuses.map { f =>
-      val blockLocations = f.blockLocations.map { loc =>
-        new BlockLocation(loc.names, loc.hosts, loc.offset, loc.length)
-      }
-      new LocatedFileStatus(
-        new FileStatus(
-          f.length, f.isDir, f.blockReplication, f.blockSize, f.modificationTime, new Path(f.path)),
-        blockLocations)
-    }
-  }
-
-  /**
-   * List a single path, provided as a FileStatus, in serial.
-   */
-  private def listLeafFiles0(
-      fs: FileSystem, path: Path, filter: PathFilter): Seq[FileStatus] = {
-    logTrace(s"Listing $path")
-    val name = path.getName.toLowerCase
-    if (shouldFilterOut(name)) {
-      Seq.empty[FileStatus]
-    } else {
-      // [SPARK-17599] Prevent ListingFileCatalog from failing if path doesn't exist
-      // Note that statuses only include FileStatus for the files and dirs directly under path,
-      // and does not include anything else recursively.
-      val statuses = try fs.listStatus(path) catch {
-        case _: FileNotFoundException =>
-          logWarning(s"The directory $path was not found. Was it deleted very recently?")
-          Array.empty[FileStatus]
-      }
-
-      val allLeafStatuses = {
-        val (dirs, files) = statuses.partition(_.isDirectory)
-        val stats = files ++ dirs.flatMap(dir => listLeafFiles0(fs, dir.getPath, filter))
-        if (filter != null) stats.filter(f => filter.accept(f.getPath)) else stats
-      }
-
-      allLeafStatuses.filterNot(status => shouldFilterOut(status.getPath.getName)).map {
-        case f: LocatedFileStatus =>
-          f
-
-        // NOTE:
-        //
-        // - Although S3/S3A/S3N file system can be quite slow for remote file metadata
-        //   operations, calling `getFileBlockLocations` does no harm here since these file system
-        //   implementations don't actually issue RPC for this method.
-        //
-        // - Here we are calling `getFileBlockLocations` in a sequential manner, but it should not
-        //   be a big deal since we always use to `listLeafFilesInParallel` when the number of
-        //   paths exceeds threshold.
-        case f =>
-          // The other constructor of LocatedFileStatus will call FileStatus.getPermission(),
-          // which is very slow on some file system (RawLocalFileSystem, which is launch a
-          // subprocess and parse the stdout).
-          val locations = fs.getFileBlockLocations(f, 0, f.getLen)
-          val lfs = new LocatedFileStatus(f.getLen, f.isDirectory, f.getReplication, f.getBlockSize,
-            f.getModificationTime, 0, null, null, null, null, f.getPath, locations)
-          if (f.isSymlink) {
-            lfs.setSymlink(f.getSymlink)
-          }
-          lfs
-      }
-    }
-  }
-
-  /** Checks if we should filter out this path name. */
-  def shouldFilterOut(pathName: String): Boolean = {
-    // We filter everything that starts with _ and ., except _common_metadata and _metadata
-    // because Parquet needs to find those metadata files from leaf files returned by this method.
-    // We should refactor this logic to not mix metadata files with data files.
-    ((pathName.startsWith("_") && !pathName.contains("=")) || pathName.startsWith(".")) &&
-      !pathName.startsWith("_common_metadata") && !pathName.startsWith("_metadata")
-  }
-}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
index a5c41b244589b..518113215cd5a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
@@ -38,10 +38,9 @@ class TableFileCatalog(
     db: String,
     table: String,
     partitionSchema: Option[StructType],
-    override val sizeInBytes: Long)
-  extends SessionFileCatalog(sparkSession) {
+    override val sizeInBytes: Long) extends BasicFileCatalog {
 
-  override protected val hadoopConf = sparkSession.sessionState.newHadoopConf
+  protected val hadoopConf = sparkSession.sessionState.newHadoopConf
 
   private val externalCatalog = sparkSession.sharedState.externalCatalog
 

From 014c9989cb132041bce45215450a85661c3202e2 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekhliang@gmail.com>
Date: Fri, 14 Oct 2016 15:03:30 -0700
Subject: [PATCH 45/99] Also support mixed case field resolution for converted
 ORC tables (#7)

* Fri Oct 14 14:04:01 PDT 2016

* stray println
---
 .../spark/sql/hive/orc/OrcFileFormat.scala    | 12 +++++++++-
 .../spark/sql/hive/orc/OrcQuerySuite.scala    | 22 +++++++++++++++++++
 2 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala
index e94f49ea81177..1af3280e18a89 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala
@@ -313,7 +313,17 @@ private[orc] object OrcRelation extends HiveInspectors {
 
   def setRequiredColumns(
       conf: Configuration, physicalSchema: StructType, requestedSchema: StructType): Unit = {
-    val ids = requestedSchema.map(a => physicalSchema.fieldIndex(a.name): Integer)
+    // Lower-cased physical field name -> ordinal, used when no exact-case match exists.
+    val caseInsensitiveFieldMap: Map[String, Int] = physicalSchema.fieldNames.zipWithIndex
+      .map { case (name, index) => (name.toLowerCase, index) }
+      .toMap
+    val ids = requestedSchema.map { a =>
+      // Prefer an exact match; otherwise fall back to a case-insensitive lookup.
+      val res = physicalSchema.getFieldIndex(a.name).getOrElse(
+        caseInsensitiveFieldMap.getOrElse(a.name.toLowerCase,
+          throw new IllegalArgumentException(s"""Field "${a.name}" does not exist.""")))
+      res: Integer
+    }
     val (sortedIDs, sortedNames) = ids.zip(requestedSchema.fieldNames).sorted.unzip
     HiveShim.appendReadColumns(conf, sortedIDs, sortedNames)
   }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala
index b2ee49c441ef2..ecb5972984523 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala
@@ -474,6 +474,28 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest {
     }
   }
 
+  test("converted ORC table supports resolving mixed case field") {
+    withSQLConf(HiveUtils.CONVERT_METASTORE_ORC.key -> "true") {
+      withTable("dummy_orc") {
+        withTempPath { dir =>
+          val df = spark.range(5).selectExpr("id", "id as valueField", "id as partitionValue")
+          df.write
+            .partitionBy("partitionValue")
+            .mode("overwrite")
+            .orc(dir.getAbsolutePath)
+
+          spark.sql(s"""
+            |create external table dummy_orc (id long, valueField long)
+            |partitioned by (partitionValue int)
+            |stored as orc
+            |location "${dir.getAbsolutePath}"""".stripMargin)
+          spark.sql(s"msck repair table dummy_orc")
+          checkAnswer(spark.sql("select * from dummy_orc"), df)
+        }
+      }
+    }
+  }
+
   test("SPARK-14962 Produce correct results on array type with isnotnull") {
     withSQLConf(SQLConf.ORC_FILTER_PUSHDOWN_ENABLED.key -> "true") {
       val data = (0 until 10).map(i => Tuple1(Array(i)))

From f967ed81b10075f6361eca7d65f669ae6dd25c3d Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Fri, 14 Oct 2016 15:06:46 -0700
Subject: [PATCH 46/99] Fri Oct 14 15:06:46 PDT 2016

---
 .../execution/datasources/ListingFileCatalog.scala   |  6 ++++--
 .../sql/execution/datasources/TableFileCatalog.scala | 12 ++++++++++--
 .../apache/spark/sql/hive/HiveMetastoreCatalog.scala |  3 ++-
 3 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala
index 5a01a3601de89..0629af89ee88f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala
@@ -39,8 +39,10 @@ class ListingFileCatalog(
     sparkSession: SparkSession,
     override val rootPaths: Seq[Path],
     parameters: Map[String, String],
-    partitionSchema: Option[StructType])
-  extends PartitioningAwareFileCatalog(sparkSession, parameters, partitionSchema) with Logging {
+    partitionSchema: Option[StructType],
+    fileStatusCache: Option[FileStatusCache] = None)
+  extends PartitioningAwareFileCatalog(sparkSession, parameters, partitionSchema, fileStatusCache)
+  with Logging {
 
   @volatile private var cachedLeafFiles: mutable.LinkedHashMap[Path, FileStatus] = _
   @volatile private var cachedLeafDirToChildrenFiles: Map[Path, Array[FileStatus]] = _
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
index 518113215cd5a..b588cbbd593e0 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
@@ -32,13 +32,21 @@ import org.apache.spark.sql.types.StructType
  * @param table the table's (unqualified) name
  * @param partitionSchema the schema of a partitioned table's partition columns
  * @param sizeInBytes the table's data size in bytes
+ * @param enableFileStatusCache whether to enable file status caching
  */
 class TableFileCatalog(
     sparkSession: SparkSession,
     db: String,
     table: String,
     partitionSchema: Option[StructType],
-    override val sizeInBytes: Long) extends BasicFileCatalog {
+    override val sizeInBytes: Long,
+    enableFileStatusCache: Boolean) extends BasicFileCatalog {
+
+  private val fileStatusCache = if (enableFileStatusCache)  {
+    Some(new FileStatusCache)
+  } else {
+    None
+  }
 
   protected val hadoopConf = sparkSession.sessionState.newHadoopConf
 
@@ -84,7 +92,7 @@ class TableFileCatalog(
         new PrunedTableFileCatalog(
           sparkSession, new Path(baseLocation.get), partitionSpec)
       case None =>
-        new ListingFileCatalog(sparkSession, rootPaths, parameters, None)
+        new ListingFileCatalog(sparkSession, rootPaths, parameters, None, fileStatusCache)
     }
   }
 
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index dd15d27508c5f..ea68832c1ae9c 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -232,7 +232,8 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
         val sizeInBytes = metastoreRelation.statistics.sizeInBytes.toLong
         val fileCatalog = {
           val catalog = new TableFileCatalog(
-            sparkSession, db, table, Some(partitionSchema), sizeInBytes)
+            sparkSession, db, table, Some(partitionSchema), sizeInBytes, sharedCache,
+            enableFileStatusCache = lazyPruningEnabled)
           if (lazyPruningEnabled) {
             catalog
           } else {

From dd55499a60caceef179d1776c9c8e5b5f6f2a1bc Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Fri, 14 Oct 2016 17:40:41 -0700
Subject: [PATCH 47/99] wip

---
 .../spark/metrics/source/StaticSources.scala  |   7 +
 .../datasources/ListingFileCatalog.scala      |  15 +-
 .../PartitioningAwareFileCatalog.scala        | 159 +++++++++----
 .../datasources/SessionFileCatalog.scala      | 225 ------------------
 .../datasources/TableFileCatalog.scala        |  27 +--
 .../spark/sql/hive/HiveExternalCatalog.scala  |  34 ---
 .../spark/sql/hive/HiveMetastoreCatalog.scala |   2 +-
 7 files changed, 140 insertions(+), 329 deletions(-)
 delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala

diff --git a/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala b/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala
index cf92a10deabd5..b54885b7ff8b0 100644
--- a/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala
+++ b/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala
@@ -80,15 +80,22 @@ object HiveCatalogMetrics extends Source {
    */
   val METRIC_FILES_DISCOVERED = metricRegistry.counter(MetricRegistry.name("filesDiscovered"))
 
+  /**
+   * Tracks the total number of files served from the file status cache instead of discovered.
+   */
+  val METRIC_FILE_CACHE_HITS = metricRegistry.counter(MetricRegistry.name("fileCacheHits"))
+
   /**
    * Resets the values of all metrics to zero. This is useful in tests.
    */
   def reset(): Unit = {
     METRIC_PARTITIONS_FETCHED.dec(METRIC_PARTITIONS_FETCHED.getCount())
     METRIC_FILES_DISCOVERED.dec(METRIC_FILES_DISCOVERED.getCount())
+    METRIC_FILE_CACHE_HITS.dec(METRIC_FILE_CACHE_HITS.getCount())
   }
 
   // clients can use these to avoid classloader issues with the codahale classes
   def incrementFetchedPartitions(n: Int): Unit = METRIC_PARTITIONS_FETCHED.inc(n)
   def incrementFilesDiscovered(n: Int): Unit = METRIC_FILES_DISCOVERED.inc(n)
+  def incrementFileCacheHits(n: Int): Unit = METRIC_FILE_CACHE_HITS.inc(n)
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala
index 560e33b85c85e..b27ac6d7868ab 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala
@@ -39,15 +39,15 @@ class ListingFileCatalog(
     override val rootPaths: Seq[Path],
     parameters: Map[String, String],
     partitionSchema: Option[StructType],
-    fileStatusCache: Option[FileStatusCache] = None)
-  extends PartitioningAwareFileCatalog(sparkSession, parameters, partitionSchema, fileStatusCache)
-  with Logging {
+    fileStatusCache: FileStatusCache = new NoopCache)
+  extends PartitioningAwareFileCatalog(
+    sparkSession, parameters, partitionSchema, fileStatusCache) {
 
   @volatile private var cachedLeafFiles: mutable.LinkedHashMap[Path, FileStatus] = _
   @volatile private var cachedLeafDirToChildrenFiles: Map[Path, Array[FileStatus]] = _
   @volatile private var cachedPartitionSpec: PartitionSpec = _
 
-  refresh()
+  refresh0(false)
 
   override def partitionSpec(): PartitionSpec = {
     if (cachedPartitionSpec == null) {
@@ -66,11 +66,18 @@ class ListingFileCatalog(
   }
 
   override def refresh(): Unit = {
+    refresh0(true)
+  }
+
+  private def refresh0(invalidateSharedCache: Boolean): Unit = {
     val files = listLeafFiles(rootPaths)
     cachedLeafFiles =
       new mutable.LinkedHashMap[Path, FileStatus]() ++= files.map(f => f.getPath -> f)
     cachedLeafDirToChildrenFiles = files.toArray.groupBy(_.getPath.getParent)
     cachedPartitionSpec = null
+    if (invalidateSharedCache) {
+      fileStatusCache.invalidateAll()
+    }
   }
 
   override def equals(other: Any): Boolean = other match {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
index 9f9ea59755640..b4dd739fa6038 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.execution.datasources
 
 import java.io.FileNotFoundException
+import java.util.concurrent.ConcurrentHashMap
 
 import scala.collection.mutable
 
@@ -34,6 +35,46 @@ import org.apache.spark.sql.types.{StringType, StructType}
 import org.apache.spark.util.SerializableConfiguration
 
 
+abstract class FileStatusCache {
+  def getLeafFiles(path: Path): Option[Seq[FileStatus]] = None
+  def putLeafFiles(path: Path, leafFiles: Seq[FileStatus]): Unit
+  def invalidateAll(): Unit
+}
+
+class InMemoryCache extends FileStatusCache {
+  private val cache = new ConcurrentHashMap[Path, Seq[FileStatus]]()
+
+  override def getLeafFiles(path: Path): Option[Seq[FileStatus]] = {
+    val res = Option(cache.get(path))
+    res.foreach { r =>
+      HiveCatalogMetrics.incrementFileCacheHits(r.length)
+    }
+    res
+  }
+
+  override def putLeafFiles(path: Path, leafFiles: Seq[FileStatus]): Unit = {
+    println("discovered files: " + leafFiles)
+    HiveCatalogMetrics.incrementFilesDiscovered(leafFiles.size)
+    cache.put(path, leafFiles)
+  }
+
+  override def invalidateAll(): Unit = {
+    println("invalidating all")
+    cache.clear()
+  }
+}
+
+class NoopCache extends FileStatusCache {
+  override def getLeafFiles(path: Path): Option[Seq[FileStatus]] = None
+  override def putLeafFiles(path: Path, leafFiles: Seq[FileStatus]): Unit = {
+    println("[uncached] discovered files: " + leafFiles)
+    HiveCatalogMetrics.incrementFilesDiscovered(leafFiles.size)
+  }
+  override def invalidateAll(): Unit = {
+    println("invalidating all")
+  }
+}
+
 /**
  * An abstract class that represents [[FileCatalog]]s that are aware of partitioned tables.
  * It provides the necessary methods to parse partition data based on a set of files.
@@ -45,10 +86,11 @@ import org.apache.spark.util.SerializableConfiguration
 abstract class PartitioningAwareFileCatalog(
     sparkSession: SparkSession,
     parameters: Map[String, String],
-    partitionSchema: Option[StructType]) extends FileCatalog with Logging {
+    partitionSchema: Option[StructType],
+    fileStatusCache: FileStatusCache = new NoopCache) extends FileCatalog with Logging {
   import PartitioningAwareFileCatalog.BASE_PATH_PARAM
 
-  override protected val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions(parameters)
+  protected val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions(parameters)
 
   protected def leafFiles: mutable.LinkedHashMap[Path, FileStatus]
 
@@ -229,15 +271,29 @@ abstract class PartitioningAwareFileCatalog(
    * This is publicly visible for testing.
    */
   def listLeafFiles(paths: Seq[Path]): mutable.LinkedHashSet[FileStatus] = {
-    val files =
-      if (paths.length >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) {
-        PartitioningAwareFileCatalog.listLeafFilesInParallel(paths, hadoopConf, sparkSession)
-      } else {
-        PartitioningAwareFileCatalog.listLeafFilesInSerial(paths, hadoopConf)
+    val output = mutable.LinkedHashSet[FileStatus]()
+    val pathsToFetch = mutable.ArrayBuffer[Path]()
+    for (path <- paths) {
+      fileStatusCache.getLeafFiles(path) match {
+        case Some(files) =>
+          println("cache hit: " + path)
+          output ++= files
+        case None =>
+          println("cache miss: " + path)
+          pathsToFetch += path
       }
-
-    HiveCatalogMetrics.incrementFilesDiscovered(files.size)
-    mutable.LinkedHashSet(files: _*)
+    }
+    val discovered = if (pathsToFetch.length >=
+        sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) {
+      PartitioningAwareFileCatalog.listLeafFilesInParallel(pathsToFetch, hadoopConf, sparkSession)
+    } else {
+      PartitioningAwareFileCatalog.listLeafFilesInSerial(pathsToFetch, hadoopConf)
+    }
+    discovered.foreach { case (path, leafFiles) =>
+      fileStatusCache.putLeafFiles(path, leafFiles)
+      output ++= leafFiles
+    }
+    output
   }
 }
 
@@ -267,15 +323,15 @@ object PartitioningAwareFileCatalog extends Logging {
    */
   private def listLeafFilesInSerial(
       paths: Seq[Path],
-      hadoopConf: Configuration): Seq[FileStatus] = {
+      hadoopConf: Configuration): Map[Path, Seq[FileStatus]] = {
     // Dummy jobconf to get to the pathFilter defined in configuration
     val jobConf = new JobConf(hadoopConf, this.getClass)
     val filter = FileInputFormat.getInputPathFilter(jobConf)
 
-    paths.flatMap { path =>
+    paths.map { path =>
       val fs = path.getFileSystem(hadoopConf)
-      listLeafFiles0(fs, path, filter)
-    }
+      (path, listLeafFiles0(fs, path, filter))
+    }.toMap
   }
 
   /**
@@ -285,7 +341,7 @@ object PartitioningAwareFileCatalog extends Logging {
   private def listLeafFilesInParallel(
       paths: Seq[Path],
       hadoopConf: Configuration,
-      sparkSession: SparkSession): Seq[FileStatus] = {
+      sparkSession: SparkSession): Map[Path, Seq[FileStatus]] = {
     assert(paths.size >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold)
     logInfo(s"Listing leaf files and directories in parallel under: ${paths.mkString(", ")}")
 
@@ -297,48 +353,55 @@ object PartitioningAwareFileCatalog extends Logging {
     // in case of large #defaultParallelism.
     val numParallelism = Math.min(paths.size, 10000)
 
-    val statuses = sparkContext
+    val statusMap = sparkContext
       .parallelize(serializedPaths, numParallelism)
       .mapPartitions { paths =>
         val hadoopConf = serializableConfiguration.value
         listLeafFilesInSerial(paths.map(new Path(_)).toSeq, hadoopConf).iterator
-      }.map { status =>
-        // Turn FileStatus into SerializableFileStatus so we can send it back to the driver
-        val blockLocations = status match {
-          case f: LocatedFileStatus =>
-            f.getBlockLocations.map { loc =>
-              SerializableBlockLocation(
-                loc.getNames,
-                loc.getHosts,
-                loc.getOffset,
-                loc.getLength)
-            }
-
-          case _ =>
-            Array.empty[SerializableBlockLocation]
-        }
+      }.map { case (path, statuses) =>
+        val serializableStatuses = statuses.map { status =>
+          // Turn FileStatus into SerializableFileStatus so we can send it back to the driver
+          val blockLocations = status match {
+            case f: LocatedFileStatus =>
+              f.getBlockLocations.map { loc =>
+                SerializableBlockLocation(
+                  loc.getNames,
+                  loc.getHosts,
+                  loc.getOffset,
+                  loc.getLength)
+              }
+
+            case _ =>
+              Array.empty[SerializableBlockLocation]
+          }
 
-        SerializableFileStatus(
-          status.getPath.toString,
-          status.getLen,
-          status.isDirectory,
-          status.getReplication,
-          status.getBlockSize,
-          status.getModificationTime,
-          status.getAccessTime,
-          blockLocations)
+          SerializableFileStatus(
+            status.getPath.toString,
+            status.getLen,
+            status.isDirectory,
+            status.getReplication,
+            status.getBlockSize,
+            status.getModificationTime,
+            status.getAccessTime,
+            blockLocations)
+        }
+        (path.toString, serializableStatuses)
       }.collect()
 
     // Turn SerializableFileStatus back to Status
-    statuses.map { f =>
-      val blockLocations = f.blockLocations.map { loc =>
-        new BlockLocation(loc.names, loc.hosts, loc.offset, loc.length)
+    statusMap.map { case (path, serializableStatuses) =>
+      val statuses = serializableStatuses.map { f =>
+        val blockLocations = f.blockLocations.map { loc =>
+          new BlockLocation(loc.names, loc.hosts, loc.offset, loc.length)
+        }
+        new LocatedFileStatus(
+          new FileStatus(
+            f.length, f.isDir, f.blockReplication, f.blockSize, f.modificationTime,
+            new Path(f.path)),
+          blockLocations)
       }
-      new LocatedFileStatus(
-        new FileStatus(
-          f.length, f.isDir, f.blockReplication, f.blockSize, f.modificationTime, new Path(f.path)),
-        blockLocations)
-    }
+      (new Path(path), statuses)
+    }.toMap
   }
 
   /**
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala
deleted file mode 100644
index 4807a92c2e6b8..0000000000000
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala
+++ /dev/null
@@ -1,225 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.execution.datasources
-
-import java.io.FileNotFoundException
-
-import scala.collection.mutable
-
-import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.fs._
-import org.apache.hadoop.mapred.{FileInputFormat, JobConf}
-
-import org.apache.spark.internal.Logging
-import org.apache.spark.metrics.source.HiveCatalogMetrics
-import org.apache.spark.sql.SparkSession
-import org.apache.spark.util.SerializableConfiguration
-
-
-/**
- * A base class for [[BasicFileCatalog]]s that need a [[SparkSession]] and the ability to find leaf
- * files in a list of HDFS paths.
- *
- * @param sparkSession a [[SparkSession]]
- * @param ignoreFileNotFound (see [[ListingFileCatalog]])
- */
-abstract class SessionFileCatalog(sparkSession: SparkSession)
-    extends BasicFileCatalog with Logging {
-  protected val hadoopConf: Configuration
-
-  /**
-   * List leaf files of given paths. This method will submit a Spark job to do parallel
-   * listing whenever there is a path having more files than the parallel partition discovery
-   * discovery threshold.
-   *
-   * This is publicly visible for testing.
-   */
-  def listLeafFiles(paths: Seq[Path]): mutable.LinkedHashSet[FileStatus] = {
-    val files =
-      if (paths.length >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) {
-        SessionFileCatalog.listLeafFilesInParallel(paths, hadoopConf, sparkSession)
-      } else {
-        SessionFileCatalog.listLeafFilesInSerial(paths, hadoopConf)
-      }
-
-    HiveCatalogMetrics.incrementFilesDiscovered(files.size)
-    mutable.LinkedHashSet(files: _*)
-  }
-}
-
-object SessionFileCatalog extends Logging {
-
-  /** A serializable variant of HDFS's BlockLocation. */
-  private case class SerializableBlockLocation(
-      names: Array[String],
-      hosts: Array[String],
-      offset: Long,
-      length: Long)
-
-  /** A serializable variant of HDFS's FileStatus. */
-  private case class SerializableFileStatus(
-      path: String,
-      length: Long,
-      isDir: Boolean,
-      blockReplication: Short,
-      blockSize: Long,
-      modificationTime: Long,
-      accessTime: Long,
-      blockLocations: Array[SerializableBlockLocation])
-
-  /**
-   * List a collection of path recursively.
-   */
-  private def listLeafFilesInSerial(
-      paths: Seq[Path],
-      hadoopConf: Configuration): Seq[FileStatus] = {
-    // Dummy jobconf to get to the pathFilter defined in configuration
-    val jobConf = new JobConf(hadoopConf, this.getClass)
-    val filter = FileInputFormat.getInputPathFilter(jobConf)
-
-    paths.flatMap { path =>
-      val fs = path.getFileSystem(hadoopConf)
-      listLeafFiles0(fs, path, filter)
-    }
-  }
-
-  /**
-   * List a collection of path recursively in parallel (using Spark executors).
-   * Each task launched will use [[listLeafFilesInSerial]] to list.
-   */
-  private def listLeafFilesInParallel(
-      paths: Seq[Path],
-      hadoopConf: Configuration,
-      sparkSession: SparkSession): Seq[FileStatus] = {
-    assert(paths.size >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold)
-    logInfo(s"Listing leaf files and directories in parallel under: ${paths.mkString(", ")}")
-
-    val sparkContext = sparkSession.sparkContext
-    val serializableConfiguration = new SerializableConfiguration(hadoopConf)
-    val serializedPaths = paths.map(_.toString)
-
-    // Set the number of parallelism to prevent following file listing from generating many tasks
-    // in case of large #defaultParallelism.
-    val numParallelism = Math.min(paths.size, 10000)
-
-    val statuses = sparkContext
-      .parallelize(serializedPaths, numParallelism)
-      .mapPartitions { paths =>
-        val hadoopConf = serializableConfiguration.value
-        listLeafFilesInSerial(paths.map(new Path(_)).toSeq, hadoopConf).iterator
-      }.map { status =>
-        // Turn FileStatus into SerializableFileStatus so we can send it back to the driver
-        val blockLocations = status match {
-          case f: LocatedFileStatus =>
-            f.getBlockLocations.map { loc =>
-              SerializableBlockLocation(
-                loc.getNames,
-                loc.getHosts,
-                loc.getOffset,
-                loc.getLength)
-            }
-
-          case _ =>
-            Array.empty[SerializableBlockLocation]
-        }
-
-        SerializableFileStatus(
-          status.getPath.toString,
-          status.getLen,
-          status.isDirectory,
-          status.getReplication,
-          status.getBlockSize,
-          status.getModificationTime,
-          status.getAccessTime,
-          blockLocations)
-      }.collect()
-
-    // Turn SerializableFileStatus back to Status
-    statuses.map { f =>
-      val blockLocations = f.blockLocations.map { loc =>
-        new BlockLocation(loc.names, loc.hosts, loc.offset, loc.length)
-      }
-      new LocatedFileStatus(
-        new FileStatus(
-          f.length, f.isDir, f.blockReplication, f.blockSize, f.modificationTime, new Path(f.path)),
-        blockLocations)
-    }
-  }
-
-  /**
-   * List a single path, provided as a FileStatus, in serial.
-   */
-  private def listLeafFiles0(
-      fs: FileSystem, path: Path, filter: PathFilter): Seq[FileStatus] = {
-    logTrace(s"Listing $path")
-    val name = path.getName.toLowerCase
-    if (shouldFilterOut(name)) {
-      Seq.empty[FileStatus]
-    } else {
-      // [SPARK-17599] Prevent ListingFileCatalog from failing if path doesn't exist
-      // Note that statuses only include FileStatus for the files and dirs directly under path,
-      // and does not include anything else recursively.
-      val statuses = try fs.listStatus(path) catch {
-        case _: FileNotFoundException =>
-          logWarning(s"The directory $path was not found. Was it deleted very recently?")
-          Array.empty[FileStatus]
-      }
-
-      val allLeafStatuses = {
-        val (dirs, files) = statuses.partition(_.isDirectory)
-        val stats = files ++ dirs.flatMap(dir => listLeafFiles0(fs, dir.getPath, filter))
-        if (filter != null) stats.filter(f => filter.accept(f.getPath)) else stats
-      }
-
-      allLeafStatuses.filterNot(status => shouldFilterOut(status.getPath.getName)).map {
-        case f: LocatedFileStatus =>
-          f
-
-        // NOTE:
-        //
-        // - Although S3/S3A/S3N file system can be quite slow for remote file metadata
-        //   operations, calling `getFileBlockLocations` does no harm here since these file system
-        //   implementations don't actually issue RPC for this method.
-        //
-        // - Here we are calling `getFileBlockLocations` in a sequential manner, but it should not
-        //   be a big deal since we always use to `listLeafFilesInParallel` when the number of
-        //   paths exceeds threshold.
-        case f =>
-          // The other constructor of LocatedFileStatus will call FileStatus.getPermission(),
-          // which is very slow on some file system (RawLocalFileSystem, which is launch a
-          // subprocess and parse the stdout).
-          val locations = fs.getFileBlockLocations(f, 0, f.getLen)
-          val lfs = new LocatedFileStatus(f.getLen, f.isDirectory, f.getReplication, f.getBlockSize,
-            f.getModificationTime, 0, null, null, null, null, f.getPath, locations)
-          if (f.isSymlink) {
-            lfs.setSymlink(f.getSymlink)
-          }
-          lfs
-      }
-    }
-  }
-
-  /** Checks if we should filter out this path name. */
-  def shouldFilterOut(pathName: String): Boolean = {
-    // We filter everything that starts with _ and ., except _common_metadata and _metadata
-    // because Parquet needs to find those metadata files from leaf files returned by this method.
-    // We should refactor this logic to not mix metadata files with data files.
-    ((pathName.startsWith("_") && !pathName.contains("=")) || pathName.startsWith(".")) &&
-      !pathName.startsWith("_common_metadata") && !pathName.startsWith("_metadata")
-  }
-}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
index b588cbbd593e0..cbc5f4edbebd5 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
@@ -43,9 +43,11 @@ class TableFileCatalog(
     enableFileStatusCache: Boolean) extends BasicFileCatalog {
 
   private val fileStatusCache = if (enableFileStatusCache)  {
-    Some(new FileStatusCache)
+    println("in mem cache")
+    new InMemoryCache
   } else {
-    None
+    println("using noop cache")
+    new NoopCache
   }
 
   protected val hadoopConf = sparkSession.sessionState.newHadoopConf
@@ -62,7 +64,7 @@ class TableFileCatalog(
     filterPartitions(filters).listFiles(Nil)
   }
 
-  override def refresh(): Unit = {}
+  override def refresh(): Unit = fileStatusCache.invalidateAll()
 
   /**
    * Returns a [[ListingFileCatalog]] for this table restricted to the subset of partitions
@@ -71,14 +73,6 @@ class TableFileCatalog(
    * @param filters partition-pruning filters
    */
   def filterPartitions(filters: Seq[Expression]): ListingFileCatalog = {
-    if (filters.isEmpty) {
-      cachedAllPartitions
-    } else {
-      filterPartitions0(filters)
-    }
-  }
-
-  private def filterPartitions0(filters: Seq[Expression]): ListingFileCatalog = {
     val parameters = baseLocation
       .map(loc => Map(PartitioningAwareFileCatalog.BASE_PATH_PARAM -> loc))
       .getOrElse(Map.empty)
@@ -90,16 +84,13 @@ class TableFileCatalog(
         }
         val partitionSpec = PartitionSpec(schema, partitions)
         new PrunedTableFileCatalog(
-          sparkSession, new Path(baseLocation.get), partitionSpec)
+          sparkSession, new Path(baseLocation.get), fileStatusCache, partitionSpec)
       case None =>
         new ListingFileCatalog(sparkSession, rootPaths, parameters, None, fileStatusCache)
     }
   }
 
-  // Not used in the hot path of queries when metastore partition pruning is enabled
-  lazy val cachedAllPartitions: ListingFileCatalog = filterPartitions0(Nil)
-
-  override def inputFiles: Array[String] = cachedAllPartitions.inputFiles
+  override def inputFiles: Array[String] = filterPartitions(Nil).inputFiles
 }
 
 /**
@@ -112,9 +103,11 @@ class TableFileCatalog(
 private class PrunedTableFileCatalog(
     sparkSession: SparkSession,
     tableBasePath: Path,
+    fileStatusCache: FileStatusCache,
     override val partitionSpec: PartitionSpec)
   extends ListingFileCatalog(
     sparkSession,
     partitionSpec.partitions.map(_.path),
     Map.empty,
-    Some(partitionSpec.partitionColumns))
+    Some(partitionSpec.partitionColumns),
+    fileStatusCache)
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
index 76dc23eca1a6e..ff59b54f53909 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
@@ -627,40 +627,6 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
     client.getPartition(db, table, spec)
   }
 
-  override def listPartitionsByFilter(
-      db: String,
-      table: String,
-      predicates: Seq[Expression]): Seq[CatalogTablePartition] = withClient {
-    val catalogTable = client.getTable(db, table)
-    val partitionColumnNames = catalogTable.partitionColumnNames.toSet
-    val nonPartitionPruningPredicates = predicates.filterNot {
-      _.references.map(_.name).toSet.subsetOf(partitionColumnNames)
-    }
-
-    if (nonPartitionPruningPredicates.nonEmpty) {
-        sys.error("Expected only partition pruning predicates: " +
-          predicates.reduceLeft(And))
-    }
-
-    val partitionSchema = catalogTable.partitionSchema
-
-    if (predicates.nonEmpty) {
-      val clientPrunedPartitions =
-        client.getPartitionsByFilter(catalogTable, predicates)
-      val boundPredicate =
-        InterpretedPredicate.create(predicates.reduce(And).transform {
-          case att: AttributeReference =>
-            val index = partitionSchema.indexWhere(_.name == att.name)
-            BoundReference(index, partitionSchema(index).dataType, nullable = true)
-        })
-      clientPrunedPartitions.filter { case p: CatalogTablePartition =>
-        boundPredicate(p.toRow(partitionSchema))
-      }
-    } else {
-      client.getPartitions(catalogTable)
-    }
-  }
-
   /**
    * Returns the specified partition or None if it does not exist.
    */
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index 7f6129b8442eb..b57fcb92fd1cb 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -236,7 +236,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
           if (lazyPruningEnabled) {
             catalog
           } else {
-            catalog.cachedAllPartitions
+            catalog.filterPartitions(Nil)  // materialize all the partitions in memory
           }
         }
         val partitionSchemaColumnNames = partitionSchema.map(_.name.toLowerCase).toSet

From 1602ecf2e4abe90abd8a22cddb5cdbc8a72923a4 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Mon, 17 Oct 2016 15:42:31 -0700
Subject: [PATCH 48/99] Mon Oct 17 15:42:29 PDT 2016

---
 .../datasources/PartitioningAwareFileCatalog.scala        | 7 -------
 .../sql/execution/datasources/TableFileCatalog.scala      | 8 --------
 2 files changed, 15 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
index dea9e06f347b2..18ed86d14646b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
@@ -26,11 +26,8 @@ import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs._
 import org.apache.hadoop.mapred.{FileInputFormat, JobConf}
 
-<<<<<<< HEAD
 import org.apache.spark.internal.Logging
 import org.apache.spark.metrics.source.HiveCatalogMetrics
-=======
->>>>>>> c7ac027d5fd7a80d3122a9269b2bb9c28c6a57db
 import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.catalyst.{expressions, InternalRow}
 import org.apache.spark.sql.catalyst.expressions._
@@ -469,7 +466,3 @@ object PartitioningAwareFileCatalog extends Logging {
       !pathName.startsWith("_common_metadata") && !pathName.startsWith("_metadata")
   }
 }
-
-object PartitioningAwareFileCatalog {
-  val BASE_PATH_PARAM = "basePath"
-}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
index a7999fd100089..cbc5f4edbebd5 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
@@ -73,14 +73,6 @@ class TableFileCatalog(
    * @param filters partition-pruning filters
    */
   def filterPartitions(filters: Seq[Expression]): ListingFileCatalog = {
-    if (filters.isEmpty) {
-      cachedAllPartitions
-    } else {
-      filterPartitions0(filters)
-    }
-  }
-
-  private def filterPartitions0(filters: Seq[Expression]): ListingFileCatalog = {
     val parameters = baseLocation
       .map(loc => Map(PartitioningAwareFileCatalog.BASE_PATH_PARAM -> loc))
       .getOrElse(Map.empty)

From a6e78f7937935688991a305aaad9ac5cd7883f50 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Mon, 17 Oct 2016 15:59:01 -0700
Subject: [PATCH 49/99] Mon Oct 17 15:59:01 PDT 2016

---
 .../execution/datasources/PartitioningAwareFileCatalog.scala    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
index 18ed86d14646b..b4dd739fa6038 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
@@ -90,7 +90,7 @@ abstract class PartitioningAwareFileCatalog(
     fileStatusCache: FileStatusCache = new NoopCache) extends FileCatalog with Logging {
   import PartitioningAwareFileCatalog.BASE_PATH_PARAM
 
-  override protected val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions(parameters)
+  protected val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions(parameters)
 
   protected def leafFiles: mutable.LinkedHashMap[Path, FileStatus]
 

From 1413744ed250a2135c7beb610302ebb387e07441 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Tue, 18 Oct 2016 10:56:33 -0700
Subject: [PATCH 50/99] Revert "Revert "[SPARK-17974] Refactor FileCatalog
 classes to simplify the inheritance tree""

This reverts commit 1c5a7d7f64993540baa5558be80130ee6911ba3c.
---
 .../scala/org/apache/spark/sql/Dataset.scala  |   2 +-
 .../sql/execution/DataSourceScanExec.scala    |   4 +-
 .../execution/datasources/FileCatalog.scala   |  66 +++++
 .../execution/datasources/FileFormat.scala    |  61 -----
 .../datasources/HadoopFsRelation.scala        |   4 +-
 .../PartitioningAwareFileCatalog.scala        | 217 ++++++++++++++++-
 .../datasources/PartitioningUtils.scala       |  12 +-
 .../datasources/SessionFileCatalog.scala      | 225 ------------------
 .../datasources/TableFileCatalog.scala        |  11 +-
 .../datasources/FileCatalogSuite.scala        |  10 +
 .../datasources/SessionFileCatalogSuite.scala |  34 ---
 .../ParquetPartitionDiscoverySuite.scala      |   9 +-
 .../spark/sql/hive/HiveMetastoreCatalog.scala |   2 +-
 13 files changed, 303 insertions(+), 354 deletions(-)
 create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileCatalog.scala
 delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala
 delete mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalogSuite.scala

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index 7dccbbd3f0a5b..073d2b1512b95 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -43,7 +43,7 @@ import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.util.usePrettyExpression
 import org.apache.spark.sql.execution.{FileRelation, LogicalRDD, QueryExecution, SQLExecution}
 import org.apache.spark.sql.execution.command.{CreateViewCommand, ExplainCommand, GlobalTempView, LocalTempView}
-import org.apache.spark.sql.execution.datasources.{FileCatalog, HadoopFsRelation, LogicalRelation}
+import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation}
 import org.apache.spark.sql.execution.datasources.json.JacksonGenerator
 import org.apache.spark.sql.execution.python.EvaluatePython
 import org.apache.spark.sql.streaming.{DataStreamWriter, StreamingQuery}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala
index 623d2be55dcec..fdd1fa3648251 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala
@@ -431,7 +431,7 @@ case class FileSourceScanExec(
   private def createBucketedReadRDD(
       bucketSpec: BucketSpec,
       readFile: (PartitionedFile) => Iterator[InternalRow],
-      selectedPartitions: Seq[Partition],
+      selectedPartitions: Seq[PartitionDirectory],
       fsRelation: HadoopFsRelation): RDD[InternalRow] = {
     logInfo(s"Planning with ${bucketSpec.numBuckets} buckets")
     val bucketed =
@@ -463,7 +463,7 @@ case class FileSourceScanExec(
    */
   private def createNonBucketedReadRDD(
       readFile: (PartitionedFile) => Iterator[InternalRow],
-      selectedPartitions: Seq[Partition],
+      selectedPartitions: Seq[PartitionDirectory],
       fsRelation: HadoopFsRelation): RDD[InternalRow] = {
     val defaultMaxSplitBytes =
       fsRelation.sparkSession.sessionState.conf.filesMaxPartitionBytes
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileCatalog.scala
new file mode 100644
index 0000000000000..2bc66ceeebdb4
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileCatalog.scala
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources
+
+import org.apache.hadoop.fs._
+
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions._
+
+/**
+ * A collection of data files from a partitioned relation, along with the partition values in the
+ * form of an [[InternalRow]].
+ */
+case class PartitionDirectory(values: InternalRow, files: Seq[FileStatus])
+
+/**
+ * An interface for objects capable of enumerating the root paths of a relation as well as the
+ * partitions of a relation subject to some pruning expressions.
+ */
+trait FileCatalog {
+
+  /**
+   * Returns the list of root input paths from which the catalog will get files. There may be a
+   * single root path from which partitions are discovered, or individual partitions may be
+   * specified by each path.
+   */
+  def rootPaths: Seq[Path]
+
+  /**
+   * Returns all valid files grouped into partitions when the data is partitioned. If the data is
+   * unpartitioned, this will return a single partition with no partition values.
+   *
+   * @param filters The filters used to prune which partitions are returned.  These filters must
+   *                only refer to partition columns and this method will only return files
+   *                where these predicates are guaranteed to evaluate to `true`.  Thus, these
+   *                filters will not need to be evaluated again on the returned data.
+   */
+  def listFiles(filters: Seq[Expression]): Seq[PartitionDirectory]
+
+  /**
+   * Returns the list of files that will be read when scanning this relation. This call may be
+   * very expensive for large tables.
+   */
+  def inputFiles: Array[String]
+
+  /** Refresh any cached file listings */
+  def refresh(): Unit
+
+  /** Sum of table file sizes, in bytes */
+  def sizeInBytes: Long
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala
index e7239ef91b326..9d153cec731a8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala
@@ -175,64 +175,3 @@ abstract class TextBasedFileFormat extends FileFormat {
     codec == null || codec.isInstanceOf[SplittableCompressionCodec]
   }
 }
-
-/**
- * A collection of data files from a partitioned relation, along with the partition values in the
- * form of an [[InternalRow]].
- */
-case class Partition(values: InternalRow, files: Seq[FileStatus])
-
-/**
- * An interface for objects capable of enumerating the root paths of a relation as well as the
- * partitions of a relation subject to some pruning expressions.
- */
-trait BasicFileCatalog {
-
-  /**
-   * Returns the list of root input paths from which the catalog will get files. There may be a
-   * single root path from which partitions are discovered, or individual partitions may be
-   * specified by each path.
-   */
-  def rootPaths: Seq[Path]
-
-  /**
-   * Returns all valid files grouped into partitions when the data is partitioned. If the data is
-   * unpartitioned, this will return a single partition with no partition values.
-   *
-   * @param filters The filters used to prune which partitions are returned.  These filters must
-   *                only refer to partition columns and this method will only return files
-   *                where these predicates are guaranteed to evaluate to `true`.  Thus, these
-   *                filters will not need to be evaluated again on the returned data.
-   */
-  def listFiles(filters: Seq[Expression]): Seq[Partition]
-
-  /** Returns the list of files that will be read when scanning this relation. */
-  def inputFiles: Array[String]
-
-  /** Refresh any cached file listings */
-  def refresh(): Unit
-
-  /** Sum of table file sizes, in bytes */
-  def sizeInBytes: Long
-}
-
-/**
- * A [[BasicFileCatalog]] which can enumerate all of the files comprising a relation and, from
- * those, infer the relation's partition specification.
- */
-// TODO: Consider a more descriptive, appropriate name which suggests this is a file catalog for
-// which it is safe to list all of its files?
-trait FileCatalog extends BasicFileCatalog {
-
-  /** Returns the specification of the partitions inferred from the data. */
-  def partitionSpec(): PartitionSpec
-
-  /** Returns all the valid files. */
-  def allFiles(): Seq[FileStatus]
-
-  /** Returns the list of files that will be read when scanning this relation. */
-  override def inputFiles: Array[String] =
-    allFiles().map(_.getPath.toUri.toString).toArray
-
-  override def sizeInBytes: Long = allFiles().map(_.getLen).sum
-}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala
index db889edf032d6..afad8898089bd 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala
@@ -28,7 +28,7 @@ import org.apache.spark.sql.types.StructType
  * Acts as a container for all of the metadata required to read from a datasource. All discovery,
  * resolution and merging logic for schemas and partitions has been removed.
  *
- * @param location A [[BasicFileCatalog]] that can enumerate the locations of all the files that
+ * @param location A [[FileCatalog]] that can enumerate the locations of all the files that
  *                 comprise this relation.
  * @param partitionSchema The schema of the columns (if any) that are used to partition the relation
  * @param dataSchema The schema of any remaining columns.  Note that if any partition columns are
@@ -38,7 +38,7 @@ import org.apache.spark.sql.types.StructType
  * @param options Configuration used when reading / writing data.
  */
 case class HadoopFsRelation(
-    location: BasicFileCatalog,
+    location: FileCatalog,
     partitionSchema: StructType,
     dataSchema: StructType,
     bucketSpec: Option[BucketSpec],
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
index b2508115c282f..5c8eff7ec46b4 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
@@ -17,14 +17,21 @@
 
 package org.apache.spark.sql.execution.datasources
 
+import java.io.FileNotFoundException
+
 import scala.collection.mutable
 
-import org.apache.hadoop.fs.{FileStatus, Path}
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs._
+import org.apache.hadoop.mapred.{FileInputFormat, JobConf}
 
+import org.apache.spark.internal.Logging
+import org.apache.spark.metrics.source.HiveCatalogMetrics
 import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.catalyst.{expressions, InternalRow}
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.types.{StringType, StructType}
+import org.apache.spark.util.SerializableConfiguration
 
 
 /**
@@ -38,22 +45,24 @@ import org.apache.spark.sql.types.{StringType, StructType}
 abstract class PartitioningAwareFileCatalog(
     sparkSession: SparkSession,
     parameters: Map[String, String],
-    partitionSchema: Option[StructType])
-  extends SessionFileCatalog(sparkSession) with FileCatalog {
+    partitionSchema: Option[StructType]) extends FileCatalog with Logging {
   import PartitioningAwareFileCatalog.BASE_PATH_PARAM
 
-  override protected val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions(parameters)
+  /** Returns the specification of the partitions inferred from the data. */
+  def partitionSpec(): PartitionSpec
+
+  protected val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions(parameters)
 
   protected def leafFiles: mutable.LinkedHashMap[Path, FileStatus]
 
   protected def leafDirToChildrenFiles: Map[Path, Array[FileStatus]]
 
-  override def listFiles(filters: Seq[Expression]): Seq[Partition] = {
+  override def listFiles(filters: Seq[Expression]): Seq[PartitionDirectory] = {
     val selectedPartitions = if (partitionSpec().partitionColumns.isEmpty) {
-      Partition(InternalRow.empty, allFiles().filter(f => isDataPath(f.getPath))) :: Nil
+      PartitionDirectory(InternalRow.empty, allFiles().filter(f => isDataPath(f.getPath))) :: Nil
     } else {
       prunePartitions(filters, partitionSpec()).map {
-        case PartitionDirectory(values, path) =>
+        case PartitionPath(values, path) =>
           val files: Seq[FileStatus] = leafDirToChildrenFiles.get(path) match {
             case Some(existingDir) =>
               // Directory has children files in it, return them
@@ -63,14 +72,20 @@ abstract class PartitioningAwareFileCatalog(
               // Directory does not exist, or has no children files
               Nil
           }
-          Partition(values, files)
+          PartitionDirectory(values, files)
       }
     }
     logTrace("Selected files after partition pruning:\n\t" + selectedPartitions.mkString("\n\t"))
     selectedPartitions
   }
 
-  override def allFiles(): Seq[FileStatus] = {
+  /** Returns the list of files that will be read when scanning this relation. */
+  override def inputFiles: Array[String] =
+    allFiles().map(_.getPath.toUri.toString).toArray
+
+  override def sizeInBytes: Long = allFiles().map(_.getLen).sum
+
+  def allFiles(): Seq[FileStatus] = {
     if (partitionSpec().partitionColumns.isEmpty) {
       // For each of the root input paths, get the list of files inside them
       rootPaths.flatMap { path =>
@@ -139,7 +154,7 @@ abstract class PartitioningAwareFileCatalog(
 
   private def prunePartitions(
       predicates: Seq[Expression],
-      partitionSpec: PartitionSpec): Seq[PartitionDirectory] = {
+      partitionSpec: PartitionSpec): Seq[PartitionPath] = {
     val PartitionSpec(partitionColumns, partitions) = partitionSpec
     val partitionColumnNames = partitionColumns.map(_.name).toSet
     val partitionPruningPredicates = predicates.filter {
@@ -156,7 +171,7 @@ abstract class PartitioningAwareFileCatalog(
       })
 
       val selected = partitions.filter {
-        case PartitionDirectory(values, _) => boundPredicate(values)
+        case PartitionPath(values, _) => boundPredicate(values)
       }
       logInfo {
         val total = partitions.length
@@ -214,8 +229,186 @@ abstract class PartitioningAwareFileCatalog(
     val name = path.getName
     !((name.startsWith("_") && !name.contains("=")) || name.startsWith("."))
   }
+
+  /**
+   * List leaf files of given paths. This method will submit a Spark job to do parallel
+   * listing whenever the number of given paths reaches the parallel partition discovery
+   * threshold; otherwise the paths are listed serially.
+   *
+   * This is publicly visible for testing.
+   */
+  def listLeafFiles(paths: Seq[Path]): mutable.LinkedHashSet[FileStatus] = {
+    val files =
+      if (paths.length >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) {
+        PartitioningAwareFileCatalog.listLeafFilesInParallel(paths, hadoopConf, sparkSession)
+      } else {
+        PartitioningAwareFileCatalog.listLeafFilesInSerial(paths, hadoopConf)
+      }
+
+    HiveCatalogMetrics.incrementFilesDiscovered(files.size)
+    mutable.LinkedHashSet(files: _*)
+  }
 }
 
-object PartitioningAwareFileCatalog {
+object PartitioningAwareFileCatalog extends Logging {
   val BASE_PATH_PARAM = "basePath"
+
+  /** A serializable variant of HDFS's BlockLocation. */
+  private case class SerializableBlockLocation(
+      names: Array[String],
+      hosts: Array[String],
+      offset: Long,
+      length: Long)
+
+  /** A serializable variant of HDFS's FileStatus. */
+  private case class SerializableFileStatus(
+      path: String,
+      length: Long,
+      isDir: Boolean,
+      blockReplication: Short,
+      blockSize: Long,
+      modificationTime: Long,
+      accessTime: Long,
+      blockLocations: Array[SerializableBlockLocation])
+
+  /**
+   * List a collection of paths recursively.
+   */
+  private def listLeafFilesInSerial(
+      paths: Seq[Path],
+      hadoopConf: Configuration): Seq[FileStatus] = {
+    // Dummy jobconf to get to the pathFilter defined in configuration
+    val jobConf = new JobConf(hadoopConf, this.getClass)
+    val filter = FileInputFormat.getInputPathFilter(jobConf)
+
+    paths.flatMap { path =>
+      val fs = path.getFileSystem(hadoopConf)
+      listLeafFiles0(fs, path, filter)
+    }
+  }
+
+  /**
+   * List a collection of paths recursively in parallel (using Spark executors).
+   * Each task launched will use [[listLeafFilesInSerial]] to list.
+   */
+  private def listLeafFilesInParallel(
+      paths: Seq[Path],
+      hadoopConf: Configuration,
+      sparkSession: SparkSession): Seq[FileStatus] = {
+    assert(paths.size >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold)
+    logInfo(s"Listing leaf files and directories in parallel under: ${paths.mkString(", ")}")
+
+    val sparkContext = sparkSession.sparkContext
+    val serializableConfiguration = new SerializableConfiguration(hadoopConf)
+    val serializedPaths = paths.map(_.toString)
+
+    // Cap the parallelism so that the following file listing does not generate too many tasks
+    // in case of a large #defaultParallelism.
+    val numParallelism = Math.min(paths.size, 10000)
+
+    val statuses = sparkContext
+      .parallelize(serializedPaths, numParallelism)
+      .mapPartitions { paths =>
+        val hadoopConf = serializableConfiguration.value
+        listLeafFilesInSerial(paths.map(new Path(_)).toSeq, hadoopConf).iterator
+      }.map { status =>
+        // Turn FileStatus into SerializableFileStatus so we can send it back to the driver
+        val blockLocations = status match {
+          case f: LocatedFileStatus =>
+            f.getBlockLocations.map { loc =>
+              SerializableBlockLocation(
+                loc.getNames,
+                loc.getHosts,
+                loc.getOffset,
+                loc.getLength)
+            }
+
+          case _ =>
+            Array.empty[SerializableBlockLocation]
+        }
+
+        SerializableFileStatus(
+          status.getPath.toString,
+          status.getLen,
+          status.isDirectory,
+          status.getReplication,
+          status.getBlockSize,
+          status.getModificationTime,
+          status.getAccessTime,
+          blockLocations)
+      }.collect()
+
+    // Turn SerializableFileStatus back into LocatedFileStatus
+    statuses.map { f =>
+      val blockLocations = f.blockLocations.map { loc =>
+        new BlockLocation(loc.names, loc.hosts, loc.offset, loc.length)
+      }
+      new LocatedFileStatus(
+        new FileStatus(
+          f.length, f.isDir, f.blockReplication, f.blockSize, f.modificationTime, new Path(f.path)),
+        blockLocations)
+    }
+  }
+
+  /**
+   * List the leaf files under a single path, provided as a Path, in serial.
+   */
+  private def listLeafFiles0(
+      fs: FileSystem, path: Path, filter: PathFilter): Seq[FileStatus] = {
+    logTrace(s"Listing $path")
+    val name = path.getName.toLowerCase
+    if (shouldFilterOut(name)) {
+      Seq.empty[FileStatus]
+    } else {
+      // [SPARK-17599] Prevent ListingFileCatalog from failing if path doesn't exist
+      // Note that statuses only include FileStatus for the files and dirs directly under path,
+      // and do not include anything else recursively.
+      val statuses = try fs.listStatus(path) catch {
+        case _: FileNotFoundException =>
+          logWarning(s"The directory $path was not found. Was it deleted very recently?")
+          Array.empty[FileStatus]
+      }
+
+      val allLeafStatuses = {
+        val (dirs, files) = statuses.partition(_.isDirectory)
+        val stats = files ++ dirs.flatMap(dir => listLeafFiles0(fs, dir.getPath, filter))
+        if (filter != null) stats.filter(f => filter.accept(f.getPath)) else stats
+      }
+
+      allLeafStatuses.filterNot(status => shouldFilterOut(status.getPath.getName)).map {
+        case f: LocatedFileStatus =>
+          f
+
+        // NOTE:
+        //
+        // - Although S3/S3A/S3N file system can be quite slow for remote file metadata
+        //   operations, calling `getFileBlockLocations` does no harm here since these file system
+        //   implementations don't actually issue RPC for this method.
+        //
+        // - Here we are calling `getFileBlockLocations` in a sequential manner, but it should not
+        //   be a big deal since we always use `listLeafFilesInParallel` when the number of
+        //   paths reaches the threshold.
+        case f =>
+          // The other constructor of LocatedFileStatus will call FileStatus.getPermission(),
+          // which is very slow on some file systems (RawLocalFileSystem, which launches a
+          // subprocess and parses the stdout).
+          val locations = fs.getFileBlockLocations(f, 0, f.getLen)
+          val lfs = new LocatedFileStatus(f.getLen, f.isDirectory, f.getReplication, f.getBlockSize,
+            f.getModificationTime, 0, null, null, null, null, f.getPath, locations)
+          if (f.isSymlink) {
+            lfs.setSymlink(f.getSymlink)
+          }
+          lfs
+      }
+    }
+  }
+
+  /** Checks if we should filter out this path name. */
+  def shouldFilterOut(pathName: String): Boolean = {
+    // We filter everything that starts with _ and ., except _common_metadata and _metadata
+    // because Parquet needs to find those metadata files from leaf files returned by this method.
+    // We should refactor this logic to not mix metadata files with data files.
+    ((pathName.startsWith("_") && !pathName.contains("=")) || pathName.startsWith(".")) &&
+      !pathName.startsWith("_common_metadata") && !pathName.startsWith("_metadata")
+  }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala
index 504464216e5a4..ac6795b9a2e7b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala
@@ -33,8 +33,8 @@ import org.apache.spark.sql.types._
 
 // TODO: We should tighten up visibility of the classes here once we clean up Hive coupling.
 
-object PartitionDirectory {
-  def apply(values: InternalRow, path: String): PartitionDirectory =
+object PartitionPath {
+  def apply(values: InternalRow, path: String): PartitionPath =
     apply(values, new Path(path))
 }
 
@@ -42,14 +42,14 @@ object PartitionDirectory {
  * Holds a directory in a partitioned collection of files as well as as the partition values
  * in the form of a Row.  Before scanning, the files at `path` need to be enumerated.
  */
-case class PartitionDirectory(values: InternalRow, path: Path)
+case class PartitionPath(values: InternalRow, path: Path)
 
 case class PartitionSpec(
     partitionColumns: StructType,
-    partitions: Seq[PartitionDirectory])
+    partitions: Seq[PartitionPath])
 
 object PartitionSpec {
-  val emptySpec = PartitionSpec(StructType(Seq.empty[StructField]), Seq.empty[PartitionDirectory])
+  val emptySpec = PartitionSpec(StructType(Seq.empty[StructField]), Seq.empty[PartitionPath])
 }
 
 object PartitioningUtils {
@@ -141,7 +141,7 @@ object PartitioningUtils {
       // Finally, we create `Partition`s based on paths and resolved partition values.
       val partitions = resolvedPartitionValues.zip(pathsWithPartitionValues).map {
         case (PartitionValues(_, literals), (path, _)) =>
-          PartitionDirectory(InternalRow.fromSeq(literals.map(_.value)), path)
+          PartitionPath(InternalRow.fromSeq(literals.map(_.value)), path)
       }
 
       PartitionSpec(StructType(fields), partitions)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala
deleted file mode 100644
index 4807a92c2e6b8..0000000000000
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala
+++ /dev/null
@@ -1,225 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.execution.datasources
-
-import java.io.FileNotFoundException
-
-import scala.collection.mutable
-
-import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.fs._
-import org.apache.hadoop.mapred.{FileInputFormat, JobConf}
-
-import org.apache.spark.internal.Logging
-import org.apache.spark.metrics.source.HiveCatalogMetrics
-import org.apache.spark.sql.SparkSession
-import org.apache.spark.util.SerializableConfiguration
-
-
-/**
- * A base class for [[BasicFileCatalog]]s that need a [[SparkSession]] and the ability to find leaf
- * files in a list of HDFS paths.
- *
- * @param sparkSession a [[SparkSession]]
- * @param ignoreFileNotFound (see [[ListingFileCatalog]])
- */
-abstract class SessionFileCatalog(sparkSession: SparkSession)
-    extends BasicFileCatalog with Logging {
-  protected val hadoopConf: Configuration
-
-  /**
-   * List leaf files of given paths. This method will submit a Spark job to do parallel
-   * listing whenever there is a path having more files than the parallel partition discovery
-   * discovery threshold.
-   *
-   * This is publicly visible for testing.
-   */
-  def listLeafFiles(paths: Seq[Path]): mutable.LinkedHashSet[FileStatus] = {
-    val files =
-      if (paths.length >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) {
-        SessionFileCatalog.listLeafFilesInParallel(paths, hadoopConf, sparkSession)
-      } else {
-        SessionFileCatalog.listLeafFilesInSerial(paths, hadoopConf)
-      }
-
-    HiveCatalogMetrics.incrementFilesDiscovered(files.size)
-    mutable.LinkedHashSet(files: _*)
-  }
-}
-
-object SessionFileCatalog extends Logging {
-
-  /** A serializable variant of HDFS's BlockLocation. */
-  private case class SerializableBlockLocation(
-      names: Array[String],
-      hosts: Array[String],
-      offset: Long,
-      length: Long)
-
-  /** A serializable variant of HDFS's FileStatus. */
-  private case class SerializableFileStatus(
-      path: String,
-      length: Long,
-      isDir: Boolean,
-      blockReplication: Short,
-      blockSize: Long,
-      modificationTime: Long,
-      accessTime: Long,
-      blockLocations: Array[SerializableBlockLocation])
-
-  /**
-   * List a collection of path recursively.
-   */
-  private def listLeafFilesInSerial(
-      paths: Seq[Path],
-      hadoopConf: Configuration): Seq[FileStatus] = {
-    // Dummy jobconf to get to the pathFilter defined in configuration
-    val jobConf = new JobConf(hadoopConf, this.getClass)
-    val filter = FileInputFormat.getInputPathFilter(jobConf)
-
-    paths.flatMap { path =>
-      val fs = path.getFileSystem(hadoopConf)
-      listLeafFiles0(fs, path, filter)
-    }
-  }
-
-  /**
-   * List a collection of path recursively in parallel (using Spark executors).
-   * Each task launched will use [[listLeafFilesInSerial]] to list.
-   */
-  private def listLeafFilesInParallel(
-      paths: Seq[Path],
-      hadoopConf: Configuration,
-      sparkSession: SparkSession): Seq[FileStatus] = {
-    assert(paths.size >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold)
-    logInfo(s"Listing leaf files and directories in parallel under: ${paths.mkString(", ")}")
-
-    val sparkContext = sparkSession.sparkContext
-    val serializableConfiguration = new SerializableConfiguration(hadoopConf)
-    val serializedPaths = paths.map(_.toString)
-
-    // Set the number of parallelism to prevent following file listing from generating many tasks
-    // in case of large #defaultParallelism.
-    val numParallelism = Math.min(paths.size, 10000)
-
-    val statuses = sparkContext
-      .parallelize(serializedPaths, numParallelism)
-      .mapPartitions { paths =>
-        val hadoopConf = serializableConfiguration.value
-        listLeafFilesInSerial(paths.map(new Path(_)).toSeq, hadoopConf).iterator
-      }.map { status =>
-        // Turn FileStatus into SerializableFileStatus so we can send it back to the driver
-        val blockLocations = status match {
-          case f: LocatedFileStatus =>
-            f.getBlockLocations.map { loc =>
-              SerializableBlockLocation(
-                loc.getNames,
-                loc.getHosts,
-                loc.getOffset,
-                loc.getLength)
-            }
-
-          case _ =>
-            Array.empty[SerializableBlockLocation]
-        }
-
-        SerializableFileStatus(
-          status.getPath.toString,
-          status.getLen,
-          status.isDirectory,
-          status.getReplication,
-          status.getBlockSize,
-          status.getModificationTime,
-          status.getAccessTime,
-          blockLocations)
-      }.collect()
-
-    // Turn SerializableFileStatus back to Status
-    statuses.map { f =>
-      val blockLocations = f.blockLocations.map { loc =>
-        new BlockLocation(loc.names, loc.hosts, loc.offset, loc.length)
-      }
-      new LocatedFileStatus(
-        new FileStatus(
-          f.length, f.isDir, f.blockReplication, f.blockSize, f.modificationTime, new Path(f.path)),
-        blockLocations)
-    }
-  }
-
-  /**
-   * List a single path, provided as a FileStatus, in serial.
-   */
-  private def listLeafFiles0(
-      fs: FileSystem, path: Path, filter: PathFilter): Seq[FileStatus] = {
-    logTrace(s"Listing $path")
-    val name = path.getName.toLowerCase
-    if (shouldFilterOut(name)) {
-      Seq.empty[FileStatus]
-    } else {
-      // [SPARK-17599] Prevent ListingFileCatalog from failing if path doesn't exist
-      // Note that statuses only include FileStatus for the files and dirs directly under path,
-      // and does not include anything else recursively.
-      val statuses = try fs.listStatus(path) catch {
-        case _: FileNotFoundException =>
-          logWarning(s"The directory $path was not found. Was it deleted very recently?")
-          Array.empty[FileStatus]
-      }
-
-      val allLeafStatuses = {
-        val (dirs, files) = statuses.partition(_.isDirectory)
-        val stats = files ++ dirs.flatMap(dir => listLeafFiles0(fs, dir.getPath, filter))
-        if (filter != null) stats.filter(f => filter.accept(f.getPath)) else stats
-      }
-
-      allLeafStatuses.filterNot(status => shouldFilterOut(status.getPath.getName)).map {
-        case f: LocatedFileStatus =>
-          f
-
-        // NOTE:
-        //
-        // - Although S3/S3A/S3N file system can be quite slow for remote file metadata
-        //   operations, calling `getFileBlockLocations` does no harm here since these file system
-        //   implementations don't actually issue RPC for this method.
-        //
-        // - Here we are calling `getFileBlockLocations` in a sequential manner, but it should not
-        //   be a big deal since we always use to `listLeafFilesInParallel` when the number of
-        //   paths exceeds threshold.
-        case f =>
-          // The other constructor of LocatedFileStatus will call FileStatus.getPermission(),
-          // which is very slow on some file system (RawLocalFileSystem, which is launch a
-          // subprocess and parse the stdout).
-          val locations = fs.getFileBlockLocations(f, 0, f.getLen)
-          val lfs = new LocatedFileStatus(f.getLen, f.isDirectory, f.getReplication, f.getBlockSize,
-            f.getModificationTime, 0, null, null, null, null, f.getPath, locations)
-          if (f.isSymlink) {
-            lfs.setSymlink(f.getSymlink)
-          }
-          lfs
-      }
-    }
-  }
-
-  /** Checks if we should filter out this path name. */
-  def shouldFilterOut(pathName: String): Boolean = {
-    // We filter everything that starts with _ and ., except _common_metadata and _metadata
-    // because Parquet needs to find those metadata files from leaf files returned by this method.
-    // We should refactor this logic to not mix metadata files with data files.
-    ((pathName.startsWith("_") && !pathName.contains("=")) || pathName.startsWith(".")) &&
-      !pathName.startsWith("_common_metadata") && !pathName.startsWith("_metadata")
-  }
-}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
index a5c41b244589b..5648ab480a98a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
@@ -25,7 +25,7 @@ import org.apache.spark.sql.types.StructType
 
 
 /**
- * A [[BasicFileCatalog]] for a metastore catalog table.
+ * A [[FileCatalog]] for a metastore catalog table.
  *
  * @param sparkSession a [[SparkSession]]
  * @param db the table's database name
@@ -38,10 +38,9 @@ class TableFileCatalog(
     db: String,
     table: String,
     partitionSchema: Option[StructType],
-    override val sizeInBytes: Long)
-  extends SessionFileCatalog(sparkSession) {
+    override val sizeInBytes: Long) extends FileCatalog {
 
-  override protected val hadoopConf = sparkSession.sessionState.newHadoopConf
+  protected val hadoopConf = sparkSession.sessionState.newHadoopConf
 
   private val externalCatalog = sparkSession.sharedState.externalCatalog
 
@@ -51,7 +50,7 @@ class TableFileCatalog(
 
   override def rootPaths: Seq[Path] = baseLocation.map(new Path(_)).toSeq
 
-  override def listFiles(filters: Seq[Expression]): Seq[Partition] = {
+  override def listFiles(filters: Seq[Expression]): Seq[PartitionDirectory] = {
     filterPartitions(filters).listFiles(Nil)
   }
 
@@ -79,7 +78,7 @@ class TableFileCatalog(
       case Some(schema) =>
         val selectedPartitions = externalCatalog.listPartitionsByFilter(db, table, filters)
         val partitions = selectedPartitions.map { p =>
-          PartitionDirectory(p.toRow(schema), p.storage.locationUri.get)
+          PartitionPath(p.toRow(schema), p.storage.locationUri.get)
         }
         val partitionSpec = PartitionSpec(schema, partitions)
         new PrunedTableFileCatalog(
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileCatalogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileCatalogSuite.scala
index 2695974b84b00..9c43169cbf898 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileCatalogSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileCatalogSuite.scala
@@ -81,6 +81,16 @@ class FileCatalogSuite extends SharedSQLContext {
     }
   }
 
+  test("PartitioningAwareFileCatalog - file filtering") {
+    assert(!PartitioningAwareFileCatalog.shouldFilterOut("abcd"))
+    assert(PartitioningAwareFileCatalog.shouldFilterOut(".ab"))
+    assert(PartitioningAwareFileCatalog.shouldFilterOut("_cd"))
+    assert(!PartitioningAwareFileCatalog.shouldFilterOut("_metadata"))
+    assert(!PartitioningAwareFileCatalog.shouldFilterOut("_common_metadata"))
+    assert(PartitioningAwareFileCatalog.shouldFilterOut("_ab_metadata"))
+    assert(PartitioningAwareFileCatalog.shouldFilterOut("_cd_common_metadata"))
+  }
+
   test("SPARK-17613 - PartitioningAwareFileCatalog: base path w/o '/' at end") {
     class MockCatalog(
       override val rootPaths: Seq[Path])
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalogSuite.scala
deleted file mode 100644
index df509583377ae..0000000000000
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalogSuite.scala
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.execution.datasources
-
-import org.apache.spark.SparkFunSuite
-
-class SessionFileCatalogSuite extends SparkFunSuite {
-
-  test("file filtering") {
-    assert(!SessionFileCatalog.shouldFilterOut("abcd"))
-    assert(SessionFileCatalog.shouldFilterOut(".ab"))
-    assert(SessionFileCatalog.shouldFilterOut("_cd"))
-
-    assert(!SessionFileCatalog.shouldFilterOut("_metadata"))
-    assert(!SessionFileCatalog.shouldFilterOut("_common_metadata"))
-    assert(SessionFileCatalog.shouldFilterOut("_ab_metadata"))
-    assert(SessionFileCatalog.shouldFilterOut("_cd_common_metadata"))
-  }
-}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala
index 43357c97c395a..36d4df0015ffd 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala
@@ -30,7 +30,7 @@ import org.apache.parquet.hadoop.ParquetOutputFormat
 import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.Literal
-import org.apache.spark.sql.execution.datasources.{FileCatalog, HadoopFsRelation, LogicalRelation, PartitionDirectory => Partition, PartitioningUtils, PartitionSpec}
+import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation, PartitionPath => Partition, PartitioningAwareFileCatalog, PartitioningUtils, PartitionSpec}
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SharedSQLContext
@@ -626,10 +626,11 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with Sha
       (1 to 10).map(i => (i, i.toString)).toDF("a", "b").write.parquet(dir.getCanonicalPath)
       val queryExecution = spark.read.parquet(dir.getCanonicalPath).queryExecution
       queryExecution.analyzed.collectFirst {
-        case LogicalRelation(HadoopFsRelation(location: FileCatalog, _, _, _, _, _), _, _) =>
-          assert(location.partitionSpec === PartitionSpec.emptySpec)
+        case LogicalRelation(
+            HadoopFsRelation(location: PartitioningAwareFileCatalog, _, _, _, _, _), _, _) =>
+          assert(location.partitionSpec() === PartitionSpec.emptySpec)
       }.getOrElse {
-        fail(s"Expecting a ParquetRelation2, but got:\n$queryExecution")
+        fail(s"Expecting a matching HadoopFsRelation, but got:\n$queryExecution")
       }
     }
   }
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index 4a2aaa7d4f6ca..16e1e37b2fb02 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -30,7 +30,7 @@ import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules._
 import org.apache.spark.sql.execution.command.DDLUtils
-import org.apache.spark.sql.execution.datasources.{Partition => _, _}
+import org.apache.spark.sql.execution.datasources._
 import org.apache.spark.sql.execution.datasources.parquet.{ParquetFileFormat, ParquetOptions}
 import org.apache.spark.sql.hive.orc.OrcFileFormat
 import org.apache.spark.sql.types._

From 699bce560ef77c2801ba8a7e28538f49581f98fe Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Tue, 18 Oct 2016 11:19:34 -0700
Subject: [PATCH 51/99] Use wildcard datasources import in ParquetPartitionDiscoverySuite

---
 .../datasources/parquet/ParquetPartitionDiscoverySuite.scala   | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala
index 36d4df0015ffd..7c50e33a648d9 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala
@@ -30,7 +30,8 @@ import org.apache.parquet.hadoop.ParquetOutputFormat
 import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.Literal
-import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation, PartitionPath => Partition, PartitioningAwareFileCatalog, PartitioningUtils, PartitionSpec}
+import org.apache.spark.sql.execution.datasources._
+import org.apache.spark.sql.execution.datasources.{PartitionPath => Partition}
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SharedSQLContext

From 2cef03b87f5dd8d5f2c0e15b3795104ea7f028d8 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Tue, 18 Oct 2016 14:41:34 -0700
Subject: [PATCH 52/99] Fix TableFileCatalog parent, reword pruning docs, move Hive tests

---
 .../execution/datasources/TableFileCatalog.scala |  2 +-
 .../org/apache/spark/sql/internal/SQLConf.scala  |  6 ++++--
 .../spark/sql/hive/HiveDDLCommandSuite.scala     | 16 +++++++++++++++-
 ...Suite.scala => HiveTablePerfStatsSuite.scala} | 15 +--------------
 4 files changed, 21 insertions(+), 18 deletions(-)
 rename sql/hive/src/test/scala/org/apache/spark/sql/hive/{HiveDataFrameSuite.scala => HiveTablePerfStatsSuite.scala} (90%)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
index fcad1cb3a8411..dee2c89de9b32 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
@@ -40,7 +40,7 @@ class TableFileCatalog(
     table: String,
     partitionSchema: Option[StructType],
     override val sizeInBytes: Long,
-    enableFileStatusCache: Boolean) extends BasicFileCatalog {
+    enableFileStatusCache: Boolean) extends FileCatalog {
 
   private val fileStatusCache = if (enableFileStatusCache)  {
     println("in mem cache")
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 8afd39d657865..8aa7ce7098fa3 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -265,13 +265,15 @@ object SQLConf {
   val HIVE_METASTORE_PARTITION_PRUNING =
     SQLConfigBuilder("spark.sql.hive.metastorePartitionPruning")
       .doc("When true, some predicates will be pushed down into the Hive metastore so that " +
-           "unmatching partitions can be eliminated earlier.")
+           "unmatching partitions can be eliminated earlier. This only affects Hive tables " +
+           "not converted to filesource relations (see `spark.sql.hive.convertMetastoreParquet` " +
+           "and `spark.sql.hive.convertMetastoreOrc` for more information)." )
       .booleanConf
       .createWithDefault(false)
 
   val HIVE_FILESOURCE_PARTITION_PRUNING =
     SQLConfigBuilder("spark.sql.hive.filesourcePartitionPruning")
-      .doc("When true, enable metastore partition pruning for file source tables as well. " +
+      .doc("When true, enable metastore partition pruning for filesource relations as well. " +
            "This is currently implemented for converted Hive tables only.")
       .booleanConf
       .createWithDefault(true)
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDDLCommandSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDDLCommandSuite.scala
index 81337493c7f28..d13e29b3029b1 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDDLCommandSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDDLCommandSuite.scala
@@ -577,5 +577,19 @@ class HiveDDLCommandSuite extends PlanTest with SQLTestUtils with TestHiveSingle
       assert(output == Some("org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat"))
       assert(serde == Some("org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe"))
     }
-   }
+  }
+
+  test("table name with schema") {
+    // regression test for SPARK-11778
+    spark.sql("create schema usrdb")
+    spark.sql("create table usrdb.test(c int)")
+    spark.read.table("usrdb.test")
+    spark.sql("drop table usrdb.test")
+    spark.sql("drop schema usrdb")
+  }
+
+  test("SPARK-15887: hive-site.xml should be loaded") {
+    val hiveClient = spark.sharedState.externalCatalog.asInstanceOf[HiveExternalCatalog].client
+    assert(hiveClient.getConf("hive.in.test", "") == "true")
+  }
 }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveTablePerfStatsSuite.scala
similarity index 90%
rename from sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala
rename to sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveTablePerfStatsSuite.scala
index f65e74de87a57..cd6a77efb9df0 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveTablePerfStatsSuite.scala
@@ -24,20 +24,7 @@ import org.apache.spark.sql.hive.test.TestHiveSingleton
 import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.QueryTest
 
-class HiveDataFrameSuite extends QueryTest with TestHiveSingleton with SQLTestUtils {
-  test("table name with schema") {
-    // regression test for SPARK-11778
-    spark.sql("create schema usrdb")
-    spark.sql("create table usrdb.test(c int)")
-    spark.read.table("usrdb.test")
-    spark.sql("drop table usrdb.test")
-    spark.sql("drop schema usrdb")
-  }
-
-  test("SPARK-15887: hive-site.xml should be loaded") {
-    val hiveClient = spark.sharedState.externalCatalog.asInstanceOf[HiveExternalCatalog].client
-    assert(hiveClient.getConf("hive.in.test", "") == "true")
-  }
+class HiveTablePerfStatsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils {
 
   private def setupPartitionedTable(tableName: String, dir: File): Unit = {
     spark.range(5).selectExpr("id", "id as partCol1", "id as partCol2").write

From d1d9d0b1d3ca681c4988511d766676b60d541c4f Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Tue, 18 Oct 2016 16:00:27 -0700
Subject: [PATCH 53/99] Use Guava ByteStreams.read instead of commons-io IOUtils in Utils

---
 core/src/main/scala/org/apache/spark/util/Utils.scala | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala
index a4da138e71992..7fba901b85695 100644
--- a/core/src/main/scala/org/apache/spark/util/Utils.scala
+++ b/core/src/main/scala/org/apache/spark/util/Utils.scala
@@ -42,7 +42,6 @@ import scala.util.control.{ControlThrowable, NonFatal}
 import com.google.common.cache.{CacheBuilder, CacheLoader, LoadingCache}
 import com.google.common.io.{ByteStreams, Files => GFiles}
 import com.google.common.net.InetAddresses
-import org.apache.commons.io.IOUtils
 import org.apache.commons.lang3.SystemUtils
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.{FileSystem, FileUtil, Path}
@@ -1486,10 +1485,10 @@ private[spark] object Utils extends Logging {
       val gzInputStream = new GZIPInputStream(new FileInputStream(file))
       val bufSize = 1024
       val buf = new Array[Byte](bufSize)
-      var numBytes = IOUtils.read(gzInputStream, buf)
+      var numBytes = ByteStreams.read(gzInputStream, buf, 0, bufSize)
       while (numBytes > 0) {
         fileSize += numBytes
-        numBytes = IOUtils.read(gzInputStream, buf)
+        numBytes = ByteStreams.read(gzInputStream, buf, 0, bufSize)
       }
       fileSize
     } catch {

From 947c32d1658948ed4322ab1c3413e92648432ab9 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Tue, 18 Oct 2016 16:54:19 -0700
Subject: [PATCH 54/99] Move FileStatusCache to its own file and drop debug printlns

---
 .../datasources/FileStatusCache.scala         | 78 ++++++++++++++++++
 .../PartitioningAwareFileCatalog.scala        | 48 +----------
 .../datasources/TableFileCatalog.scala        |  2 -
 .../apache/spark/sql/internal/SQLConf.scala   | 14 +++-
 .../spark/sql/hive/HiveMetastoreCatalog.scala |  3 +-
 .../sql/hive/HiveTablePerfStatsSuite.scala    | 79 ++++++++++++++++++-
 6 files changed, 172 insertions(+), 52 deletions(-)
 create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala
new file mode 100644
index 0000000000000..aee4d606394ce
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources
+
+import java.util.concurrent.ConcurrentHashMap
+
+import org.apache.hadoop.fs.{FileStatus, Path}
+
+import org.apache.spark.metrics.source.HiveCatalogMetrics
+import org.apache.spark.util.SerializableConfiguration
+
+/**
+ * A cache of the leaf files of partition directories. We cache these files in order to speed
+ * up iterated queries over the same set of partitions. Otherwise, each query would have to
+ * hit remote storage in order to gather file statistics for physical planning.
+ *
+ * Each resolved catalog table has its own FileStatusCache. When the backing relation for the
+ * table is refreshed via refreshTable() or refreshByPath(), this cache will be invalidated.
+ */
+abstract class FileStatusCache {
+  /**
+   * @return the leaf files for the specified path from this cache, or None if not cached.
+   */
+  def getLeafFiles(path: Path): Option[Array[FileStatus]] = None
+
+  /**
+   * Saves the given set of leaf files for a path in this cache.
+   */
+  def putLeafFiles(path: Path, leafFiles: Array[FileStatus]): Unit
+
+  /**
+   * Invalidates all data held by this cache.
+   */
+  def invalidateAll(): Unit
+}
+
+/**
+ * An implementation that caches all partition file statuses in memory forever.
+ */
+class InMemoryCache extends FileStatusCache {
+  private val cache = new ConcurrentHashMap[Path, Array[FileStatus]]()
+
+  override def getLeafFiles(path: Path): Option[Array[FileStatus]] = {
+    Option(cache.get(path))
+  }
+
+  override def putLeafFiles(path: Path, leafFiles: Array[FileStatus]): Unit = {
+    cache.put(path, leafFiles.toArray)
+  }
+
+  override def invalidateAll(): Unit = {
+    cache.clear()
+  }
+}
+
+/**
+ * A non-caching implementation used when partition file status caching is disabled.
+ */
+class NoopCache extends FileStatusCache {
+  override def getLeafFiles(path: Path): Option[Array[FileStatus]] = None
+  override def putLeafFiles(path: Path, leafFiles: Array[FileStatus]): Unit = {}
+  override def invalidateAll(): Unit = {}
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
index 3ba4e5c8f69fc..3deadb8312691 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
@@ -18,7 +18,6 @@
 package org.apache.spark.sql.execution.datasources
 
 import java.io.FileNotFoundException
-import java.util.concurrent.ConcurrentHashMap
 
 import scala.collection.mutable
 
@@ -34,47 +33,6 @@ import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.types.{StringType, StructType}
 import org.apache.spark.util.SerializableConfiguration
 
-
-abstract class FileStatusCache {
-  def getLeafFiles(path: Path): Option[Seq[FileStatus]] = None
-  def putLeafFiles(path: Path, leafFiles: Seq[FileStatus]): Unit
-  def invalidateAll(): Unit
-}
-
-class InMemoryCache extends FileStatusCache {
-  private val cache = new ConcurrentHashMap[Path, Seq[FileStatus]]()
-
-  override def getLeafFiles(path: Path): Option[Seq[FileStatus]] = {
-    val res = Option(cache.get(path))
-    res.foreach { r =>
-      HiveCatalogMetrics.incrementFileCacheHits(r.length)
-    }
-    res
-  }
-
-  override def putLeafFiles(path: Path, leafFiles: Seq[FileStatus]): Unit = {
-    println("discovered files: " + leafFiles)
-    HiveCatalogMetrics.incrementFilesDiscovered(leafFiles.size)
-    cache.put(path, leafFiles)
-  }
-
-  override def invalidateAll(): Unit = {
-    println("invalidating all")
-    cache.clear()
-  }
-}
-
-class NoopCache extends FileStatusCache {
-  override def getLeafFiles(path: Path): Option[Seq[FileStatus]] = None
-  override def putLeafFiles(path: Path, leafFiles: Seq[FileStatus]): Unit = {
-    println("[uncached] discovered files: " + leafFiles)
-    HiveCatalogMetrics.incrementFilesDiscovered(leafFiles.size)
-  }
-  override def invalidateAll(): Unit = {
-    println("invalidating all")
-  }
-}
-
 /**
  * An abstract class that represents [[FileCatalog]]s that are aware of partitioned tables.
  * It provides the necessary methods to parse partition data based on a set of files.
@@ -285,10 +243,9 @@ abstract class PartitioningAwareFileCatalog(
     for (path <- paths) {
       fileStatusCache.getLeafFiles(path) match {
         case Some(files) =>
-          println("cache hit: " + path)
+          HiveCatalogMetrics.incrementFileCacheHits(files.length)
           output ++= files
         case None =>
-          println("cache miss: " + path)
           pathsToFetch += path
       }
     }
@@ -299,7 +256,8 @@ abstract class PartitioningAwareFileCatalog(
       PartitioningAwareFileCatalog.listLeafFilesInSerial(pathsToFetch, hadoopConf)
     }
     discovered.foreach { case (path, leafFiles) =>
-      fileStatusCache.putLeafFiles(path, leafFiles)
+      HiveCatalogMetrics.incrementFilesDiscovered(leafFiles.size)
+      fileStatusCache.putLeafFiles(path, leafFiles.toArray)
       output ++= leafFiles
     }
     output
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
index dee2c89de9b32..67d98505b72e5 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
@@ -43,10 +43,8 @@ class TableFileCatalog(
     enableFileStatusCache: Boolean) extends FileCatalog {
 
   private val fileStatusCache = if (enableFileStatusCache)  {
-    println("in mem cache")
     new InMemoryCache
   } else {
-    println("using noop cache")
     new NoopCache
   }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 8aa7ce7098fa3..920c3dee5616e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -266,8 +266,8 @@ object SQLConf {
     SQLConfigBuilder("spark.sql.hive.metastorePartitionPruning")
       .doc("When true, some predicates will be pushed down into the Hive metastore so that " +
            "unmatching partitions can be eliminated earlier. This only affects Hive tables " +
-           "not converted to filesource relations (see `spark.sql.hive.convertMetastoreParquet` " +
-           "and `spark.sql.hive.convertMetastoreOrc` for more information)." )
+           "not converted to filesource relations (see HiveUtils.CONVERT_METASTORE_PARQUET and " +
+           "HiveUtils.CONVERT_METASTORE_ORC for more information).")
       .booleanConf
       .createWithDefault(false)
 
@@ -278,6 +278,13 @@ object SQLConf {
       .booleanConf
       .createWithDefault(true)
 
+  val HIVE_FILESOURCE_PARTITION_FILE_CACHE_ENABLED =
+    SQLConfigBuilder("spark.sql.hive.filesourcePartitionFileCacheEnabled")
+      .doc("When true, enable caching of partition files in memory. This only takes effect " +
+           "if filesource partition pruning is also enabled.")
+      .booleanConf
+      .createWithDefault(true)
+
   val OPTIMIZER_METADATA_ONLY = SQLConfigBuilder("spark.sql.optimizer.metadataOnly")
     .doc("When true, enable the metadata-only query optimization that use the table's metadata " +
       "to produce the partition columns instead of table scans. It applies when all the columns " +
@@ -679,6 +686,9 @@ private[sql] class SQLConf extends Serializable with CatalystConf with Logging {
 
   def filesourcePartitionPruning: Boolean = getConf(HIVE_FILESOURCE_PARTITION_PRUNING)
 
+  def filesourcePartitionFileCacheEnabled: Boolean =
+    getConf(HIVE_FILESOURCE_PARTITION_FILE_CACHE_ENABLED)
+
   def gatherFastStats: Boolean = getConf(GATHER_FASTSTAT)
 
   def optimizerMetadataOnly: Boolean = getConf(OPTIMIZER_METADATA_ONLY)
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index ed1f41bac23d9..a42aec0961f00 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -232,7 +232,8 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
         val fileCatalog = {
           val catalog = new TableFileCatalog(
             sparkSession, db, table, Some(partitionSchema), sizeInBytes,
-            enableFileStatusCache = lazyPruningEnabled)
+            enableFileStatusCache = lazyPruningEnabled &&
+              sparkSession.sqlContext.conf.filesourcePartitionFileCacheEnabled)
           if (lazyPruningEnabled) {
             catalog
           } else {
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveTablePerfStatsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveTablePerfStatsSuite.scala
index cd6a77efb9df0..e05671bafc85d 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveTablePerfStatsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveTablePerfStatsSuite.scala
@@ -65,7 +65,9 @@ class HiveTablePerfStatsSuite extends QueryTest with TestHiveSingleton with SQLT
   }
 
   test("lazy partition pruning reads only necessary partition data") {
-    withSQLConf("spark.sql.hive.filesourcePartitionPruning" -> "true") {
+    withSQLConf(
+        "spark.sql.hive.filesourcePartitionPruning" -> "true",
+        "spark.sql.hive.filesourcePartitionFileCacheEnabled" -> "false") {
       withTable("test") {
         withTempDir { dir =>
           setupPartitionedTable("test", dir)
@@ -90,11 +92,84 @@ class HiveTablePerfStatsSuite extends QueryTest with TestHiveSingleton with SQLT
           assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 5)
           assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 5)
 
-          // read all should be cached
+          // read all should not be cached
           HiveCatalogMetrics.reset()
           spark.sql("select * from test").count()
+          assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 5)
+          assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 5)
+
+          // cache should be disabled
+          assert(HiveCatalogMetrics.METRIC_FILE_CACHE_HITS.getCount() == 0)
+        }
+      }
+    }
+  }
+
+  test("lazy partition pruning with file status caching enabled") {
+    withSQLConf(
+        "spark.sql.hive.filesourcePartitionPruning" -> "true",
+        "spark.sql.hive.filesourcePartitionFileCacheEnabled" -> "true") {
+      withTable("test") {
+        withTempDir { dir =>
+          setupPartitionedTable("test", dir)
+          HiveCatalogMetrics.reset()
+          spark.sql("select * from test where partCol1 = 999").count()
           assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 0)
           assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 0)
+          assert(HiveCatalogMetrics.METRIC_FILE_CACHE_HITS.getCount() == 0)
+
+          HiveCatalogMetrics.reset()
+          spark.sql("select * from test where partCol1 < 2").count()
+          assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 2)
+          assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 2)
+          assert(HiveCatalogMetrics.METRIC_FILE_CACHE_HITS.getCount() == 0)
+
+          HiveCatalogMetrics.reset()
+          spark.sql("select * from test where partCol1 < 3").count()
+          assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 3)
+          assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 1)
+          assert(HiveCatalogMetrics.METRIC_FILE_CACHE_HITS.getCount() == 2)
+
+          HiveCatalogMetrics.reset()
+          spark.sql("select * from test").count()
+          assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 5)
+          assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 2)
+          assert(HiveCatalogMetrics.METRIC_FILE_CACHE_HITS.getCount() == 3)
+
+          HiveCatalogMetrics.reset()
+          spark.sql("select * from test").count()
+          assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 5)
+          assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 0)
+          assert(HiveCatalogMetrics.METRIC_FILE_CACHE_HITS.getCount() == 5)
+        }
+      }
+    }
+  }
+
+  test("file status caching respects refresh table and refreshByPath") {
+    withSQLConf(
+        "spark.sql.hive.filesourcePartitionPruning" -> "true",
+        "spark.sql.hive.filesourcePartitionFileCacheEnabled" -> "true") {
+      withTable("test") {
+        withTempDir { dir =>
+          setupPartitionedTable("test", dir)
+          HiveCatalogMetrics.reset()
+          spark.sql("select * from test").count()
+          assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 5)
+          assert(HiveCatalogMetrics.METRIC_FILE_CACHE_HITS.getCount() == 0)
+
+          HiveCatalogMetrics.reset()
+          spark.sql("refresh table test")
+          spark.sql("select * from test").count()
+          assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 5)
+          assert(HiveCatalogMetrics.METRIC_FILE_CACHE_HITS.getCount() == 0)
+
+          spark.catalog.cacheTable("test")
+          HiveCatalogMetrics.reset()
+          spark.catalog.refreshByPath(dir.getAbsolutePath)
+          spark.sql("select * from test").count()
+          assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 5)
+          assert(HiveCatalogMetrics.METRIC_FILE_CACHE_HITS.getCount() == 0)
         }
       }
     }

From b9272c2a51ab981a6343a43931f57d03e931aab1 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Tue, 18 Oct 2016 19:11:58 -0700
Subject: [PATCH 55/99] Make NoopCache private and expose it as FileStatusCache.noop

---
 .../spark/sql/execution/datasources/FileStatusCache.scala   | 6 +++++-
 .../datasources/PartitioningAwareFileCatalog.scala          | 2 +-
 .../spark/sql/execution/datasources/TableFileCatalog.scala  | 2 +-
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala
index aee4d606394ce..c01afa40ec5f4 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala
@@ -71,8 +71,12 @@ class InMemoryCache extends FileStatusCache {
 /**
  * A non-caching implementation used when partition file status caching is disabled.
  */
-class NoopCache extends FileStatusCache {
+private class NoopCache extends FileStatusCache {
   override def getLeafFiles(path: Path): Option[Array[FileStatus]] = None
   override def putLeafFiles(path: Path, leafFiles: Array[FileStatus]): Unit = {}
   override def invalidateAll(): Unit = {}
 }
+
+object FileStatusCache {
+  val noop: FileStatusCache = new NoopCache
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
index 3deadb8312691..cfc4747ef9ad9 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
@@ -45,7 +45,7 @@ abstract class PartitioningAwareFileCatalog(
     sparkSession: SparkSession,
     parameters: Map[String, String],
     partitionSchema: Option[StructType],
-    fileStatusCache: FileStatusCache = new NoopCache) extends FileCatalog with Logging {
+    fileStatusCache: FileStatusCache = FileStatusCache.noop) extends FileCatalog with Logging {
   import PartitioningAwareFileCatalog.BASE_PATH_PARAM
 
   /** Returns the specification of the partitions inferred from the data. */
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
index 67d98505b72e5..17e2a7bf35bbe 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
@@ -45,7 +45,7 @@ class TableFileCatalog(
   private val fileStatusCache = if (enableFileStatusCache)  {
     new InMemoryCache
   } else {
-    new NoopCache
+    FileStatusCache.noop
   }
 
   protected val hadoopConf = sparkSession.sessionState.newHadoopConf

From 96b12e54daab0186f081226e1943a9cd5322dbec Mon Sep 17 00:00:00 2001
From: Wenchen Fan <wenchen@databricks.com>
Date: Wed, 19 Oct 2016 17:29:54 +0800
Subject: [PATCH 56/99] store partition spec in metastore for data source table

---
 .../sql/catalyst/catalog/SessionCatalog.scala |   3 +-
 .../sql/catalyst/catalog/interface.scala      |   8 +-
 .../command/createDataSourceTables.scala      |  17 ++
 .../spark/sql/execution/command/ddl.scala     |  32 ++--
 .../datasources/DataSourceStrategy.scala      |  13 +-
 .../PruneFileSourcePartitions.scala           |   4 +-
 .../datasources/TableFileCatalog.scala        |  26 ++-
 .../sql/execution/command/DDLSuite.scala      | 154 ++++++++----------
 .../spark/sql/hive/HiveExternalCatalog.scala  |  14 +-
 .../spark/sql/hive/HiveMetastoreCatalog.scala |  16 +-
 .../sql/hive/client/HiveClientImpl.scala      |   2 +-
 .../sql/hive/execution/SQLQuerySuite.scala    |   2 +-
 12 files changed, 155 insertions(+), 136 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala
index 9711131d88a05..d8c18e8e61cd1 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala
@@ -762,7 +762,8 @@ class SessionCatalog(
   private def requireExactMatchedPartitionSpec(
       specs: Seq[TablePartitionSpec],
       table: CatalogTable): Unit = {
-    val defined = table.partitionColumnNames.sorted
+    // The partition columns in partition specification are always lower cased.
+    val defined = table.partitionColumnNames.map(_.toLowerCase).sorted
     specs.foreach { s =>
       if (s.keys.toSeq.sorted != defined) {
         throw new AnalysisException(
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
index 1a57a7707caa1..06f310a575b26 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
@@ -89,9 +89,10 @@ case class CatalogTablePartition(
     parameters: Map[String, String] = Map.empty) {
 
   override def toString: String = {
+    val specString = spec.map { case (k, v) => s"$k=$v" }.mkString(", ")
     val output =
       Seq(
-        s"Partition Values: [${spec.values.mkString(", ")}]",
+        s"Partition Values: [$specString]",
         s"$storage",
         s"Partition Parameters:{${parameters.map(p => p._1 + "=" + p._2).mkString(", ")}}")
 
@@ -102,8 +103,9 @@ case class CatalogTablePartition(
    * Given the partition schema, returns a row with that schema holding the partition values.
    */
   def toRow(partitionSchema: StructType): InternalRow = {
-    InternalRow.fromSeq(partitionSchema.map { case StructField(name, dataType, _, _) =>
-      Cast(Literal(spec(name)), dataType).eval()
+    InternalRow.fromSeq(partitionSchema.map { field =>
+      // The partition columns in partition specification are always lower cased.
+      Cast(Literal(spec(field.name.toLowerCase)), field.dataType).eval()
     })
   }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
index a8c75a7f29cef..d7907ace47166 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
@@ -98,6 +98,16 @@ case class CreateDataSourceTableCommand(table: CatalogTable, ignoreIfExists: Boo
     // We will return Nil or throw exception at the beginning if the table already exists, so when
     // we reach here, the table should not exist and we should set `ignoreIfExists` to false.
     sessionState.catalog.createTable(newTable, ignoreIfExists = false)
+
+    dataSource match {
+      case fs: HadoopFsRelation =>
+        if (table.tableType == CatalogTableType.EXTERNAL && partitionColumnNames.nonEmpty) {
+          sparkSession.sessionState.executePlan(
+            AlterTableRecoverPartitionsCommand(table.identifier)).toRdd
+        }
+      case _ =>
+    }
+
     Seq.empty[Row]
   }
 }
@@ -232,6 +242,13 @@ case class CreateDataSourceTableAsSelectCommand(
       sessionState.catalog.createTable(newTable, ignoreIfExists = false)
     }
 
+    result match {
+      case fs: HadoopFsRelation if table.partitionColumnNames.nonEmpty =>
+        sparkSession.sessionState.executePlan(
+          AlterTableRecoverPartitionsCommand(table.identifier)).toRdd
+      case _ =>
+    }
+
     // Refresh the cache of the table in the catalog.
     sessionState.catalog.refreshTable(tableIdentWithDB)
     Seq.empty[Row]
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
index 45fa293e58951..aea3bce914454 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
@@ -31,7 +31,7 @@ import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.catalog.{CatalogDatabase, CatalogTable, CatalogTablePartition, CatalogTableType, SessionCatalog}
 import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec
 import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
-import org.apache.spark.sql.execution.datasources.PartitioningUtils
+import org.apache.spark.sql.execution.datasources.{CaseInsensitiveMap, PartitioningUtils}
 import org.apache.spark.sql.types._
 import org.apache.spark.util.SerializableConfiguration
 
@@ -346,10 +346,6 @@ case class AlterTableAddPartitionCommand(
     val catalog = sparkSession.sessionState.catalog
     val table = catalog.getTableMetadata(tableName)
     DDLUtils.verifyAlterTableType(catalog, table, isView = false)
-    if (DDLUtils.isDatasourceTable(table)) {
-      throw new AnalysisException(
-        "ALTER TABLE ADD PARTITION is not allowed for tables defined using the datasource API")
-    }
     val parts = partitionSpecsAndLocs.map { case (spec, location) =>
       // inherit table storage format (possibly except for location)
       CatalogTablePartition(spec, table.storage.copy(locationUri = location))
@@ -377,10 +373,6 @@ case class AlterTableRenamePartitionCommand(
   override def run(sparkSession: SparkSession): Seq[Row] = {
     val catalog = sparkSession.sessionState.catalog
     val table = catalog.getTableMetadata(tableName)
-    if (DDLUtils.isDatasourceTable(table)) {
-      throw new AnalysisException(
-        "ALTER TABLE RENAME PARTITION is not allowed for tables defined using the datasource API")
-    }
     DDLUtils.verifyAlterTableType(catalog, table, isView = false)
     catalog.renamePartitions(
       tableName, Seq(oldPartition), Seq(newPartition))
@@ -414,10 +406,6 @@ case class AlterTableDropPartitionCommand(
     val catalog = sparkSession.sessionState.catalog
     val table = catalog.getTableMetadata(tableName)
     DDLUtils.verifyAlterTableType(catalog, table, isView = false)
-    if (DDLUtils.isDatasourceTable(table)) {
-      throw new AnalysisException(
-        "ALTER TABLE DROP PARTITIONS is not allowed for tables defined using the datasource API")
-    }
     catalog.dropPartitions(table.identifier, specs, ignoreIfNotExists = ifExists, purge = purge)
     Seq.empty[Row]
   }
@@ -465,25 +453,31 @@ case class AlterTableRecoverPartitionsCommand(
     }
   }
 
+  /** Table root: the storage URI for Hive tables, otherwise the "path" storage property. */
+  private def getPath(table: CatalogTable): Option[String] = table.provider match {
+    case Some("hive") => table.storage.locationUri
+    case _ =>
+      // The "path" key is looked up case-insensitively among the storage properties.
+      new CaseInsensitiveMap(table.storage.properties).get("path")
+  }
+
   override def run(spark: SparkSession): Seq[Row] = {
     val catalog = spark.sessionState.catalog
     val table = catalog.getTableMetadata(tableName)
     val tableIdentWithDB = table.identifier.quotedString
     DDLUtils.verifyAlterTableType(catalog, table, isView = false)
-    if (DDLUtils.isDatasourceTable(table)) {
-      throw new AnalysisException(
-        s"Operation not allowed: $cmd on datasource tables: $tableIdentWithDB")
-    }
     if (table.partitionColumnNames.isEmpty) {
       throw new AnalysisException(
         s"Operation not allowed: $cmd only works on partitioned tables: $tableIdentWithDB")
     }
-    if (table.storage.locationUri.isEmpty) {
+
+    val tablePath = getPath(table)
+    if (tablePath.isEmpty) {
       throw new AnalysisException(s"Operation not allowed: $cmd only works on table with " +
         s"location provided: $tableIdentWithDB")
     }
 
-    val root = new Path(table.storage.locationUri.get)
+    val root = new Path(tablePath.get)
     logInfo(s"Recover all the partitions in $root")
     val fs = root.getFileSystem(spark.sparkContext.hadoopConfiguration)
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
index 7d0abe86a44df..588d9bd4e6167 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
@@ -30,11 +30,11 @@ import org.apache.spark.sql.catalyst.expressions
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.planning.PhysicalOperation
 import org.apache.spark.sql.catalyst.plans.logical
-import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project}
+import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project, Union}
 import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, UnknownPartitioning}
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.execution.{RowDataSourceScanExec, SparkPlan}
-import org.apache.spark.sql.execution.command.{DDLUtils, ExecutedCommandExec}
+import org.apache.spark.sql.execution.command.{AlterTableRecoverPartitionsCommand, DDLUtils, ExecutedCommandExec}
 import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String
@@ -179,7 +179,7 @@ case class DataSourceAnalysis(conf: CatalystConf) extends Rule[LogicalPlan] {
           "Cannot overwrite a path that is also being read from.")
       }
 
-      InsertIntoHadoopFsRelationCommand(
+      val insertCmd = InsertIntoHadoopFsRelationCommand(
         outputPath,
         query.resolve(t.partitionSchema, t.sparkSession.sessionState.analyzer.resolver),
         t.bucketSpec,
@@ -188,6 +188,13 @@ case class DataSourceAnalysis(conf: CatalystConf) extends Rule[LogicalPlan] {
         t.options,
         query,
         mode)
+
+      if (l.catalogTable.isDefined && l.catalogTable.get.partitionColumnNames.nonEmpty) {
+        val recoverPartitionCmd = AlterTableRecoverPartitionsCommand(l.catalogTable.get.identifier)
+        Union(insertCmd, recoverPartitionCmd)
+      } else {
+        insertCmd
+      }
   }
 }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala
index 29121a47d92d1..8689017c3ed75 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala
@@ -59,7 +59,9 @@ private[sql] object PruneFileSourcePartitions extends Rule[LogicalPlan] {
         val prunedFileCatalog = tableFileCatalog.filterPartitions(partitionKeyFilters.toSeq)
         val prunedFsRelation =
           fsRelation.copy(location = prunedFileCatalog)(sparkSession)
-        val prunedLogicalRelation = logicalRelation.copy(relation = prunedFsRelation)
+        val prunedLogicalRelation = logicalRelation.copy(
+          relation = prunedFsRelation,
+          expectedOutputAttributes = Some(logicalRelation.output))
 
         // Keep partition-pruning predicates so that they are visible in physical planning
         val filterExpression = filters.reduceLeft(And)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
index fc08c3798ee06..916ffbbd39e0a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
@@ -35,8 +35,8 @@ import org.apache.spark.sql.types.StructType
  */
 class TableFileCatalog(
     sparkSession: SparkSession,
-    db: String,
-    table: String,
+    val db: String,
+    val table: String,
     partitionSchema: Option[StructType],
     override val sizeInBytes: Long) extends FileCatalog {
 
@@ -46,7 +46,11 @@ class TableFileCatalog(
 
   private val catalogTable = externalCatalog.getTable(db, table)
 
-  private val baseLocation = catalogTable.storage.locationUri
+  private val baseLocation = if (catalogTable.provider == Some("hive")) {
+    catalogTable.storage.locationUri
+  } else {
+    new CaseInsensitiveMap(catalogTable.storage.properties).get("path")
+  }
 
   // Populated on-demand by calls to cachedAllPartitions
   private var cachedAllPartitions: ListingFileCatalog = null
@@ -76,11 +80,8 @@ class TableFileCatalog(
   }
 
   private def filterPartitions0(filters: Seq[Expression]): ListingFileCatalog = {
-    val parameters = baseLocation
-      .map(loc => Map(PartitioningAwareFileCatalog.BASE_PATH_PARAM -> loc))
-      .getOrElse(Map.empty)
     partitionSchema match {
-      case Some(schema) =>
+      case Some(schema) if schema.nonEmpty =>
         val selectedPartitions = externalCatalog.listPartitionsByFilter(db, table, filters)
         val partitions = selectedPartitions.map { p =>
           PartitionPath(p.toRow(schema), p.storage.locationUri.get)
@@ -88,8 +89,8 @@ class TableFileCatalog(
         val partitionSpec = PartitionSpec(schema, partitions)
         new PrunedTableFileCatalog(
           sparkSession, new Path(baseLocation.get), partitionSpec)
-      case None =>
-        new ListingFileCatalog(sparkSession, rootPaths, parameters, None)
+      case _ =>
+        new ListingFileCatalog(sparkSession, rootPaths, catalogTable.storage.properties, None)
     }
   }
 
@@ -102,6 +103,13 @@ class TableFileCatalog(
   }
 
   override def inputFiles: Array[String] = allPartitions.inputFiles
+
+  override def equals(o: Any): Boolean = o match {
+    case other: TableFileCatalog => this.db == other.db && this.table == other.table
+    case _ => false
+  }
+
+  override def hashCode(): Int = 31 * db.hashCode + table.hashCode
 }
 
 /**
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
index c8b8e9ebabc75..9bbde8308e6a6 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
@@ -926,58 +926,11 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach {
   }
 
   test("alter table: rename partition") {
-    val catalog = spark.sessionState.catalog
-    val tableIdent = TableIdentifier("tab1", Some("dbx"))
-    createPartitionedTable(tableIdent, isDatasourceTable = false)
-    sql("ALTER TABLE dbx.tab1 PARTITION (a='1', b='q') RENAME TO PARTITION (a='100', b='p')")
-    sql("ALTER TABLE dbx.tab1 PARTITION (a='2', b='c') RENAME TO PARTITION (a='20', b='c')")
-    assert(catalog.listPartitions(tableIdent).map(_.spec).toSet ==
-      Set(Map("a" -> "100", "b" -> "p"), Map("a" -> "20", "b" -> "c"), Map("a" -> "3", "b" -> "p")))
-    // rename without explicitly specifying database
-    catalog.setCurrentDatabase("dbx")
-    sql("ALTER TABLE tab1 PARTITION (a='100', b='p') RENAME TO PARTITION (a='10', b='p')")
-    assert(catalog.listPartitions(tableIdent).map(_.spec).toSet ==
-      Set(Map("a" -> "10", "b" -> "p"), Map("a" -> "20", "b" -> "c"), Map("a" -> "3", "b" -> "p")))
-    // table to alter does not exist
-    intercept[NoSuchTableException] {
-      sql("ALTER TABLE does_not_exist PARTITION (c='3') RENAME TO PARTITION (c='333')")
-    }
-    // partition to rename does not exist
-    intercept[NoSuchPartitionException] {
-      sql("ALTER TABLE tab1 PARTITION (a='not_found', b='1') RENAME TO PARTITION (a='1', b='2')")
-    }
+    testRenamePartitions(isDatasourceTable = false)
   }
 
   test("alter table: rename partition (datasource table)") {
-    createPartitionedTable(TableIdentifier("tab1", Some("dbx")), isDatasourceTable = true)
-    val e = intercept[AnalysisException] {
-      sql("ALTER TABLE dbx.tab1 PARTITION (a='1', b='q') RENAME TO PARTITION (a='100', b='p')")
-    }.getMessage
-    assert(e.contains(
-      "ALTER TABLE RENAME PARTITION is not allowed for tables defined using the datasource API"))
-    // table to alter does not exist
-    intercept[NoSuchTableException] {
-      sql("ALTER TABLE does_not_exist PARTITION (c='3') RENAME TO PARTITION (c='333')")
-    }
-  }
-
-  private def createPartitionedTable(
-      tableIdent: TableIdentifier,
-      isDatasourceTable: Boolean): Unit = {
-    val catalog = spark.sessionState.catalog
-    val part1 = Map("a" -> "1", "b" -> "q")
-    val part2 = Map("a" -> "2", "b" -> "c")
-    val part3 = Map("a" -> "3", "b" -> "p")
-    createDatabase(catalog, "dbx")
-    createTable(catalog, tableIdent)
-    createTablePartition(catalog, part1, tableIdent)
-    createTablePartition(catalog, part2, tableIdent)
-    createTablePartition(catalog, part3, tableIdent)
-    assert(catalog.listPartitions(tableIdent).map(_.spec).toSet ==
-      Set(part1, part2, part3))
-    if (isDatasourceTable) {
-      convertToDatasourceTable(catalog, tableIdent)
-    }
+    testRenamePartitions(isDatasourceTable = true)
   }
 
   test("show tables") {
@@ -1344,40 +1297,33 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach {
       convertToDatasourceTable(catalog, tableIdent)
     }
     assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == Set(part1))
-    maybeWrapException(isDatasourceTable) {
-      sql("ALTER TABLE dbx.tab1 ADD IF NOT EXISTS " +
-        "PARTITION (a='2', b='6') LOCATION 'paris' PARTITION (a='3', b='7')")
-    }
-    if (!isDatasourceTable) {
-      assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == Set(part1, part2, part3))
-      assert(catalog.getPartition(tableIdent, part1).storage.locationUri.isEmpty)
-      assert(catalog.getPartition(tableIdent, part2).storage.locationUri == Option("paris"))
-      assert(catalog.getPartition(tableIdent, part3).storage.locationUri.isEmpty)
-    }
+
+    sql("ALTER TABLE dbx.tab1 ADD IF NOT EXISTS PARTITION (a='2', b='6') LOCATION 'paris' " +
+      "PARTITION (a='3', b='7')")
+    assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == Set(part1, part2, part3))
+    assert(catalog.getPartition(tableIdent, part1).storage.locationUri.isEmpty)
+    assert(catalog.getPartition(tableIdent, part2).storage.locationUri == Option("paris"))
+    assert(catalog.getPartition(tableIdent, part3).storage.locationUri.isEmpty)
+
     // add partitions without explicitly specifying database
     catalog.setCurrentDatabase("dbx")
-    maybeWrapException(isDatasourceTable) {
-      sql("ALTER TABLE tab1 ADD IF NOT EXISTS PARTITION (a='4', b='8')")
-    }
-    if (!isDatasourceTable) {
-      assert(catalog.listPartitions(tableIdent).map(_.spec).toSet ==
-        Set(part1, part2, part3, part4))
-    }
+    sql("ALTER TABLE tab1 ADD IF NOT EXISTS PARTITION (a='4', b='8')")
+    assert(catalog.listPartitions(tableIdent).map(_.spec).toSet ==
+      Set(part1, part2, part3, part4))
+
     // table to alter does not exist
     intercept[AnalysisException] {
       sql("ALTER TABLE does_not_exist ADD IF NOT EXISTS PARTITION (a='4', b='9')")
     }
+
     // partition to add already exists
     intercept[AnalysisException] {
       sql("ALTER TABLE tab1 ADD PARTITION (a='4', b='8')")
     }
-    maybeWrapException(isDatasourceTable) {
-      sql("ALTER TABLE tab1 ADD IF NOT EXISTS PARTITION (a='4', b='8')")
-    }
-    if (!isDatasourceTable) {
-      assert(catalog.listPartitions(tableIdent).map(_.spec).toSet ==
-        Set(part1, part2, part3, part4))
-    }
+
+    sql("ALTER TABLE tab1 ADD IF NOT EXISTS PARTITION (a='4', b='8')")
+    assert(catalog.listPartitions(tableIdent).map(_.spec).toSet ==
+      Set(part1, part2, part3, part4))
   }
 
   private def testDropPartitions(isDatasourceTable: Boolean): Unit = {
@@ -1398,20 +1344,15 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach {
     if (isDatasourceTable) {
       convertToDatasourceTable(catalog, tableIdent)
     }
-    maybeWrapException(isDatasourceTable) {
-      sql("ALTER TABLE dbx.tab1 DROP IF EXISTS PARTITION (a='4', b='8'), PARTITION (a='3', b='7')")
-    }
-    if (!isDatasourceTable) {
-      assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == Set(part1, part2))
-    }
+
+    sql("ALTER TABLE dbx.tab1 DROP IF EXISTS PARTITION (a='4', b='8'), PARTITION (a='3', b='7')")
+    assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == Set(part1, part2))
+
     // drop partitions without explicitly specifying database
     catalog.setCurrentDatabase("dbx")
-    maybeWrapException(isDatasourceTable) {
-      sql("ALTER TABLE tab1 DROP IF EXISTS PARTITION (a='2', b ='6')")
-    }
-    if (!isDatasourceTable) {
-      assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == Set(part1))
-    }
+    sql("ALTER TABLE tab1 DROP IF EXISTS PARTITION (a='2', b ='6')")
+    assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == Set(part1))
+
     // table to alter does not exist
     intercept[AnalysisException] {
       sql("ALTER TABLE does_not_exist DROP IF EXISTS PARTITION (a='2')")
@@ -1420,11 +1361,46 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach {
     intercept[AnalysisException] {
       sql("ALTER TABLE tab1 DROP PARTITION (a='300')")
     }
-    maybeWrapException(isDatasourceTable) {
-      sql("ALTER TABLE tab1 DROP IF EXISTS PARTITION (a='300')")
+    sql("ALTER TABLE tab1 DROP IF EXISTS PARTITION (a='300')")
+    assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == Set(part1))
+  }
+
+  private def testRenamePartitions(isDatasourceTable: Boolean): Unit = {
+    val catalog = spark.sessionState.catalog
+    val tableIdent = TableIdentifier("tab1", Some("dbx"))
+    val part1 = Map("a" -> "1", "b" -> "q")
+    val part2 = Map("a" -> "2", "b" -> "c")
+    val part3 = Map("a" -> "3", "b" -> "p")
+    createDatabase(catalog, "dbx")
+    createTable(catalog, tableIdent)
+    createTablePartition(catalog, part1, tableIdent)
+    createTablePartition(catalog, part2, tableIdent)
+    createTablePartition(catalog, part3, tableIdent)
+    assert(catalog.listPartitions(tableIdent).map(_.spec).toSet ==
+      Set(part1, part2, part3))
+    if (isDatasourceTable) {
+      convertToDatasourceTable(catalog, tableIdent)
     }
-    if (!isDatasourceTable) {
-      assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == Set(part1))
+
+    sql("ALTER TABLE dbx.tab1 PARTITION (a='1', b='q') RENAME TO PARTITION (a='100', b='p')")
+    sql("ALTER TABLE dbx.tab1 PARTITION (a='2', b='c') RENAME TO PARTITION (a='20', b='c')")
+    assert(catalog.listPartitions(tableIdent).map(_.spec).toSet ==
+      Set(Map("a" -> "100", "b" -> "p"), Map("a" -> "20", "b" -> "c"), Map("a" -> "3", "b" -> "p")))
+
+    // rename without explicitly specifying database
+    catalog.setCurrentDatabase("dbx")
+    sql("ALTER TABLE tab1 PARTITION (a='100', b='p') RENAME TO PARTITION (a='10', b='p')")
+    assert(catalog.listPartitions(tableIdent).map(_.spec).toSet ==
+      Set(Map("a" -> "10", "b" -> "p"), Map("a" -> "20", "b" -> "c"), Map("a" -> "3", "b" -> "p")))
+
+    // table to alter does not exist
+    intercept[NoSuchTableException] {
+      sql("ALTER TABLE does_not_exist PARTITION (c='3') RENAME TO PARTITION (c='333')")
+    }
+
+    // partition to rename does not exist
+    intercept[NoSuchPartitionException] {
+      sql("ALTER TABLE tab1 PARTITION (a='not_found', b='1') RENAME TO PARTITION (a='1', b='2')")
     }
   }
 
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
index 2003ff42d4f0c..7f140589e6956 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
@@ -241,12 +241,12 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
         }
       }
 
-      // converts the table metadata to Spark SQL specific format, i.e. set schema, partition column
-      // names and bucket specification to empty.
+      // Converts the table metadata to Spark SQL specific format, i.e. drops the data columns from
+      // the schema and clears the bucket specification. Note that partition columns are retained,
+      // so that we can call partition-related Hive APIs later.
       def newSparkSQLSpecificMetastoreTable(): CatalogTable = {
         tableDefinition.copy(
-          schema = new StructType,
-          partitionColumnNames = Nil,
+          schema = tableDefinition.partitionSchema,
           bucketSpec = None,
           properties = tableDefinition.properties ++ tableProperties)
       }
@@ -649,7 +649,9 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
     val catalogTable = client.getTable(db, table)
     val partitionColumnNames = catalogTable.partitionColumnNames.toSet
     val nonPartitionPruningPredicates = predicates.filterNot {
-      _.references.map(_.name).toSet.subsetOf(partitionColumnNames)
+      // Hive metastore is not case-preserving, so the `partitionColumnNames` are always lower
+      // cased; here we also lower-case the attribute names referenced by each predicate.
+      _.references.map(_.name.toLowerCase).toSet.subsetOf(partitionColumnNames)
     }
 
     if (nonPartitionPruningPredicates.nonEmpty) {
@@ -665,7 +667,7 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
       val boundPredicate =
         InterpretedPredicate.create(predicates.reduce(And).transform {
           case att: AttributeReference =>
-            val index = partitionSchema.indexWhere(_.name == att.name)
+            val index = partitionSchema.indexWhere(_.name == att.name.toLowerCase)
             BoundReference(index, partitionSchema(index).dataType, nullable = true)
         })
       clientPrunedPartitions.filter { case p: CatalogTablePartition =>
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index c909eb5d20bcd..41277fa8c16a3 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -78,9 +78,19 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
             className = table.provider.get,
             options = table.storage.properties)
 
-        LogicalRelation(
-          dataSource.resolveRelation(),
-          catalogTable = Some(table))
+        val relation = dataSource.resolveRelation() match {
+          case r: HadoopFsRelation =>
+            val fileCatalog = new TableFileCatalog(
+              r.sparkSession,
+              in.database,
+              in.name,
+              Some(table.partitionSchema),
+              r.sizeInBytes)
+            r.copy(location = fileCatalog)(r.sparkSession)
+          case other => other
+        }
+
+        LogicalRelation(relation, catalogTable = Some(table))
       }
     }
 
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
index e745a8c5b3589..84873bbbb81ce 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
@@ -777,7 +777,7 @@ private[hive] class HiveClientImpl(
     val (partCols, schema) = table.schema.map(toHiveColumn).partition { c =>
       table.partitionColumnNames.contains(c.getName)
     }
-    if (table.schema.isEmpty) {
+    if (schema.isEmpty) {
       // This is a hack to preserve existing behavior. Before Spark 2.0, we do not
       // set a default serde here (this was done in Hive), and so if the user provides
       // an empty schema Hive would automatically populate the schema with a single
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
index e26b6b57ef569..8d10a7d73a948 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
@@ -355,7 +355,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
         "# Partition Information",
         "# col_name",
         "Detailed Partition Information CatalogPartition(",
-        "Partition Values: [Us, 1]",
+        "Partition Values: [c=Us, d=1]",
         "Storage(Location:",
         "Partition Parameters")
 

From 3fba74cf0413ae9cc3d2117d49cd14bb42edd549 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Wed, 19 Oct 2016 13:26:09 -0700
Subject: [PATCH 57/99] comments

---
 .../datasources/FileStatusCache.scala         |  9 +++-----
 .../datasources/ListingFileCatalog.scala      |  2 +-
 .../PartitioningAwareFileCatalog.scala        |  6 ++---
 .../datasources/TableFileCatalog.scala        |  2 +-
 .../sql/hive/HiveTablePerfStatsSuite.scala    | 22 +++++++++----------
 5 files changed, 19 insertions(+), 22 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala
index c01afa40ec5f4..a2453c41625b6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala
@@ -53,7 +53,8 @@ abstract class FileStatusCache {
  * An implementation that caches all partition file statuses in memory forever.
  */
 class InMemoryCache extends FileStatusCache {
-  private val cache = new ConcurrentHashMap[Path, Array[FileStatus]]()
+  private val cache = CacheBuilder
+    .maximumSizenew ConcurrentHashMap[Path, Array[FileStatus]]()
 
   override def getLeafFiles(path: Path): Option[Array[FileStatus]] = {
     Option(cache.get(path))
@@ -71,12 +72,8 @@ class InMemoryCache extends FileStatusCache {
 /**
  * A non-caching implementation used when partition file status caching is disabled.
  */
-private class NoopCache extends FileStatusCache {
+object NoopCache extends FileStatusCache {
   override def getLeafFiles(path: Path): Option[Array[FileStatus]] = None
   override def putLeafFiles(path: Path, leafFiles: Array[FileStatus]): Unit = {}
   override def invalidateAll(): Unit = {}
 }
-
-object FileStatusCache {
-  val noop: FileStatusCache = new NoopCache
-}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala
index b27ac6d7868ab..f9f1748d4c3b6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala
@@ -39,7 +39,7 @@ class ListingFileCatalog(
     override val rootPaths: Seq[Path],
     parameters: Map[String, String],
     partitionSchema: Option[StructType],
-    fileStatusCache: FileStatusCache = new NoopCache)
+    fileStatusCache: FileStatusCache = NoopCache)
   extends PartitioningAwareFileCatalog(
     sparkSession, parameters, partitionSchema, fileStatusCache) {
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
index cfc4747ef9ad9..354836cfc5da0 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
@@ -45,7 +45,7 @@ abstract class PartitioningAwareFileCatalog(
     sparkSession: SparkSession,
     parameters: Map[String, String],
     partitionSchema: Option[StructType],
-    fileStatusCache: FileStatusCache = FileStatusCache.noop) extends FileCatalog with Logging {
+    fileStatusCache: FileStatusCache = NoopCache) extends FileCatalog with Logging {
   import PartitioningAwareFileCatalog.BASE_PATH_PARAM
 
   /** Returns the specification of the partitions inferred from the data. */
@@ -290,7 +290,7 @@ object PartitioningAwareFileCatalog extends Logging {
    */
   private def listLeafFilesInSerial(
       paths: Seq[Path],
-      hadoopConf: Configuration): Map[Path, Seq[FileStatus]] = {
+      hadoopConf: Configuration): Seq[(Path, Seq[FileStatus])] = {
     // Dummy jobconf to get to the pathFilter defined in configuration
     val jobConf = new JobConf(hadoopConf, this.getClass)
     val filter = FileInputFormat.getInputPathFilter(jobConf)
@@ -298,7 +298,7 @@ object PartitioningAwareFileCatalog extends Logging {
     paths.map { path =>
       val fs = path.getFileSystem(hadoopConf)
       (path, listLeafFiles0(fs, path, filter))
-    }.toMap
+    }
   }
 
   /**
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
index 17e2a7bf35bbe..ba09f3f32d1e9 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
@@ -45,7 +45,7 @@ class TableFileCatalog(
   private val fileStatusCache = if (enableFileStatusCache)  {
     new InMemoryCache
   } else {
-    FileStatusCache.noop
+    NoopCache
   }
 
   protected val hadoopConf = sparkSession.sessionState.newHadoopConf
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveTablePerfStatsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveTablePerfStatsSuite.scala
index e05671bafc85d..9f80bc0281866 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveTablePerfStatsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveTablePerfStatsSuite.scala
@@ -113,31 +113,31 @@ class HiveTablePerfStatsSuite extends QueryTest with TestHiveSingleton with SQLT
         withTempDir { dir =>
           setupPartitionedTable("test", dir)
           HiveCatalogMetrics.reset()
-          spark.sql("select * from test where partCol1 = 999").count()
+          assert(spark.sql("select * from test where partCol1 = 999").count() == 0)
           assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 0)
           assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 0)
           assert(HiveCatalogMetrics.METRIC_FILE_CACHE_HITS.getCount() == 0)
 
           HiveCatalogMetrics.reset()
-          spark.sql("select * from test where partCol1 < 2").count()
+          assert(spark.sql("select * from test where partCol1 < 2").count() == 2)
           assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 2)
           assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 2)
           assert(HiveCatalogMetrics.METRIC_FILE_CACHE_HITS.getCount() == 0)
 
           HiveCatalogMetrics.reset()
-          spark.sql("select * from test where partCol1 < 3").count()
+          assert(spark.sql("select * from test where partCol1 < 3").count() == 3)
           assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 3)
           assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 1)
           assert(HiveCatalogMetrics.METRIC_FILE_CACHE_HITS.getCount() == 2)
 
           HiveCatalogMetrics.reset()
-          spark.sql("select * from test").count()
+          assert(spark.sql("select * from test").count() == 5)
           assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 5)
           assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 2)
           assert(HiveCatalogMetrics.METRIC_FILE_CACHE_HITS.getCount() == 3)
 
           HiveCatalogMetrics.reset()
-          spark.sql("select * from test").count()
+          assert(spark.sql("select * from test").count() == 5)
           assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 5)
           assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 0)
           assert(HiveCatalogMetrics.METRIC_FILE_CACHE_HITS.getCount() == 5)
@@ -154,20 +154,20 @@ class HiveTablePerfStatsSuite extends QueryTest with TestHiveSingleton with SQLT
         withTempDir { dir =>
           setupPartitionedTable("test", dir)
           HiveCatalogMetrics.reset()
-          spark.sql("select * from test").count()
+          assert(spark.sql("select * from test").count() == 5)
           assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 5)
           assert(HiveCatalogMetrics.METRIC_FILE_CACHE_HITS.getCount() == 0)
 
           HiveCatalogMetrics.reset()
           spark.sql("refresh table test")
-          spark.sql("select * from test").count()
+          assert(spark.sql("select * from test").count() == 5)
           assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 5)
           assert(HiveCatalogMetrics.METRIC_FILE_CACHE_HITS.getCount() == 0)
 
           spark.catalog.cacheTable("test")
           HiveCatalogMetrics.reset()
           spark.catalog.refreshByPath(dir.getAbsolutePath)
-          spark.sql("select * from test").count()
+          assert(spark.sql("select * from test").count() == 5)
           assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 5)
           assert(HiveCatalogMetrics.METRIC_FILE_CACHE_HITS.getCount() == 0)
         }
@@ -185,18 +185,18 @@ class HiveTablePerfStatsSuite extends QueryTest with TestHiveSingleton with SQLT
           // mode. This is kind of terrible, but is needed to preserve the legacy behavior
           // of doing plan cache validation based on the entire partition set.
           HiveCatalogMetrics.reset()
-          spark.sql("select * from test where partCol1 = 999").count()
+          assert(spark.sql("select * from test where partCol1 = 999").count() == 0)
           // 5 from table resolution, another 5 from ListingFileCatalog
           assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 10)
           assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 5)
 
           HiveCatalogMetrics.reset()
-          spark.sql("select * from test where partCol1 < 2").count()
+          assert(spark.sql("select * from test where partCol1 < 2").count() == 2)
           assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 5)
           assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 0)
 
           HiveCatalogMetrics.reset()
-          spark.sql("select * from test").count()
+          assert(spark.sql("select * from test").count() == 5)
           assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 5)
           assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 0)
         }

From f318aa73f85d8141863a3138a0b7c21e56ffa1e5 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Wed, 19 Oct 2016 14:25:08 -0700
Subject: [PATCH 58/99] Wed Oct 19 14:25:08 PDT 2016

---
 .../spark/sql/execution/datasources/FileStatusCache.scala      | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala
index a2453c41625b6..cd303e51ff523 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala
@@ -53,8 +53,7 @@ abstract class FileStatusCache {
  * An implementation that caches all partition file statuses in memory forever.
  */
 class InMemoryCache extends FileStatusCache {
-  private val cache = CacheBuilder
-    .maximumSizenew ConcurrentHashMap[Path, Array[FileStatus]]()
+  private val cache = new ConcurrentHashMap[Path, Array[FileStatus]]()
 
   override def getLeafFiles(path: Path): Option[Array[FileStatus]] = {
     Option(cache.get(path))

From 0b54b4ca61309779e4de661216d15bbbe08e76be Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Wed, 19 Oct 2016 15:16:26 -0700
Subject: [PATCH 59/99] byte limit

---
 .../datasources/FileStatusCache.scala         | 30 +++++++++++++++----
 .../datasources/TableFileCatalog.scala        |  8 ++---
 .../apache/spark/sql/internal/SQLConf.scala   | 16 +++++-----
 .../spark/sql/hive/HiveMetastoreCatalog.scala |  7 +++--
 .../sql/hive/HiveTablePerfStatsSuite.scala    | 25 ++++++++++++++--
 5 files changed, 64 insertions(+), 22 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala
index cd303e51ff523..61ce6a1e240e9 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala
@@ -18,11 +18,14 @@
 package org.apache.spark.sql.execution.datasources
 
 import java.util.concurrent.ConcurrentHashMap
+import java.util.concurrent.atomic.AtomicBoolean
 
+import com.google.common.cache._
 import org.apache.hadoop.fs.{FileStatus, Path}
 
+import org.apache.spark.internal.Logging
 import org.apache.spark.metrics.source.HiveCatalogMetrics
-import org.apache.spark.util.SerializableConfiguration
+import org.apache.spark.util.{SerializableConfiguration, SizeEstimator}
 
 /**
  * A cache of the leaf files of partition directories. We cache these files in order to speed
@@ -52,11 +55,28 @@ abstract class FileStatusCache {
 /**
  * An implementation that caches all partition file statuses in memory forever.
  */
-class InMemoryCache extends FileStatusCache {
-  private val cache = new ConcurrentHashMap[Path, Array[FileStatus]]()
+class InMemoryCache(maxSizeInBytes: Long) extends FileStatusCache with Logging {
+  private val warnedAboutEviction = new AtomicBoolean(false)
+  private val cache: Cache[Path, Array[FileStatus]] = CacheBuilder.newBuilder()
+    .weigher(new Weigher[Path, Array[FileStatus]] {
+      override def weigh(key: Path, value: Array[FileStatus]): Int = {
+        (SizeEstimator.estimate(key) + SizeEstimator.estimate(value)).toInt
+      }})
+    .removalListener(new RemovalListener[Path, Array[FileStatus]]() {
+      override def onRemoval(removed: RemovalNotification[Path, Array[FileStatus]]) = {
+        if (removed.getCause() == RemovalCause.SIZE &&
+            warnedAboutEviction.compareAndSet(false, true)) {
+          logWarning(
+            "Evicting cached table partition metadata from memory due to size constraints " +
+            "(spark.sql.hive.filesourcePartitionFileCacheSize = " + maxSizeInBytes + " bytes). " +
+            "This may impact query planning performance.")
+        }
+      }})
+    .maximumWeight(maxSizeInBytes)
+    .build()
 
   override def getLeafFiles(path: Path): Option[Array[FileStatus]] = {
-    Option(cache.get(path))
+    Option(cache.getIfPresent(path))
   }
 
   override def putLeafFiles(path: Path, leafFiles: Array[FileStatus]): Unit = {
@@ -64,7 +84,7 @@ class InMemoryCache extends FileStatusCache {
   }
 
   override def invalidateAll(): Unit = {
-    cache.clear()
+    cache.invalidateAll()
   }
 }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
index b18df8756244e..cf195129d2a30 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
@@ -32,7 +32,7 @@ import org.apache.spark.sql.types.StructType
  * @param table the table's (unqualified) name
  * @param partitionSchema the schema of a partitioned table's partition columns
  * @param sizeInBytes the table's data size in bytes
- * @param enableFileStatusCache whether to enable file status caching
+ * @param fileStatusCacheSize if positive, enables the file status cache with this size in bytes
  */
 class TableFileCatalog(
     sparkSession: SparkSession,
@@ -40,10 +40,10 @@ class TableFileCatalog(
     table: String,
     partitionSchema: Option[StructType],
     override val sizeInBytes: Long,
-    enableFileStatusCache: Boolean) extends FileCatalog {
+    fileStatusCacheSize: Long) extends FileCatalog {
 
-  private val fileStatusCache = if (enableFileStatusCache)  {
-    new InMemoryCache
+  private val fileStatusCache = if (fileStatusCacheSize > 0) {
+    new InMemoryCache(fileStatusCacheSize)
   } else {
     NoopCache
   }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 920c3dee5616e..f95ff0a07e76b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -278,12 +278,13 @@ object SQLConf {
       .booleanConf
       .createWithDefault(true)
 
-  val HIVE_FILESOURCE_PARTITION_FILE_CACHE_ENABLED =
-    SQLConfigBuilder("spark.sql.hive.filesourcePartitionFileCacheEnabled")
-      .doc("When true, enable caching of partition files in memory. This only takes effect " +
-           "if filesource partition pruning is also enabled.")
-      .booleanConf
-      .createWithDefault(true)
+  val HIVE_FILESOURCE_PARTITION_FILE_CACHE_SIZE =
+    SQLConfigBuilder("spark.sql.hive.filesourcePartitionFileCacheSize")
+      .doc("When nonzero, enable caching of partition file metadata in memory. Each table may " +
+           "use up to the specified number of bytes for caching file metadata. This conf only " +
+           "applies if filesource partition pruning is also enabled.")
+      .longConf
+      .createWithDefault(50 * 1024 * 1024)
 
   val OPTIMIZER_METADATA_ONLY = SQLConfigBuilder("spark.sql.optimizer.metadataOnly")
     .doc("When true, enable the metadata-only query optimization that use the table's metadata " +
@@ -686,8 +687,7 @@ private[sql] class SQLConf extends Serializable with CatalystConf with Logging {
 
   def filesourcePartitionPruning: Boolean = getConf(HIVE_FILESOURCE_PARTITION_PRUNING)
 
-  def filesourcePartitionFileCacheEnabled: Boolean =
-    getConf(HIVE_FILESOURCE_PARTITION_FILE_CACHE_ENABLED)
+  def filesourcePartitionFileCacheSize: Long = getConf(HIVE_FILESOURCE_PARTITION_FILE_CACHE_SIZE)
 
   def gatherFastStats: Boolean = getConf(GATHER_FASTSTAT)
 
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index a42aec0961f00..2b3bffc76bdfa 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -232,8 +232,11 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
         val fileCatalog = {
           val catalog = new TableFileCatalog(
             sparkSession, db, table, Some(partitionSchema), sizeInBytes,
-            enableFileStatusCache = lazyPruningEnabled &&
-              sparkSession.sqlContext.conf.filesourcePartitionFileCacheEnabled)
+            fileStatusCacheSize = if (lazyPruningEnabled) {
+              sparkSession.sqlContext.conf.filesourcePartitionFileCacheSize
+            } else {
+              0
+            })
           if (lazyPruningEnabled) {
             catalog
           } else {
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveTablePerfStatsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveTablePerfStatsSuite.scala
index 9f80bc0281866..50c3f2dbdca15 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveTablePerfStatsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveTablePerfStatsSuite.scala
@@ -67,7 +67,7 @@ class HiveTablePerfStatsSuite extends QueryTest with TestHiveSingleton with SQLT
   test("lazy partition pruning reads only necessary partition data") {
     withSQLConf(
         "spark.sql.hive.filesourcePartitionPruning" -> "true",
-        "spark.sql.hive.filesourcePartitionFileCacheEnabled" -> "false") {
+        "spark.sql.hive.filesourcePartitionFileCacheSize" -> "0") {
       withTable("test") {
         withTempDir { dir =>
           setupPartitionedTable("test", dir)
@@ -108,7 +108,7 @@ class HiveTablePerfStatsSuite extends QueryTest with TestHiveSingleton with SQLT
   test("lazy partition pruning with file status caching enabled") {
     withSQLConf(
         "spark.sql.hive.filesourcePartitionPruning" -> "true",
-        "spark.sql.hive.filesourcePartitionFileCacheEnabled" -> "true") {
+        "spark.sql.hive.filesourcePartitionFileCacheSize" -> "9999999") {
       withTable("test") {
         withTempDir { dir =>
           setupPartitionedTable("test", dir)
@@ -149,7 +149,7 @@ class HiveTablePerfStatsSuite extends QueryTest with TestHiveSingleton with SQLT
   test("file status caching respects refresh table and refreshByPath") {
     withSQLConf(
         "spark.sql.hive.filesourcePartitionPruning" -> "true",
-        "spark.sql.hive.filesourcePartitionFileCacheEnabled" -> "true") {
+        "spark.sql.hive.filesourcePartitionFileCacheSize" -> "9999999") {
       withTable("test") {
         withTempDir { dir =>
           setupPartitionedTable("test", dir)
@@ -175,6 +175,25 @@ class HiveTablePerfStatsSuite extends QueryTest with TestHiveSingleton with SQLT
     }
   }
 
+  test("file status cache respects size limit") {
+    withSQLConf(
+        "spark.sql.hive.filesourcePartitionPruning" -> "true",
+        "spark.sql.hive.filesourcePartitionFileCacheSize" -> "1" /* 1 byte */) {
+      withTable("test") {
+        withTempDir { dir =>
+          setupPartitionedTable("test", dir)
+          HiveCatalogMetrics.reset()
+          assert(spark.sql("select * from test").count() == 5)
+          assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 5)
+          assert(HiveCatalogMetrics.METRIC_FILE_CACHE_HITS.getCount() == 0)
+          assert(spark.sql("select * from test").count() == 5)
+          assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 10)
+          assert(HiveCatalogMetrics.METRIC_FILE_CACHE_HITS.getCount() == 0)
+        }
+      }
+    }
+  }
+
   test("all partitions read and cached when filesource partition pruning is off") {
     withSQLConf("spark.sql.hive.filesourcePartitionPruning" -> "false") {
       withTable("test") {

From 35b565bf7aa114068fc11fd50a0c51574be0cbe8 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Wed, 19 Oct 2016 15:53:05 -0700
Subject: [PATCH 60/99] Update InMemoryCache scaladoc for size-bounded cache

---
 .../spark/sql/execution/datasources/FileStatusCache.scala     | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala
index 61ce6a1e240e9..5b708f24f1e86 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala
@@ -53,7 +53,9 @@ abstract class FileStatusCache {
 }
 
 /**
- * An implementation that caches all partition file statuses in memory forever.
+ * An implementation that caches partition file statuses in memory.
+ *
+ * @param maxSizeInBytes max allowable cache size before entries start getting evicted
  */
 class InMemoryCache(maxSizeInBytes: Long) extends FileStatusCache with Logging {
   private val warnedAboutEviction = new AtomicBoolean(false)

From 766a3681270241ec07cc6f58c95b25763dda5938 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Wed, 19 Oct 2016 19:06:00 -0700
Subject: [PATCH 61/99] backwards compat support

---
 .../sql/catalyst/catalog/interface.scala      | 13 ++++++++
 .../command/createDataSourceTables.scala      |  6 ++--
 .../spark/sql/execution/command/ddl.scala     | 32 ++++++++++++++++++-
 .../spark/sql/execution/command/tables.scala  | 18 +++--------
 .../execution/datasources/DataSource.scala    | 23 ++++++++++---
 .../datasources/DataSourceStrategy.scala      |  3 +-
 .../execution/datasources/FileCatalog.scala   |  4 +++
 .../PartitioningAwareFileCatalog.scala        | 14 +++++---
 .../datasources/TableFileCatalog.scala        |  4 +--
 .../apache/spark/sql/internal/SQLConf.scala   | 11 ++++---
 .../spark/sql/hive/HiveMetastoreCatalog.scala | 21 +++---------
 11 files changed, 99 insertions(+), 50 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
index 06f310a575b26..dd1de5eb3fb69 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
@@ -158,6 +158,8 @@ case class CatalogTable(
     comment: Option[String] = None,
     unsupportedFeatures: Seq[String] = Seq.empty) {
 
+  import CatalogTable._
+
   /** schema of this table's partition columns */
   def partitionSchema: StructType = StructType(schema.filter {
     c => partitionColumnNames.contains(c.name)
@@ -219,8 +221,19 @@ case class CatalogTable(
     output.filter(_.nonEmpty).mkString("CatalogTable(\n\t", "\n\t", ")")
   }
 
+  /**
+   * @return whether this table's partition metadata is stored in the Hive metastore.
+   */
+  def partitionProviderIsHive: Boolean = {
+    provider == Some("hive") ||
+      properties.get(PARTITION_PROVIDER_KEY) == Some(PARTITION_PROVIDER_HIVE)
+  }
 }
 
+object CatalogTable {
+  val PARTITION_PROVIDER_KEY = "partitionProvider"
+  val PARTITION_PROVIDER_HIVE = "hive"
+}
 
 case class CatalogTableType private(name: String)
 object CatalogTableType {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
index d7907ace47166..d752d0dff93f9 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
@@ -101,7 +101,8 @@ case class CreateDataSourceTableCommand(table: CatalogTable, ignoreIfExists: Boo
 
     dataSource match {
       case fs: HadoopFsRelation =>
-        if (table.tableType == CatalogTableType.EXTERNAL && partitionColumnNames.nonEmpty) {
+        if (table.tableType == CatalogTableType.EXTERNAL && partitionColumnNames.nonEmpty &&
+            sparkSession.sqlContext.conf.filesourcePartitionManagement) {
           sparkSession.sessionState.executePlan(
             AlterTableRecoverPartitionsCommand(table.identifier)).toRdd
         }
@@ -243,7 +244,8 @@ case class CreateDataSourceTableAsSelectCommand(
     }
 
     result match {
-      case fs: HadoopFsRelation if table.partitionColumnNames.nonEmpty =>
+      case fs: HadoopFsRelation if table.partitionColumnNames.nonEmpty &&
+          sparkSession.sqlContext.conf.filesourcePartitionManagement =>
         sparkSession.sessionState.executePlan(
           AlterTableRecoverPartitionsCommand(table.identifier)).toRdd
       case _ =>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
index aea3bce914454..14fb29d315a96 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
@@ -346,6 +346,7 @@ case class AlterTableAddPartitionCommand(
     val catalog = sparkSession.sessionState.catalog
     val table = catalog.getTableMetadata(tableName)
     DDLUtils.verifyAlterTableType(catalog, table, isView = false)
+    DDLUtils.verifyPartitionProviderIsHive(table, "ALTER TABLE ADD PARTITION")
     val parts = partitionSpecsAndLocs.map { case (spec, location) =>
       // inherit table storage format (possibly except for location)
       CatalogTablePartition(spec, table.storage.copy(locationUri = location))
@@ -374,6 +375,7 @@ case class AlterTableRenamePartitionCommand(
     val catalog = sparkSession.sessionState.catalog
     val table = catalog.getTableMetadata(tableName)
     DDLUtils.verifyAlterTableType(catalog, table, isView = false)
+    DDLUtils.verifyPartitionProviderIsHive(table, "ALTER TABLE RENAME PARTITION")
     catalog.renamePartitions(
       tableName, Seq(oldPartition), Seq(newPartition))
     Seq.empty[Row]
@@ -406,6 +408,7 @@ case class AlterTableDropPartitionCommand(
     val catalog = sparkSession.sessionState.catalog
     val table = catalog.getTableMetadata(tableName)
     DDLUtils.verifyAlterTableType(catalog, table, isView = false)
+    DDLUtils.verifyPartitionProviderIsHive(table, "ALTER TABLE DROP PARTITION")
     catalog.dropPartitions(table.identifier, specs, ignoreIfNotExists = ifExists, purge = purge)
     Seq.empty[Row]
   }
@@ -497,6 +500,8 @@ case class AlterTableRecoverPartitionsCommand(
     logInfo(s"Finished to gather the fast stats for all $total partitions.")
 
     addPartitions(spark, table, partitionSpecsAndLocs, partitionStats)
+    DDLUtils.setPartitionProviderHive(spark, table)
+    catalog.refreshTable(tableName)
     logInfo(s"Recovered all partitions ($total).")
     Seq.empty[Row]
   }
@@ -537,7 +542,8 @@ case class AlterTableRecoverPartitionsCommand(
           scanPartitions(spark, fs, filter, st.getPath, spec ++ Map(columnName -> value),
             partitionNames.drop(1), threshold)
         } else {
-          logWarning(s"expect partition column ${partitionNames.head}, but got ${ps(0)}, ignore it")
+          logWarning(
+            s"expected partition column ${partitionNames.head}, but got ${ps(0)}, ignoring it")
           Seq()
         }
       } else {
@@ -675,6 +681,30 @@ object DDLUtils {
     table.provider.isDefined && table.provider.get != "hive"
   }
 
+  /**
+   * Updates a table to indicate that its partition metadata is stored in the Hive metastore.
+   * This is always the case for Hive format tables, but is not true for Datasource tables created
+   * before Spark 2.1 unless they are converted via `msck repair table`.
+   */
+  def setPartitionProviderHive(spark: SparkSession, table: CatalogTable): Unit = {
+    spark.sessionState.catalog.alterTable(
+      table.copy(properties = table.properties ++
+        Map(CatalogTable.PARTITION_PROVIDER_KEY -> CatalogTable.PARTITION_PROVIDER_HIVE)))
+  }
+
+  /**
+   * Throws a standard error for actions that require partitionProvider = hive.
+   */
+  def verifyPartitionProviderIsHive(table: CatalogTable, action: String): Unit = {
+    if (!table.partitionProviderIsHive) {
+      val tableName = table.identifier.table
+      throw new AnalysisException(
+        s"$action is not allowed on $tableName since its partition metadata is not stored in " +
+          s"the Hive metastore. To import this information into the metastore, run " +
+          s"`msck repair table $tableName`")
+    }
+  }
+
   /**
    * If the command ALTER VIEW is to alter a table or ALTER TABLE is to alter a view,
    * issue an exception [[AnalysisException]].
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
index 403b479a0e1bc..ecfa4b56219d1 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
@@ -338,10 +338,8 @@ case class TruncateTableCommand(
         s"Operation not allowed: TRUNCATE TABLE on views: $tableIdentwithDB")
     }
     val isDatasourceTable = DDLUtils.isDatasourceTable(table)
-    if (isDatasourceTable && partitionSpec.isDefined) {
-      throw new AnalysisException(
-        s"Operation not allowed: TRUNCATE TABLE ... PARTITION is not supported " +
-        s"for tables created using the data sources API: $tableIdentwithDB")
+    if (partitionSpec.isDefined) {
+      DDLUtils.verifyPartitionProviderIsHive(table, "TRUNCATE TABLE ... PARTITION")
     }
     if (table.partitionColumnNames.isEmpty && partitionSpec.isDefined) {
       throw new AnalysisException(
@@ -349,7 +347,7 @@ case class TruncateTableCommand(
         s"for tables that are not partitioned: $tableIdentwithDB")
     }
     val locations =
-      if (isDatasourceTable) {
+      if (DDLUtils.isDatasourceTable(table)) {
         Seq(table.storage.properties.get("path"))
       } else if (table.partitionColumnNames.isEmpty) {
         Seq(table.storage.locationUri)
@@ -514,10 +512,7 @@ case class DescribeTableCommand(
       throw new AnalysisException(
         s"DESC PARTITION is not allowed on a view: ${table.identifier}")
     }
-    if (DDLUtils.isDatasourceTable(metadata)) {
-      throw new AnalysisException(
-        s"DESC PARTITION is not allowed on a datasource table: ${table.identifier}")
-    }
+    DDLUtils.verifyPartitionProviderIsHive(metadata, "DESC PARTITION")
     val partition = catalog.getPartition(table, partitionSpec)
     if (isExtended) {
       describeExtendedDetailedPartitionInfo(table, metadata, partition, result)
@@ -712,10 +707,7 @@ case class ShowPartitionsCommand(
         s"SHOW PARTITIONS is not allowed on a table that is not partitioned: $tableIdentWithDB")
     }
 
-    if (DDLUtils.isDatasourceTable(table)) {
-      throw new AnalysisException(
-        s"SHOW PARTITIONS is not allowed on a datasource table: $tableIdentWithDB")
-    }
+    DDLUtils.verifyPartitionProviderIsHive(table, "SHOW PARTITIONS")
 
     /**
      * Validate the partitioning spec by making sure all the referenced columns are
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala
index 92b1fff7d8127..d53e5f8e962fe 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala
@@ -30,7 +30,7 @@ import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
-import org.apache.spark.sql.catalyst.catalog.BucketSpec
+import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogTable}
 import org.apache.spark.sql.catalyst.expressions.Attribute
 import org.apache.spark.sql.execution.datasources.csv.CSVFileFormat
 import org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider
@@ -65,6 +65,8 @@ import org.apache.spark.util.Utils
  * @param partitionColumns A list of column names that the relation is partitioned by. When this
  *                         list is empty, the relation is unpartitioned.
  * @param bucketSpec An optional specification for bucketing (hash-partitioning) of the data.
+ * @param catalogTable Optional catalog table reference that can be used to push down operations
+ *                     over the datasource to the catalog service.
  */
 case class DataSource(
     sparkSession: SparkSession,
@@ -73,7 +75,8 @@ case class DataSource(
     userSpecifiedSchema: Option[StructType] = None,
     partitionColumns: Seq[String] = Seq.empty,
     bucketSpec: Option[BucketSpec] = None,
-    options: Map[String, String] = Map.empty) extends Logging {
+    options: Map[String, String] = Map.empty,
+    catalogTable: Option[CatalogTable] = None) extends Logging {
 
   case class SourceInfo(name: String, schema: StructType)
 
@@ -402,9 +405,19 @@ case class DataSource(
             })
         }
 
-        val fileCatalog =
+        val enableDatasourceHivePartitionProvider = true  // TODO(ekl) conf
+        val fileCatalog = if (enableDatasourceHivePartitionProvider &&
+            catalogTable.isDefined && catalogTable.get.partitionProviderIsHive) {
+          new TableFileCatalog(
+            sparkSession,
+            catalogTable.get.identifier.database.get,
+            catalogTable.get.identifier.table,
+            partitionSchema.getOrElse(StructType(Nil)),
+            catalogTable.get.stats.map(_.sizeInBytes.toLong).getOrElse(0L) /* TODO(ekl) */)
+        } else {
           new ListingFileCatalog(
             sparkSession, globbedPaths, options, partitionSchema)
+        }
 
         val dataSchema = userSpecifiedSchema.map { schema =>
           val equality = sparkSession.sessionState.conf.resolver
@@ -413,7 +426,7 @@ case class DataSource(
           format.inferSchema(
             sparkSession,
             caseInsensitiveOptions,
-            fileCatalog.allFiles())
+            fileCatalog.asInstanceOf[ListingFileCatalog].allFiles())
         }.getOrElse {
           throw new AnalysisException(
             s"Unable to infer schema for $format at ${allPaths.take(2).mkString(",")}. " +
@@ -422,7 +435,7 @@ case class DataSource(
 
         HadoopFsRelation(
           fileCatalog,
-          partitionSchema = fileCatalog.partitionSpec().partitionColumns,
+          partitionSchema = fileCatalog.partitionSchema,
           dataSchema = dataSchema.asNullable,
           bucketSpec = bucketSpec,
           format,
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
index 588d9bd4e6167..0e97b7854f0c5 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
@@ -189,7 +189,8 @@ case class DataSourceAnalysis(conf: CatalystConf) extends Rule[LogicalPlan] {
         query,
         mode)
 
-      if (l.catalogTable.isDefined && l.catalogTable.get.partitionColumnNames.nonEmpty) {
+      if (l.catalogTable.isDefined && l.catalogTable.get.partitionColumnNames.nonEmpty &&
+          t.sparkSession.sqlContext.conf.filesourcePartitionManagement) {
         val recoverPartitionCmd = AlterTableRecoverPartitionsCommand(l.catalogTable.get.identifier)
         Union(insertCmd, recoverPartitionCmd)
       } else {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileCatalog.scala
index 2bc66ceeebdb4..df2a4e18f14fe 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileCatalog.scala
@@ -21,6 +21,7 @@ import org.apache.hadoop.fs._
 
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.types.StructType
 
 /**
  * A collection of data files from a partitioned relation, along with the partition values in the
@@ -63,4 +64,7 @@ trait FileCatalog {
 
   /** Sum of table file sizes, in bytes */
   def sizeInBytes: Long
+
+  /** Schema of the partitioning columns, or the empty schema. */
+  def partitionSchema: StructType
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
index 5c8eff7ec46b4..bc22c70d316b5 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
@@ -39,18 +39,22 @@ import org.apache.spark.util.SerializableConfiguration
  * It provides the necessary methods to parse partition data based on a set of files.
  *
  * @param parameters as set of options to control partition discovery
- * @param partitionSchema an optional partition schema that will be use to provide types for the
- *                        discovered partitions
-*/
+ * @param givenPartitionSchema an optional partition schema that will be use to provide types for
+ *                             the discovered partitions
+ */
 abstract class PartitioningAwareFileCatalog(
     sparkSession: SparkSession,
     parameters: Map[String, String],
-    partitionSchema: Option[StructType]) extends FileCatalog with Logging {
+    givenPartitionSchema: Option[StructType]) extends FileCatalog with Logging {
   import PartitioningAwareFileCatalog.BASE_PATH_PARAM
 
   /** Returns the specification of the partitions inferred from the data. */
   def partitionSpec(): PartitionSpec
 
+  override def partitionSchema: StructType = {
+    givenPartitionSchema.getOrElse(partitionSpec().partitionColumns)
+  }
+
   protected val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions(parameters)
 
   protected def leafFiles: mutable.LinkedHashMap[Path, FileStatus]
@@ -122,7 +126,7 @@ abstract class PartitioningAwareFileCatalog(
     val leafDirs = leafDirToChildrenFiles.filter { case (_, files) =>
       files.exists(f => isDataPath(f.getPath))
     }.keys.toSeq
-    partitionSchema match {
+    givenPartitionSchema match {
       case Some(userProvidedSchema) if userProvidedSchema.nonEmpty =>
         val spec = PartitioningUtils.parsePartitions(
           leafDirs,
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
index 916ffbbd39e0a..bb2c775033c56 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
@@ -37,7 +37,7 @@ class TableFileCatalog(
     sparkSession: SparkSession,
     val db: String,
     val table: String,
-    partitionSchema: Option[StructType],
+    override val partitionSchema: StructType,
     override val sizeInBytes: Long) extends FileCatalog {
 
   protected val hadoopConf = sparkSession.sessionState.newHadoopConf
@@ -81,7 +81,7 @@ class TableFileCatalog(
 
   private def filterPartitions0(filters: Seq[Expression]): ListingFileCatalog = {
     partitionSchema match {
-      case Some(schema) if schema.nonEmpty =>
+      case schema if schema.nonEmpty =>
         val selectedPartitions = externalCatalog.listPartitionsByFilter(db, table, filters)
         val partitions = selectedPartitions.map { p =>
           PartitionPath(p.toRow(schema), p.storage.locationUri.get)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 8afd39d657865..6c6f51b24bec3 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -269,10 +269,11 @@ object SQLConf {
       .booleanConf
       .createWithDefault(false)
 
-  val HIVE_FILESOURCE_PARTITION_PRUNING =
-    SQLConfigBuilder("spark.sql.hive.filesourcePartitionPruning")
-      .doc("When true, enable metastore partition pruning for file source tables as well. " +
-           "This is currently implemented for converted Hive tables only.")
+  val HIVE_FILESOURCE_PARTITION_MANAGEMENT =
+    SQLConfigBuilder("spark.sql.hive.filesourcePartitionManagement")
+      .doc("When true, enable metastore partition management for file source tables as well. " +
+           "This includes both datasource and converted Hive tables. This also controls whether " +
+           "datasource tables will automatically store partition metadata in the Hive metastore.")
       .booleanConf
       .createWithDefault(true)
 
@@ -675,7 +676,7 @@ private[sql] class SQLConf extends Serializable with CatalystConf with Logging {
 
   def metastorePartitionPruning: Boolean = getConf(HIVE_METASTORE_PARTITION_PRUNING)
 
-  def filesourcePartitionPruning: Boolean = getConf(HIVE_FILESOURCE_PARTITION_PRUNING)
+  def filesourcePartitionManagement: Boolean = getConf(HIVE_FILESOURCE_PARTITION_MANAGEMENT)
 
   def gatherFastStats: Boolean = getConf(GATHER_FASTSTAT)
 
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index 41277fa8c16a3..e0d9de9a7d70c 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -76,21 +76,10 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
             partitionColumns = table.partitionColumnNames,
             bucketSpec = table.bucketSpec,
             className = table.provider.get,
-            options = table.storage.properties)
-
-        val relation = dataSource.resolveRelation() match {
-          case r: HadoopFsRelation =>
-            val fileCatalog = new TableFileCatalog(
-              r.sparkSession,
-              in.database,
-              in.name,
-              Some(table.partitionSchema),
-              r.sizeInBytes)
-            r.copy(location = fileCatalog)(r.sparkSession)
-          case other => other
-        }
+            options = table.storage.properties,
+            catalogTable = Some(table))
 
-        LogicalRelation(relation, catalogTable = Some(table))
+        LogicalRelation(dataSource.resolveRelation(), catalogTable = Some(table))
       }
     }
 
@@ -204,7 +193,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
       QualifiedTableName(metastoreRelation.databaseName, metastoreRelation.tableName)
     val bucketSpec = None  // We don't support hive bucketed tables, only ones we write out.
 
-    val lazyPruningEnabled = sparkSession.sqlContext.conf.filesourcePartitionPruning
+    val lazyPruningEnabled = sparkSession.sqlContext.conf.filesourcePartitionManagement
     val result = if (metastoreRelation.hiveQlTable.isPartitioned) {
       val partitionSchema = StructType.fromAttributes(metastoreRelation.partitionKeys)
 
@@ -241,7 +230,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
         val sizeInBytes = metastoreRelation.statistics.sizeInBytes.toLong
         val fileCatalog = {
           val catalog = new TableFileCatalog(
-            sparkSession, db, table, Some(partitionSchema), sizeInBytes)
+            sparkSession, db, table, partitionSchema, sizeInBytes)
           if (lazyPruningEnabled) {
             catalog
           } else {

From 4d93f48788d05ae6983f0f2feae6cc4368c039e7 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Wed, 19 Oct 2016 19:16:23 -0700
Subject: [PATCH 62/99] fix flag use

---
 .../apache/spark/sql/execution/datasources/DataSource.scala    | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala
index d53e5f8e962fe..176a8a33edd61 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala
@@ -405,8 +405,7 @@ case class DataSource(
             })
         }
 
-        val enableDatasourceHivePartitionProvider = true  // TODO(ekl) conf
-        val fileCatalog = if (enableDatasourceHivePartitionProvider &&
+        val fileCatalog = if (sparkSession.sqlContext.conf.filesourcePartitionManagement &&
             catalogTable.isDefined && catalogTable.get.partitionProviderIsHive) {
           new TableFileCatalog(
             sparkSession,

From 4647f1fd5c2283ed543f07d212478965bfa5b88d Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Thu, 20 Oct 2016 12:00:02 -0700
Subject: [PATCH 63/99] TableFileCatalog: read file status cache size from conf

---
 .../datasources/TableFileCatalog.scala        | 19 +++++++++----------
 .../spark/sql/hive/HiveMetastoreCatalog.scala |  7 +------
 2 files changed, 10 insertions(+), 16 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
index cf195129d2a30..72f33927320af 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
@@ -39,13 +39,15 @@ class TableFileCatalog(
     db: String,
     table: String,
     partitionSchema: Option[StructType],
-    override val sizeInBytes: Long,
-    fileStatusCacheSize: Long) extends FileCatalog {
-
-  private val fileStatusCache = if (fileStatusCacheSize > 0) {
-    new InMemoryCache(fileStatusCacheSize)
-  } else {
-    NoopCache
+    override val sizeInBytes: Long) extends FileCatalog {
+
+  private val fileStatusCache = {
+    if (sparkSession.sqlContext.conf.filesourcePartitionPruning &&
+        sparkSession.sqlContext.conf.filesourcePartitionFileCacheSize > 0) {
+      new InMemoryCache(sparkSession.sqlContext.conf.filesourcePartitionFileCacheSize)
+    } else {
+      NoopCache
+    }
   }
 
   protected val hadoopConf = sparkSession.sessionState.newHadoopConf
@@ -56,9 +58,6 @@ class TableFileCatalog(
 
   private val baseLocation = catalogTable.storage.locationUri
 
-  // Populated on-demand by calls to cachedAllPartitions
-  private var cachedAllPartitions: ListingFileCatalog = null
-
   override def rootPaths: Seq[Path] = baseLocation.map(new Path(_)).toSeq
 
   override def listFiles(filters: Seq[Expression]): Seq[PartitionDirectory] = {
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index 2b3bffc76bdfa..44089335e1a1d 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -231,12 +231,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
         val sizeInBytes = metastoreRelation.statistics.sizeInBytes.toLong
         val fileCatalog = {
           val catalog = new TableFileCatalog(
-            sparkSession, db, table, Some(partitionSchema), sizeInBytes,
-            fileStatusCacheSize = if (lazyPruningEnabled) {
-              sparkSession.sqlContext.conf.filesourcePartitionFileCacheSize
-            } else {
-              0
-            })
+            sparkSession, db, table, Some(partitionSchema), sizeInBytes)
           if (lazyPruningEnabled) {
             catalog
           } else {

From 44f6c70a2f9cae82dca9cf419f336d1996039713 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Thu, 20 Oct 2016 12:36:29 -0700
Subject: [PATCH 64/99] make tests cover ds tables too

---
 .../datasources/TableFileCatalog.scala        |  4 +-
 ...a => PartitionedTablePerfStatsSuite.scala} | 98 +++++++++++++++----
 2 files changed, 79 insertions(+), 23 deletions(-)
 rename sql/hive/src/test/scala/org/apache/spark/sql/hive/{HiveTablePerfStatsSuite.scala => PartitionedTablePerfStatsSuite.scala} (70%)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
index 7554028b504e1..9e31d091e89cf 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
@@ -42,7 +42,7 @@ class TableFileCatalog(
     override val sizeInBytes: Long) extends FileCatalog {
 
   private val fileStatusCache = {
-    if (sparkSession.sqlContext.conf.filesourcePartitionPruning &&
+    if (sparkSession.sqlContext.conf.filesourcePartitionManagement &&
         sparkSession.sqlContext.conf.filesourcePartitionFileCacheSize > 0) {
       new InMemoryCache(sparkSession.sqlContext.conf.filesourcePartitionFileCacheSize)
     } else {
@@ -89,7 +89,7 @@ class TableFileCatalog(
         val partitionSpec = PartitionSpec(schema, partitions)
         new PrunedTableFileCatalog(
           sparkSession, new Path(baseLocation.get), fileStatusCache, partitionSpec)
-      case None =>
+      case _ =>
         new ListingFileCatalog(sparkSession, rootPaths, parameters, None, fileStatusCache)
     }
   }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveTablePerfStatsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala
similarity index 70%
rename from sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveTablePerfStatsSuite.scala
rename to sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala
index 50c3f2dbdca15..07f8f9635d116 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveTablePerfStatsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala
@@ -24,27 +24,55 @@ import org.apache.spark.sql.hive.test.TestHiveSingleton
 import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.QueryTest
 
-class HiveTablePerfStatsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils {
+class PartitionedTablePerfStatsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils {
 
-  private def setupPartitionedTable(tableName: String, dir: File): Unit = {
-    spark.range(5).selectExpr("id", "id as partCol1", "id as partCol2").write
+  private case class TestSpec(setupTable: (String, File) => Unit)
+
+  /**
+   * Runs a test against both converted hive and native datasource tables. The test can use the
+   * passed TestSpec object for setup and inspecting test parameters.
+   */
+  private def genericTest(testName: String)(fn: TestSpec => Unit): Unit = {
+    test("hive table: " + testName) {
+      fn(TestSpec(setupPartitionedHiveTable))
+    }
+    test("datasource table: " + testName) {
+      fn(TestSpec(setupPartitionedDatasourceTable))
+    }
+  }
+
+  private def setupPartitionedHiveTable(tableName: String, dir: File): Unit = {
+    spark.range(5).selectExpr("id as fieldOne", "id as partCol1", "id as partCol2").write
       .partitionBy("partCol1", "partCol2")
       .mode("overwrite")
       .parquet(dir.getAbsolutePath)
 
     spark.sql(s"""
-      |create external table $tableName (id long)
+      |create external table $tableName (fieldOne long)
       |partitioned by (partCol1 int, partCol2 int)
       |stored as parquet
       |location "${dir.getAbsolutePath}"""".stripMargin)
     spark.sql(s"msck repair table $tableName")
   }
 
-  test("partitioned pruned table reports only selected files") {
+  private def setupPartitionedDatasourceTable(tableName: String, dir: File): Unit = {
+    spark.range(5).selectExpr("id as fieldOne", "id as partCol1", "id as partCol2").write
+      .partitionBy("partCol1", "partCol2")
+      .mode("overwrite")
+      .parquet(dir.getAbsolutePath)
+
+    spark.sql(s"""
+      |create table $tableName (fieldOne long, partCol1 int, partCol2 int)
+      |using parquet
+      |options (path "${dir.getAbsolutePath}")
+      |partitioned by (partCol1, partCol2)""".stripMargin)
+  }
+
+  genericTest("partitioned pruned table reports only selected files") { spec =>
     assert(spark.sqlContext.getConf(HiveUtils.CONVERT_METASTORE_PARQUET.key) == "true")
     withTable("test") {
       withTempDir { dir =>
-        setupPartitionedTable("test", dir)
+        spec.setupTable("test", dir)
         val df = spark.sql("select * from test")
         assert(df.count() == 5)
         assert(df.inputFiles.length == 5)  // unpruned
@@ -64,13 +92,13 @@ class HiveTablePerfStatsSuite extends QueryTest with TestHiveSingleton with SQLT
     }
   }
 
-  test("lazy partition pruning reads only necessary partition data") {
+  genericTest("lazy partition pruning reads only necessary partition data") { spec =>
     withSQLConf(
-        "spark.sql.hive.filesourcePartitionPruning" -> "true",
+        "spark.sql.hive.filesourcePartitionManagement" -> "true",
         "spark.sql.hive.filesourcePartitionFileCacheSize" -> "0") {
       withTable("test") {
         withTempDir { dir =>
-          setupPartitionedTable("test", dir)
+          spec.setupTable("test", dir)
           HiveCatalogMetrics.reset()
           spark.sql("select * from test where partCol1 = 999").count()
           assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 0)
@@ -105,13 +133,13 @@ class HiveTablePerfStatsSuite extends QueryTest with TestHiveSingleton with SQLT
     }
   }
 
-  test("lazy partition pruning with file status caching enabled") {
+  genericTest("lazy partition pruning with file status caching enabled") { spec =>
     withSQLConf(
-        "spark.sql.hive.filesourcePartitionPruning" -> "true",
+        "spark.sql.hive.filesourcePartitionManagement" -> "true",
         "spark.sql.hive.filesourcePartitionFileCacheSize" -> "9999999") {
       withTable("test") {
         withTempDir { dir =>
-          setupPartitionedTable("test", dir)
+          spec.setupTable("test", dir)
           HiveCatalogMetrics.reset()
           assert(spark.sql("select * from test where partCol1 = 999").count() == 0)
           assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 0)
@@ -146,13 +174,13 @@ class HiveTablePerfStatsSuite extends QueryTest with TestHiveSingleton with SQLT
     }
   }
 
-  test("file status caching respects refresh table and refreshByPath") {
+  genericTest("file status caching respects refresh table and refreshByPath") { spec =>
     withSQLConf(
-        "spark.sql.hive.filesourcePartitionPruning" -> "true",
+        "spark.sql.hive.filesourcePartitionManagement" -> "true",
         "spark.sql.hive.filesourcePartitionFileCacheSize" -> "9999999") {
       withTable("test") {
         withTempDir { dir =>
-          setupPartitionedTable("test", dir)
+          spec.setupTable("test", dir)
           HiveCatalogMetrics.reset()
           assert(spark.sql("select * from test").count() == 5)
           assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 5)
@@ -175,13 +203,13 @@ class HiveTablePerfStatsSuite extends QueryTest with TestHiveSingleton with SQLT
     }
   }
 
-  test("file status cache respects size limit") {
+  genericTest("file status cache respects size limit") { spec =>
     withSQLConf(
-        "spark.sql.hive.filesourcePartitionPruning" -> "true",
+        "spark.sql.hive.filesourcePartitionManagement" -> "true",
         "spark.sql.hive.filesourcePartitionFileCacheSize" -> "1" /* 1 byte */) {
       withTable("test") {
         withTempDir { dir =>
-          setupPartitionedTable("test", dir)
+          spec.setupTable("test", dir)
           HiveCatalogMetrics.reset()
           assert(spark.sql("select * from test").count() == 5)
           assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 5)
@@ -194,11 +222,11 @@ class HiveTablePerfStatsSuite extends QueryTest with TestHiveSingleton with SQLT
     }
   }
 
-  test("all partitions read and cached when filesource partition pruning is off") {
-    withSQLConf("spark.sql.hive.filesourcePartitionPruning" -> "false") {
+  test("hive table: files read and cached when filesource partition management is off") {
+    withSQLConf("spark.sql.hive.filesourcePartitionManagement" -> "false") {
       withTable("test") {
         withTempDir { dir =>
-          setupPartitionedTable("test", dir)
+          setupPartitionedHiveTable("test", dir)
 
           // We actually query the partitions from hive each time the table is resolved in this
           // mode. This is kind of terrible, but is needed to preserve the legacy behavior
@@ -222,4 +250,32 @@ class HiveTablePerfStatsSuite extends QueryTest with TestHiveSingleton with SQLT
       }
     }
   }
+
+  test("datasource table: all partition data cached in memory when partition management is off") {
+    withSQLConf("spark.sql.hive.filesourcePartitionManagement" -> "false") {
+      withTable("test") {
+        withTempDir { dir =>
+          setupPartitionedDatasourceTable("test", dir)
+          HiveCatalogMetrics.reset()
+          assert(spark.sql("select * from test where partCol1 = 999").count() == 0)
+
+          // not using metastore
+          assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 0)
+
+          // reads and caches all the files initially
+          assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 5)
+
+          HiveCatalogMetrics.reset()
+          assert(spark.sql("select * from test where partCol1 < 2").count() == 2)
+          assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 0)
+          assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 0)
+
+          HiveCatalogMetrics.reset()
+          assert(spark.sql("select * from test").count() == 5)
+          assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 0)
+          assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 0)
+        }
+      }
+    }
+  }
 }

From e940cb9b6b5ea8f5f34cb3643f15ebb15715d142 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Thu, 20 Oct 2016 13:10:12 -0700
Subject: [PATCH 65/99] add PartitionProviderCompatibilitySuite

---
 .../PartitionProviderCompatibilitySuite.scala | 82 +++++++++++++++++++
 1 file changed, 82 insertions(+)
 create mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala

diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala
new file mode 100644
index 0000000000000..b3ee706b00131
--- /dev/null
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hive
+
+import java.io.File
+
+import org.apache.spark.metrics.source.HiveCatalogMetrics
+import org.apache.spark.sql.{AnalysisException, QueryTest}
+import org.apache.spark.sql.hive.test.TestHiveSingleton
+import org.apache.spark.sql.test.SQLTestUtils
+
+class PartitionProviderCompatibilitySuite
+  extends QueryTest with TestHiveSingleton with SQLTestUtils {
+
+  private def setupPartitionedDatasourceTable(tableName: String, dir: File): Unit = {
+    // TODO(ekl) make these mixed-case fields once support for that is fixed
+    spark.range(5).selectExpr("id as fieldone", "id as partcol1", "id as partcol2").write
+      .partitionBy("partcol1", "partcol2")
+      .mode("overwrite")
+      .parquet(dir.getAbsolutePath)
+
+    spark.sql(s"""
+      |create table $tableName (fieldone long, partcol1 int, partcol2 int)
+      |using parquet
+      |options (path "${dir.getAbsolutePath}")
+      |partitioned by (partcol1, partcol2)""".stripMargin)
+  }
+
+  private def verifyIsLegacyTable(tableName: String): Unit = {
+    val unsupportedCommands = Seq(
+      s"SHOW PARTITIONS $tableName")
+
+    for (cmd <- unsupportedCommands) {
+      val e = intercept[AnalysisException] {
+        spark.sql(cmd)  // run the command under test; every entry in the list must be rejected
+      }
+      assert(e.getMessage.contains("partition metadata is not stored in the Hive metastore"), e)
+    }
+  }
+
+  private def verifyIsNewTable(tableName: String): Unit = {
+    withSQLConf("spark.sql.hive.filesourcePartitionManagement" -> "true") {
+      assert(spark.sql(s"show partitions $tableName").count() == 5)
+      HiveCatalogMetrics.reset()
+
+      // sanity check table performance
+      assert(spark.sql(s"select * from $tableName where partcol1 < 2").count() == 2)
+      assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 2)
+      assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 2)
+    }
+  }
+
+  test("convert partition provider to hive with repair table") {
+    withTable("test") {
+      withTempDir { dir =>
+        withSQLConf("spark.sql.hive.filesourcePartitionManagement" -> "false") {
+          setupPartitionedDatasourceTable("test", dir)
+        }
+        withSQLConf("spark.sql.hive.filesourcePartitionManagement" -> "true") {
+          verifyIsLegacyTable("test")
+          spark.sql("msck repair table test")
+          verifyIsNewTable("test")
+        }
+      }
+    }
+  }
+}

From 6733ba61039cba34a6d802fa2ea7c166b676de2a Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Thu, 20 Oct 2016 14:27:00 -0700
Subject: [PATCH 66/99] global cache

---
 .../datasources/FileStatusCache.scala         | 76 +++++++++++++++----
 .../datasources/TableFileCatalog.scala        | 14 +---
 .../apache/spark/sql/internal/SQLConf.scala   |  6 +-
 .../spark/sql/hive/HiveMetastoreCatalog.scala |  3 +-
 .../sql/hive/HiveTablePerfStatsSuite.scala    | 16 +++-
 5 files changed, 85 insertions(+), 30 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala
index 5b708f24f1e86..54d97974ad364 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala
@@ -20,11 +20,14 @@ package org.apache.spark.sql.execution.datasources
 import java.util.concurrent.ConcurrentHashMap
 import java.util.concurrent.atomic.AtomicBoolean
 
+import scala.collection.JavaConverters._
+
 import com.google.common.cache._
 import org.apache.hadoop.fs.{FileStatus, Path}
 
 import org.apache.spark.internal.Logging
 import org.apache.spark.metrics.source.HiveCatalogMetrics
+import org.apache.spark.sql.SparkSession
 import org.apache.spark.util.{SerializableConfiguration, SizeEstimator}
 
 /**
@@ -52,20 +55,53 @@ abstract class FileStatusCache {
   def invalidateAll(): Unit
 }
 
+object FileStatusCache {
+  // Opaque object that uniquely identifies a shared cache user
+  type ClientId = Object
+
+  private var sharedCache: SharedInMemoryCache = null
+
+  /**
+   * @return a cache for the specified client. Cache resources are shared across all clients.
+   */
+  def getOrInitializeShared(clientId: ClientId, session: SparkSession): FileStatusCache = {
+    synchronized {
+      if (session.sqlContext.conf.filesourcePartitionPruning &&
+          session.sqlContext.conf.filesourcePartitionFileCacheSize > 0) {
+        if (sharedCache == null) {
+          sharedCache = new SharedInMemoryCache(
+            session.sqlContext.conf.filesourcePartitionFileCacheSize)
+        }
+        sharedCache.getForClient(clientId)
+      } else {
+        NoopCache
+      }
+    }
+  }
+
+  def resetForTesting(): Unit = synchronized {
+    sharedCache = null
+  }
+}
+
 /**
  * An implementation that caches partition file statuses in memory.
  *
  * @param maxSizeInBytes max allowable cache size before entries start getting evicted
  */
-class InMemoryCache(maxSizeInBytes: Long) extends FileStatusCache with Logging {
+private class SharedInMemoryCache(maxSizeInBytes: Long) extends Logging {
+  import FileStatusCache._
+
   private val warnedAboutEviction = new AtomicBoolean(false)
-  private val cache: Cache[Path, Array[FileStatus]] = CacheBuilder.newBuilder()
-    .weigher(new Weigher[Path, Array[FileStatus]] {
-      override def weigh(key: Path, value: Array[FileStatus]): Int = {
+
+  // we use a composite cache key in order to provide isolation between cache clients
+  private val cache: Cache[(ClientId, Path), Array[FileStatus]] = CacheBuilder.newBuilder()
+    .weigher(new Weigher[(ClientId, Path), Array[FileStatus]] {
+      override def weigh(key: (ClientId, Path), value: Array[FileStatus]): Int = {
         (SizeEstimator.estimate(key) + SizeEstimator.estimate(value)).toInt
       }})
-    .removalListener(new RemovalListener[Path, Array[FileStatus]]() {
-      override def onRemoval(removed: RemovalNotification[Path, Array[FileStatus]]) = {
+    .removalListener(new RemovalListener[(ClientId, Path), Array[FileStatus]]() {
+      override def onRemoval(removed: RemovalNotification[(ClientId, Path), Array[FileStatus]]) = {
         if (removed.getCause() == RemovalCause.SIZE &&
             warnedAboutEviction.compareAndSet(false, true)) {
           logWarning(
@@ -77,16 +113,28 @@ class InMemoryCache(maxSizeInBytes: Long) extends FileStatusCache with Logging {
     .maximumWeight(maxSizeInBytes)
     .build()
 
-  override def getLeafFiles(path: Path): Option[Array[FileStatus]] = {
-    Option(cache.getIfPresent(path))
-  }
+  /**
+   * @param clientId object that uniquely identifies this client. Cache entries are isolated
+   *                 across clients, but cache resources are shared across all clients.
+   *
+   * @return a FileStatusCache for the specified client
+   */
+  def getForClient(clientId: ClientId): FileStatusCache = new FileStatusCache {
+    override def getLeafFiles(path: Path): Option[Array[FileStatus]] = {
+      Option(cache.getIfPresent((clientId, path)))
+    }
 
-  override def putLeafFiles(path: Path, leafFiles: Array[FileStatus]): Unit = {
-    cache.put(path, leafFiles.toArray)
-  }
+    override def putLeafFiles(path: Path, leafFiles: Array[FileStatus]): Unit = {
+      cache.put((clientId, path), leafFiles.toArray)
+    }
 
-  override def invalidateAll(): Unit = {
-    cache.invalidateAll()
+    override def invalidateAll(): Unit = {
+      cache.asMap.asScala.foreach { case (key, value) =>
+        if (key._1 == clientId) {
+          cache.invalidate(key)
+        }
+      }
+    }
   }
 }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
index 72f33927320af..16fc651cec381 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
@@ -32,23 +32,15 @@ import org.apache.spark.sql.types.StructType
  * @param table the table's (unqualified) name
  * @param partitionSchema the schema of a partitioned table's partition columns
  * @param sizeInBytes the table's data size in bytes
- * @param fileStatusCacheSize if nonzero, enables and specifies the size of the file status cache
+ * @param fileStatusCache optional cache implementation to use for file listing
  */
 class TableFileCatalog(
     sparkSession: SparkSession,
     db: String,
     table: String,
     partitionSchema: Option[StructType],
-    override val sizeInBytes: Long) extends FileCatalog {
-
-  private val fileStatusCache = {
-    if (sparkSession.sqlContext.conf.filesourcePartitionPruning &&
-        sparkSession.sqlContext.conf.filesourcePartitionFileCacheSize > 0) {
-      new InMemoryCache(sparkSession.sqlContext.conf.filesourcePartitionFileCacheSize)
-    } else {
-      NoopCache
-    }
-  }
+    override val sizeInBytes: Long,
+    fileStatusCache: FileStatusCache = NoopCache) extends FileCatalog {
 
   protected val hadoopConf = sparkSession.sessionState.newHadoopConf
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index f95ff0a07e76b..e9944a2424a5b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -280,11 +280,11 @@ object SQLConf {
 
   val HIVE_FILESOURCE_PARTITION_FILE_CACHE_SIZE =
     SQLConfigBuilder("spark.sql.hive.filesourcePartitionFileCacheSize")
-      .doc("When nonzero, enable caching of partition file metadata in memory. Each table may " +
-           "use up to the specified number of bytes for caching file metadata. This conf only " +
+      .doc("When nonzero, enable caching of partition file metadata in memory. All table share " +
+           "a cache that can use up to specified num bytes for file metadata. This conf only " +
            "applies if filesource partition pruning is also enabled.")
       .longConf
-      .createWithDefault(50 * 1024 * 1024)
+      .createWithDefault(250 * 1024 * 1024)
 
   val OPTIMIZER_METADATA_ONLY = SQLConfigBuilder("spark.sql.optimizer.metadataOnly")
     .doc("When true, enable the metadata-only query optimization that use the table's metadata " +
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index 44089335e1a1d..cf5c4705c4e95 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -231,7 +231,8 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
         val sizeInBytes = metastoreRelation.statistics.sizeInBytes.toLong
         val fileCatalog = {
           val catalog = new TableFileCatalog(
-            sparkSession, db, table, Some(partitionSchema), sizeInBytes)
+            sparkSession, db, table, Some(partitionSchema), sizeInBytes,
+            FileStatusCache.getOrInitializeShared(new Object(), sparkSession))
           if (lazyPruningEnabled) {
             catalog
           } else {
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveTablePerfStatsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveTablePerfStatsSuite.scala
index 50c3f2dbdca15..8e1f6113345df 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveTablePerfStatsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveTablePerfStatsSuite.scala
@@ -19,12 +19,26 @@ package org.apache.spark.sql.hive
 
 import java.io.File
 
+import org.scalatest.BeforeAndAfterEach
+
 import org.apache.spark.metrics.source.HiveCatalogMetrics
+import org.apache.spark.sql.execution.datasources.FileStatusCache
 import org.apache.spark.sql.hive.test.TestHiveSingleton
 import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.QueryTest
 
-class HiveTablePerfStatsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils {
+class HiveTablePerfStatsSuite
+  extends QueryTest with TestHiveSingleton with SQLTestUtils with BeforeAndAfterEach {
+
+  override def beforeEach(): Unit = {
+    super.beforeEach()
+    FileStatusCache.resetForTesting()
+  }
+
+  override def afterEach(): Unit = {
+    super.afterEach()
+    FileStatusCache.resetForTesting()
+  }
 
   private def setupPartitionedTable(tableName: String, dir: File): Unit = {
     spark.range(5).selectExpr("id", "id as partCol1", "id as partCol2").write

From 9d7282549c9a9774bca088543d42140da4b1e7f4 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Thu, 20 Oct 2016 14:28:31 -0700
Subject: [PATCH 67/99] clarify getOrInitializeShared scaladoc

---
 .../spark/sql/execution/datasources/FileStatusCache.scala      | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala
index 54d97974ad364..4138cba1e0ed7 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala
@@ -62,7 +62,8 @@ object FileStatusCache {
   private var sharedCache: SharedInMemoryCache = null
 
   /**
-   * @return a cache for the specified client. Cache resources are shared across all clients.
+   * @return a cache for the specified client, sized based on session configuration. Cache
+   *         resources are shared across all clients.
    */
   def getOrInitializeShared(clientId: ClientId, session: SparkSession): FileStatusCache = {
     synchronized {

From 9ada9b50790ad808e208c6dc64b5f00a9990611a Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Thu, 20 Oct 2016 14:30:36 -0700
Subject: [PATCH 68/99] reword composite cache key comment

---
 .../spark/sql/execution/datasources/FileStatusCache.scala       | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala
index 4138cba1e0ed7..bf1cd5ec4bdc7 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala
@@ -95,7 +95,7 @@ private class SharedInMemoryCache(maxSizeInBytes: Long) extends Logging {
 
   private val warnedAboutEviction = new AtomicBoolean(false)
 
-  // we use a composite cache key in order to provide isolation between cache clients
+  // we use a composite cache key in order to distinguish entries inserted by different clients
   private val cache: Cache[(ClientId, Path), Array[FileStatus]] = CacheBuilder.newBuilder()
     .weigher(new Weigher[(ClientId, Path), Array[FileStatus]] {
       override def weigh(key: (ClientId, Path), value: Array[FileStatus]): Int = {

From 3b6398bc0d1a347f8ce779eb9c04e50b5c998192 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Thu, 20 Oct 2016 14:39:28 -0700
Subject: [PATCH 69/99] use filesourcePartitionManagement conf; expose TableFileCatalog db, table, partitionSchema

---
 .../spark/sql/execution/datasources/FileStatusCache.scala   | 2 +-
 .../spark/sql/execution/datasources/TableFileCatalog.scala  | 6 +++---
 .../org/apache/spark/sql/hive/HiveMetastoreCatalog.scala    | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala
index bf1cd5ec4bdc7..13c549c57d1b3 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala
@@ -67,7 +67,7 @@ object FileStatusCache {
    */
   def getOrInitializeShared(clientId: ClientId, session: SparkSession): FileStatusCache = {
     synchronized {
-      if (session.sqlContext.conf.filesourcePartitionPruning &&
+      if (session.sqlContext.conf.filesourcePartitionManagement &&
           session.sqlContext.conf.filesourcePartitionFileCacheSize > 0) {
         if (sharedCache == null) {
           sharedCache = new SharedInMemoryCache(
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
index 3a2e614210184..9098c5385fc62 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
@@ -36,9 +36,9 @@ import org.apache.spark.sql.types.StructType
  */
 class TableFileCatalog(
     sparkSession: SparkSession,
-    db: String,
-    table: String,
-    partitionSchema: Option[StructType],
+    val db: String,
+    val table: String,
+    override val partitionSchema: StructType,
     override val sizeInBytes: Long,
     fileStatusCache: FileStatusCache = NoopCache) extends FileCatalog {
 
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index c7f64dbf98d26..0b2056933ace1 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -230,7 +230,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
         val sizeInBytes = metastoreRelation.statistics.sizeInBytes.toLong
         val fileCatalog = {
           val catalog = new TableFileCatalog(
-            sparkSession, db, table, Some(partitionSchema), sizeInBytes,
+            sparkSession, db, table, partitionSchema, sizeInBytes,
             FileStatusCache.getOrInitializeShared(new Object(), sparkSession))
           if (lazyPruningEnabled) {
             catalog

From 57452a9e0049157b2d80876e6fb668ffd65a6966 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Thu, 20 Oct 2016 14:40:14 -0700
Subject: [PATCH 70/99] cleanup

---
 .../spark/sql/execution/datasources/TableFileCatalog.scala   | 5 +++--
 .../org/apache/spark/sql/hive/HiveMetastoreCatalog.scala     | 3 +--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
index 16fc651cec381..c897bf3589158 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
@@ -39,11 +39,12 @@ class TableFileCatalog(
     db: String,
     table: String,
     partitionSchema: Option[StructType],
-    override val sizeInBytes: Long,
-    fileStatusCache: FileStatusCache = NoopCache) extends FileCatalog {
+    override val sizeInBytes: Long) extends FileCatalog {
 
   protected val hadoopConf = sparkSession.sessionState.newHadoopConf
 
+  private val fileStatusCache = FileStatusCache.getOrInitializeShared(new Object(), sparkSession)
+
   private val externalCatalog = sparkSession.sharedState.externalCatalog
 
   private val catalogTable = externalCatalog.getTable(db, table)
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index cf5c4705c4e95..44089335e1a1d 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -231,8 +231,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
         val sizeInBytes = metastoreRelation.statistics.sizeInBytes.toLong
         val fileCatalog = {
           val catalog = new TableFileCatalog(
-            sparkSession, db, table, Some(partitionSchema), sizeInBytes,
-            FileStatusCache.getOrInitializeShared(new Object(), sparkSession))
+            sparkSession, db, table, Some(partitionSchema), sizeInBytes)
           if (lazyPruningEnabled) {
             catalog
           } else {

From 17f23cd2e8fb57a93bced250aa228574dc656227 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Thu, 20 Oct 2016 15:06:34 -0700
Subject: [PATCH 71/99] Thu Oct 20 15:06:34 PDT 2016

---
 .../PartitionProviderCompatibilitySuite.scala | 30 +++++++++++++++++--
 .../hive/PartitionedTablePerfStatsSuite.scala | 21 ++++++-------
 2 files changed, 38 insertions(+), 13 deletions(-)

diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala
index b3ee706b00131..2ff06cb23cbac 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala
@@ -22,6 +22,7 @@ import java.io.File
 import org.apache.spark.metrics.source.HiveCatalogMetrics
 import org.apache.spark.sql.{AnalysisException, QueryTest}
 import org.apache.spark.sql.hive.test.TestHiveSingleton
+import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SQLTestUtils
 
 class PartitionProviderCompatibilitySuite
@@ -43,6 +44,7 @@ class PartitionProviderCompatibilitySuite
 
   private def verifyIsLegacyTable(tableName: String): Unit = {
     val unsupportedCommands = Seq(
+      s"DESCRIBE $tableName PARTITION (partcol1=1)",
       s"SHOW PARTITIONS $tableName")
 
     for (cmd <- unsupportedCommands) {
@@ -54,7 +56,7 @@ class PartitionProviderCompatibilitySuite
   }
 
   private def verifyIsNewTable(tableName: String): Unit = {
-    withSQLConf("spark.sql.hive.filesourcePartitionManagement" -> "true") {
+    withSQLConf(SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "true") {
       assert(spark.sql(s"show partitions $tableName").count() == 5)
       HiveCatalogMetrics.reset()
 
@@ -68,10 +70,10 @@ class PartitionProviderCompatibilitySuite
   test("convert partition provider to hive with repair table") {
     withTable("test") {
       withTempDir { dir =>
-        withSQLConf("spark.sql.hive.filesourcePartitionManagement" -> "false") {
+        withSQLConf(SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "false") {
           setupPartitionedDatasourceTable("test", dir)
         }
-        withSQLConf("spark.sql.hive.filesourcePartitionManagement" -> "true") {
+        withSQLConf(SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "true") {
           verifyIsLegacyTable("test")
           spark.sql("msck repair table test")
           verifyIsNewTable("test")
@@ -79,4 +81,26 @@ class PartitionProviderCompatibilitySuite
       }
     }
   }
+
+  test("when partition management is enabled, new tables have partition provider hive") {
+    withTable("test") {
+      withTempDir { dir =>
+        withSQLConf(SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "true") {
+          setupPartitionedDatasourceTable("test", dir)
+          verifyIsNewTable("test")
+        }
+      }
+    }
+  }
+
+  test("when partition management is disabled, new tables have no partition provider") {
+    withTable("test") {
+      withTempDir { dir =>
+        withSQLConf(SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "false") {
+          setupPartitionedDatasourceTable("test", dir)
+          verifyIsLegacyTable("test")
+        }
+      }
+    }
+  }
 }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala
index 0aa3b9d2e1d11..6edc6cb56b9c0 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala
@@ -24,6 +24,7 @@ import org.scalatest.BeforeAndAfterEach
 import org.apache.spark.metrics.source.HiveCatalogMetrics
 import org.apache.spark.sql.execution.datasources.FileStatusCache
 import org.apache.spark.sql.hive.test.TestHiveSingleton
+import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.QueryTest
 
@@ -108,8 +109,8 @@ class PartitionedTablePerfStatsSuite
 
   genericTest("lazy partition pruning reads only necessary partition data") { spec =>
     withSQLConf(
-        "spark.sql.hive.filesourcePartitionManagement" -> "true",
-        "spark.sql.hive.filesourcePartitionFileCacheSize" -> "0") {
+        SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "true",
+        SQLConf.HIVE_FILESOURCE_PARTITION_FILE_CACHE_SIZE.key -> "0") {
       withTable("test") {
         withTempDir { dir =>
           spec.setupTable("test", dir)
@@ -149,8 +150,8 @@ class PartitionedTablePerfStatsSuite
 
   genericTest("lazy partition pruning with file status caching enabled") { spec =>
     withSQLConf(
-        "spark.sql.hive.filesourcePartitionManagement" -> "true",
-        "spark.sql.hive.filesourcePartitionFileCacheSize" -> "9999999") {
+        SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "true",
+        SQLConf.HIVE_FILESOURCE_PARTITION_FILE_CACHE_SIZE.key -> "9999999") {
       withTable("test") {
         withTempDir { dir =>
           spec.setupTable("test", dir)
@@ -190,8 +191,8 @@ class PartitionedTablePerfStatsSuite
 
   genericTest("file status caching respects refresh table and refreshByPath") { spec =>
     withSQLConf(
-        "spark.sql.hive.filesourcePartitionManagement" -> "true",
-        "spark.sql.hive.filesourcePartitionFileCacheSize" -> "9999999") {
+        SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "true",
+        SQLConf.HIVE_FILESOURCE_PARTITION_FILE_CACHE_SIZE.key -> "9999999") {
       withTable("test") {
         withTempDir { dir =>
           spec.setupTable("test", dir)
@@ -219,8 +220,8 @@ class PartitionedTablePerfStatsSuite
 
   genericTest("file status cache respects size limit") { spec =>
     withSQLConf(
-        "spark.sql.hive.filesourcePartitionManagement" -> "true",
-        "spark.sql.hive.filesourcePartitionFileCacheSize" -> "1" /* 1 byte */) {
+        SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "true",
+        SQLConf.HIVE_FILESOURCE_PARTITION_FILE_CACHE_SIZE.key -> "1" /* 1 byte */) {
       withTable("test") {
         withTempDir { dir =>
           spec.setupTable("test", dir)
@@ -237,7 +238,7 @@ class PartitionedTablePerfStatsSuite
   }
 
   test("hive table: files read and cached when filesource partition management is off") {
-    withSQLConf("spark.sql.hive.filesourcePartitionManagement" -> "false") {
+    withSQLConf(SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "false") {
       withTable("test") {
         withTempDir { dir =>
           setupPartitionedHiveTable("test", dir)
@@ -266,7 +267,7 @@ class PartitionedTablePerfStatsSuite
   }
 
   test("datasource table: all partition data cached in memory when partition management is off") {
-    withSQLConf("spark.sql.hive.filesourcePartitionManagement" -> "false") {
+    withSQLConf(SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "false") {
       withTable("test") {
         withTempDir { dir =>
           setupPartitionedDatasourceTable("test", dir)

From c0711ad4fe7458ba8d42c84b5e5e0eb063e7ce1d Mon Sep 17 00:00:00 2001
From: Eric Liang <ekhliang@gmail.com>
Date: Thu, 20 Oct 2016 23:23:53 -0700
Subject: [PATCH 72/99] Update PartitioningAwareFileCatalog.scala

---
 .../execution/datasources/PartitioningAwareFileCatalog.scala  | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
index 354836cfc5da0..9b1903c47119e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
@@ -308,7 +308,7 @@ object PartitioningAwareFileCatalog extends Logging {
   private def listLeafFilesInParallel(
       paths: Seq[Path],
       hadoopConf: Configuration,
-      sparkSession: SparkSession): Map[Path, Seq[FileStatus]] = {
+      sparkSession: SparkSession): Seq[(Path, Seq[FileStatus])] = {
     assert(paths.size >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold)
     logInfo(s"Listing leaf files and directories in parallel under: ${paths.mkString(", ")}")
 
@@ -368,7 +368,7 @@ object PartitioningAwareFileCatalog extends Logging {
           blockLocations)
       }
       (new Path(path), statuses)
-    }.toMap
+    }
   }
 
   /**

From 262f6eed6de2a290c7b2ce6b0efefb05d0754629 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekhliang@gmail.com>
Date: Thu, 20 Oct 2016 23:26:22 -0700
Subject: [PATCH 73/99] Update ListingFileCatalog.scala

---
 .../sql/execution/datasources/ListingFileCatalog.scala | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala
index f9f1748d4c3b6..d9d588388aaf1 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala
@@ -47,7 +47,7 @@ class ListingFileCatalog(
   @volatile private var cachedLeafDirToChildrenFiles: Map[Path, Array[FileStatus]] = _
   @volatile private var cachedPartitionSpec: PartitionSpec = _
 
-  refresh0(false)
+  refresh0()
 
   override def partitionSpec(): PartitionSpec = {
     if (cachedPartitionSpec == null) {
@@ -66,18 +66,16 @@ class ListingFileCatalog(
   }
 
   override def refresh(): Unit = {
-    refresh0(true)
+    refresh0()
+    fileStatusCache.invalidateAll()
   }
 
-  private def refresh0(invalidateSharedCache: Boolean): Unit = {
+  private def refresh0(): Unit = {
     val files = listLeafFiles(rootPaths)
     cachedLeafFiles =
       new mutable.LinkedHashMap[Path, FileStatus]() ++= files.map(f => f.getPath -> f)
     cachedLeafDirToChildrenFiles = files.toArray.groupBy(_.getPath.getParent)
     cachedPartitionSpec = null
-    if (invalidateSharedCache) {
-      fileStatusCache.invalidateAll()
-    }
   }
 
   override def equals(other: Any): Boolean = other match {

From 2ee5665bec8f871b7dd8098aa74a8cfdf3d3d26f Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Fri, 21 Oct 2016 10:17:36 -0700
Subject: [PATCH 74/99] fix tests

---
 .../apache/spark/sql/DataFrameWriter.scala    | 12 +++++++--
 .../command/createDataSourceTables.scala      | 21 +++++++--------
 .../spark/sql/execution/command/tables.scala  |  6 ++---
 .../sql/execution/command/DDLSuite.scala      | 26 ++++++++++++-------
 .../spark/sql/hive/HiveExternalCatalog.scala  |  1 +
 .../PartitionProviderCompatibilitySuite.scala | 19 +++++++++-----
 .../hive/PartitionedTablePerfStatsSuite.scala |  1 +
 .../sql/hive/execution/SQLQuerySuite.scala    |  6 ++---
 8 files changed, 56 insertions(+), 36 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
index 5be3277651d02..d9a21141da90d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
@@ -25,7 +25,8 @@ import org.apache.spark.annotation.InterfaceStability
 import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation
 import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogStorageFormat, CatalogTable, CatalogTableType}
-import org.apache.spark.sql.catalyst.plans.logical.InsertIntoTable
+import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, Union}
+import org.apache.spark.sql.execution.command.AlterTableRecoverPartitionsCommand
 import org.apache.spark.sql.execution.datasources.{CaseInsensitiveMap, CreateTable, DataSource, HadoopFsRelation}
 import org.apache.spark.sql.types.StructType
 
@@ -387,7 +388,14 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
           partitionColumnNames = partitioningColumns.getOrElse(Nil),
           bucketSpec = getBucketSpec
         )
-        val cmd = CreateTable(tableDesc, mode, Some(df.logicalPlan))
+        val createCmd = CreateTable(tableDesc, mode, Some(df.logicalPlan))
+        val cmd = if (tableDesc.partitionColumnNames.nonEmpty &&
+            df.sparkSession.sqlContext.conf.filesourcePartitionManagement) {
+          val recoverPartitionCmd = AlterTableRecoverPartitionsCommand(tableDesc.identifier)
+          Union(createCmd, recoverPartitionCmd)
+        } else {
+          createCmd
+        }
         df.sparkSession.sessionState.executePlan(cmd).toRdd
     }
   }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
index d752d0dff93f9..288ca3777a185 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
@@ -91,24 +91,23 @@ case class CreateDataSourceTableCommand(table: CatalogTable, ignoreIfExists: Boo
       table.storage.properties
     }
 
+    val newProps = if (partitionColumnNames.nonEmpty &&
+        sparkSession.sqlContext.conf.filesourcePartitionManagement) {
+      table.properties ++
+        Map(CatalogTable.PARTITION_PROVIDER_KEY -> CatalogTable.PARTITION_PROVIDER_HIVE)
+    } else {
+      table.properties
+    }
+
     val newTable = table.copy(
       storage = table.storage.copy(properties = optionsWithPath),
       schema = dataSource.schema,
-      partitionColumnNames = partitionColumnNames)
+      partitionColumnNames = partitionColumnNames,
+      properties = newProps)
     // We will return Nil or throw exception at the beginning if the table already exists, so when
     // we reach here, the table should not exist and we should set `ignoreIfExists` to false.
     sessionState.catalog.createTable(newTable, ignoreIfExists = false)
 
-    dataSource match {
-      case fs: HadoopFsRelation =>
-        if (table.tableType == CatalogTableType.EXTERNAL && partitionColumnNames.nonEmpty &&
-            sparkSession.sqlContext.conf.filesourcePartitionManagement) {
-          sparkSession.sessionState.executePlan(
-            AlterTableRecoverPartitionsCommand(table.identifier)).toRdd
-        }
-      case _ =>
-    }
-
     Seq.empty[Row]
   }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
index ecfa4b56219d1..b23788a82fd63 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
@@ -338,14 +338,14 @@ case class TruncateTableCommand(
         s"Operation not allowed: TRUNCATE TABLE on views: $tableIdentwithDB")
     }
     val isDatasourceTable = DDLUtils.isDatasourceTable(table)
-    if (partitionSpec.isDefined) {
-      DDLUtils.verifyPartitionProviderIsHive(table, "TRUNCATE TABLE ... PARTITION")
-    }
     if (table.partitionColumnNames.isEmpty && partitionSpec.isDefined) {
       throw new AnalysisException(
         s"Operation not allowed: TRUNCATE TABLE ... PARTITION is not supported " +
         s"for tables that are not partitioned: $tableIdentwithDB")
     }
+    if (partitionSpec.isDefined) {
+      DDLUtils.verifyPartitionProviderIsHive(table, "TRUNCATE TABLE ... PARTITION")
+    }
     val locations =
       if (DDLUtils.isDatasourceTable(table)) {
         Seq(table.storage.properties.get("path"))
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
index 9bbde8308e6a6..3912b3653fc0d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
@@ -96,7 +96,8 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach {
         .add("b", "int"),
       provider = Some("hive"),
       partitionColumnNames = Seq("a", "b"),
-      createTime = 0L)
+      createTime = 0L,
+      properties = Map("partitionProvider" -> "hive"))
   }
 
   private def createTable(catalog: SessionCatalog, name: TableIdentifier): Unit = {
@@ -1076,14 +1077,14 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach {
     def getProps: Map[String, String] = {
       catalog.getTableMetadata(tableIdent).properties
     }
-    assert(getProps.isEmpty)
     // set table properties
     sql("ALTER TABLE dbx.tab1 SET TBLPROPERTIES ('andrew' = 'or14', 'kor' = 'bel')")
-    assert(getProps == Map("andrew" -> "or14", "kor" -> "bel"))
+    assert(Map("andrew" -> "or14", "kor" -> "bel").toSet.subsetOf(getProps.toSet))
     // set table properties without explicitly specifying database
     catalog.setCurrentDatabase("dbx")
     sql("ALTER TABLE tab1 SET TBLPROPERTIES ('kor' = 'belle', 'kar' = 'bol')")
-    assert(getProps == Map("andrew" -> "or14", "kor" -> "belle", "kar" -> "bol"))
+    assert(
+      Map("andrew" -> "or14", "kor" -> "belle", "kar" -> "bol").toSet.subsetOf(getProps.toSet))
     // table to alter does not exist
     intercept[AnalysisException] {
       sql("ALTER TABLE does_not_exist SET TBLPROPERTIES ('winner' = 'loser')")
@@ -1104,11 +1105,13 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach {
     // unset table properties
     sql("ALTER TABLE dbx.tab1 SET TBLPROPERTIES ('j' = 'am', 'p' = 'an', 'c' = 'lan', 'x' = 'y')")
     sql("ALTER TABLE dbx.tab1 UNSET TBLPROPERTIES ('j')")
-    assert(getProps == Map("p" -> "an", "c" -> "lan", "x" -> "y"))
+    assert(Map("p" -> "an", "c" -> "lan", "x" -> "y").toSet.subsetOf(getProps.toSet))
+    assert(!getProps.contains("j"))
     // unset table properties without explicitly specifying database
     catalog.setCurrentDatabase("dbx")
     sql("ALTER TABLE tab1 UNSET TBLPROPERTIES ('p')")
-    assert(getProps == Map("c" -> "lan", "x" -> "y"))
+    assert(Map("c" -> "lan", "x" -> "y").toSet.subsetOf(getProps.toSet))
+    assert(!getProps.contains("p"))
     // table to alter does not exist
     intercept[AnalysisException] {
       sql("ALTER TABLE does_not_exist UNSET TBLPROPERTIES ('c' = 'lan')")
@@ -1120,7 +1123,8 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach {
     assert(e.getMessage.contains("xyz"))
     // property to unset does not exist, but "IF EXISTS" is specified
     sql("ALTER TABLE tab1 UNSET TBLPROPERTIES IF EXISTS ('c', 'xyz')")
-    assert(getProps == Map("x" -> "y"))
+    assert(Map("x" -> "y").toSet.subsetOf(getProps.toSet))
+    assert(!getProps.contains("c"))
   }
 
   private def testSetLocation(isDatasourceTable: Boolean): Unit = {
@@ -1620,12 +1624,16 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach {
       }
     }
 
-    // truncating partitioned data source tables is not supported
     withTable("rectangles", "rectangles2") {
       data.write.saveAsTable("rectangles")
       data.write.partitionBy("length").saveAsTable("rectangles2")
+
+      // not supported since the table is not partitioned
       assertUnsupported("TRUNCATE TABLE rectangles PARTITION (width=1)")
-      assertUnsupported("TRUNCATE TABLE rectangles2 PARTITION (width=1)")
+
+      // supported since partitions are stored in the metastore
+      spark.sql("TRUNCATE TABLE rectangles2 PARTITION (width=1)")
+      assert(spark.table("rectangles2").collect().isEmpty)
     }
   }
 
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
index 7f140589e6956..cb9947a697c7a 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
@@ -38,6 +38,7 @@ import org.apache.spark.sql.execution.command.{ColumnStatStruct, DDLUtils}
 import org.apache.spark.sql.execution.datasources.CaseInsensitiveMap
 import org.apache.spark.sql.hive.client.HiveClient
 import org.apache.spark.sql.internal.HiveSerDe
+import org.apache.spark.sql.internal.SQLConf._
 import org.apache.spark.sql.internal.StaticSQLConf._
 import org.apache.spark.sql.types.{DataType, StructField, StructType}
 
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala
index 2ff06cb23cbac..e8907f078a0ca 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala
@@ -57,13 +57,7 @@ class PartitionProviderCompatibilitySuite
 
   private def verifyIsNewTable(tableName: String): Unit = {
     withSQLConf(SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "true") {
-      assert(spark.sql(s"show partitions $tableName").count() == 5)
-      HiveCatalogMetrics.reset()
-
-      // sanity check table performance
-      assert(spark.sql(s"select * from $tableName where partcol1 < 2").count() == 2)
-      assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 2)
-      assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 2)
+      spark.sql(s"show partitions $tableName").count()  // check does not throw
     }
   }
 
@@ -72,11 +66,18 @@ class PartitionProviderCompatibilitySuite
       withTempDir { dir =>
         withSQLConf(SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "false") {
           setupPartitionedDatasourceTable("test", dir)
+          assert(spark.sql("select * from test").count() == 5)
         }
         withSQLConf(SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "true") {
           verifyIsLegacyTable("test")
           spark.sql("msck repair table test")
           verifyIsNewTable("test")
+
+          // sanity check table performance
+          HiveCatalogMetrics.reset()
+          assert(spark.sql("select * from test where partcol1 < 2").count() == 2)
+          assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 2)
+          assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 2)
         }
       }
     }
@@ -88,6 +89,9 @@ class PartitionProviderCompatibilitySuite
         withSQLConf(SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "true") {
           setupPartitionedDatasourceTable("test", dir)
           verifyIsNewTable("test")
+          assert(spark.sql("select * from test").count() == 0)  // needs repair
+          spark.sql("msck repair table test")
+          assert(spark.sql("select * from test").count() == 5)
         }
       }
     }
@@ -99,6 +103,7 @@ class PartitionProviderCompatibilitySuite
         withSQLConf(SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "false") {
           setupPartitionedDatasourceTable("test", dir)
           verifyIsLegacyTable("test")
+          assert(spark.sql("select * from test").count() == 5)
         }
       }
     }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala
index 6edc6cb56b9c0..30aa99be43a77 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala
@@ -81,6 +81,7 @@ class PartitionedTablePerfStatsSuite
       |using parquet
       |options (path "${dir.getAbsolutePath}")
       |partitioned by (partCol1, partCol2)""".stripMargin)
+    spark.sql(s"msck repair table $tableName")
   }
 
   genericTest("partitioned pruned table reports only selected files") { spec =>
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
index 8d10a7d73a948..ca3a9bfd33b59 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
@@ -396,10 +396,8 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
         .range(1).select('id as 'a, 'id as 'b, 'id as 'c, 'id as 'd).write
         .partitionBy("d")
         .saveAsTable("datasource_table")
-      val m4 = intercept[AnalysisException] {
-        sql("DESC datasource_table PARTITION (d=2)")
-      }.getMessage()
-      assert(m4.contains("DESC PARTITION is not allowed on a datasource table"))
+
+      sql("DESC datasource_table PARTITION (d=0)")
 
       val m5 = intercept[AnalysisException] {
         spark.range(10).select('id as 'a, 'id as 'b).createTempView("view1")

From 2e69cabafe9cc7d8bfdb224bee9d24d811cf94c4 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Fri, 21 Oct 2016 10:31:56 -0700
Subject: [PATCH 75/99] cleanup

---
 .../datasources/FileStatusCache.scala         | 24 +++++++++----------
 .../datasources/TableFileCatalog.scala        |  2 +-
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala
index bf1cd5ec4bdc7..dafd921f66535 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala
@@ -56,16 +56,13 @@ abstract class FileStatusCache {
 }
 
 object FileStatusCache {
-  // Opaque object that uniquely identifies a shared cache user
-  type ClientId = Object
-
   private var sharedCache: SharedInMemoryCache = null
 
   /**
-   * @return a cache for the specified client, sized based on session configuration. Cache
-   *         resources are shared across all clients.
+   * @return a new FileStatusCache sized based on session configuration. Cache memory quota is
+   *         shared across all clients.
    */
-  def getOrInitializeShared(clientId: ClientId, session: SparkSession): FileStatusCache = {
+  def newCache(session: SparkSession): FileStatusCache = {
     synchronized {
       if (session.sqlContext.conf.filesourcePartitionPruning &&
           session.sqlContext.conf.filesourcePartitionFileCacheSize > 0) {
@@ -73,7 +70,7 @@ object FileStatusCache {
           sharedCache = new SharedInMemoryCache(
             session.sqlContext.conf.filesourcePartitionFileCacheSize)
         }
-        sharedCache.getForClient(clientId)
+        sharedCache.getForNewClient()
       } else {
         NoopCache
       }
@@ -93,6 +90,9 @@ object FileStatusCache {
 private class SharedInMemoryCache(maxSizeInBytes: Long) extends Logging {
   import FileStatusCache._
 
+  // Opaque object that uniquely identifies a shared cache user
+  private type ClientId = Object
+
   private val warnedAboutEviction = new AtomicBoolean(false)
 
   // we use a composite cache key in order to distinguish entries inserted by different clients
@@ -115,12 +115,12 @@ private class SharedInMemoryCache(maxSizeInBytes: Long) extends Logging {
     .build()
 
   /**
-   * @param clientId object that uniquely identifies this client. Cache entries are isolated
-   *                 across clients, but cache resources are shared across all clients.
-   *
-   * @return a FileStatusCache for the specified client
+   * @return a FileStatusCache that does not share any entries with any other client, but does
+   *         shared memory resources for the purpose of cache eviction.
    */
-  def getForClient(clientId: ClientId): FileStatusCache = new FileStatusCache {
+  def getForNewClient(): FileStatusCache = new FileStatusCache {
+    val clientId = new Object()
+
     override def getLeafFiles(path: Path): Option[Array[FileStatus]] = {
       Option(cache.getIfPresent((clientId, path)))
     }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
index c897bf3589158..31a01bc6db082 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala
@@ -43,7 +43,7 @@ class TableFileCatalog(
 
   protected val hadoopConf = sparkSession.sessionState.newHadoopConf
 
-  private val fileStatusCache = FileStatusCache.getOrInitializeShared(new Object(), sparkSession)
+  private val fileStatusCache = FileStatusCache.newCache(sparkSession)
 
   private val externalCatalog = sparkSession.sharedState.externalCatalog
 

From 2a965377258d4d77db5a3f00d4257bbacc4a0adb Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Fri, 21 Oct 2016 13:06:03 -0700
Subject: [PATCH 76/99] Fix wording in FileStatusCache doc comments

---
 .../spark/sql/execution/datasources/FileStatusCache.scala     | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala
index dafd921f66535..e0ec748a0b34d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala
@@ -59,7 +59,7 @@ object FileStatusCache {
   private var sharedCache: SharedInMemoryCache = null
 
   /**
-   * @return a new FileStatusCache sized based on session configuration. Cache memory quota is
+   * @return a new FileStatusCache based on session configuration. Cache memory quota is
    *         shared across all clients.
    */
   def newCache(session: SparkSession): FileStatusCache = {
@@ -116,7 +116,7 @@ private class SharedInMemoryCache(maxSizeInBytes: Long) extends Logging {
 
   /**
    * @return a FileStatusCache that does not share any entries with any other client, but does
-   *         shared memory resources for the purpose of cache eviction.
+   *         share memory resources for the purpose of cache eviction.
    */
   def getForNewClient(): FileStatusCache = new FileStatusCache {
     val clientId = new Object()

From 0d3b07453825570236652e36a7f789ac44046e08 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Fri, 21 Oct 2016 14:24:22 -0700
Subject: [PATCH 77/99] Use HIVE_FILESOURCE_PARTITION_MANAGEMENT conf in HiveMetadataCacheSuite

---
 .../org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala
index d290fe9962db2..069f01e7aca47 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala
@@ -63,7 +63,7 @@ class HiveMetadataCacheSuite extends QueryTest with SQLTestUtils with TestHiveSi
 
   def testCaching(pruningEnabled: Boolean): Unit = {
     test(s"partitioned table is cached when partition pruning is $pruningEnabled") {
-      withSQLConf(SQLConf.HIVE_FILESOURCE_PARTITION_PRUNING.key -> pruningEnabled.toString) {
+      withSQLConf(SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> pruningEnabled.toString) {
         withTable("test") {
           withTempDir { dir =>
             spark.range(5).selectExpr("id", "id as f1", "id as f2").write

From 390c2dbf1925f28764735f5054218de5ebe5fc29 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Fri, 21 Oct 2016 14:25:46 -0700
Subject: [PATCH 78/99] conf name change

---
 .../apache/spark/sql/DataFrameWriter.scala    |   2 +-
 .../command/createDataSourceTables.scala      |   4 +-
 .../execution/datasources/DataSource.scala    |   2 +-
 .../datasources/DataSourceStrategy.scala      |   2 +-
 .../datasources/FileStatusCache.scala         |   2 +-
 .../apache/spark/sql/internal/SQLConf.scala   |   6 +-
 .../spark/sql/hive/HiveMetastoreCatalog.scala |   2 +-
 .../sql/hive/HiveMetadataCacheSuite.scala     |   2 +-
 .../PartitionProviderCompatibilitySuite.scala |  10 +-
 .../hive/PartitionedTablePerfStatsSuite.scala |  12 +-
 t                                             | 227 ++++++++++++++++++
 11 files changed, 249 insertions(+), 22 deletions(-)
 create mode 100644 t

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
index d9a21141da90d..7529798c3d61f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
@@ -390,7 +390,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
         )
         val createCmd = CreateTable(tableDesc, mode, Some(df.logicalPlan))
         val cmd = if (tableDesc.partitionColumnNames.nonEmpty &&
-            df.sparkSession.sqlContext.conf.filesourcePartitionManagement) {
+            df.sparkSession.sqlContext.conf.manageFilesourcePartitions) {
           val recoverPartitionCmd = AlterTableRecoverPartitionsCommand(tableDesc.identifier)
           Union(createCmd, recoverPartitionCmd)
         } else {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
index 288ca3777a185..b04a553450f26 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
@@ -92,7 +92,7 @@ case class CreateDataSourceTableCommand(table: CatalogTable, ignoreIfExists: Boo
     }
 
     val newProps = if (partitionColumnNames.nonEmpty &&
-        sparkSession.sqlContext.conf.filesourcePartitionManagement) {
+        sparkSession.sqlContext.conf.manageFilesourcePartitions) {
       table.properties ++
         Map(CatalogTable.PARTITION_PROVIDER_KEY -> CatalogTable.PARTITION_PROVIDER_HIVE)
     } else {
@@ -244,7 +244,7 @@ case class CreateDataSourceTableAsSelectCommand(
 
     result match {
       case fs: HadoopFsRelation if table.partitionColumnNames.nonEmpty &&
-          sparkSession.sqlContext.conf.filesourcePartitionManagement =>
+          sparkSession.sqlContext.conf.manageFilesourcePartitions =>
         sparkSession.sessionState.executePlan(
           AlterTableRecoverPartitionsCommand(table.identifier)).toRdd
       case _ =>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala
index 176a8a33edd61..18f749604a727 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala
@@ -405,7 +405,7 @@ case class DataSource(
             })
         }
 
-        val fileCatalog = if (sparkSession.sqlContext.conf.filesourcePartitionManagement &&
+        val fileCatalog = if (sparkSession.sqlContext.conf.manageFilesourcePartitions &&
             catalogTable.isDefined && catalogTable.get.partitionProviderIsHive) {
           new TableFileCatalog(
             sparkSession,
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
index 0e97b7854f0c5..2f8941f077739 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
@@ -190,7 +190,7 @@ case class DataSourceAnalysis(conf: CatalystConf) extends Rule[LogicalPlan] {
         mode)
 
       if (l.catalogTable.isDefined && l.catalogTable.get.partitionColumnNames.nonEmpty &&
-          t.sparkSession.sqlContext.conf.filesourcePartitionManagement) {
+          t.sparkSession.sqlContext.conf.manageFilesourcePartitions) {
         val recoverPartitionCmd = AlterTableRecoverPartitionsCommand(l.catalogTable.get.identifier)
         Union(insertCmd, recoverPartitionCmd)
       } else {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala
index da48e6162fc37..7c2e6fd04d5db 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala
@@ -64,7 +64,7 @@ object FileStatusCache {
    */
   def newCache(session: SparkSession): FileStatusCache = {
     synchronized {
-      if (session.sqlContext.conf.filesourcePartitionManagement &&
+      if (session.sqlContext.conf.manageFilesourcePartitions &&
           session.sqlContext.conf.filesourcePartitionFileCacheSize > 0) {
         if (sharedCache == null) {
           sharedCache = new SharedInMemoryCache(
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 1d5bbcd30ac21..1b913bf23964f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -271,8 +271,8 @@ object SQLConf {
       .booleanConf
       .createWithDefault(true)
 
-  val HIVE_FILESOURCE_PARTITION_MANAGEMENT =
-    SQLConfigBuilder("spark.sql.hive.filesourcePartitionManagement")
+  val HIVE_MANAGE_FILESOURCE_PARTITIONS =
+    SQLConfigBuilder("spark.sql.hive.manageFilesourcePartitions")
       .doc("When true, enable metastore partition management for file source tables as well. " +
            "This includes both datasource and converted Hive tables. This also controls whether " +
            "datasource tables will automatically store partition metadata in the Hive metastore.")
@@ -679,7 +679,7 @@ private[sql] class SQLConf extends Serializable with CatalystConf with Logging {
 
   def metastorePartitionPruning: Boolean = getConf(HIVE_METASTORE_PARTITION_PRUNING)
 
-  def filesourcePartitionManagement: Boolean = getConf(HIVE_FILESOURCE_PARTITION_MANAGEMENT)
+  def manageFilesourcePartitions: Boolean = getConf(HIVE_MANAGE_FILESOURCE_PARTITIONS)
 
   def filesourcePartitionFileCacheSize: Long = getConf(HIVE_FILESOURCE_PARTITION_FILE_CACHE_SIZE)
 
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index c704df4caabdd..817df470e9a68 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -193,7 +193,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
       QualifiedTableName(metastoreRelation.databaseName, metastoreRelation.tableName)
     val bucketSpec = None  // We don't support hive bucketed tables, only ones we write out.
 
-    val lazyPruningEnabled = sparkSession.sqlContext.conf.filesourcePartitionManagement
+    val lazyPruningEnabled = sparkSession.sqlContext.conf.manageFilesourcePartitions
     val result = if (metastoreRelation.hiveQlTable.isPartitioned) {
       val partitionSchema = StructType.fromAttributes(metastoreRelation.partitionKeys)
 
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala
index 069f01e7aca47..6e887d95c0f09 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala
@@ -63,7 +63,7 @@ class HiveMetadataCacheSuite extends QueryTest with SQLTestUtils with TestHiveSi
 
   def testCaching(pruningEnabled: Boolean): Unit = {
     test(s"partitioned table is cached when partition pruning is $pruningEnabled") {
-      withSQLConf(SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> pruningEnabled.toString) {
+      withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> pruningEnabled.toString) {
         withTable("test") {
           withTempDir { dir =>
             spark.range(5).selectExpr("id", "id as f1", "id as f2").write
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala
index e8907f078a0ca..22563415d0a3a 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala
@@ -56,7 +56,7 @@ class PartitionProviderCompatibilitySuite
   }
 
   private def verifyIsNewTable(tableName: String): Unit = {
-    withSQLConf(SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "true") {
+    withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true") {
       spark.sql(s"show partitions $tableName").count()  // check does not throw
     }
   }
@@ -64,11 +64,11 @@ class PartitionProviderCompatibilitySuite
   test("convert partition provider to hive with repair table") {
     withTable("test") {
       withTempDir { dir =>
-        withSQLConf(SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "false") {
+        withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "false") {
           setupPartitionedDatasourceTable("test", dir)
           assert(spark.sql("select * from test").count() == 5)
         }
-        withSQLConf(SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "true") {
+        withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true") {
           verifyIsLegacyTable("test")
           spark.sql("msck repair table test")
           verifyIsNewTable("test")
@@ -86,7 +86,7 @@ class PartitionProviderCompatibilitySuite
   test("when partition management is enabled, new tables have partition provider hive") {
     withTable("test") {
       withTempDir { dir =>
-        withSQLConf(SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "true") {
+        withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true") {
           setupPartitionedDatasourceTable("test", dir)
           verifyIsNewTable("test")
           assert(spark.sql("select * from test").count() == 0)  // needs repair
@@ -100,7 +100,7 @@ class PartitionProviderCompatibilitySuite
   test("when partition management is disabled, new tables have no partition provider") {
     withTable("test") {
       withTempDir { dir =>
-        withSQLConf(SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "false") {
+        withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "false") {
           setupPartitionedDatasourceTable("test", dir)
           verifyIsLegacyTable("test")
           assert(spark.sql("select * from test").count() == 5)
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala
index 5fbf105a5c99d..e679bad462295 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala
@@ -110,7 +110,7 @@ class PartitionedTablePerfStatsSuite
 
   genericTest("lazy partition pruning reads only necessary partition data") { spec =>
     withSQLConf(
-        SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "true",
+        SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true",
         SQLConf.HIVE_FILESOURCE_PARTITION_FILE_CACHE_SIZE.key -> "0") {
       withTable("test") {
         withTempDir { dir =>
@@ -151,7 +151,7 @@ class PartitionedTablePerfStatsSuite
 
   genericTest("lazy partition pruning with file status caching enabled") { spec =>
     withSQLConf(
-        SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "true",
+        SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true",
         SQLConf.HIVE_FILESOURCE_PARTITION_FILE_CACHE_SIZE.key -> "9999999") {
       withTable("test") {
         withTempDir { dir =>
@@ -192,7 +192,7 @@ class PartitionedTablePerfStatsSuite
 
   genericTest("file status caching respects refresh table and refreshByPath") { spec =>
     withSQLConf(
-        SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "true",
+        SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true",
         SQLConf.HIVE_FILESOURCE_PARTITION_FILE_CACHE_SIZE.key -> "9999999") {
       withTable("test") {
         withTempDir { dir =>
@@ -221,7 +221,7 @@ class PartitionedTablePerfStatsSuite
 
   genericTest("file status cache respects size limit") { spec =>
     withSQLConf(
-        SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "true",
+        SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true",
         SQLConf.HIVE_FILESOURCE_PARTITION_FILE_CACHE_SIZE.key -> "1" /* 1 byte */) {
       withTable("test") {
         withTempDir { dir =>
@@ -239,7 +239,7 @@ class PartitionedTablePerfStatsSuite
   }
 
   test("hive table: files read and cached when filesource partition management is off") {
-    withSQLConf(SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "false") {
+    withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "false") {
       withTable("test") {
         withTempDir { dir =>
           setupPartitionedHiveTable("test", dir)
@@ -268,7 +268,7 @@ class PartitionedTablePerfStatsSuite
   }
 
   test("datasource table: all partition data cached in memory when partition management is off") {
-    withSQLConf(SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "false") {
+    withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "false") {
       withTable("test") {
         withTempDir { dir =>
           setupPartitionedDatasourceTable("test", dir)
diff --git a/t b/t
new file mode 100644
index 0000000000000..3956d81ba3816
--- /dev/null
+++ b/t
@@ -0,0 +1,227 @@
+[1mdiff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala[m
+[1mindex d9a2114..7529798 100644[m
+[1m--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala[m
+[1m+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala[m
+[36m@@ -390,7 +390,7 @@[m [mfinal class DataFrameWriter[T] private[sql](ds: Dataset[T]) {[m
+         )[m
+         val createCmd = CreateTable(tableDesc, mode, Some(df.logicalPlan))[m
+         val cmd = if (tableDesc.partitionColumnNames.nonEmpty &&[m
+[31m-            df.sparkSession.sqlContext.conf.filesourcePartitionManagement) {[m
+[32m+[m[32m            df.sparkSession.sqlContext.conf.manageFilesourcePartitions) {[m
+           val recoverPartitionCmd = AlterTableRecoverPartitionsCommand(tableDesc.identifier)[m
+           Union(createCmd, recoverPartitionCmd)[m
+         } else {[m
+[1mdiff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala[m
+[1mindex 288ca37..b04a553 100644[m
+[1m--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala[m
+[1m+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala[m
+[36m@@ -92,7 +92,7 @@[m [mcase class CreateDataSourceTableCommand(table: CatalogTable, ignoreIfExists: Boo[m
+     }[m
+ [m
+     val newProps = if (partitionColumnNames.nonEmpty &&[m
+[31m-        sparkSession.sqlContext.conf.filesourcePartitionManagement) {[m
+[32m+[m[32m        sparkSession.sqlContext.conf.manageFilesourcePartitions) {[m
+       table.properties ++[m
+         Map(CatalogTable.PARTITION_PROVIDER_KEY -> CatalogTable.PARTITION_PROVIDER_HIVE)[m
+     } else {[m
+[36m@@ -244,7 +244,7 @@[m [mcase class CreateDataSourceTableAsSelectCommand([m
+ [m
+     result match {[m
+       case fs: HadoopFsRelation if table.partitionColumnNames.nonEmpty &&[m
+[31m-          sparkSession.sqlContext.conf.filesourcePartitionManagement =>[m
+[32m+[m[32m          sparkSession.sqlContext.conf.manageFilesourcePartitions =>[m
+         sparkSession.sessionState.executePlan([m
+           AlterTableRecoverPartitionsCommand(table.identifier)).toRdd[m
+       case _ =>[m
+[1mdiff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala[m
+[1mindex 176a8a3..18f7496 100644[m
+[1m--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala[m
+[1m+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala[m
+[36m@@ -405,7 +405,7 @@[m [mcase class DataSource([m
+             })[m
+         }[m
+ [m
+[31m-        val fileCatalog = if (sparkSession.sqlContext.conf.filesourcePartitionManagement &&[m
+[32m+[m[32m        val fileCatalog = if (sparkSession.sqlContext.conf.manageFilesourcePartitions &&[m
+             catalogTable.isDefined && catalogTable.get.partitionProviderIsHive) {[m
+           new TableFileCatalog([m
+             sparkSession,[m
+[1mdiff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala[m
+[1mindex 0e97b78..2f8941f 100644[m
+[1m--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala[m
+[1m+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala[m
+[36m@@ -190,7 +190,7 @@[m [mcase class DataSourceAnalysis(conf: CatalystConf) extends Rule[LogicalPlan] {[m
+         mode)[m
+ [m
+       if (l.catalogTable.isDefined && l.catalogTable.get.partitionColumnNames.nonEmpty &&[m
+[31m-          t.sparkSession.sqlContext.conf.filesourcePartitionManagement) {[m
+[32m+[m[32m          t.sparkSession.sqlContext.conf.manageFilesourcePartitions) {[m
+         val recoverPartitionCmd = AlterTableRecoverPartitionsCommand(l.catalogTable.get.identifier)[m
+         Union(insertCmd, recoverPartitionCmd)[m
+       } else {[m
+[1mdiff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala[m
+[1mindex da48e61..7c2e6fd 100644[m
+[1m--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala[m
+[1m+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala[m
+[36m@@ -64,7 +64,7 @@[m [mobject FileStatusCache {[m
+    */[m
+   def newCache(session: SparkSession): FileStatusCache = {[m
+     synchronized {[m
+[31m-      if (session.sqlContext.conf.filesourcePartitionManagement &&[m
+[32m+[m[32m      if (session.sqlContext.conf.manageFilesourcePartitions &&[m
+           session.sqlContext.conf.filesourcePartitionFileCacheSize > 0) {[m
+         if (sharedCache == null) {[m
+           sharedCache = new SharedInMemoryCache([m
+[1mdiff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala[m
+[1mindex 1d5bbcd..1b913bf 100644[m
+[1m--- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala[m
+[1m+++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala[m
+[36m@@ -271,8 +271,8 @@[m [mobject SQLConf {[m
+       .booleanConf[m
+       .createWithDefault(true)[m
+ [m
+[31m-  val HIVE_FILESOURCE_PARTITION_MANAGEMENT =[m
+[31m-    SQLConfigBuilder("spark.sql.hive.filesourcePartitionManagement")[m
+[32m+[m[32m  val HIVE_MANAGE_FILESOURCE_PARTITIONS =[m
+[32m+[m[32m    SQLConfigBuilder("spark.sql.hive.manageFilesourcePartitions")[m
+       .doc("When true, enable metastore partition management for file source tables as well. " +[m
+            "This includes both datasource and converted Hive tables. This also controls whether " +[m
+            "datasource tables will automatically store partition metadata in the Hive metastore.")[m
+[36m@@ -679,7 +679,7 @@[m [mprivate[sql] class SQLConf extends Serializable with CatalystConf with Logging {[m
+ [m
+   def metastorePartitionPruning: Boolean = getConf(HIVE_METASTORE_PARTITION_PRUNING)[m
+ [m
+[31m-  def filesourcePartitionManagement: Boolean = getConf(HIVE_FILESOURCE_PARTITION_MANAGEMENT)[m
+[32m+[m[32m  def manageFilesourcePartitions: Boolean = getConf(HIVE_MANAGE_FILESOURCE_PARTITIONS)[m
+ [m
+   def filesourcePartitionFileCacheSize: Long = getConf(HIVE_FILESOURCE_PARTITION_FILE_CACHE_SIZE)[m
+ [m
+[1mdiff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala[m
+[1mindex c704df4..817df47 100644[m
+[1m--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala[m
+[1m+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala[m
+[36m@@ -193,7 +193,7 @@[m [mprivate[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log[m
+       QualifiedTableName(metastoreRelation.databaseName, metastoreRelation.tableName)[m
+     val bucketSpec = None  // We don't support hive bucketed tables, only ones we write out.[m
+ [m
+[31m-    val lazyPruningEnabled = sparkSession.sqlContext.conf.filesourcePartitionManagement[m
+[32m+[m[32m    val lazyPruningEnabled = sparkSession.sqlContext.conf.manageFilesourcePartitions[m
+     val result = if (metastoreRelation.hiveQlTable.isPartitioned) {[m
+       val partitionSchema = StructType.fromAttributes(metastoreRelation.partitionKeys)[m
+ [m
+[1mdiff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala[m
+[1mindex 069f01e..6e887d9 100644[m
+[1m--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala[m
+[1m+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala[m
+[36m@@ -63,7 +63,7 @@[m [mclass HiveMetadataCacheSuite extends QueryTest with SQLTestUtils with TestHiveSi[m
+ [m
+   def testCaching(pruningEnabled: Boolean): Unit = {[m
+     test(s"partitioned table is cached when partition pruning is $pruningEnabled") {[m
+[31m-      withSQLConf(SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> pruningEnabled.toString) {[m
+[32m+[m[32m      withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> pruningEnabled.toString) {[m
+         withTable("test") {[m
+           withTempDir { dir =>[m
+             spark.range(5).selectExpr("id", "id as f1", "id as f2").write[m
+[1mdiff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala[m
+[1mindex e8907f0..2256341 100644[m
+[1m--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala[m
+[1m+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala[m
+[36m@@ -56,7 +56,7 @@[m [mclass PartitionProviderCompatibilitySuite[m
+   }[m
+ [m
+   private def verifyIsNewTable(tableName: String): Unit = {[m
+[31m-    withSQLConf(SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "true") {[m
+[32m+[m[32m    withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true") {[m
+       spark.sql(s"show partitions $tableName").count()  // check does not throw[m
+     }[m
+   }[m
+[36m@@ -64,11 +64,11 @@[m [mclass PartitionProviderCompatibilitySuite[m
+   test("convert partition provider to hive with repair table") {[m
+     withTable("test") {[m
+       withTempDir { dir =>[m
+[31m-        withSQLConf(SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "false") {[m
+[32m+[m[32m        withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "false") {[m
+           setupPartitionedDatasourceTable("test", dir)[m
+           assert(spark.sql("select * from test").count() == 5)[m
+         }[m
+[31m-        withSQLConf(SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "true") {[m
+[32m+[m[32m        withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true") {[m
+           verifyIsLegacyTable("test")[m
+           spark.sql("msck repair table test")[m
+           verifyIsNewTable("test")[m
+[36m@@ -86,7 +86,7 @@[m [mclass PartitionProviderCompatibilitySuite[m
+   test("when partition management is enabled, new tables have partition provider hive") {[m
+     withTable("test") {[m
+       withTempDir { dir =>[m
+[31m-        withSQLConf(SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "true") {[m
+[32m+[m[32m        withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true") {[m
+           setupPartitionedDatasourceTable("test", dir)[m
+           verifyIsNewTable("test")[m
+           assert(spark.sql("select * from test").count() == 0)  // needs repair[m
+[36m@@ -100,7 +100,7 @@[m [mclass PartitionProviderCompatibilitySuite[m
+   test("when partition management is disabled, new tables have no partition provider") {[m
+     withTable("test") {[m
+       withTempDir { dir =>[m
+[31m-        withSQLConf(SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "false") {[m
+[32m+[m[32m        withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "false") {[m
+           setupPartitionedDatasourceTable("test", dir)[m
+           verifyIsLegacyTable("test")[m
+           assert(spark.sql("select * from test").count() == 5)[m
+[1mdiff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala[m
+[1mindex 5fbf105..e679bad 100644[m
+[1m--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala[m
+[1m+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala[m
+[36m@@ -110,7 +110,7 @@[m [mclass PartitionedTablePerfStatsSuite[m
+ [m
+   genericTest("lazy partition pruning reads only necessary partition data") { spec =>[m
+     withSQLConf([m
+[31m-        SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "true",[m
+[32m+[m[32m        SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true",[m
+         SQLConf.HIVE_FILESOURCE_PARTITION_FILE_CACHE_SIZE.key -> "0") {[m
+       withTable("test") {[m
+         withTempDir { dir =>[m
+[36m@@ -151,7 +151,7 @@[m [mclass PartitionedTablePerfStatsSuite[m
+ [m
+   genericTest("lazy partition pruning with file status caching enabled") { spec =>[m
+     withSQLConf([m
+[31m-        SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "true",[m
+[32m+[m[32m        SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true",[m
+         SQLConf.HIVE_FILESOURCE_PARTITION_FILE_CACHE_SIZE.key -> "9999999") {[m
+       withTable("test") {[m
+         withTempDir { dir =>[m
+[36m@@ -192,7 +192,7 @@[m [mclass PartitionedTablePerfStatsSuite[m
+ [m
+   genericTest("file status caching respects refresh table and refreshByPath") { spec =>[m
+     withSQLConf([m
+[31m-        SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "true",[m
+[32m+[m[32m        SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true",[m
+         SQLConf.HIVE_FILESOURCE_PARTITION_FILE_CACHE_SIZE.key -> "9999999") {[m
+       withTable("test") {[m
+         withTempDir { dir =>[m
+[36m@@ -221,7 +221,7 @@[m [mclass PartitionedTablePerfStatsSuite[m
+ [m
+   genericTest("file status cache respects size limit") { spec =>[m
+     withSQLConf([m
+[31m-        SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "true",[m
+[32m+[m[32m        SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true",[m
+         SQLConf.HIVE_FILESOURCE_PARTITION_FILE_CACHE_SIZE.key -> "1" /* 1 byte */) {[m
+       withTable("test") {[m
+         withTempDir { dir =>[m
+[36m@@ -239,7 +239,7 @@[m [mclass PartitionedTablePerfStatsSuite[m
+   }[m
+ [m
+   test("hive table: files read and cached when filesource partition management is off") {[m
+[31m-    withSQLConf(SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "false") {[m
+[32m+[m[32m    withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "false") {[m
+       withTable("test") {[m
+         withTempDir { dir =>[m
+           setupPartitionedHiveTable("test", dir)[m
+[36m@@ -268,7 +268,7 @@[m [mclass PartitionedTablePerfStatsSuite[m
+   }[m
+ [m
+   test("datasource table: all partition data cached in memory when partition management is off") {[m
+[31m-    withSQLConf(SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "false") {[m
+[32m+[m[32m    withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "false") {[m
+       withTable("test") {[m
+         withTempDir { dir =>[m
+           setupPartitionedDatasourceTable("test", dir)[m

From 429451010da5bf2bf343ec89a605ee7ed7726473 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Fri, 21 Oct 2016 14:29:00 -0700
Subject: [PATCH 79/99] fix compile

---
 .../sql/hive/execution/PruneFileSourcePartitionsSuite.scala     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruneFileSourcePartitionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruneFileSourcePartitionsSuite.scala
index 346ea0ca4367e..f2b331fa47712 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruneFileSourcePartitionsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruneFileSourcePartitionsSuite.scala
@@ -49,7 +49,7 @@ class PruneFileSourcePartitionsSuite extends QueryTest with SQLTestUtils with Te
           spark,
           tableMeta.database,
           tableMeta.identifier.table,
-          Some(tableMeta.partitionSchema),
+          tableMeta.partitionSchema,
           0)
 
         val dataSchema = StructType(tableMeta.schema.filterNot { f =>

From de6c00da42c7a4f663286f2ab98fc9cd4989777e Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Fri, 21 Oct 2016 14:36:12 -0700
Subject: [PATCH 80/99] Fri Oct 21 14:36:12 PDT 2016

---
 t | 227 --------------------------------------------------------------
 1 file changed, 227 deletions(-)
 delete mode 100644 t

diff --git a/t b/t
deleted file mode 100644
index 3956d81ba3816..0000000000000
--- a/t
+++ /dev/null
@@ -1,227 +0,0 @@
-[1mdiff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala[m
-[1mindex d9a2114..7529798 100644[m
-[1m--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala[m
-[1m+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala[m
-[36m@@ -390,7 +390,7 @@[m [mfinal class DataFrameWriter[T] private[sql](ds: Dataset[T]) {[m
-         )[m
-         val createCmd = CreateTable(tableDesc, mode, Some(df.logicalPlan))[m
-         val cmd = if (tableDesc.partitionColumnNames.nonEmpty &&[m
-[31m-            df.sparkSession.sqlContext.conf.filesourcePartitionManagement) {[m
-[32m+[m[32m            df.sparkSession.sqlContext.conf.manageFilesourcePartitions) {[m
-           val recoverPartitionCmd = AlterTableRecoverPartitionsCommand(tableDesc.identifier)[m
-           Union(createCmd, recoverPartitionCmd)[m
-         } else {[m
-[1mdiff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala[m
-[1mindex 288ca37..b04a553 100644[m
-[1m--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala[m
-[1m+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala[m
-[36m@@ -92,7 +92,7 @@[m [mcase class CreateDataSourceTableCommand(table: CatalogTable, ignoreIfExists: Boo[m
-     }[m
- [m
-     val newProps = if (partitionColumnNames.nonEmpty &&[m
-[31m-        sparkSession.sqlContext.conf.filesourcePartitionManagement) {[m
-[32m+[m[32m        sparkSession.sqlContext.conf.manageFilesourcePartitions) {[m
-       table.properties ++[m
-         Map(CatalogTable.PARTITION_PROVIDER_KEY -> CatalogTable.PARTITION_PROVIDER_HIVE)[m
-     } else {[m
-[36m@@ -244,7 +244,7 @@[m [mcase class CreateDataSourceTableAsSelectCommand([m
- [m
-     result match {[m
-       case fs: HadoopFsRelation if table.partitionColumnNames.nonEmpty &&[m
-[31m-          sparkSession.sqlContext.conf.filesourcePartitionManagement =>[m
-[32m+[m[32m          sparkSession.sqlContext.conf.manageFilesourcePartitions =>[m
-         sparkSession.sessionState.executePlan([m
-           AlterTableRecoverPartitionsCommand(table.identifier)).toRdd[m
-       case _ =>[m
-[1mdiff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala[m
-[1mindex 176a8a3..18f7496 100644[m
-[1m--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala[m
-[1m+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala[m
-[36m@@ -405,7 +405,7 @@[m [mcase class DataSource([m
-             })[m
-         }[m
- [m
-[31m-        val fileCatalog = if (sparkSession.sqlContext.conf.filesourcePartitionManagement &&[m
-[32m+[m[32m        val fileCatalog = if (sparkSession.sqlContext.conf.manageFilesourcePartitions &&[m
-             catalogTable.isDefined && catalogTable.get.partitionProviderIsHive) {[m
-           new TableFileCatalog([m
-             sparkSession,[m
-[1mdiff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala[m
-[1mindex 0e97b78..2f8941f 100644[m
-[1m--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala[m
-[1m+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala[m
-[36m@@ -190,7 +190,7 @@[m [mcase class DataSourceAnalysis(conf: CatalystConf) extends Rule[LogicalPlan] {[m
-         mode)[m
- [m
-       if (l.catalogTable.isDefined && l.catalogTable.get.partitionColumnNames.nonEmpty &&[m
-[31m-          t.sparkSession.sqlContext.conf.filesourcePartitionManagement) {[m
-[32m+[m[32m          t.sparkSession.sqlContext.conf.manageFilesourcePartitions) {[m
-         val recoverPartitionCmd = AlterTableRecoverPartitionsCommand(l.catalogTable.get.identifier)[m
-         Union(insertCmd, recoverPartitionCmd)[m
-       } else {[m
-[1mdiff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala[m
-[1mindex da48e61..7c2e6fd 100644[m
-[1m--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala[m
-[1m+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala[m
-[36m@@ -64,7 +64,7 @@[m [mobject FileStatusCache {[m
-    */[m
-   def newCache(session: SparkSession): FileStatusCache = {[m
-     synchronized {[m
-[31m-      if (session.sqlContext.conf.filesourcePartitionManagement &&[m
-[32m+[m[32m      if (session.sqlContext.conf.manageFilesourcePartitions &&[m
-           session.sqlContext.conf.filesourcePartitionFileCacheSize > 0) {[m
-         if (sharedCache == null) {[m
-           sharedCache = new SharedInMemoryCache([m
-[1mdiff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala[m
-[1mindex 1d5bbcd..1b913bf 100644[m
-[1m--- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala[m
-[1m+++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala[m
-[36m@@ -271,8 +271,8 @@[m [mobject SQLConf {[m
-       .booleanConf[m
-       .createWithDefault(true)[m
- [m
-[31m-  val HIVE_FILESOURCE_PARTITION_MANAGEMENT =[m
-[31m-    SQLConfigBuilder("spark.sql.hive.filesourcePartitionManagement")[m
-[32m+[m[32m  val HIVE_MANAGE_FILESOURCE_PARTITIONS =[m
-[32m+[m[32m    SQLConfigBuilder("spark.sql.hive.manageFilesourcePartitions")[m
-       .doc("When true, enable metastore partition management for file source tables as well. " +[m
-            "This includes both datasource and converted Hive tables. This also controls whether " +[m
-            "datasource tables will automatically store partition metadata in the Hive metastore.")[m
-[36m@@ -679,7 +679,7 @@[m [mprivate[sql] class SQLConf extends Serializable with CatalystConf with Logging {[m
- [m
-   def metastorePartitionPruning: Boolean = getConf(HIVE_METASTORE_PARTITION_PRUNING)[m
- [m
-[31m-  def filesourcePartitionManagement: Boolean = getConf(HIVE_FILESOURCE_PARTITION_MANAGEMENT)[m
-[32m+[m[32m  def manageFilesourcePartitions: Boolean = getConf(HIVE_MANAGE_FILESOURCE_PARTITIONS)[m
- [m
-   def filesourcePartitionFileCacheSize: Long = getConf(HIVE_FILESOURCE_PARTITION_FILE_CACHE_SIZE)[m
- [m
-[1mdiff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala[m
-[1mindex c704df4..817df47 100644[m
-[1m--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala[m
-[1m+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala[m
-[36m@@ -193,7 +193,7 @@[m [mprivate[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log[m
-       QualifiedTableName(metastoreRelation.databaseName, metastoreRelation.tableName)[m
-     val bucketSpec = None  // We don't support hive bucketed tables, only ones we write out.[m
- [m
-[31m-    val lazyPruningEnabled = sparkSession.sqlContext.conf.filesourcePartitionManagement[m
-[32m+[m[32m    val lazyPruningEnabled = sparkSession.sqlContext.conf.manageFilesourcePartitions[m
-     val result = if (metastoreRelation.hiveQlTable.isPartitioned) {[m
-       val partitionSchema = StructType.fromAttributes(metastoreRelation.partitionKeys)[m
- [m
-[1mdiff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala[m
-[1mindex 069f01e..6e887d9 100644[m
-[1m--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala[m
-[1m+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala[m
-[36m@@ -63,7 +63,7 @@[m [mclass HiveMetadataCacheSuite extends QueryTest with SQLTestUtils with TestHiveSi[m
- [m
-   def testCaching(pruningEnabled: Boolean): Unit = {[m
-     test(s"partitioned table is cached when partition pruning is $pruningEnabled") {[m
-[31m-      withSQLConf(SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> pruningEnabled.toString) {[m
-[32m+[m[32m      withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> pruningEnabled.toString) {[m
-         withTable("test") {[m
-           withTempDir { dir =>[m
-             spark.range(5).selectExpr("id", "id as f1", "id as f2").write[m
-[1mdiff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala[m
-[1mindex e8907f0..2256341 100644[m
-[1m--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala[m
-[1m+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala[m
-[36m@@ -56,7 +56,7 @@[m [mclass PartitionProviderCompatibilitySuite[m
-   }[m
- [m
-   private def verifyIsNewTable(tableName: String): Unit = {[m
-[31m-    withSQLConf(SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "true") {[m
-[32m+[m[32m    withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true") {[m
-       spark.sql(s"show partitions $tableName").count()  // check does not throw[m
-     }[m
-   }[m
-[36m@@ -64,11 +64,11 @@[m [mclass PartitionProviderCompatibilitySuite[m
-   test("convert partition provider to hive with repair table") {[m
-     withTable("test") {[m
-       withTempDir { dir =>[m
-[31m-        withSQLConf(SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "false") {[m
-[32m+[m[32m        withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "false") {[m
-           setupPartitionedDatasourceTable("test", dir)[m
-           assert(spark.sql("select * from test").count() == 5)[m
-         }[m
-[31m-        withSQLConf(SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "true") {[m
-[32m+[m[32m        withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true") {[m
-           verifyIsLegacyTable("test")[m
-           spark.sql("msck repair table test")[m
-           verifyIsNewTable("test")[m
-[36m@@ -86,7 +86,7 @@[m [mclass PartitionProviderCompatibilitySuite[m
-   test("when partition management is enabled, new tables have partition provider hive") {[m
-     withTable("test") {[m
-       withTempDir { dir =>[m
-[31m-        withSQLConf(SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "true") {[m
-[32m+[m[32m        withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true") {[m
-           setupPartitionedDatasourceTable("test", dir)[m
-           verifyIsNewTable("test")[m
-           assert(spark.sql("select * from test").count() == 0)  // needs repair[m
-[36m@@ -100,7 +100,7 @@[m [mclass PartitionProviderCompatibilitySuite[m
-   test("when partition management is disabled, new tables have no partition provider") {[m
-     withTable("test") {[m
-       withTempDir { dir =>[m
-[31m-        withSQLConf(SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "false") {[m
-[32m+[m[32m        withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "false") {[m
-           setupPartitionedDatasourceTable("test", dir)[m
-           verifyIsLegacyTable("test")[m
-           assert(spark.sql("select * from test").count() == 5)[m
-[1mdiff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala[m
-[1mindex 5fbf105..e679bad 100644[m
-[1m--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala[m
-[1m+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala[m
-[36m@@ -110,7 +110,7 @@[m [mclass PartitionedTablePerfStatsSuite[m
- [m
-   genericTest("lazy partition pruning reads only necessary partition data") { spec =>[m
-     withSQLConf([m
-[31m-        SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "true",[m
-[32m+[m[32m        SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true",[m
-         SQLConf.HIVE_FILESOURCE_PARTITION_FILE_CACHE_SIZE.key -> "0") {[m
-       withTable("test") {[m
-         withTempDir { dir =>[m
-[36m@@ -151,7 +151,7 @@[m [mclass PartitionedTablePerfStatsSuite[m
- [m
-   genericTest("lazy partition pruning with file status caching enabled") { spec =>[m
-     withSQLConf([m
-[31m-        SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "true",[m
-[32m+[m[32m        SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true",[m
-         SQLConf.HIVE_FILESOURCE_PARTITION_FILE_CACHE_SIZE.key -> "9999999") {[m
-       withTable("test") {[m
-         withTempDir { dir =>[m
-[36m@@ -192,7 +192,7 @@[m [mclass PartitionedTablePerfStatsSuite[m
- [m
-   genericTest("file status caching respects refresh table and refreshByPath") { spec =>[m
-     withSQLConf([m
-[31m-        SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "true",[m
-[32m+[m[32m        SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true",[m
-         SQLConf.HIVE_FILESOURCE_PARTITION_FILE_CACHE_SIZE.key -> "9999999") {[m
-       withTable("test") {[m
-         withTempDir { dir =>[m
-[36m@@ -221,7 +221,7 @@[m [mclass PartitionedTablePerfStatsSuite[m
- [m
-   genericTest("file status cache respects size limit") { spec =>[m
-     withSQLConf([m
-[31m-        SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "true",[m
-[32m+[m[32m        SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true",[m
-         SQLConf.HIVE_FILESOURCE_PARTITION_FILE_CACHE_SIZE.key -> "1" /* 1 byte */) {[m
-       withTable("test") {[m
-         withTempDir { dir =>[m
-[36m@@ -239,7 +239,7 @@[m [mclass PartitionedTablePerfStatsSuite[m
-   }[m
- [m
-   test("hive table: files read and cached when filesource partition management is off") {[m
-[31m-    withSQLConf(SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "false") {[m
-[32m+[m[32m    withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "false") {[m
-       withTable("test") {[m
-         withTempDir { dir =>[m
-           setupPartitionedHiveTable("test", dir)[m
-[36m@@ -268,7 +268,7 @@[m [mclass PartitionedTablePerfStatsSuite[m
-   }[m
- [m
-   test("datasource table: all partition data cached in memory when partition management is off") {[m
-[31m-    withSQLConf(SQLConf.HIVE_FILESOURCE_PARTITION_MANAGEMENT.key -> "false") {[m
-[32m+[m[32m    withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "false") {[m
-       withTable("test") {[m
-         withTempDir { dir =>[m
-           setupPartitionedDatasourceTable("test", dir)[m

From 1b73b7bed87c0a48a342782eb6383123bae71386 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Fri, 21 Oct 2016 15:15:09 -0700
Subject: [PATCH 81/99] Fri Oct 21 15:15:09 PDT 2016

---
 .../src/main/scala/org/apache/spark/sql/DataFrameWriter.scala  | 1 +
 .../spark/sql/execution/command/createDataSourceTables.scala   | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
index 7529798c3d61f..6777a2170ab65 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
@@ -391,6 +391,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
         val createCmd = CreateTable(tableDesc, mode, Some(df.logicalPlan))
         val cmd = if (tableDesc.partitionColumnNames.nonEmpty &&
             df.sparkSession.sqlContext.conf.manageFilesourcePartitions) {
+          // Need to recover partitions into the metastore so our saved data is visible.
           val recoverPartitionCmd = AlterTableRecoverPartitionsCommand(tableDesc.identifier)
           Union(createCmd, recoverPartitionCmd)
         } else {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
index b04a553450f26..0e84ea52f3efd 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
@@ -93,6 +93,8 @@ case class CreateDataSourceTableCommand(table: CatalogTable, ignoreIfExists: Boo
 
     val newProps = if (partitionColumnNames.nonEmpty &&
         sparkSession.sqlContext.conf.manageFilesourcePartitions) {
+      // Start off with partition provider hive, but no partitions in the metastore. The user
+      // has to call `msck repair table` to populate the table partitions.
       table.properties ++
         Map(CatalogTable.PARTITION_PROVIDER_KEY -> CatalogTable.PARTITION_PROVIDER_HIVE)
     } else {
@@ -245,6 +247,7 @@ case class CreateDataSourceTableAsSelectCommand(
     result match {
       case fs: HadoopFsRelation if table.partitionColumnNames.nonEmpty &&
           sparkSession.sqlContext.conf.manageFilesourcePartitions =>
+        // Need to recover partitions into the metastore so our saved data is visible.
         sparkSession.sessionState.executePlan(
           AlterTableRecoverPartitionsCommand(table.identifier)).toRdd
       case _ =>

From 4da77241beb7d8a247df9a5f8a9c7299b47925d8 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Fri, 21 Oct 2016 15:31:38 -0700
Subject: [PATCH 82/99] Fri Oct 21 15:31:38 PDT 2016

---
 .../org/apache/spark/sql/execution/command/ddl.scala     | 4 ++--
 .../sql/execution/datasources/DataSourceStrategy.scala   | 3 ++-
 .../spark/sql/execution/datasources/FileCatalog.scala    | 2 +-
 .../scala/org/apache/spark/sql/internal/SQLConf.scala    | 9 +++++----
 .../apache/spark/sql/execution/command/DDLSuite.scala    | 3 ++-
 5 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
index 14fb29d315a96..f600ed5c18db0 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
@@ -456,7 +456,7 @@ case class AlterTableRecoverPartitionsCommand(
     }
   }
 
-  private def getPath(table: CatalogTable): Option[String] = {
+  private def getBasePath(table: CatalogTable): Option[String] = {
     if (table.provider == Some("hive")) {
       table.storage.locationUri
     } else {
@@ -474,7 +474,7 @@ case class AlterTableRecoverPartitionsCommand(
         s"Operation not allowed: $cmd only works on partitioned tables: $tableIdentWithDB")
     }
 
-    val tablePath = getPath(table)
+    val tablePath = getBasePath(table)
     if (tablePath.isEmpty) {
       throw new AnalysisException(s"Operation not allowed: $cmd only works on table with " +
         s"location provided: $tableIdentWithDB")
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
index 2f8941f077739..f0bcf94eadc96 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
@@ -190,7 +190,8 @@ case class DataSourceAnalysis(conf: CatalystConf) extends Rule[LogicalPlan] {
         mode)
 
       if (l.catalogTable.isDefined && l.catalogTable.get.partitionColumnNames.nonEmpty &&
-          t.sparkSession.sqlContext.conf.manageFilesourcePartitions) {
+          l.catalogTable.get.partitionProviderIsHive) {
+        // TODO(ekl) we should be more efficient here and only recover the newly added partitions
         val recoverPartitionCmd = AlterTableRecoverPartitionsCommand(l.catalogTable.get.identifier)
         Union(insertCmd, recoverPartitionCmd)
       } else {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileCatalog.scala
index df2a4e18f14fe..dba64624c34b3 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileCatalog.scala
@@ -65,6 +65,6 @@ trait FileCatalog {
   /** Sum of table file sizes, in bytes */
   def sizeInBytes: Long
 
-  /** Schema of the partitioning columns, or the empty schema. */
+  /** Schema of the partitioning columns, or the empty schema if the table is not partitioned. */
   def partitionSchema: StructType
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 1b913bf23964f..c19f0b82dc6a2 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -274,16 +274,17 @@ object SQLConf {
   val HIVE_MANAGE_FILESOURCE_PARTITIONS =
     SQLConfigBuilder("spark.sql.hive.manageFilesourcePartitions")
       .doc("When true, enable metastore partition management for file source tables as well. " +
-           "This includes both datasource and converted Hive tables. This also controls whether " +
-           "datasource tables will automatically store partition metadata in the Hive metastore.")
+           "This includes both datasource and converted Hive tables. When partition management " +
+           "is enabled, datasource tables store partitions in the Hive metastore, and use the " +
+           "metastore to prune partitions during query planning.")
       .booleanConf
       .createWithDefault(true)
 
   val HIVE_FILESOURCE_PARTITION_FILE_CACHE_SIZE =
     SQLConfigBuilder("spark.sql.hive.filesourcePartitionFileCacheSize")
-      .doc("When nonzero, enable caching of partition file metadata in memory. All table share " +
+      .doc("When nonzero, enable caching of partition file metadata in memory. All tables share " +
            "a cache that can use up to specified num bytes for file metadata. This conf only " +
-           "applies if filesource partition pruning is also enabled.")
+           "has an effect when hive filesource partition management is enabled.")
       .longConf
       .createWithDefault(250 * 1024 * 1024)
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
index 1d44d0c4945db..7e73f7b3e0459 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
@@ -97,7 +97,8 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach {
       provider = Some("hive"),
       partitionColumnNames = Seq("a", "b"),
       createTime = 0L,
-      properties = Map("partitionProvider" -> "hive"))
+      properties =
+        Map(CatalogTable.PARTITION_PROVIDER_KEY -> CatalogTable.PARTITION_PROVIDER_HIVE))
   }
 
   private def createTable(catalog: SessionCatalog, name: TableIdentifier): Unit = {

From 6687c7bd2ab17bc19f2a9b01066914a3905414b8 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Fri, 21 Oct 2016 15:38:42 -0700
Subject: [PATCH 83/99] Fri Oct 21 15:38:42 PDT 2016

---
 .../sql/hive/PartitionProviderCompatibilitySuite.scala | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala
index 22563415d0a3a..b5bcb5168dddf 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala
@@ -55,12 +55,6 @@ class PartitionProviderCompatibilitySuite
     }
   }
 
-  private def verifyIsNewTable(tableName: String): Unit = {
-    withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true") {
-      spark.sql(s"show partitions $tableName").count()  // check does not throw
-    }
-  }
-
   test("convert partition provider to hive with repair table") {
     withTable("test") {
       withTempDir { dir =>
@@ -71,7 +65,7 @@ class PartitionProviderCompatibilitySuite
         withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true") {
           verifyIsLegacyTable("test")
           spark.sql("msck repair table test")
-          verifyIsNewTable("test")
+          spark.sql("show partitions test").count()  // check we are a new table
 
           // sanity check table performance
           HiveCatalogMetrics.reset()
@@ -88,7 +82,7 @@ class PartitionProviderCompatibilitySuite
       withTempDir { dir =>
         withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true") {
           setupPartitionedDatasourceTable("test", dir)
-          verifyIsNewTable("test")
+          spark.sql("show partitions test").count()  // check we are a new table
           assert(spark.sql("select * from test").count() == 0)  // needs repair
           spark.sql("msck repair table test")
           assert(spark.sql("select * from test").count() == 5)

From 34b1ae6e78be8237bfe1d0cd88ffb97f4456a9ee Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Fri, 21 Oct 2016 15:40:42 -0700
Subject: [PATCH 84/99] Fri Oct 21 15:40:42 PDT 2016

---
 .../spark/sql/hive/PartitionProviderCompatibilitySuite.scala  | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala
index b5bcb5168dddf..73c41cc415cd4 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala
@@ -44,6 +44,10 @@ class PartitionProviderCompatibilitySuite
 
   private def verifyIsLegacyTable(tableName: String): Unit = {
     val unsupportedCommands = Seq(
+      s"ALTER TABLE $tableName ADD PARTITION (partcol=1)",
+      s"ALTER TABLE $tableName RENAME PARTITION (partcol=1)",
+      s"ALTER TABLE $tableName DROP PARTITION (partcol=1)",
+      s"TRUNCATE TABLE $tableName PARTITION (partcol=1)",
       s"DESCRIBE $tableName PARTITION (partcol1=1)",
       s"SHOW PARTITIONS $tableName")
 

From aa5b24fe213e20b778295e6da37e76321e94274f Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Fri, 21 Oct 2016 17:28:05 -0700
Subject: [PATCH 85/99] Fri Oct 21 17:28:05 PDT 2016

---
 .../apache/spark/sql/hive/execution/HiveCommandSuite.scala   | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala
index ad1e9b17a9f71..46ed18c70fb56 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala
@@ -415,10 +415,7 @@ class HiveCommandSuite extends QueryTest with SQLTestUtils with TestHiveSingleto
         .mode(SaveMode.Overwrite)
         .saveAsTable("part_datasrc")
 
-      val message1 = intercept[AnalysisException] {
-        sql("SHOW PARTITIONS part_datasrc")
-      }.getMessage
-      assert(message1.contains("is not allowed on a datasource table"))
+      assert(sql("SHOW PARTITIONS part_datasrc").count() == 3)
     }
   }
 }

From f9032439bcf1dc404f3fd07d44991b3e96180777 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Fri, 21 Oct 2016 18:03:07 -0700
Subject: [PATCH 86/99] fix tests

---
 .../datasources/PartitioningAwareFileCatalog.scala     | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
index 61763678ee5ba..38d1b5bcc6046 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
@@ -38,22 +38,20 @@ import org.apache.spark.util.SerializableConfiguration
  * It provides the necessary methods to parse partition data based on a set of files.
  *
  * @param parameters as set of options to control partition discovery
- * @param givenPartitionSchema an optional partition schema that will be use to provide types for
+ * @param userPartitionSchema an optional partition schema that will be used to provide types for
  *                             the discovered partitions
  */
 abstract class PartitioningAwareFileCatalog(
     sparkSession: SparkSession,
     parameters: Map[String, String],
-    givenPartitionSchema: Option[StructType],
+    userPartitionSchema: Option[StructType],
     fileStatusCache: FileStatusCache = NoopCache) extends FileCatalog with Logging {
   import PartitioningAwareFileCatalog.BASE_PATH_PARAM
 
   /** Returns the specification of the partitions inferred from the data. */
   def partitionSpec(): PartitionSpec
 
-  override def partitionSchema: StructType = {
-    givenPartitionSchema.getOrElse(partitionSpec().partitionColumns)
-  }
+  override def partitionSchema: StructType = partitionSpec().partitionColumns
 
   protected val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions(parameters)
 
@@ -126,7 +124,7 @@ abstract class PartitioningAwareFileCatalog(
     val leafDirs = leafDirToChildrenFiles.filter { case (_, files) =>
       files.exists(f => isDataPath(f.getPath))
     }.keys.toSeq
-    givenPartitionSchema match {
+    userPartitionSchema match {
       case Some(userProvidedSchema) if userProvidedSchema.nonEmpty =>
         val spec = PartitioningUtils.parsePartitions(
           leafDirs,

From e42bb5aca817aed301f735af00c6d0eacdd10efd Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Mon, 24 Oct 2016 12:19:26 -0700
Subject: [PATCH 87/99] fix statistics collection

---
 .../command/AnalyzeTableCommand.scala         |  3 +-
 .../execution/datasources/DataSource.scala    |  2 +-
 .../spark/sql/hive/StatisticsSuite.scala      | 65 +++++++++++--------
 3 files changed, 41 insertions(+), 29 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala
index 7b0e49b665f42..52a8fc88c56cd 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala
@@ -51,7 +51,8 @@ case class AnalyzeTableCommand(
 
       // data source tables have been converted into LogicalRelations
       case logicalRel: LogicalRelation if logicalRel.catalogTable.isDefined =>
-        updateTableStats(logicalRel.catalogTable.get, logicalRel.relation.sizeInBytes)
+        updateTableStats(logicalRel.catalogTable.get,
+          AnalyzeTableCommand.calculateTotalSize(sessionState, logicalRel.catalogTable.get))
 
       case otherRelation =>
         throw new AnalysisException("ANALYZE TABLE is not supported for " +
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala
index 28e671f003687..043b7cadb429a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala
@@ -422,7 +422,7 @@ case class DataSource(
             catalogTable.get.identifier.database.get,
             catalogTable.get.identifier.table,
             partitionSchema.getOrElse(StructType(Nil)),
-            catalogTable.get.stats.map(_.sizeInBytes.toLong).getOrElse(0L) /* TODO(ekl) */)
+            catalogTable.get.stats.map(_.sizeInBytes.toLong).getOrElse(0L))
         } else {
           new ListingFileCatalog(
             sparkSession, globbedPaths, options, partitionSchema)
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
index c351063a63ff8..4f5ebc3d838b9 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
@@ -310,39 +310,50 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils
     }
   }
 
-  test("test table-level statistics for data source table created in HiveExternalCatalog") {
-    val parquetTable = "parquetTable"
-    withTable(parquetTable) {
-      sql(s"CREATE TABLE $parquetTable (key STRING, value STRING) USING PARQUET")
-      val catalogTable = spark.sessionState.catalog.getTableMetadata(TableIdentifier(parquetTable))
-      assert(DDLUtils.isDatasourceTable(catalogTable))
+  private def testUpdatingTableStats(tableDescription: String, createTableCmd: String): Unit = {
+    test("test table-level statistics for " + tableDescription) {
+      val parquetTable = "parquetTable"
+      withTable(parquetTable) {
+        sql(createTableCmd)
+        val catalogTable = spark.sessionState.catalog.getTableMetadata(
+          TableIdentifier(parquetTable))
+        assert(DDLUtils.isDatasourceTable(catalogTable))
+
+        sql(s"INSERT INTO TABLE $parquetTable SELECT * FROM src")
+        checkTableStats(
+          parquetTable, isDataSourceTable = true, hasSizeInBytes = false, expectedRowCounts = None)
 
-      sql(s"INSERT INTO TABLE $parquetTable SELECT * FROM src")
-      checkTableStats(
-        parquetTable, isDataSourceTable = true, hasSizeInBytes = false, expectedRowCounts = None)
+        // noscan won't count the number of rows
+        sql(s"ANALYZE TABLE $parquetTable COMPUTE STATISTICS noscan")
+        val fetchedStats1 = checkTableStats(
+          parquetTable, isDataSourceTable = true, hasSizeInBytes = true, expectedRowCounts = None)
 
-      // noscan won't count the number of rows
-      sql(s"ANALYZE TABLE $parquetTable COMPUTE STATISTICS noscan")
-      val fetchedStats1 = checkTableStats(
-        parquetTable, isDataSourceTable = true, hasSizeInBytes = true, expectedRowCounts = None)
+        sql(s"INSERT INTO TABLE $parquetTable SELECT * FROM src")
+        sql(s"ANALYZE TABLE $parquetTable COMPUTE STATISTICS noscan")
+        val fetchedStats2 = checkTableStats(
+          parquetTable, isDataSourceTable = true, hasSizeInBytes = true, expectedRowCounts = None)
+        assert(fetchedStats2.get.sizeInBytes > fetchedStats1.get.sizeInBytes)
 
-      sql(s"INSERT INTO TABLE $parquetTable SELECT * FROM src")
-      sql(s"ANALYZE TABLE $parquetTable COMPUTE STATISTICS noscan")
-      val fetchedStats2 = checkTableStats(
-        parquetTable, isDataSourceTable = true, hasSizeInBytes = true, expectedRowCounts = None)
-      assert(fetchedStats2.get.sizeInBytes > fetchedStats1.get.sizeInBytes)
-
-      // without noscan, we count the number of rows
-      sql(s"ANALYZE TABLE $parquetTable COMPUTE STATISTICS")
-      val fetchedStats3 = checkTableStats(
-        parquetTable,
-        isDataSourceTable = true,
-        hasSizeInBytes = true,
-        expectedRowCounts = Some(1000))
-      assert(fetchedStats3.get.sizeInBytes == fetchedStats2.get.sizeInBytes)
+        // without noscan, we count the number of rows
+        sql(s"ANALYZE TABLE $parquetTable COMPUTE STATISTICS")
+        val fetchedStats3 = checkTableStats(
+          parquetTable,
+          isDataSourceTable = true,
+          hasSizeInBytes = true,
+          expectedRowCounts = Some(1000))
+        assert(fetchedStats3.get.sizeInBytes == fetchedStats2.get.sizeInBytes)
+      }
     }
   }
 
+  testUpdatingTableStats(
+    "data source table created in HiveExternalCatalog",
+    "CREATE TABLE parquetTable (key STRING, value STRING) USING PARQUET")
+
+  testUpdatingTableStats(
+    "partitioned data source table",
+    "CREATE TABLE parquetTable (key STRING, value STRING) USING PARQUET PARTITIONED BY (key)")
+
   test("statistics collection of a table with zero column") {
     val table_no_cols = "table_no_cols"
     withTable(table_no_cols) {

From bc659c519958fbfd157f6514a8d07a1d5cac2f70 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Mon, 24 Oct 2016 12:43:56 -0700
Subject: [PATCH 88/99] better backwards compat

---
 .../spark/sql/execution/command/ddl.scala     | 18 ++++++---
 .../spark/sql/execution/command/tables.scala  |  9 +++--
 .../PartitionProviderCompatibilitySuite.scala | 37 +++++++++++++++++--
 3 files changed, 50 insertions(+), 14 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
index f600ed5c18db0..99627b8efbcbb 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
@@ -346,7 +346,7 @@ case class AlterTableAddPartitionCommand(
     val catalog = sparkSession.sessionState.catalog
     val table = catalog.getTableMetadata(tableName)
     DDLUtils.verifyAlterTableType(catalog, table, isView = false)
-    DDLUtils.verifyPartitionProviderIsHive(table, "ALTER TABLE ADD PARTITION")
+    DDLUtils.verifyPartitionProviderIsHive(sparkSession, table, "ALTER TABLE ADD PARTITION")
     val parts = partitionSpecsAndLocs.map { case (spec, location) =>
       // inherit table storage format (possibly except for location)
       CatalogTablePartition(spec, table.storage.copy(locationUri = location))
@@ -375,7 +375,7 @@ case class AlterTableRenamePartitionCommand(
     val catalog = sparkSession.sessionState.catalog
     val table = catalog.getTableMetadata(tableName)
     DDLUtils.verifyAlterTableType(catalog, table, isView = false)
-    DDLUtils.verifyPartitionProviderIsHive(table, "ALTER TABLE RENAME PARTITION")
+    DDLUtils.verifyPartitionProviderIsHive(sparkSession, table, "ALTER TABLE RENAME PARTITION")
     catalog.renamePartitions(
       tableName, Seq(oldPartition), Seq(newPartition))
     Seq.empty[Row]
@@ -408,7 +408,7 @@ case class AlterTableDropPartitionCommand(
     val catalog = sparkSession.sessionState.catalog
     val table = catalog.getTableMetadata(tableName)
     DDLUtils.verifyAlterTableType(catalog, table, isView = false)
-    DDLUtils.verifyPartitionProviderIsHive(table, "ALTER TABLE DROP PARTITION")
+    DDLUtils.verifyPartitionProviderIsHive(sparkSession, table, "ALTER TABLE DROP PARTITION")
     catalog.dropPartitions(table.identifier, specs, ignoreIfNotExists = ifExists, purge = purge)
     Seq.empty[Row]
   }
@@ -695,12 +695,18 @@ object DDLUtils {
   /**
    * Throws a standard error for actions that require partitionProvider = hive.
    */
-  def verifyPartitionProviderIsHive(table: CatalogTable, action: String): Unit = {
+  def verifyPartitionProviderIsHive(
+      spark: SparkSession, table: CatalogTable, action: String): Unit = {
+    val tableName = table.identifier.table
+    if (!spark.sqlContext.conf.manageFilesourcePartitions && isDatasourceTable(table)) {
+      throw new AnalysisException(
+        s"$action is not allowed on $tableName since filesource partition management is " +
+          "disabled (spark.sql.hive.manageFilesourcePartitions = false).")
+    }
     if (!table.partitionProviderIsHive) {
-      val tableName = table.identifier.table
       throw new AnalysisException(
         s"$action is not allowed on $tableName since its partition metadata is not stored in " +
-          s"the Hive metastore. To import this information into the metastore, run " +
+          "the Hive metastore. To import this information into the metastore, run " +
           s"`msck repair table $tableName`")
     }
   }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
index d72e142b8c265..f5cbead2c93c1 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
@@ -365,7 +365,7 @@ case class TruncateTableCommand(
         s"for tables that are not partitioned: $tableIdentwithDB")
     }
     if (partitionSpec.isDefined) {
-      DDLUtils.verifyPartitionProviderIsHive(table, "TRUNCATE TABLE ... PARTITION")
+      DDLUtils.verifyPartitionProviderIsHive(spark, table, "TRUNCATE TABLE ... PARTITION")
     }
     val locations =
       if (DDLUtils.isDatasourceTable(table)) {
@@ -451,7 +451,7 @@ case class DescribeTableCommand(
           describeFormattedTableInfo(metadata, result)
         }
       } else {
-        describeDetailedPartitionInfo(catalog, metadata, result)
+        describeDetailedPartitionInfo(sparkSession, catalog, metadata, result)
       }
     }
 
@@ -526,6 +526,7 @@ case class DescribeTableCommand(
   }
 
   private def describeDetailedPartitionInfo(
+      spark: SparkSession,
       catalog: SessionCatalog,
       metadata: CatalogTable,
       result: ArrayBuffer[Row]): Unit = {
@@ -533,7 +534,7 @@ case class DescribeTableCommand(
       throw new AnalysisException(
         s"DESC PARTITION is not allowed on a view: ${table.identifier}")
     }
-    DDLUtils.verifyPartitionProviderIsHive(metadata, "DESC PARTITION")
+    DDLUtils.verifyPartitionProviderIsHive(spark, metadata, "DESC PARTITION")
     val partition = catalog.getPartition(table, partitionSpec)
     if (isExtended) {
       describeExtendedDetailedPartitionInfo(table, metadata, partition, result)
@@ -738,7 +739,7 @@ case class ShowPartitionsCommand(
         s"SHOW PARTITIONS is not allowed on a table that is not partitioned: $tableIdentWithDB")
     }
 
-    DDLUtils.verifyPartitionProviderIsHive(table, "SHOW PARTITIONS")
+    DDLUtils.verifyPartitionProviderIsHive(sparkSession, table, "SHOW PARTITIONS")
 
     /**
      * Validate the partitioning spec by making sure all the referenced columns are
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala
index 73c41cc415cd4..d67633cdb78c6 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala
@@ -51,11 +51,13 @@ class PartitionProviderCompatibilitySuite
       s"DESCRIBE $tableName PARTITION (partcol1=1)",
       s"SHOW PARTITIONS $tableName")
 
-    for (cmd <- unsupportedCommands) {
-      val e = intercept[AnalysisException] {
-        spark.sql(s"show partitions $tableName")
+    withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true") {
+      for (cmd <- unsupportedCommands) {
+        val e = intercept[AnalysisException] {
+          spark.sql(s"show partitions $tableName")
+        }
+        assert(e.getMessage.contains("partition metadata is not stored in the Hive metastore"), e)
       }
-      assert(e.getMessage.contains("partition metadata is not stored in the Hive metastore"), e)
     }
   }
 
@@ -106,4 +108,31 @@ class PartitionProviderCompatibilitySuite
       }
     }
   }
+
+  test("when partition management is disabled, we preserve the old behavior even for new tables") {
+    withTable("test") {
+      withTempDir { dir =>
+        withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true") {
+          setupPartitionedDatasourceTable("test", dir)
+          spark.sql("show partitions test").count()  // check we are a new table
+          spark.sql("refresh table test")
+          assert(spark.sql("select * from test").count() == 0)
+        }
+        // disabled
+        withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "false") {
+          val e = intercept[AnalysisException] {
+            spark.sql(s"show partitions test")
+          }
+          assert(e.getMessage.contains("filesource partition management is disabled"))
+          spark.sql("refresh table test")
+          assert(spark.sql("select * from test").count() == 5)
+        }
+        // then enabled again
+        withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true") {
+          spark.sql("refresh table test")
+          assert(spark.sql("select * from test").count() == 0)
+        }
+      }
+    }
+  }
 }

From ba4f32f7e8424bf97c6806eef1c2d37bec2e28c5 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Mon, 24 Oct 2016 15:52:47 -0700
Subject: [PATCH 89/99] also compute stats for column analyze

---
 .../spark/sql/execution/command/AnalyzeColumnCommand.scala     | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala
index 488138709a12b..f873f34a845ef 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala
@@ -50,7 +50,8 @@ case class AnalyzeColumnCommand(
           AnalyzeTableCommand.calculateTotalSize(sessionState, catalogRel.catalogTable))
 
       case logicalRel: LogicalRelation if logicalRel.catalogTable.isDefined =>
-        updateStats(logicalRel.catalogTable.get, logicalRel.relation.sizeInBytes)
+        updateStats(logicalRel.catalogTable.get,
+          AnalyzeTableCommand.calculateTotalSize(sessionState, logicalRel.catalogTable.get))
 
       case otherRelation =>
         throw new AnalysisException("ANALYZE TABLE is not supported for " +

From ef954fb371897668633e0d656f717b18adc7c2dd Mon Sep 17 00:00:00 2001
From: Wenchen Fan <wenchen@databricks.com>
Date: Tue, 25 Oct 2016 19:31:39 +0800
Subject: [PATCH 90/99] minor updates

---
 .../apache/spark/sql/catalyst/catalog/SessionCatalog.scala  | 3 +--
 .../org/apache/spark/sql/catalyst/catalog/interface.scala   | 6 ++----
 .../org/apache/spark/sql/execution/command/tables.scala     | 1 -
 3 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala
index a3fd26064f05e..3d6eec81c03c8 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala
@@ -776,8 +776,7 @@ class SessionCatalog(
   private def requireExactMatchedPartitionSpec(
       specs: Seq[TablePartitionSpec],
       table: CatalogTable): Unit = {
-    // The partition columns in partition specification are always lower cased.
-    val defined = table.partitionColumnNames.map(_.toLowerCase).sorted
+    val defined = table.partitionColumnNames.sorted
     specs.foreach { s =>
       if (s.keys.toSeq.sorted != defined) {
         throw new AnalysisException(
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
index dd1de5eb3fb69..45152670efd4c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
@@ -104,8 +104,7 @@ case class CatalogTablePartition(
    */
   def toRow(partitionSchema: StructType): InternalRow = {
     InternalRow.fromSeq(partitionSchema.map { field =>
-      // The partition columns in partition specification are always lower cased.
-      Cast(Literal(spec(field.name.toLowerCase)), field.dataType).eval()
+      Cast(Literal(spec(field.name)), field.dataType).eval()
     })
   }
 }
@@ -158,8 +157,6 @@ case class CatalogTable(
     comment: Option[String] = None,
     unsupportedFeatures: Seq[String] = Seq.empty) {
 
-  import CatalogTable._
-
   /** schema of this table's partition columns */
   def partitionSchema: StructType = StructType(schema.filter {
     c => partitionColumnNames.contains(c.name)
@@ -225,6 +222,7 @@ case class CatalogTable(
    * @return whether this table's partition metadata is stored in the Hive metastore.
    */
   def partitionProviderIsHive: Boolean = {
+    import CatalogTable._
     provider == Some("hive") ||
       properties.get(PARTITION_PROVIDER_KEY) == Some(PARTITION_PROVIDER_HIVE)
   }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
index f5cbead2c93c1..6c58a119fe6e2 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
@@ -358,7 +358,6 @@ case class TruncateTableCommand(
       throw new AnalysisException(
         s"Operation not allowed: TRUNCATE TABLE on views: $tableIdentwithDB")
     }
-    val isDatasourceTable = DDLUtils.isDatasourceTable(table)
     if (table.partitionColumnNames.isEmpty && partitionSpec.isDefined) {
       throw new AnalysisException(
         s"Operation not allowed: TRUNCATE TABLE ... PARTITION is not supported " +

From 802372c47376b684286b5d07dbe3a00855b180b5 Mon Sep 17 00:00:00 2001
From: Wenchen Fan <wenchen@databricks.com>
Date: Wed, 26 Oct 2016 00:44:04 +0800
Subject: [PATCH 91/99] fix case preserving

---
 .../spark/sql/execution/command/ddl.scala     | 17 +++---
 .../spark/sql/hive/HiveExternalCatalog.scala  | 60 +++++++++++++------
 .../sql/hive/client/HiveClientImpl.scala      |  3 -
 3 files changed, 51 insertions(+), 29 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
index bac6a7472dec9..f66665fac2aa9 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
@@ -28,6 +28,7 @@ import org.apache.hadoop.mapred.{FileInputFormat, JobConf}
 
 import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
 import org.apache.spark.sql.catalyst.TableIdentifier
+import org.apache.spark.sql.catalyst.analysis.Resolver
 import org.apache.spark.sql.catalyst.catalog.{CatalogDatabase, CatalogTable, CatalogTablePartition, CatalogTableType, SessionCatalog}
 import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec
 import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
@@ -515,8 +516,8 @@ case class AlterTableRecoverPartitionsCommand(
     val threshold = spark.conf.get("spark.rdd.parallelListingThreshold", "10").toInt
     val hadoopConf = spark.sparkContext.hadoopConfiguration
     val pathFilter = getPathFilter(hadoopConf)
-    val partitionSpecsAndLocs = scanPartitions(
-      spark, fs, pathFilter, root, Map(), table.partitionColumnNames.map(_.toLowerCase), threshold)
+    val partitionSpecsAndLocs = scanPartitions(spark, fs, pathFilter, root, Map(),
+      table.partitionColumnNames, threshold, spark.sessionState.conf.resolver)
     val total = partitionSpecsAndLocs.length
     logInfo(s"Found $total partitions in $root")
 
@@ -543,7 +544,8 @@ case class AlterTableRecoverPartitionsCommand(
       path: Path,
       spec: TablePartitionSpec,
       partitionNames: Seq[String],
-      threshold: Int): GenSeq[(TablePartitionSpec, Path)] = {
+      threshold: Int,
+      resolver: Resolver): GenSeq[(TablePartitionSpec, Path)] = {
     if (partitionNames.isEmpty) {
       return Seq(spec -> path)
     }
@@ -562,13 +564,12 @@ case class AlterTableRecoverPartitionsCommand(
       val name = st.getPath.getName
       if (st.isDirectory && name.contains("=")) {
         val ps = name.split("=", 2)
-        val columnName = PartitioningUtils.unescapePathName(ps(0)).toLowerCase
+        val columnName = PartitioningUtils.unescapePathName(ps(0))
         // TODO: Validate the value
         val value = PartitioningUtils.unescapePathName(ps(1))
-        // comparing with case-insensitive, but preserve the case
-        if (columnName == partitionNames.head) {
-          scanPartitions(spark, fs, filter, st.getPath, spec ++ Map(columnName -> value),
-            partitionNames.drop(1), threshold)
+        if (resolver(columnName, partitionNames.head)) {
+          scanPartitions(spark, fs, filter, st.getPath, spec ++ Map(partitionNames.head -> value),
+            partitionNames.drop(1), threshold, resolver)
         } else {
           logWarning(
             s"expected partition column ${partitionNames.head}, but got ${ps(0)}, ignoring it")
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
index cb9947a697c7a..d1d11099a50e7 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
@@ -582,13 +582,30 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
   // Partitions
   // --------------------------------------------------------------------------
 
+  // Hive metastore is not case preserving and the partition columns are always lower cased. We need
+  // to lower case the column names in partition specification before calling partition related Hive
+  // APIs, to match this behaviour.
+  private def lowerCasePartitionSpec(spec: TablePartitionSpec): TablePartitionSpec = {
+    spec.map { case (k, v) => k.toLowerCase -> v }
+  }
+
+  // Hive metastore is not case preserving and the column names of the partition specification we
+  // get from the metastore are always lower cased. We should restore them w.r.t. the actual table
+  // partition columns.
+  private def restorePartitionSpec(
+      spec: TablePartitionSpec,
+      partCols: Seq[String]): TablePartitionSpec = {
+    spec.map { case (k, v) => partCols.find(_.equalsIgnoreCase(k)).get -> v }
+  }
+
   override def createPartitions(
       db: String,
       table: String,
       parts: Seq[CatalogTablePartition],
       ignoreIfExists: Boolean): Unit = withClient {
     requireTableExists(db, table)
-    client.createPartitions(db, table, parts, ignoreIfExists)
+    val lowerCasedParts = parts.map(p => p.copy(spec = lowerCasePartitionSpec(p.spec)))
+    client.createPartitions(db, table, lowerCasedParts, ignoreIfExists)
   }
 
   override def dropPartitions(
@@ -598,7 +615,7 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
       ignoreIfNotExists: Boolean,
       purge: Boolean): Unit = withClient {
     requireTableExists(db, table)
-    client.dropPartitions(db, table, parts, ignoreIfNotExists, purge)
+    client.dropPartitions(db, table, parts.map(lowerCasePartitionSpec), ignoreIfNotExists, purge)
   }
 
   override def renamePartitions(
@@ -606,21 +623,24 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
       table: String,
       specs: Seq[TablePartitionSpec],
       newSpecs: Seq[TablePartitionSpec]): Unit = withClient {
-    client.renamePartitions(db, table, specs, newSpecs)
+    client.renamePartitions(
+      db, table, specs.map(lowerCasePartitionSpec), newSpecs.map(lowerCasePartitionSpec))
   }
 
   override def alterPartitions(
       db: String,
       table: String,
       newParts: Seq[CatalogTablePartition]): Unit = withClient {
-    client.alterPartitions(db, table, newParts)
+    val lowerCasedParts = newParts.map(p => p.copy(spec = lowerCasePartitionSpec(p.spec)))
+    client.alterPartitions(db, table, lowerCasedParts)
   }
 
   override def getPartition(
       db: String,
       table: String,
       spec: TablePartitionSpec): CatalogTablePartition = withClient {
-    client.getPartition(db, table, spec)
+    val part = client.getPartition(db, table, lowerCasePartitionSpec(spec))
+    part.copy(spec = restorePartitionSpec(part.spec, getTable(db, table).partitionColumnNames))
   }
 
   /**
@@ -630,7 +650,9 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
       db: String,
       table: String,
       spec: TablePartitionSpec): Option[CatalogTablePartition] = withClient {
-    client.getPartitionOption(db, table, spec)
+    client.getPartitionOption(db, table, lowerCasePartitionSpec(spec)).map { part =>
+      part.copy(spec = restorePartitionSpec(part.spec, getTable(db, table).partitionColumnNames))
+    }
   }
 
   /**
@@ -640,19 +662,20 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
       db: String,
       table: String,
       partialSpec: Option[TablePartitionSpec] = None): Seq[CatalogTablePartition] = withClient {
-    client.getPartitions(db, table, partialSpec)
+    client.getPartitions(db, table, partialSpec.map(lowerCasePartitionSpec)).map { part =>
+      part.copy(spec = restorePartitionSpec(part.spec, getTable(db, table).partitionColumnNames))
+    }
   }
 
   override def listPartitionsByFilter(
       db: String,
       table: String,
       predicates: Seq[Expression]): Seq[CatalogTablePartition] = withClient {
-    val catalogTable = client.getTable(db, table)
+    val rawTable = client.getTable(db, table)
+    val catalogTable = restoreTableMetadata(rawTable)
     val partitionColumnNames = catalogTable.partitionColumnNames.toSet
     val nonPartitionPruningPredicates = predicates.filterNot {
-      // Hive metastore is not case-preserving, so the `partitionColumnNames` are always lower
-      // cased, here we also lower case the attribute names in partition spec.
-      _.references.map(_.name.toLowerCase).toSet.subsetOf(partitionColumnNames)
+      _.references.map(_.name).toSet.subsetOf(partitionColumnNames)
     }
 
     if (nonPartitionPruningPredicates.nonEmpty) {
@@ -663,19 +686,20 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
     val partitionSchema = catalogTable.partitionSchema
 
     if (predicates.nonEmpty) {
-      val clientPrunedPartitions =
-        client.getPartitionsByFilter(catalogTable, predicates)
+      val clientPrunedPartitions = client.getPartitionsByFilter(rawTable, predicates).map { part =>
+        part.copy(spec = restorePartitionSpec(part.spec, catalogTable.partitionColumnNames))
+      }
       val boundPredicate =
         InterpretedPredicate.create(predicates.reduce(And).transform {
           case att: AttributeReference =>
-            val index = partitionSchema.indexWhere(_.name == att.name.toLowerCase)
+            val index = partitionSchema.indexWhere(_.name == att.name)
             BoundReference(index, partitionSchema(index).dataType, nullable = true)
         })
-      clientPrunedPartitions.filter { case p: CatalogTablePartition =>
-        boundPredicate(p.toRow(partitionSchema))
-      }
+      clientPrunedPartitions.filter { p => boundPredicate(p.toRow(partitionSchema)) }
     } else {
-      client.getPartitions(catalogTable)
+      client.getPartitions(catalogTable).map { part =>
+        part.copy(spec = restorePartitionSpec(part.spec, catalogTable.partitionColumnNames))
+      }
     }
   }
 
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
index c4ccce86b9223..84873bbbb81ce 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
@@ -831,9 +831,6 @@ private[hive] class HiveClientImpl(
     new HivePartition(ht, tpart)
   }
 
-  // TODO (cloud-fan): the column names in partition specification are always lower cased because
-  // Hive metastore is not case preserving. We should normalize them to the actual column names of
-  // the table, once we store partition spec of data source tables.
   private def fromHivePartition(hp: HivePartition): CatalogTablePartition = {
     val apiPartition = hp.getTPartition
     CatalogTablePartition(

From e3f66104598a9b8d086154a8d9c250b3a21ec34f Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Tue, 25 Oct 2016 13:20:55 -0700
Subject: [PATCH 92/99] comments and fix mixed case

---
 .../sql/catalyst/catalog/interface.scala      |  6 ++++-
 .../spark/sql/execution/command/ddl.scala     | 11 +++------
 .../spark/sql/execution/command/tables.scala  |  5 ++--
 .../PartitionProviderCompatibilitySuite.scala | 23 +++++++++----------
 4 files changed, 22 insertions(+), 23 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
index 45152670efd4c..cd3d3e17d9549 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
@@ -229,8 +229,12 @@ case class CatalogTable(
 }
 
 object CatalogTable {
-  val PARTITION_PROVIDER_KEY = "partitionProvider"
+  val PARTITION_PROVIDER_KEY = "org.apache.spark/partitionProvider"
   val PARTITION_PROVIDER_HIVE = "hive"
+
+  def isSparkManagedTableProp(prop: String): Boolean = {
+    prop == PARTITION_PROVIDER_KEY  // only one for now
+  }
 }
 
 case class CatalogTableType private(name: String)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
index f66665fac2aa9..dac2ab7c978d3 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
@@ -677,16 +677,11 @@ case class AlterTableSetLocationCommand(
     DDLUtils.verifyAlterTableType(catalog, table, isView = false)
     partitionSpec match {
       case Some(spec) =>
+        DDLUtils.verifyPartitionProviderIsHive(
+          sparkSession, table, "ALTER TABLE ... SET LOCATION")
         // Partition spec is specified, so we set the location only for this partition
         val part = catalog.getPartition(table.identifier, spec)
-        val newPart =
-          if (DDLUtils.isDatasourceTable(table)) {
-            throw new AnalysisException(
-              "ALTER TABLE SET LOCATION for partition is not allowed for tables defined " +
-              "using the datasource API")
-          } else {
-            part.copy(storage = part.storage.copy(locationUri = Some(location)))
-          }
+        val newPart = part.copy(storage = part.storage.copy(locationUri = Some(location)))
         catalog.alterPartitions(table.identifier, Seq(newPart))
       case None =>
         // No partition spec is specified, so we set the location for the table itself
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
index 6c58a119fe6e2..38f6607a3ba13 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
@@ -887,8 +887,9 @@ case class ShowCreateTableCommand(table: TableIdentifier) extends RunnableComman
   private def showHiveTableProperties(metadata: CatalogTable, builder: StringBuilder): Unit = {
     if (metadata.properties.nonEmpty) {
       val filteredProps = metadata.properties.filterNot {
-        // Skips "EXTERNAL" property for external tables
-        case (key, _) => key == "EXTERNAL" && metadata.tableType == EXTERNAL
+        case (key, _) =>
+          (key == "EXTERNAL" && metadata.tableType == EXTERNAL) ||
+            CatalogTable.isSparkManagedTableProp(key)
       }
 
       val props = filteredProps.map { case (key, value) =>
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala
index d67633cdb78c6..5f16960fb1496 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala
@@ -29,32 +29,31 @@ class PartitionProviderCompatibilitySuite
   extends QueryTest with TestHiveSingleton with SQLTestUtils {
 
   private def setupPartitionedDatasourceTable(tableName: String, dir: File): Unit = {
-    // TODO(ekl) make these mixed-case fields once support for that is fixed
-    spark.range(5).selectExpr("id as fieldone", "id as partcol1", "id as partcol2").write
-      .partitionBy("partcol1", "partcol2")
+    spark.range(5).selectExpr("id as fieldOne", "id as partCol").write
+      .partitionBy("partCol")
       .mode("overwrite")
       .parquet(dir.getAbsolutePath)
 
     spark.sql(s"""
-      |create table $tableName (fieldone long, partcol1 int, partcol2 int)
+      |create table $tableName (fieldOne long, partCol int)
       |using parquet
       |options (path "${dir.getAbsolutePath}")
-      |partitioned by (partcol1, partcol2)""".stripMargin)
+      |partitioned by (partCol)""".stripMargin)
   }
 
   private def verifyIsLegacyTable(tableName: String): Unit = {
     val unsupportedCommands = Seq(
-      s"ALTER TABLE $tableName ADD PARTITION (partcol=1)",
-      s"ALTER TABLE $tableName RENAME PARTITION (partcol=1)",
-      s"ALTER TABLE $tableName DROP PARTITION (partcol=1)",
-      s"TRUNCATE TABLE $tableName PARTITION (partcol=1)",
-      s"DESCRIBE $tableName PARTITION (partcol1=1)",
+      s"ALTER TABLE $tableName ADD PARTITION (partCol=1) LOCATION '/foo'",
+      s"ALTER TABLE $tableName PARTITION (partCol=1) RENAME TO PARTITION (partCol=2)",
+      s"ALTER TABLE $tableName PARTITION (partCol=1) SET LOCATION '/foo'",
+      s"ALTER TABLE $tableName DROP PARTITION (partCol=1)",
+      s"DESCRIBE $tableName PARTITION (partCol=1)",
       s"SHOW PARTITIONS $tableName")
 
     withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true") {
       for (cmd <- unsupportedCommands) {
         val e = intercept[AnalysisException] {
-          spark.sql(s"show partitions $tableName")
+          spark.sql(cmd)
         }
         assert(e.getMessage.contains("partition metadata is not stored in the Hive metastore"), e)
       }
@@ -75,7 +74,7 @@ class PartitionProviderCompatibilitySuite
 
           // sanity check table performance
           HiveCatalogMetrics.reset()
-          assert(spark.sql("select * from test where partcol1 < 2").count() == 2)
+          assert(spark.sql("select * from test where partCol < 2").count() == 2)
           assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 2)
           assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 2)
         }

From a442aa4182b5782f5990a7ed72dc282a78bfb4c4 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Tue, 25 Oct 2016 13:30:12 -0700
Subject: [PATCH 93/99] another small test

---
 .../sql/hive/PartitionedTablePerfStatsSuite.scala   | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala
index e679bad462295..476383a5b33a5 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala
@@ -41,7 +41,7 @@ class PartitionedTablePerfStatsSuite
     FileStatusCache.resetForTesting()
   }
 
-  private case class TestSpec(setupTable: (String, File) => Unit)
+  private case class TestSpec(setupTable: (String, File) => Unit, isDatasourceTable: Boolean)
 
   /**
    * Runs a test against both converted hive and native datasource tables. The test can use the
@@ -49,10 +49,10 @@ class PartitionedTablePerfStatsSuite
    */
   private def genericTest(testName: String)(fn: TestSpec => Unit): Unit = {
     test("hive table: " + testName) {
-      fn(TestSpec(setupPartitionedHiveTable))
+      fn(TestSpec(setupPartitionedHiveTable, false))
     }
     test("datasource table: " + testName) {
-      fn(TestSpec(setupPartitionedDatasourceTable))
+      fn(TestSpec(setupPartitionedDatasourceTable, true))
     }
   }
 
@@ -104,6 +104,13 @@ class PartitionedTablePerfStatsSuite
         val df4 = spark.sql("select * from test where partCol1 = 999")
         assert(df4.count() == 0)
         assert(df4.inputFiles.length == 0)
+
+        // TODO(ekl) enable for hive tables as well once SPARK-17983 is fixed
+        if (spec.isDatasourceTable) {
+          val df5 = spark.sql("select * from test where fieldOne = 4")
+          assert(df5.count() == 1)
+          assert(df5.inputFiles.length == 5)
+        }
       }
     }
   }

From 87a6b40ee544310de9312cdf7416ca9d624af03b Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Tue, 25 Oct 2016 16:30:07 -0700
Subject: [PATCH 94/99] fix test

---
 .../spark/sql/execution/command/DDLSuite.scala       | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
index 23ed3ea5c3ac3..51a7710c3cc68 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
@@ -1148,7 +1148,7 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach {
       if (isDatasourceTable) {
         if (spec.isDefined) {
           assert(storageFormat.properties.isEmpty)
-          assert(storageFormat.locationUri.isEmpty)
+          assert(storageFormat.locationUri === Some(expected))
         } else {
           assert(storageFormat.properties.get("path") === Some(expected))
           assert(storageFormat.locationUri === Some(expected))
@@ -1161,18 +1161,14 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach {
     sql("ALTER TABLE dbx.tab1 SET LOCATION '/path/to/your/lovely/heart'")
     verifyLocation("/path/to/your/lovely/heart")
     // set table partition location
-    maybeWrapException(isDatasourceTable) {
-      sql("ALTER TABLE dbx.tab1 PARTITION (a='1', b='2') SET LOCATION '/path/to/part/ways'")
-    }
+    sql("ALTER TABLE dbx.tab1 PARTITION (a='1', b='2') SET LOCATION '/path/to/part/ways'")
     verifyLocation("/path/to/part/ways", Some(partSpec))
     // set table location without explicitly specifying database
     catalog.setCurrentDatabase("dbx")
     sql("ALTER TABLE tab1 SET LOCATION '/swanky/steak/place'")
     verifyLocation("/swanky/steak/place")
     // set table partition location without explicitly specifying database
-    maybeWrapException(isDatasourceTable) {
-      sql("ALTER TABLE tab1 PARTITION (a='1', b='2') SET LOCATION 'vienna'")
-    }
+    sql("ALTER TABLE tab1 PARTITION (a='1', b='2') SET LOCATION 'vienna'")
     verifyLocation("vienna", Some(partSpec))
     // table to alter does not exist
     intercept[AnalysisException] {
@@ -1652,7 +1648,7 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach {
       assertUnsupported("TRUNCATE TABLE rectangles PARTITION (width=1)")
 
       // supported since partitions are stored in the metastore
-      assertUnsupported("TRUNCATE TABLE rectangles2 PARTITION (width=1)")
+      sql("TRUNCATE TABLE rectangles2 PARTITION (width=1)")
       assert(spark.table("rectangles2").collect().isEmpty)
     }
   }

From 05fd862bd1bedf99a7b9b717a139a0df76971e12 Mon Sep 17 00:00:00 2001
From: Wenchen Fan <wenchen@databricks.com>
Date: Wed, 26 Oct 2016 21:28:56 +0800
Subject: [PATCH 95/99] make partitionProviderIsHive a field

---
 .../sql/catalyst/catalog/interface.scala      | 25 ++------
 .../command/createDataSourceTables.scala      | 16 ++---
 .../spark/sql/execution/command/ddl.scala     | 16 ++---
 .../spark/sql/execution/command/tables.scala  | 16 ++---
 .../PartitioningAwareFileCatalog.scala        |  2 +-
 .../sql/execution/command/DDLSuite.scala      | 11 +---
 .../spark/sql/hive/HiveExternalCatalog.scala  | 62 +++++++++++--------
 7 files changed, 61 insertions(+), 87 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
index cd3d3e17d9549..7c3bec897956a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
@@ -138,6 +138,8 @@ case class BucketSpec(
  *                 Can be None if this table is a View, should be "hive" for hive serde tables.
  * @param unsupportedFeatures is a list of string descriptions of features that are used by the
  *        underlying table but not supported by Spark SQL yet.
+ * @param partitionProviderIsHive whether this table's partition metadata is stored in the Hive
+ *                                metastore.
  */
 case class CatalogTable(
     identifier: TableIdentifier,
@@ -155,7 +157,8 @@ case class CatalogTable(
     viewOriginalText: Option[String] = None,
     viewText: Option[String] = None,
     comment: Option[String] = None,
-    unsupportedFeatures: Seq[String] = Seq.empty) {
+    unsupportedFeatures: Seq[String] = Seq.empty,
+    partitionProviderIsHive: Boolean = false) {
 
   /** schema of this table's partition columns */
   def partitionSchema: StructType = StructType(schema.filter {
@@ -213,29 +216,13 @@ case class CatalogTable(
         comment.map("Comment: " + _).getOrElse(""),
         if (properties.nonEmpty) s"Properties: $tableProperties" else "",
         if (stats.isDefined) s"Statistics: ${stats.get.simpleString}" else "",
-        s"$storage")
+        s"$storage",
+        if (partitionProviderIsHive) "Partition Provider: Hive" else "")
 
     output.filter(_.nonEmpty).mkString("CatalogTable(\n\t", "\n\t", ")")
   }
-
-  /**
-   * @return whether this table's partition metadata is stored in the Hive metastore.
-   */
-  def partitionProviderIsHive: Boolean = {
-    import CatalogTable._
-    provider == Some("hive") ||
-      properties.get(PARTITION_PROVIDER_KEY) == Some(PARTITION_PROVIDER_HIVE)
-  }
 }
 
-object CatalogTable {
-  val PARTITION_PROVIDER_KEY = "org.apache.spark/partitionProvider"
-  val PARTITION_PROVIDER_HIVE = "hive"
-
-  def isSparkManagedTableProp(prop: String): Boolean = {
-    prop == PARTITION_PROVIDER_KEY  // only one for now
-  }
-}
 
 case class CatalogTableType private(name: String)
 object CatalogTableType {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
index 0e84ea52f3efd..2a9743130d4c4 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
@@ -91,21 +91,15 @@ case class CreateDataSourceTableCommand(table: CatalogTable, ignoreIfExists: Boo
       table.storage.properties
     }
 
-    val newProps = if (partitionColumnNames.nonEmpty &&
-        sparkSession.sqlContext.conf.manageFilesourcePartitions) {
-      // Start off with partition provider hive, but no partitions in the metastore. The user
-      // has to call `msck repair table` to populate the table partitions.
-      table.properties ++
-        Map(CatalogTable.PARTITION_PROVIDER_KEY -> CatalogTable.PARTITION_PROVIDER_HIVE)
-    } else {
-      table.properties
-    }
-
     val newTable = table.copy(
       storage = table.storage.copy(properties = optionsWithPath),
       schema = dataSource.schema,
       partitionColumnNames = partitionColumnNames,
-      properties = newProps)
+      // If metastore partition management for file source tables is enabled, we start off with
+      // partition provider hive, but no partitions in the metastore. The user has to call
+      // `msck repair table` to populate the table partitions.
+      partitionProviderIsHive = partitionColumnNames.nonEmpty &&
+        sparkSession.sessionState.conf.manageFilesourcePartitions)
     // We will return Nil or throw exception at the beginning if the table already exists, so when
     // we reach here, the table should not exist and we should set `ignoreIfExists` to false.
     sessionState.catalog.createTable(newTable, ignoreIfExists = false)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
index dac2ab7c978d3..5e558e8c96ff1 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
@@ -529,7 +529,10 @@ case class AlterTableRecoverPartitionsCommand(
     logInfo(s"Finished to gather the fast stats for all $total partitions.")
 
     addPartitions(spark, table, partitionSpecsAndLocs, partitionStats)
-    DDLUtils.setPartitionProviderHive(spark, table)
+    // Updates the table to indicate that its partition metadata is stored in the Hive metastore.
+    // This is always the case for Hive format tables, but is not true for Datasource tables created
+    // before Spark 2.1 unless they are converted via `msck repair table`.
+    spark.sessionState.catalog.alterTable(table.copy(partitionProviderIsHive = true))
     catalog.refreshTable(tableName)
     logInfo(s"Recovered all partitions ($total).")
     Seq.empty[Row]
@@ -705,17 +708,6 @@ object DDLUtils {
     table.provider.isDefined && table.provider.get != "hive"
   }
 
-  /**
-   * Updates a table to indicate that its partition metadata is stored in the Hive metastore.
-   * This is always the case for Hive format tables, but is not true for Datasource tables created
-   * before Spark 2.1 unless they are converted via `msck repair table`.
-   */
-  def setPartitionProviderHive(spark: SparkSession, table: CatalogTable): Unit = {
-    spark.sessionState.catalog.alterTable(
-      table.copy(properties = table.properties ++
-        Map(CatalogTable.PARTITION_PROVIDER_KEY -> CatalogTable.PARTITION_PROVIDER_HIVE)))
-  }
-
   /**
    * Throws a standard error for actions that require partitionProvider = hive.
    */
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
index 38f6607a3ba13..4acfffb628047 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
@@ -489,6 +489,10 @@ case class DescribeTableCommand(
     describeStorageInfo(table, buffer)
 
     if (table.tableType == CatalogTableType.VIEW) describeViewInfo(table, buffer)
+
+    if (DDLUtils.isDatasourceTable(table) && table.partitionProviderIsHive) {
+      append(buffer, "Partition Provider:", "Hive", "")
+    }
   }
 
   private def describeStorageInfo(metadata: CatalogTable, buffer: ArrayBuffer[Row]): Unit = {
@@ -886,19 +890,11 @@ case class ShowCreateTableCommand(table: TableIdentifier) extends RunnableComman
 
   private def showHiveTableProperties(metadata: CatalogTable, builder: StringBuilder): Unit = {
     if (metadata.properties.nonEmpty) {
-      val filteredProps = metadata.properties.filterNot {
-        case (key, _) =>
-          (key == "EXTERNAL" && metadata.tableType == EXTERNAL) ||
-            CatalogTable.isSparkManagedTableProp(key)
-      }
-
-      val props = filteredProps.map { case (key, value) =>
+      val props = metadata.properties.map { case (key, value) =>
         s"'${escapeSingleQuotedString(key)}' = '${escapeSingleQuotedString(value)}'"
       }
 
-      if (props.nonEmpty) {
-        builder ++= props.mkString("TBLPROPERTIES (\n  ", ",\n  ", "\n)\n")
-      }
+      builder ++= props.mkString("TBLPROPERTIES (\n  ", ",\n  ", "\n)\n")
     }
   }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
index 38d1b5bcc6046..cc4049e925905 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala
@@ -39,7 +39,7 @@ import org.apache.spark.util.SerializableConfiguration
  *
  * @param parameters as set of options to control partition discovery
  * @param userPartitionSchema an optional partition schema that will be use to provide types for
- *                             the discovered partitions
+ *                            the discovered partitions
  */
 abstract class PartitioningAwareFileCatalog(
     sparkSession: SparkSession,
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
index 51a7710c3cc68..ad27a203d9ee8 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
@@ -96,8 +96,7 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach {
       provider = Some("hive"),
       partitionColumnNames = Seq("a", "b"),
       createTime = 0L,
-      properties = Map(
-        CatalogTable.PARTITION_PROVIDER_KEY -> CatalogTable.PARTITION_PROVIDER_HIVE))
+      partitionProviderIsHive = true)
   }
 
   private def createTable(catalog: SessionCatalog, name: TableIdentifier): Unit = {
@@ -1073,9 +1072,7 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach {
       convertToDatasourceTable(catalog, tableIdent)
     }
     def getProps: Map[String, String] = {
-      catalog.getTableMetadata(tableIdent).properties.filter { case (key, _) =>
-        key != CatalogTable.PARTITION_PROVIDER_KEY
-      }
+      catalog.getTableMetadata(tableIdent).properties
     }
     assert(getProps.isEmpty)
     // set table properties
@@ -1100,9 +1097,7 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach {
       convertToDatasourceTable(catalog, tableIdent)
     }
     def getProps: Map[String, String] = {
-      catalog.getTableMetadata(tableIdent).properties.filter { case (key, _) =>
-        key != CatalogTable.PARTITION_PROVIDER_KEY
-      }
+      catalog.getTableMetadata(tableIdent).properties
     }
     // unset table properties
     sql("ALTER TABLE dbx.tab1 SET TBLPROPERTIES ('j' = 'am', 'p' = 'an', 'c' = 'lan', 'x' = 'y')")
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
index d1d11099a50e7..a75ab56436334 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
@@ -106,13 +106,11 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
    * metastore.
    */
   private def verifyTableProperties(table: CatalogTable): Unit = {
-    val invalidKeys = table.properties.keys.filter { key =>
-      key.startsWith(DATASOURCE_PREFIX) || key.startsWith(STATISTICS_PREFIX)
-    }
+    val invalidKeys = table.properties.keys.filter(_.startsWith(SPARK_SQL_PREFIX))
     if (invalidKeys.nonEmpty) {
       throw new AnalysisException(s"Cannot persistent ${table.qualifiedName} into hive metastore " +
-        s"as table property keys may not start with '$DATASOURCE_PREFIX' or '$STATISTICS_PREFIX':" +
-        s" ${invalidKeys.mkString("[", ", ", "]")}")
+        s"as table property keys may not start with '$SPARK_SQL_PREFIX': " +
+        invalidKeys.mkString("[", ", ", "]"))
     }
     // External users are not allowed to set/switch the table type. In Hive metastore, the table
     // type can be switched by changing the value of a case-sensitive table property `EXTERNAL`.
@@ -191,11 +189,12 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
       throw new TableAlreadyExistsException(db = db, table = table)
     }
     // Before saving data source table metadata into Hive metastore, we should:
-    //  1. Put table schema, partition column names and bucket specification in table properties.
+    //  1. Put table provider, schema, partition column names, bucket specification and partition
+    //     provider in table properties.
     //  2. Check if this table is hive compatible
     //    2.1  If it's not hive compatible, set schema, partition columns and bucket spec to empty
     //         and save table metadata to Hive.
-    //    2.1  If it's hive compatible, set serde information in table metadata and try to save
+    //    2.2  If it's hive compatible, set serde information in table metadata and try to save
     //         it to Hive. If it fails, treat it as not hive compatible and go back to 2.1
     if (DDLUtils.isDatasourceTable(tableDefinition)) {
       // data source table always have a provider, it's guaranteed by `DDLUtils.isDatasourceTable`.
@@ -205,6 +204,9 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
 
       val tableProperties = new scala.collection.mutable.HashMap[String, String]
       tableProperties.put(DATASOURCE_PROVIDER, provider)
+      if (tableDefinition.partitionProviderIsHive) {
+        tableProperties.put(TABLE_PARTITION_PROVIDER, "hive")
+      }
 
       // Serialized JSON schema string may be too long to be stored into a single metastore table
       // property. In this case, we split the JSON string and store each part as a separate table
@@ -420,12 +422,17 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
       // Sets the `schema`, `partitionColumnNames` and `bucketSpec` from the old table definition,
       // to retain the spark specific format if it is. Also add old data source properties to table
       // properties, to retain the data source table format.
-      val oldDataSourceProps = oldDef.properties.filter(_._1.startsWith(DATASOURCE_PREFIX))
+      val oldDataSourceProps = oldDef.properties.filter(_._1.startsWith(SPARK_SQL_PREFIX))
+      val partitionProviderProp = if (tableDefinition.partitionProviderIsHive) {
+        TABLE_PARTITION_PROVIDER -> "hive"
+      } else {
+        TABLE_PARTITION_PROVIDER -> "builtin"
+      }
       val newDef = withStatsProps.copy(
         schema = oldDef.schema,
         partitionColumnNames = oldDef.partitionColumnNames,
         bucketSpec = oldDef.bucketSpec,
-        properties = oldDataSourceProps ++ withStatsProps.properties)
+        properties = oldDataSourceProps ++ withStatsProps.properties + partitionProviderProp)
 
       client.alterTable(newDef)
     } else {
@@ -449,7 +456,7 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
    * properties, and filter out these special entries from table properties.
    */
   private def restoreTableMetadata(table: CatalogTable): CatalogTable = {
-    val catalogTable = if (table.tableType == VIEW || conf.get(DEBUG_MODE)) {
+    val tableWithSchema = if (table.tableType == VIEW || conf.get(DEBUG_MODE)) {
       table
     } else {
       getProviderFromTableProperties(table).map { provider =>
@@ -474,30 +481,32 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
           provider = Some(provider),
           partitionColumnNames = getPartitionColumnsFromTableProperties(table),
           bucketSpec = getBucketSpecFromTableProperties(table),
-          properties = getOriginalTableProperties(table))
+          partitionProviderIsHive = table.properties.get(TABLE_PARTITION_PROVIDER) == Some("hive"))
       } getOrElse {
-        table.copy(provider = Some("hive"))
+        table.copy(provider = Some("hive"), partitionProviderIsHive = true)
       }
     }
+
     // construct Spark's statistics from information in Hive metastore
-    val statsProps = catalogTable.properties.filterKeys(_.startsWith(STATISTICS_PREFIX))
-    if (statsProps.nonEmpty) {
+    val statsProps = tableWithSchema.properties.filterKeys(_.startsWith(STATISTICS_PREFIX))
+    val tableWithStats = if (statsProps.nonEmpty) {
       val colStatsProps = statsProps.filterKeys(_.startsWith(STATISTICS_COL_STATS_PREFIX))
         .map { case (k, v) => (k.drop(STATISTICS_COL_STATS_PREFIX.length), v) }
-      val colStats: Map[String, ColumnStat] = catalogTable.schema.collect {
+      val colStats: Map[String, ColumnStat] = tableWithSchema.schema.collect {
         case f if colStatsProps.contains(f.name) =>
           val numFields = ColumnStatStruct.numStatFields(f.dataType)
           (f.name, ColumnStat(numFields, colStatsProps(f.name)))
       }.toMap
-      catalogTable.copy(
-        properties = removeStatsProperties(catalogTable),
+      tableWithSchema.copy(
         stats = Some(Statistics(
-          sizeInBytes = BigInt(catalogTable.properties(STATISTICS_TOTAL_SIZE)),
-          rowCount = catalogTable.properties.get(STATISTICS_NUM_ROWS).map(BigInt(_)),
+          sizeInBytes = BigInt(tableWithSchema.properties(STATISTICS_TOTAL_SIZE)),
+          rowCount = tableWithSchema.properties.get(STATISTICS_NUM_ROWS).map(BigInt(_)),
           colStats = colStats)))
     } else {
-      catalogTable
+      tableWithSchema
     }
+
+    tableWithStats.copy(properties = getOriginalTableProperties(table))
   }
 
   override def tableExists(db: String, table: String): Boolean = withClient {
@@ -749,7 +758,9 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
 }
 
 object HiveExternalCatalog {
-  val DATASOURCE_PREFIX = "spark.sql.sources."
+  val SPARK_SQL_PREFIX = "spark.sql."
+
+  val DATASOURCE_PREFIX = SPARK_SQL_PREFIX + "sources."
   val DATASOURCE_PROVIDER = DATASOURCE_PREFIX + "provider"
   val DATASOURCE_SCHEMA = DATASOURCE_PREFIX + "schema"
   val DATASOURCE_SCHEMA_PREFIX = DATASOURCE_SCHEMA + "."
@@ -763,21 +774,20 @@ object HiveExternalCatalog {
   val DATASOURCE_SCHEMA_BUCKETCOL_PREFIX = DATASOURCE_SCHEMA_PREFIX + "bucketCol."
   val DATASOURCE_SCHEMA_SORTCOL_PREFIX = DATASOURCE_SCHEMA_PREFIX + "sortCol."
 
-  val STATISTICS_PREFIX = "spark.sql.statistics."
+  val STATISTICS_PREFIX = SPARK_SQL_PREFIX + "statistics."
   val STATISTICS_TOTAL_SIZE = STATISTICS_PREFIX + "totalSize"
   val STATISTICS_NUM_ROWS = STATISTICS_PREFIX + "numRows"
   val STATISTICS_COL_STATS_PREFIX = STATISTICS_PREFIX + "colStats."
 
-  def removeStatsProperties(metadata: CatalogTable): Map[String, String] = {
-    metadata.properties.filterNot { case (key, _) => key.startsWith(STATISTICS_PREFIX) }
-  }
+  val TABLE_PARTITION_PROVIDER = SPARK_SQL_PREFIX + "partitionProvider"
+
 
   def getProviderFromTableProperties(metadata: CatalogTable): Option[String] = {
     metadata.properties.get(DATASOURCE_PROVIDER)
   }
 
   def getOriginalTableProperties(metadata: CatalogTable): Map[String, String] = {
-    metadata.properties.filterNot { case (key, _) => key.startsWith(DATASOURCE_PREFIX) }
+    metadata.properties.filterNot { case (key, _) => key.startsWith(SPARK_SQL_PREFIX) }
   }
 
   // A persisted data source table always store its schema in the catalog.

From 012c124c1a320347310aae1b593c0d6a2098d0b3 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Wed, 26 Oct 2016 12:44:42 -0700
Subject: [PATCH 96/99] fix debug mode test

---
 .../org/apache/spark/sql/hive/HiveExternalCatalog.scala     | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
index a75ab56436334..2b8f826ba7e9e 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
@@ -506,7 +506,11 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
       tableWithSchema
     }
 
-    tableWithStats.copy(properties = getOriginalTableProperties(table))
+    if (conf.get(DEBUG_MODE)) {
+      tableWithStats
+    } else {
+      tableWithStats.copy(properties = getOriginalTableProperties(table))
+    }
   }
 
   override def tableExists(db: String, table: String): Boolean = withClient {

From 9a6fff6e9e5f06e7aa434e467c5ce65f18f9bd70 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Wed, 26 Oct 2016 14:48:24 -0700
Subject: [PATCH 97/99] fix in memory catalog test

---
 .../main/scala/org/apache/spark/sql/execution/command/ddl.scala | 2 +-
 .../src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
index 5e558e8c96ff1..61e0550cef5e3 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
@@ -719,7 +719,7 @@ object DDLUtils {
         s"$action is not allowed on $tableName since filesource partition management is " +
           "disabled (spark.sql.hive.manageFilesourcePartitions = false).")
     }
-    if (!table.partitionProviderIsHive) {
+    if (!table.partitionProviderIsHive && isDatasourceTable(table)) {
       throw new AnalysisException(
         s"$action is not allowed on $tableName since its partition metadata is not stored in " +
           "the Hive metastore. To import this information into the metastore, run " +
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala
index 6857dd37286dd..2d73d9f1fc802 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala
@@ -197,7 +197,7 @@ class SQLQueryTestSuite extends QueryTest with SharedSQLContext {
       assertResult(expected.schema, s"Schema did not match for query #$i\n${expected.sql}") {
         output.schema
       }
-      assertResult(expected.output, s"Result dit not match for query #$i\n${expected.sql}") {
+      assertResult(expected.output, s"Result did not match for query #$i\n${expected.sql}") {
         output.output
       }
     }

From 8c805551d3876a40354faa18ca9749d9f5aecbe1 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Wed, 26 Oct 2016 17:48:41 -0700
Subject: [PATCH 98/99] fix tree-node suite

---
 .../org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala      | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala
index cb0426c7a98a1..3eff12f9eed14 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala
@@ -489,6 +489,7 @@ class TreeNodeSuite extends SparkFunSuite {
         "owner" -> "",
         "createTime" -> 0,
         "lastAccessTime" -> -1,
+        "partitionProviderIsHive" -> false,
         "properties" -> JNull,
         "unsupportedFeatures" -> List.empty[String]))
 

From b6776cc16d97691b1ca1b936302b865ffaa08bcc Mon Sep 17 00:00:00 2001
From: Wenchen Fan <wenchen@databricks.com>
Date: Thu, 27 Oct 2016 11:16:10 +0800
Subject: [PATCH 99/99] minor cleanup

---
 .../apache/spark/sql/hive/HiveExternalCatalog.scala  | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
index 2b8f826ba7e9e..409c316c6802c 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
@@ -456,7 +456,11 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
    * properties, and filter out these special entries from table properties.
    */
   private def restoreTableMetadata(table: CatalogTable): CatalogTable = {
-    val tableWithSchema = if (table.tableType == VIEW || conf.get(DEBUG_MODE)) {
+    if (conf.get(DEBUG_MODE)) {
+      return table
+    }
+
+    val tableWithSchema = if (table.tableType == VIEW) {
       table
     } else {
       getProviderFromTableProperties(table).map { provider =>
@@ -506,11 +510,7 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
       tableWithSchema
     }
 
-    if (conf.get(DEBUG_MODE)) {
-      tableWithStats
-    } else {
-      tableWithStats.copy(properties = getOriginalTableProperties(table))
-    }
+    tableWithStats.copy(properties = getOriginalTableProperties(table))
   }
 
   override def tableExists(db: String, table: String): Boolean = withClient {