-
Notifications
You must be signed in to change notification settings - Fork 29.2k
[SPARK-28837][SQL] CTAS/RTAS should use nullable schema #25536
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -232,7 +232,7 @@ class DataSourceV2SQLSuite extends QueryTest with SharedSparkSession with Before | |
| assert(table.partitioning.isEmpty) | ||
| assert(table.properties == Map("provider" -> "foo").asJava) | ||
| assert(table.schema == new StructType() | ||
| .add("id", LongType, nullable = false) | ||
| .add("id", LongType) | ||
| .add("data", StringType)) | ||
|
|
||
| val rdd = spark.sparkContext.parallelize(table.asInstanceOf[InMemoryTable].rows) | ||
|
|
@@ -258,8 +258,7 @@ class DataSourceV2SQLSuite extends QueryTest with SharedSparkSession with Before | |
| assert(replacedTable.name == identifier) | ||
| assert(replacedTable.partitioning.isEmpty) | ||
| assert(replacedTable.properties == Map("provider" -> "foo").asJava) | ||
| assert(replacedTable.schema == new StructType() | ||
| .add("id", LongType, nullable = false)) | ||
| assert(replacedTable.schema == new StructType().add("id", LongType)) | ||
|
|
||
| val rdd = spark.sparkContext.parallelize(replacedTable.asInstanceOf[InMemoryTable].rows) | ||
| checkAnswer( | ||
|
|
@@ -391,7 +390,7 @@ class DataSourceV2SQLSuite extends QueryTest with SharedSparkSession with Before | |
| assert(table.partitioning.isEmpty) | ||
| assert(table.properties == Map("provider" -> orc2).asJava) | ||
| assert(table.schema == new StructType() | ||
| .add("id", LongType, nullable = false) | ||
| .add("id", LongType) | ||
| .add("data", StringType)) | ||
|
|
||
| val rdd = spark.sparkContext.parallelize(table.asInstanceOf[InMemoryTable].rows) | ||
|
|
@@ -408,7 +407,7 @@ class DataSourceV2SQLSuite extends QueryTest with SharedSparkSession with Before | |
| assert(table.partitioning.isEmpty) | ||
| assert(table.properties == Map("provider" -> "foo").asJava) | ||
| assert(table.schema == new StructType() | ||
| .add("id", LongType, nullable = false) | ||
| .add("id", LongType) | ||
| .add("data", StringType)) | ||
|
|
||
| val rdd = spark.sparkContext.parallelize(table.asInstanceOf[InMemoryTable].rows) | ||
|
|
@@ -428,7 +427,7 @@ class DataSourceV2SQLSuite extends QueryTest with SharedSparkSession with Before | |
| assert(table2.partitioning.isEmpty) | ||
| assert(table2.properties == Map("provider" -> "foo").asJava) | ||
| assert(table2.schema == new StructType() | ||
| .add("id", LongType, nullable = false) | ||
| .add("id", LongType) | ||
| .add("data", StringType)) | ||
|
|
||
| val rdd2 = spark.sparkContext.parallelize(table.asInstanceOf[InMemoryTable].rows) | ||
|
|
@@ -446,7 +445,7 @@ class DataSourceV2SQLSuite extends QueryTest with SharedSparkSession with Before | |
| assert(table.partitioning.isEmpty) | ||
| assert(table.properties == Map("provider" -> "foo").asJava) | ||
| assert(table.schema == new StructType() | ||
| .add("id", LongType, nullable = false) | ||
| .add("id", LongType) | ||
| .add("data", StringType)) | ||
|
|
||
| val rdd = spark.sparkContext.parallelize(table.asInstanceOf[InMemoryTable].rows) | ||
|
|
@@ -477,7 +476,7 @@ class DataSourceV2SQLSuite extends QueryTest with SharedSparkSession with Before | |
| assert(table.partitioning.isEmpty) | ||
| assert(table.properties == Map("provider" -> "foo").asJava) | ||
| assert(table.schema == new StructType() | ||
| .add("id", LongType, nullable = false) | ||
| .add("id", LongType) | ||
| .add("data", StringType)) | ||
|
|
||
| val rdd = sparkContext.parallelize(table.asInstanceOf[InMemoryTable].rows) | ||
|
|
@@ -500,6 +499,32 @@ class DataSourceV2SQLSuite extends QueryTest with SharedSparkSession with Before | |
| assert(t.isInstanceOf[UnresolvedTable], "V1 table wasn't returned as an unresolved table") | ||
| } | ||
|
|
||
| test("CreateTableAsSelect: nullable schema") { | ||
| val basicCatalog = catalog("testcat").asTableCatalog | ||
| val atomicCatalog = catalog("testcat_atomic").asTableCatalog | ||
| val basicIdentifier = "testcat.table_name" | ||
| val atomicIdentifier = "testcat_atomic.table_name" | ||
|
|
||
| Seq((basicCatalog, basicIdentifier), (atomicCatalog, atomicIdentifier)).foreach { | ||
| case (catalog, identifier) => | ||
| spark.sql(s"CREATE TABLE $identifier USING foo AS SELECT 1 i") | ||
|
|
||
| val table = catalog.loadTable(Identifier.of(Array(), "table_name")) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Minor and non-blocking: "table_name" is repeated many times. Would it be better to make it a test class variable that each test case references? Sorry, maybe I'm being too nitpicky, lol. PR looks good to me :)
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. To me, it's better if you don't have to jump around too much when reading the code.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. in your PR https://github.com/apache/spark/pull/25507/files so I thought it's a style convention... |
||
|
|
||
| assert(table.name == identifier) | ||
| assert(table.partitioning.isEmpty) | ||
| assert(table.properties == Map("provider" -> "foo").asJava) | ||
| assert(table.schema == new StructType().add("i", "int")) | ||
|
|
||
| val rdd = spark.sparkContext.parallelize(table.asInstanceOf[InMemoryTable].rows) | ||
| checkAnswer(spark.internalCreateDataFrame(rdd, table.schema), Row(1)) | ||
|
|
||
| sql(s"INSERT INTO $identifier SELECT CAST(null AS INT)") | ||
| val rdd2 = spark.sparkContext.parallelize(table.asInstanceOf[InMemoryTable].rows) | ||
| checkAnswer(spark.internalCreateDataFrame(rdd2, table.schema), Seq(Row(1), Row(null))) | ||
| } | ||
| } | ||
|
|
||
| test("DropTable: basic") { | ||
| val tableName = "testcat.ns1.ns2.tbl" | ||
| val ident = Identifier.of(Array("ns1", "ns2"), "tbl") | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We don't need to update the schema nullability of the corresponding logical plans,
`CreateTableAsSelect` and `ReplaceTable`, in the analyzer phase? Any reason to directly update the nullability of physical plans?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it's too much work if we need to transform the logical plan and add an extra Project to change the nullability.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it would be incorrect to change the logical plan. The behavior of CTAS should be that tables are created with nullable types. The query used by CTAS should not be changed.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah, I see. Thanks. Looks ok to me.