-
Notifications
You must be signed in to change notification settings - Fork 29.3k
[SPARK-55337][SS] Fix MemoryStream backward compatibility #54108
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
823133a
3ca4eb6
fdb8c9e
e1c0b1e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -43,36 +43,51 @@ import org.apache.spark.sql.internal.connector.SimpleTableProvider | |
| import org.apache.spark.sql.types.StructType | ||
| import org.apache.spark.sql.util.CaseInsensitiveStringMap | ||
|
|
||
| object MemoryStream extends LowPriorityMemoryStreamImplicits { | ||
| object MemoryStream { | ||
| protected val currentBlockId = new AtomicInteger(0) | ||
| protected val memoryStreamId = new AtomicInteger(0) | ||
|
|
||
| def apply[A : Encoder](implicit sparkSession: SparkSession): MemoryStream[A] = | ||
| new MemoryStream[A](memoryStreamId.getAndIncrement(), sparkSession) | ||
|
|
||
| def apply[A : Encoder](numPartitions: Int)(implicit sparkSession: SparkSession): MemoryStream[A] = | ||
| new MemoryStream[A](memoryStreamId.getAndIncrement(), sparkSession, Some(numPartitions)) | ||
| } | ||
|
|
||
| /** | ||
| * Provides lower-priority implicits for MemoryStream to prevent ambiguity when both | ||
| * SparkSession and SQLContext are in scope. The implicits in the companion object, | ||
| * which use SparkSession, take higher precedence. | ||
| */ | ||
| trait LowPriorityMemoryStreamImplicits { | ||
| this: MemoryStream.type => | ||
|
|
||
| // Deprecated: Used when an implicit SQLContext is in scope | ||
| @deprecated("Use MemoryStream.apply with an implicit SparkSession instead of SQLContext", "4.1.0") | ||
| def apply[A: Encoder]()(implicit sqlContext: SQLContext): MemoryStream[A] = | ||
| /** | ||
| * Creates a MemoryStream with an implicit SQLContext (backward compatible). | ||
| * Usage: `MemoryStream[Int]` | ||
| */ | ||
| def apply[A: Encoder](implicit sqlContext: SQLContext): MemoryStream[A] = | ||
| new MemoryStream[A](memoryStreamId.getAndIncrement(), sqlContext.sparkSession) | ||
|
|
||
| @deprecated("Use MemoryStream.apply with an implicit SparkSession instead of SQLContext", "4.1.0") | ||
| def apply[A: Encoder](numPartitions: Int)(implicit sqlContext: SQLContext): MemoryStream[A] = | ||
| /** | ||
| * Creates a MemoryStream with specified partitions using implicit SQLContext. | ||
| * Usage: `MemoryStream[Int](numPartitions)` | ||
| */ | ||
| def apply[A: Encoder](numPartitions: Int)( | ||
| implicit sqlContext: SQLContext): MemoryStream[A] = | ||
| new MemoryStream[A]( | ||
| memoryStreamId.getAndIncrement(), | ||
| sqlContext.sparkSession, | ||
| Some(numPartitions)) | ||
|
|
||
| /** | ||
| * Creates a MemoryStream with explicit SparkSession. | ||
| * Usage: `MemoryStream[Int](spark)` | ||
| */ | ||
| def apply[A: Encoder](sparkSession: SparkSession): MemoryStream[A] = | ||
| new MemoryStream[A](memoryStreamId.getAndIncrement(), sparkSession) | ||
|
|
||
| /** | ||
| * Creates a MemoryStream with specified partitions using explicit SparkSession. | ||
| * Usage: `MemoryStream[Int](spark, numPartitions)` | ||
| */ | ||
| def apply[A: Encoder](sparkSession: SparkSession, numPartitions: Int): MemoryStream[A] = | ||
| new MemoryStream[A]( | ||
| memoryStreamId.getAndIncrement(), | ||
| sparkSession, | ||
| Some(numPartitions)) | ||
|
|
||
| /** | ||
| * Creates a MemoryStream with explicit encoder and SparkSession. | ||
| * Usage: `MemoryStream(Encoders.scalaInt, spark)` | ||
| */ | ||
| def apply[A](encoder: Encoder[A], sparkSession: SparkSession): MemoryStream[A] = | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I roughly remember the intention was to discourage the usage of SQLContext - if that's the case, we probably want to have the way to pass That said, looks like this method (explicit encoder instance) is newly added. Is there any usage of this? We don't seem to add the same in
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's used in quite some places like
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've add a new overload to specific numPartitions with SparkSession. |
||
| new MemoryStream[A](memoryStreamId.getAndIncrement(), sparkSession)(encoder) | ||
| } | ||
|
|
||
| /** | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -343,84 +343,6 @@ class MemorySinkSuite extends StreamTest with BeforeAndAfter { | |
| intsToDF(expected)(schema)) | ||
| } | ||
|
|
||
| test("LowPriorityMemoryStreamImplicits works with implicit sqlContext") { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. lol we added tests in wrong place... the file seems to be for memory "sink", not memory "source". |
||
| // Test that MemoryStream can be created using implicit sqlContext | ||
| implicit val sqlContext: SQLContext = spark.sqlContext | ||
|
|
||
| // Test MemoryStream[A]() with implicit sqlContext | ||
| val stream1 = MemoryStream[Int]() | ||
| assert(stream1 != null) | ||
|
|
||
| // Test MemoryStream[A](numPartitions) with implicit sqlContext | ||
| val stream2 = MemoryStream[String](3) | ||
| assert(stream2 != null) | ||
|
|
||
| // Verify the streams work correctly | ||
| stream1.addData(1, 2, 3) | ||
| val df1 = stream1.toDF() | ||
| assert(df1.schema.fieldNames.contains("value")) | ||
|
|
||
| stream2.addData("a", "b", "c") | ||
| val df2 = stream2.toDF() | ||
| assert(df2.schema.fieldNames.contains("value")) | ||
| } | ||
|
|
||
| test("LowPriorityContinuousMemoryStreamImplicits works with implicit sqlContext") { | ||
| import org.apache.spark.sql.execution.streaming.sources.ContinuousMemoryStream | ||
| // Test that ContinuousMemoryStream can be created using implicit sqlContext | ||
| implicit val sqlContext: SQLContext = spark.sqlContext | ||
|
|
||
| // Test ContinuousMemoryStream[A]() with implicit sqlContext | ||
| val stream1 = ContinuousMemoryStream[Int]() | ||
| assert(stream1 != null) | ||
|
|
||
| // Test ContinuousMemoryStream[A](numPartitions) with implicit sqlContext | ||
| val stream2 = ContinuousMemoryStream[String](3) | ||
| assert(stream2 != null) | ||
|
|
||
| // Test ContinuousMemoryStream.singlePartition with implicit sqlContext | ||
| val stream3 = ContinuousMemoryStream.singlePartition[Int]() | ||
| assert(stream3 != null) | ||
|
|
||
| // Verify the streams work correctly | ||
| stream1.addData(Seq(1, 2, 3)) | ||
| stream2.addData(Seq("a", "b", "c")) | ||
| stream3.addData(Seq(10, 20)) | ||
|
|
||
| // Basic verification that streams are functional | ||
| assert(stream1.initialOffset() != null) | ||
| assert(stream2.initialOffset() != null) | ||
| assert(stream3.initialOffset() != null) | ||
| } | ||
|
|
||
| test("LowPriorityLowLatencyMemoryStreamImplicits works with implicit sqlContext") { | ||
| import org.apache.spark.sql.execution.streaming.LowLatencyMemoryStream | ||
| // Test that LowLatencyMemoryStream can be created using implicit sqlContext | ||
| implicit val sqlContext: SQLContext = spark.sqlContext | ||
|
|
||
| // Test LowLatencyMemoryStream[A]() with implicit sqlContext | ||
| val stream1 = LowLatencyMemoryStream[Int]() | ||
| assert(stream1 != null) | ||
|
|
||
| // Test LowLatencyMemoryStream[A](numPartitions) with implicit sqlContext | ||
| val stream2 = LowLatencyMemoryStream[String](3) | ||
| assert(stream2 != null) | ||
|
|
||
| // Test LowLatencyMemoryStream.singlePartition with implicit sqlContext | ||
| val stream3 = LowLatencyMemoryStream.singlePartition[Int]() | ||
| assert(stream3 != null) | ||
|
|
||
| // Verify the streams work correctly | ||
| stream1.addData(Seq(1, 2, 3)) | ||
| stream2.addData(Seq("a", "b", "c")) | ||
| stream3.addData(Seq(10, 20)) | ||
|
|
||
| // Basic verification that streams are functional | ||
| assert(stream1.initialOffset() != null) | ||
| assert(stream2.initialOffset() != null) | ||
| assert(stream3.initialOffset() != null) | ||
| } | ||
|
|
||
| private implicit def intsToDF(seq: Seq[Int])(implicit schema: StructType): DataFrame = { | ||
| require(schema.fields.length === 1) | ||
| sqlContext.createDataset(seq).toDF(schema.fieldNames.head) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is the problem. This is not backward compatible, as the previous API is
def apply[A: Encoder](implicit sqlContext: SQLContext): MemoryStream[A](no parentheses).There is no way to keep both implicits. So the proposal here is to only keep implicit
SQLContext, and require to passSparkSessionimplicitly.