-
Notifications
You must be signed in to change notification settings - Fork 29.3k
[SPARK-2460] Optimize SparkContext.hadoopFile api #1385
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
c691f3d
94b3fa2
5d8265b
754f68b
1f0b4ec
659df51
9967588
afbaae7
1c85a3a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -84,7 +84,7 @@ private[spark] class HadoopPartition(rddId: Int, idx: Int, @transient s: InputSp | |
| * variabe references an instance of JobConf, then that JobConf will be used for the Hadoop job. | ||
| * Otherwise, a new JobConf will be created on each slave using the enclosed Configuration. | ||
| * @param initLocalJobConfFuncOpt Optional closure used to initialize any JobConf that HadoopRDD | ||
| * creates. | ||
| * creates. | ||
| * @param inputFormatClass Storage format of the data to be read. | ||
| * @param keyClass Class of the key associated with the inputFormatClass. | ||
| * @param valueClass Class of the value associated with the inputFormatClass. | ||
|
|
@@ -93,7 +93,7 @@ private[spark] class HadoopPartition(rddId: Int, idx: Int, @transient s: InputSp | |
| @DeveloperApi | ||
| class HadoopRDD[K, V]( | ||
| sc: SparkContext, | ||
| broadcastedConf: Broadcast[SerializableWritable[Configuration]], | ||
| broadcastedConf: Broadcast[SerializableWritable[JobConf]], | ||
| initLocalJobConfFuncOpt: Option[JobConf => Unit], | ||
| inputFormatClass: Class[_ <: InputFormat[K, V]], | ||
| keyClass: Class[K], | ||
|
|
@@ -110,8 +110,7 @@ class HadoopRDD[K, V]( | |
| minPartitions: Int) = { | ||
| this( | ||
| sc, | ||
| sc.broadcast(new SerializableWritable(conf)) | ||
| .asInstanceOf[Broadcast[SerializableWritable[Configuration]]], | ||
| sc.broadcast(new SerializableWritable(conf)), | ||
| None /* initLocalJobConfFuncOpt */, | ||
| inputFormatClass, | ||
| keyClass, | ||
|
|
@@ -128,25 +127,16 @@ class HadoopRDD[K, V]( | |
|
|
||
| // Returns a JobConf that will be used on slaves to obtain input splits for Hadoop reads. | ||
| protected def getJobConf(): JobConf = { | ||
| val conf: Configuration = broadcastedConf.value.value | ||
| if (conf.isInstanceOf[JobConf]) { | ||
| // A user-broadcasted JobConf was provided to the HadoopRDD, so always use it. | ||
| conf.asInstanceOf[JobConf] | ||
| } else if (HadoopRDD.containsCachedMetadata(jobConfCacheKey)) { | ||
| val conf: JobConf = broadcastedConf.value.value | ||
| if (HadoopRDD.containsCachedMetadata(jobConfCacheKey)) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we still need this check? I think this may have been the only place we put the JobConf inside the cache.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yeah, there is no need to cache the jobconf if it is in broadcast
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes,i agree with you. broadcastedConf is cached by blockManager in Broadcast |
||
| // getJobConf() has been called previously, so there is already a local cache of the JobConf | ||
| // needed by this RDD. | ||
| HadoopRDD.getCachedMetadata(jobConfCacheKey).asInstanceOf[JobConf] | ||
| } else { | ||
| // Create a JobConf that will be cached and used across this RDD's getJobConf() calls in the | ||
| // local process. The local cache is accessed through HadoopRDD.putCachedMetadata(). | ||
| // The caching helps minimize GC, since a JobConf can contain ~10KB of temporary objects. | ||
| // synchronize to prevent ConcurrentModificationException (Spark-1097, Hadoop-10456) | ||
| conf.synchronized { | ||
| val newJobConf = new JobConf(conf) | ||
| initLocalJobConfFuncOpt.map(f => f(newJobConf)) | ||
| HadoopRDD.putCachedMetadata(jobConfCacheKey, newJobConf) | ||
| newJobConf | ||
| initLocalJobConfFuncOpt.synchronized { | ||
| initLocalJobConfFuncOpt.map(f => f(conf)) | ||
| } | ||
| conf | ||
| } | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do you know if there was ever a reason we broadcasted the Configuration and this lambda, just to turn it into a JobConf on the Executor side? Was it really just extra work for no reason?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes. See my comment below.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
there is no extra reason for broascast Configuration. I think it is because the old api here Instantiate HadoopRDD directly and HadoopRDD's construct method need broadcastedConf: Broadcast[SerializableWritable[Configuration]], so here broadcasted the Configuration.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There is. See my comment below.