From c32fad04d332d8006252d81b9dfb890dc69c9de1 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:14 -0500 Subject: [PATCH 001/103] spelling: actual Signed-off-by: Josh Soref --- R/pkg/R/pairRDD.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/R/pairRDD.R b/R/pkg/R/pairRDD.R index b29381bb900fb..8af0c0cf0f421 100644 --- a/R/pkg/R/pairRDD.R +++ b/R/pkg/R/pairRDD.R @@ -239,7 +239,7 @@ setMethod("partitionByRDD", javaPairRDD <- callJMethod(javaPairRDD, "partitionBy", rPartitioner) # Call .values() on the result to get back the final result, the - # shuffled acutal content key-val pairs. + # shuffled actual content key-val pairs. r <- callJMethod(javaPairRDD, "values") RDD(r, serializedMode = "byte") From 7a041bd7dddd51992e0e58ef6add384ed717d345 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:14 -0500 Subject: [PATCH 002/103] spelling: address Signed-off-by: Josh Soref --- .../java/org/apache/spark/network/client/TransportClient.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/network-common/src/main/java/org/apache/spark/network/client/TransportClient.java b/common/network-common/src/main/java/org/apache/spark/network/client/TransportClient.java index 6dcc703e92669..eb2882074d7c7 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/client/TransportClient.java +++ b/common/network-common/src/main/java/org/apache/spark/network/client/TransportClient.java @@ -303,7 +303,7 @@ public void close() { @Override public String toString() { return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) - .append("remoteAdress", channel.remoteAddress()) + .append("remoteAddress", channel.remoteAddress()) .append("clientId", clientId) .append("isActive", isActive()) .toString(); From 4ee2f69a1592454cd7fce3a241bae7fa09446d1e Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:14 -0500 Subject: [PATCH 003/103] spelling: against Signed-off-by: Josh Soref --- python/pyspark/mllib/stat/_statistics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/mllib/stat/_statistics.py b/python/pyspark/mllib/stat/_statistics.py index 43454ba5187dd..22da0471b8400 100644 --- a/python/pyspark/mllib/stat/_statistics.py +++ b/python/pyspark/mllib/stat/_statistics.py @@ -159,7 +159,7 @@ def chiSqTest(observed, expected=None): """ If `observed` is Vector, conduct Pearson's chi-squared goodness of fit test of the observed data against the expected distribution, - or againt the uniform distribution (by default), with each category + or against the uniform distribution (by default), with each category having an expected frequency of `1 / len(observed)`. If `observed` is matrix, conduct Pearson's independence test on the From 5eeb636c3b49b9c60421e8967b0000f4119f2d2a Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:14 -0500 Subject: [PATCH 004/103] spelling: algorithms Signed-off-by: Josh Soref --- R/pkg/R/mllib_utils.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/R/mllib_utils.R b/R/pkg/R/mllib_utils.R index f38f1ac3a6b4c..d943d8d0ab4c0 100644 --- a/R/pkg/R/mllib_utils.R +++ b/R/pkg/R/mllib_utils.R @@ -18,7 +18,7 @@ # mllib_utils.R: Utilities for MLlib integration # Integration with R's standard functions. -# Most of MLlib's argorithms are provided in two flavours: +# Most of MLlib's algorithms are provided in two flavours: # - a specialization of the default R methods (glm). These methods try to respect # the inputs and the outputs of R's method to the largest extent, but some small differences # may exist. From 5160398923d53662adb8c50bc98a7b70006eebc4 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:14 -0500 Subject: [PATCH 005/103] spelling: alternative Signed-off-by: Josh Soref --- R/pkg/inst/worker/worker.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/pkg/inst/worker/worker.R b/R/pkg/inst/worker/worker.R index 1ef05ea621e83..9ef10c0394a17 100644 --- a/R/pkg/inst/worker/worker.R +++ b/R/pkg/inst/worker/worker.R @@ -180,7 +180,7 @@ if (isEmpty != 0) { } else if (deserializer == "arrow" && mode == 1) { data <- SparkR:::readDeserializeInArrow(inputCon) # See https://stat.ethz.ch/pipermail/r-help/2010-September/252046.html - # rbind.fill might be an anternative to make it faster if plyr is installed. + # rbind.fill might be an alternative to make it faster if plyr is installed. # Also, note that, 'dapply' applies a function to each partition. data <- do.call("rbind", data) } @@ -212,7 +212,7 @@ if (isEmpty != 0) { if (serializer == "arrow") { # See https://stat.ethz.ch/pipermail/r-help/2010-September/252046.html - # rbind.fill might be an anternative to make it faster if plyr is installed. + # rbind.fill might be an alternative to make it faster if plyr is installed. combined <- do.call("rbind", outputs) SparkR:::writeSerializeInArrow(outputCon, combined) } From 4af7ec0e0b5430041d6fc1e48c88b51a738f7caf Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:14 -0500 Subject: [PATCH 006/103] spelling: avoid Signed-off-by: Josh Soref --- python/docs/source/_templates/autosummary/class.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/docs/source/_templates/autosummary/class.rst b/python/docs/source/_templates/autosummary/class.rst index d794f797ee2ad..b5f62677ee0ed 100644 --- a/python/docs/source/_templates/autosummary/class.rst +++ b/python/docs/source/_templates/autosummary/class.rst @@ -15,7 +15,7 @@ specific language governing permissions and limitations under the License. -.. Workaround to avoud documenting __init__. +.. Workaround to avoid documenting __init__. {% extends "!autosummary/class.rst" %} From 10978617ae2c4d64fab474330656b883e3adb0ec Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:15 -0500 Subject: [PATCH 007/103] spelling: cannot Signed-off-by: Josh Soref --- .../org/apache/spark/streaming/api/python/PythonDStream.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala index 570663c6f6ad3..7a8e3f1d2ccf4 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala @@ -163,7 +163,7 @@ private[python] object PythonTransformFunctionSerializer { private[streaming] object PythonDStream { /** - * can not access PythonTransformFunctionSerializer.register() via Py4j + * cannot access PythonTransformFunctionSerializer.register() via Py4j * Py4JError: PythonTransformFunctionSerializerregister does not exist in the JVM */ def registerSerializer(ser: PythonTransformFunctionSerializer): Unit = { From ab19ac245e5b938d0eeb4a92420d93b65634366b Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:15 -0500 Subject: [PATCH 008/103] spelling: centers Signed-off-by: Josh Soref --- python/pyspark/mllib/clustering.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/mllib/clustering.py b/python/pyspark/mllib/clustering.py index b99a4150c396d..6dffcadfc5dd5 100644 --- a/python/pyspark/mllib/clustering.py +++ b/python/pyspark/mllib/clustering.py @@ -843,7 +843,7 @@ def setInitialCenters(self, centers, weights): @since('1.5.0') def setRandomCenters(self, dim, weight, seed): """ - Set the initial centres to be random samples from + Set the initial centers to be random samples from a gaussian population with constant weights. """ rng = random.RandomState(seed) From 326b346376fc9d6a669fb7f5997468c9cc6befb2 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:15 -0500 Subject: [PATCH 009/103] spelling: checkpointof Signed-off-by: Josh Soref --- .../test/java/test/org/apache/spark/streaming/JavaAPISuite.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/streaming/src/test/java/test/org/apache/spark/streaming/JavaAPISuite.java b/streaming/src/test/java/test/org/apache/spark/streaming/JavaAPISuite.java index c7cde5674f547..8a57b0c58b228 100644 --- a/streaming/src/test/java/test/org/apache/spark/streaming/JavaAPISuite.java +++ b/streaming/src/test/java/test/org/apache/spark/streaming/JavaAPISuite.java @@ -1595,7 +1595,7 @@ public void testContextGetOrCreate() throws InterruptedException { /* TEST DISABLED: Pending a discussion about checkpoint() semantics with TD @SuppressWarnings("unchecked") @Test - public void testCheckpointofIndividualStream() throws InterruptedException { + public void testCheckpointOfIndividualStream() throws InterruptedException { List> inputData = Arrays.asList( Arrays.asList("this", "is"), Arrays.asList("a", "test"), From d80f112ee9ac25b5c38ddb6ddc89acb071b07e34 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:15 -0500 Subject: [PATCH 010/103] spelling: claim Signed-off-by: Josh Soref --- .../apache/spark/deploy/k8s/KubernetesVolumeUtilsSuite.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/KubernetesVolumeUtilsSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/KubernetesVolumeUtilsSuite.scala index 349cbd04f6027..156740d7c8aee 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/KubernetesVolumeUtilsSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/KubernetesVolumeUtilsSuite.scala @@ -49,14 +49,14 @@ class KubernetesVolumeUtilsSuite extends SparkFunSuite { val sparkConf = new SparkConf(false) sparkConf.set("test.persistentVolumeClaim.volumeName.mount.path", "/path") sparkConf.set("test.persistentVolumeClaim.volumeName.mount.readOnly", "true") - sparkConf.set("test.persistentVolumeClaim.volumeName.options.claimName", "claimeName") + sparkConf.set("test.persistentVolumeClaim.volumeName.options.claimName", "claimName") val volumeSpec = KubernetesVolumeUtils.parseVolumesWithPrefix(sparkConf, "test.").head assert(volumeSpec.volumeName === "volumeName") assert(volumeSpec.mountPath === "/path") assert(volumeSpec.mountReadOnly) assert(volumeSpec.volumeConf.asInstanceOf[KubernetesPVCVolumeConf] === - KubernetesPVCVolumeConf("claimeName")) + KubernetesPVCVolumeConf("claimName")) } test("Parses emptyDir volumes correctly") { From b331088463e12e9c2374d3c20217aadc93e1bfd1 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:15 -0500 Subject: [PATCH 011/103] spelling: cloudpickle Signed-off-by: Josh Soref --- python/pyspark/cloudpickle/cloudpickle.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/cloudpickle/cloudpickle.py b/python/pyspark/cloudpickle/cloudpickle.py index 8e683e7a6988b..dffa9362fabce 100644 --- a/python/pyspark/cloudpickle/cloudpickle.py +++ b/python/pyspark/cloudpickle/cloudpickle.py @@ -699,7 +699,7 @@ def _make_skel_func(code, cell_count, base_globals=None): """ # This function is deprecated and should be removed in cloudpickle 1.7 warnings.warn( - "A pickle file created using an old (<=1.4.1) version of cloudpicke " + "A pickle file created using an old (<=1.4.1) version of cloudpickle " "is currently being loaded. This is not supported by cloudpickle and " "will break in cloudpickle 1.7", category=UserWarning ) From 6e38abbf71c1cc5ee32d97f7d4dd232acd6ed668 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Wed, 11 Nov 2020 19:53:42 -0500 Subject: [PATCH 012/103] spelling: cloudpickler Signed-off-by: Josh Soref --- python/pyspark/cloudpickle/cloudpickle_fast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/cloudpickle/cloudpickle_fast.py b/python/pyspark/cloudpickle/cloudpickle_fast.py index e8e46b88fdc91..1c683610a4c73 100644 --- a/python/pyspark/cloudpickle/cloudpickle_fast.py +++ b/python/pyspark/cloudpickle/cloudpickle_fast.py @@ -556,7 +556,7 @@ def dump(self, obj): # `dispatch` attribute. Earlier versions of the protocol 5 CloudPickler # used `CloudPickler.dispatch` as a class-level attribute storing all # reducers implemented by cloudpickle, but the attribute name was not a - # great choice given the meaning of `Cloudpickler.dispatch` when + # great choice given the meaning of `CloudPickler.dispatch` when # `CloudPickler` extends the pure-python pickler. dispatch = dispatch_table From d30b24ebe089281418c6e9e8c393f256814cd87b Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:15 -0500 Subject: [PATCH 013/103] spelling: column Signed-off-by: Josh Soref --- R/pkg/R/WindowSpec.R | 2 +- R/pkg/R/column.R | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/R/pkg/R/WindowSpec.R b/R/pkg/R/WindowSpec.R index 037809cd0923e..5268a13cbf46b 100644 --- a/R/pkg/R/WindowSpec.R +++ b/R/pkg/R/WindowSpec.R @@ -231,7 +231,7 @@ setMethod("rangeBetween", #' @rdname over #' @name over #' @aliases over,Column,WindowSpec-method -#' @family colum_func +#' @family column_func #' @examples #' \dontrun{ #' df <- createDataFrame(mtcars) diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index 835178990b485..b515907c1cf33 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -135,7 +135,7 @@ createMethods() #' @rdname alias #' @name alias #' @aliases alias,Column-method -#' @family colum_func +#' @family column_func #' @examples #' \dontrun{ #' df <- createDataFrame(iris) @@ -161,7 +161,7 @@ setMethod("alias", #' #' @rdname substr #' @name substr -#' @family colum_func +#' @family column_func #' @aliases substr,Column-method #' #' @param x a Column. @@ -187,7 +187,7 @@ setMethod("substr", signature(x = "Column"), #' #' @rdname startsWith #' @name startsWith -#' @family colum_func +#' @family column_func #' @aliases startsWith,Column-method #' #' @param x vector of character string whose "starts" are considered @@ -206,7 +206,7 @@ setMethod("startsWith", signature(x = "Column"), #' #' @rdname endsWith #' @name endsWith -#' @family colum_func +#' @family column_func #' @aliases endsWith,Column-method #' #' @param x vector of character string whose "ends" are considered @@ -224,7 +224,7 @@ setMethod("endsWith", signature(x = "Column"), #' #' @rdname between #' @name between -#' @family colum_func +#' @family column_func #' @aliases between,Column-method #' #' @param x a Column @@ -251,7 +251,7 @@ setMethod("between", signature(x = "Column"), # nolint end #' @rdname cast #' @name cast -#' @family colum_func +#' @family column_func #' @aliases cast,Column-method #' #' @examples @@ -300,7 +300,7 @@ setMethod("%in%", #' Can be a single value or a Column. #' @rdname otherwise #' @name otherwise -#' @family colum_func +#' @family column_func #' @aliases otherwise,Column-method #' @note otherwise since 1.5.0 setMethod("otherwise", From bc798f66fd27729853db3857f9890f3ee6d25588 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:15 -0500 Subject: [PATCH 014/103] spelling: combination Signed-off-by: Josh Soref --- python/pyspark/sql/functions.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 4af5d1f484ee4..79c319d462c88 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -4601,7 +4601,7 @@ def years(col): Notes ----- - This function can be used only in combinatiion with + This function can be used only in combination with :py:meth:`~pyspark.sql.readwriter.DataFrameWriterV2.partitionedBy` method of the `DataFrameWriterV2`. @@ -4625,7 +4625,7 @@ def months(col): Notes ----- - This function can be used only in combinatiion with + This function can be used only in combination with :py:meth:`~pyspark.sql.readwriter.DataFrameWriterV2.partitionedBy` method of the `DataFrameWriterV2`. @@ -4649,7 +4649,7 @@ def days(col): Notes ----- - This function can be used only in combinatiion with + This function can be used only in combination with :py:meth:`~pyspark.sql.readwriter.DataFrameWriterV2.partitionedBy` method of the `DataFrameWriterV2`. @@ -4673,7 +4673,7 @@ def hours(col): Notes ----- - This function can be used only in combinatiion with + This function can be used only in combination with :py:meth:`~pyspark.sql.readwriter.DataFrameWriterV2.partitionedBy` method of the `DataFrameWriterV2`. From b523262ec60604a96897f8f442caf27e09acbcea Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:15 -0500 Subject: [PATCH 015/103] spelling: combinations Signed-off-by: Josh Soref --- python/pyspark/sql/pandas/functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/sql/pandas/functions.py b/python/pyspark/sql/pandas/functions.py index 750aa4b0e6c56..5c33ee84a791e 100644 --- a/python/pyspark/sql/pandas/functions.py +++ b/python/pyspark/sql/pandas/functions.py @@ -114,7 +114,7 @@ def pandas_udf(f=None, returnType=None, functionType=None): | |-- col1: string (nullable = true) | |-- col2: long (nullable = true) - In the following sections, it describes the cominations of the supported type hints. For + In the following sections, it describes the combinations of the supported type hints. For simplicity, `pandas.DataFrame` variant is omitted. * Series to Series From f2d8d7d82dc5e7714ad2451d5548ab8841c1410b Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:16 -0500 Subject: [PATCH 016/103] spelling: compatibility Signed-off-by: Josh Soref --- python/pyspark/__init__.pyi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/__init__.pyi b/python/pyspark/__init__.pyi index 98bd40684c01b..ef07c32b1db7b 100644 --- a/python/pyspark/__init__.pyi +++ b/python/pyspark/__init__.pyi @@ -53,7 +53,7 @@ from pyspark.taskcontext import ( # noqa: F401 ) from pyspark.util import InheritableThread as InheritableThread # noqa: F401 -# Compatiblity imports +# Compatibility imports from pyspark.sql import ( # noqa: F401 SQLContext as SQLContext, HiveContext as HiveContext, From fae65af2f99c92cbb1d703819012a0ce970d4c42 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:16 -0500 Subject: [PATCH 017/103] spelling: compilation Signed-off-by: Josh Soref --- project/SparkBuild.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 05413b7091ad9..f0eec5a047e90 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -327,7 +327,7 @@ object SparkBuild extends PomBuild { // to be enabled in specific ones that have previous artifacts MimaKeys.mimaFailOnNoPrevious := false, - // To prevent intermittent compliation failures, see also SPARK-33297 + // To prevent intermittent compilation failures, see also SPARK-33297 // Apparently we can remove this when we use JDK 11. Test / classLoaderLayeringStrategy := ClassLoaderLayeringStrategy.Flat ) From 5b1406f858d142a332a1b848c87dc90eec28a43f Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:16 -0500 Subject: [PATCH 018/103] spelling: component Signed-off-by: Josh Soref --- python/docs/source/getting_started/install.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/docs/source/getting_started/install.rst b/python/docs/source/getting_started/install.rst index 9c9ff7fa7844b..d78716d9fe575 100644 --- a/python/docs/source/getting_started/install.rst +++ b/python/docs/source/getting_started/install.rst @@ -42,7 +42,7 @@ PySpark installation using `PyPI `_ is as fol pip install pyspark -If you want to install extra dependencies for a specific componenet, you can install it as below: +If you want to install extra dependencies for a specific component, you can install it as below: .. code-block:: bash From afbe1e674aad3202e3fab80b3474f2c9aeb6a28b Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:16 -0500 Subject: [PATCH 019/103] spelling: components Signed-off-by: Josh Soref --- python/docs/source/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/docs/source/index.rst b/python/docs/source/index.rst index 4286f616374c5..6a631052a642d 100644 --- a/python/docs/source/index.rst +++ b/python/docs/source/index.rst @@ -30,7 +30,7 @@ of Spark's features such as Spark SQL, DataFrame, Streaming, MLlib (Machine Learning) and Spark Core. .. image:: ../../../docs/img/pyspark-components.png - :alt: PySpark Compoenents + :alt: PySpark Components **Spark SQL and DataFrame** From 779cc9fd08fe1a54823de6a1e8ee4e955505ce54 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:16 -0500 Subject: [PATCH 020/103] spelling: compress Signed-off-by: Josh Soref --- project/MimaExcludes.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 98769d951b6ac..c772ec2e068a5 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -1729,7 +1729,7 @@ object MimaExcludes { ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.ml.regression.RandomForestRegressionModel.numTrees"), ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.ml.regression.RandomForestRegressionModel.setFeatureSubsetStrategy") ) ++ Seq( - // [SPARK-21680][ML][MLLIB]optimzie Vector coompress + // [SPARK-21680][ML][MLLIB]optimzie Vector compress ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.mllib.linalg.Vector.toSparseWithSize"), ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Vector.toSparseWithSize") ) ++ Seq( From 072ee1b255adb03d5b600c9f68def0cdabee70da Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:16 -0500 Subject: [PATCH 021/103] spelling: concatenates Signed-off-by: Josh Soref --- R/pkg/R/pairRDD.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/R/pairRDD.R b/R/pkg/R/pairRDD.R index 8af0c0cf0f421..41676be03e951 100644 --- a/R/pkg/R/pairRDD.R +++ b/R/pkg/R/pairRDD.R @@ -411,7 +411,7 @@ setMethod("reduceByKeyLocally", #' \itemize{ #' \item createCombiner, which turns a V into a C (e.g., creates a one-element list) #' \item mergeValue, to merge a V into a C (e.g., adds it to the end of a list) - -#' \item mergeCombiners, to combine two C's into a single one (e.g., concatentates +#' \item mergeCombiners, to combine two C's into a single one (e.g., concatenates #' two lists). #' } #' From 6882dcaa61d17d92c8d621b86e74fe7a88202c0e Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:16 -0500 Subject: [PATCH 022/103] spelling: concatenating Signed-off-by: Josh Soref --- R/pkg/R/DataFrame.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 2ce53782d9af0..da9267831be79 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2772,7 +2772,7 @@ setMethod("merge", #' Creates a list of columns by replacing the intersected ones with aliases #' #' Creates a list of columns by replacing the intersected ones with aliases. -#' The name of the alias column is formed by concatanating the original column name and a suffix. +#' The name of the alias column is formed by concatenating the original column name and a suffix. #' #' @param x a SparkDataFrame #' @param intersectedColNames a list of intersected column names of the SparkDataFrame From 1382dce99e33f1c0fd8bff91782e6a062b6f4036 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:16 -0500 Subject: [PATCH 023/103] spelling: confidence Signed-off-by: Josh Soref --- R/pkg/R/mllib_fpm.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/R/mllib_fpm.R b/R/pkg/R/mllib_fpm.R index 30bc51b932041..65a43514930f0 100644 --- a/R/pkg/R/mllib_fpm.R +++ b/R/pkg/R/mllib_fpm.R @@ -125,7 +125,7 @@ setMethod("spark.freqItemsets", signature(object = "FPGrowthModel"), #' The \code{SparkDataFrame} contains five columns: #' \code{antecedent} (an array of the same type as the input column), #' \code{consequent} (an array of the same type as the input column), -#' \code{condfidence} (confidence for the rule) +#' \code{confidence} (confidence for the rule) #' \code{lift} (lift for the rule) #' and \code{support} (support for the rule) #' @rdname spark.fpGrowth From ed8d7aa4b10e65c10f0a9057f383c764be1f512c Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:16 -0500 Subject: [PATCH 024/103] spelling: configurations Signed-off-by: Josh Soref --- R/pkg/tests/fulltests/test_Serde.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/tests/fulltests/test_Serde.R b/R/pkg/tests/fulltests/test_Serde.R index e01f6ee005218..a52289e43ca5e 100644 --- a/R/pkg/tests/fulltests/test_Serde.R +++ b/R/pkg/tests/fulltests/test_Serde.R @@ -125,7 +125,7 @@ test_that("SerDe of list of lists", { sparkR.session.stop() -# Note that this test should be at the end of tests since the configruations used here are not +# Note that this test should be at the end of tests since the configurations used here are not # specific to sessions, and the Spark context is restarted. test_that("createDataFrame large objects", { for (encryptionEnabled in list("true", "false")) { From 5c4c91da74999ab1571a9d412b44ba75658578c6 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:16 -0500 Subject: [PATCH 025/103] spelling: conjunction Signed-off-by: Josh Soref --- python/pyspark/resource/requests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/resource/requests.py b/python/pyspark/resource/requests.py index 74d26d04312c4..4deb22b5948f0 100644 --- a/python/pyspark/resource/requests.py +++ b/python/pyspark/resource/requests.py @@ -189,7 +189,7 @@ def requests(self): class TaskResourceRequest(object): """ - A task resource request. This is used in conjuntion with the + A task resource request. This is used in conjunction with the :class:`pyspark.resource.ResourceProfile` to programmatically specify the resources needed for an RDD that will be applied at the stage level. The amount is specified as a Double to allow for saying you want more than 1 task per resource. Valid values @@ -226,7 +226,7 @@ def amount(self): class TaskResourceRequests(object): """ - A set of task resource requests. This is used in conjuntion with the + A set of task resource requests. This is used in conjunction with the :class:`pyspark.resource.ResourceProfileBuilder` to programmatically specify the resources needed for an RDD that will be applied at the stage level. From b0774f371647ff03ac77ba8dfd20424745671acc Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:16 -0500 Subject: [PATCH 026/103] spelling: connection Signed-off-by: Josh Soref --- R/pkg/inst/worker/daemon.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/pkg/inst/worker/daemon.R b/R/pkg/inst/worker/daemon.R index fb9db63b07cd0..4589bb9c6ad1b 100644 --- a/R/pkg/inst/worker/daemon.R +++ b/R/pkg/inst/worker/daemon.R @@ -32,7 +32,7 @@ inputCon <- socketConnection( SparkR:::doServerAuth(inputCon, Sys.getenv("SPARKR_WORKER_SECRET")) -# Waits indefinitely for a socket connecion by default. +# Waits indefinitely for a socket connection by default. selectTimeout <- NULL while (TRUE) { @@ -72,7 +72,7 @@ while (TRUE) { } }) } else if (is.null(children)) { - # If it is NULL, there are no children. Waits indefinitely for a socket connecion. + # If it is NULL, there are no children. Waits indefinitely for a socket connection. selectTimeout <- NULL } From d5025b5986ba4b28848e8c58732399e2441a33fc Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:16 -0500 Subject: [PATCH 027/103] spelling: contains Signed-off-by: Josh Soref --- .../spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala index b5a360167679e..1091de9967ece 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala @@ -356,7 +356,7 @@ trait MesosSchedulerUtils extends Logging { * https://github.com/apache/mesos/blob/master/src/common/values.cpp * https://github.com/apache/mesos/blob/master/src/common/attributes.cpp * - * @param constraintsVal constains string consisting of ';' separated key-value pairs (separated + * @param constraintsVal contains string consisting of ';' separated key-value pairs (separated * by ':') * @return Map of constraints to match resources offers. */ From 73a7da515a7a45dae3e032593c341417108f5679 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:16 -0500 Subject: [PATCH 028/103] spelling: converting Signed-off-by: Josh Soref --- R/pkg/R/types.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/R/types.R b/R/pkg/R/types.R index 5d48a9eee2799..dfa83c35665ce 100644 --- a/R/pkg/R/types.R +++ b/R/pkg/R/types.R @@ -68,7 +68,7 @@ rToSQLTypes <- as.environment(list( "character" = "string", "logical" = "boolean")) -# Helper function of coverting decimal type. When backend returns column type in the +# Helper function of converting decimal type. When backend returns column type in the # format of decimal(,) (e.g., decimal(10, 0)), this function coverts the column type # as double type. This function converts backend returned types that are not the key # of PRIMITIVE_TYPES, but should be treated as PRIMITIVE_TYPES. From a84552e35059f9458ca49ebe06b4e8c7b0ba63cd Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:16 -0500 Subject: [PATCH 029/103] spelling: corresponding Signed-off-by: Josh Soref --- python/pyspark/cloudpickle/cloudpickle.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/cloudpickle/cloudpickle.py b/python/pyspark/cloudpickle/cloudpickle.py index dffa9362fabce..f50bd1bc1347b 100644 --- a/python/pyspark/cloudpickle/cloudpickle.py +++ b/python/pyspark/cloudpickle/cloudpickle.py @@ -236,7 +236,7 @@ def _extract_code_globals(co): out_names = {names[oparg] for _, oparg in _walk_global_ops(co)} # Declaring a function inside another one using the "def ..." - # syntax generates a constant code object corresonding to the one + # syntax generates a constant code object corresponding to the one # of the nested function's As the nested function may itself need # global variables, we need to introspect its code, extract its # globals, (look for code object in it's co_consts attribute..) and From 5595a09dad17dccd4586afb1e16dcf13b65bd818 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:16 -0500 Subject: [PATCH 030/103] spelling: crypto Signed-off-by: Josh Soref --- .../main/java/org/apache/spark/network/crypto/AuthEngine.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthEngine.java b/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthEngine.java index 64fdb32a67ada..c2b2edc7f07d5 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthEngine.java +++ b/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthEngine.java @@ -287,7 +287,7 @@ private byte[] doCipherOp(int mode, byte[] in, boolean isFinal) } } } catch (InternalError ie) { - // SPARK-25535. The commons-cryto library will throw InternalError if something goes wrong, + // SPARK-25535. The commons-crypto library will throw InternalError if something goes wrong, // and leave bad state behind in the Java wrappers, so it's not safe to use them afterwards. if (mode == Cipher.ENCRYPT_MODE) { this.encryptor = null; From 651fe9c24cf0a92be6dc1d0b722a6b63d06f8e19 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:16 -0500 Subject: [PATCH 031/103] spelling: datasource Signed-off-by: Josh Soref --- python/pyspark/ml/tests/test_image.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/ml/tests/test_image.py b/python/pyspark/ml/tests/test_image.py index ceecdae971c99..1001598779d48 100644 --- a/python/pyspark/ml/tests/test_image.py +++ b/python/pyspark/ml/tests/test_image.py @@ -33,7 +33,7 @@ def test_read_images(self): self.assertEqual(df.count(), 4) first_row = df.take(1)[0][0] # compare `schema.simpleString()` instead of directly compare schema, - # because the df loaded from datasouce may change schema column nullability. + # because the df loaded from datasource may change schema column nullability. self.assertEqual(df.schema.simpleString(), ImageSchema.imageSchema.simpleString()) self.assertEqual(df.schema["image"].dataType.simpleString(), ImageSchema.columnSchema.simpleString()) From 31b8e1433cd18ab74a61ef5aaaea406bcbe0ceea Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:16 -0500 Subject: [PATCH 032/103] spelling: dependencies Signed-off-by: Josh Soref --- python/test_support/userlibrary.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/test_support/userlibrary.py b/python/test_support/userlibrary.py index 73fd26e71f10d..90cd30723ddfe 100755 --- a/python/test_support/userlibrary.py +++ b/python/test_support/userlibrary.py @@ -16,7 +16,7 @@ # """ -Used to test shipping of code depenencies with SparkContext.addPyFile(). +Used to test shipping of code dependencies with SparkContext.addPyFile(). """ From 3d04958ca8167f98c4eb59f4c46f0a72bf94ee33 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:16 -0500 Subject: [PATCH 033/103] spelling: described Signed-off-by: Josh Soref --- R/pkg/R/WindowSpec.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/R/WindowSpec.R b/R/pkg/R/WindowSpec.R index 5268a13cbf46b..be47d0117ed7f 100644 --- a/R/pkg/R/WindowSpec.R +++ b/R/pkg/R/WindowSpec.R @@ -54,7 +54,7 @@ setMethod("show", "WindowSpec", #' Defines the partitioning columns in a WindowSpec. #' #' @param x a WindowSpec. -#' @param col a column to partition on (desribed by the name or Column). +#' @param col a column to partition on (described by the name or Column). #' @param ... additional column(s) to partition on. #' @return A WindowSpec. #' @rdname partitionBy From 57114f15a38efe5be33c050a4ac780c2860350fd Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:17 -0500 Subject: [PATCH 034/103] spelling: directory Signed-off-by: Josh Soref --- R/install-dev.bat | 2 +- python/docs/source/getting_started/install.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/install-dev.bat b/R/install-dev.bat index c570d93049a14..ae5aa589a19d1 100644 --- a/R/install-dev.bat +++ b/R/install-dev.bat @@ -26,7 +26,7 @@ MKDIR %SPARK_HOME%\R\lib rem When you pass the package path directly as an argument to R CMD INSTALL, rem it takes the path as 'C:\projects\spark\R\..\R\pkg"' as an example at -rem R 4.0. To work around this, directly go to the directoy and install it. +rem R 4.0. To work around this, directly go to the directory and install it. rem See also SPARK-32074 pushd %SPARK_HOME%\R\pkg\ R.exe CMD INSTALL --library="%SPARK_HOME%\R\lib" . diff --git a/python/docs/source/getting_started/install.rst b/python/docs/source/getting_started/install.rst index d78716d9fe575..2e56ddb3c3960 100644 --- a/python/docs/source/getting_started/install.rst +++ b/python/docs/source/getting_started/install.rst @@ -126,7 +126,7 @@ Manually Downloading -------------------- PySpark is included in the distributions available at the `Apache Spark website `_. -You can download a distribution you want from the site. After that, uncompress the tar file into the directoy where you want +You can download a distribution you want from the site. After that, uncompress the tar file into the directory where you want to install Spark, for example, as below: .. code-block:: bash From 379316d4d2706450d16475783c1c648987d19e19 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:17 -0500 Subject: [PATCH 035/103] spelling: dispatch Signed-off-by: Josh Soref --- python/pyspark/cloudpickle/cloudpickle_fast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/cloudpickle/cloudpickle_fast.py b/python/pyspark/cloudpickle/cloudpickle_fast.py index 1c683610a4c73..f4c888fdd1317 100644 --- a/python/pyspark/cloudpickle/cloudpickle_fast.py +++ b/python/pyspark/cloudpickle/cloudpickle_fast.py @@ -630,7 +630,7 @@ def reducer_override(self, obj): return self._function_reduce(obj) else: # fallback to save_global, including the Pickler's - # distpatch_table + # dispatch_table return NotImplemented else: From bd025158da7a8b2ef0f2ce7bfb796686a4af356b Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:17 -0500 Subject: [PATCH 036/103] spelling: do not Signed-off-by: Josh Soref --- python/pyspark/worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py index 1b09d327a5dfe..8ca4bb37e5fa4 100644 --- a/python/pyspark/worker.py +++ b/python/pyspark/worker.py @@ -59,7 +59,7 @@ def report_times(outfile, boot, init, finish): def add_path(path): - # worker can be used, so donot add path multiple times + # worker can be used, so do not add path multiple times if path not in sys.path: # overwrite system packages sys.path.insert(1, path) From 68b7756c5a065a40cf65c1fbd9ace532838f28cf Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 22:40:51 -0500 Subject: [PATCH 037/103] spelling: does not Signed-off-by: Josh Soref --- python/pyspark/ml/tests/test_algorithms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/ml/tests/test_algorithms.py b/python/pyspark/ml/tests/test_algorithms.py index f8b61b7c57919..50475210607c8 100644 --- a/python/pyspark/ml/tests/test_algorithms.py +++ b/python/pyspark/ml/tests/test_algorithms.py @@ -116,7 +116,7 @@ def test_output_columns(self): output = model.transform(df) self.assertEqual(output.columns, ["label", "features", "rawPrediction", "prediction"]) - def test_parallelism_doesnt_change_output(self): + def test_parallelism_does_not_change_output(self): df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)), (1.0, Vectors.sparse(2, [], [])), (2.0, Vectors.dense(0.5, 0.5))], From 62c3f3a8ef1a5db1f504d06f2c9f8b83bf16001d Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:17 -0500 Subject: [PATCH 038/103] spelling: don't Signed-off-by: Josh Soref --- python/pyspark/cloudpickle/cloudpickle_fast.py | 2 +- .../main/scala/org/apache/spark/streaming/util/HdfsUtils.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/cloudpickle/cloudpickle_fast.py b/python/pyspark/cloudpickle/cloudpickle_fast.py index f4c888fdd1317..f632fcaf618b9 100644 --- a/python/pyspark/cloudpickle/cloudpickle_fast.py +++ b/python/pyspark/cloudpickle/cloudpickle_fast.py @@ -179,7 +179,7 @@ def _class_getstate(obj): clsdict.pop('__weakref__', None) if issubclass(type(obj), abc.ABCMeta): - # If obj is an instance of an ABCMeta subclass, dont pickle the + # If obj is an instance of an ABCMeta subclass, don't pickle the # cache/negative caches populated during isinstance/issubclass # checks, but pickle the list of registered subclasses of obj. clsdict.pop('_abc_cache', None) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/util/HdfsUtils.scala b/streaming/src/main/scala/org/apache/spark/streaming/util/HdfsUtils.scala index 006bcad5d68c2..ef040681adf37 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/util/HdfsUtils.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/util/HdfsUtils.scala @@ -39,7 +39,7 @@ private[streaming] object HdfsUtils { throw new IllegalStateException("File exists and there is no append support!") } } else { - // we dont' want to use hdfs erasure coding, as that lacks support for append and hflush + // we don't want to use hdfs erasure coding, as that lacks support for append and hflush SparkHadoopUtil.createFile(dfs, dfsPath, false) } } From 335daf3f5700f720670c2d9d675bd86190852051 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:17 -0500 Subject: [PATCH 039/103] spelling: dynamic Signed-off-by: Josh Soref --- python/pyspark/cloudpickle/cloudpickle_fast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/cloudpickle/cloudpickle_fast.py b/python/pyspark/cloudpickle/cloudpickle_fast.py index f632fcaf618b9..db3e115b0ca88 100644 --- a/python/pyspark/cloudpickle/cloudpickle_fast.py +++ b/python/pyspark/cloudpickle/cloudpickle_fast.py @@ -407,7 +407,7 @@ def _class_reduce(obj): def _function_setstate(obj, state): - """Update the state of a dynaamic function. + """Update the state of a dynamic function. As __closure__ and __globals__ are readonly attributes of a function, we cannot rely on the native setstate routine of pickle.load_build, that calls From c9f83c25a844b268dc3b4e8c19043e0bdf5d30b0 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:17 -0500 Subject: [PATCH 040/103] spelling: e.g. Signed-off-by: Josh Soref --- R/CRAN_RELEASE.md | 2 +- R/pkg/R/DataFrame.R | 2 +- R/pkg/R/install.R | 2 +- R/pkg/R/mllib_tree.R | 4 ++-- R/pkg/R/utils.R | 2 +- R/pkg/vignettes/sparkr-vignettes.Rmd | 2 +- python/pyspark/sql/dataframe.py | 2 +- .../src/main/scala/org/apache/spark/deploy/mesos/config.scala | 2 +- .../apache/spark/network/yarn/YarnShuffleServiceSuite.scala | 2 +- 9 files changed, 10 insertions(+), 10 deletions(-) diff --git a/R/CRAN_RELEASE.md b/R/CRAN_RELEASE.md index 4d9b6416c01cb..2f410cf8bfd94 100644 --- a/R/CRAN_RELEASE.md +++ b/R/CRAN_RELEASE.md @@ -25,7 +25,7 @@ To release SparkR as a package to CRAN, we would use the `devtools` package. Ple First, check that the `Version:` field in the `pkg/DESCRIPTION` file is updated. Also, check for stale files not under source control. -Note that while `run-tests.sh` runs `check-cran.sh` (which runs `R CMD check`), it is doing so with `--no-manual --no-vignettes`, which skips a few vignettes or PDF checks - therefore it will be preferred to run `R CMD check` on the source package built manually before uploading a release. Also note that for CRAN checks for pdf vignettes to success, `qpdf` tool must be there (to install it, eg. `yum -q -y install qpdf`). +Note that while `run-tests.sh` runs `check-cran.sh` (which runs `R CMD check`), it is doing so with `--no-manual --no-vignettes`, which skips a few vignettes or PDF checks - therefore it will be preferred to run `R CMD check` on the source package built manually before uploading a release. Also note that for CRAN checks for pdf vignettes to success, `qpdf` tool must be there (to install it, e.g. `yum -q -y install qpdf`). To upload a release, we would need to update the `cran-comments.md`. This should generally contain the results from running the `check-cran.sh` script along with comments on status of all `WARNING` (should not be any) or `NOTE`. As a part of `check-cran.sh` and the release process, the vignettes is build - make sure `SPARK_HOME` is set and Spark jars are accessible. diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index da9267831be79..70aeaa55646f7 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -3231,7 +3231,7 @@ setMethod("describe", #' \item stddev #' \item min #' \item max -#' \item arbitrary approximate percentiles specified as a percentage (eg, "75\%") +#' \item arbitrary approximate percentiles specified as a percentage (e.g., "75\%") #' } #' If no statistics are given, this function computes count, mean, stddev, min, #' approximate quartiles (percentiles at 25\%, 50\%, and 75\%), and max. diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index ea2c0b4c0f42f..5bc5ae07c5f03 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -289,7 +289,7 @@ sparkCachePath <- function() { } # Length of the Spark cache specific relative path segments for each platform -# eg. "Apache\Spark\Cache" is 3 in Windows, or "spark" is 1 in unix +# e.g. "Apache\Spark\Cache" is 3 in Windows, or "spark" is 1 in unix # Must match sparkCachePath() exactly. sparkCacheRelPathLength <- function() { if (is_windows()) { diff --git a/R/pkg/R/mllib_tree.R b/R/pkg/R/mllib_tree.R index f6aa48f5fa04a..b5a014b0a3cfd 100644 --- a/R/pkg/R/mllib_tree.R +++ b/R/pkg/R/mllib_tree.R @@ -53,7 +53,7 @@ setClass("DecisionTreeRegressionModel", representation(jobj = "jobj")) #' @note DecisionTreeClassificationModel since 2.3.0 setClass("DecisionTreeClassificationModel", representation(jobj = "jobj")) -# Create the summary of a tree ensemble model (eg. Random Forest, GBT) +# Create the summary of a tree ensemble model (e.g. Random Forest, GBT) summary.treeEnsemble <- function(model) { jobj <- model@jobj formula <- callJMethod(jobj, "formula") @@ -73,7 +73,7 @@ summary.treeEnsemble <- function(model) { jobj = jobj) } -# Prints the summary of tree ensemble models (eg. Random Forest, GBT) +# Prints the summary of tree ensemble models (e.g. Random Forest, GBT) print.summary.treeEnsemble <- function(x) { jobj <- x$jobj cat("Formula: ", x$formula) diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R index d6f9f927d5cdc..264cbfc9ba929 100644 --- a/R/pkg/R/utils.R +++ b/R/pkg/R/utils.R @@ -930,7 +930,7 @@ getOne <- function(x, envir, inherits = TRUE, ifnotfound = NULL) { } # Returns a vector of parent directories, traversing up count times, starting with a full path -# eg. traverseParentDirs("/Users/user/Library/Caches/spark/spark2.2", 1) should return +# e.g. traverseParentDirs("/Users/user/Library/Caches/spark/spark2.2", 1) should return # this "/Users/user/Library/Caches/spark/spark2.2" # and "/Users/user/Library/Caches/spark" traverseParentDirs <- function(x, count) { diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd index 3713e6c784855..a0608748696a3 100644 --- a/R/pkg/vignettes/sparkr-vignettes.Rmd +++ b/R/pkg/vignettes/sparkr-vignettes.Rmd @@ -146,7 +146,7 @@ sparkR.session.stop() Different from many other R packages, to use SparkR, you need an additional installation of Apache Spark. The Spark installation will be used to run a backend process that will compile and execute SparkR programs. -After installing the SparkR package, you can call `sparkR.session` as explained in the previous section to start and it will check for the Spark installation. If you are working with SparkR from an interactive shell (eg. R, RStudio) then Spark is downloaded and cached automatically if it is not found. Alternatively, we provide an easy-to-use function `install.spark` for running this manually. If you don't have Spark installed on the computer, you may download it from [Apache Spark Website](https://spark.apache.org/downloads.html). +After installing the SparkR package, you can call `sparkR.session` as explained in the previous section to start and it will check for the Spark installation. If you are working with SparkR from an interactive shell (e.g. R, RStudio) then Spark is downloaded and cached automatically if it is not found. Alternatively, we provide an easy-to-use function `install.spark` for running this manually. If you don't have Spark installed on the computer, you may download it from [Apache Spark Website](https://spark.apache.org/downloads.html). ```{r, eval=FALSE} install.spark() diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 9fae27a2d9c6c..fe7d26d1bcfd2 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -1497,7 +1497,7 @@ def summary(self, *statistics): - stddev - min - max - - arbitrary approximate percentiles specified as a percentage (eg, 75%) + - arbitrary approximate percentiles specified as a percentage (e.g., 75%) If no statistics are given, this function computes count, mean, stddev, min, approximate quartiles (percentiles at 25%, 50%, and 75%), and max. diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/config.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/config.scala index bd42f6f05655f..5927af176062d 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/config.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/config.scala @@ -230,7 +230,7 @@ package object config { ConfigBuilder("spark.mesos.appJar.local.resolution.mode") .doc("Provides support for the `local:///` scheme to reference the app jar resource in " + "cluster mode. If user uses a local resource (`local:///path/to/jar`) and the config " + - "option is not used it defaults to `host` eg. the mesos fetcher tries to get the " + + "option is not used it defaults to `host` e.g. the mesos fetcher tries to get the " + "resource from the host's file system. If the value is unknown it prints a warning msg " + "in the dispatcher logs and defaults to `host`. If the value is `container` then spark " + "submit in the container will use the jar in the container's path: `/path/to/jar`.") diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/network/yarn/YarnShuffleServiceSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/network/yarn/YarnShuffleServiceSuite.scala index c2bdd971a0fe9..188a48509212d 100644 --- a/resource-managers/yarn/src/test/scala/org/apache/spark/network/yarn/YarnShuffleServiceSuite.scala +++ b/resource-managers/yarn/src/test/scala/org/apache/spark/network/yarn/YarnShuffleServiceSuite.scala @@ -250,7 +250,7 @@ class YarnShuffleServiceSuite extends SparkFunSuite with Matchers with BeforeAnd ShuffleTestAccessor.getExecutorInfo(app2Id, "exec-2", resolver2) should be (Some(shuffleInfo2)) s2.stop() - // another stop & restart should be fine though (eg., we recover from previous corruption) + // another stop & restart should be fine though (e.g., we recover from previous corruption) s3 = new YarnShuffleService s3.setRecoveryPath(new Path(recoveryLocalDir.toURI)) s3.init(yarnConfig) From dcc899dd1fd9adb25769c7420147fe6a0c363339 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:17 -0500 Subject: [PATCH 041/103] spelling: eagerly Signed-off-by: Josh Soref --- python/pyspark/context.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/context.py b/python/pyspark/context.py index 9c9e3f4b3c881..3c4c4f731fd2b 100644 --- a/python/pyspark/context.py +++ b/python/pyspark/context.py @@ -601,7 +601,7 @@ def _serialize_to_jvm(self, data, serializer, reader_func, createRDDServer): tempFile.close() return reader_func(tempFile.name) finally: - # we eagerily reads the file so we can delete right after. + # we eagerly reads the file so we can delete right after. os.unlink(tempFile.name) def pickleFile(self, name, minPartitions=None): From 23ca389165b18359d873aeb04b93e53ac23668ad Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:17 -0500 Subject: [PATCH 042/103] spelling: environment Signed-off-by: Josh Soref --- python/docs/source/getting_started/install.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/docs/source/getting_started/install.rst b/python/docs/source/getting_started/install.rst index 2e56ddb3c3960..a90f5fe159553 100644 --- a/python/docs/source/getting_started/install.rst +++ b/python/docs/source/getting_started/install.rst @@ -105,7 +105,7 @@ Now activate the newly created environment with the following command: conda activate pyspark_env You can install pyspark by `Using PyPI <#using-pypi>`_ to install PySpark in the newly created -environment, for example as below. It will install PySpark under the new virtual environemnt +environment, for example as below. It will install PySpark under the new virtual environment ``pyspark_env`` created above. .. code-block:: bash From c49101b01783d41e99e6edd573265c4f7c831b9f Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:17 -0500 Subject: [PATCH 043/103] spelling: exclusion Signed-off-by: Josh Soref --- .../scala/org/apache/spark/streaming/dstream/DStream.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala index e037f26088347..ca4f3670d5ad7 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala @@ -960,7 +960,7 @@ object DStream { /** Get the creation site of a DStream from the stack trace of when the DStream is created. */ private[streaming] def getCreationSite(): CallSite = { /** Filtering function that excludes non-user classes for a streaming application */ - def streamingExclustionFunction(className: String): Boolean = { + def streamingExclusionFunction(className: String): Boolean = { def doesMatch(r: Regex): Boolean = r.findFirstIn(className).isDefined val isSparkClass = doesMatch(SPARK_CLASS_REGEX) val isSparkExampleClass = doesMatch(SPARK_EXAMPLES_CLASS_REGEX) @@ -972,6 +972,6 @@ object DStream { // non-Spark and non-Scala class, as the rest would streaming application classes. (isSparkClass || isScalaClass) && !isSparkExampleClass && !isSparkStreamingTestClass } - org.apache.spark.util.Utils.getCallSite(streamingExclustionFunction) + org.apache.spark.util.Utils.getCallSite(streamingExclusionFunction) } } From 0c4228e034bc30b97cf556bff8e3d74dca9a31ef Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:18 -0500 Subject: [PATCH 044/103] spelling: external Signed-off-by: Josh Soref --- python/pyspark/shuffle.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/shuffle.py b/python/pyspark/shuffle.py index 89be6295f9888..4ba846227188c 100644 --- a/python/pyspark/shuffle.py +++ b/python/pyspark/shuffle.py @@ -418,7 +418,7 @@ def _cleanup(self): class ExternalSorter(object): """ - ExtenalSorter will divide the elements into chunks, sort them in + ExternalSorter will divide the elements into chunks, sort them in memory and dump them into disks, finally merge them back. The spilling will only happen when the used memory goes above From cdec0246d27319818ac8c0e4ccb192c19f0d740d Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:14 -0500 Subject: [PATCH 045/103] spelling: github Signed-off-by: Josh Soref --- dev/appveyor-guide.md | 12 +++++------ dev/create-release/known_translations | 2 +- dev/create-release/releaseutils.py | 6 +++--- dev/create-release/translate-contributors.py | 22 ++++++++++---------- dev/github_jira_sync.py | 8 +++---- dev/run-tests-jenkins.py | 18 ++++++++-------- dev/run-tests.py | 6 +++--- dev/tests/pr_merge_ability.sh | 2 +- dev/tests/pr_public_classes.sh | 2 +- 9 files changed, 39 insertions(+), 39 deletions(-) diff --git a/dev/appveyor-guide.md b/dev/appveyor-guide.md index a8c0c1ef23ac3..c68b5de9e61d0 100644 --- a/dev/appveyor-guide.md +++ b/dev/appveyor-guide.md @@ -33,22 +33,22 @@ Currently, SparkR on Windows is being tested with [AppVeyor](https://ci.appveyor 2016-09-04 11 07 58 -- Click "Github". +- Click "GitHub". 2016-09-04 11 08 10 -#### After signing up, go to profile to link Github and AppVeyor. +#### After signing up, go to profile to link GitHub and AppVeyor. - Click your account and then click "Profile". 2016-09-04 11 09 43 -- Enable the link with GitHub via clicking "Link Github account". +- Enable the link with GitHub via clicking "Link GitHub account". 2016-09-04 11 09 52 -- Click "Authorize application" in Github site. +- Click "Authorize application" in GitHub site. 2016-09-04 11 10 05 @@ -63,11 +63,11 @@ Currently, SparkR on Windows is being tested with [AppVeyor](https://ci.appveyor 2016-08-30 12 16 35 -- Since we will use Github here, click the "GITHUB" button and then click "Authorize Github" so that AppVeyor can access the Github logs (e.g. commits). +- Since we will use GitHub here, click the "GITHUB" button and then click "Authorize GitHub" so that AppVeyor can access the GitHub logs (e.g. commits). 2016-09-04 11 10 22 -- Click "Authorize application" from Github (the above step will pop up this page). +- Click "Authorize application" from GitHub (the above step will pop up this page). 2016-09-04 11 10 27 diff --git a/dev/create-release/known_translations b/dev/create-release/known_translations index ff41cccde0140..64bd9ada1bf61 100644 --- a/dev/create-release/known_translations +++ b/dev/create-release/known_translations @@ -1,5 +1,5 @@ # This is a mapping of names to be translated through translate-contributors.py -# The format expected on each line should be: - +# The format expected on each line should be: - 012huang - Weiyi Huang 07ARB - Ankit Raj Boudh 10110346 - Xian Liu diff --git a/dev/create-release/releaseutils.py b/dev/create-release/releaseutils.py index cc7ad931198a2..a0e9695d58361 100755 --- a/dev/create-release/releaseutils.py +++ b/dev/create-release/releaseutils.py @@ -110,7 +110,7 @@ def __str__(self): # Under the hood, this runs a `git log` on that tag and parses the fields # from the command output to construct a list of Commit objects. Note that # because certain fields reside in the commit description and cannot be parsed -# through the Github API itself, we need to do some intelligent regex parsing +# through the GitHub API itself, we need to do some intelligent regex parsing # to extract those fields. # # This is written using Git 1.8.5. @@ -140,7 +140,7 @@ def get_commits(tag): sys.exit("Unexpected format in commit: %s" % commit_digest) [_hash, author, title] = commit_digest.split(field_end_marker) # The PR number and github username is in the commit message - # itself and cannot be accessed through any Github API + # itself and cannot be accessed through any GitHub API pr_number = None match = re.search("Closes #([0-9]+) from ([^/\\s]+)/", commit_body) if match: @@ -252,7 +252,7 @@ def nice_join(str_list): return ", ".join(str_list[:-1]) + ", and " + str_list[-1] -# Return the full name of the specified user on Github +# Return the full name of the specified user on GitHub # If the user doesn't exist, return None def get_github_name(author, github_client): if github_client: diff --git a/dev/create-release/translate-contributors.py b/dev/create-release/translate-contributors.py index 8340266527fc6..be5611ce65a7d 100755 --- a/dev/create-release/translate-contributors.py +++ b/dev/create-release/translate-contributors.py @@ -17,7 +17,7 @@ # This script translates invalid authors in the contributors list generated # by generate-contributors.py. When the script encounters an author name that -# is considered invalid, it searches Github and JIRA in an attempt to search +# is considered invalid, it searches GitHub and JIRA in an attempt to search # for replacements. This tool runs in two modes: # # (1) Interactive mode: For each invalid author name, this script presents @@ -68,7 +68,7 @@ if INTERACTIVE_MODE: print("Running in interactive mode. To disable this, provide the --non-interactive flag.") -# Setup Github and JIRA clients +# Setup GitHub and JIRA clients jira_options = {"server": JIRA_API_BASE} jira_client = JIRA(options=jira_options, basic_auth=(JIRA_USERNAME, JIRA_PASSWORD)) github_client = Github(GITHUB_API_TOKEN) @@ -89,11 +89,11 @@ # Generate candidates for the given author. This should only be called if the given author # name does not represent a full name as this operation is somewhat expensive. Under the -# hood, it makes several calls to the Github and JIRA API servers to find the candidates. +# hood, it makes several calls to the GitHub and JIRA API servers to find the candidates. # # This returns a list of (candidate name, source) 2-tuples. E.g. # [ -# (NOT_FOUND, "No full name found for Github user andrewor14"), +# (NOT_FOUND, "No full name found for GitHub user andrewor14"), # ("Andrew Or", "Full name of JIRA user andrewor14"), # ("Andrew Orso", "Full name of SPARK-1444 assignee andrewor14"), # ("Andrew Ordall", "Full name of SPARK-1663 assignee andrewor14"), @@ -104,12 +104,12 @@ def generate_candidates(author, issues): candidates = [] - # First check for full name of Github user + # First check for full name of GitHub user github_name = get_github_name(author, github_client) if github_name: - candidates.append((github_name, "Full name of Github user %s" % author)) + candidates.append((github_name, "Full name of GitHub user %s" % author)) else: - candidates.append((NOT_FOUND, "No full name found for Github user %s" % author)) + candidates.append((NOT_FOUND, "No full name found for GitHub user %s" % author)) # Then do the same for JIRA user jira_name = get_jira_name(author, jira_client) if jira_name: @@ -151,7 +151,7 @@ def generate_candidates(author, issues): candidates[i] = (candidate, source) return candidates -# Translate each invalid author by searching for possible candidates from Github and JIRA +# Translate each invalid author by searching for possible candidates from GitHub and JIRA # In interactive mode, this script presents the user with a list of choices and have the user # select from this list. Additionally, the user may also choose to enter a custom name. # In non-interactive mode, this script picks the first valid author name from the candidates @@ -180,12 +180,12 @@ def generate_candidates(author, issues): issues = temp_author.split("/")[1:] candidates = generate_candidates(author, issues) # Print out potential replacement candidates along with the sources, e.g. - # [X] No full name found for Github user andrewor14 + # [X] No full name found for GitHub user andrewor14 # [X] No assignee found for SPARK-1763 # [0] Andrew Or - Full name of JIRA user andrewor14 # [1] Andrew Orso - Full name of SPARK-1444 assignee andrewor14 # [2] Andrew Ordall - Full name of SPARK-1663 assignee andrewor14 - # [3] andrewor14 - Raw Github username + # [3] andrewor14 - Raw GitHub username # [4] Custom candidate_names = [] bad_prompts = [] # Prompts that can't actually be selected; print these first. @@ -207,7 +207,7 @@ def generate_candidates(author, issues): print(p) # In interactive mode, additionally provide "custom" option and await user response if INTERACTIVE_MODE: - print(" [%d] %s - Raw Github username" % (raw_index, author)) + print(" [%d] %s - Raw GitHub username" % (raw_index, author)) print(" [%d] Custom" % custom_index) response = raw_input(" Your choice: ") last_index = custom_index diff --git a/dev/github_jira_sync.py b/dev/github_jira_sync.py index 9bcebaa22ab86..7eca03fc63d04 100755 --- a/dev/github_jira_sync.py +++ b/dev/github_jira_sync.py @@ -16,7 +16,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # -# Utility for updating JIRA's with information about Github pull requests +# Utility for updating JIRA's with information about GitHub pull requests import json import os @@ -142,7 +142,7 @@ def reset_pr_labels(pr_num, jira_components): jira_prs = get_jira_prs() previous_max = get_max_pr() -print("Retrieved %s JIRA PR's from Github" % len(jira_prs)) +print("Retrieved %s JIRA PR's from GitHub" % len(jira_prs)) jira_prs = [(k, v) for k, v in jira_prs if int(v['number']) > previous_max] print("%s PR's remain after excluding visted ones" % len(jira_prs)) @@ -157,7 +157,7 @@ def reset_pr_labels(pr_num, jira_components): considered = considered + [pr_num] url = pr['html_url'] - title = "[Github] Pull Request #%s (%s)" % (pr['number'], pr['user']['login']) + title = "[GitHub] Pull Request #%s (%s)" % (pr['number'], pr['user']['login']) try: page = get_json(get_url(JIRA_API_BASE + "/rest/api/2/issue/" + issue + "/remotelink")) existing_links = map(lambda l: l['object']['url'], page) @@ -174,7 +174,7 @@ def reset_pr_labels(pr_num, jira_components): destination = {"title": title, "url": url, "icon": icon} # For all possible fields see: # https://developer.atlassian.com/display/JIRADEV/Fields+in+Remote+Issue+Links - # application = {"name": "Github pull requests", "type": "org.apache.spark.jira.github"} + # application = {"name": "GitHub pull requests", "type": "org.apache.spark.jira.github"} jira_client.add_remote_link(issue, destination) comment = "User '%s' has created a pull request for this issue:" % pr['user']['login'] diff --git a/dev/run-tests-jenkins.py b/dev/run-tests-jenkins.py index 610fb1fd27027..4309a74773e89 100755 --- a/dev/run-tests-jenkins.py +++ b/dev/run-tests-jenkins.py @@ -38,7 +38,7 @@ def print_err(msg): def post_message_to_github(msg, ghprb_pull_id): - print("Attempting to post to Github...") + print("Attempting to post to GitHub...") api_url = os.getenv("GITHUB_API_BASE", "https://api.github.com/repos/apache/spark") url = api_url + "/issues/" + ghprb_pull_id + "/comments" @@ -57,12 +57,12 @@ def post_message_to_github(msg, ghprb_pull_id): if response.getcode() == 201: print(" > Post successful.") except HTTPError as http_e: - print_err("Failed to post message to Github.") + print_err("Failed to post message to GitHub.") print_err(" > http_code: %s" % http_e.code) print_err(" > api_response: %s" % http_e.read()) print_err(" > data: %s" % posted_message) except URLError as url_e: - print_err("Failed to post message to Github.") + print_err("Failed to post message to GitHub.") print_err(" > urllib_status: %s" % url_e.reason[1]) print_err(" > data: %s" % posted_message) @@ -89,7 +89,7 @@ def run_pr_checks(pr_tests, ghprb_actual_commit, sha1): """ Executes a set of pull request checks to ease development and report issues with various components such as style, linting, dependencies, compatibilities, etc. - @return a list of messages to post back to Github + @return a list of messages to post back to GitHub """ # Ensure we save off the current HEAD to revert to current_pr_head = run_cmd(['git', 'rev-parse', 'HEAD'], return_output=True).strip() @@ -109,7 +109,7 @@ def run_tests(tests_timeout): """ Runs the `dev/run-tests` script and responds with the correct error message under the various failure scenarios. - @return a tuple containing the test result code and the result note to post to Github + @return a tuple containing the test result code and the result note to post to GitHub """ test_result_code = subprocess.Popen(['timeout', @@ -198,16 +198,16 @@ def main(): # To write a PR test: # * the file must reside within the dev/tests directory # * be an executable bash script - # * accept three arguments on the command line, the first being the Github PR long commit - # hash, the second the Github SHA1 hash, and the final the current PR hash + # * accept three arguments on the command line, the first being the GitHub PR long commit + # hash, the second the GitHub SHA1 hash, and the final the current PR hash # * and, lastly, return string output to be included in the pr message output that will - # be posted to Github + # be posted to GitHub pr_tests = [ "pr_merge_ability", "pr_public_classes" ] - # `bind_message_base` returns a function to generate messages for Github posting + # `bind_message_base` returns a function to generate messages for GitHub posting github_message = functools.partial(pr_message, build_display_name, build_url, diff --git a/dev/run-tests.py b/dev/run-tests.py index 5bdbc0ffb850c..a1001ec5fd280 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -636,7 +636,7 @@ def main(): # /home/jenkins/anaconda2/envs/py36/bin os.environ["PATH"] = "/home/anaconda/envs/py36/bin:" + os.environ.get("PATH") else: - # else we're running locally or Github Actions. + # else we're running locally or GitHub Actions. build_tool = "sbt" hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop3.2") hive_version = os.environ.get("HIVE_PROFILE", "hive2.3") @@ -654,12 +654,12 @@ def main(): included_tags = [] excluded_tags = [] if should_only_test_modules: - # If we're running the tests in Github Actions, attempt to detect and test + # If we're running the tests in GitHub Actions, attempt to detect and test # only the affected modules. if test_env == "github_actions": if os.environ["GITHUB_INPUT_BRANCH"] != "": # Dispatched request - # Note that it assumes Github Actions has already merged + # Note that it assumes GitHub Actions has already merged # the given `GITHUB_INPUT_BRANCH` branch. changed_files = identify_changed_files_from_git_commits( "HEAD", target_branch=os.environ["GITHUB_SHA"]) diff --git a/dev/tests/pr_merge_ability.sh b/dev/tests/pr_merge_ability.sh index 25fdbccac4dd8..a32667730f76c 100755 --- a/dev/tests/pr_merge_ability.sh +++ b/dev/tests/pr_merge_ability.sh @@ -22,7 +22,7 @@ # another branch and returning results to be published. More details can be # found at dev/run-tests-jenkins. # -# Arg1: The Github Pull Request Actual Commit +# Arg1: The GitHub Pull Request Actual Commit # known as `ghprbActualCommit` in `run-tests-jenkins` # Arg2: The SHA1 hash # known as `sha1` in `run-tests-jenkins` diff --git a/dev/tests/pr_public_classes.sh b/dev/tests/pr_public_classes.sh index 479d1851fe0b8..ad1ad5e736594 100755 --- a/dev/tests/pr_public_classes.sh +++ b/dev/tests/pr_public_classes.sh @@ -22,7 +22,7 @@ # another branch and returning results to be published. More details can be # found at dev/run-tests-jenkins. # -# Arg1: The Github Pull Request Actual Commit +# Arg1: The GitHub Pull Request Actual Commit # known as `ghprbActualCommit` in `run-tests-jenkins` ghprbActualCommit="$1" From 12268ee59ef4b9164f3002a5c3ca8d009f05a47f Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:18 -0500 Subject: [PATCH 046/103] spelling: groupby Signed-off-by: Josh Soref --- python/pyspark/sql/tests/test_pandas_grouped_map.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/sql/tests/test_pandas_grouped_map.py b/python/pyspark/sql/tests/test_pandas_grouped_map.py index ee68b95fc478d..a639a8d51f55c 100644 --- a/python/pyspark/sql/tests/test_pandas_grouped_map.py +++ b/python/pyspark/sql/tests/test_pandas_grouped_map.py @@ -484,7 +484,7 @@ def dummy_pandas_udf(df): col('temp0.key') == col('temp1.key')) self.assertEquals(res.count(), 5) - def test_mixed_scalar_udfs_followed_by_grouby_apply(self): + def test_mixed_scalar_udfs_followed_by_groupby_apply(self): df = self.spark.range(0, 10).toDF('v1') df = df.withColumn('v2', udf(lambda x: x + 1, 'int')(df['v1'])) \ .withColumn('v3', pandas_udf(lambda x: x + 2, 'int')(df['v1'])) From 993d10d51fbaf357c0a830242ad80094b4be802c Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:18 -0500 Subject: [PATCH 047/103] spelling: grouped Signed-off-by: Josh Soref --- R/pkg/R/deserialize.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/R/deserialize.R b/R/pkg/R/deserialize.R index 5d22340fb62a0..89a8fbecd36b0 100644 --- a/R/pkg/R/deserialize.R +++ b/R/pkg/R/deserialize.R @@ -250,7 +250,7 @@ readDeserializeWithKeysInArrow <- function(inputCon) { keys <- readMultipleObjects(inputCon) - # Read keys to map with each groupped batch later. + # Read keys to map with each grouped batch later. list(keys = keys, data = data) } From 457b52cd035a01ec0c381ac3a0eb1765cb986c35 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:18 -0500 Subject: [PATCH 048/103] spelling: grouping Signed-off-by: Josh Soref --- R/pkg/tests/fulltests/test_sparkSQL.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R index 45de1ef1bd3d1..0b8afa3858cb3 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL.R +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -3666,7 +3666,7 @@ test_that("gapply() and gapplyCollect() on a DataFrame", { } # Computes the arithmetic mean of the second column by grouping - # on the first and third columns. Output the groupping value and the average. + # on the first and third columns. Output the grouping value and the average. schema <- structType(structField("a", "integer"), structField("c", "string"), structField("avg", "double")) df3 <- gapply( From ec0fa843179a3667a91461cc68a08f658e74f36d Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:18 -0500 Subject: [PATCH 049/103] spelling: i.e. Signed-off-by: Josh Soref --- R/pkg/R/context.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R index e3c9d9f8793d6..9627a4ed8093f 100644 --- a/R/pkg/R/context.R +++ b/R/pkg/R/context.R @@ -86,7 +86,7 @@ makeSplits <- function(numSerializedSlices, length) { # For instance, for numSerializedSlices of 22, length of 50 # [1] 0 0 2 2 4 4 6 6 6 9 9 11 11 13 13 15 15 15 18 18 20 20 22 22 22 # [26] 25 25 27 27 29 29 31 31 31 34 34 36 36 38 38 40 40 40 43 43 45 45 47 47 47 - # Notice the slice group with 3 slices (ie. 6, 15, 22) are roughly evenly spaced. + # Notice the slice group with 3 slices (i.e. 6, 15, 22) are roughly evenly spaced. # We are trying to reimplement the calculation in the positions method in ParallelCollectionRDD if (numSerializedSlices > 0) { unlist(lapply(0: (numSerializedSlices - 1), function(x) { From a962c938338e4db51834117ccea70aabd2d93bfb Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:18 -0500 Subject: [PATCH 050/103] spelling: impurity Signed-off-by: Josh Soref --- python/pyspark/ml/regression.py | 2 +- python/pyspark/ml/regression.pyi | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 5ce484d964a5a..d37654a7388f5 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -1491,7 +1491,7 @@ def setParams(self, *, featuresCol="features", labelCol="label", predictionCol=" maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1, seed=None, - impuriy="variance", featureSubsetStrategy="all", validationTol=0.01, + impurity="variance", featureSubsetStrategy="all", validationTol=0.01, validationIndicatorCol=None, leafCol="", minWeightFractionPerNode=0.0, weightCol=None): """ diff --git a/python/pyspark/ml/regression.pyi b/python/pyspark/ml/regression.pyi index 5cb0e7a5092f7..a3f4644de2a68 100644 --- a/python/pyspark/ml/regression.pyi +++ b/python/pyspark/ml/regression.pyi @@ -477,7 +477,7 @@ class GBTRegressor( maxIter: int = ..., stepSize: float = ..., seed: Optional[int] = ..., - impuriy: str = ..., + impurity: str = ..., featureSubsetStrategy: str = ..., validationTol: float = ..., validationIndicatorCol: Optional[str] = ..., From 8e276713975b4815591fe59fdac39b17dbd99df0 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:19 -0500 Subject: [PATCH 051/103] spelling: initialized Signed-off-by: Josh Soref --- python/pyspark/mllib/regression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py index 77bca86ac1b27..d5a23a2217bd0 100644 --- a/python/pyspark/mllib/regression.py +++ b/python/pyspark/mllib/regression.py @@ -739,7 +739,7 @@ def _validate(self, dstream): "dstream should be a DStream object, got %s" % type(dstream)) if not self._model: raise ValueError( - "Model must be intialized using setInitialWeights") + "Model must be initialized using setInitialWeights") @since("1.5.0") def predictOn(self, dstream): From c08cf690262c0c3ce9592d2069226107c2396ca6 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:19 -0500 Subject: [PATCH 052/103] spelling: insertion Signed-off-by: Josh Soref --- python/pyspark/rdd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index 1964070040cdf..c3702fc02bf66 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -1253,7 +1253,7 @@ def histogram(self, buckets): and 50 we would have a histogram of 1,0,1. If your histogram is evenly spaced (e.g. [0, 10, 20, 30]), - this can be switched from an O(log n) inseration to O(1) per + this can be switched from an O(log n) insertion to O(1) per element (where n is the number of buckets). Buckets must be sorted, not contain any duplicates, and have From a5e5a9e48f9009d1b1725dfae03649092461b4f0 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:19 -0500 Subject: [PATCH 053/103] spelling: jarray Signed-off-by: Josh Soref --- R/pkg/tests/fulltests/test_jvm_api.R | 6 +++--- python/pyspark/sql/utils.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/R/pkg/tests/fulltests/test_jvm_api.R b/R/pkg/tests/fulltests/test_jvm_api.R index 8b3b4f73de170..3bf6ae556c079 100644 --- a/R/pkg/tests/fulltests/test_jvm_api.R +++ b/R/pkg/tests/fulltests/test_jvm_api.R @@ -20,11 +20,11 @@ context("JVM API") sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE) test_that("Create and call methods on object", { - jarr <- sparkR.newJObject("java.util.ArrayList") + jarray <- sparkR.newJObject("java.util.ArrayList") # Add an element to the array - sparkR.callJMethod(jarr, "add", 1L) + sparkR.callJMethod(jarray, "add", 1L) # Check if get returns the same element - expect_equal(sparkR.callJMethod(jarr, "get", 0L), 1L) + expect_equal(sparkR.callJMethod(jarray, "get", 0L), 1L) }) test_that("Call static methods", { diff --git a/python/pyspark/sql/utils.py b/python/pyspark/sql/utils.py index 18f8ba29f95a2..f5db783d2b5bc 100644 --- a/python/pyspark/sql/utils.py +++ b/python/pyspark/sql/utils.py @@ -151,10 +151,10 @@ def toJArray(gateway, jtype, arr): arr : python type list """ - jarr = gateway.new_array(jtype, len(arr)) + jarray = gateway.new_array(jtype, len(arr)) for i in range(0, len(arr)): - jarr[i] = arr[i] - return jarr + jarray[i] = arr[i] + return jarray def require_test_compiled(): From c87ecebcd8aafc2c00fd8df0cb241355a9ca3452 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:19 -0500 Subject: [PATCH 054/103] spelling: large Signed-off-by: Josh Soref --- python/docs/source/getting_started/quickstart.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/docs/source/getting_started/quickstart.ipynb b/python/docs/source/getting_started/quickstart.ipynb index ab3645591955f..5d2ca331e3afc 100644 --- a/python/docs/source/getting_started/quickstart.ipynb +++ b/python/docs/source/getting_started/quickstart.ipynb @@ -392,7 +392,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "`DataFrame.collect()` collects the distributed data to the driver side as the local data in Python. Note that this can throw an out-of-memory error when the dataset is too larget to fit in the driver side because it collects all the data from executors to the driver side." + "`DataFrame.collect()` collects the distributed data to the driver side as the local data in Python. Note that this can throw an out-of-memory error when the dataset is too large to fit in the driver side because it collects all the data from executors to the driver side." ] }, { From f7059957e9d368380c169ff791af3b7f6421b6e8 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:19 -0500 Subject: [PATCH 055/103] spelling: literal Signed-off-by: Josh Soref --- python/pyspark/cloudpickle/cloudpickle.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/cloudpickle/cloudpickle.py b/python/pyspark/cloudpickle/cloudpickle.py index f50bd1bc1347b..26bad5f40d7b4 100644 --- a/python/pyspark/cloudpickle/cloudpickle.py +++ b/python/pyspark/cloudpickle/cloudpickle.py @@ -457,7 +457,7 @@ def _is_parametrized_type_hint(obj): is_typing = getattr(obj, '__origin__', None) is not None # typing_extensions.Literal - is_litteral = getattr(obj, '__values__', None) is not None + is_literal = getattr(obj, '__values__', None) is not None # typing_extensions.Final is_final = getattr(obj, '__type__', None) is not None @@ -469,7 +469,7 @@ def _is_parametrized_type_hint(obj): getattr(obj, '__result__', None) is not None and getattr(obj, '__args__', None) is not None ) - return any((is_typing, is_litteral, is_final, is_union, is_tuple, + return any((is_typing, is_literal, is_final, is_union, is_tuple, is_callable)) def _create_parametrized_type_hint(origin, args): From 0cb419250fda8bb439614e8829dd029cd7226063 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:19 -0500 Subject: [PATCH 056/103] spelling: managed Signed-off-by: Josh Soref --- .../spark/network/server/OneForOneStreamManagerSuite.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/network-common/src/test/java/org/apache/spark/network/server/OneForOneStreamManagerSuite.java b/common/network-common/src/test/java/org/apache/spark/network/server/OneForOneStreamManagerSuite.java index 45e1836da641f..634b40ed450ee 100644 --- a/common/network-common/src/test/java/org/apache/spark/network/server/OneForOneStreamManagerSuite.java +++ b/common/network-common/src/test/java/org/apache/spark/network/server/OneForOneStreamManagerSuite.java @@ -72,7 +72,7 @@ public void testMissingChunk() { Assert.assertNotNull(getChunk(manager, streamId, 2)); manager.connectionTerminated(dummyChannel); - // loaded buffers are not released yet as in production a MangedBuffer returned by getChunk() + // loaded buffers are not released yet as in production a ManagedBuffer returned by getChunk() // would only be released by Netty after it is written to the network Mockito.verify(buffer1, Mockito.never()).release(); Mockito.verify(buffer2, Mockito.never()).release(); From 05d969040e8cf5f809326ff0a582267e297a58a2 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:19 -0500 Subject: [PATCH 057/103] spelling: millis Signed-off-by: Josh Soref --- .../apache/spark/network/util/TransportFrameDecoderSuite.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/network-common/src/test/java/org/apache/spark/network/util/TransportFrameDecoderSuite.java b/common/network-common/src/test/java/org/apache/spark/network/util/TransportFrameDecoderSuite.java index 4b67aa80351d2..163c52b023822 100644 --- a/common/network-common/src/test/java/org/apache/spark/network/util/TransportFrameDecoderSuite.java +++ b/common/network-common/src/test/java/org/apache/spark/network/util/TransportFrameDecoderSuite.java @@ -98,7 +98,7 @@ public void testConsolidationPerf() throws Exception { writtenBytes += pieceBytes; } logger.info("Writing 300MiB frame buf with consolidation of threshold " + threshold - + " took " + totalTime + " milis"); + + " took " + totalTime + " millis"); } finally { for (ByteBuf buf : retained) { release(buf); From 4034350bc0b4009f770760d76ee111a3aec15800 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:20 -0500 Subject: [PATCH 058/103] spelling: natural Signed-off-by: Josh Soref --- .../java/org/apache/spark/util/kvstore/LevelDBTypeInfo.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBTypeInfo.java b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBTypeInfo.java index d7423537ddfcf..4d7f76f673865 100644 --- a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBTypeInfo.java +++ b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBTypeInfo.java @@ -133,7 +133,7 @@ class LevelDBTypeInfo { // First create the parent indices, then the child indices. ti.indices().forEach(idx -> { - // In LevelDB, there is no parent index for the NUTURAL INDEX. + // In LevelDB, there is no parent index for the NATURAL INDEX. if (idx.parent().isEmpty() || idx.value().equals(KVIndex.NATURAL_INDEX_NAME)) { indices.put(idx.value(), new Index(idx, ti.getAccessor(idx.value()), null)); } From d55899c49bae4b6b10d005e9aa89ad6ef84476d3 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:20 -0500 Subject: [PATCH 059/103] spelling: non deterministic Signed-off-by: Josh Soref --- python/pyspark/sql/functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 79c319d462c88..84fb333a288f4 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -1261,7 +1261,7 @@ def spark_partition_id(): Notes ----- - This is indeterministic because it depends on data partitioning and task scheduling. + This is non deterministic because it depends on data partitioning and task scheduling. Examples -------- From 3099ce0a813846ec8b18510fb77ba6fe9835ee3e Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:20 -0500 Subject: [PATCH 060/103] spelling: not Signed-off-by: Josh Soref --- python/pyspark/sql/tests/test_udf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/sql/tests/test_udf.py b/python/pyspark/sql/tests/test_udf.py index a7dcbfd32ac1c..9a1c0edcce4ed 100644 --- a/python/pyspark/sql/tests/test_udf.py +++ b/python/pyspark/sql/tests/test_udf.py @@ -459,7 +459,7 @@ def test_udf_with_string_return_type(self): self.assertTupleEqual(expected, actual) - def test_udf_shouldnt_accept_noncallable_object(self): + def test_udf_should_not_accept_noncallable_object(self): non_callable = None self.assertRaises(TypeError, UserDefinedFunction, non_callable, StringType()) @@ -683,7 +683,7 @@ def tearDown(self): if SparkContext._active_spark_context is not None: SparkContext._active_spark_context.stop() - def test_udf_init_shouldnt_initialize_context(self): + def test_udf_init_should_not_initialize_context(self): UserDefinedFunction(lambda x: x, StringType()) self.assertIsNone( From 3aedf4e21c5db7e07d7bbebdd8d41be64c35a1fe Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:20 -0500 Subject: [PATCH 061/103] spelling: nullable Signed-off-by: Josh Soref --- .../spark/unsafe/types/UTF8StringPropertyCheckSuite.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/UTF8StringPropertyCheckSuite.scala b/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/UTF8StringPropertyCheckSuite.scala index 69a082053aa65..ab488e18ba3f4 100644 --- a/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/UTF8StringPropertyCheckSuite.scala +++ b/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/UTF8StringPropertyCheckSuite.scala @@ -192,7 +192,7 @@ class UTF8StringPropertyCheckSuite extends AnyFunSuite with ScalaCheckDrivenProp } } - val nullalbeSeq = Gen.listOf(Gen.oneOf[String](null: String, randomString)) + val nullableSeq = Gen.listOf(Gen.oneOf[String](null: String, randomString)) test("concat") { def concat(origin: Seq[String]): String = @@ -201,7 +201,7 @@ class UTF8StringPropertyCheckSuite extends AnyFunSuite with ScalaCheckDrivenProp forAll { (inputs: Seq[String]) => assert(UTF8String.concat(inputs.map(toUTF8): _*) === toUTF8(inputs.mkString)) } - forAll (nullalbeSeq) { (inputs: Seq[String]) => + forAll (nullableSeq) { (inputs: Seq[String]) => assert(UTF8String.concat(inputs.map(toUTF8): _*) === toUTF8(concat(inputs))) } } @@ -216,7 +216,7 @@ class UTF8StringPropertyCheckSuite extends AnyFunSuite with ScalaCheckDrivenProp assert(UTF8String.concatWs(toUTF8(sep), inputs.map(toUTF8): _*) === toUTF8(inputs.mkString(sep))) } - forAll(randomString, nullalbeSeq) {(sep: String, inputs: Seq[String]) => + forAll(randomString, nullableSeq) {(sep: String, inputs: Seq[String]) => assert(UTF8String.concatWs(toUTF8(sep), inputs.map(toUTF8): _*) === toUTF8(concatWs(sep, inputs))) } From 611c9392352b93d218bcf4086cf2b9278b1fc02d Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:20 -0500 Subject: [PATCH 062/103] spelling: numbers Signed-off-by: Josh Soref --- .../src/test/java/org/apache/hadoop/net/ServerSocketUtil.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resource-managers/yarn/src/test/java/org/apache/hadoop/net/ServerSocketUtil.java b/resource-managers/yarn/src/test/java/org/apache/hadoop/net/ServerSocketUtil.java index df0ebcc9871ac..89e012ecd42e1 100644 --- a/resource-managers/yarn/src/test/java/org/apache/hadoop/net/ServerSocketUtil.java +++ b/resource-managers/yarn/src/test/java/org/apache/hadoop/net/ServerSocketUtil.java @@ -112,7 +112,7 @@ public static int waitForPort(int port, int retries) * The ports are all closed afterwards, * so other network services started may grab those same ports. * - * @param numPorts number of required port nubmers + * @param numPorts number of required port numbers * @return array of available port numbers * @throws IOException */ From ee82affb4b612bc47bd634b3b6d54a000b46799a Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:20 -0500 Subject: [PATCH 063/103] spelling: occurred Signed-off-by: Josh Soref --- python/pyspark/java_gateway.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/java_gateway.py b/python/pyspark/java_gateway.py index eafa5d90f9ff8..172b451190aa9 100644 --- a/python/pyspark/java_gateway.py +++ b/python/pyspark/java_gateway.py @@ -208,7 +208,7 @@ def local_connect_and_auth(port, auth_secret): return (sockfile, sock) except socket.error as e: emsg = str(e) - errors.append("tried to connect to %s, but an error occured: %s" % (sa, emsg)) + errors.append("tried to connect to %s, but an error occurred: %s" % (sa, emsg)) sock.close() sock = None raise Exception("could not open socket: %s" % errors) From 721a2f0d328c66ee5d3c909194726a2e38513106 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:20 -0500 Subject: [PATCH 064/103] spelling: optimize Signed-off-by: Josh Soref --- project/MimaExcludes.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index c772ec2e068a5..5a66bfca27a27 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -1729,7 +1729,7 @@ object MimaExcludes { ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.ml.regression.RandomForestRegressionModel.numTrees"), ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.ml.regression.RandomForestRegressionModel.setFeatureSubsetStrategy") ) ++ Seq( - // [SPARK-21680][ML][MLLIB]optimzie Vector compress + // [SPARK-21680][ML][MLLIB]optimize Vector compress ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.mllib.linalg.Vector.toSparseWithSize"), ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Vector.toSparseWithSize") ) ++ Seq( From 257a6e0c9e3c7637470573f3da7ad9a87aee9bd7 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:20 -0500 Subject: [PATCH 065/103] spelling: panel Signed-off-by: Josh Soref --- python/docs/source/_static/css/pyspark.css | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/docs/source/_static/css/pyspark.css b/python/docs/source/_static/css/pyspark.css index 2fd8720e2fa0d..1e493c4c868e6 100644 --- a/python/docs/source/_static/css/pyspark.css +++ b/python/docs/source/_static/css/pyspark.css @@ -51,7 +51,7 @@ h3 { max-width: 80%; } -/* Left pannel size */ +/* Left panel size */ @media (min-width: 768px) { .col-md-3 { flex: 0 0 20%; From 9b86884551e8088679a8f8872f5c298ceb1eb76b Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:20 -0500 Subject: [PATCH 066/103] spelling: parallelism Signed-off-by: Josh Soref --- R/pkg/R/context.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R index 9627a4ed8093f..cca6c2c817de9 100644 --- a/R/pkg/R/context.R +++ b/R/pkg/R/context.R @@ -116,7 +116,7 @@ makeSplits <- function(numSerializedSlices, length) { #' This change affects both createDataFrame and spark.lapply. #' In the specific one case that it is used to convert R native object into SparkDataFrame, it has #' always been kept at the default of 1. In the case the object is large, we are explicitly setting -#' the parallism to numSlices (which is still 1). +#' the parallelism to numSlices (which is still 1). #' #' Specifically, we are changing to split positions to match the calculation in positions() of #' ParallelCollectionRDD in Spark. From 6c12d5284bf9fd7a35c77642441cc101004f472a Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:20 -0500 Subject: [PATCH 067/103] spelling: parallelize Signed-off-by: Josh Soref --- python/pyspark/tests/test_context.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/tests/test_context.py b/python/pyspark/tests/test_context.py index d86f6c3c1571c..8397ef1c4b62d 100644 --- a/python/pyspark/tests/test_context.py +++ b/python/pyspark/tests/test_context.py @@ -175,8 +175,8 @@ def test_parallelize_eager_cleanup(self): with SparkContext() as sc: temp_files = os.listdir(sc._temp_dir) rdd = sc.parallelize([0, 1, 2]) - post_parallalize_temp_files = os.listdir(sc._temp_dir) - self.assertEqual(temp_files, post_parallalize_temp_files) + post_parallelize_temp_files = os.listdir(sc._temp_dir) + self.assertEqual(temp_files, post_parallelize_temp_files) def test_set_conf(self): # This is for an internal use case. When there is an existing SparkContext, From bb7f8ee0cf1780080da1e7a73965945ca88ff95f Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:21 -0500 Subject: [PATCH 068/103] spelling: parameter Signed-off-by: Josh Soref --- python/pyspark/sql/functions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 84fb333a288f4..f2e9a48c39b59 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -4071,7 +4071,7 @@ def _get_lambda_parameters(f): # We should exclude functions that use # variable args and keyword argnames # as well as keyword only args - supported_parmeter_types = { + supported_parameter_types = { inspect.Parameter.POSITIONAL_OR_KEYWORD, inspect.Parameter.POSITIONAL_ONLY, } @@ -4086,7 +4086,7 @@ def _get_lambda_parameters(f): ) # and all arguments can be used as positional - if not all(p.kind in supported_parmeter_types for p in parameters): + if not all(p.kind in supported_parameter_types for p in parameters): raise ValueError( "f should use only POSITIONAL or POSITIONAL OR KEYWORD arguments" ) From ee02a857b58529bc20db9a60da8ffcc525f7dcad Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:21 -0500 Subject: [PATCH 069/103] spelling: partitioner Signed-off-by: Josh Soref --- python/pyspark/rdd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index c3702fc02bf66..34faaacff5eb3 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -2292,7 +2292,7 @@ def groupWith(self, other, *others): """ return python_cogroup((self, other) + others, numPartitions=None) - # TODO: add variant with custom parittioner + # TODO: add variant with custom partitioner def cogroup(self, other, numPartitions=None): """ For each key k in `self` or `other`, return a resulting RDD that From c0713412bb22689a390e210475b04dc9c2be7a18 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:21 -0500 Subject: [PATCH 070/103] spelling: persistent Signed-off-by: Josh Soref --- .../deploy/k8s/features/MountVolumesFeatureStepSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/MountVolumesFeatureStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/MountVolumesFeatureStepSuite.scala index 95ee37e3daa41..38f8fac1858f1 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/MountVolumesFeatureStepSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/MountVolumesFeatureStepSuite.scala @@ -42,7 +42,7 @@ class MountVolumesFeatureStepSuite extends SparkFunSuite { assert(configuredPod.container.getVolumeMounts.get(0).getReadOnly === false) } - test("Mounts pesistentVolumeClaims") { + test("Mounts persistentVolumeClaims") { val volumeConf = KubernetesVolumeSpec( "testVolume", "/tmp", From 6ab889a063fa218d3385b429e4f9aa8816ba72c1 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:21 -0500 Subject: [PATCH 071/103] spelling: position Signed-off-by: Josh Soref --- R/pkg/R/functions.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 039d28a3a37b6..f0ce86a494e01 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -144,7 +144,7 @@ NULL #' @param y Column to compute on. #' @param pos In \itemize{ #' \item \code{locate}: a start position of search. -#' \item \code{overlay}: a start postiton for replacement. +#' \item \code{overlay}: a start position for replacement. #' } #' @param len In \itemize{ #' \item \code{lpad} the maximum length of each output result. From 59e50b16bc0f58dcc61875a5511602b5d849adf8 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:21 -0500 Subject: [PATCH 072/103] spelling: preemption Signed-off-by: Josh Soref --- .../main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala index 57af76b46fe64..ac50c1c77a24e 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala @@ -781,7 +781,7 @@ private[yarn] class YarnAllocator( val (exitCausedByApp, containerExitReason) = exitStatus match { case ContainerExitStatus.SUCCESS => (false, s"Executor for container $containerId exited because of a YARN event (e.g., " + - "pre-emption) and not because of an error in the running job.") + "preemption) and not because of an error in the running job.") case ContainerExitStatus.PREEMPTED => // Preemption is not the fault of the running tasks, since YARN preempts containers // merely to do resource sharing, and tasks that fail due to preempted executors could From a0ef0cadc8a7f358f9d8f41103b491db1cd1a448 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:21 -0500 Subject: [PATCH 073/103] spelling: preferred Signed-off-by: Josh Soref --- R/pkg/R/column.R | 2 +- python/pyspark/sql/column.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index b515907c1cf33..9fa117ccb6281 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -440,7 +440,7 @@ setMethod("withField", #' ) #' #' # However, if you are going to add/replace multiple nested fields, -#' # it is preffered to extract out the nested struct before +#' # it is preferred to extract out the nested struct before #' # adding/replacing multiple fields e.g. #' head( #' withColumn( diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py index 345e81bd2d73e..760805400aca9 100644 --- a/python/pyspark/sql/column.py +++ b/python/pyspark/sql/column.py @@ -425,7 +425,7 @@ def dropFields(self, *fieldNames): +--------------+ However, if you are going to add/replace multiple nested fields, - it is preffered to extract out the nested struct before + it is preferred to extract out the nested struct before adding/replacing multiple fields e.g. >>> df.select(col("a").withField( From 447bc29d44595be29bbd3996fdc25ad30db18201 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:21 -0500 Subject: [PATCH 074/103] spelling: progress Signed-off-by: Josh Soref --- R/pkg/R/streaming.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/R/streaming.R b/R/pkg/R/streaming.R index 5eccbdc9d3818..2bcfb363f9d24 100644 --- a/R/pkg/R/streaming.R +++ b/R/pkg/R/streaming.R @@ -93,7 +93,7 @@ setMethod("explain", #' lastProgress #' -#' Prints the most recent progess update of this streaming query in JSON format. +#' Prints the most recent progress update of this streaming query in JSON format. #' #' @param x a StreamingQuery. #' @rdname lastProgress From 060d36c62a9d25dd192739adfaa35558033f4332 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:21 -0500 Subject: [PATCH 075/103] spelling: pycharm Signed-off-by: Josh Soref --- python/docs/source/development/debugging.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/docs/source/development/debugging.rst b/python/docs/source/development/debugging.rst index bc141a6f44a6f..829919858f67a 100644 --- a/python/docs/source/development/debugging.rst +++ b/python/docs/source/development/debugging.rst @@ -54,7 +54,7 @@ Enter the name of this new configuration, for example, ``MyRemoteDebugger`` and .. image:: ../../../../docs/img/pyspark-remote-debug1.png :alt: PyCharm remote debugger setting -| After that, you should install the corresponding version of the ``pydevd-pycahrm`` package in all the machines which will connect to your PyCharm debugger. In the previous dialog, it shows the command to install. +| After that, you should install the corresponding version of the ``pydevd-pycharm`` package in all the machines which will connect to your PyCharm debugger. In the previous dialog, it shows the command to install. .. code-block:: text From 6b0dd91dcb30e6bd9c6df3509d2dd53aae7f4231 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:21 -0500 Subject: [PATCH 076/103] spelling: randomly Signed-off-by: Josh Soref --- python/pyspark/mllib/tests/test_streaming_algorithms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/mllib/tests/test_streaming_algorithms.py b/python/pyspark/mllib/tests/test_streaming_algorithms.py index b94fb2778d88d..f6c6779e83f13 100644 --- a/python/pyspark/mllib/tests/test_streaming_algorithms.py +++ b/python/pyspark/mllib/tests/test_streaming_algorithms.py @@ -189,7 +189,7 @@ def generateLogisticInput(offset, scale, nPoints, seed): Generate 1 / (1 + exp(-x * scale + offset)) where, - x is randomnly distributed and the threshold + x is randomly distributed and the threshold and labels for each sample in x is obtained from a random uniform distribution. """ From 0c713fca8fdb4798b174a65f95fbc7e3e046c862 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:21 -0500 Subject: [PATCH 077/103] spelling: reconstruct Signed-off-by: Josh Soref --- python/pyspark/cloudpickle/cloudpickle.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/cloudpickle/cloudpickle.py b/python/pyspark/cloudpickle/cloudpickle.py index 26bad5f40d7b4..58c274bd79720 100644 --- a/python/pyspark/cloudpickle/cloudpickle.py +++ b/python/pyspark/cloudpickle/cloudpickle.py @@ -88,7 +88,7 @@ def g(): DEFAULT_PROTOCOL = pickle.HIGHEST_PROTOCOL # Track the provenance of reconstructed dynamic classes to make it possible to -# recontruct instances from the matching singleton class definition when +# reconstruct instances from the matching singleton class definition when # appropriate and preserve the usual "isinstance" semantics of Python objects. _DYNAMIC_CLASS_TRACKER_BY_CLASS = weakref.WeakKeyDictionary() _DYNAMIC_CLASS_TRACKER_BY_ID = weakref.WeakValueDictionary() From cbbd9fbb995e3683da20325bb55e171902657a1e Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:22 -0500 Subject: [PATCH 078/103] spelling: repository Signed-off-by: Josh Soref --- python/docs/source/development/testing.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/docs/source/development/testing.rst b/python/docs/source/development/testing.rst index 08fd730a19f4b..3eab8d04511d6 100644 --- a/python/docs/source/development/testing.rst +++ b/python/docs/source/development/testing.rst @@ -53,5 +53,5 @@ Running tests using GitHub Actions ---------------------------------- You can run the full PySpark tests by using GitHub Actions in your own forked GitHub -repositry with a few clicks. Please refer to +repository with a few clicks. Please refer to `Running tests in your forked repository using GitHub Actions `_ for more details. From e8bd1ac797363d051214eeb98be0e3f804ce2883 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:22 -0500 Subject: [PATCH 079/103] spelling: reuses Signed-off-by: Josh Soref --- python/pyspark/sql/pandas/_typing/protocols/frame.pyi | 2 +- python/pyspark/sql/pandas/_typing/protocols/series.pyi | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/sql/pandas/_typing/protocols/frame.pyi b/python/pyspark/sql/pandas/_typing/protocols/frame.pyi index de679ee2cd017..9148e7a2dca8e 100644 --- a/python/pyspark/sql/pandas/_typing/protocols/frame.pyi +++ b/python/pyspark/sql/pandas/_typing/protocols/frame.pyi @@ -16,7 +16,7 @@ # specific language governing permissions and limitations # under the License. -# This Protocol resuses core Pandas annotation. +# This Protocol reuses core Pandas annotation. # Overall pipeline looks as follows # - Stubgen pandas.core.frame # - Add Protocol as a base class diff --git a/python/pyspark/sql/pandas/_typing/protocols/series.pyi b/python/pyspark/sql/pandas/_typing/protocols/series.pyi index 14babb067da0d..f2de2e8b129fd 100644 --- a/python/pyspark/sql/pandas/_typing/protocols/series.pyi +++ b/python/pyspark/sql/pandas/_typing/protocols/series.pyi @@ -16,7 +16,7 @@ # specific language governing permissions and limitations # under the License. -# This Protocol resuses core Pandas annotation. +# This Protocol reuses core Pandas annotation. # Overall pipeline looks as follows # - Stubgen pandas.core.series # - Add Protocol as a base class From ae4ecd0001230d405816ce3807137031a25bc8a5 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:22 -0500 Subject: [PATCH 080/103] spelling: search Signed-off-by: Josh Soref --- .../java/org/apache/spark/unsafe/types/UTF8String.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index b8dda22240042..c6aa5f0b58285 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -635,13 +635,13 @@ public UTF8String trimLeft() { public UTF8String trimLeft(UTF8String trimString) { if (trimString == null) return null; // the searching byte position in the source string - int srchIdx = 0; + int searchIdx = 0; // the first beginning byte position of a non-matching character int trimIdx = 0; - while (srchIdx < numBytes) { + while (searchIdx < numBytes) { UTF8String searchChar = copyUTF8String( - srchIdx, srchIdx + numBytesForFirstByte(this.getByte(srchIdx)) - 1); + searchIdx, searchIdx + numBytesForFirstByte(this.getByte(searchIdx)) - 1); int searchCharBytes = searchChar.numBytes; // try to find the matching for the searchChar in the trimString set if (trimString.find(searchChar, 0) >= 0) { @@ -650,9 +650,9 @@ public UTF8String trimLeft(UTF8String trimString) { // no matching, exit the search break; } - srchIdx += searchCharBytes; + searchIdx += searchCharBytes; } - if (srchIdx == 0) { + if (searchIdx == 0) { // Nothing trimmed return this; } From 6e4237db99e4f01a42578985966a15063504aa5e Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:22 -0500 Subject: [PATCH 081/103] spelling: selector Signed-off-by: Josh Soref --- python/pyspark/ml/feature.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 82b9a6db1eb92..8138f34d7a19e 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -5798,7 +5798,7 @@ def setHandleInvalid(self, value): class _VarianceThresholdSelectorParams(HasFeaturesCol, HasOutputCol): """ Params for :py:class:`VarianceThresholdSelector` and - :py:class:`VarianceThresholdSelectorrModel`. + :py:class:`VarianceThresholdSelectorModel`. .. versionadded:: 3.1.0 """ From 39106373a2a034f47c6ce32a92527be2beba9003 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:22 -0500 Subject: [PATCH 082/103] spelling: sequential Signed-off-by: Josh Soref --- R/pkg/tests/fulltests/test_utils.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/tests/fulltests/test_utils.R b/R/pkg/tests/fulltests/test_utils.R index c3fb9046fcda4..6c83a137cfb7b 100644 --- a/R/pkg/tests/fulltests/test_utils.R +++ b/R/pkg/tests/fulltests/test_utils.R @@ -116,7 +116,7 @@ test_that("cleanClosure on R functions", { actual <- get("y", envir = env, inherits = FALSE) expect_equal(actual, y) - # Test for combination for nested and sequenctial functions in a closure + # Test for combination for nested and sequential functions in a closure f1 <- function(x) x + 1 f2 <- function(x) f1(x) + 2 userFunc <- function(x) { f1(x); f2(x) } From 92e6efaaa0536db36df51b85c6501e287b4ec129 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:22 -0500 Subject: [PATCH 083/103] spelling: spark Signed-off-by: Josh Soref --- R/pkg/R/SQLContext.R | 2 +- python/docs/source/getting_started/quickstart.ipynb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R index c0ac68332ec41..5ed0481f33d8f 100644 --- a/R/pkg/R/SQLContext.R +++ b/R/pkg/R/SQLContext.R @@ -203,7 +203,7 @@ getSchema <- function(schema, firstRow = NULL, rdd = NULL) { }) } - # SPAKR-SQL does not support '.' in column name, so replace it with '_' + # SPARK-SQL does not support '.' in column name, so replace it with '_' # TODO(davies): remove this once SPARK-2775 is fixed names <- lapply(names, function(n) { nn <- gsub(".", "_", n, fixed = TRUE) diff --git a/python/docs/source/getting_started/quickstart.ipynb b/python/docs/source/getting_started/quickstart.ipynb index 5d2ca331e3afc..550b532fefc14 100644 --- a/python/docs/source/getting_started/quickstart.ipynb +++ b/python/docs/source/getting_started/quickstart.ipynb @@ -11,7 +11,7 @@ "\n", "There is also other useful information in Apache Spark documentation site, see the latest version of [Spark SQL and DataFrames](https://spark.apache.org/docs/latest/sql-programming-guide.html), [RDD Programming Guide](https://spark.apache.org/docs/latest/rdd-programming-guide.html), [Structured Streaming Programming Guide](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html), [Spark Streaming Programming Guide](https://spark.apache.org/docs/latest/streaming-programming-guide.html) and [Machine Learning Library (MLlib) Guide](https://spark.apache.org/docs/latest/ml-guide.html).\n", "\n", - "PySaprk applications start with initializing `SparkSession` which is the entry point of PySpark as below. In case of running it in PySpark shell via pyspark executable, the shell automatically creates the session in the variable spark for users." + "PySpark applications start with initializing `SparkSession` which is the entry point of PySpark as below. In case of running it in PySpark shell via pyspark executable, the shell automatically creates the session in the variable spark for users." ] }, { From f2150fefde20439ee1e322a5d443861e1ca71d4e Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:22 -0500 Subject: [PATCH 084/103] spelling: specified Signed-off-by: Josh Soref --- python/pyspark/context.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/context.py b/python/pyspark/context.py index 3c4c4f731fd2b..8dc6251323571 100644 --- a/python/pyspark/context.py +++ b/python/pyspark/context.py @@ -258,7 +258,7 @@ def _do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, sys.path.insert(1, filepath) except Exception: warnings.warn( - "Failed to add file [%s] speficied in 'spark.submit.pyFiles' to " + "Failed to add file [%s] specified in 'spark.submit.pyFiles' to " "Python path:\n %s" % (path, "\n ".join(sys.path)), RuntimeWarning) From 5b1d6af51c09ae36644279ed54b3902d2109166a Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:22 -0500 Subject: [PATCH 085/103] spelling: state Signed-off-by: Josh Soref --- .../apache/spark/streaming/rdd/MapWithStateRDDSuite.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/streaming/src/test/scala/org/apache/spark/streaming/rdd/MapWithStateRDDSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/rdd/MapWithStateRDDSuite.scala index 58ce3a93251a9..f06b1feb8c0cd 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/rdd/MapWithStateRDDSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/rdd/MapWithStateRDDSuite.scala @@ -320,7 +320,7 @@ class MapWithStateRDDSuite extends SparkFunSuite with RDDCheckpointTester with B makeStateRDDWithLongLineageDataRDD, reliableCheckpoint = true, rddCollectFunc _) /** Generate MapWithStateRDD with parent state RDD having a long lineage */ - def makeStateRDDWithLongLineageParenttateRDD( + def makeStateRDDWithLongLineageParentStateRDD( longLineageRDD: RDD[Int]): MapWithStateRDD[Int, Int, Int, Int] = { // Create a MapWithStateRDD that has a long lineage using the data RDD with a long lineage @@ -337,9 +337,9 @@ class MapWithStateRDDSuite extends SparkFunSuite with RDDCheckpointTester with B } testRDD( - makeStateRDDWithLongLineageParenttateRDD, reliableCheckpoint = true, rddCollectFunc _) + makeStateRDDWithLongLineageParentStateRDD, reliableCheckpoint = true, rddCollectFunc _) testRDDPartitions( - makeStateRDDWithLongLineageParenttateRDD, reliableCheckpoint = true, rddCollectFunc _) + makeStateRDDWithLongLineageParentStateRDD, reliableCheckpoint = true, rddCollectFunc _) } test("checkpointing empty state RDD") { From fc076be536797976418f41bdfbf7caee16c1f82b Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:22 -0500 Subject: [PATCH 086/103] spelling: stream Signed-off-by: Josh Soref --- R/pkg/inst/worker/worker.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/pkg/inst/worker/worker.R b/R/pkg/inst/worker/worker.R index 9ef10c0394a17..dd271f91d0084 100644 --- a/R/pkg/inst/worker/worker.R +++ b/R/pkg/inst/worker/worker.R @@ -85,7 +85,7 @@ outputResult <- function(serializer, output, outputCon) { } # Constants -specialLengths <- list(END_OF_STERAM = 0L, TIMING_DATA = -1L) +specialLengths <- list(END_OF_STREAM = 0L, TIMING_DATA = -1L) # Timing R process boot bootTime <- currentTimeSecs() @@ -285,7 +285,7 @@ SparkR:::writeDouble(outputCon, computeInputElapsDiff) # compute SparkR:::writeDouble(outputCon, outputComputeElapsDiff) # output # End of output -SparkR:::writeInt(outputCon, specialLengths$END_OF_STERAM) +SparkR:::writeInt(outputCon, specialLengths$END_OF_STREAM) close(outputCon) close(inputCon) From 366209e4def56d82b9f46bd4ff0536b8b55648e2 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:22 -0500 Subject: [PATCH 087/103] spelling: struct Signed-off-by: Josh Soref --- python/pyspark/sql/pandas/functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/sql/pandas/functions.py b/python/pyspark/sql/pandas/functions.py index 5c33ee84a791e..4cd0b196d3366 100644 --- a/python/pyspark/sql/pandas/functions.py +++ b/python/pyspark/sql/pandas/functions.py @@ -99,7 +99,7 @@ def pandas_udf(f=None, returnType=None, functionType=None): ... s3['col2'] = s1 + s2.str.len() ... return s3 ... - >>> # Create a Spark DataFrame that has three columns including a sturct column. + >>> # Create a Spark DataFrame that has three columns including a struct column. ... df = spark.createDataFrame( ... [[1, "a string", ("a nested string",)]], ... "long_col long, string_col string, struct_col struct") From dd21a5e8d20b28231a78d9a04676cc7ea48b52b9 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:22 -0500 Subject: [PATCH 088/103] spelling: subclassing Signed-off-by: Josh Soref --- python/pyspark/cloudpickle/cloudpickle_fast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/cloudpickle/cloudpickle_fast.py b/python/pyspark/cloudpickle/cloudpickle_fast.py index db3e115b0ca88..3c48ff7b0a885 100644 --- a/python/pyspark/cloudpickle/cloudpickle_fast.py +++ b/python/pyspark/cloudpickle/cloudpickle_fast.py @@ -6,7 +6,7 @@ is only available for Python versions 3.8+, a lot of backward-compatibility code is also removed. -Note that the C Pickler sublassing API is CPython-specific. Therefore, some +Note that the C Pickler subclassing API is CPython-specific. Therefore, some guards present in cloudpickle.py that were written to handle PyPy specificities are not present in cloudpickle_fast.py """ From a56a2e162f368cb7d69e4a9e77bf6049a7c29f76 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:22 -0500 Subject: [PATCH 089/103] spelling: subscriber Signed-off-by: Josh Soref --- .../cluster/k8s/ExecutorPodsSnapshotsStoreImpl.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsSnapshotsStoreImpl.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsSnapshotsStoreImpl.scala index 3f2cb485bbb31..22764d9d2eb0e 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsSnapshotsStoreImpl.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsSnapshotsStoreImpl.scala @@ -52,7 +52,7 @@ import org.apache.spark.util.ThreadUtils * time-windowed chunks. Each subscriber can choose to receive their snapshot chunks at different * time intervals. *
- * The subcriber notification callback is guaranteed to be called from a single thread at a time. + * The subscriber notification callback is guaranteed to be called from a single thread at a time. */ private[spark] class ExecutorPodsSnapshotsStoreImpl(subscribersExecutor: ScheduledExecutorService) extends ExecutorPodsSnapshotsStore with Logging { @@ -142,7 +142,7 @@ private[spark] class ExecutorPodsSnapshotsStoreImpl(subscribersExecutor: Schedul } if (notificationCount.decrementAndGet() > 0) { - // There was another concurrent request for this subcriber. Schedule a task to + // There was another concurrent request for this subscriber. Schedule a task to // immediately process snapshots again, so that the subscriber can pick up any // changes that may have happened between the time it started looking at snapshots // above, and the time the concurrent request arrived. From c3acb51d963efb72be5e07c219ed60d3a055ca98 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:22 -0500 Subject: [PATCH 090/103] spelling: succeeded Signed-off-by: Josh Soref --- R/pkg/tests/fulltests/test_sparkSQL.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R index 0b8afa3858cb3..97bf6baa939c1 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL.R +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -2092,7 +2092,7 @@ test_that("higher order functions", { createDataFrame(data.frame(id = 1)), expr("CAST(array(1.0, 2.0, -3.0, -4.0) AS array) xs"), expr("CAST(array(0.0, 3.0, 48.0) AS array) ys"), - expr("array('FAILED', 'SUCCEDED') as vs"), + expr("array('FAILED', 'SUCCEEDED') as vs"), expr("map('foo', 1, 'bar', 2) as mx"), expr("map('foo', 42, 'bar', -1, 'baz', 0) as my") ) From a874671d547f0f96ed30617b743dcea89254f66a Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:23 -0500 Subject: [PATCH 091/103] spelling: suppress Signed-off-by: Josh Soref --- project/SparkBuild.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index f0eec5a047e90..a5951e0452943 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -198,7 +198,7 @@ object SparkBuild extends PomBuild { ) // Silencer: Scala compiler plugin for warning suppression - // Aim: enable fatal warnings, but supress ones related to using of deprecated APIs + // Aim: enable fatal warnings, but suppress ones related to using of deprecated APIs // depends on scala version: // <2.13 - silencer 1.6.0 and compiler settings to enable fatal warnings // 2.13.0,2.13.1 - silencer 1.7.1 and compiler settings to enable fatal warnings @@ -222,7 +222,7 @@ object SparkBuild extends PomBuild { "-Xfatal-warnings", "-deprecation", "-Ywarn-unused-import", - "-P:silencer:globalFilters=.*deprecated.*" //regex to catch deprecation warnings and supress them + "-P:silencer:globalFilters=.*deprecated.*" //regex to catch deprecation warnings and suppress them ) } else { Seq( From b2a8e612d12ac523f5d8979fa55a6bf7301b566c Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:23 -0500 Subject: [PATCH 092/103] spelling: temporary Signed-off-by: Josh Soref --- R/pkg/tests/fulltests/test_sparkSQL.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R index 97bf6baa939c1..0b4ecca097c59 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL.R +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -3964,7 +3964,7 @@ test_that("catalog APIs, listTables, listColumns, listFunctions", { paste("Error in listFunctions : analysis error - Database", "'zxwtyswklpf_db' does not exist")) - # recoverPartitions does not work with tempory view + # recoverPartitions does not work with temporary view expect_error(recoverPartitions("cars"), "no such table - Table or view 'cars' not found in database 'default'") expect_error(refreshTable("cars"), NA) From 77841da0c5897abe18edd12ec48718e69610bb99 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:23 -0500 Subject: [PATCH 093/103] spelling: the Signed-off-by: Josh Soref --- R/pkg/R/DataFrame.R | 2 +- R/pkg/R/RDD.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 70aeaa55646f7..31a651ea1279b 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -3743,7 +3743,7 @@ setMethod("histogram", #' #' @param x a SparkDataFrame. #' @param url JDBC database url of the form \code{jdbc:subprotocol:subname}. -#' @param tableName yhe name of the table in the external database. +#' @param tableName the name of the table in the external database. #' @param mode one of 'append', 'overwrite', 'error', 'errorifexists', 'ignore' #' save mode (it is 'error' by default) #' @param ... additional JDBC database connection properties. diff --git a/R/pkg/R/RDD.R b/R/pkg/R/RDD.R index 7a1d157bb8a36..046af3b3a4620 100644 --- a/R/pkg/R/RDD.R +++ b/R/pkg/R/RDD.R @@ -970,7 +970,7 @@ setMethod("takeSample", signature(x = "RDD", withReplacement = "logical", MAXINT))))) # If the first sample didn't turn out large enough, keep trying to # take samples; this shouldn't happen often because we use a big - # multiplier for thei initial size + # multiplier for the initial size while (length(samples) < total) samples <- collectRDD(sampleRDD(x, withReplacement, fraction, as.integer(ceiling(stats::runif(1, From 7db5761227f87f055888b16f4f3fbb301b6e66f5 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:23 -0500 Subject: [PATCH 094/103] spelling: tracked Signed-off-by: Josh Soref --- .../org/apache/spark/streaming/MapWithStateSuite.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/streaming/src/test/scala/org/apache/spark/streaming/MapWithStateSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/MapWithStateSuite.scala index b2b8d2f41fc80..3ffaa62bd75ac 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/MapWithStateSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/MapWithStateSuite.scala @@ -541,12 +541,12 @@ class MapWithStateSuite extends SparkFunSuite with LocalStreamingContext // Setup the stream computation val ssc = new StreamingContext(sc, Seconds(1)) val inputStream = new TestInputStream(ssc, input, numPartitions = 2) - val trackeStateStream = inputStream.map(x => (x, 1)).mapWithState(mapWithStateSpec) + val trackedStateStream = inputStream.map(x => (x, 1)).mapWithState(mapWithStateSpec) val collectedOutputs = new ConcurrentLinkedQueue[Seq[T]] - val outputStream = new TestOutputStream(trackeStateStream, collectedOutputs) + val outputStream = new TestOutputStream(trackedStateStream, collectedOutputs) val collectedStateSnapshots = new ConcurrentLinkedQueue[Seq[(K, S)]] val stateSnapshotStream = new TestOutputStream( - trackeStateStream.stateSnapshots(), collectedStateSnapshots) + trackedStateStream.stateSnapshots(), collectedStateSnapshots) outputStream.register() stateSnapshotStream.register() From e829cbe8ac144dd1a360de622e5d2f45f127d48e Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 22:41:32 -0500 Subject: [PATCH 095/103] spelling: transferred Signed-off-by: Josh Soref --- .../spark/network/crypto/AuthEngineSuite.java | 10 +++++----- .../network/protocol/MessageWithHeaderSuite.java | 4 ++-- .../spark/network/sasl/SparkSaslSuite.java | 16 ++++++++-------- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/common/network-common/src/test/java/org/apache/spark/network/crypto/AuthEngineSuite.java b/common/network-common/src/test/java/org/apache/spark/network/crypto/AuthEngineSuite.java index 0790f0079c2bd..1c2061699a128 100644 --- a/common/network-common/src/test/java/org/apache/spark/network/crypto/AuthEngineSuite.java +++ b/common/network-common/src/test/java/org/apache/spark/network/crypto/AuthEngineSuite.java @@ -150,8 +150,8 @@ public void testEncryptedMessage() throws Exception { ByteArrayWritableChannel channel = new ByteArrayWritableChannel(data.length); TransportCipher.EncryptedMessage emsg = handler.createEncryptedMessage(buf); - while (emsg.transfered() < emsg.count()) { - emsg.transferTo(channel, emsg.transfered()); + while (emsg.transferred() < emsg.count()) { + emsg.transferTo(channel, emsg.transferred()); } assertEquals(data.length, channel.length()); } finally { @@ -196,9 +196,9 @@ public Long answer(InvocationOnMock invocationOnMock) throws Throwable { TransportCipher.EncryptedMessage emsg = handler.createEncryptedMessage(region); ByteArrayWritableChannel channel = new ByteArrayWritableChannel(testDataLength); // "transferTo" should act correctly when the underlying FileRegion transfers 0 bytes. - assertEquals(0L, emsg.transferTo(channel, emsg.transfered())); - assertEquals(testDataLength, emsg.transferTo(channel, emsg.transfered())); - assertEquals(emsg.transfered(), emsg.count()); + assertEquals(0L, emsg.transferTo(channel, emsg.transferred())); + assertEquals(testDataLength, emsg.transferTo(channel, emsg.transferred())); + assertEquals(emsg.transferred(), emsg.count()); assertEquals(4, channel.length()); } finally { client.close(); diff --git a/common/network-common/src/test/java/org/apache/spark/network/protocol/MessageWithHeaderSuite.java b/common/network-common/src/test/java/org/apache/spark/network/protocol/MessageWithHeaderSuite.java index 3bff34e210e3c..af1c2878672c0 100644 --- a/common/network-common/src/test/java/org/apache/spark/network/protocol/MessageWithHeaderSuite.java +++ b/common/network-common/src/test/java/org/apache/spark/network/protocol/MessageWithHeaderSuite.java @@ -129,8 +129,8 @@ private void testFileRegionBody(int totalWrites, int writesPerCall) throws Excep private ByteBuf doWrite(MessageWithHeader msg, int minExpectedWrites) throws Exception { int writes = 0; ByteArrayWritableChannel channel = new ByteArrayWritableChannel((int) msg.count()); - while (msg.transfered() < msg.count()) { - msg.transferTo(channel, msg.transfered()); + while (msg.transferred() < msg.count()) { + msg.transferTo(channel, msg.transferred()); writes++; } assertTrue("Not enough writes!", minExpectedWrites <= writes); diff --git a/common/network-common/src/test/java/org/apache/spark/network/sasl/SparkSaslSuite.java b/common/network-common/src/test/java/org/apache/spark/network/sasl/SparkSaslSuite.java index ecaeec98da182..32c9acd327213 100644 --- a/common/network-common/src/test/java/org/apache/spark/network/sasl/SparkSaslSuite.java +++ b/common/network-common/src/test/java/org/apache/spark/network/sasl/SparkSaslSuite.java @@ -191,28 +191,28 @@ public void testEncryptedMessage() throws Exception { SaslEncryption.EncryptedMessage emsg = new SaslEncryption.EncryptedMessage(backend, msg, 1024); - long count = emsg.transferTo(channel, emsg.transfered()); + long count = emsg.transferTo(channel, emsg.transferred()); assertTrue(count < data.length); assertTrue(count > 0); // Here, the output buffer is full so nothing should be transferred. - assertEquals(0, emsg.transferTo(channel, emsg.transfered())); + assertEquals(0, emsg.transferTo(channel, emsg.transferred())); // Now there's room in the buffer, but not enough to transfer all the remaining data, // so the dummy count should be returned. channel.reset(); - assertEquals(1, emsg.transferTo(channel, emsg.transfered())); + assertEquals(1, emsg.transferTo(channel, emsg.transferred())); // Eventually, the whole message should be transferred. for (int i = 0; i < data.length / 32 - 2; i++) { channel.reset(); - assertEquals(1, emsg.transferTo(channel, emsg.transfered())); + assertEquals(1, emsg.transferTo(channel, emsg.transferred())); } channel.reset(); - count = emsg.transferTo(channel, emsg.transfered()); + count = emsg.transferTo(channel, emsg.transferred()); assertTrue("Unexpected count: " + count, count > 1 && count < data.length); - assertEquals(data.length, emsg.transfered()); + assertEquals(data.length, emsg.transferred()); } finally { msg.release(); } @@ -237,9 +237,9 @@ public void testEncryptedMessageChunking() throws Exception { new SaslEncryption.EncryptedMessage(backend, msg.convertToNetty(), data.length / 8); ByteArrayWritableChannel channel = new ByteArrayWritableChannel(data.length); - while (emsg.transfered() < emsg.count()) { + while (emsg.transferred() < emsg.count()) { channel.reset(); - emsg.transferTo(channel, emsg.transfered()); + emsg.transferTo(channel, emsg.transferred()); } verify(backend, times(8)).wrap(any(byte[].class), anyInt(), anyInt()); From 710b57976cb417e5a3aae1bc23a8af8addabc54a Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:23 -0500 Subject: [PATCH 096/103] spelling: unencrypted Signed-off-by: Josh Soref --- .../org/apache/spark/network/shuffle/SimpleDownloadFile.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/SimpleDownloadFile.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/SimpleDownloadFile.java index 670612fd6f66a..97ecaa627b66c 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/SimpleDownloadFile.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/SimpleDownloadFile.java @@ -32,7 +32,7 @@ * A DownloadFile that does not take any encryption settings into account for reading and * writing data. * - * This does *not* mean the data in the file is un-encrypted -- it could be that the data is + * This does *not* mean the data in the file is unencrypted -- it could be that the data is * already encrypted when its written, and subsequent layer is responsible for decrypting. */ public class SimpleDownloadFile implements DownloadFile { From 3aed2448f442dd00bbf9294f579c40766acefe45 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:23 -0500 Subject: [PATCH 097/103] spelling: unsigned Signed-off-by: Josh Soref --- R/pkg/R/functions.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index f0ce86a494e01..d5a5861d79b15 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -2879,7 +2879,7 @@ setMethod("shiftRight", signature(y = "Column", x = "numeric"), }) #' @details -#' \code{shiftRightUnsigned}: (Unigned) shifts the given value numBits right. If the given value is +#' \code{shiftRightUnsigned}: (Unsigned) shifts the given value numBits right. If the given value is #' a long value, it will return a long value else it will return an integer value. #' #' @rdname column_math_functions From e0886bc1801338efb4da8ddbe739f7e8fe33107a Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:23 -0500 Subject: [PATCH 098/103] spelling: uploaded Signed-off-by: Josh Soref --- .../src/main/scala/org/apache/spark/deploy/k8s/Config.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala index e3af1ccc24f1c..41194f3a2676f 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala @@ -420,7 +420,7 @@ private[spark] object Config extends Logging { val KUBERNETES_FILE_UPLOAD_PATH = ConfigBuilder("spark.kubernetes.file.upload.path") .doc("Hadoop compatible file system path where files from the local file system " + - "will be uploded to in cluster mode.") + "will be uploaded to in cluster mode.") .version("3.0.0") .stringConf .createOptional From 581c627d0382b4d6a826fcccf9e2bd159bdb3c0d Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:23 -0500 Subject: [PATCH 099/103] spelling: uploading Signed-off-by: Josh Soref --- dev/create-release/release-build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index 240f4c8dfd371..d2953a86afafd 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -452,7 +452,7 @@ if [[ "$1" == "publish-release" ]]; then if ! is_dry_run; then nexus_upload=$NEXUS_ROOT/deployByRepositoryId/$staged_repo_id - echo "Uplading files to $nexus_upload" + echo "Uploading files to $nexus_upload" for file in $(find . -type f) do # strip leading ./ From 4041ea72d6b0ad518918180926de3331bad92e73 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:24 -0500 Subject: [PATCH 100/103] spelling: visited Signed-off-by: Josh Soref --- dev/github_jira_sync.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/github_jira_sync.py b/dev/github_jira_sync.py index 7eca03fc63d04..27451bba905dd 100755 --- a/dev/github_jira_sync.py +++ b/dev/github_jira_sync.py @@ -144,7 +144,7 @@ def reset_pr_labels(pr_num, jira_components): previous_max = get_max_pr() print("Retrieved %s JIRA PR's from GitHub" % len(jira_prs)) jira_prs = [(k, v) for k, v in jira_prs if int(v['number']) > previous_max] -print("%s PR's remain after excluding visted ones" % len(jira_prs)) +print("%s PR's remain after excluding visited ones" % len(jira_prs)) num_updates = 0 considered = [] From 69349c7b120a12ae8e13afa3918dff85a8e8446c Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:24 -0500 Subject: [PATCH 101/103] spelling: warning Signed-off-by: Josh Soref --- python/pyspark/mllib/evaluation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/mllib/evaluation.py b/python/pyspark/mllib/evaluation.py index f3be827fb6e4f..38808ed5f48f7 100644 --- a/python/pyspark/mllib/evaluation.py +++ b/python/pyspark/mllib/evaluation.py @@ -439,7 +439,7 @@ def meanAveragePrecision(self): """ Returns the mean average precision (MAP) of all the queries. If a query has an empty ground truth set, the average precision will be zero and - a log warining is generated. + a log warning is generated. """ return self.call("meanAveragePrecision") @@ -448,7 +448,7 @@ def meanAveragePrecisionAt(self, k): """ Returns the mean average precision (MAP) at first k ranking of all the queries. If a query has an empty ground truth set, the average precision will be zero and - a log warining is generated. + a log warning is generated. """ return self.call("meanAveragePrecisionAt", int(k)) From c5a047844a018826b1a4321169ebc52f4f92af43 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:24 -0500 Subject: [PATCH 102/103] spelling: without Signed-off-by: Josh Soref --- R/pkg/R/RDD.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/R/RDD.R b/R/pkg/R/RDD.R index 046af3b3a4620..408a3ff25b2b2 100644 --- a/R/pkg/R/RDD.R +++ b/R/pkg/R/RDD.R @@ -1512,7 +1512,7 @@ setMethod("glom", #' #' @param x An RDD. #' @param y An RDD. -#' @return a new RDD created by performing the simple union (witout removing +#' @return a new RDD created by performing the simple union (without removing #' duplicates) of two input RDDs. #' @examples #'\dontrun{ From 070e6bbc5c3d02c825000e833614181d247ae6b4 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 10 Nov 2020 21:17:24 -0500 Subject: [PATCH 103/103] spelling: written Signed-off-by: Josh Soref --- python/pyspark/streaming/context.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/streaming/context.py b/python/pyspark/streaming/context.py index c4dc0d3af3332..2e6d7ede88551 100644 --- a/python/pyspark/streaming/context.py +++ b/python/pyspark/streaming/context.py @@ -281,7 +281,7 @@ def socketTextStream(self, hostname, port, storageLevel=StorageLevel.MEMORY_AND_ def textFileStream(self, directory): """ Create an input stream that monitors a Hadoop-compatible file system - for new files and reads them as text files. Files must be wrriten to the + for new files and reads them as text files. Files must be written to the monitored directory by "moving" them from another location within the same file system. File names starting with . are ignored. The text files must be encoded as UTF-8.