From 5dd3630d6a937ba8634054940543509d9186e68e Mon Sep 17 00:00:00 2001 From: Mark Grover Date: Wed, 16 Nov 2016 17:47:17 -0800 Subject: [PATCH 1/8] [SPARK-18535][UI][YARN] Redact sensitive information from Spark logs and UI This commit adds a new property called `spark.secret.redactionPattern` that allows users to specify a scala regex to decide which Spark configuration properties and environment variables in driver and executor environments contain sensitive information. When this regex matches the property or environment variable name, its value is redacted from the environment UI and various logs like YARN and event logs. This change uses this property to redact information from event logs and YARN logs. It also, updates the UI code to adhere to this property instead of hardcoding the logic to decipher which properties are sensitive. For testing: 1. Unit tests are added to ensure that redaction works. 2. A YARN job reading data off of S3 with confidential information (hadoop credential provider password) being provided in the environment variables of driver and executor. And, afterwards, logs were grepped to make sure that no mention of secret password was present. It was also ensure that the job was able to read the data off of S3 correctly, thereby ensuring that the sensitive information was being trickled down to the right places to read the data. 3. The event logs were checked to make sure no mention of secret password was present. 4. UI environment tab was checked to make sure there was no secret information being displayed. --- .../spark/internal/config/package.scala | 9 +++++++ .../scheduler/EventLoggingListener.scala | 17 ++++++++++++- .../apache/spark/ui/env/EnvironmentPage.scala | 11 +++------ .../apache/spark/ui/env/EnvironmentTab.scala | 1 + .../scala/org/apache/spark/util/Utils.scala | 11 ++++++++- .../scheduler/EventLoggingListenerSuite.scala | 24 +++++++++++++++++++ .../org/apache/spark/util/UtilsSuite.scala | 14 +++++++++++ docs/configuration.md | 10 ++++++++ .../spark/deploy/yarn/ExecutorRunnable.scala | 3 +-- 9 files changed, 88 insertions(+), 12 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index 2951bdc18bc57..0ecda92f35dec 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -223,4 +223,13 @@ package object config { " bigger files.") .longConf .createWithDefault(4 * 1024 * 1024) + + private[spark] val SECRET_REDACTION_PATTERN = + ConfigBuilder("spark.secret.redactionPattern") + .doc("Scala regex(case-sensitive) to decide which Spark configuration properties and " + + "environment variables in driver and executor environments contain sensitive information." + + " When this regex matches the property or environment variable name, its value is " + + "redacted from the environment UI and various logs like YARN and event logs") + .stringConf + .createWithDefault("secret|password|SECRET|PASSWORD") } diff --git a/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala b/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala index ce7877469f03f..f010ca7760b6a 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala @@ -153,7 +153,9 @@ private[spark] class EventLoggingListener( override def onTaskEnd(event: SparkListenerTaskEnd): Unit = logEvent(event) - override def onEnvironmentUpdate(event: SparkListenerEnvironmentUpdate): Unit = logEvent(event) + override def onEnvironmentUpdate(event: SparkListenerEnvironmentUpdate): Unit = { + logEvent(redactEvent(event)) + } // Events that trigger a flush override def onStageCompleted(event: SparkListenerStageCompleted): Unit = { @@ -231,6 +233,19 @@ private[spark] class EventLoggingListener( } } + + private def redactEvent(event: SparkListenerEnvironmentUpdate): SparkListenerEnvironmentUpdate = { + // "Spark Properties" entry will always exist because the map is always populated with it. + val redactedProps = event + .environmentDetails + .get("Spark Properties") + .get + .map(Utils.redact(sparkConf)) + val redactedEnvironmentDetails = event.environmentDetails + + ("Spark Properties" -> redactedProps) + SparkListenerEnvironmentUpdate(redactedEnvironmentDetails) + } + } private[spark] object EventLoggingListener extends Logging { diff --git a/core/src/main/scala/org/apache/spark/ui/env/EnvironmentPage.scala b/core/src/main/scala/org/apache/spark/ui/env/EnvironmentPage.scala index 9f6e9a6c9037b..d7dc2f5bd1e37 100644 --- a/core/src/main/scala/org/apache/spark/ui/env/EnvironmentPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/env/EnvironmentPage.scala @@ -22,21 +22,16 @@ import javax.servlet.http.HttpServletRequest import scala.xml.Node import org.apache.spark.ui.{UIUtils, WebUIPage} +import org.apache.spark.util.Utils private[ui] class EnvironmentPage(parent: EnvironmentTab) extends WebUIPage("") { private val listener = parent.listener - private def removePass(kv: (String, String)): (String, String) = { - if (kv._1.toLowerCase.contains("password") || kv._1.toLowerCase.contains("secret")) { - (kv._1, "******") - } else kv - } - def render(request: HttpServletRequest): Seq[Node] = { val runtimeInformationTable = UIUtils.listingTable( propertyHeader, jvmRow, listener.jvmInformation, fixedWidth = true) - val sparkPropertiesTable = UIUtils.listingTable( - propertyHeader, propertyRow, listener.sparkProperties.map(removePass), fixedWidth = true) + val sparkPropertiesTable = UIUtils.listingTable(propertyHeader, propertyRow, listener + .sparkProperties.map(Utils.redact(parent.conf)), fixedWidth = true) val systemPropertiesTable = UIUtils.listingTable( propertyHeader, propertyRow, listener.systemProperties, fixedWidth = true) val classpathEntriesTable = UIUtils.listingTable( diff --git a/core/src/main/scala/org/apache/spark/ui/env/EnvironmentTab.scala b/core/src/main/scala/org/apache/spark/ui/env/EnvironmentTab.scala index f62260c6f6e1d..70b3ffd95e605 100644 --- a/core/src/main/scala/org/apache/spark/ui/env/EnvironmentTab.scala +++ b/core/src/main/scala/org/apache/spark/ui/env/EnvironmentTab.scala @@ -23,6 +23,7 @@ import org.apache.spark.ui._ private[ui] class EnvironmentTab(parent: SparkUI) extends SparkUITab(parent, "environment") { val listener = parent.environmentListener + val conf = parent.conf attachPage(new EnvironmentPage(this)) } diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 748d729554fca..abf71dde777b2 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -55,7 +55,7 @@ import org.slf4j.Logger import org.apache.spark._ import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging -import org.apache.spark.internal.config.{DYN_ALLOCATION_INITIAL_EXECUTORS, DYN_ALLOCATION_MIN_EXECUTORS, EXECUTOR_INSTANCES} +import org.apache.spark.internal.config._ import org.apache.spark.network.util.JavaUtils import org.apache.spark.serializer.{DeserializationStream, SerializationStream, SerializerInstance} import org.apache.spark.util.logging.RollingFileAppender @@ -2555,6 +2555,15 @@ private[spark] object Utils extends Logging { sparkJars.map(_.split(",")).map(_.filter(_.nonEmpty)).toSeq.flatten } } + + private[util] val REDACTION_REPLACEMENT_TEXT = "*********(redacted)" + def redact(conf: SparkConf)(kv: (String, String)): (String, String) = { + val redactionPattern = conf.get(SECRET_REDACTION_PATTERN).r + if (redactionPattern.findFirstIn(kv._1).isDefined) { + (kv._1, REDACTION_REPLACEMENT_TEXT) + } else kv + } + } private[util] object CallerContext extends Logging { diff --git a/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala index 8a5ec37eeb66c..8664114bd0101 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala @@ -95,6 +95,30 @@ class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext wit } } + test("Event logging with password redaction") { + val secretPassword = "secret_password" + val conf = getLoggingConf(testDirPath, None).set("spark.executorEnv.HADOOP_CREDSTORE_PASSWORD", + secretPassword) + sc = new SparkContext("local-cluster[2,2,1024]", "test", conf) + assert(sc.eventLogger.isDefined) + val eventLogger = sc.eventLogger.get + + sc.parallelize(1 to 10000).count() + sc.stop() + + val logData = EventLoggingListener.openEventLog(new Path(eventLogger.logPath), fileSystem) + val eventLog = Source.fromInputStream(logData).mkString + // Make sure nothing secret shows up anywhere + assert(!eventLog.contains(secretPassword), s"Secret password ($secretPassword) not redacted " + + s"from event logs:\n $eventLog") + val expected = """"spark.executorEnv.HADOOP_CREDSTORE_PASSWORD":"*********(redacted)"""" + // Make sure every occurrence of the property is accompanied by a redaction text. + val regex = """"spark.executorEnv.HADOOP_CREDSTORE_PASSWORD":"([^"]*)"""".r + val matches = regex.findAllIn(eventLog) + assert(matches.nonEmpty) + matches.foreach(matched => assert(matched.equals(expected))) + } + test("Log overwriting") { val logUri = EventLoggingListener.getLogPath(testDir.toURI, "test", None) val logPath = new URI(logUri).getPath diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index feacfb7642f27..bb27d1f6b5db1 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -974,4 +974,18 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging { assert(pValue > threshold) } + + test("redact sensitive information") { + val sparkConf = new SparkConf + sparkConf.set("spark.executorEnv.HADOOP_CREDSTORE_PASSWORD", "secret_password") + sparkConf.set("spark.my.password", "secret_password") + sparkConf.set("spark.my.secret", "secret_password") + sparkConf.set("spark.regular.property", "not_a_secret") + val redactedConf = sparkConf.getAll.map(Utils.redact(sparkConf)).toMap + assert(redactedConf.get("spark.executorEnv.HADOOP_CREDSTORE_PASSWORD").get == Utils + .REDACTION_REPLACEMENT_TEXT) + assert(redactedConf.get("spark.my.password").get == Utils.REDACTION_REPLACEMENT_TEXT) + assert(redactedConf.get("spark.my.secret").get == Utils.REDACTION_REPLACEMENT_TEXT) + assert(redactedConf.get("spark.regular.property").get == "not_a_secret") + } } diff --git a/docs/configuration.md b/docs/configuration.md index a3b4ff01e6d92..5e8bd41a7f7ba 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -356,6 +356,16 @@ Apart from these, the following properties are also available, and may be useful process. The user can specify multiple of these to set multiple environment variables. + + spark.secret.redactionPattern + secret|password|SECRET|PASSWORD + + Scala regex(case-sensitive) to decide which Spark configuration properties and environment + variables in driver and executor environments contain sensitive information. When this + regex matches the property or environment variable name, its value is redacted from the + environment UI and various logs like YARN and event logs. + + spark.python.profile false diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala index 8e0533f39ae53..f937564ab8840 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala @@ -36,7 +36,6 @@ import org.apache.hadoop.yarn.ipc.YarnRPC import org.apache.hadoop.yarn.util.{ConverterUtils, Records} import org.apache.spark.{SecurityManager, SparkConf, SparkException} -import org.apache.spark.deploy.yarn.config._ import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ import org.apache.spark.launcher.YarnCommandBuilderUtils @@ -75,7 +74,7 @@ private[yarn] class ExecutorRunnable( |=============================================================================== |YARN executor launch context: | env: - |${env.map { case (k, v) => s" $k -> $v\n" }.mkString} + |${env.map(Utils.redact(sparkConf)).map { case (k, v) => s" $k -> $v\n" }.mkString} | command: | ${commands.mkString(" \\ \n ")} | From b0ad319f43d39181c15f0b30b9bb45c8dc0ee279 Mon Sep 17 00:00:00 2001 From: Mark Grover Date: Tue, 22 Nov 2016 16:04:18 -0800 Subject: [PATCH 2/8] Review feedback: 1. Renaming the property to not have word secret in it. 2. Making the regex case insensitive. 3. Other minor changes --- .../spark/internal/config/package.scala | 12 +++++----- .../scala/org/apache/spark/util/Utils.scala | 1 + .../scheduler/EventLoggingListenerSuite.scala | 2 +- .../org/apache/spark/util/UtilsSuite.scala | 24 +++++++++++++------ docs/configuration.md | 11 ++++----- 5 files changed, 30 insertions(+), 20 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index 0ecda92f35dec..eaf6a787ce778 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -225,11 +225,11 @@ package object config { .createWithDefault(4 * 1024 * 1024) private[spark] val SECRET_REDACTION_PATTERN = - ConfigBuilder("spark.secret.redactionPattern") - .doc("Scala regex(case-sensitive) to decide which Spark configuration properties and " + - "environment variables in driver and executor environments contain sensitive information." + - " When this regex matches the property or environment variable name, its value is " + - "redacted from the environment UI and various logs like YARN and event logs") + ConfigBuilder("spark.redacton.regex") + .doc("Regex to decide which Spark configuration properties and environment variables in " + + "driver and executor environments contain sensitive information. When this regex matches " + + "a property , its value is redacted from the environment UI and various logs like YARN " + + "and event logs") .stringConf - .createWithDefault("secret|password|SECRET|PASSWORD") + .createWithDefault("(?i)secret|password") } diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index abf71dde777b2..76f3aa6f5852f 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -2557,6 +2557,7 @@ private[spark] object Utils extends Logging { } private[util] val REDACTION_REPLACEMENT_TEXT = "*********(redacted)" + def redact(conf: SparkConf)(kv: (String, String)): (String, String) = { val redactionPattern = conf.get(SECRET_REDACTION_PATTERN).r if (redactionPattern.findFirstIn(kv._1).isDefined) { diff --git a/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala index 8664114bd0101..3331ba55c35f8 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala @@ -116,7 +116,7 @@ class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext wit val regex = """"spark.executorEnv.HADOOP_CREDSTORE_PASSWORD":"([^"]*)"""".r val matches = regex.findAllIn(eventLog) assert(matches.nonEmpty) - matches.foreach(matched => assert(matched.equals(expected))) + matches.foreach{ matched => assert(matched.equals(expected)) } } test("Log overwriting") { diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index bb27d1f6b5db1..cc0d51eff1a62 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -977,15 +977,25 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging { test("redact sensitive information") { val sparkConf = new SparkConf - sparkConf.set("spark.executorEnv.HADOOP_CREDSTORE_PASSWORD", "secret_password") - sparkConf.set("spark.my.password", "secret_password") - sparkConf.set("spark.my.secret", "secret_password") + + // Set some secret keys + val secretKeys = Seq("" + + "spark.executorEnv.HADOOP_CREDSTORE_PASSWORD", + "spark.my.password", + "spark.my.sECreT") + secretKeys.foreach { key => + sparkConf.set(key, "secret_password") + } + // Set a non-secret key sparkConf.set("spark.regular.property", "not_a_secret") + + // Redact sensitive information val redactedConf = sparkConf.getAll.map(Utils.redact(sparkConf)).toMap - assert(redactedConf.get("spark.executorEnv.HADOOP_CREDSTORE_PASSWORD").get == Utils - .REDACTION_REPLACEMENT_TEXT) - assert(redactedConf.get("spark.my.password").get == Utils.REDACTION_REPLACEMENT_TEXT) - assert(redactedConf.get("spark.my.secret").get == Utils.REDACTION_REPLACEMENT_TEXT) + + // Assert that secret information got redacted while the regular property remained the same + secretKeys.foreach { key => + assert(redactedConf.get(key).get == Utils.REDACTION_REPLACEMENT_TEXT) + } assert(redactedConf.get("spark.regular.property").get == "not_a_secret") } } diff --git a/docs/configuration.md b/docs/configuration.md index 5e8bd41a7f7ba..aa201c6b6a7ea 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -357,13 +357,12 @@ Apart from these, the following properties are also available, and may be useful - spark.secret.redactionPattern - secret|password|SECRET|PASSWORD + spark.redaction.regex + (?i)secret|password - Scala regex(case-sensitive) to decide which Spark configuration properties and environment - variables in driver and executor environments contain sensitive information. When this - regex matches the property or environment variable name, its value is redacted from the - environment UI and various logs like YARN and event logs. + Regex to decide which Spark configuration properties and environment variables in driver and + executor environments contain sensitive information. When this regex matches a property, its + value is redacted from the environment UI and various logs like YARN and event logs. From 78e439837821cecccde960af37380644393778d6 Mon Sep 17 00:00:00 2001 From: Mark Grover Date: Tue, 22 Nov 2016 16:14:20 -0800 Subject: [PATCH 3/8] Fixing a typo --- .../main/scala/org/apache/spark/internal/config/package.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index eaf6a787ce778..5c436c700cf43 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -225,7 +225,7 @@ package object config { .createWithDefault(4 * 1024 * 1024) private[spark] val SECRET_REDACTION_PATTERN = - ConfigBuilder("spark.redacton.regex") + ConfigBuilder("spark.redaction.regex") .doc("Regex to decide which Spark configuration properties and environment variables in " + "driver and executor environments contain sensitive information. When this regex matches " + "a property , its value is redacted from the environment UI and various logs like YARN " + From eed33db29d09b4730617fe1502da69a99ea9df42 Mon Sep 17 00:00:00 2001 From: Mark Grover Date: Tue, 22 Nov 2016 17:42:22 -0800 Subject: [PATCH 4/8] More review feedback. Having Utils.redact() take in the list of tuples to update. The only pending comment left is the making the test in EventLoggingListenerSuite better --- .../apache/spark/scheduler/EventLoggingListener.scala | 4 ++-- .../org/apache/spark/ui/env/EnvironmentPage.scala | 5 +++-- core/src/main/scala/org/apache/spark/util/Utils.scala | 11 +++++++---- .../test/scala/org/apache/spark/util/UtilsSuite.scala | 2 +- .../apache/spark/deploy/yarn/ExecutorRunnable.scala | 2 +- 5 files changed, 14 insertions(+), 10 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala b/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala index f010ca7760b6a..5826df0c4083f 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala @@ -236,11 +236,11 @@ private[spark] class EventLoggingListener( private def redactEvent(event: SparkListenerEnvironmentUpdate): SparkListenerEnvironmentUpdate = { // "Spark Properties" entry will always exist because the map is always populated with it. - val redactedProps = event + val props = event .environmentDetails .get("Spark Properties") .get - .map(Utils.redact(sparkConf)) + val redactedProps = Utils.redact(sparkConf, props) val redactedEnvironmentDetails = event.environmentDetails + ("Spark Properties" -> redactedProps) SparkListenerEnvironmentUpdate(redactedEnvironmentDetails) diff --git a/core/src/main/scala/org/apache/spark/ui/env/EnvironmentPage.scala b/core/src/main/scala/org/apache/spark/ui/env/EnvironmentPage.scala index d7dc2f5bd1e37..b11f8f1555f17 100644 --- a/core/src/main/scala/org/apache/spark/ui/env/EnvironmentPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/env/EnvironmentPage.scala @@ -30,8 +30,9 @@ private[ui] class EnvironmentPage(parent: EnvironmentTab) extends WebUIPage("") def render(request: HttpServletRequest): Seq[Node] = { val runtimeInformationTable = UIUtils.listingTable( propertyHeader, jvmRow, listener.jvmInformation, fixedWidth = true) - val sparkPropertiesTable = UIUtils.listingTable(propertyHeader, propertyRow, listener - .sparkProperties.map(Utils.redact(parent.conf)), fixedWidth = true) + val sparkPropertiesTable = UIUtils.listingTable(propertyHeader, propertyRow, + Utils.redact(parent.conf, listener.sparkProperties), fixedWidth = true) + val systemPropertiesTable = UIUtils.listingTable( propertyHeader, propertyRow, listener.systemProperties, fixedWidth = true) val classpathEntriesTable = UIUtils.listingTable( diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 76f3aa6f5852f..93f3522852366 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -2558,11 +2558,14 @@ private[spark] object Utils extends Logging { private[util] val REDACTION_REPLACEMENT_TEXT = "*********(redacted)" - def redact(conf: SparkConf)(kv: (String, String)): (String, String) = { + def redact(conf: SparkConf, kvs: Seq[(String, String)]): Seq[(String, String)] = { val redactionPattern = conf.get(SECRET_REDACTION_PATTERN).r - if (redactionPattern.findFirstIn(kv._1).isDefined) { - (kv._1, REDACTION_REPLACEMENT_TEXT) - } else kv + kvs.map { kv => + if (redactionPattern.findFirstIn(kv._1).isDefined) { + (kv._1, REDACTION_REPLACEMENT_TEXT) + } + else kv + } } } diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index cc0d51eff1a62..53cbd99cb569d 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -990,7 +990,7 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging { sparkConf.set("spark.regular.property", "not_a_secret") // Redact sensitive information - val redactedConf = sparkConf.getAll.map(Utils.redact(sparkConf)).toMap + val redactedConf = Utils.redact(sparkConf, sparkConf.getAll).toMap // Assert that secret information got redacted while the regular property remained the same secretKeys.foreach { key => diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala index f937564ab8840..868c2edc5a463 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala @@ -74,7 +74,7 @@ private[yarn] class ExecutorRunnable( |=============================================================================== |YARN executor launch context: | env: - |${env.map(Utils.redact(sparkConf)).map { case (k, v) => s" $k -> $v\n" }.mkString} + |${Utils.redact(sparkConf, env.toSeq).map { case (k, v) => s" $k -> $v\n" }.mkString} | command: | ${commands.mkString(" \\ \n ")} | From 84a7ef38c67d76a1c79a16542f98e8752e1fe8ed Mon Sep 17 00:00:00 2001 From: Mark Grover Date: Wed, 23 Nov 2016 11:07:47 -0800 Subject: [PATCH 5/8] More review feedback --- .../org/apache/spark/internal/config/package.scala | 4 ++-- .../spark/scheduler/EventLoggingListener.scala | 5 +---- .../src/main/scala/org/apache/spark/util/Utils.scala | 7 +++---- .../scala/org/apache/spark/util/UtilsSuite.scala | 12 ++++-------- 4 files changed, 10 insertions(+), 18 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index 5c436c700cf43..a69a2b5645ddb 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -228,8 +228,8 @@ package object config { ConfigBuilder("spark.redaction.regex") .doc("Regex to decide which Spark configuration properties and environment variables in " + "driver and executor environments contain sensitive information. When this regex matches " + - "a property , its value is redacted from the environment UI and various logs like YARN " + - "and event logs") + "a property, its value is redacted from the environment UI and various logs like YARN " + + "and event logs.") .stringConf .createWithDefault("(?i)secret|password") } diff --git a/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala b/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala index 5826df0c4083f..4aa2c44fccca5 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala @@ -233,13 +233,10 @@ private[spark] class EventLoggingListener( } } - private def redactEvent(event: SparkListenerEnvironmentUpdate): SparkListenerEnvironmentUpdate = { // "Spark Properties" entry will always exist because the map is always populated with it. val props = event - .environmentDetails - .get("Spark Properties") - .get + .environmentDetails("Spark Properties") val redactedProps = Utils.redact(sparkConf, props) val redactedEnvironmentDetails = event.environmentDetails + ("Spark Properties" -> redactedProps) diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 93f3522852366..249e0b3cb96e6 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -2561,11 +2561,10 @@ private[spark] object Utils extends Logging { def redact(conf: SparkConf, kvs: Seq[(String, String)]): Seq[(String, String)] = { val redactionPattern = conf.get(SECRET_REDACTION_PATTERN).r kvs.map { kv => - if (redactionPattern.findFirstIn(kv._1).isDefined) { - (kv._1, REDACTION_REPLACEMENT_TEXT) + redactionPattern.findFirstIn(kv._1) + .map{ ignore => (kv._1, REDACTION_REPLACEMENT_TEXT) } + .getOrElse(kv) } - else kv - } } } diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index 53cbd99cb569d..3bdaab291a7c9 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -979,13 +979,11 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging { val sparkConf = new SparkConf // Set some secret keys - val secretKeys = Seq("" + + val secretKeys = Seq( "spark.executorEnv.HADOOP_CREDSTORE_PASSWORD", "spark.my.password", "spark.my.sECreT") - secretKeys.foreach { key => - sparkConf.set(key, "secret_password") - } + secretKeys.foreach { key => sparkConf.set(key, "secret_password") } // Set a non-secret key sparkConf.set("spark.regular.property", "not_a_secret") @@ -993,9 +991,7 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging { val redactedConf = Utils.redact(sparkConf, sparkConf.getAll).toMap // Assert that secret information got redacted while the regular property remained the same - secretKeys.foreach { key => - assert(redactedConf.get(key).get == Utils.REDACTION_REPLACEMENT_TEXT) - } - assert(redactedConf.get("spark.regular.property").get == "not_a_secret") + secretKeys.foreach { key => assert(redactedConf(key) == Utils.REDACTION_REPLACEMENT_TEXT) } + assert(redactedConf("spark.regular.property") == "not_a_secret") } } From 549881bae3652a3fa9aac9701b8aaf50ef46bd79 Mon Sep 17 00:00:00 2001 From: Mark Grover Date: Wed, 23 Nov 2016 11:48:11 -0800 Subject: [PATCH 6/8] Making the test in EventLoggingListenerSuite less bloated --- .../scheduler/EventLoggingListener.scala | 3 +- .../scheduler/EventLoggingListenerSuite.scala | 28 ++++++------------- 2 files changed, 10 insertions(+), 21 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala b/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala index 4aa2c44fccca5..6a45edbb31a80 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala @@ -233,7 +233,8 @@ private[spark] class EventLoggingListener( } } - private def redactEvent(event: SparkListenerEnvironmentUpdate): SparkListenerEnvironmentUpdate = { + private[spark] def redactEvent(event: SparkListenerEnvironmentUpdate): + SparkListenerEnvironmentUpdate = { // "Spark Properties" entry will always exist because the map is always populated with it. val props = event .environmentDetails("Spark Properties") diff --git a/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala index 3331ba55c35f8..230e2c34d0d6c 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala @@ -96,27 +96,15 @@ class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext wit } test("Event logging with password redaction") { + val key = "spark.executorEnv.HADOOP_CREDSTORE_PASSWORD" val secretPassword = "secret_password" - val conf = getLoggingConf(testDirPath, None).set("spark.executorEnv.HADOOP_CREDSTORE_PASSWORD", - secretPassword) - sc = new SparkContext("local-cluster[2,2,1024]", "test", conf) - assert(sc.eventLogger.isDefined) - val eventLogger = sc.eventLogger.get - - sc.parallelize(1 to 10000).count() - sc.stop() - - val logData = EventLoggingListener.openEventLog(new Path(eventLogger.logPath), fileSystem) - val eventLog = Source.fromInputStream(logData).mkString - // Make sure nothing secret shows up anywhere - assert(!eventLog.contains(secretPassword), s"Secret password ($secretPassword) not redacted " + - s"from event logs:\n $eventLog") - val expected = """"spark.executorEnv.HADOOP_CREDSTORE_PASSWORD":"*********(redacted)"""" - // Make sure every occurrence of the property is accompanied by a redaction text. - val regex = """"spark.executorEnv.HADOOP_CREDSTORE_PASSWORD":"([^"]*)"""".r - val matches = regex.findAllIn(eventLog) - assert(matches.nonEmpty) - matches.foreach{ matched => assert(matched.equals(expected)) } + val conf = getLoggingConf(testDirPath, None) + .set(key, secretPassword) + val eventLogger = new EventLoggingListener("test", None, testDirPath.toUri(), conf) + val envDetails = SparkEnv.environmentDetails(conf, "FIFO", Seq.empty, Seq.empty) + val event = SparkListenerEnvironmentUpdate(envDetails) + val redactedProps = eventLogger.redactEvent(event).environmentDetails("Spark Properties").toMap + assert(redactedProps(key) == "*********(redacted)") } test("Log overwriting") { From 61a961cc9ac34d0f87afba2e2a72b6922272aaaf Mon Sep 17 00:00:00 2001 From: Mark Grover Date: Wed, 23 Nov 2016 15:14:38 -0800 Subject: [PATCH 7/8] More feedback --- .../org/apache/spark/scheduler/EventLoggingListener.scala | 8 +++----- core/src/main/scala/org/apache/spark/util/Utils.scala | 4 ++-- .../src/test/scala/org/apache/spark/util/UtilsSuite.scala | 4 ++-- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala b/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala index 6a45edbb31a80..f97363d21f78e 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala @@ -233,12 +233,10 @@ private[spark] class EventLoggingListener( } } - private[spark] def redactEvent(event: SparkListenerEnvironmentUpdate): - SparkListenerEnvironmentUpdate = { + private[spark] def redactEvent( + event: SparkListenerEnvironmentUpdate): SparkListenerEnvironmentUpdate = { // "Spark Properties" entry will always exist because the map is always populated with it. - val props = event - .environmentDetails("Spark Properties") - val redactedProps = Utils.redact(sparkConf, props) + val redactedProps = Utils.redact(sparkConf, event.environmentDetails("Spark Properties")) val redactedEnvironmentDetails = event.environmentDetails + ("Spark Properties" -> redactedProps) SparkListenerEnvironmentUpdate(redactedEnvironmentDetails) diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 249e0b3cb96e6..fa5c8a2c18d00 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -2562,9 +2562,9 @@ private[spark] object Utils extends Logging { val redactionPattern = conf.get(SECRET_REDACTION_PATTERN).r kvs.map { kv => redactionPattern.findFirstIn(kv._1) - .map{ ignore => (kv._1, REDACTION_REPLACEMENT_TEXT) } + .map { ignore => (kv._1, REDACTION_REPLACEMENT_TEXT) } .getOrElse(kv) - } + } } } diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index 3bdaab291a7c9..fb7b91222b499 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -991,7 +991,7 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging { val redactedConf = Utils.redact(sparkConf, sparkConf.getAll).toMap // Assert that secret information got redacted while the regular property remained the same - secretKeys.foreach { key => assert(redactedConf(key) == Utils.REDACTION_REPLACEMENT_TEXT) } - assert(redactedConf("spark.regular.property") == "not_a_secret") + secretKeys.foreach { key => assert(redactedConf(key) === Utils.REDACTION_REPLACEMENT_TEXT) } + assert(redactedConf("spark.regular.property") === "not_a_secret") } } From 49015acaf35a2e6b2491c57cd1c71d12ce5a8a64 Mon Sep 17 00:00:00 2001 From: Mark Grover Date: Wed, 23 Nov 2016 15:42:40 -0800 Subject: [PATCH 8/8] Function indentation at 4 --- .../scala/org/apache/spark/scheduler/EventLoggingListener.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala b/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala index f97363d21f78e..f39565edd2358 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala @@ -234,7 +234,7 @@ private[spark] class EventLoggingListener( } private[spark] def redactEvent( - event: SparkListenerEnvironmentUpdate): SparkListenerEnvironmentUpdate = { + event: SparkListenerEnvironmentUpdate): SparkListenerEnvironmentUpdate = { // "Spark Properties" entry will always exist because the map is always populated with it. val redactedProps = Utils.redact(sparkConf, event.environmentDetails("Spark Properties")) val redactedEnvironmentDetails = event.environmentDetails +