diff --git a/common/utils/src/main/resources/org/apache/spark/log4j2-defaults.properties b/common/utils/src/main/resources/org/apache/spark/log4j2-defaults.properties index 9be86b650d091..777c5f2b25915 100644 --- a/common/utils/src/main/resources/org/apache/spark/log4j2-defaults.properties +++ b/common/utils/src/main/resources/org/apache/spark/log4j2-defaults.properties @@ -22,8 +22,8 @@ rootLogger.appenderRef.stdout.ref = console appender.console.type = Console appender.console.name = console appender.console.target = SYSTEM_ERR -appender.console.layout.type = JsonTemplateLayout -appender.console.layout.eventTemplateUri = classpath:org/apache/spark/SparkLayout.json +appender.console.layout.type = PatternLayout +appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n%ex # Settings to quiet third party logs that are too verbose logger.jetty.name = org.sparkproject.jetty diff --git a/common/utils/src/main/resources/org/apache/spark/log4j2-pattern-layout-defaults.properties b/common/utils/src/main/resources/org/apache/spark/log4j2-json-layout.properties similarity index 94% rename from common/utils/src/main/resources/org/apache/spark/log4j2-pattern-layout-defaults.properties rename to common/utils/src/main/resources/org/apache/spark/log4j2-json-layout.properties index 777c5f2b25915..9be86b650d091 100644 --- a/common/utils/src/main/resources/org/apache/spark/log4j2-pattern-layout-defaults.properties +++ b/common/utils/src/main/resources/org/apache/spark/log4j2-json-layout.properties @@ -22,8 +22,8 @@ rootLogger.appenderRef.stdout.ref = console appender.console.type = Console appender.console.name = console appender.console.target = SYSTEM_ERR -appender.console.layout.type = PatternLayout -appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n%ex +appender.console.layout.type = JsonTemplateLayout +appender.console.layout.eventTemplateUri = classpath:org/apache/spark/SparkLayout.json # Settings to quiet third party logs that are too verbose 
logger.jetty.name = org.sparkproject.jetty diff --git a/common/utils/src/main/scala/org/apache/spark/internal/Logging.scala b/common/utils/src/main/scala/org/apache/spark/internal/Logging.scala index 7471b764bd2b3..4b60cb20f0732 100644 --- a/common/utils/src/main/scala/org/apache/spark/internal/Logging.scala +++ b/common/utils/src/main/scala/org/apache/spark/internal/Logging.scala @@ -337,9 +337,9 @@ trait Logging { if (Logging.defaultSparkLog4jConfig || Logging.islog4j2DefaultConfigured()) { Logging.defaultSparkLog4jConfig = true val defaultLogProps = if (Logging.isStructuredLoggingEnabled) { - "org/apache/spark/log4j2-defaults.properties" + "org/apache/spark/log4j2-json-layout.properties" } else { - "org/apache/spark/log4j2-pattern-layout-defaults.properties" + "org/apache/spark/log4j2-defaults.properties" } Option(SparkClassUtils.getSparkClassLoader.getResource(defaultLogProps)) match { case Some(url) => @@ -398,7 +398,7 @@ private[spark] object Logging { @volatile private var initialized = false @volatile private var defaultRootLevel: Level = null @volatile private var defaultSparkLog4jConfig = false - @volatile private var structuredLoggingEnabled = true + @volatile private var structuredLoggingEnabled = false @volatile private[spark] var sparkShellThresholdLevel: Level = null @volatile private[spark] var setLogLevelPrinted: Boolean = false diff --git a/common/utils/src/test/java/org/apache/spark/util/StructuredSparkLoggerSuite.java b/common/utils/src/test/java/org/apache/spark/util/StructuredSparkLoggerSuite.java index 6959fe11820ff..1fab167adfeb0 100644 --- a/common/utils/src/test/java/org/apache/spark/util/StructuredSparkLoggerSuite.java +++ b/common/utils/src/test/java/org/apache/spark/util/StructuredSparkLoggerSuite.java @@ -21,11 +21,27 @@ import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.logging.log4j.Level; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; + +import org.apache.spark.internal.Logging$; 
import org.apache.spark.internal.SparkLogger; import org.apache.spark.internal.SparkLoggerFactory; public class StructuredSparkLoggerSuite extends SparkLoggerSuiteBase { + // Enable Structured Logging before running the tests + @BeforeAll + public static void setup() { + Logging$.MODULE$.enableStructuredLogging(); + } + + // Disable Structured Logging after running the tests + @AfterAll + public static void teardown() { + Logging$.MODULE$.disableStructuredLogging(); + } + private static final SparkLogger LOGGER = SparkLoggerFactory.getLogger(StructuredSparkLoggerSuite.class); diff --git a/common/utils/src/test/scala/org/apache/spark/util/MDCSuite.scala b/common/utils/src/test/scala/org/apache/spark/util/MDCSuite.scala index 7631c25662219..9615eb2263636 100644 --- a/common/utils/src/test/scala/org/apache/spark/util/MDCSuite.scala +++ b/common/utils/src/test/scala/org/apache/spark/util/MDCSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.util import scala.jdk.CollectionConverters._ +import org.scalatest.BeforeAndAfterAll import org.scalatest.funsuite.AnyFunSuite // scalastyle:ignore funsuite import org.apache.spark.internal.{Logging, MDC} @@ -26,7 +27,16 @@ import org.apache.spark.internal.LogKeys.{EXIT_CODE, OFFSET, RANGE} class MDCSuite extends AnyFunSuite // scalastyle:ignore funsuite - with Logging { + with Logging + with BeforeAndAfterAll { + + override def beforeAll(): Unit = { + Logging.enableStructuredLogging() + } + + override def afterAll(): Unit = { + Logging.disableStructuredLogging() + } test("check MDC message") { val log = log"This is a log, exitcode ${MDC(EXIT_CODE, 10086)}" diff --git a/common/utils/src/test/scala/org/apache/spark/util/PatternLoggingSuite.scala b/common/utils/src/test/scala/org/apache/spark/util/PatternLoggingSuite.scala index 2ba2b15c49f33..248136798b362 100644 --- a/common/utils/src/test/scala/org/apache/spark/util/PatternLoggingSuite.scala +++ b/common/utils/src/test/scala/org/apache/spark/util/PatternLoggingSuite.scala @@ 
-17,19 +17,16 @@ package org.apache.spark.util import org.apache.logging.log4j.Level -import org.scalatest.BeforeAndAfterAll import org.apache.spark.internal.Logging -class PatternLoggingSuite extends LoggingSuiteBase with BeforeAndAfterAll { +class PatternLoggingSuite extends LoggingSuiteBase { override def className: String = classOf[PatternLoggingSuite].getSimpleName override def logFilePath: String = "target/pattern.log" override def beforeAll(): Unit = Logging.disableStructuredLogging() - override def afterAll(): Unit = Logging.enableStructuredLogging() - override def expectedPatternForBasicMsg(level: Level): String = { s""".*$level $className: This is a log message\n""" } diff --git a/common/utils/src/test/scala/org/apache/spark/util/StructuredLoggingSuite.scala b/common/utils/src/test/scala/org/apache/spark/util/StructuredLoggingSuite.scala index 48951c2084f17..0026b696f0695 100644 --- a/common/utils/src/test/scala/org/apache/spark/util/StructuredLoggingSuite.scala +++ b/common/utils/src/test/scala/org/apache/spark/util/StructuredLoggingSuite.scala @@ -23,14 +23,21 @@ import java.nio.file.Files import com.fasterxml.jackson.databind.ObjectMapper import com.fasterxml.jackson.module.scala.DefaultScalaModule import org.apache.logging.log4j.Level +import org.scalatest.BeforeAndAfterAll import org.scalatest.funsuite.AnyFunSuite // scalastyle:ignore funsuite import org.apache.spark.internal.{LogEntry, Logging, LogKey, LogKeys, MDC, MessageWithContext} trait LoggingSuiteBase extends AnyFunSuite // scalastyle:ignore funsuite + with BeforeAndAfterAll with Logging { + override def afterAll(): Unit = { + super.afterAll() + Logging.disableStructuredLogging() + } + def className: String def logFilePath: String @@ -202,7 +209,7 @@ trait LoggingSuiteBase } } - private val customLog = log"${MDC(CustomLogKeys.CUSTOM_LOG_KEY, "Custom log message.")}" + private lazy val customLog = log"${MDC(CustomLogKeys.CUSTOM_LOG_KEY, "Custom log message.")}" test("Logging with custom 
LogKey") { Seq( (Level.ERROR, () => logError(customLog)), @@ -265,6 +272,13 @@ class StructuredLoggingSuite extends LoggingSuiteBase { override def className: String = classOf[StructuredLoggingSuite].getSimpleName override def logFilePath: String = "target/structured.log" + override def beforeAll(): Unit = { + super.beforeAll() + Logging.enableStructuredLogging() + } + + override def afterAll(): Unit = super.afterAll() + private val jsonMapper = new ObjectMapper().registerModule(DefaultScalaModule) private def compactAndToRegexPattern(json: String): String = { jsonMapper.readTree(json).toString. diff --git a/conf/log4j2.properties.pattern-layout-template b/conf/log4j2-json-layout.properties.template similarity index 80% rename from conf/log4j2.properties.pattern-layout-template rename to conf/log4j2-json-layout.properties.template index 011fca58c9b2a..76499bb6691e7 100644 --- a/conf/log4j2.properties.pattern-layout-template +++ b/conf/log4j2-json-layout.properties.template @@ -19,17 +19,11 @@ rootLogger.level = info rootLogger.appenderRef.stdout.ref = console -# In the pattern layout configuration below, we specify an explicit `%ex` conversion -# pattern for logging Throwables. If this was omitted, then (by default) Log4J would -# implicitly add an `%xEx` conversion pattern which logs stacktraces with additional -# class packaging information. That extra information can sometimes add a substantial -# performance overhead, so we disable it in our default logging config. -# For more information, see SPARK-39361. appender.console.type = Console appender.console.name = console appender.console.target = SYSTEM_ERR -appender.console.layout.type = PatternLayout -appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n%ex +appender.console.layout.type = JsonTemplateLayout +appender.console.layout.eventTemplateUri = classpath:org/apache/spark/SparkLayout.json # Set the default spark-shell/spark-sql log level to WARN. 
When running the # spark-shell/spark-sql, the log level for these classes is used to overwrite diff --git a/conf/log4j2.properties.template b/conf/log4j2.properties.template index 76499bb6691e7..011fca58c9b2a 100644 --- a/conf/log4j2.properties.template +++ b/conf/log4j2.properties.template @@ -19,11 +19,17 @@ rootLogger.level = info rootLogger.appenderRef.stdout.ref = console +# In the pattern layout configuration below, we specify an explicit `%ex` conversion +# pattern for logging Throwables. If this was omitted, then (by default) Log4J would +# implicitly add an `%xEx` conversion pattern which logs stacktraces with additional +# class packaging information. That extra information can sometimes add a substantial +# performance overhead, so we disable it in our default logging config. +# For more information, see SPARK-39361. appender.console.type = Console appender.console.name = console appender.console.target = SYSTEM_ERR -appender.console.layout.type = JsonTemplateLayout -appender.console.layout.eventTemplateUri = classpath:org/apache/spark/SparkLayout.json +appender.console.layout.type = PatternLayout +appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n%ex # Set the default spark-shell/spark-sql log level to WARN. 
When running the # spark-shell/spark-sql, the log level for these classes is used to overwrite diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index 324ef701c4266..295f7f96158a2 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -162,7 +162,7 @@ package object config { "PySpark shell.") .version("4.0.0") .booleanConf - .createWithDefault(true) + .createWithDefault(false) private[spark] val LEGACY_TASK_NAME_MDC_ENABLED = ConfigBuilder("spark.log.legacyTaskNameMdc.enabled") diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index e7b65bf1a4eff..536c6b4447aac 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -2689,7 +2689,7 @@ private[spark] object Utils * loading SparkConf. */ def resetStructuredLogging(sparkConf: SparkConf): Unit = { - if (sparkConf.getBoolean(STRUCTURED_LOGGING_ENABLED.key, defaultValue = true)) { + if (sparkConf.get(STRUCTURED_LOGGING_ENABLED)) { Logging.enableStructuredLogging() } else { Logging.disableStructuredLogging() diff --git a/docs/configuration.md b/docs/configuration.md index 6957ca9a03d23..301ffb0ec229c 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -3753,15 +3753,20 @@ Note: When running Spark on YARN in `cluster` mode, environment variables need t # Configuring Logging -Spark uses [log4j](http://logging.apache.org/log4j/) for logging. You can configure it by adding a -`log4j2.properties` file in the `conf` directory. One way to start is to copy the existing templates `log4j2.properties.template` or `log4j2.properties.pattern-layout-template` located there. +Spark uses [log4j](http://logging.apache.org/log4j/) for logging. 
You can configure it by adding a `log4j2.properties` file in the `conf` directory. To get started, copy one of the provided templates: `log4j2.properties.template` (for plain text logging) or `log4j2-json-layout.properties.template` (for structured logging). + +## Plain Text Logging +The default logging format is plain text, using Log4j's [Pattern Layout](https://logging.apache.org/log4j/2.x/manual/pattern-layout.html). + +MDC (Mapped Diagnostic Context) information is not included by default in plain text logs. To include it, update the `PatternLayout` configuration in the `log4j2.properties` file. For example, add `%X{task_name}` to include the task name in logs. Additionally, use `spark.sparkContext.setLocalProperty(s"mdc.$name", "value")` to add custom data to the MDC; the key in the MDC will be the string `mdc.$name`. ## Structured Logging -Starting from version 4.0.0, `spark-submit` has adopted the [JSON Template Layout](https://logging.apache.org/log4j/2.x/manual/json-template-layout.html) for logging, which outputs logs in JSON format. This format facilitates querying logs using Spark SQL with the JSON data source. Additionally, the logs include all Mapped Diagnostic Context (MDC) information for search and debugging purposes. +Starting with version 4.0.0, `spark-submit` supports optional structured logging using the [JSON Template Layout](https://logging.apache.org/log4j/2.x/manual/json-template-layout.html). This format enables efficient querying of logs with Spark SQL using the JSON data source and includes all MDC information for improved searchability and debugging. -To configure the layout of structured logging, start with the `log4j2.properties.template` file. +To enable structured logging and include MDC information, set the configuration `spark.log.structuredLogging.enabled` to `true` (default is `false`). For additional customization, copy `log4j2-json-layout.properties.template` to `conf/log4j2.properties` and adjust as needed. 
-To query Spark logs using Spark SQL, you can use the following code snippets: +### Querying Structured Logs with Spark SQL +To query structured logs in JSON format, use the following code snippet: **Python:** ```python @@ -3777,14 +3782,6 @@ import org.apache.spark.util.LogUtils.SPARK_LOG_SCHEMA val logDf = spark.read.schema(SPARK_LOG_SCHEMA).json("path/to/logs") ``` **Note**: If you're using the interactive shell (pyspark shell or spark-shell), you can omit the import statement in the code because SPARK_LOG_SCHEMA is already available in the shell's context. -## Plain Text Logging -If you prefer plain text logging, you have two options: -- Disable structured JSON logging by setting the Spark configuration `spark.log.structuredLogging.enabled` to `false`. -- Use a custom log4j configuration file. Rename `conf/log4j2.properties.pattern-layout-template` to `conf/log4j2.properties`. This reverts to the default configuration prior to Spark 4.0, which utilizes [PatternLayout](https://logging.apache.org/log4j/2.x/manual/layouts.html#PatternLayout) for logging all messages in plain text. - -MDC information is not included by default when with plain text logging. In order to print it in the logs, you can update the patternLayout in the file. For example, you can add `%X{task_name}` to print the task name in the logs. -Moreover, you can use `spark.sparkContext.setLocalProperty(s"mdc.$name", "value")` to add user specific data into MDC. -The key in MDC will be the string of `mdc.$name`. # Overriding configuration directory diff --git a/docs/core-migration-guide.md b/docs/core-migration-guide.md index 49737392312a7..9dcf4ad8a2984 100644 --- a/docs/core-migration-guide.md +++ b/docs/core-migration-guide.md @@ -44,10 +44,6 @@ license: | - Since Spark 4.0, Spark uses the external shuffle service for deleting shuffle blocks for deallocated executors when the shuffle is no longer needed. To restore the legacy behavior, you can set `spark.shuffle.service.removeShuffle` to `false`. 
-- Starting with Spark 4.0, the default logging format for `spark-submit` has changed from plain text to JSON lines to improve log analysis. If you prefer plain text logs, you have two options: - - Set the Spark configuration `spark.log.structuredLogging.enabled` to `false`. For example, you can use `JDK_JAVA_OPTIONS=-Dspark.log.structuredLogging.enabled=false`. - - Use a custom log4j configuration file, such as renaming the template file `conf/log4j2.properties.pattern-layout-template` to `conf/log4j2.properties`. - - Since Spark 4.0, the MDC (Mapped Diagnostic Context) key for Spark task names in Spark logs has been changed from `mdc.taskName` to `task_name`. To use the key `mdc.taskName`, you can set `spark.log.legacyTaskNameMdc.enabled` to `true`. - Since Spark 4.0, Spark performs speculative executions less aggressively with `spark.speculation.multiplier=3` and `spark.speculation.quantile=0.9`. To restore the legacy behavior, you can set `spark.speculation.multiplier=1.5` and `spark.speculation.quantile=0.75`. diff --git a/sql/core/src/test/scala/org/apache/spark/sql/LogQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/LogQuerySuite.scala index 873337e7a4242..861b0bf0f3945 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/LogQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/LogQuerySuite.scala @@ -33,12 +33,18 @@ class LogQuerySuite extends QueryTest with SharedSparkSession with Logging { new File(pwd + "/target/LogQuerySuite.log") } + override def beforeAll(): Unit = { + super.beforeAll() + Logging.enableStructuredLogging() + } + override def afterAll(): Unit = { super.afterAll() // Clear the log file if (logFile.exists()) { logFile.delete() } + Logging.disableStructuredLogging() } private def createTempView(viewName: String): Unit = {