From 6ef59e6067742e8826741af8f49ca8ac3319273a Mon Sep 17 00:00:00 2001
From: Amanda Liu <amanda.liu@databricks.com>
Date: Fri, 5 Jul 2024 11:42:13 -0700
Subject: [PATCH 01/28] add structured logging style script and github workflow

---
 .github/workflows/build_and_test.yml |  2 +
 dev/structured-logging.py            | 64 ++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+)
 create mode 100644 dev/structured-logging.py

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 9dc9d85520c2c..6f290ec4c74ef 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -702,6 +702,8 @@ jobs:
       run: ./dev/mima
     - name: Scala linter
       run: ./dev/lint-scala
+    - name: Scala structured logging check
+      run: ./dev/structured-logging.py
     - name: Java linter
       run: ./dev/lint-java
     - name: Spark connect jvm client mima check
diff --git a/dev/structured-logging.py b/dev/structured-logging.py
new file mode 100644
index 0000000000000..69a05585b677c
--- /dev/null
+++ b/dev/structured-logging.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python3
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import os
+import sys
+import re
+import glob
+
+from sparktestsupport import SPARK_HOME
+
+def main():
+    nonmigrated_pattern = r'log(Info|Warning|Error)\((?:".*"\.format\(.*\)|s".*(?:\$|\+\s*[^\s"]).*"\))'
+    whitelist_file = 'dev/structured-logging-whitelist.txt'
+
+    nonmigrated_files = []
+    whitelist_files = set()
+
+    with open(whitelist_file, 'r') as wf:
+        whitelist_patterns = [line.strip() for line in wf if line.strip()]
+
+    scala_files = glob.glob(os.path.join(SPARK_HOME, '**', '*.scala'), recursive=True)
+
+    for file in scala_files:
+        for whitelist_pattern in whitelist_patterns:
+            if re.search(whitelist_pattern, file):
+                whitelist_files.add(file)
+                break
+
+    for file in scala_files:
+        if file not in whitelist_files:
+            with open(file, 'r') as f:
+                content = f.read().replace('\t', '').strip()
+                if re.search(nonmigrated_pattern, content):
+                    nonmigrated_files.append(file)
+
+    if not nonmigrated_files:
+        print("Structured logging style check passed.")
+        sys.exit(0)
+    else:
+        print("Structured logging style check failed for the following:")
+        for file in nonmigrated_files:
+            print(file)
+
+        sys.exit(-1)
+
+
+if __name__ == "__main__":
+    main()

From d640b944aa2300f2c662b484b130b90f3b22fec4 Mon Sep 17 00:00:00 2001
From: Amanda Liu <amanda.liu@databricks.com>
Date: Fri, 5 Jul 2024 13:50:48 -0700
Subject: [PATCH 02/28] improve error message

---
 dev/structured-logging.py | 39 ++++++++++++++++++++++-----------------
 1 file changed, 22 insertions(+), 17 deletions(-)

diff --git a/dev/structured-logging.py b/dev/structured-logging.py
index 69a05585b677c..485634d95d4dd 100644
--- a/dev/structured-logging.py
+++ b/dev/structured-logging.py
@@ -25,37 +25,42 @@
 from sparktestsupport import SPARK_HOME
 
 def main():
-    nonmigrated_pattern = r'log(Info|Warning|Error)\((?:".*"\.format\(.*\)|s".*(?:\$|\+\s*[^\s"]).*"\))'
-    whitelist_file = 'dev/structured-logging-whitelist.txt'
+    nonmigrated_pattern = r'log(?:Info|Warning|Error)\((?:".*"\.format\(.*\)|s".*(?:\$|\+\s*[^\s"]).*"\))'
+    excluded_file_patterns = [
+        '[Tt]est',
+        './sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala'
+        './sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala'
+    ]
 
-    nonmigrated_files = []
-    whitelist_files = set()
-
-    with open(whitelist_file, 'r') as wf:
-        whitelist_patterns = [line.strip() for line in wf if line.strip()]
+    nonmigrated_files = {}
+    excluded_files = set()
 
     scala_files = glob.glob(os.path.join(SPARK_HOME, '**', '*.scala'), recursive=True)
 
     for file in scala_files:
-        for whitelist_pattern in whitelist_patterns:
-            if re.search(whitelist_pattern, file):
-                whitelist_files.add(file)
+        for exclude_pattern in excluded_file_patterns:
+            if re.search(exclude_pattern, file):
+                excluded_files.add(file)
                 break
 
     for file in scala_files:
-        if file not in whitelist_files:
+        if file not in excluded_files:
             with open(file, 'r') as f:
-                content = f.read().replace('\t', '').strip()
-                if re.search(nonmigrated_pattern, content):
-                    nonmigrated_files.append(file)
+                lines = f.readlines()
+                for line_number, line in enumerate(lines, start=1):
+                    if re.search(nonmigrated_pattern, line):
+                        if file not in nonmigrated_files:
+                            nonmigrated_files[file] = []
+                        nonmigrated_files[file].append((line_number, line))
 
     if not nonmigrated_files:
         print("Structured logging style check passed.")
         sys.exit(0)
     else:
-        print("Structured logging style check failed for the following:")
-        for file in nonmigrated_files:
-            print(file)
+        for file_path, issues in nonmigrated_files.items():
+            print(f"[error] Structured logging style check failed for the file '{file}' at:")
+            for line_number, code in issues:
+                print(f"Line: {line_number} code: {code}")
 
         sys.exit(-1)
 

From b86222834368fe1997849413b41aaeea5f447e3d Mon Sep 17 00:00:00 2001
From: Amanda Liu <amanda.liu@databricks.com>
Date: Fri, 5 Jul 2024 14:23:45 -0700
Subject: [PATCH 03/28] match scala build error message

---
 dev/structured-logging.py | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/dev/structured-logging.py b/dev/structured-logging.py
index 485634d95d4dd..188dc3575f14e 100644
--- a/dev/structured-logging.py
+++ b/dev/structured-logging.py
@@ -26,10 +26,12 @@
 
 def main():
     nonmigrated_pattern = r'log(?:Info|Warning|Error)\((?:".*"\.format\(.*\)|s".*(?:\$|\+\s*[^\s"]).*"\))'
+
+    # Regex patterns for file paths to exclude from the Structured Logging style check
     excluded_file_patterns = [
         '[Tt]est',
-        './sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala'
-        './sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala'
+        '/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala',
+        '/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala'
     ]
 
     nonmigrated_files = {}
@@ -41,29 +43,31 @@ def main():
         for exclude_pattern in excluded_file_patterns:
             if re.search(exclude_pattern, file):
                 excluded_files.add(file)
-                break
 
     for file in scala_files:
         if file not in excluded_files:
             with open(file, 'r') as f:
                 lines = f.readlines()
                 for line_number, line in enumerate(lines, start=1):
-                    if re.search(nonmigrated_pattern, line):
+                    matches = list(re.finditer(nonmigrated_pattern, line))
+                    if matches:
                         if file not in nonmigrated_files:
                             nonmigrated_files[file] = []
-                        nonmigrated_files[file].append((line_number, line))
+                        for match in matches:
+                            start_char = match.start()
+                            nonmigrated_files[file].append((line_number, start_char))
 
     if not nonmigrated_files:
         print("Structured logging style check passed.")
         sys.exit(0)
     else:
         for file_path, issues in nonmigrated_files.items():
-            print(f"[error] Structured logging style check failed for the file '{file}' at:")
-            for line_number, code in issues:
-                print(f"Line: {line_number} code: {code}")
+            for line_number, start_char in issues:
+                print(f"[error] {file_path}:{line_number}:{start_char}")
+                print("""[error]\t\tLogging message should use log"..." instead of s"..." and variables should be wrapped in `MDC`s. 
+                Refer to Structured Logging Framework guidelines in the file `internal/Logging.scala`.""")
 
         sys.exit(-1)
 
-
 if __name__ == "__main__":
     main()

From 42de4c5f3b7304fc2df61ae29c3c553f54221417 Mon Sep 17 00:00:00 2001
From: Amanda Liu <amanda.liu@databricks.com>
Date: Fri, 5 Jul 2024 17:09:51 -0700
Subject: [PATCH 04/28] account for line breaks in regex match

---
 ...logging.py => structured-logging-style.py} | 31 ++++++++++---------
 1 file changed, 17 insertions(+), 14 deletions(-)
 rename dev/{structured-logging.py => structured-logging-style.py} (74%)

diff --git a/dev/structured-logging.py b/dev/structured-logging-style.py
similarity index 74%
rename from dev/structured-logging.py
rename to dev/structured-logging-style.py
index 188dc3575f14e..6edda5ef1d64d 100644
--- a/dev/structured-logging.py
+++ b/dev/structured-logging-style.py
@@ -25,7 +25,8 @@
 from sparktestsupport import SPARK_HOME
 
 def main():
-    nonmigrated_pattern = r'log(?:Info|Warning|Error)\((?:".*"\.format\(.*\)|s".*(?:\$|\+\s*[^\s"]).*"\))'
+    nonmigrated_pattern = r'log(?:Info|Warning|Error)\((?:".*"\.format\(.*\)|(?:s|\\)".*[\n\r].*(\$|\+).*\))'
+    pattern = re.compile(nonmigrated_pattern)
 
     # Regex patterns for file paths to exclude from the Structured Logging style check
     excluded_file_patterns = [
@@ -35,27 +36,29 @@ def main():
     ]
 
     nonmigrated_files = {}
-    excluded_files = set()
 
     scala_files = glob.glob(os.path.join(SPARK_HOME, '**', '*.scala'), recursive=True)
 
     for file in scala_files:
+        skip_file = False
         for exclude_pattern in excluded_file_patterns:
             if re.search(exclude_pattern, file):
-                excluded_files.add(file)
+                skip_file = True
+                break
 
-    for file in scala_files:
-        if file not in excluded_files:
+        if not skip_file:
             with open(file, 'r') as f:
-                lines = f.readlines()
-                for line_number, line in enumerate(lines, start=1):
-                    matches = list(re.finditer(nonmigrated_pattern, line))
-                    if matches:
-                        if file not in nonmigrated_files:
-                            nonmigrated_files[file] = []
-                        for match in matches:
-                            start_char = match.start()
-                            nonmigrated_files[file].append((line_number, start_char))
+                content = f.read()
+                matches = list(pattern.finditer(content))
+
+                if matches:
+                    nonmigrated_files[file] = []
+                    for match in matches:
+                        start_pos = match.start()
+                        preceding_content = content[:start_pos]
+                        line_number = preceding_content.count('\n') + 1
+                        start_char = start_pos - preceding_content.rfind('\n') - 1
+                        nonmigrated_files[file].append((line_number, start_char))
 
     if not nonmigrated_files:
         print("Structured logging style check passed.")

From 95e86cc1744a294baefef5bf856e8c221ae2ecbf Mon Sep 17 00:00:00 2001
From: Amanda Liu <amanda.liu@databricks.com>
Date: Fri, 5 Jul 2024 20:18:20 -0700
Subject: [PATCH 05/28] fix script name

---
 .github/workflows/build_and_test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 6f290ec4c74ef..3d3acbcb52fb3 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -703,7 +703,7 @@ jobs:
     - name: Scala linter
       run: ./dev/lint-scala
     - name: Scala structured logging check
-      run: ./dev/structured-logging.py
+      run: ./dev/structured-logging-style.py
     - name: Java linter
       run: ./dev/lint-java
     - name: Spark connect jvm client mima check

From 667bc58431a9fefad78cf51c1b0b79b141c08a95 Mon Sep 17 00:00:00 2001
From: Amanda Liu <amanda.liu@databricks.com>
Date: Sat, 6 Jul 2024 08:32:55 -0700
Subject: [PATCH 06/28] update regex

---
 dev/structured-logging-style.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
 mode change 100644 => 100755 dev/structured-logging-style.py

diff --git a/dev/structured-logging-style.py b/dev/structured-logging-style.py
old mode 100644
new mode 100755
index 6edda5ef1d64d..c9bd32120c8c9
--- a/dev/structured-logging-style.py
+++ b/dev/structured-logging-style.py
@@ -25,7 +25,7 @@
 from sparktestsupport import SPARK_HOME
 
 def main():
-    nonmigrated_pattern = r'log(?:Info|Warning|Error)\((?:".*"\.format\(.*\)|(?:s|\\)".*[\n\r].*(\$|\+).*\))'
+    nonmigrated_pattern = r'log(?:Info|Warning|Error)\((?:".*"\.format\(.*\)|(?:s|\\)".*[\n\r]*(\$|\+).*\))'
     pattern = re.compile(nonmigrated_pattern)
 
     # Regex patterns for file paths to exclude from the Structured Logging style check

From 58e763dc2f8f4145d8bde6acd2c6c42af7554482 Mon Sep 17 00:00:00 2001
From: Amanda Liu <amanda.liu@databricks.com>
Date: Mon, 8 Jul 2024 10:11:25 -0700
Subject: [PATCH 07/28] separate mono regex

---
 dev/structured-logging-style.py | 56 ++++++++++++++++++++++++---------
 1 file changed, 42 insertions(+), 14 deletions(-)

diff --git a/dev/structured-logging-style.py b/dev/structured-logging-style.py
index c9bd32120c8c9..1eaf5013b9e8b 100755
--- a/dev/structured-logging-style.py
+++ b/dev/structured-logging-style.py
@@ -25,8 +25,9 @@
 from sparktestsupport import SPARK_HOME
 
 def main():
-    nonmigrated_pattern = r'log(?:Info|Warning|Error)\((?:".*"\.format\(.*\)|(?:s|\\)".*[\n\r]*(\$|\+).*\))'
-    pattern = re.compile(nonmigrated_pattern)
+    log_pattern = r'log(?:Info|Warning|Error)\((.*?)\)'
+    inner_log_pattern = r'".*?"\.format\(.*\)|s?".*?(?:\$|\+(?!.*?[ |\t].*s?")).*'
+    compiled_inner_log_pattern = re.compile(inner_log_pattern, flags=re.DOTALL)
 
     # Regex patterns for file paths to exclude from the Structured Logging style check
     excluded_file_patterns = [
@@ -49,26 +50,53 @@ def main():
         if not skip_file:
             with open(file, 'r') as f:
                 content = f.read()
-                matches = list(pattern.finditer(content))
 
-                if matches:
+                log_statements = re.finditer(log_pattern, content, re.DOTALL)
+                # log_statements = [statement.group(1).strip() for statement in log_statements]
+
+                if log_statements:
                     nonmigrated_files[file] = []
-                    for match in matches:
-                        start_pos = match.start()
-                        preceding_content = content[:start_pos]
-                        line_number = preceding_content.count('\n') + 1
-                        start_char = start_pos - preceding_content.rfind('\n') - 1
-                        nonmigrated_files[file].append((line_number, start_char))
+                    for log_statement in log_statements:
+                        if compiled_inner_log_pattern.fullmatch(log_statement.group(1)):
+                            start_pos = log_statement.start()
+                            preceding_content = content[:start_pos]
+                            line_number = preceding_content.count('\n') + 1
+                            start_char = start_pos - preceding_content.rfind('\n') - 1
+                            nonmigrated_files[file].append((line_number, start_char, log_statement.group(1)))
+
+                # for log_statement in log_statements:
+                #     if compiled_inner_log_pattern.search(log_statement):
+                #         nonmigrated_files[file].append()
+
+
+                # matches = list(pattern.finditer(content))
+                # matches2 = pattern.findall(content)
+                #
+                # for m in matches2:
+                #     print(f"****** ${m}")
+                #
+                # if matches:
+                #     nonmigrated_files[file] = []
+                #     for match in matches:
+                #         start_pos = match.start()
+                #         preceding_content = content[:start_pos]
+                #         line_number = preceding_content.count('\n') + 1
+                #         start_char = start_pos - preceding_content.rfind('\n') - 1
+                #         nonmigrated_files[file].append((line_number, start_char))
 
     if not nonmigrated_files:
         print("Structured logging style check passed.")
         sys.exit(0)
     else:
         for file_path, issues in nonmigrated_files.items():
-            for line_number, start_char in issues:
-                print(f"[error] {file_path}:{line_number}:{start_char}")
-                print("""[error]\t\tLogging message should use log"..." instead of s"..." and variables should be wrapped in `MDC`s. 
-                Refer to Structured Logging Framework guidelines in the file `internal/Logging.scala`.""")
+            if issues:
+                print(file_path)
+        # for file_path, issues in nonmigrated_files.items():
+        #     for line_number, start_char in issues:
+        #         pass
+        #         print(f"[error] {file_path}:{line_number}:{start_char}")
+        #         print("""[error]\t\tLogging message should use log"..." instead of s"..." and variables should be wrapped in `MDC`s.
+        #         Refer to Structured Logging Framework guidelines in the file `internal/Logging.scala`.""")
 
         sys.exit(-1)
 

From 0f26de8d7d589c65ed2c3f0fa9c50fa153414a7d Mon Sep 17 00:00:00 2001
From: Amanda Liu <amanda.liu@databricks.com>
Date: Mon, 8 Jul 2024 11:54:30 -0700
Subject: [PATCH 08/28] add java files check

---
 dev/structured-logging-style.py | 41 ++++++++-------------------------
 1 file changed, 9 insertions(+), 32 deletions(-)

diff --git a/dev/structured-logging-style.py b/dev/structured-logging-style.py
index 1eaf5013b9e8b..18db71d89c852 100755
--- a/dev/structured-logging-style.py
+++ b/dev/structured-logging-style.py
@@ -38,9 +38,10 @@ def main():
 
     nonmigrated_files = {}
 
-    scala_files = glob.glob(os.path.join(SPARK_HOME, '**', '*.scala'), recursive=True)
+    files = [file for ext in ('*.scala', '*.java')
+             for file in glob.glob(os.path.join(SPARK_HOME, '**', ext), recursive=True)]
 
-    for file in scala_files:
+    for file in files:
         skip_file = False
         for exclude_pattern in excluded_file_patterns:
             if re.search(exclude_pattern, file):
@@ -52,7 +53,6 @@ def main():
                 content = f.read()
 
                 log_statements = re.finditer(log_pattern, content, re.DOTALL)
-                # log_statements = [statement.group(1).strip() for statement in log_statements]
 
                 if log_statements:
                     nonmigrated_files[file] = []
@@ -62,41 +62,18 @@ def main():
                             preceding_content = content[:start_pos]
                             line_number = preceding_content.count('\n') + 1
                             start_char = start_pos - preceding_content.rfind('\n') - 1
-                            nonmigrated_files[file].append((line_number, start_char, log_statement.group(1)))
-
-                # for log_statement in log_statements:
-                #     if compiled_inner_log_pattern.search(log_statement):
-                #         nonmigrated_files[file].append()
-
-
-                # matches = list(pattern.finditer(content))
-                # matches2 = pattern.findall(content)
-                #
-                # for m in matches2:
-                #     print(f"****** ${m}")
-                #
-                # if matches:
-                #     nonmigrated_files[file] = []
-                #     for match in matches:
-                #         start_pos = match.start()
-                #         preceding_content = content[:start_pos]
-                #         line_number = preceding_content.count('\n') + 1
-                #         start_char = start_pos - preceding_content.rfind('\n') - 1
-                #         nonmigrated_files[file].append((line_number, start_char))
+                            nonmigrated_files[file].append((line_number, start_char))
 
     if not nonmigrated_files:
         print("Structured logging style check passed.")
         sys.exit(0)
     else:
         for file_path, issues in nonmigrated_files.items():
-            if issues:
-                print(file_path)
-        # for file_path, issues in nonmigrated_files.items():
-        #     for line_number, start_char in issues:
-        #         pass
-        #         print(f"[error] {file_path}:{line_number}:{start_char}")
-        #         print("""[error]\t\tLogging message should use log"..." instead of s"..." and variables should be wrapped in `MDC`s.
-        #         Refer to Structured Logging Framework guidelines in the file `internal/Logging.scala`.""")
+            for line_number, start_char in issues:
+                pass
+                print(f"[error] {file_path}:{line_number}:{start_char}")
+                print("""[error]\t\tLogging message should use log"..." instead of s"..." and variables should be wrapped in `MDC`s.
+                Refer to Structured Logging Framework guidelines in the file `internal/Logging.scala`.""")
 
         sys.exit(-1)
 

From 89596889a792385103f00f87a196048d39cab8b5 Mon Sep 17 00:00:00 2001
From: Amanda Liu <amanda.liu@databricks.com>
Date: Mon, 8 Jul 2024 12:06:52 -0700
Subject: [PATCH 09/28] reformat python

---
 dev/structured-logging-style.py | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/dev/structured-logging-style.py b/dev/structured-logging-style.py
index 18db71d89c852..7138f7795b4b5 100755
--- a/dev/structured-logging-style.py
+++ b/dev/structured-logging-style.py
@@ -24,22 +24,26 @@
 
 from sparktestsupport import SPARK_HOME
 
+
 def main():
-    log_pattern = r'log(?:Info|Warning|Error)\((.*?)\)'
+    log_pattern = r"log(?:Info|Warning|Error)\((.*?)\)"
     inner_log_pattern = r'".*?"\.format\(.*\)|s?".*?(?:\$|\+(?!.*?[ |\t].*s?")).*'
     compiled_inner_log_pattern = re.compile(inner_log_pattern, flags=re.DOTALL)
 
     # Regex patterns for file paths to exclude from the Structured Logging style check
     excluded_file_patterns = [
-        '[Tt]est',
-        '/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala',
-        '/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala'
+        "[Tt]est",
+        "/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala",
+        "/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala",
     ]
 
     nonmigrated_files = {}
 
-    files = [file for ext in ('*.scala', '*.java')
-             for file in glob.glob(os.path.join(SPARK_HOME, '**', ext), recursive=True)]
+    files = [
+        file
+        for ext in ("*.scala", "*.java")
+        for file in glob.glob(os.path.join(SPARK_HOME, "**", ext), recursive=True)
+    ]
 
     for file in files:
         skip_file = False
@@ -49,7 +53,7 @@ def main():
                 break
 
         if not skip_file:
-            with open(file, 'r') as f:
+            with open(file, "r") as f:
                 content = f.read()
 
                 log_statements = re.finditer(log_pattern, content, re.DOTALL)
@@ -60,8 +64,8 @@ def main():
                         if compiled_inner_log_pattern.fullmatch(log_statement.group(1)):
                             start_pos = log_statement.start()
                             preceding_content = content[:start_pos]
-                            line_number = preceding_content.count('\n') + 1
-                            start_char = start_pos - preceding_content.rfind('\n') - 1
+                            line_number = preceding_content.count("\n") + 1
+                            start_char = start_pos - preceding_content.rfind("\n") - 1
                             nonmigrated_files[file].append((line_number, start_char))
 
     if not nonmigrated_files:
@@ -72,10 +76,13 @@ def main():
             for line_number, start_char in issues:
                 pass
                 print(f"[error] {file_path}:{line_number}:{start_char}")
-                print("""[error]\t\tLogging message should use log"..." instead of s"..." and variables should be wrapped in `MDC`s.
-                Refer to Structured Logging Framework guidelines in the file `internal/Logging.scala`.""")
+                print(
+                    """[error]\t\tLogging message should use log"..." instead of s"..." and variables should be wrapped in `MDC`s.
+                Refer to Structured Logging Framework guidelines in the file `internal/Logging.scala`."""
+                )
 
         sys.exit(-1)
 
+
 if __name__ == "__main__":
     main()

From 0bd4e8aa85a3f983fac98662774ca9d5aebcdb5f Mon Sep 17 00:00:00 2001
From: Amanda Liu <amanda.liu@databricks.com>
Date: Mon, 8 Jul 2024 15:18:43 -0700
Subject: [PATCH 10/28] check only scala files

---
 dev/structured-logging-style.py | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/dev/structured-logging-style.py b/dev/structured-logging-style.py
index 7138f7795b4b5..573fb2367556e 100755
--- a/dev/structured-logging-style.py
+++ b/dev/structured-logging-style.py
@@ -33,19 +33,14 @@ def main():
     # Regex patterns for file paths to exclude from the Structured Logging style check
     excluded_file_patterns = [
         "[Tt]est",
-        "/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala",
         "/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala",
     ]
 
     nonmigrated_files = {}
 
-    files = [
-        file
-        for ext in ("*.scala", "*.java")
-        for file in glob.glob(os.path.join(SPARK_HOME, "**", ext), recursive=True)
-    ]
+    scala_files = glob.glob(os.path.join(SPARK_HOME, '**', '*.scala'), recursive=True)
 
-    for file in files:
+    for file in scala_files:
         skip_file = False
         for exclude_pattern in excluded_file_patterns:
             if re.search(exclude_pattern, file):

From c8b1b2776c63e362854e45c93dfe6ecd97cac19f Mon Sep 17 00:00:00 2001
From: Amanda Liu <amanda.liu@databricks.com>
Date: Mon, 8 Jul 2024 15:28:13 -0700
Subject: [PATCH 11/28] add CodeGenerator to exclude list due to large code
 formatter variable

---
 dev/structured-logging-style.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/dev/structured-logging-style.py b/dev/structured-logging-style.py
index 573fb2367556e..6ebdacef7329c 100755
--- a/dev/structured-logging-style.py
+++ b/dev/structured-logging-style.py
@@ -33,12 +33,13 @@ def main():
     # Regex patterns for file paths to exclude from the Structured Logging style check
     excluded_file_patterns = [
         "[Tt]est",
+        "/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala",
         "/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala",
     ]
 
     nonmigrated_files = {}
 
-    scala_files = glob.glob(os.path.join(SPARK_HOME, '**', '*.scala'), recursive=True)
+    scala_files = glob.glob(os.path.join(SPARK_HOME, "**", "*.scala"), recursive=True)
 
     for file in scala_files:
         skip_file = False

From ae67d94686880935d6537beaeec361a9958d1831 Mon Sep 17 00:00:00 2001
From: Amanda Liu <amanda.liu@databricks.com>
Date: Tue, 9 Jul 2024 05:41:34 -0700
Subject: [PATCH 12/28] check if script exists

---
 .github/workflows/build_and_test.yml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 62c76cd27794c..0420570ebb918 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -694,7 +694,10 @@ jobs:
     - name: Scala linter
       run: ./dev/lint-scala
     - name: Scala structured logging check
-      run: ./dev/structured-logging-style.py
+      run: |
+        if [ -f ./dev/structured-logging-style.py ]; then
+            ./dev/structured-logging-style.py
+        fi
     - name: Java linter
       run: ./dev/lint-java
     - name: Spark connect jvm client mima check

From 9f17a40f90422b57e1233b452fcb710ed908e12f Mon Sep 17 00:00:00 2001
From: Amanda Liu <amanda.liu@databricks.com>
Date: Tue, 9 Jul 2024 10:23:38 -0700
Subject: [PATCH 13/28] fix inner regex

---
 dev/structured-logging-style.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/dev/structured-logging-style.py b/dev/structured-logging-style.py
index 6ebdacef7329c..126074a48a943 100755
--- a/dev/structured-logging-style.py
+++ b/dev/structured-logging-style.py
@@ -26,8 +26,8 @@
 
 
 def main():
-    log_pattern = r"log(?:Info|Warning|Error)\((.*?)\)"
-    inner_log_pattern = r'".*?"\.format\(.*\)|s?".*?(?:\$|\+(?!.*?[ |\t].*s?")).*'
+    log_pattern = r"log(?:Info|Warning|Error)\(.*?\)\n"
+    inner_log_pattern = r'".*?"\.format\(.*\)|s?".*?(?:\$|\+(?!.*?[ |\t].*s?")).*|.* \+ ".*?"'
     compiled_inner_log_pattern = re.compile(inner_log_pattern, flags=re.DOTALL)
 
     # Regex patterns for file paths to exclude from the Structured Logging style check
@@ -57,7 +57,13 @@ def main():
                 if log_statements:
                     nonmigrated_files[file] = []
                     for log_statement in log_statements:
-                        if compiled_inner_log_pattern.fullmatch(log_statement.group(1)):
+                        log_statement_str = log_statement.group(0).strip()
+                        # trim first ( and last )
+                        first_paren_index = log_statement_str.find('(')
+                        inner_log_statement = log_statement_str[first_paren_index + 1:-1]
+
+                        if compiled_inner_log_pattern.fullmatch(inner_log_statement):
+                            # print(f"log statement: ${inner_log_statement}")
                             start_pos = log_statement.start()
                             preceding_content = content[:start_pos]
                             line_number = preceding_content.count("\n") + 1

From 4bb0ccf43f28691fde7c0330e69e48a679b22bb6 Mon Sep 17 00:00:00 2001
From: Amanda Liu <amanda.liu@databricks.com>
Date: Tue, 9 Jul 2024 16:53:12 -0700
Subject: [PATCH 14/28] add exclude file

---
 dev/structured-logging-style.py | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/dev/structured-logging-style.py b/dev/structured-logging-style.py
index 126074a48a943..5d4408136504a 100755
--- a/dev/structured-logging-style.py
+++ b/dev/structured-logging-style.py
@@ -27,14 +27,16 @@
 
 def main():
     log_pattern = r"log(?:Info|Warning|Error)\(.*?\)\n"
-    inner_log_pattern = r'".*?"\.format\(.*\)|s?".*?(?:\$|\+(?!.*?[ |\t].*s?")).*|.* \+ ".*?"'
-    compiled_inner_log_pattern = re.compile(inner_log_pattern, flags=re.DOTALL)
+    inner_log_pattern = r'".*?"\.format\(.*\)|s?".*?"(?:\$|\+[^"]+)|[^"]+\+\s*".*?"'
+    compiled_inner_log_pattern = re.compile(inner_log_pattern)
 
     # Regex patterns for file paths to exclude from the Structured Logging style check
     excluded_file_patterns = [
         "[Tt]est",
         "/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala",
+        "/Users/amanda.liu/Documents/Databricks/spark/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala",
         "/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala",
+        "core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala"
     ]
 
     nonmigrated_files = {}
@@ -60,10 +62,13 @@ def main():
                         log_statement_str = log_statement.group(0).strip()
                         # trim first ( and last )
                         first_paren_index = log_statement_str.find('(')
-                        inner_log_statement = log_statement_str[first_paren_index + 1:-1]
+                        inner_log_statement = re.sub(r'\s+', '', log_statement_str[first_paren_index + 1:-1])
+
+                        # log_statement_str[first_paren_index + 1:-1].replace(" ", "").replace("\t", "").replace("\n", "")
 
                         if compiled_inner_log_pattern.fullmatch(inner_log_statement):
-                            # print(f"log statement: ${inner_log_statement}")
+                            print(file)
+                            print(f"log statement: ${inner_log_statement}")
                             start_pos = log_statement.start()
                             preceding_content = content[:start_pos]
                             line_number = preceding_content.count("\n") + 1
@@ -75,13 +80,16 @@ def main():
         sys.exit(0)
     else:
         for file_path, issues in nonmigrated_files.items():
+            if issues:
+                pass
+                # print(file_path)
             for line_number, start_char in issues:
                 pass
-                print(f"[error] {file_path}:{line_number}:{start_char}")
-                print(
-                    """[error]\t\tLogging message should use log"..." instead of s"..." and variables should be wrapped in `MDC`s.
-                Refer to Structured Logging Framework guidelines in the file `internal/Logging.scala`."""
-                )
+                # print(f"[error] {file_path}:{line_number}:{start_char}")
+                # print(
+                #     """[error]\t\tLogging message should use log"..." instead of s"..." and variables should be wrapped in `MDC`s.
+                # Refer to Structured Logging Framework guidelines in the file `internal/Logging.scala`."""
+                # )
 
         sys.exit(-1)
 

From 97c3a744e908d112e9102a49f5794477b8ff2b1d Mon Sep 17 00:00:00 2001
From: Amanda Liu <amanda.liu@databricks.com>
Date: Tue, 9 Jul 2024 17:10:36 -0700
Subject: [PATCH 15/28] revise error message

---
 dev/structured-logging-style.py | 26 ++++++++++----------------
 1 file changed, 10 insertions(+), 16 deletions(-)

diff --git a/dev/structured-logging-style.py b/dev/structured-logging-style.py
index 5d4408136504a..4d2dfadbb2741 100755
--- a/dev/structured-logging-style.py
+++ b/dev/structured-logging-style.py
@@ -36,7 +36,7 @@ def main():
         "/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala",
         "/Users/amanda.liu/Documents/Databricks/spark/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala",
         "/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala",
-        "core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala"
+        "core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala",
     ]
 
     nonmigrated_files = {}
@@ -61,14 +61,12 @@ def main():
                     for log_statement in log_statements:
                         log_statement_str = log_statement.group(0).strip()
                         # trim first ( and last )
-                        first_paren_index = log_statement_str.find('(')
-                        inner_log_statement = re.sub(r'\s+', '', log_statement_str[first_paren_index + 1:-1])
-
-                        # log_statement_str[first_paren_index + 1:-1].replace(" ", "").replace("\t", "").replace("\n", "")
+                        first_paren_index = log_statement_str.find("(")
+                        inner_log_statement = re.sub(
+                            r"\s+", "", log_statement_str[first_paren_index + 1 : -1]
+                        )
 
                         if compiled_inner_log_pattern.fullmatch(inner_log_statement):
-                            print(file)
-                            print(f"log statement: ${inner_log_statement}")
                             start_pos = log_statement.start()
                             preceding_content = content[:start_pos]
                             line_number = preceding_content.count("\n") + 1
@@ -80,16 +78,12 @@ def main():
         sys.exit(0)
     else:
         for file_path, issues in nonmigrated_files.items():
-            if issues:
-                pass
-                # print(file_path)
             for line_number, start_char in issues:
-                pass
-                # print(f"[error] {file_path}:{line_number}:{start_char}")
-                # print(
-                #     """[error]\t\tLogging message should use log"..." instead of s"..." and variables should be wrapped in `MDC`s.
-                # Refer to Structured Logging Framework guidelines in the file `internal/Logging.scala`."""
-                # )
+                print(f"[error] {file_path}:{line_number}:{start_char}")
+                print(
+                    """[error]\tLogging message should use Structured Logging Framework style, such as log"...${MDC(TASK_ID, taskId)..."
+                Refer to the guidelines in the file `internal/Logging.scala`."""
+                )
 
         sys.exit(-1)
 

From 1b3950d324409493289446ab244baa62fe33de58 Mon Sep 17 00:00:00 2001
From: Amanda Liu <amanda.liu@databricks.com>
Date: Wed, 10 Jul 2024 05:46:03 -0700
Subject: [PATCH 16/28] update error message

---
 dev/structured-logging-style.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/dev/structured-logging-style.py b/dev/structured-logging-style.py
index 4d2dfadbb2741..de02aebd6a404 100755
--- a/dev/structured-logging-style.py
+++ b/dev/structured-logging-style.py
@@ -33,9 +33,9 @@ def main():
     # Regex patterns for file paths to exclude from the Structured Logging style check
     excluded_file_patterns = [
         "[Tt]est",
-        "/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala",
-        "/Users/amanda.liu/Documents/Databricks/spark/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala",
-        "/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala",
+        "sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala",
+        "streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala",
+        "sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala",
         "core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala",
     ]
 
@@ -81,7 +81,7 @@ def main():
             for line_number, start_char in issues:
                 print(f"[error] {file_path}:{line_number}:{start_char}")
                 print(
-                    """[error]\tLogging message should use Structured Logging Framework style, such as log"...${MDC(TASK_ID, taskId)..."
+                    """[error]\tPlease use the Structured Logging Framework for logging messages with variables. For example: log"...${MDC(TASK_ID, taskId)...".
                 Refer to the guidelines in the file `internal/Logging.scala`."""
                 )
 

From 6f47405dec7bd0b14c28705fac8c0223f16a7df6 Mon Sep 17 00:00:00 2001
From: Amanda Liu <amanda.liu@databricks.com>
Date: Wed, 10 Jul 2024 06:04:28 -0700
Subject: [PATCH 17/28] check if file is a directory

---
 dev/structured-logging-style.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dev/structured-logging-style.py b/dev/structured-logging-style.py
index de02aebd6a404..c78b75f3eb3d9 100755
--- a/dev/structured-logging-style.py
+++ b/dev/structured-logging-style.py
@@ -50,7 +50,7 @@ def main():
                 skip_file = True
                 break
 
-        if not skip_file:
+        if not skip_file and not os.path.isdir(file):
             with open(file, "r") as f:
                 content = f.read()
 

From dd0fb478b06a4abbdbaf86341eddad6371e30fca Mon Sep 17 00:00:00 2001
From: Amanda Liu <amanda.liu@databricks.com>
Date: Wed, 10 Jul 2024 15:25:57 -0700
Subject: [PATCH 18/28] update regex

---
 dev/structured-logging-style.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dev/structured-logging-style.py b/dev/structured-logging-style.py
index c78b75f3eb3d9..999f0a2bd02cb 100755
--- a/dev/structured-logging-style.py
+++ b/dev/structured-logging-style.py
@@ -27,7 +27,7 @@
 
 def main():
     log_pattern = r"log(?:Info|Warning|Error)\(.*?\)\n"
-    inner_log_pattern = r'".*?"\.format\(.*\)|s?".*?"(?:\$|\+[^"]+)|[^"]+\+\s*".*?"'
+    inner_log_pattern = r'".*?"\.format\(.*\)|s?".*?(?:\$|\+).*|[^"]+\+\s*".*?"'
     compiled_inner_log_pattern = re.compile(inner_log_pattern)
 
     # Regex patterns for file paths to exclude from the Structured Logging style check

From d6206a90787361289d0af00b55edf37a5dbe6cf9 Mon Sep 17 00:00:00 2001
From: Amanda Liu <amanda.liu@databricks.com>
Date: Thu, 11 Jul 2024 09:01:49 -0700
Subject: [PATCH 19/28] rename script

---
 dev/structured_logging_style.py | 91 +++++++++++++++++++++++++++++++++
 1 file changed, 91 insertions(+)
 create mode 100755 dev/structured_logging_style.py

diff --git a/dev/structured_logging_style.py b/dev/structured_logging_style.py
new file mode 100755
index 0000000000000..9f56cbd2ab357
--- /dev/null
+++ b/dev/structured_logging_style.py
@@ -0,0 +1,91 @@
+#!/usr/bin/env python3
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import os
+import sys
+import re
+import glob
+
+
+def main():
+    log_pattern = r"log(?:Info|Warning|Error)\(.*?\)\n"
+    inner_log_pattern = r'".*?"\.format\(.*\)|s?".*?(?:\$|\+).*|[^"]+\+\s*".*?"'
+    compiled_inner_log_pattern = re.compile(inner_log_pattern)
+
+    # Regex patterns for file paths to exclude from the Structured Logging style check
+    excluded_file_patterns = [
+        "[Tt]est",
+        "sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala",
+        "streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala",
+        "sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala",
+        "core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala",
+    ]
+
+    nonmigrated_files = {}
+
+    scala_files = glob.glob(os.path.join("../", "**", "*.scala"), recursive=True)
+
+    for file in scala_files:
+        skip_file = False
+        for exclude_pattern in excluded_file_patterns:
+            if re.search(exclude_pattern, file):
+                skip_file = True
+                break
+
+        if not skip_file and not os.path.isdir(file):
+            with open(file, "r") as f:
+                content = f.read()
+
+                log_statements = re.finditer(log_pattern, content, re.DOTALL)
+
+                if log_statements:
+                    nonmigrated_files[file] = []
+                    for log_statement in log_statements:
+                        log_statement_str = log_statement.group(0).strip()
+                        # trim first ( and last )
+                        first_paren_index = log_statement_str.find("(")
+                        inner_log_statement = re.sub(
+                            r"\s+", "", log_statement_str[first_paren_index + 1 : -1]
+                        )
+
+                        if compiled_inner_log_pattern.fullmatch(inner_log_statement):
+                            start_pos = log_statement.start()
+                            preceding_content = content[:start_pos]
+                            line_number = preceding_content.count("\n") + 1
+                            start_char = start_pos - preceding_content.rfind("\n") - 1
+                            nonmigrated_files[file].append((line_number, start_char))
+
+    if not nonmigrated_files:
+        print("Structured logging style check passed.", file=sys.stderr)
+        sys.exit(0)
+    else:
+        for file_path, issues in nonmigrated_files.items():
+            for line_number, start_char in issues:
+                print(f"[error] {file_path}:{line_number}:{start_char}", file=sys.stderr)
+                print(
+                    """[error]\tPlease use the Structured Logging Framework for logging messages with variables. For example: log"...${MDC(TASK_ID, taskId)...".
+                Refer to the guidelines in the file `internal/Logging.scala`.""",
+                    file=sys.stderr,
+                )
+
+        sys.exit(-1)
+
+
+if __name__ == "__main__":
+    main()

From 62e3c0d759db86305daef794e0b927aa34ec05ca Mon Sep 17 00:00:00 2001
From: Amanda Liu <amanda.liu@databricks.com>
Date: Thu, 11 Jul 2024 10:42:00 -0700
Subject: [PATCH 20/28] style

---
 dev/structured_logging_style.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dev/structured_logging_style.py b/dev/structured_logging_style.py
index 9f56cbd2ab357..f756633f4abf6 100755
--- a/dev/structured_logging_style.py
+++ b/dev/structured_logging_style.py
@@ -25,7 +25,7 @@
 
 def main():
     log_pattern = r"log(?:Info|Warning|Error)\(.*?\)\n"
-    inner_log_pattern = r'".*?"\.format\(.*\)|s?".*?(?:\$|\+).*|[^"]+\+\s*".*?"'
+    inner_log_pattern = r'".*?"\.format\(.*\)|s?".*?(?:\$|\+(?!")).*|[^"]+\+\s*".*?"'
     compiled_inner_log_pattern = re.compile(inner_log_pattern)
 
     # Regex patterns for file paths to exclude from the Structured Logging style check

From e6d88b2bbcaa0790b39b4f34d16962813ca723fd Mon Sep 17 00:00:00 2001
From: Amanda Liu <amanda.liu@databricks.com>
Date: Thu, 11 Jul 2024 14:18:18 -0700
Subject: [PATCH 21/28] update gha yml

---
 .github/workflows/build_and_test.yml |  4 +-
 dev/structured-logging-style.py      | 92 ----------------------------
 2 files changed, 2 insertions(+), 94 deletions(-)
 delete mode 100755 dev/structured-logging-style.py

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 0420570ebb918..0069906a762c3 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -695,8 +695,8 @@ jobs:
       run: ./dev/lint-scala
     - name: Scala structured logging check
       run: |
-        if [ -f ./dev/structured-logging-style.py ]; then
-            ./dev/structured-logging-style.py
+        if [ -f ./dev/structured_logging_style.py ]; then
+            ./dev/structured_logging_style.py
         fi
     - name: Java linter
       run: ./dev/lint-java
diff --git a/dev/structured-logging-style.py b/dev/structured-logging-style.py
deleted file mode 100755
index 999f0a2bd02cb..0000000000000
--- a/dev/structured-logging-style.py
+++ /dev/null
@@ -1,92 +0,0 @@
-#!/usr/bin/env python3
-
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import os
-import sys
-import re
-import glob
-
-from sparktestsupport import SPARK_HOME
-
-
-def main():
-    log_pattern = r"log(?:Info|Warning|Error)\(.*?\)\n"
-    inner_log_pattern = r'".*?"\.format\(.*\)|s?".*?(?:\$|\+).*|[^"]+\+\s*".*?"'
-    compiled_inner_log_pattern = re.compile(inner_log_pattern)
-
-    # Regex patterns for file paths to exclude from the Structured Logging style check
-    excluded_file_patterns = [
-        "[Tt]est",
-        "sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala",
-        "streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala",
-        "sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala",
-        "core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala",
-    ]
-
-    nonmigrated_files = {}
-
-    scala_files = glob.glob(os.path.join(SPARK_HOME, "**", "*.scala"), recursive=True)
-
-    for file in scala_files:
-        skip_file = False
-        for exclude_pattern in excluded_file_patterns:
-            if re.search(exclude_pattern, file):
-                skip_file = True
-                break
-
-        if not skip_file and not os.path.isdir(file):
-            with open(file, "r") as f:
-                content = f.read()
-
-                log_statements = re.finditer(log_pattern, content, re.DOTALL)
-
-                if log_statements:
-                    nonmigrated_files[file] = []
-                    for log_statement in log_statements:
-                        log_statement_str = log_statement.group(0).strip()
-                        # trim first ( and last )
-                        first_paren_index = log_statement_str.find("(")
-                        inner_log_statement = re.sub(
-                            r"\s+", "", log_statement_str[first_paren_index + 1 : -1]
-                        )
-
-                        if compiled_inner_log_pattern.fullmatch(inner_log_statement):
-                            start_pos = log_statement.start()
-                            preceding_content = content[:start_pos]
-                            line_number = preceding_content.count("\n") + 1
-                            start_char = start_pos - preceding_content.rfind("\n") - 1
-                            nonmigrated_files[file].append((line_number, start_char))
-
-    if not nonmigrated_files:
-        print("Structured logging style check passed.")
-        sys.exit(0)
-    else:
-        for file_path, issues in nonmigrated_files.items():
-            for line_number, start_char in issues:
-                print(f"[error] {file_path}:{line_number}:{start_char}")
-                print(
-                    """[error]\tPlease use the Structured Logging Framework for logging messages with variables. For example: log"...${MDC(TASK_ID, taskId)...".
-                Refer to the guidelines in the file `internal/Logging.scala`."""
-                )
-
-        sys.exit(-1)
-
-
-if __name__ == "__main__":
-    main()

From 72f0c1b5405236abeded3039eb0c09d088bfd967 Mon Sep 17 00:00:00 2001
From: Amanda Liu <amanda.liu@databricks.com>
Date: Thu, 11 Jul 2024 14:58:24 -0700
Subject: [PATCH 22/28] line char max

---
 dev/structured_logging_style.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/dev/structured_logging_style.py b/dev/structured_logging_style.py
index f756633f4abf6..52f62bfce426c 100755
--- a/dev/structured_logging_style.py
+++ b/dev/structured_logging_style.py
@@ -31,9 +31,11 @@ def main():
     # Regex patterns for file paths to exclude from the Structured Logging style check
     excluded_file_patterns = [
         "[Tt]est",
-        "sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala",
+        "sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen"
+        "/CodeGenerator.scala",
         "streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala",
-        "sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala",
+        "sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver"
+        "/SparkSQLCLIService.scala",
         "core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala",
     ]
 
@@ -79,8 +81,9 @@ def main():
             for line_number, start_char in issues:
                 print(f"[error] {file_path}:{line_number}:{start_char}", file=sys.stderr)
                 print(
-                    """[error]\tPlease use the Structured Logging Framework for logging messages with variables. For example: log"...${MDC(TASK_ID, taskId)...".
-                Refer to the guidelines in the file `internal/Logging.scala`.""",
+                    f"[error]\tPlease use the Structured Logging Framework for logging messages "
+                    f'with variables. For example: log"...${{MDC(TASK_ID, taskId)}}..."'
+                    f"\nRefer to the guidelines in the file `internal/Logging.scala`.",
                     file=sys.stderr,
                 )
 

From 7bdf67c9147a654b11f28810d33114f09aef6325 Mon Sep 17 00:00:00 2001
From: Amanda Liu <amanda.liu@databricks.com>
Date: Mon, 15 Jul 2024 06:18:39 -0700
Subject: [PATCH 23/28] stylize the error message

---
 dev/structured_logging_style.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dev/structured_logging_style.py b/dev/structured_logging_style.py
index 52f62bfce426c..897504bf6b16e 100755
--- a/dev/structured_logging_style.py
+++ b/dev/structured_logging_style.py
@@ -83,7 +83,7 @@ def main():
                 print(
                     f"[error]\tPlease use the Structured Logging Framework for logging messages "
                     f'with variables. For example: log"...${{MDC(TASK_ID, taskId)}}..."'
-                    f"\nRefer to the guidelines in the file `internal/Logging.scala`.",
+                    f"\n\tRefer to the guidelines in the file `internal/Logging.scala`.",
                     file=sys.stderr,
                 )
 

From bed425679caef710d2e95725dba51592a5d314d3 Mon Sep 17 00:00:00 2001
From: Amanda Liu <amanda.liu@databricks.com>
Date: Wed, 17 Jul 2024 15:37:04 -0700
Subject: [PATCH 24/28] modify + regex

---
 dev/structured_logging_style.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dev/structured_logging_style.py b/dev/structured_logging_style.py
index 897504bf6b16e..8ddd95015b00d 100755
--- a/dev/structured_logging_style.py
+++ b/dev/structured_logging_style.py
@@ -25,7 +25,7 @@
 
 def main():
     log_pattern = r"log(?:Info|Warning|Error)\(.*?\)\n"
-    inner_log_pattern = r'".*?"\.format\(.*\)|s?".*?(?:\$|\+(?!")).*|[^"]+\+\s*".*?"'
+    inner_log_pattern = r'".*?"\.format\(.*\)|s?".*?(?:\$|\"\+(?!s?")).*|[^"]+\+\s*".*?"'
     compiled_inner_log_pattern = re.compile(inner_log_pattern)
 
     # Regex patterns for file paths to exclude from the Structured Logging style check

From a253657c4bd376cbb4fdd63cdbd277fd031b86d1 Mon Sep 17 00:00:00 2001
From: Amanda Liu <amanda.liu@databricks.com>
Date: Fri, 19 Jul 2024 10:55:47 -0700
Subject: [PATCH 25/28] run script with python3.9

---
 .github/workflows/build_and_test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index b7c52831ce2cc..576f64f3a0869 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -696,7 +696,7 @@ jobs:
     - name: Scala structured logging check
       run: |
         if [ -f ./dev/structured_logging_style.py ]; then
-            ./dev/structured_logging_style.py
+            python3.9 ./dev/structured_logging_style.py
         fi
     - name: Java linter
       run: ./dev/lint-java

From 0c223830bf3a692528c80b7fdea3b69e447391ad Mon Sep 17 00:00:00 2001
From: Amanda Liu <amanda.liu@databricks.com>
Date: Fri, 19 Jul 2024 11:45:46 -0700
Subject: [PATCH 26/28] update success condition check

---
 dev/structured_logging_style.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dev/structured_logging_style.py b/dev/structured_logging_style.py
index 8ddd95015b00d..ee43abbc8e8be 100755
--- a/dev/structured_logging_style.py
+++ b/dev/structured_logging_style.py
@@ -73,7 +73,7 @@ def main():
                             start_char = start_pos - preceding_content.rfind("\n") - 1
                             nonmigrated_files[file].append((line_number, start_char))
 
-    if not nonmigrated_files:
+    if all(len(issues) == 0 for issues in nonmigrated_files.values()):
         print("Structured logging style check passed.", file=sys.stderr)
         sys.exit(0)
     else:

From 2333388728a6ab3da939c7417f24b3e2aa884e96 Mon Sep 17 00:00:00 2001
From: Amanda Liu <amanda.liu@databricks.com>
Date: Fri, 19 Jul 2024 17:47:39 -0700
Subject: [PATCH 27/28] lint fix

---
 dev/structured_logging_style.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/dev/structured_logging_style.py b/dev/structured_logging_style.py
index ee43abbc8e8be..4a97053817819 100755
--- a/dev/structured_logging_style.py
+++ b/dev/structured_logging_style.py
@@ -81,9 +81,9 @@ def main():
             for line_number, start_char in issues:
                 print(f"[error] {file_path}:{line_number}:{start_char}", file=sys.stderr)
                 print(
-                    f"[error]\tPlease use the Structured Logging Framework for logging messages "
-                    f'with variables. For example: log"...${{MDC(TASK_ID, taskId)}}..."'
-                    f"\n\tRefer to the guidelines in the file `internal/Logging.scala`.",
+                    '[error]\tPlease use the Structured Logging Framework for logging messages '
+                    'with variables. For example: log"...${{MDC(TASK_ID, taskId)}}..."'
+                    '\n\tRefer to the guidelines in the file `internal/Logging.scala`.',
                     file=sys.stderr,
                 )
 

From 5f789893234460e7cb96534169ebad32be6d48ee Mon Sep 17 00:00:00 2001
From: Amanda Liu <amanda.liu@databricks.com>
Date: Fri, 19 Jul 2024 21:26:37 -0700
Subject: [PATCH 28/28] reformat

---
 dev/structured_logging_style.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dev/structured_logging_style.py b/dev/structured_logging_style.py
index 4a97053817819..eeeefab3a977c 100755
--- a/dev/structured_logging_style.py
+++ b/dev/structured_logging_style.py
@@ -81,9 +81,9 @@ def main():
             for line_number, start_char in issues:
                 print(f"[error] {file_path}:{line_number}:{start_char}", file=sys.stderr)
                 print(
-                    '[error]\tPlease use the Structured Logging Framework for logging messages '
+                    "[error]\tPlease use the Structured Logging Framework for logging messages "
                     'with variables. For example: log"...${{MDC(TASK_ID, taskId)}}..."'
-                    '\n\tRefer to the guidelines in the file `internal/Logging.scala`.',
+                    "\n\tRefer to the guidelines in the file `internal/Logging.scala`.",
                     file=sys.stderr,
                 )