From 6ef59e6067742e8826741af8f49ca8ac3319273a Mon Sep 17 00:00:00 2001 From: Amanda Liu Date: Fri, 5 Jul 2024 11:42:13 -0700 Subject: [PATCH 01/28] add structured logging style script and github workflow --- .github/workflows/build_and_test.yml | 2 + dev/structured-logging.py | 64 ++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) create mode 100644 dev/structured-logging.py diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 9dc9d85520c2c..6f290ec4c74ef 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -702,6 +702,8 @@ jobs: run: ./dev/mima - name: Scala linter run: ./dev/lint-scala + - name: Scala structured logging check + run: ./dev/structured-logging.py - name: Java linter run: ./dev/lint-java - name: Spark connect jvm client mima check diff --git a/dev/structured-logging.py b/dev/structured-logging.py new file mode 100644 index 0000000000000..69a05585b677c --- /dev/null +++ b/dev/structured-logging.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +import sys +import re +import glob + +from sparktestsupport import SPARK_HOME + +def main(): + nonmigrated_pattern = r'log(Info|Warning|Error)\((?:".*"\.format\(.*\)|s".*(?:\$|\+\s*[^\s"]).*"\))' + whitelist_file = 'dev/structured-logging-whitelist.txt' + + nonmigrated_files = [] + whitelist_files = set() + + with open(whitelist_file, 'r') as wf: + whitelist_patterns = [line.strip() for line in wf if line.strip()] + + scala_files = glob.glob(os.path.join(SPARK_HOME, '**', '*.scala'), recursive=True) + + for file in scala_files: + for whitelist_pattern in whitelist_patterns: + if re.search(whitelist_pattern, file): + whitelist_files.add(file) + break + + for file in scala_files: + if file not in whitelist_files: + with open(file, 'r') as f: + content = f.read().replace('\t', '').strip() + if re.search(nonmigrated_pattern, content): + nonmigrated_files.append(file) + + if not nonmigrated_files: + print("Structured logging style check passed.") + sys.exit(0) + else: + print("Structured logging style check failed for the following:") + for file in nonmigrated_files: + print(file) + + sys.exit(-1) + + +if __name__ == "__main__": + main() From d640b944aa2300f2c662b484b130b90f3b22fec4 Mon Sep 17 00:00:00 2001 From: Amanda Liu Date: Fri, 5 Jul 2024 13:50:48 -0700 Subject: [PATCH 02/28] improve error message --- dev/structured-logging.py | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/dev/structured-logging.py b/dev/structured-logging.py index 69a05585b677c..485634d95d4dd 100644 --- a/dev/structured-logging.py +++ b/dev/structured-logging.py @@ -25,37 +25,42 @@ from sparktestsupport import SPARK_HOME def main(): - nonmigrated_pattern = r'log(Info|Warning|Error)\((?:".*"\.format\(.*\)|s".*(?:\$|\+\s*[^\s"]).*"\))' - whitelist_file = 'dev/structured-logging-whitelist.txt' + nonmigrated_pattern = r'log(?:Info|Warning|Error)\((?:".*"\.format\(.*\)|s".*(?:\$|\+\s*[^\s"]).*"\))' + excluded_file_patterns = [ + '[Tt]est', + './sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala' + './sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala' + ] - nonmigrated_files = [] - whitelist_files = set() - - with open(whitelist_file, 'r') as wf: - whitelist_patterns = [line.strip() for line in wf if line.strip()] + nonmigrated_files = {} + excluded_files = set() scala_files = glob.glob(os.path.join(SPARK_HOME, '**', '*.scala'), recursive=True) for file in scala_files: - for whitelist_pattern in whitelist_patterns: - if re.search(whitelist_pattern, file): - whitelist_files.add(file) + for exclude_pattern in excluded_file_patterns: + if re.search(exclude_pattern, file): + excluded_files.add(file) break for file in scala_files: - if file not in whitelist_files: + if file not in excluded_files: with open(file, 'r') as f: - content = f.read().replace('\t', '').strip() - if re.search(nonmigrated_pattern, content): - nonmigrated_files.append(file) + lines = f.readlines() + for line_number, line in enumerate(lines, start=1): + if re.search(nonmigrated_pattern, line): + if file not in nonmigrated_files: + nonmigrated_files[file] = [] + nonmigrated_files[file].append((line_number, line)) if not nonmigrated_files: print("Structured logging style check passed.") sys.exit(0) else: - print("Structured logging style check failed for the following:") - for file in nonmigrated_files: - print(file) + for file_path, issues in nonmigrated_files.items(): + print(f"[error] Structured logging style check failed for the file '{file}' at:") + for line_number, code in issues: + print(f"Line: {line_number} code: {code}") sys.exit(-1) From b86222834368fe1997849413b41aaeea5f447e3d Mon Sep 17 00:00:00 2001 From: Amanda Liu Date: Fri, 5 Jul 2024 14:23:45 -0700 Subject: [PATCH 03/28] match scala build error message --- dev/structured-logging.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/dev/structured-logging.py b/dev/structured-logging.py index 485634d95d4dd..188dc3575f14e 100644 --- a/dev/structured-logging.py +++ b/dev/structured-logging.py @@ -26,10 +26,12 @@ def main(): nonmigrated_pattern = r'log(?:Info|Warning|Error)\((?:".*"\.format\(.*\)|s".*(?:\$|\+\s*[^\s"]).*"\))' + + # Regex patterns for file paths to exclude from the Structured Logging style check excluded_file_patterns = [ '[Tt]est', - './sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala' - './sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala' + '/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala', + '/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala' ] nonmigrated_files = {} @@ -41,29 +43,31 @@ def main(): for exclude_pattern in excluded_file_patterns: if re.search(exclude_pattern, file): excluded_files.add(file) - break for file in scala_files: if file not in excluded_files: with open(file, 'r') as f: lines = f.readlines() for line_number, line in enumerate(lines, start=1): - if re.search(nonmigrated_pattern, line): + matches = list(re.finditer(nonmigrated_pattern, line)) + if matches: if file not in nonmigrated_files: nonmigrated_files[file] = [] - nonmigrated_files[file].append((line_number, line)) + for match in matches: + start_char = match.start() + nonmigrated_files[file].append((line_number, start_char)) if not nonmigrated_files: print("Structured logging style check passed.") sys.exit(0) else: for file_path, issues in nonmigrated_files.items(): - print(f"[error] Structured logging style check failed for the file '{file}' at:") - for line_number, code in issues: - print(f"Line: {line_number} code: {code}") + for line_number, start_char in issues: + print(f"[error] {file_path}:{line_number}:{start_char}") + print("""[error]\t\tLogging message should use log"..." instead of s"..." and variables should be wrapped in `MDC`s. + Refer to Structured Logging Framework guidelines in the file `internal/Logging.scala`.""") sys.exit(-1) - if __name__ == "__main__": main() From 42de4c5f3b7304fc2df61ae29c3c553f54221417 Mon Sep 17 00:00:00 2001 From: Amanda Liu Date: Fri, 5 Jul 2024 17:09:51 -0700 Subject: [PATCH 04/28] account for line breaks in regex match --- ...logging.py => structured-logging-style.py} | 31 ++++++++++--------- 1 file changed, 17 insertions(+), 14 deletions(-) rename dev/{structured-logging.py => structured-logging-style.py} (74%) diff --git a/dev/structured-logging.py b/dev/structured-logging-style.py similarity index 74% rename from dev/structured-logging.py rename to dev/structured-logging-style.py index 188dc3575f14e..6edda5ef1d64d 100644 --- a/dev/structured-logging.py +++ b/dev/structured-logging-style.py @@ -25,7 +25,8 @@ from sparktestsupport import SPARK_HOME def main(): - nonmigrated_pattern = r'log(?:Info|Warning|Error)\((?:".*"\.format\(.*\)|s".*(?:\$|\+\s*[^\s"]).*"\))' + nonmigrated_pattern = r'log(?:Info|Warning|Error)\((?:".*"\.format\(.*\)|(?:s|\\)".*[\n\r].*(\$|\+).*\))' + pattern = re.compile(nonmigrated_pattern) # Regex patterns for file paths to exclude from the Structured Logging style check excluded_file_patterns = [ @@ -35,27 +36,29 @@ def main(): ] nonmigrated_files = {} - excluded_files = set() scala_files = glob.glob(os.path.join(SPARK_HOME, '**', '*.scala'), recursive=True) for file in scala_files: + skip_file = False for exclude_pattern in excluded_file_patterns: if re.search(exclude_pattern, file): - excluded_files.add(file) + skip_file = True + break - for file in scala_files: - if file not in excluded_files: + if not skip_file: with open(file, 'r') as f: - lines = f.readlines() - for line_number, line in enumerate(lines, start=1): - matches = list(re.finditer(nonmigrated_pattern, line)) - if matches: - if file not in nonmigrated_files: - nonmigrated_files[file] = [] - for match in matches: - start_char = match.start() - nonmigrated_files[file].append((line_number, start_char)) + content = f.read() + matches = list(pattern.finditer(content)) + + if matches: + nonmigrated_files[file] = [] + for match in matches: + start_pos = match.start() + preceding_content = content[:start_pos] + line_number = preceding_content.count('\n') + 1 + start_char = start_pos - preceding_content.rfind('\n') - 1 + nonmigrated_files[file].append((line_number, start_char)) if not nonmigrated_files: print("Structured logging style check passed.") From 95e86cc1744a294baefef5bf856e8c221ae2ecbf Mon Sep 17 00:00:00 2001 From: Amanda Liu Date: Fri, 5 Jul 2024 20:18:20 -0700 Subject: [PATCH 05/28] fix script name --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 6f290ec4c74ef..3d3acbcb52fb3 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -703,7 +703,7 @@ jobs: - name: Scala linter run: ./dev/lint-scala - name: Scala structured logging check - run: ./dev/structured-logging.py + run: ./dev/structured-logging-style.py - name: Java linter run: ./dev/lint-java - name: Spark connect jvm client mima check From 667bc58431a9fefad78cf51c1b0b79b141c08a95 Mon Sep 17 00:00:00 2001 From: Amanda Liu Date: Sat, 6 Jul 2024 08:32:55 -0700 Subject: [PATCH 06/28] update regex --- dev/structured-logging-style.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) mode change 100644 => 100755 dev/structured-logging-style.py diff --git a/dev/structured-logging-style.py b/dev/structured-logging-style.py old mode 100644 new mode 100755 index 6edda5ef1d64d..c9bd32120c8c9 --- a/dev/structured-logging-style.py +++ b/dev/structured-logging-style.py @@ -25,7 +25,7 @@ from sparktestsupport import SPARK_HOME def main(): - nonmigrated_pattern = r'log(?:Info|Warning|Error)\((?:".*"\.format\(.*\)|(?:s|\\)".*[\n\r].*(\$|\+).*\))' + nonmigrated_pattern = r'log(?:Info|Warning|Error)\((?:".*"\.format\(.*\)|(?:s|\\)".*[\n\r]*(\$|\+).*\))' pattern = re.compile(nonmigrated_pattern) # Regex patterns for file paths to exclude from the Structured Logging style check From 58e763dc2f8f4145d8bde6acd2c6c42af7554482 Mon Sep 17 00:00:00 2001 From: Amanda Liu Date: Mon, 8 Jul 2024 10:11:25 -0700 Subject: [PATCH 07/28] separate mono regex --- dev/structured-logging-style.py | 56 ++++++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 14 deletions(-) diff --git a/dev/structured-logging-style.py b/dev/structured-logging-style.py index c9bd32120c8c9..1eaf5013b9e8b 100755 --- a/dev/structured-logging-style.py +++ b/dev/structured-logging-style.py @@ -25,8 +25,9 @@ from sparktestsupport import SPARK_HOME def main(): - nonmigrated_pattern = r'log(?:Info|Warning|Error)\((?:".*"\.format\(.*\)|(?:s|\\)".*[\n\r]*(\$|\+).*\))' - pattern = re.compile(nonmigrated_pattern) + log_pattern = r'log(?:Info|Warning|Error)\((.*?)\)' + inner_log_pattern = r'".*?"\.format\(.*\)|s?".*?(?:\$|\+(?!.*?[ |\t].*s?")).*' + compiled_inner_log_pattern = re.compile(inner_log_pattern, flags=re.DOTALL) # Regex patterns for file paths to exclude from the Structured Logging style check excluded_file_patterns = [ @@ -49,26 +50,53 @@ def main(): if not skip_file: with open(file, 'r') as f: content = f.read() - matches = list(pattern.finditer(content)) - if matches: + log_statements = re.finditer(log_pattern, content, re.DOTALL) + # log_statements = [statement.group(1).strip() for statement in log_statements] + + if log_statements: nonmigrated_files[file] = [] - for match in matches: - start_pos = match.start() - preceding_content = content[:start_pos] - line_number = preceding_content.count('\n') + 1 - start_char = start_pos - preceding_content.rfind('\n') - 1 - nonmigrated_files[file].append((line_number, start_char)) + for log_statement in log_statements: + if compiled_inner_log_pattern.fullmatch(log_statement.group(1)): + start_pos = log_statement.start() + preceding_content = content[:start_pos] + line_number = preceding_content.count('\n') + 1 + start_char = start_pos - preceding_content.rfind('\n') - 1 + nonmigrated_files[file].append((line_number, start_char, log_statement.group(1))) + + # for log_statement in log_statements: + # if compiled_inner_log_pattern.search(log_statement): + # nonmigrated_files[file].append() + + + # matches = list(pattern.finditer(content)) + # matches2 = pattern.findall(content) + # + # for m in matches2: + # print(f"****** ${m}") + # + # if matches: + # nonmigrated_files[file] = [] + # for match in matches: + # start_pos = match.start() + # preceding_content = content[:start_pos] + # line_number = preceding_content.count('\n') + 1 + # start_char = start_pos - preceding_content.rfind('\n') - 1 + # nonmigrated_files[file].append((line_number, start_char)) if not nonmigrated_files: print("Structured logging style check passed.") sys.exit(0) else: for file_path, issues in nonmigrated_files.items(): - for line_number, start_char in issues: - print(f"[error] {file_path}:{line_number}:{start_char}") - print("""[error]\t\tLogging message should use log"..." instead of s"..." and variables should be wrapped in `MDC`s. - Refer to Structured Logging Framework guidelines in the file `internal/Logging.scala`.""") + if issues: + print(file_path) + # for file_path, issues in nonmigrated_files.items(): + # for line_number, start_char in issues: + # pass + # print(f"[error] {file_path}:{line_number}:{start_char}") + # print("""[error]\t\tLogging message should use log"..." instead of s"..." and variables should be wrapped in `MDC`s. + # Refer to Structured Logging Framework guidelines in the file `internal/Logging.scala`.""") sys.exit(-1) From 0f26de8d7d589c65ed2c3f0fa9c50fa153414a7d Mon Sep 17 00:00:00 2001 From: Amanda Liu Date: Mon, 8 Jul 2024 11:54:30 -0700 Subject: [PATCH 08/28] add java files check --- dev/structured-logging-style.py | 41 ++++++++------------------------- 1 file changed, 9 insertions(+), 32 deletions(-) diff --git a/dev/structured-logging-style.py b/dev/structured-logging-style.py index 1eaf5013b9e8b..18db71d89c852 100755 --- a/dev/structured-logging-style.py +++ b/dev/structured-logging-style.py @@ -38,9 +38,10 @@ def main(): nonmigrated_files = {} - scala_files = glob.glob(os.path.join(SPARK_HOME, '**', '*.scala'), recursive=True) + files = [file for ext in ('*.scala', '*.java') + for file in glob.glob(os.path.join(SPARK_HOME, '**', ext), recursive=True)] - for file in scala_files: + for file in files: skip_file = False for exclude_pattern in excluded_file_patterns: if re.search(exclude_pattern, file): @@ -52,7 +53,6 @@ def main(): content = f.read() log_statements = re.finditer(log_pattern, content, re.DOTALL) - # log_statements = [statement.group(1).strip() for statement in log_statements] if log_statements: nonmigrated_files[file] = [] @@ -62,41 +62,18 @@ def main(): preceding_content = content[:start_pos] line_number = preceding_content.count('\n') + 1 start_char = start_pos - preceding_content.rfind('\n') - 1 - nonmigrated_files[file].append((line_number, start_char, log_statement.group(1))) - - # for log_statement in log_statements: - # if compiled_inner_log_pattern.search(log_statement): - # nonmigrated_files[file].append() - - - # matches = list(pattern.finditer(content)) - # matches2 = pattern.findall(content) - # - # for m in matches2: - # print(f"****** ${m}") - # - # if matches: - # nonmigrated_files[file] = [] - # for match in matches: - # start_pos = match.start() - # preceding_content = content[:start_pos] - # line_number = preceding_content.count('\n') + 1 - # start_char = start_pos - preceding_content.rfind('\n') - 1 - # nonmigrated_files[file].append((line_number, start_char)) + nonmigrated_files[file].append((line_number, start_char)) if not nonmigrated_files: print("Structured logging style check passed.") sys.exit(0) else: for file_path, issues in nonmigrated_files.items(): - if issues: - print(file_path) - # for file_path, issues in nonmigrated_files.items(): - # for line_number, start_char in issues: - # pass - # print(f"[error] {file_path}:{line_number}:{start_char}") - # print("""[error]\t\tLogging message should use log"..." instead of s"..." and variables should be wrapped in `MDC`s. - # Refer to Structured Logging Framework guidelines in the file `internal/Logging.scala`.""") + for line_number, start_char in issues: + pass + print(f"[error] {file_path}:{line_number}:{start_char}") + print("""[error]\t\tLogging message should use log"..." instead of s"..." and variables should be wrapped in `MDC`s. + Refer to Structured Logging Framework guidelines in the file `internal/Logging.scala`.""") sys.exit(-1) From 89596889a792385103f00f87a196048d39cab8b5 Mon Sep 17 00:00:00 2001 From: Amanda Liu Date: Mon, 8 Jul 2024 12:06:52 -0700 Subject: [PATCH 09/28] reformat python --- dev/structured-logging-style.py | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/dev/structured-logging-style.py b/dev/structured-logging-style.py index 18db71d89c852..7138f7795b4b5 100755 --- a/dev/structured-logging-style.py +++ b/dev/structured-logging-style.py @@ -24,22 +24,26 @@ from sparktestsupport import SPARK_HOME + def main(): - log_pattern = r'log(?:Info|Warning|Error)\((.*?)\)' + log_pattern = r"log(?:Info|Warning|Error)\((.*?)\)" inner_log_pattern = r'".*?"\.format\(.*\)|s?".*?(?:\$|\+(?!.*?[ |\t].*s?")).*' compiled_inner_log_pattern = re.compile(inner_log_pattern, flags=re.DOTALL) # Regex patterns for file paths to exclude from the Structured Logging style check excluded_file_patterns = [ - '[Tt]est', - '/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala', - '/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala' + "[Tt]est", + "/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala", + "/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala", ] nonmigrated_files = {} - files = [file for ext in ('*.scala', '*.java') - for file in glob.glob(os.path.join(SPARK_HOME, '**', ext), recursive=True)] + files = [ + file + for ext in ("*.scala", "*.java") + for file in glob.glob(os.path.join(SPARK_HOME, "**", ext), recursive=True) + ] for file in files: skip_file = False @@ -49,7 +53,7 @@ def main(): break if not skip_file: - with open(file, 'r') as f: + with open(file, "r") as f: content = f.read() log_statements = re.finditer(log_pattern, content, re.DOTALL) @@ -60,8 +64,8 @@ def main(): if compiled_inner_log_pattern.fullmatch(log_statement.group(1)): start_pos = log_statement.start() preceding_content = content[:start_pos] - line_number = preceding_content.count('\n') + 1 - start_char = start_pos - preceding_content.rfind('\n') - 1 + line_number = preceding_content.count("\n") + 1 + start_char = start_pos - preceding_content.rfind("\n") - 1 nonmigrated_files[file].append((line_number, start_char)) if not nonmigrated_files: @@ -72,10 +76,13 @@ def main(): for line_number, start_char in issues: pass print(f"[error] {file_path}:{line_number}:{start_char}") - print("""[error]\t\tLogging message should use log"..." instead of s"..." and variables should be wrapped in `MDC`s. - Refer to Structured Logging Framework guidelines in the file `internal/Logging.scala`.""") + print( + """[error]\t\tLogging message should use log"..." instead of s"..." and variables should be wrapped in `MDC`s. + Refer to Structured Logging Framework guidelines in the file `internal/Logging.scala`.""" + ) sys.exit(-1) + if __name__ == "__main__": main() From 0bd4e8aa85a3f983fac98662774ca9d5aebcdb5f Mon Sep 17 00:00:00 2001 From: Amanda Liu Date: Mon, 8 Jul 2024 15:18:43 -0700 Subject: [PATCH 10/28] check only scala files --- dev/structured-logging-style.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/dev/structured-logging-style.py b/dev/structured-logging-style.py index 7138f7795b4b5..573fb2367556e 100755 --- a/dev/structured-logging-style.py +++ b/dev/structured-logging-style.py @@ -33,19 +33,14 @@ def main(): # Regex patterns for file paths to exclude from the Structured Logging style check excluded_file_patterns = [ "[Tt]est", - "/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala", "/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala", ] nonmigrated_files = {} - files = [ - file - for ext in ("*.scala", "*.java") - for file in glob.glob(os.path.join(SPARK_HOME, "**", ext), recursive=True) - ] + scala_files = glob.glob(os.path.join(SPARK_HOME, '**', '*.scala'), recursive=True) - for file in files: + for file in scala_files: skip_file = False for exclude_pattern in excluded_file_patterns: if re.search(exclude_pattern, file): From c8b1b2776c63e362854e45c93dfe6ecd97cac19f Mon Sep 17 00:00:00 2001 From: Amanda Liu Date: Mon, 8 Jul 2024 15:28:13 -0700 Subject: [PATCH 11/28] add CodeGenerator to exclude list due to large code formatter variable --- dev/structured-logging-style.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dev/structured-logging-style.py b/dev/structured-logging-style.py index 573fb2367556e..6ebdacef7329c 100755 --- a/dev/structured-logging-style.py +++ b/dev/structured-logging-style.py @@ -33,12 +33,13 @@ def main(): # Regex patterns for file paths to exclude from the Structured Logging style check excluded_file_patterns = [ "[Tt]est", + "/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala", "/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala", ] nonmigrated_files = {} - scala_files = glob.glob(os.path.join(SPARK_HOME, '**', '*.scala'), recursive=True) + scala_files = glob.glob(os.path.join(SPARK_HOME, "**", "*.scala"), recursive=True) for file in scala_files: skip_file = False From ae67d94686880935d6537beaeec361a9958d1831 Mon Sep 17 00:00:00 2001 From: Amanda Liu Date: Tue, 9 Jul 2024 05:41:34 -0700 Subject: [PATCH 12/28] check if script exists --- .github/workflows/build_and_test.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 62c76cd27794c..0420570ebb918 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -694,7 +694,10 @@ jobs: - name: Scala linter run: ./dev/lint-scala - name: Scala structured logging check - run: ./dev/structured-logging-style.py + run: | + if [ -f ./dev/structured-logging-style.py ]; then + ./dev/structured-logging-style.py + fi - name: Java linter run: ./dev/lint-java - name: Spark connect jvm client mima check From 9f17a40f90422b57e1233b452fcb710ed908e12f Mon Sep 17 00:00:00 2001 From: Amanda Liu Date: Tue, 9 Jul 2024 10:23:38 -0700 Subject: [PATCH 13/28] fix inner regex --- dev/structured-logging-style.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/dev/structured-logging-style.py b/dev/structured-logging-style.py index 6ebdacef7329c..126074a48a943 100755 --- a/dev/structured-logging-style.py +++ b/dev/structured-logging-style.py @@ -26,8 +26,8 @@ def main(): - log_pattern = r"log(?:Info|Warning|Error)\((.*?)\)" - inner_log_pattern = r'".*?"\.format\(.*\)|s?".*?(?:\$|\+(?!.*?[ |\t].*s?")).*' + log_pattern = r"log(?:Info|Warning|Error)\(.*?\)\n" + inner_log_pattern = r'".*?"\.format\(.*\)|s?".*?(?:\$|\+(?!.*?[ |\t].*s?")).*|.* \+ ".*?"' compiled_inner_log_pattern = re.compile(inner_log_pattern, flags=re.DOTALL) # Regex patterns for file paths to exclude from the Structured Logging style check @@ -57,7 +57,13 @@ def main(): if log_statements: nonmigrated_files[file] = [] for log_statement in log_statements: - if compiled_inner_log_pattern.fullmatch(log_statement.group(1)): + log_statement_str = log_statement.group(0).strip() + # trim first ( and last ) + first_paren_index = log_statement_str.find('(') + inner_log_statement = log_statement_str[first_paren_index + 1:-1] + + if compiled_inner_log_pattern.fullmatch(inner_log_statement): + # print(f"log statement: ${inner_log_statement}") start_pos = log_statement.start() preceding_content = content[:start_pos] line_number = preceding_content.count("\n") + 1 From 4bb0ccf43f28691fde7c0330e69e48a679b22bb6 Mon Sep 17 00:00:00 2001 From: Amanda Liu Date: Tue, 9 Jul 2024 16:53:12 -0700 Subject: [PATCH 14/28] add exclude file --- dev/structured-logging-style.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/dev/structured-logging-style.py b/dev/structured-logging-style.py index 126074a48a943..5d4408136504a 100755 --- a/dev/structured-logging-style.py +++ b/dev/structured-logging-style.py @@ -27,14 +27,16 @@ def main(): log_pattern = r"log(?:Info|Warning|Error)\(.*?\)\n" - inner_log_pattern = r'".*?"\.format\(.*\)|s?".*?(?:\$|\+(?!.*?[ |\t].*s?")).*|.* \+ ".*?"' - compiled_inner_log_pattern = re.compile(inner_log_pattern, flags=re.DOTALL) + inner_log_pattern = r'".*?"\.format\(.*\)|s?".*?"(?:\$|\+[^"]+)|[^"]+\+\s*".*?"' + compiled_inner_log_pattern = re.compile(inner_log_pattern) # Regex patterns for file paths to exclude from the Structured Logging style check excluded_file_patterns = [ "[Tt]est", "/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala", + "/Users/amanda.liu/Documents/Databricks/spark/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala", "/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala", + "core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala" ] nonmigrated_files = {} @@ -60,10 +62,13 @@ def main(): log_statement_str = log_statement.group(0).strip() # trim first ( and last ) first_paren_index = log_statement_str.find('(') - inner_log_statement = log_statement_str[first_paren_index + 1:-1] + inner_log_statement = re.sub(r'\s+', '', log_statement_str[first_paren_index + 1:-1]) + + # log_statement_str[first_paren_index + 1:-1].replace(" ", "").replace("\t", "").replace("\n", "") if compiled_inner_log_pattern.fullmatch(inner_log_statement): - # print(f"log statement: ${inner_log_statement}") + print(file) + print(f"log statement: ${inner_log_statement}") start_pos = log_statement.start() preceding_content = content[:start_pos] line_number = preceding_content.count("\n") + 1 @@ -75,13 +80,16 @@ def main(): sys.exit(0) else: for file_path, issues in nonmigrated_files.items(): + if issues: + pass + # print(file_path) for line_number, start_char in issues: pass - print(f"[error] {file_path}:{line_number}:{start_char}") - print( - """[error]\t\tLogging message should use log"..." instead of s"..." and variables should be wrapped in `MDC`s. - Refer to Structured Logging Framework guidelines in the file `internal/Logging.scala`.""" - ) + # print(f"[error] {file_path}:{line_number}:{start_char}") + # print( + # """[error]\t\tLogging message should use log"..." instead of s"..." and variables should be wrapped in `MDC`s. + # Refer to Structured Logging Framework guidelines in the file `internal/Logging.scala`.""" + # ) sys.exit(-1) From 97c3a744e908d112e9102a49f5794477b8ff2b1d Mon Sep 17 00:00:00 2001 From: Amanda Liu Date: Tue, 9 Jul 2024 17:10:36 -0700 Subject: [PATCH 15/28] revise error message --- dev/structured-logging-style.py | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/dev/structured-logging-style.py b/dev/structured-logging-style.py index 5d4408136504a..4d2dfadbb2741 100755 --- a/dev/structured-logging-style.py +++ b/dev/structured-logging-style.py @@ -36,7 +36,7 @@ def main(): "/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala", "/Users/amanda.liu/Documents/Databricks/spark/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala", "/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala", - "core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala" + "core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala", ] nonmigrated_files = {} @@ -61,14 +61,12 @@ def main(): for log_statement in log_statements: log_statement_str = log_statement.group(0).strip() # trim first ( and last ) - first_paren_index = log_statement_str.find('(') - inner_log_statement = re.sub(r'\s+', '', log_statement_str[first_paren_index + 1:-1]) - - # log_statement_str[first_paren_index + 1:-1].replace(" ", "").replace("\t", "").replace("\n", "") + first_paren_index = log_statement_str.find("(") + inner_log_statement = re.sub( + r"\s+", "", log_statement_str[first_paren_index + 1 : -1] + ) if compiled_inner_log_pattern.fullmatch(inner_log_statement): - print(file) - print(f"log statement: ${inner_log_statement}") start_pos = log_statement.start() preceding_content = content[:start_pos] line_number = preceding_content.count("\n") + 1 @@ -80,16 +78,12 @@ def main(): sys.exit(0) else: for file_path, issues in nonmigrated_files.items(): - if issues: - pass - # print(file_path) for line_number, start_char in issues: - pass - # print(f"[error] {file_path}:{line_number}:{start_char}") - # print( - # """[error]\t\tLogging message should use log"..." instead of s"..." and variables should be wrapped in `MDC`s. - # Refer to Structured Logging Framework guidelines in the file `internal/Logging.scala`.""" - # ) + print(f"[error] {file_path}:{line_number}:{start_char}") + print( + """[error]\tLogging message should use Structured Logging Framework style, such as log"...${MDC(TASK_ID, taskId)..." + Refer to the guidelines in the file `internal/Logging.scala`.""" + ) sys.exit(-1) From 1b3950d324409493289446ab244baa62fe33de58 Mon Sep 17 00:00:00 2001 From: Amanda Liu Date: Wed, 10 Jul 2024 05:46:03 -0700 Subject: [PATCH 16/28] update error message --- dev/structured-logging-style.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dev/structured-logging-style.py b/dev/structured-logging-style.py index 4d2dfadbb2741..de02aebd6a404 100755 --- a/dev/structured-logging-style.py +++ b/dev/structured-logging-style.py @@ -33,9 +33,9 @@ def main(): # Regex patterns for file paths to exclude from the Structured Logging style check excluded_file_patterns = [ "[Tt]est", - "/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala", - "/Users/amanda.liu/Documents/Databricks/spark/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala", - "/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala", + "sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala", + "streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala", + "sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala", "core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala", ] @@ -81,7 +81,7 @@ def main(): for line_number, start_char in issues: print(f"[error] {file_path}:{line_number}:{start_char}") print( - """[error]\tLogging message should use Structured Logging Framework style, such as log"...${MDC(TASK_ID, taskId)..." + """[error]\tPlease use the Structured Logging Framework for logging messages with variables. For example: log"...${MDC(TASK_ID, taskId)...". Refer to the guidelines in the file `internal/Logging.scala`.""" ) From 6f47405dec7bd0b14c28705fac8c0223f16a7df6 Mon Sep 17 00:00:00 2001 From: Amanda Liu Date: Wed, 10 Jul 2024 06:04:28 -0700 Subject: [PATCH 17/28] check if file is a directory --- dev/structured-logging-style.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/structured-logging-style.py b/dev/structured-logging-style.py index de02aebd6a404..c78b75f3eb3d9 100755 --- a/dev/structured-logging-style.py +++ b/dev/structured-logging-style.py @@ -50,7 +50,7 @@ def main(): skip_file = True break - if not skip_file: + if not skip_file and not os.path.isdir(file): with open(file, "r") as f: content = f.read() From dd0fb478b06a4abbdbaf86341eddad6371e30fca Mon Sep 17 00:00:00 2001 From: Amanda Liu Date: Wed, 10 Jul 2024 15:25:57 -0700 Subject: [PATCH 18/28] update regex --- dev/structured-logging-style.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/structured-logging-style.py b/dev/structured-logging-style.py index c78b75f3eb3d9..999f0a2bd02cb 100755 --- a/dev/structured-logging-style.py +++ b/dev/structured-logging-style.py @@ -27,7 +27,7 @@ def main(): log_pattern = r"log(?:Info|Warning|Error)\(.*?\)\n" - inner_log_pattern = r'".*?"\.format\(.*\)|s?".*?"(?:\$|\+[^"]+)|[^"]+\+\s*".*?"' + inner_log_pattern = r'".*?"\.format\(.*\)|s?".*?(?:\$|\+).*|[^"]+\+\s*".*?"' compiled_inner_log_pattern = re.compile(inner_log_pattern) # Regex patterns for file paths to exclude from the Structured Logging style check From d6206a90787361289d0af00b55edf37a5dbe6cf9 Mon Sep 17 00:00:00 2001 From: Amanda Liu Date: Thu, 11 Jul 2024 09:01:49 -0700 Subject: [PATCH 19/28] rename script --- dev/structured_logging_style.py | 91 +++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100755 dev/structured_logging_style.py diff --git a/dev/structured_logging_style.py b/dev/structured_logging_style.py new file mode 100755 index 0000000000000..9f56cbd2ab357 --- /dev/null +++ b/dev/structured_logging_style.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python3 + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +import sys +import re +import glob + + +def main(): + log_pattern = r"log(?:Info|Warning|Error)\(.*?\)\n" + inner_log_pattern = r'".*?"\.format\(.*\)|s?".*?(?:\$|\+).*|[^"]+\+\s*".*?"' + compiled_inner_log_pattern = re.compile(inner_log_pattern) + + # Regex patterns for file paths to exclude from the Structured Logging style check + excluded_file_patterns = [ + "[Tt]est", + "sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala", + "streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala", + "sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala", + "core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala", + ] + + nonmigrated_files = {} + + scala_files = glob.glob(os.path.join("../", "**", "*.scala"), recursive=True) + + for file in scala_files: + skip_file = False + for exclude_pattern in excluded_file_patterns: + if re.search(exclude_pattern, file): + skip_file = True + break + + if not skip_file and not os.path.isdir(file): + with open(file, "r") as f: + content = f.read() + + log_statements = re.finditer(log_pattern, content, re.DOTALL) + + if log_statements: + nonmigrated_files[file] = [] + for log_statement in log_statements: + log_statement_str = log_statement.group(0).strip() + # trim first ( and last ) + first_paren_index = log_statement_str.find("(") + inner_log_statement = re.sub( + r"\s+", "", log_statement_str[first_paren_index + 1 : -1] + ) + + if compiled_inner_log_pattern.fullmatch(inner_log_statement): + start_pos = log_statement.start() + preceding_content = content[:start_pos] + line_number = preceding_content.count("\n") + 1 + start_char = start_pos - preceding_content.rfind("\n") - 1 + nonmigrated_files[file].append((line_number, start_char)) + + if not nonmigrated_files: + print("Structured logging style check passed.", file=sys.stderr) + sys.exit(0) + else: + for file_path, issues in nonmigrated_files.items(): + for line_number, start_char in issues: + print(f"[error] {file_path}:{line_number}:{start_char}", file=sys.stderr) + print( + """[error]\tPlease use the Structured Logging Framework for logging messages with variables. For example: log"...${MDC(TASK_ID, taskId)...". + Refer to the guidelines in the file `internal/Logging.scala`.""", + file=sys.stderr, + ) + + sys.exit(-1) + + +if __name__ == "__main__": + main() From 62e3c0d759db86305daef794e0b927aa34ec05ca Mon Sep 17 00:00:00 2001 From: Amanda Liu Date: Thu, 11 Jul 2024 10:42:00 -0700 Subject: [PATCH 20/28] style --- dev/structured_logging_style.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/structured_logging_style.py b/dev/structured_logging_style.py index 9f56cbd2ab357..f756633f4abf6 100755 --- a/dev/structured_logging_style.py +++ b/dev/structured_logging_style.py @@ -25,7 +25,7 @@ def main(): log_pattern = r"log(?:Info|Warning|Error)\(.*?\)\n" - inner_log_pattern = r'".*?"\.format\(.*\)|s?".*?(?:\$|\+).*|[^"]+\+\s*".*?"' + inner_log_pattern = r'".*?"\.format\(.*\)|s?".*?(?:\$|\+(?!")).*|[^"]+\+\s*".*?"' compiled_inner_log_pattern = re.compile(inner_log_pattern) # Regex patterns for file paths to exclude from the Structured Logging style check From e6d88b2bbcaa0790b39b4f34d16962813ca723fd Mon Sep 17 00:00:00 2001 From: Amanda Liu Date: Thu, 11 Jul 2024 14:18:18 -0700 Subject: [PATCH 21/28] update gha yml --- .github/workflows/build_and_test.yml | 4 +- dev/structured-logging-style.py | 92 ---------------------------- 2 files changed, 2 insertions(+), 94 deletions(-) delete mode 100755 dev/structured-logging-style.py diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 0420570ebb918..0069906a762c3 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -695,8 +695,8 @@ jobs: run: ./dev/lint-scala - name: Scala structured logging check run: | - if [ -f ./dev/structured-logging-style.py ]; then - ./dev/structured-logging-style.py + if [ -f ./dev/structured_logging_style.py ]; then + ./dev/structured_logging_style.py fi - name: Java linter run: ./dev/lint-java diff --git a/dev/structured-logging-style.py b/dev/structured-logging-style.py deleted file mode 100755 index 999f0a2bd02cb..0000000000000 --- a/dev/structured-logging-style.py +++ /dev/null @@ -1,92 +0,0 @@ -#!/usr/bin/env python3 - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import os -import sys -import re -import glob - -from sparktestsupport import SPARK_HOME - - -def main(): - log_pattern = r"log(?:Info|Warning|Error)\(.*?\)\n" - inner_log_pattern = r'".*?"\.format\(.*\)|s?".*?(?:\$|\+).*|[^"]+\+\s*".*?"' - compiled_inner_log_pattern = re.compile(inner_log_pattern) - - # Regex patterns for file paths to exclude from the Structured Logging style check - excluded_file_patterns = [ - "[Tt]est", - "sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala", - "streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala", - "sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala", - "core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala", - ] - - nonmigrated_files = {} - - scala_files = glob.glob(os.path.join(SPARK_HOME, "**", "*.scala"), recursive=True) - - for file in scala_files: - skip_file = False - for exclude_pattern in excluded_file_patterns: - if re.search(exclude_pattern, file): - skip_file = True - break - - if not skip_file and not os.path.isdir(file): - with open(file, "r") as f: - content = f.read() - - log_statements = re.finditer(log_pattern, content, re.DOTALL) - - if log_statements: - nonmigrated_files[file] = [] - for log_statement in log_statements: - log_statement_str = log_statement.group(0).strip() - # trim first ( and last ) - first_paren_index = log_statement_str.find("(") - inner_log_statement = re.sub( - r"\s+", "", log_statement_str[first_paren_index + 1 : -1] - ) - - if compiled_inner_log_pattern.fullmatch(inner_log_statement): - start_pos = log_statement.start() - preceding_content = content[:start_pos] - line_number = preceding_content.count("\n") + 1 - start_char = start_pos - preceding_content.rfind("\n") - 1 - nonmigrated_files[file].append((line_number, start_char)) - - if not nonmigrated_files: - print("Structured logging style check passed.") - sys.exit(0) - else: - for file_path, issues in nonmigrated_files.items(): - for line_number, start_char in issues: - print(f"[error] {file_path}:{line_number}:{start_char}") - print( - """[error]\tPlease use the Structured Logging Framework for logging messages with variables. For example: log"...${MDC(TASK_ID, taskId)...". - Refer to the guidelines in the file `internal/Logging.scala`.""" - ) - - sys.exit(-1) - - -if __name__ == "__main__": - main() From 72f0c1b5405236abeded3039eb0c09d088bfd967 Mon Sep 17 00:00:00 2001 From: Amanda Liu Date: Thu, 11 Jul 2024 14:58:24 -0700 Subject: [PATCH 22/28] line char max --- dev/structured_logging_style.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/dev/structured_logging_style.py b/dev/structured_logging_style.py index f756633f4abf6..52f62bfce426c 100755 --- a/dev/structured_logging_style.py +++ b/dev/structured_logging_style.py @@ -31,9 +31,11 @@ def main(): # Regex patterns for file paths to exclude from the Structured Logging style check excluded_file_patterns = [ "[Tt]est", - "sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala", + "sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen" + "/CodeGenerator.scala", "streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala", - "sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala", + "sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver" + "/SparkSQLCLIService.scala", "core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala", ] @@ -79,8 +81,9 @@ def main(): for line_number, start_char in issues: print(f"[error] {file_path}:{line_number}:{start_char}", file=sys.stderr) print( - """[error]\tPlease use the Structured Logging Framework for logging messages with variables. For example: log"...${MDC(TASK_ID, taskId)...". - Refer to the guidelines in the file `internal/Logging.scala`.""", + f"[error]\tPlease use the Structured Logging Framework for logging messages " + f'with variables. For example: log"...${{MDC(TASK_ID, taskId)}}..."' + f"\nRefer to the guidelines in the file `internal/Logging.scala`.", file=sys.stderr, ) From 7bdf67c9147a654b11f28810d33114f09aef6325 Mon Sep 17 00:00:00 2001 From: Amanda Liu Date: Mon, 15 Jul 2024 06:18:39 -0700 Subject: [PATCH 23/28] stylize the error message --- dev/structured_logging_style.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/structured_logging_style.py b/dev/structured_logging_style.py index 52f62bfce426c..897504bf6b16e 100755 --- a/dev/structured_logging_style.py +++ b/dev/structured_logging_style.py @@ -83,7 +83,7 @@ def main(): print( f"[error]\tPlease use the Structured Logging Framework for logging messages " f'with variables. For example: log"...${{MDC(TASK_ID, taskId)}}..."' - f"\nRefer to the guidelines in the file `internal/Logging.scala`.", + f"\n\tRefer to the guidelines in the file `internal/Logging.scala`.", file=sys.stderr, ) From bed425679caef710d2e95725dba51592a5d314d3 Mon Sep 17 00:00:00 2001 From: Amanda Liu Date: Wed, 17 Jul 2024 15:37:04 -0700 Subject: [PATCH 24/28] modify + regex --- dev/structured_logging_style.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/structured_logging_style.py b/dev/structured_logging_style.py index 897504bf6b16e..8ddd95015b00d 100755 --- a/dev/structured_logging_style.py +++ b/dev/structured_logging_style.py @@ -25,7 +25,7 @@ def main(): log_pattern = r"log(?:Info|Warning|Error)\(.*?\)\n" - inner_log_pattern = r'".*?"\.format\(.*\)|s?".*?(?:\$|\+(?!")).*|[^"]+\+\s*".*?"' + inner_log_pattern = r'".*?"\.format\(.*\)|s?".*?(?:\$|\"\+(?!s?")).*|[^"]+\+\s*".*?"' compiled_inner_log_pattern = re.compile(inner_log_pattern) # Regex patterns for file paths to exclude from the Structured Logging style check From a253657c4bd376cbb4fdd63cdbd277fd031b86d1 Mon Sep 17 00:00:00 2001 From: Amanda Liu Date: Fri, 19 Jul 2024 10:55:47 -0700 Subject: [PATCH 25/28] run script with python3.9 --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index b7c52831ce2cc..576f64f3a0869 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -696,7 +696,7 @@ jobs: - name: Scala structured logging check run: | if [ -f ./dev/structured_logging_style.py ]; then - ./dev/structured_logging_style.py + python3.9 ./dev/structured_logging_style.py fi - name: Java linter run: ./dev/lint-java From 0c223830bf3a692528c80b7fdea3b69e447391ad Mon Sep 17 00:00:00 2001 From: Amanda Liu Date: Fri, 19 Jul 2024 11:45:46 -0700 Subject: [PATCH 26/28] update success condition check --- dev/structured_logging_style.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/structured_logging_style.py b/dev/structured_logging_style.py index 8ddd95015b00d..ee43abbc8e8be 100755 --- a/dev/structured_logging_style.py +++ b/dev/structured_logging_style.py @@ -73,7 +73,7 @@ def main(): start_char = start_pos - preceding_content.rfind("\n") - 1 nonmigrated_files[file].append((line_number, start_char)) - if not nonmigrated_files: + if all(len(issues) == 0 for issues in nonmigrated_files.values()): print("Structured logging style check passed.", file=sys.stderr) sys.exit(0) else: From 2333388728a6ab3da939c7417f24b3e2aa884e96 Mon Sep 17 00:00:00 2001 From: Amanda Liu Date: Fri, 19 Jul 2024 17:47:39 -0700 Subject: [PATCH 27/28] lint fix --- dev/structured_logging_style.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dev/structured_logging_style.py b/dev/structured_logging_style.py index ee43abbc8e8be..4a97053817819 100755 --- a/dev/structured_logging_style.py +++ b/dev/structured_logging_style.py @@ -81,9 +81,9 @@ def main(): for line_number, start_char in issues: print(f"[error] {file_path}:{line_number}:{start_char}", file=sys.stderr) print( - f"[error]\tPlease use the Structured Logging Framework for logging messages " - f'with variables. For example: log"...${{MDC(TASK_ID, taskId)}}..."' - f"\n\tRefer to the guidelines in the file `internal/Logging.scala`.", + '[error]\tPlease use the Structured Logging Framework for logging messages ' + 'with variables. For example: log"...${{MDC(TASK_ID, taskId)}}..."' + '\n\tRefer to the guidelines in the file `internal/Logging.scala`.', file=sys.stderr, ) From 5f789893234460e7cb96534169ebad32be6d48ee Mon Sep 17 00:00:00 2001 From: Amanda Liu Date: Fri, 19 Jul 2024 21:26:37 -0700 Subject: [PATCH 28/28] reformat --- dev/structured_logging_style.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/structured_logging_style.py b/dev/structured_logging_style.py index 4a97053817819..eeeefab3a977c 100755 --- a/dev/structured_logging_style.py +++ b/dev/structured_logging_style.py @@ -81,9 +81,9 @@ def main(): for line_number, start_char in issues: print(f"[error] {file_path}:{line_number}:{start_char}", file=sys.stderr) print( - '[error]\tPlease use the Structured Logging Framework for logging messages ' + "[error]\tPlease use the Structured Logging Framework for logging messages " 'with variables. For example: log"...${{MDC(TASK_ID, taskId)}}..."' - '\n\tRefer to the guidelines in the file `internal/Logging.scala`.', + "\n\tRefer to the guidelines in the file `internal/Logging.scala`.", file=sys.stderr, )