From 44731120f8a79d4feb98017cf4179e4ee472bd1d Mon Sep 17 00:00:00 2001
From: hsuan-lun-chiang
Date: Wed, 1 Jul 2026 02:40:10 +0000
Subject: [PATCH 1/7] Phase 1: Extract test durations to JUnit XML

---
 .github/workflows/run_tests_against_package.yml | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/.github/workflows/run_tests_against_package.yml b/.github/workflows/run_tests_against_package.yml
index 4b82a118e0..a019cf6725 100644
--- a/.github/workflows/run_tests_against_package.yml
+++ b/.github/workflows/run_tests_against_package.yml
@@ -195,6 +195,7 @@ jobs:
             -v \
             -m "${FINAL_PYTEST_MARKER}" \
             --durations=0 \
+            --junitxml=test-results-${INPUTS_DEVICE_TYPE}-${INPUTS_WORKER_GROUP}.xml \
             $PYTEST_COV_ARGS \
             $SPLIT_ARGS \
             ${INPUTS_PYTEST_EXTRA_ARGS}
@@ -227,3 +228,10 @@ jobs:
           # If scheduled, upload to scheduled flag only. If PR, upload to regular flag only.
           flags: ${{ inputs.is_scheduled_run == 'true' && 'scheduled' || 'regular' }}
           verbose: true
+      - name: Upload Test Results XML
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: test-results-${{ inputs.device_type }}-${{ inputs.worker_group }}
+          path: test-results-*.xml
+          if-no-files-found: ignore
From b32cb5a789d2a747f2320c485ef5f67592d92484 Mon Sep 17 00:00:00 2001
From: hsuan-lun-chiang
Date: Wed, 1 Jul 2026 03:58:09 +0000
Subject: [PATCH 2/7] Phase 2 & 3: Add workflow and script to track test
 performance regressions

---
 .../workflows/test_performance_tracker.yml | 62 ++++++++++++++++++
 tests/utils/parse_junit_to_benchmark.py    | 83 ++++++++++++++++++++++++
 2 files changed, 145 insertions(+)
 create mode 100644 .github/workflows/test_performance_tracker.yml
 create mode 100644 tests/utils/parse_junit_to_benchmark.py

diff --git a/.github/workflows/test_performance_tracker.yml b/.github/workflows/test_performance_tracker.yml
new file mode 100644
index 0000000000..1b317e1047
--- /dev/null
+++ b/.github/workflows/test_performance_tracker.yml
@@ -0,0 +1,62 @@
+name: Test Performance Tracker
+
+on:
+  workflow_run:
+    workflows: ["MaxText Package Tests"]
+    types:
+      - completed
+
+permissions:
+  contents: write
+  pull-requests: write
+
+jobs:
+  track-performance:
+    runs-on: ubuntu-latest
+    if: ${{ github.event.workflow_run.conclusion == 'success' || github.event.workflow_run.conclusion == 'failure' }}
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Download all test results
+        uses: actions/download-artifact@v4
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          run-id: ${{ github.event.workflow_run.id }}
+          path: test-results
+          pattern: test-results-*
+          merge-multiple: true
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+
+      - name: Parse JUnit XML to Benchmark format
+        run: |
+          python3 tests/utils/parse_junit_to_benchmark.py test-results benchmark-results.json
+
+      - name: Track Test Durations (Main)
+        if: github.event.workflow_run.head_branch == 'main'
+        uses: benchmark-action/github-action-benchmark@v1
+        with:
+          name: MaxText Test Execution Times
+          tool: 'customSmallerIsBetter'
+          output-file-path: benchmark-results.json
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          auto-push: true
+          alert-threshold: '150%'
+          comment-on-alert: true
+          fail-on-alert: false
+
+      - name: Verify Test Durations (PR)
+        if: github.event.workflow_run.head_branch != 'main'
+        uses: benchmark-action/github-action-benchmark@v1
+        with:
+          name: MaxText Test Execution Times
+          tool: 'customSmallerIsBetter'
+          output-file-path: benchmark-results.json
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          auto-push: false
+          alert-threshold: '150%'
+          comment-on-alert: true
+          fail-on-alert: true
diff --git a/tests/utils/parse_junit_to_benchmark.py b/tests/utils/parse_junit_to_benchmark.py
new file mode 100644
index 0000000000..87c6efb6f3
--- /dev/null
+++ b/tests/utils/parse_junit_to_benchmark.py
@@ -0,0 +1,83 @@
+"""Convert JUnit XML test reports into github-action-benchmark JSON."""
+
+import xml.etree.ElementTree as ET
+import glob
+import json
+import sys
+import os
+
+
+def _device_from_filename(xml_file):
+    """Return <device> from test-results-<device>-<group>.xml, else "unknown"."""
+    parts = os.path.basename(xml_file).replace(".xml", "").split("-")
+    return parts[2] if len(parts) >= 3 else "unknown"
+
+
+def _duration(testcase):
+    """Return the testcase's `time` attribute as a float; 0.0 if missing or invalid."""
+    try:
+        return float(testcase.get("time") or 0.0)
+    except ValueError:
+        return 0.0
+
+
+def main():
+    """Write per-test and per-device durations from JUnit XML files as JSON.
+
+    Reads sys.argv[1] (a directory of *.xml reports) and writes sys.argv[2],
+    a JSON list of {"name", "unit", "value"} entries: one per test case and
+    one total per device. Exits with status 1 when arguments are missing.
+    """
+    if len(sys.argv) < 3:
+        print("Usage: python parse_junit_to_benchmark.py <xml_dir> <output_json>")
+        sys.exit(1)
+
+    xml_dir = sys.argv[1]
+    output_json = sys.argv[2]
+
+    benchmarks = []
+    total_times_by_device = {}
+
+    xml_files = glob.glob(os.path.join(xml_dir, "*.xml"))
+    for xml_file in xml_files:
+        # e.g., test-results-tpu-1.xml -> device = tpu
+        device = _device_from_filename(xml_file)
+
+        try:
+            root = ET.parse(xml_file).getroot()
+        except Exception as e:  # best-effort: one bad report must not drop the rest
+            print(f"Error parsing {xml_file}: {e}")
+            continue
+
+        # Visit each <testcase> exactly once. Looping over every <testsuite>
+        # and then its descendants would double count nested suites.
+        for testcase in root.iter("testcase"):
+            name = testcase.get("name")
+            classname = testcase.get("classname")
+            time_val = _duration(testcase)
+
+            # Prefix with device to distinguish test times on different hardware
+            benchmarks.append({
+                "name": f"[{device.upper()}] {classname}::{name}",
+                "unit": "sec",
+                "value": time_val,
+            })
+            total_times_by_device[device] = (
+                total_times_by_device.get(device, 0.0) + time_val
+            )
+
+    for device, total_time in total_times_by_device.items():
+        benchmarks.append({
+            "name": f"Total {device.upper()} Test Suite Time",
+            "unit": "sec",
+            "value": total_time,
+        })
+
+    with open(output_json, "w", encoding="utf-8") as f:
+        json.dump(benchmarks, f, indent=2)
+
+    print(f"Parsed {len(xml_files)} XML files and extracted {len(benchmarks)} duration metrics.")
+
+
+if __name__ == "__main__":
+    main()
From 92f5b6bc59ff89ac6ee74449e53b96f6fb6ae6cb Mon Sep 17 00:00:00 2001
From: hsuan-lun-chiang
Date: Wed, 1 Jul 2026 06:51:23 +0000
Subject: [PATCH 3/7] Refactor tracking job directly into main CI pipeline so
 PRs can test the execution

---
 .github/workflows/ci_pipeline.yml | 55 +++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

diff --git a/.github/workflows/ci_pipeline.yml b/.github/workflows/ci_pipeline.yml
index 0f66a51e91..178e702eeb 100644
--- a/.github/workflows/ci_pipeline.yml
+++ b/.github/workflows/ci_pipeline.yml
@@ -351,3 +351,58 @@ jobs:
     with:
       failed_run_id: '${{ github.run_id }}'
     secrets: inherit
+
+  track_performance:
+    name: Track Test Performance
+    needs: [tpu-tests, gpu-tests, cpu-tests]
+    if: ${{ always() && !cancelled() }}
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+      pull-requests: write
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Download all test results
+        uses: actions/download-artifact@v4
+        with:
+          path: test-results
+          pattern: test-results-*
+          merge-multiple: true
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+
+      - name: Parse JUnit XML to Benchmark format
+        run: |
+          python3 tests/utils/parse_junit_to_benchmark.py test-results benchmark-results.json
+          echo "Parsed Benchmark Results:"
+          cat benchmark-results.json
+
+      - name: Track Test Durations (Main)
+        if: github.ref == 'refs/heads/main'
+        uses: benchmark-action/github-action-benchmark@v1
+        with:
+          name: MaxText Test Execution Times
+          tool: 'customSmallerIsBetter'
+          output-file-path: benchmark-results.json
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          auto-push: true
+          alert-threshold: '150%'
+          comment-on-alert: true
+          fail-on-alert: false
+
+      - name: Verify Test Durations (PR)
+        if: github.ref != 'refs/heads/main'
+        uses: benchmark-action/github-action-benchmark@v1
+        with:
+          name: MaxText Test Execution Times
+          tool: 'customSmallerIsBetter'
+          output-file-path: benchmark-results.json
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          auto-push: false
+          alert-threshold: '150%'
+          comment-on-alert: true
+          fail-on-alert: true
From 389854891b9679c8db09477c901463ebe19e99f9 Mon Sep 17 00:00:00 2001
From: hsuan-lun-chiang
Date: Wed, 1 Jul 2026 08:02:53 +0000
Subject: [PATCH 4/7] Use actions/cache for benchmark data storage

---
 .github/workflows/ci_pipeline.yml | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci_pipeline.yml b/.github/workflows/ci_pipeline.yml
index 178e702eeb..006d6f0aa0 100644
--- a/.github/workflows/ci_pipeline.yml
+++ b/.github/workflows/ci_pipeline.yml
@@ -377,10 +377,19 @@ jobs:
 
       - name: Parse JUnit XML to Benchmark format
         run: |
+          mkdir -p ./cache
           python3 tests/utils/parse_junit_to_benchmark.py test-results benchmark-results.json
           echo "Parsed Benchmark Results:"
           cat benchmark-results.json
 
+      - name: Restore Benchmark Data
+        uses: actions/cache@v4
+        with:
+          path: ./cache
+          key: ${{ runner.os }}-benchmark-${{ github.run_id }}
+          restore-keys: |
+            ${{ runner.os }}-benchmark-
+
       - name: Track Test Durations (Main)
         if: github.ref == 'refs/heads/main'
         uses: benchmark-action/github-action-benchmark@v1
@@ -388,8 +397,8 @@ jobs:
           name: MaxText Test Execution Times
           tool: 'customSmallerIsBetter'
           output-file-path: benchmark-results.json
+          external-data-json-path: ./cache/benchmark-data.json
           github-token: ${{ secrets.GITHUB_TOKEN }}
-          auto-push: true
           alert-threshold: '150%'
           comment-on-alert: true
           fail-on-alert: false
@@ -401,8 +410,8 @@ jobs:
           name: MaxText Test Execution Times
           tool: 'customSmallerIsBetter'
           output-file-path: benchmark-results.json
+          external-data-json-path: ./cache/benchmark-data.json
           github-token: ${{ secrets.GITHUB_TOKEN }}
-          auto-push: false
           alert-threshold: '150%'
           comment-on-alert: true
           fail-on-alert: true
From 390bb68c235eb978cbb8a45ba705f5eef0b07723 Mon Sep 17 00:00:00 2001
From: hsuan-lun-chiang
Date: Wed, 1 Jul 2026 08:37:40 +0000
Subject: [PATCH 5/7] Updated CI Pipeline for GCS-backed Durations

---
 .github/workflows/ci_pipeline.yml             | 24 +++++--
 .../workflows/test_performance_tracker.yml    | 62 -------------------
 2 files changed, 18 insertions(+), 68 deletions(-)
 delete mode 100644 .github/workflows/test_performance_tracker.yml

diff --git a/.github/workflows/ci_pipeline.yml b/.github/workflows/ci_pipeline.yml
index 006d6f0aa0..d679c59f22 100644
--- a/.github/workflows/ci_pipeline.yml
+++ b/.github/workflows/ci_pipeline.yml
@@ -359,6 +359,7 @@ jobs:
     runs-on: ubuntu-latest
     permissions:
       contents: write
+      id-token: write
       pull-requests: write
     steps:
       - uses: actions/checkout@v4
@@ -382,13 +383,19 @@ jobs:
           echo "Parsed Benchmark Results:"
           cat benchmark-results.json
 
-      - name: Restore Benchmark Data
-        uses: actions/cache@v4
+      - name: Authenticate to Google Cloud
+        uses: google-github-actions/auth@v2
         with:
-          path: ./cache
-          key: ${{ runner.os }}-benchmark-${{ github.run_id }}
-          restore-keys: |
-            ${{ runner.os }}-benchmark-
+          workload_identity_provider: ${{ vars.GCP_WIF_PROVIDER }}
+          service_account: ${{ vars.SERVICE_ACCOUNT_EMAIL }}
+
+      - name: Set up Cloud SDK
+        uses: google-github-actions/setup-gcloud@v2
+
+      - name: Fetch Baseline Benchmark Data from GCS
+        run: |
+          mkdir -p ./cache
+          gcloud storage cp gs://maxtext-test-assets/benchmark-data.json ./cache/benchmark-data.json || true
 
       - name: Track Test Durations (Main)
         if: github.ref == 'refs/heads/main'
@@ -415,3 +422,8 @@ jobs:
           alert-threshold: '150%'
           comment-on-alert: true
           fail-on-alert: true
+
+      - name: Upload Updated Baseline to GCS
+        if: github.ref == 'refs/heads/main'
+        run: |
+          gcloud storage cp ./cache/benchmark-data.json gs://maxtext-test-assets/benchmark-data.json
diff --git a/.github/workflows/test_performance_tracker.yml b/.github/workflows/test_performance_tracker.yml
deleted file mode 100644
index 1b317e1047..0000000000
--- a/.github/workflows/test_performance_tracker.yml
+++ /dev/null
@@ -1,62 +0,0 @@
-name: Test Performance Tracker
-
-on:
-  workflow_run:
-    workflows: ["MaxText Package Tests"]
-    types:
-      - completed
-
-permissions:
-  contents: write
-  pull-requests: write
-
-jobs:
-  track-performance:
-    runs-on: ubuntu-latest
-    if: ${{ github.event.workflow_run.conclusion == 'success' || github.event.workflow_run.conclusion == 'failure' }}
-    steps:
-      - uses: actions/checkout@v4
-
-      - name: Download all test results
-        uses: actions/download-artifact@v4
-        with:
-          github-token: ${{ secrets.GITHUB_TOKEN }}
-          run-id: ${{ github.event.workflow_run.id }}
-          path: test-results
-          pattern: test-results-*
-          merge-multiple: true
-
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.10'
-
-      - name: Parse JUnit XML to Benchmark format
-        run: |
-          python3 tests/utils/parse_junit_to_benchmark.py test-results benchmark-results.json
-
-      - name: Track Test Durations (Main)
-        if: github.event.workflow_run.head_branch == 'main'
-        uses: benchmark-action/github-action-benchmark@v1
-        with:
-          name: MaxText Test Execution Times
-          tool: 'customSmallerIsBetter'
-          output-file-path: benchmark-results.json
-          github-token: ${{ secrets.GITHUB_TOKEN }}
-          auto-push: true
-          alert-threshold: '150%'
-          comment-on-alert: true
-          fail-on-alert: false
-
-      - name: Verify Test Durations (PR)
-        if: github.event.workflow_run.head_branch != 'main'
-        uses: benchmark-action/github-action-benchmark@v1
-        with:
-          name: MaxText Test Execution Times
-          tool: 'customSmallerIsBetter'
-          output-file-path: benchmark-results.json
-          github-token: ${{ secrets.GITHUB_TOKEN }}
-          auto-push: false
-          alert-threshold: '150%'
-          comment-on-alert: true
-          fail-on-alert: true
From dec2070e43ef7ee271fe5ff4d23bd2efb8de82f5 Mon Sep 17 00:00:00 2001
From: hsuan-lun-chiang
Date: Wed, 1 Jul 2026 08:46:51 +0000
Subject: [PATCH 6/7] ci: allow baseline upload to GCS bucket from any branch
 for testing

---
 .github/workflows/ci_pipeline.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/ci_pipeline.yml b/.github/workflows/ci_pipeline.yml
index d679c59f22..ff8d847d50 100644
--- a/.github/workflows/ci_pipeline.yml
+++ b/.github/workflows/ci_pipeline.yml
@@ -424,6 +424,5 @@ jobs:
           fail-on-alert: true
 
       - name: Upload Updated Baseline to GCS
-        if: github.ref == 'refs/heads/main'
         run: |
           gcloud storage cp ./cache/benchmark-data.json gs://maxtext-test-assets/benchmark-data.json
From ee796e87cc0c925bd4a879123284cde34f211085 Mon Sep 17 00:00:00 2001
From: hsuan-lun-chiang
Date: Wed, 1 Jul 2026 09:35:47 +0000
Subject: [PATCH 7/7] ci: fix track_performance job permission by running in
 cloud-sdk container on self-hosted buildkit runner

---
 .github/workflows/ci_pipeline.yml | 22 ++++++----------------
 1 file changed, 6 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/ci_pipeline.yml b/.github/workflows/ci_pipeline.yml
index ff8d847d50..773b358bef 100644
--- a/.github/workflows/ci_pipeline.yml
+++ b/.github/workflows/ci_pipeline.yml
@@ -356,13 +356,17 @@ jobs:
     name: Track Test Performance
     needs: [tpu-tests, gpu-tests, cpu-tests]
     if: ${{ always() && !cancelled() }}
-    runs-on: ubuntu-latest
+    runs-on: linux-x86-ct6e-180-4tpu
+    container: google/cloud-sdk:524.0.0
     permissions:
       contents: write
       id-token: write
       pull-requests: write
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v5
+
+      - name: Mark git repositories as safe
+        run: git config --global --add safe.directory ${GITHUB_WORKSPACE}
 
       - name: Download all test results
         uses: actions/download-artifact@v4
@@ -371,11 +375,6 @@ jobs:
           pattern: test-results-*
           merge-multiple: true
 
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.10'
-
       - name: Parse JUnit XML to Benchmark format
         run: |
           mkdir -p ./cache
@@ -383,15 +382,6 @@ jobs:
           echo "Parsed Benchmark Results:"
           cat benchmark-results.json
 
-      - name: Authenticate to Google Cloud
-        uses: google-github-actions/auth@v2
-        with:
-          workload_identity_provider: ${{ vars.GCP_WIF_PROVIDER }}
-          service_account: ${{ vars.SERVICE_ACCOUNT_EMAIL }}
-
-      - name: Set up Cloud SDK
-        uses: google-github-actions/setup-gcloud@v2
-
       - name: Fetch Baseline Benchmark Data from GCS
         run: |
           mkdir -p ./cache