From 44731120f8a79d4feb98017cf4179e4ee472bd1d Mon Sep 17 00:00:00 2001
From: hsuan-lun-chiang <hsuan-lun.chiang@cienet.com>
Date: Wed, 1 Jul 2026 02:40:10 +0000
Subject: [PATCH 1/7] Phase 1: Extract test durations to JUnit XML

---
 .github/workflows/run_tests_against_package.yml | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/.github/workflows/run_tests_against_package.yml b/.github/workflows/run_tests_against_package.yml
index 4b82a118e0..a019cf6725 100644
--- a/.github/workflows/run_tests_against_package.yml
+++ b/.github/workflows/run_tests_against_package.yml
@@ -195,6 +195,7 @@ jobs:
               -v \
               -m "${FINAL_PYTEST_MARKER}" \
               --durations=0 \
+              --junitxml="test-results-${INPUTS_DEVICE_TYPE}-${INPUTS_WORKER_GROUP}.xml" \
               $PYTEST_COV_ARGS \
               $SPLIT_ARGS \
               ${INPUTS_PYTEST_EXTRA_ARGS}
@@ -227,3 +228,10 @@ jobs:
           # If scheduled, upload to scheduled flag only. If PR, upload to regular flag only.
           flags: ${{ inputs.is_scheduled_run == 'true' && 'scheduled' || 'regular' }}
           verbose: true
+      - name: Upload Test Results XML  # publishes the JUnit report written by pytest --junitxml
+        if: always()  # run even when an earlier step (e.g. pytest) failed
+        uses: actions/upload-artifact@v4
+        with:
+          name: test-results-${{ inputs.device_type }}-${{ inputs.worker_group }}  # NOTE(review): presumably the same stem as the XML filename - confirm inputs map to INPUTS_* env vars
+          path: test-results-*.xml  # glob for the report file(s) named by --junitxml
+          if-no-files-found: ignore  # no warning or error when no report was produced

From b32cb5a789d2a747f2320c485ef5f67592d92484 Mon Sep 17 00:00:00 2001
From: hsuan-lun-chiang <hsuan-lun.chiang@cienet.com>
Date: Wed, 1 Jul 2026 03:58:09 +0000
Subject: [PATCH 2/7] Phase 2 & 3: Add workflow and script to track test
 performance regressions

---
 .../workflows/test_performance_tracker.yml    | 62 ++++++++++++++++++
 tests/utils/parse_junit_to_benchmark.py       | 65 +++++++++++++++++++
 2 files changed, 127 insertions(+)
 create mode 100644 .github/workflows/test_performance_tracker.yml
 create mode 100644 tests/utils/parse_junit_to_benchmark.py

diff --git a/.github/workflows/test_performance_tracker.yml b/.github/workflows/test_performance_tracker.yml
new file mode 100644
index 0000000000..1b317e1047
--- /dev/null
+++ b/.github/workflows/test_performance_tracker.yml
@@ -0,0 +1,62 @@
+name: Test Performance Tracker
+
+on:
+  workflow_run:
+    workflows: ["MaxText Package Tests"]
+    types:
+      - completed
+
+permissions:
+  contents: write
+  pull-requests: write
+
+jobs:
+  track-performance:
+    runs-on: ubuntu-latest
+    if: ${{ github.event.workflow_run.conclusion == 'success' || github.event.workflow_run.conclusion == 'failure' }}
+    steps:
+      - uses: actions/checkout@v4
+      
+      - name: Download all test results
+        uses: actions/download-artifact@v4
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          run-id: ${{ github.event.workflow_run.id }}
+          path: test-results
+          pattern: test-results-*
+          merge-multiple: true
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+
+      - name: Parse JUnit XML to Benchmark format
+        run: |
+          python3 tests/utils/parse_junit_to_benchmark.py test-results benchmark-results.json
+
+      - name: Track Test Durations (Main)
+        if: github.event.workflow_run.head_branch == 'main'
+        uses: benchmark-action/github-action-benchmark@v1
+        with:
+          name: MaxText Test Execution Times
+          tool: 'customSmallerIsBetter'
+          output-file-path: benchmark-results.json
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          auto-push: true
+          alert-threshold: '150%'
+          comment-on-alert: true
+          fail-on-alert: false
+
+      - name: Verify Test Durations (PR)
+        if: github.event.workflow_run.head_branch != 'main'
+        uses: benchmark-action/github-action-benchmark@v1
+        with:
+          name: MaxText Test Execution Times
+          tool: 'customSmallerIsBetter'
+          output-file-path: benchmark-results.json
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          auto-push: false
+          alert-threshold: '150%'
+          comment-on-alert: true
+          fail-on-alert: true
diff --git a/tests/utils/parse_junit_to_benchmark.py b/tests/utils/parse_junit_to_benchmark.py
new file mode 100644
index 0000000000..87c6efb6f3
--- /dev/null
+++ b/tests/utils/parse_junit_to_benchmark.py
@@ -0,0 +1,65 @@
+"""Convert pytest JUnit XML reports into github-action-benchmark JSON."""
+import xml.etree.ElementTree as ET
+import glob
+import json
+import sys
+import os
+
+
+def _device_from_filename(xml_file):
+    """Return <device> from `test-results-<device>-<group>.xml`, else "unknown"."""
+    parts = os.path.splitext(os.path.basename(xml_file))[0].split("-")
+    return parts[2] if len(parts) >= 3 else "unknown"
+
+
+def main():
+    """Write one entry per test case plus a per-device total to <output_json>.
+
+    Reads every `*.xml` directly inside <xml_dir>; unparsable files are skipped.
+    """
+    if len(sys.argv) < 3:
+        print("Usage: python parse_junit_to_benchmark.py <xml_dir> <output_json>")
+        sys.exit(1)
+
+    xml_dir, output_json = sys.argv[1], sys.argv[2]
+    benchmarks = []
+    total_times_by_device = {}
+
+    # Sorted so the emitted JSON is stable across runs and filesystems.
+    xml_files = sorted(glob.glob(os.path.join(xml_dir, "*.xml")))
+    for xml_file in xml_files:
+        device = _device_from_filename(xml_file)
+        try:
+            root = ET.parse(xml_file).getroot()
+        except Exception as e:
+            print(f"Error parsing {xml_file}: {e}")
+            continue
+
+        # Iterate test cases directly: walking each <testsuite> and then all of
+        # its descendants would count cases in nested suites more than once.
+        for testcase in root.iter('testcase'):
+            name = testcase.get('name')
+            classname = testcase.get('classname')
+            try:
+                time_val = float(testcase.get('time') or 0.0)
+            except ValueError:
+                time_val = 0.0  # malformed duration: keep the entry, count it as zero
+
+            # Prefix with device to distinguish test times on different hardware
+            full_name = f"[{device.upper()}] {classname}::{name}"
+            benchmarks.append({"name": full_name, "unit": "sec", "value": time_val})
+            total_times_by_device[device] = total_times_by_device.get(device, 0.0) + time_val
+
+    # One aggregate entry per device, appended after all per-test entries.
+    for device, total_time in total_times_by_device.items():
+        total_name = f"Total {device.upper()} Test Suite Time"
+        benchmarks.append({"name": total_name, "unit": "sec", "value": total_time})
+
+    with open(output_json, "w") as f:
+        json.dump(benchmarks, f, indent=2)
+
+    print(f"Parsed {len(xml_files)} XML files and extracted {len(benchmarks)} duration metrics.")
+
+
+if __name__ == "__main__":
+    main()

From 92f5b6bc59ff89ac6ee74449e53b96f6fb6ae6cb Mon Sep 17 00:00:00 2001
From: hsuan-lun-chiang <hsuan-lun.chiang@cienet.com>
Date: Wed, 1 Jul 2026 06:51:23 +0000
Subject: [PATCH 3/7] Refactor tracking job directly into main CI pipeline so
 PRs can test the execution

---
 .github/workflows/ci_pipeline.yml | 55 +++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

diff --git a/.github/workflows/ci_pipeline.yml b/.github/workflows/ci_pipeline.yml
index 0f66a51e91..178e702eeb 100644
--- a/.github/workflows/ci_pipeline.yml
+++ b/.github/workflows/ci_pipeline.yml
@@ -351,3 +351,58 @@ jobs:
     with:
       failed_run_id: '${{ github.run_id }}'
     secrets: inherit
+
+  track_performance:
+    name: Track Test Performance
+    needs: [tpu-tests, gpu-tests, cpu-tests]
+    if: ${{ always() && !cancelled() }}
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+      pull-requests: write
+    steps:
+      - uses: actions/checkout@v4
+      
+      - name: Download all test results
+        uses: actions/download-artifact@v4
+        with:
+          path: test-results
+          pattern: test-results-*
+          merge-multiple: true
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+
+      - name: Parse JUnit XML to Benchmark format
+        run: |
+          python3 tests/utils/parse_junit_to_benchmark.py test-results benchmark-results.json
+          echo "Parsed Benchmark Results:"
+          cat benchmark-results.json
+
+      - name: Track Test Durations (Main)
+        if: github.ref == 'refs/heads/main'
+        uses: benchmark-action/github-action-benchmark@v1
+        with:
+          name: MaxText Test Execution Times
+          tool: 'customSmallerIsBetter'
+          output-file-path: benchmark-results.json
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          auto-push: true
+          alert-threshold: '150%'
+          comment-on-alert: true
+          fail-on-alert: false
+
+      - name: Verify Test Durations (PR)
+        if: github.ref != 'refs/heads/main'
+        uses: benchmark-action/github-action-benchmark@v1
+        with:
+          name: MaxText Test Execution Times
+          tool: 'customSmallerIsBetter'
+          output-file-path: benchmark-results.json
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          auto-push: false
+          alert-threshold: '150%'
+          comment-on-alert: true
+          fail-on-alert: true

From 389854891b9679c8db09477c901463ebe19e99f9 Mon Sep 17 00:00:00 2001
From: hsuan-lun-chiang <hsuan-lun.chiang@cienet.com>
Date: Wed, 1 Jul 2026 08:02:53 +0000
Subject: [PATCH 4/7] Use actions/cache for benchmark data storage

---
 .github/workflows/ci_pipeline.yml | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci_pipeline.yml b/.github/workflows/ci_pipeline.yml
index 178e702eeb..006d6f0aa0 100644
--- a/.github/workflows/ci_pipeline.yml
+++ b/.github/workflows/ci_pipeline.yml
@@ -377,10 +377,19 @@ jobs:
 
       - name: Parse JUnit XML to Benchmark format
         run: |
+          mkdir -p ./cache
           python3 tests/utils/parse_junit_to_benchmark.py test-results benchmark-results.json
           echo "Parsed Benchmark Results:"
           cat benchmark-results.json
 
+      - name: Restore Benchmark Data
+        uses: actions/cache@v4
+        with:
+          path: ./cache
+          key: ${{ runner.os }}-benchmark-${{ github.run_id }}
+          restore-keys: |
+            ${{ runner.os }}-benchmark-
+
       - name: Track Test Durations (Main)
         if: github.ref == 'refs/heads/main'
         uses: benchmark-action/github-action-benchmark@v1
@@ -388,8 +397,8 @@ jobs:
           name: MaxText Test Execution Times
           tool: 'customSmallerIsBetter'
           output-file-path: benchmark-results.json
+          external-data-json-path: ./cache/benchmark-data.json
           github-token: ${{ secrets.GITHUB_TOKEN }}
-          auto-push: true
           alert-threshold: '150%'
           comment-on-alert: true
           fail-on-alert: false
@@ -401,8 +410,8 @@ jobs:
           name: MaxText Test Execution Times
           tool: 'customSmallerIsBetter'
           output-file-path: benchmark-results.json
+          external-data-json-path: ./cache/benchmark-data.json
           github-token: ${{ secrets.GITHUB_TOKEN }}
-          auto-push: false
           alert-threshold: '150%'
           comment-on-alert: true
           fail-on-alert: true

From 390bb68c235eb978cbb8a45ba705f5eef0b07723 Mon Sep 17 00:00:00 2001
From: hsuan-lun-chiang <hsuan-lun.chiang@cienet.com>
Date: Wed, 1 Jul 2026 08:37:40 +0000
Subject: [PATCH 5/7] Updated CI Pipeline for GCS-backed Durations

---
 .github/workflows/ci_pipeline.yml             | 24 +++++--
 .../workflows/test_performance_tracker.yml    | 62 -------------------
 2 files changed, 18 insertions(+), 68 deletions(-)
 delete mode 100644 .github/workflows/test_performance_tracker.yml

diff --git a/.github/workflows/ci_pipeline.yml b/.github/workflows/ci_pipeline.yml
index 006d6f0aa0..d679c59f22 100644
--- a/.github/workflows/ci_pipeline.yml
+++ b/.github/workflows/ci_pipeline.yml
@@ -359,6 +359,7 @@ jobs:
     runs-on: ubuntu-latest
     permissions:
       contents: write
+      id-token: write
       pull-requests: write
     steps:
       - uses: actions/checkout@v4
@@ -382,13 +383,19 @@ jobs:
           echo "Parsed Benchmark Results:"
           cat benchmark-results.json
 
-      - name: Restore Benchmark Data
-        uses: actions/cache@v4
+      - name: Authenticate to Google Cloud
+        uses: google-github-actions/auth@v2
         with:
-          path: ./cache
-          key: ${{ runner.os }}-benchmark-${{ github.run_id }}
-          restore-keys: |
-            ${{ runner.os }}-benchmark-
+          workload_identity_provider: ${{ vars.GCP_WIF_PROVIDER }}
+          service_account: ${{ vars.SERVICE_ACCOUNT_EMAIL }}
+
+      - name: Set up Cloud SDK
+        uses: google-github-actions/setup-gcloud@v2
+
+      - name: Fetch Baseline Benchmark Data from GCS
+        run: |
+          mkdir -p ./cache
+          gcloud storage cp gs://maxtext-test-assets/benchmark-data.json ./cache/benchmark-data.json || echo "::warning::Could not fetch baseline benchmark-data.json from GCS; continuing without a baseline"
 
       - name: Track Test Durations (Main)
         if: github.ref == 'refs/heads/main'
@@ -415,3 +422,8 @@ jobs:
           alert-threshold: '150%'
           comment-on-alert: true
           fail-on-alert: true
+
+      - name: Upload Updated Baseline to GCS
+        if: github.ref == 'refs/heads/main'
+        run: |
+          gcloud storage cp ./cache/benchmark-data.json gs://maxtext-test-assets/benchmark-data.json
diff --git a/.github/workflows/test_performance_tracker.yml b/.github/workflows/test_performance_tracker.yml
deleted file mode 100644
index 1b317e1047..0000000000
--- a/.github/workflows/test_performance_tracker.yml
+++ /dev/null
@@ -1,62 +0,0 @@
-name: Test Performance Tracker
-
-on:
-  workflow_run:
-    workflows: ["MaxText Package Tests"]
-    types:
-      - completed
-
-permissions:
-  contents: write
-  pull-requests: write
-
-jobs:
-  track-performance:
-    runs-on: ubuntu-latest
-    if: ${{ github.event.workflow_run.conclusion == 'success' || github.event.workflow_run.conclusion == 'failure' }}
-    steps:
-      - uses: actions/checkout@v4
-      
-      - name: Download all test results
-        uses: actions/download-artifact@v4
-        with:
-          github-token: ${{ secrets.GITHUB_TOKEN }}
-          run-id: ${{ github.event.workflow_run.id }}
-          path: test-results
-          pattern: test-results-*
-          merge-multiple: true
-
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.10'
-
-      - name: Parse JUnit XML to Benchmark format
-        run: |
-          python3 tests/utils/parse_junit_to_benchmark.py test-results benchmark-results.json
-
-      - name: Track Test Durations (Main)
-        if: github.event.workflow_run.head_branch == 'main'
-        uses: benchmark-action/github-action-benchmark@v1
-        with:
-          name: MaxText Test Execution Times
-          tool: 'customSmallerIsBetter'
-          output-file-path: benchmark-results.json
-          github-token: ${{ secrets.GITHUB_TOKEN }}
-          auto-push: true
-          alert-threshold: '150%'
-          comment-on-alert: true
-          fail-on-alert: false
-
-      - name: Verify Test Durations (PR)
-        if: github.event.workflow_run.head_branch != 'main'
-        uses: benchmark-action/github-action-benchmark@v1
-        with:
-          name: MaxText Test Execution Times
-          tool: 'customSmallerIsBetter'
-          output-file-path: benchmark-results.json
-          github-token: ${{ secrets.GITHUB_TOKEN }}
-          auto-push: false
-          alert-threshold: '150%'
-          comment-on-alert: true
-          fail-on-alert: true

From dec2070e43ef7ee271fe5ff4d23bd2efb8de82f5 Mon Sep 17 00:00:00 2001
From: hsuan-lun-chiang <hsuan-lun.chiang@cienet.com>
Date: Wed, 1 Jul 2026 08:46:51 +0000
Subject: [PATCH 6/7] ci: allow baseline upload to GCS bucket from any branch
 for testing

---
 .github/workflows/ci_pipeline.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/ci_pipeline.yml b/.github/workflows/ci_pipeline.yml
index d679c59f22..ff8d847d50 100644
--- a/.github/workflows/ci_pipeline.yml
+++ b/.github/workflows/ci_pipeline.yml
@@ -424,6 +424,7 @@ jobs:
           fail-on-alert: true
 
       - name: Upload Updated Baseline to GCS
-        if: github.ref == 'refs/heads/main'
+        # Only main or a manually dispatched run may overwrite the shared baseline; PR runs must not.
+        if: github.ref == 'refs/heads/main' || github.event_name == 'workflow_dispatch'
         run: |
           gcloud storage cp ./cache/benchmark-data.json gs://maxtext-test-assets/benchmark-data.json

From ee796e87cc0c925bd4a879123284cde34f211085 Mon Sep 17 00:00:00 2001
From: hsuan-lun-chiang <hsuan-lun.chiang@cienet.com>
Date: Wed, 1 Jul 2026 09:35:47 +0000
Subject: [PATCH 7/7] ci: fix track_performance job permission by running in
 cloud-sdk container on self-hosted buildkit runner

---
 .github/workflows/ci_pipeline.yml | 22 ++++++----------------
 1 file changed, 6 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/ci_pipeline.yml b/.github/workflows/ci_pipeline.yml
index ff8d847d50..773b358bef 100644
--- a/.github/workflows/ci_pipeline.yml
+++ b/.github/workflows/ci_pipeline.yml
@@ -356,13 +356,17 @@ jobs:
     name: Track Test Performance
     needs: [tpu-tests, gpu-tests, cpu-tests]
     if: ${{ always() && !cancelled() }}
-    runs-on: ubuntu-latest
+    runs-on: linux-x86-ct6e-180-4tpu
+    container: google/cloud-sdk:524.0.0
     permissions:
       contents: write
       id-token: write
       pull-requests: write
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v5
+
+      - name: Mark git repositories as safe  # NOTE(review): presumably avoids git's "dubious ownership" check inside the job container - confirm
+        run: git config --global --add safe.directory "${GITHUB_WORKSPACE}"
       
       - name: Download all test results
         uses: actions/download-artifact@v4
@@ -371,11 +375,6 @@ jobs:
           pattern: test-results-*
           merge-multiple: true
 
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.10'
-
       - name: Parse JUnit XML to Benchmark format
         run: |
           mkdir -p ./cache
@@ -383,15 +382,6 @@ jobs:
           echo "Parsed Benchmark Results:"
           cat benchmark-results.json
 
-      - name: Authenticate to Google Cloud
-        uses: google-github-actions/auth@v2
-        with:
-          workload_identity_provider: ${{ vars.GCP_WIF_PROVIDER }}
-          service_account: ${{ vars.SERVICE_ACCOUNT_EMAIL }}
-
-      - name: Set up Cloud SDK
-        uses: google-github-actions/setup-gcloud@v2
-
       - name: Fetch Baseline Benchmark Data from GCS
         run: |
           mkdir -p ./cache