A few updates to the nightly tests (sgl-project#17694)

Kangyan-Zhou · Johnsonms · commit b7e827ee2ce5 · 2026-02-14T01:41:18.000Z
diff --git a/python/sglang/test/accuracy_test_runner.py b/python/sglang/test/accuracy_test_runner.py
@@ -40,6 +40,7 @@ class AccuracyTestResult:
     baseline_accuracy: float
     error: Optional[str]
     latency: Optional[float] = None
+    variant: Optional[str] = None
 
 
 def write_accuracy_github_summary(
@@ -54,16 +55,18 @@ def write_accuracy_github_summary(
         dataset: Dataset name used for evaluation
         results: List of AccuracyTestResult objects
     """
-    summary = f"## {test_name} - Accuracy ({dataset})\n"
-    summary += "| model | status | score | baseline | error |\n"
-    summary += "| ----- | ------ | ----- | -------- | ----- |\n"
+    summary = f"#### {test_name} - Accuracy ({dataset})\n"
+    summary += "| config | status | score | baseline | error |\n"
+    summary += "| ------ | ------ | ----- | -------- | ----- |\n"
 
     for result in results:
         status_emoji = "✅" if result.passed else "❌"
         score_str = f"{result.score:.4f}" if result.score is not None else "N/A"
         baseline_str = f"{result.baseline_accuracy:.4f}"
         error_str = result.error if result.error else "-"
-        summary += f"| {result.model} | {status_emoji} | {score_str} | {baseline_str} | {error_str} |\n"
+        # Use variant name if available, otherwise use model path
+        config_name = result.variant if result.variant else result.model
+        summary += f"| {config_name} | {status_emoji} | {score_str} | {baseline_str} | {error_str} |\n"
 
     write_github_step_summary(summary)
 
@@ -239,6 +242,7 @@ def run_accuracy_test(
             score=None,
             baseline_accuracy=params.baseline_accuracy,
             error=error,
+            variant=model.variant,
         )
 
     # Validate against baseline
@@ -265,4 +269,5 @@ def run_accuracy_test(
         baseline_accuracy=params.baseline_accuracy,
         error=error if not passed else None,
         latency=latency,
+        variant=model.variant,
     )
diff --git a/test/registered/8-gpu-models/test_deepseek_v31.py b/test/registered/8-gpu-models/test_deepseek_v31.py
@@ -60,7 +60,7 @@ def test_deepseek_v31_all_variants(self):
 
         run_combined_tests(
             models=variants,
-            test_name="DeepSeek-V3.1 Unified",
+            test_name="DeepSeek-V3.1",
             accuracy_params=AccuracyTestParams(
                 dataset="gsm8k", baseline_accuracy=0.935
             ),
diff --git a/test/registered/8-gpu-models/test_deepseek_v32.py b/test/registered/8-gpu-models/test_deepseek_v32.py
@@ -82,7 +82,7 @@ def test_deepseek_v32_all_variants(self):
 
         run_combined_tests(
             models=variants,
-            test_name="DeepSeek-V3.2 Unified",
+            test_name="DeepSeek-V3.2",
             accuracy_params=AccuracyTestParams(
                 dataset="gsm8k", baseline_accuracy=GSM8K_BASELINE
             ),
diff --git a/test/registered/8-gpu-models/test_deepseek_v32_cp_single_node.py b/test/registered/8-gpu-models/test_deepseek_v32_cp_single_node.py
@@ -69,7 +69,7 @@ def test_deepseek_v32_cp_variants(self):
             ModelLaunchSettings(
                 DEEPSEEK_V32_EXP_MODEL_PATH,
                 tp_size=8,
-                extra_args=BASE_ARGS + ["--tp=8"] + MTP_ARGS + CP_ROUND_ROBIN_ARGS,
+                extra_args=BASE_ARGS + MTP_ARGS + CP_ROUND_ROBIN_ARGS,
                 variant="CP-round-robin-split",
             ),
         ]
diff --git a/test/registered/8-gpu-models/test_glm_46.py b/test/registered/8-gpu-models/test_glm_46.py
@@ -40,7 +40,7 @@ def test_glm_46(self):
 
         run_combined_tests(
             models=variants,
-            test_name="GLM-4.6 Unified",
+            test_name="GLM-4.6",
             accuracy_params=AccuracyTestParams(dataset="gsm8k", baseline_accuracy=0.80),
             performance_params=PerformanceTestParams(
                 profile_dir="performance_profiles_glm_4_6",
diff --git a/test/registered/8-gpu-models/test_glm_46_fp8.py b/test/registered/8-gpu-models/test_glm_46_fp8.py
@@ -51,7 +51,7 @@ def test_glm_46_fp8_all_variants(self):
 
         run_combined_tests(
             models=variants,
-            test_name="GLM-4.6-FP8 Unified",
+            test_name="GLM-4.6-FP8",
             accuracy_params=AccuracyTestParams(dataset="gsm8k", baseline_accuracy=0.80),
             performance_params=PerformanceTestParams(
                 profile_dir="performance_profiles_glm_4_6_fp8",
diff --git a/test/registered/8-gpu-models/test_kimi_k2.py b/test/registered/8-gpu-models/test_kimi_k2.py
@@ -41,7 +41,7 @@ def test_kimi_k2(self):
 
         run_combined_tests(
             models=variants,
-            test_name="Kimi-K2-Thinking Unified",
+            test_name="Kimi-K2-Thinking",
             accuracy_params=AccuracyTestParams(dataset="gsm8k", baseline_accuracy=0.94),
             performance_params=PerformanceTestParams(
                 profile_dir="performance_profiles_kimi_k2_thinking",
diff --git a/test/registered/8-gpu-models/test_llama4.py b/test/registered/8-gpu-models/test_llama4.py
@@ -44,7 +44,7 @@ def test_llama4(self):
 
         run_combined_tests(
             models=variants,
-            test_name="Llama-4-Scout Unified",
+            test_name="Llama-4-Scout",
             accuracy_params=AccuracyTestParams(dataset="gsm8k", baseline_accuracy=0.9),
             performance_params=PerformanceTestParams(
                 profile_dir="performance_profiles_llama4",
diff --git a/test/registered/8-gpu-models/test_minimax_m2.py b/test/registered/8-gpu-models/test_minimax_m2.py
@@ -43,7 +43,7 @@ def test_minimax_m2(self):
 
         run_combined_tests(
             models=variants,
-            test_name="MiniMax-M2 Unified",
+            test_name="MiniMax-M2",
             accuracy_params=AccuracyTestParams(dataset="gsm8k", baseline_accuracy=0.80),
             performance_params=PerformanceTestParams(
                 profile_dir="performance_profiles_minimax_m2",
diff --git a/test/registered/8-gpu-models/test_mistral_large3.py b/test/registered/8-gpu-models/test_mistral_large3.py
@@ -76,7 +76,7 @@ def test_mistral_large3_all_variants(self):
 
         run_combined_tests(
             models=variants,
-            test_name="Mistral-Large-3 Unified",
+            test_name="Mistral-Large-3",
             accuracy_params=AccuracyTestParams(dataset="gsm8k", baseline_accuracy=0.90),
             performance_params=PerformanceTestParams(
                 profile_dir="performance_profiles_mistral_large3",
diff --git a/test/registered/8-gpu-models/test_qwen3_235b.py b/test/registered/8-gpu-models/test_qwen3_235b.py
@@ -4,46 +4,67 @@
 from sglang.test.ci.ci_register import register_cuda_ci
 from sglang.test.performance_test_runner import PerformanceTestParams
 from sglang.test.run_combined_tests import run_combined_tests
-from sglang.test.test_utils import ModelLaunchSettings, is_blackwell_system
+from sglang.test.test_utils import ModelLaunchSettings
 
 # Runs on both H200 and B200 via nightly-8-gpu-common suite
 register_cuda_ci(est_time=1800, suite="nightly-8-gpu-common", nightly=True)
 
-QWEN3_235B_MODEL_PATH = "Qwen/Qwen3-235B-A22B-Instruct-2507"
+QWEN3_235B_FP8_MODEL_PATH = "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8"
+QWEN3_235B_EAGLE3_MODEL_PATH = (
+    "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge-Meituan"
+)
 
 
-@unittest.skipIf(not is_blackwell_system(), "Requires B200")
-class TestQwen3235B(unittest.TestCase):
-    """Unified test class for Qwen3-235B performance and accuracy.
+class TestQwen3235BFP8(unittest.TestCase):
+    """Test class for Qwen3-235B-FP8 performance and accuracy.
 
-    Single variant with simple TP=8 configuration.
-    Runs BOTH:
+    Two variants:
+    - basic: TP=8
+    - eagle3: TP=8 + EP=2 + EAGLE3 speculative decoding
+
+    Each variant runs BOTH:
     - Performance test (using NightlyBenchmarkRunner)
-    - Accuracy test (using run_eval with mgsm_en)
+    - Accuracy test (using run_eval with gsm8k)
     """
 
-    def test_qwen3_235b(self):
-        """Run performance and accuracy for Qwen3-235B."""
+    def test_qwen3_235b_fp8_all_variants(self):
+        """Run performance and accuracy for Qwen3-235B-FP8."""
         base_args = [
             "--tp=8",
             "--trust-remote-code",
         ]
+        eagle3_args = [
+            "--ep=2",
+            "--speculative-algorithm=EAGLE3",
+            f"--speculative-draft-model-path={QWEN3_235B_EAGLE3_MODEL_PATH}",
+            "--speculative-num-steps=3",
+            "--speculative-eagle-topk=1",
+            "--speculative-num-draft-tokens=4",
+        ]
 
         variants = [
+            # Variant: "basic" - TP=8
             ModelLaunchSettings(
-                QWEN3_235B_MODEL_PATH,
+                QWEN3_235B_FP8_MODEL_PATH,
                 tp_size=8,
                 extra_args=base_args,
                 variant="TP8",
             ),
+            # Variant: "eagle3" - TP=8 + EP=2 + EAGLE3 speculative decoding
+            ModelLaunchSettings(
+                QWEN3_235B_FP8_MODEL_PATH,
+                tp_size=8,
+                extra_args=base_args + eagle3_args,
+                variant="TP8+EP2+EAGLE3",
+            ),
         ]
 
         run_combined_tests(
             models=variants,
-            test_name="Qwen3-235B Unified",
+            test_name="Qwen3-235B-FP8",
             accuracy_params=AccuracyTestParams(dataset="gsm8k", baseline_accuracy=0.88),
             performance_params=PerformanceTestParams(
-                profile_dir="performance_profiles_qwen3_235b",
+                profile_dir="performance_profiles_qwen3_235b_fp8",
             ),
         )
 

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -69,7 +69,7 @@ def test_deepseek_v32_cp_variants(self):` |
| `69` | `69` | `ModelLaunchSettings(` |
| `70` | `70` | `DEEPSEEK_V32_EXP_MODEL_PATH,` |
| `71` | `71` | `tp_size=8,` |
| `72` |  | `- extra_args=BASE_ARGS + ["--tp=8"] + MTP_ARGS + CP_ROUND_ROBIN_ARGS,` |
|  | `72` | `+ extra_args=BASE_ARGS + MTP_ARGS + CP_ROUND_ROBIN_ARGS,` |
| `73` | `73` | `variant="CP-round-robin-split",` |
| `74` | `74` | `),` |
| `75` | `75` | `]` |