Skip to content

Commit b7e827e

Browse files
Kangyan-Zhou authored and Johnsonms committed
A few updates to the night tests (sgl-project#17694)
1 parent 9c4b947 commit b7e827e

File tree

11 files changed: +52 / -26 lines changed

python/sglang/test/accuracy_test_runner.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ class AccuracyTestResult:
4040
baseline_accuracy: float
4141
error: Optional[str]
4242
latency: Optional[float] = None
43+
variant: Optional[str] = None
4344

4445

4546
def write_accuracy_github_summary(
@@ -54,16 +55,18 @@ def write_accuracy_github_summary(
5455
dataset: Dataset name used for evaluation
5556
results: List of AccuracyTestResult objects
5657
"""
57-
summary = f"## {test_name} - Accuracy ({dataset})\n"
58-
summary += "| model | status | score | baseline | error |\n"
59-
summary += "| ----- | ------ | ----- | -------- | ----- |\n"
58+
summary = f"#### {test_name} - Accuracy ({dataset})\n"
59+
summary += "| config | status | score | baseline | error |\n"
60+
summary += "| ------ | ------ | ----- | -------- | ----- |\n"
6061

6162
for result in results:
6263
status_emoji = "✅" if result.passed else "❌"
6364
score_str = f"{result.score:.4f}" if result.score is not None else "N/A"
6465
baseline_str = f"{result.baseline_accuracy:.4f}"
6566
error_str = result.error if result.error else "-"
66-
summary += f"| {result.model} | {status_emoji} | {score_str} | {baseline_str} | {error_str} |\n"
67+
# Use variant name if available, otherwise use model path
68+
config_name = result.variant if result.variant else result.model
69+
summary += f"| {config_name} | {status_emoji} | {score_str} | {baseline_str} | {error_str} |\n"
6770

6871
write_github_step_summary(summary)
6972

@@ -239,6 +242,7 @@ def run_accuracy_test(
239242
score=None,
240243
baseline_accuracy=params.baseline_accuracy,
241244
error=error,
245+
variant=model.variant,
242246
)
243247

244248
# Validate against baseline
@@ -265,4 +269,5 @@ def run_accuracy_test(
265269
baseline_accuracy=params.baseline_accuracy,
266270
error=error if not passed else None,
267271
latency=latency,
272+
variant=model.variant,
268273
)

test/registered/8-gpu-models/test_deepseek_v31.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ def test_deepseek_v31_all_variants(self):
6060

6161
run_combined_tests(
6262
models=variants,
63-
test_name="DeepSeek-V3.1 Unified",
63+
test_name="DeepSeek-V3.1",
6464
accuracy_params=AccuracyTestParams(
6565
dataset="gsm8k", baseline_accuracy=0.935
6666
),

test/registered/8-gpu-models/test_deepseek_v32.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ def test_deepseek_v32_all_variants(self):
8282

8383
run_combined_tests(
8484
models=variants,
85-
test_name="DeepSeek-V3.2 Unified",
85+
test_name="DeepSeek-V3.2",
8686
accuracy_params=AccuracyTestParams(
8787
dataset="gsm8k", baseline_accuracy=GSM8K_BASELINE
8888
),

test/registered/8-gpu-models/test_deepseek_v32_cp_single_node.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ def test_deepseek_v32_cp_variants(self):
6969
ModelLaunchSettings(
7070
DEEPSEEK_V32_EXP_MODEL_PATH,
7171
tp_size=8,
72-
extra_args=BASE_ARGS + ["--tp=8"] + MTP_ARGS + CP_ROUND_ROBIN_ARGS,
72+
extra_args=BASE_ARGS + MTP_ARGS + CP_ROUND_ROBIN_ARGS,
7373
variant="CP-round-robin-split",
7474
),
7575
]

test/registered/8-gpu-models/test_glm_46.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ def test_glm_46(self):
4040

4141
run_combined_tests(
4242
models=variants,
43-
test_name="GLM-4.6 Unified",
43+
test_name="GLM-4.6",
4444
accuracy_params=AccuracyTestParams(dataset="gsm8k", baseline_accuracy=0.80),
4545
performance_params=PerformanceTestParams(
4646
profile_dir="performance_profiles_glm_4_6",

test/registered/8-gpu-models/test_glm_46_fp8.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ def test_glm_46_fp8_all_variants(self):
5151

5252
run_combined_tests(
5353
models=variants,
54-
test_name="GLM-4.6-FP8 Unified",
54+
test_name="GLM-4.6-FP8",
5555
accuracy_params=AccuracyTestParams(dataset="gsm8k", baseline_accuracy=0.80),
5656
performance_params=PerformanceTestParams(
5757
profile_dir="performance_profiles_glm_4_6_fp8",

test/registered/8-gpu-models/test_kimi_k2.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def test_kimi_k2(self):
4141

4242
run_combined_tests(
4343
models=variants,
44-
test_name="Kimi-K2-Thinking Unified",
44+
test_name="Kimi-K2-Thinking",
4545
accuracy_params=AccuracyTestParams(dataset="gsm8k", baseline_accuracy=0.94),
4646
performance_params=PerformanceTestParams(
4747
profile_dir="performance_profiles_kimi_k2_thinking",

test/registered/8-gpu-models/test_llama4.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ def test_llama4(self):
4444

4545
run_combined_tests(
4646
models=variants,
47-
test_name="Llama-4-Scout Unified",
47+
test_name="Llama-4-Scout",
4848
accuracy_params=AccuracyTestParams(dataset="gsm8k", baseline_accuracy=0.9),
4949
performance_params=PerformanceTestParams(
5050
profile_dir="performance_profiles_llama4",

test/registered/8-gpu-models/test_minimax_m2.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ def test_minimax_m2(self):
4343

4444
run_combined_tests(
4545
models=variants,
46-
test_name="MiniMax-M2 Unified",
46+
test_name="MiniMax-M2",
4747
accuracy_params=AccuracyTestParams(dataset="gsm8k", baseline_accuracy=0.80),
4848
performance_params=PerformanceTestParams(
4949
profile_dir="performance_profiles_minimax_m2",

test/registered/8-gpu-models/test_mistral_large3.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ def test_mistral_large3_all_variants(self):
7676

7777
run_combined_tests(
7878
models=variants,
79-
test_name="Mistral-Large-3 Unified",
79+
test_name="Mistral-Large-3",
8080
accuracy_params=AccuracyTestParams(dataset="gsm8k", baseline_accuracy=0.90),
8181
performance_params=PerformanceTestParams(
8282
profile_dir="performance_profiles_mistral_large3",

0 commit comments

Comments
 (0)