Skip to content

Commit 695b284

Browse files
committed
Expand benchmark manifest matrices
1 parent b1697a8 commit 695b284

3 files changed

Lines changed: 309 additions & 4 deletions

File tree

crates/izwi-cli/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,8 @@ izwi --output-format json bench run benchmarks/local.toml --artifact-dir benchma
151151
izwi bench compare current.json baseline.json --tolerance-percent 5
152152
```
153153

154+
Benchmark manifests support `[benchmarks.matrix]` tables for cartesian runs across fields like `model`, `concurrent`, `max_tokens`, `file`, and `duration_secs`.
155+
154156
### Config
155157

156158
```bash

crates/izwi-cli/src/commands/bench.rs

Lines changed: 289 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -258,7 +258,7 @@ struct BenchmarkManifest {
258258
benchmarks: Vec<BenchmarkManifestCase>,
259259
}
260260

261-
#[derive(Debug, Deserialize)]
261+
#[derive(Debug, Clone, Deserialize)]
262262
struct BenchmarkManifestCase {
263263
name: Option<String>,
264264
command: String,
@@ -273,6 +273,43 @@ struct BenchmarkManifestCase {
273273
file: Option<String>,
274274
language: Option<String>,
275275
duration_secs: Option<u64>,
276+
matrix: Option<BenchmarkManifestMatrix>,
277+
}
278+
279+
#[derive(Debug, Clone, Deserialize, Default)]
280+
struct BenchmarkManifestMatrix {
281+
model: Option<Vec<String>>,
282+
iterations: Option<Vec<u32>>,
283+
concurrent: Option<Vec<u32>>,
284+
warmup: Option<Vec<bool>>,
285+
prompt: Option<Vec<String>>,
286+
system: Option<Vec<String>>,
287+
max_tokens: Option<Vec<usize>>,
288+
text: Option<Vec<String>>,
289+
file: Option<Vec<String>>,
290+
language: Option<Vec<String>>,
291+
duration_secs: Option<Vec<u64>>,
292+
}
293+
294+
#[derive(Debug, Clone)]
295+
struct MatrixDimension {
296+
key: &'static str,
297+
values: Vec<MatrixValue>,
298+
}
299+
300+
/// A single matrix value tagged with the benchmark-case field it overrides.
#[derive(Clone, Debug)]
enum MatrixValue {
    /// Overrides `model`.
    Model(String),
    /// Overrides `iterations`.
    Iterations(u32),
    /// Overrides `concurrent`.
    Concurrent(u32),
    /// Overrides `warmup`.
    Warmup(bool),
    /// Overrides `prompt`.
    Prompt(String),
    /// Overrides `system`.
    System(String),
    /// Overrides `max_tokens`.
    MaxTokens(usize),
    /// Overrides `text`.
    Text(String),
    /// Overrides `file`.
    File(String),
    /// Overrides `language`.
    Language(String),
    /// Overrides `duration_secs`.
    DurationSecs(u64),
}
277314

278315
#[derive(Debug, Serialize)]
@@ -530,6 +567,198 @@ fn format_case_list(cases: &[String]) -> String {
530567
}
531568
}
532569

570+
impl MatrixValue {
571+
fn apply(&self, case: &mut BenchmarkManifestCase) {
572+
match self {
573+
MatrixValue::Model(value) => case.model = Some(value.clone()),
574+
MatrixValue::Iterations(value) => case.iterations = Some(*value),
575+
MatrixValue::Concurrent(value) => case.concurrent = Some(*value),
576+
MatrixValue::Warmup(value) => case.warmup = Some(*value),
577+
MatrixValue::Prompt(value) => case.prompt = Some(value.clone()),
578+
MatrixValue::System(value) => case.system = Some(value.clone()),
579+
MatrixValue::MaxTokens(value) => case.max_tokens = Some(*value),
580+
MatrixValue::Text(value) => case.text = Some(value.clone()),
581+
MatrixValue::File(value) => case.file = Some(value.clone()),
582+
MatrixValue::Language(value) => case.language = Some(value.clone()),
583+
MatrixValue::DurationSecs(value) => case.duration_secs = Some(*value),
584+
}
585+
}
586+
587+
fn label_value(&self) -> String {
588+
match self {
589+
MatrixValue::Model(value)
590+
| MatrixValue::Prompt(value)
591+
| MatrixValue::System(value)
592+
| MatrixValue::Text(value)
593+
| MatrixValue::File(value)
594+
| MatrixValue::Language(value) => matrix_label_string(value),
595+
MatrixValue::Iterations(value) => value.to_string(),
596+
MatrixValue::Concurrent(value) => value.to_string(),
597+
MatrixValue::MaxTokens(value) => value.to_string(),
598+
MatrixValue::DurationSecs(value) => value.to_string(),
599+
MatrixValue::Warmup(value) => value.to_string(),
600+
}
601+
}
602+
}
603+
604+
impl BenchmarkManifestMatrix {
605+
fn dimensions(&self) -> Result<Vec<MatrixDimension>> {
606+
let mut dimensions = Vec::new();
607+
add_matrix_dimension(&mut dimensions, "model", &self.model, MatrixValue::Model)?;
608+
add_matrix_dimension(
609+
&mut dimensions,
610+
"iterations",
611+
&self.iterations,
612+
MatrixValue::Iterations,
613+
)?;
614+
add_matrix_dimension(
615+
&mut dimensions,
616+
"concurrent",
617+
&self.concurrent,
618+
MatrixValue::Concurrent,
619+
)?;
620+
add_matrix_dimension(&mut dimensions, "warmup", &self.warmup, MatrixValue::Warmup)?;
621+
add_matrix_dimension(&mut dimensions, "prompt", &self.prompt, MatrixValue::Prompt)?;
622+
add_matrix_dimension(&mut dimensions, "system", &self.system, MatrixValue::System)?;
623+
add_matrix_dimension(
624+
&mut dimensions,
625+
"max_tokens",
626+
&self.max_tokens,
627+
MatrixValue::MaxTokens,
628+
)?;
629+
add_matrix_dimension(&mut dimensions, "text", &self.text, MatrixValue::Text)?;
630+
add_matrix_dimension(&mut dimensions, "file", &self.file, MatrixValue::File)?;
631+
add_matrix_dimension(
632+
&mut dimensions,
633+
"language",
634+
&self.language,
635+
MatrixValue::Language,
636+
)?;
637+
add_matrix_dimension(
638+
&mut dimensions,
639+
"duration_secs",
640+
&self.duration_secs,
641+
MatrixValue::DurationSecs,
642+
)?;
643+
if dimensions.is_empty() {
644+
return Err(CliError::InvalidInput(
645+
"Benchmark matrix must include at least one non-empty field".to_string(),
646+
));
647+
}
648+
Ok(dimensions)
649+
}
650+
}
651+
652+
fn add_matrix_dimension<T, F>(
653+
dimensions: &mut Vec<MatrixDimension>,
654+
key: &'static str,
655+
values: &Option<Vec<T>>,
656+
map: F,
657+
) -> Result<()>
658+
where
659+
T: Clone,
660+
F: Fn(T) -> MatrixValue,
661+
{
662+
let Some(values) = values else {
663+
return Ok(());
664+
};
665+
if values.is_empty() {
666+
return Err(CliError::InvalidInput(format!(
667+
"Benchmark matrix field `{key}` must contain at least one value"
668+
)));
669+
}
670+
dimensions.push(MatrixDimension {
671+
key,
672+
values: values.iter().cloned().map(map).collect(),
673+
});
674+
Ok(())
675+
}
676+
677+
fn matrix_label_string(value: &str) -> String {
678+
let normalized = value.split_whitespace().collect::<Vec<_>>().join(" ");
679+
if normalized.chars().count() <= 32 {
680+
normalized
681+
} else {
682+
format!(
683+
"{}~{:016x}",
684+
normalized.chars().take(32).collect::<String>(),
685+
stable_label_hash(&normalized)
686+
)
687+
}
688+
}
689+
690+
/// Hashes the UTF-8 bytes of `value` with 64-bit FNV-1a.
///
/// A hand-rolled hash keeps the result identical across runs and platforms,
/// which matters because it ends up inside benchmark case names.
fn stable_label_hash(value: &str) -> u64 {
    const FNV_OFFSET_BASIS: u64 = 0xcbf29ce484222325;
    const FNV_PRIME: u64 = 0x100000001b3;

    value.bytes().fold(FNV_OFFSET_BASIS, |state, byte| {
        (state ^ u64::from(byte)).wrapping_mul(FNV_PRIME)
    })
}
698+
699+
fn expand_manifest_cases(manifest: &BenchmarkManifest) -> Result<Vec<BenchmarkManifestCase>> {
700+
let mut expanded = Vec::new();
701+
for case in &manifest.benchmarks {
702+
expanded.extend(expand_manifest_case(case)?);
703+
}
704+
reject_duplicate_manifest_case_names(&expanded)?;
705+
Ok(expanded)
706+
}
707+
708+
/// Expands one manifest entry into the cartesian product of its matrix.
///
/// An entry without a matrix is returned unchanged as a single case. With a
/// matrix, each generated case is a copy of the entry with the matrix values
/// applied, its `matrix` cleared, and its name set to `base[k1=v1,k2=v2,...]`,
/// where `base` is the entry's name or, if unnamed, its lower-cased command.
/// Later dimensions vary fastest in the output order.
fn expand_manifest_case(case: &BenchmarkManifestCase) -> Result<Vec<BenchmarkManifestCase>> {
    let matrix = match &case.matrix {
        Some(matrix) => matrix,
        None => return Ok(vec![case.clone()]),
    };
    let dimensions = matrix.dimensions()?;

    // Each entry pairs a partially-overridden case with the labels collected so far.
    let mut combos = vec![(case.clone(), Vec::<String>::new())];
    for dimension in &dimensions {
        let key = dimension.key;
        let values = &dimension.values;
        combos = combos
            .iter()
            .flat_map(|(base, labels)| {
                values.iter().map(move |value| {
                    let mut variant = base.clone();
                    value.apply(&mut variant);
                    let mut tags = labels.clone();
                    tags.push(format!("{}={}", key, value.label_value()));
                    (variant, tags)
                })
            })
            .collect();
    }

    let finished = combos
        .into_iter()
        .map(|(mut variant, tags)| {
            let suffix = tags.join(",");
            let prefix = match variant.name.take() {
                Some(name) => name,
                None => variant.command.to_ascii_lowercase(),
            };
            variant.name = Some(format!("{}[{}]", prefix, suffix));
            variant.matrix = None;
            variant
        })
        .collect();
    Ok(finished)
}
745+
746+
fn reject_duplicate_manifest_case_names(cases: &[BenchmarkManifestCase]) -> Result<()> {
747+
let mut names = BTreeSet::new();
748+
for (index, case) in cases.iter().enumerate() {
749+
let name = case
750+
.name
751+
.clone()
752+
.unwrap_or_else(|| format!("case-{}", index + 1));
753+
if !names.insert(name.clone()) {
754+
return Err(CliError::InvalidInput(format!(
755+
"Benchmark manifest expands to duplicate case name `{name}`"
756+
)));
757+
}
758+
}
759+
Ok(())
760+
}
761+
533762
async fn read_json_report(path: &Path) -> Result<serde_json::Value> {
534763
let text = tokio::fs::read_to_string(path)
535764
.await
@@ -654,6 +883,7 @@ async fn bench_manifest(
654883
"Benchmark manifest must include at least one [[benchmarks]] entry".to_string(),
655884
));
656885
}
886+
let benchmark_cases = expand_manifest_cases(&manifest)?;
657887

658888
let suite_server = manifest.server.as_deref().unwrap_or(server).to_string();
659889
let started_at = Utc::now();
@@ -664,18 +894,18 @@ async fn bench_manifest(
664894
if options.interactive() {
665895
theme.step(
666896
1,
667-
manifest.benchmarks.len(),
897+
benchmark_cases.len(),
668898
&format!("Running benchmark manifest {}", manifest_path.display()),
669899
);
670900
}
671901

672-
for (index, case) in manifest.benchmarks.iter().enumerate() {
902+
for (index, case) in benchmark_cases.iter().enumerate() {
673903
if options.interactive() {
674904
let label = case.name.as_deref().unwrap_or(case.command.as_str());
675905
theme.info(&format!(
676906
"Case {}/{}: {}",
677907
index + 1,
678-
manifest.benchmarks.len(),
908+
benchmark_cases.len(),
679909
label
680910
));
681911
}
@@ -2454,4 +2684,59 @@ mod tests {
24542684
let err = report_entry_map(entries, "Current").expect_err("duplicates should fail");
24552685
assert!(format!("{err}").contains("duplicate benchmark case name `duplicate`"));
24562686
}
2687+
2688+
/// A 2x2 matrix yields four cases, named and ordered with the later
/// dimension (`concurrent`) varying fastest.
#[test]
fn manifest_matrix_expands_cartesian_cases() {
    let source = r#"
[[benchmarks]]
name = "chat-short"
command = "chat"
prompt = "hello"
iterations = 1

[benchmarks.matrix]
model = ["m1", "m2"]
concurrent = [1, 2]
"#;
    let manifest: BenchmarkManifest = toml::from_str(source).expect("manifest should parse");

    let cases = expand_manifest_cases(&manifest).expect("matrix should expand");

    let mut names = Vec::new();
    for case in &cases {
        names.push(case.name.as_deref().expect("expanded cases are named"));
    }
    assert_eq!(
        names,
        vec![
            "chat-short[model=m1,concurrent=1]",
            "chat-short[model=m1,concurrent=2]",
            "chat-short[model=m2,concurrent=1]",
            "chat-short[model=m2,concurrent=2]",
        ]
    );

    let first = &cases[0];
    assert_eq!(first.model.as_deref(), Some("m1"));
    assert_eq!(first.concurrent, Some(1));

    let last = &cases[3];
    assert_eq!(last.model.as_deref(), Some("m2"));
    assert_eq!(last.concurrent, Some(2));
}
2724+
2725+
/// Repeating a matrix value produces two identically named cases, which
/// expansion must refuse.
#[test]
fn manifest_matrix_rejects_duplicate_expanded_names() {
    let source = r#"
[[benchmarks]]
name = "chat-short"
command = "chat"

[benchmarks.matrix]
concurrent = [1, 1]
"#;
    let manifest: BenchmarkManifest = toml::from_str(source).expect("manifest should parse");

    let err = expand_manifest_cases(&manifest).expect_err("duplicate matrix names should fail");
    let message = format!("{err}");
    assert!(message.contains("duplicate case name `chat-short[concurrent=1]`"));
}
24572742
}

docs/user/cli/bench.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,24 @@ concurrent = 2
164164

165165
Supported `command` values are `chat`, `tts`, `asr`, and `throughput`. Relative ASR file paths resolve from the manifest directory.
166166

167+
Each benchmark can include a `[benchmarks.matrix]` table to generate a cartesian matrix from array values. Scalar values on the benchmark are used as defaults, and matrix values override them per generated case.
168+
169+
```toml
170+
[[benchmarks]]
171+
name = "chat-short"
172+
command = "chat"
173+
prompt = "Summarize why batching helps transformer prefill."
174+
iterations = 10
175+
warmup = true
176+
177+
[benchmarks.matrix]
178+
model = ["Qwen3.5-4B", "Qwen3.5-8B"]
179+
concurrent = [1, 2, 4]
180+
max_tokens = [64, 128]
181+
```
182+
183+
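The example expands to 12 named cases such as `chat-short[model=Qwen3.5-4B,concurrent=1,max_tokens=64]`. String values have runs of whitespace collapsed to single spaces in the generated name, and values longer than 32 characters are shortened to their first 32 characters followed by `~` and a 16-digit hexadecimal hash. Duplicate expanded case names are rejected so JSON reports can be compared safely by case identity.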
184+
167185
When `--artifact-dir` is provided, the CLI writes:
168186

169187
- `report.json` — suite report with all case summaries and samples

0 commit comments

Comments
 (0)