From 0b78aa77d02a6aae5ce3621bc6fc5c522c33c728 Mon Sep 17 00:00:00 2001 From: chenhany Date: Thu, 21 May 2026 15:13:57 -0700 Subject: [PATCH 01/10] [OMNIML-4788] specdec_bench: configuration.json provenance + upload_to_s3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 1a of the SPEED-bench harness migration off iputterman/specdec_bench (personal-namespace fork) and onto upstream. Adds enough provenance for the visualizer to render runs AND for the methodology-version aggregation rule (Phase 4) to keep apple-to-orange comparisons out of plots. specdec_bench/__init__.py __version__ = "0.1.0". Bump minor for added metrics (additive); major for changed metric *definitions*. Cosmetic refactors don't bump. Visualizer groups by major version when aggregating plots. specdec_bench/utils.py Adds dump_env(args, save_dir, overrides=None) which writes configuration.json with the existing argparse args (sensitive keys redacted) plus: - specdec_bench_version — from __version__ - specdec_bench_sha — git rev-parse on this package - modelopt_version — modelopt.__version__ when importable - modelopt_sha — git rev-parse on the repo root - nmm_sandbox_sha — from $NMM_SANDBOX_SHA (harness-provided; null standalone) - container_image — from $CONTAINER_IMAGE (harness-provided) - checkpoint{path,size_bytes,index_sha256} — cheap fingerprint via hashing model.safetensors.index.json (or config.json fallback) - serving_config — runtime engine config from runner.get_serving_config() - timestamp — UTC ISO 8601 Existing engine_version / gpu / python_version / argv fields preserved. specdec_bench/models/base.py Adds Model.get_serving_config() default ({}). Subclasses override. specdec_bench/models/vllm.py Saves engine_args; get_serving_config() dumps AsyncEngineArgs as dict and also captures vllm_config.to_dict() when the live AsyncLLM exposes it (covers max_model_len / kv_cache_dtype / async_scheduling defaults that AsyncEngineArgs doesn't carry). specdec_bench/models/sglang.py Saves engine_kwargs; get_serving_config() returns the dict passed to sgl.Engine(). TRTLLM left at the base default; can deepen separately. run.py Imports dump_env. After engine init and metric directory setup, writes configuration.json so the file lands even if the run loop crashes partway. serving_config is fetched from the live model. specdec_bench/s3_utils.py (new) upload_to_s3.py (new) Ported verbatim from iputterman/specdec_bench@main with explicit attribution in the file headers. parse_s3_path / make_s3_client / upload_run_dir / upload_directory + a standalone uploader CLI that recognizes run dirs by sentinel files (configuration.json, timing.json, aa_timing.json, acceptance_rate.json) and refuses to overwrite existing S3 prefixes. requirements_speed.txt boto3, botocore added (needed by s3_utils). Out of scope (deferred): - --sweep_config / per-run-dir nesting (Phase 1b) - --s3_upload flag on run.py itself (Phase 1b; for now use upload_to_s3.py) - container_digest (enroot integration), full GPU/driver inventory - serving_config for TRTLLM Signed-off-by: chenhany --- examples/specdec_bench/requirements_speed.txt | 2 + examples/specdec_bench/run.py | 5 + .../specdec_bench/specdec_bench/__init__.py | 6 + .../specdec_bench/models/base.py | 9 + .../specdec_bench/models/sglang.py | 9 + .../specdec_bench/models/vllm.py | 20 ++ .../specdec_bench/specdec_bench/s3_utils.py | 108 +++++++++++ examples/specdec_bench/specdec_bench/utils.py | 149 +++++++++++++++ examples/specdec_bench/upload_to_s3.py | 175 ++++++++++++++++++ 9 files changed, 483 insertions(+) create mode 100644 examples/specdec_bench/specdec_bench/s3_utils.py create mode 100644 examples/specdec_bench/upload_to_s3.py diff --git a/examples/specdec_bench/requirements_speed.txt b/examples/specdec_bench/requirements_speed.txt index 549a5d73e81..b6e7d001ffd 100644 --- a/examples/specdec_bench/requirements_speed.txt +++ b/examples/specdec_bench/requirements_speed.txt @@ -1,3 +1,5 @@ +boto3>=1.34.0 +botocore>=1.34.0 datasets>=3.1.0 rich>=14.2.0 seaborn>=0.13.2 diff --git a/examples/specdec_bench/run.py b/examples/specdec_bench/run.py index 94932c787b8..337c5184f20 100644 --- a/examples/specdec_bench/run.py +++ b/examples/specdec_bench/run.py @@ -20,6 +20,7 @@ from specdec_bench import datasets, metrics, models, runners from specdec_bench.utils import ( decode_chat, + dump_env, encode_chat, get_tokenizer, postprocess_base, @@ -174,6 +175,10 @@ def run_simple(args): if args.save_dir is not None: for metric in metrics_list: metric.update_directory(args.save_dir) + # Stamp configuration.json BEFORE the run loop so the file lands even + # when the run crashes mid-way. Engine init is already done, so the + # live serving_config from the model is available. + dump_env(args, args.save_dir, overrides={"serving_config": model.get_serving_config()}) runner = runners.SimpleRunner(model, metrics=metrics_list) diff --git a/examples/specdec_bench/specdec_bench/__init__.py b/examples/specdec_bench/specdec_bench/__init__.py index 47f1c65a15f..d721315cca0 100644 --- a/examples/specdec_bench/specdec_bench/__init__.py +++ b/examples/specdec_bench/specdec_bench/__init__.py @@ -13,3 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Methodology version. Bump: +# - minor (0.X.0) when adding a new metric or strictly-additive provenance field +# - major (X.0.0) when changing how an existing metric is computed +# The visualizer aggregates runs by major version to avoid apple-to-orange +# comparisons across methodology changes. +__version__ = "0.1.0" diff --git a/examples/specdec_bench/specdec_bench/models/base.py b/examples/specdec_bench/specdec_bench/models/base.py index ab26a4704dd..43d3a1337d3 100644 --- a/examples/specdec_bench/specdec_bench/models/base.py +++ b/examples/specdec_bench/specdec_bench/models/base.py @@ -27,5 +27,14 @@ async def run(self, prompt_ids, sampling_params, request_id, turn_id): """ raise NotImplementedError + def get_serving_config(self): + """Return a JSON-serializable dict describing the engine's effective config. + + Captured into configuration.json's `serving_config` for reproducibility. + Subclasses override to surface engine-specific defaults (max_model_len, + kv_cache_dtype, etc.) that don't appear in the CLI args. Default: empty. + """ + return {} + def stop(self): pass diff --git a/examples/specdec_bench/specdec_bench/models/sglang.py b/examples/specdec_bench/specdec_bench/models/sglang.py index 2133ec937eb..99a66b0647e 100644 --- a/examples/specdec_bench/specdec_bench/models/sglang.py +++ b/examples/specdec_bench/specdec_bench/models/sglang.py @@ -91,6 +91,7 @@ def __init__( if "mamba_scheduler_strategy" in kwargs: engine_kwargs["mamba_scheduler_strategy"] = kwargs["mamba_scheduler_strategy"] + self.engine_kwargs = engine_kwargs self.model = sgl.Engine(**engine_kwargs) self.sampling_config = sampling_kwargs @@ -129,3 +130,11 @@ async def run(self, prompt_ids, max_length, end_id, request_id, turn_id): output_dict["output_logits"] = None output_dict["token_times"] = timing return output_dict + + def get_serving_config(self): + """Dump the engine_kwargs dict supplied to sgl.Engine().""" + try: + # engine_kwargs is plain dict of scalars/None — already JSON-safe. + return dict(self.engine_kwargs) + except Exception: + return {} diff --git a/examples/specdec_bench/specdec_bench/models/vllm.py b/examples/specdec_bench/specdec_bench/models/vllm.py index fc595c1d579..52bf35f1a0d 100644 --- a/examples/specdec_bench/specdec_bench/models/vllm.py +++ b/examples/specdec_bench/specdec_bench/models/vllm.py @@ -89,6 +89,7 @@ def __init__(self, model_dir, max_concurrent_requests, sampling_kwargs, **kwargs async_scheduling=kwargs.get("async_scheduling", True), enforce_eager=False, ) + self.engine_args = engine_args self.model = AsyncLLM.from_engine_args(engine_args) self.sampling_kwargs = sampling_kwargs # https://github.com/vllm-project/vllm/blob/main/vllm/sampling_params.py @@ -151,6 +152,25 @@ async def generate(self, prompt_ids, request_id, turn_id): break return outputs, timing, full_tokens + def get_serving_config(self): + """Dump the AsyncEngineArgs dataclass plus the runtime vllm_config when available.""" + try: + import dataclasses + + cfg = dataclasses.asdict(self.engine_args) + except Exception: + cfg = {} + # vllm exposes the resolved engine config on the AsyncLLM instance once + # initialized — capture max_model_len / kv cache / dtype defaults that + # don't appear in AsyncEngineArgs. + try: + vllm_config = getattr(self.model, "vllm_config", None) + if vllm_config is not None and hasattr(vllm_config, "to_dict"): + cfg["vllm_config"] = vllm_config.to_dict() + except Exception: + pass + return cfg + def stop(self): try: self.loop.run_until_complete(self.model.shutdown()) diff --git a/examples/specdec_bench/specdec_bench/s3_utils.py b/examples/specdec_bench/specdec_bench/s3_utils.py new file mode 100644 index 00000000000..59a1c8f0589 --- /dev/null +++ b/examples/specdec_bench/specdec_bench/s3_utils.py @@ -0,0 +1,108 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""S3 upload utilities for specdec_bench results.""" + +import os +from pathlib import Path + +S3_DEFAULT_ENDPOINT = "https://pdx.s8k.io" +S3_DEFAULT_KEY_ID = "" +S3_DEFAULT_SECRET = "" + + +def parse_s3_path(path: str) -> tuple[str, str]: + """'s3://bucket/prefix' → (bucket, prefix). prefix may be empty.""" + without_scheme = path[5:] # strip "s3://" + parts = without_scheme.split("/", 1) + bucket = parts[0] + prefix = parts[1].strip("/") if len(parts) > 1 else "" + return bucket, prefix + + +def make_s3_client(endpoint: str, key_id: str, secret: str): + import boto3 + from botocore.config import Config + + return boto3.client( + "s3", + endpoint_url=endpoint, + aws_access_key_id=key_id, + aws_secret_access_key=secret, + region_name="us-east-1", + config=Config(s3={"addressing_style": "path"}), + ) + + +def s3_prefix_exists(s3, bucket: str, prefix: str) -> bool: + """Return True if any object exists under prefix.""" + resp = s3.list_objects_v2(Bucket=bucket, Prefix=prefix.rstrip("/") + "/", MaxKeys=1) + return bool(resp.get("Contents")) + + +def _upload_files(s3, local_dir: Path, bucket: str, s3_prefix: str) -> None: + """Upload all files under local_dir without any existence check.""" + for file_path in sorted(local_dir.rglob("*")): + if not file_path.is_file(): + continue + rel = file_path.relative_to(local_dir).as_posix() + key = f"{s3_prefix}/{rel}" + s3.upload_file(str(file_path), bucket, key) + print(f" Uploaded: s3://{bucket}/{key}") + + +def upload_run_dir(s3, local_dir: Path, bucket: str, s3_prefix: str) -> None: + """Upload a single run directory to s3://bucket/s3_prefix/. + + Raises ValueError if the destination prefix already has any objects. + """ + s3_prefix = s3_prefix.rstrip("/") + if s3_prefix_exists(s3, bucket, s3_prefix): + raise ValueError( + f"S3 destination already exists: s3://{bucket}/{s3_prefix} — refusing to overwrite" + ) + _upload_files(s3, local_dir, bucket, s3_prefix) + + +def upload_directory(s3, local_dir: Path, bucket: str, s3_prefix: str) -> None: + """Upload a sweep output directory (local_dir/run_name/...) to S3. + + Each run subdirectory is checked independently so a partial re-upload + of a sweep fails loudly on the first clash rather than silently skipping. + """ + s3_prefix = s3_prefix.rstrip("/") + run_dirs = sorted(d for d in local_dir.iterdir() if d.is_dir()) + if not run_dirs: + raise ValueError(f"No subdirectories found in {local_dir}") + for run_dir in run_dirs: + run_key = f"{s3_prefix}/{run_dir.name}" + print(f" {run_dir.name} → s3://{bucket}/{run_key}") + if s3_prefix_exists(s3, bucket, run_key): + raise ValueError( + f"S3 destination already exists: s3://{bucket}/{run_key} — refusing to overwrite" + ) + # All clear — upload everything + for run_dir in run_dirs: + _upload_files(s3, run_dir, bucket, f"{s3_prefix}/{run_dir.name}") + + +def s3_credentials_from_args_or_env(args) -> tuple[str, str, str]: + """Resolve S3 credentials: CLI args > env vars > built-in defaults.""" + endpoint = str( + getattr(args, "s3_endpoint", None) or os.environ.get("S3_ENDPOINT", S3_DEFAULT_ENDPOINT) + ) + key_id = str(getattr(args, "s3_key_id", None) or os.environ.get("S3_KEY_ID", S3_DEFAULT_KEY_ID)) + secret = str(getattr(args, "s3_secret", None) or os.environ.get("S3_SECRET", S3_DEFAULT_SECRET)) + return endpoint, key_id, secret diff --git a/examples/specdec_bench/specdec_bench/utils.py b/examples/specdec_bench/specdec_bench/utils.py index 14ded0f31b2..518ec95e51e 100644 --- a/examples/specdec_bench/specdec_bench/utils.py +++ b/examples/specdec_bench/specdec_bench/utils.py @@ -13,10 +13,20 @@ # See the License for the specific language governing permissions and # limitations under the License. +import datetime +import hashlib import json +import os +import subprocess +import sys +from pathlib import Path from transformers import AutoTokenizer +from . import __version__ as specdec_bench_version + +_SENSITIVE_SUBSTRINGS = ("token", "key", "secret", "password") + def get_tokenizer(path, trust_remote_code=False): return AutoTokenizer.from_pretrained(path, trust_remote_code=trust_remote_code) @@ -58,3 +68,142 @@ def postprocess_gptoss(text): if "<|channel|>" in final_message: final_message = final_message.split("<|channel|>")[0] return final_message + + +def _get_engine_version(engine): + """Return the engine package's __version__, or None on failure.""" + try: + if engine in ("TRTLLM", "AUTO_DEPLOY"): + import tensorrt_llm + + return tensorrt_llm.__version__ + elif engine == "VLLM": + import vllm + + return vllm.__version__ + elif engine == "SGLANG": + import sglang + + return sglang.__version__ + except Exception: + pass + return None + + +def _get_gpu_name(): + try: + import torch + + if torch.cuda.is_available(): + return torch.cuda.get_device_name(0) + except Exception: + pass + return None + + +def _get_modelopt_version(): + try: + import modelopt + + return getattr(modelopt, "__version__", None) + except Exception: + return None + + +def _git_sha(path): + """git rev-parse HEAD inside `path`. Returns None if not a repo or git missing.""" + try: + out = subprocess.run( + ["git", "rev-parse", "HEAD"], + cwd=path, + capture_output=True, + text=True, + timeout=5, + ) + if out.returncode == 0: + return out.stdout.strip() + except (FileNotFoundError, subprocess.TimeoutExpired): + pass + return None + + +def _checkpoint_provenance(model_dir): + """Cheap reproducibility fingerprint for a HuggingFace checkpoint directory. + + Returns {path, size_bytes, index_sha256} where index_sha256 hashes the + safetensors index file (changes whenever any shard's contents change). + Falls back to hashing config.json for non-sharded checkpoints. + """ + if model_dir is None: + return None + try: + p = Path(model_dir) + if not p.is_dir(): + return {"path": str(model_dir)} + size_bytes = sum(f.stat().st_size for f in p.rglob("*") if f.is_file()) + hash_target = None + for name in ("model.safetensors.index.json", "config.json"): + candidate = p / name + if candidate.is_file(): + hash_target = candidate + break + index_sha256 = None + if hash_target is not None: + h = hashlib.sha256() + with open(hash_target, "rb") as f: + for chunk in iter(lambda: f.read(65536), b""): + h.update(chunk) + index_sha256 = h.hexdigest() + return { + "path": str(model_dir), + "size_bytes": size_bytes, + "index_sha256": index_sha256, + "index_source": hash_target.name if hash_target is not None else None, + } + except Exception: + return {"path": str(model_dir)} + + +def _redact_config(config): + return { + key: ( + "***REDACTED***" + if any(part in key.lower() for part in _SENSITIVE_SUBSTRINGS) + else value + ) + for key, value in config.items() + } + + +def dump_env(args, save_dir, overrides=None): + """Write configuration.json to save_dir capturing run args, engine version, and provenance. + + `overrides` is merged in last and is the channel for runtime-only fields + (e.g. the live engine's serving_config dict from runner.get_serving_config()). + """ + config = _redact_config(vars(args).copy()) + if overrides: + config.update(_redact_config(overrides)) + + config["engine_version"] = _get_engine_version(config.get("engine")) + config["gpu"] = _get_gpu_name() + config["python_version"] = sys.version + config["argv"] = sys.argv[:] + + # Provenance for reproducibility / apple-to-orange guarding. + config["specdec_bench_version"] = specdec_bench_version + specdec_bench_dir = Path(__file__).resolve().parent + config["specdec_bench_sha"] = _git_sha(specdec_bench_dir) + config["modelopt_version"] = _get_modelopt_version() + config["modelopt_sha"] = _git_sha(specdec_bench_dir.parents[2]) # examples/specdec_bench/specdec_bench → modelopt root + # Harness-provided env vars (set by nmm-sandbox / launcher); null when standalone. + config["nmm_sandbox_sha"] = os.environ.get("NMM_SANDBOX_SHA") or None + config["container_image"] = os.environ.get("CONTAINER_IMAGE") or None + # Checkpoint fingerprint. + config["checkpoint"] = _checkpoint_provenance(getattr(args, "model_dir", None)) + # UTC timestamp. + config["timestamp"] = datetime.datetime.now(datetime.timezone.utc).isoformat() + + os.makedirs(save_dir, exist_ok=True) + with open(os.path.join(save_dir, "configuration.json"), "w") as f: + json.dump(config, f, indent=4, default=str) diff --git a/examples/specdec_bench/upload_to_s3.py b/examples/specdec_bench/upload_to_s3.py new file mode 100644 index 00000000000..5545fa0c60b --- /dev/null +++ b/examples/specdec_bench/upload_to_s3.py @@ -0,0 +1,175 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Upload specdec_bench results to S3. + +Handles both flat and sweep directory layouts: + + Flat: LOCAL_DIR/run_name/{configuration,timing,...}.json + Sweep: LOCAL_DIR/sweep_name/run_name/{configuration,timing,...}.json + +LOCAL_DIR's name is preserved under the S3 prefix: + + s3://bucket/prefix/LOCAL_DIR_NAME/[sweep_name/]run_name/ + +Usage examples: + + # Upload a sweep output directory + python upload_to_s3.py /data/sweep_outputs/my_sweep s3://team-specdec-workgroup/results + + # Upload a single run + python upload_to_s3.py /data/my_single_run s3://team-specdec-workgroup/results + + # Skip already-uploaded runs instead of failing + python upload_to_s3.py /data/sweep_outputs/my_sweep s3://team-specdec-workgroup/results --skip-existing +""" + +import argparse +import os +import sys +from pathlib import Path + +from specdec_bench.s3_utils import ( + S3_DEFAULT_ENDPOINT, + S3_DEFAULT_KEY_ID, + S3_DEFAULT_SECRET, + make_s3_client, + parse_s3_path, + upload_run_dir, +) + +_RUN_SENTINELS = ("configuration.json", "timing.json", "aa_timing.json", "acceptance_rate.json") + + +def _is_run_dir(d: Path) -> bool: + return any((d / f).exists() for f in _RUN_SENTINELS) + + +def _discover_runs(local_root: Path, s3_prefix_base: str) -> list[tuple[Path, str]]: + """Return list of (local_run_dir, s3_key) pairs to upload. + + local_root's name is appended to s3_prefix_base, then contents mirrored: + local_root/run_name/ → s3_prefix_base/local_root.name/run_name/ + local_root/sweep_name/run_name/ → s3_prefix_base/local_root.name/sweep_name/run_name/ + """ + base = f"{s3_prefix_base}/{local_root.name}".lstrip("/") + queue: list[tuple[Path, str]] = [] + + if _is_run_dir(local_root): + # The directory itself is a single run + queue.append((local_root, base)) + return queue + + for child in sorted(local_root.iterdir()): + if not child.is_dir(): + continue + if _is_run_dir(child): + # Flat layout: local_root/run_name/ + queue.append((child, f"{base}/{child.name}")) + else: + # Sweep layout: local_root/sweep_name/run_name/ + queue.extend( + (grandchild, f"{base}/{child.name}/{grandchild.name}") + for grandchild in sorted(child.iterdir()) + if grandchild.is_dir() and _is_run_dir(grandchild) + ) + + return queue + + +def main(): + parser = argparse.ArgumentParser( + description="Upload specdec_bench results to S3.", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__.split("Usage examples:")[1] if "Usage examples:" in __doc__ else "", + ) + parser.add_argument("local_dir", help="Local results directory to upload") + parser.add_argument( + "s3_dest", help="S3 destination prefix, e.g. s3://team-specdec-workgroup/results" + ) + parser.add_argument( + "--endpoint", + default=os.environ.get("S3_ENDPOINT", S3_DEFAULT_ENDPOINT), + help="S3 endpoint URL", + ) + parser.add_argument( + "--key-id", + default=os.environ.get("S3_KEY_ID", S3_DEFAULT_KEY_ID), + dest="key_id", + help="S3 access key ID", + ) + parser.add_argument( + "--secret", + default=os.environ.get("S3_SECRET", S3_DEFAULT_SECRET), + help="S3 secret access key", + ) + parser.add_argument( + "--skip-existing", + action="store_true", + help="Skip runs that already exist in S3 instead of failing", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Print what would be uploaded without actually uploading", + ) + args = parser.parse_args() + + if not args.s3_dest.startswith("s3://"): + sys.exit("Error: s3_dest must start with s3://") + + local_root = Path(args.local_dir).resolve() + if not local_root.is_dir(): + sys.exit(f"Error: {local_root} is not a directory") + + bucket, s3_prefix_base = parse_s3_path(args.s3_dest) + queue = _discover_runs(local_root, s3_prefix_base) + + if not queue: + sys.exit("No run directories found to upload.") + + print(f"Found {len(queue)} run(s) to upload → s3://{bucket}/") + + if args.dry_run: + for local_run_dir, s3_key in queue: + print(f" {local_run_dir} → s3://{bucket}/{s3_key}") + return + + s3 = make_s3_client(args.endpoint, args.key_id, args.secret) + + errors = 0 + skipped = 0 + uploaded = 0 + for local_run_dir, s3_key in queue: + print(f"\n{local_run_dir.name} → s3://{bucket}/{s3_key}") + try: + upload_run_dir(s3, local_run_dir, bucket, s3_key) + uploaded += 1 + except ValueError as exc: + if args.skip_existing: + print(f" Skipped: {exc}") + skipped += 1 + else: + print(f" Error: {exc}") + errors += 1 + + print(f"\nDone: {uploaded} uploaded, {skipped} skipped, {errors} failed.") + if errors: + sys.exit(1) + + +if __name__ == "__main__": + main() From 3fe120e1003268131bc12cc774f53b2addf1bcd6 Mon Sep 17 00:00:00 2001 From: chenhany Date: Thu, 21 May 2026 13:42:54 -0700 Subject: [PATCH 02/10] [OMNIML-4788] tools/launcher: add Qwen3.5-4B specdec_bench smoke YAML Adds a vLLM-backed SPEED-bench smoke run for Qwen3.5-4B against the qualitative slice of nvidia/SPEED-Bench-Internal. Speculative algorithm is NONE (autoregressive baseline) so no draft model is required to land the first end-to-end signal; we will iterate on EAGLE3/MTP/DFLASH + sweep + S3-upload in follow-up changes. - common/specdec_bench/run.sh: backend-agnostic sibling to quick_check.sh that sources service_utils.sh, installs requirements_speed.txt, and forwards all caller args (--engine VLLM/SGLANG/TRTLLM, --dataset, --save_dir, etc.) to examples/specdec_bench/run.py. - examples/Qwen/Qwen3.5-4B/specdec_bench.yaml: 1-node / 1-gpu task on vllm/vllm-openai:qwen3_5-cu130 (qwen3_5 needs transformers >= 4.58 which is not in vllm:latest yet). Dry-run via launch.py --dryrun resolves the pipeline cleanly. Actual cluster validation pending on cw-dfw (Qwen/Qwen3.5-4B and nvidia/SPEED-Bench-Internal already staged there). Signed-off-by: chenhany --- tools/launcher/common/specdec_bench/run.sh | 46 ++++++++++++++++++ .../Qwen/Qwen3.5-4B/specdec_bench.yaml | 48 +++++++++++++++++++ 2 files changed, 94 insertions(+) create mode 100755 tools/launcher/common/specdec_bench/run.sh create mode 100644 tools/launcher/examples/Qwen/Qwen3.5-4B/specdec_bench.yaml diff --git a/tools/launcher/common/specdec_bench/run.sh b/tools/launcher/common/specdec_bench/run.sh new file mode 100755 index 00000000000..1f8fb06bfba --- /dev/null +++ b/tools/launcher/common/specdec_bench/run.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" +source ${SCRIPT_DIR}/../service_utils.sh + +trap 'error_handler $0 $LINENO' ERR +trap 'exit_handler' EXIT + +################################################################################################### +# Backend-agnostic specdec_bench entrypoint. The caller's YAML supplies --engine +# (VLLM | SGLANG | TRTLLM | NONE) and the dataset / sweep flags via "args"; this +# script just sources service_utils.sh, installs the speed-bench deps, and execs +# examples/specdec_bench/run.py. +# +# Required env: HF_MODEL_CKPT +# Optional env: HF_DRAFT_MODEL_CKPT (consumed by --draft_model_dir if the YAML passes it) + +if ! pip install -r modules/Model-Optimizer/examples/specdec_bench/requirements_speed.txt; then + report_result "FAIL: specdec_bench: pip install requirements_speed.txt failed" + exit 1 +fi + +if ! python3 modules/Model-Optimizer/examples/specdec_bench/run.py \ + --model_dir ${HF_MODEL_CKPT} \ + --tokenizer ${HF_MODEL_CKPT} \ + "${@}"; then + report_result "FAIL: specdec_bench: run.py exited non-zero" + exit 1 +fi + +report_result "PASS: specdec_bench run completed" diff --git a/tools/launcher/examples/Qwen/Qwen3.5-4B/specdec_bench.yaml b/tools/launcher/examples/Qwen/Qwen3.5-4B/specdec_bench.yaml new file mode 100644 index 00000000000..0c36ded7706 --- /dev/null +++ b/tools/launcher/examples/Qwen/Qwen3.5-4B/specdec_bench.yaml @@ -0,0 +1,48 @@ +# SPEED-bench smoke run for Qwen3.5-4B via vLLM (autoregressive baseline). +# +# Reads nvidia/SPEED-Bench-Internal/qualitative through Qwen/Qwen3.5-4B with +# --speculative_algorithm NONE (no draft model) and writes timing.json + +# aa_timing.json + acceptance_rate.json + specbench_responses.jsonl + +# specbench_results.json to /scratchspace/specdec_bench/. +# +# The qwen3_5 model_type needs transformers >= 4.58, which is NOT in +# vllm/vllm-openai:latest yet — use the qwen3_5-cu130 tag instead. +# +# Local run: +# uv run launch.py --yaml examples/Qwen/Qwen3.5-4B/specdec_bench.yaml \ +# hf_local=/home/omniml_data_3/hf-local --yes +# +# Slurm run on cw_dfw (where Qwen3.5-4B and SPEED-Bench-Internal are staged): +# uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/Qwen/Qwen3.5-4B/specdec_bench.yaml --yes + +job_name: Qwen3.5-4B_specdec_bench + +pipeline: + global_vars: + hf_model: /hf-local/Qwen/Qwen3.5-4B + + task_0: + script: common/specdec_bench/run.sh + args: + - --dataset speed + - --dataset_path /hf-local/nvidia/SPEED-Bench-Internal/qualitative + - --engine VLLM + - --speculative_algorithm NONE + - --tp_size 1 + - --ep_size 1 + - --concurrency 1 + - --num_requests 80 + - --output_length 4096 + - --aa_timing + - --show_progress + - --save_dir /scratchspace/specdec_bench + - --trust_remote_code + environment: + - HF_MODEL_CKPT: <> + - HF_LOCAL: /hf-local + slurm_config: + _factory_: "slurm_factory" + nodes: 1 + ntasks_per_node: 1 + gpus_per_node: 1 + container: vllm/vllm-openai:qwen3_5-cu130 From 58633f0a6021a0ca9f250ca53e2090d2ebfea182 Mon Sep 17 00:00:00 2001 From: chenhany Date: Thu, 21 May 2026 14:31:49 -0700 Subject: [PATCH 03/10] [OMNIML-4788] tools/launcher: add Qwen3.5-4B specdec_bench MTP variant Companion YAML to specdec_bench.yaml (NONE baseline). Targets Qwen3.5-4B's built-in MTP head (text_config.mtp_num_hidden_layers=1) with draft_length=3 to produce real acceptance numbers vs the trivial AR=1 the NONE run yields. specdec_bench/specdec_bench/models/vllm.py already wires --speculative_algorithm MTP into vLLM's speculative_config as {"method":"mtp","num_speculative_tokens":3}, so no code changes needed. Signed-off-by: chenhany --- .../Qwen/Qwen3.5-4B/specdec_bench_mtp.yaml | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 tools/launcher/examples/Qwen/Qwen3.5-4B/specdec_bench_mtp.yaml diff --git a/tools/launcher/examples/Qwen/Qwen3.5-4B/specdec_bench_mtp.yaml b/tools/launcher/examples/Qwen/Qwen3.5-4B/specdec_bench_mtp.yaml new file mode 100644 index 00000000000..37746590443 --- /dev/null +++ b/tools/launcher/examples/Qwen/Qwen3.5-4B/specdec_bench_mtp.yaml @@ -0,0 +1,42 @@ +# SPEED-bench MTP speculative-decoding run for Qwen3.5-4B via vLLM. +# +# Companion to specdec_bench.yaml (the NONE/autoregressive baseline). This +# variant exercises Qwen3.5-4B's built-in MTP head (text_config.mtp_num_hidden_layers=1) +# with draft_length=3 to produce real acceptance-rate numbers instead of the +# trivial AR=1 that NONE yields. +# +# Slurm run on cw_dfw: +# uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/Qwen/Qwen3.5-4B/specdec_bench_mtp.yaml --yes + +job_name: Qwen3.5-4B_specdec_bench_mtp + +pipeline: + global_vars: + hf_model: /hf-local/Qwen/Qwen3.5-4B + + task_0: + script: common/specdec_bench/run.sh + args: + - --dataset speed + - --dataset_path /hf-local/nvidia/SPEED-Bench-Internal/qualitative + - --engine VLLM + - --speculative_algorithm MTP + - --draft_length 3 + - --tp_size 1 + - --ep_size 1 + - --concurrency 1 + - --num_requests 80 + - --output_length 4096 + - --aa_timing + - --show_progress + - --save_dir /scratchspace/specdec_bench_mtp + - --trust_remote_code + environment: + - HF_MODEL_CKPT: <> + - HF_LOCAL: /hf-local + slurm_config: + _factory_: "slurm_factory" + nodes: 1 + ntasks_per_node: 1 + gpus_per_node: 1 + container: vllm/vllm-openai:qwen3_5-cu130 From 251afa588108687033dd8f35fd69d14ac3c1d91a Mon Sep 17 00:00:00 2001 From: chenhany Date: Thu, 21 May 2026 15:46:35 -0700 Subject: [PATCH 04/10] [OMNIML-4788] specdec_bench: env-var contract for harness-injected provenance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Phase-1a dump_env tried git rev-parse and `import modelopt` inside the runtime container to fill specdec_bench_sha / modelopt_sha / modelopt_version, but the container has no .git/ (the launcher packager tarballs source without git metadata) and may not have modelopt installed. The cluster validation hit all three as None. Fix: dump_env prefers env vars set by the harness, falls back to runtime detection only when standalone. Env-only fields are nmm_sandbox_sha and container_image (no in-process way to know them). SPECDEC_BENCH_SHA — git rev-parse on examples/specdec_bench MODELOPT_SHA — git rev-parse on modules/Model-Optimizer MODELOPT_VERSION — modelopt.__version__ (or git describe) NMM_SANDBOX_SHA — git rev-parse on nmm-sandbox CONTAINER_IMAGE — image:tag the task ran on The MTP smoke YAML sets all five statically for the validation run; a Phase-2 launcher change to get_default_env() will compute them at launch so future YAMLs don't need to carry SHAs. Also fixed a latent bug in _redact_config: it matched `tokenizer` against the sensitive-substring `token` and redacted the model path. Added a small allowlist of tokenizer-* keys. Signed-off-by: chenhany --- examples/specdec_bench/specdec_bench/utils.py | 37 ++++++++++++++----- .../Qwen/Qwen3.5-4B/specdec_bench_mtp.yaml | 9 +++++ 2 files changed, 37 insertions(+), 9 deletions(-) diff --git a/examples/specdec_bench/specdec_bench/utils.py b/examples/specdec_bench/specdec_bench/utils.py index 518ec95e51e..7d457712bef 100644 --- a/examples/specdec_bench/specdec_bench/utils.py +++ b/examples/specdec_bench/specdec_bench/utils.py @@ -26,6 +26,12 @@ from . import __version__ as specdec_bench_version _SENSITIVE_SUBSTRINGS = ("token", "key", "secret", "password") +# Keys whose names contain a sensitive substring but are NOT actually secrets. +# Without this allowlist `tokenizer` redacts the model path because it contains +# `token`, losing meaningful provenance. +_SENSITIVE_KEY_ALLOWLIST = frozenset( + {"tokenizer", "tokenizer_path", "tokenizer_mode", "tokenizer_revision"} +) def get_tokenizer(path, trust_remote_code=False): @@ -164,13 +170,16 @@ def _checkpoint_provenance(model_dir): return {"path": str(model_dir)} +def _is_sensitive_key(key): + klow = key.lower() + if klow in _SENSITIVE_KEY_ALLOWLIST: + return False + return any(s in klow for s in _SENSITIVE_SUBSTRINGS) + + def _redact_config(config): return { - key: ( - "***REDACTED***" - if any(part in key.lower() for part in _SENSITIVE_SUBSTRINGS) - else value - ) + key: ("***REDACTED***" if _is_sensitive_key(key) else value) for key, value in config.items() } @@ -191,12 +200,22 @@ def dump_env(args, save_dir, overrides=None): config["argv"] = sys.argv[:] # Provenance for reproducibility / apple-to-orange guarding. + # Each *_sha and modelopt_version prefers an env var set by the harness + # (because git/.git is typically not present inside the runtime container), + # then falls back to runtime detection for standalone usage outside the + # harness. container_image and nmm_sandbox_sha are env-only — there is no + # reasonable in-process way to know them. config["specdec_bench_version"] = specdec_bench_version specdec_bench_dir = Path(__file__).resolve().parent - config["specdec_bench_sha"] = _git_sha(specdec_bench_dir) - config["modelopt_version"] = _get_modelopt_version() - config["modelopt_sha"] = _git_sha(specdec_bench_dir.parents[2]) # examples/specdec_bench/specdec_bench → modelopt root - # Harness-provided env vars (set by nmm-sandbox / launcher); null when standalone. + config["specdec_bench_sha"] = ( + os.environ.get("SPECDEC_BENCH_SHA") or _git_sha(specdec_bench_dir) + ) + config["modelopt_version"] = ( + os.environ.get("MODELOPT_VERSION") or _get_modelopt_version() + ) + config["modelopt_sha"] = ( + os.environ.get("MODELOPT_SHA") or _git_sha(specdec_bench_dir.parents[2]) + ) config["nmm_sandbox_sha"] = os.environ.get("NMM_SANDBOX_SHA") or None config["container_image"] = os.environ.get("CONTAINER_IMAGE") or None # Checkpoint fingerprint. diff --git a/tools/launcher/examples/Qwen/Qwen3.5-4B/specdec_bench_mtp.yaml b/tools/launcher/examples/Qwen/Qwen3.5-4B/specdec_bench_mtp.yaml index 37746590443..00327e8443a 100644 --- a/tools/launcher/examples/Qwen/Qwen3.5-4B/specdec_bench_mtp.yaml +++ b/tools/launcher/examples/Qwen/Qwen3.5-4B/specdec_bench_mtp.yaml @@ -34,6 +34,15 @@ pipeline: environment: - HF_MODEL_CKPT: <> - HF_LOCAL: /hf-local + # Provenance env vars consumed by dump_env() in examples/specdec_bench/run.py. + # These are static here for demonstration; a launcher auto-injection + # (planned for Phase-2 in OMNIML-4788) will compute them at launch time + # so the YAML stays SHA-free. + - SPECDEC_BENCH_SHA: "62d1686570448e8c4c057592f45c23a5965ec206" + - MODELOPT_SHA: "58633f0a6021a0ca9f250ca53e2090d2ebfea182" + - MODELOPT_VERSION: "0.45.0dev-108-g58633f0a60" + - NMM_SANDBOX_SHA: "079b41847544cf142e25820197294eea94ad216e" + - CONTAINER_IMAGE: "vllm/vllm-openai:qwen3_5-cu130" slurm_config: _factory_: "slurm_factory" nodes: 1 From 02fc28cb8a66da3ad8d0b66e26242b6c7afe323e Mon Sep 17 00:00:00 2001 From: chenhany Date: Thu, 21 May 2026 16:20:33 -0700 Subject: [PATCH 05/10] [OMNIML-4788] specdec_bench: address @claude review on PR 1531 Two IMPORTANT items and six SUGGESTIONs flagged by /claude review on the draft Phase-1a PR. **[IMPORTANT] Shallow redaction (utils.py)** Top-level _redact_config let nested secrets in `serving_config` and `runtime_params` survive into configuration.json. AsyncEngineArgs has fields like `hf_token`; user-supplied runtime_params is arbitrary. Replaced with recursive _redact_value() that walks dicts/lists/tuples and redacts on key-name match at any depth. Added an inline unit-test-style demonstration in the commit (see PR description for nested input/output). **[IMPORTANT] Hardcoded SHAs in committed YAML (specdec_bench_mtp.yaml)** The SHAs in environment: would stale-lie the moment anyone reused the YAML after `git pull`. Replaced with an explanatory comment block documenting the env-var contract (SPECDEC_BENCH_SHA / MODELOPT_SHA / MODELOPT_VERSION / NMM_SANDBOX_SHA / CONTAINER_IMAGE) and pointing at the planned Phase-2 launcher auto-injection. The standalone-run fallback in dump_env (git rev-parse + modelopt.__version__) is the only path exercised until the harness wires the env vars. **[SUGGESTION] argv bypassed redaction (utils.py)** sys.argv[:] was stored raw. Added _redact_argv() that masks the value after any -- flag, and rewrites --=VAL forms. Used the same sensitive-key predicate as _redact_value so a downstream `--hf_token VALUE` ends up masked in argv even though it isn't in vars(args). **[SUGGESTION] modelopt_sha layout assumption (utils.py)** The Path(__file__).resolve().parents[2] fallback is correct in-tree only; when vendored elsewhere it would git rev-parse an unrelated repo. Added a comment that the env var is the supported path and the fallback is best-effort for in-tree dev only. **[SUGGESTION] checkpoint size walks every file (utils.py)** For sharded 100B+ checkpoints with hundreds of tokenizer/cache/intermediate files the rglob was a noticeable startup tax. Switched to summing only the shards listed in model.safetensors.index.json's weight_map + the index file itself. Falls back to rglob when no index exists (single-file checkpoints / unusual layouts). **[SUGGESTION] Unused exports in s3_utils.py** `upload_directory` and `s3_credentials_from_args_or_env` were ported verbatim from iputterman/specdec_bench but never called from anywhere in this PR (upload_to_s3.py uses upload_run_dir + its own argparse). Removed. Will re-add in Phase 1b if the planned run.py --s3_upload flag wants them. **[SUGGESTION] upload_to_s3.py only caught ValueError** Boto3 ClientError / per-file OSError would escape the loop and abort mid-sweep leaving a partially-uploaded result on S3. Broadened to a catch-all that records per-run errors, keeps going, and exits non-zero with the count. **[SUGGESTION] pip install on every task (run.sh)** Added `pip show boto3 || pip show datasets || pip show seaborn` gate so the install only runs in a cold container. Matches the pattern in other common/*/run.sh scripts. Signed-off-by: chenhany --- .../specdec_bench/specdec_bench/s3_utils.py | 33 ------- examples/specdec_bench/specdec_bench/utils.py | 94 +++++++++++++++++-- examples/specdec_bench/upload_to_s3.py | 8 ++ tools/launcher/common/specdec_bench/run.sh | 13 ++- .../Qwen/Qwen3.5-4B/specdec_bench_mtp.yaml | 24 +++-- 5 files changed, 118 insertions(+), 54 deletions(-) diff --git a/examples/specdec_bench/specdec_bench/s3_utils.py b/examples/specdec_bench/specdec_bench/s3_utils.py index 59a1c8f0589..e7d52f1b05f 100644 --- a/examples/specdec_bench/specdec_bench/s3_utils.py +++ b/examples/specdec_bench/specdec_bench/s3_utils.py @@ -15,7 +15,6 @@ """S3 upload utilities for specdec_bench results.""" -import os from pathlib import Path S3_DEFAULT_ENDPOINT = "https://pdx.s8k.io" @@ -74,35 +73,3 @@ def upload_run_dir(s3, local_dir: Path, bucket: str, s3_prefix: str) -> None: f"S3 destination already exists: s3://{bucket}/{s3_prefix} — refusing to overwrite" ) _upload_files(s3, local_dir, bucket, s3_prefix) - - -def upload_directory(s3, local_dir: Path, bucket: str, s3_prefix: str) -> None: - """Upload a sweep output directory (local_dir/run_name/...) to S3. - - Each run subdirectory is checked independently so a partial re-upload - of a sweep fails loudly on the first clash rather than silently skipping. - """ - s3_prefix = s3_prefix.rstrip("/") - run_dirs = sorted(d for d in local_dir.iterdir() if d.is_dir()) - if not run_dirs: - raise ValueError(f"No subdirectories found in {local_dir}") - for run_dir in run_dirs: - run_key = f"{s3_prefix}/{run_dir.name}" - print(f" {run_dir.name} → s3://{bucket}/{run_key}") - if s3_prefix_exists(s3, bucket, run_key): - raise ValueError( - f"S3 destination already exists: s3://{bucket}/{run_key} — refusing to overwrite" - ) - # All clear — upload everything - for run_dir in run_dirs: - _upload_files(s3, run_dir, bucket, f"{s3_prefix}/{run_dir.name}") - - -def s3_credentials_from_args_or_env(args) -> tuple[str, str, str]: - """Resolve S3 credentials: CLI args > env vars > built-in defaults.""" - endpoint = str( - getattr(args, "s3_endpoint", None) or os.environ.get("S3_ENDPOINT", S3_DEFAULT_ENDPOINT) - ) - key_id = str(getattr(args, "s3_key_id", None) or os.environ.get("S3_KEY_ID", S3_DEFAULT_KEY_ID)) - secret = str(getattr(args, "s3_secret", None) or os.environ.get("S3_SECRET", S3_DEFAULT_SECRET)) - return endpoint, key_id, secret diff --git a/examples/specdec_bench/specdec_bench/utils.py b/examples/specdec_bench/specdec_bench/utils.py index 7d457712bef..9e4648413de 100644 --- a/examples/specdec_bench/specdec_bench/utils.py +++ b/examples/specdec_bench/specdec_bench/utils.py @@ -133,12 +133,25 @@ def _git_sha(path): return None +def _shard_files_from_index(index_path): + """Return the set of shard filenames referenced by a safetensors index JSON.""" + try: + with open(index_path) as f: + wm = json.load(f).get("weight_map", {}) or {} + return set(wm.values()) + except Exception: + return set() + + def _checkpoint_provenance(model_dir): """Cheap reproducibility fingerprint for a HuggingFace checkpoint directory. - Returns {path, size_bytes, index_sha256} where index_sha256 hashes the - safetensors index file (changes whenever any shard's contents change). - Falls back to hashing config.json for non-sharded checkpoints. + Returns {path, size_bytes, index_sha256, index_source}: + - index_sha256 hashes model.safetensors.index.json (or config.json fallback) + so it changes whenever the shard set or model config changes. + - size_bytes sums only the index-listed shards + config.json. For a + sharded 70B+ checkpoint this avoids a full rglob walk over hundreds + of tokenizer/cache files. Falls back to rglob when no index exists. """ if model_dir is None: return None @@ -146,7 +159,6 @@ def _checkpoint_provenance(model_dir): p = Path(model_dir) if not p.is_dir(): return {"path": str(model_dir)} - size_bytes = sum(f.stat().st_size for f in p.rglob("*") if f.is_file()) hash_target = None for name in ("model.safetensors.index.json", "config.json"): candidate = p / name @@ -160,6 +172,19 @@ def _checkpoint_provenance(model_dir): for chunk in iter(lambda: f.read(65536), b""): h.update(chunk) index_sha256 = h.hexdigest() + # Size: shards listed in the safetensors index + the index/config file + # itself. Avoids walking the entire model directory (which can be huge + # for sharded multi-100B checkpoints). + size_bytes = 0 + if hash_target is not None and hash_target.name == "model.safetensors.index.json": + for shard_name in _shard_files_from_index(hash_target): + shard_path = p / shard_name + if shard_path.is_file(): + size_bytes += shard_path.stat().st_size + size_bytes += hash_target.stat().st_size + else: + # No shard index — fall back to summing every file under the dir. + size_bytes = sum(f.stat().st_size for f in p.rglob("*") if f.is_file()) return { "path": str(model_dir), "size_bytes": size_bytes, @@ -177,11 +202,58 @@ def _is_sensitive_key(key): return any(s in klow for s in _SENSITIVE_SUBSTRINGS) +def _redact_value(value): + """Recursively redact secrets in nested dict/list values. + + The top-level `_redact_config` walks one level of keys, but engine configs + (serving_config from VLLMModel/SGLANGModel) and user-supplied runtime_params + are nested arbitrarily — fields like `hf_token`, `tokenizer_revision`, or + `aws_secret_access_key` need to be redacted at any depth. + """ + if isinstance(value, dict): + return { + k: ("***REDACTED***" if _is_sensitive_key(k) else _redact_value(v)) + for k, v in value.items() + } + if isinstance(value, list): + return [_redact_value(v) for v in value] + if isinstance(value, tuple): + return tuple(_redact_value(v) for v in value) + return value + + def _redact_config(config): - return { - key: ("***REDACTED***" if _is_sensitive_key(key) else value) - for key, value in config.items() - } + return _redact_value(config) + + +def _redact_argv(argv): + """Mask values that follow a sensitive flag name (e.g. --hf_token VALUE). + + Conservative: only masks when the previous element looks like a flag whose + bare name (sans leading dashes) trips _is_sensitive_key. Also handles the + --flag=VALUE form. + """ + redacted = [] + prev_is_sensitive = False + for tok in argv: + s = str(tok) + if prev_is_sensitive: + redacted.append("***REDACTED***") + prev_is_sensitive = False + continue + if s.startswith("--"): + name, sep, val = s[2:].partition("=") + if _is_sensitive_key(name): + if sep: + redacted.append(f"--{name}=***REDACTED***") + prev_is_sensitive = False + else: + redacted.append(s) + prev_is_sensitive = True + continue + redacted.append(s) + prev_is_sensitive = False + return redacted def dump_env(args, save_dir, overrides=None): @@ -197,7 +269,7 @@ def dump_env(args, save_dir, overrides=None): config["engine_version"] = _get_engine_version(config.get("engine")) config["gpu"] = _get_gpu_name() config["python_version"] = sys.version - config["argv"] = sys.argv[:] + config["argv"] = _redact_argv(sys.argv[:]) # Provenance for reproducibility / apple-to-orange guarding. # Each *_sha and modelopt_version prefers an env var set by the harness @@ -213,6 +285,10 @@ def dump_env(args, save_dir, overrides=None): config["modelopt_version"] = ( os.environ.get("MODELOPT_VERSION") or _get_modelopt_version() ) + # Fallback assumes the in-tree layout examples/specdec_bench/specdec_bench/. + # parents[2] reaches the modelopt repo root in that case. When vendored + # elsewhere this would `git rev-parse` an unrelated repo; rely on the env + # var path instead for non-in-tree deployments. config["modelopt_sha"] = ( os.environ.get("MODELOPT_SHA") or _git_sha(specdec_bench_dir.parents[2]) ) diff --git a/examples/specdec_bench/upload_to_s3.py b/examples/specdec_bench/upload_to_s3.py index 5545fa0c60b..edf838b3399 100644 --- a/examples/specdec_bench/upload_to_s3.py +++ b/examples/specdec_bench/upload_to_s3.py @@ -159,12 +159,20 @@ def main(): upload_run_dir(s3, local_run_dir, bucket, s3_key) uploaded += 1 except ValueError as exc: + # Raised by upload_run_dir when the destination already exists. if args.skip_existing: print(f" Skipped: {exc}") skipped += 1 else: print(f" Error: {exc}") errors += 1 + except Exception as exc: # noqa: BLE001 - any boto3/IO failure should not abort the sweep + # ClientError (auth, throttling, network), OSError on a single file, + # etc. Keep going so the other runs in a sweep still upload, and + # report a per-run summary at the end. Sweep-level non-zero exit + # still flags the overall run as failed. + print(f" Error: {type(exc).__name__}: {exc}") + errors += 1 print(f"\nDone: {uploaded} uploaded, {skipped} skipped, {errors} failed.") if errors: diff --git a/tools/launcher/common/specdec_bench/run.sh b/tools/launcher/common/specdec_bench/run.sh index 1f8fb06bfba..9b4fbe47b64 100755 --- a/tools/launcher/common/specdec_bench/run.sh +++ b/tools/launcher/common/specdec_bench/run.sh @@ -30,9 +30,16 @@ trap 'exit_handler' EXIT # Required env: HF_MODEL_CKPT # Optional env: HF_DRAFT_MODEL_CKPT (consumed by --draft_model_dir if the YAML passes it) -if ! pip install -r modules/Model-Optimizer/examples/specdec_bench/requirements_speed.txt; then - report_result "FAIL: specdec_bench: pip install requirements_speed.txt failed" - exit 1 +# Skip the install when the deps are already present (warm container or +# previous task in the pipeline). Saves a few minutes per task and avoids +# silently drifting versions if upstream wheels move between launches. +if ! pip show boto3 >/dev/null 2>&1 || \ + ! pip show datasets >/dev/null 2>&1 || \ + ! pip show seaborn >/dev/null 2>&1; then + if ! pip install -r modules/Model-Optimizer/examples/specdec_bench/requirements_speed.txt; then + report_result "FAIL: specdec_bench: pip install requirements_speed.txt failed" + exit 1 + fi fi if ! python3 modules/Model-Optimizer/examples/specdec_bench/run.py \ diff --git a/tools/launcher/examples/Qwen/Qwen3.5-4B/specdec_bench_mtp.yaml b/tools/launcher/examples/Qwen/Qwen3.5-4B/specdec_bench_mtp.yaml index 00327e8443a..abdb7e6842e 100644 --- a/tools/launcher/examples/Qwen/Qwen3.5-4B/specdec_bench_mtp.yaml +++ b/tools/launcher/examples/Qwen/Qwen3.5-4B/specdec_bench_mtp.yaml @@ -34,15 +34,21 @@ pipeline: environment: - HF_MODEL_CKPT: <> - HF_LOCAL: /hf-local - # Provenance env vars consumed by dump_env() in examples/specdec_bench/run.py. - # These are static here for demonstration; a launcher auto-injection - # (planned for Phase-2 in OMNIML-4788) will compute them at launch time - # so the YAML stays SHA-free. - - SPECDEC_BENCH_SHA: "62d1686570448e8c4c057592f45c23a5965ec206" - - MODELOPT_SHA: "58633f0a6021a0ca9f250ca53e2090d2ebfea182" - - MODELOPT_VERSION: "0.45.0dev-108-g58633f0a60" - - NMM_SANDBOX_SHA: "079b41847544cf142e25820197294eea94ad216e" - - CONTAINER_IMAGE: "vllm/vllm-openai:qwen3_5-cu130" + # dump_env() in examples/specdec_bench/run.py reads provenance from these + # env vars when present; they are intentionally NOT set here. Hardcoded + # SHAs in a committed YAML would stale-lie about the build that's + # actually running. Set them in the harness (planned for Phase 2 in + # OMNIML-4788, where slurm.py / launch.py will compute and inject them + # at launch time): + # + # SPECDEC_BENCH_SHA = git rev-parse HEAD:examples/specdec_bench + # MODELOPT_SHA = git -C modules/Model-Optimizer rev-parse HEAD + # MODELOPT_VERSION = modelopt.__version__ (or git describe --tags) + # NMM_SANDBOX_SHA = git -C nmm-sandbox rev-parse HEAD + # CONTAINER_IMAGE = slurm_config.container (the image:tag below) + # + # When run standalone (without the harness), dump_env falls back to + # git rev-parse / module imports on the host filesystem. slurm_config: _factory_: "slurm_factory" nodes: 1 From dacabe3bfc708c75acb11130853b72a1f867838c Mon Sep 17 00:00:00 2001 From: chenhany Date: Thu, 21 May 2026 16:54:14 -0700 Subject: [PATCH 06/10] =?UTF-8?q?[OMNIML-4788]=20specdec=5Fbench:=20rename?= =?UTF-8?q?=20Request/Category/Average=5FAR=20=E2=86=92=20=5FAL,=20add=20J?= =?UTF-8?q?oint=5FAR?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Methodology bump 0.1.0 → 1.0.0. The JSON-key rename is what the specdec_visualizer reads (the previous name had drifted between upstream and iputterman's fork, leaving the visualizer with a hardcoded *_AL it couldn't find in upstream-produced runs). Semantically the values were always acceptance LENGTH (mean tokens generated per inference step), so the rename matches what the numbers actually mean. acceptance_rate.py / mtbench.py / specbench.py — output keys renamed: Request_AR → Request_AL Category_AR → Category_AL Average_AR → Average_AL (Local Python identifiers and the acceptance_rate.json filename are unchanged; only the on-disk JSON keys move.) acceptance_rate.py also gains Joint_Acceptance_Rate alongside Conditional_Acceptance_Rate — running product of the conditionals, = P(>=k tokens accepted in a row). Computed from existing data so no extra cost. Matches the field the visualizer renders in its joint-AR panel. __init__.py — __version__ = "1.0.0" with the rationale recorded in-source. Pre-1.0.0 runs in S3 carry *_AR and no Joint_AR; the visualizer's version-aware aggregation (Phase 4) will keep them in separate panels from 1.0.0 runs. The smoke YAMLs (Qwen3.5-4B NONE + MTP) and the harness in common/specdec_bench/run.sh are unchanged. Re-running them against this commit produces *_AL + Joint_AR; re-uploading overwrites the existing qwen35_4_mtp_smoke_2026-05-21 sweep on S3. Signed-off-by: chenhany --- .../specdec_bench/specdec_bench/__init__.py | 13 ++++++++-- .../specdec_bench/metrics/acceptance_rate.py | 18 ++++++++++---- .../specdec_bench/metrics/mtbench.py | 14 +++++------ .../specdec_bench/metrics/specbench.py | 24 +++++++++---------- 4 files changed, 43 insertions(+), 26 deletions(-) diff --git a/examples/specdec_bench/specdec_bench/__init__.py b/examples/specdec_bench/specdec_bench/__init__.py index d721315cca0..7882d772f5e 100644 --- a/examples/specdec_bench/specdec_bench/__init__.py +++ b/examples/specdec_bench/specdec_bench/__init__.py @@ -15,7 +15,16 @@ # Methodology version. Bump: # - minor (0.X.0) when adding a new metric or strictly-additive provenance field -# - major (X.0.0) when changing how an existing metric is computed +# - major (X.0.0) when changing how an existing metric is computed OR its +# on-disk field names (incompatible with prior consumers / visualizers) # The visualizer aggregates runs by major version to avoid apple-to-orange # comparisons across methodology changes. -__version__ = "0.1.0" +# +# 1.0.0: rename Request_AR / Category_AR / Average_AR → *_AL across the +# SpecBench / AcceptanceRate / MTBench metric writers, AND add +# Joint_Acceptance_Rate to the AcceptanceRate metric. The renamed +# values were always acceptance LENGTH (mean tokens generated per +# inference step), not a rate, and the visualizer reads *_AL. +# Pre-1.0.0 runs in S3 have *_AR and no Joint_AR; they must be +# re-run or post-processed before comparing. +__version__ = "1.0.0" diff --git a/examples/specdec_bench/specdec_bench/metrics/acceptance_rate.py b/examples/specdec_bench/specdec_bench/metrics/acceptance_rate.py index fffd9ebf789..c9bbe743647 100644 --- a/examples/specdec_bench/specdec_bench/metrics/acceptance_rate.py +++ b/examples/specdec_bench/specdec_bench/metrics/acceptance_rate.py @@ -55,23 +55,31 @@ def _process_lengths(self, lengths): self.out["Conditional_Acceptance_Rate"][k] = running_len / sum_lengths / prev_ratio prev_ratio = running_len / sum_lengths running_len -= v + # Joint acceptance rate at step k = product of conditional acceptance + # rates at steps 1..k = probability that ≥k tokens are accepted in + # a row. The visualizer renders this as a separate panel. + self.out["Joint_Acceptance_Rate"] = {} + running_joint = 1.0 + for k, cond_ar in self.out["Conditional_Acceptance_Rate"].items(): + running_joint *= cond_ar + self.out["Joint_Acceptance_Rate"][k] = running_joint def process_final(self, text_outputs): all_ar = [] lengths = {} - self.out["Request_AR"] = {} + self.out["Request_AL"] = {} self.prompt_ar = dict(sorted(self.prompt_ar.items(), key=lambda x: x[0])) for request_id, turns in self.prompt_ar.items(): - self.out["Request_AR"][request_id] = {} + self.out["Request_AL"][request_id] = {} for turn_id, turn in turns.items(): ar = sum(turn) / len(turn) - self.out["Request_AR"][request_id][turn_id] = ar + self.out["Request_AL"][request_id][turn_id] = ar all_ar.append(ar) self._get_lengths(turn, lengths) - print(request_id, turn_id, self.out["Request_AR"][request_id][turn_id]) + print(request_id, turn_id, self.out["Request_AL"][request_id][turn_id]) average_ar = sum(all_ar) / len(all_ar) print("Average AR:", average_ar) - self.out["Average_AR"] = average_ar + self.out["Average_AL"] = average_ar self._process_lengths(lengths) self.write() self._format_write_output(text_outputs) diff --git a/examples/specdec_bench/specdec_bench/metrics/mtbench.py b/examples/specdec_bench/specdec_bench/metrics/mtbench.py index 0a7dd8b485b..c073aa55ce5 100644 --- a/examples/specdec_bench/specdec_bench/metrics/mtbench.py +++ b/examples/specdec_bench/specdec_bench/metrics/mtbench.py @@ -34,29 +34,29 @@ class MTBench(AcceptanceRate): def process_final(self, text_outputs): i = 0 lengths = {} - self.out["Request_AR"] = {} + self.out["Request_AL"] = {} self.prompt_ar = dict(sorted(self.prompt_ar.items(), key=lambda x: x[0])) for request_id, turns in self.prompt_ar.items(): turn_1 = turns[0] turn_2 = turns[1] q_id = request_id mtbench_topic = MTBENCH_TOPICS[q_id // 10] - self.out["Request_AR"][request_id] = sum(turn_1 + turn_2) / len(turn_1 + turn_2) + self.out["Request_AL"][request_id] = sum(turn_1 + turn_2) / len(turn_1 + turn_2) self._get_lengths(turn_1, lengths) self._get_lengths(turn_2, lengths) print(mtbench_topic, sum(turn_1 + turn_2) / len(turn_1 + turn_2)) per_category = [[] for _ in range(len(MTBENCH_TOPICS))] - for q_id, ar in self.out["Request_AR"].items(): + for q_id, ar in self.out["Request_AL"].items(): per_category[q_id // 10].append(ar) - self.out["Category_AR"] = {} + self.out["Category_AL"] = {} for i, category in enumerate(per_category): if len(category) > 0: category_ar = sum(category) / len(category) - self.out["Category_AR"][MTBENCH_TOPICS[i]] = category_ar + self.out["Category_AL"][MTBENCH_TOPICS[i]] = category_ar print(f"{MTBENCH_TOPICS[i]} Average AR: {category_ar}") - average_ar = sum(self.out["Request_AR"].values()) / len(self.out["Request_AR"]) + average_ar = sum(self.out["Request_AL"].values()) / len(self.out["Request_AL"]) print("Average AR:", average_ar) - self.out["Average_AR"] = average_ar + self.out["Average_AL"] = average_ar self._process_lengths(lengths) self.write() self._format_write_output(text_outputs) diff --git a/examples/specdec_bench/specdec_bench/metrics/specbench.py b/examples/specdec_bench/specdec_bench/metrics/specbench.py index 32ab3d1c7a5..18902a659a3 100644 --- a/examples/specdec_bench/specdec_bench/metrics/specbench.py +++ b/examples/specdec_bench/specdec_bench/metrics/specbench.py @@ -44,26 +44,26 @@ def __init__(self, requests): def process_final(self, text_outputs): lengths = {} - self.out["Request_AR"] = {} + self.out["Request_AL"] = {} for request_id, request in enumerate(self.requests): turns = self.prompt_ar[request_id].values() assert len(turns) == len(request.turns), ( f"Number of turns {len(turns)} does not match number of turns in request {len(request.turns)}" ) - self.out["Request_AR"][request.question_id] = mean(list(chain(*turns))) + self.out["Request_AL"][request.question_id] = mean(list(chain(*turns))) for turn in turns: self._get_lengths(turn, lengths) - print(request.category, self.out["Request_AR"][request.question_id]) + print(request.category, self.out["Request_AL"][request.question_id]) per_category = defaultdict(list) for request in self.requests: - per_category[request.category].append(self.out["Request_AR"][request.question_id]) - self.out["Category_AR"] = {} + per_category[request.category].append(self.out["Request_AL"][request.question_id]) + self.out["Category_AL"] = {} for category_name, category_ar in per_category.items(): if len(category_ar) > 0: category_ar = mean(category_ar) - self.out["Category_AR"][category_name] = category_ar - average_ar = mean(self.out["Request_AR"].values()) - self.out["Average_AR"] = average_ar + self.out["Category_AL"][category_name] = category_ar + average_ar = mean(self.out["Request_AL"].values()) + self.out["Average_AL"] = average_ar self._process_lengths(lengths) self.write() self._format_write_output(text_outputs) @@ -96,12 +96,12 @@ def _pretty_print_results(self): table.add_column("Average AR", justify="right", style="green") # Add category rows - for category_name, category_ar in sorted(self.out["Category_AR"].items()): + for category_name, category_ar in sorted(self.out["Category_AL"].items()): table.add_row(category_name, f"{category_ar:.4f}") # Add separator and summary row table.add_section() - table.add_row("[bold]Overall Average[/bold]", f"[bold]{self.out['Average_AR']:.4f}[/bold]") + table.add_row("[bold]Overall Average[/bold]", f"[bold]{self.out['Average_AL']:.4f}[/bold]") console.print(table) @@ -124,8 +124,8 @@ def _create_visualizations( df_clean = pd.DataFrame.from_dict( { - "question_id": list(self.out["Request_AR"].keys()), - "acceptance_rate": list(self.out["Request_AR"].values()), + "question_id": list(self.out["Request_AL"].keys()), + "acceptance_rate": list(self.out["Request_AL"].values()), "category": [request.category for request in self.requests], "response_length": [ mean([len(c["content"]) for c in messages if c["role"] == "assistant"]) From 8bda3571ec5e0543c70e1261eea307f4af1c6ec8 Mon Sep 17 00:00:00 2001 From: chenhany Date: Fri, 22 May 2026 09:06:45 -0700 Subject: [PATCH 07/10] [OMNIML-4788] specdec_bench: stamp uid + jira_ticket + huggingface_model_id MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds three attestation fields to dump_env() so the downstream visualizer can distinguish JIRA-tracked official runs (which must reside on a HuggingFace Hub checkpoint) from community-contributed runs. jira_ticket — from $JIRA_TICKET env; harness sets this only after verifying huggingface_model_id resolves on Hub. None for contributed / standalone runs. huggingface_model_id — canonical "org/model" Hub identifier from $HUGGINGFACE_MODEL_ID env. None when the run was driven from a local-only checkpoint dir that isn't (or isn't claimed to be) on Hub. uid — sha256 of (timestamp + model_dir + algorithm + concurrency + specdec_bench_sha)[:16]. Deterministic so accidental re-uploads of the same run dedupe; varies whenever any of the inputs change. Stable identifier the visualizer / CSV exporter / S3 dedupe logic can key on, separate from sweep_name/run_name which can be renamed at upload time. No new dependencies; hashlib was already imported. The "official" designation lives entirely at attestation time — the specdec_bench code itself doesn't know or care; it just records the value the harness passed. Signed-off-by: chenhany --- examples/specdec_bench/specdec_bench/utils.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/examples/specdec_bench/specdec_bench/utils.py b/examples/specdec_bench/specdec_bench/utils.py index 9e4648413de..08ea07b8480 100644 --- a/examples/specdec_bench/specdec_bench/utils.py +++ b/examples/specdec_bench/specdec_bench/utils.py @@ -299,6 +299,25 @@ def dump_env(args, save_dir, overrides=None): # UTC timestamp. config["timestamp"] = datetime.datetime.now(datetime.timezone.utc).isoformat() + # Attestation fields used by the visualizer to distinguish JIRA-tracked + # official runs (must be on HuggingFace Hub) from community-contributed + # runs. The harness sets JIRA_TICKET only after verifying the checkpoint + # resolves on Hub; standalone runs leave both empty. + config["jira_ticket"] = os.environ.get("JIRA_TICKET") or None + config["huggingface_model_id"] = os.environ.get("HUGGINGFACE_MODEL_ID") or None + + # Deterministic per-run UID. SHA-256 of the small set of inputs that + # together identify this run; same inputs → same UID, so accidental + # re-uploads of the same run dedupe in the visualizer. + uid_parts = "|".join([ + str(config["timestamp"]), + str(getattr(args, "model_dir", "") or ""), + str(getattr(args, "speculative_algorithm", "") or ""), + str(getattr(args, "concurrency", "") or ""), + str(config.get("specdec_bench_sha") or ""), + ]) + config["uid"] = hashlib.sha256(uid_parts.encode()).hexdigest()[:16] + os.makedirs(save_dir, exist_ok=True) with open(os.path.join(save_dir, "configuration.json"), "w") as f: json.dump(config, f, indent=4, default=str) From 0b759a29a445c7fa4520c5de7d06c927ad9fb716 Mon Sep 17 00:00:00 2001 From: chenhany Date: Fri, 22 May 2026 14:40:58 -0700 Subject: [PATCH 08/10] [OMNIML-4788] specdec_bench: merge s3_utils into upload_to_s3, drop internal defaults MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit upload_to_s3.py was a single-file CLI that only imported helpers from a neighboring s3_utils.py module. The split bought nothing — there was no second consumer, no plan for one, and the package import made the uploader look coupled to the specdec_bench library when in fact it has zero references to bench code (just sentinel filenames). Merging makes this a drop-in single-file tool: anyone can `wget upload_to_s3.py` and run it without `pip install`ing the package or fiddling with PYTHONPATH. Also removes the hardcoded `S3_DEFAULT_ENDPOINT = "https://pdx.s8k.io"` constant + the NVIDIA-internal team-specdec-workgroup defaults that used to live in s3_utils. Endpoint / key id / secret now default to empty (env-var or --flag only). This is a *public* example repo; team-specific infrastructure URLs shouldn't ship in it. Net: 1 file removed (-75 LOC), upload_to_s3.py grew +62 LOC, -13 overall. Diff for downstream callers: change `from specdec_bench.s3_utils import …` to `from upload_to_s3 import …`. The pensieve-intern submit tool (planned follow-up) will set `--endpoint https://pdx.s8k.io --key-id …` from team-secrets storage rather than relying on a baked-in default. Signed-off-by: chenhany --- .../specdec_bench/specdec_bench/s3_utils.py | 75 ------------------- examples/specdec_bench/upload_to_s3.py | 74 +++++++++++++++--- 2 files changed, 62 insertions(+), 87 deletions(-) delete mode 100644 examples/specdec_bench/specdec_bench/s3_utils.py diff --git a/examples/specdec_bench/specdec_bench/s3_utils.py b/examples/specdec_bench/specdec_bench/s3_utils.py deleted file mode 100644 index e7d52f1b05f..00000000000 --- a/examples/specdec_bench/specdec_bench/s3_utils.py +++ /dev/null @@ -1,75 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""S3 upload utilities for specdec_bench results.""" - -from pathlib import Path - -S3_DEFAULT_ENDPOINT = "https://pdx.s8k.io" -S3_DEFAULT_KEY_ID = "" -S3_DEFAULT_SECRET = "" - - -def parse_s3_path(path: str) -> tuple[str, str]: - """'s3://bucket/prefix' → (bucket, prefix). prefix may be empty.""" - without_scheme = path[5:] # strip "s3://" - parts = without_scheme.split("/", 1) - bucket = parts[0] - prefix = parts[1].strip("/") if len(parts) > 1 else "" - return bucket, prefix - - -def make_s3_client(endpoint: str, key_id: str, secret: str): - import boto3 - from botocore.config import Config - - return boto3.client( - "s3", - endpoint_url=endpoint, - aws_access_key_id=key_id, - aws_secret_access_key=secret, - region_name="us-east-1", - config=Config(s3={"addressing_style": "path"}), - ) - - -def s3_prefix_exists(s3, bucket: str, prefix: str) -> bool: - """Return True if any object exists under prefix.""" - resp = s3.list_objects_v2(Bucket=bucket, Prefix=prefix.rstrip("/") + "/", MaxKeys=1) - return bool(resp.get("Contents")) - - -def _upload_files(s3, local_dir: Path, bucket: str, s3_prefix: str) -> None: - """Upload all files under local_dir without any existence check.""" - for file_path in sorted(local_dir.rglob("*")): - if not file_path.is_file(): - continue - rel = file_path.relative_to(local_dir).as_posix() - key = f"{s3_prefix}/{rel}" - s3.upload_file(str(file_path), bucket, key) - print(f" Uploaded: s3://{bucket}/{key}") - - -def upload_run_dir(s3, local_dir: Path, bucket: str, s3_prefix: str) -> None: - """Upload a single run directory to s3://bucket/s3_prefix/. - - Raises ValueError if the destination prefix already has any objects. - """ - s3_prefix = s3_prefix.rstrip("/") - if s3_prefix_exists(s3, bucket, s3_prefix): - raise ValueError( - f"S3 destination already exists: s3://{bucket}/{s3_prefix} — refusing to overwrite" - ) - _upload_files(s3, local_dir, bucket, s3_prefix) diff --git a/examples/specdec_bench/upload_to_s3.py b/examples/specdec_bench/upload_to_s3.py index edf838b3399..f807e06f3d0 100644 --- a/examples/specdec_bench/upload_to_s3.py +++ b/examples/specdec_bench/upload_to_s3.py @@ -42,18 +42,68 @@ import sys from pathlib import Path -from specdec_bench.s3_utils import ( - S3_DEFAULT_ENDPOINT, - S3_DEFAULT_KEY_ID, - S3_DEFAULT_SECRET, - make_s3_client, - parse_s3_path, - upload_run_dir, -) - _RUN_SENTINELS = ("configuration.json", "timing.json", "aa_timing.json", "acceptance_rate.json") +# ── S3 helpers ──────────────────────────────────────────────────────────────── +# Inlined from a former specdec_bench/s3_utils.py so this stays a drop-in +# single-file tool: a contributor can `wget upload_to_s3.py` and run it without +# installing the specdec_bench package. Defaults are empty — endpoint, key id, +# and secret are taken from --endpoint / --key-id / --secret (or the +# corresponding S3_ENDPOINT / S3_KEY_ID / S3_SECRET env vars). No team-specific +# infrastructure leaks into this public example. + + +def parse_s3_path(path: str) -> tuple[str, str]: + """'s3://bucket/prefix' → (bucket, prefix). prefix may be empty.""" + without_scheme = path[5:] # strip "s3://" + parts = without_scheme.split("/", 1) + bucket = parts[0] + prefix = parts[1].strip("/") if len(parts) > 1 else "" + return bucket, prefix + + +def make_s3_client(endpoint: str, key_id: str, secret: str): + import boto3 + from botocore.config import Config + return boto3.client( + "s3", + endpoint_url=endpoint or None, + aws_access_key_id=key_id, + aws_secret_access_key=secret, + region_name="us-east-1", + config=Config(s3={"addressing_style": "path"}), + ) + + +def s3_prefix_exists(s3, bucket: str, prefix: str) -> bool: + resp = s3.list_objects_v2(Bucket=bucket, Prefix=prefix.rstrip("/") + "/", MaxKeys=1) + return bool(resp.get("Contents")) + + +def _upload_files(s3, local_dir: Path, bucket: str, s3_prefix: str) -> None: + for file_path in sorted(local_dir.rglob("*")): + if not file_path.is_file(): + continue + rel = file_path.relative_to(local_dir).as_posix() + key = f"{s3_prefix}/{rel}" + s3.upload_file(str(file_path), bucket, key) + print(f" Uploaded: s3://{bucket}/{key}") + + +def upload_run_dir(s3, local_dir: Path, bucket: str, s3_prefix: str) -> None: + """Upload a single run directory to s3://bucket/s3_prefix/. + + Raises ValueError if the destination prefix already has any objects. + """ + s3_prefix = s3_prefix.rstrip("/") + if s3_prefix_exists(s3, bucket, s3_prefix): + raise ValueError( + f"S3 destination already exists: s3://{bucket}/{s3_prefix} — refusing to overwrite" + ) + _upload_files(s3, local_dir, bucket, s3_prefix) + + def _is_run_dir(d: Path) -> bool: return any((d / f).exists() for f in _RUN_SENTINELS) @@ -102,18 +152,18 @@ def main(): ) parser.add_argument( "--endpoint", - default=os.environ.get("S3_ENDPOINT", S3_DEFAULT_ENDPOINT), + default=os.environ.get("S3_ENDPOINT", ""), help="S3 endpoint URL", ) parser.add_argument( "--key-id", - default=os.environ.get("S3_KEY_ID", S3_DEFAULT_KEY_ID), + default=os.environ.get("S3_KEY_ID", ""), dest="key_id", help="S3 access key ID", ) parser.add_argument( "--secret", - default=os.environ.get("S3_SECRET", S3_DEFAULT_SECRET), + default=os.environ.get("S3_SECRET", ""), help="S3 secret access key", ) parser.add_argument( From 95e8040c2bd802401c79b96ff5476a1edabd8667 Mon Sep 17 00:00:00 2001 From: chenhany Date: Fri, 22 May 2026 14:46:37 -0700 Subject: [PATCH 09/10] [OMNIML-4788] specdec_bench: drop aa_timing.json from sentinel set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Removes the only AA (Acceptance-Aware throughput) reference my Phase-1a PR introduces. The AATiming class and the --aa_timing flag remain on main; this is solely about not advertising AA artifacts in the upload tool's run-discovery heuristic. _RUN_SENTINELS is used to recognize a directory as a "run" (i.e. it has bench output) for the discover/upload walk — dropping aa_timing.json from the tuple means a directory that contains only an aa_timing.json (and none of the other three sentinels) would no longer be treated as a run. In practice every real run also writes configuration.json and timing.json, so this is a no-op for correctly-produced run dirs; it just stops the upload tool from acknowledging AA as a first-class artifact. Signed-off-by: chenhany --- examples/specdec_bench/upload_to_s3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/specdec_bench/upload_to_s3.py b/examples/specdec_bench/upload_to_s3.py index f807e06f3d0..517bd53cd1c 100644 --- a/examples/specdec_bench/upload_to_s3.py +++ b/examples/specdec_bench/upload_to_s3.py @@ -42,7 +42,7 @@ import sys from pathlib import Path -_RUN_SENTINELS = ("configuration.json", "timing.json", "aa_timing.json", "acceptance_rate.json") +_RUN_SENTINELS = ("configuration.json", "timing.json", "acceptance_rate.json") # ── S3 helpers ──────────────────────────────────────────────────────────────── From 70e8cebc5921d44e7fa227b8b9e746c9da590bf8 Mon Sep 17 00:00:00 2001 From: chenhany Date: Fri, 22 May 2026 15:03:09 -0700 Subject: [PATCH 10/10] [OMNIML-4788] specdec_bench/README: document S3 upload + attestation env vars MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a self-contained "Uploading results to S3" section above the existing "Notes" block. Covers: - Basic upload_to_s3.py invocation, with the three required S3 args and their corresponding env-var fallbacks (S3_ENDPOINT / S3_KEY_ID / S3_SECRET). - Both supported source layouts (flat single-run dir vs sweep dir containing many run dirs), and the destination-prefix preservation rule used by the discovery walk. - --dry-run and --skip-existing as the two operational flags worth naming in the README (vs digging through --help). A short follow-up subsection documents the two new optional provenance env vars introduced by the Phase-1a dump_env work (JIRA_TICKET, HUGGINGFACE_MODEL_ID) so contributors know how to populate the attestation fields downstream tooling reads. The README does not pin any team-specific bucket or endpoint — public users see a generic "s3://your-bucket/some/prefix" placeholder and any S3-compatible store. Signed-off-by: chenhany --- examples/specdec_bench/README.md | 48 ++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/examples/specdec_bench/README.md b/examples/specdec_bench/README.md index 1987167e7c8..1322fc00196 100644 --- a/examples/specdec_bench/README.md +++ b/examples/specdec_bench/README.md @@ -145,6 +145,54 @@ python3 run.py \ --runtime_params runtime_args_long_context.yaml ``` +## Uploading results to S3 + +Each `run.py` invocation writes a result directory containing `configuration.json`, +`timing.json`, `acceptance_rate.json`, and (when applicable) `mtbench.json` / `specbench.json`. +`upload_to_s3.py` is a single-file, drop-in tool that uploads one run — or an entire sweep — +to any S3-compatible bucket: + +```bash +python upload_to_s3.py /path/to/run_or_sweep_dir s3://your-bucket/some/prefix \ + --endpoint https://your-s3-endpoint \ + --key-id YOUR_KEY_ID \ + --secret YOUR_SECRET +``` + +`--endpoint`, `--key-id`, and `--secret` default to the `S3_ENDPOINT`, `S3_KEY_ID`, and +`S3_SECRET` environment variables. Omit `--endpoint` (or set `S3_ENDPOINT=""`) to use AWS S3's +default endpoint. Use `--dry-run` to preview the upload plan, and `--skip-existing` to skip +runs already present at the destination instead of failing. + +The tool handles two directory layouts and mirrors them into S3: + +- **Flat** — `LOCAL_DIR/run_name/{configuration,timing,...}.json` +- **Sweep** — `LOCAL_DIR/sweep_name/run_name/{configuration,timing,...}.json` + +`LOCAL_DIR`'s basename is preserved in the destination prefix, so re-uploads from the same +source land in the same place. + +### Optional attestation fields + +`run.py` reads two environment variables when writing `configuration.json`; they're optional +provenance hints that downstream consumers (dashboards, leaderboards) can use to attest a run: + +| Env var | Purpose | +|---|---| +| `JIRA_TICKET` | A tracking ID for the run (your tracker — JIRA key, GitHub issue, etc.) | +| `HUGGINGFACE_MODEL_ID` | The public model id on the Hugging Face Hub, so the model used can be independently fetched | + +Set them in the same shell that launches `run.py`: + +```bash +export JIRA_TICKET=MYTRACK-1234 +export HUGGINGFACE_MODEL_ID=meta-llama/Llama-3.3-70B-Instruct +python3 run.py ... +``` + +Both fields appear in the run's `configuration.json` as `jira_ticket` and `huggingface_model_id` +(or `null` when unset). They have no effect on the benchmark itself. + ## Notes The goal of this benchmark is to provide an easy way to configure, run, and compare speculative implementations across frameworks in an apples-to-apples method.