commit f3cdc37bd63e7750da0f57484d98b480a73e4263
ispobock committed Jan 27, 2026

    update
12 changes: 4 additions & 8 deletions python/sglang/srt/managers/scheduler.py
@@ -2081,14 +2081,10 @@ def _get_new_batch_prefill_raw(
         if self.dllm_staging_reqs.non_empty():
             self.dllm_staging_reqs.update_chunked_status()
 
-        # Print stats
-        if self.current_scheduler_metrics_enabled:
-            self.log_prefill_stats(
-                adder,
-                can_run_list,
-                running_bs=len(self.running_batch.reqs),
-                running_bs_offline_batch=0,
-            )
+        # Record for logging prefill stats after forward
+        self.adder = adder
+        self.can_run_list = can_run_list
+        self.running_bs = len(self.running_batch.reqs)
 
         # Record metrics
         for req in can_run_list:
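Note: this hunk changes when prefill stats are logged. Instead of logging while the batch is assembled, the scheduler stashes the inputs (adder, can_run_list, running_bs) on self and defers the log_prefill_stats call until after the forward pass, when the graph-eligibility flag is known (see the hooks added in scheduler_output_processor_mixin.py below). A minimal sketch of the stash-then-log pattern, using toy names rather than SGLang's real classes:

class TinyScheduler:
    """Toy stand-in for the scheduler; illustrates stash-then-log only."""

    metrics_enabled = True

    def build_prefill_batch(self, reqs):
        # Stash logging inputs instead of logging here; the graph
        # eligibility flag does not exist yet at this point.
        self.can_run_list = list(reqs)
        self.running_bs = len(reqs)
        return reqs

    def process_batch_result(self, can_run_cuda_graph: bool):
        # Log after forward, once can_run_cuda_graph is known.
        if self.metrics_enabled:
            print(
                f"Prefill batch, #new-seq: {len(self.can_run_list)}, "
                f"#running-req: {self.running_bs}, "
                f"cuda graph: {can_run_cuda_graph}"
            )

s = TinyScheduler()
s.build_prefill_batch(["req0", "req1"])
s.process_batch_result(can_run_cuda_graph=True)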
23 changes: 17 additions & 6 deletions python/sglang/srt/managers/scheduler_metrics_mixin.py
@@ -150,6 +150,7 @@ def log_prefill_stats(
         can_run_list: List[Req],
         running_bs: int,
         running_bs_offline_batch: int,
+        can_run_cuda_graph: bool,
     ):
         gap_latency = time.perf_counter() - self.last_prefill_stats_tic
         self.last_prefill_stats_tic = time.perf_counter()
@@ -204,7 +205,7 @@ def log_prefill_stats(
         self.stats.new_token_ratio = adder.new_token_ratio
         iter_msg = f" [{self.forward_ct + 1}]" if LOG_FORWARD_ITERS else ""
 
-        f = (
+        msg = (
             f"Prefill batch{iter_msg}, "
             f"#new-seq: {len(can_run_list)}, "
             f"#new-token: {adder.log_input_tokens}, "
@@ -215,11 +216,21 @@ def log_prefill_stats(
         )
 
         if self.disaggregation_mode == DisaggregationMode.PREFILL:
-            f += f"#prealloc-req: {len(self.disagg_prefill_bootstrap_queue.queue)}, "
-            f += f"#inflight-req: {len(self.disagg_prefill_inflight_queue)}, "
-            f += f"input throughput (token/s): {self.last_input_throughput:.2f}, "
+            msg += f"#prealloc-req: {len(self.disagg_prefill_bootstrap_queue.queue)}, "
+            msg += f"#inflight-req: {len(self.disagg_prefill_inflight_queue)}, "
+            msg += f"input throughput (token/s): {self.last_input_throughput:.2f}, "
 
-        logger.info(f)
+        graph_backend = defaultdict(
+            lambda: "cuda graph",
+            {
+                "cpu": "cpu graph",
+                "npu": "npu graph",
+            },
+        )
+
+        msg += f"{graph_backend[self.device]}: {can_run_cuda_graph}"
+
+        logger.info(msg)
 
         if self.enable_metrics:
             # Basics
@@ -393,7 +404,7 @@ def log_decode_stats(
             msg += (
                 f"{graph_backend[self.device]}: {can_run_cuda_graph}, "
                 f"gen throughput (token/s): {self.last_gen_throughput:.2f}, "
-                f"#queue-req: {len(self.waiting_queue)}, "
+                f"#queue-req: {len(self.waiting_queue)}"
             )
 
             logger.info(msg)
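Note: the graph_backend lookup added to log_prefill_stats mirrors the one already present in log_decode_stats (visible as context in the hunk above): a defaultdict whose default factory labels any unlisted device as "cuda graph", with explicit entries for cpu and npu. A standalone check of that behavior, plain Python with no SGLang imports:

from collections import defaultdict

graph_backend = defaultdict(
    lambda: "cuda graph",   # default factory for any unlisted device
    {
        "cpu": "cpu graph",
        "npu": "npu graph",
    },
)

print(graph_backend["cuda"])  # -> cuda graph (falls back to the factory)
print(graph_backend["npu"])   # -> npu graph (explicit entry)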
18 changes: 18 additions & 0 deletions python/sglang/srt/managers/scheduler_output_processor_mixin.py
@@ -287,6 +287,15 @@ def process_batch_result_prefill(

         self.stream_output(batch.reqs, batch.return_logprob, skip_stream_req)
 
+        if self.current_scheduler_metrics_enabled:
+            self.log_prefill_stats(
+                adder=self.adder,
+                can_run_list=self.can_run_list,
+                running_bs=self.running_bs,
+                running_bs_offline_batch=0,
+                can_run_cuda_graph=result.can_run_cuda_graph,
+            )
+
     def _resolve_spec_overlap_token_ids(
         self: Scheduler, result: GenerationBatchResult, batch: ScheduleBatch
     ) -> List[List[int]]:
@@ -356,6 +365,15 @@ def process_batch_result_dllm(
         self.stream_output(batch.reqs, batch.return_logprob)
         self.token_to_kv_pool_allocator.free_group_end()
 
+        if self.current_scheduler_metrics_enabled:
+            self.log_prefill_stats(
+                adder=self.adder,
+                can_run_list=self.can_run_list,
+                running_bs=self.running_bs,
+                running_bs_offline_batch=0,
+                can_run_cuda_graph=result.can_run_cuda_graph,
+            )
+
     def process_batch_result_decode(
         self: Scheduler,
         batch: ScheduleBatch,
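Note: both hooks assume _get_new_batch_prefill_raw ran first and stashed self.adder / self.can_run_list / self.running_bs, and both pass the new can_run_cuda_graph argument. Since the parameter has no default, any caller left on the old signature fails loudly instead of logging stale data. A toy reproduction of that contract, with a hypothetical simplified signature rather than the real one:

def log_prefill_stats(running_bs: int, can_run_cuda_graph: bool) -> None:
    print(f"#running-req: {running_bs}, cuda graph: {can_run_cuda_graph}")

log_prefill_stats(running_bs=2, can_run_cuda_graph=True)  # updated call: ok
try:
    log_prefill_stats(running_bs=2)  # old call shape, flag missing
except TypeError as err:
    print(err)  # missing 1 required positional argument: 'can_run_cuda_graph'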
30 changes: 20 additions & 10 deletions python/sglang/srt/model_executor/model_runner.py
@@ -2169,7 +2169,9 @@ def forward_extend(
         forward_batch: ForwardBatch,
         skip_attn_backend_init: bool = False,
         pp_proxy_tensors=None,
-    ) -> Union[LogitsProcessorOutput, PPProxyTensors, EmbeddingPoolerOutput]:
+    ) -> Tuple[
+        Union[LogitsProcessorOutput, PPProxyTensors, EmbeddingPoolerOutput], bool
+    ]:
         kwargs = {}
         if self.support_pp:
             kwargs["pp_proxy_tensors"] = pp_proxy_tensors
@@ -2178,20 +2180,28 @@ def forward_extend(
         if not self.is_generation:
             kwargs["get_embedding"] = True
 
-        if (
+        can_run_graph = (
             self.piecewise_cuda_graph_runner is not None
             and self.piecewise_cuda_graph_runner.can_run(forward_batch)
-        ):
-            return self.piecewise_cuda_graph_runner.replay(forward_batch, **kwargs)
+        )
+
+        if can_run_graph:
+            return (
+                self.piecewise_cuda_graph_runner.replay(forward_batch, **kwargs),
+                can_run_graph,
+            )
 
         if not skip_attn_backend_init:
             self.attn_backend.init_forward_metadata(forward_batch)
 
-        return self.model.forward(
-            forward_batch.input_ids,
-            forward_batch.positions,
-            forward_batch,
-            **kwargs,
+        return (
+            self.model.forward(
+                forward_batch.input_ids,
+                forward_batch.positions,
+                forward_batch,
+                **kwargs,
+            ),
+            can_run_graph,
         )
 
     def forward_idle(
@@ -2347,7 +2357,7 @@ def _forward_raw(
                 forward_count=split_forward_count,
             )
         elif forward_batch.forward_mode.is_extend(include_draft_extend_v2=True):
-            ret = self.forward_extend(
+            ret, can_run_graph = self.forward_extend(
                 forward_batch,
                 skip_attn_backend_init=skip_attn_backend_init,
                 pp_proxy_tensors=pp_proxy_tensors,
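Note: forward_extend now returns an (output, can_run_graph) tuple so the graph-eligibility flag travels with the result up to the scheduler's logging, and _forward_raw unpacks it accordingly. A toy version of the tuple-return pattern with assumed names, not the real ModelRunner API:

from typing import Tuple

def forward_extend(graph_runner_can_run: bool) -> Tuple[str, bool]:
    # Report whether the graph-replay path was taken alongside the
    # model output itself.
    can_run_graph = graph_runner_can_run
    if can_run_graph:
        return "graph-replay-output", can_run_graph
    return "eager-output", can_run_graph

# Caller unpacks the tuple, mirroring _forward_raw in this diff.
ret, can_run_graph = forward_extend(graph_runner_can_run=True)
print(ret, can_run_graph)  # -> graph-replay-output True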