Merged
Changes from 1 commit
42 commits
5465d8b
Replace chunked FLA with recurrent gated delta rule for T=1 decode
Gasoonjia Apr 2, 2026
a6ebe8a
Runtime dispatch: recurrent (T=1) vs chunked (T>1) inside triton_op
Gasoonjia Apr 3, 2026
fc5018e
Revert model.py, export.py, main.cpp to main branch
Gasoonjia Apr 3, 2026
c90a8e8
Add tests for recurrent (T=1) and multi-T dispatch
Gasoonjia Apr 3, 2026
ce3e9ca
lint fix - 2
Gasoonjia Apr 3, 2026
8d35c65
lint fix - 2
Gasoonjia Apr 3, 2026
709deb0
Merge branch 'main' into recurrent-fla
Gasoonjia Apr 3, 2026
eff976d
lint fix - 3
Gasoonjia Apr 3, 2026
7dd4280
Optimize recurrent kernel: parallelize over V tiles
Gasoonjia Apr 3, 2026
3a1ee31
Dual-method PTE with GPU-resident state for Qwen3.5 MoE
Apr 5, 2026
63c162e
Use share_mutable_buffers to eliminate select_scatter overhead
Apr 6, 2026
47d6b98
Merge branch 'main' into recurrent-fla
Gasoonjia Apr 6, 2026
375e5c0
lint
Gasoonjia Apr 6, 2026
2b36797
remove redundant updates
Gasoonjia Apr 6, 2026
c06d58b
Cross-method AOTI constant sharing for KV cache
Apr 7, 2026
6945b2a
Fix cross-method AOTI constant sharing and add dual-method runner
Gasoonjia Apr 7, 2026
ea51d0d
Remove debug printf and decode_only flag
Gasoonjia Apr 7, 2026
a0a62f1
Lint formatting fixes
Gasoonjia Apr 7, 2026
ca69871
Improve CUDA backend error handling and add dual-method runner fallback
Apr 9, 2026
7c148f7
Add CUDA graph capture/replay for decode method
Apr 10, 2026
ee75c2e
Merge branch 'main' into cuda-graph
Gasoonjia Apr 10, 2026
10e7aad
lint and reformat
Gasoonjia Apr 13, 2026
9042f36
Merge branch 'main' into cuda-graph
Gasoonjia Apr 13, 2026
84d1587
Merge branch 'main' into cuda-graph
Gasoonjia Apr 15, 2026
e00a499
solve claude
Gasoonjia Apr 15, 2026
aa7bb82
Merge branch 'main' into cuda-graph
Gasoonjia Apr 15, 2026
cef386b
Merge branch 'main' into cuda-graph
Gasoonjia Apr 15, 2026
2d32422
Merge branch 'main' into cuda-graph
Gasoonjia Apr 16, 2026
1270870
Merge branch 'main' into cuda-graph
Gasoonjia Apr 16, 2026
8fc7355
solve stride out of scope
Gasoonjia Apr 17, 2026
2c46ed2
Merge branch 'main' into cuda-graph
Gasoonjia Apr 21, 2026
855eb93
Merge branch 'main' into cuda-graph
Gasoonjia Apr 22, 2026
4237d17
remove unused env var
Gasoonjia Apr 22, 2026
9b4705e
Merge branch 'main' into cuda-graph
Gasoonjia Apr 23, 2026
0492e8d
Add GPU-side Gumbel-max sampling for CUDA graph compatibility
Apr 13, 2026
8c0bbf3
lintrunner
Gasoonjia Apr 13, 2026
5245f64
remove git info
Gasoonjia Apr 23, 2026
880391d
reintro llm headers
Gasoonjia Apr 23, 2026
6f411af
lint
Gasoonjia Apr 24, 2026
eff4294
add top-p and top-k arg
Gasoonjia Apr 24, 2026
61d47aa
move top-p and top-k support into an individual PR
Gasoonjia Apr 24, 2026
3e185c0
Merge branch 'main' into cuda-graph-sampling
Gasoonjia Apr 27, 2026
move top-p and top-k support into an individual PR
Gasoonjia committed Apr 24, 2026
commit 61d47aa5ed2c0c1e19d06a80abbec49c7b66e5ac
examples/models/qwen3_5_moe/export.py (27 changes: 2 additions & 25 deletions)
@@ -770,23 +770,10 @@ def _export_cuda(model, config, args):
     decode_tokens = torch.tensor([[0]], dtype=torch.long)
     decode_pos = torch.tensor([0], dtype=torch.long)
     decode_temperature = torch.tensor([1.0], dtype=torch.float32)
-    # top_k / top_p are runtime scalar tensors (parallel to temperature) so
-    # the same .pte can be re-driven with different sampling configurations
-    # without re-export. Default examples are no-op values: top_k=V (keep
-    # all tokens), top_p=1.0 (keep full nucleus). Callers override them at
-    # runtime by binding different scalar tensors.
-    decode_top_k = torch.tensor(config.vocab_size, dtype=torch.int64)
-    decode_top_p = torch.tensor(1.0, dtype=torch.float32)
     with torch.no_grad():
         decode_ep = export(
             model,
-            (
-                decode_tokens,
-                decode_pos,
-                decode_temperature,
-                decode_top_k,
-                decode_top_p,
-            ),
+            (decode_tokens, decode_pos, decode_temperature),
             strict=True,
         )
     print("Decode export successful!")
@@ -803,26 +790,16 @@ def _export_cuda(model, config, args):
     prefill_tokens = torch.zeros((1, example_prefill_len), dtype=torch.long)
     prefill_pos = torch.arange(example_prefill_len, dtype=torch.long)
     prefill_temperature = torch.tensor([1.0], dtype=torch.float32)
-    prefill_top_k = torch.tensor(config.vocab_size, dtype=torch.int64)
-    prefill_top_p = torch.tensor(1.0, dtype=torch.float32)
     seq_dim = Dim("seq_len", min=2, max=config.max_seq_len - 1)
     prefill_dynamic_shapes = (
         {1: seq_dim},  # tokens
         {0: seq_dim},  # input_pos
         None,  # temperature (static scalar tensor)
-        None,  # top_k (static scalar tensor — runtime-bindable)
-        None,  # top_p (static scalar tensor — runtime-bindable)
     )
     with torch.no_grad():
         prefill_ep = export(
             model,
-            (
-                prefill_tokens,
-                prefill_pos,
-                prefill_temperature,
-                prefill_top_k,
-                prefill_top_p,
-            ),
+            (prefill_tokens, prefill_pos, prefill_temperature),
             dynamic_shapes=prefill_dynamic_shapes,
             strict=True,
         )
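A note on the surviving pattern: temperature remains a scalar example input because tensors passed to export() stay runtime-bindable, so one .pte can be driven with different sampling settings without re-export. Below is a minimal, self-contained sketch of that pattern; TinyHead and its shapes are hypothetical, not code from this PR.

import torch
from torch.export import export


class TinyHead(torch.nn.Module):
    # Hypothetical module, only to illustrate the runtime-tensor pattern.
    def forward(self, logits: torch.Tensor, temperature: torch.Tensor):
        # Dividing by a tensor input (rather than a Python float) keeps
        # `temperature` a runtime input of the exported program instead of
        # baking it in as a constant.
        return torch.argmax(logits / temperature.clamp(min=1e-6), dim=-1)


ep = export(TinyHead(), (torch.randn(1, 32), torch.tensor([1.0])), strict=True)
# Re-drive the same exported program with a different temperature:
out = ep.module()(torch.randn(1, 32), torch.tensor([0.5]))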
examples/models/qwen3_5_moe/main.cpp (28 changes: 0 additions & 28 deletions)
@@ -37,14 +37,6 @@ DEFINE_string(
 DEFINE_double(temperature, 0.8, "Sampling temperature (0 = greedy).");
 DEFINE_int32(max_new_tokens, 128, "Maximum tokens to generate.");
 DEFINE_bool(cuda_graph, false, "Enable CUDA graph for decode method.");
-DEFINE_int64(
-    top_k,
-    -1,
-    "Top-k sampling cutoff (<=0 = no-op default of vocab_size, keeps all tokens).");
-DEFINE_double(
-    top_p,
-    1.0,
-    "Top-p (nucleus) sampling threshold. 1.0 = no-op (keeps full nucleus).");
 
 namespace llm = ::executorch::extension::llm;
 using ::executorch::extension::from_blob;
@@ -206,22 +198,6 @@ int main(int argc, char** argv) {
   auto temp_tensor =
       from_blob(&temp_val, {1}, executorch::aten::ScalarType::Float);
 
-  // top_k / top_p are 0-D scalar tensors matching the export-time signature
-  // (see examples/models/qwen3_5_moe/export.py). The default flag values
-  // (top_k = vocab_size, top_p = 1.0) are mathematical no-ops: the sort+
-  // scatter subgraph still runs (it was traced into the graph at export
-  // time), but produces all-False filter masks so logits pass through
-  // unchanged. Override at runtime to enable real filtering.
-  int64_t vocab_size = metadata.count(llm::kVocabSize)
-      ? metadata[llm::kVocabSize]
-      : static_cast<int64_t>(tokenizer->vocab_size());
-  int64_t top_k_val = (FLAGS_top_k <= 0) ? vocab_size : FLAGS_top_k;
-  float top_p_val = static_cast<float>(FLAGS_top_p);
-  auto top_k_tensor =
-      from_blob(&top_k_val, {}, executorch::aten::ScalarType::Long);
-  auto top_p_tensor =
-      from_blob(&top_p_val, {}, executorch::aten::ScalarType::Float);
-
   // ---------------------------------------------------------------
   // Prefill
   // ---------------------------------------------------------------
@@ -252,8 +228,6 @@ int main(int argc, char** argv) {
   prefill_inputs.push_back(tokens_tensor);
   prefill_inputs.push_back(pos_tensor);
   prefill_inputs.push_back(temp_tensor);
-  prefill_inputs.push_back(top_k_tensor);
-  prefill_inputs.push_back(top_p_tensor);
 
   auto prefill_result = module->execute(run_method, prefill_inputs);
   if (prefill_result.error() != Error::Ok) {
@@ -302,8 +276,6 @@ int main(int argc, char** argv) {
   decode_inputs.push_back(EValue(decode_tokens));
   decode_inputs.push_back(EValue(decode_pos));
   decode_inputs.push_back(EValue(temp_tensor));
-  decode_inputs.push_back(EValue(top_k_tensor));
-  decode_inputs.push_back(EValue(top_p_tensor));
 
   auto decode_result = module->execute("decode", decode_inputs);
   if (decode_result.error() != Error::Ok) {
examples/models/qwen3_5_moe/model.py (13 changes: 6 additions & 7 deletions)
@@ -631,8 +631,6 @@ def forward(
         tokens: torch.LongTensor,
         input_pos: torch.LongTensor,
         temperature: Optional[torch.Tensor] = None,
-        top_k: Optional[torch.Tensor] = None,
-        top_p: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         x = self.embed_tokens(tokens)
         for layer in self.layers:
@@ -642,16 +640,17 @@ def forward(
         # logits so callers (eval, custom samplers) can inspect every
         # position. Otherwise apply the prefill optimization and only
         # materialize ``[B, V]`` for the last token.
-        if temperature is None and top_k is None and top_p is None:
-            return self.lm_head(x)
+        if temperature is None:
+            return self.lm_head(x).float()  # [B, T, V] float32
         logits = self.lm_head(x[:, -1, :]).float()  # [B, V] float32
         # GPU-side Gumbel-max sampling: argmax(logits/T + gumbel_noise) is
         # equivalent to drawing from softmax(logits/T) but stays entirely
-        # on-device.
+        # on-device. Algorithm reference:
+        # https://huggingface.co/blog/cxdu/fastsampling
         # TODO(gasoonjia): once the on-device sampling stack lands, promote
         # ``sample`` into a shared CUDA sampling utility reusable by other
-        # models.
-        return sample(logits, temperature, top_k, top_p)  # [B, 1]
+        # models, and add top-k / top-p filtering support.
+        return sample(logits, temperature)  # [B, 1]
 
     @staticmethod
     def from_hf_checkpoint(model_dir, max_seq_len=4096):
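The Gumbel-max equivalence cited in the comment (argmax(logits/T + gumbel_noise) draws from softmax(logits/T)) is easy to check empirically. A standalone sketch, illustrative only and not part of this PR:

import torch

torch.manual_seed(0)
logits = torch.tensor([2.0, 1.0, 0.0])
n = 200_000

# Reference: draw directly from softmax(logits).
ref = torch.multinomial(torch.softmax(logits, dim=-1), n, replacement=True)

# Gumbel-max: argmax(logits + g) with g ~ Gumbel(0, 1) via -log(-log(U)).
u = torch.rand(n, 3)
g = -torch.log(-torch.log(u + 1e-20) + 1e-20)
gm = torch.argmax(logits + g, dim=-1)

# Both empirical distributions converge to softmax(logits),
# approximately [0.665, 0.245, 0.090].
for k in range(3):
    print(k, (ref == k).float().mean().item(), (gm == k).float().mean().item())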
examples/models/qwen3_5_moe/sampler.py (73 changes: 15 additions & 58 deletions)
@@ -1,12 +1,12 @@
 """
-GPU-side Gumbel-max sampler with optional top-k / top-p filtering.
+GPU-side Gumbel-max sampler.
 
 Self-contained sampling utility that can be imported by other models. Lives
 in its own file so it can be reused without pulling in the heavy MoE module.
 
-All sampling parameters (``temperature``, ``top_k``, ``top_p``) are
-**runtime tensors** so a single exported program can be re-driven with
-different sampling configurations without re-export.
+``temperature`` is a runtime tensor so a single exported program can be
+re-driven with different sampling configurations without re-export.
+
 """
 
 from typing import Optional
@@ -17,20 +17,12 @@
 def sample(
     logits: torch.Tensor,
     temperature: Optional[torch.Tensor] = None,
-    top_k: Optional[torch.Tensor] = None,
-    top_p: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
-    """GPU-side Gumbel-max sampler with optional top-k / top-p filtering.
+    """GPU-side Gumbel-max sampler.
 
-    All three sampling knobs are *runtime* scalar tensors so the caller can
-    change them between calls without re-exporting the graph. The Python-
-    level ``is None`` checks are static (decided at trace time) and select
-    which subgraph is emitted; once provided, the actual values are pure
-    tensors and the kernels are fully data-driven.
-
-    When ``temperature``, ``top_k`` and ``top_p`` are all ``None`` (the
-    eager / eval default), the function is a no-op and returns ``logits``
-    unchanged — useful for callers that just want to inspect raw logits.
+    When ``temperature`` is ``None`` (the eager / eval default) the function
+    is a no-op and returns ``logits`` unchanged — useful for callers that
+    just want to inspect raw logits.
 
     Otherwise it draws from ``softmax(logits / temperature)`` entirely
     on-device using the Gumbel-max trick:
@@ -41,58 +33,23 @@ def sample(
     float32 logits. The contract is documented as ``[B, V]`` float32 and
     callers are expected to ``.float()``-cast before invoking ``sample``.
 
+    TODO(gasoonjia): add top-k / top-p filtering support in a follow-up PR.
+
     Args:
         logits: ``[B, V]`` float32 logits.
         temperature: 0-D or 1-D float tensor (clamped to >= 1e-6 to avoid
-            divide-by-zero). ``None`` skips temperature scaling.
-        top_k: 0-D or 1-D int tensor — keep only the top ``k`` logits.
-            ``None`` skips top-k filtering. ``k >= V`` is also a no-op.
-        top_p: 0-D or 1-D float tensor — nucleus threshold; keep the
-            smallest set of logits whose cumulative softmax probability
-            is >= ``top_p``. ``None`` (or ``>= 1.0``) disables top-p.
+            divide-by-zero). ``None`` skips temperature scaling and the
+            sampler returns the unmodified ``logits`` tensor.
 
     Returns:
         ``[B, 1]`` float32 tensor of sampled token IDs, or the unmodified
-        ``logits`` tensor when all sampling parameters are ``None``.
+        ``logits`` tensor when ``temperature`` is ``None``.
     """
     # No sampling configured — return raw logits.
-    if temperature is None and top_k is None and top_p is None:
+    if temperature is None:
         return logits
 
-    if temperature is not None:
-        logits = logits / temperature.clamp(min=1e-6)
-
-    # Single sort handles both top-k and top-p filtering — both branches
-    # need descending logits anyway, so we share the sort to keep the
-    # graph small.
-    if top_k is not None or top_p is not None:
-        sorted_logits, sorted_idx = torch.sort(logits, dim=-1, descending=True)
-        sorted_remove = torch.zeros_like(sorted_logits, dtype=torch.bool)
-
-        if top_k is not None:
-            # Position >= k → drop. Works for any tensor k via broadcast;
-            # k >= V naturally becomes a no-op (mask is all-False).
-            pos = torch.arange(sorted_logits.size(-1), device=sorted_logits.device)
-            sorted_remove = sorted_remove | (pos >= top_k.to(pos.dtype))
-
-        if top_p is not None:
-            cum_probs = torch.softmax(sorted_logits, dim=-1).cumsum(dim=-1)
-            p_remove = cum_probs > top_p
-            # Shift right by one so the highest-prob token is always kept,
-            # even when its single-token prob already exceeds top_p.
-            p_remove = torch.cat(
-                [torch.zeros_like(p_remove[..., :1]), p_remove[..., :-1]],
-                dim=-1,
-            )
-            sorted_remove = sorted_remove | p_remove
-
-        sorted_logits = torch.where(
-            sorted_remove,
-            torch.full_like(sorted_logits, float("-inf")),
-            sorted_logits,
-        )
-        # Scatter the masked sorted logits back into original token order.
-        logits = torch.empty_like(logits).scatter_(-1, sorted_idx, sorted_logits)
+    logits = logits / temperature.clamp(min=1e-6)
 
     # Gumbel-max sampling — equivalent to sampling from softmax(logits)
     # but fully on-device and CUDA-graph friendly. The 1e-20 epsilons are
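For reference while the follow-up PR is pending, the deleted filtering logic condenses to the standalone sketch below. It is reassembled from the removed lines above; filter_logits is a hypothetical name, and it takes plain Python scalars where the original took runtime tensors.

import torch


def filter_logits(logits, top_k=None, top_p=None):
    # Hypothetical standalone helper; the original inlined this in sample()
    # with tensor-valued top_k / top_p.
    sorted_logits, sorted_idx = torch.sort(logits, dim=-1, descending=True)
    remove = torch.zeros_like(sorted_logits, dtype=torch.bool)
    if top_k is not None:
        # Drop every position past the k-th largest logit.
        pos = torch.arange(sorted_logits.size(-1), device=logits.device)
        remove |= pos >= top_k
    if top_p is not None:
        cum = torch.softmax(sorted_logits, dim=-1).cumsum(dim=-1)
        p_remove = cum > top_p
        # Shift right by one so the highest-probability token always survives.
        p_remove = torch.cat(
            [torch.zeros_like(p_remove[..., :1]), p_remove[..., :-1]], dim=-1
        )
        remove |= p_remove
    sorted_logits = sorted_logits.masked_fill(remove, float("-inf"))
    # Scatter the masked sorted logits back into original vocabulary order.
    return torch.empty_like(logits).scatter_(-1, sorted_idx, sorted_logits)


print(filter_logits(torch.tensor([[0.1, 3.0, 1.2, -0.5]]), top_k=2))
# tensor([[-inf, 3.0000, 1.2000, -inf]])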