diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml
index c3b7c058ee6..68ded356b99 100644
--- a/.github/workflows/cuda.yml
+++ b/.github/workflows/cuda.yml
@@ -145,8 +145,8 @@ jobs:
           # Run CUDA backend Python tests
           python -m pytest backends/cuda/tests backends/cuda/passes/tests -v -o "addopts="
 
-          # Run Qwen 3.5 MoE tests (quantize roundtrip + TurboQuant KV cache + sampler)
-          python -m pytest examples/models/qwen3_5_moe/test_quantize_roundtrip.py examples/models/qwen3_5_moe/test_turboquant.py examples/models/qwen3_5_moe/test_sampler.py -v -o "addopts="
+          # Run Qwen 3.5 MoE tests (quantize roundtrip + TurboQuant KV cache)
+          python -m pytest examples/models/qwen3_5_moe/test_quantize_roundtrip.py examples/models/qwen3_5_moe/test_turboquant.py -v -o "addopts="
 
   export-model-cuda-artifact:
     name: export-model-cuda-artifact
diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt
index 157cc05a54f..b3e82e2ec51 100644
--- a/backends/cuda/CMakeLists.txt
+++ b/backends/cuda/CMakeLists.txt
@@ -107,10 +107,10 @@ set(_aoti_cuda_shim_sources
   runtime/shims/memory.cpp
   runtime/shims/cuda_guard.cpp
 )
-# Only build CUDA shims when CUDA language/toolchain is available.
+# Only build the int4mm and sort shims when CUDA language/toolchain is available.
 if(CMAKE_CUDA_COMPILER)
   list(APPEND _aoti_cuda_shim_sources
     runtime/shims/int4mm.cu
-    runtime/shims/sort.cu runtime/shims/rand.cu
+    runtime/shims/sort.cu
   )
 endif()
@@ -152,8 +152,7 @@ endif()
 # retention.
 if(_cuda_is_msvc_toolchain)
   target_link_libraries(
-    aoti_cuda_shims PRIVATE cuda_platform CUDA::cudart CUDA::curand
-    ${CMAKE_DL_LIBS}
+    aoti_cuda_shims PRIVATE cuda_platform CUDA::cudart ${CMAKE_DL_LIBS}
   )
   # Link object library directly so symbols are pulled exactly once while
   # avoiding duplicate static/object inclusion and interface leakage.
@@ -163,7 +162,7 @@ else()
   target_link_libraries(
     aoti_cuda_shims
     PRIVATE cuda_platform
     PUBLIC -Wl,--whole-archive aoti_common_shims_slim -Wl,--no-whole-archive
-      CUDA::cudart CUDA::curand ${CMAKE_DL_LIBS}
+      CUDA::cudart ${CMAKE_DL_LIBS}
   )
 endif()
diff --git a/backends/cuda/cuda_backend.py b/backends/cuda/cuda_backend.py
index 5c6395c8b5b..061b0d6a29a 100644
--- a/backends/cuda/cuda_backend.py
+++ b/backends/cuda/cuda_backend.py
@@ -146,7 +146,6 @@ def get_supported_fallback_kernels(cls) -> Dict[str, Any]:
         return {
             "at::_ops::_weight_int4pack_mm::call": None,
             "at::_ops::sort_stable::call": None,
-            "aoti_torch_cuda_randint_low_out": None,
         }
 
     @classmethod
diff --git a/backends/cuda/runtime/TARGETS b/backends/cuda/runtime/TARGETS
index f13f41ab8b7..726f89c8125 100644
--- a/backends/cuda/runtime/TARGETS
+++ b/backends/cuda/runtime/TARGETS
@@ -33,7 +33,6 @@ runtime.cxx_library(
         "shims/cuda_guard.cpp",
         "shims/int4mm.cu",
         "shims/memory.cpp",
-        "shims/rand.cu",
         "shims/sort.cu",
         "shims/tensor_attribute.cpp",
     ],
@@ -42,7 +41,6 @@ runtime.cxx_library(
         "shims/int4mm.cuh",
         "shims/int4mm.h",
         "shims/memory.h",
-        "shims/rand.h",
         "shims/sort.h",
         "shims/tensor_attribute.h",
         "utils.h",
diff --git a/backends/cuda/runtime/shims/rand.cu b/backends/cuda/runtime/shims/rand.cu
deleted file mode 100644
index 375cbc9255c..00000000000
--- a/backends/cuda/runtime/shims/rand.cu
+++ /dev/null
@@ -1,273 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#include
-
-#include
-#include
-#include
-#include
-#include
-
-#include
-#include
-
-#include
-#include
-#include
-#include
-
-namespace executorch::backends::cuda {
-
-namespace c10 = executorch::backends::aoti::slim::c10;
-using c10::Device;
-using c10::DeviceIndex;
-using c10::DeviceType;
-using c10::ScalarType;
-using executorch::backends::aoti::slim::empty_strided;
-using executorch::backends::aoti::slim::IntArrayRef;
-using executorch::backends::aoti::slim::makeArrayRef;
-
-namespace {
-
-// ---- GPU-resident RNG state ----
-// Seed and counter live in device memory allocated during the first call
-// (warmup phase, before CUDA graph capture). The counter is atomically
-// advanced by each kernel invocation on-device, so it automatically
-// produces different random sequences on every CUDA graph replay.
-
-struct RngState {
-  unsigned long long seed;
-  unsigned long long counter;
-  // Per-launch scratch — written by advance_counter_kernel and read by
-  // the main RNG kernels. Single-threaded host driver is assumed
-  // (typical inference / CUDA-graph replay use case).
-  unsigned long long base_scratch;
-};
-
-static RngState* d_rng = nullptr;
-// std::call_once guarantees one-shot initialization even when shims are
-// invoked from multiple host threads (e.g. concurrent models / streams).
-static std::once_flag g_rng_init_flag;
-
-// Initialize RNG state on the given stream.
-// Must be called during warmup (before graph capture). Subsequent calls
-// from any thread are no-ops thanks to std::call_once.
-void ensure_rng_init(cudaStream_t stream) {
-  std::call_once(g_rng_init_flag, [&]() {
-    cudaMallocAsync(&d_rng, sizeof(RngState), stream);
-    RngState h;
-    h.seed = static_cast<unsigned long long>(time(nullptr));
-    h.counter = 0;
-    h.base_scratch = 0;
-    cudaMemcpyAsync(
-        d_rng, &h, sizeof(RngState), cudaMemcpyHostToDevice, stream);
-    // Synchronize to ensure the copy completes before we return
-    // (the host-side RngState `h` is on the stack).
-    cudaStreamSynchronize(stream);
-  });
-}
-
-// Philox-based randint kernel. Reads its base offset from `rng->base_scratch`
-// (populated by `advance_counter_kernel` immediately before this launch).
-// This replaces the previous per-element atomicAdd contention with a single
-// atomic per kernel launch.
-__global__ void philox_randint_graph_kernel(
-    int64_t* __restrict__ out,
-    int64_t numel,
-    int64_t low,
-    int64_t range,
-    RngState* __restrict__ rng) {
-  int64_t idx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
-  if (idx < numel) {
-    curandStatePhilox4_32_10_t state;
-    curand_init(rng->seed, idx, rng->base_scratch, &state);
-    double val = curand_uniform_double(&state);
-    int64_t ival = static_cast<int64_t>(val * range);
-    out[idx] = low + (ival >= range ? range - 1 : ival);
-  }
-}
-
-// Philox-based uniform float32 generator (graph-safe version).
-__global__ void philox_rand_float_graph_kernel(
-    float* __restrict__ out,
-    int64_t numel,
-    RngState* __restrict__ rng) {
-  int64_t idx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
-  if (idx < numel) {
-    curandStatePhilox4_32_10_t state;
-    curand_init(rng->seed, idx, rng->base_scratch, &state);
-    out[idx] = curand_uniform(&state);
-  }
-}
-
-// Philox-based uniform bfloat16 generator (graph-safe version).
-__global__ void philox_rand_bf16_graph_kernel(
-    uint16_t* __restrict__ out,
-    int64_t numel,
-    RngState* __restrict__ rng) {
-  int64_t idx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
-  if (idx < numel) {
-    curandStatePhilox4_32_10_t state;
-    curand_init(rng->seed, idx, rng->base_scratch, &state);
-    float val = curand_uniform(&state);
-    uint32_t bits;
-    memcpy(&bits, &val, sizeof(uint32_t));
-    uint32_t lsb = (bits >> 16) & 1;
-    bits += 0x7FFFu + lsb;
-    out[idx] = static_cast<uint16_t>(bits >> 16);
-  }
-}
-
-// Single-thread helper that grabs a contiguous range of `numel` offsets
-// from the on-device counter and writes the base into `rng->base_scratch`.
-// Replaces `numel` per-element atomics with a single atomic per launch
-// while staying graph-capturable.
-__global__ void advance_counter_kernel(
-    RngState* __restrict__ rng,
-    unsigned long long numel) {
-  if (blockIdx.x == 0 && threadIdx.x == 0) {
-    rng->base_scratch = atomicAdd(&rng->counter, numel);
-  }
-}
-
-} // anonymous namespace
-
-extern "C" {
-
-AOTITorchError aoti_torch_cuda_rand(
-    const int64_t* size,
-    int64_t size_len_,
-    int32_t* dtype,
-    int32_t* layout,
-    int32_t* device,
-    int32_t device_index_,
-    int32_t* pin_memory,
-    SlimTensor** ret0) {
-  (void)layout;
-  (void)device;
-  (void)pin_memory;
-
-  ET_CHECK_OR_RETURN_ERROR(
-      ret0 != nullptr,
-      InvalidArgument,
-      "aoti_torch_cuda_rand: ret0 is null");
-
-  // Default to float32 if dtype not specified.
-  ScalarType scalar_type = ScalarType::Float;
-  if (dtype != nullptr) {
-    scalar_type = static_cast<ScalarType>(*dtype);
-  }
-
-  // Compute contiguous strides and total elements.
-  std::vector<int64_t> strides(size_len_);
-  int64_t numel = 1;
-  for (int64_t i = size_len_ - 1; i >= 0; i--) {
-    strides[i] = numel;
-    numel *= size[i];
-  }
-
-  // Allocate output tensor.
-  IntArrayRef sizes_ref(size, static_cast<size_t>(size_len_));
-  *ret0 = new SlimTensor(empty_strided(
-      sizes_ref,
-      makeArrayRef(strides),
-      scalar_type,
-      Device(DeviceType::CUDA, static_cast<DeviceIndex>(device_index_))));
-
-  if (numel == 0) {
-    return Error::Ok;
-  }
-
-  // Get the current CUDA stream.
-  auto stream_result = getCurrentCUDAStream(0);
-  ET_CHECK_OR_RETURN_ERROR(
-      stream_result.ok(),
-      Internal,
-      "aoti_torch_cuda_rand: failed to get CUDA stream");
-  cudaStream_t stream = stream_result.get();
-
-  ensure_rng_init(stream);
-
-  constexpr int kThreads = 256;
-  int blocks = static_cast<int>((numel + kThreads - 1) / kThreads);
-
-  // Single atomicAdd per launch — grabs `numel` consecutive counter slots
-  // for the kernel below, eliminating per-element contention on the GPU
-  // counter.
-  advance_counter_kernel<<<1, 1, 0, stream>>>(
-      d_rng, static_cast<unsigned long long>(numel));
-
-  if (scalar_type == ScalarType::Float) {
-    philox_rand_float_graph_kernel<<<blocks, kThreads, 0, stream>>>(
-        static_cast<float*>((*ret0)->data_ptr()), numel, d_rng);
-  } else if (scalar_type == ScalarType::BFloat16) {
-    philox_rand_bf16_graph_kernel<<<blocks, kThreads, 0, stream>>>(
-        static_cast<uint16_t*>((*ret0)->data_ptr()), numel, d_rng);
-  } else {
-    ET_LOG(
-        Error,
-        "aoti_torch_cuda_rand: unsupported dtype %d",
-        static_cast<int>(scalar_type));
-    return Error::NotSupported;
-  }
-
-  return Error::Ok;
-}
-
-AOTITorchError aoti_torch_cuda_randint_low_out(
-    SlimTensor* out,
-    int64_t low,
-    int64_t high,
-    const int64_t* size,
-    int64_t size_len_) {
-  ET_CHECK_OR_RETURN_ERROR(
-      out != nullptr,
-      InvalidArgument,
-      "aoti_torch_cuda_randint_low_out: out tensor is null");
-
-  ET_CHECK_OR_RETURN_ERROR(
-      high > low,
-      InvalidArgument,
-      "aoti_torch_cuda_randint_low_out: requires high > low");
-
-  int64_t numel = 1;
-  for (int64_t i = 0; i < size_len_; i++) {
-    numel *= size[i];
-  }
-  if (numel == 0) {
-    return Error::Ok;
-  }
-
-  // Get the current CUDA stream.
-  auto stream_result = getCurrentCUDAStream(0);
-  ET_CHECK_OR_RETURN_ERROR(
-      stream_result.ok(),
-      Internal,
-      "aoti_torch_cuda_randint_low_out: failed to get CUDA stream");
-  cudaStream_t stream = stream_result.get();
-
-  ensure_rng_init(stream);
-
-  int64_t range = high - low;
-  int64_t* out_data = static_cast<int64_t*>(out->data_ptr());
-
-  constexpr int kThreads = 256;
-  int blocks = static_cast<int>((numel + kThreads - 1) / kThreads);
-  // One atomicAdd per launch; subsequent kernel reads `rng->base_scratch`.
-  advance_counter_kernel<<<1, 1, 0, stream>>>(
-      d_rng, static_cast<unsigned long long>(numel));
-  philox_randint_graph_kernel<<<blocks, kThreads, 0, stream>>>(
-      out_data, numel, low, range, d_rng);
-
-  return Error::Ok;
-}
-
-} // extern "C"
-
-} // namespace executorch::backends::cuda
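
The bit manipulation in the deleted philox_rand_bf16_graph_kernel is the standard float32-to-bfloat16 round-to-nearest-even conversion. A minimal host-side Python sketch of the same arithmetic (function name hypothetical, not part of the tree):

    import struct

    def float32_to_bf16_bits(val: float) -> int:
        # Same arithmetic as the deleted kernel: add 0x7FFF plus the LSB of
        # the surviving mantissa, then keep the top 16 bits (round to nearest
        # even). The mask emulates 32-bit wraparound.
        bits = struct.unpack("<I", struct.pack("<f", val))[0]
        lsb = (bits >> 16) & 1
        bits = (bits + 0x7FFF + lsb) & 0xFFFFFFFF
        return bits >> 16

    assert float32_to_bf16_bits(1.0) == 0x3F80  # 1.0 encodes as 0x3F80 in bfloat16
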
diff --git a/backends/cuda/runtime/shims/rand.h b/backends/cuda/runtime/shims/rand.h
deleted file mode 100644
index e0c63be75a1..00000000000
--- a/backends/cuda/runtime/shims/rand.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#pragma once
-
-#include
-
-#include
-#include
-#include
-#include
-
-namespace executorch::backends::cuda {
-
-using executorch::runtime::Error;
-using AOTITorchError = Error;
-
-using SlimTensor = executorch::backends::aoti::slim::SlimTensor;
-
-extern "C" {
-
-/**
- * Generates a tensor filled with uniform random values in [0, 1).
- *
- * Implements the AOTI shim for aten::rand.default on CUDA. Uses cuRAND
- * Philox counter-based RNG with GPU-resident state. The counter is
- * atomically advanced by each kernel invocation on-device, making it
- * fully compatible with CUDA graph capture and replay — each replay
- * produces different values because the counter increments on the GPU.
- *
- * Supports float32 and bfloat16 output dtypes.
- */
-AOTI_SHIM_EXPORT AOTITorchError aoti_torch_cuda_rand(
-    const int64_t* size,
-    int64_t size_len_,
-    int32_t* dtype,
-    int32_t* layout,
-    int32_t* device,
-    int32_t device_index_,
-    int32_t* pin_memory,
-    SlimTensor** ret0);
-
-/**
- * Fills a pre-allocated int64 tensor with random integers in [low, high).
- *
- * Implements the AOTI shim for aten::randint.low_out on CUDA. Used by
- * Inductor's Philox RNG to generate random seeds. A single on-device
- * atomic per launch advances a GPU-resident counter to hand out unique
- * offsets, making this fully compatible with CUDA graph capture and
- * replay.
- */
-AOTI_SHIM_EXPORT AOTITorchError aoti_torch_cuda_randint_low_out(
-    SlimTensor* out,
-    int64_t low,
-    int64_t high,
-    const int64_t* size,
-    int64_t size_len_);
-
-} // extern "C"
-
-} // namespace executorch::backends::cuda
diff --git a/backends/cuda/runtime/shims/tests/CMakeLists.txt b/backends/cuda/runtime/shims/tests/CMakeLists.txt
index aec5219d680..c80db9f523d 100644
--- a/backends/cuda/runtime/shims/tests/CMakeLists.txt
+++ b/backends/cuda/runtime/shims/tests/CMakeLists.txt
@@ -42,7 +42,6 @@ set(CUDA_SHIM_TESTS
   test_aoti_torch_delete_tensor_object
   test_aoti_torch__reinterpret_tensor
   test_aoti_torch_copy_
-  test_aoti_torch_cuda_rand
   test_aoti_torch_new_tensor_handle
   test_aoti_torch_item_bool
   test_aoti_torch_assign_tensors_out
diff --git a/backends/cuda/runtime/shims/tests/targets.bzl b/backends/cuda/runtime/shims/tests/targets.bzl
index b68043f7feb..cf8bc5c93e5 100644
--- a/backends/cuda/runtime/shims/tests/targets.bzl
+++ b/backends/cuda/runtime/shims/tests/targets.bzl
@@ -38,7 +38,6 @@ def define_common_targets():
    cuda_shim_cpp_unittest("aoti_torch_copy_")
    cuda_shim_cpp_unittest("aoti_torch_cuda_guard")
    cuda_shim_cpp_unittest("aoti_torch_cuda__weight_int4pack_mm")
-    cuda_shim_cpp_unittest("aoti_torch_cuda_rand")
    cuda_shim_cpp_unittest("aoti_torch_new_tensor_handle")
    cuda_shim_cpp_unittest("aoti_torch_item_bool")
    cuda_shim_cpp_unittest("aoti_torch_assign_tensors_out")
diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_cuda_rand.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_cuda_rand.cpp
deleted file mode 100644
index b776bb39497..00000000000
--- a/backends/cuda/runtime/shims/tests/test_aoti_torch_cuda_rand.cpp
+++ /dev/null
@@ -1,382 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include
-#include
-#include
-#include
-
-// Use explicit types to avoid ambiguity between different Tensor typedefs
-using executorch::backends::cuda::aoti_torch_cuda_rand;
-using executorch::backends::cuda::aoti_torch_cuda_randint_low_out;
-using executorch::backends::cuda::aoti_torch_empty_strided;
-using executorch::backends::cuda::AOTITorchError;
-using executorch::runtime::Error;
-namespace slim_c10 = executorch::backends::aoti::slim::c10;
-
-// Tensor type definition using SlimTensor
-using Tensor = executorch::backends::aoti::slim::SlimTensor;
-
-namespace {
-
-// Helper: convert raw bfloat16 bits (uint16_t) to float for value checks.
-float bfloat16_bits_to_float(uint16_t bits) {
-  uint32_t expanded = static_cast<uint32_t>(bits) << 16;
-  float result;
-  std::memcpy(&result, &expanded, sizeof(float));
-  return result;
-}
-
-} // namespace
-
-// Test fixture for aoti_torch_cuda_rand tests
-class AOTITorchCudaRandTest : public ::testing::Test {
- protected:
-  void SetUp() override {
-    // Initialize ExecuTorch Platform Abstraction Layer
-    et_pal_init();
-
-    // Check if CUDA is available
-    int device_count = 0;
-    cudaError_t err = cudaGetDeviceCount(&device_count);
-    if (err != cudaSuccess || device_count == 0) {
-      GTEST_SKIP() << "CUDA not available, skipping CUDA tests";
-    }
-  }
-
-  // Helper: create a pre-allocated Int64 (Long) CUDA tensor used as the
-  // `out` argument for randint_low_out.
-  Tensor* create_int64_tensor(const std::vector<int64_t>& sizes) {
-    Tensor* tensor = nullptr;
-    AOTITorchError error = aoti_torch_empty_strided(
-        sizes.size(),
-        sizes.data(),
-        nullptr, // default contiguous strides
-        static_cast<int32_t>(slim_c10::ScalarType::Long),
-        static_cast<int32_t>(slim_c10::DeviceType::CUDA),
-        0, // device index
-        &tensor);
-    return (error == Error::Ok) ? tensor : nullptr;
-  }
-
-  // Helper: copy a CUDA tensor's raw bytes back to host.
-  template <typename T>
-  std::vector<T> copy_to_host(const Tensor* tensor, int64_t numel) {
-    std::vector<T> host(static_cast<size_t>(numel));
-    cudaError_t err = cudaMemcpy(
-        host.data(),
-        tensor->data_ptr(),
-        static_cast<size_t>(numel) * sizeof(T),
-        cudaMemcpyDeviceToHost);
-    EXPECT_EQ(err, cudaSuccess) << "cudaMemcpy D2H failed";
-    return host;
-  }
-};
-
-// ----------------------------------------------------------------------------
-// aoti_torch_cuda_rand tests
-// ----------------------------------------------------------------------------
-
-// Basic float32 rand: produces a tensor in [0, 1).
-TEST_F(AOTITorchCudaRandTest, RandFloat32Basic) {
-  std::vector<int64_t> sizes = {4, 8};
-  int64_t numel = 4 * 8;
-  int32_t dtype = static_cast<int32_t>(slim_c10::ScalarType::Float);
-
-  Tensor* out = nullptr;
-  AOTITorchError error = aoti_torch_cuda_rand(
-      sizes.data(),
-      static_cast<int64_t>(sizes.size()),
-      &dtype,
-      /*layout=*/nullptr,
-      /*device=*/nullptr,
-      /*device_index_=*/0,
-      /*pin_memory=*/nullptr,
-      &out);
-
-  ASSERT_EQ(error, Error::Ok) << "aoti_torch_cuda_rand should succeed";
-  ASSERT_NE(out, nullptr);
-  EXPECT_EQ(out->dim(), 2);
-  EXPECT_EQ(out->size(0), 4);
-  EXPECT_EQ(out->size(1), 8);
-  EXPECT_EQ(out->numel(), numel);
-  ASSERT_NE(out->data_ptr(), nullptr);
-
-  cudaDeviceSynchronize();
-  auto host = copy_to_host<float>(out, numel);
-  for (int64_t i = 0; i < numel; ++i) {
-    EXPECT_GE(host[i], 0.0f) << "value at " << i << " < 0";
-    EXPECT_LT(host[i], 1.0f) << "value at " << i << " >= 1";
-  }
-}
-
-// Default dtype path: when dtype is null the shim defaults to float32.
-TEST_F(AOTITorchCudaRandTest, RandDefaultDtypeIsFloat) {
-  std::vector<int64_t> sizes = {16};
-  Tensor* out = nullptr;
-  AOTITorchError error = aoti_torch_cuda_rand(
-      sizes.data(),
-      static_cast<int64_t>(sizes.size()),
-      /*dtype=*/nullptr,
-      /*layout=*/nullptr,
-      /*device=*/nullptr,
-      /*device_index_=*/0,
-      /*pin_memory=*/nullptr,
-      &out);
-
-  ASSERT_EQ(error, Error::Ok);
-  ASSERT_NE(out, nullptr);
-  EXPECT_EQ(out->dtype(), slim_c10::ScalarType::Float);
-  EXPECT_EQ(out->numel(), 16);
-}
-
-// BFloat16 rand: values must lie in [0, 1).
-TEST_F(AOTITorchCudaRandTest, RandBFloat16Basic) {
-  std::vector<int64_t> sizes = {32};
-  int64_t numel = 32;
-  int32_t dtype = static_cast<int32_t>(slim_c10::ScalarType::BFloat16);
-
-  Tensor* out = nullptr;
-  AOTITorchError error = aoti_torch_cuda_rand(
-      sizes.data(),
-      static_cast<int64_t>(sizes.size()),
-      &dtype,
-      /*layout=*/nullptr,
-      /*device=*/nullptr,
-      /*device_index_=*/0,
-      /*pin_memory=*/nullptr,
-      &out);
-
-  ASSERT_EQ(error, Error::Ok);
-  ASSERT_NE(out, nullptr);
-  EXPECT_EQ(out->dtype(), slim_c10::ScalarType::BFloat16);
-  EXPECT_EQ(out->numel(), numel);
-
-  cudaDeviceSynchronize();
-  auto host = copy_to_host<uint16_t>(out, numel);
-  for (int64_t i = 0; i < numel; ++i) {
-    float v = bfloat16_bits_to_float(host[i]);
-    EXPECT_GE(v, 0.0f) << "bf16 value at " << i << " < 0";
-    EXPECT_LT(v, 1.0f) << "bf16 value at " << i << " >= 1";
-  }
-}
-
-// Empty tensor: numel == 0 should be a no-op success.
-TEST_F(AOTITorchCudaRandTest, RandEmptyTensor) {
-  std::vector<int64_t> sizes = {0, 4};
-  int32_t dtype = static_cast<int32_t>(slim_c10::ScalarType::Float);
-
-  Tensor* out = nullptr;
-  AOTITorchError error = aoti_torch_cuda_rand(
-      sizes.data(),
-      static_cast<int64_t>(sizes.size()),
-      &dtype,
-      /*layout=*/nullptr,
-      /*device=*/nullptr,
-      /*device_index_=*/0,
-      /*pin_memory=*/nullptr,
-      &out);
-
-  ASSERT_EQ(error, Error::Ok);
-  ASSERT_NE(out, nullptr);
-  EXPECT_EQ(out->numel(), 0);
-}
-
-// Unsupported dtype should return NotSupported.
-TEST_F(AOTITorchCudaRandTest, RandUnsupportedDtypeFails) {
-  std::vector<int64_t> sizes = {8};
-  // Long is not supported by aoti_torch_cuda_rand.
-  int32_t dtype = static_cast<int32_t>(slim_c10::ScalarType::Long);
-
-  Tensor* out = nullptr;
-  AOTITorchError error = aoti_torch_cuda_rand(
-      sizes.data(),
-      static_cast<int64_t>(sizes.size()),
-      &dtype,
-      /*layout=*/nullptr,
-      /*device=*/nullptr,
-      /*device_index_=*/0,
-      /*pin_memory=*/nullptr,
-      &out);
-
-  EXPECT_EQ(error, Error::NotSupported);
-}
-
-// Null ret0 should fail with InvalidArgument.
-TEST_F(AOTITorchCudaRandTest, RandNullRet0Fails) {
-  std::vector<int64_t> sizes = {4};
-  int32_t dtype = static_cast<int32_t>(slim_c10::ScalarType::Float);
-
-  AOTITorchError error = aoti_torch_cuda_rand(
-      sizes.data(),
-      static_cast<int64_t>(sizes.size()),
-      &dtype,
-      /*layout=*/nullptr,
-      /*device=*/nullptr,
-      /*device_index_=*/0,
-      /*pin_memory=*/nullptr,
-      /*ret0=*/nullptr);
-
-  EXPECT_EQ(error, Error::InvalidArgument);
-}
-
-// Two invocations should advance the GPU-resident counter and produce
-// different sequences (extremely high probability for non-trivial sizes).
-TEST_F(AOTITorchCudaRandTest, RandTwoCallsProduceDifferentValues) {
-  std::vector<int64_t> sizes = {64};
-  int64_t numel = 64;
-  int32_t dtype = static_cast<int32_t>(slim_c10::ScalarType::Float);
-
-  Tensor* out_a = nullptr;
-  Tensor* out_b = nullptr;
-
-  ASSERT_EQ(
-      aoti_torch_cuda_rand(
-          sizes.data(),
-          static_cast<int64_t>(sizes.size()),
-          &dtype,
-          nullptr,
-          nullptr,
-          0,
-          nullptr,
-          &out_a),
-      Error::Ok);
-  ASSERT_EQ(
-      aoti_torch_cuda_rand(
-          sizes.data(),
-          static_cast<int64_t>(sizes.size()),
-          &dtype,
-          nullptr,
-          nullptr,
-          0,
-          nullptr,
-          &out_b),
-      Error::Ok);
-
-  cudaDeviceSynchronize();
-  auto host_a = copy_to_host<float>(out_a, numel);
-  auto host_b = copy_to_host<float>(out_b, numel);
-
-  // The two draws must not be bit-for-bit identical.
-  bool any_diff = false;
-  for (int64_t i = 0; i < numel; ++i) {
-    if (host_a[i] != host_b[i]) {
-      any_diff = true;
-      break;
-    }
-  }
-  EXPECT_TRUE(any_diff)
-      << "two consecutive aoti_torch_cuda_rand calls produced identical values";
-}
-
-// ----------------------------------------------------------------------------
-// aoti_torch_cuda_randint_low_out tests
-// ----------------------------------------------------------------------------
-
-// Basic randint into a pre-allocated int64 tensor; values lie in [low, high).
-TEST_F(AOTITorchCudaRandTest, RandintBasicRange) {
-  std::vector<int64_t> sizes = {32};
-  int64_t numel = 32;
-  int64_t low = -5;
-  int64_t high = 17;
-
-  Tensor* out = create_int64_tensor(sizes);
-  ASSERT_NE(out, nullptr);
-
-  AOTITorchError error = aoti_torch_cuda_randint_low_out(
-      out, low, high, sizes.data(), static_cast<int64_t>(sizes.size()));
-  ASSERT_EQ(error, Error::Ok);
-
-  cudaDeviceSynchronize();
-  auto host = copy_to_host<int64_t>(out, numel);
-  for (int64_t i = 0; i < numel; ++i) {
-    EXPECT_GE(host[i], low);
-    EXPECT_LT(host[i], high);
-  }
-}
-
-// Empty out tensor: numel == 0 should be a no-op success.
-TEST_F(AOTITorchCudaRandTest, RandintEmptyTensor) {
-  std::vector<int64_t> sizes = {0};
-  Tensor* out = create_int64_tensor(sizes);
-  ASSERT_NE(out, nullptr);
-
-  AOTITorchError error = aoti_torch_cuda_randint_low_out(
-      out, /*low=*/0, /*high=*/10, sizes.data(), 1);
-  EXPECT_EQ(error, Error::Ok);
-}
-
-// Null `out` tensor must return InvalidArgument.
-TEST_F(AOTITorchCudaRandTest, RandintNullOutFails) {
-  std::vector<int64_t> sizes = {4};
-  AOTITorchError error = aoti_torch_cuda_randint_low_out(
-      /*out=*/nullptr, 0, 10, sizes.data(), 1);
-  EXPECT_EQ(error, Error::InvalidArgument);
-}
-
-// `high <= low` must return InvalidArgument.
-TEST_F(AOTITorchCudaRandTest, RandintInvalidRangeFails) {
-  std::vector<int64_t> sizes = {4};
-  Tensor* out = create_int64_tensor(sizes);
-  ASSERT_NE(out, nullptr);
-
-  // high == low
-  EXPECT_EQ(
-      aoti_torch_cuda_randint_low_out(
-          out, /*low=*/3, /*high=*/3, sizes.data(), 1),
-      Error::InvalidArgument);
-  // high < low
-  EXPECT_EQ(
-      aoti_torch_cuda_randint_low_out(
-          out, /*low=*/5, /*high=*/2, sizes.data(), 1),
-      Error::InvalidArgument);
-}
-
-// Calling randint twice should produce different sequences via the on-device
-// counter advance. With numel=1 (the typical Inductor seed-gen pattern) we
-// run a few iterations to make collision extremely unlikely.
-TEST_F(AOTITorchCudaRandTest, RandintAdvancesCounter) {
-  std::vector<int64_t> sizes = {1};
-  Tensor* out = create_int64_tensor(sizes);
-  ASSERT_NE(out, nullptr);
-
-  constexpr int kIters = 8;
-  std::vector<int64_t> draws;
-  draws.reserve(kIters);
-  for (int i = 0; i < kIters; ++i) {
-    AOTITorchError error = aoti_torch_cuda_randint_low_out(
-        out,
-        /*low=*/0,
-        /*high=*/std::numeric_limits<int64_t>::max(),
-        sizes.data(),
-        1);
-    ASSERT_EQ(error, Error::Ok);
-    cudaDeviceSynchronize();
-    auto host = copy_to_host<int64_t>(out, 1);
-    draws.push_back(host[0]);
-  }
-
-  // Not all draws should be equal.
-  bool any_diff = false;
-  for (int i = 1; i < kIters; ++i) {
-    if (draws[i] != draws[0]) {
-      any_diff = true;
-      break;
-    }
-  }
-  EXPECT_TRUE(any_diff) << "randint counter did not advance across calls";
-}
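
The deleted test helper bfloat16_bits_to_float is a plain bit reinterpretation; the same check can be written in a few lines of PyTorch, using a 2-byte integer tensor as the carrier for the bitcast (a sketch, assuming Tensor.view(dtype) same-width reinterpretation, which torch supports for equal item sizes):

    import torch

    # 0x3F80 (16256) is the bfloat16 encoding of 1.0; int16 carries the raw bits.
    bits = torch.tensor([0x3F80], dtype=torch.int16)
    val = bits.view(torch.bfloat16).float()
    assert val.item() == 1.0
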
diff --git a/examples/models/qwen3_5_moe/export.py b/examples/models/qwen3_5_moe/export.py
index ac6c112c08c..b1260a46dc8 100644
--- a/examples/models/qwen3_5_moe/export.py
+++ b/examples/models/qwen3_5_moe/export.py
@@ -768,11 +768,10 @@ def _export_cuda(model, config, args):
     print("Exporting decode method...")
     decode_tokens = torch.tensor([[0]], dtype=torch.long)
     decode_pos = torch.tensor([0], dtype=torch.long)
-    decode_temperature = torch.tensor([1.0], dtype=torch.float32)
     with torch.no_grad():
         decode_ep = export(
             model,
-            (decode_tokens, decode_pos, decode_temperature),
+            (decode_tokens, decode_pos),
             strict=True,
         )
     print("Decode export successful!")
@@ -784,21 +783,18 @@ def _export_cuda(model, config, args):
     # that reject longer prompts at runtime.
     _set_batched_moe(model, True)
     print("Exporting prefill method...")
-
     example_prefill_len = config.max_seq_len - 1
     prefill_tokens = torch.zeros((1, example_prefill_len), dtype=torch.long)
     prefill_pos = torch.arange(example_prefill_len, dtype=torch.long)
-    prefill_temperature = torch.tensor([1.0], dtype=torch.float32)
 
     seq_dim = Dim("seq_len", min=2, max=config.max_seq_len - 1)
     prefill_dynamic_shapes = (
         {1: seq_dim},  # tokens
         {0: seq_dim},  # input_pos
-        None,  # temperature (static scalar tensor)
     )
     with torch.no_grad():
         prefill_ep = export(
             model,
-            (prefill_tokens, prefill_pos, prefill_temperature),
+            (prefill_tokens, prefill_pos),
             dynamic_shapes=prefill_dynamic_shapes,
             strict=True,
         )
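
The shape of the new two-argument export contract, sketched on a hypothetical toy module (the Dim bounds mirror the prefill export above; Toy is a stand-in, not the MoE model):

    import torch
    from torch.export import Dim, export

    class Toy(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.emb = torch.nn.Embedding(32, 8)

        def forward(self, tokens, input_pos):
            # Stand-in for the decoder stack: returns [B, T, 8] "logits".
            return self.emb(tokens) + input_pos.float().view(1, -1, 1)

    seq_dim = Dim("seq_len", min=2, max=31)
    ep = export(
        Toy(),
        (torch.zeros((1, 8), dtype=torch.long), torch.arange(8)),
        # tokens seq dim, input_pos dim — and no third (temperature) entry.
        dynamic_shapes=({1: seq_dim}, {0: seq_dim}),
        strict=True,
    )
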
""" if eos_token_ids is None: eos_token_ids = set() input_ids = tokenizer.encode(prompt).ids - # Temperature tensor (use small epsilon for greedy to avoid div-by-zero) - temp_val = max(temperature, 1e-6) - temp_tensor = torch.tensor([temp_val], dtype=torch.float32, device="cuda") - # Prefill: one token at a time with torch.no_grad(): for i, tok_id in enumerate(input_ids): tok = torch.tensor([[tok_id]], dtype=torch.long, device="cuda") pos = torch.tensor([i], dtype=torch.long, device="cuda") - sampled = model(tok, pos, temp_tensor) + logits = model(tok, pos) - # First generated token (model returns [B, 1] float token ID) - next_token_id = int(sampled.item()) - generated = [next_token_id] + # Sample first generated token + next_token = _sample(logits[:, -1, :], temperature) + generated = [next_token.item()] # Decode: one token at a time seq_len = len(input_ids) with torch.no_grad(): for i in range(max_new_tokens - 1): - tok = torch.tensor([[next_token_id]], dtype=torch.long, device="cuda") - pos = torch.tensor([seq_len + i], dtype=torch.long, device="cuda") - sampled = model(tok, pos, temp_tensor) - next_token_id = int(sampled.item()) - generated.append(next_token_id) - if next_token_id in eos_token_ids: + pos = torch.tensor([seq_len + i], device="cuda") + logits = model(next_token.unsqueeze(0), pos) + next_token = _sample(logits[:, -1, :], temperature) + tok_id = next_token.item() + generated.append(tok_id) + if tok_id in eos_token_ids: break return tokenizer.decode(generated) +def _sample(logits, temperature): + """Sample from logits with temperature.""" + if temperature <= 0: + return logits.argmax(dim=-1) + probs = torch.softmax(logits / temperature, dim=-1) + return torch.multinomial(probs, num_samples=1).squeeze(-1) + + def main(): parser = argparse.ArgumentParser( description="Run inference on prequantized Qwen3.5 MoE" diff --git a/examples/models/qwen3_5_moe/main.cpp b/examples/models/qwen3_5_moe/main.cpp index c5024890645..66b62f57c0c 100644 --- a/examples/models/qwen3_5_moe/main.cpp +++ b/examples/models/qwen3_5_moe/main.cpp @@ -10,7 +10,7 @@ #include #include -#include +#include #include #include #include @@ -48,33 +48,6 @@ using ::executorch::runtime::EValue; using SizesType = executorch::aten::SizesType; -// Read a sampled token from the model output tensor [B, 1]. -// The model performs Gumbel-max sampling on-device and returns a single -// float token ID. This function copies it from GPU and casts to uint64. -static uint64_t read_token(const executorch::aten::Tensor& output) { - const void* ptr = output.const_data_ptr(); - - cudaPointerAttributes attrs; - bool on_device = cudaPointerGetAttributes(&attrs, ptr) == cudaSuccess && - attrs.type == cudaMemoryTypeDevice; - - float val; - if (on_device) { - cudaError_t err = - cudaMemcpy(&val, ptr, sizeof(float), cudaMemcpyDeviceToHost); - if (err != cudaSuccess) { - ET_LOG( - Error, - "read_token: cudaMemcpy D2H failed: %s", - cudaGetErrorString(err)); - return 0; - } - } else { - memcpy(&val, ptr, sizeof(float)); - } - return static_cast(val); -} - int main(int argc, char** argv) { gflags::ParseCommandLineFlags(&argc, &argv, true); @@ -110,7 +83,7 @@ int main(int argc, char** argv) { return 1; } - // Create Module with share_memory_arenas=true so prefill and decode + // Create Module with share_memory_arenas=true so prefill and forward // share mutable buffers (KV cache, conv_state, recurrent_state). 
diff --git a/examples/models/qwen3_5_moe/main.cpp b/examples/models/qwen3_5_moe/main.cpp
index c5024890645..66b62f57c0c 100644
--- a/examples/models/qwen3_5_moe/main.cpp
+++ b/examples/models/qwen3_5_moe/main.cpp
@@ -10,7 +10,7 @@
 #include
 #include
-#include
+#include
 #include
 #include
 #include
@@ -48,33 +48,6 @@ using ::executorch::runtime::EValue;
 
 using SizesType = executorch::aten::SizesType;
 
-// Read a sampled token from the model output tensor [B, 1].
-// The model performs Gumbel-max sampling on-device and returns a single
-// float token ID. This function copies it from GPU and casts to uint64.
-static uint64_t read_token(const executorch::aten::Tensor& output) {
-  const void* ptr = output.const_data_ptr();
-
-  cudaPointerAttributes attrs;
-  bool on_device = cudaPointerGetAttributes(&attrs, ptr) == cudaSuccess &&
-      attrs.type == cudaMemoryTypeDevice;
-
-  float val;
-  if (on_device) {
-    cudaError_t err =
-        cudaMemcpy(&val, ptr, sizeof(float), cudaMemcpyDeviceToHost);
-    if (err != cudaSuccess) {
-      ET_LOG(
-          Error,
-          "read_token: cudaMemcpy D2H failed: %s",
-          cudaGetErrorString(err));
-      return 0;
-    }
-  } else {
-    memcpy(&val, ptr, sizeof(float));
-  }
-  return static_cast<uint64_t>(val);
-}
-
 int main(int argc, char** argv) {
   gflags::ParseCommandLineFlags(&argc, &argv, true);
@@ -110,7 +83,7 @@ int main(int argc, char** argv) {
     return 1;
   }
 
-  // Create Module with share_memory_arenas=true so prefill and decode
+  // Create Module with share_memory_arenas=true so prefill and forward
   // share mutable buffers (KV cache, conv_state, recurrent_state).
   std::vector<std::string> data_files;
  if (!FLAGS_data_path.empty()) {
@@ -220,20 +193,10 @@ int main(int argc, char** argv) {
   stats.inference_start_ms = llm::time_in_ms();
 
   // ---------------------------------------------------------------
-  // Sampling tensors (shared between prefill and decode)
+  // Prefill or decode-only
   // ---------------------------------------------------------------
   auto S = [](int64_t v) -> SizesType { return static_cast<SizesType>(v); };
 
-  // Use a very small temperature for greedy to avoid division by zero
-  // while keeping the Gumbel noise negligible relative to logit differences.
-  float temp_val =
-      FLAGS_temperature <= 0.0 ? 1e-6f : static_cast<float>(FLAGS_temperature);
-  auto temp_tensor =
-      from_blob(&temp_val, {1}, executorch::aten::ScalarType::Float);
-
-  // ---------------------------------------------------------------
-  // Prefill
-  // ---------------------------------------------------------------
   uint64_t cur_token = 0;
 
   // Use prefill method for T>=2, decode method for T=1
@@ -260,7 +223,6 @@ int main(int argc, char** argv) {
     std::vector<EValue> prefill_inputs;
     prefill_inputs.push_back(tokens_tensor);
     prefill_inputs.push_back(pos_tensor);
-    prefill_inputs.push_back(temp_tensor);
 
     auto prefill_result = module->execute(run_method, prefill_inputs);
     if (prefill_result.error() != Error::Ok) {
@@ -269,7 +231,10 @@ int main(int argc, char** argv) {
     }
     auto& prefill_outputs = prefill_result.get();
 
-    cur_token = read_token(prefill_outputs[0].toTensor());
+    auto logits_tensor = prefill_outputs[0].toTensor();
+    auto logits_ptr =
+        std::make_shared<executorch::aten::Tensor>(std::move(logits_tensor));
+    cur_token = llm::logits_to_token(*logits_ptr, FLAGS_temperature);
 
     stats.prompt_eval_end_ms = llm::time_in_ms();
 
@@ -308,7 +273,6 @@ int main(int argc, char** argv) {
     std::vector<EValue> decode_inputs;
     decode_inputs.push_back(EValue(decode_tokens));
     decode_inputs.push_back(EValue(decode_pos));
-    decode_inputs.push_back(EValue(temp_tensor));
 
     auto decode_result = module->execute("decode", decode_inputs);
     if (decode_result.error() != Error::Ok) {
@@ -317,8 +281,14 @@ int main(int argc, char** argv) {
     }
     auto& decode_outputs = decode_result.get();
 
+    auto step_logits = decode_outputs[0].toTensor();
+    auto step_logits_ptr =
+        std::make_shared<executorch::aten::Tensor>(std::move(step_logits));
+
     prev_token = cur_token;
-    cur_token = read_token(decode_outputs[0].toTensor());
+    stats.on_sampling_begin();
+    cur_token = llm::logits_to_token(*step_logits_ptr, FLAGS_temperature);
+    stats.on_sampling_end();
 
     if (step == 0) {
       stats.first_token_ms = llm::time_in_ms();
diff --git a/examples/models/qwen3_5_moe/model.py b/examples/models/qwen3_5_moe/model.py
index 81c093f5652..021e068ea28 100644
--- a/examples/models/qwen3_5_moe/model.py
+++ b/examples/models/qwen3_5_moe/model.py
@@ -17,12 +17,10 @@
 import os
 import re
 from dataclasses import dataclass, field
-from typing import Optional
 
 import torch
 import torch.nn as nn
-from executorch.examples.models.qwen3_5_moe.sampler import sample
 from torch.nn import functional as F
 
 
@@ -594,10 +592,6 @@ def forward(self, x, input_pos):
         return x
 
 
-# ---------------------------------------------------------------------------
-# Top-level model
-
-
 class Qwen35MoE(nn.Module):
 
     def __init__(self, config):
@@ -611,30 +605,13 @@ def __init__(self, config):
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
 
     def forward(
-        self,
-        tokens: torch.LongTensor,
-        input_pos: torch.LongTensor,
-        temperature: Optional[torch.Tensor] = None,
+        self, tokens: torch.LongTensor, input_pos: torch.LongTensor
     ) -> torch.Tensor:
         x = self.embed_tokens(tokens)
         for layer in self.layers:
             x = layer(x, input_pos)
         x = self.norm(x)
-        # When no sampling is requested, return the full ``[B, T, V]``
-        # logits so callers (eval, custom samplers) can inspect every
-        # position. Otherwise apply the prefill optimization and only
-        # materialize ``[B, V]`` for the last token.
-        if temperature is None:
-            return self.lm_head(x).float()  # [B, T, V] float32
-        logits = self.lm_head(x[:, -1, :]).float()  # [B, V] float32
-        # GPU-side Gumbel-max sampling: argmax(logits/T + gumbel_noise) is
-        # equivalent to drawing from softmax(logits/T) but stays entirely
-        # on-device. Algorithm reference:
-        # https://huggingface.co/blog/cxdu/fastsampling
-        # TODO(gasoonjia): once the on-device sampling stack lands, promote
-        # ``sample`` into a shared CUDA sampling utility reusable by other
-        # models, and add top-k / top-p filtering support.
-        return sample(logits, temperature)  # [B, 1]
+        return self.lm_head(x)
 
     @staticmethod
     def from_hf_checkpoint(model_dir, max_seq_len=4096):
diff --git a/examples/models/qwen3_5_moe/sampler.py b/examples/models/qwen3_5_moe/sampler.py
deleted file mode 100644
index dd1cec5bcd6..00000000000
--- a/examples/models/qwen3_5_moe/sampler.py
+++ /dev/null
@@ -1,60 +0,0 @@
-"""
-GPU-side Gumbel-max sampler.
-
-Self-contained sampling utility that can be imported by other models. Lives
-in its own file so it can be reused without pulling in the heavy MoE module.
-
-``temperature`` is a runtime tensor so a single exported program can be
-re-driven with different sampling configurations without re-export.
-"""
-
-from typing import Optional
-
-import torch
-
-
-def sample(
-    logits: torch.Tensor,
-    temperature: Optional[torch.Tensor] = None,
-) -> torch.Tensor:
-    """GPU-side Gumbel-max sampler.
-
-    When ``temperature`` is ``None`` (the eager / eval default) the function
-    is a no-op and returns ``logits`` unchanged — useful for callers that
-    just want to inspect raw logits.
-
-    Otherwise it draws from ``softmax(logits / temperature)`` entirely
-    on-device using the Gumbel-max trick:
-        ``argmax(logits / T + gumbel_noise)``
-    (reference: https://huggingface.co/blog/cxdu/fastsampling).
-
-    NOTE: the ``1e-20`` epsilons used in the Gumbel transform assume
-    float32 logits. The contract is documented as ``[B, V]`` float32 and
-    callers are expected to ``.float()``-cast before invoking ``sample``.
-
-    TODO(gasoonjia): add top-k / top-p filtering support in a follow-up PR.
-
-    Args:
-        logits: ``[B, V]`` float32 logits.
-        temperature: 0-D or 1-D float tensor (clamped to >= 1e-6 to avoid
-            divide-by-zero). ``None`` skips temperature scaling and the
-            sampler returns the unmodified ``logits`` tensor.
-
-    Returns:
-        ``[B, 1]`` float32 tensor of sampled token IDs, or the unmodified
-        ``logits`` tensor when ``temperature`` is ``None``.
-    """
-    # No sampling configured — return raw logits.
-    if temperature is None:
-        return logits
-
-    logits = logits / temperature.clamp(min=1e-6)
-
-    # Gumbel-max sampling — equivalent to sampling from softmax(logits)
-    # but fully on-device and CUDA-graph friendly. The 1e-20 epsilons are
-    # safe for float32 (smallest positive normal ~1.18e-38) — see the
-    # float32 note in the docstring.
-    noise = torch.rand_like(logits)
-    gumbel = -torch.log(-torch.log(noise + 1e-20) + 1e-20)
-    return (logits + gumbel).argmax(dim=-1, keepdim=True).float()
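
For the record, the identity the deleted sampler relied on — argmax(logits/T + Gumbel noise) is a draw from softmax(logits/T) — is easy to verify empirically. A minimal sketch (same transform as the deleted ``sample``, minus the [B, 1] reshape):

    import torch

    def gumbel_max_sample(logits, temperature):
        scaled = logits / temperature.clamp(min=1e-6)
        gumbel = -torch.log(-torch.log(torch.rand_like(scaled) + 1e-20) + 1e-20)
        return (scaled + gumbel).argmax(dim=-1)

    torch.manual_seed(0)
    logits = torch.tensor([2.0, 0.0, -1.0])
    temp = torch.tensor([1.0])
    draws = torch.stack([gumbel_max_sample(logits, temp) for _ in range(10000)])
    freq = torch.bincount(draws, minlength=3).float() / draws.numel()
    # freq converges to softmax(logits / temp), roughly [0.84, 0.11, 0.04].
    print(freq, torch.softmax(logits / temp, dim=-1))
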
diff --git a/examples/models/qwen3_5_moe/test_sampler.py b/examples/models/qwen3_5_moe/test_sampler.py
deleted file mode 100644
index 260c3029525..00000000000
--- a/examples/models/qwen3_5_moe/test_sampler.py
+++ /dev/null
@@ -1,142 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-"""
-Tests for the standalone ``sample`` function in
-``examples/models/qwen3_5_moe/sampler.py``.
-
-``temperature`` is a runtime scalar tensor so the same exported graph can
-be re-driven with different sampling configurations without re-export.
-
-NOTE: top-k / top-p tests are intentionally omitted — that support is
-deferred to a follow-up PR.
-
-Usage:
-    python -m pytest examples/models/qwen3_5_moe/test_sampler.py -v
-"""
-
-import unittest
-
-import torch
-
-from executorch.examples.models.qwen3_5_moe.sampler import sample
-
-
-def _temp(value: float = 1.0) -> torch.Tensor:
-    return torch.tensor([value], dtype=torch.float32)
-
-
-class TestSampler(unittest.TestCase):
-    def setUp(self):
-        torch.manual_seed(0)
-
-    # ------------------------------------------------------------------
-    # No-op path: when temperature is None the function returns the
-    # input logits unchanged.
-    # ------------------------------------------------------------------
-    def test_temperature_none_returns_logits(self):
-        logits = torch.randn(2, 8)
-        out = sample(logits)
-        self.assertIs(out, logits)
-        self.assertEqual(out.shape, (2, 8))
-
-    # ------------------------------------------------------------------
-    # Output shape / dtype contract when sampling is enabled.
-    # ------------------------------------------------------------------
-    def test_output_shape_and_dtype(self):
-        logits = torch.randn(3, 17)
-        out = sample(logits, temperature=_temp(1.0))
-        self.assertEqual(out.shape, (3, 1))
-        self.assertEqual(out.dtype, torch.float32)
-        self.assertTrue(torch.all(out >= 0))
-        self.assertTrue(torch.all(out < logits.size(-1)))
-
-    # ------------------------------------------------------------------
-    # Sampling with temperature matches the inline Gumbel-max sampler
-    # bit-for-bit.
-    # ------------------------------------------------------------------
-    def test_temperature_matches_legacy_gumbel(self):
-        logits = torch.randn(2, 32)
-        temperature = _temp(0.8)
-
-        torch.manual_seed(123)
-        scaled = logits / temperature.clamp(min=1e-6)
-        noise = torch.rand_like(scaled)
-        gumbel = -torch.log(-torch.log(noise + 1e-20) + 1e-20)
-        expected = (scaled + gumbel).argmax(dim=-1, keepdim=True).float()
-
-        torch.manual_seed(123)
-        actual = sample(logits, temperature=temperature)
-        self.assertTrue(torch.equal(actual, expected))
-
-    # ------------------------------------------------------------------
-    # Low temperature → near-greedy sampling.
-    # ------------------------------------------------------------------
-    def test_low_temperature_is_near_greedy(self):
-        torch.manual_seed(0)
-        logits = torch.randn(1, 32) * 2.0
-        argmax_id = int(logits.argmax(dim=-1).item())
-
-        agree = 0
-        trials = 100
-        for trial in range(trials):
-            torch.manual_seed(trial + 500)
-            tok = int(sample(logits, temperature=_temp(1e-4)).item())
-            if tok == argmax_id:
-                agree += 1
-        self.assertGreaterEqual(agree, int(trials * 0.9))
-
-    # ------------------------------------------------------------------
-    # Empirical distribution check: top-1 should be the most frequent.
-    # ------------------------------------------------------------------
-    def test_distribution_peaks_at_argmax(self):
-        torch.manual_seed(0)
-        logits = torch.tensor([[3.0, 1.0, 0.5, -1.0, 0.0]])
-        argmax_id = int(logits.argmax(dim=-1).item())
-
-        counts = torch.zeros(logits.size(-1), dtype=torch.long)
-        trials = 2000
-        for trial in range(trials):
-            torch.manual_seed(trial)
-            tok = int(sample(logits, temperature=_temp(1.0)).item())
-            counts[tok] += 1
-        self.assertEqual(int(counts.argmax().item()), argmax_id)
-
-    # ------------------------------------------------------------------
-    # Runtime control: changing temperature between calls produces
-    # different draws without re-creating the graph.
-    # ------------------------------------------------------------------
-    def test_temperature_runtime_controllable(self):
-        torch.manual_seed(0)
-        logits = torch.randn(1, 64) * 4.0
-        argmax_id = int(logits.argmax(dim=-1).item())
-
-        torch.manual_seed(7)
-        cold = int(sample(logits, temperature=_temp(1e-4)).item())
-        torch.manual_seed(7)
-        hot = int(sample(logits, temperature=_temp(5.0)).item())
-
-        # Cold sampling should hit argmax; hot sampling is unconstrained
-        # but is still a valid token id.
-        self.assertEqual(cold, argmax_id)
-        self.assertGreaterEqual(hot, 0)
-        self.assertLess(hot, logits.size(-1))
-
-    # ------------------------------------------------------------------
-    # CUDA smoke test.
-    # ------------------------------------------------------------------
-    def test_runs_on_cuda(self):
-        if not torch.cuda.is_available():
-            self.skipTest("CUDA not available")
-        logits = torch.randn(2, 64, device="cuda")
-        temperature = torch.tensor([0.9], dtype=torch.float32, device="cuda")
-        out = sample(logits, temperature=temperature)
-        self.assertEqual(out.device.type, "cuda")
-        self.assertEqual(out.shape, (2, 1))
-
-
-if __name__ == "__main__":
-    unittest.main()