Address comments
larryliu0820 committed Oct 10, 2025
commit f40b1fb64cc6d22869a2f592f3246203db740353
40 changes: 18 additions & 22 deletions backends/aoti/common_shims.cpp
@@ -56,18 +56,16 @@ AOTITorchError aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides) {
if (it == internal::tensor_to_strides.end()) {
needs_update = true;
} else {
// Check if cached values are still valid
// CRITICAL: Multimodal models reuse tensors with different shapes across
// executions (e.g., variable-length audio). We MUST validate cached
// metadata matches current tensor state, or CUDA kernels will receive
// incorrect shapes leading to memory corruption and segfaults.
auto tensor_strides = tensor->strides();
if (it->second.size() != static_cast<size_t>(tensor->dim())) {
needs_update = true;
} else {
for (int i = 0; i < tensor->dim(); i++) {
if (it->second[i] != tensor_strides[i]) {
needs_update = true;
break;
}
}
}
needs_update = !std::equal(
it->second.begin(),
it->second.end(),
tensor_strides.begin(),
tensor_strides.end());
}

if (needs_update) {
@@ -106,18 +104,16 @@ AOTITorchError aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes) {
if (it == internal::tensor_to_sizes.end()) {
needs_update = true;
} else {
// Check if cached values are still valid
// CRITICAL: Multimodal models reuse tensors with different shapes across
// executions (e.g., variable-length audio). We MUST validate cached
// metadata matches current tensor state, or CUDA kernels will receive
// incorrect shapes leading to memory corruption and segfaults.
auto tensor_sizes = tensor->sizes();
if (it->second.size() != static_cast<size_t>(tensor->dim())) {
needs_update = true;
} else {
for (int i = 0; i < tensor->dim(); i++) {
if (it->second[i] != tensor_sizes[i]) {
needs_update = true;
break;
}
}
}
needs_update = !std::equal(
it->second.begin(),
it->second.end(),
tensor_sizes.begin(),
tensor_sizes.end());
}

if (needs_update) {
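The refactor above relies on the four-iterator overload of `std::equal` (C++14), which returns `false` when the two ranges differ in length, so the old explicit `dim()` size check is subsumed. A minimal standalone sketch of the validation pattern, assuming the cache stores `std::vector<int64_t>` as the `it->second` usage suggests:

```cpp
// Standalone illustration of the cache-validation pattern, not the
// ExecuTorch implementation. std::equal(first1, last1, first2, last2)
// compares lengths as well as elements, so a rank mismatch alone marks
// the cached metadata as stale.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

bool needs_update(
    const std::vector<int64_t>& cached,
    const std::vector<int64_t>& current) {
  return !std::equal(
      cached.begin(), cached.end(), current.begin(), current.end());
}

int main() {
  const std::vector<int64_t> cached = {128, 1}; // strides cached earlier
  std::cout << needs_update(cached, {128, 1}) << "\n";   // 0: still valid
  std::cout << needs_update(cached, {256, 1}) << "\n";   // 1: shape changed
  std::cout << needs_update(cached, {64, 8, 1}) << "\n"; // 1: rank changed
}
```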
51 changes: 51 additions & 0 deletions examples/models/voxtral/README.md
@@ -36,6 +36,29 @@ optimum-cli export executorch \

This exports Voxtral with XNNPACK backend acceleration and 4-bit weight/8-bit activation linear quantization.

## CUDA Support
If your environment has CUDA support, you can run the model on CUDA for improved performance. Follow the export and runtime commands below.

**Note:** Quantization support for CUDA is still in progress; only the bfloat16 dtype is currently supported for CUDA execution.

### Exporting with CUDA
```
optimum-cli export executorch \
--model "mistralai/Voxtral-Mini-3B-2507" \
--task "multimodal-text-to-text" \
--recipe "cuda" \
--dtype bfloat16 \
--device cuda \
--max_seq_len 1024 \
--output_dir="voxtral"
```

This will generate:
- `model.pte` - The exported model
- `aoti_cuda_blob.ptd` - The CUDA kernel blob required for runtime

See the "Building the multimodal runner" section below for instructions on building with CUDA support, and the "Running the model" section for runtime instructions.

# Running the model
To run the model, we use the Voxtral runner, which is built on ExecuTorch's MultiModal runner API.
The Voxtral runner does the following:
@@ -56,6 +79,8 @@ python -m executorch.extension.audio.mel_spectrogram --feature_size 128 --stack_
```

## Building the multimodal runner

### Building for CPU (XNNPACK)
```
# Build and install ExecuTorch
cmake --preset llm -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=cmake-out -DEXECUTORCH_ENABLE_LOGGING=ON && cmake --build cmake-out -j16 --target install --config Release
@@ -64,6 +89,26 @@ cmake --preset llm -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=cmake-out -
cmake -DCMAKE_INSTALL_PREFIX=cmake-out -DBUILD_TESTING=OFF -DCMAKE_BUILD_TYPE=Release -Bcmake-out/examples/models/voxtral examples/models/voxtral && cmake --build cmake-out/examples/models/voxtral -j16 --config Release
```

### Building for CUDA
```
# Install ExecuTorch with CUDA support
CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_executorch.sh

# Build the multimodal runner with CUDA
cmake --preset llm \
-DEXECUTORCH_BUILD_CUDA=ON \
-DCMAKE_INSTALL_PREFIX=cmake-out \
-DCMAKE_BUILD_TYPE=Release \
-Bcmake-out -S.
cmake --build cmake-out -j16 --target install --config Release

cmake -DEXECUTORCH_BUILD_CUDA=ON \
-DCMAKE_BUILD_TYPE=Release \
-Sexamples/models/voxtral \
-Bcmake-out/examples/models/voxtral/
cmake --build cmake-out/examples/models/voxtral --target voxtral_runner --config Release
```

## Running the model
You can download the `tekken.json` tokenizer from [Voxtral's HuggingFace repo](https://huggingface.co/mistralai/Voxtral-Mini-3B-2507).

@@ -88,6 +133,12 @@ If you already have a preprocessed mel spectrogram saved as a `.bin` file, you c
--audio_path path/to/preprocessed_audio.bin
```


**For CUDA:** Add the `--data_path` argument to the commands above to point the runner at the CUDA kernel blob:
```
--data_path path/to/aoti_cuda_blob.ptd
```
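For example, a CUDA invocation might look like the following. The `--model_path` and `--tokenizer_path` flag names here are illustrative placeholders (the full runner command is shown earlier in this README); the binary name comes from the `voxtral_runner` build target above:
```
cmake-out/examples/models/voxtral/voxtral_runner \
  --model_path voxtral/model.pte \
  --tokenizer_path path/to/tekken.json \
  --audio_path path/to/preprocessed_audio.bin \
  --data_path voxtral/aoti_cuda_blob.ptd
```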

Example output:
```
The speaker in this audio seems to be talking about their concerns about a device called the model or maybe they're just talking about the model in general. They mention that the model was trained with the speaker for inference, which suggests that
5 changes: 3 additions & 2 deletions extension/llm/runner/util.h
@@ -157,9 +157,10 @@ convert_to_bfloat16(const ::executorch::extension::TensorPtr& src_tensor) {

auto bf16_tensor = ::executorch::extension::empty_like(
src_tensor, ::executorch::aten::ScalarType::BFloat16);
auto* bf16_data = bf16_tensor->mutable_data_ptr<::c10::BFloat16>();
auto* bf16_data =
bf16_tensor->mutable_data_ptr<::executorch::aten::BFloat16>();
for (size_t i = 0; i < num_elements; ++i) {
bf16_data[i] = ::c10::BFloat16(float_data[i]);
bf16_data[i] = ::executorch::aten::BFloat16(float_data[i]);
}

return bf16_tensor;
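Note that `::executorch::aten::ScalarType::BFloat16` is an enum value used for dtype dispatch, not a type, so the diff above uses the `::executorch::aten::BFloat16` element type instead. For reference, the element-wise narrowing that `convert_to_bfloat16` performs can be sketched in plain C++. This is an illustration of the bfloat16 format, not ExecuTorch's implementation (the `BFloat16` constructor handles the rounding internally; NaN handling is omitted here):

```cpp
// Illustration of float -> bfloat16 narrowing (round-to-nearest-even).
// bfloat16 keeps a float's sign bit, all 8 exponent bits, and the top
// 7 mantissa bits, i.e., the high 16 bits of the IEEE-754 encoding.
#include <cstdint>
#include <cstring>
#include <vector>

uint16_t float_to_bf16(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof bits);
  // Add a bias so truncating the low 16 bits rounds to nearest even.
  uint32_t bias = 0x7FFF + ((bits >> 16) & 1);
  return static_cast<uint16_t>((bits + bias) >> 16);
}

int main() {
  std::vector<float> src = {1.0f, 3.14159f, -0.5f};
  std::vector<uint16_t> dst(src.size());
  for (size_t i = 0; i < src.size(); ++i) {
    dst[i] = float_to_bf16(src[i]); // mirrors the loop in convert_to_bfloat16
  }
}
```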