Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .ci/scripts/export_model_artifact.sh
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ case "$HF_MODEL" in
PREPROCESSOR_FEATURE_SIZE=""
PREPROCESSOR_OUTPUT=""
;;
SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4)
SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4|SocialLocalMobile/Qwen3.6-35B-A3B-HQQ-INT4)
MODEL_NAME="qwen3_5_moe"
TASK=""
MAX_SEQ_LEN=""
Expand All @@ -194,7 +194,7 @@ case "$HF_MODEL" in
;;
*)
echo "Error: Unsupported model '$HF_MODEL'"
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4"
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4, SocialLocalMobile/Qwen3.6-35B-A3B-HQQ-INT4"
exit 1
;;
esac
Expand Down
4 changes: 2 additions & 2 deletions .ci/scripts/test_model_e2e.sh
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ case "$HF_MODEL" in
AUDIO_FILE="test_audio.wav"
IMAGE_PATH=""
;;
SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4)
SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4|SocialLocalMobile/Qwen3.6-35B-A3B-HQQ-INT4)
MODEL_NAME="qwen3_5_moe"
RUNNER_TARGET="qwen3_5_moe_runner"
RUNNER_PATH="qwen3_5_moe"
Expand All @@ -230,7 +230,7 @@ case "$HF_MODEL" in
;;
*)
echo "Error: Unsupported model '$HF_MODEL'"
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, nvidia/diar_streaming_sortformer_4spk-v2, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4"
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, nvidia/diar_streaming_sortformer_4spk-v2, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4, SocialLocalMobile/Qwen3.6-35B-A3B-HQQ-INT4"
exit 1
;;
esac
Expand Down
28 changes: 24 additions & 4 deletions .github/workflows/cuda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,8 @@ jobs:
name: "dinov2-small-imagenet1k-1-layer"
- repo: "SocialLocalMobile"
name: "Qwen3.5-35B-A3B-HQQ-INT4"
- repo: "SocialLocalMobile"
name: "Qwen3.6-35B-A3B-HQQ-INT4"
quant:
- "non-quantized"
- "quantized-int4-tile-packed"
Expand All @@ -191,7 +193,7 @@ jobs:
repo: "google"
name: "gemma-3-4b-it"
quant: "quantized-int4-weight-only"
# Qwen3.5 MoE uses a prequantized checkpoint, only tile-packed
# Qwen MoE uses prequantized checkpoints, only tile-packed
- model:
repo: "SocialLocalMobile"
name: "Qwen3.5-35B-A3B-HQQ-INT4"
Expand All @@ -200,6 +202,14 @@ jobs:
repo: "SocialLocalMobile"
name: "Qwen3.5-35B-A3B-HQQ-INT4"
quant: "quantized-int4-weight-only"
- model:
repo: "SocialLocalMobile"
name: "Qwen3.6-35B-A3B-HQQ-INT4"
quant: "non-quantized"
- model:
repo: "SocialLocalMobile"
name: "Qwen3.6-35B-A3B-HQQ-INT4"
quant: "quantized-int4-weight-only"
# Voxtral Realtime only supports int4-tile-packed on CUDA
- model:
repo: "mistralai"
Expand Down Expand Up @@ -254,7 +264,7 @@ jobs:
with:
timeout: 90
secrets-env: EXECUTORCH_HF_TOKEN
runner: ${{ matrix.model.name == 'Qwen3.5-35B-A3B-HQQ-INT4' && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }}
runner: ${{ (matrix.model.name == 'Qwen3.5-35B-A3B-HQQ-INT4' || matrix.model.name == 'Qwen3.6-35B-A3B-HQQ-INT4') && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }}
gpu-arch-type: cuda
gpu-arch-version: 12.6
use-custom-docker-registry: false
Expand Down Expand Up @@ -311,6 +321,8 @@ jobs:
name: "dinov2-small-imagenet1k-1-layer"
- repo: "SocialLocalMobile"
name: "Qwen3.5-35B-A3B-HQQ-INT4"
- repo: "SocialLocalMobile"
name: "Qwen3.6-35B-A3B-HQQ-INT4"
quant:
- "non-quantized"
- "quantized-int4-tile-packed"
Expand All @@ -321,7 +333,7 @@ jobs:
repo: "google"
name: "gemma-3-4b-it"
quant: "quantized-int4-weight-only"
# Qwen3.5 MoE uses a prequantized checkpoint, only tile-packed
# Qwen MoE uses prequantized checkpoints, only tile-packed
- model:
repo: "SocialLocalMobile"
name: "Qwen3.5-35B-A3B-HQQ-INT4"
Expand All @@ -330,6 +342,14 @@ jobs:
repo: "SocialLocalMobile"
name: "Qwen3.5-35B-A3B-HQQ-INT4"
quant: "quantized-int4-weight-only"
- model:
repo: "SocialLocalMobile"
name: "Qwen3.6-35B-A3B-HQQ-INT4"
quant: "non-quantized"
- model:
repo: "SocialLocalMobile"
name: "Qwen3.6-35B-A3B-HQQ-INT4"
quant: "quantized-int4-weight-only"
# Voxtral Realtime only supports int4-tile-packed on CUDA
- model:
repo: "mistralai"
Expand Down Expand Up @@ -378,7 +398,7 @@ jobs:
quant: "non-quantized"
with:
timeout: 90
runner: ${{ matrix.model.name == 'Qwen3.5-35B-A3B-HQQ-INT4' && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }}
runner: ${{ (matrix.model.name == 'Qwen3.5-35B-A3B-HQQ-INT4' || matrix.model.name == 'Qwen3.6-35B-A3B-HQQ-INT4') && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }}
gpu-arch-type: cuda
gpu-arch-version: 12.6
use-custom-docker-registry: false
Expand Down
44 changes: 37 additions & 7 deletions examples/models/qwen3_5_moe/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,24 @@ Export produces a `model.pte` and `aoti_cuda_blob.ptd` containing the
compiled CUDA kernels and quantized weights. Int4 quantization is
recommended — the model is too large to fit in VRAM at bf16.

### Quick start: prequantized weights

The fastest path is to export from prequantized weights, which skips
the slow quantization step entirely.

Prequantized checkpoints are available for download:
- [SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4](https://huggingface.co/SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4)
- [SocialLocalMobile/Qwen3.6-35B-A3B-HQQ-INT4](https://huggingface.co/SocialLocalMobile/Qwen3.6-35B-A3B-HQQ-INT4)

```bash
python export.py --prequantized <path-to-checkpoint-dir>
```

See [Generating Prequantized Weights](#generating-prequantized-weights)
to create your own.

### Quantize and Export

```bash
python export.py \
--model-id Qwen/Qwen3.5-35B-A3B \
Expand Down Expand Up @@ -60,7 +78,8 @@ python export.py \
| `--qlinear-group-size` | `32` | Group size for linear quantization |
| `--qembedding` | (none) | Embedding quantization: `8w` |
| `--hqq` | off | Use HQQ scale-only optimization for expert quantization (slower, better accuracy) |
| `--prequantized` | (none) | Path to prequantized bundle directory (skips quantization) |
| `--sensitive` | off | Sensitivity-aware mixed precision (bf16/INT8/INT4). Required for models without quantization-aware training (e.g. Qwen3.6) |
| `--prequantized` | (none) | Path to prequantized checkpoint directory (skips quantization) |
| `--turboquant` | off | Enable TurboQuant TQ4 KV cache compression (3.8x cache savings) |

### TurboQuant KV Cache Compression
Expand All @@ -72,11 +91,11 @@ KV cache compression (3.8x savings) on the 10 full-attention layers.
python export.py --prequantized qwen35_moe_int4_hqq --turboquant
```

### Prequantized Export
### Generating Prequantized Weights

Quantization is slow (~30 min with HQQ). To avoid re-quantizing on every
export, use `quantize_and_save.py` to create a self-contained bundle, then
export from it:
export, use `quantize_and_save.py` to create a prequantized checkpoint
directory, then export from it:

```bash
# Step 1: Quantize once (slow)
Expand All @@ -88,13 +107,24 @@ python quantize_and_save.py \
--hqq \
--output qwen35_moe_int4_hqq

# Step 2: Export from bundle (fast, no --model-dir needed)
# Step 2: Export from prequantized checkpoint (fast, no --model-dir needed)
python export.py \
--prequantized qwen35_moe_int4_hqq
```

The bundle contains `model.safetensors`, `config.json`, and tokenizer files.
It can be uploaded to HuggingFace Hub for easy sharing.
For models without quantization-aware training (e.g. Qwen3.6), use
`--sensitive` for mixed-precision quantization:

```bash
python quantize_and_save.py \
--model-dir ~/models/Qwen3.6-35B-A3B \
--sensitive \
--hqq \
--output qwen36_moe_int4_hqq
```

The output directory contains `model.safetensors`, `config.json`, and
tokenizer files. It can be uploaded to HuggingFace Hub for easy sharing.

## Build

Expand Down
Loading
Loading