Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .ci/scripts/export_model_artifact.sh
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ case "$HF_MODEL" in
PREPROCESSOR_FEATURE_SIZE=""
PREPROCESSOR_OUTPUT=""
;;
SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4)
SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4|SocialLocalMobile/Qwen3.6-35B-A3B-HQQ-INT4)
MODEL_NAME="qwen3_5_moe"
TASK=""
MAX_SEQ_LEN=""
Expand All @@ -194,7 +194,7 @@ case "$HF_MODEL" in
;;
*)
echo "Error: Unsupported model '$HF_MODEL'"
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4"
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4, SocialLocalMobile/Qwen3.6-35B-A3B-HQQ-INT4"
exit 1
;;
esac
Expand Down
4 changes: 2 additions & 2 deletions .ci/scripts/test_model_e2e.sh
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ case "$HF_MODEL" in
AUDIO_FILE="test_audio.wav"
IMAGE_PATH=""
;;
SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4)
SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4|SocialLocalMobile/Qwen3.6-35B-A3B-HQQ-INT4)
MODEL_NAME="qwen3_5_moe"
RUNNER_TARGET="qwen3_5_moe_runner"
RUNNER_PATH="qwen3_5_moe"
Expand All @@ -230,7 +230,7 @@ case "$HF_MODEL" in
;;
*)
echo "Error: Unsupported model '$HF_MODEL'"
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, nvidia/diar_streaming_sortformer_4spk-v2, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4"
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, nvidia/diar_streaming_sortformer_4spk-v2, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4, SocialLocalMobile/Qwen3.6-35B-A3B-HQQ-INT4"
exit 1
;;
esac
Expand Down
28 changes: 24 additions & 4 deletions .github/workflows/cuda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,8 @@ jobs:
name: "dinov2-small-imagenet1k-1-layer"
- repo: "SocialLocalMobile"
name: "Qwen3.5-35B-A3B-HQQ-INT4"
- repo: "SocialLocalMobile"
name: "Qwen3.6-35B-A3B-HQQ-INT4"
quant:
- "non-quantized"
- "quantized-int4-tile-packed"
Expand All @@ -191,7 +193,7 @@ jobs:
repo: "google"
name: "gemma-3-4b-it"
quant: "quantized-int4-weight-only"
# Qwen3.5 MoE uses a prequantized checkpoint, only tile-packed
# Qwen MoE uses prequantized checkpoints, only tile-packed
- model:
repo: "SocialLocalMobile"
name: "Qwen3.5-35B-A3B-HQQ-INT4"
Expand All @@ -200,6 +202,14 @@ jobs:
repo: "SocialLocalMobile"
name: "Qwen3.5-35B-A3B-HQQ-INT4"
quant: "quantized-int4-weight-only"
- model:
repo: "SocialLocalMobile"
name: "Qwen3.6-35B-A3B-HQQ-INT4"
quant: "non-quantized"
- model:
repo: "SocialLocalMobile"
name: "Qwen3.6-35B-A3B-HQQ-INT4"
quant: "quantized-int4-weight-only"
# Voxtral Realtime only supports int4-tile-packed on CUDA
- model:
repo: "mistralai"
Expand Down Expand Up @@ -254,7 +264,7 @@ jobs:
with:
timeout: 90
secrets-env: EXECUTORCH_HF_TOKEN
runner: ${{ matrix.model.name == 'Qwen3.5-35B-A3B-HQQ-INT4' && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }}
runner: ${{ (matrix.model.name == 'Qwen3.5-35B-A3B-HQQ-INT4' || matrix.model.name == 'Qwen3.6-35B-A3B-HQQ-INT4') && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }}
gpu-arch-type: cuda
gpu-arch-version: 12.6
use-custom-docker-registry: false
Expand Down Expand Up @@ -311,6 +321,8 @@ jobs:
name: "dinov2-small-imagenet1k-1-layer"
- repo: "SocialLocalMobile"
name: "Qwen3.5-35B-A3B-HQQ-INT4"
- repo: "SocialLocalMobile"
name: "Qwen3.6-35B-A3B-HQQ-INT4"
quant:
- "non-quantized"
- "quantized-int4-tile-packed"
Expand All @@ -321,7 +333,7 @@ jobs:
repo: "google"
name: "gemma-3-4b-it"
quant: "quantized-int4-weight-only"
# Qwen3.5 MoE uses a prequantized checkpoint, only tile-packed
# Qwen MoE uses prequantized checkpoints, only tile-packed
- model:
repo: "SocialLocalMobile"
name: "Qwen3.5-35B-A3B-HQQ-INT4"
Expand All @@ -330,6 +342,14 @@ jobs:
repo: "SocialLocalMobile"
name: "Qwen3.5-35B-A3B-HQQ-INT4"
quant: "quantized-int4-weight-only"
- model:
repo: "SocialLocalMobile"
name: "Qwen3.6-35B-A3B-HQQ-INT4"
quant: "non-quantized"
- model:
repo: "SocialLocalMobile"
name: "Qwen3.6-35B-A3B-HQQ-INT4"
quant: "quantized-int4-weight-only"
# Voxtral Realtime only supports int4-tile-packed on CUDA
- model:
repo: "mistralai"
Expand Down Expand Up @@ -378,7 +398,7 @@ jobs:
quant: "non-quantized"
with:
timeout: 90
runner: ${{ matrix.model.name == 'Qwen3.5-35B-A3B-HQQ-INT4' && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }}
runner: ${{ (matrix.model.name == 'Qwen3.5-35B-A3B-HQQ-INT4' || matrix.model.name == 'Qwen3.6-35B-A3B-HQQ-INT4') && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }}
gpu-arch-type: cuda
gpu-arch-version: 12.6
use-custom-docker-registry: false
Expand Down
44 changes: 37 additions & 7 deletions examples/models/qwen3_5_moe/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,24 @@ Export produces a `model.pte` and `aoti_cuda_blob.ptd` containing the
compiled CUDA kernels and quantized weights. Int4 quantization is
recommended — the model is too large to fit in VRAM at bf16.

### Quick start: prequantized weights

The fastest path is to export from prequantized weights, which skips
the slow quantization step entirely.

Prequantized checkpoints are available for download:
- [SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4](https://huggingface.co/SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4)
- [SocialLocalMobile/Qwen3.6-35B-A3B-HQQ-INT4](https://huggingface.co/SocialLocalMobile/Qwen3.6-35B-A3B-HQQ-INT4)

```bash
python export.py --prequantized <path-to-checkpoint-dir>
```

See [Generating Prequantized Weights](#generating-prequantized-weights)
to create your own.

### Quantize and Export

```bash
python export.py \
--model-id Qwen/Qwen3.5-35B-A3B \
Expand Down Expand Up @@ -60,7 +78,8 @@ python export.py \
| `--qlinear-group-size` | `32` | Group size for linear quantization |
| `--qembedding` | (none) | Embedding quantization: `8w` |
| `--hqq` | off | Use HQQ scale-only optimization for expert quantization (slower, better accuracy) |
| `--prequantized` | (none) | Path to prequantized bundle directory (skips quantization) |
| `--sensitive` | off | Sensitivity-aware mixed precision (bf16/INT8/INT4). Required for models without quantization-aware training (e.g. Qwen3.6) |
| `--prequantized` | (none) | Path to prequantized checkpoint directory (skips quantization) |
| `--turboquant` | off | Enable TurboQuant TQ4 KV cache compression (3.8x cache savings) |

### TurboQuant KV Cache Compression
Expand All @@ -72,11 +91,11 @@ KV cache compression (3.8x savings) on the 10 full-attention layers.
python export.py --prequantized qwen35_moe_int4_hqq --turboquant
```

### Prequantized Export
### Generating Prequantized Weights

Quantization is slow (~30 min with HQQ). To avoid re-quantizing on every
export, use `quantize_and_save.py` to create a self-contained bundle, then
export from it:
export, use `quantize_and_save.py` to create a prequantized checkpoint
directory, then export from it:

```bash
# Step 1: Quantize once (slow)
Expand All @@ -88,13 +107,24 @@ python quantize_and_save.py \
--hqq \
--output qwen35_moe_int4_hqq

# Step 2: Export from bundle (fast, no --model-dir needed)
# Step 2: Export from prequantized checkpoint (fast, no --model-dir needed)
python export.py \
--prequantized qwen35_moe_int4_hqq
```

The bundle contains `model.safetensors`, `config.json`, and tokenizer files.
It can be uploaded to HuggingFace Hub for easy sharing.
For models without quantization-aware training (e.g. Qwen3.6), use
`--sensitive` for mixed-precision quantization:

```bash
python quantize_and_save.py \
--model-dir ~/models/Qwen3.6-35B-A3B \
--sensitive \
--hqq \
--output qwen36_moe_int4_hqq
```

The output directory contains `model.safetensors`, `config.json`, and
tokenizer files. It can be uploaded to HuggingFace Hub for easy sharing.

## Build

Expand Down
Loading
Loading