-
Notifications
You must be signed in to change notification settings - Fork 969
Add Gemma 4 MLX install-path support #19065
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
5f455a2
0a822bd
fd78741
0e00290
3a26baa
0bf5fc4
90e5577
ee272c3
ca37250
818a51d
6e520dd
391cde4
19d6f09
41e3a51
9d3f841
719d2e8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -444,26 +444,50 @@ def _make_io_slots(self): # noqa: C901 | |
| else: | ||
| raise NotImplementedError(f"Support for input {arg} is not implemented") | ||
|
|
||
| placeholder_nodes = { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't follow this change. Why is gemma4 sensitive to this?
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I got here by diffing a previously working Gemma 4. What changed there was the slot assignment for the two rotary constants used by sliding-window vs full attention. This change was just to make that assignment deterministic instead of depending on raw placeholder traversal order. Gemma 4 is where I noticed it because that model exercises both constants in the same path. If you’d prefer, I can drop this |
||
| node.name: node for node in self.ep.graph.nodes if node.op == "placeholder" | ||
| } | ||
|
|
||
| # Allocate placeholder-backed slots in graph-signature order instead of | ||
| # raw FX node traversal order. This keeps lifted constant tids stable | ||
| # across equivalent exports, which matters for models like Gemma 4 that | ||
| # carry multiple rotary constant placeholders with similar structure. | ||
| for name in constant_tensors: | ||
| node = placeholder_nodes.get(name) | ||
| if node is None or node.users == {}: | ||
| continue | ||
| self.make_or_get_slot(node, id_space=IdSpace.Constant) | ||
|
|
||
| for name in user_inputs: | ||
| node = placeholder_nodes.get(name) | ||
| if node is None or node.users == {}: | ||
| continue | ||
| val = node.meta.get("val", None) | ||
| if isinstance(val, torch.Tensor) and not val.is_contiguous(): | ||
| raise ValueError( | ||
| f"MLX backend requires contiguous input tensors, " | ||
| f"but input '{node.name}' has non-contiguous strides. " | ||
| f"shape={list(val.shape)}, stride={list(val.stride())}. " | ||
| f"Ensure example inputs passed to torch.export.export() " | ||
| f"are contiguous (call .contiguous() on them)." | ||
| ) | ||
| self.make_or_get_slot(node, id_space=IdSpace.Input) | ||
|
|
||
| for name in mutable_buffers: | ||
| node = placeholder_nodes.get(name) | ||
| if node is None or node.users == {}: | ||
| continue | ||
| self.make_or_get_slot(node, id_space=IdSpace.MutableBuffer) | ||
|
|
||
| classified_placeholders = ( | ||
| set(constant_tensors) | set(user_inputs) | set(mutable_buffers) | ||
| ) | ||
|
|
||
| for node in self.ep.graph.nodes: | ||
| if node.op == "placeholder": | ||
| if node.users == {}: | ||
| continue | ||
| if node.name in constant_tensors: | ||
| self.make_or_get_slot(node, id_space=IdSpace.Constant) | ||
| elif node.name in user_inputs: | ||
| val = node.meta.get("val", None) | ||
| if isinstance(val, torch.Tensor) and not val.is_contiguous(): | ||
| raise ValueError( | ||
| f"MLX backend requires contiguous input tensors, " | ||
| f"but input '{node.name}' has non-contiguous strides. " | ||
| f"shape={list(val.shape)}, stride={list(val.stride())}. " | ||
| f"Ensure example inputs passed to torch.export.export() " | ||
| f"are contiguous (call .contiguous() on them)." | ||
| ) | ||
| self.make_or_get_slot(node, id_space=IdSpace.Input) | ||
| elif node.name in mutable_buffers: | ||
| self.make_or_get_slot(node, id_space=IdSpace.MutableBuffer) | ||
| else: | ||
| if node.name not in classified_placeholders: | ||
| raise NotImplementedError( | ||
| f"Support for placeholder {node.name} is not implemented" | ||
| ) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why no embedding?