Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 18 additions & 12 deletions models/stt/qwen3-forced-aligner-0.6b/coreml/convert-coreml.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ def _save_mlpackage(model: ct.models.MLModel, path: Path, description: str) -> N
# Audio Encoder Conversion
# ---------------------------------------------------------------------------

def convert_audio_encoder(model, settings: ExportSettings) -> Path:
def convert_audio_encoder(model, settings: ExportSettings, *, no_optimize: bool = False) -> Path:
"""Convert the audio encoder (24-layer, 1024 dim) to CoreML."""
typer.echo("\n=== Converting Audio Encoder ===")

Expand Down Expand Up @@ -232,6 +232,7 @@ def convert_audio_encoder(model, settings: ExportSettings) -> Path:
traced, inputs, outputs, settings,
compute_units_override=settings.compute_units,
compute_precision_override=None, # default FP16
no_optimize=no_optimize,
)

path = settings.output_dir / "forced_aligner_audio_encoder.mlpackage"
Expand All @@ -244,7 +245,7 @@ def convert_audio_encoder(model, settings: ExportSettings) -> Path:
# Audio Conv Conversion (split encoder: conv frontend only)
# ---------------------------------------------------------------------------

def convert_audio_conv(model, settings: ExportSettings) -> Path:
def convert_audio_conv(model, settings: ExportSettings, *, no_optimize: bool = False) -> Path:
"""Convert the audio encoder conv frontend (no transformer) to CoreML."""
typer.echo("\n=== Converting Audio Conv (split encoder) ===")

Expand Down Expand Up @@ -285,6 +286,7 @@ def convert_audio_conv(model, settings: ExportSettings) -> Path:
traced, inputs, outputs, settings,
compute_units_override=settings.compute_units,
compute_precision_override=None, # default FP16
no_optimize=no_optimize,
)

path = settings.output_dir / "forced_aligner_audio_conv.mlpackage"
Expand All @@ -297,7 +299,7 @@ def convert_audio_conv(model, settings: ExportSettings) -> Path:
# Audio Transformer Conversion (split encoder: transformer + projection)
# ---------------------------------------------------------------------------

def convert_audio_transformer(model, settings: ExportSettings) -> Path:
def convert_audio_transformer(model, settings: ExportSettings, *, no_optimize: bool = False) -> Path:
"""Convert the audio encoder transformer + projection to CoreML.

This takes concatenated conv features from multiple chunks and runs
Expand Down Expand Up @@ -345,6 +347,7 @@ def convert_audio_transformer(model, settings: ExportSettings) -> Path:
traced, inputs, outputs, settings,
compute_units_override=settings.compute_units,
compute_precision_override=ct.precision.FLOAT32,
no_optimize=no_optimize,
)

path = settings.output_dir / "forced_aligner_audio_transformer.mlpackage"
Expand All @@ -360,7 +363,7 @@ def convert_audio_transformer(model, settings: ExportSettings) -> Path:
# Text Embedding Conversion
# ---------------------------------------------------------------------------

def convert_embedding(model, settings: ExportSettings) -> Path:
def convert_embedding(model, settings: ExportSettings, *, no_optimize: bool = False) -> Path:
"""Convert the token embedding layer to CoreML."""
typer.echo("\n=== Converting Token Embedding ===")

Expand Down Expand Up @@ -396,6 +399,7 @@ def convert_embedding(model, settings: ExportSettings) -> Path:
coreml_model = _coreml_convert(
traced, inputs, outputs, settings,
compute_units_override=settings.compute_units,
no_optimize=no_optimize,
)

path = settings.output_dir / "forced_aligner_embedding.mlpackage"
Expand All @@ -408,7 +412,7 @@ def convert_embedding(model, settings: ExportSettings) -> Path:
# LM Head Conversion
# ---------------------------------------------------------------------------

def convert_lm_head(model, settings: ExportSettings) -> Path:
def convert_lm_head(model, settings: ExportSettings, *, no_optimize: bool = False) -> Path:
"""Convert the LM head (norm + linear) to CoreML.

For ForcedAligner, the LM head processes the FULL sequence at once
Expand Down Expand Up @@ -454,6 +458,7 @@ def convert_lm_head(model, settings: ExportSettings) -> Path:
traced, inputs, outputs, settings,
compute_units_override=settings.compute_units,
compute_precision_override=ct.precision.FLOAT32,
no_optimize=no_optimize,
)

path = settings.output_dir / "forced_aligner_lm_head.mlpackage"
Expand All @@ -466,7 +471,7 @@ def convert_lm_head(model, settings: ExportSettings) -> Path:
# Prefill Decoder Conversion (NAR — single pass)
# ---------------------------------------------------------------------------

def convert_decoder_prefill(model, settings: ExportSettings) -> Path:
def convert_decoder_prefill(model, settings: ExportSettings, *, no_optimize: bool = False) -> Path:
"""Convert the full decoder stack for NAR prefill.

Unlike Qwen3-ASR which needs autoregressive decode with KV cache,
Expand Down Expand Up @@ -534,6 +539,7 @@ def convert_decoder_prefill(model, settings: ExportSettings) -> Path:
traced, inputs, outputs, settings,
compute_units_override=settings.compute_units,
compute_precision_override=ct.precision.FLOAT32,
no_optimize=no_optimize,
)

path = settings.output_dir / "forced_aligner_decoder_prefill.mlpackage"
Expand Down Expand Up @@ -672,27 +678,27 @@ def convert(
component_paths: Dict[str, object] = {}

if "audio_encoder" in convert_list:
path = convert_audio_encoder(model, settings)
path = convert_audio_encoder(model, settings, no_optimize=no_optimize)
component_paths["audio_encoder"] = {"path": path.name}

if "audio_conv" in convert_list:
path = convert_audio_conv(model, settings)
path = convert_audio_conv(model, settings, no_optimize=no_optimize)
component_paths["audio_conv"] = {"path": path.name}

if "audio_transformer" in convert_list:
path = convert_audio_transformer(model, settings)
path = convert_audio_transformer(model, settings, no_optimize=no_optimize)
component_paths["audio_transformer"] = {"path": path.name}

if "embedding" in convert_list:
path = convert_embedding(model, settings)
path = convert_embedding(model, settings, no_optimize=no_optimize)
component_paths["embedding"] = {"path": path.name}

if "lm_head" in convert_list:
path = convert_lm_head(model, settings)
path = convert_lm_head(model, settings, no_optimize=no_optimize)
component_paths["lm_head"] = {"path": path.name}

if "decoder_prefill" in convert_list:
path = convert_decoder_prefill(model, settings)
path = convert_decoder_prefill(model, settings, no_optimize=no_optimize)
component_paths["decoder_prefill"] = {"path": path.name, "num_layers": 28}

write_metadata(settings, component_paths, model_id)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -736,7 +736,7 @@ def compare(
flac_map[f.name] = str(f)

all_errors = []
for sample in ref_data[:num_files]:
for sample in ref_data["samples"][:num_files]:
audio_path = sample["audio"]
# Resolve relative/short paths
if not Path(audio_path).exists():
Expand All @@ -746,7 +746,7 @@ def compare(
else:
typer.echo(f" WARNING: Cannot find {audio_path}, skipping")
continue
text = sample["text"]
text = sample["transcript"]

typer.echo(f"\n=== {Path(audio_path).name} ===")

Expand All @@ -763,8 +763,8 @@ def compare(
for i in range(n):
ref = ref_alignments[i]
hyp = coreml_alignments[i]
start_err = abs(ref["start_ms"] - hyp.start_ms)
end_err = abs(ref["end_ms"] - hyp.end_ms)
start_err = abs(ref["start_time_ms"] - hyp.start_ms)
end_err = abs(ref["end_time_ms"] - hyp.end_ms)
sample_errors.extend([start_err, end_err])
all_errors.extend([start_err, end_err])

Expand All @@ -783,9 +783,9 @@ def compare(
hyp = coreml_alignments[i]
typer.echo(
f" {ref['text']:12s} "
f"PT: {ref['start_ms']:7.1f}-{ref['end_ms']:7.1f} "
f"PT: {ref['start_time_ms']:7.1f}-{ref['end_time_ms']:7.1f} "
f"CM: {hyp.start_ms:7.1f}-{hyp.end_ms:7.1f} "
f"Δ: {abs(ref['start_ms'] - hyp.start_ms):5.1f}/{abs(ref['end_ms'] - hyp.end_ms):5.1f}ms"
f"Δ: {abs(ref['start_time_ms'] - hyp.start_ms):5.1f}/{abs(ref['end_time_ms'] - hyp.end_ms):5.1f}ms"
)

# Overall summary
Expand Down