Skip to content

Commit d82bde2

Browse files
committed
Add auto model download from HuggingFace on first launch
1 parent 20d57c5 commit d82bde2

10 files changed

Lines changed: 400 additions & 53 deletions

File tree

PR_DESCRIPTION.md

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
# Add Voxtral Realtime Windows WPF Application
2+
3+
## Summary
4+
5+
Adds a Windows WPF (.NET 8) desktop application for Voxtral Realtime — a real-time speech-to-text transcription tool powered by ExecuTorch. This is the Windows counterpart to the existing macOS SwiftUI application.
6+
7+
## Key Features
8+
9+
- **Real-time transcription** — Live speech-to-text using the Voxtral model running locally via ExecuTorch
10+
- **Dictation mode** — Speak and auto-paste transcribed text into any target application via clipboard
11+
- **Silence detection** — Peak-based audio level monitoring with configurable silence threshold and timeout for auto-stop
12+
- **Text processing pipeline** — Post-processing with configurable text replacements and snippet expansion
13+
- **Session history** — Persistent transcript storage with search and browsing
14+
- **Global hotkey** — System-wide keyboard shortcut to start/stop transcription
15+
- **MVVM architecture** — Clean separation using `CommunityToolkit.Mvvm` with `ObservableObject` view models
16+
17+
## Architecture
18+
19+
```
20+
VoxtralRealtime/
21+
├── Models/ # Data models (Session, Snippet, ReplacementEntry, Enums)
22+
├── Converters/ # WPF value converters
23+
├── Services/ # Core services
24+
│ ├── RunnerBridge.cs # ExecuTorch model bridge
25+
│ ├── AudioCaptureService.cs # NAudio-based mic capture
26+
│ ├── ClipboardPasteService.cs # Win32 clipboard + paste
27+
│ ├── GlobalHotkeyService.cs # System-wide hotkey registration
28+
│ ├── TextPipeline.cs # Post-processing pipeline
29+
│ ├── PersistenceService.cs # JSON file storage
30+
│ └── AppLogger.cs # File-based logging
31+
├── ViewModels/ # MVVM view models
32+
│ ├── TranscriptStoreViewModel.cs # Central state management
33+
│ ├── DictationViewModel.cs # Dictation flow + silence monitor
34+
│ ├── SettingsViewModel.cs # App configuration
35+
│ ├── ReplacementStoreViewModel.cs # Text replacements
36+
│ └── SnippetStoreViewModel.cs # Text snippets
37+
├── Views/ # WPF XAML views
38+
│ ├── MainWindow # Shell with sidebar + detail layout
39+
│ ├── WelcomeView # Home/landing page
40+
│ ├── TranscriptView # Live transcript display
41+
│ ├── SidebarView # Navigation sidebar
42+
│ ├── RecordingControlsBar # Toolbar (transcribe/pause/done)
43+
│ ├── DictationWindow # Floating dictation overlay
44+
│ ├── AudioLevelControl # Real-time audio level meter
45+
│ ├── SettingsView # Configuration UI
46+
│ └── *ManagementViews # Replacement & snippet editors
47+
└── Resources/ # Styles and assets
48+
```
49+
50+
## Also Included
51+
52+
- `.gitignore` updated with .NET/C# build artifact rules (`bin/`, `obj/`, `publish/`, etc.)
53+
- `build.bat` for release builds
54+
- `upload_models.py` helper for model distribution
55+
- `README.md` with setup and build instructions
56+
57+
## Test Plan
58+
59+
- Built and tested manually on Windows 10/11 with .NET 8
60+
- Verified real-time transcription, dictation auto-paste, silence detection, and session persistence

voxtral_realtime/windows/README.md

Lines changed: 25 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,10 @@ This is the Windows equivalent of the [macOS Voxtral Realtime app](../macos/).
66

77
## Quick Start
88

9-
Download `VoxtralRealtime-Setup.exe` from the [Releases](https://github.com/meta-pytorch/executorch-examples/releases) page and run the installer. Everything is bundled -- the app, runner, model weights, and tokenizer. No additional downloads required.
10-
11-
After install, launch from the Start Menu or desktop shortcut and click "Start Transcription".
9+
1. Download `VoxtralRealtime-Setup.exe` from the [Releases](https://github.com/meta-pytorch/executorch-examples/releases) page and run the installer
10+
2. Launch from the Start Menu or desktop shortcut
11+
3. Click **"Load Model"** — the app will automatically download the required model files (~5.2 GB) from HuggingFace on first launch
12+
4. Once loaded, click **"Start Transcription"**
1213

1314
### Requirements
1415

@@ -23,6 +24,7 @@ After install, launch from the Start Menu or desktop shortcut and click "Start T
2324
- **Text Snippets** - Voice-triggered templates for common text blocks
2425
- **Dictation Mode** - Ctrl+Space global hotkey, floating overlay, auto-paste to any app, auto-stop on 2s silence
2526
- **Audio Level Visualization** - Real-time waveform display
27+
- **Auto Model Download** - Model weights are downloaded from HuggingFace on first launch with progress tracking
2628

2729
## Keyboard Shortcuts
2830

@@ -41,24 +43,19 @@ For developers who want to build the app themselves.
4143

4244
- [.NET 8.0 SDK](https://dotnet.microsoft.com/download/dotnet/8.0)
4345
- Pre-built `voxtral_realtime_runner.exe` (see [Building the Runner](#building-the-runner))
44-
- Model files from HuggingFace (see [Model Files](#model-files))
45-
46-
### Model Files
47-
48-
```powershell
49-
pip install huggingface_hub
50-
huggingface-cli download younghan-meta/Voxtral-Mini-4B-Realtime-2602-ExecuTorch-CUDA-Windows --local-dir voxtral_rt_exports
51-
huggingface-cli download mistralai/Voxtral-Mini-4B-Realtime-2602 tekken.json --local-dir voxtral_tokenizer
52-
```
5346

5447
### Building the Runner
5548

49+
Build the `voxtral_realtime_runner.exe` from the [ExecuTorch](https://github.com/pytorch/executorch) repo:
50+
5651
```bash
5752
cd executorch
5853
cmake --preset voxtral-realtime-cuda
5954
cmake --build --preset voxtral-realtime-cuda
6055
```
6156

57+
The runner will be at `cmake-out/examples/models/voxtral_realtime/Release/voxtral_realtime_runner.exe`.
58+
6259
### Build and Run
6360

6461
```powershell
@@ -68,16 +65,26 @@ dotnet build --configuration Release
6865
dotnet run --project VoxtralRealtime --configuration Release
6966
```
7067

68+
Model files will be auto-downloaded on first launch to the `models/` directory next to the executable. To download them manually instead:
69+
70+
```powershell
71+
pip install huggingface_hub
72+
huggingface-cli download younghan-meta/Voxtral-Mini-4B-Realtime-2602-ExecuTorch-CUDA-Windows --local-dir models
73+
huggingface-cli download mistralai/Voxtral-Mini-4B-Realtime-2602 tekken.json --local-dir models
74+
```
75+
7176
### Publish Standalone Executable
7277

78+
Create a single self-contained exe (no .NET runtime required on target machine):
79+
7380
```powershell
7481
cd VoxtralRealtime
7582
dotnet publish VoxtralRealtime --configuration Release --runtime win-x64 --self-contained true /p:PublishSingleFile=true /p:IncludeNativeLibrariesForSelfExtract=true /p:DebugType=none -o publish
7683
```
7784

7885
### Create Installer
7986

80-
Builds a self-contained installer that bundles the app, runner, model weights, and tokenizer:
87+
Builds an installer that bundles the app and runner. Model weights are downloaded automatically on first launch.
8188

8289
```powershell
8390
# 1. Install Inno Setup (one-time)
@@ -87,19 +94,20 @@ winget install JRSoftware.InnoSetup
8794
cd VoxtralRealtime
8895
dotnet publish VoxtralRealtime --configuration Release --runtime win-x64 --self-contained true /p:PublishSingleFile=true /p:IncludeNativeLibrariesForSelfExtract=true /p:DebugType=none -o publish
8996
90-
# 3. Build the installer
97+
# 3. Build the installer (set EXECUTORCH_ROOT to your ExecuTorch repo path)
9198
cd ..
92-
ISCC installer.iss
99+
$env:EXECUTORCH_ROOT = "C:\path\to\executorch"
100+
& "$env:LOCALAPPDATA\Programs\Inno Setup 6\ISCC.exe" installer.iss
93101
```
94102

95103
The output `installer-output\VoxtralRealtime-Setup.exe` includes:
96104
- App executable (self-contained, no .NET runtime needed)
97105
- `voxtral_realtime_runner.exe` + `aoti_cuda_shims.dll`
98-
- Model weights (`model.pte`, `preprocessor.pte`, `aoti_cuda_blob.ptd`)
99-
- Tokenizer (`tekken.json`)
100106
- Start Menu and optional desktop shortcuts
101107
- Clean uninstall via Windows Settings
102108

109+
Model weights (`model.pte`, `preprocessor.pte`, `aoti_cuda_blob.ptd`, `tekken.json`) are **not bundled** — they are downloaded from HuggingFace on first launch, keeping the installer small (~49 MB).
110+
103111
## Architecture
104112

105113
The app wraps the `voxtral_realtime_runner.exe` C++ binary via stdin/stdout pipes:

voxtral_realtime/windows/VoxtralRealtime/VoxtralRealtime/Models/Enums.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ public enum SessionState
1616
// Model states, declared in the order Unloaded, Downloading, Loading, Ready.
// No explicit values are assigned, so adding Downloading in second position
// changes the implicit numeric values of Loading (1 -> 2) and Ready (2 -> 3).
// NOTE(review): confirm nothing persists or compares these members as integers.
public enum ModelState
1717
{
1818
Unloaded,
19+
// NOTE(review): presumably the state while model files are being fetched — confirm against the code that sets it.
Downloading,
1920
Loading,
2021
Ready
2122
}
Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
// Copyright (c) Meta Platforms, Inc. and affiliates.
2+
// All rights reserved.
3+
// This source code is licensed under the BSD-style license found in the
4+
// LICENSE file in the root directory of this source tree.
5+
6+
using System.IO;
7+
using System.Net.Http;
8+
9+
namespace VoxtralRealtime.Services;
10+
11+
/// <summary>
/// Snapshot of download progress passed to <see cref="ModelDownloadService.ProgressChanged"/>.
/// </summary>
/// <param name="FileName">Name of the file this snapshot refers to.</param>
/// <param name="BytesDownloaded">
/// Bytes written so far for <paramref name="FileName"/>. Reported as 1 (with
/// <paramref name="TotalBytes"/> also 1) for a file that was already present and skipped.
/// </param>
/// <param name="TotalBytes">
/// Size announced by the server's Content-Length header, or -1 when the header is absent.
/// </param>
/// <param name="FileIndex">1-based position of the file in the download list.</param>
/// <param name="TotalFiles">Number of files in the download list.</param>
public record DownloadProgress(
    string FileName,
    long BytesDownloaded,
    long TotalBytes,
    int FileIndex,
    int TotalFiles);

/// <summary>
/// Downloads the model and tokenizer files from huggingface.co into a local directory,
/// raising <see cref="ProgressChanged"/> as data arrives.
/// </summary>
public class ModelDownloadService
{
    // Size in bytes of both the copy buffer and the FileStream's internal buffer.
    private const int BufferSize = 81920;

    // One shared client for every download.
    // NOTE(review): Timeout bounds the GetAsync call; with ResponseHeadersRead that call
    // presumably completes once headers arrive, so the body transfer itself is only
    // bounded by the caller's CancellationToken — confirm against the HttpClient docs.
    private static readonly HttpClient Http = new()
    {
        Timeout = TimeSpan.FromHours(2)
    };

    /// <summary>
    /// Per-user directory <c>%LOCALAPPDATA%\VoxtralRealtime\models</c>.
    /// </summary>
    // NOTE(review): SettingsViewModel.ModelsDir points next to the executable instead;
    // confirm which directory callers actually pass to this service.
    public static readonly string DefaultCacheDir = Path.Combine(
        Environment.GetFolderPath(Environment.SpecialFolder.LocalApplicationData),
        "VoxtralRealtime", "models");

    // (repository, file) pairs to fetch; each is resolved against the repo's "main" revision.
    private static readonly (string RepoId, string FileName)[] ModelFiles =
    {
        ("younghan-meta/Voxtral-Mini-4B-Realtime-2602-ExecuTorch-CUDA-Windows", "model.pte"),
        ("younghan-meta/Voxtral-Mini-4B-Realtime-2602-ExecuTorch-CUDA-Windows", "preprocessor.pte"),
        ("younghan-meta/Voxtral-Mini-4B-Realtime-2602-ExecuTorch-CUDA-Windows", "aoti_cuda_blob.ptd"),
        ("mistralai/Voxtral-Mini-4B-Realtime-2602", "tekken.json"),
    };

    /// <summary>
    /// Raised after every chunk written to disk, and once (with 1/1 bytes) for each file
    /// that is skipped because it already exists. Invoked synchronously from the download loop.
    /// </summary>
    public event Action<DownloadProgress>? ProgressChanged;

    /// <summary>
    /// Returns the names of the required files that do not exist in <paramref name="cacheDir"/>.
    /// </summary>
    /// <param name="cacheDir">Directory expected to contain the model files.</param>
    /// <returns>File names (not full paths) in download-list order; empty when nothing is missing.</returns>
    public List<string> GetMissingFiles(string cacheDir)
    {
        var missing = new List<string>();
        foreach (var (_, fileName) in ModelFiles)
        {
            // Presence is the only criterion; size and content are not inspected.
            if (!File.Exists(Path.Combine(cacheDir, fileName)))
            {
                missing.Add(fileName);
            }
        }
        return missing;
    }

    /// <summary>
    /// Downloads, one after another, every required file that is not yet present in
    /// <paramref name="cacheDir"/>, creating the directory if needed.
    /// </summary>
    /// <param name="cacheDir">Destination directory.</param>
    /// <param name="ct">Token that cancels the request and the copy loop.</param>
    /// <exception cref="HttpRequestException">The server returned a non-success status code.</exception>
    /// <exception cref="IOException">
    /// Writing failed, or the received byte count differs from the announced Content-Length.
    /// </exception>
    public async Task DownloadAllAsync(string cacheDir, CancellationToken ct = default)
    {
        Directory.CreateDirectory(cacheDir);

        int fileIndex = 0;
        int totalFiles = ModelFiles.Length;

        foreach (var (repoId, fileName) in ModelFiles)
        {
            fileIndex++;
            var destPath = Path.Combine(cacheDir, fileName);

            if (File.Exists(destPath))
            {
                // Already cached: report the file as complete (1 of 1) and move on.
                ProgressChanged?.Invoke(new DownloadProgress(
                    fileName, 1, 1, fileIndex, totalFiles));
                continue;
            }

            var url = $"https://huggingface.co/{repoId}/resolve/main/{fileName}";
            await DownloadFileAsync(url, destPath, fileName, fileIndex, totalFiles, ct);
        }
    }

    // Streams one URL into "<destPath>.tmp" and renames it to destPath only after the whole
    // body has been written, so an interrupted download never leaves a file that
    // GetMissingFiles/DownloadAllAsync would mistake for a finished one.
    private async Task DownloadFileAsync(
        string url, string destPath, string fileName,
        int fileIndex, int totalFiles, CancellationToken ct)
    {
        var tmpPath = destPath + ".tmp";

        try
        {
            // ResponseHeadersRead: do not buffer the body in memory; it is streamed below.
            using var response = await Http.GetAsync(url, HttpCompletionOption.ResponseHeadersRead, ct);
            response.EnsureSuccessStatusCode();

            var totalBytes = response.Content.Headers.ContentLength ?? -1;

            await using var stream = await response.Content.ReadAsStreamAsync(ct);
            // FileOptions.Asynchronous: the stream is only ever used through WriteAsync/FlushAsync.
            await using var fileStream = new FileStream(
                tmpPath, FileMode.Create, FileAccess.Write, FileShare.None,
                BufferSize, FileOptions.Asynchronous);

            var buffer = new byte[BufferSize];
            long bytesRead = 0;
            int read;

            while ((read = await stream.ReadAsync(buffer, ct)) > 0)
            {
                await fileStream.WriteAsync(buffer.AsMemory(0, read), ct);
                bytesRead += read;

                ProgressChanged?.Invoke(new DownloadProgress(
                    fileName, bytesRead, totalBytes, fileIndex, totalFiles));
            }

            await fileStream.FlushAsync(ct);

            // Never promote a body whose length disagrees with what the server announced:
            // later runs only check File.Exists and would treat it as valid forever.
            if (totalBytes >= 0 && bytesRead != totalBytes)
            {
                throw new IOException(
                    $"Incomplete download of {fileName}: received {bytesRead} of {totalBytes} bytes.");
            }
        }
        catch
        {
            // Best-effort cleanup of the partial file; the original failure is what gets rethrown.
            try { File.Delete(tmpPath); } catch { }
            throw;
        }

        File.Move(tmpPath, destPath, overwrite: true);
    }

    /// <summary>
    /// Formats a byte count using 1024-based units: "B", "KB" (no decimals),
    /// "MB" (one decimal) or "GB" (two decimals).
    /// </summary>
    /// <param name="bytes">Byte count; a negative value means "unknown".</param>
    /// <returns>The formatted size, or "?" when <paramref name="bytes"/> is negative.</returns>
    public static string FormatBytes(long bytes)
    {
        if (bytes < 0) return "?";
        if (bytes < 1024) return $"{bytes} B";
        if (bytes < 1024 * 1024) return $"{bytes / 1024.0:F0} KB";
        if (bytes < 1024L * 1024 * 1024) return $"{bytes / (1024.0 * 1024):F1} MB";
        return $"{bytes / (1024.0 * 1024 * 1024):F2} GB";
    }
}

voxtral_realtime/windows/VoxtralRealtime/VoxtralRealtime/ViewModels/SettingsViewModel.cs

Lines changed: 26 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -12,24 +12,22 @@ namespace VoxtralRealtime.ViewModels;
1212

1313
public partial class SettingsViewModel : ObservableObject
1414
{
15-
private static readonly string DefaultRunnerPath = Path.Combine(
16-
@"C:\Users\younghan\project\executorch",
17-
@"cmake-out\examples\models\voxtral_realtime\Release\voxtral_realtime_runner.exe");
15+
private static readonly string AppDir = AppDomain.CurrentDomain.BaseDirectory;
1816

19-
private static readonly string DefaultModelPath = Path.Combine(
20-
@"C:\Users\younghan\project\executorch",
21-
@"voxtral_rt_exports_wsl\model.pte");
22-
23-
private static readonly string DefaultPreprocessorPath = Path.Combine(
24-
@"C:\Users\younghan\project\executorch",
25-
@"voxtral_rt_exports_wsl\preprocessor.pte");
26-
27-
private static readonly string DefaultDataPath = Path.Combine(
28-
@"C:\Users\younghan\project\executorch",
29-
@"voxtral_rt_exports_wsl\aoti_cuda_blob.ptd");
17+
// Models directory: always next to the exe (install dir or dev dir).
18+
// Models are auto-downloaded on first launch (or placed here manually); the installer no longer bundles them.
19+
public static readonly string ModelsDir = Path.Combine(AppDir, "models");
3020

21+
private static readonly string DefaultRunnerPath =
22+
Path.Combine(AppDir, "runner", "voxtral_realtime_runner.exe");
23+
private static readonly string DefaultModelPath =
24+
Path.Combine(ModelsDir, "model.pte");
25+
private static readonly string DefaultPreprocessorPath =
26+
Path.Combine(ModelsDir, "preprocessor.pte");
27+
private static readonly string DefaultDataPath =
28+
Path.Combine(ModelsDir, "aoti_cuda_blob.ptd");
3129
private static readonly string DefaultTokenizerPath =
32-
@"C:\Users\younghan\models\Voxtral-Mini-4B-Realtime-2602\tekken.json";
30+
Path.Combine(ModelsDir, "tekken.json");
3331

3432
[ObservableProperty] private string _runnerPath = DefaultRunnerPath;
3533
[ObservableProperty] private string _modelPath = DefaultModelPath;
@@ -71,15 +69,23 @@ private void Load()
7169
var data = PersistenceService.Load<SettingsData>(PersistenceService.SettingsPath);
7270
if (data == null) return;
7371

74-
RunnerPath = data.RunnerPath ?? DefaultRunnerPath;
75-
ModelPath = data.ModelPath ?? DefaultModelPath;
76-
TokenizerPath = data.TokenizerPath ?? DefaultTokenizerPath;
77-
PreprocessorPath = data.PreprocessorPath ?? DefaultPreprocessorPath;
78-
DataPath = data.DataPath ?? DefaultDataPath;
72+
// Only restore saved paths if the files actually exist;
73+
// otherwise fall back to defaults (triggers auto-download).
74+
RunnerPath = FileOrDefault(data.RunnerPath, DefaultRunnerPath);
75+
ModelPath = FileOrDefault(data.ModelPath, DefaultModelPath);
76+
TokenizerPath = FileOrDefault(data.TokenizerPath, DefaultTokenizerPath);
77+
PreprocessorPath = FileOrDefault(data.PreprocessorPath, DefaultPreprocessorPath);
78+
DataPath = FileOrDefault(data.DataPath, DefaultDataPath);
7979
SilenceThreshold = data.SilenceThreshold;
8080
SilenceTimeoutSeconds = data.SilenceTimeoutSeconds;
8181
}
8282

83+
/// <summary>
/// Picks <paramref name="saved"/> when it is a non-empty path to a file that exists,
/// and <paramref name="fallback"/> in every other case.
/// </summary>
private static string FileOrDefault(string? saved, string fallback)
{
    // A null, empty, or no-longer-existing saved path yields the fallback.
    return string.IsNullOrEmpty(saved) || !File.Exists(saved) ? fallback : saved;
}
88+
8389
private class SettingsData
8490
{
8591
[JsonPropertyName("runnerPath")] public string? RunnerPath { get; set; }

0 commit comments

Comments
 (0)