FluidInference
diff --git a/AGENTS.md b/AGENTS.md
Lines changed: 2 additions & 1 deletion
diff --git a/Documentation/Benchmarks.md b/Documentation/Benchmarks.md
Lines changed: 26 additions & 24 deletions
diff --git a/Sources/FluidAudio/ASR/ANEOptimizer.swift b/Sources/FluidAudio/ASR/ANEOptimizer.swift
Lines changed: 2 additions & 3 deletions
diff --git a/Sources/FluidAudio/ASR/AsrManager.swift b/Sources/FluidAudio/ASR/AsrManager.swift
Lines changed: 27 additions & 70 deletions
@@ -22,11 +22,12 @@ swift format --in-place --recursive --configuration .swift-format Sources/ Tests
 - **NEVER** create dummy/mock models or synthetic audio data - use real models only
 - **NEVER** create simplified versions - implement full solutions or consult first
 - **NEVER** run `git push` unless explicitly requested by user
+- **ONLY** add or run tests when explicitly requested by the user
 
 ## Code Style (swift-format config)
 - Line length: 120 chars, 4-space indentation
 - Import order: `import CoreML`, `import Foundation`, `import OSLog` (OrderedImports rule)
 - Naming: lowerCamelCase for variables/functions, UpperCamelCase for types
 - Error handling: Use proper Swift error handling, no force unwrapping in production
 - Documentation: Triple-slash comments (`///`) for public APIs
-- Thread safety: Use actors, `@MainActor`, or proper locking - never `@unchecked Sendable`
+- Thread safety: Use actors, `@MainActor`, or proper locking - never `@unchecked Sendable`
@@ -11,33 +11,35 @@ swift run fluidaudio fleurs-benchmark --languages en_us,it_it,es_419,fr_fr,de_de
 ```
 
 ```text
-================================================================================
-FLEURS BENCHMARK SUMMARY
-================================================================================
-
-Language                  | WER%   | CER%   | RTFx    | Duration | Processed | Skipped
------------------------------------------------------------------------------------------
-English (US)              | 5.7    | 2.8    | 136.7   | 3442.9s  | 350       | -
-French (France)           | 5.8    | 2.4    | 136.5   | 560.8s   | 52        | 298
-German (Germany)          | 3.1    | 1.2    | 152.2   | 62.1s    | 5         | -
-Italian (Italy)           | 4.3    | 2.0    | 153.7   | 743.3s   | 50        | -
-Russian (Russia)          | 7.7    | 2.8    | 134.1   | 621.2s   | 50        | -
-Spanish (Spain)           | 6.5    | 3.0    | 152.3   | 586.9s   | 50        | -
-Ukrainian (Ukraine)       | 6.5    | 1.9    | 132.5   | 528.2s   | 50        | -
------------------------------------------------------------------------------------------
-AVERAGE                   | 5.6    | 2.3    | 142.6   | 6545.5s  | 607       | 298
+[22:00:56.652] [INFO] [FLEURSBenchmark] ================================================================================
+[22:00:56.652] [INFO] [FLEURSBenchmark] FLEURS BENCHMARK SUMMARY
+[22:00:56.652] [INFO] [FLEURSBenchmark] ================================================================================
+[22:00:56.652] [INFO] [FLEURSBenchmark]
+[22:00:56.652] [INFO] [FLEURSBenchmark] Language                  | WER%   | CER%   | RTFx    | Duration | Processed | Skipped
+[22:00:56.652] [INFO] [FLEURSBenchmark] -----------------------------------------------------------------------------------------
+[22:00:56.652] [INFO] [FLEURSBenchmark] English (US)              | 5.8    | 2.9    | 189.4   | 3442.9s  | 350       | -
+[22:00:56.652] [INFO] [FLEURSBenchmark] French (France)           | 8.8    | 3.8    | 181.3   | 560.8s   | 52        | 298
+[22:00:56.652] [INFO] [FLEURSBenchmark] German (Germany)          | 4.2    | 1.2    | 211.2   | 62.1s    | 5         | -
+[22:00:56.652] [INFO] [FLEURSBenchmark] Italian (Italy)           | 2.8    | 1.0    | 206.6   | 743.3s   | 50        | -
+[22:00:56.652] [INFO] [FLEURSBenchmark] Russian (Russia)          | 7.0    | 2.3    | 185.3   | 621.2s   | 50        | -
+[22:00:56.652] [INFO] [FLEURSBenchmark] Spanish (Spain)           | 4.0    | 1.8    | 207.9   | 586.9s   | 50        | -
+[22:00:56.652] [INFO] [FLEURSBenchmark] Ukrainian (Ukraine)       | 7.2    | 2.1    | 182.8   | 528.2s   | 50        | -
+[22:00:56.652] [INFO] [FLEURSBenchmark] -----------------------------------------------------------------------------------------
+[22:00:56.652] [INFO] [FLEURSBenchmark] AVERAGE                   | 5.7    | 2.2    | 194.9   | 6545.5s  | 607       | 298
 ```
 
 ```text
-2620 files per dataset • Test runtime: 4m 1s • 09/04/2025, 1:55 AM EDT
---- Benchmark Results ---
-   Dataset: librispeech test-clean
-   Files processed: 2620
-   Average WER: 2.7%
-   Median WER: 0.0%
-   Average CER: 1.1%
-   Median RTFx: 99.3x
-   Overall RTFx: 109.6x (19452.5s / 177.5s)
+[22:06:25.813] [INFO] [Benchmark] 2620 files per dataset • Test runtime: 3m 12s • 09/19/2025, 10:06 PM EDT
+[22:06:25.813] [INFO] [Benchmark] --- Benchmark Results ---
+[22:06:25.813] [INFO] [Benchmark]    Dataset: librispeech test-clean
+[22:06:25.813] [INFO] [Benchmark]    Files processed: 2620
+[22:06:25.813] [INFO] [Benchmark]    Average WER: 2.7%
+[22:06:25.813] [INFO] [Benchmark]    Median WER: 0.0%
+[22:06:25.813] [INFO] [Benchmark]    Average CER: 1.1%
+[22:06:25.813] [INFO] [Benchmark]    Median RTFx: 132.0x
+[22:06:25.813] [INFO] [Benchmark] Results saved to: asr_benchmark_results.json
+[22:06:25.813] [INFO] [Benchmark] ASR benchmark completed successfully
+[22:06:25.813] [INFO] [Benchmark]    Overall RTFx: 146.5x (19452.5s / 132.8s)
 ```
 
 ## Voice Activity Detection
 
@@ -43,7 +43,7 @@ public enum ANEOptimizer {
 
     /// Configure optimal compute units for each model type
     public static func optimalComputeUnits(for modelType: ModelType) -> MLComputeUnits {
-        // Testing shows CPU+ANE is fastest for all models, including melspectrogram
+        // Testing shows CPU+ANE is fastest for all models, including fused mel encoder
         return .cpuAndNeuralEngine
     }
 
@@ -131,8 +131,7 @@ public enum ANEOptimizer {
 
     /// Model type enumeration for compute unit selection
     public enum ModelType {
-        case melSpectrogram
-        case encoder
+        case melEncoder
         case decoder
         case joint
     }
 
@@ -15,8 +15,7 @@ public final class AsrManager {
     internal let config: ASRConfig
     private let audioConverter: AudioConverter = AudioConverter()
 
-    internal var melspectrogramModel: MLModel?
-    internal var encoderModel: MLModel?
+    internal var melEncoderModel: MLModel?
     internal var decoderModel: MLModel?
     internal var jointModel: MLModel?
 
@@ -43,36 +42,30 @@ public final class AsrManager {
         AsrModels.optimizedPredictionOptions()
     }()
 
-    // Persistent feature providers for zero-copy model chaining
-    private var zeroCopyProviders: [String: ZeroCopyFeatureProvider] = [:]
-
     public init(config: ASRConfig = .default) {
         self.config = config
 
-        // Initialize decoder states with fallback
-        do {
-            self.microphoneDecoderState = try TdtDecoderState()
-            self.systemDecoderState = try TdtDecoderState()
-        } catch {
-            logger.warning("Failed to create ANE-aligned decoder states, using standard allocation")
-            // This should rarely happen, but if it does, we'll create them during first use
-            self.microphoneDecoderState = TdtDecoderState(fallback: true)
-            self.systemDecoderState = TdtDecoderState(fallback: true)
-        }
+        self.microphoneDecoderState = TdtDecoderState.make()
+        self.systemDecoderState = TdtDecoderState.make()
 
         // Pre-warm caches if possible
         Task {
             await sharedMLArrayCache.prewarm(shapes: [
-                ([1, 240000], .float32),
-                ([1], .int32),
-                ([2, 1, 640], .float32),
+                ([NSNumber(value: 1), NSNumber(value: 240_000)], .float32),
+                ([NSNumber(value: 1)], .int32),
+                (
+                    [
+                        NSNumber(value: 2),
+                        NSNumber(value: 1),
+                        NSNumber(value: ASRConstants.decoderHiddenSize),
+                    ], .float32
+                ),
             ])
         }
     }
 
     public var isAvailable: Bool {
-        return melspectrogramModel != nil && encoderModel != nil && decoderModel != nil
-            && jointModel != nil
+        return melEncoderModel != nil && decoderModel != nil && jointModel != nil
     }
 
     /// Initialize ASR Manager with pre-loaded models
@@ -81,8 +74,7 @@ public final class AsrManager {
         logger.info("Initializing AsrManager with provided models")
 
         self.asrModels = models
-        self.melspectrogramModel = models.melspectrogram
-        self.encoderModel = models.encoder
+        self.melEncoderModel = models.melEncoder
         self.decoderModel = models.decoder
         self.jointModel = models.joint
         self.vocabulary = models.vocabulary
@@ -112,7 +104,7 @@ public final class AsrManager {
         return array
     }
 
-    func prepareMelSpectrogramInput(
+    func prepareMelEncoderInput(
         _ audioSamples: [Float], actualLength: Int? = nil
     ) async throws
         -> MLFeatureProvider
@@ -141,37 +133,6 @@ public final class AsrManager {
         ])
     }
 
-    func prepareEncoderInput(_ melspectrogramOutput: MLFeatureProvider) throws -> MLFeatureProvider {
-        // Zero-copy: chain mel-spectrogram outputs directly to encoder inputs
-        if let provider = ZeroCopyFeatureProvider.chain(
-            from: melspectrogramOutput,
-            outputName: "melspectrogram",
-            to: "audio_signal"
-        ) {
-            // Also need to chain the length
-            if let melLength = melspectrogramOutput.featureValue(for: "melspectrogram_length") {
-                let features = [
-                    "audio_signal": provider.featureValue(for: "audio_signal")!,
-                    "length": melLength,
-                ]
-                return ZeroCopyFeatureProvider(features: features)
-            }
-        }
-
-        // Fallback to copying if zero-copy fails
-        let melspectrogram = try extractFeatureValue(
-            from: melspectrogramOutput, key: "melspectrogram",
-            errorMessage: "Invalid mel-spectrogram output")
-        let melspectrogramLength = try extractFeatureValue(
-            from: melspectrogramOutput, key: "melspectrogram_length",
-            errorMessage: "Invalid mel-spectrogram length output")
-
-        return try createFeatureProvider(features: [
-            ("audio_signal", melspectrogram),
-            ("length", melspectrogramLength),
-        ])
-    }
-
     private func prepareDecoderInput(
         hiddenState: MLMultiArray,
         cellState: MLMultiArray
@@ -181,7 +142,7 @@ public final class AsrManager {
 
         return try createFeatureProvider(features: [
             ("targets", targetArray),
-            ("target_lengths", targetLengthArray),
+            ("target_length", targetLengthArray),
             ("h_in", hiddenState),
             ("c_in", cellState),
         ])
@@ -225,21 +186,18 @@ public final class AsrManager {
     }
 
     private func loadAllModels(
-        melspectrogramPath: URL,
-        encoderPath: URL,
+        melEncoderPath: URL,
         decoderPath: URL,
         jointPath: URL,
         configuration: MLModelConfiguration
-    ) async throws -> (melspectrogram: MLModel, encoder: MLModel, decoder: MLModel, joint: MLModel) {
-        async let melspectrogram = loadModel(
-            path: melspectrogramPath, name: "mel-spectrogram", configuration: configuration)
-        async let encoder = loadModel(
-            path: encoderPath, name: "encoder", configuration: configuration)
+    ) async throws -> (melEncoder: MLModel, decoder: MLModel, joint: MLModel) {
+        async let melEncoder = loadModel(
+            path: melEncoderPath, name: "mel-encoder", configuration: configuration)
         async let decoder = loadModel(
             path: decoderPath, name: "decoder", configuration: configuration)
         async let joint = loadModel(path: jointPath, name: "joint", configuration: configuration)
 
-        return try await (melspectrogram, encoder, decoder, joint)
+        return try await (melEncoder, decoder, joint)
     }
 
     private static func getDefaultModelsDirectory() -> URL {
@@ -255,18 +213,17 @@ public final class AsrManager {
     }
 
     public func resetState() {
-        microphoneDecoderState = TdtDecoderState(fallback: true)
-        systemDecoderState = TdtDecoderState(fallback: true)
+        microphoneDecoderState = TdtDecoderState.make()
+        systemDecoderState = TdtDecoderState.make()
     }
 
     public func cleanup() {
-        melspectrogramModel = nil
-        encoderModel = nil
+        melEncoderModel = nil
         decoderModel = nil
         jointModel = nil
-        // Reset decoder states - use fallback initializer that won't throw
-        microphoneDecoderState = TdtDecoderState(fallback: true)
-        systemDecoderState = TdtDecoderState(fallback: true)
+        // Reset decoder states using fresh allocations for deterministic behavior
+        microphoneDecoderState = TdtDecoderState.make()
+        systemDecoderState = TdtDecoderState.make()
         logger.info("AsrManager resources cleaned up")
     }