Skip to content

Commit c948802

Browse files
committed
fix: some minor fixes for android
1 parent 4f9e8ce commit c948802

File tree

6 files changed

+50
-36
lines changed

6 files changed

+50
-36
lines changed

android_core/app/src/main/java/com/augmentos/augmentos_core/smarterglassesmanager/speechrecognition/augmentos/SherpaOnnxTranscriber.java

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
import com.k2fsa.sherpa.onnx.*;
99

10+
import java.io.ByteArrayOutputStream;
1011
import java.nio.ByteBuffer;
1112
import java.nio.ByteOrder;
1213
import java.util.concurrent.*;
@@ -102,7 +103,8 @@ public void init() {
102103
*/
103104
public void acceptAudio(byte[] pcm16le) {
104105
if (!running.get()) return;
105-
pcmQueue.offer(pcm16le);
106+
byte[] copiedData = pcm16le.clone();
107+
pcmQueue.offer(copiedData);
106108
}
107109

108110
/**
@@ -119,10 +121,26 @@ private void startProcessingThread() {
119121
* Pulls audio from queue, feeds into Sherpa, emits partial/final results.
120122
*/
121123
private void runLoop() {
124+
ByteArrayOutputStream buffer = new ByteArrayOutputStream();
122125
while (running.get()) {
123126
try {
124-
byte[] data = pcmQueue.poll(100, TimeUnit.MILLISECONDS);
125-
if (data == null || recognizer == null || stream == null) continue;
127+
if (recognizer == null || stream == null) continue;
128+
129+
int chunksCollected = 0;
130+
buffer.reset();
131+
132+
// each chunk is 10ms of audio. so we collect 10 chunks to make 100ms of audio before processing
133+
while (chunksCollected < 10) {
134+
byte[] data = pcmQueue.poll(50, TimeUnit.MILLISECONDS);
135+
if (data != null) {
136+
buffer.write(data);
137+
chunksCollected++;
138+
} else {
139+
break;
140+
}
141+
}
142+
byte[] fullData = buffer.toByteArray();
143+
if (fullData == null) continue;
126144

127145
// Convert PCM to float [-1.0, 1.0]
128146
float[] floatBuf = toFloatArray(data);

android_core/app/src/main/java/com/augmentos/augmentos_core/smarterglassesmanager/speechrecognition/augmentos/SpeechRecAugmentos.java

Lines changed: 23 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -54,8 +54,8 @@ public class SpeechRecAugmentos extends SpeechRecFramework {
5454
private SherpaOnnxTranscriber sherpaTranscriber;
5555

5656
// Backend data sending control flags
57-
private volatile boolean shouldSendPcmToBackend = true;
58-
private volatile boolean shouldSendTranscriptionToBackend = false;
57+
private volatile boolean sendPcmToBackend = true;
58+
private volatile boolean sendTranscriptionToBackend = false;
5959

6060
private SpeechRecAugmentos(Context context) {
6161
this.mContext = context;
@@ -128,8 +128,11 @@ private void sendFormattedTranscriptionToBackend(String text, boolean isFinal, l
128128
long currentTime = System.currentTimeMillis();
129129
long relativeTime = currentTime - sessionStartTime;
130130

131-
// Adjust start time based on result type
132-
// Final results use longer timespan than partial results
131+
132+
// TODO: This is wrong
133+
// Instead we should have some timer inside sherpa that is reset on every final result.
134+
// Also I don't think we need speaker id as there is no diarization.
135+
133136
int timeOffset = isFinal ? 2000 : 1000;
134137
transcription.put("startTime", relativeTime - timeOffset);
135138
transcription.put("endTime", relativeTime);
@@ -254,7 +257,6 @@ private void startVadProcessingThread() {
254257
}
255258
}
256259
// If poll times out, just continue the loop
257-
258260
} catch (InterruptedException e) {
259261
Thread.currentThread().interrupt();
260262
break;
@@ -270,10 +272,6 @@ private void sendVadStatus(boolean isNowSpeaking) {
270272
ServerComms.getInstance().sendVadStatus(isNowSpeaking);
271273
}
272274

273-
public boolean sendPcmToBackend = true;
274-
275-
public boolean sendTranscriptionToBackend = false;
276-
277275
/**
278276
* Called by external code to feed raw PCM chunks (16-bit, 16kHz).
279277
* runs VAD on decoded data to tell whether or not we should send the encoded data to the backend
@@ -310,8 +308,6 @@ public void ingestAudioChunk(byte[] audioChunk) {
310308
lc3RollingBuffer.remove(0); // Remove oldest chunks to maintain rolling window
311309
}
312310
}
313-
314-
315311
//SENDING STUFF
316312
// If bypassing VAD for debugging or currently speaking, send data live
317313
if (bypassVadForDebugging || isSpeaking) {
@@ -356,16 +352,14 @@ public void ingestLC3AudioChunk(byte[] LC3audioChunk) {
356352
}
357353
}
358354

359-
if (sendTranscriptionToBackend) {
360-
if (bypassVadForDebugging || isSpeaking) {
361-
if (sherpaTranscriber != null) {
362-
// TODO: Verify whether this would work properly
363-
// Invoking this because same sendAudioChunk handling in servercomms
364-
// So assuming this would also work
365-
sherpaTranscriber.acceptAudio(LC3audioChunk);
366-
}
367-
}
368-
}
355+
// TODO: Should we use this?
356+
// if (sendTranscriptionToBackend) {
357+
// if (bypassVadForDebugging || isSpeaking) {
358+
// if (sherpaTranscriber != null) {
359+
// sherpaTranscriber.acceptAudio(LC3audioChunk);
360+
// }
361+
// }
362+
// }
369363
}
370364

371365
/**
@@ -487,22 +481,22 @@ public void microphoneStateChanged(boolean state, List<SpeechRequiredDataType> r
487481
sherpaTranscriber.microphoneStateChanged(state);
488482
}
489483

490-
// Set shouldSendPcmToBackend and shouldSendTranscriptionToBackend based on required data
484+
// Set sendPcmToBackend and sendTranscriptionToBackend based on required data
491485
// if state is PCM_OR_TRANS then based on the bandwidth of the internet if it falls below certain threshold decide to send PCM or Transcription
492-
if (requiredData.contains(SpeechRequiredDataType.PCM_OR_TRANS)) {
486+
if (requiredData.contains(SpeechRequiredDataType.PCM_OR_TRANSCRIPTION)) {
493487
// TODO: Implement bandwidth detection logic
494488
// For now, default to transcription as it's more bandwidth efficient
495489
// In the future, check network quality and decide:
496490
// - If high bandwidth: send PCM for better quality
497491
// - If low bandwidth: send transcription for efficiency
498492
// For now default to pcm
499-
shouldSendPcmToBackend = state;
493+
sendPcmToBackend = state;
500494
}
501-
if (requiredData.contains(SpeechRequiredDataType.PCM_AUDIO)) {
502-
shouldSendPcmToBackend = state;
503-
}
504-
if (requiredData.contains(SpeechRequiredDataType.TRANSCRIPTION)) {
505-
shouldSendTranscriptionToBackend = state;
495+
if (requiredData.contains(SpeechRequiredDataType.PCM)) {
496+
sendPcmToBackend = state;
497+
}
498+
if (requiredData.contains(SpeechRequiredDataType.TRANSCRIPTION)) {
499+
sendTranscriptionToBackend = state;
506500
}
507501

508502

mobile/ios/AOS.xcodeproj/project.pbxproj

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
6DAA26512E2BF04E00B9AEC4 /* decoder.onnx in Resources */ = {isa = PBXBuildFile; fileRef = 6DAA264A2E2BF04E00B9AEC4 /* decoder.onnx */; };
1919
6DAA26522E2BF04E00B9AEC4 /* tokens.txt in Resources */ = {isa = PBXBuildFile; fileRef = 6DAA264D2E2BF04E00B9AEC4 /* tokens.txt */; };
2020
6DDF1F572E2BED9800EAD638 /* SherpaOnnx.swift in Sources */ = {isa = PBXBuildFile; fileRef = 6D62BA742E2BDFD100140E04 /* SherpaOnnx.swift */; };
21+
6DFC99692E33FA9400611B81 /* SpeechRequiredDataType.swift in Sources */ = {isa = PBXBuildFile; fileRef = 6DFC99682E33FA9400611B81 /* SpeechRequiredDataType.swift */; };
2122
96905EF65AED1B983A6B3ABC /* libPods-AOS.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 58EEBF8E8E6FB1BC6CAF49B5 /* libPods-AOS.a */; };
2223
A1C4A5080DE65B36D4AECB21 /* PrivacyInfo.xcprivacy in Resources */ = {isa = PBXBuildFile; fileRef = B18D5B1AA3EE1022A1FC3B30 /* PrivacyInfo.xcprivacy */; };
2324
B18059E884C0ABDD17F3DC3D /* ExpoModulesProvider.swift in Sources */ = {isa = PBXBuildFile; fileRef = FAC715A2D49A985799AEE119 /* ExpoModulesProvider.swift */; };
@@ -86,6 +87,7 @@
8687
6DAA264C2E2BF04E00B9AEC4 /* joiner.onnx */ = {isa = PBXFileReference; lastKnownFileType = text; path = joiner.onnx; sourceTree = "<group>"; };
8788
6DAA264D2E2BF04E00B9AEC4 /* tokens.txt */ = {isa = PBXFileReference; lastKnownFileType = text; path = tokens.txt; sourceTree = "<group>"; };
8889
6DCD26AF2E2BF6B00086760F /* sherpa-onnx.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; path = "sherpa-onnx.xcframework"; sourceTree = "<group>"; };
90+
6DFC99682E33FA9400611B81 /* SpeechRequiredDataType.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SpeechRequiredDataType.swift; sourceTree = "<group>"; };
8991
7A4D352CD337FB3A3BF06240 /* Pods-AOS.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-AOS.release.xcconfig"; path = "Target Support Files/Pods-AOS/Pods-AOS.release.xcconfig"; sourceTree = "<group>"; };
9092
AA286B85B6C04FC6940260E9 /* SplashScreen.storyboard */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = file.storyboard; name = SplashScreen.storyboard; path = AOS/SplashScreen.storyboard; sourceTree = "<group>"; };
9193
B18D5B1AA3EE1022A1FC3B30 /* PrivacyInfo.xcprivacy */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xml; name = PrivacyInfo.xcprivacy; path = AOS/PrivacyInfo.xcprivacy; sourceTree = "<group>"; };
@@ -276,6 +278,7 @@
276278
C5DE979D2DDD0EA80032FC99 /* BleManager */ = {
277279
isa = PBXGroup;
278280
children = (
281+
6DFC99682E33FA9400611B81 /* SpeechRequiredDataType.swift */,
279282
6D9355E62E2B9B2300A5A14A /* SherpaOnnxTranscriber.swift */,
280283
C5D2C75D2E1DAB50004A983D /* AudioManager.swift */,
281284
C577E8A32E19C7CB00522AB8 /* MentraLiveManager.swift */,
@@ -684,6 +687,7 @@
684687
C5DE97AE2DDD0EA80032FC99 /* MicrophoneManager.swift in Sources */,
685688
C5DE97AF2DDD0EA80032FC99 /* WebSocketManager.swift in Sources */,
686689
C5DE97B02DDD0EA80032FC99 /* CalendarManager.swift in Sources */,
690+
6DFC99692E33FA9400611B81 /* SpeechRequiredDataType.swift in Sources */,
687691
B18059E884C0ABDD17F3DC3D /* ExpoModulesProvider.swift in Sources */,
688692
C5D2C75E2E1DAB50004A983D /* AudioManager.swift in Sources */,
689693
F314884EB5724196A394670E /* noop-file.swift in Sources */,

mobile/ios/BleManager/AOSManager.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ struct ViewState {
131131
// Initialize the transcriber
132132
if let transcriber = self.transcriber {
133133
transcriber.initialize()
134-
// transcriber.transcriptDelegate = self
134+
transcriber.transcriptDelegate = self
135135
CoreCommsService.log("SherpaOnnxTranscriber fully initialized")
136136
}
137137
}

mobile/ios/BleManager/ServerComms.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -497,7 +497,7 @@ class ServerComms {
497497
}
498498

499499
// Convert string array to enum array
500-
let requiredData = SpeechRequiredDataType.fromStringArray(requiredDataStrings)
500+
var requiredData = SpeechRequiredDataType.fromStringArray(requiredDataStrings)
501501

502502
// Treat empty array as PCM only
503503
if requiredData.isEmpty {

mobile/ios/BleManager/SherpaOnnxTranscriber.swift

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -136,8 +136,6 @@ class SherpaOnnxTranscriber {
136136
self?.transcriptDelegate?.didReceivePartialTranscription(text)
137137
}
138138
}
139-
140-
CoreCommsService.log("Generated \(isFinal ? "final" : "partial") transcription: \(text)")
141139
}
142140

143141
/**

0 commit comments

Comments (0)