Skip to content

Commit c948802

Browse files
committed
fix: some minor fixes for android
1 parent 4f9e8ce commit c948802

File tree

6 files changed

+50
-36
lines changed

6 files changed

+50
-36
lines changed

android_core/app/src/main/java/com/augmentos/augmentos_core/smarterglassesmanager/speechrecognition/augmentos/SherpaOnnxTranscriber.java

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
import com.k2fsa.sherpa.onnx.*;
99

10+
import java.io.ByteArrayOutputStream;
1011
import java.nio.ByteBuffer;
1112
import java.nio.ByteOrder;
1213
import java.util.concurrent.*;
@@ -102,7 +103,8 @@ public void init() {
102103
*/
103104
public void acceptAudio(byte[] pcm16le) {
104105
if (!running.get()) return;
105-
pcmQueue.offer(pcm16le);
106+
byte[] copiedData = pcm16le.clone();
107+
pcmQueue.offer(copiedData);
106108
}
107109

108110
/**
@@ -119,10 +121,26 @@ private void startProcessingThread() {
119121
* Pulls audio from queue, feeds into Sherpa, emits partial/final results.
120122
*/
121123
private void runLoop() {
124+
ByteArrayOutputStream buffer = new ByteArrayOutputStream();
122125
while (running.get()) {
123126
try {
124-
byte[] data = pcmQueue.poll(100, TimeUnit.MILLISECONDS);
125-
if (data == null || recognizer == null || stream == null) continue;
127+
if (recognizer == null || stream == null) continue;
128+
129+
int chunksCollected = 0;
130+
buffer.reset();
131+
132+
// each chunk is 10ms of audio. so we collect 10 chunks to make 100ms of audio before processing
133+
while (chunksCollected < 10) {
134+
byte[] data = pcmQueue.poll(50, TimeUnit.MILLISECONDS);
135+
if (data != null) {
136+
buffer.write(data);
137+
chunksCollected++;
138+
} else {
139+
break;
140+
}
141+
}
142+
byte[] fullData = buffer.toByteArray();
143+
if (fullData == null) continue;
126144

127145
// Convert PCM to float [-1.0, 1.0]
128146
float[] floatBuf = toFloatArray(data);

android_core/app/src/main/java/com/augmentos/augmentos_core/smarterglassesmanager/speechrecognition/augmentos/SpeechRecAugmentos.java

Lines changed: 23 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -54,8 +54,8 @@ public class SpeechRecAugmentos extends SpeechRecFramework {
5454
private SherpaOnnxTranscriber sherpaTranscriber;
5555

5656
// Backend data sending control flags
57-
private volatile boolean shouldSendPcmToBackend = true;
58-
private volatile boolean shouldSendTranscriptionToBackend = false;
57+
private volatile boolean sendPcmToBackend = true;
58+
private volatile boolean sendTranscriptionToBackend = false;
5959

6060
private SpeechRecAugmentos(Context context) {
6161
this.mContext = context;
@@ -128,8 +128,11 @@ private void sendFormattedTranscriptionToBackend(String text, boolean isFinal, l
128128
long currentTime = System.currentTimeMillis();
129129
long relativeTime = currentTime - sessionStartTime;
130130

131-
// Adjust start time based on result type
132-
// Final results use longer timespan than partial results
131+
132+
// TODO: This is wrong
133+
// Instead we should have some timer inside sherpa that is reset on every final result.
134+
// Also I don't think we need speaker id as there is no diarization.
135+
133136
int timeOffset = isFinal ? 2000 : 1000;
134137
transcription.put("startTime", relativeTime - timeOffset);
135138
transcription.put("endTime", relativeTime);
@@ -254,7 +257,6 @@ private void startVadProcessingThread() {
254257
}
255258
}
256259
// If poll times out, just continue the loop
257-
258260
} catch (InterruptedException e) {
259261
Thread.currentThread().interrupt();
260262
break;
@@ -270,10 +272,6 @@ private void sendVadStatus(boolean isNowSpeaking) {
270272
ServerComms.getInstance().sendVadStatus(isNowSpeaking);
271273
}
272274

273-
public boolean sendPcmToBackend = true;
274-
275-
public boolean sendTranscriptionToBackend = false;
276-
277275
/**
278276
* Called by external code to feed raw PCM chunks (16-bit, 16kHz).
279277
* runs VAD on decoded data to tell whether or not we should send the encoded data to the backend
@@ -310,8 +308,6 @@ public void ingestAudioChunk(byte[] audioChunk) {
310308
lc3RollingBuffer.remove(0); // Remove oldest chunks to maintain rolling window
311309
}
312310
}
313-
314-
315311
//SENDING STUFF
316312
// If bypassing VAD for debugging or currently speaking, send data live
317313
if (bypassVadForDebugging || isSpeaking) {
@@ -356,16 +352,14 @@ public void ingestLC3AudioChunk(byte[] LC3audioChunk) {
356352
}
357353
}
358354

359-
if (sendTranscriptionToBackend) {
360-
if (bypassVadForDebugging || isSpeaking) {
361-
if (sherpaTranscriber != null) {
362-
// TODO: Verify whether this would work properly
363-
// Invoking this because same sendAudioChunk handling in servercomms
364-
// So assuming this would also work
365-
sherpaTranscriber.acceptAudio(LC3audioChunk);
366-
}
367-
}
368-
}
355+
// TODO: Should we use this?
356+
// if (sendTranscriptionToBackend) {
357+
// if (bypassVadForDebugging || isSpeaking) {
358+
// if (sherpaTranscriber != null) {
359+
// sherpaTranscriber.acceptAudio(LC3audioChunk);
360+
// }
361+
// }
362+
// }
369363
}
370364

371365
/**
@@ -487,22 +481,22 @@ public void microphoneStateChanged(boolean state, List<SpeechRequiredDataType> r
487481
sherpaTranscriber.microphoneStateChanged(state);
488482
}
489483

490-
// Set shouldSendPcmToBackend and shouldSendTranscriptionToBackend based on required data
484+
// Set sendPcmToBackend and sendTranscriptionToBackend based on required data
491485
// if state is PCM_OR_TRANS then based on the bandwidth of the internet if it falls below certain threshold decide to send PCM or Transcription
492-
if (requiredData.contains(SpeechRequiredDataType.PCM_OR_TRANS)) {
486+
if (requiredData.contains(SpeechRequiredDataType.PCM_OR_TRANSCRIPTION)) {
493487
// TODO: Implement bandwidth detection logic
494488
// For now, default to transcription as it's more bandwidth efficient
495489
// In the future, check network quality and decide:
496490
// - If high bandwidth: send PCM for better quality
497491
// - If low bandwidth: send transcription for efficiency
498492
// For now default to pcm
499-
shouldSendPcmToBackend = state;
493+
sendPcmToBackend = state;
500494
}
501-
if (requiredData.contains(SpeechRequiredDataType.PCM_AUDIO)) {
502-
shouldSendPcmToBackend = state;
503-
}
504-
if (requiredData.contains(SpeechRequiredDataType.TRANSCRIPTION)) {
505-
shouldSendTranscriptionToBackend = state;
495+
if (requiredData.contains(SpeechRequiredDataType.PCM)) {
496+
sendPcmToBackend = state;
497+
}
498+
if (requiredData.contains(SpeechRequiredDataType.TRANSCRIPTION)) {
499+
sendTranscriptionToBackend = state;
506500
}
507501

508502

mobile/ios/AOS.xcodeproj/project.pbxproj

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
6DAA26512E2BF04E00B9AEC4 /* decoder.onnx in Resources */ = {isa = PBXBuildFile; fileRef = 6DAA264A2E2BF04E00B9AEC4 /* decoder.onnx */; };
1919
6DAA26522E2BF04E00B9AEC4 /* tokens.txt in Resources */ = {isa = PBXBuildFile; fileRef = 6DAA264D2E2BF04E00B9AEC4 /* tokens.txt */; };
2020
6DDF1F572E2BED9800EAD638 /* SherpaOnnx.swift in Sources */ = {isa = PBXBuildFile; fileRef = 6D62BA742E2BDFD100140E04 /* SherpaOnnx.swift */; };
21+
6DFC99692E33FA9400611B81 /* SpeechRequiredDataType.swift in Sources */ = {isa = PBXBuildFile; fileRef = 6DFC99682E33FA9400611B81 /* SpeechRequiredDataType.swift */; };
2122
96905EF65AED1B983A6B3ABC /* libPods-AOS.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 58EEBF8E8E6FB1BC6CAF49B5 /* libPods-AOS.a */; };
2223
A1C4A5080DE65B36D4AECB21 /* PrivacyInfo.xcprivacy in Resources */ = {isa = PBXBuildFile; fileRef = B18D5B1AA3EE1022A1FC3B30 /* PrivacyInfo.xcprivacy */; };
2324
B18059E884C0ABDD17F3DC3D /* ExpoModulesProvider.swift in Sources */ = {isa = PBXBuildFile; fileRef = FAC715A2D49A985799AEE119 /* ExpoModulesProvider.swift */; };
@@ -86,6 +87,7 @@
8687
6DAA264C2E2BF04E00B9AEC4 /* joiner.onnx */ = {isa = PBXFileReference; lastKnownFileType = text; path = joiner.onnx; sourceTree = "<group>"; };
8788
6DAA264D2E2BF04E00B9AEC4 /* tokens.txt */ = {isa = PBXFileReference; lastKnownFileType = text; path = tokens.txt; sourceTree = "<group>"; };
8889
6DCD26AF2E2BF6B00086760F /* sherpa-onnx.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; path = "sherpa-onnx.xcframework"; sourceTree = "<group>"; };
90+
6DFC99682E33FA9400611B81 /* SpeechRequiredDataType.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SpeechRequiredDataType.swift; sourceTree = "<group>"; };
8991
7A4D352CD337FB3A3BF06240 /* Pods-AOS.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-AOS.release.xcconfig"; path = "Target Support Files/Pods-AOS/Pods-AOS.release.xcconfig"; sourceTree = "<group>"; };
9092
AA286B85B6C04FC6940260E9 /* SplashScreen.storyboard */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = file.storyboard; name = SplashScreen.storyboard; path = AOS/SplashScreen.storyboard; sourceTree = "<group>"; };
9193
B18D5B1AA3EE1022A1FC3B30 /* PrivacyInfo.xcprivacy */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xml; name = PrivacyInfo.xcprivacy; path = AOS/PrivacyInfo.xcprivacy; sourceTree = "<group>"; };
@@ -276,6 +278,7 @@
276278
C5DE979D2DDD0EA80032FC99 /* BleManager */ = {
277279
isa = PBXGroup;
278280
children = (
281+
6DFC99682E33FA9400611B81 /* SpeechRequiredDataType.swift */,
279282
6D9355E62E2B9B2300A5A14A /* SherpaOnnxTranscriber.swift */,
280283
C5D2C75D2E1DAB50004A983D /* AudioManager.swift */,
281284
C577E8A32E19C7CB00522AB8 /* MentraLiveManager.swift */,
@@ -684,6 +687,7 @@
684687
C5DE97AE2DDD0EA80032FC99 /* MicrophoneManager.swift in Sources */,
685688
C5DE97AF2DDD0EA80032FC99 /* WebSocketManager.swift in Sources */,
686689
C5DE97B02DDD0EA80032FC99 /* CalendarManager.swift in Sources */,
690+
6DFC99692E33FA9400611B81 /* SpeechRequiredDataType.swift in Sources */,
687691
B18059E884C0ABDD17F3DC3D /* ExpoModulesProvider.swift in Sources */,
688692
C5D2C75E2E1DAB50004A983D /* AudioManager.swift in Sources */,
689693
F314884EB5724196A394670E /* noop-file.swift in Sources */,

mobile/ios/BleManager/AOSManager.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ struct ViewState {
131131
// Initialize the transcriber
132132
if let transcriber = self.transcriber {
133133
transcriber.initialize()
134-
// transcriber.transcriptDelegate = self
134+
transcriber.transcriptDelegate = self
135135
CoreCommsService.log("SherpaOnnxTranscriber fully initialized")
136136
}
137137
}

mobile/ios/BleManager/ServerComms.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -497,7 +497,7 @@ class ServerComms {
497497
}
498498

499499
// Convert string array to enum array
500-
let requiredData = SpeechRequiredDataType.fromStringArray(requiredDataStrings)
500+
var requiredData = SpeechRequiredDataType.fromStringArray(requiredDataStrings)
501501

502502
// Treat empty array as PCM only
503503
if requiredData.isEmpty {

mobile/ios/BleManager/SherpaOnnxTranscriber.swift

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -136,8 +136,6 @@ class SherpaOnnxTranscriber {
136136
self?.transcriptDelegate?.didReceivePartialTranscription(text)
137137
}
138138
}
139-
140-
CoreCommsService.log("Generated \(isFinal ? "final" : "partial") transcription: \(text)")
141139
}
142140

143141
/**

0 commit comments

Comments (0)