-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathSpeechRecognition.h
More file actions
116 lines (89 loc) · 2.38 KB
/
SpeechRecognition.h
File metadata and controls
116 lines (89 loc) · 2.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#ifndef SPEECHRECOGNITION_H
#define SPEECHRECOGNITION_H
#define NOMINMAX
#include <string>
#include <ixwebsocket/IXHttpClient.h>
#include <combaseapi.h>
#include <iostream>
#include <functional>
#include <cstdint>
#include <queue>
#include <mutex>
#include <thread>
#include "AudioCapture.h"
#include "sherpa-onnx/c-api/c-api.h"
/// Identifies the kind of LLM engine a RemoteLLMConfig is set up for.
/// The numeric values are spelled out explicitly; keep them stable in case
/// they are stored or exchanged as plain integers (TODO confirm in loadConfig).
enum LLMEngineType {
    /// Default engine type (see RemoteLLMConfig::engineType).
    /// Presumably an OpenAI-compatible HTTP API — confirm against the implementation.
    StandardOpenAI = 0,

    /// Presumably a llama.cpp server endpoint — confirm against the implementation.
    LLaMA_CPP = 1,
};
/// Sampling parameters for text generation, carried inside RemoteLLMConfig.
///
/// Every field has a zero default so that a default-constructed instance
/// never exposes indeterminate values. Zero is only a deterministic
/// placeholder: callers are expected to overwrite the fields with real
/// settings (presumably from the config file — confirm in loadConfig()).
/// Aggregate initialization such as `SamplingConfig{128, 0.7f, 0.9f}` keeps
/// working under C++14 and later.
struct SamplingConfig {
    /// Presumably the upper bound on generated tokens — TODO confirm against the request builder.
    int maxTokens = 0;

    /// Sampling temperature.
    float temperature = 0.0f;

    /// Presumably the nucleus-sampling (top-p) threshold — TODO confirm.
    float topP = 0.0f;
};
/// Connection and prompt settings for the remote LLM used by the translate
/// pipeline (held by SpeechRecognition as `remoteLLMConfig`).
///
/// All members now have a deterministic default: strings are empty, the flag
/// is false, the engine type is StandardOpenAI and the sampling parameters
/// are value-initialized. Previously `isSakuraLLM` and `samplingConfig` were
/// left indeterminate on default construction.
struct RemoteLLMConfig {
    /// API token; presumably sent as the authorization credential — confirm in the .cpp.
    std::string apiToken;

    /// Base URL of the LLM API.
    std::string apiBaseUrl;

    /// Name of the model to request.
    std::string modelName;

    /// Whether the target model is a Sakura LLM. Defaults to false.
    bool isSakuraLLM = false;

    /// Prompt template text; placeholder syntax is defined by the implementation (not visible here).
    std::string promptTemplate;

    /// Which engine flavor to talk to. Defaults to StandardOpenAI.
    LLMEngineType engineType = StandardOpenAI;

    /// Sampling parameters. `{}` value-initializes the nested struct so its
    /// fields are never indeterminate.
    SamplingConfig samplingConfig{};
};
/// A single subtitle entry: a piece of text together with its language.
/// Plain value type; accessors return copies of the stored strings.
class SpeechSubtitle {
    std::string text_; // subtitle text
    std::string lang_; // subtitle language

public:
    /// Builds a subtitle by copying both the text and the language string.
    SpeechSubtitle(const std::string &text, const std::string &lang)
        : text_(text), lang_(lang) {}

    /// Returns a copy of the subtitle text.
    std::string getText() const { return text_; }

    /// Replaces the subtitle text with a copy of `text`.
    void setText(const std::string &text) { text_.assign(text); }

    /// Returns a copy of the subtitle language.
    std::string getLang() const { return lang_; }

    /// Replaces the subtitle language with a copy of `lang`.
    void setLang(const std::string &lang) { lang_.assign(lang); }
};
/// Declares the object that ties together audio capture, sherpa-onnx based
/// speech recognition (offline recognizer + voice activity detector), an
/// HTTP client with remote-LLM settings for translation, and a queue of
/// SpeechSubtitle entries.
///
/// All member functions are defined out of line; the notes below describe
/// only what this declaration shows. Member order is significant (it fixes
/// construction and destruction order) and must not be rearranged casually.
class SpeechRecognition {
public:
    /// Constructs the object from the path of its config file.
    /// NOTE(review): single-argument constructor is not `explicit`, so a
    /// std::string converts implicitly — confirm whether that is intended.
    SpeechRecognition(const std::string &configFilePath);
    ~SpeechRecognition();

    void init();
    void start();
    void stop();

    /// Returns a SpeechSubtitle by value; presumably taken from the
    /// `subtitles` queue — confirm in the implementation.
    SpeechSubtitle getSubtitle();

    /// Returns a translation for `text`; presumably via `httpClient` and
    /// `remoteLLMConfig` — confirm in the implementation.
    std::string getTranslate(const std::string &text);

private:
    void capture();
    void loadConfig();
    static bool initNetSystem();

    // ---- Global config / status ----
    std::string configFilePath;
    // NOTE(review): plain bool declared next to a std::thread member; if it is
    // read on captureThread while another thread writes it, it needs
    // synchronization (e.g. std::atomic<bool>) — verify in the .cpp.
    bool running{false};
    // Presumably guards `subtitles` — confirm against the implementation.
    std::mutex subtitlesMutex;

    // ---- Audio capture pipeline ----
    AudioCapture audioCapture;
    std::thread captureThread;

    // ---- ASR pipeline ----
    int modelSampleRate{16000};
    // Zero-filled padding buffer: 4800 samples = 0.3 seconds at 16 kHz sample rate.
    float tailPaddings[4800]{};
    SherpaOnnxOfflineRecognizerConfig recognizer_config;
    const SherpaOnnxOfflineRecognizer *recognizer{nullptr};
    SherpaOnnxVadModelConfig vadConfig;
    const SherpaOnnxVoiceActivityDetector *vad{nullptr};
    // Callback taking a sample buffer and two int32_t values; their meaning
    // is defined where the callback is assigned (not visible here).
    std::function<void(short *, int32_t, int32_t)> asrCallback;

    // ---- LLM based translate pipeline ----
    ix::HttpClient httpClient;
    RemoteLLMConfig remoteLLMConfig;

    // ---- Render queue ----
    std::queue<SpeechSubtitle> subtitles;
};
#endif