JikkyoSubtitle/SpeechRecognition.h at master · Sg4Dylan/JikkyoSubtitle · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#ifndef SPEECHRECOGNITION_H
#define SPEECHRECOGNITION_H

#define NOMINMAX
#include <string>
#include <ixwebsocket/IXHttpClient.h>
#include <combaseapi.h>
#include <iostream>
#include <functional>
#include <cstdint>
#include <queue>
#include <mutex>
#include <thread>

#include "AudioCapture.h"
#include "sherpa-onnx/c-api/c-api.h"

// Selects which kind of remote LLM backend a RemoteLLMConfig targets.
// Enumerator values are implicit and resolve to 0 and 1 respectively, so any
// integer stored in a configuration keeps its meaning.
// NOTE(review): judging by the names, StandardOpenAI is an OpenAI-style API
// and LLaMA_CPP is a llama.cpp server - confirm against the implementation.
enum LLMEngineType {
    StandardOpenAI, // == 0
    LLaMA_CPP       // == 1
};

// Sampling parameters carried alongside a remote LLM configuration.
//
// Every field has a default member initializer so that a default-constructed
// instance never holds indeterminate values. The zero defaults match what
// value-initialization (`SamplingConfig{}`) already produced; they are
// placeholders, not tuned settings.
// NOTE(review): presumably the real values are filled in from the config
// file - confirm in the loader.
struct SamplingConfig {
    int maxTokens = 0;        // token limit; meaning of 0 is up to the consumer
    float temperature = 0.0f; // sampling temperature
    float topP = 0.0f;        // top-p (nucleus) sampling value
};

// Connection and request settings for a remote LLM endpoint.
//
// All fields now have a deterministic default state: the strings start empty,
// `isSakuraLLM` starts false, `engineType` starts as StandardOpenAI, and
// `samplingConfig` is value-initialized. Previously `isSakuraLLM` and
// `samplingConfig` were indeterminate until explicitly assigned.
struct RemoteLLMConfig {
    std::string apiToken;       // credential for the endpoint
    std::string apiBaseUrl;     // base URL of the API
    std::string modelName;      // model identifier
    bool isSakuraLLM = false;   // NOTE(review): presumably toggles SakuraLLM-specific handling - confirm
    std::string promptTemplate; // template text for building the prompt
    LLMEngineType engineType = StandardOpenAI;
    SamplingConfig samplingConfig{}; // value-initialized so no field is left indeterminate
};

// One subtitle entry: a text string paired with a language string.
// Plain value type; both fields are copied in and copied out.
class SpeechSubtitle {
public:
    // Stores copies of the supplied text and language.
    SpeechSubtitle(const std::string &text, const std::string &lang)
        : text_{text}, lang_{lang} {}

    // --- Read access (each call returns a copy of the stored string) ---
    std::string getText() const { return this->text_; }
    std::string getLang() const { return this->lang_; }

    // --- Write access (each call overwrites the stored string) ---
    void setText(const std::string &text) { this->text_.assign(text); }
    void setLang(const std::string &lang) { this->lang_.assign(lang); }

private:
    std::string text_; // subtitle text
    std::string lang_; // subtitle language
};

// Declares the object that holds the state for a speech-to-subtitle pipeline.
// Its members cover: an AudioCapture source and a worker thread, a sherpa-onnx
// offline recognizer and voice-activity detector, an HTTP client with remote
// LLM settings, and a queue of SpeechSubtitle results.
//
// All member functions are defined out of line, so the notes on them below are
// inferred from names and signatures only and should be confirmed against the
// implementation file.
//
// The class is not copyable because it contains a std::mutex.
class SpeechRecognition {
public:
    // Constructs the object from the path of a configuration file.
    // NOTE(review): the constructor is not `explicit`, so a std::string
    // converts to SpeechRecognition implicitly - confirm that is intended.
    SpeechRecognition(const std::string &configFilePath);

    // NOTE(review): presumably joins captureThread and releases the raw
    // sherpa-onnx handles (recognizer, vad) - confirm in the implementation.
    ~SpeechRecognition();

    // NOTE(review): presumably one-time setup performed before start() - confirm.
    void init();

    // NOTE(review): presumably begins capture/recognition - confirm.
    void start();

    // NOTE(review): presumably ends capture/recognition - confirm.
    void stop();

    // Returns a SpeechSubtitle by value.
    // NOTE(review): presumably taken from the `subtitles` queue; behavior when
    // the queue is empty is not visible here - confirm.
    SpeechSubtitle getSubtitle();

    // Takes a text string and returns a string.
    // NOTE(review): presumably the translation of `text` obtained through
    // httpClient / remoteLLMConfig - confirm.
    std::string getTranslate(const std::string &text);

private:
    // NOTE(review): presumably the body run on captureThread - confirm.
    void capture();

    // NOTE(review): presumably reads the file named by configFilePath - confirm.
    void loadConfig();

    // Static helper returning a success flag.
    // NOTE(review): presumably initializes the networking layer used by
    // ix::HttpClient - confirm.
    static bool initNetSystem();

    // Global config / status
    std::string configFilePath;
    // NOTE(review): plain bool; if it is written by start()/stop() while being
    // read on captureThread it would need synchronization - confirm.
    bool running = false;
    // NOTE(review): presumably guards `subtitles` - confirm.
    std::mutex subtitlesMutex;

    // Audio capture pipeline
    AudioCapture audioCapture;
    std::thread captureThread;

    // ASR pipeline
    int modelSampleRate = 16000; // in Hz, per the 16 kHz note on tailPaddings
    float tailPaddings[4800] = {0.}; // zero-filled; 0.3 seconds at 16 kHz sample rate
    SherpaOnnxOfflineRecognizerConfig recognizer_config;
    const SherpaOnnxOfflineRecognizer *recognizer = nullptr; // raw handle, null until created
    SherpaOnnxVadModelConfig vadConfig;
    const SherpaOnnxVoiceActivityDetector *vad = nullptr; // raw handle, null until created
    // Callback receiving a pointer to 16-bit samples plus two int32 values.
    // NOTE(review): the meaning of the two integers is not visible in this
    // header - confirm.
    std::function<void(short *, int32_t, int32_t)> asrCallback;

    // LLM based translate pipeline
    ix::HttpClient httpClient;
    RemoteLLMConfig remoteLLMConfig;

    // Render queue
    std::queue<SpeechSubtitle> subtitles;
};

#endif