-
Notifications
You must be signed in to change notification settings - Fork 13
Expand file tree
/
Copy pathAudioRT.swift
More file actions
251 lines (216 loc) · 7.79 KB
/
AudioRT.swift
File metadata and controls
251 lines (216 loc) · 7.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
import AVFoundation
import Foundation
import Synchronization
/// A blocking, thread-safe FIFO channel.
///
/// `send(_:)` enqueues a value from any thread; `receive()` blocks the caller
/// until a value is available and then dequeues it. Writes go through a
/// barrier on a concurrent queue; a semaphore counts pending items.
class ThreadSafeChannel<T> {
    private var buffer: [T] = []
    private let queue = DispatchQueue(label: "tschannel", attributes: .concurrent)
    private let semaphore = DispatchSemaphore(value: 0)

    /// Appends `value` and wakes one waiting receiver.
    func send(_ value: T) {
        queue.async(flags: .barrier) {
            self.buffer.append(value)
            self.semaphore.signal()
        }
    }

    /// Blocks until a value has been sent, then removes and returns the
    /// oldest one. Returns `nil` only in the (unexpected) case where the
    /// semaphore fired but the buffer is empty.
    func receive() -> T? {
        semaphore.wait()
        return queue.sync {
            buffer.isEmpty ? nil : buffer.removeFirst()
        }
    }
}
// The code below is probably macos specific and unlikely to work on ios.

/// Captures microphone audio with `AVAudioEngine`, resamples it to mono
/// 24 kHz Float32, and hands the samples to a blocking channel that the
/// consumer drains via `receive()`.
class MicrophoneCapture {
    private let audioEngine: AVAudioEngine
    private let channel: ThreadSafeChannel<[Float]>
    // Count of samples currently sitting in `channel`; used by
    // `bufferedDuration()` to report capture-side latency.
    private let bufferedLen: Atomic<Int> = .init(0)

    init() {
        audioEngine = AVAudioEngine()
        channel = ThreadSafeChannel()
    }

    /// Installs a tap on the input node that converts captured audio to
    /// mono 24 kHz Float32 and forwards it to the channel, then starts the
    /// engine. Errors are logged, not thrown.
    func startCapturing() {
        let inputNode = audioEngine.inputNode
        // Setting the voice mode on macos causes weird hangs of the microphone
        // so we discard it for now. (The previous do/catch here was dead code:
        // it contained no throwing call, making the catch unreachable.)
        #if os(iOS)
            // try? inputNode.setVoiceProcessingEnabled(true)
        #endif
        // Desired format: 1 channel (mono), 24kHz, Float32
        let desiredSampleRate: Double = 24000.0
        let desiredChannelCount: AVAudioChannelCount = 1
        let inputFormat = inputNode.inputFormat(forBus: 0)
        // Create a custom audio format with the desired settings
        guard
            let mono24kHzFormat = AVAudioFormat(
                commonFormat: .pcmFormatFloat32,
                sampleRate: desiredSampleRate,
                channels: desiredChannelCount,
                interleaved: false)
        else {
            print("Could not create target format")
            return
        }
        // Resample the buffer to match the desired format
        let converter = AVAudioConverter(from: inputFormat, to: mono24kHzFormat)
        // Install a tap to capture audio and resample to the target format.
        // `[weak self]` breaks the retain cycle self -> engine -> tap -> self.
        inputNode.installTap(onBus: 0, bufferSize: 1920, format: inputFormat) {
            [weak self] buffer, _ in
            guard let self, let converter else { return }
            let targetLen =
                Int(buffer.frameLength) * Int(desiredSampleRate) / Int(inputFormat.sampleRate)
            guard
                let convertedBuffer = AVAudioPCMBuffer(
                    pcmFormat: mono24kHzFormat, frameCapacity: AVAudioFrameCount(targetLen))
            else {
                print("Could not allocate conversion buffer")
                return
            }
            var error: NSError? = nil
            // Supply the captured buffer to the converter exactly once. If the
            // converter pulls again (e.g. rate-conversion rounding), report
            // `.noDataNow`; always answering `.haveData` with the same buffer
            // would make it re-read and duplicate the audio.
            var consumed = false
            let inputBlock: AVAudioConverterInputBlock = { _, outStatus in
                if consumed {
                    outStatus.pointee = .noDataNow
                    return nil
                }
                consumed = true
                outStatus.pointee = .haveData
                return buffer
            }
            converter.convert(to: convertedBuffer, error: &error, withInputFrom: inputBlock)
            if let error {
                print("Conversion error: \(error)")
                return
            }
            self.processAudioBuffer(buffer: convertedBuffer)
        }
        // Start the audio engine
        do {
            audioEngine.prepare()
            try audioEngine.start()
            print("Microphone capturing started at 24kHz, mono")
        } catch {
            print("Error starting audio engine: \(error)")
        }
    }

    /// Copies channel 0 of `buffer` into a `[Float]`, updates the buffered
    /// sample count, and pushes the samples onto the channel.
    private func processAudioBuffer(buffer: AVAudioPCMBuffer) {
        guard let channelData = buffer.floatChannelData else { return }
        let frameCount = Int(buffer.frameLength)
        let pcmData = Array(UnsafeBufferPointer(start: channelData[0], count: frameCount))
        bufferedLen.add(pcmData.count, ordering: .sequentiallyConsistent)
        channel.send(pcmData)
    }

    /// Stops the engine and removes the capture tap.
    func stopCapturing() {
        audioEngine.stop()
        audioEngine.inputNode.removeTap(onBus: 0)
        print("Microphone capturing stopped")
    }

    /// Blocks until a chunk of captured samples is available and returns it,
    /// decrementing the buffered sample count.
    func receive() -> [Float]? {
        let data = channel.receive()
        if let data = data {
            bufferedLen.subtract(data.count, ordering: .sequentiallyConsistent)
        }
        return data
    }

    /// Seconds of captured audio waiting in the channel (at 24 kHz).
    func bufferedDuration() -> Double {
        return Double(bufferedLen.load(ordering: .sequentiallyConsistent)) / 24000.0
    }
}
/// A fixed-capacity FIFO ring buffer of `Float` samples guarded by a lock.
///
/// `write(_:)` is all-or-nothing: a batch that would not fit is rejected
/// whole. `read(maxCount:)` removes up to `maxCount` of the oldest samples.
class FloatRingBuffer {
    private var storage: [Float]
    private let capacity: Int
    private var head = 0    // index of the next sample to read
    private var tail = 0    // index of the next slot to write
    private var stored = 0  // number of valid samples currently held
    private let lock = NSLock()

    init(capacity: Int) {
        self.capacity = capacity
        self.storage = [Float](repeating: 0, count: capacity)
    }

    /// Number of samples currently stored.
    func currentCount() -> Int {
        lock.lock()
        defer { lock.unlock() }
        return stored
    }

    /// Appends `values` atomically. Returns `false` — writing nothing — when
    /// the whole batch would not fit.
    func write(_ values: [Float]) -> Bool {
        lock.lock()
        defer { lock.unlock() }
        guard stored + values.count <= capacity else { return false }
        for sample in values {
            storage[tail] = sample
            tail = (tail + 1) % capacity
        }
        stored += values.count
        return true
    }

    /// Removes and returns up to `maxCount` of the oldest samples (fewer if
    /// the buffer holds less).
    func read(maxCount: Int) -> [Float] {
        lock.lock()
        defer { lock.unlock() }
        var out: [Float] = []
        while out.count < maxCount, stored > 0 {
            out.append(storage[head])
            head = (head + 1) % capacity
            stored -= 1
        }
        return out
    }
}
/// Plays mono Float32 samples pushed into an internal ring buffer through
/// an `AVAudioSourceNode`. Underruns are filled with silence.
class AudioPlayer {
    private let audioEngine: AVAudioEngine
    private let ringBuffer: FloatRingBuffer
    private let sampleRate: Double

    /// - Parameter sampleRate: Playback rate in Hz. The ring buffer holds up
    ///   to 4 seconds of audio at this rate; excess `send`s are rejected.
    init(sampleRate: Double) {
        audioEngine = AVAudioEngine()
        ringBuffer = FloatRingBuffer(capacity: Int(sampleRate * 4))
        self.sampleRate = sampleRate
    }

    /// Seconds of audio currently queued for playback.
    func bufferedDuration() -> Double {
        Double(ringBuffer.currentCount()) / sampleRate
    }

    /// Attaches a source node that pulls samples from the ring buffer,
    /// connects it to the main mixer, and starts the engine.
    /// - Throws: Whatever `AVAudioEngine.start()` throws.
    func startPlaying() throws {
        let audioFormat = AVAudioFormat(standardFormatWithSampleRate: self.sampleRate, channels: 1)!
        // Capture the ring buffer, not `self`: a strong `self` capture would
        // create a retain cycle (self -> engine -> sourceNode -> closure).
        let ring = ringBuffer
        let sourceNode = AVAudioSourceNode(format: audioFormat) {
            _, _, frameCount, audioBufferList -> OSStatus in
            let audioBuffers = UnsafeMutableAudioBufferListPointer(audioBufferList)
            guard let channelData = audioBuffers[0].mData?.assumingMemoryBound(to: Float.self)
            else {
                // TODO: Get a proper error here that would work on ios.
                return 1
            }
            let data = ring.read(maxCount: Int(frameCount))
            // Copy what we have; pad the remainder with silence on underrun.
            for i in 0..<Int(frameCount) {
                channelData[i] = i < data.count ? data[i] : 0
            }
            return noErr
        }
        let af = sourceNode.inputFormat(forBus: 0)
        print("playing audio-format \(af)")
        audioEngine.attach(sourceNode)
        audioEngine.connect(sourceNode, to: audioEngine.mainMixerNode, format: audioFormat)
        try audioEngine.start()
    }

    /// Enqueues samples for playback. Returns `false` (dropping the batch)
    /// when it would overflow the 4-second buffer.
    @discardableResult
    func send(_ values: [Float]) -> Bool {
        ringBuffer.write(values)
    }
}
/// Configures the shared audio session for play-and-record with the
/// system-default output route (AirPlay and Bluetooth permitted).
/// Compiles to a no-op on non-iOS platforms.
func setDefaultToStd() {
    #if os(iOS)
        let session = AVAudioSession.sharedInstance()
        do {
            try session.setCategory(
                .playAndRecord, mode: .default, options: [.allowAirPlay, .allowBluetooth])
            try session.setActive(true)
            try session.overrideOutputAudioPort(.none)
        } catch {
            print("failed to configure audio session: \(error.localizedDescription)")
        }
    #endif
}
/// Configures the shared audio session for play-and-record and forces
/// output to the built-in speaker. Compiles to a no-op on non-iOS platforms.
func setDefaultToSpeaker() {
    #if os(iOS)
        let session = AVAudioSession.sharedInstance()
        do {
            try session.setCategory(.playAndRecord, mode: .default, options: .defaultToSpeaker)
            try session.setActive(true)
            try session.overrideOutputAudioPort(.speaker)
        } catch {
            print("failed to configure audio session: \(error.localizedDescription)")
        }
    #endif
}