From 42904e156c57722ed0920ccf8d6ba2116ff6cefe Mon Sep 17 00:00:00 2001 From: Bruno Berisso Date: Tue, 24 Jan 2017 12:13:21 +0000 Subject: [PATCH 1/4] Review the access rights. Now the public classes are 'public final' instead of open. The same goes for the functions --- TLSphinx/Config.swift | 4 ++-- TLSphinx/Decoder.swift | 21 ++++++++++++++------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/TLSphinx/Config.swift b/TLSphinx/Config.swift index 76dae46..8168f5c 100644 --- a/TLSphinx/Config.swift +++ b/TLSphinx/Config.swift @@ -9,7 +9,7 @@ import Foundation import Sphinx.Base -open class Config { +public final class Config { var cmdLnConf: OpaquePointer? fileprivate var cArgs: [UnsafeMutablePointer?] @@ -38,7 +38,7 @@ open class Config { } - open var showDebugInfo: Bool { + public var showDebugInfo: Bool { get { if cmdLnConf != nil { return cmd_ln_str_r(cmdLnConf, "-logfn") == nil diff --git a/TLSphinx/Decoder.swift b/TLSphinx/Decoder.swift index 9e5245b..90c04ed 100644 --- a/TLSphinx/Decoder.swift +++ b/TLSphinx/Decoder.swift @@ -11,7 +11,7 @@ import AVFoundation import Sphinx -private enum SpeechStateEnum : CustomStringConvertible { +fileprivate enum SpeechStateEnum : CustomStringConvertible { case silence case speech case utterance @@ -31,7 +31,7 @@ private enum SpeechStateEnum : CustomStringConvertible { } -private extension AVAudioPCMBuffer { +fileprivate extension AVAudioPCMBuffer { func toDate() -> Data { let channels = UnsafeBufferPointer(start: int16ChannelData, count: 1) @@ -42,14 +42,21 @@ private extension AVAudioPCMBuffer { } -open class Decoder { +public enum DecodeErrors : Error { + case CantReadSpeachFile(String) + case CantSetAudioSession(NSError) + case NoAudioInputAvailable + case CantStartAudioEngine(NSError) + case CantAddWordsWhileDecodeingSpeech +} + + +public final class Decoder { fileprivate var psDecoder: OpaquePointer? fileprivate var engine: AVAudioEngine! 
fileprivate var speechState: SpeechStateEnum - open var bufferSize: Int = 2048 - public init?(config: Config) { speechState = .silence @@ -69,7 +76,7 @@ open class Decoder { deinit { let refCount = ps_free(psDecoder) - assert(refCount == 0, "Can't free decoder, it's shared among instances") + assert(refCount == 0, "Can't free decoder because it's shared among instances") } @discardableResult fileprivate func process_raw(_ data: Data) -> CInt { @@ -216,7 +223,7 @@ open class Decoder { } } - open func stopDecodingSpeech () { + public func stopDecodingSpeech () { engine.stop() engine.mainMixerNode.removeTap(onBus: 0) engine.reset() From 638599d90cf20f5defc778537c53a96b3e315c43 Mon Sep 17 00:00:00 2001 From: Bruno Berisso Date: Tue, 24 Jan 2017 12:21:46 +0000 Subject: [PATCH 2/4] - Add exceptions to some decoder methods to better handle errors. - Change some 'if' statements for 'guards', mostly in the tests - Use STrue | SFalse instead of 1 | 0 to denote true | false when applicable --- TLSphinx/Config.swift | 2 +- TLSphinx/Decoder.swift | 123 ++++++++++++++++----------------- TLSphinx/Globals.swift | 7 +- TLSphinx/Hypotesis.swift | 8 ++- TLSphinxTests/Basic.swift | 87 +++++++++++++++-------- TLSphinxTests/LiveDecode.swift | 64 +++++++++-------- 6 files changed, 160 insertions(+), 131 deletions(-) diff --git a/TLSphinx/Config.swift b/TLSphinx/Config.swift index 8168f5c..72f7b0b 100644 --- a/TLSphinx/Config.swift +++ b/TLSphinx/Config.swift @@ -22,7 +22,7 @@ public final class Config { return [strdup(name),strdup(value)] } - cmdLnConf = cmd_ln_parse_r(nil, ps_args(), CInt(cArgs.count), &cArgs, STrue) + cmdLnConf = cmd_ln_parse_r(nil, ps_args(), CInt(cArgs.count), &cArgs, STrue32) if cmdLnConf == nil { return nil diff --git a/TLSphinx/Decoder.swift b/TLSphinx/Decoder.swift index 90c04ed..d499b15 100644 --- a/TLSphinx/Decoder.swift +++ b/TLSphinx/Decoder.swift @@ -33,9 +33,10 @@ fileprivate enum SpeechStateEnum : CustomStringConvertible { fileprivate extension 
AVAudioPCMBuffer { - func toDate() -> Data { + func toData() -> Data { let channels = UnsafeBufferPointer(start: int16ChannelData, count: 1) - let ch0Data = Data(bytes: UnsafeMutablePointer(channels[0]), count:Int(frameCapacity * format.streamDescription.pointee.mBytesPerFrame)) + let ch0Data = Data(bytes: UnsafeMutablePointer(channels[0]), + count: Int(frameCapacity * format.streamDescription.pointee.mBytesPerFrame)) return ch0Data } @@ -60,16 +61,9 @@ public final class Decoder { public init?(config: Config) { speechState = .silence - - if config.cmdLnConf != nil{ - psDecoder = ps_init(config.cmdLnConf) - - if psDecoder == nil { - return nil - } - - } else { - psDecoder = nil + psDecoder = config.cmdLnConf.flatMap(ps_init) + + if psDecoder == nil { return nil } } @@ -80,9 +74,11 @@ public final class Decoder { } @discardableResult fileprivate func process_raw(_ data: Data) -> CInt { - //Sphinx expect words of 2 bytes but the NSFileHandle read one byte at time so the lenght of the data for sphinx is the half of the real one. + let dataLenght = data.count / 2 - let numberOfFrames = ps_process_raw(psDecoder, (data as NSData).bytes.bindMemory(to: int16.self, capacity: data.count), dataLenght, SFalse, SFalse) + let numberOfFrames = data.withUnsafeBytes { (bytes : UnsafePointer) -> Int32 in + ps_process_raw(psDecoder, bytes, dataLenght, SFalse32, SFalse32) + } let hasSpeech = in_speech() switch (speechState) { @@ -100,7 +96,7 @@ public final class Decoder { } fileprivate func in_speech() -> Bool { - return ps_get_in_speech(psDecoder) == 1 + return ps_get_in_speech(psDecoder) == STrue } @discardableResult fileprivate func start_utt() -> Bool { @@ -114,79 +110,75 @@ public final class Decoder { fileprivate func get_hyp() -> Hypothesis? 
{ var score: int32 = 0 - if let string = ps_get_hyp(psDecoder, &score) { - if let text = String(validatingUTF8: string) { - return Hypothesis(text: text, score: Int(score)) - } else { - return nil - } + guard let string = ps_get_hyp(psDecoder, &score) else { + return nil + } + + if let text = String(validatingUTF8: string) { + return Hypothesis(text: text, score: Int(score)) } else { return nil } } - - fileprivate func hypotesisForSpeechAtPath (_ filePath: String) -> Hypothesis? { - - if let fileHandle = FileHandle(forReadingAtPath: filePath) { - - start_utt() - - let hypothesis = fileHandle.reduceChunks(bufferSize, initial: nil, reducer: { [unowned self] (data: Data, partialHyp: Hypothesis?) -> Hypothesis? in - - self.process_raw(data) - - var resultantHyp = partialHyp - if self.speechState == .utterance { - - self.end_utt() - resultantHyp = partialHyp + self.get_hyp() - self.start_utt() - } - - return resultantHyp - }) - - end_utt() - fileHandle.closeFile() - - //Process any pending speech - if speechState == .speech { - return hypothesis + get_hyp() - } else { - return hypothesis + + fileprivate func hypotesisForSpeech (inFile fileHandle: FileHandle) -> Hypothesis? { + + start_utt() + + let hypothesis = fileHandle.reduceChunks(2048, initial: nil, reducer: { + (data: Data, partialHyp: Hypothesis?) -> Hypothesis? in + + process_raw(data) + + var resultantHyp = partialHyp + if speechState == .utterance { + + end_utt() + resultantHyp = partialHyp + get_hyp() + start_utt() } - + + return resultantHyp + }) + + end_utt() + + //Process any pending speech + if speechState == .speech { + return hypothesis + get_hyp() } else { - return nil + return hypothesis } } - - open func decodeSpeechAtPath (_ filePath: String, complete: @escaping (Hypothesis?) -> ()) { - + + public func decodeSpeech (atPath filePath: String, complete: @escaping (Hypothesis?) 
-> ()) throws { + + guard let fileHandle = FileHandle(forReadingAtPath: filePath) else { + throw DecodeErrors.CantReadSpeachFile(filePath) + } + DispatchQueue.global().async { - - let hypothesis = self.hypotesisForSpeechAtPath(filePath) - + let hypothesis = self.hypotesisForSpeech(inFile:fileHandle) + fileHandle.closeFile() DispatchQueue.main.async { complete(hypothesis) } } } - open func startDecodingSpeech (_ utteranceComplete: @escaping (Hypothesis?) -> ()) { + public func startDecodingSpeech (_ utteranceComplete: @escaping (Hypothesis?) -> ()) throws { do { try AVAudioSession.sharedInstance().setCategory(AVAudioSessionCategoryRecord) } catch let error as NSError { print("Error setting the shared AVAudioSession: \(error)") - return + throw DecodeErrors.CantSetAudioSession(error) } engine = AVAudioEngine() guard let input = engine.inputNode else { - print("Can't get input node") - return + throw DecodeErrors.NoAudioInputAvailable } let formatIn = AVAudioFormat(commonFormat: .pcmFormatInt16, sampleRate: 44100, channels: 1, interleaved: false) @@ -202,9 +194,9 @@ public final class Decoder { self.end_utt() let hypothesis = self.get_hyp() - DispatchQueue.main.async(execute: { + DispatchQueue.main.async { utteranceComplete(hypothesis) - }) + } self.start_utt() } @@ -220,6 +212,7 @@ public final class Decoder { } catch let error as NSError { end_utt() print("Can't start AVAudioEngine: \(error)") + throw DecodeErrors.CantStartAudioEngine(error) } } diff --git a/TLSphinx/Globals.swift b/TLSphinx/Globals.swift index f3d49e0..36710d2 100644 --- a/TLSphinx/Globals.swift +++ b/TLSphinx/Globals.swift @@ -8,8 +8,11 @@ import Foundation -let STrue: CInt = 1 -let SFalse: CInt = 0 +let STrue: uint8 = 1 +let SFalse: uint8 = 0 + +let STrue32: CInt = 1 +let SFalse32: CInt = 0 extension FileHandle { diff --git a/TLSphinx/Hypotesis.swift b/TLSphinx/Hypotesis.swift index e3ef325..76836a8 100644 --- a/TLSphinx/Hypotesis.swift +++ b/TLSphinx/Hypotesis.swift @@ -23,8 +23,10 @@ extension 
Hypothesis : CustomStringConvertible { } -func +(lhs: Hypothesis, rhs: Hypothesis) -> Hypothesis { - return Hypothesis(text: lhs.text + " " + rhs.text, score: (lhs.score + rhs.score) / 2) +extension Hypothesis { + static func +(lhs: Hypothesis, rhs: Hypothesis) -> Hypothesis { + return Hypothesis(text: lhs.text + " " + rhs.text, score: (lhs.score + rhs.score) / 2) + } } func +(lhs: Hypothesis?, rhs: Hypothesis?) -> Hypothesis? { @@ -37,4 +39,4 @@ func +(lhs: Hypothesis?, rhs: Hypothesis?) -> Hypothesis? { return rhs } } -} \ No newline at end of file +} diff --git a/TLSphinxTests/Basic.swift b/TLSphinxTests/Basic.swift index a633657..5d838d0 100644 --- a/TLSphinxTests/Basic.swift +++ b/TLSphinxTests/Basic.swift @@ -13,49 +13,82 @@ import TLSphinx class BasicTests: XCTestCase { - func getModelPath() -> String? { - return Bundle(for: BasicTests.self).path(forResource: "en-us", ofType: nil) + func getModelPath() -> NSString? { + return Bundle(for: BasicTests.self).path(forResource: "en-us", ofType: nil) as NSString? } func testConfig() { - - if let modelPath = getModelPath() { - - let hmm = (modelPath as NSString).appendingPathComponent("en-us") - let lm = (modelPath as NSString).appendingPathComponent("en-us.lm.dmp") - let dict = (modelPath as NSString).appendingPathComponent("cmudict-en-us.dict") - - let config = Config(args: ("-hmm", hmm), ("-lm", lm), ("-dict", dict)) - - XCTAssert(config != nil, "Pass") - - } else { + + guard let modelPath = getModelPath() else { XCTFail("Can't access pocketsphinx model. 
Bundle root: \(Bundle.main)") + return } + + let hmm = modelPath.appendingPathComponent("en-us") + let lm = modelPath.appendingPathComponent("en-us.lm.dmp") + let dict = modelPath.appendingPathComponent("cmudict-en-us.dict") + + let config = Config(args: ("-hmm", hmm), ("-lm", lm), ("-dict", dict)) + XCTAssert(config != nil, "Pass") } func testDecoder() { - if let modelPath = getModelPath() { + guard let modelPath = getModelPath() else { + XCTFail("Can't access pocketsphinx model. Bundle root: \(Bundle.main)") + return + } - let hmm = (modelPath as NSString).appendingPathComponent("en-us") - let lm = (modelPath as NSString).appendingPathComponent("en-us.lm.dmp") - let dict = (modelPath as NSString).appendingPathComponent("cmudict-en-us.dict") + let hmm = modelPath.appendingPathComponent("en-us") + let lm = modelPath.appendingPathComponent("en-us.lm.dmp") + let dict = modelPath.appendingPathComponent("cmudict-en-us.dict") + + guard let config = Config(args: ("-hmm", hmm), ("-lm", lm), ("-dict", dict)) else { + XCTFail("Can't run test without a valid config") + return + } + + let decoder = Decoder(config:config) + XCTAssert(decoder != nil, "Pass") + } + + func testSpeechFromFile() { + + guard let modelPath = getModelPath() else { + XCTFail("Can't access pocketsphinx model. Bundle root: \(Bundle.main)") + return + } + + let hmm = modelPath.appendingPathComponent("en-us") + let lm = modelPath.appendingPathComponent("en-us.lm.dmp") + let dict = modelPath.appendingPathComponent("cmudict-en-us.dict") + + guard let config = Config(args: ("-hmm", hmm), ("-lm", lm), ("-dict", dict)) else { + XCTFail("Can't run test without a valid config") + return + } + + guard let decoder = Decoder(config:config) else { + XCTFail("Can't run test without a decoder") + return + } + + let audioFile = modelPath.appendingPathComponent("goforward.raw") + let expectation = self.expectation(description: "Decode finish") + + try! 
decoder.decodeSpeech(atPath: audioFile) { - if let config = Config(args: ("-hmm", hmm), ("-lm", lm), ("-dict", dict)) { - let decoder = Decoder(config:config) + if let hyp = $0 { + + print("Text: \(hyp.text) - Score: \(hyp.score)") + XCTAssert(hyp.text == "go forward ten meters", "Pass") - XCTAssert(decoder != nil, "Pass") } else { - XCTFail("Can't run test without a valid config") + XCTFail("Fail to decode audio") } - } else { - XCTFail("Can't access pocketsphinx model. Bundle root: \(Bundle.main)") + expectation.fulfill() } - } - - func testSpeechFromFile() { if let modelPath = getModelPath() { diff --git a/TLSphinxTests/LiveDecode.swift b/TLSphinxTests/LiveDecode.swift index df95b2b..d05835d 100644 --- a/TLSphinxTests/LiveDecode.swift +++ b/TLSphinxTests/LiveDecode.swift @@ -13,44 +13,42 @@ import TLSphinx class LiveDecode: XCTestCase { - func getModelPath() -> String? { - return Bundle(for: LiveDecode.self).path(forResource: "en-us", ofType: nil) + func getModelPath() -> NSString? { + return Bundle(for: LiveDecode.self).path(forResource: "en-us", ofType: nil) as NSString? 
} func testAVAudioRecorder() { - if let modelPath = getModelPath() { - - let hmm = (modelPath as NSString).appendingPathComponent("en-us") - let lm = (modelPath as NSString).appendingPathComponent("en-us.lm.dmp") - let dict = (modelPath as NSString).appendingPathComponent("cmudict-en-us.dict") - - if let config = Config(args: ("-hmm", hmm), ("-lm", lm), ("-dict", dict)) { - - config.showDebugInfo = false - - if let decoder = Decoder(config:config) { - decoder.startDecodingSpeech { (hyp) -> () in - print("Utterance: \(hyp)") - } - - let theExpectation = expectation(description: "") - - DispatchQueue.main.asyncAfter(deadline: DispatchTime(uptimeNanoseconds: DispatchTime.now().rawValue) + Double(Int64(15.0 * Double(NSEC_PER_SEC))) / Double(NSEC_PER_SEC)) { - decoder.stopDecodingSpeech() - theExpectation.fulfill() - } - - waitForExpectations(timeout: NSTimeIntervalSince1970, handler: nil) - } - - } else { - XCTFail("Can't run test without a valid config") - } - - } else { + guard let modelPath = getModelPath() else { XCTFail("Can't access pocketsphinx model. Bundle root: \(Bundle.main)") + return + } + + let hmm = modelPath.appendingPathComponent("en-us") + let lm = modelPath.appendingPathComponent("en-us.lm.dmp") + let dict = modelPath.appendingPathComponent("cmudict-en-us.dict") + + guard let config = Config(args: ("-hmm", hmm), ("-lm", lm), ("-dict", dict)) else { + XCTFail("Can't run test without a valid config") + return } - } + config.showDebugInfo = false + + guard let decoder = Decoder(config:config) else { + XCTFail("Can't run test without a decoder") + return + } + + try! 
decoder.startDecodingSpeech { print("Utterance: \($0)") } + + let theExpectation = expectation(description: "") + + DispatchQueue.main.asyncAfter(deadline: DispatchTime(uptimeNanoseconds: DispatchTime.now().rawValue) + Double(Int64(15.0 * Double(NSEC_PER_SEC))) / Double(NSEC_PER_SEC)) { + decoder.stopDecodingSpeech() + theExpectation.fulfill() + } + + waitForExpectations(timeout: NSTimeIntervalSince1970) + } } From 3ac85829fbcb3f2260265ace6367b7529ec9c153 Mon Sep 17 00:00:00 2001 From: Bruno Berisso Date: Tue, 24 Jan 2017 12:25:48 +0000 Subject: [PATCH 3/4] Add AVAudioConverter to try a new way of reading data from the AVAudioEngine in live decoding. The idea is to read the data in a more appealing format for iOS (float 32, 16000 Hz) and convert it (with AVAudioConverter) to the Sphinx format (int 16, 16000Hz). AVAudioConverter is only available from iOS 9.0 so the deployment target needs to change. --- TLSphinx.xcodeproj/project.pbxproj | 8 +++---- TLSphinx/Decoder.swift | 35 ++++++++++++++++++++++-------- 2 files changed, 30 insertions(+), 13 deletions(-) diff --git a/TLSphinx.xcodeproj/project.pbxproj b/TLSphinx.xcodeproj/project.pbxproj index 2023383..2abd497 100644 --- a/TLSphinx.xcodeproj/project.pbxproj +++ b/TLSphinx.xcodeproj/project.pbxproj @@ -308,7 +308,7 @@ GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; GCC_WARN_UNUSED_FUNCTION = YES; GCC_WARN_UNUSED_VARIABLE = YES; - IPHONEOS_DEPLOYMENT_TARGET = 8.3; + IPHONEOS_DEPLOYMENT_TARGET = 9.0; MTL_ENABLE_DEBUG_INFO = YES; ONLY_ACTIVE_ARCH = YES; SDKROOT = iphoneos; @@ -352,7 +352,7 @@ GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; GCC_WARN_UNUSED_FUNCTION = YES; GCC_WARN_UNUSED_VARIABLE = YES; - IPHONEOS_DEPLOYMENT_TARGET = 8.3; + IPHONEOS_DEPLOYMENT_TARGET = 9.0; MTL_ENABLE_DEBUG_INFO = NO; SDKROOT = iphoneos; SWIFT_OPTIMIZATION_LEVEL = "-Owholemodule"; @@ -382,7 +382,7 @@ ); INFOPLIST_FILE = TLSphinx/Info.plist; INSTALL_PATH = "$(LOCAL_LIBRARY_DIR)/Frameworks"; - IPHONEOS_DEPLOYMENT_TARGET = 8.2; + 
IPHONEOS_DEPLOYMENT_TARGET = 9.0; LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks @loader_path/Frameworks"; LIBRARY_SEARCH_PATHS = ( "$(PROJECT_DIR)/Sphinx/lib/pocketsphinx", @@ -416,7 +416,7 @@ ); INFOPLIST_FILE = TLSphinx/Info.plist; INSTALL_PATH = "$(LOCAL_LIBRARY_DIR)/Frameworks"; - IPHONEOS_DEPLOYMENT_TARGET = 8.2; + IPHONEOS_DEPLOYMENT_TARGET = 9.0; LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks @loader_path/Frameworks"; LIBRARY_SEARCH_PATHS = ( "$(PROJECT_DIR)/Sphinx/lib/pocketsphinx", diff --git a/TLSphinx/Decoder.swift b/TLSphinx/Decoder.swift index d499b15..44e9ca5 100644 --- a/TLSphinx/Decoder.swift +++ b/TLSphinx/Decoder.swift @@ -181,14 +181,36 @@ public final class Decoder { throw DecodeErrors.NoAudioInputAvailable } - let formatIn = AVAudioFormat(commonFormat: .pcmFormatInt16, sampleRate: 44100, channels: 1, interleaved: false) - engine.connect(input, to: engine.outputNode, format: formatIn) + let mixer = AVAudioMixerNode() + engine.attach(mixer) + engine.connect(input, to: mixer, format: input.outputFormat(forBus: 0)) - input.installTap(onBus: 0, bufferSize: 4096, format: formatIn, block: { (buffer: AVAudioPCMBuffer!, time: AVAudioTime!) -> Void in + let formatIn = AVAudioFormat(commonFormat: .pcmFormatFloat32, sampleRate: 16000, channels: 1, interleaved: false) + let formatOut = AVAudioFormat(commonFormat: .pcmFormatInt16, sampleRate: 16000, channels: 1, interleaved: false) + let bufferMapper = AVAudioConverter(from: formatIn, to: formatOut) - let audioData = buffer.toDate() + mixer.installTap(onBus: 0, bufferSize: 2048, format: formatIn, block: { + [unowned self] (buffer: AVAudioPCMBuffer!, time: AVAudioTime!) 
in + + let sphinxBuffer = AVAudioPCMBuffer(pcmFormat: formatOut, frameCapacity: buffer.frameCapacity) + + //This is needed because the 'frameLength' default value is 0 (since iOS 10) and causes the 'convert' call + //to fail with an error (Error Domain=NSOSStatusErrorDomain Code=-50 "(null)") + //More here: http://stackoverflow.com/questions/39714244/avaudioconverter-is-broken-in-ios-10 + sphinxBuffer.frameLength = sphinxBuffer.frameCapacity + + do { + try bufferMapper.convert(to: sphinxBuffer, from: buffer) + } catch(let error as NSError) { + print(error) + return + } + + let audioData = sphinxBuffer.toData() self.process_raw(audioData) + print("Process: \(buffer.frameLength) frames - \(audioData.count) bytes - sample time: \(time.sampleTime)") + if self.speechState == .utterance { self.end_utt() @@ -202,9 +224,6 @@ public final class Decoder { } }) - engine.mainMixerNode.outputVolume = 0.0 - engine.prepare() - start_utt() do { @@ -218,8 +237,6 @@ public final class Decoder { public func stopDecodingSpeech () { engine.stop() - engine.mainMixerNode.removeTap(onBus: 0) - engine.reset() engine = nil } } From 86af70a924effed9464db43015c6612b2a3c3768 Mon Sep 17 00:00:00 2001 From: Bruno Berisso Date: Tue, 24 Jan 2017 12:31:50 +0000 Subject: [PATCH 4/4] Add a new API to add words to the recognition dictionary at runtime. Be aware that new words can't be added while a recognition is in progress. You should add new words before starting a recognition process. The API expects an array of tuples of String with the form: (word: 'HELLO', phones: 'HH EH L OW'). The first component is the word in plain English. 
The second is the pronunciation phones as they appear in the cmudict (more here: http://www.speech.cs.cmu.edu/tools/lextool.html). In the future the second component should be calculated --- .../model/en-us/basic-lm/6844.dic | 3 + .../pocketsphinx/model/en-us/basic-lm/6844.lm | 35 ++++++++ TLSphinx/Decoder.swift | 12 +++ TLSphinxTests/Basic.swift | 88 +++++++++++-------- 4 files changed, 101 insertions(+), 37 deletions(-) create mode 100644 Sphinx/share/pocketsphinx/model/en-us/basic-lm/6844.dic create mode 100644 Sphinx/share/pocketsphinx/model/en-us/basic-lm/6844.lm diff --git a/Sphinx/share/pocketsphinx/model/en-us/basic-lm/6844.dic b/Sphinx/share/pocketsphinx/model/en-us/basic-lm/6844.dic new file mode 100644 index 0000000..b73e7e8 --- /dev/null +++ b/Sphinx/share/pocketsphinx/model/en-us/basic-lm/6844.dic @@ -0,0 +1,3 @@ +FORWARD F AO R W ER D +GO G OW +TEN T EH N diff --git a/Sphinx/share/pocketsphinx/model/en-us/basic-lm/6844.lm b/Sphinx/share/pocketsphinx/model/en-us/basic-lm/6844.lm new file mode 100644 index 0000000..7ae892e --- /dev/null +++ b/Sphinx/share/pocketsphinx/model/en-us/basic-lm/6844.lm @@ -0,0 +1,35 @@ +Language model created by QuickLM on Mon Jan 23 09:11:27 EST 2017 +Copyright (c) 1996-2010 Carnegie Mellon University and Alexander I. Rudnicky + +The model is in standard ARPA format, designed by Doug Paul while he was at MITRE. + +The code that was used to produce this language model is available in Open Source. +Please visit http://www.speech.cs.cmu.edu/tools/ for more information + +The (fixed) discount mass is 0.5. The backoffs are computed using the ratio method. 
+This model based on a corpus of 1 sentences and 5 words + +\data\ +ngram 1=5 +ngram 2=4 +ngram 3=3 + +\1-grams: +-1.0000 -0.3010 +-1.0000 -0.2553 +-1.0000 FORWARD -0.2553 +-1.0000 GO -0.2553 +-1.0000 TEN -0.2553 + +\2-grams: +-0.3010 GO 0.0000 +-0.3010 FORWARD TEN 0.0000 +-0.3010 GO FORWARD 0.0000 +-0.3010 TEN -0.3010 + +\3-grams: +-0.3010 GO FORWARD +-0.3010 FORWARD TEN +-0.3010 GO FORWARD TEN + +\end\ diff --git a/TLSphinx/Decoder.swift b/TLSphinx/Decoder.swift index 44e9ca5..378fc3e 100644 --- a/TLSphinx/Decoder.swift +++ b/TLSphinx/Decoder.swift @@ -239,4 +239,16 @@ public final class Decoder { engine.stop() engine = nil } + + public func add(words:Array<(word: String, phones: String)>) throws { + + guard engine == nil || !engine.isRunning else { + throw DecodeErrors.CantAddWordsWhileDecodeingSpeech + } + + for (word,phones) in words { + let update = words.last?.word == word ? STrue32 : SFalse32 + ps_add_word(psDecoder, word, phones, update) + } + } } diff --git a/TLSphinxTests/Basic.swift b/TLSphinxTests/Basic.swift index 5d838d0..ab63b77 100644 --- a/TLSphinxTests/Basic.swift +++ b/TLSphinxTests/Basic.swift @@ -90,47 +90,61 @@ class BasicTests: XCTestCase { expectation.fulfill() } - if let modelPath = getModelPath() { - - let hmm = (modelPath as NSString).appendingPathComponent("en-us") - let lm = (modelPath as NSString).appendingPathComponent("en-us.lm.dmp") - let dict = (modelPath as NSString).appendingPathComponent("cmudict-en-us.dict") - - if let config = Config(args: ("-hmm", hmm), ("-lm", lm), ("-dict", dict)) { - if let decoder = Decoder(config:config) { - - let audioFile = (modelPath as NSString).appendingPathComponent("goforward.raw") - let expectation = self.expectation(description: "Decode finish") - - decoder.decodeSpeechAtPath(audioFile) { - - if let hyp = $0 { - - print("Text: \(hyp.text) - Score: \(hyp.score)") - XCTAssert(hyp.text == "go forward ten meters", "Pass") - - } else { - XCTFail("Fail to decode audio") - } - - expectation.fulfill() 
+ waitForExpectations(timeout: NSTimeIntervalSince1970) + } + + func testAddWordToLenguageModel() { + + guard let modelPath = getModelPath() else { + XCTFail("Can't access pocketsphinx model. Bundle root: \(Bundle.main)") + return + } + + let basicModelPath = (modelPath.appendingPathComponent("basic-lm") as NSString) + let hmm = modelPath.appendingPathComponent("en-us") + let lm = basicModelPath.appendingPathComponent("6844.lm") + let dict = basicModelPath.appendingPathComponent("6844.dic") + + guard let config = Config(args: ("-hmm", hmm), ("-lm", lm), ("-dict", dict)) else { + XCTFail("Can't run test without a valid config") + return + } + + guard let decoder = Decoder(config:config) else { + XCTFail("Can't run test without a decoder") + return + } + + let audioFile = modelPath.appendingPathComponent("goforward.raw") + let expectation = self.expectation(description: "Decode finish") + + try! decoder.decodeSpeech(atPath: audioFile) { [unowned decoder] in + + if let hyp = $0 { + + print("Text: \(hyp.text) - Score: \(hyp.score)") + XCTAssert(hyp.text == "GO FORWARD TEN", "Pass") + + try! decoder.add(words:[("METERS","M IY T ER Z")]) + + try! decoder.decodeSpeech(atPath: audioFile) { + if let hyp = $0 { + + print("Text: \(hyp.text) - Score: \(hyp.score)") + XCTAssert(hyp.text == "GO FORWARD TEN METERS", "Pass") + } else { + XCTFail("Fail to decode audio") } - - waitForExpectations(timeout: NSTimeIntervalSince1970, handler: { (_) -> Void in - - }) - - } else { - XCTFail("Can't run test without a decoder") + + expectation.fulfill() } - + } else { - XCTFail("Can't run test without a valid config") + XCTFail("Fail to decode audio") + expectation.fulfill() } - - } else { - XCTFail("Can't access pocketsphinx model. Bundle root: \(Bundle.main)") } - + + waitForExpectations(timeout: NSTimeIntervalSince1970) } }