Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 0 additions & 105 deletions Package.resolved

This file was deleted.

5 changes: 1 addition & 4 deletions Package.swift
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,13 @@ let package = Package(
targets: ["FluidAudioCLI"]
),
],
dependencies: [
.package(url: "https://github.com/huggingface/swift-transformers", from: "1.3.0")
],
dependencies: [],
targets: [
.target(
name: "FluidAudio",
dependencies: [
"FastClusterWrapper",
"MachTaskSelfWrapper",
.product(name: "Tokenizers", package: "swift-transformers"),
],
path: "Sources/FluidAudio",
exclude: [
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
import Foundation

/// BPE tokenizer for CTC vocabulary boosting.
/// Only implements encoding - no decoding, chat templates, or other features.
/// Supports the specific tokenizer.json format used by Parakeet models.
///
/// Text normalization: Applies lowercasing + NFKC normalization before BPE encoding,
/// matching the standard NeMo CTC tokenization pipeline.
public final class BpeTokenizer: Sendable {
    private let vocab: [String: Int]
    /// Merge priority lookup: "left right" -> rank (lower rank = higher priority).
    /// Precomputed once in init so encode() does an O(1) dictionary lookup per
    /// adjacent pair instead of a linear scan over the whole merges list
    /// (the scan made encode O(merges × len²) for real-sized merge tables).
    /// The space-joined key is safe: merge components come from space-split
    /// entries, so they can never themselves contain a space.
    private let mergeRanks: [String: Int]
    private let addedTokens: [String: Int]

    public enum Error: Swift.Error, LocalizedError {
        case fileNotFound(URL)
        case invalidJSON(String)
        case missingField(String)
        case unsupportedTokenizerType(String)

        public var errorDescription: String? {
            switch self {
            case .fileNotFound(let url):
                return "tokenizer.json not found at \(url.path)"
            case .invalidJSON(let message):
                return "Invalid JSON: \(message)"
            case .missingField(let field):
                return "Missing required field: \(field)"
            case .unsupportedTokenizerType(let type):
                return "Unsupported tokenizer type: \(type). Only 'BPE' is supported."
            }
        }
    }

    /// Load tokenizer from a folder containing tokenizer.json
    /// - Parameter modelFolder: Directory expected to contain `tokenizer.json`.
    /// - Returns: A ready-to-use tokenizer.
    /// - Throws: `Error.fileNotFound` / `.invalidJSON` / `.missingField` /
    ///   `.unsupportedTokenizerType` on malformed or unsupported input.
    public static func load(from modelFolder: URL) throws -> BpeTokenizer {
        let tokenizerPath = modelFolder.appendingPathComponent("tokenizer.json")

        guard FileManager.default.fileExists(atPath: tokenizerPath.path) else {
            throw Error.fileNotFound(tokenizerPath)
        }

        let data = try Data(contentsOf: tokenizerPath)

        guard let json = try JSONSerialization.jsonObject(with: data) as? [String: Any] else {
            throw Error.invalidJSON("Root is not a dictionary")
        }

        // Parse model section
        guard let model = json["model"] as? [String: Any] else {
            throw Error.missingField("model")
        }

        guard let modelType = model["type"] as? String else {
            throw Error.missingField("model.type")
        }

        guard modelType == "BPE" else {
            throw Error.unsupportedTokenizerType(modelType)
        }

        // Parse vocabulary: {"token": id, ...}
        guard let vocabDict = model["vocab"] as? [String: Int] else {
            throw Error.missingField("model.vocab")
        }

        // Parse merges: ["a b", "c d", ...]. Entries without exactly two
        // space-separated parts are silently dropped (compactMap).
        guard let mergesArray = model["merges"] as? [String] else {
            throw Error.missingField("model.merges")
        }

        let merges = mergesArray.compactMap { mergeStr -> (String, String)? in
            let parts = mergeStr.split(separator: " ", maxSplits: 1)
            guard parts.count == 2 else { return nil }
            return (String(parts[0]), String(parts[1]))
        }

        // Parse added_tokens (special tokens like <unk>, <pad>); malformed
        // entries are skipped rather than treated as fatal.
        var addedTokensDict: [String: Int] = [:]
        let addedTokensList = (json["added_tokens"] as? [[String: Any]]) ?? []
        for token in addedTokensList {
            guard let content = token["content"] as? String,
                let id = token["id"] as? Int
            else { continue }
            addedTokensDict[content] = id
        }

        return BpeTokenizer(
            vocab: vocabDict,
            merges: merges,
            addedTokens: addedTokensDict
        )
    }

    private init(vocab: [String: Int], merges: [(String, String)], addedTokens: [String: Int]) {
        self.vocab = vocab
        // Build the rank table once. If a pair is listed twice, keep the first
        // (lowest) rank, matching "earliest in merges list wins".
        var ranks: [String: Int] = [:]
        ranks.reserveCapacity(merges.count)
        for (rank, pair) in merges.enumerated() {
            let key = pair.0 + " " + pair.1
            if ranks[key] == nil {
                ranks[key] = rank
            }
        }
        self.mergeRanks = ranks
        self.addedTokens = addedTokens
    }

    /// Encode text to token IDs using BPE.
    /// - Parameters:
    ///   - text: Input text; lowercased and NFKC-normalized before encoding.
    ///   - addSpecialTokens: Accepted for API compatibility; this tokenizer
    ///     never inserts special tokens, so the flag is currently ignored.
    /// - Returns: Token IDs. Unknown tokens fall back to the `<unk>` ID, or 0
    ///   when no `<unk>` entry exists.
    public func encode(_ text: String, addSpecialTokens: Bool = false) -> [Int] {
        // Normalize: lowercase + NFKC normalization (matches NeMo CTC models)
        let normalized = text.lowercased().precomposedStringWithCompatibilityMapping

        // Pre-tokenize: replace spaces with ▁ (sentencepiece style).
        // The leading ▁ guarantees `word` is never empty.
        let preprocessed = "▁" + normalized.replacingOccurrences(of: " ", with: "▁")

        // Split into characters
        var word = preprocessed.map { String($0) }

        // Apply BPE merges iteratively: on each pass, apply the lowest-rank
        // (highest-priority) merge found anywhere in the word.
        while word.count > 1 {
            var bestRank = Int.max
            var bestPair: (String, String)? = nil

            for i in 0..<(word.count - 1) {
                // O(1) rank lookup instead of scanning the merges list.
                if let rank = mergeRanks[word[i] + " " + word[i + 1]], rank < bestRank {
                    bestRank = rank
                    bestPair = (word[i], word[i + 1])
                }
            }

            // No more merges possible
            guard let (first, second) = bestPair else { break }

            // Apply the merge to ALL occurrences of the winning pair (standard BPE)
            var newWord: [String] = []
            newWord.reserveCapacity(word.count)
            var i = 0
            while i < word.count {
                if i < word.count - 1 && word[i] == first && word[i + 1] == second {
                    newWord.append(first + second)
                    i += 2  // Skip the next token since we merged it
                } else {
                    newWord.append(word[i])
                    i += 1
                }
            }
            word = newWord
        }

        // Convert tokens to IDs. Added (special) tokens take precedence over
        // the main vocabulary; anything unknown maps to <unk>, then 0.
        // (`map`, not `compactMap`: the fallback chain always yields a value.)
        return word.map { token in
            addedTokens[token] ?? vocab[token] ?? addedTokens["<unk>"] ?? vocab["<unk>"] ?? 0
        }
    }
}
Original file line number Diff line number Diff line change
@@ -1,15 +1,11 @@
import Foundation
@preconcurrency import Tokenizers

/// Type alias to disambiguate from local Tokenizer class
private typealias HFTokenizerProtocol = Tokenizers.Tokenizer

// MARK: - CTC Tokenizer

/// CTC tokenizer using HuggingFace tokenizer.json for accurate BPE tokenization.
/// This provides tokenization matching the original model training.
public final class CtcTokenizer: Sendable {
private let hfTokenizer: HFTokenizer
private let bpeTokenizer: BpeTokenizer

/// Errors that can occur during tokenizer initialization
public enum Error: Swift.Error, LocalizedError {
Expand Down Expand Up @@ -47,8 +43,12 @@ public final class CtcTokenizer: Sendable {
throw Error.tokenizerNotFound(modelDirectory)
}

let hfTokenizer = try await HFTokenizer(modelFolder: modelDirectory)
return CtcTokenizer(hfTokenizer: hfTokenizer)
do {
let bpeTokenizer = try BpeTokenizer.load(from: modelDirectory)
return CtcTokenizer(bpeTokenizer: bpeTokenizer)
} catch {
throw Error.initializationFailed(error)
}
}

/// Load the CTC tokenizer asynchronously using the default 110m model directory.
Expand All @@ -62,8 +62,8 @@ public final class CtcTokenizer: Sendable {
// MARK: - Private Init

/// Private initializer used by async factory method
private init(hfTokenizer: HFTokenizer) {
self.hfTokenizer = hfTokenizer
private init(bpeTokenizer: BpeTokenizer) {
self.bpeTokenizer = bpeTokenizer
}

// MARK: - Encoding/Decoding
Expand All @@ -73,7 +73,7 @@ public final class CtcTokenizer: Sendable {
/// - Parameter text: Text to encode
/// - Returns: Array of token IDs
public func encode(_ text: String) -> [Int] {
hfTokenizer.encode(text)
bpeTokenizer.encode(text, addSpecialTokens: false)
}

/// Get the CTC model directory path
Expand All @@ -92,46 +92,3 @@ public final class CtcTokenizer: Sendable {
.appendingPathComponent("parakeet-ctc-110m-coreml", isDirectory: true)
}
}

// MARK: - HuggingFace Tokenizer (Private Implementation)

/// HuggingFace tokenizer that loads tokenizer.json directly using swift-transformers.
/// This provides accurate BPE tokenization matching the original model training.
/// Marked Sendable because it's immutable after initialization.
private final class HFTokenizer: Sendable {
    private let tokenizer: any HFTokenizerProtocol

    /// Load tokenizer from a local model folder containing tokenizer.json
    ///
    /// Required files in folder:
    /// - tokenizer.json (main tokenizer data)
    /// - tokenizer_config.json (tokenizer settings)
    ///
    /// - Parameter modelFolder: URL to folder containing tokenizer files
    /// - Throws: `CtcTokenizer.Error.missingFile` when a required file is
    ///   absent, or `.initializationFailed` wrapping the underlying error.
    init(modelFolder: URL) async throws {
        // Fail fast if either required file is absent.
        for fileName in ["tokenizer.json", "tokenizer_config.json"] {
            let filePath = modelFolder.appendingPathComponent(fileName)
            guard FileManager.default.fileExists(atPath: filePath.path) else {
                throw CtcTokenizer.Error.missingFile(fileName, modelFolder)
            }
        }

        do {
            self.tokenizer = try await AutoTokenizer.from(modelFolder: modelFolder)
        } catch {
            throw CtcTokenizer.Error.initializationFailed(error)
        }
    }

    // MARK: - Encoding

    /// Encode text to token IDs without special tokens.
    func encode(_ text: String) -> [Int] {
        tokenizer.encode(text: text, addSpecialTokens: false)
    }
}
Loading