Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Documentation/API.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ Use `OfflineDiarizerManager` when you need offline DER parity or want to run the

**Speaker Enrollment:** `enrollSpeaker(withAudio:sourceSampleRate:named:...)` feeds known-speaker audio before streaming to label a slot.

**Lifecycle:** `reset()` clears streaming state but keeps the model loaded. `cleanup()` releases everything.
**Lifecycle:** `finalizeSession()` flushes trailing context so the last true frame becomes finalized. `reset()` clears streaming state but keeps the model loaded. `cleanup()` releases everything.

---

Expand Down
2 changes: 1 addition & 1 deletion Documentation/Diarization/DiarizerTimeline.md
Original file line number Diff line number Diff line change
Expand Up @@ -157,4 +157,4 @@ When `timeline.addChunk(_:)` is called internally by the diarizer:
3. It iterates over all `DiarizerSpeaker` tracks, evaluating the boundaries (using `onsetThreshold` and `offsetThreshold`) to grow existing segments or spawn new ones.
4. Tentative segments are cleared and rebuilt from the trailing `tentativePredictions` array during every streaming tick.

When the stream naturally finishes, the `Diarizer` automatically invokes `timeline.finalize()`, which flushes any remaining tentative segments up to finalized status and applies the `minFramesOn` deletion rules.
When the stream naturally finishes, call `Diarizer.finalizeSession()`. The diarizer flushes trailing context first, then invokes `timeline.finalize()`, which promotes any remaining tentative segments to finalized status and applies the `minFramesOn` deletion rules.
6 changes: 5 additions & 1 deletion Documentation/Diarization/LS-EEND.md
Original file line number Diff line number Diff line change
Expand Up @@ -226,11 +226,14 @@ if let update = try diarizer.process() {
// Convenience: add + process in one call
if let update = try diarizer.process(samples: audioChunk) { ... }

// Flush remaining frames at end of stream
// Flush remaining committed + preview frames at end of stream
try diarizer.finalizeSession()
let finalTimeline = diarizer.timeline
```

Notes:
- `finalizeSession()` pads the tail with silence when needed so the last true frame is emitted as finalized output before the timeline is finalized.

### Speaker Enrollment

Use speaker enrollment to warm LS-EEND with a known speaker before the live stream starts. Enrollment keeps the active streaming session, resets the visible timeline back to frame 0, and preserves the speaker name inside the `DiarizerTimeline`.
Expand Down Expand Up @@ -288,6 +291,7 @@ Real-world integration testing with 4-speaker audio reveals specific enrollment
### Lifecycle

```swift
try diarizer.finalizeSession() // Flush trailing context before reading final output
diarizer.reset() // Reset streaming state for a new audio stream (keeps model loaded)
diarizer.cleanup() // Release all resources including the loaded model
```
Expand Down
8 changes: 8 additions & 0 deletions Documentation/Diarization/Sortformer.md
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,10 @@ public struct SortformerSegment {
│ └─→ timeline.addChunk(result) │
│ └─→ Update segments per speaker │
│ │
│ 3. finalizeSession() │
│ └─→ pad trailing silence until last true frame is emitted │
│ └─→ timeline.finalize() │
│ │
└────────────────────────────────────────────────────────────────┘
```

Expand Down Expand Up @@ -444,6 +448,8 @@ audioEngine.installTap { buffer in
updateSpeakerDisplay(diarizer.timeline)
}
}

try diarizer.finalizeSession()
```

### Batch Processing
Expand All @@ -466,6 +472,8 @@ for (index, speaker) in timeline.speakers {
}
```

`finalizeSession()` is only needed for streaming mode. It pads enough trailing silence to flush Sortformer's right-context preview frames, then finalizes the timeline so `numTentativeFrames == 0`.

### Speaker Enrollment

Use speaker enrollment to warm Sortformer with known speakers before live audio starts. Enrollment preserves the speaker cache / FIFO state, resets the visible timeline, and keeps the speaker name in the `DiarizerTimeline`.
Expand Down
16 changes: 16 additions & 0 deletions Sources/FluidAudio/Diarizer/DiarizerTimeline.swift
Original file line number Diff line number Diff line change
Expand Up @@ -688,6 +688,11 @@ public final class DiarizerTimeline {
queue.sync { _tentativePredictions.count / speakerCapacity }
}

/// Total number of frames (finalized + tentative)
public var numFrames: Int {
queue.sync { _numFinalizedFrames + _tentativePredictions.count / speakerCapacity }
}

/// Speakers in the timeline
public var speakers: [Int: DiarizerSpeaker] {
get { queue.sync { _speakers } }
Expand All @@ -706,6 +711,7 @@ public final class DiarizerTimeline {
}
}

/// Whether the timeline has any segments
public var hasSegments: Bool {
speakers.values.contains(where: \.hasSegments)
}
Expand All @@ -715,6 +721,16 @@ public final class DiarizerTimeline {
Float(numFinalizedFrames) * config.frameDurationSeconds
}

/// Duration of tentative predictions in seconds
public var tentativeDuration: Float {
Float(numTentativeFrames) * config.frameDurationSeconds
}

/// Duration of all predictions (finalized + tentative) in seconds
public var duration: Float {
Float(numFrames) * config.frameDurationSeconds
}

/// Maximum number of speakers
public var speakerCapacity: Int {
config.numSpeakers
Expand Down
37 changes: 16 additions & 21 deletions Sources/FluidAudio/Diarizer/LS-EEND/LSEENDDiarizerAPI.swift
Original file line number Diff line number Diff line change
Expand Up @@ -406,10 +406,7 @@ public final class LSEENDDiarizer: Diarizer {
return try processLocked()
}

/// Process a chunk of audio in one call.
///
/// Convenience method that combines `addAudio()` and `process()`.
///
/// Add and process a chunk of audio in one call.
/// - Parameters:
/// - samples: Audio samples to process.
/// - sourceSampleRate: Sample rate of `samples`, or `nil` if already at the model rate.
Expand Down Expand Up @@ -678,14 +675,15 @@ public final class LSEENDDiarizer: Diarizer {
defer { lock.unlock() }

guard let engine = _engine, let session = _session else { return nil }
let numSpeakers = engine.metadata.realOutputDim
var lastResult: DiarizerChunkResult?

// Flush pending audio first — clear unconditionally so failed audio isn't retained.
// Using defer + direct pass avoids a CoW copy.
if !pendingAudio.isEmpty {
defer { pendingAudio.removeAll(keepingCapacity: true) }
let pushedUpdate = try session.pushAudio(pendingAudio)
if let update = pushedUpdate {
let numSpeakers = engine.metadata.realOutputDim
let flushedResult = DiarizerChunkResult(
startFrame: _numFramesProcessed,
finalizedPredictions: flattenRowMajor(update.probabilities, numSpeakers: numSpeakers),
Expand All @@ -695,29 +693,26 @@ public final class LSEENDDiarizer: Diarizer {
)
_numFramesProcessed += flushedResult.finalizedFrameCount
try _timeline.addChunk(flushedResult)
lastResult = flushedResult
}
}

guard let finalUpdate = try session.finalize() else {
_session = nil
_timeline.finalize()
return nil
if let finalUpdate = try session.finalize() {
let finalResult = DiarizerChunkResult(
startFrame: _numFramesProcessed,
finalizedPredictions: flattenRowMajor(finalUpdate.probabilities, numSpeakers: numSpeakers),
finalizedFrameCount: finalUpdate.probabilities.rows,
tentativePredictions: [],
tentativeFrameCount: 0
)
_numFramesProcessed += finalResult.finalizedFrameCount
try _timeline.addChunk(finalResult)
lastResult = finalResult
}

let numSpeakers = engine.metadata.realOutputDim
let result = DiarizerChunkResult(
startFrame: _numFramesProcessed,
finalizedPredictions: flattenRowMajor(finalUpdate.probabilities, numSpeakers: numSpeakers),
finalizedFrameCount: finalUpdate.probabilities.rows,
tentativePredictions: [],
tentativeFrameCount: 0
)
_numFramesProcessed += result.finalizedFrameCount
try _timeline.addChunk(result)
_timeline.finalize()
_session = nil

return result
return lastResult
}

// MARK: - Private
Expand Down
47 changes: 34 additions & 13 deletions Sources/FluidAudio/Diarizer/LS-EEND/LSEENDModelInference.swift
Original file line number Diff line number Diff line change
Expand Up @@ -208,17 +208,11 @@ public final class LSEENDInferenceHelper {
/// - Returns: Complete inference result with logits and probabilities.
public func infer(samples: [Float], sampleRate: Int) throws -> LSEENDInferenceResult {
let normalizedAudio = try resampleIfNeeded(samples: samples, sampleRate: sampleRate)
let features = try offlineFeatureExtractor.extractFeatures(audio: normalizedAudio)
let session = try createSession(inputSampleRate: targetSampleRate)
session.totalInputSamples = normalizedAudio.count
let committed = try session.ingestFeatures(features)
let pending = session.totalFeatureFrames - session.emittedFrames
let tail =
try pending > 0
? session.flushTail(from: session.state, pendingFrames: pending) : .empty(columns: decodeMaxSpeakers)
let fullLogits = committed.appendingRows(tail)
session.fullLogitChunks = fullLogits.isEmpty ? [] : [fullLogits]
session.emittedFrames = fullLogits.rows
if !normalizedAudio.isEmpty {
_ = try session.pushAudio(normalizedAudio)
}
_ = try session.finalize()
return session.snapshot()
}

Expand Down Expand Up @@ -518,13 +512,40 @@ public final class LSEENDStreamingSession {
guard !finalized else {
return nil
}
let features = try featureExtractor.finalize()
let committed = try ingestFeatures(features)

var committedFullLogits = LSEENDMatrix.empty(columns: engine.decodeMaxSpeakers)
let targetEndFrame = Int(
round(Double(totalInputSamples) / Double(max(inputSampleRate, 1)) * engine.modelFrameHz))
let exactPaddingSamples = exactFinalizationPaddingSamples(targetEndFrame: targetEndFrame)
if exactPaddingSamples > 0 {
let features = try featureExtractor.pushAudio([Float](repeating: 0, count: exactPaddingSamples))
let committed = try ingestFeatures(features)
if committed.rows > 0 {
committedFullLogits = committedFullLogits.appendingRows(committed)
}
}

let finalFeatures = try featureExtractor.finalize()
let finalCommitted = try ingestFeatures(finalFeatures)
if finalCommitted.rows > 0 {
committedFullLogits = committedFullLogits.appendingRows(finalCommitted)
}

let pending = totalFeatureFrames - emittedFrames
let tail =
try pending > 0 ? flushTail(from: state, pendingFrames: pending) : .empty(columns: engine.decodeMaxSpeakers)
emittedFrames += tail.rows
finalized = true
return try buildUpdate(committedFullLogits: committed.appendingRows(tail), includePreview: false)
return try buildUpdate(committedFullLogits: committedFullLogits.appendingRows(tail), includePreview: false)
}

private func exactFinalizationPaddingSamples(targetEndFrame: Int) -> Int {
guard targetEndFrame > 0 else {
return 0
}
let stableBlockSize = engine.metadata.resolvedHopLength * engine.metadata.resolvedSubsampling
let requiredTotalSamples = targetEndFrame * stableBlockSize
return max(0, requiredTotalSamples - totalInputSamples)
}

/// Assembles the full inference result from all committed frames emitted so far.
Expand Down
Loading
Loading