FluidInference · Alex-Wengg · Mar 25, 2026 · Mar 25, 2026 · Mar 25, 2026 · Mar 25, 2026
diff --git a/Documentation/API.md b/Documentation/API.md
@@ -83,7 +83,7 @@ Use `OfflineDiarizerManager` when you need offline DER parity or want to run the
 
 **Speaker Enrollment:** `enrollSpeaker(withAudio:sourceSampleRate:named:...)` feeds known-speaker audio before streaming to label a slot.
 
-**Lifecycle:** `reset()` clears streaming state but keeps the model loaded. `cleanup()` releases everything.
+**Lifecycle:** `finalizeSession()` flushes trailing context so that the last frame of real (non-padding) audio becomes finalized. `reset()` clears streaming state but keeps the model loaded. `cleanup()` releases everything.
 
 ---
 

diff --git a/Documentation/Diarization/DiarizerTimeline.md b/Documentation/Diarization/DiarizerTimeline.md
@@ -157,4 +157,4 @@ When `timeline.addChunk(_:)` is called internally by the diarizer:
 3. It iterates over all `DiarizerSpeaker` tracks, evaluating the boundaries (using `onsetThreshold` and `offsetThreshold`) to grow existing segments or spawn new ones.
 4. Tentative segments are cleared and rebuilt from the trailing `tentativePredictions` array during every streaming tick. 
 
-When the stream naturally finishes, the `Diarizer` automatically invokes `timeline.finalize()`, which flushes any remaining tentative segments up to finalized status and applies the `minFramesOn` deletion rules.
+When the stream naturally finishes, call `Diarizer.finalizeSession()`. The diarizer flushes trailing context first, then invokes `timeline.finalize()`, which promotes any remaining tentative segments to finalized status and applies the `minFramesOn` deletion rules.
diff --git a/Documentation/Diarization/LS-EEND.md b/Documentation/Diarization/LS-EEND.md
@@ -226,11 +226,14 @@ if let update = try diarizer.process() {
 // Convenience: add + process in one call
 if let update = try diarizer.process(samples: audioChunk) { ... }
 
-// Flush remaining frames at end of stream
+// Flush remaining frames at the end of a stream
 try diarizer.finalizeSession()
 let finalTimeline = diarizer.timeline
 ```
 
+Notes:
+- `finalizeSession()` flushes the remaining audio by padding the end with silence.
+
 ### Speaker Enrollment
 
 Use speaker enrollment to warm LS-EEND with a known speaker before the live stream starts. Enrollment keeps the active streaming session, resets the visible timeline back to frame 0, and preserves the speaker name inside the `DiarizerTimeline`.
@@ -288,6 +291,7 @@ Real-world integration testing with 4-speaker audio reveals specific enrollment
 ### Lifecycle
 
 ```swift
+try diarizer.finalizeSession() // Flush trailing context before reading final output
 diarizer.reset()     // Reset streaming state for a new audio stream (keeps model loaded)
 diarizer.cleanup()   // Release all resources including the loaded model
 ```

diff --git a/Documentation/Diarization/Sortformer.md b/Documentation/Diarization/Sortformer.md
@@ -368,6 +368,10 @@ public struct SortformerSegment {
 │         └─→ timeline.addChunk(result)                          │
 │             └─→ Update segments per speaker                    │
 │                                                                │
+│  3. finalizeSession()                                          │
+│     └─→ pad trailing silence until last true frame is emitted  │
+│     └─→ timeline.finalize()                                    │
+│                                                                │
 └────────────────────────────────────────────────────────────────┘
 ```
 
@@ -444,6 +448,8 @@ audioEngine.installTap { buffer in
         updateSpeakerDisplay(diarizer.timeline)
     }
 }
+
+try diarizer.finalizeSession()
 ```
 
 ### Batch Processing
@@ -466,6 +472,8 @@ for (index, speaker) in timeline.speakers {
 }
 ```
 
+`finalizeSession()` is only needed for streaming mode. It pads enough trailing silence to flush Sortformer's right-context preview frames, then finalizes the timeline so `numTentativeFrames == 0`.
+
 ### Speaker Enrollment
 
 Use speaker enrollment to warm Sortformer with known speakers before live audio starts. Enrollment preserves the speaker cache / FIFO state, resets the visible timeline, and keeps the speaker name in the `DiarizerTimeline`.

diff --git a/Sources/FluidAudio/Diarizer/DiarizerTimeline.swift b/Sources/FluidAudio/Diarizer/DiarizerTimeline.swift
@@ -506,6 +506,9 @@ public struct DiarizerSegment: Sendable, Identifiable, Comparable, Equatable {
     /// Duration of one frame in seconds
     public let frameDurationSeconds: Float
 
+    /// Confidence in this speech segment: the mean diarizer speech probability over the segment's active frames (defaults to 0)
+    public var confidence: Float = 0.0
+
     /// Start time in seconds
     public var startTime: Float { Float(startFrame) * frameDurationSeconds }
 
@@ -523,29 +526,33 @@ public struct DiarizerSegment: Sendable, Identifiable, Comparable, Equatable {
         startFrame: Int,
         endFrame: Int,
         finalized: Bool = true,
-        frameDurationSeconds: Float
+        frameDurationSeconds: Float,
+        confidence: Float = 0
     ) {
         self.id = UUID()
         self.speakerIndex = speakerIndex
         self.startFrame = startFrame
         self.endFrame = endFrame
         self.isFinalized = finalized
         self.frameDurationSeconds = frameDurationSeconds
+        self.confidence = confidence
     }
 
     public init(
         speakerIndex: Int,
         startTime: Float,
         endTime: Float,
         finalized: Bool = true,
-        frameDurationSeconds: Float
+        frameDurationSeconds: Float,
+        confidence: Float = 0
     ) {
         self.id = UUID()
         self.speakerIndex = speakerIndex
         self.startFrame = Int(round(startTime / frameDurationSeconds))
         self.endFrame = Int(round(endTime / frameDurationSeconds))
         self.isFinalized = finalized
         self.frameDurationSeconds = frameDurationSeconds
+        self.confidence = confidence
     }
 
     /// Check if this overlaps with another segment
@@ -639,6 +646,13 @@ public struct DiarizerChunkResult: Sendable {
 /// Generalizes `SortformerTimeline` for any frame-based diarizer. Works with
 /// both Sortformer (fixed 4 speakers) and LS-EEND (variable speaker count).
 public final class DiarizerTimeline {
+    private struct ClosedSegmentStats {
+        var start: Int
+        var end: Int
+        var activitySum: Float
+        var activeFrameCount: Int
+    }
+
     public enum KeptOnReset {
         case nothing
         case namedSpeakers
@@ -650,15 +664,21 @@ public final class DiarizerTimeline {
     private struct StreamingState {
         var startFrame: Int
         var isSpeaking: Bool
-        var lastSegment: (start: Int, end: Int)
+        var activitySum: Float
+        var activeFrameCount: Int
+        var lastSegment: ClosedSegmentStats?
 
         init(
             startFrame: Int = 0,
             isSpeaking: Bool = false,
-            lastSegment: (start: Int, end: Int) = (-1, -1)
+            activitySum: Float = 0,
+            activeFrameCount: Int = 0,
+            lastSegment: ClosedSegmentStats? = nil
         ) {
             self.startFrame = startFrame
             self.isSpeaking = isSpeaking
+            self.activitySum = activitySum
+            self.activeFrameCount = activeFrameCount
             self.lastSegment = lastSegment
         }
     }
@@ -688,6 +708,11 @@ public final class DiarizerTimeline {
         queue.sync { _tentativePredictions.count / speakerCapacity }
     }
 
+    /// Total number of frames (finalized + tentative)
+    public var numFrames: Int {
+        queue.sync { _numFinalizedFrames + _tentativePredictions.count / speakerCapacity }
+    }
+
     /// Speakers in the timeline
     public var speakers: [Int: DiarizerSpeaker] {
         get { queue.sync { _speakers } }
@@ -706,6 +731,7 @@ public final class DiarizerTimeline {
         }
     }
 
+    /// Whether the timeline has any segments
     public var hasSegments: Bool {
         speakers.values.contains(where: \.hasSegments)
     }
@@ -715,6 +741,16 @@ public final class DiarizerTimeline {
         Float(numFinalizedFrames) * config.frameDurationSeconds
     }
 
+    /// Duration of tentative predictions in seconds
+    public var tentativeDuration: Float {
+        Float(numTentativeFrames) * config.frameDurationSeconds
+    }
+
+    /// Duration of all predictions (finalized + tentative) in seconds
+    public var duration: Float {
+        Float(numFrames) * config.frameDurationSeconds
+    }
+
     /// Maximum number of speakers
     public var speakerCapacity: Int {
         config.numSpeakers
@@ -1103,42 +1139,64 @@ public final class DiarizerTimeline {
 
             var start = state.startFrame
             var speaking = state.isSpeaking
+            var activitySum = state.activitySum
+            var activeFrameCount = state.activeFrameCount
             var lastSegment = state.lastSegment
             var wasLastSegmentFinal = isFinalized
 
             for i in 0..<numFrames {
                 let index = speakerIndex + i * numSpeakers
+                let activity = predictions[index]
 
                 if speaking {
-                    if predictions[index] >= offset {
+                    if activity >= offset {
+                        activitySum += activity
+                        activeFrameCount += 1
                         continue
                     }
 
                     speaking = false
                     let end = frameOffset + i + padOffset
 
-                    guard end - start > minFramesOn else { continue }
+                    guard end - start > minFramesOn else {
+                        activitySum = 0
+                        activeFrameCount = 0
+                        continue
+                    }
 
                     wasLastSegmentFinal = isFinalized && (end < tentativeStartFrame)
+                    let confidence = activeFrameCount > 0 ? (activitySum / Float(activeFrameCount)) : 0
 
                     let newSegment = DiarizerSegment(
                         speakerIndex: speakerIndex,
                         startFrame: start,
                         endFrame: end,
                         finalized: wasLastSegmentFinal,
-                        frameDurationSeconds: frameDuration
+                        frameDurationSeconds: frameDuration,
+                        confidence: confidence
                     )
 
                     provideSpeaker(forSlot: speakerIndex).append(newSegment)
 
-                    lastSegment = (start, end)
+                    lastSegment = ClosedSegmentStats(
+                        start: start,
+                        end: end,
+                        activitySum: activitySum,
+                        activeFrameCount: activeFrameCount
+                    )
+                    activitySum = 0
+                    activeFrameCount = 0
 
-                } else if predictions[index] > onset {
+                } else if activity > onset {
                     start = max(0, frameOffset + i - padOnset)
                     speaking = true
+                    activitySum = activity
+                    activeFrameCount = 1
 
-                    if start - lastSegment.end <= minFramesOff {
+                    if let lastSegment, start - lastSegment.end <= minFramesOff {
                         start = lastSegment.start
+                        activitySum += lastSegment.activitySum
+                        activeFrameCount += lastSegment.activeFrameCount
                         _speakers[speakerIndex]?.popLast(fromFinalized: wasLastSegmentFinal)
                     }
                 }
@@ -1147,18 +1205,22 @@ public final class DiarizerTimeline {
             if isFinalized {
                 states[speakerIndex].startFrame = start
                 states[speakerIndex].isSpeaking = speaking
+                states[speakerIndex].activitySum = activitySum
+                states[speakerIndex].activeFrameCount = activeFrameCount
                 states[speakerIndex].lastSegment = lastSegment
             }
 
             if addTrailingTentative {
                 let end = frameOffset + numFrames + padOffset
                 if speaking && (end > start) {
+                    let confidence = activeFrameCount > 0 ? (activitySum / Float(activeFrameCount)) : 0
                     let newSegment = DiarizerSegment(
                         speakerIndex: speakerIndex,
                         startFrame: start,
                         endFrame: end,
                         finalized: false,
-                        frameDurationSeconds: frameDuration
+                        frameDurationSeconds: frameDuration,
+                        confidence: confidence
                     )
                     provideSpeaker(forSlot: speakerIndex).appendTentative(newSegment)
                 }
-Original file line number
+Diff line change
@@ Expand Up @@
     **Speaker Enrollment:** `enrollSpeaker(withAudio:sourceSampleRate:named:...)` feeds known-speaker audio before streaming to label a slot.
-    **Lifecycle:** `reset()` clears streaming state but keeps the model loaded. `cleanup()` releases everything.
+    **Lifecycle:** `finalizeSession()` flushes trailing context so that the last frame of real (non-padding) audio becomes finalized. `reset()` clears streaming state but keeps the model loaded. `cleanup()` releases everything.
     ---
@@ Expand Down @@