Skip to content

Commit 9fb3b12

Browse files
committed
feat(app): break same-speaker blocks at natural silence gaps (>2s)
Prevents long monologs from merging into a single giant block by breaking at pauses longer than 2 seconds. This produces natural paragraph-like blocks even when only one speaker is present (e.g. mic-only diarization in dual-source mode).
1 parent 80ec14b commit 9fb3b12

File tree

2 files changed

+57
-3
lines changed

2 files changed

+57
-3
lines changed

app/MeetingTranscriber/Sources/DiarizationProcess.swift

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -129,17 +129,22 @@ enum DiarizationProcess {
129129
)
130130
}
131131

132+
/// Maximum silence gap (seconds) before breaking a same-speaker block.
133+
/// Pauses longer than this start a new paragraph even for the same speaker; a gap exactly equal to the threshold still merges.
134+
static let mergeGapThreshold: TimeInterval = 2.0
135+
132136
/// Merge consecutive segments from the same speaker into single blocks.
133137
/// Preserves the start timestamp of the first segment and end timestamp of the last.
134-
/// Text is joined with spaces.
138+
/// Text is joined with spaces. A silence gap > `mergeGapThreshold` forces a break.
135139
static func mergeConsecutiveSpeakers(
136140
_ segments: [TimestampedSegment],
137141
) -> [TimestampedSegment] {
138142
guard var current = segments.first else { return [] }
139143

140144
var merged: [TimestampedSegment] = []
141145
for seg in segments.dropFirst() {
142-
if seg.speaker == current.speaker {
146+
let silenceGap = seg.start - current.end
147+
if seg.speaker == current.speaker, silenceGap <= mergeGapThreshold {
143148
current = TimestampedSegment(
144149
start: current.start,
145150
end: seg.end,

app/MeetingTranscriber/Tests/DiarizationProcessTests.swift

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
@testable import MeetingTranscriber
22
import XCTest
33

4-
final class DiarizationProcessTests: XCTestCase {
4+
final class DiarizationProcessTests: XCTestCase { // swiftlint:disable:this type_body_length
55
// MARK: - Speaker Assignment
66

77
func testAssignSpeakers() {
@@ -349,6 +349,55 @@ final class DiarizationProcessTests: XCTestCase {
349349
XCTAssertEqual(merged.count, 3)
350350
}
351351

352+
/// Two segments from the same speaker separated by more than 2s of silence
/// must stay as two separate blocks.
func testMergeConsecutiveSpeakers_silenceGapBreaksSameSpeaker() {
    // Same speaker, 3s of silence between the segments (5s -> 8s) — should NOT merge.
    let segments = [
        TimestampedSegment(start: 0, end: 5, text: "First thought.", speaker: "Alice"),
        TimestampedSegment(start: 8, end: 13, text: "Second thought.", speaker: "Alice"),
    ]
    let merged = DiarizationProcess.mergeConsecutiveSpeakers(segments)
    XCTAssertEqual(merged.count, 2, "Silence gap >2s should break same-speaker block")
    // Compare the whole text array rather than subscripting: a failed count
    // assertion does not stop the test, and `merged[1]` would trap (taking the
    // test process down) if the segments were wrongly merged into one block.
    XCTAssertEqual(merged.map(\.text), ["First thought.", "Second thought."])
}
363+
364+
/// Two segments from the same speaker separated by less than 2s of silence
/// are merged into one block whose text is joined with a space.
func testMergeConsecutiveSpeakers_smallGapStillMerges() {
    // Same speaker with a 1s gap (5s -> 6s) — should merge.
    let segments = [
        TimestampedSegment(start: 0, end: 5, text: "Hello.", speaker: "Alice"),
        TimestampedSegment(start: 6, end: 10, text: "How are you?", speaker: "Alice"),
    ]
    let merged = DiarizationProcess.mergeConsecutiveSpeakers(segments)
    XCTAssertEqual(merged.count, 1, "Small gap should still merge")
    // Array comparison avoids an out-of-range trap on `merged[0]` should the
    // result ever be empty, and reports every block's text on failure.
    XCTAssertEqual(merged.map(\.text), ["Hello. How are you?"])
}
374+
375+
/// Boundary case: a pause exactly as long as the threshold does not split the block.
func testMergeConsecutiveSpeakers_exactThresholdMerges() {
    // The first segment ends at 5s and the next starts at 7s, i.e. a 2.0s gap.
    let opening = TimestampedSegment(start: 0, end: 5, text: "A.", speaker: "Alice")
    let followUp = TimestampedSegment(start: 7, end: 10, text: "B.", speaker: "Alice")

    let blocks = DiarizationProcess.mergeConsecutiveSpeakers([opening, followUp])

    XCTAssertEqual(blocks.count, 1, "Gap exactly at threshold should merge")
}
384+
385+
/// A single-speaker monolog is split into paragraph-like blocks wherever the
/// speaker pauses for more than 2s, while shorter pauses keep segments together.
func testMergeConsecutiveSpeakers_longMonologBrokenByPauses() {
    // Gaps between consecutive segments: 0.5s, 3s, 0.5s, 5s.
    // Only the 3s and 5s pauses exceed the threshold, so three blocks are expected.
    let segments = [
        TimestampedSegment(start: 0, end: 10, text: "First paragraph.", speaker: "Roman"),
        TimestampedSegment(start: 10.5, end: 20, text: "Still first.", speaker: "Roman"),
        TimestampedSegment(start: 23, end: 30, text: "Second paragraph.", speaker: "Roman"),
        TimestampedSegment(start: 30.5, end: 40, text: "Still second.", speaker: "Roman"),
        TimestampedSegment(start: 45, end: 55, text: "Third paragraph.", speaker: "Roman"),
    ]
    let merged = DiarizationProcess.mergeConsecutiveSpeakers(segments)
    XCTAssertEqual(merged.count, 3, "Long monolog should break at natural pauses")
    // Compare the whole text array rather than subscripting: if fewer than three
    // blocks come back, `merged[1]` / `merged[2]` would trap instead of failing.
    XCTAssertEqual(
        merged.map(\.text),
        [
            "First paragraph. Still first.",
            "Second paragraph. Still second.",
            "Third paragraph.",
        ]
    )
}
400+
352401
func testDiarizationErrorDescription() {
353402
let error: DiarizationError = .notAvailable
354403
XCTAssertEqual(error.errorDescription, "Diarization not available")

0 commit comments

Comments
 (0)