Skip to content

Commit 3d777da

Browse files
Add unit tests for transcript processing
- Test timeline building with various intervals and edge cases
- Test filtering of [Music] and [Applause] markers
- Test timestamp parsing and formatting utilities
- Test roundtrip conversions and precision handling
1 parent 389d9bf commit 3d777da

File tree

1 file changed

+128
-0
lines changed

1 file changed

+128
-0
lines changed
Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
import unittest
2+
from transcripts import build_timeline_from_transcript, parse_timestamp_to_seconds, format_duration
3+
4+
5+
class TestTimelineBuilder(unittest.TestCase):
    """Checks on the timeline text that build_timeline_from_transcript produces for chapter generation."""

    def test_build_timeline_basic(self):
        """A 30s interval yields the 0s and 30s labels plus the speech near each."""
        segments = [
            {'text': 'Hello everyone', 'start': 0, 'duration': 2},
            {'text': 'Welcome to the show', 'start': 2, 'duration': 3},
            {'text': 'Today we are talking about OpenTelemetry', 'start': 35, 'duration': 4},
            {'text': 'It is very exciting', 'start': 39, 'duration': 2},
        ]

        rendered = build_timeline_from_transcript(segments, sample_interval=30)

        # Labels for the two sample points first, then text expected in the output.
        for expected in ('[00:00:00]', '[00:00:30]', 'Hello everyone', 'OpenTelemetry'):
            self.assertIn(expected, rendered)

    def test_build_timeline_filters_music(self):
        """Bracketed non-speech markers are dropped while real speech is kept."""
        segments = [
            {'text': 'Hello everyone', 'start': 0, 'duration': 2},
            {'text': '[Music]', 'start': 2, 'duration': 5},
            {'text': '[Applause]', 'start': 7, 'duration': 3},
            # Starts at 28s so that it falls inside the ±5s window around the 30s sample.
            {'text': 'Welcome back', 'start': 28, 'duration': 2},
        ]

        rendered = build_timeline_from_transcript(segments, sample_interval=30)

        # Markers must be absent from the output...
        for marker in ('[Music]', '[Applause]'):
            self.assertNotIn(marker, rendered)
        # ...but spoken text must survive.
        for spoken in ('Hello everyone', 'Welcome back'):
            self.assertIn(spoken, rendered)

    def test_build_timeline_empty_transcript(self):
        """Both an empty list and None produce an empty timeline string."""
        for nothing in ([], None):
            rendered = build_timeline_from_transcript(nothing, sample_interval=30)
            self.assertEqual(rendered, "")

    def test_build_timeline_custom_interval(self):
        """A 10s interval produces a label at each of 0s, 10s, 20s and 30s."""
        segments = [
            {'text': 'Start', 'start': 0, 'duration': 1},
            {'text': 'At 10 seconds', 'start': 10, 'duration': 2},
            {'text': 'At 20 seconds', 'start': 20, 'duration': 2},
            {'text': 'At 30 seconds', 'start': 30, 'duration': 2},
        ]

        rendered = build_timeline_from_transcript(segments, sample_interval=10)

        for label in ('[00:00:00]', '[00:00:10]', '[00:00:20]', '[00:00:30]'):
            self.assertIn(label, rendered)

    def test_build_timeline_preserves_precision(self):
        """Segments starting off the 30s grid still show up next to the sample labels."""
        segments = [
            {'text': 'Starting at 3 seconds', 'start': 3, 'duration': 2},
            {'text': 'Now at 32 seconds', 'start': 32, 'duration': 2},
            {'text': 'And at 91 seconds', 'start': 91, 'duration': 2},
        ]

        # Expected sampling with a 30s interval and a ±5s window:
        #   0s  sample (window 0-5)   -> segment starting at 3s
        #   30s sample (window 25-35) -> segment starting at 32s
        #   90s sample (window 85-95) -> segment starting at 91s
        rendered = build_timeline_from_transcript(segments, sample_interval=30)

        # The sample-point labels are present...
        for label in ('[00:00:00]', '[00:00:30]', '[00:01:30]'):
            self.assertIn(label, rendered)
        # ...and so is a fragment of the text captured at each of them.
        for fragment in ('3 seconds', '32 seconds', '91 seconds'):
            self.assertIn(fragment, rendered)
92+
93+
94+
class TestTimestampParsing(unittest.TestCase):
    """Test timestamp parsing and formatting functions.

    Every case runs inside ``self.subTest`` so that one failing input does
    not hide the remaining ones: each failure is reported on its own,
    labelled with the input that produced it.
    """

    def test_parse_timestamp_to_seconds(self):
        """Test converting timestamp strings to seconds."""
        cases = [
            # HH:MM:SS format
            ('00:00:00', 0),
            ('00:01:00', 60),
            ('00:05:30', 330),
            ('01:00:00', 3600),
            ('01:23:45', 5025),
            # MM:SS format
            ('05:30', 330),
            ('1:00', 60),
        ]
        for timestamp, expected_seconds in cases:
            with self.subTest(timestamp=timestamp):
                self.assertEqual(parse_timestamp_to_seconds(timestamp), expected_seconds)

    def test_format_duration(self):
        """Test formatting seconds into HH:MM:SS."""
        cases = [
            (0, '00:00:00'),
            (60, '00:01:00'),
            (330, '00:05:30'),
            (3600, '01:00:00'),
            (5025, '01:23:45'),
        ]
        for seconds, expected_text in cases:
            with self.subTest(seconds=seconds):
                self.assertEqual(format_duration(seconds), expected_text)

    def test_timestamp_roundtrip(self):
        """Test that parsing and formatting are inverse operations."""
        timestamps = ['00:00:00', '00:05:30', '01:23:45', '02:00:00']
        for ts in timestamps:
            with self.subTest(timestamp=ts):
                seconds = parse_timestamp_to_seconds(ts)
                formatted = format_duration(seconds)
                self.assertEqual(formatted, ts)
124+
125+
126+
if __name__ == '__main__':
    # Executed as a script rather than imported: hand control to unittest's runner.
    unittest.main()
128+

0 commit comments

Comments
 (0)