-
Notifications
You must be signed in to change notification settings - Fork 243
224 lines (193 loc) · 11.5 KB
/
diarizer-benchmark.yml
File metadata and controls
224 lines (193 loc) · 11.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
name: Diarizer Performance Benchmark

on:
  pull_request:
    branches: [main]
    types: [opened, synchronize, reopened]

# Cancel superseded runs for the same ref so only the latest push is benchmarked.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  benchmark:
    name: Single File Performance Benchmark
    runs-on: macos-15
    permissions:
      contents: read
      pull-requests: write  # needed to post/update the results comment
    steps:
      - name: Checkout code
        uses: actions/checkout@v5

      - uses: swift-actions/setup-swift@v2
        with:
          swift-version: "6.1"

      # Build cache keyed on the package manifest plus the model-registry
      # sources, since those determine which models the build pulls in.
      - name: Cache Swift packages and build
        uses: actions/cache@v4
        with:
          path: |
            .build
            ~/Library/Caches/org.swift.swiftpm
          key: ${{ runner.os }}-diarizer-${{ hashFiles('Package.swift', 'Sources/FluidAudio/ModelRegistry.swift', 'Sources/FluidAudio/ModelNames.swift') }}

      - name: Cache Diarizer models
        uses: actions/cache@v4
        with:
          path: ~/Library/Application Support/FluidAudio/Models/speaker-diarization-coreml
          key: ${{ runner.os }}-diarizer-models-${{ hashFiles('Sources/FluidAudio/ModelRegistry.swift', 'Sources/FluidAudio/ModelNames.swift') }}

      - name: Cache AMI dataset
        uses: actions/cache@v4
        with:
          path: ~/FluidAudioDatasets/ami_official
          key: ${{ runner.os }}-ami-dataset-${{ hashFiles('Sources/FluidAudioCLI/DatasetParsers/DatasetDownloader.swift') }}

      - name: Build package
        run: swift build -c release

      - name: Run Single File Benchmark
        id: benchmark
        run: |
          echo "🚀 Running single file benchmark..."

          # Record start time
          BENCHMARK_START=$(date +%s)

          swift run fluidaudiocli diarization-benchmark --auto-download --single-file ES2004a --output benchmark_results.json

          # Check if results file was generated
          if [ -f benchmark_results.json ]; then
            echo "SUCCESS=true" >> "$GITHUB_OUTPUT"
          else
            echo "❌ Benchmark failed - no results file generated"
            echo "SUCCESS=false" >> "$GITHUB_OUTPUT"
          fi

          # Calculate execution time
          BENCHMARK_END=$(date +%s)
          EXECUTION_TIME=$((BENCHMARK_END - BENCHMARK_START))
          EXECUTION_MINS=$((EXECUTION_TIME / 60))
          EXECUTION_SECS=$((EXECUTION_TIME % 60))
          echo "EXECUTION_TIME=${EXECUTION_MINS}m ${EXECUTION_SECS}s" >> "$GITHUB_OUTPUT"
        timeout-minutes: 35

      - name: Show benchmark_results.json
        if: always()
        run: |
          echo "--- benchmark_results.json ---"
          cat benchmark_results.json || echo "benchmark_results.json not found"
          echo "-----------------------------"

      - name: Extract benchmark metrics with jq
        id: extract
        run: |
          # The output is now an array, so we need to access the first element
          DER=$(jq '.[0].der' benchmark_results.json)
          JER=$(jq '.[0].jer' benchmark_results.json)
          RTF=$(jq '.[0].rtfx' benchmark_results.json)
          DURATION="1049" # ES2004a duration in seconds
          SPEAKER_COUNT=$(jq '.[0].detectedSpeakers' benchmark_results.json)

          # Extract detailed timing information
          TOTAL_TIME=$(jq '.[0].timings.totalProcessingSeconds' benchmark_results.json)
          MODEL_DOWNLOAD_TIME=$(jq '.[0].timings.modelDownloadSeconds' benchmark_results.json)
          MODEL_COMPILE_TIME=$(jq '.[0].timings.modelCompilationSeconds' benchmark_results.json)
          AUDIO_LOAD_TIME=$(jq '.[0].timings.audioLoadingSeconds' benchmark_results.json)
          SEGMENTATION_TIME=$(jq '.[0].timings.segmentationSeconds' benchmark_results.json)
          EMBEDDING_TIME=$(jq '.[0].timings.embeddingExtractionSeconds' benchmark_results.json)
          CLUSTERING_TIME=$(jq '.[0].timings.speakerClusteringSeconds' benchmark_results.json)
          INFERENCE_TIME=$(jq '.[0].timings.totalInferenceSeconds' benchmark_results.json)

          echo "DER=${DER}" >> "$GITHUB_OUTPUT"
          echo "JER=${JER}" >> "$GITHUB_OUTPUT"
          echo "RTF=${RTF}" >> "$GITHUB_OUTPUT"
          echo "DURATION=${DURATION}" >> "$GITHUB_OUTPUT"
          echo "SPEAKER_COUNT=${SPEAKER_COUNT}" >> "$GITHUB_OUTPUT"
          echo "TOTAL_TIME=${TOTAL_TIME}" >> "$GITHUB_OUTPUT"
          echo "MODEL_DOWNLOAD_TIME=${MODEL_DOWNLOAD_TIME}" >> "$GITHUB_OUTPUT"
          echo "MODEL_COMPILE_TIME=${MODEL_COMPILE_TIME}" >> "$GITHUB_OUTPUT"
          echo "AUDIO_LOAD_TIME=${AUDIO_LOAD_TIME}" >> "$GITHUB_OUTPUT"
          echo "SEGMENTATION_TIME=${SEGMENTATION_TIME}" >> "$GITHUB_OUTPUT"
          echo "EMBEDDING_TIME=${EMBEDDING_TIME}" >> "$GITHUB_OUTPUT"
          echo "CLUSTERING_TIME=${CLUSTERING_TIME}" >> "$GITHUB_OUTPUT"
          echo "INFERENCE_TIME=${INFERENCE_TIME}" >> "$GITHUB_OUTPUT"

          # Validate RTFx - 0 indicates benchmark failure
          if [ "$RTF" = "0" ] || [ -z "$RTF" ]; then
            echo "❌ CRITICAL: RTFx is 0 or empty - benchmark failed"
            echo "RTFx value: $RTF"
            exit 1
          fi

      - name: Comment PR with Benchmark Results
        if: always()
        uses: actions/github-script@v7
        with:
          script: |
            const der = parseFloat('${{ steps.extract.outputs.DER }}');
            const jer = parseFloat('${{ steps.extract.outputs.JER }}');
            const rtf = parseFloat('${{ steps.extract.outputs.RTF }}');

            // This step runs with `if: always()`; when the benchmark or
            // extract step failed, the outputs are empty and every parseFloat
            // above is NaN. Bail out rather than posting a "NaN%" comment.
            if (Number.isNaN(der) || Number.isNaN(jer) || Number.isNaN(rtf)) {
              console.log('⚠️ Benchmark metrics unavailable - skipping PR comment');
              return;
            }

            const duration = parseFloat('${{ steps.extract.outputs.DURATION }}').toFixed(1);
            const speakerCount = '${{ steps.extract.outputs.SPEAKER_COUNT }}';
            const totalTime = parseFloat('${{ steps.extract.outputs.TOTAL_TIME }}');
            const inferenceTime = parseFloat('${{ steps.extract.outputs.INFERENCE_TIME }}');
            const modelDownloadTime = parseFloat('${{ steps.extract.outputs.MODEL_DOWNLOAD_TIME }}');
            const modelCompileTime = parseFloat('${{ steps.extract.outputs.MODEL_COMPILE_TIME }}');
            const audioLoadTime = parseFloat('${{ steps.extract.outputs.AUDIO_LOAD_TIME }}');
            const segmentationTime = parseFloat('${{ steps.extract.outputs.SEGMENTATION_TIME }}');
            const embeddingTime = parseFloat('${{ steps.extract.outputs.EMBEDDING_TIME }}');
            const clusteringTime = parseFloat('${{ steps.extract.outputs.CLUSTERING_TIME }}');
            const executionTime = '${{ steps.benchmark.outputs.EXECUTION_TIME }}' || 'N/A';

            let comment = '## Speaker Diarization Benchmark Results\n\n';
            comment += '### Speaker Diarization Performance\n';
            comment += '_Evaluating "who spoke when" detection accuracy_\n\n';
            comment += '| Metric | Value | Target | Status | Description |\n';
            comment += '|--------|-------|--------|---------|-------------|\n';
            comment += `| **DER** | **${der.toFixed(1)}%** | <30% | ${der < 30 ? '✅' : '⚠️'} | Diarization Error Rate (lower is better) |\n`;
            comment += `| **JER** | **${jer.toFixed(1)}%** | <25% | ${jer < 25 ? '✅' : '⚠️'} | Jaccard Error Rate |\n`;
            comment += `| **RTFx** | **${rtf.toFixed(2)}x** | >1.0x | ${rtf > 1.0 ? '✅' : '⚠️'} | Real-Time Factor (higher is faster) |\n\n`;
            comment += '### Diarization Pipeline Timing Breakdown\n';
            comment += '_Time spent in each stage of speaker diarization_\n\n';
            comment += '| Stage | Time (s) | % | Description |\n';
            comment += '|-------|----------|---|-------------|\n';
            comment += `| Model Download | ${modelDownloadTime.toFixed(3)} | ${(modelDownloadTime/totalTime*100).toFixed(1)} | Fetching diarization models |\n`;
            comment += `| Model Compile | ${modelCompileTime.toFixed(3)} | ${(modelCompileTime/totalTime*100).toFixed(1)} | CoreML compilation |\n`;
            comment += `| Audio Load | ${audioLoadTime.toFixed(3)} | ${(audioLoadTime/totalTime*100).toFixed(1)} | Loading audio file |\n`;
            comment += `| Segmentation | ${segmentationTime.toFixed(3)} | ${(segmentationTime/totalTime*100).toFixed(1)} | Detecting speech regions |\n`;
            comment += `| Embedding | ${embeddingTime.toFixed(3)} | ${(embeddingTime/totalTime*100).toFixed(1)} | Extracting speaker voices |\n`;
            comment += `| Clustering | ${clusteringTime.toFixed(3)} | ${(clusteringTime/totalTime*100).toFixed(1)} | Grouping same speakers |\n`;
            comment += `| **Total** | **${totalTime.toFixed(3)}** | **100** | **Full pipeline** |\n\n`;
            comment += '### Speaker Diarization Research Comparison\n';
            comment += '_Research baselines typically achieve 18-30% DER on standard datasets_\n\n';
            comment += '| Method | DER | Notes |\n';
            comment += '|--------|-----|-------|\n';
            comment += '| **FluidAudio** | **' + der.toFixed(1) + '%** | **On-device CoreML** |\n';
            comment += '| Research baseline | 18-30% | Standard dataset performance |\n\n';
            comment += '**Note**: RTFx shown above is from GitHub Actions runner. On Apple Silicon with ANE:\n';
            comment += '- **M2 MacBook Air (2022)**: Runs at **150 RTFx** real-time\n';
            comment += '- Performance scales with Apple Neural Engine capabilities\n\n';
            comment += `<sub>🎯 **Speaker Diarization Test** • AMI Corpus ES2004a • ${duration}s meeting audio • ${inferenceTime.toFixed(1)}s diarization time • Test runtime: ${executionTime} • ${new Date().toLocaleString('en-US', { timeZone: 'America/New_York', year: 'numeric', month: '2-digit', day: '2-digit', hour: '2-digit', minute: '2-digit', hour12: true })} EST</sub>\n\n`;

            // Add hidden identifier for reliable comment detection
            comment += '<!-- fluidaudio-benchmark-single-file -->';

            try {
              // First, try to find existing benchmark comment
              const comments = await github.rest.issues.listComments({
                issue_number: context.issue.number,
                owner: context.repo.owner,
                repo: context.repo.repo,
              });

              // Look for existing benchmark comment (identified by the hidden tag).
              // The header fallback must match the header actually posted above;
              // the previous value ('## 🎯 Single File Benchmark Results') was
              // stale and could never match.
              const existingComment = comments.data.find(comment => {
                const isBot = comment.user.type === 'Bot' ||
                              comment.user.login === 'github-actions[bot]' ||
                              comment.user.login.includes('[bot]');
                const hasIdentifier = comment.body.includes('<!-- fluidaudio-benchmark-single-file -->');
                const hasHeader = comment.body.includes('## Speaker Diarization Benchmark Results');
                return isBot && (hasIdentifier || hasHeader);
              });

              if (existingComment) {
                // Update existing comment
                await github.rest.issues.updateComment({
                  comment_id: existingComment.id,
                  owner: context.repo.owner,
                  repo: context.repo.repo,
                  body: comment
                });
                console.log('✅ Successfully updated existing benchmark comment');
              } else {
                // Create new comment if none exists
                await github.rest.issues.createComment({
                  issue_number: context.issue.number,
                  owner: context.repo.owner,
                  repo: context.repo.repo,
                  body: comment
                });
                console.log('✅ Successfully posted new benchmark results comment');
              }
            } catch (error) {
              console.error('❌ Failed to update/post comment:', error.message);
              // Don't fail the workflow just because commenting failed
            }