-
Notifications
You must be signed in to change notification settings - Fork 243
224 lines (193 loc) · 11.5 KB
/
diarizer-benchmark.yml
File metadata and controls
224 lines (193 loc) · 11.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
name: Diarizer Performance Benchmark

on:
  pull_request:
    branches: [main]
    types: [opened, synchronize, reopened]

# Cancel superseded runs for the same ref so only the latest push is benchmarked.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  benchmark:
    name: Single File Performance Benchmark
    runs-on: macos-15
    permissions:
      contents: read
      pull-requests: write  # needed to post/update the results comment
    steps:
      - name: Checkout code
        uses: actions/checkout@v5

      - uses: swift-actions/setup-swift@v2
        with:
          swift-version: "6.1"

      # Build cache keyed on the package manifest plus the model-registry
      # sources, since those determine which models the build pulls in.
      - name: Cache Swift packages and build
        uses: actions/cache@v4
        with:
          path: |
            .build
            ~/Library/Caches/org.swift.swiftpm
          key: ${{ runner.os }}-diarizer-${{ hashFiles('Package.swift', 'Sources/FluidAudio/ModelRegistry.swift', 'Sources/FluidAudio/ModelNames.swift') }}

      - name: Cache Diarizer models
        uses: actions/cache@v4
        with:
          path: ~/Library/Application Support/FluidAudio/Models/speaker-diarization-coreml
          key: ${{ runner.os }}-diarizer-models-${{ hashFiles('Sources/FluidAudio/ModelRegistry.swift', 'Sources/FluidAudio/ModelNames.swift') }}

      - name: Cache AMI dataset
        uses: actions/cache@v4
        with:
          path: ~/FluidAudioDatasets/ami_official
          key: ${{ runner.os }}-ami-dataset-${{ hashFiles('Sources/FluidAudioCLI/DatasetParsers/DatasetDownloader.swift') }}

      - name: Build package
        run: swift build -c release

      - name: Run Single File Benchmark
        id: benchmark
        run: |
          echo "🚀 Running single file benchmark..."

          # Record start time
          BENCHMARK_START=$(date +%s)

          swift run fluidaudiocli diarization-benchmark --auto-download --single-file ES2004a --output benchmark_results.json

          # Check if results file was generated
          if [ -f benchmark_results.json ]; then
            echo "SUCCESS=true" >> "$GITHUB_OUTPUT"
          else
            echo "❌ Benchmark failed - no results file generated"
            echo "SUCCESS=false" >> "$GITHUB_OUTPUT"
          fi

          # Calculate execution time
          BENCHMARK_END=$(date +%s)
          EXECUTION_TIME=$((BENCHMARK_END - BENCHMARK_START))
          EXECUTION_MINS=$((EXECUTION_TIME / 60))
          EXECUTION_SECS=$((EXECUTION_TIME % 60))
          echo "EXECUTION_TIME=${EXECUTION_MINS}m ${EXECUTION_SECS}s" >> "$GITHUB_OUTPUT"
        timeout-minutes: 35

      - name: Show benchmark_results.json
        if: always()
        run: |
          echo "--- benchmark_results.json ---"
          cat benchmark_results.json || echo "benchmark_results.json not found"
          echo "-----------------------------"

      - name: Extract benchmark metrics with jq
        id: extract
        run: |
          # The output is now an array, so we need to access the first element
          DER=$(jq '.[0].der' benchmark_results.json)
          JER=$(jq '.[0].jer' benchmark_results.json)
          RTF=$(jq '.[0].rtfx' benchmark_results.json)
          DURATION="1049" # ES2004a duration in seconds
          SPEAKER_COUNT=$(jq '.[0].detectedSpeakers' benchmark_results.json)

          # Extract detailed timing information
          TOTAL_TIME=$(jq '.[0].timings.totalProcessingSeconds' benchmark_results.json)
          MODEL_DOWNLOAD_TIME=$(jq '.[0].timings.modelDownloadSeconds' benchmark_results.json)
          MODEL_COMPILE_TIME=$(jq '.[0].timings.modelCompilationSeconds' benchmark_results.json)
          AUDIO_LOAD_TIME=$(jq '.[0].timings.audioLoadingSeconds' benchmark_results.json)
          SEGMENTATION_TIME=$(jq '.[0].timings.segmentationSeconds' benchmark_results.json)
          EMBEDDING_TIME=$(jq '.[0].timings.embeddingExtractionSeconds' benchmark_results.json)
          CLUSTERING_TIME=$(jq '.[0].timings.speakerClusteringSeconds' benchmark_results.json)
          INFERENCE_TIME=$(jq '.[0].timings.totalInferenceSeconds' benchmark_results.json)

          echo "DER=${DER}" >> "$GITHUB_OUTPUT"
          echo "JER=${JER}" >> "$GITHUB_OUTPUT"
          echo "RTF=${RTF}" >> "$GITHUB_OUTPUT"
          echo "DURATION=${DURATION}" >> "$GITHUB_OUTPUT"
          echo "SPEAKER_COUNT=${SPEAKER_COUNT}" >> "$GITHUB_OUTPUT"
          echo "TOTAL_TIME=${TOTAL_TIME}" >> "$GITHUB_OUTPUT"
          echo "MODEL_DOWNLOAD_TIME=${MODEL_DOWNLOAD_TIME}" >> "$GITHUB_OUTPUT"
          echo "MODEL_COMPILE_TIME=${MODEL_COMPILE_TIME}" >> "$GITHUB_OUTPUT"
          echo "AUDIO_LOAD_TIME=${AUDIO_LOAD_TIME}" >> "$GITHUB_OUTPUT"
          echo "SEGMENTATION_TIME=${SEGMENTATION_TIME}" >> "$GITHUB_OUTPUT"
          echo "EMBEDDING_TIME=${EMBEDDING_TIME}" >> "$GITHUB_OUTPUT"
          echo "CLUSTERING_TIME=${CLUSTERING_TIME}" >> "$GITHUB_OUTPUT"
          echo "INFERENCE_TIME=${INFERENCE_TIME}" >> "$GITHUB_OUTPUT"

          # Validate RTFx - 0 indicates benchmark failure
          if [ "$RTF" = "0" ] || [ -z "$RTF" ]; then
            echo "❌ CRITICAL: RTFx is 0 or empty - benchmark failed"
            echo "RTFx value: $RTF"
            exit 1
          fi

      - name: Comment PR with Benchmark Results
        if: always()
        uses: actions/github-script@v7
        with:
          script: |
            const der = parseFloat('${{ steps.extract.outputs.DER }}');
            const jer = parseFloat('${{ steps.extract.outputs.JER }}');
            const rtf = parseFloat('${{ steps.extract.outputs.RTF }}');

            // This step runs with `if: always()`; when the benchmark or
            // extract step failed, the outputs are empty and every parseFloat
            // above is NaN. Bail out rather than posting a "NaN%" comment.
            if (Number.isNaN(der) || Number.isNaN(jer) || Number.isNaN(rtf)) {
              console.log('⚠️ Benchmark metrics unavailable - skipping PR comment');
              return;
            }

            const duration = parseFloat('${{ steps.extract.outputs.DURATION }}').toFixed(1);
            const speakerCount = '${{ steps.extract.outputs.SPEAKER_COUNT }}';
            const totalTime = parseFloat('${{ steps.extract.outputs.TOTAL_TIME }}');
            const inferenceTime = parseFloat('${{ steps.extract.outputs.INFERENCE_TIME }}');
            const modelDownloadTime = parseFloat('${{ steps.extract.outputs.MODEL_DOWNLOAD_TIME }}');
            const modelCompileTime = parseFloat('${{ steps.extract.outputs.MODEL_COMPILE_TIME }}');
            const audioLoadTime = parseFloat('${{ steps.extract.outputs.AUDIO_LOAD_TIME }}');
            const segmentationTime = parseFloat('${{ steps.extract.outputs.SEGMENTATION_TIME }}');
            const embeddingTime = parseFloat('${{ steps.extract.outputs.EMBEDDING_TIME }}');
            const clusteringTime = parseFloat('${{ steps.extract.outputs.CLUSTERING_TIME }}');
            const executionTime = '${{ steps.benchmark.outputs.EXECUTION_TIME }}' || 'N/A';

            let comment = '## Speaker Diarization Benchmark Results\n\n';
            comment += '### Speaker Diarization Performance\n';
            comment += '_Evaluating "who spoke when" detection accuracy_\n\n';
            comment += '| Metric | Value | Target | Status | Description |\n';
            comment += '|--------|-------|--------|---------|-------------|\n';
            comment += `| **DER** | **${der.toFixed(1)}%** | <30% | ${der < 30 ? '✅' : '⚠️'} | Diarization Error Rate (lower is better) |\n`;
            comment += `| **JER** | **${jer.toFixed(1)}%** | <25% | ${jer < 25 ? '✅' : '⚠️'} | Jaccard Error Rate |\n`;
            comment += `| **RTFx** | **${rtf.toFixed(2)}x** | >1.0x | ${rtf > 1.0 ? '✅' : '⚠️'} | Real-Time Factor (higher is faster) |\n\n`;
            comment += '### Diarization Pipeline Timing Breakdown\n';
            comment += '_Time spent in each stage of speaker diarization_\n\n';
            comment += '| Stage | Time (s) | % | Description |\n';
            comment += '|-------|----------|---|-------------|\n';
            comment += `| Model Download | ${modelDownloadTime.toFixed(3)} | ${(modelDownloadTime/totalTime*100).toFixed(1)} | Fetching diarization models |\n`;
            comment += `| Model Compile | ${modelCompileTime.toFixed(3)} | ${(modelCompileTime/totalTime*100).toFixed(1)} | CoreML compilation |\n`;
            comment += `| Audio Load | ${audioLoadTime.toFixed(3)} | ${(audioLoadTime/totalTime*100).toFixed(1)} | Loading audio file |\n`;
            comment += `| Segmentation | ${segmentationTime.toFixed(3)} | ${(segmentationTime/totalTime*100).toFixed(1)} | Detecting speech regions |\n`;
            comment += `| Embedding | ${embeddingTime.toFixed(3)} | ${(embeddingTime/totalTime*100).toFixed(1)} | Extracting speaker voices |\n`;
            comment += `| Clustering | ${clusteringTime.toFixed(3)} | ${(clusteringTime/totalTime*100).toFixed(1)} | Grouping same speakers |\n`;
            comment += `| **Total** | **${totalTime.toFixed(3)}** | **100** | **Full pipeline** |\n\n`;
            comment += '### Speaker Diarization Research Comparison\n';
            comment += '_Research baselines typically achieve 18-30% DER on standard datasets_\n\n';
            comment += '| Method | DER | Notes |\n';
            comment += '|--------|-----|-------|\n';
            comment += '| **FluidAudio** | **' + der.toFixed(1) + '%** | **On-device CoreML** |\n';
            comment += '| Research baseline | 18-30% | Standard dataset performance |\n\n';
            comment += '**Note**: RTFx shown above is from GitHub Actions runner. On Apple Silicon with ANE:\n';
            comment += '- **M2 MacBook Air (2022)**: Runs at **150 RTFx** real-time\n';
            comment += '- Performance scales with Apple Neural Engine capabilities\n\n';
            comment += `<sub>🎯 **Speaker Diarization Test** • AMI Corpus ES2004a • ${duration}s meeting audio • ${inferenceTime.toFixed(1)}s diarization time • Test runtime: ${executionTime} • ${new Date().toLocaleString('en-US', { timeZone: 'America/New_York', year: 'numeric', month: '2-digit', day: '2-digit', hour: '2-digit', minute: '2-digit', hour12: true })} EST</sub>\n\n`;

            // Add hidden identifier for reliable comment detection
            comment += '<!-- fluidaudio-benchmark-single-file -->';

            try {
              // First, try to find existing benchmark comment
              const comments = await github.rest.issues.listComments({
                issue_number: context.issue.number,
                owner: context.repo.owner,
                repo: context.repo.repo,
              });

              // Look for existing benchmark comment (identified by the hidden tag).
              // The header fallback must match the header actually posted above;
              // the previous value ('## 🎯 Single File Benchmark Results') was
              // stale and could never match.
              const existingComment = comments.data.find(comment => {
                const isBot = comment.user.type === 'Bot' ||
                              comment.user.login === 'github-actions[bot]' ||
                              comment.user.login.includes('[bot]');
                const hasIdentifier = comment.body.includes('<!-- fluidaudio-benchmark-single-file -->');
                const hasHeader = comment.body.includes('## Speaker Diarization Benchmark Results');
                return isBot && (hasIdentifier || hasHeader);
              });

              if (existingComment) {
                // Update existing comment
                await github.rest.issues.updateComment({
                  comment_id: existingComment.id,
                  owner: context.repo.owner,
                  repo: context.repo.repo,
                  body: comment
                });
                console.log('✅ Successfully updated existing benchmark comment');
              } else {
                // Create new comment if none exists
                await github.rest.issues.createComment({
                  issue_number: context.issue.number,
                  owner: context.repo.owner,
                  repo: context.repo.repo,
                  body: comment
                });
                console.log('✅ Successfully posted new benchmark results comment');
              }
            } catch (error) {
              console.error('❌ Failed to update/post comment:', error.message);
              // Don't fail the workflow just because commenting failed
            }