-
Notifications
You must be signed in to change notification settings - Fork 253
Fix DER calculation and add diarization proper AMI benchmarking #4
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
7bcdb6a
12102cb
a3b4468
2d126ab
3705df5
41bbec2
4e4735f
94f881c
faaf338
606683d
c1b5136
6f38e93
aac8257
0d061db
2e1bc2e
ffb119d
bf9998b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
This file was deleted.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,119 @@ | ||
| name: Performance Benchmark | ||
|
|
||
| on: | ||
| pull_request: | ||
| branches: [main] | ||
| types: [opened, synchronize, reopened] | ||
|
|
||
| concurrency: | ||
| group: ${{ github.workflow }}-${{ github.ref }} | ||
| cancel-in-progress: true | ||
|
|
||
| jobs: | ||
| benchmark: | ||
| name: Single File Performance Benchmark | ||
| runs-on: macos-latest | ||
| permissions: | ||
| contents: read | ||
| pull-requests: write | ||
|
|
||
| steps: | ||
| - name: Checkout code | ||
| uses: actions/checkout@v4 | ||
|
|
||
| - name: Setup Swift 6.1 | ||
| uses: swift-actions/setup-swift@v2 | ||
| with: | ||
| swift-version: "6.1" | ||
|
|
||
| - name: Build package | ||
| run: swift build | ||
|
|
||
| - name: Run Single File Benchmark | ||
| id: benchmark | ||
| run: | | ||
| echo "🚀 Running single file benchmark..." | ||
| # Run benchmark with ES2004a file and save results to JSON | ||
| swift run fluidaudio benchmark --auto-download --single-file ES2004a --output benchmark_results.json | ||
|
|
||
| # Extract key metrics from JSON output | ||
| if [ -f benchmark_results.json ]; then | ||
| # Parse JSON results (using basic tools available in GitHub runners) | ||
| AVERAGE_DER=$(cat benchmark_results.json | grep -oE '"averageDER":[0-9]+(\.[0-9]+)?' | cut -d':' -f2) | ||
| AVERAGE_JER=$(cat benchmark_results.json | grep -oE '"averageJER":[0-9]+(\.[0-9]+)?' | cut -d':' -f2) | ||
| PROCESSED_FILES=$(cat benchmark_results.json | grep -o '"processedFiles":[0-9]*' | cut -d':' -f2) | ||
|
|
||
| # Get first result details | ||
| RTF=$(cat benchmark_results.json | grep -oE '"realTimeFactor":[0-9]+(\.[0-9]+)?' | head -1 | cut -d':' -f2) | ||
| DURATION=$(cat benchmark_results.json | grep -oE '"durationSeconds":[0-9]+(\.[0-9]+)?' | head -1 | cut -d':' -f2) | ||
| SPEAKER_COUNT=$(cat benchmark_results.json | grep -o '"speakerCount":[0-9]*' | head -1 | cut -d':' -f2) | ||
|
|
||
| echo "DER=${AVERAGE_DER}" >> $GITHUB_OUTPUT | ||
| echo "JER=${AVERAGE_JER}" >> $GITHUB_OUTPUT | ||
| echo "RTF=${RTF}" >> $GITHUB_OUTPUT | ||
| echo "DURATION=${DURATION}" >> $GITHUB_OUTPUT | ||
| echo "SPEAKER_COUNT=${SPEAKER_COUNT}" >> $GITHUB_OUTPUT | ||
| echo "PROCESSED_FILES=${PROCESSED_FILES}" >> $GITHUB_OUTPUT | ||
| echo "SUCCESS=true" >> $GITHUB_OUTPUT | ||
| else | ||
| echo "❌ Benchmark failed - no results file generated" | ||
| echo "SUCCESS=false" >> $GITHUB_OUTPUT | ||
| fi | ||
| timeout-minutes: 25 | ||
|
|
||
| - name: Comment PR with Benchmark Results | ||
| if: always() | ||
| uses: actions/github-script@v7 | ||
| with: | ||
| script: | | ||
| const success = '${{ steps.benchmark.outputs.SUCCESS }}' === 'true'; | ||
|
|
||
| let comment = '## 🎯 Single File Benchmark Results\n\n'; | ||
|
|
||
| if (success) { | ||
| const der = parseFloat('${{ steps.benchmark.outputs.DER }}').toFixed(1); | ||
| const jer = parseFloat('${{ steps.benchmark.outputs.JER }}').toFixed(1); | ||
| const rtf = parseFloat('${{ steps.benchmark.outputs.RTF }}').toFixed(2); | ||
| const duration = parseFloat('${{ steps.benchmark.outputs.DURATION }}').toFixed(1); | ||
| const speakerCount = '${{ steps.benchmark.outputs.SPEAKER_COUNT }}'; | ||
|
|
||
| comment += `**Test File:** ES2004a (${duration}s audio)\n\n`; | ||
| comment += '| Metric | Value | Target | Status |\n'; | ||
| comment += '|--------|-------|--------|---------|\n'; | ||
| comment += `| **DER** (Diarization Error Rate) | ${der}% | < 30% | ${der < 30 ? '✅' : '❌'} |\n`; | ||
| comment += `| **JER** (Jaccard Error Rate) | ${jer}% | < 25% | ${jer < 25 ? '✅' : '❌'} |\n`; | ||
| comment += `| **RTF** (Real-Time Factor) | ${rtf}x | < 1.0x | ${rtf < 1.0 ? '✅' : '❌'} |\n`; | ||
| comment += `| **Speakers Detected** | ${speakerCount} | - | ℹ️ |\n\n`; | ||
|
|
||
| // Performance assessment | ||
| if (der < 20) { | ||
| comment += '🎉 **Excellent Performance!** - Competitive with state-of-the-art research\n'; | ||
| } else if (der < 30) { | ||
| comment += '✅ **Good Performance** - Meeting target benchmarks\n'; | ||
| } else { | ||
| comment += '⚠️ **Performance Below Target** - Consider parameter optimization\n'; | ||
| } | ||
|
|
||
| comment += '\n📊 **Research Comparison:**\n'; | ||
| comment += '- Powerset BCE (2023): 18.5% DER\n'; | ||
| comment += '- EEND (2019): 25.3% DER\n'; | ||
| comment += '- x-vector clustering: 28.7% DER\n'; | ||
|
|
||
| } else { | ||
| comment += '❌ **Benchmark Failed**\n\n'; | ||
| comment += 'The single file benchmark could not complete successfully. '; | ||
| comment += 'This may be due to:\n'; | ||
| comment += '- Network issues downloading test data\n'; | ||
| comment += '- Model initialization problems\n'; | ||
| comment += '- Audio processing errors\n\n'; | ||
| comment += 'Please check the workflow logs for detailed error information.'; | ||
| } | ||
|
|
||
| comment += '\n\n---\n*Automated benchmark using AMI corpus ES2004a test file*'; | ||
|
|
||
| github.rest.issues.createComment({ | ||
| issue_number: context.issue.number, | ||
| owner: context.repo.owner, | ||
| repo: context.repo.repo, | ||
| body: comment | ||
| }); | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,26 +1,32 @@ | ||
| name: CoreML Build Compile | ||
| name: Build and Test | ||
|
|
||
| on: | ||
| pull_request: | ||
| branches: [ main ] | ||
| branches: [main] | ||
| push: | ||
| branches: [main] | ||
|
|
||
| concurrency: | ||
| group: ${{ github.workflow }}-${{ github.ref }} | ||
| cancel-in-progress: true | ||
|
|
||
| jobs: | ||
| verify-coreml: | ||
| name: Verify CoreMLDiarizerManager Builds | ||
| build-and-test: | ||
| name: Build and Test Swift Package | ||
| runs-on: macos-latest | ||
|
|
||
| steps: | ||
| - name: Checkout code | ||
| uses: actions/checkout@v4 | ||
| - name: Checkout code | ||
| uses: actions/checkout@v4 | ||
|
|
||
| - name: Setup Swift 6.1 | ||
| uses: swift-actions/setup-swift@v2 | ||
| with: | ||
| swift-version: '6.1' | ||
| - name: Setup Swift 6.1 | ||
| uses: swift-actions/setup-swift@v2 | ||
| with: | ||
| swift-version: "6.1" | ||
|
|
||
| - name: Build package | ||
| run: swift build | ||
| - name: Build package | ||
| run: swift build | ||
|
|
||
| - name: Verify DiarizerManager runs | ||
| run: swift test --filter testManagerBasicValidation | ||
| timeout-minutes: 5 | ||
| - name: Run tests | ||
| run: swift test | ||
| timeout-minutes: 10 |
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should we do something about this CLAUDE.md name, since this PR and the commits were targeted toward benchmarking?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. No, this is the default Claude Code file it uses over time. We want to build it up. It's like a README for Claude Code. |
Uh oh!
There was an error while loading. Please reload this page.