Skip to content

Commit b1386ad

Browse files
committed
Fix voice transcription
1 parent f7136b6 commit b1386ad

File tree

10 files changed

+190
-6
lines changed

10 files changed

+190
-6
lines changed

README.fr.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -772,7 +772,7 @@ Le sous-agent a accès aux outils (message, web_search, etc.) et peut communique
772772
### Fournisseurs
773773

774774
> [!NOTE]
775-
> Groq fournit la transcription vocale gratuite via Whisper. Si configuré, les messages vocaux Telegram seront automatiquement transcrits.
775+
> Groq fournit la transcription vocale gratuite via Whisper. Si configuré, les messages audio de n'importe quel canal seront automatiquement transcrits au niveau de l'agent.
776776
777777
| Fournisseur | Utilisation | Obtenir une Clé API |
778778
| ------------------------ | ---------------------------------------- | ------------------------------------------------------ |

README.ja.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -728,7 +728,7 @@ HEARTBEAT_OK 応答 ユーザーが直接結果を受け取る
728728
### プロバイダー
729729

730730
> [!NOTE]
731-
> Groq は Whisper による無料の音声文字起こしを提供しています。設定すると、Telegram の音声メッセージが自動的に文字起こしされます
731+
> Groq は Whisper による無料の音声文字起こしを提供しています。設定すると、あらゆるチャンネルからの音声メッセージがエージェントレベルで自動的に文字起こしされます。
732732
733733
| プロバイダー | 用途 | API キー取得先 |
734734
| --- | --- | --- |

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -818,7 +818,7 @@ The subagent has access to tools (message, web_search, etc.) and can communicate
818818
### Providers
819819

820820
> [!NOTE]
821-
> Groq provides free voice transcription via Whisper. If configured, Telegram voice messages will be automatically transcribed.
821+
> Groq provides free voice transcription via Whisper. If configured, audio messages from any channel will be automatically transcribed at the agent level.
822822
823823
| Provider | Purpose | Get API Key |
824824
| -------------------------- | --------------------------------------- | -------------------------------------------------------------------- |

README.pt-br.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -766,7 +766,7 @@ O subagente tem acesso às ferramentas (message, web_search, etc.) e pode se com
766766
### Provedores
767767

768768
> [!NOTE]
769-
> O Groq fornece transcrição de voz gratuita via Whisper. Se configurado, mensagens de voz do Telegram serão automaticamente transcritas.
769+
> O Groq fornece transcrição de voz gratuita via Whisper. Se configurado, mensagens de áudio de qualquer canal serão automaticamente transcritas no nível do agente.
770770
771771
| Provedor | Finalidade | Obter API Key |
772772
| --- | --- | --- |

README.vi.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -740,7 +740,7 @@ Subagent có quyền truy cập các công cụ (message, web_search, v.v.) và
740740
### Nhà cung cấp (Providers)
741741

742742
> [!NOTE]
743-
> Groq cung cấp dịch vụ chuyển giọng nói thành văn bản miễn phí qua Whisper. Nếu đã cấu hình Groq, tin nhắn thoại trên Telegram sẽ được tự động chuyển thành văn bản.
743+
> Groq cung cấp dịch vụ chuyển giọng nói thành văn bản miễn phí qua Whisper. Nếu đã cấu hình Groq, tin nhắn âm thanh từ bất kỳ kênh nào sẽ được tự động chuyển thành văn bản ở cấp độ agent.
744744
745745
| Nhà cung cấp | Mục đích | Lấy API Key |
746746
| --- | --- | --- |

README.zh.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -418,7 +418,7 @@ Agent 读取 HEARTBEAT.md
418418
### 提供商 (Providers)
419419

420420
> [!NOTE]
421-
> Groq 通过 Whisper 提供免费的语音转录。如果配置了 Groq,Telegram 语音消息将被自动转录为文字
421+
> Groq 通过 Whisper 提供免费的语音转录。如果配置了 Groq,任意渠道的音频消息都将在 Agent 层面自动转录为文字。
422422
423423
| 提供商 | 用途 | 获取 API Key |
424424
| -------------------- | ---------------------------- | -------------------------------------------------------------------- |

cmd/picoclaw/internal/gateway/helpers.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77
"os"
88
"os/signal"
99
"path/filepath"
10+
"strings"
1011
"time"
1112

1213
"github.com/sipeed/picoclaw/cmd/picoclaw/internal"
@@ -36,6 +37,7 @@ import (
3637
"github.com/sipeed/picoclaw/pkg/providers"
3738
"github.com/sipeed/picoclaw/pkg/state"
3839
"github.com/sipeed/picoclaw/pkg/tools"
40+
"github.com/sipeed/picoclaw/pkg/voice"
3941
)
4042

4143
func gatewayCmd(debug bool) error {
@@ -134,6 +136,22 @@ func gatewayCmd(debug bool) error {
134136
agentLoop.SetChannelManager(channelManager)
135137
agentLoop.SetMediaStore(mediaStore)
136138

139+
// Wire up voice transcription if Groq API key is available
140+
groqAPIKey := cfg.Providers.Groq.APIKey
141+
if groqAPIKey == "" {
142+
for _, mc := range cfg.ModelList {
143+
if strings.HasPrefix(mc.Model, "groq/") && mc.APIKey != "" {
144+
groqAPIKey = mc.APIKey
145+
break
146+
}
147+
}
148+
}
149+
if groqAPIKey != "" {
150+
transcriber := voice.NewGroqTranscriber(groqAPIKey)
151+
agentLoop.SetTranscriber(transcriber)
152+
logger.InfoC("voice", "Groq voice transcription enabled (agent-level)")
153+
}
154+
137155
enabledChannels := channelManager.GetEnabledChannels()
138156
if len(enabledChannels) > 0 {
139157
fmt.Printf("✓ Channels enabled: %s\n", enabledChannels)

pkg/agent/loop.go

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ import (
1818
"time"
1919
"unicode/utf8"
2020

21+
"regexp"
22+
2123
"github.com/sipeed/picoclaw/pkg/bus"
2224
"github.com/sipeed/picoclaw/pkg/channels"
2325
"github.com/sipeed/picoclaw/pkg/config"
@@ -30,6 +32,7 @@ import (
3032
"github.com/sipeed/picoclaw/pkg/state"
3133
"github.com/sipeed/picoclaw/pkg/tools"
3234
"github.com/sipeed/picoclaw/pkg/utils"
35+
"github.com/sipeed/picoclaw/pkg/voice"
3336
)
3437

3538
type AgentLoop struct {
@@ -42,6 +45,7 @@ type AgentLoop struct {
4245
fallback *providers.FallbackChain
4346
channelManager *channels.Manager
4447
mediaStore media.MediaStore
48+
transcriber voice.Transcriber
4549
}
4650

4751
// processOptions configures how a message is processed
@@ -262,6 +266,64 @@ func (al *AgentLoop) SetMediaStore(s media.MediaStore) {
262266
al.mediaStore = s
263267
}
264268

269+
// SetTranscriber injects a voice transcriber for agent-level audio transcription.
270+
func (al *AgentLoop) SetTranscriber(t voice.Transcriber) {
271+
al.transcriber = t
272+
}
273+
274+
var audioAnnotationRe = regexp.MustCompile(`\[(voice|audio)(?::[^\]]*)?\]`)
275+
276+
// transcribeAudioInMessage resolves audio media refs, transcribes them, and
277+
// replaces audio annotations in msg.Content with the transcribed text.
278+
func (al *AgentLoop) transcribeAudioInMessage(ctx context.Context, msg bus.InboundMessage) bus.InboundMessage {
279+
if al.transcriber == nil || !al.transcriber.IsAvailable() || al.mediaStore == nil || len(msg.Media) == 0 {
280+
return msg
281+
}
282+
283+
// Transcribe each audio media ref in order.
284+
var transcriptions []string
285+
for _, ref := range msg.Media {
286+
path, meta, err := al.mediaStore.ResolveWithMeta(ref)
287+
if err != nil {
288+
logger.WarnCF("voice", "Failed to resolve media ref", map[string]any{"ref": ref, "error": err})
289+
continue
290+
}
291+
if !utils.IsAudioFile(meta.Filename, meta.ContentType) {
292+
continue
293+
}
294+
result, err := al.transcriber.Transcribe(ctx, path)
295+
if err != nil {
296+
logger.WarnCF("voice", "Transcription failed", map[string]any{"ref": ref, "error": err})
297+
transcriptions = append(transcriptions, "")
298+
continue
299+
}
300+
transcriptions = append(transcriptions, result.Text)
301+
}
302+
303+
if len(transcriptions) == 0 {
304+
return msg
305+
}
306+
307+
// Replace audio annotations sequentially with transcriptions.
308+
idx := 0
309+
newContent := audioAnnotationRe.ReplaceAllStringFunc(msg.Content, func(match string) string {
310+
if idx >= len(transcriptions) {
311+
return match
312+
}
313+
text := transcriptions[idx]
314+
idx++
315+
return "[voice: " + text + "]"
316+
})
317+
318+
// Append any remaining transcriptions not matched by an annotation.
319+
for ; idx < len(transcriptions); idx++ {
320+
newContent += "\n[voice: " + transcriptions[idx] + "]"
321+
}
322+
323+
msg.Content = newContent
324+
return msg
325+
}
326+
265327
// inferMediaType determines the media type ("image", "audio", "video", "file")
266328
// from a filename and MIME content type.
267329
func inferMediaType(filename, contentType string) string {
@@ -364,6 +426,8 @@ func (al *AgentLoop) processMessage(ctx context.Context, msg bus.InboundMessage)
364426
"session_key": msg.SessionKey,
365427
})
366428

429+
msg = al.transcribeAudioInMessage(ctx, msg)
430+
367431
// Route system messages to processSystemMessage
368432
if msg.Channel == "system" {
369433
return al.processSystemMessage(ctx, msg)

pkg/voice/transcriber.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,11 @@ import (
1616
"github.com/sipeed/picoclaw/pkg/utils"
1717
)
1818

19+
// Transcriber is the interface for backends that turn an audio file into text.
type Transcriber interface {
20+
// Transcribe transcribes the audio file at audioFilePath and returns the
// transcription response, or an error.
Transcribe(ctx context.Context, audioFilePath string) (*TranscriptionResponse, error)
21+
// IsAvailable reports whether the transcriber can currently be used.
// NOTE(review): for GroqTranscriber the tests expect false when the API key
// is empty; confirm the intended contract for other implementations.
IsAvailable() bool
22+
}
23+
1924
type GroqTranscriber struct {
2025
apiKey string
2126
apiBase string

pkg/voice/transcriber_test.go

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
package voice
2+
3+
import (
4+
"context"
5+
"encoding/json"
6+
"net/http"
7+
"net/http/httptest"
8+
"os"
9+
"path/filepath"
10+
"testing"
11+
)
12+
13+
// Compile-time assertion: the build fails if *GroqTranscriber stops
// implementing the Transcriber interface.
14+
var _ Transcriber = (*GroqTranscriber)(nil)
15+
16+
func TestIsAvailable(t *testing.T) {
17+
tests := []struct {
18+
name string
19+
apiKey string
20+
want bool
21+
}{
22+
{"with key", "sk-test-key", true},
23+
{"empty key", "", false},
24+
}
25+
for _, tc := range tests {
26+
t.Run(tc.name, func(t *testing.T) {
27+
tr := NewGroqTranscriber(tc.apiKey)
28+
if got := tr.IsAvailable(); got != tc.want {
29+
t.Errorf("IsAvailable() = %v, want %v", got, tc.want)
30+
}
31+
})
32+
}
33+
}
34+
35+
func TestTranscribe(t *testing.T) {
36+
// Write a minimal fake audio file so the transcriber can open and send it.
37+
tmpDir := t.TempDir()
38+
audioPath := filepath.Join(tmpDir, "clip.ogg")
39+
if err := os.WriteFile(audioPath, []byte("fake-audio-data"), 0o644); err != nil {
40+
t.Fatalf("failed to write fake audio file: %v", err)
41+
}
42+
43+
t.Run("success", func(t *testing.T) {
44+
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
45+
if r.URL.Path != "/audio/transcriptions" {
46+
t.Errorf("unexpected path: %s", r.URL.Path)
47+
}
48+
if r.Header.Get("Authorization") != "Bearer sk-test" {
49+
t.Errorf("unexpected Authorization header: %s", r.Header.Get("Authorization"))
50+
}
51+
w.Header().Set("Content-Type", "application/json")
52+
_ = json.NewEncoder(w).Encode(TranscriptionResponse{
53+
Text: "hello world",
54+
Language: "en",
55+
Duration: 1.5,
56+
})
57+
}))
58+
defer srv.Close()
59+
60+
tr := NewGroqTranscriber("sk-test")
61+
tr.apiBase = srv.URL
62+
63+
resp, err := tr.Transcribe(context.Background(), audioPath)
64+
if err != nil {
65+
t.Fatalf("Transcribe() error: %v", err)
66+
}
67+
if resp.Text != "hello world" {
68+
t.Errorf("Text = %q, want %q", resp.Text, "hello world")
69+
}
70+
if resp.Language != "en" {
71+
t.Errorf("Language = %q, want %q", resp.Language, "en")
72+
}
73+
})
74+
75+
t.Run("api error", func(t *testing.T) {
76+
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
77+
http.Error(w, `{"error":"invalid_api_key"}`, http.StatusUnauthorized)
78+
}))
79+
defer srv.Close()
80+
81+
tr := NewGroqTranscriber("sk-bad")
82+
tr.apiBase = srv.URL
83+
84+
_, err := tr.Transcribe(context.Background(), audioPath)
85+
if err == nil {
86+
t.Fatal("expected error for non-200 response, got nil")
87+
}
88+
})
89+
90+
t.Run("missing file", func(t *testing.T) {
91+
tr := NewGroqTranscriber("sk-test")
92+
_, err := tr.Transcribe(context.Background(), filepath.Join(tmpDir, "nonexistent.ogg"))
93+
if err == nil {
94+
t.Fatal("expected error for missing file, got nil")
95+
}
96+
})
97+
}

0 commit comments

Comments
 (0)