From 447a4ba317c11b6924e64e1216be467e3938929b Mon Sep 17 00:00:00 2001 From: heimoshuiyu Date: Wed, 15 Apr 2026 18:38:13 +0800 Subject: [PATCH 1/2] add voice input feature --- packages/app/src/components/prompt-input.tsx | 50 ++- .../app/src/components/prompt-input/voice.tsx | 303 ++++++++++++++ packages/core/src/v1/config/config.ts | 2 + packages/core/src/v1/config/voice.ts | 42 ++ packages/opencode/src/config/config.ts | 2 +- packages/opencode/src/provider/transform.ts | 8 + .../src/server/routes/instance/httpapi/api.ts | 2 + .../routes/instance/httpapi/groups/audio.ts | 108 +++++ .../routes/instance/httpapi/handlers/audio.ts | 157 +++++++ .../server/routes/instance/httpapi/server.ts | 4 + packages/opencode/src/session/message-v2.ts | 3 + packages/opencode/src/voice/config.ts | 41 ++ packages/opencode/src/voice/error.ts | 27 ++ packages/opencode/src/voice/index.ts | 392 ++++++++++++++++++ packages/opencode/src/voice/lalm.txt | 37 ++ packages/tui/src/component/prompt/index.tsx | 64 +++ packages/tui/src/component/prompt/voice.ts | 180 ++++++++ packages/tui/src/config/index.tsx | 12 +- packages/tui/src/config/keybind.ts | 2 + packages/tui/src/util/voice.ts | 231 +++++++++++ packages/ui/src/components/icon.tsx | 1 + packages/web/src/content/docs/config.mdx | 49 +++ packages/web/src/content/docs/tui.mdx | 7 + 23 files changed, 1720 insertions(+), 4 deletions(-) create mode 100644 packages/app/src/components/prompt-input/voice.tsx create mode 100644 packages/core/src/v1/config/voice.ts create mode 100644 packages/opencode/src/server/routes/instance/httpapi/groups/audio.ts create mode 100644 packages/opencode/src/server/routes/instance/httpapi/handlers/audio.ts create mode 100644 packages/opencode/src/voice/config.ts create mode 100644 packages/opencode/src/voice/error.ts create mode 100644 packages/opencode/src/voice/index.ts create mode 100644 packages/opencode/src/voice/lalm.txt create mode 100644 packages/tui/src/component/prompt/voice.ts create mode 100644 packages/tui/src/util/voice.ts diff --git a/packages/app/src/components/prompt-input.tsx b/packages/app/src/components/prompt-input.tsx index 61a5c55d3e1a..ea83512dbd70 100644 --- a/packages/app/src/components/prompt-input.tsx +++ b/packages/app/src/components/prompt-input.tsx @@ -81,6 +81,7 @@ import { useQueryOptions } from "@/context/server-sync" import { pathKey } from "@/utils/path-key" import { base64Encode } from "@opencode-ai/core/util/encode" import { displayName } from "@/pages/layout/helpers" +import { createVoiceInput, VoiceButton } from "./prompt-input/voice" interface PromptInputProps { class?: string @@ -602,6 +603,23 @@ export const PromptInput: Component = (props) => { }) } + command.register(() => [ + { + id: "prompt.voice", + title: "Voice input", + description: "Start or stop voice recording", + category: "Prompt", + keybind: "mod+shift+v", + onSelect: () => { + if (voice.hasLastRecording() && !voice.transcribing()) { + void voice.confirmRetry() + } else { + void voice.toggleVoice() + } + }, + }, + ]) + const agentList = createMemo(() => sync.data.agent .filter((agent) => !agent.hidden && agent.mode !== "primary") @@ -1032,6 +1050,14 @@ export const PromptInput: Component = (props) => { return true } + const voice = createVoiceInput({ + sdk, + editorText: () => prompt.current().map((part) => ("content" in part ? part.content : "")).join(""), + addPart, + editorRef, + queueScroll, + }) + const addToHistory = (prompt: Prompt, mode: "normal" | "shell") => { const currentHistory = mode === "shell" ? shellHistory : history const setCurrentHistory = mode === "shell" ? setShellHistory : setHistory @@ -1634,6 +1660,16 @@ export const PromptInput: Component = (props) => { + = (props) => {
-
+
= (props) => { {language.t("common.cancel")}
-
+
= (props) => {
+
diff --git a/packages/app/src/components/prompt-input/voice.tsx b/packages/app/src/components/prompt-input/voice.tsx new file mode 100644 index 000000000000..4e4c60d825d4 --- /dev/null +++ b/packages/app/src/components/prompt-input/voice.tsx @@ -0,0 +1,303 @@ +import { createMemo, Match, onCleanup, Switch, type Component } from "solid-js" +import { createStore } from "solid-js/store" +import { showToast } from "@opencode-ai/ui/toast" +import { Button } from "@opencode-ai/ui/button" +import { Icon } from "@opencode-ai/ui/icon" +import { Spinner } from "@opencode-ai/ui/spinner" +import { Tooltip, TooltipKeybind } from "@opencode-ai/ui/tooltip" +import type { useSDK } from "@/context/sdk" +import { useSessionLayout } from "@/pages/session/session-layout" + +type VoiceInput = { + sdk: ReturnType + editorText: () => string + addPart: (part: { type: "text"; content: string; start: number; end: number }) => void + editorRef: HTMLDivElement + queueScroll: () => void +} + +const isVoiceSupported = () => + typeof navigator !== "undefined" && + typeof window !== "undefined" && + Boolean(navigator.mediaDevices?.getUserMedia) && + typeof MediaRecorder !== "undefined" + +export function createVoiceInput(input: VoiceInput) { + const { params } = useSessionLayout() + + const [state, setState] = createStore({ + recording: false, + transcribing: false, + lastRecording: undefined as Blob | undefined, + }) + const recording = () => state.recording + const transcribing = () => state.transcribing + const hasLastRecording = () => Boolean(state.lastRecording) + const audio = { + recorder: undefined as MediaRecorder | undefined, + stream: undefined as MediaStream | undefined, + controller: undefined as AbortController | undefined, + chunks: [] as Blob[], + mime: "", + } + + const stopStream = () => { + audio.stream?.getTracks().forEach((track) => track.stop()) + audio.stream = undefined + } + + const recordStart = async () => { + if (!isVoiceSupported()) { + showToast({ + title: "Voice input unavailable", + description: "Your browser does not support audio recording.", + }) + return false + } + if (audio.recorder) return false + + const stream = await navigator.mediaDevices + .getUserMedia({ audio: true }) + .catch(() => undefined) + if (!stream) { + showToast({ + title: "Microphone blocked", + description: "Allow microphone access to start recording.", + }) + return false + } + + audio.stream = stream + + const preferred = "audio/webm;codecs=opus" + const fallback = "audio/webm" + const mime = MediaRecorder.isTypeSupported(preferred) + ? preferred + : MediaRecorder.isTypeSupported(fallback) + ? fallback + : "" + if (!mime) { + stopStream() + showToast({ + title: "Voice input unavailable", + description: "This browser does not support the available audio formats.", + }) + return false + } + const recorder = new MediaRecorder(stream, { mimeType: mime }) + + audio.mime = recorder.mimeType || mime + audio.chunks = [] + audio.recorder = recorder + + recorder.ondataavailable = (event) => { + if (event.data.size === 0) return + audio.chunks.push(event.data) + } + + recorder.start() + setState("recording", true) + return true + } + + const recordStop = async () => { + if (!audio.recorder) return + const recorder = audio.recorder + audio.recorder = undefined + + const result = new Promise((resolve) => { + recorder.onstop = () => { + resolve(new Blob(audio.chunks, { type: audio.mime || "audio/webm" })) + } + }) + + recorder.stop() + const blob = await result + stopStream() + setState("recording", false) + return blob + } + + const transcribeAudio = async (blob: Blob) => { + if (!blob.size) { + showToast({ + title: "No audio captured", + description: "Try recording again.", + }) + return + } + + const mime = blob.type || "audio/webm" + const prompt = input.editorText() + const controller = new AbortController() + audio.controller = controller + setState("transcribing", true) + + const arrayBuffer = await blob.arrayBuffer() + const bytes = new Uint8Array(arrayBuffer) + // String.fromCharCode has a max argument limit; chunk to avoid stack overflow + const chunks: string[] = [] + for (let i = 0; i < bytes.length; i += 8192) { + chunks.push(String.fromCharCode(...bytes.subarray(i, i + 8192))) + } + const base64 = btoa(chunks.join("")) + + const result = await input.sdk.client + .audio.transcribe( + { + directory: input.sdk.directory, + audio: base64, + mime, + ...(prompt.trim() ? { prompt } : {}), + ...(params.id ? { sessionID: params.id } : {}), + }, + { signal: controller.signal, throwOnError: true }, + ) + .then((res) => ({ ok: true as const, text: res.data.text })) + .catch((error) => ({ + ok: false as const, + message: error instanceof Error ? error.message : String(error), + })) + + audio.controller = undefined + + if (!result.ok) { + setState("transcribing", false) + if (controller.signal.aborted) return + showToast({ + title: "Transcription failed", + description: result.message || "Press Retry to try again.", + }) + return + } + + setState("transcribing", false) + + if (controller.signal.aborted) return + + const text = result.text ?? "" + + if (!text.trim()) { + showToast({ + title: "No speech detected", + description: "Press Retry to try again.", + }) + return + } + + // Success — clear saved recording + setState("lastRecording", undefined) + + input.addPart({ type: "text", content: text, start: 0, end: 0 }) + requestAnimationFrame(() => { + input.editorRef.focus() + input.queueScroll() + }) + } + + const confirmRetry = async () => { + const blob = state.lastRecording + if (!blob) return + await transcribeAudio(blob) + } + + const cancelRetry = () => { + setState("lastRecording", undefined) + } + + const toggleVoice = async () => { + if (transcribing()) { + const controller = audio.controller + if (controller) { + controller.abort() + setState("transcribing", false) + setState("lastRecording", undefined) + showToast({ + title: "Transcription cancelled", + description: "Stopped the current transcription.", + }) + } + return + } + + if (recording()) { + const blob = await recordStop() + if (!blob) return + setState("lastRecording", blob) + await transcribeAudio(blob) + return + } + + await recordStart() + } + + const voiceTitle = createMemo(() => + transcribing() ? "Cancel transcription" : recording() ? "Stop recording" : "Voice input", + ) + + onCleanup(() => { + if (transcribing()) { + const controller = audio.controller + if (controller) controller.abort() + setState("transcribing", false) + } + setState("lastRecording", undefined) + if (!recording()) return + void recordStop() + }) + + return { + recording, + transcribing, + hasLastRecording, + voiceTitle, + toggleVoice, + confirmRetry, + cancelRetry, + } +} + +export const VoiceButton: Component<{ + voiceTitle: () => string + toggleVoice: () => void + confirmRetry: () => void + cancelRetry: () => void + recording: () => boolean + transcribing: () => boolean + hasLastRecording: () => boolean + keybind: string +}> = (props) => ( + + +
+ + + + + + +
+
+ + + + + +
+) diff --git a/packages/core/src/v1/config/config.ts b/packages/core/src/v1/config/config.ts index 2e773f71e256..6f1c352ab001 100644 --- a/packages/core/src/v1/config/config.ts +++ b/packages/core/src/v1/config/config.ts @@ -16,6 +16,7 @@ import { ConfigPluginV1 } from "./plugin" import { ConfigProviderV1 } from "./provider" import { ConfigServerV1 } from "./server" import { ConfigSkillsV1 } from "./skills" +import { ConfigVoiceV1 } from "./voice" export type Layout = ConfigLayoutV1.Layout @@ -53,6 +54,7 @@ export const Info = Schema.Struct({ description: "Enable or disable snapshot tracking. When false, filesystem snapshots are not recorded and undoing or reverting will not undo/redo file changes. Defaults to true.", }), + voice: Schema.optional(ConfigVoiceV1.Info).annotate({ description: "Voice transcription settings" }), plugin: Schema.optional(Schema.mutable(Schema.Array(ConfigPluginV1.Spec))), share: Schema.optional(Schema.Literals(["manual", "auto", "disabled"])).annotate({ description: diff --git a/packages/core/src/v1/config/voice.ts b/packages/core/src/v1/config/voice.ts new file mode 100644 index 000000000000..58587bc0fc95 --- /dev/null +++ b/packages/core/src/v1/config/voice.ts @@ -0,0 +1,42 @@ +export * as ConfigVoiceV1 from "./voice" + +import { Schema } from "effect" +import { PositiveInt } from "../../schema" + +export const Whisper = Schema.Struct({ + url: Schema.optional(Schema.String).annotate({ description: "Whisper API URL" }), + apiKey: Schema.optional(Schema.String).annotate({ description: "Whisper API key" }), + model: Schema.optional(Schema.String).annotate({ description: "Whisper model name" }), + language: Schema.optional(Schema.String).annotate({ description: "Whisper language code" }), +}).annotate({ identifier: "VoiceWhisperConfig" }) +export type Whisper = Schema.Schema.Type + +export const Lalm = Schema.Struct({ + model: Schema.optional(Schema.String).annotate({ + description: "Model to use for audio transcription in the format of provider/model, eg openai/gpt-4o-audio-preview", + }), + system: Schema.optional(Schema.String).annotate({ description: "Large Audio Language Model system prompt" }), + instruction: Schema.optional(Schema.String).annotate({ + description: "Instruction text appended after the audio content to guide transcription behavior", + }), + audio_input_format: Schema.optional(Schema.Literals(["input_audio", "audio_url"])).annotate({ + description: + 'Audio input format for the LLM API. "input_audio" (default) sends audio as OpenAI-style base64 parts. "audio_url" sends audio as data-URL parts compatible with SiliconFlow/Qwen-style APIs.', + }), +}).annotate({ identifier: "VoiceLalmConfig" }) +export type Lalm = Schema.Schema.Type + +export const Info = Schema.Struct({ + type: Schema.optional(Schema.Literals(["whisper", "lalm"])).annotate({ + description: "Transcription provider type", + }), + whisper: Schema.optional(Whisper).annotate({ description: "Whisper transcription settings" }), + lalm: Schema.optional(Lalm).annotate({ description: "Large Audio Language Model transcription settings" }), + hot_words: Schema.optional(Schema.String).annotate({ + description: "Comma-separated hot words to improve transcription accuracy for domain-specific terms", + }), + context_pairs: Schema.optional(PositiveInt).annotate({ + description: "Number of recent user/assistant conversation pairs to include as transcription context (default: 3)", + }), +}).annotate({ identifier: "VoiceConfig" }) +export type Info = Schema.Schema.Type diff --git a/packages/opencode/src/config/config.ts b/packages/opencode/src/config/config.ts index 7f568f492073..3f5445fe78cf 100644 --- a/packages/opencode/src/config/config.ts +++ b/packages/opencode/src/config/config.ts @@ -108,7 +108,7 @@ async function resolveLoadedPlugins("AudioError")( + { + name: Schema.Literal("AudioError"), + data: Schema.Struct({ + message: Schema.String, + }), + }, + { httpApiStatus: 400 }, +) {} + +export const AudioApi = HttpApi.make("audio") + .add( + HttpApiGroup.make("audio") + .add( + HttpApiEndpoint.post("transcribe", "/voice/transcribe", { + query: WorkspaceRoutingQuery, + payload: TranscribeRequest, + success: described(TranscribeResponse, "Transcription result"), + error: AudioApiError, + }).annotateMerge( + OpenApi.annotations({ + identifier: "audio.transcribe", + summary: "Transcribe audio", + description: + "Transcribe base64-encoded audio data with Whisper or an audio language model", + }), + ), + ) + .annotateMerge( + OpenApi.annotations({ + title: "audio", + description: "Audio transcription routes.", + }), + ) + .middleware(InstanceContextMiddleware) + .middleware(WorkspaceRoutingMiddleware) + .middleware(Authorization), + ) + .annotateMerge( + OpenApi.annotations({ + title: "audio", + version: "0.0.1", + description: "Audio transcription routes.", + }), + ) diff --git a/packages/opencode/src/server/routes/instance/httpapi/handlers/audio.ts b/packages/opencode/src/server/routes/instance/httpapi/handlers/audio.ts new file mode 100644 index 000000000000..47979693d39d --- /dev/null +++ b/packages/opencode/src/server/routes/instance/httpapi/handlers/audio.ts @@ -0,0 +1,157 @@ +import { Effect } from "effect" +import { HttpApiBuilder } from "effect/unstable/httpapi" +import { HttpServerRequest } from "effect/unstable/http" +import { InstanceHttpApi } from "../api" +import { Voice } from "@/voice" +import { AudioApiError, TranscribeRequest } from "../groups/audio" +import type { Info } from "@/config/config" +import { Config } from "@/config/config" +import { Session } from "@/session/session" +import { Vcs } from "@/project/vcs" +import { WorkspaceRouteContext } from "../middleware/workspace-routing" +import { MessageV2 } from "@/session/message-v2" +import { SessionID } from "@/session/schema" + +const toVoiceOverride = ( + payload: typeof TranscribeRequest.Type, + serverVoice: Info["voice"], +): Info["voice"] | undefined => { + const v = payload.voice + if (!v) return undefined + return { + ...serverVoice, + ...v.type && { type: v.type }, + ...v.hot_words && { hot_words: v.hot_words }, + ...v.whisper && { + whisper: { + ...serverVoice?.whisper, + ...v.whisper, + }, + }, + ...v.lalm && { + lalm: { + ...serverVoice?.lalm, + ...v.lalm.model && { model: `${v.lalm.model.providerID}/${v.lalm.model.modelID}` }, + ...v.lalm.system && { system: v.lalm.system }, + ...v.lalm.instruction && { instruction: v.lalm.instruction }, + ...v.lalm.audio_input_format && { audio_input_format: v.lalm.audio_input_format }, + }, + }, + } +} + +const buildConversationContext = ( + messages: MessageV2.WithParts[], + limit: number, +): string => { + const pairs: Array<{ user: string; assistant?: string }> = [] + let pendingAssistant: string | undefined + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i] + if (msg.info.role === "assistant" && pendingAssistant === undefined && !msg.info.summary) { + const text = msg.parts + .filter((p): p is MessageV2.TextPart => p.type === "text") + .map((p) => p.text) + .join(" ") + .trim() + if (text) pendingAssistant = text + } + if (msg.info.role === "user") { + const text = msg.parts + .filter((p): p is MessageV2.TextPart => p.type === "text" && !p.synthetic) + .map((p) => p.text) + .join(" ") + .trim() + if (!text) continue + pairs.push({ user: text, assistant: pendingAssistant }) + pendingAssistant = undefined + if (pairs.length >= limit) break + } + } + if (pendingAssistant !== undefined && pairs.length < limit) { + pairs.push({ user: "", assistant: pendingAssistant }) + } + return pairs + .reverse() + .flatMap((p) => [ + p.user ? `User: ${p.user}` : undefined, + p.assistant ? `Assistant: ${p.assistant}` : undefined, + ]) + .filter((s): s is string => s !== undefined) + .join("\n") +} + +const buildPrompt = Effect.fn("AudioHttpApi.buildPrompt")(function* (input: { + sessionID?: SessionID + extraPrompt?: string +}) { + const route = yield* WorkspaceRouteContext + const vcs = yield* Vcs.Service + + const parts: string[] = [] + + if (route.directory) parts.push(`directory: ${route.directory}`) + const branch = yield* vcs.branch().pipe(Effect.catch(() => Effect.succeed(undefined))) + if (branch) parts.push(`branch: ${branch}`) + + if (input.sessionID) { + const session = yield* Session.Service + const config = yield* Config.Service + const cfg = yield* config.get() + const limit = cfg.voice?.context_pairs ?? 3 + + const messages = yield* session.messages({ sessionID: input.sessionID, limit: 50 }) + if (messages.length > 0) { + parts.push(buildConversationContext(messages, limit)) + } + } + + if (input.extraPrompt?.trim()) parts.push(input.extraPrompt) + + return parts.filter((s) => s.trim()).join("\n") +}) + +export const audioHandlers = HttpApiBuilder.group(InstanceHttpApi, "audio", (handlers) => + Effect.gen(function* () { + const voice = yield* Voice.Service + const config = yield* Config.Service + + const transcribe = Effect.fn("AudioHttpApi.transcribe")(function* (ctx: { + payload: typeof TranscribeRequest.Type + }) { + const buffer = new Uint8Array(Buffer.from(ctx.payload.audio, "base64")) + const blob = new Blob([buffer], { type: ctx.payload.mime }) + const file = new File([blob], "audio.mp3", { type: ctx.payload.mime }) + const request = yield* HttpServerRequest.HttpServerRequest + const signal = request.source instanceof Request ? request.source.signal : undefined + + const prompt = yield* buildPrompt({ + sessionID: ctx.payload.sessionID, + extraPrompt: ctx.payload.prompt, + }).pipe( + Effect.mapError((cause) => + new AudioApiError({ name: "AudioError", data: { message: cause.message } }), + ), + ) + + const cfg = yield* config.get() + + const images = ctx.payload.images?.length ? [...ctx.payload.images] : undefined + + return yield* voice.transcribe({ + file, + mime: ctx.payload.mime, + prompt, + signal, + images, + voice: toVoiceOverride(ctx.payload, cfg.voice), + }).pipe( + Effect.mapError((error) => + new AudioApiError({ name: "AudioError", data: { message: error.message } }), + ), + ) + }) + + return handlers.handle("transcribe", transcribe) + }), +) diff --git a/packages/opencode/src/server/routes/instance/httpapi/server.ts b/packages/opencode/src/server/routes/instance/httpapi/server.ts index 8c5c0ad96d67..df5207b9c837 100644 --- a/packages/opencode/src/server/routes/instance/httpapi/server.ts +++ b/packages/opencode/src/server/routes/instance/httpapi/server.ts @@ -46,6 +46,7 @@ import { Snapshot } from "@/snapshot" import { Storage } from "@/storage/storage" import { ToolRegistry } from "@/tool/registry" import { Truncate } from "@/tool/truncate" +import { Voice } from "@/voice" import { Worktree } from "@/worktree" import { RuntimeFlags } from "@/effect/runtime-flags" import { MoveSession } from "@opencode-ai/core/control-plane/move-session" @@ -76,6 +77,7 @@ import { import { EventApi } from "./groups/event" import { PtyConnectApi } from "./groups/pty" import { eventHandlers } from "./handlers/event" +import { audioHandlers } from "./handlers/audio" import { configHandlers } from "./handlers/config" import { controlHandlers } from "./handlers/control" import { controlPlaneHandlers } from "./handlers/control-plane" @@ -143,6 +145,7 @@ const ptyConnectApiRoutes = HttpApiBuilder.layer(PtyConnectApi).pipe( ) const instanceApiRoutes = HttpApiBuilder.layer(InstanceHttpApi).pipe( Layer.provide([ + audioHandlers, configHandlers, experimentalHandlers, fileHandlers, @@ -243,6 +246,7 @@ const app = LayerNode.group([ Format.node, Project.node, Vcs.node, + Voice.node, Workspace.node, Worktree.node, Installation.node, diff --git a/packages/opencode/src/session/message-v2.ts b/packages/opencode/src/session/message-v2.ts index 1590e0890372..f91f6faee234 100644 --- a/packages/opencode/src/session/message-v2.ts +++ b/packages/opencode/src/session/message-v2.ts @@ -17,8 +17,11 @@ import { User, WithParts, type ToolPart, + TextPart, } from "@opencode-ai/core/v1/session" +export type { WithParts, TextPart } + import { NamedError } from "@opencode-ai/core/util/error" import { APICallError, convertToModelMessages, LoadAPIKeyError, type ModelMessage, type UIMessage } from "ai" import { Database } from "@opencode-ai/core/database/database" diff --git a/packages/opencode/src/voice/config.ts b/packages/opencode/src/voice/config.ts new file mode 100644 index 000000000000..29064a9e7d5a --- /dev/null +++ b/packages/opencode/src/voice/config.ts @@ -0,0 +1,41 @@ +import type { Info } from "@/config/config" + +type Voice = Info["voice"] + +export type AudioInputFormat = "input_audio" | "audio_url" + +export function providerType(voice: Voice, fallback?: Voice): "whisper" | "lalm" { + return voice?.type ?? fallback?.type ?? "lalm" +} + +export function lalm(voice: Voice) { + const model = voice?.lalm?.model + if (!model) { + return { + ok: false as const, + message: "Missing voice.lalm.model (format: provider/model, e.g. openai/gpt-4o-audio-preview)", + } + } + return { ok: true as const, config: { ...(voice?.lalm ?? {}), model } } +} + +export function audioInputFormat(voice: Voice): AudioInputFormat { + return voice?.lalm?.audio_input_format ?? "input_audio" +} + +export function whisper(voice: Voice) { + const apiKey = voice?.whisper?.apiKey + if (!apiKey) { + return { ok: false as const, message: "Missing voice.whisper.apiKey" } + } + return { ok: true as const, config: { ...(voice?.whisper ?? {}), apiKey } } +} + +export function status(voice: Voice) { + const type = providerType(voice) + const result = type === "lalm" ? lalm(voice) : whisper(voice) + if (result.ok) return { ok: true as const, type } + return { ok: false as const, type, message: result.message } +} + +export * as VoiceConfig from "./config" diff --git a/packages/opencode/src/voice/error.ts b/packages/opencode/src/voice/error.ts new file mode 100644 index 000000000000..493d444b623c --- /dev/null +++ b/packages/opencode/src/voice/error.ts @@ -0,0 +1,27 @@ +import { Effect, Schema } from "effect" + +export class VoiceError extends Schema.TaggedErrorClass()("VoiceError", { + message: Schema.String, + cause: Schema.optional(Schema.Defect), +}) {} + +export type Error = VoiceError + +export function abortable(effect: Effect.Effect, signal?: AbortSignal) { + if (!signal) return effect + return effect.pipe( + Effect.raceFirst( + Effect.callback((resume) => { + if (signal.aborted) { + resume(Effect.fail(new VoiceError({ message: "Voice transcription aborted" }))) + return + } + const abort = () => resume(Effect.fail(new VoiceError({ message: "Voice transcription aborted" }))) + signal.addEventListener("abort", abort, { once: true }) + return Effect.sync(() => signal.removeEventListener("abort", abort)) + }), + ), + ) +} + +export * as VoiceErrors from "./error" diff --git a/packages/opencode/src/voice/index.ts b/packages/opencode/src/voice/index.ts new file mode 100644 index 000000000000..0dd63b3a8581 --- /dev/null +++ b/packages/opencode/src/voice/index.ts @@ -0,0 +1,392 @@ +import { Effect, Layer, Context, Schema, Stream } from "effect" +import { Usage as LLMUsage } from "@opencode-ai/llm" +import { type Info, Config } from "@/config/config" +import { Provider } from "@/provider/provider" +import * as ProviderTransform from "@/provider/transform" +import { VoiceConfig } from "@/voice/config" +import { VoiceError, abortable } from "@/voice/error" +import { errorMessage } from "@/util/error" +import { ChildProcess, ChildProcessSpawner } from "effect/unstable/process" +import { HttpClient, HttpClientRequest, HttpClientResponse } from "effect/unstable/http" +import { CrossSpawnSpawner } from "@opencode-ai/core/cross-spawn-spawner" +import { LayerNode } from "@opencode-ai/core/effect/layer-node" +import { httpClient } from "@opencode-ai/core/effect/layer-node-platform" +import { serviceUse } from "@opencode-ai/core/effect/service-use" +import { generateText } from "ai" +import PROMPT from "./lalm.txt" + +const log = { + info(message?: unknown, extra?: Record) { + console.debug("[voice]", message, extra ?? "") + }, + time(_message: string, _extra?: Record) { + return { [Symbol.dispose]: () => {} } + }, +} + +const WhisperResponse = Schema.Struct({ + text: Schema.optional(Schema.String), +}) + +/** + * Builds the audio content part for an LLM message. + * + * Two formats are supported via the `audio_input_format` config: + * + * - `"input_audio"` (default): uses `{ type: "file", mediaType: "audio/*" }`. + * The AI SDK converts this to OpenAI-style `input_audio` parts automatically. + * + * - `"audio_url"`: also uses a `file` content part but injects `audio_url` data + * through `providerOptions.openaiCompatible`. The openai-compatible provider SDK + * spreads `providerOptions.openaiCompatible` into every content part it serialises + * (line ~99 of convert-to-openai-compatible-chat-messages.ts: + * `return { type: 'input_audio', input_audio: {...}, ...partMetadata }`). + * By placing `type: "audio_url"` and `audio_url: { url }` in that object, + * the spread overwrites `type` and injects the `audio_url` field — entirely + * through the SDK's public providerOptions API. The stale `input_audio` field + * remains in the output but is ignored because `type: "audio_url"` tells the + * API to read from `audio_url` instead. + */ +function audioContentPart(audio: Uint8Array, mediaType: string, format: "input_audio" | "audio_url") { + const base64 = Buffer.from(audio).toString("base64") + const filePart = { type: "file" as const, data: audio, mediaType } + if (format === "input_audio") return filePart + return { + ...filePart, + providerOptions: { + openaiCompatible: { + type: "audio_url", + audio_url: { url: `data:${mediaType};base64,${base64}` }, + }, + }, + } +} + +export type TranscribeInput = { + file: File + mime: string + prompt?: string + signal?: AbortSignal + images?: string[] + voice?: Info["voice"] +} + +export type TranscribeResult = { + text: string + usage?: LLMUsage +} + +export interface Interface { + readonly transcribe: (input: TranscribeInput) => Effect.Effect +} + +export class Service extends Context.Service()("@opencode/Voice") {} + +export const use = serviceUse(Service) + +function appendPrompt(context: string, prompt?: string) { + const trimmed = prompt?.trim() + if (!trimmed) return context + if (!context) return trimmed + return `${context}\n${trimmed}` +} + +export const layer = Layer.effect( + Service, + Effect.gen(function* () { + const config = yield* Config.Service + const spawner = yield* ChildProcessSpawner.ChildProcessSpawner + const provider = yield* Provider.Service + const http = yield* HttpClient.HttpClient + + // --- Audio helpers --- + + const toWavOrMp3 = Effect.fn("Voice.toWavOrMp3")( + function* (input: { buffer: ArrayBuffer; mime: string }) { + const isWav = input.mime.includes("wav") + const isMp3 = input.mime.includes("mpeg") || input.mime.includes("mp3") + if (isWav || isMp3) { + const name = isWav ? "audio.wav" : "audio.mp3" + const mime = isWav ? "audio/wav" : "audio/mpeg" + return { buffer: input.buffer, name, mime } + } + + const handle = yield* spawner + .spawn( + ChildProcess.make("ffmpeg", [ + "-y", "-i", "pipe:0", + "-ac", "1", "-f", "mp3", "pipe:1", + ], { + stdin: Stream.make(new Uint8Array(input.buffer)), + stdout: "pipe", + stderr: "pipe", + }), + ) + .pipe( + Effect.mapError((cause) => + new VoiceError({ + message: + typeof cause === "object" && cause !== null && "code" in cause && (cause as { code: string }).code === "ENOENT" + ? `ffmpeg is not installed. Install ffmpeg to convert ${input.mime} audio for transcription.` + : "Failed to start ffmpeg for voice audio conversion", + cause, + }), + ), + ) + + const [stdout, stderr, code] = yield* Effect.all( + [ + Stream.runFold( + handle.stdout, + () => ({ chunks: Array(), bytes: 0 }), + (acc, chunk) => { + acc.chunks.push(chunk) + acc.bytes += chunk.length + return acc + }, + ).pipe(Effect.map((result) => Buffer.concat(result.chunks, result.bytes))), + Stream.mkString(Stream.decodeText(handle.stderr)), + handle.exitCode, + ], + { concurrency: 3 }, + ).pipe( + Effect.mapError((cause) => + new VoiceError({ + message: "Failed to convert voice audio with ffmpeg", + cause, + }), + ), + ) + + if (code !== 0) { + return yield* new VoiceError({ + message: `ffmpeg conversion failed (exit code ${code}): ${stderr.trim() || "unknown error"}`, + }) + } + if (!stdout.byteLength) { + return yield* new VoiceError({ message: "ffmpeg conversion produced no audio output" }) + } + + return { + buffer: stdout.buffer.slice(stdout.byteOffset, stdout.byteOffset + stdout.byteLength), + name: "audio.mp3", + mime: "audio/mpeg", + } as const + }, + Effect.scoped, + ) + + const prepareAudio = Effect.fn("Voice.prepareAudio")(function* (file: File, mime: string) { + const content = yield* Effect.tryPromise({ + try: () => file.arrayBuffer(), + catch: (cause) => new VoiceError({ message: "Failed to read voice audio file", cause }), + }) + return yield* toWavOrMp3({ buffer: content, mime }) + }) + + // --- Whisper transcription --- + + const transcribeWhisper = Effect.fn("Whisper.transcribe")(function* (input: { + file: File + mime: string + prompt?: string + signal?: AbortSignal + voice?: Info["voice"] + }) { + const cfg = yield* config.get() + const voice = input.voice ?? cfg.voice + const whisper = VoiceConfig.whisper(voice) + if (!whisper.ok) return yield* new VoiceError({ message: whisper.message }) + + const prepared = yield* prepareAudio(input.file, input.mime) + const prompt = appendPrompt(input.prompt ?? "", voice?.hot_words) + + const form = new FormData() + const audioBytes = new Uint8Array(prepared.buffer.byteLength) + audioBytes.set(new Uint8Array(prepared.buffer)) + form.append("file", new Blob([audioBytes], { type: prepared.mime }), prepared.name) + form.append("model", whisper.config.model ?? "whisper-1") + form.append("response_format", "json") + if (whisper.config.language) { + form.append("language", whisper.config.language) + } + if (prompt) { + form.append("prompt", prompt) + } + + const url = whisper.config.url ?? "https://api.openai.com/v1/audio/transcriptions" + log.info("whisper request", { + url, + model: whisper.config.model ?? "whisper-1", + bytes: prepared.buffer.byteLength, + }) + + const result = yield* abortable( + http + .execute( + HttpClientRequest.post(url).pipe( + HttpClientRequest.bearerToken(whisper.config.apiKey), + HttpClientRequest.bodyFormData(form), + ), + ) + .pipe( + Effect.mapError((cause) => + new VoiceError({ message: errorMessage(cause), cause }), + ), + ), + input.signal, + ) + + if (result.status < 200 || result.status >= 300) { + const body = yield* result.text.pipe(Effect.catch(() => Effect.succeed(""))) + return yield* new VoiceError({ + message: body || `Whisper request failed (${result.status})`, + }) + } + + log.info("whisper response", { contentType: result.headers["content-type"] }) + const payload = yield* HttpClientResponse.schemaBodyJson(WhisperResponse)(result).pipe( + Effect.mapError((cause) => + new VoiceError({ + message: "Failed to decode Whisper transcription response", + cause, + }), + ), + ) + log.info("transcribed", { provider: "whisper", text: payload.text }) + return { text: payload.text ?? "" } + }) + + // --- LALM transcription --- + + const transcribeLalm = Effect.fn("Lalm.transcribe")(function* (input: { + file: File + mime: string + prompt?: string + signal?: AbortSignal + images?: string[] + voice?: Info["voice"] + }) { + const cfg = yield* config.get() + const voice = input.voice ?? cfg.voice + const lalm = VoiceConfig.lalm(voice) + if (!lalm.ok) return yield* new VoiceError({ message: lalm.message }) + + const { providerID, modelID } = Provider.parseModel(lalm.config.model) + + const prepared = yield* prepareAudio(input.file, input.mime) + const mediaType = prepared.mime.includes("wav") ? "audio/wav" : "audio/mpeg" + + const context = appendPrompt(input.prompt ?? "", voice?.hot_words) + + const system = (lalm.config.system ?? PROMPT).trim() + const instruction = lalm.config.instruction ?? "Transcribe the audio between