From 9f6e1a925826976ada50ccb2751eb7fa959facd1 Mon Sep 17 00:00:00 2001 From: zhangmo8 Date: Tue, 20 Jan 2026 12:31:08 +0800 Subject: [PATCH 1/4] feat: voice ai text to speech --- .../presenter/configPresenter/providers.ts | 15 + .../managers/providerInstanceManager.ts | 3 + .../providers/voiceAIProvider.ts | 469 ++++++++++++++++++ .../ModelProviderSettingsDetail.vue | 5 + .../components/VoiceAIProviderConfig.vue | 204 ++++++++ src/renderer/src/assets/llm-icons/voiceai.svg | 11 + .../src/components/icons/ModelIcon.vue | 2 + .../components/message/MessageBlockAudio.vue | 112 +++++ .../message/MessageItemAssistant.vue | 23 + .../components/settings/ModelConfigDialog.vue | 3 +- src/renderer/src/i18n/en-US/settings.json | 27 + src/renderer/src/i18n/zh-CN/settings.json | 27 + src/renderer/src/stores/chat.ts | 7 +- src/renderer/src/stores/providerStore.ts | 40 +- src/shared/chat.d.ts | 1 + src/shared/types/core/chat.ts | 1 + 16 files changed, 946 insertions(+), 4 deletions(-) create mode 100644 src/main/presenter/llmProviderPresenter/providers/voiceAIProvider.ts create mode 100644 src/renderer/settings/components/VoiceAIProviderConfig.vue create mode 100644 src/renderer/src/assets/llm-icons/voiceai.svg create mode 100644 src/renderer/src/components/message/MessageBlockAudio.vue diff --git a/src/main/presenter/configPresenter/providers.ts b/src/main/presenter/configPresenter/providers.ts index a6cf6ec77..db04a89d4 100644 --- a/src/main/presenter/configPresenter/providers.ts +++ b/src/main/presenter/configPresenter/providers.ts @@ -217,6 +217,21 @@ export const DEFAULT_PROVIDERS: LLM_PROVIDER_BASE[] = [ defaultBaseUrl: 'https://api.openai.com/v1' } }, + { + id: 'voiceai', + name: 'Voice.ai', + apiType: 'voiceai', + apiKey: '', + baseUrl: 'https://dev.voice.ai', + enable: false, + websites: { + official: 'https://voice.ai/', + apiKey: 'https://voice.ai/app/dashboard/developers', + docs: 'https://voice.ai/docs/introduction', + models: 'https://voice.ai/docs/api-reference/text-to-speech/list-voices', + defaultBaseUrl: 'https://dev.voice.ai' + } + }, { id: 'gemini', name: 'Gemini', diff --git a/src/main/presenter/llmProviderPresenter/managers/providerInstanceManager.ts b/src/main/presenter/llmProviderPresenter/managers/providerInstanceManager.ts index f55260155..0aed25228 100644 --- a/src/main/presenter/llmProviderPresenter/managers/providerInstanceManager.ts +++ b/src/main/presenter/llmProviderPresenter/managers/providerInstanceManager.ts @@ -35,6 +35,7 @@ import { PoeProvider } from '../providers/poeProvider' import { JiekouProvider } from '../providers/jiekouProvider' import { ZenmuxProvider } from '../providers/zenmuxProvider' import { O3fanProvider } from '../providers/o3fanProvider' +import { VoiceAIProvider } from '../providers/voiceAIProvider' import { RateLimitManager } from './rateLimitManager' import { StreamState } from '../types' import { AcpSessionPersistence } from '../../agentPresenter/acp' @@ -86,6 +87,7 @@ export class ProviderInstanceManager { ['anthropic', AnthropicProvider], ['doubao', DoubaoProvider], ['openai', OpenAIProvider], + ['voiceai', VoiceAIProvider], ['openai-responses', OpenAIResponsesProvider], ['cherryin', CherryInProvider], ['lmstudio', LMStudioProvider], @@ -118,6 +120,7 @@ export class ProviderInstanceManager { ['anthropic', AnthropicProvider], ['doubao', DoubaoProvider], ['openai', OpenAIProvider], + ['voiceai', VoiceAIProvider], ['openai-compatible', OpenAICompatibleProvider], ['openai-responses', OpenAIResponsesProvider], ['lmstudio', LMStudioProvider], diff --git a/src/main/presenter/llmProviderPresenter/providers/voiceAIProvider.ts b/src/main/presenter/llmProviderPresenter/providers/voiceAIProvider.ts new file mode 100644 index 000000000..24971ed13 --- /dev/null +++ b/src/main/presenter/llmProviderPresenter/providers/voiceAIProvider.ts @@ -0,0 +1,469 @@ +import { + ChatMessage, + IConfigPresenter, + LLM_PROVIDER, + LLMResponse, + MODEL_META, + LLMCoreStreamEvent, + MCPToolDefinition, + ModelConfig +} from '@shared/presenter' +import { createStreamEvent } from '@shared/types/core/llm-events' +import { BaseLLMProvider } from '../baseProvider' +import { proxyConfig } from '../../proxyConfig' +import { ProxyAgent } from 'undici' + +const DEFAULT_BASE_URL = 'https://dev.voice.ai' +const DEFAULT_AUDIO_FORMAT = 'mp3' +const DEFAULT_TTS_MODEL = 'voiceai-tts-v1-latest' +const DEFAULT_LANGUAGE = 'en' +const DEFAULT_TEMPERATURE = 1 +const DEFAULT_TOP_P = 0.8 +const SUPPORTED_LANGUAGES = new Set([ + 'en', + 'ca', + 'sv', + 'es', + 'fr', + 'de', + 'it', + 'pt', + 'pl', + 'ru', + 'nl' +]) + +const AUDIO_MIME_TYPE: Record = { + mp3: 'audio/mpeg', + wav: 'audio/wav', + pcm: 'audio/pcm' +} + +type VoiceStatusResponse = { + voice_id: string + name?: string | null + status?: string + voice_visibility?: string | null +} + +type VoiceAITtsConfig = { + audioFormat: string + model: string + language: string + temperature: number + topP: number +} + +export class VoiceAIProvider extends BaseLLMProvider { + private proxyAgent?: ProxyAgent + private proxyUrl?: string + + constructor(provider: LLM_PROVIDER, configPresenter: IConfigPresenter) { + super(provider, configPresenter) + this.init() + } + + public onProxyResolved(): void { + this.proxyAgent = undefined + this.proxyUrl = undefined + } + + public async check(): Promise<{ isOk: boolean; errorMsg: string | null }> { + if (!this.provider.apiKey) { + return { isOk: false, errorMsg: 'API key is required' } + } + + try { + await this.listVoices() + return { isOk: true, errorMsg: null } + } catch (error: unknown) { + const message = error instanceof Error ? error.message : String(error) + return { isOk: false, errorMsg: message } + } + } + + public async summaryTitles(messages: ChatMessage[], _modelId: string): Promise { + const text = this.extractLatestUserText(messages) + if (!text) return 'Voice AI' + return this.buildShortTitle(text) + } + + public async completions( + messages: ChatMessage[], + modelId: string, + temperature?: number, + _maxTokens?: number + ): Promise { + const text = this.extractLatestUserText(messages) + if (!text) { + throw new Error('No user text provided for Voice.ai TTS') + } + + await this.generateSpeech(text, modelId, temperature) + + return { + content: text + } + } + + public async summaries( + text: string, + modelId: string, + temperature?: number, + _maxTokens?: number + ): Promise { + if (!text) { + throw new Error('No text provided for Voice.ai TTS') + } + + await this.generateSpeech(text, modelId, temperature) + + return { + content: this.buildShortTitle(text) + } + } + + public async generateText( + prompt: string, + modelId: string, + temperature?: number, + _maxTokens?: number + ): Promise { + if (!prompt) { + throw new Error('No prompt provided for Voice.ai TTS') + } + + await this.generateSpeech(prompt, modelId, temperature) + + return { + content: prompt + } + } + + public async *coreStream( + messages: ChatMessage[], + modelId: string, + _modelConfig: ModelConfig, + temperature: number, + _maxTokens: number, + _mcpTools: MCPToolDefinition[] + ): AsyncGenerator { + const text = this.extractLatestUserText(messages) + if (!text) { + yield createStreamEvent.error('No user text provided for Voice.ai TTS') + yield createStreamEvent.stop('error') + return + } + + try { + const { audioBase64, mimeType } = await this.generateSpeech(text, modelId, temperature) + + yield createStreamEvent.imageData({ + data: audioBase64, + mimeType + }) + + yield createStreamEvent.stop('complete') + } catch (error: unknown) { + const message = error instanceof Error ? error.message : String(error) + yield createStreamEvent.error(message) + yield createStreamEvent.stop('error') + } + } + + protected async fetchProviderModels(): Promise { + if (!this.provider.apiKey) return [] + + try { + const voices = await this.listVoices() + const models: MODEL_META[] = voices.map((voice) => ({ + id: voice.voice_id, + name: voice.name && voice.name.trim().length > 0 ? voice.name : voice.voice_id, + group: 'default', + providerId: this.provider.id, + isCustom: false, + contextLength: 4096, + maxTokens: 2048 + })) + + const defaultVoice: MODEL_META = { + id: 'default', + name: 'Default Voice', + group: 'default', + providerId: this.provider.id, + isCustom: false, + contextLength: 4096, + maxTokens: 2048 + } + + return [defaultVoice, ...models] + } catch (error) { + console.error('[VoiceAI] Failed to fetch voices:', error) + return [] + } + } + + private getFetchOptions(): { dispatcher?: ProxyAgent } { + const proxyUrl = proxyConfig.getProxyUrl() + if (!proxyUrl) return {} + if (this.proxyUrl !== proxyUrl || !this.proxyAgent) { + this.proxyAgent = new ProxyAgent(proxyUrl) + this.proxyUrl = proxyUrl + } + return { dispatcher: this.proxyAgent } + } + + private getBaseUrl(): string { + const raw = this.provider.baseUrl?.trim() + if (raw && raw.length > 0) { + return raw.replace(/\/+$/, '') + } + return DEFAULT_BASE_URL + } + + private buildUrl(path: string): string { + const base = this.getBaseUrl() + const normalizedPath = path.startsWith('/') ? path : `/${path}` + return `${base}${normalizedPath}` + } + + private getAuthHeaders(): Record { + if (!this.provider.apiKey) { + throw new Error('API key is required') + } + + return { + Authorization: `Bearer ${this.provider.apiKey}`, + 'Content-Type': 'application/json', + ...this.defaultHeaders + } + } + + private getTtsConfig(): VoiceAITtsConfig { + const audioFormat = + this.configPresenter.getSetting('voiceAI_audioFormat') || DEFAULT_AUDIO_FORMAT + const model = this.configPresenter.getSetting('voiceAI_model') || DEFAULT_TTS_MODEL + const rawLanguage = this.configPresenter.getSetting('voiceAI_language') + const language = rawLanguage?.trim().toLowerCase() || DEFAULT_LANGUAGE + const temperatureSetting = this.configPresenter.getSetting('voiceAI_temperature') + const topPSetting = this.configPresenter.getSetting('voiceAI_topP') + + return { + audioFormat, + model, + language, + temperature: + typeof temperatureSetting === 'number' ? temperatureSetting : DEFAULT_TEMPERATURE, + topP: typeof topPSetting === 'number' ? topPSetting : DEFAULT_TOP_P + } + } + + private resolveVoiceId(modelId: string | undefined): string | null { + if (!modelId) return null + if (modelId === 'default') return null + return modelId + } + + private getAudioMimeType(format: string): string { + const key = format.toLowerCase() + return AUDIO_MIME_TYPE[key] || 'audio/mpeg' + } + + private parseDataUri(value: string): { mimeType: string; data: string } | null { + const match = value.match(/^data:([^;]+);base64,(.*)$/) + if (!match?.[1] || !match?.[2]) return null + return { mimeType: match[1], data: match[2] } + } + + private isHttpUrl(value: string): boolean { + return value.startsWith('http://') || value.startsWith('https://') + } + + private pickString(source: Record, keys: string[]): string | null { + for (const key of keys) { + const value = source[key] + if (typeof value === 'string' && value.trim().length > 0) { + return value + } + } + return null + } + + private async fetchAudioFromUrl( + url: string, + fallbackMimeType: string + ): Promise<{ audioBase64: string; mimeType: string }> { + const headers: Record = { ...this.defaultHeaders } + const baseUrl = this.getBaseUrl() + if (this.provider.apiKey && url.startsWith(baseUrl)) { + headers.Authorization = `Bearer ${this.provider.apiKey}` + } + + const response = await fetch(url, { + method: 'GET', + headers, + ...this.getFetchOptions() + }) + + if (!response.ok) { + const errorText = await response.text() + throw new Error(`Voice.ai audio fetch failed: ${response.status} ${errorText}`) + } + + const contentType = response.headers.get('content-type')?.split(';')[0]?.trim() + const mimeType = contentType && contentType.length > 0 ? contentType : fallbackMimeType + const buffer = Buffer.from(await response.arrayBuffer()) + return { audioBase64: buffer.toString('base64'), mimeType } + } + + private async resolveAudioValue( + value: string, + fallbackMimeType: string + ): Promise<{ audioBase64: string; mimeType: string } | null> { + const trimmed = value.trim() + if (!trimmed) return null + + const dataUri = this.parseDataUri(trimmed) + if (dataUri) { + return { audioBase64: dataUri.data, mimeType: dataUri.mimeType } + } + + if (this.isHttpUrl(trimmed)) { + return await this.fetchAudioFromUrl(trimmed, fallbackMimeType) + } + + return { audioBase64: trimmed, mimeType: fallbackMimeType } + } + + private async resolveAudioFromJson( + payload: unknown, + fallbackMimeType: string + ): Promise<{ audioBase64: string; mimeType: string } | null> { + if (!payload || typeof payload !== 'object') return null + + const data = payload as Record + const rootMimeType = + this.pickString(data, ['mime_type', 'content_type', 'contentType']) || fallbackMimeType + + const audioField = data.audio + if (audioField && typeof audioField === 'object') { + const audioData = audioField as Record + const audioMimeType = + this.pickString(audioData, ['mime_type', 'content_type', 'contentType']) || rootMimeType + const audioValue = + this.pickString(audioData, ['base64', 'data', 'audio_base64', 'audioBase64', 'audio']) || + this.pickString(audioData, ['url', 'audio_url', 'audioUrl']) + if (audioValue) { + return await this.resolveAudioValue(audioValue, audioMimeType) + } + } + + const directAudioValue = + this.pickString(data, ['audio_base64', 'audioBase64', 'audio', 'data']) || + this.pickString(data, ['audio_url', 'audioUrl', 'url']) + if (directAudioValue) { + return await this.resolveAudioValue(directAudioValue, rootMimeType) + } + + return null + } + + private async listVoices(): Promise { + const response = await fetch(this.buildUrl('/api/v1/tts/voices'), { + method: 'GET', + headers: this.getAuthHeaders(), + ...this.getFetchOptions() + }) + + if (!response.ok) { + const errorText = await response.text() + throw new Error(`Voice.ai list voices failed: ${response.status} ${errorText}`) + } + + const data = await response.json() + if (!Array.isArray(data)) return [] + return data as VoiceStatusResponse[] + } + + private async generateSpeech( + text: string, + modelId: string, + temperature?: number + ): Promise<{ audioBase64: string; mimeType: string }> { + const config = this.getTtsConfig() + if (!SUPPORTED_LANGUAGES.has(config.language)) { + throw new Error( + `Unsupported language code: ${config.language}. Supported languages: ${Array.from( + SUPPORTED_LANGUAGES + ).join(', ')}` + ) + } + const voiceId = this.resolveVoiceId(modelId) + const requestBody: Record = { + text, + audio_format: config.audioFormat, + model: config.model, + language: config.language, + temperature: typeof temperature === 'number' ? temperature : config.temperature, + top_p: config.topP + } + + if (voiceId) { + requestBody['voice_id'] = voiceId + } + + const response = await fetch(this.buildUrl('/api/v1/tts/speech'), { + method: 'POST', + headers: this.getAuthHeaders(), + body: JSON.stringify(requestBody), + ...this.getFetchOptions() + }) + + if (!response.ok) { + const errorText = await response.text() + throw new Error(`Voice.ai generate speech failed: ${response.status} ${errorText}`) + } + + const contentType = response.headers.get('content-type')?.split(';')[0]?.trim() + const fallbackMimeType = this.getAudioMimeType(config.audioFormat) + + if (contentType?.includes('application/json')) { + const json = await response.json() + const resolved = await this.resolveAudioFromJson(json, fallbackMimeType) + if (!resolved) { + throw new Error('Voice.ai generate speech returned JSON without audio data') + } + return resolved + } + + const mimeType = contentType && contentType.length > 0 ? contentType : fallbackMimeType + const buffer = Buffer.from(await response.arrayBuffer()) + return { audioBase64: buffer.toString('base64'), mimeType } + } + + private extractLatestUserText(messages: ChatMessage[]): string | null { + const lastUser = [...messages].reverse().find((message) => message.role === 'user') + if (!lastUser?.content) return null + + if (typeof lastUser.content === 'string') { + return lastUser.content + } + + if (Array.isArray(lastUser.content)) { + const textParts = lastUser.content + .filter((part) => part.type === 'text') + .map((part) => part.text) + .filter(Boolean) + + return textParts.length > 0 ? textParts.join('\n') : null + } + + return null + } + + private buildShortTitle(text: string): string { + const normalized = text.replace(/\s+/g, ' ').trim() + if (!normalized) return 'Voice AI' + return normalized.length > 32 ? `${normalized.slice(0, 32)}…` : normalized + } +} diff --git a/src/renderer/settings/components/ModelProviderSettingsDetail.vue b/src/renderer/settings/components/ModelProviderSettingsDetail.vue index 5273daa09..30615df24 100644 --- a/src/renderer/settings/components/ModelProviderSettingsDetail.vue +++ b/src/renderer/settings/components/ModelProviderSettingsDetail.vue @@ -45,6 +45,10 @@ + + + + @@ -104,6 +108,7 @@ import { levelToValueMap, safetyCategories } from '@/lib/gemini' import { Separator } from '@shadcn/components/ui/separator' import type { SafetyCategoryKey, SafetySettingValue } from '@/lib/gemini' import { useThrottleFn } from '@vueuse/core' +import VoiceAIProviderConfig from './VoiceAIProviderConfig.vue' interface ProviderWebsites { official: string diff --git a/src/renderer/settings/components/VoiceAIProviderConfig.vue b/src/renderer/settings/components/VoiceAIProviderConfig.vue new file mode 100644 index 000000000..083a3c1be --- /dev/null +++ b/src/renderer/settings/components/VoiceAIProviderConfig.vue @@ -0,0 +1,204 @@ + + + diff --git a/src/renderer/src/assets/llm-icons/voiceai.svg b/src/renderer/src/assets/llm-icons/voiceai.svg new file mode 100644 index 000000000..d68a970f9 --- /dev/null +++ b/src/renderer/src/assets/llm-icons/voiceai.svg @@ -0,0 +1,11 @@ + + + + + + + + + + + diff --git a/src/renderer/src/components/icons/ModelIcon.vue b/src/renderer/src/components/icons/ModelIcon.vue index 46748a54b..1116901d7 100644 --- a/src/renderer/src/components/icons/ModelIcon.vue +++ b/src/renderer/src/components/icons/ModelIcon.vue @@ -68,6 +68,7 @@ import zenmuxColorIcon from '@/assets/llm-icons/zenmux-color.svg?url' import burncloudColorIcon from '@/assets/llm-icons/burncloud-color.svg?url' import xiaomiColorIcon from '@/assets/llm-icons/xiaomi.png?url' import o3fanColorIcon from '@/assets/llm-icons/o3-fan.png?url' +import voiceAiColorIcon from '@/assets/llm-icons/voiceai.svg?url' // 导入所有图标 const icons = { @@ -148,6 +149,7 @@ const icons = { zenmux: zenmuxColorIcon, burncloud: burncloudColorIcon, xiaomi: xiaomiColorIcon, + voiceai: voiceAiColorIcon, default: defaultIcon } diff --git a/src/renderer/src/components/message/MessageBlockAudio.vue b/src/renderer/src/components/message/MessageBlockAudio.vue new file mode 100644 index 000000000..bbcda61aa --- /dev/null +++ b/src/renderer/src/components/message/MessageBlockAudio.vue @@ -0,0 +1,112 @@ + + + diff --git a/src/renderer/src/components/message/MessageItemAssistant.vue b/src/renderer/src/components/message/MessageItemAssistant.vue index 04310749b..9dae745ce 100644 --- a/src/renderer/src/components/message/MessageItemAssistant.vue +++ b/src/renderer/src/components/message/MessageItemAssistant.vue @@ -71,6 +71,12 @@ :message-id="currentMessage.id" :thread-id="currentThreadId" /> + { + if (block.type === 'audio') return true + if (block.type !== 'image') return false + const mimeType = block.image_data?.mimeType?.toLowerCase() || '' + if (mimeType.startsWith('audio/')) return true + const data = block.image_data?.data || '' + if (data.startsWith('data:audio/')) return true + if (data.startsWith('imgcache://') || data.startsWith('http://') || data.startsWith('https://')) { + const lower = data.toLowerCase() + return AUDIO_EXTENSIONS.some((ext) => lower.includes(ext)) + } + return false +} + // 定义事件 const emit = defineEmits<{ copyImage: [ diff --git a/src/renderer/src/components/settings/ModelConfigDialog.vue b/src/renderer/src/components/settings/ModelConfigDialog.vue index ed9582da9..54e551834 100644 --- a/src/renderer/src/components/settings/ModelConfigDialog.vue +++ b/src/renderer/src/components/settings/ModelConfigDialog.vue @@ -507,7 +507,8 @@ const isOpenAICompatibleProvider = computed(() => { 'aws-bedrock', 'github-copilot', 'ollama', - 'acp' + 'acp', + 'voiceai' ] const providerId = props.providerId?.toLowerCase() || '' return !EXCLUDED_PROVIDERS.some((excluded) => providerId.includes(excluded)) diff --git a/src/renderer/src/i18n/en-US/settings.json b/src/renderer/src/i18n/en-US/settings.json index 62c3cd32e..09104c498 100644 --- a/src/renderer/src/i18n/en-US/settings.json +++ b/src/renderer/src/i18n/en-US/settings.json @@ -493,6 +493,33 @@ "verifyFailed": "Verification failed", "verifySuccess": "Verification is successful" }, + "voiceai": { + "title": "Voice.ai Text-to-Speech", + "description": "Generate speech from text. Voices appear in the model list below.", + "audioFormat": { + "label": "Audio Format", + "placeholder": "Select format", + "helper": "MP3 is recommended for most cases." + }, + "language": { + "label": "Language", + "placeholder": "e.g. en", + "helper": "Supported: en, ca, sv, es, fr, de, it, pt, pl, ru, nl." + }, + "model": { + "label": "TTS Model", + "placeholder": "voiceai-tts-v1-latest", + "helper": "See the Voice.ai docs for supported models." + }, + "temperature": { + "label": "Temperature", + "helper": "Controls randomness (0-2)." + }, + "topP": { + "label": "Top P", + "helper": "Nucleus sampling (0-1)." + } + }, "anthropicApiKeyTip": "Please go to Anthropic Console to get your API Key", "anthropicConnected": "Anthropic connected", "anthropicNotConnected": "Anthropic not connected", diff --git a/src/renderer/src/i18n/zh-CN/settings.json b/src/renderer/src/i18n/zh-CN/settings.json index e93f99dde..4e3ae8873 100644 --- a/src/renderer/src/i18n/zh-CN/settings.json +++ b/src/renderer/src/i18n/zh-CN/settings.json @@ -394,6 +394,33 @@ "operationalDescription": "同步 ModelScope 平台上可直接使用的 MCP 服务器" } }, + "voiceai": { + "title": "Voice.ai 文字转语音", + "description": "将文本生成语音,voice 会在下方模型列表中展示。", + "audioFormat": { + "label": "音频格式", + "placeholder": "选择格式", + "helper": "多数场景推荐 MP3。" + }, + "language": { + "label": "语言", + "placeholder": "例如 en", + "helper": "支持语言:en, ca, sv, es, fr, de, it, pt, pl, ru, nl。" + }, + "model": { + "label": "TTS 模型", + "placeholder": "voiceai-tts-v1-latest", + "helper": "可选模型请查看 Voice.ai 文档。" + }, + "temperature": { + "label": "温度", + "helper": "随机性参数(0-2)。" + }, + "topP": { + "label": "Top P", + "helper": "Nucleus 采样(0-1)。" + } + }, "dialog": { "disableModel": { "title": "确认禁用模型", diff --git a/src/renderer/src/stores/chat.ts b/src/renderer/src/stores/chat.ts index abc4f30a1..5774875e8 100644 --- a/src/renderer/src/stores/chat.ts +++ b/src/renderer/src/stores/chat.ts @@ -1225,9 +1225,12 @@ export const useChatStore = defineStore('chat', () => { } } else if (msg.image_data) { finalizeAssistantMessageBlocks(assistantMsg.content) + const mimeType = msg.image_data.mimeType || '' + const isAudio = + mimeType.startsWith('audio/') || msg.image_data.data?.startsWith('data:audio/') assistantMsg.content.push({ - type: 'image', - content: 'image', + type: isAudio ? 'audio' : 'image', + content: isAudio ? 'audio' : 'image', status: 'success', timestamp: Date.now(), image_data: { diff --git a/src/renderer/src/stores/providerStore.ts b/src/renderer/src/stores/providerStore.ts index 5c1d21fbd..aff55d07c 100644 --- a/src/renderer/src/stores/providerStore.ts +++ b/src/renderer/src/stores/providerStore.ts @@ -5,6 +5,14 @@ import { useIpcQuery } from '@/composables/useIpcQuery' import { CONFIG_EVENTS, PROVIDER_DB_EVENTS } from '@/events' import type { AWS_BEDROCK_PROVIDER, LLM_PROVIDER, VERTEX_PROVIDER } from '@shared/presenter' +type VoiceAIConfig = { + audioFormat: string + model: string + language: string + temperature: number + topP: number +} + const PROVIDER_ORDER_KEY = 'providerOrder' const PROVIDER_TIMESTAMP_KEY = 'providerTimestamps' @@ -340,6 +348,34 @@ export const useProviderStore = defineStore('provider', () => { return await configP.getSetting('awsBedrockCredential') } + const getVoiceAIConfig = async (): Promise => { + return { + audioFormat: (await configP.getSetting('voiceAI_audioFormat')) || 'mp3', + model: (await configP.getSetting('voiceAI_model')) || 'voiceai-tts-v1-latest', + language: (await configP.getSetting('voiceAI_language')) || 'en', + temperature: (await configP.getSetting('voiceAI_temperature')) ?? 1, + topP: (await configP.getSetting('voiceAI_topP')) ?? 0.8 + } + } + + const updateVoiceAIConfig = async (updates: Partial) => { + if (updates.audioFormat !== undefined) { + await configP.setSetting('voiceAI_audioFormat', updates.audioFormat) + } + if (updates.model !== undefined) { + await configP.setSetting('voiceAI_model', updates.model) + } + if (updates.language !== undefined) { + await configP.setSetting('voiceAI_language', updates.language) + } + if (updates.temperature !== undefined) { + await configP.setSetting('voiceAI_temperature', updates.temperature) + } + if (updates.topP !== undefined) { + await configP.setSetting('voiceAI_topP', updates.topP) + } + } + const updateProviderTimestamp = async (providerId: string) => { providerTimestamps.value[providerId] = Date.now() await saveProviderTimestamps() @@ -416,6 +452,8 @@ export const useProviderStore = defineStore('provider', () => { setGeminiSafety, getGeminiSafety, setAwsBedrockCredential, - getAwsBedrockCredential + getAwsBedrockCredential, + getVoiceAIConfig, + updateVoiceAIConfig } }) diff --git a/src/shared/chat.d.ts b/src/shared/chat.d.ts index 26d514b99..8ba383299 100644 --- a/src/shared/chat.d.ts +++ b/src/shared/chat.d.ts @@ -91,6 +91,7 @@ export type AssistantMessageBlock = { | 'tool_call' | 'action' | 'image' + | 'audio' | 'artifact-thinking' | 'mcp_ui_resource' id?: string diff --git a/src/shared/types/core/chat.ts b/src/shared/types/core/chat.ts index 198ba868a..7066100d0 100644 --- a/src/shared/types/core/chat.ts +++ b/src/shared/types/core/chat.ts @@ -55,6 +55,7 @@ export type AssistantMessageBlock = { | 'tool_call' | 'action' | 'image' + | 'audio' | 'artifact-thinking' | 'mcp_ui_resource' content?: string From cf5af65e3151a7de0a4d6aea625b90110a6f4e89 Mon Sep 17 00:00:00 2001 From: zhangmo8 Date: Tue, 20 Jan 2026 18:30:28 +0800 Subject: [PATCH 2/4] feat: voice ai call phone --- electron.vite.config.ts | 5 +- .../components/VoiceAIProviderConfig.vue | 57 ++++- .../src/components/chat-input/ChatInput.vue | 218 +++++++++++++++++- .../components/message/MessageBlockAudio.vue | 1 - src/renderer/src/i18n/en-US/chat.json | 11 + src/renderer/src/i18n/en-US/settings.json | 5 + src/renderer/src/i18n/zh-CN/chat.json | 11 + src/renderer/src/i18n/zh-CN/settings.json | 5 + src/renderer/src/stores/providerStore.ts | 16 +- vitest.config.renderer.ts | 13 +- vitest.config.ts | 16 +- 11 files changed, 341 insertions(+), 17 deletions(-) diff --git a/electron.vite.config.ts b/electron.vite.config.ts index b32eae5f7..effef1451 100644 --- a/electron.vite.config.ts +++ b/electron.vite.config.ts @@ -7,6 +7,8 @@ import monacoEditorPlugin from 'vite-plugin-monaco-editor-esm' import path from 'node:path' import tailwindcss from '@tailwindcss/vite' +const isCustomElement = (tag: string) => + tag === 'voice-agent-widget' || tag.startsWith('ui-resource-renderer') export default defineConfig({ main: { @@ -82,8 +84,7 @@ export default defineConfig({ vue({ template: { compilerOptions: { - // 将所有带短横线的标签名都视为自定义元素 - isCustomElement: (tag) => tag.startsWith('ui-resource-renderer') + isCustomElement } } }), diff --git a/src/renderer/settings/components/VoiceAIProviderConfig.vue b/src/renderer/settings/components/VoiceAIProviderConfig.vue index 083a3c1be..8cdb2a3a8 100644 --- a/src/renderer/settings/components/VoiceAIProviderConfig.vue +++ b/src/renderer/settings/components/VoiceAIProviderConfig.vue @@ -39,12 +39,20 @@ - +

{{ t('settings.provider.voiceai.language.helper') }}

@@ -64,6 +72,21 @@ {{ t('settings.provider.voiceai.model.helper') }}

+ +
+ + +

+ {{ t('settings.provider.voiceai.agentId.helper') }} +

+
@@ -144,14 +167,30 @@ const ttsModel = ref('voiceai-tts-v1-latest') const language = ref('en') const temperature = ref(1) const topP = ref(0.8) +const agentId = ref('') const isHydrating = ref(true) +const languageOptions = [ + { value: 'en', label: 'English (en)' }, + { value: 'ca', label: 'Catalan (ca)' }, + { value: 'sv', label: 'Swedish (sv)' }, + { value: 'es', label: 'Spanish (es)' }, + { value: 'fr', label: 'French (fr)' }, + { value: 'de', label: 'German (de)' }, + { value: 'it', label: 'Italian (it)' }, + { value: 'pt', label: 'Portuguese (pt)' }, + { value: 'pl', label: 'Polish (pl)' }, + { value: 'ru', label: 'Russian (ru)' }, + { value: 'nl', label: 'Dutch (nl)' } +] + type VoiceAIConfigUpdates = { audioFormat?: string model?: string language?: string temperature?: number topP?: number + agentId?: string } const persistUpdates = useDebounceFn(async (updates: VoiceAIConfigUpdates) => { @@ -166,6 +205,7 @@ const loadConfig = async () => { language.value = config.language temperature.value = config.temperature topP.value = config.topP + agentId.value = config.agentId isHydrating.value = false } @@ -188,6 +228,11 @@ watch(language, (value) => { void persistUpdates({ language: value }) }) +watch(agentId, (value) => { + if (isHydrating.value) return + void persistUpdates({ agentId: value }) +}) + const onTemperatureChange = (value: number[] | undefined) => { if (!value || value[0] === undefined) return temperature.value = value[0] diff --git a/src/renderer/src/components/chat-input/ChatInput.vue b/src/renderer/src/components/chat-input/ChatInput.vue index 36847c8f5..d68d89c25 100644 --- a/src/renderer/src/components/chat-input/ChatInput.vue +++ b/src/renderer/src/components/chat-input/ChatInput.vue @@ -20,6 +20,7 @@ " :class="[ 'flex flex-col gap-2 relative', + isCallActive ? 'pointer-events-none opacity-60' : '', variant === 'newThread' ? 'bg-card rounded-lg border p-2 shadow-sm' : 'border-t px-4 py-3 gap-3' @@ -371,13 +372,29 @@ /> + + + + + + {{ t('chat.call.start') }} + + - - {{ t('chat.call.start') }} - +