From 9f6e1a925826976ada50ccb2751eb7fa959facd1 Mon Sep 17 00:00:00 2001
From: zhangmo8
Date: Tue, 20 Jan 2026 12:31:08 +0800
Subject: [PATCH 1/4] feat: voice ai text to speech
---
.../presenter/configPresenter/providers.ts | 15 +
.../managers/providerInstanceManager.ts | 3 +
.../providers/voiceAIProvider.ts | 469 ++++++++++++++++++
.../ModelProviderSettingsDetail.vue | 5 +
.../components/VoiceAIProviderConfig.vue | 204 ++++++++
src/renderer/src/assets/llm-icons/voiceai.svg | 11 +
.../src/components/icons/ModelIcon.vue | 2 +
.../components/message/MessageBlockAudio.vue | 112 +++++
.../message/MessageItemAssistant.vue | 23 +
.../components/settings/ModelConfigDialog.vue | 3 +-
src/renderer/src/i18n/en-US/settings.json | 27 +
src/renderer/src/i18n/zh-CN/settings.json | 27 +
src/renderer/src/stores/chat.ts | 7 +-
src/renderer/src/stores/providerStore.ts | 40 +-
src/shared/chat.d.ts | 1 +
src/shared/types/core/chat.ts | 1 +
16 files changed, 946 insertions(+), 4 deletions(-)
create mode 100644 src/main/presenter/llmProviderPresenter/providers/voiceAIProvider.ts
create mode 100644 src/renderer/settings/components/VoiceAIProviderConfig.vue
create mode 100644 src/renderer/src/assets/llm-icons/voiceai.svg
create mode 100644 src/renderer/src/components/message/MessageBlockAudio.vue
diff --git a/src/main/presenter/configPresenter/providers.ts b/src/main/presenter/configPresenter/providers.ts
index a6cf6ec77..db04a89d4 100644
--- a/src/main/presenter/configPresenter/providers.ts
+++ b/src/main/presenter/configPresenter/providers.ts
@@ -217,6 +217,21 @@ export const DEFAULT_PROVIDERS: LLM_PROVIDER_BASE[] = [
defaultBaseUrl: 'https://api.openai.com/v1'
}
},
+ {
+ id: 'voiceai',
+ name: 'Voice.ai',
+ apiType: 'voiceai',
+ apiKey: '',
+ baseUrl: 'https://dev.voice.ai',
+ enable: false,
+ websites: {
+ official: 'https://voice.ai/',
+ apiKey: 'https://voice.ai/app/dashboard/developers',
+ docs: 'https://voice.ai/docs/introduction',
+ models: 'https://voice.ai/docs/api-reference/text-to-speech/list-voices',
+ defaultBaseUrl: 'https://dev.voice.ai'
+ }
+ },
{
id: 'gemini',
name: 'Gemini',
diff --git a/src/main/presenter/llmProviderPresenter/managers/providerInstanceManager.ts b/src/main/presenter/llmProviderPresenter/managers/providerInstanceManager.ts
index f55260155..0aed25228 100644
--- a/src/main/presenter/llmProviderPresenter/managers/providerInstanceManager.ts
+++ b/src/main/presenter/llmProviderPresenter/managers/providerInstanceManager.ts
@@ -35,6 +35,7 @@ import { PoeProvider } from '../providers/poeProvider'
import { JiekouProvider } from '../providers/jiekouProvider'
import { ZenmuxProvider } from '../providers/zenmuxProvider'
import { O3fanProvider } from '../providers/o3fanProvider'
+import { VoiceAIProvider } from '../providers/voiceAIProvider'
import { RateLimitManager } from './rateLimitManager'
import { StreamState } from '../types'
import { AcpSessionPersistence } from '../../agentPresenter/acp'
@@ -86,6 +87,7 @@ export class ProviderInstanceManager {
['anthropic', AnthropicProvider],
['doubao', DoubaoProvider],
['openai', OpenAIProvider],
+ ['voiceai', VoiceAIProvider],
['openai-responses', OpenAIResponsesProvider],
['cherryin', CherryInProvider],
['lmstudio', LMStudioProvider],
@@ -118,6 +120,7 @@ export class ProviderInstanceManager {
['anthropic', AnthropicProvider],
['doubao', DoubaoProvider],
['openai', OpenAIProvider],
+ ['voiceai', VoiceAIProvider],
['openai-compatible', OpenAICompatibleProvider],
['openai-responses', OpenAIResponsesProvider],
['lmstudio', LMStudioProvider],
diff --git a/src/main/presenter/llmProviderPresenter/providers/voiceAIProvider.ts b/src/main/presenter/llmProviderPresenter/providers/voiceAIProvider.ts
new file mode 100644
index 000000000..24971ed13
--- /dev/null
+++ b/src/main/presenter/llmProviderPresenter/providers/voiceAIProvider.ts
@@ -0,0 +1,469 @@
+import {
+ ChatMessage,
+ IConfigPresenter,
+ LLM_PROVIDER,
+ LLMResponse,
+ MODEL_META,
+ LLMCoreStreamEvent,
+ MCPToolDefinition,
+ ModelConfig
+} from '@shared/presenter'
+import { createStreamEvent } from '@shared/types/core/llm-events'
+import { BaseLLMProvider } from '../baseProvider'
+import { proxyConfig } from '../../proxyConfig'
+import { ProxyAgent } from 'undici'
+
+const DEFAULT_BASE_URL = 'https://dev.voice.ai'
+const DEFAULT_AUDIO_FORMAT = 'mp3'
+const DEFAULT_TTS_MODEL = 'voiceai-tts-v1-latest'
+const DEFAULT_LANGUAGE = 'en'
+const DEFAULT_TEMPERATURE = 1
+const DEFAULT_TOP_P = 0.8
+const SUPPORTED_LANGUAGES = new Set([
+ 'en',
+ 'ca',
+ 'sv',
+ 'es',
+ 'fr',
+ 'de',
+ 'it',
+ 'pt',
+ 'pl',
+ 'ru',
+ 'nl'
+])
+
+const AUDIO_MIME_TYPE: Record<string, string> = {
+ mp3: 'audio/mpeg',
+ wav: 'audio/wav',
+ pcm: 'audio/pcm'
+}
+
+type VoiceStatusResponse = {
+ voice_id: string
+ name?: string | null
+ status?: string
+ voice_visibility?: string | null
+}
+
+type VoiceAITtsConfig = {
+ audioFormat: string
+ model: string
+ language: string
+ temperature: number
+ topP: number
+}
+
+export class VoiceAIProvider extends BaseLLMProvider {
+ private proxyAgent?: ProxyAgent
+ private proxyUrl?: string
+
+ constructor(provider: LLM_PROVIDER, configPresenter: IConfigPresenter) {
+ super(provider, configPresenter)
+ this.init()
+ }
+
+ public onProxyResolved(): void {
+ this.proxyAgent = undefined
+ this.proxyUrl = undefined
+ }
+
+ public async check(): Promise<{ isOk: boolean; errorMsg: string | null }> {
+ if (!this.provider.apiKey) {
+ return { isOk: false, errorMsg: 'API key is required' }
+ }
+
+ try {
+ await this.listVoices()
+ return { isOk: true, errorMsg: null }
+ } catch (error: unknown) {
+ const message = error instanceof Error ? error.message : String(error)
+ return { isOk: false, errorMsg: message }
+ }
+ }
+
+ public async summaryTitles(messages: ChatMessage[], _modelId: string): Promise<string> {
+ const text = this.extractLatestUserText(messages)
+ if (!text) return 'Voice AI'
+ return this.buildShortTitle(text)
+ }
+
+ public async completions(
+ messages: ChatMessage[],
+ modelId: string,
+ temperature?: number,
+ _maxTokens?: number
+ ): Promise<LLMResponse> {
+ const text = this.extractLatestUserText(messages)
+ if (!text) {
+ throw new Error('No user text provided for Voice.ai TTS')
+ }
+
+ await this.generateSpeech(text, modelId, temperature)
+
+ return {
+ content: text
+ }
+ }
+
+ public async summaries(
+ text: string,
+ modelId: string,
+ temperature?: number,
+ _maxTokens?: number
+ ): Promise<LLMResponse> {
+ if (!text) {
+ throw new Error('No text provided for Voice.ai TTS')
+ }
+
+ await this.generateSpeech(text, modelId, temperature)
+
+ return {
+ content: this.buildShortTitle(text)
+ }
+ }
+
+ public async generateText(
+ prompt: string,
+ modelId: string,
+ temperature?: number,
+ _maxTokens?: number
+ ): Promise<LLMResponse> {
+ if (!prompt) {
+ throw new Error('No prompt provided for Voice.ai TTS')
+ }
+
+ await this.generateSpeech(prompt, modelId, temperature)
+
+ return {
+ content: prompt
+ }
+ }
+
+ public async *coreStream(
+ messages: ChatMessage[],
+ modelId: string,
+ _modelConfig: ModelConfig,
+ temperature: number,
+ _maxTokens: number,
+ _mcpTools: MCPToolDefinition[]
+ ): AsyncGenerator<LLMCoreStreamEvent> {
+ const text = this.extractLatestUserText(messages)
+ if (!text) {
+ yield createStreamEvent.error('No user text provided for Voice.ai TTS')
+ yield createStreamEvent.stop('error')
+ return
+ }
+
+ try {
+ const { audioBase64, mimeType } = await this.generateSpeech(text, modelId, temperature)
+
+ yield createStreamEvent.imageData({
+ data: audioBase64,
+ mimeType
+ })
+
+ yield createStreamEvent.stop('complete')
+ } catch (error: unknown) {
+ const message = error instanceof Error ? error.message : String(error)
+ yield createStreamEvent.error(message)
+ yield createStreamEvent.stop('error')
+ }
+ }
+
+ protected async fetchProviderModels(): Promise<MODEL_META[]> {
+ if (!this.provider.apiKey) return []
+
+ try {
+ const voices = await this.listVoices()
+ const models: MODEL_META[] = voices.map((voice) => ({
+ id: voice.voice_id,
+ name: voice.name && voice.name.trim().length > 0 ? voice.name : voice.voice_id,
+ group: 'default',
+ providerId: this.provider.id,
+ isCustom: false,
+ contextLength: 4096,
+ maxTokens: 2048
+ }))
+
+ const defaultVoice: MODEL_META = {
+ id: 'default',
+ name: 'Default Voice',
+ group: 'default',
+ providerId: this.provider.id,
+ isCustom: false,
+ contextLength: 4096,
+ maxTokens: 2048
+ }
+
+ return [defaultVoice, ...models]
+ } catch (error) {
+ console.error('[VoiceAI] Failed to fetch voices:', error)
+ return []
+ }
+ }
+
+ private getFetchOptions(): { dispatcher?: ProxyAgent } {
+ const proxyUrl = proxyConfig.getProxyUrl()
+ if (!proxyUrl) return {}
+ if (this.proxyUrl !== proxyUrl || !this.proxyAgent) {
+ this.proxyAgent = new ProxyAgent(proxyUrl)
+ this.proxyUrl = proxyUrl
+ }
+ return { dispatcher: this.proxyAgent }
+ }
+
+ private getBaseUrl(): string {
+ const raw = this.provider.baseUrl?.trim()
+ if (raw && raw.length > 0) {
+ return raw.replace(/\/+$/, '')
+ }
+ return DEFAULT_BASE_URL
+ }
+
+ private buildUrl(path: string): string {
+ const base = this.getBaseUrl()
+ const normalizedPath = path.startsWith('/') ? path : `/${path}`
+ return `${base}${normalizedPath}`
+ }
+
+ private getAuthHeaders(): Record<string, string> {
+ if (!this.provider.apiKey) {
+ throw new Error('API key is required')
+ }
+
+ return {
+ Authorization: `Bearer ${this.provider.apiKey}`,
+ 'Content-Type': 'application/json',
+ ...this.defaultHeaders
+ }
+ }
+
+ private getTtsConfig(): VoiceAITtsConfig {
+ const audioFormat =
+ this.configPresenter.getSetting('voiceAI_audioFormat') || DEFAULT_AUDIO_FORMAT
+ const model = this.configPresenter.getSetting('voiceAI_model') || DEFAULT_TTS_MODEL
+ const rawLanguage = this.configPresenter.getSetting('voiceAI_language')
+ const language = rawLanguage?.trim().toLowerCase() || DEFAULT_LANGUAGE
+ const temperatureSetting = this.configPresenter.getSetting('voiceAI_temperature')
+ const topPSetting = this.configPresenter.getSetting('voiceAI_topP')
+
+ return {
+ audioFormat,
+ model,
+ language,
+ temperature:
+ typeof temperatureSetting === 'number' ? temperatureSetting : DEFAULT_TEMPERATURE,
+ topP: typeof topPSetting === 'number' ? topPSetting : DEFAULT_TOP_P
+ }
+ }
+
+ private resolveVoiceId(modelId: string | undefined): string | null {
+ if (!modelId) return null
+ if (modelId === 'default') return null
+ return modelId
+ }
+
+ private getAudioMimeType(format: string): string {
+ const key = format.toLowerCase()
+ return AUDIO_MIME_TYPE[key] || 'audio/mpeg'
+ }
+
+ private parseDataUri(value: string): { mimeType: string; data: string } | null {
+ const match = value.match(/^data:([^;]+);base64,(.*)$/)
+ if (!match?.[1] || !match?.[2]) return null
+ return { mimeType: match[1], data: match[2] }
+ }
+
+ private isHttpUrl(value: string): boolean {
+ return value.startsWith('http://') || value.startsWith('https://')
+ }
+
+ private pickString(source: Record<string, unknown>, keys: string[]): string | null {
+ for (const key of keys) {
+ const value = source[key]
+ if (typeof value === 'string' && value.trim().length > 0) {
+ return value
+ }
+ }
+ return null
+ }
+
+ private async fetchAudioFromUrl(
+ url: string,
+ fallbackMimeType: string
+ ): Promise<{ audioBase64: string; mimeType: string }> {
+ const headers: Record<string, string> = { ...this.defaultHeaders }
+ const baseUrl = this.getBaseUrl()
+ if (this.provider.apiKey && url.startsWith(baseUrl)) {
+ headers.Authorization = `Bearer ${this.provider.apiKey}`
+ }
+
+ const response = await fetch(url, {
+ method: 'GET',
+ headers,
+ ...this.getFetchOptions()
+ })
+
+ if (!response.ok) {
+ const errorText = await response.text()
+ throw new Error(`Voice.ai audio fetch failed: ${response.status} ${errorText}`)
+ }
+
+ const contentType = response.headers.get('content-type')?.split(';')[0]?.trim()
+ const mimeType = contentType && contentType.length > 0 ? contentType : fallbackMimeType
+ const buffer = Buffer.from(await response.arrayBuffer())
+ return { audioBase64: buffer.toString('base64'), mimeType }
+ }
+
+ private async resolveAudioValue(
+ value: string,
+ fallbackMimeType: string
+ ): Promise<{ audioBase64: string; mimeType: string } | null> {
+ const trimmed = value.trim()
+ if (!trimmed) return null
+
+ const dataUri = this.parseDataUri(trimmed)
+ if (dataUri) {
+ return { audioBase64: dataUri.data, mimeType: dataUri.mimeType }
+ }
+
+ if (this.isHttpUrl(trimmed)) {
+ return await this.fetchAudioFromUrl(trimmed, fallbackMimeType)
+ }
+
+ return { audioBase64: trimmed, mimeType: fallbackMimeType }
+ }
+
+ private async resolveAudioFromJson(
+ payload: unknown,
+ fallbackMimeType: string
+ ): Promise<{ audioBase64: string; mimeType: string } | null> {
+ if (!payload || typeof payload !== 'object') return null
+
+ const data = payload as Record<string, unknown>
+ const rootMimeType =
+ this.pickString(data, ['mime_type', 'content_type', 'contentType']) || fallbackMimeType
+
+ const audioField = data.audio
+ if (audioField && typeof audioField === 'object') {
+ const audioData = audioField as Record<string, unknown>
+ const audioMimeType =
+ this.pickString(audioData, ['mime_type', 'content_type', 'contentType']) || rootMimeType
+ const audioValue =
+ this.pickString(audioData, ['base64', 'data', 'audio_base64', 'audioBase64', 'audio']) ||
+ this.pickString(audioData, ['url', 'audio_url', 'audioUrl'])
+ if (audioValue) {
+ return await this.resolveAudioValue(audioValue, audioMimeType)
+ }
+ }
+
+ const directAudioValue =
+ this.pickString(data, ['audio_base64', 'audioBase64', 'audio', 'data']) ||
+ this.pickString(data, ['audio_url', 'audioUrl', 'url'])
+ if (directAudioValue) {
+ return await this.resolveAudioValue(directAudioValue, rootMimeType)
+ }
+
+ return null
+ }
+
+ private async listVoices(): Promise<VoiceStatusResponse[]> {
+ const response = await fetch(this.buildUrl('/api/v1/tts/voices'), {
+ method: 'GET',
+ headers: this.getAuthHeaders(),
+ ...this.getFetchOptions()
+ })
+
+ if (!response.ok) {
+ const errorText = await response.text()
+ throw new Error(`Voice.ai list voices failed: ${response.status} ${errorText}`)
+ }
+
+ const data = await response.json()
+ if (!Array.isArray(data)) return []
+ return data as VoiceStatusResponse[]
+ }
+
+ private async generateSpeech(
+ text: string,
+ modelId: string,
+ temperature?: number
+ ): Promise<{ audioBase64: string; mimeType: string }> {
+ const config = this.getTtsConfig()
+ if (!SUPPORTED_LANGUAGES.has(config.language)) {
+ throw new Error(
+ `Unsupported language code: ${config.language}. Supported languages: ${Array.from(
+ SUPPORTED_LANGUAGES
+ ).join(', ')}`
+ )
+ }
+ const voiceId = this.resolveVoiceId(modelId)
+ const requestBody: Record<string, unknown> = {
+ text,
+ audio_format: config.audioFormat,
+ model: config.model,
+ language: config.language,
+ temperature: typeof temperature === 'number' ? temperature : config.temperature,
+ top_p: config.topP
+ }
+
+ if (voiceId) {
+ requestBody['voice_id'] = voiceId
+ }
+
+ const response = await fetch(this.buildUrl('/api/v1/tts/speech'), {
+ method: 'POST',
+ headers: this.getAuthHeaders(),
+ body: JSON.stringify(requestBody),
+ ...this.getFetchOptions()
+ })
+
+ if (!response.ok) {
+ const errorText = await response.text()
+ throw new Error(`Voice.ai generate speech failed: ${response.status} ${errorText}`)
+ }
+
+ const contentType = response.headers.get('content-type')?.split(';')[0]?.trim()
+ const fallbackMimeType = this.getAudioMimeType(config.audioFormat)
+
+ if (contentType?.includes('application/json')) {
+ const json = await response.json()
+ const resolved = await this.resolveAudioFromJson(json, fallbackMimeType)
+ if (!resolved) {
+ throw new Error('Voice.ai generate speech returned JSON without audio data')
+ }
+ return resolved
+ }
+
+ const mimeType = contentType && contentType.length > 0 ? contentType : fallbackMimeType
+ const buffer = Buffer.from(await response.arrayBuffer())
+ return { audioBase64: buffer.toString('base64'), mimeType }
+ }
+
+ private extractLatestUserText(messages: ChatMessage[]): string | null {
+ const lastUser = [...messages].reverse().find((message) => message.role === 'user')
+ if (!lastUser?.content) return null
+
+ if (typeof lastUser.content === 'string') {
+ return lastUser.content
+ }
+
+ if (Array.isArray(lastUser.content)) {
+ const textParts = lastUser.content
+ .filter((part) => part.type === 'text')
+ .map((part) => part.text)
+ .filter(Boolean)
+
+ return textParts.length > 0 ? textParts.join('\n') : null
+ }
+
+ return null
+ }
+
+ private buildShortTitle(text: string): string {
+ const normalized = text.replace(/\s+/g, ' ').trim()
+ if (!normalized) return 'Voice AI'
+ return normalized.length > 32 ? `${normalized.slice(0, 32)}…` : normalized
+ }
+}
diff --git a/src/renderer/settings/components/ModelProviderSettingsDetail.vue b/src/renderer/settings/components/ModelProviderSettingsDetail.vue
index 5273daa09..30615df24 100644
--- a/src/renderer/settings/components/ModelProviderSettingsDetail.vue
+++ b/src/renderer/settings/components/ModelProviderSettingsDetail.vue
@@ -45,6 +45,10 @@
+
+
+
+
@@ -104,6 +108,7 @@ import { levelToValueMap, safetyCategories } from '@/lib/gemini'
import { Separator } from '@shadcn/components/ui/separator'
import type { SafetyCategoryKey, SafetySettingValue } from '@/lib/gemini'
import { useThrottleFn } from '@vueuse/core'
+import VoiceAIProviderConfig from './VoiceAIProviderConfig.vue'
interface ProviderWebsites {
official: string
diff --git a/src/renderer/settings/components/VoiceAIProviderConfig.vue b/src/renderer/settings/components/VoiceAIProviderConfig.vue
new file mode 100644
index 000000000..083a3c1be
--- /dev/null
+++ b/src/renderer/settings/components/VoiceAIProviderConfig.vue
@@ -0,0 +1,204 @@
+
+
+
+
+
+
+
+
+
{{ t('settings.provider.voiceai.title') }}
+
+ {{ t('settings.provider.voiceai.description') }}
+
+
+
+
+
+
+
+
+
+
+
+ {{ t('settings.provider.voiceai.audioFormat.helper') }}
+
+
+
+
+
+
+
+ {{ t('settings.provider.voiceai.language.helper') }}
+
+
+
+
+
+
+
+ {{ t('settings.provider.voiceai.model.helper') }}
+
+
+
+
+
+
+
+
+
+
+ {{ temperature.toFixed(2) }}
+
+
+
+ {{ t('settings.provider.voiceai.temperature.helper') }}
+
+
+
+
+
+
+ {{ topP.toFixed(2) }}
+
+
+
+ {{ t('settings.provider.voiceai.topP.helper') }}
+
+
+
+
+
+
+
+
diff --git a/src/renderer/src/assets/llm-icons/voiceai.svg b/src/renderer/src/assets/llm-icons/voiceai.svg
new file mode 100644
index 000000000..d68a970f9
--- /dev/null
+++ b/src/renderer/src/assets/llm-icons/voiceai.svg
@@ -0,0 +1,11 @@
+
diff --git a/src/renderer/src/components/icons/ModelIcon.vue b/src/renderer/src/components/icons/ModelIcon.vue
index 46748a54b..1116901d7 100644
--- a/src/renderer/src/components/icons/ModelIcon.vue
+++ b/src/renderer/src/components/icons/ModelIcon.vue
@@ -68,6 +68,7 @@ import zenmuxColorIcon from '@/assets/llm-icons/zenmux-color.svg?url'
import burncloudColorIcon from '@/assets/llm-icons/burncloud-color.svg?url'
import xiaomiColorIcon from '@/assets/llm-icons/xiaomi.png?url'
import o3fanColorIcon from '@/assets/llm-icons/o3-fan.png?url'
+import voiceAiColorIcon from '@/assets/llm-icons/voiceai.svg?url'
// 导入所有图标
const icons = {
@@ -148,6 +149,7 @@ const icons = {
zenmux: zenmuxColorIcon,
burncloud: burncloudColorIcon,
xiaomi: xiaomiColorIcon,
+ voiceai: voiceAiColorIcon,
default: defaultIcon
}
diff --git a/src/renderer/src/components/message/MessageBlockAudio.vue b/src/renderer/src/components/message/MessageBlockAudio.vue
new file mode 100644
index 000000000..bbcda61aa
--- /dev/null
+++ b/src/renderer/src/components/message/MessageBlockAudio.vue
@@ -0,0 +1,112 @@
+
+
+
+
+
+
+
+
+
+
+ {{ t('mcp.sampling.contentType.audio') }}
+
+
+
+ {{ resolvedAudioData.mimeType }}
+
+
+ {{ t('common.error.requestFailed') }}
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/renderer/src/components/message/MessageItemAssistant.vue b/src/renderer/src/components/message/MessageItemAssistant.vue
index 04310749b..9dae745ce 100644
--- a/src/renderer/src/components/message/MessageItemAssistant.vue
+++ b/src/renderer/src/components/message/MessageItemAssistant.vue
@@ -71,6 +71,12 @@
:message-id="currentMessage.id"
:thread-id="currentThreadId"
/>
+
{
+ if (block.type === 'audio') return true
+ if (block.type !== 'image') return false
+ const mimeType = block.image_data?.mimeType?.toLowerCase() || ''
+ if (mimeType.startsWith('audio/')) return true
+ const data = block.image_data?.data || ''
+ if (data.startsWith('data:audio/')) return true
+ if (data.startsWith('imgcache://') || data.startsWith('http://') || data.startsWith('https://')) {
+ const lower = data.toLowerCase()
+ return AUDIO_EXTENSIONS.some((ext) => lower.includes(ext))
+ }
+ return false
+}
+
// 定义事件
const emit = defineEmits<{
copyImage: [
diff --git a/src/renderer/src/components/settings/ModelConfigDialog.vue b/src/renderer/src/components/settings/ModelConfigDialog.vue
index ed9582da9..54e551834 100644
--- a/src/renderer/src/components/settings/ModelConfigDialog.vue
+++ b/src/renderer/src/components/settings/ModelConfigDialog.vue
@@ -507,7 +507,8 @@ const isOpenAICompatibleProvider = computed(() => {
'aws-bedrock',
'github-copilot',
'ollama',
- 'acp'
+ 'acp',
+ 'voiceai'
]
const providerId = props.providerId?.toLowerCase() || ''
return !EXCLUDED_PROVIDERS.some((excluded) => providerId.includes(excluded))
diff --git a/src/renderer/src/i18n/en-US/settings.json b/src/renderer/src/i18n/en-US/settings.json
index 62c3cd32e..09104c498 100644
--- a/src/renderer/src/i18n/en-US/settings.json
+++ b/src/renderer/src/i18n/en-US/settings.json
@@ -493,6 +493,33 @@
"verifyFailed": "Verification failed",
"verifySuccess": "Verification is successful"
},
+ "voiceai": {
+ "title": "Voice.ai Text-to-Speech",
+ "description": "Generate speech from text. Voices appear in the model list below.",
+ "audioFormat": {
+ "label": "Audio Format",
+ "placeholder": "Select format",
+ "helper": "MP3 is recommended for most cases."
+ },
+ "language": {
+ "label": "Language",
+ "placeholder": "e.g. en",
+ "helper": "Supported: en, ca, sv, es, fr, de, it, pt, pl, ru, nl."
+ },
+ "model": {
+ "label": "TTS Model",
+ "placeholder": "voiceai-tts-v1-latest",
+ "helper": "See the Voice.ai docs for supported models."
+ },
+ "temperature": {
+ "label": "Temperature",
+ "helper": "Controls randomness (0-2)."
+ },
+ "topP": {
+ "label": "Top P",
+ "helper": "Nucleus sampling (0-1)."
+ }
+ },
"anthropicApiKeyTip": "Please go to Anthropic Console to get your API Key",
"anthropicConnected": "Anthropic connected",
"anthropicNotConnected": "Anthropic not connected",
diff --git a/src/renderer/src/i18n/zh-CN/settings.json b/src/renderer/src/i18n/zh-CN/settings.json
index e93f99dde..4e3ae8873 100644
--- a/src/renderer/src/i18n/zh-CN/settings.json
+++ b/src/renderer/src/i18n/zh-CN/settings.json
@@ -394,6 +394,33 @@
"operationalDescription": "同步 ModelScope 平台上可直接使用的 MCP 服务器"
}
},
+ "voiceai": {
+ "title": "Voice.ai 文字转语音",
+ "description": "将文本生成语音,voice 会在下方模型列表中展示。",
+ "audioFormat": {
+ "label": "音频格式",
+ "placeholder": "选择格式",
+ "helper": "多数场景推荐 MP3。"
+ },
+ "language": {
+ "label": "语言",
+ "placeholder": "例如 en",
+ "helper": "支持语言:en, ca, sv, es, fr, de, it, pt, pl, ru, nl。"
+ },
+ "model": {
+ "label": "TTS 模型",
+ "placeholder": "voiceai-tts-v1-latest",
+ "helper": "可选模型请查看 Voice.ai 文档。"
+ },
+ "temperature": {
+ "label": "温度",
+ "helper": "随机性参数(0-2)。"
+ },
+ "topP": {
+ "label": "Top P",
+ "helper": "Nucleus 采样(0-1)。"
+ }
+ },
"dialog": {
"disableModel": {
"title": "确认禁用模型",
diff --git a/src/renderer/src/stores/chat.ts b/src/renderer/src/stores/chat.ts
index abc4f30a1..5774875e8 100644
--- a/src/renderer/src/stores/chat.ts
+++ b/src/renderer/src/stores/chat.ts
@@ -1225,9 +1225,12 @@ export const useChatStore = defineStore('chat', () => {
}
} else if (msg.image_data) {
finalizeAssistantMessageBlocks(assistantMsg.content)
+ const mimeType = msg.image_data.mimeType || ''
+ const isAudio =
+ mimeType.startsWith('audio/') || msg.image_data.data?.startsWith('data:audio/')
assistantMsg.content.push({
- type: 'image',
- content: 'image',
+ type: isAudio ? 'audio' : 'image',
+ content: isAudio ? 'audio' : 'image',
status: 'success',
timestamp: Date.now(),
image_data: {
diff --git a/src/renderer/src/stores/providerStore.ts b/src/renderer/src/stores/providerStore.ts
index 5c1d21fbd..aff55d07c 100644
--- a/src/renderer/src/stores/providerStore.ts
+++ b/src/renderer/src/stores/providerStore.ts
@@ -5,6 +5,14 @@ import { useIpcQuery } from '@/composables/useIpcQuery'
import { CONFIG_EVENTS, PROVIDER_DB_EVENTS } from '@/events'
import type { AWS_BEDROCK_PROVIDER, LLM_PROVIDER, VERTEX_PROVIDER } from '@shared/presenter'
+type VoiceAIConfig = {
+ audioFormat: string
+ model: string
+ language: string
+ temperature: number
+ topP: number
+}
+
const PROVIDER_ORDER_KEY = 'providerOrder'
const PROVIDER_TIMESTAMP_KEY = 'providerTimestamps'
@@ -340,6 +348,34 @@ export const useProviderStore = defineStore('provider', () => {
return await configP.getSetting('awsBedrockCredential')
}
+ const getVoiceAIConfig = async (): Promise<VoiceAIConfig> => {
+ return {
+ audioFormat: (await configP.getSetting('voiceAI_audioFormat')) || 'mp3',
+ model: (await configP.getSetting('voiceAI_model')) || 'voiceai-tts-v1-latest',
+ language: (await configP.getSetting('voiceAI_language')) || 'en',
+ temperature: (await configP.getSetting('voiceAI_temperature')) ?? 1,
+ topP: (await configP.getSetting('voiceAI_topP')) ?? 0.8
+ }
+ }
+
+ const updateVoiceAIConfig = async (updates: Partial) => {
+ if (updates.audioFormat !== undefined) {
+ await configP.setSetting('voiceAI_audioFormat', updates.audioFormat)
+ }
+ if (updates.model !== undefined) {
+ await configP.setSetting('voiceAI_model', updates.model)
+ }
+ if (updates.language !== undefined) {
+ await configP.setSetting('voiceAI_language', updates.language)
+ }
+ if (updates.temperature !== undefined) {
+ await configP.setSetting('voiceAI_temperature', updates.temperature)
+ }
+ if (updates.topP !== undefined) {
+ await configP.setSetting('voiceAI_topP', updates.topP)
+ }
+ }
+
const updateProviderTimestamp = async (providerId: string) => {
providerTimestamps.value[providerId] = Date.now()
await saveProviderTimestamps()
@@ -416,6 +452,8 @@ export const useProviderStore = defineStore('provider', () => {
setGeminiSafety,
getGeminiSafety,
setAwsBedrockCredential,
- getAwsBedrockCredential
+ getAwsBedrockCredential,
+ getVoiceAIConfig,
+ updateVoiceAIConfig
}
})
diff --git a/src/shared/chat.d.ts b/src/shared/chat.d.ts
index 26d514b99..8ba383299 100644
--- a/src/shared/chat.d.ts
+++ b/src/shared/chat.d.ts
@@ -91,6 +91,7 @@ export type AssistantMessageBlock = {
| 'tool_call'
| 'action'
| 'image'
+ | 'audio'
| 'artifact-thinking'
| 'mcp_ui_resource'
id?: string
diff --git a/src/shared/types/core/chat.ts b/src/shared/types/core/chat.ts
index 198ba868a..7066100d0 100644
--- a/src/shared/types/core/chat.ts
+++ b/src/shared/types/core/chat.ts
@@ -55,6 +55,7 @@ export type AssistantMessageBlock = {
| 'tool_call'
| 'action'
| 'image'
+ | 'audio'
| 'artifact-thinking'
| 'mcp_ui_resource'
content?: string
From cf5af65e3151a7de0a4d6aea625b90110a6f4e89 Mon Sep 17 00:00:00 2001
From: zhangmo8
Date: Tue, 20 Jan 2026 18:30:28 +0800
Subject: [PATCH 2/4] feat: voice ai call phone
---
electron.vite.config.ts | 5 +-
.../components/VoiceAIProviderConfig.vue | 57 ++++-
.../src/components/chat-input/ChatInput.vue | 218 +++++++++++++++++-
.../components/message/MessageBlockAudio.vue | 1 -
src/renderer/src/i18n/en-US/chat.json | 11 +
src/renderer/src/i18n/en-US/settings.json | 5 +
src/renderer/src/i18n/zh-CN/chat.json | 11 +
src/renderer/src/i18n/zh-CN/settings.json | 5 +
src/renderer/src/stores/providerStore.ts | 16 +-
vitest.config.renderer.ts | 13 +-
vitest.config.ts | 16 +-
11 files changed, 341 insertions(+), 17 deletions(-)
diff --git a/electron.vite.config.ts b/electron.vite.config.ts
index b32eae5f7..effef1451 100644
--- a/electron.vite.config.ts
+++ b/electron.vite.config.ts
@@ -7,6 +7,8 @@ import monacoEditorPlugin from 'vite-plugin-monaco-editor-esm'
import path from 'node:path'
import tailwindcss from '@tailwindcss/vite'
+const isCustomElement = (tag: string) =>
+ tag === 'voice-agent-widget' || tag.startsWith('ui-resource-renderer')
export default defineConfig({
main: {
@@ -82,8 +84,7 @@ export default defineConfig({
vue({
template: {
compilerOptions: {
- // 将所有带短横线的标签名都视为自定义元素
- isCustomElement: (tag) => tag.startsWith('ui-resource-renderer')
+ isCustomElement
}
}
}),
diff --git a/src/renderer/settings/components/VoiceAIProviderConfig.vue b/src/renderer/settings/components/VoiceAIProviderConfig.vue
index 083a3c1be..8cdb2a3a8 100644
--- a/src/renderer/settings/components/VoiceAIProviderConfig.vue
+++ b/src/renderer/settings/components/VoiceAIProviderConfig.vue
@@ -39,12 +39,20 @@
-
+
{{ t('settings.provider.voiceai.language.helper') }}
@@ -64,6 +72,21 @@
{{ t('settings.provider.voiceai.model.helper') }}
+
+
+
+
+
+ {{ t('settings.provider.voiceai.agentId.helper') }}
+
+
@@ -144,14 +167,30 @@ const ttsModel = ref('voiceai-tts-v1-latest')
const language = ref('en')
const temperature = ref(1)
const topP = ref(0.8)
+const agentId = ref('')
const isHydrating = ref(true)
+const languageOptions = [
+ { value: 'en', label: 'English (en)' },
+ { value: 'ca', label: 'Catalan (ca)' },
+ { value: 'sv', label: 'Swedish (sv)' },
+ { value: 'es', label: 'Spanish (es)' },
+ { value: 'fr', label: 'French (fr)' },
+ { value: 'de', label: 'German (de)' },
+ { value: 'it', label: 'Italian (it)' },
+ { value: 'pt', label: 'Portuguese (pt)' },
+ { value: 'pl', label: 'Polish (pl)' },
+ { value: 'ru', label: 'Russian (ru)' },
+ { value: 'nl', label: 'Dutch (nl)' }
+]
+
type VoiceAIConfigUpdates = {
audioFormat?: string
model?: string
language?: string
temperature?: number
topP?: number
+ agentId?: string
}
const persistUpdates = useDebounceFn(async (updates: VoiceAIConfigUpdates) => {
@@ -166,6 +205,7 @@ const loadConfig = async () => {
language.value = config.language
temperature.value = config.temperature
topP.value = config.topP
+ agentId.value = config.agentId
isHydrating.value = false
}
@@ -188,6 +228,11 @@ watch(language, (value) => {
void persistUpdates({ language: value })
})
+watch(agentId, (value) => {
+ if (isHydrating.value) return
+ void persistUpdates({ agentId: value })
+})
+
const onTemperatureChange = (value: number[] | undefined) => {
if (!value || value[0] === undefined) return
temperature.value = value[0]
diff --git a/src/renderer/src/components/chat-input/ChatInput.vue b/src/renderer/src/components/chat-input/ChatInput.vue
index 36847c8f5..d68d89c25 100644
--- a/src/renderer/src/components/chat-input/ChatInput.vue
+++ b/src/renderer/src/components/chat-input/ChatInput.vue
@@ -20,6 +20,7 @@
"
:class="[
'flex flex-col gap-2 relative',
+ isCallActive ? 'pointer-events-none opacity-60' : '',
variant === 'newThread'
? 'bg-card rounded-lg border p-2 shadow-sm'
: 'border-t px-4 py-3 gap-3'
@@ -371,13 +372,29 @@
/>
+
+
+
+
+
+ {{ t('chat.call.start') }}
+
+