Skip to content

Commit 40b4d4e

Browse files
sarvam integration for stt and tts (#216)
* Sarvam integration for STT and TTS
* Resolved review comment
1 parent 36527a5 commit 40b4d4e

File tree

15 files changed

+353
-11
lines changed

15 files changed

+353
-11
lines changed

wavefront/client/src/config/voice-providers.ts

Lines changed: 91 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ export interface VoiceProvidersConfig {
4444
*/
4545
export const VOICE_PROVIDERS_CONFIG: VoiceProvidersConfig = {
4646
tts: {
47-
providers: ['elevenlabs', 'deepgram', 'cartesia'] as const,
47+
providers: ['elevenlabs', 'deepgram', 'cartesia', 'sarvam'] as const,
4848
configs: {
4949
elevenlabs: {
5050
name: 'ElevenLabs',
@@ -159,10 +159,68 @@ export const VOICE_PROVIDERS_CONFIG: VoiceProvidersConfig = {
159159
},
160160
},
161161
},
162+
sarvam: {
163+
name: 'Sarvam',
164+
badge: {
165+
bg: 'bg-orange-100',
166+
text: 'text-orange-800',
167+
},
168+
parameters: {
169+
model: {
170+
type: 'string',
171+
default: 'bulbul:v2',
172+
options: ['bulbul:v2', 'bulbul:v3'],
173+
description: 'Sarvam TTS model',
174+
},
175+
language: {
176+
type: 'string',
177+
default: '',
178+
description: 'Language code',
179+
placeholder: 'hi',
180+
},
181+
pitch: {
182+
type: 'number',
183+
default: 0.0,
184+
min: -0.75,
185+
max: 0.75,
186+
step: 0.05,
187+
description: 'Voice pitch (-0.75 to 0.75)',
188+
},
189+
pace: {
190+
type: 'number',
191+
default: 1.0,
192+
min: 0.3,
193+
max: 3.0,
194+
step: 0.1,
195+
description: 'Speech pace (0.3-3.0)',
196+
},
197+
loudness: {
198+
type: 'number',
199+
default: 1.0,
200+
min: 0.1,
201+
max: 3.0,
202+
step: 0.1,
203+
description: 'Volume (0.1-3.0)',
204+
},
205+
enable_preprocessing: {
206+
type: 'boolean',
207+
default: false,
208+
description: 'Enable text preprocessing',
209+
},
210+
temperature: {
211+
type: 'number',
212+
default: 0.6,
213+
min: 0.01,
214+
max: 1.0,
215+
step: 0.05,
216+
description: 'Randomness for bulbul v3 (0.01-1.0)',
217+
},
218+
},
219+
},
162220
},
163221
},
164222
stt: {
165-
providers: ['deepgram'] as const,
223+
providers: ['deepgram', 'sarvam'] as const,
166224
configs: {
167225
deepgram: {
168226
name: 'Deepgram',
@@ -236,6 +294,37 @@ export const VOICE_PROVIDERS_CONFIG: VoiceProvidersConfig = {
236294
},
237295
},
238296
},
297+
sarvam: {
298+
name: 'Sarvam',
299+
badge: {
300+
bg: 'bg-orange-100',
301+
text: 'text-orange-800',
302+
},
303+
parameters: {
304+
model: {
305+
type: 'string',
306+
default: 'saarika:v2.5',
307+
options: ['saarika:v2.5', 'saaras:v2'],
308+
description: 'Sarvam STT model',
309+
},
310+
language: {
311+
type: 'string',
312+
default: '',
313+
description: 'Language code',
314+
placeholder: 'hi',
315+
},
316+
vad_signals: {
317+
type: 'boolean',
318+
default: true,
319+
description: 'Enable VAD signals',
320+
},
321+
high_vad_sensitivity: {
322+
type: 'boolean',
323+
default: false,
324+
description: 'High VAD sensitivity',
325+
},
326+
},
327+
},
239328
},
240329
},
241330
};

wavefront/client/src/pages/apps/[appId]/voice-agents/stt-configs/CreateSttConfigDialog.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ import { z } from 'zod';
3232
/**
 * Validation schema for the fields submitted by CreateSttConfigDialog.
 *
 * - `display_name`: required, 1-100 characters.
 * - `description`: optional, at most 500 characters.
 * - `provider`: one of the supported STT providers.
 * - `api_key`: required when creating a config.
 */
const createSttConfigSchema = z.object({
  display_name: z.string().min(1, 'Display name is required').max(100, 'Display name must be 100 characters or less'),
  description: z.string().max(500, 'Description must be 500 characters or less').optional(),
  // Mirrors the `SttProvider` union in types/stt-config.ts.
  // NOTE(review): the tuple cast widens the inferred field type to `string`.
  provider: z.enum(['deepgram', 'sarvam'] as [string, ...string[]]),
  api_key: z.string().min(1, 'API key is required'),
});
3838

wavefront/client/src/pages/apps/[appId]/voice-agents/stt-configs/EditSttConfigDialog.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ import { z } from 'zod';
3333
/**
 * Validation schema for the fields submitted by EditSttConfigDialog.
 *
 * Same rules as the create schema except that `api_key` is optional.
 */
const updateSttConfigSchema = z.object({
  display_name: z.string().min(1, 'Display name is required').max(100, 'Display name must be 100 characters or less'),
  description: z.string().max(500, 'Description must be 500 characters or less').optional(),
  // Mirrors the `SttProvider` union in types/stt-config.ts.
  // NOTE(review): the tuple cast widens the inferred field type to `string`.
  provider: z.enum(['deepgram', 'sarvam'] as [string, ...string[]]),
  api_key: z.string().optional(),
});
3939

wavefront/client/src/pages/apps/[appId]/voice-agents/tts-configs/CreateTtsConfigDialog.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ import { z } from 'zod';
3232
/**
 * Validation schema for the fields submitted by CreateTtsConfigDialog.
 *
 * - `display_name`: required, 1-100 characters.
 * - `description`: optional, at most 500 characters.
 * - `provider`: one of the supported TTS providers.
 * - `api_key`: required when creating a config.
 */
const createTtsConfigSchema = z.object({
  display_name: z.string().min(1, 'Display name is required').max(100, 'Display name must be 100 characters or less'),
  description: z.string().max(500, 'Description must be 500 characters or less').optional(),
  // Mirrors the `TtsProvider` union in types/tts-config.ts.
  // NOTE(review): the tuple cast widens the inferred field type to `string`.
  provider: z.enum(['elevenlabs', 'deepgram', 'cartesia', 'sarvam'] as [string, ...string[]]),
  api_key: z.string().min(1, 'API key is required'),
});
3838

wavefront/client/src/pages/apps/[appId]/voice-agents/tts-configs/EditTtsConfigDialog.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ import { z } from 'zod';
3333
/**
 * Validation schema for the fields submitted by EditTtsConfigDialog.
 *
 * Same rules as the create schema except that `api_key` is optional.
 */
const updateTtsConfigSchema = z.object({
  display_name: z.string().min(1, 'Display name is required').max(100, 'Display name must be 100 characters or less'),
  description: z.string().max(500, 'Description must be 500 characters or less').optional(),
  // Mirrors the `TtsProvider` union in types/tts-config.ts.
  // NOTE(review): the tuple cast widens the inferred field type to `string`.
  provider: z.enum(['elevenlabs', 'deepgram', 'cartesia', 'sarvam'] as [string, ...string[]]),
  api_key: z.string().optional(),
});
3939

wavefront/client/src/types/stt-config.ts

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import { IApiResponse } from '@app/lib/axios';
22

3-
/** Identifier of a supported speech-to-text provider. */
export type SttProvider = 'deepgram' | 'sarvam';
44

55
export interface SttConfig {
66
id: string;
@@ -52,3 +52,11 @@ export interface DeepgramSttParameters {
5252
profanity_filter?: boolean;
5353
vad_events?: boolean;
5454
}
55+
56+
/** Sarvam STT specific parameters. Every field is optional. */
export interface SarvamSttParameters {
  /** Sarvam STT model; default: 'saarika:v2.5'. */
  model?: string;
  /** Language code, e.g. 'hi'. */
  language?: string;
  /** Enable VAD signals. */
  vad_signals?: boolean;
  /** Use high VAD sensitivity. */
  high_vad_sensitivity?: boolean;
}

wavefront/client/src/types/tts-config.ts

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import { IApiResponse } from '@app/lib/axios';
22

3-
/** Identifier of a supported text-to-speech provider. */
export type TtsProvider = 'elevenlabs' | 'deepgram' | 'cartesia' | 'sarvam';
44

55
export interface TtsConfig {
66
id: string;
@@ -62,3 +62,14 @@ export interface CartesiaParameters {
6262
language?: string; // Language enum
6363
speed?: number;
6464
}
65+
66+
/** Sarvam TTS specific parameters. Every field is optional. */
export interface SarvamTtsParameters {
  /** Sarvam TTS model; default: 'bulbul:v2'. */
  model?: string;
  /** Language code, e.g. 'hi'. */
  language?: string;
  /** Voice pitch, -0.75 to 0.75. */
  pitch?: number;
  /** Speech pace, 0.3 to 3.0. */
  pace?: number;
  /** Volume, 0.1 to 3.0. */
  loudness?: number;
  /** Enable text preprocessing. */
  enable_preprocessing?: boolean;
  /** Randomness, 0.01 to 1.0. */
  temperature?: number;
}

wavefront/server/apps/call_processing/call_processing/services/stt_service.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@
99

1010
# Pipecat STT services
1111
from pipecat.services.deepgram.stt import DeepgramSTTService
12+
from pipecat.services.sarvam.stt import SarvamSTTService
13+
14+
# Pipecat language enum
15+
from pipecat.transcriptions.language import Language
1216

1317
# Deepgram options
1418
from deepgram import LiveOptions
@@ -51,6 +55,8 @@ def create_stt_service(stt_config: Dict[str, Any]):
5155

5256
if provider == 'deepgram':
5357
return STTServiceFactory._create_deepgram_stt(api_key, parameters)
58+
elif provider == 'sarvam':
59+
return STTServiceFactory._create_sarvam_stt(api_key, parameters)
5460
elif provider == 'assemblyai':
5561
return STTServiceFactory._create_assemblyai_stt(api_key, parameters)
5662
elif provider == 'whisper':
@@ -106,6 +112,56 @@ def _create_deepgram_stt(api_key: str, parameters: Dict[str, Any]):
106112

107113
return DeepgramSTTService(api_key=api_key, live_options=live_options)
108114

115+
# Mapping of short language codes (as stored in the STT config) to the
# India-locale members of pipecat's Language enum used for Sarvam.
SARVAM_LANGUAGE_MAP = {
    'bn': Language.BN_IN,
    'en': Language.EN_IN,
    'gu': Language.GU_IN,
    'hi': Language.HI_IN,
    'kn': Language.KN_IN,
    'ml': Language.ML_IN,
    'mr': Language.MR_IN,
    'or': Language.OR_IN,
    'pa': Language.PA_IN,
    'ta': Language.TA_IN,
    'te': Language.TE_IN,
}
129+
130+
@staticmethod
def _create_sarvam_stt(api_key: str, parameters: Dict[str, Any]):
    """Create Sarvam STT service.

    Args:
        api_key: Sarvam API key.
        parameters: Provider parameters. Recognised keys are ``model``
            (default ``'saarika:v2.5'``), ``sample_rate`` (default ``8000``),
            ``language`` (short code such as ``'hi'``; ``'hi-IN'`` and
            mixed case are also accepted), ``vad_signals`` and
            ``high_vad_sensitivity``.

    Returns:
        A configured ``SarvamSTTService``.
    """
    params_dict = {}

    # Map language code to pipecat Language enum. An empty string (the
    # client-side default) means "not set" and is skipped silently.
    raw_language = parameters.get('language')
    if raw_language:
        lang_code = str(raw_language).strip().lower()
        # Accept the full locale form ('hi-in') as well as the short code.
        if lang_code.endswith('-in'):
            lang_code = lang_code[: -len('-in')]
        lang_enum = STTServiceFactory.SARVAM_LANGUAGE_MAP.get(lang_code)
        if lang_enum is not None:
            params_dict['language'] = lang_enum
        else:
            logger.warning(f"Unknown Sarvam language '{raw_language}', skipping")

    # Forward only flags that carry a value; a key stored as None must not
    # override the service's own default.
    for key in ('vad_signals', 'high_vad_sensitivity'):
        value = parameters.get(key)
        if value is not None:
            params_dict[key] = value

    # `or` also covers a key that is present but empty/None, which
    # dict.get(key, default) would pass through unchanged.
    model = parameters.get('model') or 'saarika:v2.5'
    sample_rate = parameters.get('sample_rate') or 8000

    input_params = (
        SarvamSTTService.InputParams(**params_dict) if params_dict else None
    )

    logger.info(f'Sarvam STT config: model={model}, sample_rate={sample_rate}')

    return SarvamSTTService(
        api_key=api_key,
        model=model,
        sample_rate=sample_rate,
        params=input_params,
    )
164+
109165
@staticmethod
110166
def _create_assemblyai_stt(api_key: str, parameters: Dict[str, Any]):
111167
"""Create AssemblyAI STT service"""

wavefront/server/apps/call_processing/call_processing/services/tts_service.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from pipecat.services.elevenlabs.tts import ElevenLabsTTSService
1212
from pipecat.services.deepgram.tts import DeepgramTTSService
1313
from pipecat.services.cartesia.tts import CartesiaTTSService
14+
from pipecat.services.sarvam.tts import SarvamTTSService
1415

1516
# Language for params
1617
from pipecat.transcriptions.language import Language
@@ -62,6 +63,8 @@ def create_tts_service(tts_config: Dict[str, Any]):
6263
return TTSServiceFactory._create_deepgram_tts(api_key, voice_id, parameters)
6364
elif provider == 'cartesia':
6465
return TTSServiceFactory._create_cartesia_tts(api_key, voice_id, parameters)
66+
elif provider == 'sarvam':
67+
return TTSServiceFactory._create_sarvam_tts(api_key, voice_id, parameters)
6568
else:
6669
raise ValueError(f'Unsupported TTS provider: {provider}')
6770

@@ -162,3 +165,55 @@ def _create_cartesia_tts(api_key: str, voice_id: str, parameters: Dict[str, Any]
162165
return CartesiaTTSService(
163166
api_key=api_key, voice_id=voice_id, model=model, params=input_params
164167
)
168+
169+
# Mapping of short language codes (as stored in the TTS config) to the
# India-locale members of pipecat's Language enum used for Sarvam.
SARVAM_LANGUAGE_MAP = {
    'bn': Language.BN_IN,
    'en': Language.EN_IN,
    'gu': Language.GU_IN,
    'hi': Language.HI_IN,
    'kn': Language.KN_IN,
    'ml': Language.ML_IN,
    'mr': Language.MR_IN,
    'or': Language.OR_IN,
    'pa': Language.PA_IN,
    'ta': Language.TA_IN,
    'te': Language.TE_IN,
}
183+
184+
@staticmethod
def _create_sarvam_tts(api_key: str, voice_id: str, parameters: Dict[str, Any]):
    """Create Sarvam TTS service (WebSocket-based streaming).

    Args:
        api_key: Sarvam API key.
        voice_id: Voice identifier passed through to the service.
        parameters: Provider parameters. Recognised keys are ``model``
            (default ``'bulbul:v2'``), ``language`` (short code such as
            ``'hi'``; ``'hi-IN'`` and mixed case are also accepted),
            ``pitch``, ``pace``, ``loudness``, ``enable_preprocessing``
            and ``temperature``.

    Returns:
        A configured ``SarvamTTSService``.
    """
    # `or` also covers a key that is present but empty/None, which
    # dict.get(key, default) would pass through unchanged.
    model = parameters.get('model') or 'bulbul:v2'

    # Build InputParams from the parameters dict
    params_dict = {}

    # An empty string (the client-side default) means "not set".
    raw_language = parameters.get('language')
    if raw_language:
        lang_code = str(raw_language).strip().lower()
        # Accept the full locale form ('hi-in') as well as the short code.
        if lang_code.endswith('-in'):
            lang_code = lang_code[: -len('-in')]
        lang_enum = TTSServiceFactory.SARVAM_LANGUAGE_MAP.get(lang_code)
        if lang_enum is not None:
            params_dict['language'] = lang_enum
        else:
            logger.warning(f"Unknown Sarvam language '{raw_language}', skipping")

    # Forward only settings that carry a value; a key stored as None must
    # not override the service's own default.
    # NOTE(review): the client config describes `temperature` as a
    # bulbul v3 setting - confirm the service accepts it with bulbul:v2.
    for key in (
        'pitch',
        'pace',
        'loudness',
        'enable_preprocessing',
        'temperature',
    ):
        value = parameters.get(key)
        if value is not None:
            params_dict[key] = value

    input_params = (
        SarvamTTSService.InputParams(**params_dict) if params_dict else None
    )

    logger.info(f'Sarvam TTS config: voice={voice_id}, model={model}')

    return SarvamTTSService(
        api_key=api_key, voice_id=voice_id, model=model, params=input_params
    )

wavefront/server/apps/call_processing/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ dependencies = [
2121
"redis>=5.0.0",
2222
"tenacity>=8.0.0",
2323
# Pipecat and voice processing
24-
"pipecat-ai[websocket,cartesia,google,silero,deepgram,groq,runner,azure,local-smart-turn-v3]==0.0.100",
24+
"pipecat-ai[websocket,cartesia,google,silero,deepgram,groq,runner,azure,local-smart-turn-v3,sarvam]==0.0.100",
2525
# Twilio
2626
"twilio>=8.0.0",
2727
]

0 commit comments

Comments
 (0)