diff --git a/packages/llm/src/protocols/gemini.ts b/packages/llm/src/protocols/gemini.ts index c8fa34b50965..3a2311c8fdc1 100644 --- a/packages/llm/src/protocols/gemini.ts +++ b/packages/llm/src/protocols/gemini.ts @@ -21,7 +21,7 @@ import { GeminiToolSchema } from "./utils/gemini-tool-schema" import { Lifecycle } from "./utils/lifecycle" const ADAPTER = "gemini" -const IMAGE_MIMES = new Set(ProviderShared.IMAGE_MIMES) +const MEDIA_MIMES = new Set(ProviderShared.MEDIA_MIMES) export const DEFAULT_BASE_URL = "https://generativelanguage.googleapis.com/v1beta" // ============================================================================= @@ -182,7 +182,7 @@ const lowerToolConfig = (toolChoice: NonNullable) => const lowerUserPart = Effect.fn("Gemini.lowerUserPart")(function* (part: TextPart | MediaPart) { if (part.type === "text") return { text: part.text } - const media = yield* ProviderShared.validateMedia("Gemini", part, IMAGE_MIMES) + const media = yield* ProviderShared.validateMedia("Gemini", part, MEDIA_MIMES) return { inlineData: { mimeType: media.mime, data: media.base64 } } }) @@ -275,7 +275,7 @@ const lowerMessages = Effect.fn("Gemini.lowerMessages")(function* (request: LLMR }) for (const item of content) { if (item.type === "text") continue - const media = yield* ProviderShared.validateToolFile("Gemini", item, IMAGE_MIMES) + const media = yield* ProviderShared.validateToolFile("Gemini", item, MEDIA_MIMES) parts.push({ inlineData: { mimeType: media.mime, data: media.base64 } }) } } diff --git a/packages/llm/src/protocols/shared.ts b/packages/llm/src/protocols/shared.ts index 4a1fed55398a..66b353c82854 100644 --- a/packages/llm/src/protocols/shared.ts +++ b/packages/llm/src/protocols/shared.ts @@ -188,8 +188,11 @@ export const parseToolInput = (route: string, name: string, raw: string) => parseJson(route, raw || "{}", `Invalid JSON input for ${route} tool call ${name}`) export const IMAGE_MIMES = ["image/png", "image/jpeg", "image/gif", "image/webp"] as const -export const MAX_MEDIA_ENCODED_BYTES = 8 * 1024 * 1024 -export const MAX_MEDIA_DECODED_BYTES = 6 * 1024 * 1024 +export const VIDEO_MIMES = ["video/mp4", "video/webm", "video/quicktime"] as const +export const AUDIO_MIMES = ["audio/wav", "audio/mp3", "audio/aiff", "audio/aac", "audio/ogg", "audio/flac"] as const +export const MEDIA_MIMES = [...IMAGE_MIMES, ...VIDEO_MIMES, ...AUDIO_MIMES] as const +export const MAX_MEDIA_ENCODED_BYTES = 28 * 1024 * 1024 +export const MAX_MEDIA_DECODED_BYTES = 20 * 1024 * 1024 const base64Pattern = /^(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?$/