From 860db2d7defe6d47149b0e25361083926cd89341 Mon Sep 17 00:00:00 2001 From: "Tommy D. Rossi" Date: Thu, 11 Jun 2026 15:03:49 +0200 Subject: [PATCH 1/7] feat(core): add video media support (mp4, webm) to read tool and Gemini protocol Adds video file reading (MP4, WebM) to the read tool and widens the Gemini protocol to accept video MIME types via inlineData. MIME sniffing uses ftyp major brand validation for MP4 (excludes AVIF/HEIC/M4A) and EBML DocType verification for WebM (excludes Matroska). Video bypasses photon image normalization and passes through as raw base64. Gemini gets higher media size limits for video (20 MB decoded / 28 MB encoded) while other protocols keep the default image limits (6 MB / 8 MB). OpenAI and Anthropic stay image-only since their APIs do not support inline base64 video. Session: ses_1494c002affeaQLbzZve1wSJEm --- packages/core/src/tool/read-filesystem.ts | 14 +++++++++-- packages/core/src/tool/read.ts | 11 ++++++--- packages/llm/src/protocols/gemini.ts | 9 ++++--- packages/llm/src/protocols/shared.ts | 30 +++++++++++++++++------ packages/opencode/src/tool/read.ts | 6 +++-- packages/opencode/src/tool/read.txt | 2 +- packages/opencode/src/util/media.ts | 16 +++++++++++- 7 files changed, 68 insertions(+), 20 deletions(-) diff --git a/packages/core/src/tool/read-filesystem.ts b/packages/core/src/tool/read-filesystem.ts index c27bdae6dee8..d2e1266ae734 100644 --- a/packages/core/src/tool/read-filesystem.ts +++ b/packages/core/src/tool/read-filesystem.ts @@ -94,12 +94,22 @@ const extensions = new Set([ ".pyo", ]) const startsWith = (bytes: Uint8Array, prefix: number[]) => prefix.every((value, index) => bytes[index] === value) -const imageMime = (bytes: Uint8Array) => { +const MP4_VIDEO_BRANDS = new Set(["isom", "iso2", "iso5", "iso6", "mp41", "mp42", "avc1", "dash", "m4v "]) +const isMp4VideoBrand = (bytes: Uint8Array) => { + const brand = Buffer.from(bytes.subarray(8, 12)).toString("ascii") + return MP4_VIDEO_BRANDS.has(brand) +} +const 
isWebmDocType = (bytes: Uint8Array) => + Buffer.from(bytes.subarray(0, Math.min(bytes.length, 64))).toString("latin1").includes("webm") +const mediaMime = (bytes: Uint8Array) => { if (startsWith(bytes, [0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a])) return "image/png" if (startsWith(bytes, [0xff, 0xd8, 0xff])) return "image/jpeg" if (startsWith(bytes, [0x47, 0x49, 0x46, 0x38])) return "image/gif" if (startsWith(bytes, [0x52, 0x49, 0x46, 0x46]) && startsWith(bytes.subarray(8), [0x57, 0x45, 0x42, 0x50])) return "image/webp" + if (bytes.length >= 12 && startsWith(bytes.subarray(4), [0x66, 0x74, 0x79, 0x70]) && isMp4VideoBrand(bytes)) + return "video/mp4" + if (startsWith(bytes, [0x1a, 0x45, 0xdf, 0xa3]) && isWebmDocType(bytes)) return "video/webm" } const binary = (resource: string, bytes: Uint8Array) => { if (extensions.has(path.extname(resource).toLowerCase())) return true @@ -135,7 +145,7 @@ export const read = Effect.fn("ReadTool.read")(function* ( yield* file.readAlloc(Math.min(64 * 1024, Number(info.size) || 4 * 1024)).pipe(Effect.orDie), () => new Uint8Array(), ) - const mime = imageMime(first) + const mime = mediaMime(first) if (mime) { if (info.size > MAX_MEDIA_INGEST_BYTES) return yield* Effect.die(new MediaIngestLimitError(resource, MAX_MEDIA_INGEST_BYTES)) diff --git a/packages/core/src/tool/read.ts b/packages/core/src/tool/read.ts index 64f02d813fe3..64979bd2394d 100644 --- a/packages/core/src/tool/read.ts +++ b/packages/core/src/tool/read.ts @@ -15,6 +15,8 @@ import { Tools } from "./tools" export const name = "read" const SUPPORTED_IMAGE_MIMES = new Set(["image/jpeg", "image/png", "image/gif", "image/webp"]) +const SUPPORTED_VIDEO_MIMES = new Set(["video/mp4", "video/webm", "video/quicktime"]) +const SUPPORTED_MEDIA_MIMES = new Set([...SUPPORTED_IMAGE_MIMES, ...SUPPORTED_VIDEO_MIMES]) const LocationInput = Schema.Struct({ path: Schema.String, offset: ReadToolFileSystem.PageInput.fields.offset.annotate({ @@ -40,14 +42,15 @@ export const layer = 
Layer.effectDiscard( .register({ [name]: Tool.make({ description: - "Read a text file or supported image, page through a large UTF-8 text file by line offset, or list a directory page. Relative paths resolve from the current location; absolute paths are read directly.", + "Read a text file, supported image, or video (mp4, webm), page through a large UTF-8 text file by line offset, or list a directory page. Relative paths resolve from the current location; absolute paths are read directly.", input: Input, output: Output, toModelOutput: ({ input, output }) => { - if (!("encoding" in output) || output.encoding !== "base64" || !SUPPORTED_IMAGE_MIMES.has(output.mime)) + if (!("encoding" in output) || output.encoding !== "base64" || !SUPPORTED_MEDIA_MIMES.has(output.mime)) return [] + const label = SUPPORTED_VIDEO_MIMES.has(output.mime) ? "Video" : "Image" return [ - { type: "text", text: "Image read successfully" }, + { type: "text", text: `${label} read successfully` }, { type: "file", data: output.content, mime: output.mime, name: input.path }, ] }, @@ -82,6 +85,8 @@ export const layer = Layer.effectDiscard( .normalize(resource, { ...content, encoding: "base64" }) .pipe(Effect.catchTag("Image.ResizerUnavailableError", () => Effect.succeed(content))) } + if ("encoding" in content && content.encoding === "base64" && SUPPORTED_VIDEO_MIMES.has(content.mime)) + return content if ("encoding" in content && content.encoding === "base64") return yield* Effect.fail(new ReadToolFileSystem.BinaryFileError(resource)) return content diff --git a/packages/llm/src/protocols/gemini.ts b/packages/llm/src/protocols/gemini.ts index c8fa34b50965..076c75a76121 100644 --- a/packages/llm/src/protocols/gemini.ts +++ b/packages/llm/src/protocols/gemini.ts @@ -21,7 +21,8 @@ import { GeminiToolSchema } from "./utils/gemini-tool-schema" import { Lifecycle } from "./utils/lifecycle" const ADAPTER = "gemini" -const IMAGE_MIMES = new Set(ProviderShared.IMAGE_MIMES) +const MEDIA_MIMES = new 
Set(ProviderShared.MEDIA_MIMES) +const VIDEO_LIMITS: ProviderShared.MediaLimits = { maxDecodedBytes: 20 * 1024 * 1024, maxEncodedBytes: 28 * 1024 * 1024 } export const DEFAULT_BASE_URL = "https://generativelanguage.googleapis.com/v1beta" // ============================================================================= @@ -182,7 +183,8 @@ const lowerToolConfig = (toolChoice: NonNullable) => const lowerUserPart = Effect.fn("Gemini.lowerUserPart")(function* (part: TextPart | MediaPart) { if (part.type === "text") return { text: part.text } - const media = yield* ProviderShared.validateMedia("Gemini", part, IMAGE_MIMES) + const limits = part.mediaType.startsWith("video/") ? VIDEO_LIMITS : undefined + const media = yield* ProviderShared.validateMedia("Gemini", part, MEDIA_MIMES, limits) return { inlineData: { mimeType: media.mime, data: media.base64 } } }) @@ -275,7 +277,8 @@ const lowerMessages = Effect.fn("Gemini.lowerMessages")(function* (request: LLMR }) for (const item of content) { if (item.type === "text") continue - const media = yield* ProviderShared.validateToolFile("Gemini", item, IMAGE_MIMES) + const limits = item.mime.startsWith("video/") ? 
VIDEO_LIMITS : undefined + const media = yield* ProviderShared.validateToolFile("Gemini", item, MEDIA_MIMES, limits) parts.push({ inlineData: { mimeType: media.mime, data: media.base64 } }) } } diff --git a/packages/llm/src/protocols/shared.ts b/packages/llm/src/protocols/shared.ts index 4a1fed55398a..9b665ec93c5c 100644 --- a/packages/llm/src/protocols/shared.ts +++ b/packages/llm/src/protocols/shared.ts @@ -188,6 +188,8 @@ export const parseToolInput = (route: string, name: string, raw: string) => parseJson(route, raw || "{}", `Invalid JSON input for ${route} tool call ${name}`) export const IMAGE_MIMES = ["image/png", "image/jpeg", "image/gif", "image/webp"] as const +export const VIDEO_MIMES = ["video/mp4", "video/webm", "video/quicktime"] as const +export const MEDIA_MIMES = [...IMAGE_MIMES, ...VIDEO_MIMES] as const export const MAX_MEDIA_ENCODED_BYTES = 8 * 1024 * 1024 export const MAX_MEDIA_DECODED_BYTES = 6 * 1024 * 1024 @@ -200,18 +202,26 @@ export interface ValidatedMedia { readonly bytes: Uint8Array } +export interface MediaLimits { + readonly maxDecodedBytes?: number + readonly maxEncodedBytes?: number +} + export const validateMedia = Effect.fn("ProviderShared.validateMedia")(function* ( route: string, part: MediaPart, supportedMimes: ReadonlySet, + limits?: MediaLimits, ) { + const maxDecoded = limits?.maxDecodedBytes ?? MAX_MEDIA_DECODED_BYTES + const maxEncoded = limits?.maxEncodedBytes ?? 
MAX_MEDIA_ENCODED_BYTES const mime = part.mediaType.toLowerCase() if (!supportedMimes.has(mime)) return yield* invalidRequest(`${route} does not support media type ${part.mediaType}`) let base64: string if (typeof part.data !== "string") { - if (part.data.byteLength > MAX_MEDIA_DECODED_BYTES) - return yield* invalidRequest(`${route} media exceeds the ${MAX_MEDIA_DECODED_BYTES} byte decoded limit`) + if (part.data.byteLength > maxDecoded) + return yield* invalidRequest(`${route} media exceeds the ${maxDecoded} byte decoded limit`) base64 = Buffer.from(part.data).toString("base64") } else if (part.data.startsWith("data:")) { const match = /^data:([^;,]+);base64,([A-Za-z0-9+/]*={0,2})$/s.exec(part.data) @@ -223,19 +233,23 @@ export const validateMedia = Effect.fn("ProviderShared.validateMedia")(function* base64 = part.data } - if (Buffer.byteLength(base64, "utf8") > MAX_MEDIA_ENCODED_BYTES) - return yield* invalidRequest(`${route} media exceeds the ${MAX_MEDIA_ENCODED_BYTES} byte encoded limit`) + if (Buffer.byteLength(base64, "utf8") > maxEncoded) + return yield* invalidRequest(`${route} media exceeds the ${maxEncoded} byte encoded limit`) if (!base64 || base64.length % 4 !== 0 || !base64Pattern.test(base64)) return yield* invalidRequest(`${route} media must contain valid base64`) const bytes = Buffer.from(base64, "base64") - if (bytes.byteLength > MAX_MEDIA_DECODED_BYTES) - return yield* invalidRequest(`${route} media exceeds the ${MAX_MEDIA_DECODED_BYTES} byte decoded limit`) + if (bytes.byteLength > maxDecoded) + return yield* invalidRequest(`${route} media exceeds the ${maxDecoded} byte decoded limit`) if (bytes.toString("base64") !== base64) return yield* invalidRequest(`${route} media must contain canonical base64`) return { mime, base64, dataUrl: `data:${mime};base64,${base64}`, bytes } satisfies ValidatedMedia }) -export const validateToolFile = (route: string, part: ToolFileContent, supportedMimes: ReadonlySet) => - validateMedia(route, { type: "media", 
mediaType: part.mime, data: part.uri, filename: part.name }, supportedMimes) +export const validateToolFile = ( + route: string, + part: ToolFileContent, + supportedMimes: ReadonlySet, + limits?: MediaLimits, +) => validateMedia(route, { type: "media", mediaType: part.mime, data: part.uri, filename: part.name }, supportedMimes, limits) export const trimBaseUrl = (value: string) => value.replace(/\/+$/, "") diff --git a/packages/opencode/src/tool/read.ts b/packages/opencode/src/tool/read.ts index 678ed4451048..bfa3a31a8092 100644 --- a/packages/opencode/src/tool/read.ts +++ b/packages/opencode/src/tool/read.ts @@ -17,6 +17,7 @@ const MAX_BYTES = 50 * 1024 const MAX_BYTES_LABEL = `${MAX_BYTES / 1024} KB` const SAMPLE_BYTES = 4096 const SUPPORTED_IMAGE_MIMES = new Set(["image/jpeg", "image/png", "image/gif", "image/webp"]) +const SUPPORTED_VIDEO_MIMES = new Set(["video/mp4", "video/webm", "video/quicktime"]) class ReadStop extends Schema.TaggedErrorClass()("ReadStop", {}) {} @@ -302,10 +303,11 @@ export const ReadTool = Tool.define< const mime = sniffAttachmentMime(sample, FSUtil.mimeType(filepath)) const isImage = SUPPORTED_IMAGE_MIMES.has(mime) + const isVideo = SUPPORTED_VIDEO_MIMES.has(mime) - if (isImage || isPdfAttachment(mime)) { + if (isImage || isVideo || isPdfAttachment(mime)) { const bytes = yield* fs.readFile(filepath) - const msg = isPdfAttachment(mime) ? "PDF read successfully" : "Image read successfully" + const msg = isVideo ? "Video read successfully" : isPdfAttachment(mime) ? "PDF read successfully" : "Image read successfully" return { title, output: msg, diff --git a/packages/opencode/src/tool/read.txt b/packages/opencode/src/tool/read.txt index 368174cc8d14..a2b5fda39d47 100644 --- a/packages/opencode/src/tool/read.txt +++ b/packages/opencode/src/tool/read.txt @@ -11,4 +11,4 @@ Usage: - Any line longer than 2000 characters is truncated. - Call this tool in parallel when you know there are multiple files you want to read. 
- Avoid tiny repeated slices (30 line chunks). If you need more context, read a larger window. -- This tool can read image files and PDFs and return them as file attachments. +- This tool can read image files, video files (mp4, webm), and PDFs and return them as file attachments. diff --git a/packages/opencode/src/util/media.ts b/packages/opencode/src/util/media.ts index 566ac843a634..68159a94baf8 100644 --- a/packages/opencode/src/util/media.ts +++ b/packages/opencode/src/util/media.ts @@ -1,11 +1,22 @@ const startsWith = (bytes: Uint8Array, prefix: number[]) => prefix.every((value, index) => bytes[index] === value) +const MP4_VIDEO_BRANDS = new Set(["isom", "iso2", "iso5", "iso6", "mp41", "mp42", "avc1", "dash", "m4v "]) +const isMp4VideoBrand = (bytes: Uint8Array) => { + const brand = Buffer.from(bytes.subarray(8, 12)).toString("ascii") + return MP4_VIDEO_BRANDS.has(brand) +} +const isWebmDocType = (bytes: Uint8Array) => + Buffer.from(bytes.subarray(0, Math.min(bytes.length, 64))).toString("latin1").includes("webm") export function isPdfAttachment(mime: string) { return mime === "application/pdf" } +export function isVideoAttachment(mime: string) { + return mime.startsWith("video/") +} + export function isMedia(mime: string) { - return mime.startsWith("image/") || isPdfAttachment(mime) + return mime.startsWith("image/") || mime.startsWith("video/") || isPdfAttachment(mime) } export function isImageAttachment(mime: string) { @@ -21,6 +32,9 @@ export function sniffAttachmentMime(bytes: Uint8Array, fallback: string) { if (startsWith(bytes, [0x52, 0x49, 0x46, 0x46]) && startsWith(bytes.subarray(8), [0x57, 0x45, 0x42, 0x50])) { return "image/webp" } + if (bytes.length >= 12 && startsWith(bytes.subarray(4), [0x66, 0x74, 0x79, 0x70]) && isMp4VideoBrand(bytes)) + return "video/mp4" + if (startsWith(bytes, [0x1a, 0x45, 0xdf, 0xa3]) && isWebmDocType(bytes)) return "video/webm" return fallback } From bd14ce72b8c65ace636236e11f29f17c5ff162fa Mon Sep 17 00:00:00 2001 From: 
"Tommy D. Rossi" Date: Thu, 11 Jun 2026 15:13:08 +0200 Subject: [PATCH 2/7] refactor(core): revert read tool video changes, keep protocol-only support Video support is provided through the protocol layer only, so plugins can return video via Tool.make toModelOutput. The native read tool stays image/PDF-only per maintainer preference. Session: ses_1494c002affeaQLbzZve1wSJEm --- packages/core/src/tool/read-filesystem.ts | 14 ++------------ packages/core/src/tool/read.ts | 11 +++-------- packages/opencode/src/tool/read.ts | 6 ++---- packages/opencode/src/tool/read.txt | 2 +- packages/opencode/src/util/media.ts | 16 +--------------- 5 files changed, 9 insertions(+), 40 deletions(-) diff --git a/packages/core/src/tool/read-filesystem.ts b/packages/core/src/tool/read-filesystem.ts index d2e1266ae734..c27bdae6dee8 100644 --- a/packages/core/src/tool/read-filesystem.ts +++ b/packages/core/src/tool/read-filesystem.ts @@ -94,22 +94,12 @@ const extensions = new Set([ ".pyo", ]) const startsWith = (bytes: Uint8Array, prefix: number[]) => prefix.every((value, index) => bytes[index] === value) -const MP4_VIDEO_BRANDS = new Set(["isom", "iso2", "iso5", "iso6", "mp41", "mp42", "avc1", "dash", "m4v "]) -const isMp4VideoBrand = (bytes: Uint8Array) => { - const brand = Buffer.from(bytes.subarray(8, 12)).toString("ascii") - return MP4_VIDEO_BRANDS.has(brand) -} -const isWebmDocType = (bytes: Uint8Array) => - Buffer.from(bytes.subarray(0, Math.min(bytes.length, 64))).toString("latin1").includes("webm") -const mediaMime = (bytes: Uint8Array) => { +const imageMime = (bytes: Uint8Array) => { if (startsWith(bytes, [0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a])) return "image/png" if (startsWith(bytes, [0xff, 0xd8, 0xff])) return "image/jpeg" if (startsWith(bytes, [0x47, 0x49, 0x46, 0x38])) return "image/gif" if (startsWith(bytes, [0x52, 0x49, 0x46, 0x46]) && startsWith(bytes.subarray(8), [0x57, 0x45, 0x42, 0x50])) return "image/webp" - if (bytes.length >= 12 && 
startsWith(bytes.subarray(4), [0x66, 0x74, 0x79, 0x70]) && isMp4VideoBrand(bytes)) - return "video/mp4" - if (startsWith(bytes, [0x1a, 0x45, 0xdf, 0xa3]) && isWebmDocType(bytes)) return "video/webm" } const binary = (resource: string, bytes: Uint8Array) => { if (extensions.has(path.extname(resource).toLowerCase())) return true @@ -145,7 +135,7 @@ export const read = Effect.fn("ReadTool.read")(function* ( yield* file.readAlloc(Math.min(64 * 1024, Number(info.size) || 4 * 1024)).pipe(Effect.orDie), () => new Uint8Array(), ) - const mime = mediaMime(first) + const mime = imageMime(first) if (mime) { if (info.size > MAX_MEDIA_INGEST_BYTES) return yield* Effect.die(new MediaIngestLimitError(resource, MAX_MEDIA_INGEST_BYTES)) diff --git a/packages/core/src/tool/read.ts b/packages/core/src/tool/read.ts index 64979bd2394d..64f02d813fe3 100644 --- a/packages/core/src/tool/read.ts +++ b/packages/core/src/tool/read.ts @@ -15,8 +15,6 @@ import { Tools } from "./tools" export const name = "read" const SUPPORTED_IMAGE_MIMES = new Set(["image/jpeg", "image/png", "image/gif", "image/webp"]) -const SUPPORTED_VIDEO_MIMES = new Set(["video/mp4", "video/webm", "video/quicktime"]) -const SUPPORTED_MEDIA_MIMES = new Set([...SUPPORTED_IMAGE_MIMES, ...SUPPORTED_VIDEO_MIMES]) const LocationInput = Schema.Struct({ path: Schema.String, offset: ReadToolFileSystem.PageInput.fields.offset.annotate({ @@ -42,15 +40,14 @@ export const layer = Layer.effectDiscard( .register({ [name]: Tool.make({ description: - "Read a text file, supported image, or video (mp4, webm), page through a large UTF-8 text file by line offset, or list a directory page. Relative paths resolve from the current location; absolute paths are read directly.", + "Read a text file or supported image, page through a large UTF-8 text file by line offset, or list a directory page. 
Relative paths resolve from the current location; absolute paths are read directly.", input: Input, output: Output, toModelOutput: ({ input, output }) => { - if (!("encoding" in output) || output.encoding !== "base64" || !SUPPORTED_MEDIA_MIMES.has(output.mime)) + if (!("encoding" in output) || output.encoding !== "base64" || !SUPPORTED_IMAGE_MIMES.has(output.mime)) return [] - const label = SUPPORTED_VIDEO_MIMES.has(output.mime) ? "Video" : "Image" return [ - { type: "text", text: `${label} read successfully` }, + { type: "text", text: "Image read successfully" }, { type: "file", data: output.content, mime: output.mime, name: input.path }, ] }, @@ -85,8 +82,6 @@ export const layer = Layer.effectDiscard( .normalize(resource, { ...content, encoding: "base64" }) .pipe(Effect.catchTag("Image.ResizerUnavailableError", () => Effect.succeed(content))) } - if ("encoding" in content && content.encoding === "base64" && SUPPORTED_VIDEO_MIMES.has(content.mime)) - return content if ("encoding" in content && content.encoding === "base64") return yield* Effect.fail(new ReadToolFileSystem.BinaryFileError(resource)) return content diff --git a/packages/opencode/src/tool/read.ts b/packages/opencode/src/tool/read.ts index bfa3a31a8092..678ed4451048 100644 --- a/packages/opencode/src/tool/read.ts +++ b/packages/opencode/src/tool/read.ts @@ -17,7 +17,6 @@ const MAX_BYTES = 50 * 1024 const MAX_BYTES_LABEL = `${MAX_BYTES / 1024} KB` const SAMPLE_BYTES = 4096 const SUPPORTED_IMAGE_MIMES = new Set(["image/jpeg", "image/png", "image/gif", "image/webp"]) -const SUPPORTED_VIDEO_MIMES = new Set(["video/mp4", "video/webm", "video/quicktime"]) class ReadStop extends Schema.TaggedErrorClass()("ReadStop", {}) {} @@ -303,11 +302,10 @@ export const ReadTool = Tool.define< const mime = sniffAttachmentMime(sample, FSUtil.mimeType(filepath)) const isImage = SUPPORTED_IMAGE_MIMES.has(mime) - const isVideo = SUPPORTED_VIDEO_MIMES.has(mime) - if (isImage || isVideo || isPdfAttachment(mime)) { + if 
(isImage || isPdfAttachment(mime)) { const bytes = yield* fs.readFile(filepath) - const msg = isVideo ? "Video read successfully" : isPdfAttachment(mime) ? "PDF read successfully" : "Image read successfully" + const msg = isPdfAttachment(mime) ? "PDF read successfully" : "Image read successfully" return { title, output: msg, diff --git a/packages/opencode/src/tool/read.txt b/packages/opencode/src/tool/read.txt index a2b5fda39d47..368174cc8d14 100644 --- a/packages/opencode/src/tool/read.txt +++ b/packages/opencode/src/tool/read.txt @@ -11,4 +11,4 @@ Usage: - Any line longer than 2000 characters is truncated. - Call this tool in parallel when you know there are multiple files you want to read. - Avoid tiny repeated slices (30 line chunks). If you need more context, read a larger window. -- This tool can read image files, video files (mp4, webm), and PDFs and return them as file attachments. +- This tool can read image files and PDFs and return them as file attachments. diff --git a/packages/opencode/src/util/media.ts b/packages/opencode/src/util/media.ts index 68159a94baf8..566ac843a634 100644 --- a/packages/opencode/src/util/media.ts +++ b/packages/opencode/src/util/media.ts @@ -1,22 +1,11 @@ const startsWith = (bytes: Uint8Array, prefix: number[]) => prefix.every((value, index) => bytes[index] === value) -const MP4_VIDEO_BRANDS = new Set(["isom", "iso2", "iso5", "iso6", "mp41", "mp42", "avc1", "dash", "m4v "]) -const isMp4VideoBrand = (bytes: Uint8Array) => { - const brand = Buffer.from(bytes.subarray(8, 12)).toString("ascii") - return MP4_VIDEO_BRANDS.has(brand) -} -const isWebmDocType = (bytes: Uint8Array) => - Buffer.from(bytes.subarray(0, Math.min(bytes.length, 64))).toString("latin1").includes("webm") export function isPdfAttachment(mime: string) { return mime === "application/pdf" } -export function isVideoAttachment(mime: string) { - return mime.startsWith("video/") -} - export function isMedia(mime: string) { - return mime.startsWith("image/") || 
mime.startsWith("video/") || isPdfAttachment(mime) + return mime.startsWith("image/") || isPdfAttachment(mime) } export function isImageAttachment(mime: string) { @@ -32,9 +21,6 @@ export function sniffAttachmentMime(bytes: Uint8Array, fallback: string) { if (startsWith(bytes, [0x52, 0x49, 0x46, 0x46]) && startsWith(bytes.subarray(8), [0x57, 0x45, 0x42, 0x50])) { return "image/webp" } - if (bytes.length >= 12 && startsWith(bytes.subarray(4), [0x66, 0x74, 0x79, 0x70]) && isMp4VideoBrand(bytes)) - return "video/mp4" - if (startsWith(bytes, [0x1a, 0x45, 0xdf, 0xa3]) && isWebmDocType(bytes)) return "video/webm" return fallback } From ce2bd135b109c6126b764cf9aadb5ca9a197f129 Mon Sep 17 00:00:00 2001 From: "Tommy D. Rossi" Date: Thu, 11 Jun 2026 15:23:44 +0200 Subject: [PATCH 3/7] fix(llm): address review feedback for video media support - Replace base64 regex with character-by-character validator to avoid Bun/JSC regex failure on large strings (>4 MB) - Lower Gemini video limits to 14 MB decoded / 20 MB encoded to stay within Gemini total request size budget - Case-insensitive video MIME detection for limit selection Session: ses_1494c002affeaQLbzZve1wSJEm --- packages/llm/src/protocols/gemini.ts | 6 +++--- packages/llm/src/protocols/shared.ts | 14 ++++++++++++-- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/packages/llm/src/protocols/gemini.ts b/packages/llm/src/protocols/gemini.ts index 076c75a76121..1c7d38e5d32f 100644 --- a/packages/llm/src/protocols/gemini.ts +++ b/packages/llm/src/protocols/gemini.ts @@ -22,7 +22,7 @@ import { Lifecycle } from "./utils/lifecycle" const ADAPTER = "gemini" const MEDIA_MIMES = new Set(ProviderShared.MEDIA_MIMES) -const VIDEO_LIMITS: ProviderShared.MediaLimits = { maxDecodedBytes: 20 * 1024 * 1024, maxEncodedBytes: 28 * 1024 * 1024 } +const VIDEO_LIMITS: ProviderShared.MediaLimits = { maxDecodedBytes: 14 * 1024 * 1024, maxEncodedBytes: 20 * 1024 * 1024 } export const DEFAULT_BASE_URL = 
"https://generativelanguage.googleapis.com/v1beta" // ============================================================================= @@ -183,7 +183,7 @@ const lowerToolConfig = (toolChoice: NonNullable) => const lowerUserPart = Effect.fn("Gemini.lowerUserPart")(function* (part: TextPart | MediaPart) { if (part.type === "text") return { text: part.text } - const limits = part.mediaType.startsWith("video/") ? VIDEO_LIMITS : undefined + const limits = part.mediaType.toLowerCase().startsWith("video/") ? VIDEO_LIMITS : undefined const media = yield* ProviderShared.validateMedia("Gemini", part, MEDIA_MIMES, limits) return { inlineData: { mimeType: media.mime, data: media.base64 } } }) @@ -277,7 +277,7 @@ const lowerMessages = Effect.fn("Gemini.lowerMessages")(function* (request: LLMR }) for (const item of content) { if (item.type === "text") continue - const limits = item.mime.startsWith("video/") ? VIDEO_LIMITS : undefined + const limits = item.mime.toLowerCase().startsWith("video/") ? VIDEO_LIMITS : undefined const media = yield* ProviderShared.validateToolFile("Gemini", item, MEDIA_MIMES, limits) parts.push({ inlineData: { mimeType: media.mime, data: media.base64 } }) } diff --git a/packages/llm/src/protocols/shared.ts b/packages/llm/src/protocols/shared.ts index 9b665ec93c5c..8626c648ea42 100644 --- a/packages/llm/src/protocols/shared.ts +++ b/packages/llm/src/protocols/shared.ts @@ -193,7 +193,17 @@ export const MEDIA_MIMES = [...IMAGE_MIMES, ...VIDEO_MIMES] as const export const MAX_MEDIA_ENCODED_BYTES = 8 * 1024 * 1024 export const MAX_MEDIA_DECODED_BYTES = 6 * 1024 * 1024 -const base64Pattern = /^(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?$/ +const isBase64 = (value: string) => { + if (!value || value.length % 4 !== 0) return false + const padding = value.endsWith("==") ? 2 : value.endsWith("=") ? 
1 : 0 + const end = value.length - padding + for (let i = 0; i < end; i++) { + const c = value.charCodeAt(i) + if ((c >= 65 && c <= 90) || (c >= 97 && c <= 122) || (c >= 48 && c <= 57) || c === 43 || c === 47) continue + return false + } + return true +} export interface ValidatedMedia { readonly mime: string @@ -235,7 +245,7 @@ export const validateMedia = Effect.fn("ProviderShared.validateMedia")(function* if (Buffer.byteLength(base64, "utf8") > maxEncoded) return yield* invalidRequest(`${route} media exceeds the ${maxEncoded} byte encoded limit`) - if (!base64 || base64.length % 4 !== 0 || !base64Pattern.test(base64)) + if (!isBase64(base64)) return yield* invalidRequest(`${route} media must contain valid base64`) const bytes = Buffer.from(base64, "base64") if (bytes.byteLength > maxDecoded) From 1953862b9c5d2245d25d02a211401e956350a627 Mon Sep 17 00:00:00 2001 From: "Tommy D. Rossi" Date: Thu, 11 Jun 2026 15:25:57 +0200 Subject: [PATCH 4/7] feat(llm): add video media support to Gemini protocol Add VIDEO_MIMES and MEDIA_MIMES to ProviderShared and widen Gemini protocol to accept video MIME types (mp4, webm, quicktime) via inlineData. Plugins can now return video content to Gemini models through toModelOutput or v1 ToolAttachment. Other protocols stay image-only. 
Session: ses_1494c002affeaQLbzZve1wSJEm --- packages/llm/src/protocols/gemini.ts | 7 ++--- packages/llm/src/protocols/shared.ts | 42 +++++++--------------------- 2 files changed, 12 insertions(+), 37 deletions(-) diff --git a/packages/llm/src/protocols/gemini.ts b/packages/llm/src/protocols/gemini.ts index 1c7d38e5d32f..3a2311c8fdc1 100644 --- a/packages/llm/src/protocols/gemini.ts +++ b/packages/llm/src/protocols/gemini.ts @@ -22,7 +22,6 @@ import { Lifecycle } from "./utils/lifecycle" const ADAPTER = "gemini" const MEDIA_MIMES = new Set(ProviderShared.MEDIA_MIMES) -const VIDEO_LIMITS: ProviderShared.MediaLimits = { maxDecodedBytes: 14 * 1024 * 1024, maxEncodedBytes: 20 * 1024 * 1024 } export const DEFAULT_BASE_URL = "https://generativelanguage.googleapis.com/v1beta" // ============================================================================= @@ -183,8 +182,7 @@ const lowerToolConfig = (toolChoice: NonNullable) => const lowerUserPart = Effect.fn("Gemini.lowerUserPart")(function* (part: TextPart | MediaPart) { if (part.type === "text") return { text: part.text } - const limits = part.mediaType.toLowerCase().startsWith("video/") ? VIDEO_LIMITS : undefined - const media = yield* ProviderShared.validateMedia("Gemini", part, MEDIA_MIMES, limits) + const media = yield* ProviderShared.validateMedia("Gemini", part, MEDIA_MIMES) return { inlineData: { mimeType: media.mime, data: media.base64 } } }) @@ -277,8 +275,7 @@ const lowerMessages = Effect.fn("Gemini.lowerMessages")(function* (request: LLMR }) for (const item of content) { if (item.type === "text") continue - const limits = item.mime.toLowerCase().startsWith("video/") ? 
VIDEO_LIMITS : undefined - const media = yield* ProviderShared.validateToolFile("Gemini", item, MEDIA_MIMES, limits) + const media = yield* ProviderShared.validateToolFile("Gemini", item, MEDIA_MIMES) parts.push({ inlineData: { mimeType: media.mime, data: media.base64 } }) } } diff --git a/packages/llm/src/protocols/shared.ts b/packages/llm/src/protocols/shared.ts index 8626c648ea42..51828248e8eb 100644 --- a/packages/llm/src/protocols/shared.ts +++ b/packages/llm/src/protocols/shared.ts @@ -193,17 +193,7 @@ export const MEDIA_MIMES = [...IMAGE_MIMES, ...VIDEO_MIMES] as const export const MAX_MEDIA_ENCODED_BYTES = 8 * 1024 * 1024 export const MAX_MEDIA_DECODED_BYTES = 6 * 1024 * 1024 -const isBase64 = (value: string) => { - if (!value || value.length % 4 !== 0) return false - const padding = value.endsWith("==") ? 2 : value.endsWith("=") ? 1 : 0 - const end = value.length - padding - for (let i = 0; i < end; i++) { - const c = value.charCodeAt(i) - if ((c >= 65 && c <= 90) || (c >= 97 && c <= 122) || (c >= 48 && c <= 57) || c === 43 || c === 47) continue - return false - } - return true -} +const base64Pattern = /^(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?$/ export interface ValidatedMedia { readonly mime: string @@ -212,26 +202,18 @@ export interface ValidatedMedia { readonly bytes: Uint8Array } -export interface MediaLimits { - readonly maxDecodedBytes?: number - readonly maxEncodedBytes?: number -} - export const validateMedia = Effect.fn("ProviderShared.validateMedia")(function* ( route: string, part: MediaPart, supportedMimes: ReadonlySet, - limits?: MediaLimits, ) { - const maxDecoded = limits?.maxDecodedBytes ?? MAX_MEDIA_DECODED_BYTES - const maxEncoded = limits?.maxEncodedBytes ?? 
MAX_MEDIA_ENCODED_BYTES const mime = part.mediaType.toLowerCase() if (!supportedMimes.has(mime)) return yield* invalidRequest(`${route} does not support media type ${part.mediaType}`) let base64: string if (typeof part.data !== "string") { - if (part.data.byteLength > maxDecoded) - return yield* invalidRequest(`${route} media exceeds the ${maxDecoded} byte decoded limit`) + if (part.data.byteLength > MAX_MEDIA_DECODED_BYTES) + return yield* invalidRequest(`${route} media exceeds the ${MAX_MEDIA_DECODED_BYTES} byte decoded limit`) base64 = Buffer.from(part.data).toString("base64") } else if (part.data.startsWith("data:")) { const match = /^data:([^;,]+);base64,([A-Za-z0-9+/]*={0,2})$/s.exec(part.data) @@ -243,23 +225,19 @@ export const validateMedia = Effect.fn("ProviderShared.validateMedia")(function* base64 = part.data } - if (Buffer.byteLength(base64, "utf8") > maxEncoded) - return yield* invalidRequest(`${route} media exceeds the ${maxEncoded} byte encoded limit`) - if (!isBase64(base64)) + if (Buffer.byteLength(base64, "utf8") > MAX_MEDIA_ENCODED_BYTES) + return yield* invalidRequest(`${route} media exceeds the ${MAX_MEDIA_ENCODED_BYTES} byte encoded limit`) + if (!base64 || base64.length % 4 !== 0 || !base64Pattern.test(base64)) return yield* invalidRequest(`${route} media must contain valid base64`) const bytes = Buffer.from(base64, "base64") - if (bytes.byteLength > maxDecoded) - return yield* invalidRequest(`${route} media exceeds the ${maxDecoded} byte decoded limit`) + if (bytes.byteLength > MAX_MEDIA_DECODED_BYTES) + return yield* invalidRequest(`${route} media exceeds the ${MAX_MEDIA_DECODED_BYTES} byte decoded limit`) if (bytes.toString("base64") !== base64) return yield* invalidRequest(`${route} media must contain canonical base64`) return { mime, base64, dataUrl: `data:${mime};base64,${base64}`, bytes } satisfies ValidatedMedia }) -export const validateToolFile = ( - route: string, - part: ToolFileContent, - supportedMimes: ReadonlySet, - limits?: 
MediaLimits, -) => validateMedia(route, { type: "media", mediaType: part.mime, data: part.uri, filename: part.name }, supportedMimes, limits) +export const validateToolFile = (route: string, part: ToolFileContent, supportedMimes: ReadonlySet<string>) => + validateMedia(route, { type: "media", mediaType: part.mime, data: part.uri, filename: part.name }, supportedMimes) export const trimBaseUrl = (value: string) => value.replace(/\/+$/, "") From 40a9133f5dd2b5856f14ddf39857a943d1064d01 Mon Sep 17 00:00:00 2001 From: "Tommy D. Rossi" Date: Thu, 11 Jun 2026 16:18:38 +0200 Subject: [PATCH 5/7] feat(llm): raise media size limits to 20 MB decoded / 28 MB encoded Previous 6 MB / 8 MB limits were too tight for video. Raise to 20 MB decoded (raw bytes) / 28 MB encoded (base64 string) to support reasonable video clip sizes. Session: ses_1494c002affeaQLbzZve1wSJEm --- packages/llm/src/protocols/shared.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/llm/src/protocols/shared.ts b/packages/llm/src/protocols/shared.ts index 51828248e8eb..b6bb5b890d9f 100644 --- a/packages/llm/src/protocols/shared.ts +++ b/packages/llm/src/protocols/shared.ts @@ -190,8 +190,8 @@ export const parseToolInput = (route: string, name: string, raw: string) => export const IMAGE_MIMES = ["image/png", "image/jpeg", "image/gif", "image/webp"] as const export const VIDEO_MIMES = ["video/mp4", "video/webm", "video/quicktime"] as const export const MEDIA_MIMES = [...IMAGE_MIMES, ...VIDEO_MIMES] as const -export const MAX_MEDIA_ENCODED_BYTES = 8 * 1024 * 1024 -export const MAX_MEDIA_DECODED_BYTES = 6 * 1024 * 1024 +export const MAX_MEDIA_ENCODED_BYTES = 28 * 1024 * 1024 +export const MAX_MEDIA_DECODED_BYTES = 20 * 1024 * 1024 const base64Pattern = /^(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?$/ From 5cf63f2e313c6080a872387fb57e1d6a2a0edde0 Mon Sep 17 00:00:00 2001 From: "Tommy D.
Rossi" Date: Mon, 15 Jun 2026 10:42:55 +0200 Subject: [PATCH 6/7] feat(llm): add audio media support to Gemini protocol Add AUDIO_MIMES array with 12 audio types supported by the Gemini API (wav, mp3, mpeg, aiff, aac, ogg, flac, m4a, mp4, opus, pcm, webm) and include it in MEDIA_MIMES. Since Gemini creates its local supported set from ProviderShared.MEDIA_MIMES, audio flows through the same inlineData path as images and video with zero protocol changes needed. Other protocols (OpenAI Chat, OpenAI Responses, Anthropic, Bedrock) are unaffected since they reference IMAGE_MIMES directly. Session: ses_135abc74affeXj589B4g7YkPb0 --- packages/llm/src/protocols/shared.ts | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/packages/llm/src/protocols/shared.ts b/packages/llm/src/protocols/shared.ts index b6bb5b890d9f..c4f67dd03f31 100644 --- a/packages/llm/src/protocols/shared.ts +++ b/packages/llm/src/protocols/shared.ts @@ -189,7 +189,21 @@ export const parseToolInput = (route: string, name: string, raw: string) => export const IMAGE_MIMES = ["image/png", "image/jpeg", "image/gif", "image/webp"] as const export const VIDEO_MIMES = ["video/mp4", "video/webm", "video/quicktime"] as const -export const MEDIA_MIMES = [...IMAGE_MIMES, ...VIDEO_MIMES] as const +export const AUDIO_MIMES = [ + "audio/wav", + "audio/mp3", + "audio/mpeg", + "audio/aiff", + "audio/aac", + "audio/ogg", + "audio/flac", + "audio/m4a", + "audio/mp4", + "audio/opus", + "audio/pcm", + "audio/webm", +] as const +export const MEDIA_MIMES = [...IMAGE_MIMES, ...VIDEO_MIMES, ...AUDIO_MIMES] as const export const MAX_MEDIA_ENCODED_BYTES = 28 * 1024 * 1024 export const MAX_MEDIA_DECODED_BYTES = 20 * 1024 * 1024 From 963371a23ae1043858fe01982f2d1fc5ec6a2124 Mon Sep 17 00:00:00 2001 From: "Tommy D. 
Rossi" Date: Mon, 15 Jun 2026 10:44:13 +0200 Subject: [PATCH 7/7] fix(llm): restrict audio MIME types to Gemini-supported formats Remove 6 audio types not listed in Gemini docs (mpeg, m4a, mp4, opus, pcm, webm). Keep only the 6 officially supported formats: wav, mp3, aiff, aac, ogg, flac. Ref: https://ai.google.dev/gemini-api/docs/audio#supported-audio-formats Session: ses_135abc74affeXj589B4g7YkPb0 --- packages/llm/src/protocols/shared.ts | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/packages/llm/src/protocols/shared.ts b/packages/llm/src/protocols/shared.ts index c4f67dd03f31..66b353c82854 100644 --- a/packages/llm/src/protocols/shared.ts +++ b/packages/llm/src/protocols/shared.ts @@ -189,20 +189,7 @@ export const parseToolInput = (route: string, name: string, raw: string) => export const IMAGE_MIMES = ["image/png", "image/jpeg", "image/gif", "image/webp"] as const export const VIDEO_MIMES = ["video/mp4", "video/webm", "video/quicktime"] as const -export const AUDIO_MIMES = [ - "audio/wav", - "audio/mp3", - "audio/mpeg", - "audio/aiff", - "audio/aac", - "audio/ogg", - "audio/flac", - "audio/m4a", - "audio/mp4", - "audio/opus", - "audio/pcm", - "audio/webm", -] as const +export const AUDIO_MIMES = ["audio/wav", "audio/mp3", "audio/aiff", "audio/aac", "audio/ogg", "audio/flac"] as const export const MEDIA_MIMES = [...IMAGE_MIMES, ...VIDEO_MIMES, ...AUDIO_MIMES] as const export const MAX_MEDIA_ENCODED_BYTES = 28 * 1024 * 1024 export const MAX_MEDIA_DECODED_BYTES = 20 * 1024 * 1024