diff --git a/common/arg.cpp b/common/arg.cpp
index 05f4a5244e7..e2c684f3498 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -58,6 +58,7 @@ static std::initializer_list<enum llama_example> mmproj_examples = {
     LLAMA_EXAMPLE_MTMD,
     LLAMA_EXAMPLE_SERVER,
     LLAMA_EXAMPLE_CLI,
+    LLAMA_EXAMPLE_LIQUID_AUDIO,
 };
 
 static std::string read_file(const std::string & fname) {
@@ -1345,7 +1346,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.system_prompt = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION, LLAMA_EXAMPLE_MTMD}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_LIQUID_AUDIO}));
     add_opt(common_arg(
         {"--perf"},
         {"--no-perf"},
@@ -2165,7 +2166,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 params.image.emplace_back(item);
             }
         }
-    ).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_CLI}));
+    ).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_LIQUID_AUDIO}));
     add_opt(common_arg(
         {"--image-min-tokens"}, "N",
         "minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)",
@@ -2659,7 +2660,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.out_file = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE}));
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_LIQUID_AUDIO}));
     add_opt(common_arg(
         {"-ofreq", "--output-frequency"}, "N",
         string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
@@ -2791,14 +2792,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.hostname = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_LIQUID_AUDIO}).set_env("LLAMA_ARG_HOST"));
     add_opt(common_arg(
         {"--port"}, "PORT",
         string_format("port to listen (default: %d)", params.port),
         [](common_params & params, int value) {
             params.port = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_LIQUID_AUDIO}).set_env("LLAMA_ARG_PORT"));
     add_opt(common_arg(
         {"--path"}, "PATH",
         string_format("path to serve static files from (default: %s)", params.public_path.c_str()),
@@ -3497,7 +3498,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.vocoder.model.path = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_LIQUID_AUDIO}));
      add_opt(common_arg(
         {"--tts-use-guide-tokens"},
         "Use guide tokens to improve TTS word recall",
@@ -3511,7 +3512,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.vocoder.speaker_file = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_TTS}));
+    ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_LIQUID_AUDIO}));
 
     add_opt(common_arg(
         {"--diffusion-steps"}, "N",
diff --git a/common/common.h b/common/common.h
index c5a80375713..f52916a0884 100644
--- a/common/common.h
+++ b/common/common.h
@@ -104,6 +104,7 @@ enum llama_example {
     LLAMA_EXAMPLE_DIFFUSION,
     LLAMA_EXAMPLE_FINETUNE,
     LLAMA_EXAMPLE_FIT_PARAMS,
+    LLAMA_EXAMPLE_LIQUID_AUDIO,
 
     LLAMA_EXAMPLE_COUNT,
 };
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 09544173981..c01fa51b95d 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -10901,6 +10901,25 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
         yield f"{self.dense_tensor_name}.weight", tensor.clone()
 
 
+@ModelBase.register("Lfm25AudioTokenizer")
+class LFM25AudioTokenizer(LFM2Model):
+    """Converts checkpoints registered as Lfm25AudioTokenizer, written out as MODEL_ARCH.LFM2."""
+    model_arch = gguf.MODEL_ARCH.LFM2
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
+        self.gguf_writer.add_embedding_length_out(self.hparams.get("output_size"))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name == "istft.window" or name.startswith("emb.emb"):
+            return []  # these tensors are not exported
+        if name.startswith("lin"):
+            # rewrite only the leading "lin"; str.replace() would also touch later occurrences
+            name = "dense_2_out" + name[len("lin"):]
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("Lfm2MoeForCausalLM")
 class LFM2MoeModel(TextModel):
     model_arch = gguf.MODEL_ARCH.LFM2MOE
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 24770430e1c..b76cbcfaeb1 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -832,6 +832,13 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
         quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos;
 
+        // do not quantize conv weights
+        quantize &= name.find("conv.dw.weight") == std::string::npos;
+        quantize &= name.find("conv.pw1.weight") == std::string::npos;
+        quantize &= name.find("conv.pw2.weight") == std::string::npos;
+        quantize &= name.find("conv1d") == std::string::npos;
+        quantize &= name.find("conv_dw.weight") == std::string::npos;
+
         // do not quantize relative position bias (T5)
         quantize &= name.find("attn_rel_b.weight") == std::string::npos;
 
diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
index 518f8b9ae74..746df5a91c3 100644
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt
@@ -37,4 +37,5 @@ else()
         add_subdirectory(export-lora)
     endif()
     add_subdirectory(fit-params)
+    add_subdirectory(liquid-audio)
 endif()
diff --git a/tools/liquid-audio/CMakeLists.txt b/tools/liquid-audio/CMakeLists.txt
new file mode 100644
index 00000000000..cfe27235227
--- /dev/null
+++ b/tools/liquid-audio/CMakeLists.txt
@@ -0,0 +1,22 @@
+# lib: runner code shared by the two executables below
+set(TARGET_LIB liquid-audio)
+add_library(${TARGET_LIB} runner.cpp)
+target_include_directories(${TARGET_LIB} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+target_link_libraries(${TARGET_LIB} PUBLIC llama common mtmd ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET_LIB} PRIVATE cxx_std_17)
+
+# cli executable, installed only when LLAMA_TOOLS_INSTALL is set
+set(TARGET_CLI llama-liquid-audio-cli)
+add_executable(${TARGET_CLI} cli.cpp)
+target_link_libraries(${TARGET_CLI} PRIVATE ${TARGET_LIB})
+if(LLAMA_TOOLS_INSTALL)
+    install(TARGETS ${TARGET_CLI} RUNTIME)
+endif()
+
+# server executable (additionally links cpp-httplib), installed only when LLAMA_TOOLS_INSTALL is set
+set(TARGET_SERVER llama-liquid-audio-server)
+add_executable(${TARGET_SERVER} server.cpp)
+target_link_libraries(${TARGET_SERVER} PRIVATE ${TARGET_LIB} cpp-httplib)
+if(LLAMA_TOOLS_INSTALL)
+    install(TARGETS ${TARGET_SERVER} RUNTIME)
+endif()
diff --git a/tools/liquid-audio/README.md b/tools/liquid-audio/README.md
new file mode 100644
index 00000000000..9915dedd568
--- /dev/null
+++ b/tools/liquid-audio/README.md
@@ -0,0 +1,116 @@
+---
+license: other
+license_name: lfm1.0
+license_link: LICENSE
+language:
+- en
+tags:
+- liquid
+- lfm2.5
+- edge
+- llama.cpp
+- audio
+- speech
+- gguf
+base_model:
+- LiquidAI/LFM2.5-Audio-1.5B
+widget:
+  - text: "Demo"
+    output:
+      url: demo.mp4
+---
+
+<div align="center">
+  <img
+    src="https://cdn-uploads.huggingface.co/production/uploads/61b8e2ba285851687028d395/2b08LKpev0DNEk6DlnWkY.png"
+    alt="Liquid AI"
+    style="width: 100%; max-width: 100%; height: auto; display: inline-block; margin-bottom: 0.5em; margin-top: 0.5em;"
+  />
+  <div style="display: flex; justify-content: center; gap: 0.5em; margin-bottom: 1em;">
+    <a href="https://playground.liquid.ai/"><strong>Try LFM</strong></a> •
+    <a href="https://docs.liquid.ai/lfm"><strong>Documentation</strong></a> •
+    <a href="https://leap.liquid.ai/"><strong>LEAP</strong></a>
+  </div>
+</div>
+
+# LFM2.5-Audio-1.5B
+
+Find more details in the original model card: https://huggingface.co/LiquidAI/LFM2.5-Audio-1.5B
+
+## Runners
+
+`runners` folder contains runners for various architectures including
+
+- llama-liquid-audio-cli
+- llama-liquid-audio-server
+
+## Convert GGUFs
+
+```bash
+export CKPT=/path/to/LFM2.5-Audio-1.5B
+export MODEL=LFM2.5-Audio-1.5B
+# backbone
+python convert_hf_to_gguf.py $CKPT --outfile $CKPT/${MODEL}-F16.gguf --outtype f16
+./llama-quantize $CKPT/${MODEL}-F16.gguf $CKPT/${MODEL}-Q8_0.gguf Q8_0
+./llama-quantize $CKPT/${MODEL}-F16.gguf $CKPT/${MODEL}-Q4_0.gguf Q4_0
+# mmproj
+python convert_hf_to_gguf.py $CKPT --mmproj --outfile $CKPT/mmproj-${MODEL}-F16.gguf --outtype f16
+./llama-quantize $CKPT/mmproj-${MODEL}-F16.gguf $CKPT/mmproj-${MODEL}-Q8_0.gguf Q8_0
+./llama-quantize $CKPT/mmproj-${MODEL}-F16.gguf $CKPT/mmproj-${MODEL}-Q4_0.gguf Q4_0
+# vocoder
+python tools/liquid-audio/convert_vocoder_to_gguf.py $CKPT --outfile $CKPT/vocoder-${MODEL}-F16.gguf --outtype f16
+python tools/liquid-audio/convert_vocoder_to_gguf.py $CKPT --outfile $CKPT/vocoder-${MODEL}-Q8_0.gguf --outtype q8_0
+python tools/liquid-audio/convert_vocoder_to_gguf.py $CKPT --outfile $CKPT/vocoder-${MODEL}-Q4_0.gguf --outtype q4_0
+# tokenizer
+python convert_hf_to_gguf.py $CKPT/audio_detokenizer --outfile $CKPT/tokenizer-${MODEL}-F16.gguf --outtype f16
+./llama-quantize $CKPT/tokenizer-${MODEL}-F16.gguf $CKPT/tokenizer-${MODEL}-Q8_0.gguf Q8_0
+./llama-quantize $CKPT/tokenizer-${MODEL}-F16.gguf $CKPT/tokenizer-${MODEL}-Q4_0.gguf Q4_0
+```
+
+# 🏃 How to run LFM2.5
+
+## CLI
+
+Set env variables.
+```
+export CKPT=/path/to/LFM2.5-Audio-1.5B-GGUF
+export INPUT_WAV=/path/to/input.wav
+export OUTPUT_WAV=/path/to/output.wav
+```
+
+### ASR (audio -> text)
+
+```bash
+./llama-liquid-audio-cli -m $CKPT/LFM2.5-Audio-1.5B-Q4_0.gguf -mm $CKPT/mmproj-LFM2.5-Audio-1.5B-Q4_0.gguf -mv $CKPT/vocoder-LFM2.5-Audio-1.5B-Q4_0.gguf --tts-speaker-file $CKPT/tokenizer-LFM2.5-Audio-1.5B-Q4_0.gguf -sys "Perform ASR." --audio $INPUT_WAV
+```
+
+### TTS (text -> audio)
+
+```bash
+./llama-liquid-audio-cli -m $CKPT/LFM2.5-Audio-1.5B-Q4_0.gguf -mm $CKPT/mmproj-LFM2.5-Audio-1.5B-Q4_0.gguf -mv $CKPT/vocoder-LFM2.5-Audio-1.5B-Q4_0.gguf --tts-speaker-file $CKPT/tokenizer-LFM2.5-Audio-1.5B-Q4_0.gguf -sys "Perform TTS. Use the US male voice." -p "Hi, how are you?" --output $OUTPUT_WAV
+```
+
+### Interleaved (audio/text -> audio + text)
+
+```bash
+./llama-liquid-audio-cli -m $CKPT/LFM2.5-Audio-1.5B-Q4_0.gguf -mm $CKPT/mmproj-LFM2.5-Audio-1.5B-Q4_0.gguf -mv $CKPT/vocoder-LFM2.5-Audio-1.5B-Q4_0.gguf --tts-speaker-file $CKPT/tokenizer-LFM2.5-Audio-1.5B-Q4_0.gguf -sys "Respond with interleaved text and audio." --audio $INPUT_WAV --output $OUTPUT_WAV
+```
+
+
+## Server
+
+Start server
+```
+export CKPT=/path/to/LFM2.5-Audio-1.5B-GGUF
+./llama-liquid-audio-server -m $CKPT/LFM2.5-Audio-1.5B-Q4_0.gguf -mm $CKPT/mmproj-LFM2.5-Audio-1.5B-Q4_0.gguf -mv $CKPT/vocoder-LFM2.5-Audio-1.5B-Q4_0.gguf --tts-speaker-file $CKPT/tokenizer-LFM2.5-Audio-1.5B-Q4_0.gguf
+```
+
+Use the `liquid_audio_chat.py` script to communicate with the server.
+
+```bash
+uv run liquid_audio_chat.py
+```
+
+# Demo
+
+<Gallery />
diff --git a/tools/liquid-audio/cli.cpp b/tools/liquid-audio/cli.cpp
new file mode 100644
index 00000000000..2bdbbc19514
--- /dev/null
+++ b/tools/liquid-audio/cli.cpp
@@ -0,0 +1,191 @@
+#include "mtmd-helper.h"
+#include "mtmd.h"
+#include "runner.h"
+
+//
+#include "arg.h"
+#include "common.h"
+#include "ggml.h"
+#include "log.h"
+
+#include <algorithm>
+
+namespace {
+// Read the entire file `fname` into a byte buffer; any I/O failure is logged and ends the process.
+std::vector<std::byte> load_file(const char * fname) {
+    FILE * f = fopen(fname, "rb");
+    if (!f) {
+        LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno));
+        exit(1);
+    }
+    // ftell() reports failure as -1, which must never reach the buffer size below
+    const long file_size = fseek(f, 0, SEEK_END) == 0 ? ftell(f) : -1L;
+    if (file_size < 0 || fseek(f, 0, SEEK_SET) != 0) {
+        LOG_ERR("Unable to determine size of file %s: %s\n", fname, strerror(errno));
+        exit(1);
+    }
+    std::vector<std::byte> buf(static_cast<size_t>(file_size));
+    const size_t           n_read = fread(buf.data(), 1, buf.size(), f);
+    fclose(f);
+    if (n_read != buf.size()) {
+        LOG_ERR("Failed to read entire file %s\n", fname);
+        exit(1);
+    }
+    return buf;
+}
+}  // namespace
+
+#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__))
+#    include <signal.h>
+#    include <unistd.h>
+#elif defined(_WIN32)
+#    define WIN32_LEAN_AND_MEAN
+#    ifndef NOMINMAX
+#        define NOMINMAX
+#    endif
+#    include <signal.h>
+#    include <windows.h>
+#endif
+
+// Usage banner handed to common_params_parse() and printed by main() when parsing fails.
+static void show_additional_info(int /*argc*/, char ** argv) {
+    LOG("CLI for LFM2.5-Audio-1.5B\n\n"
+        "Usage: %s [options] -m <model.gguf> --mmproj <mmproj.gguf> -mv <vocoder.gguf> "
+        "--tts-speaker-file <tokenizer.gguf> "
+        "-sys <system_prompt> [--audio <audio>] [-p <user_prompt>]\n"
+        "  --audio, -p, --output can be required depending on <system_prompt>\n",
+        argv[0]);
+}
+
+#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) || defined(_WIN32)
+static void sigint_handler(int signo) {  // Ctrl+C hook installed by main(): log, then exit at once
+    if (signo != SIGINT) {
+        return;
+    }
+    LOG("Force exiting...\n");  // TODO(tarek): make this more graceful
+    exit(1);
+}
+#endif
+
+// Map the system prompt to the output modalities to request from the model. Only the exact
+// prompts listed below are accepted; an empty or unknown prompt is logged and exits the process.
+static std::vector<mtmd_output_modality> get_modalities_from_system_prompt(std::string const & system_prompt) {
+    if (system_prompt.empty()) {
+        LOG_ERR("ERR: -sys is required\n");
+        exit(1);
+    }
+    static const std::string              prompt_asr         = "Perform ASR.";
+    static const std::string              prompt_interleaved = "Respond with interleaved text and audio.";
+    static const std::vector<std::string> prompts_tts        = {
+        "Perform TTS. Use the US male voice.",
+        "Perform TTS. Use the UK male voice.",
+        "Perform TTS. Use the US female voice.",
+        "Perform TTS. Use the UK female voice.",
+    };
+    if (system_prompt == prompt_asr) {
+        return { MTMD_OUTPUT_MODALITY_TEXT };
+    }
+    if (system_prompt == prompt_interleaved) {
+        return { MTMD_OUTPUT_MODALITY_AUDIO, MTMD_OUTPUT_MODALITY_TEXT };
+    }
+    for (const auto & tts : prompts_tts) {
+        if (system_prompt == tts) {
+            return { MTMD_OUTPUT_MODALITY_AUDIO };
+        }
+    }
+
+    // nothing matched: list every accepted prompt (TTS variants first) and bail out
+    std::string err = "Unsupported system prompt. Supported prompts are:\n";
+    for (const auto & p : prompts_tts) {
+        err += " - " + p + "\n";
+    }
+    err += " - " + prompt_asr + "\n";
+    err += " - " + prompt_interleaved + "\n";
+    LOG_ERR("%s", err.c_str());
+    exit(1);
+}
+
+// Entry point: parse arguments, set up the runner, run one generation, then print text / save audio.
+int main(int argc, char ** argv) {
+    // Ctrl+C handling
+    {
+#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__))
+        struct sigaction sigint_action;
+        sigint_action.sa_handler = sigint_handler;
+        sigemptyset(&sigint_action.sa_mask);
+        sigint_action.sa_flags = 0;
+        sigaction(SIGINT, &sigint_action, NULL);
+#elif defined(_WIN32)
+        auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
+            return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
+        };
+        SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
+#endif
+    }
+
+    ggml_time_init();
+    common_params params;
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LIQUID_AUDIO, show_additional_info)) {
+        show_additional_info(argc, argv);
+        return 1;
+    }
+
+    common_init();
+
+    // validate the system prompt before the runner is initialised, so a bad -sys fails immediately
+    auto modalities = get_modalities_from_system_prompt(params.system_prompt);
+
+    // set default context size if not specified
+    if (params.n_ctx == 0) {
+        params.n_ctx = 4096;
+    }
+
+    liquid::audio::Runner runner;
+    if (0 != runner.init(params)) {
+        exit(1);
+    }
+
+    // prepare inputs: system prompt first, then the optional text and file-backed user turns
+    std::vector<liquid::audio::Runner::Message> messages;
+    messages.push_back({ "system", params.system_prompt, {} });
+    if (!params.prompt.empty()) {
+        messages.push_back({ "user", params.prompt, {} });
+    }
+    if (!params.image.empty()) {  // only the first entry of params.image is loaded
+        messages.push_back({ "user", mtmd_default_marker(), load_file(params.image[0].c_str()) });
+    }
+
+    std::string                      generated_text;
+    liquid::audio::generated_audio_t generated_audio;
+
+    auto text_cb = [&generated_text](const std::string & text) {
+        generated_text += text;
+    };
+    auto audio_cb = [&generated_audio](const std::vector<int16_t> & audio) {
+        generated_audio.insert(generated_audio.end(), audio.begin(), audio.end());
+    };
+
+    if (0 != runner.generate(messages, params.n_predict, text_cb, audio_cb, modalities)) {
+        exit(1);
+    }
+    LOG("\n");
+
+    // write output (plain `!` instead of the `not` token, which MSVC only accepts in conforming mode)
+    if (!generated_audio.empty()) {
+        if (params.out_file.empty()) {
+            LOG_ERR("ERR: --output is required for audio generation\n");
+            return 1;
+        }
+        if (!mtmd_helper_save_wav(params.out_file.c_str(), generated_audio.data(), generated_audio.size(),
+                                  runner.get_output_sample_rate())) {
+            exit(1);
+        }
+        LOG("=== GENERATED AUDIO ===\nSaved to %s\n\n", params.out_file.c_str());
+    }
+
+    if (!generated_text.empty()) {
+        LOG("=== GENERATED TEXT ===\n%s\n\n", generated_text.c_str());
+    }
+
+    return 0;
+}
diff --git a/tools/liquid-audio/convert_vocoder_to_gguf.py b/tools/liquid-audio/convert_vocoder_to_gguf.py
new file mode 100644
index 00000000000..d2e91df3ee2
--- /dev/null
+++ b/tools/liquid-audio/convert_vocoder_to_gguf.py
@@ -0,0 +1,204 @@
+from pathlib import Path
+from safetensors import safe_open
+from torch import Tensor
+from typing import Union
+import argparse
+import gguf
+import json
+import logging
+import torch
+
+logger = logging.getLogger()
+
+
+class Lfm2AudioDecoderModelConverter:
+    """Packs the LFM2-Audio decoder and audio detokenizer tensors into one GGUF file."""
+
+    gguf_writer: gguf.GGUFWriter
+    fname_out: Path
+    ftype: gguf.LlamaFileType
+    decoder_tensors: dict[str, Tensor]
+    detokenizer_tensors: dict[str, Tensor]
+
+    def __init__(
+        self,
+        pretrained_path: Union[Path, str],
+        fname_out: Path,
+        ftype: gguf.LlamaFileType,
+    ):
+        # the signature admits str, but the "/" joins below need a Path
+        pretrained_path = Path(pretrained_path)
+        self.fname_out = fname_out
+        self.ftype = ftype
+        self.gguf_writer = gguf.GGUFWriter(
+            path=None,
+            arch="this model cannot be used as LLM, use it via --model-vocoder in TTS examples",
+            endianess=gguf.GGUFEndian.LITTLE,
+        )
+
+        self.decoder_tensors = self.load_tensors(
+            pretrained_path / "model.safetensors",
+            Lfm2AudioDecoderModelConverter._is_decoder_tensor,
+        )
+        self.detokenizer_tensors = self.load_tensors(
+            pretrained_path / "audio_detokenizer" / "model.safetensors", lambda _: True
+        )
+
+        all_tensors = self.detokenizer_tensors | self.decoder_tensors
+        for name, data_torch in all_tensors.items():
+            # convert any unsupported data types to float32
+            old_dtype = data_torch.dtype
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+            self.add_tensor(name, data_torch, old_dtype)
+
+        # populate config entries
+        with open(pretrained_path / "config.json", "r", encoding="utf-8") as f:
+            config_json = json.load(f)
+        architectures = config_json.get("architectures")
+        if architectures != ["Lfm2AudioForConditionalGeneration"]:
+            # raised explicitly because a bare assert disappears under `python -O`
+            raise ValueError(f"unexpected architectures in config.json: {architectures}")
+        depthformer = config_json["depthformer"]
+        self.gguf_writer.add_uint32("depthformer_n_layer", depthformer["layers"])
+        self.gguf_writer.add_uint32("depthformer_n_embd", depthformer["dim"])
+
+    def load_tensors(self, path, predicate):
+        """Return {key: tensor} for each key of the safetensors file at `path` that `predicate` accepts."""
+        tensors = {}
+        with safe_open(path, framework="pt") as f:
+            for key in f.keys():
+                if predicate(key):
+                    tensors[key] = f.get_tensor(key)
+        return tensors
+
+    def add_tensor(self, name: str, data_torch: Tensor, old_dtype: torch.dtype):
+        """Queue one tensor on the writer, quantized per `self.ftype` where allowed."""
+        if name.startswith("lfm") or name.startswith("lin"):
+            return  # tensors with these prefixes are not written by this converter
+
+        is_1d = len(data_torch.shape) == 1
+        is_bias = ".bias" in name
+        can_quantize = not is_1d and not is_bias
+        data_qtype = gguf.GGMLQuantizationType.F32
+
+        # conv kernels are always F32
+        if ".conv.weight" in name:
+            data_torch = data_torch.squeeze(1)
+            can_quantize = False
+
+        # shorten name, otherwise it will be too long for ggml to read
+        name = name.replace("bounded_attention", "attention")
+
+        if can_quantize:
+            if self.ftype == gguf.LlamaFileType.ALL_F32:
+                data_qtype = gguf.GGMLQuantizationType.F32
+            elif self.ftype == gguf.LlamaFileType.MOSTLY_F16:
+                data_qtype = gguf.GGMLQuantizationType.F16
+            elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
+                data_qtype = gguf.GGMLQuantizationType.BF16
+            elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
+                data_qtype = gguf.GGMLQuantizationType.Q8_0
+            elif self.ftype == gguf.LlamaFileType.MOSTLY_Q4_0:
+                data_qtype = gguf.GGMLQuantizationType.Q4_0
+            else:
+                raise ValueError(f"Unsupported file type: {self.ftype}")
+
+        data = data_torch.numpy()
+        try:
+            data = gguf.quants.quantize(data, data_qtype)
+        except Exception as e:
+            logger.error(f"Error quantizing tensor '{name}': {e}, fallback to F16")
+            data_qtype = gguf.GGMLQuantizationType.F16
+            data = gguf.quants.quantize(data, data_qtype)
+
+        # reverse shape to make it similar to the internal ggml dimension order
+        shape_str = f"{{{', '.join(str(n) for n in reversed(data_torch.shape))}}}"
+        logger.info(f"{'%-32s' % f'{name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
+        self.gguf_writer.add_tensor(name, data, raw_dtype=data_qtype)
+
+    @staticmethod
+    def _is_decoder_tensor(key):
+        """True when `key` starts with one of the decoder tensor prefixes listed below."""
+        audio_out_tensor_prefixes = [
+            "depthformer",
+            "depth_embeddings",
+            "depth_linear",
+            "audio_embedding",
+        ]
+        return any(key.startswith(p) for p in audio_out_tensor_prefixes)
+
+    def write(self):
+        """Write header, key/value data and tensors to `self.fname_out`, then close the writer."""
+        self.gguf_writer.write_header_to_file(path=self.fname_out)
+        self.gguf_writer.write_kv_data_to_file()
+        self.gguf_writer.write_tensors_to_file(progress=True)
+        self.gguf_writer.close()
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the command line of the converter.
+
+    `model` is declared with nargs="?" and then enforced manually via the parser's error().
+    """
+    cli = argparse.ArgumentParser(description="Convert LFM2-Audio decoder model to GGUF")
+
+    # (flags, keyword arguments) pairs, registered in this order
+    option_specs = [
+        (
+            ("--outfile",),
+            {"type": Path, "required": True, "help": "path to write to"},
+        ),
+        (
+            ("--outtype",),
+            {"type": str, "choices": ["f32", "f16", "bf16", "q8_0", "q4_0"], "default": "f16", "help": "output format"},
+        ),
+        (
+            ("model",),
+            {"type": Path, "help": "Path to LFM2-Audio model", "nargs": "?"},
+        ),
+        (
+            ("--verbose",),
+            {"action": "store_true", "help": "increase output verbosity"},
+        ),
+    ]
+    for flags, kwargs in option_specs:
+        cli.add_argument(*flags, **kwargs)
+
+    parsed = cli.parse_args()
+    if parsed.model is None:
+        cli.error("the following arguments are required: model")
+    return parsed
+
+
+def main() -> None:
+    """Run the conversion from the command line.
+
+    Parses the arguments, configures logging (DEBUG with --verbose, INFO otherwise),
+    builds the converter under torch.inference_mode() and writes the output file.
+    """
+    args = parse_args()
+    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
+
+    # --outtype value -> gguf file type
+    ftype_by_name: dict[str, gguf.LlamaFileType] = {
+        "f32": gguf.LlamaFileType.ALL_F32,
+        "f16": gguf.LlamaFileType.MOSTLY_F16,
+        "bf16": gguf.LlamaFileType.MOSTLY_BF16,
+        "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
+        "q4_0": gguf.LlamaFileType.MOSTLY_Q4_0,
+    }
+
+    model_dir = args.model
+    logger.info(f"Loading model: {model_dir}")
+
+    with torch.inference_mode():
+        Lfm2AudioDecoderModelConverter(
+            pretrained_path=model_dir,
+            fname_out=args.outfile,
+            ftype=ftype_by_name[args.outtype],
+        ).write()
+
+
+if __name__ == "__main__":  # run the converter only when executed as a script
+    main()
diff --git a/tools/liquid-audio/liquid_audio_chat.py b/tools/liquid-audio/liquid_audio_chat.py
new file mode 100644
index 00000000000..437db420f2d
--- /dev/null
+++ b/tools/liquid-audio/liquid_audio_chat.py
@@ -0,0 +1,596 @@
+#!/usr/bin/env python3
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "numpy",
+#     "pyaudio",
+#     "soundfile",
+#     "openai",
+#     "prompt_toolkit",
+# ]
+# ///
+"""Interactive CLI chat tool for LFM2.5-Audio server."""
+
+import argparse
+import base64
+import io
+import os
+import shutil
+import struct
+import subprocess
+import sys
+import tempfile
+import threading
+import time
+from queue import Queue
+
+import numpy as np
+import pyaudio
+import soundfile as sf
+from openai import OpenAI
+from prompt_toolkit import prompt
+from prompt_toolkit.history import InMemoryHistory
+from prompt_toolkit.auto_suggest import AutoSuggestFromHistory
+
+# Suppress ALSA/JACK warnings during PyAudio init
+import os
+import contextlib
+
+
+@contextlib.contextmanager
+def suppress_stderr():
+    """Point file descriptor 2 at os.devnull for the duration of the `with` block, then restore it."""
+    null_fd = os.open(os.devnull, os.O_WRONLY)
+    saved_fd = os.dup(2)
+    os.dup2(null_fd, 2)
+    try:
+        yield
+    finally:
+        os.dup2(saved_fd, 2)
+        for fd in (null_fd, saved_fd):
+            os.close(fd)
+
+
+class AudioPlayer:
+    """Streams audio samples to speakers via PyAudio (non-blocking)."""
+
+    def __init__(self, sample_rate=None):
+        self.sample_rate = sample_rate
+        self.all_samples = []
+        self.pyaudio = None
+        self.stream = None
+        self.queue = Queue()
+        self.thread = None
+        self.running = False
+        self.started = False
+
+    def _playback_thread(self):
+        """Background thread that writes audio to the stream."""
+        from queue import Empty  # function-scope import: only Queue is imported at module level
+        while self.running or not self.queue.empty():
+            try:
+                pcm_data = self.queue.get(timeout=0.1)
+                if self.stream:
+                    self.stream.write(pcm_data)
+            except Empty:
+                continue  # nothing arrived within the timeout; re-evaluate the loop condition
+            except Exception:
+                pass  # best-effort playback: skip the chunk, keep draining the queue
+
+    def start(self):
+        """Prepare the audio player (stream starts on first samples)."""
+        self.all_samples = []
+        self.running = True
+        self.started = False
+
+    def _start_stream(self):
+        """Actually start the audio stream (called when sample rate is known)."""
+        if self.started or self.sample_rate is None:
+            return
+        with suppress_stderr():
+            self.pyaudio = pyaudio.PyAudio()
+            self.stream = self.pyaudio.open(
+                format=pyaudio.paInt16, channels=1, rate=self.sample_rate, output=True
+            )
+        self.thread = threading.Thread(target=self._playback_thread, daemon=True)
+        self.thread.start()
+        self.started = True
+
+    def add_samples(self, samples, sample_rate=None):
+        """Add samples to playback queue (non-blocking)."""
+        if sample_rate is not None and self.sample_rate is None:
+            self.sample_rate = sample_rate
+        self._start_stream()
+        self.all_samples.extend(samples)
+        pcm_data = np.array(samples, dtype=np.int16).tobytes()
+        self.queue.put(pcm_data)
+
+    def stop(self, output_file="output.wav"):
+        """Stop the audio stream (`output_file` is accepted but not used by this method)."""
+        self.running = False
+        if self.thread:
+            self.thread.join()
+            self.thread = None
+        if self.stream:
+            self.stream.stop_stream()
+            self.stream.close()
+            self.stream = None
+        if self.pyaudio:
+            self.pyaudio.terminate()
+            self.pyaudio = None
+
+
+class AudioRecorder:
+    """Records audio from microphone using PyAudio."""
+
+    def __init__(self, sample_rate=16000):
+        self.sample_rate = sample_rate
+        self.recording = False
+        self.samples = []
+        self.available = self._check_available()
+
+    def _check_available(self):
+        """Check if audio input is available."""
+        try:
+            with suppress_stderr():
+                p = pyaudio.PyAudio()
+                try:
+                    has_input = p.get_default_input_device_info() is not None
+                finally:
+                    p.terminate()  # release PyAudio even when the device query raises
+            return has_input
+        except Exception:
+            return False
+
+    def record(self, duration=None):
+        """Record audio until Enter is pressed; `duration` is accepted but not used."""
+        if not self.available:
+            print("[No microphone available. Use /wav to load audio files.]")
+            return None
+
+        self.samples = []
+        self.recording = True
+        print("Recording... (Press Enter to stop)")
+        # Start recording in background
+        stop_event = threading.Event()
+
+        def record_audio():
+            try:
+                with suppress_stderr():
+                    p = pyaudio.PyAudio()
+                try:
+                    with suppress_stderr():
+                        stream = p.open(
+                            format=pyaudio.paFloat32, channels=1, rate=self.sample_rate,
+                            input=True, frames_per_buffer=1024,
+                        )
+                    try:
+                        while not stop_event.is_set():
+                            data = stream.read(1024, exception_on_overflow=False)
+                            samples = np.frombuffer(data, dtype=np.float32)
+                            self.samples.extend(samples.tolist())
+                    finally:
+                        stream.stop_stream()
+                        stream.close()
+                finally:
+                    p.terminate()  # runs even if open() or read() failed
+            except Exception as e:
+                print(f"[Recording error: {e}]")
+
+        record_thread = threading.Thread(target=record_audio)
+        record_thread.start()
+
+        # Wait for Enter key
+        input()
+        stop_event.set()
+        record_thread.join()
+        self.recording = False
+        if self.samples:
+            print(f"Recorded {len(self.samples) / self.sample_rate:.2f}s of audio")
+        return self.samples if self.samples else None
+
+    def to_wav_bytes(self):
+        """Convert recorded samples to WAV bytes."""
+        if not self.samples:
+            return None
+        buffer = io.BytesIO()
+        sf.write(buffer, np.array(self.samples), self.sample_rate, format="WAV")
+        buffer.seek(0)
+        return buffer.read()
+
+
+SYSTEM_PROMPTS = {  # system message content, keyed by chat mode
+    "asr": "Perform ASR.",
+    "tts": "Perform TTS. Use the UK female voice.",
+    "interleaved": "Respond with interleaved text and audio.",
+}
+
+
+def create_text_message(text):
+    """Wrap plain text as a user-role chat message."""
+    return dict(role="user", content=text)
+
+
+def create_audio_message(wav_data):
+    """Create an audio user message."""
+    encoded = base64.b64encode(wav_data).decode("utf-8")
+    return {
+        "role": "user",
+        "content": [
+            {
+                "type": "input_audio",
+                "input_audio": {"data": encoded, "format": "wav"},
+            }
+        ],
+    }
+
+
+def create_stream_single_shot(client, mode, text=None, wav_data=None, max_tokens=512):
+    """Create a single-shot request for ASR/TTS (always resets context)."""
+    messages = [{"role": "system", "content": SYSTEM_PROMPTS[mode]}]
+
+    modalities = []
+    if mode == "asr" and wav_data:
+        messages.append(create_audio_message(wav_data))
+        modalities.append("text")
+    elif mode == "tts" and text:
+        messages.append(create_text_message(text))
+        modalities.append("audio")
+
+    return client.chat.completions.create(
+        model="",
+        modalities=modalities,
+        messages=messages,
+        stream=True,
+        max_tokens=max_tokens,
+    )
+
+
+def create_stream_chat(client, messages, max_tokens=512, reset_context=False):
+    """Create a chat request for interleaved mode (maintains context)."""
+    return client.chat.completions.create(
+        model="",
+        modalities=["text", "audio"],
+        messages=messages,
+        stream=True,
+        max_tokens=max_tokens,
+        extra_body={
+            "id_slot": 0,
+            "continue": not reset_context,
+            "reset_context": reset_context,
+        },
+    )
+
+
+def process_stream(stream, audio_player=None):
+    """Process streaming response, playing audio and printing text."""
+    t0 = time.time()
+    ttft = None
+    text_chunks = []
+    audio_chunks = []
+    total_samples = 0
+    completed = False
+    audio_sample_rate = None
+
+    for chunk in stream:
+        if chunk.choices[0].finish_reason == "stop":
+            completed = True
+            break
+
+        delta = chunk.choices[0].delta
+
+        # Handle text
+        if text := delta.content:
+            if ttft is None:
+                ttft = time.time() - t0
+            text_chunks.append((time.time(), text))
+            print(text, end="", flush=True)
+
+        # Handle audio
+        if hasattr(delta, "audio") and delta.audio and "data" in delta.audio:
+            if ttft is None:
+                ttft = time.time() - t0
+            # Get sample rate from response if available
+            if audio_sample_rate is None and "sample_rate" in delta.audio:
+                audio_sample_rate = delta.audio["sample_rate"]
+            chunk_data = delta.audio["data"]
+            pcm_bytes = base64.b64decode(chunk_data)
+            samples = np.frombuffer(pcm_bytes, dtype=np.int16)
+            audio_chunks.append((time.time(), samples))
+            total_samples += len(samples)
+
+            # Print note symbol for audio progress
+            print("♪", end="", flush=True)
+
+            if audio_player:
+                audio_player.add_samples(samples, sample_rate=audio_sample_rate)
+
+    if text_chunks or audio_chunks:
+        print()  # Newline after output
+
+    if not completed:
+        print("[Warning: Server disconnected before completion]")
+
+    # Calculate and display stats (single line)
+    total_time = time.time() - t0
+    full_text = "".join(t for _, t in text_chunks)
+
+    stats = []
+    if ttft is not None:
+        stats.append(f"ttft {ttft:.3f}s")
+
+    if text_chunks and len(text_chunks) > 1:
+        text_duration = text_chunks[-1][0] - text_chunks[0][0]
+        if text_duration > 0:
+            stats.append(
+                f"text {len(text_chunks)} tok @ {len(text_chunks) / text_duration:.1f} tok/s"
+            )
+
+    if audio_chunks:
+        # Calculate from ttft to last chunk for accurate throughput
+        first_audio_time = audio_chunks[0][0]
+        last_audio_time = audio_chunks[-1][0]
+        audio_duration = last_audio_time - first_audio_time
+        audio_secs = total_samples / audio_sample_rate
+        stats.append(
+            f"audio {audio_secs:.1f}s @ {total_samples / audio_duration:.0f} samples/s"
+        )
+
+    stats.append(f"total {total_time:.3f}s")
+    print(f"\n[{' | '.join(stats)}]")
+
+    return full_text, total_samples
+
+
+def print_help():
+    """Print help information."""
+    print(
+        """
+Commands:
+  /mode <asr|tts|interleaved>  - Switch mode
+  /reset                       - Reset context (interleaved mode only)
+  /record                      - Record and transcribe/process audio
+  /wav <path>                  - Load and transcribe/process audio file
+  /help                        - Show this help
+  /quit or /exit               - Exit the program
+
+Modes:
+  ASR (single-shot):
+    - Use /record or /wav to transcribe audio
+    - Each request is independent
+
+  TTS (single-shot):
+    - Type text to synthesize audio
+    - Each request is independent
+
+  Interleaved (chat):
+    - Type text or use /record or /wav
+    - Context is maintained across requests
+    - Use /reset to start fresh
+"""
+    )
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Interactive LFM2.5-Audio chat client")
+    parser.add_argument(
+        "--base-url",
+        type=str,
+        default="http://127.0.0.1:8080/v1",
+        help="Server base URL",
+    )
+    parser.add_argument(
+        "--mode",
+        type=str,
+        choices=["asr", "tts", "interleaved"],
+        default="interleaved",
+        help="Initial mode",
+    )
+    parser.add_argument(
+        "--max-tokens",
+        type=int,
+        default=512,
+        help="Maximum tokens to generate",
+    )
+    parser.add_argument(
+        "--no-audio-playback",
+        action="store_true",
+        help="Disable audio playback (save to file instead)",
+    )
+    args = parser.parse_args()
+
+    client = OpenAI(base_url=args.base_url, api_key="dummy")
+    recorder = AudioRecorder()
+    mode = args.mode
+    wav_data = None
+    enable_playback = not args.no_audio_playback
+
+    # Check audio capabilities
+    audio_input_ok = recorder.available
+
+    # Track if first message in interleaved mode (need to send system prompt)
+    is_first_message = True
+
+    # Command history for prompt_toolkit
+    cmd_history = InMemoryHistory()
+
+    print("=" * 50)
+    print("LFM2.5-Audio Interactive Chat")
+    print("=" * 50)
+    print(f"Server: {args.base_url}")
+    print("Audio output: pyaudio")
+    print(f"Audio input:  {'microphone' if audio_input_ok else 'file only (/wav)'}")
+    print("Type /help for commands")
+    print("=" * 50)
+    print(f"Mode: {mode}" + (" (single-shot)" if mode in ("asr", "tts") else " (chat)"))
+
+    while True:
+        try:
+            # Show prompt with mode indicator
+            mode_indicator = {"asr": "[ASR]", "tts": "[TTS]", "interleaved": "[INT]"}
+            audio_indicator = " [audio]" if wav_data else ""
+
+            prompt_str = f"{mode_indicator[mode]}{audio_indicator}> "
+            user_input = prompt(
+                prompt_str,
+                history=cmd_history,
+                auto_suggest=AutoSuggestFromHistory(),
+            ).strip()
+
+            if not user_input:
+                if mode == "asr" and wav_data:
+                    # In ASR mode with audio, pressing Enter transcribes
+                    pass
+                else:
+                    continue
+
+            # Handle commands
+            if user_input.startswith("/"):
+                parts = user_input.split(maxsplit=1)
+                cmd = parts[0].lower()
+                arg = parts[1] if len(parts) > 1 else None
+
+                if cmd in ("/quit", "/exit"):
+                    print("Goodbye!")
+                    break
+
+                elif cmd == "/help":
+                    print_help()
+                    continue
+
+                elif cmd == "/mode":
+                    if arg in ("asr", "tts", "interleaved"):
+                        if arg == mode:
+                            print(f"Already in {mode} mode")
+                        elif arg == "interleaved":
+                            mode = arg
+                            is_first_message = True
+                            print(f"Mode: {mode} (chat)")
+                        else:
+                            mode = arg
+                            print(f"Mode: {mode} (single-shot)")
+                    else:
+                        print("Usage: /mode <asr|tts|interleaved>")
+                    continue
+
+                elif cmd == "/reset":
+                    if mode != "interleaved":
+                        print("Reset only available in interleaved mode")
+                        continue
+                    is_first_message = True
+                    print("Context reset")
+                    continue
+
+                elif cmd == "/record":
+                    if mode == "tts":
+                        print("Recording not available in TTS mode")
+                        continue
+                    samples = recorder.record()
+                    if samples:
+                        wav_data = recorder.to_wav_bytes()
+                        # Start inference immediately
+                        user_input = ""
+                    else:
+                        continue
+
+                elif cmd == "/wav":
+                    if mode == "tts":
+                        print("Audio input not available in TTS mode")
+                        continue
+                    if arg:
+                        try:
+                            with open(arg, "rb") as f:
+                                wav_data = f.read()
+                            # Start inference immediately
+                            user_input = ""
+                        except Exception as e:
+                            print(f"Error loading file: {e}")
+                            continue
+                    else:
+                        print("Usage: /wav <path>")
+                        continue
+
+                else:
+                    print(f"Unknown command: {cmd}")
+                    continue
+
+            # Prepare request based on mode
+            text_input = (
+                user_input if user_input and not user_input.startswith("/") else None
+            )
+
+            if mode == "asr":
+                if not wav_data:
+                    print("ASR mode requires audio. Use /record or /wav first.")
+                    continue
+                text_input = None  # ASR ignores text input
+            elif mode == "tts":
+                if not text_input:
+                    print("TTS mode requires text input.")
+                    continue
+                wav_data = None  # TTS ignores audio input
+
+            # Create audio player if needed
+            audio_player = None
+            if enable_playback:
+                audio_player = AudioPlayer()
+                audio_player.start()
+
+            try:
+                print()  # Blank line before response
+
+                if mode in ("asr", "tts"):
+                    # Single-shot mode: system + one message, always reset
+                    stream = create_stream_single_shot(
+                        client,
+                        mode,
+                        text=text_input,
+                        wav_data=wav_data,
+                        max_tokens=args.max_tokens,
+                    )
+                else:
+                    # Interleaved chat mode: only send new messages
+                    messages = []
+
+                    # First message needs system prompt and reset
+                    if is_first_message:
+                        messages.append(
+                            {"role": "system", "content": SYSTEM_PROMPTS["interleaved"]}
+                        )
+
+                    # Add user message(s)
+                    if text_input:
+                        messages.append(create_text_message(text_input))
+                    if wav_data:
+                        messages.append(create_audio_message(wav_data))
+
+                    stream = create_stream_chat(
+                        client,
+                        messages,
+                        max_tokens=args.max_tokens,
+                        reset_context=is_first_message,
+                    )
+                    is_first_message = False
+
+                response_text, _ = process_stream(stream, audio_player)
+
+            except Exception as e:
+                print(f"Error: {e}")
+
+            finally:
+                if audio_player:
+                    audio_player.stop()
+
+            # Clear audio after use
+            if wav_data:
+                wav_data = None
+
+        except KeyboardInterrupt:
+            print("\nUse /quit to exit")
+        except EOFError:
+            print("\nGoodbye!")
+            break
+
+
+if __name__ == "__main__":  # run the client only when executed as a script
+    main()
diff --git a/tools/liquid-audio/liquid_audio_example.py b/tools/liquid-audio/liquid_audio_example.py
new file mode 100755
index 00000000000..7e4f66028c5
--- /dev/null
+++ b/tools/liquid-audio/liquid_audio_example.py
@@ -0,0 +1,219 @@
+#!/usr/bin/env python3
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "numpy",
+#     "soundfile",
+#     "openai",
+# ]
+# ///
+"""Example script for LFM2.5-Audio server with OpenAI-compatible API."""
+
+import argparse
+import base64
+import time
+
+import numpy as np
+import soundfile as sf
+from openai import OpenAI
+
+
+def interleaved(client, text=None, wav_data=None):
+    messages = [
+        {"role": "system", "content": "Respond with interleaved text and audio."},
+    ]
+
+    if text:
+        messages.append({"role": "user", "content": text})
+
+    if wav_data:
+        encoded_wav_data = base64.b64encode(wav_data).decode("utf-8")
+
+        messages.append(
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "input_audio",
+                        "input_audio": {"data": encoded_wav_data, "format": "wav"},
+                    }
+                ],
+            }
+        )
+
+    return client.chat.completions.create(
+        model="",
+        modalities=["text", "audio"],
+        messages=messages,
+        stream=True,
+        max_tokens=512,
+    )
+
+
+def tts(client, text):
+    return client.chat.completions.create(
+        model="",
+        modalities=["audio"],
+        messages=[
+            {"role": "system", "content": "Perform TTS. Use the US male voice."},
+            {"role": "user", "content": text},
+        ],
+        stream=True,
+        max_tokens=512,
+    )
+
+
+def asr(client, wav_data):
+    encoded_wav_data = base64.b64encode(wav_data).decode("utf-8")
+    return client.chat.completions.create(
+        model="",
+        modalities=["text"],
+        messages=[
+            {"role": "system", "content": "Perform ASR."},
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "input_audio",
+                        "input_audio": {"data": encoded_wav_data, "format": "wav"},
+                    }
+                ],
+            },
+        ],
+        stream=True,
+        max_tokens=512,
+    )
+
+
+def collect_output(stream):
+    t0 = time.time()
+    received_text = []
+    received_audio = []
+    completed = False
+    audio_sample_rate = None
+
+    for chunk in stream:
+        # Check for proper completion
+        if chunk.choices[0].finish_reason == "stop":
+            completed = True
+            break
+
+        delta = chunk.choices[0].delta
+
+        # Handle text content
+        if text := delta.content:
+            received_text.append((time.time(), text))
+            print(text, end="", flush=True)
+
+        # Handle audio chunks (OpenAI-compatible format: delta.audio.data)
+        if hasattr(delta, "audio") and delta.audio and "data" in delta.audio:
+            # Get sample rate from response if available
+            if audio_sample_rate is None and "sample_rate" in delta.audio:
+                audio_sample_rate = delta.audio["sample_rate"]
+            chunk_data = delta.audio["data"]
+            pcm_bytes = base64.b64decode(chunk_data)
+            samples = np.frombuffer(pcm_bytes, dtype=np.int16)
+            received_audio.append((time.time(), samples))
+
+    if not completed:
+        raise ConnectionError("Server disconnected before completion")
+
+    text = "".join(t for _, t in received_text)
+    audio = [s for _, samples in received_audio for s in samples]
+
+    print("\n\n--- Performance Metrics ---")
+    print(
+        f"TTFT :                        {min(x[0][0] for x in [received_text, received_audio] if x) - t0:>5.3f}         s"
+    )
+    if text and len(received_text) > 1:
+        print(
+            f"Text : {len(received_text):>8}  tokens at {len(received_text) / (received_text[-1][0] - received_text[0][0]):>8.0f}  tokens/s"
+        )
+    if audio:
+        print(
+            f"Audio: {len(audio):>8} samples at {len(audio) / (received_audio[-1][0] - received_audio[0][0]):>8.0f} samples/s"
+        )
+
+    return text if text else None, audio if audio else None, audio_sample_rate
+
+
+def make_request(base_url, mode, wav_file, text, output):
+    client = OpenAI(base_url=base_url, api_key="dummy")
+
+    # Load WAV data if provided
+    wav_data = None
+    if wav_file:
+        with open(wav_file, "rb") as f:
+            wav_data = f.read()
+        print(f"Loaded audio from {wav_file}")
+
+    # Select mode and create stream
+    if mode == "asr":
+        print("Mode: ASR (Audio -> Text)")
+        stream = asr(client, wav_data)
+    elif mode == "tts":
+        print("Mode: TTS (Text -> Audio)")
+        print(f"Input text: {text}")
+        stream = tts(client, text)
+    elif mode == "interleaved":
+        print("Mode: Interleaved (Audio + Text)")
+        stream = interleaved(client, text=text, wav_data=wav_data)
+
+    # Collect output
+    text, audio_samples, audio_sample_rate = collect_output(stream)
+
+    # Display results
+    if audio_samples:
+        print(f"\nReceived {len(audio_samples)} audio samples")
+        sf.write(output, audio_samples, audio_sample_rate)
+        print(f"Saved audio to {output} (sample rate: {audio_sample_rate})")
+
+    if text:
+        print(f"\nTranscribed/Generated text: {text}")
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Test LFM2-Audio server with OpenAI-compatible API"
+    )
+    parser.add_argument(
+        "--wav", type=str, help="Path to input WAV file for ASR or interleaved mode"
+    )
+    parser.add_argument(
+        "--text", type=str, help="Text prompt for TTS or interleaved mode"
+    )
+    parser.add_argument(
+        "--mode",
+        type=str,
+        choices=["asr", "tts", "interleaved"],
+        default="interleaved",
+        help="Mode: asr (audio->text), tts (text->audio), or interleaved (both)",
+    )
+    parser.add_argument(
+        "--output",
+        type=str,
+        default="output.wav",
+        help="Output WAV file path (default: output.wav)",
+    )
+    parser.add_argument(
+        "--base-url",
+        type=str,
+        default="http://127.0.0.1:8080/v1",
+        help="Server base URL (default: http://127.0.0.1:8080/v1)",
+    )
+
+    args = parser.parse_args()
+
+    # Validate inputs based on mode
+    if args.mode == "asr" and not args.wav:
+        parser.error("ASR mode requires --wav")
+    if args.mode == "tts" and not args.text:
+        parser.error("TTS mode requires --text")
+    if args.mode == "interleaved" and not args.wav and not args.text:
+        parser.error("Interleaved mode requires one of --wav or --text")
+
+    make_request(args.base_url, args.mode, args.wav, args.text, args.output)
+
+
+if __name__ == "__main__":  # run the example only when executed as a script
+    main()
diff --git a/tools/liquid-audio/runner.cpp b/tools/liquid-audio/runner.cpp
new file mode 100644
index 00000000000..90b1a8f9f20
--- /dev/null
+++ b/tools/liquid-audio/runner.cpp
@@ -0,0 +1,410 @@
+// mtmd-audio.h must be included before common.h due to conflicting declarations of string_replace_all
+#include "mtmd-audio.h"
+//
+#include "runner.h"
+//
+#include "chat.h"
+#include "common.h"
+#include "llama.h"
+#include "log.h"
+#include "mtmd-helper.h"
+#include "mtmd.h"
+#include "sampling.h"
+
+#include <algorithm>
+#include <atomic>
+#include <complex>
+#include <cstring>
+#include <filesystem>
+#include <optional>
+#include <utility>
+
+namespace liquid {
+namespace audio {
+
+namespace {
+// Everything needed for inference: backbone model/context, sampler, chat templates and the mtmd audio context.
+struct audio_context {
+    mtmd::context_ptr      mtmd_ctx_audio;
+    common_init_result_ptr llama_init;
+
+    // Null until init() sets them, so the destructor stays safe after a failed or skipped init().
+    llama_model *       model  = nullptr;
+    llama_context *     lctx   = nullptr;
+    const llama_vocab * vocab  = nullptr;
+    common_sampler *    smpl   = nullptr;
+    llama_pos           n_past = 0;
+
+    int                       n_batch   = 0;
+    int                       verbosity = 0;
+    mtmd::bitmaps             bitmaps;  // audio inputs waiting to be tokenized
+    common_chat_templates_ptr tmpls;
+
+    // Loads the backbone, sampler, chat templates and the mtmd audio context.
+    // Overwrites params.sampling with greedy (top-k = 1) settings. Returns 0 on success, 1 on failure.
+    int init(common_params & params) {
+        // backbone
+        llama_init = common_init_from_params(params);
+        model      = llama_init->model();
+        lctx       = llama_init->context();
+        if (!model || !lctx) {
+            LOG_ERR("Failed to load backbone\n");
+            return 1;
+        }
+
+        // vocab and batch settings
+        vocab     = llama_model_get_vocab(model);
+        n_batch   = params.n_batch;
+        verbosity = params.verbosity > 3;
+
+        // sampler, greedy for text
+        params.sampling.samplers = { common_sampler_type::COMMON_SAMPLER_TYPE_TOP_K };
+        params.sampling.top_k    = 1;
+        smpl                     = common_sampler_init(model, params.sampling);
+        tmpls                    = common_chat_templates_init(model, params.chat_template);
+        LOG_INF("%s: chat template example:\n%s\n", __func__,
+                common_chat_format_example(tmpls.get(), params.use_jinja, params.default_template_kwargs).c_str());
+
+        // mtmd audio context
+        const char *        clip_path = params.mmproj.path.c_str();
+        mtmd_context_params mparams   = mtmd_context_params_default();
+        mparams.use_gpu               = params.mmproj_use_gpu;
+        mparams.print_timings         = true;
+        mparams.n_threads             = params.cpuparams.n_threads;
+        // audio output needs both the vocoder and the audio detokenizer
+        const bool has_vocoder         = !params.vocoder.model.path.empty();
+        const bool has_detokenizer     = !params.vocoder.speaker_file.empty();
+        const bool enable_audio_output = has_vocoder && has_detokenizer;
+        if (enable_audio_output) {
+            mparams.vocoder_path   = params.vocoder.model.path.c_str();
+            mparams.tokenizer_path = params.vocoder.speaker_file.c_str();
+        } else if (has_vocoder || has_detokenizer) {
+            LOG_WRN("%s: audio output disabled: both -mv (vocoder) and --tts-speaker-file (audio detokenizer) are required\n",
+                    __func__);
+        }
+        mtmd_ctx_audio.reset(mtmd_init_from_file(clip_path, model, mparams));
+        if (!mtmd_ctx_audio.get()) {
+            LOG_ERR("Failed to load audio model from %s\n", clip_path);
+            return 1;
+        }
+        return 0;
+    }
+
+    ~audio_context() { if (smpl) { common_sampler_free(smpl); } }
+};
+
+}  // namespace
+
+// Private implementation behind Runner: owns the llama/mtmd contexts and runs prefill + generation.
+class Runner::RunnerImpl {
+  public:
+    RunnerImpl() = default;
+
+    // Prefills the context with `messages`, then generates up to n_predict steps, forwarding text
+    // pieces and audio samples to the callbacks. Returns 0 on success, 1 on error (see get_last_error()).
+    int generate(const std::vector<Message> &              messages,
+                 int                                       n_predict,
+                 const text_callback_t &                   text_callback,
+                 const audio_callback_t &                  audio_callback,
+                 const std::vector<mtmd_output_modality> & modalities) {
+        // Drop bitmaps left over from an earlier call that failed before tokenization consumed them.
+        ctx.bitmaps.entries.clear();
+        const bool audio_output_supported = mtmd_support_audio_output(ctx.mtmd_ctx_audio.get());
+        if (audio_output_supported) {
+            mtmd_set_output_modalities(ctx.mtmd_ctx_audio.get(), modalities.data(), modalities.size());
+            mtmd_audio_output_start_new_turn(ctx.mtmd_ctx_audio.get());
+        } else if (std::find(modalities.begin(), modalities.end(), MTMD_OUTPUT_MODALITY_AUDIO) != modalities.end()) {
+            LOG_WRN("%s: requested audio output, but vocoder/audio detokenizer are not available; falling back to text-only output\n",
+                    __func__);
+        }
+
+        std::vector<common_chat_msg> msgs;
+        msgs.reserve(messages.size());
+        for (const auto & message : messages) {
+            // Audio is attached as a bitmap; the message text must then be exactly the media marker.
+            if (message.role == "user" && !message.wav.empty()) {
+                const auto & wav = message.wav;
+                if (message.content != mtmd_default_marker()) {
+                    return error("when providing audio input, content must be the default marker: " +
+                                 std::string(mtmd_default_marker()));
+                }
+                mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(
+                    ctx.mtmd_ctx_audio.get(), reinterpret_cast<const uint8_t *>(wav.data()), wav.size()));
+                if (!bmp.ptr) {
+                    return error("failed to load wav");
+                }
+                ctx.bitmaps.entries.push_back(std::move(bmp));
+            }
+            common_chat_msg msg;
+            msg.role    = message.role;
+            msg.content = message.content;
+            msgs.push_back(std::move(msg));
+        }
+
+        if (eval_messages(msgs, ctx.n_past == 0)) {
+            return error("failed to run prefill");
+        }
+
+        // Wrap the callbacks to record first/last arrival times and counts for the perf report.
+        auto text_callback_perf = [&](const std::string & text) {
+            auto now            = ggml_time_ms();
+            first_text_received = first_text_received.value_or(now);
+            last_text_received  = now;
+            ++text_tokens_count;
+            text_callback(text);
+        };
+        auto audio_callback_perf = [&](const generated_audio_t & audio) {
+            auto now             = ggml_time_ms();
+            first_audio_received = first_audio_received.value_or(now);
+            last_audio_received  = now;
+            audio_samples_count += audio.size();
+            audio_callback(audio);
+        };
+
+        if (!stop_requested && generate_common(n_predict, text_callback_perf, audio_callback_perf) != 0) {
+            return error("failed to generate");
+        }
+
+        perf_context_print();
+
+        return 0;
+    }
+
+    // Checks that the -m and --mmproj files exist, loads every context, then resets state. Returns 0 on success.
+    int init(common_params params) {
+        for (const auto & [p, desc] : {
+                 std::pair{ params.model.path,         "-m"       },
+                 std::pair{ params.mmproj.path,        "--mmproj" },
+        }) {
+            if (p.empty()) {
+                LOG_ERR("ERR: Missing %s argument\n", desc);
+                return 1;
+            }
+            if (!std::filesystem::exists(p)) {
+                LOG_ERR("ERR: File %s does not exist\n", p.c_str());
+                return 1;
+            }
+        }
+
+        if (auto res = ctx.init(params); res) {
+            return error("failed to initialize audio context");
+        }
+
+        reset();
+
+        return 0;
+    }
+
+    // Prints llama perf counters, then audio/text throughput (0 when no time elapsed, avoiding a division by zero).
+    void perf_context_print() const {
+        llama_perf_context_print(ctx.lctx);
+        const auto per_second = [](size_t count, std::optional<int64_t> first, std::optional<int64_t> last) {
+            const int64_t elapsed_ms = last.value_or(0) - first.value_or(0);
+            return elapsed_ms > 0 ? count / (elapsed_ms * 0.001) : 0.0;
+        };
+        fflush(stdout);
+        LOG("audio samples per second: %10.1f\n", per_second(audio_samples_count, first_audio_received, last_audio_received));
+        LOG("text  tokens  per second: %10.1f\n", per_second(text_tokens_count, first_text_received, last_text_received));
+    }
+
+    const char * get_last_error() const { return last_error_.c_str(); }
+
+    void stop() { stop_requested = true; }
+
+    // Clears the stop flag, perf counters, sampler state and context memory; n_past restarts at 0.
+    void reset() {
+        stop_requested = false;
+
+        perf_context_reset();
+        llama_perf_context_reset(ctx.lctx);
+        common_sampler_reset(ctx.smpl);
+
+        llama_memory_clear(llama_get_memory(ctx.lctx), false);
+        ctx.n_past = 0;
+    }
+
+    // Sample rate reported by mtmd for generated audio, or 0 when audio output is not supported.
+    int get_output_sample_rate() const {
+        if (!mtmd_support_audio_output(ctx.mtmd_ctx_audio.get())) {
+            return 0;
+        }
+        return mtmd_audio_output_get_sample_rate(ctx.mtmd_ctx_audio.get());
+    }
+
+  private:
+    audio_context ctx;
+
+    std::atomic<bool> stop_requested = false;
+    std::string       last_error_;
+
+    // perf: counts and first/last arrival timestamps (ggml_time_ms) recorded by the generate() callbacks
+    size_t                 text_tokens_count = 0, audio_samples_count = 0;
+    std::optional<int64_t> first_text_received, first_audio_received;
+    std::optional<int64_t> last_text_received, last_audio_received;
+
+    // Logs msg, stores it for get_last_error(), and returns 1 so callers can write `return error(...)`.
+    int error(const std::string & msg) {
+        LOG_ERR("ERR: %s\n", msg.c_str());
+        last_error_ = msg;
+        return 1;
+    }
+
+    // Generation loop: each step either samples a text token or decodes an audio frame, depending on
+    // the current output modality, and feeds the result to the backbone on the next step.
+    int generate_common(int n_predict, const text_callback_t & text_callback, const audio_callback_t & audio_callback) {
+        llama_batch batch = llama_batch_get_one(nullptr, 1);  // doesn't own pointers, no need for free.
+
+        n_predict = n_predict < 0 ? std::numeric_limits<int>::max() : n_predict;
+        std::vector<float> embd(llama_model_n_embd(ctx.model));
+        // batch.token keeps pointing at this token until the next llama_decode, so it must outlive one iteration.
+        llama_token next_text_token = 0;
+        auto *      mctx            = ctx.mtmd_ctx_audio.get();
+        for (int i = 0; i < n_predict; i++) {
+            if (stop_requested) {
+                LOG("\n");
+                break;
+            }
+            // run backbone on the token/embedding produced by the previous step
+            if (i > 0) {
+                if (llama_decode(ctx.lctx, batch)) {
+                    return error("failed to run backbone");
+                }
+                ctx.n_past += batch.n_tokens;
+            }
+
+            if (mtmd_get_output_modality(mctx) == MTMD_OUTPUT_MODALITY_TEXT) {
+                next_text_token = common_sampler_sample(ctx.smpl, ctx.lctx, -1);
+                common_sampler_accept(ctx.smpl, next_text_token, true);
+                if (llama_vocab_is_eog(ctx.vocab, next_text_token)) {
+                    LOG("\n");
+                    break;  // end of generation
+                }
+                // output
+                if (auto token_str = common_token_to_piece(ctx.lctx, next_text_token, false); !token_str.empty()) {
+                    text_callback(token_str);
+                    LOG("%s", token_str.c_str());
+                    fflush(stdout);
+                }
+                mtmd_audio_output_accept_token(mctx, next_text_token);
+                batch.token = &next_text_token;
+                batch.embd  = nullptr;
+            } else if (mtmd_get_output_modality(mctx) == MTMD_OUTPUT_MODALITY_AUDIO) {
+                if (mtmd_audio_output_decode(mctx, llama_get_embeddings(ctx.lctx), llama_model_n_embd(ctx.model),
+                                             embd.data()) != 0) {
+                    return error("failed to decode audio output");
+                }
+                auto                 n_samples = mtmd_get_n_audio_samples(mctx);
+                std::vector<int16_t> samples(n_samples);
+                mtmd_get_audio_samples(mctx, samples.data());
+                audio_callback(samples);
+                batch.embd  = embd.data();
+                batch.token = nullptr;
+            }
+
+            llama_set_embeddings(ctx.lctx, mtmd_get_output_modality(mctx) == MTMD_OUTPUT_MODALITY_AUDIO);
+            if (stop_requested) {
+                LOG("\n");
+                break;
+            }
+        }
+        LOG("\n");
+
+        return 0;
+    }
+
+    void perf_context_reset() {
+        first_audio_received = std::nullopt;
+        first_text_received  = std::nullopt;
+        last_audio_received  = std::nullopt;
+        last_text_received   = std::nullopt;
+        text_tokens_count    = 0;
+        audio_samples_count  = 0;
+    }
+
+    // Applies the chat template to msgs, tokenizes the prompt together with the pending bitmaps, and
+    // evaluates every chunk, advancing ctx.n_past. Returns 0 on success or when a stop was requested.
+    int eval_messages(const std::vector<common_chat_msg> & msgs, bool add_bos = false) {
+        common_chat_templates_inputs tmpl_inputs;
+        tmpl_inputs.messages              = msgs;
+        tmpl_inputs.add_generation_prompt = true;
+        auto formatted_chat               = common_chat_templates_apply(ctx.tmpls.get(), tmpl_inputs);
+        LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.prompt.c_str());
+
+        mtmd_input_text text;
+        text.text          = formatted_chat.prompt.c_str();
+        text.add_special   = add_bos;
+        text.parse_special = true;
+
+        if (stop_requested) {
+            return 0;
+        }
+
+        mtmd::input_chunks chunks(mtmd_input_chunks_init());
+        auto               bitmaps_c_ptr = ctx.bitmaps.c_ptr();
+        int32_t            res           = mtmd_tokenize(ctx.mtmd_ctx_audio.get(),
+                                                         chunks.ptr.get(),  // output
+                                                         &text,             // text
+                                                         bitmaps_c_ptr.data(), bitmaps_c_ptr.size());
+        // The bitmaps are single-use: release them whether or not tokenization succeeded.
+        ctx.bitmaps.entries.clear();
+        if (res != 0) {
+            return error("Unable to tokenize prompt");
+        }
+
+        size_t n_chunks = mtmd_input_chunks_size(chunks.ptr.get());
+        if (n_chunks == 0) {
+            return error("no chunks to eval");
+        }
+
+        for (size_t i = 0; i < n_chunks; i++) {
+            bool         chunk_logits_last = (i == n_chunks - 1);
+            const auto * chunk             = mtmd_input_chunks_get(chunks.ptr.get(), i);
+
+            int32_t eval_res = mtmd_helper_eval_chunk_single(ctx.mtmd_ctx_audio.get(), ctx.lctx, chunk, ctx.n_past, 0,
+                                                             ctx.n_batch, chunk_logits_last, &ctx.n_past);
+            if (eval_res != 0) {
+                return error("failed to eval chunk");
+            }
+        }
+
+        LOG("\n");
+
+        return 0;
+    }
+};
+
+// Public Runner API: every member below simply forwards to the pimpl object (impl_).
+Runner::Runner() : impl_(std::make_unique<RunnerImpl>()) {}
+
+Runner::~Runner() = default;
+
+int Runner::get_output_sample_rate() const {
+    return impl_->get_output_sample_rate();
+}
+
+const char * Runner::get_last_error() const {
+    return impl_->get_last_error();
+}
+
+void Runner::stop() {
+    impl_->stop();
+}
+
+int Runner::generate(const std::vector<Message> &              messages,
+                     int                                       n_predict,
+                     const text_callback_t &                   text_callback,
+                     const audio_callback_t &                  audio_callback,
+                     const std::vector<mtmd_output_modality> & modalities) {
+    return impl_->generate(messages, n_predict, text_callback, audio_callback, modalities);
+}
+
+int Runner::init(common_params params) {
+    return impl_->init(std::move(params));
+}
+
+void Runner::reset() {
+    impl_->reset();
+}
+
+}  // namespace audio
+}  // namespace liquid
diff --git a/tools/liquid-audio/runner.h b/tools/liquid-audio/runner.h
new file mode 100644
index 00000000000..686b7e13a1e
--- /dev/null
+++ b/tools/liquid-audio/runner.h
@@ -0,0 +1,55 @@
+#pragma once
+
+#include "common.h"
+#include "mtmd.h"
+
+#include <functional>
+#include <string>
+
+namespace liquid {
+namespace audio {
+
+using generated_audio_t = std::vector<int16_t>;
+using text_callback_t   = std::function<void(const std::string &)>;
+using audio_callback_t  = std::function<void(const std::vector<int16_t> &)>;
+
+class Runner {
+  public:
+    // Built-in system prompts; how a request is handled depends on the system prompt.
+    static constexpr const char *                asr_system_prompt         = "Perform ASR.";
+    static constexpr const char *                interleaved_system_prompt = "Respond with interleaved text and audio.";
+    static inline const std::vector<std::string> tts_system_prompts        = {
+        "Perform TTS. Use the US male voice.",
+        "Perform TTS. Use the UK male voice.",
+        "Perform TTS. Use the US female voice.",
+        "Perform TTS. Use the UK female voice.",
+    };
+
+    struct Message {  // one conversation entry passed to generate()
+        std::string            role;     // speaker role, e.g. "system" or "user"
+        std::string            content;  // message text
+        std::vector<std::byte> wav;      // audio payload bytes, empty for text-only messages (presumably WAV data — confirm)
+    };
+
+    Runner();
+    ~Runner();  // declared here, defined in the .cpp where RunnerImpl is a complete type
+
+    void reset();
+
+    int  init(common_params params);  // callers treat a non-zero result as failure
+    void stop();                      // NOTE(review): presumably interrupts a running generate() — confirm
+    int  generate(const std::vector<Message> &              messages,
+                  int                                       n_predict,
+                  const text_callback_t &                   text_callback,
+                  const audio_callback_t &                  audio_callback,
+                  const std::vector<mtmd_output_modality> & modalities);  // non-zero result: see get_last_error()
+
+    int          get_output_sample_rate() const;
+    const char * get_last_error() const;
+  private:
+    class RunnerImpl;                   // implementation type, only forward-declared in this header
+    std::unique_ptr<RunnerImpl> impl_;  // all public methods forward to this object
+};
+
+}  // namespace audio
+}  // namespace liquid
diff --git a/tools/liquid-audio/server.cpp b/tools/liquid-audio/server.cpp
new file mode 100644
index 00000000000..96d1ff873fd
--- /dev/null
+++ b/tools/liquid-audio/server.cpp
@@ -0,0 +1,439 @@
+#include "mtmd.h"
+#include "runner.h"
+//
+
+#include "arg.h"
+#include "base64.hpp"
+#include "common.h"
+#include "ggml.h"
+#include "log.h"
+
+#include <cpp-httplib/httplib.h>
+#include <signal.h>
+
+#include <atomic>
+#include <condition_variable>
+#include <deque>
+#include <functional>
+#include <mutex>
+#include <nlohmann/json.hpp>
+#include <optional>
+#include <thread>
+#include <vector>
+
+using json = nlohmann::ordered_json;
+
+#define MIMETYPE_JSON "application/json; charset=utf-8"
+
+#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__))
+#    include <unistd.h>
+#elif defined(_WIN32)
+#    define WIN32_LEAN_AND_MEAN
+#    ifndef NOMINMAX
+#        define NOMINMAX
+#    endif
+#    include <windows.h>
+#endif
+
+static std::function<void()> g_shutdown;  // Assigned in main()
+
+#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) || defined(_WIN32)
+static void sigint_handler(int signo) {
+    // only SIGINT triggers the shutdown hook, and only while a hook is installed
+    const bool should_shutdown = (signo == SIGINT) && g_shutdown;
+    if (should_shutdown) { g_shutdown(); }
+}
+#endif
+
+static void show_additional_info(int /*argc*/, char ** argv) {  // usage banner callback handed to common_params_parse
+    LOG("HTTP server for LFM2.5-Audio-1.5B\n\n"
+        "Usage: %s [options] -m <model.gguf> --mmproj <mmproj.gguf> "
+        "[-mv <vocoder.gguf> --tts-speaker-file <tokenizer.gguf>]\n",
+        argv[0]);
+}
+
+// Per-request output buffer: the worker thread produces chunks, the HTTP content provider drains them.
+struct OutputBuffer {
+    std::mutex              mutex;             // guards `chunks`; also taken when `done` is flipped
+    std::condition_variable cv;                // signalled by push() and finish()
+    std::deque<std::string> chunks;            // pending chunks, oldest first
+    std::atomic<bool>       done{ false };     // set by finish(): no further chunks will be pushed
+    std::atomic<bool>       aborted{ false };  // set when the request is abandoned
+
+    void push(const std::string & chunk) {  // queue one chunk and wake the consumer
+        {
+            std::lock_guard<std::mutex> lock(mutex);
+            chunks.push_back(chunk);
+        }
+        cv.notify_one();
+    }
+
+    void finish() {  // mark end of stream and wake the consumer
+        { std::lock_guard<std::mutex> lock(mutex); done = true; }  // under the mutex: a waiter mid-predicate cannot miss it
+        cv.notify_one();
+    }
+};
+
+// A work item: parsed request + output buffer for streaming back
+struct WorkItem {
+    std::vector<liquid::audio::Runner::Message> messages;            // conversation handed to Runner::generate
+    std::vector<mtmd_output_modality>           modalities;          // output modalities handed to Runner::generate
+    int                                         n_predict;           // n_predict argument for Runner::generate
+    bool                                        reset_context;       // when true the worker calls Runner::reset first
+    std::shared_ptr<OutputBuffer>               output;              // buffer shared with the HTTP content provider
+    std::function<void()>                       check_abort;         // invoked by the callbacks before emitting a chunk
+    int                                         output_sample_rate;  // reported as "sample_rate" in audio chunks
+};
+
+// FIFO of pending WorkItems, safe to use from several threads.
+struct WorkQueue {
+    std::mutex              mutex;
+    std::condition_variable cv;
+    std::deque<WorkItem>    items;
+    std::atomic<bool>       stopped{ false };
+
+    // Append a work item and wake one waiting consumer.
+    void push(WorkItem && item) {
+        std::unique_lock<std::mutex> guard(mutex);
+        items.emplace_back(std::move(item));
+        guard.unlock();
+        cv.notify_one();
+    }
+
+    // Block until an item is available or the queue is stopped; false means "stopped and empty".
+    bool pop(WorkItem & item) {
+        std::unique_lock<std::mutex> guard(mutex);
+        while (items.empty() && !stopped.load()) {
+            cv.wait(guard);
+        }
+        const bool has_item = !(stopped.load() && items.empty());
+        if (has_item) {
+            item = std::move(items.front());
+            items.pop_front();
+        }
+        return has_item;
+    }
+
+    void stop() { stopped.store(true); cv.notify_all(); }  // flag the queue as stopped and wake every waiter
+};
+
+int main(int argc, char ** argv) {
+    ggml_time_init();
+
+    common_params params;
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LIQUID_AUDIO, show_additional_info)) {
+        return 1;
+    }
+
+    common_init();
+
+    if (params.n_ctx == 0) {
+        params.n_ctx = 4096;
+    }
+
+    LOG_INF("Loading model\n");
+    liquid::audio::Runner runner;
+    if (0 != runner.init(params)) {
+        return 1;
+    }
+    LOG_INF("Model loaded successfully!\n");
+
+    httplib::Server svr;
+    // keep request handling single-threaded to avoid per-thread allocator arena growth.
+    svr.new_task_queue = [] { return new httplib::ThreadPool(1); };
+    svr.set_default_headers({
+        { "Server", "lfm2-audio-server" }
+    });
+
+    std::atomic<bool> is_server_running(true);
+    WorkQueue         work_queue;
+
+    // Single worker thread — processes one request at a time, no mutexes needed
+    std::thread worker([&]() {
+        WorkItem item;
+        while (work_queue.pop(item)) {
+            auto & output = item.output;
+
+            if (output->aborted.load()) {
+                continue;
+            }
+
+            if (item.reset_context) {
+                LOG_INF("Resetting model context\n");
+                runner.reset();
+            }
+
+            auto text_cb = [&output, &item](const std::string & text) {
+                item.check_abort();
+                if (output->aborted.load()) {
+                    return;
+                }
+                json chunk = {
+                    { "object",  "chat.completion.chunk"                                                              },
+                    { "created", std::time(0)                                                                         },
+                    { "choices",
+                     json::array(
+                          { { { "index", 0 }, { "delta", { { "content", text } } }, { "finish_reason", nullptr } } }) }
+                };
+                output->push("data: " + chunk.dump() + "\n\n");
+            };
+
+            auto audio_cb = [&output, &item](const std::vector<int16_t> & audio) {
+                item.check_abort();
+                if (output->aborted.load()) {
+                    return;
+                }
+                std::string audio_base64 =
+                    base64::encode(reinterpret_cast<const char *>(audio.data()), audio.size() * sizeof(audio.front()));
+                json chunk = {
+                    { "object",  "chat.completion.chunk"                                                           },
+                    { "created", std::time(0)                                                                      },
+                    { "choices", json::array({ { { "index", 0 },
+                                                 { "delta",
+                                                   { { "audio",
+                                                       { { "data", audio_base64 },
+                                                         { "format", "pcm" },
+                                                         { "sample_rate", item.output_sample_rate } } } } },
+                                                 { "finish_reason", nullptr } } }) }
+                };
+                output->push("data: " + chunk.dump() + "\n\n");
+            };
+
+            std::optional<std::string> err;
+            if (runner.generate(item.messages, item.n_predict, text_cb, audio_cb, item.modalities)) {
+                err = runner.get_last_error();
+            }
+
+            if (!output->aborted.load()) {
+                if (err) {
+                    json error_chunk = {
+                        { "error", { { "message", *err }, { "type", "server_error" } } }
+                    };
+                    output->push("data: " + error_chunk.dump() + "\n\n");
+                } else {
+                    json final_chunk = {
+                        { "object",  "chat.completion.chunk"                                                    },
+                        { "created", std::time(0)                                                               },
+                        { "choices",
+                         json::array(
+                              { { { "index", 0 }, { "delta", json::object() }, { "finish_reason", "stop" } } }) }
+                    };
+                    output->push("data: " + final_chunk.dump() + "\n\n");
+                    output->push("data: [DONE]\n\n");
+                }
+            }
+
+            output->finish();
+        }
+    });
+
+    // Set up shutdown handler
+    g_shutdown = [&]() {
+        is_server_running = false;
+        runner.stop();
+        work_queue.stop();
+        svr.stop();
+    };
+
+    auto res_error = [](httplib::Response & res, const std::string & message, int code = 500) {
+        json error_response = {
+            { "error", { { "message", message }, { "type", "server_error" }, { "code", code } } }
+        };
+        res.set_content(error_response.dump(), MIMETYPE_JSON);
+        res.status = code;
+    };
+
+    // Signal handling
+#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__))
+    struct sigaction sigint_action;
+    sigint_action.sa_handler = sigint_handler;
+    sigemptyset(&sigint_action.sa_mask);
+    sigint_action.sa_flags = 0;
+    sigaction(SIGINT, &sigint_action, NULL);
+#elif defined(_WIN32)
+    auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
+        return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
+    };
+    SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
+#endif
+
+    // CORS
+    svr.set_pre_routing_handler([](const httplib::Request & req, httplib::Response & res) {
+        res.set_header("Access-Control-Allow-Origin", "*");
+        if (req.method == "OPTIONS") {
+            res.set_header("Access-Control-Allow-Methods", "GET, POST, OPTIONS");
+            res.set_header("Access-Control-Allow-Headers", "*");
+            res.set_content("", "text/html");
+            return httplib::Server::HandlerResponse::Handled;
+        }
+        return httplib::Server::HandlerResponse::Unhandled;
+    });
+
+    // Chat completions endpoint (streaming only): parse the request, queue it for the worker, stream chunks back
+    svr.Post("/v1/chat/completions", [&](const httplib::Request & req, httplib::Response & res) {
+        if (!is_server_running.load()) {
+            res_error(res, "Server is shutting down", 503);
+            return;
+        }
+
+        try {
+            json body = json::parse(req.body);
+
+            int  n_predict     = body.value("max_tokens", 2048);
+            bool stream        = body.value("stream", false);
+            bool reset_context = body.value("reset_context", true);
+
+            std::vector<liquid::audio::Runner::Message> messages;
+            std::vector<mtmd_output_modality>           modalities;
+
+            if (body.contains("modalities") && body.at("modalities").is_array()) {
+                for (const auto & modality : body.at("modalities")) {
+                    if (modality.is_string() && modality.get<std::string>() == "audio") {
+                        modalities.push_back(MTMD_OUTPUT_MODALITY_AUDIO);
+                    } else if (modality.is_string() && modality.get<std::string>() == "text") {
+                        modalities.push_back(MTMD_OUTPUT_MODALITY_TEXT);
+                    }
+                }
+            }
+
+            if (body.contains("messages") && body["messages"].is_array()) {
+                for (const auto & msg : body["messages"]) {
+                    std::string role    = msg.at("role");  // at(): missing keys throw (caught below); const operator[] would be UB
+                    auto        content = msg.at("content");
+
+                    if (role == "system") {
+                        messages.push_back({ role, content, {} });
+                        continue;
+                    }
+
+                    if (role != "user") {
+                        res_error(res, "role must be system or user", 400);
+                        return;
+                    }
+
+                    if (content.is_string()) {
+                        messages.push_back({ role, content, {} });
+                        continue;
+                    }
+
+                    if (!content.is_array()) {
+                        res_error(res, "content must be string or array", 400);
+                        return;
+                    }
+
+                    for (const auto & part : content) {
+                        std::string type = part.at("type");
+                        if (type == "text") {
+                            messages.push_back({ role, part.at("text"), {} });
+                            continue;
+                        }
+
+                        if (type != "input_audio") {
+                            res_error(res, "content type must be either text or input_audio", 400);
+                            return;
+                        }
+
+                        if (part.at("input_audio").at("format") != "wav") {
+                            res_error(res, "input_audio format must be wav", 400);
+                            return;
+                        }
+
+                        std::string          data = part.at("input_audio").at("data");
+                        std::vector<uint8_t> data_buf;
+                        base64::decode(begin(data), end(data), std::back_inserter(data_buf));
+                        auto wav_data = std::vector<std::byte>(data_buf.size());
+                        memcpy(wav_data.data(), data_buf.data(), data_buf.size());
+                        messages.push_back({ role, mtmd_default_marker(), wav_data });
+                    }
+                }
+            }
+
+            if (!stream) {
+                res_error(res, "non streaming API is not implemented", 400);
+                return;
+            }
+
+            // Create output buffer and enqueue work
+            auto output = std::make_shared<OutputBuffer>();
+
+            // NOTE(review): runs on the worker thread but holds req by reference — confirm req outlives generation
+            auto check_abort = [&req, output, &runner, &is_server_running]() {
+                if (output->aborted.load()) {
+                    return;
+                }
+                bool should_abort = !is_server_running.load();
+                if (!should_abort && req.is_connection_closed) {
+                    should_abort = req.is_connection_closed();
+                }
+                if (should_abort && !output->aborted.exchange(true)) {
+                    LOG_INF("Aborting generation\n");
+                    runner.stop();
+                }
+            };
+            work_queue.push({
+                std::move(messages),
+                std::move(modalities),
+                n_predict,
+                reset_context,
+                output,
+                check_abort,
+                runner.get_output_sample_rate(),
+            });
+
+            // Stream chunks as the worker produces them
+            res.set_content_provider(
+                "text/event-stream", [output, &is_server_running](size_t, httplib::DataSink & sink) {
+                    std::unique_lock<std::mutex> lock(output->mutex);
+
+                    output->cv.wait_for(lock, std::chrono::milliseconds(100), [&output, &is_server_running]() {
+                        return !output->chunks.empty() || output->done.load() || output->aborted.load() ||
+                               !is_server_running.load();
+                    });
+
+                    if (output->aborted.load() || !is_server_running.load()) {
+                        return false;
+                    }
+
+                    while (!output->chunks.empty()) {
+                        const std::string & data = output->chunks.front();
+                        if (!sink.write(data.c_str(), data.size())) {
+                            output->aborted = true;
+                            return false;
+                        }
+                        output->chunks.pop_front();
+                    }
+
+                    if (output->done.load() && output->chunks.empty()) {
+                        sink.done();
+                        return false;
+                    }
+
+                    return true;
+                });
+
+            res.status = 200;
+
+        } catch (const std::exception & e) {
+            res_error(res, std::string("Error processing request: ") + e.what(), 500);
+        }
+    });
+
+    LOG_INF("Starting HTTP server on %s:%d\n", params.hostname.c_str(), params.port);
+
+    if (!svr.bind_to_port(params.hostname, params.port)) {
+        LOG_ERR("Failed to bind to %s:%d\n", params.hostname.c_str(), params.port);
+        return 1;
+    }
+
+    LOG_INF("Server ready at http://%s:%d\n", params.hostname.c_str(), params.port);
+    svr.listen_after_bind();
+
+    LOG_INF("\nShutting down...\n");
+    g_shutdown = nullptr;  // Clear before locals go out of scope
+    work_queue.stop();
+    if (worker.joinable()) {
+        worker.join();
+    }
+
+    return 0;
+}
diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt
index 3be3c27e87b..43033280131 100644
--- a/tools/mtmd/CMakeLists.txt
+++ b/tools/mtmd/CMakeLists.txt
@@ -8,6 +8,7 @@ add_library(mtmd
             mtmd.h
             mtmd-helper.cpp
             mtmd-helper.h
+            audio-decoder.cpp
             clip.cpp
             clip.h
             clip-impl.h
diff --git a/tools/mtmd/audio-decoder.cpp b/tools/mtmd/audio-decoder.cpp
new file mode 100644
index 00000000000..8b878b89e8b
--- /dev/null
+++ b/tools/mtmd/audio-decoder.cpp
@@ -0,0 +1,944 @@
+#include "audio-decoder.h"
+
+#include "clip-impl.h"
+#include "common/common.h"
+#include "ggml-backend.h"
+#include "ggml-cpp.h"
+#include "gguf.h"
+#include "llama.h"
+#include "mtmd-audio.h"
+
+#include <algorithm>
+#include <cmath>
+#include <complex>
+#include <cstdarg>
+#include <cstring>
+#include <fstream>
+#include <functional>
+#include <unordered_map>
+#include <vector>
+
+namespace liquid {
+namespace audio {
+
+using audio_token_t = std::array<int32_t, 8>;
+
+namespace {
+
+// RMS-normalise `cur` with epsilon `eps`, then multiply the result by the weight tensor `mw`.
+ggml_tensor * build_rms_norm(ggml_context * ctx0, ggml_tensor * cur, ggml_tensor * mw, const float eps) {
+    ggml_tensor * normed = ggml_rms_norm(ctx0, cur, eps);
+    ggml_tensor * scaled = ggml_mul(ctx0, normed, mw);
+    return scaled;
+}
+
+// Gated feed-forward block: ffn_down * swiglu_split(ffn_gate * cur, ffn_up * cur).
+ggml_tensor * build_ffn(ggml_context * ctx0,
+                        ggml_tensor *  cur,
+                        ggml_tensor *  ffn_up,
+                        ggml_tensor *  ffn_gate,
+                        ggml_tensor *  ffn_down) {
+    ggml_tensor * up_proj   = ggml_mul_mat(ctx0, ffn_up, cur);
+    ggml_tensor * gate_proj = ggml_mul_mat(ctx0, ffn_gate, cur);
+    ggml_tensor * gated     = ggml_swiglu_split(ctx0, gate_proj, up_proj);
+    // project the gated activation back through ffn_down
+    return ggml_mul_mat(ctx0, ffn_down, gated);
+}
+
+struct audio_decoder_ggml_ctx {  // ggml state of the audio decoder: backends, loaded weights and the current graph
+    gguf_context * ctx_gguf = nullptr;  // GGUF file handle, set by load_gguf()
+    ggml_context * ctx_data = nullptr;  // context holding the weight tensor descriptors
+    ggml_context * ctx_gf   = nullptr;  // context of the current compute graph, recreated by build_graph()
+
+    std::vector<ggml_backend_t>             backends;  // optional GPU backend first, CPU backend last
+    std::vector<ggml_backend_buffer_type_t> bufts;     // default buffer type of each backend, same order
+
+    ggml_backend_buffer_t  buf = nullptr;  // backend buffer holding the weights
+    ggml_backend_sched_ptr sched;          // scheduler built over `backends`
+
+    ggml_cgraph *        gf = nullptr;      // graph produced by the last build_graph() call
+    std::vector<uint8_t> buf_compute_meta;  // backing storage for ctx_gf
+    int                  max_nodes = 16 * 1024;
+
+    std::unordered_map<std::string, ggml_tensor *> tensors;          // weight tensors by name
+    std::unordered_map<std::string, uint32_t>      hyperparameters;  // u32 GGUF keys read by load_gguf()
+
+    explicit audio_decoder_ggml_ctx(bool use_gpu) {  // use_gpu: also try a GPU (or integrated GPU) backend
+        ggml_backend_t backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
+        GGML_ASSERT(backend_cpu);
+
+        if (use_gpu) {
+            ggml_backend_t backend_gpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr);
+            if (!backend_gpu) {
+                backend_gpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU, nullptr);
+            }
+            if (backend_gpu) {
+                LOG_INF("%s: using %s backend\n", __func__, ggml_backend_name(backend_gpu));
+                backends.push_back(backend_gpu);
+                bufts.push_back(ggml_backend_get_default_buffer_type(backend_gpu));
+            }
+        }
+
+        // CPU must be last (scheduler requirement)
+        backends.push_back(backend_cpu);
+        bufts.push_back(ggml_backend_get_default_buffer_type(backend_cpu));
+
+        if (backends.size() == 1) {
+            LOG_INF("%s: using CPU backend\n", __func__);
+        } else {
+            LOG_INF("%s: using GPU+CPU backend\n", __func__);
+        }
+
+        sched.reset(ggml_backend_sched_new(backends.data(), bufts.data(), backends.size(), max_nodes, false, true));
+        buf_compute_meta.resize(max_nodes * ggml_tensor_overhead() + ggml_graph_overhead());
+    }
+
+    void load_gguf(const char * fname) {  // load hyperparameters and all tensors; throws std::runtime_error on failure
+        ggml_context * meta = nullptr;
+
+        gguf_init_params params = {
+            /*.no_alloc = */ true,
+            /*.ctx      = */ &meta,
+        };
+
+        ctx_gguf = gguf_init_from_file(fname, params);
+        if (!ctx_gguf) {
+            throw std::runtime_error(string_format("failed to load gguf file: %s", fname));
+        }
+        const ggml_context_ptr meta_guard(meta);  // releases the metadata context on every exit path
+
+        // load tensors
+        const int n_tensors = gguf_get_n_tensors(ctx_gguf);
+
+        std::vector<uint8_t> read_buf;
+        ggml_init_params     ggml_params = {
+            /*.mem_size   =*/(n_tensors + 1) * ggml_tensor_overhead(),
+            /*.mem_buffer =*/NULL,
+            /*.no_alloc   =*/true,
+        };
+
+        ctx_data = ggml_init(ggml_params);
+        auto fin = std::ifstream(fname, std::ios::binary);
+        if (!fin) {
+            throw std::runtime_error("cannot open model file for loading tensors");
+        }
+
+        // hyperparameters
+        for (const auto & key : { "depthformer_n_layer", "depthformer_n_embd" }) {
+            auto key_id = gguf_find_key(ctx_gguf, key);
+            if (key_id < 0) {
+                throw std::runtime_error(string_format("key not found in gguf: %s", key));
+            }
+            hyperparameters[key] = gguf_get_val_u32(ctx_gguf, key_id);
+        }
+
+        // add tensors to context
+        for (int i = 0; i < n_tensors; ++i) {
+            const char *  name = gguf_get_tensor_name(ctx_gguf, i);
+            ggml_tensor * t    = ggml_get_tensor(meta, name);
+            ggml_tensor * cur  = ggml_dup_tensor(ctx_data, t);
+            ggml_set_name(cur, name);
+            tensors.insert({ name, cur });
+        }
+
+        // alloc memory on primary backend (GPU if available)
+        buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx_data, bufts.front());
+        ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+        for (int i = 0; i < n_tensors; ++i) {
+            const char *  name   = gguf_get_tensor_name(ctx_gguf, i);
+            ggml_tensor * cur    = ggml_get_tensor(ctx_data, name);
+            const size_t  offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i);
+            fin.seekg(offset, std::ios::beg);
+            if (!fin) {
+                throw std::runtime_error(string_format("failed to seek for tensor: %s", name));
+            }
+            const size_t num_bytes = ggml_nbytes(cur);  // size_t: an int would overflow for tensors of 2 GiB or more
+            if (ggml_backend_buft_is_host(bufts.front())) {
+                // for the CPU and Metal backend, we can read directly into the tensor
+                fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
+            } else {
+                // read into a temporary buffer first, then copy to device memory
+                read_buf.resize(num_bytes);
+                fin.read(reinterpret_cast<char *>(read_buf.data()), num_bytes);
+                ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
+            }
+            if (!fin) {  // a short read means the file is truncated or unreadable
+                throw std::runtime_error(string_format("failed to read tensor: %s", name));
+            }
+        }
+        LOG_INF("%s: Loaded %d tensors from %s\n", __func__, n_tensors, fname);
+    }
+
+    // Build a cgraph with `builder_fn` and allocate it on the scheduler; the built cgraph is stored in `gf`.
+    // The previous graph context, if any, is released first.
+    void build_graph(const std::function<void(ggml_context *, ggml_cgraph *)> & builder_fn) {
+        ggml_free(ctx_gf);
+        struct ggml_init_params params = {
+            /*.mem_size   =*/buf_compute_meta.size(),
+            /*.mem_buffer =*/buf_compute_meta.data(),
+            /*.no_alloc   =*/true,
+        };
+
+        ctx_gf = ggml_init(params);
+        ggml_backend_sched_reset(sched.get());
+        gf = ggml_new_graph_custom(ctx_gf, max_nodes, false);
+
+        builder_fn(ctx_gf, gf);
+        ggml_backend_sched_alloc_graph(sched.get(), gf);
+    }
+
+    ggml_status compute() const { return ggml_backend_sched_graph_compute(sched.get(), gf); }
+
+    void set_tensor_data(const std::string & name, const void * data) const {  // `data` must hold ggml_nbytes(t) bytes
+        ggml_tensor * t = ggml_get_tensor(ctx_gf, name.c_str());
+        if (!t) {
+            throw std::runtime_error(string_format("tensor not found: %s", name.c_str()));
+        }
+        ggml_backend_tensor_set(t, data, 0, ggml_nbytes(t));
+    }
+
+    std::pair<ggml_tensor *, std::vector<uint8_t>> get_tensor_data(const std::string & name) const {
+        ggml_tensor * t = ggml_get_tensor(ctx_gf, name.c_str());
+        if (!t) {
+            throw std::runtime_error(string_format("tensor not found: %s", name.c_str()));
+        }
+        std::vector<uint8_t> data(ggml_nbytes(t));
+        ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t));
+        return { t, std::move(data) };  // move the bytes out instead of copying them
+    }
+
+    ggml_tensor * get_weight(const char * fmt, ...) {  // printf-style tensor name lookup; throws if the name is unknown
+        std::vector<char> str(128);
+        va_list           va;
+        va_start(va, fmt);
+        vsnprintf(str.data(), 128, fmt, va);
+        va_end(va);
+        auto it = tensors.find(str.data());
+        if (it == tensors.end()) {
+            throw std::runtime_error(string_format("weight tensor not found: %s", str.data()));
+        }
+        return it->second;
+    }
+
+    ~audio_decoder_ggml_ctx() {
+        sched.reset();      // the scheduler holds the backend handles, so release it before freeing them
+        ggml_free(ctx_gf);  // graph context created by build_graph()
+        ggml_free(ctx_data);
+        gguf_free(ctx_gguf);
+        ggml_backend_buffer_free(buf);
+        for (auto * backend : backends) {
+            ggml_backend_free(backend);
+        }
+    }
+};
+
+// Builds a one-input graph through `builder_fn`, uploads `data` into the tensor named "input", runs the graph
+// and returns the contents of the tensor named "output". The element type must be float or int32_t.
+template <typename Container>
+std::vector<float> run_graph(
+    audio_decoder_ggml_ctx &                                                           ctx,
+    const Container &                                                                  data,
+    const std::function<ggml_tensor *(ggml_context *, ggml_cgraph *, ggml_tensor *)> & builder_fn) {
+    using elem_t = typename Container::value_type;
+    static_assert(std::is_same_v<elem_t, float> || std::is_same_v<elem_t, int32_t>, "Unsupported type");
+    constexpr ggml_type input_type = std::is_same_v<elem_t, float> ? GGML_TYPE_F32 : GGML_TYPE_I32;
+
+    const auto assemble = [&](ggml_context * ctx0, ggml_cgraph * gf) {
+        ggml_tensor * in = ggml_new_tensor_1d(ctx0, input_type, data.size());
+        GGML_ASSERT(in);
+        ggml_set_name(in, "input");
+        ggml_set_input(in);
+
+        ggml_tensor * out = builder_fn(ctx0, gf, in);
+
+        ggml_build_forward_expand(gf, out);
+        ggml_set_name(out, "output");
+        ggml_set_output(out);
+    };
+    ctx.build_graph(assemble);
+
+    ctx.set_tensor_data("input", data.data());
+
+    ctx.compute();
+
+    ggml_tensor * result = ggml_get_tensor(ctx.ctx_gf, "output");
+    GGML_ASSERT(result);
+
+    // NOTE(review): sized by element count as float — presumably the output tensor is F32; confirm
+    std::vector<float> values(ggml_nelements(result));
+    ggml_backend_tensor_get(result, values.data(), 0, ggml_nbytes(result));
+    return values;
+}
+
+// Owns the ggml context and the backend buffer behind the cache tensors (used for KV and conv cache).
+class Cache {
+  public:
+    void init(int n_tensors) {  // create the no_alloc context with room for `n_tensors` tensor headers
+        GGML_ASSERT(!ctx);
+        ggml_init_params params = {};
+        params.mem_size         = n_tensors * ggml_tensor_overhead();
+        params.mem_buffer       = nullptr;
+        params.no_alloc         = true;
+        ctx                     = ggml_init(params);
+    }
+
+    ~Cache() {
+        if (buf != nullptr) {
+            ggml_backend_buffer_free(buf);
+        }
+        if (ctx != nullptr) {
+            ggml_free(ctx);
+        }
+    }
+
+    ggml_tensor * new_tensor(enum ggml_type type, const std::vector<int64_t> & shape) {  // init() must come first
+        GGML_ASSERT(ctx);
+        return ggml_new_tensor(ctx, type, shape.size(), shape.data());
+    }
+
+    void alloc(ggml_backend_buffer_type_t buft) {  // allocate every tensor created so far; may only run once
+        GGML_ASSERT(!buf);
+        buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+    }
+
+  private:
+    ggml_context *        ctx = nullptr;
+    ggml_backend_buffer_t buf = nullptr;
+};
+
+class DepthformerModel {
+  public:
+    void init(audio_decoder_ggml_ctx & ctx) {
+        config.n_layer = ctx.hyperparameters.at("depthformer_n_layer");
+        config.n_embd  = ctx.hyperparameters.at("depthformer_n_embd");
+
+        // cache
+        int n_cache_tensors = config.n_layer * 2;  // kv per layer
+        cache.init(n_cache_tensors);
+
+        weights.layers.resize(config.n_layer);
+        for (int il = 0; il < config.n_layer; il++) {
+            auto & l = weights.layers[il];
+
+            l.operator_norm = ctx.get_weight("depthformer.layers.%d.operator_norm.weight", il);
+            l.wqkv          = ctx.get_weight("depthformer.layers.%d.operator.qkv_proj.weight", il);
+            l.attn_q_norm   = ctx.get_weight("depthformer.layers.%d.operator.attention.q_layernorm.weight", il);
+            l.attn_k_norm   = ctx.get_weight("depthformer.layers.%d.operator.attention.k_layernorm.weight", il);
+            l.wo            = ctx.get_weight("depthformer.layers.%d.operator.out_proj.weight", il);
+            l.ffn_norm      = ctx.get_weight("depthformer.layers.%d.ffn_norm.weight", il);
+            l.w1            = ctx.get_weight("depthformer.layers.%d.feed_forward.w1.weight", il);
+            l.w2            = ctx.get_weight("depthformer.layers.%d.feed_forward.w2.weight", il);
+            l.w3            = ctx.get_weight("depthformer.layers.%d.feed_forward.w3.weight", il);
+            l.k_cache = cache.new_tensor(GGML_TYPE_F32, { config.n_embd_head, config.n_head_kv, config.max_seq_len });
+            l.v_cache = cache.new_tensor(GGML_TYPE_F32, { config.n_embd_head, config.n_head_kv, config.max_seq_len });
+        }
+
+        cache.alloc(ctx.bufts.front());
+    }
+
+    void reset() { n_past = 0; }
+
+    void advance(int n_tokens) { n_past += n_tokens; }
+
+    ggml_tensor * graph(ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * cur) {  // returns the last layer's output; cache writes are expanded into gf
+        auto &    c        = config;
+        const int n_tokens = cur->ne[1];  // number of positions processed by this call
+
+        for (int i = 0; i < c.n_layer; ++i) {
+            const auto & l = weights.layers[i];
+            auto *       x = cur;  // block input, added back as the residual after attention
+
+            // operator_norm
+            cur = build_rms_norm(ctx0, x, l.operator_norm, c.f_norm_rms_eps);
+
+            // attention
+            {
+                ggml_tensor * qkv = ggml_mul_mat(ctx0, l.wqkv, cur);  // fused projection, split into q/k/v views below
+
+                ggml_tensor * q = ggml_view_3d(ctx0, qkv, c.n_embd_head, c.n_head, n_tokens,
+                                               c.n_embd_head * ggml_element_size(qkv), qkv->nb[1], 0);  // q heads start at offset 0
+                ggml_tensor * k = ggml_view_3d(ctx0, qkv, c.n_embd_head, c.n_head_kv, n_tokens,
+                                               c.n_embd_head * ggml_element_size(qkv), qkv->nb[1],
+                                               c.n_embd_head * c.n_head * ggml_element_size(qkv));  // k heads follow the n_head q heads
+                ggml_tensor * v = ggml_view_3d(ctx0, qkv, c.n_embd_head, c.n_head_kv, n_tokens,
+                                               c.n_embd_head * ggml_element_size(qkv), qkv->nb[1],
+                                               c.n_embd_head * (c.n_head + c.n_head_kv) * ggml_element_size(qkv));  // v heads follow the k heads
+
+                q = build_rms_norm(ctx0, q, l.attn_q_norm, c.f_norm_rms_eps);
+                k = build_rms_norm(ctx0, k, l.attn_k_norm, c.f_norm_rms_eps);
+
+                auto   n_rot   = c.n_embd_head;
+                auto * inp_pos = ggml_cast(ctx0, ggml_arange(ctx0, n_past, n_past + n_tokens, 1), GGML_TYPE_I32);  // positions n_past .. n_past + n_tokens - 1
+                q = ggml_rope_ext(ctx0, q, inp_pos, nullptr, n_rot, c.rope_type, c.n_ctx_orig, c.rope_freq_base,
+                                  c.rope_freq_scale, 0, 1, 0, 0);
+                k = ggml_rope_ext(ctx0, k, inp_pos, nullptr, n_rot, c.rope_type, c.n_ctx_orig, c.rope_freq_base,
+                                  c.rope_freq_scale, 0, 1, 0, 0);
+
+                auto * k_cache = l.k_cache;
+                auto * v_cache = l.v_cache;
+
+                // write current k/v to cache at position n_past (NOTE(review): the 2D destination spans one position, so this presumably relies on n_tokens == 1 - confirm)
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, k,
+                                                       ggml_view_2d(ctx0, k_cache, k_cache->ne[0], k_cache->ne[1],
+                                                                    k_cache->nb[1], n_past * k_cache->nb[2])));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, v,
+                                                       ggml_view_2d(ctx0, v_cache, v_cache->ne[0], v_cache->ne[1],
+                                                                    v_cache->nb[1], n_past * v_cache->nb[2])));
+
+                // read k/v from cache: positions [0, n_past + n_tokens), i.e. everything written so far including this call
+                k = ggml_view_3d(ctx0, k_cache, k_cache->ne[0], k_cache->ne[1], n_past + n_tokens, k_cache->nb[1],
+                                 k_cache->nb[2], 0);
+                v = ggml_view_3d(ctx0, v_cache, v_cache->ne[0], v_cache->ne[1], n_past + n_tokens, v_cache->nb[1],
+                                 v_cache->nb[2], 0);
+
+                float kq_scale = 1.0f / sqrtf((float) c.n_embd_head);
+
+                // manual attention, faster for small size; no mask is applied between the scale and the softmax
+                {
+                    q = ggml_permute(ctx0, q, 0, 2, 1, 3);
+                    k = ggml_permute(ctx0, k, 0, 2, 1, 3);
+
+                    auto * kq = ggml_mul_mat(ctx0, k, q);
+                    kq        = ggml_scale(ctx0, kq, kq_scale);
+                    kq        = ggml_soft_max(ctx0, kq);
+
+                    v = ggml_permute(ctx0, v, 1, 2, 0, 3);
+                    v = ggml_cont(ctx0, v);
+
+                    auto * kqv = ggml_mul_mat(ctx0, v, kq);
+                    kqv        = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
+                    cur        = ggml_cont_2d(ctx0, kqv, kqv->ne[0] * kqv->ne[1], kqv->ne[2]);  // merge the first two dimensions back into one
+                }
+
+                cur = ggml_mul_mat(ctx0, l.wo, cur);
+            }
+
+            cur = ggml_add(ctx0, cur, x);  // residual around attention
+
+            auto * ffn_norm = build_rms_norm(ctx0, cur, l.ffn_norm, c.f_norm_rms_eps);
+            auto * ffn_out  = build_ffn(ctx0, ffn_norm, l.w3, l.w1, l.w2);  // note the argument order: w3, w1, w2
+
+            cur = ggml_add(ctx0, cur, ffn_out);  // residual around the feed-forward block
+        }
+
+        return cur;
+    }
+
+  private:
+    struct {
+        // TODO(tarek): read from gguf (for now only n_embd is overwritten from ctx.hyperparameters, key "depthformer_n_embd")
+        int n_layer = 6;
+        int n_embd  = 1024;
+
+        int   n_embd_head    = 32;
+        int   n_head         = 32;  // number of q heads sliced out of the fused qkv projection
+        int   n_head_kv      = 8;   // number of k and of v heads; also the head count of the cache tensors
+        float f_norm_rms_eps = 1e-5f;
+
+        int   n_ctx_orig      = 128000;
+        int   rope_type       = LLAMA_ROPE_TYPE_NORM;
+        float rope_freq_base  = 1000000.0f;
+        float rope_freq_scale = 1.f;
+
+        int max_seq_len = 8;  // number of positions each k/v cache tensor is allocated for
+    } config;
+
+    struct {
+        struct Layer {
+            ggml_tensor * operator_norm = nullptr;  // norm applied before attention
+            ggml_tensor * wqkv          = nullptr;  // fused q/k/v projection
+            ggml_tensor * attn_q_norm   = nullptr;
+            ggml_tensor * attn_k_norm   = nullptr;
+            ggml_tensor * wo            = nullptr;  // attention output projection
+            ggml_tensor * ffn_norm      = nullptr;
+            ggml_tensor * w1            = nullptr;
+            ggml_tensor * w2            = nullptr;
+            ggml_tensor * w3            = nullptr;
+
+            ggml_tensor * k_cache = nullptr;  // F32, { n_embd_head, n_head_kv, max_seq_len }
+            ggml_tensor * v_cache = nullptr;  // F32, { n_embd_head, n_head_kv, max_seq_len }
+        };
+
+        std::vector<Layer> layers;
+    } weights;
+
+    // state
+    Cache   cache;       // holds the per-layer k/v cache tensors
+    int32_t n_past = 0;  // next cache position to write; changed only by reset() and advance()
+};
+
+class DecoderModel {
+  public:
+    void init(audio_decoder_ggml_ctx & ctx) {
+        // depthformer weights and cache first, then the tensors held directly by this class
+        depthformer_model.init(ctx);
+        weights.depth_linear_w = ctx.get_weight("depth_linear.weight");
+        weights.depth_linear_b = ctx.get_weight("depth_linear.bias");
+
+        // one {norm, embedding, logits head} triple per codebook
+        auto & per_codebook = weights.depth_embd_layers;
+        per_codebook.resize(config.n_codebook);
+        for (int idx = 0; idx < config.n_codebook; ++idx) {
+            per_codebook[idx].norm      = ctx.get_weight("depth_embeddings.%d.embedding_norm.weight", idx);
+            per_codebook[idx].embd      = ctx.get_weight("depth_embeddings.%d.embedding.weight", idx);
+            per_codebook[idx].to_logits = ctx.get_weight("depth_embeddings.%d.to_logits.weight", idx);
+        }
+
+        auto & audio_embd    = weights.audio_embedding;
+        audio_embd.norm      = ctx.get_weight("audio_embedding.embedding_norm.weight");
+        audio_embd.embd      = ctx.get_weight("audio_embedding.embedding.weight");
+        audio_embd.to_logits = ctx.get_weight("audio_embedding.to_logits.weight");
+        weights.audio_tokenizer.embd = ctx.get_weight("emb.emb.weight");
+    }
+
+    ggml_tensor * graph(ggml_context * ctx0, ggml_tensor * cur, ggml_cgraph * gf, int j, llama_token prev_token) {
+        auto & depth_embd_layer = weights.depth_embd_layers[j];  // j selects the codebook; it is not range-checked here
+
+        // calculate depthformer_in chunk for codebook j: rows [j * n_embd_d, (j + 1) * n_embd_d) of depth_linear plus the matching bias slice
+        {
+            auto * w        = weights.depth_linear_w;
+            auto * b        = weights.depth_linear_b;
+            auto   n_embd_d = depth_embd_layer.embd->ne[0];  // chunk width, read from this codebook's embedding table
+            cur = ggml_mul_mat(ctx0, ggml_view_2d(ctx0, w, w->ne[0], n_embd_d, w->nb[1], j * n_embd_d * w->nb[1]), cur);
+            cur = ggml_add(ctx0, cur, ggml_view_1d(ctx0, b, n_embd_d, j * n_embd_d * b->nb[0]));
+        }
+
+        if (j > 0) {  // prev_token is only read for j > 0: its row in codebook j - 1's embedding table is added to the input
+            auto * prev_token_tensor = ggml_cast(ctx0, ggml_arange(ctx0, prev_token, prev_token + 1, 1), GGML_TYPE_I32);
+            auto * depthformer_token = ggml_get_rows(ctx0, weights.depth_embd_layers[j - 1].embd, prev_token_tensor);
+            cur                      = ggml_add(ctx0, cur, depthformer_token);
+        }
+
+        cur = depthformer_model.graph(ctx0, gf, cur);  // also adds this step's k/v cache writes to gf
+
+        cur = build_rms_norm(ctx0, cur, depth_embd_layer.norm, config.f_norm_rms_eps);
+
+        cur = ggml_mul_mat(ctx0, depth_embd_layer.to_logits, cur);  // logits for codebook j
+
+        return cur;
+    }
+
+    ggml_tensor * embed(ggml_context * ctx0, ggml_tensor * input) const {
+        ggml_tensor * codebook_offsets = ggml_arange(ctx0, 0, config.n_vocab * config.n_codebook, config.n_vocab);
+        // add codebook_offsets (0, n_vocab, 2 * n_vocab, ...) to the codes; the add runs in F32 and the result is cast back to I32
+        auto *        out_tokens_offsets =
+            ggml_cast(ctx0, ggml_add(ctx0, ggml_cast(ctx0, input, GGML_TYPE_F32), codebook_offsets), GGML_TYPE_I32);
+
+        // sum: gather one row of audio_embedding.embd per offset code, swap the first two dimensions, reduce with ggml_sum_rows
+        auto * out_embd = ggml_get_rows(ctx0, weights.audio_embedding.embd, out_tokens_offsets);
+        out_embd        = ggml_cont(ctx0, ggml_permute(ctx0, out_embd, 1, 0, 2, 3));
+        out_embd        = ggml_sum_rows(ctx0, out_embd);
+        out_embd        = ggml_reshape_1d(ctx0, out_embd, ggml_nelements(out_embd));  // flatten to a single vector
+
+
+        return out_embd;
+    }
+
+    ggml_tensor * embed_for_detokenizer(ggml_context * ctx0, ggml_tensor * input) const {
+        const int n_codes         = 8;  // hard-coded here rather than read from config.n_codebook
+        const int n_output_tokens = 6;  // upsampling factor applied to the second dimension below
+
+        ggml_tensor * cur = input;
+
+        GGML_ASSERT(!(cur->ne[0] % n_codes));  // input length must be a whole number of n_codes-sized groups
+        ggml_tensor * codes = ggml_reshape_2d(ctx0, cur, n_codes, cur->ne[0] / n_codes);
+
+        // TODO(tarek): remove transpose
+        codes = ggml_transpose(ctx0, codes);
+
+        cur = codes;
+
+        // embedding: offset each code into its own slice of the table, gather the rows, then average with ggml_mean
+        {
+            int           n_embd_code = weights.audio_tokenizer.embd->ne[1] / n_codes;  // table rows per code position
+            ggml_tensor * offsets =
+                ggml_reshape_2d(ctx0, ggml_arange(ctx0, 0, n_embd_code * n_codes, n_embd_code), 1, n_codes);
+            auto * x        = ggml_cast(ctx0, cur, GGML_TYPE_F32);
+            auto * offset_x = ggml_cast(ctx0, ggml_add(ctx0, x, offsets), GGML_TYPE_I32);  // add in F32, cast back to I32 for ggml_get_rows
+
+            offset_x = ggml_reshape_1d(ctx0, offset_x, x->ne[0] * x->ne[1]);
+
+            auto * embedding = ggml_get_rows(ctx0, weights.audio_tokenizer.embd, offset_x);
+            embedding        = ggml_reshape_3d(ctx0, embedding, embedding->ne[0], x->ne[0], n_codes);
+            embedding        = ggml_cont(ctx0, ggml_permute(ctx0, embedding, 2, 1, 0, 3));
+            embedding        = ggml_mean(ctx0, embedding);
+            embedding        = ggml_cont(ctx0, ggml_permute(ctx0, embedding, 2, 1, 0, 3));  // same permutation again, undoing the swap above
+            cur              = embedding;
+        }
+
+        // upsample the second dimension by n_output_tokens
+        {
+            auto upsample_size = n_output_tokens * cur->ne[1];
+            cur                = ggml_interpolate(ctx0, cur, cur->ne[0], upsample_size, cur->ne[2], cur->ne[3],
+                                                  0);  // linear interp (NOTE(review): mode 0 looks like GGML_SCALE_MODE_NEAREST in ggml.h, which would not be linear - confirm)
+        }
+
+        return cur;
+    }
+
+    audio_token_t sample(audio_decoder_ggml_ctx & ctx, const std::vector<float> & embd, llama_sampler * smpl) {
+        // Samples one audio frame: one depthformer step per codebook, each step seeing the code sampled before it.
+        GGML_ASSERT(smpl);
+        // TODO(tarek): remove reset
+        llama_sampler_reset(smpl);
+
+        audio_token_t token;
+        llama_token   prev_token = -1;
+        GGML_ASSERT((int) token.size() == config.n_codebook);
+
+        // candidate buffer is allocated once and refilled for every codebook
+        std::vector<llama_token_data> candidates;
+        candidates.reserve(config.n_vocab);
+
+        depthformer_model.reset();
+        for (int i = 0; i < config.n_codebook; ++i) {
+            const auto logits = run_graph(ctx, embd, [&](ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * cur) {
+                return graph(ctx0, cur, gf, i, prev_token);
+            });
+            // logits[0 .. n_vocab) is read below; a shorter result would otherwise be indexed out of bounds
+            GGML_ASSERT((int) logits.size() >= config.n_vocab);
+
+            candidates.clear();
+            for (llama_token token_id = 0; token_id < config.n_vocab; token_id++) {
+                candidates.push_back(llama_token_data{ token_id, logits[token_id], 0.0f });
+            }
+            llama_token_data_array cur_p = { candidates.data(), candidates.size(), -1, false };
+            llama_sampler_apply(smpl, &cur_p);
+
+            GGML_ASSERT(cur_p.selected >= 0 && cur_p.selected < (int32_t) cur_p.size);
+            prev_token = cur_p.data[cur_p.selected].id;
+            llama_sampler_accept(smpl, prev_token);
+
+            token[i] = prev_token;
+            depthformer_model.advance(1);  // next codebook is evaluated at the following cache position
+        }
+        return token;
+    }
+
+  private:
+    struct {
+        int n_codebook = 8;     // codes sampled per audio frame, one depthformer step each
+        int n_vocab    = 2049;  // number of logits read per codebook in sample()
+
+        float f_norm_rms_eps = 1e-5f;
+    } config;
+
+    struct {  // tensor handles filled in by init(); all of them start out null
+        ggml_tensor * depth_linear_w = nullptr;
+        ggml_tensor * depth_linear_b = nullptr;
+
+        struct EmbdLayer {
+            ggml_tensor * norm      = nullptr;
+            ggml_tensor * embd      = nullptr;
+            ggml_tensor * to_logits = nullptr;
+        };
+
+        std::vector<EmbdLayer> depth_embd_layers;  // one entry per codebook
+        EmbdLayer              audio_embedding;
+
+        struct {
+            ggml_tensor * embd = nullptr;  // was the only handle without an initializer
+        } audio_tokenizer;
+    } weights;
+
+    DepthformerModel depthformer_model;
+};
+
+}  // namespace
+
+class audio_decoder_lfm25 : public mtmd_audio_decoder {
+  public:
+    struct {
+        int n_fft       = 1280;   // FFT size; istft() derives n_fft / 2 + 1 bins from it
+        int hop_length  = 320;    // passed to the streaming ISTFT and used for the output size hint
+        int sample_rate = 24000;  // same value as the literal returned by get_sample_rate()
+        int n_codes     = 8;      // not referenced in the visible part of this class
+    } istft_config;
+
+    DecoderModel decoder_model;
+
+    audio_decoder_ggml_ctx ctx;
+
+    bool verbose = false;  // when set, decode() logs the per-frame sampling time
+
+    // tokenizer: the two raw pointers are obtained from audio_tokenizer_llama_init in the constructor and start out null
+    common_init_result_ptr                      audio_tokenizer_llama_init;
+    llama_model *                               audio_tokenizer_model = nullptr;
+    llama_context *                             audio_tokenizer_lctx  = nullptr;
+    std::unique_ptr<mtmd_audio_streaming_istft> istft_state;
+
+    // threadpool: both are filled in by init_threadpool()
+    ggml_threadpool * threadpool                  = nullptr;
+    void (*threadpool_free_fn)(ggml_threadpool *) = nullptr;
+
+    // output modality switch
+    std::vector<mtmd_output_modality> modalities;  // set by set_modalities(); interleaved mode means both TEXT and AUDIO are present
+
+    static constexpr auto interleaved_n_text  = 6;   // budget loaded into modality_left when a text run starts in interleaved mode
+    static constexpr auto interleaved_n_audio = 12;  // budget loaded into modality_left when an audio run starts in interleaved mode
+    int                   modality_left       = INT_MAX;  // decremented per text token / audio frame; INT_MAX acts as "no limit"
+
+    // sampling
+    llama_sampler_ptr smpl;  // created in set_modalities(); decode() passes it to DecoderModel::sample(), which asserts it is non-null
+
+    audio_decoder_lfm25(const std::string & vocoder_path,
+                        const std::string & tokenizer_path,
+                        int                 n_threads,
+                        bool                use_gpu) :
+        ctx(use_gpu) {
+        ctx.load_gguf(vocoder_path.c_str());  // decoder weights come from the vocoder GGUF
+
+        decoder_model.init(ctx);
+
+        // audio tokenizer: loaded as a separate llama model/context with embeddings enabled and no mmproj
+        common_params params_audio_tokenizer;
+        params_audio_tokenizer.model.path  = tokenizer_path;
+        params_audio_tokenizer.mmproj.path = "";
+        params_audio_tokenizer.embedding   = true;
+        audio_tokenizer_llama_init         = common_init_from_params(params_audio_tokenizer);
+        audio_tokenizer_model              = audio_tokenizer_llama_init->model();
+        audio_tokenizer_lctx               = audio_tokenizer_llama_init->context();
+
+        if (!audio_tokenizer_model || !audio_tokenizer_lctx) {
+            LOG_ERR("Failed to load audio tokenizer\n");
+            throw std::runtime_error("Failed to load audio tokenizer");
+        }
+
+        istft_state = std::make_unique<mtmd_audio_streaming_istft>(istft_config.n_fft, istft_config.hop_length);
+        if (!istft_state) {  // NOTE(review): std::make_unique never returns null (it throws on failure), so this branch looks unreachable
+            LOG_ERR("Failed to create ISTFT state\n");
+            throw std::runtime_error("Failed to create ISTFT state");
+        }
+
+        init_threadpool(n_threads);  // runs last: it attaches the pool to audio_tokenizer_lctx and to the last backend in ctx
+    }
+
+    // Frees the threadpool created by init_threadpool(); no other code path in this class releases it.
+    ~audio_decoder_lfm25() override { if (threadpool && threadpool_free_fn) { threadpool_free_fn(threadpool); } }
+    void start_new_turn() override {
+        // clear the detokenizer context's memory and reset the streaming ISTFT before the next turn
+        llama_memory_clear(llama_get_memory(audio_tokenizer_lctx), false);
+        istft_state->reset();
+
+        // interleaved output begins with a fixed text budget; any other configuration gets an effectively
+        // unlimited one
+        const bool interleaved = is_interleaved_mode();
+        modality_left          = interleaved ? interleaved_n_text : INT_MAX;
+    }
+
+    mtmd_audio_decoder_type get_type() override { return mtmd_audio_decoder_type::LFM25; }  // this implementation always reports LFM25
+
+    int decode(mtmd_audio_decode_result & result, const float * embd_ptr, size_t n_embd) override {  // always returns 0 in the visible code
+        modality_left -= 1;  // one audio frame consumed from the current budget
+
+        if (is_interleaved_mode() && modality_left == 0) {
+            modality_left   = interleaved_n_text;  // audio budget used up: hand over to a text run
+            result.is_final = true;
+        } else {
+            result.is_final = false;
+        }
+
+        auto               t0 = ggml_time_ms();
+        std::vector<float> embd(embd_ptr, embd_ptr + n_embd);  // copy of the caller's embedding
+        audio_token_t      next_token = decoder_model.sample(ctx, embd, smpl.get());
+
+        if (verbose) {
+            LOG_INF("audio frame sampled in %" PRId64 " ms\n", ggml_time_ms() - t0);
+        }
+
+        if (next_token[0] == 2048) {  // code 2048 in the first codebook ends the audio run; no PCM is produced for this frame
+            result.is_final = true;  // switch back to text
+            std::fill(next_token.begin(), next_token.end(), 2048);
+        } else {
+            auto decoded = detokenize(next_token);
+
+            result.pcm16.resize(decoded.size());
+            for (size_t i = 0; i < decoded.size(); i++) {
+                result.pcm16[i] = static_cast<int16_t>(std::clamp(decoded[i], -1.0f, 1.0f) * 32767.0f);  // float in [-1, 1] -> int16
+            }
+        }
+
+        result.embedding = embed(next_token);  // computed on both paths, including the end-of-audio frame
+
+        return 0;
+    }
+
+    int get_sample_rate() const override { return istft_config.sample_rate; }  // single source of truth instead of a second literal
+
+    mtmd_output_modality accept_text_token(llama_token token) override {
+        constexpr llama_token tok_audio_start = 128;  // <|audio_start|>
+        constexpr llama_token tok_text_end    = 130;  // <|text_end|>
+
+        --modality_left;
+
+        const bool interleaved = is_interleaved_mode();
+        // <|text_end|> always hands over to audio; <|audio_start|> only does so outside interleaved mode
+        if (token == tok_text_end || (!interleaved && token == tok_audio_start)) {
+            modality_left = INT_MAX;
+            return MTMD_OUTPUT_MODALITY_AUDIO;
+        }
+        // in interleaved mode the text budget running out also switches to audio
+        if (interleaved && modality_left == 0) {
+            modality_left = interleaved_n_audio;
+            return MTMD_OUTPUT_MODALITY_AUDIO;
+        }
+
+        return MTMD_OUTPUT_MODALITY_TEXT;
+    }
+
+    void set_modalities(const std::vector<mtmd_output_modality> & modalities) override {
+        this->modalities = modalities;
+
+        // (re)build the audio sampler: temperature -> top-k -> dist, with a narrower top-k in interleaved mode
+        static constexpr float audio_temperature = 0.8f;
+        const int              audio_top_k       = is_interleaved_mode() ? 4 : 64;
+        // start from the library defaults so that every field of the chain params is initialized
+        llama_sampler_chain_params sparams = llama_sampler_chain_default_params();
+        sparams.no_perf                    = true;
+        smpl                               = llama_sampler_ptr(llama_sampler_chain_init(sparams));
+        llama_sampler_chain_add(smpl.get(), llama_sampler_init_temp(audio_temperature));
+        llama_sampler_chain_add(smpl.get(), llama_sampler_init_top_k(audio_top_k));
+        llama_sampler_chain_add(smpl.get(), llama_sampler_init_dist(0));
+    }
+
+  private:
+    bool is_interleaved_mode() const {  // true only when both text and audio are among the requested modalities
+        const auto b = modalities.begin(), e = modalities.end();
+        return std::find(b, e, MTMD_OUTPUT_MODALITY_TEXT) != e && std::find(b, e, MTMD_OUTPUT_MODALITY_AUDIO) != e;
+    }
+
+    template <typename T> ggml_type get_ggml_type() {  // compile-time map from a C++ element type to its ggml_type
+        if constexpr (std::is_same_v<T, float>) {
+            return GGML_TYPE_F32;
+        } else if constexpr (std::is_same_v<T, int32_t>) {
+            return GGML_TYPE_I32;
+        } else {
+            static_assert(!sizeof(T *), "Unsupported type");  // always false but dependent on T, so it fires only if this branch is instantiated
+        }
+    }
+
+    std::vector<float> embed(const audio_token_t & token) {  // runs DecoderModel::embed on the codes and returns the result as floats
+        return run_graph(ctx, token, [&](ggml_context * ctx0, ggml_cgraph *, ggml_tensor * input) {
+            return decoder_model.embed(ctx0, input);
+        });
+    }
+
+    std::vector<float> embed_for_detokenizer(const audio_token_t & token) {  // runs DecoderModel::embed_for_detokenizer on the codes
+        return run_graph(ctx, token, [&](ggml_context * ctx0, ggml_cgraph *, ggml_tensor * input) {
+            return decoder_model.embed_for_detokenizer(ctx0, input);
+        });
+    }
+
+    std::vector<float> detokenize(const audio_token_t & codes) {
+        // embed_for_detokenizer, converts 8 audio codes into 6 embeddings for lfm2
+        int  n_tokens = 6;  // must match n_output_tokens in DecoderModel::embed_for_detokenizer (both are hard-coded)
+        auto embd     = embed_for_detokenizer(codes);
+
+        const int   n_out = llama_model_n_embd_out(audio_tokenizer_model);
+        llama_batch batch = llama_batch_get_one(nullptr, n_tokens);  // no token ids: the batch is driven by embeddings instead
+
+        batch.embd = embd.data();  // embd must stay alive until llama_decode returns
+
+        if (llama_decode(audio_tokenizer_lctx, batch)) {
+            LOG_ERR("failed to run audio tokenizer\n");
+            exit(1);  // NOTE(review): terminates the whole process on a decode failure instead of reporting it to the caller
+        }
+
+        std::vector<float> output(n_tokens * n_out);
+        std::memcpy(output.data(), llama_get_embeddings(audio_tokenizer_lctx), sizeof(float) * output.size());  // NOTE(review): the returned pointer is not checked for null
+
+        return istft(output);
+    }
+
+    std::vector<float> istft(const std::vector<float> & embd) const {
+        // embd holds, per frame, n_fft_bins log-magnitudes followed by n_fft_bins phase angles
+        const int n_fft_bins = istft_config.n_fft / 2 + 1;
+        const int frame_size = n_fft_bins * 2;
+        const int n_frames   = static_cast<int>(embd.size() / frame_size);
+
+        std::vector<float> output;
+        if (n_frames <= 0) {
+            return output;  // not even one full frame; the size hint below would go negative and make reserve() throw
+        }
+        output.reserve(static_cast<size_t>(n_frames - 1) * istft_config.hop_length);
+
+        std::vector<float> frame_spectrum(frame_size);  // interleaved re/im pairs, fully rewritten for every frame
+        for (int i = 0; i < n_frames; i++) {
+            const float * frame = embd.data() + static_cast<size_t>(i) * frame_size;
+            for (int j = 0; j < n_fft_bins; j++) {
+                const auto log_abs        = frame[j];
+                const auto angle          = frame[n_fft_bins + j];
+                const auto p              = std::polar(expf(log_abs), angle);  // magnitude exp(log_abs) at phase angle
+                frame_spectrum[j * 2 + 0] = p.real();
+                frame_spectrum[j * 2 + 1] = p.imag();
+            }
+            auto frame_output = istft_state->process_frame(frame_spectrum.data());
+            output.insert(output.end(), frame_output.begin(), frame_output.end());
+        }
+        return output;
+    }
+
+    void init_threadpool(int n_threads) {  // creates a CPU threadpool and attaches it to the tokenizer context and the decoder backend
+        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+        GGML_ASSERT(cpu_dev);
+        auto * reg = ggml_backend_dev_backend_reg(cpu_dev);
+        GGML_ASSERT(reg);
+        GGML_ASSERT(n_threads > 0);
+        if (auto * threadpool_new_fn = (ggml_threadpool * (*) (ggml_threadpool_params *) )
+                ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_new");
+            threadpool_new_fn) {
+            ggml_threadpool_params tpp = ggml_threadpool_params_default(n_threads);
+            threadpool                 = threadpool_new_fn(&tpp);
+        }
+        threadpool_free_fn =
+            (decltype(threadpool_free_fn)) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_free");  // looked up via the registry and stored next to the pool
+        GGML_ASSERT(threadpool);  // also trips when the "ggml_threadpool_new" symbol could not be resolved
+
+        llama_attach_threadpool(audio_tokenizer_lctx, threadpool, nullptr);
+        set_threadpool(threadpool, n_threads);
+    }
+
+    void set_threadpool(ggml_threadpool * tp, int n_threads) {  // applies pool and thread count to the last backend in ctx.backends
+        auto * backend_cpu = ctx.backends.back();  // presumably the CPU backend is stored last - TODO confirm against audio_decoder_ggml_ctx
+        GGML_ASSERT(backend_cpu);
+        auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_cpu));
+        if (auto * set_threadpool_fn = (void (*)(ggml_backend_t, ggml_threadpool *)) ggml_backend_reg_get_proc_address(
+                reg, "ggml_backend_cpu_set_threadpool");
+            set_threadpool_fn && tp) {  // silently skipped when the symbol is missing or tp is null
+            set_threadpool_fn(backend_cpu, tp);
+        }
+        if (auto set_n_threads_fn =
+                (void (*)(ggml_backend_t, int)) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
+            set_n_threads_fn) {  // silently skipped when the symbol is missing
+            set_n_threads_fn(backend_cpu, n_threads);
+        }
+    }
+};
+
+}  // namespace audio
+}  // namespace liquid
+
+namespace {
+// FIXME(tarek): replace once model can be loaded via clip or llm path
+bool is_lfm2(const llama_model * model) {
+    char      arch_name[256];
+    const int n_chars = llama_model_meta_val_str(model, "general.architecture", arch_name, sizeof(arch_name));
+    if (n_chars <= 0) {
+        return false;  // no usable "general.architecture" value
+    }
+    return strstr(arch_name, "lfm2") != nullptr;  // substring match, not an exact comparison
+}
+
+}  // namespace
+
+mtmd_audio_decoder_ptr mtmd_audio_decoder_create(const llama_model * text_model,
+                                                 const std::string & vocoder_path,
+                                                 const std::string & tokenizer_path,
+                                                 int                 n_threads,
+                                                 bool                use_gpu) {
+    // only text models recognised by is_lfm2() get a decoder; every other architecture yields a null pointer
+    if (!is_lfm2(text_model)) {
+        return nullptr;
+    }
+    return std::make_unique<liquid::audio::audio_decoder_lfm25>(vocoder_path, tokenizer_path, n_threads, use_gpu);
+}
diff --git a/tools/mtmd/audio-decoder.h b/tools/mtmd/audio-decoder.h
new file mode 100644
index 00000000000..7980e4a4274
--- /dev/null
+++ b/tools/mtmd/audio-decoder.h
@@ -0,0 +1,44 @@
+#pragma once
+
+#include "mtmd.h"
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+enum class mtmd_audio_decoder_type {  // identifies the concrete decoder behind an mtmd_audio_decoder
+    LFM25,  // reported by the audio_decoder_lfm25 implementation
+    OTHER,
+};
+
+struct mtmd_audio_decode_result {  // output of one mtmd_audio_decoder::decode() call
+    std::vector<int16_t> pcm16;      // PCM samples produced for this frame; may stay empty
+    std::vector<float>   embedding;  // embedding of the sampled audio codes, copied out to the caller by mtmd_audio_output_decode
+    bool                 is_final = true;  // true when output should switch back to text after this frame
+};
+
+struct mtmd_audio_decoder {  // abstract interface for turning model embeddings into audio output
+    virtual ~mtmd_audio_decoder() = default;
+
+    virtual mtmd_audio_decoder_type get_type() = 0;
+
+    virtual int get_sample_rate() const = 0;  // sample rate of the PCM written to mtmd_audio_decode_result::pcm16
+
+    virtual int decode(mtmd_audio_decode_result & result, const float * embd, size_t n_embd) = 0;  // 0 on success; callers treat non-zero as failure
+
+    // returns next modality after text token
+    virtual mtmd_output_modality accept_text_token(llama_token token) = 0;
+
+    virtual void set_modalities(const std::vector<mtmd_output_modality> & modalities) = 0;  // selects which output modalities are requested
+
+    virtual void start_new_turn() = 0;  // resets per-turn decoder state
+};
+
+using mtmd_audio_decoder_ptr = std::unique_ptr<mtmd_audio_decoder>;  // owning handle returned by the factory below
+
+struct llama_model;
+mtmd_audio_decoder_ptr mtmd_audio_decoder_create(const llama_model * text_model,  // returns nullptr when no decoder matches the text model
+                                                 const std::string & vocoder_path,
+                                                 const std::string & tokenizer_path,
+                                                 int                 n_threads,
+                                                 bool                use_gpu);
diff --git a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp
index c75f90730f1..0f603e2f876 100644
--- a/tools/mtmd/mtmd-helper.cpp
+++ b/tools/mtmd/mtmd-helper.cpp
@@ -18,9 +18,6 @@
 //#define MTMD_AUDIO_DEBUG
 
 #define MINIAUDIO_IMPLEMENTATION
-#ifndef MTMD_AUDIO_DEBUG
-#   define MA_NO_ENCODING
-#endif
 #define MA_NO_DEVICE_IO
 #define MA_NO_RESOURCE_MANAGER
 #define MA_NO_NODE_GRAPH
@@ -519,3 +516,20 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char *
 
     return mtmd_helper_bitmap_init_from_buf(ctx, buf.data(), buf.size());
 }
+
+bool mtmd_helper_save_wav(const char * fname, const int16_t * data, size_t n_samples, int sample_rate) {
+    // mono, signed 16-bit WAV at the requested sample rate
+    ma_encoder_config enc_cfg = ma_encoder_config_init(ma_encoding_format_wav, ma_format_s16, 1, sample_rate);
+    ma_encoder        enc;
+
+    const ma_result open_res = ma_encoder_init_file(fname, &enc_cfg, &enc);
+    if (open_res != MA_SUCCESS) {
+        LOG_ERR("%s: Failed to open file '%s' for writing (error %d).\n", __func__, fname, open_res);
+        return false;
+    }
+    ma_uint64       n_written;
+    const ma_result write_res = ma_encoder_write_pcm_frames(&enc, data, n_samples, &n_written);
+    ma_encoder_uninit(&enc);
+    // success requires both a clean status and that every frame was written
+    return write_res == MA_SUCCESS && n_written == n_samples;
+}
diff --git a/tools/mtmd/mtmd-helper.h b/tools/mtmd/mtmd-helper.h
index 5036b92442a..9061e3b2231 100644
--- a/tools/mtmd/mtmd-helper.h
+++ b/tools/mtmd/mtmd-helper.h
@@ -85,6 +85,10 @@ MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx,
                                                 int32_t n_batch,
                                                 llama_pos * new_n_past);
 
+// helper function to save audio data to a WAV file
+// returns true on success, false on failure
+MTMD_API bool mtmd_helper_save_wav(const char * fname, const int16_t * data, size_t n_samples, int sample_rate);
+
 #ifdef __cplusplus
 } // extern "C"
 #endif
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 8ca979c86cf..6ceef5379e2 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -1,3 +1,4 @@
+#include "audio-decoder.h"
 #include "clip.h"
 #include "clip-impl.h"
 #include "mtmd.h"
@@ -114,6 +115,8 @@ mtmd_context_params mtmd_context_params_default() {
         /* image_max_tokens  */ -1,
         /* cb_eval           */ nullptr,
         /* cb_eval_user_data */ nullptr,
+        /* vocoder_path      */ nullptr,
+        /* tokenizer_path    */ nullptr,
     };
     return params;
 }
@@ -154,6 +157,11 @@ struct mtmd_context {
 
     std::unique_ptr<mtmd_audio_preprocessor> audio_preproc;
 
+    // audio output
+    std::unique_ptr<mtmd_audio_decoder> audio_decoder;
+    std::vector<int16_t>                audio_output_outstanding_pcm16;
+    mtmd_output_modality                output_modality = MTMD_OUTPUT_MODALITY_TEXT;
+
     // TODO @ngxson : add timings
 
     mtmd_context(const char * mmproj_fname,
@@ -173,49 +181,74 @@ struct mtmd_context {
             throw std::runtime_error("media_marker must not be empty");
         }
 
-        clip_context_params ctx_clip_params {
-            /* use_gpu           */ ctx_params.use_gpu,
-            /* flash_attn_type   */ mtmd_get_clip_flash_attn_type(ctx_params.flash_attn_type),
-            /* image_min_tokens  */ ctx_params.image_min_tokens,
-            /* image_max_tokens  */ ctx_params.image_max_tokens,
-            /* warmup            */ ctx_params.warmup,
-            /* cb_eval           */ ctx_params.cb_eval,
-            /* cb_eval_user_data */ ctx_params.cb_eval_user_data,
-        };
+        // Load multimodal projector if path provided
+        bool has_mmproj = mmproj_fname && mmproj_fname[0] != '\0';
+        if (has_mmproj) {
+            clip_context_params ctx_clip_params {
+                /* use_gpu           */ ctx_params.use_gpu,
+                /* flash_attn_type   */ mtmd_get_clip_flash_attn_type(ctx_params.flash_attn_type),  // keep honoring the caller's setting, as the replaced code did
+                /* image_min_tokens  */ ctx_params.image_min_tokens,
+                /* image_max_tokens  */ ctx_params.image_max_tokens,
+                /* warmup            */ ctx_params.warmup,
+                /* cb_eval           */ ctx_params.cb_eval,
+                /* cb_eval_user_data */ ctx_params.cb_eval_user_data,
+            };
 
-        auto res = clip_init(mmproj_fname, ctx_clip_params);
-        ctx_v = res.ctx_v;
-        ctx_a = res.ctx_a;
-        if (!ctx_v && !ctx_a) {
-            throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
-        }
+            auto res = clip_init(mmproj_fname, ctx_clip_params);
+            ctx_v = res.ctx_v;
+            ctx_a = res.ctx_a;
+            if (!ctx_v && !ctx_a) {
+                throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
+            }
 
-        // if both vision and audio mmproj are present, we need to validate their n_embd
-        if (ctx_v && ctx_a) {
-            int n_embd_v = clip_n_mmproj_embd(ctx_v);
-            int n_embd_a = clip_n_mmproj_embd(ctx_a);
-            if (n_embd_v != n_embd_a) {
+            // if both vision and audio mmproj are present, we need to validate their n_embd
+            if (ctx_v && ctx_a) {
+                int n_embd_v = clip_n_mmproj_embd(ctx_v);
+                int n_embd_a = clip_n_mmproj_embd(ctx_a);
+                if (n_embd_v != n_embd_a) {
+                    throw std::runtime_error(string_format(
+                        "mismatch between vision and audio mmproj (n_embd_v = %d, n_embd_a = %d)\n",
+                        n_embd_v, n_embd_a));
+                }
+            }
+
+            // since we already validate n_embd of vision and audio mmproj,
+            // we can safely assume that they are the same
+            int n_embd_clip = clip_n_mmproj_embd(ctx_v ? ctx_v : ctx_a);
+            if (n_embd_text != n_embd_clip) {
                 throw std::runtime_error(string_format(
-                    "mismatch between vision and audio mmproj (n_embd_v = %d, n_embd_a = %d)\n",
-                    n_embd_v, n_embd_a));
+                    "mismatch between text model (n_embd = %d) and mmproj (n_embd = %d)\n"
+                    "hint: you may be using wrong mmproj\n",
+                    n_embd_text, n_embd_clip));
             }
         }
 
-        // since we already validate n_embd of vision and audio mmproj,
-        // we can safely assume that they are the same
-        int n_embd_clip = clip_n_mmproj_embd(ctx_v ? ctx_v : ctx_a);
-        if (n_embd_text != n_embd_clip) {
-            throw std::runtime_error(string_format(
-                "mismatch between text model (n_embd = %d) and mmproj (n_embd = %d)\n"
-                "hint: you may be using wrong mmproj\n",
-                n_embd_text, n_embd_clip));
-        }
         if (ctx_v) {
             init_vision();
         }
         if (ctx_a) {
             init_audio();
         }
+
+        // Initialize audio output if vocoder path is provided; tokenizer_path defaults to nullptr, so map that to "" before it becomes a std::string
+        if (ctx_params.vocoder_path && ctx_params.vocoder_path[0] != '\0') {
+            audio_decoder = mtmd_audio_decoder_create(
+                text_model,
+                ctx_params.vocoder_path,
+                ctx_params.tokenizer_path ? ctx_params.tokenizer_path : "",
+                ctx_params.n_threads,
+                ctx_params.use_gpu);
+            if (audio_decoder) {
+                LOG_INF("%s: audio decoder initialized\n", __func__);
+            } else {
+                LOG_WRN("%s: failed to initialize audio decoder\n", __func__);
+            }
+        }
+
+        // Require at least one capability
+        if (!ctx_v && !ctx_a && !audio_decoder) {
+            throw std::runtime_error("mtmd_context requires at least one of: vision, audio input, or audio output");
+        }
     }
 
     void init_vision() {
@@ -1154,3 +1187,92 @@ void mtmd_log_set(ggml_log_callback log_callback, void * user_data) {
     g_logger_state.log_callback = log_callback ? log_callback : clip_log_callback_default;
     g_logger_state.log_callback_user_data = user_data;
 }
+
+//
+// audio output API
+//
+
+bool mtmd_support_audio_output(mtmd_context * ctx) {
+    return ctx ? static_cast<bool>(ctx->audio_decoder) : false; // a null ctx never supports audio output
+}
+
+int mtmd_audio_output_get_sample_rate(mtmd_context * ctx) {
+    // mtmd.h documents "returns 0 if audio output is not supported", so report 0 instead of asserting
+    return mtmd_support_audio_output(ctx) ? ctx->audio_decoder->get_sample_rate() : 0;
+}
+
+int mtmd_audio_output_decode(
+        mtmd_context * ctx,
+        const float * embedding,
+        size_t n_embd,
+        float * out_embedding) {
+    // preconditions: an audio decoder is loaded and the context is currently in audio mode
+    GGML_ASSERT(mtmd_support_audio_output(ctx));
+    GGML_ASSERT(ctx->output_modality == MTMD_OUTPUT_MODALITY_AUDIO);
+
+    mtmd_audio_decode_result frame;
+    const auto status = ctx->audio_decoder->decode(frame, embedding, n_embd);
+    if (status != 0) {
+        LOG_ERR("%s: audio decoding failed: %d\n", __func__, status);
+        return status;
+    }
+
+    // copy the embedding produced by the decoder into the caller's buffer
+    // NOTE(review): out_embedding capacity is not validated here - confirm callers size it correctly
+    const size_t n_bytes = frame.embedding.size() * sizeof(float);
+    memcpy(out_embedding, frame.embedding.data(), n_bytes);
+
+    // queue the PCM16 samples; mtmd_get_audio_samples() drains this buffer
+    auto & pending = ctx->audio_output_outstanding_pcm16;
+    pending.insert(pending.end(), frame.pcm16.data(), frame.pcm16.data() + frame.pcm16.size());
+
+    if (frame.is_final) {
+        ctx->output_modality = MTMD_OUTPUT_MODALITY_TEXT; // final frame switches the context back to text
+    }
+    return 0;
+}
+
+void mtmd_audio_output_start_new_turn(mtmd_context * ctx) {
+    GGML_ASSERT(mtmd_support_audio_output(ctx)); // aborts when no audio decoder is loaded
+    (*ctx->audio_decoder).start_new_turn();
+}
+
+mtmd_output_modality mtmd_get_output_modality(mtmd_context * ctx) {
+    return ctx == nullptr ? MTMD_OUTPUT_MODALITY_TEXT : ctx->output_modality; // text is the fallback without a context
+}
+
+// number of PCM16 samples queued by mtmd_audio_output_decode() and not yet retrieved
+int mtmd_get_n_audio_samples(mtmd_context * ctx) {
+    GGML_ASSERT(mtmd_support_audio_output(ctx));
+    return static_cast<int>(ctx->audio_output_outstanding_pcm16.size());
+}
+
+// copy all queued PCM16 samples into `samples`, empty the queue, and return how many were copied
+int mtmd_get_audio_samples(mtmd_context * ctx, int16_t * samples) {
+    GGML_ASSERT(mtmd_support_audio_output(ctx));
+    const int n_samples = mtmd_get_n_audio_samples(ctx);
+    if (n_samples > 0) { // nothing queued: skip memcpy, whose pointer arguments must be valid even for 0 bytes
+        memcpy(samples, ctx->audio_output_outstanding_pcm16.data(), n_samples * sizeof(int16_t));
+    }
+    ctx->audio_output_outstanding_pcm16.clear();
+    return n_samples;
+}
+
+void mtmd_audio_output_accept_token(mtmd_context * ctx, llama_token id) {
+    // silently ignored (no assert) when there is no context or no audio decoder
+    if (ctx && ctx->audio_decoder) {
+        // the value returned by accept_text_token() becomes the current output modality
+        ctx->output_modality = ctx->audio_decoder->accept_text_token(id);
+    }
+}
+
+void mtmd_set_output_modalities(mtmd_context * ctx, const mtmd_output_modality * ptr, size_t len) {
+    GGML_ASSERT(mtmd_support_audio_output(ctx));
+    if (ptr != nullptr && len != 0) {
+        // materialize the caller's array [ptr, ptr + len) as a vector for the decoder
+        std::vector<mtmd_output_modality> requested(ptr, ptr + len);
+        ctx->audio_decoder->set_modalities(requested);
+    } else {
+        ctx->audio_decoder->set_modalities({}); // null pointer or zero length: pass an empty list
+    }
+}
diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h
index ef25d32bbef..72f05469721 100644
--- a/tools/mtmd/mtmd.h
+++ b/tools/mtmd/mtmd.h
@@ -99,6 +99,11 @@ struct mtmd_context_params {
     // callback function passed over to mtmd proper
     ggml_backend_sched_eval_callback cb_eval;
     void * cb_eval_user_data;
+
+    // audio output parameters (for TTS models like LFM2.5-Audio)
+    const char * vocoder_path;      // path to vocoder model (enables audio output if set)
+    const char * tokenizer_path;    // path to audio tokenizer model (for LFM2.5)
+
 };
 
 MTMD_API const char * mtmd_default_marker(void);
@@ -236,6 +241,45 @@ MTMD_API void mtmd_log_set(ggml_log_callback log_callback, void * user_data);
 // test function, to be used in test-mtmd-c-api.c
 MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void);
 
+//
+// Audio output API
+//
+enum mtmd_output_modality {
+    MTMD_OUTPUT_MODALITY_TEXT,  // also what mtmd_get_output_modality() returns for a NULL context
+    MTMD_OUTPUT_MODALITY_AUDIO, // mtmd_audio_output_decode() asserts that this is the current modality
+    MTMD_OUTPUT_MODALITY_END,   // NOTE(review): presumably marks the end of generation - confirm against the decoder
+};
+
+// true when ctx is non-NULL and has an audio decoder loaded
+MTMD_API bool mtmd_support_audio_output(mtmd_context * ctx);
+
+// returns 0 if audio output is not supported
+MTMD_API int mtmd_audio_output_get_sample_rate(mtmd_context * ctx);
+
+// decode one audio frame; returns 0 on success, otherwise the decoder's error code
+MTMD_API int mtmd_audio_output_decode(mtmd_context * ctx,
+                                      const float *  embedding,
+                                      size_t         n_embd,
+                                      float *        out_embedding);
+
+// get current output modality (MTMD_OUTPUT_MODALITY_TEXT when ctx is NULL)
+MTMD_API enum mtmd_output_modality mtmd_get_output_modality(mtmd_context * ctx);
+
+// number of PCM16 samples produced by decode calls and not yet retrieved
+MTMD_API int mtmd_get_n_audio_samples(mtmd_context * ctx);
+
+// copy the pending samples into `samples` and clear them; returns the number of samples
+MTMD_API int mtmd_get_audio_samples(mtmd_context * ctx, int16_t * samples);
+
+// accept text token, can switch modality (no-op when audio output is not supported)
+MTMD_API void mtmd_audio_output_accept_token(mtmd_context * ctx, llama_token id);
+
+// set output modalities sequence for generation (NULL or len == 0 sets an empty sequence)
+MTMD_API void mtmd_set_output_modalities(mtmd_context * ctx, const enum mtmd_output_modality * ptr, size_t len);
+
+// notify about new turn start, has to be called after modalities are set
+MTMD_API void mtmd_audio_output_start_new_turn(mtmd_context * ctx);
+
 #ifdef __cplusplus
 } // extern "C"
 #endif
diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp
index ff3c6d3c2b0..b044b8e7460 100644
--- a/tools/server/server-common.cpp
+++ b/tools/server/server-common.cpp
@@ -691,7 +691,7 @@ static std::string fnv_hash(const uint8_t * data, size_t len) {
     return std::to_string(hash);
 }
 
-server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt, std::vector<raw_buffer> files) {
+server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt, std::vector<raw_buffer> files, bool add_special) {
     mtmd::bitmaps bitmaps;
     for (auto & file : files) {
         mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(mctx, file.data(), file.size()));
@@ -708,7 +708,7 @@ server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt, std::
     // multimodal
     mtmd_input_text inp_txt = {
         prompt.c_str(),
-        /* add_special */   true,
+        /* add_special */   add_special,
         /* parse_special */ true,
     };
     mtmd::input_chunks chunks(mtmd_input_chunks_init());
@@ -757,7 +757,7 @@ static server_tokens tokenize_input_subprompt(const llama_vocab * vocab, mtmd_co
             for (const auto & entry : json_prompt.at(JSON_MTMD_DATA_KEY)) {
                 files.push_back(base64_decode(entry));
             }
-            return process_mtmd_prompt(mctx, json_prompt.at(JSON_STRING_PROMPT_KEY), files);
+            return process_mtmd_prompt(mctx, json_prompt.at(JSON_STRING_PROMPT_KEY).get<std::string>(), files, add_special);
         } else {
             // Not multimodal, but contains a subobject.
             llama_tokens tmp = tokenize_mixed(vocab, json_prompt.at(JSON_STRING_PROMPT_KEY), add_special, parse_special);
@@ -1122,6 +1122,19 @@ json oaicompat_chat_params_parse(
         }
     }
 
+    // handle "modalities" parameter for audio output
+    if (body.contains("modalities") && body.at("modalities").is_array()) {
+        for (const auto & modality : body.at("modalities")) {
+            if (modality.is_string()) {
+                if (auto const & m = modality.get<std::string>(); m == "audio") {
+                    llama_params["has_out_audio"] = true;
+                } else if (m == "text") {
+                    llama_params["has_out_text"] = true;
+                }
+            }
+        }
+    }
+
     return llama_params;
 }
 
diff --git a/tools/server/server-common.h b/tools/server/server-common.h
index 4fb9e488dfd..9a5789f7515 100644
--- a/tools/server/server-common.h
+++ b/tools/server/server-common.h
@@ -251,7 +251,7 @@ llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_prompt,
 size_t validate_utf8(const std::string& text);
 
 // process mtmd prompt, return the server_tokens containing both text tokens and media chunks
-server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt, std::vector<raw_buffer> files);
+server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt, std::vector<raw_buffer> files, bool add_special = true);
 
 /**
  * break the input "prompt" object into multiple prompt if needed, then tokenize them
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index aafed495020..82657b6eda8 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -162,6 +162,21 @@ struct server_slot {
     int32_t n_draft_total = 0;      // Total draft tokens generated
     int32_t n_draft_accepted = 0;   // Draft tokens actually accepted
 
+    // Audio output state
+    std::vector<float> audio_embd;  // embedding buffer for audio decode
+    llama_pos audio_pos = 0;  // position counter for audio mode (since we can't push tokens)
+    llama_pos audio_pos_offset = 0;  // offset to add to pos_next() after audio mode (to account for audio decodes)
+
+    // true when the current task requested audio output and the mtmd context has an audio decoder
+    bool has_audio_output() const {
+        return task != nullptr && task->params.has_out_audio && mctx != nullptr && mtmd_support_audio_output(mctx);
+    }
+
+    // true while audio output is active for this slot and the mtmd context reports the AUDIO modality
+    bool is_audio_out_mode() const {
+        return has_audio_output() ? mtmd_get_output_modality(mctx) == MTMD_OUTPUT_MODALITY_AUDIO : false;
+    }
+
     void reset() {
         SLT_DBG(*this, "%s", "\n");
 
@@ -185,6 +200,11 @@ struct server_slot {
         n_draft_total = 0;
         n_draft_accepted = 0;
 
+        // clear audio output state
+        audio_embd.clear();
+        audio_pos = 0;
+        audio_pos_offset = 0;
+
         task_prev = std::move(task);
         task.reset();
 
@@ -692,6 +712,14 @@ struct server_context_impl {
             mparams.image_min_tokens = params_base.image_min_tokens;
             mparams.image_max_tokens = params_base.image_max_tokens;
 
+            // Audio output support (vocoder and tokenizer)
+            if (!params_base.vocoder.model.path.empty()) {
+                mparams.vocoder_path = params_base.vocoder.model.path.c_str();
+            }
+            if (!params_base.vocoder.speaker_file.empty()) {
+                mparams.tokenizer_path = params_base.vocoder.speaker_file.c_str();
+            }
+
             mctx = mtmd_init_from_file(mmproj_path.c_str(), model, mparams);
             if (mctx == nullptr) {
                 SRV_ERR("failed to load multimodal model, '%s'\n", mmproj_path.c_str());
@@ -699,6 +727,10 @@ struct server_context_impl {
             }
             SRV_INF("loaded multimodal model, '%s'\n", mmproj_path.c_str());
 
+            if (mtmd_support_audio_output(mctx)) {
+                SRV_INF("audio output supported, sample_rate = %d\n", mtmd_audio_output_get_sample_rate(mctx));
+            }
+
             if (params_base.ctx_shift) {
                 params_base.ctx_shift = false;
                 SRV_WRN("%s\n", "ctx_shift is not supported by multimodal, it will be disabled");
@@ -1065,6 +1097,18 @@ struct server_context_impl {
     }
 
     bool launch_slot_with_task(server_slot & slot, server_task && task) {
+        // continue mode: prepend existing slot tokens to task tokens
+        if (task.params.continue_slot) {
+            if (slot.prompt.tokens.empty()) {
+                send_error(task, "continue mode requires existing slot state, but slot is empty", ERROR_TYPE_INVALID_REQUEST);
+                return false;
+            }
+            // create combined tokens: existing + new
+            server_tokens combined = slot.prompt.tokens.clone();
+            combined.push_back(task.tokens);
+            task.tokens = std::move(combined);
+        }
+
         // process per-request lora adapters
         if (!task.params.lora.empty()) {
             auto task_loras = construct_lora_list(task.params.lora);
@@ -1173,6 +1217,29 @@ struct server_context_impl {
 
         slot.task = std::make_unique<const server_task>(std::move(task));
 
+        // Initialize audio output if enabled and supported
+        if (slot.has_audio_output()) {
+            // Set output modalities based on requested modalities
+            std::vector<mtmd_output_modality> modalities;
+            if (slot.task->params.has_out_audio) {
+                modalities.push_back(MTMD_OUTPUT_MODALITY_AUDIO);
+            }
+            if (slot.task->params.has_out_text) {
+                modalities.push_back(MTMD_OUTPUT_MODALITY_TEXT);
+            }
+
+            if (!modalities.empty()) {
+                mtmd_set_output_modalities(slot.mctx, modalities.data(), modalities.size());
+                mtmd_audio_output_start_new_turn(slot.mctx);
+
+                // Reserve embedding buffer (don't resize - empty() check is used for first decode detection)
+                slot.audio_embd.clear();
+                slot.audio_embd.reserve(llama_model_n_embd(model));
+            }
+        } else if (slot.task->params.has_out_audio) {
+            SLT_WRN(slot, "%s", "audio output requested but not supported by model\n");
+        }
+
         slot.state = slot.task->is_child()
             ? SLOT_STATE_WAIT_OTHER // wait for the parent to process prompt
             : SLOT_STATE_STARTED;
@@ -1434,6 +1501,12 @@ struct server_context_impl {
             res->timings = slot.get_timings();
         }
 
+        // populate audio output if present
+        if (!tkn.audio_samples.empty()) {
+            res->audio_out             = tkn.audio_samples;
+            res->audio_out_sample_rate = tkn.audio_sample_rate;
+        }
+
         queue_results.send(std::move(res));
     }
 
@@ -2091,15 +2164,23 @@ struct server_context_impl {
                     slot.drafted = std::move(draft);
                 }
             } else {
-                // no speculative decoding
-                slot.i_batch = batch.n_tokens;
+                // check if this slot is in audio output mode and needs embeddings
+                if (slot.is_audio_out_mode() && !slot.audio_embd.empty()) {
+                    SLT_DBG(slot, "slot in audio mode, will process with embeddings separately (n_embd=%zu)\n",
+                            slot.audio_embd.size());
+                    // don't add to batch - will be handled in audio processing loop
+                    continue;
+                }
 
-                common_batch_add(batch, slot.sampled, slot.prompt.tokens.pos_next(), { slot.id }, true);
+                slot.i_batch = batch.n_tokens;
 
+                // Use offset to account for positions consumed by audio mode
+                llama_pos pos = slot.prompt.tokens.pos_next() + slot.audio_pos_offset;
+                common_batch_add(batch, slot.sampled, pos, { slot.id }, true);
                 slot.prompt.tokens.push_back(slot.sampled);
 
-                SLT_DBG(slot, "slot decode token, n_ctx = %d, n_tokens = %d, truncated = %d\n",
-                        slot.n_ctx, slot.prompt.n_tokens(), slot.truncated);
+                SLT_DBG(slot, "slot decode token, n_ctx = %d, n_tokens = %d, pos = %d, truncated = %d\n",
+                        slot.n_ctx, slot.prompt.n_tokens(), pos, slot.truncated);
             }
         }
 
@@ -2600,7 +2681,20 @@ struct server_context_impl {
                 slot_batched->lora[alora_disabled_id].scale = alora_scale;
             }
 
-            llama_set_embeddings(ctx, slot_batched->task->need_embd());
+            // check if embeddings are needed for this batch
+            bool need_embd = slot_batched->task->need_embd();
+
+            // enable embeddings if any slot has audio output enabled
+            if (!need_embd) {
+                for (auto & slot : slots) {
+                    if (slot.is_processing() && slot.is_audio_out_mode()) {
+                        need_embd = true;
+                        break;
+                    }
+                }
+            }
+
+            llama_set_embeddings(ctx, need_embd);
         }
 
         if (batch.n_tokens == 0) {
@@ -2745,6 +2839,11 @@ struct server_context_impl {
                     continue; // continue loop of slots
                 }
 
+                // slots in audio mode are handled by the audio processing loop below
+                if (slot.is_audio_out_mode()) {
+                    continue;
+                }
+
                 if (slot.i_batch_dft.size() > 0) {
                     continue; // sample using speculative decoding
                 }
@@ -2779,6 +2878,11 @@ struct server_context_impl {
                     populate_token_probs(slot, result, slot.task->params.post_sampling_probs, params_base.special, tok_idx);
                 }
 
+                // notify decoder of text token
+                if (slot.has_audio_output()) {
+                    mtmd_audio_output_accept_token(slot.mctx, id);
+                }
+
                 if (!process_token(result, slot)) {
                     // release slot because of stop condition
                     slot.print_timings();
@@ -2845,6 +2949,360 @@ struct server_context_impl {
 
                 SLT_DBG(slot, "accepted %d/%d draft tokens, new n_tokens = %d\n", (int) ids.size() - 1, (int) n_draft, slot.prompt.n_tokens());
             }
+
+            // Audio mode processing - handle slots in audio mode
+            // Two cases:
+            // 1. First audio decode (slot.i_batch >= 0 and audio_embd is empty): get embeddings from main batch
+            // 2. Subsequent decodes: use audio_embd embedding feedback loop
+            bool has_audio_slots = true;
+            while (has_audio_slots) {
+                has_audio_slots = false;
+
+                for (auto & slot : slots) {
+                    if (slot.state != SLOT_STATE_GENERATING) {
+                        continue;
+                    }
+
+                    // Check if this slot needs audio processing
+                    if (!slot.is_audio_out_mode()) {
+                        continue;
+                    }
+
+                    // First audio decode case: get embeddings from main batch decode
+                    // Condition: slot.i_batch >= 0 means the audio_start token was decoded in THIS batch
+                    // and audio_embd is empty means we haven't done the first audio decode yet
+                    if (slot.i_batch >= 0 && slot.audio_embd.empty()) {
+                        // Resize audio_embd for output (was reserved at setup, now actually allocate)
+                        const int n_embd = llama_model_n_embd(model);
+                        slot.audio_embd.resize(n_embd);
+
+                        // Get embeddings from the main batch decode (audio_start token)
+                        const float * embd = llama_get_embeddings_ith(ctx, slot.i_batch);
+                        if (!embd) {
+                            // Fallback to last position (single slot case)
+                            embd = llama_get_embeddings(ctx);
+                        }
+                        if (!embd) {
+                            slot.audio_embd.clear();
+                            continue;
+                        }
+
+                        // Do first audio decode
+                        int res = mtmd_audio_output_decode(
+                            slot.mctx, embd, n_embd,
+                            slot.audio_embd.data());
+
+                        if (res != 0) {
+                            slot.audio_embd.clear();
+                            continue;
+                        }
+
+                        // Initialize audio position for subsequent decodes
+                        // Must include audio_pos_offset: the token was added to the main batch at pos_next() + audio_pos_offset
+                        slot.audio_pos = slot.prompt.tokens.pos_next() + slot.audio_pos_offset;
+                        SLT_DBG(slot, "first audio decode complete, audio_pos = %d (offset=%d)\n",
+                            slot.audio_pos, slot.audio_pos_offset);
+
+                        // Get first audio samples
+                        int n_samples = mtmd_get_n_audio_samples(slot.mctx);
+                        if (n_samples > 0) {
+                            completion_token_output result;
+                            result.tok = 0;
+                            result.text_to_send = "";
+                            result.prob = 1.0f;
+                            result.audio_samples.resize(n_samples);
+                            mtmd_get_audio_samples(slot.mctx, result.audio_samples.data());
+                            result.audio_sample_rate = mtmd_audio_output_get_sample_rate(slot.mctx);
+
+                            send_partial_response(slot, result, false);
+
+                            // Update stats (same as subsequent decodes)
+                            const int64_t t_current = ggml_time_us();
+                            slot.n_decoded += 1;
+                            slot.t_token_generation = std::max<int64_t>(1, t_current - slot.t_start_generation) / 1e3;
+
+                            // Check budget
+                            if (slot.n_remaining > 0) {
+                                --slot.n_remaining;
+                            }
+                            if (slot.n_remaining == 0) {
+                                SLT_INF(slot, "%s", "audio generation reached budget limit\n");
+                                slot.stop = STOP_TYPE_LIMIT;
+                                slot.print_timings();
+                                send_final_response(slot);
+                                metrics.on_prediction(slot);
+                                slot.release();
+                                continue;
+                            }
+                        }
+
+                        // Check if still in audio mode
+                        const bool still_audio = slot.is_audio_out_mode();
+                        if (still_audio) {
+                            has_audio_slots = true;
+                        } else if (!slot.task->params.has_out_text) {
+                            // TTS mode: audio complete, finish generation
+                            SLT_INF(slot, "%s", "TTS mode: audio complete after first decode\n");
+                            slot.audio_embd.clear();
+                            slot.stop = STOP_TYPE_EOS;
+                            slot.print_timings();
+                            send_final_response(slot);
+                            metrics.on_prediction(slot);
+                            slot.release();
+                        } else {
+                            // Interleaved mode: audio ended after first decode
+                            // Need to decode the audio embeddings through backbone to get logits for text
+
+                            // Decode audio_embd through backbone
+                            llama_set_embeddings(ctx, true);
+
+                            llama_batch audio_batch = {};
+                            audio_batch.n_tokens = 1;
+                            audio_batch.token = nullptr;
+                            audio_batch.embd = slot.audio_embd.data();
+
+                            llama_pos pos = slot.audio_pos;
+                            llama_pos pos_arr[] = { pos };
+                            int32_t n_seq_id_arr[] = { 1 };
+                            llama_seq_id seq_id = slot.id;
+                            llama_seq_id * seq_ids_arr[] = { &seq_id };
+                            int8_t logits_arr[] = { 1 };
+
+                            audio_batch.pos = pos_arr;
+                            audio_batch.n_seq_id = n_seq_id_arr;
+                            audio_batch.seq_id = seq_ids_arr;
+                            audio_batch.logits = logits_arr;
+
+                            if (llama_decode(ctx, audio_batch) != 0) {
+                                SLT_ERR(slot, "%s", "failed to decode audio embeddings for text transition\n");
+                                slot.audio_embd.clear();
+                                continue;
+                            }
+                            slot.audio_pos++;
+
+                            // Update position offset
+                            slot.audio_pos_offset = slot.audio_pos - slot.prompt.tokens.pos_next();
+
+                            // Clear audio_embd for next audio segment
+                            slot.audio_embd.clear();
+
+                            // Sample next text token
+                            llama_token next_token = common_sampler_sample(slot.smpl.get(), ctx, -1);
+                            common_sampler_accept(slot.smpl.get(), next_token, true);
+
+                            // Accept into audio decoder state
+                            mtmd_audio_output_accept_token(slot.mctx, next_token);
+
+                            // Create result and process through normal flow
+                            completion_token_output result;
+                            result.tok = next_token;
+                            const bool render_special = slot.has_audio_output() || accept_special_token(slot, result.tok);
+                            result.text_to_send = common_token_to_piece(ctx, result.tok, render_special);
+                            result.prob = 1.0f;
+
+
+                            // Update stats
+                            slot.n_decoded += 1;
+                            const int64_t t_current = ggml_time_us();
+                            slot.t_token_generation = std::max<int64_t>(1, t_current - slot.t_start_generation) / 1e3;
+
+                            if (!process_token(result, slot)) {
+                                slot.print_timings();
+                                send_final_response(slot);
+                                metrics.on_prediction(slot);
+                                slot.release();
+                                continue;
+                            }
+
+                            // Check if immediately switching back to audio
+                            if (slot.is_audio_out_mode()) {
+                                SLT_INF(slot, "%s", "immediately switching back to audio mode\n");
+                                has_audio_slots = true;
+                            }
+                        }
+                        continue;
+                    }
+
+                    // Subsequent audio decodes: embedding feedback loop
+                    if (slot.audio_embd.empty()) {
+                        continue;
+                    }
+
+                    has_audio_slots = true;
+
+                    SLT_DBG(slot, "%s", "processing audio mode slot with embeddings\n");
+
+                    // Enable embeddings for audio decode
+                    llama_set_embeddings(ctx, true);
+
+                    // Create embedding-based batch for this slot
+                    const int n_embd = llama_model_n_embd(model);
+                    llama_pos pos = slot.audio_pos;
+
+                    llama_batch audio_batch = {};
+                    audio_batch.n_tokens = 1;
+                    audio_batch.token = nullptr;
+                    audio_batch.embd = slot.audio_embd.data();
+
+                    // Set up position and sequence info
+                    llama_pos pos_arr[] = { pos };
+                    int32_t n_seq_id_arr[] = { 1 };
+                    llama_seq_id seq_id = slot.id;
+                    llama_seq_id * seq_ids_arr[] = { &seq_id };
+                    int8_t logits_arr[] = { 1 };
+
+                    audio_batch.pos = pos_arr;
+                    audio_batch.n_seq_id = n_seq_id_arr;
+                    audio_batch.seq_id = seq_ids_arr;
+                    audio_batch.logits = logits_arr;
+
+                    // Decode with embeddings
+                    const int ret = llama_decode(ctx, audio_batch);
+
+                    if (ret != 0) {
+                        SLT_ERR(slot, "audio embedding decode failed with code %d\n", ret);
+                        slot.audio_embd.clear();
+                        continue;
+                    }
+
+                    // Increment position for next decode
+                    slot.audio_pos++;
+
+                    // Get embeddings from decode output
+                    const float * embd = llama_get_embeddings(ctx);
+                    if (!embd) {
+                        SLT_WRN(slot, "%s", "no embeddings available after audio decode\n");
+                        slot.audio_embd.clear();
+                        continue;
+                    }
+
+                    // Decode embeddings to audio
+                    int res = mtmd_audio_output_decode(
+                        slot.mctx, embd, n_embd,
+                        slot.audio_embd.data());
+
+                    if (res != 0) {
+                        SLT_WRN(slot, "mtmd_audio_output_decode failed with code %d\n", res);
+                        slot.audio_embd.clear();
+                        continue;
+                    }
+
+                    // Get audio samples
+                    int n_samples = mtmd_get_n_audio_samples(slot.mctx);
+                    if (n_samples > 0) {
+                        completion_token_output result;
+                        result.tok = 0;
+                        result.text_to_send = "";
+                        result.prob = 1.0f;
+                        result.audio_samples.resize(n_samples);
+                        mtmd_get_audio_samples(slot.mctx, result.audio_samples.data());
+                        result.audio_sample_rate = mtmd_audio_output_get_sample_rate(slot.mctx);
+
+                        // Update stats
+                        const int64_t t_current = ggml_time_us();
+                        slot.n_decoded += 1;
+                        slot.t_token_generation = std::max<int64_t>(1, t_current - slot.t_start_generation) / 1e3;
+
+                        // Send audio directly
+                        send_partial_response(slot, result, false);
+
+                        // Check budget
+                        if (slot.n_remaining > 0) {
+                            --slot.n_remaining;
+                        }
+                        if (slot.n_remaining == 0) {
+                            slot.stop = STOP_TYPE_LIMIT;
+                            slot.print_timings();
+                            send_final_response(slot);
+                            metrics.on_prediction(slot);
+                            slot.release();
+                            continue;
+                        }
+                    }
+
+                    // Check if we're still in audio mode for next iteration
+                    if (!slot.is_audio_out_mode()) {
+
+                        // For audio-only mode (TTS), end generation when audio completes
+                        // For interleaved mode, continue to text generation
+                        if (!slot.task->params.has_out_text) {
+                            slot.audio_embd.clear();
+                            slot.audio_pos_offset = slot.audio_pos - slot.prompt.tokens.pos_next();
+                            slot.stop = STOP_TYPE_EOS;
+                            slot.print_timings();
+                            send_final_response(slot);
+                            metrics.on_prediction(slot);
+                            slot.release();
+                        } else {
+                            // Interleaved mode: decode current audio_embd through backbone to get logits
+                            // The current logits are from the PREVIOUS embedding decode, we need the CURRENT one
+                            llama_set_embeddings(ctx, true);
+
+                            llama_batch audio_batch = {};
+                            audio_batch.n_tokens = 1;
+                            audio_batch.token = nullptr;
+                            audio_batch.embd = slot.audio_embd.data();
+
+                            llama_pos pos = slot.audio_pos;
+                            llama_pos pos_arr[] = { pos };
+                            int32_t n_seq_id_arr[] = { 1 };
+                            llama_seq_id seq_id = slot.id;
+                            llama_seq_id * seq_ids_arr[] = { &seq_id };
+                            int8_t logits_arr[] = { 1 };
+
+                            audio_batch.pos = pos_arr;
+                            audio_batch.n_seq_id = n_seq_id_arr;
+                            audio_batch.seq_id = seq_ids_arr;
+                            audio_batch.logits = logits_arr;
+
+                            if (llama_decode(ctx, audio_batch) != 0) {
+                                SLT_ERR(slot, "%s", "failed to decode final audio embeddings for text transition\n");
+                                slot.audio_embd.clear();
+                                slot.audio_pos_offset = slot.audio_pos - slot.prompt.tokens.pos_next();
+                                continue;
+                            }
+                            slot.audio_pos++;
+
+                            // Update offset and clear audio state
+                            slot.audio_pos_offset = slot.audio_pos - slot.prompt.tokens.pos_next();
+                            slot.audio_embd.clear();
+
+                            // Now sample from the correct logits
+                            llama_token next_token = common_sampler_sample(slot.smpl.get(), ctx, -1);
+                            common_sampler_accept(slot.smpl.get(), next_token, true);
+
+                            // Accept into audio decoder state
+                            mtmd_audio_output_accept_token(slot.mctx, next_token);
+
+                            // Create result and process through normal flow
+                            completion_token_output result;
+                            result.tok = next_token;
+                            const bool render_special = slot.has_audio_output() || accept_special_token(slot, result.tok);
+                            result.text_to_send = common_token_to_piece(ctx, result.tok, render_special);
+                            result.prob = 1.0f;
+
+                            // Update stats
+                            slot.n_decoded += 1;
+                            const int64_t t_current = ggml_time_us();
+                            slot.t_token_generation = std::max<int64_t>(1, t_current - slot.t_start_generation) / 1e3;
+
+                            if (!process_token(result, slot)) {
+                                slot.print_timings();
+                                send_final_response(slot);
+                                metrics.on_prediction(slot);
+                                slot.release();
+                                continue;
+                            }
+
+                            // Check if this token switches back to audio mode
+                            if (slot.is_audio_out_mode()) {
+                                SLT_INF(slot, "%s", "immediately switching back to audio mode\n");
+                                has_audio_slots = true;
+                            }
+                        }
+                    }
+                }
+            }
         }
 
         SRV_DBG("%s", "run slots completed\n");
@@ -2976,15 +3434,19 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
         // TODO: this log can become very long, put it behind a flag or think about a more compact format
         //SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str());
 
+        // continue mode (request field "continue", default false): append tokens to the existing slot state instead of replacing it
+        const bool continue_slot = json_value(data, "continue", false);
+        const bool add_special = !continue_slot; // no BOS for continuation
+
         // process prompt
         std::vector<server_tokens> inputs;
 
         if (res_type != TASK_RESPONSE_TYPE_NONE && ctx_server.mctx != nullptr) {
             // This is the case used by OAI compatible chat path with MTMD. TODO It can be moved to the path below.
-            inputs.push_back(process_mtmd_prompt(ctx_server.mctx, prompt.get<std::string>(), files));
+            inputs.push_back(process_mtmd_prompt(ctx_server.mctx, prompt.get<std::string>(), files, add_special));
         } else {
             // Everything else, including multimodal completions.
-            inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true);
+            inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, add_special, true);
         }
 
         // tasks.reserve(inputs.size()); // TODO: this is inaccurate due to child tasks
@@ -3007,6 +3469,9 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
             task.params.oaicompat_cmpl_id = completion_id;
             task.params.oaicompat_model   = meta->model_name;
 
+            // continue mode: append tokens to existing slot
+            task.params.continue_slot = continue_slot;
+
             // prepare child tasks
             if (task.params.n_cmpl > 1) {
                 int n_children = task.params.n_cmpl - 1;
diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp
index d3aba18489b..0852775ebd9 100644
--- a/tools/server/server-task.cpp
+++ b/tools/server/server-task.cpp
@@ -7,6 +7,7 @@
 #include "sampling.h"
 #include "speculative.h"
 #include "json-schema-to-grammar.h"
+#include "base64.hpp"
 
 using json = nlohmann::ordered_json;
 
@@ -506,6 +507,10 @@ task_params server_task::params_from_json_cmpl(
         throw std::runtime_error("n_cmpl cannot be greater than the number of slots, please increase -np");
     }
 
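+    // Output modalities: read from the "has_out_audio" / "has_out_text" request fields (presumably derived upstream from modalities: ["text", "audio"] - TODO confirm)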
+    params.has_out_audio = json_value(data, "has_out_audio", false);
+    params.has_out_text  = json_value(data, "has_out_text", true);
+
     return params;
 }
 
@@ -1406,6 +1411,20 @@ json server_task_result_cmpl_partial::to_json_oaicompat_chat() {
         add_delta(common_chat_msg_diff_to_json_oaicompat(diff));
     }
 
+    // add audio chunk if present: the raw int16 sample bytes are base64-encoded as-is (host byte order) and labelled "pcm16"
+    if (!audio_out.empty()) {
+        std::string audio_data_base64 = base64::encode(
+                reinterpret_cast<const char *>(audio_out.data()),
+                audio_out.size() * sizeof(int16_t));
+        add_delta({
+            {"audio", {
+                {"data", audio_data_base64},
+                {"format", "pcm16"},
+                {"sample_rate", audio_out_sample_rate},
+            }},
+        });
+    }
+
     if (!deltas.empty()) {
         auto & last_json = deltas[deltas.size() - 1];
         GGML_ASSERT(last_json.at("choices").size() >= 1);
diff --git a/tools/server/server-task.h b/tools/server/server-task.h
index e2e3e5a5828..7d17f086e5b 100644
--- a/tools/server/server-task.h
+++ b/tools/server/server-task.h
@@ -51,6 +51,7 @@ struct task_params {
     bool cache_prompt    = true; // remember the prompt to avoid reprocessing all prompt
     bool return_tokens   = false;
     bool return_progress = false;
+    bool continue_slot   = false; // continue from existing slot state, append tokens instead of replacing
 
     int32_t n_keep    =  0; // number of tokens to keep from initial prompt
     int32_t n_discard =  0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
@@ -86,6 +87,10 @@ struct task_params {
     // Embeddings
     int32_t embd_normalize = 2; // (-1=none, 0=max absolute int16, 1=taxicab, 2=Euclidean/L2, >2=p-norm)
 
+    // output modalities (from modalities: ["text"], ["audio"], or ["text", "audio"])
+    bool has_out_audio = false;
+    bool has_out_text  = true;  // default to text
+
     json format_logit_bias(const std::vector<llama_logit_bias> & logit_bias) const;
     json to_json(bool only_metrics = false) const;
 };
@@ -320,6 +325,10 @@ struct completion_token_output {
     };
     std::vector<prob_info> probs;
 
+    // audio output
+    std::vector<int16_t> audio_samples;
+    int audio_sample_rate = 0;
+
     json to_json(bool post_sampling_probs) const;
 
     static json probs_vector_to_json(const std::vector<completion_token_output> & probs, bool post_sampling_probs);
@@ -434,6 +443,10 @@ struct server_task_result_cmpl_partial : server_task_result {
     // for Anthropic API: track if any reasoning content has been generated
     bool anthropic_has_reasoning = false;
 
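+    // Audio output: int16 samples emitted by to_json_oaicompat_chat() as a base64 "pcm16" audio delta, together with audio_out_sample_rate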
+    std::vector<int16_t> audio_out;
+    int audio_out_sample_rate = 0;
+
     virtual bool is_stop() override {
         return false; // in stream mode, partial responses are not considered stop
     }