diff --git a/common/arg.cpp b/common/arg.cpp index 05f4a5244e7..e2c684f3498 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -58,6 +58,7 @@ static std::initializer_list mmproj_examples = { LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI, + LLAMA_EXAMPLE_LIQUID_AUDIO, }; static std::string read_file(const std::string & fname) { @@ -1345,7 +1346,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.system_prompt = value; } - ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION, LLAMA_EXAMPLE_MTMD})); + ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_LIQUID_AUDIO})); add_opt(common_arg( {"--perf"}, {"--no-perf"}, @@ -2165,7 +2166,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.image.emplace_back(item); } } - ).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_CLI})); + ).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_LIQUID_AUDIO})); add_opt(common_arg( {"--image-min-tokens"}, "N", "minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)", @@ -2659,7 +2660,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.out_file = value; } - ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE})); + ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_LIQUID_AUDIO})); add_opt(common_arg( {"-ofreq", "--output-frequency"}, "N", string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq), @@ -2791,14 +2792,14 @@ common_params_context 
common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.hostname = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST")); + ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_LIQUID_AUDIO}).set_env("LLAMA_ARG_HOST")); add_opt(common_arg( {"--port"}, "PORT", string_format("port to listen (default: %d)", params.port), [](common_params & params, int value) { params.port = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT")); + ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_LIQUID_AUDIO}).set_env("LLAMA_ARG_PORT")); add_opt(common_arg( {"--path"}, "PATH", string_format("path to serve static files from (default: %s)", params.public_path.c_str()), @@ -3497,7 +3498,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.vocoder.model.path = value; } - ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_LIQUID_AUDIO})); add_opt(common_arg( {"--tts-use-guide-tokens"}, "Use guide tokens to improve TTS word recall", @@ -3511,7 +3512,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.vocoder.speaker_file = value; } - ).set_examples({LLAMA_EXAMPLE_TTS})); + ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_LIQUID_AUDIO})); add_opt(common_arg( {"--diffusion-steps"}, "N", diff --git a/common/common.h b/common/common.h index c5a80375713..f52916a0884 100644 --- a/common/common.h +++ b/common/common.h @@ -104,6 +104,7 @@ enum llama_example { LLAMA_EXAMPLE_DIFFUSION, LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_FIT_PARAMS, + LLAMA_EXAMPLE_LIQUID_AUDIO, LLAMA_EXAMPLE_COUNT, }; diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 09544173981..c01fa51b95d 100755 --- 
a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -10901,6 +10901,25 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: yield f"{self.dense_tensor_name}.weight", tensor.clone() +@ModelBase.register("Lfm25AudioTokenizer") +class LFM25AudioTokenizer(LFM2Model): + model_arch = gguf.MODEL_ARCH.LFM2 + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) + self.gguf_writer.add_embedding_length_out(self.hparams.get("output_size")) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name == "istft.window" or name.startswith("emb.emb"): + return [] + + if name.startswith("lin"): + name = name.replace("lin", "dense_2_out") + + return super().modify_tensors(data_torch, name, bid) + + @ModelBase.register("Lfm2MoeForCausalLM") class LFM2MoeModel(TextModel): model_arch = gguf.MODEL_ARCH.LFM2MOE diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 24770430e1c..b76cbcfaeb1 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -832,6 +832,13 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos; quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos; + // do not quantize conv weights + quantize &= name.find("conv.dw.weight") == std::string::npos; + quantize &= name.find("conv.pw1.weight") == std::string::npos; + quantize &= name.find("conv.pw2.weight") == std::string::npos; + quantize &= name.find("conv1d") == std::string::npos; + quantize &= name.find("conv_dw.weight") == std::string::npos; + // do not quantize relative position bias (T5) quantize &= name.find("attn_rel_b.weight") == std::string::npos; diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index 518f8b9ae74..746df5a91c3 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -37,4 +37,5 @@ else() 
add_subdirectory(export-lora) endif() add_subdirectory(fit-params) + add_subdirectory(liquid-audio) endif() diff --git a/tools/liquid-audio/CMakeLists.txt b/tools/liquid-audio/CMakeLists.txt new file mode 100644 index 00000000000..cfe27235227 --- /dev/null +++ b/tools/liquid-audio/CMakeLists.txt @@ -0,0 +1,22 @@ +# lib +set(TARGET_LIB liquid-audio) +add_library(${TARGET_LIB} runner.cpp) +target_include_directories(${TARGET_LIB} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) +target_link_libraries(${TARGET_LIB} PUBLIC llama common mtmd ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET_LIB} PRIVATE cxx_std_17) + +# cli +set(TARGET_CLI llama-liquid-audio-cli) +add_executable(${TARGET_CLI} cli.cpp) +target_link_libraries(${TARGET_CLI} PRIVATE ${TARGET_LIB}) +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET_CLI} RUNTIME) +endif() + +# server +set(TARGET_SERVER llama-liquid-audio-server) +add_executable(${TARGET_SERVER} server.cpp) +target_link_libraries(${TARGET_SERVER} PRIVATE ${TARGET_LIB} cpp-httplib) +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET_SERVER} RUNTIME) +endif() diff --git a/tools/liquid-audio/README.md b/tools/liquid-audio/README.md new file mode 100644 index 00000000000..9915dedd568 --- /dev/null +++ b/tools/liquid-audio/README.md @@ -0,0 +1,116 @@ +--- +license: other +license_name: lfm1.0 +license_link: LICENSE +language: +- en +tags: +- liquid +- lfm2.5 +- edge +- llama.cpp +- audio +- speech +- gguf +base_model: +- LiquidAI/LFM2.5-Audio-1.5B +widget: + - text: "Demo" + output: + url: demo.mp4 +--- + +
+ Liquid AI +
+ Try LFM • + Documentation • + LEAP +
+
+ +# LFM2.5-Audio-1.5B + +Find more details in the original model card: https://huggingface.co/LiquidAI/LFM2.5-Audio-1.5B + +## Runners + +The `runners` folder contains runners for various architectures, including: + +- llama-liquid-audio-cli +- llama-liquid-audio-server + +## Convert GGUFs + +```bash +export CKPT=/path/to/LFM2.5-Audio-1.5B +export MODEL=LFM2.5-Audio-1.5B +# backbone +python convert_hf_to_gguf.py $CKPT --outfile $CKPT/${MODEL}-F16.gguf --outtype f16 +./llama-quantize $CKPT/${MODEL}-F16.gguf $CKPT/${MODEL}-Q8_0.gguf Q8_0 +./llama-quantize $CKPT/${MODEL}-F16.gguf $CKPT/${MODEL}-Q4_0.gguf Q4_0 +# mmproj +python convert_hf_to_gguf.py $CKPT --mmproj --outfile $CKPT/mmproj-${MODEL}-F16.gguf --outtype f16 +./llama-quantize $CKPT/mmproj-${MODEL}-F16.gguf $CKPT/mmproj-${MODEL}-Q8_0.gguf Q8_0 +./llama-quantize $CKPT/mmproj-${MODEL}-F16.gguf $CKPT/mmproj-${MODEL}-Q4_0.gguf Q4_0 +# vocoder +python tools/liquid-audio/convert_vocoder_to_gguf.py $CKPT --outfile $CKPT/vocoder-${MODEL}-F16.gguf --outtype f16 +python tools/liquid-audio/convert_vocoder_to_gguf.py $CKPT --outfile $CKPT/vocoder-${MODEL}-Q8_0.gguf --outtype q8_0 +python tools/liquid-audio/convert_vocoder_to_gguf.py $CKPT --outfile $CKPT/vocoder-${MODEL}-Q4_0.gguf --outtype q4_0 +# tokenizer +python convert_hf_to_gguf.py $CKPT/audio_detokenizer --outfile $CKPT/tokenizer-${MODEL}-F16.gguf --outtype f16 +./llama-quantize $CKPT/tokenizer-${MODEL}-F16.gguf $CKPT/tokenizer-${MODEL}-Q8_0.gguf Q8_0 +./llama-quantize $CKPT/tokenizer-${MODEL}-F16.gguf $CKPT/tokenizer-${MODEL}-Q4_0.gguf Q4_0 +``` + +# 🏃 How to run LFM2.5 + +## CLI + +Set the environment variables: 
+``` +export CKPT=/path/to/LFM2.5-Audio-1.5B-GGUF +export INPUT_WAV=/path/to/input.wav +export OUTPUT_WAV=/path/to/output.wav +``` + +### ASR (audio -> text) + +```bash +./llama-liquid-audio-cli -m $CKPT/LFM2.5-Audio-1.5B-Q4_0.gguf -mm $CKPT/mmproj-LFM2.5-Audio-1.5B-Q4_0.gguf -mv $CKPT/vocoder-LFM2.5-Audio-1.5B-Q4_0.gguf --tts-speaker-file $CKPT/tokenizer-LFM2.5-Audio-1.5B-Q4_0.gguf -sys "Perform ASR." --audio $INPUT_WAV +``` + +### TTS (text -> audio) + +```bash +./llama-liquid-audio-cli -m $CKPT/LFM2.5-Audio-1.5B-Q4_0.gguf -mm $CKPT/mmproj-LFM2.5-Audio-1.5B-Q4_0.gguf -mv $CKPT/vocoder-LFM2.5-Audio-1.5B-Q4_0.gguf --tts-speaker-file $CKPT/tokenizer-LFM2.5-Audio-1.5B-Q4_0.gguf -sys "Perform TTS." -p "Hi, how are you?" --output $OUTPUT_WAV +``` + +### Interleaved (audio/text -> audio + text) + +```bash +./llama-liquid-audio-cli -m $CKPT/LFM2.5-Audio-1.5B-Q4_0.gguf -mm $CKPT/mmproj-LFM2.5-Audio-1.5B-Q4_0.gguf -mv $CKPT/vocoder-LFM2.5-Audio-1.5B-Q4_0.gguf --tts-speaker-file $CKPT/tokenizer-LFM2.5-Audio-1.5B-Q4_0.gguf -sys "Respond with interleaved text and audio." --audio $INPUT_WAV --output $OUTPUT_WAV +``` + + +## Server + +Start the server: +``` +export CKPT=/path/to/LFM2.5-Audio-1.5B-GGUF +./llama-liquid-audio-server -m $CKPT/LFM2.5-Audio-1.5B-Q4_0.gguf -mm $CKPT/mmproj-LFM2.5-Audio-1.5B-Q4_0.gguf -mv $CKPT/vocoder-LFM2.5-Audio-1.5B-Q4_0.gguf --tts-speaker-file $CKPT/tokenizer-LFM2.5-Audio-1.5B-Q4_0.gguf +``` + +Use the `liquid_audio_chat.py` script to communicate with the server. 
+ +```bash +uv run liquid_audio_chat.py +``` + +# Demo + + diff --git a/tools/liquid-audio/cli.cpp b/tools/liquid-audio/cli.cpp new file mode 100644 index 00000000000..2bdbbc19514 --- /dev/null +++ b/tools/liquid-audio/cli.cpp @@ -0,0 +1,191 @@ +#include "mtmd-helper.h" +#include "mtmd.h" +#include "runner.h" + +// +#include "arg.h" +#include "common.h" +#include "ggml.h" +#include "log.h" + +#include + +namespace { +std::vector load_file(const char * fname) { + std::vector buf; + FILE * f = fopen(fname, "rb"); + if (!f) { + LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno)); + exit(1); + } + + fseek(f, 0, SEEK_END); + long file_size = ftell(f); + fseek(f, 0, SEEK_SET); + buf.resize(file_size); + + size_t n_read = fread(buf.data(), 1, file_size, f); + fclose(f); + if (n_read != (size_t) file_size) { + LOG_ERR("Failed to read entire file %s", fname); + exit(1); + } + + return buf; +} +} // namespace + +#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) +# include +# include +#elif defined(_WIN32) +# define WIN32_LEAN_AND_MEAN +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include +# include +#endif + +static void show_additional_info(int /*argc*/, char ** argv) { + LOG("CLI for LFM2.5-Audio-1.5B\n\n" + "Usage: %s [options] -m --mmproj -mv --tts-speaker-file " + " " + "-sys [--audio " + "