From 8a4257a55e5cc53016d36a5b546039458e957d1f Mon Sep 17 00:00:00 2001 From: Pascal Date: Thu, 26 Feb 2026 09:54:40 +0100 Subject: [PATCH 1/7] server : support multiple model aliases via comma-separated --alias --- tools/server/server-context.cpp | 14 +++++- tools/server/server-context.h | 1 + tools/server/server-models.cpp | 78 ++++++++++++++++++++++++++++++--- tools/server/server-models.h | 5 +++ 4 files changed, 91 insertions(+), 7 deletions(-) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 73af812437e..402c349360d 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -580,6 +580,7 @@ struct server_context_impl { float slot_prompt_similarity = 0.0f; std::string model_name; // name of the loaded model, to be used by API + std::vector model_aliases; // additional names for the model bool sleeping = false; @@ -813,8 +814,15 @@ struct server_context_impl { SRV_WRN("%s", "for more info see https://github.com/ggml-org/llama.cpp/pull/16391\n"); if (!params_base.model_alias.empty()) { - // user explicitly specified model name - model_name = params_base.model_alias; + // user explicitly specified model name (may include comma-separated aliases) + auto aliases = string_split(params_base.model_alias, ','); + model_name = string_strip(aliases[0]); + for (size_t i = 1; i < aliases.size(); i++) { + auto alias = string_strip(aliases[i]); + if (!alias.empty()) { + model_aliases.push_back(alias); + } + } } else if (!params_base.model.name.empty()) { // use model name in registry format (for models in cache) model_name = params_base.model.name; @@ -2892,6 +2900,7 @@ server_context_meta server_context::get_meta() const { return server_context_meta { /* build_info */ build_info, /* model_name */ impl->model_name, + /* model_aliases */ impl->model_aliases, /* model_path */ impl->params_base.model.path, /* has_mtmd */ impl->mctx != nullptr, /* has_inp_image */ impl->chat_params.allow_image, @@ -3688,6 +3697,7 @@ 
void server_routes::init_routes() { {"data", { { {"id", meta->model_name}, + {"aliases", meta->model_aliases}, {"object", "model"}, {"created", std::time(0)}, {"owned_by", "llamacpp"}, diff --git a/tools/server/server-context.h b/tools/server/server-context.h index 03c29f513bf..cc0d3566281 100644 --- a/tools/server/server-context.h +++ b/tools/server/server-context.h @@ -12,6 +12,7 @@ struct server_context_impl; // private implementation struct server_context_meta { std::string build_info; std::string model_name; + std::vector model_aliases; std::string model_path; bool has_mtmd; bool has_inp_image; diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index efb22da5c3d..f3fd71e6944 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -184,6 +184,32 @@ void server_models::add_model(server_model_meta && meta) { if (mapping.find(meta.name) != mapping.end()) { throw std::runtime_error(string_format("model '%s' appears multiple times", meta.name.c_str())); } + if (name_index.find(meta.name) != name_index.end()) { + throw std::runtime_error(string_format("model name '%s' conflicts with an existing alias", meta.name.c_str())); + } + + // parse aliases from preset's --alias option (comma-separated) + std::string alias_str; + if (meta.preset.get_option("LLAMA_ARG_ALIAS", alias_str) && !alias_str.empty()) { + for (auto & alias : string_split(alias_str, ',')) { + alias = string_strip(alias); + if (alias.empty()) { + continue; + } + if (name_index.find(alias) != name_index.end()) { + throw std::runtime_error(string_format("alias '%s' for model '%s' conflicts with an existing name or alias", + alias.c_str(), meta.name.c_str())); + } + meta.aliases.push_back(alias); + } + } + + // index canonical name + all aliases + name_index[meta.name] = meta.name; + for (const auto & alias : meta.aliases) { + name_index[alias] = meta.name; + } + meta.update_args(ctx_preset, bin_path); // render args std::string name = meta.name; 
mapping[name] = instance_t{ @@ -249,6 +275,7 @@ void server_models::load_models() { server_model_meta meta{ /* preset */ preset.second, /* name */ preset.first, + /* aliases */ {}, /* port */ 0, /* status */ SERVER_MODEL_STATUS_UNLOADED, /* last_used */ 0, @@ -268,7 +295,18 @@ void server_models::load_models() { SRV_INF("Available models (%zu) (*: custom preset)\n", mapping.size()); for (const auto & [name, inst] : mapping) { bool has_custom = custom_names.find(name) != custom_names.end(); - SRV_INF(" %c %s\n", has_custom ? '*' : ' ', name.c_str()); + if (inst.meta.aliases.empty()) { + SRV_INF(" %c %s\n", has_custom ? '*' : ' ', name.c_str()); + } else { + std::string alias_list; + for (const auto & a : inst.meta.aliases) { + if (!alias_list.empty()) { + alias_list += ", "; + } + alias_list += a; + } + SRV_INF(" %c %s (aliases: %s)\n", has_custom ? '*' : ' ', name.c_str(), alias_list.c_str()); + } } } @@ -316,16 +354,25 @@ void server_models::update_meta(const std::string & name, const server_model_met cv.notify_all(); // notify wait_until_loaded } +std::string server_models::resolve_name(const std::string & name) { + std::lock_guard lk(mutex); + auto it = name_index.find(name); + if (it != name_index.end()) { + return it->second; + } + return ""; +} + bool server_models::has_model(const std::string & name) { std::lock_guard lk(mutex); - return mapping.find(name) != mapping.end(); + return name_index.find(name) != name_index.end(); } std::optional server_models::get_meta(const std::string & name) { std::lock_guard lk(mutex); - auto it = mapping.find(name); - if (it != mapping.end()) { - return it->second.meta; + auto it = name_index.find(name); + if (it != name_index.end()) { + return mapping[it->second].meta; } return std::nullopt; } @@ -821,6 +868,11 @@ void server_models_routes::init_routes() { this->proxy_get = [this](const server_http_req & req) { std::string method = "GET"; std::string name = req.get_param("model"); + // resolve alias to canonical model name 
+ std::string resolved = models.resolve_name(name); + if (!resolved.empty()) { + name = resolved; + } bool autoload = is_autoload(params, req); auto error_res = std::make_unique(); if (!router_validate_model(name, models, autoload, error_res)) { @@ -833,6 +885,11 @@ void server_models_routes::init_routes() { std::string method = "POST"; json body = json::parse(req.body); std::string name = json_value(body, "model", std::string()); + // resolve alias to canonical model name + std::string resolved = models.resolve_name(name); + if (!resolved.empty()) { + name = resolved; + } bool autoload = is_autoload(params, req); auto error_res = std::make_unique(); if (!router_validate_model(name, models, autoload, error_res)) { @@ -845,6 +902,11 @@ void server_models_routes::init_routes() { auto res = std::make_unique(); json body = json::parse(req.body); std::string name = json_value(body, "model", std::string()); + // resolve alias to canonical model name + std::string resolved = models.resolve_name(name); + if (!resolved.empty()) { + name = resolved; + } auto model = models.get_meta(name); if (!model.has_value()) { res_err(res, format_error_response("model is not found", ERROR_TYPE_NOT_FOUND)); @@ -883,6 +945,7 @@ void server_models_routes::init_routes() { } models_json.push_back(json { {"id", meta.name}, + {"aliases", meta.aliases}, {"object", "model"}, // for OAI-compat {"owned_by", "llamacpp"}, // for OAI-compat {"created", t}, // for OAI-compat @@ -901,6 +964,11 @@ void server_models_routes::init_routes() { auto res = std::make_unique(); json body = json::parse(req.body); std::string name = json_value(body, "model", std::string()); + // resolve alias to canonical model name + std::string resolved = models.resolve_name(name); + if (!resolved.empty()) { + name = resolved; + } auto model = models.get_meta(name); if (!model.has_value()) { res_err(res, format_error_response("model is not found", ERROR_TYPE_INVALID_REQUEST)); diff --git a/tools/server/server-models.h 
b/tools/server/server-models.h index a397abda4a8..13cd3c7e41f 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -52,6 +52,7 @@ static std::string server_model_status_to_string(server_model_status status) { struct server_model_meta { common_preset preset; std::string name; + std::vector aliases; // additional names that resolve to this model int port = 0; server_model_status status = SERVER_MODEL_STATUS_UNLOADED; int64_t last_used = 0; // for LRU unloading @@ -84,6 +85,7 @@ struct server_models { std::mutex mutex; std::condition_variable cv; std::map mapping; + std::map name_index; // alias/name -> canonical name // for stopping models std::condition_variable cv_stop; @@ -112,6 +114,9 @@ struct server_models { // check if a model instance exists (thread-safe) bool has_model(const std::string & name); + // resolve alias/name to canonical model name, returns empty string if not found (thread-safe) + std::string resolve_name(const std::string & name); + // return a copy of model metadata (thread-safe) std::optional get_meta(const std::string & name); From 66e8fb3b6df45a12c0434df2dcc92f5ac30ab837 Mon Sep 17 00:00:00 2001 From: Pascal Date: Thu, 26 Feb 2026 11:27:20 +0100 Subject: [PATCH 2/7] server : update --alias description and regenerate docs --- common/arg.cpp | 2 +- tools/cli/README.md | 4 ++-- tools/completion/README.md | 4 ++-- tools/server/README.md | 12 +++++++++--- 4 files changed, 14 insertions(+), 8 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 1e8885c9ca5..8efafe34a56 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2520,7 +2520,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex )); add_opt(common_arg( {"-a", "--alias"}, "STRING", - "set alias for model name (to be used by REST API)", + "set model name alias, comma-separated for multiple aliases (to be used by API)", [](common_params & params, const std::string & value) { params.model_alias = value; } diff --git 
a/tools/cli/README.md b/tools/cli/README.md index 4a15cbad9d7..9974656df10 100644 --- a/tools/cli/README.md +++ b/tools/cli/README.md @@ -57,8 +57,8 @@ | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)
(env: LLAMA_ARG_DEFRAG_THOLD) | | `-np, --parallel N` | number of parallel sequences to decode (default: 1)
(env: LLAMA_ARG_N_PARALLEL) | | `--mlock` | force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | -| `--mmap, --no-mmap` | whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)
(env: LLAMA_ARG_MMAP) | -| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. Takes precedence over --mmap (default: enabled)
(env: LLAMA_ARG_DIO) | +| `--mmap, --no-mmap` | whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)
(env: LLAMA_ARG_MMAP) | +| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. (default: disabled)
(env: LLAMA_ARG_DIO) | | `--numa TYPE` | attempt optimizations that help on some NUMA systems
- distribute: spread execution evenly over all nodes
- isolate: only spawn threads on CPUs on the node that execution started on
- numactl: use the CPU map provided by numactl
if run without this previously, it is recommended to drop the system page cache before using this
see https://github.com/ggml-org/llama.cpp/issues/1437
(env: LLAMA_ARG_NUMA) | | `-dev, --device ` | comma-separated list of devices to use for offloading (none = don't offload)
use --list-devices to see a list of available devices
(env: LLAMA_ARG_DEVICE) | | `--list-devices` | print list of available devices and exit | diff --git a/tools/completion/README.md b/tools/completion/README.md index 3ca3e684541..5d3f7c0ab9b 100644 --- a/tools/completion/README.md +++ b/tools/completion/README.md @@ -140,8 +140,8 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1 | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)
(env: LLAMA_ARG_DEFRAG_THOLD) | | `-np, --parallel N` | number of parallel sequences to decode (default: 1)
(env: LLAMA_ARG_N_PARALLEL) | | `--mlock` | force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | -| `--mmap, --no-mmap` | whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)
(env: LLAMA_ARG_MMAP) | -| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. Takes precedence over --mmap (default: enabled)
(env: LLAMA_ARG_DIO) | +| `--mmap, --no-mmap` | whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)
(env: LLAMA_ARG_MMAP) | +| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. (default: disabled)
(env: LLAMA_ARG_DIO) | | `--numa TYPE` | attempt optimizations that help on some NUMA systems
- distribute: spread execution evenly over all nodes
- isolate: only spawn threads on CPUs on the node that execution started on
- numactl: use the CPU map provided by numactl
if run without this previously, it is recommended to drop the system page cache before using this
see https://github.com/ggml-org/llama.cpp/issues/1437
(env: LLAMA_ARG_NUMA) | | `-dev, --device ` | comma-separated list of devices to use for offloading (none = don't offload)
use --list-devices to see a list of available devices
(env: LLAMA_ARG_DEVICE) | | `--list-devices` | print list of available devices and exit | diff --git a/tools/server/README.md b/tools/server/README.md index 0b56ca1e276..ac21fc23aa3 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -74,8 +74,8 @@ For the full list of features, please refer to [server's changelog](https://gith | `-ctv, --cache-type-v TYPE` | KV cache data type for V
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_V) | | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)
(env: LLAMA_ARG_DEFRAG_THOLD) | | `--mlock` | force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | -| `--mmap, --no-mmap` | whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)
(env: LLAMA_ARG_MMAP) | -| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. Takes precedence over --mmap (default: enabled)
(env: LLAMA_ARG_DIO) | +| `--mmap, --no-mmap` | whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)
(env: LLAMA_ARG_MMAP) | +| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. (default: disabled)
(env: LLAMA_ARG_DIO) | | `--numa TYPE` | attempt optimizations that help on some NUMA systems
- distribute: spread execution evenly over all nodes
- isolate: only spawn threads on CPUs on the node that execution started on
- numactl: use the CPU map provided by numactl
if run without this previously, it is recommended to drop the system page cache before using this
see https://github.com/ggml-org/llama.cpp/issues/1437
(env: LLAMA_ARG_NUMA) | | `-dev, --device ` | comma-separated list of devices to use for offloading (none = don't offload)
use --list-devices to see a list of available devices
(env: LLAMA_ARG_DEVICE) | | `--list-devices` | print list of available devices and exit | @@ -162,9 +162,11 @@ For the full list of features, please refer to [server's changelog](https://gith | Argument | Explanation | | -------- | ----------- | +| `-lcs, --lookup-cache-static FNAME` | path to static lookup cache to use for lookup decoding (not updated by generation) | +| `-lcd, --lookup-cache-dynamic FNAME` | path to dynamic lookup cache to use for lookup decoding (updated by generation) | | `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)
(env: LLAMA_ARG_CTX_CHECKPOINTS) | | `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)
(env: LLAMA_ARG_CACHE_RAM) | -| `-kvu, --kv-unified` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)
(env: LLAMA_ARG_KV_UNIFIED) | +| `-kvu, --kv-unified, -no-kvu, --no-kv-unified` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)
(env: LLAMA_ARG_KV_UNIFIED) | | `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)
(env: LLAMA_ARG_CONTEXT_SHIFT) | | `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode | | `-sp, --special` | special tokens output enabled (default: false) | @@ -229,6 +231,10 @@ For the full list of features, please refer to [server's changelog](https://gith | `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)
(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) | | `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused)
(env: LLAMA_ARG_MODEL_DRAFT) | | `--spec-replace TARGET DRAFT` | translate the string in TARGET into DRAFT if the draft model and main model are not compatible | +| `--spec-type [none\|ngram-cache\|ngram-simple\|ngram-map-k\|ngram-map-k4v\|ngram-mod]` | type of speculative decoding to use when no draft model is provided (default: none) | +| `--spec-ngram-size-n N` | ngram size N for ngram-simple/ngram-map speculative decoding, length of lookup n-gram (default: 12) | +| `--spec-ngram-size-m N` | ngram size M for ngram-simple/ngram-map speculative decoding, length of draft m-gram (default: 48) | +| `--spec-ngram-min-hits N` | minimum hits for ngram-map speculative decoding (default: 1) | | `-mv, --model-vocoder FNAME` | vocoder model for audio generation (default: unused) | | `--tts-use-guide-tokens` | Use guide tokens to improve TTS word recall | | `--embd-gemma-default` | use default EmbeddingGemma model (note: can download weights from the internet) | From fa3739ea3b6e769798d3940804d2c4b5a5af73ae Mon Sep 17 00:00:00 2001 From: Pascal Date: Thu, 26 Feb 2026 14:41:22 +0100 Subject: [PATCH 3/7] server : multiple model aliases and tags - address review feedback from ngxson - --alias accepts comma-separated values (std::set, no duplicates) - --tags for informational metadata (not used for routing) - aliases resolve transparently in router via get_meta/has_model - /v1/models exposes aliases and tags fields --- common/arg.cpp | 21 ++++- common/common.h | 3 +- tools/server/server-context.cpp | 21 ++--- tools/server/server-context.h | 4 +- tools/server/server-models.cpp | 137 ++++++++++++++++++-------------- tools/server/server-models.h | 7 +- tools/server/server.cpp | 2 +- 7 files changed, 111 insertions(+), 84 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 8efafe34a56..05f4a5244e7 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2520,11 +2520,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex )); 
add_opt(common_arg( {"-a", "--alias"}, "STRING", - "set model name alias, comma-separated for multiple aliases (to be used by API)", + "set model name aliases, comma-separated (to be used by API)", [](common_params & params, const std::string & value) { - params.model_alias = value; + for (auto & alias : string_split(value, ',')) { + alias = string_strip(alias); + if (!alias.empty()) { + params.model_alias.insert(alias); + } + } } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ALIAS")); + add_opt(common_arg( + {"--tags"}, "STRING", + "set model tags, comma-separated (informational, not used for routing)", + [](common_params & params, const std::string & value) { + for (auto & tag : string_split(value, ',')) { + tag = string_strip(tag); + if (!tag.empty()) { + params.model_tags.insert(tag); + } + } + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TAGS")); add_opt(common_arg( {"-m", "--model"}, "FNAME", ex == LLAMA_EXAMPLE_EXPORT_LORA diff --git a/common/common.h b/common/common.h index 1fa17286562..a945b3fec72 100644 --- a/common/common.h +++ b/common/common.h @@ -410,7 +410,8 @@ struct common_params { struct common_params_model model; - std::string model_alias = ""; // model alias // NOLINT + std::set model_alias; // model aliases // NOLINT + std::set model_tags; // model tags (informational, not used for routing) // NOLINT std::string hf_token = ""; // HF token // NOLINT std::string prompt = ""; // NOLINT std::string system_prompt = ""; // NOLINT diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 402c349360d..65a3704103d 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -580,7 +580,8 @@ struct server_context_impl { float slot_prompt_similarity = 0.0f; std::string model_name; // name of the loaded model, to be used by API - std::vector model_aliases; // additional names for the model + std::set model_aliases; // additional names for the model + std::set model_tags; // 
informational tags bool sleeping = false; @@ -813,18 +814,7 @@ struct server_context_impl { } SRV_WRN("%s", "for more info see https://github.com/ggml-org/llama.cpp/pull/16391\n"); - if (!params_base.model_alias.empty()) { - // user explicitly specified model name (may include comma-separated aliases) - auto aliases = string_split(params_base.model_alias, ','); - model_name = string_strip(aliases[0]); - for (size_t i = 1; i < aliases.size(); i++) { - auto alias = string_strip(aliases[i]); - if (!alias.empty()) { - model_aliases.push_back(alias); - } - } - } else if (!params_base.model.name.empty()) { - // use model name in registry format (for models in cache) + if (!params_base.model.name.empty()) { model_name = params_base.model.name; } else { // fallback: derive model name from file name @@ -832,6 +822,9 @@ struct server_context_impl { model_name = model_path.filename().string(); } + model_aliases = params_base.model_alias; + model_tags = params_base.model_tags; + if (!is_resume) { return init(); } @@ -2901,6 +2894,7 @@ server_context_meta server_context::get_meta() const { /* build_info */ build_info, /* model_name */ impl->model_name, /* model_aliases */ impl->model_aliases, + /* model_tags */ impl->model_tags, /* model_path */ impl->params_base.model.path, /* has_mtmd */ impl->mctx != nullptr, /* has_inp_image */ impl->chat_params.allow_image, @@ -3698,6 +3692,7 @@ void server_routes::init_routes() { { {"id", meta->model_name}, {"aliases", meta->model_aliases}, + {"tags", meta->model_tags}, {"object", "model"}, {"created", std::time(0)}, {"owned_by", "llamacpp"}, diff --git a/tools/server/server-context.h b/tools/server/server-context.h index cc0d3566281..631d573fcbd 100644 --- a/tools/server/server-context.h +++ b/tools/server/server-context.h @@ -6,13 +6,15 @@ #include #include +#include struct server_context_impl; // private implementation struct server_context_meta { std::string build_info; std::string model_name; - std::vector model_aliases; + std::set 
model_aliases; + std::set model_tags; std::string model_path; bool has_mtmd; bool has_inp_image; diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index f3fd71e6944..ad7875c9216 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -184,8 +184,13 @@ void server_models::add_model(server_model_meta && meta) { if (mapping.find(meta.name) != mapping.end()) { throw std::runtime_error(string_format("model '%s' appears multiple times", meta.name.c_str())); } - if (name_index.find(meta.name) != name_index.end()) { - throw std::runtime_error(string_format("model name '%s' conflicts with an existing alias", meta.name.c_str())); + + // check model name does not conflict with existing aliases + for (const auto & [key, inst] : mapping) { + if (inst.meta.aliases.count(meta.name)) { + throw std::runtime_error(string_format("model name '%s' conflicts with alias of model '%s'", + meta.name.c_str(), key.c_str())); + } } // parse aliases from preset's --alias option (comma-separated) @@ -193,21 +198,35 @@ void server_models::add_model(server_model_meta && meta) { if (meta.preset.get_option("LLAMA_ARG_ALIAS", alias_str) && !alias_str.empty()) { for (auto & alias : string_split(alias_str, ',')) { alias = string_strip(alias); - if (alias.empty()) { - continue; + if (!alias.empty()) { + meta.aliases.insert(alias); } - if (name_index.find(alias) != name_index.end()) { - throw std::runtime_error(string_format("alias '%s' for model '%s' conflicts with an existing name or alias", - alias.c_str(), meta.name.c_str())); + } + } + + // parse tags from preset's --tags option (comma-separated) + std::string tags_str; + if (meta.preset.get_option("LLAMA_ARG_TAGS", tags_str) && !tags_str.empty()) { + for (auto & tag : string_split(tags_str, ',')) { + tag = string_strip(tag); + if (!tag.empty()) { + meta.tags.insert(tag); } - meta.aliases.push_back(alias); } } - // index canonical name + all aliases - name_index[meta.name] = meta.name; + // 
validate aliases do not conflict with existing names or aliases for (const auto & alias : meta.aliases) { - name_index[alias] = meta.name; + if (mapping.find(alias) != mapping.end()) { + throw std::runtime_error(string_format("alias '%s' for model '%s' conflicts with existing model name", + alias.c_str(), meta.name.c_str())); + } + for (const auto & [key, inst] : mapping) { + if (inst.meta.aliases.count(alias)) { + throw std::runtime_error(string_format("alias '%s' for model '%s' conflicts with alias of model '%s'", + alias.c_str(), meta.name.c_str(), key.c_str())); + } + } } meta.update_args(ctx_preset, bin_path); // render args @@ -276,6 +295,7 @@ void server_models::load_models() { /* preset */ preset.second, /* name */ preset.first, /* aliases */ {}, + /* tags */ {}, /* port */ 0, /* status */ SERVER_MODEL_STATUS_UNLOADED, /* last_used */ 0, @@ -292,21 +312,28 @@ void server_models::load_models() { for (const auto & [name, preset] : custom_presets) { custom_names.insert(name); } + auto join_set = [](const std::set & s) { + std::string result; + for (const auto & v : s) { + if (!result.empty()) { + result += ", "; + } + result += v; + } + return result; + }; + SRV_INF("Available models (%zu) (*: custom preset)\n", mapping.size()); for (const auto & [name, inst] : mapping) { bool has_custom = custom_names.find(name) != custom_names.end(); - if (inst.meta.aliases.empty()) { - SRV_INF(" %c %s\n", has_custom ? '*' : ' ', name.c_str()); - } else { - std::string alias_list; - for (const auto & a : inst.meta.aliases) { - if (!alias_list.empty()) { - alias_list += ", "; - } - alias_list += a; - } - SRV_INF(" %c %s (aliases: %s)\n", has_custom ? '*' : ' ', name.c_str(), alias_list.c_str()); + std::string info; + if (!inst.meta.aliases.empty()) { + info += " (aliases: " + join_set(inst.meta.aliases) + ")"; } + if (!inst.meta.tags.empty()) { + info += " [tags: " + join_set(inst.meta.tags) + "]"; + } + SRV_INF(" %c %s%s\n", has_custom ? 
'*' : ' ', name.c_str(), info.c_str()); } } @@ -354,25 +381,29 @@ void server_models::update_meta(const std::string & name, const server_model_met cv.notify_all(); // notify wait_until_loaded } -std::string server_models::resolve_name(const std::string & name) { - std::lock_guard lk(mutex); - auto it = name_index.find(name); - if (it != name_index.end()) { - return it->second; - } - return ""; -} - bool server_models::has_model(const std::string & name) { std::lock_guard lk(mutex); - return name_index.find(name) != name_index.end(); + if (mapping.find(name) != mapping.end()) { + return true; + } + for (const auto & [key, inst] : mapping) { + if (inst.meta.aliases.count(name)) { + return true; + } + } + return false; } std::optional server_models::get_meta(const std::string & name) { std::lock_guard lk(mutex); - auto it = name_index.find(name); - if (it != name_index.end()) { - return mapping[it->second].meta; + auto it = mapping.find(name); + if (it != mapping.end()) { + return it->second.meta; + } + for (const auto & [key, inst] : mapping) { + if (inst.meta.aliases.count(name)) { + return inst.meta; + } } return std::nullopt; } @@ -811,7 +842,7 @@ static void res_err(std::unique_ptr & res, const json & error_d res->data = safe_json_to_str({{ "error", error_data }}); } -static bool router_validate_model(const std::string & name, server_models & models, bool models_autoload, std::unique_ptr & res) { +static bool router_validate_model(std::string & name, server_models & models, bool models_autoload, std::unique_ptr & res) { if (name.empty()) { res_err(res, format_error_response("model name is missing from the request", ERROR_TYPE_INVALID_REQUEST)); return false; @@ -821,6 +852,8 @@ static bool router_validate_model(const std::string & name, server_models & mode res_err(res, format_error_response(string_format("model '%s' not found", name.c_str()), ERROR_TYPE_INVALID_REQUEST)); return false; } + // resolve alias to canonical model name + name = meta->name; if 
(models_autoload) { models.ensure_model_loaded(name); } else { @@ -868,11 +901,6 @@ void server_models_routes::init_routes() { this->proxy_get = [this](const server_http_req & req) { std::string method = "GET"; std::string name = req.get_param("model"); - // resolve alias to canonical model name - std::string resolved = models.resolve_name(name); - if (!resolved.empty()) { - name = resolved; - } bool autoload = is_autoload(params, req); auto error_res = std::make_unique(); if (!router_validate_model(name, models, autoload, error_res)) { @@ -885,11 +913,6 @@ void server_models_routes::init_routes() { std::string method = "POST"; json body = json::parse(req.body); std::string name = json_value(body, "model", std::string()); - // resolve alias to canonical model name - std::string resolved = models.resolve_name(name); - if (!resolved.empty()) { - name = resolved; - } bool autoload = is_autoload(params, req); auto error_res = std::make_unique(); if (!router_validate_model(name, models, autoload, error_res)) { @@ -902,21 +925,16 @@ void server_models_routes::init_routes() { auto res = std::make_unique(); json body = json::parse(req.body); std::string name = json_value(body, "model", std::string()); - // resolve alias to canonical model name - std::string resolved = models.resolve_name(name); - if (!resolved.empty()) { - name = resolved; - } - auto model = models.get_meta(name); - if (!model.has_value()) { + auto meta = models.get_meta(name); + if (!meta.has_value()) { res_err(res, format_error_response("model is not found", ERROR_TYPE_NOT_FOUND)); return res; } - if (model->status == SERVER_MODEL_STATUS_LOADED) { + if (meta->status == SERVER_MODEL_STATUS_LOADED) { res_err(res, format_error_response("model is already loaded", ERROR_TYPE_INVALID_REQUEST)); return res; } - models.load(name); + models.load(meta->name); res_ok(res, {{"success", true}}); return res; }; @@ -937,6 +955,7 @@ void server_models_routes::init_routes() { preset_copy.unset_option("LLAMA_ARG_HOST"); 
preset_copy.unset_option("LLAMA_ARG_PORT"); preset_copy.unset_option("LLAMA_ARG_ALIAS"); + preset_copy.unset_option("LLAMA_ARG_TAGS"); status["preset"] = preset_copy.to_ini(); } if (meta.is_failed()) { @@ -946,6 +965,7 @@ void server_models_routes::init_routes() { models_json.push_back(json { {"id", meta.name}, {"aliases", meta.aliases}, + {"tags", meta.tags}, {"object", "model"}, // for OAI-compat {"owned_by", "llamacpp"}, // for OAI-compat {"created", t}, // for OAI-compat @@ -964,11 +984,6 @@ void server_models_routes::init_routes() { auto res = std::make_unique(); json body = json::parse(req.body); std::string name = json_value(body, "model", std::string()); - // resolve alias to canonical model name - std::string resolved = models.resolve_name(name); - if (!resolved.empty()) { - name = resolved; - } auto model = models.get_meta(name); if (!model.has_value()) { res_err(res, format_error_response("model is not found", ERROR_TYPE_INVALID_REQUEST)); @@ -978,7 +993,7 @@ void server_models_routes::init_routes() { res_err(res, format_error_response("model is not loaded", ERROR_TYPE_INVALID_REQUEST)); return res; } - models.unload(name); + models.unload(model->name); res_ok(res, {{"success", true}}); return res; }; diff --git a/tools/server/server-models.h b/tools/server/server-models.h index 13cd3c7e41f..78abc8d72a7 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -52,7 +52,8 @@ static std::string server_model_status_to_string(server_model_status status) { struct server_model_meta { common_preset preset; std::string name; - std::vector aliases; // additional names that resolve to this model + std::set aliases; // additional names that resolve to this model + std::set tags; // informational tags, not used for routing int port = 0; server_model_status status = SERVER_MODEL_STATUS_UNLOADED; int64_t last_used = 0; // for LRU unloading @@ -85,7 +86,6 @@ struct server_models { std::mutex mutex; std::condition_variable cv; std::map mapping; - 
std::map<std::string, std::string> name_index; // alias/name -> canonical name // for stopping models std::condition_variable cv_stop; @@ -114,9 +114,6 @@ struct server_models { // check if a model instance exists (thread-safe) bool has_model(const std::string & name); - // resolve alias/name to canonical model name, returns empty string if not found (thread-safe) - std::string resolve_name(const std::string & name); - // return a copy of model metadata (thread-safe) std::optional<server_model_meta> get_meta(const std::string & name); diff --git a/tools/server/server.cpp b/tools/server/server.cpp index d3d4316026a..542b984534c 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -92,7 +92,7 @@ int main(int argc, char ** argv) { // for consistency between server router mode and single-model mode, we set the same model name as alias if (params.model_alias.empty() && !params.model.name.empty()) { - params.model_alias = params.model.name; + params.model_alias.insert(params.model.name); } common_init(); From ddb8c375e3e1089720c9dac8644d00edbb971281 Mon Sep 17 00:00:00 2001 From: Pascal Date: Thu, 26 Feb 2026 14:42:14 +0100 Subject: [PATCH 4/7] regenerate docs --- tools/cli/README.md | 6 +++--- tools/completion/README.md | 6 +++--- tools/server/README.md | 9 +++++---- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/tools/cli/README.md b/tools/cli/README.md index 9974656df10..22d3fc87e96 100644 --- a/tools/cli/README.md +++ b/tools/cli/README.md @@ -109,14 +109,14 @@ | `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) | | `--sampler-seq, --sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) | | `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) | -| `--temp N` | temperature (default: 0.80) | +| `--temp, --temperature N` | temperature (default: 0.80) | | `--top-k N` | top-k sampling (default: 40, 0 = disabled)
(env: LLAMA_ARG_TOP_K) | | `--top-p N` | top-p sampling (default: 0.95, 1.0 = disabled) | | `--min-p N` | min-p sampling (default: 0.05, 0.0 = disabled) | -| `--top-nsigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) | +| `--top-nsigma, --top-n-sigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) | | `--xtc-probability N` | xtc probability (default: 0.00, 0.0 = disabled) | | `--xtc-threshold N` | xtc threshold (default: 0.10, 1.0 = disabled) | -| `--typical N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) | +| `--typical, --typical-p N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) | | `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) | | `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.00, 1.0 = disabled) | | `--presence-penalty N` | repeat alpha presence penalty (default: 0.00, 0.0 = disabled) | diff --git a/tools/completion/README.md b/tools/completion/README.md index 5d3f7c0ab9b..bcc08876592 100644 --- a/tools/completion/README.md +++ b/tools/completion/README.md @@ -192,14 +192,14 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1 | `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) | | `--sampler-seq, --sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) | | `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) | -| `--temp N` | temperature (default: 0.80) | +| `--temp, --temperature N` | temperature (default: 0.80) | | `--top-k N` | top-k sampling (default: 40, 0 = disabled)
(env: LLAMA_ARG_TOP_K) | | `--top-p N` | top-p sampling (default: 0.95, 1.0 = disabled) | | `--min-p N` | min-p sampling (default: 0.05, 0.0 = disabled) | -| `--top-nsigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) | +| `--top-nsigma, --top-n-sigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) | | `--xtc-probability N` | xtc probability (default: 0.00, 0.0 = disabled) | | `--xtc-threshold N` | xtc threshold (default: 0.10, 1.0 = disabled) | -| `--typical N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) | +| `--typical, --typical-p N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) | | `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) | | `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.00, 1.0 = disabled) | | `--presence-penalty N` | repeat alpha presence penalty (default: 0.00, 0.0 = disabled) | diff --git a/tools/server/README.md b/tools/server/README.md index ac21fc23aa3..a0c69e8a1d6 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -126,14 +126,14 @@ For the full list of features, please refer to [server's changelog](https://gith | `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) | | `--sampler-seq, --sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) | | `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) | -| `--temp N` | temperature (default: 0.80) | +| `--temp, --temperature N` | temperature (default: 0.80) | | `--top-k N` | top-k sampling (default: 40, 0 = disabled)
(env: LLAMA_ARG_TOP_K) | | `--top-p N` | top-p sampling (default: 0.95, 1.0 = disabled) | | `--min-p N` | min-p sampling (default: 0.05, 0.0 = disabled) | -| `--top-nsigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) | +| `--top-nsigma, --top-n-sigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) | | `--xtc-probability N` | xtc probability (default: 0.00, 0.0 = disabled) | | `--xtc-threshold N` | xtc threshold (default: 0.10, 1.0 = disabled) | -| `--typical N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) | +| `--typical, --typical-p N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) | | `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) | | `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.00, 1.0 = disabled) | | `--presence-penalty N` | repeat alpha presence penalty (default: 0.00, 0.0 = disabled) | @@ -184,7 +184,8 @@ For the full list of features, please refer to [server's changelog](https://gith | `-otd, --override-tensor-draft =,...` | override tensor buffer type for draft model | | `-cmoed, --cpu-moe-draft` | keep all Mixture of Experts (MoE) weights in the CPU for the draft model
(env: LLAMA_ARG_CPU_MOE_DRAFT) | | `-ncmoed, --n-cpu-moe-draft N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model
(env: LLAMA_ARG_N_CPU_MOE_DRAFT) | -| `-a, --alias STRING` | set alias for model name (to be used by REST API)
(env: LLAMA_ARG_ALIAS) | +| `-a, --alias STRING` | set model name aliases, comma-separated (to be used by API)
(env: LLAMA_ARG_ALIAS) | +| `--tags STRING` | set model tags, comma-separated (informational, not used for routing)
(env: LLAMA_ARG_TAGS) | | `--host HOST` | ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: 127.0.0.1)
(env: LLAMA_ARG_HOST) | | `--port PORT` | port to listen (default: 8080)
(env: LLAMA_ARG_PORT) | | `--path PATH` | path to serve static files from (default: )
(env: LLAMA_ARG_STATIC_PATH) | From 27762e1de962396e491c06d5f38b61ee863ee0dd Mon Sep 17 00:00:00 2001 From: Pascal Date: Thu, 26 Feb 2026 14:44:51 +0100 Subject: [PATCH 5/7] nits --- common/common.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/common.h b/common/common.h index a945b3fec72..c5a80375713 100644 --- a/common/common.h +++ b/common/common.h @@ -410,8 +410,8 @@ struct common_params { struct common_params_model model; - std::set<std::string> model_alias; // model aliases // NOLINT - std::set<std::string> model_tags; // model tags (informational, not used for routing) // NOLINT + std::set<std::string> model_alias; // model aliases // NOLINT + std::set<std::string> model_tags; // model tags (informational, not used for routing) // NOLINT std::string hf_token = ""; // HF token // NOLINT std::string prompt = ""; // NOLINT std::string system_prompt = ""; // NOLINT From 3bcafe6351d8490d86036074ee7fff4cecc38db4 Mon Sep 17 00:00:00 2001 From: Pascal Date: Thu, 26 Feb 2026 15:12:18 +0100 Subject: [PATCH 6/7] server : use first alias as model_name for backward compat address review feedback from ngxson --- tools/server/server-context.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 65a3704103d..dfc399b26f5 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -814,7 +814,10 @@ struct server_context_impl { } SRV_WRN("%s", "for more info see https://github.com/ggml-org/llama.cpp/pull/16391\n"); - if (!params_base.model.name.empty()) { + if (!params_base.model_alias.empty()) { + // backward compat: use first alias in std::set order (lexicographically smallest, not necessarily the first one given) as model name + model_name = *params_base.model_alias.begin(); + } else if (!params_base.model.name.empty()) { model_name = params_base.model.name; } else { // fallback: derive model name from file name From 14308bfa5e813ae26be0cc76f9ef961f4c5382c5 Mon Sep 17 00:00:00 2001 From: Pascal Date: Thu, 26 Feb 2026 18:19:09 +0100 Subject: [PATCH 7/7] server :
add single-model test for aliases and tags --- tools/server/tests/unit/test_basic.py | 17 +++++++++++++++++ tools/server/tests/utils.py | 3 +++ 2 files changed, 20 insertions(+) diff --git a/tools/server/tests/unit/test_basic.py b/tools/server/tests/unit/test_basic.py index 3405be3e25d..d1b89cf1a91 100644 --- a/tools/server/tests/unit/test_basic.py +++ b/tools/server/tests/unit/test_basic.py @@ -94,3 +94,20 @@ def test_no_webui(): server.start() res = requests.get(url) assert res.status_code == 404 + + +def test_server_model_aliases_and_tags(): + global server + server.model_alias = "tinyllama-2,fim,code" + server.model_tags = "chat,fim,small" + server.start() + res = server.make_request("GET", "/models") + assert res.status_code == 200 + assert len(res.body["data"]) == 1 + model = res.body["data"][0] + # aliases field must contain all aliases + assert set(model["aliases"]) == {"tinyllama-2", "fim", "code"} + # tags field must contain all tags + assert set(model["tags"]) == {"chat", "fim", "small"} + # id is derived from first alias (alphabetical order from std::set) + assert model["id"] == "code" diff --git a/tools/server/tests/utils.py b/tools/server/tests/utils.py index f76bb1a9115..5002999d9b3 100644 --- a/tools/server/tests/utils.py +++ b/tools/server/tests/utils.py @@ -56,6 +56,7 @@ class ServerProcess: # custom options model_alias: str | None = None + model_tags: str | None = None model_url: str | None = None model_file: str | None = None model_draft: str | None = None @@ -180,6 +181,8 @@ def start(self, timeout_seconds: int | None = DEFAULT_HTTP_TIMEOUT) -> None: server_args.extend(["--pooling", self.pooling]) if self.model_alias: server_args.extend(["--alias", self.model_alias]) + if self.model_tags: + server_args.extend(["--tags", self.model_tags]) if self.n_ctx: server_args.extend(["--ctx-size", self.n_ctx]) if self.n_slots: