From 8a4257a55e5cc53016d36a5b546039458e957d1f Mon Sep 17 00:00:00 2001
From: Pascal <admin@serveurperso.com>
Date: Thu, 26 Feb 2026 09:54:40 +0100
Subject: [PATCH 1/7] server : support multiple model aliases via
 comma-separated --alias

---
 tools/server/server-context.cpp | 14 +++++-
 tools/server/server-context.h   |  1 +
 tools/server/server-models.cpp  | 78 ++++++++++++++++++++++++++++++---
 tools/server/server-models.h    |  5 +++
 4 files changed, 91 insertions(+), 7 deletions(-)

diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 73af812437e..402c349360d 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -580,6 +580,7 @@ struct server_context_impl {
     float slot_prompt_similarity = 0.0f;
 
     std::string model_name; // name of the loaded model, to be used by API
+    std::vector<std::string> model_aliases; // additional names for the model
 
     bool sleeping = false;
 
@@ -813,8 +814,15 @@ struct server_context_impl {
         SRV_WRN("%s", "for more info see https://github.com/ggml-org/llama.cpp/pull/16391\n");
 
         if (!params_base.model_alias.empty()) {
-            // user explicitly specified model name
-            model_name = params_base.model_alias;
+            // user explicitly specified model name (may include comma-separated aliases)
+            auto aliases = string_split<std::string>(params_base.model_alias, ',');
+            model_name = string_strip(aliases[0]);
+            for (size_t i = 1; i < aliases.size(); i++) {
+                auto alias = string_strip(aliases[i]);
+                if (!alias.empty()) {
+                    model_aliases.push_back(alias);
+                }
+            }
         } else if (!params_base.model.name.empty()) {
             // use model name in registry format (for models in cache)
             model_name = params_base.model.name;
@@ -2892,6 +2900,7 @@ server_context_meta server_context::get_meta() const {
     return server_context_meta {
         /* build_info             */ build_info,
         /* model_name             */ impl->model_name,
+        /* model_aliases          */ impl->model_aliases,
         /* model_path             */ impl->params_base.model.path,
         /* has_mtmd               */ impl->mctx != nullptr,
         /* has_inp_image          */ impl->chat_params.allow_image,
@@ -3688,6 +3697,7 @@ void server_routes::init_routes() {
             {"data", {
                 {
                     {"id",       meta->model_name},
+                    {"aliases",  meta->model_aliases},
                     {"object",   "model"},
                     {"created",  std::time(0)},
                     {"owned_by", "llamacpp"},
diff --git a/tools/server/server-context.h b/tools/server/server-context.h
index 03c29f513bf..cc0d3566281 100644
--- a/tools/server/server-context.h
+++ b/tools/server/server-context.h
@@ -12,6 +12,7 @@ struct server_context_impl; // private implementation
 struct server_context_meta {
     std::string build_info;
     std::string model_name;
+    std::vector<std::string> model_aliases;
     std::string model_path;
     bool has_mtmd;
     bool has_inp_image;
diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index efb22da5c3d..f3fd71e6944 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -184,6 +184,32 @@ void server_models::add_model(server_model_meta && meta) {
     if (mapping.find(meta.name) != mapping.end()) {
         throw std::runtime_error(string_format("model '%s' appears multiple times", meta.name.c_str()));
     }
+    if (name_index.find(meta.name) != name_index.end()) {
+        throw std::runtime_error(string_format("model name '%s' conflicts with an existing alias", meta.name.c_str()));
+    }
+
+    // parse aliases from preset's --alias option (comma-separated)
+    std::string alias_str;
+    if (meta.preset.get_option("LLAMA_ARG_ALIAS", alias_str) && !alias_str.empty()) {
+        for (auto & alias : string_split<std::string>(alias_str, ',')) {
+            alias = string_strip(alias);
+            if (alias.empty()) {
+                continue;
+            }
+            if (name_index.find(alias) != name_index.end()) {
+                throw std::runtime_error(string_format("alias '%s' for model '%s' conflicts with an existing name or alias",
+                    alias.c_str(), meta.name.c_str()));
+            }
+            meta.aliases.push_back(alias);
+        }
+    }
+
+    // index canonical name + all aliases
+    name_index[meta.name] = meta.name;
+    for (const auto & alias : meta.aliases) {
+        name_index[alias] = meta.name;
+    }
+
     meta.update_args(ctx_preset, bin_path); // render args
     std::string name = meta.name;
     mapping[name] = instance_t{
@@ -249,6 +275,7 @@ void server_models::load_models() {
         server_model_meta meta{
             /* preset       */ preset.second,
             /* name         */ preset.first,
+            /* aliases      */ {},
             /* port         */ 0,
             /* status       */ SERVER_MODEL_STATUS_UNLOADED,
             /* last_used    */ 0,
@@ -268,7 +295,18 @@ void server_models::load_models() {
         SRV_INF("Available models (%zu) (*: custom preset)\n", mapping.size());
         for (const auto & [name, inst] : mapping) {
             bool has_custom = custom_names.find(name) != custom_names.end();
-            SRV_INF("  %c %s\n", has_custom ? '*' : ' ', name.c_str());
+            if (inst.meta.aliases.empty()) {
+                SRV_INF("  %c %s\n", has_custom ? '*' : ' ', name.c_str());
+            } else {
+                std::string alias_list;
+                for (const auto & a : inst.meta.aliases) {
+                    if (!alias_list.empty()) {
+                        alias_list += ", ";
+                    }
+                    alias_list += a;
+                }
+                SRV_INF("  %c %s (aliases: %s)\n", has_custom ? '*' : ' ', name.c_str(), alias_list.c_str());
+            }
         }
     }
 
@@ -316,16 +354,25 @@ void server_models::update_meta(const std::string & name, const server_model_met
     cv.notify_all(); // notify wait_until_loaded
 }
 
+std::string server_models::resolve_name(const std::string & name) {
+    std::lock_guard<std::mutex> lk(mutex);
+    auto it = name_index.find(name);
+    if (it != name_index.end()) {
+        return it->second;
+    }
+    return "";
+}
+
 bool server_models::has_model(const std::string & name) {
     std::lock_guard<std::mutex> lk(mutex);
-    return mapping.find(name) != mapping.end();
+    return name_index.find(name) != name_index.end();
 }
 
 std::optional<server_model_meta> server_models::get_meta(const std::string & name) {
     std::lock_guard<std::mutex> lk(mutex);
-    auto it = mapping.find(name);
-    if (it != mapping.end()) {
-        return it->second.meta;
+    auto it = name_index.find(name);
+    if (it != name_index.end()) {
+        return mapping[it->second].meta;
     }
     return std::nullopt;
 }
@@ -821,6 +868,11 @@ void server_models_routes::init_routes() {
     this->proxy_get = [this](const server_http_req & req) {
         std::string method = "GET";
         std::string name = req.get_param("model");
+        // resolve alias to canonical model name
+        std::string resolved = models.resolve_name(name);
+        if (!resolved.empty()) {
+            name = resolved;
+        }
         bool autoload = is_autoload(params, req);
         auto error_res = std::make_unique<server_http_res>();
         if (!router_validate_model(name, models, autoload, error_res)) {
@@ -833,6 +885,11 @@ void server_models_routes::init_routes() {
         std::string method = "POST";
         json body = json::parse(req.body);
         std::string name = json_value(body, "model", std::string());
+        // resolve alias to canonical model name
+        std::string resolved = models.resolve_name(name);
+        if (!resolved.empty()) {
+            name = resolved;
+        }
         bool autoload = is_autoload(params, req);
         auto error_res = std::make_unique<server_http_res>();
         if (!router_validate_model(name, models, autoload, error_res)) {
@@ -845,6 +902,11 @@ void server_models_routes::init_routes() {
         auto res = std::make_unique<server_http_res>();
         json body = json::parse(req.body);
         std::string name = json_value(body, "model", std::string());
+        // resolve alias to canonical model name
+        std::string resolved = models.resolve_name(name);
+        if (!resolved.empty()) {
+            name = resolved;
+        }
         auto model = models.get_meta(name);
         if (!model.has_value()) {
             res_err(res, format_error_response("model is not found", ERROR_TYPE_NOT_FOUND));
@@ -883,6 +945,7 @@ void server_models_routes::init_routes() {
             }
             models_json.push_back(json {
                 {"id",       meta.name},
+                {"aliases",  meta.aliases},
                 {"object",   "model"},    // for OAI-compat
                 {"owned_by", "llamacpp"}, // for OAI-compat
                 {"created",  t},          // for OAI-compat
@@ -901,6 +964,11 @@ void server_models_routes::init_routes() {
         auto res = std::make_unique<server_http_res>();
         json body = json::parse(req.body);
         std::string name = json_value(body, "model", std::string());
+        // resolve alias to canonical model name
+        std::string resolved = models.resolve_name(name);
+        if (!resolved.empty()) {
+            name = resolved;
+        }
         auto model = models.get_meta(name);
         if (!model.has_value()) {
             res_err(res, format_error_response("model is not found", ERROR_TYPE_INVALID_REQUEST));
diff --git a/tools/server/server-models.h b/tools/server/server-models.h
index a397abda4a8..13cd3c7e41f 100644
--- a/tools/server/server-models.h
+++ b/tools/server/server-models.h
@@ -52,6 +52,7 @@ static std::string server_model_status_to_string(server_model_status status) {
 struct server_model_meta {
     common_preset preset;
     std::string name;
+    std::vector<std::string> aliases; // additional names that resolve to this model
     int port = 0;
     server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
     int64_t last_used = 0; // for LRU unloading
@@ -84,6 +85,7 @@ struct server_models {
     std::mutex mutex;
     std::condition_variable cv;
     std::map<std::string, instance_t> mapping;
+    std::map<std::string, std::string> name_index; // alias/name -> canonical name
 
     // for stopping models
     std::condition_variable cv_stop;
@@ -112,6 +114,9 @@ struct server_models {
     // check if a model instance exists (thread-safe)
     bool has_model(const std::string & name);
 
+    // resolve alias/name to canonical model name, returns empty string if not found (thread-safe)
+    std::string resolve_name(const std::string & name);
+
     // return a copy of model metadata (thread-safe)
     std::optional<server_model_meta> get_meta(const std::string & name);
 

From 66e8fb3b6df45a12c0434df2dcc92f5ac30ab837 Mon Sep 17 00:00:00 2001
From: Pascal <admin@serveurperso.com>
Date: Thu, 26 Feb 2026 11:27:20 +0100
Subject: [PATCH 2/7] server : update --alias description and regenerate docs

---
 common/arg.cpp             |  2 +-
 tools/cli/README.md        |  4 ++--
 tools/completion/README.md |  4 ++--
 tools/server/README.md     | 12 +++++++++---
 4 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 1e8885c9ca5..8efafe34a56 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2520,7 +2520,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ));
     add_opt(common_arg(
         {"-a", "--alias"}, "STRING",
-        "set alias for model name (to be used by REST API)",
+        "set model name alias, comma-separated for multiple aliases (to be used by API)",
         [](common_params & params, const std::string & value) {
             params.model_alias = value;
         }
diff --git a/tools/cli/README.md b/tools/cli/README.md
index 4a15cbad9d7..9974656df10 100644
--- a/tools/cli/README.md
+++ b/tools/cli/README.md
@@ -57,8 +57,8 @@
 | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
 | `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
 | `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
-| `--mmap, --no-mmap` | whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) |
-| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. Takes precedence over --mmap (default: enabled)<br/>(env: LLAMA_ARG_DIO) |
+| `--mmap, --no-mmap` | whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) |
+| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. (default: disabled)<br/>(env: LLAMA_ARG_DIO) |
 | `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggml-org/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) |
 | `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)<br/>use --list-devices to see a list of available devices<br/>(env: LLAMA_ARG_DEVICE) |
 | `--list-devices` | print list of available devices and exit |
diff --git a/tools/completion/README.md b/tools/completion/README.md
index 3ca3e684541..5d3f7c0ab9b 100644
--- a/tools/completion/README.md
+++ b/tools/completion/README.md
@@ -140,8 +140,8 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
 | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
 | `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
 | `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
-| `--mmap, --no-mmap` | whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) |
-| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. Takes precedence over --mmap (default: enabled)<br/>(env: LLAMA_ARG_DIO) |
+| `--mmap, --no-mmap` | whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) |
+| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. (default: disabled)<br/>(env: LLAMA_ARG_DIO) |
 | `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggml-org/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) |
 | `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)<br/>use --list-devices to see a list of available devices<br/>(env: LLAMA_ARG_DEVICE) |
 | `--list-devices` | print list of available devices and exit |
diff --git a/tools/server/README.md b/tools/server/README.md
index 0b56ca1e276..ac21fc23aa3 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -74,8 +74,8 @@ For the full list of features, please refer to [server's changelog](https://gith
 | `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
 | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
 | `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
-| `--mmap, --no-mmap` | whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) |
-| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. Takes precedence over --mmap (default: enabled)<br/>(env: LLAMA_ARG_DIO) |
+| `--mmap, --no-mmap` | whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) |
+| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. (default: disabled)<br/>(env: LLAMA_ARG_DIO) |
 | `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggml-org/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) |
 | `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)<br/>use --list-devices to see a list of available devices<br/>(env: LLAMA_ARG_DEVICE) |
 | `--list-devices` | print list of available devices and exit |
@@ -162,9 +162,11 @@ For the full list of features, please refer to [server's changelog](https://gith
 
 | Argument | Explanation |
 | -------- | ----------- |
+| `-lcs, --lookup-cache-static FNAME` | path to static lookup cache to use for lookup decoding (not updated by generation) |
+| `-lcd, --lookup-cache-dynamic FNAME` | path to dynamic lookup cache to use for lookup decoding (updated by generation) |
 | `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)<br/>(env: LLAMA_ARG_CTX_CHECKPOINTS) |
 | `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) |
-| `-kvu, --kv-unified` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)<br/>(env: LLAMA_ARG_KV_UNIFIED) |
+| `-kvu, --kv-unified, -no-kvu, --no-kv-unified` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)<br/>(env: LLAMA_ARG_KV_UNIFIED) |
 | `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
 | `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode |
 | `-sp, --special` | special tokens output enabled (default: false) |
@@ -229,6 +231,10 @@ For the full list of features, please refer to [server's changelog](https://gith
 | `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)<br/>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
 | `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused)<br/>(env: LLAMA_ARG_MODEL_DRAFT) |
 | `--spec-replace TARGET DRAFT` | translate the string in TARGET into DRAFT if the draft model and main model are not compatible |
+| `--spec-type [none\|ngram-cache\|ngram-simple\|ngram-map-k\|ngram-map-k4v\|ngram-mod]` | type of speculative decoding to use when no draft model is provided (default: none) |
+| `--spec-ngram-size-n N` | ngram size N for ngram-simple/ngram-map speculative decoding, length of lookup n-gram (default: 12) |
+| `--spec-ngram-size-m N` | ngram size M for ngram-simple/ngram-map speculative decoding, length of draft m-gram (default: 48) |
+| `--spec-ngram-min-hits N` | minimum hits for ngram-map speculative decoding (default: 1) |
 | `-mv, --model-vocoder FNAME` | vocoder model for audio generation (default: unused) |
 | `--tts-use-guide-tokens` | Use guide tokens to improve TTS word recall |
 | `--embd-gemma-default` | use default EmbeddingGemma model (note: can download weights from the internet) |

From fa3739ea3b6e769798d3940804d2c4b5a5af73ae Mon Sep 17 00:00:00 2001
From: Pascal <admin@serveurperso.com>
Date: Thu, 26 Feb 2026 14:41:22 +0100
Subject: [PATCH 3/7] server : multiple model aliases and tags

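- address review feedback from ngxson
- --alias accepts comma-separated values; they are stored in a std::set,
  so duplicates are dropped
- --tags adds comma-separated informational metadata (not used for routing)
- aliases resolve transparently in the router via get_meta/has_model,
  which now scan each model's alias set
- /v1/models exposes the aliases and tags fields
- --alias no longer replaces the model id: the id now comes from
  model.name or, as a fallback, the model file name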
---
 common/arg.cpp                  |  21 ++++-
 common/common.h                 |   3 +-
 tools/server/server-context.cpp |  21 ++---
 tools/server/server-context.h   |   4 +-
 tools/server/server-models.cpp  | 137 ++++++++++++++++++--------------
 tools/server/server-models.h    |   7 +-
 tools/server/server.cpp         |   2 +-
 7 files changed, 111 insertions(+), 84 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 8efafe34a56..05f4a5244e7 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2520,11 +2520,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ));
     add_opt(common_arg(
         {"-a", "--alias"}, "STRING",
-        "set model name alias, comma-separated for multiple aliases (to be used by API)",
+        "set model name aliases, comma-separated (to be used by API)",
         [](common_params & params, const std::string & value) {
-            params.model_alias = value;
+            for (auto & alias : string_split<std::string>(value, ',')) {
+                alias = string_strip(alias);
+                if (!alias.empty()) {
+                    params.model_alias.insert(alias);
+                }
+            }
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ALIAS"));
+    add_opt(common_arg(
+        {"--tags"}, "STRING",
+        "set model tags, comma-separated (informational, not used for routing)",
+        [](common_params & params, const std::string & value) {
+            for (auto & tag : string_split<std::string>(value, ',')) {
+                tag = string_strip(tag);
+                if (!tag.empty()) {
+                    params.model_tags.insert(tag);
+                }
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TAGS"));
     add_opt(common_arg(
         {"-m", "--model"}, "FNAME",
         ex == LLAMA_EXAMPLE_EXPORT_LORA
diff --git a/common/common.h b/common/common.h
index 1fa17286562..a945b3fec72 100644
--- a/common/common.h
+++ b/common/common.h
@@ -410,7 +410,8 @@ struct common_params {
 
     struct common_params_model model;
 
-    std::string model_alias          = ""; // model alias                                                   // NOLINT
+    std::set<std::string> model_alias;    // model aliases                                                  // NOLINT
+    std::set<std::string> model_tags;     // model tags (informational, not used for routing)               // NOLINT
     std::string hf_token             = ""; // HF token                                                      // NOLINT
     std::string prompt               = "";                                                                  // NOLINT
     std::string system_prompt        = "";                                                                  // NOLINT
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 402c349360d..65a3704103d 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -580,7 +580,8 @@ struct server_context_impl {
     float slot_prompt_similarity = 0.0f;
 
     std::string model_name; // name of the loaded model, to be used by API
-    std::vector<std::string> model_aliases; // additional names for the model
+    std::set<std::string> model_aliases; // additional names for the model
+    std::set<std::string> model_tags;    // informational tags
 
     bool sleeping = false;
 
@@ -813,18 +814,7 @@ struct server_context_impl {
         }
         SRV_WRN("%s", "for more info see https://github.com/ggml-org/llama.cpp/pull/16391\n");
 
-        if (!params_base.model_alias.empty()) {
-            // user explicitly specified model name (may include comma-separated aliases)
-            auto aliases = string_split<std::string>(params_base.model_alias, ',');
-            model_name = string_strip(aliases[0]);
-            for (size_t i = 1; i < aliases.size(); i++) {
-                auto alias = string_strip(aliases[i]);
-                if (!alias.empty()) {
-                    model_aliases.push_back(alias);
-                }
-            }
-        } else if (!params_base.model.name.empty()) {
-            // use model name in registry format (for models in cache)
+        if (!params_base.model.name.empty()) {
             model_name = params_base.model.name;
         } else {
             // fallback: derive model name from file name
@@ -832,6 +822,9 @@ struct server_context_impl {
             model_name = model_path.filename().string();
         }
 
+        model_aliases = params_base.model_alias;
+        model_tags    = params_base.model_tags;
+
         if (!is_resume) {
             return init();
         }
@@ -2901,6 +2894,7 @@ server_context_meta server_context::get_meta() const {
         /* build_info             */ build_info,
         /* model_name             */ impl->model_name,
         /* model_aliases          */ impl->model_aliases,
+        /* model_tags             */ impl->model_tags,
         /* model_path             */ impl->params_base.model.path,
         /* has_mtmd               */ impl->mctx != nullptr,
         /* has_inp_image          */ impl->chat_params.allow_image,
@@ -3698,6 +3692,7 @@ void server_routes::init_routes() {
                 {
                     {"id",       meta->model_name},
                     {"aliases",  meta->model_aliases},
+                    {"tags",     meta->model_tags},
                     {"object",   "model"},
                     {"created",  std::time(0)},
                     {"owned_by", "llamacpp"},
diff --git a/tools/server/server-context.h b/tools/server/server-context.h
index cc0d3566281..631d573fcbd 100644
--- a/tools/server/server-context.h
+++ b/tools/server/server-context.h
@@ -6,13 +6,15 @@
 
 #include <cstddef>
 #include <memory>
+#include <set>
 
 struct server_context_impl; // private implementation
 
 struct server_context_meta {
     std::string build_info;
     std::string model_name;
-    std::vector<std::string> model_aliases;
+    std::set<std::string> model_aliases;
+    std::set<std::string> model_tags;
     std::string model_path;
     bool has_mtmd;
     bool has_inp_image;
diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index f3fd71e6944..ad7875c9216 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -184,8 +184,13 @@ void server_models::add_model(server_model_meta && meta) {
     if (mapping.find(meta.name) != mapping.end()) {
         throw std::runtime_error(string_format("model '%s' appears multiple times", meta.name.c_str()));
     }
-    if (name_index.find(meta.name) != name_index.end()) {
-        throw std::runtime_error(string_format("model name '%s' conflicts with an existing alias", meta.name.c_str()));
+
+    // check model name does not conflict with existing aliases
+    for (const auto & [key, inst] : mapping) {
+        if (inst.meta.aliases.count(meta.name)) {
+            throw std::runtime_error(string_format("model name '%s' conflicts with alias of model '%s'",
+                meta.name.c_str(), key.c_str()));
+        }
     }
 
     // parse aliases from preset's --alias option (comma-separated)
@@ -193,21 +198,35 @@ void server_models::add_model(server_model_meta && meta) {
     if (meta.preset.get_option("LLAMA_ARG_ALIAS", alias_str) && !alias_str.empty()) {
         for (auto & alias : string_split<std::string>(alias_str, ',')) {
             alias = string_strip(alias);
-            if (alias.empty()) {
-                continue;
+            if (!alias.empty()) {
+                meta.aliases.insert(alias);
             }
-            if (name_index.find(alias) != name_index.end()) {
-                throw std::runtime_error(string_format("alias '%s' for model '%s' conflicts with an existing name or alias",
-                    alias.c_str(), meta.name.c_str()));
+        }
+    }
+
+    // parse tags from preset's --tags option (comma-separated)
+    std::string tags_str;
+    if (meta.preset.get_option("LLAMA_ARG_TAGS", tags_str) && !tags_str.empty()) {
+        for (auto & tag : string_split<std::string>(tags_str, ',')) {
+            tag = string_strip(tag);
+            if (!tag.empty()) {
+                meta.tags.insert(tag);
             }
-            meta.aliases.push_back(alias);
         }
     }
 
-    // index canonical name + all aliases
-    name_index[meta.name] = meta.name;
+    // validate aliases do not conflict with existing names or aliases
     for (const auto & alias : meta.aliases) {
-        name_index[alias] = meta.name;
+        if (mapping.find(alias) != mapping.end()) {
+            throw std::runtime_error(string_format("alias '%s' for model '%s' conflicts with existing model name",
+                alias.c_str(), meta.name.c_str()));
+        }
+        for (const auto & [key, inst] : mapping) {
+            if (inst.meta.aliases.count(alias)) {
+                throw std::runtime_error(string_format("alias '%s' for model '%s' conflicts with alias of model '%s'",
+                    alias.c_str(), meta.name.c_str(), key.c_str()));
+            }
+        }
     }
 
     meta.update_args(ctx_preset, bin_path); // render args
@@ -276,6 +295,7 @@ void server_models::load_models() {
             /* preset       */ preset.second,
             /* name         */ preset.first,
             /* aliases      */ {},
+            /* tags         */ {},
             /* port         */ 0,
             /* status       */ SERVER_MODEL_STATUS_UNLOADED,
             /* last_used    */ 0,
@@ -292,21 +312,28 @@ void server_models::load_models() {
         for (const auto & [name, preset] : custom_presets) {
             custom_names.insert(name);
         }
+        auto join_set = [](const std::set<std::string> & s) {
+            std::string result;
+            for (const auto & v : s) {
+                if (!result.empty()) {
+                    result += ", ";
+                }
+                result += v;
+            }
+            return result;
+        };
+
         SRV_INF("Available models (%zu) (*: custom preset)\n", mapping.size());
         for (const auto & [name, inst] : mapping) {
             bool has_custom = custom_names.find(name) != custom_names.end();
-            if (inst.meta.aliases.empty()) {
-                SRV_INF("  %c %s\n", has_custom ? '*' : ' ', name.c_str());
-            } else {
-                std::string alias_list;
-                for (const auto & a : inst.meta.aliases) {
-                    if (!alias_list.empty()) {
-                        alias_list += ", ";
-                    }
-                    alias_list += a;
-                }
-                SRV_INF("  %c %s (aliases: %s)\n", has_custom ? '*' : ' ', name.c_str(), alias_list.c_str());
+            std::string info;
+            if (!inst.meta.aliases.empty()) {
+                info += " (aliases: " + join_set(inst.meta.aliases) + ")";
             }
+            if (!inst.meta.tags.empty()) {
+                info += " [tags: " + join_set(inst.meta.tags) + "]";
+            }
+            SRV_INF("  %c %s%s\n", has_custom ? '*' : ' ', name.c_str(), info.c_str());
         }
     }
 
@@ -354,25 +381,29 @@ void server_models::update_meta(const std::string & name, const server_model_met
     cv.notify_all(); // notify wait_until_loaded
 }
 
-std::string server_models::resolve_name(const std::string & name) {
-    std::lock_guard<std::mutex> lk(mutex);
-    auto it = name_index.find(name);
-    if (it != name_index.end()) {
-        return it->second;
-    }
-    return "";
-}
-
 bool server_models::has_model(const std::string & name) {
     std::lock_guard<std::mutex> lk(mutex);
-    return name_index.find(name) != name_index.end();
+    if (mapping.find(name) != mapping.end()) {
+        return true;
+    }
+    for (const auto & [key, inst] : mapping) {
+        if (inst.meta.aliases.count(name)) {
+            return true;
+        }
+    }
+    return false;
 }
 
 std::optional<server_model_meta> server_models::get_meta(const std::string & name) {
     std::lock_guard<std::mutex> lk(mutex);
-    auto it = name_index.find(name);
-    if (it != name_index.end()) {
-        return mapping[it->second].meta;
+    auto it = mapping.find(name);
+    if (it != mapping.end()) {
+        return it->second.meta;
+    }
+    for (const auto & [key, inst] : mapping) {
+        if (inst.meta.aliases.count(name)) {
+            return inst.meta;
+        }
     }
     return std::nullopt;
 }
@@ -811,7 +842,7 @@ static void res_err(std::unique_ptr<server_http_res> & res, const json & error_d
     res->data = safe_json_to_str({{ "error", error_data }});
 }
 
-static bool router_validate_model(const std::string & name, server_models & models, bool models_autoload, std::unique_ptr<server_http_res> & res) {
+static bool router_validate_model(std::string & name, server_models & models, bool models_autoload, std::unique_ptr<server_http_res> & res) {
     if (name.empty()) {
         res_err(res, format_error_response("model name is missing from the request", ERROR_TYPE_INVALID_REQUEST));
         return false;
@@ -821,6 +852,8 @@ static bool router_validate_model(const std::string & name, server_models & mode
         res_err(res, format_error_response(string_format("model '%s' not found", name.c_str()), ERROR_TYPE_INVALID_REQUEST));
         return false;
     }
+    // resolve alias to canonical model name
+    name = meta->name;
     if (models_autoload) {
         models.ensure_model_loaded(name);
     } else {
@@ -868,11 +901,6 @@ void server_models_routes::init_routes() {
     this->proxy_get = [this](const server_http_req & req) {
         std::string method = "GET";
         std::string name = req.get_param("model");
-        // resolve alias to canonical model name
-        std::string resolved = models.resolve_name(name);
-        if (!resolved.empty()) {
-            name = resolved;
-        }
         bool autoload = is_autoload(params, req);
         auto error_res = std::make_unique<server_http_res>();
         if (!router_validate_model(name, models, autoload, error_res)) {
@@ -885,11 +913,6 @@ void server_models_routes::init_routes() {
         std::string method = "POST";
         json body = json::parse(req.body);
         std::string name = json_value(body, "model", std::string());
-        // resolve alias to canonical model name
-        std::string resolved = models.resolve_name(name);
-        if (!resolved.empty()) {
-            name = resolved;
-        }
         bool autoload = is_autoload(params, req);
         auto error_res = std::make_unique<server_http_res>();
         if (!router_validate_model(name, models, autoload, error_res)) {
@@ -902,21 +925,16 @@ void server_models_routes::init_routes() {
         auto res = std::make_unique<server_http_res>();
         json body = json::parse(req.body);
         std::string name = json_value(body, "model", std::string());
-        // resolve alias to canonical model name
-        std::string resolved = models.resolve_name(name);
-        if (!resolved.empty()) {
-            name = resolved;
-        }
-        auto model = models.get_meta(name);
-        if (!model.has_value()) {
+        auto meta = models.get_meta(name);
+        if (!meta.has_value()) {
             res_err(res, format_error_response("model is not found", ERROR_TYPE_NOT_FOUND));
             return res;
         }
-        if (model->status == SERVER_MODEL_STATUS_LOADED) {
+        if (meta->status == SERVER_MODEL_STATUS_LOADED) {
             res_err(res, format_error_response("model is already loaded", ERROR_TYPE_INVALID_REQUEST));
             return res;
         }
-        models.load(name);
+        models.load(meta->name);
         res_ok(res, {{"success", true}});
         return res;
     };
@@ -937,6 +955,7 @@ void server_models_routes::init_routes() {
                 preset_copy.unset_option("LLAMA_ARG_HOST");
                 preset_copy.unset_option("LLAMA_ARG_PORT");
                 preset_copy.unset_option("LLAMA_ARG_ALIAS");
+                preset_copy.unset_option("LLAMA_ARG_TAGS");
                 status["preset"] = preset_copy.to_ini();
             }
             if (meta.is_failed()) {
@@ -946,6 +965,7 @@ void server_models_routes::init_routes() {
             models_json.push_back(json {
                 {"id",       meta.name},
                 {"aliases",  meta.aliases},
+                {"tags",     meta.tags},
                 {"object",   "model"},    // for OAI-compat
                 {"owned_by", "llamacpp"}, // for OAI-compat
                 {"created",  t},          // for OAI-compat
@@ -964,11 +984,6 @@ void server_models_routes::init_routes() {
         auto res = std::make_unique<server_http_res>();
         json body = json::parse(req.body);
         std::string name = json_value(body, "model", std::string());
-        // resolve alias to canonical model name
-        std::string resolved = models.resolve_name(name);
-        if (!resolved.empty()) {
-            name = resolved;
-        }
         auto model = models.get_meta(name);
         if (!model.has_value()) {
             res_err(res, format_error_response("model is not found", ERROR_TYPE_INVALID_REQUEST));
@@ -978,7 +993,7 @@ void server_models_routes::init_routes() {
             res_err(res, format_error_response("model is not loaded", ERROR_TYPE_INVALID_REQUEST));
             return res;
         }
-        models.unload(name);
+        models.unload(model->name);
         res_ok(res, {{"success", true}});
         return res;
     };
diff --git a/tools/server/server-models.h b/tools/server/server-models.h
index 13cd3c7e41f..78abc8d72a7 100644
--- a/tools/server/server-models.h
+++ b/tools/server/server-models.h
@@ -52,7 +52,8 @@ static std::string server_model_status_to_string(server_model_status status) {
 struct server_model_meta {
     common_preset preset;
     std::string name;
-    std::vector<std::string> aliases; // additional names that resolve to this model
+    std::set<std::string> aliases; // additional names that resolve to this model
+    std::set<std::string> tags;    // informational tags, not used for routing
     int port = 0;
     server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
     int64_t last_used = 0; // for LRU unloading
@@ -85,7 +86,6 @@ struct server_models {
     std::mutex mutex;
     std::condition_variable cv;
     std::map<std::string, instance_t> mapping;
-    std::map<std::string, std::string> name_index; // alias/name -> canonical name
 
     // for stopping models
     std::condition_variable cv_stop;
@@ -114,9 +114,6 @@ struct server_models {
     // check if a model instance exists (thread-safe)
     bool has_model(const std::string & name);
 
-    // resolve alias/name to canonical model name, returns empty string if not found (thread-safe)
-    std::string resolve_name(const std::string & name);
-
     // return a copy of model metadata (thread-safe)
     std::optional<server_model_meta> get_meta(const std::string & name);
 
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index d3d4316026a..542b984534c 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -92,7 +92,7 @@ int main(int argc, char ** argv) {
 
     // for consistency between server router mode and single-model mode, we set the same model name as alias
     if (params.model_alias.empty() && !params.model.name.empty()) {
-        params.model_alias = params.model.name;
+        params.model_alias.insert(params.model.name);
     }
 
     common_init();

From ddb8c375e3e1089720c9dac8644d00edbb971281 Mon Sep 17 00:00:00 2001
From: Pascal <admin@serveurperso.com>
Date: Thu, 26 Feb 2026 14:42:14 +0100
Subject: [PATCH 4/7] regenerate docs

---
 tools/cli/README.md        | 6 +++---
 tools/completion/README.md | 6 +++---
 tools/server/README.md     | 9 +++++----
 3 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/tools/cli/README.md b/tools/cli/README.md
index 9974656df10..22d3fc87e96 100644
--- a/tools/cli/README.md
+++ b/tools/cli/README.md
@@ -109,14 +109,14 @@
 | `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) |
 | `--sampler-seq, --sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) |
 | `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
-| `--temp N` | temperature (default: 0.80) |
+| `--temp, --temperature N` | temperature (default: 0.80) |
 | `--top-k N` | top-k sampling (default: 40, 0 = disabled)<br/>(env: LLAMA_ARG_TOP_K) |
 | `--top-p N` | top-p sampling (default: 0.95, 1.0 = disabled) |
 | `--min-p N` | min-p sampling (default: 0.05, 0.0 = disabled) |
-| `--top-nsigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) |
+| `--top-nsigma, --top-n-sigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) |
 | `--xtc-probability N` | xtc probability (default: 0.00, 0.0 = disabled) |
 | `--xtc-threshold N` | xtc threshold (default: 0.10, 1.0 = disabled) |
-| `--typical N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) |
+| `--typical, --typical-p N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) |
 | `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) |
 | `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.00, 1.0 = disabled) |
 | `--presence-penalty N` | repeat alpha presence penalty (default: 0.00, 0.0 = disabled) |
diff --git a/tools/completion/README.md b/tools/completion/README.md
index 5d3f7c0ab9b..bcc08876592 100644
--- a/tools/completion/README.md
+++ b/tools/completion/README.md
@@ -192,14 +192,14 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
 | `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) |
 | `--sampler-seq, --sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) |
 | `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
-| `--temp N` | temperature (default: 0.80) |
+| `--temp, --temperature N` | temperature (default: 0.80) |
 | `--top-k N` | top-k sampling (default: 40, 0 = disabled)<br/>(env: LLAMA_ARG_TOP_K) |
 | `--top-p N` | top-p sampling (default: 0.95, 1.0 = disabled) |
 | `--min-p N` | min-p sampling (default: 0.05, 0.0 = disabled) |
-| `--top-nsigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) |
+| `--top-nsigma, --top-n-sigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) |
 | `--xtc-probability N` | xtc probability (default: 0.00, 0.0 = disabled) |
 | `--xtc-threshold N` | xtc threshold (default: 0.10, 1.0 = disabled) |
-| `--typical N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) |
+| `--typical, --typical-p N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) |
 | `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) |
 | `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.00, 1.0 = disabled) |
 | `--presence-penalty N` | repeat alpha presence penalty (default: 0.00, 0.0 = disabled) |
diff --git a/tools/server/README.md b/tools/server/README.md
index ac21fc23aa3..a0c69e8a1d6 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -126,14 +126,14 @@ For the full list of features, please refer to [server's changelog](https://gith
 | `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) |
 | `--sampler-seq, --sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) |
 | `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
-| `--temp N` | temperature (default: 0.80) |
+| `--temp, --temperature N` | temperature (default: 0.80) |
 | `--top-k N` | top-k sampling (default: 40, 0 = disabled)<br/>(env: LLAMA_ARG_TOP_K) |
 | `--top-p N` | top-p sampling (default: 0.95, 1.0 = disabled) |
 | `--min-p N` | min-p sampling (default: 0.05, 0.0 = disabled) |
-| `--top-nsigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) |
+| `--top-nsigma, --top-n-sigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) |
 | `--xtc-probability N` | xtc probability (default: 0.00, 0.0 = disabled) |
 | `--xtc-threshold N` | xtc threshold (default: 0.10, 1.0 = disabled) |
-| `--typical N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) |
+| `--typical, --typical-p N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) |
 | `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) |
 | `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.00, 1.0 = disabled) |
 | `--presence-penalty N` | repeat alpha presence penalty (default: 0.00, 0.0 = disabled) |
@@ -184,7 +184,8 @@ For the full list of features, please refer to [server's changelog](https://gith
 | `-otd, --override-tensor-draft <tensor name pattern>=<buffer type>,...` | override tensor buffer type for draft model |
 | `-cmoed, --cpu-moe-draft` | keep all Mixture of Experts (MoE) weights in the CPU for the draft model<br/>(env: LLAMA_ARG_CPU_MOE_DRAFT) |
 | `-ncmoed, --n-cpu-moe-draft N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model<br/>(env: LLAMA_ARG_N_CPU_MOE_DRAFT) |
-| `-a, --alias STRING` | set alias for model name (to be used by REST API)<br/>(env: LLAMA_ARG_ALIAS) |
+| `-a, --alias STRING` | set model name aliases, comma-separated (to be used by API)<br/>(env: LLAMA_ARG_ALIAS) |
+| `--tags STRING` | set model tags, comma-separated (informational, not used for routing)<br/>(env: LLAMA_ARG_TAGS) |
 | `--host HOST` | ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: 127.0.0.1)<br/>(env: LLAMA_ARG_HOST) |
 | `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) |
 | `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) |

From 27762e1de962396e491c06d5f38b61ee863ee0dd Mon Sep 17 00:00:00 2001
From: Pascal <admin@serveurperso.com>
Date: Thu, 26 Feb 2026 14:44:51 +0100
Subject: [PATCH 5/7] nits

---
 common/common.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/common/common.h b/common/common.h
index a945b3fec72..c5a80375713 100644
--- a/common/common.h
+++ b/common/common.h
@@ -410,8 +410,8 @@ struct common_params {
 
     struct common_params_model model;
 
-    std::set<std::string> model_alias;    // model aliases                                                  // NOLINT
-    std::set<std::string> model_tags;     // model tags (informational, not used for routing)               // NOLINT
+    std::set<std::string> model_alias;     // model aliases                                                 // NOLINT
+    std::set<std::string> model_tags;      // model tags (informational, not used for routing)              // NOLINT
     std::string hf_token             = ""; // HF token                                                      // NOLINT
     std::string prompt               = "";                                                                  // NOLINT
     std::string system_prompt        = "";                                                                  // NOLINT

From 3bcafe6351d8490d86036074ee7fff4cecc38db4 Mon Sep 17 00:00:00 2001
From: Pascal <admin@serveurperso.com>
Date: Thu, 26 Feb 2026 15:12:18 +0100
Subject: [PATCH 6/7] server : use first alias as model_name for backward
 compat

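Address review feedback from ngxson: when --alias is given, use the
first alias as model_name instead of falling back to the registry name.

Note: aliases are stored in a std::set, so "first" means the
lexicographically smallest alias, not the first one passed on the
command line (e.g. --alias "tinyllama-2,fim,code" yields "code").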
---
 tools/server/server-context.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 65a3704103d..dfc399b26f5 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -814,7 +814,10 @@ struct server_context_impl {
         }
         SRV_WRN("%s", "for more info see https://github.com/ggml-org/llama.cpp/pull/16391\n");
 
-        if (!params_base.model.name.empty()) {
+        if (!params_base.model_alias.empty()) {
+            // backward compat: use first alias as model name
+            model_name = *params_base.model_alias.begin();
+        } else if (!params_base.model.name.empty()) {
             model_name = params_base.model.name;
         } else {
             // fallback: derive model name from file name

From 14308bfa5e813ae26be0cc76f9ef961f4c5382c5 Mon Sep 17 00:00:00 2001
From: Pascal <admin@serveurperso.com>
Date: Thu, 26 Feb 2026 18:19:09 +0100
Subject: [PATCH 7/7] server : add single-model test for aliases and tags

---
 tools/server/tests/unit/test_basic.py | 17 +++++++++++++++++
 tools/server/tests/utils.py           |  3 +++
 2 files changed, 20 insertions(+)

diff --git a/tools/server/tests/unit/test_basic.py b/tools/server/tests/unit/test_basic.py
index 3405be3e25d..d1b89cf1a91 100644
--- a/tools/server/tests/unit/test_basic.py
+++ b/tools/server/tests/unit/test_basic.py
@@ -94,3 +94,20 @@ def test_no_webui():
     server.start()
     res = requests.get(url)
     assert res.status_code == 404
+
+
+def test_server_model_aliases_and_tags():
+    global server
+    server.model_alias = "tinyllama-2,fim,code"
+    server.model_tags = "chat,fim,small"
+    server.start()
+    res = server.make_request("GET", "/models")
+    assert res.status_code == 200
+    assert len(res.body["data"]) == 1
+    model = res.body["data"][0]
+    # aliases field must contain all aliases
+    assert set(model["aliases"]) == {"tinyllama-2", "fim", "code"}
+    # tags field must contain all tags
+    assert set(model["tags"]) == {"chat", "fim", "small"}
+    # id is derived from first alias (alphabetical order from std::set)
+    assert model["id"] == "code"
diff --git a/tools/server/tests/utils.py b/tools/server/tests/utils.py
index f76bb1a9115..5002999d9b3 100644
--- a/tools/server/tests/utils.py
+++ b/tools/server/tests/utils.py
@@ -56,6 +56,7 @@ class ServerProcess:
 
     # custom options
     model_alias: str | None = None
+    model_tags: str | None = None
     model_url: str | None = None
     model_file: str | None = None
     model_draft: str | None = None
@@ -180,6 +181,8 @@ def start(self, timeout_seconds: int | None = DEFAULT_HTTP_TIMEOUT) -> None:
             server_args.extend(["--pooling", self.pooling])
         if self.model_alias:
             server_args.extend(["--alias", self.model_alias])
+        if self.model_tags:
+            server_args.extend(["--tags", self.model_tags])
         if self.n_ctx:
             server_args.extend(["--ctx-size", self.n_ctx])
         if self.n_slots: