From ea653e82a36ae8475ed4a2ed3a48d31712893825 Mon Sep 17 00:00:00 2001 From: dillon-blake Date: Fri, 20 Feb 2026 16:02:24 +1100 Subject: [PATCH 1/5] fix mamba gpu offload --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index cb7fa2c9cbb..78dc5aa7178 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -15017,11 +15017,27 @@ static bool ggml_backend_vk_device_supports_buft(ggml_backend_dev_t dev, ggml_ba return buft_ctx->device->idx == ctx->device; } +static int64_t get_op_batch_size(const ggml_tensor * op) { + switch (op->op) { + case GGML_OP_GET_ROWS: + return 0; + case GGML_OP_MUL_MAT: + return op->ne[1]; + case GGML_OP_MUL_MAT_ID: + case GGML_OP_ROPE: + case GGML_OP_ROPE_BACK: + return op->ne[2]; + default: + return ggml_nrows(op); + } +} + static bool ggml_backend_vk_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) { ggml_backend_vk_device_context * dev_ctx = (ggml_backend_vk_device_context *)dev->context; - return (op->ne[1] >= dev_ctx->op_offload_min_batch_size && op->op != GGML_OP_GET_ROWS) || - (op->ne[2] >= dev_ctx->op_offload_min_batch_size && op->op == GGML_OP_MUL_MAT_ID); + return (op->op == GGML_OP_MUL_MAT || + op->op == GGML_OP_MUL_MAT_ID) && + get_op_batch_size(op) >= dev_ctx->op_offload_min_batch_size; } static ggml_backend_event_t ggml_backend_vk_device_event_new(ggml_backend_dev_t dev) { From bb6ded5cb0433ff967ced8a71a4b29d68f72368d Mon Sep 17 00:00:00 2001 From: dillon-blake Date: Fri, 20 Feb 2026 16:53:50 +1100 Subject: [PATCH 2/5] Fix mamba context issue --- src/llama-model.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 72490a89b56..f89f611ab26 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -7536,7 +7536,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, /* attn_type_v */ params.type_v, /* attn_v_trans */ !cparams.flash_attn, /* attn_swa_full */ params.swa_full, - /* attn_kv_size */ cparams.n_ctx, + /* attn_kv_size */ cparams.n_ctx_seq, /* attn_n_ubatch */ cparams.n_ubatch, /* attn_n_pad */ 1, /* recurrent_type_r */ GGML_TYPE_F32, @@ -7553,7 +7553,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, /* attn_type_k */ params.type_k, /* attn_type_v */ params.type_v, /* attn_v_trans */ !cparams.flash_attn, - /* attn_kv_size */ cparams.n_ctx, + /* attn_kv_size */ cparams.n_ctx_seq, /* attn_n_pad */ 1, /* attn_n_swa */ hparams.n_swa, /* attn_swa_type */ hparams.swa_type, From 7a8d83728e2dffb4e9d592692f17368c2c07fe8c Mon Sep 17 00:00:00 2001 From: dillon-blake Date: Fri, 20 Feb 2026 16:54:25 +1100 Subject: [PATCH 3/5] igpu no offload weights --- src/llama.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index 6da90d6f1f8..91af6e1f758 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -974,7 +974,14 @@ static struct llama_model * llama_model_load_from_file_impl( // add integrated GPUs only if no other devices were found if (model->devices.empty()) { - model->devices.insert(model->devices.end(), igpus.begin(), igpus.end()); + // if only iGPUs are available and user didn't explicitly set -ngl, + // default to 0 layers (CPU-only) since iGPU offloading shares system + // RAM and adds overhead without benefit + if (!igpus.empty() && params.n_gpu_layers == -1) { + LLAMA_LOG_INFO("%s: only integrated GPU(s) detected, defaulting to -ngl 0 (CPU-only)\n", __func__); + } else { + model->devices.insert(model->devices.end(), igpus.begin(), igpus.end()); + } } } From 69cdd53b2add4e895c6a5d41247835ca095ff003 Mon Sep 17 00:00:00 2001 From: dillon-blake Date: Fri, 20 Feb 2026 16:54:43 +1100 Subject: [PATCH 4/5] No ssl in build --- .github/workflows/manual-release.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/manual-release.yml b/.github/workflows/manual-release.yml index 11fd7554ad8..7ba3b46c1ec 100644 --- a/.github/workflows/manual-release.yml +++ b/.github/workflows/manual-release.yml @@ -87,6 +87,7 @@ jobs: -DGGML_NATIVE=OFF \ -DGGML_CPU_ALL_VARIANTS=ON \ -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=OFF \ ${{ env.CMAKE_ARGS }} cmake --build build --config Release -j $(nproc) @@ -141,6 +142,7 @@ jobs: -DGGML_CPU_ALL_VARIANTS=ON ^ -DGGML_OPENMP=ON ^ -DLLAMA_CURL=OFF ^ + -DLLAMA_OPENSSL=OFF ^ ${{ env.CMAKE_ARGS }} cmake --build build --config Release @@ -356,6 +358,7 @@ jobs: -DGGML_METAL_EMBED_LIBRARY=ON \ -DGGML_RPC=ON \ -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=OFF \ ${{ env.CMAKE_ARGS }} cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) From ebca24d9f26b2476fe9659e437f9a545456293a7 Mon Sep 17 00:00:00 2001 From: dillon-blake Date: Fri, 20 Feb 2026 19:51:53 +1100 Subject: [PATCH 5/5] Fix igpu offload --- common/common.cpp | 16 ++++++++++++++++ src/llama.cpp | 9 +-------- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 3aa396127ce..ef3751913c6 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -3,6 +3,7 @@ #endif #include "ggml.h" +#include "ggml-backend.h" #include "gguf.h" #include "common.h" @@ -1094,6 +1095,21 @@ common_init_result::common_init_result(common_params & params) : auto mparams = common_model_params_to_llama(params); auto cparams = common_context_params_to_llama(params); + // If only iGPUs are available and user didn't explicitly set -ngl, default to 0 (CPU-only) + if (mparams.n_gpu_layers == -1) { + bool has_dgpu = false; + bool has_igpu = false; + for (size_t i = 0; i < ggml_backend_dev_count(); i++) { + auto type = ggml_backend_dev_type(ggml_backend_dev_get(i)); + if (type == GGML_BACKEND_DEVICE_TYPE_GPU) has_dgpu = true; + if (type == GGML_BACKEND_DEVICE_TYPE_IGPU) has_igpu = true; + } + if (has_igpu && !has_dgpu) { + LOG_INF("%s: only integrated GPU(s) detected, defaulting to -ngl 0 (CPU-only)\n", __func__); + mparams.n_gpu_layers = 0; + } + } + if (params.fit_params) { LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__); llama_params_fit(params.model.path.c_str(), &mparams, &cparams, diff --git a/src/llama.cpp b/src/llama.cpp index 91af6e1f758..6da90d6f1f8 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -974,14 +974,7 @@ static struct llama_model * llama_model_load_from_file_impl( // add integrated GPUs only if no other devices were found if (model->devices.empty()) { - // if only iGPUs are available and user didn't explicitly set -ngl, - // default to 0 layers (CPU-only) since iGPU offloading shares system - // RAM and adds overhead without benefit - if (!igpus.empty() && params.n_gpu_layers == -1) { - LLAMA_LOG_INFO("%s: only integrated GPU(s) detected, defaulting to -ngl 0 (CPU-only)\n", __func__); - } else { - model->devices.insert(model->devices.end(), igpus.begin(), igpus.end()); - } + model->devices.insert(model->devices.end(), igpus.begin(), igpus.end()); } }