From ea653e82a36ae8475ed4a2ed3a48d31712893825 Mon Sep 17 00:00:00 2001
From: dillon-blake <dillon@boxedlogic.com>
Date: Fri, 20 Feb 2026 16:02:24 +1100
Subject: [PATCH 1/5] vulkan: fix mamba GPU offload (offload only MUL_MAT/MUL_MAT_ID)

---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index cb7fa2c9cbb..78dc5aa7178 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -15017,11 +15017,27 @@ static bool ggml_backend_vk_device_supports_buft(ggml_backend_dev_t dev, ggml_ba
     return buft_ctx->device->idx == ctx->device;
 }
 
+static int64_t get_op_batch_size(const ggml_tensor * op) { // returns the value compared against the offload minimum batch size for this op
+    switch (op->op) {
+        case GGML_OP_GET_ROWS:
+            return 0; // zero batch size, so any positive minimum rejects GET_ROWS
+        case GGML_OP_MUL_MAT:
+            return op->ne[1]; // dimension 1 of op's own shape (op->ne)
+        case GGML_OP_MUL_MAT_ID:
+        case GGML_OP_ROPE:
+        case GGML_OP_ROPE_BACK:
+            return op->ne[2]; // these ops use dimension 2 of op->ne instead
+        default:
+            return ggml_nrows(op); // every other op: whatever ggml_nrows reports for op
+    }
+}
+
 static bool ggml_backend_vk_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
     ggml_backend_vk_device_context * dev_ctx = (ggml_backend_vk_device_context *)dev->context;
 
-    return (op->ne[1] >= dev_ctx->op_offload_min_batch_size && op->op != GGML_OP_GET_ROWS) ||
-           (op->ne[2] >= dev_ctx->op_offload_min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
+    return (op->op == GGML_OP_MUL_MAT ||     // only MUL_MAT and MUL_MAT_ID are candidates for offload
+            op->op == GGML_OP_MUL_MAT_ID) && // (short-circuit: other ops never reach get_op_batch_size here)
+            get_op_batch_size(op) >= dev_ctx->op_offload_min_batch_size; // and only at or above the device's configured minimum
 }
 
 static ggml_backend_event_t ggml_backend_vk_device_event_new(ggml_backend_dev_t dev) {

From bb6ded5cb0433ff967ced8a71a4b29d68f72368d Mon Sep 17 00:00:00 2001
From: dillon-blake <dillon@boxedlogic.com>
Date: Fri, 20 Feb 2026 16:53:50 +1100
Subject: [PATCH 2/5] Fix mamba context issue: size attention KV cache by n_ctx_seq

---
 src/llama-model.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 72490a89b56..f89f611ab26 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -7536,7 +7536,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                             /* attn_type_v       */ params.type_v,
                             /* attn_v_trans      */ !cparams.flash_attn,
                             /* attn_swa_full     */ params.swa_full,
-                            /* attn_kv_size      */ cparams.n_ctx,
+                            /* attn_kv_size      */ cparams.n_ctx_seq,
                             /* attn_n_ubatch     */ cparams.n_ubatch,
                             /* attn_n_pad        */ 1,
                             /* recurrent_type_r  */ GGML_TYPE_F32,
@@ -7553,7 +7553,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                             /* attn_type_k       */ params.type_k,
                             /* attn_type_v       */ params.type_v,
                             /* attn_v_trans      */ !cparams.flash_attn,
-                            /* attn_kv_size      */ cparams.n_ctx,
+                            /* attn_kv_size      */ cparams.n_ctx_seq,
                             /* attn_n_pad        */ 1,
                             /* attn_n_swa        */ hparams.n_swa,
                             /* attn_swa_type     */ hparams.swa_type,

From 7a8d83728e2dffb4e9d592692f17368c2c07fe8c Mon Sep 17 00:00:00 2001
From: dillon-blake <dillon@boxedlogic.com>
Date: Fri, 20 Feb 2026 16:54:25 +1100
Subject: [PATCH 3/5] Do not use iGPUs by default when -ngl is unset

---
 src/llama.cpp | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 6da90d6f1f8..91af6e1f758 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -974,7 +974,14 @@ static struct llama_model * llama_model_load_from_file_impl(
 
         // add integrated GPUs only if no other devices were found
         if (model->devices.empty()) {
-            model->devices.insert(model->devices.end(), igpus.begin(), igpus.end());
+            // if only iGPUs are available and user didn't explicitly set -ngl,
+            // default to 0 layers (CPU-only) since iGPU offloading shares system
+            // RAM and adds overhead without benefit
+            if (!igpus.empty() && params.n_gpu_layers == -1) {
+                LLAMA_LOG_INFO("%s: only integrated GPU(s) detected, defaulting to -ngl 0 (CPU-only)\n", __func__);
+            } else {
+                model->devices.insert(model->devices.end(), igpus.begin(), igpus.end());
+            }
         }
     }
 

From 69cdd53b2add4e895c6a5d41247835ca095ff003 Mon Sep 17 00:00:00 2001
From: dillon-blake <dillon@boxedlogic.com>
Date: Fri, 20 Feb 2026 16:54:43 +1100
Subject: [PATCH 4/5] Disable OpenSSL in release builds

---
 .github/workflows/manual-release.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/manual-release.yml b/.github/workflows/manual-release.yml
index 11fd7554ad8..7ba3b46c1ec 100644
--- a/.github/workflows/manual-release.yml
+++ b/.github/workflows/manual-release.yml
@@ -87,6 +87,7 @@ jobs:
             -DGGML_NATIVE=OFF \
             -DGGML_CPU_ALL_VARIANTS=ON \
             -DLLAMA_CURL=OFF \
+            -DLLAMA_OPENSSL=OFF \
             ${{ env.CMAKE_ARGS }}
           cmake --build build --config Release -j $(nproc)
 
@@ -141,6 +142,7 @@ jobs:
             -DGGML_CPU_ALL_VARIANTS=ON ^
             -DGGML_OPENMP=ON ^
             -DLLAMA_CURL=OFF ^
+            -DLLAMA_OPENSSL=OFF ^
             ${{ env.CMAKE_ARGS }}
           cmake --build build --config Release
 
@@ -356,6 +358,7 @@ jobs:
             -DGGML_METAL_EMBED_LIBRARY=ON \
             -DGGML_RPC=ON \
             -DLLAMA_CURL=OFF \
+            -DLLAMA_OPENSSL=OFF \
             ${{ env.CMAKE_ARGS }}
           cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
 

From ebca24d9f26b2476fe9659e437f9a545456293a7 Mon Sep 17 00:00:00 2001
From: dillon-blake <dillon@boxedlogic.com>
Date: Fri, 20 Feb 2026 19:51:53 +1100
Subject: [PATCH 5/5] Fix iGPU offload: move the CPU-only default into common init

---
 common/common.cpp | 16 ++++++++++++++++
 src/llama.cpp     |  9 +--------
 2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 3aa396127ce..ef3751913c6 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -3,6 +3,7 @@
 #endif
 
 #include "ggml.h"
+#include "ggml-backend.h"
 #include "gguf.h"
 
 #include "common.h"
@@ -1094,6 +1095,21 @@ common_init_result::common_init_result(common_params & params) :
     auto mparams = common_model_params_to_llama(params);
     auto cparams = common_context_params_to_llama(params);
 
+    // If only iGPUs are available and user didn't explicitly set -ngl, default to 0 (CPU-only)
+    if (mparams.n_gpu_layers == -1) { // -1 is taken here to mean -ngl was not set explicitly
+        bool has_dgpu = false;
+        bool has_igpu = false;
+        for (size_t i = 0; i < ggml_backend_dev_count(); i++) { // visit every device index the backend registry reports
+            auto type = ggml_backend_dev_type(ggml_backend_dev_get(i));
+            if (type == GGML_BACKEND_DEVICE_TYPE_GPU)  has_dgpu = true; // a device typed GPU (not IGPU) was seen
+            if (type == GGML_BACKEND_DEVICE_TYPE_IGPU) has_igpu = true; // a device typed IGPU was seen
+        }
+        if (has_igpu && !has_dgpu) { // iGPU(s) only: override the unset default to zero GPU layers
+            LOG_INF("%s: only integrated GPU(s) detected, defaulting to -ngl 0 (CPU-only)\n", __func__);
+            mparams.n_gpu_layers = 0;
+        }
+    }
+
     if (params.fit_params) {
         LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
         llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
diff --git a/src/llama.cpp b/src/llama.cpp
index 91af6e1f758..6da90d6f1f8 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -974,14 +974,7 @@ static struct llama_model * llama_model_load_from_file_impl(
 
         // add integrated GPUs only if no other devices were found
         if (model->devices.empty()) {
-            // if only iGPUs are available and user didn't explicitly set -ngl,
-            // default to 0 layers (CPU-only) since iGPU offloading shares system
-            // RAM and adds overhead without benefit
-            if (!igpus.empty() && params.n_gpu_layers == -1) {
-                LLAMA_LOG_INFO("%s: only integrated GPU(s) detected, defaulting to -ngl 0 (CPU-only)\n", __func__);
-            } else {
-                model->devices.insert(model->devices.end(), igpus.begin(), igpus.end());
-            }
+            model->devices.insert(model->devices.end(), igpus.begin(), igpus.end());
         }
     }