From bc9b28421d47be4c9d660fe71eb3e0ddedaf5dd4 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Mon, 26 Jan 2026 10:37:33 +0000 Subject: [PATCH] fix prefix cache --- src/turbomind/engine/engine.cc | 7 +++++-- src/turbomind/models/llama/BlockManager.cc | 1 - src/turbomind/models/llama/SequenceManager.cc | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/turbomind/engine/engine.cc b/src/turbomind/engine/engine.cc index db9a3ff3a3..d54c3b4e20 100644 --- a/src/turbomind/engine/engine.cc +++ b/src/turbomind/engine/engine.cc @@ -344,7 +344,7 @@ void Engine::Impl::Interrupt(RequestCache& c) { auto& s = *TM_CHECK_NOTNULL(c.seq); if (c.req->session.end_flag) { - if (!is_warm_up_) { + if (!is_warm_up_ && s.status != Sequence::kCached) { // At least `Locked` status is required for caching seq_mgr_->CacheGeneration(s); } TM_CHECK(seq_mgr_->Erase(c.req->id)); @@ -691,6 +691,7 @@ void Engine::Impl::Update(BatchData& b, std::vector& signals) vector sequences_to_cache; for (int i = 0; i < b.rc.size(); ++i) { + // In async mode, `seq` may be nullptr when the request is done if (auto& c = *b.rc[i]; c.seq) { if (auto& s = *c.seq; generating[i]) { c.token_ids[c.seq_len] = output_ids[i]; @@ -714,7 +715,9 @@ void Engine::Impl::Update(BatchData& b, std::vector& signals) s.cache_len = sequence_length[i]; } c.done |= finished[i]; - sequences_to_cache.push_back(c.seq); + if (c.seq->status != Sequence::kCached) { // At least `Locked` status is required for caching + sequences_to_cache.push_back(c.seq); + } // dbg(c.seq_len, c.sequence.cache_len, c.alpha, c.beta, c.is_decoding, c.is_generate); } } diff --git a/src/turbomind/models/llama/BlockManager.cc b/src/turbomind/models/llama/BlockManager.cc index 7be87d73c7..707430e80e 100644 --- a/src/turbomind/models/llama/BlockManager.cc +++ b/src/turbomind/models/llama/BlockManager.cc @@ -3,7 +3,6 @@ #include #include "src/turbomind/models/llama/BlockManager.h" -#include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/debug_utils.h" #include "src/turbomind/utils/logger.h" #include "src/turbomind/utils/string_utils.h" diff --git a/src/turbomind/models/llama/SequenceManager.cc b/src/turbomind/models/llama/SequenceManager.cc index 50e669ae49..594fa78b6b 100644 --- a/src/turbomind/models/llama/SequenceManager.cc +++ b/src/turbomind/models/llama/SequenceManager.cc @@ -222,7 +222,7 @@ struct Schedule { max_fwd_tokens{max_fwd_tokens}, max_tmp_tokens{max_tmp_tokens}, use_count_{std::move(snapshot.use_count)}, - unlocked_{size}, + unlocked_(size), // ! This is a vector, DO NOT brace initialize it it_{size} { }