From 8973842660d00ffd9498bd182fd4a42c474d43e9 Mon Sep 17 00:00:00 2001
From: "Volodymyr M. Lisivka" <vlisivka@gmail.com>
Date: Mon, 8 Jun 2026 13:51:31 +0300
Subject: [PATCH] hybrid recurrent memory cache fix (by mfielding92)

---
 tools/server/server-context.cpp | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 3e3db3c5e35..e1c89896ba6 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -5527,7 +5527,7 @@ struct server_context_impl {
                                     SLT_WRN(slot, "%s\n", st1.str().c_str());
                                 }
 
-                                if (pos_min >= pos_min_thold) {
+                                if (n_swa > 0 && pos_min >= pos_min_thold) {
                                     // search for a context checkpoint
                                     const auto it = std::find_if(
                                         slot.prompt.checkpoints.rbegin(),
@@ -5660,9 +5660,17 @@ struct server_context_impl {
 
                     SLT_TRC(slot, "cached n_tokens = %d, memory_seq_rm [%d, end)\n", slot.prompt.n_tokens(), p0);
 
-                    common_context_seq_rm(ctx_tgt, slot.id, p0, -1);
+                    if (!llama_memory_seq_rm(llama_get_memory(ctx_tgt), slot.id, p0, -1)) {
+                        SLT_WRN(slot, "partial seq_rm at p0=%d failed (recurrent backend cannot roll into cache); clearing slot and re-evaluating from scratch\n", p0);
+                        slot.n_prompt_tokens_cache = 0;
+                        slot.prompt.tokens.keep_first(0);
+                        common_context_seq_rm(ctx_tgt, slot.id, 0, -1);
+                    }
+
                     if (ctx_dft) {
-                        common_context_seq_rm(ctx_dft.get(), slot.id, p0, -1);
+                        if (!llama_memory_seq_rm(llama_get_memory(ctx_dft.get()), slot.id, p0, -1)) {
+                            common_context_seq_rm(ctx_dft.get(), slot.id, 0, -1);
+                        }
                     }
 
                     // If using an alora, there may be uncached tokens that come