From bae5df616f00ca96a5bd96ca6ccba9787583f5b8 Mon Sep 17 00:00:00 2001
From: jiyingd <87510204+dongjiyingdjy@users.noreply.github.com>
Date: Tue, 16 Jun 2026 04:44:42 +0000
Subject: [PATCH 1/2] fix(deepseek-v4): release superseded interior continuation-state snapshots

V4 State-family sliding groups (e.g. v4.c128a.compressor_state) attach a
trailing-window continuation-state snapshot to each turn's terminal node.
When a turn advances, the old terminal becomes an interior ancestor, but its
now-superseded snapshot was never released: adopt re-adopts History groups
only, and the LRU prune skips it because the owning request keeps every
ancestor Locked (RefCount > 0). The pinned pages accumulate one window per
turn and exhaust the small State pool, crashing all TP ranks (in
PagedCacheGroupTable::Acquire) at high concurrency.

CommitChunk now releases an ancestor's State portion (keeping its History
chain) once it is provably unreferenced: the owning request's sliding window
has advanced past it (node_depth + window <= chunk_depth, so ReleaseSkipped
has already dropped those pages from this request's borrowed set) AND this
request is the sole device referencer (RefCount == 1, so no other
shared-prefix request can be borrowing its continuation-state window).

V4-Pro TP8/EP8 +MTP c=8: 878/878, 0 crash; v4.c128a.compressor_state pool
p50 1490->280 (of 2385). GSM8K V4-Flash 0.96, V4-Pro 0.94. codex LGTM.

Signed-off-by: jiyingd <87510204+dongjiyingdjy@users.noreply.github.com> Co-Authored-By: Claude Opus 4.8 (1M context) --- .../hybrid_prefix_cache.cpp | 43 ++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/tokenspeed-scheduler/csrc/resource/hybrid_prefix_cache/hybrid_prefix_cache.cpp b/tokenspeed-scheduler/csrc/resource/hybrid_prefix_cache/hybrid_prefix_cache.cpp index e5bb1ab84..3d0a145db 100644 --- a/tokenspeed-scheduler/csrc/resource/hybrid_prefix_cache/hybrid_prefix_cache.cpp +++ b/tokenspeed-scheduler/csrc/resource/hybrid_prefix_cache/hybrid_prefix_cache.cpp @@ -1501,7 +1501,48 @@ void HybridPrefixCache::CommitChunk(const std::string& request_id, TreeNode* ter last_committed = target; } - (void)commitTerminalContinuationSnapshot(tables, terminal, chunk_depth); + const bool terminal_state_committed = + commitTerminalContinuationSnapshot(tables, terminal, chunk_depth); + + // Release superseded interior continuation-state snapshots. + // + // A continuation-state restore resumes from the deepest matching terminal + // (Match). Each turn's terminal becomes an interior ancestor on the next + // turn, but nothing released its now-superseded trailing-window state + // snapshot, so these pinned pages accumulate one window per turn and + // exhaust the small State pools (e.g. v4.c128a.compressor_state). Release an + // ancestor's State portion (keeping its History chain) only when it is + // provably unreferenced, which requires BOTH: + // (1) the owning request's sliding window has advanced past the ancestor + // (node_depth + window <= chunk_depth), so ReleaseSkipped has already + // dropped those pages from this request's own borrowed set; and + // (2) no OTHER request references the ancestor. 
Each request holds exactly + // one DeviceNodeRef that Locks its whole path to root (NodeRef::Lock), + // so Device().RefCount() == 1 means this committing request is the + // sole referencer and no other request can be borrowing the node's + // continuation-state window. When shared (RefCount > 1, e.g. a second + // request whose prefix runs through this node), keep the snapshot so + // the sharer's continuation-state resume stays valid; it is released + // on a later commit once the sharer's ref drops. + // Gate on a complete terminal snapshot so a resume anchor always remains. + if (terminal_state_committed) { + std::int32_t max_state_window = 0; + for (const auto& gid : paged_cache_continuation_state_groups_) { + auto alloc_it = paged_cache_allocators_.find(gid); + if (alloc_it != paged_cache_allocators_.end() && alloc_it->second != nullptr) { + max_state_window = std::max( + max_state_window, alloc_it->second->Config().sliding_window_tokens.value_or(0)); + } + } + for (TreeNode* cur = terminal->Parent(); cur != nullptr && !cur->IsRoot(); cur = cur->Parent()) { + if (!cur->HasPagedCacheSnapshot()) continue; + if (static_cast(cur->DepthInTokens()) + max_state_window > chunk_depth) { + continue; + } + if (!cur->OnDevice() || cur->Device().RefCount() != 1) continue; + DetachStateSnapshotFromNode(cur); + } + } } } // namespace tokenspeed From ed895f55c5b7c239bdd31fa4810447b7380be55d Mon Sep 17 00:00:00 2001 From: lightseek-bot <243258330+lightseek-bot@users.noreply.github.com> Date: Tue, 16 Jun 2026 05:40:33 +0000 Subject: [PATCH 2/2] style: format hybrid prefix cache scheduler code Signed-off-by: lightseek-bot <243258330+lightseek-bot@users.noreply.github.com> --- .../resource/hybrid_prefix_cache/hybrid_prefix_cache.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tokenspeed-scheduler/csrc/resource/hybrid_prefix_cache/hybrid_prefix_cache.cpp b/tokenspeed-scheduler/csrc/resource/hybrid_prefix_cache/hybrid_prefix_cache.cpp index 
3d0a145db..33ea88c1d 100644 --- a/tokenspeed-scheduler/csrc/resource/hybrid_prefix_cache/hybrid_prefix_cache.cpp +++ b/tokenspeed-scheduler/csrc/resource/hybrid_prefix_cache/hybrid_prefix_cache.cpp @@ -1501,8 +1501,7 @@ void HybridPrefixCache::CommitChunk(const std::string& request_id, TreeNode* ter last_committed = target; } - const bool terminal_state_committed = - commitTerminalContinuationSnapshot(tables, terminal, chunk_depth); + const bool terminal_state_committed = commitTerminalContinuationSnapshot(tables, terminal, chunk_depth); // Release superseded interior continuation-state snapshots. // @@ -1530,8 +1529,8 @@ void HybridPrefixCache::CommitChunk(const std::string& request_id, TreeNode* ter for (const auto& gid : paged_cache_continuation_state_groups_) { auto alloc_it = paged_cache_allocators_.find(gid); if (alloc_it != paged_cache_allocators_.end() && alloc_it->second != nullptr) { - max_state_window = std::max( - max_state_window, alloc_it->second->Config().sliding_window_tokens.value_or(0)); + max_state_window = + std::max(max_state_window, alloc_it->second->Config().sliding_window_tokens.value_or(0)); } } for (TreeNode* cur = terminal->Parent(); cur != nullptr && !cur->IsRoot(); cur = cur->Parent()) {