fix(scheduler): publish prefix to radix tree during prefill for non-hybrid models

qywu · qywu · commit 8373813b7f4e · 2026-06-08T07:22:04.000Z
A request's prompt-prefix KV was inserted into the shared device radix tree only at FinishEvent for non-hybrid models: the mid-flight InsertHybridCache early-returned when hybrid_prefix_cache_ was null (every non-DeepSeek-V4/Mamba model). A burst of concurrent requests sharing a prefix (RL rollouts with N samples/prompt, or a shared chat-template/system prefix) therefore all prefilled before any of them finished, giving ~0% prefix-cache reuse, vs ~26% for SGLang, which publishes during prefill (cache_unfinished_req).

Rename InsertHybridCache -> InsertPrefixCache; publish the freshly-computed prefix through the base KV prefix cache when there is no hybrid cache (hybrid path unchanged: still via hybrid_cache->GetKVPrefixCache()). The node is pinned via the request's DeviceNodeRef so it is not evicted while in use; Mamba checkpoint publication stays hybrid-only. Thread kv_prefix_cache_ into SchedulePrefillEvent and ScheduleDecodeEvent so they can publish for non-hybrid models.

Signed-off-by: Qingyang Wu <willqywu@gmail.com>
diff --git a/tokenspeed-scheduler/csrc/fsm/forward_events.cpp b/tokenspeed-scheduler/csrc/fsm/forward_events.cpp
@@ -102,12 +102,17 @@ bool ShouldPublishMambaCheckpoint(tokenspeed::HybridPrefixCache* hybrid_cache, s
 
 namespace tokenspeed::fsm {
 
-void InsertHybridCache(HybridPrefixCache* hybrid_cache,
+void InsertPrefixCache(KVPrefixCache* kv_prefix_cache, HybridPrefixCache* hybrid_cache,
                        const std::vector<std::span<const std::int32_t>>& full_paged_tokens,
                        std::unique_ptr<DeviceNodeRef>& device_node_ref, LocalKVAllocator* local_kv_allocator,
                        LocalMambaAllocator* local_mamba_allocator, std::int32_t chunk_begin, std::int32_t chunk_size,
                        std::int32_t page_size) {
-    if (hybrid_cache == nullptr) return;
+    // Hybrid models publish through the hybrid cache's wrapped KV cache (and additionally
+    // track a Mamba checkpoint); plain models publish through the base KV prefix cache.
+    // Either way the freshly-computed prefix becomes matchable by concurrent requests
+    // now, rather than only when this request finishes (FinishEvent).
+    KVPrefixCache* kv = (hybrid_cache != nullptr) ? &hybrid_cache->GetKVPrefixCache() : kv_prefix_cache;
+    if (kv == nullptr) return;
 
     std::vector<std::int32_t> prefix_pages = DevicePagesFromRoot(device_node_ref->Node());
     std::int32_t new_page_count =
@@ -120,10 +125,10 @@ void InsertHybridCache(HybridPrefixCache* hybrid_cache,
     }
 
     OwnedPages pages_to_insert = local_kv_allocator->TakeFirst(new_page_count);
-    auto insert_result = hybrid_cache->GetKVPrefixCache().Insert<ResourceType::Device>(full_paged_tokens, prefix_pages,
-                                                                                       std::move(pages_to_insert));
+    auto insert_result = kv->Insert<ResourceType::Device>(full_paged_tokens, prefix_pages, std::move(pages_to_insert));
 
-    if (local_mamba_allocator != nullptr && local_mamba_allocator->HasCheckpoint()) {
+    // Mamba checkpoint publication is hybrid-only.
+    if (hybrid_cache != nullptr && local_mamba_allocator != nullptr && local_mamba_allocator->HasCheckpoint()) {
         if (ShouldPublishMambaCheckpoint(hybrid_cache, chunk_begin, chunk_size, page_size)) {
             hybrid_cache->InsertMamba(insert_result.last_node, local_mamba_allocator->DetachCheckpoint());
         } else {
@@ -213,7 +218,7 @@ std::variant<PrefillDone, Prefilling> SchedulePrefillEvent::operator()(Prefillin
     if (end_of_window_pages < static_cast<std::int32_t>(paged_tokens.size())) {
         paged_tokens.resize(end_of_window_pages);
     }
-    InsertHybridCache(hybrid_prefix_cache_, paged_tokens, device_node_ref, local_kv_allocator.get(),
+    InsertPrefixCache(kv_prefix_cache_, hybrid_prefix_cache_, paged_tokens, device_node_ref, local_kv_allocator.get(),
                       local_mamba_allocator.get(), state.window.begin, state.window.size, state.GetPageSize());
     // Allocate KV pages for the new chunk
     local_kv_allocator->Acquire(tokens_this_round_);
@@ -263,7 +268,7 @@ Decoding ScheduleDecodeEvent::operator()(PrefillDone&& state) {
     if (end_of_window_pages < static_cast<std::int32_t>(paged_tokens.size())) {
         paged_tokens.resize(end_of_window_pages);
     }
-    InsertHybridCache(hybrid_prefix_cache_, paged_tokens, device_node_ref, local_kv_allocator.get(),
+    InsertPrefixCache(kv_prefix_cache_, hybrid_prefix_cache_, paged_tokens, device_node_ref, local_kv_allocator.get(),
                       local_mamba_allocator.get(), state.window.begin, state.window.size, state.GetPageSize());
     // Allocate fresh checkpoint for decode-phase mamba state tracking
     if (hybrid_prefix_cache_ != nullptr && local_mamba_allocator != nullptr) {
diff --git a/tokenspeed-scheduler/csrc/fsm/forward_events.h b/tokenspeed-scheduler/csrc/fsm/forward_events.h
@@ -52,7 +52,14 @@ namespace tokenspeed::fsm {
 struct PrefetchDone;
 struct Prefetching;
 
-void InsertHybridCache(HybridPrefixCache* hybrid_prefix_cache,
+// Publish a request's freshly-computed prefix into the device radix tree *mid-flight*
+// (during prefill / at the prefill->decode transition) so other in-flight requests that
+// share the prefix can reuse it -- instead of only after the request finishes
+// (FinishEvent). Works for both the plain KV prefix cache (kv_prefix_cache) and, when
+// present, the hybrid cache (which additionally publishes the Mamba checkpoint). The
+// published node is pinned via the request's device_node_ref so it is not evicted while
+// the request is still using it.
+void InsertPrefixCache(KVPrefixCache* kv_prefix_cache, HybridPrefixCache* hybrid_prefix_cache,
                        const std::vector<std::span<const std::int32_t>>& full_paged_tokens,
                        std::unique_ptr<DeviceNodeRef>& device_node_ref, LocalKVAllocator* local_kv_allocator,
                        LocalMambaAllocator* local_mamba_allocator, std::int32_t chunk_begin, std::int32_t chunk_size,
@@ -106,10 +113,11 @@ struct SchedulePrefillFirstChunkEvent : InvalidTransitionHandler<SchedulePrefill
 struct SchedulePrefillEvent : InvalidTransitionHandler<SchedulePrefillEvent> {
     using InvalidTransitionHandler<SchedulePrefillEvent>::operator();
     SchedulePrefillEvent(std::int32_t tokens_this_round, std::int32_t reserve_num_tokens_in_next_schedule_event,
-                         HybridPrefixCache* hybrid_prefix_cache = nullptr)
+                         HybridPrefixCache* hybrid_prefix_cache = nullptr, KVPrefixCache* kv_prefix_cache = nullptr)
         : tokens_this_round_(tokens_this_round),
           reserve_num_tokens_in_next_schedule_event_(reserve_num_tokens_in_next_schedule_event),
-          hybrid_prefix_cache_(hybrid_prefix_cache) {}
+          hybrid_prefix_cache_(hybrid_prefix_cache),
+          kv_prefix_cache_(kv_prefix_cache) {}
 
     // Returns PrefillDone (last chunk) or Prefilling (more chunks remain).
     std::variant<PrefillDone, Prefilling> operator()(Prefilling&& state);
@@ -118,20 +126,25 @@ struct SchedulePrefillEvent : InvalidTransitionHandler<SchedulePrefillEvent> {
     std::int32_t tokens_this_round_{};
     std::int32_t reserve_num_tokens_in_next_schedule_event_{};
     HybridPrefixCache* hybrid_prefix_cache_{};
+    KVPrefixCache* kv_prefix_cache_{};
 };
 
 struct ScheduleDecodeEvent : InvalidTransitionHandler<ScheduleDecodeEvent> {
     using InvalidTransitionHandler<ScheduleDecodeEvent>::operator();
 
-    ScheduleDecodeEvent(std::int32_t decode_input_tokens, HybridPrefixCache* hybrid_prefix_cache = nullptr)
-        : decode_input_tokens_(decode_input_tokens), hybrid_prefix_cache_(hybrid_prefix_cache) {}
+    ScheduleDecodeEvent(std::int32_t decode_input_tokens, HybridPrefixCache* hybrid_prefix_cache = nullptr,
+                        KVPrefixCache* kv_prefix_cache = nullptr)
+        : decode_input_tokens_(decode_input_tokens),
+          hybrid_prefix_cache_(hybrid_prefix_cache),
+          kv_prefix_cache_(kv_prefix_cache) {}
 
     Decoding operator()(PrefillDone&& state);
     Decoding operator()(Decoding&& state);
 
 private:
     std::int32_t decode_input_tokens_;
     HybridPrefixCache* hybrid_prefix_cache_{};
+    KVPrefixCache* kv_prefix_cache_{};
 };
 
 struct ScheduleDecodeFromRetractedEvent : InvalidTransitionHandler<ScheduleDecodeFromRetractedEvent> {
diff --git a/tokenspeed-scheduler/csrc/scheduler/operations/forward.cpp b/tokenspeed-scheduler/csrc/scheduler/operations/forward.cpp
@@ -192,7 +192,7 @@ std::optional<fsm::SchedulePrefillEvent> Scheduler::schedulePrefill(
     }
 
     return fsm::SchedulePrefillEvent{tokens_this_round, reserve_num_tokens_in_next_schedule_event,
-                                     hybrid_prefix_cache_ ? &*hybrid_prefix_cache_ : nullptr};
+                                     hybrid_prefix_cache_ ? &*hybrid_prefix_cache_ : nullptr, &kv_prefix_cache_};
 }
 
 std::optional<fsm::ScheduleDecodeEvent> Scheduler::scheduleDecode(Request* request,
@@ -218,7 +218,7 @@ std::optional<fsm::ScheduleDecodeEvent> Scheduler::scheduleDecode(Request* reque
     }
 
     return fsm::ScheduleDecodeEvent{config_.decode_input_tokens,
-                                    hybrid_prefix_cache_ ? &*hybrid_prefix_cache_ : nullptr};
+                                    hybrid_prefix_cache_ ? &*hybrid_prefix_cache_ : nullptr, &kv_prefix_cache_};
 }
 
 std::optional<fsm::ScheduleDecodeFromRetractedEvent> Scheduler::scheduleDecodeFromRetracted(

Original file line number	Diff line number	Diff line change
`@@ -192,7 +192,7 @@ std::optional<fsm::SchedulePrefillEvent> Scheduler::schedulePrefill(`
`192`	`192`	`}`
`193`	`193`
`194`	`194`	`return fsm::SchedulePrefillEvent{tokens_this_round, reserve_num_tokens_in_next_schedule_event,`
`195`		`- hybrid_prefix_cache_ ? &*hybrid_prefix_cache_ : nullptr};`
	`195`	`+ hybrid_prefix_cache_ ? &*hybrid_prefix_cache_ : nullptr, &kv_prefix_cache_};`
`196`	`196`	`}`
`197`	`197`
`198`	`198`	`std::optional<fsm::ScheduleDecodeEvent> Scheduler::scheduleDecode(Request* request,`
`@@ -218,7 +218,7 @@ std::optional<fsm::ScheduleDecodeEvent> Scheduler::scheduleDecode(Request* reque`
`218`	`218`	`}`
`219`	`219`
`220`	`220`	`return fsm::ScheduleDecodeEvent{config_.decode_input_tokens,`
`221`		`- hybrid_prefix_cache_ ? &*hybrid_prefix_cache_ : nullptr};`
	`221`	`+ hybrid_prefix_cache_ ? &*hybrid_prefix_cache_ : nullptr, &kv_prefix_cache_};`
`222`	`222`	`}`
`223`	`223`
`224`	`224`	`std::optional<fsm::ScheduleDecodeFromRetractedEvent> Scheduler::scheduleDecodeFromRetracted(`