fix(scheduler): skip mid-flight prefix publish for sliding-window models

qywu · qywu · commit ce9f2f6cf353 · 2026-06-08T16:33:58.000Z
The prior commit's mid-flight prefix publish regressed gpt-oss-120b GPQA-diamond (~0.71 -> 0.547, on both NVIDIA and AMD): gpt-oss is non-hybrid and uses sliding-window attention, where sharing a prefix mid-flight (while the publishing request is still decoding) corrupts SWA prefix reuse. Full-attention prefix caching (ut-runtime-prefix-cache-e2e) and hybrid/MLA models were unaffected.

Add has_sliding_window to SchedulerConfig, derived in event_loop.py from hf_config.sliding_window (mirroring ModelRunner's SWA detection). For SWA models the scheduler passes a null kv_prefix_cache to SchedulePrefillEvent/ScheduleDecodeEvent so InsertPrefixCache skips the mid-flight publish; the prefix is published only at FinishEvent -- the prior, known-correct behavior. Full-attention non-hybrid models keep the mid-flight reuse; hybrid (DeepSeek-V4) is unchanged.

Signed-off-by: Qingyang Wu <willqywu@gmail.com>
diff --git a/python/tokenspeed/runtime/engine/event_loop.py b/python/tokenspeed/runtime/engine/event_loop.py
@@ -179,6 +179,10 @@ def __init__(
         has_mamba = getattr(self.model_config, "mambaish_config", None) is not None or (
             text_config is not None and hasattr(text_config, "mamba2_cache_params")
         )
+        # Sliding-window-attention models must not publish their prefix mid-flight (they
+        # publish only at FinishEvent); the SWA prefix-reuse path corrupts outputs
+        # otherwise. Mirror ModelRunner's SWA detection (hf_config.sliding_window).
+        has_sliding_window = getattr(hf_config, "sliding_window", None) is not None
 
         model_executor_config = ModelExecutorConfig.from_server_args(
             server_args=server_args,
@@ -327,6 +331,7 @@ def __init__(
                 else 1
             ),
             disable_prefix_cache=not server_args.enable_prefix_caching,
+            has_sliding_window=has_sliding_window,
             enable_mamba=has_mamba,
             mamba_cache_chunk_size=server_args.mamba_cache_chunk_size,
             mamba_pool_total_chunks=mamba_pool_total_chunks,
diff --git a/python/tokenspeed/runtime/engine/scheduler_utils.py b/python/tokenspeed/runtime/engine/scheduler_utils.py
@@ -64,6 +64,7 @@ def make_config(
     enable_kv_cache_events: bool = False,
     decode_input_tokens: int = 1,
     disable_prefix_cache: bool = False,
+    has_sliding_window: bool = False,
     enable_mamba: bool = False,
     mamba_cache_chunk_size: int = 64,
     mamba_pool_total_chunks: int = 0,
@@ -93,6 +94,7 @@ def make_config(
     cfg.num_device_pages = num_device_pages
     cfg.decode_input_tokens = decode_input_tokens
     cfg.disable_prefix_cache = disable_prefix_cache
+    cfg.has_sliding_window = has_sliding_window
     cfg.disable_l2_cache = disable_l2_cache
 
     cfg.enable_mamba = enable_mamba
diff --git a/tokenspeed-scheduler/bindings/python_module.cpp b/tokenspeed-scheduler/bindings/python_module.cpp
@@ -237,6 +237,7 @@ NB_MODULE(tokenspeed_scheduler_ext, m) {
         .def_rw("enable_kv_cache_events", &tokenspeed::SchedulerConfig::enable_kv_cache_events)
         .def_rw("enable_mixed_prefill_decode", &tokenspeed::SchedulerConfig::enable_mixed_prefill_decode)
         .def_rw("disable_prefix_cache", &tokenspeed::SchedulerConfig::disable_prefix_cache)
+        .def_rw("has_sliding_window", &tokenspeed::SchedulerConfig::has_sliding_window)
         .def_rw("enable_mamba", &tokenspeed::SchedulerConfig::enable_mamba)
         .def_rw("mamba_cache_chunk_size", &tokenspeed::SchedulerConfig::mamba_cache_chunk_size)
         .def_rw("mamba_pool_total_chunks", &tokenspeed::SchedulerConfig::mamba_pool_total_chunks)
diff --git a/tokenspeed-scheduler/csrc/fsm/forward_events.cpp b/tokenspeed-scheduler/csrc/fsm/forward_events.cpp
@@ -108,9 +108,11 @@ void InsertPrefixCache(KVPrefixCache* kv_prefix_cache, HybridPrefixCache* hybrid
                        LocalMambaAllocator* local_mamba_allocator, std::int32_t chunk_begin, std::int32_t chunk_size,
                        std::int32_t page_size) {
     // Hybrid models publish through the hybrid cache's wrapped KV cache (and additionally
-    // track a Mamba checkpoint); plain models publish through the base KV prefix cache.
-    // Either way the freshly-computed prefix becomes matchable by concurrent requests
-    // now, rather than only when this request finishes (FinishEvent).
+    // track a Mamba checkpoint); plain (non-hybrid) models publish through the base KV
+    // prefix cache, making the freshly-computed prefix matchable by concurrent requests
+    // now rather than only at FinishEvent. A null kv_prefix_cache (passed by the scheduler
+    // for sliding-window-attention models) disables this mid-flight publish, so SWA models
+    // fall back to the finish-only publish whose prefix reuse is known-correct.
     KVPrefixCache* kv = (hybrid_cache != nullptr) ? &hybrid_cache->GetKVPrefixCache() : kv_prefix_cache;
     if (kv == nullptr) return;
 
diff --git a/tokenspeed-scheduler/csrc/scheduler/operations/forward.cpp b/tokenspeed-scheduler/csrc/scheduler/operations/forward.cpp
@@ -191,8 +191,11 @@ std::optional<fsm::SchedulePrefillEvent> Scheduler::schedulePrefill(
         return {};
     }
 
+    // Sliding-window-attention models pass a null kv_prefix_cache so InsertPrefixCache
+    // skips the mid-flight publish; their prefix is published only at FinishEvent.
     return fsm::SchedulePrefillEvent{tokens_this_round, reserve_num_tokens_in_next_schedule_event,
-                                     hybrid_prefix_cache_ ? &*hybrid_prefix_cache_ : nullptr, &kv_prefix_cache_};
+                                     hybrid_prefix_cache_ ? &*hybrid_prefix_cache_ : nullptr,
+                                     config_.has_sliding_window ? nullptr : &kv_prefix_cache_};
 }
 
 std::optional<fsm::ScheduleDecodeEvent> Scheduler::scheduleDecode(Request* request,
@@ -217,8 +220,10 @@ std::optional<fsm::ScheduleDecodeEvent> Scheduler::scheduleDecode(Request* reque
         return {};
     }
 
+    // SWA models: skip mid-flight publish (see schedulePrefill) -- publish only at FinishEvent.
     return fsm::ScheduleDecodeEvent{config_.decode_input_tokens,
-                                    hybrid_prefix_cache_ ? &*hybrid_prefix_cache_ : nullptr, &kv_prefix_cache_};
+                                    hybrid_prefix_cache_ ? &*hybrid_prefix_cache_ : nullptr,
+                                    config_.has_sliding_window ? nullptr : &kv_prefix_cache_};
 }
 
 std::optional<fsm::ScheduleDecodeFromRetractedEvent> Scheduler::scheduleDecodeFromRetracted(
diff --git a/tokenspeed-scheduler/csrc/scheduler/types.h b/tokenspeed-scheduler/csrc/scheduler/types.h
@@ -101,6 +101,10 @@ struct SchedulerConfig {
     Role role{Role::kFused};
 
     bool disable_prefix_cache{false};
+    // Sliding-window-attention models publish their prefix only at FinishEvent: the
+    // mid-flight publish (prefill->decode) enables an SWA prefix-reuse path that
+    // corrupts outputs, so the scheduler skips it for these models.
+    bool has_sliding_window{false};
     bool enable_mamba{false};
     std::int32_t mamba_cache_chunk_size{64};
     std::int32_t mamba_pool_total_chunks{0};

Original file line number	Diff line number	Diff line change
`@@ -191,8 +191,11 @@ std::optional<fsm::SchedulePrefillEvent> Scheduler::schedulePrefill(`
`191`	`191`	`return {};`
`192`	`192`	`}`
`193`	`193`
	`194`	`+ // Sliding-window-attention models pass a null kv_prefix_cache so InsertPrefixCache`
	`195`	`+ // skips the mid-flight publish; their prefix is published only at FinishEvent.`
`194`	`196`	`return fsm::SchedulePrefillEvent{tokens_this_round, reserve_num_tokens_in_next_schedule_event,`
`195`		`- hybrid_prefix_cache_ ? &*hybrid_prefix_cache_ : nullptr, &kv_prefix_cache_};`
	`197`	`+ hybrid_prefix_cache_ ? &*hybrid_prefix_cache_ : nullptr,`
	`198`	`+ config_.has_sliding_window ? nullptr : &kv_prefix_cache_};`
`196`	`199`	`}`
`197`	`200`
`198`	`201`	`std::optional<fsm::ScheduleDecodeEvent> Scheduler::scheduleDecode(Request* request,`
`@@ -217,8 +220,10 @@ std::optional<fsm::ScheduleDecodeEvent> Scheduler::scheduleDecode(Request* reque`
`217`	`220`	`return {};`
`218`	`221`	`}`
`219`	`222`
	`223`	`+ // SWA models: skip mid-flight publish (see schedulePrefill) -- publish only at FinishEvent.`
`220`	`224`	`return fsm::ScheduleDecodeEvent{config_.decode_input_tokens,`
`221`		`- hybrid_prefix_cache_ ? &*hybrid_prefix_cache_ : nullptr, &kv_prefix_cache_};`
	`225`	`+ hybrid_prefix_cache_ ? &*hybrid_prefix_cache_ : nullptr,`
	`226`	`+ config_.has_sliding_window ? nullptr : &kv_prefix_cache_};`
`222`	`227`	`}`
`223`	`228`
`224`	`229`	`std::optional<fsm::ScheduleDecodeFromRetractedEvent> Scheduler::scheduleDecodeFromRetracted(`