fix pd decode TP4->DP4EP4 bugs (#448)

tuanzhangCS · tuanzhangCS · web-flow · commit 215acaf14e33 · 2026-06-15T00:44:14.000-07:00
Co-authored-by: tuanzhangCS <tuan@lightseed.org>
diff --git a/python/tokenspeed/runtime/engine/event_loop.py b/python/tokenspeed/runtime/engine/event_loop.py
@@ -108,6 +108,23 @@ def calc_l3_query_hashes(scheduler, tokens: list[int]) -> list[str]:
 _PAUSED_IDLE_SLEEP_S = 0.001
 
 
+def _forward_op_executes_model_forward(forward_op, *, is_disagg_decode: bool) -> bool:
+    """Return whether ``forward_op`` will enter the model forward path.
+
+    On decode-side PD, EXTEND ops only start remote KV receive; the model
+    forward runs after the remote prefill completes and the scheduler advances
+    the request into decode. Treating those EXTEND ops as model work makes
+    idle DP ranks enter dummy collectives that the active rank will not match.
+    """
+    if forward_op is None:
+        return False
+    if sum(forward_op.input_lengths) <= 0:
+        return False
+    if is_disagg_decode and forward_op.num_extends() > 0:
+        return False
+    return True
+
+
 class _NullSender:
     """No-op ZMQ sender for non-rank-0 workers."""
 
@@ -1123,9 +1140,13 @@ def _dp_sync_and_check(self, forward_op) -> DpForwardMetadata:
         """
         import torch.distributed as dist
 
-        num_tokens = sum(forward_op.input_lengths) if forward_op is not None else 0
-        batch_size = len(forward_op.request_ids) if forward_op is not None else 0
-        if forward_op is None:
+        executes_model_forward = _forward_op_executes_model_forward(
+            forward_op,
+            is_disagg_decode=isinstance(self.pd_kv_transfer, DisaggDecodeExecutor),
+        )
+        num_tokens = sum(forward_op.input_lengths) if executes_model_forward else 0
+        batch_size = len(forward_op.request_ids) if executes_model_forward else 0
+        if not executes_model_forward:
             forward_mode = ForwardMode.IDLE
         else:
             forward_mode = ForwardMode.from_num_extends(
diff --git a/python/tokenspeed/runtime/pd/mooncake/receiver.py b/python/tokenspeed/runtime/pd/mooncake/receiver.py
@@ -164,6 +164,14 @@ def _build_buffer_layout_pair(
     prefill_tp_size: int,
     decode_tp_size: int,
 ):
+    """Build compatible logical layouts for one prefill/decode buffer pair.
+
+    Besides normal TP sharding and fully replicated buffers, this handles GQA
+    KV caches where prefill TP is larger than the number of distinct KV heads.
+    In that case multiple prefill TP ranks carry the same KV head, so the
+    transfer plan uses one representative rank from each replica group.
+    """
+
     if prefill_unit_len != decode_unit_len:
         raise ValueError(
             f"prefill/decode unit sizes differ for {buffer_kind.value}: "
@@ -184,12 +192,23 @@ def _build_buffer_layout_pair(
     decode_local_units = decode_item_len // decode_unit_len
     prefill_global_units = prefill_local_units * prefill_tp_size
     decode_global_units = decode_local_units * decode_tp_size
+    prefill_tp_replica_group_size = 1
     if prefill_global_units == decode_global_units:
         logical_axis = sharded_axis
         logical_size = decode_global_units
     elif prefill_item_len == decode_item_len:
         logical_axis = "replicated"
         logical_size = decode_local_units
+    elif (
+        sharded_axis == "kv_head"
+        and decode_global_units % prefill_local_units == 0
+        and decode_global_units // prefill_local_units <= prefill_tp_size
+        and prefill_tp_size % (decode_global_units // prefill_local_units) == 0
+    ):
+        logical_axis = sharded_axis
+        logical_size = decode_global_units
+        prefill_distinct_tp_size = decode_global_units // prefill_local_units
+        prefill_tp_replica_group_size = prefill_tp_size // prefill_distinct_tp_size
     else:
         raise ValueError(
             f"unsupported heterogeneous TP buffer layout for {buffer_kind.value}: "
@@ -207,6 +226,7 @@ def _build_buffer_layout_pair(
             page_size=1,
             bytes_per_logical_unit=decode_unit_len,
             item_stride_bytes=prefill_item_len,
+            tp_replica_group_size=prefill_tp_replica_group_size,
         ),
         BufferLayout(
             buffer_index=buffer_index,
diff --git a/python/tokenspeed/runtime/pd/transfer_plan.py b/python/tokenspeed/runtime/pd/transfer_plan.py
@@ -61,13 +61,21 @@ def tp_size_per_dp(self) -> int:
 
 @dataclass(frozen=True)
 class BufferLayout:
+    """Logical layout for one cache/state buffer.
+
+    ``tp_replica_group_size`` describes TP ranks that hold the same logical
+    shard. It is used by GQA/MQA-style KV caches when the prefill TP size is
+    larger than the number of distinct KV heads.
+    """
+
     buffer_index: int
     buffer_kind: BufferKind
     logical_axis: Literal["kv_head", "state_channel", "replicated"]
     logical_size: int
     page_size: int
     bytes_per_logical_unit: int
     item_stride_bytes: int
+    tp_replica_group_size: int = 1
 
     def __post_init__(self):
         if self.logical_size <= 0:
@@ -78,6 +86,8 @@ def __post_init__(self):
             raise UnsupportedPDLayoutError("bytes_per_logical_unit must be positive")
         if self.item_stride_bytes <= 0:
             raise UnsupportedPDLayoutError("item_stride_bytes must be positive")
+        if self.tp_replica_group_size <= 0:
+            raise UnsupportedPDLayoutError("tp_replica_group_size must be positive")
 
 
 @dataclass(frozen=True)
@@ -216,7 +226,9 @@ def plan_for_decode_rank(self, decode_rank: int) -> RankTransferPlan:
         target_dp_group = decode_rank // decode_tp_size
         decode_tp_rank = decode_rank % decode_tp_size
 
-        if self.prefill_layout.tp_size_per_dp == decode_tp_size:
+        if self._can_use_identity_plan() and (
+            self.prefill_layout.tp_size_per_dp == decode_tp_size
+        ):
             prefill_rank = (
                 target_dp_group * self.prefill_layout.tp_size_per_dp + decode_tp_rank
             )
@@ -261,19 +273,25 @@ def plan_for_decode_rank(self, decode_rank: int) -> RankTransferPlan:
                 fragments.setdefault(prefill_rank, []).append(fragment)
                 continue
 
-            decode_interval = self._rank_interval(
-                decode_buffer.logical_size, decode_tp_size, decode_tp_rank
+            decode_interval = self._rank_interval_for_buffer(
+                decode_buffer,
+                self.decode_layout,
+                decode_tp_rank,
             )
+            if decode_interval is None:
+                continue
             for prefill_tp_rank in range(self.prefill_layout.tp_size_per_dp):
                 prefill_rank = (
                     target_dp_group * self.prefill_layout.tp_size_per_dp
                     + prefill_tp_rank
                 )
-                prefill_interval = self._rank_interval(
-                    prefill_buffer.logical_size,
-                    self.prefill_layout.tp_size_per_dp,
+                prefill_interval = self._rank_interval_for_buffer(
+                    prefill_buffer,
+                    self.prefill_layout,
                     prefill_tp_rank,
                 )
+                if prefill_interval is None:
+                    continue
                 intersection = prefill_interval.intersect(decode_interval)
                 if intersection is None:
                     continue
@@ -341,11 +359,29 @@ def _validate_alignment(self) -> None:
             for buffer in buffers:
                 if buffer.logical_axis == "replicated":
                     continue
-                if buffer.logical_size % layout.tp_size_per_dp != 0:
+                if layout.tp_size_per_dp % buffer.tp_replica_group_size != 0:
+                    raise UnsupportedPDLayoutError(
+                        "tp replica group must divide TP size for "
+                        f"buffer_kind={buffer.buffer_kind.value}: "
+                        f"tp_size_per_dp={layout.tp_size_per_dp}, "
+                        f"tp_replica_group_size={buffer.tp_replica_group_size}"
+                    )
+                effective_tp_size = (
+                    layout.tp_size_per_dp // buffer.tp_replica_group_size
+                )
+                if buffer.logical_size % effective_tp_size != 0:
                     raise UnsupportedPDLayoutError(
                         "non-aligned TP heterogeneous mapping for "
                         f"buffer_kind={buffer.buffer_kind.value}: logical_size="
-                        f"{buffer.logical_size}, tp_size_per_dp={layout.tp_size_per_dp}"
+                        f"{buffer.logical_size}, effective_tp_size={effective_tp_size}"
+                    )
+                item_units = buffer.item_stride_bytes // buffer.bytes_per_logical_unit
+                required_units = buffer.logical_size // effective_tp_size
+                if item_units < required_units:
+                    raise UnsupportedPDLayoutError(
+                        "buffer item is smaller than its logical shard for "
+                        f"buffer_kind={buffer.buffer_kind.value}: item_units="
+                        f"{item_units}, required_units={required_units}"
                     )
 
     def _calc_source_fanout(self) -> dict[int, int]:
@@ -370,17 +406,21 @@ def _calc_source_fanout(self) -> dict[int, int]:
                     intersected_prefill_ranks.add(prefill_rank)
                     continue
 
-                decode_interval = self._rank_interval(
-                    decode_buffer.logical_size,
-                    self.decode_layout.tp_size_per_dp,
+                decode_interval = self._rank_interval_for_buffer(
+                    decode_buffer,
+                    self.decode_layout,
                     decode_tp_rank,
                 )
+                if decode_interval is None:
+                    continue
                 for prefill_tp_rank in range(self.prefill_layout.tp_size_per_dp):
-                    prefill_interval = self._rank_interval(
-                        prefill_buffer.logical_size,
-                        self.prefill_layout.tp_size_per_dp,
+                    prefill_interval = self._rank_interval_for_buffer(
+                        prefill_buffer,
+                        self.prefill_layout,
                         prefill_tp_rank,
                     )
+                    if prefill_interval is None:
+                        continue
                     if prefill_interval.intersect(decode_interval) is None:
                         continue
                     prefill_rank = (
@@ -392,12 +432,34 @@ def _calc_source_fanout(self) -> dict[int, int]:
                 fanout[prefill_rank] += 1
         return fanout
 
+    def _can_use_identity_plan(self) -> bool:
+        return all(
+            prefill_buffer.tp_replica_group_size == 1
+            and decode_buffer.tp_replica_group_size == 1
+            for prefill_buffer, decode_buffer in zip(
+                self.prefill_buffers, self.decode_buffers
+            )
+        )
+
     @staticmethod
     def _rank_interval(logical_size: int, tp_size: int, tp_rank: int) -> _Interval:
         local_size = logical_size // tp_size
         start = tp_rank * local_size
         return _Interval(start, start + local_size)
 
+    @staticmethod
+    def _rank_interval_for_buffer(
+        buffer: BufferLayout, layout: ParallelLayout, tp_rank: int
+    ) -> _Interval | None:
+        replica_group_size = buffer.tp_replica_group_size
+        if tp_rank % replica_group_size != 0:
+            return None
+        effective_tp_size = layout.tp_size_per_dp // replica_group_size
+        effective_tp_rank = tp_rank // replica_group_size
+        return PDTransferPlanner._rank_interval(
+            buffer.logical_size, effective_tp_size, effective_tp_rank
+        )
+
     @staticmethod
     def _replicated_source_tp_rank(
         prefill_tp_size: int, decode_tp_size: int, decode_tp_rank: int
diff --git a/test/runtime/distributed/test_pd_decode_dp_metadata.py b/test/runtime/distributed/test_pd_decode_dp_metadata.py
@@ -0,0 +1,59 @@
+# Copyright (c) 2026 LightSeek Foundation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+from tokenspeed.runtime.engine.event_loop import _forward_op_executes_model_forward
+
+
+class FakeForwardOp:
+    def __init__(self, *, input_lengths, request_ids=None, num_extends=0):
+        self.input_lengths = input_lengths
+        self.request_ids = request_ids or [
+            f"req-{i}" for i in range(len(input_lengths))
+        ]
+        self._num_extends = num_extends
+
+    def num_extends(self):
+        return self._num_extends
+
+
+def test_pd_decode_extend_only_does_not_require_idle_forward():
+    # Decode-side PD EXTEND starts KV receive only; no model collectives run on
+    # the active DP rank yet, so idle DP ranks must not enter dummy forward.
+    op = FakeForwardOp(input_lengths=[17], num_extends=1)
+
+    assert not _forward_op_executes_model_forward(op, is_disagg_decode=True)
+
+
+def test_pd_decode_decode_step_requires_idle_forward():
+    op = FakeForwardOp(input_lengths=[1], num_extends=0)
+
+    assert _forward_op_executes_model_forward(op, is_disagg_decode=True)
+
+
+def test_non_pd_extend_still_executes_model_forward():
+    op = FakeForwardOp(input_lengths=[17], num_extends=1)
+
+    assert _forward_op_executes_model_forward(op, is_disagg_decode=False)
+
+
+def test_zero_token_forward_op_is_not_model_work():
+    op = FakeForwardOp(input_lengths=[0], num_extends=1)
+
+    assert not _forward_op_executes_model_forward(op, is_disagg_decode=False)
diff --git a/test/runtime/distributed/test_pd_transfer_plan.py b/test/runtime/distributed/test_pd_transfer_plan.py
@@ -0,0 +1,47 @@
+from tokenspeed.runtime.pd.mooncake.receiver import _build_buffer_layout_pair
+from tokenspeed.runtime.pd.transfer_plan import (
+    BufferKind,
+    ParallelLayout,
+    PDTransferPlanner,
+)
+
+
+def test_replicated_prefill_kv_heads_transfer_to_decode_full_kv_heads():
+    prefill_buffer, decode_buffer = _build_buffer_layout_pair(
+        buffer_index=0,
+        buffer_kind=BufferKind.TARGET_K,
+        sharded_axis="kv_head",
+        prefill_item_len=16_384,
+        decode_item_len=32_768,
+        prefill_unit_len=256,
+        decode_unit_len=256,
+        prefill_tp_size=4,
+        decode_tp_size=1,
+    )
+
+    assert prefill_buffer.logical_size == 128
+    assert prefill_buffer.tp_replica_group_size == 2
+    assert decode_buffer.logical_size == 128
+    assert decode_buffer.tp_replica_group_size == 1
+
+    planner = PDTransferPlanner(
+        prefill_layout=ParallelLayout(role="prefill", world_size=4),
+        decode_layout=ParallelLayout(role="decode", world_size=1),
+        prefill_buffers=(prefill_buffer,),
+        decode_buffers=(decode_buffer,),
+    )
+    plan = planner.plan_for_decode_rank(0)
+
+    assert plan.plan_kind == "fragmented"
+    assert plan.target_prefill_ranks == (0, 2)
+    assert plan.required_prefill_response_num == 2
+    assert plan.required_dst_info_num_by_prefill_rank == {0: 1, 2: 1}
+
+    first_head = plan.fragments_by_prefill_rank[0][0]
+    second_head = plan.fragments_by_prefill_rank[2][0]
+    assert first_head.src_byte_offset == 0
+    assert first_head.dst_byte_offset == 0
+    assert first_head.bytes_per_page == 16_384
+    assert second_head.src_byte_offset == 0
+    assert second_head.dst_byte_offset == 16_384
+    assert second_head.bytes_per_page == 16_384