fix(vlm): OOM on default gpu_memory_utilization

chenht2022 · chenht2022 · commit fdac99d848f0 · 2026-05-26T19:51:08.000Z
Lower the default gpu_memory_utilization for the world_size >= 4 branch
(e.g. tp=4) from 0.95 to 0.9 to leave headroom for the vision encoder
and avoid OOM.

Signed-off-by: chenht2022 <chenht2022@gmail.com>
diff --git a/python/tokenspeed/runtime/utils/server_args.py b/python/tokenspeed/runtime/utils/server_args.py
@@ -361,7 +361,7 @@ def resolve_memory_and_scheduling(self):
             elif self.mapping.world_size >= 8:
                 self.gpu_memory_utilization = 0.81
             elif self.mapping.world_size >= 4:
-                self.gpu_memory_utilization = 0.95
+                self.gpu_memory_utilization = 0.9
             elif self.mapping.world_size >= 2:
                 self.gpu_memory_utilization = 0.87
             else: