fix(neural): keep inference on the fitted module's device + validate CUDA index

brycewang-stanford · brycewang-stanford · commit e728456082e8 · 2026-05-05T20:22:20.000-07:00
Two related correctness fixes for STATSPAI_TORCH_DEVICE routing:

1. resolve_torch_device() now rejects "cuda:N" with N >= torch.cuda.device_count()
   instead of silently constructing an out-of-range device. Also tightens the
   prefix check from startswith("cuda") to "cuda" / startswith("cuda:") so
   stray strings like "cudafoo" no longer slip through the CUDA branch.

2. DeepIV.effect() and TARNet/CFRNet/DragonNet effect / propensity paths now
   place the input tensor on next(module.parameters()).device — the device the
   network was actually fitted on — instead of re-resolving from the env var
   each call. Previously, flipping STATSPAI_TORCH_DEVICE between fit and effect
   (or any post-fit device move) raised a cross-device RuntimeError.

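Tests: explicit device-count guard test in test_torch_device_resolver, an
env-flip regression check in test_deepiv (effect() must still succeed after
STATSPAI_TORCH_DEVICE is changed post-fit), and spy-based assertions in
test_neural_causal that effect / propensity tensors land on the same device
as the fitted parameters.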
diff --git a/src/statspai/deepiv/deep_iv.py b/src/statspai/deepiv/deep_iv.py
@@ -538,6 +538,7 @@ def fit(self) -> CausalResult:
         self._effects = effects
         self._x_means = X_means
         self._x_stds = X_stds
+        self._device = device
 
         return CausalResult(
             method='DeepIV (Hartford et al. 2017)',
@@ -592,8 +593,7 @@ def effect(self, t0: float, t1: float, X: Optional[np.ndarray] = None) -> np.nda
         t1_s = (t1 - self._t_mean) / self._t_std
         n = len(X_s)
 
-        from ..utils._torch_device import resolve_torch_device
-        device = resolve_torch_device()
+        device = next(self._response_net.parameters()).device
         X_t = torch.tensor(X_s, dtype=torch.float32, device=device)
 
         with torch.no_grad():
diff --git a/src/statspai/neural_causal/models.py b/src/statspai/neural_causal/models.py
@@ -395,6 +395,11 @@ def _build_head(input_dim, hidden_layers):
     return nn.Sequential(*layers)
 
 
+def _module_device(module):
+    """Return the device that holds a fitted torch module's parameters."""
+    return next(module.parameters()).device
+
+
 # ======================================================================
 # TARNet
 # ======================================================================
@@ -554,6 +559,7 @@ def fit(self) -> CausalResult:
         self._head_0 = head_0
         self._head_1 = head_1
         self._cate = cate
+        self._device = device
 
         model_info = self._build_model_info(cate, D, n)
 
@@ -595,7 +601,8 @@ def effect(self, X_new: Optional[np.ndarray] = None) -> np.ndarray:
 
         X_new = np.asarray(X_new, dtype=np.float32)
         X_s = (X_new - self._x_mean) / self._x_std
-        X_t = torch.tensor(X_s, dtype=torch.float32)
+        device = _module_device(self._repr_net)
+        X_t = torch.tensor(X_s, dtype=torch.float32, device=device)
 
         self._repr_net.eval()
         self._head_0.eval()
@@ -796,6 +803,7 @@ def fit(self) -> CausalResult:
         self._head_0 = head_0
         self._head_1 = head_1
         self._cate = cate
+        self._device = device
 
         model_info = {
             'architecture': 'CFRNet',
@@ -844,7 +852,8 @@ def effect(self, X_new: Optional[np.ndarray] = None) -> np.ndarray:
 
         X_new = np.asarray(X_new, dtype=np.float32)
         X_s = (X_new - self._x_mean) / self._x_std
-        X_t = torch.tensor(X_s, dtype=torch.float32)
+        device = _module_device(self._repr_net)
+        X_t = torch.tensor(X_s, dtype=torch.float32, device=device)
 
         self._repr_net.eval()
         self._head_0.eval()
@@ -1085,6 +1094,7 @@ def fit(self) -> CausalResult:
         self._prop_head = prop_head
         self._cate = cate
         self._e_hat = e_hat
+        self._device = device
 
         model_info = {
             'architecture': 'DragonNet',
@@ -1139,7 +1149,8 @@ def effect(self, X_new: Optional[np.ndarray] = None) -> np.ndarray:
 
         X_new = np.asarray(X_new, dtype=np.float32)
         X_s = (X_new - self._x_mean) / self._x_std
-        X_t = torch.tensor(X_s, dtype=torch.float32)
+        device = _module_device(self._repr_net)
+        X_t = torch.tensor(X_s, dtype=torch.float32, device=device)
 
         self._repr_net.eval()
         self._head_0.eval()
@@ -1176,7 +1187,8 @@ def propensity(self, X_new: Optional[np.ndarray] = None) -> np.ndarray:
 
         X_new = np.asarray(X_new, dtype=np.float32)
         X_s = (X_new - self._x_mean) / self._x_std
-        X_t = torch.tensor(X_s, dtype=torch.float32)
+        device = _module_device(self._repr_net)
+        X_t = torch.tensor(X_s, dtype=torch.float32, device=device)
 
         self._repr_net.eval()
         self._prop_head.eval()
diff --git a/src/statspai/utils/_torch_device.py b/src/statspai/utils/_torch_device.py
@@ -69,13 +69,19 @@ def resolve_torch_device(prefer: Optional[str] = None):
             return torch.device("mps")
         return torch.device("cpu")
 
-    if spec.startswith("cuda"):
+    if spec == "cuda" or spec.startswith("cuda:"):
         if not torch.cuda.is_available():
             raise RuntimeError(
                 f"{_ENV_VAR}={raw!r} requested CUDA but torch.cuda.is_available() is False. "
                 "Install a CUDA-enabled PyTorch build or set STATSPAI_TORCH_DEVICE=cpu."
             )
-        return torch.device(spec)
+        device = torch.device(spec)
+        if device.index is not None and device.index >= torch.cuda.device_count():
+            raise RuntimeError(
+                f"{_ENV_VAR}={raw!r} requested CUDA device {device.index}, "
+                f"but only {torch.cuda.device_count()} device(s) are available."
+            )
+        return device
 
     if spec == "mps":
         if not _mps_available(torch):
diff --git a/tests/test_deepiv.py b/tests/test_deepiv.py
@@ -101,17 +101,20 @@ def test_class_interface(self, linear_iv_data):
         result = est.fit()
         assert isinstance(result, CausalResult)
 
-    def test_effect_method(self, linear_iv_data):
+    def test_effect_method(self, linear_iv_data, monkeypatch):
         est = DeepIV(
             data=linear_iv_data, y='y', treat='treat',
             instruments=['instrument'], covariates=['covar'],
             first_stage_epochs=20, second_stage_epochs=20,
             hidden_layers=(32,), n_components=3,
         )
         est.fit()
+        monkeypatch.setenv("STATSPAI_TORCH_DEVICE", "cuda")
+        monkeypatch.setattr(torch.cuda, "is_available", lambda: False)
         effects = est.effect(t0=0.0, t1=1.0)
         assert len(effects) == 2000
         assert np.isfinite(effects).all()
+        assert next(est._response_net.parameters()).device.type == "cpu"
 
 
 class TestDeepIVValidation:
diff --git a/tests/test_neural_causal.py b/tests/test_neural_causal.py
@@ -6,7 +6,9 @@
 import numpy as np
 import pandas as pd
 
-pytest.importorskip("torch", reason="PyTorch required for neural causal tests")
+torch = pytest.importorskip(
+    "torch", reason="PyTorch required for neural causal tests"
+)
 
 from statspai.neural_causal import (
     tarnet, cfrnet, dragonnet,
@@ -15,6 +17,23 @@
 from statspai.core.results import CausalResult
 
 
+def _spy_tensor_devices(monkeypatch):
+    """Capture devices passed to torch.tensor after model fitting."""
+    devices = []
+    original_tensor = torch.tensor
+
+    def spy_tensor(*args, **kwargs):
+        devices.append(kwargs.get("device"))
+        return original_tensor(*args, **kwargs)
+
+    monkeypatch.setattr(torch, "tensor", spy_tensor)
+    return devices
+
+
+def _module_device(module):
+    return next(module.parameters()).device
+
+
 # ======================================================================
 # Fixtures: DGPs with known true effects
 # ======================================================================
@@ -138,16 +157,18 @@ def test_class_interface(self, small_data):
         cate = est.effect()
         assert len(cate) == len(small_data)
 
-    def test_effect_new_data(self, small_data):
+    def test_effect_new_data(self, small_data, monkeypatch):
         est = TARNet(data=small_data, y='y', treat='d',
                      covariates=['x1', 'x2'],
                      epochs=50, repr_layers=(64,),
                      head_layers=(32,), n_bootstrap=50)
         est.fit()
 
         X_new = np.random.randn(10, 2).astype(np.float32)
+        devices = _spy_tensor_devices(monkeypatch)
         cate_new = est.effect(X_new)
         assert len(cate_new) == 10
+        assert devices[-1] == _module_device(est._repr_net)
 
     def test_summary_renders(self, small_data):
         result = tarnet(small_data, y='y', treat='d',
@@ -233,7 +254,7 @@ def test_citation(self, small_data):
         bib = result.cite()
         assert 'shalit2017' in bib
 
-    def test_class_effect_method(self, small_data):
+    def test_class_effect_method(self, small_data, monkeypatch):
         est = CFRNet(data=small_data, y='y', treat='d',
                      covariates=['x1', 'x2'],
                      epochs=50, repr_layers=(64,),
@@ -243,8 +264,10 @@ def test_class_effect_method(self, small_data):
         assert len(cate) == len(small_data)
 
         X_new = np.random.randn(5, 2).astype(np.float32)
+        devices = _spy_tensor_devices(monkeypatch)
         cate_new = est.effect(X_new)
         assert len(cate_new) == 5
+        assert devices[-1] == _module_device(est._repr_net)
 
 
 # ======================================================================
@@ -285,18 +308,20 @@ def test_propensity_scores(self, small_data):
         assert np.all(e >= 0.01)
         assert np.all(e <= 0.99)
 
-    def test_propensity_new_data(self, small_data):
+    def test_propensity_new_data(self, small_data, monkeypatch):
         est = DragonNet(data=small_data, y='y', treat='d',
                         covariates=['x1', 'x2'],
                         epochs=50, repr_layers=(64,),
                         head_layers=(32,), n_bootstrap=50)
         est.fit()
 
         X_new = np.random.randn(10, 2).astype(np.float32)
+        devices = _spy_tensor_devices(monkeypatch)
         e_new = est.propensity(X_new)
         assert len(e_new) == 10
         assert np.all(e_new >= 0.01)
         assert np.all(e_new <= 0.99)
+        assert devices[-1] == _module_device(est._repr_net)
 
     def test_constant_effect_recovery(self, constant_effect_data):
         result = dragonnet(constant_effect_data, y='y', treat='d',
@@ -324,7 +349,7 @@ def test_citation(self, small_data):
         bib = result.cite()
         assert 'shi2019' in bib
 
-    def test_effect_method(self, small_data):
+    def test_effect_method(self, small_data, monkeypatch):
         est = DragonNet(data=small_data, y='y', treat='d',
                         covariates=['x1', 'x2'],
                         epochs=50, repr_layers=(64,),
@@ -335,8 +360,10 @@ def test_effect_method(self, small_data):
         assert len(cate) == len(small_data)
 
         X_new = np.random.randn(5, 2).astype(np.float32)
+        devices = _spy_tensor_devices(monkeypatch)
         cate_new = est.effect(X_new)
         assert len(cate_new) == 5
+        assert devices[-1] == _module_device(est._repr_net)
 
 
 # ======================================================================
diff --git a/tests/test_torch_device_resolver.py b/tests/test_torch_device_resolver.py
@@ -54,6 +54,14 @@ def test_explicit_cuda_returns_when_available(monkeypatch):
     assert dev.type == "cuda"
 
 
+def test_explicit_cuda_index_checks_device_count(monkeypatch):
+    monkeypatch.setattr(torch.cuda, "is_available", lambda: True)
+    monkeypatch.setattr(torch.cuda, "device_count", lambda: 1)
+    monkeypatch.setenv("STATSPAI_TORCH_DEVICE", "cuda:1")
+    with pytest.raises(RuntimeError, match="only 1 device"):
+        resolve_torch_device()
+
+
 def test_auto_falls_back_to_cpu_when_no_accelerator(monkeypatch):
     monkeypatch.setattr(torch.cuda, "is_available", lambda: False)
     # Force the MPS probe to return False even on Apple Silicon.