Merge pull request #9 from brycewang-stanford/feat/arima-se-accessor

brycewang-stanford · web-flow · commit 2f61a5414975 · 2026-05-28T19:13:43.000-07:00
feat(arima): expose standard errors on ARIMAResult (#7)
diff --git a/.github/workflows/citation-audit.yml b/.github/workflows/citation-audit.yml
@@ -138,7 +138,27 @@ jobs:
         # Live verification against arXiv / NBER / Crossref. --strict:
         # unresolved IDs fail alongside mismatches, so a typo that
         # breaks primary-source lookup is caught early.
-        run: python tools/audit_citations.py --strict --out audit_report.md
+        #
+        # Exit-code contract (tools/audit_citations.py main()):
+        #   0 — clean.
+        #   1 — mismatch, or a GENUINE unresolved id (source reachable
+        #       but the id is absent) → a real §10 zero-hallucination
+        #       failure. Blocks the merge.
+        #   2 — soft failure: the ONLY unresolved ids were transient
+        #       upstream errors (arXiv / Crossref 429 rate-limit on the
+        #       shared runner IP, or a network blip). NOT a bad citation,
+        #       so it must not block a merge — we surface it as a warning
+        #       and pass. The auditor already retries 429/5xx with
+        #       back-off before giving up.
+        run: |
+          set +e
+          python tools/audit_citations.py --strict --out audit_report.md
+          code=$?
+          if [ "$code" -eq 2 ]; then
+            echo "::warning title=Citation audit soft failure::Auditor could not reach arXiv/Crossref (rate limit / network); no mismatch detected — treating as a soft pass (exit 2)."
+            exit 0
+          fi
+          exit "$code"
 
       - name: Upload citation audit report
         if: always()
diff --git a/src/statspai/timeseries/arima.py b/src/statspai/timeseries/arima.py
@@ -25,6 +25,7 @@ class ARIMAResult:
     order: Tuple[int, int, int]
     seasonal_order: Optional[Tuple[int, int, int, int]]
     params: pd.Series
+    se: pd.Series                    # asymptotic standard errors (param index)
     aic: float
     bic: float
     aicc: float
@@ -34,6 +35,44 @@ class ARIMAResult:
     n: int
     _model: object                   # statsmodels result (opaque)
 
+    # --- inference accessors -------------------------------------------------
+    @property
+    def std_errors(self) -> pd.Series:
+        """Alias for :attr:`se` (regression-style naming)."""
+        return self.se
+
+    @property
+    def tvalues(self) -> pd.Series:
+        """z-statistics ``params / se`` (SARIMAX uses a normal reference)."""
+        return self.params / self.se
+
+    @property
+    def pvalues(self) -> pd.Series:
+        """Two-sided p-values from the normal reference distribution."""
+        from scipy import stats
+        z = (self.params / self.se).to_numpy()
+        return pd.Series(2.0 * stats.norm.sf(np.abs(z)), index=self.params.index)
+
+    def conf_int(self, alpha: float = 0.05) -> pd.DataFrame:
+        """Confidence intervals for each parameter.
+
+        Parameters
+        ----------
+        alpha : float, default 0.05
+            ``1 - alpha`` is the coverage (0.05 → 95% CI).
+
+        Returns
+        -------
+        pd.DataFrame
+            Indexed by parameter name with ``lower`` / ``upper`` columns.
+        """
+        from scipy import stats
+        z = stats.norm.ppf(1.0 - alpha / 2.0)
+        lower = self.params - z * self.se
+        upper = self.params + z * self.se
+        return pd.DataFrame({"lower": lower, "upper": upper},
+                            index=self.params.index)
+
     def forecast(self, horizon: int = 10, alpha: float = 0.05) -> pd.DataFrame:
         fc = self._model.get_forecast(steps=horizon)
         pred = np.asarray(fc.predicted_mean).ravel()
@@ -71,10 +110,16 @@ def summary(self) -> str:
             f"AICc       : {self.aicc:.2f}",
             f"Log-Lik    : {self.log_likelihood:.2f}",
             "",
-            "Parameters:",
+            f"  {'':<15s}  {'coef':>10s}  {'std err':>10s}  {'z':>8s}  {'P>|z|':>8s}",
         ]
+        pvals = self.pvalues
         for nm, val in self.params.items():
-            lines.append(f"  {nm:<15s}  {val: .4f}")
+            s = float(self.se.get(nm, np.nan))
+            z = val / s if s and np.isfinite(s) else np.nan
+            p = float(pvals.get(nm, np.nan))
+            lines.append(
+                f"  {nm:<15s}  {val:>10.4f}  {s:>10.4f}  {z:>8.3f}  {p:>8.3f}"
+            )
         return "\n".join(lines)
 
     def __repr__(self) -> str:
@@ -104,6 +149,21 @@ def arima(
         If True, select (p, d, q) by AICc grid search (ignores ``order``).
     max_p, max_q, max_d : int
         Bounds for the auto search.
+
+    Returns
+    -------
+    ARIMAResult
+        Exposes ``params`` and the matching standard errors ``se`` (alias
+        ``std_errors``), plus ``tvalues``, ``pvalues``, and
+        ``conf_int(alpha)`` for inference, alongside ``aic`` / ``bic`` /
+        ``aicc`` / ``log_likelihood`` and ``forecast`` / ``plot``.
+
+    Examples
+    --------
+    >>> import statspai as sp
+    >>> res = sp.arima(df["gdp"], order=(2, 0, 0))
+    >>> res.se            # standard errors, indexed by parameter name
+    >>> res.conf_int()    # 95% confidence intervals
     """
     try:
         from statsmodels.tsa.statespace.sarimax import SARIMAX
@@ -148,10 +208,21 @@ def arima(
     k = sum(order) + 1
     aicc = res.aic + 2 * k * (k + 1) / max(n - k - 1, 1)
 
+    _param_index = res.param_names if hasattr(res, "param_names") else None
+    _params = pd.Series(res.params, index=_param_index)
+    # statsmodels computes the asymptotic SEs (sqrt of the diagonal of the
+    # covariance of the MLE) but we never surfaced them before; expose them.
+    _bse = getattr(res, "bse", None)
+    if _bse is not None:
+        _se = pd.Series(np.asarray(_bse, dtype=float), index=_param_index)
+    else:  # pragma: no cover - defensive; SARIMAX always populates bse
+        _se = pd.Series(np.full(len(_params), np.nan), index=_param_index)
+
     _result = ARIMAResult(
         order=order,
         seasonal_order=seasonal_order,
-        params=pd.Series(res.params, index=res.param_names) if hasattr(res, "param_names") else pd.Series(res.params),
+        params=_params,
+        se=_se,
         aic=float(res.aic),
         bic=float(res.bic),
         aicc=float(aicc),
diff --git a/tests/r_parity/39_arima.py b/tests/r_parity/39_arima.py
@@ -0,0 +1,61 @@
+"""StatsPAI ARIMA parity (Python side) -- Module 39.
+
+DGP: AR(2) with phi1=0.6, phi2=-0.2. Fits ARIMA(2,0,0). The
+companion R/Stata sides fit the same model.
+
+sp.arima now exposes ``ARIMAResult.se`` (statsmodels' asymptotic SEs),
+so we compare standard errors alongside the point estimates and logLik.
+
+Tolerance: rel < 1e-3 on AR coefficients.
+"""
+from __future__ import annotations
+
+import numpy as np
+import pandas as pd
+import statspai as sp
+
+from _common import PARITY_SEED, ParityRecord, dump_csv, write_results
+
+
+MODULE = "39_arima"
+
+
+def make_data(T: int = 300, seed: int = PARITY_SEED) -> pd.DataFrame:
+    rng = np.random.default_rng(seed)
+    y = np.zeros(T)
+    eps = rng.normal(0, 0.7, T)
+    for t in range(2, T):
+        y[t] = 0.6 * y[t - 1] - 0.2 * y[t - 2] + eps[t]
+    return pd.DataFrame({"y": y})
+
+
+def main() -> None:
+    df = make_data()
+    dump_csv(df, MODULE)
+
+    res = sp.arima(df["y"].values, order=(2, 0, 0))
+
+    rows: list[ParityRecord] = [
+        ParityRecord(MODULE, "py", "ar1",
+                     estimate=float(res.params["ar.L1"]),
+                     se=float(res.se["ar.L1"]),
+                     n=int(len(df))),
+        ParityRecord(MODULE, "py", "ar2",
+                     estimate=float(res.params["ar.L2"]),
+                     se=float(res.se["ar.L2"]),
+                     n=int(len(df))),
+        ParityRecord(MODULE, "py", "sigma2",
+                     estimate=float(res.params["sigma2"]),
+                     se=float(res.se["sigma2"]),
+                     n=int(len(df))),
+        ParityRecord(MODULE, "py", "logLik",
+                     estimate=float(res.log_likelihood),
+                     n=int(len(df))),
+    ]
+
+    write_results(MODULE, "py", rows,
+                  extra={"order": "(2,0,0)", "engine": "statsmodels"})
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/r_parity/results/39_arima_py.json b/tests/r_parity/results/39_arima_py.json
@@ -0,0 +1,54 @@
+{
+  "module": "39_arima",
+  "side": "py",
+  "rows": [
+    {
+      "module": "39_arima",
+      "side": "py",
+      "statistic": "ar1",
+      "estimate": 0.7018913469353353,
+      "se": 0.04784543441281086,
+      "ci_lo": null,
+      "ci_hi": null,
+      "n": 300,
+      "extra": {}
+    },
+    {
+      "module": "39_arima",
+      "side": "py",
+      "statistic": "ar2",
+      "estimate": -0.34289324080723854,
+      "se": 0.05372735783018966,
+      "ci_lo": null,
+      "ci_hi": null,
+      "n": 300,
+      "extra": {}
+    },
+    {
+      "module": "39_arima",
+      "side": "py",
+      "statistic": "sigma2",
+      "estimate": 0.4143918597001537,
+      "se": 0.03353450058875001,
+      "ci_lo": null,
+      "ci_hi": null,
+      "n": 300,
+      "extra": {}
+    },
+    {
+      "module": "39_arima",
+      "side": "py",
+      "statistic": "logLik",
+      "estimate": -291.58314136058783,
+      "se": null,
+      "ci_lo": null,
+      "ci_hi": null,
+      "n": 300,
+      "extra": {}
+    }
+  ],
+  "extra": {
+    "order": "(2,0,0)",
+    "engine": "statsmodels"
+  }
+}
diff --git a/tests/test_arima.py b/tests/test_arima.py
@@ -27,6 +27,41 @@ def test_arima_auto_selects(rw):
     assert res.aicc < 560  # should beat a bad model
 
 
+def test_arima_standard_errors(rw):
+    res = arima(rw, order=(1, 1, 0))
+    # se is exposed, aligned with params, positive and finite
+    assert res.se is not None
+    assert list(res.se.index) == list(res.params.index)
+    assert np.all(np.isfinite(res.se.to_numpy()))
+    assert np.all(res.se.to_numpy() > 0)
+    # std_errors is an alias for se
+    assert res.std_errors.equals(res.se)
+
+
+def test_arima_conf_int_and_pvalues(rw):
+    res = arima(rw, order=(2, 0, 0))
+    ci = res.conf_int(alpha=0.05)
+    assert list(ci.columns) == ["lower", "upper"]
+    assert list(ci.index) == list(res.params.index)
+    # params lie inside their own CI; bounds ordered
+    assert np.all(ci["lower"].to_numpy() <= res.params.to_numpy())
+    assert np.all(res.params.to_numpy() <= ci["upper"].to_numpy())
+    assert np.all(ci["lower"].to_numpy() < ci["upper"].to_numpy())
+    # pvalues in [0, 1], z = params / se
+    pv = res.pvalues
+    assert np.all((pv.to_numpy() >= 0) & (pv.to_numpy() <= 1))
+    np.testing.assert_allclose(res.tvalues.to_numpy(),
+                               (res.params / res.se).to_numpy())
+
+
+def test_arima_se_matches_statsmodels(rw):
+    # the exposed se must equal statsmodels' bse on the underlying fit
+    res = arima(rw, order=(1, 1, 1))
+    np.testing.assert_allclose(res.se.to_numpy(),
+                               np.asarray(res._model.bse, dtype=float),
+                               rtol=1e-12, atol=0)
+
+
 def test_exported():
     import statspai as sp
     assert callable(sp.arima)
diff --git a/tests/test_audit_citations.py b/tests/test_audit_citations.py
diff --git a/tools/audit_citations.py b/tools/audit_citations.py