Simplify _inspection: shared validation, finite checks, 1D y_loadings fix

jsture · jsture · commit 1c19393fcabf · 2026-07-02T09:22:16.000+02:00
diff --git a/src/scikit_opls/_inspection.py b/src/scikit_opls/_inspection.py
@@ -1,19 +1,9 @@
-"""Internal stateless math for OPLS VIP scores and explained-variance metrics.
+"""Stateless math helpers for OPLS explained-variance and VIP diagnostics.
 
-Private module — not part of the public API. The VIP scores are exposed as lazy
-``vip_`` / ``ortho_vip_`` properties on :class:`~scikit_opls.OPLS` and
-:class:`~scikit_opls.OPLSDA`; these functions compute them from fitted weights.
-
-VIP (Variable Importance in Projection) is defined in the style of Galindo-Prieto
-et al. (2014); these are not intended to reproduce ropls VIP values exactly:
-
-- predictive VIP is the standard PLS VIP of the predictive model fitted on the
-  orthogonally filtered X, weighting each component by the Y variance it explains;
-- orthogonal VIP is an X-variance-weighted score for the removed orthogonal
-  components, weighting each component by the X variance it explains.
-
-For non-empty blocks with positive explained variance, VIP is normalized so that
-sum(vip**2) == n_features. Empty or degenerate blocks return zeros.
+Private module — not part of the public API. Used by the fitted attributes of
+:class:`~scikit_opls.OPLS` and :class:`~scikit_opls.OPLSDA`. VIP scores are
+normalized so ``sum(vip**2) == n_features`` when component importance is
+positive; degenerate inputs return zeros.
 """
 
 from __future__ import annotations
@@ -27,15 +17,17 @@
 def _safe_total_ss(X: NDArray[np.float64]) -> float:
     """Total sum of squares with a nonzero guard."""
     total = float(np.sum(np.asarray(X, dtype=np.float64) ** 2))
-    return max(total, np.finfo(np.float64).eps)
+    return max(total, _EPS)
 
 
-def component_explained_x_variance(
+def _validate_x_scores_loadings(
     X: NDArray[np.float64],
     scores: NDArray[np.float64],
     loadings: NDArray[np.float64],
-) -> NDArray[np.float64]:
-    """Per-component ``SS(t_i @ p_i.T) / SS(X)`` for fitted arrays."""
+) -> tuple[NDArray[np.float64], NDArray[np.float64], NDArray[np.float64]]:
+    X = np.asarray(X, dtype=np.float64)
+    scores = np.asarray(scores, dtype=np.float64)
+    loadings = np.asarray(loadings, dtype=np.float64)
     if X.ndim != 2 or scores.ndim != 2 or loadings.ndim != 2:
         raise ValueError("X, scores and loadings must all be 2D arrays.")
     if scores.shape[0] != X.shape[0]:
@@ -44,6 +36,22 @@ def component_explained_x_variance(
         raise ValueError("loadings must have one row per feature of X.")
     if scores.shape[1] != loadings.shape[1]:
         raise ValueError("scores and loadings must have the same number of components.")
+    if not np.all(np.isfinite(X)):
+        raise ValueError("X must contain only finite values.")
+    if not np.all(np.isfinite(scores)):
+        raise ValueError("scores must contain only finite values.")
+    if not np.all(np.isfinite(loadings)):
+        raise ValueError("loadings must contain only finite values.")
+    return X, scores, loadings
+
+
+def component_explained_x_variance(
+    X: NDArray[np.float64],
+    scores: NDArray[np.float64],
+    loadings: NDArray[np.float64],
+) -> NDArray[np.float64]:
+    """Per-component ``SS(t_i @ p_i.T) / SS(X)`` for fitted arrays."""
+    X, scores, loadings = _validate_x_scores_loadings(X, scores, loadings)
     total = _safe_total_ss(X)
     out = np.empty(scores.shape[1], dtype=np.float64)
     for i in range(scores.shape[1]):
@@ -52,28 +60,6 @@ def component_explained_x_variance(
     return out
 
 
-def cumulative_r2_from_residuals(
-    original: NDArray[np.float64],
-    residuals_by_component: list[NDArray[np.float64]],
-) -> NDArray[np.float64]:
-    """Cumulative R² from a sequence of residual matrices."""
-    total = _safe_total_ss(original)
-    return np.asarray(
-        [1.0 - float(np.sum(resid**2)) / total for resid in residuals_by_component],
-        dtype=np.float64,
-    )
-
-
-def component_r2_from_cumulative(
-    cumulative: NDArray[np.float64],
-) -> NDArray[np.float64]:
-    """Convert cumulative R² to per-component increments."""
-    cumulative = np.asarray(cumulative, dtype=np.float64)
-    if cumulative.size == 0:
-        return cumulative
-    return np.diff(np.r_[0.0, cumulative])
-
-
 def component_r2y_from_scores(
     y: NDArray[np.float64],
     scores: NDArray[np.float64],
@@ -87,10 +73,28 @@ def component_r2y_from_scores(
     y_arr = np.asarray(y, dtype=np.float64)
     if y_arr.ndim == 1:
         y_arr = y_arr.reshape(-1, 1)
+    if y_arr.ndim != 2:
+        raise ValueError(f"y must be 1D or 2D, got shape {y_arr.shape}.")
     T = np.asarray(scores, dtype=np.float64)
+    if T.ndim != 2:
+        raise ValueError(f"scores must be 2D, got shape {T.shape}.")
     Q = np.asarray(y_loadings, dtype=np.float64)
     if Q.ndim == 1:
-        Q = Q.reshape(-1, 1)
+        # A 1D y_loadings is one value per component (single target), matching the
+        # (n_targets, n_components) convention used elsewhere (predictive_vip).
+        Q = Q.reshape(1, -1)
+    elif Q.ndim != 2:
+        raise ValueError(f"y_loadings must be 1D or 2D, got shape {Q.shape}.")
+    if T.shape[0] != y_arr.shape[0]:
+        raise ValueError("scores must have one row per sample of y.")
+    if Q.shape[1] != T.shape[1]:
+        raise ValueError("y_loadings must have one column per component.")
+    if not np.all(np.isfinite(y_arr)):
+        raise ValueError("y must contain only finite values.")
+    if not np.all(np.isfinite(T)):
+        raise ValueError("scores must contain only finite values.")
+    if not np.all(np.isfinite(Q)):
+        raise ValueError("y_loadings must contain only finite values.")
     total = _safe_total_ss(y_arr - y_arr.mean(axis=0, keepdims=True))
     out = np.empty(T.shape[1], dtype=np.float64)
     for i in range(T.shape[1]):
@@ -105,16 +109,9 @@ def explained_x_variance(
     loadings: NDArray[np.float64],
 ) -> float:
     """Nominal ``SS(T @ P.T) / SS(X)``; not clipped to ``[0, 1]``."""
-    if X.ndim != 2 or scores.ndim != 2 or loadings.ndim != 2:
-        raise ValueError("X, scores and loadings must all be 2D arrays.")
-    if scores.shape[0] != X.shape[0]:
-        raise ValueError("scores must have one row per sample of X.")
-    if loadings.shape[0] != X.shape[1]:
-        raise ValueError("loadings must have one row per feature of X.")
+    X, scores, loadings = _validate_x_scores_loadings(X, scores, loadings)
     if scores.shape[1] == 0:
         return 0.0
-    if scores.shape[1] != loadings.shape[1]:
-        raise ValueError("scores and loadings must have the same number of components.")
     total = float(np.sum(X**2))
     if total <= 0.0:
         return 0.0
@@ -124,19 +121,9 @@ def explained_x_variance(
 def _weighted_vip(
     weights: NDArray[np.float64], ss_per_component: NDArray[np.float64]
 ) -> NDArray[np.float64]:
-    """VIP from per-component weight vectors and their importance weights.
-
-    Parameters
-    ----------
-    weights : ndarray of shape (n_features, n_components)
-        Per-component weight vectors.
-    ss_per_component : ndarray of shape (n_components,)
-        Non-negative variance explained by each component.
+    """Return VIP scores from component weights and importance values.
 
-    Returns
-    -------
-    vip : ndarray of shape (n_features,)
-        VIP scores; all-zero when there are no components or zero total variance.
+    Returns all zeros when there are no components or the total importance is zero.
     """
     if weights.ndim != 2:
         raise ValueError(f"weights must be 2D, got shape {weights.shape}.")
@@ -171,26 +158,15 @@ def predictive_vip(
     x_scores: NDArray[np.float64],
     y_loadings: NDArray[np.float64],
 ) -> NDArray[np.float64]:
-    """Predictive VIP from the engine's weights/scores/Y-loadings.
-
-    Parameters
-    ----------
-    x_weights : ndarray of shape (n_features, n_components)
-        Predictive weight vectors.
-    x_scores : ndarray of shape (n_samples, n_components)
-        Predictive scores.
-    y_loadings : ndarray of shape (n_components,) or (1, n_components)
-        Y-loadings of the predictive components.
-
-    Returns
-    -------
-    vip : ndarray of shape (n_features,)
-        Predictive VIP scores.
-    """
+    """Return predictive PLS VIP from weights, scores and Y-loadings."""
     if x_weights.ndim != 2:
         raise ValueError(f"x_weights must be 2D, got shape {x_weights.shape}.")
     if x_scores.ndim != 2:
         raise ValueError(f"x_scores must be 2D, got shape {x_scores.shape}.")
+    if not np.all(np.isfinite(x_weights)):
+        raise ValueError("x_weights must contain only finite values.")
+    if not np.all(np.isfinite(x_scores)):
+        raise ValueError("x_scores must contain only finite values.")
 
     _, n_components = x_weights.shape
     if x_scores.shape[1] != n_components:
@@ -213,6 +189,8 @@ def predictive_vip(
         y_loadings_2d = y_loadings
     else:
         raise ValueError(f"y_loadings must be 1D or 2D, got shape {y_loadings.shape}.")
+    if not np.all(np.isfinite(y_loadings_2d)):
+        raise ValueError("y_loadings must contain only finite values.")
 
     # Standard PLS VIP weights each component by the Y sum of squares explained by
     # that component: loading strength times score energy.
@@ -225,22 +203,7 @@ def orthogonal_vip(
     x_ortho_scores: NDArray[np.float64],
     x_ortho_loadings: NDArray[np.float64],
 ) -> NDArray[np.float64]:
-    """Orthogonal VIP, each component weighted by the X variance it captures.
-
-    Parameters
-    ----------
-    x_ortho_weights : ndarray of shape (n_features, n_orthogonal)
-        Orthogonal weight vectors.
-    x_ortho_scores : ndarray of shape (n_samples, n_orthogonal)
-        Orthogonal scores.
-    x_ortho_loadings : ndarray of shape (n_features, n_orthogonal)
-        Orthogonal loadings.
-
-    Returns
-    -------
-    vip : ndarray of shape (n_features,)
-        Orthogonal VIP scores.
-    """
+    """Return orthogonal VIP weighted by removed X variance."""
     if x_ortho_weights.ndim != 2:
         raise ValueError(
             f"x_ortho_weights must be 2D, got shape {x_ortho_weights.shape}."
@@ -253,6 +216,12 @@ def orthogonal_vip(
         raise ValueError(
             f"x_ortho_loadings must be 2D, got shape {x_ortho_loadings.shape}."
         )
+    if not np.all(np.isfinite(x_ortho_weights)):
+        raise ValueError("x_ortho_weights must contain only finite values.")
+    if not np.all(np.isfinite(x_ortho_scores)):
+        raise ValueError("x_ortho_scores must contain only finite values.")
+    if not np.all(np.isfinite(x_ortho_loadings)):
+        raise ValueError("x_ortho_loadings must contain only finite values.")
 
     n_features, n_components = x_ortho_weights.shape
     if x_ortho_scores.shape[1] != n_components:
diff --git a/tests/test_diagnostics.py b/tests/test_diagnostics.py
@@ -257,7 +257,7 @@ def test_pls_engine_exposes_internal_x_mean_for_reconstruction():
     X, y = _regression_data()
     model = OPLS(n_components=1, n_orthogonal=0, scale="none").fit(X, y)
     assert hasattr(model.pls_, "_x_mean")
-    assert model.pls_._x_mean.shape == (X.shape[1],)
+    assert getattr(model.pls_, "_x_mean").shape == (X.shape[1],)
 
 
 # ==============================================================================
@@ -335,22 +335,15 @@ def test_oplsda_diagnostics_expect_raw_x_not_prescaled_x():
     )
 
 
-def test_component_r2_from_cumulative_empty_and_differences():
-    """Verify component_r2_from_cumulative with empty and non-empty inputs."""
-    from scikit_opls._inspection import component_r2_from_cumulative
+def test_component_r2y_from_scores_1d_y_loadings_matches_2d():
+    """A 1D y_loadings is one value per component, matching the 2D (1, n) form."""
+    from scikit_opls._inspection import component_r2y_from_scores
 
-    assert component_r2_from_cumulative(np.array([])).shape == (0,)
-    np.testing.assert_allclose(
-        component_r2_from_cumulative(np.array([0.2, 0.5, 0.8])),
-        [0.2, 0.3, 0.3],
-    )
-
-
-def test_cumulative_r2_from_residuals_decreases_with_smaller_residuals():
-    """Verify cumulative_r2_from_residuals decreases as residuals decrease."""
-    from scikit_opls._inspection import cumulative_r2_from_residuals
+    rng = np.random.default_rng(0)
+    y = rng.normal(size=10)
+    T = rng.normal(size=(10, 3))
+    q_1d = rng.normal(size=3)
 
-    X = np.eye(4)
-    out = cumulative_r2_from_residuals(X, [0.5 * X, 0.1 * X])
-    assert out[1] > out[0]
-    assert np.all((0.0 <= out) & (out <= 1.0))
+    out_1d = component_r2y_from_scores(y, T, q_1d)
+    out_2d = component_r2y_from_scores(y, T, q_1d.reshape(1, -1))
+    np.testing.assert_allclose(out_1d, out_2d)