Fix OPLSDA feature-name validation in plotting; add S-plot x_space

jsture · jsture · commit 9c7348e26f46 · 2026-06-29T13:26:00.000+01:00
diff --git a/src/scikit_opls/_opls.py b/src/scikit_opls/_opls.py
@@ -493,19 +493,36 @@ def score(self, X: ArrayLike, y: ArrayLike, sample_weight=None) -> float:
         """
         return super().score(X, y, sample_weight)
 
-    def _filter(self, X: ArrayLike) -> tuple[NDArray[np.float64], NDArray[np.float64]]:
-        """Preprocess and orthogonal-filter new ``X`` exactly as at fit time.
+    def _validate_X_predict(self, X: ArrayLike) -> NDArray[np.float64]:  # noqa: N802
+        """Validate prediction/projection input against fitted OPLS metadata."""
+        check_is_fitted(self)
+        return validate_data(
+            self,
+            X,
+            dtype=np.float64,
+            copy=self.copy,
+            reset=False,
+        )
 
-        Returns the filtered ``X`` and the orthogonal scores.
-        """
-        X = validate_data(self, X, reset=False, dtype=np.float64)
+    def _filter_validated(
+        self, X: NDArray[np.float64]
+    ) -> tuple[NDArray[np.float64], NDArray[np.float64]]:
+        """Filter an already validated dense array without checking names again."""
         Xs = apply_scaling(X, self.x_mean_, self.x_std_)
         # apply_orthogonal_filter returns both the filtered matrix for prediction
         # and the replayed orthogonal scores for transform_orthogonal().
         return apply_orthogonal_filter(
             Xs, self.x_ortho_weights_, self.x_ortho_loadings_
         )
 
+    def _filter(self, X: ArrayLike) -> tuple[NDArray[np.float64], NDArray[np.float64]]:
+        """Preprocess and orthogonal-filter new ``X`` exactly as at fit time.
+
+        Returns the filtered ``X`` and the orthogonal scores.
+        """
+        X_valid = self._validate_X_predict(X)
+        return self._filter_validated(X_valid)
+
     def __sklearn_tags__(self):
         tags = super().__sklearn_tags__()
         tags.regressor_tags.poor_score = True
diff --git a/src/scikit_opls/_opls_da.py b/src/scikit_opls/_opls_da.py
@@ -156,20 +156,25 @@ def fit(self, X: ArrayLike, y: ArrayLike) -> OPLSDA:
         self.n_orthogonal_ = self.opls_.n_orthogonal_
         return self
 
-    def _validate_x_predict(self, X: ArrayLike) -> NDArray[np.float64]:
+    def _validate_X_predict(self, X: ArrayLike) -> NDArray[np.float64]:  # noqa: N802
         """Validate prediction input against the outer OPLSDA fit contract."""
         check_is_fitted(self)
 
         # validate_data(..., reset=False) checks n_features_in_ and feature_names_in_
         # against OPLSDA, then returns a nameless ndarray. Passing that ndarray to
         # the inner OPLS avoids the spurious "fitted without feature names" warning.
-        return validate_data(
+        X_valid = validate_data(
             self,
             X,
             dtype=np.float64,
             copy=self.copy,
             reset=False,
         )
+        return np.asarray(X_valid, dtype=np.float64)
+
+    def _validate_x_predict(self, X: ArrayLike) -> NDArray[np.float64]:
+        """Backward-compatible alias for the canonical validation helper."""
+        return self._validate_X_predict(X)
 
     def decision_function(self, X: ArrayLike) -> NDArray[np.float64]:
         """Raw signed OPLS regression output; positive favours ``classes_[1]``.
@@ -185,8 +190,9 @@ def decision_function(self, X: ArrayLike) -> NDArray[np.float64]:
             Signed confidence; ``> 0`` predicts ``classes_[1]``. Scores equal to
             zero are assigned to ``classes_[0]`` by :meth:`predict`.
         """
-        X_valid = self._validate_x_predict(X)
-        return np.asarray(self.opls_.predict(X_valid), dtype=np.float64).ravel()
+        X_valid = self._validate_X_predict(X)
+        X_filtered, _ = self.opls_._filter_validated(X_valid)
+        return np.asarray(self.opls_.pls_.predict(X_filtered), dtype=np.float64).ravel()
 
     def predict(self, X: ArrayLike) -> NDArray:
         """Predict class labels.
diff --git a/src/scikit_opls/plotting.py b/src/scikit_opls/plotting.py
@@ -8,12 +8,13 @@ class with a :meth:`from_estimator` constructor that computes the plotted arrays
 imported lazily inside :meth:`plot`, so importing this module never requires it.
 """
 
-# check_array is under-typed (its dtype kwarg); suppress the resulting
-# static-checker false positives.
+# The sklearn validation helpers are under-typed; suppress false positives from
+# fittedness/metadata validation calls in this plotting adapter.
 # pyright: reportArgumentType=false
 
 from __future__ import annotations
 
+import warnings
 from numbers import Integral
 from typing import TYPE_CHECKING
 
@@ -22,7 +23,7 @@ class with a :meth:`from_estimator` constructor that computes the plotted arrays
 from scipy import sparse
 from sklearn.base import BaseEstimator
 from sklearn.pipeline import Pipeline
-from sklearn.utils.validation import check_array, check_is_fitted
+from sklearn.utils.validation import check_is_fitted
 
 from scikit_opls._opls import OPLS
 from scikit_opls._opls_da import OPLSDA
@@ -86,11 +87,23 @@ def _unwrap_estimator_and_data(
             X = upstream.transform(X)
         inner = final_estimator
 
+    if sparse.issparse(X):
+        raise TypeError(
+            "Input to OPLS plotting is sparse, but plotting requires a dense "
+            "matrix. If it came from a Pipeline, add a densifying transformer "
+            "before the final OPLS step."
+        )
+
     if isinstance(inner, OPLSDA):
         check_is_fitted(inner)
+        # Validate against the outer classifier first: it owns n_features_in_ and
+        # feature_names_in_ for user-facing OPLSDA calls.
+        X_checked = inner._validate_X_predict(X)
         # OPLSDA is a classifier wrapper; its latent space lives on the inner OPLS.
         base = inner.opls_
     elif isinstance(inner, OPLS):
+        check_is_fitted(inner)
+        X_checked = inner._validate_X_predict(X)
         base = inner
     else:
         raise TypeError(
@@ -103,13 +116,18 @@ def _unwrap_estimator_and_data(
     if not isinstance(base, OPLS):
         raise TypeError("estimator.opls_ must be a fitted OPLS instance.")
     check_is_fitted(base)
-    if sparse.issparse(X):
+    if sparse.issparse(X_checked):
         raise TypeError(
             "Input to OPLS plotting is sparse, but plotting requires a dense "
             "matrix. If it came from a Pipeline, add a densifying transformer "
             "before the final OPLS step."
         )
-    return base, check_array(X, dtype=np.float64, ensure_min_samples=ensure_min_samples)
+    if X_checked.shape[0] < ensure_min_samples:
+        raise ValueError(
+            f"Found array with {X_checked.shape[0]} sample(s), while a minimum "
+            f"of {ensure_min_samples} is required."
+        )
+    return base, X_checked
 
 
 class OPLSScoresDisplay:
@@ -237,7 +255,7 @@ def from_estimator(
 
         # Project supplied data through the fitted filter before asking the PLS
         # engine for predictive scores.
-        X_filtered, t_ortho = base._filter(X_trans)
+        X_filtered, t_ortho = base._filter_validated(X_trans)
         scores = base.pls_.transform(X_filtered)
         if isinstance(scores, tuple):
             t_pred_arr = scores[0]
@@ -353,6 +371,7 @@ def from_estimator(
         X: ArrayLike,
         *,
         component: int = 0,
+        x_space: str = "centered",
         ax: matplotlib.axes.Axes | None = None,
     ) -> SPlotDisplay:
         """Compute the S-plot arrays from a fitted ``estimator`` and plot them.
@@ -367,6 +386,11 @@ def from_estimator(
             Samples to project.
         component : int, default=0
             The index of the predictive PLS component to plot.
+        x_space : {"centered", "scaled", "subset-centered"}, default="centered"
+            Feature space for the covariance/correlation axes: ``"centered"``
+            subtracts the fitted training mean (original units), ``"scaled"`` is
+            the model-scaled space, ``"subset-centered"`` subtracts the mean of X.
+            The score is mean-centered, so both centered modes agree up to rounding.
         ax : matplotlib Axes, default=None
             Target axes; a new figure/axes is created when ``None``.
 
@@ -384,20 +408,42 @@ def from_estimator(
                 f"component={component} is out of bounds for estimator with "
                 f"{n_pred} predictive component(s)."
             )
+        if x_space not in {"centered", "scaled", "subset-centered"}:
+            raise ValueError(
+                "x_space must be one of {'centered', 'scaled', 'subset-centered'}."
+            )
+        if X_trans.shape[0] != base.x_scores_.shape[0]:
+            warnings.warn(
+                "SPlotDisplay is usually intended for the training data. "
+                "The provided X has a different number of samples than the fitted "
+                "data; covariance and correlation will be computed on this subset.",
+                UserWarning,
+                stacklevel=2,
+            )
 
-        # S-plots are computed in the final OPLS input space, after applying the
-        # same scaling used at fit time and centering the provided sample subset.
-        Xs = apply_scaling(X_trans, base.x_mean_, base.x_std_)
-        Xs = Xs - Xs.mean(axis=0)
+        # Scores always come from the fitted model preprocessing/filtering. The
+        # S-plot axes can use original-unit or model-scaled feature space.
+        if x_space == "centered":
+            X_for_splot = X_trans - base.x_mean_
+        elif x_space == "scaled":
+            X_for_splot = apply_scaling(X_trans, base.x_mean_, base.x_std_)
+        else:
+            X_for_splot = X_trans - X_trans.mean(axis=0)
 
         # Use the fitted predictive score for the selected component as the common
         # reference vector for both covariance and correlation.
-        t = np.asarray(base.transform(X_trans))[:, component]
+        X_filtered, _ = base._filter_validated(X_trans)
+        scores = base.pls_.transform(X_filtered)
+        if isinstance(scores, tuple):
+            t_arr = scores[0]
+        else:
+            t_arr = scores
+        t = np.asarray(t_arr)[:, component]
         t = t - t.mean()
         n = t.shape[0]
 
-        covariance = Xs.T @ t / max(n - 1, 1)
-        x_std = Xs.std(axis=0, ddof=1)
+        covariance = X_for_splot.T @ t / max(n - 1, 1)
+        x_std = X_for_splot.std(axis=0, ddof=1)
         t_std = float(t.std(ddof=1))
         if t_std <= 1e-12:
             raise ValueError("Predictive score has zero variance; S-plot is undefined.")
@@ -409,8 +455,6 @@ def from_estimator(
         correlation[valid] = covariance[valid] / denom[valid]
 
         if np.any(~valid):
-            import warnings
-
             warnings.warn(
                 "Some features have zero variance; their S-plot correlations are NaN.",
                 RuntimeWarning,
diff --git a/tests/test_opls_da.py b/tests/test_opls_da.py
@@ -220,7 +220,6 @@ def test_oplsda_dataframe_predict_has_no_feature_name_warning():
         clf.predict(X)
 
     messages = [str(w.message) for w in record]
-    print(messages)
     assert not any("feature names" in message for message in messages)
 
 
diff --git a/tests/test_plotting.py b/tests/test_plotting.py
@@ -2,6 +2,8 @@
 
 from __future__ import annotations
 
+import warnings
+
 import pytest
 
 # Skip this module if matplotlib is not installed
@@ -55,6 +57,40 @@ def test_scores_plot_classification():
     plt.close("all")
 
 
+def test_scores_display_oplsda_dataframe_validates_feature_names():
+    """Plotting checks OPLSDA feature names and emits no spurious name warning."""
+    pd = pytest.importorskip("pandas")
+    X, y = _classification_data(n_features=4)
+    df = pd.DataFrame(X, columns=["a", "b", "c", "d"])
+    model = OPLSDA(n_components=1, n_orthogonal=1).fit(df, y)
+
+    with warnings.catch_warnings(record=True) as record:
+        warnings.simplefilter("always")  # else the check below can pass vacuously
+        disp = OPLSScoresDisplay.from_estimator(model, df, y=y)
+
+    assert not any("feature names" in str(w.message) for w in record)
+    assert disp.t_predictive.shape == (df.shape[0],)
+
+    with pytest.raises(ValueError, match="feature names"):
+        OPLSScoresDisplay.from_estimator(model, df[["b", "a", "c", "d"]], y=y)
+    plt.close("all")
+
+
+def test_scores_display_replays_filter_for_new_samples():
+    X, y = _regression_data(seed=7)
+    model = OPLS(n_components=1, n_orthogonal=2).fit(X[:35], y[:35])
+    X_new = X[35:]
+
+    disp = OPLSScoresDisplay.from_estimator(model, X_new)
+
+    np.testing.assert_allclose(disp.t_predictive, model.transform(X_new)[:, 0])
+    np.testing.assert_allclose(
+        disp.t_orthogonal,
+        model.transform_orthogonal(X_new)[:, 0],
+    )
+    plt.close("all")
+
+
 def test_s_plot_regression_and_classification():
     X, y = _regression_data()
     disp1 = SPlotDisplay.from_estimator(
@@ -172,6 +208,38 @@ def test_splot_display_nan_correlation():
     assert not np.isnan(disp.correlation[1:]).any()
 
 
+def test_splot_display_x_space_controls_covariance_axis():
+    X, y = _regression_data(seed=8, n_features=4)
+    scale = np.array([1.0, 3.0, 10.0, 30.0])
+    X = X * scale
+    model = OPLS(n_components=1, n_orthogonal=1, scale="standard").fit(X, y)
+
+    centered = SPlotDisplay.from_estimator(model, X, x_space="centered")
+    scaled = SPlotDisplay.from_estimator(model, X, x_space="scaled")
+
+    assert not np.allclose(centered.covariance, scaled.covariance)
+    assert centered.correlation.shape == scaled.correlation.shape
+    plt.close("all")
+
+
+def test_splot_display_invalid_x_space_raises():
+    X, y = _regression_data()
+    model = OPLS().fit(X, y)
+
+    with pytest.raises(ValueError, match="x_space must be one of"):
+        SPlotDisplay.from_estimator(model, X, x_space="raw")
+
+
+def test_splot_display_warns_for_non_training_subset():
+    X, y = _regression_data(seed=9)
+    model = OPLS(n_components=1, n_orthogonal=1).fit(X, y)
+
+    with pytest.warns(UserWarning, match="usually intended for the training data"):
+        SPlotDisplay.from_estimator(model, X[:10])
+
+    plt.close("all")
+
+
 def test_plotting_pipeline_ending_in_opls():
     X, y = _regression_data()
     pipe = Pipeline(