Skip to content

Commit 0da2161

Browse files
authored
[FEAT] Pass a separate validation dataframe (#1498)
1 parent 9bc90c1 commit 0da2161

2 files changed

Lines changed: 85 additions & 3 deletions

File tree

neuralforecast/core.py

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -464,6 +464,7 @@ def fit(
464464
df: Optional[Union[DataFrame, SparkDataFrame, Sequence[str]]] = None,
465465
static_df: Optional[Union[DataFrame, SparkDataFrame]] = None,
466466
val_size: Optional[int] = 0,
467+
val_df: Optional[DataFrame] = None,
467468
use_init_models: bool = False,
468469
verbose: bool = False,
469470
id_col: str = "unique_id",
@@ -481,7 +482,11 @@ def fit(
481482
df (pandas, polars or spark DataFrame, or a list of parquet files containing the series, optional): DataFrame with columns [`unique_id`, `ds`, `y`] and exogenous variables.
482483
If None, a previously stored dataset is required.
483484
static_df (pandas, polars or spark DataFrame, optional): DataFrame with columns [`unique_id`] and static exogenous variables.
484-
val_size (int, optional): Size of validation set.
485+
val_size (int, optional): Size of validation set. Cannot be used together with `val_df`.
486+
val_df (pandas or polars DataFrame, optional): Explicit validation DataFrame with columns [`unique_id`, `ds`, `y`] and exogenous variables.
487+
`val_df` can be temporally independent (no requirement that it starts immediately after `df`).
488+
Cannot be used together with `val_size`. Only supported when `df` is a pandas or polars DataFrame.
489+
All series in `val_df` must have the same length.
485490
use_init_models (bool, optional): Use initial model passed when NeuralForecast object was instantiated.
486491
verbose (bool): Print processing steps.
487492
id_col (str): Column that identifies each series.
@@ -496,12 +501,22 @@ def fit(
496501
if (df is None) and not (hasattr(self, "dataset")):
497502
raise Exception("You must pass a DataFrame or have one stored.")
498503

504+
if val_df is not None and val_size != 0:
505+
raise ValueError(
506+
"val_df and val_size cannot be set together. "
507+
"Set val_size=0 (default) when providing val_df."
508+
)
509+
510+
if val_df is not None and not isinstance(val_df, (pd.DataFrame, pl_DataFrame)):
511+
raise ValueError("val_df must be a pandas or polars DataFrame.")
512+
499513
# Model and datasets interactions protections
500514
if (
501515
any(model.early_stop_patience_steps > 0 for model in self.models)
502516
and val_size == 0
517+
and val_df is None
503518
):
504-
raise Exception("Set val_size>0 if early stopping is enabled.")
519+
raise Exception("Set val_size>0 or provide a val_df if early stopping is enabled.")
505520

506521
if (val_size is not None) and (0 < val_size < self.h):
507522
raise ValueError(
@@ -578,6 +593,28 @@ def fit(
578593
f"`df` must be a pandas, polars or spark DataFrame, or a list of parquet files containing the series, or `None`, got: {type(df)}"
579594
)
580595

596+
if val_df is not None:
597+
if isinstance(df, (SparkDataFrame,)) or (
598+
isinstance(df, Sequence) and not isinstance(df, str)
599+
):
600+
raise ValueError(
601+
"val_df is only supported when df is a pandas or polars DataFrame."
602+
)
603+
val_dataset = self.dataset.align(
604+
val_df, id_col=id_col, time_col=time_col, target_col=target_col
605+
)
606+
if val_dataset.min_size != val_dataset.max_size:
607+
raise ValueError(
608+
"All series in val_df must be of equal length. "
609+
"Found series lengths ranging from "
610+
f"{val_dataset.min_size} to {val_dataset.max_size}"
611+
)
612+
val_size = val_dataset.min_size
613+
self.dataset = self.dataset.append(val_dataset)
614+
_, _, self.last_dates, _ = TimeSeriesDataset.from_df(
615+
df=val_df, id_col=id_col, time_col=time_col, target_col=target_col
616+
)
617+
581618
if val_size is not None:
582619
if self.dataset.min_size < val_size:
583620
warnings.warn(

tests/test_core.py

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,7 @@ def test_neural_forecast_early_stopping(setup_airplane_data):
139139
AirPassengersPanel_train, _ = setup_airplane_data
140140
models = [NHITS(h=12, input_size=12, max_steps=1, early_stop_patience_steps=5)]
141141
nf = NeuralForecast(models=models, freq="M")
142-
with pytest.raises(Exception, match="Set val_size>0 if early stopping is enabled."):
142+
with pytest.raises(Exception, match="Set val_size>0 or provide a val_df if early stopping is enabled."):
143143
nf.fit(AirPassengersPanel_train)
144144

145145

@@ -1336,6 +1336,51 @@ def test_order_of_variables_no_effect_on_val_loss(setup_airplane_data, scaler_ty
13361336
assert valid_losses[-1][1] > 10, "Validation loss is too low"
13371337

13381338

1339+
def test_val_df_parameter_validation(setup_airplane_data):
1340+
AirPassengersPanel_train, _ = setup_airplane_data
1341+
nf = NeuralForecast(
1342+
models=[NHITS(h=12, input_size=24, max_steps=1)], freq="M"
1343+
)
1344+
val_df = (
1345+
AirPassengersPanel_train.groupby("unique_id", observed=True)
1346+
.tail(12)
1347+
.reset_index(drop=True)
1348+
)
1349+
with pytest.raises(ValueError, match="val_df and val_size cannot be set together"):
1350+
nf.fit(AirPassengersPanel_train, val_size=12, val_df=val_df)
1351+
1352+
1353+
def test_val_df_equivalence_with_val_size(setup_airplane_data):
1354+
# Splitting off the last 12 rows per series as val_df and passing them
1355+
# explicitly must produce the same valid_trajectories as using val_size=12
1356+
# on the full training DataFrame (same combined dataset, same random seed).
1357+
AirPassengersPanel_train, _ = setup_airplane_data
1358+
val_size = 12
1359+
1360+
train_df = (
1361+
AirPassengersPanel_train.groupby("unique_id", observed=True)
1362+
.apply(lambda x: x.iloc[:-val_size])
1363+
.reset_index(drop=True)
1364+
)
1365+
val_df = (
1366+
AirPassengersPanel_train.groupby("unique_id", observed=True)
1367+
.tail(val_size)
1368+
.reset_index(drop=True)
1369+
)
1370+
1371+
model_kwargs = dict(h=12, input_size=24, max_steps=10, random_seed=42)
1372+
1373+
nf_val_size = NeuralForecast(models=[NHITS(**model_kwargs)], freq="M")
1374+
nf_val_size.fit(AirPassengersPanel_train, val_size=val_size)
1375+
1376+
nf_val_df = NeuralForecast(models=[NHITS(**model_kwargs)], freq="M")
1377+
nf_val_df.fit(train_df, val_df=val_df)
1378+
1379+
losses_val_size = nf_val_size.models[0].valid_trajectories
1380+
losses_val_df = nf_val_df.models[0].valid_trajectories
1381+
1382+
np.testing.assert_allclose(losses_val_size, losses_val_df, atol=1e-4)
1383+
13391384

13401385
@pytest.mark.parametrize("model,expected_error", [
13411386
(NHITS(h=12, input_size=24, max_steps=50, hist_exog_list=["not_included"], scaler_type="robust"),

0 commit comments

Comments
 (0)