[FIX] Use the common MLP as decoder in DeepAR (#1472)

marcopeix · web-flow · commit b4b3432230e2 · 2026-03-05T16:10:35.000-05:00
diff --git a/neuralforecast/common/_modules.py b/neuralforecast/common/_modules.py
@@ -41,63 +41,72 @@
 
 
 class MLP(nn.Module):  
-    """Multi-Layer Perceptron for time series forecasting.  
+    """Multi-Layer Perceptron for time series forecasting.
 
-    A feedforward neural network with configurable depth and width. The network  
-    consists of an input layer, multiple hidden layers with activation functions  
-    and dropout, and an output layer. All hidden layers have the same dimensionality.  
+    A feedforward neural network with configurable depth and width. The network
+    consists of an input layer, multiple hidden layers with activation functions
+    and dropout, and an output layer. All hidden layers have the same dimensionality.
 
-    Args:  
-        in_features (int): Dimension of input features.  
-        out_features (int): Dimension of output features.  
-        activation (str): Activation function name. Must be one of the supported  
-            activations in ACTIVATIONS list (e.g., 'ReLU', 'Tanh', 'GELU', 'ELU').  
-        hidden_size (int): Number of units in each hidden layer. All hidden layers  
-            share the same dimensionality.  
-        num_layers (int): Total number of layers including input and output layers.  
-            Must be at least 2. For example, num_layers=3 creates: input layer,  
-            one hidden layer, and output layer.  
-        dropout (float): Dropout probability applied after each hidden layer's  
+    Args:
+        in_features (int): Dimension of input features.
+        out_features (int): Dimension of output features.
+        activation (str): Activation function name. Must be one of the supported
+            activations in ACTIVATIONS list (e.g., 'ReLU', 'Tanh', 'GELU', 'ELU').
+            Ignored when num_layers=1.
+        hidden_size (int): Number of units in each hidden layer. All hidden layers
+            share the same dimensionality. Ignored when num_layers=1.
+        num_layers (int): Total number of layers including input and output layers.
+            num_layers=1 gives a direct linear projection with no hidden layers or
+            activation. For num_layers>=2: an input layer and (num_layers-2) hidden
+            layers, each followed by activation and dropout, then an output layer.
+        dropout (float): Dropout probability applied after each hidden layer's
             activation. Should be in range [0.0, 1.0]. Not applied to output layer.
+            Ignored when num_layers=1.
 
     Returns:
         (torch.Tensor): Transformed output tensor of shape [..., out_features].
 
-    Notes:  
-        - The activation function is applied after each hidden layer's linear  
-          transformation, but not after the final output layer.  
-        - Dropout is applied after activation in hidden layers for regularization.  
-        - This MLP is used as a decoder component in various forecasting models  
-          including RNN, LSTM, GRU, DilatedRNN, TCN, and xLSTM.  
+    Notes:
+        - The activation function is applied after each hidden layer's linear
+          transformation, but not after the final output layer.
+        - Dropout is applied after activation in hidden layers for regularization.
+        - This MLP is used as a decoder component in various forecasting models
+          including RNN, LSTM, GRU, DilatedRNN, TCN, xLSTM, and DeepAR.
     """
 
     def __init__(
         self, in_features, out_features, activation, hidden_size, num_layers, dropout
     ):
         super().__init__()
-        assert activation in ACTIVATIONS, f"{activation} is not in {ACTIVATIONS}"
 
-        self.activation = getattr(nn, activation)()
+        if num_layers == 1:
+            # Direct linear projection with no hidden layers or activation
+            self.layers = nn.Sequential(
+                nn.Linear(in_features=in_features, out_features=out_features)
+            )
+        else:
+            assert activation in ACTIVATIONS, f"{activation} is not in {ACTIVATIONS}"
+            self.activation = getattr(nn, activation)()
 
-        # MultiLayer Perceptron
-        # Input layer
-        layers = [
-            nn.Linear(in_features=in_features, out_features=hidden_size),
-            self.activation,
-            nn.Dropout(dropout),
-        ]
-        # Hidden layers
-        for i in range(num_layers - 2):
-            layers += [
-                nn.Linear(in_features=hidden_size, out_features=hidden_size),
+            # MultiLayer Perceptron
+            # Input layer
+            layers = [
+                nn.Linear(in_features=in_features, out_features=hidden_size),
                 self.activation,
                 nn.Dropout(dropout),
             ]
-        # Output layer
-        layers += [nn.Linear(in_features=hidden_size, out_features=out_features)]
-
-        # Store in layers as ModuleList
-        self.layers = nn.Sequential(*layers)
+            # Hidden layers
+            for i in range(num_layers - 2):
+                layers += [
+                    nn.Linear(in_features=hidden_size, out_features=hidden_size),
+                    self.activation,
+                    nn.Dropout(dropout),
+                ]
+            # Output layer
+            layers += [nn.Linear(in_features=hidden_size, out_features=out_features)]
+
+            # Chain the layers into a single nn.Sequential container
+            self.layers = nn.Sequential(*layers)
 
     def forward(self, x):
         return self.layers(x)
diff --git a/neuralforecast/models/deepar.py b/neuralforecast/models/deepar.py
@@ -1,7 +1,7 @@
 
 
 
-__all__ = ['Decoder', 'DeepAR']
+__all__ = ['DeepAR']
 
 
 from typing import Optional
@@ -10,47 +10,10 @@
 import torch.nn as nn
 
 from ..common._base_model import BaseModel
+from ..common._modules import MLP
 from ..losses.pytorch import MAE, DistributionLoss
 
 
-class Decoder(nn.Module):
-    """Multi-Layer Perceptron Decoder
-
-    Args:
-        in_features (int): dimension of input.
-        out_features (int): dimension of output.
-        hidden_size (int): dimension of hidden layers.
-        hidden_layers (int): number of hidden layers.
-    """
-
-    def __init__(self, in_features, out_features, hidden_size, hidden_layers):
-        super().__init__()
-
-        if hidden_layers == 0:
-            # Input layer
-            layers = [nn.Linear(in_features=in_features, out_features=out_features)]
-        else:
-            # Input layer
-            layers = [
-                nn.Linear(in_features=in_features, out_features=hidden_size),
-                nn.ReLU(),
-            ]
-            # Hidden layers
-            for i in range(hidden_layers - 2):
-                layers += [
-                    nn.Linear(in_features=hidden_size, out_features=hidden_size),
-                    nn.ReLU(),
-                ]
-            # Output layer
-            layers += [nn.Linear(in_features=hidden_size, out_features=out_features)]
-
-        # Store in layers as ModuleList
-        self.layers = nn.Sequential(*layers)
-
-    def forward(self, x):
-        return self.layers(x)
-
-
 class DeepAR(BaseModel):
     """DeepAR
 
@@ -209,11 +172,13 @@ def __init__(
         )
 
         # Decoder MLP
-        self.decoder = Decoder(
+        self.decoder = MLP(
             in_features=lstm_hidden_size,
             out_features=self.loss.outputsize_multiplier,
             hidden_size=decoder_hidden_size,
-            hidden_layers=decoder_hidden_layers,
+            num_layers=decoder_hidden_layers + 1,
+            activation="ReLU",
+            dropout=0.0,
         )
 
     def forward(self, windows_batch):
diff --git a/neuralforecast/models/xlstm.py b/neuralforecast/models/xlstm.py
@@ -96,7 +96,7 @@ def __init__(
         encoder_bias: bool = True,
         encoder_dropout: float = 0.1,
         decoder_hidden_size: int = 128,
-        decoder_layers: int = 1,
+        decoder_layers: int = 2,
         decoder_dropout: float = 0.0,
         decoder_activation: str = "GELU",
         backbone: str = "mLSTM",