Tags: python, scikit-learn, pytorch, skorch

Why does skorch show NaN in every epoch?


I want to create my own dataset class based on skorch's Dataset class because I want to distinguish categorical columns from continuous columns. The categorical columns will be passed through embedding layers in the model. The result is weird because it shows nan in every epoch, like this:

  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1           nan           nan  0.2187
      2           nan           nan  0.1719
      3           nan           nan  0.1719
      4           nan           nan  0.1562
      5           nan           nan  0.1406

Can you help me fix it? I am using the data from this Kaggle competition: Here

from skorch import NeuralNetRegressor
from skorch.dataset import Dataset
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder


class TabularDataset(Dataset):
    def __init__(self, data, cat_cols=None, output_col=None):
        self.n = data.shape[0]

        if output_col:
            self.y = data[output_col].astype(np.float32).values.reshape(-1, 1)
        else:
            self.y = np.zeros((self.n, 1))

        self.cat_cols = cat_cols if cat_cols else []
        self.cont_cols = [col for col in data.columns
                          if col not in self.cat_cols + [output_col]]

        if self.cont_cols:
            self.cont_X = data[self.cont_cols].astype(np.float32).values
        else:
            self.cont_X = np.zeros((self.n, 1))

        if self.cat_cols:
            self.cat_X = data[self.cat_cols].astype(np.int64).values
        else:
            self.cat_X = np.zeros((self.n, 1))

    def __len__(self):
        # Denotes the total number of samples
        return self.n

    def __getitem__(self, idx):
        # generates one sample of data
        return [self.cont_X[idx], self.cat_X[idx]], self.y[idx]


class FeedForwardNN(nn.Module):

    def __init__(self, emb_dims, no_of_cont, lin_layer_sizes,
                 output_size, emb_dropout, lin_layer_dropouts):

        """
        Parameters
        ----------
        emb_dims: List of two element tuples
          This list will contain a two element tuple for each
          categorical feature. The first element of a tuple will
          denote the number of unique values of the categorical
          feature. The second element will denote the embedding
          dimension to be used for that feature.
        no_of_cont: Integer
          The number of continuous features in the data.
        lin_layer_sizes: List of integers.
          The size of each linear layer. The length will be equal
          to the total number
          of linear layers in the network.
        output_size: Integer
          The size of the final output.
        emb_dropout: Float
          The dropout to be used after the embedding layers.
        lin_layer_dropouts: List of floats
          The dropouts to be used after each linear layer.
        """

        super().__init__()

        # Embedding layers
        self.emb_layers = nn.ModuleList([nn.Embedding(x, y)
                                         for x, y in emb_dims])

        no_of_embs = sum([y for x, y in emb_dims])
        self.no_of_embs = no_of_embs
        self.no_of_cont = no_of_cont

        # Linear Layers
        first_lin_layer = nn.Linear(self.no_of_embs + self.no_of_cont,
                                    lin_layer_sizes[0])

        self.lin_layers = \
            nn.ModuleList([first_lin_layer] + \
                          [nn.Linear(lin_layer_sizes[i], lin_layer_sizes[i + 1])
                           for i in range(len(lin_layer_sizes) - 1)])

        for lin_layer in self.lin_layers:
            nn.init.kaiming_normal_(lin_layer.weight.data)

        # Output Layer
        self.output_layer = nn.Linear(lin_layer_sizes[-1],
                                      output_size)
        nn.init.kaiming_normal_(self.output_layer.weight.data)

        # Batch Norm Layers
        self.first_bn_layer = nn.BatchNorm1d(self.no_of_cont)
        self.bn_layers = nn.ModuleList([nn.BatchNorm1d(size)
                                        for size in lin_layer_sizes])

        # Dropout Layers
        self.emb_dropout_layer = nn.Dropout(emb_dropout)
        self.dropout_layers = nn.ModuleList([nn.Dropout(p)
                                             for p in lin_layer_dropouts])

    def forward(self, X):
        # X arrives as the [cont_X, cat_X] pair produced by
        # TabularDataset.__getitem__
        cont_data = X[0]
        cat_data = X[1]
        if self.no_of_embs != 0:
            x = [emb_layer(cat_data[:, i])
                 for i, emb_layer in enumerate(self.emb_layers)]
            x = torch.cat(x, 1)
            x = self.emb_dropout_layer(x)

        if self.no_of_cont != 0:
            normalized_cont_data = self.first_bn_layer(cont_data)

            if self.no_of_embs != 0:
                x = torch.cat([x, normalized_cont_data], 1)
            else:
                x = normalized_cont_data

        for lin_layer, dropout_layer, bn_layer in \
                zip(self.lin_layers, self.dropout_layers, self.bn_layers):
            x = F.relu(lin_layer(x))
            x = bn_layer(x)
            x = dropout_layer(x)

        x = self.output_layer(x)

        return x


# Read data
data = pd.read_csv("data/train.csv", usecols=["SalePrice", "MSSubClass", "MSZoning", "LotFrontage", "LotArea",
                                              "Street", "YearBuilt", "LotShape", "1stFlrSF", "2ndFlrSF"]).dropna()

categorical_features = ["MSSubClass", "MSZoning", "Street", "LotShape", "YearBuilt"]
output_feature = "SalePrice"

# Label encode categorical features
label_encoders = {}
for cat_col in categorical_features:
    label_encoders[cat_col] = LabelEncoder()
    data[cat_col] = label_encoders[cat_col].fit_transform(data[cat_col])

# Feed-forward NN
cat_dims = [int(data[col].nunique()) for col in categorical_features]

# (cardinality, embedding size) pairs; e.g. a column with 5 unique values
# gets an embedding of size min(50, (5 + 1) // 2) = 3
emb_dims = [(x, min(50, (x + 1) // 2)) for x in cat_dims]


net = FeedForwardNN(emb_dims, no_of_cont=4, lin_layer_sizes=[50, 100],
                    output_size=1, emb_dropout=0.04,
                    lin_layer_dropouts=[0.001, 0.01])

# Fit
ds = TabularDataset(data=data, cat_cols=categorical_features,
                    output_col=output_feature)
X = data.drop(['SalePrice'], axis=1)
y = data['SalePrice'].values.reshape(-1, 1)
net = NeuralNetRegressor(
    net,
    max_epochs=5,
    lr=0.1,
    dataset=ds
)
net.fit(X, y)

Solution

  • The problem is not with skorch but with your data. You have to scale your inputs and, in this case, especially the targets to avoid huge losses and exploding gradients. As a start I suggest using, for example, sklearn.preprocessing.StandardScaler:

    from sklearn.preprocessing import StandardScaler
    
    class TabularDataset(Dataset):
        def __init__(self, data, cat_cols=None, output_col=None):
            self.n = data.shape[0]
            # [...]
            if output_col:
                scaler_y = StandardScaler()
                self.y = data[output_col].astype(np.float32).values.reshape(-1, 1)
    
                scaler_y.fit(self.y)
                self.y = scaler_y.transform(self.y)
            # [...]
            if self.cont_cols:
                scaler_X_cont = StandardScaler()
                self.cont_X = data[self.cont_cols].astype(np.float32).values
                scaler_X_cont.fit(self.cont_X)
                self.cont_X = scaler_X_cont.transform(self.cont_X)
            # [...]
    

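    To see why the unscaled targets blow things up, look at their magnitude: sale prices in this dataset are on the order of 1e5, so the initial MSE loss of an untrained network is on the order of 1e10, and with lr=0.1 the first optimizer steps overshoot until the weights, and then the loss, overflow to nan. A quick sanity check (a sketch, using the data DataFrame loaded above):

    import numpy as np

    y_raw = data["SalePrice"].astype(np.float32).values
    print(y_raw.mean())         # on the order of 1e5 for this dataset
    print((y_raw ** 2).mean())  # rough scale of the initial MSE loss, ~1e10
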
    As a side note: when you have a dataset that provides the actual data, you don't need X and y; you can simply pass the dataset to net.fit (the exception being a stratified CV split). Note also the much lower learning rate, which further protects against exploding gradients:

    net = NeuralNetRegressor(
        net,
        max_epochs=5,
        lr=0.00001,
    )
    net.fit(ds, y=None)
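
    Finally, because the targets are standardized inside the dataset, net.predict returns values in standardized units. To get sale prices back you need the fitted target scaler, so keep a reference to it, e.g. self.scaler_y = scaler_y inside __init__ (an attribute name I'm introducing here; a minimal sketch):

    # assumes TabularDataset stored the fitted scaler as self.scaler_y
    y_pred_std = net.predict(ds)                        # standardized units
    y_pred = ds.scaler_y.inverse_transform(y_pred_std)  # back to sale prices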