python, machine-learning, pytorch, neural-network

How to use the dataset that comes from PyTorch random_split()?


I'm new to PyTorch and this is my first project. I need to split the dataset and feed the training set to the model. The training dataset must be split into features and labels (which I have failed to do). Here is what I have tried so far; however, I don't know how to feed the dataset obtained from random_split() to the model.

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import SGD
import matplotlib.pyplot as plt
import seaborn as sns
from dataset import DataSet


class NeuralNetwork(nn.Module):
    input_dim = 10
    hidden_dim = 4
    output_dim = 1

    def __init__(self, dataset):
        super().__init__()

        self.layers = [
            nn.Linear(self.input_dim, self.hidden_dim),
            nn.Linear(self.hidden_dim, self.output_dim)
        ]
        self.train_dataset = dataset["train_dataset"]
        self.test_dataset = dataset["test_dataset"]
        self.layers = nn.ModuleList(self.layers)

    def forward(self, x):
        for layer in self.layers:
            x = nn.functional.rrelu(layer(x))


dataset = DataSet()
model = NeuralNetwork(dataset)
model(dataset["train_dataset"])

and this is dataset.py

import pandas as pd
import torch
from torch.utils.data import DataLoader


class DataSet:
    divide_rate = 0.8
    file = './pima-indians-diabetes.csv'

    def __init__(self):
        data_set = pd.read_csv(self.file)
        train_size = int(self.divide_rate * len(data_set))
        test_size = len(data_set) - train_size
        self.train_dataset, self.test_dataset = torch.utils.data.random_split(data_set, [train_size, test_size])
        self.train_dataset = torch.utils.data.DataLoader(self.train_dataset, shuffle=True)
        self.test_dataset = torch.utils.data.DataLoader(self.test_dataset, shuffle=True)

    def __getitem__(self, key):
        return getattr(self, key)

The error is

TypeError: linear(): argument 'input' (position 1) must be Tensor, not DataLoader


Solution

  • I assume the problem lies with your DataSet class; please replace it with the following function:

    from torch.utils.data import DataLoader, random_split

    def load_data(test_split, batch_size):
        """Loads the data and returns train/test DataLoaders."""
        # SonarDataset is a custom torch.utils.data.Dataset; substitute your
        # own Dataset class here.
        sonar_dataset = SonarDataset('./sonar.all-data')

        # Compute the sizes of the two splits
        dataset_size = len(sonar_dataset)
        test_size = int(test_split * dataset_size)
        train_size = dataset_size - test_size

        # random_split returns two Subset objects
        train_dataset, test_dataset = random_split(sonar_dataset,
                                                   [train_size, test_size])

        # Wrap the Subsets themselves (not the full underlying .dataset,
        # which would undo the split) in DataLoaders
        train_loader = DataLoader(
            train_dataset,
            batch_size=batch_size,
            shuffle=True)
        test_loader = DataLoader(
            test_dataset,
            batch_size=batch_size,
            shuffle=True)

        return train_loader, test_loader
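
For completeness, here is a minimal sketch of how the pieces fit together once the loaders exist. It is not part of the original answer: the PimaDataset class, the file and column layout (last CSV column treated as the label), and all hyperparameters are assumptions you may need to adjust. The point it illustrates is that each item of the Dataset is already a (features, label) pair of tensors, and that the model is fed the batch tensors yielded by the DataLoader, never the DataLoader object itself.

import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split


class PimaDataset(Dataset):
    """Hypothetical Dataset: last CSV column is the label, the rest are features."""

    def __init__(self, csv_file):
        frame = pd.read_csv(csv_file, header=None)
        self.features = torch.tensor(frame.iloc[:, :-1].values, dtype=torch.float32)
        self.labels = torch.tensor(frame.iloc[:, -1].values, dtype=torch.float32).unsqueeze(1)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        # Each item is a (features, label) pair of tensors
        return self.features[idx], self.labels[idx]


dataset = PimaDataset('./pima-indians-diabetes.csv')
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# A small stand-in model; the input size must match the number of feature columns
n_features = dataset.features.shape[1]
model = nn.Sequential(
    nn.Linear(n_features, 4),
    nn.ReLU(),
    nn.Linear(4, 1),
)

criterion = nn.BCEWithLogitsLoss()                        # assumed loss for a binary label
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)  # assumed optimizer/learning rate

for epoch in range(10):                       # assumed number of epochs
    for features, labels in train_loader:     # iterate over batches, not the loader itself
        optimizer.zero_grad()
        outputs = model(features)             # the model receives a Tensor batch
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

This also explains the original TypeError: model(dataset["train_dataset"]) passed a DataLoader straight into nn.Linear, which only accepts tensors. The loader has to be iterated and its batches passed to the model (and note that the forward() method in the question also needs to return x).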