Search code examples
python-3.xdeep-learningwindows-subsystem-for-linux

How can I prevent WSL from freezing when running my model?


I am trying to train a neural network on my computer using WSL. First I tried using a Jupyter kernel to run the script line by line, and it trained for 10 epochs before WSL crashed inside VS Code and got stuck in an endless reconnecting loop. Afterwards I tried running the script from the terminal, and it has been stuck as shown in the pictures below (more than 1 hour at the time of writing). Is there a way to prevent this? Thank you! Performance View Process View

EDIT FROM HERE: This is my code as I am trying to run it. My goal is to implement Wave-U-Net with the MSE between the model's output and the target as the loss.

import os
import torch
import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader,random_split
import torchaudio
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import matplotlib.pyplot as plt


# Root folder of the downloaded audio files (WSL mount of the Windows drive).
PARENT_FOLDER = "/mnt/c/Users/Tudor/Documents/yt-dlp"

# NOTE(review): SCALER is never used in the code shown here — possibly a
# leftover from an earlier normalization step; confirm before removing.
SCALER = MinMaxScaler()

class DownSamplingLayer(nn.Module):
    """Encoder conv block: Conv1d -> BatchNorm1d -> LeakyReLU, then dropout.

    Despite the name, no temporal downsampling happens inside this block —
    the caller pools the output afterwards.  With the default "same"
    padding the time length is preserved.
    """

    def __init__(self, channel_in, channel_out, dilation=1, kernel_size=9, stride=1, padding="same"):
        super(DownSamplingLayer, self).__init__()
        conv = nn.Conv1d(
            channel_in,
            channel_out,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
        )
        # Attribute names `main`/`dropout` are kept so state_dict keys match
        # existing checkpoints.
        self.main = nn.Sequential(
            conv,
            nn.BatchNorm1d(channel_out),
            nn.LeakyReLU(negative_slope=0.1, inplace=True),
        )
        self.dropout = nn.Dropout(p=0.3)

    def forward(self, x):
        features = self.main(x)
        return self.dropout(features)

class UpSamplingLayer(nn.Module):
    """Decoder conv block: Conv1d -> BatchNorm1d -> LeakyReLU.

    The actual temporal upsampling (interpolation) is done by the caller;
    this block only mixes channels, and with "same" padding the time
    length is preserved.
    """

    def __init__(self, channel_in, channel_out, kernel_size=9, stride=1, padding="same"):
        super(UpSamplingLayer, self).__init__()
        conv = nn.Conv1d(
            channel_in,
            channel_out,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
        )
        # Attribute name `main` is kept so state_dict keys stay compatible.
        self.main = nn.Sequential(
            conv,
            nn.BatchNorm1d(channel_out),
            nn.LeakyReLU(negative_slope=0.1, inplace=True),
        )

    def forward(self, x):
        out = self.main(x)
        return out
    

class Model(nn.Module):
    """Wave-U-Net-style 1D U-Net operating directly on raw waveforms.

    The encoder repeatedly applies a conv block and halves the time axis
    with max pooling; the decoder doubles it back with linear interpolation,
    concatenating the matching encoder activation (skip connection) before
    each conv block.  A final 1x1 conv maps the concatenation of the last
    decoder output and the raw input back to a single channel.

    NOTE(review): the input time length must be divisible by 2**n_layers,
    otherwise the upsampled tensors will not line up with the stored skip
    connections in the concatenations below — confirm the data pipeline
    guarantees this.
    """

    def __init__(self, n_layers=8, channels_interval=16):
        super(Model, self).__init__()
        self.n_layers = n_layers
        self.channels_interval = channels_interval

        # Encoder channels grow linearly: 1 -> ci -> 2*ci -> ... -> n_layers*ci.
        encoder_in_channels_list = [1] + [i * self.channels_interval for i in range(1, self.n_layers)]
        encoder_out_channels_list = [i * self.channels_interval for i in range(1, self.n_layers + 1)]

        self.encoder = nn.ModuleList()
        for i in range(self.n_layers):
            self.encoder.append(
                DownSamplingLayer(
                    channel_in=encoder_in_channels_list[i],
                    channel_out=encoder_out_channels_list[i]
                )
            )

        # Bottleneck block at the coarsest resolution (channel count unchanged).
        self.middle = nn.Sequential(
            nn.Conv1d(self.n_layers * self.channels_interval, self.n_layers * self.channels_interval, kernel_size=3, stride=1,
                      padding="same"),
            nn.BatchNorm1d(self.n_layers * self.channels_interval),
            nn.LeakyReLU(negative_slope=0.1, inplace=True)
        )

        # Decoder input channels account for the concatenated skip connection:
        # each level receives (upsampled decoder output + encoder skip) channels.
        decoder_in_channels_list = [(2 * i + 1) * self.channels_interval for i in range(1, self.n_layers)] + [
            2 * self.n_layers * self.channels_interval]
        decoder_in_channels_list = decoder_in_channels_list[::-1]
        decoder_out_channels_list = encoder_out_channels_list[::-1]

        self.decoder = nn.ModuleList()
        for i in range(self.n_layers):
            self.decoder.append(
                UpSamplingLayer(
                    channel_in=decoder_in_channels_list[i],
                    channel_out=decoder_out_channels_list[i]
                )
            )

        # Output head: 1x1 conv over the (last decoder output + raw input)
        # channels, hence 1 + channels_interval input channels.
        self.out = nn.Sequential(
            nn.Conv1d(1+self.channels_interval, 1, kernel_size=1, stride=1),
            nn.LeakyReLU(negative_slope=0.1, inplace=True)
        )

        # Initialize the weights
        self.initialize_weights()

    def initialize_weights(self):
        # Xavier-uniform for conv weights, zero biases, and identity-like
        # affine parameters (weight=1, bias=0) for batch-norm layers.
        for m in self.modules():
            if isinstance(m, nn.Conv1d) or isinstance(m, nn.ConvTranspose1d):
                init.xavier_uniform_(m.weight, gain=1.0)
                if m.bias is not None:
                    init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm1d):
                init.constant_(m.weight, 1)
                init.constant_(m.bias, 0)

    def forward(self, x):
        tmp = []
        o = x
        # Encoder (downsampling) path: conv block, remember the activation
        # as a skip connection, then halve the time axis.
        for i in range(self.n_layers):
            o = self.encoder[i](o)
            tmp.append(o)
            o = F.max_pool1d(o, kernel_size=2, stride=2)

        o = self.middle(o)

        # Decoder (upsampling) path: double the time axis, concatenate the
        # mirrored encoder skip along channels, then apply the conv block.
        for i in range(self.n_layers):
            o = F.interpolate(o, scale_factor=2, mode="linear", align_corners=True)
            o = torch.cat((o, tmp[self.n_layers - i - 1]), dim=1)
            o = self.decoder[i](o)
        # Final skip from the raw input before the 1x1 output conv.
        o = torch.cat((o, x), dim=1)
        o = self.out(o)
        return o
    

# Load pre-extracted waveform windows from disk.  Presumably "input" is the
# degraded signal and "target" the clean reference — TODO confirm against the
# extraction script that produced these .npy files.
input_samples, target_samples = np.load("input_samples.npy"), np.load("target_samples.npy")
# NOTE(review): converting the arrays to Python lists only to wrap them in a
# DataFrame (and back to tensors in the Dataset below) is memory-heavy;
# indexing the numpy arrays directly would be much cheaper.
input_samples =input_samples.tolist()
target_samples =target_samples.tolist()

df = pd.DataFrame({"input":input_samples,"target":target_samples})
class BassenhanceDataset(Dataset):
    """Dataset of (input, target) waveform pairs stored as list columns of a
    DataFrame.

    Each row holds one fixed-length excerpt; __getitem__ converts the two
    lists to float32 tensors (transposed, matching the original behavior).
    """

    def __init__(self, df):
        self.df = df
        self.input = df["input"]
        self.target = df["target"]

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # .iloc: positional indexing, robust to DataFrames whose index is not
        # a clean RangeIndex (e.g. after filtering).
        x = torch.tensor(self.input.iloc[idx], dtype=torch.float32).T
        y = torch.tensor(self.target.iloc[idx], dtype=torch.float32).T
        return x, y

    def get_loader(self, batch_size, shuffle=True, num_workers=0):
        # BUG FIX: the original passed collate_fn=self.collate_fn, but no
        # collate_fn is defined on this class, so calling this method raised
        # AttributeError.  The default collate stacks equal-length tensors,
        # which matches what __getitem__ returns.
        return DataLoader(self, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)

    def transpose(self, data):
        # Swap the channel and time axes of a batched tensor.
        return data.transpose(1, 2)

    def get_loader_transpose(self, batch_size, shuffle=True, num_workers=0):
        # Same collate_fn fix as get_loader; drop_last keeps every batch full.
        return DataLoader(self, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers, drop_last=True)

    def split(self, train_size=0.8, shuffle=True):
        # Deterministic train/val split (fixed seed).  `shuffle` is accepted
        # for interface compatibility but unused, as in the original.
        n_train = int(len(self) * train_size)
        return random_split(self, [n_train, len(self) - n_train], generator=torch.Generator().manual_seed(42))


def train_epoch(model, train_loader, optimizer, criterion, device):
    """Run one optimization pass over train_loader; return the mean batch loss."""
    model.train()
    total_loss = 0.0
    for batch_in, batch_tgt in train_loader:
        batch_in = batch_in.to(device)
        batch_tgt = batch_tgt.to(device)
        optimizer.zero_grad()
        loss = criterion(model(batch_in), batch_tgt)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Average of per-batch losses (not sample-weighted).
    return total_loss / len(train_loader)

def validate_epoch(model, val_loader, criterion, device):
    """Evaluate model over val_loader without gradients; return the mean batch loss."""
    model.eval()
    accumulated = 0.0
    with torch.no_grad():
        for features, labels in val_loader:
            features, labels = features.to(device), labels.to(device)
            accumulated += criterion(model(features), labels).item()
    # Average of per-batch losses (not sample-weighted).
    return accumulated / len(val_loader)

def train(model, train_loader, val_loader, optimizer, criterion, device, epochs=10):
    """Train for up to `epochs` epochs, checkpointing via save_state and
    stopping early when early_stopping fires.

    Returns:
        (train_losses, val_losses): per-epoch mean losses recorded so far.
    """
    history_train = []
    history_val = []
    for epoch_no in range(1, epochs + 1):
        epoch_train = train_epoch(model, train_loader, optimizer, criterion, device)
        epoch_val = validate_epoch(model, val_loader, criterion, device)
        history_train.append(epoch_train)
        history_val.append(epoch_val)
        print(f"Epoch {epoch_no} | Train Loss: {epoch_train:.10f} | Val Loss: {epoch_val:.10f}")
        save_state(model, epoch_no)
        if early_stopping(history_val, patience=50):
            print("Early Stopping")
            break
    return history_train, history_val

def plot_losses_real_time(train_losses, val_losses):
    """Plot the training and validation loss curves on one labeled figure."""
    for series, label in ((train_losses, "Train Loss"), (val_losses, "Val Loss")):
        plt.plot(series, label=label)
    plt.title("Losses")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()
    plt.show()

def early_stopping(val_losses, patience=5):
    """Return True when validation loss has stopped improving.

    Stops once the best loss within the most recent `patience` epochs is no
    better than the best loss seen before that window, i.e. there has been
    no improvement for `patience` consecutive epochs.

    BUG FIX: the previous version ignored `patience` entirely (it always
    compared only the last three losses) and raised IndexError when fewer
    than three losses had been recorded with a small patience value.

    Args:
        val_losses: per-epoch validation losses, oldest first.
        patience: number of epochs without improvement before stopping.

    Returns:
        bool: True if training should stop.
    """
    if len(val_losses) <= patience:
        # Not enough history to have exhausted the patience window.
        return False
    return min(val_losses[-patience:]) >= min(val_losses[:-patience])
    


def save_state(model, epoch, path = "models", optimizer=None):
    """Checkpoint the model (and optimizer) every 10th epoch.

    Args:
        model: network whose state_dict is saved.
        epoch: current epoch number; only multiples of 10 trigger a save.
        path: checkpoint directory, created if it does not exist.
        optimizer: optimizer whose state is saved alongside the model.
            Defaults to the module-level `optimizer` for backward
            compatibility with call sites that pass only (model, epoch).
    """
    if epoch % 10 != 0:
        return
    # BUG FIX: the original assumed `path` already existed (torch.save fails
    # otherwise) and silently depended on a module-level `optimizer` global.
    os.makedirs(path, exist_ok=True)
    if optimizer is None:
        # Fall back to the script's global optimizer, as the old code did.
        optimizer = globals().get("optimizer")
    state = { "epoch": epoch, "state_dict": model.state_dict() }
    if optimizer is not None:
        state["optimizer"] = optimizer.state_dict()
    torch.save(state, os.path.join(path, f"model_{epoch}.pth"))
    print("Saved model")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device} device")


# Wave-U-Net with 5 encoder/decoder levels and 32 channels of growth per level.
model = Model(5,32).to(device)
# Mean Squared Error Loss
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.00002)

dataset = BassenhanceDataset(df)
train_dataset, val_dataset = dataset.split()
print(f"Train dataset size: {len(train_dataset)}")
print(f"Val dataset size: {len(val_dataset)}")

# BUG FIX: num_workers=2 forks DataLoader worker processes, which is what
# froze the WSL session described above (see the accepted solution below);
# num_workers=0 loads batches in the main process and avoids the hang.
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=0)
valid_loader = DataLoader(val_dataset, batch_size=128, shuffle=True, num_workers=0)
train_losses, val_losses = train(model, train_loader, valid_loader, optimizer, criterion, device, epochs=100)
plot_losses_real_time(train_losses, val_losses)
save_state(model, 100)

Solution

  • I managed to figure out what the problem was

    The issue was with the DataLoader

    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=2)
    valid_loader = DataLoader(val_dataset, batch_size=32, shuffle=True, num_workers=2)
    

    changing the num_workers to num_workers=0 fixed the issue; this webpage explains why better than I could.