I am trying to train a neural network on my computer using WSL. First I tried using a Jupyter kernel to run the script line by line; it trained for 10 epochs before WSL crashed inside VS Code and got stuck in an endless reconnecting loop. Afterwards I tried running the script from the terminal, and it has been stuck as shown in the screenshots below (more than 1 hour at the time of writing). Is there a way to prevent this? Thank you!
(Screenshots: Performance view, Process view)
EDIT FROM HERE: This is my code as I am trying to run it. My goal is to implement Wave-U-Net, using the MSE between the model output and the target as the loss.
import os
import torch
import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import torchaudio
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import matplotlib.pyplot as plt
PARENT_FOLDER = "/mnt/c/Users/Tudor/Documents/yt-dlp"
SCALER = MinMaxScaler()
class DownSamplingLayer(nn.Module):
    def __init__(self, channel_in, channel_out, dilation=1, kernel_size=9, stride=1, padding="same"):
        super(DownSamplingLayer, self).__init__()
        self.main = nn.Sequential(
            nn.Conv1d(channel_in, channel_out, kernel_size=kernel_size,
                      stride=stride, padding=padding, dilation=dilation),
            nn.BatchNorm1d(channel_out),
            nn.LeakyReLU(negative_slope=0.1, inplace=True),
        )
        self.dropout = nn.Dropout(p=0.3)

    def forward(self, x):
        x = self.main(x)
        return self.dropout(x)
class UpSamplingLayer(nn.Module):
    def __init__(self, channel_in, channel_out, kernel_size=9, stride=1, padding="same"):
        super(UpSamplingLayer, self).__init__()
        self.main = nn.Sequential(
            nn.Conv1d(channel_in, channel_out, kernel_size=kernel_size,
                      stride=stride, padding=padding),
            nn.BatchNorm1d(channel_out),
            nn.LeakyReLU(negative_slope=0.1, inplace=True),
        )

    def forward(self, x):
        return self.main(x)
class Model(nn.Module):
    def __init__(self, n_layers=8, channels_interval=16):
        super(Model, self).__init__()
        self.n_layers = n_layers
        self.channels_interval = channels_interval
        encoder_in_channels_list = [1] + [i * self.channels_interval for i in range(1, self.n_layers)]
        encoder_out_channels_list = [i * self.channels_interval for i in range(1, self.n_layers + 1)]
        self.encoder = nn.ModuleList()
        for i in range(self.n_layers):
            self.encoder.append(
                DownSamplingLayer(
                    channel_in=encoder_in_channels_list[i],
                    channel_out=encoder_out_channels_list[i]
                )
            )
        self.middle = nn.Sequential(
            nn.Conv1d(self.n_layers * self.channels_interval, self.n_layers * self.channels_interval,
                      kernel_size=3, stride=1, padding="same"),
            nn.BatchNorm1d(self.n_layers * self.channels_interval),
            nn.LeakyReLU(negative_slope=0.1, inplace=True)
        )
        decoder_in_channels_list = [(2 * i + 1) * self.channels_interval for i in range(1, self.n_layers)] + [
            2 * self.n_layers * self.channels_interval]
        decoder_in_channels_list = decoder_in_channels_list[::-1]
        decoder_out_channels_list = encoder_out_channels_list[::-1]
        self.decoder = nn.ModuleList()
        for i in range(self.n_layers):
            self.decoder.append(
                UpSamplingLayer(
                    channel_in=decoder_in_channels_list[i],
                    channel_out=decoder_out_channels_list[i]
                )
            )
        self.out = nn.Sequential(
            nn.Conv1d(1 + self.channels_interval, 1, kernel_size=1, stride=1),
            nn.LeakyReLU(negative_slope=0.1, inplace=True)
        )
        # Initialize the weights
        self.initialize_weights()

    def initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv1d) or isinstance(m, nn.ConvTranspose1d):
                init.xavier_uniform_(m.weight, gain=1.0)
                if m.bias is not None:
                    init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm1d):
                init.constant_(m.weight, 1)
                init.constant_(m.bias, 0)
    def forward(self, x):
        tmp = []
        o = x
        # Encoder (down-sampling path)
        for i in range(self.n_layers):
            o = self.encoder[i](o)
            tmp.append(o)
            o = F.max_pool1d(o, kernel_size=2, stride=2)
        o = self.middle(o)
        # Decoder (up-sampling path) with skip connections
        for i in range(self.n_layers):
            o = F.interpolate(o, scale_factor=2, mode="linear", align_corners=True)
            o = torch.cat((o, tmp[self.n_layers - i - 1]), dim=1)
            o = self.decoder[i](o)
        o = torch.cat((o, x), dim=1)
        o = self.out(o)
        return o
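# Quick sanity check (added for illustration, not part of my original script): the
# network halves the time axis n_layers times and doubles it back, so the input
# length must be divisible by 2**n_layers for the skip connections to line up.
_dummy = torch.randn(2, 1, 1024)  # (batch, channels, samples)
assert Model(n_layers=5, channels_interval=32)(_dummy).shape == _dummy.shape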
input_samples, target_samples = np.load("input_samples.npy"), np.load("target_samples.npy")
input_samples = input_samples.tolist()
target_samples = target_samples.tolist()
df = pd.DataFrame({"input":input_samples,"target":target_samples})
class BassenhanceDataset(Dataset):
    def __init__(self, df):
        self.df = df
        self.input = df["input"]
        self.target = df["target"]

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        input = self.input[idx]
        target = self.target[idx]
        input = torch.tensor(input, dtype=torch.float32).T
        target = torch.tensor(target, dtype=torch.float32).T
        return input, target

    # Helper methods below are not used by the training script further down.
    def get_loader(self, batch_size, shuffle=True, num_workers=0):
        return DataLoader(self, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)

    def transpose(self, data):
        return data.transpose(1, 2)

    def get_loader_transpose(self, batch_size, shuffle=True, num_workers=0):
        return DataLoader(self, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers, drop_last=True)

    def split(self, train_size=0.8, shuffle=True):
        return torch.utils.data.random_split(
            self, [int(len(self) * train_size), len(self) - int(len(self) * train_size)],
            generator=torch.Generator().manual_seed(42))
def train_epoch(model, train_loader, optimizer, criterion, device):
    model.train()
    running_loss = 0.0
    for i, (input, target) in enumerate(train_loader):
        input, target = input.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(input)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    return running_loss / len(train_loader)
def validate_epoch(model, val_loader, criterion, device):
    model.eval()
    running_loss = 0.0
    with torch.no_grad():
        for i, (input, target) in enumerate(val_loader):
            input, target = input.to(device), target.to(device)
            output = model(input)
            loss = criterion(output, target)
            running_loss += loss.item()
    return running_loss / len(val_loader)
def train(model, train_loader, val_loader, optimizer, criterion, device, epochs=10):
    train_losses = []
    val_losses = []
    for epoch in range(epochs):
        train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
        val_loss = validate_epoch(model, val_loader, criterion, device)
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        print(f"Epoch {epoch + 1} | Train Loss: {train_loss:.10f} | Val Loss: {val_loss:.10f}")
        save_state(model, epoch + 1)
        if early_stopping(val_losses, patience=50):
            print("Early Stopping")
            break
    return train_losses, val_losses
def plot_losses_real_time(train_losses, val_losses):
    plt.plot(train_losses, label="Train Loss")
    plt.plot(val_losses, label="Val Loss")
    plt.title("Losses")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()
    plt.show()
def early_stopping(val_losses, patience=5):
    # Only start checking after `patience` epochs; then stop if the validation
    # loss has increased for two epochs in a row.
    if len(val_losses) < patience:
        return False
    else:
        return val_losses[-1] > val_losses[-2] > val_losses[-3]
def save_state(model, epoch, path="models"):
    if epoch % 10 == 0:
        os.makedirs(path, exist_ok=True)  # make sure the output folder exists
        state = {"epoch": epoch, "state_dict": model.state_dict(), "optimizer": optimizer.state_dict()}
        torch.save(state, os.path.join(path, f"model_{epoch}.pth"))
        print("Saved model")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device} device")
model = Model(5,32).to(device)
# Mean Squared Error Loss
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.00002)
dataset = BassenhanceDataset(df)
train_dataset, val_dataset = dataset.split()
print(f"Train dataset size: {len(train_dataset)}")
print(f"Val dataset size: {len(val_dataset)}")
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=2)
valid_loader = DataLoader(val_dataset, batch_size=128, shuffle=True, num_workers=2)
train_losses, val_losses = train(model, train_loader, valid_loader, optimizer, criterion, device, epochs=100)
plot_losses_real_time(train_losses, val_losses)
save_state(model, 100)
I managed to figure out what the problem was. The issue was with the DataLoader:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=2)
valid_loader = DataLoader(val_dataset, batch_size=32, shuffle=True, num_workers=2)
Changing num_workers to num_workers=0 fixed the issue; this webpage explains why better than I could.
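For reference, a minimal sketch of the fix (same dataset objects and batch size as in the snippet above): with num_workers=0 the batches are prepared in the main process instead of in separate worker processes, which is what stopped the hang on my WSL setup.
# DataLoaders without worker processes
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=0)
valid_loader = DataLoader(val_dataset, batch_size=32, shuffle=True, num_workers=0)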