I am training a CNN to classify some images. The objective is to classify them into two classes. I already executed the same code on Windows with a RTX3070, now I am trying to do the exact same on Ubuntu with a Nvidea A100-40Gb. The code I am using is this one:
import warnings
warnings.filterwarnings('ignore')
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import plotly
import plotly.graph_objects as go
%matplotlib inline
import os
from sklearn.calibration import calibration_curve
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import itertools
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.optim import lr_scheduler
if torch.cuda.is_available():
print("CUDA available. Using GPU acceleration.")
device = "cuda"
else:
print("CUDA is NOT available. Using CPU for training.")
device = "cpu"
import pickle
def save_var(var,filename):
with open(filename, 'wb') as f:
pickle.dump(var, f)
def recover_var(filename):
with open(filename, 'rb') as f:
var = pickle.load(f)
return var
df = recover_var('dataframe_cnn.pickle') #my dataset
df = df.sample(frac=1).reset_index(drop=True)
df.columns = ['label'] + list(range(1,27649))
train =df[:int(0.7*len(df))]
test = df[int(0.7*len(df)):]
def preprocessing(train, test, split_train_size = 0.2):
# Split data into features(pixels) and labels(numbers from 0 to 9)
targets = train.label.values
features = train.drop(["label"], axis = 1).values
# Normalization
features = features/255.
X_test = test.values/255.
# Train test split. Size of train data is (1-split_train_size)*100% and size of test data is split_train_size%.
X_train, X_val, y_train, y_val = train_test_split(features,
targets,
test_size = split_train_size,
random_state = 42)
# Create feature and targets tensor for train set. I need variable to accumulate gradients. Therefore first I create tensor, then I will create variable
X_train = torch.from_numpy(X_train)
y_train = torch.from_numpy(y_train).type(torch.LongTensor) # data type is long
# Create feature and targets tensor for test set.
X_val = torch.from_numpy(X_val)
y_val = torch.from_numpy(y_val).type(torch.LongTensor) # data type is long
# Create feature tensor for train set.
X_test = torch.from_numpy(X_test)
return X_train, y_train, X_val, y_val, X_test
X_train, y_train, X_val, y_val, X_test = preprocessing(train, test)
print(f'Shape of training data: {X_train.shape}')
print(f'Shape training labels: {y_train.shape}')
print(f'Shape of validation data: {X_val.shape}')
print(f'Shape of valiation labels: {y_val.shape}')
print(f'Shape of testing data: {X_test.shape}')
# batch_size, epoch and iteration
BATCH_SIZE = 100
N_ITER = 2500
EPOCHS = 5
# I will be trainin the model on another 10 epochs to show flexibility of pytorch
EXTRA_EPOCHS = 10
# Pytorch train and test sets
train_tensor = torch.utils.data.TensorDataset(X_train, y_train)
val_tensor = torch.utils.data.TensorDataset(X_val, y_val)
test_tensor = torch.utils.data.TensorDataset(X_test)
# data loader
train_loader = torch.utils.data.DataLoader(train_tensor,
batch_size = BATCH_SIZE,
shuffle = True)
val_loader = torch.utils.data.DataLoader(val_tensor,
batch_size = BATCH_SIZE,
shuffle = False)
test_loader = torch.utils.data.DataLoader(test_tensor,
batch_size = BATCH_SIZE,
shuffle = False)
class CNNModel(nn.Module):
def __init__(self):
super(CNNModel, self).__init__()
# convolution 1
self.c1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=(5,5), stride=1, padding=0)
self.relu1 = nn.ReLU()
# maxpool 1
self.maxpool1 = nn.MaxPool2d(kernel_size=(2,2))
# dropout 1
self.dropout1 = nn.Dropout(0.25)
# convolution 2
self.c2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=(3,3), stride=1, padding=0)
self.relu2 = nn.ReLU()
# maxpool 2
self.maxpool2 = nn.MaxPool2d(kernel_size=(2,2))
# dropout 2
self.dropout2 = nn.Dropout(0.25)
# linear 1
self.fc1 = nn.Linear(32*5*5, 256)
# dropout 3
self.dropout3 = nn.Dropout(0.25)
# linear 2
self.fc2 = nn.Linear(256, 10)
def forward(self, x):
out = self.c1(x) # [BATCH_SIZE, 16, 24, 24]
out = self.relu1(out)
out = self.maxpool1(out) # [BATCH_SIZE, 16, 12, 12]
out = self.dropout1(out)
out = self.c2(out) # [BATCH_SIZE, 32, 10, 10]
out = self.relu2(out)
out = self.maxpool2(out) # [BATCH_SIZE, 32, 5, 5]
out = self.dropout2(out)
out = out.view(out.size(0), -1) # [BATCH_SIZE, 32*5*5=800]
out = self.fc1(out) # [BATCH_SIZE, 256]
out = self.dropout3(out)
out = self.fc2(out) # [BATCH_SIZE, 10]
return out
# Create CNN
model = CNNModel()
# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.003)
# Cross Entropy Loss
criterion = nn.CrossEntropyLoss()
# LR scheduler
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.5)
# On GPU if possible
if torch.cuda.is_available():
print("Model will be training on GPU")
model = model.cuda()
criterion = criterion.cuda()
else:
print("Model will be training on CPU")
def fit(epoch):
print("Training...")
# Set model on training mode
model.train()
# Update lr parameter
exp_lr_scheduler.step()
# Initialize train loss and train accuracy
train_running_loss = 0.0
train_running_correct = 0
train_running_lr = optimizer.param_groups[0]['lr']
for batch_idx, (data, target) in enumerate(train_loader):
data, target = Variable(data.view(BATCH_SIZE,1,144,192)), Variable(target)
if torch.cuda.is_available():
data = data.cuda()
target = target.cuda()
optimizer.zero_grad()
output = model(data)
loss = criterion(output, target)
train_running_loss += loss.item()
_, preds = torch.max(output.data, 1)
train_running_correct += (preds == target).sum().item()
loss.backward()
optimizer.step()
if (batch_idx + 1)% 50 == 0:
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
epoch+1,
(batch_idx + 1) * len(data),
len(train_loader.dataset),
BATCH_SIZE * (batch_idx + 1) / len(train_loader),
loss.cpu().detach().numpy())
)
train_loss = train_running_loss/len(train_loader.dataset)
train_accuracy = 100. * train_running_correct/len(train_loader.dataset)
return train_loss, train_accuracy, train_running_lr
def validate(data_loader):
print("Validating...")
# Set model on validating mode
model.eval()
val_preds = torch.LongTensor().cuda()
val_proba = torch.LongTensor().cuda()
# Initialize validation loss and validation accuracy
val_running_loss = 0.0
val_running_correct = 0
for data, target in data_loader:
# Regarding volatile argument, check the note below
data, target = Variable(data.view(BATCH_SIZE,1,144,192), volatile=True), Variable(target)
if torch.cuda.is_available():
data = data.cuda()
target = target.cuda()
output = model(data)
loss = criterion(output, target)
val_running_loss += loss.item()
pred = output.data.max(1, keepdim=True)[1]
proba = torch.nn.functional.softmax(output.data)
val_running_correct += pred.eq(target.data.view_as(pred)).cpu().sum()
# Store val_predictions with probas for confusion matrix calculations & best errors made
val_preds = torch.cat((val_preds.float(), pred), dim=0).float()
val_proba = torch.cat((val_proba.float(), proba)).float()
val_loss = val_running_loss/len(data_loader.dataset)
val_accuracy = 100. * val_running_correct/len(data_loader.dataset)
return val_loss, val_accuracy, val_preds, val_proba
train_loss, train_accuracy = [], []
val_loss, val_accuracy = [], []
val_preds, val_proba = [], []
train_lr = []
for epoch in range(EPOCHS):
print(f"Epoch {epoch+1} of {EPOCHS}\n")
train_epoch_loss, train_epoch_accuracy, train_epoch_lr = fit(epoch)
val_epoch_loss, val_epoch_accuracy, val_epoch_preds, val_epoch_proba = validate(val_loader)
train_loss.append(train_epoch_loss)
train_accuracy.append(train_epoch_accuracy)
train_lr.append(train_epoch_lr)
val_loss.append(val_epoch_loss)
val_accuracy.append(val_epoch_accuracy)
val_preds.append(val_epoch_preds)
val_proba.append(val_epoch_proba)
print(f"Train Loss: {train_epoch_loss:.4f}, Train Acc: {train_epoch_accuracy:.2f}")
print(f'Val Loss: {val_epoch_loss:.4f}, Val Acc: {val_epoch_accuracy:.2f}\n')
However, it is returning the following error:
Cell In[149], line 285
281 for epoch in range(EPOCHS):
283 print(f"Epoch {epoch+1} of {EPOCHS}\n")
--> 285 train_epoch_loss, train_epoch_accuracy, train_epoch_lr = fit(epoch)
286 val_epoch_loss, val_epoch_accuracy, val_epoch_preds, val_epoch_proba = validate(val_loader)
288 train_loss.append(train_epoch_loss)
Cell In[149], line 213, in fit(epoch)
210 target = target.cuda()
212 optimizer.zero_grad()
--> 213 output = model(data)
214 loss = criterion(output, target)
216 train_running_loss += loss.item()
File /opt/miniconda3/envs/mlgpu/lib/python3.9/site-packages/torch/nn/modules/module.py:1130, in Module._call_impl(self, *input, **kwargs)
1126 # If we don't have any hooks, we want to skip the rest of the logic in
1127 # this function, and just call forward.
1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1129 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130 return forward_call(*input, **kwargs)
1131 # Do not call functions when jit is used
1132 full_backward_hooks, non_full_backward_hooks = [], []
Cell In[149], line 152, in CNNModel.forward(self, x)
150 def forward(self, x):
--> 152 out = self.c1(x) # [BATCH_SIZE, 16, 24, 24]
153 out = self.relu1(out)
154 out = self.maxpool1(out) # [BATCH_SIZE, 16, 12, 12]
File /opt/miniconda3/envs/mlgpu/lib/python3.9/site-packages/torch/nn/modules/module.py:1130, in Module._call_impl(self, *input, **kwargs)
1126 # If we don't have any hooks, we want to skip the rest of the logic in
1127 # this function, and just call forward.
1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1129 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130 return forward_call(*input, **kwargs)
1131 # Do not call functions when jit is used
1132 full_backward_hooks, non_full_backward_hooks = [], []
File /opt/miniconda3/envs/mlgpu/lib/python3.9/site-packages/torch/nn/modules/conv.py:457, in Conv2d.forward(self, input)
456 def forward(self, input: Tensor) -> Tensor:
--> 457 return self._conv_forward(input, self.weight, self.bias)
File /opt/miniconda3/envs/mlgpu/lib/python3.9/site-packages/torch/nn/modules/conv.py:453, in Conv2d._conv_forward(self, input, weight, bias)
449 if self.padding_mode != 'zeros':
450 return F.conv2d(F.pad(input, self._reversed_padding_repeated_twice, mode=self.padding_mode),
451 weight, bias, self.stride,
452 _pair(0), self.dilation, self.groups)
--> 453 return F.conv2d(input, weight, bias, self.stride,
454 self.padding, self.dilation, self.groups)
RuntimeError: Input type (torch.cuda.DoubleTensor) and weight type (torch.cuda.FloatTensor) should be the same```
I already tried to typecast the output to float, torch.float, np.float32, but it still returned the same error. Moreover, I tried to change the type of the variable x
on line 152, without results. How can I solve it?
Looks like your model parameters are in Float
but your data is in Double
datatype. I'm not sure exactly how you attempted to cast your tensor but the following should work:
optimizer.zero_grad()
output = model(data.float())
Alternatively, you can convert the model parameters to Double
by the following:
# Create CNN
model = CNNModel()
model.double()
Try either of them and it should tackle the tensor type mismatch issue.
NOTE: Do not use Variable(tensor)
as you did in the following:
for batch_idx, (data, target) in enumerate(train_loader):
data, target = Variable(data.view(BATCH_SIZE,1,144,192)), Variable(target)
as well as here:
for data, target in data_loader:
# Regarding volatile argument, check the note below
data, target = Variable(data.view(BATCH_SIZE,1,144,192), volatile=True), Variable(target)
The Variable API has been deprecated by PyTorch.