I am trying to calculate the F1 score during evaluation of my own test set, but I am unable to get it working as I am very inexperienced. I have tried to use both the F1 score from scikit-learn and the one from torchmetrics, but each gives me a different error every time. This is my code:
# Evaluate the model on the test set: accumulate accuracy and the flat
# label/prediction lists needed for sklearn's f1_score.
from sklearn.metrics import f1_score

since = time.time()
total = 0
correct = 0
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
y_pred = []
y_true = []

# Iterate over data.
with torch.no_grad():
    for inputs, labels in dataloadersTest_dict['Test']:
        inputs = inputs.to(device)
        labels = labels.to(device)
        predicted_outputs = model(inputs)
        # Class index with the highest logit is the prediction.
        _, predicted = torch.max(predicted_outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        # Two fixes for the f1_score errors:
        # 1. .cpu() is mandatory before .numpy() when tensors live on a
        #    CUDA device, otherwise a TypeError is raised.
        # 2. extend() (not append()) keeps y_true/y_pred as FLAT lists of
        #    labels; appending per-batch lists produces a nested list,
        #    which f1_score rejects.
        y_true.extend(labels.cpu().numpy().tolist())
        y_pred.extend(predicted.cpu().numpy().tolist())

time_elapsed = time.time() - since
test_acc = 100 * correct / total
print('Evaluation completed in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
print('Accuracy: %d %%' % (test_acc))
print('F1 Score:')
# 'macro' averages the per-class F1 scores with equal class weight.
f1 = f1_score(y_true, y_pred, average='macro')
print(f1)
The full error traceback would make it easier to pin down the problem, but the likely cause is that a nested list (one sub-list per batch) is being passed to f1_score instead of a single flat list. The fix is to change how the final lists are collected:
# Iterate over data, collecting labels and predictions as FLAT lists so
# they can be passed directly to sklearn's f1_score.
y_true, y_pred = [], []
with torch.no_grad():
    for inputs, labels in dataloadersTest_dict['Test']:
        inputs = inputs.to(device)
        labels = labels.to(device)
        predicted_outputs = model(inputs)
        # Predicted class = argmax over the class dimension.
        _, predicted = torch.max(predicted_outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        # .cpu() is required before .numpy() when the tensors are on a
        # CUDA device; += concatenates the per-batch lists so y_true and
        # y_pred stay flat (no nesting), which is what f1_score expects.
        y_true += labels.cpu().numpy().tolist()
        y_pred += predicted.cpu().numpy().tolist()