I am trying to calculate the F1 score during evaluation of my own test set, but I am unable to get it working as I am very inexperienced. I have tried to use both the F1 score from scikit-learn and the one from torchmetrics, but each gives me a different error every time. This is my code:
# Evaluate the model on the test set: accumulate accuracy and the flat
# label/prediction lists needed for sklearn's f1_score.
from sklearn.metrics import f1_score

since = time.time()
total = 0
correct = 0
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
y_pred = []
y_true = []

# Iterate over data.
with torch.no_grad():
    for inputs, labels in dataloadersTest_dict['Test']:
        inputs = inputs.to(device)
        labels = labels.to(device)
        predicted_outputs = model(inputs)
        # Class index with the highest logit is the prediction.
        _, predicted = torch.max(predicted_outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        # Two fixes for the f1_score errors:
        # 1. .cpu() is mandatory before .numpy() when tensors live on a
        #    CUDA device, otherwise a TypeError is raised.
        # 2. extend() (not append()) keeps y_true/y_pred as FLAT lists of
        #    labels; appending per-batch lists produces a nested list,
        #    which f1_score rejects.
        y_true.extend(labels.cpu().numpy().tolist())
        y_pred.extend(predicted.cpu().numpy().tolist())

time_elapsed = time.time() - since
test_acc = 100 * correct / total
print('Evaluation completed in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
print('Accuracy: %d %%' % (test_acc))
print('F1 Score:')
# 'macro' averages the per-class F1 scores with equal class weight.
f1 = f1_score(y_true, y_pred, average='macro')
print(f1)
The full error traceback would make it easier to pin down the problem, but the likely cause is that a nested list (one sub-list per batch) is being passed to f1_score instead of a single flat list. The fix is to change how the final lists are collected:
# Iterate over data, collecting labels and predictions as FLAT lists so
# they can be passed directly to sklearn's f1_score.
y_true, y_pred = [], []
with torch.no_grad():
    for inputs, labels in dataloadersTest_dict['Test']:
        inputs = inputs.to(device)
        labels = labels.to(device)
        predicted_outputs = model(inputs)
        # Predicted class = argmax over the class dimension.
        _, predicted = torch.max(predicted_outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        # .cpu() is required before .numpy() when the tensors are on a
        # CUDA device; += concatenates the per-batch lists so y_true and
        # y_pred stay flat (no nesting), which is what f1_score expects.
        y_true += labels.cpu().numpy().tolist()
        y_pred += predicted.cpu().numpy().tolist()