I'm converting a GoogLeNet model from PyTorch to ONNX using the following code:
torch.onnx.export(model,                       # model being run
                  input_batch,                 # model input (or a tuple for multiple inputs)
                  "google-net-onnx-test.onnx", # where to save the model (can be a file or file-like object)
                  export_params=True,          # store the trained parameter weights inside the model file
                  opset_version=10,            # the ONNX version to export the model to
                  do_constant_folding=True,    # whether to execute constant folding for optimization
                  input_names=['input'],       # the model's input names
                  output_names=['output'],     # the model's output names
                  dynamic_axes={'input': {0: 'batch_size'},    # variable length axes
                                'output': {0: 'batch_size'}})
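(One thing that can be checked first, independent of any preprocessing: run the original model and the exported graph on the same input tensor and compare the raw outputs. A minimal sketch, assuming the model and input_batch from the PyTorch code further down; a large difference here would point at the export itself rather than at pre/post-processing.)

# Sanity check (sketch): compare the PyTorch model and the exported ONNX graph
# on the *same* input tensor, so preprocessing cannot be the source of a mismatch.
import numpy as np
import onnxruntime as ort
import torch

ort_session = ort.InferenceSession("google-net-onnx-test.onnx")

with torch.no_grad():
    torch_out = model(input_batch)

onnx_out = ort_session.run(None, {"input": input_batch.cpu().numpy()})[0]

# small numerical differences are expected, large ones indicate an export problem
np.testing.assert_allclose(torch_out.cpu().numpy(), onnx_out, rtol=1e-3, atol=1e-5)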
When I run the model in PyTorch on this image:
I get the right results:
Samoyed 0.9378381967544556
Pomeranian 0.00828344002366066
Great Pyrenees 0.005603068508207798
Arctic fox 0.005527767818421125
white wolf 0.004741032607853413
But when I do it with ONNX I get this:
The pre- and post-processing code is different in each case, but it should be equivalent.
This is the complete PyTorch code:
import torch
from PIL import Image
from torchvision import transforms

model = torch.hub.load('pytorch/vision:v0.10.0', 'googlenet', pretrained=True)
model.eval()

filename = "dog.jpg"  # path to the test image (the dog.jpg used in the ONNX code below)
input_image = Image.open(filename)
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
input_tensor = preprocess(input_image)
input_batch = input_tensor.unsqueeze(0)  # create a mini-batch as expected by the model

# move the input and model to GPU for speed if available
if torch.cuda.is_available():
    input_batch = input_batch.to('cuda')
    model.to('cuda')

with torch.no_grad():
    output = model(input_batch)

# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes
# print(output[0])

# The output has unnormalized scores. To get probabilities, you can run a softmax on it.
probabilities = torch.nn.functional.softmax(output[0], dim=0)
print(probabilities[:2])

# Read the categories
with open("imagenet_classes.txt", "r") as f:
    categories = [s.strip() for s in f.readlines()]

# Show top categories per image
top5_prob, top5_catid = torch.topk(probabilities, 5)
for i in range(top5_prob.size(0)):
    print(categories[top5_catid[i]], top5_prob[i].item())
And this is the code for ONNX:
from PIL import Image
import imageio
import onnxruntime as ort
import numpy as np
import matplotlib.pyplot as plt
from collections import namedtuple
import os
import time

def get_image(path):
    '''
    Using the path to an image, return the loaded RGB image
    '''
    img = imageio.imread(path, pilmode='RGB')
    return img

# Pre-processing function for ImageNet models using numpy
def preprocess(img):
    '''
    Preprocessing required on the images for inference with mxnet gluon
    The function takes a loaded image and returns a processed tensor
    '''
    img = np.array(Image.fromarray(img).resize((224, 224))).astype(np.float32)
    img[:, :, 0] -= 123.68
    img[:, :, 1] -= 116.779
    img[:, :, 2] -= 103.939
    img[:, :, [0, 1, 2]] = img[:, :, [2, 1, 0]]
    img = img.transpose((2, 0, 1))
    img = np.expand_dims(img, axis=0)
    return img

def predict(path):
    img_batch = preprocess(get_image(path))
    outputs = ort_session.run(
        None,
        {"input": img_batch.astype(np.float32)},
    )
    a = np.argsort(-outputs[0].flatten())
    results = {}
    for i in a[0:5]:
        results[labels[i]] = float(outputs[0][0][i])
    return results

ort_session = ort.InferenceSession("/content/google-net-onnx-test.onnx")

with open('synset.txt', 'r') as f:
    labels = [l.rstrip() for l in f]

image_path = "/content/dog.jpg"
predict(image_path)
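(One way to isolate the problem, not part of the original code: feed the tensor produced by the torchvision preprocessing directly into this ONNX session. If the top 5 then matches the PyTorch results, the export is fine and the difference is entirely in the numpy preprocessing above.)

# Isolation test (sketch): reuse the torchvision pipeline from the PyTorch code
# and run its output through the ONNX session defined above.
from torchvision import transforms

tv_preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
tv_batch = tv_preprocess(Image.open(image_path)).unsqueeze(0).numpy()
tv_outputs = ort_session.run(None, {"input": tv_batch})
for i in np.argsort(-tv_outputs[0].flatten())[:5]:
    print(labels[i], float(tv_outputs[0][0][i]))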
I took the PyTorch code from this tutorial.
And the code for ONNX from the ONNX Model Zoo on GitHub.
From @jhso's comments, I think the normalisation step:
mean=[0.485, 0.456, 0.406]
seems to me to be equivalent to:
img[:, :, 0] -= 123.68
img[:, :, 1] -= 116.779
img[:, :, 2] -= 103.939
because:
constant = 256
a,b,c = 123.68/constant, 116.779/constant, 103.939/constant
print (f'{a:.3f} {b:.3f} {c:.3f}')
0.483 0.456 0.406
Regarding the std part, I'm not sure where it happens or if it is equivalent to:
img[:,:,[0,1,2]] = img[:,:,[2,1,0]]
img = img.transpose((2, 0, 1))
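(A small numeric check of my own suggests the two are not equivalent: the numpy code never divides by 255 or by the std, and the channel swap/transpose only change the memory layout, not the values. The pixel value below is an arbitrary example.)

import numpy as np

pixel = np.array([200.0, 50.0, 255.0])  # one RGB pixel in the 0-255 range

# what the ONNX preprocessing does (mean subtraction only)
caffe_style = pixel - np.array([123.68, 116.779, 103.939])

# what transforms.ToTensor() + Normalize do (scale to 0-1, subtract mean, divide by std)
torch_style = (pixel / 255.0 - np.array([0.485, 0.456, 0.406])) / np.array([0.229, 0.224, 0.225])

print(caffe_style)  # [ 76.32  -66.779 151.061] -> still on the 0-255 scale
print(torch_style)  # [ 1.31   -1.16    2.64  ] -> roughly the range the model was trained on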
Also, I ran the code again today and got a closer result:
Your preprocessing is wrong. Note that you have a center crop (less important) and a std deviation normalisation step you're not using. You're also seemingly converting to BGR, which isn't required when using PIL (it's more of an OpenCV thing) - happy to be corrected if I'm wrong as I'm going from memory.
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
Your preprocessing stage should look something like this (ymmv):
# Pre-processing function for ImageNet models using numpy
def preprocess(img):
    '''
    Preprocessing required on the images for inference with mxnet gluon
    The function takes a loaded image and returns a processed tensor
    '''
    img = np.array(Image.fromarray(img).resize((256, 256))).astype(np.float32)
    # center crop
    rm_pad = (256 - 224) // 2
    img = img[rm_pad:-rm_pad, rm_pad:-rm_pad]
    # normalize to 0-1
    img /= 255.
    # normalize by mean + std
    img = (img - np.array([0.485, 0.456, 0.406])) / np.array([0.229, 0.224, 0.225])
    # img[:,:,[0,1,2]] = img[:,:,[2,1,0]] # don't think this is needed?
    img = img.transpose((2, 0, 1))
    img = np.expand_dims(img, axis=0)
    return img
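With this preprocess swapped into the ONNX script above, predict() should give results much closer to the PyTorch run. Small differences can remain because transforms.Resize(256) resizes the shorter side and keeps the aspect ratio, whereas resize((256, 256)) squashes the image to a square; note also that the mean/std arrays broadcast over the last (channel) axis here.

print(predict("/content/dog.jpg"))  # should now put Samoyed at the top, like the PyTorch run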