I'm converting a GoogLeNet model from PyTorch to ONNX using the following code:
torch.onnx.export(model,                       # model being run
                  input_batch,                 # model input (or a tuple for multiple inputs)
                  "google-net-onnx-test.onnx", # where to save the model (can be a file or file-like object)
                  export_params=True,          # store the trained parameter weights inside the model file
                  opset_version=10,            # the ONNX version to export the model to
                  do_constant_folding=True,    # whether to execute constant folding for optimization
                  input_names=['input'],       # the model's input names
                  output_names=['output'],     # the model's output names
                  dynamic_axes={'input': {0: 'batch_size'},    # variable length axes
                                'output': {0: 'batch_size'}})
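(One thing that can be checked first, independent of any preprocessing: run the original model and the exported graph on the same input tensor and compare the raw outputs. A minimal sketch, assuming the model and input_batch from the PyTorch code further down; a large difference here would point at the export itself rather than at pre/post-processing.)

# Sanity check (sketch): compare the PyTorch model and the exported ONNX graph
# on the *same* input tensor, so preprocessing cannot be the source of a mismatch.
import numpy as np
import onnxruntime as ort
import torch

ort_session = ort.InferenceSession("google-net-onnx-test.onnx")

with torch.no_grad():
    torch_out = model(input_batch)

onnx_out = ort_session.run(None, {"input": input_batch.cpu().numpy()})[0]

# small numerical differences are expected, large ones indicate an export problem
np.testing.assert_allclose(torch_out.cpu().numpy(), onnx_out, rtol=1e-3, atol=1e-5)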
When I run the model in PyTorch on this image:
I get the right results:
Samoyed 0.9378381967544556
Pomeranian 0.00828344002366066
Great Pyrenees 0.005603068508207798
Arctic fox 0.005527767818421125
white wolf 0.004741032607853413
But when I do it with ONNX I get this:
The pre- and post-processing code is different in each case, but it should be equivalent.
This is the complete PyTorch code:
import torch
from PIL import Image
from torchvision import transforms

model = torch.hub.load('pytorch/vision:v0.10.0', 'googlenet', pretrained=True)
model.eval()

filename = "dog.jpg"  # path to the test image (the dog.jpg used in the ONNX code below)
input_image = Image.open(filename)
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
input_tensor = preprocess(input_image)
input_batch = input_tensor.unsqueeze(0)  # create a mini-batch as expected by the model

# move the input and model to GPU for speed if available
if torch.cuda.is_available():
    input_batch = input_batch.to('cuda')
    model.to('cuda')

with torch.no_grad():
    output = model(input_batch)

# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes
# print(output[0])

# The output has unnormalized scores. To get probabilities, you can run a softmax on it.
probabilities = torch.nn.functional.softmax(output[0], dim=0)
print(probabilities[:2])

# Read the categories
with open("imagenet_classes.txt", "r") as f:
    categories = [s.strip() for s in f.readlines()]

# Show top categories per image
top5_prob, top5_catid = torch.topk(probabilities, 5)
for i in range(top5_prob.size(0)):
    print(categories[top5_catid[i]], top5_prob[i].item())
And this is the code for ONNX:
from PIL import Image
import imageio
import onnxruntime as ort
import numpy as np
import matplotlib.pyplot as plt
from collections import namedtuple
import os
import time

def get_image(path):
    '''
    Using the path to an image, return the loaded RGB image
    '''
    img = imageio.imread(path, pilmode='RGB')
    return img

# Pre-processing function for ImageNet models using numpy
def preprocess(img):
    '''
    Preprocessing required on the images for inference with mxnet gluon
    The function takes a loaded image and returns a processed tensor
    '''
    img = np.array(Image.fromarray(img).resize((224, 224))).astype(np.float32)
    img[:, :, 0] -= 123.68
    img[:, :, 1] -= 116.779
    img[:, :, 2] -= 103.939
    img[:, :, [0, 1, 2]] = img[:, :, [2, 1, 0]]
    img = img.transpose((2, 0, 1))
    img = np.expand_dims(img, axis=0)
    return img

def predict(path):
    img_batch = preprocess(get_image(path))
    outputs = ort_session.run(
        None,
        {"input": img_batch.astype(np.float32)},
    )
    a = np.argsort(-outputs[0].flatten())
    results = {}
    for i in a[0:5]:
        results[labels[i]] = float(outputs[0][0][i])
    return results

ort_session = ort.InferenceSession("/content/google-net-onnx-test.onnx")

with open('synset.txt', 'r') as f:
    labels = [l.rstrip() for l in f]

image_path = "/content/dog.jpg"
predict(image_path)
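(One way to isolate the problem, not part of the original code: feed the tensor produced by the torchvision preprocessing directly into this ONNX session. If the top 5 then matches the PyTorch results, the export is fine and the difference is entirely in the numpy preprocessing above.)

# Isolation test (sketch): reuse the torchvision pipeline from the PyTorch code
# and run its output through the ONNX session defined above.
from torchvision import transforms

tv_preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
tv_batch = tv_preprocess(Image.open(image_path)).unsqueeze(0).numpy()
tv_outputs = ort_session.run(None, {"input": tv_batch})
for i in np.argsort(-tv_outputs[0].flatten())[:5]:
    print(labels[i], float(tv_outputs[0][0][i]))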
I took the PyTorch code from this tutorial.
And the code for ONNX from the ONNX Model Zoo on GitHub.
From @jhso's comments, I think the normalisation step:
mean=[0.485, 0.456, 0.406]
seems to me to be equivalent to:
img[:, :, 0] -= 123.68
img[:, :, 1] -= 116.779
img[:, :, 2] -= 103.939
because:
constant = 256
a,b,c = 123.68/constant, 116.779/constant, 103.939/constant
print (f'{a:.3f} {b:.3f} {c:.3f}')
0.483 0.456 0.406
Regarding the std part, I'm not sure where it happens or if it is equivalent to:
img[:,:,[0,1,2]] = img[:,:,[2,1,0]]
img = img.transpose((2, 0, 1))
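(A small numeric check of my own suggests the two are not equivalent: the numpy code never divides by 255 or by the std, and the channel swap/transpose only change the memory layout, not the values. The pixel value below is an arbitrary example.)

import numpy as np

pixel = np.array([200.0, 50.0, 255.0])  # one RGB pixel in the 0-255 range

# what the ONNX preprocessing does (mean subtraction only)
caffe_style = pixel - np.array([123.68, 116.779, 103.939])

# what transforms.ToTensor() + Normalize do (scale to 0-1, subtract mean, divide by std)
torch_style = (pixel / 255.0 - np.array([0.485, 0.456, 0.406])) / np.array([0.229, 0.224, 0.225])

print(caffe_style)  # [ 76.32  -66.779 151.061] -> still on the 0-255 scale
print(torch_style)  # [ 1.31   -1.16    2.64  ] -> roughly the range the model was trained on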
Also, I ran the code again today and got a closer result:
Your preprocessing is wrong. Note that you have a center crop (less important) and a std deviation normalisation step you're not using. You're also seemingly converting to BGR, which isn't required when using PIL (it's more of an OpenCV thing) - happy to be corrected if I'm wrong as I'm going from memory.
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
Your preprocessing stage should look something like this (ymmv):
# Pre-processing function for ImageNet models using numpy
def preprocess(img):
    '''
    Preprocessing required on the images for inference with mxnet gluon
    The function takes a loaded image and returns a processed tensor
    '''
    img = np.array(Image.fromarray(img).resize((256, 256))).astype(np.float32)
    # center crop
    rm_pad = (256 - 224) // 2
    img = img[rm_pad:-rm_pad, rm_pad:-rm_pad]
    # normalize to 0-1
    img /= 255.
    # normalize by mean + std
    img = (img - np.array([0.485, 0.456, 0.406])) / np.array([0.229, 0.224, 0.225])
    # img[:,:,[0,1,2]] = img[:,:,[2,1,0]] # don't think this is needed?
    img = img.transpose((2, 0, 1))
    img = np.expand_dims(img, axis=0)
    return img
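With this preprocess swapped into the ONNX script above, predict() should give results much closer to the PyTorch run. Small differences can remain because transforms.Resize(256) resizes the shorter side and keeps the aspect ratio, whereas resize((256, 256)) squashes the image to a square; note also that the mean/std arrays broadcast over the last (channel) axis here.

print(predict("/content/dog.jpg"))  # should now put Samoyed at the top, like the PyTorch run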