Tags: python, class, deep-learning, pytorch, model

Huge speed difference between class and function approaches to loading a torch model in Python


I have two different ways to load a model (in this case, a torch ResNet50 model pretrained on Places365, available here: https://github.com/CSAILVision/places365; for the code below to work, model_path needs to point to this model). The first creates a class, the second loads the model with a plain function. The two pieces of code are very similar, yet the class version is 3 to 4 times faster than the function version, and I do not understand why. Any ideas? Thanks! Here is the code (Python 3.8), first with the class:


from typing import List, Union
import time
import numpy as np
import torch
import torchvision.models as models
from PIL import Image
from torch.autograd import Variable
from torchvision import transforms as trn

import cv2

class FeatureExtractor:
    """
    A class to convert images to features using our pretrained model.
    """

    def __init__(
            self,
            model_path: str,
    ):
        """
        Loads a pretrained ResNet50 network pretrained on Places365. Since the purpose of the
        Network is to extract image features, the top Dense layer is not included. The output size is 1x2048.

        Args:
            model_path: The path of the model file (`.pth.tar`).

        """
        pretrained_model = models.__dict__['resnet50'](num_classes=365)
        checkpoint = torch.load(model_path, map_location=lambda storage, loc: storage)
        state_dict = {k.replace('module.', ''): v for k, v in checkpoint['state_dict'].items()}
        pretrained_model.load_state_dict(state_dict)
        res50_conv = torch.nn.Sequential(*list(pretrained_model.children())[:-1])
        res50_conv.eval()
        self.model = res50_conv

    def extract_image_features(
            self,
            img: np.ndarray,
    ) -> np.ndarray:
        """
        Given an image, outputs a vector of size 1x2048 with the deep features of the image.

        Args:
            img: the input image

        Returns:
            the output vector with the features
        """
        t_start = time.time()
        centre_crop = trn.Compose([
            trn.Resize((256, 256)),
            trn.CenterCrop(224),
            trn.ToTensor(),
            trn.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])
        
        img = Image.fromarray(np.uint8(img)).convert('RGB')
        preprocessed_img = Variable(centre_crop(img).unsqueeze(0))

        places_features = self.model.forward(preprocessed_img).detach().numpy().flatten()
        t_end = time.time()
        print(t_end-t_start)
        return places_features

# a random image on my computer
image = cv2.imread('/home/michael/Documents/thumbs/.png00001.png')

feature_extractor = FeatureExtractor('/home/michael/Gitlab/backend/packages/izirecord-api-cv/izirecord/cv/models/resnet50_places365.pth.tar')

feature_extractor.extract_image_features(image)

and with the function load_feature_extractor defined below:

from typing import List, Union
import time
import numpy as np
import torch
import torchvision.models as models
from PIL import Image
from torch.autograd import Variable
from torchvision import transforms as trn

import cv2


def load_feature_extractor(
        model_path: str,
):
    """
    Loads a pretrained ResNet50 network pretrained on Places365. Since the purpose of the
    Network is to extract image features, the top Dense layer is not included. The output size is 1x2048.

    Args:
        model_path: The path of the model file (`.pth.tar`).

    """
    pretrained_model = models.__dict__['resnet50'](num_classes=365)
    checkpoint = torch.load(model_path, map_location=lambda storage, loc: storage)
    state_dict = {k.replace('module.', ''): v for k, v in checkpoint['state_dict'].items()}
    pretrained_model.load_state_dict(state_dict)
    res50_conv = torch.nn.Sequential(*list(pretrained_model.children())[:-1])
    res50_conv.eval()
    return res50_conv


def extract_image_features(
        img: np.ndarray,
) -> np.ndarray:
    """
    Given an image, outputs a vector of size 1x2048 with the deep features of the image.

    Args:
        img: the input image

    Returns:
        the output vector with the features
    """
    t_start = time.time()
    centre_crop = trn.Compose([
        trn.Resize((256, 256)),
        trn.CenterCrop(224),
        trn.ToTensor(),
        trn.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    # TODO Review this suspicious color treatment and numpy → PIL → Torch conversion,
    #  when numpy → Torch can easily be handled by trn.ToTensor().
    img = Image.fromarray(np.uint8(img)).convert('RGB')
    preprocessed_img = Variable(centre_crop(img).unsqueeze(0))

    # TODO Understand this line:
    places_features = load_feature_extractor(feature_extractor_model_path).forward(
        preprocessed_img).detach().numpy().flatten()
    t_end = time.time()
    print(t_end-t_start)
    return places_features



# same model file as in the class version, used inside extract_image_features
feature_extractor_model_path = '/home/michael/Gitlab/backend/packages/izirecord-api-cv/izirecord/cv/models/resnet50_places365.pth.tar'

# a random image on my computer:
image = cv2.imread('/home/michael/Documents/thumbs/.png00001.png')

extract_image_features(image)

As I said above, extract_image_features is much faster with the class approach, but why? :)

I was expecting the two approaches to have similar computation times. I printed the times and extract_image_features runs about 3.5x faster when the model is loaded through the FeatureExtractor class: 0.26150083541870117 s with the class structure versus 0.9230237007141113 s with the function structure.


Solution

  • In the function-based version you measure both the time it takes to load the model and the time it takes to run the forward pass, because load_feature_extractor(feature_extractor_model_path) is called inside extract_image_features, between t_start and t_end. In the class-based version the model is loaded once in __init__, outside the timed section, so only the forward pass is measured. The gap you see is model-loading time, not a difference in inference speed.
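To make the two measurements comparable, load the model once and pass it into the timing function. Below is a minimal sketch of the corrected function-based version, reusing load_feature_extractor and feature_extractor_model_path from the question; the model parameter and the torch.no_grad() context are my additions (no_grad is optional here and just skips gradient bookkeeping during inference):

import time

import cv2
import numpy as np
import torch
from PIL import Image
from torchvision import transforms as trn


def extract_image_features(img: np.ndarray, model: torch.nn.Module) -> np.ndarray:
    """Same preprocessing and forward pass as before, but the model is passed
    in already loaded, so only inference falls between t_start and t_end."""
    t_start = time.time()
    centre_crop = trn.Compose([
        trn.Resize((256, 256)),
        trn.CenterCrop(224),
        trn.ToTensor(),
        trn.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    img = Image.fromarray(np.uint8(img)).convert('RGB')
    preprocessed_img = centre_crop(img).unsqueeze(0)

    with torch.no_grad():  # inference only, gradients are not needed
        places_features = model(preprocessed_img).numpy().flatten()
    t_end = time.time()
    print(t_end - t_start)
    return places_features


# Load the model once, outside the timed section, exactly as __init__ does in
# the class version. Timing the load separately shows where the gap comes from.
t0 = time.time()
model = load_feature_extractor(feature_extractor_model_path)
print('model load:', time.time() - t0)  # should be roughly the ~0.66 s difference observed

image = cv2.imread('/home/michael/Documents/thumbs/.png00001.png')
extract_image_features(image, model)

Timed this way, both versions should report essentially the same per-image number, since the class's extract_image_features and this function perform identical preprocessing and forward passes.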