Tags: python, pytorch, computer-vision, image-segmentation, detectron

Build confusion matrix for instance segmentation (Mask R-CNN from detectron2)


I've trained a Mask R-CNN on corn images (I cannot show examples because they are confidential); they are basically pictures of corn kernels scattered over a flat surface.

There are different kinds of corn kernels I want to be able to segment and classify. I understand the AP metrics are the best way of measuring the performance of an instance segmentation algorithm and I know a confusion matrix for this kind of algorithm doesn't usually make sense.

But for this specific case, where I have 4 classes of very similar objects, I would like to set a fixed IoU threshold, as in AP50/AP75, and build a confusion matrix at that threshold.

Would it be possible? How would I do it?

I used the detectron2 library to train and get predictions. Here is the code I use to load my trained model from disk, generate predictions on the validation set, and visualize the results:

import detectron2
from detectron2.utils.logger import setup_logger
setup_logger()

import numpy as np
import matplotlib.pyplot as plt
import os, json, cv2, random, gc

from detectron2 import model_zoo
from detectron2.data.datasets import register_coco_instances
from detectron2.checkpoint import DetectionCheckpointer, Checkpointer
from detectron2.data import MetadataCatalog, DatasetCatalog, build_detection_test_loader
from detectron2.engine import DefaultTrainer, DefaultPredictor
from detectron2.config import get_cfg
from detectron2.utils.visualizer import Visualizer, ColorMode
from detectron2.modeling import build_model
from detectron2.evaluation import COCOEvaluator, inference_on_dataset

train_annotations_path = "./data/cvat-corn-train-coco-1.0/annotations/instances_default.json"
train_images_path = "./data/cvat-corn-train-coco-1.0/images"
validation_annotations_path = "./data/cvat-corn-validation-coco-1.0/annotations/instances_default.json"
validation_images_path = "./data/cvat-corn-validation-coco-1.0/images"

cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))
cfg.DATASETS.TRAIN = ("train-corn",)
cfg.DATASETS.TEST = ("validation-corn",)
cfg.DATALOADER.NUM_WORKERS = 2
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml")  # Let training initialize from model zoo
cfg.SOLVER.IMS_PER_BATCH = 2
cfg.SOLVER.BASE_LR = 0.00025
cfg.SOLVER.MAX_ITER = 10000
cfg.SOLVER.STEPS = []
cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 128 
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 4
cfg.OUTPUT_DIR = "./output"
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.7

register_coco_instances(
    "train-corn",
    {},
    train_annotations_path,
    train_images_path
)
register_coco_instances(
    "validation-corn",
    {},
    validation_annotations_path,
    validation_images_path
)
metadata_train = MetadataCatalog.get("train-corn")
dataset_dicts = DatasetCatalog.get("train-corn")

cfg.MODEL.WEIGHTS = os.path.join(cfg.OUTPUT_DIR, "model_final.pth")
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.7
predictor = DefaultPredictor(cfg)

predicted_images_path = os.path.abspath("./predicted/")

dataset_dicts_validation = DatasetCatalog.get("validation-corn")
for d in dataset_dicts_validation:    
    im = cv2.imread(d["file_name"])
    outputs = predictor(im)
    v = Visualizer(im[:, :, ::-1],
                   metadata=metadata_train, 
                   scale=0.5, 
                   instance_mode=ColorMode.IMAGE_BW
    )
    out = v.draw_instance_predictions(outputs["instances"].to("cpu"))
    fig = plt.figure(frameon=False, dpi=1)
    fig.set_size_inches(1024,1024)
    ax = plt.Axes(fig, [0., 0., 1., 1.])
    ax.set_axis_off()
    fig.add_axes(ax)
    ax.imshow(cv2.cvtColor(out.get_image()[:, :, ::-1], cv2.COLOR_BGR2RGB), aspect='auto')
    fig.savefig(f"{predicted_images_path}/{d['file_name'].split('/')[-1]}")

That is what my output for a given image looks like: [screenshot of the printed outputs dictionary omitted]

It is a dictionary with an Instances object as its only value. The Instances object has four fields: pred_boxes, scores, pred_classes, and pred_masks. It can be visualized using the detectron2 Visualizer, but I can't show the visualization for confidentiality reasons.
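For reference, here is a minimal sketch (my addition, assuming predictor and im are defined as in the code above) of how those fields can be inspected:

instances = predictor(im)["instances"].to("cpu")
print(len(instances))                     # number of detected instances
print(instances.pred_boxes.tensor.shape)  # (N, 4) boxes in [x1, y1, x2, y2]
print(instances.scores.shape)             # (N,) confidence scores
print(instances.pred_classes.shape)       # (N,) predicted class indices
print(instances.pred_masks.shape)         # (N, H, W) boolean masks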

Those are the metrics I have for the model right now:

[screenshot of overall COCO metrics omitted]

And for each class: [screenshot of per-class AP omitted]

And I noticed visually that some of the kernels are being confused with other classes, especially between the classes ardido and fermentado, which is why I want to somehow be able to build a confusion matrix.

I expect the confusion matrix would look something like this: [sketch of the expected confusion matrix omitted]

EDIT: I found this repository:

https://github.com/kaanakan/object_detection_confusion_matrix

And tried to use it:

from confusion_matrix import ConfusionMatrix
cm = ConfusionMatrix(4, CONF_THRESHOLD=0.3, IOU_THRESHOLD=0.3)

for d in dataset_dicts_validation:
    img = cv2.imread(d["file_name"])
    outputs = predictor(img)
    labels = list()
    detections = list()
    for ann in d["annotations"]:
        labels.append([ann["category_id"]] + ann["bbox"])
    for coord, conf, cls in zip(
        outputs["instances"].get("pred_boxes").tensor.cpu().numpy(), 
        outputs["instances"].get("scores").cpu().numpy(), 
        outputs["instances"].get("pred_classes").cpu().numpy()
    ):
        detections.append(list(coord) + [conf] + [cls])
    cm.process_batch(np.array(detections), np.array(labels))

But the matrix I got is clearly wrong, and I'm having a hard time trying to fix it. [screenshot of the wrong confusion matrix omitted]
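One likely cause (my assumption based on the box formats involved, not something verified against that repository): its process_batch expects ground-truth rows as [class, x1, y1, x2, y2], while COCO annotations store bbox as [x, y, width, height]. Converting the labels first, for example with detectron2's BoxMode, might be all that's needed:

from detectron2.structures import BoxMode

# Sketch of the conversion; `d` is one entry of dataset_dicts_validation.
labels = list()
for ann in d["annotations"]:
    xyxy = BoxMode.convert(ann["bbox"], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS)
    labels.append([ann["category_id"]] + list(xyxy))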


Solution

  • I was able to do it; I built the confusion matrix function from scratch:

    import pandas as pd
    import torch
    from detectron2.structures import Boxes, pairwise_iou
    
    def coco_bbox_to_coordinates(bbox):
        # COCO annotations store boxes as [x, y, width, height];
        # convert them to the [x1, y1, x2, y2] format the detections use.
        out = bbox.copy().astype(float)
        out[:, 2] = bbox[:, 0] + bbox[:, 2]
        out[:, 3] = bbox[:, 1] + bbox[:, 3]
        return out
    
    def conf_matrix_calc(labels, detections, n_classes, conf_thresh, iou_thresh):
        confusion_matrix = np.zeros([n_classes + 1, n_classes + 1])
        l_classes = np.array(labels)[:, 0].astype(int)
        l_bboxs = coco_bbox_to_coordinates(np.array(labels)[:, 1:])
        # Drop low-confidence detections before splitting out boxes and
        # classes, so all arrays stay aligned with detections_matched.
        detections = np.array(detections)
        detections = detections[detections[:, 4] > conf_thresh]
        d_bboxs = detections[:, :4]
        d_classes = detections[:, -1].astype(int)
        labels_detected = np.zeros(len(labels))
        detections_matched = np.zeros(len(detections))
        for l_idx, (l_class, l_bbox) in enumerate(zip(l_classes, l_bboxs)):
            for d_idx, (d_bbox, d_class) in enumerate(zip(d_bboxs, d_classes)):
                iou = pairwise_iou(Boxes(torch.from_numpy(np.array([l_bbox]))), Boxes(torch.from_numpy(np.array([d_bbox]))))
                if iou >= iou_thresh:
                    confusion_matrix[l_class, d_class] += 1
                    labels_detected[l_idx] = 1
                    detections_matched[d_idx] = 1
        # A label that never matched any detection counts as a miss (extra
        # "null" column); a detection that never matched any label counts
        # as a false positive (extra "null" row).
        for i in np.where(labels_detected == 0)[0]:
            confusion_matrix[l_classes[i], -1] += 1
        for i in np.where(detections_matched == 0)[0]:
            confusion_matrix[-1, d_classes[i]] += 1
        return confusion_matrix
    
    n_classes = 4
    confusion_matrix = np.zeros([n_classes + 1, n_classes + 1])
    for d in dataset_dicts_validation:
        img = cv2.imread(d["file_name"])
        outputs = predictor(img)
        labels = list()
        detections = list()
        # Labels come from the ground-truth annotations and detections from
        # the predictions; zipping the two together would silently truncate
        # both lists to the shorter one.
        for ann in d["annotations"]:
            labels.append([ann["category_id"]] + ann["bbox"])
        for coord, conf, cls in zip(
            outputs["instances"].get("pred_boxes").tensor.cpu().numpy(),
            outputs["instances"].get("scores").cpu().numpy(),
            outputs["instances"].get("pred_classes").cpu().numpy(),
        ):
            detections.append(list(coord) + [conf] + [cls])
        confusion_matrix += conf_matrix_calc(np.array(labels), np.array(detections), n_classes, conf_thresh=0.5, iou_thresh=0.5)
    matrix_indexes = metadata_train.get("thing_classes") + ["null"]
    pd.DataFrame(confusion_matrix, columns=matrix_indexes, index=matrix_indexes)
    

    I built conf_matrix_calc, which computes the confusion matrix for a single image, and then executed it for every image. It took me a while to make it work, because there was a hidden problem: the labels are saved in a different format than the detections. Instead of [x1, y1, x2, y2], they are saved as [x1, y1, x2-x1, y2-y1], i.e. [x, y, width, height], and I didn't find that described anywhere in detectron's or COCO's documentation. I did find a format described as [(x1+x2)/2, (y1+y2)/2, x2-x1, y2-y1] (a center-based format), but that wasn't my case. I only figured it out because I opened the images and checked the pixel coordinates of the boxes in the labels and in the predictions, and noticed something was wrong. Anyway, now it works, and this is my result:

    [screenshot of the final confusion matrix omitted]
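    As a small follow-up (my suggestion, not part of the original answer): row-normalizing the matrix can make the class-to-class confusion, e.g. between ardido and fermentado, easier to read than raw counts:

    # Optional sketch: normalize each ground-truth row so it sums to 1,
    # guarding against empty rows to avoid division by zero.
    row_sums = confusion_matrix.sum(axis=1, keepdims=True)
    normalized = confusion_matrix / np.where(row_sums == 0, 1, row_sums)
    pd.DataFrame(normalized, columns=matrix_indexes, index=matrix_indexes).round(2)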