python computer-vision artificial-intelligence

How to transform and get coordinates/shapes from results of MMDetection?

The official demo shows that we can use the show_result(img, result, out_file='result.jpg') API to draw the results on a picture.

# Build the detector from a config/checkpoint pair and run it on one image.
model = init_detector('configs/any-config.py', 'checkpoints/any-checkpoints.pth', device='cpu')
results = inference_detector(model, 'some_pic.png')
# The output path has to be passed by keyword: in MMDetection 2.x the third
# positional parameter of show_result is score_thr, not out_file.
model.show_result('some_pic.png', results, out_file='some_pic_results.png')

In the debugger I found that results is a tuple containing nested lists (list[][]). How can we get the coordinates/shapes out of it?

Is there a more detailed description of that format, or an API that directly transforms the results into an easier-to-use JSON structure (such as the COCO dataset format)?

Solution

Okay, I combined several existing methods and ended up with a usable one. 🤣
If you guys have a better way please let me know.

convert_polygon:

# This code is adapted from MMDetection 2.x:
#   mmdet.models.detectors.base.BaseDetector.show_result
#   mmdet/core/visualization/image.py: imshow_det_bboxes, draw_bboxes,
#   draw_labels, draw_masks
def _bboxes_to_polygons(bboxes):
    """Return one rectangular matplotlib ``Polygon`` per row of *bboxes*."""
    if len(bboxes) == 0:
        # Nothing to build, so matplotlib does not have to be importable.
        return []

    import numpy as np
    from matplotlib.patches import Polygon

    polygons = []
    for bbox in bboxes:
        # Coordinates are truncated to integers, as the drawing code does.
        x1, y1, x2, y2 = bbox.astype(np.int32)[:4]
        corners = np.array([[x1, y1], [x1, y2], [x2, y2], [x2, y1]])
        polygons.append(Polygon(corners))
    return polygons


def _masks_to_polygons(segms):
    """Return, for each instance mask, the list of its contour polygons."""
    if len(segms) == 0:
        return []

    from matplotlib.patches import Polygon
    from mmdet.core.mask.structures import bitmap_to_polygon

    mask_polygons = []
    for mask in segms:
        # The second return value (hole flag) is not needed here.
        contours, _ = bitmap_to_polygon(mask)
        mask_polygons.append([Polygon(contour) for contour in contours])
    return mask_polygons


def convert_polygon(
        result,
        score_thr=0.3,

):
    """Convert a raw ``inference_detector`` result into plain shapes.

    Args:
        result: Either a list with one ``(n, 5)`` bbox array per class
            (bbox-only detectors) or a ``(bbox_result, segm_result)`` tuple.
            Results wrapped in a dict are read through their ``'ensemble'``
            key (presumably the cascade-style detectors; verify against the
            model in use).
        score_thr: Detections whose score (last bbox column) is not greater
            than this value are dropped. Values ``<= 0`` disable filtering.

    Returns:
        dict with the keys
        ``labels``    - 1-D int array, class index of every kept instance;
        ``bboxes``    - 2-D array, one row per kept instance;
        ``polygons``  - list of rectangular ``Polygon`` objects, one per bbox;
        ``masks``     - list (one entry per instance) of lists of contour
                        ``Polygon`` objects, or ``None`` when the result has
                        no masks or contains no detections at all;
        ``areas`` / ``positions`` - area and centroid of the largest connected
                        component of masks that have no matching bbox, else
                        ``None``.

    Raises:
        AssertionError: if the stacked arrays do not have the expected shape.
    """
    import itertools

    import numpy as np

    # Unwrap the different result layouts. The tuple check comes first so
    # that a bbox-only list with any number of classes is never unpacked.
    if isinstance(result, dict):
        result = result['ensemble']
    if isinstance(result, tuple):
        bbox_result, segm_result = result
        if isinstance(bbox_result, dict):
            bbox_result = bbox_result['ensemble']
            segm_result = segm_result['ensemble']
        if isinstance(segm_result, tuple):
            segm_result = segm_result[0]  # ms rcnn
    else:
        bbox_result, segm_result = result, None

    # Flatten the per-class lists; the class index becomes the label.
    bboxes = np.vstack(bbox_result)
    labels = np.concatenate([
        np.full(bbox.shape[0], i, dtype=np.int32)
        for i, bbox in enumerate(bbox_result)
    ])

    # Masks stay None unless there is at least one detection; otherwise the
    # shape checks below would be applied to a list of empty lists.
    segms = None
    if segm_result is not None and len(labels) > 0:  # non empty
        segms = list(itertools.chain.from_iterable(segm_result))
        import torch
        if isinstance(segms[0], torch.Tensor):
            segms = torch.stack(segms, dim=0).detach().cpu().numpy()
        else:
            segms = np.stack(segms, axis=0)

    assert bboxes.ndim == 2, \
        f' bboxes ndim should be 2, but its ndim is {bboxes.ndim}.'
    assert labels.ndim == 1, \
        f' labels ndim should be 1, but its ndim is {labels.ndim}.'
    assert bboxes.shape[1] == 4 or bboxes.shape[1] == 5, \
        f' bboxes.shape[1] should be 4 or 5, but its {bboxes.shape[1]}.'
    assert bboxes.shape[0] <= labels.shape[0], \
        'labels.shape[0] should not be less than bboxes.shape[0].'
    assert segms is None or segms.shape[0] == labels.shape[0], \
        'segms.shape[0] and labels.shape[0] should have the same length.'

    if score_thr > 0:
        # Filtering needs the score column.
        assert bboxes.shape[1] == 5
        keep = bboxes[:, -1] > score_thr
        bboxes = bboxes[keep, :]
        labels = labels[keep]
        if segms is not None:
            segms = segms[keep, ...]

    num_bboxes = bboxes.shape[0]
    ret_polygon = _bboxes_to_polygons(bboxes)
    ret_label = labels[:num_bboxes]

    ret_mask = None
    ret_area = None
    ret_position = None
    if segms is not None:
        ret_mask = _masks_to_polygons(segms)

        # Kept from imshow_det_bboxes: masks without a matching bbox are
        # described by their largest connected component instead.
        if num_bboxes < segms.shape[0]:
            import cv2

            areas = []
            positions = []
            for mask in segms[num_bboxes:]:
                _, _, stats, centroids = cv2.connectedComponentsWithStats(
                    mask.astype(np.uint8), connectivity=8)
                # Row 0 is the background component, so skip it.
                largest_id = np.argmax(stats[1:, -1]) + 1
                positions.append(centroids[largest_id])
                areas.append(stats[largest_id, -1])
            ret_area = np.stack(areas, axis=0)
            ret_position = positions

    return {'labels': ret_label,
            'bboxes': bboxes,
            'polygons': ret_polygon,
            'areas': ret_area,
            'positions': ret_position,
            'masks': ret_mask}

The key part of that code:

# Turn every instance bitmap into the list of its contour polygons.
from mmdet.core.mask.structures import bitmap_to_polygon

ret_mask = []
for mask in segms:
    # bitmap_to_polygon also returns a hole flag, which is ignored here.
    contours, _ = bitmap_to_polygon(mask)
    ret_mask.append([Polygon(contour) for contour in contours])

test code:

# Load the detector, run it on a single image, then convert the raw result.
model = init_detector(config_file, checkpoint_file, device='cpu')
results = inference_detector(model, test_pic_file)
# poly is a dict with the keys 'labels', 'bboxes', 'polygons', 'areas',
# 'positions' and 'masks'.
poly = convert_polygon(results)

After converting poly into json, format would be like this:

{
    "labels": [1, 1, 2, ...],
    "bboxes": [
            [499.54632568359375, 0.0, 599.1744384765625, 332.5544128417969, 0.9999723434448242],
            ...
    ],
    "polygons": [
        [ [499.0, 0.0], [499.0, 332.0], [599.0, 332.0], [599.0, 0.0], [499.0, 0.0] ],
        ...
    ],
    "areas": null,
    "positions": null,
    "masks": [
        [
            [
                [510.0, 0.0],
                [509.0, 1.0],
                [508.0, 1.0],
                ...
            ],
            ...
        ],
        ...
    ],
}

Some fields are easy to guess.

labels holds the class id of every instance.
The first 4 numbers of each bboxes entry are the top-left x, top-left y, bottom-right x and bottom-right y of the rectangular bounding box. The last number is the confidence score of that instance.
polygons contains the same box coordinates as above, as integer-valued rectangles.
I have no idea about areas and positions, because they were always null when testing.
masks contains the contour coordinate arrays of each instance. There is only one array if the instance has no holes.

2023-07-31 update:

Recently I worked with MMDetection again and found that its APIs have changed a lot. The most important change is that in MMDetection 3 the return type of inference_detector became DetDataSample.

Any new updates would be pushed to this GitHub repo.