Search code examples
python, computer-vision, artificial-intelligence

How to transform and get coordinates/shapes from results of MMDetection?


The official demo shows that we can use the show_result(img, result, out_file='result.jpg') API to draw the results on a picture.

# Build the detector from a config file and a checkpoint, running on the CPU.
model = init_detector('configs/any-config.py', 'checkpoints/any-checkpoints.pth', device='cpu')
# Run inference on a single image file.
results = inference_detector(model, 'some_pic.png')
# The output path has to be passed as out_file=...: in MMDetection 2.x the
# third positional parameter of show_result is score_thr, not the output file.
model.show_result('some_pic.png', results, out_file='some_pic_results.png')

In the debugger I found that the format of results is a tuple that contains nested lists (list[][]). How can we get the coordinates/shapes from it?

enter image description here

Is there a more detailed description of that format, or an API that directly transforms the results into an easier-to-use JSON structure (such as the COCO dataset format)?


Solution

  • Okay, I combined several methods and ended up with a usable one. 🤣
    If you have a better way, please let me know.

    convert_polygon:

    # this method combined:
    # mmdetection.mmdet.models.detectors.base.BaseDetector.show_result
    # open-mmlab\Lib\site-packages\mmdet\core\visualization\image.py imshow_det_bboxes, draw_bboxes, draw_labels, draw_masks
    def convert_polygon(
            result,
            score_thr=0.3,
    ):
        """Convert an MMDetection 2.x inference result into labels and shapes.

        The result is unpacked and score-filtered the same way the
        visualization code does it, but the geometry is returned instead of
        being drawn.

        Args:
            result: Either a per-class list of bbox arrays (bbox-only
                detectors) or a ``(bbox_result, segm_result)`` tuple (mask
                detectors). Inside a tuple, both items may be dicts holding
                the final prediction under ``'ensemble'``, and
                ``segm_result`` may itself be a tuple whose first item holds
                the masks. Each bbox array has 4 or 5 columns; the last of 5
                columns is read as the score.
            score_thr: Instances whose score is not strictly greater than this
                value are dropped. Values <= 0 disable the filtering.

        Returns:
            dict with the keys:

            * ``'labels'``: int32 array with the class index of each instance.
            * ``'bboxes'``: array with the kept bbox rows, unchanged.
            * ``'polygons'``: list with one ``matplotlib.patches.Polygon`` per
              instance, built from the bbox corners truncated to integers.
            * ``'masks'``: ``None`` when ``result`` carries no segmentation,
              otherwise a list with, per instance, the list of ``Polygon``
              contours of its mask.
            * ``'areas'`` / ``'positions'``: always ``None``. They are kept
              for backward compatibility only: they were meant for masks
              that have no bbox, which cannot occur here because boxes,
              labels and masks are always filtered together.

        Raises:
            AssertionError: If the stacked boxes/labels/masks have unexpected
                shapes, or if ``score_thr > 0`` but the boxes carry no score
                column.
        """
        import itertools

        import numpy as np

        # Only a tuple carries (bboxes, masks). A plain list is the per-class
        # bbox list of a bbox-only detector and must not be unpacked in two.
        if isinstance(result, tuple):
            bbox_result, segm_result = result
            if isinstance(bbox_result, dict):
                # Results wrapped in dicts keep the final prediction here.
                bbox_result = bbox_result['ensemble']
                segm_result = segm_result['ensemble']
            if isinstance(segm_result, tuple):
                segm_result = segm_result[0]  # ms rcnn
        else:
            bbox_result, segm_result = result, None

        bboxes = np.vstack(bbox_result)
        # The class index of a box is the position of its array in the list.
        labels = np.concatenate([
            np.full(bbox.shape[0], i, dtype=np.int32)
            for i, bbox in enumerate(bbox_result)
        ])

        # Stays None when there is nothing to stack, so that the shape checks
        # below never run against a raw (possibly empty) nested list.
        segms = None
        if segm_result is not None and len(labels) > 0:  # non empty
            import torch

            # Flatten the per-class lists of masks into one list of masks.
            segms = list(itertools.chain.from_iterable(segm_result))
            if isinstance(segms[0], torch.Tensor):
                segms = torch.stack(segms, dim=0).detach().cpu().numpy()
            else:
                segms = np.stack(segms, axis=0)

        assert bboxes.ndim == 2, \
            f' bboxes ndim should be 2, but its ndim is {bboxes.ndim}.'
        assert labels.ndim == 1, \
            f' labels ndim should be 1, but its ndim is {labels.ndim}.'
        assert bboxes.shape[1] == 4 or bboxes.shape[1] == 5, \
            f' bboxes.shape[1] should be 4 or 5, but its {bboxes.shape[1]}.'
        assert bboxes.shape[0] <= labels.shape[0], \
            'labels.shape[0] should not be less than bboxes.shape[0].'
        assert segms is None or segms.shape[0] == labels.shape[0], \
            'segms.shape[0] and labels.shape[0] should have the same length.'

        if score_thr > 0:
            assert bboxes.shape[1] == 5
            keep = bboxes[:, -1] > score_thr
            bboxes = bboxes[keep, :]
            labels = labels[keep]
            if segms is not None:
                segms = segms[keep, ...]

        ret_polygon = []
        # A mask-capable result always yields a list, even an empty one.
        ret_mask = None if segm_result is None else []

        if len(bboxes) > 0:
            # matplotlib (and mmdet below) are only required once there is at
            # least one instance to convert.
            from matplotlib.patches import Polygon

            for bbox in bboxes:
                x1, y1, x2, y2 = bbox.astype(np.int32)[:4]
                corners = np.array([[x1, y1], [x1, y2], [x2, y2], [x2, y1]])
                ret_polygon.append(Polygon(corners))

            if segms is not None:
                from mmdet.core.mask.structures import bitmap_to_polygon

                for mask in segms:
                    contours, _ = bitmap_to_polygon(mask)
                    ret_mask.append([Polygon(c) for c in contours])

        return {'labels': labels,
                'bboxes': bboxes,
                'polygons': ret_polygon,
                'areas': None,
                'positions': None,
                'masks': ret_mask}
    

    The key part of that code:

    # Turn every instance mask in segms into the list of its contour polygons.
    # The import is loop-invariant, so it is done once up front.
    from mmdet.core.mask.structures import bitmap_to_polygon

    ret_mask = []
    for mask in segms:
        # Only the contours are used; the second return value is ignored.
        contours, _ = bitmap_to_polygon(mask)
        ret_mask.append([Polygon(c) for c in contours])
    

    test code:

    # Build the detector from config_file and checkpoint_file, on the CPU.
    model = init_detector(config_file, checkpoint_file, device='cpu')
    # Run inference on the test image.
    results = inference_detector(model, test_pic_file)
    # poly is the dict returned by convert_polygon, with the keys 'labels',
    # 'bboxes', 'polygons', 'areas', 'positions' and 'masks'.
    poly = convert_polygon(results)
    

    After converting poly to JSON, the format looks like this:

    {
        "labels": [1, 1, 2, ...],
        "bboxes": [
                [499.54632568359375, 0.0, 599.1744384765625, 332.5544128417969, 0.9999723434448242],
                ...
        ],
        "polygons": [
            [ [499.0, 0.0], [499.0, 332.0], [599.0, 332.0], [599.0, 0.0], [499.0, 0.0] ],
            ...
        ],
        "areas": null,
        "positions": null,
        "masks": [
            [
                [
                    [510.0, 0.0],
                    [509.0, 1.0],
                    [508.0, 1.0],
                    ...
                ],
                ...
            ],
            ...
        ],
    }
    

    Some fields are easy to guess.

    • labels contains the class ID of every instance.
    • The first 4 numbers of each entry in bboxes are the top-left x, top-left y, bottom-right x and bottom-right y of the rectangular bounding box. The last number is the confidence score of that instance.
    • polygons contains the same bounding-box coordinates as above, truncated to integers and written out as corner points.
    • I have no idea about areas and positions, because they were always null in my tests.
    • masks contains the contour coordinate arrays of every instance. There is only one array if the instance has no holes.

    2023-07-31 update:

    I recently worked with MMDetection again and found that its APIs have changed a lot. The most important change is that in MMDetection 3 the return type of inference_detector became DetDataSample.

    Any new updates would be pushed to this GitHub repo.