Search code examples
pythonopencvtesseract

How to get data from chart image while preserving order?


I have a few images like these:

Image 1:

Image 1

Image 2:

Image 2

I can extract the names and roles from these images using an OCR tool like Tesseract from Python, but I want to preserve the hierarchy along the way.

Please provide some interesting ways to solve this problem. I am not able to think of one proper approach to the problem.

# NOTE(review): `Image` is not imported in this snippet; presumably it is
# PIL's `Image` module (`from PIL import Image`) - confirm.
import pytesseract  # must be imported before it is used below

# This returns the strings to me.
print(pytesseract.image_to_string(Image.open('test.png')))

I want the information in a structured form, in any data type, something like the following.

For example, in chart 1, Erica is the CEO, and under Erica are Russell, David Work, and so on. 


Solution

  • Visualization of results:

    visualization of results

    Approach:

    • box borders and connecting lines ("nets") have certain color/brightness
    • work with masks, lists of contours, label maps
    • calculate overlap/intersection
    • at overlap, check what box and what net participate

    Written for the one specific image you provided. The other one is too low-resolution.

    For less favorable input data, this would need adapting. That shouldn't be difficult though. Just different thresholds and whatnot.

    This should also already work with non-box nodes (e.g. circles/ellipses).

    You can figure out the OCR part. This approach and code give you the individual boxes that you can pass to OCR.

    Output:

    net 1 connects boxes [1, 2]
    net 2 connects boxes [2, 3, 4, 5]
    net 3 connects boxes [4, 6, 7, 8, 9, 10]
    net 4 connects boxes [6, 11, 16, 21]
    net 5 connects boxes [7, 12, 17, 22]
    net 6 connects boxes [8, 13, 18, 23]
    net 7 connects boxes [9, 14, 19, 24]
    net 8 connects boxes [10, 15, 20, 25]
    

    Code:

    #!/usr/bin/env python3
    
    import os
    import sys
    from math import *
    import numpy as np
    import cv2 as cv
    import pprint
    import collections
    
    # Array printing only: fixed-point notation instead of scientific, and
    # up to 120 characters per line before wrapping.
    np.set_printoptions(suppress=True, linewidth=120)
    
    ########################################################################
    
    # Load as grayscale and scale to floats in [0, 1], so that the brightness
    # thresholds further down can be written as fractions.
    image_path = "9Rsju.jpg"
    im = cv.imread(image_path, cv.IMREAD_GRAYSCALE)
    if im is None:
        # cv.imread reports a missing or unreadable file by returning None
        # rather than raising; fail here with a clear message instead of an
        # opaque TypeError on the division below.
        raise FileNotFoundError(f"could not read image: {image_path}")
    im = im / np.float32(255)
    
    # optional downscale. easier to display.
    im = cv.resize(im, fx=0.5, fy=0.5, dsize=None, interpolation=cv.INTER_AREA)
    
    # generally, indices are 0..(N-1) and labels are 1..N and always (index+1) == label
    
    ########################################################################
    # boxes
    
    # Box borders are the dark pixels: keep everything at or below 50% brightness
    # as a 0/255 uint8 mask.
    boxes_mask = np.where(im <= 0.50, np.uint8(255), np.uint8(0))
    # one dilation pass with the default kernel, to cover marginal pixels
    boxes_mask = cv.dilate(boxes_mask, None, iterations=1)
    
    # Outermost contours only, one per connected blob in the mask.
    (boxes_contours, _) = cv.findContours(
        boxes_mask, cv.RETR_EXTERNAL, cv.CHAIN_APPROX_SIMPLE)
    
    numboxes = len(boxes_contours)
    
    # sort contours vertically
    def valuation(contour):
        """Return the sort key for a contour: bounding-box y plus x / 10.

        The y coordinate dominates, so contours are ordered roughly
        top-to-bottom; the down-weighted x term orders boxes that sit at a
        similar height from left to right.
        """
        (x, y, w, h) = cv.boundingRect(contour)
        return y + x / 10 # "objective function"
    
    # not vital, just pretty
    # Recent OpenCV Python bindings return the contours as a tuple, which has
    # no .sort(); sorted() accepts tuple or list and always yields a list.
    boxes_contours = sorted(boxes_contours, key=valuation)
    
    # Bounding rectangles (x, y, w, h), in the same order as boxes_contours.
    boxes_rects = [cv.boundingRect(c) for c in boxes_contours]
    
    def bottom_of_rect(rect):
        """Return the (x, y) midpoint of the bottom edge of an (x, y, w, h) rect."""
        left, top, width, height = rect
        mid_x = left + width // 2
        return (mid_x, top + height)
    
    def top_of_rect(rect):
        """Return the (x, y) midpoint of the top edge of an (x, y, w, h) rect."""
        left, top, width, _height = rect
        mid_x = left + width // 2
        return (mid_x, top)
    
    # boxes mask (fill inside of box)
    cv.drawContours(
        boxes_mask,
        boxes_contours,
        contourIdx=-1,
        color=255,
        thickness=cv.FILLED)
    
    # label map for boxes: each box's filled area holds its label (index + 1),
    # everything else stays 0.
    # int32 instead of uint8 so that more than 255 boxes still get distinct
    # labels; this is also the dtype cv.connectedComponents uses for the nets.
    boxes_labels = np.zeros(boxes_mask.shape, dtype=np.int32)
    for (i, _contour) in enumerate(boxes_contours):
        box_label = i+1
        cv.drawContours(
            boxes_labels,
            boxes_contours,
            contourIdx=i,
            color=box_label,
            thickness=cv.FILLED)
    
    
    ########################################################################
    # connecting lines ("nets")
    
    # select pixels in the brightness band of the connecting lines
    in_band = (im >= 0.7) & (im <= 0.9) # 0.8 + margin
    nets_mask = np.where(in_band, np.uint8(255), np.uint8(0))
    # remove everything that belongs to a (filled) box
    nets_mask = nets_mask & ~boxes_mask
    
    # create slight overlap
    nets_mask = cv.dilate(nets_mask, None, iterations=1)
    
    # label map; label 0 is the background, so the returned count includes it
    (numnets, nets_labels) = cv.connectedComponents(nets_mask)
    
    
    ########################################################################
    # connectivity
    
    # Pixels where a (dilated) net touches a box.
    overlap = nets_mask & boxes_mask
    (y,x) = overlap.nonzero() # pixel coordinates
    
    # Unique (net label, box label) pairs seen at the overlap pixels.
    # .tolist() turns the NumPy scalars into plain Python ints, so the printed
    # lists read "[1, 2]" and not "[np.uint8(1), np.uint8(2)]" under NumPy 2.
    relations = set(zip(nets_labels[y,x].tolist(), boxes_labels[y,x].tolist()))
    
    # net label -> set of box labels attached to that net
    by_net = collections.defaultdict(set)
    for (net,box) in relations:
        by_net[net].add(box)
    
    for net_label, boxes in sorted(by_net.items()):
        print(f"net {net_label} connects boxes", sorted(boxes))
    
    
    ########################################################################
    # visualization
    
    # Half-brightness BGR copy of the input to draw on.
    canvas = cv.cvtColor(im * 0.5, cv.COLOR_GRAY2BGR)
    
    # For every net, draw a red line from the bottom of its lowest-labelled box
    # to the top of each other box on the same net.
    for net_label, boxes in sorted(by_net.items()):
        ordered = sorted(boxes)
        headbox_label = ordered[0]
        headbox_center = bottom_of_rect(boxes_rects[headbox_label-1])
    
        for box_label in ordered[1:]:
            box_center = top_of_rect(boxes_rects[box_label-1])
            cv.line(canvas,
                pt1=headbox_center,
                pt2=box_center,
                color=(0,0,255),
                thickness=2)
    
    # Outline every box in white and write its label inside it.
    for i, (x, y, w, h) in enumerate(boxes_rects):
        cv.rectangle(canvas,
            pt1=(x,y), pt2=(x+w,y+h),
            color=(255,255,255),
            thickness=2)
        cv.putText(canvas,
            text=f"Box {i + 1}",
            org=(x+10, y+40),
            fontFace=cv.FONT_HERSHEY_SIMPLEX,
            fontScale=1,
            color=(255,255,255),
            thickness=2)
    
    
    ########################################################################
    # display
    
    # Show the annotated canvas in a resizable window until a key is pressed.
    window_name = "canvas"
    cv.namedWindow(window_name, cv.WINDOW_NORMAL)
    cv.imshow(window_name, canvas)
    
    cv.waitKey(-1)
    cv.destroyAllWindows()