Search code examples
pythonopencvdeep-learningocr

Text detection & Segmentation using Python OpenCV


I'm using the algorithm below to segment sentences into words and words into characters. As you can see in the output below the letters 'S' and 'T' in the word 'STAND' are bounded together and I can't understand what i've done wrong, will be glad if you could help me guys.

2.I've already trained a model on the EMNIST letters dataset. My model can predict only one letter at a time. To proceed further, I need to extract each character box into an array of character images. Ultimately, I aim to have an array containing all of the character images. After that, I plan to use my model to predict each character individually.

Additionally, I'll need to resize each character to 28x28 pixels, as the model is trained to predict letters from images of that size. I'm having trouble with doing this.. hope you can help me

import cv2



# Preprocessing

def preProcessing(myImage):
    grayImg = cv2.cvtColor(myImage, cv2.COLOR_BGR2GRAY)
    # cv2.imshow('Gray Image', grayImg)
    # cv2.waitKey()

    ret, thresh1 = cv2.threshold(grayImg, 0, 255, cv2.THRESH_OTSU | cv2.THRESH_BINARY_INV)
    # cv2.imshow('After threshold', thresh1)
    # cv2.waitKey()

    print(f'The threshold valua applied to the image is: {ret} ')
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (18, 18))
    dilation = cv2.dilate(thresh1, horizontal_kernel, iterations=1)
    horizontal_contours, hierarchy = cv2.findContours(dilation, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
    im2 = myImage.copy()

    for cnt in horizontal_contours:
        x, y, w, h = cv2.boundingRect(cnt)
        rect = cv2.rectangle(im2, (x, y), (x + w, y + h), (255, 255, 255), 0)
    im2= seg_word(rect)
    #im2 = seg_word(rect)
    #im2=character_seg(im2)
    return im2

# Word segmentation
def seg_word(wordImage):
    # convert the input image into gray scale
    grayImg = cv2.cvtColor(wordImage, cv2.COLOR_BGR2GRAY)

    # Binarize the gray image with OTSU algorithm
    ret, thresh2 = cv2.threshold(grayImg, 0, 255, cv2.THRESH_OTSU | cv2.THRESH_BINARY_INV)
    #print(ret)

    # create a Structuring Element size of 8*10 for the vertical contouring
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (8, 10))

    # apply Dilation for once only
    dilation = cv2.dilate(thresh2, vertical_kernel, iterations=1)

    #fingd the vertical contours
    vertical_contours, hierarchy = cv2.findContours(dilation, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
    word_img = wordImage.copy()

    # Run through each contour and extract the bounding box
    for cnt in vertical_contours:
        #computes the minimum rectangle
        x, y, w, h = cv2.boundingRect(cnt)
        # Draw a rectangular from the top left to the bottom right with the
        # given Coordinates x,y and height and width
        rect = cv2.rectangle(word_img, (x, y), (x + w, y + h), (0, 255, 0), 0)
    # apply a Character Segmentation and return the output Image
    word_img= character_seg(rect)
    return word_img

# Character segmentation
def character_seg(img):
    #conver the input image int gray scale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Threshold the image
    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]

    # Apply morphological erosion to remove small artifacts
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3,5))
    eroded = cv2.erode(thresh, kernel, iterations=1)

    # Apply morphological dilation to expand the characters
    dilated = cv2.dilate(eroded, kernel, iterations=3)

    # Find contours in the image
    contours, hierarchy = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # Iterate through each contour and extract the bounding box
    for contour in contours:
            (x, y, w, h) = cv2.boundingRect(contour)
            cv2.rectangle(img, (x, y), (x + w, y + h), (255, 0, 0 ), 2)
    return  img

# Load the test image
image_path = r"C:\Users\student\Desktop\FinalProject\Flask\uploads\1_lWmB8FGf1uWT6r1TichK-Q- 
ezgif.com-webp-to-png-converter.png"
myImage = cv2.imread(image_path)
# Display the image
cv2.imshow('Text Image', myImage)
cv2.waitKey(0)

processed_img = preProcessing(myImage)
cv2.imshow('Text Image', processed_img)
cv2.waitKey(0)

enter image description here

enter image description here


Solution

  • As you can see in the output below the letters 'S' and 'T' in the word 'STAND' are bounded together and I can't understand what i've done wrong, will be glad if you could help me guys.

    The problem can be fixed.

    Change iterations=3 on line 73:

    dilated = cv2.dilate(eroded, kernel, iterations=3)  
    

    To:

    dilated = cv2.dilate(eroded, kernel, iterations=1) #Change index to 1
    

    Screenshot:

    enter image description here