Search code examples
python-3.xregexpython-re

Regex Error and Improvement Driving Licence Data Extraction


I am trying to extract the Name, License No., Date of Issue and Validity from an image that I processed with Pytesseract. I am fairly confused by regex, but I have gone through some documentation and code examples on the web.

This is what I have so far:

import pytesseract
import cv2
import re

import cv2

from PIL import Image
import numpy as np
import datetime

from dateutil.relativedelta import relativedelta

def driver_license(filename):
    """Run OCR on a driving-licence image and extract a few fields from it.

    Steps:
      1. Ask Tesseract's orientation detection (OSD) for the rotation angle.
      2. If the angle is non-zero, rotate the image and save it back to
         ``filename`` (the input file is overwritten in place).
      3. Convert the image to grayscale, apply dilate/erode and run OCR.
      4. Parse the recognised text for name, document number and age.

    Args:
        filename: Path of the image file to process.

    Returns:
        dict with the keys ``'firstName'``, ``'lastName'``, ``'age'`` and
        ``'documentNumber'``; a value is ``None`` when the field was not
        found. The OCR text and the dict are also printed.
    """
    osd = pytesseract.image_to_osd(cv2.imread(filename))
    rotation = re.search(r'(?<=Rotate: )\d+', osd)
    # Treat a missing "Rotate:" entry as "no rotation needed" instead of
    # failing on ``None.group``.
    angle = int(rotation.group(0)) if rotation else 0

    if angle != 0:
        # The context manager closes the source file handle before the
        # rotated copy is written back to the same path.
        with Image.open(filename) as img:
            rotated = img.rotate(360 - angle, expand=True)
        rotated.save(filename)

    # Re-read from disk so the (possibly rotated) file is what gets OCR'd.
    gray = cv2.cvtColor(cv2.imread(filename), cv2.COLOR_BGR2GRAY)

    # Dilation followed by erosion, intended as noise removal.
    # NOTE(review): a 1x1 kernel leaves the pixels unchanged; a larger
    # kernel is probably needed for this step to have any effect.
    kernel = np.ones((1, 1), np.uint8)
    gray = cv2.dilate(gray, kernel, iterations=1)
    gray = cv2.erode(gray, kernel, iterations=1)

    txt = pytesseract.image_to_string(gray)
    print(txt)

    data = _parse_license_text(txt)
    print(data)
    return data


# Spellings of the name label that are stripped from a "Name" line.
_NAME_LABELS = ('Name', 'name', 'NAME')

# (pattern, separator to remove) for the supported licence-number layouts,
# e.g. "DL-0820100052000" and "DL08 20100052000".
_DOC_NUMBER_PATTERNS = (
    (re.compile(r'[a-zA-Z][a-zA-Z]-\d{13}'), '-'),
    (re.compile(r'[a-zA-Z][a-zA-Z]\d{2} \d{11}'), ' '),
)


def _split_name(raw, labels=()):
    """Return ``(first_name, last_name)`` taken from one OCR line.

    Non-letters are replaced by spaces, every string in ``labels`` is
    removed, and the remaining words are split: the last word is the last
    name and all words before it form the first name. Returns ``('', '')``
    when no words remain.
    """
    cleaned = re.sub(r'[^a-zA-Z]+', ' ', raw)
    for label in labels:
        cleaned = cleaned.replace(label, '')
    # split() without arguments drops the empty tokens that removing a label
    # from the middle of the line would otherwise leave behind.
    words = cleaned.split()
    if not words:
        return '', ''
    return ' '.join(words[:-1]), words[-1]


def _parse_license_text(txt):
    """Extract name, document number and age from OCR text of a licence."""
    data = {
        'firstName': None,
        'lastName': None,
        'age': None,
        'documentNumber': None,
    }

    lines = txt.split('\n')
    for idx, raw in enumerate(lines):
        s = raw.strip()
        if not s:
            continue

        if re.search(r'Name|name|NAME', s):
            data['firstName'], data['lastName'] = _split_name(s, _NAME_LABELS)

        for pattern, separator in _DOC_NUMBER_PATTERNS:
            found = pattern.search(s)
            if not found:
                continue
            data['documentNumber'] = found.group().replace(separator, '')
            # Fallback: when no "Name" line has been seen yet, take the name
            # from the line that follows the licence number, if there is one.
            if not data['firstName'] and idx + 1 < len(lines):
                data['firstName'], data['lastName'] = _split_name(lines[idx + 1])

        if re.search(r'DOB|dob|Dob', s):
            # The first 4-digit group on the line is taken as the birth year.
            year = re.search(r'\d{4}', re.sub(r'[^0-9]+', ' ', s))
            if year:
                data['age'] = datetime.datetime.now().year - int(year.group())

    return data
    

I also need to extract the Validity and the Issue Date, but I have not managed to get anywhere near that. Also, I have seen that using regex can shorten the code considerably, so is there a better way to do this?

My input data is a string somewhat like this:

Transport Department Government of NCT of Delhi
Licence to Drive Vehicles Throughout India

Licence No. : DL-0820100052000 (P) R
N : PARMINDER PAL SINGH GILL

: SHRI DARSHAN SINGH GILL

DOB: 10/05/1966 BG: U
Address :

104 SHARDA APPTT WEST ENCLAVE
PITAMPURA DELHI 110034

  

Auth to Drive Date of Issue
M.CYL. 24/02/2010
LMV-NT 24/02/2010

(Holder's Sig natu re)

Issue Date : 20/05/2016
Validity(NT) : 19/05/2021 : c
Validity(T) : NA Issuing Authority
InvCarrNo : NA NWZ-I, WAZIRPUR

Or like this:

in

Transport Department Government of NCT of Delhi
Licence to Drive Vehicles Throughout India

2

   
    
   

Licence No. : DL-0320170595326 () WN
Name : AZAZ AHAMADSIDDIQUIE
s/w/D : SALAHUDDIN ALI
____... DOB: 26/12/1992 BG: O+
\ \ Address:
—.~J ~—; ROO NO-25 AMK BOYS HOSTEL, J.
— NAGAR, DELHI 110025
Auth to Drive Date of Issue
M.CYL. 12/12/2017
4 wt 4
Iseue Date: 12/12/2017 a
falidity(NT) < 2037
Validity(T) +: NA /
Inv CarrNo : NA te sntian sana

Note: In the second example the validity cannot be extracted because the OCR output for that line is garbled; I will improve the OCR later. A pointer to a simpler guide for learning regex would also be appreciated.


Solution

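  • You can use this pattern: (?<=KEY\s*:\s*)\b[^\n]+ and replace KEY with the label of the field you want to extract (Issue Date, Licence No., and so on). Because the lookbehind in this pattern is variable-width, which the built-in re module does not support, you need to use the third-party regex library.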

    Code:

    import regex
    
    text1 = """
    Transport Department Government of NCT of Delhi
    Licence to Drive Vehicles Throughout India
    
    Licence No. : DL-0820100052000 (P) R
    N : PARMINDER PAL SINGH GILL
    
    : SHRI DARSHAN SINGH GILL
    
    DOB: 10/05/1966 BG: U
    Address :
    
    104 SHARDA APPTT WEST ENCLAVE
    PITAMPURA DELHI 110034
    
    
    
    Auth to Drive Date of Issue
    M.CYL. 24/02/2010
    LMV-NT 24/02/2010
    
    (Holder's Sig natu re)
    
    Issue Date : 20/05/2016
    Validity(NT) : 19/05/2021 : c
    Validity(T) : NA Issuing Authority
    InvCarrNo : NA NWZ-I, WAZIRPUR
    """
    
    # Field labels, written as regex fragments. Raw strings keep the escapes
    # (\. and \() as regex escapes rather than invalid string escapes.
    keys = (r'Issue Date', r'Licence No\.', r'N', r'Validity\(NT\)')

    for key in keys:
        # The lookbehind anchors on "<label> :" and the match is the rest of
        # that line, starting at the first word character.
        print(regex.findall(fr"(?<={key}\s*:\s*)\b[^\n]+", text1, regex.IGNORECASE))
    
    

    Output:

    ['20/05/2016']
    ['DL-0820100052000 (P) R']
    ['PARMINDER PAL SINGH GILL']
    ['19/05/2021 : c']