Search code examples
pythonregexemailpdfminer

how to extract email from pdf


I'm trying to extract an email address from a CV (a PDF file) using pdfminer and regular expressions:

from io import StringIO
from pdfminer3.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer3.converter import TextConverter
from pdfminer3.layout import LAParams
from pdfminer3.pdfpage import PDFPage
import re

def get_cv_email(self, cv_path):
    """Return the first email-like string found in a PDF file.

    The text of the PDF is extracted with pdfminer into an in-memory
    buffer and then searched with a regular expression.

    Args:
        self: Unused in the body; presumably this function is meant to
            live on a class -- confirm against the caller.
        cv_path: Path of the PDF file to open (opened in binary mode).

    Returns:
        The first substring that looks like ``name@domain``, or ``None``
        when the extracted text contains no such substring.
    """
    # Empty set: no specific page numbers requested.
    # NOTE(review): pdfminer appears to treat this as "all pages" -- confirm.
    pagenums = set()
    manager = PDFResourceManager()

    with StringIO() as output:
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            # The context manager closes the file even if parsing fails.
            with open(cv_path, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            # Always release the converter, including on error.
            converter.close()
        # Read the text before the buffer is closed by the `with` block.
        text = output.getvalue()

    match = re.search(r'[\w\.-]+@[\w\.-]+', text)
    # re.search returns None when nothing matches; calling .group() on it
    # would raise AttributeError, so report "no email" explicitly.
    if match is None:
        return None
    return match.group(0)

The email is successfully extracted from most of the resumes, but it doesn't work correctly every time.

Example : [email protected]

UPDATE: How can I edit my regex to ignore whatever comes after the email if it starts with an uppercase letter?


Solution

  • Based on your last comment: to match the email as you were doing, but stop as soon as an uppercase letter appears after the @, you can use this regex:

    [\w\.-]+@[a-z0-9\.-]+
    

    With an example:

    import re

    # The sample address on the page was redacted to "[email protected]", which
    # contains no "@" and therefore cannot match. An illustrative address
    # followed directly by a capitalised word is used so the snippet runs.
    text = "john.doe@example.comSkills"
    # The character class after "@" only allows lowercase letters, digits,
    # dots and hyphens, so the match stops at the first uppercase letter.
    match = re.search(r'[\w\.-]+@[a-z0-9\.-]+', text)
    # re.search returns None when nothing matches; guard before .group().
    email = match.group(0) if match else None

    print(email)
    # john.doe@example.com