How to extract an email address from a PDF

I'm trying to extract an email address from a CV (a PDF file) using pdfminer and regular expressions:

from io import StringIO
from pdfminer3.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer3.converter import TextConverter
from pdfminer3.layout import LAParams
from pdfminer3.pdfpage import PDFPage
import re

def get_cv_email(self, cv_path):
    """Return the first email-like string found in the text of a PDF.

    The PDF at ``cv_path`` is converted to plain text with pdfminer and the
    text is searched with a simple ``local@domain`` regular expression.

    Args:
        self: Unused by the body; kept so existing callers keep working.
        cv_path: Path of the PDF file to read (opened in binary mode).

    Returns:
        The first substring matching the pattern, or ``None`` when the
        extracted text contains no match.
    """
    # An empty set is passed as the page-number filter.
    # NOTE(review): presumably this selects every page -- confirm against
    # the pdfminer ``PDFPage.get_pages`` documentation.
    pagenums = set()
    manager = PDFResourceManager()

    # ``with`` / ``finally`` guarantee the buffer, the converter and the file
    # are released even if pdfminer raises while parsing a page.
    with StringIO() as output:
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            with open(cv_path, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        # Read the text before the buffer is closed by the ``with`` block.
        text = output.getvalue()

    # ``\w`` also matches uppercase letters, so text glued directly after the
    # address (no whitespace in between) is captured as part of the match.
    match = re.search(r'[\w\.-]+@[\w\.-]+', text)
    if match is None:
        # No address in the document: report that instead of failing with
        # AttributeError on ``None.group``.
        return None
    return match.group(0)

The email is successfully extracted for most of the resumes, but it doesn't work correctly all the time.

Example of an incorrect result: jayantanathcdh@gmail.comEducationalQualification

UPDATE: How can I edit my regex to ignore whatever comes after the email if it starts with an uppercase letter?

Solution

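Based on your last comment, to keep matching the email as before but stop at the first uppercase letter after the @, you can use this regex (the domain part only allows lowercase letters, digits, dots and hyphens, so the match ends before "Educational"):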

[\w\.-]+@[a-z0-9\.-]+

With an example:

import re
# Sample text: an address immediately followed by a capitalised heading.
text = "jayantanathcdh@gmail.comEducationalQualification"

# The domain part accepts only lowercase letters, digits, dots and hyphens,
# so the match stops right before the first uppercase letter.
match = re.compile(r'[\w\.-]+@[a-z0-9\.-]+').search(text)
email = match[0]

print(email)
# Output: jayantanathcdh@gmail.com