python nlp artificial-intelligence extract text-extraction

Python - extract information from email

I am new to Python. Below are some sample emails I received.

Email sample 1

Dear all,

Please note the Total selling volume and total remaining stock

Total selling volume: 45677 Total remaining stock A:3456

Remain at your disposal in case of any doubt or comments.

Best Regards,

Email sample 2

Dear all,

Please see the data as below:

Tol volume: 1,231,245 No. of remaining stock A: 232 No. of remaining stock B: 1,435

Email sample 3

Dear All,

Please find our volume was 233,435

Total remaining stock A: 2453

Email sample 4

In May we had 90 remaining stock A and 4190 TEUs.

I would like to extract the volume and total remaining stock figures from those emails. Are there any hints on how I can get those figures using Python?

I have prepared the code below to extract the figures from each email. However, I cannot tell which extracted figure is the total selling volume and which is the total remaining stock.

import re
import pandas as pd
import win32com.client
from datetime import datetime, timedelta

# Attach to Outlook over COM and open the "AI email testing" subfolder of the
# default Inbox (6 is Outlook's olFolderInbox constant).
outlook = win32com.client.Dispatch('outlook.application')
mapi = outlook.GetNamespace("MAPI")
inbox = mapi.GetDefaultFolder(6).Folders.Item("AI email testing")
messages = inbox.Items

# Timestamp for 24 hours ago as a formatted string. It is not used further
# down in this snippet.
received_dt = datetime.now() - timedelta(days=1)
received_dt = received_dt.strftime('%m/%d/%Y %H:%M %p')

# Integers or decimals, optionally written with comma-separated digit groups.
# Compiled once, outside the loop, from a raw string so `\d` is not treated
# as a (invalid) string escape.
figure_pattern = re.compile(r"\d+(?:,\d+)*(?:\.\d+)?")

for message in list(messages):
    body_content = message.body
    # str.find returns -1 when "Subject:" is absent; slicing from -1 would
    # keep only the last character of the body, so trim only when the marker
    # is actually present.
    subject_at = body_content.find("Subject:")
    if subject_at != -1:
        body_content = body_content[subject_at:]
    figures = figure_pattern.findall(body_content)
    print(figures)

Solution

Here's a solution using RegEx:

from __future__ import annotations

import re
from typing import List, Tuple


# A number: digits, optional comma-separated digit groups, optional decimals.
_NUMBER = r'\d+(?:,\d+)*(?:\.\d+)?'
_NUMBER_RE = re.compile(_NUMBER)

# Volume is either announced by a label ("volume:", "volume was", "TEUs.")
# that precedes the number, or given as a count directly followed by "TEU(s)".
_VOLUME_RE = re.compile(
    rf'(?:volume\s*:\s*|volume\s+was\s+|TEUs?\.?\s+)(?P<labelled>{_NUMBER})'
    rf'|(?P<counted>{_NUMBER})\s+TEUs?\b',
    re.I,
)

# "stock A: <number>", with any amount of whitespace around the colon.
_STOCK_A_LABELLED_RE = re.compile(rf'stock\s+A\s*:\s*({_NUMBER})', re.I)

# "<number> ... stock A" on a single line with no other number in between.
# The trailing \b stops the case-insensitive "A" from matching the first
# letter of words such as "at" or "available".
_STOCK_A_TRAILING_RE = re.compile(rf'({_NUMBER})[^\d\n]*?stock\s+A\b', re.I)


def get_number(text: str) -> float | int | str:
    """
    Extract the first numeric value from the input string.

    The first run of digits in `text` is located, together with any
    comma-separated digit groups and an optional decimal part. Commas are
    removed, and the value is returned as an ``int`` when it has no decimal
    part and as a ``float`` otherwise. Punctuation that merely follows the
    number (for example a sentence-ending period) is not part of the number.

    If `text` contains no digits, a notice is printed and `text` itself is
    returned unchanged.

    Parameters
    ----------
    text : str
        The string from which the numeric value should be extracted.

    Returns
    -------
    float | int | str
        The first numeric value in `text` converted to int or float,
        or original `text` if no numeric value is found.

    Examples
    --------
    Illustration of the function usage and behavior.

    >>> get_number("Hello world 123!")
    123
    >>> get_number("I have 2,200 dollars.")
    2200
    >>> get_number("No numbers here.")
    Found no numbers inside text: 'No numbers here.'
    'No numbers here.'
    >>> get_number("It is over 9000!")
    9000
    >>> get_number("The value of pi is about 3.14159.")
    3.14159
    >>> get_number("Total: 123,456,789.")
    123456789
    """
    match = _NUMBER_RE.search(text)
    if match is None:
        print(f"Found no numbers inside text: {text!r}")
        return text
    digits = match[0].replace(',', '')
    # The pattern only admits a '.' when digits follow it, so its presence
    # reliably distinguishes a decimal value from an integer.
    return float(digits) if '.' in digits else int(digits)


def extract_stock_volume_from_email(email: str) -> Tuple[int | float | str, int | float | str]:
    """
    Extract the volume and remaining stock A details from an email text.

    The volume is taken from the first occurrence of either a labelled
    figure (``volume: N``, ``volume was N``, ``TEUs. N``) or a count followed
    by ``TEU``/``TEUs`` (``N TEUs``). Remaining stock A is taken from
    ``stock A: N`` when present; otherwise from a number that precedes
    ``stock A`` on the same line with no other number in between
    (``N remaining stock A``). Matching is case-insensitive.

    Parameters
    ----------
    email : str
        Text from the email to parse.

    Returns
    -------
    volume : int | float | str
        Volume extracted from the email.
        Returns 'Volume not found' if no volume details are found.
    remaining_stock_a : int | float | str
        Remaining stock A extracted from the email.
        Returns 'Remaining stock A not found' if no stock A details are found.

    See Also
    --------
    get_number : Converts the matched digits into an int or a float.

    Examples
    --------
    >>> email_text = "The volume was 5000 TEUs. Stock A: 1000 units."
    >>> extract_stock_volume_from_email(email_text)
    (5000, 1000)
    >>> email_text = "No volume and stock data available."
    >>> extract_stock_volume_from_email(email_text)
    ('Volume not found', 'Remaining stock A not found')
    """
    # Compare against None rather than relying on truthiness so that a
    # genuine figure of 0 is reported instead of being treated as missing.
    volume_match = _VOLUME_RE.search(email)
    if volume_match is None:
        volume = 'Volume not found'
    else:
        # Exactly one of the two named groups took part in the match.
        volume = get_number(volume_match['labelled'] or volume_match['counted'])

    # Prefer the explicit "stock A: N" form; fall back to "N ... stock A".
    stock_match = _STOCK_A_LABELLED_RE.search(email)
    if stock_match is None:
        stock_match = _STOCK_A_TRAILING_RE.search(email)
    if stock_match is None:
        remaining_stock_a = 'Remaining stock A not found'
    else:
        remaining_stock_a = get_number(stock_match[1])

    return volume, remaining_stock_a


def extract_stock_volume_from_emails(
    emails: List[str],
) -> List[Tuple[int | float | str, int | float | str]]:
    """
    Parse several e-mails with `extract_stock_volume_from_email`.

    Each e-mail text is passed to `extract_stock_volume_from_email`, and the
    results are collected in the same order as the input.

    Parameters
    ----------
    emails : List[str]
        The e-mail texts to parse.

    Returns
    -------
    List[Tuple[int | float | str, int | float | str]]
        One ``(volume, remaining stock A)`` tuple per e-mail. Whatever
        `extract_stock_volume_from_email` returns for an e-mail is stored
        as-is, including its 'Volume not found' and
        'Remaining stock A not found' placeholders.

    See Also
    --------
    extract_stock_volume_from_email : Parses a single e-mail text.

    Examples
    --------
    >>> email_texts = [
    ...     "The volume was 5000 TEUs. Stock A: 1000 units.",
    ...     "No volume and stock data available.",
    ... ]
    >>> extract_stock_volume_from_emails(email_texts)
    [(5000, 1000), ('Volume not found', 'Remaining stock A not found')]
    """
    return [extract_stock_volume_from_email(message) for message in emails]

Using the above code on the e-mails you provided as example:

# The four sample e-mails from the question, one string per message.
# Each message is spelled out line by line with explicit "\n" separators.
emails = [
    # Sample 1
    "Dear all,\n"
    "\n"
    "Please note the Total selling volume and total remaining stock\n"
    "\n"
    "Total selling volume: 45677 Total remaining stock A:3456\n"
    "\n"
    "Remain at your disposal in case of any doubt or comments.\n"
    "\n"
    "Best Regards,",
    # Sample 2
    "Dear all,\n"
    "\n"
    "Please see the data as below:\n"
    "\n"
    "Tol volume: 1,231,245 No. of remaining stock A: 232 No. of remaining stock B: 1,435",
    # Sample 3
    "Dear All,\n"
    "\n"
    "Please find our volume was 233,435\n"
    "\n"
    "Total remaining stock A: 2453",
    # Sample 4
    "In May we had 90 remaining stock A and 4190 TEUs.",
]
extract_stock_volume_from_emails(emails)
# Returns one (volume, remaining stock A) tuple per e-mail:
#
# [(45677, 3456), (1231245, 232), (233435, 2453), (4190, 90)]
#  ^----^  ^--^
#  |       |
#  |       +-- Remaining stock A
#  +-- Volume

Note

Note that the function extract_stock_volume_from_email, which parses each e-mail, is not foolproof. The RegEx patterns it contains are all based on the e-mails you provided as examples. If other e-mails don't follow the same patterns as the example e-mails, additional patterns will have to be added to the extract_stock_volume_from_email function.