Search code examples
pythonpandasziptemporary-filestabular-form

Resource Management with Python Temporary Files and Zipfiles


I needed to create a subroutine that loops through a collection of outlook messages, opens the attachments and extracts any tabular data in a zip folder to a pandas data frame. To grab the tabular data, I created a function called zip_to_dfs that accepts an outlook MailItem attachment as the argument).

#function to extract tabluar data within zip file to pandas dataframe. returns dictionary object(key=filename;value=pandas df)
import pandas as pd, zipfile, tempfile, os

def zip_to_dfs(attachment, extract_fn=None):
    #returns diciontary object with filename for key and dataframes from attached files as values
    df_objects = {}
    tmp=tempfile.TemporaryFile().name
    attachment.SaveAsFile(tmp)
    if zipfile.is_zipfile(tmp)==True:
        zf = zipfile.ZipFile(tmp)
        #below subroutine could be made to separate function (read tablular to df) to make more readable
        for file in zf.infolist():
            extension = os.path.splitext(file.filename)[1]
            if extension in ['.xls','.xlsx','.xlsm']:
                temp_df = pd.read_excel(zf.open(file.filename), header=None)
                df_objects.update({file.filename:temp_df})
            elif file.filename.endswith(".csv"):
                temp_df = pd.read_csv(zf.open(file.filename), header=None)
                df_objects.update({file.filename:temp_df})
            else:
                raise NotImplementedError('Unexpected filetype: '+str(file.filename))
    else: 
        raise NotImplementedError('Expected zip file')
    return(df_objects)

The function works as intended, but it's probably inefficient. Has anyone used the tempfile or zip file libraries? If so, do you know if Zipfile and TemporaryFile methods clean up automatically? Or are these files being left open on the disk? Do you seen any other obvious issues with this approach?

Edited Code Revision:

def zipattach_to_dfs(attachment, extract_fn=None):
    #evaluates zip file attachments and returns dictionary with file name as key and dataframes as values
    df_objects = {}
    with NamedTemporaryFile(suffix='.tmp', delete=False) as tmp:
        attachment.SaveAsFile(tmp.name)
        zf = ZipFile(tmp)
        for file in zf.infolist():
            datetime = (file.date_time)
            key = (f'{file.filename}({datetime[0]}-{datetime[1]}-{datetime[2]})')
            if isexcel(file) ==True:
                temp_df = pd.read_excel(zf.open(file.filename), header=None)
                df_objects.update({key:temp_df})
            elif file.filename.endswith(".csv"):
                temp_df = pd.read_csv(zf.open(file.filename), header=None)
                df_objects.update({key:temp_df})
            else:
                raise NotImplementedError('Unexpected filetype: '+str(file.filename))
    return (df_objects)


Solution

  • ZipFile supports the with statement too. So, here is my suggestion based on your code(s) :

    def zip_to_dfs(attachment, extract_fn=None): # extract_fn ?
        '''
        Returns a dictionary object with filename for key
        and dataframes from attached files as values.
        '''
        with NamedTemporaryFile(delete=False) as tmp:
            attachment.SaveAsFile(tmp.name)
            
            if is_zipfile(tmp):
                with ZipFile(tmp) as zf:
                    for file in zf.infolist():
                        fn, exte = file.filename.rsplit(".", 1)
                        key = (f'{fn} ({"-".join(map(str, file.date_time[:3]))})')
                        
                        if exte in {'xls', 'xlsx', 'xlsm', 'csv'}:
                            df_objects = {}
                            with zf.open(file) as zip_file:
                                if exte == 'csv':
                                    df = pd.read_csv(zip_file, header=None)
                                else:
                                    df = pd.read_excel(zip_file, header=None)
                                df_objects[key] = df
                        else:
                            raise NotImplementedError(
                                'Unexpected filetype: ' + file.filename
                            )
                    return df_objects
    

    Calling this function would look like something below :

    from win32com.client import Dispatch
    
    outlook = Dispatch("Outlook.Application").GetNamespace("MAPI")
    folder = outlook.Folders("[email protected]").Folders("Inbox")
    
    out = pd.concat(
        [v.assign(date=k) for item in folder.Items for att in item.Attachments
         if zip_to_dfs(att) for k,v in zip_to_dfs(att).items()
        ]
    )  # this will consolidate all the dfs in a single one