Search code examples
pandasopenpyxlxlsxpython-3.9

Export pandas dataframe to xlsx: dealing with the openpyxl issue on python 3.9


Using the latest packages version: openpyxl: 3.0.6 | pandas: 1.2.3 |python: 3.9

The function below was working fine before updating the packages above to the latest version reported.

Now it raises the error: "zipfile.BadZipFile: File is not a zip file".

Such function is really useful and would be great to know if it can be fixed in order to work.

The function below can be run as it is, just replace "pathExport" to your export directory for testing.

    def append_df_to_excel(filename, df, sheet_name='Sheet1', startrow=None,
                           truncate_sheet=False, 
                           **to_excel_kwargs):
        """
        Append a DataFrame [df] to existing Excel file [filename]
        into [sheet_name] Sheet.
        If [filename] doesn't exist, then this function will create it.
    
        Parameters:
          filename : File path or existing ExcelWriter
                     (Example: '/path/to/file.xlsx')
          df : dataframe to save to workbook
          sheet_name : Name of sheet which will contain DataFrame.
                       (default: 'Sheet1')
          startrow : upper left cell row to dump data frame.
                     Per default (startrow=None) calculate the last row
                     in the existing DF and write to the next row...
          truncate_sheet : truncate (remove and recreate) [sheet_name]
                           before writing DataFrame to Excel file
          to_excel_kwargs : arguments which will be passed to `DataFrame.to_excel()`
                            [can be dictionary]
    
        Returns: None
    
        (c) [MaxU](https://stackoverflow.com/users/5741205/maxu?tab=profile)
        """
        from openpyxl import load_workbook
    
        # ignore [engine] parameter if it was passed
        if 'engine' in to_excel_kwargs:
            to_excel_kwargs.pop('engine')
    
        writer = pd.ExcelWriter(filename, engine='openpyxl')
    
        # Python 2.x: define [FileNotFoundError] exception if it doesn't exist 
        try:
            FileNotFoundError
        except NameError:
            FileNotFoundError = IOError
    
    
        try:
            # try to open an existing workbook
            writer.book = load_workbook(filename)
            
            # get the last row in the existing Excel sheet
            # if it was not specified explicitly
            if startrow is None and sheet_name in writer.book.sheetnames:
                startrow = writer.book[sheet_name].max_row
    
            # truncate sheet
            if truncate_sheet and sheet_name in writer.book.sheetnames:
                # index of [sheet_name] sheet
                idx = writer.book.sheetnames.index(sheet_name)
                # remove [sheet_name]
                writer.book.remove(writer.book.worksheets[idx])
                # create an empty sheet [sheet_name] using old index
                writer.book.create_sheet(sheet_name, idx)
            
            # copy existing sheets
            writer.sheets = {ws.title:ws for ws in writer.book.worksheets}
        except FileNotFoundError:
            # file does not exist yet, we will create it
            pass
    
        if startrow is None:
            startrow = 0
    
        # write out the new sheet
        df.to_excel(writer, sheet_name, startrow=startrow, **to_excel_kwargs)
    
        # save the workbook
        writer.save()

pathExport = r"F:\PYTHON\NB-Suite_python39\MNE\outputData\df.xlsx"
df1 = pd.DataFrame({'numbers': [1, 2, 3],
                    'colors': ['red', 'white', 'blue'],
                    'colorsTwo': ['yellow', 'white', 'blue']
                    })

append_df_to_excel(pathExport, df1, sheet_name="DF1", index=False, startcol=0, startrow=0)

Solution

  • The solution is the following:

    import pandas as pd
    
    def append_df_to_excel(filename, df, sheet_name='Sheet1', startrow=None, startcol=None,
        truncate_sheet=False, resizeColumns=True, na_rep = 'NA', **to_excel_kwargs):
        """
        Append a DataFrame [df] to existing Excel file [filename]
        into [sheet_name] Sheet.
        If [filename] doesn't exist, then this function will create it.
    
        Parameters:
          filename : File path or existing ExcelWriter
                     (Example: '/path/to/file.xlsx')
          df : dataframe to save to workbook
          sheet_name : Name of sheet which will contain DataFrame.
                       (default: 'Sheet1')
          startrow : upper left cell row to dump data frame.
                     Per default (startrow=None) calculate the last row
                     in the existing DF and write to the next row...
          truncate_sheet : truncate (remove and recreate) [sheet_name]
                           before writing DataFrame to Excel file
    
          resizeColumns: default = True . It resize all columns based on cell content width
          to_excel_kwargs : arguments which will be passed to `DataFrame.to_excel()`
                            [can be dictionary]
          na_rep: default = 'NA'. If, instead of NaN, you want blank cells, just edit as follows: na_rep=''
    
    
        Returns: None
    
        *******************
    
        CONTRIBUTION:
        Current helper function generated by [Baggio]: https://stackoverflow.com/users/14302009/baggio?tab=profile
        Contributions to the current helper function: https://stackoverflow.com/users/4046632/buran?tab=profile
        Original helper function: (c) [MaxU](https://stackoverflow.com/users/5741205/maxu?tab=profile)
    
    
        Features of the new helper function:
        1) Now it works with python 3.9 and latest versions of pandas and openpxl
        ---> Fixed the error: "zipfile.BadZipFile: File is not a zip file".
        2) Now It resize all columns based on cell content width AND all variables will be visible (SEE "resizeColumns")
        3) You can handle NaN,  if you want that NaN are displayed as NaN or as empty cells (SEE "na_rep")
        4) Added "startcol", you can decide to start to write from specific column, oterwise will start from col = 0
    
        *******************
    
    
    
        """
        from openpyxl import load_workbook
        from string import ascii_uppercase
        from openpyxl.utils import get_column_letter
        from openpyxl import Workbook
    
        # ignore [engine] parameter if it was passed
        if 'engine' in to_excel_kwargs:
            to_excel_kwargs.pop('engine')
    
        try:
            f = open(filename)
            # Do something with the file
        except IOError:
            # print("File not accessible")
            wb = Workbook()
            ws = wb.active
            ws.title = sheet_name
            wb.save(filename)
    
        writer = pd.ExcelWriter(filename, engine='openpyxl', mode='a')
    
    
        # Python 2.x: define [FileNotFoundError] exception if it doesn't exist
        try:
            FileNotFoundError
        except NameError:
            FileNotFoundError = IOError
    
    
        try:
            # try to open an existing workbook
            writer.book = load_workbook(filename)
    
            # get the last row in the existing Excel sheet
            # if it was not specified explicitly
            if startrow is None and sheet_name in writer.book.sheetnames:
                startrow = writer.book[sheet_name].max_row
    
            # truncate sheet
            if truncate_sheet and sheet_name in writer.book.sheetnames:
                # index of [sheet_name] sheet
                idx = writer.book.sheetnames.index(sheet_name)
                # remove [sheet_name]
                writer.book.remove(writer.book.worksheets[idx])
                # create an empty sheet [sheet_name] using old index
                writer.book.create_sheet(sheet_name, idx)
    
            # copy existing sheets
            writer.sheets = {ws.title:ws for ws in writer.book.worksheets}
        except FileNotFoundError:
            # file does not exist yet, we will create it
            pass
    
        if startrow is None:
            # startrow = -1
            startrow = 0
    
        if startcol is None:
            startcol = 0
    
        # write out the new sheet
        df.to_excel(writer, sheet_name, startrow=startrow, startcol=startcol, na_rep=na_rep, **to_excel_kwargs)
    
    
        if resizeColumns:
    
            ws = writer.book[sheet_name]
    
            def auto_format_cell_width(ws):
                for letter in range(1,ws.max_column):
                    maximum_value = 0
                    for cell in ws[get_column_letter(letter)]:
                        val_to_check = len(str(cell.value))
                        if val_to_check > maximum_value:
                            maximum_value = val_to_check
                    ws.column_dimensions[get_column_letter(letter)].width = maximum_value + 2
    
            auto_format_cell_width(ws)
    
        # save the workbook
        writer.save()
    

    Example Usage:

    # Create a sample dataframe
    df = pd.DataFrame({'numbers': [1, 2, 3],
                        'colors': ['red', 'white', 'blue'],
                        'colorsTwo': ['yellow', 'white', 'blue'],
                        'NaNcheck': [float('NaN'), 1, float('NaN')],
                        })
    
    # EDIT YOUR PATH FOR THE EXPORT 
    filename = r"C:\DataScience\df.xlsx" 
    
    # RUN ONE BY ONE IN ROW THE FOLLOWING LINES, TO SEE THE DIFFERENT UPDATES TO THE EXCEL FILE
    append_df_to_excel(filename, df, index=False, startrow=0) # Basic Export of df in default sheet (Sheet1)
    append_df_to_excel(filename, df, sheet_name="Cool", index=False, startrow=0) # Append the sheet "Cool" where "df" is written
    append_df_to_excel(filename, df, sheet_name="Cool", index=False) # Append another "df" to the sheet "Cool", just below the other "df" instance
    append_df_to_excel(filename, df, sheet_name="Cool", index=False, startrow=0, startcol=5) # Append another "df" to the sheet "Cool" starting from col 5
    append_df_to_excel(filename, df, index=False, truncate_sheet=True, startrow=10, na_rep = '') # Override (truncate) the "Sheet1", writing the df from row 10, and showing blank cells instead of NaN