Search code examples
pythonpandasglob

Read various files and write it to one Excel sheet using Python, Pandas


Doesnt matter what i do, id ont get it done that alle the data from the xhtml files will be written in one Excel sheet. It looks like, that Python llops through all my files in the folder but as output i only get the data from the last file. Help would be great!

#!/usr/bin/python3

# Import libaries
import pandas as pd
import openpyxl
from openpyxl import load_workbook
import glob
import time

#Path to folder
path_dir: str = r"C:\Users\Moench\Desktop\r2d2\EPUB\content1\*.xhtml"

#Read files
for filename in glob.glob(path_dir):

#Assign the table data to a Pandas dataframe 
        dfs = open(filename, 'r')
        dfs1 = pd.read_html(dfs)

#Read data                                                                                                         
        df2 = dfs1[0][['Unnamed: 0_level_0','Unnamed: 1_level_0','Unnamed: 2_level_0','Unnamed: 3_level_0','Unnamed: 4_level_0','Unnamed: 12_level_0','Unnamed: 13_level_0']]
 
#Print result (Looks like that it goes through all files in the folder)
#        print (df2)

# Write to existing Excel-Sheet
book = load_workbook('output.xlsx')
writer = pd.ExcelWriter('output.xlsx', engine='openpyxl') 
writer.book = book
ts = time.time()

df3 = df2.append(df2) 
writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
df3.to_excel(writer, str(ts))
writer.save()

Solution

  • You are storing your data at each iteration in the same dataframe, rewriting it at each iteration, so you only have your last data (twice actually, because of df2.append(df2).

    Here is a slightly modified version, storing each dataframe in df_list, and using pd.concat on this list to create df3:

    #!/usr/bin/python3
    
    # Import libaries
    import pandas as pd
    import openpyxl
    from openpyxl import load_workbook
    import glob
    import time
    
    #Path to folder
    path_dir: str = r"C:\Users\Moench\Desktop\r2d2\EPUB\content1\*.xhtml"
    
    # Initiate list of dataframes
    df_list = list()
    
    #Read files
    for filename in glob.glob(path_dir):
    
    #Assign the table data to a Pandas dataframe 
            dfs = open(filename, 'r')
            dfs1 = pd.read_html(dfs)
    
    #Read data                                                                                                         
            df2 = dfs1[0][['Unnamed: 0_level_0','Unnamed: 1_level_0','Unnamed: 2_level_0','Unnamed: 3_level_0','Unnamed: 4_level_0','Unnamed: 12_level_0','Unnamed: 13_level_0']]
           
            df_list.append(df2)
     
    #Print result (Looks like that it goes through all files in the folder)
    #        print (df2)
    
    # Write to existing Excel-Sheet
    book = load_workbook('output.xlsx')
    writer = pd.ExcelWriter('output.xlsx', engine='openpyxl') 
    writer.book = book
    ts = time.time()
    
    # Concatenate all dataframes into one
    df3 = pd.concat(df_list, ignore_index=True) 
    
    writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
    df3.to_excel(writer, str(ts))
    writer.save()