Doesnt matter what i do, id ont get it done that alle the data from the xhtml files will be written in one Excel sheet. It looks like, that Python llops through all my files in the folder but as output i only get the data from the last file. Help would be great!
#!/usr/bin/python3
# Import libaries
import pandas as pd
import openpyxl
from openpyxl import load_workbook
import glob
import time
#Path to folder
path_dir: str = r"C:\Users\Moench\Desktop\r2d2\EPUB\content1\*.xhtml"
#Read files
for filename in glob.glob(path_dir):
#Assign the table data to a Pandas dataframe
dfs = open(filename, 'r')
dfs1 = pd.read_html(dfs)
#Read data
df2 = dfs1[0][['Unnamed: 0_level_0','Unnamed: 1_level_0','Unnamed: 2_level_0','Unnamed: 3_level_0','Unnamed: 4_level_0','Unnamed: 12_level_0','Unnamed: 13_level_0']]
#Print result (Looks like that it goes through all files in the folder)
# print (df2)
# Write to existing Excel-Sheet
book = load_workbook('output.xlsx')
writer = pd.ExcelWriter('output.xlsx', engine='openpyxl')
writer.book = book
ts = time.time()
df3 = df2.append(df2)
writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
df3.to_excel(writer, str(ts))
writer.save()
You are storing your data at each iteration in the same dataframe, rewriting it at each iteration, so you only have your last data (twice actually, because of df2.append(df2)
.
Here is a slightly modified version, storing each dataframe in df_list
, and using pd.concat
on this list to create df3
:
#!/usr/bin/python3
# Import libaries
import pandas as pd
import openpyxl
from openpyxl import load_workbook
import glob
import time
#Path to folder
path_dir: str = r"C:\Users\Moench\Desktop\r2d2\EPUB\content1\*.xhtml"
# Initiate list of dataframes
df_list = list()
#Read files
for filename in glob.glob(path_dir):
#Assign the table data to a Pandas dataframe
dfs = open(filename, 'r')
dfs1 = pd.read_html(dfs)
#Read data
df2 = dfs1[0][['Unnamed: 0_level_0','Unnamed: 1_level_0','Unnamed: 2_level_0','Unnamed: 3_level_0','Unnamed: 4_level_0','Unnamed: 12_level_0','Unnamed: 13_level_0']]
df_list.append(df2)
#Print result (Looks like that it goes through all files in the folder)
# print (df2)
# Write to existing Excel-Sheet
book = load_workbook('output.xlsx')
writer = pd.ExcelWriter('output.xlsx', engine='openpyxl')
writer.book = book
ts = time.time()
# Concatenate all dataframes into one
df3 = pd.concat(df_list, ignore_index=True)
writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
df3.to_excel(writer, str(ts))
writer.save()