I have a spreadsheet which is in format of Microsoft Excel 97-2003 xls. I tried following:
import pandas as pd
xlsx_file_path = "C:/temp/a_file.xls"
sheets_dict = pd.read_excel(xlsx_file_path, engine='xlrd', sheet_name=None)
for sheet_name, df_in in sheets_dict.items():
print(sheet_name)
It gives error:
File C:\xxxxxx\site-packages\xlrd\__init__.py:172 in open_workbook
bk = open_workbook_xls(
File C:\xxxxxxx\site-packages\xlrd\book.py:79 in open_workbook_xls
biff_version = bk.getbof(XL_WORKBOOK_GLOBALS)
File C:\xxxxxxxx\site-packages\xlrd\book.py:1284 in getbof
bof_error('Expected BOF record; found %r' % self.mem[savpos:savpos+8])
File C:\xxxxxxxx\site-packages\xlrd\book.py:1278 in bof_error
raise XLRDError('Unsupported format, or corrupt file: ' + msg)
XLRDError: Unsupported format, or corrupt file: Expected BOF record; found b'\xef\xbb\xbf\xef\xbb\xbf<?'
I tried other engines such as openpyxl, and got following error:
File C:\xxxx\lib\zipfile.py:1336 in _RealGetContents
raise BadZipFile("File is not a zip file")
BadZipFile: File is not a zip file
Is there any workaround?
The XLS file is a birany file that is not in the Zipped file format. Because that you have an ZIP error when you're using the openpyxl engine. You can leave the engine empty and the pandas select to you.
Now checking the Microsoft Excel 97-2003 XML spreadsheet problem, I've developed an reader based on this documentation https://en.wikipedia.org/wiki/Microsoft_Office_XML_formats
import pandas as pd
import xml.etree.ElementTree as ET
from io import StringIO ## for Python 3
import re
### Performs some cleanup at the generate XML
### These cleanups are based on https://www.ishares.com/ needs
### You need to customized based on your personal needs
def cleanup(xml):
# Remooving multiple BOM at the file
xml = xml.replace('\ufeff', '')
# Regular expression that finds "&" with no ";" at next 5 positions
pattern = r'&(?!(?:.*?;){1,5})'
# Replacing the alone "&""
xml = re.sub(pattern, '&', xml)
# TODO you can perform others celeanup
return xml
### Removing the namespace to improove the code readbility
def remove_namespace(tree):
for elem in tree.iter():
if '}' in elem.tag:
elem.tag = elem.tag.split('}', 1)[1]
for key in list(elem.attrib.keys()):
if '}' in key:
new_key = key.split('}', 1)[1]
elem.attrib[new_key] = elem.attrib.pop(key)
### Extract data frame from xml and workSheet name/firstLine
def extract_data_from_xml(xml_file, sheets):
df = {}
prefix= u'{urn:schemas-microsoft-com:office:spreadsheet}'
tree = ET.parse(xml_file)
remove_namespace(tree)
root = tree.getroot()
for sheet_name, start_row in sheets.items():
data = extract_data_from_root(root, sheet_name, start_row)
if(len(data)>0):
headers = data[0]
df[sheet_name] = pd.DataFrame(data[1:], columns=headers)
return df
### Extracting data array from parsed xml root not from text, to improove performance
def extract_data_from_root(root, sheet_name, start_row):
data = []
found_table = False
acc = 0
for elem in root.iter():
if found_table and elem.tag == 'Row':
row_data = [cell.text for cell in elem.findall('Cell/Data')]
if(acc>= start_row):
data.append(row_data)
else:
acc +=1
elif elem.tag == 'Table' or elem.tag == 'Worksheet':
if elem.attrib.get('Name') == sheet_name:
found_table = True
else:
if found_table and len(data)>0:
break
else:
continue
return data
### The core function
def read_XML_MS_spreadsheet(filename, sheets):
with open(filename, mode="r", encoding="utf-8-sig") as fin:
xml = fin.read()
xml = cleanup(xml)
df = extract_data_from_xml(StringIO(xml), sheets)
return df
def read_XLS_and_XLSX_spreadsheet(filename, sheet_start_rows):
dfs = {}
for sheet_name, start_row in sheet_start_rows.items():
df = pd.read_excel(filename, sheet_name=sheet_name, header=start_row)
dfs[sheet_name] = df
return dfs
### main
sheets = {
'Holdings' : 7,
'Historical' : 0,
'Performance' : 4,
'Distributions' : 0
}
# Originaly posted on https://stackoverflow.com/questions/77958287/having-difficulties-to-open-an-old-format-xls-file-with-python-pandas/77958383#77958383
# https://www.ishares.com/us/products/239566/ishares-iboxx-investment-grade-corporate-bond-etf/1521942788811.ajax?fileType=xls&fileName=iShares-iBoxx--Investment-Grade-Corporate-Bond-ETF_fund&dataType=fund
file_path = 'iShares-iBoxx--Investment-Grade-Corporate-Bond-ETF_fund.xls'
## Firstly try to read in XLSX or XLS. Pandas will choose the right one
try:
df = read_XLS_and_XLSX_spreadsheet(file_path, sheets)
except Exception:
try: ## If this isn't an XLS or XLSX file, try to load as an MS-XML 2003 Spreadsheet @see https://en.wikipedia.org/wiki/Microsoft_Office_XML_formats
df = read_XML_MS_spreadsheet(file_path, sheets)
except Exception: ## If still having an error, try to open an CSV file
df = pd.read_csv(file_path) # CSV it's not multi-sheet
# Let's do some stufs
print(df)
@edited 1 Based on the error, you're problaby have an CSV file named as as XLS file. try to change the read method to
df = pd.read_csv(file_path)
@edited 2 After receiving the link to the specific file, I have enhanced the response by developing an MS-XML 2003 Spreadsheet reader. Additionally, I have conducted some cleanup on the externally generated XML file. As a result, the code is now compatible with various file formats such as XLSX, XLS, MS-XML, or CSV. You may input the spreadsheets along with the desired initial line for importing into the pandas DataFrame.