Search code examples
pythonexcelpandasxls

How to parse xls data including merged cells using python pandas


How to parse xls data to this struct, both row and column have merged cells, simply use df.index.to_series().ffill() cannot handle.

{
  "time": "time",
  "category": "A",
  "variety": "A1",
  "specification": "S1",
  "unit": "U1",
  "average": 1.25,
  "region": "RegionA",
  "market": "MarketA",
  "price": 1.1,
}

enter image description here


Solution

  • I figured out this solution:

    def test_xls_parse():
        file_path = 'test.xls'
        df = pd.read_excel(file_path, engine='xlrd')
    
        time_label = df.iloc[0, 0]
    
    
        categories = df.iloc[1, 2:]
        varieties = df.iloc[2, 2:]
        specifications = df.iloc[3, 2:]
        units = df.iloc[4, 2:]
        averages = df.iloc[5, 2:]
    
    
        regions = df.iloc[6:, 0].ffill()
        markets = df.iloc[6:, 1]
        prices = df.iloc[6:, 2:]
    
        result = []
        for i in range(len(categories)):
            for j in range(len(regions)):
                obj = {
                    "date": time_label,
                    "category": categories.iloc[i],
                    "variety": varieties.iloc[i],
                    "specification": specifications.iloc[i],
                    "unit": units.iloc[i],
                    "average": None if averages.iloc[i] == '-' else float(averages.iloc[i]),
                    "region": regions.iloc[j],
                    "market": markets.iloc[j],
                    "price": None if prices.iloc[j, i] == '-' else float(prices.iloc[j, i])
                }
                result.append(obj)
    
        return pd.DataFrame(result)