Search code examples
pythonexcelopenpyxlxlsx

Preserving Data Groupings when writing to Worksheet using OpenPyxl


Disclaimer: I am a beginner-level, self-taught casual 'programmer' with Python.

Background: I have a script that takes Groupings of data from an .xlsx Workbook and writes them to separate Worksheets of a seperate .xlsx like so -

Before state (original_data.xlsx): you will note each top-level is a Client, underneath sits multiple levels of underyling data, pertaining to it's parent client. enter image description here

After state (split_data.xlsx): you will note each client and their underlying data is written to an identically named Worksheet in a new .xlsx file like so:
enter image description here

Issue: you will note from the After state that the Groupings / Levels have been lost, whilst all the data is present and has different levels of indentation.

Does anyone know how I might enhance my script to ensure that the Groupings / Levels are preserved? E.g., so you can still expand each Grouping (e.g., Client A) like so: enter image description here

My script: here is the script (sorry it's messy!) which achieves the above, minus the preserved Grouping/Levels.

import openpyxl

from copy import copy
from openpyxl import load_workbook

columns=['A','B','C','D','E','F','G','H','I','J','K','L']
def copy_cell(ws, row,ws_row,ws1):
    for col in columns:
        ws_cell=ws1[col+str(ws_row)]
        new_cell = ws[col+str(row)]

        if ws_cell.has_style:
            new_cell.font = copy(ws_cell.font)
            new_cell.border = copy(ws_cell.border)
            new_cell.fill = copy(ws_cell.fill)
            new_cell.number_format = copy(ws_cell.number_format)
            new_cell.protection = copy(ws_cell.protection)
            new_cell.alignment = copy(ws_cell.alignment)

wb1 = openpyxl.load_workbook('original_data.xlsx')

ws1=wb1.active

indexs=[]
clients=[]

index=1

while ws1['A'+str(index)]:
    if str(ws1['A'+str(index)].alignment.indent)=='0.0':
        indexs.append(index)
        clients.append(ws1['A'+str(index)].value)
    if ws1['A'+str(index)].value  is None:
        indexs.append(index)
        break
    index+=1

wb1.close()

wb = openpyxl.Workbook()

ws=wb.active

start_index=1

headers=['Ownership Structure', 'Fee Schedule', 'Management Style', 'Advisory Firm', 'Inception Date', 'Days in Time Period', 'Adjusted Average Daily Balance (No Div, USD)', 'Assets Billed On (USD)',
'Effective Billing Rate', 'Billing Fees (USD)', 'Bill To Account', 'Model Type']

for y,index in enumerate(indexs):
    try:
        client=0
        if len(clients[y])>=32:
            client=clients[y][:31]
        else:
            client=clients[y]
        wb.create_sheet(client)
        ws=wb[client]
        ws.column_dimensions['A'].width=35
        ws.append(headers)
        row_index=2
        for i in range(start_index,indexs[y+1]):
            ws.append([ws1[col+str(i)].value for col in columns])
            copy_cell(ws,row_index,i,ws1)
            row_index+=1
        start_index=indexs[y+1]
    except:
        pass

wb.save('split_data.xlsx')
wb.close()

try:
    wb1 = openpyxl.load_workbook('split_data.xlsx')
    a=wb1['Sheet']
    wb1.remove(a)
    a=wb1['Sheet1']
    wb1.remove(a)
    wb1.save('split_data.xlsx')
    wb1.close()
except:
    pass

Resources: here is a link to some test data (original_data.xlsx)


Solution

  • from openpyxl import load_workbook
    
    
    def get_client_rows(sheet):
        """Get client rows.
    
        Skip header and then look for row dimensions without outline level
        """
        return [row[0].row for row in sheet.iter_rows(2) if row[0].alignment.indent == 0.0]
        return [
            row_index
            for row_index, row_dimension in sheet.row_dimensions.items()
            if row_index > 1 and row_dimension.outline_level == 0
        ]
    
    
    def delete_client_block(sheet, start, end):
        """
        Delete rows starting from up to and including end.
        """
        for row in range(start, end + 1):
            sheet.row_dimensions.pop(row, None)
        sheet.delete_rows(start, end - start + 1)
    
    
    def split_workbook(input_file, output_file):
        """
        Split workbook each main group into its own sheet.
    
        Not too loose any formatting we copy the current sheet and remove all rows
        which do not belong to extacted group.
        """
    
        try:
            workbook = load_workbook(input_file)
            data_sheet = workbook.active
            client_rows = get_client_rows(data_sheet)
    
            for index, client_row in enumerate(client_rows):
                # create new sheet for given client, shorten client as it might be too long
                client_sheet = workbook.copy_worksheet(data_sheet)
                client_sheet.title = data_sheet.cell(client_row, 1).value[:32]
    
                # delete rows after current client if available
                if index < len(client_rows) - 1:
                    row_after_client = client_rows[index + 1]
                    delete_client_block(
                        client_sheet, row_after_client, client_sheet.max_row
                    )
    
                # delete rows before current client if available
                if index > 0:
                    first_client_row = client_rows[0]
                    delete_client_block(
                        client_sheet, first_client_row, client_row - first_client_row + 1
                    )
    
                    # move left over dimensions to top of the sheet
                    for row_index in list(client_sheet.row_dimensions.keys()):
                        # skip header row dimension
                        if row_index > first_client_row - 1:
                            row_dimension = client_sheet.row_dimensions.pop(row_index)
                            new_index = row_index - client_row + first_client_row
                            row_dimension.index = new_index
                            client_sheet.row_dimensions[new_index] = row_dimension
    
            del workbook[data_sheet.title]
            workbook.save(output_file)
        finally:
            workbook.close()
    
    
    if __name__ == "__main__":
        input_file = 'data.xlsx'
        output_file = "data_output.xlsx"
    
        split_workbook(input_file, output_file)