Search code examples
pythonexcelyamlopenpyxlxlsx

Import yaml to excel


I have following .YAML file:

Name: Tom
Surname: Smith
Status: Married
Childs:
- ChildName: Willy
  Age: 30
  Grandchild:
  - GrandchildName: John
    Age: 4
  - GrandchildName: Maria
    Age: 3
- ChildName: Arthur
  Age: 40
  Grandchild: N/A

What I've done:

import openpyxl
import yaml

with open('family.yaml') as file:
    family_list = yaml.load(file, Loader=yaml.FullLoader)

def create_workbook():
    wb = openpyxl.Workbook('family.xlsx')
    wb.save('family.xlsx')

def update_workbook():
    wb = openpyxl.load_workbook('family.xlsx')

    sh1 = wb.active

    for item in family_list.items():
        sh1.append(item)
    wb.save('family.xlsx')

create_workbook()
update_workbook()

This code allows me add data (without "pause" part, so only Name, Surname, Status) to excel but "Name", "Surname" etc. are added in one column A1 (not in rows as I want to)

Final excel workbook I want to get

I'm not sure how to append and format (each pause sign should be new row) mentioned data. I will be grateful for any tips!


Solution

  • I have decided to split the yaml load into three dictionaries;
    Parent
    Child
    Grandchild
    so they can be adjusted to suit the excel layout. Then use Pandas to export these to Excel at the appropriate row/column locations.

    The code will process YAML files in 'yaml_files'. On the first run for the set 'save_file' the Headers are written to Row 1 and each YAML file data is then placed under the previous entry down the spreadsheet across columns A - G. Any subsequent run on the same 'save_file' will place the new data below the last data written.
    Each file entry is also highlighted by alternating background colour.
    You can change the background colors used or disable if you wish.

    import glob
    import os
    import pandas as pd
    import yaml
    from openpyxl import load_workbook
    from yaml.loader import SafeLoader
    from openpyxl.utils import get_column_letter
    from openpyxl.styles import PatternFill
    
    
    def write_next_segments(dataf, header, start_row, start_col):
        dataframe = ''
    
        if start_col == 0:
            dataframe = pd.DataFrame([parent_dictionary])
            dataframe.to_excel(writer, sheet_name=save_sheet,
                               index=False,
                               header=header,
                               startrow=start_row,
                               startcol=start_col)
        else:
            for ele in dataf:
                dataframe = pd.DataFrame([dataf[ele]])
                dataframe.to_excel(writer, sheet_name=save_sheet,
                                   index=False,
                                   header=header,
                                   startrow=start_row,
                                   startcol=start_col)
                if header:
                    header = False
                    start_row += 2
                else:
                    start_row += 1
    
        adjust_colmns(dataframe, start_col)
    
    
    def adjust_colmns(dataframe, offset):
        for column in dataframe:
            if column != '':
                column_width = max(dataframe[column].astype(str).map(len).max(), len(column)) + 2
                col_letter = get_column_letter(dataframe.columns.get_loc(column) + offset + 1)
                writer.sheets[save_sheet].column_dimensions[col_letter].width = column_width
    
    
    def build_dictionary(yf):
        # Open the file and load the file
        print("Formatting YAML file: " + yf)
        yaml_sub_name = ''  # Name of 'child' sub element
        with open(yf) as file:
            yaml_data = yaml.load(file, Loader=SafeLoader)
    
        for k, v in yaml_data.items():
            if type(v) == list:
                yaml_sub_name = k  # Name of 'child' sub element
    
        # Make copy of the original dictionary for modification
        print("------ Copy the Parent dictionary                 -------------")
        parent_dict = yaml_data.copy()
    
        print("------ Create the Child & GrandChild dictionaries -------------")
    
        child_dict = {}
        grandchild_dict = {}
        sub_list = ''
        for x in parent_dict[yaml_sub_name][0]:
            if len(child_dict) == 0:
                child_dict['Childs0'] = {x: ''}
            else:
                if type(yaml_data[yaml_sub_name][0][x]) != list:
                    child_dict['Childs0'].update({x: ''})
                else:
                    sub_list = x
    
        for x in parent_dict[yaml_sub_name][0][sub_list][0]:
            if len(grandchild_dict) == 0:
                grandchild_dict['Grandchilds0'] = {x: ''}
            else:
                grandchild_dict['Grandchilds0'].update({x: ''})
    
        # Remove the Child elements from the Parent dictionary
        child_list = parent_dict.pop(yaml_sub_name)
    
        # Create the rest of the Child and Grandchild dictionaries
        row = 1
        for enum, ele in enumerate(child_list):
            if row != 1: row += 1
            child_dict[yaml_sub_name + str(row)] = ele
            pop_num = row
            if type(child_list[enum][sub_list]) == list:
                for i in range(len(child_list[enum][sub_list])):
                    if i == 0:
                        grandchild_dict['Grandchilds' + str(row)] = {'': '', '': ''}
                    grandchild_dict['Grandchilds' + str(row + 1)] = ele[sub_list][i]
                    row += 1
                    child_dict[yaml_sub_name + str(row)] = {'': '', '': ''}
            child_dict[yaml_sub_name + str(pop_num)].pop(sub_list)
    
        return parent_dict, child_dict, grandchild_dict
    
    
    def set_background_color(bg_color, bg_start_row):
        coords = writer.sheets[save_sheet].dimensions
        start_coord, end_coord = coords.split(':')
        bg_color_dims = 'A' + str(bg_start_row) + ':' + end_coord[:1] + str(row_offset)
        sheet = writer.sheets[save_sheet]
        for cells in sheet[bg_color_dims]:
            for cell in cells:
                cell.fill = PatternFill("solid", start_color=bg_color)
        bg_start_row = row_offset + 1
    
        return bg_start_row
    
    
    if __name__ == "__main__":
        # Change these variables as necessary
        yaml_dir = './yaml_files'   # Set name of the input yaml files directory
        save_file = 'family.xlsx'   # Set name of Excel workbook for output
        save_sheet = 'Sheet1'       # Set name of Excel sheet
        
        # These variables should not be changed 
        row_offset = 0              # Row number for next Excel write
        header = True               # Write Header to Excel
        bg_color_start_row = '2'    # First row to start fill background color
        # Set start bg color
        bg_color = "DDD9C4"         # First segment background color
    
        # If save file already exists then load it and the save sheet
        # Get the last row and last background colour used. Header is set to False
        if os.path.isfile(save_file):
            book = load_workbook(save_file)
            writer = pd.ExcelWriter(save_file, engine='openpyxl')
            writer.book = book
            writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
            row_offset = writer.sheets[save_sheet].max_row + 1
            bg_color_start_row = row_offset
            bg_color = writer.sheets[save_sheet][writer.sheets[save_sheet].dimensions[3:]].fill.fgColor.value[2:]
            header = False
        else:
            # If no save file then create a new Excel file
            writer = pd.ExcelWriter(save_file, engine='openpyxl')
    
        # Get a list of the YAML files in yaml directory
        yaml_files = glob.glob(yaml_dir + "/[!~]*.yaml")
        # Loop through the YAML files
        for enum, y_file in enumerate(yaml_files):
            if enum > 0:
                header = False
    
            print("------ Export dictionaries to Excel using Pandas --------------")
            parent_dictionary, child_dictionary, grandchild_dictionary = build_dictionary(y_file)
    
            parent_col_offset = len(parent_dictionary)
            child_col_offset = parent_col_offset + len(child_dictionary['Childs0'])
    
            write_next_segments(parent_dictionary, header, row_offset, 0)
            write_next_segments(child_dictionary, header, row_offset, parent_col_offset)
            write_next_segments(grandchild_dictionary, header, row_offset, child_col_offset)
            row_offset = writer.sheets[save_sheet].max_row + 1
            # Insert background fill using alternate fill colours
            print("------ Set Background Colors ---------------------------------")
            if bg_color == "DAEEF3":
                bg_color = "DDD9C4"
            else:
                bg_color = "DAEEF3"
    
            bg_color_start_row = set_background_color(bg_color, bg_color_start_row)
            print("\n")
    
        print("------ Save Excel file to '" + save_file + "' ---------------------")
    
        writer.save()