Search code examples
pythonexcelopenpyxl

Combining data from multiple sheets into one sheet using Openpyxl append


I tried to make an algorithm which could merge data from two or more excel files to a new excel file, it does the job, but the output is not as expected.

[this image shows the output generated by the algorithm.] (https://i.sstatic.net/0ErHz.png) [this is how the output should be.] (https://i.sstatic.net/aKlDZ.png)


import os
import openpyxl

# Prompt the user to enter the directory location of the Excel files
directory_path = input("Enter the directory location of the Excel files: ")

# Check if the directory exists
if not os.path.exists(directory_path):
    print("Directory does not exist!")
    exit()

# Get a list of all files in the directory
file_list = os.listdir(directory_path)

# Filter only Excel files
excel_files = []
for file in file_list:
    if file.endswith('.xlsx') or file.endswith('.xls'):
        excel_files.append(file)

# Display the available file names
print("Excel files found within the given directory:")
count = 1
for file in excel_files:
    file_name = os.path.splitext(file)[0]
    print(f"{count}) {file_name}")
    count += 1

# Prompt the user to select the files to add to the new file
selected_files = input("Select the file(s) that you want to add to the new file (separated by commas): ")
selected_files = selected_files.split(",")

# Store the selected file names
selected_file_names = []
print("\nSelected files:")
for file_index in selected_files:
    file_index = int(file_index.strip()) - 1  # Convert to integer and subtract 1 to get the index
    if file_index < 0 or file_index >= len(excel_files):
        print(f"Invalid file number: {file_index + 1}")
        continue
    selected_file = excel_files[file_index]
    selected_file_names.append(selected_file)
    print(selected_file)

# Create an empty dictionary to store the selected titles for each file
selected_titles_dict = {}

# Iterate over the selected files and display their titles
print("\nViewing the titles within the selected files...")
for selected_file in selected_file_names:
    print(selected_file)
    wb = openpyxl.load_workbook(os.path.join(directory_path, selected_file))
    sheet = wb.active

    titles = []
    for i, cell in enumerate(sheet[1], start=1):
        titles.append(cell.value)
        print(f"{i}) {cell.value}")

    # Prompt the user to select the titles to save to the new Excel file
    selected_titles = input("Select the titles of which you want to save the details (separated by commas): ")
    selected_titles = selected_titles.split(",")
    selected_titles = [int(title.strip()) - 1 for title in selected_titles]  # Convert to integer and subtract 1

    # Store the selected titles for the current file
    selected_titles_dict[selected_file] = selected_titles

    wb.close()

# Print the overall selected files and titles
print("\nOverall, you have selected the following:")
for selected_file in selected_file_names:
    print(f"\n{selected_file} - ", end="")
    selected_titles = selected_titles_dict[selected_file]
    for title_index in selected_titles:
        print(titles[title_index], end=", ")
    print()

# Prompt the user to confirm merging the data into a new Excel file
merge_data = input("\nDo you want to merge those data and write to a new Excel file? (y/n): ")
if merge_data.lower() != "y":
    print("Data merging cancelled.")
    exit()

# Prompt the user for the name of the new Excel file
new_file_name = input("\nProvide the name for the new Excel file: ")
new_file_name += ".xlsx"

# Prompt the user for the location to store the new file
new_file_location = input("Where do you want to store the newly created file?: ")

# Create a new workbook to store the merged data
new_wb = openpyxl.Workbook()
new_sheet = new_wb.active

# Write the headers to the new file
header_row = []
for selected_file in selected_file_names:
    selected_titles = selected_titles_dict[selected_file]
    for title_index in selected_titles:
        header_row.append(titles[title_index])

new_sheet.append(header_row)

# Iterate over the selected files and write their data to the new file
for selected_file in selected_file_names:
    wb = openpyxl.load_workbook(os.path.join(directory_path, selected_file))
    sheet = wb.active

    for row in sheet.iter_rows(min_row=2, values_only=True):
        data_row = []
        selected_titles = selected_titles_dict[selected_file]
        for title_index in selected_titles:
            data_row.append(row[title_index])
        new_sheet.append(data_row)

    wb.close()

# Save the new file
new_file_path = os.path.join(new_file_location, new_file_name)
new_wb.save(new_file_path)
new_wb.close()

print("\nData written to", new_file_name)

# enter the directory location of the Excel files : C:\Users\judej\Documents\kidaProject

# Excel files found within the given directory :
# 1) mad assignment.xlsx
# 2) wad assignment.xlsx
# 3) foss assignment.xlsx
# 4) dro assignment.xlsx

# select the file that you want to add to the new file (separated by commas) : 2 , 3

# selected files:
# wad assignment.xlsx
# foss assignment.xlsx
#
# viewing the titles within the selected files...
# wad assignment.xlsx
# 1) Reg.no
# 2) Name
# 3) Assignment Marks
#
# select the titles of which you want to save the details to a new Excel file (separated by commas) : 1,2,3

# You have selected Reg.no, Name, Assignment Marks from wad assignment.xlsx
# viewing foss assignment.xlsx
# 1) Reg.no
# 2) Name
# 3) Assignment Marks
#
# select the titles of which you want to save the details to a new Excel file (separated by commas) : 3

# You have selected Assignment Marks from foss assignment.xlsx
#
# overall you have selected the following:
#
# wad assignment.xlsx - Reg.no, Name, Assignment Marks
# foss assignment.xlsx - Assignment Marks
#
# do you want to merge those data and write to a new Excel file? (y/n): y

# Provide the name for the new Excel file : newGeneratedFile

# where do you want to store the newly created file? : C:\Users\judej\Documents\kidaProject

# writing data to newGeneratedFile.xlsx

I tried going through the code again and again and I couldn't spot the problem, maybe I am not capable of doing so at this level.


Solution

  • I made a change to your code to show an option to work around your issue.

    In the section that creates the list data_row I have changed this to a dictionary of lists keyed on the row number called data_dict_row. The changed code first reads all the input sheets for all workbooks and updates this dictionary with all values to be written to the [key] row. The key is used to ensure the data from each sheet is added to the correct list in data_dict_row.
    Therefore for the example sheet screenshot showing the incorrect result, the dictionary would have;

    2: [TRI/IT/2020/F/0001, S.Dilshikka, 98, 0, 98, 56]
    3: [TRI/IT/2020/F/0002, M.Jude, 99, 21, 99, 69]
    

    for rows 2 & 3.
    After the dictionary is created with all the values, then the new workbook/worksheet is created and data_dict_row is looped to obtain each list for each row which can then be appended on the new sheet (along with the Headers being the first append).

    import os
    import openpyxl
    
    # Prompt the user to enter the directory location of the Excel files
    directory_path = input("Enter the directory location of the Excel files: ")
    
    # Check if the directory exists
    if not os.path.exists(directory_path):
        print("Directory does not exist!")
        exit()
    
    # Get a list of all files in the directory
    file_list = os.listdir(directory_path)
    
    # Filter only Excel files
    excel_files = []
    for file in file_list:
        if file.endswith('.xlsx') or file.endswith('.xls'):
            excel_files.append(file)
    
    # Display the available file names
    print("Excel files found within the given directory:")
    count = 1
    for file in excel_files:
        file_name = os.path.splitext(file)[0]
        print(f"{count}) {file_name}")
        count += 1
    
    # Prompt the user to select the files to add to the new file
    selected_files = input("Select the file(s) that you want to add to the new file (separated by commas): ")
    selected_files = selected_files.split(",")
    
    # Store the selected file names
    selected_file_names = []
    print("\nSelected files:")
    for file_index in selected_files:
        file_index = int(file_index.strip()) - 1  # Convert to integer and subtract 1 to get the index
        if file_index < 0 or file_index >= len(excel_files):
            print(f"Invalid file number: {file_index + 1}")
            continue
        selected_file = excel_files[file_index]
        selected_file_names.append(selected_file)
        print(selected_file)
    
    # Create an empty dictionary to store the selected titles for each file
    selected_titles_dict = {}
    
    # Iterate over the selected files and display their titles
    print("\nViewing the titles within the selected files...")
    for selected_file in selected_file_names:
        print(selected_file)
        wb = openpyxl.load_workbook(os.path.join(directory_path, selected_file))
        sheet = wb.active
    
        titles = []
        for i, cell in enumerate(sheet[1], start=1):
            titles.append(cell.value)
            print(f"{i}) {cell.value}")
    
        # Prompt the user to select the titles to save to the new Excel file
        selected_titles = input("Select the titles of which you want to save the details (separated by commas): ")
        selected_titles = selected_titles.split(",")
        selected_titles = [int(title.strip()) - 1 for title in selected_titles]  # Convert to integer and subtract 1
    
        # Store the selected titles for the current file
        selected_titles_dict[selected_file] = selected_titles
    
    
    # Print the overall selected files and titles
    print("\nOverall, you have selected the following:")
    for selected_file in selected_file_names:
        print(f"\n{selected_file} - ", end="")
        selected_titles = selected_titles_dict[selected_file]
        for title_index in selected_titles:
            print(titles[title_index], end=", ")
        print()
    
    # Prompt the user to confirm merging the data into a new Excel file
    merge_data = input("\nDo you want to merge those data and write to a new Excel file? (y/n): ")
    if merge_data.lower() != "y":
        print("Data merging cancelled.")
        exit()
    
    # Prompt the user for the name of the new Excel file
    new_file_name = input("\nProvide the name for the new Excel file: ")
    new_file_name += ".xlsx"
    
    # Prompt the user for the location to store the new file
    new_file_location = input("Where do you want to store the newly created file?: ")
    
    # Write the headers to the new file
    header_row = []
    for selected_file in selected_file_names:
        selected_titles = selected_titles_dict[selected_file]
        for title_index in selected_titles:
            header_row.append(titles[title_index])
    
    
    # Iterate over the selected files and write their data to the new file
    data_dict_row = {}
    for selected_file in selected_file_names:
        wb = openpyxl.load_workbook(os.path.join(directory_path, selected_file))
        sheet = wb.active
    
        ### Create dictionary of lists; key is the row number and value is all
        ### values from all sheets to be written that row
        row_num = 2
        for row in sheet.iter_rows(min_row=row_num, values_only=True):
            selected_titles = selected_titles_dict[selected_file]
            for title_index in selected_titles:
                if row_num not in data_dict_row:
                    data_dict_row[row_num] = [row[title_index]]
                else:
                    data_dict_row[row_num] += [row[title_index]]
    
            row_num += 1
        wb.close()
    
    ### Create a new workbook to store the merged data
    new_wb = openpyxl.Workbook()
    new_sheet = new_wb.active
    new_sheet.append(header_row)
    
    ### Loop through data_dict_rows for each row list
    for new_sheet_row, data_row in data_dict_row.items():
        new_sheet.append(data_row)
    
    # Save the new file
    new_file_path = os.path.join(new_file_location, new_file_name)
    new_wb.save(new_file_path)
    
    print("\nData written to", new_file_name)
    

    Example output
    Example screenshot