Search code examples
pythonxlrdxlwt

Optimize add row in xls file with xlwt


I have a big problem with a large xls file. When my app add a new stats record (a new row at the end of the file) there is a very long time (one minute). If I replace it with an empty xls file this work the best (1-2 seconds). So I'm trying to optimize this if possible.

I use something like:

def add_stats_record():
    # Add record
    lock = LockFile(STATS_FILE)
    with lock:
        # Open for read
        rb = open_workbook(STATS_FILE, formatting_info=True)
        sheet_records = rb.sheet_by_index(0)

        # record_id
        START_ROW = sheet_records.nrows
        try:
            record_id = int(sheet_records.cell(START_ROW - 1, 0).value) + 1
        except:
            record_id = 1

        # Open for write
        wb = copy(rb)
        sheet_records = wb.get_sheet(0)

        # Set normal style
        style_normal = xlwt.XFStyle()
        normal_font = xlwt.Font()
        style_normal.font = normal_font

        # Prepare some data here
        ........................
        # then:

        for i, col in enumerate(SHEET_RECORDS_COLS):
            sheet_records.write(START_ROW, i, possible_values.get(col[0], ''),
                                style_normal)

        wb.save(STATS_FILE)

Do you see here something to improve? Or can you give me a better idea / example how to do this?


Solution

  • Probably not the answer you want to hear but there is hardly anything to optimize.

    import xlwt, xlrd
    from xlutils.copy import copy as copy
    from time import time
    
    def add_stats_record():
        #Open for read
        start_time = time()
        rb = xlrd.open_workbook(STATS_FILE, formatting_info=True)
        sheet_records_original = rb.sheet_by_index(0)
        print('Elapsed time for opening:            %.2f' % (time()-start_time))
        #Record_id
        start_time = time()
        START_ROW = sheet_records_original.nrows
        SHEET_RECORDS_COLS = sheet_records_original.ncols
        try:
            record_id = int(sheet_records.cell(START_ROW - 1, 0).value) + 1
        except:
            record_id = 1
        print('Elapsed time for record ID:          %.2f' % (time()-start_time))
        #Open for write
        start_time = time()
        wb = copy(rb)
        sheet_records = wb.get_sheet(0)
        print('Elapsed time for write:              %.2f' % (time()-start_time))
        #Set normal style
        style_normal = xlwt.XFStyle()
        normal_font = xlwt.Font()
        style_normal.font = normal_font
    
        #Read all the data and get some stats
        start_time = time()
        max_col = {}
        start_time = time()
        for col_idx in range(0,16):
            max_value = 0
            for row_idx in range(START_ROW):
                if sheet_records_original.cell(row_idx, col_idx).value:
                    val = float(sheet_records_original.cell(row_idx, col_idx).value)
                    if val > max_value:
                        max_col[col_idx] = str(row_idx) + ';' + str(col_idx)
    
        text_cells = [[0 for x in range(15)] for y in range(START_ROW)] 
        for col_idx in range(16,31):
            max_value = 0
            for row_idx in range(START_ROW):
                if sheet_records_original.cell(row_idx, col_idx).value:
                    val = str(sheet_records_original.cell(row_idx, col_idx).value).replace('text', '').count(str(col_idx))
                    if val > max_value:
                        max_col[col_idx] = str(row_idx) + ';' + str(col_idx)
        print('Elapsed time for reading data/stats: %.2f' % (time()-start_time))
        #Write the stats row
        start_time = time()
        for i in range(SHEET_RECORDS_COLS):
            sheet_records.write(START_ROW, i, max_col[i], style_normal)
    
        start_time = time()
        wb.save(STATS_FILE)
        print('Elapsed time for writing:            %.2f' % (time()-start_time))    
    
    if __name__ == '__main__':
        STATS_FILE = 'output.xls'
        start_time2 = time()
        add_stats_record()
        print ('Total time:                         %.2f' % (time() - start_time2))
    

    Elapsed time for opening: 2.43
    Elapsed time for record ID: 0.00
    Elapsed time for write: 7.62
    Elapsed time for reading data/stats: 2.35
    Elapsed time for writing: 3.33
    Total time: 15.75

    From those results it becomes pretty clear that there is hardly any room for improvement in your code. Open/copy/write make up the bulk time but are just simple calls to xlrd/xlwt.

    Using on_demand=True in open_workbook doesn't help either.

    Using openpyxl doesn't improve performance as well.

    from openpyxl import load_workbook
    from time import time
    
    #Load workbook
    start_time = time()
    wb = load_workbook('output.xlsx')
    print('Elapsed time for loading workbook: %.2f' % (time.time()-start_time))    
    
    #Read all data
    start_time = time()
    ws = wb.active
    cell_range1 = ws['A1':'P20001']
    cell_range2 = ws['Q1':'AF20001']
    print('Elapsed time for reading workbook: %.2f' % (time.time()-start_time))    
    
    #Save to a new workbook
    start_time = time()
    wb.save("output_tmp.xlsx")
    print('Elapsed time for saving workbook:  %.2f' % (time.time()-start_time))    
    

    Elapsed time for loading workbook: 22.35
    Elapsed time for reading workbook: 0.00
    Elapsed time for saving workbook: 21.11

    Ubuntu 14.04 (Virtual machine)/Python2.7-64bit/Regular hard disk (with native Windows 10 similar results, Python 3 performs worse in loading but better in writing).


    Random data was generated using Pandas and Numpy

    import pandas as pd
    import numpy as np
    #just random numbers
    df = pd.DataFrame(np.random.rand(20000,30), columns=range(0,30))
    #convert half the columns to text
    for i in range(15,30):
        df[i].apply(str)
        df[i] = 'text' + df[i].astype(str)
    writer = pd.ExcelWriter(STATS_FILE)
    df.to_excel(writer,'Sheet1')
    writer.save()
    

    After some fiddling with multiprocessing I found a slightly improved solution. Since the copy operation was the most time consuming operation and having a shared workbook made performance worse, a different approach was taken. Both threads read the original workbook, one reads the data, calculates the statistics and writes them to a file (tmp.txt), the other one copies the workbook, waits for the statistics file to appear and then writes it to the newly copied workbook.

    Difference: 12% less time needed in total (n=3 for both scripts). Not great but I cannot think of another way of doing, except for not using Excel files.

    xls_copy.py

    def xls_copy(STATS_FILE, START_ROW, style_normal):
        from xlutils.copy import copy as copy
        from time import sleep, time
        from os import stat
        from xlrd import open_workbook
        print('started 2nd thread')
        start_time = time()
        rb = open_workbook(STATS_FILE, formatting_info=True)
        wb = copy(rb)
        sheet_records = wb.get_sheet(0)
        print('2: Elapsed time for xls_copy:         %.2f' % (time()-start_time))
    
        counter = 0
        filesize = stat('tmp.txt').st_size
    
        while filesize == 0 and counter < 10**5:
            sleep(0.01)
            filesize = stat('tmp.txt').st_size
            counter +=1
        with open('tmp.txt', 'r') as f:
            for line in f.readlines():
                cells = line.split(';')
                sheet_records.write(START_ROW, int(cells[0]), cells[1], style_normal)
    
        start_time = time()
        wb.save('tmp_' + STATS_FILE)
        print('2: Elapsed time for writing:          %.2f' % (time()-start_time))    
    

    xlsx_multi.py

    from xls_copy import xls_copy
    import xlwt, xlrd
    from time import time
    from multiprocessing import Process
    
    def add_stats_record():
    
        #Open for read
        start_time = time()
        rb = xlrd.open_workbook(STATS_FILE, formatting_info=True)
        sheet_records_original = rb.sheet_by_index(0)
        print('Elapsed time for opening:            %.2f' % (time()-start_time))
        #Record_id
        start_time = time()
        START_ROW = sheet_records_original.nrows
        f = open('tmp.txt', 'w')
        f.close()
        #Set normal style
        style_normal = xlwt.XFStyle()
        normal_font = xlwt.Font()
        style_normal.font = normal_font
    
        #start 2nd thread
        p = Process(target=xls_copy, args=(STATS_FILE, START_ROW, style_normal,))
        p.start()
        print('continuing with 1st thread')
        SHEET_RECORDS_COLS = sheet_records_original.ncols
        try:
            record_id = int(sheet_records.cell(START_ROW - 1, 0).value) + 1
        except:
            record_id = 1
        print('Elapsed time for record ID:          %.2f' % (time()-start_time))
    
        #Read all the data and get some stats
        start_time = time()
        max_col = {}
        start_time = time()
        for col_idx in range(0,16):
            max_value = 0
            for row_idx in range(START_ROW):
                if sheet_records_original.cell(row_idx, col_idx).value:
                    val = float(sheet_records_original.cell(row_idx, col_idx).value)
                    if val > max_value:
                        max_col[col_idx] = str(row_idx) + ';' + str(col_idx)
    
        text_cells = [[0 for x in range(15)] for y in range(START_ROW)] 
        for col_idx in range(16,31):
            max_value = 0
            for row_idx in range(START_ROW):
                if sheet_records_original.cell(row_idx, col_idx).value:
                    val = str(sheet_records_original.cell(row_idx, col_idx).value).replace('text', '').count(str(col_idx))
                    if val > max_value:
                        max_col[col_idx] = str(row_idx) + ';' + str(col_idx)
        #write statistics to a temp file
        with open('tmp.txt', 'w') as f:
            for k in max_col:
                f.write(str(k) + ';' + max_col[k] + str('\n'))
        print('Elapsed time for reading data/stats: %.2f' % (time()-start_time))
        p.join()
    if __name__ == '__main__':
    
        done = False
        wb = None
        STATS_FILE = 'output.xls'
        start_time2 = time()
        add_stats_record()
        print ('Total time:                          %.2f' % (time() - start_time2))