Search code examples
python performance resources overwrite

Python: re-open file on each iteration or truncate to overwrite?


In Python, if you have a loop, in each iteration of which you want to write to a file (pickling in my case), overwriting whatever data is already there, one option is to open the file before the loop, keep it open, and truncate it on each iteration to erase the previous data before writing new data:

import pickle
# Option 1: open the file once and reuse the same handle for every item.
# `filename` and `blahs` are assumed to be defined by the surrounding program.
with open(filename, 'wb') as file:
    for blah in blahs:
        # Erase what the previous iteration wrote, then move back to the
        # start of the file so the new pickle replaces the old one.
        file.truncate(0)
        file.seek(0)
        pickle.dump(blah, file)

and another is to just re-open the file on each iteration, as opening it in wb automatically truncates it:

import pickle
# Option 2: re-open the file for every item. Opening in 'wb' mode truncates
# the file, so each dump starts from an empty file.
# `filename` and `blahs` are assumed to be defined by the surrounding program.
for blah in blahs:
    with open(filename, 'wb') as file:
        pickle.dump(blah, file)

Which is best (in terms of performance/speed and handling system resources etc)? Is there a better way to overwrite data in an already-open file than using file.truncate() and file.seek() as above?

I'm aware a similar question has been asked (Is it better to open/close a file every time vs keeping it open until the process is finished?) but there it appears to be about when you want to append on each iteration rather than overwrite, so I'm wondering if the process of truncating etc in the latter results in any significant performance degradation that would tip the scales?


Solution

  • I don't like guessing so I profiled the two approaches:

    import pickle
    import tempfile
    from random import choices
    from string import ascii_lowercase, ascii_uppercase, digits
    from pathlib import Path
    
    from performance_measurement import run_performance_comparison
    
    
    class Bla:
        """Small picklable payload: 50 random ASCII letters and digits."""

        def __init__(self):
            # Alphabet is uppercase letters, then digits, then lowercase letters.
            alphabet = ascii_uppercase + digits + ascii_lowercase
            picked = choices(alphabet, k=50)
            self._content = "".join(picked)
    
    
    def truncate_approach(blahs: list[Bla], filename: str):
        """Pickle every item into one long-lived handle, wiping the file before each dump.

        The file is opened once in binary write mode and stays open for the
        whole loop. After the call the file holds only the pickle of the last
        item, or is empty when ``blahs`` is empty.
        """
        with open(filename, "wb") as sink:
            for item in blahs:
                # Drop the previous iteration's data, then return to offset 0
                # so the next pickle starts at the beginning of the file.
                sink.truncate(0)
                sink.seek(0)
                pickle.dump(item, sink)
    
    
    def reopen_approach(blahs: list[Bla], filename: str):
        """Pickle every item by re-opening the file in binary write mode each time.

        Each iteration opens ``filename`` with mode "wb", dumps one item and
        closes the handle again. After the call the file holds only the pickle
        of the last item; with an empty ``blahs`` the file is never opened.
        """
        for item in blahs:
            with open(filename, "wb") as sink:
                pickle.dump(item, sink)
    
    
    def setup(N):
        """Build the benchmark inputs for a run of size ``N``.

        Returns a two-element list ``[blahs, path]``: ``N`` fresh ``Bla``
        instances and the ``Path`` of a temporary file for the approaches to
        write into.

        The temporary file is created with ``delete=False`` and closed right
        away. That keeps the path reserved on disk instead of taking the name
        of a ``NamedTemporaryFile`` object that is discarded immediately,
        which would delete the file and leave only an unreserved name.
        The file is not removed automatically; callers that care should
        unlink the returned path when they are done with it.
        """
        blahs = [Bla() for _ in range(N)]
        with tempfile.NamedTemporaryFile(delete=False) as handle:
            target = Path(handle.name)
        return [blahs, target]
    
    
    # Time both overwrite strategies over growing input sizes; each size is
    # repeated 10 times and inputs are produced by `setup`.
    run_performance_comparison(
        [truncate_approach, reopen_approach],
        [10, 20, 30, 100, 200, 300, 1000, 2000, 3000],
        number_of_repetitions=10,
        setup=setup,
    )
    

    truncate_approach is slightly faster. I assume it's because we interact with the disk less and sometimes get to truncate the content and reset the write buffer before we have to interact with the hard disk.

    enter image description here

    Profiling code:

    import timeit
    from functools import partial
    
    import matplotlib.pyplot as plt
    from typing import List, Dict, Callable
    
    from contextlib import contextmanager
    import matplotlib.pyplot as plt
    import matplotlib.transforms as mtransforms
    import matplotlib.ticker as ticker
    import numpy as np
    
    
    @contextmanager
    def data_provider(data_size, setup=lambda N: [N], teardown=lambda *N: None):
        """Context manager that builds benchmark data and always tears it down.

        Parameters:
            data_size: value handed to ``setup``.
            setup: callable taking ``data_size`` and returning an iterable of
                arguments. The default wraps the size in a one-element list.
            teardown: callable invoked with the unpacked ``setup`` result on
                exit. The default accepts any arguments and does nothing.

        Yields:
            Whatever ``setup(data_size)`` returned.

        The defaults mirror those of ``run_performance_comparison`` so that
        ``teardown(*data)`` works when neither callable is supplied.
        """
        data = setup(data_size)
        try:
            yield data
        finally:
            # Run cleanup even when the body of the ``with`` block raises.
            teardown(*data)
    
    
    def run_performance_comparison(approaches: List[Callable],
                                   data_size: List[int],
                                   *,
                                   setup=lambda N: [N],
                                   teardown=lambda *N: None,
                                   number_of_repetitions=5,
                                   title='Performance Comparison',
                                   data_name='N',
                                   yscale='log',
                                   xscale='log'):
        """Time each approach for each data size and plot the results.

        For every entry of ``data_size`` the inputs are produced through
        ``data_provider(N, setup, teardown)`` and every approach is called
        with those inputs unpacked as positional arguments. Each approach is
        timed ``number_of_repetitions`` times with one call per measurement,
        and the fastest measurement is kept. One line per approach is then
        plotted with matplotlib, labelled with the approach's ``__name__``.

        Parameters:
            approaches: callables to compare.
            data_size: sizes to benchmark; also used as the x values.
            setup: builds the argument list for a given size.
            teardown: receives the unpacked arguments after a size is done.
            number_of_repetitions: timing repetitions per approach and size.
            title: plot title.
            data_name: x-axis label, also used in the progress message.
            yscale: matplotlib scale name for the y axis.
            xscale: matplotlib scale name for the x axis.
        """
        timings: Dict[Callable, List[float]] = {}
        for candidate in approaches:
            timings[candidate] = []

        for N in data_size:
            with data_provider(N, setup, teardown) as args:
                print(f'Running performance comparison for {data_name}={N}')
                for candidate in approaches:
                    bound_call = partial(candidate, *args)
                    measurements = timeit.repeat(
                        bound_call, repeat=number_of_repetitions, number=1
                    )
                    # Keep the best run as the representative time.
                    timings[candidate].append(min(measurements))

        for candidate in approaches:
            series = timings[candidate]
            plt.plot(data_size, series, label=candidate.__name__)
        plt.yscale(yscale)
        plt.xscale(xscale)

        plt.xlabel(data_name)
        plt.ylabel('Execution Time (seconds)')
        plt.title(title)
        plt.legend()
        plt.show()