Search code examples
pythonpandaspython-decorators

Cache Size Decorators in Python


I am building my own decorator function, but I can't seem to be able to update the func.cache_length method of the function.

The code below simply uses an OrderedDict to store the items from all the dataframes loaded in pandas, with 5 dataframes maximum stored in cache.

I want the user to also find out how many items currently the function has loaded using cache_length but every time I run it I get 0.

from functools import wraps
from collections import OrderedDict


def cache(func, max_length=5):
    
    func.cache_dict = OrderedDict()
    func.cache_length = 0
    @wraps(func)
    
    def wrapper(*args, **kwargs):
        if kwargs['df_name'] in func.cache_dict:
            return func.cache_dict[kwargs['df_name']]
        elif len(func.cache_dict) < max_length:
            print('Running function...')
            df = func(*args, **kwargs)
            func.cache_dict[kwargs['df_name']] = df
            func.cache_length += 1
            return df
        else:
            func.cache_dict.popitem(last=True)
            df = func(*args, **kwargs)
            func.cache_dict[kwargs['df_name']] = df
            return df
    
    func.cache_reset = lambda: func.cache_dict.clear()
        
    return wrapper


import pandas as pd


@cache
def data_reader(*, df_name: pd.DataFrame, file: str):
    df = pd.read_csv(file)
    return df

This is the output vs. expected (I should get 1),


data_reader(df_name='test_dataframe', file="parsed_data.csv")

>>

Running function...
....


>>

data_reader.cache_length

>>

0


Solution

  • Based on what you described, here is a more general implementation: (details below)

    from collections import OrderedDict
    from functools import wraps
    
    
    def cache(function=None, *, max_length=5):
        def decorator(func):
            cache_dict = OrderedDict()
            @wraps(func)
            def wrapper(*args, **kwargs):
                call_repr = f"args={args}, kwargs={kwargs}"
                try:
                    return cache_dict[call_repr]
                except KeyError:
                    pass
                if len(cache_dict) >= max_length:
                    cache_dict.popitem(last=False)
                print(f"Running function {func.__name__}...")
                cache_dict[call_repr] = output = func(*args, **kwargs)
                return output
            wrapper.cache = cache_dict
            return wrapper
        return decorator if function is None else decorator(function)
    
    
    @cache(max_length=3)
    def add(x, y):
        return x + y
    
    
    def main():
        print(f"{add(1, 1)=}")                                  
        print(f"{add(2, 1)=}")
        print(f"{add(1, 1)=}")
        print(f"{add(3, 1)=}")
        print(f"{add(4, 1)=}")                                  
        print(f"{add(1, 1)=}")                                  
        print(f"{add.cache=}")
        add.cache.clear()
        print(f"{len(add.cache)=}")
        print(f"{add.cache=}")
    
    
    if __name__ == "__main__":                                  
        main()
    

    Output:

    Running function add...
    add(1, 1)=2
    Running function add...
    add(2, 1)=3
    add(1, 1)=2
    Running function add...
    add(3, 1)=4
    Running function add...
    add(4, 1)=5
    Running function add...
    add(1, 1)=2
    add.cache=OrderedDict([('args=(3, 1), kwargs={}', 4), ('args=(4, 1), kwargs={}', 5), ('args=(1, 1), kwargs={}', 2)])
    len(add.cache)=0
    add.cache=OrderedDict()
    

    Notice the cache was used for the second add(1, 1) call, but not the third.

    Details

    • Uses the pattern allowing the decorator to be used with or without parantheses
    • Resulting wrapper function has the cache attribute to allow direct access to the underlying OrderedDict
    • Caching based on the string representation of all function arguments (positional and keyword)

    Caveats

    • Not completely general by any stretch
    • Works as expected only with argument types that have a deterministic __repr__ without side effects (which is what one would expect, to be fair)
    • Cannot differentiate between arguments with identical string representations
    • Clean type annotations may be a bit more involved

    Hope this helps.