Search code examples
Tags: python, numpy, monte-carlo

Quickly performing a function n times for each element n in an array


I have an n_years by n_repeats array of count data.

For each element (e), I want to draw from a loss severity array e times and take the sum of the draws.

Below is the best I can do so far. It's barely faster than two nested for loops in python. In my real use case, my array is 100,000 by 1,000.

Does anyone have any ideas how this can be done using pure numpy?

# 15 simulated years x 3 repeats of event counts; the first nine
# years contain no events at all.
frequency = np.vstack(
    [
        np.zeros((9, 3), dtype=int),
        np.array(
            [
                [0, 0, 1],
                [1, 2, 1],
                [1, 2, 1],
                [2, 4, 2],
                [2, 4, 2],
                [3, 5, 2],
            ]
        ),
    ]
)
# Empirical loss severities to sample from.
sev = np.array([1, 1, 2, 2, 1, 2, 3, 4, 5, 1, 1, 2])

def calculate_insured_losses(frequency, severity_array):
    """For each count in `frequency`, draw that many severities (with
    replacement) from `severity_array` and return the per-element sums,
    reshaped back to `frequency`'s shape."""

    def yearly_loss(count, severity_array=severity_array):
        if count == 0:
            return 0
        draws = np.random.choice(severity_array, size=count, replace=True)
        return draws.sum()

    flat_losses = np.vectorize(yearly_loss)(frequency.flatten())
    return flat_losses.reshape(frequency.shape)

# Fixed NameError: the array defined above is named `frequency`, not `freq`.
calculate_insured_losses(frequency, sev)

291 µs ± 10.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

EDIT: Simpler code with nested loops

def calculate_insured_losses(frequency, severity):
    """Simulate insured losses: for every count in `frequency`, draw that
    many severities (with replacement) from `severity` and record their sum.

    Returns a float array of `frequency`'s shape (np.empty defaults to float).
    """
    losses = np.empty(shape=frequency.shape)

    # np.ndenumerate walks the array in C (row-major) order, i.e. exactly
    # the same order as a year-by-repeat nested loop, so the random draw
    # sequence is unchanged.
    for index, count in np.ndenumerate(frequency):
        if count == 0:
            losses[index] = 0
        else:
            draws = np.random.choice(severity, size=count, replace=True)
            losses[index] = draws.sum()

    return losses

# Fixed NameError: the array defined above is named `frequency`, not `freq`.
calculate_insured_losses(frequency, sev)

Solution

  • You can do that faster like this:

    import numpy as np
    
    def calculate_insured_losses(frequency, severity_array):
        """Vectorized yearly-loss simulation.

        For each count in `frequency`, draws that many samples (with
        replacement) from `severity_array` and returns the per-element
        sums, reshaped to `frequency`'s shape.

        Fix over the original: `np.add.reduceat` requires every index to
        be < len(c). Any zero count at the END of the flattened frequency
        array (or an all-zero array) produced a start index equal to
        len(c) and raised IndexError. Padding the sample array with one
        trailing 0 makes every index valid without changing any segment
        sum.
        """
        # Flattened frequencies table
        r = frequency.ravel()
        if r.size == 0:
            # Degenerate input: nothing to simulate
            return np.zeros(frequency.shape, dtype=int)
        # Accumulate; rcum[-1] is the total number of draws needed
        rcum = np.cumsum(r)
        total = rcum[-1]
        # Take all random samples at once, plus one trailing 0 so start
        # indices equal to `total` (from trailing zero counts) stay in
        # bounds; the pad adds nothing to the final segment's sum.
        c = np.concatenate(
            [np.random.choice(severity_array, total, replace=True), [0]]
        )
        # Sum segments; starts is the exclusive cumulative sum
        starts = rcum - r
        res = np.add.reduceat(c, starts)
        # reduceat returns c[start] (a stray sample) for empty segments;
        # mask them back to zero.
        res *= r.astype(bool)
        # Return reshaped result
        return res.reshape(frequency.shape)
    
    # For comparison
    def calculate_insured_losses_loop(frequency, severity_array):
        """Slow per-element reference implementation (for benchmarking)."""

        def yearly_loss(count, severity_array=severity_array):
            if count == 0:
                return 0
            return np.random.choice(severity_array, size=count, replace=True).sum()

        flat = np.vectorize(yearly_loss)(frequency.flatten())
        return flat.reshape(frequency.shape)
    
    # Test
    # Same fixture as in the question: 15 years x 3 repeats of counts,
    # with no events at all in the first nine years.
    frequency = np.vstack(
        [
            np.zeros((9, 3), dtype=int),
            np.array(
                [
                    [0, 0, 1],
                    [1, 2, 1],
                    [1, 2, 1],
                    [2, 4, 2],
                    [2, 4, 2],
                    [3, 5, 2],
                ]
            ),
        ]
    )
    # Loss severities to draw from.
    sev = np.array([1, 1, 2, 2, 1, 2, 3, 4, 5, 1, 1, 2])
    # Check results from functions match: with the same seed, both
    # implementations consume the identical random sample stream, so
    # their outputs must be equal element-wise.
    np.random.seed(0)
    res = calculate_insured_losses(frequency, sev)
    np.random.seed(0)
    res_loop = calculate_insured_losses_loop(frequency, sev)
    print(np.all(res == res_loop))
    # True
    
    # Benchmark (IPython %timeit magic — run inside IPython/Jupyter)
    %timeit calculate_insured_losses(frequency, sev)
    # 32.4 µs ± 220 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
    %timeit calculate_insured_losses_loop(frequency, sev)
    # 383 µs ± 11.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)