I have an n_years by n_repeats array of count data.
For each element (e), I want to draw from a loss severity array e times and take the sum of the draws.
Below is the best I can do so far. It's barely faster than two nested for
loops in python. In my real use case, my array is 100,000 by 1,000.
Does anyone have any ideas how this can be done using pure numpy?
# 15 years x 3 repeats of event counts; the first nine years have no events.
frequency = np.array(
    [[0, 0, 0]] * 9
    + [
        [0, 0, 1],
        [1, 2, 1],
        [1, 2, 1],
        [2, 4, 2],
        [2, 4, 2],
        [3, 5, 2],
    ]
)

# Pool of individual loss severities to sample from (with replacement).
sev = np.array([1, 1, 2, 2, 1, 2, 3, 4, 5, 1, 1, 2])
def calculate_insured_losses(frequency, severity_array):
    """For every count in `frequency`, draw that many severities (with
    replacement) from `severity_array` and sum them; a zero count yields
    a zero loss. Returns an array with the same shape as `frequency`."""

    def yearly_loss(count, severity_array=severity_array):
        if count == 0:
            return 0
        draws = np.random.choice(severity_array, size=count, replace=True)
        return draws.sum()

    # np.vectorize keeps the original per-element evaluation semantics
    # (including its extra first call used for output-dtype discovery).
    flat_losses = np.vectorize(yearly_loss)(frequency.ravel())
    return flat_losses.reshape(frequency.shape)
calculate_insured_losses(frequency, sev)
291 µs ± 10.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
EDIT: Simpler code with nested loops
def calculate_insured_losses(frequency, severity):
    """Plain-loop reference: sum `count` random severities per cell.

    Cells with a zero count contribute zero. Returns a float array
    (np.empty default dtype) shaped like `frequency`.
    """

    def cell_loss(count, severity_array=severity):
        # Guard clause instead of if/else expression.
        if count == 0:
            return 0
        return np.random.choice(severity_array, size=count, replace=True).sum()

    n_years, n_repeats = frequency.shape
    losses = np.empty(shape=(n_years, n_repeats))
    # np.ndindex walks in row-major order — same visit order as the
    # original nested year/repeat loops, so random draws match exactly.
    for year, repeat in np.ndindex(n_years, n_repeats):
        losses[year, repeat] = cell_loss(frequency[year, repeat])
    return losses
calculate_insured_losses(frequency, sev)
You can do that faster like this:
import numpy as np
def calculate_insured_losses(frequency, severity_array):
    """Vectorised insured-loss simulation.

    For each count in `frequency`, draws that many severities (with
    replacement) from `severity_array` and sums them; cells with a zero
    count yield a zero loss.

    Parameters
    ----------
    frequency : ndarray of int
        Event counts (any shape).
    severity_array : ndarray
        Pool of individual loss severities to sample from.

    Returns
    -------
    ndarray
        Summed losses, same shape as `frequency`.
    """
    # Flattened frequencies table
    r = frequency.ravel()
    if r.size == 0:
        # No cells at all: nothing to draw.
        return np.zeros(frequency.shape, dtype=severity_array.dtype)
    # Accumulate: rcum[-1] is the total number of samples needed
    rcum = np.cumsum(r)
    # Take all random samples at once
    c = np.random.choice(severity_array, rcum[-1], replace=True)
    # BUGFIX: pad one zero so segment starts equal to len(c) — which occur
    # whenever trailing counts are zero (and for an all-zero table) — stay
    # in bounds for reduceat; the pad only adds 0 to the final segment sum.
    # Without it, np.add.reduceat raises IndexError on such inputs.
    c = np.append(c, 0)
    # Sum segments: each cell's segment starts at rcum - r
    res = np.add.reduceat(c, rcum - r)
    # Zero-count cells got a spurious single element from reduceat; mask them
    res *= r.astype(bool)
    # Return reshaped result
    return res.reshape(frequency.shape)
# Reference implementation kept for the benchmark comparison below.
def calculate_insured_losses_loop(frequency, severity_array):
    """Per-element sampling via np.vectorize (the question's approach)."""

    def yearly_loss(element, severity_array=severity_array):
        if element == 0:
            return 0
        return np.random.choice(severity_array, size=element, replace=True).sum()

    flat_losses = np.vectorize(yearly_loss)(frequency.flatten())
    return flat_losses.reshape(frequency.shape)
# Test
# Same fixture as in the question: 15 years x 3 repeats of event counts.
frequency = np.array(
[
[0, 0, 0],
[0, 0, 0],
[0, 0, 0],
[0, 0, 0],
[0, 0, 0],
[0, 0, 0],
[0, 0, 0],
[0, 0, 0],
[0, 0, 0],
[0, 0, 1],
[1, 2, 1],
[1, 2, 1],
[2, 4, 2],
[2, 4, 2],
[3, 5, 2],
]
)
# Pool of individual loss severities to sample from.
sev = np.array([1, 1, 2, 2, 1, 2, 3, 4, 5, 1, 1, 2])
# Check results from functions match
# Reseeding before each call makes both implementations consume the identical
# random stream, so their outputs can be compared element-wise.
np.random.seed(0)
res = calculate_insured_losses(frequency, sev)
np.random.seed(0)
res_loop = calculate_insured_losses_loop(frequency, sev)
print(np.all(res == res_loop))
# True
# Benchmark
# NOTE: %timeit is an IPython magic, not plain Python — run in IPython/Jupyter.
%timeit calculate_insured_losses(frequency, sev)
# 32.4 µs ± 220 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
%timeit calculate_insured_losses_loop(frequency, sev)
# 383 µs ± 11.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)