Suppose I have a Pandas Series s
whose values sum to 1
and whose values are also all greater than or equal to 0
. I need to subtract a constant from all values such that the sum of the new Series is equal to 0.6
. The catch is, when I subtract this constant, the values never end up less than zero.
In math formula, assume I have a series of x
's and I want to find k
import pandas as pd
import numpy as np
from string import ascii_uppercase
np.random.seed([3, 141592653])
s = np.power(
1000, pd.Series(
np.random.rand(10),
list(ascii_uppercase[:10])
)
).pipe(lambda s: s / s.sum())
s
A 0.001352
B 0.163135
C 0.088365
D 0.010904
E 0.007615
F 0.407947
G 0.005856
H 0.198381
I 0.027455
J 0.088989
dtype: float64
The sum is 1
s.sum()
0.99999999999999989
I can use Newton's method (among others) found in Scipy's optimize
module
from scipy.optimize import newton
def f(k):
return s.sub(k).clip(0).sum() - .6
Finding the root of this function will give me the k
I need
initial_guess = .1
k = newton(f, x0=initial_guess)
Then subtract this from s
new_s = s.sub(k).clip(0)
new_s
A 0.000000
B 0.093772
C 0.019002
D 0.000000
E 0.000000
F 0.338583
G 0.000000
H 0.129017
I 0.000000
J 0.019626
dtype: float64
And the new sum is
new_s.sum()
0.60000000000000009
Can we find k
without resorting to using a solver?
Updated: Three different implementations - interestingly, the least sophisticated scales best.
import numpy as np
def f_sort(A, target=0.6):
B = np.sort(A)
C = np.cumsum(np.r_[B[0], np.diff(B)] * np.arange(N, 0, -1))
idx = np.searchsorted(C, 1 - target)
return B[idx] + (1 - target - C[idx]) / (N-idx)
def f_partition(A, target=0.6):
target, l = 1 - target, len(A)
while len(A) > 1:
m = len(A) // 2
A = np.partition(A, m-1)
ls = A[:m].sum()
if ls + A[m-1] * (l-m) > target:
A = A[:m]
else:
l -= m
target -= ls
A = A[m:]
return target / l
def f_direct(A, target=0.6):
target = 1 - target
while True:
gt = A > target / len(A)
if np.all(gt):
return target / len(A)
target -= A[~gt].sum()
A = A[gt]
N = 10
A = np.random.random(N)
A /= A.sum()
print(f_sort(A), np.clip(A-f_sort(A), 0, None).sum())
print(f_partition(A), np.clip(A-f_partition(A), 0, None).sum())
print(f_direct(A), np.clip(A-f_direct(A), 0, None).sum())
from timeit import timeit
kwds = dict(globals=globals(), number=1000)
N = 100000
A = np.random.random(N)
A /= A.sum()
print(timeit('f_sort(A)', **kwds))
print(timeit('f_partition(A)', **kwds))
print(timeit('f_direct(A)', **kwds))
Sample run:
0.04813686999999732 0.5999999999999999
0.048136869999997306 0.6000000000000001
0.048136869999997306 0.6000000000000001
8.38109541599988
2.1064437470049597
1.2743922089866828