Search code examples
pythonpandasnumpyscipyscipy-optimize

How to calculate daily weights which satisfy certain conditions


I have the following pandas dataframe which represents the consumption of 7 days (day_0 is today, day_-1 is yesterday etc) of 10 people (ids):

import pandas as pd
import numpy as np

df = pd.DataFrame(np.random.randint(8, 15, size=(10, 7)))
df.columns = ['day_0', 'day_-1', 'day_-2', 'day_-3', 'day_-4', 'day_-5', 'day_-6']
df.index.name = 'id'

print(df.reset_index())

   id  day_0  day_-1  day_-2  day_-3  day_-4  day_-5  day_-6
0   0    10      10      14       8      14      14      14
1   1    10      13      11      11       8      10      10
2   2    10      12       9      12       9      10      10
3   3    12      12       9      11       9      12      13
4   4    12      13       8      12       8      11       9
5   5    13       9       8      13       9      12      10
6   6     8       9       8      14       8      13      14
7   7    13      10      14      12       8       9      11
8   8     8       8      10      12      11      14      14
9   9    14      13      13       9      11      14      13

I would like to find daily weights (so in total 7 weights: w_0, w_-1, w_-2, w_-3, w_-4, w_-5, w_-6) which need to have the following properties:

  1. w_0 > w_-1 > w_-2 > ... > w_-6 > 0
  2. w_0 + w_-1 + w_-2 + ... + w_-6 = 7
  3. the weighted average for exactly 7 out of 10 ids to be below a threshold (e.g. 11)

I can achieve prerequisites 1 & 2 by using the exponential decay function and later normalizing:

import numpy as np

n = 7

_lambda = 0.5

# Calculate the weights using exponential decay
weights = np.exp(-_lambda * np.arange(n))

# Normalize the weights so that their sum is equal to the length of the time series
weights *= n / np.sum(weights)

But I don't know how I could apply also prerequisite 3.

Is that possible? How can I do that in python?


Solution

  • This does not use exponential decay, because that doesn't seem particularly useful to meet your requirements. Define an ILP with a disjunction constraint that at exactly one combination of n out of m IDs has a weighted mean less than or equal to a threshold:

    import io
    
    import numpy as np
    import pandas as pd
    import scipy.sparse
    from scipy.optimize import milp, Bounds, LinearConstraint
    
    with io.StringIO(
    '''id,  0, -1, -2, -3, -4, -5, -6
        0, 10, 10, 14,  8, 14, 14, 14
        1, 10, 13, 11, 11,  8, 10, 10
        2, 10, 12,  9, 12,  9, 10, 10
        3, 12, 12,  9, 11,  9, 12, 13
        4, 12, 13,  8, 12,  8, 11,  9
        5, 13,  9,  8, 13,  9, 12, 10
        6,  8,  9,  8, 14,  8, 13, 14
        7, 13, 10, 14, 12,  8,  9, 11
        8,  8,  8, 10, 12, 11, 14, 14
        9, 14, 13, 13,  9, 11, 14, 13
    ''') as f:
        df = pd.read_csv(f, skipinitialspace=True, index_col=0)
    df.columns = pd.Index(name='day', data=df.columns.astype(int))
    
    
    m, n = df.shape  # number of IDs, days
    
    '''
    LP variables:
    n weights
    m weighted mean threshold binary predicates
    '''
    
    # The weight sum must be equal to n
    sum_constraint = LinearConstraint(
        A=np.concatenate((
            np.ones(n), np.zeros(m),
        )),
        lb=n, ub=n,
    )
    
    # The weights must be strictly decreasing by this amount
    min_decrease = 1e-2  # chosen fully arbitrarily
    antimonotonic_constraint = LinearConstraint(
        A=scipy.sparse.diags_array(
            (
                np.ones(shape=n - 1),
                np.full(shape=n - 1, fill_value=-1),
            ),
            offsets=(0, 1), shape=(n - 1, n + m), format='csc',
        ),
        lb=min_decrease,
    )
    
    '''
    For each binary threshold predicate,
    pred = 1 iff weights.df_values/n <= threshold
    lower bound:
    pred >= 1 - (weights.values)/n/threshold
    weights.values/threshold + pred*n >= n
    upper bound:
    pred <= 2 - (weights.values)/n/threshold
    weights.values/threshold + pred*n <= 2*n
    '''
    threshold = 11
    mean_constraint = LinearConstraint(
        A=scipy.sparse.hstack(
            (
                df.values/threshold,
                scipy.sparse.diags_array(
                    np.full(shape=m, fill_value=n),
                ),
            ),
            format='csc',
        ),
        lb=n, ub=2*n,
    )
    
    # Exactly this many out of m IDs must be at the threshold or lower
    n_up_to_threshold = 6
    disjunction_constraint = LinearConstraint(
        A=np.concatenate((np.zeros(n), np.ones(m))),
        lb=n_up_to_threshold, ub=n_up_to_threshold,
    )
    
    result = milp(
        c=np.zeros(n + m),  # no optimisation objective
        integrality=np.concatenate((
            np.zeros(shape=n, dtype=np.uint8),  # weights are continuous
            np.ones(shape=m, dtype=np.uint8),   # predicates are binary
        )),
        bounds=Bounds(
            lb=np.concatenate((
                np.full(shape=n, fill_value=1e-2),  # minimum weight, arbitrary
                np.zeros(shape=m),  # binary predicate
            )),
            ub=np.concatenate((
                np.full(shape=n, fill_value=np.inf),
                np.ones(shape=m),  # binary predicate
            )),
        ),
        constraints=(
            sum_constraint,
            antimonotonic_constraint,
            mean_constraint,
            disjunction_constraint,
        ),
    )
    if not result.success:
        raise ValueError(result.message)
    
    weights, threshold_preds = np.split(result.x, (n,))
    means = df @ (weights/n)
    
    print('weights =')
    print(weights)
    print('threshold predicates =')
    print(threshold_preds)
    print('means =')
    print(means)
    
    weights =
    [2.96714286 1.01571429 1.00571429 0.99571429 0.98571429 0.02
     0.01      ]
    threshold predicates =
    [1. 1. 1. 0. 1. 0. 1. 0. 1. 0.]
    means =
    id
    0    10.870612
    1    10.439592
    2    10.290204
    3    11.005714
    4    11.000000
    5    11.130816
    6     9.021429
    7    11.847755
    8     9.304490
    9    12.576122
    dtype: float64
    

    Approximate solutions

    To turn the hard threshold constraint into a soft constraint where the threshold moves to accommodate the input system,

    import io
    
    import numpy as np
    import pandas as pd
    import scipy.sparse
    from scipy.optimize import milp, Bounds, LinearConstraint
    
    with io.StringIO(
    '''id,  0, -1, -2, -3, -4, -5, -6
        0, 10, 10, 14,  8, 14, 14, 14
        1, 10, 13, 11, 11,  8, 10, 10
        2, 10, 12,  9, 12,  9, 10, 10
        3, 12, 12,  9, 11,  9, 12, 13
        4, 12, 13,  8, 12,  8, 11,  9
        5, 13,  9,  8, 13,  9, 12, 10
        6,  8,  9,  8, 14,  8, 13, 14
        7, 13, 10, 14, 12,  8,  9, 11
        8,  8,  8, 10, 12, 11, 14, 14
        9, 14, 13, 13,  9, 11, 14, 13
    ''') as f:
        df = pd.read_csv(f, skipinitialspace=True, index_col=0)
    df.columns = pd.Index(name='day', data=df.columns.astype(int))
    
    
    m, n = df.shape  # number of IDs, days
    
    '''
    LP variables:
    n weights
    m weighted mean threshold binary predicates
    1 threshold
    1 threshold error
    '''
    
    # The weight sum must be equal to n
    sum_constraint = LinearConstraint(
        A=np.concatenate((
            np.ones(n),           # weights
            np.zeros(m + 1 + 1),  # preds, threshold, error
        )),
        lb=n, ub=n,
    )
    
    # The weights must be strictly decreasing by this amount
    min_decrease = 1e-3  # chosen fully arbitrarily
    antimonotonic_constraint = LinearConstraint(
        A=scipy.sparse.diags_array(
            (
                np.ones(shape=n - 1),                 # current weight
                np.full(shape=n - 1, fill_value=-1),  # next weight
            ),
            offsets=(0, 1),                # current and next weight index
            shape=(n - 1, n + m + 1 + 1),  # after the weight positions, everything is zero
            format='csc',
        ),
        lb=min_decrease,
    )
    
    '''
    For each binary threshold predicate,
    pred = 1 iff weights.df_values/n <= threshold
    Upper bound (forces predicate to 0):
    weights.df_values + pM - n*threshold <= M
    Lower bound (forces predicate to 1):
    weights.df_values + pM - n*threshold >= 0
    '''
    M = 2*df.sum(axis=1).max()
    threshold_constraint = LinearConstraint(
        A=scipy.sparse.hstack(
            (
                df.values,                 # weights
                scipy.sparse.diags_array(  # predicates
                    np.full(shape=m, fill_value=M),
                ),
                np.full(shape=(m, 1), fill_value=-n),  # threshold
                scipy.sparse.csc_array((m, 1)),        # error (0)
            ),
            format='csc',
        ),
        lb=0, ub=M,
    )
    
    # Exactly this many out of m IDs must be at the threshold or lower
    n_up_to_threshold = 5
    count_constraint = LinearConstraint(
        A=np.concatenate((
            np.zeros(n),      # weights
            np.ones(m),       # predicates
            np.zeros(1 + 1),  # threshold, error
        )),
        lb=n_up_to_threshold, ub=n_up_to_threshold,
    )
    
    '''
    The absolute error must be relative to the threshold
    error >= target - threshold    threshold + error >= target
    error >= threshold - target    threshold - error <= target
    '''
    target_threshold = 12
    error_constraint = LinearConstraint(
        A=scipy.sparse.hstack(
            (
                scipy.sparse.csc_array((2, n + m)),  # weights, predicates
                np.array((
                    (1, +1),  # threshold, error
                    (1, -1),
                )),
            ),
            format='csc',
        ),
        lb=(target_threshold, -np.inf),
        ub=(np.inf, target_threshold),
    )
    
    result = milp(
        c=np.concatenate((
            np.zeros(n + m + 1),  # weights, preds, threshold are non-optimised
            np.ones(1),           # threshold error is minimised
        )),
        integrality=np.concatenate((
            np.zeros(shape=n, dtype=np.uint8),  # weights are continuous
            np.ones(shape=m, dtype=np.uint8),   # predicates are binary
            (0, 0),                             # threshold, error are continuous
        )),
        bounds=Bounds(
            lb=np.concatenate((
                np.full(shape=n, fill_value=1e-3),  # minimum weight, arbitrary
                np.zeros(shape=m),                  # binary predicate
                (-np.inf, -np.inf),                 # threshold, error unbounded
            )),
            ub=np.concatenate((
                np.full(shape=n, fill_value=np.inf),  # weight unbounded
                np.ones(shape=m),                     # binary predicate
                (np.inf, np.inf),                     # threshold, error unbounded
            )),
        ),
        constraints=(
            sum_constraint,
            antimonotonic_constraint,
            threshold_constraint,
            count_constraint,
            error_constraint,
        ),
    )
    if not result.success:
        raise ValueError(result.message)
    
    weights, preds, (threshold,), (error,) = np.split(result.x, (n, n+m, n+m+1))
    means = df @ (weights/n)
    
    print(f'threshold = {threshold:.3f}')
    print(f'error from target of {target_threshold} = {error:.6f}')
    print('weights =')
    print(weights)
    print('threshold predicates =')
    print(preds)
    print('means =')
    print(means.values)
    
    threshold = 11.996
    error from target of 12 = 0.003857
    weights =
    [6.975e+00 1.000e-02 5.000e-03 4.000e-03 3.000e-03 2.000e-03 1.000e-03]
    threshold predicates =
    [1. 1. 1. 0. 0. 0. 1. 0. 1. 0.]
    means =
    [10.00514286 10.00471429 10.00285714 11.99614286 11.99614286 12.98828571
      8.00714286 12.99228571  8.00757143 13.99357143]