python pandas dataframe rolling-computation

Call function on pandas df with lagged values calculated in the previous row/loop

I am calling a function rowise on a pandas data frame using lagged values (for Q and S) that were calculated for the previous row. The first row already has values for Q and S so it starts on the second row. It works fine in a for loop but the df I'm ultimately applying it too has over 3000 rows so I need something faster.

I've contemplated df.shift(-1), rolling.apply() and vectorising but nothing I've tried works.

import time
import pandas as pd
import math

def myfunc(Eo, P, Smax, Sprev, Qprev):
  print("i = ", i)
  print("Qprev = ", Qprev)
  S = Sprev + Eo * math.exp(-1 * Sprev/Smax) - P + Qprev
  Q = P + S
  print("Q = ", Q)
  return S, Q

data = {'peti': {0: 0.1960418075323104, 1: 0.5796640515327454, 2: 0.737823486328125, 3: 0.222676545381546, 4: 0.8804306983947754}, 'tas': {0: 281.0088195800781, 1: 277.112060546875, 2: 273.7044372558594, 3: 277.48309326171875, 4: 279.4878845214844}, 'precip': {0: 0.0, 1: 0.0, 2: 1.5046296539367177e-05, 3: 0.0002500000118743, 4: 4.6296295295178425e-06}, 'year': {0: 2008, 1: 2008, 2: 2008, 3: 2008, 4: 2008}, 'row_id': {0: 0, 1: 1, 2: 2, 3: 3, 4: 4}, 'S': {0: 90.9, 1: "nan", 2: "nan", 3: "nan", 4: "nan"}, 'Q': {0: 0.0, 1: "nan", 2: "nan", 3: "nan", 4: "nan"}}
    
df = pd.DataFrame.from_dict(data)

smaxval = 100


start_time = time.time()

for i in df.index[1:len(df)]: #' start on second row
  df.loc[i,["S","Q"]] = myfunc(
    df.peti[i],
      df.precip[i],
        smax_val,
          df.S[i-1],
            df.Q[i-1])
            
print("--- %s seconds ---" % (time.time() - start_time))

Solution

On my machine, your code runs in 0.004 second on average for 5,000 iterations:

N = 5_000
smax_val = 100
df = pd.DataFrame.from_dict(data)
times = []

for _ in range(N):
    start_time = time.time()
    for i in df.index[1 : len(df)]:  #' start on second row
        df.loc[i, ["S", "Q"]] = myfunc(
            df.peti[i], df.precip[i], smax_val, df.S[i - 1], df.Q[i - 1]
        )
    times.append(time.time() - start_time)

print(f"--- {round(np.mean(times), 3)} second(s) on average for {N} runs ---")
print(df)

--- 0.004 seconds on average for 5000 runs ---
       peti         tas    precip  year  row_id           S           Q
0  0.196042  281.008820  0.000000  2008       0   90.900000    0.000000
1  0.579664  277.112061  0.000000  2008       1   91.133562   91.133562
2  0.737823  273.704437  0.000015  2008       2  182.563705  182.563720
3  0.222677  277.483093  0.000250  2008       3  365.163051  365.163301
4  0.880431  279.487885  0.000005  2008       4  730.349194  730.349199

One way to speed things up (4x on average on my machine) is to make computations outside of Pandas and add results back in with Pandas concat:

N = 5_000
smax_val = 100
df = pd.DataFrame.from_dict(data)
times = []

for _ in range(N):
    start_time = time.time()
    vals = [[90.9, 0.0]]
    S = 90.9
    Q = 0.0
    for i, (x, y) in enumerate(zip(df.loc[1:, "peti"], df.loc[1:, "precip"])):
        S, Q = myfunc(x, y, smax_val, S, Q)
        vals.append([S, Q])
    df = pd.concat(
        [df.drop(columns=["S", "Q"]), pd.DataFrame(vals, columns=["S", "Q"])], axis=1
    )
    times.append(time.time() - start_time)

print(f"--- {round(np.mean(times), 3)} second(s) on average for {N} runs ---")
print(df)

--- 0.001 seconds on average for 5000 runs ---
       peti         tas    precip  year  row_id           S           Q
0  0.196042  281.008820  0.000000  2008       0   90.900000    0.000000
1  0.579664  277.112061  0.000000  2008       1   91.133562   91.133562
2  0.737823  273.704437  0.000015  2008       2  182.563705  182.563720
3  0.222677  277.483093  0.000250  2008       3  365.163051  365.163301
4  0.880431  279.487885  0.000005  2008       4  730.349194  730.349199