Search code examples
pythonpandasdataframefinancenumba

How to resample OHLC dataframe in Python without peeking into the future?


I have a Pandas DataFrame with one second frequency datetime index and columns 'Open', 'High', 'Low', 'Close' which represent prices for a financial instrument.

I want to resample this DataFrame to 15 min (or any frequency) but without peeking into the future and still keeping the original DataFrame with one second frequency but adding four new columns for each candle. The goal is to represent how candles form in real time.

For example, for a 15 min candle, I would have four new columns in the original DataFrame named 'Open_15m', 'High_15m', 'Low_15m', 'Close_15m' which would update the values each second as a rolling OHLC.

Keep in mind that a 15 min candle can only start at hh:00:00 or hh:15:00 hh:30:00 or hh:45:00. This means if for example our DataFrame starts at time 09:00:00, we have rolling OHLC from 09:00:00 until 09:15:00 then we reset and start over as a new 15 min candle starts forming at 09:15:00.

I came up with the code to do this and I think its correct, but it is too slow for DataFrames with millions and millions of rows. If the code is correct, it would need to be sped up somehow by using Numpy & Numba for example.

# Function to find the nearest 15-minute floor
def nearest_quarter_hour(timestamp):
    return timestamp.floor('15T')

# Find the nearest 15-minute floor for each timestamp
df['15_min_floor'] = df.index.map(nearest_quarter_hour)

# Group by the nearest 15-minute floor and calculate rolling OHLC
rolling_df = df.groupby('15_min_floor').rolling(window='15T').agg({
    'Open': lambda x: x.iloc[0],  # First value in the window
    'High': 'max',
    'Low': 'min',
    'Close': lambda x: x.iloc[-1]  # Last value in the window
}).reset_index(level=0, drop=True)

# add _15 to each column rolling df
rolling_df.columns = [f'{col}_15' for col in rolling_df.columns]

# Merge with original DataFrame
result_df = pd.concat([df, rolling_df], axis=1)

Solution

  • Here is version that computes OHLC your way which is significantly faster:

    from numba import njit
    
    @njit
    def compute_ohlc(floor_15_min, O, H, L, C, O_out, H_out, L_out, C_out):
        first, curr_max, curr_min, last = O[0], H[0], L[0], C[0]
    
        last_v = floor_15_min[0]
        for i, v in enumerate(floor_15_min):
            if v != last_v:
                first, curr_max, curr_min, last = O[i], H[i], L[i], C[i]
                last_v = v
            else:
                curr_max = max(curr_max, H[i])
                curr_min = min(curr_min, L[i])
                last = C[i]
    
            O_out[i] = first
            H_out[i] = curr_max
            L_out[i] = curr_min
            C_out[i] = last
    
    
    def compute_numba(df):
        df["15_min_floor_2"] = df.index.floor("15 min")
        df[["Open_15_2", "High_15_2", "Low_15_2", "Close_15_2"]] = np.nan
    
        compute_ohlc(
            df["15_min_floor_2"].values,
            df["Open"].values,
            df["High"].values,
            df["Low"].values,
            df["Close"].values,
            df["Open_15_2"].values,
            df["High_15_2"].values,
            df["Low_15_2"].values,
            df["Close_15_2"].values,
        )
    
    compute_numba(df)
    

    Benchmark with random df with 432001 rows:

    from timeit import timeit
    
    import pandas as pd
    from numba import njit
    
    
    # generate some random data:
    
    np.random.seed(42)
    
    idx = pd.date_range("1-1-2023", "1-6-2023", freq="1000ms")
    df = pd.DataFrame(
        {
            "Open": 50 + np.random.random(len(idx)) * 100,
            "High": 50 + np.random.random(len(idx)) * 100,
            "Low": 50 + np.random.random(len(idx)) * 100,
            "Close": 50 + np.random.random(len(idx)) * 100,
        },
        index=idx,
    )
    
    
    def get_result_df(df):
        def nearest_quarter_hour(timestamp):
            return timestamp.floor("15min")
    
        # Find the nearest 15-minute floor for each timestamp
        df["15_min_floor"] = df.index.map(nearest_quarter_hour)
    
        # Group by the nearest 15-minute floor and calculate rolling OHLC
        rolling_df = (
            df.groupby("15_min_floor")
            .rolling(window="15min")
            .agg(
                {
                    "Open": lambda x: x.iloc[0],  # First value in the window
                    "High": "max",
                    "Low": "min",
                    "Close": lambda x: x.iloc[-1],  # Last value in the window
                }
            )
            .reset_index(level=0, drop=True)
        )
    
        # add _15 to each column rolling df
        rolling_df.columns = [f"{col}_15" for col in rolling_df.columns]
    
        # Merge with original DataFrame
        result_df = pd.concat([df, rolling_df], axis=1)
    
        return result_df
    
    
    @njit
    def compute_ohlc(floor_15_min, O, H, L, C, O_out, H_out, L_out, C_out):
        first, curr_max, curr_min, last = O[0], H[0], L[0], C[0]
    
        last_v = floor_15_min[0]
        for i, v in enumerate(floor_15_min):
            if v != last_v:
                first, curr_max, curr_min, last = O[i], H[i], L[i], C[i]
                last_v = v
            else:
                curr_max = max(curr_max, H[i])
                curr_min = min(curr_min, L[i])
                last = C[i]
    
            O_out[i] = first
            H_out[i] = curr_max
            L_out[i] = curr_min
            C_out[i] = last
    
    
    def compute_numba(df):
        df["15_min_floor_2"] = df.index.floor("15 min")
        df[["Open_15_2", "High_15_2", "Low_15_2", "Close_15_2"]] = np.nan
    
        compute_ohlc(
            df["15_min_floor_2"].values,
            df["Open"].values,
            df["High"].values,
            df["Low"].values,
            df["Close"].values,
            df["Open_15_2"].values,
            df["High_15_2"].values,
            df["Low_15_2"].values,
            df["Close_15_2"].values,
        )
    
    
    t1 = timeit("get_result_df(df)", number=1, globals=globals())
    t2 = timeit("compute_numba(df)", number=1, globals=globals())
    
    print(f"Time normal = {t1}")
    print(f"Time numba =  {t2}")
    

    Prints on my computer AMD 5700x (432001 rows):

    Time normal = 29.57983471499756
    Time numba =  0.2751060768496245
    

    With dataframe pd.date_range("1-1-2004", "1-1-2024", freq="1000ms") (~631 millions of rows) the result is:

    Time numba =  11.551695882808417