Search code examples
pythonpandasnumpystockohlc

Getting NaN when using pandas groupby


I have a dataframe like so:

         index        date symbol  stock_id  open  high     low  close    volume      vwap
0            0  2021-10-11    BVN        13  7.69  7.98  7.5600   7.61    879710  7.782174
1            1  2021-10-12    BVN        13  7.67  8.08  7.5803   8.02    794436  7.967061
2            2  2021-10-13    BVN        13  8.12  8.36  8.0900   8.16    716012  8.231286
3            3  2021-10-14    BVN        13  8.26  8.29  8.0500   8.28    586091  8.185899
4            4  2021-10-15    BVN        13  8.18  8.44  8.0600   8.44   1278409  8.284539
...        ...         ...    ...       ...   ...   ...     ...    ...       ...       ...
227774  227774  2022-10-04   ERIC     11000  6.27  6.32  6.2400   6.29  14655189  6.280157
227775  227775  2022-10-05   ERIC     11000  6.17  6.31  6.1500   6.29  10569193  6.219965
227776  227776  2022-10-06   ERIC     11000  6.20  6.25  6.1800   6.22   7918812  6.217198
227777  227777  2022-10-07   ERIC     11000  6.17  6.19  6.0800   6.10   9671252  6.135976
227778  227778  2022-10-10   ERIC     11000  6.13  6.15  6.0200   6.04   6310661  6.066256

[227779 rows x 10 columns]

And then a function to return a boolean mask on whether or not the df is consolidating inside of a range:

def is_consolidating(df, window=2, minp=2, percentage=0.95):

    rolling_min = pd.Series(df['close']).rolling(window=window, min_periods=minp).min()
    rolling_max = pd.Series(df['close']).rolling(window=window, min_periods=minp).max()
    consolidation = np.where( (rolling_min / rolling_max) >= percentage, True, False)

    return consolidation

Which I then call like:

df['t'] = df.groupby("stock_id").apply(is_consolidating)

The problem is when I print the df I am getting NaN for the values of my new column:

dan@danalgo:~/Documents/code/wolfhound$ python3 add_indicators_daily.py
                 index        date symbol  stock_id  open  high     low  close    volume      vwap    t
0            0  2021-10-11    BVN        13  7.69  7.98  7.5600   7.61    879710  7.782174  NaN
1            1  2021-10-12    BVN        13  7.67  8.08  7.5803   8.02    794436  7.967061  NaN
2            2  2021-10-13    BVN        13  8.12  8.36  8.0900   8.16    716012  8.231286  NaN
3            3  2021-10-14    BVN        13  8.26  8.29  8.0500   8.28    586091  8.185899  NaN
4            4  2021-10-15    BVN        13  8.18  8.44  8.0600   8.44   1278409  8.284539  NaN
...        ...         ...    ...       ...   ...   ...     ...    ...       ...       ...  ...
227774  227774  2022-10-04   ERIC     11000  6.27  6.32  6.2400   6.29  14655189  6.280157  NaN
227775  227775  2022-10-05   ERIC     11000  6.17  6.31  6.1500   6.29  10569193  6.219965  NaN
227776  227776  2022-10-06   ERIC     11000  6.20  6.25  6.1800   6.22   7918812  6.217198  NaN
227777  227777  2022-10-07   ERIC     11000  6.17  6.19  6.0800   6.10   9671252  6.135976  NaN
227778  227778  2022-10-10   ERIC     11000  6.13  6.15  6.0200   6.04   6310661  6.066256  NaN

[227779 rows x 11 columns]

Full code:

import pandas as pd
from IPython.display import display
import sqlite3 as sql
import numpy as np

conn = sql.connect('allStockData.db') 

# get everything inside daily_ohlc and add to a dataframe
df = pd.read_sql_query("SELECT * from daily_ohlc_init", conn)

def is_consolidating(df, window=2, minp=2, percentage=0.95):

    rolling_min = pd.Series(df['close']).rolling(window=window, min_periods=minp).min()
    rolling_max = pd.Series(df['close']).rolling(window=window, min_periods=minp).max()
    consolidation = np.where( (rolling_min / rolling_max) >= percentage, True, False)

    return consolidation

df['t'] = df.groupby("stock_id").apply(is_consolidating)

print(df)

df.to_sql('daily_ohlc_init_with_indicators', if_exists='replace', con=conn, index=True)

Solution

  • You could do it like this:

    def is_consolidating(grp, window=2, minp=2, percentage=0.95):
    
        rolling_min = pd.Series(grp).rolling(window=window, min_periods=minp).min()
        rolling_max = pd.Series(grp).rolling(window=window, min_periods=minp).max()
        consolidation = np.where( (rolling_min / rolling_max) >= percentage, True, False)
    
        return pd.Series(consolidation, index=grp.index)
    
    df['t'] = df.groupby("stock_id")['close'].apply(is_consolidating)
    print(df)
    

    Output (part of it):

              volume      vwap      t  
    0         879710  7.782174  False  
    1         794436  7.967061  False  
    2         716012  8.231286   True  
    3         586091  8.185899   True  
    4        1278409  8.284539   True  
    227774  14655189  6.280157  False  
    227775  10569193  6.219965   True  
    227776   7918812  6.217198   True  
    227777   9671252  6.135976   True  
    227778   6310661  6.066256   True