Search code examples
pythonpandasdataframedatetimefeature-engineering

Insert value in new column after comapring the same dataframe with itself


I have a dataframe

# Create a sample dataframe
df = pd.DataFrame({'num_posts': [4, 6, 3, 9, 1, 14, 2, 5, 7, 2,12,1,2,3],
                   'date' : ['2020-03-01', '2020-01-02', '2020-01-03', 
                            '2020-01-04', '2019-01-05', '2019-01-06', 
                            '2020-01-07', '2020-01-08', '2020-01-09', 
                            '2020-01-10','2020-01-11','2019-01-10','2019-01-08','2019-01-09']})
cols=['num_posts']  

Now i want to create some features such as prev_week_of_last_year, prev_month_of_last_year I am trying this but i am not getting right output

def create_calendar_columns(df,dt):
    df[dt]=pd.to_datetime(df[dt])
    df['day_number']= pd.to_datetime(df[dt]).dt.dayofyear
    df['week_number']= pd.to_datetime(df[dt]).dt.isocalendar().week
    df['year']= pd.to_datetime(df[dt]).dt.year
    return df
    
def calculate_curr_week_year_avg(df,cols,dt):
    df=create_calendar_columns(df,dt)
    for col in cols:
        df['week_avg_'+str(col)]= df.groupby(['year','week_number'])[col].transform(np.mean)
        df['year_avg_'+str(col)]= df.groupby(['year'])[col].transform(np.mean)
    return df
def calculate_prev_1w_curr_year(df,cols,dt):
    for col in cols:
        df['prev_1w_curr_year_'+str(col)]=df[(df['week_number']==(df['week_number']-1)) & (df['year']==df['year'])]['week_avg_'+str(col)]
    return df
    
df=calculate_curr_week_year_avg(df,cols,'date')    
df=calculate_prev_1w_curr_year(df,cols,'date')        
    

After i call calculate_curr_week_year_avg(df,cols,'date'), i got a dataframe like this

    num_posts       date  day_number  week_number  year  week_avg_num_posts  \
0           4 2020-03-01          61            9  2020               4.000   
1           6 2020-01-02           2            1  2020               6.000   
2           3 2020-01-03           3            1  2020               6.000   
3           9 2020-01-04           4            1  2020               6.000   
4           1 2019-01-05           5            1  2019               7.500   
5          14 2019-01-06           6            1  2019               7.500   
6           2 2020-01-07           7            2  2020               5.600   
7           5 2020-01-08           8            2  2020               5.600   
8           7 2020-01-09           9            2  2020               5.600   
9           2 2020-01-10          10            2  2020               5.600   
10         12 2020-01-11          11            2  2020               5.600   
11          1 2019-01-10          10            2  2019               2.000   
12          2 2019-01-08           8            2  2019               2.000   
13          3 2019-01-09           9            2  2019               2.000   

    year_avg_num_posts  
0                5.556  
1                5.556  
2                5.556  
3                5.556  
4                4.200  
5                4.200  
6                5.556  
7                5.556  
8                5.556  
9                5.556  
10               5.556  
11               4.200  
12               4.200  
13               4.200 

Now when i do the following, it gives me Nan

df['prev_1w_curr_year_'+str(col)]=df[(df['week_number']==(df['week_number']-1)) & (df['year']==df['year'])]['week_avg_'+str(col)]

Solution

  • I hope I've understood your question right:

    df["prev_1w_curr_year_num_posts"] = df.apply(
        lambda x: df.loc[(df['week_number'] == x["week_number"] - 1) & (df['year'] == x['year']), 'num_posts'].mean(),
        axis=1,
    )
    
    df["prev_week_of_last_year_num_posts"] = df.apply(
        lambda x: df.loc[(df['week_number'] == x["week_number"] - 1) & (df['year'] == x['year'] - 1), 'num_posts'].mean(),
        axis=1,
    )
    

    Prints:

        num_posts       date  day_number  week_number  year  week_avg_num_posts  year_avg_num_posts  prev_1w_curr_year_num_posts  prev_week_of_last_year_num_posts
    0           4 2020-03-01          61            9  2020                 4.0            5.555556                          NaN                               NaN
    1           6 2020-01-02           2            1  2020                 6.0            5.555556                          NaN                               NaN
    2           3 2020-01-03           3            1  2020                 6.0            5.555556                          NaN                               NaN
    3           9 2020-01-04           4            1  2020                 6.0            5.555556                          NaN                               NaN
    4           1 2019-01-05           5            1  2019                 7.5            4.200000                          NaN                               NaN
    5          14 2019-01-06           6            1  2019                 7.5            4.200000                          NaN                               NaN
    6           2 2020-01-07           7            2  2020                 5.6            5.555556                          6.0                               7.5
    7           5 2020-01-08           8            2  2020                 5.6            5.555556                          6.0                               7.5
    8           7 2020-01-09           9            2  2020                 5.6            5.555556                          6.0                               7.5
    9           2 2020-01-10          10            2  2020                 5.6            5.555556                          6.0                               7.5
    10         12 2020-01-11          11            2  2020                 5.6            5.555556                          6.0                               7.5
    11          1 2019-01-10          10            2  2019                 2.0            4.200000                          7.5                               NaN
    12          2 2019-01-08           8            2  2019                 2.0            4.200000                          7.5                               NaN
    13          3 2019-01-09           9            2  2019                 2.0            4.200000                          7.5                               NaN