I have a dataframe
# Create a sample dataframe
df = pd.DataFrame({'num_posts': [4, 6, 3, 9, 1, 14, 2, 5, 7, 2,12,1,2,3],
'date' : ['2020-03-01', '2020-01-02', '2020-01-03',
'2020-01-04', '2019-01-05', '2019-01-06',
'2020-01-07', '2020-01-08', '2020-01-09',
'2020-01-10','2020-01-11','2019-01-10','2019-01-08','2019-01-09']})
cols=['num_posts']
Now i want to create some features such as prev_week_of_last_year, prev_month_of_last_year I am trying this but i am not getting right output
def create_calendar_columns(df,dt):
df[dt]=pd.to_datetime(df[dt])
df['day_number']= pd.to_datetime(df[dt]).dt.dayofyear
df['week_number']= pd.to_datetime(df[dt]).dt.isocalendar().week
df['year']= pd.to_datetime(df[dt]).dt.year
return df
def calculate_curr_week_year_avg(df,cols,dt):
df=create_calendar_columns(df,dt)
for col in cols:
df['week_avg_'+str(col)]= df.groupby(['year','week_number'])[col].transform(np.mean)
df['year_avg_'+str(col)]= df.groupby(['year'])[col].transform(np.mean)
return df
def calculate_prev_1w_curr_year(df,cols,dt):
for col in cols:
df['prev_1w_curr_year_'+str(col)]=df[(df['week_number']==(df['week_number']-1)) & (df['year']==df['year'])]['week_avg_'+str(col)]
return df
df=calculate_curr_week_year_avg(df,cols,'date')
df=calculate_prev_1w_curr_year(df,cols,'date')
After i call calculate_curr_week_year_avg(df,cols,'date'), i got a dataframe like this
num_posts date day_number week_number year week_avg_num_posts \
0 4 2020-03-01 61 9 2020 4.000
1 6 2020-01-02 2 1 2020 6.000
2 3 2020-01-03 3 1 2020 6.000
3 9 2020-01-04 4 1 2020 6.000
4 1 2019-01-05 5 1 2019 7.500
5 14 2019-01-06 6 1 2019 7.500
6 2 2020-01-07 7 2 2020 5.600
7 5 2020-01-08 8 2 2020 5.600
8 7 2020-01-09 9 2 2020 5.600
9 2 2020-01-10 10 2 2020 5.600
10 12 2020-01-11 11 2 2020 5.600
11 1 2019-01-10 10 2 2019 2.000
12 2 2019-01-08 8 2 2019 2.000
13 3 2019-01-09 9 2 2019 2.000
year_avg_num_posts
0 5.556
1 5.556
2 5.556
3 5.556
4 4.200
5 4.200
6 5.556
7 5.556
8 5.556
9 5.556
10 5.556
11 4.200
12 4.200
13 4.200
Now when i do the following, it gives me Nan
df['prev_1w_curr_year_'+str(col)]=df[(df['week_number']==(df['week_number']-1)) & (df['year']==df['year'])]['week_avg_'+str(col)]
I hope I've understood your question right:
df["prev_1w_curr_year_num_posts"] = df.apply(
lambda x: df.loc[(df['week_number'] == x["week_number"] - 1) & (df['year'] == x['year']), 'num_posts'].mean(),
axis=1,
)
df["prev_week_of_last_year_num_posts"] = df.apply(
lambda x: df.loc[(df['week_number'] == x["week_number"] - 1) & (df['year'] == x['year'] - 1), 'num_posts'].mean(),
axis=1,
)
Prints:
num_posts date day_number week_number year week_avg_num_posts year_avg_num_posts prev_1w_curr_year_num_posts prev_week_of_last_year_num_posts
0 4 2020-03-01 61 9 2020 4.0 5.555556 NaN NaN
1 6 2020-01-02 2 1 2020 6.0 5.555556 NaN NaN
2 3 2020-01-03 3 1 2020 6.0 5.555556 NaN NaN
3 9 2020-01-04 4 1 2020 6.0 5.555556 NaN NaN
4 1 2019-01-05 5 1 2019 7.5 4.200000 NaN NaN
5 14 2019-01-06 6 1 2019 7.5 4.200000 NaN NaN
6 2 2020-01-07 7 2 2020 5.6 5.555556 6.0 7.5
7 5 2020-01-08 8 2 2020 5.6 5.555556 6.0 7.5
8 7 2020-01-09 9 2 2020 5.6 5.555556 6.0 7.5
9 2 2020-01-10 10 2 2020 5.6 5.555556 6.0 7.5
10 12 2020-01-11 11 2 2020 5.6 5.555556 6.0 7.5
11 1 2019-01-10 10 2 2019 2.0 4.200000 7.5 NaN
12 2 2019-01-08 8 2 2019 2.0 4.200000 7.5 NaN
13 3 2019-01-09 9 2 2019 2.0 4.200000 7.5 NaN