Search code examples
python pandas dataframe fillna

DataFrame combine_first does not work, and neither does fillna


The first dataframe is:

   data_date cookie_type   dau  next_dau  dau_7  dau_15
0   20181006    avg(0-d)  2288       NaN    NaN     NaN
1   20181006    avg(e-f)  2284       NaN    NaN     NaN
2   20181007    avg(e-f)  2296       100    NaN     NaN

The second dataframe is:

  data_date cookie_type  next_dau
0  20181006    avg(e-f)       908
1  20181006    avg(0-d)       904

How can I update the first dataframe's next_dau from the second one? I have tried combine_first and fillna, but they do not seem to support a multi-index:

# First attempt: index the frames by (data_date, cookie_type) and try to fill
# frame1's missing values from next_day_dau, dau_7 and dau_15.
cols = ['data_date', 'cookie_type']

    if (frame1 is not None and not frame1.empty):
        # NOTE(review): the return value of set_index is not assigned and no
        # inplace=True is passed, so frame1 presumably keeps its original
        # index here — confirm against the pandas set_index documentation.
        frame1.set_index(cols)
        print(frame1)
        print(next_day_dau)
        # NOTE(review): the results of these three combine_first calls are
        # discarded; combine_first is documented as returning a new DataFrame,
        # so frame1 is presumably left unchanged by them.
        frame1.combine_first(next_day_dau.set_index(cols))
        frame1.combine_first(dau_7.set_index(cols))
        frame1.combine_first(dau_15.set_index(cols))

Then I updated it to:

# Second attempt: replace each frame's index with a single string key built by
# concatenating data_date (cast to str) and cookie_type.
frame1.index = frame1.data_date.astype(str) + frame1.cookie_type
        next_day_dau.index = next_day_dau.data_date.astype(str) + next_day_dau.cookie_type
        dau_7.index = dau_7.data_date.astype(str) + dau_7.cookie_type
        dau_15.index = dau_15.data_date.astype(str) + dau_15.cookie_type
        # The three .loc assignments below sit inside a bare triple-quoted
        # string, so they are not executed (they are effectively commented out).
        """frame1.loc[next_day_dau.index, "next_dau"] = next_day_dau.next_dau
        frame1.loc[dau_7.index, "dau_7"] = dau_7.dau_7
        frame1.loc[dau_15.index, "dau_15"] = dau_15.dau_15"""
        # NOTE(review): the combine_first results are never assigned back;
        # combine_first is documented as returning a new DataFrame, so frame1
        # is presumably unchanged when it is printed below.
        frame1.combine_first(next_day_dau)
        frame1.combine_first(dau_7)
        frame1.combine_first(dau_15)
        print(frame1)
        print(next_day_dau)

loc raises an error because next_day_dau does not contain all the indexes in frame1. Then I tried combine_first and fillna with inplace=True; none of them work.

[enter image description here]

{'data_date': {'20181007avg(0-d)': 20181007, '20181007avg(e-f)': 20181007, '20181006avg(0-d)': 20181006, '20181006avg(e-f)': 20181006}, 'cookie_type': {'20181007avg(0-d)': 'avg(0-d)', '20181007avg(e-f)': 'avg(e-f)', '20181006avg(0-d)': 'avg(0-d)', '20181006avg(e-f)': 'avg(e-f)'}, 'dau': {'20181007avg(0-d)': 2288, '20181007avg(e-f)': 2284, '20181006avg(0-d)': 2288, '20181006avg(e-f)': 2284}, 'next_dau': {'20181007avg(0-d)': nan, '20181007avg(e-f)': nan, '20181006avg(0-d)': nan, '20181006avg(e-f)': nan}, 'dau_7': {'20181007avg(0-d)': nan, '20181007avg(e-f)': nan, '20181006avg(0-d)': nan, '20181006avg(e-f)': nan}, 'dau_15': {'20181007avg(0-d)': nan, '20181007avg(e-f)': nan, '20181006avg(0-d)': nan, '20181006avg(e-f)': nan}}
{'data_date': {0: '20181007', 1: '20181007'}, 'cookie_type': {0: 'avg(e-f)', 1: 'avg(0-d)'}, 'next_dau': {0: 2284, 1: 2288}}

Solution

  • Finally, I solved this problem with help from "tianhua liao":

    # Key every frame by the string "<data_date><cookie_type>" so that rows
    # can be matched across frames through one shared index.
    for frame in (frame1, next_day_dau, dau_7, dau_15):
        frame.index = frame["data_date"].astype(str) + frame["cookie_type"]
    # Boolean masks over frame1's rows: True where the row's key is also
    # present in the corresponding source frame.
    next_day_dau_idx, dau_7_idx, dau_15_idx = (
        frame1.index.isin(source.index)
        for source in (next_day_dau, dau_7, dau_15)
    )
    # For each metric, write the source column into frame1 on the matching
    # rows only; a metric with no matching row is skipped entirely.
    updates = (
        (next_day_dau_idx, next_day_dau, "next_dau"),
        (dau_7_idx, dau_7, "dau_7"),
        (dau_15_idx, dau_15, "dau_15"),
    )
    for row_mask, source, column in updates:
        if row_mask.any():
            frame1.loc[row_mask, column] = source[column]