Search code examples
pythonpandasdataframegroup-by

pandas add rows to original df based on groupby


I want to group by text, if at least one name exists, then loop through the names, and each name that does not appear - add a line to original df. example -

names_set = {'A','B','C'}

initial df:

columns = ['id','text','name','start','end']
data = [
    [1,"this is text 1", 'A',0,4],
    [2,"this is text 1", 'B',4,5],
    [3,"this is text 1", 'C',4,5],
    [3,"this is text 2", 'A',6,8],
    [4,'this is text 3',None, None, None],
    [5,"this is text 4", 'B',10,13],
    [6,"this is text 4", 'B',1,5]
]
df1 = pd.DataFrame(data= data,columns=columns)
df1

    id  text           name     start   end
0   1   this is text 1  A       0.0     4.0
1   2   this is text 1  B       4.0     5.0
2   3   this is text 1  C       4.0     5.0
3   3   this is text 2  A       6.0     8.0
4   4   this is text 3  None    NaN     NaN
5   5   this is text 4  B       10.0    13.0
6   6   this is text 4  B       1.0     5.0

output:

columns2 = ['id','text','name','start','end']
data2 = [
    [1,"this is text 1", 'A',0,4],
    [2,"this is text 1", 'B',4,5],
    [3,"this is text 1", 'C',4,5],
    [3,"this is text 2", 'A',6,8],
    [None,"this is text 2", 'B',None,None],
    [None,"this is text 2", 'C',None,None],
    [4,'this is text 3',None, None, None],
    [None,"this is text 4", 'A',None,None],
    [5,"this is text 4", 'B',10,13],
    [6,"this is text 4", 'B',1,5],
    [None,"this is text 4", 'C',None,None]
]
df2 = pd.DataFrame(data= data2,columns=columns2)
df2
    id  text            name    start   end
0   1.0 this is text 1  A       0.0     4.0
1   2.0 this is text 1  B       4.0     5.0
2   3.0 this is text 1  C       4.0     5.0
3   3.0 this is text 2  A       6.0     8.0
4   NaN this is text 2  B       NaN     NaN
5   NaN this is text 2  C       NaN     NaN
6   4.0 this is text 3  None    NaN     NaN
7   NaN this is text 4  A       NaN     NaN
8   5.0 this is text 4  B       10.0    13.0
9   6.0 this is text 4  B       1.0     5.0
10  NaN this is text 4  C       NaN     NaN

the code I have up until now -

g = df1.groupby('text')
text_names_group = df1.groupby("text")["name"].agg(list)
text_names_group
for text in text_names_group:
  if len(text) == 1 and text[0] is None:
    continue
  cur_names = set(text)
  missing_names_per_text = names_set - cur_names 

so missing_names_per_text is what the names missing for each text, but I want to add it to the original df, per text

Thanks!

edit : there's an option for two lines with same text and name, but different start and end

example - added line 6 in input


Solution

  • First filter only rows match by names_set and add missing combinations, last append not matching rows by invert mask by ~ in boolean indexing and join together by concat:

    names_set = {'A','B','C'} 
    
    m = df1['name'].isin(names_set)
    df2 = (df1.set_index(['text', 'name'])
                .reindex(pd.MultiIndex.from_product([df1.loc[m, 'text'].unique(), 
                                                     sorted(names_set)], 
                                                    names=['text', 'name']))).reset_index()
    
    df = pd.concat([df1[~m], df2]).sort_values(['text'], ignore_index=True)
    print (df)
        id            text  name  start   end
    0  1.0  this is text 1     A    0.0   4.0
    1  2.0  this is text 1     B    4.0   5.0
    2  3.0  this is text 1     C    4.0   5.0
    3  3.0  this is text 2     A    6.0   8.0
    4  NaN  this is text 2     B    NaN   NaN
    5  NaN  this is text 2     C    NaN   NaN
    6  4.0  this is text 3  None    NaN   NaN
    7  NaN  this is text 4     A    NaN   NaN
    8  5.0  this is text 4     B   10.0  13.0
    9  NaN  this is text 4     C    NaN   NaN
    

    In ral data should be problem sorting by text column, so here is solution with mapping by enumerate dictionary:

    columns = ['id','text','name','start','end']
    data = [
        [1,"this is text 10", 'A',0,4],
        [2,"this is text 10", 'B',4,5],
        [3,"this is text 10", 'C',4,5],
        [3,"this is text 20", 'A',6,8],
        [4,'this is text 13',None, None, None],
        [5,"this is text 14", 'B',10,13]
    ]
    df1 = pd.DataFrame(data= data,columns=columns)
    

    names_set = {'A','B','C'} 
    
    
    m = df1['name'].isin(names_set)
    
    sorting = {v: k for k, v in enumerate(df1['text'].drop_duplicates())}
    print (sorting)
    {'this is text 10': 0, 'this is text 20': 1, 'this is text 13': 2, 'this is text 14': 3}
    
    mux = pd.MultiIndex.from_product([df1.loc[m, 'text'].unique(), 
                                     sorted(names_set)], 
                                     names=['text', 'name'])
    
    df2 = df1.set_index(['text', 'name']).reindex(mux).reset_index()
    
    df = pd.concat([df1[~m], df2]).sort_values(['text'], 
                                               ignore_index=True, 
                                               key=lambda x: x.map(sorting))
    print (df2)
                  text name   id  start   end
    0  this is text 10    A  1.0    0.0   4.0
    1  this is text 10    B  2.0    4.0   5.0
    2  this is text 10    C  3.0    4.0   5.0
    3  this is text 20    A  3.0    6.0   8.0
    4  this is text 20    B  NaN    NaN   NaN
    5  this is text 20    C  NaN    NaN   NaN
    6  this is text 14    A  NaN    NaN   NaN
    7  this is text 14    B  5.0   10.0  13.0
    8  this is text 14    C  NaN    NaN   NaN
    

    Solution with duplicated names per groups is similar:

    names_set = {'A','B','C'} 
    
    m = df1['name'].isin(names_set)
    
    sorting = {v: k for k, v in enumerate(df1['text'].drop_duplicates())}
    #print (sorting)
    

    Create helper df3 DataFrame by MultiIndex.to_frame, add missing text and name rows and last use left join with original DataFrame:

    df3 = pd.MultiIndex.from_product([df1.loc[m, 'text'].unique(), 
                                     sorted(names_set)], 
                                     names=['text', 'name']).to_frame(index=False)
    
    df3 = (pd.concat([df1.loc[~m, ['text', 'name']], df3])
             .sort_values(['text'], ignore_index=True, key=lambda x: x.map(sorting)))
    print (df3)
          text  name
    0  this is text 1     A
    1  this is text 1     B
    2  this is text 1     C
    3  this is text 2     A
    4  this is text 2     B
    5  this is text 2     C
    6  this is text 3  None
    7  this is text 4     A
    8  this is text 4     B
    9  this is text 4     C
        
    

    df2 = df3.merge(df1, how='left')
    print (df2)
                  text  name   id  start   end
    0   this is text 1     A  1.0    0.0   4.0
    1   this is text 1     B  2.0    4.0   5.0
    2   this is text 1     C  3.0    4.0   5.0
    3   this is text 2     A  3.0    6.0   8.0
    4   this is text 2     B  NaN    NaN   NaN
    5   this is text 2     C  NaN    NaN   NaN
    6   this is text 3  None  4.0    NaN   NaN
    7   this is text 4     A  NaN    NaN   NaN
    8   this is text 4     B  5.0   10.0  13.0
    9   this is text 4     B  6.0    1.0   5.0
    10  this is text 4     C  NaN    NaN   NaN