Search code examples
pythonpandasdataframedata-analysiskeyword-search

Keyword search between two DataFrames using python pandas


Hi I have two DataFrames like below

 DF1

 Alpha   |  Numeric  |  Special

 and, or |  1,2,3,4,5|  @,$,&

and

DF2 with single column

Content      |

boy or girl  |
school @ morn|

I want to search if anyone of the column in DF1 has anyone of the keyword in content column of DF2 and the output should be in a new DF

 output_DF

 output_column|
 Alpha        |
 Special      |

someone help me with this


Solution

  • Solution is s bit complicated, because for multiple match (row 2) need only matched first column df1:

    df1 = pd.DataFrame({'Alpha':['and','or', None, None,None],
                        'Numeric':['1','2','3','4','5'],
                        'Special':['@','$','&', None, None]})
    print (df1)
      Alpha Numeric Special
    0   and       1       @
    1    or       2       $
    2  None       3       &
    3  None       4    None
    4  None       5    None
    
    
    df2 = pd.DataFrame({'Content':['boy or girl','school @ morn', 
                                   '1 school @ morn', 'Pechi']})
    print (df2)
               Content
    0      boy or girl
    1    school @ morn
    2  1 school @ morn
    3            Pechi
    

    #reshape df1
    df1.columns = [np.arange(len(df1.columns)), df1.columns]
    df11 = df1.unstack()
              .reset_index(level=2,drop=True)
              .rename_axis(('col_order','col_name'))
              .dropna()
              .reset_index(name='val')
    print (df11)
       col_order col_name  val
    0          0    Alpha  and
    1          0    Alpha   or
    2          1  Numeric    1
    3          1  Numeric    2
    4          1  Numeric    3
    5          1  Numeric    4
    6          1  Numeric    5
    7          2  Special    @
    8          2  Special    $
    9          2  Special    &
    

    #split column by whitespaces, reshape
    df22 = df2['Content'].str.split(expand=True)
                         .stack()
                         .rename('val')
                         .reset_index(level=1,drop=True)
                         .rename_axis('idx').reset_index()
    print (df22)
        idx     val
    0     0     boy
    1     0      or
    2     0    girl
    3     1  school
    4     1       @
    5     1    morn
    6     2       1
    7     2  school
    8     2       @
    9     2    morn
    10    3   Pechi
    

    #left join dataframes, remove non match values by dropna
    #also for multiple match get always first - use sorting with drop_duplicates
    df = pd.merge(df22, df11, on='val', how='left')
           .dropna(subset=['col_name'])
           .sort_values(['idx','col_order'])
           .drop_duplicates(['idx'])
    
    #if necessary get values from df2
    #if no value matched add Other category
    df = pd.concat([df2, df.set_index('idx')], axis=1)
           .fillna({'col_name':'Other'})[['val','col_name','Content']]
    print (df)
       val col_name          Content
    0   or    Alpha      boy or girl
    1    @  Special    school @ morn
    2    1  Numeric  1 school @ morn
    3  NaN    Other            Pechi
    

    EDIT:

    :

    df1 = pd.DataFrame({'Alpha':['and','or', None, None,None],
                        'Numeric':['1','2','3','4','5'],
                        'Special':['@','$','&', None, None]})
    
    
    df2 = pd.DataFrame({'Content':['boy OR girl','school @ morn', 
                                   '1 school @ morn', 'Pechi']})
    
    #If df1 Alpha values are not lower
    #df1['Alpha'] = df1['Alpha'].str.lower()
    df1.columns = [np.arange(len(df1.columns)), df1.columns]
    
    df11 = (df1.unstack()
              .reset_index(level=2,drop=True)
              .rename_axis(('col_order','col_name'))
              .dropna()
              .reset_index(name='val_low'))
    
    df22 = (df2['Content'].str.split(expand=True)
                         .stack()
                         .rename('val')
                         .reset_index(level=1,drop=True)
                         .rename_axis('idx')
                         .reset_index())
    

    #convert columns values to lower to new column
    df22['val_low'] = df22['val'].str.lower()                    
    
    df = (pd.merge(df22, df11, on='val_low', how='left')
           .dropna(subset=['col_name'])
           .sort_values(['idx','col_order'])
           .drop_duplicates(['idx']))
    
    
    df = (pd.concat([df2, df.set_index('idx')], axis=1)
           .fillna({'col_name':'Other'})[['val','col_name','Content']])
    print (df)
       val col_name          Content
    0   OR    Alpha      boy OR girl
    1    @  Special    school @ morn
    2    1  Numeric  1 school @ morn
    3  NaN    Other            Pechi