Search code examples
Tags: python, python-3.x, nltk, case-insensitive

Match word irrespective of the case


Dataset:

> df
Id       Clean_Data
1918916  Luxury Apartments consisting 11 towers Well equipped gymnasium Swimming Pool Toddler Pool Health Club Steam Room Sauna Jacuzzi Pool Table Chess Billiards room Carom Table Tennis indoor games 
1495638  near medavakkam junction calm area near global hospital
1050651  No Pre Emi No Booking Amount No Floor Rise Charges No Processing Fee HLPROJECT HIGHLIGHTS 

Below is the code, which successfully returns the words and n-grams that match the list of values defined in Categories.py:

# One list of whitespace-separated tokens per row, used for the
# single-word matches below.
df['one_word_tokenized_text'] = df["Clean_Data"].str.split()

# Word n-grams (n = 2, 3, 4) of each row, stored as lists of tuples.
for column, size in (('bigram', 2), ('trigram', 3), ('four_words', 4)):
    df[column] = df['Clean_Data'].apply(
        lambda text, size=size: list(ngrams(word_tokenize(text), size)))

token = pd.Series(df["one_word_tokenized_text"])
Lid = pd.Series(df["Id"])

# Alternation with one capture group per category phrase. The phrases are
# inserted verbatim, so the match is case-sensitive.
pattern = "|".join("({})".format(cat) for cat in Categories.HealthCare)
matches = token.apply(lambda words: pd.Series(words).str.extractall(pattern))

# Keep only the captured strings; groups that did not take part in a match
# are not str instances and are filtered out.
match_list = [[hit for hit in match.values.ravel() if isinstance(hit, str)]
              for match in matches]
match_df = pd.DataFrame({"ID": Lid, "jc1": match_list})


def match_word(feature, row):
    """Return the row's 2-, 3- and 4-word n-grams that occur in *feature*.

    Each n-gram is joined with single spaces and compared to the entries
    of *feature* exactly as written, so the comparison is case-sensitive.

    Args:
        feature: Container of phrases to look for (tested with ``in``).
        row: Object exposing ``bigram``, ``trigram`` and ``four_words``
            attributes, each an iterable of word tuples.

    Returns:
        List of matching phrases: bigrams first, then trigrams, then
        four-word n-grams, each group in its original order.
    """
    ngram_groups = (row.bigram, row.trigram, row.four_words)
    return [
        phrase
        for group in ngram_groups
        for phrase in map(' '.join, group)
        if phrase in feature
    ]

# Multi-word (2- to 4-word) matches for every row of df.
match_df['Health1'] = df.apply(lambda row: match_word(HealthCare, row), axis=1)

# Comma-join the string form of columns 1 and 2 (the single-word and the
# multi-word matches) into one summary column.
match_df['HealthCare'] = match_df.iloc[:, [1, 2]].apply(
    lambda cols: ','.join(cols.dropna().astype(str)), axis=1)

Categories.py

 # (phrase, relation, category name) triples.
 category = [('steam room','IN','HealthCare'),
         ('sauna','IN','HealthCare'),
         ('Jacuzzi','IN','HealthCare'),
         ('Aerobics','IN','HealthCare'),
         ('yoga room','IN','HealthCare'),]
 # Phrases whose category name is 'HealthCare', in declaration order.
 # Kept at the same indentation as `category`; the original mixed indent
 # made this line an IndentationError.
 HealthCare = [e1 for (e1, rel, e2) in category if e2=='HealthCare']

Output:

ID  HealthCare
1918916 Jacuzzi
1495638 
1050651 Aerobics, Jacuzzi, yoga room

If I write the features in the category list with exactly the same letter case as they appear in the dataset, the code identifies them and returns the values; otherwise it does not. I want the matching to be case-insensitive, so that "Steam Room" and "Sauna" are also picked up under the HealthCare category. I tried using ".lower()", but I am not sure how to apply it.


Solution

  • Edit 2: only Categories.py is updated.

    Categories.py

    # Base (phrase, relation, category name) triples. Case variants of every
    # phrase are derived below so that matching does not depend on how the
    # phrase is capitalised in the text.
    category = [('steam room','IN','HealthCare'),
            ('sauna','IN','HealthCare'),
            ('jacuzzi','IN','HealthCare'),
            ('aerobics','IN','HealthCare'),
            ('Yoga room','IN','HealthCare'),
            ('booking','IN','HealthCare'),
            ]

    # capitalize() upper-cases only the first character ("Steam room"), so
    # title() is needed as well to cover text written as "Steam Room".
    category1 = [entry[0].capitalize() for entry in category]
    category2 = [entry[0].lower() for entry in category]
    category3 = [entry[0].title() for entry in category]

    # Re-attach the relation and category name to each variant.
    test = [(phrase, entry[1], entry[2]) for phrase, entry in zip(category1, category)]
    test2 = [(phrase, entry[1], entry[2]) for phrase, entry in zip(category2, category)]
    test3 = [(phrase, entry[1], entry[2]) for phrase, entry in zip(category3, category)]

    # Merge base entries and variants, dropping exact duplicates while keeping
    # first-seen order (e.g. 'steam room' is already lower-case).
    merged = []
    seen = set()
    for entry in category + test + test2 + test3:
        if entry not in seen:
            seen.add(entry)
            merged.append(entry)
    category = merged


    HealthCare = [e1 for (e1, rel, e2) in category if e2=='HealthCare']
    

    Your unaltered dataset

    import pandas as pd
    from nltk import ngrams, word_tokenize
    import Categories
    from Categories import *
    from functools import partial
    
    
    # Sample rows copied from the question's dataset.
    data = {'Clean_Data':['Luxury Apartments consisting 11 towers Well equipped gymnasium Swimming Pool Toddler Pool Health Club Steam Room Sauna Jacuzzi Pool Table Chess Billiards room Carom Table Tennis indoor games',
                         'near medavakkam junction calm area near global hospital',
                         'No Pre Emi No Booking Amount No Floor Rise Charges No Processing Fee HLPROJECT HIGHLIGHTS '],
    'Id' : [1918916, 1495638,1050651]}
    
    df = pd.DataFrame(data)
    
    
    # One list of whitespace-separated tokens per row, used for the
    # single-word matches below.
    df['one_word_tokenized_text'] =df["Clean_Data"].str.split()
    # Word n-grams (n = 2, 3, 4) of each row, stored as lists of tuples.
    df['bigram'] = df['Clean_Data'].apply(lambda row: list(ngrams(word_tokenize(row), 2)))
    # The original had a stray ")" after df['Clean_Data'] here (SyntaxError).
    df['trigram'] = df['Clean_Data'].apply(lambda row: list(ngrams(word_tokenize(row), 3)))
    df['four_words'] = df['Clean_Data'].apply(lambda row: list(ngrams(word_tokenize(row), 4)))
    token=pd.Series(df["one_word_tokenized_text"])
    Lid=pd.Series(df["Id"])
    # Regex alternation with one capture group per HealthCare phrase, applied
    # to each row's tokens.
    matches= token.apply(lambda x: pd.Series(x).str.extractall("|".join(["({})".format(cat) for cat in Categories.HealthCare])))
    # Keep only the captured strings; groups that did not take part in a
    # match are not str instances and are filtered out.
    match_list= [[m for m in match.values.ravel() if isinstance(m, str)] for match in matches]
    match_df = pd.DataFrame({"ID":Lid,"jc1":match_list})
    
    
    def match_word(feature, row):
        """Return the row's 2-, 3- and 4-word n-grams found in *feature*, ignoring case.

        Each n-gram is joined with single spaces and compared to the entries
        of *feature* after lower-casing both sides, so "Steam Room" in the
        text matches a 'steam room' entry without every casing having to be
        listed.

        Args:
            feature: Iterable of phrase strings to look for.
            row: Object exposing ``bigram``, ``trigram`` and ``four_words``
                attributes, each an iterable of word tuples.

        Returns:
            List of matching n-grams, spelled as they appear in the row:
            bigrams first, then trigrams, then four-word n-grams, each group
            in its original order.
        """
        # Lower-case the phrases once per call; a set makes each lookup O(1).
        wanted = {phrase.lower() for phrase in feature}

        categories = []
        for group in (row.bigram, row.trigram, row.four_words):
            for words in group:
                joined = ' '.join(words)
                if joined.lower() in wanted:
                    categories.append(joined)
        return categories
    
    # Multi-word (2- to 4-word) matches for every row of df.
    match_df['Health1'] = df.apply(partial(match_word, HealthCare), axis=1)
    # Comma-join the string form of columns 1 and 2 (the single-word and the
    # multi-word matches) into one summary column. The original line ended
    # with a pasted fragment ("enize(row), 4)))") that was a SyntaxError.
    match_df['HealthCare'] = match_df[match_df.columns[[1,2]]].apply(lambda x: ','.join(x.dropna().astype(str)),axis=1)
    

    Output

    # print() call form: valid in Python 3, where the print statement is a SyntaxError.
    print(match_df)
    
    +--------+----------------+-------------+------------------------------------+
    |ID      |jc1             |Health1      |HealthCare                          |
    +--------+----------------+-------------+------------------------------------+
    |1918916 |[sauna, jacuzzi]|             |['sauna', 'jacuzzi'],['steam room'] |
    +--------+----------------+-------------+------------------------------------+
    |1495638 |                |             |                                    |
    +--------+----------------+-------------+------------------------------------+
    |1050651 |    [Booking]   |             |  ['Booking'],[]                    |
    +--------+----------------+-------------+------------------------------------+