Search code examples
python, list, frequency, custom-lists, vocabulary

Check text/string for occurrence of predefined list elements


I have several text files, which I want to compare against a vocabulary list consisting of expressions and single words. The desired output should be a dictionary containing all elements of that list as keys and their respective frequency in the textfile as value. To construct the vocabulary list I need to match two lists together,

list1 = ['accounting',..., 'yields', 'zero-bond']
list2 = ['accounting', 'actual cost', ..., 'zero-bond']
vocabulary_list = ['accounting', 'actual cost', ..., 'yields', 'zero-bond']

sample_text = "Accounting experts predict an increase in yields for zero-bond and yields for junk-bonds."

desired_output = {'accounting': 1, 'actual cost': 0, ..., 'yields': 2, 'zero-bond': 1}

what I tried:

def word_frequency(fileobj, words):
    """Build a Counter of specified words found in fileobj.

    Args:
        fileobj: an iterable of text lines (e.g. an open file object).
        words: the vocabulary of words to count.

    Returns:
        Counter mapping every word in *words* to its frequency in
        fileobj; vocabulary words that never occur keep a count of 0.
    """
    # Initialise every vocabulary word to 0 so absent words still
    # appear in the result (the original discarded this Counter).
    ct = Counter({w: 0 for w in words})
    # line.split() yields words; iterating a string directly would
    # yield single characters, which was the original bug.
    file_words = (word for line in fileobj for word in line.split())
    # Membership tests against a set are O(1) instead of O(len(words)).
    vocab = set(words)
    ct.update(word for word in file_words if word in vocab)
    return ct

 def print_summary(filepath, ct): 
    words = sorted(ct.keys()) 
    counts = [str(ct[k]) for k in words] with open(filepath[:-4] + '_dict' + '.txt', mode = 'w') as outfile: 
    outfile.write('{0}\n{1}\n{2}\n\n'.format(filepath,', '.join(words),', '.join(counts))) 
    return outfile 

Is there any way to do this in Python? I figured out how to manage this with a vocabulary list of single words (one token), but I couldn't figure out a solution for the multiple-word case.


Solution

  • If you want to consider words ending with punctuation you will need to clean the text as well, i.e. 'yields' vs. 'yields!'

    from collections import Counter
    c = Counter()
    import re
    
    vocabulary_list = ['accounting', 'actual cost','yields', 'zero-bond']
    d = {k: 0 for k in vocabulary_list}
    sample_text = "Accounting experts predict actual costs an increase in yields for zero-bond and yields for junk-bonds.".lower()
    splitted = set(sample_text.split())
    c.update(splitted) # get count of all words 
    
    for k in d:
        spl = k.split()
        ln = len(spl)
        # if we have multiple words we cannot split
        if ln > 1:
            check = re.findall(r'\b{0}\b'.format(k),sample_text)
            if check:
                d[k] += len(check)
        # else we are looking for a single word
        elif k in splitted:
            d[k] += c[k]
    print(d)
    

    To chain all the lists into a single vocab dict:

    from collections import Counter
    from itertools import chain
    import re
    
    c = Counter()
    
    l1,l2 = ['accounting', 'actual cost'], ['yields', 'zero-bond']
    vocabulary_dict  = {k:0 for k in chain(l1,l2)}
    print(vocabulary_dict)
    sample_text = "Accounting experts predict actual costs an increase in yields for zero-bond and yields for junk-bonds.".lower()
    splitted = sample_text.split()
    c.update(splitted)
    
    for k in vocabulary_dict:
        spl = k.split()
        ln = len(spl)
        if ln > 1:
            check = re.findall(r'\b{0}\b'.format(k),sample_text)
            if check:
                vocabulary_dict[k] += len(check)
        elif k in sample_text.split():
            vocabulary_dict[k] += c[k]
    print(vocabulary_dict)
    

    You could create two dicts one for phrases and the other for words and do a pass over each.