Search code examples
python, python-3.x, dictionary, frequency, word-frequency

How to compare word frequencies from two text files?


How can I compare word frequencies from two text files in Python? For example, if a word occurs in both file1 and file2, it should be listed only once, and its frequencies should not be added together; the entry should look like {'The': [3, 5]}, where 3 is the frequency in file1 and 5 is the frequency in file2. If a word exists in only one of the files, the count for the other file should be 0. Please help. Here is what I have done so far:

import operator
# Count how often each whitespace-separated word occurs in file1.txt and
# in file2.txt, then print one frequency dictionary per file.

# Read both files inside a with-statement so they are closed afterwards,
# even if reading raises part-way through.
with open('file1.txt','r') as f1, open('file2.txt','r') as f2:
    wordlist = [word for line in f1 for word in line.split()]    # words of file 1
    wordlist2 = [word for line in f2 for word in line.split()]   # words of file 2

# word -> number of occurrences in file 1
worddictionary = {}
for word in wordlist:
    # .get(word, 0) starts unseen words at 0, so no if/else is needed
    worddictionary[word] = worddictionary.get(word, 0) + 1

# word -> number of occurrences in file 2
worddictionary2 = {}
for word in wordlist2:
    worddictionary2[word] = worddictionary2.get(word, 0) + 1

print(worddictionary)
print(worddictionary2)

Solution

  • Edit: Here's the more general way you would do this for any list of files (explanation in comments):

    # Open every input file in a single with-statement so each one is closed
    # on exit, even if reading fails part-way through.
    with open('file1.txt','r') as f1, open('file2.txt','r') as f2:
        file_list = [f1, f2] # This would hold all your open files
        num_files = len(file_list)

        frequencies = {} # We'll just make one dictionary to hold the frequencies

        for i, f in enumerate(file_list): # Loop over the files, keeping an index i
            for line in f: # Get the lines of that file
                for word in line.split(): # Get the words of that line
                    if word not in frequencies:
                        # make a list of 0's for any word you haven't seen yet -- one 0 for each file
                        frequencies[word] = [0] * num_files

                    frequencies[word][i] += 1 # Increment the frequency count for that word and file

    # print is a function in Python 3; the statement form is a SyntaxError there
    print(frequencies)
    

    Keeping with the code you wrote, here's how you could create a combined dictionary:

    import operator
    # Count word frequencies in file1.txt and file2.txt separately, then merge
    # them into one dictionary mapping word -> [count in file 1, count in file 2].

    # Read both files inside a with-statement so they are closed afterwards,
    # even if reading raises part-way through.
    with open('file1.txt','r') as f1, open('file2.txt','r') as f2:
        wordlist = [word for line in f1 for word in line.split()]    # words of file 1
        wordlist2 = [word for line in f2 for word in line.split()]   # words of file 2

    # word -> number of occurrences in file 1
    worddictionary = {}
    for word in wordlist:
        # .get(word, 0) starts unseen words at 0, so no if/else is needed
        worddictionary[word] = worddictionary.get(word, 0) + 1

    # word -> number of occurrences in file 2
    worddictionary2 = {}
    for word in wordlist2:
        worddictionary2[word] = worddictionary2.get(word, 0) + 1

    # Create a combined dictionary
    # Every word seen in either file gets an entry; a word missing from one
    # file gets 0 in that file's slot.
    all_word_set = set(worddictionary) | set(worddictionary2)
    combined_dictionary = {
        word: [worddictionary.get(word, 0), worddictionary2.get(word, 0)]
        for word in all_word_set
    }


    print(worddictionary)
    print(worddictionary2)
    print(combined_dictionary)