Search code examples
pythonpython-3.xpython-ggplot

Optimizing code to graph word counts


I just finished a program that reads text from books and graphs their word count with the x-axis being the count of one book and the y-axis being the count of the second book. It works, but it's surprisingly slow and I'm hoping to get some tips on how to optimize it. I think my biggest concern is creating a dictionary for similar words between the books and a dictionary for words that are in one book but not the other. This implementation added a lot of runtime to the program and I'd like to find a pythonic way to improve this. Below is the code:

import re   # regular expressions
import io
import collections
from matplotlib import pyplot as plt

# xs=[x1,x2,...,xn]
# Number of occurences of the word in book 1

# use

# ys=[y1.y2,...,yn]
# Number of occurences of the word in book 2

# plt.plot(xs,ys)
# save as svg or pdf files

word_pattern = re.compile(r'\w+')
# with version ensures closing even if there are failures
with io.open("swannsway.txt") as f:
    text = f.read() # read as a single large string
    book1 = word_pattern.findall(text)  # pull out words
    book1 = [w.lower() for w in book1 if len(w)>=3]

with io.open("moby_dick.txt") as f:
    text = f.read() # read as a single large string
    book2 = word_pattern.findall(text)  # pull out words
    book2 = [w.lower() for w in book2 if len(w)>=3]


#Convert these into relative percentages/total book length

wordcount_book1 = {}
for word in book1:
    if word in wordcount_book1:
        wordcount_book1[word]+=1
    else:
        wordcount_book1[word]=1

'''
for word in wordcount_book1:
    wordcount_book1[word] /= len(wordcount_book1)

for word in wordcount_book2:
    wordcount_book2[word] /= len(wordcount_book2)
'''

wordcount_book2 = {}
for word in book2:
    if word in wordcount_book2:
        wordcount_book2[word]+=1
    else:
        wordcount_book2[word]=1


common_words = {}

for i in wordcount_book1:
    for j in wordcount_book2:
        if i == j:
            common_words[i] = [wordcount_book1[i], wordcount_book2[j]]
            break

book_singles= {}
for i in wordcount_book1:
    if i not in common_words:
        book_singles[i] = [wordcount_book1[i], 0]
for i in wordcount_book2:
    if i not in common_words:
        book_singles[i] = [0, wordcount_book2[i]]

wordcount_book1 = collections.Counter(book1)
wordcount_book2 = collections.Counter(book2)

# how many words of different lengths?

word_length_book1 = collections.Counter([len(word) for word in book1])
word_length_book2 = collections.Counter([len(word) for word in book2])

print(wordcount_book1)

#plt.plot(list(word_length_book1.keys()),list(word_length_book1.values()), list(word_length_book2.keys()), list(word_length_book2.values()), 'bo')
for i in range(len(common_words)):
    plt.plot(list(common_words.values())[i][0], list(common_words.values())[i][1], 'bo', alpha = 0.2)
for i in range(len(book_singles)):
    plt.plot(list(book_singles.values())[i][0], list(book_singles.values())[i][1], 'ro', alpha = 0.2)
plt.ylabel('Swannsway')
plt.xlabel('Moby Dick')
plt.show()
#key:value

Solution

  • The bulk of your code only had minor inefficiencies which I've tried to address. Your largest delay was in plotting book_singles which I believe I've fixed. The details: I switched this:

    word_pattern = re.compile(r'\w+')
    

    to:

    word_pattern = re.compile(r'[a-zA-Z]{3,}')
    

    as book_singles is large enough without including numbers too! By including a minimum size in the pattern, we eliminate the need for this loop:

    book1 = [w.lower() for w in book1 if len(w)>=3]
    

    And the matching one for book2. Here:

    book1 = word_pattern.findall(text)  # pull out words
    book1 = [w.lower() for w in book1 if len(w)>=3]
    

    I moved the .lower() so we only do it once, rather than on every word:

    book1 = word_pattern.findall(text.lower())  # pull out words
    book1 = [w for w in book1 if len(w) >= 3]
    

    Since it's likely implemented in C, this can be a win. This:

    wordcount_book1 = {}
    for word in book1:
        if word in wordcount_book1:
            wordcount_book1[word]+=1
        else:
            wordcount_book1[word]=1
    

    I switched to use a defaultdict since you have collections imported already:

    wordcount_book1 = collections.defaultdict(int)
    for word in book1:
        wordcount_book1[word] += 1
    

    For these loops:

    common_words = {}
    
    for i in wordcount_book1:
        for j in wordcount_book2:
            if i == j:
                common_words[i] = [wordcount_book1[i], wordcount_book2[j]]
                break
    
    book_singles= {}
    for i in wordcount_book1:
        if i not in common_words:
            book_singles[i] = [wordcount_book1[i], 0]
    for i in wordcount_book2:
        if i not in common_words:
            book_singles[i] = [0, wordcount_book2[i]]
    

    I rewrote the first loop which was a disaster and then made it do double duty since it had done the work of the second loop already:

    common_words = {}
    book_singles = {}
    
    for i in wordcount_book1:
        if i in wordcount_book2:
            common_words[i] = [wordcount_book1[i], wordcount_book2[i]]
        else:
            book_singles[i] = [wordcount_book1[i], 0]
    
    for i in wordcount_book2:
        if i not in common_words:
            book_singles[i] = [0, wordcount_book2[i]]
    

    Finally, these plotting loops are horribly inefficient both in the way they walk common_words.values() and book_singles.values() over and over again and in the way that they plot one point at a time:

    for i in range(len(common_words)):
        plt.plot(list(common_words.values())[i][0], list(common_words.values())[i][1], 'bo', alpha = 0.2)
    for i in range(len(book_singles)):
        plt.plot(list(book_singles.values())[i][0], list(book_singles.values())[i][1], 'ro', alpha = 0.2)
    

    I changed them to simply:

    counts1, counts2 = zip(*common_words.values())
    plt.plot(counts1, counts2, 'bo', alpha=0.2)
    
    counts1, counts2 = zip(*book_singles.values())
    plt.plot(counts1, counts2, 'ro', alpha=0.2)
    

    The complete reworked code that leaves out things you calculated but never used in the example:

    import re  # regular expressions
    import collections
    from matplotlib import pyplot as plt
    
    # xs=[x1,x2,...,xn]
    # Number of occurrences of the word in book 1
    
    # use
    
    # ys=[y1.y2,...,yn]
    # Number of occurrences of the word in book 2
    
    # plt.plot(xs,ys)
    # save as svg or pdf files
    
    word_pattern = re.compile(r'[a-zA-Z]{3,}')
    
    # with ensures closing of file even if there are failures
    with open("swannsway.txt") as f:
        text = f.read() # read as a single large string
        book1 = word_pattern.findall(text.lower())  # pull out words
    
    with open("moby_dick.txt") as f:
        text = f.read() # read as a single large string
        book2 = word_pattern.findall(text.lower())  # pull out words
    
    # Convert these into relative percentages/total book length
    
    wordcount_book1 = collections.defaultdict(int)
    for word in book1:
        wordcount_book1[word] += 1
    
    wordcount_book2 = collections.defaultdict(int)
    for word in book2:
        wordcount_book2[word] += 1
    
    common_words = {}
    book_singles = {}
    
    for i in wordcount_book1:
        if i in wordcount_book2:
            common_words[i] = [wordcount_book1[i], wordcount_book2[i]]
        else:
            book_singles[i] = [wordcount_book1[i], 0]
    
    for i in wordcount_book2:
        if i not in common_words:
            book_singles[i] = [0, wordcount_book2[i]]
    
    counts1, counts2 = zip(*common_words.values())
    plt.plot(counts1, counts2, 'bo', alpha=0.2)
    
    counts1, counts2 = zip(*book_singles.values())
    plt.plot(counts1, counts2, 'ro', alpha=0.2)
    
    plt.xlabel('Moby Dick')
    plt.ylabel('Swannsway')
    plt.show()
    

    OUTPUT

    enter image description here

    You might eliminate stop words to reduce high scoring words and bring out the interesting data.