Search code examples
pythonpython-3.xnlpkey

Bag of Words gives KeyError


I am getting a KeyError when computing a bag of words on a large text file. This worked a few years ago under Python 2.7 (edited in Emacs); I have now dug it back up and ported it to Python 3.7 using PyCharm. I want to get this old NLP example running again so that I can eventually compare it with newer techniques.

KeyError: 'learning'
line 12, in get_bag_of_words
    bag_of_words[word] += course_bag_of_words[word]

How do I track this down? I am truly not sure what to do, and I have been reading posts for most of the day.

If the problem is the key or the entry, should I do something like delete or pop it (as in "How to remove a key from a Python dictionary?")?

Or is something wrong with the bag-of-words function?

def get_bag_of_words(titles_lines):
    """Sum the per-course word counts over every line after the first.

    NOTE(review): as written this raises KeyError on the first word it
    processes; see the comment on the membership test below.
    """
    bag_of_words = {}
    # titles_lines[0] is skipped -- presumably a header row; confirm.
    for line in titles_lines[1:]:
        courseid, course_bag_of_words = get_course_bag_of_words(line)
        for word in course_bag_of_words:
                # NOTE(review): `word` is drawn from course_bag_of_words, so
                # this test is never true and the `else` branch always runs;
                # `+=` on a key that bag_of_words does not hold yet raises
                # KeyError.
                if word not in course_bag_of_words:
                    bag_of_words[word] = course_bag_of_words[word]
                else:
                    bag_of_words[word] += course_bag_of_words[word]
    return bag_of_words

The full code is on my GitHub (simple recommender and search). Full run.py:

from myfuncs import *
# import myfuncs

# Interactive driver: build the keyword index from the titles file, answer
# search queries, then build unit vectors and answer recommendation queries.

# get keywords, inverted index and titles
# `with` closes the file even if reading raises.
with open('s2-titles.txt', encoding="utf8") as f:
    titles_lines = f.readlines()

bag_of_words = get_bag_of_words(titles_lines)
keywords = get_keywords(titles_lines, bag_of_words)
inverted_index = get_inverted_index(keywords)
titles = get_titles(titles_lines)

# run search query (an empty query ends the loop)
query = input('Input your search query: ')
while query != '':
    query_terms = query.split()
    sorted_results = get_search_results(query_terms,
                                        keywords,
                                        inverted_index)
    print('==> search results for query:', query)
    for result in sorted_results:
        print(result, titles[result])
    query = input('Input your search query [hit return to finish]: ')

# get unit vectors
with open('s2-categories.tsv', encoding="utf8") as f:
    categories_lines = f.readlines()
# The vectors are built by get_unit_vectors(keywords, categories_lines);
# get_dot_product takes (courseid1, courseid2, unit_vectors) and is only
# used below to score a pair of courses.
unit_vectors = get_unit_vectors(keywords, categories_lines)

# run recommendation algorithm (an empty courseid ends the loop)
seed_courseid = input('Input your seed courseid: ')
while seed_courseid != '':
    sorted_results = get_recommendation_results(seed_courseid,
                                                keywords,
                                                inverted_index,
                                                unit_vectors)
    print('==> recommendation results:')
    for result in sorted_results:
        print(result, titles[result])
        print(get_dot_product(seed_courseid, result, unit_vectors))
    seed_courseid = input('Input seed courseid [hit return to finish]:')

full myfuncs.py

#!/usr/bin/env python
# coding: utf-8


def get_bag_of_words(titles_lines):
    """Return corpus-wide word counts summed over all course lines.

    Every element of ``titles_lines`` after the first is passed to
    ``get_course_bag_of_words`` and the resulting per-course counts are
    added together. The first element is skipped (presumably a header
    row -- confirm against the input file).

    Args:
        titles_lines: sequence of raw lines from the titles file.

    Returns:
        dict mapping each word to its total count across all courses.
    """
    bag_of_words = {}
    for line in titles_lines[1:]:
        _, course_bag_of_words = get_course_bag_of_words(line)
        for word, count in course_bag_of_words.items():
            # Look the word up in the running totals, not in the per-course
            # counts (where it is always present): a word seen for the
            # first time must start from 0 instead of raising KeyError.
            bag_of_words[word] = bag_of_words.get(word, 0) + count
    return bag_of_words


def get_course_bag_of_words(line):
    """Count the lower-cased words of one course's title and description.

    ``line`` must consist of exactly three fields separated by the
    literal marker ``XXXYYYZZZ``: course id, title, description.

    Returns:
        ``(courseid, counts)`` where ``counts`` maps word -> occurrences.
        ``counts`` is empty when the title and description together hold
        fewer than 10 whitespace-separated words.

    Raises:
        ValueError: if the line does not split into exactly three fields.
    """
    # An unusual marker is used as the separator so ordinary text cannot
    # split a field by accident.
    courseid, title, description = line.split('XXXYYYZZZ')
    tokens = title.lower().split() + description.lower().split()

    counts = {}
    if len(tokens) < 10:
        return courseid, counts

    for token in tokens:
        counts[token] = counts.get(token, 0) + 1
    return courseid, counts


def get_sorted_results(d):
    """Return up to ten keys of ``d``, ordered by value from high to low.

    Entries are compared as ``(value, key)`` pairs, so keys with equal
    values are ordered by the key itself, also descending.
    """
    ranked = sorted((value, key) for key, value in d.items())
    ranked.reverse()
    return [key for _, key in ranked[:10]]


def get_keywords(titles_lines, bag_of_words):
    """Return the top-ranked keywords of each course.

    A word's importance within a course is its relative frequency in that
    course divided by its relative frequency in the whole corpus. The
    ranking itself is delegated to ``get_sorted_results``.

    Args:
        titles_lines: raw lines of the titles file; the first element is
            skipped.
        bag_of_words: corpus-wide word counts; must contain every word that
            ``get_course_bag_of_words`` yields for these lines.

    Returns:
        dict mapping courseid -> list of keywords as returned by
        ``get_sorted_results``.

    Raises:
        KeyError: if a course word is missing from ``bag_of_words``.
    """
    n = sum(bag_of_words.values())
    keywords = {}
    for line in titles_lines[1:]:
        courseid, course_bag_of_words = get_course_bag_of_words(line)
        # Constant for the whole course: computed once rather than once
        # per word.
        course_total = sum(course_bag_of_words.values())
        term_importance = {}
        for word, count in course_bag_of_words.items():
            tf_course = float(count) / course_total
            tf_overall = float(bag_of_words[word]) / n
            term_importance[word] = tf_course / tf_overall
        keywords[courseid] = get_sorted_results(term_importance)
    return keywords


def get_inverted_index(keywords):
    """Invert a courseid -> keywords mapping into keyword -> courseids.

    Args:
        keywords: dict mapping courseid -> iterable of keywords.

    Returns:
        dict mapping each keyword to the list of courseids that carry it,
        in the iteration order of ``keywords``. Empty input yields ``{}``.
    """
    inverted_index = {}
    for courseid, course_keywords in keywords.items():
        for keyword in course_keywords:
            inverted_index.setdefault(keyword, []).append(courseid)
    # Return only after every course has been processed; returning from
    # inside the loop would index the first course alone.
    return inverted_index


def get_search_results(query_terms, keywords, inverted_index):
    """Rank courses against a list of query terms.

    For every query term found in ``inverted_index``, each course listed
    under it gains ``1 / (keyword rank) * 1 / (query position)``, where
    both rank and position are 1-based. Terms that are not indexed
    contribute nothing.

    Args:
        query_terms: list of query words.
        keywords: dict mapping courseid -> ranked list of keywords.
        inverted_index: dict mapping keyword -> list of courseids.

    Returns:
        Courseids ordered by ``get_sorted_results`` applied to the
        accumulated scores.
    """
    search_results = {}
    for term in query_terms:
        # Position of the term within the query; `query_terms` is a list,
        # so it has to be searched with .index(), not called.
        query_weight = 1 / float(query_terms.index(term) + 1)
        for courseid in inverted_index.get(term, ()):
            keyword_weight = 1 / float(keywords[courseid].index(term) + 1)
            search_results[courseid] = (
                search_results.get(courseid, 0.0) +
                keyword_weight * query_weight
            )
    return get_sorted_results(search_results)


def get_titles(titles_lines):
    """Map each courseid to the first 60 characters of its title.

    The first element of ``titles_lines`` is skipped; every other line must
    split into exactly three ``XXXYYYZZZ``-separated fields, otherwise
    ValueError is raised.
    """
    records = (row.split('XXXYYYZZZ') for row in titles_lines[1:])
    return {courseid: title[:60] for courseid, title, _ in records}


def get_unit_vectors(keywords, categories_lines):
    """Build a sparse vector (dict of dimension -> weight) per course.

    Each course in ``keywords`` gets weight ``1 / norm`` on its category
    and subcategory (when the course appears in ``categories_lines``) and
    weight ``1 / rank / norm`` on each keyword, where rank is the keyword's
    1-based position in the course's keyword list. A keyword that equals a
    category name overwrites the category weight.

    Args:
        keywords: dict mapping courseid -> ranked list of keywords.
        categories_lines: lines of three tab-separated fields (courseid,
            category, subcategory); the first element is skipped.

    Returns:
        dict mapping courseid -> {dimension: weight}.
    """
    # NOTE(review): 1.884 appears to be sqrt(1 + 1 + sum(1/k**2, k=1..10)),
    # i.e. the length of a vector with category, subcategory and ten
    # keywords -- confirm.
    norm = 1.884

    labels = {}
    for row in categories_lines[1:]:
        courseid, category, subcategory = row.split('\t')
        labels[courseid] = (category.strip(), subcategory.strip())

    unit_vectors = {}
    for courseid, ranked in keywords.items():
        vector = {}
        if courseid in labels:
            for label in labels[courseid]:
                vector[label] = 1 / norm
        for keyword in ranked:
            rank = ranked.index(keyword) + 1
            vector[keyword] = 1 / float(rank) / norm
        unit_vectors[courseid] = vector
    return unit_vectors


def get_dot_product(courseid1, courseid2, unit_vectors):
    """Return the dot product of two courses' sparse vectors.

    Args:
        courseid1: key of the first vector in ``unit_vectors``.
        courseid2: key of the second vector in ``unit_vectors``.
        unit_vectors: dict mapping courseid -> {dimension: weight}.

    Returns:
        Sum of ``u1[d] * u2[d]`` over every dimension present in both
        vectors, as a float; ``0.0`` when they share no dimension.

    Raises:
        KeyError: if either courseid is missing from ``unit_vectors``.
    """
    u1 = unit_vectors[courseid1]
    u2 = unit_vectors[courseid2]
    # Accumulate over all shared dimensions before returning; the 0.0 start
    # value keeps the result a float even when nothing is shared.
    return sum(
        (weight * u2[dimension]
         for dimension, weight in u1.items()
         if dimension in u2),
        0.0,
    )


def get_recommendation_results(seed_courseid,
                               keywords,
                               inverted_index,
                               unit_vectors):
    """Rank the courses that share a keyword with the seed course.

    Candidates are all courses listed in ``inverted_index`` under any of
    the seed's keywords, excluding the seed itself. Each candidate is
    scored with ``get_dot_product`` against the seed, and the scores are
    ranked by ``get_sorted_results``.

    Args:
        seed_courseid: courseid to recommend from.
        keywords: dict mapping courseid -> ranked list of keywords.
        inverted_index: dict mapping keyword -> list of courseids.
        unit_vectors: dict mapping courseid -> {dimension: weight}.

    Returns:
        Candidate courseids as ordered by ``get_sorted_results``.

    Raises:
        KeyError: if ``seed_courseid`` is not in ``keywords`` or one of
            its keywords is not in ``inverted_index``.
    """
    dot_products = {}
    for keyword in keywords[seed_courseid]:
        for courseid in inverted_index[keyword]:
            # Skip the seed itself and candidates already scored; the dict
            # doubles as the "seen" set.
            if courseid == seed_courseid or courseid in dot_products:
                continue
            # Key by the single candidate id (a list is unhashable and
            # cannot be a dict key).
            dot_products[courseid] = get_dot_product(seed_courseid,
                                                     courseid,
                                                     unit_vectors)
    return get_sorted_results(dot_products)

Solution

  • I think there might be a small bug:

    def get_bag_of_words(titles_lines):
        """Sum the per-course word counts over every line after the first."""
        bag_of_words = {}
        for line in titles_lines[1:]:
            courseid, course_bag_of_words = get_course_bag_of_words(line)
            for word in course_bag_of_words:
                # Test membership in bag_of_words (the running totals), not
                # in course_bag_of_words: a word seen for the first time
                # must be assigned before `+=` can be used on it.
                if word not in bag_of_words:
                    bag_of_words[word] = course_bag_of_words[word]
                else:
                    bag_of_words[word] += course_bag_of_words[word]
        return bag_of_words
    

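    This is most likely what causes your KeyError: the original code tests `word not in course_bag_of_words`, which is never true, so `+=` runs on a key that `bag_of_words` does not contain yet.

    I didn't check your other functions.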