Search code examples
pythonoptimizationletter

One letter game Issue?


Recently at a job interview I was given the following problem:

  1. Write a script capable of running on the command line as python

  2. It should take in two words on the command line (or optionally if you'd prefer it can query the user to supply the two words via the console).

  3. Given those two words: a. Ensure they are of equal length b. Ensure they are both words present in the dictionary of valid words in the English language that you downloaded.

  4. If so compute whether you can reach the second word from the first by a series of steps as follows a. You can change one letter at a time b. Each time you change a letter the resulting word must also exist in the dictionary c. You cannot add or remove letters

  5. If the two words are reachable, the script should print out the path which leads as a single, shortest path from one word to the other.

  6. You can /usr/share/dict/words for your dictionary of words.

My solution consisted of using breadth first search to find a shortest path between two words. But apparently that wasn't good enough to get the job :(

Would you guys know what I could have done wrong? Thank you so much.

import collections
import functools
import re

def time_func(func):
    import time

    def wrapper(*args, **kwargs):
        start = time.time()
        res = func(*args, **kwargs)
        timed = time.time() - start

        setattr(wrapper, 'time_taken', timed)
        return res

    functools.update_wrapper(wrapper, func)
    return wrapper

class OneLetterGame:
    def __init__(self, dict_path):
        self.dict_path = dict_path
        self.words = set()

    def run(self, start_word, end_word):
        '''Runs the one letter game with the given start and end words.
        '''
        assert len(start_word) == len(end_word), \
            'Start word and end word must of the same length.'

        self.read_dict(len(start_word))

        path = self.shortest_path(start_word, end_word)
        if not path:
            print 'There is no path between %s and %s (took %.2f sec.)' % (
                start_word, end_word, find_shortest_path.time_taken)
        else:
            print 'The shortest path (found in %.2f sec.) is:\n=> %s' % (
                self.shortest_path.time_taken, ' -- '.join(path))

    def _bfs(self, start):
        '''Implementation of breadth first search as a generator.

        The portion of the graph to explore is given on demand using get_neighboors.
        Care was taken so that a vertex / node is explored only once.
        '''
        queue = collections.deque([(None, start)])
        inqueue = set([start])

        while queue:
            parent, node = queue.popleft()
            yield parent, node

            new = set(self.get_neighbours(node)) - inqueue
            inqueue = inqueue | new
            queue.extend([(node, child) for child in new])

    @time_func
    def shortest_path(self, start, end):
        '''Returns the shortest path from start to end using bfs.
        '''
        assert start in self.words, 'Start word not in dictionnary.'
        assert end in self.words, 'End word not in dictionnary.'

        paths = {None: []}
        for parent, child in self._bfs(start):
            paths[child] = paths[parent] + [child]
            if child == end:
                return paths[child]
        return None

    def get_neighbours(self, word):
        '''Gets every word one letter away from the a given word.

        We do not keep these words in memory because bfs accesses 
        a given vertex only once.
        '''
        neighbours = []

        p_word = ['^' + word[0:i] + '\w' + word[i+1:] + '$' 
            for i, w in enumerate(word)]
        p_word = '|'.join(p_word)

        for w in self.words:
            if w != word and re.match(p_word, w, re.I|re.U):
                neighbours += [w]
        return neighbours

    def read_dict(self, size):
        '''Loads every word of a specific size from the dictionnary into memory.
        '''
        for l in open(self.dict_path):
            l = l.decode('latin-1').strip().lower()
            if len(l) == size:
                self.words.add(l)

if __name__ == '__main__':
    import sys
    if len(sys.argv) not in [3, 4]:
        print 'Usage: python one_letter_game.py start_word end_word'
    else:
        g = OneLetterGame(dict_path = '/usr/share/dict/words')
        try:
            g.run(*sys.argv[1:])
        except AssertionError, e:
            print e

Thank you for all the great answers. I think what really got me is the fact that I do iterate over ALL words in the dictionary each time to consider possible word neighbors. Instead I could have used an inverted index as pointed by Duncan and Matt Anderson. A* approach would definitely have helped too. Thanks a lot, now I know what I have done wrong.

Here is the same code with inverted index:

import collections
import functools
import re

def time_func(func):
    import time

    def wrapper(*args, **kwargs):
        start = time.time()
        res = func(*args, **kwargs)
        timed = time.time() - start

        setattr(wrapper, 'time_taken', timed)
        return res

    functools.update_wrapper(wrapper, func)
    return wrapper

class OneLetterGame:
    def __init__(self, dict_path):
        self.dict_path = dict_path
        self.words = {}

    def run(self, start_word, end_word):
        '''Runs the one letter game with the given start and end words.
        '''
        assert len(start_word) == len(end_word), \
            'Start word and end word must of the same length.'

        self.read_dict(len(start_word))

        path = self.shortest_path(start_word, end_word)
        if not path:
            print 'There is no path between %s and %s (took %.2f sec.)' % (
                start_word, end_word, self.shortest_path.time_taken)
        else:
            print 'The shortest path (found in %.2f sec.) is:\n=> %s' % (
                self.shortest_path.time_taken, ' -- '.join(path))

    def _bfs(self, start):
        '''Implementation of breadth first search as a generator.

        The portion of the graph to explore is given on demand using get_neighboors.
        Care was taken so that a vertex / node is explored only once.
        '''
        queue = collections.deque([(None, start)])
        inqueue = set([start])

        while queue:
            parent, node = queue.popleft()
            yield parent, node

            new = set(self.get_neighbours(node)) - inqueue
            inqueue = inqueue | new
            queue.extend([(node, child) for child in new])

    @time_func
    def shortest_path(self, start, end):
        '''Returns the shortest path from start to end using bfs.
        '''
        assert self.in_dictionnary(start), 'Start word not in dictionnary.'
        assert self.in_dictionnary(end), 'End word not in dictionnary.'

        paths = {None: []}
        for parent, child in self._bfs(start):
            paths[child] = paths[parent] + [child]
            if child == end:
                return paths[child]
        return None

    def in_dictionnary(self, word):
        for s in self.get_steps(word):
            if s in self.words:
                return True
        return False

    def get_neighbours(self, word):
        '''Gets every word one letter away from the a given word.
        '''
        for step in self.get_steps(word):
            for neighbour in self.words[step]:
                yield neighbour

    def get_steps(self, word):
        return (word[0:i] + '*' + word[i+1:] 
            for i, w in enumerate(word))

    def read_dict(self, size):
        '''Loads every word of a specific size from the dictionnary into an inverted index.
        '''
        for w in open(self.dict_path):
            w = w.decode('latin-1').strip().lower()
            if len(w) != size:
                continue
            for step in self.get_steps(w):
                if step not in self.words:
                    self.words[step] = [] 
                self.words[step].append(w)

if __name__ == '__main__':
    import sys
    if len(sys.argv) not in [3, 4]:
        print 'Usage: python one_letter_game.py start_word end_word'
    else:
        g = OneLetterGame(dict_path = '/usr/share/dict/words')
        try:
            g.run(*sys.argv[1:])
        except AssertionError, e:
            print e

And a timing comparison:

% python one_letter_game_old.py happy hello The shortest path (found in 91.57 sec.) is:
=> happy -- harpy -- harps -- harts -- halts -- halls -- hells -- hello

% python one_letter_game.py happy hello The shortest path (found in 1.71 sec.) is:
=> happy -- harpy -- harps -- harts -- halts -- halls -- hells -- hello


Solution

  • I wouldn't say your solution is wrong, but it is a little slow. For two reasons.

    1. Breadth-first-search is going to visit all paths of length one shorter than is needed, plus some-to-all of paths of length needed, before it can give you an answer. A best-first-search (A*) will ideally skip most irrelevant paths.

    2. You're checking every word in the dictionary for candidacy as a neighbor each time you look for neighbors. As Duncan suggests, you can build a data structure to essentially look up the neighbors instead of searching for them.

    Here is another solution to your problem:

    import collections
    import heapq
    import time
    
    def distance(start, end):
        steps = 0
        for pos in range(len(start)):
            if start[pos] != end[pos]:
                steps += 1
        return steps
    
    
    class SearchHeap(object):
        def __init__(self):
            self.on_heap = set()
            self.heap = []
    
        def push(self, distance, word, path):
            if word in self.on_heap:
                return
            self.on_heap.add(word)
            heapq.heappush(self.heap, ((distance, len(path)), word, path))
    
        def __len__(self):
            return len(self.heap)
    
        def pop(self):
            return heapq.heappop(self.heap)
    
    
    class OneLetterGame(object):
        _word_data = None
    
        def __init__(self, dict_path):
            self.dict_path = dict_path
    
        def run(self, start_word, end_word):
            start_time = time.time()
            self._word_data = collections.defaultdict(list)
            if len(start_word) != len(end_word):
                print 'words of different length; no path'
                return
    
            found_start, found_end = self._load_words(start_word, end_word)
            if not found_start:
                print 'start word %r not found in dictionary' % start_word
                return
            if not found_end:
                print 'end word %r not found in dictionary' % end_word
                return
    
            search_start_time = time.time()
            path = self._shortest_path(start_word, end_word)
            search_time = time.time() - search_start_time
            print 'search time was %.4f seconds' % search_time
    
            if path:
                print path
            else:
                print 'no path found from %r to %r' % (start_word, end_word)
    
            run_time = time.time() - start_time
            print 'total run time was %.4f seconds' % run_time
    
        def _load_words(self, start_word, end_word):
            found_start, found_end = False, False
            length = len(start_word)
            with open(self.dict_path) as words:
                for word in words:
                    word = word.strip()
                    if len(word) == length:
                        if start_word == word: found_start = True
                        if end_word == word: found_end = True
                        for bucket in self._buckets_for(word):
                            self._word_data[bucket].append(word)
            return found_start, found_end
    
        def _shortest_path(self, start_word, end_word):
            heap = SearchHeap()
            heap.push(distance(start_word, end_word), start_word, (start_word,))
            while len(heap):
                dist, word, path = heap.pop()
                if word == end_word:
                    return path
                for neighbor in self._neighbors_of(word):
                    heap.push(
                        distance(neighbor, end_word), 
                        neighbor, 
                        path + (neighbor,))
            return ()
    
        def _buckets_for(self, word):
            buckets = []
            for pos in range(len(word)):
                front, back = word[:pos], word[pos+1:]
                buckets.append(front+'*'+back)
            return buckets
    
        def _neighbors_of(self, word):
            for bucket in self._buckets_for(word):
                for word in self._word_data[bucket]:
                    yield word
    
    if __name__ == '__main__':
        import sys
        if len(sys.argv) not in [3, 4]:
            print 'Usage: python one_letter_game.py start_word end_word'
        else:
            matt = OneLetterGame(dict_path = '/usr/share/dict/words')
            matt.run(*sys.argv[1:])
    

    And, a timing comparison:

    % python /tmp/one_letter_alex.py canoe happy
    The shortest path (found in 51.98 sec.) is:
    => canoe -- canon -- caxon -- taxon -- taxor -- taxer -- taper -- paper -- papey -- pappy -- happy
    
    % python /tmp/one_letter_matt.py canoe happy
    search time was 0.0020 seconds
    ('canoe', 'canon', 'caxon', 'taxon', 'taxor', 'taxer', 'taper', 'paper', 'papey', 'pappy', 'happy')
    total run time was 0.2416 seconds