Search code examples
pythonstringalgorithmdata-structureswords

Generate words of varying length give n number of characters


I need to generate all possible words of Ki lengths given n characters for example:

given

LNDJOBEAWRL

do bear

I cant come up with the word of len 5 but this is the idea

n = 11
k1 = 2
k2 = 4
k3 = 5 

So basically all words of length 2 4 and 5 but without reusing characters. What would be the best way to do it?


My dictionary structure looks likes this:

{
    3: [{u'eit': u' "eit":0'}], 
    5: [{u'doosw': u' "woods": 4601, '}, {u'acenr': u' "caner": 0, '}, {u'acens': u' "canes": 0, '}, {u'acden': u' "caned": 0, '}, {u'aceln': u' "canel": 0,'}], 
    6: [{u'abeill': u' "alible": 0, '}, {u'cdeeit': u' "deciet":0,'}, {u'demoor': u' "mooder": 0, '}], 
    7: [{u'deiprss': u' "spiders": 0, '}, {u'deiprsy': u' "spidery": 0, '}, {u'cersttu': u' "scutter": 0, '}], 
    8: [{u'chiiilst': u' "chilitis": 0, '}, {u'agilnrtw': u' "trawling": 0, '}, {u'abdeemns': u' "beadsmen": 0, '}], 
    9: [{u'abeiilnns': u' "biennials": 0, '}, {u'bclooortu': u' "oblocutor": 0, '}, {u'aabfiinst': u' "fabianist": 0, '}, {u'acdeiituz': u' "diazeutic": 0, '}, {u'aabfiimns': u' "fabianism": 0, '}, {u'ehnoooppt': u' "optophone": 0, '}], 
    10: [{u'aiilnoprtt': u' "tripolitan": 0, '}, {u'eeilprrsty': u' "sperrylite": 0, '}, {u'gghhiilttt': u' "lighttight": 0, '}, {u'aeegilrruz': u' "regularize": 0, '}, {u'ellnprtuuy': u' "purulently": 0, '}], 
    11: [{u'cdgilnoostu': u' "outscolding": 0, '}], 
    12: [{u'ceeeilnostuy': u' "leucosyenite": 0, '}, {u'aacciloprsst': u' "sarcoplastic": 0, '}], 
    13: [{u'acdeimmoprrsu': u' "cardiospermum": 0, '}, {u'celnnooostuvy': u' "noncovetously": 0, '}], 
    14: [{u'adeejmnnoprrtu': u' "preadjournment": 0, '}]
}

And my modified code looks like this:

wlen = self.table[pos]
if pos == 0:
    # See if the letters remaining in the bag are a valid word
    key = ''.join(sorted(bag.elements()))

    for d in wlen:
        if key in d.keys():
            yield solution + [key]
else:
    pos -= 1
    for dic in wlen:
        print(len(dic))
        for key in dic.keys():

Solution

  • The code below uses a recursive generator to build solutions. To store the target letters we use a collections.Counter, this acts like a set that permits repeated items.

    To simplify searching we create a dictionary for each word length that we want, storing each of those dictionaries in a dictionary called all_words, with the word length as the key. Each sub-dictionary stores lists of words containing the same letters, with the sorted letters as the key, eg 'aet': ['ate', 'eat', 'tea'].

    I use the standard Unix '/usr/share/dict/words' word file. If you use a file with a different format you may need to modify the code that puts words into all_words.

    The solve function starts its search with the smallest word length, and works up to the largest. That's probably the most efficient order if the set containing the longest words is the biggest, since the final search is performed by doing a simple dict lookup, which is very fast. The previous searches have to test each word in the sub-dictionary of that length, looking for keys that are still in the target bag.

    #!/usr/bin/env python3
    
    ''' Create anagrams from a string of target letters and a list of word lengths '''
    
    from collections import Counter
    from itertools import product
    
    # The Unix word list
    fname = '/usr/share/dict/words'
    
    # The target letters to use
    target = 'lndjobeawrl'
    
    # Word lengths, in descending order
    wordlengths = [5, 4, 2]
    
    # A dict to hold dicts for each word length.
    # The inner dicts store lists of words containing the same letters,
    # with the sorted letters as the key, eg 'aet': ['ate', 'eat', 'tea']
    all_words = {i: {} for i in wordlengths}
    
    # A method that tests if a word only contains letters in target
    valid = set(target).issuperset
    
    print('Scanning', fname, 'for valid words...')
    count = 0
    with open(fname) as f:
        for word in f:
            word = word.rstrip()
            wlen = len(word)
            # Only add words of the correct length, with no punctuation.
            # Using word.islower() eliminates most abbreviations.
            if (wlen in wordlengths and word.islower()
            and word.isalpha() and valid(word)):
                sorted_word = ''.join(sorted(word))
                # Add this word to the list in all_words[wlen],
                # creating the list if it doesn't exist
                all_words[wlen].setdefault(sorted_word, []).append(word)
                count += 1
    
    print(count, 'words found')
    for k, v in all_words.items():
        print(k, len(v))
    print('\nSolving...')
    
    def solve(pos, bag, solution):
        wlen = wordlengths[pos]
        if pos == 0:
            # See if the letters remaining in the bag are a valid word
            key = ''.join(sorted(bag.elements()))
            if key in all_words[wlen]:
                yield solution + [key]
        else:
            pos -= 1
            for key in all_words[wlen].keys():
                # Test that all letters in key are in the bag
                newbag = bag.copy()
                newbag.subtract(key)
                if all(v >= 0 for v in newbag.values()):
                    # Add this key to the current solution and 
                    # recurse to find the next key
                    yield from solve(pos, newbag, solution + [key])
    
    # Find all lists of keys that produce valid combinations
    for solution in solve(len(wordlengths) - 1, Counter(target), []):
        # Convert solutions to tuples of words
        t = [all_words[len(key)][key] for key in solution]
        for s in product(*t):
            print(s)
    

    output

    Scanning /usr/share/dict/words for valid words...
    300 words found
    5 110
    4 112
    2 11
    
    Solving...
    ('ad', 'jell', 'brown')
    ('do', 'jell', 'brawn')
    ('ow', 'jell', 'brand')
    ('re', 'jowl', 'bland')
    

    FWIW, here are the results for

    target = 'nobigword'
    wordlengths = [4, 3, 2]
    

    output

    Scanning /usr/share/dict/words for valid words...
    83 words found
    4 31
    3 33
    2 7
    
    Solving...
    ('do', 'big', 'worn')
    ('do', 'bin', 'grow')
    ('do', 'nib', 'grow')
    ('do', 'bow', 'grin')
    ('do', 'bow', 'ring')
    ('do', 'gin', 'brow')
    ('do', 'now', 'brig')
    ('do', 'own', 'brig')
    ('do', 'won', 'brig')
    ('do', 'orb', 'wing')
    ('do', 'rob', 'wing')
    ('do', 'rib', 'gown')
    ('do', 'wig', 'born')
    ('go', 'bid', 'worn')
    ('go', 'bin', 'word')
    ('go', 'nib', 'word')
    ('go', 'bow', 'rind')
    ('go', 'din', 'brow')
    ('go', 'now', 'bird')
    ('go', 'own', 'bird')
    ('go', 'won', 'bird')
    ('go', 'orb', 'wind')
    ('go', 'rob', 'wind')
    ('go', 'rib', 'down')
    ('go', 'row', 'bind')
    ('id', 'bog', 'worn')
    ('id', 'gob', 'worn')
    ('id', 'orb', 'gown')
    ('id', 'rob', 'gown')
    ('id', 'row', 'bong')
    ('in', 'bog', 'word')
    ('in', 'gob', 'word')
    ('in', 'dog', 'brow')
    ('in', 'god', 'brow')
    ('no', 'bid', 'grow')
    ('on', 'bid', 'grow')
    ('no', 'big', 'word')
    ('on', 'big', 'word')
    ('no', 'bow', 'gird')
    ('no', 'bow', 'grid')
    ('on', 'bow', 'gird')
    ('on', 'bow', 'grid')
    ('no', 'dig', 'brow')
    ('on', 'dig', 'brow')
    ('or', 'bid', 'gown')
    ('or', 'big', 'down')
    ('or', 'bog', 'wind')
    ('or', 'gob', 'wind')
    ('or', 'bow', 'ding')
    ('or', 'wig', 'bond')
    ('ow', 'bog', 'rind')
    ('ow', 'gob', 'rind')
    ('ow', 'dig', 'born')
    ('ow', 'don', 'brig')
    ('ow', 'nod', 'brig')
    ('ow', 'orb', 'ding')
    ('ow', 'rob', 'ding')
    ('ow', 'rid', 'bong')
    ('ow', 'rig', 'bond')
    

    This code was written for Python 3. You can use it on Python 2.7 but you will need to change

    yield from solve(pos, newbag, solution + [key])
    

    to

    for result in solve(pos, newbag, solution + [key]):
        yield result