Tags: python, openai-api

Break list of words into whole word chunks under a max token size


Let's say I have a long list of names that I would like to feed into an LLM in chunks. How can I split up my list of names so that each chunk stays under max_tokens tokens without repeating or breaking up any individual entry in the list? I know from the OpenAI docs that I can join my list into one big string and use tiktoken to truncate the string to a token count, but I don't know how to make sure each chunk contains only whole entries.

import tiktoken

city_reprex = ['The Colony', 'Bridgeport', 'Toledo', 'Barre', 'Newburyport', 'Dover', 'Jonesboro', 'South Haven', 'Ogdensburg', 'Berkeley', 'Ray', 'Sugar Land', 'Telluride', 'Erwin', 'Milpitas', 'Jonesboro', 'Orem', 'Winnemucca', 'Calabash', 'Sugarcreek']

max_tokens = 25
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

prompt = ', '.join(city_reprex)

prompt_size_in_tokens = len(encoding.encode(prompt))
record_encoding = encoding.encode(prompt)

# How can I get my chunks as close to the max size as possible while also making sure each item in the chunk is a whole item in the list?
print(f"Chunk 1: --> {encoding.decode(record_encoding[:max_tokens])}")
print(f"Chunk 2: --> {encoding.decode(record_encoding[max_tokens:max_tokens*2])}")

Output:

Chunk 1: --> The Colony, Bridgeport, Toledo, Barre, Newburyport, Dover, Jonesboro, South Haven, Ogd
Chunk 2: --> ensburg, Berkeley, Ray, Sugar Land, Telluride, Erwin, Milpitas, Jonesboro, Orem

Solution

  • Thank you for your answer, David. For completeness, I changed your function slightly so that the output is of type list and does not start with a comma.

    import typing as t
    
    import tiktoken
    
    city_reprex = ['The Colony', 'Bridgeport', 'Toledo', 'Barre', 'Newburyport', 'Dover', 'Jonesboro', 'South Haven', 'Ogdensburg', 'Berkeley', 'Ray', 'Sugar Land', 'Telluride', 'Erwin', 'Milpitas', 'Jonesboro', 'Orem', 'Winnemucca', 'Calabash', 'Sugarcreek']
    
    max_tokens = 25
    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
    
    def split_list_into_chunks(lst: list, max_tokens: int, encoding: tiktoken.core.Encoding) -> t.List[t.List]:
        """Greedily pack whole items into chunks whose joined string stays under max_tokens tokens."""
        chunks = []
        current_chunk = ""
        for i, item in enumerate(lst):
            item_size_in_tokens = len(encoding.encode(item))
            # The + 2 pads for the ", " separator that joins the item to the chunk.
            if len(encoding.encode(current_chunk)) + item_size_in_tokens + 2 > max_tokens:
                # Adding this item would overflow the budget: close out the current chunk.
                chunks.append(current_chunk.split(', '))
                current_chunk = item
            elif i > 0:
                current_chunk += f", {item}"
            else:
                current_chunk += item
        # Flush the last, partially filled chunk.
        chunks.append(current_chunk.split(', '))
        return chunks
    
    chunks = split_list_into_chunks(city_reprex, max_tokens, encoding)
    
    for i, chunk in enumerate(chunks):
        print(f"Chunk {i + 1}: --> {chunk}")
    

    Output

    Chunk 1: --> ['The Colony', 'Bridgeport', 'Toledo', 'Barre', 'Newburyport', 'Dover', 'Jonesboro', 'South Haven']
    Chunk 2: --> ['Ogdensburg', 'Berkeley', 'Ray', 'Sugar Land', 'Telluride', 'Erwin', 'Milpitas', 'Jonesboro']
    Chunk 3: --> ['Orem', 'Winnemucca', 'Calabash', 'Sugarcreek']
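Two caveats with the approach above: it re-encodes the growing chunk string on every iteration, and the final `split(', ')` would break any entry that itself contains a comma. A variant that builds lists directly and keeps a running token count avoids both. This is only a sketch, not part of the original answer: the helper name `chunk_by_tokens` and the stand-in counter are my own, and because BPE token counts are not perfectly additive across `", "` joins, the running total is an approximation of the joined string's true count.

```python
from typing import Callable, List

def chunk_by_tokens(items: List[str], max_tokens: int,
                    count_tokens: Callable[[str], int]) -> List[List[str]]:
    """Greedily pack whole items so each chunk's joined string stays
    at or under max_tokens tokens (approximate: BPE counts are not
    strictly additive across ", " boundaries)."""
    sep_cost = count_tokens(", ")  # cost of the joining separator
    chunks: List[List[str]] = []
    current: List[str] = []
    current_tokens = 0
    for item in items:
        # Separator cost only applies when the chunk already has items.
        cost = count_tokens(item) + (sep_cost if current else 0)
        if current and current_tokens + cost > max_tokens:
            chunks.append(current)
            current, current_tokens = [item], count_tokens(item)
        else:
            current.append(item)
            current_tokens += cost
    if current:  # flush the last, partially filled chunk
        chunks.append(current)
    return chunks

# Stand-in whitespace counter so the sketch runs without tiktoken;
# in practice pass count_tokens=lambda s: len(encoding.encode(s)).
toy_count = lambda s: len(s.split())
print(chunk_by_tokens(["a b", "c d e", "f"], 4, toy_count))
```

With tiktoken you would pass `count_tokens=lambda s: len(encoding.encode(s))`; the running total can differ from encoding the full joined string by a token or two, which in practice only makes chunks slightly more conservative.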