Let's say I have a long list of names that I would like to feed into an LLM in chunks. How can I split up my list of names so that each group is a list that encodes to fewer than max_tokens tokens, without repeating or breaking up any individual entries in the list? I know from the OpenAI docs that I can turn my list into a big string and use tiktoken
to truncate the string to a given token count, but I don't know how to make sure each chunk contains only whole entries.
import tiktoken
city_reprex = ['The Colony', 'Bridgeport', 'Toledo', 'Barre', 'Newburyport', 'Dover', 'Jonesboro', 'South Haven', 'Ogdensburg', 'Berkeley', 'Ray', 'Sugar Land', 'Telluride', 'Erwin', 'Milpitas', 'Jonesboro', 'Orem', 'Winnemucca', 'Calabash', 'Sugarcreek']
max_tokens = 25
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
prompt = ', '.join(city_reprex)
record_encoding = encoding.encode(prompt)
prompt_size_in_tokens = len(record_encoding)
# How can I get my chunks as close to the max size as possible while also making sure each item in the chunk is a whole item in the list?
print(f"Chunk 1: --> {encoding.decode(record_encoding[:max_tokens])}")
print(f"Chunk 2: --> {encoding.decode(record_encoding[max_tokens:max_tokens*2])}")
Output:
Chunk 1: --> The Colony, Bridgeport, Toledo, Barre, Newburyport, Dover, Jonesboro, South Haven, Ogd
Chunk 2: --> ensburg, Berkeley, Ray, Sugar Land, Telluride, Erwin, Milpitas, Jonesboro, Orem
Thank you for your answer, David. For completeness, I changed your function slightly so that each chunk is returned as a list and does not start with a comma.
import typing as t
import tiktoken
city_reprex = ['The Colony', 'Bridgeport', 'Toledo', 'Barre', 'Newburyport', 'Dover', 'Jonesboro', 'South Haven', 'Ogdensburg', 'Berkeley', 'Ray', 'Sugar Land', 'Telluride', 'Erwin', 'Milpitas', 'Jonesboro', 'Orem', 'Winnemucca', 'Calabash', 'Sugarcreek']
max_tokens = 25
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
def split_list_into_chunks(lst: list, max_tokens: int, encoding: tiktoken.core.Encoding) -> t.List[t.List]:
    chunks = []
    current_chunk = ""
    for item in lst:
        item_size_in_tokens = len(encoding.encode(item))
        # Start a new chunk when this item (plus a ", " separator, budgeted
        # at 2 tokens) would push the current chunk past max_tokens.
        if current_chunk and len(encoding.encode(current_chunk)) + item_size_in_tokens + 2 > max_tokens:
            chunks.append(current_chunk.split(', '))
            current_chunk = item
        elif current_chunk:
            current_chunk += f", {item}"
        else:
            current_chunk = item
    chunks.append(current_chunk.split(', '))
    return chunks
chunks = split_list_into_chunks(city_reprex, max_tokens, encoding)
for i, chunk in enumerate(chunks):
    print(f"Chunk {i + 1}: --> {chunk}")
Output:
Chunk 1: --> ['The Colony', 'Bridgeport', 'Toledo', 'Barre', 'Newburyport', 'Dover', 'Jonesboro', 'South Haven']
Chunk 2: --> ['Ogdensburg', 'Berkeley', 'Ray', 'Sugar Land', 'Telluride', 'Erwin', 'Milpitas', 'Jonesboro']
Chunk 3: --> ['Orem', 'Winnemucca', 'Calabash', 'Sugarcreek']
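One possible refinement for longer lists: re-encoding the growing current_chunk on every iteration makes the function roughly quadratic, and the final .split(', ') would mis-split any entry that itself contains ", " (e.g. "Washington, DC"). Below is a minimal sketch, not from David's answer, that builds each chunk as a list directly and keeps a running token count. It estimates the separator-inclusive cost by encoding ", " + item, which is close to exact for word-like entries with cl100k_base-style encodings, but worth verifying with a final encode of ', '.join(chunk) if you need a hard guarantee.
import typing as t
import tiktoken
def split_list_into_chunks_fast(lst: t.List[str], max_tokens: int, encoding: tiktoken.core.Encoding) -> t.List[t.List[str]]:
    # Hypothetical variant: chunks are built as lists, so items are never
    # joined and re-split, and no item string is ever modified.
    chunks = []
    current_chunk = []
    current_tokens = 0
    for item in lst:
        # Cost of appending this item: include the ", " separator unless the
        # chunk is empty. Encoding f", {item}" measures the real separator
        # cost instead of a flat +2 allowance.
        cost = len(encoding.encode(f", {item}")) if current_chunk else len(encoding.encode(item))
        if current_chunk and current_tokens + cost > max_tokens:
            chunks.append(current_chunk)
            current_chunk = [item]
            current_tokens = len(encoding.encode(item))
        else:
            current_chunk.append(item)
            current_tokens += cost
    if current_chunk:
        chunks.append(current_chunk)
    return chunks
Note that in running text the comma is typically a single token and the following space fuses into the next word's token, so the flat + 2 in the version above overestimates slightly; this variant may therefore pack chunks a little tighter, while still never splitting an entry.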