I have a profiles table in SQL Server with around 50 columns and only 244 rows. I created a view with only 2 columns, ID and content; in content I concatenated the data from all the other columns in a format like this: FirstName: John. LastName: Smith. Age: 70. Likes: Gardening, Painting. Dislikes: Soccer.
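For context, the view just concatenates the profile columns into one text field per row. A simplified sketch of how such a view could be defined is below; the source table name dbo.Profiles and the listed columns are placeholders (the real view covers all ~50 columns):

import pyodbc

# Placeholder connection string; the indexing code further down builds the real one from config values.
cnxn = pyodbc.connect(
    'DRIVER={ODBC Driver 17 for SQL Server};SERVER=<server>.database.windows.net;'
    'DATABASE=<db>;UID=<user>;PWD=<password>'
)

# Sketch of the view: concatenate a handful of profile columns into a single 'content' field.
# dbo.Profiles and the column names here are placeholders, not the actual schema.
create_view_sql = """
CREATE VIEW views.vwprofiles2 AS
SELECT
    IdentityId,
    CONCAT(
        'FirstName: ', FirstName, '. ',
        'LastName: ', LastName, '. ',
        'Age: ', Age, '. ',
        'Likes: ', Likes, '. ',
        'Dislikes: ', Dislikes, '.'
    ) AS content
FROM dbo.Profiles;
"""
cnxn.execute(create_view_sql)
cnxn.commit()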
Then I wrote the following code to index all the content from the view into Pinecone, and it works so far. However, I noticed something strange.
What am I doing wrong, or is this normal?
import pandas as pd
import pinecone
import pyodbc
import streamlit as st
import tiktoken
from uuid import uuid4
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from stqdm import stqdm

pinecone.init(
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
    environment=PINECONE_ENV   # next to api key in console
)

st.title('Work in progress')

# Token-length helper for the splitter (encoding should match the embedding model)
tokenizer = tiktoken.get_encoding("cl100k_base")
def tiktoken_len(text):
    return len(tokenizer.encode(text))

embed = OpenAIEmbeddings(
    deployment=OPENAI_EMBEDDING_DEPLOYMENT_NAME,
    model=OPENAI_EMBEDDING_MODEL_NAME,
    chunk_size=1
)

cnxn = pyodbc.connect(
    'DRIVER={ODBC Driver 17 for SQL Server};SERVER=' + DATABASE_SERVER +
    '.database.windows.net;DATABASE=' + DATABASE_DB +
    ';UID=' + DATABASE_USERNAME + ';PWD=' + DATABASE_PASSWORD
)
query = "SELECT * FROM views.vwprofiles2;"
df = pd.read_sql(query, cnxn)

index = pinecone.Index("default")
batch_limit = 100

texts = []
metadatas = []

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=20,
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""]
)

for _, record in stqdm(df.iterrows(), total=len(df)):
    # First get metadata fields for this record
    metadata = {
        'IdentityId': str(record['IdentityId'])
    }
    # Now we create chunks from the record text
    record_texts = text_splitter.split_text(record['content'])
    # Create individual metadata dicts for each chunk
    record_metadatas = [{
        "chunk": j, "text": text, **metadata
    } for j, text in enumerate(record_texts)]
    # Append these to the current batches
    texts.extend(record_texts)
    metadatas.extend(record_metadatas)
    # If we have reached the batch_limit, embed and upsert the batch
    if len(texts) >= batch_limit:
        ids = [str(uuid4()) for _ in range(len(texts))]
        embeds = embed.embed_documents(texts)
        index.upsert(vectors=zip(ids, embeds, metadatas))
        texts = []
        metadatas = []

# Upsert anything left over after the loop
if len(texts) > 0:
    ids = [str(uuid4()) for _ in range(len(texts))]
    embeds = embed.embed_documents(texts)
    index.upsert(vectors=zip(ids, embeds, metadatas))
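After the loop finishes, one way to sanity-check the result is to look at the index stats and run a quick query. This is just a sketch assuming the same index and embed objects as above; note that the vector count reflects chunks rather than source rows, so it can be higher than 244:

# Sketch: inspect what actually landed in the index (same `index` and `embed` objects as above).
stats = index.describe_index_stats()
print(stats)  # total_vector_count counts chunks, so it can exceed the 244 source rows

# Quick similarity query against the stored chunks.
query_vec = embed.embed_query("Likes: Gardening")
res = index.query(vector=query_vec, top_k=3, include_metadata=True)
for match in res.matches:
    print(match.score, match.id, match.metadata)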
I have done some research on the topic and have a few recommendations.
Consider the following when optimizing this code:
- Batch size: a larger batch_limit (500 in the example below instead of 100) means fewer round trips to the embedding API and to Pinecone.
- Chunk size: with rows this short, a larger chunk_size (800 in the example below instead of 400) produces fewer chunks per row, and therefore fewer vectors to embed and upsert.
- Measurement: time each step of the loop so you can see where the time actually goes before changing anything.
With these factors in mind, you should be able to reduce the runtime of your code. The example below generates 244 rows of fake profile data with Faker and instruments every step with timings and rough complexity notes.
Example:
from faker import Faker
import pandas as pd
import time
import tiktoken
from uuid import uuid4
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Initialize Faker for random data generation
fake = Faker()

# Create a DataFrame with 244 rows of random data
data = {
    'IdentityId': [fake.uuid4() for _ in range(244)],
    'content': [fake.text(max_nb_chars=1000) for _ in range(244)]
}
df = pd.DataFrame(data)

# Token-length helper for the splitter (same role as tiktoken_len in your code)
tokenizer = tiktoken.get_encoding("cl100k_base")
def tiktoken_len(text):
    return len(tokenizer.encode(text))

# Initialize lists for texts and metadata
texts = []
metadatas = []

# Set the batch limit
batch_limit = 500

# Initialize the text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=20,
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""]
)

# Iterate through DataFrame rows
# Time Complexity: O(n), where n is the number of rows in the DataFrame
for _, record in df.iterrows():
    start_time = time.time()
    # Get metadata for this record
    # Time Complexity: O(1)
    metadata = {
        'IdentityId': str(record['IdentityId'])
    }
    print(f'Time taken for metadata extraction: {time.time() - start_time} seconds')

    start_time = time.time()
    # Split record text into chunks
    # Time Complexity: O(m), where m is the size of the text
    record_texts = text_splitter.split_text(record['content'])
    print(f'Time taken for text splitting: {time.time() - start_time} seconds')

    start_time = time.time()
    # Create metadata for each chunk
    # Time Complexity: O(k), where k is the number of chunks in the text
    record_metadatas = [{
        "chunk": j, "text": text, **metadata
    } for j, text in enumerate(record_texts)]
    print(f'Time taken for metadata dictionary creation: {time.time() - start_time} seconds')

    start_time = time.time()
    # Append chunks and metadata to the current batches
    # Time Complexity: O(k) for extending the lists by k chunks
    texts.extend(record_texts)
    metadatas.extend(record_metadatas)
    print(f'Time taken for data appending: {time.time() - start_time} seconds')

    # If batch_limit is reached, upsert vectors
    # Time Complexity: depends on the embedding and upsert implementation
    if len(texts) >= batch_limit:
        start_time = time.time()
        ids = [str(uuid4()) for _ in range(len(texts))]
        # Simulating embedding and upserting here
        texts = []
        metadatas = []
        print(f'Time taken for vector upsertion (simulated): {time.time() - start_time} seconds')

# Upsert any remaining vectors after the loop
# Time Complexity: depends on the embedding and upsert implementation
if len(texts) > 0:
    start_time = time.time()
    ids = [str(uuid4()) for _ in range(len(texts))]
    # Simulating embedding and upserting here
    print(f'Time taken for remaining vector upsertion (simulated): {time.time() - start_time} seconds')
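To measure the real work rather than the simulation, the same timing pattern can wrap the actual calls from your question. The flush_batch helper below is just a sketch; it assumes embed (OpenAIEmbeddings) and index (pinecone.Index) are configured as in your original code and that texts, metadatas, and batch_limit are built up as in the loop above.

# Sketch: time the real embedding + upsert calls instead of the simulation.
# Assumes `embed` and `index` are set up as in the question.
def flush_batch(texts, metadatas):
    start_time = time.time()
    ids = [str(uuid4()) for _ in range(len(texts))]
    embeds = embed.embed_documents(texts)  # one batched call to the embedding API
    index.upsert(vectors=list(zip(ids, embeds, metadatas)))
    print(f'Embedded and upserted {len(texts)} chunks in {time.time() - start_time:.2f} seconds')

# Usage, in place of the simulated blocks:
# if len(texts) >= batch_limit:
#     flush_batch(texts, metadatas)
#     texts, metadatas = [], []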