I found this code: https://github.com/pixegami/langchain-rag-tutorial/blob/main/create_database.py
It takes the documents, splits them into chunks, creates vector embeddings for each chunk, and saves them into a Chroma database. However, the source code uses an OpenAI key to create the embeddings. Since I don't have access to OpenAI, I tried to use a free alternative, SentenceTransformers, but I wasn't able to rewrite the code properly.
Here is my current attempt:
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from sentence_transformers import SentenceTransformer
from langchain.vectorstores.chroma import Chroma
import os
import shutil

CHROMA_PATH = "chroma"
DATA_PATH = "data/books"

embedder = SentenceTransformer("all-MiniLM-L6-v2")


def main():
    generate_data_store()


def generate_data_store():
    documents = load_documents()
    chunks = split_text(documents)
    save_to_chroma(chunks)


def load_documents():
    loader = DirectoryLoader(DATA_PATH, glob="*.md")
    documents = loader.load()
    return documents


def split_text(documents: list[Document]):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    document = chunks[10]
    print(document.page_content)
    print(document.metadata)

    return chunks


def save_to_chroma(chunks: list[Document]):
    # Clear out the database first.
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    # Create a new DB from the documents.
    db = Chroma.from_documents(
        chunks, embedder.encode, persist_directory=CHROMA_PATH
    )
    db.persist()
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")


if __name__ == "__main__":
    main()
This is the error that I get:
Traceback (most recent call last):
  File "/media/andrew/Simple Tom/Robotics/Crew_AI/langchain-rag-tutorial/create_database.py", line 56, in <module>
    main()
  File "/media/andrew/Simple Tom/Robotics/Crew_AI/langchain-rag-tutorial/create_database.py", line 15, in main
    generate_data_store()
  File "/media/andrew/Simple Tom/Robotics/Crew_AI/langchain-rag-tutorial/create_database.py", line 20, in generate_data_store
    save_to_chroma(chunks)
  File "/media/andrew/Simple Tom/Robotics/Crew_AI/langchain-rag-tutorial/create_database.py", line 49, in save_to_chroma
    db = Chroma.from_documents(
  File "/home/andrew/.local/lib/python3.10/site-packages/langchain_community/vectorstores/chroma.py", line 778, in from_documents
    return cls.from_texts(
  File "/home/andrew/.local/lib/python3.10/site-packages/langchain_community/vectorstores/chroma.py", line 736, in from_texts
    chroma_collection.add_texts(
  File "/home/andrew/.local/lib/python3.10/site-packages/langchain_community/vectorstores/chroma.py", line 275, in add_texts
    embeddings = self._embedding_function.embed_documents(texts)
AttributeError: 'function' object has no attribute 'embed_documents'
I'm not a programmer. If someone could point out whether it's at all possible to achieve this without an OpenAI key and, if so, where my mistake is, that'd be great.
Yes, it's possible. Chroma.from_documents expects a LangChain embedding model, not a bare function: embedder.encode has no embed_documents method, which is exactly what the traceback is complaining about. Wrap the SentenceTransformers model with HuggingFaceEmbeddings and pass that object as the embedding:
from langchain_community.embeddings import HuggingFaceEmbeddings

embedder = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

db = Chroma.from_documents(
    documents=chunks,
    embedding=embedder,
    persist_directory=CHROMA_PATH
)
db.persist()
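Once the database is built, it can be loaded again with the same embedding model and queried. A minimal sketch, assuming the chroma directory created above and a made-up query string:

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

CHROMA_PATH = "chroma"

# Use the same model that built the database, otherwise the vectors won't match.
embedder = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# Load the persisted database instead of rebuilding it.
db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedder)

# "Who is Alice?" is just a placeholder query for whatever is in data/books.
results = db.similarity_search("Who is Alice?", k=3)
for doc in results:
    print(doc.metadata)
    print(doc.page_content[:100])

This works because HuggingFaceEmbeddings implements the embed_documents and embed_query methods that Chroma calls internally, which is exactly what the bare embedder.encode function was missing.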