Search code examples
Tags: python, openai-api, embedding, faiss

Multiple file loading and embeddings with OpenAI


I am trying to load a bunch of pdf files and query them using OpenAI APIs.

from langchain.text_splitter import CharacterTextSplitter
#from langchain.document_loaders import UnstructuredFileLoader
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.vectorstores.faiss import FAISS
from langchain.embeddings import OpenAIEmbeddings
import pickle
import os


# Script: load every file in a folder with UnstructuredPDFLoader, split the
# text into chunks, embed the chunks with OpenAI and pickle a FAISS index.
# NOTE(review): as written, one separate index is built and pickled per file
# (see the comments inside the loop); no combined index is ever created.
print("Loading data...")
pdf_folder_path = "content/"
print(os.listdir(pdf_folder_path))

# Load multiple files
# One loader per directory entry. The listing is not filtered, so every entry
# in the folder is passed to the PDF loader, whatever its extension.
loaders = [UnstructuredPDFLoader(os.path.join(pdf_folder_path, fn)) for fn in os.listdir(pdf_folder_path)]


print(loaders)

# alldocument is never filled: the only statement that would extend it is
# commented out inside the loop below.
alldocument = []
vectorstore = None
for loader in loaders:

    print("Loading raw document..." + loader.file_path)
    raw_documents = loader.load()

    print("Splitting text...")
    # A new splitter is constructed on every iteration, always with the same
    # arguments.
    text_splitter = CharacterTextSplitter(
        separator="\n\n",
        chunk_size=800,
        chunk_overlap=100,
        length_function=len,
    )
    documents = text_splitter.split_documents(raw_documents)
    #alldocument = alldocument + documents

    print("Creating vectorstore...")
    embeddings = OpenAIEmbeddings()
    
    # Rebinds `vectorstore` to an index built from the current file's chunks
    # only; the index from the previous iteration is no longer referenced.
    vectorstore = FAISS.from_documents(documents, embeddings)

    #with open("vectorstore.pkl", "wb") as f:
    # Append mode: each iteration adds one more pickled object to the end of
    # the file, and objects written by earlier runs are kept as well. A single
    # pickle.load() on this file reads just one of those objects.
    with open("vectorstore.pkl", "ab") as f:
        pickle.dump(vectorstore, f)
        # Redundant: the `with` block already closes the file on exit.
        f.close()

I am trying to load multiple files for QnA but the index only remembers the last file uploaded from a folder.

Do I need to change the structure of the for loop, or pass a different mode to the open() call?


Solution

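  • The problem is that each iteration of the loop builds a new vectorstore from only the current file's documents, replacing the previous one in `vectorstore`. Each of those per-file stores is then pickled separately into "vectorstore.pkl", so loading the file gives you an index for a single PDF rather than for all of them. Instead, collect the chunks from every file in one list, then build the vectorstore once after the loop and save it once: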

    # Script: build ONE FAISS index covering every PDF in a folder.
    # The chunks of all files are collected first; the vectorstore is created
    # and pickled a single time after the loop.
    print("Loading data...")
    pdf_folder_path = "content/"
    # List the folder once and reuse the result for both the log line and the
    # loaders, so the two can never disagree.
    folder_entries = os.listdir(pdf_folder_path)
    print(folder_entries)
    
    # Load multiple files
    # Only *.pdf entries are handed to the PDF loader (the folder may contain
    # other files or sub-directories). os.listdir() returns entries in
    # arbitrary order, so sort them to make the index order reproducible.
    pdf_file_names = sorted(fn for fn in folder_entries if fn.lower().endswith(".pdf"))
    loaders = [UnstructuredPDFLoader(os.path.join(pdf_folder_path, fn)) for fn in pdf_file_names]
    
    print(loaders)
    
    # Constructed once: none of its arguments depend on the file being processed.
    text_splitter = CharacterTextSplitter(
        separator="\n\n",
        chunk_size=800,
        chunk_overlap=100,
        length_function=len,
    )
    
    # Chunks from every file end up in this single list.
    all_documents = []
    
    for loader in loaders:
        print("Loading raw document..." + loader.file_path)
        raw_documents = loader.load()
    
        print("Splitting text...")
        documents = text_splitter.split_documents(raw_documents)
        all_documents.extend(documents)
    
    print("Creating vectorstore...")
    embeddings = OpenAIEmbeddings()
    # One index over the chunks of all files, built after the loop has finished.
    vectorstore = FAISS.from_documents(all_documents, embeddings)
    
    # "wb" replaces any previous file, so it always holds exactly one pickled
    # index. NOTE: only unpickle this file if it comes from a trusted source.
    with open("vectorstore.pkl", "wb") as f:
        pickle.dump(vectorstore, f)