Tags: python, large-language-model, python-embedding, ollama, rag

Ollama with RAG to chat with a PDF locally


I am trying to use Ollama with RAG to chat with a PDF on my local machine, following this GitHub repo: https://github.com/tonykipkemboi/ollama_pdf_rag/tree/main. The issue is that when I run the code there is no error, but execution stops at the embedding step and nothing happens after that. I have attached all relevant logs along with my ollama list output.

import logging
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

local_path = "D:/KnowledgeSplice/ollama_pdf_rag-main/WEF_The_Global_Cooperation_Barometer_2024.pdf"

try:
  # Local PDF file uploads
  if local_path:
    loader = UnstructuredPDFLoader(file_path=local_path)
    data = loader.load()
    logging.info("Loading of PDF is done")
  else:
    logging.error("Upload a PDF file")
    raise ValueError("No PDF file uploaded")

  # Preview first page
  logging.info(f"First page content preview: {data[0].page_content[:500]}...")

  # Split and chunk 
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=7500, chunk_overlap=100)
  logging.info("Text splitter created")
  chunks = text_splitter.split_documents(data)
  logging.info(f"Created {len(chunks)} chunks")

  # Add to vector database
  logging.info("Creating Vector db")
  try:
    embedding_model = OllamaEmbeddings(model="nomic-embed-text", show_progress=True)
    print("Embedding", embedding_model)
    vector_db = Chroma.from_documents(
        documents=chunks,
        embedding=embedding_model,
        collection_name="local-rag"
    )
    logging.info("Local db created successfully")
  except Exception as e:
    logging.error(f"Error creating vector db: {e}")
    raise  # Re-raise the exception to stop further execution

  # Verify vector database creation
  if vector_db:
    logging.info("Vector db verification successful")
  else:
    logging.error("Vector db creation failed")
    raise ValueError("Vector db creation failed")

    # LLM from Ollama
    local_model = "llama3"
    llm = ChatOllama(model=local_model)
    logging.info("LLM model loaded")

    QUERY_PROMPT = PromptTemplate(
        input_variables=["question"],
        template="""You are an AI language model assistant. Your task is to generate five
        different versions of the given user question to retrieve relevant documents from
        a vector database. By generating multiple perspectives on the user question, your
        goal is to help the user overcome some of the limitations of the distance-based
        similarity search. Provide these alternative questions separated by newlines.
        Original question: {question}""",
    )
    logging.info("Query prompt created")

    retriever = MultiQueryRetriever.from_llm(
        vector_db.as_retriever(), 
        llm,
        prompt=QUERY_PROMPT
    )
    logging.info("Retriever created")

    # RAG prompt
    template = """Answer the question based ONLY on the following context:
    {context}
    Question: {question}
    """
    prompt = ChatPromptTemplate.from_template(template)
    logging.info("RAG prompt created")

    chain = (
        {"context": retriever, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )
    logging.info("Chain created")

    response = chain.invoke("What are the 5 pillars of global cooperation?")
    logging.info("Chain invoked")
    logging.info(f"Response: {response}")

except Exception as e:
    logging.error(f"An error occurred: {e}")

The code shows no error, but it does nothing after the embedding step.

Output:

2024-08-06 14:59:59,858 - INFO - Text splitter created
2024-08-06 14:59:59,861 - INFO - Created 11 chunks
2024-08-06 14:59:59,861 - INFO - Creating Vector db
Embedding base_url='http://localhost:11434' model='nomic-embed-text' embed_instruction='passage: ' query_instruction='query: ' mirostat=None mirostat_eta=None mirostat_tau=None num_ctx=None num_gpu=None num_thread=None repeat_last_n=None repeat_penalty=None temperature=None stop=None tfs_z=None top_k=None top_p=None show_progress=True headers=None model_kwargs=None
2024-08-06 15:00:00,662 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
OllamaEmbeddings: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:27<00:00,  2.46s/it]

Below is my ollama list:

NAME                    ID              SIZE    MODIFIED
nomic-embed-text:latest 0a109f422b47    274 MB  3 hours ago
mistral:latest          f974a74358d6    4.1 GB  17 hours ago
phi3:latest             d184c916657e    2.2 GB  2 weeks ago
llama3:latest           365c0bd3c000    4.7 GB  2 weeks ago

How can I resolve this issue?


Solution

  • The Chroma collection does not accept embeddings with more than 768 dimensions

    I suggest we change the vector store to FAISS, because Chroma has a dimensionality issue that is not compatible with the embedding model: to be precise, the Chroma collection allows 768 dimensions while the embedding model produces 1024. A quick way to check what dimensionality each embedding model actually returns is shown below, followed by the reviewed code.
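
    As a quick sanity check (a minimal sketch, not from the original post; it assumes both models have already been pulled with ollama pull), you can print the vector size that each Ollama embedding model returns:

    from langchain_community.embeddings import OllamaEmbeddings

    # Print the dimensionality of the vectors each embedding model produces
    for model_name in ("nomic-embed-text", "mxbai-embed-large"):
        emb = OllamaEmbeddings(model=model_name)
        vector = emb.embed_query("dimensionality check")
        print(f"{model_name}: {len(vector)} dimensions")

    Here is the reviewed code: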

    import logging
    
    import ollama
    from langchain.prompts import ChatPromptTemplate, PromptTemplate
    from langchain.retrievers.multi_query import MultiQueryRetriever
    from langchain_community.chat_models import ChatOllama
    from langchain_community.document_loaders import UnstructuredPDFLoader
    from langchain_community.embeddings import OllamaEmbeddings
    from langchain_community.vectorstores import FAISS
    from langchain_core.output_parsers import StrOutputParser
    from langchain_core.runnables import RunnablePassthrough
    from langchain_text_splitters import RecursiveCharacterTextSplitter
    
    
    # Configure logging
    logging.basicConfig(
        level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
    )
    
    local_path = "WEF_The_Global_Cooperation_Barometer_2024.pdf"
    
    try:
        # Local PDF file uploads
        if local_path:
            loader = UnstructuredPDFLoader(file_path=local_path)
            data = loader.load()
            logging.info("Loading of PDF is done")
        else:
            logging.error("Upload a PDF file")
            raise ValueError("No PDF file uploaded")
    
        # Preview first page
        # logging.info(f"First page content preview: {data[0].page_content[:500]}...")
    
        # Split and chunk
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=7500, chunk_overlap=100)
        logging.info("Text splitter created")
        chunks = text_splitter.split_documents(data)
        logging.info(f"Created {len(chunks)} chunks")
    
        # Add to vector database
        logging.info("Creating Vector db")
        try:
            # Warm-up call so that Ollama loads the embedding model before indexing
            ollama.embeddings(
                model="mxbai-embed-large",
                # prompt='Llamas are members of the camelid family',
            )
            embedding_model = OllamaEmbeddings(model="mxbai-embed-large")
            vectorstore_db = FAISS.from_documents(
                documents=chunks, embedding=embedding_model
            )
            vectorstore_db.save_local("faiss_index")
            vector_retriever = vectorstore_db.as_retriever()
            logging.info("FAISS index created and saved locally")
    
        except Exception as e:
            logging.error(f"Error creating vector db: {e}")
            raise  # Re-raise the exception to stop further execution
    
        # LLM from Ollama
        local_model = "mistral"
        llm = ChatOllama(model=local_model)
        print("local llm modal", local_model)
        logging.info("LLM model loaded")
    
        QUERY_PROMPT = PromptTemplate(
            input_variables=["question"],
            template="""You are an AI language model assistant. Your task is to generate five
            different versions of the given user question to retrieve relevant documents from
            a vector database. By generating multiple perspectives on the user question, your
            goal is to help the user overcome some of the limitations of the distance-based
            similarity search. Provide these alternative questions separated by newlines.
            Original question: {question}""",
        )
        logging.info("Query prompt created")
    
        retriever = MultiQueryRetriever.from_llm(
            vector_retriever, llm, prompt=QUERY_PROMPT  # Use the correct retriever
        )
        logging.info("Retriever created")
    
        # RAG prompt
        template = """Answer the question based ONLY on the following context:
        {context}
        Question: {question}
        """
        prompt = ChatPromptTemplate.from_template(template)
        logging.info("RAG prompt created")
    
        chain = (
            {"context": retriever, "question": RunnablePassthrough()}
            | prompt
            | llm
            | StrOutputParser()
        )
        logging.info("Chain created")
    
        response = chain.invoke("What are the 5 pillars of global cooperation?")
        logging.info("Chain invoked")
        logging.info(f"Response: {response}")
    
    except Exception as e:
        logging.error(f"An error occurred: {e}")
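
    Since the reviewed code persists the index with save_local("faiss_index"), you can reload it in a later run instead of re-embedding the PDF. This is a minimal sketch (not from the original answer), assuming the same embedding model; newer langchain_community releases also require explicitly opting into pickle deserialization:

    from langchain_community.embeddings import OllamaEmbeddings
    from langchain_community.vectorstores import FAISS

    # Reload the FAISS index that was saved by the script above
    embedding_model = OllamaEmbeddings(model="mxbai-embed-large")
    vectorstore_db = FAISS.load_local(
        "faiss_index",
        embedding_model,
        allow_dangerous_deserialization=True,  # required by newer langchain versions
    )
    vector_retriever = vectorstore_db.as_retriever()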