Search code examples
streamlit · openai-api · langchain · large-language-model · nlp-question-answering

Retrieve page from the PDF in PDF-chatbot using Langchain


I have developed a small app based on LangChain and Streamlit, where users can ask questions about PDF files. The code is shown below:

from dotenv import load_dotenv
import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from langchain.callbacks import get_openai_callback


def main():
    """Run the "Ask your PDF" Streamlit app.

    Lets the user upload a PDF, extracts its text, splits the text into
    overlapping chunks, indexes the chunks in a FAISS vector store built
    from OpenAI embeddings, and answers a typed question with a LangChain
    question-answering chain over the most similar chunks.
    """
    load_dotenv()
    st.set_page_config(page_title="Ask your PDF")
    st.header("Ask your PDF 💬")

    # upload file
    pdf = st.file_uploader("Upload your PDF", type="pdf")
    if pdf is None:
        return

    # extract the text
    pdf_reader = PdfReader(pdf)
    text = "".join(page.extract_text() for page in pdf_reader.pages)

    # split into chunks
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=500,
        chunk_overlap=100,
        length_function=len,
    )
    chunks = text_splitter.split_text(text)
    if not chunks:
        # E.g. a scanned, image-only PDF: there is nothing to embed, and
        # FAISS.from_texts cannot build an index from zero texts.
        st.warning("No extractable text was found in this PDF.")
        return

    # create embeddings
    embeddings = OpenAIEmbeddings()
    knowledge_base = FAISS.from_texts(chunks, embeddings)

    # show user input
    user_question = st.text_input("Ask a question about your PDF:")
    if not user_question:
        return

    docs = knowledge_base.similarity_search(user_question)

    llm = OpenAI()
    chain = load_qa_chain(llm)
    with get_openai_callback() as cb:
        response = chain.run(input_documents=docs, question=user_question)
        # The callback object summarises the OpenAI usage of this call.
        print(cb)

    st.write(response)
    

# Start the app only when this file is run as the main script.
if __name__ == '__main__':
    main()

Can someone suggest how I can retrieve or render the page of the PDF from which the answer or information was extracted? I have come across this but wasn't able to implement it properly.


Solution

  • Here is a simple approach.

    • While reading the pdf, also save the content per page and the page number.
        # extract the text
        if pdf is not None:
            pdf_reader = PdfReader(pdf)
            text = ""
    
            # Maps each page's extracted text to its 1-based page number.
            # NOTE: pages with identical text share one key, so only the
            # last such page number is kept.
            page_dict = {}
            for i, page in enumerate(pdf_reader.pages):
                page_content = page.extract_text()
                # Accumulate the whole document, pages separated by a blank line.
                text += page_content + '\n\n'
                page_dict[page_content] = i+1
    

    Once we get the response, we compare it with the content of each page that we saved earlier. The idea is to find which page has the highest similarity to the response. It can be page 1, page 2, etc.

                # Score every saved page against the response, producing
                # [similarity, page number] pairs. The spaCy model is free;
                # OpenAI similarity can be expensive but may be more accurate.
                data = [
                    [spacy_sim(response, content), number]
                    for content, number in page_dict.items()
                ]
    

    Sort the data and get the page with highest similarity.

                # Order the pairs from highest to lowest similarity.
                data.sort(key=lambda pair: pair[0], reverse=True)
                print(data)

                # The best-matching page is now first; keep its page number.
                _, top_page_num = data[0]
    

    Now generate an image for every page using the pdf2image library. We are going to show the content of the page as an image. You could use other methods, since we already have the content of the page, but in this approach I will show the image via the Streamlit image widget.

                # Generate images per page in the pdf. Render from the
                # uploaded bytes: pdf.name is only the upload's file name, so
                # convert_from_path(pdf.name) works only when a copy of the
                # file sits in the script's working directory.
                from pdf2image import convert_from_bytes
                images = convert_from_bytes(pdf.getvalue())
    

    Now that we have a list of images, get the index that corresponds to the page that we want to show.

                # Page numbers are 1-based while the image list is 0-based.
                best_page_image = images[top_page_num - 1]
                st.image(best_page_image)
    

    Here is the code to get the similarity score between page content and response.

    def spacy_sim(str1, str2):
        """Return the spaCy similarity score between two strings.

        Uses the "en_core_web_md" pipeline (en_core_web_lg should be better).
        The pipeline is loaded on the first call only and then reused,
        because this function is called once per PDF page and reloading the
        model on every call is slow.

        Args:
            str1: First text.
            str2: Second text.

        Returns:
            The result of ``Doc.similarity`` for the two parsed texts.
        """
        # Cache the loaded pipeline on the function object itself.
        nlp = getattr(spacy_sim, "_nlp", None)
        if nlp is None:
            nlp = spacy_sim._nlp = spacy.load("en_core_web_md")
        return nlp(str1).similarity(nlp(str2))
    

    Sample output

    enter image description here

    You can download a sample pdf from my google drive.

    Full code

    from dotenv import load_dotenv
    import streamlit as st
    from PyPDF2 import PdfReader
    from langchain.text_splitter import CharacterTextSplitter
    from langchain.embeddings.openai import OpenAIEmbeddings
    from langchain.vectorstores import FAISS
    from langchain.chains.question_answering import load_qa_chain
    from langchain.llms import OpenAI
    from langchain.callbacks import get_openai_callback
    from pdf2image import convert_from_path
    import spacy
    
    
    # OpenAI API key passed explicitly to OpenAIEmbeddings and OpenAI below.
    # 'abc' is presumably a placeholder: replace it with a real key and keep
    # the real value out of source control.
    SECRET = 'abc'
    
    
    def spacy_sim(str1, str2):
        """Return the spaCy similarity score between two strings.

        Uses the "en_core_web_md" pipeline. The pipeline is loaded on the
        first call only and then reused, because main() calls this function
        once per PDF page and reloading the model on every call is slow.

        Args:
            str1: First text.
            str2: Second text.

        Returns:
            The result of ``Doc.similarity`` for the two parsed texts.
        """
        # Cache the loaded pipeline on the function object itself.
        nlp = getattr(spacy_sim, "_nlp", None)
        if nlp is None:
            nlp = spacy_sim._nlp = spacy.load("en_core_web_md")
        return nlp(str1).similarity(nlp(str2))
    
    
    def main():
        """Run the "Ask your PDF" app and show the page the answer came from.

        Extracts the text of every page of the uploaded PDF, answers the
        user's question with a LangChain QA chain over a FAISS index of the
        text chunks, then scores each page against the answer with
        spacy_sim and displays an image of the best-matching page.
        """
        # Imported locally; pdf2image is already a dependency of this script.
        from pdf2image import convert_from_bytes

        load_dotenv()
        st.set_page_config(page_title="Ask your PDF")
        st.header("Ask your PDF 💬")

        # upload file (it may live anywhere; it is rendered from its bytes)
        pdf = st.file_uploader("Upload your PDF", type="pdf")
        if pdf is None:
            return

        # extract the text, keeping (page number, content) per page.
        # A list is used instead of a dict keyed by content so that pages
        # with identical text (e.g. blank pages) are not merged.
        pdf_reader = PdfReader(pdf)
        pages = [
            (page_num, page.extract_text())
            for page_num, page in enumerate(pdf_reader.pages, start=1)
        ]
        text = "".join(content + '\n\n' for _, content in pages)

        # split into chunks
        text_splitter = CharacterTextSplitter(
            separator="\n",
            chunk_size=500,
            chunk_overlap=100,
            length_function=len,
        )
        chunks = text_splitter.split_text(text)
        if not chunks:
            # E.g. a scanned, image-only PDF: there is nothing to embed, and
            # FAISS.from_texts cannot build an index from zero texts.
            st.warning("No extractable text was found in this PDF.")
            return

        # create embeddings
        embeddings = OpenAIEmbeddings(openai_api_key=SECRET)
        knowledge_base = FAISS.from_texts(chunks, embeddings)

        # show user input
        user_question = st.text_input("Ask a question about your PDF:")
        if not user_question:
            return

        docs = knowledge_base.similarity_search(user_question)

        llm = OpenAI(openai_api_key=SECRET)
        chain = load_qa_chain(llm)
        with get_openai_callback() as cb:
            response = chain.run(input_documents=docs,
                                 question=user_question)
            print(f'billing details: {cb}')

        # Get the similarity between each page and response.
        # Use spacy model (free). Openai similarity can be expensive
        # but maybe more accurate.
        data = [
            [spacy_sim(response, content), page_num]
            for page_num, content in pages
        ]

        # Sort the similarity score from high to low.
        data.sort(key=lambda x: x[0], reverse=True)
        print(data)

        # Get the top page number.
        top_page_num = data[0][1]

        st.write(f"Answer: {response}")

        # Render only the best-matching page, straight from the uploaded
        # bytes. pdf.name is just the upload's file name, so a path-based
        # conversion would require a copy of the file next to the script.
        images = convert_from_bytes(
            pdf.getvalue(),
            first_page=top_page_num,
            last_page=top_page_num,
        )

        # Show the page image with the highest similarity.
        st.image(images[0])
    
    
    # Start the app only when this file is run as the main script.
    if __name__ == '__main__':
        main()
    

    Use similarity from openai api.

    import openai
    
    openai.api_key  = SECRET
    
    def openai_sim(str1, str2):
        """Return the cosine similarity of the OpenAI embeddings of two strings.

        Both strings are embedded in a single request with the
        "text-embedding-ada-002" model.

        Args:
            str1: First text.
            str2: Second text.

        Returns:
            The cosine similarity of the two embedding vectors.
        """
        # Imported here because the snippet's imports do not include numpy,
        # which the cosine computation below needs.
        import numpy as np

        # Call the API
        response = openai.Embedding.create(
            input=[str1, str2],
            model="text-embedding-ada-002"
        )

        # Extract the embeddings
        embedding1 = response['data'][0]['embedding']
        embedding2 = response['data'][1]['embedding']

        # Calculate cosine similarity: dot product over the product of norms.
        norm_product = np.linalg.norm(embedding1) * np.linalg.norm(embedding2)
        similarity_score = np.dot(embedding1, embedding2) / norm_product

        return similarity_score
    

    Use sentence-transformers for similarity.

    def transformer_sim(str1, str2):
        """Return the cosine similarity of two strings' sentence embeddings.

        install pytorch:
            https://pytorch.org/get-started/locally/

        install sentence-transformers:
            pip install -U sentence-transformers

        Requires:
            from sentence_transformers import SentenceTransformer, util

        The 'all-MiniLM-L6-v2' model is created on the first call only and
        then reused, because constructing it on every call is slow.

        Args:
            str1: First text.
            str2: Second text.

        Returns:
            The cosine similarity as a Python float.
        """
        # Cache the model on the function object itself.
        model = getattr(transformer_sim, "_model", None)
        if model is None:
            model = SentenceTransformer('all-MiniLM-L6-v2')
            transformer_sim._model = model

        embeddings1 = model.encode(str1, convert_to_tensor=True)
        embeddings2 = model.encode(str2, convert_to_tensor=True)
        cosine_score = util.cos_sim(embeddings1, embeddings2)

        # cos_sim returns a matrix; the single pairwise score is at [0][0].
        return float(cosine_score[0][0])
    

    Solution 2

    This uses PyMuPDF to save both the text and an image of each page. The uploaded file can come from anywhere, not necessarily from the location of the Streamlit script, because while saving the text of each PDF page we also save the page image as bytes.

    This also uses sentence-transformers to measure the similarity of two text strings, which is useful for comparing page content with the response.

    Full code

    """Using sentence-transformers for similarity score."""
    
    
    from dotenv import load_dotenv
    import streamlit as st
    from langchain.text_splitter import CharacterTextSplitter
    from langchain.embeddings.openai import OpenAIEmbeddings
    from langchain.vectorstores import FAISS
    from langchain.chains.question_answering import load_qa_chain
    from langchain.llms import OpenAI
    from langchain.callbacks import get_openai_callback
    from sentence_transformers import SentenceTransformer, util
    import fitz  # pymupdf
    
    
    # OpenAI API key passed explicitly to OpenAIEmbeddings and OpenAI below.
    # 'abc' is presumably a placeholder: replace it with a real key and keep
    # the real value out of source control.
    SECRET = 'abc'
    
    
    def transformer_sim(str1, str2):
        """Return the cosine similarity of two strings' sentence embeddings.

        install pytorch:
            https://pytorch.org/get-started/locally/

        install sentence-transformers:
            pip install -U sentence-transformers

        The 'all-MiniLM-L6-v2' model is created on the first call only and
        then reused, because main() calls this function once per PDF page
        and constructing the model on every call is slow.

        Args:
            str1: First text.
            str2: Second text.

        Returns:
            The cosine similarity as a Python float.
        """
        # Cache the model on the function object itself.
        model = getattr(transformer_sim, "_model", None)
        if model is None:
            model = SentenceTransformer('all-MiniLM-L6-v2')
            transformer_sim._model = model

        embeddings1 = model.encode(str1, convert_to_tensor=True)
        embeddings2 = model.encode(str2, convert_to_tensor=True)
        cosine_score = util.cos_sim(embeddings1, embeddings2)

        # cos_sim returns a matrix; the single pairwise score is at [0][0].
        return float(cosine_score[0][0])
    
    
    def main():
        """Run the "Ask your PDF" app and show the page the answer came from.

        Reads the uploaded PDF with PyMuPDF, keeping each page's text and a
        PNG rendering of it, answers the user's question with a LangChain QA
        chain over a FAISS index of the text chunks, then scores each page
        against the answer with transformer_sim and displays the image of
        the best-matching page together with its score.
        """
        load_dotenv()
        st.set_page_config(page_title="Ask your PDF")
        st.header("Ask your PDF 💬")

        # upload file (it may live anywhere; it is processed from its bytes)
        pdf = st.file_uploader("Upload your PDF", type="pdf")
        if pdf is None:
            return

        # extract the text and a PNG image of every page.
        # pages holds (page number, content) tuples rather than a dict keyed
        # by content, so pages with identical text are not merged.
        pages = []
        images = []

        # getvalue() returns the whole upload regardless of the stream's
        # current read position, unlike read().
        with fitz.open(stream=pdf.getvalue(), filetype="pdf") as pdf_pages:
            for page_num, page in enumerate(pdf_pages, start=1):
                pages.append((page_num, page.get_text()))

                # images
                pix = page.get_pixmap()
                images.append(pix.tobytes("PNG"))

        text = "".join(content + '\n\n' for _, content in pages)

        # split into chunks
        text_splitter = CharacterTextSplitter(
            separator="\n",
            chunk_size=500,
            chunk_overlap=100,
            length_function=len,
        )
        chunks = text_splitter.split_text(text)
        if not chunks:
            # E.g. a scanned, image-only PDF: there is nothing to embed, and
            # FAISS.from_texts cannot build an index from zero texts.
            st.warning("No extractable text was found in this PDF.")
            return

        # create embeddings
        embeddings = OpenAIEmbeddings(openai_api_key=SECRET)
        knowledge_base = FAISS.from_texts(chunks, embeddings)

        # show user input
        user_question = st.text_input("Ask a question about your PDF:")
        if not user_question:
            return

        docs = knowledge_base.similarity_search(user_question)

        llm = OpenAI(openai_api_key=SECRET)
        chain = load_qa_chain(llm)
        with get_openai_callback() as cb:
            response = chain.run(input_documents=docs,
                                 question=user_question)
            print(f'billing details: {cb}')

        # Get the similarity between each page and response.
        scored_pages = [
            (transformer_sim(response, content), page_num)
            for page_num, content in pages
        ]

        # Only the best match is needed, so take the maximum instead of
        # sorting; on ties the earliest page wins.
        top_sim_score, top_page_num = max(scored_pages, key=lambda s: s[0])

        st.write(f"Answer: {response}")
        st.markdown(f'**There is a top similarity score of {top_sim_score} that the response is from page {top_page_num}**')

        # Show the page image with the highest similarity
        # (page numbers are 1-based, the image list is 0-based).
        st.image(images[top_page_num-1])
    
    
    # Start the app only when this file is run as the main script.
    if __name__ == '__main__':
        main()