I have developed a small app based on LangChain and Streamlit in which a user can ask questions about PDF files. The code is shown below:
from dotenv import load_dotenv
import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from langchain.callbacks import get_openai_callback
def main():
    """Run the "Ask your PDF" Streamlit app.

    Lets the user upload a PDF, extracts its text with PyPDF2, splits the
    text into overlapping chunks, indexes the chunks in a FAISS vector store
    built from OpenAI embeddings, and answers the user's question with a
    LangChain question-answering chain over the most similar chunks.
    """
    load_dotenv()
    st.set_page_config(page_title="Ask your PDF")
    st.header("Ask your PDF 💬")

    # upload file
    pdf = st.file_uploader("Upload your PDF", type="pdf")
    if pdf is None:
        return

    # extract the text; pages without extractable text contribute nothing
    pdf_reader = PdfReader(pdf)
    text = "".join(page.extract_text() or "" for page in pdf_reader.pages)

    # split into chunks
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=500,
        chunk_overlap=100,
        length_function=len,
    )
    chunks = text_splitter.split_text(text)
    if not chunks:
        # An empty chunk list would make the vector-store build fail, so
        # tell the user instead of crashing (e.g. for scanned PDFs).
        st.warning("No extractable text was found in this PDF.")
        return

    # create embeddings
    embeddings = OpenAIEmbeddings()
    knowledge_base = FAISS.from_texts(chunks, embeddings)

    # show user input
    user_question = st.text_input("Ask a question about your PDF:")
    if not user_question:
        return

    docs = knowledge_base.similarity_search(user_question)
    llm = OpenAI()
    chain = load_qa_chain(llm)
    with get_openai_callback() as cb:
        response = chain.run(input_documents=docs, question=user_question)
        # token usage / cost of the call, printed to the console
        print(cb)

    st.write(response)


if __name__ == '__main__':
    main()
Can someone suggest how I can retrieve or render the page of the PDF from which the answer or information was extracted? I came across this, but I wasn't able to implement it properly.
Here is a simple approach.
# Extract the text and remember which page each page's text came from.
if pdf is not None:
    pdf_reader = PdfReader(pdf)
    text = ""
    # Maps a page's text to its 1-based page number.
    # NOTE: pages with identical text share one key, so the later page wins.
    page_dict = {}
    for page_num, page in enumerate(pdf_reader.pages, start=1):
        page_content = page.extract_text()
        text += page_content + '\n\n'
        page_dict[page_content] = page_num
Once we get the response, we compare it with the content of each page that we saved earlier. The idea is to find which page has the highest similarity to the response. It can be page 1, page 2, etc.
# Score every page against the response.
# The spaCy model is used because it is free; OpenAI similarity can be
# expensive, but may be more accurate.
data = [
    [spacy_sim(response, page_content), page_num]
    for page_content, page_num in page_dict.items()
]
Sort the data and get the page with highest similarity.
# Order the [similarity, page_num] pairs from highest to lowest similarity.
data.sort(key=lambda pair: pair[0], reverse=True)
print(data)

# The first pair now belongs to the best-matching page.
top_page_num = data[0][1]
Now generate an image for every page using the pdf2image library. We are going to show the content of the page as an image. Other methods are possible, since we already have the content of the page, but in this approach I will show the image via the Streamlit image widget.
# Generate one image per page of the PDF.
# The uploader returns an in-memory file and pdf.name is only its file name,
# not a path on disk, so render from the uploaded bytes; this works no
# matter where the file was uploaded from.
from pdf2image import convert_from_bytes

images = convert_from_bytes(pdf.getvalue())
Now that we have a list of images, get the index that corresponds to the page that we want to show.
# Show the image of the page with the highest similarity.
# top_page_num is 1-based, while the image list is 0-based.
page_image = images[top_page_num - 1]
st.image(page_image)
Here is the code to get the similarity score between page content and response.
# Loaded spaCy pipelines, keyed by model name, so that each model is read
# from disk once instead of on every similarity call.
_SPACY_MODELS = {}


def _get_spacy_model(name="en_core_web_md"):
    """Return the spaCy pipeline *name*, loading it only on first use."""
    nlp = _SPACY_MODELS.get(name)
    if nlp is None:
        nlp = _SPACY_MODELS[name] = spacy.load(name)
    return nlp


def spacy_sim(str1, str2):
    """Return the spaCy similarity score between two strings.

    Uses the en_core_web_md model; en_core_web_lg should be better.

    Args:
        str1: First text to compare.
        str2: Second text to compare.

    Returns:
        The similarity reported by spaCy's ``Doc.similarity``.
    """
    nlp = _get_spacy_model()
    doc_1 = nlp(str1)
    doc_2 = nlp(str2)
    return doc_1.similarity(doc_2)
You can download a sample pdf from my google drive.
from dotenv import load_dotenv
import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from langchain.callbacks import get_openai_callback
from pdf2image import convert_from_path
import spacy
# OpenAI API key passed to OpenAIEmbeddings and OpenAI in main().
# NOTE(review): 'abc' looks like a placeholder - supply a real key before
# running, and avoid committing it to source control.
SECRET = 'abc'
# Loaded spaCy pipelines, keyed by model name, so that each model is read
# from disk once instead of once per page being compared.
_SPACY_MODELS = {}


def _get_spacy_model(name="en_core_web_md"):
    """Return the spaCy pipeline *name*, loading it only on first use."""
    nlp = _SPACY_MODELS.get(name)
    if nlp is None:
        nlp = _SPACY_MODELS[name] = spacy.load(name)
    return nlp


def spacy_sim(str1, str2):
    """Return the spaCy similarity score between two strings.

    Args:
        str1: First text to compare.
        str2: Second text to compare.

    Returns:
        The similarity reported by spaCy's ``Doc.similarity``.
    """
    nlp = _get_spacy_model()
    doc_1 = nlp(str1)
    doc_2 = nlp(str2)
    return doc_1.similarity(doc_2)
def main():
    """Run the "Ask your PDF" app and show the page the answer came from.

    After answering the question with a LangChain QA chain, the response is
    compared with the text of every page using spaCy similarity, and the
    best-matching page is rendered as an image with pdf2image.
    """
    # Imported here so this function brings its own dependency into scope;
    # the package is the same pdf2image the script already relies on.
    from pdf2image import convert_from_bytes

    load_dotenv()
    st.set_page_config(page_title="Ask your PDF")
    st.header("Ask your PDF 💬")

    # upload file; it is rendered from memory, so it can come from anywhere
    pdf = st.file_uploader("Upload your PDF", type="pdf")
    if pdf is None:
        return

    # extract the text, keeping (page number, page text) pairs; a list is
    # used so that pages with identical text are not merged into one entry
    pdf_reader = PdfReader(pdf)
    pages = [
        (page_num, page.extract_text() or "")
        for page_num, page in enumerate(pdf_reader.pages, start=1)
    ]
    text = "".join(page_content + '\n\n' for _, page_content in pages)

    # split into chunks
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=500,
        chunk_overlap=100,
        length_function=len,
    )
    chunks = text_splitter.split_text(text)
    if not chunks:
        # An empty chunk list would make the vector-store build fail, so
        # tell the user instead of crashing (e.g. for scanned PDFs).
        st.warning("No extractable text was found in this PDF.")
        return

    # create embeddings
    embeddings = OpenAIEmbeddings(openai_api_key=SECRET)
    knowledge_base = FAISS.from_texts(chunks, embeddings)

    # show user input
    user_question = st.text_input("Ask a question about your PDF:")
    if not user_question:
        return

    docs = knowledge_base.similarity_search(user_question)
    llm = OpenAI(openai_api_key=SECRET)
    chain = load_qa_chain(llm)
    with get_openai_callback() as cb:
        response = chain.run(input_documents=docs,
                             question=user_question)
        print(f'billing details: {cb}')

    # Get the similarity between each page and response.
    # Use spacy model (free). Openai similarity can be expensive
    # but maybe more accurate.
    data = [
        [spacy_sim(response, page_content), page_num]
        for page_num, page_content in pages
    ]

    # Sort the similarity score from high to low.
    data = sorted(data, key=lambda x: x[0], reverse=True)
    print(data)

    # Get the top page number.
    top_page_num = data[0][1]

    st.write(f"Answer: {response}")

    # Render only the best-matching page, straight from the uploaded bytes.
    # pdf.name is just the file name, not a path on disk, so rendering by
    # path only worked when the PDF sat next to the script.
    images = convert_from_bytes(
        pdf.getvalue(),
        first_page=top_page_num,
        last_page=top_page_num,
    )

    # Show the page image with the highest similarity.
    st.image(images[0])


if __name__ == '__main__':
    main()
Alternatively, use similarity from the OpenAI API.
import openai
openai.api_key = SECRET


def openai_sim(str1, str2):
    """Return the cosine similarity of two strings' OpenAI embeddings.

    Both strings are embedded in a single request with the
    text-embedding-ada-002 model.

    Args:
        str1: First text to compare.
        str2: Second text to compare.

    Returns:
        The cosine similarity of the two embeddings as a float.
    """
    # numpy is imported here because nothing else in this snippet brings
    # ``np`` into scope.
    import numpy as np

    # Call the API
    response = openai.Embedding.create(
        input=[str1, str2],
        model="text-embedding-ada-002",
    )

    # Extract the embeddings
    embedding1 = np.asarray(response['data'][0]['embedding'])
    embedding2 = np.asarray(response['data'][1]['embedding'])

    # Calculate cosine similarity
    norm_product = np.linalg.norm(embedding1) * np.linalg.norm(embedding2)
    similarity_score = np.dot(embedding1, embedding2) / norm_product
    return float(similarity_score)
Use sentence-transformers for similarity.
# Loaded SentenceTransformer models, keyed by model name, so that the model
# is built once instead of on every similarity call.
_SENTENCE_MODELS = {}


def _get_sentence_model(name='all-MiniLM-L6-v2'):
    """Return the SentenceTransformer *name*, loading it only on first use."""
    model = _SENTENCE_MODELS.get(name)
    if model is None:
        # Imported here because this snippet has no top-level import for it.
        from sentence_transformers import SentenceTransformer

        model = _SENTENCE_MODELS[name] = SentenceTransformer(name)
    return model


def transformer_sim(str1, str2):
    """Return the cosine similarity of two strings' sentence embeddings.

    install pytorch:
    https://pytorch.org/get-started/locally/

    install sentence-transformers:
    pip install -U sentence-transformers

    Args:
        str1: First text to compare.
        str2: Second text to compare.

    Returns:
        The cosine similarity of the two embeddings as a float.
    """
    # Imported here because this snippet has no top-level import for it.
    from sentence_transformers import util

    model = _get_sentence_model()
    embeddings1 = model.encode(str1, convert_to_tensor=True)
    embeddings2 = model.encode(str2, convert_to_tensor=True)
    cosine_score = util.cos_sim(embeddings1, embeddings2)
    # cos_sim returns a matrix; take its single entry.
    simscore = float(cosine_score[0][0])
    return simscore
This version uses PyMuPDF to extract the text and to render an image of each page. The uploaded file can come from anywhere, not necessarily from the location of the Streamlit script, because while saving the text of each PDF page we also save the page images as bytes.
It also uses sentence-transformers to measure the similarity of two text strings, which is useful for comparing the page content with the response.
"""Using sentence-transformers for similarity score."""
from dotenv import load_dotenv
import streamlit as st
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from langchain.callbacks import get_openai_callback
from sentence_transformers import SentenceTransformer, util
import fitz # pymupdf
# OpenAI API key passed to OpenAIEmbeddings and OpenAI in main().
# NOTE(review): 'abc' looks like a placeholder - supply a real key before
# running, and avoid committing it to source control.
SECRET = 'abc'
# Loaded SentenceTransformer models, keyed by model name, so that the model
# is built once instead of once per page being compared.
_SENTENCE_MODELS = {}


def _get_sentence_model(name='all-MiniLM-L6-v2'):
    """Return the SentenceTransformer *name*, loading it only on first use."""
    model = _SENTENCE_MODELS.get(name)
    if model is None:
        model = _SENTENCE_MODELS[name] = SentenceTransformer(name)
    return model


def transformer_sim(str1, str2):
    """Return the cosine similarity of two strings' sentence embeddings.

    install pytorch:
    https://pytorch.org/get-started/locally/

    install sentence-transformers:
    pip install -U sentence-transformers

    from sentence_transformers import SentenceTransformer, util

    Args:
        str1: First text to compare.
        str2: Second text to compare.

    Returns:
        The cosine similarity of the two embeddings as a float.
    """
    model = _get_sentence_model()
    embeddings1 = model.encode(str1, convert_to_tensor=True)
    embeddings2 = model.encode(str2, convert_to_tensor=True)
    cosine_score = util.cos_sim(embeddings1, embeddings2)
    # cos_sim returns a matrix; take its single entry.
    simscore = float(cosine_score[0][0])
    return simscore
def main():
    """Run the "Ask your PDF" app and show the page the answer came from.

    The PDF is opened with PyMuPDF from the uploaded bytes; each page's text
    and a PNG rendering of the page are kept. After the LangChain QA chain
    answers the question, the response is compared with every page's text
    via transformer_sim and the best-matching page image is displayed.
    """
    load_dotenv()
    st.set_page_config(page_title="Ask your PDF")
    st.header("Ask your PDF 💬")

    # upload file; it is read from memory, so it can come from any location
    pdf = st.file_uploader("Upload your PDF", type="pdf")
    if pdf is None:
        return

    # extract the text and render an image for every page
    pages = []   # (1-based page number, page text); a list keeps pages
                 # with identical text as separate entries
    images = []  # PNG bytes, one entry per page, in page order
    # getvalue() returns the whole upload regardless of the stream position.
    with fitz.open(stream=pdf.getvalue(), filetype="pdf") as pdf_pages:
        for page_num, page in enumerate(pdf_pages, start=1):
            pages.append((page_num, page.get_text()))

            # images
            pix = page.get_pixmap()
            images.append(pix.tobytes("PNG"))
    text = "".join(page_content + '\n\n' for _, page_content in pages)

    # split into chunks
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=500,
        chunk_overlap=100,
        length_function=len,
    )
    chunks = text_splitter.split_text(text)
    if not chunks:
        # An empty chunk list would make the vector-store build fail, so
        # tell the user instead of crashing (e.g. for scanned PDFs).
        st.warning("No extractable text was found in this PDF.")
        return

    # create embeddings
    embeddings = OpenAIEmbeddings(openai_api_key=SECRET)
    knowledge_base = FAISS.from_texts(chunks, embeddings)

    # show user input
    user_question = st.text_input("Ask a question about your PDF:")
    if not user_question:
        return

    docs = knowledge_base.similarity_search(user_question)
    llm = OpenAI(openai_api_key=SECRET)
    chain = load_qa_chain(llm)
    with get_openai_callback() as cb:
        response = chain.run(input_documents=docs,
                             question=user_question)
        print(f'billing details: {cb}')

    # Get the similarity between each page and response.
    scored = [
        (transformer_sim(response, page_content), page_num)
        for page_num, page_content in pages
    ]

    # Only the best match is needed, so take the maximum instead of sorting;
    # on ties the earliest page wins, as with a stable descending sort.
    top_sim_score, top_page_num = max(scored, key=lambda item: item[0])

    st.write(f"Answer: {response}")
    st.markdown(f'**There is a top similarity score of {top_sim_score} that the response is from page {top_page_num}**')

    # Show the page image with the highest similarity.
    st.image(images[top_page_num-1])


if __name__ == '__main__':
    main()