I am extracting text from PDF documents and loading it into Azure Cognitive Search for a RAG approach. Unfortunately, this does not work; I am receiving the following error message:
HttpResponseError: () The request is invalid. Details: The property 'content' does not exist on type 'search.documentFields'. Make sure to only use property names that are defined by the type.
Code:
Message: The request is invalid. Details: The property 'content' does not exist on type 'search.documentFields'. Make sure to only use property names that are defined by the type.
What I want to do is:
Besides fixing the error, I want to add the filename as metadata information to this Document object, but I don't know how to extend it ...
My code:
!pip install cohere tiktoken
!pip install openai==0.28.1
!pip install pymupdf
!pip install azure-storage-blob azure-identity
!pip install azure-search-documents --pre --upgrade
!pip install langchain
import fitz
import time
import uuid
import os
import openai
from PIL import Image
from io import BytesIO
from IPython.display import display
from azure.identity import DefaultAzureCredential
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import AzureChatOpenAI
from langchain.vectorstores import AzureSearch
from langchain.docstore.document import Document
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader
from langchain.text_splitter import TokenTextSplitter
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate
from google.colab import drive
# --- Azure OpenAI settings (placeholder values) ---
OPENAI_API_BASE = "https://xxx.openai.azure.com"
OPENAI_API_KEY = "xxx"
OPENAI_API_VERSION = "2023-05-15"

# Point the module-level openai client (openai==0.28.1 is installed above)
# at the Azure endpoint.
openai.api_type = "azure"
openai.api_base = OPENAI_API_BASE
openai.api_key = OPENAI_API_KEY
openai.api_version = OPENAI_API_VERSION

# --- Azure Cognitive Search settings (placeholder values) ---
# NOTE: despite its name, this constant holds the full endpoint URL; it is
# passed as `azure_search_endpoint` below.
AZURE_COGNITIVE_SEARCH_SERVICE_NAME = "https://xxx.search.windows.net"
AZURE_COGNITIVE_SEARCH_API_KEY = "xxx"
AZURE_COGNITIVE_SEARCH_INDEX_NAME = "test"

# Connection arguments shared by the chat model and the embedding model.
_azure_openai_kwargs = {
    "openai_api_key": OPENAI_API_KEY,
    "openai_api_base": OPENAI_API_BASE,
    "openai_api_version": OPENAI_API_VERSION,
}

# Chat model served by the "gpt35" deployment.
llm = AzureChatOpenAI(deployment_name="gpt35", **_azure_openai_kwargs)

# Embedding model served by the "ada002" deployment.
embeddings = OpenAIEmbeddings(
    deployment_id="ada002",
    chunk_size=1,
    **_azure_openai_kwargs,
)

# Vector store backed by the Azure Cognitive Search index; queries and
# documents are embedded with `embeddings.embed_query`.
acs = AzureSearch(
    azure_search_endpoint=AZURE_COGNITIVE_SEARCH_SERVICE_NAME,
    azure_search_key=AZURE_COGNITIVE_SEARCH_API_KEY,
    index_name=AZURE_COGNITIVE_SEARCH_INDEX_NAME,
    embedding_function=embeddings.embed_query,
)
def generate_tokens(s, f):
    """Split text into chunks and wrap each chunk in a LangChain ``Document``.

    Args:
        s: The full text to split.
        f: Identifier of the file the text came from; stored on every chunk
            under the ``"file_source"`` metadata key.

    Returns:
        A list of ``Document`` objects, one per chunk, in split order. Each
        carries ``{"index": <position in the split>, "file_source": f}`` as
        metadata.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    documents = [
        Document(
            page_content=chunk,
            metadata={"index": position, "file_source": f},
        )
        for position, chunk in enumerate(splitter.split_text(s))
    ]
    # Echo the generated documents for inspection in the notebook output.
    print(documents)
    return documents
# Mount Google Drive so the PDF folder is reachable from the Colab runtime.
drive.mount('/content/drive')
folder = "/content/drive/.../pdf/"

# Defined up front so the name exists even when the folder has no files.
doc_content = ''

for filename in os.listdir(folder):
    file_path = os.path.join(folder, filename)
    if not os.path.isfile(file_path):
        continue  # skip sub-directories

    print(f"Processing file: {file_path}")

    # Extract the plain text of every page. The text is rebuilt from scratch
    # for each file so that one file's content is never attributed to (or
    # re-uploaded with) another file, and the document handle is closed as
    # soon as the text has been read.
    with fitz.open(file_path) as doc:
        doc_content = "".join(page.get_text() for page in doc)

    # Split into chunks; each chunk carries its index and the source file
    # path as metadata (see generate_tokens).
    d = generate_tokens(doc_content, file_path)

    # Upload the chunks. NOTE: the target index must already define the
    # fields the vector store writes (the service rejects unknown properties
    # such as 'content'; the vectors go into 'content_vector'), otherwise
    # this call fails with
    # "The property 'content' does not exist on type 'search.documentFields'".
    if d:
        acs.add_documents(documents=d)

    # Show the metadata attached to each uploaded chunk.
    print([chunk.metadata for chunk in d])
    print("----------")
    print(doc_content)
    # Whitespace-separated word count, used as a rough size indicator.
    count = len(doc_content.split())
    print("Number of tokens: ", count)
HttpResponseError Traceback (most recent call last)
<ipython-input-11-d9eaff7ee027> in <cell line: 10>()
31 all_texts.extend(d)
32
---> 33 acs.add_documents(documents=d)
34
35 metadatas = [{"Source": f"{i}-pl"} for i in range(len(all_texts))]
7 frames
/usr/local/lib/python3.10/dist-packages/azure/search/documents/_generated/operations/_documents_operations.py in index(self, batch, request_options, **kwargs)
1249 map_error(status_code=response.status_code, response=response, error_map=error_map)
1250 error = self._deserialize.failsafe_deserialize(_models.SearchError, pipeline_response)
-> 1251 raise HttpResponseError(response=response, model=error)
1252
1253 if response.status_code == 200:
HttpResponseError: () The request is invalid. Details: The property 'content' does not exist on type 'search.documentFields'. Make sure to only use property names that are defined by the type.
Code:
Message: The request is invalid. Details: The property 'content' does not exist on type 'search.documentFields'. Make sure to only use property names that are defined by the type.
This is my index in Azure Cognitive Search:
I have solved it now. You have to create the necessary fields in the Azure Cognitive Search index yourself. These are:
The field content_vector seems to hold the vectors. The JSON definition of the field is:
{
"name": "content_vector",
"type": "Collection(Edm.Single)",
"searchable": true,
"filterable": false,
"retrievable": true,
"sortable": false,
"facetable": false,
"key": false,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": null,
"normalizer": null,
"dimensions": 1536,
"vectorSearchConfiguration": "vector-config-1699712748580",
"synonymMaps": []
}
and