Search code examples
pythonazureazure-cognitive-searchlangchainazure-openai

Trying adding embeddings in Azure Cognitive Search leads to error "The property 'content' does not exist on type 'search.documentFields'."


I am extracting text from PDF documents and loading it into Azure Cognitive Search for a RAG approach. Unfortunately this does not work. I am receiving the error message

HttpResponseError: () The request is invalid. Details: The property 'content' does not exist on type 'search.documentFields'. Make sure to only use property names that are defined by the type.
Code: 
Message: The request is invalid. Details: The property 'content' does not exist on type 'search.documentFields'. Make sure to only use property names that are defined by the type.

What i want to do is

  1. Extract text from pdf via pymupdf - works
  2. Upload it to Azure vector search as embeddings, with vectors and the metadata `filename`
  3. Query this through ChatGPT model

Besides the error, I want to add the metadata field `filename` to this document object, but I also don't know how to extend it ...

My code:

!pip install cohere tiktoken
!pip install openai==0.28.1
!pip install pymupdf
!pip install azure-storage-blob azure-identity
!pip install azure-search-documents --pre --upgrade
!pip install langchain

import fitz
import time
import uuid
import os
import openai

from PIL import Image
from io import BytesIO
from IPython.display import display

from azure.identity import DefaultAzureCredential
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient

from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.chat_models import AzureChatOpenAI
from langchain.vectorstores import AzureSearch
from langchain.docstore.document import Document
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader
from langchain.text_splitter import TokenTextSplitter
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate

from google.colab import drive

# Azure OpenAI connection settings.
# NOTE(review): hard-coded API keys in source are a security risk — prefer
# environment variables or a secrets manager.
OPENAI_API_BASE = "https://xxx.openai.azure.com"
OPENAI_API_KEY = "xxx"
OPENAI_API_VERSION = "2023-05-15"

# Configure the global openai module for the Azure endpoint
# (openai==0.28.x style configuration, matching the pinned install above).
openai.api_type = "azure"
openai.api_key = OPENAI_API_KEY
openai.api_base = OPENAI_API_BASE
openai.api_version = OPENAI_API_VERSION

# Azure Cognitive Search connection settings; the "service name" here is
# actually the full endpoint URL, which is what AzureSearch expects below.
AZURE_COGNITIVE_SEARCH_SERVICE_NAME = "https://xxx.search.windows.net"
AZURE_COGNITIVE_SEARCH_API_KEY = "xxx"
AZURE_COGNITIVE_SEARCH_INDEX_NAME = "test"

# Chat model (for the RAG answer step) and embedding model (for indexing).
llm = AzureChatOpenAI(deployment_name="gpt35", openai_api_key=OPENAI_API_KEY, openai_api_base=OPENAI_API_BASE, openai_api_version=OPENAI_API_VERSION)
embeddings = OpenAIEmbeddings(deployment_id="ada002", chunk_size=1, openai_api_key=OPENAI_API_KEY, openai_api_base=OPENAI_API_BASE, openai_api_version=OPENAI_API_VERSION)

# Vector store wrapper around the Cognitive Search index.
# NOTE(review): the HttpResponseError below indicates the target index does
# not define the fields langchain's AzureSearch uploads (e.g. 'content',
# 'content_vector') — the index schema must be created with those fields,
# as shown in the Solution section.
acs = AzureSearch(azure_search_endpoint=AZURE_COGNITIVE_SEARCH_SERVICE_NAME,
                  azure_search_key = AZURE_COGNITIVE_SEARCH_API_KEY,
                  index_name = AZURE_COGNITIVE_SEARCH_INDEX_NAME,
                  embedding_function = embeddings.embed_query)
    
def generate_tokens(s, f=None):
  """Split raw text into overlapping chunks wrapped as langchain Documents.

  Parameters
  ----------
  s : str
      Full text extracted from one source file.
  f : str | None, optional
      Source file path, stored in each chunk's metadata under
      ``file_source`` so the origin of a retrieved chunk can be traced.
      Defaults to None so existing one-argument callers keep working.

  Returns
  -------
  list[Document]
      One Document per chunk, carrying ``index`` (chunk position within
      the file) and ``file_source`` metadata.
  """
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
  splits = text_splitter.split_text(s)

  # enumerate() replaces the hand-rolled counter, and a dict literal
  # replaces the three-step mutation of an empty metadata dict.
  # The debug print of every chunk has been removed.
  return [
      Document(page_content=split, metadata={"index": i, "file_source": f})
      for i, split in enumerate(splits)
  ]


drive.mount('/content/drive')
folder = "/content/drive/.../pdf/"

# For each PDF in the folder: extract all page text, chunk it once, and
# upload the chunks (with the source file path as metadata) to the
# Azure Cognitive Search vector store.
for filename in os.listdir(folder):
    file_path = os.path.join(folder, filename)
    if not os.path.isfile(file_path):
        continue
    print(f"Processing file: {file_path}")

    # Reset per file so text from earlier PDFs does not leak into this one
    # (the original initialized doc_content once, before the loop).
    doc_content = ''
    doc = fitz.open(file_path)
    try:
        for page in doc:  # iterate the document pages
            doc_content += page.get_text()  # plain text encoded as UTF-8
    finally:
        doc.close()  # release the PyMuPDF document handle

    # Chunk and upload ONCE per file, after all pages are read.  The
    # original chunked inside the page loop on cumulatively re-appended
    # text, which duplicated content and uploaded the same chunks
    # repeatedly; it also called generate_tokens with a missing argument.
    d = generate_tokens(doc_content, file_path)
    acs.add_documents(documents=d)

    print("----------")
    print(doc_content)
    # Whitespace-separated word count — an approximation, not real
    # tokenizer tokens.
    count = len(doc_content.split())
    print("Number of tokens: ", count)


HttpResponseError                         Traceback (most recent call last)
<ipython-input-11-d9eaff7ee027> in <cell line: 10>()
     31           all_texts.extend(d)
     32 
---> 33           acs.add_documents(documents=d)
     34 
     35           metadatas = [{"Source": f"{i}-pl"} for i in range(len(all_texts))]

7 frames
/usr/local/lib/python3.10/dist-packages/azure/search/documents/_generated/operations/_documents_operations.py in index(self, batch, request_options, **kwargs)
   1249             map_error(status_code=response.status_code, response=response, error_map=error_map)
   1250             error = self._deserialize.failsafe_deserialize(_models.SearchError, pipeline_response)
-> 1251             raise HttpResponseError(response=response, model=error)
   1252 
   1253         if response.status_code == 200:

HttpResponseError: () The request is invalid. Details: The property 'content' does not exist on type 'search.documentFields'. Make sure to only use property names that are defined by the type.
Code: 
Message: The request is invalid. Details: The property 'content' does not exist on type 'search.documentFields'. Make sure to only use property names that are defined by the type.

This is my index in Azure Cognitive Search index:

enter image description here


Solution

  • I have solved it now. You have to create the necessary fields in the Azure Cognitive Search index. These are

    enter image description here

    The field content_vector seems to hold the vectors. The JSON definition of the field is

    {
      "name": "content_vector",
      "type": "Collection(Edm.Single)",
      "searchable": true,
      "filterable": false,
      "retrievable": true,
      "sortable": false,
      "facetable": false,
      "key": false,
      "indexAnalyzer": null,
      "searchAnalyzer": null,
      "analyzer": null,
      "normalizer": null,
      "dimensions": 1536,
      "vectorSearchConfiguration": "vector-config-1699712748580",
      "synonymMaps": []
    }
    

    and

    enter image description here