Tags: python, azure, azure-openai

Azure OpenAI Embedding Skill - Cannot iterate over non-array '/document/contentVector'


The following code runs successfully but the indexer execution history shows the warning:

Cannot iterate over non-array '/document/contentVector'.
Could not map output field 'contentVector' to search index. Check the 'outputFieldMappings' property of your indexer.

Based on the split skill and Azure OpenAI embedding skill docs (https://learn.microsoft.com/en-us/azure/search/cognitive-search-skill-azure-openai-embedding, https://learn.microsoft.com/en-us/azure/search/cognitive-search-skill-textsplit), I am quite sure I've configured the inputs, outputs, and field mappings correctly. When I retrieve a document, the chunks field is correctly chunked; however, contentVector is an empty list [].

import os
from pprint import pprint
from dotenv import load_dotenv
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizableTextQuery  # used in the vector query below
from azure.search.documents.indexes import SearchIndexClient, SearchIndexerClient
from azure.search.documents.indexes.models import (
    SearchIndex,
    SimpleField,
    SearchableField,
    SearchFieldDataType,
    SearchField,
    VectorSearch,
    VectorSearchProfile,
    HnswAlgorithmConfiguration,
    AzureOpenAIEmbeddingSkill,
    InputFieldMappingEntry,
    OutputFieldMappingEntry,
    SearchIndexerSkillset,
    SearchIndexer,
    SplitSkill,
    SearchIndexerDataSourceConnection,
    SearchIndexerDataContainer,
    FieldMapping,
    IndexingParameters,
    IndexingParametersConfiguration,
    AzureOpenAIVectorizer,
    AzureOpenAIVectorizerParameters
)



load_dotenv()
AZURE_SEARCH_ENDPOINT = os.getenv("AZURE_SEARCH_ENDPOINT")
AZURE_SEARCH_KEY = os.getenv("AZURE_SEARCH_KEY")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_BLOB_URL = os.getenv("AZURE_BLOB_URL")
AZURE_BLOB_CONN_STRING = os.getenv("AZURE_BLOB_CONN_STRING")
AZURE_BLOB_ACC_KEY = os.getenv("AZURE_BLOB_ACC_KEY")




"""
Create index
Create chunking and embedding skills
Index data
"""

def ta_create_skillset():
    split_skill = SplitSkill(
        inputs=[InputFieldMappingEntry(name="text", source="/document/content")],
        outputs=[OutputFieldMappingEntry(name="textItems", target_name="chunks")],
        name="cf-textsplit-skill-1k",
        text_split_mode="pages",
        maximum_page_length=1000,
        page_overlap_length=100
    )

    embedding_skill = AzureOpenAIEmbeddingSkill(
        inputs=[InputFieldMappingEntry(name="text", source="/document/chunks/*")],
        outputs=[OutputFieldMappingEntry(name="embedding", target_name="contentVector")],
        context="/document/chunks/*",
        name="cf-embedding-skill-large",
        resource_url=AZURE_OPENAI_ENDPOINT,
        api_key=AZURE_OPENAI_API_KEY,
        deployment_name="text-embedding-3-large",
        model_name="text-embedding-3-large",
        dimensions=3072
    )

    skillset = SearchIndexerSkillset(
        name="cf-chunk-embed-skillset",
        description="Skillset for chunking and Azure OpenAI embeddings",
        skills=[split_skill, embedding_skill]
    )

    indexer = SearchIndexerClient(
        endpoint=AZURE_SEARCH_ENDPOINT,
        credential=AzureKeyCredential(AZURE_SEARCH_KEY)
    )

    indexer.create_or_update_skillset(skillset)
    print(f"Skillset {skillset.name} with skills: {', '.join([x.name for x in skillset.skills])} created.")


def ta_create_index(index_name):
    fields = [
        SimpleField(name="id", type=SearchFieldDataType.String, key=True),
        SearchableField(name="title", type=SearchFieldDataType.String),
        SearchableField(name="content", type=SearchFieldDataType.String),
        SearchableField(name="chunks", collection=True, type=SearchFieldDataType.String),
        SimpleField(name="location", type=SearchFieldDataType.String, filterable=True),
        SimpleField(name="document_type", type=SearchFieldDataType.String, filterable=True),
        SearchableField(name="jurisdiction", collection=True, type=SearchFieldDataType.String, filterable=True),
        SearchableField(name="category", collection=True, type=SearchFieldDataType.String, filterable=True),
        SearchableField(name="summary", type=SearchFieldDataType.String),
        SearchableField(name="abstract", type=SearchFieldDataType.String),
        SearchableField(name="contentVector", collection=True, type=SearchFieldDataType.Single,
                        vector_search_dimensions=3072, vector_search_profile_name="cf-vector")
        # SearchField(name="contentVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        #             searchable=True, hidden=False, vector_search_dimensions=3072, vector_search_profile_name="cf-vector"),
    ]

    vectorizer = AzureOpenAIVectorizer(
        vectorizer_name="cf-vectorizer",
        parameters=AzureOpenAIVectorizerParameters(
            resource_url=AZURE_OPENAI_ENDPOINT,
            deployment_name="text-embedding-3-large",
            api_key=AZURE_OPENAI_API_KEY,
            model_name="text-embedding-3-large"
        )
    )

    vector_search = VectorSearch(
        profiles=[
            VectorSearchProfile(name="cf-vector",
                                algorithm_configuration_name="vector-config",
                                vectorizer_name="cf-vectorizer"
            )
        ],
        algorithms=[
            HnswAlgorithmConfiguration(
                name="vector-config",
                parameters={
                    "m": 4,
                    "efConstruction": 400,
                    "efSearch": 500,
                    "metric": "cosine"
                }
            )
        ],
        vectorizers=[vectorizer]
    )

    index_client = SearchIndexClient(endpoint=AZURE_SEARCH_ENDPOINT, credential=AzureKeyCredential(AZURE_SEARCH_KEY))

    index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)
    index_client.create_or_update_index(index)
    print(f"Index '{index_name}' created or updated with vector search capability.")





ta_create_skillset()
ta_create_index("cf-rag-index")


indexer_client = SearchIndexerClient(endpoint=AZURE_SEARCH_ENDPOINT, credential=AzureKeyCredential(AZURE_SEARCH_KEY))
data_source_conn = SearchIndexerDataSourceConnection(
    name="cf-ta-blob-conn",
    connection_string=AZURE_BLOB_CONN_STRING,
    type="azureblob",
    container=SearchIndexerDataContainer(name="cf-ta-container")
)
indexer_client.create_or_update_data_source_connection(data_source_conn)
indexer = SearchIndexer(
    name="cf-ta-indexer",
    data_source_name="cf-ta-blob-conn",
    target_index_name="cf-rag-index",
    skillset_name="cf-chunk-embed-skillset",
    output_field_mappings=[
        FieldMapping(source_field_name="/document/chunks", target_field_name="chunks"),
        FieldMapping(source_field_name="/document/contentVector/*", target_field_name="contentVector")
    ],
    parameters={"configuration": {"parsing_mode":"json"}}
)
indexer_client.create_or_update_indexer(indexer)
indexer_client.run_indexer(name="cf-ta-indexer")


indexer_status = indexer_client.get_indexer_status("cf-ta-indexer")
print(indexer_status.status)


def print_execution_history(indexer_client, indexer_name):
    indexer_status = indexer_client.get_indexer_status(indexer_name)
    for execution in indexer_status.execution_history:
        if len(execution.errors) > 0:
            e = execution.errors[0]
            print(e.details)
            print(e.error_message)
            print("-"*10)
        if len(execution.warnings) > 0:
            w = execution.warnings[0]
            print(w.details)
            print(w.message)
            print("-"*10)

print_execution_history(indexer_client, "cf-ta-indexer")


search_client = SearchClient(
    endpoint=AZURE_SEARCH_ENDPOINT,
    index_name="cf-rag-index",
    credential=AzureKeyCredential(AZURE_SEARCH_KEY)
)
results = search_client.search("*", top=1)
res = list(results)[0]
print(res["chunks"][0][-100:])
print(res["chunks"][1][:100])
pprint(res)

The following vector search returns the error:

Message: The field 'contentVector' in the vector field list is not a vector field.
Parameter name: vector.fields
Exception Details:  (FieldNotSearchable) The field 'contentVector' in the vector field list is not a vector field.
    Code: FieldNotSearchable
    Message: The field 'contentVector' in the vector field list is not a vector field.
# myquery holds the natural-language query text
results = search_client.search(
    select="title,chunks",
    vector_queries=[VectorizableTextQuery(
        text=myquery,
        k_nearest_neighbors=3,
        fields="contentVector"
    )]
)
for r in results:
    pprint(r)

Changing the vector field to

SearchField(name="contentVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                    searchable=True, vector_search_dimensions=3072, vector_search_profile_name="cf-vector")

does get rid of the above error; however, the search results are empty and the indexer execution history now shows another error:

There's a mismatch in vector dimensions. The vector field 'contentVector', with dimension of '3072', expects a length of '3072'. However, the provided vector has a length of '0'. Please ensure that the vector length matches the expected length of the vector field. Read the following documentation for more details: https://learn.microsoft.com/en-us/azure/search/vector-search-how-to-configure-compression-storage.
Could not index document because some of the data in the document was not valid.
----------
Cannot iterate over non-array '/document/contentVector'.
Could not map output field 'contentVector' to search index. Check the 'outputFieldMappings' property of your indexer.

Solution

  • Had to change a few things, but it is working as expected now.

    Firstly, I had to add index projections to the skillset and explicitly map all the fields I want to populate in the search index. Note that contentVector does not show up in the retrieved documents because the field is not retrievable.

    from azure.search.documents.indexes.models import (
        SearchIndexerIndexProjection,
        SearchIndexerIndexProjectionSelector,
        SearchIndexerIndexProjectionsParameters,
    )

    skillset = SearchIndexerSkillset(
            name="cf-chunk-embed-skillset",
            description="Skillset for chunking and Azure OpenAI embeddings",
            skills=[split_skill, embedding_skill],
            index_projection=SearchIndexerIndexProjection(
                selectors=[
                    SearchIndexerIndexProjectionSelector(
                        target_index_name="cf-rag-index",
                        parent_key_field_name="parent_id",
                        source_context="/document/chunks/*",
                        mappings=[
                            InputFieldMappingEntry(name="chunk", source="/document/chunks/*"),
                            InputFieldMappingEntry(name="contentVector", source="/document/chunks/*/contentVector"),
                            InputFieldMappingEntry(name="title", source="/document/title"),
                            InputFieldMappingEntry(name="abstract", source="/document/abstract"),
                            InputFieldMappingEntry(name="summary", source="/document/summary"),
                            InputFieldMappingEntry(name="jurisdiction", source="/document/jurisdiction"),
                            InputFieldMappingEntry(name="category", source="/document/category"),
                            InputFieldMappingEntry(name="location", source="/document/location"),
                            InputFieldMappingEntry(name="document_type", source="/document/document_type"),
                        ]
                    )
                ],
                parameters=SearchIndexerIndexProjectionsParameters(
                    projection_mode="skipIndexingParentDocuments"
                )
            )
        )
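
    This also explains the original warning: the embedding skill runs with context /document/chunks/*, so each embedding is written under its own chunk node at /document/chunks/*/contentVector. There is no top-level /document/contentVector array for the output field mapping to iterate over. A minimal sketch of my mental model of the enriched document (illustrative only, not actual SDK output):

    # Sketch of the enriched document tree built by the skillset
    # (illustrative; "$value" stands in for the chunk's own text node)
    enriched_doc = {
        "content": "full text extracted from the blob",
        "chunks": [  # split skill output, target_name="chunks"
            # the embedding skill's context is "/document/chunks/*", so its
            # output lands under each chunk: /document/chunks/*/contentVector
            {"$value": "chunk 0 text", "contentVector": [0.0] * 3072},
            {"$value": "chunk 1 text", "contentVector": [0.0] * 3072},
        ],
        # there is no top-level "contentVector" node, hence:
        # "Cannot iterate over non-array '/document/contentVector'"
    }

    That is why the index projection maps contentVector from /document/chunks/*/contentVector with source_context /document/chunks/*.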
    

    Also needed to create a new parent_id field and modify the key field to use the keyword analyzer (the index projections generate the per-chunk document keys, and the keyword analyzer keeps them as single exact tokens).

    fields = [
            SearchField(name="id", type=SearchFieldDataType.String, key=True, analyzer_name="keyword"),
            SimpleField(name="parent_id", type=SearchFieldDataType.String, filterable=True),
            SearchableField(name="title", type=SearchFieldDataType.String),
            # SearchableField(name="content", type=SearchFieldDataType.String),
            SearchableField(name="chunk", type=SearchFieldDataType.String),
            SimpleField(name="location", type=SearchFieldDataType.String, filterable=True),
            SimpleField(name="document_type", type=SearchFieldDataType.String, filterable=True),
            SearchableField(name="jurisdiction", collection=True, type=SearchFieldDataType.String, filterable=True),
            SearchableField(name="category", collection=True, type=SearchFieldDataType.String, filterable=True),
            SearchableField(name="summary", type=SearchFieldDataType.String),
            SearchableField(name="abstract", type=SearchFieldDataType.String),
            SearchField(name="contentVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                        searchable=True, vector_search_dimensions=3072, vector_search_profile_name="cf-vector"),
        ]
    

    Lastly, you can remove the output field mappings from the search indexer; they didn't seem to do anything once the index projection was set on the skillset.
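
    For completeness, a sketch of the final indexer without the output field mappings (same names and JSON parsing mode as in the question):

    indexer = SearchIndexer(
        name="cf-ta-indexer",
        data_source_name="cf-ta-blob-conn",
        target_index_name="cf-rag-index",
        skillset_name="cf-chunk-embed-skillset",
        # no output_field_mappings: the skillset's index projections write
        # one search document per chunk directly into the index
        parameters=IndexingParameters(
            configuration=IndexingParametersConfiguration(parsing_mode="json")
        )
    )
    indexer_client.create_or_update_indexer(indexer)
    indexer_client.run_indexer(name="cf-ta-indexer")

    After a successful run, the vector query from the question works once select is updated to the new per-chunk field name (chunk instead of chunks):

    results = search_client.search(
        select="title,chunk",
        vector_queries=[VectorizableTextQuery(
            text=myquery,  # natural-language query text
            k_nearest_neighbors=3,
            fields="contentVector"
        )]
    )
    for r in results:
        pprint(r)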