The following code runs successfully, but the indexer execution history shows this warning:

Cannot iterate over non-array '/document/contentVector'.
Could not map output field 'contentVector' to search index. Check the 'outputFieldMappings' property of your indexer.

Based on the text split skill and Azure OpenAI embedding skill docs (https://learn.microsoft.com/en-us/azure/search/cognitive-search-skill-azure-openai-embedding , https://learn.microsoft.com/en-us/azure/search/cognitive-search-skill-textsplit), I'm quite sure I've configured the inputs, outputs, and field mappings correctly. When I retrieve a document, the chunks field is populated with the expected chunks, but contentVector is an empty list [].
import os
from pprint import pprint
from tqdm import tqdm
import time
import json
from dotenv import load_dotenv
from lxml import etree
from bs4 import BeautifulSoup
from typing import List, Dict, Collection
import uuid
from langchain_openai import AzureOpenAIEmbeddings, AzureChatOpenAI
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient, SearchIndexerClient
from azure.search.documents.indexes.models import (
SearchIndex,
SimpleField,
SearchableField,
SearchFieldDataType,
SearchField,
VectorSearch,
VectorSearchProfile,
HnswAlgorithmConfiguration,
AzureOpenAIEmbeddingSkill,
InputFieldMappingEntry,
OutputFieldMappingEntry,
SearchIndexerSkillset,
SearchIndexerSkill,
SearchIndexer,
SplitSkill,
SearchIndexerDataSourceConnection,
SearchIndexerDataContainer,
FieldMapping,
IndexingParameters,
IndexingParametersConfiguration,
AzureOpenAIVectorizer,
AzureOpenAIVectorizerParameters
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from azure.core.exceptions import ResourceNotFoundError, ResourceExistsError
from langchain.vectorstores import AzureSearch
from langchain.retrievers import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo
from azure.search.documents import IndexDocumentsBatch
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
load_dotenv()
AZURE_SEARCH_ENDPOINT = os.getenv("AZURE_SEARCH_ENDPOINT")
AZURE_SEARCH_KEY = os.getenv("AZURE_SEARCH_KEY")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_BLOB_URL = os.getenv("AZURE_BLOB_URL")
AZURE_BLOB_CONN_STRING = os.getenv("AZURE_BLOB_CONN_STRING")
AZURE_BLOB_ACC_KEY = os.getenv("AZURE_BLOB_ACC_KEY")
"""
Create index
Create chunking and embedding skills
Index data
"""
def ta_create_skillset():
    split_skill = SplitSkill(
        inputs=[InputFieldMappingEntry(name="text", source="/document/content")],
        outputs=[OutputFieldMappingEntry(name="textItems", target_name="chunks")],
        name="cf-textsplit-skill-1k",
        text_split_mode="pages",
        maximum_page_length=1000,
        page_overlap_length=100
    )
    embedding_skill = AzureOpenAIEmbeddingSkill(
        inputs=[InputFieldMappingEntry(name="text", source="/document/chunks/*")],
        outputs=[OutputFieldMappingEntry(name="embedding", target_name="contentVector")],
        context="/document/chunks/*",
        name="cf-embedding-skill-large",
        resource_url=AZURE_OPENAI_ENDPOINT,
        api_key=AZURE_OPENAI_API_KEY,
        deployment_name="text-embedding-3-large",
        model_name="text-embedding-3-large",
        dimensions=3072
    )
    skillset = SearchIndexerSkillset(
        name="cf-chunk-embed-skillset",
        description="Skillset for chunking and Azure OpenAI embeddings",
        skills=[split_skill, embedding_skill]
    )
    skillset_client = SearchIndexerClient(
        endpoint=AZURE_SEARCH_ENDPOINT,
        credential=AzureKeyCredential(AZURE_SEARCH_KEY)
    )
    skillset_client.create_or_update_skillset(skillset)
    print(f"Skillset {skillset.name} with skills: {', '.join(x.name for x in skillset.skills)} created.")
def ta_create_index(index_name):
    fields = [
        SimpleField(name="id", type=SearchFieldDataType.String, key=True),
        SearchableField(name="title", type=SearchFieldDataType.String),
        SearchableField(name="content", type=SearchFieldDataType.String),
        SearchableField(name="chunks", collection=True, type=SearchFieldDataType.String),
        SimpleField(name="location", type=SearchFieldDataType.String, filterable=True),
        SimpleField(name="document_type", type=SearchFieldDataType.String, filterable=True),
        SearchableField(name="jurisdiction", collection=True, type=SearchFieldDataType.String, filterable=True),
        SearchableField(name="category", collection=True, type=SearchFieldDataType.String, filterable=True),
        SearchableField(name="summary", type=SearchFieldDataType.String),
        SearchableField(name="abstract", type=SearchFieldDataType.String),
        SearchableField(name="contentVector", collection=True, type=SearchFieldDataType.Single,
                        vector_search_dimensions=3072, vector_search_profile_name="cf-vector")
        # SearchField(name="contentVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        #             searchable=True, hidden=False, vector_search_dimensions=3072, vector_search_profile_name="cf-vector"),
    ]
    vectorizer = AzureOpenAIVectorizer(
        vectorizer_name="cf-vectorizer",
        parameters=AzureOpenAIVectorizerParameters(
            resource_url=AZURE_OPENAI_ENDPOINT,
            deployment_name="text-embedding-3-large",
            api_key=AZURE_OPENAI_API_KEY,
            model_name="text-embedding-3-large"
        )
    )
    vector_search = VectorSearch(
        profiles=[
            VectorSearchProfile(
                name="cf-vector",
                algorithm_configuration_name="vector-config",
                vectorizer_name="cf-vectorizer"
            )
        ],
        algorithms=[
            HnswAlgorithmConfiguration(
                name="vector-config",
                parameters={
                    "m": 4,
                    "efConstruction": 400,
                    "efSearch": 500,
                    "metric": "cosine"
                }
            )
        ],
        vectorizers=[vectorizer]
    )
    index_client = SearchIndexClient(endpoint=AZURE_SEARCH_ENDPOINT, credential=AzureKeyCredential(AZURE_SEARCH_KEY))
    index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)
    index_client.create_or_update_index(index)
    print(f"Index '{index_name}' created or updated with vector search capability.")
ta_create_skillset()
ta_create_index("cf-rag-index")
indexer_client = SearchIndexerClient(endpoint=AZURE_SEARCH_ENDPOINT, credential=AzureKeyCredential(AZURE_SEARCH_KEY))
data_source_conn = SearchIndexerDataSourceConnection(
    name="cf-ta-blob-conn",
    connection_string=AZURE_BLOB_CONN_STRING,
    type="azureblob",
    container=SearchIndexerDataContainer(name="cf-ta-container")
)
indexer_client.create_or_update_data_source_connection(data_source_conn)
indexer = SearchIndexer(
    name="cf-ta-indexer",
    data_source_name="cf-ta-blob-conn",
    target_index_name="cf-rag-index",
    skillset_name="cf-chunk-embed-skillset",
    output_field_mappings=[
        FieldMapping(source_field_name="/document/chunks", target_field_name="chunks"),
        FieldMapping(source_field_name="/document/contentVector/*", target_field_name="contentVector")
    ],
    parameters=IndexingParameters(configuration=IndexingParametersConfiguration(parsing_mode="json"))
)
indexer_client.create_or_update_indexer(indexer)
indexer_client.run_indexer(name="cf-ta-indexer")
indexer_status = indexer_client.get_indexer_status("cf-ta-indexer")
print(indexer_status.status)
def print_execution_history(indexer_client, indexer_name):
    indexer_status = indexer_client.get_indexer_status(indexer_name)
    for execution in indexer_status.execution_history:
        if len(execution.errors) > 0:
            e = execution.errors[0]
            print(e.details)
            print(e.error_message)
            print("-" * 10)
        if len(execution.warnings) > 0:
            w = execution.warnings[0]
            print(w.details)
            print(w.message)
            print("-" * 10)
print_execution_history(indexer_client, "cf-ta-indexer")
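Note that run_indexer is asynchronous, so the status printed immediately after it may still be inProgress. A small polling sketch (my addition) to wait for the last run to finish before querying:

while True:
    last = indexer_client.get_indexer_status("cf-ta-indexer").last_result
    if last is not None and last.status != "inProgress":
        # Expected terminal values include "success" and "transientFailure".
        print("last run finished with status:", last.status)
        break
    time.sleep(5)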
search_client = SearchClient(
endpoint=AZURE_SEARCH_ENDPOINT,
index_name="cf-rag-index",
credential=AzureKeyCredential(AZURE_SEARCH_KEY)
)
results = search_client.search("*", top=1)
res = list(results)[0]
print(res["chunks"][0][-100:])
print(res["chunks"][1][:100])
pprint(res)
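To make the symptom concrete (my own check): chunks comes back populated while the vector stays empty.

print(len(res["chunks"]), "chunks; contentVector length:", len(res.get("contentVector") or []))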
The following vector search returns the error:
Message: The field 'contentVector' in the vector field list is not a vector field.
Parameter name: vector.fields
Exception Details: (FieldNotSearchable) The field 'contentVector' in the vector field list is not a vector field.
Code: FieldNotSearchable
Message: The field 'contentVector' in the vector field list is not a vector field.
from azure.search.documents.models import VectorizableTextQuery

results = search_client.search(
    select="title,chunks",
    vector_queries=[VectorizableTextQuery(
        text=myquery,  # myquery holds the query string
        k_nearest_neighbors=3,
        fields="contentVector"
    )]
)
for r in results:
pprint(r)
Changing the vector field to

SearchField(name="contentVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            searchable=True, vector_search_dimensions=3072, vector_search_profile_name="cf-vector")

does get rid of the above error, but the search result is empty and another error is returned:
There's a mismatch in vector dimensions. The vector field 'contentVector', with dimension of '3072', expects a length of '3072'. However, the provided vector has a length of '0'. Please ensure that the vector length matches the expected length of the vector field. Read the following documentation for more details: https://learn.microsoft.com/en-us/azure/search/vector-search-how-to-configure-compression-storage.
Could not index document because some of the data in the document was not valid.
----------
Cannot iterate over non-array '/document/contentVector'.
Could not map output field 'contentVector' to search index. Check the 'outputFieldMappings' property of your indexer.
I had to change a few things, but it is now working as expected.

First, I had to add index projections to the skillset and explicitly map every field I want populated in the search index. Note that contentVector does not show up in the search results because it is not retrievable.
# additional imports needed for index projections
from azure.search.documents.indexes.models import (
    SearchIndexerIndexProjection,
    SearchIndexerIndexProjectionSelector,
    SearchIndexerIndexProjectionsParameters
)

skillset = SearchIndexerSkillset(
    name="cf-chunk-embed-skillset",
    description="Skillset for chunking and Azure OpenAI embeddings",
    skills=[split_skill, embedding_skill],
    index_projection=SearchIndexerIndexProjection(
        selectors=[
            SearchIndexerIndexProjectionSelector(
                target_index_name="cf-rag-index",
                parent_key_field_name="parent_id",
                source_context="/document/chunks/*",
                mappings=[
                    InputFieldMappingEntry(name="chunk", source="/document/chunks/*"),
                    InputFieldMappingEntry(name="contentVector", source="/document/chunks/*/contentVector"),
                    InputFieldMappingEntry(name="title", source="/document/title"),
                    InputFieldMappingEntry(name="abstract", source="/document/abstract"),
                    InputFieldMappingEntry(name="summary", source="/document/summary"),
                    InputFieldMappingEntry(name="jurisdiction", source="/document/jurisdiction"),
                    InputFieldMappingEntry(name="category", source="/document/category"),
                    InputFieldMappingEntry(name="location", source="/document/location"),
                    InputFieldMappingEntry(name="document_type", source="/document/document_type"),
                ]
            )
        ],
        parameters=SearchIndexerIndexProjectionsParameters(
            projection_mode="skipIndexingParentDocuments"
        )
    )
)
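With projection_mode="skipIndexingParentDocuments", each chunk is indexed as its own search document, keyed by an auto-generated id, with the source document's key carried in parent_id. A quick verification sketch (my addition, using the chunk and parent_id fields defined in the next snippet):

results = search_client.search("*", select="id,parent_id,chunk", top=3)
for r in results:
    # Each hit is one chunk; parent_id ties it back to the source blob document.
    print(r["parent_id"], "->", r["chunk"][:80])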
I also needed to add a new parent_id field and change the key field to use the keyword analyzer (the projections generate the key values automatically).
fields = [
    SearchField(name="id", type=SearchFieldDataType.String, key=True, analyzer_name="keyword"),
    SimpleField(name="parent_id", type=SearchFieldDataType.String, filterable=True),
    SearchableField(name="title", type=SearchFieldDataType.String),
    # SearchableField(name="content", type=SearchFieldDataType.String),
    SearchableField(name="chunk", type=SearchFieldDataType.String),
    SimpleField(name="location", type=SearchFieldDataType.String, filterable=True),
    SimpleField(name="document_type", type=SearchFieldDataType.String, filterable=True),
    SearchableField(name="jurisdiction", collection=True, type=SearchFieldDataType.String, filterable=True),
    SearchableField(name="category", collection=True, type=SearchFieldDataType.String, filterable=True),
    SearchableField(name="summary", type=SearchFieldDataType.String),
    SearchableField(name="abstract", type=SearchFieldDataType.String),
    SearchField(name="contentVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True, vector_search_dimensions=3072, vector_search_profile_name="cf-vector"),
]
Lastly, you can remove the output_field_mappings from the SearchIndexer; they didn't seem to do anything once the index projection is set on the skillset.
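With those changes in place, the vector query from before works. A minimal sketch (the query text is just a placeholder; VectorizableTextQuery relies on the cf-vectorizer configured on the index to embed it):

from azure.search.documents.models import VectorizableTextQuery

results = search_client.search(
    search_text=None,
    select="title,chunk",
    vector_queries=[VectorizableTextQuery(
        text="example question about jurisdiction",  # placeholder query string
        k_nearest_neighbors=3,
        fields="contentVector"
    )]
)
for r in results:
    print(r["title"], "-", r["chunk"][:100])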