I am trying to achieve semantic/vector search over my own data. A PDF file is uploaded to blob storage; an indexer with a skillset picks up the file content from the data source and maps it to the index fields Id, Content, and Contentvector. I am able to create the storage data source and an index with the fields Id, Content, and Contentvector, and I have managed to upload a file. But I am struggling to create an indexer with a skillset that converts the file content to vector format and maps it to the index field "Contentvector".
In the code below I first create the index, then define and create the data source, then create the skillset, and finally create an indexer that uses that skillset.
public async Task ConfigureSearchIndexer()
{
    //"my-semantic-config";
    SearchIndexClient indexClient = new SearchIndexClient(ServiceEndpoint, new AzureKeyCredential(SearchAdminApiKey));
    SearchIndexerClient indexerClient = new SearchIndexerClient(ServiceEndpoint, new AzureKeyCredential(SearchAdminApiKey));
    try
    {
        // Create the index
        var sampleIndex = GetSampleIndex(IndexName);
        Console.WriteLine("Creating index: " + IndexName);
        indexClient.CreateOrUpdateIndex(sampleIndex);
        Console.WriteLine("Created the index: " + IndexName);

        // Define the data source
        Console.WriteLine("Creating or updating data source: " + DataSourceName);
        SearchIndexerDataSourceConnection dataSource = new SearchIndexerDataSourceConnection(
            name: DataSourceName,
            type: SearchIndexerDataSourceType.AzureBlob,
            connectionString: BlobStorageConnectionString,
            container: new SearchIndexerDataContainer(ContainerName));

        // Create or update the data source
        indexerClient.CreateOrUpdateDataSourceConnection(dataSource);
        Console.WriteLine("Create or update data source completed for: " + DataSourceName);

        // Upload PDF file
        //string BlobName = UploadFileToBlobStorage(filePath);
        //Console.WriteLine("Uploaded PDF file to blob storage: " + BlobName);

        // Create the skillset
        CreateOrUpdateSkillSets();

        // Define indexer parameters
        IndexingParameters indexingParameters = new IndexingParameters()
        {
            MaxFailedItems = -1,
            MaxFailedItemsPerBatch = -1,
        };
        indexingParameters.Configuration.Add("dataToExtract", "contentAndMetadata");
        indexingParameters.Configuration.Add("parsingMode", "default");
        indexingParameters.Configuration.Add("allowSkillsetToReadFileData", true);

        // Create the indexer
        var indexer = new SearchIndexer(indexerName, DataSourceName, IndexName)
        {
            SkillsetName = "sanindexerskillset1",
            Description = "Blob indexer",
            Parameters = indexingParameters
        };
        //FieldMappingFunction mappingFunction = new FieldMappingFunction("base64Encode");
        //mappingFunction.Parameters.Add("useHttpServerUtilityUrlTokenEncode", true);
        //indexer.FieldMappings.Add(new FieldMapping("metadata_storage_path") { TargetFieldName = "id", MappingFunction = mappingFunction });
        //indexer.FieldMappings.Add(new FieldMapping("content") { TargetFieldName = "content" });
        //indexer.FieldMappings.Add(new FieldMapping("metadata_storage_name") { TargetFieldName = "title" });

        // Create or update the indexer
        indexerClient.CreateOrUpdateIndexer(indexer);

        // Run the indexer
        indexerClient.RunIndexer(indexerName);
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex.ToString());
    }
}
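
// For context, GetSampleIndex looks roughly like this (simplified sketch; I am
// assuming the vector search types from Azure.Search.Documents 11.5+ here, i.e.
// HnswAlgorithmConfiguration / VectorSearchProfile, and the profile/algorithm
// names are just examples):
private SearchIndex GetSampleIndex(string indexName)
{
    return new SearchIndex(indexName)
    {
        Fields =
        {
            new SimpleField("Id", SearchFieldDataType.String) { IsKey = true },
            new SearchableField("Content"),
            new SearchField("Contentvector", SearchFieldDataType.Collection(SearchFieldDataType.Single))
            {
                IsSearchable = true,
                VectorSearchDimensions = 1536,                // e.g. text-embedding-ada-002 output size
                VectorSearchProfileName = "my-vector-profile" // example profile name
            }
        },
        VectorSearch = new VectorSearch
        {
            Profiles = { new VectorSearchProfile("my-vector-profile", "my-hnsw-config") },
            Algorithms = { new HnswAlgorithmConfiguration("my-hnsw-config") }
        }
    };
}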
public void CreateOrUpdateSkillSets()
{
    // Create a SearchIndexerClient using the search service endpoint and API key
    AzureKeyCredential credential = new AzureKeyCredential(SearchAdminApiKey);
    SearchIndexerClient searchIndexerClient = new SearchIndexerClient(ServiceEndpoint, credential);
    string skillsetName = "sanindexerskillset1";
    string projectionName = "sanindexerprojectionname1";

    var collection = new List<SearchIndexerSkill>()
    {
        // Extract text from PDFs
        new DocumentExtractionSkill(
            new List<InputFieldMappingEntry>
            {
                new InputFieldMappingEntry("file_data")
                {
                    Source = "/document/content"
                    //Target = "input"
                }
            },
            new List<OutputFieldMappingEntry>
            {
                new OutputFieldMappingEntry("text")
                {
                    TargetName = "extractedText"
                }
            })
        { Context = "/document", Description = "Extract text from documents" },

        // Shape the data into a vector
        new ShaperSkill(
            new List<InputFieldMappingEntry>
            {
                new InputFieldMappingEntry("text")
                {
                    Source = "/document/content"
                }
            },
            new List<OutputFieldMappingEntry>
            {
                new OutputFieldMappingEntry("output")
                {
                    TargetName = "contentvector"
                }
            })
        { Context = "/document", Description = "Shape the data into a vector" },
    };

    // Create the skillset
    var skillset = new SearchIndexerSkillset(skillsetName, new List<SearchIndexerSkill>(collection));
    Console.WriteLine("Creating or updating skillset: " + skillsetName);
    try
    {
        // Create or update the skillset
        searchIndexerClient.CreateOrUpdateSkillset(skillset);
        Console.WriteLine("Skillset created successfully! Skillset name: " + skillsetName);
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex.Message);
    }
}
The indexer starts running but throws the error "Required skill input was not in the expected format. Name: 'file_data', Source: '$(/document/content)', Error: 'Missing file reference object'". I am also a bit confused about how to convert the file content to 1536-dimension vector data using an indexer. Please tell me if I am doing anything wrong, or suggest how to achieve what I am expecting.
There are a few things worth clarifying here.
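First, the indexer error itself: DocumentExtractionSkill expects a file reference object as its file_data input, not extracted text. Since you already set allowSkillsetToReadFileData to true, the indexer exposes that file reference at "/document/file_data", so the skill input should point there rather than at "/document/content". A minimal sketch of the corrected binding:

// Corrected input binding (sketch): /document/file_data carries the file
// reference object, available because allowSkillsetToReadFileData is enabled
// on the indexer.
new InputFieldMappingEntry("file_data")
{
    Source = "/document/file_data"
}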
Beyond that, I highly recommend you check out the demo we have that shows the current method for ingesting data and generating vector embeddings for vector search using indexers. It uses the Azure OpenAI service to generate the vector embeddings, but you could also write your own custom skill to call a different embedding service or model.
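On that last point: note that ShaperSkill only reshapes existing enriched values into a structure; it does not generate embeddings, so it can never produce the 1536-dimension vector on its own. As a rough sketch of the custom-skill route, you would replace it with a WebApiSkill that posts each document's text to an embedding endpoint you deploy yourself (the URI and the "embedding" output name below are placeholders for whatever your service actually exposes):

// Sketch only: the endpoint URI is a placeholder for a custom skill you host
// that follows the custom web API skill contract and returns an "embedding"
// array (1536 floats for text-embedding-ada-002) for each record.
var embeddingSkill = new WebApiSkill(
    new List<InputFieldMappingEntry>
    {
        new InputFieldMappingEntry("text") { Source = "/document/content" }
    },
    new List<OutputFieldMappingEntry>
    {
        new OutputFieldMappingEntry("embedding") { TargetName = "contentvector" }
    },
    "https://your-function-app.azurewebsites.net/api/embed")
{
    Context = "/document",
    Description = "Call a custom embedding service to vectorize the content"
};

Whichever skill produces the vector, the indexer also needs an output field mapping so the enriched value lands in the index's vector field, e.g. indexer.OutputFieldMappings.Add(new FieldMapping("/document/contentvector") { TargetFieldName = "Contentvector" });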