I have the following index:
// Describe the index schema: vector search setup, semantic configuration, and fields.
var searchIndex = new SearchIndex(_indexName)
{
    // The profile and the HNSW algorithm configuration are both built from vectorSearchHnswConfig.
    VectorSearch = new()
    {
        Profiles = { new VectorSearchProfile(vectorSearchProfile, vectorSearchHnswConfig) },
        Algorithms = { new HnswVectorSearchAlgorithmConfiguration(vectorSearchHnswConfig) }
    },

    // Semantic configuration: "Title" is the title field; Subtitle, Content and
    // AuthorsSpeakers are the content fields, in that order.
    SemanticSettings = new()
    {
        Configurations =
        {
            new SemanticConfiguration(SemanticSearchConfigName, new()
            {
                TitleField = new() { FieldName = "Title" },
                ContentFields =
                {
                    new() { FieldName = "Subtitle" },
                    new() { FieldName = "Content" },
                    new() { FieldName = "AuthorsSpeakers" }
                }
            })
        }
    },

    Fields =
    {
        // Identifiers.
        new SearchField("Id", SearchFieldDataType.String) { IsKey = true, IsFilterable = true },
        new SearchField("ParentId", SearchFieldDataType.String),
        new SearchField("OriginalId", SearchFieldDataType.String),

        // Text and metadata fields.
        new SearchField("Title", SearchFieldDataType.String) { IsFilterable = true, IsSearchable = true },
        new SearchField("Content", SearchFieldDataType.String) { IsFilterable = true, IsSearchable = true },
        new SearchField("Subtitle", SearchFieldDataType.String) { IsFilterable = true, IsSearchable = true },
        new SearchField("ContentType", SearchFieldDataType.String) { IsFilterable = true, IsSearchable = true },
        new SearchField("Site", SearchFieldDataType.String) { IsFilterable = true },
        new SearchField("AuthorsSpeakers", SearchFieldDataType.String) { IsSearchable = true },
        new SimpleField("UpdateDate", SearchFieldDataType.DateTimeOffset) { IsFilterable = true, IsSortable = true },
        new SearchField("Url", SearchFieldDataType.String),

        // String collections.
        new SearchableField("Roles", collection: true) { IsFilterable = true },
        new SearchableField("Authors", collection: true) { IsFilterable = true },
        new SearchableField("Categories", collection: true) { IsFilterable = true },

        // Embedding fields: both use ModelDimensions and the same vector search profile.
        new SearchField("TitleVector", SearchFieldDataType.Collection(SearchFieldDataType.Single))
        {
            IsSearchable = true,
            VectorSearchDimensions = ModelDimensions,
            VectorSearchProfile = vectorSearchProfile
        },
        new SearchField("ContentVector", SearchFieldDataType.Collection(SearchFieldDataType.Single))
        {
            IsSearchable = true,
            VectorSearchDimensions = ModelDimensions,
            VectorSearchProfile = vectorSearchProfile
        }
    }
};

// Send the definition to the service.
_searchIndexClient.CreateOrUpdateIndex(searchIndex);
One of the documents (there are only 30 at the moment) is:
"Title": "ADB supports power sector reforms in India with $250m loan",
"Content": "ADB has approved a $250 million policy-based loan to strengthen India’s power sector by improving financial sustainability and facilitating the shift to renewable energy.The Power Sector Reform Program (Subprogram 1) is the first of a two-part program to strengthen the development of markets for power trade and related ancillary services.\r\nThis will facilitate the integration of intermittent renewable energy given India’s target of 50% of its power generation capacity from non-fossil fuel sources by 2030. These policy actions will, among others, help accelerate the deployment of solar and other renewable energy technologies, promote the use of renewables in the agriculture sector, and optimize dispatch of power plants to reduce emissions.\r\nThe loan will also enable measures to improve the financial performance, corporate governance, and service quality of electricity distribution companies (DISCOM) and create a conducive environment for private sector investment. It will aid in the implementation of an incentive-based results-oriented approach to improving DISCOM performance on parameters including losses, cost recovery, metering and timely payment of dues in order to access government budget support.\r\nThis program builds on ADB’s engagements in emerging areas such as green hydrogen that will be important to facilitate energy transition. ADB will provide financial grants of $1.5 million from its Technical Assistance Special Fund and the Climate Change Fund for capacity development and the implementation of policy reforms. The program has been prepared in coordination with other development partners, in particular German development cooperation through KfW.",
"Subtitle": "",
With hybrid semantic search, I'm getting very low scores.
query=ADB supports power sector reforms in India with $250m loan
// Hybrid semantic query: the text `query` is sent together with a vector query
// built from its embedding, and semantic options are enabled on the request.
// NOTE(review): this is an excerpt; the enclosing method signature is not shown.
// Generate the embedding for the query
var queryEmbeddings = await _openAiService.GenerateEmbedding(query);
// Perform the vector similarity search
var searchOptions = new SearchOptions
{
// One vector query over both embedding fields, requesting 3 nearest neighbors.
VectorQueries = { new RawVectorQuery() { Vector = queryEmbeddings.ToArray(), KNearestNeighborsCount = 3, Fields = { "ContentVector", "TitleVector" } } },
Size = noOfResults,
// Semantic query type, using the named semantic configuration below.
QueryType = SearchQueryType.Semantic,
QueryLanguage = QueryLanguage.EnUs,
SemanticConfigurationName = SemanticSearchConfigName,
// Request extractive captions (with highlighting) and extractive answers.
QueryCaption = QueryCaptionType.Extractive,
QueryAnswer = QueryAnswerType.Extractive,
QueryCaptionHighlightEnabled = true,
// Fields to return for each result.
Select = { "Title", "Content", "Categories", "Subtitle", "Id", "OriginalId", "ContentType", "Site", "UpdateDate", "Authors", "Url" },
};
// Disabled, unfinished draft of a filter (does not compile as written).
//if(brandsFilter.Any())
//{// "filter":"group_ids/any(g:search.in(g, 'group_id1, group_id2'))"
// searchOptions.Filter = SearchFilter.Create( new FormattableString(){ Format.})
//}
try
{
// Passing the text query alongside VectorQueries issues the text and vector searches in one request.
SearchResults<IndexModel> response = await _searchClient.SearchAsync<IndexModel>(query, searchOptions);
return await ConverToCognitiveResultModel(response);
}
// NOTE(review): the catch/finally clause for this try is not included in this excerpt.
The first document that my search returns is the expected document, but its score is very low.
The same query, run as a normal full-text search, returns a score of 13.
My question is: why is the ranking score so low? We use these search results for RAG, and our cut-off is 4, so this document would obviously not make the cut.
thanks
Thanks for trying out semantic search.
Semantic search produces the @search.rerankerScore value. The range of that score is 0 to 4, where 4 is the highest relevance. Your screenshot shows that the reranker score is 3.78, which is considered highly relevant. The low score you see is the reciprocal rank fusion (RRF) score, which is used to merge vector and text results. That number will almost always be low and should not be used to evaluate the absolute relevance of a document; rather, it is only meant to rank documents relative to other documents within the same result set.
For your RAG threshold, you should use the semantic ranker score and decide on a number between 0 and 4 based on your needs.
Here's some documentation explaining those various concepts:
How hybrid scores are computed:
https://learn.microsoft.com/en-us/azure/search/hybrid-search-ranking
How to use semantic scores:
https://learn.microsoft.com/en-us/azure/search/semantic-search-overview#how-summaries-are-scored