Search code examples
Tags: azure, search, azure-blob-storage, azure-cognitive-search

How to search for part of a string with Azure Cognitive Search


I'm quite new to Azure Cognitive Search, and have succeeded in configuring my index to support autocompletion (using partial search thanks to this article).

But now I have another use case where I have many files stored in an Azure Blob Container with metadata:

One of the metadata fields (of each file) is called partnumbers, and its value is a string of product SKUs separated by commas (like "123456,78901,102938,09876"). I've built my index to store this info as an Edm.String, as you can see below:

{
  "name": "my-index",
  "fields": [
    {
      "name": "partnumbers",
      "type": "Edm.String",
      "facetable": true,
      "filterable": true,
      "key": false,
      "retrievable": true,
      "searchable": true,
      "sortable": true,
      "analyzer": null,
      "indexAnalyzer": null,
      "searchAnalyzer": null,
      "synonymMaps": [],
      "fields": []
    },
    {
      "name": "metadata_storage_name",
      "type": "Edm.String",
      "facetable": true,
      "filterable": true,
      "key": false,
      "retrievable": false,
      "searchable": true,
      "sortable": true,
      "analyzer": null,
      "indexAnalyzer": null,
      "searchAnalyzer": null,
      "synonymMaps": [],
      "fields": []
    },
    {
      "name": "metadata_storage_content_type",
      "type": "Edm.String",
      "facetable": true,
      "filterable": true,
      "key": false,
      "retrievable": false,
      "searchable": true,
      "sortable": true,
      "analyzer": null,
      "indexAnalyzer": null,
      "searchAnalyzer": null,
      "synonymMaps": [],
      "fields": []
    },
    {
      "name": "metadata_storage_last_modified",
      "type": "Edm.String",
      "facetable": true,
      "filterable": true,
      "key": false,
      "retrievable": false,
      "searchable": true,
      "sortable": true,
      "analyzer": null,
      "indexAnalyzer": null,
      "searchAnalyzer": null,
      "synonymMaps": [],
      "fields": []
    },
    {
      "name": "metadata_storage_path",
      "type": "Edm.String",
      "facetable": true,
      "filterable": true,
      "key": false,
      "retrievable": false,
      "searchable": true,
      "sortable": true,
      "analyzer": null,
      "indexAnalyzer": null,
      "searchAnalyzer": null,
      "synonymMaps": [],
      "fields": []
    },
    {
      "name": "metadata_storage_size",
      "type": "Edm.Int64",
      "facetable": true,
      "filterable": true,
      "retrievable": false,
      "sortable": true,
      "analyzer": null,
      "indexAnalyzer": null,
      "searchAnalyzer": null,
      "synonymMaps": [],
      "fields": []
    },
    {
      "name": "key",
      "type": "Edm.String",
      "facetable": true,
      "filterable": true,
      "key": true,
      "retrievable": true,
      "searchable": true,
      "sortable": true,
      "analyzer": null,
      "indexAnalyzer": null,
      "searchAnalyzer": null,
      "synonymMaps": [],
      "fields": []
    },
    {
      "name": "partialPartnumbers",
      "type": "Edm.String",
      "facetable": false,
      "filterable": false,
      "key": false,
      "retrievable": false,
      "searchable": true,
      "sortable": false,
      "analyzer": null,
      "indexAnalyzer": "prefixCmAnalyzer",
      "searchAnalyzer": "standardCmAnalyzer",
      "synonymMaps": [],
      "fields": []
    }
  ],
  "suggesters": [
    {
      "name": "my-index_suggester",
      "searchMode": "analyzingInfixMatching",
      "sourceFields": [
        "partnumbers"
      ]
    }
  ],
  "scoringProfiles": [
    {
      "name": "exactFirst",
      "functions": [],
      "functionAggregation": null,
      "text": {
        "weights": {
          "partnumbers": 2,
          "partialPartnumbers": 1
        }
      }
    }
  ],
  "defaultScoringProfile": "exactFirst",
  "corsOptions": null,
  "analyzers": [
    {
      "@odata.type": "#Microsoft.Azure.Search.CustomAnalyzer",
      "name": "standardCmAnalyzer",
      "tokenizer": "standard_v2",
      "tokenFilters": [
        "lowercase",
        "asciifolding"
      ],
      "charFilters": []
    },
    {
      "@odata.type": "#Microsoft.Azure.Search.CustomAnalyzer",
      "name": "prefixCmAnalyzer",
      "tokenizer": "standard_v2",
      "tokenFilters": [
        "lowercase",
        "asciifolding",
        "edgeNGramCmTokenFilter"
      ],
      "charFilters": []
    }
  ],
  "charFilters": [],
  "tokenFilters": [
    {
      "@odata.type": "#Microsoft.Azure.Search.EdgeNGramTokenFilterV2",
      "name": "edgeNGramCmTokenFilter",
      "minGram": 2,
      "maxGram": 20,
      "side": "front"
    }
  ],
  "tokenizers": [],
  "@odata.etag": "\"0x8D8184F367A74XX\""
}

Now I am struggling to find a way (through a specific syntax? an analyzer? a tokenizer?) to find all the files whose partnumbers metadata field contains one single SKU (so that I can retrieve all documents related to one product): I'd like to pass the SKU "102938" to Azure Search and have it return all the files that have this SKU in their partnumbers metadata field (potentially among other SKUs).

But I have a hard time finding examples on Google, and the documentation seems - for now - a bit out of my league (I am not really sure I properly understand what analyzers, tokenizers, etc. are and how they work! This is the first time I've dug into the "search" world...).

So I would really appreciate it if the community could help me with this. I'd be keen to read articles for beginners to understand everything, or tutorials, or anything that could help me move forward with this!

Thanks in advance.


Solution

  • OK, I just tried something that works: I set the built-in pattern analyzer on my partnumbers field (by default it splits text on non-word characters, such as the commas between SKUs), and when I tested it with the Analyze Text API, it did split my SKUs into several tokens. After that I could search for one SKU and it gave me back all the files I wanted! Here is my index JSON definition:

    {
      "name": "my-index",
      "fields": [
        {
          "name": "partnumbers",
          "type": "Edm.String",
          "facetable": true,
          "filterable": true,
          "key": false,
          "retrievable": true,
          "searchable": true,
          "sortable": true,
          "analyzer": "pattern",
          "indexAnalyzer": null,
          "searchAnalyzer": null,
          "synonymMaps": [],
          "fields": []
        },
        {
          "name": "metadata_storage_name",
          "type": "Edm.String",
          "facetable": true,
          "filterable": true,
          "key": false,
          "retrievable": true,
          "searchable": true,
          "sortable": true,
          "analyzer": null,
          "indexAnalyzer": null,
          "searchAnalyzer": null,
          "synonymMaps": [],
          "fields": []
        },
        {
          "name": "metadata_storage_content_type",
          "type": "Edm.String",
          "facetable": true,
          "filterable": true,
          "key": false,
          "retrievable": true,
          "searchable": true,
          "sortable": true,
          "analyzer": null,
          "indexAnalyzer": null,
          "searchAnalyzer": null,
          "synonymMaps": [],
          "fields": []
        },
        {
          "name": "metadata_storage_last_modified",
          "type": "Edm.String",
          "facetable": true,
          "filterable": true,
          "key": false,
          "retrievable": true,
          "searchable": true,
          "sortable": true,
          "analyzer": null,
          "indexAnalyzer": null,
          "searchAnalyzer": null,
          "synonymMaps": [],
          "fields": []
        },
        {
          "name": "metadata_storage_path",
          "type": "Edm.String",
          "facetable": true,
          "filterable": true,
          "key": false,
          "retrievable": true,
          "searchable": true,
          "sortable": true,
          "analyzer": null,
          "indexAnalyzer": null,
          "searchAnalyzer": null,
          "synonymMaps": [],
          "fields": []
        },
        {
          "name": "metadata_storage_size",
          "type": "Edm.Int64",
          "facetable": true,
          "filterable": true,
          "retrievable": true,
          "sortable": true,
          "analyzer": null,
          "indexAnalyzer": null,
          "searchAnalyzer": null,
          "synonymMaps": [],
          "fields": []
        },
        {
          "name": "key",
          "type": "Edm.String",
          "facetable": true,
          "filterable": true,
          "key": true,
          "retrievable": true,
          "searchable": true,
          "sortable": true,
          "analyzer": null,
          "indexAnalyzer": null,
          "searchAnalyzer": null,
          "synonymMaps": [],
          "fields": []
        },
        {
          "name": "name",
          "type": "Edm.String",
          "facetable": true,
          "filterable": true,
          "key": false,
          "retrievable": true,
          "searchable": true,
          "sortable": true,
          "analyzer": null,
          "indexAnalyzer": null,
          "searchAnalyzer": null,
          "synonymMaps": [],
          "fields": []
        },
        {
          "name": "partialPartnumbers",
          "type": "Edm.String",
          "facetable": false,
          "filterable": false,
          "key": false,
          "retrievable": false,
          "searchable": true,
          "sortable": false,
          "analyzer": null,
          "indexAnalyzer": "prefixCmAnalyzer",
          "searchAnalyzer": "standardCmAnalyzer",
          "synonymMaps": [],
          "fields": []
        },
        {
          "name": "partialName",
          "type": "Edm.String",
          "facetable": false,
          "filterable": false,
          "key": false,
          "retrievable": false,
          "searchable": true,
          "sortable": false,
          "analyzer": null,
          "indexAnalyzer": "prefixCmAnalyzer",
          "searchAnalyzer": "standardCmAnalyzer",
          "synonymMaps": [],
          "fields": []
        }
      ],
      "suggesters": [
        {
          "name": "conformity-certificates-index_suggester",
          "searchMode": "analyzingInfixMatching",
          "sourceFields": [
            "name"
          ]
        }
      ],
      "scoringProfiles": [
        {
          "name": "exactFirst",
          "functions": [],
          "functionAggregation": null,
          "text": {
            "weights": {
              "partnumbers": 4,
              "partialPartnumbers": 3,
              "name": 2,
              "partialName": 1
            }
          }
        }
      ],
      "defaultScoringProfile": "exactFirst",
      "corsOptions": null,
      "analyzers": [
        {
          "@odata.type": "#Microsoft.Azure.Search.CustomAnalyzer",
          "name": "standardCmAnalyzer",
          "tokenizer": "standard_v2",
          "tokenFilters": [
            "lowercase",
            "asciifolding"
          ],
          "charFilters": []
        },
        {
          "@odata.type": "#Microsoft.Azure.Search.CustomAnalyzer",
          "name": "prefixCmAnalyzer",
          "tokenizer": "standard_v2",
          "tokenFilters": [
            "lowercase",
            "asciifolding",
            "edgeNGramCmTokenFilter"
          ],
          "charFilters": []
        }
      ],
      "charFilters": [],
      "tokenFilters": [
        {
          "@odata.type": "#Microsoft.Azure.Search.EdgeNGramTokenFilterV2",
          "name": "edgeNGramCmTokenFilter",
          "minGram": 2,
          "maxGram": 20,
          "side": "front"
        }
      ],
      "tokenizers": [],
      "@odata.etag": "\"0x8D818EC80CXXXX\""
    }