Search code examples
azure, azure-cognitive-services, azure-ai-search

Azure AI Search Index - Multiple indexers and chunking


I have an indexer that reads through blob storage, chunks, and vectorizes the data into an index. This is working great. I also have a key field, let's call it fileID, that is stored in the metadata of the document and is also in the index. It is unique to the document; however, it is not unique after chunking, because a document will be split into multiple documents, each with the same fileID.

I want to have a second indexer that can add data from a SQL query into the index, joined on that fileID. However, I can't use fileID as the key anymore, because chunking makes it non-unique and a key needs to be unique. How can I merge the data from the SQL query indexer into the index?

I'm guessing this is not possible right now but if anyone has any suggestions, that would be amazing!


Solution

  • I ended up doing this with a custom Web API skill backed by an Azure Function, which takes a record ID and returns additional fields from a SQL Server database.

    Custom Skill Web API Documentation

    Here is my skillset definition when all is said and done.

        {
      "name": "myindexskillset",
      "description": "Skillset to chunk documents and generate embeddings",
      "skills": [
        {
          "@odata.type": "#Microsoft.Skills.Custom.WebApiSkill",
          "name": "#1",
          "description": "This skill calls an Azure function to get additional metadata from sql database",
          "context": "/document",
          "uri": "customskillendpointgoeshere",
          "httpMethod": "POST",
          "timeout": "PT30S",
          "batchSize": 100,
          "degreeOfParallelism": 1,
          "inputs": [
            {
              "name": "systemOfRecordFileId",
              "source": "/document/SystemOfRecordFileID"
            }
          ],
          "outputs": [
            {
              "name": "recordType",
              "targetName": "RecordType"
            },
            {
              "name": "referenceRecordId",
              "targetName": "ReferenceRecordID"
            },
            {
              "name": "recordTitle",
              "targetName": "RecordTitle"
            },
            {
              "name": "sponsorName",
              "targetName": "SponsorName"
            },
            {
              "name": "piFullName",
              "targetName": "PIFullName"
            },
            {
              "name": "subrecipientName",
              "targetName": "SubrecipientName"
            }
          ]
        },
        {
          "@odata.type": "#Microsoft.Skills.Text.AzureOpenAIEmbeddingSkill",
          "name": "#2",
          "description": null,
          "context": "/document/pages/*",
          "resourceUri": "<omitted>",
          "apiKey": "<omitted>",
          "deploymentId": "text-embedding-ada-002",
          "inputs": [
            {
              "name": "text",
              "source": "/document/pages/*"
            }
          ],
          "outputs": [
            {
              "name": "embedding",
              "targetName": "vector"
            }
          ],
          "authIdentity": null
        },
        {
          "@odata.type": "#Microsoft.Skills.Text.SplitSkill",
          "name": "#3",
          "description": "Split skill to chunk documents",
          "context": "/document",
          "defaultLanguageCode": "en",
          "textSplitMode": "pages",
          "maximumPageLength": 2000,
          "pageOverlapLength": 500,
          "maximumPagesToTake": 0,
          "inputs": [
            {
              "name": "text",
              "source": "/document/mergedText"
            }
          ],
          "outputs": [
            {
              "name": "textItems",
              "targetName": "pages"
            }
          ]
        },
        {
          "@odata.type": "#Microsoft.Skills.Text.MergeSkill",
          "name": "#4",
          "description": null,
          "context": "/document",
          "insertPreTag": " ",
          "insertPostTag": " ",
          "inputs": [
            {
              "name": "text",
              "source": "/document/content"
            },
            {
              "name": "itemsToInsert",
              "source": "/document/normalized_images/*/text"
            },
            {
              "name": "offsets",
              "source": "/document/normalized_images/*/contentOffset"
            }
          ],
          "outputs": [
            {
              "name": "mergedText",
              "targetName": "mergedText"
            }
          ]
        },
        {
          "@odata.type": "#Microsoft.Skills.Vision.OcrSkill",
          "name": "#5",
          "description": null,
          "context": "/document/normalized_images/*",
          "textExtractionAlgorithm": null,
          "lineEnding": "Space",
          "defaultLanguageCode": "en",
          "detectOrientation": true,
          "inputs": [
            {
              "name": "image",
              "source": "/document/normalized_images/*"
            }
          ],
          "outputs": [
            {
              "name": "text",
              "targetName": "text"
            }
          ]
        }
      ],
      "cognitiveServices": {
        "@odata.type": "#Microsoft.Azure.Search.CognitiveServicesByKey",
        "description": null,
        "key": "<omitted>"
      },
      "knowledgeStore": null,
      "indexProjections": {
        "selectors": [
          {
            "targetIndexName": "myIndex",
            "parentKeyFieldName": "parent_id",
            "sourceContext": "/document/pages/*",
            "mappings": [
              {
                "name": "chunk",
                "source": "/document/pages/*",
                "sourceContext": null,
                "inputs": []
              },
              {
                "name": "vector",
                "source": "/document/pages/*/vector",
                "sourceContext": null,
                "inputs": []
              },
              {
                "name": "title",
                "source": "/document/metadata_storage_name",
                "sourceContext": null,
                "inputs": []
              },
              {
                "name": "SystemOfRecordFileID",
                "source": "/document/SystemOfRecordFileID",
                "sourceContext": null,
                "inputs": []
              },
              {
                "name": "RecordType",
                "source": "/document/RecordType",
                "sourceContext": null,
                "inputs": []
              },
              {
                "name": "ReferenceRecordID",
                "source": "/document/ReferenceRecordID",
                "sourceContext": null,
                "inputs": []
              },
              {
                "name": "RecordTitle",
                "source": "/document/RecordTitle",
                "sourceContext": null,
                "inputs": []
              },
              {
                "name": "SponsorName",
                "source": "/document/SponsorName",
                "sourceContext": null,
                "inputs": []
              },
              {
                "name": "PIFullName",
                "source": "/document/PIFullName",
                "sourceContext": null,
                "inputs": []
              },
              {
                "name": "SubrecipientName",
                "source": "/document/SubrecipientName",
                "sourceContext": null,
                "inputs": []
              }
            ]
          }
        ],
        "parameters": {
          "projectionMode": "skipIndexingParentDocuments"
        }
      },
      "encryptionKey": null
    }