I have the following query, which matches two collections using common fields, but there was a problem: the comparison was not case-insensitive, so I didn't get a full match. I then tried to make it case-insensitive.
Here's the query I was using:
cursor= db.csv_import.aggregate([
{
$lookup: {
from: 'EN',
let: {pn:'$part_no',vendor:'$vendor_standard'},
pipeline: [{
$match: {
$expr: {
$and: [{$eq: ["$$pn","$ICECAT-interface.Product.@Prod_id"]},{$eq: ["$$vendor","$ICECAT-interface.Product.Supplier.@Name"]}]
}
}
}],
as: 'part_number_info'
}
}, { $match: {"part_number_info.0": {$exists: true}}
}, { $project: {"part_no": 1,"part_number_info.ICECAT-interface.Product.@ID": 1, "part_number_info.ICECAT-interface.Product.Supplier.@Name": 1, "_id":0}}
]).pretty();
I've read about case-insensitive indexes here: https://www.mongodb.com/docs/manual/core/index-case-insensitive/
so I created two indexes, for vendor and product part number, on the two collections as follows (example for part number):
db.csv_import.createIndex(
{'part_no': 1},{name: "part_no_unsensitive_idx", collation: {locale: "en", strength: 2}})
db.EN.createIndex(
{'ICECAT-interface.Product.@Prod_id': 1},{name: "product_unsensitive_idx", collation: {locale: "en", strength: 2}})
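As a sanity check (an addition here, not part of my original attempt), a simple query that specifies the matching collation should be able to use such an index and match regardless of case; a sketch with a made-up part number:
// A point query with the same collation should use the collated index and
// match case-insensitively; "abc123" is an illustrative value only.
db.csv_import.find({part_no: "abc123"}).collation({locale: "en", strength: 2})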
and I tried this:
cursor= db.csv_import.aggregate([
{
$lookup: {
from: 'EN',
let: {pn:'$part_no',vendor:'$vendor_standard'},
pipeline: [{
$match: {
$expr: {
$and: [{$eq: ["$$pn","$ICECAT-interface.Product.@Prod_id"]},{$eq: ["$$vendor","$ICECAT-interface.Product.Supplier.@Name"]}]
}
}
}],
as: 'part_number_info'
}
}, { $match: {"part_number_info.0": {$exists: true}}
}, { $project: {"part_no": 1,"part_number_info.ICECAT-interface.Product.@ID": 1, "part_number_info.ICECAT-interface.Product.Supplier.@Name": 1, "_id":0}}
]).collation( { locale: 'en', strength: 2 }).pretty();
However, the execution time went from milliseconds to more than 10 minutes. Apparently it is not using the index at all. Is there an issue with the way I created the indexes, or am I missing something else here?
EDIT
I changed it to this as suggested (collation as an option after the pipeline), and although it improved (roughly half the time), it still takes about an hour, versus a few seconds without the collation:
use Icecat
db.csv_import.aggregate([
{
$lookup: {
from: 'EN',
let: {pn:'$part_no',vendor:'$vendor_standard'},
pipeline: [{
$match: {
$expr: {
$and: [{$eq: ["$$pn","$ICECAT-interface.Product.@Prod_id"]},{$eq: ["$$vendor","$ICECAT-interface.Product.Supplier.@Name"]}]
}
}
}],
as: 'part_number_info'
}
}, { $match: {"part_number_info.0": {$exists: true}}
}, { $project: {"part_no": 1,"part_number_info.ICECAT-interface.Product.@ID": 1, "part_number_info.ICECAT-interface.Product.Supplier.@Name": 1, "_id":0}
}
],
{collation: { locale: 'en', strength: 2 }}
).pretty();
EDIT 2
Output of explain:
{
"explainVersion": "1",
"stages": [
{
"$cursor": {
"queryPlanner": {
"namespace": "Icecat.csv_import",
"indexFilterSet": false,
"parsedQuery": {},
"queryHash": "005DB16E",
"planCacheKey": "E1018696",
"maxIndexedOrSolutionsReached": false,
"maxIndexedAndSolutionsReached": false,
"maxScansToExplodeReached": false,
"winningPlan": {
"stage": "PROJECTION_SIMPLE",
"transformBy": {
"part_no": 1,
"part_number_info": 1,
"vendor_standard": 1,
"_id": 0
},
"inputStage": {
"stage": "COLLSCAN",
"direction": "forward"
}
},
"rejectedPlans": []
},
"executionStats": {
"executionSuccess": true,
"nReturned": 23685,
"executionTimeMillis": 9051,
"totalKeysExamined": 0,
"totalDocsExamined": 23685,
"executionStages": {
"stage": "PROJECTION_SIMPLE",
"nReturned": 23685,
"executionTimeMillisEstimate": 18,
"works": 23687,
"advanced": 23685,
"needTime": 1,
"needYield": 0,
"saveState": 25,
"restoreState": 25,
"isEOF": 1,
"transformBy": {
"part_no": 1,
"part_number_info": 1,
"vendor_standard": 1,
"_id": 0
},
"inputStage": {
"stage": "COLLSCAN",
"nReturned": 23685,
"executionTimeMillisEstimate": 8,
"works": 23687,
"advanced": 23685,
"needTime": 1,
"needYield": 0,
"saveState": 25,
"restoreState": 25,
"isEOF": 1,
"direction": "forward",
"docsExamined": 23685
}
}
}
},
"nReturned": 23685,
"executionTimeMillisEstimate": 65
},
{
"$lookup": {
"from": "EN",
"as": "part_number_info",
"let": {
"pn": "$part_no",
"vendor": "$vendor_standard"
},
"pipeline": [
{
"$match": {
"$expr": {
"$and": [
{
"$eq": [
"$$pn",
"$ICECAT-interface.Product.@Prod_id"
]
},
{
"$eq": [
"$$vendor",
"$ICECAT-interface.Product.Supplier.@Name"
]
}
]
}
}
}
]
},
"totalDocsExamined": 2769,
"totalKeysExamined": 2769,
"collectionScans": 0,
"indexesUsed": [
"ICECAT-interface.Product.@Prod_id_1"
],
"nReturned": 23685,
"executionTimeMillisEstimate": 8918
},
{
"$match": {
"part_number_info.0": {
"$exists": true
}
},
"nReturned": 2690,
"executionTimeMillisEstimate": 8919
},
{
"$project": {
"part_no": true,
"part_number_info": {
"ICECAT-interface": {
"Product": {
"@ID": true,
"Supplier": {
"@Name": true
}
}
}
},
"_id": false
},
"nReturned": 2690,
"executionTimeMillisEstimate": 8919
}
],
"serverInfo": {
"host": "ip-10-0-1-199.eu-west-1.compute.internal",
"port": 27017,
"version": "5.0.9-8",
"gitVersion": "15a95b4ea8203b337be88bebbeea864b4dadb6a2"
},
"serverParameters": {
"internalQueryFacetBufferSizeBytes": 104857600,
"internalQueryFacetMaxOutputDocSizeBytes": 104857600,
"internalLookupStageIntermediateDocumentMaxSizeBytes": 104857600,
"internalDocumentSourceGroupMaxMemoryBytes": 104857600,
"internalQueryMaxBlockingSortMemoryUsageBytes": 104857600,
"internalQueryProhibitBlockingMergeOnMongoS": 0,
"internalQueryMaxAddToSetBytes": 104857600,
"internalDocumentSourceSetWindowFieldsMaxMemoryBytes": 104857600
},
"command": {
"aggregate": "csv_import",
"pipeline": [
{
"$lookup": {
"from": "EN",
"let": {
"pn": "$part_no",
"vendor": "$vendor_standard"
},
"pipeline": [
{
"$match": {
"$expr": {
"$and": [
{
"$eq": [
"$$pn",
"$ICECAT-interface.Product.@Prod_id"
]
},
{
"$eq": [
"$$vendor",
"$ICECAT-interface.Product.Supplier.@Name"
]
}
]
}
}
}
],
"as": "part_number_info"
}
},
{
"$match": {
"part_number_info.0": {
"$exists": true
}
}
},
{
"$project": {
"part_no": 1,
"part_number_info.ICECAT-interface.Product.@ID": 1,
"part_number_info.ICECAT-interface.Product.Supplier.@Name": 1,
"_id": 0
}
}
],
"cursor": {},
"$db": "Icecat"
},
"ok": 1
}
I think there may be two problems here.
The first is that it's not clear whether you are requesting the collation in the appropriate manner. While it may depend on the version, the documentation suggests that the syntax is:
db.myColl.aggregate(
<PIPELINE>,
{ collation: { locale: "en", strength: 2 } }
);
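A quick way to verify which form the server actually received is to run the aggregation through explain with the collation passed that way; a sketch, with the pipeline elided for brevity:
// Request execution stats with the collation supplied as an option to
// aggregate(); the collation should then show up in the "command" section
// of the explain output.
db.csv_import.explain("executionStats").aggregate(
    [ /* the $lookup pipeline from above */ ],
    { collation: { locale: "en", strength: 2 } }
);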
The second thing is that, somewhat similar to one of your previous questions, I think the field names are wrong here. In your $lookup pipeline for the EN collection you have:
$and: [{$eq: ["$$pn","$ICECAT-interface.Product.@Prod_id"]}] ...
But the index you created on the EN collection was:
db.EN.createIndex( {'ICECAT-interface.Product.@ID': 1}, ...
Notice the end of the field name is @Prod_id versus @ID. One of the two needs to be updated.
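Assuming the @Prod_id spelling used by the pipeline is the intended one, one way to reconcile the two would be to rebuild the index on that field while keeping the case-insensitive collation; a sketch reusing the index name from the question:
// Drop the mismatched index and recreate it on the field that the $lookup
// actually compares against, with the same collation.
db.EN.dropIndex("product_unsensitive_idx");
db.EN.createIndex(
    {'ICECAT-interface.Product.@Prod_id': 1},
    {name: "product_unsensitive_idx", collation: {locale: "en", strength: 2}}
);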
Edit based on the explain plan (thanks!) and additional questions.
The explain plan appears to show that things are mostly working as expected. More specifically, we see:
- A collection scan against the csv_import collection. Note that this is expected because the aggregation is not doing any filtering on that collection, so there is nothing for an index to help with.
- Index usage for the EN collection. The $lookup entry in the stages array reports "indexesUsed": [ "ICECAT-interface.Product.@Prod_id_1" ] along with "totalDocsExamined": 2769 and "totalKeysExamined": 2769. Finally, for good measure, it additionally reports "collectionScans": 0.
- A total execution time of "executionTimeMillis": 9051.
So why do I say this is "mostly working", and what's up with the difference between the 9 seconds reported here and your comment that "what used to take seconds now takes about an hour (when adding the collation)"?
Well, what I'm not seeing in the explain output is a mention of the collation. When I generate an explain plan (using the syntax previously mentioned of specifying the collation as an option in the .aggregate() function call), the command portion of the response looks like this:
command: {
aggregate: 'csv_import',
pipeline: [ ... ],
cursor: {},
collation: { locale: 'en', strength: 2 },
'$db': 'test'
}
Specifically, the collation is stated directly. This is absent in your explain output. Therefore I believe that:
- The explain plan we are looking at is for the aggregation without using collation. This also implies that your "ICECAT-interface.Product.@Prod_id_1" index does not have a collation applied to it. You should be able to confirm that by inspecting the .getIndexes() output from that collection.
- When you do apply the collation and attempt to execute the operation, the database will not be able to use the index (due to the mismatch mentioned in the previous point), which would definitely explain the long duration.
In summary - it looks like you still have a mismatch between the collation defined by the index on the EN collection and the aggregation that you are running. Please go double-check and correct those items.
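For reference, the .getIndexes() check suggested above might look like the following; the abbreviated output shape shown in the comments is illustrative:
// List the indexes on the EN collection; an index built with a collation
// carries a "collation" subdocument in its spec. If that subdocument is
// absent, the index was created without one and a collated aggregation
// cannot use it.
db.EN.getIndexes()
// Expected (abbreviated) entry when the collation is present:
// { v: 2, key: { "ICECAT-interface.Product.@Prod_id": 1 },
//   name: "product_unsensitive_idx",
//   collation: { locale: "en", strength: 2, ... } }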