I have the following query, which matches two collections using common fields, but there was a problem: the comparison was not case-insensitive, so I didn't get a full match. I then tried to make it case-insensitive.
Here's the query I was using:
cursor= db.csv_import.aggregate([
{
$lookup: {
from: 'EN',
let: {pn:'$part_no',vendor:'$vendor_standard'},
pipeline: [{
$match: {
$expr: {
$and: [{$eq: ["$$pn","$ICECAT-interface.Product.@Prod_id"]},{$eq: ["$$vendor","$ICECAT-interface.Product.Supplier.@Name"]}]
}
}
}],
as: 'part_number_info'
}
}, { $match: {"part_number_info.0": {$exists: true}}
}, { $project: {"part_no": 1,"part_number_info.ICECAT-interface.Product.@ID": 1, "part_number_info.ICECAT-interface.Product.Supplier.@Name": 1, "_id":0}}
]).pretty();
I've read about case-insensitive indexes here: https://www.mongodb.com/docs/manual/core/index-case-insensitive/
so I created two indexes, for vendor and product part number, on the two collections as follows (example for part number):
db.csv_import.createIndex(
{'part_no': 1},{name: "part_no_unsensitive_idx", collation: {locale: "en", strength: 2}})
db.EN.createIndex(
{'ICECAT-interface.Product.@Prod_id': 1},{name: "product_unsensitive_idx", collation: {locale: "en", strength: 2}})
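As a sanity check (an addition here, not part of my original attempt), a simple query that specifies the matching collation should be able to use such an index and match regardless of case; a sketch with a made-up part number:
// A point query with the same collation should use the collated index and
// match case-insensitively; "abc123" is an illustrative value only.
db.csv_import.find({part_no: "abc123"}).collation({locale: "en", strength: 2})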
and I tried this:
cursor= db.csv_import.aggregate([
{
$lookup: {
from: 'EN',
let: {pn:'$part_no',vendor:'$vendor_standard'},
pipeline: [{
$match: {
$expr: {
$and: [{$eq: ["$$pn","$ICECAT-interface.Product.@Prod_id"]},{$eq: ["$$vendor","$ICECAT-interface.Product.Supplier.@Name"]}]
}
}
}],
as: 'part_number_info'
}
}, { $match: {"part_number_info.0": {$exists: true}}
}, { $project: {"part_no": 1,"part_number_info.ICECAT-interface.Product.@ID": 1, "part_number_info.ICECAT-interface.Product.Supplier.@Name": 1, "_id":0}}
]).collation( { locale: 'en', strength: 2 }).pretty();
However, the execution time went from milliseconds to more than 10 minutes. Apparently it is not using the index at all. Is there an issue with the way I created the indexes, or am I missing something else here?
EDIT
I changed it to this as suggested (collation as an option after the pipeline), and although it improved (roughly half the time), it still takes about an hour, versus a few seconds without the collation:
use Icecat
db.csv_import.aggregate([
{
$lookup: {
from: 'EN',
let: {pn:'$part_no',vendor:'$vendor_standard'},
pipeline: [{
$match: {
$expr: {
$and: [{$eq: ["$$pn","$ICECAT-interface.Product.@Prod_id"]},{$eq: ["$$vendor","$ICECAT-interface.Product.Supplier.@Name"]}]
}
}
}],
as: 'part_number_info'
}
}, { $match: {"part_number_info.0": {$exists: true}}
}, { $project: {"part_no": 1,"part_number_info.ICECAT-interface.Product.@ID": 1, "part_number_info.ICECAT-interface.Product.Supplier.@Name": 1, "_id":0}
}
],
{collation: { locale: 'en', strength: 2 }}
).pretty();
EDIT 2
Output of explain:
{
"explainVersion": "1",
"stages": [
{
"$cursor": {
"queryPlanner": {
"namespace": "Icecat.csv_import",
"indexFilterSet": false,
"parsedQuery": {},
"queryHash": "005DB16E",
"planCacheKey": "E1018696",
"maxIndexedOrSolutionsReached": false,
"maxIndexedAndSolutionsReached": false,
"maxScansToExplodeReached": false,
"winningPlan": {
"stage": "PROJECTION_SIMPLE",
"transformBy": {
"part_no": 1,
"part_number_info": 1,
"vendor_standard": 1,
"_id": 0
},
"inputStage": {
"stage": "COLLSCAN",
"direction": "forward"
}
},
"rejectedPlans": []
},
"executionStats": {
"executionSuccess": true,
"nReturned": 23685,
"executionTimeMillis": 9051,
"totalKeysExamined": 0,
"totalDocsExamined": 23685,
"executionStages": {
"stage": "PROJECTION_SIMPLE",
"nReturned": 23685,
"executionTimeMillisEstimate": 18,
"works": 23687,
"advanced": 23685,
"needTime": 1,
"needYield": 0,
"saveState": 25,
"restoreState": 25,
"isEOF": 1,
"transformBy": {
"part_no": 1,
"part_number_info": 1,
"vendor_standard": 1,
"_id": 0
},
"inputStage": {
"stage": "COLLSCAN",
"nReturned": 23685,
"executionTimeMillisEstimate": 8,
"works": 23687,
"advanced": 23685,
"needTime": 1,
"needYield": 0,
"saveState": 25,
"restoreState": 25,
"isEOF": 1,
"direction": "forward",
"docsExamined": 23685
}
}
}
},
"nReturned": 23685,
"executionTimeMillisEstimate": 65
},
{
"$lookup": {
"from": "EN",
"as": "part_number_info",
"let": {
"pn": "$part_no",
"vendor": "$vendor_standard"
},
"pipeline": [
{
"$match": {
"$expr": {
"$and": [
{
"$eq": [
"$$pn",
"$ICECAT-interface.Product.@Prod_id"
]
},
{
"$eq": [
"$$vendor",
"$ICECAT-interface.Product.Supplier.@Name"
]
}
]
}
}
}
]
},
"totalDocsExamined": 2769,
"totalKeysExamined": 2769,
"collectionScans": 0,
"indexesUsed": [
"ICECAT-interface.Product.@Prod_id_1"
],
"nReturned": 23685,
"executionTimeMillisEstimate": 8918
},
{
"$match": {
"part_number_info.0": {
"$exists": true
}
},
"nReturned": 2690,
"executionTimeMillisEstimate": 8919
},
{
"$project": {
"part_no": true,
"part_number_info": {
"ICECAT-interface": {
"Product": {
"@ID": true,
"Supplier": {
"@Name": true
}
}
}
},
"_id": false
},
"nReturned": 2690,
"executionTimeMillisEstimate": 8919
}
],
"serverInfo": {
"host": "ip-10-0-1-199.eu-west-1.compute.internal",
"port": 27017,
"version": "5.0.9-8",
"gitVersion": "15a95b4ea8203b337be88bebbeea864b4dadb6a2"
},
"serverParameters": {
"internalQueryFacetBufferSizeBytes": 104857600,
"internalQueryFacetMaxOutputDocSizeBytes": 104857600,
"internalLookupStageIntermediateDocumentMaxSizeBytes": 104857600,
"internalDocumentSourceGroupMaxMemoryBytes": 104857600,
"internalQueryMaxBlockingSortMemoryUsageBytes": 104857600,
"internalQueryProhibitBlockingMergeOnMongoS": 0,
"internalQueryMaxAddToSetBytes": 104857600,
"internalDocumentSourceSetWindowFieldsMaxMemoryBytes": 104857600
},
"command": {
"aggregate": "csv_import",
"pipeline": [
{
"$lookup": {
"from": "EN",
"let": {
"pn": "$part_no",
"vendor": "$vendor_standard"
},
"pipeline": [
{
"$match": {
"$expr": {
"$and": [
{
"$eq": [
"$$pn",
"$ICECAT-interface.Product.@Prod_id"
]
},
{
"$eq": [
"$$vendor",
"$ICECAT-interface.Product.Supplier.@Name"
]
}
]
}
}
}
],
"as": "part_number_info"
}
},
{
"$match": {
"part_number_info.0": {
"$exists": true
}
}
},
{
"$project": {
"part_no": 1,
"part_number_info.ICECAT-interface.Product.@ID": 1,
"part_number_info.ICECAT-interface.Product.Supplier.@Name": 1,
"_id": 0
}
}
],
"cursor": {},
"$db": "Icecat"
},
"ok": 1
}
I think there may be two problems here.
The first is that it's not clear whether you are requesting the collation in the appropriate manner. While it may depend on the version, the documentation suggests that the syntax is:
db.myColl.aggregate(
<PIPELINE>,
{ collation: { locale: "en", strength: 2 } }
);
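A quick way to verify which form the server actually received is to run the aggregation through explain with the collation passed that way; a sketch, with the pipeline elided for brevity:
// Request execution stats with the collation supplied as an option to
// aggregate(); the collation should then show up in the "command" section
// of the explain output.
db.csv_import.explain("executionStats").aggregate(
    [ /* the $lookup pipeline from above */ ],
    { collation: { locale: "en", strength: 2 } }
);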
The second thing is that, somewhat similar to one of your previous questions, I think the field names are wrong here. In your $lookup pipeline for the EN collection you have:
$and: [{$eq: ["$$pn","$ICECAT-interface.Product.@Prod_id"]}] ...
But the index you created on the EN collection was:
db.EN.createIndex( {'ICECAT-interface.Product.@ID': 1}, ...
Notice the end of the field name is @Prod_id versus @ID. One of the two needs to be updated.
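Assuming the @Prod_id spelling used by the pipeline is the intended one, one way to reconcile the two would be to rebuild the index on that field while keeping the case-insensitive collation; a sketch reusing the index name from the question:
// Drop the mismatched index and recreate it on the field that the $lookup
// actually compares against, with the same collation.
db.EN.dropIndex("product_unsensitive_idx");
db.EN.createIndex(
    {'ICECAT-interface.Product.@Prod_id': 1},
    {name: "product_unsensitive_idx", collation: {locale: "en", strength: 2}}
);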
Edit based on the explain plan (thanks!) and additional questions.
The explain plan appears to show that things are mostly working as expected. More specifically, we see:
- A collection scan against the csv_import collection. Note that this is expected because the aggregation is not doing any filtering on that collection, so there is nothing for an index to help with.
- Index usage for the EN collection. The $lookup entry in the stages array reports "indexesUsed": [ "ICECAT-interface.Product.@Prod_id_1" ] along with "totalDocsExamined": 2769 and "totalKeysExamined": 2769. Finally, for good measure, it additionally reports "collectionScans": 0.
- A total execution time of "executionTimeMillis": 9051.
So why do I say this is "mostly working", and what's up with the difference between the 9 seconds reported here and your comment that "what used to take seconds now takes about an hour (when adding the collation)"?
Well, what I'm not seeing in the explain output is a mention of the collation. When I generate an explain plan (using the syntax previously mentioned of specifying the collation as an option in the .aggregate() function call), the command portion of the response looks like this:
command: {
aggregate: 'csv_import',
pipeline: [ ... ],
cursor: {},
collation: { locale: 'en', strength: 2 },
'$db': 'test'
}
Specifically, the collation is stated directly. This is absent in your explain output. Therefore I believe that:
- The explain plan we are looking at is for the aggregation without using collation. This also implies that your "ICECAT-interface.Product.@Prod_id_1" index does not have a collation applied to it. You should be able to confirm that by inspecting the .getIndexes() output from that collection.
- When you do apply the collation and attempt to execute the operation, the database will not be able to use the index (due to the mismatch mentioned in the previous point), which would definitely explain the long duration.
In summary - it looks like you still have a mismatch between the collation defined by the index on the EN collection and the aggregation that you are running. Please go double-check and correct those items.
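For reference, the .getIndexes() check suggested above might look like the following; the abbreviated output shape shown in the comments is illustrative:
// List the indexes on the EN collection; an index built with a collation
// carries a "collation" subdocument in its spec. If that subdocument is
// absent, the index was created without one and a collated aggregation
// cannot use it.
db.EN.getIndexes()
// Expected (abbreviated) entry when the collation is present:
// { v: 2, key: { "ICECAT-interface.Product.@Prod_id": 1 },
//   name: "product_unsensitive_idx",
//   collation: { locale: "en", strength: 2, ... } }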