I'm trying to query results from a large dataset called 'tasks' containing 187297 documents, which are nested in another dataset called 'workers', which is in turn nested in a collection called 'production_units'.
production_units -> workers -> tasks
(BTW this is a simplified version of production_units):
[{
"_id": ObjectId("5aca27b926974863ed9f01ab"),
"name": "Z",
"workers": [{
"name": "X Y",
"worker_number": 655,
"employed": false,
"_id": ObjectId("5aca27bd26974863ed9f0425"),
"tasks": [{
"_id": ObjectId("5ac9f6c2e1a668d6d39c1fd1"),
"inbound_order_number": 3296,
"task_number": 90,
"minutes_elapsed": 120,
"date": "2004-11-30",
"start": 1101823200,
"pieces_actual": 160,
"pause_from": 1101812400,
"pause_to": 1101814200
}]
}]
}]
In order to accomplish this I have used the following aggregation command:
// Paginated task listing: flattens production_units -> workers -> tasks,
// keeps tasks that have a 'start' field, attaches the total number of
// matching tasks to every row, sorts by 'start' and returns one page of 10.
db.production_units.aggregate([{
// Stage 1: keep only the 'workers' array of each production unit.
'$project': {
'workers': '$workers'
}
}, {
// Stage 2: emit one document per worker.
'$unwind': '$workers'
}, {
// Stage 3: lift the worker's tasks array and worker_number to the top level.
'$project': {
'tasks': '$workers.tasks',
'worker_number': '$workers.worker_number'
}
}, {
// Stage 4: emit one document per task.
'$unwind': '$tasks'
}, {
// Stage 5: flatten each task's fields to the top level; 'worker_number': 1
// carries the already-lifted worker number through unchanged.
'$project': {
'task_number': '$tasks.task_number',
'pieces_actual': '$tasks.pieces_actual',
'minutes_elapsed': '$tasks.minutes_elapsed',
'worker_number': 1,
'start': '$tasks.start',
'inbound_order_number': '$tasks.inbound_order_number',
'pause_from': '$tasks.pause_from',
'date': '$tasks.date',
'_id': '$tasks._id',
'pause_to': '$tasks.pause_to'
}
}, {
// Stage 6: keep only tasks that have a 'start' field.
'$match': {
'start': {
'$exists': true
}
}
}, {
// Stage 7: group everything under a single key (_id: null) to count the
// matching tasks, pushing every task document into one 'entries' array.
// NOTE(review): this materialises all matching tasks inside a single
// document before paging - verify it stays within document size limits.
'$group': {
'entries_count': {
'$sum': 1
},
'_id': null,
'entries': {
'$push': '$$ROOT'
}
}
}, {
// Stage 8: drop the null group key, keep the count and the entries array.
'$project': {
'entries_count': 1,
'_id': 0,
'entries': 1
}
}, {
// Stage 9: expand 'entries' back to one document per task; each document
// still carries the total 'entries_count'.
'$unwind': '$entries'
}, {
// Stage 10: flatten the entry fields back to the top level next to the count.
'$project': {
'task_number': '$entries.task_number',
'pieces_actual': '$entries.pieces_actual',
'minutes_elapsed': '$entries.minutes_elapsed',
'worker_number': '$entries.worker_number',
'start': '$entries.start',
'inbound_order_number': '$entries.inbound_order_number',
'pause_from': '$entries.pause_from',
'date': '$entries.date',
'entries_count': 1,
'_id': '$entries._id',
'pause_to': '$entries.pause_to'
}
}, {
// Stage 11: order all tasks by 'start', ascending.
'$sort': {
'start': 1
}
}, {
// Stage 12: pagination offset (here: skip to the last rows of the result).
'$skip': 187290
}, {
// Stage 13: page size.
'$limit': 10
}], {
// Allow pipeline stages to spill temporary data to disk.
allowDiskUse: true
})
And the returned documents are:
{ "entries_count" : 187297, "task_number" : 100, "pieces_actual" : 68, "minutes_elapsed" : 102, "worker_number" : 411, "start" : 1594118400, "inbound_order_number" : 8569, "pause_from" : 1594119600, "date" : "2020-07-07", "_id" : ObjectId("5ac9f6d3e1a668d6d3a06351"), "pause_to" : 1594119600 } { "entries_count" : 187297, "task_number" : 130, "pieces_actual" : 20, "minutes_elapsed" : 30, "worker_number" : 549, "start" : 1596531600, "inbound_order_number" : 7683, "pause_from" : 1596538800, "date" : "2020-08-04", "_id" : ObjectId("5ac9f6cde1a668d6d39f1b26"), "pause_to" : 1596538800 } { "entries_count" : 187297, "task_number" : 210, "pieces_actual" : 84, "minutes_elapsed" : 180, "worker_number" : 734, "start" : 1601276400, "inbound_order_number" : 8330, "pause_from" : 1601290800, "date" : "2020-09-28", "_id" : ObjectId("5ac9f6d0e1a668d6d39fd677"), "pause_to" : 1601290800 } { "entries_count" : 187297, "task_number" : 20, "pieces_actual" : 64, "minutes_elapsed" : 90, "worker_number" : 114, "start" : 1601800200, "inbound_order_number" : 7690, "pause_from" : 1601809200, "date" : "2020-10-04", "_id" : ObjectId("5ac9f6cee1a668d6d39f3032"), "pause_to" : 1601811900 } { "entries_count" : 187297, "task_number" : 140, "pieces_actual" : 70, "minutes_elapsed" : 84, "worker_number" : 49, "start" : 1603721640, "inbound_order_number" : 4592, "pause_from" : 1603710000, "date" : "2020-10-26", "_id" : ObjectId("5ac9f6c8e1a668d6d39df664"), "pause_to" : 1603712700 } { "entries_count" : 187297, "task_number" : 80, "pieces_actual" : 20, "minutes_elapsed" : 30, "worker_number" : 277, "start" : 1796628600, "inbound_order_number" : 4655, "pause_from" : 1796641200, "date" : "2026-12-07", "_id" : ObjectId("5ac9f6c8e1a668d6d39e1fc0"), "pause_to" : 1796643900 } { "entries_count" : 187297, "task_number" : 40, "pieces_actual" : 79, "minutes_elapsed" : 123, "worker_number" : 96, "start" : 3802247580, "inbound_order_number" : 4592, "pause_from" : 3802244400, "date" : "2090-06-27", "_id" : 
ObjectId("5ac9f6c8e1a668d6d39de218"), "pause_to" : 3802244400 }
However, the query takes several seconds to return the results, instead of a few milliseconds. This is the result returned by the profiler:
db.system.profile.findOne().millis 3216
(UPDATE)
Even the following simplified count query takes 312 ms to execute, instead of a few milliseconds:
// Simplified count: flatten workers and their tasks, then count the
// resulting task documents.
db.production_units.aggregate([{
// One document per worker.
"$unwind": "$workers"
}, {
// One document per task of each worker.
"$unwind": "$workers.tasks"
},
{
// Emit a single document { entries_count: <number of task documents> }.
"$count": "entries_count"
}
])
This is what explain() returns for the query above:
{
"stages" : [
{
"$cursor" : {
"query" : {
},
"fields" : {
"workers" : 1,
"_id" : 0
},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "my_db.production_units",
"indexFilterSet" : false,
"parsedQuery" : {
},
"winningPlan" : {
"stage" : "COLLSCAN",
"direction" : "forward"
},
"rejectedPlans" : [ ]
},
"executionStats" : {
"executionSuccess" : true,
"nReturned" : 28,
"executionTimeMillis" : 13,
"totalKeysExamined" : 0,
"totalDocsExamined" : 28,
"executionStages" : {
"stage" : "COLLSCAN",
"nReturned" : 28,
"executionTimeMillisEstimate" : 0,
"works" : 30,
"advanced" : 28,
"needTime" : 1,
"needYield" : 0,
"saveState" : 1,
"restoreState" : 1,
"isEOF" : 1,
"invalidates" : 0,
"direction" : "forward",
"docsExamined" : 28
},
"allPlansExecution" : [ ]
}
}
},
{
"$unwind" : {
"path" : "$workers"
}
},
{
"$unwind" : {
"path" : "$workers.tasks"
}
},
{
"$group" : {
"_id" : {
"$const" : null
},
"entries_count" : {
"$sum" : {
"$const" : 1
}
}
}
},
{
"$project" : {
"_id" : false,
"entries_count" : true
}
}
],
"ok" : 1
}
I'm not an experienced DBA, so I don't know exactly what I'm missing in my aggregation pipeline to solve the performance issue I'm facing. I have also investigated and researched the problem, but without finding a solution.
What am I missing?
Without the explain() output of the query, it's impossible to know for sure what the bottleneck is. However, here is some advice on how to improve this query:
Use a single $project stage, at the end of the pipeline
The query contains 5 $project stages, when actually only one is needed. This can add a lot of overhead, especially when applied to a large number of documents.
Instead, use dot notation to query nested fields, for example:
{ "$unwind": "$workers.tasks" }
Use $match as early as possible
$match removes some of the documents, so add it as early as possible so that the following aggregation stages are applied to a smaller number of documents.
Apply $skip and $limit before $project
As the query returns only 10 documents, there is no need to apply the $project stage to the 180000 other documents.
Index the field used by $sort
The $sort stage is likely to be the bottleneck. Make sure that the field workers.tasks.start is indexed (see MongoDB ensureIndex() for details).
Count the matching documents with a separate query
Instead of using the $group/$unwind stages to count the matching documents, run a second query at the same time that only counts the number of matching documents.
The main query now looks like:
// Main (page) query: flatten workers and tasks via dot notation, filter,
// sort and page first, and only then reshape the 10 remaining documents.
db.collection.aggregate([{
// One document per worker.
"$unwind": "$workers"
}, {
// One document per task of each worker.
"$unwind": "$workers.tasks"
}, {
// Keep only tasks whose 'start' is not null.
"$match": {
"workers.tasks.start": {
"$ne": null
}
}
},
{
// Order tasks by start, ascending.
"$sort": {
"workers.tasks.start": 1
}
}, {
// Pagination offset (0 = first page).
"$skip": 0
}, {
// Page size.
"$limit": 10
},
{
// Single $project at the end: flatten the nested task fields for the
// documents that survived $skip/$limit.
"$project": {
"task_number": "$workers.tasks.task_number",
"pieces_actual": "$workers.tasks.pieces_actual",
"minutes_elapsed": "$workers.tasks.minutes_elapsed",
"worker_number": "$workers.worker_number",
"start": "$workers.tasks.start",
"inbound_order_number": "$workers.tasks.inbound_order_number",
"pause_from": "$workers.tasks.pause_from",
"date": "$workers.tasks.date",
"_id": "$workers.tasks._id",
"pause_to": "$workers.tasks.pause_to"
}
}
])
You can try it here: mongoplayground.net/p/yua7qspo2Jj
The count query would be:
// Companion count query: same flattening and filter as the page query,
// but returns only the number of matching tasks.
db.collection.aggregate([{
// One document per worker.
"$unwind": "$workers"
}, {
// One document per task of each worker.
"$unwind": "$workers.tasks"
}, {
// Keep only tasks whose 'start' is not null.
"$match": {
"workers.tasks.start": {
"$ne": null
}
}
},
{
// Emit a single document { entries_count: <number of matching tasks> }.
"$count": "entries_count"
}
])
the count query would look like