I'm trying to count the documents in a collection for a given filter. Using countDocuments() takes up to 1 s per 100,000 documents, which is rather slow, and that is not even considering the load it puts on the database when multiple users trigger this count every few seconds to minutes.
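For reference, the baseline is a plain countDocuments() call on the reactive-streams Java driver (same collection and RxJava setup as in the Java snippets further down); a minimal sketch, where the origin filter is only an illustrative example:
// countDocuments() is executed as a $match/$group aggregation on the server (shown below)
var countPublisher = collection.countDocuments(Filters.eq("origin", "WF"));
long count = Flowable.fromPublisher(countPublisher).blockingFirst();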
Under the hood, countDocuments() translates to this aggregate query:
db.collection.aggregate([
{
"$match": {
"_id": {"$exists": true}
}
},
{
"$group": {
"_id": 1,
"n": {"$sum": 1}
}
}
])
Calling explain() on it yields:
{
explainVersion: '1',
stages: [
{
'$cursor': {
queryPlanner: {
namespace: 'collection',
indexFilterSet: false,
parsedQuery: { _id: { '$exists': true } },
queryHash: 'BA029CD5',
planCacheKey: '4D66BB31',
maxIndexedOrSolutionsReached: false,
maxIndexedAndSolutionsReached: false,
maxScansToExplodeReached: false,
winningPlan: {
stage: 'FETCH',
filter: { _id: { '$exists': true } },
inputStage: {
stage: 'IXSCAN',
keyPattern: { _id: 1 },
indexName: '_id_',
isMultiKey: false,
multiKeyPaths: { _id: [] },
isUnique: true,
isSparse: false,
isPartial: false,
indexVersion: 2,
direction: 'forward',
indexBounds: { _id: [ '[MinKey, MaxKey]' ] }
}
},
rejectedPlans: []
},
executionStats: {
executionSuccess: true,
nReturned: 614833,
executionTimeMillis: 6589,
totalKeysExamined: 614833,
totalDocsExamined: 614833,
executionStages: {
stage: 'FETCH',
filter: { _id: { '$exists': true } },
nReturned: 614833,
executionTimeMillisEstimate: 4219,
works: 614834,
advanced: 614833,
needTime: 0,
needYield: 0,
saveState: 668,
restoreState: 668,
isEOF: 1,
docsExamined: 614833,
alreadyHasObj: 0,
inputStage: {
stage: 'IXSCAN',
nReturned: 614833,
executionTimeMillisEstimate: 568,
works: 614834,
advanced: 614833,
needTime: 0,
needYield: 0,
saveState: 668,
restoreState: 668,
isEOF: 1,
keyPattern: { _id: 1 },
indexName: '_id_',
isMultiKey: false,
multiKeyPaths: { _id: [] },
isUnique: true,
isSparse: false,
isPartial: false,
indexVersion: 2,
direction: 'forward',
indexBounds: { _id: [ '[MinKey, MaxKey]' ] },
keysExamined: 614833,
seeks: 1,
dupsTested: 0,
dupsDropped: 0
}
},
allPlansExecution: []
}
},
nReturned: Long("614833"),
executionTimeMillisEstimate: Long("6510")
},
{
'$group': { _id: { '$const': 1 }, n: { '$sum': { '$const': 1 } } },
maxAccumulatorMemoryUsageBytes: { n: Long("80") },
totalOutputDataSizeBytes: Long("237"),
usedDisk: false,
spills: Long("0"),
nReturned: Long("1"),
executionTimeMillisEstimate: Long("6584")
}
],
command: {
aggregate: 'collection',
pipeline: [
{ '$match': { _id: { '$exists': true } } },
{ '$group': { _id: 1, n: { '$sum': 1 } } }
],
cursor: {},
'$db': 'db'
},
ok: 1,
'$clusterTime': {
clusterTime: Timestamp({ t: 1677593996, i: 1 }),
signature: {
hash: Binary(Buffer.from("0000000000000000000000000000000000000000", "hex"), 0),
keyId: Long("0")
}
},
operationTime: Timestamp({ t: 1677593996, i: 1 })
}
Notice how the winning plan consists of two stages: a FETCH on top of an IXSCAN. A find() query that does not reference the _id field and instead projects onto an indexed field (called origin) uses only an IXSCAN instead:
db.collection.find({}, {"_id":0, "origin": 1}).count()
This counts the correct result and can also take a filter. Calling .explain() on it yields:
{
explainVersion: '1',
queryPlanner: {
namespace: 'db.collection',
indexFilterSet: false,
parsedQuery: { origin: { '$eq': 'WF' } },
queryHash: '2428EDD1',
planCacheKey: '85C94249',
maxIndexedOrSolutionsReached: false,
maxIndexedAndSolutionsReached: false,
maxScansToExplodeReached: false,
winningPlan: {
stage: 'PROJECTION_COVERED',
transformBy: { _id: 0, origin: 1 },
inputStage: {
stage: 'IXSCAN',
keyPattern: { origin: 1, 'metadata.iv': -1 },
indexName: 'originAscending',
isMultiKey: false,
multiKeyPaths: { origin: [], 'metadata.iv': [] },
isUnique: false,
isSparse: false,
isPartial: false,
indexVersion: 2,
direction: 'forward',
indexBounds: {
origin: [ '["WF", "WF"]' ],
'metadata.iv': [ '[MaxKey, MinKey]' ]
}
}
},
rejectedPlans: [
{
stage: 'PROJECTION_COVERED',
transformBy: { _id: 0, origin: 1 },
inputStage: {
stage: 'IXSCAN',
keyPattern: { origin: -1, 'metadata.iv': -1 },
indexName: 'originDescending',
isMultiKey: false,
multiKeyPaths: { origin: [], 'metadata.iv': [] },
isUnique: false,
isSparse: false,
isPartial: false,
indexVersion: 2,
direction: 'forward',
indexBounds: {
origin: [ '["WF", "WF"]' ],
'metadata.iv': [ '[MaxKey, MinKey]' ]
}
}
}
]
},
command: {
find: 'collection',
filter: { origin: 'WF' },
projection: { _id: 0, origin: 1 },
'$db': 'db'
},
ok: 1,
'$clusterTime': {
clusterTime: Timestamp({ t: 1677593816, i: 1 }),
signature: {
hash: Binary(Buffer.from("0000000000000000000000000000000000000000", "hex"), 0),
keyId: Long("0")
}
},
operationTime: Timestamp({ t: 1677593816, i: 1 })
}
This only uses an IXSCAN stage (the query is covered by the index) and executes much faster.
I tried to build this query with the MongoDB Java driver, version 4.8.0, by writing the following (still in progress):
private int aggregate(InvoiceQuery query) {
    // translate the application-level filter criteria into a Bson filter
    var filter = new MongoInvoiceFilterCriteria(query.getFilter()).asBson();
    // exclude _id and project only the indexed "origin" field, as in the mongosh query
    var projection = Projections.fields(Projections.excludeId(), Projections.include("origin"));
    // explain the query instead of executing it, to inspect the winning plan
    var findPublisher = this.collection.find(filter).projection(projection).explain();
    var result = Flowable.fromPublisher(findPublisher).blockingFirst();
    LOGGER.info("Aggregate " + result.toJson());
    return 0; // TODO: return the actual count
}
This yields the following:
{
"explainVersion": "1",
"queryPlanner": {
"namespace": "db.collection",
"indexFilterSet": false,
"parsedQuery": {
"_id": {
"$exists": true
}
},
"queryHash": "88DBAD21",
"planCacheKey": "D9A2E277",
"maxIndexedOrSolutionsReached": false,
"maxIndexedAndSolutionsReached": false,
"maxScansToExplodeReached": false,
"winningPlan": {
"stage": "PROJECTION_SIMPLE",
"transformBy": {
"_id": 0,
"origin": 1
},
"inputStage": {
"stage": "FETCH",
"filter": {
"_id": {
"$exists": true
}
},
"inputStage": {
"stage": "IXSCAN",
"keyPattern": {
"_id": 1
},
"indexName": "_id_",
"isMultiKey": false,
"multiKeyPaths": {
"_id": []
},
"isUnique": true,
"isSparse": false,
"isPartial": false,
"indexVersion": 2,
"direction": "forward",
"indexBounds": {
"_id": [
"[MinKey, MaxKey]"
]
}
}
}
},
"rejectedPlans": []
},
"executionStats": {
"executionSuccess": true,
"nReturned": 9533,
"executionTimeMillis": 18,
"totalKeysExamined": 9533,
"totalDocsExamined": 9533,
"executionStages": {
"stage": "PROJECTION_SIMPLE",
"nReturned": 9533,
"executionTimeMillisEstimate": 4,
"works": 9534,
"advanced": 9533,
"needTime": 0,
"needYield": 0,
"saveState": 9,
"restoreState": 9,
"isEOF": 1,
"transformBy": {
"_id": 0,
"origin": 1
},
"inputStage": {
"stage": "FETCH",
"filter": {
"_id": {
"$exists": true
}
},
"nReturned": 9533,
"executionTimeMillisEstimate": 4,
"works": 9534,
"advanced": 9533,
"needTime": 0,
"needYield": 0,
"saveState": 9,
"restoreState": 9,
"isEOF": 1,
"docsExamined": 9533,
"alreadyHasObj": 0,
"inputStage": {
"stage": "IXSCAN",
"nReturned": 9533,
"executionTimeMillisEstimate": 0,
"works": 9534,
"advanced": 9533,
"needTime": 0,
"needYield": 0,
"saveState": 9,
"restoreState": 9,
"isEOF": 1,
"keyPattern": {
"_id": 1
},
"indexName": "_id_",
"isMultiKey": false,
"multiKeyPaths": {
"_id": []
},
"isUnique": true,
"isSparse": false,
"isPartial": false,
"indexVersion": 2,
"direction": "forward",
"indexBounds": {
"_id": [
"[MinKey, MaxKey]"
]
},
"keysExamined": 9533,
"seeks": 1,
"dupsTested": 0,
"dupsDropped": 0
}
}
}
},
"command": {
"find": "collection",
"filter": {
"_id": {
"$exists": true
}
},
"projection": {
"_id": 0,
"origin": 1
},
"$db": "db"
},
"ok": 1.0,
"$clusterTime": {
"clusterTime": {
"$timestamp": {
"t": 1677769219,
"i": 1
}
},
"signature": {
"hash": {
"$binary": {
"base64": "AAAAAAAAAAAAAAAAAAAAAAAAAAA=",
"subType": "00"
}
},
"keyId": 0
}
},
"operationTime": {
"$timestamp": {
"t": 1677769219,
"i": 1
}
}
}
So again, for some reason, MongoDB executes both a FETCH and an IXSCAN stage, although to my understanding this query should perform the same as the mongosh one.
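A possible next step (just a sketch for comparison, not from the measurements above) would be to issue the filter from the mongosh example literally from Java and compare the resulting plans, to rule out the filter produced by MongoInvoiceFilterCriteria:
// hypothetical comparison: mirror the mongosh query verbatim and explain it
var explainPublisher = this.collection
        .find(Filters.eq("origin", "WF"))
        .projection(Projections.fields(Projections.excludeId(), Projections.include("origin")))
        .explain(ExplainVerbosity.EXECUTION_STATS);
var plan = Flowable.fromPublisher(explainPublisher).blockingFirst();
LOGGER.info("Literal filter plan " + plan.toJson());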
I considered using countDocuments(), estimatedDocumentCount(), and various aggregate() and find() queries, but I couldn't find anything as efficient as db.collection.find({}, {"_id":0, "origin": 1}).count() run directly in mongosh.
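For completeness, estimatedDocumentCount() is fast because it reads the count from collection metadata, but it does not accept a filter, so it cannot answer the filtered count described in the first paragraph; a minimal sketch of that call:
// estimatedDocumentCount() uses collection metadata: fast, but no filter support
long estimate = Flowable.fromPublisher(collection.estimatedDocumentCount()).blockingFirst();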
How can I improve the performance of the document count in Java?
Edit, in response to the answer provided by @Noel:
// build the aggregation pipeline
List<Bson> pipeline = Arrays.asList(
Aggregates.match(Filters.gte("_id", new MinKey())),
Aggregates.group("$1", Accumulators.sum("n", 1)));
// explain the execution stats of the aggregation pipeline
var findPublisher =
this.collection.aggregate(pipeline).explain();
This one also has a FETCH stage first:
{
"explainVersion": "2",
"queryPlanner": {
"namespace": "db.collection",
"indexFilterSet": false,
"parsedQuery": {
"_id": {
"$gte": {
"$minKey": 1
}
}
},
"queryHash": "D1046F5E",
"planCacheKey": "7E518BFB",
"optimizedPipeline": true,
"maxIndexedOrSolutionsReached": false,
"maxIndexedAndSolutionsReached": false,
"maxScansToExplodeReached": false,
"winningPlan": {
"queryPlan": {
"stage": "GROUP",
"planNodeId": 3,
"inputStage": {
"stage": "FETCH",
"planNodeId": 2,
"inputStage": {
"stage": "IXSCAN",
"planNodeId": 1,
"keyPattern": {
"_id": 1
},
"indexName": "_id_",
"isMultiKey": false,
"multiKeyPaths": {
"_id": []
},
"isUnique": true,
"isSparse": false,
"isPartial": false,
"indexVersion": 2,
"direction": "forward",
"indexBounds": {
"_id": [
"[MinKey, MaxKey]"
]
}
}
}
},
"slotBasedPlan": {
}
},
"rejectedPlans": []
},
"executionStats": {
},
"command": {
"aggregate": "collection",
"pipeline": [
{
"$match": {
"_id": {
"$gte": {
"$minKey": 1
}
}
}
},
{
"$group": {
"_id": "$1",
"n": {
"$sum": 1
}
}
}
],
"cursor": {
"batchSize": 1
},
"$db": "db"
}
}
I think you should be targeting COUNT_SCAN as the winning plan. The following query uses the index on _id to count:
db.collection.explain("executionStats").aggregate([
{
$match: {
"_id": {"$gte": MinKey}
}
},
{
$group: {
"_id": 1, "n": {"$sum": 1}
}
}
]);
Java:
List<Bson> pipeline = Arrays.asList(
Aggregates.match(Filters.gte("_id", new MinKey())),
Aggregates.group("1", Accumulators.sum("n", 1))
);
Note: This may not trigger COUNT_SCAN on sharded collections.
Stats:
"winningPlan" : {
"stage" : "COUNT_SCAN",
"keyPattern" : {
"_id" : 1
},
"indexName" : "_id_",
"isMultiKey" : false,
"multiKeyPaths" : {
"_id" : []
},
"isUnique" : true,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"indexBounds" : {
"startKey" : {
"_id" : { "$minKey" : 1 }
},
"startKeyInclusive" : true,
"endKey" : {
"_id" : { "$maxKey" : 1 }
},
"endKeyInclusive" : true
}
}
"executionStats" : {
"executionSuccess" : true,
"nReturned" : 9,
"executionTimeMillis" : 0,
"totalKeysExamined" : 10,
"totalDocsExamined" : 0,
"executionStages" : {
"stage" : "COUNT_SCAN",
"nReturned" : 9,
"executionTimeMillisEstimate" : 0,
"works" : 10,
"advanced" : 9,
"needTime" : 0,
"needYield" : 0,
"saveState" : 1,
"restoreState" : 1,
"isEOF" : 1,
"keysExamined" : 10,
"keyPattern" : {
"_id" : 1
},
"indexName" : "_id_",
"isMultiKey" : false,
"multiKeyPaths" : {
"_id" : []
},
"isUnique" : true,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"indexBounds" : {
"startKey" : {
"_id" : { "$minKey" : 1 }
},
"startKeyInclusive" : true,
"endKey" : {
"_id" : { "$maxKey" : 1 }
},
"endKeyInclusive" : true
}
    }
}
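To read the count back in Java once the plan looks right, running the same pipeline and extracting the n field should be enough; a minimal sketch, assuming the reactive-streams driver and RxJava setup from the question:
List<Bson> pipeline = Arrays.asList(
        Aggregates.match(Filters.gte("_id", new MinKey())),
        Aggregates.group("1", Accumulators.sum("n", 1)));
// blockingFirst with a default covers an empty collection, where $group emits no document
var result = Flowable.fromPublisher(collection.aggregate(pipeline))
        .blockingFirst(new Document("n", 0));
long count = result.get("n", Number.class).longValue();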