For background, the page from the docs that seems most relevant to my question is this page on multikey index bounds and index intersection.
I'm having an issue where it seems like MongoDB is not properly combining index boundaries in an $elemMatch query on a field in an array of subdocuments.
The docs don't seem to say that my specific use case should not work and there are plenty of examples that hit very close to what I'm doing. From a technical standpoint I can't come up with any reason this shouldn't be working. Did I miss something or can someone explain to me why this behaves the way it does?
I'm starting with a collection of documents like this:
mongos> db.test.findOne()
{
"_id" : ObjectId("54c7fdaa9a9950e75fa616b9"),
"data" : [
{
"point" : 1,
"other" : "what"
}
]
}
I have indexes in place like this:
mongos> db.test.getIndexes()
[
{
"v" : 1,
"key" : {
"_id" : 1
},
"name" : "_id_",
"ns" : "temp.test"
},
{
"v" : 1,
"key" : {
"data.point" : 1
},
"name" : "data.point_1",
"ns" : "temp.test"
}
]
When the data array only has one subdoc in it like this:
mongos> db.test.find()
{ "_id" : ObjectId("54c7fdaa9a9950e75fa616b9"), "data" : [ { "point" : 1, "other" : "what" } ] }
{ "_id" : ObjectId("54c7fdaf9a9950e75fa616ba"), "data" : [ { "point" : 2, "other" : "who" } ] }
{ "_id" : ObjectId("54c7fdb59a9950e75fa616bb"), "data" : [ { "point" : 3, "other" : "where" } ] }
An $elemMatch query on data works just fine:
mongos> db.test.find({data: {$elemMatch: {point: {$gte: 2, $lte: 2}}}})
{ "_id" : ObjectId("54c7fdaf9a9950e75fa616ba"), "data" : [ { "point" : 2, "other" : "who" } ] }
mongos> db.test.find({data: {$elemMatch: {point: {$gte: 2, $lte: 2}}}}).explain(true)
{
"cursor" : "BtreeCursor data.point_1",
"isMultiKey" : false,
"n" : 1,
"nscannedObjects" : 1,
"nscanned" : 1,
"nscannedObjectsAllPlans" : 1,
"nscannedAllPlans" : 1,
"scanAndOrder" : false,
"indexOnly" : false,
"nYields" : 0,
"nChunkSkips" : 0,
"millis" : 0,
"indexBounds" : {
"data.point" : [
[
2,
2
]
]
},
"allPlans" : [
{
"cursor" : "BtreeCursor data.point_1",
"isMultiKey" : false,
"n" : 1,
"nscannedObjects" : 1,
"nscanned" : 1,
"scanAndOrder" : false,
"indexOnly" : false,
"nChunkSkips" : 0,
"indexBounds" : {
"data.point" : [
[
2,
2
]
]
}
}
],
"server" : "XXXXXX",
"filterSet" : false,
"stats" : {
"type" : "KEEP_MUTATIONS",
"works" : 2,
"yields" : 0,
"unyields" : 0,
"invalidates" : 0,
"advanced" : 1,
"needTime" : 0,
"needFetch" : 0,
"isEOF" : 1,
"children" : [
{
"type" : "FETCH",
"works" : 2,
"yields" : 0,
"unyields" : 0,
"invalidates" : 0,
"advanced" : 1,
"needTime" : 0,
"needFetch" : 0,
"isEOF" : 1,
"alreadyHasObj" : 0,
"forcedFetches" : 0,
"matchTested" : 1,
"children" : [
{
"type" : "IXSCAN",
"works" : 2,
"yields" : 0,
"unyields" : 0,
"invalidates" : 0,
"advanced" : 1,
"needTime" : 0,
"needFetch" : 0,
"isEOF" : 1,
"keyPattern" : "{ data.point: 1.0 }",
"isMultiKey" : 0,
"boundsVerbose" : "field #0['data.point']: [2.0, 2.0]",
"yieldMovedCursor" : 0,
"dupsTested" : 0,
"dupsDropped" : 0,
"seenInvalidated" : 0,
"matchTested" : 0,
"keysExamined" : 1,
"children" : [ ]
}
]
}
]
},
"millis" : 0
}
But when I add any document with more than one subdoc inside the data
array, like this:
mongos> db.test.insert({data: [{point: 3, other: 'where'}, {point:4, other:"huh"}]})
WriteResult({ "nInserted" : 1 })
mongos> db.test.find()
{ "_id" : ObjectId("54c7fdaa9a9950e75fa616b9"), "data" : [ { "point" : 1, "other" : "what" } ] }
{ "_id" : ObjectId("54c7fdaf9a9950e75fa616ba"), "data" : [ { "point" : 2, "other" : "who" } ] }
{ "_id" : ObjectId("54c7fdb59a9950e75fa616bb"), "data" : [ { "point" : 3, "other" : "where" } ] }
{ "_id" : ObjectId("54c806c39a9950e75fa616bc"), "data" : [ { "point" : 3, "other" : "where" }, { "point" : 4, "other" : "huh" } ] }
The query takes orders of magnitude longer (in nontrivial test cases, that is). The explain output shows the index bounds changing from the proper [2, 2] to [-Infinity, 2], and the isMultiKey flag ticks over to true:
mongos> db.test.find({data: {$elemMatch: {point: {$gte: 2, $lte: 2}}}}).explain(true)
{
"cursor" : "BtreeCursor data.point_1",
"isMultiKey" : true,
"n" : 1,
"nscannedObjects" : 2,
"nscanned" : 2,
"nscannedObjectsAllPlans" : 2,
"nscannedAllPlans" : 2,
"scanAndOrder" : false,
"indexOnly" : false,
"nYields" : 0,
"nChunkSkips" : 0,
"millis" : 0,
"indexBounds" : {
"data.point" : [
[
-Infinity,
2
]
]
},
"allPlans" : [
{
"cursor" : "BtreeCursor data.point_1",
"isMultiKey" : true,
"n" : 1,
"nscannedObjects" : 2,
"nscanned" : 2,
"scanAndOrder" : false,
"indexOnly" : false,
"nChunkSkips" : 0,
"indexBounds" : {
"data.point" : [
[
-Infinity,
2
]
]
}
}
],
"server" : "XXXXXX",
"filterSet" : false,
"stats" : {
"type" : "KEEP_MUTATIONS",
"works" : 3,
"yields" : 0,
"unyields" : 0,
"invalidates" : 0,
"advanced" : 1,
"needTime" : 1,
"needFetch" : 0,
"isEOF" : 1,
"children" : [
{
"type" : "FETCH",
"works" : 3,
"yields" : 0,
"unyields" : 0,
"invalidates" : 0,
"advanced" : 1,
"needTime" : 1,
"needFetch" : 0,
"isEOF" : 1,
"alreadyHasObj" : 0,
"forcedFetches" : 0,
"matchTested" : 1,
"children" : [
{
"type" : "IXSCAN",
"works" : 3,
"yields" : 0,
"unyields" : 0,
"invalidates" : 0,
"advanced" : 2,
"needTime" : 0,
"needFetch" : 0,
"isEOF" : 1,
"keyPattern" : "{ data.point: 1.0 }",
"isMultiKey" : 1,
"boundsVerbose" : "field #0['data.point']: [-inf.0, 2.0]",
"yieldMovedCursor" : 0,
"dupsTested" : 2,
"dupsDropped" : 0,
"seenInvalidated" : 0,
"matchTested" : 0,
"keysExamined" : 2,
"children" : [ ]
}
]
}
]
},
"millis" : 0
}
It's still using the correct btree index and getting the correct results, but this issue is the difference between usable and unusable on large datasets.
I know that the query I'm using is also equivalent to this:
db.test.find({data: {$elemMatch: {point: 2}}})
But I'm doing that for simplicity - the same behavior is observed when specifying any bounds with any of $gt $gte $lt $lte in that the index bounds are improperly set in the index (or so it seems to me).
For reference, when I run the above query, I actually get the index bounds I would expect — so it's not as if MongoDB is incapable of producing the plan I want against arrays with multiple subdocuments:
{
"cursor" : "BtreeCursor data.point_1",
"isMultiKey" : true,
"n" : 1,
"nscannedObjects" : 1,
"nscanned" : 1,
"nscannedObjectsAllPlans" : 1,
"nscannedAllPlans" : 1,
"scanAndOrder" : false,
"indexOnly" : false,
"nYields" : 0,
"nChunkSkips" : 0,
"millis" : 0,
"indexBounds" : {
"data.point" : [
[
2,
2
]
]
},
"allPlans" : [
{
"cursor" : "BtreeCursor data.point_1",
"isMultiKey" : true,
"n" : 1,
"nscannedObjects" : 1,
"nscanned" : 1,
"scanAndOrder" : false,
"indexOnly" : false,
"nChunkSkips" : 0,
"indexBounds" : {
"data.point" : [
[
2,
2
]
]
}
}
],
"server" : "XXXXXX",
"filterSet" : false,
"stats" : {
"type" : "KEEP_MUTATIONS",
"works" : 2,
"yields" : 0,
"unyields" : 0,
"invalidates" : 0,
"advanced" : 1,
"needTime" : 0,
"needFetch" : 0,
"isEOF" : 1,
"children" : [
{
"type" : "FETCH",
"works" : 2,
"yields" : 0,
"unyields" : 0,
"invalidates" : 0,
"advanced" : 1,
"needTime" : 0,
"needFetch" : 0,
"isEOF" : 1,
"alreadyHasObj" : 0,
"forcedFetches" : 0,
"matchTested" : 1,
"children" : [
{
"type" : "IXSCAN",
"works" : 2,
"yields" : 0,
"unyields" : 0,
"invalidates" : 0,
"advanced" : 1,
"needTime" : 0,
"needFetch" : 0,
"isEOF" : 1,
"keyPattern" : "{ data.point: 1.0 }",
"isMultiKey" : 1,
"boundsVerbose" : "field #0['data.point']: [2.0, 2.0]",
"yieldMovedCursor" : 0,
"dupsTested" : 1,
"dupsDropped" : 0,
"seenInvalidated" : 0,
"matchTested" : 0,
"keysExamined" : 1,
"children" : [ ]
}
]
}
]
},
"millis" : 0
}
... So again - Did I miss something? Am I doing it wrong? Is there a work around? Is this a bug or a known issue?
I'm using MongoDB v2.6.5 in a sharded, replicated cluster.
After searching through mongodb's jira, I found a known issue that explains this behavior.