Search code examples
mongodbmongodb-querymonk

how to get data in batches in mongodb


I want to retrieve data from MongoDB, 5 at a time

I am using limit to limit the number of records returned

router.post('/List', function (req, res) {
    var db = req.db;
    var collection = db.get('clnName');
    collection.find({}, { limit: 5 * req.body.requestCount }, function (e, docs) {
        res.json(docs);
    });
});

Here, from the client i am incrementing the requestCount variable so that i get data in multiples of 5. What i want to achieve is get first 5 data in first request, get next 5 data in second request, but what is happening is, i get first 5 data and then first 10 data.

What change should i make to achieve what i need?

will the use of batch size in mongo cursor methods solve my problem?


Solution

  • Well the clear an obvious case here is to use .skip() as a modifier along with .limit() in order to implement "paging" of data:

        collection.find({}, { "limit": 5, "skip": 5 * req.body.requestCount  }, function 
    

    But better still if you are just processing in batches, just filter out the range you have already seen. The _id field makes a nice identifier for this without other sorting. So on the first request:

    var lastSeen = null;
        collection.find(
            {}, 
            { "limit": 5, "sort": { "_id": 1}  },
            function(err,docs) {
               docs.forEach(function(doc) {
                   // do something
                   lastSeen = doc._id;        // keep the _id
               });
            }
        );
    

    And the next time around after storing that "lastSeen" in something like a session variable ( or other loop construct where just processing batches ):

        collection.find(
            { "_id": { "$gt": lastSeen }, 
            { "limit": 5, "sort": { "_id": 1}  },
            function(err,docs) {
               docs.forEach(function(doc) {
                   // do something
                   lastSeen = doc._id;        // keep the _id
               });
            }
        );
    

    So excluding all results less that the last _id value seen.

    With other sorting this is still possible, but you need to take note of both the last _id seen and the last sorted value as well. Also keeping the _id seen as a list since the last value change.

        var lastSeenIds = [],
            lastSeenValue = null;    
    
        collection.find(
            {}, 
            { "limit": 5, "sort": { "other": 1, "_id": 1 }  },
            function(err,docs) {
               docs.forEach(function(doc) {
                   // do something
                   if ( lastSeenValue != doc.other ) {  // clear on change
                       lastSeenValue = doc.other;
                       lastSeenIds = [];
                   }
                   lastSeenIds.push(doc._id);     // keep a list
               });
            }
        );
    

    Then on your next iteration with variables in place:

        collection.find(
            { "_id": { "$nin": lastSeenIds }, "other": { "$gte": lastSeenValue } },
            { "limit": 5, "sort": { "other": 1, "_id": 1 }  },
            function(err,docs) {
               docs.forEach(function(doc) {
                   // do something
                   if ( lastSeenValue != doc.other ) {  // clear on change
                       lastSeenValue = doc.other;
                       lastSeenIds = [];
                   }
                   lastSeenIds.push(doc._id);     // keep a list
               });
            }
        );
    

    That is a lot more efficient than "skipping" through the results that match the basic query condition.