Search code examples
mongodbmapreducemedian

Efficient Median Calculation in MongoDB


We have a Mongo collection named analytics and it tracks user visits by a cookie id. We want to calculate medians for several variables as users visit different pages.

Mongo does not yet have an internal method for calculating the median. I have used the below method for determining it, but I'm afraid there is be a more efficient way as I'm pretty new to JS. Any comments would be appreciated.

// Saves the JS function for calculating the Median. Makes it accessible to the Reducer.
db.system.js.save({_id: "myMedianValue",
    value: function (sortedArray) {
    var m = 0.0;
    if (sortedArray.length % 2 === 0) {
        //Even numbered array, average the middle two values
        idx2 = sortedArray.length / 2;
        idx1 = idx2 - 1;
        m = (sortedArray[idx1] + sortedArray[idx2]) / 2;
    } else {
        //Odd numbered array, take the middle value
        idx = Math.floor(sortedArray.length/2);
        m = sortedArray[idx];
    }
        return m
    }
});


var mapFunction = function () {
    key = this.cookieId;
    value = {
        // If there is only 1 view it will look like this
        // If there are multiple it gets passed to the reduceFunction
        medianVar1: this.Var1,
        medianVar2: this.Var2,
        viewCount: 1
    };

    emit(key, value);
    };

var reduceFunction = function(keyCookieId, valueDicts) {
    Var1Array = Array();
    Var2Array = Array();
    views = 0;

    for (var idx = 0; idx < valueDicts.length; idx++) {
        Var1Array.push(valueDicts[idx].medianVar1);
        Var2Array.push(valueDicts[idx].medianVar2);
        views += valueDicts[idx].viewCount;
    }


    reducedDict = {
        medianVar1: myMedianValue(Var1Array.sort(function(a, b){return a-b})),
        medianVar2: myMedianValue(Var2Array.sort(function(a, b){return a-b})),
        viewCount: views
    };

    return reducedDict
    };


db.analytics.mapReduce(mapFunction,
                       reduceFunction,
                       { out: "analytics_medians",
                         query: {Var1: {$exists:true},
                                 Var2: {$exists:true}
                                 }}
                                 )

Solution

  • We ended up updating the medians every page request, rather than in bulk with a cron job or something. We have a Node API that uses Mongo's aggregation framework to do the match/sort the user's results. The array of results then pass to a median function within Node. The results are then written back to Mongo for that user. Not super pleased with it, but it doesn't appear to have locking issues and is performing well.