Search code examples
mongodb mapreduce kinvey nosql

How to group documents by matching array elements with MapReduce in MongoDB?


I have a database with a column containing an array of strings. Example table:

name | words                          | ...
Ash  | ["Apple", "Pear", "Plum"]      | ...
Joe  | ["Walnut", "Peanut"]           | ...
Max  | ["Pineapple", "Apple", "Plum"] | ...

Now I would like to match this table against a given array of words and group the documents by their matching rate.

Example input with expected result:

// matched for input = ["Walnut", "Peanut", "Apple"]
{
  "1.00": [{name:"Joe", match:"1.00"}],
  "0.33": [{name:"Ash", match:"0.33"}, {name:"Max", match:"0.33"}]
}

I am using the following map function emitting the document with the matching rate as the key:

/**
 * Map step: emits the current document keyed by its matching rate.
 *
 * Runs with `this` bound to the document being processed and reads
 * `this.words` and `this.name`. `input` (the array of words to match against)
 * is not defined here; it has to be visible in the function's scope --
 * presumably supplied through the `scope` option of mapReduce (TODO confirm
 * against the caller).
 *
 * The rate is the number of `input` words found in `this.words` divided by
 * `input.length`, formatted with two decimals. That string is used both as
 * the emitted key and as the `match` field of the emitted value.
 */
function map() {
    var matches = 0;
    // Index loop rather than for...in: for...in would also visit any
    // enumerable properties added to arrays, not only the elements.
    for (var i = 0; i < input.length; i++) {
        if (this.words.indexOf(input[i]) !== -1) {
            matches += 1;
        }
    }
    // An empty input would give 0/0 = NaN and a "NaN" key; report 0 instead.
    var rate = input.length > 0 ? matches / input.length : 0;
    var key = rate.toFixed(2); // toFixed already returns a string
    emit(key, {name: this.name, match: key});
}

What is still missing is a matching reduce function that combines the emitted key-value pairs into the result object.

I have tried it like this:

/**
 * Reduce attempt: wraps the values collected for one key in an object of the
 * form { <key>: [ ...values ] }.
 *
 * NOTE: the returned object has a different shape than the values emitted by
 * map. When the result of one call is passed back in as one of the values of
 * a later call for the same key, it is wrapped again, which produces nested
 * result objects.
 *
 * @param key   the key shared by all entries of `value`
 * @param value array of values collected for `key`
 * @returns an object whose single property `key` holds `value`
 */
function reduce(key, value) {
    var res = {};
    // Use the declared parameter `value`; there is no `values` in scope.
    res[key] = value;
    return res;
}

However I have trouble with the specification that

MongoDB can invoke the reduce function more than once for the same key. In this case, the previous output from the reduce function for that key will become one of the input values to the next reduce function invocation for that key.

...resulting in nested result objects. What is the correct way to group documents by their match?


Solution

  • invoke the reduce function more than once for the same key.

    This requirement means the reduce function must be idempotent: feeding its own output back in as one of the input values must still produce a correct result.

    To keep this simple, you just have to make sure that the value emitted by map has the same format as the value returned by reduce.

    For your case, something like this will work:

    // Seed the collection with the three sample documents, one insert per document.
    [
        {"name": "Ash", "words": ["Apple", "Pear", "Plum"]},
        {"name": "Joe", "words": ["Walnut", "Peanut"]},
        {"name": "Max", "words": ["Pineapple", "Apple", "Plum"]}
    ].forEach(function (sampleDoc) {
        db.col.insert(sampleDoc);
    });
    
    /**
     * Map step: emits the current document under its matching rate.
     *
     * Runs with `this` bound to the document being processed and reads
     * `this.words` and `this.name`. The rate is the number of `input` words
     * found in `this.words` divided by `input.length`, formatted with two
     * decimals; that string is the emitted key.
     *
     * The emitted value has the shape { users: [ { name, match } ] }, which is
     * the same shape reduce returns, so reduce output can be fed back into
     * reduce for the same key.
     */
    function map() {
        // Declared with var so the word list stays local instead of becoming
        // an implicit global.
        var input = ["Walnut", "Peanut", "Apple"];

        var matches = 0;
        // Index loop rather than for...in, which is not meant for arrays.
        for (var i = 0; i < input.length; i++) {
            if (this.words.indexOf(input[i]) !== -1) {
                matches += 1;
            }
        }
        var key = (matches / input.length).toFixed(2); // already a string

        emit(key, {users: [{name: this.name, match: key}]});
    }
    
    /**
     * Reduce step: merges all values of one key into a single
     * { users: [...] } object.
     *
     * Every element of `value` is expected to carry a `users` array -- either
     * a value emitted by map or the result of an earlier reduce call for the
     * same key. The return value has that same shape, so it can be passed
     * back in as an input value without producing nested results.
     *
     * @param key   the key shared by all entries of `value` (not used here)
     * @param value array of { users: [...] } objects
     * @returns a new { users: [...] } object holding all user entries in
     *          input order
     */
    function reduce(key, value) {
        // Collect into a fresh local array: no implicit global is created and
        // value[0] is left untouched.
        var users = [];

        for (var i = 0; i < value.length; i++) {
            users = users.concat(value[i].users);
        }

        return {users: users};
    }
    
    // Run the job with the inline output option.
    db.col.mapReduce(map, reduce, { out: { inline: 1 } });
    

    Output:

    {
        "results" : [
            {
                "_id" : "0.33",
                "value" : {
                    "users" : [
                        {
                            "name" : "Ash",
                            "match" : "0.33"
                        },
                        {
                            "name" : "Max",
                            "match" : "0.33"
                        }
                    ]
                }
            },
            {
                "_id" : "0.67",
                "value" : {
                    "users" : [
                        {
                            "name" : "Joe",
                            "match" : "0.67"
                        }
                    ]
                }
            }
        ],
        "timeMillis" : 22,
        "counts" : {
            "input" : 3,
            "emit" : 3,
            "reduce" : 1,
            "output" : 2
        },
        "ok" : 1
    }