Search code examples
mongodbmapreduceset-union

Union Set using MapReduce MongoDB


I'm trying to unite two collections using MapReduce. They have identical structure, for example:

db.tableR.insert({product:"A", quantity:150});
db.tableR.insert({product:"B", quantity:100});
db.tableR.insert({product:"C", quantity:60});
db.tableR.insert({product:"D", quantity:200});

db.tableS.insert({product:"A", quantity:150});
db.tableS.insert({product:"B", quantity:100});
db.tableS.insert({product:"F", quantity:220});
db.tableS.insert({product:"G", quantity:130});

I want MapReduce delete duplicates.

I'm creating a map that divides collection according quantity:

map = function(){
    if (this.quantity<150){
        var key=0;
    }else{
    var key=1;
    }
    var value = {"product":this.product, "quantity":this.quantity};
    emit(key,value);
};

Now I want that reduce function removes duplicates but I can't find a way to add the new ones to the reduced var.

This is what I tried:

reduce = function(keys,values){
var reduced = {
    product:"",
    quantity:""
};
for (var i=0; i < values.length;i++)
{
    if(values[i].product !== null) {reduced.insert({product: values[i].product, quantity: values[i].quantity})}

}
return reduced;};

db.tableR.mapReduce(map,reduce,{out:'map_reduce_result'});
db.tableS.mapReduce(map,reduce,{out:'map_reduce_result'});
db.map_reduce_result.find();

What function can I use?

My expected output:

   {"_id" : 0, "value" : {"product" : "B","quantity" : 100}}
   {"_id" : 0, "value" : {"product" : "C","quantity" : 60}}
   {"_id" : 0, "value" : {"product" : "G","quantity" : 130}}
   {"_id" : 1, "value" : {"product" : "A","quantity" : 150}}
   {"_id" : 1, "value" : {"product" : "D","quantity" : 200}}
   {"_id" : 1, "value" : {"product" : "F","quantity" : 220}}

Solution

  • The reduce function can only return a single value, so you want it to execute for every single row. The reduce function gets called for each unique key returned in your map function. Your keys were 0 and 1, so it would only get called twice for each collection - once for key 0 and once for key 1. Hence, the max number of results would only be 2 for each collection.

    What you need to do is set the key to the product in the map function:

    map = function(){
        emit(this.product,{product:this.product,quantity:this.quantity});
    };
    

    Now, the reduce function will get called for every unique product value. Our new map function just returns the first value in the array (if there were duplicates in the same collection it would just take the first. You could be smart here and take the highest or lowest quantity - or the sum of the quantities, etc).

    reduce = function(keys,values){
        return values[0];
    };
    

    Run your first map reduce job:

    db.tableR.mapReduce(map,reduce,{out:'map_reduce_result'});
    

    Run your second, but this time merge the result:

    db.tableS.mapReduce(map,reduce,{out: {merge: 'map_reduce_result'}});
    

    Now db.map_reduce_result.find() returns:

    { "_id" : "A", "value" : { "product" : "A", "quantity" : 150 } }
    { "_id" : "B", "value" : { "product" : "B", "quantity" : 100 } }
    { "_id" : "C", "value" : { "product" : "C", "quantity" : 60 } }
    { "_id" : "D", "value" : { "product" : "D", "quantity" : 200 } }
    { "_id" : "F", "value" : { "product" : "F", "quantity" : 220 } }
    { "_id" : "G", "value" : { "product" : "G", "quantity" : 130 } }
    

    Obviously the _id doesn't match what you are looking for. If you absolutely need that you can use the aggregation framework like so:

    db.map_reduce_result.aggregate([{$project:{
      _id:{$cond: { if: { $gte: [ "$value.quantity", 150 ] }, then: 1, else: 0 }},
      value:1
    }}]);
    

    This results in:

    { "_id" : 1, "value" : { "product" : "A", "quantity" : 150 } }
    { "_id" : 0, "value" : { "product" : "B", "quantity" : 100 } }
    { "_id" : 0, "value" : { "product" : "C", "quantity" : 60 } }
    { "_id" : 1, "value" : { "product" : "D", "quantity" : 200 } }
    { "_id" : 1, "value" : { "product" : "F", "quantity" : 220 } }
    { "_id" : 0, "value" : { "product" : "G", "quantity" : 130 } }
    

    Note: If two rows from different collections have the same product ID, but different quantities I am unsure which one will be returned.