Search code examples
mongodbmergemapreduceupdates

Merge duplicates and remove the oldest


I have a collection where there are some duplicate documents. In example:

First document:

{
    "_id" : ObjectId("56f3d7cc1de31cb20c08ae6b"),
    "AddedDate" : ISODate("2016-05-01T00:00:00.000Z"),
    "Place": "THISPLACE",
    "PresentInDB" : [ 
        {
            "InDB" : ISODate("2016-05-01T00:00:00.000Z")
        }
     ],
    "Checked" : [],
    "Link": "http://www.mylink.com/first/84358"
}

Second document:

{
    "_id" : ObjectId("577740526c1e542904725238"),
    "AddedDate" : ISODate("2016-05-02T00:00:00.000Z"),
    "Place": "THISPLACE",
    "PresentInDB" : [ 
        {
            "InDB" : ISODate("2016-05-02T00:00:00.000Z")
        },
        {
            "InDB" : ISODate("2016-05-03T00:00:00.000Z")
        }
     ],
    "Checked" : [
        {
            "Done" : ISODate("2016-05-02T00:00:00.000Z")
        },
    ],
    "Link": "http://www.mylink.com/second/84358"
}

Link field contains same sequense of numbers in both documents, 84358.

So I would like to achieve those steps:

  1. Loop over each document in the collection.
  2. Match the number sequence in each document in the Link field (i.e. 84358 above) and if there are several documents in collection that have that sequence in the Link field. And also if Place field match in both documents:
  3. Merge PresentInDB and Checked fields - > merge PresentInDB and Checked fields by adding array values from the newest document (by date in AddedDate field) to the oldest document.
  4. Remove the newest document.

How could I achieve such a query?


Solution

  • In MongoDB 3.3.6 release is introduced a $split operator for dealing with strings in aggregation framework (Jira). Before this release you only could solve this with a map/reduce solution.

    After MongoDB 3.3.6 release: Aggregation framework solution

    db.duplicatedCollection.aggregate(
      [
        {
          $project: {
            _id : 1,
            AddedDate : 1,
            Place : 1,
            PresentInDB : 1,
            Checked : 1,
            Link : 1,
            sequenceNumber: { $arrayElemAt: [ {$split: ["$Link", "/"]}, -1 ]},
          }
        }, 
        {
          $sort: { AddedDate: 1 }
        },
        {
          $group: {
            _id : {
              sequenceNumber : "$sequenceNumber",
              Place : "$Place"
            },
            id : { $first: "$_id"},
            AddedDate: { $first: "$AddedDate" },
            Place :  { $first: "$Place" },
            PresentInDB: {
              $push: '$PresentInDB'
            },
            Checked: {
              $push: '$Checked'
            },
            Link: { $first: "$Link"}
          }
        }, 
       {
          $unwind: "$PresentInDB"
        },
        {
          $unwind: {
            path : "$PresentInDB",
            preserveNullAndEmptyArrays: true
          }    
        },
        {
          $unwind: "$Checked"
        },
        {
          $unwind: {
            path : "$Checked",
            preserveNullAndEmptyArrays: true
          }
        },    
        {
          $group: {
            _id : "$id",
            AddedDate: { $first: "$AddedDate" },        
            Place :  { $first: "$Place" },
            PresentInDB : {
              $addToSet: '$PresentInDB'
            },
            Checked : {
              $addToSet: '$Checked'
            },        
            Link: { $first: "$Link"}
          }
        },  
        {
          $out: "duplicatedCollection"
        }
      ]
    );
    

    Before MongoDB 3.3.6 release: Map/Reduce solution

    Map Function:

    var mapFunction = function() {
        var linkArray = this.Link.split("/");
        var sequenceNumber = linkArray[linkArray.length - 1];
    
        var keyDoc = { 
           place : this.Place,
           sequenceNumber: sequenceNumber, 
        };
    
        emit(keyDoc, this);
    };
    

    Reduce Function:

    var reduceFunction = function(key, values) {   
        var reducedDoc = {};
        reducedDoc._id = values[0]._id;
        reducedDoc.AddedDate = values[0].AddedDate;
        reducedDoc.Link = values[0].Link;
        reducedDoc.PresentInDB = [];
        reducedDoc.Checked = [];
    
        var presentInDbMillisArray = [];
        var checkedMillisArray = [];        
    
        values.forEach(function(doc) {
            if (reducedDoc.AddedDate < doc.AddedDate) {
                reducedDoc._id = doc._id;
                reducedDoc.AddedDate = doc.AddedDate;
                reducedDoc.Link = doc.Link;
            } 
    
            // PresentInDB field merge
            doc.PresentInDB.forEach(function(presentInDBElem) {
                var millis = presentInDBElem.InDB.getTime();
                if (!Array.contains(presentInDbMillisArray, millis)) { 
                    reducedDoc.PresentInDB.push(presentInDBElem);
                    presentInDbMillisArray.push(millis);
                }
            });
    
            // same here with Checked field
            doc.Checked.forEach(function(checkedElem) {
                var millis = checkedElem.Done.getTime();
                if (!Array.contains(checkedMillisArray, millis)) { 
                    reducedDoc.Checked.push(checkedElem);
                    checkedMillisArray.push(millis);
                }
            });
        });
        return reducedDoc;
    };
    

    Map/Reduce:

    db.duplicatedCollection.mapReduce(
        mapFunction,
        reduceFunction,
        { 
            "out": "duplicatedCollection"
        }
    );
    

    Unwrap the value from the map/reduce returned documents:

       db.duplicatedCollection.find(
        {
            value : {
                $exists: true 
            }
        }
        ).forEach(function(doc) {
            db.duplicatedCollection.insert(doc.value);
            db.duplicatedCollection.remove({_id : doc._id});
        });