Search code examples

How can I find the number of duplicates for a field using the MongoDB Java driver?

How can I count the duplicate values of a field across the documents of a collection using the MongoDB Java driver? I have a collection like this. Collection example:

    [
        {
            "_id": {
                "$oid": "5fc8eb07d473e148192fbecd"
            },
            "ip_address": "",
            "mac_address": "00:A0:C9:14:C8:29",
            "url": "",
            "datetimes": {
                "$date": "2021-02-13T02:02:00.000Z"
            }
        },
        {
            "_id": {
                "$oid": "5ff539269a10d529d88d19f4"
            },
            "ip_address": "",
            "mac_address": "00:A0:C9:14:C8:30",
            "url": "",
            "datetimes": {
                "$date": "2021-02-12T19:00:00.000Z"
            }
        },
        {
            "_id": {
                "$oid": "60083d9a1cad2b613cd0c0a2"
            },
            "ip_address": "",
            "mac_address": "00:0A:05:C7:C8:31",
            "url": "",
            "datetimes": {
                "$date": "2021-01-24T17:00:00.000Z"
            }
        }
    ]

example query:

            BasicDBObject whereQuery = new BasicDBObject();
            DBCursor cursor = table1.find(whereQuery);
            while (cursor.hasNext()) {
                DBObject obj = cursor.next();
                String ip_address = (String) obj.get("ip_address");
                String mac_address = (String) obj.get("mac_address");
                Date datetimes = (Date) obj.get("datetimes");
                String url = (String) obj.get("url");
                System.out.println(ip_address, mac_address, datetimes, url);
            }

In Java, how can I find out which "url" values are duplicated, and how many times each one occurs?


  • In MongoDB you can solve this problem with an aggregation pipeline. You need to implement the pipeline below with the MongoDB Java driver. It returns only the url values that occur more than once, together with their duplicate count and the documents that share them.

    [
        {
            "$group": {
                // group by url and calculate count of duplicates by url
                "_id": "$url",
                "url": {
                    "$first": "$url"
                },
                "duplicates_count": {
                    "$sum": 1
                },
                "duplicates": {
                    "$push": {
                        "_id": "$_id",
                        "ip_address": "$ip_address",
                        "mac_address": "$mac_address",
                        "url": "$url",
                        "datetimes": "$datetimes"
                    }
                }
            }
        },
        {   // keep only the groups whose duplicates count is higher than 1
            "$match": {
                "duplicates_count": {
                    "$gt": 1
                }
            }
        },
        {
            "$project": {
                "_id": 0
            }
        }
    ]

    Output Result:

        "url" : "",
        "duplicates_count" : 2.0,
        "duplicates" : [ 
                "_id" : ObjectId("5fc8eb07d473e148192fbecd"),
                "ip_address" : "",
                "mac_address" : "00:A0:C9:14:C8:29",
                "url" : "",
                "datetimes" : {
                    "$date" : "2021-02-13T02:02:00.000Z"
                "_id" : ObjectId("5ff539269a10d529d88d19f4"),
                "ip_address" : "",
                "mac_address" : "00:A0:C9:14:C8:30",
                "url" : "",
                "datetimes" : {
                    "$date" : "2021-02-12T19:00:00.000Z"