Search code examples
pandasmongodbaggregation-frameworkpymongo

Mongodb aggregation to find outliers


In my mongodb collection documents are stored in the following format:

{ "_id" : ObjectId("62XXXXXX"), "res" : 12, ... }
{ "_id" : ObjectId("63XXXXXX"), "res" : 23, ... }
{ "_id" : ObjectId("64XXXXXX"), "res" : 78, ... }
...

I need to extract id's for the document for which the value of "res" is outlier (i.e. value < Q1 - 1.5 * IQR or value > Q3 + 1.5 * IQR (Q1, Q3 are percentiles)). I have done this using pandas functionality by retrieving all documents from the collection, which may become slow if the number of documents in collection become too big.

Is there a way to do this using mongodb aggregation pipeline (or just calculating percentiles)?


Solution

  • If I understand how you want to retrieve outliers, here's one way you might be able to do it.

    db.collection.aggregate([
      { // partition res into quartiles
        "$bucketAuto": {
          "groupBy": "$res",
          "buckets": 4
        }
      },
      { // get the max of each quartile
        "$group": {
          "_id": "$_id.max"
        }
      },
      { // sort the quartile maxs
        "$sort": {
          "_id": 1
        }
      },
      { // put sorted quartile maxs into array
        "$group": {
          "_id": null,
          "maxs": {"$push": "$_id"}
        }
      },
      { // assign Q1 and Q3
        "$project": {
          "_id": 0,
          "q1": {"$arrayElemAt": ["$maxs", 0]},
          "q3": {"$arrayElemAt": ["$maxs", 2]}
        }
      },
      { // set IQR
        "$set": {
          "iqr": {
            "$subtract": ["$q3", "$q1"]
          }
        }
      },
      { // assign upper/lower outlier thresholds
        "$project": {
          "outlierThresholdLower": {
            "$subtract": [
              "$q1",
              {"$multiply": ["$iqr", 1.5]}
            ]
          },
          "outlierThresholdUpper": {
            "$add": [
              "$q3",
              {"$multiply": ["$iqr", 1.5]}
            ]
          }
        }
      },
      { // get outlier _id's
        "$lookup": {
          "from": "collection",
          "as": "outliers",
          "let": {
            "oTL": "$outlierThresholdLower",
            "oTU": "$outlierThresholdUpper"
          },
          "pipeline": [
            {
              "$match": {
                "$expr": {
                  "$or": [
                    {"$lt": ["$res", "$$oTL"]},
                    {"$gt": ["$res", "$$oTU"]}
                  ]
                }
              }
            },
            {
              "$project": {
                "_id": 1
              }
            }
          ]
        }
      }
    ])
    

    Try it on mongoplayground.net.