pandas mongodb aggregation-framework pymongo

Mongodb aggregation to find outliers

In my mongodb collection documents are stored in the following format:

{ "_id" : ObjectId("62XXXXXX"), "res" : 12, ... }
{ "_id" : ObjectId("63XXXXXX"), "res" : 23, ... }
{ "_id" : ObjectId("64XXXXXX"), "res" : 78, ... }
...

I need to extract id's for the document for which the value of "res" is outlier (i.e. value < Q1 - 1.5 * IQR or value > Q3 + 1.5 * IQR (Q1, Q3 are percentiles)). I have done this using pandas functionality by retrieving all documents from the collection, which may become slow if the number of documents in collection become too big.

Is there a way to do this using mongodb aggregation pipeline (or just calculating percentiles)?

Solution

If I understand how you want to retrieve outliers, here's one way you might be able to do it.

db.collection.aggregate([
  { // partition res into quartiles
    "$bucketAuto": {
      "groupBy": "$res",
      "buckets": 4
    }
  },
  { // get the max of each quartile
    "$group": {
      "_id": "$_id.max"
    }
  },
  { // sort the quartile maxs
    "$sort": {
      "_id": 1
    }
  },
  { // put sorted quartile maxs into array
    "$group": {
      "_id": null,
      "maxs": {"$push": "$_id"}
    }
  },
  { // assign Q1 and Q3
    "$project": {
      "_id": 0,
      "q1": {"$arrayElemAt": ["$maxs", 0]},
      "q3": {"$arrayElemAt": ["$maxs", 2]}
    }
  },
  { // set IQR
    "$set": {
      "iqr": {
        "$subtract": ["$q3", "$q1"]
      }
    }
  },
  { // assign upper/lower outlier thresholds
    "$project": {
      "outlierThresholdLower": {
        "$subtract": [
          "$q1",
          {"$multiply": ["$iqr", 1.5]}
        ]
      },
      "outlierThresholdUpper": {
        "$add": [
          "$q3",
          {"$multiply": ["$iqr", 1.5]}
        ]
      }
    }
  },
  { // get outlier _id's
    "$lookup": {
      "from": "collection",
      "as": "outliers",
      "let": {
        "oTL": "$outlierThresholdLower",
        "oTU": "$outlierThresholdUpper"
      },
      "pipeline": [
        {
          "$match": {
            "$expr": {
              "$or": [
                {"$lt": ["$res", "$$oTL"]},
                {"$gt": ["$res", "$$oTU"]}
              ]
            }
          }
        },
        {
          "$project": {
            "_id": 1
          }
        }
      ]
    }
  }
])

Try it on mongoplayground.net.