Search code examples
mongodbmetabase

Count unique Ids ($addtoSet) only if they meet a condition


I'm trying to modify this query generated by Metabase. It generates counts of unique Ids divided by weeks. What I'm trying to accomplish is get unique Id counts by weeks ONLY if each Id has at least X entries for that period of time.

[
  {
    "$match": {
      "$and": [
        {
          (some filters)
        },
        {
          "$expr": {
            "$gte": [
              "$createdAt", { "$subtract": [ISODate(), { "$multiply": [3600000, 24, 7, 8] }] }
            ]
          }
        },
        {
          "$expr": {
            "$lt": [
              "$createdAt", ISODate() 
            ]
          }
        }
      ]
    }
  },

  {
    "$group": {
      "_id": {
        "createdAt~~~week": {
          "$let": {
            "vars": {
              "parts": {
                "$dateToParts": {
                  "date": {
                    "$subtract": [
                      "$createdAt",
                      {
                        "$multiply": [
                          {
                            "$subtract": [
                              {
                                "$let": {
                                  "vars": {
                                    "day_of_week": {
                                      "$mod": [
                                        {
                                          "$add": [
                                            {
                                              "$dayOfWeek": "$createdAt"
                                            },
                                            6
                                          ]
                                        },
                                        7
                                      ]
                                    }
                                  },
                                  "in": {
                                    "$cond": {
                                      "if": {
                                        "$eq": [
                                          "$$day_of_week",
                                          0
                                        ]
                                      },
                                      "then": 7,
                                      "else": "$$day_of_week"
                                    }
                                  }
                                }
                              },
                              1
                            ]
                          },
                          86400000
                        ]
                      }
                    ]
                  }
                }
              }
            },
            "in": {
              "$dateFromParts": {
                "year": "$$parts.year",
                "month": "$$parts.month",
                "day": "$$parts.day"
              }
            }
          }
        }
      },
      "count": {
        "$addToSet": "$user._id"
      }
    }
  },
  {
    "$project": {
      "_id": false,
      "createdAt~~~week": "$_id.createdAt~~~week",
      "count": {
        "$size": "$count"
      }
    }
  }
]

The main problem is this part:

  "count": {
        "$addToSet": "$user._id"
      }

as by using "$addtoSet" it's counting all the unique Ids, even the ones that only have a single entry. When I do this type of query normally, I usually group by Id > count > match on counts that meet the criteria. In this case since it's grouped based on weeks I don't know how to go about it. Any help would be appreciated.


Solution

  • You need to add another $group stage, first group by user and "week" and sum the interactions, at this point we can filter out all the users that don't have sufficient count and continue with the pipeline, it will look like this:

    db.collection.aggregate([
      {
        $group: {
          _id: {
            user: "$user._id",
            "createdAt~~~week": {
              "$let": {
                "vars": {
                  "parts": {
                    "$dateToParts": {
                      "date": {
                        "$subtract": [
                          "$createdAt",
                          {
                            "$multiply": [
                              {
                                "$subtract": [
                                  {
                                    "$let": {
                                      "vars": {
                                        "day_of_week": {
                                          "$mod": [
                                            {
                                              "$add": [
                                                {
                                                  "$dayOfWeek": "$createdAt"
                                                },
                                                6
                                              ]
                                            },
                                            7
                                          ]
                                        }
                                      },
                                      "in": {
                                        "$cond": {
                                          "if": {
                                            "$eq": [
                                              "$$day_of_week",
                                              0
                                            ]
                                          },
                                          "then": 7,
                                          "else": "$$day_of_week"
                                        }
                                      }
                                    }
                                  },
                                  1
                                ]
                              },
                              86400000
                            ]
                          }
                        ]
                      }
                    }
                  }
                },
                "in": {
                  "$dateFromParts": {
                    "year": "$$parts.year",
                    "month": "$$parts.month",
                    "day": "$$parts.day"
                  }
                }
              }
            }
          },
          sum: {
            $sum: 1
          }
        }
      },
      {
        $match: {
          sum: {
            $gt: x // x is the number of interactions you want the user to have more than, this will only match relevant users.
          }
        }
      },
      {
        "$group": {
          "_id": "$_id.createdAt~~~week",
          "count": {
            "$addToSet": "$_id.user"
          }
        }
      },
      {
        "$project": {
          "_id": false,
          "createdAt~~~week": "$_id",
          "count": {
            "$size": "$count"
          }
        }
      }
    ])
    

    Mongo Playground

    One thing to note with this approach weeks with "0" users will not appear, you can instead use a conditional $addToSet based on the sum field if you want to allow documents with count 0.