Search code examples
mongodbaggregation-frameworknosql-aggregation

How to use MongoDB aggregation query to group documents by status and create ranges of id with each group?


How to use MongoDB aggregation query to obtain range of documents based on filtered status?

Consider the following example data:

[
  {
    "visible_id": 1,
    "status": "INITIAL",
    "name": "ab"
  },
  {
    "visible_id": 2,
    "status": "INITIAL",
    "name": "aab"
  },
  {
    "visible_id": 3,
    "status": "SCANNED",
    "name": "bba"
  },
  {
    "visible_id": 4,
    "status": "SCANNED",
    "name": "cca"
  },
  {
    "visible_id": 5,
    "status": "INITIAL",
    "name": "dds"
  },
  {
    "visible_id": 6,
    "status": "INITIAL",
    "name": "aax"
  },
  {
    "visible_id": 7,
    "status": "INITIAL",
    "name": "bsd"
  },
  {
    "visible_id": 8,
    "status": "PRINTED",
    "name": "ads"
  }
]

The aggregated query should list me all the available sticker in the filter status in a range value of visible_id. If a different status is found in between, a new range must be projected from the next filter status onward appended to the result list.

Example Filter and Results according to the above CSV data: if filter status = "INITIAL" result:

[
    {status: "INITIAL", range: "1-2"},
    {status: "INITIAL", range: "5-7"}
]

if filter status = "SCANNED" result:

[
    {status: "SCANNED", range: "3-4"},
]

if filter status = "PRINTED" result:

[
    {status: "PRINTED", range: "8-8"},
]

Solution

  • Try this one:

    db.collection.aggregate([
       { // Get the status from previous document
          $setWindowFields: {
             sortBy: { visible_id: 1 },
             output: {
                lag: { $shift: { output: "$status", by: -1 } }
             }
          }
       },
       {
          $set: { // Check if status is different to previous status
             group: {
                $cond: {
                   if: { $eq: ["$status", "$lag"] },
                   then: "$$REMOVE",
                   else: "$visible_id"
                }
             }
          }
       },
       {
          $fill: { // Fill missing values with "last observation carried forward"
             sortBy: { visible_id: 1 },
             output: { group: { method: "locf" } }
          }
       },
       { $match: { status: "SCANNED" } }, // Limit the output
       {
          $group: { // Get min and max values
             _id: "$group",
             min: { $min: "$visible_id" },
             max: { $max: "$visible_id" },
             status: { $first: "$status" }
          }
       },
       {
          $project: { // some output cosmetics
             _id: 0,
             status: 1,
             range: { $concat: [{ $toString: "$min" }, "-", { $toString: "$max" }] }
          }
       }
    ])
    

    Mongo Playground

    In case visible_id is a strict sequence and you have an index on status, then this version would be much faster:

    db.collection.aggregate([
       { $match: { status: "INITIAL" } },
       {
          $setWindowFields: {
             sortBy: { visible_id: 1 },
             output: {
                lag: { $shift: { output: "$visible_id", by: -1 } }
             }
          }
       },
       {
          $set: {
             group: {
                $cond: {
                   if: { $eq: [{$subtract: ["$visible_id",1]}, "$lag"] },
                   then: "$$REMOVE",
                   else: "$visible_id"
                }
             }
          }
       },
       {
          $fill: {
             sortBy: { visible_id: 1 },
             output: { group: { method: "locf" } }
          }
       },
       {
          $group: {
             _id: "$group",
             min: { $min: "$visible_id" },
             max: { $max: "$visible_id" },
             status: { $first: "$status" }
          }
       },
       {
          $project: {
             _id: 0,
             status: 1,
             range: { $concat: [{ $toString: "$min" }, "-", { $toString: "$max" }] }
          }
       }
    ])