Search code examples
jsonapache-nifijolt

Filter JSON Array to Keep Only the Entry with the Latest Date/Time Using Jolt


I'm working with a JSON array where each object contains a ScheduledDateUTC_Max field in the format "YYYY-MM-DD HH:MM:SS". My goal is to transform this array using Jolt to keep only the object with the latest date and time in the ScheduledDateUTC_Max field.

Here's an example of my input JSON:

[
  {
    "_id": {
      "$oid": "213asd4444-44asd44"
    },
    "PurchaseOrderNumber": "0085701091",
    "WorkflowId": "1243840",
    "CreatedDateUTC": "2023",
    "ExternalReference": "22261952",
    "Flag": "0",
    "NIT": "000890101676",
    "Tag03_ExternalReference": "early",
    "Warehouse_ExternalReference": "0085",
    "updateDate": {
      "$date": "2024-03-19T09:37:41.312Z"
    },
    "ScheduledDateUTC-Date": "2023-03-14",
    "ScheduledDateUTC-Hours": "07:30:00",
    "StatedTimestamp13UTC-Date": "2023-03-14",
    "StatedTimestamp13UTC-Hours": "05:29:35"
  },
  {
    "_id": {
      "$oid": "2134asdas444-44asda44"
    },
    "PurchaseOrderNumber": "0085701091",
    "WorkflowId": "1243839",
    "CreatedDateUTC": "2023",
    "ExternalReference": "22261952",
    "Flag": "0",
    "NIT": "000890101676",
    "Tag03_ExternalReference": "OnTime",
    "Warehouse_ExternalReference": "0085",
    "updateDate": {
      "$date": "2024-03-19T09:37:41.312Z"
    },
    "ScheduledDateUTC-Date": "2023-03-14",
    "ScheduledDateUTC-Hours": "06:20:00",
    "StatedTimestamp13UTC-Date": "2023-03-14",
    "StatedTimestamp13UTC-Hours": "05:29:35"
  },
  {
    "_id": {
      "$oid": "aasd2222sd-4444"
    },
    "PurchaseOrderNumber": "0085701091",
    "WorkflowId": "1243838",
    "CreatedDateUTC": "2023",
    "ExternalReference": "22261952",
    "Flag": "0",
    "NIT": "000890101676",
    "Tag03_ExternalReference": "OnTime",
    "Warehouse_ExternalReference": "0085",
    "updateDate": {
      "$date": "2024-03-19T09:37:41.312Z"
    },
    "ScheduledDateUTC-Date": "2023-03-14",
    "ScheduledDateUTC-Hours": "06:15:00",
    "StatedTimestamp13UTC-Date": "2023-03-14",
    "StatedTimestamp13UTC-Hours": "05:29:35"
  }
]

I have this jolt:

[
  {
    "operation": "shift",
    "spec": {
      "*": {
        "ExternalReference": "[&1].ExternalReference",
        "Warehouse_ExternalReference": "[&1].Warehouse_ExternalReference",
        "PurchaseOrderNumber": "[&1].PurchaseOrderNumber",
        "CreatedDateUTC": "[&1].CreatedDateUTC",
        "NIT": "[&1].NIT",
        "ScheduledDateUTC-Date": "[&1].ScheduledDateUTC_Date",
        "ScheduledDateUTC-Hours": "[&1].ScheduledDateUTC_Hours",
        "StatedTimestamp13UTC-Date": "[&1].StatedTimestamp13UTC_Date",
        "StatedTimestamp13UTC-Hours": "[&1].StatedTimestamp13UTC_Hours"
      }
    }
 },
  {
    "operation": "modify-overwrite-beta",
    "spec": {
      "*": {
        "PurchaseOrderNumberSub": "=substring(@(1,PurchaseOrderNumber),4,10)",
        "CreatedDateUTC_PurchaseOrderNumber": "=concat(@(1,CreatedDateUTC),@(1,PurchaseOrderNumberSub))",
        "ScheduledDateUTC_Max": "=concat(@(1,ScheduledDateUTC_Date),' ',@(1,ScheduledDateUTC_Hours))"
      }
    }
 },
  {
    "operation": "remove",
    "spec": {
      "*": {
        "PurchaseOrderNumberSub": "",
        "CreatedDateUTC": ""
      }
    }
  },
  {
    "operation": "default",
    "spec": {
      "*": {
        "indicator": "${indicador}"
      }
    }
  },
  {
    "operation": "shift",
    "spec": {
      "*": {
        "ExternalReference": "[&1].ExternalReference",
        "Warehouse_ExternalReference": "[&1].Warehouse_ExternalReference",
        "CreatedDateUTC_PurchaseOrderNumber": "[&1].CreatedDateUTC_PurchaseOrderNumber",
        "NIT": "[&1].NIT",
        "ScheduledDateUTC_Date": "[&1].ScheduledDateUTC_Date",
        "ScheduledDateUTC_Hours": "[&1].ScheduledDateUTC_Hours",
        "StatedTimestamp13UTC_Date": "[&1].StatedTimestamp13UTC_Date",
        "StatedTimestamp13UTC_Hours": "[&1].StatedTimestamp13UTC_Hours",
        "indicator": "[&1].indicator",
        "ScheduledDateUTC_Max": "[&1].ScheduledDateUTC_Max"
      }
    }
  }
]

with the validating field: "ScheduledDateUTC_Max": "2023-03-14 07:30:00"

And this is the desired output:

[
 {
    "ExternalReference": "22261952",
    "Warehouse_ExternalReference": "0085",
    "CreatedDateUTC_PurchaseOrderNumber": "2023701091",
    "NIT": "000890101676",
    "ScheduledDateUTC_Date": "2023-03-14",
    "ScheduledDateUTC_Hours": "07:30:00",
    "StatedTimestamp13UTC_Date": "2023-03-14",
    "StatedTimestamp13UTC_Hours": "05:29:35",
    "indicator": "${indicador}"
    
 }
]

I've tried various Jolt transformations, but I'm struggling to find a way to compare the ScheduledDateUTC_Max values and keep only the object with the latest date and time. Jolt's modify-default-beta operation allows for some manipulation, but it doesn't seem to support direct date-time comparisons.

Is there a way to achieve this using Jolt, or should I consider a different approach or tool for this task? Any guidance or examples would be greatly appreciated.


Solution

  • You can sort by ScheduledDateUTC-Date ~ Hours combination through use of the following transformation :

    [
      { // generate a new attribute "ScheduledDateUTC" 
        // by combining date and time
        "operation": "modify-overwrite-beta",
        "spec": {
          "*": {
            "ScheduledDateUTC": "=concat(@(1,ScheduledDateUTC-Date),' ',@(1,ScheduledDateUTC-Hours))"
          }
        }
      },
      { // make new item object keys
        "operation": "shift",
        "spec": {
          "*": {
            "*": "obj.@1,ScheduledDateUTC.&"
          }
        }
      },
      { // find the size of the outermost object "obj" in order to compute
        // the last inner object's index which is "lastObj"
        "operation": "modify-overwrite-beta",
        "spec": {
          "sz": "=size(@(1,obj))",
          "lastObj": "=intSum(-1,@(1,sz))"
        }
      },
      { // order the inner objects by date-time combination
        "operation": "sort"
      },
      { // for preparation of the object keys to be ordinals of 0,1,2, ...
        "operation": "shift",
        "spec": {
          "lastObj": "&",
          "obj": {
            "*": {
              "*": "&2[#2].&"
            }
          }
        }
      },
      { // convert the object keys to ordinals of 0,1,2, ...
        "operation": "shift",
        "spec": {
          "lastObj": "&",
          "obj": {
            "*": {
              "*": "&1.&"
            }
          }
        }
      },
      { // match the key with "lastObj" to determine the object with 
        // the latest date-time
        "operation": "shift",
        "spec": {
          "lastObj": {
            "*": {
              "@2,&": "[]"
            }
          }
        }
      },
      { // pick the desired attributes while putting them in ordered to
        // keep the desired returning order
        "operation": "shift",
        "spec": {
          "*": {
            "ExternalReference": "[&1].&",
            "Warehouse_ExternalReference": "[&1].&",
            "PurchaseOrderNumber": "[&1].CreatedDateUTC_&",
            "NIT": "[&1].&",
            "ScheduledDateUTC*": "[&1].&",
            "StatedTimestamp*": "[&1].&",
            "#\\$\\{indicador\\}": "[&1].indicator"
          }
        }
      }
    ]