Search code examples
Tags: javascript, arrays, reactjs, json, nested-object

Combine duplicate tokens inside huge JSON file into nested array of objects using React


I looked at several of the suggested solutions but none seemed to rise to this confounding data formatting challenge.

I have a huge JSON file (over 100k rows) containing a massive amount of duplicated data, all stored as top-level objects. Here's an example:

[
   {
      "manufacturer":"Samsung",
      "device":"Galaxy A32 5G",
      "model":"SM-A326B",
      "chipset":"Mediatek MT6853V/NZA",
      "date":"2022-01-01",
      "fw_id":"A326BXXS4AVA1",
      "android":"R(Android 11)",
      "known_passcode":false,
      "afu":false,
      "bfu":false,
      "bruteforce":false
   },
   {
      "manufacturer":"Samsung",
      "device":"Galaxy A32 5G",
      "model":"SM-A326U",
      "chipset":"Mediatek MT6853V/NZA",
      "date":"2021-03-01",
      "fw_id":"A326USQU1AUD4",
      "android":"R(Android 11)",
      "known_passcode":true,
      "afu":false,
      "bfu":true,
      "bruteforce":true
   },
   {
      "manufacturer":"Samsung",
      "device":"Galaxy A32 5G",
      "model":"SM-A326U1",
      "chipset":"Mediatek MT6853V/NZA",
      "date":"2021-09-01",
      "fw_id":"A326U1UEU5AUJ2",
      "android":"R(Android 11)",
      "known_passcode":true,
      "afu":false,
      "bfu":true,
      "bruteforce":true
   },
   {
      "manufacturer":"LGE",
      "device":"LG K31",
      "model":"LGL355DL",
      "chipset":"Mediatek MT6762",
      "date":"unknown",
      "fw_id":"L355DL10l",
      "android":"unknown",
      "known_passcode":false,
      "afu":false,
      "bfu":false,
      "bruteforce":false
   }
]

This needs to be organized so that data points like manufacturer, device, model are not duplicated hundreds of times.

Btw, here's a JSFiddle to play with: https://jsfiddle.net/xpancom/Lq7duahv/

Ideally, the JSON format would be the following:

[
  {
    "manufacturers": [
      {
        "manufacturer": "Samsung",
        "devices": [
          {
            "device": "Galaxy A32 5G",
            "models": [
              {
                "model": "SM-A326B",
                "data": [
                  {
                    "chipset": "Mediatek MT6853V/NZA",
                    "date": "2022-01-01",
                    "fw_id": "A326BXXS4AVA1",
                    "android": "R(Android 11)",
                    "known_passcode": false,
                    "afu": false,
                    "bfu": false,
                    "bruteforce": false
                  },
                  {
                    "chipset": "Mediatek MT6853V/NZA",
                    "date": "2021-09-01",
                    "fw_id": "A326BXXU3AUH7",
                    "android": "R(Android 11)",
                    "known_passcode": true,
                    "afu": false,
                    "bfu": true,
                    "bruteforce": true
                  }
                ]
              },
              {
                "model": "SM-A326U1",
                "data": [
                  {
                    "chipset": "Mediatek MT6853V/NZA",
                    "date": "2021-09-01",
                    "fw_id": "A326U1UEU5AUJ2",
                    "android": "R(Android 11)",
                    "known_passcode": true,
                    "afu": false,
                    "bfu": true,
                    "bruteforce": true
                  }
                ]
              }
            ]
          }
        ]
      },
      {
        "manufacturer": "LGE",
        "devices": [
          {
            "device": "LG K31",
            "models": [
              {
                "model": "LGL355DL",
                "data": [
                  {
                    "chipset": "Mediatek MT6762",
                    "date": "unknown",
                    "fw_id": "L355DL10l",
                    "android": "unknown",
                    "known_passcode": false,
                    "afu": false,
                    "bfu": false,
                    "bruteforce": false
                  }
                ]
              }
            ]
          }
        ]
      }
    ]
  }
]

Working in React, here's what I've got so far in trying to massage this data:

  // Flat input rows; `data` is defined outside this snippet.
  const source = data;
  // Accumulates the grouped output that is rendered at the end of the script.
  const destination = [];
  const classifiedTokens = []; // device names that have already been classified
  const classifiedTokensModel = []; // model-level counterpart of classifiedTokens: model names already classified

  /**
   * Returns every row belonging to the next device that has not been handed
   * out yet, and records that device name in `classifiedTokens`.
   *
   * @param {Object[]} source - Flat rows, each carrying a `device` property.
   * @returns {Object[]|null} Rows of the newly picked device, or `null` once no
   *   unclassified device is left.
   */
  const getNextTokenArray = (source) => {
    let pickedDevice = null;
    const matchingRows = [];

    source.forEach((row) => {
      const device = row['device'];

      // Lock onto the first device name that has not been classified before.
      if (!pickedDevice && !classifiedTokens.includes(device)) {
        pickedDevice = device;
        classifiedTokens.push(pickedDevice);
      }

      if (pickedDevice && pickedDevice === device) {
        matchingRows.push(row);
      }
    });

    if (!pickedDevice) {
      return null;
    }
    return matchingRows;
  };

  /**
   * Splits one device group into per-model groups and appends each of them to
   * `destination` as `{ device, model, data }`.
   *
   * Every distinct model of the device is emitted, in first-seen order. Grouping
   * is scoped to the device group that was passed in, so a model name that also
   * occurs under another device still gets its own entry.
   *
   * @param {{device: string, data: Object[]}} tokenObject - Device group as built
   *   by addToDestination; each `data` row carries its own `model`.
   * @returns {null|undefined} `null` when the group has no rows, otherwise nothing.
   */
  const getNextTokenArrayModel = (tokenObject) => {
    const { device, data } = tokenObject;
    if (data.length === 0) return null;

    // A Map iterates in insertion order, which keeps models in first-seen order.
    const rowsByModel = new Map();
    data.forEach((element) => {
      if (!rowsByModel.has(element.model)) {
        rowsByModel.set(element.model, []);
      }
      rowsByModel.get(element.model).push({
        manufacturer: element.manufacturer,
        chipset: element.chipset,
        date: element.date,
        fw_id: element.fw_id,
        android: element.android,
        // The input rows use snake_case for this key; reading `knownPasscode`
        // would always yield undefined and drop the value from the JSON output.
        known_passcode: element.known_passcode,
        afu: element.afu,
        bfu: element.bfu,
        bruteforce: element.bruteforce,
      });
    });

    rowsByModel.forEach((rows, model) => {
      destination.push({ device, model, data: rows });
    });
  };

  /**
   * Builds the device-level group for a batch of rows that share one device and
   * hands it to getNextTokenArrayModel, which appends the per-model groups to
   * `destination`.
   *
   * @param {Object[]} tokenArray - Rows that all belong to the same device.
   * @returns {undefined}
   */
  const addToDestination = (tokenArray) => {
    if (tokenArray.length === 0) return;

    const deviceGroup = {
      device: tokenArray[0]['device'],
      data: tokenArray.map((element) => ({
        manufacturer: element.manufacturer,
        model: element.model,
        chipset: element.chipset,
        date: element.date,
        fw_id: element.fw_id,
        android: element.android,
        known_passcode: element.known_passcode,
        afu: element.afu,
        bfu: element.bfu,
        bruteforce: element.bruteforce,
      })),
    };

    getNextTokenArrayModel(deviceGroup); // group the device's rows by model
  };

  // Pull one device group at a time and file it into `destination` until
  // getNextTokenArray reports (with null) that no unclassified device is left.
  let nextTokenArray = getNextTokenArray(source);

  while (nextTokenArray) {
    addToDestination(nextTokenArray);
    nextTokenArray = getNextTokenArray(source);
  }

  setTimeout(() => {
    // Render through textContent rather than innerHTML so that any markup
    // characters inside the data are shown literally instead of parsed as HTML.
    const pre = document.createElement('pre');
    pre.textContent = JSON.stringify(destination, null, 2);
    document.getElementById('root').replaceChildren(pre);
  }, 1000);

};


And here's the JSFiddle again: https://jsfiddle.net/xpancom/Lq7duahv/

Who can smash this data formatting dilemma?


Solution

  • This answer is not React specific, but one approach would be to use array.reduce() to transform each level/node of the structure as shown in the code snippet below.

    // Flat sample input: one record per firmware entry. The first two records
    // are identical, so the grouped output lists that entry twice under its model.
    const source = [
      {
        manufacturer: 'Samsung',
        device: 'Galaxy A32 5G',
        model: 'SM-A326B',
        chipset: 'Mediatek MT6853V/NZA',
        date: '2022-01-01',
        fw_id: 'A326BXXS4AVA1',
        android: 'R(Android 11)',
        known_passcode: false,
        afu: false,
        bfu: false,
        bruteforce: false,
      },
      {
        manufacturer: 'Samsung',
        device: 'Galaxy A32 5G',
        model: 'SM-A326B',
        chipset: 'Mediatek MT6853V/NZA',
        date: '2022-01-01',
        fw_id: 'A326BXXS4AVA1',
        android: 'R(Android 11)',
        known_passcode: false,
        afu: false,
        bfu: false,
        bruteforce: false,
      },
      {
        manufacturer: 'Samsung',
        device: 'Galaxy A32 5G',
        model: 'SM-A326U',
        chipset: 'Mediatek MT6853V/NZA',
        date: '2021-03-01',
        fw_id: 'A326USQU1AUD4',
        android: 'R(Android 11)',
        known_passcode: true,
        afu: false,
        bfu: true,
        bruteforce: true,
      },
      {
        manufacturer: 'Samsung',
        device: 'Galaxy A32 5G',
        model: 'SM-A326U1',
        chipset: 'Mediatek MT6853V/NZA',
        date: '2021-09-01',
        fw_id: 'A326U1UEU5AUJ2',
        android: 'R(Android 11)',
        known_passcode: true,
        afu: false,
        bfu: true,
        bruteforce: true,
      },
      {
        manufacturer: 'LGE',
        device: 'LG K31',
        model: 'LGL355DL',
        chipset: 'Mediatek MT6762',
        date: 'unknown',
        fw_id: 'L355DL10l',
        android: 'unknown',
        known_passcode: false,
        afu: false,
        bfu: false,
        bruteforce: false,
      },
    ];
    
    /**
     * Groups flat records by one property, producing one node per distinct value.
     *
     * Each node has the shape `{ [key.name]: value, [key.child]: [...] }`, where
     * the child array holds the matching records with the grouping property
     * removed. Nodes and their children keep first-seen order; the input records
     * are not modified.
     *
     * @param {Object[]} data - Flat records to group.
     * @param {{name: string, child: string}} key - `name` is the property to
     *   group by, `child` is the property under which grouped records are stored.
     * @returns {Object[]} One node per distinct value of `key.name`.
     */
    function generateTree(data, key) {
      // Index of nodes by grouping value, so each record is placed with a
      // constant-time lookup instead of rescanning the result array.
      const nodesByValue = new Map();

      return data.reduce((acc, val) => {
        // Split the key name from the child data
        const { [key.name]: keyName, ...childData } = val;

        let treeItem = nodesByValue.get(keyName);

        if (!treeItem) {
          // First record for this value: create its node and register it
          treeItem = { [key.name]: keyName, [key.child]: [] };
          nodesByValue.set(keyName, treeItem);
          acc.push(treeItem);
        }

        treeItem[key.child].push(childData);

        return acc;
      }, []);
    }
    
    // Level 1: one node per manufacturer, each holding its raw rows as `devices`
    const manufacturers = generateTree(source, {
      name: 'manufacturer', // Key name to use as grouping identifier
      child: 'devices', // Key name for child data
    });

    for (const manufacturer of manufacturers) {
      // Level 2: regroup this manufacturer's rows by device
      const devices = generateTree(manufacturer.devices, {
        name: 'device',
        child: 'models',
      });

      // Level 3: regroup each device's rows by model
      for (const device of devices) {
        device.models = generateTree(device.models, {
          name: 'model',
          child: 'data',
        });
      }

      manufacturer.devices = devices;
    }

    // Wrap the tree in the single-element array shape requested in the question
    const destination = [{ manufacturers }];

    console.log(destination);