Tags: javascript, node.js, csv, duplicates, fast-csv

Detect duplicate data in a CSV


With Node.js and the fast-csv package, I currently have this parsing function, which reads a CSV file, changes the headers, goes through each row, and fires an event based on the row data.

// Assumes fs and csv (fast-csv) are required at the top of the module,
// and that mapHeaderToRelated is an existing helper that maps CSV headers to internal field names.
validateRows: (filePath, payload, validators) => new Promise((resolve, reject) => {
        const invalidRecords = [];
        const validRecords = [];

        fs.createReadStream(filePath)
            .pipe(csv.parse({
                headers: (headers) => mapHeaderToRelated(headers, payload), delimiter: ";", discardUnmappedColumns: true
            }))
            // Run each row through every validator; any collected errors mark the row as invalid
            .validate((data, cb) => {
                const errors = validators.reduce((err, func) => [...err, ...func(data)], []);

                if (errors.length > 0) {
                    return cb(null, false, errors);
                }

                return cb(null, true);
            })
            .on("error", (error) => {
                console.log("There is some error");
                reject(error);
            })
            .on("data", (row) => {
                validRecords.push(row);
            })
            .on("data-invalid", (row, rowNumber, reason) => {
                invalidRecords.push({
                    data: row,
                    rowNumber: rowNumber,
                    reason: reason
                });
            })
            .on("end", (rowCount) => {
                console.log(`Parsed ${rowCount} rows. Valid Count: ${validRecords.length} Invalid Count: ${invalidRecords.length}`);

                resolve({
                    invalidRecords,
                    validRecords
                });
            });
    }),
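
For context, a call to this function might look roughly like the sketch below. The module path, the empty payload, and the validator are hypothetical placeholders, not part of the original code.

// Hypothetical usage sketch: "./csvParser", examplePayload and requirePhone
// are placeholder names, not part of the original module.
const parser = require("./csvParser");

const examplePayload = {}; // whatever mapHeaderToRelated expects
const requirePhone = (row) => (row.customer_phone ? [] : ["Missing phone number"]);

parser.validateRows("./customers.csv", examplePayload, [requirePhone])
    .then(({ validRecords, invalidRecords }) => {
        console.log(`Valid: ${validRecords.length}, Invalid: ${invalidRecords.length}`);
    })
    .catch((error) => console.error("Parsing failed:", error));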

I need to detect records that occur multiple times, checking on the phone number. If there is duplication, such as multiple rows sharing the same phone number, those rows should be considered invalid and pushed to the invalid records array.

Example CSV:

| name   | surname | gender  | phone  | 
| ------ | ------- | -------- | -----  |
| John   | Doe     | Male     | 123456 |
| Joh    | Deo     | Unknown  | 123456 |
| Jane   | Doe     | Female   | 999999 |

The output I'd like from the parsed CSV:

{
  validRecords: [
    {
      name: "Jane",
      surname: "Doe",
      gender: "Female",
      phone: "999999"
    }
  ],

  invalidRecords: [
    {
      data: {
        name: "John",
        surname: "Doe",
        gender: "Male",
        phone: "123456"
      },
      rowNumber: 1,
      reason: ["Duplicate data"]
    },
    {
      data: {
        name: "Joh",
        surname: "Deo",
        gender: "Unknown",
        phone: "123456"
      },
      rowNumber: 2,
      reason: ["Duplicate data"]
    }
  ]
}

How could I approach this problem?


Solution

  • I've extended my on("end") handler with the code below, plus some helper functions underneath (note that here the arrays are named validCustomers and invalidCustomers, and the mapped rows expose a customer_phone field). It works for now.

    .on("end", (rowCount) => {
                    console.log(`Parsed ${rowCount} rows. Valid Count: ${validCustomers.length} Invalid Count: ${invalidCustomers.length}`);
                    
                    const allCustomers = [...invalidCustomers, ...validCustomers];
    
                    const duplicateNumbers = findDuplicatePhoneNumbers(allCustomers);
    
                    flagDuplicateCustomers(allCustomers, duplicateNumbers);
                    
                    // Valid but duplicate customers are pushed to the invalid customers and reason set to "Duplicate"
                    const validButDuplicateCustomers = getDuplicateCustomers(validCustomers);
                    validButDuplicateCustomers.forEach((c) => {
                        invalidCustomers.push({
                            data: c,
                            reason: ["Duplicate"]
                        });
                    });
                    
                    // Add reason "Duplicate" for Invalid and Duplicate customers
                    const invalidAndDuplicateCustomers = getDuplicateCustomers(invalidCustomers);
                    invalidAndDuplicateCustomers.forEach((c) => {
                        if (c.reason) {
                            c.reason = [...c.reason, "Duplicate"];
                        }
                    });
                    
                    const validAndNotDuplicate = getNonDuplicateCustomers(validCustomers);
    
                    resolve({
                        invalidCustomers: invalidCustomers,
                        validCustomers: validAndNotDuplicate
                    });
                });
    

    And the helper methods are:

    const getDuplicateCustomers = (customers) => customers.filter((customer) => customer.isDuplicate);
    
    const getNonDuplicateCustomers = (records) => records.filter((record) => !record.isDuplicate);
    
    const findDuplicatePhoneNumbers = (customers) => {
        const duplicates = [];

        // Sort a copy by phone number so identical numbers end up next to each other;
        // string comparison avoids NaN issues if the phone values are not purely numeric.
        const sortedCustomers = [...customers].sort((a, b) => String(a.customer_phone).localeCompare(String(b.customer_phone)));

        sortedCustomers.forEach((customer, index, array) => {
            const nextCustomer = array[index + 1];

            if (!nextCustomer) {
                return;
            }

            if (customer.customer_phone === nextCustomer.customer_phone) {
                duplicates.push(customer);
            }
        });

        // Reduce to the unique set of phone numbers that occur more than once
        const duplicatePhoneNumbers = duplicates.map((customer) => customer.customer_phone);
        const uniqueDuplicatePhoneNumbers = [...new Set(duplicatePhoneNumbers)];

        return uniqueDuplicatePhoneNumbers;
    };
    
    const flagDuplicateCustomers = (customers, duplicateNumbers) => {
        if (!duplicateNumbers || duplicateNumbers.length === 0) {
            return;
        }

        // Mark every customer whose phone number appears in the duplicate list
        const duplicateCustomers = customers.filter((customer) => duplicateNumbers.includes(customer.customer_phone));

        duplicateCustomers.forEach((customer) => {
            customer.isDuplicate = true;
        });
    };
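
    As a design note, the same duplicate detection could also be done in a single pass with a Map keyed on the phone number, which avoids the sort entirely. This is only an illustrative alternative, not part of the answer above; it assumes the same row shape with a customer_phone field.

    // Illustrative alternative (not used above): count each phone number once,
    // then keep the numbers that were seen more than once.
    const findDuplicatePhoneNumbersWithMap = (customers) => {
        const counts = new Map();

        customers.forEach((customer) => {
            const phone = customer.customer_phone;
            counts.set(phone, (counts.get(phone) || 0) + 1);
        });

        return [...counts.entries()]
            .filter(([, count]) => count > 1)
            .map(([phone]) => phone);
    };

    The result could be passed to flagDuplicateCustomers unchanged; the counting approach is O(n) and does not reorder the input, while the sorted-pairs version above works just as well for small files.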