With node.js using fast-csv package, I currently have this parsing function, which reads csv file, change headers, go through each row and fire an event based on row data.
validateRows: (filePath, payload, validators) => new Promise((resolve, reject) => {
const invalidRecords = [];
const validRecords = [];
fs.createReadStream(filePath)
.pipe(csv.parse({
headers: (headers) => mapHeaderToRelated(headers, payload), delimiter: ";", discardUnmappedColumns: true
}))
.validate((data, cb) => {
const errors = validators.reduce((err, func) => [...err, ...func(data)], []);
if (errors.length > 0) {
return cb(null, false, errors);
}
return cb(null, true);
})
.on("error", (error) => {
console.log("There is some error");
reject(error);
})
.on("data", (row) => {
validRecords.push(row);
})
.on("data-invalid", (row, rowNumber, reason) => {
invalidRecords.push({
data: row,
rowNumber: rowNumber,
reason: reason
});
})
.on("end", (rowCount) => {
console.log(`Parsed ${rowCount} rows. Valid Count: ${validRecords.length} Invalid Count: ${invalidRecords.length}`);
resolve({
invalidRecords,
validRecords
});
});
}),
I need to detect records that occur multiple times checking on number. If there is duplication, like multiple rows having the same phone number, they should be considered as invalid and pushed to the invalid records array
Example CSV:
| name | surname | gender | phone |
| ------ | ------- | -------- | ----- |
| John | Doe | Male | 123456 |
| Joh | Deo | Unknown | 123456 |
| Jane | Doe | Female | 999999 |
The output I'd like from the parsed CSV:
{
validRecords: [
{
name: Jane
surname: Doe
gender: Female
phone: 99999
}
]
invalidRecords: [
{
data: {
name: John
surname: Doe
gender: Male
phone: 123456
}
rowNumber: 1,
reason: ["Duplicate data"]
},
{
data: {
name: Joh
surname: Deo
gender: Male
phone: 123456
}
rowNumber: 2,
reason: ["Duplicate data"]
}
]
]
How could I approach this problem?
I've extended my on("end") event with the following and some helper functions under. It works out for now.
.on("end", (rowCount) => {
console.log(`Parsed ${rowCount} rows. Valid Count: ${validCustomers.length} Invalid Count: ${invalidCustomers.length}`);
const allCustomers = [...invalidCustomers, ...validCustomers];
const duplicateNumbers = findDuplicatePhoneNumbers(allCustomers);
flagDuplicateCustomers(allCustomers, duplicateNumbers);
// Valid but duplicate customers are pushed to the invalid customers and reason set to "Duplicate"
const validButDuplicateCustomers = getDuplicateCustomers(validCustomers);
validButDuplicateCustomers.forEach((c) => {
invalidCustomers.push({
data: c,
reason: ["Duplicate"]
});
});
// Add reason "Duplicate" for Invalid and Duplicate customers
const invalidAndDuplicateCustomers = getDuplicateCustomers(invalidCustomers);
invalidAndDuplicateCustomers.forEach((c) => {
if (c.reason) {
c.reason = [...c.reason, "Duplicate"];
}
});
const validAndNotDuplicate = getNonDuplicateCustomers(validCustomers);
resolve({
invalidCustomers: invalidCustomers,
validCustomers: validAndNotDuplicate
});
});
And the helper methods are
const getDuplicateCustomers = (customers) => customers.filter((customer) => customer.isDuplicate);
const getNonDuplicateCustomers = (records) => records.filter((record) => !record.isDuplicate);
const findDuplicatePhoneNumbers = (customers) => {
let duplicates = [];
const sortedCustomers = customers.sort((a, b)=> a.customer_phone - b.customer_phone);
sortedCustomers.forEach((customer, index, array) => {
const nextCustomer = array[index + 1];
if (!nextCustomer) {
return;
}
if (customer.customer_phone === nextCustomer.customer_phone) {
duplicates.push(customer);
}
});
const duplicatePhoneNumbers = duplicates.map((customer) => customer.customer_phone);
const uniqueDuplicatePhoneNumbers = [...new Set(duplicatePhoneNumbers)];
return uniqueDuplicatePhoneNumbers;
};
const flagDuplicateCustomers = (customers, duplicateNumbers) => {
if (!duplicateNumbers) {
return;
}
if (duplicateNumbers.length === 0) {
return;
}
const duplicateCustomers = customers.filter((customer) => duplicateNumbers.includes(customer.customer_phone));
duplicateCustomers.forEach((customer) => {
customer.isDuplicate = true;
});
};