Search code examples
javaopencsv

How to merge csv files based on unique columns in java


I have 3 different very large files and my requirement is to merge them together in one single csv file. All these 3 files have different number of columns but they contain some unique column between each other.

Example file 1:

StudentId   StudentName   ClassId
1           Ajay           6
2           Vinay          8
3           Geeta          6
4           Sameer         7

file 2:

ClassId       ClassColor
6           Blue
7           Grey
8           White

file 3:

HouseId    HouseName    StudentId
1          Knights       4
2          Spartans      1
3          Samurai       2
4          Trojans       3

As you can see file1 and file2 has classId in common And file1 and file2 has studentId in common. Now based on this information I want have result like this by joining files based on common key.

StudentId     StudentName    ClassId     ClassColor    HouseId    HouseName
1             Ajay           6           Blue          2          Spartans
2             Vinay          8           White         3          Samurai
3             Geeta          6           Blue          4          Trojans
4             Sameer         7           Grey          1          Knights

This scenario is completely new to me, I did some research but didn't find solution for it in java.


Solution

  • At this stage I am able to join two files with below implementation:

    Class to hold csv file data:

    import java.util.LinkedHashMap;
    import java.util.Map;
    public class CsvVo {
        private Map<String, String> keyVal;
        public CsvVo(String id) {
            keyVal = new LinkedHashMap<>();// you may also use HashMap if you don't need to keep order
        }
    
        public Map<String, String> getKeyVal() {
            return keyVal;
        }
        public void setKeyVal(Map<String, String> keyVal) {
            this.keyVal = keyVal;
        }
        public void put(String key, String val) {
            keyVal.put(key, val);
        }
        public String get(String key) {
            return keyVal.get(key);
        }
    }
    

    CSV parser to read files:

    import java.io.BufferedReader;
    import java.io.File;
    import java.io.FileReader;
    import java.io.FileWriter;
    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.List;
    import java.util.Set;
    public class CsvParser {
        public static List<CsvVo> getRecodrsFromACsv(File file, List<String> keys) throws IOException {
            BufferedReader br = new BufferedReader(new FileReader(file));
            List<CsvVo> records = new ArrayList<>();
            boolean isHeader = true;
            String line = null;
            while ((line = br.readLine()) != null) {
                if (isHeader) {// first line is header
                    isHeader = false;
                    continue;
                }
                CsvVo record = new CsvVo(file.getName());
                String[] lineSplit = line.split(",");
                for (int i = 0; i < lineSplit.length; i++) {
                    record.put(keys.get(i), lineSplit[i]);
                }
                records.add(record);
            }
            br.close();
            return records;
        }
        public static List<String> getHeadersFromACsv(File file) throws IOException {
            BufferedReader br = new BufferedReader(new FileReader(file));
            List<String> headers = null;
            String line = null;
            while ((line = br.readLine()) != null) {
                String[] lineSplit = line.split(",");
                headers = new ArrayList<>(Arrays.asList(lineSplit));
                break;
            }
            br.close();
            return headers;
        }
        public static void writeToCsv(final File file, final Set<String> headers, final List<CsvVo> records)
                throws IOException {
            FileWriter csvWriter = new FileWriter(file);
            // write headers
            String sep = "";
            String[] headersArr = headers.toArray(new String[headers.size()]);
            for (String header : headersArr) {
                csvWriter.append(sep);
                csvWriter.append(header);
                sep = ",";
            }
            csvWriter.append("\n");
            // write records at each line
            for (CsvVo record : records) {
                sep = "";
                for (int i = 0; i < headersArr.length; i++) {
                    csvWriter.append(sep);
                    csvWriter.append(record.get(headersArr[i]));
                    sep = ",";
                }
                csvWriter.append("\n");
            }
            csvWriter.flush();
            csvWriter.close();
        }
    }
    

    Main Class to test the functionality:

    import java.io.File;
    import java.io.IOException;
    import java.util.*;
    
    public class CsvApplication {
        public static void main(String[] args) throws IOException {
            File csv1 = new File("/Users/guru/Desktop/Standard.csv");
            File csv2 = new File("/Users/guru/Desktop/Match.csv");
            List<String> csv1Headers = CsvParser.getHeadersFromACsv(csv1);
            //csv1Headers.forEach(h -> System.out.print(h + " "));
            //System.out.println();
            List<String> csv2Headers = CsvParser.getHeadersFromACsv(csv2);
            //csv2Headers.forEach(h -> System.out.print(h + " "));
            //System.out.println();
            List<String> allCsvHeaders = new ArrayList<>();
            allCsvHeaders.addAll(csv1Headers);
            allCsvHeaders.addAll(csv2Headers);
            //allCsvHeaders.forEach(h -> System.out.print(h + " "));
            //System.out.println();
            Set<String> uniqueHeaders = new HashSet<>(allCsvHeaders);
            //uniqueHeaders.forEach(h -> System.out.print(h + " "));
            //System.out.println();
            List<CsvVo> csv1Records = CsvParser.getRecodrsFromACsv(csv1, csv1Headers);
            List<CsvVo> csv2Records = CsvParser.getRecodrsFromACsv(csv2, csv2Headers);
            List<CsvVo> allCsvRecords = new ArrayList<>();
            String key = "StudentID";
            getUniqueRecordsForKey(key, csv1Records, csv2Records,allCsvRecords);
           // allCsvRecords.addAll(csv1Records);
            //allCsvRecords.addAll(csv2Records);
            CsvParser.writeToCsv(new File("/Users/guru/Desktop/Output.csv"), uniqueHeaders, allCsvRecords);
        }
    
        private static void getUniqueRecordsForKey(String key, List<CsvVo> csv1Records, List<CsvVo> csv2Records, List<CsvVo> allCsvRecords) {
    
            for (CsvVo record1: csv1Records){
                for(CsvVo record2: csv2Records){
                    if( !record1.getKeyVal().get(key).isEmpty() && record1.getKeyVal().get(key).equals(record2.getKeyVal().get(key))){
                        HashMap<String ,String> mergedMap = new LinkedHashMap();
                        CsvVo mergedRecord = new CsvVo(record2.getKeyVal().get(key));
                        mergeRecords(mergedMap, record1,record2);
                        mergedRecord.setKeyVal(mergedMap);
                        csv2Records.remove(record2);
                        allCsvRecords.add(mergedRecord);
                        break;
                    }
                }
            }
        }
    
        private static void mergeRecords(HashMap mergedMap,CsvVo record1, CsvVo record2 ){
            mergedMap.putAll(record1.getKeyVal());
            mergedMap.putAll(record2.getKeyVal());
        }
    }
    

    Here we are loading entire data of files to lists and then comparing data based on the column we want to join. In this case i have taken StudentID.

    Not sure if this is good approach for big files.