Tags: java, performance, postgresql, dataset, google-books

How to improve the speed of this code?


I'm trying to import all of the googlebooks 1-gram files into a PostgreSQL database. I wrote the following Java code for that:

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;

public class ToPostgres {

    public static void main(String[] args) throws Exception {
        String filePath = "./";
        List<String> files = new ArrayList<String>();
        for (int i = 0; i < 10; i++) {
            files.add(filePath + "googlebooks-eng-all-1gram-20090715-" + i + ".csv");
        }
        Connection c = null;
        try {
            c = DriverManager.getConnection("jdbc:postgresql://localhost/googlebooks",
                    "postgres", "xxxxxx");
        } catch (SQLException e) {
            e.printStackTrace();
        }

        if (c != null) {
            try {
                PreparedStatement wordInsert = c.prepareStatement(
                    "INSERT INTO words (word) VALUES (?)", Statement.RETURN_GENERATED_KEYS
                );
                PreparedStatement countInsert = c.prepareStatement(
                    "INSERT INTO wordcounts (word_id, \"year\", total_count, total_pages, total_books) " +
                    "VALUES (?,?,?,?,?)"
                );
                String lastWord = "";
                Long lastId = -1L;
                for (String filename : files) {
                    BufferedReader input = new BufferedReader(new FileReader(new File(filename)));
                    String line;
                    while ((line = input.readLine()) != null) {
                        String[] data = line.split("\t");
                        Long id = -1L;
                        if (lastWord.equals(data[0])) {
                            // Same word as the previous line: reuse its id.
                            id = lastId;
                        } else {
                            // New word: insert it and fetch the generated key.
                            wordInsert.setString(1, data[0]);
                            wordInsert.executeUpdate();
                            ResultSet resultSet = wordInsert.getGeneratedKeys();
                            if (resultSet != null && resultSet.next()) {
                                id = resultSet.getLong(1);
                            }
                        }
                        countInsert.setLong(1, id);
                        countInsert.setInt(2, Integer.parseInt(data[1]));
                        countInsert.setInt(3, Integer.parseInt(data[2]));
                        countInsert.setInt(4, Integer.parseInt(data[3]));
                        countInsert.setInt(5, Integer.parseInt(data[4]));
                        countInsert.executeUpdate();
                        lastWord = data[0];
                        lastId = id;
                    }
                    input.close();
                }
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }

}

However, after running this for ~3 hours it had only placed 1,000,000 entries in the wordcounts table, while the full 1-gram dataset is 500,000,000 lines. At that rate, importing everything would take about 62.5 days. I could accept about a week, but two months? I think I'm doing something seriously wrong here (I do have a server that runs 24/7, so I can actually run it for that long, but faster would be nice XD).
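Working the numbers out:

    1,000,000 rows / 3 hours ≈ 93 rows/second
    500,000,000 rows / 93 rows/second ≈ 5,400,000 seconds ≈ 62.5 days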

EDIT: This is the code I solved it with:

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;

public class ToPostgres {

    public static void main(String[] args) throws Exception {
        String filePath = "./";
        List<String> files = new ArrayList<String>();
        for (int i = 0; i < 10; i++) {
            files.add(filePath + "googlebooks-eng-all-1gram-20090715-" + i + ".csv");
        }
        Connection c = null;
        try {
            c = DriverManager.getConnection("jdbc:postgresql://localhost/googlebooks",
                    "postgres", "xxxxxx");
        } catch (SQLException e) {
            e.printStackTrace();
        }

        if (c != null) {
            // Disable autocommit so rows can be committed in large batches.
            c.setAutoCommit(false);
            try {
                // Generate word ids in the application instead of asking the
                // database for generated keys, saving a round trip per word.
                PreparedStatement wordInsert = c.prepareStatement(
                    "INSERT INTO words (id, word) VALUES (?,?)"
                );
                PreparedStatement countInsert = c.prepareStatement(
                    "INSERT INTO wordcounts (word_id, \"year\", total_count, total_pages, total_books) " +
                    "VALUES (?,?,?,?,?)"
                );
                String lastWord = "";
                Long id = 0L;
                for (String filename : files) {
                    BufferedReader input = new BufferedReader(new FileReader(new File(filename)));
                    String line;
                    int i = 0;
                    while ((line = input.readLine()) != null) {
                        String[] data = line.split("\t");
                        if (!lastWord.equals(data[0])) {
                            // New word: assign the next id and insert it.
                            id++;
                            wordInsert.setLong(1, id);
                            wordInsert.setString(2, data[0]);
                            wordInsert.executeUpdate();
                        }
                        countInsert.setLong(1, id);
                        countInsert.setInt(2, Integer.parseInt(data[1]));
                        countInsert.setInt(3, Integer.parseInt(data[2]));
                        countInsert.setInt(4, Integer.parseInt(data[3]));
                        countInsert.setInt(5, Integer.parseInt(data[4]));
                        countInsert.executeUpdate();
                        lastWord = data[0];
                        if (i % 10000 == 0) {
                            // Commit every 10,000 lines.
                            c.commit();
                        }
                        if (i % 100000 == 0) {
                            System.out.println(i + " mark file " + filename);
                        }
                        i++;
                    }
                    input.close();
                    c.commit();
                }
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }

}

I reached 1.5 million rows in about 15 minutes now. That's fast enough for me, thanks all!


Solution

  • JDBC connections have autocommit enabled by default, which carries a per-statement overhead. Try disabling it:

    c.setAutoCommit(false);
    

    then commit in batches, something along the lines of:

    long ops = 0;

    for (String filename : files) {
        // ...
        while ((line = input.readLine()) != null) {
            // insert some stuff...

            ops++;

            if (ops % 1000 == 0) {
                c.commit();
            }
        }
    }

    c.commit();
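
  • Beyond batching commits, JDBC can also batch the INSERT statements themselves via addBatch()/executeBatch(), cutting the number of client-server round trips. A minimal sketch of the idea, reusing countInsert, data, ops, and c from the snippets above (an illustration, not tested against this schema):

    // Queue the row instead of executing it immediately.
    countInsert.setLong(1, id);
    countInsert.setInt(2, Integer.parseInt(data[1]));
    countInsert.setInt(3, Integer.parseInt(data[2]));
    countInsert.setInt(4, Integer.parseInt(data[3]));
    countInsert.setInt(5, Integer.parseInt(data[4]));
    countInsert.addBatch();

    ops++;
    if (ops % 1000 == 0) {
        countInsert.executeBatch(); // sends 1,000 queued inserts in one round trip
        c.commit();
    }

    and after the loops, flush whatever is still queued:

    countInsert.executeBatch();
    c.commit();

    For a load of this size, PostgreSQL's COPY command is usually faster still than any INSERT-based approach, at the cost of first preparing the data in a form COPY accepts.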