Search code examples
javajsoup

How to get multiple tables using JSoup from a website


I need to get all 9 tables from:

https://www.basketball-reference.com/players/c/collijo01.html

My current code only retrieves one table. I tried switching .first() to .last(), which didn't work. I also tried using ("table.totals") to grab a table by name, but that failed as well.

/**
 * Fetches the page at {@code url} and writes its first {@code <table>} element
 * to {@code table.csv} in the working directory.
 *
 * <p>The first output line holds the text of the {@code thead tr th} cells; each
 * following line holds the text of the {@code td} cells of one row. Cell text is
 * written as-is (no quoting or escaping), so a cell that itself contains a comma
 * will produce extra columns.
 *
 * @param url address of the page to download
 * @throws IOException if the page cannot be fetched, if it contains no
 *         {@code <table>} element, or if the file cannot be written
 */
public static void getData(String url) throws IOException
{
    String fileName = "table.csv";

    // Fetch and parse before opening the output file, so a failed download
    // does not truncate an existing table.csv.
    Document doc = Jsoup.connect(url).get();
    Element tableElement = doc.select("table").first();

    // Debug dump of the parsed document.
    System.out.println(doc);

    // first() returns null when the selection is empty; fail with a clear message
    // instead of a NullPointerException further down.
    if (tableElement == null) {
        throw new IOException("No <table> element found at " + url);
    }

    // try-with-resources closes the writer even when a write fails part-way.
    try (FileWriter writer = new FileWriter(fileName)) {
        Elements tableHeaderEles = tableElement.select("thead tr th");
        for (int i = 0; i < tableHeaderEles.size(); i++) {
            if (i > 0) {
                writer.append(',');
            }
            writer.append(tableHeaderEles.get(i).text());
        }
        writer.append('\n');
        System.out.println();

        Elements tableRowElements = tableElement.select(":not(thead) tr");
        for (Element row : tableRowElements) {
            // Only <td> cells are exported; a row without any <td> yields an empty line.
            Elements rowItems = row.select("td");
            for (int j = 0; j < rowItems.size(); j++) {
                if (j > 0) {
                    writer.append(',');
                }
                writer.append(rowItems.get(j).text());
            }
            writer.append('\n');
        }
    }
}

I get the first table from the site perfectly, but I am unable to get past that. Does anyone know how to get all the tables, or how to grab tables based on their ID?

EDIT: If anyone wants to test this code's output for themselves, here is a method that reads the generated file back:

 /**
  * Prints the contents of {@code file} to standard output, split on commas,
  * with a {@code |} appended after every token.
  *
  * <p>Only {@code ","} is used as the delimiter, so line breaks in the file are
  * not separators: they stay inside the tokens and are printed unchanged.
  *
  * @param file path of the file to read
  * @throws IOException if the file cannot be opened
  */
 public static void read(String file) throws IOException
 {
    // try-with-resources releases the underlying file handle even if printing fails.
    try (Scanner scanner = new Scanner(new File(file))) {
        scanner.useDelimiter(",");
        while (scanner.hasNext()) {
            System.out.print(scanner.next() + "|");
        }
    }
 }

Solution

  • You've already selected all tables but you're explicitly getting only the first one:

    Element tableElement = doc.select("table").first();
    

    Instead you can easily iterate over all of them:

    Elements tableElements = doc.select("table");
    for (Element tableElement : tableElements) {
       // for each of selected tables
    }
    

    So, after some modifications to give each table a unique filename, the code will look like this:

    
    /**
     * Fetches the page at {@code url} and writes every {@code <table>} that has an
     * {@code id} attribute to its own CSV file in the working directory.
     *
     * <p>Files are named {@code table<N> with id <id>.csv}, where {@code N} counts
     * the exported tables starting at 1. Tables without an id are skipped. In each
     * file the first line holds the text of the {@code thead tr th} cells and every
     * following line holds the text of the {@code td} cells of one row. Cell text is
     * written as-is (no quoting or escaping).
     *
     * @param url address of the page to download
     * @throws IOException if the page cannot be fetched or a file cannot be written
     */
    public static void getData(String url) throws IOException {
        String html = Jsoup.connect(url).execute().body();
        // This page is tricky: it ships tables as commented-out HTML and reveals them
        // with JavaScript. Stripping the comment delimiters before parsing is a blunt
        // workaround that makes those tables visible to Jsoup. replace() is used rather
        // than replaceAll() because the delimiters are literal text, not regex patterns.
        html = html.replace("<!--", "").replace("-->", "");
        Document doc = Jsoup.parse(html);

        // Debug dump of the parsed document; done once here instead of once per table.
        System.out.println(doc);

        Elements tableElements = doc.select("table");
        int number = 1;
        for (Element tableElement : tableElements) {
            String tableId = tableElement.id();
            if (tableId.isEmpty()) {
                // skip table without id
                continue;
            }
            String fileName = "table" + number++ + " with id " + tableId + ".csv";

            // try-with-resources closes each file even when a write fails part-way.
            try (FileWriter writer = new FileWriter(fileName)) {
                Elements tableHeaderEles = tableElement.select("thead tr th");
                for (int i = 0; i < tableHeaderEles.size(); i++) {
                    if (i > 0) {
                        writer.append(',');
                    }
                    writer.append(tableHeaderEles.get(i).text());
                }
                writer.append('\n');
                System.out.println();

                Elements tableRowElements = tableElement.select(":not(thead) tr");
                for (Element row : tableRowElements) {
                    // Only <td> cells are exported; a row without any <td> yields an empty line.
                    Elements rowItems = row.select("td");
                    for (int j = 0; j < rowItems.size(); j++) {
                        if (j > 0) {
                            writer.append(',');
                        }
                        writer.append(rowItems.get(j).text());
                    }
                    writer.append('\n');
                }
            }
        }
    }
    

    Answering your second question:

    grab tables based on ID

    Instead of selecting first table of all tables:

    Element tableElement = doc.select("table").first();
    

    select the table whose id is "advanced":

    Element tableElement = doc.select("table#advanced").first();
    

    Additional advice: The things you give as parameters to select(...) are CSS selectors.