Search code examples
javajsoup

Extract values from html tags using java with jsoup


I'm new to the jsoup library (jsoup-1.14.3).

I have this HTML:

<html><head><title>Alfresco Content Repository</title><style>body { font-family: Arial, Helvetica; font-size: 12pt; background-color: white; }
table { font-family: Arial, Helvetica; font-size: 12pt; background-color: white; }
.listingTable { border: solid black 1px; }
.textCommand { font-family: verdana; font-size: 10pt; }
.textLocation { font-family: verdana; font-size: 11pt; font-weight: bold; color: #2a568f; }
.textData { font-family: verdana; font-size: 10pt; }
.tableHeading { font-family: verdana; font-size: 10pt; font-weight: bold; color: white; background-color: #2a568f; }
.rowOdd { background-color: #eeeeee; }
.rowEven { background-color: #dddddd; }
</style></head>
<body>
<table cellspacing='2' cellpadding='3' border='0' width='100%'>
<tr><td colspan='4' class='textLocation'>Directory listing for /rep</td></tr>
<tr><td height='10' colspan='4'></td></tr></table><table cellspacing='2' cellpadding='3' border='0' width='100%' class='listingTable'>
<tr><td class='tableHeading' width='*'>Name</td><td class='tableHeading' width='10%'>Size</td><td class='tableHeading' width='20%'>Type</td><td class='tableHeading' width='25%'>Modified Date</td></tr>
<tr class='rowOdd'><td class='textData'><a href="/alfresco/webdav/rep/ED">ED</a></td><td class='textData'>&nbsp;</td><td class='textData'>&nbsp;</td><td class='textData'>Thu, 05 Jan 2017 11:11:14 GMT</td></tr>
<tr class='rowEven'><td class='textData'><a href="/alfresco/webdav/rep/FLOW%20CHART">FLOW CHART</a></td><td class='textData'>&nbsp;</td><td class='textData'>&nbsp;</td><td class='textData'>Thu, 27 Jun 2013 13:30:18 GMT</td></tr>
<tr class='rowOdd'><td class='textData'><a href="/alfresco/webdav/rep/file">file</a></td><td class='textData'>&nbsp;</td><td class='textData'>&nbsp;</td><td class='textData'>Wed, 10 Nov 2021 13:16:49 GMT</td></tr>


</table></body></html>

And I'm trying to get the href of each <a> tag, along with its text and the date in the same row.

For example, given this fragment:

  
<table cellspacing='2' cellpadding='3' border='0' width='100%'>
<tr><td colspan='4' class='textLocation'>Directory listing for /rep</td></tr>
<tr><td height='10' colspan='4'></td></tr></table><table cellspacing='2' cellpadding='3' border='0' width='100%' class='listingTable'>
<tr><td class='tableHeading' width='*'>Name</td><td class='tableHeading' width='10%'>Size</td><td class='tableHeading' width='20%'>Type</td><td class='tableHeading' width='25%'>Modified Date</td></tr>
<tr class='rowOdd'><td class='textData'><a href="/alfresco/webdav/rep/ED">ED</a></td><td class='textData'>&nbsp;</td><td class='textData'>&nbsp;</td><td class='textData'>Thu, 05 Jan 2017 11:11:14 GMT</td></tr>

I want to extract "/alfresco/webdav/rep/ED", "ED", and "Thu, 05 Jan 2017 11:11:14 GMT".


Solution

  • First, you need to parse the HTML, which is a String, into a Document.

    final Document document = Jsoup.parse(html);
    

    Then you need to select all tr tags that contain an a tag.

    final Elements trElements = document.select("tr:has(a)");
    

    Next, you need to iterate over each tr tag found:

    for (final Element trElement : trElements) {
        //Do stuff
    }
    

    For each tr tag, you retrieve the href value of its a tag. But first, you need to retrieve the a tag itself:

    final Element aElement = trElement.select("a").first();
    

    Then, retrieve the value of the href attribute of the a tag.

    final String href = aElement.attr("href");
    

    For the name, retrieve the text content of the a tag:

    final String name = aElement.text();
    

    For the date, you need to retrieve the fourth td tag of the tr tag (index 3, since indexes start at 0):

    final Element dateTdElement = trElement.select("td").get(3);
    

    Then just retrieve its text to get the date content:

    final String date = dateTdElement.text();
    

    NB: The select() method accepts a CSS query. CSS selectors are supported, along with extended syntax such as ':has()' and others. See the jsoup documentation for more detail.

    To sum it all up in one piece of code:

    /**
     * Parses a hard-coded directory-listing page with jsoup and prints, for every
     * table row that contains a link, the link's href, the link's text and the
     * text of the row's fourth cell.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(final String[] args) {
        // The sample page, assembled line by line; String.join inserts the '\n' separators.
        final String html = String.join("\n",
                "<html><head><title>Alfresco Content Repository</title><style>body { font-family: Arial, Helvetica; font-size: 12pt; background-color: white; }",
                "table { font-family: Arial, Helvetica; font-size: 12pt; background-color: white; }",
                ".listingTable { border: solid black 1px; }",
                ".textCommand { font-family: verdana; font-size: 10pt; }",
                ".textLocation { font-family: verdana; font-size: 11pt; font-weight: bold; color: #2a568f; }",
                ".textData { font-family: verdana; font-size: 10pt; }",
                ".tableHeading { font-family: verdana; font-size: 10pt; font-weight: bold; color: white; background-color: #2a568f; }",
                ".rowOdd { background-color: #eeeeee; }",
                ".rowEven { background-color: #dddddd; }",
                "</style></head>",
                "<body>",
                "<table cellspacing='2' cellpadding='3' border='0' width='100%'>",
                "<tr><td colspan='4' class='textLocation'>Directory listing for /rep</td></tr>",
                "<tr><td height='10' colspan='4'></td></tr></table><table cellspacing='2' cellpadding='3' border='0' width='100%' class='listingTable'>",
                "<tr><td class='tableHeading' width='*'>Name</td><td class='tableHeading' width='10%'>Size</td><td class='tableHeading' width='20%'>Type</td><td class='tableHeading' width='25%'>Modified Date</td></tr>",
                "<tr class='rowOdd'><td class='textData'><a href=\"/alfresco/webdav/rep/ED\">ED</a></td><td class='textData'>&nbsp;</td><td class='textData'>&nbsp;</td><td class='textData'>Thu, 05 Jan 2017 11:11:14 GMT</td></tr>",
                "<tr class='rowEven'><td class='textData'><a href=\"/alfresco/webdav/rep/FLOW%20CHART\">FLOW CHART</a></td><td class='textData'>&nbsp;</td><td class='textData'>&nbsp;</td><td class='textData'>Thu, 27 Jun 2013 13:30:18 GMT</td></tr>",
                "<tr class='rowOdd'><td class='textData'><a href=\"/alfresco/webdav/rep/file\">file</a></td><td class='textData'>&nbsp;</td><td class='textData'>&nbsp;</td><td class='textData'>Wed, 10 Nov 2021 13:16:49 GMT</td></tr>",
                "",
                "",
                "</table></body></html>");

        // Only rows holding an <a> element are listing entries; header and spacer rows are skipped.
        final Elements linkRows = Jsoup.parse(html).select("tr:has(a)");
        for (final Element row : linkRows) {
            final Element link = row.select("a").first();
            System.out.println("Href : " + link.attr("href"));
            System.out.println("Name : " + link.text());

            // The "Modified Date" column is the fourth cell (index 3) of each row.
            System.out.println("Date : " + row.select("td").get(3).text());
        }
    }
    

    It prints:

    Href : /alfresco/webdav/rep/ED
    Name : ED
    Date : Thu, 05 Jan 2017 11:11:14 GMT
    Href : /alfresco/webdav/rep/FLOW%20CHART
    Name : FLOW CHART
    Date : Thu, 27 Jun 2013 13:30:18 GMT
    Href : /alfresco/webdav/rep/file
    Name : file
    Date : Wed, 10 Nov 2021 13:16:49 GMT