Search code examples
java jsoup

Parsing span inside div using Jsoup


Given this HTML:

<div id="cat-product-list" alt1="356623" class="product-list list_all_items_price price_new"><span id="wholesale_11_member_price" class="index-price special_price final_price" price="US$5.25"><strong class="final_price_strong">US$5.25</strong><b class="show_vip">(vip)</b></span><span id="wholesale_12_member_price" class="index-price special_price final_price" price="US$4.90" style="display: none"><strong class="final_price_strong">US$4.90</strong><b class="show_vip">(vip)</b></span><span id="wholesale_13_member_price" class="index-price special_price final_price" price="US$4.55" style="display: none"><strong class="final_price_strong">US$4.55</strong><b class="show_vip">(vip)</b></span><span id="wholesale_14_member_price" class="index-price special_price final_price" price="US$4.20" style="display: none"><strong class="final_price_strong">US$4.20</strong><b class="show_vip">(vip)</b></span><span id="shop_price_member_price_on" class="index-price shop_price" price="US$7.00"><strike>US$7.00</strike></span></div>

I am trying to select the first span inside the div and then get the value of its strong element. So far I have managed to scrape the other fields successfully; however, I couldn't get this one to work:

        // Fetch the category page and narrow the document down to the products container.
        Document d = Jsoup.connect("http://www.emmacloth.com/Clothing-vc-7061.html?icn=clothing&ici=ec_navbar05").timeout(6000).get();
        Elements elements = d.select("div#productsContent1_goods.products_category");
        // Each div.box-product-list.list_all_items inside the container is processed as one entry.
        for (Element element : elements.select("div.box-product-list.list_all_items")) {
            System.out.println("start");
            String productImage = element.select("div.goods_aImg a img").attr("src");
            String productname = element.select("div.goods_mz a").attr("title");
            String productUrl = "http://www.emmacloth.com" + element.select("div.goods_mz a").attr("href");
            // Earlier attempt, kept for reference (rejoined onto one line so it stays inside the comment):
            // String productPrice = element.select("div.product-list.list_all_items_price.price_new >span.index-price.special_price.final_price").toString();
            Elements priceElements = element.select(
                    "div.product-list.list_all_items_price.price_new > span.index-price.special_price.final_price"
            );

            // Print the "price" attribute of every span matched by the selector above.
            for (Element priceElement : priceElements) {
                System.out.println(priceElement.attr("price"));
            }
            // System.out.println(productPrice);
        }
}

Solution

  • Within this div you are looking for the spans that have the following classes: index-price special_price final_price, and from those (I think) you want to extract the price.

    Given the html provided in your question, the following code ...

    // Sample markup copied from the question: a single price <div> wrapping four
    // "final_price" spans and one trailing "shop_price" span.
    String markup =
        "<div id=\"cat-product-list\" alt1=\"356623\" class=\"product-list list_all_items_price price_new\">"
        + "<span id=\"wholesale_11_member_price\" class=\"index-price special_price final_price\" price=\"US$5.25\">"
        + "<strong class=\"final_price_strong\">US$5.25</strong><b class=\"show_vip\">(vip)</b></span>"
        + "<span id=\"wholesale_12_member_price\" class=\"index-price special_price final_price\" price=\"US$4.90\" style=\"display: none\">"
        + "<strong class=\"final_price_strong\">US$4.90</strong><b class=\"show_vip\">(vip)</b></span>"
        + "<span id=\"wholesale_13_member_price\" class=\"index-price special_price final_price\" price=\"US$4.55\" style=\"display: none\">"
        + "<strong class=\"final_price_strong\">US$4.55</strong><b class=\"show_vip\">(vip)</b></span>"
        + "<span id=\"wholesale_14_member_price\" class=\"index-price special_price final_price\" price=\"US$4.20\" style=\"display: none\">"
        + "<strong class=\"final_price_strong\">US$4.20</strong><b class=\"show_vip\">(vip)</b></span>"
        + "<span id=\"shop_price_member_price_on\" class=\"index-price shop_price\" price=\"US$7.00\">"
        + "<strike>US$7.00</strike></span>"
        + "</div>";

    Document parsed = Jsoup.parse(markup);

    // Child combinator: only spans that sit directly under the price div and carry
    // all three classes (index-price, special_price, final_price) are matched, so the
    // trailing shop_price span is not selected.
    String priceSpanQuery =
        "div.product-list.list_all_items_price.price_new > span.index-price.special_price.final_price";
    Elements priceSpans = parsed.select(priceSpanQuery);

    // Emit the "price" attribute of each matched span, one per line.
    for (int i = 0; i < priceSpans.size(); i++) {
        System.out.println(priceSpans.get(i).attr("price"));
    }
    

    ... will print out the product prices:

    US$5.25
    US$4.90
    US$4.55
    US$4.20
    

    Update

    In response to his comment:

    For some reason it's not working for the whole website, can you check my modified question

    The following code ...

    // Load the category page, then visit every product box that is a direct child
    // of the products container.
    String categoryUrl = "http://www.emmacloth.com/Clothing-vc-7061.html?icn=clothing&ici=ec_navbar05";
    Document page = Jsoup.connect(categoryUrl).timeout(6000).get();
    for (Element box : page.select("div#productsContent1_goods.products_category > div.box-product-list.list_all_items")) {
        System.out.println("start");

        // Image source taken from the <img> nested under the goods_aImg link.
        System.out.println(box.select("div.goods_aImg > a > img").attr("src"));
        // Product name comes from the title attribute of the goods_mz link.
        System.out.println(box.select("div.goods_mz > a").attr("title"));
        // The link's href is appended to the site host to form the printed URL.
        System.out.println("http://www.emmacloth.com" + box.select("div.goods_mz > a").attr("href"));
    }
    

    ... will print:

    http://img.ltwebstatic.com/images/pi/201710/3b/15090086488079557831_thumbnail_220x293.jpg
    Pearl Embellished Bow Tied Bell Cuff Blouse
    http://www.emmacloth.com/Pearl-Embellished-Bow-Tied-Bell-Cuff-Blouse-p-403325-cat-1733.html
    ... etc
    

    So far, so good. But what about the price? If you look at the source of this webpage you'll see that the price element is dynamic content which is provided by the category_price JS function on that page. Jsoup only parses the HTML returned by the server; it does not execute JavaScript. So, that element does not exist statically and hence cannot be read by Jsoup. In order to read dynamic content you'll have to use a web driver such as Selenium.