Search code examples
javascript, java, parsing, jsoup, htmlunit

href field missing when I get the page using jsoup or htmlunit


I'm trying to parse google images search result.

I'm trying to get the href attribute of an element. I've noticed that the href attribute is missing when I fetch the page programmatically (this happens with both jsoup and HtmlUnit).
When I compare the element from the page fetched programmatically through Java with the same element in the page loaded by a real browser, the only difference is the missing href attribute; everything else is identical.

The href attribute (IMAGE_LINK) is the following: /imgres?imgurl=http%3A%2F%2Fcdn.zonarutoppuden.com%2Fns%2Fpeliculas-naruto-shippuden.jpg&imgrefurl=http%3A%2F%2Fwww.zonarutoppuden.com%2F2010%2F10%2Fnaruto-shippuden-peliculas.html&docid=JR8NPqKrF3ac_M&tbnid=0EPPOYQcflXkMM%3A&w=900&h=600&bih=638&biw=1275&ved=0ahUKEwih9O2e88_OAhWMExoKHRLGAGQQMwg2KAMwAw&iact=mrc&uact=8

Could this be an issue with the JavaScript engine? Or does the website use some kind of anti-scraping mechanism?

Snippet Java Code:

// Load the page with HtmlUnit (emulating Chrome) and print the resulting DOM as XML.
WebClient webClient = new WebClient(BrowserVersion.CHROME);
HtmlPage page1=null;

        try {
            // Get the first page
            page1 = webClient.getPage(URL);
            // Wait for background JavaScript only after the page has been loaded;
            // before getPage() there is no page whose scripts could be waited for.
            webClient.waitForBackgroundJavaScript(50000);
            System.out.println(page1.asXml());
        } catch (FailingHttpStatusCodeException e) {
            e.printStackTrace();
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }

Snippet Html Code (Real Browser):

<a jsaction="fire.ivg_o;mouseover:str.hmov;mouseout:str.hmou" class="rg_l" style="width: 134px; height: 201px; left: 0px; background: rgb(128, 128, 128);" href="IMAGE_LINK"> CONTENT... </a>

Snippet Html Code (Page fetched programmatically):

<a jsaction="fire.ivg_o;mouseover:str.hmov;mouseout:str.hmou" class="rg_l" style="width: 134px; height: 201px; left: 0px; background: rgb(128, 128, 128);"> CONTENT... </a>

Thank you.


Solution

  • For each search result there is a <div class="rg_meta"> containing a JSON object, which also holds the image URL. Using a JSON parser such as json-simple to parse that object, the following code prints the image URLs:

    // Fetch the Google image search result page with jsoup and print the
    // original image URL stored in each result's div.rg_meta JSON object.
    String searchTerm = "naruto shippuden";

    try {
        // URL-encode the term so characters such as '&' or '#' cannot break the
        // query string; a space is still sent as '+', exactly as before.
        String searchUrl = "https://www.google.com/search?site=imghp&tbm=isch&source=hp&biw=1920&bih=955&q="
                + java.net.URLEncoder.encode(searchTerm, "UTF-8") + "&gws_rd=cr";

        Document doc = Jsoup.connect(searchUrl)
                .userAgent("Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36")
                .referrer("https://www.google.com/").get();

        // A single parser instance is reused for every result.
        JSONParser parser = new JSONParser();

        for (Element result : doc.select("div.rg_meta")) {

            // div.rg_meta contains a JSON object, which also holds the image url
            JSONObject obj = (JSONObject) parser.parse(result.text());

            String imageUrl = (String) obj.get("ou");

            // just printing out the url to demonstrate the approach
            System.out.println("imageUrl: " + imageUrl);
        }

    } catch (IOException e1) {
        // Also covers the UnsupportedEncodingException declared by URLEncoder.encode.
        e1.printStackTrace();
    } catch (ParseException e) {
        e.printStackTrace();
    }
    

    Output:

    imageUrl: http://ib3.huluim.com/show_key_art/1603?size=1600x600&region=US
    imageUrl: http://cdn.zonarutoppuden.com/ns/peliculas-naruto-shippuden.jpg
    imageUrl: http://www.saiyanisland.com/news/wp-content/uploads2/2014/12/Naruto-Sasuke.jpg
    ...
    

    Update

    Since the jsaction attribute doesn't seem to play nicely with HtmlUnit, I would suggest using PhantomJS instead. Just download the binary for your OS and create a script file.

    Create a page.js file:

    // PhantomJS script: opens a Google image search, scrolls down every 500 ms
    // until the last result tile stops changing, then writes the rendered DOM
    // to output.html and exits.
    var page = require('webpage').create();
    var fs = require('fs');

    page.settings.userAgent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36';

    page.zoomFactor = 0.1;

    page.viewportSize = {
      width: 1920,
      height: 1080
    };

    var divCount="-1";        // data-ri of the last tile seen on the previous tick
    var topPosition=0;        // current vertical scroll offset
    var unchangedCounter=0;   // consecutive ticks without a new last tile

    page.open('https://www.google.com/search?site=imghp&tbm=isch&source=hp&q=naruto+shippuden&gws_rd=cr', function(status) {
        console.log("Status: " + status);
        if(status === "success") {

            window.setInterval(function() {

                // data-ri of the last result tile, or null while no tile exists.
                var newDivCount = page.evaluate(function() {
                    var divs = document.querySelectorAll(".rg_di.rg_bx.rg_el.ivg-i");
                    if(divs.length === 0) {
                        return null;
                    }
                    return divs[divs.length-1].getAttribute("data-ri");
                });

                // Scroll one viewport height further down.
                topPosition = topPosition + 1080;

                page.scrollPosition = {
                    top: topPosition,
                    left: 0
                };

                if(newDivCount===divCount){
                    // Nothing new appeared: click the element with id "smb" if present.
                    page.evaluate(function() {
                        var button = document.querySelector("#smb");
                        // querySelector returns null (not undefined) when nothing matches.
                        if(button !== null) {
                            button.click();
                            return true;
                        }else{
                            return false;
                        }
                    });

                    if(unchangedCounter===5){
                        // Still unchanged after several ticks: save the DOM and stop.
                        console.log(newDivCount);
                        var path = 'output.html';
                        fs.write(path, page.content, 'w');
                        phantom.exit();
                    }else{
                        unchangedCounter=unchangedCounter+1;
                    }
                }else{
                    unchangedCounter=0;
                }
                divCount = newDivCount;

            }, 500);
        }else{
            // Exit on a failed load; otherwise the process never terminates
            // and a caller waiting on it blocks forever.
            phantom.exit(1);
        }
    });
    

    Now we execute the script file with phantomJs and parse the result as before with jsoup:

    // Run page.js with PhantomJS, then parse the saved page with jsoup and
    // print the href of every result link.
    try {
        // change path to phantomjs binary and your script file
        Process process = new ProcessBuilder("bin\\phantomjs", "page.js")
                .inheritIO() // forward the child's output so a full pipe buffer cannot stall it
                .start();
        int exitCode = process.waitFor();

        if (exitCode != 0) {
            System.err.println("phantomjs exited with code " + exitCode);
        } else {
            Document doc = Jsoup.parse(new File("output.html"),"UTF-8"); // output.html is written by page.js

            int resultCount = 0;
            for (Element element : doc.select("div.rg_di.rg_bx.rg_el.ivg-i a")) {
                System.out.println(element.attr("href"));
                resultCount++;
            }
            System.out.println("Number of results: " + resultCount);
        }
    } catch (IOException | InterruptedException e) {
        e.printStackTrace();
    }
    

    Output:

    /imgres?imgurl=http%3A%2F%2Fib3.huluim.com%2Fshow_key_art%2F1603%3Fsize%3D1600x600%26region%3DUS&imgrefurl=http%3A%2F%2Fwww.hulu.com%2Fnaruto-shippuden&docid=OgW4j66rp7CKkM&tbnid=SElXvYDJj9cR6M%3A&w=1600&h=600&bih=10800&biw=19200&ved=0ahUKEwjX2PXmptPOAhULVxoKHXfmDg8QMwgzKAAwAA&iact=mrc&uact=8
    /imgres?imgurl=http%3A%2F%2Fcdn.zonarutoppuden.com%2Fns%2Fpeliculas-naruto-shippuden.jpg&imgrefurl=http%3A%2F%2Fwww.zonarutoppuden.com%2F2010%2F10%2Fnaruto-shippuden-peliculas.html&docid=JR8NPqKrF3ac_M&tbnid=0EPPOYQcflXkMM%3A&w=900&h=600&bih=10800&biw=19200&ved=0ahUKEwjX2PXmptPOAhULVxoKHXfmDg8QMwg0KAEwAQ&iact=mrc&uact=8
    ...
    Number of results: 463
    

    Update: passing the url as a parameter to the script

    Script page.js

    // PhantomJS script, invoked as: phantomjs page.js <url> <searchParameter>
    // Opens <url>, scrolls down every 500 ms until the last result tile stops
    // changing, then writes the rendered DOM to <searchParameter>.html.
    var page = require('webpage').create();
    var fs = require('fs');
    var system = require('system');

    var url = "";
    var searchParameter = "";

    // system.args[0] is the script name, followed by the two expected arguments.
    if (system.args.length === 3) {
        url=system.args[1];
        searchParameter=system.args[2];
    }

    page.settings.userAgent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36';

    page.zoomFactor = 0.1;

    page.viewportSize = {
      width: 1920,
      height: 1080
    };

    var divCount="-1";        // data-ri of the last tile seen on the previous tick
    var topPosition=0;        // current vertical scroll offset
    var unchangedCounter=0;   // consecutive ticks without a new last tile

    if(url==="" || searchParameter===""){
        console.log("Usage: phantomjs page.js <url> <searchParameter>");
        phantom.exit(1);
    }else{
        // page.open lives in the else branch so it can never run with an empty URL.
        page.open(url, function(status) {
            console.log("Status: " + status);
            if(status === "success") {

                window.setInterval(function() {

                    // data-ri of the last result tile, or null while no tile exists.
                    var newDivCount = page.evaluate(function() {
                        var divs = document.querySelectorAll(".rg_di.rg_bx.rg_el.ivg-i");
                        if(divs.length === 0) {
                            return null;
                        }
                        return divs[divs.length-1].getAttribute("data-ri");
                    });

                    // Scroll one viewport height further down.
                    topPosition = topPosition + 1080;

                    page.scrollPosition = {
                        top: topPosition,
                        left: 0
                    };

                    if(newDivCount===divCount){
                        // Nothing new appeared: click the element with id "smb" if present.
                        page.evaluate(function() {
                            var button = document.querySelector("#smb");
                            // querySelector returns null (not undefined) when nothing matches.
                            if(button !== null) {
                                button.click();
                                return true;
                            }else{
                                return false;
                            }
                        });

                        if(unchangedCounter===5){
                            // Still unchanged after several ticks: save the DOM and stop.
                            var path = searchParameter+'.html';
                            fs.write(path, page.content, 'w');
                            phantom.exit();
                        }else{
                            unchangedCounter=unchangedCounter+1;
                        }
                    }else{
                        unchangedCounter=0;
                    }
                    divCount = newDivCount;

                }, 500);
            }else{
                // Report the failed load through a non-zero exit status.
                phantom.exit(1);
            }
        });
    }
    

    Java code

    // Run page.js with the search URL and search term as arguments, then parse
    // the page it saved and print the href of every result link.
    try {
        //change path to phantomjs binary and your script file
        String phantomJSPath = "phantomjs" + File.separator + "bin" + File.separator + "phantomjs";
        String scriptFile = "page.js";

        String searchTerm = "naruto+shippuden";
        String urlParameter = "https://www.google.com/search?site=imghp&tbm=isch&source=hp&gws_rd=cr&q="+searchTerm;

        // Pass every argument separately: a single command string would be split
        // on whitespace, breaking paths or search terms that contain spaces.
        Process process = new ProcessBuilder(phantomJSPath, scriptFile, urlParameter, searchTerm)
                .inheritIO() // forward the child's output so a full pipe buffer cannot stall it
                .start();
        int exitCode = process.waitFor();

        if (exitCode != 0) {
            System.err.println("phantomjs exited with code " + exitCode);
        } else {
            Document doc = Jsoup.parse(new File(searchTerm + ".html"),"UTF-8"); // <searchTerm>.html is written by page.js

            int resultCount = 0;
            for (Element element : doc.select("div.rg_di.rg_bx.rg_el.ivg-i a")) {
                System.out.println(element.attr("href"));
                resultCount++;
            }
            System.out.println("Number of results: " + resultCount);
        }
    } catch (IOException | InterruptedException e) {
        e.printStackTrace();
    }