Search code examples
c# .net httpwebrequest webrequest

Bypass incapsula while scraping


I'm scraping a website using the WebBrowser control. However, scraping this way takes a long time, and I only need to read the page source. Because of that, I decided to use HttpWebRequest instead. The first time, I could get the elements I wanted, but on the following attempts the scrape failed. The site I'm scraping is served over SSL, and when I try to scrape it I get the response below instead of the page:

<html>
<head>
<META NAME="robots" CONTENT="noindex,nofollow">
<script>
(function(){function getSessionCookies(){var cookieArray=new Array();var cName=/^\s?incap_ses_/;var c=document.cookie.split(";");for(var i=0;i<c.length;i++){var key=c[i].substr(0,c[i].indexOf("="));var value=c[i].substr(c[i].indexOf("=")+1,c[i].length);if(cName.test(key)){cookieArray[cookieArray.length]=value}}return cookieArray}function setIncapCookie(vArray){var res;try{var cookies=getSessionCookies();var digests=new Array(cookies.length);for(var i=0;i<cookies.length;i++){digests[i]=simpleDigest((vArray)+cookies[i])}res=vArray+",digest="+(digests.join())}catch(e){res=vArray+",digest="+(encodeURIComponent(e.toString()))}createCookie("___utmvc",res,20)}function simpleDigest(mystr){var res=0;for(var i=0;i<mystr.length;i++){res+=mystr.charCodeAt(i)}return res}function createCookie(name,value,seconds){var expires="";if(seconds){var date=new Date();date.setTime(date.getTime()+(seconds*1000));var expires="; expires="+date.toGMTString()}document.cookie=name+"="+value+expires+"; path=/"}function test(o){var res="";var vArray=new Array();for(var j=0;j<o.length;j++){var test=o[j][0];switch(o[j][1]){case"exists":try{if(typeof(eval(test))!="undefined"){vArray[vArray.length]=encodeURIComponent(test+"=true")}else{vArray[vArray.length]=encodeURIComponent(test+"=false")}}catch(e){vArray[vArray.length]=encodeURIComponent(test+"=false")}break;case"value":try{try{res=eval(test);if(typeof(res)==="undefined"){vArray[vArray.length]=encodeURIComponent(test+"=undefined")}else if(res===null){vArray[vArray.length]=encodeURIComponent(test+"=null")}else{vArray[vArray.length]=encodeURIComponent(test+"="+res.toString())}}catch(e){vArray[vArray.length]=encodeURIComponent(test+"=cannot evaluate");break}break}catch(e){vArray[vArray.length]=encodeURIComponent(test+"="+e)}case"plugin_extentions":try{var extentions=[];try{i=extentions.indexOf("i")}catch(e){vArray[vArray.length]=encodeURIComponent("plugin_ext=indexOf is not a function");break}try{var num=navigator.plugins.length 
if(num==0||num==null){vArray[vArray.length]=encodeURIComponent("plugin_ext=no plugins");break}}catch(e){vArray[vArray.length]=encodeURIComponent("plugin_ext=cannot evaluate");break}for(var i=0;i<navigator.plugins.length;i++){if(typeof(navigator.plugins[i])=="undefined"){vArray[vArray.length]=encodeURIComponent("plugin_ext=plugins[i] is undefined");break}var filename=navigator.plugins[i].filename var ext="no extention";if(typeof(filename)=="undefined"){ext="filename is undefined"}else if(filename.split(".").length>1){ext=filename.split('.').pop()}if(extentions.indexOf(ext)<0){extentions.push(ext)}}for(i=0;i<extentions.length;i++){vArray[vArray.length]=encodeURIComponent("plugin_ext="+extentions[i])}}catch(e){vArray[vArray.length]=encodeURIComponent("plugin_ext="+e)}break}}vArray=vArray.join();return vArray}var o=[["navigator","exists"],["navigator.vendor","value"],["navigator.appName","value"],["navigator.plugins.length==0","value"],["navigator.platform","value"],["navigator.webdriver","value"],["platform","plugin_extentions"],["ActiveXObject","exists"],["webkitURL","exists"],["_phantom","exists"],["callPhantom","exists"],["chrome","exists"],["yandex","exists"],["opera","exists"],["opr","exists"],["safari","exists"],["awesomium","exists"],["puffinDevice","exists"],["navigator.cpuClass","exists"],["navigator.oscpu","exists"],["navigator.connection","exists"],["window.outerWidth==0","value"],["window.outerHeight==0","value"],["window.WebGLRenderingContext","exists"],["document.documentMode","value"],["eval.toString().length","value"]];try{setIncapCookie(test(o));document.createElement("img").src="/_Incapsula_Resource?SWKMTFSR=1&e="+Math.random()}catch(e){img=document.createElement("img");img.src="/_Incapsula_Resource?SWKMTFSR=1&e="+e}})();
</script>
<script>
(function() { 
var z="";var b="7472797B766172207868723B76617220743D6E6577204461746528292E67657454696D6528293B766172207374617475733D227374617274223B7661722074696D696E673D6E65772041727261792833293B77696E646F772E6F6E756E6C6F61643D66756E6374696F6E28297B74696D696E675B325D3D22723A222B286E6577204461746528292E67657454696D6528292D74293B646F63756D656E742E637265617465456C656D656E742822696D6722292E7372633D222F5F496E63617073756C615F5265736F757263653F4553324C555243543D363726743D373826643D222B656E636F6465555249436F6D706F6E656E74287374617475732B222028222B74696D696E672E6A6F696E28292B222922297D3B69662877696E646F772E584D4C4874747052657175657374297B7868723D6E657720584D4C48747470526571756573747D656C73657B7868723D6E657720416374697665584F626A65637428224D6963726F736F66742E584D4C4854545022297D7868722E6F6E726561647973746174656368616E67653D66756E6374696F6E28297B737769746368287868722E72656164795374617465297B6361736520303A7374617475733D6E6577204461746528292E67657454696D6528292D742B223A2072657175657374206E6F7420696E697469616C697A656420223B627265616B3B6361736520313A7374617475733D6E6577204461746528292E67657454696D6528292D742B223A2073657276657220636F6E6E656374696F6E2065737461626C6973686564223B627265616B3B6361736520323A7374617475733D6E6577204461746528292E67657454696D6528292D742B223A2072657175657374207265636569766564223B627265616B3B6361736520333A7374617475733D6E6577204461746528292E67657454696D6528292D742B223A2070726F63657373696E672072657175657374223B627265616B3B6361736520343A7374617475733D22636F6D706C657465223B74696D696E675B315D3D22633A222B286E6577204461746528292E67657454696D6528292D74293B6966287868722E7374617475733D3D323030297B706172656E742E6C6F636174696F6E2E72656C6F616428297D627265616B7D7D3B74696D696E675B305D3D22733A222B286E6577204461746528292E67657454696D6528292D74293B7868722E6F70656E2822474554222C222F5F496E63617073756C615F5265736F757263653F535748414E45444C3D3234303839313831303338343234373432322C31343334313032313035373436353238383130322C31343539333639393136373936303235393635352C313935393639222C66616C7365293B7868
722E73656E64286E756C6C297D63617463682863297B7374617475732B3D6E6577204461746528292E67657454696D6528292D742B2220696E6361705F6578633A20222B633B646F63756D656E742E637265617465456C656D656E742822696D6722292E7372633D222F5F496E63617073756C615F5265736F757263653F4553324C555243543D363726743D373826643D222B656E636F6465555249436F6D706F6E656E74287374617475732B222028222B74696D696E672E6A6F696E28292B222922297D3B";for (var i=0;i<b.length;i+=2){z=z+parseInt(b.substring(i, i+2), 16)+",";}z = z.substring(0,z.length-1); eval(eval('String.fromCharCode('+z+')'));})();
</script></head>
<body>
<iframe style="display:none;visibility:hidden;" src="//content.incapsula.com/jsTest.html" id="gaIframe"></iframe>
</body></html>

Here is my code:

            // WARNING: this callback accepts ANY server certificate for every request made by
            // this process, i.e. it switches TLS certificate validation off. Keep it only for
            // local debugging against a host you trust; remove it otherwise.
            ServicePointManager.ServerCertificateValidationCallback += (sender, certificate, chain, sslPolicyErrors) => { return true; };

            string address = "https://www.example.com";

            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(address);
            //request.Proxy = WebProxy.GetDefaultProxy();

            request.UserAgent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.2.12) Gecko/20101026 Firefox/3";

            // The Accept property takes only the header VALUE (no "Accept:" prefix), and each
            // assignment overwrites the previous one. Other headers must be added separately.
            request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
            request.Headers.Add("Accept-Language", "tr-TR,tr;q=0.8,en-US;q=0.5,en;q=0.3");

            // Let the framework send Accept-Encoding for gzip/deflate and decode the body
            // transparently. "br" is deliberately not advertised: HttpWebRequest cannot decode
            // Brotli, so the body would come back as unreadable bytes.
            request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
            request.Headers.Add("Upgrade-Insecure-Requests", "1");
            request.Referer = "https://www.example.com/page.html";
            string strData = "";

            // Dispose the response, stream and reader so the underlying connection is released
            // even if reading the body throws.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (System.IO.Stream stream = response.GetResponseStream())
            using (System.IO.StreamReader reader = new System.IO.StreamReader(stream, System.Text.Encoding.UTF8))
            {
                strData = reader.ReadToEnd();
            }

Here are the request headers:

Host: www.example.com
User-Agent: Mozilla/5.0 (Windows NT 6.3; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
Accept-Language: tr-TR,tr;q=0.8,en-US;q=0.5,en;q=0.3
Accept-Encoding: gzip, deflate, br
Referer: https://www.example.com/mgrp115.html
Cookie: visid_incap_969915=n/UA1sPWSRqcLHS8izlZl/vJOlgAAAAAQkIPAAAAAACAbNh4AS7Fy71tyrvY4hm5/8klCVy0ZPw6; last_domain_id=26; __utma=185813676.385095112.1480247807.1481740765.1481816400.14; __utmz=185813676.1480247807.1.1.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); incap_ses_473_969915=lZEeZH8zLDzHexwQH2+QBka5UlgAAAAAiNkwWJCksFcH1rQFP4yccA==; GAMBLINGSESS=mii1g4hdedjjimpatgd1p93gld3b5h8l; nlbi_969915=3CsSHBl0mTjavKlP18U7bQAAAADGlJZO8Hu2ocuraCIlqUwK; __utmb=185813676.16.10.1481816400; __utmc=185813676; docscrollltop=0; live_box_sport_status1=true; __utmt=1
Connection: keep-alive
Upgrade-Insecure-Requests: 1

Here are the response headers:

Cache-Control: no-store, no-cache, must-revalidate, post-check=0, pre-check=0
Content-Encoding: gzip
Content-Length: 12889
Content-Type: text/html; charset=UTF-8
Date: Thu, 15 Dec 2016 17:09:44 GMT
Expires: Thu, 19 Nov 1981 08:52:00 GMT
Last-Modified: Thu, 15 Dec 2016 17:09:44 GMT
Pragma: no-cache
Server: Apache/2.2.22 (Linux/SUSE)
Set-Cookie: last_domain_id=26; expires=Fri, 15-Dec-2017 17:09:44 GMT; path=/; domain=.example.com
Vary: Accept-Encoding
X-Cdn: Incapsula
X-Firefox-Spdy: h2
x-iinfo: 8-44586852-44579876 PNNN RT(1481821783171 0) q(0 0 0 -1) r(1 1) U2

Can you help me with this problem?

Thanks in advance.


Solution

  • You're being blocked by Incapsula, which checks whether the tool you use to send requests can run JavaScript. I see three options:

    1. Use a third-party tool: HtmlAgilityPack (on GitHub) with the method HtmlWeb.LoadFromBrowser, or this other project on GitHub, incapsula-cracker-py3.
    2. Build your own tool using workarounds that work on your website. (This is unlikely to be something that can be done in a reply on this forum.)
    3. Scrape the data using a browser engine: send your request in a browser, save the page, and then scrape the saved page with your .NET code.