2016-12-15 29 views
1

我正在使用WebBrowser控件抓取網站，但它耗時太長，而我只需要讀取並獲得頁面的源代碼，因此我決定改用HttpWebRequest。第一次請求時我能拿到想要的元素，但之後就抓取不到了。我抓取的網站有SSL保護，當我嘗試抓取時，得到的是以下內容。請問抓取時如何繞過Incapsula？

<html> 
<head> 
<META NAME="robots" CONTENT="noindex,nofollow"> 
<script> 
// Incapsula browser-fingerprint challenge (minified, shown as received from the server).
// What the visible code does:
//   - getSessionCookies(): splits document.cookie on ";" and collects the values of
//     cookies whose name matches /^\s?incap_ses_/.
//   - simpleDigest(str): returns the sum of the UTF-16 char codes of str.
//   - createCookie(name, value, seconds): writes a cookie with path=/ that expires
//     `seconds` seconds from now.
//   - test(o): for each [expression, kind] pair, evaluates the expression with eval()
//     and records an encodeURIComponent-encoded "expression=result" entry.
//     "exists" records true/false, "value" records the stringified value, and
//     "plugin_extentions" records the distinct file extensions of navigator.plugins.
//     The entries are returned joined with commas.
//   - setIncapCookie(v): stores v + ",digest=" + simpleDigest(v + cookie) for every
//     session cookie in a cookie named "___utmvc" with a 20-second lifetime.
//   - Finally an <img> request is made to /_Incapsula_Resource?SWKMTFSR=1.
// A client that does not execute this script never sets "___utmvc".
// NOTE(review): `navigator.plugins[i].filename var ext=` below has no statement
// separator, so the text as shown does not parse; presumably a line break was lost
// when the page was captured -- confirm against the raw response.
(function(){function getSessionCookies(){var cookieArray=new Array();var cName=/^\s?incap_ses_/;var c=document.cookie.split(";");for(var i=0;i<c.length;i++){var key=c[i].substr(0,c[i].indexOf("="));var value=c[i].substr(c[i].indexOf("=")+1,c[i].length);if(cName.test(key)){cookieArray[cookieArray.length]=value}}return cookieArray}function setIncapCookie(vArray){var res;try{var cookies=getSessionCookies();var digests=new Array(cookies.length);for(var i=0;i<cookies.length;i++){digests[i]=simpleDigest((vArray)+cookies[i])}res=vArray+",digest="+(digests.join())}catch(e){res=vArray+",digest="+(encodeURIComponent(e.toString()))}createCookie("___utmvc",res,20)}function simpleDigest(mystr){var res=0;for(var i=0;i<mystr.length;i++){res+=mystr.charCodeAt(i)}return res}function createCookie(name,value,seconds){var expires="";if(seconds){var date=new Date();date.setTime(date.getTime()+(seconds*1000));var expires="; expires="+date.toGMTString()}document.cookie=name+"="+value+expires+"; path=/"}function test(o){var res="";var vArray=new Array();for(var j=0;j<o.length;j++){var test=o[j][0];switch(o[j][1]){case"exists":try{if(typeof(eval(test))!="undefined"){vArray[vArray.length]=encodeURIComponent(test+"=true")}else{vArray[vArray.length]=encodeURIComponent(test+"=false")}}catch(e){vArray[vArray.length]=encodeURIComponent(test+"=false")}break;case"value":try{try{res=eval(test);if(typeof(res)==="undefined"){vArray[vArray.length]=encodeURIComponent(test+"=undefined")}else if(res===null){vArray[vArray.length]=encodeURIComponent(test+"=null")}else{vArray[vArray.length]=encodeURIComponent(test+"="+res.toString())}}catch(e){vArray[vArray.length]=encodeURIComponent(test+"=cannot evaluate");break}break}catch(e){vArray[vArray.length]=encodeURIComponent(test+"="+e)}case"plugin_extentions":try{var extentions=[];try{i=extentions.indexOf("i")}catch(e){vArray[vArray.length]=encodeURIComponent("plugin_ext=indexOf is not a function");break}try{var num=navigator.plugins.length 
if(num==0||num==null){vArray[vArray.length]=encodeURIComponent("plugin_ext=no plugins");break}}catch(e){vArray[vArray.length]=encodeURIComponent("plugin_ext=cannot evaluate");break}for(var i=0;i<navigator.plugins.length;i++){if(typeof(navigator.plugins[i])=="undefined"){vArray[vArray.length]=encodeURIComponent("plugin_ext=plugins[i] is undefined");break}var filename=navigator.plugins[i].filename var ext="no extention";if(typeof(filename)=="undefined"){ext="filename is undefined"}else if(filename.split(".").length>1){ext=filename.split('.').pop()}if(extentions.indexOf(ext)<0){extentions.push(ext)}}for(i=0;i<extentions.length;i++){vArray[vArray.length]=encodeURIComponent("plugin_ext="+extentions[i])}}catch(e){vArray[vArray.length]=encodeURIComponent("plugin_ext="+e)}break}}vArray=vArray.join();return vArray}var o=[["navigator","exists"],["navigator.vendor","value"],["navigator.appName","value"],["navigator.plugins.length==0","value"],["navigator.platform","value"],["navigator.webdriver","value"],["platform","plugin_extentions"],["ActiveXObject","exists"],["webkitURL","exists"],["_phantom","exists"],["callPhantom","exists"],["chrome","exists"],["yandex","exists"],["opera","exists"],["opr","exists"],["safari","exists"],["awesomium","exists"],["puffinDevice","exists"],["navigator.cpuClass","exists"],["navigator.oscpu","exists"],["navigator.connection","exists"],["window.outerWidth==0","value"],["window.outerHeight==0","value"],["window.WebGLRenderingContext","exists"],["document.documentMode","value"],["eval.toString().length","value"]];try{setIncapCookie(test(o));document.createElement("img").src="/_Incapsula_Resource?SWKMTFSR=1&e="+Math.random()}catch(e){img=document.createElement("img");img.src="/_Incapsula_Resource?SWKMTFSR=1&e="+e}})(); 
</script> 
<script> 
// Obfuscated loader (shown as received from the server).
// `b` is JavaScript source encoded as hexadecimal byte pairs. The loop converts each
// pair to its decimal char code and joins them with commas in `z`; the inner eval()
// builds the source string via String.fromCharCode(...) and the outer eval() runs it.
// Decoding `b` by hand gives a script that:
//   - opens a synchronous XMLHttpRequest (ActiveXObject fallback) with
//     GET /_Incapsula_Resource?SWHANEDL=...;
//   - on readyState 4 with HTTP status 200 calls parent.location.reload();
//   - on window unload, or on any exception, reports status and timing through an
//     <img> request to /_Incapsula_Resource?ES2LURCT=67&t=78&d=...
// NOTE(review): the hex literal is split across two lines here, which a JavaScript
// string literal cannot be; presumably line wrapping introduced when the page was
// captured -- confirm against the raw response.
(function() { 
var z="";var b="7472797B766172207868723B76617220743D6E6577204461746528292E67657454696D6528293B766172207374617475733D227374617274223B7661722074696D696E673D6E65772041727261792833293B77696E646F772E6F6E756E6C6F61643D66756E6374696F6E28297B74696D696E675B325D3D22723A222B286E6577204461746528292E67657454696D6528292D74293B646F63756D656E742E637265617465456C656D656E742822696D6722292E7372633D222F5F496E63617073756C615F5265736F757263653F4553324C555243543D363726743D373826643D222B656E636F6465555249436F6D706F6E656E74287374617475732B222028222B74696D696E672E6A6F696E28292B222922297D3B69662877696E646F772E584D4C4874747052657175657374297B7868723D6E657720584D4C48747470526571756573747D656C73657B7868723D6E657720416374697665584F626A65637428224D6963726F736F66742E584D4C4854545022297D7868722E6F6E726561647973746174656368616E67653D66756E6374696F6E28297B737769746368287868722E72656164795374617465297B6361736520303A7374617475733D6E6577204461746528292E67657454696D6528292D742B223A2072657175657374206E6F7420696E697469616C697A656420223B627265616B3B6361736520313A7374617475733D6E6577204461746528292E67657454696D6528292D742B223A2073657276657220636F6E6E656374696F6E2065737461626C6973686564223B627265616B3B6361736520323A7374617475733D6E6577204461746528292E67657454696D6528292D742B223A2072657175657374207265636569766564223B627265616B3B6361736520333A7374617475733D6E6577204461746528292E67657454696D6528292D742B223A2070726F63657373696E672072657175657374223B627265616B3B6361736520343A7374617475733D22636F6D706C657465223B74696D696E675B315D3D22633A222B286E6577204461746528292E67657454696D6528292D74293B6966287868722E7374617475733D3D323030297B706172656E742E6C6F636174696F6E2E72656C6F616428297D627265616B7D7D3B74696D696E675B305D3D22733A222B286E6577204461746528292E67657454696D6528292D74293B7868722E6F70656E2822474554222C222F5F496E63617073756C615F5265736F757263653F535748414E45444C3D3234303839313831303338343234373432322C31343334313032313035373436353238383130322C31343539333639393136373936303235393635352C313935393639222C66616C7365293B7868
722E73656E64286E756C6C297D63617463682863297B7374617475732B3D6E6577204461746528292E67657454696D6528292D742B2220696E6361705F6578633A20222B633B646F63756D656E742E637265617465456C656D656E742822696D6722292E7372633D222F5F496E63617073756C615F5265736F757263653F4553324C555243543D363726743D373826643D222B656E636F6465555249436F6D706F6E656E74287374617475732B222028222B74696D696E672E6A6F696E28292B222922297D3B";for (var i=0;i<b.length;i+=2){z=z+parseInt(b.substring(i, i+2), 16)+",";}z = z.substring(0,z.length-1); eval(eval('String.fromCharCode('+z+')'));})(); 
</script></head> 
<body> 
<iframe style="display:none;visibility:hidden;" src="//content.incapsula.com/jsTest.html" id="gaIframe"></iframe> 
</body></html> 

這是我的代碼：

  // Downloads the page at `address` with HttpWebRequest and leaves the decoded HTML
  // in `strData`.
  //
  // WARNING(review): this callback accepts every TLS certificate, which removes
  // protection against man-in-the-middle attacks. It is kept only because the
  // original snippet relied on it; delete it if the site's certificate validates.
  ServicePointManager.ServerCertificateValidationCallback += (sender, certificate, chain, sslPolicyErrors) => true;

  string address = "https://www.example.com";

  // Reuse this ONE container for every request in the scraping session so that the
  // cookies the server sets (e.g. visid_incap_*, incap_ses_*) are sent back on the
  // next request. A fresh container per request discards them.
  CookieContainer cookies = new CookieContainer();

  HttpWebRequest request = (HttpWebRequest)WebRequest.Create(address);
  //request.Proxy = WebProxy.GetDefaultProxy();
  request.CookieContainer = cookies;

  request.UserAgent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.2.12) Gecko/20101026 Firefox/3";

  // Each header is set once, through its own property or header name. The original
  // assigned all three values to request.Accept (with the header name embedded in
  // the value), so only the last assignment was sent, as a malformed Accept header.
  request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
  request.Headers.Add("Accept-Language", "tr-TR,tr;q=0.8,en-US;q=0.5,en;q=0.3");

  // Sends "Accept-Encoding: gzip, deflate" and transparently decompresses the body.
  // "br" is deliberately not advertised: HttpWebRequest cannot decode Brotli, so a
  // Brotli response would be read as garbage.
  request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

  request.Headers.Add("Upgrade-Insecure-Requests", "1");
  request.Referer = "https://www.example.com/page.html";

  string strData = "";

  // Dispose response, stream and reader so the connection returns to the pool;
  // leaked responses eventually make later GetResponse() calls hang.
  using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
  using (System.IO.Stream stream = response.GetResponseStream())
  using (System.IO.StreamReader reader = new System.IO.StreamReader(stream, System.Text.Encoding.UTF8))
  {
      strData = reader.ReadToEnd();
  }

  // NOTE: if strData still contains the Incapsula challenge page, headers alone are
  // not enough: the challenge script sets a "___utmvc" cookie via JavaScript, which
  // HttpWebRequest cannot execute.

這是出錯時的請求頭：

Host: www.example.com 
User-Agent: Mozilla/5.0 (Windows NT 6.3; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0 
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8 
Accept-Language: tr-TR,tr;q=0.8,en-US;q=0.5,en;q=0.3 
Accept-Encoding: gzip, deflate, br 
Referer: https://www.example.com/mgrp115.html 
Cookie: visid_incap_969915=n/UA1sPWSRqcLHS8izlZl/vJOlgAAAAAQkIPAAAAAACAbNh4AS7Fy71tyrvY4hm5/8klCVy0ZPw6; last_domain_id=26; __utma=185813676.385095112.1480247807.1481740765.1481816400.14; __utmz=185813676.1480247807.1.1.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); incap_ses_473_969915=lZEeZH8zLDzHexwQH2+QBka5UlgAAAAAiNkwWJCksFcH1rQFP4yccA==; GAMBLINGSESS=mii1g4hdedjjimpatgd1p93gld3b5h8l; nlbi_969915=3CsSHBl0mTjavKlP18U7bQAAAADGlJZO8Hu2ocuraCIlqUwK; __utmb=185813676.16.10.1481816400; __utmc=185813676; docscrollltop=0; live_box_sport_status1=true; __utmt=1 
Connection: keep-alive 
Upgrade-Insecure-Requests: 1 

這是響應頭：

Cache-Control: no-store, no-cache, must-revalidate, post-check=0, pre-check=0 
Content-Encoding: gzip 
Content-Length: 12889 
Content-Type: text/html; charset=UTF-8 
Date: Thu, 15 Dec 2016 17:09:44 GMT 
Expires: Thu, 19 Nov 1981 08:52:00 GMT 
Last-Modified: Thu, 15 Dec 2016 17:09:44 GMT 
Pragma: no-cache 
Server: Apache/2.2.22 (Linux/SUSE) 
Set-Cookie: last_domain_id=26; expires=Fri, 15-Dec-2017 17:09:44 GMT; path=/; domain=.example.com 
Vary: Accept-Encoding 
X-Cdn: Incapsula 
X-Firefox-Spdy: h2 
x-iinfo: 8-44586852-44579876 PNNN RT(1481821783171 0) q(0 0 0 -1) r(1 1) U2 

你能幫助解決這個問題嗎?

在此先感謝。

+0

@AlfieGoodacre當然。 – Quicksilver

+0

這只是個玩笑，指的是這個問題不必要地冗長這一事實，請看一下[this](http://stackoverflow.com/help/mcve) –

+0

您的第一個代碼片段長達6500多個字符要我們查看，其中約3700個字符是可能真正導致問題的JavaScript函數。 –

回答

1

你被Incapsula攔截了，它會檢查你用來發送請求的工具是否能夠運行JavaScript。我看到三個選項：

  1. 使用第三方工具：GitHub上的HtmlAgilityPack（使用HtmlWeb.LoadFromBrowser方法），或者GitHub上的這個項目：incapsula-cracker-py3
  2. 自己構建一個工具，採用對您的目標網站有效的變通辦法。（這不太可能在本論壇的回覆中給出）
  3. 使用瀏覽器引擎抓取數據：在瀏覽器中發送請求，保存頁面，然後用您現有的.NET抓取代碼處理保存的頁面
+0

雖然這個鏈接可能回答這個問題,但最好在這裏包含答案的重要部分並提供參考鏈接。如果鏈接頁面更改,則僅鏈接答案可能會失效。 - [來自評論](/ review/low-quality-posts/17642403) – TryingToImprove

+0

謝謝。我試圖變得更有建設性 – sofsntp

+0

@sofsntp我可以在私人信息中問你一個問題,或者我可以怎樣與你聯繫,先生?順便謝謝你的回答 – Quicksilver