2010-08-30 26 views
0

包括

#include <strings.h>  // strcasecmp (POSIX)

#include <iostream>
#include <sstream>
#include <string>

#include <curl/curl.h>
#include <htmlcxx/html/ParserDom.h>

using namespace std;
using namespace htmlcxx;

/**
 * libcurl write callback: appends one chunk of received data to an output
 * string stream.
 *
 * @param buf   Start of the received chunk (not NUL-terminated).
 * @param size  Size in bytes of one data element.
 * @param nmemb Number of elements in the chunk.
 * @param userp Pointer to the std::ostringstream that collects the data;
 *              may be null.
 * @return The number of bytes consumed (size * nmemb), or 0 when userp is
 *         null. libcurl treats any value other than size * nmemb as a
 *         write error and aborts the transfer, so the byte count (not the
 *         element count) must be returned.
 */
static size_t http_write(void* buf, size_t size, size_t nmemb, void* userp)
{
    if (!userp)
    {
        // Nowhere to store the data: report nothing consumed.
        return 0;
    }

    const size_t total = size * nmemb;
    std::ostringstream* oss = static_cast<std::ostringstream*>(userp);
    oss->write(static_cast<const char*>(buf), static_cast<std::streamsize>(total));
    return total;
}

string get_html_page(const string& url, long timeout = 0) 
{ 
    CURL* curl = curl_easy_init(); 

    ostringstream oss; 

    curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, &http_write); 
    curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L); 
    curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); 
    curl_easy_setopt(curl, CURLOPT_FILE, &oss); 
    curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeout); 
    curl_easy_setopt(curl, CURLOPT_URL, url.c_str()); 

    curl_easy_perform(curl); 
    curl_easy_cleanup(curl); 

    return oss.str(); 
} 

int main() 
{ 
    string html = get_html_page("http://www.google.co.in"); 

    //cout << html << endl; 
    HTML::ParserDom parser; 
     tree<HTML::Node> dom = parser.parseTree(html); 

     //Print whole DOM tree 
     //cout <<dom <<endl; 

     //Dump all links in the tree 
     tree<HTML::Node>::iterator it = dom.begin(); 
     tree<HTML::Node>::iterator end = dom.end(); 
     for (; it !=end; ++it) 
     { 
     if (strcasecmp(it->tagName().c_str(), "A") == 0) 
     { 
      it->parseAttributes(); 
      //cout << it->attribute("href").second << endl; 
     } 
     } 

     //Dump all text of the document 
     it = dom.begin(); 
     end = dom.end(); 
     for (; it != end; ++it) 
     { 
     if ((!it->isTag()) && (!it->isComment())) 
     { 
      cout << it->text(); 
     } 
     } 
    // cout << endl; 
    return 0; 
} 

我用這段代碼從HTML頁面中單獨提取文本,但JavaScript代碼也一併被提取出來了。我的代碼有什麼錯誤嗎?我無法只提取文本。

OUTPUT:

Googlewindow.google={kEI:"0a97TLvcFMS7rAe5htz9Ag",kEXPI:"25901,26119,26325",kCSI:{e:"25901,26119,26325",ei:"0a97TLvcFMS7rAe5htz9Ag",expi:"25901,26119,26325"},ml:function(){},kHL:"en",time:function(){return(new Date).getTime()},log:function(b,d,c){var a=new Image,e=google,g=e.lc,f=e.li;a.onerror=(a.onload=(a.onabort=function(){delete g[f]}));g[f]=a;c=c||"/gen_204?atyp=i&ct="+b+"&cad="+d+"&zx="+google.time();a.src=c;e.li=f+1},lc:[],li:0,Toolbelt:{}}; 
window.google.sn="webhp";window.google.timers={load:{t:{start:(new Date).getTime()}}};try{}catch(u){}window.google.jsrt_kill=1; 
var _gjwl=location;function _gjuc(){var e=_gjwl.href.indexOf("#");if(e>=0){var a=_gjwl.href.substring(e);if(a.indexOf("&q=")>0||a.indexOf("#q=")>=0){a=a.substring(1);if(a.indexOf("#")==-1){for(var c=0;c<a.length;){var d=c;if(a.charAt(d)=="&")++d;var b=a.indexOf("&",d);if(b==-1)b=a.length;var f=a.substring(d,b);if(f.indexOf("fp=")==0){a=a.substring(0,c)+a.substring(b,a.length);b=c}else if(f=="cad=h")return 0;c=b}_gjwl.href="/search?"+a+"&cad=h";return 1}}}return 0}function _gjp(){!(window._gjwl.hash&& 
window._gjuc())&&setTimeout(_gjp,500)}; 
window._gjp && _gjp()body{margin:0}#gog{padding:3px 8px 0}td{line-height:.8em}.gac_m td{line-height:17px}form{margin-bottom:20px}body,td,a,p,.h{font-family:arial,sans-serif}.h{color:#36c;font-size:20px}.q{color:#00c}.ts td{padding:0}.ts{border-collapse:collapse}em{font-weight:bold;font-style:normal}.lst{width:496px}.tiah{width:458px}input{font-family:inherit}a.gb1,a.gb2,a.gb3,a.gb4{color:#11c !important}#gog{background:#fff}#gbar,#guser{font-size:13px;padding-top:1px !important}#gbar{float:left;height:22px}#guser{padding-bottom:7px !important;text-align:right}.gbh,.gbd{border-top:1px solid #c9d7f1;font-size:1px}.gbh{height:0;position:absolute;top:24px;width:100%}#gbs,.gbm{background:#fff;left:0;position:absolute;text-align:left;visibility:hidden;z-index:1000}.gbm{border:1px solid;border-color:#c9d7f1 #36C#36C#a2bae7;z-index:1001}.gb1{margin-right:.5em}.gb1,.gb3{zoom:1}.gb2{display:block;padding:.2em .5em}.gb2,.gb3{text-decoration:none;border-bottom:none}a.gb1,a.gb2,a.gb3,a.gb4{color:#00c !important}a.gb2:hover{background:#36c;color:#fff !important}body{background:#fff;color:black}input{-moz-box-sizing:content-box}a{color:#11c;text-decoration:none}a:hover,a:active{text-decoration:underline}.fl a{color:#4272db}a:visited{color:#551a8b}a.gb1,a.gb4{text-decoration:underline}a.gb3:hover{text-decoration:none}#ghead a.gb2:hover{color:#fff!important}.ds{display:-moz-inline-box}.ds{border-bottom:solid 1px #e7e7e7;border-right:solid 1px #e7e7e7;display:inline-block;margin:3px 0 4px;margin-left:4px}.sblc{padding-top:5px}.sblc a{display:block;margin:2px 0;margin-left:13px;font-size:11px;}.lsbb{background:#eee;border:solid 1px;border-color:#ccC#999 #999 #ccc;height:30px;display:block}.lsb{background:url(/images/srpr/nav_logo14.png) bottom;font:15px arial,sans-serif;border:none;color:#000;cursor:pointer;height:30px;margin:0;outline:0;vertical-align:top}.lsb:active{background:#ccc}.lst:focus{outline:none}.ftl,#fll a{margin:0 12px}#addlang a{padding:0 3px}.gac_v 
div{display:none}.gac_v .gac_v2,.gac_bt{display:block!important}google.y={};google.x=function(e,g){google.y[e.id]=[e,g];return false};window.gbar={qs:function(){},tg:function(e){var o={id:'gbar'};for(i in e)o[i]=e[i];google.x(o,function(){gbar.tg(o)})}};Web Images Maps News Orkut Books Gmail more &#9660;Translate Scholar Blogs YouTube Calendar Photos Documents Reader Sites Groups even more &raquo; iGoogle | Search settings | Sign in India&nbsp;Advanced SearchLanguage ToolsGoogle.co.in offered in: Hindi Bengali Telugu Marathi Tamil Gujarati Kannada Malayalam PunjabiAdvertising&nbsp;ProgramsAbout GoogleGo to Google.com&copy; 2010 - Privacy if(google.y)google.y.first=[];if(google.y)google.y.first=[];google.dstr=[];google.rein=[];window.setTimeout(function(){var a=document.createElement("script");a.src="/extern_js/f/CgJlbhICaW4gACswRTgBLCswWjgDLCswDjgALCswFzgHLCswJzgELCswPDgDLCswUTgDLCswCjhzQB0sKzAWOB0sKzAZOCAsKzAlOMqIASwrMDU4BCwrMEA4EiwrMEE4BSwrME44BiwrMFQ4ASwrMBg4BSwrMCY4DSyAAheQAhg/x2R96GGjycQ.js";(document.getElementById("xjsd")||document.body).appendChild(a);if(google.timers&&google.timers.load.t)google.timers.load.t.xjsls=(new Date).getTime();},0); 
;google.neegg=1;google.y.first.push(function(){var form=document.f||document.f||document.gs;google.ac.i(form,form.q,'','','',{o:1,sw:1});google.History&&google.History.initialize('/')});if(google.j&&google.j.en&&google.j.xi){window.setTimeout(google.j.xi,0);google.fade=null;}(function(){ 
var b,d,e,f;function g(a,c){if(a.removeEventListener){a.removeEventListener("load",c,false);a.removeEventListener("error",c,false)}else{a.detachEvent("onload",c);a.detachEvent("onerror",c)}}function h(a){f=(new Date).getTime();++d;a=a||window.event;var c=a.target||a.srcElement;g(c,h)}var i=document.getElementsByTagName("img");b=i.length;d=0;for(var j=0,k;j<b;++j){k=i[j];if(k.complete||typeof k.src!="string"||!k.src)++d;else if(k.addEventListener){k.addEventListener("load",h,false);k.addEventListener("error", 
h,false)}else{k.attachEvent("onload",h);k.attachEvent("onerror",h)}}e=b-d;function l(){if(!google.timers.load.t)return;google.timers.load.t.ol=(new Date).getTime();google.timers.load.t.iml=f;google.kCSI.imc=d;google.kCSI.imn=b;google.kCSI.imp=e;google.timers.load.t.xjs&&google.report&&google.report(google.timers.load,google.kCSI)}if(window.addEventListener)window.addEventListener("load",l,false);else if(window.attachEvent)window.attachEvent("onload",l);google.timers.load.t.prt=(f=(new Date).getTime()); 
})(); 

回答

5

這是預期的行爲(我是該函式庫的作者之一)。如果您不希望打印JavaScript內容,則需要在代碼中跳過JavaScript(`<script>`)標籤。您可以簡單地將以下內容添加爲for循環的第一行。

// Skip <script> elements together with their child text node (the JavaScript
// source): skip_children() makes the loop's ++it jump over the subtree.
if (it->isTag() && strcasecmp(it->tagName().c_str(), "script") == 0) { it.skip_children(); continue; }