2012-10-26 209 views
9
// Fetches the www.nodejs.org front page and, for every chunk of the response
// body, logs the result of matching a <title> regular expression against it.
var http = require('http'); 
var urlOpts = {host: 'www.nodejs.org', path: '/', port: '80'}; 
http.get(urlOpts, function (response) { 
// 'data' fires once per received chunk, so the matching below runs on each
// chunk separately rather than on the complete page.
response.on('data', function (chunk) { 
var str=chunk.toString(); 
// NOTE: inside a string literal "\s" and "\>" are not recognised escape
// sequences, so the backslashes are dropped and the pattern is compiled with
// a literal "s" wherever whitespace (\s) was intended.
var re = new RegExp("(<\s*title[^>]*>(.+?)<\s*/\s*title)\>", "g") 
// With the "g" flag, match() returns an array of the whole matched substrings
// (capture groups are not included), or null when this chunk has no match.
console.log(str.match(re)); 
}); 

}); 

輸出從刮網頁獲取頁面標題

[email protected] ~ $ node app.js [ '<title>node.js</title>' ] null null

我只需要取得標題文字。

回答

7

我建議使用 RegExp.exec 代替 String.match。您也可以改用字面量語法來定義正則表達式,而且只需定義一次:

// Fetches the www.nodejs.org front page and prints the text of its <title>
// element as soon as a chunk containing the complete element arrives.
var http = require('http'); 
var urlOpts = {host: 'www.nodejs.org', path: '/', port: '80'}; 
// Compiled once and reused for every chunk; capture group 2 is the title text.
var re = /(<\s*title[^>]*>(.+?)<\s*\/\s*title)>/gi; 
http.get(urlOpts, function (response) { 
    // Each chunk is searched on its own, so a title that is split across two
    // chunks is still not detected here.
    response.on('data', function (chunk) { 
        var str = chunk.toString(); 
        // A regex with the "g" flag remembers lastIndex between exec() calls.
        // Reset it so every chunk is searched from its start instead of from
        // the offset where the previous chunk's match ended.
        re.lastIndex = 0; 
        var match = re.exec(str); 
        if (match && match[2]) { 
            console.log(match[2]); 
        } 
    }); 
}).on('error', function (err) { 
    // Without a listener, a DNS or connection failure is emitted as an
    // unhandled 'error' event and terminates the process.
    console.error('Request failed: ' + err.message); 
}); 

這段代碼還假定 title 會完整地出現在同一個數據塊中,而不會被拆分到兩個塊之間。比較穩妥的做法是把收到的各個塊累積起來,以防 title 被拆分到不同的塊中。另外,一旦找到 title,您可能也會想停止繼續查找。

+0

@argonius在他的示例中有一個好處,那就是除了'g'之外,您應該也可以使用'i'標誌來使正則表達式不區分大小寫(因爲''標籤的外殼不是'保證是小寫的,特別是如果文檔不是XHTML)。當使用JavaScript設置標題時, – <span class="text-secondary"> <small> <a rel="noopener" target="_blank" href="https://stackoverflow.com/users/2688/">bdukes</a></span> <span></span> </small> </span> </p> </div> </div> </div> <div itemprop="comment" class="post-comment"> <div class="row"> <div class="col-lg-1"><span class="text-secondary">+0</span></div> <div class="col-lg-11"> <p class="commenttext">不適用於SPA網頁。你應該使用谷歌瀏覽器之類的無頭瀏覽器 – <span class="text-secondary"> <small> <span></span> </small> </span> </p> </div> </div> </div> </div> </div> </article> <div> <script async src="https://pagead2.googlesyndication.com/pagead/js/adsbygoogle.js"></script> <ins class="adsbygoogle" style="display:block" data-ad-client="ca-pub-6208739752673518" data-ad-slot="1038284119" data-ad-format="auto" data-full-width-responsive="true"></ins> <script> (adsbygoogle = window.adsbygoogle || []).push({}); </script> </div> <article class="board-top-1 padding-top-10"> <div class="post-col vote-info"> <span class="count">2<i class="fa fa-thumbs-up"></i></span> </div> <div class="post-offset"> <div class="answer fmt"> <p>試試這個:</p> <pre><code class="prettyprint-override">var re = new RegExp("<title>(.*?)</title>", "i"); console.log(str.match(re)[1]); </code></pre> </div> <div class="post-info"> <div class="post-meta row"> <p class="text-secondary col-lg-6"> <span class="source"> <a rel="noopener" target="_blank" href="https://stackoverflow.com/q/13088064">來源</a> </span> </p> <p class="text-secondary col-lg-6"> <span class="float-right date"> <span>2012-10-26 13:40:43</span> <a rel="noopener" target="_blank" href="https://stackoverflow.com/users/915320/">gradosevic</a></span> </p> <p class="col-12"></p> <p class="col-12"></p></div> </div> <!-- comments --> <div class="comments"> <div itemprop="comment" class="post-comment"> <div class="row"> <div class="col-lg-1"><span class="text-secondary">+0</span></div> <div 
class="col-lg-11"> <p class="commenttext">'E:\╨а╨░╨▒╨╛╤З╨╕╨╣╤Б╤В╨╛╨╗\ dev的\ app.js:7 的console.log(str.match(RE)[1 ]); ^ 類型錯誤:無法在IncomingMessage看空 的特性「1」。 <anonymous>(E:\╨а╨░╨▒╨╛╤З╨╕╨╣╤Б╤В╨╛╨╗\ dev的\ app.js:7:26) 在IncomingMessage.EventEmitter.emit(events.js: (http.js:359:10) at HTTPParser.parserOnBody [as onBody](http.js:123:21) at Socket.socketOnData [as ondata](http.js: 1367:20) at TCP.onread(net.js:403:27)' – <span class="text-secondary"> <small> <a rel="noopener" target="_blank" href="https://stackoverflow.com/users/1777212/">user1777212</a></span> <span></span> </small> </span> </p> </div> </div> </div> <div itemprop="comment" class="post-comment"> <div class="row"> <div class="col-lg-1"><span class="text-secondary">+0</span></div> <div class="col-lg-11"> <p class="commenttext">爲我工作,謝謝! – <span class="text-secondary"> <small> <a rel="noopener" target="_blank" href="https://stackoverflow.com/users/2162226/">gnB</a></span> <span></span> </small> </span> </p> </div> </div> </div> </div> </div> </article> </div> <div class="clearfix"> </div> <div class="relative-box"> <div class="relative">相關問題</div> <ul class="relative_list"> <li> 1. <a href="http://hk.uwenku.com/question/p-dhpdqvai-vu.html" target="_blank" title="從前一頁獲取頁面標題"> 從前一頁獲取頁面標題 </a> </li> <li> 2. <a href="http://hk.uwenku.com/question/p-pgzmocqy-px.html" target="_blank" title="從網頁上刮取網頁數據"> 從網頁上刮取網頁數據 </a> </li> <li> 3. <a href="http://hk.uwenku.com/question/p-nglkeybq-xg.html" target="_blank" title="rvest從網頁的html頁面刮"> rvest從網頁的html頁面刮 </a> </li> <li> 4. <a href="http://hk.uwenku.com/question/p-gqxprkzi-co.html" target="_blank" title="從網頁刮取HTML? - VB.NET"> 從網頁刮取HTML? - VB.NET </a> </li> <li> 5. <a href="http://hk.uwenku.com/question/p-hhmnloei-bk.html" target="_blank" title="如何獲取網頁的網站名稱和頁面標題"> 如何獲取網頁的網站名稱和頁面標題 </a> </li> <li> 6. <a href="http://hk.uwenku.com/question/p-ussrvyor-bnd.html" target="_blank" title="獲取頁面標題"> 獲取頁面標題 </a> </li> <li> 7. 
<a href="http://hk.uwenku.com/question/p-qluelxqq-cm.html" target="_blank" title="從頁面表中獲取標題"> 從頁面表中獲取標題 </a> </li> <li> 8. <a href="http://hk.uwenku.com/question/p-oueyuteh-hp.html" target="_blank" title="從redirrected頁面獲取標題信息"> 從redirrected頁面獲取標題信息 </a> </li> <li> 9. <a href="http://hk.uwenku.com/question/p-wklczvam-nm.html" target="_blank" title="從特定頁面獲取父頁面標題名稱ID​​號"> 從特定頁面獲取父頁面標題名稱ID​​號 </a> </li> <li> 10. <a href="http://hk.uwenku.com/question/p-nwxalzla-bka.html" target="_blank" title="如何從Google表格中的網址獲取頁面標題?"> 如何從Google表格中的網址獲取頁面標題? </a> </li> <div> <script async src="https://pagead2.googlesyndication.com/pagead/js/adsbygoogle.js"></script> <ins class="adsbygoogle" style="display:block; text-align:center;" data-ad-layout="in-article" data-ad-format="fluid" data-ad-client="ca-pub-6208739752673518" data-ad-slot="4606349252"></ins> <script> (adsbygoogle = window.adsbygoogle || []).push({}); </script> </div> <li> 11. <a href="http://hk.uwenku.com/question/p-qkzcdmkv-wr.html" target="_blank" title="刮內容從網站頁面"> 刮內容從網站頁面 </a> </li> <li> 12. <a href="http://hk.uwenku.com/question/p-oiffthhj-gr.html" target="_blank" title="從網頁從刮HTML"> 從網頁從刮HTML </a> </li> <li> 13. <a href="http://hk.uwenku.com/question/p-ammeckch-bgk.html" target="_blank" title="刮網頁的問題"> 刮網頁的問題 </a> </li> <li> 14. <a href="http://hk.uwenku.com/question/p-bvhhuiks-bmx.html" target="_blank" title="網頁刮傷問題"> 網頁刮傷問題 </a> </li> <li> 15. <a href="http://hk.uwenku.com/question/p-djqifszl-rd.html" target="_blank" title="問題與CSV格式從網頁刮"> 問題與CSV格式從網頁刮 </a> </li> <li> 16. <a href="http://hk.uwenku.com/question/p-wwxapypp-bdm.html" target="_blank" title="獲取Facebook頁面ID(刮擦)"> 獲取Facebook頁面ID(刮擦) </a> </li> <li> 17. <a href="http://hk.uwenku.com/question/p-bmdnjduz-pd.html" target="_blank" title="php從simplehtmldom獲取錯誤,當試圖獲取網頁的下一頁刮"> php從simplehtmldom獲取錯誤,當試圖獲取網頁的下一頁刮 </a> </li> <li> 18. <a href="http://hk.uwenku.com/question/p-bqmwyoka-baq.html" target="_blank" title="PHP從表HTML標記網頁刮"> PHP從表HTML標記網頁刮 </a> </li> <li> 19. 
<a href="http://hk.uwenku.com/question/p-cubynlaj-xp.html" target="_blank" title="從網頁表中刮取值"> 從網頁表中刮取值 </a> </li> <li> 20. <a href="http://hk.uwenku.com/question/p-xkgtfzvj-vz.html" target="_blank" title="從網頁刮取數字值?"> 從網頁刮取數字值? </a> </li> <li> 21. <a href="http://hk.uwenku.com/question/p-gczmhvna-bmd.html" target="_blank" title="使用C從網頁中刮取JSON#"> 使用C從網頁中刮取JSON# </a> </li> <li> 22. <a href="http://hk.uwenku.com/question/p-ffursxtu-nw.html" target="_blank" title="刮屏|網頁抓取"> 刮屏|網頁抓取 </a> </li> <li> 23. <a href="http://hk.uwenku.com/question/p-zilbinfl-pn.html" target="_blank" title="用jsoup從頁面中刮取文本"> 用jsoup從頁面中刮取文本 </a> </li> <li> 24. <a href="http://hk.uwenku.com/question/p-ttgualpd-beo.html" target="_blank" title="使用JavaScript從R刮取頁面"> 使用JavaScript從R刮取頁面 </a> </li> <li> 25. <a href="http://hk.uwenku.com/question/p-xzzdzjvw-bdc.html" target="_blank" title="Android:從網址獲取「標題」而無需獲取整個頁面"> Android:從網址獲取「標題」而無需獲取整個頁面 </a> </li> <li> 26. <a href="http://hk.uwenku.com/question/p-abjgoiyk-xd.html" target="_blank" title="從網頁上刮 - python"> 從網頁上刮 - python </a> </li> <li> 27. <a href="http://hk.uwenku.com/question/p-uuvplhgx-bgg.html" target="_blank" title="從網頁上刮信息"> 從網頁上刮信息 </a> </li> <li> 28. <a href="http://hk.uwenku.com/question/p-gsmahhba-ks.html" target="_blank" title="從單獨的[EXTERNAL]頁面(使用Javascript?)刮取/獲取IMG Src"> 從單獨的[EXTERNAL]頁面(使用Javascript?)刮取/獲取IMG Src </a> </li> <li> 29. <a href="http://hk.uwenku.com/question/p-bitafhbs-bbc.html" target="_blank" title="刮網頁"> 刮網頁 </a> </li> <li> 30. 
<a href="http://hk.uwenku.com/question/p-xmwlusoo-bge.html" target="_blank" title="刮網頁"> 刮網頁 </a> </li> </ul> </div> <div> <script async src="https://pagead2.googlesyndication.com/pagead/js/adsbygoogle.js"></script> <ins class="adsbygoogle" style="display:block" data-ad-format="autorelaxed" data-ad-client="ca-pub-6208739752673518" data-ad-slot="1575177025"></ins> <script> (adsbygoogle = window.adsbygoogle || []).push({}); </script> </div> <div class="padding-top-10"></div> </div> </div> <script type="text/javascript" src="http://img.uwenku.com/uwenku/script/side.js?t=1644592048261"></script> <script type="text/javascript" src="http://img.uwenku.com/uwenku/plugin/highlight/highlight.pack.js"></script> <link href="http://img.uwenku.com/uwenku/plugin/highlight/styles/docco.css" media="screen" rel="stylesheet" type="text/css" /> <script type="text/javascript"> $('pre').each(function(i, e) { hljs.highlightBlock(e, "<span class='indent'> </span>", false) }); </script> <div class="col-lg-3 col-md-4 col-sm-5"> <div id="rightTop"> <div class="row"> <script async src="https://pagead2.googlesyndication.com/pagead/js/adsbygoogle.js"></script> <ins class="adsbygoogle" style="display:block" data-ad-client="ca-pub-6208739752673518" data-ad-slot="5415218910" data-ad-format="auto" data-full-width-responsive="true"></ins> <script> (adsbygoogle = window.adsbygoogle || []).push({}); </script> </div> <div class="row sidebar panel panel-default"> <div class="panel-heading font-bold"> 最新問題 </div> <div class="m-b-sm m-t-sm clearfix"> <ul class="side_article_list"> <li class="side_article_list_item"> 1. <a href="http://hk.uwenku.com/question/p-yiaqahpl-vx.html" target="_blank" title="如何對目錄中的單個文件進行相同的更改?"> 如何對目錄中的單個文件進行相同的更改? </a> </li> <li class="side_article_list_item"> 2. <a href="http://hk.uwenku.com/question/p-sptrwqoy-uo.html" target="_blank" title="VB如何限制登錄系統上用來註冊的信息?"> VB如何限制登錄系統上用來註冊的信息? </a> </li> <li class="side_article_list_item"> 3. 
<a href="http://hk.uwenku.com/question/p-erzjnrpy-uu.html" target="_blank" title="關於應用於CCHESS的算法的困惑"> 關於應用於CCHESS的算法的困惑 </a> </li> <li class="side_article_list_item"> 4. <a href="http://hk.uwenku.com/question/p-pkwqbdaz-tw.html" target="_blank" title="春季"> 春季 </a> </li> <li class="side_article_list_item"> 5. <a href="http://hk.uwenku.com/question/p-vtphzeru-ue.html" target="_blank" title="添加touchEvents進行導航"> 添加touchEvents進行導航 </a> </li> <li class="side_article_list_item"> 6. <a href="http://hk.uwenku.com/question/p-wjxlkzjz-vh.html" target="_blank" title="如何創建列描述(CD)文件Catboost"> 如何創建列描述(CD)文件Catboost </a> </li> <li class="side_article_list_item"> 7. <a href="http://hk.uwenku.com/question/p-zqtjksso-vq.html" target="_blank" title="如何打印二叉樹?"> 如何打印二叉樹? </a> </li> <li class="side_article_list_item"> 8. <a href="http://hk.uwenku.com/question/p-ufavsjen-va.html" target="_blank" title="從數列與百分比的餅圖創建爲標籤"> 從數列與百分比的餅圖創建爲標籤 </a> </li> <li class="side_article_list_item"> 9. <a href="http://hk.uwenku.com/question/p-czfslifo-ss.html" target="_blank" title="爲什麼jQuery懸停效果不起作用?"> 爲什麼jQuery懸停效果不起作用? </a> </li> <li class="side_article_list_item"> 10. <a href="http://hk.uwenku.com/question/p-ahvkxzum-tb.html" target="_blank" title="get-webbinding-like IIS:\ Sites \ $變量"> get-webbinding-like IIS:\ Sites \ $變量 </a> </li> </ul> </div> </div> </div> <p class="article-nav-bar"></p> <div class="row sidebar article-nav"> <div class="row box_white visible-sm visible-md visible-lg margin-zero"> <div class="top"> <h3 class="title"><i class="glyphicon glyphicon-th-list"></i> 相關問題</h3> </div> <div class="article-relative-content"> <ul class="side_article_list"> <li class="side_article_list_item"> 1. <a href="http://hk.uwenku.com/question/p-dhpdqvai-vu.html" target="_blank" title="從前一頁獲取頁面標題"> 從前一頁獲取頁面標題 </a> </li> <li class="side_article_list_item"> 2. 
<a href="http://hk.uwenku.com/question/p-pgzmocqy-px.html" target="_blank" title="從網頁上刮取網頁數據"> 從網頁上刮取網頁數據 </a> </li> <li class="side_article_list_item"> 3. <a href="http://hk.uwenku.com/question/p-nglkeybq-xg.html" target="_blank" title="rvest從網頁的html頁面刮"> rvest從網頁的html頁面刮 </a> </li> <li class="side_article_list_item"> 4. <a href="http://hk.uwenku.com/question/p-gqxprkzi-co.html" target="_blank" title="從網頁刮取HTML? - VB.NET"> 從網頁刮取HTML? - VB.NET </a> </li> <li class="side_article_list_item"> 5. <a href="http://hk.uwenku.com/question/p-hhmnloei-bk.html" target="_blank" title="如何獲取網頁的網站名稱和頁面標題"> 如何獲取網頁的網站名稱和頁面標題 </a> </li> <li class="side_article_list_item"> 6. <a href="http://hk.uwenku.com/question/p-ussrvyor-bnd.html" target="_blank" title="獲取頁面標題"> 獲取頁面標題 </a> </li> <li class="side_article_list_item"> 7. <a href="http://hk.uwenku.com/question/p-qluelxqq-cm.html" target="_blank" title="從頁面表中獲取標題"> 從頁面表中獲取標題 </a> </li> <li class="side_article_list_item"> 8. <a href="http://hk.uwenku.com/question/p-oueyuteh-hp.html" target="_blank" title="從redirrected頁面獲取標題信息"> 從redirrected頁面獲取標題信息 </a> </li> <li class="side_article_list_item"> 9. <a href="http://hk.uwenku.com/question/p-wklczvam-nm.html" target="_blank" title="從特定頁面獲取父頁面標題名稱ID​​號"> 從特定頁面獲取父頁面標題名稱ID​​號 </a> </li> <li class="side_article_list_item"> 10. <a href="http://hk.uwenku.com/question/p-nwxalzla-bka.html" target="_blank" title="如何從Google表格中的網址獲取頁面標題?"> 如何從Google表格中的網址獲取頁面標題? 
</a> </li> </ul> </div> </div> </div> </div> </div> </div> </div><!-- wrap end--> <!-- footer --> <footer id="footer"> <div class="bg-simple lt"> <div class="container"> <div class="row padder-v m-t"> <div class="col-xs-8"> <ul class="list-inline"> <li><a href="http://hk.uwenku.com/contact">聯系我們</a></li> <li>© 2020 HK.UWENKU.COM</li> <li><a target="_blank" href="https://beian.miit.gov.cn/">沪ICP备13005482号-4</a></li> <li><script type="text/javascript" src="https://v1.cnzz.com/z_stat.php?id=1280101193&web_id=1280101193"></script></li> <li><a href="http://www.uwenku.com/" target="_blank" title="优文库">简体中文</a></li> <li><a href="http://hk.uwenku.com/" target="_blank" title="優文庫">繁體中文</a></li> <li><a href="http://ru.uwenku.com/" target="_blank" title="поле вопросов и ответов">Русский</a></li> <li><a href="http://de.uwenku.com/" target="_blank" title="Frage - und - antwort - Park">Deutsch</a></li> <li><a href="http://es.uwenku.com/" target="_blank" title="Preguntas y respuestas">Español</a></li> <li><a href="http://hi.uwenku.com/" target="_blank" title="कार्यक्रम प्रश्न और उत्तर पार्क">हिन्दी</a></li> <li><a href="http://it.uwenku.com/" target="_blank" title="IL Programma di chiedere Park">Italiano</a></li> <li><a href="http://ja.uwenku.com/" target="_blank" title="プログラム問答園区">日本語</a></li> <li><a href="http://ko.uwenku.com/" target="_blank" title="프로그램 문답 단지">한국어</a></li> <li><a href="http://pl.uwenku.com/" target="_blank" title="program o park">Polski</a></li> <li><a href="http://tr.uwenku.com/" target="_blank" title="Program soru ve cevap parkı">Türkçe</a></li> <li><a href="http://vi.uwenku.com/" target="_blank" title="Đáp ứng viên">Tiếng Việt</a></li> <li><a href="http://fr.uwenku.com/" target="_blank" title="Programme interrogation Park">Française</a></li> </ul> </div> </div> </div> </div> </div> </footer> <!-- / footer --> <script> var _hmt = _hmt || []; (function() { var hm = document.createElement("script"); hm.src = 
"https://hm.baidu.com/hm.js?f78a970f17b19a79fc477a3378096f29"; var s = document.getElementsByTagName("script")[0]; s.parentNode.insertBefore(hm, s); })(); </script> </body> </html>