// Ask the Bobik service to run a set of queries against several sites and
// print whatever comes back for each key of the result object.
const bobik = new Bobik("YOUR_AUTH_TOKEN");
bobik.scrape({
    urls: ['amazon.com', 'zynga.com', 'http://finance.google.com/', 'http://shopping.yahoo.com'],
    queries: ["//th", "//img/@src", "return document.title", "return $('script').length", "#logo", ".logo"]
}, function (scraped_data) {
    // Guard against a missing result object before enumerating it.
    if (!scraped_data) {
        console.log("Data is unavailable");
        return;
    }
    // Iterate the keys themselves (presumably the scraped URLs) rather than
    // array indices, so the label printed is the key and not "0", "1", ...
    for (const url of Object.keys(scraped_data)) {
        console.log("Results from " + url + ": " + scraped_data[url]);
    }
});

在這裏，抓取是在遠程執行的，只有當結果準備就緒時才會回調你的代碼（也可以選擇在結果陸續可用時逐步收集）。

您可以在 https://github.com/emirkin/bobik_javascript_sdk 下載 Bobik client proxy SDK。

來源

2012-07-14 15:44:09 Yevgeniy

我自己也一直在做研究，發現 https://npmjs.org/package/wscraper 自稱是一個基於 cheerio.js 的

網絡抓取代理：它是核心 jQuery 的一個快速、靈活、精簡的實現；建立在 request.js 之上；靈感來自 http-agent.js

使用率很低（根據 npmjs.org），但對任何感興趣的人來說都值得一看。

來源

2013-06-03 23:49:26 electblake

這是我用 Node.JS 編寫的易於使用的通用抓取工具：https://github.com/harish2704/html-scrapper 。它可以根據預定義的模式提取信息。模式定義包括一個 CSS 選擇器和一個數據提取函數。它目前使用 cheerio 進行 DOM 解析。

來源

2014-05-19 05:25:03 harish2704

請查看 https://github.com/rc0x03/node-promise-parser

Fast: uses libxml C bindings 
Lightweight: no dependencies like jQuery, cheerio, or jsdom 
Clean: promise based interface- no more nested callbacks 
Flexible: supports both CSS and XPath selectors

來源

2014-06-09 18:20:03 user3723412

使用 ES7 / Promise 的新方式

通常情況下，當你進行抓取時，你會想用某種方法來

獲取 Web 服務器上的資源（通常是 HTML 文檔）
讀取該資源，並以下列方式之一來處理它
1. 作爲 DOM／樹結構，使其可導航
2. 用類似 SAX 的工具將其解析爲令牌（token）文檔。

樹解析和令牌解析各有優勢，但樹通常要簡單得多，所以我們採用樹的方式。請看看 request-promise，下面是它的用法：

const rp = require('request-promise');
const cheerio = require('cheerio'); // Basically jQuery for node.js

// Request settings: the `transform` hook passes the response body to
// cheerio, so the value handed to `.then` below is what `cheerio.load`
// returns rather than the raw body.
const options = {
    uri: 'http://www.google.com',
    transform: function (body) {
        return cheerio.load(body);
    }
};

rp(options)
    .then(function ($) {
        // Process html like you would with jQuery...
    })
    .catch(function (err) {
        // Crawling failed or Cheerio choked on the response body;
        // report it instead of silently dropping the failure.
        console.error(err);
    });

這裏使用的 cheerio 基本上是一個輕量級的服務器端 jQuery 風格的庫（也就是說，它不需要 window 對象或 jsdom）。

因爲使用的是 Promise，您還可以把它寫在異步函數（async function）中。藉助 ES7，代碼看起來是同步的，但實際上是異步執行的：

/**
 * Fetches the page described by the outer `options` through `rp` and logs
 * the text of its <title> element.
 *
 * If the request rejects, the error is reported with `console.error` and
 * the function returns without attempting to query the document.
 *
 * @returns {Promise<void>} Settles once the title has been logged or the
 *     failure has been reported.
 */
async function parseDocument() {
    let $;
    try {
        $ = await rp(options);
    } catch (err) {
        console.error(err);
        // `$` was never assigned; calling it below would throw a TypeError.
        return;
    }

    console.log($('title').text()); // prints just the text in the <title>
}

來源

2016-05-31 02:17:54

我看到大多數回答都使用 cheerio 等工具，方向是正確的；但是一旦到了需要解析並執行 JavaScript 的地步（例如 SPA 等），我建議看看 https://github.com/joelgriffith/navalia （我是作者）。Navalia 的構建目的是支持在無頭瀏覽器環境中進行抓取，而且速度非常快。謝謝！

來源

2017-07-01 17:34:59

使用Node.js實時刮取網頁

回答

使用ES7 /承諾

相關問題