2016-08-07 32 views
2

我正在構建一個要覆蓋 200 多個網站的網絡爬蟲。我目前的代碼基於一個外部 JSON 文件運行,該文件現在收錄了十幾個網站。示例如下(本問題標題:NightmareJS - 網絡爬蟲需要迭代 JSON 數據):

[ 
    { 
    "company": "My Company", 
    "url": "http://example.com/jobs/", 
    "query": "div.job-listings>dt a", 
    "link": "div.job-listings>dt a" 
    }, 
    { 
    "company": "Another Company", 
    "url": "http://anothercompany.com/careers/", 
    "query": ".clearfix~ .col-sm-6+ .col-sm-6 a , .col-sm-6:nth-child(4) a", 
    "link": ".clearfix~ .col-sm-6+ .col-sm-6 a , .col-sm-6:nth-child(4) a" 
    } 
] 

當我嘗試使用 async.each 時,它會先在函數中打印出所有原始對象,然後才嘗試進入 Nightmare 實例,接著返回錯誤 Nothing responds to "goto"。之後我改用 async.eachSeries,它能打印出正確的結果,但在第一次迭代後就停止了。

// `async` supplies the eachSeries iteration used at the bottom of the script.
var async = require ('async'); 
// Site descriptors; expected to be an array of { company, url, query, link }
// entries like the JSON sample — TODO confirm against the real input.json.
var data = require('./input.json') 
// Nightmare provides the goto/wait/evaluate/end chain used inside crawl().
var Nightmare = require('nightmare'); 
// NOTE(review): crawl() declares its own local `nightmare`, which shadows this
// instance, so nothing in the visible code uses it.
var nightmare = Nightmare({ show: false }) 

/**
 * Visits one site and logs the job positions found on it.
 *
 * Opens data.url in a fresh Nightmare instance, waits for the data.query
 * selector, collects the text and URL of every matching element inside the
 * page, then logs the resulting object.
 *
 * NOTE(review): `cb` is never invoked anywhere in this body, so whatever is
 * driving the iteration (async.each / async.eachSeries below) gets no signal
 * that this item has finished or failed.
 *
 * @param {Object} data - One site entry; the code reads data.company,
 *     data.url, data.query and data.link.
 * @param {Function} cb - Completion callback supplied by the caller
 *     (currently unused, see the note above).
 * @returns {undefined} Nothing is returned; results are only logged.
 */
function crawl(data, cb) { 
    // With async.each, this log appears for every item before the error is reported.
    console.log(data) 
    // Local instance; shadows the module-level `nightmare`.
    var nightmare = new Nightmare() 
    nightmare 
    .goto(data.url) // navigate to the URL given in the JSON entry 
    .wait(data.query) // wait until the CSS selector matches something 
    // The function below uses `document`, i.e. it is meant to run inside the
    // loaded page; `data` is handed to it as the extra argument after it.
    .evaluate(function (data) { 
     // NOTE(review): positionsArr, obj, query, link and positionsObj are all
     // assigned without var/let/const, so in non-strict code they become
     // implicit globals.
     positionsArr = [] 
     obj = {} 
     obj.company = data.company 
     query = document.querySelectorAll(data.query) 
     link = document.querySelectorAll(data.link) 
    /* query and link now hold every element matched by their selectors.
    Iterate over query, pairing each element's text (innerText) with its
    job URL, and collect the pairs into obj. */ 
     var i; 
     for (i = 0; i < query.length; i++) { 
    positionsObj = {} 
    positionsObj.title = query[i].innerText.trim() 
     // Use the per-position link when a link selector is configured.
     // NOTE(review): the check is strictly against null, so a missing
     // (undefined) link still takes this branch; link[i] also assumes the
     // link list lines up index-for-index with query — TODO confirm.
     if (data.link !== null) { 
     positionsObj.url = link[i].href 
     } else { 
      // No per-position page: fall back to the listing URL itself.
      positionsObj.url = data.url 
     } 
    positionsArr.push(positionsObj) 
     } 
     obj.positions = positionsArr 
     return obj 
    }, data) 
    .end() 
    .then(function (obj) { 
    // obj is the { company, positions } object returned from evaluate().
    console.log(obj) 
    console.log('done') 
    }) 
    .catch(function (error) { 
    // Errors are only logged here; they are not passed on to cb.
    console.error('error', error); 
    }); 
} 


// Crawl the sites one at a time. The final callback runs once every item has
// called back, or as soon as one item calls back with an error.
async.eachSeries(data, crawl, function (err) {
    if (err) {
        // Report the failure instead of claiming success.
        console.error('crawl failed', err);
        return;
    }
    console.log('done!');
});

如何在不必爲每個網站單獨編寫一個文件的情況下讓它正常工作?或者,有沒有更好的方法來抓取這麼多網站?

Source code

回答

1

如果你想繼續執行第二步及後續步驟,就需要調用回調(cb):

.end() 
.then(function (obj) { 
    console.log(obj); 
    console.log('done'); 
    // Signal that this item finished so the series can move on to the next one.
    cb(); 
}) 
.catch(function (error) { 
    console.error('error', error); 
    // Pass the error to the callback so the iteration is told about the failure.
    cb(error); 
});