2
我正在構建一個可覆蓋200多個網站的網絡爬蟲。我現在的代碼已經運行在一個由十幾個網站構建的外部JSON文件的頂部。示例:NightmareJS - 網絡爬蟲需要迭代JSON數據
[
{
"company": "My Company",
"url": "http://example.com/jobs/",
"query": "div.job-listings>dt a",
"link": "div.job-listings>dt a"
},
{
"company": "Another Company",
"url": "http://anothercompany.com/careers/",
"query": ".clearfix~ .col-sm-6+ .col-sm-6 a , .col-sm-6:nth-child(4) a",
"link": ".clearfix~ .col-sm-6+ .col-sm-6 a , .col-sm-6:nth-child(4) a"
}
]
當我試圖async。每次它會記錄所有在功能上的原始對象的試圖進入噩夢實例之前,然後返回error Nothing responds to "goto"
。然後我嘗試async .eachSeries,它打印出正確的結果,但在第一次迭代後停止。
var async = require ('async');
var data = require('./input.json')
var Nightmare = require('nightmare');
var nightmare = Nightmare({ show: false })
function crawl(data, cb) {
console.log(data) // When async.each will iterate all items then error
var nightmare = new Nightmare()
nightmare
.goto(data.url) // go to JSON specified url
.wait(data.query) // wait until CSS selector loads
.evaluate(function (data) {
positionsArr = []
obj = {}
obj.company = data.company
query = document.querySelectorAll(data.query)
link = document.querySelectorAll(data.link)
/* Set query and link equal to all elements with selector
itearte through appending text (innerText) from each element
with job url to obj*/
var i;
for (i = 0; i < query.length; i++) {
positionsObj = {}
positionsObj.title = query[i].innerText.trim()
// if each position has individual page
if (data.link !== null) {
positionsObj.url = link[i].href
} else {
positionsObj.url = data.url
}
positionsArr.push(positionsObj)
}
obj.positions = positionsArr
return obj
}, data)
.end()
.then(function (obj) {
console.log(obj)
console.log('done')
})
.catch(function (error) {
console.error('error', error);
});
}
async.eachSeries(data, crawl, function (err){
console.log('done!');
})
如何在不必爲每個文件編寫單獨文件的情況下進行此項工作?或者有更好的方法來抓取這個數量的網站?