我有一個非常簡單的爬蟲,它會抓取大約 250 個頁面,分配大約 400MB 的內存,並且永遠不會釋放。我不知道如何解決這個問題,也許有人能注意到問題所在,還望不吝賜教。標題:Node.js 內存泄漏(async.queue 與 request)
/**
 * Scrape a shop: fetch the landing page to discover how many result pages
 * exist, queue every page, extract item links from each page, fetch each
 * item page and collect its details into an array.
 *
 * @param {*} shop  Unused inside this function; kept for caller compatibility.
 * @param {Object} o  Options; `o.threads` is the request concurrency used
 *   for the async.queue.
 * @param {function(Error|null, Array=)} cb  Final callback; receives
 *   `(null, items)` on success or `(err)` on the first request error.
 * @param {function(number, number)} step  Progress callback `(done, total)`.
 */
function scrape(shop, o, cb, step) {
	var itemz = []
	var finished = false

	// Report termination exactly once. The original did `if (e) throw e`
	// inside an async callback, which the caller cannot catch and which
	// crashes the process; route errors through `cb` instead, and guard
	// against `cb` firing twice.
	function finish(e) {
		if (finished) return
		finished = true
		cb(e || null, e ? undefined : itemz)
	}

	var q = async.queue(function (task, next) {
		req({
			url: task.url
		}, function (e, r) {
			if (e) {
				next()
				return finish(e)
			}
			// Process the body BEFORE signalling the queue. Calling next()
			// first (as the original did) lets the next request start while
			// this response body is still referenced, inflating peak memory.
			task.cb(r.body)
			next()
		})
	}, o.threads)

	// Enqueue one URL with the callback that will receive its body.
	var get = function (url, bodyCb) {
		q.push({
			url: url,
			cb: bodyCb
		})
	}

	var url = 'https://www.host.com'
	var total
	var done = 0
	var itemsPerPage = 24

	get(url, function (r) {
		// `pages` was an implicit global in the original (missing `var`),
		// so two concurrent scrape() calls would clobber each other.
		var matches = r.match(/data-page="(\d+)"/g)
		var pages = 1
		if (matches && matches.length >= 2) {
			pages = matches[matches.length - 2].split("data-page=\"")[1].split('"')[0] || 1
		}
		pages = Math.min(pages, 10) // limit to 10 pages max (240 items)
		for (var i = 1; i <= pages; i++) {
			get(url + '&page=' + i, scrapeList)
		}
		// One progress tick per page plus one per expected item.
		total = pages + pages * itemsPerPage
	})

	// - extract the transaction links from a listing page and queue each
	//   item page; one progress tick for the page itself.
	function scrapeList(r) {
		var itemsFound = 0
		// NOTE: replace() is used only to iterate the matches; its return
		// value is intentionally discarded. Capture group 2 is the numeric
		// listing id (the original misleadingly called it `dateSold`).
		r.replace(/href="(https:\/\/www.host.com\/listing\/(\d+).*)"/g, function (s, itemUrl, listingId) {
			itemsFound++
			get(itemUrl, function (body) {
				scrapeItem(body, itemUrl, listingId)
				step(++done, total)
				if (done === total) finish()
			})
		})
		total -= itemsPerPage - itemsFound // decrease expected items, if less items per page found than initially expected
		step(++done, total)
		// The original only checked completion inside item callbacks, so a
		// run whose last event is a page with zero items never completed.
		if (done === total) finish()
	}

	// - from an item page, record the details into the items array.
	//   `r` (the page body) and `listingId` are currently unused; kept so
	//   the extraction can be extended without changing call sites.
	function scrapeItem(r, itemUrl, listingId) {
		itemz.push({
			url: itemUrl,
			date: new Date(),
			quantity: 1
		})
	}
}
你是如何調用 scrape(...) 函數的?了解這一點會很有幫助。它會把數組傳給它的回調函數。如果你持久地保存了該數組,那這些數據自然就會被持久保留。 – jfriend00
我將結果存儲在一個緩存數組中,並且用 setInterval 每分鐘遍歷該數組並清除它(delete cache[k])。 – freddor
您是否運行過堆快照(heap snapshot)並檢查過堆中到底保留了哪些內容? – jfriend00