我正在做一些從許多來源(有數百萬條記錄)的爬蟲,但我有問題涉及內存不足。我GOOGLE了一下,發現了一些資源,但它並沒有解決我的問題。堆滿內存時抓取數百萬條記錄
similar question沒有解決不了我的問題
這裏是我的示例代碼:
function getContent() {
let d = q.defer();
let urls = [];
array.forEach(function(mang, index) {
for (let i = 1; i <= 600000; i++) {
urls.push(function (callback) {
setTimeout(function() {
let link = 'http://something.com/' + i;
let x = link;
let options = {
url: link,
headers: {
'User-Agent': 'something'
}
};
function callback1(error, response, html) {
if (!error) {
let $ = cheerio.load(html);
let tag_name = $('h1').text();
tag_name = tag_name.trim();
let tag_content = $('#content-holder').find('div').text();
let tagObject = new Object();
tagObject.tag_name = tag_name;
tagObject.tag_content = tag_content;
tagObject.tag_number = i;
tagArray.push(tagObject);
for (let v = 0; v < tagArray.length; v++) {
//console.log("INSERT INTO `tags` (tag_name, content, story_id, tag_number) SELECT * FROM (SELECT " + "'" + tagArray[v].tag_name + "'" + "," + "'" + tagArray[v].tag_content + "','" + array[c].story_id + "','" + tagArray[v].tag_number + "' as ChapName) AS tmp WHERE NOT EXISTS (SELECT `tag_name` FROM `tags` WHERE `tag_name`=" + "'" + tagArray[v].tag_name + "'" + ") LIMIT 1");
db.query("INSERT INTO `tags` (tag_name, content) " +
"SELECT * FROM (SELECT " + "'" + tagArray[v].tag_name + "'" + "," + "'" + tagArray[v].tag_content + "','" + "' as TagName) AS tmp " +
"WHERE NOT EXISTS (SELECT `tag_name` FROM `tags` WHERE `tag_name`=" + "'" + tagArray[v].tag_name + "'" + ") " +
"LIMIT 1", (err) => {
if (err) {
console.log(err);
}
});
}
urls = null;
global.gc();
console.log("Program is using " + heapUsed + " bytes of Heap.")
}
}
request(options, callback1);
callback(null, x);
}, 15000);
});
}
});
d.resolve(urls);
return d.promise;
}
getContent()
.then(function (data) {
let tasks = data;
console.log("start data");
async.parallelLimit(tasks, 10,() => {
console.log("DONE ");
});
})
我試圖用global.gc()函數,但似乎無法有效是
你*真的*需要創建一個最小可行的問題。展示你的工作,解釋你在找什麼等等。這個問題需要我們對你的方法做出太多的假設。 – Paul
@Paul我編輯了一些示例代碼:( –
添加了一個答案,但我不明白,'數組'是什麼?你正在做一個forEach,但我看不到你在哪裏使用其中的項目來完成任何事情,並且你意識到你正在向'array'中的堆*每項*添加至少120萬個函數,對嗎?我說至少因爲我不記得callback1是否會創建一次或每個範圍一次,很確定它是後者。 – Paul