1
我想抓取 geocaching.com,但像座標(coords)這樣的部分數據只有登錄的用戶才能看到。我正在使用 npm 上的「crawler」套件,但不知道該如何用這個 crawler 登錄;不過我已經取得了登錄表單的欄位名稱:(標題:NodeJS Crawler 登錄網站)
- ctl00$ContentBody$tbUsername:user(用戶名)
- ctl00$ContentBody$tbPassword:password(密碼)
- ctl00$ContentBody$btnSignIn:「Sign+In」
這是到目前爲止我的代碼:
var Crawler = require("crawler");
var url = require('url');
var mongoose = require("mongoose");
var Cache = require("./models/cache.js");
// Open the default Mongoose connection to the local "Cache" database.
// The explicit "mongodb://" scheme is required by current Mongoose/MongoDB
// driver versions; a bare "host:port/db" string is rejected there.
mongoose.connect("mongodb://localhost:27017/Cache");
/**
 * Remove every `<...>` tag-like sequence from a value.
 *
 * The argument is first converted with `String()`, so objects are stripped
 * based on their string representation. A `<` that is never closed by `>`,
 * and the empty pair `<>`, are left untouched.
 *
 * @param {*} text - value whose string form should be stripped of tags.
 * @returns {string} the string form of `text` without tag sequences.
 */
var removeTags = (text) => {
    const asString = String(text);
    return asString.replace(/<[^>]+>/g, '');
};
/**
 * Shared crawler instance.
 *
 * For every fetched page the callback:
 *   1. on geocache detail pages, reads the cache code from `.CoordInfoCode`
 *      and checks whether a document with that `_id` already exists;
 *   2. on HTML pages, queues every link found on the page for crawling.
 */
var c = new Crawler({
    maxConnections: 10,
    skipDuplicates: true,
    /**
     * @param {?Error} error - set when the request failed.
     * @param {Object} result - response; `request.uri.href` and `headers` are read here.
     * @param {Function} $ - DOM query function for the fetched page.
     */
    callback: function (error, result, $) {
        // A failed request must not crash the crawl on `result.request`.
        if (error || !result) {
            console.error("Crawl request failed:", error);
            return;
        }

        var pageUrl = result.request.uri.href;
        // Guard against responses for which no query function was provided.
        var hasDom = typeof $ === "function";

        // Accept both http and https variants of the geocache detail URL.
        if (hasDom && /^https?:\/\/www\.geocaching\.com\/geocache\//.test(pageUrl)) {
            var cache = new Cache(); // presumably populated by the save step below
            var id = removeTags($(".CoordInfoCode"));
            Cache.count({
                "_id": id
            }, function (err, count) {
                if (err) {
                    console.error("Could not look up cache " + id + ":", err);
                    return;
                }
                if (count < 1) {
                    //Saving the data
                }
            });
        }

        // Match on the media type only, so charset case/spacing does not matter.
        var contentType = String((result.headers || {})['content-type'] || '');
        if (!hasDom || !/^text\/html\b/i.test(contentType)) {
            return;
        }

        $('a').each(function (index, a) {
            var href = $(a).attr('href');
            if (!href) {
                return; // anchor without a target
            }
            // Resolve relative links against the current page and drop the
            // fragment, which is never sent to the server.
            var target = url.resolve(pageUrl, href).replace(/#.*$/, '');
            if (!/^https?:\/\//i.test(target)) {
                return; // skip javascript:, mailto:, etc.
            }
            // Defer queueing so the current callback finishes first.
            process.nextTick(function () {
                c.queue(target);
            });
        });
    }
});
// Seed the crawl with the starting page; further pages are discovered from
// the links queued inside the crawler callback.
var startUrl = 'http://www.geocaching.com/seek/nearest.aspx?ul=Die_3sten_3';
c.queue(startUrl);