我正在爬行一個網站,每天嚴格的用戶下載頁面,大約(1000頁),之後,用戶無法登錄,直到明天0: 00。scrapy與多個帳戶或用不同的帳戶(不同的cookie)重新登錄
因此,我註冊了很多賬戶來面對它。網站確實使用cookie。
這是我的問題,如何在用戶過期時重新登錄帳戶,並繼續抓取'堆棧'中的舊頁面。 這裏是我的代碼可能會幫助你瞭解我的問題。
def start_requests(self):
return [Request(self.start_urls[0], meta = {'cookiejar' : 1}, callback = self.login,dont_filter=True)]
def login(self, response):
self.account = self.accounts[self.line_count].split(",")
self.line_count = self.line_count+1
if(len(self.accounts)<=self.line_count):
self.line_count = 0;
self.log('Preparing login:'+self.account[0]+":"+self.account[1].rstrip())
return [FormRequest.from_response(response,
meta = {'cookiejar' : response.meta['cookiejar']},
headers = self.headers,
formdata = {
'j_email': self.account[0],
'j_password': self.account[1].rstrip(),
'submit': 'Ok'
},
callback = self.parse_url,
dont_filter = True,
)]
當我面對下面的問題,我會再次START_REQUEST,但它沒有工作或錯過了太多的頁(似會話相同的錯誤已過期)。
def parse_page_imo(self, response):
hxs = Selector(response)
loginfail = hxs.xpath('//table[@class="tab"]/tbody/tr/td/div[@id="encart"]/li/text()').extract()
if loginfail==([u'Your login (e-mail) or/and password are unknown in Equasis. Please, try again']):
print "relogin"
self.start_requests()
if loginfail==([u'Your session has expired, please try to login again']):
print "relogin"
self.start_requests()
if loginfail==([u'You have been disconnected or your login/password is unknown in Equasis. Please, try again.']):
print "relogin"
self.start_requests()
if loginfail==([u'By security, your session has been cancelled.']):
print "relogin"
self.start_requests()
....codes to parse items....
這裏是我的洞代碼:
# -*- coding:utf-8 -*-
from scrapy.contrib.spiders import CrawlSpider
from scrapy.http import Request, FormRequest
from imo_dlcosco_ships.settings import URLS, COOKIES, HEADER
from imo_dlcosco_ships.items import ShipListItem
from scrapy.selector import Selector
import time
class EquasisSpider(CrawlSpider):
name = 'imo_202'
allowed_domains = ["www.equasis.org"]
start_urls = [
"http://www.equasis.org/EquasisWeb/public/HomePage",
]
def __init__(self):
self.headers = HEADER
self.cookies = COOKIES
self.urls = URLS
f = open("account.txt", "r")
self.accounts = f.readlines()
f.close()
self.line_count = 0
#login
def start_requests(self):
return [Request(self.start_urls[0], meta = {'cookiejar' : 1}, callback = self.login,dont_filter=True)]
def login(self, response):
self.account = self.accounts[self.line_count].split(",")
self.line_count = self.line_count+1
if(len(self.accounts)<=self.line_count):
self.line_count = 0;
self.log('Preparing login:'+self.account[0]+":"+self.account[1].rstrip())
return [FormRequest.from_response(response,
meta = {'cookiejar' : response.meta['cookiejar']},
headers = self.headers,
formdata = {
'j_email': self.account[0],
'j_password': self.account[1].rstrip(),
'submit': 'Ok'
},
callback = self.parse_url,
dont_filter = True,
)]
def parse_url(self, response):
return [FormRequest(url="http://www.equasis.org/EquasisWeb/restricted/ShipSearchAdvanced?fs=ShipSearch",
meta = {'cookiejar' : response.meta['cookiejar']},
headers = self.headers,
cookies = self.cookies,
formdata = {
'P_PAGE': '1'
},
dont_filter = True,
callback = self.parse_imo_url,
)]
def parse_imo_url(self, response):
return [FormRequest(url="http://www.equasis.org/EquasisWeb/restricted/ShipList?fs=ShipSearch",
meta = {'cookiejar' : response.meta['cookiejar']},
headers = self.headers,
cookies = self.cookies,
formdata = {
'P_CLASS_ST_rb':'HC',
'P_CLASS_rb':'HC',
'P_CatTypeShip':'6',
'P_CatTypeShip_p2':'6',
'P_CatTypeShip_rb':'CM',
'P_DW_GT':'250000',
'P_DW_LT':'999999',
'P_FLAG_rb':'HC',
'P_PAGE':'1',
'Submit':'SEARCH'
},
dont_filter = True,
callback = self.parse_page_num,
)]
def parse_page_num(self,response):
hxs = Selector(response)
loginfail = hxs.xpath('//table[@class="tab"]/tbody/tr/td/div[@id="encart"]/li/text()').extract()
if loginfail==([u'Your login (e-mail) or/and password are unknown in Equasis. Please, try again']):
print "relogin"
self.start_requests()
if loginfail==([u'Your session has expired, please try to login again']):
print "relogin"
self.start_requests()
if loginfail==([u'You have been disconnected or your login/password is unknown in Equasis. Please, try again.']):
print "relogin"
self.start_requests()
if loginfail==([u'By security, your session has been cancelled.']):
print "relogin"
self.start_requests()
htmlurl = response._url.split('?')[0]
f = open('page.html','a')
f.write(response.body)
f.close()
if(htmlurl=='http://www.equasis.org/EquasisWeb/restricted/ShipList'):
temp1 = hxs.xpath('//form[@name="form"]/table[@class="tab"]/tbody/tr/td[@align="right"]/span/a/@onclick').extract()
temp2 = temp1[len(temp1)-1].split(";document")[0]
PageNum = temp2.split("P_PAGE.value=")[1].encode("utf-8")
for h in range(int(PageNum)):
yield FormRequest(url="http://www.equasis.org/EquasisWeb/restricted/ShipList?fs=ShipList",
meta={'cookiejar' : response.meta['cookiejar'],'pageNum':str(h+1)},
headers = self.headers,
cookies = self.cookies,
formdata = {
'P_CALLSIGN':'',
'P_IMO':'',
'P_NAME':'',
'P_PAGE':'%d' %(h+1)
},
dont_filter = True,
callback = self.parse_page_imo
)
def parse_page_imo(self, response):
hxs = Selector(response)
loginfail = hxs.xpath('//table[@class="tab"]/tbody/tr/td/div[@id="encart"]/li/text()').extract()
if(loginfail==([u'Your login (e-mail) or/and password are unknown in Equasis. Please, try again'])):
print "relogin"
self.start_requests()
if(loginfail == [u'Your session has expired, please try to login again']):
print "relogin"
self.start_requests()
if(loginfail == [u'You have been disconnected or your login/password is unknown in Equasis. Please, try again.']):
print "relogin"
self.start_requests()
if(loginfail == [u'By security, your session has been cancelled.']):
print "relogin"
self.start_requests()
htmlurl = response._url.split('?')[0]
if(htmlurl=='http://www.equasis.org/EquasisWeb/restricted/ShipList'):
item = ShipListItem()
shipNameHtml = hxs.xpath('//form[@name="formShip"]/table[@class="tab"]/tbody/tr/td[1]').extract()
shipHtmlTitle = Selector(text=shipNameHtml[0]).xpath('//text()').extract()
if(shipHtmlTitle[0].find('Name of ship')>-1):
item['ship_name'] = hxs.xpath('//form[@name="formShip"]/table[@class="tab"]/tbody/tr/td[1]/a/text()').extract()
onclickValue = hxs.xpath('//form[@name="formShip"]/table[@class="tab"]/tbody/tr/td[1]/a/@onclick').extract()
for i in range(len(onclickValue)):
onclickValue2 = onclickValue[i].split(";document")[0]
onclickValue3 = onclickValue2.split("P_IMO.value=")[1].encode("utf-8")
onclickValue[i] = onclickValue3.strip('\'')
item['imo'] = onclickValue
for h in range(len(item['imo'])):
p_imo = item['imo'][h]
ShipName = item['ship_name'][h]
p_imo = p_imo.rstrip()
yield FormRequest("http://www.equasis.org/EquasisWeb/restricted/ShipInfo?fs=ShipList",
meta = {'cookiejar' : response.meta['cookiejar'],'P_imo':p_imo,'ShipName':ShipName},
headers = self.headers,
cookies = self.cookies,
formdata = {
'P_IMO': p_imo
},
dont_filter = True,
callback = self.parse_page_mmsi,
)
def parse_page_mmsi(self,response):
hxs = Selector(response)
loginfail = hxs.xpath('//table[@class="tab"]/tbody/tr/td/div[@id="encart"]/li/text()').extract()
if(loginfail==([u'Your login (e-mail) or/and password are unknown in Equasis. Please, try again'])):
print "relogin"
self.start_requests()
if(loginfail == [u'Your session has expired, please try to login again']):
print "relogin"
self.start_requests()
if(loginfail == [u'You have been disconnected or your login/password is unknown in Equasis. Please, try again.']):
print "relogin"
self.start_requests()
if(loginfail == [u'By security, your session has been cancelled.']):
print "relogin"
self.start_requests()
shipHtml = hxs.xpath('//table[@class="encart"]/tbody/tr').extract()
item=ShipListItem()
item['mmsi'] = [u'']
for j in range(len(shipHtml)):
shipHtmlTitle = Selector(text=shipHtml[j]).xpath('//td[1]/text()').extract()
if(shipHtmlTitle[0].find('MMSI :')>-1):
item['mmsi'] = Selector(text=shipHtml[j]).xpath('//td[2]/text()').extract()
item['imo'] = response.meta['P_imo']
item['ship_name'] = response.meta['ShipName']
yield item
我曾嘗試用yield [Request(self.start_urls [0],meta = {'cookiejar':1},callback = self.login,dont_filter = True)]替換start_requests()。但它並沒有幫助。我不認爲這應該是我的問題的直接解決方案 – sacuba
@sacuba * 1. *您*不*正確:在我的示例中,它是「產生」一個請求對象,而你試圖「屈服」一個「列表」。 'scrapy'不會處理響應回調中的'list'。 * 2. *您只發布了部分代碼(例如網站的確切位置?您的'parse_page_imo'在哪裏被調用?),這裏的人無法知道您認爲的是什麼直接解決方案。 * 3. *至少這是一個問題,問題一個接一個解決。 – starrify
我明白了你的觀點,這是我的錯,它返回的不是一個Request對象的迭代列表。我對我的錯誤感到抱歉。另外,我在我的問題中添加了漏洞代碼(不會更改'返回錯誤',可能是原始代碼可能會幫助您理解我的問題)。期待您的回覆,謝謝! – sacuba