I've written a Scrapy spider that has to handle some pages via AJAX. In theory it should work, and it does work fine when I drive it by hand with fetch() in the Scrapy shell, but when I run "scrapy crawl ..." I don't see a single POST request in the log and no items get scraped. What could it be, and where is the root of the problem? In short: Scrapy is not sending the POST requests.
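For context, this is roughly how I test one of the POST requests by hand in the shell (the URL is the real one from the spider; the payload dict is omitted here, just like in the code below; fetch() accepts a Request object as well as a URL):

    $ scrapy shell 'http://www.expert.fi/'
    >>> import json
    >>> from scrapy import Request
    >>> url = 'https://www.expert.fi/Umbraco/Api/Product/ProductsByCategory'
    >>> body = json.dumps(data)   # 'data' is the same payload dict the spider builds (omitted)
    >>> req = Request(url, method='POST', body=body,
    ...               headers={'Content-Type': 'application/json;charset=utf-8'})
    >>> fetch(req)                # the POST goes through and 'response' holds the JSON
    >>> json.loads(response.body).keys()

The spider itself: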
import scrapy
from scrapy import Request, FormRequest
import json


class ExpertSpider(scrapy.Spider):
    name = "expert"
    allowed_domains = ["expert.fi"]
    start_urls = (
        'http://www.expert.fi/',
    )

    def parse(self, response):
        categories = response.xpath('//div[@id="categories-navigation"]//a/@href').extract()
        for cat in categories:
            yield Request(response.urljoin(cat), callback=self.parseCat)

    def parseCat(self, response):
        catMenu = response.xpath('//div[@id="category-left-menu"]')
        if catMenu:
            subCats = catMenu.xpath('.//a[@class="category"]/@href').extract()
            for subCat in subCats:
                yield Request(response.urljoin(subCat), callback=self.parseCat)
        else:
            self.parseProdPage(response)
            print "I`ve reached this point"  # debug

    def parseProdPage(self, response):
        catId = response.css...
        url = 'https://www.expert.fi/Umbraco/Api/Product/ProductsByCategory'
        data = dict()
        ...
        jsonDict = json.dumps(data)
        heads = dict()
        heads['Content-Type'] = 'application/json;charset=utf-8'
        heads['Content-Length'] = len(jsonDict)
        heads['Accept'] = 'application/json, text/plain, */*'
        heads['Referer'] = response.url
        return Request(url=url, method="POST", body=jsonDict, headers=heads, callback=self.startItemProc)

    def startItemProc(self, response):
        resDict = json.loads(response.body)
        item = dict()
        for it in resDict['Products']:
            # Product data
            ...
            item['Category Path'] = it['Breadcrumb'][-1]['Name'] + ''.join([' > ' + crumb['Name']
                                                                            for crumb in it['Breadcrumb'][-2::-1]])
            # Make the new request for delivery price
            url = 'https://www.expert.fi/Umbraco/Api/Cart/GetFreightOptionsForProduct'
            data = dict()
            ...
            jsonDict = json.dumps(data)
            heads = dict()
            heads['Content-Type'] = 'application/json;charset=utf-8'
            heads['Content-Length'] = len(jsonDict)
            heads['Accept'] = 'application/json, text/plain, */*'
            heads['Referer'] = item['Product URL']
            req = Request(url=url, method="POST", body=jsonDict, headers=heads, callback=self.finishItemProc)
            req.meta['item'] = item
            yield req

    def finishItemProc(self, response):
        item = response.meta['item']
        ansList = json.loads(response.body)
        for delivery in ansList:
            if delivery['Name'] == ...
                item['Delivery price'] = delivery['Price']
        return item
The log is (note that the stats below count only GET requests, not a single POST):
2016-10-09 01:11:16 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/exception_count': 9,
'downloader/exception_type_count/twisted.internet.error.DNSLookupError': 1,
'downloader/exception_type_count/twisted.internet.error.TimeoutError': 8,
'downloader/request_bytes': 106652,
'downloader/request_count': 263,
'downloader/request_method_count/GET': 263,
'downloader/response_bytes': 5644786,
'downloader/response_count': 254,
'downloader/response_status_count/200': 252,
'downloader/response_status_count/301': 1,
'downloader/response_status_count/302': 1,
'dupefilter/filtered': 19,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2016, 10, 8, 22, 11, 16, 949472),
'log_count/DEBUG': 265,
'log_count/INFO': 11,
'request_depth_max': 3,
'response_received_count': 252,
'scheduler/dequeued': 263,
'scheduler/dequeued/memory': 263,
'scheduler/enqueued': 263,
'scheduler/enqueued/memory': 263,
'start_time': datetime.datetime(2016, 10, 8, 22, 7, 7, 811163)}
2016-10-09 01:11:16 [scrapy] INFO: Spider closed (finished)
As I understand it now, the problem is calling one method from another like 'self.myMethodName(response)' – that simply doesn't work. But why, and how can I do what I need without just pasting the body of one method into the other? I suspect something along these lines is needed, though I'm not sure it's the right way (a sketch reusing the method names from the spider above):
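    def parseCat(self, response):
        catMenu = response.xpath('//div[@id="category-left-menu"]')
        if catMenu:
            subCats = catMenu.xpath('.//a[@class="category"]/@href').extract()
            for subCat in subCats:
                yield Request(response.urljoin(subCat), callback=self.parseCat)
        else:
            # parseProdPage() only *returns* a Request object; the callback has to
            # yield it back to the engine, otherwise the request is never scheduled
            yield self.parseProdPage(response)

– vchslv13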