我需要使用Scrapy報廢每個項目的數據(http://example.com/itemview)。我有一個itemID列表,我需要通過example.com中的表單傳遞它。 每個項目都沒有網址更改。因此,對於我的蜘蛛中的每個請求,url將始終是相同的。但內容會有所不同。如何在調用item_scraped scrapy信號後開始新請求?
我不想爲處理每個請求的for循環。所以我按照下面提到的步驟。
- 開始蜘蛛與上述URL
- 加入item_scraped和spider_closed信號
- 通過幾個功能傳遞
- 通過刮下數據到管道
- trigerred的item_scraped信號
之後它會自動調用spider_closed信號。但是我想要繼續上述步驟直到完成itemID。
class ExampleSpider(scrapy.Spider):
name = "example"
allowed_domains = ["example.com"]
itemIDs = [11111,22222,33333]
current_item_num = 0
def __init__(self, itemids=None, *args, **kwargs):
super(ExampleSpider, self).__init__(*args, **kwargs)
dispatcher.connect(self.item_scraped, signals.item_scraped)
dispatcher.connect(self.spider_closed, signals.spider_closed)
def spider_closed(self, spider):
self.driver.quit()
def start_requests(self):
request = self.make_requests_from_url('http://example.com/itemview')
yield request
def parse(self,response):
self.driver = webdriver.PhantomJS()
self.driver.get(response.url)
first_data = self.driver.find_element_by_xpath('//div[@id="itemview"]').text.strip()
yield Request(response.url,meta={'first_data':first_data},callback=self.processDetails,dont_filter=True)
def processDetails(self,response):
itemID = self.itemIDs[self.current_item_num]
..form submission with the current itemID goes here...
...the content of the page is updated with the given itemID...
yield Request(response.url,meta={'first_data':response.meta['first_data']},callback=self.processData,dont_filter=True)
def processData(self,response):
...some more scraping goes here...
item = ExamplecrawlerItem()
item['first_data'] = response.meta['first_data']
yield item
def item_scraped(self,item,response,spider):
self.current_item_num += 1
#i need to call the processDetails function here for the next itemID
#and the process needs to contine till the itemID finishes
self.parse(response)
我piepline:
class ExampleDBPipeline(object):
def process_item(self, item, spider):
MYCOLLECTION.insert(dict(item))
return