我能夠從第一頁中抓取所有的故事,我的問題是如何移動到下一頁並繼續抓取故事和名稱。請檢查我的代碼,告訴我如何讓 Scrapy 爬蟲跟隨下一頁以抓取內容。
# -*- coding: utf-8 -*-
import scrapy
from cancerstories.items import CancerstoriesItem
class MyItem(scrapy.Item):
    """Container for one scraped story: the survivor's name and the story text.

    NOTE(review): the file also imports CancerstoriesItem but never uses it --
    consider consolidating on a single item class.
    """
    # Link text of the story link on the index page (the person's name).
    name = scrapy.Field()
    # Full story text, assembled from the detail page's <p> elements.
    story = scrapy.Field()
class MySpider(scrapy.Spider):
    """Spider that walks the paginated story index and scrapes each story.

    ``parse`` handles an index page: it queues one request per story link
    and then follows the "next page" link, so pagination continues past
    page 1 (the original version never advanced to the next page).
    ``parse_detail`` fills in the story text and emits the item.
    """
    name = 'cancerstories'
    allowed_domains = ['thebreastcancersite.greatergood.com']
    start_urls = ['http://thebreastcancersite.greatergood.com/clickToGive/bcs/stories/']

    def parse(self, response):
        """Parse one index page: yield a detail request per story, then
        follow the forward pagination link."""
        rows = response.xpath('//a[contains(@href,"story")]')
        # Loop over all links to individual stories on this index page.
        for row in rows:
            myItem = MyItem()  # Create a new item
            myItem['name'] = row.xpath('./text()').extract()  # link text = name
            # Build an absolute URL for the story detail page.
            story_url = response.urljoin(row.xpath('./@href').extract()[0])
            request = scrapy.Request(url=story_url, callback=self.parse_detail)
            request.meta['myItem'] = myItem  # carry the partly-filled item along
            yield request

        # Follow pagination. Match only the forward ("Next") link so the
        # spider does not bounce back along "prev" links and loop forever.
        # NOTE(review): selector assumes the pager link contains the text
        # "Next" and an href with "?page=" -- confirm against site markup.
        next_page = response.xpath(
            '//a[contains(@href, "?page=") and contains(., "Next")]/@href'
        ).extract_first()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

    def parse_detail(self, response):
        """Parse a story detail page and yield the completed item."""
        myItem = response.meta['myItem']  # item (with the name) passed via meta
        # Extract the raw story paragraphs.
        text_raw = response.xpath('//div[@class="photoStoryBox"]/div/p/text()').extract()
        # Strip each fragment and join with spaces. A generator expression is
        # used instead of map(unicode.strip, ...) because ``unicode`` exists
        # only on Python 2; this form behaves identically there and also
        # works on Python 3.
        myItem['story'] = ' '.join(fragment.strip() for fragment in text_raw)
        yield myItem  # return the finished item
我加了 `Rule(SgmlLinkExtractor(allow=('/clickToGive/bcs/stories\?page=[0-9]+',)), callback='parse_me', follow=True)`(別忘了把解析方法重命名,不要覆蓋 `parse`,否則要改用 `parse_start_url`),但添加這個之後並不會抓取第一頁 – leboMagma
我編輯了答案,寫了一個適當的回覆(comment),希望它對你有幫助 – Javitronxo
是的,謝謝。LinkExtractor 現在可以跟隨鏈接了,但它似乎會不斷地抓取頁面直到最後一頁,然後又沿著 prev 鏈接一路回到開頭 – leboMagma