使用 Scrapy 在 showthread.php 上翻到下一頁

我是 Scrapy 新手。大約 4 天以來，我一直卡在抓取 showthread.php（論壇基於 vBulletin）時如何翻到下一頁的問題上。

我的目標：http://forum.femaledaily.com/showthread.php?359-Hair-Smoothing

import scrapy 
from scrapy.spiders import CrawlSpider, Rule 
from scrapy.linkextractors import LinkExtractor 
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor 

from femaledaily.items import FemaledailyItem 

class Femaledaily(scrapy.Spider):
    """Spider that collects posts from the Female Daily "Hair Care" forum.

    ``parse`` handles the forum listing pages given in ``start_urls`` and
    schedules one request per thread link it finds.  ``parse_thread`` turns
    every post on a thread page into a ``FemaledailyItem``.

    No "next page" links are followed: only the listing pages named in
    ``start_urls`` and the first page of each thread are visited.
    """

    name = "femaledaily"
    allowed_domains = ["femaledaily.com"]
    start_urls = [
        "http://forum.femaledaily.com/forumdisplay.php?136-Hair-Care",
        "http://forum.femaledaily.com/forumdisplay.php?136-Hair-Care/page2",
        "http://forum.femaledaily.com/forumdisplay.php?136-Hair-Care/page3",
        "http://forum.femaledaily.com/forumdisplay.php?136-Hair-Care/page4",
    ]

    def parse(self, response):
        """Yield a request for every thread linked from a listing page.

        Args:
            response: The downloaded forum listing page.

        Yields:
            scrapy.Request: One request per thread, handled by
            ``parse_thread``.
        """
        for thd in response.css("tbody > tr "):
            url = thd.xpath('.//div[@class="threadlist-title"]/a/@href').extract()
            if not url:
                # A row without a thread link gives an empty list; indexing
                # it would raise IndexError and abort the rest of the page.
                continue

            thread_url = "http://forum.femaledaily.com/" + url[0]
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("==========NEW THREAD======")
            print(thread_url)
            yield scrapy.Request(thread_url, callback=self.parse_thread)

    def parse_thread(self, response):
        """Yield one ``FemaledailyItem`` per post on a thread page.

        Args:
            response: The downloaded thread page.

        Yields:
            FemaledailyItem: With ``thread_title``, ``post_creator`` and
            ``post_content`` filled in.
        """
        # The title is looked up on the whole page, so it is the same for
        # every post; query it once instead of once per post.
        thread_title = response.xpath('//span[@class="threadtitle"]/a/text()').extract()

        for page in response.xpath('//ol[@id="posts"]/li'):
            item = FemaledailyItem()
            # Copy so that items never share one mutable list.
            item['thread_title'] = list(thread_title)

            post_creator = page.xpath('.//div[@class="username_container"]/a/text()').extract()
            if not post_creator:
                # Fall back to a name wrapped in a <span> inside the link.
                post_creator = page.xpath('.//div[@class="username_container"]/a/span/text()').extract()
            item['post_creator'] = post_creator

            # Concatenate the text nodes of the post body, dropping tabs and
            # newlines.
            cot = page.xpath(".//blockquote[@class='postcontent restore ']/text()").extract()
            item['post_content'] = "".join(
                ct.replace('\t', '').replace('\n', '') for ct in cot
            )

            yield item

我能夠取得每個討論串的前 10 篇貼文，但不知道該如何前往下一頁。有任何想法嗎？

來源

2015-07-01 Fathur Rachman Widhiantoko

對你的代碼稍作修改，使它能夠正確地分頁：

import scrapy 
from scrapy.spiders import CrawlSpider, Rule 
from scrapy.linkextractors import LinkExtractor 
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor 

from femaledaily.items import FemaledailyItem 

class Femaledaily(scrapy.Spider):
    """Spider that collects posts from the Female Daily "Hair Care" forum.

    ``parse`` handles forum listing pages: it schedules one request per
    thread link and then follows the listing's "next" link.
    ``parse_thread`` turns every post on a thread page into a
    ``FemaledailyItem`` and then follows the thread's "next" link, so all
    pages of a thread are visited.
    """

    name = "femaledaily"
    allowed_domains = ["femaledaily.com"]
    # Prefix prepended to the hrefs extracted from the pages.
    BASE_URL = "http://forum.femaledaily.com/"
    start_urls = [
        "http://forum.femaledaily.com/forumdisplay.php?136-Hair-Care",
        "http://forum.femaledaily.com/forumdisplay.php?136-Hair-Care/page2",
        "http://forum.femaledaily.com/forumdisplay.php?136-Hair-Care/page3",
        "http://forum.femaledaily.com/forumdisplay.php?136-Hair-Care/page4",
    ]

    def _next_page_request(self, response, callback):
        """Return a request for the page's "next" link, or None if absent.

        Args:
            response: The page whose pagination links are inspected.
            callback: The method that should handle the next page.

        Returns:
            scrapy.Request or None: None when the page has no
            ``rel="next"`` link, which ends the pagination chain.
        """
        next_page = response.xpath('//li[@class="prev_next"]/a[@rel="next"]/@href').extract()
        if not next_page:
            return None
        # Only ``scrapy`` is imported by name, so the request class must be
        # referenced as ``scrapy.Request``; a bare ``Request`` is undefined.
        return scrapy.Request(self.BASE_URL + next_page[0], callback=callback)

    def parse(self, response):
        """Yield thread requests for a listing page, then the next listing.

        Args:
            response: The downloaded forum listing page.

        Yields:
            scrapy.Request: One request per thread (handled by
            ``parse_thread``), followed by a request for the next listing
            page when one exists (handled by ``parse``).
        """
        for thd in response.css("tbody > tr "):
            url = thd.xpath('.//div[@class="threadlist-title"]/a/@href').extract()
            if not url:
                # A row without a thread link gives an empty list; indexing
                # it would raise IndexError and abort the rest of the page,
                # including the pagination request below.
                continue

            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("==========NEW THREAD======")
            yield scrapy.Request(self.BASE_URL + url[0], callback=self.parse_thread)

        # pagination
        next_request = self._next_page_request(response, self.parse)
        if next_request is not None:
            yield next_request

    def parse_thread(self, response):
        """Yield the posts of a thread page, then the thread's next page.

        Args:
            response: The downloaded thread page.

        Yields:
            FemaledailyItem: One per post, with ``thread_title``,
            ``post_creator`` and ``post_content`` filled in.
            scrapy.Request: A request for the next page of the thread when
            one exists (handled by ``parse_thread``).
        """
        # The title is looked up on the whole page, so it is the same for
        # every post; query it once instead of once per post.
        thread_title = response.xpath('//span[@class="threadtitle"]/a/text()').extract()

        for page in response.xpath('//ol[@id="posts"]/li'):
            item = FemaledailyItem()
            # Copy so that items never share one mutable list.
            item['thread_title'] = list(thread_title)

            post_creator = page.xpath('.//div[@class="username_container"]/a/text()').extract()
            if not post_creator:
                # Fall back to a name wrapped in a <span> inside the link.
                post_creator = page.xpath('.//div[@class="username_container"]/a/span/text()').extract()
            item['post_creator'] = post_creator

            # Concatenate the text nodes of the post body, dropping tabs and
            # newlines.
            cot = page.xpath(".//blockquote[@class='postcontent restore ']/text()").extract()
            item['post_content'] = "".join(
                ct.replace('\t', '').replace('\n', '') for ct in cot
            )

            yield item

        # pagination
        next_request = self._next_page_request(response, self.parse_thread)
        if next_request is not None:
            yield next_request

這裏先提取「下一頁」的鏈接（即單個向前箭頭），然後對該鏈接發出請求，並把回調函數設爲當前所在的同一個函數。到達最後一頁時，「下一頁」鏈接不再存在，爬取便會停止。

來源

2015-07-01 07:14:19 Jithin

謝謝，我修改了我的代碼庫：D –

使用 Scrapy 在 showthread.php 上翻到下一頁

回答

相關問題