2015-07-01 61 views
0

我剛開始接觸 Scrapy。大約 4 天以來,我一直卡在抓取 showthread.php(論壇基於 vBulletin)時如何轉到下一頁。問題是:如何用 Scrapy 在 showthread.php 上轉到下一頁?

我的目標:http://forum.femaledaily.com/showthread.php?359-Hair-Smoothing

import scrapy 
from scrapy.spiders import CrawlSpider, Rule 
from scrapy.linkextractors import LinkExtractor 
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor 

from femaledaily.items import FemaledailyItem 

class Femaledaily(scrapy.Spider):
    """Spider that scrapes posts from the Female Daily "Hair Care" forum.

    ``parse`` handles the forum listing pages given in ``start_urls`` and
    schedules one request per thread; ``parse_thread`` yields one
    ``FemaledailyItem`` per post found on the fetched thread page. Only the
    page that was requested is scraped; further thread pages are not followed.
    """

    name = "femaledaily"
    allowed_domains = ["femaledaily.com"]
    start_urls = [
        "http://forum.femaledaily.com/forumdisplay.php?136-Hair-Care",
        "http://forum.femaledaily.com/forumdisplay.php?136-Hair-Care/page2",
        "http://forum.femaledaily.com/forumdisplay.php?136-Hair-Care/page3",
        "http://forum.femaledaily.com/forumdisplay.php?136-Hair-Care/page4",
    ]

    def parse(self, response):
        """Yield a request for every thread linked from a forum listing page.

        Args:
            response: The downloaded forum listing page.

        Yields:
            scrapy.Request: One request per thread, handled by
            ``parse_thread``.
        """
        for row in response.css("tbody > tr "):
            hrefs = row.xpath(
                './/div[@class="threadlist-title"]/a/@href'
            ).extract()
            if not hrefs:
                # Rows that contain no thread title link would otherwise
                # raise IndexError on hrefs[0] and abort the whole page.
                continue

            print("==========NEW THREAD======")
            # urljoin resolves relative hrefs against the page URL and
            # leaves already-absolute hrefs untouched.
            url = response.urljoin(hrefs[0])
            print(url)
            yield scrapy.Request(url, callback=self.parse_thread)

    def parse_thread(self, response):
        """Yield one item per post on a thread page.

        Args:
            response: The downloaded thread page.

        Yields:
            FemaledailyItem: With ``thread_title`` and ``post_creator`` as
            lists of extracted strings, and ``post_content`` as a single
            string with tabs and newlines removed.
        """
        # The thread title is the same for every post on the page, so it is
        # extracted once instead of once per post.
        thread_title = response.selector.xpath(
            '//span[@class="threadtitle"]/a/text()'
        ).extract()

        for post in response.xpath('//ol[@id="posts"]/li'):
            item = FemaledailyItem()
            item['thread_title'] = thread_title

            post_creator = post.xpath(
                './/div[@class="username_container"]/a/text()'
            ).extract()
            if not post_creator:
                # Fall back to the variant where the user name is wrapped in
                # a <span> inside the link.
                post_creator = post.xpath(
                    './/div[@class="username_container"]/a/span/text()'
                ).extract()
            item['post_creator'] = post_creator

            fragments = post.xpath(
                ".//blockquote[@class='postcontent restore ']/text()"
            ).extract()
            item['post_content'] = "".join(
                text.replace('\t', '').replace('\n', '') for text in fragments
            )

            yield item

我能夠取得每個主題(thread)的前 10 篇帖子,但不知道該如何前往下一頁。有任何想法嗎?

回答

1

對你的代碼稍作修改,使其能夠正確分頁:

import scrapy 
from scrapy.spiders import CrawlSpider, Rule 
from scrapy.linkextractors import LinkExtractor 
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor 

from femaledaily.items import FemaledailyItem 

class Femaledaily(scrapy.Spider):
    """Spider that scrapes posts from the Female Daily "Hair Care" forum.

    ``parse`` handles forum listing pages and schedules one request per
    thread; ``parse_thread`` yields one ``FemaledailyItem`` per post. Both
    callbacks follow the page's ``rel="next"`` link, when present, so that
    listings and threads are crawled page by page.
    """

    name = "femaledaily"
    allowed_domains = ["femaledaily.com"]
    # Kept as a public class attribute for existing users; URLs are now
    # resolved with response.urljoin instead of string concatenation.
    BASE_URL = "http://forum.femaledaily.com/"
    start_urls = [
        "http://forum.femaledaily.com/forumdisplay.php?136-Hair-Care",
        "http://forum.femaledaily.com/forumdisplay.php?136-Hair-Care/page2",
        "http://forum.femaledaily.com/forumdisplay.php?136-Hair-Care/page3",
        "http://forum.femaledaily.com/forumdisplay.php?136-Hair-Care/page4",
    ]

    def _next_page_request(self, response, callback):
        """Return a request for the page's ``rel="next"`` link, or None."""
        next_page = response.xpath(
            '//li[@class="prev_next"]/a[@rel="next"]/@href'
        ).extract()
        if not next_page:
            return None
        # scrapy.Request is used explicitly: the bare name ``Request`` is
        # never imported in this module and would raise NameError.
        return scrapy.Request(
            response.urljoin(next_page[0]), callback=callback
        )

    def parse(self, response):
        """Yield thread requests for a listing page, then the next listing.

        Args:
            response: The downloaded forum listing page.

        Yields:
            scrapy.Request: One request per thread (handled by
            ``parse_thread``), followed by a request for the next listing
            page when a ``rel="next"`` link exists.
        """
        for row in response.css("tbody > tr "):
            hrefs = row.xpath(
                './/div[@class="threadlist-title"]/a/@href'
            ).extract()
            if not hrefs:
                # Rows that contain no thread title link would otherwise
                # raise IndexError on hrefs[0] and abort the whole page.
                continue

            print("==========NEW THREAD======")
            yield scrapy.Request(
                response.urljoin(hrefs[0]), callback=self.parse_thread
            )

        # pagination
        next_request = self._next_page_request(response, self.parse)
        if next_request is not None:
            yield next_request

    def parse_thread(self, response):
        """Yield one item per post on a thread page, then the next page.

        Args:
            response: The downloaded thread page.

        Yields:
            FemaledailyItem: With ``thread_title`` and ``post_creator`` as
            lists of extracted strings, and ``post_content`` as a single
            string with tabs and newlines removed.
            scrapy.Request: A request for the next page of the thread when a
            ``rel="next"`` link exists.
        """
        # The thread title is the same for every post on the page, so it is
        # extracted once instead of once per post.
        thread_title = response.selector.xpath(
            '//span[@class="threadtitle"]/a/text()'
        ).extract()

        for post in response.xpath('//ol[@id="posts"]/li'):
            item = FemaledailyItem()
            item['thread_title'] = thread_title

            post_creator = post.xpath(
                './/div[@class="username_container"]/a/text()'
            ).extract()
            if not post_creator:
                # Fall back to the variant where the user name is wrapped in
                # a <span> inside the link.
                post_creator = post.xpath(
                    './/div[@class="username_container"]/a/span/text()'
                ).extract()
            item['post_creator'] = post_creator

            fragments = post.xpath(
                ".//blockquote[@class='postcontent restore ']/text()"
            ).extract()
            item['post_content'] = "".join(
                text.replace('\t', '').replace('\n', '') for text in fragments
            )

            yield item

        # pagination
        next_request = self._next_page_request(response, self.parse_thread)
        if next_request is not None:
            yield next_request

這裏先提取下一頁的鏈接(即單個向前箭頭),然後對該 next_page_url 發出請求,並把回調函數設爲發出請求的同一個函數。當到達最後一頁時,下一頁的鏈接不再存在,爬取隨之停止。

+0

謝謝,我修改了我的代碼庫:D –