2015-04-24 71 views
1

我已經在scrapy上寫了一個蜘蛛來抓取一個網站。除了一件事情之外,一切都運行正常。一旦蜘蛛到達最後一頁,它就開始從最後一頁抓回第一頁。 這是我的代碼。 進口scrapy 從scrapy.http導入請求Scrapy spider不會在頁面結束後停止

from tutorial.items import DmozItem 

class DmozSpider(scrapy.Spider): 
    name = "tutorial" 
    allowed_domain = ["jabong.com"] 
    start_urls = [ 
      "http://www.jabong.com/women/clothing/kurtas-suit-sets/kurtas-kurtis/?page=1" 
     ] 

    page_index = 1 

    def parse(self,response): 
     products = response.xpath('//li') 
     if products: 
      for product in products: 
       item = DmozItem() 
       item_url = product.xpath('@data-url').extract() 
       item_url = "http://www.jabong.com/" + item_url[0] if item_url else '' 
       if item_url: 
         request=Request(url=item_url,callback=self.parse_page2,meta={"item":item}, 
           headers={"Accept": 
         "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"}) 
        request.meta['item'] = item 
        yield request 
     else: 
      return 
     self.page_index += 1 
      if self.page_index: 
        yield Request(url="http://www.jabong.com/women/clothing/kurtas-suit-sets/kurtas-kurtis/?page=%s" %    (self.page_index), 
      headers={"Referer": "http://www.jabong.com/women/clothing/kurtas-suit-sets/kurtas-kurtis/", 
       "X-Requested-With": "XMLHttpRequest"}, 
         callback=self.parse) 

    def parse_page2(self, response): 
     sizes=[] 
      item = response.meta['item'] 
     item['site_name'] = 'jabong' 
     item['site_domain'] = 'http://www.jabong.com' 

     name = response.xpath('.//span[contains(@id, "qa-title-product")]/span/text()').extract() 
     item['name'] = name[0] if name else '' 

     brand = response.xpath('.//span[contains(@id, "qa-prd-brand")]/text()').extract() 
     item['brand'] = brand[0] if brand else '' 

     desc1 = response.xpath('.//div[contains(@id, "productInfo")]/p/text()').extract() 
     desc2 = response.xpath('.//div[contains(@id, "productInfo")]/p/span/text()').extract() 
     item['desc'] = desc1[0] if desc1 else desc2[0] if desc2 else '' 

     sku = response.xpath('//*[@id="qa-sku"]/text()').extract() 
     item['sku'] = sku[0] if sku else '' 

     item['age'] = 'adult'  

     gender = response.xpath('.//a[contains(@id, "qa-breadcrumb2")]/span/text()').extract() 
     item['gender'] = gender[0] if gender else ''   

     category = response.xpath('.//a[contains(@id, "qa-breadcrumb3")]/span/text()').extract() 
     item['category'] = category[0] if category else '' 

     sub_category = response.xpath('.//a[contains(@id, "qa-breadcrumb4")]/span/text()').extract() 
     item['sub_category'] = sub_category[0] if sub_category else '' 

     size = response.xpath('.//ul[contains(@id, "listProductSizes")]/li/text()').extract() 
     item['size'] = sizes 
     if size: 
      for s in size: 
       sizes.append(s.strip()) 
      item['size'] = sizes 


     material = response.xpath('//*[@id="productInfo"]/table/tr/td[contains(text(),"Fabric Details")]/../td[2]/text()').extract() 
     if material: 
      item['material'] = material[0] 
     else: 
      material = response.xpath('//*[@id="productInfo"]/table/tr/td[contains(text(),"Fabric")]/../td[2]/text()').extract() 
      item['material'] = material[0] if material else '' 

     pattern = response.xpath('//*[@id="productInfo"]/table/tr/td[contains(text(),"Pattern")]/../td[2]/text()').extract() 
     item['pattern'] = pattern[0] if pattern else '' 

     color = response.xpath('//*[@id="productInfo"]/table/tr/td[contains(text(),"Color")]/../td[2]/text()').extract() 
     item['colors'] = color if color else ''  

     style = response.xpath('//*[@id="productInfo"]/table/tr/td[contains(text(),"Style")]/../td[2]/text()').extract() 
     item['style'] = style[0] if style else '' 

     images = response.xpath('.//div[contains(@class, "thumb-slider pos-abs")]/span/@data-image-big').extract() 
     item['images'] = images if images else '' 

     price1 = response.xpath('.//span[contains(@id, "before_price")]/span[2]/text()').extract() 
     item['price'] = {} 
     item['price']['mrp'] = price1[0].strip() if price1 else '' 
     item['price']['discount'] = '' 
     item['price']['sale'] = '' 

     care_tips = response.xpath('//*[@id="productInfo"]/table/tr/td[contains(text(),"Wash Care")]/../td[2]/text()').extract() 
     item['care_tips'] = care_tips[0] if care_tips else '' 

     item['url'] = response.url 

     item['tags'] = '' 
     yield item 

回答

0

看來這個網站會重定向到http://www.jabong.com/women/clothing/kurtas-suit-sets/kurtas-kurtis/?page=*http://www.jabong.com/women/clothing/kurtas-suit-sets/kurtas-kurtis/,正常嗎?如果是這樣,我認爲根據你的代碼這是一個無盡的蜘蛛。

+0

如果你在郵遞員身上測試郵件以及必要的蜘蛛,它會給出只有該頁面的結果 –

+0

根據我的本地測試,如上所述,URL將被重定向。對於郵遞員,我不熟悉它,你可以忽略我的答案,並等待其他答案。 –