1
我已經在scrapy上寫了一個蜘蛛來抓取一個網站。除了一件事情之外,一切都運行正常。一旦蜘蛛到達最後一頁,它就開始從最後一頁抓回第一頁。 這是我的代碼。 進口scrapy 從scrapy.http導入請求Scrapy spider不會在頁面結束後停止
from tutorial.items import DmozItem
class DmozSpider(scrapy.Spider):
name = "tutorial"
allowed_domain = ["jabong.com"]
start_urls = [
"http://www.jabong.com/women/clothing/kurtas-suit-sets/kurtas-kurtis/?page=1"
]
page_index = 1
def parse(self,response):
products = response.xpath('//li')
if products:
for product in products:
item = DmozItem()
item_url = product.xpath('@data-url').extract()
item_url = "http://www.jabong.com/" + item_url[0] if item_url else ''
if item_url:
request=Request(url=item_url,callback=self.parse_page2,meta={"item":item},
headers={"Accept":
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"})
request.meta['item'] = item
yield request
else:
return
self.page_index += 1
if self.page_index:
yield Request(url="http://www.jabong.com/women/clothing/kurtas-suit-sets/kurtas-kurtis/?page=%s" % (self.page_index),
headers={"Referer": "http://www.jabong.com/women/clothing/kurtas-suit-sets/kurtas-kurtis/",
"X-Requested-With": "XMLHttpRequest"},
callback=self.parse)
def parse_page2(self, response):
sizes=[]
item = response.meta['item']
item['site_name'] = 'jabong'
item['site_domain'] = 'http://www.jabong.com'
name = response.xpath('.//span[contains(@id, "qa-title-product")]/span/text()').extract()
item['name'] = name[0] if name else ''
brand = response.xpath('.//span[contains(@id, "qa-prd-brand")]/text()').extract()
item['brand'] = brand[0] if brand else ''
desc1 = response.xpath('.//div[contains(@id, "productInfo")]/p/text()').extract()
desc2 = response.xpath('.//div[contains(@id, "productInfo")]/p/span/text()').extract()
item['desc'] = desc1[0] if desc1 else desc2[0] if desc2 else ''
sku = response.xpath('//*[@id="qa-sku"]/text()').extract()
item['sku'] = sku[0] if sku else ''
item['age'] = 'adult'
gender = response.xpath('.//a[contains(@id, "qa-breadcrumb2")]/span/text()').extract()
item['gender'] = gender[0] if gender else ''
category = response.xpath('.//a[contains(@id, "qa-breadcrumb3")]/span/text()').extract()
item['category'] = category[0] if category else ''
sub_category = response.xpath('.//a[contains(@id, "qa-breadcrumb4")]/span/text()').extract()
item['sub_category'] = sub_category[0] if sub_category else ''
size = response.xpath('.//ul[contains(@id, "listProductSizes")]/li/text()').extract()
item['size'] = sizes
if size:
for s in size:
sizes.append(s.strip())
item['size'] = sizes
material = response.xpath('//*[@id="productInfo"]/table/tr/td[contains(text(),"Fabric Details")]/../td[2]/text()').extract()
if material:
item['material'] = material[0]
else:
material = response.xpath('//*[@id="productInfo"]/table/tr/td[contains(text(),"Fabric")]/../td[2]/text()').extract()
item['material'] = material[0] if material else ''
pattern = response.xpath('//*[@id="productInfo"]/table/tr/td[contains(text(),"Pattern")]/../td[2]/text()').extract()
item['pattern'] = pattern[0] if pattern else ''
color = response.xpath('//*[@id="productInfo"]/table/tr/td[contains(text(),"Color")]/../td[2]/text()').extract()
item['colors'] = color if color else ''
style = response.xpath('//*[@id="productInfo"]/table/tr/td[contains(text(),"Style")]/../td[2]/text()').extract()
item['style'] = style[0] if style else ''
images = response.xpath('.//div[contains(@class, "thumb-slider pos-abs")]/span/@data-image-big').extract()
item['images'] = images if images else ''
price1 = response.xpath('.//span[contains(@id, "before_price")]/span[2]/text()').extract()
item['price'] = {}
item['price']['mrp'] = price1[0].strip() if price1 else ''
item['price']['discount'] = ''
item['price']['sale'] = ''
care_tips = response.xpath('//*[@id="productInfo"]/table/tr/td[contains(text(),"Wash Care")]/../td[2]/text()').extract()
item['care_tips'] = care_tips[0] if care_tips else ''
item['url'] = response.url
item['tags'] = ''
yield item
如果你在郵遞員身上測試郵件以及必要的蜘蛛,它會給出只有該頁面的結果 –
根據我的本地測試,如上所述,URL將被重定向。對於郵遞員,我不熟悉它,你可以忽略我的答案,並等待其他答案。 –