2013-06-12 40 views
0

我試圖從下面提到的 URL 中提取某些特定的字符串:如何從 URL 提取?

樣品網址:

http://www.ladyblush.com/buy-sarees-online.html?p=1 
http://www.ladyblush.com/buy-ladies-suits-online.html?p=1 
http://www.ladyblush.com/buy-women-fashion-accessories.html?p=1 

我想提取:

productCategory = "sarees" productSubCategory = "" 
productCategory = "ladies" productSubCategory = "suits" 
productCategory = "women" productSubCategory = "fashion-accessories" 

而且等等。其實我正在寫一個蜘蛛,我需要從上面提到的URL提取productCategory和productSubCategory ..所以我試圖從response.url解析方法內提取這些字段。有人可以幫我請

我的代碼:

import re 
from scrapy.http import Request 
from eScraper.items import EscraperItem 
from scrapy.selector import HtmlXPathSelector 
from scrapy.contrib.spiders import CrawlSpider 

#------------------------------------------------------------------------------ 

class ESpider(CrawlSpider):
    """Crawl ladyblush.com category listing pages and scrape product items.

    NOTE(review): this spider defines no ``rules``, so subclassing
    ``CrawlSpider`` adds nothing over the plain base spider.  The Scrapy docs
    forbid overriding ``parse()`` on a CrawlSpider *when rules are used*; it
    works here only because there are none.
    """

    name = "ladyblushSpider"
    allowed_domains = ["ladyblush.com"]

    # Category landing pages; each one is paginated via the ``p`` query arg.
    _CATEGORY_PAGES = [
        'http://www.ladyblush.com/buy-sarees-online.html',
        'http://www.ladyblush.com/buy-ladies-suits-online.html',
        'http://www.ladyblush.com/buy-women-fashion-accessories.html',
        'http://www.ladyblush.com/buy-nightwear-lingerie-online.html',
        'http://www.ladyblush.com/buy-women-dress-online-skirts-suits-kurtis-tops.html',
        'http://www.ladyblush.com/buy-decor-online-wallclock-bedsheets-cushions-bedcovers.html',
        'http://www.ladyblush.com/buy-cosmetics-online-massage-oils-aromatherapy-perfumes-soaps.html',
        'http://www.ladyblush.com/buy-jewelery-online-art-fashion-semi-precious-antique-junk-jewellery.html',
    ]

    # Pages 1..99 of every category, in the same order as before:
    # all categories for page 1, then all categories for page 2, ...
    URLSList = []
    for n in range(1, 100):
        for base in _CATEGORY_PAGES:
            URLSList.append(base + '?p=' + str(n))

    start_urls = URLSList

    @staticmethod
    def _split_categories(url):
        """Return ``(productCategory, productSubCategory)`` parsed from *url*.

        The listing slugs are shaped ``buy-<category>[-<sub-parts>][-online]``,
        e.g.::

            .../buy-sarees-online.html?p=1              -> ('sarees', '')
            .../buy-ladies-suits-online.html?p=1        -> ('ladies', 'suits')
            .../buy-women-fashion-accessories.html?p=1  -> ('women', 'fashion-accessories')
        """
        # Last path segment, without the query string or '.html' suffix.
        slug = url.split('/')[-1].split('?', 1)[0]
        if slug.endswith('.html'):
            slug = slug[:-len('.html')]
        tokens = slug.split('-')
        # Strip the leading 'buy' and trailing 'online' filler tokens.
        if tokens and tokens[0] == 'buy':
            tokens = tokens[1:]
        if tokens and tokens[-1] == 'online':
            tokens = tokens[:-1]
        if not tokens:
            return '', ''
        return tokens[0], '-'.join(tokens[1:])

    def parse(self, response):
        """Yield one product-detail Request per product tile on a listing page.

        The partially-filled item travels to :meth:`parsePage2` via
        ``request.meta['item']``.
        """
        hxs = HtmlXPathSelector(response)
        # The question's actual goal: derive the categories from the URL
        # instead of leaving the placeholder empty strings.
        category, sub_category = self._split_categories(response.url)

        for site in hxs.select('//div[@class="third thumbnailSpillLarge"]'):
            item = EscraperItem()

            item['currency'] = 'INR'
            item['productCategory'] = [category]
            item['productSubCategory'] = [sub_category]
            item['productSite'] = ["http://ladyblush.com/"]
            item['productImage'] = site.select('./a/div/img/@src').extract()
            item['productTitle'] = site.select('./a/div/img/@title').extract()
            item['productURL'] = [site.select('./a/@href').extract()[0].replace(" ", "%20")]

            productMRP = site.select('.//div[@class="salePrice"]//div[@class="price-box"]//p[@class="old-price"]//span[@class="price"]/text()').extract()
            productPrice = site.select('.//div[@class="salePrice"]//div[@class="price-box"]//p[@class="special-price"]//span[@class="price"]/text()').extract()
            if productMRP and productPrice:
                # [1] skips a duplicate price node the page renders first.
                # NOTE(review): assumes both lists have >= 2 entries -- confirm
                # against the live markup.
                price = [productMRP[1].strip()] + [productPrice[1].strip()]
            else:
                # No sale price: fall back to the regular-price node.
                price = site.select('.//div[@class="salePrice"]//div[@class="price-box"]//span[@class="regular-price"]//span[@class="price"]/text()').extract()
            item['productPrice'] = price

            request = Request(item['productURL'][0], callback=self.parsePage2)
            request.meta['item'] = item
            yield request

    def parsePage2(self, response):
        """Fill in availability, variant flag, description and extra images
        from the product detail page, then return the completed item."""
        hxs = HtmlXPathSelector(response)
        item = response.meta['item']

        # An "addtocart-container" message div is only present when the
        # product cannot be bought (e.g. out of stock).
        if hxs.select('//div[@class="addtocart-container"]/div/text()').extract():
            item['availability'] = False
        else:
            item['availability'] = True

        # Required option labels (size/colour pickers) imply variants.
        if hxs.select('//label[@class="required"]/text()').extract():
            item['hasVariants'] = True
        else:
            item['hasVariants'] = False

        item['image_urls'] = list(set(item['productImage']))
        item['productDesc'] = [" ".join([re.sub(r'[\t\n\r]', "", i.strip())
                                         for i in hxs.select('//div[@class="std"]/text()').extract()])]
        item['productImage'] = (item['productImage']
                                + hxs.select('//div[@class="more-views"]/ul/li/a/img/@src').extract()
                                + hxs.select('//div[@class="more-views"]/ul/li/a/@href').extract())

        return item

#------------------------------------------------------------------------------ 
+0

使用urllib2和beautifulsoup進行解析。 scrapy也在那裏。 – rajpy

+0

發佈您的代碼到目前爲止 – njzk2

+0

將我的代碼添加到問題。我想在解析方法中提取上述細節。 –

回答

1

可以在 parse 方法中從 response.url 得到當前 URL,然後解析它,只取出 URL 的路徑部分:

import os

test = 'buy-women-fashion-accessories.html?p=1'
# Strip the query string first: splitext alone would fold '?p=1' into the
# extension ('.html?p=1'), and would split in the wrong place entirely if
# the query ever contained a dot.
path = test.split('?', 1)[0]
parts = os.path.splitext(path)
# ('buy-women-fashion-accessories', '.html')
parts[0].split('-')[1:]
# ['women', 'fashion', 'accessories']

雖然這是相當薄弱的解決方案。你確定數據沒有存儲在你正在解析的頁面的html中的某處,而不是查看url嗎?

+0

我已經檢查過,據我所知這是本網站的最佳方式。 –