2016-01-22 89 views
-1

我想抓取配方網站:我的recipes.com爲了提取配方的細節,我將存儲在android應用程序中的sqlite數據庫。除了配方持續時間,我能夠提取所有配方詳細信息。這裏的問題並不是所有的食譜都有相同的格式;有些包含烹飪時間和準備時間,有些包含總時間,有些則不包含。以下是我用來刮取網站的代碼以及我針對持續時間的html代碼。Python - Scrapy爬行myrecipes.com問題

我試着運行代碼,但輸出不會註冊。我懷疑這個問題出在if-else語句中,爲了解釋不同的配方格式。任何幫助,將不勝感激。

from scrapy.selector import Selector 
from scrapy.spiders import CrawlSpider, Rule 

from myrecipes.items import MyrecipesRecipe, Ingredient, Nutrients 

class MyrecipesSpider(CrawlSpider): 
    name = "myrecipes" # name of the spider to be used when crawling 
    allowed_domains = ["myrecipes.com"] # where the spider is allowed to go 
    start_urls = ["http://www.myrecipes.com/recipe/indian-chickpea-vegetable-stew"] 

def parse(self, response): 
    sel = Selector(response) # the selector 
    recipe = MyrecipesRecipe() 

    # Name 
    recipe['name'] = sel.xpath("substring-before(//title/text(),' Recipe')").extract() 

    # Cuisine 
    recipe['cuisine'] = "Indian" 

    # Ingredients 
    ingredients = [] 
    ingredient_nodes = sel.xpath('//*[@class = "panel-pane pane-entity-field pane-node-field-ingredients"]/div/div') 

    for ingredient_node in ingredient_nodes: 
     try: 
      name = ingredient_node.xpath('//div[@class = "field-ingredients"]/div/div/span[@itemprop = "name"]/text()').extract() 
      quantity = ingredient_node.xpath('//div[@class = "field-ingredients"]/div/div/span[@itemprop = "amount"]/text()').extract() 
     except: 
      continue 

     ingredient = Ingredient() 
     ingredient['name'] = name 
     ingredient['quantity'] = quantity 
     ingredients.append(ingredient) 

    recipe['ingredients'] = ingredients 

    # Directions 
    instructions = [] 
    instruction_nodes = sel.xpath('//div[@itemprop = "instructions"]/div[@class = "field-instructions"]/div/div[@class = "field-item even"]') 

    for instruction_node in instruction_nodes: 
     try: 
      instruction_step = instruction_node.xpath('//div[@itemprop = "instructions"]/div[@class = "field-instructions"]/div/div[@class = "field-item even"]/*/text()').extract() 
     except: 
      continue 
     instructions.append(instruction_step) 

    recipe['instructions'] = instructions 

    # Nutritional Info 
    nutrients = [] 
    nutrient_nodes = sel.xpath('//div[@class = "panel-pane pane-entity-field pane-node-field-nutrition-data"]/div/div[@itemprop = "nutrition"]') 

    for nutrient_node in nutrient_nodes: 
     try: 
      name = nutrient_node.xpath('//div[@class = "field-nutrition-data"]/div[contains (@class, "field-collection-view clearfix view-mode-recipe-nutrition")]/div/text()').extract() 
      quantity = nutrient_node.xpath('//div[@class = "field-nutrition-data"]/div[contains(@class, "field-collection-view clearfix view-mode-recipe-nutrition")]/div/span/text()').extract() 
     except: 
      continue 

     nutrient = Nutrients() 
     nutrient['name'] = name 
     nutrient['quantity'] = quantity 
     nutrients.append(nutrient) 
    nutrient_name = [] 
    x = nutrients[0].get('name') 
    for i in x: 
     if i != "\n": 
      nutrient_name.append(i) 
    nutrients[0]['name'] = nutrient_name 

    recipe['nutrients'] = nutrients 

    # Recipe time 
    duration_nodes = sel.xpath('//div[@class = "panel-pane pane-entity-field pane-node-field-recipe-time recipe-time"]/div[@class = "pane-content"]/div[@class = "field-collection-container clearfix"]') 

    for duration_node in duration_nodes: 
     try: 
      path = duration_node.xpath('//div[@class = "panel-pane pane-entity-field pane-node-field-recipe-time recipe-time"]/div[@class = "pane-content"]/div/div[@class = "field-recipe-time"]/div/div/span[1]/text()').extract() 
      if path == 'Prep: ': 
       recipe['prep_time'] = duration_node.xpath('//div[@class = "field-recipe-time"]/div/div/span[2]/text()').extract() 
      elif path == 'Cook: ': 
       recipe['cook_time'] = duration_node.xpath('//div[@class = "field-recipe-time"]/div/div/span[2]/text()').extract() 
      elif path == 'Total: ': 
       recipe['total_time'] = duration_node.xpath('//div[@class = "field-recipe-time"]/div/div/span[2]/text()').extract() 
     except: 
      continue 


    # Number of Servings 
    recipe['servings'] = sel.xpath("substring-after(//div[@class = 'panel-pane pane-entity-field pane-node-field-yield']/div[@class = 'pane-content']/div[@itemprop = 'yield']/div[@class = 'field-yield']/text(), ': ')").extract() 

    return recipe 

HTML片段:

<div class="panel-pane pane-entity-field pane-node-field-recipe-time recipe-time"> 
    <h2 class="pane-title">Recipe Time</h2> 

    <div class="pane-content"> 
     <div class="field-collection-container clearfix"> 
     <div class="field-recipe-time"> 
     <div class="field-collection-view clearfix view-mode-recipe-time"> 
     <div class="recipe-time-info"> 
      <span class="recipe-time-text">Prep: </span> 
      <span class="recipe-time-duration">25 Minutes</span> 
     </div> 
     </div> </div> 
      <div class="field-recipe-time"> 
      <div class="field-collection-view clearfix view-mode-recipe-time field-collection-view-final"> 
     <div class="recipe-time-info"> 
      <span class="recipe-time-text">Cook: </span> 
      <span class="recipe-time-duration">45 Minutes</span> 
     </div> 
     </div> </div> 
     </div> </div> 


      </div> 
+1

'path'是一個列表,你將它與一個字符串進行比較,'.extract()'返回一個列表。 – eLRuLL

回答

-1

問題是與您的XPath建設。更多情況下,當有很多html元素時,缺少任何元素的可能性會增加。如果你發現構建相對xpath很困難,我建議你使用瀏覽器的xpath選擇器。您可以右鍵單擊元素並選擇相關的xpath。試試這個:

duration_nodes = sel.xpath('//*[@id="block-system-main"]/div/div[4]/div[1]/div/div[3]/div/div/div') 
    for duration_node in duration_nodes: 
     try: 
      path = ''.join(duration_node.xpath('div/div/span[1]/text()').extract()) 
      if path == 'Prep: ': 
       recipe['prep_time'] = duration_node.xpath('div/div/span[2]/text()').extract() 
      elif path == 'Cook: ': 
       recipe['cook_time'] = duration_node.xpath('div/div/span[2]/text()').extract() 
      elif path == 'Total: ': 
       recipe['total_time'] = duration_node.xpath('div/div/span[2]/text()').extract() 
     except: 
      continue