
Scrapy crawler passing multiple item classes to the pipeline

Hello, I'm very new to Python and Scrapy. This is my first piece of code, and I can't solve what looks like a very basic problem.

I have a crawler set up to do two things: 1- find all the pagination URLs, visit them, and get some data from each page; 2- get all the links listed on the results pages, follow them, and scrape the data for each location.

I'm using rules with callbacks to decide how each item is parsed, and I created one item class inside items.py for each parser.

The second rule is processed perfectly, but the first one is not, and I can't find where the error is.

This is what I get in the terminal when running the crawler:

2014-11-24 02:30:39-0200 [apontador] ERROR: Error processing {'city': u'BR-SP-S\xe3o Paulo', 
    'coordinates': {'lat': u'-23.56588', 'lng': u'-46.64777'}, 
    'current_url': 'http://www.apontador.com.br/local/search.html?q=supermercado&loc_z=S%C3%A3o+Paulo%2C+SP&loc=S%C3%A3o+Paulo%2C+SP&loc_y=S%C3%A3o+Paulo%2C+SP', 
    'datetime': datetime.datetime(2014, 11, 24, 2, 30, 39, 703972), 
    'depth': 0, 
    'domain': 'apontador.com.br', 
    'link_cat': 'ls', 
    'loc_cat': u'supermercado', 
    'session_id': -1, 
    'site_name': u'Apontador', 
    'state': u'BR-SP'} 
    Traceback (most recent call last): 
     File "/usr/local/lib/python2.7/dist-packages/scrapy/middleware.py", line 62, in _process_chain 
     return process_chain(self.methods[methodname], obj, *args) 
     File "/usr/local/lib/python2.7/dist-packages/scrapy/utils/defer.py", line 65, in process_chain 
     d.callback(input) 
     File "/usr/lib/python2.7/dist-packages/twisted/internet/defer.py", line 382, in callback 
     self._startRunCallbacks(result) 
     File "/usr/lib/python2.7/dist-packages/twisted/internet/defer.py", line 490, in _startRunCallbacks 
     self._runCallbacks() 
    --- <exception caught here> --- 
     File "/usr/lib/python2.7/dist-packages/twisted/internet/defer.py", line 577, in _runCallbacks 
     current.result = callback(current.result, *args, **kw) 
     File "/locman/scrapy/locman/pipelines.py", line 37, in process_item 
     'neighborhood': item['neighborhood'], 
    File "/usr/local/lib/python2.7/dist-packages/scrapy/item.py", line 50, in __getitem__ 
     return self._values[key] 
    exceptions.KeyError: 'neighborhood' 

Looking at the error message, it seems clear that Scrapy is trying to process all the items defined in items.py, without respecting the item class used by each callback.

If you look at the items.py file, there are two classes: 1- apontadorlsItem, 2- apontadordsItem.

The apontadordsItem class has the key 'neighborhood', but the apontadorlsItem class does not. I created the two classes to support two different callback parser functions, depending on the XPath rules, because there are two types of pages being crawled with different sets of information. The rules work fine, and as I can see in the log file the crawler is working; the problem is in processing/saving the items!

How can I declare in the pipeline different item-matching rules depending on which items.py class the crawler used?

Please help, I'm stuck.

Spider file - spiders/apontador.py

import scrapy 
from scrapy.contrib.spiders import CrawlSpider, Rule 
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor 
from scrapy.selector import HtmlXPathSelector 
from scrapy.selector import Selector 
from datetime import datetime 
from tld import get_tld 
from locman.items import apontadorlsItem 
from locman.items import apontadordsItem 

class apontador(CrawlSpider): 
    name = 'apontador' 
    session_id = -1 
    start_urls = ["http://www.apontador.com.br/local/search.html?q=supermercado&loc_z=S%C3%A3o+Paulo%2C+SP&loc=S%C3%A3o+Paulo%2C+SP&loc_y=S%C3%A3o+Paulo%2C+SP"] 
    rules = (
      # Rule for LS - Link source - Search results page 
      Rule(SgmlLinkExtractor(allow=("",),restrict_xpaths=("//nav[@class='pagination']")), callback='parse_items_ls', follow= True), 

      # Rule for DS - Data Source - Location data page 
      Rule(SgmlLinkExtractor(allow=("",),restrict_xpaths=(
       "//article[@class='poi card highlight']", 
       "//li[@class='similar-place sponsored']", 
       "//div[@class='recomendations']", 
       "//ul[@class='similar-places-list']", 
       "//article[@class='poi card']")), 
       callback='parse_items_ds', 
       follow= True), 
    ) 

    def __init__(self, session_id=-1, *args, **kwargs): 
     super(apontador, self).__init__(*args, **kwargs) 
     self.session_id = session_id 

    def parse_start_url(self, response): 
     self.response_url = response.url 
     return self.parse_items_ls(response) 

    # Callback item type LS 
    def parse_items_ls(self, response): 
     self.response_url = response.url 
     sel = Selector(response) 
     items_ls = [] 
     item_ls = apontadorlsItem() 
     item_ls["session_id"] = self.session_id 
     item_ls["depth"] = response.meta["depth"] 
     item_ls["current_url"] = response.url 

    # Get site name in metadata 
     meta_site = sel.xpath("//meta[@property='og:site_name']/@content").extract() 
     item_ls["site_name"] = u''.join(meta_site) 

    # Get latitude and longitude in metadata 
     meta_latitude = sel.xpath("//meta[@name='apontador:latitude']/@content").extract() 
     latitude = ''.join(meta_latitude) 

     meta_longitude = sel.xpath("//meta[@name='apontador:longitude']/@content").extract() 
     longitude = ''.join(meta_longitude) 

    # Convert the coordinates to an array 
     coordinates = {"lng": longitude , "lat": latitude} 
     item_ls["coordinates"] = coordinates 

    # This items gets the strings directly from meta data keywords and creates a list 
     meta_keywords_ls = sel.xpath("//meta[@name='keywords']/@content").extract() 
     meta_keywords_ls_str = u''.join(meta_keywords_ls) 
     meta_keywords_ls_list = meta_keywords_ls_str.split(", ") 
     meta_state = meta_keywords_ls_list[6] 
     meta_city = meta_keywords_ls_list[5] 
     meta_loc_cat = meta_keywords_ls_list[4] 

     item_ls["state"] = u"BR-" + meta_state 
     item_ls["city"] = u"BR-" + meta_state + "-" + meta_city 
     item_ls["loc_cat"] = meta_loc_cat 

    # This items gets the domain name using the TLD module 
     domain = get_tld(response.url) 
     item_ls["domain"] = domain 

    # This items gets datetime 
     item_ls["datetime"] = datetime.now() 

    # This items defines de link category   
     item_ls["link_cat"] = "ls" 
     yield item_ls 


    # Callback item type DS 
    def parse_items_ds(self, response): 
     self.response_url = response.url 
     sel = Selector(response) 
     items_ds = [] 
     item_ds = apontadordsItem() 
     item_ds["session_id"] = self.session_id 
     item_ds["depth"] = response.meta["depth"] 
     item_ds["current_url"] = response.url 

    # Get site name in metadata 
     meta_site = sel.xpath("//meta[@property='og:site_name']/@content").extract() 
     item_ds["site_name"] = u''.join(meta_site) 

    # Get location name in metadata 
     meta_loc_name = sel.xpath("//meta[@property='og:title']/@content").extract() 
     item_ds["loc_name"] = u''.join(meta_loc_name) 

    # Get location source id in metadata 
     meta_loc_source_id = sel.xpath("//meta[@name='apontador:place-id']/@content").extract() 
     item_ds["loc_source_id"] = ''.join(meta_loc_source_id) 

    # Get location street address in metadata 
     meta_loc_address = sel.xpath("//meta[@property='business:contact_data:street_address']/@content").extract() 
     meta_loc_address_str = u''.join(meta_loc_address) 
     meta_loc_address_list = meta_loc_address_str.split(", ") 
     meta_loc_address_number = meta_loc_address_list[1] 
     meta_loc_address_street = meta_loc_address_list[0] 
     item_ds["loc_street"] = meta_loc_address_street 
     item_ds["loc_number"] = meta_loc_address_number 

    # Get latitude and longitude in metadata 
     meta_latitude = sel.xpath("//meta[@property='place:location:latitude']/@content").extract() 
     latitude = ''.join(meta_latitude) 

     meta_longitude = sel.xpath("//meta[@property='place:location:longitude']/@content").extract() 
     longitude = ''.join(meta_longitude) 

     coordinates = {"lng": longitude , "lat": latitude} 
     item_ds["coordinates"] = coordinates 

    # This items gets the neighborhood, loc_cat, loc_sub_categoryfrom meta data keywords, creates a list and populates the fields from the list 
     meta_keywords_ds = sel.xpath("//meta[@name='keywords']/@content").extract() 
     meta_keywords_ds_str = u''.join(meta_keywords_ds) 
     meta_keywords_ds_list = meta_keywords_ds_str.split(", ") 
     meta_loc_cat = meta_keywords_ds_list[9] 
     meta_loc_cat_sub = meta_keywords_ds_list[8] 
     meta_neighborhood = meta_keywords_ds_list[5] 

     item_ds["loc_cat"] = meta_loc_cat 
     item_ds["loc_cat_sub"] = meta_loc_cat_sub 
     item_ds["neighborhood"] = meta_neighborhood 

    # Region informations 
     meta_statec = sel.xpath("//meta[@property='business:contact_data:region']/@content").extract() 
     meta_state = u''.join(meta_statec) 
     item_ds["state"] = u"BR-" + meta_state 

     meta_cityc = sel.xpath("//meta[@property='business:contact_data:locality']/@content").extract() 
     meta_city = u''.join(meta_cityc) 
     item_ds["city"] = u"BR-" + meta_state + "-" + meta_city 

     meta_postal_code = sel.xpath("//meta[@property='business:contact_data:postal_code']/@content").extract() 
     item_ds["loc_postal_code"] = ''.join(meta_postal_code) 

    # This items gets the domain name using the TLD module 
     domain = get_tld(response.url) 
     item_ds["domain"] = domain 

    # This items gets datetime as an i 
     item_ds["datetime"] = datetime.now() 

     item_ds["link_cat"] = "ds" 
     yield item_ds 

Items file - items.py

from scrapy.item import Item, Field 

class apontadorlsItem(Item): 
    datetime = Field() 
    session_id = Field() 
    depth = Field() 
    link_cat = Field() 
    site_name = Field() 
    domain = Field() 
    current_url = Field() 
    city = Field() 
    state = Field() 
    loc_cat = Field() 
    coordinates = Field() 

class apontadordsItem(Item): 
    datetime = Field() 
    session_id = Field() 
    depth = Field() 
    link_cat = Field() 
    site_name = Field() 
    domain = Field() 
    current_url = Field() 
    state = Field() 
    city = Field() 
    neighborhood = Field() 
    loc_name = Field() 
    loc_street = Field() 
    loc_number = Field() 
    loc_postal_code = Field() 
    loc_source_id = Field() 
    loc_cat = Field() 
    loc_cat_sub = Field() 
    coordinates = Field() 

Pipelines file - pipelines.py

from scrapy.exceptions import DropItem 
from scrapy_mongodb import MongoDBPipeline 

class apontadorpipe(MongoDBPipeline): 

    def process_item(self, item, spider): 
     if self.config['buffer']: 
      self.current_item += 1 
      item = dict(item) 

      self.item_buffer.append(item) 

      if self.current_item == self.config['buffer']: 
       self.current_item = 0 
       return self.insert_item(self.item_buffer, spider) 
      else: 
       return item 

     matching_item = self.collection.find_one(
      {'datetime': item['datetime'], 
      'session_id': item['session_id'], 
      'depth': item['depth'], 
      'link_cat': item['link_cat'], 
      'site_name': item['site_name'], 
      'domain': item['domain'], 
      'current_url': item['current_url'], 
      'state': item['state'], 
      'city': item['city'], 
      'neighborhood': item['neighborhood'], 
      'loc_name': item['loc_name'], 
      'loc_street': item['loc_street'], 
      'loc_number': item['loc_number'], 
      'loc_postal_code': item['loc_postal_code'], 
      'loc_cat': item['loc_cat'], 
      'loc_cat_sub': item['loc_cat_sub'], 
      'loc_source_id': item['loc_source_id'], 
      'coordinates': item['coordinates']} 
     ) 

     if matching_item is not None: 
      raise DropItem(
       "Duplicate found for %s, %s" % 
       item['current_url'] 
      ) 
     else: 
      return self.insert_item(item, spider) 

Settings file - settings.py

BOT_NAME = 'locman' 

SPIDER_MODULES = 'locman.spiders' 
NEWSPIDER_MODULE = 'locman.spiders' 
DEPTH_LIMIT = 10000 

DEFAULT_ITEM_CLASS = 'locman.items.apontador' 

ITEM_PIPELINES = { 
    'locman.pipelines.apontadorpipe': 100 
} 

# 'scrapy_mongodb.MongoDBPipeline' connection 
MONGODB_URI = 'connection string' 
MONGODB_DATABASE = '' 
MONGODB_COLLECTION = '' 

DOWNLOADER_MIDDLEWARES = { 
     'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware' : None, 
     'locman.ua.rotate_useragent.RotateUserAgentMiddleware' :400 
    } 

Do you want to store the items coming from the 'parse_items_ls()' callback? – alecxe 2014-11-24 06:14:49


Yes, I want to store the items from parse_items_ls() using the apontadorlsItem class from items.py. – 2014-11-24 18:58:48


parse_items_ds() is storing its data in MongoDB, but I'm not sure whether it is using items.py – 2014-11-24 19:00:16

Answers


It looks like that item doesn't have the key 'neighborhood'. Make sure of the following:

  1. You haven't misspelled 'neighborhood'
  2. 'neighborhood' is defined in the item class
  3. item['neighborhood'] is initialized in the spider

Make sure the item has the key 'neighborhood' (from the apontadordsItem class) in the file "/locman/scrapy/locman/pipelines.py", line 37, in process_item:

if item.get('neighborhood', None): 

It will return None if the item doesn't have the key 'neighborhood'. You can also set a default value instead of None, like this:

if item.get('neighborhood', 'default_value'):
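
One way to apply that idea is to build the duplicate-check query only from the keys each item actually defines. A minimal sketch of process_item along those lines (assuming the scrapy_mongodb MongoDBPipeline base class, its collection and insert_item() helpers, and the item fields from the question):

from scrapy.exceptions import DropItem
from scrapy_mongodb import MongoDBPipeline

class apontadorpipe(MongoDBPipeline):

    def process_item(self, item, spider):
        # Build the duplicate-check query only from keys set on this item,
        # so apontadorlsItem items (which have no 'neighborhood' field)
        # no longer raise KeyError
        query = {key: item.get(key) for key in item.keys()}
        matching_item = self.collection.find_one(query)

        if matching_item is not None:
            raise DropItem("Duplicate found for %s" % item['current_url'])
        return self.insert_item(item, spider)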

Hi Tasawer, I checked all of the suggestions and didn't find any of those problems! The spider is getting the data; I think the issue is in writing to MongoDB. – 2014-11-25 17:39:18


You probably have more than one item type, so some items may not have a value for 'neighborhood'. Add a check before reading item['neighborhood']. I have improved my answer, take a look.. thanks – 2014-11-26 05:31:25


Hey.. thanks for the help/tips, I'll try it and let you know the result. – 2014-11-26 07:17:14


Thanks a lot for the help! I found a good workaround for my problem, and it's exactly what I needed!

In pipelines.py I imported the two classes from items.py and defined two different functions, one per item class, each with its own dictionary. That way each item class can have its own duplicate-record handling and its own database write process!

New code for pipelines.py:

from scrapy.exceptions import DropItem 
from scrapy_mongodb import MongoDBPipeline 

from locman.items import apontadorlsItem 
from locman.items import apontadordsItem 

class apontadorpipe(MongoDBPipeline): 

    # Handler for DS (location data page) items
    def process_item_ds(self, item, spider):
        if self.config['buffer']:
            self.current_item += 1
            item = dict(item)

            self.item_buffer.append(item)

            if self.current_item == self.config['buffer']:
                self.current_item = 0
                return self.insert_item(self.item_buffer, spider)
            else:
                return item

        # Duplicate check using the fields defined on apontadordsItem
        if isinstance(item, apontadordsItem):
            matching_item = self.collection.find_one(
                {'datetime': item['datetime'],
                 'session_id': item['session_id'],
                 'link_cat': item['link_cat'],
                 'site_name': item['site_name'].encode('utf-8'),
                 'domain': item['domain'],
                 'current_url': item['current_url'],
                 'state': item['state'],
                 'city': item['city'].encode('utf-8'),
                 'neighborhood': item['neighborhood'].encode('utf-8'),
                 'loc_name': item['loc_name'].encode('utf-8'),
                 'loc_street': item['loc_street'].encode('utf-8'),
                 'loc_number': item['loc_number'],
                 'loc_postal_code': item['loc_postal_code'],
                 'loc_cat': item['loc_cat'],
                 'loc_cat_sub': item['loc_cat_sub'],
                 'loc_source_id': item['loc_source_id'],
                 'loc_phone': item['loc_phone'],
                 'address': item['address'].encode('utf-8'),
                 'coordinates': item['coordinates']}
            )

            if matching_item is not None:
                raise DropItem(
                    "Duplicate found for %s, %s" %
                    (item['current_url'], item['loc_source_id'])
                )
            else:
                return self.insert_item(item, spider)

    # Handler for LS (search results page) items
    def process_item_ls(self, item, spider):
        if self.config['buffer']:
            self.current_item += 1
            item = dict(item)

            self.item_buffer.append(item)

            if self.current_item == self.config['buffer']:
                self.current_item = 0
                return self.insert_item(self.item_buffer, spider)
            else:
                return item

        # Duplicate check using the fields defined on apontadorlsItem
        if isinstance(item, apontadorlsItem):
            matching_item = self.collection.find_one(
                {'datetime': item['datetime'],
                 'session_id': item['session_id'],
                 'link_cat': item['link_cat'],
                 'site_name': item['site_name'].encode('utf-8'),
                 'domain': item['domain'],
                 'current_url': item['current_url'],
                 'state': item['state'],
                 'city': item['city'].encode('utf-8'),
                 'loc_cat': item['loc_cat'].encode('utf-8'),
                 'coordinates': item['coordinates']}
            )

            if matching_item is not None:
                raise DropItem(
                    "Duplicate found for %s" % item['current_url']
                )
            else:
                return self.insert_item(item, spider)
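
One caveat with the code above: Scrapy only ever calls a pipeline's process_item() for each scraped item, so process_item_ds() and process_item_ls() will not run unless something dispatches to them. A minimal sketch of such a dispatcher, assuming the class and method names above, could be added to apontadorpipe:

    def process_item(self, item, spider):
        # Route each item to the handler that matches its item class
        if isinstance(item, apontadordsItem):
            return self.process_item_ds(item, spider)
        if isinstance(item, apontadorlsItem):
            return self.process_item_ls(item, spider)
        return item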