
Debugging a traceback in Python/Scrapy

I need help with a traceback error. I have a Scrapy spider that is clearly hitting some kind of processing error. It is meant to scrape a website and store the URL, date, and title of each article in MongoDB, and I'm not sure where to start debugging. Here is the error:

2016-02-19 18:34:47 [scrapy] DEBUG: Crawled (200) <GET http://www.stuff.co.nz/business/farming/77040713/farmer-thomas-king-loses-supreme-court-case-over-farm-eviction> (referer: http://www.stuff.co.nz/business/) 
2016-02-19 18:34:47 [scrapy] ERROR: Error processing {'date': 'Thu Feb 18 19:49:51 UTC 2016', 
'title': 'Farmer loses eviction case', 
'url': 'http://www.stuff.co.nz/business/farming/77040713/farmer-thomas-king-loses-supreme-court-case-over-farm-eviction'} 
Traceback (most recent call last): 
    File "/usr/lib64/python2.7/site-packages/twisted/internet/defer.py", line 588, in _runCallbacks 
    current.result = callback(current.result, *args, **kw) 
    File "/usr/lib/python2.7/site-packages/scrapy_mongodb.py", line 222, in process_item 
    return self.insert_item(item, spider) 
    File "/usr/lib/python2.7/site-packages/scrapy_mongodb.py", line 251, in insert_item 
    self.collection.insert(item, continue_on_error=True) 
    File "/usr/lib64/python2.7/site-packages/pymongo/collection.py", line 1926, in insert 
    check_keys, manipulate, write_concern) 
    File "/usr/lib64/python2.7/site-packages/pymongo/collection.py", line 430, in _insert 
    gen(), check_keys, self.codec_options, sock_info) 
    File "/usr/lib64/python2.7/site-packages/pymongo/pool.py", line 254, in write_command 
    helpers._check_command_response(result) 
    File "/usr/lib64/python2.7/site-packages/pymongo/helpers.py", line 188, in _check_command_response 
    raise OperationFailure(msg % errmsg, code, response) 
OperationFailure: not authorized on article to execute command { insert: "stuffconz", ordered: false, writeConcern: { fsync: false }, documents: [ { _id: ObjectId('56c6a97702f22371605f4668'), url: "http://www.stuff.co.nz/business/farming/77040713/farmer-thomas-king-loses-supreme-court-case-over-farm-eviction", date: "Thu Feb 18 19:49:51 UTC 2016", title: "Farmer loses eviction case" } ] } 
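
Before digging into Scrapy itself, it can help to reproduce the failing insert with pymongo alone. This is a minimal sketch, assuming the URI, database, and collection shown in the pipeline configuration quoted further down (they are not confirmed beyond what the config and comments state); if this also raises OperationFailure, the problem is MongoDB authorization rather than anything in the spider:

from pymongo import MongoClient

# Same connection details as the pipeline's config dict below (assumed, not verified).
client = MongoClient('mongodb://harland:Ase0peedi@localhost:27017')
collection = client['article']['stuffconz']

# A throwaway document; the real items carry url/date/title.
collection.insert_one({'url': 'http://example.com/test', 'date': 'test', 'title': 'test'})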

Here is the Scrapy spider:

from __future__ import absolute_import 

from scrapy import Spider 
from scrapy.selector import Selector 
from scrapy.http import Request 
from scrapy.linkextractors import LinkExtractor 
from harland.items import * 
from scrapy.spiders import CrawlSpider, Rule 

class StuffSpider(CrawlSpider):
    name = "stuff"
    allowed_domains = ["stuff.co.nz"]
    start_urls = [
        "http://stuff.co.nz/business/",
    ]

    rules = (
        Rule(LinkExtractor(allow=".*/business.*"), callback='parse_article_page', follow=True),
    )

    def parse_article_page(self, response):

        article = Selector(response)
        page = Selector(response).xpath('/html/head/meta[9]')
        page_type = page.xpath('//meta[@property="og:type"]/@content').extract()[0]

        if "article" in page_type:
            item = StuffItem()

            item_url = page.xpath('//meta[@property="og:url"]/@content').extract()[0]
            item['url'] = str(item_url)
            item_title = page.xpath('//meta[@property="og:title"]/@content').extract()[0]
            item['title'] = str(item_title)
            item_date = page.xpath('//*[@itemprop="datePublished"]/@content').extract()[0]
            item['date'] = str(item_date)
            yield item
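
As an aside, the meta-tag XPaths above can be checked interactively before involving the pipeline at all. A rough scrapy shell session, using one of the article URLs from the log (output omitted, since it depends on the live page):

scrapy shell 'http://www.stuff.co.nz/business/farming/77040713/farmer-thomas-king-loses-supreme-court-case-over-farm-eviction'
>>> response.xpath('//meta[@property="og:type"]/@content').extract()[0]
>>> response.xpath('//meta[@property="og:title"]/@content').extract()[0]
>>> response.xpath('//*[@itemprop="datePublished"]/@content').extract()[0]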

And here is the pipeline, taken directly from https://github.com/sebdah/scrapy-mongodb/blob/master/scrapy_mongodb.py:

""" 
scrapy-mongodb - MongoDB pipeline for Scrapy 
Homepage: https://github.com/sebdah/scrapy-mongodb 
Author: Sebastian Dahlgren <[email protected]> 
License: Apache License 2.0 <http://www.apache.org/licenses/LICENSE-2.0.html> 
Copyright 2013 Sebastian Dahlgren 
Licensed under the Apache License, Version 2.0 (the "License"); 
you may not use this file except in compliance with the License. 
You may obtain a copy of the License at 
    http://www.apache.org/licenses/LICENSE-2.0 
Unless required by applicable law or agreed to in writing, software 
distributed under the License is distributed on an "AS IS" BASIS, 
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and 
limitations under the License. 
""" 
import datetime 

from pymongo import errors 
from pymongo.mongo_client import MongoClient 
from pymongo.mongo_replica_set_client import MongoReplicaSetClient 
from pymongo.read_preferences import ReadPreference 

from scrapy import log 
from scrapy.contrib.exporter import BaseItemExporter 

VERSION = '0.9.1' 


def not_set(string): 
    """ Check if a string is None or '' 
    :returns: bool - True if the string is empty 
    """ 
    if string is None: 
     return True 
    elif string == '': 
     return True 
    return False 


class MongoDBPipeline(BaseItemExporter): 
    """ MongoDB pipeline class """ 
    # Default options 
    config = { 
     'uri': 'mongodb://harland:Ase0peedi@localhost:27017', 
     'fsync': False, 
     'write_concern': 0, 
     'database': 'article', 
     'collection': 'stuffconz', 
     'replica_set': None, 
     'unique_key': None, 
     'buffer': None, 
     'append_timestamp': True, 
     'stop_on_duplicate': 0, 
    } 

    # Item buffer 
    current_item = 0 
    item_buffer = [] 

    # Duplicate key occurence count 
    duplicate_key_count = 0 

    def load_spider(self, spider): 
     self.crawler = spider.crawler 
     self.settings = spider.settings 

     # Versions prior to 0.25 
     if not hasattr(spider, 'update_settings') and hasattr(spider, 'custom_settings'): 
      self.settings.setdict(spider.custom_settings or {}, priority='project') 

    def open_spider(self, spider): 
     self.load_spider(spider) 

     # Configure the connection 
     self.configure() 

     if self.config['replica_set'] is not None: 
      connection = MongoReplicaSetClient(
       self.config['uri'], 
       replicaSet=self.config['replica_set'], 
       w=self.config['write_concern'], 
       fsync=self.config['fsync'], 
       read_preference=ReadPreference.PRIMARY_PREFERRED) 
     else: 
      # Connecting to a stand alone MongoDB 
      connection = MongoClient(
       self.config['uri'], 
       fsync=self.config['fsync'], 
       read_preference=ReadPreference.PRIMARY) 

     # Set up the collection 
     database = connection[self.config['database']] 
     self.collection = database[self.config['collection']] 
     log.msg(u'Connected to MongoDB {0}, using "{1}/{2}"'.format(
      self.config['uri'], 
      self.config['database'], 
      self.config['collection'])) 

     # Ensure unique index 
     if self.config['unique_key']: 
      self.collection.ensure_index(self.config['unique_key'], unique=True) 
      log.msg(u'Ensuring index for key {0}'.format(
       self.config['unique_key'])) 

     # Get the duplicate on key option 
     if self.config['stop_on_duplicate']: 
      tmpValue = self.config['stop_on_duplicate'] 
      if tmpValue < 0: 
       log.msg(
        (
         u'Negative values are not allowed for' 
         u' MONGODB_STOP_ON_DUPLICATE option.' 
        ), 
        level=log.ERROR 
       ) 
       raise SyntaxError(
        (
         'Negative values are not allowed for' 
         ' MONGODB_STOP_ON_DUPLICATE option.' 
        ) 
       ) 
      self.stop_on_duplicate = self.config['stop_on_duplicate'] 
     else: 
      self.stop_on_duplicate = 0 

    def configure(self): 
     """ Configure the MongoDB connection """ 
     # Handle deprecated configuration 
     if not not_set(self.settings['MONGODB_HOST']): 
      log.msg(
       u'DeprecationWarning: MONGODB_HOST is deprecated', 
       level=log.WARNING) 
      mongodb_host = self.settings['MONGODB_HOST'] 

      if not not_set(self.settings['MONGODB_PORT']): 
       log.msg(
        u'DeprecationWarning: MONGODB_PORT is deprecated', 
        level=log.WARNING) 
       self.config['uri'] = 'mongodb://{0}:{1:i}'.format(
        mongodb_host, 
        self.settings['MONGODB_PORT']) 
      else: 
       self.config['uri'] = 'mongodb://{0}:27017'.format(mongodb_host) 

     if not not_set(self.settings['MONGODB_REPLICA_SET']): 
      if not not_set(self.settings['MONGODB_REPLICA_SET_HOSTS']): 
       log.msg(
        (
         u'DeprecationWarning: ' 
         u'MONGODB_REPLICA_SET_HOSTS is deprecated' 
        ), 
        level=log.WARNING) 
       self.config['uri'] = 'mongodb://{0}'.format(
        self.settings['MONGODB_REPLICA_SET_HOSTS']) 

     # Set all regular options 
     options = [ 
      ('uri', 'MONGODB_URI'), 
      ('fsync', 'MONGODB_FSYNC'), 
      ('write_concern', 'MONGODB_REPLICA_SET_W'), 
      ('database', 'MONGODB_DATABASE'), 
      ('collection', 'MONGODB_COLLECTION'), 
      ('replica_set', 'MONGODB_REPLICA_SET'), 
      ('unique_key', 'MONGODB_UNIQUE_KEY'), 
      ('buffer', 'MONGODB_BUFFER_DATA'), 
      ('append_timestamp', 'MONGODB_ADD_TIMESTAMP'), 
      ('stop_on_duplicate', 'MONGODB_STOP_ON_DUPLICATE') 
     ] 

     for key, setting in options: 
      if not not_set(self.settings[setting]): 
       self.config[key] = self.settings[setting] 

     # Check for illegal configuration 
     if self.config['buffer'] and self.config['unique_key']: 
      log.msg(
       (
        u'IllegalConfig: Settings both MONGODB_BUFFER_DATA ' 
        u'and MONGODB_UNIQUE_KEY is not supported' 
       ), 
       level=log.ERROR) 
      raise SyntaxError(
       (
        u'IllegalConfig: Settings both MONGODB_BUFFER_DATA ' 
        u'and MONGODB_UNIQUE_KEY is not supported' 
       )) 

    def process_item(self, item, spider): 
     """ Process the item and add it to MongoDB 
     :type item: Item object 
     :param item: The item to put into MongoDB 
     :type spider: BaseSpider object 
     :param spider: The spider running the queries 
     :returns: Item object 
     """ 
     item = dict(self._get_serialized_fields(item)) 

     if self.config['buffer']: 
      self.current_item += 1 

      if self.config['append_timestamp']: 
       item['scrapy-mongodb'] = {'ts': datetime.datetime.utcnow()} 

      self.item_buffer.append(item) 

      if self.current_item == self.config['buffer']: 
       self.current_item = 0 
       return self.insert_item(self.item_buffer, spider) 

      else: 
       return item 

     return self.insert_item(item, spider) 

    def close_spider(self, spider): 
     """ Method called when the spider is closed 
     :type spider: BaseSpider object 
     :param spider: The spider running the queries 
     :returns: None 
     """ 
     if self.item_buffer: 
      self.insert_item(self.item_buffer, spider) 

    def insert_item(self, item, spider): 
     """ Process the item and add it to MongoDB 
     :type item: (Item object) or [(Item object)] 
     :param item: The item(s) to put into MongoDB 
     :type spider: BaseSpider object 
     :param spider: The spider running the queries 
     :returns: Item object 
     """ 
     if not isinstance(item, list): 
      item = dict(item) 

      if self.config['append_timestamp']: 
       item['scrapy-mongodb'] = {'ts': datetime.datetime.utcnow()} 

     if self.config['unique_key'] is None: 
      try: 
       self.collection.insert(item, continue_on_error=True) 
       log.msg(
        u'Stored item(s) in MongoDB {0}/{1}'.format(
         self.config['database'], self.config['collection']), 
        level=log.DEBUG, 
        spider=spider) 
      except errors.DuplicateKeyError: 
       log.msg(u'Duplicate key found', level=log.DEBUG) 
       if (self.stop_on_duplicate > 0): 
        self.duplicate_key_count += 1 
        if (self.duplicate_key_count >= self.stop_on_duplicate): 
         self.crawler.engine.close_spider(
          spider, 
          'Number of duplicate key insertion exceeded' 
         ) 
       pass 

     else: 
      key = {} 
      if isinstance(self.config['unique_key'], list): 
       for k in dict(self.config['unique_key']).keys(): 
        key[k] = item[k] 
      else: 
       key[self.config['unique_key']] = item[self.config['unique_key']] 

      self.collection.update(key, item, upsert=True) 

      log.msg(
       u'Stored item(s) in MongoDB {0}/{1}'.format(
        self.config['database'], self.config['collection']), 
       level=log.DEBUG, 
       spider=spider) 

     return item 
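
For reference, this is roughly how the pipeline gets wired up in settings.py. The MONGODB_* names come from the options list in configure() above; the pipeline path and the authSource parameter are assumptions (authSource only matters if the harland user was created in a database other than article):

# settings.py -- a sketch, not the asker's actual settings
ITEM_PIPELINES = {
    'scrapy_mongodb.MongoDBPipeline': 300,  # path assumed from the module name
}

MONGODB_URI = 'mongodb://harland:Ase0peedi@localhost:27017/?authSource=admin'  # authSource=admin is a guess
MONGODB_DATABASE = 'article'
MONGODB_COLLECTION = 'stuffconz'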

If someone could just point me in the right direction it would be a huge help! I've been playing with this for days, but I'm starting to feel I don't know Python and Scrapy well enough.

Cheers,


I'm not familiar with MongoDB, but this looks like a permissions/authorization error while trying to insert the document into 'article'. As far as I can tell, it isn't a Scrapy problem. Do the credentials from 'mongodb://harland:Ase0peedi@localhost:27017' work when you try to insert a document from a MongoDB client? –


Yes, those are the credentials, which is the strange part, because I can authenticate against the database with them... but as soon as Scrapy tries to insert something I get the authorization error. I'm really stuck! – Hamish

Answer


Before the 'parse_article_page' method is executed, you need to log in to the website you are scraping data from, using Scrapy.

The error states it clearly: "OperationFailure: not authorized on article to execute command", meaning you are not authorized to execute the command.

import scrapy

class MySpider(scrapy.Spider):
    name = 'myspider'

    def start_requests(self):
        return [scrapy.FormRequest("http://www.example.com/login",
                                   formdata={'user': 'john', 'pass': 'secret'},
                                   callback=self.logged_in)]

    def logged_in(self, response):
        # here you would extract links to follow and return Requests for
        # each of them, with another callback
        pass

I don't think this refers to the website, but to the database. The website doesn't require authentication to access the article. The database is called 'article'. – Hamish
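
That detail matches the traceback: the insert is rejected on the article database, which usually means the harland user either lacks a readWrite role on article or was created in (and must authenticate against) a different database. A sketch of one possible fix follows; the admin credentials and the assumption that the user merely needs the role added are guesses, not anything stated in the question:

from pymongo import MongoClient

# Connect as a user with userAdmin privileges (hypothetical credentials).
admin_client = MongoClient('mongodb://admin_user:admin_pass@localhost:27017')

# Grant the existing "harland" user readWrite on the "article" database.
admin_client['admin'].command(
    'grantRolesToUser', 'harland',
    roles=[{'role': 'readWrite', 'db': 'article'}])

If the user was created in the admin database, the pipeline's MONGODB_URI would also need ?authSource=admin so that pymongo authenticates against the right database.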