
MySQL database error using Scrapy: I am trying to save the scraped data into a MySQL database. My script.py is:

# -*- coding: utf-8 -*-
import scrapy
import unidecode
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from lxml import html


class ElementSpider(scrapy.Spider):
    name = 'books'
    download_delay = 3
    allowed_domains = ["goodreads.com"]
    start_urls = ["https://www.goodreads.com/list/show/19793.I_Marked_My_Calendar_For_This_Book_s_Release",]

    rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@class="next_page"]',)), callback="parse", follow=True),)

    def parse(self, response):
        for href in response.xpath('//div[@id="all_votes"]/table[@class="tableList js-dataTooltip"]/tr/td[2]/div[@class="js-tooltipTrigger tooltipTrigger"]/a/@href'):
            full_url = response.urljoin(href.extract())
            print full_url
            yield scrapy.Request(full_url, callback=self.parse_books)
            break

        next_page = response.xpath('.//a[@class="next_page"]/@href').extract()
        if next_page:
            next_href = next_page[0]
            next_page_url = 'https://www.goodreads.com' + next_href
            print next_page_url
            request = scrapy.Request(next_page_url, self.parse)
            yield request

    def parse_books(self, response):
        yield {
            'url': response.url,
            'title': response.xpath('//div[@id="metacol"]/h1[@class="bookTitle"]/text()').extract(),
            'link': response.xpath('//div[@id="metacol"]/h1[@class="bookTitle"]/a/@href').extract(),
        }

And pipeline.py is:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


import MySQLdb
import hashlib
from scrapy.exceptions import DropItem

from scrapy.http import Request
import sys


class SQLStore(object):
    def __init__(self):
        self.conn = MySQLdb.connect("localhost", "root", "", "books")
        self.cursor = self.conn.cursor()
        print "connected to DB"

    def process_item(self, item, spider):
        print "hi"

        try:
            self.cursor.execute("""INSERT INTO books_data(next_page_url) VALUES (%s)""", (item['url']))
            self.conn.commit()

        except Exception, e:
            print e
When I run the script there is no error and the spider runs fine, but I think execution never reaches process_item; it does not even print "hi".

Answer


Your method signature is wrong; it should take the item and spider arguments:

process_item(self, item, spider) 

You also need to have the pipeline enabled in your settings.py file:

ITEM_PIPELINES = {"project_name.path.SQLStore": 300}
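
For reference, a minimal sketch of what that entry could look like for this particular project, assuming the project name test1 that appears in the comments below (the number only sets the pipeline's run order):

# settings.py -- hypothetical path; project name "test1" taken from the comments below
ITEM_PIPELINES = {
    'test1.pipelines.SQLStore': 300,  # lower numbers run earlier (range 0-1000)
}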

Your syntax is also incorrect; you need to pass a tuple:

self.cursor.execute("""INSERT INTO books_data(next_page_url) VALUES (%s)""",
                    (item['url'],))  # <- add the trailing comma so it is a tuple
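
Putting both fixes together, a minimal sketch of the corrected process_item, keeping the question's Python 2 / MySQLdb setup and the books_data table (the rollback and the final return item are additions beyond the original code):

    def process_item(self, item, spider):
        try:
            # The second argument to execute() must be a sequence,
            # hence the one-element tuple with the trailing comma.
            self.cursor.execute(
                """INSERT INTO books_data(next_page_url) VALUES (%s)""",
                (item['url'],))
            self.conn.commit()
        except Exception, e:
            # Undo the failed statement and surface the error.
            self.conn.rollback()
            print e
        # Return the item so any later pipelines can keep processing it.
        return item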

I have already tried this, but it does not work. I added the pipeline in settings.py like this: ITEM_PIPELINES ='test1.pipelines.SQLStore':300, }


What is in the __init__.py file of your pipelines directory? And do you also have 'process_item(self, item, spider)'?


Then how is Scrapy supposed to find your SQLStore pipeline?