Currently I have a Scrapy project with the following structure. Shouldn't Scrapy be able to find my spider inside the current project?
.
├── articlescraper
│ ├── __init__.py
│ ├── __init__.pyc
│ ├── items.py
│ ├── items.pyc
│ ├── pipelines.py
│ ├── pipelines.pyc
│ ├── scheduler.py
│ ├── scheduler.pyc
│ ├── settings.py
│ ├── settings.pyc
│ └── spiders
│ ├── __init__.py
│ ├── __init__.pyc
│ ├── nujijspider.py
│ └── nujijspider.pyc
└── scrapy.cfg
Now, in my scheduler.py, I call this function:
from Queue import Queue
import threading
import time
import sys
import imp

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


class Scheduler(object):
    """Scheduler is the base class for the Scheduler

    This class loops on the queue object and calls the needed crawlers from within.
    Reschedules articles to be crawled again.
    """

    def __init__(self):
        self.articleInformation = {}
        self.taskQueue = Queue()

    def append_work(self, work):
        if work['Url'] not in self.articleInformation:
            self.articleInformation[work['Id']] = work

        print self.articleInformation

    def schedule(self):
        article = self.taskQueue.get()

        settings = get_project_settings()
        process = CrawlerProcess(settings)
        process.crawl("articlecommentspider", url="///")
        process.start()
But this results in the following error from Scrapy:
File "/usr/local/lib/python2.7/site-packages/scrapy/spiderloader.py", line 43, in load
raise KeyError("Spider not found: {}".format(spider_name))
KeyError: 'Spider not found: articlecommentspider'
The spider:
class ArticleCommentSpider(scrapy.Spider):
    """ArticleCommentSpider can look for all the comments on an article page.

    Those article pages are specific to www.nujij.nl and nu.nl related websites.
    """
    name = 'articlecommentspider'
    allowed_domains = ['nujij.nl']

    def __init__(self, *args, **kwargs):
        super(ArticleCommentSpider, self).__init__(*args, **kwargs)
        arg = kwargs.get('url')
        if not arg:
            print arg
        self.start_urls = [arg]

    def parse(self, response):
        title = response.xpath("//h1" + matchClass('title') + "//text()").extract()[1]  # Title is oddly defined on nujij.nl (<h1 class="title">)
        articleId = prog.search(response.url).group().split('.')[0]  # This regex matches things like 873238.lynkx in the url
        response.replace(body=response.body.replace('<br>', '\n'))  # Needed for comments which have a lot of <br> tags
        for item in response.xpath('//ol[@class="reacties"]//li' + matchClass('hidenum')):  # Every list item underneath the reactions
            commentId = item.xpath('@id').extract_first()  # Id of the list item (unique on every article)
            c = item.xpath('.//div[@class="reactie-body "]/text()').extract()
            c = ''.join(map(unicode.strip, c))
            date = item.xpath('normalize-space(.//span[@class="tijdsverschil"])').extract()
            date = dateparser.parse("".join(date))

            articleComment = Comment()
            articleComment['Id'] = articleId + "+" + str(commentId)
            articleComment['Source'] = str(title)
            articleComment['IndexedAt'] = date
            articleComment['Url'] = response.url
            articleComment['Parser'] = "nujij.nl"
            articleComment['Content'] = str(c)
            articleComment['Subject'] = {
                "url": response.url,
                "title": str(title)
            }
            print articleComment
When I list the spiders with scrapy list I get both of them. The scheduler file also lives inside the articlescraper project. Why can I not call the spider from within this process?
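For context: as far as I understand it, spider discovery with get_project_settings() depends on the SPIDER_MODULES setting. A minimal sketch of the relevant lines in settings.py, assuming the default scrapy startproject template (the actual file may contain more):

BOT_NAME = 'articlescraper'

# CrawlerProcess builds its spider loader from this list, so
# 'articlecommentspider' is only found if its module is listed here.
SPIDER_MODULES = ['articlescraper.spiders']
NEWSPIDER_MODULE = 'articlescraper.spiders'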
The documentation also shows this: 'from scrapy.crawler import CrawlerProcess / from scrapy.utils.project import get_project_settings / process = CrawlerProcess(get_project_settings()) / # 'followall' is the name of one of the spiders of the project. / process.crawl('followall', domain='scrapinghub.com') / process.start() # the script will block here until the crawling is finished' –
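Formatted, that snippet from the documentation reads as follows ('followall' and the domain are the docs' example values, not names from this project):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())

# 'followall' is the name of one of the spiders of the project.
process.crawl('followall', domain='scrapinghub.com')
process.start()  # the script will block here until the crawling is finished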
Hmm, I've never actually tried using it that way, but I just tried what the documentation recommends against the 'testspiders' repo and it works just fine. Your structure is pretty much identical. Does scrapy list return the correct spider name? – Granitosaurus
Yes, that's why it's so strange. I've changed it to your approach and it works now! –
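For anyone running into the same thing, a minimal sketch of schedule() rewritten along the lines of the documented approach above, assuming 'articlecommentspider' shows up in scrapy list and that the queued work dict carries the article URL under 'Url':

def schedule(self):
    article = self.taskQueue.get()

    # Let get_project_settings() locate settings.py (and with it
    # SPIDER_MODULES) through scrapy.cfg, then refer to the spider by name.
    process = CrawlerProcess(get_project_settings())
    process.crawl('articlecommentspider', url=article['Url'])
    process.start()  # blocks here until the crawl is finished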