3
我想安排我的蜘蛛在爬行完成後的1小時內再次運行。在我的代碼spider_closed
方法正在調用後爬行結束。現在如何從這個方法再次運行蜘蛛。或者是否有任何可用的設置來安排scrapy蜘蛛。如何安排scrapy蜘蛛在特定時間後爬行?
這是我的基本蜘蛛代碼。
import scrapy
import codecs
from a2i.items import A2iItem
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.conf import settings
from scrapy.crawler import CrawlerProcess
from scrapy import signals
from scrapy.utils.project import get_project_settings
from scrapy.xlib.pydispatch import dispatcher
class A2iSpider(scrapy.Spider):
    """Crawl prothom-alo.com, appending each visited URL and its crawl
    depth to ``response.txt``.

    Seed URLs are read from ``urls.txt`` (one per line) at class-definition
    time.  ``spider_closed`` is connected as a signal handler so code can
    run when the crawl finishes (e.g. to re-schedule the spider).
    """

    name = "notice"
    allowed_domains = ["prothom-alo.com"]

    # Read seed URLs once at class creation; `with` guarantees the file
    # handle is closed even if readlines() raises.  Blank lines are
    # skipped so they do not become empty start URLs.
    with open("urls.txt") as _url_file:
        start_urls = [line.strip() for line in _url_file if line.strip()]

    def __init__(self, *args, **kwargs):
        # Chain to scrapy.Spider.__init__ so base-class setup (name/kwargs
        # handling) still runs; the original skipped this.
        super(A2iSpider, self).__init__(*args, **kwargs)
        # Fire spider_closed when the crawl ends.
        # NOTE(review): scrapy.xlib.pydispatch is deprecated; on modern
        # Scrapy prefer the from_crawler + crawler.signals.connect pattern.
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def parse(self, response):
        """Follow every link on a start page, tagging requests with depth 2."""
        for href in response.css("a::attr('href')"):
            url = response.urljoin(href.extract())
            # Single-argument print(...) behaves identically on Py2 and Py3.
            print("*" * 70)
            print(url)
            print("\n\n")
            yield scrapy.Request(
                url,
                callback=self.parse_page,
                meta={'depth': 2, 'url': url},
            )

    def parse_page(self, response):
        """Log this page's depth and URL, then follow its links one level deeper."""
        depth = response.meta['depth']
        with open("response.txt", 'a') as f:
            f.write(str(depth))
            f.write("\n")
            f.write(response.meta['url'])
            f.write("\n")
        for href in response.css("a::attr('href')"):
            url = response.urljoin(href.extract())
            yield scrapy.Request(
                url,
                callback=self.parse_page,
                meta={'depth': depth + 1, 'url': url},
            )

    def spider_closed(self, spider):
        """Signal handler invoked when the crawl ends.

        This is where a re-run could be scheduled (e.g. via CrawlerProcess
        or an external scheduler such as cron/scrapyd).
        """
        print("$" * 2000)