我已經設置規則從start_url獲取下一頁,但它不工作,它只抓取start_urls頁面以及該頁面中的鏈接(使用parseLinks)。它不會轉到規則中設置的下一頁。我如何跳轉到下一個頁面在Scrapy規則
有幫助嗎?
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy import log
from urlparse import urlparse
from urlparse import urljoin
from scrapy.http import Request
class MySpider(CrawlSpider):
name = 'testes2'
allowed_domains = ['example.com']
start_urls = [
'http://www.example.com/pesquisa/filtro/?tipo=0&local=0'
]
rules = (Rule(SgmlLinkExtractor(restrict_xpaths=('//a[@id="seguinte"]/@href')), follow=True),)
def parse(self, response):
sel = Selector(response)
urls = sel.xpath('//div[@id="btReserve"]/../@href').extract()
for url in urls:
url = urljoin(response.url, url)
self.log('URLS: %s' % url)
yield Request(url, callback = self.parseLinks)
def parseLinks(self, response):
sel = Selector(response)
titulo = sel.xpath('h1/text()').extract()
morada = sel.xpath('//div[@class="MORADA"]/text()').extract()
email = sel.xpath('//a[@class="sendMail"][1]/text()')[0].extract()
url = sel.xpath('//div[@class="contentContacto sendUrl"]/a/text()').extract()
telefone = sel.xpath('//div[@class="telefone"]/div[@class="contentContacto"]/text()').extract()
fax = sel.xpath('//div[@class="fax"]/div[@class="contentContacto"]/text()').extract()
descricao = sel.xpath('//div[@id="tbDescricao"]/p/text()').extract()
gps = sel.xpath('//td[@class="sendGps"]/@style').extract()
print titulo, email, morada
檢查這個答案,這將解決這個問題:http://stackoverflow.com/questions/13227546/scrapy-crawls-first-page-but -does-not-follow-links?answertab = votes#tab-top – Perefexexos