如果我的問題太瑣碎了,但是我今天早上從牆背後開始,我很抱歉...我是scrapy的新手,我已經閱讀了文檔, t上找不到我的答案...scrapy.Request不回撥我的功能
我寫了這個蜘蛛,當我在rules = (Rule(LinkExtractor(), callback='parse_body'),)
稱之爲parse_body
,它的作用:
tchatch = response.xpath('//div[@class="ProductPriceBox-item detail"]/div/a/@href').extract()
print('\n TROUVE \n')
print(tchatch)
print('\n DONE \n')
但是,當我重新命名,到處都在我的代碼,功能parse_body
由剛剛parse
,它只是:
print('\n EN FAIT, ICI : ', response.url, '\n')
似乎我的scrapy.Request
請求從來沒有被調用...... 我甚至會打印很多無用的東西來知道我的代碼是否正在運行這些函數,但除了上面寫的print
之外,它什麼都不打印。
有什麼想法嗎?
# -*- coding: utf-8 -*-
import scrapy
import re
import numbers
from fnac.items import FnacItem
from urllib.request import urlopen
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from bs4 import BeautifulSoup
class Fnac(CrawlSpider):
name = 'FnacCom'
allowed_domains = ['fnac.com']
start_urls = ['http://musique.fnac.com/a10484807/The-Cranberries-Something-else-CD-album']
rules = (
Rule(LinkExtractor(), callback='parse_body'),
)
def parse_body(self, response):
item = FnacItem()
nb_sales = response.xpath('//body//table[@summary="données détaillée du vendeur"]/tbody/tr/td/span/text()').re(r'([\d]*) ventes')
country = response.xpath('//body//table[@summary="données détaillée du vendeur"]/tbody/tr/td/text()').re(r'([A-Z].*)')
item['nb_sales'] = ''.join(nb_sales).strip()
item['country'] = ''.join(country).strip()
print(response.url)
test_list = response.xpath('//a/@href')
for test_list in response.xpath('.//div[@class="ProductPriceBox-item detail"]'):
tchatch = response.xpath('//div[@class="ProductPriceBox-item detail"]/div/a/@href').extract()
print('\n TROUVE \n')
print(tchatch)
print('\n DONE \n')
yield scrapy.Request(response.url, callback=self.parse_iframe, meta={'item': item})
def parse_iframe(self, response):
f_item1 = response.meta['item']
print('\n EN FAIT, ICI : ', response.url, '\n')
soup = BeautifulSoup(urlopen(response.url), "lxml")
iframexx = soup.find_all('iframe')
if (len(iframexx) != 0):
for iframe in iframexx:
yield scrapy.Request(iframe.attrs['src'], callback=self.extract_or_loop, meta={'item': f_item1})
else:
yield scrapy.Request(response.url, callback=self.extract_or_loop, meta={'item': f_item1})
def extract_or_loop(self, response):
f_item2 = response.meta['item']
print('\n PEUT ETRE ICI ? \n')
address = response.xpath('//body//div/p/text()').re(r'.*Adresse \: (.*)\n?.*')
email = response.xpath('//body//div/ul/li[contains(text(),"@")]/text()').extract()
name = response.xpath('//body//div/p[@class="customer-policy-label"]/text()').re(r'Infos sur la boutique \: ([a-zA-Z0-9]*\s*)')
phone = response.xpath('//body//div/p/text()').re(r'.*Tél \: ([\d]*)\n?.*')
siret = response.xpath('//body//div/p/text()').re(r'.*Siret \: ([\d]*)\n?.*')
vat = response.xpath('//body//div/text()').re(r'.*TVA \: (.*)')
if (len(name) != 0):
print('\n', name, '\n')
f_item2['name'] = ''.join(name).strip()
f_item2['address'] = ''.join(address).strip()
f_item2['phone'] = ''.join(phone).strip()
f_item2['email'] = ''.join(email).strip()
f_item2['vat'] = ''.join(vat).strip()
f_item2['siret'] = ''.join(siret).strip()
yield f_item2
else:
for sel in response.xpath('//html/body'):
list_urls = sel.xpath('//a/@href').extract()
list_iframe = response.xpath('//div[@class="ProductPriceBox-item detail"]/div/a/@href').extract()
if (len(list_iframe) != 0):
for list_iframe in list_urls:
print('\n', list_iframe, '\n')
print('\n GROS TCHATCH \n')
yield scrapy.Request(list_iframe, callback=self.parse_body)
for url in list_urls:
yield scrapy.Request(response.urljoin(url), callback=self.parse_body)
爲什麼不使用'Spider'取代'CrawlSpider'如果你不使用,即使真正的規則? –
因爲如果我把'蜘蛛'而不是'CrawlSpider',我有這個錯誤'raise NotImplementedError' –
這可能是因爲你沒有實現* parse *方法。 –