0
我正在用 Scrapy 抓取此網站:Scrapy 只抓取第一頁,不跟隨其他鏈接
第1頁:http://www.randstad.nl/mwp2/faces/baanZoeken?pagina=1&filters=vakgebied!5626
因此,我認爲出現問題的是它從頁面1獲取所有鏈接,轉到子頁面(因此轉到它提取的鏈接,「子頁面」),然後轉到第2頁然後再做一次,但我認爲在第1頁後,它只會獲得第一個鏈接(而不是第2頁的所有鏈接),然後繼續第3頁並執行相同操作。
我嘗試了很多不同的代碼,但仍然無法把它弄對。希望你能看看我的代碼,幫我找出我哪裡做錯了。
蜘蛛(spider)代碼
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from craig.items import CraigItem
from scrapy.http import Request
import re
class CraigSpiderSpider(CrawlSpider):
    """Crawl the randstad.nl job search listing and scrape vacancy pages.

    Pagination/filter links are discovered through ``rules``; every response
    fetched that way (and every start URL) is handed to ``parse_page``, which
    queues the "read more" links found on the page and, when the page carries
    a job description, yields a populated ``CraigItem``.
    """

    name = "craig_spider"
    allowed_domains = ["randstad.nl"]
    start_urls = (
        "http://www.randstad.nl/mwp2/faces/baanZoeken?pagina=1&filters=vakgebied!5626",
        "http://www.randstad.nl/mwp2/faces/baanZoeken?"
    )
    # The rule callback must not be named "parse": CrawlSpider implements its
    # own rule-following logic in parse(), so overriding that method stops the
    # rules from being applied to downloaded pages.
    rules = (
        Rule(
            SgmlLinkExtractor(allow=("filters=vakgebied!5626", "pagina=")),
            callback="parse_page",
            follow=True,
        ),
    )

    def parse_start_url(self, response):
        """Process the start URLs with the same callback as rule-matched pages.

        CrawlSpider only invokes rule callbacks for links it extracted, so the
        start responses would otherwise never have their own links queued.
        """
        return self.parse_page(response)

    def parse_page(self, response):
        """Queue the "read more" links on a page and scrape it if it is a vacancy.

        Yields:
            ``Request`` objects for every ``outer-read-more-link`` anchor, and
            one ``CraigItem`` when the job description panel contains text.
        """
        sel = Selector(response)

        # Collect all "read more" links and schedule them for scraping.
        hrefs = sel.xpath(".//a[contains(@class, 'outer-read-more-link')]/@href").extract()
        for link in hrefs:
            yield Request(link, callback=self.parse_page)

        # Only pages with a job description panel produce an item.
        text_list = sel.xpath('//div[@id="basePage:page:twoColumn:r2:0:functieOmschrijvingPanel::content"]/text()').extract()
        if not text_list:
            return

        title_list = sel.xpath('//div[@id="basePage:page:panelTitleHeader"]//td[@class="af_panelBox_header-text"]//h1[@class="af_panelBox_header-element"]/text()').extract()
        label_samenvatting = sel.xpath('//div[@id="basePage:page:twoColumn:r1:0:pfl1b"]//table//td//label/text()').extract()
        opleidingniveau_list = sel.xpath('//div[@id="basePage:page:twoColumn:r1:0:pl1"]//ul//li/text()').extract()
        soortbaan_list = sel.xpath('//table[@id="basePage:page:twoColumn:r1:0:soortDienstverbandRNL"]//td[@class="AFContentCell af_panelLabelAndMessage_content-cell"]/text()').extract()
        uren_per_week_list = sel.xpath('//tr[@id="basePage:page:twoColumn:r1:0:it5"]//td[@class="AFPanelFormLayoutContentCell af_panelLabelAndMessage_content-cell"]/text()').extract()
        vakgebied_list = sel.xpath('//tr[@id="basePage:page:twoColumn:r1:0:vakgebieden"]//td[@class="AFPanelFormLayoutContentCell af_panelLabelAndMessage_content-cell"]//li/text()').extract()
        branche_list = sel.xpath('//tr[@id="basePage:page:twoColumn:r1:0:aanvraagBranch"]//td[@class="AFPanelFormLayoutContentCell af_panelLabelAndMessage_content-cell"]/text()').extract()
        datum = sel.xpath('//span[@class="date-changed"]/text()').extract()

        item = CraigItem()
        item['link'] = response.url
        item['title'] = ' '.join(title_list)
        item['text'] = ' '.join(text_list)
        item['samenvatting'] = ' '.join(label_samenvatting)
        item['opleidingniveau'] = ' '.join(opleidingniveau_list)
        item['soortbaan'] = ' '.join(soortbaan_list)
        item['urenperweek'] = ' '.join(uren_per_week_list)
        item['vakgebied'] = ' '.join(vakgebied_list)
        item['branche'] = ' '.join(branche_list)
        # Unlike the other fields, the date is stored as the raw list of
        # extracted strings (kept as in the original output).
        item['date'] = datum
        yield item
Items 代碼
from scrapy.item import Item, Field
class CraigItem(Item):
    """Container for the fields scraped from a single vacancy page."""

    # Core content of the page.
    title = Field()
    text = Field()
    link = Field()
    site = Field()
    date = Field()

    # Vacancy summary details (field names are Dutch, matching the site).
    samenvatting = Field()
    opleidingniveau = Field()
    soortbaan = Field()
    urenperweek = Field()
    vakgebied = Field()
    branche = Field()
這使得我的抓取工具獲得更多結果,但仍然不是全部。 – Beer