0
我已經寫上http://www.funda.nl/以下蜘蛛湊頁數:Scrapy蜘蛛返回在某些情況下,空饋電輸出
import re
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from funda.items import FundaItem
class FundaSpider(CrawlSpider):
name = "funda_spider"
allowed_domains = ["funda.nl"]
def __init__(self, place='amsterdam'):
self.start_urls = ["http://www.funda.nl/koop/%s/p%s/" % (place, page_number) for page_number in range(1,3001)]
self.base_url = "http://www.funda.nl/koop/%s/" % place
self.le1 = LinkExtractor(allow=r'%s+(huis|appartement)-\d{8}' % self.base_url)
def parse(self, response):
links = self.le1.extract_links(response)
for link in links:
if link.url.count('/') == 6 and link.url.endswith('/'):
item = FundaItem()
item['url'] = link.url
if re.search(r'/appartement-',link.url):
item['property_type'] = "apartment"
elif re.search(r'/huis-',link.url):
item['property_type'] = "house"
yield scrapy.Request(link.url, callback=self.parse_dir_contents, meta={'item': item})
def parse_dir_contents(self, response):
new_item = response.request.meta['item']
title = response.xpath('//title/text()').extract()[0]
postal_code = re.search(r'\d{4} [A-Z]{2}', title).group(0)
address = re.findall(r'te koop: (.*) \d{4}',title)[0]
price_dd = response.xpath("//dt[contains(.,'Vraagprijs')]/following-sibling::dd[1]/text()").extract()[0]
price = re.findall(r' \d+.\d+', price_dd)[0].strip()
year_built_dd = response.xpath("//dt[contains(.,'Bouwjaar')]/following-sibling::dd[1]/text()").extract()[0]
year_built = re.findall(r'\d+', year_built_dd)[0]
area_dd = response.xpath("//dt[contains(.,'Woonoppervlakte')]/following-sibling::dd[1]/text()").extract()[0]
area = re.findall(r'\d+', area_dd)[0]
rooms_dd = response.xpath("//dt[contains(.,'Aantal kamers')]/following-sibling::dd[1]/text()").extract()[0]
rooms = re.findall('\d+ kamer',rooms_dd)[0].replace(' kamer','')
bedrooms = re.findall('\d+ slaapkamer',rooms_dd)[0].replace(' slaapkamer','')
new_item['postal_code'] = postal_code
new_item['address'] = address
new_item['price'] = price
new_item['year_built'] = year_built
new_item['area'] = area
new_item['rooms'] = rooms
new_item['bedrooms'] = bedrooms
yield new_item
其中FundaItem
被定義爲
import scrapy
class FundaItem(scrapy.Item):
# define the fields for your item here like:
url = scrapy.Field()
title = scrapy.Field()
address = scrapy.Field()
postal_code = scrapy.Field()
price = scrapy.Field() # Listing price ("Vraagprijs")
year_built = scrapy.Field() # Year built ("Bouwjaar")
area = scrapy.Field() # Built area ("Woonoppervlakte")
rooms = scrapy.Field() # Number of rooms
bedrooms = scrapy.Field() # Number of bedrooms
property_type = scrapy.Field() # House or apartment
例如,如果我跑它與命令
scrapy crawl funda_spider -a place=amsterdam -o amsterdam.json
然後我得到一個719 KB的JSON文件,它啓動像這樣:
[
{"year_built": "1984", "area": "31", "url": "http://www.funda.nl/koop/amsterdam/appartement-49800928-jan-muschstraat-8/", "price": "132.500", "bedrooms": "1", "postal_code": "1065 LX", "rooms": "1", "address": "Jan Muschstraat 8", "property_type": "apartment"},
{"year_built": "1990", "area": "79", "url": "http://www.funda.nl/koop/amsterdam/appartement-85255640-zeeburgerkade-738-pp/", "price": "300.000", "bedrooms": "1", "postal_code": "1019 HT", "rooms": "2", "address": "Zeeburgerkade 738 +PP", "property_type": "apartment"},
{"year_built": "1906", "area": "93", "url": "http://www.funda.nl/koop/amsterdam/appartement-49897032-cliffordstraat-22-huis/", "price": "550.000", "bedrooms": "3", "postal_code": "1051 GT", "rooms": "4", "address": "Cliffordstraat 22 -HUIS", "property_type": "apartment"},
通過指定關鍵字「阿姆斯特丹」,我颳了所有的房屋和公寓從http://www.funda.nl/koop/amsterdam/。
到目前爲止,這麼好。但Funda也有整個省份的頁面,例如http://www.funda.nl/koop/provincie-zuid-holland/。如果我嘗試湊這個使用
scrapy crawl funda_spider -a place=provincie-zuid-holland -o zuid_holland.json
我得到一個空的JSON文件:
[
我懷疑的是,阿姆斯特丹以外的一些房子或公寓,解析以某種方式去錯了,這導致整個JSON輸出爲空。我怎樣才能讓蜘蛛在南荷蘭省生產呢?