如果您想要爬取整個網站,可以看看 scrapy 提供的 CrawlSpider。我這裏也使用了 lxml.html,只是因爲它提供了更多的靈活性。
要安裝這些庫,你可以使用:
pip install scrapy
pip install lxml
要搭建一個基本的 scrapy 項目骨架,您可以使用以下命令:
scrapy startproject elections
然後添加蜘蛛和項目:
elections/spiders/spider.py
from scrapy.spiders import CrawlSpider, Rule
from elections.items import ElectionsItem
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.selector import Selector
from lxml import html
class ElectionsSpider(CrawlSpider):
    """Crawl elections.ca nomination-contest "Details" pages.

    Follows every link matching the Details URL pattern and yields one
    ``ElectionsItem`` per candidate row found in each contest fieldset.
    """

    name = "elections"
    allowed_domains = ["elections.ca"]
    start_urls = ["http://www.elections.ca/WPAPPS/WPR/EN/NC/Details?province=-1&distyear=2013&district=-1&party=-1&pageno=1&totalpages=55&totalcount=1372&viewall=1"]

    rules = (
        Rule(
            LxmlLinkExtractor(
                allow=('http://www.elections.ca/WPAPPS/WPR/EN/NC/Details.*'),
            ),
            callback='parse_item',
            follow=True,
        ),
    )

    def unindent(self, string):
        """Strip leading/trailing whitespace from every line and join them.

        Bug fix: the original implementation was
        ``''.join(map(str.strip, string.encode('utf8').splitlines(1)))``,
        which breaks on Python 3 — ``encode`` returns ``bytes``, so
        ``str.strip`` cannot be mapped over the resulting lines and
        ``''.join`` (a ``str`` separator) cannot join them.  Working purely
        on ``str`` yields the same joined, stripped result on both Python 2
        and Python 3.
        """
        return ''.join(line.strip() for line in string.splitlines())

    def parse_item(self, response):
        """Parse one Details page; yield an item per candidate table row."""
        # NOTE(review): a single Item instance is reused across contests and
        # candidates, so fields missing on a later row keep the previous
        # row's value — confirm this carry-over is intended.
        item = ElectionsItem()
        original_html = Selector(response).extract()
        lxml_obj = html.fromstring(original_html)
        # Each fieldset.wpr-detailgroup describes one nomination contest.
        for entry in lxml_obj.xpath('.//fieldset[contains(@class,"wpr-detailgroup")]'):
            date = entry.xpath('.//legend[contains(@class,"wpr-ltitle")]/span[contains(@class,"date")]')
            if date:
                item['date'] = self.unindent(date[0].text.strip())
            party = entry.xpath('.//legend[contains(@class,"wpr-ltitle")]')
            if party:
                item['party'] = self.unindent(party[0].text.strip())
            start_date = entry.xpath('.//div[contains(@class,"group")]/span[contains(@class,"date")][1]')
            if start_date:
                item['start_date'] = self.unindent(start_date[0].text.strip())
            end_date = entry.xpath('.//div[contains(@class,"group")]/span[contains(@class,"date")][2]')
            if end_date:
                item['end_date'] = self.unindent(end_date[0].text.strip())
            electoral_district = entry.xpath('.//div[contains(@class,"wpr-title")][contains(text(),"Electoral district:")]')
            if electoral_district:
                # The value is the text node that follows the label <div>,
                # i.e. the element's tail.
                item['electoral_district'] = self.unindent(electoral_district[0].tail.strip())
            registered_association = entry.xpath('.//div[contains(@class,"wpr-title")][contains(text(),"Registered association:")]')
            if registered_association:
                item['registered_association'] = self.unindent(registered_association[0].tail.strip())
            # One data-table row without a class attribute per candidate.
            for candidate in entry.xpath('.//table[contains(@class, "wpr-datatable")]//tr[not(@class)]'):
                # The "won" marker image is present only for the winner.
                item['elected'] = len(candidate.xpath('.//img[contains(@alt, "contestant won this nomination contest")]'))
                candidate_name = candidate.xpath('.//td[contains(@headers,"name")]')
                if candidate_name:
                    item['candidate_name'] = self.unindent(candidate_name[0].text.strip())
                item['address'] = self.unindent(candidate.xpath('.//td[contains(@headers,"address")]')[0].text_content().strip())
                item['financial_agent'] = self.unindent(candidate.xpath('.//td[contains(@headers,"fa")]')[0].text_content().strip())
                yield item
elections/items.py
from scrapy.item import Item, Field
class ElectionsItem(Item):
    # One nomination-contest candidate row scraped from elections.ca
    # (fields are filled by ElectionsSpider.parse_item).
    date = Field()                     # contest date from the fieldset legend
    party = Field()                    # legend title text
    start_date = Field()               # first span.date inside div.group
    end_date = Field()                 # second span.date inside div.group
    electoral_district = Field()       # text after the "Electoral district:" label
    registered_association = Field()   # text after the "Registered association:" label
    elected = Field()                  # 1 if the "won" marker image is present, else 0
    candidate_name = Field()           # td[headers~=name] cell text
    address = Field()                  # td[headers~=address] cell text
    financial_agent = Field()          # td[headers~=fa] cell text
elections/settings.py
# Scrapy project identity.
BOT_NAME = 'elections'

# Where Scrapy discovers spiders (and where `genspider` creates new ones).
SPIDER_MODULES = ['elections.spiders']
NEWSPIDER_MODULE = 'elections.spiders'

# Route every scraped item through the CSV-exporting pipeline.
# NOTE(review): pipelines.py defines the class as ``electionsPipeline``
# (lower-case 'e'); this path expects ``ElectionsPipeline`` — the two names
# must match or Scrapy will fail to load the pipeline.
ITEM_PIPELINES = {
    'elections.pipelines.ElectionsPipeline': 300,
}
elections/pipelines.py
from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher
from scrapy.exporters import CsvItemExporter
class ElectionsPipeline(object):
    """Export every scraped item to ``<spider.name>_ads.csv`` via CsvItemExporter.

    Bug fix: the class was originally named ``electionsPipeline``, which does
    not match the ``'elections.pipelines.ElectionsPipeline'`` entry declared
    in ITEM_PIPELINES (settings.py), so Scrapy could never load the pipeline.
    Renamed to match; a module-level alias keeps the old name importable.
    """

    def __init__(self):
        # Tie the exporter's life-cycle to the spider open/close signals.
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        # Maps spider -> its open output file, so it can be closed later.
        self.files = {}

    def spider_opened(self, spider):
        # One CSV file per spider; binary mode as CsvItemExporter requires.
        # (Renamed local from `file`, which shadowed the builtin.)
        out = open('%s_ads.csv' % spider.name, 'w+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        out = self.files.pop(spider)
        out.close()

    def process_item(self, item, spider):
        # Write the item as one CSV row and pass it on unchanged.
        self.exporter.export_item(item)
        return item


# Backward-compatible alias for the original (mis-cased) class name.
electionsPipeline = ElectionsPipeline
您可以通過以下命令運行蜘蛛:
scrapy runspider elections/spiders/spider.py
從您的項目根目錄運行。這應該會在項目根目錄下創建 elections_ads.csv(pipeline 按 '%s_ads.csv' % spider.name 命名輸出文件),內容像這樣:
financial_agent,end_date,candidate_name,registered_association,electoral_district,elected,address,date,party,start_date
"Jan BalcaThornhill, OntarioL4J 1V9","September 09, 2015",Leslyn Lewis,,Scarborough--Rouge Park,1,"Markham, OntarioL6B 0K9","September 09, 2015",,"September 07, 2015"
"Mark HicksPasadena, Newfoundland and LabradorA0L 1K0","September 08, 2015",Roy Whalen,,Long Range Mountains,1,"Deer Lake, Newfoundland and LabradorA8A 3H6","September 08, 2015",,"August 21, 2015"
,"September 08, 2015",Wayne Ruth,,Long Range Mountains,0,"Kippens, Newfoundland and LabradorA2N 3B8","September 08, 2015",,"August 21, 2015"
,"September 08, 2015",Mark Krol,,St. John's South--Mount Pearl,1,"Woodbridge, OntarioL4L 1Y5","September 08, 2015",,"August 24, 2015"
,"September 08, 2015",William MacDonald Alexander,,Bow River,1,"Calgary, AlbertaT2V 0M1","September 08, 2015",,"September 04, 2015"
(...)
你想湊所有條目?或者你想過濾一個搜索條件(省/領土,再分配年,選舉區,政黨,協會關鍵詞,參賽者關鍵詞,比賽日期)? –
您使用哪個分隔符?另外,如果您在某些字段中有分隔符,是否使用了quotechar? – Fejs
@IvanChaer我想根本沒有任何過濾的東西,截至目前,我的代碼可以做到這一點 - 這只是一個問題,獲取存儲在每個頁面上的所有信息,再加上csv輸出問題。 – HowenWilson