import scrapy
from universities.items import UniversitiesItem
def clean_full_name(full_name):
sp = full_name.split(',')
last_name = sp[0].strip()
first_name = sp[1].replace('\r\n', '').strip()
first_name = ' '.join(first_name.split()[:-1]).strip()
return ' '.join([last_name, first_name])
class DerexlUniversity(scrapy.Spider):
name = 'drexel_university'
allowed_domains = ['drexel.edu']
start_urls = ['http://drexel.edu/search?t=people&q=']
def __init__(self):
self.last_name = ''
def parse(self, response):
with open('kw.txt') as file_object:
last_names = file_object.readlines()
for ln in ['Chong', 'Zhao']:
self.last_name = ln.strip()
print('-----------------------------------------------------')
print("scraping last name: ", self.last_name)
query = response.url + self.last_name
yield scrapy.Request(query, callback=self.parse_item)
def parse_item(self, response):
self.logger.info('This is item page %s', self.last_name)
result_rows = response.xpath('//table//tr[@class="result-row"]')
result_details = response.xpath('//table//tr[@class="result-details"]')
for row, detail in zip(result_rows, result_details):
full_name = row.xpath('.//span[@class="fullname"]/text()').extract_first()
if full_name:
full_name = clean_full_name(full_name)
if self.last_name in full_name.split():
item = UniversitiesItem()
item['fullname'] = full_name
item['university'] = 'Drexel University'
try:
item['email'] = row.xpath('.//span[@class="email-address"]/a/@href').extract_first()[7:]
item['phone'] = row.xpath('.//span[@class="phone-numbers"]/a/@href').extract_first()[4:]
person_detail = detail.xpath('.//span[@class="person-detail"]/text()').extract()
except ValueError:
pass
else:
person_detail_clean = ', '.join([pd.strip() for pd in person_detail[0].split(',')][1:])
item['person_detail'] = person_detail_clean
yield item
for循環中有2個關鍵詞,即'衝'和'趙'。我試圖將結果保存在CSV文件中。每次在parse_item函數的for循環中生成一個新項目。但是,只有'趙'正在被拯救。我無法弄清楚爲什麼。scrapy結果只保存一個項目在循環中
我檢查,衝一直沒有結果 –
[http://drexel.edu/search?t=people&q=chong](http://drexel.edu/search?t=people&q= chong)有166場比賽。 – user8314628