以下的 Scrapy CrawlSpider 基本運作正常，唯獨輸出的 URL（response.url）有問題：每筆記錄中 URL 重複出錯。
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class Spider2(CrawlSpider):
    """Crawl stltoday payroll pages and yield one record per salary-table row.

    Follows agency links under /apps/payrolls/salaries/ and scrapes the
    name / position / salary / hire-date columns, tagging each record with
    the page URL it came from.
    """
    # name of the spider
    name = 'newstl'
    # list of allowed domains
    allowed_domains = ['graphics.stltoday.com']
    # starting url for scraping
    start_urls = ['http://graphics.stltoday.com/apps/payrolls/salaries/agencies/']
    rules = [
        Rule(LinkExtractor(allow=['/apps/payrolls/salaries/.*/$']),
             callback='parse_item',
             follow=True),
    ]
    # setting the format and location of the output csv file
    custom_settings = {
        'FEED_FORMAT': "csv",
        'FEED_URI': 'tmp/stltoday1.csv',
    }

    def parse_item(self, response):
        """Extract one dict per table row on the page.

        Yields dicts with keys: url, name, position, salary, hiredate.
        """
        # Remove XML namespaces so the plain xpath expressions below match
        response.selector.remove_namespaces()
        # Extract the parallel column lists (one entry per table row)
        name = response.xpath('//th[@scope="row"]/text()').extract()
        position = response.xpath('//th[@scope="row"]/following-sibling::*[1]/text()').extract()
        salary = response.xpath('//th[@scope="row"]/following-sibling::*[2]/text()').extract()
        hiredate = response.xpath('//th[@scope="row"]/following-sibling::*[3]/text()').extract()
        # BUG FIX: response.url is a single string; including it in zip()
        # iterated it CHARACTER by character, so each output row got only
        # one character of the URL (and output was truncated to len(url)
        # rows). Keep the url out of the zip and repeat the whole URL on
        # every record instead.
        for row_name, row_position, row_salary, row_hiredate in zip(
                name, position, salary, hiredate):
            yield {
                'url': response.url,
                'name': row_name,
                'position': row_position,
                'salary': row_salary,
                'hiredate': row_hiredate,
            }
目前輸出的 CSV 裡，每一行只顯示了 URL 的一個字符（因為字符串被 zip() 逐字符迭代了）。有沒有辦法讓每筆記錄都重複顯示完整的 URL？