2017-07-31 33 views
1

以下Scrapy crawlspider工作正常,除了URL的輸出(response.url):Scrapy:重複Response.URL每個記錄

import scrapy 
from scrapy.spiders import CrawlSpider, Rule 
from scrapy.linkextractors import LinkExtractor 

class Spider2(CrawlSpider): 
    #name of the spider 
    name = 'newstl' 

    #list of allowed domains 
    allowed_domains = ['graphics.stltoday.com'] 

    #starting url for scraping 
    start_urls = ['http://graphics.stltoday.com/apps/payrolls/salaries/agencies/'] 

    rules = [ 
    Rule(LinkExtractor(
     allow=['/apps/payrolls/salaries/.*/$']), 
     callback='parse_item', 
     follow=True), 
    ] 

    #setting the location of the output csv file 
    custom_settings = { 
     'FEED_FORMAT' : "csv", 
     'FEED_URI' : 'tmp/stltoday1.csv' 
    } 

    def parse_item(self, response): 
     #Remove XML namespaces 
     response.selector.remove_namespaces() 

     #Extract article information 
     name = response.xpath('//th[@scope="row"]/text()').extract() 
     position = response.xpath('//th[@scope="row"]/following-sibling::*[1]/text()').extract() 
     salary = response.xpath('//th[@scope="row"]/following-sibling::*[2]/text()').extract() 
     hiredate = response.xpath('//th[@scope="row"]/following-sibling::*[3]/text()').extract() 
     url = response.url 

     for item in zip(name,position, salary, hiredate, url): 
      scraped_info = { 
       'url' : item[4], 
       'name' : item[0], 
       'position' : item[1], 
       'salary' : item[2], 
       'hiredate' : item[3] 
      } 
      yield scraped_info 

在輸出被顯示的URL的1個字符CSV的每一行。任何方式使它重複每個記錄的整個URL?

回答

2

你不應該荏苒url,只是把它直接:

url = response.url 
for item in zip(name, position, salary, hiredate): 
    yield { 
     'url' : url, 
     'name' : item[0], 
     'position' : item[1], 
     'salary' : item[2], 
     'hiredate' : item[3] 
    } 

而且,代替多次遍歷整個樹,遍歷結果行,並從每個項目的情況下獲得所需的信息:

for row in response.xpath('//th[@scope="row"]'): 
    yield { 
     "url": url, 
     "name": row.xpath('./text()').extract_first(), 
     "position": row.xpath('./following-sibling::*[1]/text()').extract_first(), 
     "salary": row.xpath('./following-sibling::*[2]/text()').extract_first(), 
     "hiredate": row.xpath('./following-sibling::*[3]/text()').extract_first(), 
    }