
The goal is to write a CrawlSpider that does the following; in particular, how can the AJAX-call URLs be retrieved automatically?

1) Retrieve the URLs of the links in the table on this page: http://cordis.europa.eu/fp7/security/projects_en.html

2) Follow the AJAX calls made from each of those URLs to find the final ("AJAX") URLs that contain the data I want to scrape.

3) Scrape the final pages identified by those AJAX URLs.

So far I have written two spiders under Scrapy:

1) The first one retrieves the URLs of the links on the start page. Here is the code:

from scrapy.spider import Spider
from scrapy.selector import HtmlXPathSelector
from cordis.items import CordisItem

class MySpider(Spider):
    name = "Cordis1"
    allowed_domains = ["cordis.europa.eu"]
    start_urls = ["http://cordis.europa.eu/fp7/security/projects_en.html"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        # the xpath is absolute, so a single item holding every project link is enough
        item = CordisItem()
        item["link"] = hxs.select("//ul/li/span/a/@href").extract()
        return item
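
CordisItem here is nothing more than an item with a link field; a minimal cordis/items.py for it would look roughly like this:

from scrapy.item import Item, Field

class CordisItem(Item):
    link = Field()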

2) The second one scrapes the data from the "AJAX" URLs. Here is the code:

from scrapy.spider import Spider
from scrapy.selector import Selector

class EssaiSpider(Spider):
    name = "aze"
    allowed_domains = ["cordis.europa.eu"]
    start_urls = ["http://cordis.europa.eu/projects/index.cfm?fuseaction=app.csa&action=read&xslt-template=projects/xsl/projectdet_en.xslt&rcn=95607",
                  "http://cordis.europa.eu/projects/index.cfm?fuseaction=app.csa&action=read&xslt-template=projects/xsl/projectdet_en.xslt&rcn=93528"]

    def parse(self, response):
        sel = Selector(response)
        acronym = sel.xpath("//*[@class='projttl']/h1/text()").extract()
        short_desc = sel.xpath("//*[@class='projttl']/h2/text()").extract()
        start = sel.xpath("//*[@class='projdates']/b[1]/following::text()[1]").extract()
        end = sel.xpath("//*[@class='projdates']/b[2]/following::text()[1]").extract()
        long_desc = sel.xpath("//*[@class='tech']/p/text()").extract()
        cost = sel.xpath("//*[@class='box-left']/b[3]/following::text()[1]").extract()
        contrib = sel.xpath("//*[@class='box-left']/b[4]/following::text()[1]").extract()
        type = sel.xpath("//*[@class='box-right']/p[3]/br/following::text()[1]").extract()
        sujet = sel.xpath("//*[@id='subjects']/h2/following::text()[1]").extract()
        coord = sel.xpath("//*[@class='projcoord']/div[1]/div[1]/text()").extract()
        coord_nat = sel.xpath("//*[@class='projcoord']/div[1]/div[2]/text()").extract()
        # the participant blocks are numbered part1 ... part40, so loop over the ids
        # instead of repeating the same two xpaths forty times
        participants = []
        for n in range(1, 41):
            part = sel.xpath("//*[@id='part%d']/div[1]/div[1]/text()" % n).extract()
            part_nat = sel.xpath("//*[@id='part%d']/div[1]/div[2]/text()" % n).extract()
            participants.append((part, part_nat))
        print acronym, short_desc, start, end, long_desc, cost, contrib, type, sujet, coord, coord_nat, participants

For each URL produced by the first spider, I can manually retrieve what, for lack of a better term, I call the "AJAX" URL by filtering the XHR requests with Netbug. I then simply feed these "AJAX" URLs to the second spider.

But is it possible to retrieve these "AJAX" URLs automatically?

More generally, how can I write a crawl spider that performs all three of the operations described above?


This is a follow-up question to http://stackoverflow.com/questions/21730314/cannot-find-correct-xpath-under-shell/21732112?noredirect=1#21732112 – user3301871

Answer


Yes, it is possible to retrieve those URLs automatically, but you have to figure out what the URL of the AJAX-loaded content is. Here is a quick tutorial.

1. Do your research

In the Chrome console, if you open the Network tab and filter by XHR requests, you get an "Initiator" column. On the right you have the JavaScript file that contains the code responsible for generating the request, and the console shows the line from which the request is made.

[screenshot: Chrome Network tab showing the XHR request and its initiator]

In your case, the most important part of the code is in the jQuery file projects.js, at line 415, which says something like this:

$.ajax({ 
     async:  true, 
     type:  'GET', 
     url:  URL, 

As you can see, there is a URL variable here. You need to find where it is put together, a few lines above:

var URL = '/projects/index.cfm?fuseaction=app.csa'; // production

switch(type) {
    ...
    case 'doc':
        URL += '&action=read&xslt-template=projects/xsl/projectdet_' + I18n.locale + '.xslt&rcn=' + me.ref;
        break;
}

So the generated URL starts from a base URL, appends a string beginning with the action, and then the two variables I18n.locale and me.ref. Keep in mind that this URL is relative, so you need to prepend the site root.
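
To make the concatenation concrete, here is a small Python sketch that rebuilds the same URL from those pieces (the rcn value 95607 is taken from one of the URLs in your question, purely as an example):

from urlparse import urljoin

site_root = "http://cordis.europa.eu"
base = "/projects/index.cfm?fuseaction=app.csa"   # the relative base URL from projects.js
locale = "en"                                     # what I18n.locale resolves to on the English pages
ref = "95607"                                     # me.ref, read from the hidden REF input (see below)

# same concatenation as the 'doc' case of the switch above
relative = base + "&action=read&xslt-template=projects/xsl/projectdet_" + locale + ".xslt&rcn=" + ref
print urljoin(site_root, relative)
# http://cordis.europa.eu/projects/index.cfm?fuseaction=app.csa&action=read&xslt-template=projects/xsl/projectdet_en.xslt&rcn=95607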

I18n.locale turns out to be just a locale string ("en" here), but where does me.ref come from?

Again, Ctrl+F in the console's Sources tab and you will find this jQuery line:

// record reference
me.ref = $("#PrjSrch>input[name='REF']").val();

It turns out there is a hidden form present on every one of those pages, and each generated request takes the value of me.ref from that form's REF field.

Now you just need to apply this knowledge in your Scrapy project.

2. Use your knowledge in the Scrapy spider

At this point you know what you have to do. You need to start from the start URL listing all the projects, get all the links, generate requests for those links, then extract the AJAX URL from the content received for each of those requests, and generate a request for the URL obtained that way.

from scrapy.selector import Selector
from scrapy.spider import Spider
from scrapy.http import Request
from eu.items import EuItem
from urlparse import urljoin


class CordisSpider(Spider):
    name = 'cordis'
    start_urls = ['http://cordis.europa.eu/fp7/security/projects_en.html']
    base_url = "http://cordis.europa.eu/projects/"
    # template string for the ajax request, based on what we know from investigating the webpage
    base_ajax_url = "http://cordis.europa.eu/projects/index.cfm?fuseaction=app.csa&action=read&xslt-template=projects/xsl/projectdet_en.xslt&rcn=%s"

    def parse(self, response):
        """
        Extract the project links from start_urls, generate a GET request
        for each of them, and assign self.get_ajax_content to handle the response.
        """
        hxs = Selector(response)
        links = hxs.xpath("//ul/li/span/a/@href").extract()
        for link in links:
            link = urljoin(self.base_url, link)
            yield Request(url=link, callback=self.get_ajax_content)

    def get_ajax_content(self, response):
        """
        Extract the AJAX link and make a GET request
        for the desired content, assigning a callback
        to handle the response from that request.
        """
        hxs = Selector(response)
        # xpath analogy of the jquery line we've seen
        ajax_ref = hxs.xpath('//form[@id="PrjSrch"]//input[@name="REF"]/@value').extract()
        ajax_ref = "".join(ajax_ref)
        ajax_url = self.base_ajax_url % (ajax_ref,)
        yield Request(url=ajax_url, callback=self.parse_items)

    def parse_items(self, response):
        """
        The response here should contain the content
        normally loaded asynchronously with AJAX.
        """
        xhs = Selector(response)
        # you can do your processing here
        title = xhs.xpath("//div[@class='projttl']//text()").extract()
        i = EuItem()
        i["title"] = title
        return i
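
One thing not shown above is eu/items.py; since only the title field is used here, a minimal item definition like the following is enough (extend it with whatever fields you actually want to keep):

from scrapy.item import Item, Field

class EuItem(Item):
    title = Field()

Then run the spider with something like scrapy crawl cordis -o projects.json to check the scraped output.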

Thank you very much for the detailed answer. I will look into it and get back to you as soon as possible! – user3301871