CrawlSpider:如何自動檢索AJAX調用的URL?目的是編寫一個能夠完成以下工作的CrawlSpider:
1)檢索屬於此頁的表中的鏈接的URL:http://cordis.europa.eu/fp7/security/projects_en.html
2)跟隨所有這些URL發出的AJAX調用,找到包含我想要抓取的數據的最終(「AJAX」)網址
3)抓取由AJAX URL標識的最終頁面。
到目前爲止,我已經用Scrapy寫了兩個蜘蛛:
1)第一個蜘蛛檢索起始頁面上各鏈接的URL。這裏是代碼:
from scrapy.spider import Spider
from scrapy.selector import HtmlXPathSelector
from cordis.items import CordisItem
class MySpider(Spider):
    """Spider that collects link URLs from the CORDIS FP7 security projects page."""

    name = "Cordis1"
    allowed_domains = ["cordis.europa.eu"]
    start_urls = ["http://cordis.europa.eu/fp7/security/projects_en.html"]

    def parse(self, response):
        """Return one ``CordisItem`` whose ``link`` field lists the extracted hrefs.

        Args:
            response: the downloaded page handed over by Scrapy.

        Returns:
            A ``CordisItem`` with ``item["link"]`` set to the list of every
            ``href`` matched by ``//ul/li/span/a/@href`` in the document.
        """
        hxs = HtmlXPathSelector(response)
        item = CordisItem()
        # The XPath starts with "//", i.e. it is absolute: it matches against
        # the whole document no matter which node it is evaluated from.
        # Running it once per <p> element therefore only recomputed the same
        # list, and left `item` unbound when the page had no <p> at all.
        item["link"] = hxs.select("//ul/li/span/a/@href").extract()
        return item
2)第二個蜘蛛從「AJAX」URL中抓取數據。下面是代碼:
from scrapy.spider import Spider
from scrapy.selector import Selector
class EssaiSpider(Spider):
    """Spider that extracts project details from CORDIS project-detail pages.

    For every start URL it reads the project header fields, the coordinator
    and up to ``MAX_PARTICIPANTS`` participants (elements with ids ``part1``
    .. ``partN``), then prints a selection of them on one line.
    """

    name = "aze"
    allowed_domains = ["cordis.europa.eu"]
    start_urls = [
        "http://cordis.europa.eu/projects/index.cfm?fuseaction=app.csa&action=read&xslt-template=projects/xsl/projectdet_en.xslt&rcn=95607",
        "http://cordis.europa.eu/projects/index.cfm?fuseaction=app.csa&action=read&xslt-template=projects/xsl/projectdet_en.xslt&rcn=93528",
    ]

    # Highest participant index that is looked up (ids "part1" .. "part40").
    MAX_PARTICIPANTS = 40
    # Participant indices whose name and nationality are printed.
    PRINTED_PARTICIPANTS = (1, 2, 5, 10, 20, 30, 40)

    def parse(self, response):
        """Extract the project fields from ``response`` and print them.

        Args:
            response: the downloaded project-detail page handed over by Scrapy.

        Returns:
            None. The extracted values are only printed, space-separated, in
            this order: acronym, short description, start, end, long
            description, cost, contribution, type, subject, coordinator,
            coordinator nationality, then (name, nationality) for each index
            in ``PRINTED_PARTICIPANTS``. Every value is the list returned by
            ``extract()`` and is empty when the XPath matches nothing.
        """
        sel = Selector(response)

        acronym = sel.xpath("//*[@class='projttl']/h1/text()").extract()
        short_desc = sel.xpath("//*[@class='projttl']/h2/text()").extract()
        start = sel.xpath("//*[@class='projdates']/b[1]/following::text()[1]").extract()
        end = sel.xpath("//*[@class='projdates']/b[2]/following::text()[1]").extract()
        long_desc = sel.xpath("//*[@class='tech']/p/text()").extract()
        cost = sel.xpath("//*[@class='box-left']/b[3]/following::text()[1]").extract()
        contrib = sel.xpath("//*[@class='box-left']/b[4]/following::text()[1]").extract()
        # Named project_type so the builtin `type` is not shadowed.
        project_type = sel.xpath("//*[@class='box-right']/p[3]/br/following::text()[1]").extract()
        sujet = sel.xpath("//*[@id='subjects']/h2/following::text()[1]").extract()
        coord = sel.xpath("//*[@class='projcoord']/div[1]/div[1]/text()").extract()
        coord_nat = sel.xpath("//*[@class='projcoord']/div[1]/div[2]/text()").extract()

        # Participant lookups are absolute, id-based queries on the whole
        # document, so they need no enclosing loop over the
        # class='participants' nodes. Building the id from the index keeps
        # each name/nationality pair tied to its own element; the hand-written
        # variables had drifted (part12 read 'part11', and part19's
        # nationality overwrote part2_nat).
        participants = {}
        for index in range(1, self.MAX_PARTICIPANTS + 1):
            base = "//*[@id='part%d']/div[1]" % index
            participants[index] = (
                sel.xpath(base + "/div[1]/text()").extract(),
                sel.xpath(base + "/div[2]/text()").extract(),
            )

        fields = [
            acronym, short_desc, start, end, long_desc, cost, contrib,
            project_type, sujet, coord, coord_nat,
        ]
        for index in self.PRINTED_PARTICIPANTS:
            fields.extend(participants[index])

        # A single parenthesised argument behaves the same under the
        # Python 2 print statement and the Python 3 print function, and
        # joining str() of each value reproduces the space-separated output
        # of `print a, b, ...`.
        print(" ".join(str(field) for field in fields))
對第一個蜘蛛產生的每個URL,我可以用Netbug過濾XHR請求,手動檢索出我(因爲缺乏更好的術語)稱之爲「AJAX」的網址。然後,我只需將這些「AJAX」網址提供給第二個Spider即可。
但是,有沒有可能自動檢索這些「AJAX」URL?
更一般地說,如何編寫一個執行上述所有三個操作的抓取蜘蛛?
這是 http://stackoverflow.com/questions/21730314/cannot-find-correct-xpath-under-shell/21732112?noredirect=1#21732112 的跟進問題 – user3301871