2015-11-18 72 views
1

我想使用CasperJS把表格中連續多個單元格的內部HTML(innerHTML)提取到一個列表中。我知道從表格中提取連續多個元素的屬性很容易,但我不知道如何提取它們的內部HTML。

標題:用CasperJS提取表格元素

爲了演示,這裏有一個簡單的HTML表:

<!-- Minimal fixture: one table with three rows. In each row the first cell
     carries a name attribute (a, b, c) and a number; the second cell holds
     the text hop / skip / jump. -->
<html> 
    <head></head> 
    <body> 
    <table> 
     <tbody> 
     <tr><td name="a">1</td><td>hop</td></tr> 
     <tr><td name="b">2</td><td>skip</td></tr> 
     <tr><td name="c">3</td><td>jump</td></tr> 
     </tbody> 
    </table> 
    </body> 
</html> 

這裏是一個完整的CasperJS程序,用於從該表格中提取資料:

"use strict"; 
// Create the Casper instance used by every step below.
var casper = require('casper').create(); 

// Load the local test page (presumably the sample HTML table saved as
// /tmp/casper-so.html - confirm the path before running).
casper.start('file:///tmp/casper-so.html'); 

// Goal: print the list '["a", "b", "c"]'
casper.then(function a1() {
    var cellSelector = 'table tr td[name]';
    // Collect the `name` attribute of every cell matched by the selector.
    var names = casper.getElementsAttribute(cellSelector, 'name');
    var report = JSON.stringify(names, null, 2);
    // Prints ["a", "b", "c"] as desired.
    console.log(report);
});

// Goal: print the list '["hop", "skip", "jump"]', i.e. the inner HTML of
// the second <td> of every table row.
casper.then(function a2() { 
    // `???` is an intentional placeholder (not valid JavaScript): it marks
    // the expression this question is asking for.
    var verbs = ???; 
    // What should go on the previous line in order to print 
    // ["hop", "skip", "jump"]? 
    console.log(JSON.stringify(verbs, null, 2)); 
}); 

// Execute the steps queued above with casper.then().
casper.run(); 

正如代碼註釋中所說,我知道如何使用casper.getElementsAttribute()提取所有td[name]元素的name屬性。但我還沒有想出一個簡單的方法來提取表格中指定列的內部HTML。有什麼建議嗎?

補充:我目前的做法是遍歷索引、每次只提取一個元素,使用類似 'table tr:nth-child(' + index + ') td:nth-child(2)' 的CSS選擇器,但這種做法相當折磨人。我希望找到更簡單的方法。

回答

4

這裏有一個解決方案,大量借鑑了CasperJS中getElementsAttribute()的定義:

// Print the list '["hop", "skip", "jump"]'.
casper.then(function a2() {
    // The callback passed to evaluate() runs inside the loaded page, where
    // the __utils__ helper is available; only its return value comes back.
    var verbs = casper.evaluate(function() {
        // Second cell of every table row.
        var cells = __utils__.findAll('table tr td:nth-child(2)');
        // Borrow Array's map so it also works on array-like collections.
        return Array.prototype.map.call(cells, function (cell) {
            return cell.innerHTML;
        });
    });
    var report = JSON.stringify(verbs, null, 2);
    console.log(report);
});
0

另一種解決方案是先獲取td元素的信息對象,然後從該對象中取出文本:

// Read the second cell of each row through getElementInfo(), which describes
// the FIRST element matching a selector. This must run where `this` is the
// Casper instance (e.g. inside a casper.then() step).
//
// :nth-of-type() counts an element among its own siblings, so every
// hop/skip/jump cell is td:nth-of-type(2) of its row; td:nth-of-type(4) or
// td:nth-of-type(6) matches nothing in a two-column table. The row is
// therefore selected explicitly with tr:nth-of-type(n).
//
// Variables are declared once with `var` so the snippet does not create
// implicit globals (which throw under "use strict").

// get hop - 2nd td of the 1st row
var tdObject = this.getElementInfo('tr:nth-of-type(1) td:nth-of-type(2)');
var tdTwoObjectText = tdObject.text.trim();

// get skip - 2nd td of the 2nd row
tdObject = this.getElementInfo('tr:nth-of-type(2) td:nth-of-type(2)');
var tdFourObjectText = tdObject.text.trim();

// get jump - 2nd td of the 3rd row
tdObject = this.getElementInfo('tr:nth-of-type(3) td:nth-of-type(2)');
var tdSixObjectText = tdObject.text.trim();