2014-04-24 26 views
0

我想前往 this page,並從每個鏈接抓取每篇論文的「標題」和「作者」。到目前爲止,我寫了下面的代碼(我需要幫助的問題已寫在代碼內的註釋中):需要 CasperJS 抓取方面的幫助

var utils = require('utils');

// Single CasperJS instance used for the whole crawl. The page settings set
// loadImages and loadPlugins to false and override the user-agent string;
// lib/jquery.min.js is listed as a client script.
var casper = require('casper').create({
    clientScripts: ['lib/jquery.min.js'],
    pageSettings: {
        userAgent: 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36',
        loadPlugins: false,
        loadImages: false
    },
    logLevel: 'error',
    verbose: true
});

// Shared crawl state, read and written by the functions below.
var thesis_data = []; // one {title, author} record per visited thesis page
var links = [];       // thesis page URLs, filled in by the casper.start callback
var i = 0;            // index into `links` of the next page to visit

/**
 * Collects the href of every thesis link on the browse listing.
 *
 * This function is handed to casper.evaluate(), so it only relies on the
 * page's own `document` global.
 *
 * @returns {string[]} The raw `href` attribute of each matched anchor, in
 *     document order.
 */
function getThesisLinks() {
    // An empty selector makes querySelectorAll throw, so nothing was ever
    // collected. Match the anchor inside the third <td> of each row of the
    // table with class "misctable" instead.
    var anchors = document.querySelectorAll(
        'table.misctable > tbody > tr > td:nth-of-type(3) > a');
    return [].map.call(anchors, function(anchor) {
        return anchor.getAttribute('href');
    });
}

/**
 * Visits the thesis pages one at a time.
 *
 * Each call handles links[i], advances i, and passes itself to run() so it
 * is called again; once i has moved past the last link it dumps thesis_data
 * and exits. Expects `this` to be the casper instance.
 */
function loopThroughThesisLinks() {
    // Nothing left to visit: print the collected records and stop.
    if (!(i < links.length)) {
        utils.dump(thesis_data);
        this.exit();
        return;
    }

    var currentLink = links[i];
    this.echo('[LINK #' + i + '] ' + currentLink);
    getThesisData.call(this, currentLink);
    i++;

    // Re-enter this function after the run started above has completed.
    this.run(loopThroughThesisLinks);
}

/**
 * Opens one thesis page and appends its title and author(s) to thesis_data.
 *
 * Expects `this` to be the casper instance. The scraping happens in the
 * callback passed to start(), so the record is pushed only when that
 * callback is invoked.
 *
 * @param {string} link URL of the thesis page to open.
 */
function getThesisData(link) {
    this.start(link, function() {
        // The empty-string selectors used before could never match anything.
        // CasperJS accepts {type: 'xpath', path: ...} objects as selectors,
        // so no extra require() is needed here.

        // Second cell of the table row(s) whose first cell contains "Title:".
        var title = this.fetchText({
            type: 'xpath',
            path: '//table//tr/td[1][contains(text(),"Title:")]/../td[2]'
        });

        // Second cell of the table row(s) whose first cell contains "Authors:".
        var author = this.fetchText({
            type: 'xpath',
            path: '//table//tr/td[1][contains(text(),"Authors:")]/../td[2]'
        });

        // Add the title & author data to the thesis_data array
        thesis_data.push({
            title: title,
            author: author
        });
    });
}

// Entry point: open the browse listing, collect the thesis links from it,
// then hand control to loopThroughThesisLinks to visit them one by one.
casper.start('http://ses.library.usyd.edu.au/handle/2123/345/browse?type=dateissued&sort_by=2&order=DESC&rpp=1495&etal=0&submit_browse=Update', function() {
    // evaluate() can come back null/undefined (e.g. when the in-page function
    // throws); fall back to an empty list so the `.length` reads here and in
    // loopThroughThesisLinks do not fail.
    links = this.evaluate(getThesisLinks) || [];

    // Convert relative links to absolute URLs.
    // NOTE(review): if the scraped hrefs already start with "/handle/", this
    // prefix duplicates that path segment — confirm against the live page.
    // The counter is named `n` so it does not shadow the module-level `i`.
    for (var n = 0; n < links.length; n++) {
        links[n] = "http://ses.library.usyd.edu.au/handle/" + links[n];
    }

    utils.dump(links);
});

casper.run(loopThroughThesisLinks);

任何援助將不勝感激。

+0

請提供更多細節,例如具體遇到了什麼問題。 – AsTeR

回答

1

這是一個能選中所有鏈接的簡單CSS選擇器:

// Anchor inside the third <td> of each row of the table with class "misctable".
var thesisLinkSelector = 'table.misctable > tbody > tr > td:nth-of-type(3) > a';
var links = document.querySelectorAll(thesisLinkSelector);

您還可以像這樣使用XPath:

// Short alias for CasperJS's selectXPath helper; this line goes to the
// beginning of the file.
var x = require('casper').selectXPath;
// Fetch the text of the second cell of the table row(s) whose first cell's
// text contains "Title:".
var title = this.fetchText(x('//table//tr/td[1][contains(text(),"Title:")]/../td[2]')); 

我認爲你可以自行寫出作者的查詢。我可能會換一種方式進行爬取,即在循環中使用casper.thenOpen,因爲目前的寫法很難閱讀:額外的start和run調用分散在不同的函數中。

使用casper.thenOpen的話,它應該是這樣的:

var x = require('casper').selectXPath; // goes to the beginning of the file 

/**
 * Visits the thesis pages one at a time.
 *
 * Each call handles links[i], advances i, and queues itself again with
 * then(); once i has moved past the last link it dumps thesis_data and
 * exits. Expects `this` to be the casper instance.
 *
 * NOTE(review): because the next iteration is queued with then(), this
 * presumably has to run as a step of an already-running casper instance —
 * confirm how it is invoked.
 */
function loopThroughThesisLinks() {
    // Nothing left to visit: print the collected records and stop.
    if (!(i < links.length)) {
        utils.dump(thesis_data);
        this.exit();
        return;
    }

    var currentLink = links[i];
    this.echo('[LINK #' + i + '] ' + currentLink);
    getThesisData.call(this, currentLink);
    i++;

    // Queue the next iteration.
    this.then(loopThroughThesisLinks);
}

/**
 * Queues a visit to one thesis page and, in the thenOpen() callback, appends
 * that page's title and author(s) to thesis_data.
 *
 * Expects `this` to be the casper instance and `x` to be the XPath selector
 * helper declared at the top of the file.
 *
 * @param {string} link URL of the thesis page to open.
 */
function getThesisData(link) {
    this.thenOpen(link, function scrapeThesisPage() {
        // Each value is the text of the second cell of the table row(s)
        // whose first cell contains the given label.
        var record = {
            title: this.fetchText(x('//table//tr/td[1][contains(text(),"Title:")]/../td[2]')),
            author: this.fetchText(x('//table//tr/td[1][contains(text(),"Authors:")]/../td[2]'))
        };

        thesis_data.push(record);
    });
}