require(httr)
require(XML)
# Site root; a single httr handle is created for it and reused by every
# request below.
basePage <- "http://capitol.hawaii.gov"
h <- handle(basePage)
# First request goes to the site root through the shared handle
# (presumably to establish the session before asking for the report --
# confirm against the site's behaviour).
GET(handle = h)

# Request the report selected by the query string (year 2013, "deadline"
# report, measure type "hb").
reportPath <- "/advreports/advreport.aspx?year=2013&report=deadline&rpt_type=&measuretype=hb&title=House"
res <- GET(handle = h, path = reportPath)

# Parse the page and flag the rows of the GridViewReports table whose third
# cell contains the text "Transmitted to Governor".
resXML <- htmlParse(content(res, as = "text"))
statusXPath <- '//*/table[@id ="GridViewReports"]/tr/td[3]'
resTable <- getNodeSet(resXML, statusXPath)
appRows <- sapply(resTable, xmlValue)
include <- grepl("Transmitted to Governor", appRows)

# Pull the href attributes found in the second cell of each row and keep
# only those belonging to the flagged rows.
linkXPath <- '//*/table[@id ="GridViewReports"]/tr/td[2]//@href'
resUrls <- xpathSApply(resXML, linkXPath)
appUrls <- resUrls[include]

# Follow just the first selected link and show every node whose text
# contains "Passed Final Reading".
res <- GET(handle = h, path = appUrls[1])
resXML <- htmlParse(content(res, as = "text"))
voteXPath <- '//*[text()[contains(.,"Passed Final Reading")]]'
xpathSApply(resXML, voteXPath, xmlValue)
[1] "Passed Final Reading as amended in SD 2 with Representative(s) Fale, Jordan,
Tsuji voting aye with reservations; Representative(s) Cabanilla, Morikawa, Oshiro,
Tokioka voting no (4) and none excused (0)."
讓 httr 套件透過建立 handle 來處理所有的後臺工作。
如果你想對全部 92 個鏈接執行同樣的操作:
# Fetch every selected bill page through the shared handle `h` and return
# the responses as a list (this takes some time).
# The loop runs over positions rather than URL values so that the progress
# line is printed exactly once per request, even when the same URL occurs
# more than once in appUrls, and without re-scanning appUrls on every step.
res <- lapply(seq_along(appUrls), function(i) {
  # Progress output, kept for sanity while the requests run.
  print(sprintf("Got url no. %d", i))
  GET(handle = h, path = appUrls[[i]])
})
# Carry over the element names of appUrls, exactly as lapply() over appUrls
# itself would have done.
names(res) <- names(appUrls)

# Parse each response body into an HTML document.
resXML <- lapply(res, function(x) {
  htmlParse(content(x, as = "text"))
})

# For each page, collect the text of every node containing
# "Passed Final Reading". lapply() is used instead of sapply() so that
# appString is always a list with one entry per page; sapply() would
# collapse it to a vector or matrix whenever every page happened to yield
# the same number of matches.
appString <- lapply(resXML, function(x) {
  xpathSApply(x, '//*[text()[contains(.,"Passed Final Reading")]]', xmlValue)
})
head(appString)
> head(appString)
$href
[1] "Passed Final Reading as amended in SD 2 with Representative(s) Fale, Jordan, Tsuji voting aye with reservations; Representative(s) Cabanilla, Morikawa, Oshiro, Tokioka voting no (4) and none excused (0)."
$href
[1] "Passed Final Reading, as amended (CD 1). 25 Aye(s); Aye(s) with reservations: none . 0 No(es): none. 0 Excused: none."
[2] "Passed Final Reading as amended in CD 1 with Representative(s) Cullen, Har voting aye with reservations; Representative(s) McDermott voting no (1) and none excused (0)."
$href
[1] "Passed Final Reading, as amended (CD 1). 25 Aye(s); Aye(s) with reservations: none . 0 No(es): none. 0 Excused: none."
[2] "Passed Final Reading as amended in CD 1 with none voting aye with reservations; Representative(s) Hashem, McDermott voting no (2) and none excused (0)."
$href
[1] "Passed Final Reading, as amended (CD 1). 24 Aye(s); Aye(s) with reservations: none . 0 No(es): none. 1 Excused: Ige."
[2] "Passed Final Reading as amended in CD 1 with none voting aye with reservations; none voting no (0) and Representative(s) Say excused (1)."
$href
[1] "Passed Final Reading, as amended (CD 1). 25 Aye(s); Aye(s) with reservations: none . 0 No(es): none. 0 Excused: none."
[2] "Passed Final Reading as amended in CD 1 with Representative(s) Johanson voting aye with reservations; none voting no (0) and none excused (0)."
$href
[1] "Passed Final Reading, as amended (CD 1). 25 Aye(s); Aye(s) with reservations: none . 0 No(es): none. 0 Excused: none."
[2] "Passed Final Reading as amended in CD 1 with none voting aye with reservations; none voting no (0) and none excused (0)."
我建議你看看這個討論串,我當時也在嘗試學習如何抓取(scrape)網站。 http://www.talkstats.com/showthread.php/26153-Still-trying-to-learn-to-scrape?highlight=still+learning+to+scrape –
我在這上面花了幾個小時,這並不容易 :( 你可以獲取第一頁的內容,但第二個頁面不接受我傳遞的 `__VIEWSTATE` 和其他一些參數([如這裏所示](http://stackoverflow.com/questions/15853204/how-))。我可以做到 `resp <- GET("http://capitol.hawaii.gov/advreports/advreport.aspx?year=2013&report=deadline&rpt_type=&measuretype=hb&title=House Bills"); writeBin(content(resp, 'raw'), tf); readHTMLTable(tf)$GridViewReports`,但第二個站點就讓它失敗了 :( –