2017-10-10 122 views
0

我試圖使用 rvest 從這個網站(this site)下載一系列文件。文件名是有規律的,但下載網址沒有固定的模式(只是一長串數十位的字符),所以我無法根據任何規則構建下載網址列表。我如何使用鏈接名稱下載實際文件?(標題:使用 R 通過鏈接名稱獲取下載 URL)

到目前爲止,我可以得到感興趣的文件列表(基於CSS選擇器),並且我可以獲得網站上所有鏈接的列表,但我不知道如何匹配它們。我需要能夠檢查網站的變化並下載任何名稱更改的文件,因此使用文件名訪問文件非常重要。我對HTML/CSS不是很熟悉,所以這可能是我無法弄清楚這個可能簡單的任務的原因。

library(rvest) 

# Page that lists the downloadable files

url <- "http://www-air.larc.nasa.gov/cgi-bin/ArcView/actamerica.2016?C130=1"
doc <- read_html(url)

# Link text of every anchor matched by the CSS selector "td a"

all <- doc %>%
  html_nodes("td a") %>%
  html_text()

# Keep only the names that contain the file type of interest

filetype <- "PICARRO"
files <- grep(filetype, all, value = TRUE)

# Prints the href of every anchor on the page (selector "a", which is a
# different node set than "td a" above), so these are not yet matched
# up with the names in `all`

doc %>%
  html_nodes("a") %>%
  html_attr("href")

非常感謝您的幫助。

回答

1

略有不同的方法。

獲取所有可下載文件的文件名和 URL:

library(httr) 
library(rvest) 
library(tidyverse) 

# Parse the page that lists every downloadable file.
pg <- read_html("http://www-air.larc.nasa.gov/cgi-bin/ArcView/actamerica.2016?C130=1")

# Select only the anchors whose href contains the download endpoint.
fils <- html_nodes(pg, xpath = ".//a[contains(@href, 'cgi-bin/enzFile')]")

# Both columns are derived from the same node set, so row i always pairs a
# file name with its own link. The hrefs are prefixed with the host to form
# full URLs. tibble() is used instead of the deprecated data_frame().
xdf <- tibble(
  filename = html_text(fils),
  link = sprintf("http://www-air.larc.nasa.gov%s", html_attr(fils, "href"))
)

glimpse(xdf)
## Observations: 719 
## Variables: 2 
## $ filename <chr> "ACTAMERICA-Elevation_C130_20160711_R0.ict", "ACTAMERICA-Elevation_C130_20160715_R0.ict", "ACTAMERI... 
## $ link  <chr> "http://www-air.larc.nasa.gov/cgi-bin/enzFile?f49DA0512C4E81E3C01FDB44A33CD88AAFE2f7075622d6169722f... 

xdf
## # A tibble: 719 x 2 
## filename                                                                   link 
## <chr>                                                                   <chr> 
## 1 ACTAMERICA-Elevation_C130_20160711_R0.ict http://www-air.larc.nasa.gov/cgi-bin/enzFile?f49DA0512C4E81E3C01FDB44A33CD88AAFE2f7075622d6169722f414354414d45524943412f323031362f433133305f41495243524146542f444947414e47492e4a4f534855412f414354414d45524943412d456c65766174696f6e5f433133305f32303136303731315f52302e696374 
## 2 ACTAMERICA-Elevation_C130_20160715_R0.ict http://www-air.larc.nasa.gov/cgi-bin/enzFile?f49DA0512C4E81E3C01FDB44A33CD88AAFE2f7075622d6169722f414354414d45524943412f323031362f433133305f41495243524146542f444947414e47492e4a4f534855412f414354414d45524943412d456c65766174696f6e5f433133305f32303136303731355f52302e696374 
## 3 ACTAMERICA-Elevation_C130_20160718_R0.ict http://www-air.larc.nasa.gov/cgi-bin/enzFile?f49DA0512C4E81E3C01FDB44A33CD88AAFE2f7075622d6169722f414354414d45524943412f323031362f433133305f41495243524146542f444947414e47492e4a4f534855412f414354414d45524943412d456c65766174696f6e5f433133305f32303136303731385f52302e696374 
## 4 ACTAMERICA-Elevation_C130_20160719_R0.ict http://www-air.larc.nasa.gov/cgi-bin/enzFile?f49DA0512C4E81E3C01FDB44A33CD88AAFE2f7075622d6169722f414354414d45524943412f323031362f433133305f41495243524146542f444947414e47492e4a4f534855412f414354414d45524943412d456c65766174696f6e5f433133305f32303136303731395f52302e696374 
## 5 ACTAMERICA-Elevation_C130_20160721_R0.ict http://www-air.larc.nasa.gov/cgi-bin/enzFile?f49DA0512C4E81E3C01FDB44A33CD88AAFE2f7075622d6169722f414354414d45524943412f323031362f433133305f41495243524146542f444947414e47492e4a4f534855412f414354414d45524943412d456c65766174696f6e5f433133305f32303136303732315f52302e696374 
## 6 ACTAMERICA-Elevation_C130_20160722_R0.ict http://www-air.larc.nasa.gov/cgi-bin/enzFile?f49DA0512C4E81E3C01FDB44A33CD88AAFE2f7075622d6169722f414354414d45524943412f323031362f433133305f41495243524146542f444947414e47492e4a4f534855412f414354414d45524943412d456c65766174696f6e5f433133305f32303136303732325f52302e696374 
## 7 ACTAMERICA-Elevation_C130_20160725_R0.ict http://www-air.larc.nasa.gov/cgi-bin/enzFile?f49DA0512C4E81E3C01FDB44A33CD88AAFE2f7075622d6169722f414354414d45524943412f323031362f433133305f41495243524146542f444947414e47492e4a4f534855412f414354414d45524943412d456c65766174696f6e5f433133305f32303136303732355f52302e696374 
## 8 ACTAMERICA-Elevation_C130_20160726_R0.ict http://www-air.larc.nasa.gov/cgi-bin/enzFile?f49DA0512C4E81E3C01FDB44A33CD88AAFE2f7075622d6169722f414354414d45524943412f323031362f433133305f41495243524146542f444947414e47492e4a4f534855412f414354414d45524943412d456c65766174696f6e5f433133305f32303136303732365f52302e696374 
## 9 ACTAMERICA-Elevation_C130_20160727_R0.ict http://www-air.larc.nasa.gov/cgi-bin/enzFile?f49DA0512C4E81E3C01FDB44A33CD88AAFE2f7075622d6169722f414354414d45524943412f323031362f433133305f41495243524146542f444947414e47492e4a4f534855412f414354414d45524943412d456c65766174696f6e5f433133305f32303136303732375f52302e696374 
## 10 ACTAMERICA-Elevation_C130_20160801_R0.ict http://www-air.larc.nasa.gov/cgi-bin/enzFile?f49DA0512C4E81E3C01FDB44A33CD88AAFE2f7075622d6169722f414354414d45524943412f323031362f433133305f41495243524146542f444947414e47492e4a4f534855412f414354414d45524943412d456c65766174696f6e5f433133305f32303136303830315f52302e696374 
## # ... with 709 more rows 

篩選出你關心的文件:

# Keep only the rows whose file name matches "PICARRO".
picarro <- xdf %>%
  filter(grepl("PICARRO", filename))

下載它們:

# Download every selected file into the current working directory.
# - basename() strips any directory component from the scraped link text, so
#   a name taken from the remote page cannot redirect the write elsewhere.
# - mode = "wb" writes the bytes exactly as received; the default text mode
#   can alter line endings on Windows.
walk2(
  picarro$link,
  basename(picarro$filename),
  function(link, destfile) download.file(link, destfile, mode = "wb")
)
## trying URL 'http://www-air.larc.nasa.gov/cgi-bin/enzFile?f49DA0512C4E81E3C01FDB44A33CD88AAFE2f7075622d6169722f414354414d45524943412f323031362f433133305f41495243524146542f444947414e47492e4a4f534855412f414354414d45524943412d5049434152524f5f433133305f32303136303532375f52422e696374' 
## Content type 'text/plain' length 1023662 bytes (999 KB) 
## ================================================== 
## downloaded 999 KB 
## 
## trying URL 'http://www-air.larc.nasa.gov/cgi-bin/enzFile?f49DA0512C4E81E3C01FDB44A33CD88AAFE2f7075622d6169722f414354414d45524943412f323031362f433133305f41495243524146542f444947414e47492e4a4f534855412f414354414d45524943412d5049434152524f5f433133305f32303136303731315f52302e696374' 
## Content type 'text/plain' length 886392 bytes (865 KB) 
## ================================================== 
## downloaded 865 KB 
## 
## trying URL 'http://www-air.larc.nasa.gov/cgi-bin/enzFile?f49DA0512C4E81E3C01FDB44A33CD88AAFE2f7075622d6169722f414354414d45524943412f323031362f433133305f41495243524146542f444947414e47492e4a4f534855412f414354414d45524943412d5049434152524f5f433133305f32303136303731355f52302e696374' 
## Content type 'text/plain' length 530339 bytes (517 KB) 
## ================================================== 
## downloaded 517 KB 

1

可以考慮使用 purrr 的 map2 將兩個向量 `all` 和 `html_attr(html_nodes(doc, "a"), "href")` 結合起來,然後根據文件類型名稱進行篩選:

url <- "http://www-air.larc.nasa.gov/cgi-bin/ArcView/actamerica.2016?C130=1"
doc <- read_html(url)

# Take the link text and the href from the SAME node set. Pulling names from
# "td a" but hrefs from "a" yields vectors that can differ in length or
# order, so names would be paired with the wrong links (or pairing fails).
links <- html_nodes(doc, "td a")
all <- html_text(links)
href <- html_attr(links, "href")

# Build the name/link table directly from the two aligned vectors; there is
# no need to create one data frame per row and rbind them together.
# Column x holds the link text, column y the href as written in the page
# (presumably site-relative -- prefix the host before downloading; verify).
z <- data.frame(x = all, y = href, stringsAsFactors = FALSE)

# Keep only the rows whose link text matches the requested file type.
filetype <- "PICARRO"
z[grepl(filetype, z$x), ]