2016-08-19 51 views
2

我想用 R 把一組 Jekyll 主題的原始碼(source)和示範(demo)網址提取到一個 data.frame 中,但其中有些主題缺少示範網址

library(rvest) 

# Download and parse the Jekyll wiki page that lists the themes.
info <- read_html("https://github.com/jekyll/jekyll/wiki/themes")

# Select the <li> elements matched by the CSS selector (list items nested
# under the element with id "wiki-body").
data <- html_nodes(info, " #wiki-body li")

# Print the selected node set.
data
{xml_nodeset (115)} 


[11] <li>Typewriter - (<a href="https://github.com/alixedi/typewriter">source</a>, <a href="http://alixedi.github.io/typewriter">demo</a>)</li> 
[12] <li>block-log - (<a href="https://github.com/anandubajith/block-log">source</a>), <a href="https://anandu.net/demo/block-log/">demo</a>)</li> 
[13] <li>Otter Pop - (<a href="https://github.com/tybenz/otter-pop">source</a>)</li> 

所以我想把這個列表中提取出的 href 整理成一個含 3 欄的 data.frame(df),例如:

name  source          demo 
Typewriter https://github.com/alixedi/typewriter   http://alixedi.github.io/typewriter 

我可以把所有的 href 提取成一個向量(vector),但是如 [13] 所示,有些主題沒有 demo 連結,所以我就卡住了。

有沒有簡單的方法可以從這些資料建立 df?

# List items whose text contains both "source" and "demo".
withDemo <- html_nodes(
    info,
    xpath = "//li[contains(., 'source') and contains(., 'demo')]"
)

# List items whose text contains "source" but not "demo".
withoutDemo <- html_nodes(
    info,
    xpath = "//li[contains(., 'source') and not(contains(.,'demo'))]"
)

然後,用這些節點集合建立資料框;或許可以使用 purrr 套件

回答

3

這是一個 purrr 風格的解答:

library(rvest) 
library(purrr) 
library(dplyr) 

# Download and parse the Jekyll wiki page that lists the themes.
info <- read_html("https://github.com/jekyll/jekyll/wiki/themes")

# Select the <li> elements that sit two levels below the <div> whose class
# attribute is exactly 'markdown-body' (one <li> per theme).
themes <- info %>%
  html_nodes(xpath = ".//div[@class='markdown-body']/*/li")

# Return NA for a zero-length input, otherwise the first element of `x`.
#
# x: any vector (or NULL).
# Returns: NA when `x` is empty, else `x[[1]]` with its class preserved.
#
# A plain if/else is used instead of ifelse(): ifelse() is meant for
# vectorised tests and strips attributes (e.g. Date or factor class) from
# the value it returns.
zero_to_na <- function(x) {
  if (length(x) == 0L) NA else x[[1L]]
}

# Build one row per theme: the display name plus the hrefs of its "source"
# and "demo" links.
# - tibble() replaces the deprecated data_frame().
# - Both link lookups go through zero_to_na(), so a theme with no matching
#   link yields NA instead of making map_chr() fail on a zero-length result.
# - as.character() turns the logical NA into NA_character_, so map_chr()
#   always receives a character value.
df <- tibble(
  # Strip the trailing " - (source, demo)" style suffix from the item text.
  name = gsub(" [- ]*\\(.*$", "", html_text(themes)),
  source = map_chr(
    themes,
    ~ as.character(zero_to_na(
      html_attr(html_nodes(., xpath = ".//a[contains(., 'source')]"), "href")
    ))
  ),
  demo = map_chr(
    themes,
    ~ as.character(zero_to_na(
      html_attr(html_nodes(., xpath = ".//a[contains(., 'demo')]"), "href")
    ))
  )
)

glimpse(df)
## Observations: 115 
## Variables: 3 
## $ name <chr> "Jalpc", "Pixyll", "Jekyll Metro", "Midnight", "Leap Day", "F... 
## $ source <chr> "https://github.com/Jack614/jalpc_jekyll_theme", "https://git... 
## $ demo <chr> "http://www.jack003.com", "http://pixyll.com/", "http://blog-... 

另一種寫法:

# Alternative: build a one-row tibble per theme and row-bind the results.
# - tibble() replaces the deprecated data_frame().
# - `source` is guarded with zero_to_na() just like `demo`; an unguarded
#   zero-length `source` would produce a zero-row tibble and silently drop
#   the theme.
# - as.character() keeps both link columns character even when the value is
#   NA, so every per-theme tibble has the same column types.
map_df(themes, function(x) {
  tibble(
    name = gsub(" [- ]*\\(.*$", "", html_text(x)),
    source = as.character(zero_to_na(
      html_attr(html_nodes(x, xpath = ".//a[contains(., 'source')]"), "href")
    )),
    demo = as.character(zero_to_na(
      html_attr(html_nodes(x, xpath = ".//a[contains(., 'demo')]"), "href")
    ))
  )
})

至於「name」中不想要的部分,可以用 gsub/sub 等函式去除。

+0

感謝提供包含 purrr 函式的更好方案。purrr 的學習曲線相當陡峭 – pssguy

4
# Build `data_out` (columns: name, source, demo) with one row per <li> node
# in `data`.
# - Rows are created with lapply() and bound once, instead of growing the
#   result with rbind() inside a `1:length(data)` loop.
# - Link targets are read with html_attr(), not recovered by regex-stripping
#   the serialized <a> tags, so extra attributes on a tag cannot leak into
#   the URL.
# - The first child supplies `source` and the second `demo`; a node with
#   fewer children gets NA in the missing position.
# - The name cleanup removes everything from the " - (" / " (" suffix on, so
#   items that only have "(source)" are cleaned as well.
rows <- lapply(data, function(node) {
    links <- html_attr(html_children(node), "href")
    data.frame(
        name = sub(" [- ]*\\(.*$", "", html_text(node)),
        source = links[1],
        demo = links[2],
        stringsAsFactors = FALSE
    )
})
data_out <- do.call(rbind, rows)
+0

感謝提供另一個替代 – pssguy

2

可以只用 XPath 把有 demo 連結的項目和沒有 demo 連結的項目分成兩組收集,再為同時有 source 和 demo 連結的那一組建立資料框:

# Data frame (name, source, demo) for the list items that have a demo link.
# The hrefs are looked up per node by link text instead of reshaping all
# child hrefs into a 2-column matrix: the matrix approach silently misaligns
# rows as soon as one <li> does not have exactly two child elements.
# Indexing with [1] yields NA_character_ when no matching link exists.
sourceNdemo <- data.frame(
    name = html_text(withDemo),  # full item text; cleaned up later
    source = vapply(withDemo, function(node) {
        html_attr(html_nodes(node, xpath = ".//a[contains(., 'source')]"), "href")[1]
    }, character(1)),
    demo = vapply(withDemo, function(node) {
        html_attr(html_nodes(node, xpath = ".//a[contains(., 'demo')]"), "href")[1]
    }, character(1)),
    stringsAsFactors = FALSE
)

然後,為只有 source 連結的項目建立資料框:

# Data frame (name, source, demo) for the list items without a demo link.
# The source href is looked up per node, so an <li> with zero or several
# child elements can no longer cause a length mismatch or silent recycling
# in data.frame(). Indexing with [1] yields NA_character_ when no link exists.
# demo is a character NA so the columns match sourceNdemo for rbind().
source <- data.frame(
    name = html_text(withoutDemo),
    source = vapply(withoutDemo, function(node) {
        html_attr(html_nodes(node, xpath = ".//a[contains(., 'source')]"), "href")[1]
    }, character(1)),
    demo = NA_character_,
    stringsAsFactors = FALSE
)

用 rbind 合併兩個資料框:

# Stack the rows of sourceNdemo on top of the rows of source into one data frame.
allInfo <- rbind(sourceNdemo, source) 

「name」欄現在包含像「Jalpc - (source, demo)」和「Bitwiser Material (source, demo)」這樣的項目。你可以用 sub/gsub 去掉多餘的「(source, demo)」部分:

# Remove the trailing link list from each name: a whitespace character, an
# optional "- ", then "(" and everything after it.
allInfo[["name"]] <- sub(
  pattern = "\\s(-\\s)?\\(.+$",
  replacement = "",
  x = allInfo[["name"]],
  perl = TRUE
)
+0

這似乎是最快的方法,對於較大的輸入資料可能會有幫助 – pssguy