2015-06-26 68 views
3

我有一個由 data.frame 組成的列表,想在其中特定的欄位上高效地套用函數。例如:

# Reproducible example data: a list of five data frames that all share the
# same `id` column and carry randomly drawn numeric columns.
set.seed(1)
ids <- c("a", "b", "c", "d", "e")
my_list <- vector("list", 5)
for (i in seq_along(my_list)) {
  # Columns are drawn in this exact order so the seeded values are reproducible.
  my_list[[i]] <- data.frame(
    id = ids,
    p = rnorm(length(ids)),
    m = rnorm(length(ids)),
    hp = runif(length(ids)),
    hm = runif(length(ids)),
    d = rnorm(length(ids)),
    a = rnorm(length(ids))
  )
}

我想高效地計算列表中所有 data.frame 裡,每個 id(「id」欄)對應的「p」、「m」、「d」和「a」欄的方差。理想情況下,這將返回一個像這樣的 data.frame(基於上面產生的值):

> result.df 
    id  var_p  var_m  var_d  var_a 
1 a 0.2371569 1.7810729 0.08264279 0.5074250 
2 b 0.1091675 0.2107997 1.15051229 1.1578691 
3 c 0.5385789 0.765… 0.44215343 0.3137903 
4 d 1.0174542 0.7818498 0.06414317 0.6079849 
5 e 0.7343667 1.2870542 1.41615858 0.7362462 

回答

3

使用my_list

library(plyr)
# Stack every data frame in the list into one long data frame.
df <- do.call(rbind, my_list)
# For each id, apply var() column-wise to the four columns of interest.
out <- ddply(
  df,
  .(id),
  colwise(var, c("p", "m", "d", "a"))
)

#> out 
# id   p   m   d   a 
#1 a 0.2371569 1.7810729 0.08264279 0.5074250 
#2 b 0.1091675 0.2107997 1.15051229 1.1578691 
#3 c 0.5385789 0.765… 0.44215343 0.3137903 
#4 d 1.0174542 0.7818498 0.06414317 0.6079849 
#5 e 0.7343667 1.2870542 1.41615858 0.7362462 

或者採用基礎 R 的替代方案,組合使用 lapply 和 apply

# Stack every data frame in the list into one long data frame.
df <- do.call(rbind, my_list)
# Split by id, take the column-wise variance of p, m, d and a within each
# piece, then bind the per-id vectors into a matrix (one row per id).
df1 <- do.call(
  rbind,
  lapply(
    split(df, df$id),
    function(grp) apply(grp[, c("p", "m", "d", "a")], 2, var)
  )
)

# Convert to a data frame and expose the row names (the ids) as a column.
out <- transform(df1, id = row.names(df1))

#> out 
#   p   m   d   a id 
#a 0.2371569 1.7810729 0.08264279 0.5074250 a 
#b 0.1091675 0.2107997 1.15051229 1.1578691 b 
#c 0.5385789 0.765… 0.44215343 0.3137903 c 
#d 1.0174542 0.7818498 0.06414317 0.6079849 d 
#e 0.7343667 1.2870542 1.41615858 0.7362462 e 

或使用doBy

library(doBy)
# Stack every data frame in the list into one long data frame.
df <- do.call(rbind, my_list)
# Variance of p, m, d and a within each id; keep.names = TRUE keeps the
# original column names in the result.
out <- summaryBy(
  p + m + d + a ~ id,
  data = df,
  FUN = var,
  keep.names = TRUE
)

#> out 
# id   p   m   d   a 
#1 a 0.2371569 1.7810729 0.08264279 0.5074250 
#2 b 0.1091675 0.2107997 1.15051229 1.1578691 
#3 c 0.5385789 0.765… 0.44215343 0.3137903 
#4 d 1.0174542 0.7818498 0.06414317 0.6079849 
#5 e 0.7343667 1.2870542 1.41615858 0.7362462 

或者用sqldf

library(sqldf)
# Stack every data frame in the list into one long data frame.
df <- do.call(rbind, my_list)
# Grouped variance expressed in SQL. The query text is left untouched because
# the result's column names (e.g. "variance(p)") are taken from it.
out <- sqldf("select id, variance(p), variance(m), 
      variance(d), variance(a) from df group by id")

#> out 
# id variance(p) variance(m) variance(d) variance(a) 
#1 a 0.2371569 1.7810729 0.08264279 0.5074250 
#2 b 0.1091675 0.2107997 1.15051229 1.1578691 
#3 c 0.5385789 0.765… 0.44215343 0.3137903 
#4 d 1.0174542 0.7818498 0.06414317 0.6079849 
#5 e 0.7343667 1.2870542 1.41615858 0.7362462 
2

更新:改用 bind_rows()(依 @hadley 的建議,效率比 do.call(rbind, ...) 更高)

library(dplyr)
# Stack the list of data frames into one data frame. The input has to be
# `my_list`: `dat` is only created by this assignment, so passing `dat` to
# bind_rows() here would fail with "object 'dat' not found".
# Only the id column and the four columns to summarise are kept.
dat <- bind_rows(my_list)[, c("id", "p", "m", "d", "a")]
# Per-id variance of every non-grouping column. summarise_all(var) replaces
# the deprecated summarise_each(funs(var)) and keeps the same column names.
dat %>%
  group_by(id) %>%
  summarise_all(var)

# id   p   m   d   a 
# 1 a 0.2371569 1.7810729 0.08264279 0.5074250 
# 2 b 0.1091675 0.2107997 1.15051229 1.1578691 
# 3 c 0.5385789 0.765… 0.44215343 0.3137903 
# 4 d 1.0174542 0.7818498 0.06414317 0.6079849 
# 5 e 0.7343667 1.2870542 1.41615858 0.7362462 
+3

'bind_rows()'會更有效 – hadley

3

這裏是一個基礎R方法

# Stack every data frame in the list into one long data frame.
dat <- do.call(rbind, my_list)
# Formula interface: variance of p, m, d and a within each id.
aggregate(
  cbind(p, m, d, a) ~ id,
  data = dat,
  FUN = var
)

其給出

id   p   m   d   a 
1 a 0.2371569 1.7810729 0.08264279 0.5074250 
2 b 0.1091675 0.2107997 1.15051229 1.1578691 
3 c 0.5385789 0.765… 0.44215343 0.3137903 
4 d 1.0174542 0.7818498 0.06414317 0.6079849 
5 e 0.7343667 1.2870542 1.41615858 0.7362462 
3
library(data.table)
# rbindlist() stacks the list into a single data.table; var() is then applied
# to each column named in .SDcols, separately within every id group.
rbindlist(my_list)[
  ,
  lapply(.SD, var),
  by = id,
  .SDcols = c("p", "m", "d", "a")
]
# id   p   m   d   a 
# 1: a 0.2371569 1.7810729 0.08264279 0.5074250 
# 2: b 0.1091675 0.2107997 1.15051229 1.1578691 
# 3: c 0.5385789 0.765… 0.44215343 0.3137903 
# 4: d 1.0174542 0.7818498 0.06414317 0.6079849 
# 5: e 0.7343667 1.2870542 1.41615858 0.7362462 
+2

@Frank喜歡它 - 感謝! – C8H10N4O2

+1

您可以把操作串接(chain)起來,以避免中間賦值。(+1) – Arun

+0

@真的,謝謝 – C8H10N4O2

相關問題