2013-06-18 70 views
2

我有一個R中的數據框列表,每個數據框都從包含行列的不同文件加載。例如,這些文件可能包含不同比賽中不同運動員的完成位置。將R中的不完整數據框合併到矩陣中

相同的元素(運動員)可以出現在多個數據框(比賽)中,但沒有數據框必須包含所有元素。

我想填充排名矩陣與運動員排列和比賽列。凡有在特定比賽的運動員沒有高低應改爲0

舉例來說,如果我有:

[[1]] 
    name rank 
1 Alice 1 
2 Bob 2 
3 Carla 3 
4 Diego 4 

[[2]] 
    name rank 
1 Alice 2 
2 Carla 1 
3 Eric 3 
4 Frank 4 
5 Gary 5 

[[3]] 
    name rank 
1 Bob 5 
2 Carla 4 
3 Diego 3 
4 Eric 1 
5 Gary 2 

我想產生一個矩陣:

 1 2 3 
Alice 1 2 0 
Bob 2 0 5 
Carla 3 1 4 
Diego 4 0 3 
Eric 0 3 1 
Frank 0 4 0 
Gary 0 5 2 

我正在尋找一種有效的方法來做到這一點:我的數據更像是每個數據幀200個數據幀和10000個排名元素(總共15000個獨特元素),因此最終矩陣將大約爲15000x200

回答

2

這是一個使用reshape2包解決方案:

require(reshape2) 
dcast(do.call(rbind, lapply(seq_along(ll), function(ix) 
     transform(ll[[ix]], id = ix))), name ~ id, value.var="rank", fill=0) 

    name 1 2 3 
1 Alice 1 2 0 
2 Bob 2 0 5 
3 Carla 3 1 4 
4 Diego 4 0 3 
5 Eric 0 3 1 
6 Frank 0 4 0 
7 Gary 0 5 2 

其中ll是你的data.frame名單。


或等價:

dcast(transform(do.call(rbind, ll), id = rep(seq_along(ll), sapply(ll, nrow))), 
    name ~ id, value.var = "rank", fill = 0) 

一個data.table解決方案:

require(data.table) 
pp <- rbindlist(ll)[, id := rep(seq_along(ll), sapply(ll, nrow))] 
setkey(pp, "name", "id") 
pp[CJ(unique(name), 1:3)][is.na(rank), rank := 0L][, as.list(rank), by = name] 

    name V1 V2 V3 
1: Alice 1 2 0 
2: Bob 2 0 5 
3: Carla 3 1 4 
4: Diego 4 0 3 
5: Eric 0 3 1 
6: Frank 0 4 0 
7: Gary 0 5 2 

一些基準測試(現在,我們已經相當一些答案):

names <- tapply(sample(letters, 1e4, replace=TRUE), rep(1:(1e4/5), each=5), paste, collapse="") 
names <- unique(names) 

dd_create <- function() { 
    nrow <- sample(c(100:500), 1) 
    ncol <- 3 
    data.frame(name = sample(names, nrow, replace=FALSE), rank = sample(nrow)) 
} 

ll <- replicate(1e3, dd_create(), simplify = FALSE) 

require(reshape2) 
require(data.table) 
Arun1_reshape2 <- function(ll) { 
    # same as @agstudy's 
    dcast(do.call(rbind, lapply(seq_along(ll), function(ix) 
      transform(ll[[ix]], id = ix))), name ~ id, value.var="rank", fill=0) 
} 

Arun2_reshape2 <- function(ll) { 
    dcast(transform(do.call(rbind, ll), id = rep(seq_along(ll), sapply(ll, nrow))), 
     name ~ id, value.var = "rank", fill = 0) 
} 

eddi_reshape2 <- function(ll) { 
    dcast(melt(ll, id.vars = 'name'), name ~ L1, fill = 0) 
} 

Arun_data.table <- function(ll) { 
    pp <- rbindlist(ll)[, id := rep(seq_along(ll), sapply(ll, nrow))] 
    setkey(pp, "name", "id") 
    pp[CJ(unique(name), 1:1000)][is.na(rank), rank := 0L][, as.list(rank), by = name] 
} 

merge.all <- function(x, y) { 
    merge(x, y, all=TRUE, by="name") 
} 

Hong_Ooi <- function(ll) { 
    for(i in seq_along(ll)) 
     names(ll[[i]])[2] <- paste0("rank", i) 
    out <- Reduce(merge.all, ll)  
} 

require(microbenchmark) 
microbenchmark(arun1 <- Arun1_reshape2(ll), 
       arun2 <- Arun2_reshape2(ll), 
       eddi <- eddi_reshape2(ll), 
       hong <- Hong_Ooi(ll), 
       arun.dt <- Arun_data.table(ll), times=10) 

Unit: seconds 
          expr  min  lq median   uq  max neval 
    arun1 <- Arun1_reshape2(ll) 9.157160 9.177143 9.366775 9.715767 28.043125 10 
    arun2 <- Arun2_reshape2(ll) 8.408356 8.437066 8.494233 9.018796 10.075029 10 
     eddi <- eddi_reshape2(ll) 8.056605 8.314110 8.402396 8.474129 9.124581 10 
      hong <- Hong_Ooi(ll) 82.457432 82.716930 82.908646 108.413217 321.164598 10 
arun.dt <- Arun_data.table(ll) 2.006474 2.123331 2.212783 2.311619 2.738914 10 
1

這裏的數據,因爲OP不給可再現例如:

dput(ll) 
list(structure(list(name = structure(1:4, .Label = c("Alice", 
"Bob", "Carla", "Diego"), class = "factor"), rank = 1:4), .Names = c("name", 
"rank"), class = "data.frame", row.names = c("1", "2", "3", "4" 
)), structure(list(name = structure(1:5, .Label = c("Alice", 
"Carla", "Eric", "Frank", "Gary"), class = "factor"), rank = c(2L, 
1L, 3L, 4L, 5L)), .Names = c("name", "rank"), class = "data.frame", row.names = c("1", 
"2", "3", "4", "5")), structure(list(name = structure(1:5, .Label = c("Bob", 
"Carla", "Diego", "Eric", "Gary"), class = "factor"), rank = c(5L, 
4L, 3L, 1L, 2L)), .Names = c("name", "rank"), class = "data.frame", row.names = c("1", 
"2", "3", "4", "5"))) 

幾乎相同的溶液作爲@Arun之一,但在2個separtes步驟:

## add race column 
ll <- lapply(seq_along(ll),function(x){ 
    ll[[x]]$race <- x 
    ll[[x]] 
    }) 
## create a long data.frame 
dd <- do.call(rbind,ll) 
## transform to the wide format 
library(reshape2) 

dcast(name~race,data=dd,fill=0,value.var='rank') 

    name 1 2 3 
1 Alice 1 2 0 
2 Bob 2 0 5 
3 Carla 3 1 4 
4 Diego 4 0 3 
5 Eric 0 3 1 
6 Frank 0 4 0 
7 Gary 0 5 2 
+0

agstudy,的確的確如此。但爲什麼答案不同? – Arun

+1

@阿倫,因爲我錯過了value.var ='rank':)我編輯我的答案,並添加數據。 – agstudy

1

另一個看起來好像是Reduce用例。

merge.all <- function(x, y) 
merge(x, y, all=TRUE, by="name") 

# to avoid problems with merged name clashes 
for(i in seq_along(ll)) 
    names(ll[[i]])[2] <- paste0("rank", i) 

out <- Reduce(merge.all, ll) 

你必須稍微修改您的數據幀,以避免merge抱怨名稱衝突;一個for循環以及任何用於此目的的工作。

任何缺席的比賽都會有NA。你可以用0代替它們out[is.na(out)] <- 0;你應該問自己,這是否明智。例如,如果您這樣做,那麼簡單的彙總統計信息(如方法,差異等)將會產生令人誤解的結果。如果你想做更復雜的建模,情況也是一樣。相比之下,大多數R建模功能將足夠聰明以排除NAs。

+0

我覺得'Reduce'在很多列表元素上會非常慢。 – Arun

+0

洪,任何想法爲什麼你的答案結束與該錯誤(請參閱我的基準編輯)? – Arun

+0

這是因爲合併結果中的列名重複。您需要將每個數據框中的名稱'rank'更改爲不重複的內容,例如'rank1','rank2'等。 –

2

這裏有一個簡單的reshape2解決方案:

library(reshape2) 

dcast(melt(ll, id.vars = 'name'), name ~ L1, fill = 0) 
# name 1 2 3 
#1 Alice 1 2 0 
#2 Bob 2 0 5 
#3 Carla 3 1 4 
#4 Diego 4 0 3 
#5 Eric 0 3 1 
#6 Frank 0 4 0 
#7 Gary 0 5 2 

Arun的基準線是非常有趣的,它似乎像什麼data.table取得好是熔化部分,什麼reshape2取得好是dcast,所以這裏是兩全其美:

library(reshape2) 
library(data.table) 

pp = rbindlist(ll)[, id := rep(seq_along(ll), sapply(ll, nrow))] 
dcast(pp, name ~ id, fill = 0, value.var = 'rank') 

使用Arun的基準數據:

names <- tapply(sample(letters, 1e4, replace=TRUE), rep(1:(1e4/5), each=5), paste, collapse="") 
names <- unique(names) 

dd_create <- function() { 
    nrow <- sample(c(100:500), 1) 
    ncol <- 3 
    data.frame(name = sample(names, nrow, replace=FALSE), rank = sample(nrow)) 
} 

ll <- replicate(1e3, dd_create(), simplify = FALSE) 

Arun_data.table <- function(ll) { 
    pp <- rbindlist(ll)[, id := rep(seq_along(ll), sapply(ll, nrow))] 
    setkey(pp, "name", "id") 
    pp[CJ(unique(name), 1:1000)][is.na(rank), rank := 0L][, as.list(rank), by = name] 
} 

mix_of_both = function(ll) { 
    pp = rbindlist(ll)[, id := rep(seq_along(ll), sapply(ll, nrow))] 
    dcast(pp, name ~ id, fill = 0, value.var = 'rank') 
} 

require(microbenchmark) 
microbenchmark(Arun_data.table(ll), mix_of_both(ll), times = 10) 
# Unit: milliseconds 
#    expr  min  lq median  uq  max neval 
# Arun_data.table(ll) 2568.333 2586.0079 2626.7704 2832.8076 2911.1314 10 
#  mix_of_both(ll) 615.166 739.9383 766.8994 788.5822 821.0478 10 
+0

eddi,非常有趣!我只是讀了它的要點。看起來真好! (我已經爲你+1了)。我會盡快詳細檢查並回信。 – Arun