2016-06-09 61 views
30

對於數據中的每一行，我想計算截至該行爲止每個 group 的最新 value 之和（即跨組求和）。

# Sample input. `desired` is the expected result: for every row, the sum over
# all groups of the most recent `value` seen up to and including that row.
dt <- data.table(
    group   = c("a", "b", "a", "a", "b", "a"),
    value   = c(10, 5, 20, 15, 15, 10),
    desired = c(10, 15, 25, 20, 30, 25)
)
# group value desired 
#1:  a 10  10 
#2:  b  5  15 
#3:  a 20  25 # latest value of a is 20, of b is 5 
#4:  a 15  20 # latest value of a is 15, of b is 5 
#5:  b 15  30 
#6:  a 10  25 

desired 列就是我想得到的結果。我可以用一個樸素的循環來實現，但是我的數據很大，有很多行和組（1M+ 行，1000+ 組）。

# Naive reference implementation, shown only to illustrate *a* solution.
# For every row k it re-scans rows 1..k, picks the last value of each group
# and adds them up. Using `set` would speed it up, but it remains too slow.
for (k in seq_len(nrow(dt))) {
    dt[k, desired1 := dt[seq_len(k), .(latest = value[.N]), by = group][, sum(latest)]]
}

回答

18

來自 @eddi（見下方評論）的邏輯更簡單，它把下面那個迂迴的做法簡化成了一行：

# Within each group, the change of `value` relative to the group's previous row
# (the first row of a group is compared against 0).
dt[, incr := value - c(0, head(value, -1L)), by = group]
# The running total of those changes is the sum of every group's latest value.
dt[, ans := cumsum(incr)]

不確定這種做法如何推廣到更多的組，但下面是針對含 3 個組的示例數據的一種做法：

# I hope I got the desired output correctly
# library() stops with an error if data.table is missing, whereas require()
# would only return FALSE and let the script fail later.
library(data.table)
# Three-group sample; `desired` is the expected sum of each group's latest value.
dt <- data.table(
    group   = c("a", "b", "c", "a", "a", "b", "c", "a"),
    value   = c(10, 5, 20, 25, 15, 15, 30, 10),
    desired = c(10, 15, 35, 50, 40, 50, 60, 55)
)

添加rleid

# Give every run of consecutive rows that share a group its own run id.
dt[, `:=`(id = rleid(group))]

提取每個 group, id 組合的最後一行

# One row per (group, id) run, holding the value of the run's final row.
last <- dt[, list(value = value[.N]), by = c("group", "id")]

last 中的 id 是唯一的。現在的思路是先求出每個 id 的增量，然後通過 join + update 把結果寫回 dt。

# Per group: how much this run's final value differs from the group's previous
# run (the first run of a group is compared against 0). Updates `last` in place.
last[, incr := value - shift(value, n = 1L, fill = 0L, type = "lag"), by = "group"]
# Running total of the increments minus the run's own final value, i.e. the
# amount to add to each row's value in that run.
last[, incr := cumsum(incr) - value]

現在進行 join + update：

# Update join on the run id: each row's own value plus the offset of its run.
dt[last, on = "id", ans := value + i.incr]
# The helper column is no longer needed.
dt[, id := NULL]
# Print the result.
dt[]
# group value desired ans 
# 1:  a 10  10 10 
# 2:  b  5  15 15 
# 3:  c 20  35 35 
# 4:  a 25  50 50 
# 5:  a 15  40 40 
# 6:  b 15  50 50 
# 7:  c 30  60 60 
# 8:  a 10  55 55 

我還不確定這個方法會不會出錯、會在哪裏出錯，之後會仔細檢查。我先把它寫下來，好讓更多人幫忙檢視。


在 500 個組、10,000 行的數據上與 David 的解決方案比較：

# library() fails loudly when data.table is not installed; require() would
# merely return FALSE.
library(data.table)
set.seed(45L)
# 500 random group labels, each built from 10 sampled lower-case letters.
groups <- apply(
    matrix(sample(letters, 500L * 10L, TRUE), ncol = 10L),
    1L, paste, collapse = ""
)
uniqueN(groups) # 500L
N <- 1e4L
# Benchmark table: N rows with a random group label and an integer value in 1..100.
dt <- data.table(group = sample(groups, N, TRUE), value = sample(100L, N, TRUE))

# Sum of the latest value of every group, computed per row via run-length ids.
#
# dt: a data.table with columns `group` and `value`.
# Returns the new `ans` column as a vector. Note that `dt` is modified by
# reference: the `ans` column stays on the caller's table (the temporary `id`
# column is removed again).
arun <- function(dt) {
    # Label each run of consecutive rows belonging to the same group.
    dt[, id := rleid(group)]
    # One row per run, carrying the value of the run's final row.
    runs <- dt[, list(value = value[.N]), by = c("group", "id")]
    # Per group: change of the run's final value vs. the group's previous run.
    runs[, incr := value - shift(value, n = 1L, fill = 0L, type = "lag"), by = "group"]
    # Running total minus the run's own final value = offset to add to each row.
    runs[, incr := cumsum(incr) - value]
    # Update join on the run id, then drop the helper column.
    dt[runs, on = "id", ans := value + i.incr]
    dt[, id := NULL]
    dt[["ans"]]
}

# Sum of the latest value of every group, computed by reshaping to wide format.
#
# dt: a data.table with columns `group` and `value`.
# Returns a numeric vector with one element per row of `dt`. Note that `dt` is
# modified by reference: an `indx` column holding the row number is added.
david <- function(dt) {
    # Row number, used as the reshape key and as the rolling-join key.
    dt[, indx := .I]
    # One column per group; NA wherever the row belongs to a different group.
    # value.var is spelled out so dcast never has to guess the value column.
    res <- dcast(dt, indx ~ group, value.var = "value")
    for (j in names(res)[-1L]) {
        # Rolling self-join on indx carries the group's last observed value forward.
        set(res, j = j, value = res[!is.na(res[[j]])][res, on = "indx", roll = TRUE][[j]])
    }
    # drop = FALSE keeps a matrix even when there is only a single group column,
    # which rowSums() requires.
    rowSums(as.matrix(res)[, -1L, drop = FALSE], na.rm = TRUE)
}

# Time both implementations on the same table; the trailing comments are the
# timings recorded in the original post.
system.time(ans1 <- arun(dt)) ## 0.024s 
system.time(ans2 <- david(dt)) ## 38.97s 
# david() returns the doubles produced by rowSums(), so cast to integer before
# checking that both approaches give exactly the same answer.
identical(ans1, as.integer(ans2)) 
# [1] TRUE 
+6

這很棒，謝謝！不過我對 join 和 rleid 有點困惑——僅用 `dt[, incr := diff(c(0, value)), by = group][, ans := cumsum(incr)]` 不就行了嗎？（我不確定自己是否漏掉了某些邏輯） – eddi

+0

哦，是的，我認爲那樣可行！我這種迂迴的做法最終可以歸結爲你那一行代碼。 – Arun

+2

哇，這一行代碼寫得真漂亮... – andrew

2

我會爲每個組創建一個列,顯示該組的最新值。然後,只需總結這些列:

library(zoo)
# Accumulate, group by group, a column holding that group's latest value.
result <- rep(0, nrow(dt))
for (g in dt[, unique(group)]) {
    # Keep `value` on rows of group g (NA elsewhere), carry the last observation
    # forward, and treat rows before the group's first appearance as 0.
    # FALSE is spelled out because `F` is an ordinary, reassignable variable.
    result <- result + dt[, na.fill(na.locf(ifelse(group == g, 1, NA) * value, na.rm = FALSE), 0)]
}

# Check against the expected column.
all(dt[, desired] == result)
+0

更新答案器。--- 1E3 +組 – andrew

+4

太慢了，不過還是謝謝 – eddi

-2

使用 dplyr，適用於多個組，但數據必須是 data.frame，不能是 data.table。

library(dplyr)
library(tidyr)
library(zoo)
# Reshape to one column per group, carry each group's last value forward, and
# add the group columns up row by row.
dt %>%
    # Remember the original row order; it also keeps every row distinct in the reshape.
    mutate(row_number = row_number()) %>%
    # pivot_wider() replaces the superseded spread(); names_sort = TRUE keeps
    # the group columns in sorted order.
    pivot_wider(names_from = group, values_from = value, names_sort = TRUE) %>%
    arrange(row_number) %>%
    # across() replaces the deprecated mutate_each()/funs(); leading NAs are kept.
    mutate(across(everything(), ~ na.locf(.x, na.rm = FALSE))) %>%
    # The first two columns are `desired` and `row_number`; sum the remaining group columns.
    mutate(answer = rowSums(.[, -1:-2], na.rm = TRUE)) %>%
    # Return a plain data.frame, as the spread()-based version did.
    as.data.frame()

在示例數據上使用上面的代碼（注意這裏用的是 data.frame() 而不是 data.table()）：

# Two-group sample as a plain data.frame; `desired` is the expected answer.
dt <- data.frame(
    group   = c("a", "b", "a", "a", "b", "a"),
    value   = c(10, 5, 20, 15, 15, 10),
    desired = c(10, 15, 25, 20, 30, 25)
)
    desired row_number a b answer 
1  10   1 10 NA  10 
2  15   2 10 5  15 
3  25   3 20 5  25 
4  20   4 15 5  20 
5  30   5 15 15  30 
6  25   6 10 15  25 

# Three-group sample as a plain data.frame; `desired` is the expected answer.
dt <- data.frame(
    group   = c("a", "b", "c", "a", "a", "b", "c", "a"),
    value   = c(10, 5, 20, 25, 15, 15, 30, 10),
    desired = c(10, 15, 35, 50, 40, 50, 60, 55)
)

    desired row_number a b c answer 
1  10   1 10 NA NA  10 
2  15   2 10 5 NA  15 
3  35   3 10 5 20  35 
4  50   4 25 5 20  50 
5  40   5 15 5 20  40 
6  50   6 15 15 20  50 
7  60   7 15 15 30  60 
8  55   8 10 15 30  55 
+8

這需要太多的內存/太慢了。 – eddi