你的代碼無法按原樣運行,所以我不能確定你究竟想要什麼。根據你的描述,你似乎需要的是:對 Days 中的每一天,把 StartDate 和 EndDate 區間涵蓋該天的各行 Qty 加總,並按 Type 分組。這會產生如下的矩陣:
# Combine the input vectors into one data frame; Type stays a character column.
df <- data.frame(ID, StartDate, EndDate, Type, Qty, stringsAsFactors = FALSE)
# Every day from the earliest start to the latest end, inclusive.
Days <- min(StartDate):max(EndDate)

# Logical vector, one entry per row of `df`: TRUE where day `x` lies inside the
# row's inclusive [StartDate, EndDate] interval.
is.between <- function(x, df) with(df, x >= StartDate & x <= EndDate)

# For the rows in `df`, return the total Qty on each element of the global
# `Days`. vapply() pins the result to exactly one numeric value per day, where
# sapply() could silently change its return type.
get.sums <- function(df) {
  vapply(Days, function(d) sum(df[is.between(d, df), "Qty"]), numeric(1))
}

# One row per Type, one column per day.
do.call(rbind, lapply(split(df, df$Type), get.sums))
# [,1] [,2] [,3] [,4] [,5]
# A 0.5 0.5 0.5 0.0 0
# B 0.0 2.5 2.5 3.5 1
這是一個 data.table 方法,可能會更快。請注意,這裏 is.between(...) 和 get.sums(...) 的定義與上面不同。
# Loaded here so this snippet runs on its own; elsewhere the package is only
# attached further down, in the benchmark section.
library(data.table)

# data.table copy of df, keyed on Type.
DT <- data.table(df, key = "Type")

# Inclusive range test: TRUE where a <= x <= b (vectorised over a and b).
is.between <- function(x, a, b) x >= a & x <= b

# For a single day, the total Qty per Type over the rows whose interval
# contains that day. The parameter must stay named `day`, because the dcast
# formula below refers to the resulting column by that name.
get.sums <- function(day) {
  DT[, list(day, Qty = sum(Qty[is.between(day, StartDate, EndDate)])), by = Type]
}

# Stack the per-day results (long format), then spread days into columns.
long <- rbindlist(lapply(Days, get.sums))
result <- dcast.data.table(long, Type ~ day, value.var = "Qty")
result
# Type 1 2 3 4 5
# 1: A 0.5 0.5 0.5 0.0 0
# 2: B 0.0 2.5 2.5 3.5 1
下面是一些基準測試,使用一個希望更具代表性的示例數據集(800 行,500 個開始日期,總日期範圍 > 900 天),同時也測試了 @Arun 的回答。
# Larger, more representative data set for the benchmark
set.seed(1) # fixed seed so the generated input is reproducible
# Keep the order of the random draws below: changing it changes the seeded data.
StartDate <- sample(1:500, size = 800, replace = TRUE)
EndDate <- StartDate + rpois(n = 800, lambda = 400)
Type <- sample(LETTERS[1:20], size = 800, replace = TRUE)
Qty <- rnorm(n = 800, mean = 10, sd = 2)
# Every day from the earliest start to the latest end, inclusive.
Days <- min(StartDate):max(EndDate)
df <- data.frame(StartDate, EndDate, Type, Qty, stringsAsFactors = FALSE)
比較數據幀方法和兩種數據表方法。
library(data.table)
library(reshape2)
# data.table copy of df, keyed on Type; read by both data.table benchmarks.
DT <- data.table(df, key = "Type")
# Base-R benchmark candidate: total Qty per Type for every day in `Days`.
#
# Reads the globals `df` (columns StartDate, EndDate, Type, Qty) and `Days`.
# Returns a numeric matrix with one row per Type (row names are the Type
# values) and one unnamed column per element of `Days`.
f.df <- function() {
  # Rows of `df` whose inclusive [StartDate, EndDate] interval contains `x`.
  is.between <- function(x, df) with(df, x >= StartDate & x <= EndDate)
  # vapply() pins the result to exactly one numeric total per day, where
  # sapply() could silently change its return type.
  get.sums <- function(df) {
    vapply(Days, function(d) sum(df[is.between(d, df), "Qty"]), numeric(1))
  }
  do.call(rbind, lapply(split(df, df$Type), get.sums))
}
# data.table benchmark candidate that loops over days: for each day it sums
# Qty per Type over the rows whose interval contains that day, then reshapes
# the stacked result to one row per Type and one column per day.
# Reads the globals `DT` and `Days`.
f.dt1 <- function() {
  # Inclusive range test, vectorised over the bounds.
  in_range <- function(x, lo, hi) x >= lo & x <= hi
  # The argument must be called `day`: the dcast formula below refers to the
  # column it produces by that name.
  per_day <- function(day) {
    DT[, list(day, Qty = sum(Qty[in_range(day, StartDate, EndDate)])), by = Type]
  }
  stacked <- rbindlist(lapply(Days, per_day))
  # dcast.data.table is called by its full name because reshape2, attached
  # after data.table, also exports a function named dcast.
  dcast.data.table(stacked, Type ~ day, value.var = "Qty")
}
# data.table benchmark candidate based on an overlap join (foverlaps).
# Reads the globals `DT` and `Days`; returns one row per Type and one column
# per day, holding the summed Qty.
f.dt2 <- function() {
  # One zero-width interval [d, d] per day, keyed on both columns so it can
  # serve as the keyed `y` table of foverlaps().
  lookup <- data.table(StartDate = Days, EndDate = Days)
  setkey(lookup, StartDate, EndDate)
  # Overlap join: pairs each row of DT with the days its interval covers.
  j_olaps <- foverlaps(DT, lookup, by.x = c("StartDate", "EndDate"), type = "any")
  # `StartDate` in the formula is the day column contributed by `lookup`.
  # The aggregation argument is spelled out in full rather than relying on
  # partial matching of `fun.agg`.
  dcast.data.table(j_olaps, Type ~ StartDate, value.var = "Qty",
                   fun.aggregate = sum, na.rm = TRUE)
}
# Sanity check before timing: both data.table implementations must agree.
identical(f.dt1(), f.dt2()) # same result? YES!
# [1] TRUE
# Time the three implementations, 10 evaluations each.
library(microbenchmark)
microbenchmark(f.df(), f.dt1(), f.dt2(), times = 10)
# Unit: milliseconds
# expr min lq median uq max neval
# f.df() 1199.76370 1212.03787 1222.6558 1243.8743 1275.5526 10
# f.dt1() 1634.92675 1664.98885 1689.7812 1714.2662 1798.9121 10
# f.dt2() 91.53245 95.19545 129.2789 158.0789 208.1818 10
因此,@Arun 的方法(f.dt2)比 data.frame 方法(f.df)快約 10 倍,比上面的 data.table 方法(f.dt1)快約 17 倍。
你看過reshape2或plyr包嗎? – dayne 2014-10-10 16:30:11
請顯示您的預期結果 – 2014-10-10 16:30:48