R：如何將用戶定義的函數應用於多列數據表並在滑動窗口中進行評估，即移動平均值

我想要一個滑動窗口來計算一個滑動窗口上的幾個量（最小，最大，總計，平均值）整個數據表大約有130列。該窗口滑過由第一個鍵設置的數據組。以下代碼適用於一列，但我不知道如何修改多列的代碼。R：如何將用戶定義的函數應用於多列數據表並在滑動窗口中進行評估，即移動平均值

# Rolling sums 
# ref http://stackoverflow.com/questions/23501262/r-using-data-table-to-efficiently-test-rolling-conditions-across-multiple-rows 
# 
library(data.table) 
# 
# Generate sample data 
# 
set.seed(57) 
min_nSamples=5 
max_nSamples=20 
repN<-function(un) { 
    rep(un,sample(min_nSamples:max_nSamples,1)) 
} 
nUnits=1 
units<- unlist(lapply(1:nUnits,repN)) 
dt<-data.table(unit = units) 
dt <- dt[,{ 
    ## if there's just one row in the group of ID's, return nothing 
    list(
    index = as.double(seq(1,.N)) 
)}, by = c("unit")] 
dt[, var1:=runif(nrow(dt), -5, 5)] 
if (FALSE) { 
    dt[, var2:=runif(nrow(dt), 5, 10)] 
    dt[, var3:=runif(nrow(dt), 10, 15)] 
} 
setkeyv(dt,c("unit","index")) 


# Reference 
# http://stackoverflow.com/questions/14937165/using-dynamic-column-names-in-data-table 

# Get a complete map of units/indices 
keys = key(dt) 
key_dt = CJ(event=unique(dt$unit), index=min(dt$index):max(dt$index)) 

key_map_values = dt[key_dt] 

window_size = 5L 
window_dt = key_map_values[, list(window_index = seq(index-window_size, index-1L, by=1L)), by=keys] 

setkeyv(window_dt, c(keys[1], "window_index")) ## note the join here is on "event, window" 
setkeyv(key_map_values, keys) 

ans = key_map_values[window_dt] 

q_mov_avg = ans[, mean(var1, na.rm=TRUE) * (!any(index < 1) | NA), by="unit,i.index"] 
q_mov_tot = ans[, sum(var1, na.rm=TRUE) * (!any(index < 1) | NA), by="unit,i.index"] 
q_mov_max = ans[, max(var1, na.rm=TRUE) * (!any(index < 1) | NA), by="unit,i.index"] 
q_mov_min = ans[, min(var1, na.rm=TRUE) * (!any(index < 1) | NA), by="unit,i.index"] 

q_all_step_1 <- merge(q_mov_avg,q_mov_max,by=c("unit","i.index")) 
setnames(q_all_step_1,"V1.x","avg_var1") 
setnames(q_all_step_1,"V1.y","max_var1") 
q_all_step_2 <- merge(q_all_step_1,q_mov_tot,by=c("unit","i.index")) 
setnames(q_all_step_2,"V1","tot_var1") 
q_all <- merge(q_all_step_2,q_mov_min,by=c("unit","i.index")) 
setnames(q_all,"V1","min_var1") 
setnames(q_all,"i.index","index") 
setkey(q_all,unit,index) 
dt$src="orig" 
q_all$src="window" 

dt_melt <- melt(dt,id=c(1:2,4),measure=3) 
q_all_melt <- melt(q_all,id=c(1:2,7),measure=3:6) 
dt_q_melt<-rbind(dt_melt,q_all_melt) 
dt_q_melt[src=="window"]$index <- dt_q_melt[src=="window"]$index - window_size/2 
ggplot(data=dt_q_melt[unit==1,],aes(x=index,y=value,group=variable,color=variable)) + 
    geom_line() + 
    geom_point()

如何修改代碼時的數據樣本有更多的列運行，上方設置假爲真？

來源

2014-10-28 user3969377

以下代碼使用滑動窗口中的行來評估數據表的多列上的用戶定義函數。有兩個鍵，第一個鍵用於分組數據，第二個鍵是滑動窗口的方向。移動窗口的結果與數據一起繪製。

# 
# Generate sample data 
# 
library(data.table) 
library(ggplot2) 
library(reshape2) 
fakeData<-function() { 

    set.seed(103) 
    # Little dataset 
    min_nSamples=30 
    max_nSamples=50 
    nVar = 10 
    nUnits=30 
    if(FALSE) { 
    # Big dataset 
    min_nSamples=80 
    max_nSamples=120 
    nVar = 130 
    nUnits=330 
    } 

    repN<-function(un) { 
    rep(un,sample(min_nSamples:max_nSamples,1)) 
    } 
    units<- unlist(lapply(1:nUnits,repN)) 
    dt<-data.table(unit = units) 
    dt <- dt[,{ 
    ## if there's just one row in the group of ID's, return nothing 
    list(
     index = as.double(seq(1,.N)) 
    )}, by = c("unit")] 
    for (iv in seq(1,nVar)) { 
    vname <- paste("var_",iv,sep="") 
    expr <- bquote(.(as.name(vname)):=runif(nrow(dt), -5, 5)) 
    dt[, eval(expr)] 
    } 
    setkeyv(dt,c("unit","index")) 
    return(dt) 
} 
dt<-fakeData() 
(dim(dt)) 

# Get a complete map of units/indices 
keys = key(dt) 
key_dt = dt[,keys,with=FALSE] 
setkey(key_dt) 

key_map_values = dt[key_dt] 

window_size = 10L 
fmla_txt = parse(text=paste("list(window_index = seq(", keys[2], "-window_size, ", 
          keys[2],"-1L, by=1L))",sep="")) 
key_map_values[is.na(get(keys[2])), (keys[2]):=-1 ] # Remove NA's 
window_dt = key_map_values[, eval(fmla_txt), by=keys] 

setkeyv(window_dt, c(keys[1], "window_index")) ## note the join here is on "event, window" 
setkeyv(key_map_values, keys) 

ans = key_map_values[window_dt] 

mov_mean <- function(v,indx) { 
    mean(v, na.rm=TRUE) * (!any(indx < 1) | NA) 
} 
mov_total <- function(v,indx) { 
    sum(v, na.rm=TRUE) * (!any(indx < 1) | NA) 
} 
mov_min <- function(v,indx) { 
    max(v, na.rm=TRUE) * (!any(indx < 1) | NA) 
} 
mov_max <- function(v,indx) { 
    min(v, na.rm=TRUE) * (!any(indx < 1) | NA) 
} 


# q_mov_avg = ans[, lapply(.SD, mov_mean, index), by="unit,i.index"] 
# 
fmla_txt = parse(text=paste("lapply(.SD, mov_mean, ", keys[2], ")",sep="")) 
ikey2=paste("i.",keys[2],sep="") 
klist=c(keys[1],ikey2) 
q_mov_avg = ans[, eval(fmla_txt), by=klist] 
fmla_txt = parse(text=paste("lapply(.SD, mov_total, ", keys[2], ")",sep="")) 
q_mov_tot = ans[, eval(fmla_txt), by=klist] 
fmla_txt = parse(text=paste("lapply(.SD, mov_max, ", keys[2], ")",sep="")) 
q_mov_max = ans[, eval(fmla_txt), by=klist] 
fmla_txt = parse(text=paste("lapply(.SD, mov_min, ", keys[2], ")",sep="")) 
q_mov_min = ans[, eval(fmla_txt), by=klist] 
# 
change_col_names <- function(dt,cname) { 
    namesToChange <- names(dt)[grep(paste(keys,collapse="|"),names(dt),invert=TRUE)] 
    chgName <- function(n) { 
    setnames(dt,n,paste(cname,n,sep=".")) 
    } 
    lapply(namesToChange,chgName) 

    # dt$index<-NULL 
    fmla_txt = parse(text=paste(keys[2],":=NULL", sep="")) 
    dt[,eval(fmla_txt)] 
    setnames(dt,ikey2,keys[2]) 
    setkeyv(dt,keys) 
    return(dt) 
} 
q_mov_avg <- change_col_names(q_mov_avg,"avg") 
q_mov_tot <- change_col_names(q_mov_tot,"sum") 
q_mov_max <- change_col_names(q_mov_max,"max") 
q_mov_min <- change_col_names(q_mov_min,"min") 
dt_all <- Reduce(merge,list(dt,q_mov_avg,q_mov_tot,q_mov_max,q_mov_min)) 
# dt_all$window_index <- dt_all$index - window_size/2 
dt_all$window_index <- dt_all[,keys[2],with=FALSE] - window_size/2 
# 
nplots=3 
fmla_txt = parse(text=paste("punit = sort(sample(unique(dt_all[,",keys[1],"]),nplots))", sep="")) 
eval(fmla_txt) 
for (idx_key1 in punit) { 
    vname <- sample(names(dt)[!(names(dt) %in% key(dt))],1) 
    fmla_txt = parse(text=paste("dt_all[",keys[1],"==idx_key1,]", sep="")) 
    pdt <- melt(eval(fmla_txt),id=c("index"), 
       measure=c(vname,paste("max",vname,sep="."), 
         paste("min",vname,sep="."), 
         paste("avg",vname,sep="."), 
         paste("sum",vname,sep="."))) 
    pdt[variable==vname,"variable"] = "Data" 
    pdt[variable==paste("max",vname,sep="."),"variable"] = "Max" 
    pdt[variable==paste("min",vname,sep="."),"variable"] = "Min" 
    pdt[variable==paste("avg",vname,sep="."),"variable"] = "Avg" 
    pdt[variable==paste("sum",vname,sep="."),"variable"] = "Sum" 
    fmla_txt = parse(text=paste("dt_all[",keys[1],"==idx_key1,",keys[2],"]", sep="")) 
    pdt[variable=="Data","x"]=eval(fmla_txt) 
    fmla_txt = parse(text=paste("dt_all[",keys[1],"==idx_key1,window_index]", sep="")) 
    pdt[variable!="Data","x"]=eval(fmla_txt) 
    gp<-ggplot(data=pdt,aes(x=x,y=value,group=variable,color=variable)) + 
    geom_line(size=1.1) + 
    labs(title=paste(keys[1],idx_key1),y=vname)+theme(legend.title=element_blank()) 
    print(gp) 
}

enter image description here

來源

2014-10-28 12:12:39 user3969377

R：如何將用戶定義的函數應用於多列數據表並在滑動窗口中進行評估，即移動平均值

回答

相關問題