2016-04-22 94 views
0

我正在處理一些大的時間序列數據集,每個文件中有大約200萬行。到目前爲止,我一直在使用ddply來彙總數據,結果也正是我想要的,但不幸的是它變得太慢了,我真的需要更快的方式。這是我的代碼:(標題:使用dplyr軟件包複製ddply?ddply太慢)

# Aggregate the level-1 mid price of an order-book file into 10 ms buckets,
# keeping empty buckets, then convert to an xts return series.
# NOTE: ddply() comes from plyr, xts() from xts and makeReturns() from
# highfrequency; the library() calls are assumed to live earlier in the full script.

# Read the order-book file. colClasses keeps the first 32 columns (DateTime
# auto-typed, the next 31 as integer) and drops the remaining 35 via "NULL".
DF <- read.csv(file = "NSE/20151221/AUROPHARMA15DECFUT_20151221_ob.csv",header = FALSE,sep = "", col.names = c("DateTime","Seq","BP1","BQ1","BO1","AP1","AQ1","AO1","BP2","BQ2","BO2","AP2","AQ2","AO2","BP3","BQ3","BO3","AP3","AQ3","AO3","BP4","BQ4","BO4","AP4","AQ4","AO4","BP5","BQ5","BO5","AP5","AQ5","AO5","BP6","BQ6","BO6","AP6","AQ6","AO6","BP7","BQ7","BO7","AP7","AQ7","AO7","BP8","BQ8","BO8","AP8","AQ8","AO8","BP9","BQ9","BO9","AP9","AQ9","AO9","BP10","BQ10","BO10","AP10","AQ10","AO10", "C", "Price", "Qty", "OldPrice", "OldQty"), colClasses = c(NA, rep("integer",31), rep("NULL", 35)))
# Drop rows whose raw timestamp is 0.
DF <- DF[which(DF$DateTime != 0),]
# Show fractional seconds (3 digits) when printing date-times.
options(digits.secs = 3)
# Raw timestamps are divided by 10^9 to get seconds since 1970-01-01
# (presumably nanoseconds since the epoch -- confirm against the data feed).
DF$DateTime = as.POSIXct(DF$DateTime/(10^9), origin="1970-01-01")
# Keep only rows without any NA.
completecase <- DF[complete.cases(DF),]
# Mid price = average of best bid (BP1) and best ask (AP1).
midpoint = data.frame(DateTime=completecase$DateTime, MP=(completecase$BP1+completecase$AP1)/2)
# creating 10 millisecond time intervals
cuts = seq.POSIXt(from = min(midpoint$DateTime), to = max(midpoint$DateTime), by = .01)
# creating new Time variable with the 10 millisecond breaks
midpoint$Time = cut(midpoint$DateTime, breaks = cuts)
# summarizing the MP variable every 10millisecond while keeping the empty time frames with the .drop = FALSE argument
mp = ddply(midpoint, .(Time), summarise, MP = mean(MP), .drop = FALSE)
# cut() produced a factor; convert the interval labels back to POSIXct.
mp$Time = as.POSIXct(mp$Time)
mp_xts = xts(mp$MP, mp$Time, tzone = 'Asia/Kolkata')
# Restrict to the 09:15-15:30 window on 2015-12-21.
mp_xts = mp_xts["2015-12-21 09:15:00.000/2015-12-21 15:30:00.000"]
mp_xts = makeReturns(mp_xts)

ddply太慢了。我知道我可以用data.table包或dplyr包來做到這一點,但我似乎無法找到的是複製.drop = FALSE參數。其他軟件包都會丟棄所有空的時間間隔,但對我來說保留它們以進行分析非常重要。

我想要做的是基本上,每10毫秒彙總我的時間序列,並用0或locf填充NA時間幀。所以如果它做對了,我應該有6.25 * 60 * 60 * 100 = 2,250,000行作爲我的輸出。

有沒有什麼辦法可以讓我做得更快,同時得到與ddply完全一樣的結果?

> dput(DF[1:20,]) 
structure(list(DateTime = structure(c(1450669500.804, 1450669500.806, 
1450669500.806, 1450669500.807, 1450669500.807, 1450669500.808, 
1450669500.812, 1450669500.813, 1450669500.813, 1450669500.813, 
1450669500.814, 1450669500.819, 1450669500.82, 1450669500.82, 
1450669500.827, 1450669500.85, 1450669500.85, 1450669500.85, 
1450669500.851, 1450669500.851), class = c("POSIXct", "POSIXt" 
), tzone = ""), Seq = c(104L, 163L, 169L, 190L, 198L, 227L, 301L, 
315L, 319L, 320L, 326L, 404L, 429L, 435L, 583L, 928L, 931L, 932L, 
944L, 947L), BP1 = c(82055L, 82055L, 82055L, 82055L, 82055L, 
82630L, 82630L, 82630L, 82630L, 82630L, 82630L, 82630L, 82630L, 
82630L, 82630L, 82630L, 82630L, 82630L, 82830L, 82830L), BQ1 = c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L), BO1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), AP1 = c(0L, 87800L, 
83800L, 83800L, 83800L, 83800L, 83800L, 83800L, 83800L, 83800L, 
83800L, 83800L, 83800L, 83800L, 83800L, 83800L, 83795L, 83795L, 
83795L, 83795L), AQ1 = c(0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L), AO1 = c(0L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L), BP2 = c(0L, 0L, 0L, 0L, 0L, 82055L, 82525L, 82525L, 
82525L, 82525L, 82525L, 82525L, 82525L, 82525L, 82525L, 82525L, 
82525L, 82525L, 82630L, 82630L), BQ2 = c(0L, 0L, 0L, 0L, 0L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), 
    BO2 = c(0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L), AP2 = c(0L, 0L, 87800L, 84805L, 
    84230L, 84230L, 84230L, 84230L, 84230L, 84230L, 84230L, 84230L, 
    84230L, 84230L, 84230L, 84230L, 83800L, 83800L, 83800L, 83800L 
    ), AQ2 = c(0L, 0L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), AO2 = c(0L, 0L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L), BP3 = c(0L, 0L, 0L, 0L, 0L, 0L, 82055L, 82055L, 82055L, 
    82055L, 82055L, 82320L, 82320L, 82320L, 82320L, 82320L, 82320L, 
    82320L, 82525L, 82525L), BQ3 = c(0L, 0L, 0L, 0L, 0L, 0L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), 
    BO3 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L), AP3 = c(0L, 0L, 0L, 87800L, 
    84805L, 84805L, 84805L, 84805L, 84805L, 84805L, 84805L, 84805L, 
    84805L, 84805L, 84805L, 84805L, 84230L, 84230L, 84230L, 84230L 
    ), AQ3 = c(0L, 0L, 0L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L), AO3 = c(0L, 0L, 0L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L), BP4 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 82035L, 82035L, 
    82035L, 82055L, 82055L, 82055L, 82055L, 82060L, 82060L, 82060L, 
    82320L, 82320L), BQ4 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), BO4 = c(0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L), AP4 = c(0L, 0L, 0L, 0L, 87800L, 87800L, 
    87800L, 85380L, 85380L, 85380L, 85365L, 85365L, 85365L, 85365L, 
    84980L, 84980L, 84805L, 84805L, 84805L, 84400L), AQ4 = c(0L, 
    0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    2L, 2L, 2L, 1L), AO4 = c(0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), BP5 = c(0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 80035L, 80035L, 82035L, 82035L, 
    82035L, 82035L, 82055L, 82055L, 82055L, 82060L, 82060L), 
    BQ5 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L), BO5 = c(0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L 
    ), AP5 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 87800L, 87800L, 87800L, 
    85380L, 85380L, 85380L, 85380L, 85365L, 85365L, 84980L, 84980L, 
    84980L, 84805L), AQ5 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L), AO5 = c(0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L)), .Names = c("DateTime", "Seq", "BP1", "BQ1", 
"BO1", "AP1", "AQ1", "AO1", "BP2", "BQ2", "BO2", "AP2", "AQ2", 
"AO2", "BP3", "BQ3", "BO3", "AP3", "AQ3", "AO3", "BP4", "BQ4", 
"BO4", "AP4", "AQ4", "AO4", "BP5", "BQ5", "BO5", "AP5", "AQ5", 
"AO5"), row.names = c(NA, 20L), class = "data.frame") 

請讓我知道我是否應該發佈任何其他信息,真的很感謝幫助。

> sessionInfo() 
R version 3.2.2 (2015-08-14) 
Platform: x86_64-pc-linux-gnu (64-bit) 
Running under: Ubuntu precise (12.04.5 LTS) 

locale: 
[1] LC_CTYPE=en_IN.UTF-8  LC_NUMERIC=C    LC_TIME=en_IN.UTF-8  LC_COLLATE=en_IN.UTF-8  LC_MONETARY=en_IN.UTF-8 LC_MESSAGES=en_IN.UTF-8 LC_PAPER=en_IN.UTF-8  
[8] LC_NAME=C     LC_ADDRESS=C    LC_TELEPHONE=C    LC_MEASUREMENT=en_IN.UTF-8 LC_IDENTIFICATION=C  

attached base packages: 
[1] stats  graphics grDevices utils  datasets methods base  

other attached packages: 
[1] highfrequency_0.4 xts_0.9-7   zoo_1.7-12  data.table_1.9.7 

loaded via a namespace (and not attached): 
[1] tools_3.2.2  grid_3.2.2  lattice_0.20-33 
+0

你可以使用data.table進行彙總,然後通過合併(merge)把空的水平(level)補回來。 – Heroka

+0

最好把對`library`或`require`的調用也包含進來,特別是當你的代碼依賴多個包(pkgs)時。同時讓您的示例代碼儘可能小,也有助於避免讓人感到困惑。 – jangorecki

+0

@jangorecki 我其實在我的代碼中調用了library,這裏貼出的只是一個更大代碼塊的一部分。我沒有把其餘部分貼出來,因爲我想讓這個問題儘可能簡潔。 – UtdMan

回答

3

data.table最新的開發版本(development version)新增了非等值連接(non-equi join),可以爲這類問題提供非常緊湊且快速的解決方案:

# Aggregate into 10 ms buckets with a non-equi join; requires a data.table
# version that supports non-equi joins (the development version when written).
# R is case-sensitive: the question's data frame is `DF` (a bare `df` would
# resolve to the stats::df function).
dt = as.data.table(DF) # or convert in place using setDT
# One row per bucket: [Start, End). The grid is padded by one step on each
# side, and the last bucket is open-ended (End = Inf).
breaks = dt[, {
  tmp = seq(min(DateTime) - 0.01, max(DateTime) + 0.01, 0.01)
  .(Start = tmp, End = c(tail(tmp, -1), Inf))
}]

# by = .EACHI evaluates j once per row of `breaks`, so buckets with no
# matching rows are kept in the result instead of being dropped.
dt[breaks, on = .(DateTime >= Start, DateTime < End),
   lapply(.SD, mean), by = .EACHI]
+1

@Frank 我認爲[這個FR](https://github.com/Rdatatable/data.table/issues/1615)會解決這個問題 - 這些列真的應該被命名爲`Start`和`End` – eddi

+0

我使用這個解決方案時出錯了。我是data.table的新手,所以無法弄清楚問題所在。錯誤如下:`Error in [.data.table(dt, breaks, lapply(.SD, mean), on = .(DateTime >= : could not find function "."`(找不到函數 ".") – UtdMan

+0

@UtdMan 請把`sessionInfo()`的輸出加到你的問題裏,這樣會更容易調試 – jangorecki

2

除了嘗試在Rcpp中編寫自己的C++實現外,很難比data.table做得更好。無論如何,這裏是一個data.table解決方案,使用findInterval()而不是cut(),這應該稍微快一點。此外,我把分割點(breaks)擴展到了最小值和最大值之外,以便得到一些額外的空區間用來測試。

# Bucket rows into 10 ms intervals with findInterval() and average every
# column except the first (DateTime) within each non-empty bucket.
library(data.table)
setDT(DF)  # convert DF to a data.table in place
# Break points every 0.01 s, padded by one step below the minimum and above
# the maximum so that some empty intervals exist for testing.
breaks <- seq.POSIXt(from = min(DF$DateTime) - 0.01,
                     to = max(DF$DateTime) + 0.01,
                     by = 0.01)
breaks
## [1] "2015-12-20 22:45:00 EST" "2015-12-20 22:45:00 EST" "2015-12-20 22:45:00 EST" 
## [4] "2015-12-20 22:45:00 EST" "2015-12-20 22:45:00 EST" "2015-12-20 22:45:00 EST" 
## [7] "2015-12-20 22:45:00 EST" 
diff(breaks)
## Time differences in secs 
## [1] 0.00999999 0.00999999 0.00999999 0.00999999 0.00999999 0.00999999 
# Index of the interval (position in `breaks`) that each row falls into.
ints <- findInterval(x = DF$DateTime, vec = breaks)
ints
## [1] 2 2 2 2 2 2 2 2 2 2 3 3 3 3 4 6 6 6 6 6 
# Group by the interval's start time; .SDcols = -1L leaves out the first column.
agg <- DF[, lapply(.SD, mean), by = breaks[ints], .SDcols = -1L]
agg
##     breaks Seq  BP1 BQ1 BO1 AP1 AQ1 AO1  BP2 BQ2 BO2  AP2 AQ2 AO2  BP3 BQ3 BO3 AP3 AQ3 AO3 BP4 BQ4 BO4 AP4 AQ4 AO4  BP5 BQ5 BO5 AP5 AQ5 AO5 
## 1: 2015-12-20 22:45:00 230.6 82342.5 1 1 75820 0.90 0.9 41215.5 0.5 0.5 67798.5 0.9 0.8 32822.00 0.4 0.4 59663 1.3 0.7 16407 0.2 0.2 51954 0.6 0.6 8003.5 0.1 0.1 26340 0.3 0.3 
## 2: 2015-12-20 22:45:00 398.5 82630.0 1 1 83800 1.25 1.0 82525.0 1.0 1.0 84230.0 1.0 1.0 82253.75 1.0 1.0 84805 2.0 1.0 82050 1.0 1.0 85365 1.0 1.0 81535.0 1.0 1.0 85380 1.0 1.0 
## 3: 2015-12-20 22:45:00 583.0 82630.0 1 1 83800 2.00 1.0 82525.0 1.0 1.0 84230.0 1.0 1.0 82320.00 1.0 1.0 84805 2.0 1.0 82055 1.0 1.0 84980 1.0 1.0 82035.0 1.0 1.0 85365 1.0 1.0 
## 4: 2015-12-20 22:45:00 936.4 82710.0 1 1 83796 1.20 1.0 82567.0 1.0 1.0 83886.0 1.8 1.0 82402.00 1.0 1.0 84345 1.2 1.0 82164 1.0 1.0 84759 1.6 1.0 82057.0 1.0 1.0 85022 1.2 1.0 
# Indices of break points that received no observations (empty intervals).
emp <- setdiff(seq_along(breaks), ints)
emp
## [1] 1 5 7 
# Append one row per empty interval; fill = TRUE pads the missing columns
# with NA. TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas TRUE is a reserved word.
agg <- rbind(agg, data.table(breaks = breaks[emp]), fill = TRUE)
agg
##     breaks Seq  BP1 BQ1 BO1 AP1 AQ1 AO1  BP2 BQ2 BO2  AP2 AQ2 AO2  BP3 BQ3 BO3 AP3 AQ3 AO3 BP4 BQ4 BO4 AP4 AQ4 AO4  BP5 BQ5 BO5 AP5 AQ5 AO5 
## 1: 2015-12-20 22:45:00 230.6 82342.5 1 1 75820 0.90 0.9 41215.5 0.5 0.5 67798.5 0.9 0.8 32822.00 0.4 0.4 59663 1.3 0.7 16407 0.2 0.2 51954 0.6 0.6 8003.5 0.1 0.1 26340 0.3 0.3 
## 2: 2015-12-20 22:45:00 398.5 82630.0 1 1 83800 1.25 1.0 82525.0 1.0 1.0 84230.0 1.0 1.0 82253.75 1.0 1.0 84805 2.0 1.0 82050 1.0 1.0 85365 1.0 1.0 81535.0 1.0 1.0 85380 1.0 1.0 
## 3: 2015-12-20 22:45:00 583.0 82630.0 1 1 83800 2.00 1.0 82525.0 1.0 1.0 84230.0 1.0 1.0 82320.00 1.0 1.0 84805 2.0 1.0 82055 1.0 1.0 84980 1.0 1.0 82035.0 1.0 1.0 85365 1.0 1.0 
## 4: 2015-12-20 22:45:00 936.4 82710.0 1 1 83796 1.20 1.0 82567.0 1.0 1.0 83886.0 1.8 1.0 82402.00 1.0 1.0 84345 1.2 1.0 82164 1.0 1.0 84759 1.6 1.0 82057.0 1.0 1.0 85022 1.2 1.0 
## 5: 2015-12-20 22:45:00 NA  NA NA NA NA NA NA  NA NA NA  NA NA NA  NA NA NA NA NA NA NA NA NA NA NA NA  NA NA NA NA NA NA 
## 6: 2015-12-20 22:45:00 NA  NA NA NA NA NA NA  NA NA NA  NA NA NA  NA NA NA NA NA NA NA NA NA NA NA NA  NA NA NA NA NA NA 
## 7: 2015-12-20 22:45:00 NA  NA NA NA NA NA NA  NA NA NA  NA NA NA  NA NA NA NA NA NA NA NA NA NA NA NA  NA NA NA NA NA NA