我正在處理一些大型時間序列數據集,每個文件大約有200萬行。到目前爲止,我一直使用ddply按我想要的方式彙總數據,但不幸的是它變得太慢了,我確實需要一種更快的方法。(問題標題:使用dplyr軟件包複製ddply?ddply太慢)這是我的代碼:
# Build a 10-millisecond mid-price return series from a raw order-book dump.
#
# Steps: read the first 32 columns of the order-book file, convert the
# nanosecond epoch timestamps to POSIXct, compute the level-1 mid price, average
# it per 10 ms bin (keeping empty bins), and convert the result to xts returns.

# Packages this snippet relies on: ddply(), summarise() and .() come from plyr,
# xts() from xts, and makeReturns() from highfrequency.
library(plyr)
library(xts)
library(highfrequency)

# Ten order-book levels, six fields per level, in file order:
# BP1, BQ1, BO1, AP1, AQ1, AO1, BP2, ..., AO10.
book_cols <- paste0(c("BP", "BQ", "BO", "AP", "AQ", "AO"), rep(1:10, each = 6))
col_names <- c(
  "DateTime", "Seq", book_cols,
  "C", "Price", "Qty", "OldPrice", "OldQty"
)

# colClasses: let R detect DateTime, read Seq plus levels 1-5 (31 columns) as
# integer, and skip the remaining 35 columns ("NULL") to save time and memory.
DF <- read.csv(
  file = "NSE/20151221/AUROPHARMA15DECFUT_20151221_ob.csv",
  header = FALSE,
  sep = "",
  col.names = col_names,
  colClasses = c(NA, rep("integer", 31), rep("NULL", 35))
)

# Drop rows whose timestamp is 0. which() is kept on purpose: it also discards
# rows where the comparison is NA instead of turning them into all-NA rows.
DF <- DF[which(DF$DateTime != 0), ]

# Keep milliseconds when date-times are formatted; the bin labels produced by
# cut() below are formatted date-times that are later parsed back to POSIXct.
options(digits.secs = 3)

# Timestamps are divided by 1e9 to get seconds since the epoch
# (i.e. the raw values are treated as nanoseconds).
DF$DateTime <- as.POSIXct(DF$DateTime / 1e9, origin = "1970-01-01")

completecase <- DF[complete.cases(DF), ]

# Mid price = average of the best bid and best ask price.
midpoint <- data.frame(
  DateTime = completecase$DateTime,
  MP = (completecase$BP1 + completecase$AP1) / 2
)

# Creating 10 millisecond time intervals.
cuts <- seq.POSIXt(
  from = min(midpoint$DateTime),
  to = max(midpoint$DateTime),
  by = 0.01
)

# Creating a new Time variable with the 10 millisecond breaks.
# NOTE(review): observations at or after the last break are presumably assigned
# NA here, since the last break cannot exceed max(DateTime) - confirm.
midpoint$Time <- cut(midpoint$DateTime, breaks = cuts)

# Summarising the MP variable every 10 milliseconds while keeping the empty
# time frames with the .drop = FALSE argument.
mp <- ddply(midpoint, .(Time), summarise, MP = mean(MP), .drop = FALSE)
mp$Time <- as.POSIXct(mp$Time)

# Restrict to the 09:15-15:30 trading session and convert prices to returns.
mp_xts <- xts(mp$MP, mp$Time, tzone = "Asia/Kolkata")
mp_xts <- mp_xts["2015-12-21 09:15:00.000/2015-12-21 15:30:00.000"]
mp_xts <- makeReturns(mp_xts)
ddply太慢了。我知道可以用data.table包或dplyr包來做到這一點,但我找不到在這些包中複製 .drop = FALSE 參數行爲的方法。其他軟件包都會丟棄所有空的時間間隔,但對我的分析來說,保留它們非常重要。
我想要做的是基本上,每10毫秒彙總我的時間序列,並用0或locf填充NA時間幀。所以如果它做對了,我應該有6.25 * 60 * 60 * 100 = 2,250,000行作爲我的輸出。
有什麼辦法可以讓我做得更快,同時得到與ddply完全相同的結果嗎?
> dput(DF[1:20,])
structure(list(DateTime = structure(c(1450669500.804, 1450669500.806,
1450669500.806, 1450669500.807, 1450669500.807, 1450669500.808,
1450669500.812, 1450669500.813, 1450669500.813, 1450669500.813,
1450669500.814, 1450669500.819, 1450669500.82, 1450669500.82,
1450669500.827, 1450669500.85, 1450669500.85, 1450669500.85,
1450669500.851, 1450669500.851), class = c("POSIXct", "POSIXt"
), tzone = ""), Seq = c(104L, 163L, 169L, 190L, 198L, 227L, 301L,
315L, 319L, 320L, 326L, 404L, 429L, 435L, 583L, 928L, 931L, 932L,
944L, 947L), BP1 = c(82055L, 82055L, 82055L, 82055L, 82055L,
82630L, 82630L, 82630L, 82630L, 82630L, 82630L, 82630L, 82630L,
82630L, 82630L, 82630L, 82630L, 82630L, 82830L, 82830L), BQ1 = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), BO1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), AP1 = c(0L, 87800L,
83800L, 83800L, 83800L, 83800L, 83800L, 83800L, 83800L, 83800L,
83800L, 83800L, 83800L, 83800L, 83800L, 83800L, 83795L, 83795L,
83795L, 83795L), AQ1 = c(0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L), AO1 = c(0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), BP2 = c(0L, 0L, 0L, 0L, 0L, 82055L, 82525L, 82525L,
82525L, 82525L, 82525L, 82525L, 82525L, 82525L, 82525L, 82525L,
82525L, 82525L, 82630L, 82630L), BQ2 = c(0L, 0L, 0L, 0L, 0L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
BO2 = c(0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), AP2 = c(0L, 0L, 87800L, 84805L,
84230L, 84230L, 84230L, 84230L, 84230L, 84230L, 84230L, 84230L,
84230L, 84230L, 84230L, 84230L, 83800L, 83800L, 83800L, 83800L
), AQ2 = c(0L, 0L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), AO2 = c(0L, 0L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), BP3 = c(0L, 0L, 0L, 0L, 0L, 0L, 82055L, 82055L, 82055L,
82055L, 82055L, 82320L, 82320L, 82320L, 82320L, 82320L, 82320L,
82320L, 82525L, 82525L), BQ3 = c(0L, 0L, 0L, 0L, 0L, 0L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
BO3 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), AP3 = c(0L, 0L, 0L, 87800L,
84805L, 84805L, 84805L, 84805L, 84805L, 84805L, 84805L, 84805L,
84805L, 84805L, 84805L, 84805L, 84230L, 84230L, 84230L, 84230L
), AQ3 = c(0L, 0L, 0L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L), AO3 = c(0L, 0L, 0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), BP4 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 82035L, 82035L,
82035L, 82055L, 82055L, 82055L, 82055L, 82060L, 82060L, 82060L,
82320L, 82320L), BQ4 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), BO4 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), AP4 = c(0L, 0L, 0L, 0L, 87800L, 87800L,
87800L, 85380L, 85380L, 85380L, 85365L, 85365L, 85365L, 85365L,
84980L, 84980L, 84805L, 84805L, 84805L, 84400L), AQ4 = c(0L,
0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 1L), AO4 = c(0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), BP5 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 80035L, 80035L, 82035L, 82035L,
82035L, 82035L, 82055L, 82055L, 82055L, 82060L, 82060L),
BQ5 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), BO5 = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), AP5 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 87800L, 87800L, 87800L,
85380L, 85380L, 85380L, 85380L, 85365L, 85365L, 84980L, 84980L,
84980L, 84805L), AQ5 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L), AO5 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L)), .Names = c("DateTime", "Seq", "BP1", "BQ1",
"BO1", "AP1", "AQ1", "AO1", "BP2", "BQ2", "BO2", "AP2", "AQ2",
"AO2", "BP3", "BQ3", "BO3", "AP3", "AQ3", "AO3", "BP4", "BQ4",
"BO4", "AP4", "AQ4", "AO4", "BP5", "BQ5", "BO5", "AP5", "AQ5",
"AO5"), row.names = c(NA, 20L), class = "data.frame")
請讓我知道我是否應該發佈任何其他信息,真的很感謝幫助。
> sessionInfo()
R version 3.2.2 (2015-08-14)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu precise (12.04.5 LTS)
locale:
[1] LC_CTYPE=en_IN.UTF-8 LC_NUMERIC=C LC_TIME=en_IN.UTF-8 LC_COLLATE=en_IN.UTF-8 LC_MONETARY=en_IN.UTF-8 LC_MESSAGES=en_IN.UTF-8 LC_PAPER=en_IN.UTF-8
[8] LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_IN.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] highfrequency_0.4 xts_0.9-7 zoo_1.7-12 data.table_1.9.7
loaded via a namespace (and not attached):
[1] tools_3.2.2 grid_3.2.2 lattice_0.20-33
你可以使用data.table,然後通過合併(merge)把空的因子水平補回來。 – Heroka
最好包含對'library'或'require'的調用,特別是當你的代碼依賴於多個pkgs時。同時,讓您的示例代碼儘可能小,有助於避免讓人們感到疑惑。 – jangorecki
@jangorecki 我其實在我的代碼中調用了這些庫,這裏只是一個更大代碼塊的一部分。我沒有費心把其餘部分也貼出來,因爲我想讓這個問題儘可能簡潔。 – UtdMan