我建議結合使用「iotools」和「data.table」,大致如下:
library(iotools)
library(data.table)
# Long form: one row per (record, key) pair.
#  1. mstrsplit() splits every element of `vec` on ";" and trimws() strips the
#     surrounding whitespace of each piece.
#  2. melt() stacks the pieces under the record index `ind`, dropping NA cells
#     (presumably the padding of records with fewer fields).
#  3. tstrsplit() splits each "key value" piece on a literal space into the
#     new `key` and `val` columns.
#  4. The melt helper columns are removed; the trailing [] makes the result
#     print after the by-reference updates.
melt(
  data.table(ind = seq_along(vec), trimws(mstrsplit(vec, ";"))),
  id.vars = "ind",
  na.rm = TRUE
)[
  , c("key", "val") := tstrsplit(value, " ", fixed = TRUE)
][
  , c("variable", "value") := NULL
][]
或者,如果你想要「寬」格式(wide form)的結果(如 @GGrothendieck 的答案):
# Wide form: build the long (ind, key, val) table exactly as in the long-form
# example, then dcast() it to one row per record (`ind`) and one column per
# distinct `key`, filled with the matching `val`.
dcast(
  melt(
    data.table(ind = seq_along(vec), trimws(mstrsplit(vec, ";"))),
    id.vars = "ind",
    na.rm = TRUE
  )[
    , c("key", "val") := tstrsplit(value, " ", fixed = TRUE)
  ][
    , c("variable", "value") := NULL
  ][],
  ind ~ key,
  value.var = "val"
)
我之所以建議上面的做法,是基於以下比較的結果:
樣本數據:長度分別爲 3、約 100000 和約 100 萬的向量。
# Sample input: three records of ";"-separated "key value" fields. The first
# two end with a trailing ";", the second has no "type" field, and the third
# has no trailing ";".
vec <- c(
  "id a; sex m; age 16; type 1;",
  "id a; sex m; age 16;",
  "id a; sex m; age 16; type 3"
)

# Benchmark inputs: `vec` repeated a whole number of times, so the lengths are
# the next multiples of length(vec) at or above 100,000 and 1,000,000.
v100k <- rep(vec, times = ceiling(1e5 / length(vec)))
v1M <- rep(vec, times = ceiling(1e6 / length(vec)))
我們要測試的方法:
library(iotools)
library(data.table)
#' Parse ";"-separated "key value" records into a long data.table.
#'
#' @param invec Character vector; each element holds fields separated by ";",
#'   each field being a key and a value separated by a single space.
#' @return A data.table with columns `ind` (position of the record in
#'   `invec`), `key` and `val`, one row per non-missing field.
funAM_l <- function(invec) {
  # One row per record, one column per field position.
  wide <- data.table(ind = seq_along(invec), trimws(mstrsplit(invec, ";")))
  # Stack the field columns; NA cells are dropped by na.rm = TRUE.
  long <- melt(wide, id.vars = "ind", na.rm = TRUE)
  # Split "key value" on a literal space, then drop the melt helper columns.
  long[, c("key", "val") := tstrsplit(value, " ", fixed = TRUE)]
  long[, c("variable", "value") := NULL]
  # The trailing [] makes the result print after the by-reference updates.
  long[]
}
#' Parse ";"-separated "key value" records into a wide data.table.
#'
#' @param invec Character vector of records, passed on to funAM_l().
#' @return The dcast() of funAM_l(invec): one row per `ind`, one column per
#'   distinct `key`, filled with the corresponding `val`.
funAM_w <- function(invec) {
  long <- funAM_l(invec)
  dcast(long, ind ~ key, value.var = "val")
}
#' Parse ";"-separated "key value" records into a list of named lists.
#'
#' @param v Character vector; each element holds fields separated by ";"
#'   (optionally followed by whitespace), each field being a key and a value
#'   separated by a single space.
#' @return A list with one element per element of `v`; each element is a list
#'   of character values named by their keys.
funMT <- function(v) {
  # Split every record into its "key value" fields.
  z <- strsplit(v, split = "(\\;)(\\s+)?", perl = TRUE)
  lapply(z, function(fields) {
    # Flatten the fields into alternating key, value, key, value, ... tokens.
    tokens <- unlist(strsplit(fields, " "))
    # Odd positions are keys, even positions are values. TRUE/FALSE are used
    # instead of T/F because the latter are ordinary, reassignable variables.
    setNames(as.list(tokens[c(FALSE, TRUE)]), tokens[c(TRUE, FALSE)])
  })
}
#' Parse ";"-separated records by reading each one with fread().
#'
#' @param invec Character vector; each element holds fields separated by ";".
#' @return The rbindlist() of the per-record fread() results, with an id
#'   column (idcol = TRUE) identifying the source record.
funF <- function(invec) {
  rbindlist(
    lapply(invec, function(x) {
      # Pass the data through `text =` (data.table >= 1.11.0). With the
      # auto-detecting `input` argument, a record containing no ";" (hence no
      # newline after the substitution) would be taken as a file name or,
      # if it contains a space, run as a shell command.
      # NOTE(review): fread's header auto-detection still applies to every
      # record -- confirm the first field is not consumed as column names.
      fread(text = gsub(";", "\n", x))
    }),
    idcol = TRUE
  )
}
#' Parse ";"-separated "key value" records via read.dcf().
#'
#' Each field is rewritten as a DCF "key: value" line and each record is
#' terminated by a blank line, which is the DCF record separator.
#'
#' @param invec Character vector; each element holds fields separated by ";",
#'   each field being a key and a value separated by a space.
#' @return The character matrix returned by read.dcf(): one row per record,
#'   one column per key, NA where a record lacks a key.
funGG <- function(invec) {
  # Normalise every record to end in exactly ";;" so that splitting on ";"
  # always yields a trailing "" (blank line) after its fields. Appending a
  # single ";" only does so for records that already end in ";"; any other
  # record would be merged with the one that follows it.
  records <- paste0(sub(";\\s*$", "", invec), ";;")
  dcf_lines <- sub(" ", ": ", trimws(unlist(strsplit(records, ";"))))
  con <- textConnection(dcf_lines)
  # read.dcf() does not close a connection it did not open itself.
  on.exit(close(con), add = TRUE)
  read.dcf(con)
}
在小向量上,我的建議不會贏得任何比賽:
library(microbenchmark)
# Time every candidate on the three-element sample vector `vec`.
microbenchmark(
  funAM_l(vec),
  funAM_w(vec),
  funF(vec),
  funGG(vec),
  funMT(vec)
)
# Unit: microseconds
# expr min lq mean median uq max neval
# funAM_l(vec) 1474.163 1525.3765 1614.28414 1573.6325 1601.3815 2828.481 100
# funAM_w(vec) 3293.376 3482.9510 3741.30381 3553.7240 3714.1730 6787.863 100
# funF(vec) 690.761 729.4900 830.61645 756.4610 777.6725 4083.904 100
# funGG(vec) 182.281 209.8405 220.46376 220.8055 232.1820 280.788 100
# funMT(vec) 57.288 76.5225 84.81496 83.2755 90.3120 166.352 100
但看看當向量變大時會發生什麼:
# Time each candidate once on the large input `v100k`. The commented lines
# after every call are the recorded output of that call (user, system and
# elapsed time, in seconds).
system.time(funAM_l(v100k))
# user system elapsed
# 0.24 0.00 0.24
system.time(funAM_w(v100k))
# user system elapsed
# 0.296 0.000 0.296
system.time(funMT(v100k))
# user system elapsed
# 1.768 0.000 1.768
system.time(funF(v100k))
# user system elapsed
# 21.960 0.136 22.068
system.time(funGG(v100k))
# user system elapsed
# 30.968 0.004 30.940
下面是它在100萬長度的向量上的表現。
# Time the wide-form variant once on the largest input `v1M`; the commented
# lines are the recorded output (user, system and elapsed time, in seconds).
system.time(funAM_w(v1M))
# user system elapsed
# 4.316 0.092 4.402
我的另一個建議是看看我的「splitstackshape」包中的 cSplit。它的表現比 @Marat 的方法稍好一點。
以下是它在 100 萬個值上的表現:
library(splitstackshape)
# Two-pass cSplit on the largest input `v1M`:
#  - the inner call splits column "v1M" on ";" in the "long" direction;
#  - the outer call splits the resulting column on " ", giving the v1M_1 and
#    v1M_2 columns used below;
#  - dcast() then spreads v1M_2 into one column per distinct v1M_1, one row
#    per `ind`.
system.time(
  dcast(
    cSplit(
      cSplit(
        data.table(ind = seq_along(v1M), v1M),
        splitCols = "v1M", sep = ";", direction = "long"
      ),
      splitCols = "v1M", sep = " "
    ),
    ind ~ v1M_1,
    value.var = "v1M_2"
  )
)
# user system elapsed
# 13.744 0.156 13.882
這些值是否都不可能*包含*「;」(例如,作爲字符串的一部分,或經過轉義)? –
哪部分代碼是瓶頸? –
對 Konrad Rudolph 的問題,答案是否定的:「;」只會作爲分隔符出現。 – user1701545