這是一個dplyr版本@KenBenoit解決方案(請參閱dplyr.matrix
函數)。以下是測試所有三種方法的代碼。
library(dplyr)
library(data.table)
library(survival)
library(microbenchmark)
data(heart, package = "survival")
data <- heart
old.matrix <- function(data) {
# Number of unique subjects
n.subjects <- data %>% group_by(id) %>% n_groups()
# Unique failure times
fail.time <- data %>% filter(event == 1) %>% distinct(stop) %>% arrange(stop) %>% .$stop
# Number of unique failure times
n.fail.time <- length(fail.time)
# Pre-fill matrix. Will be filled with covariate values.
mat <- matrix(NA_real_, nrow = n.subjects, ncol = n.fail.time)
# Run loop
for(i in 1:n.subjects) { # Number of subjects
data.subject <- data[data$id == i, ] # subsetting here provides nice speed-up
for(j in 1:n.fail.time) { # Number of failure times.
value <- subset(data.subject, (start < fail.time[j]) & (stop >= fail.time[j]), select = transplant, drop = TRUE)
if(length(value) == 0) { # An early event or censor will return empty value. Assign to zero.
mat[i, j] <- 0
}
else {
mat[i, j] <- value # True value
}
}
}
mat
}
dplyr.matrix <- function(data) {
# Number of unique subjects
n.subjects <- data %>% group_by(id) %>% n_groups()
# Unique failure times
fail.time <- data %>% filter(event == 1) %>% distinct(stop) %>% arrange(stop) %>% .$stop
# Number of unique failure times
n.fail.time <- length(fail.time)
# add a serial number for each id
data <- data %>% group_by(id) %>% mutate(id.serial = 1:length(start))
# needed because transplant is a factor in the heart dataset
data$transplant <- as.integer(data$transplant)
# create a "long" format data.frame of subjects, observation number, and start/stop times
data.long <- data.frame(
id = rep(1:n.subjects, each = n.fail.time * max(data$id.serial)),
id.serial = rep(1:max(data$id.serial), max(data$id.serial), n.subjects * max(data$id.serial)),
fail.time = rep(fail.time, each = max(data$id.serial))
)
# merge in start and stop times
data.merge <- left_join(data.long, data[, c("start", "stop", "transplant", "id", "id.serial")], by = c("id", "id.serial"))
# eliminate missings (for which no 2nd observation took place)
data.merge <- na.omit(data.merge)
# this replicates the "value" assignment in the loop
data.merge <- data.merge %>% mutate(value = transplant * ((start < fail.time) & (stop >= fail.time)))
# sum on the ids by fail time
data.merge <- data.merge %>% group_by(id, fail.time) %>% summarise(value = sum(value))
# convert to a matrix
data.matrix <- matrix(data.merge$value, ncol = n.fail.time, byrow = TRUE, dimnames = list(1:n.subjects, fail.time))
data.matrix
}
data.table.matrix <- function(data) {
# Number of unique subjects
n.subjects <- data %>% group_by(id) %>% n_groups()
# Unique failure times
fail.time <- data %>% filter(event == 1) %>% distinct(stop) %>% arrange(stop) %>% .$stop
# Number of unique failure times
n.fail.time <- length(fail.time)
dataDT <- data.table(data[, c("id", "start", "stop", "transplant")])
# add a serial number for each id
dataDT[, idObs := 1:length(start), by = id ]
# needed because transplant is a factor in the heart dataset
dataDT[, transplant := as.integer(transplant)]
# create a "long" format data.table of subjects, observation number, and start/stop times
matDT <- data.table(subject = rep(1:n.subjects, each = n.fail.time * max(dataDT$idObs)),
idObs = rep(1:max(dataDT$idObs), max(dataDT$idObs), n.subjects * max(dataDT$idObs)),
fail.time = rep(fail.time, each = max(dataDT$idObs)))
# merge in start and stop times
setkey(matDT, subject, idObs)
setkey(dataDT, id, idObs)
matDT <- dataDT[matDT]
# eliminate missings (for which no 2nd observation took place)
matDT <- matDT[!is.na(transplant)]
# this replicates the "value" assignment in the loop
matDT[, value := transplant * ((start < fail.time) & (stop >= fail.time))]
# sum on the ids by fail time
matDT2 <- matDT[, list(matVal = sum(value)), by = list(id, fail.time)]
# convert to a matrix
mat2 <- matrix(matDT2$matVal, ncol = n.fail.time, byrow = TRUE, dimnames = list(1:n.subjects, fail.time))
mat2
}
all(dplyr.matrix(data) == old.matrix(data))
all(dplyr.matrix(data) == data.table.matrix(data))
microbenchmark(
old.matrix(data),
dplyr.matrix(data),
data.table.matrix(data),
times = 50
)
輸出從微基準:
Unit: milliseconds
expr min lq mean median uq max neval cld
old.matrix(data) 325.949687 328.102482 333.20923 329.39368 331.28305 373.44774 50 c
dplyr.matrix(data) 17.586146 18.317833 20.04662 18.95724 19.62431 60.15858 50 b
data.table.matrix(data) 9.464045 9.892281 10.72819 10.29394 11.44812 12.67738 50 a
上述結果對應有100觀察的數據集。當我在1000個觀察數據集上測試這個數據時,data.table
開始拉開更多。
Unit: milliseconds
expr min lq mean median uq max neval cld
old.matrix(data) 13095.7836 13114.1858 13162.5019 13134.0735 13150.217 13318.2496 5 c
dplyr.matrix(data) 1067.1942 1075.5291 1149.0789 1166.8951 1197.998 1237.7787 5 b
data.table.matrix(data) 104.5133 155.2074 159.6794 159.6364 166.764 212.2758 5 a
data.table
現在是贏家。
我真的不知道R,但我可以讀一些行,並告訴一些等價的C.有沒有辦法使用Rcpp來轉換庫? –
一旦它在C中,矢量化部分不是太難。 –
@JeremiahDicharry:不,沒有辦法自動轉換庫。 Rcpp不是一個R-to-C(++)編譯器,而是「僅僅」一個相當有用的膠片。有一些臨時演員(如vectorisation)。 –