0
嗨我目前正在做Web使用率挖掘。爲此,我需要遍歷所有數據條目(204002行)(每行是包含時間戳和訪問頁面的Web會話),並對它們進行一些處理。這裏是數據的輸入:我的循環通過data.table的性能
structure(list(cookie = "1",
paths = list(c("LMCash", "LMCash", "LMCash", "LMCash", "LMCash",
"LMCash", "LMCash", "LMCash", "LMCash", "LMCash", "LMCash",
"LMCash", "LMCash", "LMCash", "LMCash", "LMCash", "LMCash",
"SYSTEM", "SYSTEM", "SYSTEM")), time = list(c("2017-05-01T00:00:00.000Z",
"2017-05-01T00:00:10.000Z", "2017-05-01T00:00:41.000Z", "2017-05-01T00:00:48.000Z",
"2017-05-01T00:03:28.000Z", "2017-05-01T00:03:40.000Z", "2017-05-01T00:03:53.000Z",
"2017-05-01T00:04:09.000Z", "2017-05-01T00:04:17.000Z", "2017-05-01T00:04:26.000Z",
"2017-05-01T00:04:30.000Z", "2017-05-01T00:04:34.000Z", "2017-05-01T00:04:40.000Z",
"2017-05-01T00:05:36.000Z", "2017-05-01T00:05:46.000Z", "2017-05-01T00:05:52.000Z",
"2017-05-01T00:06:00.000Z", "2017-05-01T00:06:38.000Z", "2017-05-01T00:06:57.000Z",
"2017-05-01T00:07:01.000Z")), length = 20L, durationInMin = 7.01666666666667), .Names = c("cookie",
"paths", "time", "length", "durationInMin"), class = c("data.table",
"data.frame"), row.names = c(NA, -1L), .internal.selfref = <pointer: 0x00000000001f0788>)
我看看是否需要將會話分成兩個或多個會話。 爲此,我查看會話中的每個時間戳,並將它們與此會話中的上一個時間戳進行比較。如果差異跨越邊界,則會話分爲兩個會話。 結果是帶有新會話的新Data.Table。代碼有效,但非常慢(多個小時)。隨着時間的推移,速度會變慢。首先,我認爲這是循環內部不斷增加的列表,但我通過執行沒有結果列表的循環來檢查它。 我的代碼如下:
function(sessions) {
durationCalc <- function(timeList) {
last <-
strptime(timeList[[1]][length(timeList[[1]])], format = "%Y-%m-%dT%H:%M:%S")
first <-
strptime(timeList[[1]][length(1)], format = "%Y-%m-%dT%H:%M:%S")
res <- as.numeric(difftime(last, first, units = 'mins'))
}
id <- 1
border <- 30
maxCount <- nrow(sessions)
# list for the final sessions
finalSessions <- vector("list", maxCount)
# iterate over every session to break down into smaller sessions
for (i in 1:maxCount) {
print(paste("working on session", i, "of", maxCount))
currentStartPosition <- 1
row <- sessions[i, ]
sessionLength <- length(row$time[[1]])
# if the session containts only one path/timestamp, there is no further processing required
# if it contains two or more, each timestamp has to be checked.
if (sessionLength < 2) {
finalSessions[[id]] <- row
id <- id + 1
}
else{
currentTime <-
strptime(row$time[[1]][1], format = "%Y-%m-%dT%H:%M:%S")
for (j in 2:sessionLength) {
nextTime = strptime(row$time[[1]][j], format = "%Y-%m-%dT%H:%M:%S")
diff <-
as.numeric(difftime(nextTime, currentTime, units = 'mins'))
# if the timestamp is 30 minutes or more later the current sessions (row) gets split
if (diff > border) {
# make a copy of the original row and modify values, then add the modified row to the finalSessions
# the currentStartposition gets the currentTimestamp and the loop continues
currentSession <- row
currentSession$cookie = id
currentSession$time[[1]] <-
list(row$time[[1]][currentStartPosition:j - 1])
currentSession$paths[[1]] <-
list(row$paths[[1]][currentStartPosition:j - 1])
currentSession$durationInMin <-
durationCalc(currentSession$time)
currentSession$length <- length(currentSession$paths[[1]])
currentStartPosition = j
finalSessions[[id]] <- currentSession
id <- id + 1
}
# at last the currentTimestamp gets the next Time stamp, it iterates over the whole timestamp list
currentTime = nextTime
}
# after the loop the final session gets built. copy the original row, modify the values and add it to the finalSessions
currentSession <- row
currentSession$cookie = id
currentSession$time[[1]] <-
list(row$time[[1]][currentStartPosition:sessionLength])
currentSession$paths[[1]] <-
list(row$paths[[1]][currentStartPosition:sessionLength])
currentSession$durationInMin <-
durationCalc(currentSession$time)
currentSession$length <- length(currentSession$paths[[1]])
finalSessions[[id]] <- currentSession
id <- id + 1
}
}
finalSessions <- rbindlist(finalSessions)
}
你可以添加一些數據嗎? (https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example) – minem
使用lapply而不是循環和矢量化您的工作應該做的伎倆。 – YCR
@MārtiņšMiglinieks我添加了一個數據行 – webusag