2017-06-29 113 views
0

嗨我目前正在做Web使用率挖掘。爲此,我需要遍歷所有數據條目(204002行)(每行是包含時間戳和訪問頁面的Web會話),並對它們進行一些處理。這裏是數據的輸入:我的循環通過data.table的性能

structure(list(cookie = "1", 
    paths = list(c("LMCash", "LMCash", "LMCash", "LMCash", "LMCash", 
    "LMCash", "LMCash", "LMCash", "LMCash", "LMCash", "LMCash", 
    "LMCash", "LMCash", "LMCash", "LMCash", "LMCash", "LMCash", 
    "SYSTEM", "SYSTEM", "SYSTEM")), time = list(c("2017-05-01T00:00:00.000Z", 
    "2017-05-01T00:00:10.000Z", "2017-05-01T00:00:41.000Z", "2017-05-01T00:00:48.000Z", 
    "2017-05-01T00:03:28.000Z", "2017-05-01T00:03:40.000Z", "2017-05-01T00:03:53.000Z", 
    "2017-05-01T00:04:09.000Z", "2017-05-01T00:04:17.000Z", "2017-05-01T00:04:26.000Z", 
    "2017-05-01T00:04:30.000Z", "2017-05-01T00:04:34.000Z", "2017-05-01T00:04:40.000Z", 
    "2017-05-01T00:05:36.000Z", "2017-05-01T00:05:46.000Z", "2017-05-01T00:05:52.000Z", 
    "2017-05-01T00:06:00.000Z", "2017-05-01T00:06:38.000Z", "2017-05-01T00:06:57.000Z", 
    "2017-05-01T00:07:01.000Z")), length = 20L, durationInMin = 7.01666666666667), .Names = c("cookie", 
"paths", "time", "length", "durationInMin"), class = c("data.table", 
"data.frame"), row.names = c(NA, -1L), .internal.selfref = <pointer: 0x00000000001f0788>) 

我看看是否需要將會話分成兩個或多個會話。 爲此,我查看會話中的每個時間戳,並將它們與此會話中的上一個時間戳進行比較。如果差異跨越邊界,則會話分爲兩個會話。 結果是帶有新會話的新Data.Table。代碼有效,但非常慢(多個小時)。隨着時間的推移,速度會變慢。首先,我認爲這是循環內部不斷增加的列表,但我通過執行沒有結果列表的循環來檢查它。 我的代碼如下:

function(sessions) { 
     durationCalc <- function(timeList) { 
     last <- 
      strptime(timeList[[1]][length(timeList[[1]])], format = "%Y-%m-%dT%H:%M:%S") 
     first <- 
      strptime(timeList[[1]][length(1)], format = "%Y-%m-%dT%H:%M:%S") 
     res <- as.numeric(difftime(last, first, units = 'mins')) 
     } 




     id <- 1 
     border <- 30 
     maxCount <- nrow(sessions) 

     # list for the final sessions 
     finalSessions <- vector("list", maxCount) 

     # iterate over every session to break down into smaller sessions 
     for (i in 1:maxCount) { 
     print(paste("working on session", i, "of", maxCount)) 
     currentStartPosition <- 1 
     row <- sessions[i, ] 
     sessionLength <- length(row$time[[1]]) 

     # if the session containts only one path/timestamp, there is no further processing required 
    # if it contains two or more, each timestamp has to be checked. 
     if (sessionLength < 2) { 
      finalSessions[[id]] <- row 
      id <- id + 1 
     } 
     else{ 
      currentTime <- 
      strptime(row$time[[1]][1], format = "%Y-%m-%dT%H:%M:%S") 
      for (j in 2:sessionLength) { 
      nextTime = strptime(row$time[[1]][j], format = "%Y-%m-%dT%H:%M:%S") 
      diff <- 
       as.numeric(difftime(nextTime, currentTime, units = 'mins')) 
     # if the timestamp is 30 minutes or more later the current sessions (row) gets split 
      if (diff > border) { 
     # make a copy of the original row and modify values, then add the modified row to the finalSessions 
     # the currentStartposition gets the currentTimestamp and the loop continues 
       currentSession <- row 
       currentSession$cookie = id 
       currentSession$time[[1]] <- 
       list(row$time[[1]][currentStartPosition:j - 1]) 
       currentSession$paths[[1]] <- 
       list(row$paths[[1]][currentStartPosition:j - 1]) 
       currentSession$durationInMin <- 
       durationCalc(currentSession$time) 
       currentSession$length <- length(currentSession$paths[[1]]) 
       currentStartPosition = j 

       finalSessions[[id]] <- currentSession 
       id <- id + 1 

      } 
      # at last the currentTimestamp gets the next Time stamp, it iterates over the whole timestamp list 
      currentTime = nextTime 
      } 

     # after the loop the final session gets built. copy the original row, modify the values and add it to the finalSessions 
      currentSession <- row 
      currentSession$cookie = id 
      currentSession$time[[1]] <- 
      list(row$time[[1]][currentStartPosition:sessionLength]) 
      currentSession$paths[[1]] <- 
      list(row$paths[[1]][currentStartPosition:sessionLength]) 
      currentSession$durationInMin <- 
      durationCalc(currentSession$time) 
      currentSession$length <- length(currentSession$paths[[1]]) 
      finalSessions[[id]] <- currentSession 
      id <- id + 1 
     } 
     } 

     finalSessions <- rbindlist(finalSessions) 



    } 
+0

你可以添加一些數據嗎? (https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example) – minem

+0

使用lapply而不是循環和矢量化您的工作應該做的伎倆。 – YCR

+0

@MārtiņšMiglinieks我添加了一個數據行 – webusag

回答

0

試試這個:

sessions <- structure(list(cookie = "1", 
       paths = list(c("LMCash", "LMCash", "LMCash", "LMCash", "LMCash", 
           "LMCash", "LMCash", "LMCash", "LMCash", "LMCash", 
           "LMCash", 
           "LMCash", "LMCash", "LMCash", "LMCash", "LMCash", 
           "LMCash", 
           "SYSTEM", "SYSTEM", "SYSTEM")), 
       time = list(c(
"2017-05-01T00:00:00.000Z", 
"2017-05-01T00:00:10.000Z", 
"2017-05-01T00:00:41.000Z", 
"2017-05-01T00:00:48.000Z", 
"2017-05-01T00:03:28.000Z", 
"2017-05-01T00:03:40.000Z", 
"2017-05-01T00:03:53.000Z", 
"2017-05-01T00:04:09.000Z", 
"2017-05-01T00:04:17.000Z", 
"2017-05-01T00:04:26.000Z", 
"2017-05-01T00:04:30.000Z", 
"2017-05-01T00:04:34.000Z", 
"2017-05-01T00:04:40.000Z", 
"2017-05-01T00:05:36.000Z", 
"2017-05-01T00:05:46.000Z", 
"2017-05-01T00:05:52.000Z", 
"2017-05-01T00:06:00.000Z", 
"2017-05-01T00:06:38.000Z", 
"2017-05-01T00:06:57.000Z", 
"2017-05-01T00:40:01.000Z")), 
length = 20L, 
durationInMin = 7.01666666666667), .Names = c("cookie", 
"paths", "time", "length", "durationInMin"), 
class = c("data.table", "data.frame"), 
row.names = c(NA, -1L)) 


s <- replicate(1000, sessions, simplify = F) 
# str(s) 
s <- rbindlist(s) 


ff <- function(s) { 

    dFormat <- "%Y-%m-%dT%H:%M:%S" 
    durationCalc2 <- function(timeList) { 
    tt <- timeList 
    (tt[length(tt)] - tt[1])/60 
    } 

    id <- 1 
    border <- 30 
    maxCount <- nrow(s) 

    finalSessions <- vector("list", maxCount) 

    for (i in 1:maxCount) { 
    # print(paste("working on session", i, "of", maxCount)) 
    cSP <- 1 
    row <- s[i, ] 
    TIME <- row$time[[1]] 
    PATHS <- row$paths[[1]] 
    sessionLength <- length(TIME) 
    TIMES <- strptime(TIME, format = dFormat) 
    TIMES <- as.numeric(TIMES) 

    if (sessionLength < 2) { 
     finalSessions[[id]] <- row 
     id <- id + 1 
    } else{ 
     # currentTime <- strptime(TIME[1], format = dFormat) 
     cT2 <- TIMES[1] 
     for (j in 2:sessionLength) { 
     # nextTime <- strptime(TIME[j], format = dFormat) 
     nT2 <- TIMES[j] 
     # diff <- as.numeric(difftime(nextTime, currentTime, units = 'mins')) 
     diff <- (nT2 - cT2) /60 

     if (diff > border) { 
      cS <- row 
      cS$cookie = id 
      index <- cSP:j - 1 
      cS$time[[1]] <- list(TIME[index]) 
      cS$paths[[1]] <- list(PATHS[index]) 
      cS$durationInMin <- durationCalc2(TIMES[index]) 
      cS$length <- length(cS$paths[[1]]) 
      cSP <- j 
      finalSessions[[id]] <- cS 
      id <- id + 1 
     } 
     cT2 <- nT2 
     } 
     cS <- row 
     cS$cookie = id 
     cS$time[[1]] <- list(TIME[cSP:sessionLength]) 
     cS$paths[[1]] <- list(PATHS[cSP:sessionLength]) 
     newTIMES <- TIMES[cSP:sessionLength] 
     cS$durationInMin <- durationCalc2(newTIMES) 
     cS$length <- length(cS$paths[[1]]) 
     finalSessions[[id]] <- cS 
     id <- id + 1 
    } 
    } 
    finalSessions <- rbindlist(finalSessions) 
    finalSessions 
} 

應該是近似快2倍:

system.time(rez1 <- yourFunction(s)) #5.81 
system.time(rez2 <- ff(s)) # 2.74 

2.58/5.81 

all.equal(rez1, rez2) 

下一次,嘗試分析你的代碼(如果您會做到這一點,那麼你會看到,difftime是緩慢的,它可能會加快)。 此外,您提供的示例數據不好,它沒有包含所有代碼執行的示例!