ID  | Date  | Mode | Time 
------ | --------- | ------- | ----- 
1234 | 12/10/16 | Bus  | 120 
1234 | 12/10/16 | Bus  | 130 
1234 | 12/10/16 | Bus  | 290 
1234 | 12/10/16 | Train | 310 
1234 | 12/10/16 | Bus  | 330 
4567 | 12/10/16 | Bus  | 220 
4567 | 12/10/16 | Bus  | 230 
4567 | 13/10/16 | Bus  | 290 
4567 | 13/10/16 | Bus  | 450 
4567 | 14/10/16 | Train | 1000 


我想要創建第五列,用於標識旅程階段是否已鏈接,即第二段旅程是否鏈接到第一段旅程、第三段旅程是否鏈接到第二段旅程,依此類推(其中 1 = 已鏈接,0 = 未鏈接)。旅程被視為已鏈接須同時滿足以下條件:


  • 這些旅程屬於同一個人,並且發生在同一天

  • 兩段巴士旅程彼此相隔在 60 分鐘以內(因此,相隔在 60 分鐘以內的一段巴士旅程和一段火車旅程不會被鏈接)

  • 如果第 i+1 段旅程與第 i 段旅程已鏈接,則第 i+1 段旅程不能再鏈接到第 i+2 段旅程


ID  | Date  | Mode | Time | Linked 
------ | --------- | ------- | ----- | ----- 
1234 | 12/10/16 | Bus  | 120 | 0 
1234 | 12/10/16 | Bus  | 130 | 1 
1234 | 12/10/16 | Bus  | 290 | 0 
1234 | 12/10/16 | Train | 310 | 0 
1234 | 12/10/16 | Bus  | 330 | 0 
4567 | 12/10/16 | Bus  | 220 | 0 
4567 | 12/10/16 | Bus  | 230 | 1 
4567 | 13/10/16 | Bus  | 290 | 0 
4567 | 13/10/16 | Bus  | 450 | 0 
4567 | 14/10/16 | Train | 1000 | 0 



# Mark each journey stage as linked (1) or not linked (0) to the stage on the
# row directly above it.
# NOTE(review): assumes df is already ordered by ID, Date and Time -- confirm.
#
# A stage is linked when the previous row has the same ID, Date and Mode, the
# two stages are less than 60 minutes apart, and the previous stage is not
# itself linked (a stage linked backwards cannot also anchor the next one).

# Default every row to 0; this also covers row 1, which has no predecessor.
df$linked <- rep(0, nrow(df))

# seq_len(n)[-1] is 2..n and is empty when df has fewer than two rows.
for (i in seq_len(nrow(df))[-1]) {
  same_journey <- df$ID[i] == df$ID[i - 1] &&
    df$Date[i] == df$Date[i - 1] &&
    df$Mode[i] == df$Mode[i - 1]
  within_hour <- (df$Time[i] - df$Time[i - 1]) < 60
  if (same_journey && within_hour && df$linked[i - 1] == 0) {
    df$linked[i] <- 1
  }
}

# Add `linked` to DF. Rows are grouped by ID, Date and a run counter that
# increases every time Mode differs from the row above; inside each group the
# first row gets 0 and every later row gets 1 when it is less than 60 time
# units after the row before it, otherwise 0.
transform(
  DF,
  linked = ave(
    Time, ID, Date,
    cumsum(c(FALSE, tail(Mode, -1) != head(Mode, -1))),
    FUN = function(times) c(0, diff(times) < 60)
  )
)


# Self-join DF so that each row `a` is paired with the row `b` whose rowid is
# one less (the row directly above it). `linked` is 1 when both rows share ID,
# Date and Mode and a.Time is less than b.Time + 60, else 0. The left join
# leaves the first row without a partner; coalesce() turns the resulting NULL
# into 0.
# NOTE(review): presumably rowid follows the row order of DF -- confirm.
sqldf("select a.*, coalesce(a.ID = b.ID and 
          a.Date = b.Date and 
          a.Mode = b.Mode and 
          a.Time < b.Time + 60, 0) linked 
     from DF a left join DF b on a.rowid = b.rowid + 1") 



# Copy DF into a data.table and add `linked` by reference, one ID/Date group
# at a time. A row scores 1 when it is less than 60 time units after the row
# above it and has the same Mode as that row, otherwise 0.
dt <- as.data.table(DF)
dt[,
   # fill = -60 makes the first row of a group compare against 0; for the
   # first row the Mode test is always TRUE, so the time test decides.
   linked := (shift(Time, fill = -60) + 60 > Time) *
     (shift(Mode, fill = Mode[1]) == Mode),
   by = c("ID", "Date")]


# Flag each row that comes less than 60 time units after the previous row of
# the same ID and Date and uses the same Mode (1 = linked, 0 = not linked).
DF %>%
    group_by(ID, Date) %>%
    # as.numeric() lets the -Inf sentinel share the type of the lagged vector:
    # recent dplyr releases cast `default` to the type of `x`, and an integer
    # Time cannot hold -Inf. The sentinel keeps a group's first row at 0.
    mutate(linked = (Time < lag(as.numeric(Time), default = -Inf) + 60) *
        (Mode == lag(Mode, default = Mode[1]))) %>%
    # Finish the chain explicitly and drop the grouping so that later steps
    # operate on the whole table again.
    ungroup()



# Sample journey data as pipe-delimited text, one row per journey stage.
Lines <- 
"ID  | Date  | Mode | Time 
------ | --------- | ------- | ----- 
1234 | 12/10/16 | Bus  | 120 
1234 | 12/10/16 | Bus  | 130 
1234 | 12/10/16 | Bus  | 290 
1234 | 12/10/16 | Train | 310 
1234 | 12/10/16 | Bus  | 330 
4567 | 12/10/16 | Bus  | 220 
4567 | 12/10/16 | Bus  | 230 
4567 | 13/10/16 | Bus  | 290 
4567 | 13/10/16 | Bus  | 450 
4567 | 14/10/16 | Train | 1000" 
# Parse the text into a data frame: "|" separates the fields, blanks around
# each field are stripped, and the dashed ruler line is discarded by treating
# "-" as the comment character. as.is = TRUE keeps the Date and Mode columns
# as character vectors. The argument is spelled `comment.char` in full rather
# than relying on partial argument matching.
DF <- read.table(
  text = Lines, header = TRUE, sep = "|", strip.white = TRUE,
  comment.char = "-", as.is = TRUE
)



DF %>%
    # Journeys are compared only within the same person, day and mode of
    # transport.
    # NOTE(review): because Mode is a grouping key, a journey is compared with
    # the previous journey of the same mode even when another mode was used in
    # between -- confirm this is intended.
    group_by(ID, Date, Mode) %>%
    mutate(
      # TRUE when a journey is less than 60 time units after the previous
      # journey in its group; the first journey of a group is always FALSE.
      linked0 = c(FALSE, diff(Time) < 60),
      # Running count of the TRUE values of linked0 within the group.
      linkedsum = cumsum(linked0),
      # Rows whose running count is exactly 1 keep their linked0 value; every
      # other row is set to 0.
      # NOTE(review): this admits only the first link per group -- confirm
      # that matches the "i+1 cannot link to i+2" rule for longer groups.
      linked = ifelse(linkedsum != 1, 0, linked0))

     ID  Date Mode Time linked0 linkedsum linked 
    <int> <chr> <chr> <int> <lgl>  <int> <dbl> 
1 1234 12/10/16 Bus 120 FALSE   0  0 
2 1234 12/10/16 Bus 130 TRUE   1  1 
3 1234 12/10/16 Bus 290 FALSE   1  0 
4 1234 12/10/16 Train 310 FALSE   0  0 
5 1234 12/10/16 Bus 330 TRUE   2  0 
6 4567 12/10/16 Bus 220 FALSE   0  0 
7 4567 12/10/16 Bus 230 TRUE   1  1 
8 4567 13/10/16 Bus 290 FALSE   0  0 
9 4567 13/10/16 Bus 450 FALSE   0  0 
10 4567 14/10/16 Train 1000 FALSE   0  0 

若要在數據庫中執行此操作,請參閱 dplyr 的 database vignette。