2014-03-27 37 views
9

我只是想弄清楚如何做兩個data.tables條件連接。有條件(不等式)加入data.table

dt2

dt2

我寫了一個sqldf條件加入給我,其開始或結束時間是其他的開始/結束時間內的電路。

sqldf("select dt2.start, dt2.finish, dt2.counts, dt1.id, dt1.circuit 
     from dt2 
     left join dt1 on (
      (dt2.start >= dt1.start and dt2.start < dt1.finish) or 
      (dt2.finish >= dt1.start and dt2.finish < dt1.finish) 
     )") 

result

這給了我正確的結果,但它是我大十歲上下的數據集太慢。

什麼是data.table沒有矢量掃描的方法?

這裏是我的數據:

dt1 <- data.table(structure(list(circuit = structure(c(2L, 1L, 2L, 1L, 2L, 3L, 
1L, 1L, 2L), .Label = c("a", "b", "c"), class = "factor"), start = structure(c(1393621200, 
1393627920, 1393628400, 1393631520, 1393650300, 1393646400, 1393656000, 
1393668000, 1393666200), class = c("POSIXct", "POSIXt"), tzone = ""), 
    end = structure(c(1393626600, 1393631519, 1393639200, 1393632000, 
    1393660500, 1393673400, 1393667999, 1393671600, 1393677000 
    ), class = c("POSIXct", "POSIXt"), tzone = ""), id = structure(1:9, .Label = c("1001", 
    "1002", "1003", "1004", "1005", "1006", "1007", "1008", "1009" 
    ), class = "factor")), .Names = c("circuit", "start", "end", 
"id"), class = "data.frame", row.names = c(NA, -9L))) 


dt2 <- data.table(structure(list(start = structure(c(1393621200, 1393624800, 1393626600, 
1393627919, 1393628399, 1393632000, 1393639200, 1393646399, 1393650299, 
1393655999, 1393660500, 1393666199, 1393671600, 1393673400), class = c("POSIXct", 
"POSIXt"), tzone = ""), end = structure(c(1393624799, 1393626600, 
1393627919, 1393628399, 1393632000, 1393639200, 1393646399, 1393650299, 
1393655999, 1393660500, 1393666199, 1393671600, 1393673400, 1393677000 
), class = c("POSIXct", "POSIXt"), tzone = ""), seconds = c(3599L, 
1800L, 1319L, 480L, 3601L, 7200L, 7199L, 3900L, 5700L, 4501L, 
5699L, 5401L, 1800L, 3600L), counts = c(1L, 1L, 0L, 1L, 2L, 1L, 
0L, 1L, 2L, 3L, 2L, 3L, 2L, 1L)), .Names = c("start", "end", 
"seconds", "counts"), row.names = c(1L, 3L, 4L, 5L, 6L, 7L, 8L, 
9L, 10L, 11L, 12L, 13L, 14L, 15L), class = "data.frame")) 
+0

你應該可以通過在data.table中滾動連接來做到這一點。 – Roland

+0

您可以嘗試按照sqldf主頁的速度添加索引。也可以嘗試使用MySQL而不是SQLite來查看是否更快。 –

+0

「dt2」是時間尺度的一個分區,'dt1'的時間沒有限制嗎? –

回答

8

使用新實施非等距加入功能在目前的開發人員版本,v1.9.7:

ans = dt1[dt2, on=.(start <= end, end > start), 
      .(i.start, i.end, counts, id, circuit, cndn = i.start < x.start & i.end >= x.end), 
      allow.cartesian=TRUE 
     ][!cndn %in% TRUE] 

條件start <= end, end >= start(注意>=箱子)將檢查如果兩個區間以任何方式重疊。一側的開放間隔由end > start部分(>而不是>=)完成。但它仍然還拿起類型的間隔:

  dt1: start=================end 
    dt2: start--------------------------------end ## start < start, end > end 

  dt1: start=================end 
       dt2: start----------end   ## end == end 

cndn列是檢查並刪除這些案件。希望這些情況不是很多,以免我們不必要地實現不需要的行。

請參閱開發版here的安裝說明。


PS:在這種情況下,解決方案不是那麼簡單,因爲我想的是,那是因爲該解決方案需要一個OR操作。有可能做兩個條件連接,然後將它們綁定在一起。

也許在某些時候,我們不得不考慮以更直接的方式將連接擴展到這些操作的可行性。

4

如果執行得更快,但在這裏是在數據表的方法了一槍不知道。我重塑dt1並使用findInterval來確定dt2中的時間與dt1中的時間對齊的位置。

dt1 <- data.table(structure(list(circuit = structure(c(2L, 1L, 2L, 1L, 2L, 3L, 
1L, 1L, 2L), .Label = c("a", "b", "c"), class = "factor"), start = structure(c(1393621200, 
1393627920, 1393628400, 1393631520, 1393650300, 1393646400, 1393656000, 
1393668000, 1393666200), class = c("POSIXct", "POSIXt"), tzone = ""), 
    end = structure(c(1393626600, 1393631519, 1393639200, 1393632000, 
    1393660500, 1393673400, 1393667999, 1393671600, 1393677000 
    ), class = c("POSIXct", "POSIXt"), tzone = ""), id = structure(1:9, .Label = c("1001", 
    "1002", "1003", "1004", "1005", "1006", "1007", "1008", "1009" 
    ), class = "factor")), .Names = c("circuit", "start", "end", 
"id"), class = "data.frame", row.names = c(NA, -9L))) 

dt2 <- data.table(structure(list(start = structure(c(1393621200, 1393624800, 1393626600, 
1393627919, 1393628399, 1393632000, 1393639200, 1393646399, 1393650299, 
1393655999, 1393660500, 1393666199, 1393671600, 1393673400), class = c("POSIXct", 
"POSIXt"), tzone = ""), end = structure(c(1393624799, 1393626600, 
1393627919, 1393628399, 1393632000, 1393639200, 1393646399, 1393650299, 
1393655999, 1393660500, 1393666199, 1393671600, 1393673400, 1393677000 
), class = c("POSIXct", "POSIXt"), tzone = ""), seconds = c(3599L, 
1800L, 1319L, 480L, 3601L, 7200L, 7199L, 3900L, 5700L, 4501L, 
5699L, 5401L, 1800L, 3600L), counts = c(1L, 1L, 0L, 1L, 2L, 1L, 
0L, 1L, 2L, 3L, 2L, 3L, 2L, 1L)), .Names = c("start", "end", 
"seconds", "counts"), row.names = c(1L, 3L, 4L, 5L, 6L, 7L, 8L, 
9L, 10L, 11L, 12L, 13L, 14L, 15L), class = "data.frame")) 

# > dt1 
    # circuit    start     end id 
# 1:  b 2014-02-28 16:00:00 2014-02-28 17:30:00 1001 
# 2:  a 2014-02-28 17:52:00 2014-02-28 18:51:59 1002 
# 3:  b 2014-02-28 18:00:00 2014-02-28 21:00:00 1003 
# 4:  a 2014-02-28 18:52:00 2014-02-28 19:00:00 1004 
# 5:  b 2014-03-01 00:05:00 2014-03-01 02:55:00 1005 
# 6:  c 2014-02-28 23:00:00 2014-03-01 06:30:00 1006 
# 7:  a 2014-03-01 01:40:00 2014-03-01 04:59:59 1007 
# 8:  a 2014-03-01 05:00:00 2014-03-01 06:00:00 1008 
# 9:  b 2014-03-01 04:30:00 2014-03-01 07:30:00 1009 

# > dt2 
        # start     end seconds counts 
# 1: 2014-02-28 16:00:00 2014-02-28 16:59:59 3599  1 
# 2: 2014-02-28 17:00:00 2014-02-28 17:30:00 1800  1 
# 3: 2014-02-28 17:30:00 2014-02-28 17:51:59 1319  0 
# 4: 2014-02-28 17:51:59 2014-02-28 17:59:59  480  1 
# 5: 2014-02-28 17:59:59 2014-02-28 19:00:00 3601  2 
# 6: 2014-02-28 19:00:00 2014-02-28 21:00:00 7200  1 
# 7: 2014-02-28 21:00:00 2014-02-28 22:59:59 7199  0 
# 8: 2014-02-28 22:59:59 2014-03-01 00:04:59 3900  1 
# 9: 2014-03-01 00:04:59 2014-03-01 01:39:59 5700  2 
# 10: 2014-03-01 01:39:59 2014-03-01 02:55:00 4501  3 
# 11: 2014-03-01 02:55:00 2014-03-01 04:29:59 5699  2 
# 12: 2014-03-01 04:29:59 2014-03-01 06:00:00 5401  3 
# 13: 2014-03-01 06:00:00 2014-03-01 06:30:00 1800  2 
# 14: 2014-03-01 06:30:00 2014-03-01 07:30:00 3600  1 

## reshapes dt1 from wide to long 
## puts start and end times into one column and sorts by time 
## this is so that you can use findInterval later 
dt3 <- dt1[,list(time = c(start,end)), by = "circuit,id"][order(time)] 
dt3[,ntvl := seq_len(nrow(dt3))] 
    # circuit id    time ntvl 
# 1:  b 1001 2014-02-28 16:00:00 1 
# 2:  b 1001 2014-02-28 17:30:00 2 
# 3:  a 1002 2014-02-28 17:52:00 3 
# 4:  b 1003 2014-02-28 18:00:00 4 
# 5:  a 1002 2014-02-28 18:51:59 5 
# 6:  a 1004 2014-02-28 18:52:00 6 
# 7:  a 1004 2014-02-28 19:00:00 7 
# 8:  b 1003 2014-02-28 21:00:00 8 
# 9:  c 1006 2014-02-28 23:00:00 9 
# 10:  b 1005 2014-03-01 00:05:00 10 
# 11:  a 1007 2014-03-01 01:40:00 11 
# 12:  b 1005 2014-03-01 02:55:00 12 
# 13:  b 1009 2014-03-01 04:30:00 13 
# 14:  a 1007 2014-03-01 04:59:59 14 
# 15:  a 1008 2014-03-01 05:00:00 15 
# 16:  a 1008 2014-03-01 06:00:00 16 
# 17:  c 1006 2014-03-01 06:30:00 17 
# 18:  b 1009 2014-03-01 07:30:00 18 

## map interval to id 
dt4 <- dt3[,list(ntvl = seq(from = min(ntvl), to = max(ntvl)-1), by = 1),by = "circuit,id"] 
setkey(dt4, ntvl) 
    # circuit id ntvl 
# 1:  b 1001 1 
# 2:  a 1002 3 
# 3:  a 1002 4 
# 4:  b 1003 4 
# 5:  b 1003 5 
# 6:  b 1003 6 
# 7:  a 1004 6 
# 8:  b 1003 7 
# 9:  c 1006 9 
# 10:  c 1006 10 
# 11:  b 1005 10 
# 12:  c 1006 11 
# 13:  b 1005 11 
# 14:  a 1007 11 
# 15:  c 1006 12 
# 16:  a 1007 12 
# 17:  c 1006 13 
# 18:  a 1007 13 
# 19:  b 1009 13 
# 20:  c 1006 14 
# 21:  b 1009 14 
# 22:  c 1006 15 
# 23:  b 1009 15 
# 24:  a 1008 15 
# 25:  c 1006 16 
# 26:  b 1009 16 
# 27:  b 1009 17 
    # circuit id ntvl 

## finds intervals in dt2 
dt2[,`:=`(ntvl_start = findInterval(start, dt3[["time"]], rightmost.closed = FALSE), 
    ntvl_end = findInterval(end, dt3[["time"]], rightmost.closed = FALSE))] 
        # start     end seconds counts ntvl_start ntvl_end 
# 1: 2014-02-28 16:00:00 2014-02-28 16:59:59 3599  1   1  1 
# 2: 2014-02-28 17:00:00 2014-02-28 17:30:00 1800  1   1  2 
# 3: 2014-02-28 17:30:00 2014-02-28 17:51:59 1319  0   2  2 
# 4: 2014-02-28 17:51:59 2014-02-28 17:59:59  480  1   2  3 
# 5: 2014-02-28 17:59:59 2014-02-28 19:00:00 3601  2   3  7 
# 6: 2014-02-28 19:00:00 2014-02-28 21:00:00 7200  1   7  8 
# 7: 2014-02-28 21:00:00 2014-02-28 22:59:59 7199  0   8  8 
# 8: 2014-02-28 22:59:59 2014-03-01 00:04:59 3900  1   8  9 
# 9: 2014-03-01 00:04:59 2014-03-01 01:39:59 5700  2   9  10 
# 10: 2014-03-01 01:39:59 2014-03-01 02:55:00 4501  3   10  12 
# 11: 2014-03-01 02:55:00 2014-03-01 04:29:59 5699  2   12  12 
# 12: 2014-03-01 04:29:59 2014-03-01 06:00:00 5401  3   12  16 
# 13: 2014-03-01 06:00:00 2014-03-01 06:30:00 1800  2   16  17 
# 14: 2014-03-01 06:30:00 2014-03-01 07:30:00 3600  1   17  18 

## joins, by start time, then by end time 
## the commented out lines may be a better alternative 
## if there are many NA values 
setkey(dt2, ntvl_start) 
dt_ans_start <- dt4[dt2, list(start,end,counts,id,circuit),nomatch = NA] 
# dt_ans_start <- dt4[dt2, list(start,end,counts,id,circuit),nomatch = 0] 
# dt_ans_start_na <- dt2[!dt4] 
setkey(dt2, ntvl_end) 
dt_ans_end <- dt4[dt2, list(start,end,counts,id,circuit),nomatch = NA] 
# dt_ans_end <- dt4[dt2, list(start,end,counts,id,circuit),nomatch = 0] 
# dt_ans_end_na <- dt2[!dt4] 

## bring them all together and remove duplicates 
dt_ans <- unique(rbind(dt_ans_start, dt_ans_end), by = c("start", "id")) 
dt_ans <- dt_ans[!(is.na(id) & counts > 0)] 
dt_ans[,ntvl := NULL] 
setkey(dt_ans,start) 
        # start     end counts id circuit 
# 1: 2014-02-28 16:00:00 2014-02-28 16:59:59  1 1001  b 
# 2: 2014-02-28 17:00:00 2014-02-28 17:30:00  1 1001  b 
# 3: 2014-02-28 17:30:00 2014-02-28 17:51:59  0 NA  NA 
# 4: 2014-02-28 17:51:59 2014-02-28 17:59:59  1 1002  a 
# 5: 2014-02-28 17:59:59 2014-02-28 19:00:00  2 1002  a 
# 6: 2014-02-28 17:59:59 2014-02-28 19:00:00  2 1003  b 
# 7: 2014-02-28 19:00:00 2014-02-28 21:00:00  1 1003  b 
# 8: 2014-02-28 21:00:00 2014-02-28 22:59:59  0 NA  NA 
# 9: 2014-02-28 22:59:59 2014-03-01 00:04:59  1 1006  c 
# 10: 2014-03-01 00:04:59 2014-03-01 01:39:59  2 1006  c 
# 11: 2014-03-01 00:04:59 2014-03-01 01:39:59  2 1005  b 
# 12: 2014-03-01 01:39:59 2014-03-01 02:55:00  3 1006  c 
# 13: 2014-03-01 01:39:59 2014-03-01 02:55:00  3 1005  b 
# 14: 2014-03-01 01:39:59 2014-03-01 02:55:00  3 1007  a 
# 15: 2014-03-01 02:55:00 2014-03-01 04:29:59  2 1006  c 
# 16: 2014-03-01 02:55:00 2014-03-01 04:29:59  2 1007  a 
# 17: 2014-03-01 04:29:59 2014-03-01 06:00:00  3 1006  c 
# 18: 2014-03-01 04:29:59 2014-03-01 06:00:00  3 1007  a 
# 19: 2014-03-01 04:29:59 2014-03-01 06:00:00  3 1009  b 
# 20: 2014-03-01 06:00:00 2014-03-01 06:30:00  2 1006  c 
# 21: 2014-03-01 06:00:00 2014-03-01 06:30:00  2 1009  b 
# 22: 2014-03-01 06:30:00 2014-03-01 07:30:00  1 1009  b 
        # start     end counts id circuit