2013-04-26 49 views
10

我想要一個函數,它返回矢量匹配子序列的初始指示。例如:在較大的矢量中匹配序列

y <- c("a","a","a","b","c") 

multi_match(c("a","a"), y) 
# [1] 1 2 

multi_match(c("a","b"), y) 
# [1] 3 

我有一個粗略的實現,但我覺得我必須重新發明輪子,它有點笨重。有沒有更好的方法來實現這一點,還是有一個預先存在的功能在某處具有類似的功能?

multi_match <- function(x, table){ 
    # returns initial indicies of all substrings in table which match x 
    if(length(table) < length(x)){ 
     return(NA) 
    }else{ 
     check_mat <- matrix(nrow = length(x), ncol = length(table)) 
     for(i in 1:length(x)){ 
      check_mat[i,] <- table %in% x[i] 
     } 
     out <- vector(length = length(table)) 
     for(i in 1:(length(table)-(length(x)-1))){ 
      check <- vector(length=length(x)) 
      for(j in 1:length(x)){ 
       check[j] <- check_mat[j,(i+(j-1))] 
      } 
      out[i] <- all(check) 
     } 
     if(length(which(out))==0){ 
      return(NA) 
     }else{ 
      return(which(out)) 
     } 
    } 
} 
+1

有很多字符串中的Bioconductor的包'BioStrings'匹配功能,雖然這一點也適用「字符串',即'長度爲1的字符向量',而不是你擁有的向量。 http://www.bioconductor.org/packages/2.12/bioc/html/Biostrings.html – 2013-04-26 19:51:29

回答

16

嘗試rollapply動物園:

> library(zoo) 
> which(rollapply(y, 2, identical, c("a", "a"))) 
[1] 1 2 
> which(rollapply(y, 2, identical, c("a", "b"))) 
[1] 3 
+0

這很好! – JoFrhwld 2013-04-26 19:55:37

2
set.seed(0) 
a <- sample(1:6,12000, TRUE) 
b <- 2:4 

vecIn <- function(a,b){ 
which(
Reduce('+', lapply(seq_along(y <- lapply(b, '==', a)), function(x){ 
              y[[x]][x:(length(a) - length(b) +x)] 
              } 
       ) 
    ) == length(b) 
    ) 
} 

> vecIn(a,b) 
[1]  2 154 986 1037 1046 1257 1266 1750 2375 2677 3184 3206 
[13] 3499 3526 3882 4238 4311 4388 4437 4580 4714 4766 4827 5046 
[25] 5279 5629 6153 6842 6856 6919 7200 7516 7520 7707 7824 7859 
[37] 8140 8191 8687 9208 9281 9313 10022 10320 10617 10720 10958 11179 
[49] 11567 11591 11698 11811 

library(zoo) 
library(rbenchmark) 

func1 <- function(a,b){ 
gregexpr(paste0(b,collapse=""),paste0(a,collapse="")) 
} 

func2 <- function(a,b){ 
which(rollapply(a, length(b), identical, b)) 
} 

func3 <- vecIn 

一些基準

benchmark(func1(a,b), func2(a,b), func3(a,b)) 
     test replications elapsed relative user.self sys.self user.child 
1 func1(a, b)   100 0.673 5.904  0.680 0.000   0 
2 func2(a, b)   100 28.808 252.702 28.198 0.672   0 
3 func3(a, b)   100 0.114 1.000  0.116 0.000   0 
    sys.child 
1   0 
2   0 
3   0