2013-10-23 90 views
4

我有一個矩陣「mat」,其中012編碼SNPs爲列,人爲行。例如:根據第二個數據集和拆分列重新編碼

> mat<-matrix(c("0","1","0","1","2","0","1","1","2"),3,byrow=T) 
> rownames(mat)<-c("ID1","ID2","ID3") 
> colnames(mat)<-c("rs123","rs333","rs9000") 

> mat 

    rs123 rs333 rs9000 
ID1 "0" "1" "0" 
ID2 "1" "2" "0" 
ID3 "1" "1" "2" 

在一個不同的矩陣「MAT2」我有兩列(即主要和次要等位基因)和的SNP爲行相應的等位基因。

> mat2<-matrix(c("A","T","C","T","T","G"),3,byrow=T) 
> rownames(mat2)<-c("rs123","rs333","rs9000") 
> colnames(mat2)<-c("Allele_A","Allele_B") 

> mat2 

     Allele_A Allele_B 
rs123  "A"  "T"  
rs333  "C"  "T"  
rs9000  "T"  "G" 

現在我要重新編碼從第一矩陣012編碼的單核苷酸多態性是在兩列:他們應該是各自等位基因A有兩個新列,如果他們的代碼是零,A/B,如果它是一個和B/B如果是兩個。在我的例子中,我想獲得以下內容:

> mat3<-matrix(c("A","C","T","A","T","T","A","T","T","T","T","T","A","C","G","T","T","G"),3,byrow=T) 
> rownames(mat3)<-c("ID1","ID2","ID3") 
> colnames(mat3)<-c("rs123_1","rs333_1","rs9000_1","rs123_2","rs333_2","rs9000_2") 

> mat3 

    rs123_1 rs333_1 rs9000_1 rs123_2 rs333_2 rs9000_2 
ID1  "A"  "C"  "T"  "A"  "T"  "T"  
ID2  "A"  "T"  "T"  "T"  "T"  "T"  
ID3  "A"  "C"  "G"  "T"  "T"  "G" 

你能幫我實現嗎?先謝謝你!

回答

4
library(reshape2) 
library(data.table) 

mat<-data.table(matrix(c("ID1","0","1","0","ID2","1","2","0","ID3","1","1","2"),3,byrow=T)) 
setnames(mat, c("ID","rs123","rs333","rs9000")) 

mat2<-data.table(matrix(c("rs123","A","T","rs333","C","T","rs9000","T","G"),3,byrow=T)) 
setnames(mat2, c("rs","Allele_A","Allele_B")) 

mat <- data.table(melt(mat, id.vars = 'ID')) 
setnames(mat,'variable','rs') 

mat <- merge(mat,mat2, by = 'rs', all.x = TRUE) 
mat[,a1 := Allele_A] 
mat[value == 2,a1 := Allele_B] 
mat[,a2 := Allele_B] 
mat[value == 0,a2 := Allele_A] 

mat1 <- dcast(mat, ID~rs, value.var = 'a1') 
mat2 <- dcast(mat, ID~rs, value.var = 'a2') 
mat <- merge(mat1,mat2,by = 'ID', suffixes = c('_1','_2')) 

輸出

> mat 
    ID rs123_1 rs333_1 rs9000_1 rs123_2 rs333_2 rs9000_2 
1 ID1  A  C  T  A  T  T 
2 ID2  A  T  T  T  T  T 
3 ID3  A  C  G  T  T  G 
+0

完美,這爲我做了這份工作! – VGaertner

2
繼承人

另一個data.table溶液

library(data.table) 
DT.mat1 <- as.data.table(mat) 
DT.mat2 <- as.data.table(mat2, keep.rownames=TRUE) 

## Make sure all characters 
DT.mat2[, names(DT.mat2) := lapply(.SD, as.character)] 

# set key to the row names 
setkey(DT.mat2, rn) 

DT.mat1[, setNames(nm=paste(names(.SD), "_", rep(1:2, times=ncol(.SD))), 
       unlist(lapply(names(.SD), function(x) { 
         .sdx <- .SD[[x]] 
         DT.mat2[.(x)][, list(ifelse(.sdx==2, Allele_B, Allele_A), 
              ifelse(.sdx==0, Allele_B, Allele_A))] 
       }), recursive=FALSE, use.names=FALSE) 
)] 

rs123 _ 1 rs333 _ 2 rs9000 _ 1 rs123 _ 2 rs333 _ 1 rs9000 _ 2 
1:   A   T   C   C   T   G 
2:   A   A   T   C   T   G 
3:   A   A   C   C   G   T 

您可以使用microbenchmark(來自microbenchmark軟件包)來計時。我不知道你的實際數據有多大,但我估計隨着數據的增長,這將是相當快

+0

和最不可讀的獎去......;) –

+0

它遍歷'.SD'(mat1 cols)的名稱並將它們用作mat2的關鍵字。然後兩個簡單的'ifelse'語句......顯然是 –

+0

! +1爲聰明。缺乏可讀性似乎是嵌套的'data.table'解決方案的一個共同特徵 - 有些人可能會說,對於速度增益而言是一個公平的價格。 –

0

相當少比提出Codoremifa,作爲一個功能:

allele_count_to_genotype <- function(M_count, M_geno){ 

    # assuming same order of SNPs in both matrices! 

    # generate empty target matrix with right col and row names 
    M_target <- matrix(ncol=ncol(M_count)*2, nrow=nrow(M_count)) 
    rownames(M_target) <- rownames(M_count) 
    colnames(M_target) <- c(paste(colnames(M_count),"1",sep="_"), 
          paste(colnames(M_count),"2",sep="_")) 

    # fill the target matrix with counts 
    M_target[,1:ncol(M_count)]     <- M_count 
    M_target[, (ncol(M_count)+1):ncol(M_target)] <- M_count 

    # substitute the counts 
    for(k in 1:ncol(M_count)){ 
     M_target[,k] <- gsub("0", M_geno[k,1], M_target[,k]) 
     M_target[,k] <- gsub("1", M_geno[k,1], M_target[,k]) 
     M_target[,k] <- gsub("2", M_geno[k,2], M_target[,k]) 
     M_target[,(ncol(M_count)+k)] <- gsub("0", M_geno[k,1], M_target[,(ncol(M_count)+k)]) 
     M_target[,(ncol(M_count)+k)] <- gsub("1", M_geno[k,2], M_target[,(ncol(M_count)+k)]) 
     M_target[,(ncol(M_count)+k)] <- gsub("2", M_geno[k,2], M_target[,(ncol(M_count)+k)]) 
    } 

    return(M_target) 
} 

效果很好:

> allele_count_to_genotype(M_count=mat, M_geno=mat2) 
    rs123_1 rs333_1 rs9000_1 rs123_2 rs333_2 rs9000_2 
ID1 "A"  "C"  "T"  "A"  "T"  "T"  
ID2 "A"  "T"  "T"  "T"  "T"  "T"  
ID3 "A"  "C"  "G"  "T"  "T"  "G" 
1

下面是使用stackswitch一個基礎R方式,並unstack

s.mat <- stack(data.frame(mat)) 
left.allele.choice <- function(x) switch(x, "0"=1, "1"=1, "2"=2) 
right.allele.choice <- function(x) switch(x, "0"=1, "1"=2, "2"=2) 
get.allele.choice <- function(allele.choice) { 
    mapply(function(values, ind) mat2[ind, allele.choice(values)], 
      values=s.mat$values, ind=s.mat$ind) 
} 
left.side <- unstack(transform(s.mat, 
           values=get.allele.choice(left.allele.choice))) 
right.side <- unstack(transform(s.mat, 
           values=get.allele.choice(right.allele.choice))) 
result <- cbind(left.side, right.side) 
colnames(result) <- make.unique(colnames(result)) 
# rs123 rs333 rs9000 rs123.1 rs333.1 rs9000.1 
# 1  A  C  T  A  T  T 
# 2  A  T  T  T  T  T 
# 3  A  C  G  T  T  G