2016-01-27 60 views
0

我正在循環中對遺傳標記的數據幀進行'B等位基因頻率'(BAF)計算。基於BAF概念背後的數學邏輯,我試圖編寫一個代碼來執行它,但是效率太低。BAF計算循環的替代

我輸入:

theta <- 'Probe sample1 sample2 sample3 sample4 sample5 AAm ABm BBm 
     AX-1 0.674 0.756 0.694 0.671 0.754 0.167 0.281 0.671 
     AX-2 0.117 0.907 0.501 0.904 0.548 0.116 0.506 0.903 
     AX-3 0.068 0.075 0.071 0.208 0.038 0.06 0.445 0.846' 
theta <- read.table(text=theta, header=T) 

我的腳本:

theta.split <- split(theta, 1:nrow(theta)) 

for(k in 1:length(theta.split)){ 
    thetax <- as.data.frame(theta.split[[k]]) 
    for(i in 2:(ncol(thetax)-3)){ 
    if(as.numeric(as.character(thetax[1,i])) < as.numeric(as.character(thetax$AAm))){ 
    thetax[1,i] <- 0} 

    if(as.numeric(as.character(thetax[1,i])) >= as.numeric(as.character(thetax$AAm)) && as.numeric(as.character(thetax[1,i])) < as.numeric(as.character(thetax$ABm))){ 
     thex <- as.numeric(as.character(thetax[1,i])) 
     theAA <- as.numeric(as.character(thetax$AAm)) 
     theAB <- as.numeric(as.character(thetax$ABm)) 
     bafx <- ((0.5)*(thex - theAA))/(theAB - theAA) 
     thetax[1,i] <- bafx} 

    if(as.numeric(as.character(thetax[1,i])) >= as.numeric(as.character(thetax$ABm)) && as.numeric(as.character(thetax[1,i])) < as.numeric(as.character(thetax$BBm))){ 
     thex <- as.numeric(as.character(thetax[1,i])) 
     theAB <- as.numeric(as.character(thetax$ABm)) 
     theBB <- as.numeric(as.character(thetax$BBm)) 
     bafx <- 0.5 + ((0.5)*(thex-theAB)/(theBB-theAB)) 
     thetax[1,i] <- bafx} 

    if(as.numeric(as.character(thetax[1,i])) >= as.numeric(as.character(thetax$BBm))){ 
     thetax[1,i] <- 1} 

    } 
    theta[k,] <- thetax 
} 
out <- theta 

我的預期輸出:

out <- 'Probe sample1 sample2 sample3 sample4  sample5  AAm  ABm BBm 
     AX-1 1.000 1.000 1.000 1.000 1.000 0.167 0.281 0.671 
     AX-2 0.001 1.000 0.493 1.000 0.552 0.116 0.506 0.903 
     AX-3 0.010 0.019 0.014 0.192 0.000 0.06 0.445 0.846' 
out <- read.table(text=out, header=T) 

我將不勝感激您的任何想法,使這個代碼更聰明。

回答

1

您可以利用apply和vectorised計算來避免循環。下面以剛剛超過三分之一的時間:

library(dplyr) 

#Take main code in your loops out as a function 
#Using vectorised logical calcs instead of if statements 
#sampleVec will be a vector and thetaDf will be the original theta dataframe 
bafxFn <- function(sampleVec, thetaDf) { 

    testAAm <- sampleVec < thetaDf$AAm 
    sampleVec <- sampleVec * (1 - testAAm) 

    testAAmABm <- (sampleVec >= thetaDf$AAm) * (sampleVec < thetaDf$ABm) 
    bafx <- ((0.5) * (sampleVec - thetaDf$AAm))/(thetaDf$ABm - thetaDf$AAm) 
    sampleVec <- testAAmABm * bafx + (1 - testAAmABm) * sampleVec 

    testABmBBm <- (sampleVec >= thetaDf$ABm) * (sampleVec < thetaDf$BBm) 
    bafx <- 0.5 + ((0.5) * (sampleVec - thetaDf$ABm))/(thetaDf$BBm - thetaDf$ABm) 
    sampleVec <- testABmBBm * bafx + (1 - testABmBBm) * sampleVec 

    testBBm <- sampleVec >= thetaDf$BBm 
    sampleVec <- testBBm * 1 + (1 - testBBm) * sampleVec 

    sampleVec 
} 

#Subset original data frame to just leave the sample columns (using dplyr's select function) 
sampleDf <- 
    theta %>% select(-Probe, -AAm, -ABm, -BBm) 

#Use apply to loop through columns of remaining data 
#passing columns in as vectors 
outSampleDf <- 
    sampleDf %>% 
    apply(2, bafxFn, thetaDf = theta) %>% 
    as.data.frame() 

#And then bind results back together (using dplyr's bind_cols) 
outDf <- 
    bind_cols(
    theta %>% select(Probe), 
    outSampleDf, 
    theta %>% select(AAm, ABm, BBm) 
) 

有可能是處理一些子集的一個更合適的方法,卻來概括它在你已經超過5條樣品列案。

outDf 
Source: local data frame [3 x 9] 

    Probe  sample1 sample2 sample3 sample4 sample5 AAm ABm BBm 
    (fctr)  (dbl)  (dbl)  (dbl)  (dbl)  (dbl) (dbl) (dbl) (dbl) 
1 AX-1 1.000000000 1.00000000 1.00000000 1.0000000 1.0000000 0.167 0.281 0.671 
2 AX-2 0.001282051 1.00000000 0.49358974 1.0000000 0.5528967 0.116 0.506 0.903 
3 AX-3 0.010389610 0.01948052 0.01428571 0.1922078 0.0000000 0.060 0.445 0.846