2017-03-06 26 views
0

我在優化一個根據 data.frame 內的條件累加數字的循環時遇到問題。下面是輸入的 data.frame,它只包含從一個接近一百萬行的大數據集中取出的前幾行(問題標題:「根據 data.frame 內的條件累加的循環(大數據集)」):

inputData <- structure(list(SNP_pos = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L), .Label = c("SNP_1", "SNP_2", "SNP_3", "SNP_4", "SNP_5", "SNP_6", "SNP_7", "SNP_8", "SNP_9", "SNP_10", "SNP_11", "SNP_12", "SNP_13", "SNP_14"), class = "factor"), sample_id = callele1 = structure(c(2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 3L, 1L, 4L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 3L, 1L, 4L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 3L, 1L, 4L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 3L, 1L, 4L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 3L, 1L, 4L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 3L, 1L, 4L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 3L, 1L, 4L, 1L, 2L, 2L), .Label = c("A", "G", "T", "C"), class = "factor"), sample_id_x = c(8739L, 8739L, 8739L, 8739L, 8739L, 8739L, 8739L, 8739L, 8739L, 8739L, 8739L, 8739L, 8739L, 8739L, 8832L, 8832L, 8832L, 8832L, 8832L, 8832L, 8832L, 8832L, 8832L, 8832L, 8832L, 8832L, 8832L, 8832L, 8888L, 8888L, 8888L, 8888L, 8888L, 8888L, 8888L, 8888L, 8888L, 8888L, 8888L, 8888L, 8888L, 8888L, 9056L, 9056L, 9056L, 9056L, 9056L, 9056L, 9056L, 9056L, 9056L, 9056L, 9056L, 9056L, 9056L, 9056L, 9058L, 9058L, 9058L, 9058L, 9058L, 9058L, 9058L, 9058L, 9058L, 9058L, 9058L, 9058L, 9058L, 9058L, 9062L, 9062L, 9062L, 9062L, 9062L, 9062L, 9062L, 9062L, 9062L, 9062L, 9062L, 9062L, 9062L, 9062L, 9072L, 9072L, 9072L, 9072L, 9072L, 9072L, 9072L, 9072L, 9072L, 9072L, 9072L, 9072L, 9072L, 9072L, 9152L, 9152L), allele2 = structure(c(2L, 2L, 1L, 1L, 2L, 2L, 1L, 4L, 2L, 3L, 3L, 2L, 3L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 2L, 1L, 
1L, 2L, 2L, 4L, 4L, 2L, 3L, 3L, 2L, 3L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 3L, 2L, 4L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 3L, 2L, 3L, 1L, 1L, 2L, 2L, 1L, 3L, 2L, 4L, 4L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 3L, 2L, 3L, 1L, 1L, 2L), .Label = c("A", "G", "T", "C"), class = "factor"), snp_diff = c(0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0), IBS = c(1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1), IBD = c(1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1)), .Names = c("SNP_pos", "sample_id", "allele1", "sample_id_x", "allele2", "snp_diff", "IBS", "IBD"), row.names = c(NA, 100L), class = "data.frame")

以下是預期輸出的 data.frame:

outputData <- structure(list(SNP_pos = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L), .Label = c("SNP_1", "SNP_2", "SNP_3", "SNP_4", "SNP_5", "SNP_6", "SNP_7", "SNP_8", "SNP_9", "SNP_10", "SNP_11", "SNP_12", "SNP_13", "SNP_14"), class = "factor"), sample_id = callele1 = structure(c(2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 3L, 1L, 4L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 3L, 1L, 4L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 3L, 1L, 4L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 3L, 1L, 4L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 3L, 1L, 4L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 3L, 1L, 4L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 3L, 1L, 4L, 1L, 2L, 2L), .Label = c("A", "G", "T", "C"), class = "factor"), sample_id_x = c(8739L, 8739L, 8739L, 8739L, 8739L, 8739L, 8739L, 8739L, 8739L, 8739L, 8739L, 8739L, 8739L, 8739L, 8832L, 8832L, 8832L, 8832L, 8832L, 8832L, 8832L, 8832L, 8832L, 8832L, 8832L, 8832L, 8832L, 8832L, 8888L, 8888L, 8888L, 8888L, 8888L, 8888L, 8888L, 8888L, 8888L, 8888L, 8888L, 8888L, 8888L, 8888L, 9056L, 9056L, 9056L, 9056L, 9056L, 9056L, 9056L, 9056L, 9056L, 9056L, 9056L, 9056L, 9056L, 9056L, 9058L, 9058L, 9058L, 9058L, 9058L, 9058L, 9058L, 9058L, 9058L, 9058L, 9058L, 9058L, 9058L, 9058L, 9062L, 9062L, 9062L, 9062L, 9062L, 9062L, 9062L, 9062L, 9062L, 9062L, 9062L, 9062L, 9062L, 9062L, 9072L, 9072L, 9072L, 9072L, 9072L, 9072L, 9072L, 9072L, 9072L, 9072L, 9072L, 9072L, 9072L, 9072L, 9152L, 9152L), allele2 = structure(c(2L, 2L, 1L, 1L, 2L, 2L, 1L, 4L, 2L, 3L, 3L, 2L, 3L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 2L, 1L, 
1L, 2L, 2L, 4L, 4L, 2L, 3L, 3L, 2L, 3L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 3L, 2L, 4L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 3L, 2L, 3L, 1L, 1L, 2L, 2L, 1L, 3L, 2L, 4L, 4L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 3L, 2L, 3L, 1L, 1L, 2L), .Label = c("A", "G", "T", "C"), class = "factor"), snp_diff = c(0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0), IBS = c(1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1), IBD = c(NA, NA, 0, 1, 2, 3, 0, 1, 2, 3, 4, 0, 0, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 0, 0, 1, 1, 2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 1, 2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 0, 0, 0, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 0, 1, 0, 1)), .Names = c("SNP_pos", "sample_id", "allele1", "sample_id_x", "allele2", "snp_diff", "IBS", "IBD"), row.names = c(NA, 100L), class = "data.frame")

下面是我用來生成上述輸出的代碼:

# Asker's original (slow) approach: the loop body is a whole-column ifelse()
# that is re-run once per row of inputData; `i` is never used inside the body.
# Each pass rebuilds inputData$IBD from three nested tests:
#   1. IBD == 0                                        -> keep IBD
#   2. alleles equal, sample_id_x != shift(sample_id_x) -> keep IBD
#   3. alleles equal, sample_id_x == shift(sample_id_x) -> shift(IBD) + 1
#   otherwise                                          -> keep IBD
# NOTE(review): shift() is not defined in this snippet; presumably
# data.table::shift (lag by one row) -- confirm.
# NOTE(review): the `inputData$IBD<-...` assignments sit inside ifelse()
# arguments, so they run as side effects whenever those arguments are
# evaluated; the column is then overwritten by the outer assignment.
for (i in 1:nrow(inputData)) { inputData$IBD<-ifelse(inputData$IBD==0,inputData$IBD<-inputData$IBD,ifelse (inputData$allele1==inputData$allele2&inputData$sample_id_x!=shift(inputData$sample_id_x),inputData$IBD<-inputData$IBD,ifelse (inputData$allele1==inputData$allele2&inputData$sample_id_x==shift(inputData$sample_id_x),inputData$IBD<-shift(inputData$IBD)+1,inputData$IBD<-inputData$IBD))) }

  1. 第一個條件檢查 IBD 列是否等於 0;如果是,則 IBD 保持爲 0。
  2. 第二個條件檢查 allele1 列是否等於 allele2 列,並且 sample_id_x 是否不等於前一行(上面那一行)的 sample_id_x。如果滿足這個條件,IBD 應該保持不變。
  3. 最後,如果 allele1 == allele2 並且 sample_id_x 等於前一行(上面那一行)的 sample_id_x,則將 IBD 設爲前一行的 IBD 加 1,否則保持原樣。 上面的代碼可以工作,但運行時間很長,我需要一個比這個 for 循環更高效的替代方案。

請幫忙優化這段代碼,或提出一個更好的替代方案……

回答

0
# Vectorized running count of consecutive IBD matches.
# Reads the columns sample_id_x, allele1, allele2 and IBD of `inputData`.
# The final expression evaluates to one cumulative count per row; it is not
# assigned back to `inputData`.

n_rows <- nrow(inputData)
sample_ids <- inputData$sample_id_x

# TRUE where a row has the same sample_id_x as the row directly above it.
# Comparing the vector with a copy of itself offset by one row replaces a
# per-row sapply(). The first row has no predecessor, so it is always FALSE;
# indexing with seq_len() keeps the length right for inputs with fewer than
# two rows.
same_as_previous <- c(
  FALSE,
  tail(sample_ids, -1L) == head(sample_ids, -1L)
)[seq_len(n_rows)]

# First, flag (as 1/0) the rows where all sub-conditions of the third
# condition are met.
temp <- as.numeric(
  same_as_previous &                            # 1st sub-condition
    (inputData$allele1 == inputData$allele2) &  # 2nd sub-condition
    inputData$IBD != 0                          # 3rd sub-condition
)

# If the value in 'IBD' is zero, then temp2 = 0, otherwise 1.
# The parentheses only make the existing precedence explicit
# (`+` binds tighter than `!=`).
temp2 <- as.numeric((temp + inputData$IBD) != 0)

# Every time 'temp' is zero a new group starts; within each group the running
# sum of temp2 is the cumulative count.
ave(temp2, cumsum(temp == 0), FUN = cumsum)
+1

謝謝 @d.b。你真是救了我們,這個問題讓我們「用頭撞磚牆」撞了很久…… –