我有幾個重疊的 SNP/個體的基因分型數據,想要對它們進行比較。正如你在下面的數據結構中看到的,e[1,2] 和 e[2,3] 是 NA。現在我想把 d[1,2](值爲 1)和 d[2,3](值爲 1)也替換爲 NA。
也就是說:根據另一個數據表中的 NA,替換一個數據表中的值。
# Example data frame `d`: 3 rows by 19 integer columns, every value 0, 1 or 2.
# Column names are numeric-looking ID strings (hence the backticks); row names
# are the strings "7:100038150_C", "7:100079759_T" and "7:100256942_A".
# `d` contains no NA; it is the table whose cells should be blanked out.
# NOTE(review): looks like dput() output -- the `.Names` argument repeats the
# names already given in list(); confirm before simplifying.
d <- structure(list(`100099681` = c(0L, 2L, 0L), `101666591` = c(1L, 1L, 0L), `102247652` = c(1L, 1L, 1L), `102284616` = c(0L, 1L, 0L), `103582612` = c(0L, 1L, 1L), `104344528` = c(2L, 1L, 0L), `105729734` = c(1L, 0L, 1L), `109897137` = c(0L, 0L, 2L), `112768301` = c(0L, 1L, 1L), `114724443` = c(1L, 1L, 1L), `114826164` = c(1L, 0L, 1L), `115358770` = c(0L, 2L, 0L), `115399788` = c(1L, 1L, 0L), `118669033` = c(0L, 1L, 1L), `118875482` = c(2L, 1L, 0L), `119366362` = c(0L, 2L, 0L), `119627971` = c(0L, 1L, 1L), `120295351` = c(0L, 2L, 0L), `120998030` = c(0L, 0L, 2L)), .Names = c("100099681", "101666591", "102247652", "102284616", "103582612", "104344528", "105729734", "109897137", "112768301", "114724443", "114826164", "115358770", "115399788", "118669033", "118875482", "119366362", "119627971", "120295351", "120998030"), row.names = c("7:100038150_C", "7:100079759_T", "7:100256942_A"), class = "data.frame")
> d
# 100099681 101666591 102247652 102284616 103582612 104344528 105729734 109897137 112768301 114724443 114826164 115358770 115399788 118669033 118875482 119366362 119627971 120295351 120998030
#7:100038150_C 0 1 1 0 0 2 1 0 0 1 1 0 1 0 2 0 0 0 0
#7:100079759_T 2 1 1 1 1 1 0 0 1 1 0 2 1 1 1 2 1 2 0
#7:100256942_A 0 0 1 0 1 0 1 2 1 1 1 0 0 1 0 0 1 0 2
# Example data frame `e`: same 3 x 19 shape, row names and column names as `d`.
# Values are integers 0, 1 or 2, except for two missing cells:
#   e[1, 2] (row "7:100038150_C", column "101666591") is NA
#   e[2, 3] (row "7:100079759_T", column "102247652") is NA
# These NA positions are the ones to be copied onto `d`.
e<- structure(list(`100099681` = c(1L, 1L, 0L), `101666591` = c(NA, 1L, 1L), `102247652` = c(0L, NA, 0L), `102284616` = c(1L, 1L, 0L), `103582612` = c(1L, 0L, 1L), `104344528` = c(1L, 0L, 1L), `105729734` = c(0L, 0L, 1L), `109897137` = c(1L, 1L, 0L), `112768301` = c(0L, 1L, 1L), `114724443` = c(0L, 2L, 0L), `114826164` = c(0L, 0L, 2L), `115358770` = c(0L, 0L, 2L), `115399788` = c(0L, 2L, 0L), `118669033` = c(0L, 0L, 2L), `118875482` = c(0L, 1L, 1L), `119366362` = c(2L, 1L, 0L), `119627971` = c(0L, 1L, 1L), `120295351` = c(0L, 2L, 0L), `120998030` = c(0L, 2L, 1L)), .Names = c("100099681", "101666591", "102247652", "102284616", "103582612", "104344528", "105729734", "109897137", "112768301", "114724443", "114826164", "115358770", "115399788", "118669033", "118875482", "119366362", "119627971", "120295351", "120998030"), row.names = c("7:100038150_C", "7:100079759_T", "7:100256942_A"), class = "data.frame")
> e
# 100099681 101666591 102247652 102284616 103582612 104344528 105729734 109897137 112768301 114724443 114826164 115358770 115399788 118669033 118875482 119366362 119627971 120295351 120998030
#7:100038150_C 1 NA 0 1 1 1 0 1 0 0 0 0 0 0 0 2 0 0 0
#7:100079759_T 1 1 NA 1 0 0 0 1 1 2 0 0 2 0 1 1 1 2 2
#7:100256942_A 0 1 0 0 1 1 1 0 1 0 2 2 0 2 1 0 1 0 1
因此,我預期的輸出是:
> expected_d
# 100099681 101666591 102247652 102284616 103582612 104344528 105729734 109897137 112768301 114724443 114826164 115358770 115399788 118669033 118875482 119366362 119627971 120295351 120998030
#7:100038150_C 0 NA 1 0 0 2 1 0 0 1 1 0 1 0 2 0 0 0 0
#7:100079759_T 2 1 NA 1 1 1 0 0 1 1 0 2 1 1 1 2 1 2 0
#7:100256942_A 0 0 1 0 1 0 1 2 1 1 1 0 0 1 0 0 1 0 2
到目前爲止,我做到了這一步:
# Locate every NA cell of `e` as (row, col) index pairs, one pair per row of `g`.
g <- which(is.na(e), arr.ind = TRUE)
> g
# row col
#7:100038150_C 1 2
#7:100079759_T 2 3
然後我嘗試用 apply 函數把這些位置替換爲「TEST」(或者說替換爲 NA):
# Attempted replacement: for each (row, col) pair in `g`, assign "TEST" to the
# matching cell.
# NOTE(review): the assignment happens inside the anonymous function, so it
# only modifies a function-local copy of `e`; the global `e` is left unchanged
# and `d` is never touched. apply() just returns the assigned value ("TEST")
# once per row of `g`, which is the output shown below.
# A vectorized alternative that meets the stated goal is `d[is.na(e)] <- NA`.
apply(g, 1, function(x){
e[x[1], x[2]] <- "TEST" }
)
#> apply(g, 1, function(x){ e[x[1], x[2]] <- "TEST" })
#7:100038150_C 7:100079759_T
# "TEST" "TEST"
我將在幾百萬行/列的數據上運行這段代碼,所以速度很重要。先謝謝大家 :)
哇。這很聰明。 –
同意 Steven 的說法:在只含 0/1/2 的數據集上效果非常好,但不適用於包含字符的列,會報錯:'Error in NA^(is.na(adf)) * tdf : non-numeric argument to binary operator'(執行中止)。 – Bas
@Bas您的示例沒有顯示任何非數字列。 – akrun