# Gene layout table: one "<gene> - Characters <start>-<end>" record per line.
SampX <- textConnection("CCT6 - Characters 1-33
GAT1 - Characters 34-68
IMD3 - Characters 69-99
PDR3 - Characters 100-130
RIM15 - Characters 131-168")
# Splitting on "-" gives V1 = gene name, V2 = "Characters <start>", V3 = <end>.
# strip.white trims the blanks around each field, so gene names do not keep
# the trailing space that precedes the first "-".
dfSampX <- read.table(SampX, sep = "-", strip.white = TRUE,
                      stringsAsFactors = FALSE)
# read.table() does not close a connection that was already open when passed in.
close(SampX)
# V4 = numeric start position, i.e. V2 with its "Characters " label removed.
dfSampX$V4 <- as.numeric(sub("Characters ", "", dfSampX$V2))
# Sample table: an ID column and the full sequence string for each sample.
# The Data column is forced to character so that a sequence made only of
# digits is not type-converted to a number (which would drop leading zeros).
sampdat <- read.table(text = "Sample Data
1 000000000000000000000000000N01000000000000N0N000000000N00N0000NN00N0N000000100000N00N0N0000000NNNN011111111111111111111111111111110000000000000000000N000000N0000000000N
2 000000000000000000000000000N01000000000000N0N000000000N00N0000NN00N0N000000100000N00N0N0000000NNNN011111111111111111111111111111110000000000000000000N000000N0000000000N
", header = TRUE, colClasses = c(NA, "character"), stringsAsFactors = FALSE)
以下代碼會把每個樣本的字符串按基因切分成片段:
# Cut every sample's sequence into one fragment per gene. The result is a
# character matrix with one row per sample and one column per gene (a plain
# character vector when there is only one sample).
# vapply() walks the gene rows and reads the start (V4) and end (V3) columns
# directly, instead of pushing the data frame through apply(), which first
# coerces it to a matrix; the declared FUN.VALUE also pins the result type.
vapply(
  seq_len(nrow(dfSampX)),
  function(i) substr(sampdat$Data, dfSampX$V4[i], dfSampX$V3[i]),
  character(nrow(sampdat))
)
[,1] [,2]
[1,] "000000000000000000000000000N01000" "000000000N0N000000000N00N0000NN00N0"
[2,] "000000000000000000000000000N01000" "000000000N0N000000000N00N0000NN00N0"
[,3] [,4]
[1,] "N000000100000N00N0N0000000NNNN0" "1111111111111111111111111111111"
[2,] "N000000100000N00N0N0000000NNNN0" "1111111111111111111111111111111"
[,5]
[1,] "0000000000000000000N000000N0000000000N"
[2,] "0000000000000000000N000000N0000000000N"
下面這段代碼會以列表格式給出這些片段:
# Build one character vector of gene fragments per sample.
# substring() is vectorised over the start (V4) and end (V3) positions, so a
# single call per sample yields all fragments without a row-wise apply() over
# the data frame.
res <- lapply(sampdat$Data, function(seq_string) {
  substring(seq_string, dfSampX$V4, dfSampX$V3)
})
# Label each fragment with its gene name (V1); list order follows sampdat rows.
res2 <- lapply(res, setNames, nm = as.character(dfSampX$V1))
res2
[[1]]
CCT6 GAT1
"000000000000000000000000000N01000" "000000000N0N000000000N00N0000NN00N0"
IMD3 PDR3
"N000000100000N00N0N0000000NNNN0" "1111111111111111111111111111111"
RIM15
"0000000000000000000N000000N0000000000N"
[[2]]
CCT6 GAT1
"000000000000000000000000000N01000" "000000000N0N000000000N00N0000NN00N0"
IMD3 PDR3
"N000000100000N00N0N0000000NNNN0" "1111111111111111111111111111111"
RIM15
"0000000000000000000N000000N0000000000N"
而且能獲得指定的輸出格式:
# Print the fragments of every sample as "<gene>  -  <fragment>" lines under a
# "Sample <id>" heading.
for (samp in seq_along(res2)) {
  # Label with the ID stored in sampdat rather than the list position, so the
  # heading stays correct when sample IDs are not simply 1, 2, 3, ...
  cat("Sample ", sampdat$Sample[samp], "\n")
  # Iterate over however many genes dfSampX holds instead of a fixed 1:5.
  # invisible() keeps the list of NULLs returned by cat() from being printed.
  invisible(lapply(seq_len(nrow(dfSampX)), function(y) {
    cat(as.character(dfSampX$V1[y]), " - ", res2[[samp]][y], "\n")
  }))
}
Sample 1
CCT6 - 000000000000000000000000000N01000
GAT1 - 000000000N0N000000000N00N0000NN00N0
IMD3 - N000000100000N00N0N0000000NNNN0
PDR3 - 1111111111111111111111111111111
RIM15 - 0000000000000000000N000000N0000000000N
Sample 2
CCT6 - 000000000000000000000000000N01000
GAT1 - 000000000N0N000000000N00N0000NN00N0
IMD3 - N000000100000N00N0N0000000NNNN0
PDR3 - 1111111111111111111111111111111
RIM15 - 0000000000000000000N000000N0000000000N
這裡需要使用 `invisible` 來抑制列表結構中返回的 NULL 被印出。
這很好用!我只是不知道如何保存最終數據,以便以後可以再次訪問它。 –
你可以打開一個文件連接,並使用帶有 `con =` 參數的 `writeLines`;或者也可以使用 `save(strs, file = "strpieces.rda")`。 –
現在用這個代碼運行的一個問題是它從最終結構中的數據中分離出原始樣本ID號。在我的例子中,樣本從1開始依次出現。但是,在我的實際數據集中,情況並非如此。我怎樣才能保持連接,以便最終的輸出將具有原始數據表中附加到分解字符串的任何樣本? –