2016-03-02 101 views
1

這裏賦予功能到另一列的新列是我處理的數據的樣本數據幀。對於那些熟悉基因數據格式的人來說,它基本上是一個修改後的VCF文件。如果沒有,基本上每行都包含變體可能存在的基因組中位置的信息。通過在數據幀

samp <- structure(list(Chrom = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L), .Label = "chr12", class = "factor"), 
    Pos = c(8613204L, 8613412L, 8614238L, 8614506L, 8614652L, 
    8614669L, 8614768L, 8614951L, 8614986L, 8615225L, 8615809L, 
    8616149L, 8616392L), Ref = structure(c(1L, 1L, 4L, 3L, 3L, 
    3L, 2L, 3L, 2L, 4L, 2L, 4L, 3L), .Label = c("A", "C", "G", 
    "T"), class = "factor"), Alt = structure(c(3L, 2L, 2L, 1L, 
    1L, 1L, 3L, 1L, 1L, 3L, 4L, 2L, 4L), .Label = c("A", "C", 
    "G", "T"), class = "factor"), Info = c("AC=3913;AF=0.78135;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8357;AFR_AF=0.5779;EUR_AF=0.7366;SAS_AF=0.8466;AA=G|||;CSQ=G|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.1881", 
    "AC=4051;AF=0.808906;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8444;AFR_AF=0.6725;EUR_AF=0.7366;SAS_AF=0.8538;AA=C|||;CSQ=C|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.1881", 
    "AC=4021;AF=0.802915;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8415;AFR_AF=0.6558;EUR_AF=0.7376;SAS_AF=0.8466;AA=T|||;CSQ=C|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.7997", 
    "AC=3990;AF=0.796725;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8386;AFR_AF=0.6339;EUR_AF=0.7376;SAS_AF=0.8466;AA=A|||;CSQ=A|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.1881", 
    "AC=4069;AF=0.8125;AN=5008;NS=2504;DP=17188;EAS_AF=0.9921;AMR_AF=0.8487;AFR_AF=0.6528;EUR_AF=0.7714;SAS_AF=0.8599;AA=A|||;CSQ=A|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.0029", 
    "AC=4044;AF=0.807508;AN=5008;NS=2504;DP=-128;EAS_AF=0.9911;AMR_AF=0.8458;AFR_AF=0.6362;EUR_AF=0.7714;SAS_AF=0.8599;AA=G|||;CSQ=A|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.0029", 
    NA, NA, "AC=3795;AF=0.757788;AN=5008;NS=2504;DP=-128;EAS_AF=0.9653;AMR_AF=0.7954;AFR_AF=0.5651;EUR_AF=0.7167;SAS_AF=0.82;AA=c|||;CSQ=A|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.0029", 
    NA, "AC=4053;AF=0.809305;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8458;AFR_AF=0.6362;EUR_AF=0.7724;SAS_AF=0.8671;AA=C|||;CSQ=T|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.0029", 
    "AC=4076;AF=0.813898;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8473;AFR_AF=0.6528;EUR_AF=0.7724;SAS_AF=0.8671;AA=C|||;CSQ=C|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.0029", 
    "AC=4052;AF=0.809105;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8473;AFR_AF=0.6346;EUR_AF=0.7724;SAS_AF=0.8671;AA=T|||;CSQ=T|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.0029" 
    ), TG_rs = c("rs10770739", "rs10770740", "rs4883148", "rs4883149", 
    "rs4883150", "rs4883151", NA, NA, "rs7303948", NA, "rs4242889", 
    "rs4883154", "rs4242890")), row.names = c(NA, -13L), .Names = c("Chrom", 
"Pos", "Ref", "Alt", "Info", "TG_rs"), class = "data.frame") 

我想要做的是從「信息」列中提取值。但是,此列中包含的信息對於每行都不相同,並不總是以相同的順序出現。因此,我想使用模式匹配來獲取我感興趣的值。

我寫了一個小函數來提取包含在Info列中的各種「超級種羣」(例如AMR,AFR,EUR,SAS,EAS)的「等位基因頻率」(AF)。

extractAF <- function(pop, vec) { 
    info <- unlist((strsplit(vec, ";", fixed=TRUE))) 
    AF <- as.numeric(unlist(strsplit((info[grep(pop, (unlist((strsplit(vec, ";", fixed=TRUE)))))]), "=", fixed=TRUE))[2]) 
    return(AF) 
} 

此功能需要兩個參數:「流行」,這是一個字符串,指定超級人口以提取,並且其目的是把我的數據幀的信息欄的「VEC」。

extractAF("AFR_AF", samp[1,'Info']) 
#[1] 0.5779 

extractAF("AFR_AF", samp[5,'Info']) 
#[1] 0.6528 

不過,我希望它做的數據幀中的每一行,並創建包含數據的新列:通過一個單一的載體,通過當

功能按預期工作。當我使用dplyr的功能發生變異,我風與相同值的列:

library("dplyr") 
mutate(samp, AFR_AF = extractAF("AFR_AF", Info)) 

我看了一個帖子(我似乎現在不能找到,否則我會引用它),上述發生變異一次傳遞所有行,而不是我需要的逐行。

所以,我想下面的幾個其他的方式在此基礎上post

apply(samp[,'Info'], 1, function(x) extractAF("AFR_AF", x)) 

錯誤申請(SAMP [ 「信息」],1,函數(X)extractAF( 「AMR_AF」,X) ): 暗淡(X)必須有一個正長度

samp[, extractAF("AMR_AF", Info), by = .I] 

錯誤[.data.frame(SAMP,extractAF( 「AMR_AF」,信息),通過= .I): 未使用的參數(由= .I)

samp[, extractAF("AMR_AF", Info), by = 1:nrow(samp)] 

Error in `[.data.frame`(samp, , extractAF("AMR_AF", Info), by = 1:nrow(samp)) : 
    unused argument (by = 1:nrow(samp)) 

UPDATE

一個包含NA和AF = 0在下面的INFO列附加樣本數據集:

結構(列表(CHROM = C( 「CHR1」,「CHR1 (「rs6429774」,「rs6429776」,「chr1」,「chr1」,「chr1」, 「chr1」),POS = c(16090898L,16091074L,16091583L,16092212L, 16093560L,16093639L) NA, 「rs74528955」,「rs904912」,NA),REF = c(「G」,「A」,「T」,「C」,「T」,「C」), ALT = c(「A 「,」G「,」A「,」T「,」 A「,」T「),QUAL = c(NA,NA,NA,NA,NA, NA),FILTER = c(NA,NA,NA,NA,NA,NA),INFO = = 1606; AF = 0.320687; AN = 5008; NS = 2504; DP = 21565; EAS_AF = 0.1419; AMR_AF = 0.2983; AFR_AF = 0.525; EUR_AF = 0.3509; SAS_AF = 0.2137; AA = G |||; CSQ = A | ENSG00000162458 | ENST00000441801 | Transcript | upstream_gene_variant ||||||| 96 | 1 ||||||; ERB = A || proximal_1216 | Regulatory_Feature | proximal_enhancer; FUNSEQ = 0.3335「,」AC = 1690; AF = 0。33746; AN = 5008; NS = 2504; DP = 20247; EAS_AF = 0.1498; AMR_AF = 0.3012; AFR_AF = 0.5681; EUR_AF = 0.3549; SAS_AF = 0.227; AA = G |||; CSQ = G | ENSG00000162458 | ENST00000441801 |解說詞| 5_prime_UTR_variant | 81 | |||||| 1 | ||||||; ERB = G || proximal_1216 | Regulatory_Feature | proximal_enhancer; FUNSEQ = 0.3335「,NA, 」AC = 8; AF = 0.00159744; AN = 5008 ; NS = 2504; DP = 19197; EAS_AF = 0.0079; AMR_AF = 0; AFR_AF = 0; EUR_AF = 0; SAS_AF = 0; AA = C |||; CSQ = T | ENSG00000162458 | ENST00000441801 |文稿| ||| intron_variant ||||| 1 = |||||| GENCODE = ENST00000441801; ERB = T || proximal_1216 | Regulator_Feature | proximal_enhancer; FUNSEQ = 0.3335「,」AC = 3282; AF = 0.655351; AN = 5008; NS = 2504; DP = 14721; EAS_AF = 0.8343; AMR_AF = 0.6916; AFR_AF = 0.4259; EUR_AF = 0.6531; SAS_AF = 0.7577; AA = A |||; CSQ = A | ENSG00000162458 | ENST00000441801 |文稿| intron_variant |||||||| 1 ||||||; GENCODE = ENST00000441801; FUNSEQ = 0.1483「, 」AC = 5; AF = 0.000998403; AN = 5008; NS = 2504; DP = 14736; EAS_AF = 0.003; AMR_AF = 0; AFR_AF = 0 ; EUR_AF = 0; SAS_AF = 0.002; AA = C |||; CSQ = T | ENSG00000162458 | ENST00000441 801 | Transcript | intron_variant |||||||| 1 ||||||; GENCODE = ENST00000441801; FUNSEQ = 0.1483「 )),row.names = 14:19,class =」data.frame「,。名= C( 「CHROM」, 「POS」, 「ID」, 「REF」, 「ALT」, 「QUAL」, 「FILTER」, 「INFO」))

+1

你的應用功能接近'申請(SAMP,1,函數(X)extractAF(「AFR_AF 「,x [5]))' –

+0

請正確格式化您的代碼 –

回答

3

您可能不需要這些公式因爲sub是向量化的。首先創建所有可能的代碼的變量,如(AFR,AMR,EUR等...)。使用向量來創建搜索模式,要經過Info柱,並與所有的比賽返回一個新的數據幀:

all_pop <- c("AMR_AF", "AFR_AF", "EUR_AF", "SAS_AF", "EAS_AF") 
pat <- paste0(".*\\b", all_pop, "=(\\d+(\\.\\d+)?)\\b.*") 

out <- sapply(pat, sub, "\\1", samp$Info) 
newdf <- setNames(as.data.frame(out), all_pop) 
#  AMR_AF AFR_AF EUR_AF SAS_AF EAS_AF 
# 1 0.8357 0.5779 0.7366 0.8466 0.9921 
# 2 0.8444 0.6725 0.7366 0.8538 0.9921 
# 3 0.8415 0.6558 0.7376 0.8466 0.9921 
# 4 0.8386 0.6339 0.7376 0.8466 0.9921 
# 5 0.8487 0.6528 0.7714 0.8599 0.9921 
# 6 0.8458 0.6362 0.7714 0.8599 0.9911 
# 7 <NA> <NA> <NA> <NA> <NA> 
# 8 <NA> <NA> <NA> <NA> <NA> 
# 9 0.7954 0.5651 0.7167 0.82 0.9653 
# 10 <NA> <NA> <NA> <NA> <NA> 
# 11 0.8458 0.6362 0.7724 0.8671 0.9921 
# 12 0.8473 0.6528 0.7724 0.8671 0.9921 
# 13 0.8473 0.6346 0.7724 0.8671 0.9921 
+1

非常好的解決方案!任何不使用「cbind」將其與原始數據框結合的原因? – ONeillMB1

+0

是的,你可以結合它,但你找到最好的 –

+0

在將這個應用到我的大型數據集,當AF = 0時,我遇到了一個問題。我對regexp的知識很差,但我認爲這與「 d +「? – ONeillMB1