2015-06-17 23 views
1

我花了很多時間四處觀察,無法找到解決具體問題的解決方案。我真的很感激任何幫助。解析具有兩列的文件,詳細描述行的父 - 子關係

我有一個data.frame有5列,其中每行詳細說明一個特定的分類學級別(即superkingdom Bacteria)。這些級別中的每一級都通過父子關係與另一行(和級別)相關。 taxon_id是一個唯一值(即superkingdom Bacteria的taxon_id爲2),並通過parent_id變量連接到其他行(即Proteobacteria門被嵌套在superkingdom細菌內)。作爲一個快速回顧,taxon_id是唯一的值(和父值),parent_id是每個孩子如何關聯回父。還有一個丰度變量,將所有子行的所有丰度相加,使其成爲累計和)。

這裏是什麼樣的數據幀看起來像一個例子:

abundance <- c(0.8157, 0.8153, 0.4947, 0.4807, 0.3444, 0.3444, 0.3444, 0.3444, 0.3444, 0.2104, 0.1466, 0.1401, 0.1299, 0.1299, 0.123, 0.1112, 0.1034, 0.1034, 0.1034, 0.1034, 0.07469, 0.07469, 0.06833, 0.06833, 0.06384, 0.06384, 0.06314, 0.06314, 0.06298, 0.06271, 0.0625, 0.0625, 0.0625, 0.06184, 0.06184, 0.05218, 0.05218, 0.05218, 0.04547, 0.04547, 0.04547, 0.04532, 0.03121, 0.02122, 0.01438, 0.01438, 0.01438, 0.01438, 0.01438, 0.01411, 0.01329, 0.01329, 0.01329, 0.01329, 0.0121, 0.0121, 0.0121, 0.01116, 0.01071, 0.008966, 0.008088, 0.008067, 0.007638, 0.007638, 0.00761, 0.00761, 0.00761, 0.00761, 0.00761, 0.007381, 0.006763, 0.00676, 0.00676, 0.00676, 0.006413, 0.004669, 0.004669, 0.004669, 0.004669, 0.004669, 0.004637, 0.004637, 0.004637, 0.004637, 0.004637, 0.003418, 0.003346, 0.002479, 0.002479, 0.002479, 0.002479, 0.001818, 0.001818, 0.001818, 0.001818) 
taxon_id <- c(131567, 2, 1224, 1236, 135619, 224372, 1177179, 519051, 59753, 1239, 91061, 186826, 1301, 1300, 1313, 869308, 2037, 85003, 1760, 201174, 543, 91347, 570, 573, 186802, 186801, 31979, 1485, 1491, 1221327, 85006, 85023, 33882, 1160710, 36807, 72407, 1185419, 1460422, 662, 641, 135623, 672, 1219061, 85007, 135622, 53246, 267888, 1307437, 176102, 1229202, 85025, 1817, 1206725, 37327, 1743, 31957, 85009, 1747, 1159092, 28211, 1284824, 1236101, 1716, 1653, 1883, 2062, 85011, 1160718, 114687, 81852, 38284, 1350, 1352, 525279, 1311, 198251, 54526, 82117, 198252, 314261, 90371, 59201, 28901, 590, 1218143, 525260, 862512, 204441, 41295, 191, 95605, 356, 41294, 1073, 95607) 
parent_id <- c(1, 131567, 2, 1224, 1236, 135619, 519051, 59753, 224372, 2, 1239, 91061, 1300, 186826, 1301, 1313, 85003, 1760, 201174, 2, 91347, 1236, 543, 570, 186801, 1239, 186802, 31979, 1485, 1491, 2037, 85006, 85023, 36807, 33882, 573, 1460422, 72407, 641, 135623, 1236, 662, 672, 2037, 1236, 267888, 135622, 176102, 53246, 672, 85007, 85025, 37327, 1817, 31957, 85009, 2037, 1743, 1313, 1224, 573, 573, 1653, 85007, 2062, 85011, 2037, 114687, 1883, 186826, 1716, 81852, 1350, 1352, 1301, 54526, 82117, 28211, 198251, 198252, 59201, 28901, 590, 543, 90371, 38284, 38284, 28211, 204441, 41295, 191, 28211, 356, 41294, 1073) 
rank <- c("no_rank", "superkingdom", "phylum", "class", "order", "family", "strain", "species", "genus", "phylum", "class", "order", "genus", "family", "species", "strain", "order", "subclass", "class", "phylum", "family", "order", "genus", "species", "order", "class", "family", "genus", "species", "strain", "suborder", "family", "genus", "strain", "species", "subspecies", "no rank", "strain", "genus", "family", "order", "species", "strain", "suborder", "order", "genus", "family", "strain", "species", "strain", "family", "genus", "strain", "species", "genus", "family", "suborder", "species", "strain", "class", "strain", "strain", "genus", "family", "genus", "family", "suborder", "strain", "species", "family", "species", "genus", "species", "strain", "species", "genus", "no rank", "no rank", "species", "strain", "strain", "subspecies", "species", "genus", "no rank", "strain", "strain", "order", "family", "genus", "species", "order", "family", "genus", "species") 
taxonomy_name <- c("cellular_organisms", "Bacteria", "Proteobacteria", "Gammaproteobacteria", "Oceanospirillales", "Alcanivoracaceae", "Alcanivorax_hongdengensis_A-11-3", "Alcanivorax_hongdengensis", "Alcanivorax", "Firmicutes", "Bacilli", "Lactobacillales", "Streptococcus", "Streptococcaceae", "Streptococcus_pneumoniae", "Streptococcus_pneumoniae_SPN021198", "Actinomycetales", "Actinobacteridae", "Actinobacteria", "Actinobacteria", "Enterobacteriaceae", "Enterobacteriales", "Klebsiella", "Klebsiella_pneumoniae", "Clostridiales", "Clostridia", "Clostridiaceae", "Clostridium", "Clostridium_botulinum", "Clostridium_botulinum_CDC66177", "Micrococcineae", "Microbacteriaceae", "Microbacterium", "Microbacterium_laevaniformans_OR221", "Microbacterium_laevaniformans", "Klebsiella_pneumoniae_subsp._pneumoniae", "Klebsiella_pneumoniae_subsp._pneumoniae_ST258-K26BO", "Klebsiella_pneumoniae_subsp._pneumoniae_ST258", "Vibrio", "Vibrionaceae", "Vibrionales", "Vibrio_vulnificus", "Vibrio_vulnificus_NBRC_15645_=_ATCC_27562", "Corynebacterineae", "Alteromonadales", "Pseudoalteromonas", "Pseudoalteromonadaceae", "Pseudoalteromonas_agarivorans_S816", "Pseudoalteromonas_agarivorans", "Vibrio_vulnificus_B2", "Nocardiaceae", "Nocardia", "Nocardia_brevicatena_NBRC_12119", "Nocardia_brevicatena", "Propionibacterium", "Propionibacteriaceae", "Propionibacterineae", "Propionibacterium_acnes", "Streptococcus_pneumoniae_PNI0010", "Alphaproteobacteria", "Klebsiella_pneumoniae_VAKPC252", "Klebsiella_pneumoniae_JHCK1", "Corynebacterium", "Corynebacteriaceae", "Streptomyces", "Streptomycetaceae", "Streptomycineae", "Streptomyces_auratus_AGR0001", "Streptomyces_auratus", "Enterococcaceae", "Corynebacterium_accolens", "Enterococcus", "Enterococcus_faecium", "Enterococcus_faecium_TX1330", "Streptococcus_agalactiae", "Candidatus_Pelagibacter", "SAR11_cluster", "unclassified_Alphaproteobacteria", "Candidatus_Pelagibacter_ubique", "Candidatus_Pelagibacter_ubique_HTCC1002", "Salmonella_enterica_subsp._enterica_serovar_Typhimurium", "Salmonella_enterica_subsp._enterica", "Salmonella_enterica", "Salmonella", "Salmonella_enterica_subsp._enterica_serovar_Typhimurium_str._STm1", "Corynebacterium_accolens_ATCC_49725", "Corynebacterium_accolens_ATCC_49726", "Rhodospirillales", "Rhodospirillaceae", "Azospirillum", "Azospirillum_sp._B4", "Rhizobiales", "Bradyrhizobiaceae", "Rhodopseudomonas", "Rhodopseudomonas_sp._B29")  
mydata <- data.frame(abundance, taxon_id, parent_id, rank, taxonomy_name) 

我想這data.frame解析成不同的格式,更可分析。這將是這個樣子:

Taxa <- c("cellular_organisms(no_rank)", "cellular_organisms(no_rank)_Bacteria(superkingdom)", "cellular_organisms(no_rank)_Bacteria(superkingdom)_Proteobacteria(phylum)", "cellular_organisms(no_rank)_Bacteria(superkingdom)_Proteobacteria(phylum)_Gammaproteobacteria(class)", "cellular_organisms(no_rank)_Bacteria(superkingdom)_Proteobacteria(phylum)_Gammaproteobacteria(class)_Oceanospirillales(order)", "cellular_organisms(no_rank)_Bacteria(superkingdom)_Proteobacteria(phylum)_Gammaproteobacteria(class)_Oceanospirillales(order)_Alcanivoracaceae(family)", "cellular_organisms(no_rank)_Bacteria(superkingdom)_Proteobacteria(phylum)_Gammaproteobacteria(class)_Oceanospirillales(order)_Alcanivoracaceae(family)_Alcanivorax(genus)", "cellular_organisms(no_rank)_Bacteria(superkingdom)_Proteobacteria(phylum)_Gammaproteobacteria(class)_Oceanospirillales(order)_Alcanivoracaceae(family)_Alcanivorax(genus)_Alcanivorax_hongdengensis(species)", "cellular_organisms(no_rank)_Bacteria(superkingdom)_Proteobacteria(phylum)_Gammaproteobacteria(class)_Oceanospirillales(order)_Alcanivoracaceae(family)_Alcanivorax(genus)_Alcanivorax_hongdengensis(species)_Alcanivorax_hongdengensis_A-11-3(strain)") 
Proportion <- c(0, 0.0103, 0.01396, 0.00176, 0, 0, 0, 0, 0.3444) 
OutcomeData <- data.frame(Taxa, Proportion) 

正如你可以從原始數據看,豐細菌是0.8153,但減去所有的子行的豐後,我們得到的0.0103剩餘部分。我想爲所有完整的父子關係生成這樣的表格。關於這個示例表的另一個簡要說明,爲了簡單起見,我排除了大量的子樣本,因此這裏的數學計算不會添加回到之前的父樣本,數據會嚴重分支,但我想要一個更清晰的示例。

+0

是否該比例應該是節點的丰度減去所有孩子的多餘?所以一個孩子的丰度會從父母和祖父母的... – cr1msonB1ade

+0

@ cr1msonB1ade是的。孩子的豐富程度將從父母和祖父母等中減去。一旦所有的孩子丰度都被減去,那麼使用餘數作爲丰度。 – EpiBlake

回答

1

下面是我製作的一個快速功能 - 它不是寫得很好,但它看起來像是有效的。在相信它之前,你應該先測試一下:

mydata$rank<-as.character(mydata$rank) 
deparsefile<-function(df){ 
    df$newlabel<-paste(df$rank,df$taxonomy_name) 
    df$remainder<-df$abundance 
    parentstodo=c(131567) 
    while(length(parentstodo)>=1){ 
    parentstodo1<-df$taxon_id[df$parent_id %in% parentstodo] 
    for(i in parentstodo){ 
     if(sum(df$parent_id==i)==0){ 
     next() 
     } 
     df$newlabel[df$parent_id==i]<-paste(df$newlabel[df$parent_id==i],df$newlabel[df$taxon_id==i]) 
     df$remainder[df$taxon_id==i]<-df$abundance[df$taxon_id==i]-sum(df$abundance[df$parent_id==i]) 
    } 
    parentstodo<-parentstodo1 
    } 
    df 
} 

deparsefile(mydata) 
+0

謝謝你的迴應。當我在完整的數據集上嘗試這個時,它運行時沒有錯誤,但餘數列給出了奇怪的數字。我得到的是負數,我得到的數字應該是零(例如應變水平比例應該是0.3444)。 – EpiBlake

+0

我剛剛手動檢查了幾條測試線 - 它們看起來很合理。那麼對於那些底層的分類羣(即沒有後代),你希望剩下的總數等於總數? (我剛剛改變了這一點)我認爲負數主要是由於四捨五入。 – jeremycg

+0

太棒了!我也經歷過檢查數字和由於四捨五入的負面因素對我有意義。對於父 - 子樹底部的分類羣(taxon_id在parent_id列中沒有出現),餘數應該等於丰度。 – EpiBlake