我花了很多時間四處觀察,無法找到解決具體問題的解決方案。我真的很感激任何幫助。解析具有兩列的文件,詳細描述行的父 - 子關係
我有一個data.frame有5列,其中每行詳細說明一個特定的分類學級別(即superkingdom Bacteria)。這些級別中的每一級都通過父子關係與另一行(和級別)相關。 taxon_id是一個唯一值(即superkingdom Bacteria的taxon_id爲2),並通過parent_id變量連接到其他行(即Proteobacteria門被嵌套在superkingdom細菌內)。作爲一個快速回顧,taxon_id是唯一的值(和父值),parent_id是每個孩子如何關聯回父。還有一個丰度變量,將所有子行的所有丰度相加,使其成爲累計和)。
這裏是什麼樣的數據幀看起來像一個例子:
abundance <- c(0.8157, 0.8153, 0.4947, 0.4807, 0.3444, 0.3444, 0.3444, 0.3444, 0.3444, 0.2104, 0.1466, 0.1401, 0.1299, 0.1299, 0.123, 0.1112, 0.1034, 0.1034, 0.1034, 0.1034, 0.07469, 0.07469, 0.06833, 0.06833, 0.06384, 0.06384, 0.06314, 0.06314, 0.06298, 0.06271, 0.0625, 0.0625, 0.0625, 0.06184, 0.06184, 0.05218, 0.05218, 0.05218, 0.04547, 0.04547, 0.04547, 0.04532, 0.03121, 0.02122, 0.01438, 0.01438, 0.01438, 0.01438, 0.01438, 0.01411, 0.01329, 0.01329, 0.01329, 0.01329, 0.0121, 0.0121, 0.0121, 0.01116, 0.01071, 0.008966, 0.008088, 0.008067, 0.007638, 0.007638, 0.00761, 0.00761, 0.00761, 0.00761, 0.00761, 0.007381, 0.006763, 0.00676, 0.00676, 0.00676, 0.006413, 0.004669, 0.004669, 0.004669, 0.004669, 0.004669, 0.004637, 0.004637, 0.004637, 0.004637, 0.004637, 0.003418, 0.003346, 0.002479, 0.002479, 0.002479, 0.002479, 0.001818, 0.001818, 0.001818, 0.001818)
taxon_id <- c(131567, 2, 1224, 1236, 135619, 224372, 1177179, 519051, 59753, 1239, 91061, 186826, 1301, 1300, 1313, 869308, 2037, 85003, 1760, 201174, 543, 91347, 570, 573, 186802, 186801, 31979, 1485, 1491, 1221327, 85006, 85023, 33882, 1160710, 36807, 72407, 1185419, 1460422, 662, 641, 135623, 672, 1219061, 85007, 135622, 53246, 267888, 1307437, 176102, 1229202, 85025, 1817, 1206725, 37327, 1743, 31957, 85009, 1747, 1159092, 28211, 1284824, 1236101, 1716, 1653, 1883, 2062, 85011, 1160718, 114687, 81852, 38284, 1350, 1352, 525279, 1311, 198251, 54526, 82117, 198252, 314261, 90371, 59201, 28901, 590, 1218143, 525260, 862512, 204441, 41295, 191, 95605, 356, 41294, 1073, 95607)
parent_id <- c(1, 131567, 2, 1224, 1236, 135619, 519051, 59753, 224372, 2, 1239, 91061, 1300, 186826, 1301, 1313, 85003, 1760, 201174, 2, 91347, 1236, 543, 570, 186801, 1239, 186802, 31979, 1485, 1491, 2037, 85006, 85023, 36807, 33882, 573, 1460422, 72407, 641, 135623, 1236, 662, 672, 2037, 1236, 267888, 135622, 176102, 53246, 672, 85007, 85025, 37327, 1817, 31957, 85009, 2037, 1743, 1313, 1224, 573, 573, 1653, 85007, 2062, 85011, 2037, 114687, 1883, 186826, 1716, 81852, 1350, 1352, 1301, 54526, 82117, 28211, 198251, 198252, 59201, 28901, 590, 543, 90371, 38284, 38284, 28211, 204441, 41295, 191, 28211, 356, 41294, 1073)
rank <- c("no_rank", "superkingdom", "phylum", "class", "order", "family", "strain", "species", "genus", "phylum", "class", "order", "genus", "family", "species", "strain", "order", "subclass", "class", "phylum", "family", "order", "genus", "species", "order", "class", "family", "genus", "species", "strain", "suborder", "family", "genus", "strain", "species", "subspecies", "no rank", "strain", "genus", "family", "order", "species", "strain", "suborder", "order", "genus", "family", "strain", "species", "strain", "family", "genus", "strain", "species", "genus", "family", "suborder", "species", "strain", "class", "strain", "strain", "genus", "family", "genus", "family", "suborder", "strain", "species", "family", "species", "genus", "species", "strain", "species", "genus", "no rank", "no rank", "species", "strain", "strain", "subspecies", "species", "genus", "no rank", "strain", "strain", "order", "family", "genus", "species", "order", "family", "genus", "species")
taxonomy_name <- c("cellular_organisms", "Bacteria", "Proteobacteria", "Gammaproteobacteria", "Oceanospirillales", "Alcanivoracaceae", "Alcanivorax_hongdengensis_A-11-3", "Alcanivorax_hongdengensis", "Alcanivorax", "Firmicutes", "Bacilli", "Lactobacillales", "Streptococcus", "Streptococcaceae", "Streptococcus_pneumoniae", "Streptococcus_pneumoniae_SPN021198", "Actinomycetales", "Actinobacteridae", "Actinobacteria", "Actinobacteria", "Enterobacteriaceae", "Enterobacteriales", "Klebsiella", "Klebsiella_pneumoniae", "Clostridiales", "Clostridia", "Clostridiaceae", "Clostridium", "Clostridium_botulinum", "Clostridium_botulinum_CDC66177", "Micrococcineae", "Microbacteriaceae", "Microbacterium", "Microbacterium_laevaniformans_OR221", "Microbacterium_laevaniformans", "Klebsiella_pneumoniae_subsp._pneumoniae", "Klebsiella_pneumoniae_subsp._pneumoniae_ST258-K26BO", "Klebsiella_pneumoniae_subsp._pneumoniae_ST258", "Vibrio", "Vibrionaceae", "Vibrionales", "Vibrio_vulnificus", "Vibrio_vulnificus_NBRC_15645_=_ATCC_27562", "Corynebacterineae", "Alteromonadales", "Pseudoalteromonas", "Pseudoalteromonadaceae", "Pseudoalteromonas_agarivorans_S816", "Pseudoalteromonas_agarivorans", "Vibrio_vulnificus_B2", "Nocardiaceae", "Nocardia", "Nocardia_brevicatena_NBRC_12119", "Nocardia_brevicatena", "Propionibacterium", "Propionibacteriaceae", "Propionibacterineae", "Propionibacterium_acnes", "Streptococcus_pneumoniae_PNI0010", "Alphaproteobacteria", "Klebsiella_pneumoniae_VAKPC252", "Klebsiella_pneumoniae_JHCK1", "Corynebacterium", "Corynebacteriaceae", "Streptomyces", "Streptomycetaceae", "Streptomycineae", "Streptomyces_auratus_AGR0001", "Streptomyces_auratus", "Enterococcaceae", "Corynebacterium_accolens", "Enterococcus", "Enterococcus_faecium", "Enterococcus_faecium_TX1330", "Streptococcus_agalactiae", "Candidatus_Pelagibacter", "SAR11_cluster", "unclassified_Alphaproteobacteria", "Candidatus_Pelagibacter_ubique", "Candidatus_Pelagibacter_ubique_HTCC1002", "Salmonella_enterica_subsp._enterica_serovar_Typhimurium", "Salmonella_enterica_subsp._enterica", "Salmonella_enterica", "Salmonella", "Salmonella_enterica_subsp._enterica_serovar_Typhimurium_str._STm1", "Corynebacterium_accolens_ATCC_49725", "Corynebacterium_accolens_ATCC_49726", "Rhodospirillales", "Rhodospirillaceae", "Azospirillum", "Azospirillum_sp._B4", "Rhizobiales", "Bradyrhizobiaceae", "Rhodopseudomonas", "Rhodopseudomonas_sp._B29")
mydata <- data.frame(abundance, taxon_id, parent_id, rank, taxonomy_name)
我想這data.frame解析成不同的格式,更可分析。這將是這個樣子:
Taxa <- c("cellular_organisms(no_rank)", "cellular_organisms(no_rank)_Bacteria(superkingdom)", "cellular_organisms(no_rank)_Bacteria(superkingdom)_Proteobacteria(phylum)", "cellular_organisms(no_rank)_Bacteria(superkingdom)_Proteobacteria(phylum)_Gammaproteobacteria(class)", "cellular_organisms(no_rank)_Bacteria(superkingdom)_Proteobacteria(phylum)_Gammaproteobacteria(class)_Oceanospirillales(order)", "cellular_organisms(no_rank)_Bacteria(superkingdom)_Proteobacteria(phylum)_Gammaproteobacteria(class)_Oceanospirillales(order)_Alcanivoracaceae(family)", "cellular_organisms(no_rank)_Bacteria(superkingdom)_Proteobacteria(phylum)_Gammaproteobacteria(class)_Oceanospirillales(order)_Alcanivoracaceae(family)_Alcanivorax(genus)", "cellular_organisms(no_rank)_Bacteria(superkingdom)_Proteobacteria(phylum)_Gammaproteobacteria(class)_Oceanospirillales(order)_Alcanivoracaceae(family)_Alcanivorax(genus)_Alcanivorax_hongdengensis(species)", "cellular_organisms(no_rank)_Bacteria(superkingdom)_Proteobacteria(phylum)_Gammaproteobacteria(class)_Oceanospirillales(order)_Alcanivoracaceae(family)_Alcanivorax(genus)_Alcanivorax_hongdengensis(species)_Alcanivorax_hongdengensis_A-11-3(strain)")
Proportion <- c(0, 0.0103, 0.01396, 0.00176, 0, 0, 0, 0, 0.3444)
OutcomeData <- data.frame(Taxa, Proportion)
正如你可以從原始數據看,豐細菌是0.8153,但減去所有的子行的豐後,我們得到的0.0103剩餘部分。我想爲所有完整的父子關係生成這樣的表格。關於這個示例表的另一個簡要說明,爲了簡單起見,我排除了大量的子樣本,因此這裏的數學計算不會添加回到之前的父樣本,數據會嚴重分支,但我想要一個更清晰的示例。
是否該比例應該是節點的丰度減去所有孩子的多餘?所以一個孩子的丰度會從父母和祖父母的... – cr1msonB1ade
@ cr1msonB1ade是的。孩子的豐富程度將從父母和祖父母等中減去。一旦所有的孩子丰度都被減去,那麼使用餘數作爲丰度。 – EpiBlake