2017-08-28 36 views
1

主題:group_by 彙總(summarise)與摺疊。我有以下數據框:

 # Example input for the question (the literal looks like dput() output).
 # 15 rows, one per gene, with columns:
 #   Genecoverage - numeric coverage value for the gene
 #   Gene         - gene id (character), e.g. "k141_32902_11"
 #   sample.x     - sample id (character); every row here is "10"
 #   Phylum, Class, Order, Family, Genus, Species - taxonomy, stored as factors
 #   newgene      - character key; in this data it equals Gene without the
 #                  trailing "_<number>" (12 rows "k141_32902", 3 rows "k141_3238")
 # Row names are character strings ("42481", ...), not a 1..n sequence.
 df <- structure(list(Genecoverage = c(19.8511111111111, 10.1516966067864, 
14.5631205673759, 7.25225225225225, 10.774011299435, 11.4794520547945, 
17.7967032967033, 12.6770616770617, 14.1375, 13.2422422422422, 
14.0379403794038, 11.4844006568145, 21.296875, 18.90625, 24.3293253173013 
), Gene = c("k141_32902_11", "k141_32902_16", "k141_32902_22", 
"k141_32902_23", "k141_32902_27", "k141_32902_28", "k141_32902_29", 
"k141_32902_3", "k141_32902_30", "k141_32902_37", "k141_32902_38", 
"k141_32902_5", "k141_3238_18", "k141_3238_3", "k141_3238_6"), 
    sample.x = c("10", "10", "10", "10", "10", "10", "10", "10", 
    "10", "10", "10", "10", "10", "10", "10"), Phylum = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "Firmicutes", class = "factor"), 
    Class = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L, 2L, 1L, 1L, 1L), .Label = c("Bacilli", "Tissierellia" 
    ), class = "factor"), Order = structure(c(2L, 2L, 2L, 2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L), .Label = c("Bacillales", 
    "Tissierellales"), class = "factor"), Family = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("Peptoniphilaceae", 
    "Staphylococcaceae"), class = "factor"), Genus = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("Peptoniphilus", 
    "Staphylococcus"), class = "factor"), Species = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "unknown", class = "factor"), 
    newgene = c("k141_32902", "k141_32902", "k141_32902", "k141_32902", 
    "k141_32902", "k141_32902", "k141_32902", "k141_32902", "k141_32902", 
    "k141_32902", "k141_32902", "k141_32902", "k141_3238", "k141_3238", 
    "k141_3238")), .Names = c("Genecoverage", "Gene", "sample.x", 
"Phylum", "Class", "Order", "Family", "Genus", "Species", "newgene" 
), row.names = c("42481", "42486", "42493", "42494", "42498", 
"42499", "42500", "42501", "42502", "42509", "42510", "42512", 
"41540", "41546", "41552"), class = "data.frame") 

這將返回以下數據框

 Genecoverage   Gene sample.x  Phylum  Class   Order   Family   Genus Species newgene 
42481 19.851111 k141_32902_11  10 Firmicutes Tissierellia Tissierellales Peptoniphilaceae Peptoniphilus unknown k141_32902 
42486 10.151697 k141_32902_16  10 Firmicutes Tissierellia Tissierellales Peptoniphilaceae Peptoniphilus unknown k141_32902 
42493 14.563121 k141_32902_22  10 Firmicutes Tissierellia Tissierellales Peptoniphilaceae Peptoniphilus unknown k141_32902 
42494  7.252252 k141_32902_23  10 Firmicutes Tissierellia Tissierellales Peptoniphilaceae Peptoniphilus unknown k141_32902 
42498 10.774011 k141_32902_27  10 Firmicutes Tissierellia Tissierellales Peptoniphilaceae Peptoniphilus unknown k141_32902 
42499 11.479452 k141_32902_28  10 Firmicutes Tissierellia Tissierellales Peptoniphilaceae Peptoniphilus unknown k141_32902 
42500 17.796703 k141_32902_29  10 Firmicutes Tissierellia Tissierellales Peptoniphilaceae Peptoniphilus unknown k141_32902 
42501 12.677062 k141_32902_3  10 Firmicutes Tissierellia Tissierellales Peptoniphilaceae Peptoniphilus unknown k141_32902 
42502 14.137500 k141_32902_30  10 Firmicutes Tissierellia Tissierellales Peptoniphilaceae Peptoniphilus unknown k141_32902 
42509 13.242242 k141_32902_37  10 Firmicutes Tissierellia Tissierellales Peptoniphilaceae Peptoniphilus unknown k141_32902 
42510 14.037940 k141_32902_38  10 Firmicutes Tissierellia Tissierellales Peptoniphilaceae Peptoniphilus unknown k141_32902 
42512 11.484401 k141_32902_5  10 Firmicutes Tissierellia Tissierellales Peptoniphilaceae Peptoniphilus unknown k141_32902 
41540 21.296875 k141_3238_18  10 Firmicutes  Bacilli  Bacillales Staphylococcaceae Staphylococcus unknown k141_3238 
41546 18.906250 k141_3238_3  10 Firmicutes  Bacilli  Bacillales Staphylococcaceae Staphylococcus unknown k141_3238 
41552 24.329325 k141_3238_6  10 Firmicutes  Bacilli  Bacillales Staphylococcaceae Staphylococcus unknown k141_3238 

我想按分類層級和 newgene 分組,計算基因覆蓋度(Genecoverage)的中位數,並新建一列,把每個 newgene 下被合併的基因名稱聚合在一起。

# Per-group median of every numeric column. Groups are the six taxonomy
# ranks plus `newgene`. Non-numeric columns that are not grouping keys
# (Gene, sample.x) are dropped by summarise_if(), so gene names are lost.
# The final step converts the summarised result to a plain data.frame.
newdata <- df %>%
  group_by(Phylum, Class, Order, Family, Genus, Species, newgene) %>%
  summarise_if(is.numeric, median) %>%
  data.frame()

這將返回以下

 Phylum  Class   Order   Family   Genus Species newgene Genecoverage 
1 Firmicutes  Bacilli  Bacillales Staphylococcaceae Staphylococcus unknown k141_3238  21.29688 
2 Firmicutes Tissierellia Tissierellales Peptoniphilaceae Peptoniphilus unknown k141_32902  12.95965 

我這裏的問題是,我丟失了 Gene 列中的信息。理想情況下,我想新建一列,把每組中 Gene 列的所有名稱摺疊(拼接)在一起。例如,在上面的輸出數據框中,newgene k141_32902 對應於被合併的基因 k141_32902_11,k141_32902_16,k141_32902_22 ...

這是最終的數據幀應該是什麼樣子:

Phylum  Class   Order   Family   Genus Species newgene 
Firmicutes  Bacilli  Bacillales Staphylococcaceae Staphylococcus unknown k141_3238 
Firmicutes Tissierellia Tissierellales Peptoniphilaceae Peptoniphilus unknown k141_32902 
    Genecoverage Concatenated_genes 
     21.3 k141_3238_18,k141_3238_3,k141_3238_6 
     13.0 k141_32902_11,k141_32902_16,k141_32902_22 
+1

好的將從現在開始。 – Scath

+0

你應該可以在 summarise 中添加如下內容:summarise(Concatenated_genes = paste(Gene, collapse = ",")) – user108363

+0

不知道該怎麼做? – david

回答

0

我們可以將 summarise_all 與用戶自定義函數一起使用。如果列是數字,此函數將計算中位數,否則使用 toString 連接所有字符串信息。

library(dplyr) 

# Collapse each taxonomy/newgene group to one row:
#   * numeric columns (Genecoverage) -> median of the group
#   * other non-key columns (Gene)   -> all values joined by toString(),
#     i.e. separated by ", "
# sample.x is dropped first so it is not concatenated as well.
#
# A plain function is passed to summarise_all() instead of
# funs(ifelse(...)): funs() is deprecated in dplyr, and the column-type
# check is a single TRUE/FALSE, which calls for if/else, not vectorised
# ifelse().
#
# rename() is namespace-qualified because other packages (e.g. plyr) export
# a rename() with a different signature; if one of them masks dplyr's, the
# unqualified call fails with "unused argument (Concatenated_genes = Gene)".
newdata <- df %>%
  group_by(Phylum, Class, Order, Family, Genus, Species, newgene) %>%
  select(-sample.x) %>%
  summarise_all(function(x) {
    if (is.numeric(x)) median(x) else toString(x)
  }) %>%
  dplyr::rename(Concatenated_genes = Gene)
+0

出現錯誤:Error in rename(., Concatenated_genes = Gene) : unused argument (Concatenated_genes = Gene) – david

+0

好的,改成 dplyr::rename 之後就可以運行了。太好了! – david