2014-02-26 74 views
0

聚類定性數據

我有一個 data.frame(df),看起來像這樣:

   ZN.N   ZL.N 
MMP2 (1.89,3.58] (2.13,4.1] 
AEBP1 (1.89,3.58] (2.13,4.1] 
A1AG1 (1.89,3.58] (2.13,4.1] 
A1AT [0.364,1.89] [0.275,2.13] 
A2MG [0.364,1.89] [0.275,2.13] 
ENOA (1.89,3.58] (2.13,4.1] 

我想根據兩個變量(ZN.N 和 ZL.N)對 row.names(蛋白質)進行聚類。對於這種類型的數據,我可以使用 k-means 方法,還是應該使用層次聚類?

我已經試過

# Asker's attempt to split df into 2 clusters; reported in the question as not working.
# NOTE(review): the answer further down calls kmeans(); `k.means` is presumably a typo - confirm.
df.k2 <- k.means(df, 2) 

但它不起作用。我對聚類完全是新手,如果這個問題很傻,先說聲抱歉。非常感謝。

這裏是我的data.frame的dput

# dput() dump of the data frame described in the question: two factor columns
# (ZN.N with 2 interval-labelled levels, ZL.N with 3) and the protein names as
# row.names. The expression is not assigned to a name here.
structure(list(ZN.N = structure(c(2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L,1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L), .Label = c("[0.364,1.89]", "(1.89,3.58]"), class = "factor"), 
ZL.N = structure(c(2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 
2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 1L, 
2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 
1L, 1L, 2L, 2L, 1L, 2L, 2L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 1L, 
3L, 3L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 3L, 1L, 1L, 1L, 1L, 2L, 2L, 
2L, 1L, 2L, 2L, 1L, 3L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 1L, 1L, 
1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 3L, 2L, 2L, 3L, 1L, 3L, 2L, 
1L, 1L, 2L, 3L, 1L), .Label = c("[0.275,2.13]", "(2.13,4.1]", 
"(4.1,6.78]"), class = "factor")), .Names = c("ZN.N", "ZL.N"), class = "data.frame", row.names = c("MMP2", "AEBP1", "A1AG1", "A1AT", "A2MG", "ENOA", "ANGI", "ANGL2", "ANT3", "APOA1", "APOA2", "APOD", "PGBM", "PGS1", "CAH3", "CRAC1", "CILP1", "CILP2", "COMP", "CH3L1", "CH3L2", "CSPG4", "CCD80", "CO1A1", "CO2A1", "CO3A1", "CO6A1", "COCA1", "COFA1", "COIA1", "CO1A2", "CO6A2", "COBA2", "CO6A3", "C1QB", "C1R", "C1S", "CO3", "CO4B", "CO8A", "CFAB", "CFAH", "CRP", "KCRM", "CLC3A", "ECM1", "FIBA", "FIBB", "FIBG", "FGFP2", "FMOD", "FINC", "FBLN1", "FSTL1", "G3P", "HPT", "HBA", "HBB", "H2B1L", "H32", "H4", "HPLN1", "IGHA1", "IGHG1", "IGKC", "LAC6", "IGHM", "INHBA", "IBP3", "ITIH1", "MMP1", "LDHA", "LYSC", "TIMP1", "TIMP2", "MIME", "MOES", "MYG", "NID2", "NUCB1", "OSTP", "PPIA", "PPIB", "POSTN", "PRDX2", "PGAM1", "PA2GA", "PLTP", "PEDF", "IPSP", "LMNA", "PCOC1", "PRELP", "AMBP", "PDIA3", "PDIA6", "S10AA", "S10A8", "PRG4", "KPYM", "RNAS1", "HTRA1", "TRFE", "ALBU", "SAMP", "SMOC2", "MMP3", "TARSH", "TENA", "TENX", "TETN", "TSP3", "TSP4", "BGH3", "TTHY", "TR11B", "RL40", "CSPG2", "VIME", "VTNC")) 

回答

2

您的聚類失敗的原因是:kmeans 需要一個數值矩陣,而您傳給該函數的是一個包含因子變量的數據框。

您可以改爲先把這些因子轉換爲數值,然後再運行 kmeans:

# Pin the RNG state so the k-means run below is repeatable.
set.seed(144)

# Swap both factor columns for their integer level codes in one step
# (as.numeric on a factor yields the level index, not the interval bounds).
df[c("ZN.N", "ZL.N")] <- lapply(df[c("ZN.N", "ZL.N")], as.numeric)

# Partition the rows into two groups and keep only the per-row assignment
# vector, whose names are the row names of df.
clusters <- kmeans(x = df, centers = 2)[["cluster"]]

# NOTE(review): the member lists in the comments below are the recorded output
# of the original run; which group gets number 1 vs 2 depends on the random
# start, so confirm them on your own R version.

# Row names assigned to group 1.
clusters1 <- names(which(clusters == 1))
clusters1
# [1] "MMP2" "AEBP1" "A1AG1" "ENOA" "APOA1" "PGS1" "CAH3" "CO1A1" "CO3A1"
# [10] "C1R" "CO8A" "CRP" "KCRM" "FIBB" "FIBG" "HPT" "HBA" "H32"
# [19] "H4" "IGHG1" "IGKC" "INHBA" "MYG" "NID2" "POSTN" "PLTP" "PEDF"
# [28] "LMNA" "PDIA3" "PDIA6" "S10AA" "S10A8" "TENA" "TETN" "TSP3" "BGH3"
# [37] "VIME"

# Row names assigned to group 2.
clusters2 <- names(which(clusters == 2))
clusters2
# [1] "A1AT" "A2MG" "ANGI" "ANGL2" "ANT3" "APOA2" "APOD" "PGBM" "CRAC1"
# [10] "CILP1" "CILP2" "COMP" "CH3L1" "CH3L2" "CSPG4" "CCD80" "CO2A1" "CO6A1"
# [19] "COCA1" "COFA1" "COIA1" "CO1A2" "CO6A2" "COBA2" "CO6A3" "C1QB" "C1S"
# [28] "CO3" "CO4B" "CFAB" "CFAH" "CLC3A" "ECM1" "FIBA" "FGFP2" "FMOD"
# [37] "FINC" "FBLN1" "FSTL1" "G3P" "HBB" "H2B1L" "HPLN1" "IGHA1" "LAC6"
# [46] "IGHM" "IBP3" "ITIH1" "MMP1" "LDHA" "LYSC" "TIMP1" "TIMP2" "MIME"
# [55] "MOES" "NUCB1" "OSTP" "PPIA" "PPIB" "PRDX2" "PGAM1" "PA2GA" "IPSP"
# [64] "PCOC1" "PRELP" "AMBP" "PRG4" "KPYM" "RNAS1" "HTRA1" "TRFE" "ALBU"
# [73] "SAMP" "SMOC2" "MMP3" "TARSH" "TENX" "TSP4" "TTHY" "TR11B" "RL40"
# [82] "CSPG2" "VTNC"

在這段代碼中,ZN.N 被轉換成數字 1 和 2,ZL.N 被轉換成數字 1、2 和 3,然後 kmeans 根據各點之間的歐氏距離進行聚類。您必須自行判斷這對您的應用是否合理。