2013-10-11 16 views
2

比方說,我有以下 data.frame,它把 R 包的名稱對應到該包所屬的 CRAN 任務視圖(我想按任務視圖把各列加在一起):

# Lookup table relating each R package to the CRAN task view it belongs to
dictionary <- data.frame(
  task.view = rep(c("High.Performance.Computing", "Machine.Learning"), each = 3),
  package = c("Rcpp", "HadoopStreaming", "rJava", "e1071", "nnet", "RWeka")
)

#     task.view   package 
# High.Performance.Computing   Rcpp 
# High.Performance.Computing HadoopStreaming 
# High.Performance.Computing   rJava 
#   Machine.Learning   e1071 
#   Machine.Learning   nnet 
#   Machine.Learning   RWeka 

我還有一個計數表,記錄每個包被某位學生編寫的四個工具各調用了多少次:

# Referral counts: one row per student tool, one column per package
package.referals <- data.frame(
  Rcpp            = c(1, 0, 1, 1),
  HadoopStreaming = c(1, 0, 0, 0),
  rJava           = c(1, 0, 0, 1),
  e1071           = c(1, 1, 1, 1),
  nnet            = c(1, 0, 0, 0),
  RWeka           = c(1, 0, 0, 1),
  row.names = paste("student pkg", 1:4)
)

#    Rcpp HadoopStreaming rJava e1071 nnet RWeka 
# student pkg 1 1    1  1  1 1  1 
# student pkg 2 0    0  0  1 0  0 
# student pkg 3 1    0  0  1 0  0 
# student pkg 4 1    0  1  1 0  1 

我如何根據上面「包與任務視圖對應關係」的 data.frame,把 package.referals(記錄每個包被某位學生編寫的四個工具調用的次數)的各列按任務視圖歸併求和?

例如,我想要的輸出是:

# Desired result: referral totals per task view, one row per student tool
data.frame(
  High.Performance.Computing = c(3, 0, 1, 2),
  Machine.Learning           = c(3, 1, 1, 2),
  row.names = paste("student pkg", 1:4)
)

#    High.Performance.Computing Machine.Learning 
# student pkg 1       3    3 
# student pkg 2       0    1 
# student pkg 3       1    1 
# student pkg 4       2    2 

我嘗試了以下做法,但在把結果重組爲我想要的輸出(彙總並轉置)時卡住了:

# library() stops with an error when data.table is not installed, whereas
# require() only warns and returns FALSE, letting the script fail later.
library(data.table)

# Package names, i.e. the column names of the package.referals data.frame
package.referals.colnames <- names(package.referals)

# Task view / package relations as a data.table keyed by package name
dictionary.dt <- data.table(dictionary, key = "package")

# package.referals transposed (one row per package, one column per student
# tool), with the package name as an explicit column and as the key
package.referals.dt <- data.table(
  package = package.referals.colnames,
  t(package.referals),
  key = "package"
)

# Join the two tables so that each package and its task view share a row,
# then re-key the result by task view
dt <- package.referals.dt[J(dictionary.dt)]
setkey(dt, "task.view")

#   package student pkg 1 student pkg 2 student pkg 3 student pkg 4     task.view 
# 1: HadoopStreaming    1    0    0    0 High.Performance.Computing 
# 2:   Rcpp    1    0    1    1 High.Performance.Computing 
# 3:   rJava    1    0    0    1 High.Performance.Computing 
# 4:   e1071    1    1    1    1   Machine.Learning 
# 5:   nnet    1    0    0    0   Machine.Learning 
# 6:   RWeka    1    0    0    1   Machine.Learning 
+0

@SimonO101 有趣的是,我可以發誓我已經接受了其中一個熱心提交的解決方案。我會再接受一次——我的首選是使用 data.table 的方法,因爲我正試圖把我所有的 data.frame 都遷移到這個包(目前看不出它有任何缺點)。 –

回答

4

這裏是使用 reshape 和 base R 的解決方案:

# Keep the student tool names as a regular column so they survive the reshape
package.referals$id <- rownames(package.referals)
# Long format: one row per (student tool, package) pair.
# NOTE(review): melt() is not base R; the variable.name argument suggests
# reshape2, which must be loaded beforehand - confirm.
# id.vars is given explicitly instead of letting melt() guess it.
pkgr <- melt(package.referals, id.vars = "id", variable.name = "package")
# Keep only the packages that a tool actually refers to
pkgr <- pkgr[pkgr$value > 0, ]
# Attach the task view of each package; the join column is named explicitly
df <- merge(pkgr, dictionary, by = "package", all.x = TRUE)
# Cross-tabulate student tools against task views (counts the remaining rows)
table(df$id, df$task.view)

如果你真的想用 data.table 而不是 merge,可以把最後兩行(merge 和 table)替換爲:

# Convert both tables to data.tables keyed by the join column
pkgr <- data.table(pkgr, key = "package")
dictionary <- data.table(dictionary, key = "package")
# Keyed join: look up the rows of pkgr that match each row of dictionary
df <- pkgr[dictionary]
# Cross-tabulate student tools against task views
table(df$id, df$task.view)
2

您可以通過匹配把 package.referals 的列重命名爲對應的任務視圖,然後對同名的列做 rowSums ...

# Task view of every column of package.referals, looked up by package name
task.views <- as.character(
  dictionary$task.view[match(names(package.referals), dictionary$package)]
)
# Rename the columns to their task view (columns of one view share a name)
names(package.referals) <- task.views

# Row-wise total of the columns belonging to each task view.
# drop = FALSE keeps a data.frame even when a task view has a single package;
# without it the selection collapses to a vector and rowSums() fails.
# vapply() fixes the result type: one numeric value per row of package.referals.
vapply(
  unique(task.views),
  function(x) rowSums(package.referals[, task.views %in% x, drop = FALSE]),
  numeric(nrow(package.referals))
)
#    High.Performance.Computing Machine.Learning 
#student pkg 1       3    3 
#student pkg 2       0    1 
#student pkg 3       1    1 
#student pkg 4       2    2 
2

你也可以把所有信息放進同一個 data.frame,然後使用 aggregate:

# Lookup table relating each R package to the CRAN task view it belongs to
dictionary <- data.frame(
  task.view = c(rep("High.Performance.Computing", 3), rep("Machine.Learning", 3)),
  package = c("Rcpp", "HadoopStreaming", "rJava", "e1071", "nnet", "RWeka")
)

# Referral counts: one row per student tool, one column per package
package.referals <- data.frame(
  Rcpp = c(1, 0, 1, 1),
  HadoopStreaming = c(1, 0, 0, 0),
  rJava = c(1, 0, 0, 1),
  e1071 = c(1, 1, 1, 1),
  nnet = c(1, 0, 0, 0),
  RWeka = c(1, 0, 0, 1),
  row.names = paste("student pkg", 1:4)
)

# Transpose for easier manipulation: packages become rows, tools become columns
pack.ref <- as.data.frame(t(package.referals))

# Remember the tool columns before the grouping column is added, so the code
# does not depend on there being exactly four student tools
student.cols <- colnames(pack.ref)

# Add the task view of each package. match() compares whole names exactly;
# grep() would treat the name as a regular expression and also hit packages
# whose names merely contain it.
pack.ref$task.view <- as.character(
  dictionary$task.view[match(colnames(package.referals), dictionary$package)]
)

# Sum the referral counts of all packages within each task view
totals <- aggregate(
  pack.ref[student.cols],
  by = list(task.view = pack.ref$task.view),
  FUN = sum
)

# Transpose only the numeric totals (so they stay numeric) and use the task
# views as column names instead of carrying them along as a data row
DF <- as.data.frame(t(totals[student.cols]))
colnames(DF) <- as.character(totals$task.view)

DF
#               High.Performance.Computing Machine.Learning
# student pkg 1                          3                3
# student pkg 2                          0                1
# student pkg 3                          1                1
# student pkg 4                          2                2