編輯
我繼續修改combine
「處理」不平等訓練集大小。所有這些意思都是因爲我刪除了一大堆試圖將事情拼湊在一起而無法匹配的代碼。但我一直認爲結合了森林的部分,所以你仍然可以使用predict
:
my_combine <- function (...)
{
pad0 <- function(x, len) c(x, rep(0, len - length(x)))
padm0 <- function(x, len) rbind(x, matrix(0, nrow = len -
nrow(x), ncol = ncol(x)))
rflist <- list(...)
areForest <- sapply(rflist, function(x) inherits(x, "randomForest"))
if (any(!areForest))
stop("Argument must be a list of randomForest objects")
rf <- rflist[[1]]
classRF <- rf$type == "classification"
trees <- sapply(rflist, function(x) x$ntree)
ntree <- sum(trees)
rf$ntree <- ntree
nforest <- length(rflist)
haveTest <- !any(sapply(rflist, function(x) is.null(x$test)))
vlist <- lapply(rflist, function(x) rownames(importance(x)))
numvars <- sapply(vlist, length)
if (!all(numvars[1] == numvars[-1]))
stop("Unequal number of predictor variables in the randomForest objects.")
for (i in seq_along(vlist)) {
if (!all(vlist[[i]] == vlist[[1]]))
stop("Predictor variables are different in the randomForest objects.")
}
haveForest <- sapply(rflist, function(x) !is.null(x$forest))
if (all(haveForest)) {
nrnodes <- max(sapply(rflist, function(x) x$forest$nrnodes))
rf$forest$nrnodes <- nrnodes
rf$forest$ndbigtree <- unlist(sapply(rflist, function(x) x$forest$ndbigtree))
rf$forest$nodestatus <- do.call("cbind", lapply(rflist,
function(x) padm0(x$forest$nodestatus, nrnodes)))
rf$forest$bestvar <- do.call("cbind", lapply(rflist,
function(x) padm0(x$forest$bestvar, nrnodes)))
rf$forest$xbestsplit <- do.call("cbind", lapply(rflist,
function(x) padm0(x$forest$xbestsplit, nrnodes)))
rf$forest$nodepred <- do.call("cbind", lapply(rflist,
function(x) padm0(x$forest$nodepred, nrnodes)))
tree.dim <- dim(rf$forest$treemap)
if (classRF) {
rf$forest$treemap <- array(unlist(lapply(rflist,
function(x) apply(x$forest$treemap, 2:3, pad0,
nrnodes))), c(nrnodes, 2, ntree))
}
else {
rf$forest$leftDaughter <- do.call("cbind", lapply(rflist,
function(x) padm0(x$forest$leftDaughter, nrnodes)))
rf$forest$rightDaughter <- do.call("cbind", lapply(rflist,
function(x) padm0(x$forest$rightDaughter, nrnodes)))
}
rf$forest$ntree <- ntree
if (classRF)
rf$forest$cutoff <- rflist[[1]]$forest$cutoff
}
else {
rf$forest <- NULL
}
#
#Tons of stuff removed here...
#
if (classRF) {
rf$confusion <- NULL
rf$err.rate <- NULL
if (haveTest) {
rf$test$confusion <- NULL
rf$err.rate <- NULL
}
}
else {
rf$mse <- rf$rsq <- NULL
if (haveTest)
rf$test$mse <- rf$test$rsq <- NULL
}
rf
}
然後你就可以測試它是這樣的:
data(iris)
d <- iris[sample(150,150),]
d1 <- d[1:70,]
d2 <- d[71:150,]
rf1 <- randomForest(Species ~ ., d1, ntree=50, norm.votes=FALSE)
rf2 <- randomForest(Species ~ ., d2, ntree=50, norm.votes=FALSE)
rf.all <- my_combine(rf1,rf2)
predict(rf.all,newdata = iris)
顯然,這種帶有絕對沒有保修! :)
這兩個數據集的確切結構是什麼? (使用'str'來查看)。他們是否有完全相同的變量,都以同樣的方式命名? – joran
@joran他們做的。他們都是我手動拆分的較大訓練集的子集。當我運行'str'時,第一行是:''data.frame':\t 38735 obs。 55個變量:'對於d1和''data.frame':\t 38734 obs。 55個變量:'對於d2。後面是每個數據集的相同名稱。 – josh
檢查每個射頻對象的投票對象的維數。並非所有因素水平都存在於您的訓練數據的每個子集中。 – joran