也許我錯過了一些東西,因爲這似乎是一個簡單的問題,但是我在網上查找並沒有在文獻中找到任何東西。基於節點值約束的k均值聚類
基本上我需要做的是根據它們的位置(所以緯度/經度作爲每個節點的特徵,相似性度量的歐幾里德距離)對具有固定數量的簇的一組目的地城市進行聚類。一切似乎都很好,一個K-means就可以做到這一點。但是,對於每個羣集,我有以下限制:每個城市(節點)都有相應的值分配給它,並且每個羣集中這些值的總和不應超過固定閾值(所有羣集的閾值相同)。有沒有簡單的方法來做到這一點?
也許我錯過了一些東西,因爲這似乎是一個簡單的問題,但是我在網上查找並沒有在文獻中找到任何東西。基於節點值約束的k均值聚類
基本上我需要做的是根據它們的位置(所以緯度/經度作爲每個節點的特徵,相似性度量的歐幾里德距離)對具有固定數量的簇的一組目的地城市進行聚類。一切似乎都很好,一個K-means就可以做到這一點。但是,對於每個羣集,我有以下限制:每個城市(節點)都有相應的值分配給它,並且每個羣集中這些值的總和不應超過固定閾值(所有羣集的閾值相同)。有沒有簡單的方法來做到這一點?
你有2種選擇:
- 你可以改用rpart包作爲一個集羣,並使用重量和minbucket選項。然而,預測會給你的集羣將是矩形。
- 你可以看看源代碼,我在 https://searchcode.com/codesearch/view/18689414/發現:
kmeans <-
function(x, centers, iter.max = 10, nstart = 1,
algorithm = c("Hartigan-Wong", "Lloyd", "Forgy", "MacQueen"))
{
do_one <- function(nmeth) {
Z <-
switch(nmeth,
{ # 1
Z <- .Fortran(C_kmns, x, m, p,
centers = centers,
as.integer(k), c1 = integer(m), integer(m),
nc = integer(k), double(k), double(k), integer(k),
double(m), integer(k), integer(k),
as.integer(iter.max), wss = double(k),
ifault = 0L)
switch(Z$ifault,
stop("empty cluster: try a better set of initial centers",
call.=FALSE),
warning(gettextf("did not converge in %d iterations",
iter.max), call.=FALSE, domain =NA),
stop("number of cluster centres must lie between 1 and nrow(x)",
call.=FALSE)
)
Z
},
{ # 2
Z <- .C(C_kmeans_Lloyd, x, m, p,
centers = centers, as.integer(k),
c1 = integer(m), iter = as.integer(iter.max),
nc = integer(k), wss = double(k))
if(Z$iter > iter.max)
warning("did not converge in ",
iter.max, " iterations", call.=FALSE)
if(any(Z$nc == 0))
warning("empty cluster: try a better set of initial centers", call.=FALSE)
Z
},
{ # 3
Z <- .C(C_kmeans_MacQueen, x, m, p,
centers = as.double(centers), as.integer(k),
c1 = integer(m), iter = as.integer(iter.max),
nc = integer(k), wss = double(k))
if(Z$iter > iter.max)
warning("did not converge in ",
iter.max, " iterations", call.=FALSE)
if(any(Z$nc == 0))
warning("empty cluster: try a better set of initial centers", call.=FALSE)
Z
})
Z
}
x <- as.matrix(x)
m <- as.integer(nrow(x))
if(is.na(m)) stop("invalid nrow(x)")
p <- as.integer(ncol(x))
if(is.na(p)) stop("invalid ncol(x)")
if(missing(centers))
stop("'centers' must be a number or a matrix")
nmeth <- switch(match.arg(algorithm),
"Hartigan-Wong" = 1,
"Lloyd" = 2, "Forgy" = 2,
"MacQueen" = 3)
if(length(centers) == 1L) {
if (centers == 1) nmeth <- 3
k <- centers
## we need to avoid duplicates here
if(nstart == 1)
centers <- x[sample.int(m, k), , drop = FALSE]
if(nstart >= 2 || any(duplicated(centers))) {
cn <- unique(x)
mm <- nrow(cn)
if(mm < k)
stop("more cluster centers than distinct data points.")
centers <- cn[sample.int(mm, k), , drop=FALSE]
}
} else {
centers <- as.matrix(centers)
if(any(duplicated(centers)))
stop("initial centers are not distinct")
cn <- NULL
k <- nrow(centers)
if(m < k)
stop("more cluster centers than data points")
}
if(iter.max < 1) stop("'iter.max' must be positive")
if(ncol(x) != ncol(centers))
stop("must have same number of columns in 'x' and 'centers'")
if(!is.double(x)) storage.mode(x) <- "double"
if(!is.double(centers)) storage.mode(centers) <- "double"
Z <- do_one(nmeth)
best <- sum(Z$wss)
if(nstart >= 2 && !is.null(cn))
for(i in 2:nstart) {
centers <- cn[sample.int(mm, k), , drop=FALSE]
ZZ <- do_one(nmeth)
if((z <- sum(ZZ$wss)) < best) {
Z <- ZZ
best <- z
}
}
centers <- matrix(Z$centers, k)
dimnames(centers) <- list(1L:k, dimnames(x)[[2L]])
cluster <- Z$c1
if(!is.null(rn <- rownames(x)))
names(cluster) <- rn
totss <- sum(scale(x, scale = FALSE)^2)
structure(list(cluster = cluster, centers = centers, totss = totss,
withinss = Z$wss, tot.withinss = best,
betweenss = totss - best, size = Z$nc),
class = "kmeans")
}
## modelled on print methods in the cluster package
print.kmeans <- function(x, ...)
{
cat("K-means clustering with ", length(x$size), " clusters of sizes ",
paste(x$size, collapse=", "), "\n", sep="")
cat("\nCluster means:\n")
print(x$centers, ...)
cat("\nClustering vector:\n")
print(x$cluster, ...)
cat("\nWithin cluster sum of squares by cluster:\n")
print(x$withinss, ...)
cat(sprintf(" (between_SS/total_SS = %5.1f %%)\n",
100 * x$betweenss/x$totss),
"Available components:\n", sep="\n")
print(names(x))
invisible(x)
}
fitted.kmeans <- function(object, method = c("centers", "classes"), ...)
{
method <- match.arg(method)
if (method == "centers") object$centers[object$cl, , drop=FALSE]
else object$cl
}
請注意,如果代碼檢查的改善發生這些行:
if((z <- sum(ZZ$wss)) < best) {
Z <- ZZ
best <- z
}
在這裏你可以添加你的約束。
這是一個**揹包**類型的問題,而不是一個聚類問題。另外,不要在緯度/經度上使用k-means。 –
不,這不是一個揹包問題。它只是與約束聚類。 –