此函數是根據所連結的答案改寫而成:
CosineSimilarities <- function(m, top.k) {
  # Computes the cosine similarity between every pair of rows of a matrix and
  # reports, for each row, its `top.k` most similar other rows.
  #
  # Args:
  #   m: Numeric matrix; each row is treated as one vector.
  #   top.k: Number of most similar rows to report per row. Must be a single
  #     non-negative number (a fractional value is truncated). Values above
  #     `nrow(m) - 1` are capped there, because a row has at most that many
  #     other rows to be compared with.
  #
  # Returns:
  #   Data frame with one record per reported pair and the columns `row.id1`
  #   (the row in question), `row.id2` (the similar row) and
  #   `cosine.similarity`, ordered by `row.id1` and then by decreasing
  #   similarity. A row is never paired with itself. Rows with zero norm
  #   yield NaN similarities (0 / 0), which rank last.
  stopifnot(is.numeric(top.k), length(top.k) == 1, !is.na(top.k), top.k >= 0)

  # Similarity computation: dot products of all row pairs, scaled by the
  # product of the two row norms.
  cp <- tcrossprod(m)
  mm <- rowSums(m^2)
  result <- cp / sqrt(outer(mm, mm))
  n <- nrow(result)

  # Pin every self-similarity to +Inf so that each row always ranks itself
  # first, even when other rows tie with it (duplicate or parallel rows, or
  # floating-point noise around 1). This guarantees that the self-reference
  # is among the `top.k + 1` candidates and is the record removed below; the
  # sentinel itself therefore never reaches the output.
  diag(result) <- Inf

  # Column j of `top` lists all row indices by decreasing similarity to row j.
  top <- vapply(seq_len(n),
                function(j) order(result[, j], decreasing = TRUE),
                integer(n))
  dim(top) <- c(n, n)  # vapply() returns a plain vector when n == 1

  # Keep `top.k + 1` candidates per row (the row itself plus its matches),
  # but never more than there are rows. `drop = FALSE` keeps the matrix shape
  # when a single candidate is kept, which `col()` requires.
  n.keep <- min(floor(top.k) + 1, n)
  top <- top[seq_len(n.keep), , drop = FALSE]

  result.df <- data.frame(row.id1 = c(col(top)), row.id2 = c(top))
  result.df$cosine.similarity <- result[as.matrix(result.df[, 2:1])]
  # Remove same-row records and return
  return(result.df[result.df$row.id1 != result.df$row.id2, ])
}
例如:
# Demo input: a 3 x 3 integer matrix, filled column by column.
m <- matrix(seq_len(9), nrow = 3)
m
#      [,1] [,2] [,3]
# [1,]    1    4    7
# [2,]    2    5    8
# [3,]    3    6    9

# Single most similar other row for each row (similarities shown rounded).
CosineSimilarities(m, top.k = 1)
#   row.id1 row.id2 cosine.similarity
# 2       1       2            0.9956
# 4       2       3            0.9977
# 6       3       2            0.9977