2015-09-26 141 views

回答

-1

以下函數是根據鏈接中的答案改寫而成:

CosineSimilarities <- function(m, top.k) {
    # Computes cosine similarity between each row and all other rows in a matrix.
    #
    # Args:
    #   m: Numeric matrix; every row is treated as one vector.
    #   top.k: Non-negative number of most similar *other* rows to report for
    #     each row. Values above `nrow(m) - 1` are clamped to that maximum.
    #
    # Returns:
    #   Data frame with columns `row.id1` (the row), `row.id2` (a similar row)
    #   and `cosine.similarity`, holding at most `top.k` records per row and
    #   never pairing a row with itself.
    stopifnot(length(top.k) == 1L, !is.na(top.k), top.k >= 0)

    # Similarity computation: dot products scaled by the product of row norms.
    dot.products <- tcrossprod(m)
    squared.norms <- rowSums(m^2)
    similarity <- dot.products / sqrt(outer(squared.norms, squared.norms))
    n <- nrow(similarity)

    # Rank on a copy whose diagonal is forced to `Inf`, so the self-reference is
    # guaranteed to rank first in its column. Relying on "similarity = 1" alone
    # breaks with tied/duplicate rows or floating-point round-off, where another
    # row can outrank the row itself and an extra record would be returned.
    ranking <- similarity
    diag(ranking) <- Inf
    # `matrix()` restores the shape that `apply()` drops for a 1 x 1 input.
    ranks <- matrix(apply(ranking, 2, order, decreasing = TRUE), nrow = n)

    # Keep `top.k + 1` rows (self + `top.k` others), but never more than exist;
    # `drop = FALSE` keeps `top` a matrix even when a single row is selected.
    keep <- seq_len(min(top.k + 1, n))
    top <- ranks[keep, , drop = FALSE]

    result.df <- data.frame(row.id1 = c(col(top)), row.id2 = c(top))
    result.df$cosine.similarity <-
        similarity[cbind(result.df$row.id2, result.df$row.id1)]

    # Remove same-row records and return.
    result.df[result.df$row.id1 != result.df$row.id2, ]
}

例如:

# Build a 3 x 3 demo matrix; the outer parentheses also print it.
(m <- matrix(seq_len(9), nrow = 3))
#      [,1] [,2] [,3]
# [1,]    1    4    7
# [2,]    2    5    8
# [3,]    3    6    9

# Most similar other row for every row (similarities shown to 4 decimals).
CosineSimilarities(m, top.k = 1)
#   row.id1 row.id2 cosine.similarity
# 2       1       2            0.9956
# 4       2       3            0.9977
# 6       3       2            0.9977