2014-05-19 81 views
6

現在,我有以下由original.df %.% group_by(Category) %.% tally() %.% arrange(desc(n))創建的data.frame。創建「其他」字段

DF <- structure(list(Category = c("E", "K", "M", "L", "I", "A", 
"S", "G", "N", "Q"), n = c(163051, 127133, 106680, 64868, 49701, 
47387, 47096, 45601, 40056, 36882)), .Names = c("Category", 
"n"), row.names = c(NA, 10L), class = c("tbl_df", "tbl", "data.frame" 
)) 

     Category  n 
1    E 163051 
2    K 127133 
3    M 106680 
4    L 64868 
5    I 49701 
6    A 47387 
7    S 47096 
8    G 45601 
9    N 40056 
10    Q 36882 

我想創建一個「其他」字段從最低排名類別n。即

 Category  n 
1    E 163051 
2    K 127133 
3    M 106680 
4    L 64868 
5    I 49701 
6   Other 217022 

現在,我做

rbind(filter(DF, rank(rev(n)) <= 5), 
    summarise(filter(DF, rank(rev(n)) > 5), Category = "Other", n = sum(n))) 

其中倒塌沒有進入前5所有類別到其他類別。

但我很好奇dplyr或其他現有的軟件包中是否有更好的方法。 「更好」我的意思是更簡潔/可讀。我也有興趣採用更聰明或更靈活的方法來選擇Other

回答

5

不同的封裝/不同的語法版本:

library(data.table) 

dt = as.data.table(DF) 

dt[order(-n), # your data is already sorted, so this does nothing for it 
    if (.BY[[1]]) .SD else list("Other", sum(n)), 
    by = 1:nrow(dt) <= 5][, !"nrow", with = F] 
# Category  n 
#1:  E 163051 
#2:  K 127133 
#3:  M 106680 
#4:  L 64868 
#5:  I 49701 
#6: Other 217022 
6

這是另一種方法,假設每個類別(前5名的至少)只發生一次:

df %.% 
    arrange(desc(n)) %.%  #you could skip this step since you arranged the input df already according to your question 
    mutate(Category = ifelse(1:n() > 5, "Other", Category)) %.% 
    group_by(Category) %.% 
    summarize(n = sum(n)) 

# Category  n 
#1  E 163051 
#2  I 49701 
#3  K 127133 
#4  L 64868 
#5  M 106680 
#6 Other 217022 

編輯:

我只注意到通過降低n我的輸出沒有訂購任何更多。再次運行代碼後,我發現訂單一直保留到group_by(Category)之後,但之後我運行summarize時,訂單已不存在(或者更確切地說,它似乎是按Category訂購的)。那應該是這樣嗎?

這裏有三個方法:

m <- 5 #number of top results to show in final table (excl. "Other") 
n <- m+1 

#preserves the order (or better: reesatblishes it by index) 
df <- arrange(df, desc(n)) %.% #this could be skipped if data already ordered 
    mutate(idx = 1:n(), Category = ifelse(idx > m, "Other", Category)) %.% 
    group_by(Category) %.% 
    summarize(n = sum(n), idx = first(idx)) %.% 
    arrange(idx) %.% 
    select(-idx) 

#doesnt preserve the order (same result as in first dplyr solution, ordered by Category) 
df[order(df$n, decreasing=T),]  #this could be skipped if data already ordered 
df[n:nrow(df),1] <- "Other" 
df <- aggregate(n ~ Category, data = df, FUN = "sum") 

#preserves the order (without extra index) 
df[order(df$n, decreasing=T),]  #this could be skipped if data already ordered 
df[n:nrow(df),1] <- "Other" 
df[n,2] <- sum(df$n[df$Category == "Other"]) 
df <- df[1:n,] 
0

該函數修改的列,與Other更換不頻繁的條目,或者通過指定最小頻率,或者通過指定預期的類別的結果數目。

#' @title Group infrequent entries into 'Other category' 
#' @description Useful when you want to constrain the number of unique values in a column. 
#' @param .data Data containing variable. 
#' @param var Variable containing infrequent entries, to be collapsed into "Other". 
#' @param n Threshold for total number of categories above "Other". 
#' @param count Threshold for total count of observations before "Other". 
#' @param by Extra variables to group by when calculating \code{n} or \code{count}. 
#' @param copy Should \code{.data} be copied? Currently only \code{TRUE} is supported. 
#' @param other.category Value that infrequent entries are to be collapsed into. Defaults to \code{"Other"}. 
#' @return \code{.data} but with \code{var} changed to be grouped into smaller categories. 
#' @export 
mutate_other <- function(.data, var, n = 5, count, by = NULL, copy = TRUE, other.category = "Other"){ 
    stopifnot(is.data.table(.data), 
      is.character(other.category), 
      identical(length(other.category), 1L)) 

    had.key <- haskey(.data) 

    if (!isTRUE(copy)){ 
    stop("copy must be TRUE") 
    } 

    out <- copy(.data) 

    if (had.key){ 
    orig_key <- key(out) 
    } else { 
    orig_key <- "_order" 
    out[, "_order" := 1:.N] 
    setkeyv(out, "_order") 
    } 

    if (is.character(.data[[var]])){ 
    stopifnot(!("nvar" %in% names(.data)), 
       var %in% names(.data)) 

    N <- .rank <- NULL 
    n_by_var <- 
     out %>% 
     .[, .N, keyby = c(var, by)] %>% 
     .[, .rank := rank(-N)] 

    out <- merge(out, n_by_var, by = c(var, by)) 

    if (missing(count)){ 
     out[, (var) := dplyr::if_else(.rank <= n, out[[var]], other.category)] 
    } else { 
     out[, (var) := dplyr::if_else(N >= count, out[[var]], other.category)] 
    } 
    out <- 
     out %>% 
     .[, N := NULL] %>% 
     .[, .rank := NULL] 

    setkeyv(out, orig_key) 

    if (!had.key){ 
     out[, (orig_key) := NULL] 
     setkey(out, NULL) 
    } 
    out 

    } else { 
    warning("Attempted to use by = on a non-character vector. Aborting.") 
    return(.data) 
    } 
} 

https://github.com/HughParsonage/hutils/blob/master/R/mutate_other.R

相關問題