2013-08-23 36 views
5

有沒有一種簡單而快速的方法來獲得在整數向量中出現的每個整數的頻率:R什麼是獲取矢量中整數頻率的最快方法?

這裏是我的嘗試至今:

x <- floor(runif(1000000)*1000) 

print('*** using TABLE:') 
system.time(as.data.frame(table(x))) 

print('*** using HIST:') 
system.time(hist(x,breaks=min(x):(max(x)+1),plot=FALSE,right=FALSE)) 

print('*** using SORT') 
system.time({cdf<-cbind(sort(x),seq_along(x)); cdf<-cdf[!duplicated(cdf[,1]),2]; c(cdf[-1],length(x)+1)-cdf}) 

print('*** using ECDF') 
system.time({i<-min(x):max(x); cdf<-ecdf(x)(i)*length(x); cdf-c(0,cdf[-length(i)])}) 

print('*** counting in loop') 
system.time({h<-rep(0,max(x)+1);for(i in seq_along(x)){h[x[i]]<-h[x[i]]+1}; h}) 

#print('*** vectorized summation') #This uses too much memory if x is large 
#system.time(colSums(matrix(rbind(min(x):max(x))[rep(1,length(x)),]==x,ncol=max(x)-min(x)+1))) 

#Note: There are some fail cases in some of the above methods that need patching if, for example, there is a chance that some integer bins are unoccupied 

和這裏的結果:

[1] "*** using TABLE:" 
    user system elapsed 
    1.26 0.03 1.29 
[1] "*** using HIST:" 
    user system elapsed 
    0.11 0.00 0.10 
[1] "*** using SORT" 
    user system elapsed 
    0.22 0.02 0.23 
[1] "*** using ECDF" 
    user system elapsed 
    0.17 0.00 0.17 
[1] "*** counting in loop" 
    user system elapsed 
    3.12 0.00 3.12 

正如你可以看到table是可笑緩慢,hist似乎是最快的。但是hist(正如我使用它)正在處理可任意指定的斷點,而我只是想要整數。沒有辦法交換這種靈活性以獲得更好的性能嗎?

C,for(i=0;i<1000000;i++)h[x[i]]++;會很快起來。

回答

6

最快的是使用tabulate,但它需要正整數作爲輸入,所以你必須做一個快速的單調變換。

set.seed(21) 
x <- as.integer(runif(1e6)*1000) 
system.time({ 
    adj <- 1L - min(x) 
    y <- setNames(tabulate(x+adj), sort(unique(x))) 
}) 
4

不要忘記你可以內嵌C++代碼R.

library(inline) 

src <- ' 
Rcpp::NumericVector xa(a); 
int n_xa = xa.size(); 
int test = max(xa); 
Rcpp::NumericVector xab(test); 
for (int i = 0; i < n_xa; i++) 
xab[xa[i]-1]++; 
return xab; 
' 
fun <- cxxfunction(signature(a = "numeric"),src, plugin = "Rcpp") 
2

我覺得製表或C++版本的路要走,但這裏的一些代碼中使用rbenchmark這是一個很大的包看時機(我加慢一些功能測試過):

###################### 
### ---Clean Up--- ### 
###################### 

rm(list = ls()) 
gc() 

###################### 
### ---Packages--- ### 
##################### 

require(parallel) 
require(data.table) 
require(rbenchmark) 
require(inline) 


####################### 
### ---Functions--- ### 
####################### 

# Competitor functions by Breyal 
Breyal.using_datatable <- function(x) {DT <- data.table(x = x, weight = 1, key = "x"); DT[, length(weight), by = x]} 
Breyal.using_lapply_1c_eq <- function(x = sort(x)) { lapply(unique(x), function(u) sum(x == u)) } # 1 core 
Breyal.using_mclapply_8c_eq <- function(x = sort(x)) { mclapply(unique(x), function(u) sum(x == u), mc.cores = 8L) } # 8 cores 

# Competitor functions by tennenrishin 
tennenrishin.using_table <- function(x) as.data.frame(table(x)) 
tennenrishin.using_hist <- function(x) hist(x,breaks=min(x):(max(x)+1),plot=FALSE,right=FALSE) 
tennenrishin.using_sort <- function(x) {cdf<-cbind(sort(x),seq_along(x)); cdf<-cdf[!duplicated(cdf[,1]),2]; c(cdf[-1],length(x)+1)-cdf} 
tennenrishin.using_ecdf <- function(x) {i<-min(x):max(x); cdf<-ecdf(x)(i)*length(x); cdf-c(0,cdf[-length(i)])} 
tennenrishin.using_counting_loop <- function(x) {h<-rep(0,max(x)+1);for(i in seq_along(x)){h[x[i]]<-h[x[i]]+1}; h} 

# Competitor function by Ulrich 
Ulrich.using_tabulate <- function(x) { 
    adj <- 1L - min(x) 
    y <- setNames(tabulate(x+adj), sort(unique(x))) 
    return(y) 
} 

# I couldn't get the Joe's C++ version to work (my laptop won't install inline) butI suspect that would be the fastest solution 

################## 
### ---Data--- ### 
################## 

# Set seed so results are reproducable 
set.seed(21) 

# Data vector 
x <- floor(runif(1000000)*1000) 


##################### 
### ---Timings--- ### 
##################### 

# Benchmarks using Ubuntu 13.04 x64 with 8GB RAM and i7-2600K CPU @ 3.40GHz 
benchmark(replications = 5, 
      tennenrishin.using_table(x), 
      tennenrishin.using_hist(x), 
      tennenrishin.using_sort(x), 
      tennenrishin.using_ecdf(x), 
      tennenrishin.using_counting_loop(x), 
      Ulrich.using_tabulate(x), 
      Breyal.using_datatable(x), 
      Breyal.using_lapply_1c_eq(x), 
      Breyal.using_mclapply_8c_eq(x), 
      order = "relative") 

導致在下列時間

        test replications elapsed relative user.self sys.self user.child sys.child 
6   Ulrich.using_tabulate(x)   5 0.176 1.000  0.176 0.000  0.00  0.000 
2   tennenrishin.using_hist(x)   5 0.468 2.659  0.468 0.000  0.00  0.000 
3   tennenrishin.using_sort(x)   5 0.687 3.903  0.688 0.000  0.00  0.000 
4   tennenrishin.using_ecdf(x)   5 0.749 4.256  0.748 0.000  0.00  0.000 
7   Breyal.using_datatable(x)   5 2.960 16.818  2.960 0.000  0.00  0.000 
1   tennenrishin.using_table(x)   5 4.651 26.426  4.596 0.052  0.00  0.000 
9  Breyal.using_mclapply_8c_eq(x)   5 10.817 61.460  0.140 1.196  54.62  7.112 
5 tennenrishin.using_counting_loop(x)   5 10.922 62.057 10.912 0.000  0.00  0.000 
8  Breyal.using_lapply_1c_eq(x)   5 36.807 209.131 36.768 0.000  0.00  0.000 
+1

內聯可能會讓工作變得有點痛苦。在Windows上,你需要[rtools包](http://cran.r-project.org/bin/windows/Rtools/),但我不確定Ubuntu。我用我的代碼運行了你的測試,並且舒服地贏了,比表格解決方案快4倍。 – Joe

相關問題