2011-04-13 55 views
11

我正在滾動迴歸非常相似,下面的代碼滾動窗口迴歸:並行化中的R

library(PerformanceAnalytics) 
library(quantmod) 
data(managers) 

FL <- as.formula(Next(HAM1)~HAM1+HAM2+HAM3+HAM4) 
MyRegression <- function(df,FL) { 
    df <- as.data.frame(df) 
    model <- lm(FL,data=df[1:30,]) 
    predict(model,newdata=df[31,]) 
} 

system.time(Result <- rollapply(managers, 31, FUN="MyRegression",FL, 
    by.column = FALSE, align = "right", na.pad = TRUE)) 

我已經得到了一些額外的處理器,所以我試圖找到一種方法來並行滾動窗口。如果這是一個非滾動迴歸,我可以很容易地進行並行化使用申請家庭的功能...

回答

9

最明顯的一種是使用lm.fit()代替lm()這樣你就不會招致處理的公式等全部開銷。

更新:所以,當我說明顯我的意思說的是言自明,但看似難以實現

有點擺弄周圍後,我想出了這個

library(PerformanceAnalytics) 
library(quantmod) 
data(managers) 

第一階段是要認識到,模型矩陣可以預先構建的,所以我們做到這一點,並將其轉換回動物園對象與rollapply()使用:

mmat2 <- model.frame(Next(HAM1) ~ HAM1 + HAM2 + HAM3 + HAM4, data = managers, 
        na.action = na.pass) 
mmat2 <- cbind.data.frame(mmat2[,1], Intercept = 1, mmat2[,-1]) 
mmatZ <- as.zoo(mmat2) 

現在我們需要的是將採用lm.fit()做繁重的任務,而不必在每次迭代創建設計矩陣的功能:

MyRegression2 <- function(Z) { 
    ## store value we want to predict for 
    pred <- Z[31, -1, drop = FALSE] 
    ## get rid of any rows with NA in training data 
    Z <- Z[1:30, ][!rowSums(is.na(Z[1:30,])) > 0, ] 
    ## Next() would lag and leave NA in row 30 for response 
    ## but we precomputed model matrix, so drop last row still in Z 
    Z <- Z[-nrow(Z),] 
    ## fit the model 
    fit <- lm.fit(Z[, -1, drop = FALSE], Z[,1]) 
    ## get things we need to predict, in case pivoting turned on in lm.fit 
    p <- fit$rank 
    p1 <- seq_len(p) 
    piv <- fit$qr$pivot[p1] 
    ## model coefficients 
    beta <- fit$coefficients 
    ## this gives the predicted value for row 31 of data passed in 
    drop(pred[, piv, drop = FALSE] %*% beta[piv]) 
} 

定時的比較:

> system.time(Result <- rollapply(managers, 31, FUN="MyRegression",FL, 
+         by.column = FALSE, align = "right", 
+         na.pad = TRUE)) 
    user system elapsed 
    0.925 0.002 1.020 
> 
> system.time(Result2 <- rollapply(mmatZ, 31, FUN = MyRegression2, 
+         by.column = FALSE, align = "right", 
+         na.pad = TRUE)) 
    user system elapsed 
    0.048 0.000 0.05 

這提供了原來的一個非常合理的改進。現在檢查產生的物體是否相同:

> all.equal(Result, Result2) 
[1] TRUE 

Enjoy!

+0

@Zach我當然相信你知道你在做什麼這裏做的更快 - 試圖讓一步 - 預測? – 2011-04-13 15:58:51

+0

@Gavin Simpson是的,那就是我正在做的。我也試着將其並行化。 – Zach 2011-04-13 17:02:32

+0

@Zach - 剛剛發佈了一個更新,其中包含了實現我的lm.fit()'建議的代碼。這樣做比我感謝的要複雜一點。 – 2011-04-13 18:15:04

1

您可以通過更新分解來縮短運行時間。這會在每次迭代時產生frm1成本,而不是frm1,其中n是窗口寬度。以下是比較兩者的代碼。在C++中執行它可能會快得多,但LINPACK dchuddchdd不包含在R中,因此您必須編寫一個包來執行此操作。此外,我記得讀,你可以與其他實現比LINPACK dchuddchdd用於R更新

library(SamplerCompare) # for LINPACK `chdd` and `chud` 
roll_forcast <- function(X, y, width){ 
    n <- nrow(X) 
    p <- ncol(X) 
    out <- rep(NA_real_, n) 

    is_first <- TRUE 
    i <- width 
    while(i < n){ 
    if(is_first){ 
     is_first <- FALSE 
     qr. <- qr(X[1:width, ]) 
     R <- qr.R(qr.) 

     # Use X^T for the rest 
     X <- t(X) 

     XtY <- drop(tcrossprod(y[1:width], X[, 1:width])) 
    } else { 
     x_new <- X[, i] 
     x_old <- X[, i - width] 

     # update R 
     R <- .Fortran(
     "dchud", R, p, p, x_new, 0., 0L, 0L, 
     0., 0., numeric(p), numeric(p), 
     PACKAGE = "SamplerCompare")[[1]] 

     # downdate R 
     R <- .Fortran(
     "dchdd", R, p, p, x_old, 0., 0L, 0L, 
     0., 0., numeric(p), numeric(p), integer(1), 
     PACKAGE = "SamplerCompare")[[1]] 

     # update XtY 
     XtY <- XtY + y[i] * x_new - y[i - width] * x_old 
    } 

    coef. <- .Internal(backsolve(R, XtY, p, TRUE, TRUE)) 
    coef. <- .Internal(backsolve(R, coef., p, TRUE, FALSE)) 

    i <- i + 1 
    out[i] <- X[, i] %*% coef. 
    } 

    out 
} 

# simulate data 
set.seed(101) 
n <- 10000 
wdth = 50 
X <- matrix(rnorm(10 * n), n, 10) 
y <- drop(X %*% runif(10)) + rnorm(n) 
Z <- cbind(y, X) 

# assign other function 
lm_version <- function(Z, width = wdth) { 
    pred <- Z[width + 1, -1, drop = FALSE] 
    ## fit the model 
    Z <- Z[-nrow(Z), ] 
    fit <- lm.fit(Z[, -1, drop = FALSE], Z[,1]) 
    ## get things we need to predict, in case pivoting turned on in lm.fit 
    p <- fit$rank 
    p1 <- seq_len(p) 
    piv <- fit$qr$pivot[p1] 
    ## model coefficients 
    beta <- fit$coefficients 
    ## this gives the predicted value for row 31 of data passed in 
    drop(pred[, piv, drop = FALSE] %*% beta[piv]) 
} 

# show that they yield the same 
library(zoo) 
all.equal(
    rollapply(Z, wdth + 1, FUN = lm_version, 
      by.column = FALSE, align = "right", fill = NA_real_), 
    roll_forcast(X, y, wdth)) 
#R> [1] TRUE 

# benchmark 
library(compiler) 
roll_forcast <- cmpfun(roll_forcast) 
lm_version <- cmpfun(lm_version) 
microbenchmark::microbenchmark(
    new = roll_forcast(X, y, wdth), 
    prev = rollapply(Z, wdth + 1, FUN = lm_version, 
        by.column = FALSE, align = "right", fill = NA_real_), 
    times = 10) 
#R> Unit: milliseconds 
#R> expr  min  lq  mean median  uq  max neval cld 
#R> new 113.7637 115.4498 129.6562 118.6540 122.4930 230.3414 10 a 
#R> prev 639.6499 674.1677 682.1996 678.6195 686.8816 763.8034 10 b 
+0

好主意,謝謝! – Zach 2018-02-19 13:46:51