2015-09-30 72 views
0

我正在嘗試對一個大型數據框進行索引。數據框 sdata 有 300 萬個觀測值和 26 個變量(結構見文末)。問題:R 中的向量化索引

# Parameters used below to lay out `setsize` locations around (ctrX, ctrY).
# Units are not stated here — presumably screen pixels; TODO confirm.
# Number of locations (rows of `loc`, columns of `gazedist`).
setsize <- 6 
# Offset subtracted from ctrY to place the first location.
eccent <- 150 
# Centre point passed to xyrotate() as the pivot of the rotation.
ctrX <- 400 
ctrY <- 300 

# Rotate the point (x, y) about the pivot (ctrX, ctrY) by `angle` degrees.
#
# Arguments:
#   x, y        coordinates of the point(s) to rotate
#   ctrX, ctrY  coordinates of the pivot
#   angle       rotation angle in degrees (converted to radians internally)
#
# Returns a list with elements "X" and "Y" holding the rotated coordinates.
# All arithmetic is element-wise, so vector arguments give vector results.
xyrotate <- function(x, y, ctrX, ctrY, angle) {
  # Offsets of the point from the pivot.
  dx <- x - ctrX
  dy <- y - ctrY

  theta <- angle * (pi / 180)
  cos_t <- cos(theta)
  sin_t <- sin(theta)

  # Standard 2-D rotation of the offsets, shifted back by the pivot.
  list(
    "X" = ctrX + (dx * cos_t) - (dy * sin_t),
    "Y" = ctrY + (dx * sin_t) + (dy * cos_t)
  )
}

# Build the table of `setsize` locations. Row 1 is the point (ctrX, ctrY - eccent);
# rows 2..setsize are that same point rotated about (ctrX, ctrY) in equal
# steps of 360/setsize degrees.
loc <- data.frame(x = numeric(setsize), 
       y = numeric(setsize)) 
loc$x[1] <- ctrX 
loc$y[1] <- ctrY - eccent 
# NOTE(review): 2:setsize counts down (2, 1) when setsize is 1, so this loop
# assumes setsize >= 2.
for(i in 2:setsize){ 
    # Always rotate the first location, by (i-1) steps.
    coord <- xyrotate(loc$x[1], loc$y[1],ctrX,ctrY,(i-1)*(360/setsize)) 
    loc$x[i] <- coord$X 
    loc$y[i] <- coord$Y 
} 
# gazedist[r, d] holds the Euclidean distance from the gaze sample in row r of
# sdata (RIGHT_GAZE_X, RIGHT_GAZE_Y) to location d in `loc`.
gazedist <- matrix(nrow=nrow(sdata), ncol = setsize) 
for(d in 1:setsize){ 
    # Each subtraction is vectorised over every row of sdata at once.
    x <- sdata$RIGHT_GAZE_X-loc$x[d] 
    y <- sdata$RIGHT_GAZE_Y-loc$y[d] 
    gazedist[,d] <- sqrt(x^2+y^2) 
} 
# Goal: for each row i, store gazedist[i, sdata$t_targLoc[i]] in gdist_T.
sdata$gdist_T <- 0 
# NOTE(review): a blank row index combined with a column vector selects a whole
# column for every element of t_targLoc, so the right-hand side has
# nrow(sdata) rows and nrow(sdata) columns instead of one value per row.
sdata$gdist_T <- gazedist[ ,sdata$t_targLoc] 

這裏的最後一行導致 R 崩潰。有什麼方法可以把 sdata$t_targLoc[i] 的值作爲 gazedist[i, d] 中的列索引 d 嗎?等價的 for 循環是:

# Row-by-row version: for each row i, take the column given by t_targLoc[i].
# Iterate over rows only; length() of a matrix counts every cell
# (nrow * ncol), which would push the row index out of bounds.
for(i in seq_len(nrow(gazedist))){ 
    sdata$gdist_T[i] <- gazedist[i,sdata$t_targLoc[i]] 
} 

但是,這在 R 中會很慢……

SDATA結構

structure(list(RIGHT_GAZE_X = c(409.5, 409.6, 409.5, 409.4, 409.3, 
409.2, 409.1, 409, 408.9, 408.8), RIGHT_GAZE_Y = c(291.9, 291.5, 
290.9, 290.3, 290.3, 290.3, 289.8, 289.2, 288.7, 288.8), RECORDING_SESSION_LABEL = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "ET101", class = "factor"), 
    t_block = c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), t_trialNum = c(129L, 
    129L, 129L, 129L, 129L, 129L, 129L, 129L, 129L, 129L), t_subjNum = c(101L, 
    101L, 101L, 101L, 101L, 101L, 101L, 101L, 101L, 101L), t_colCond = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "green", class = "factor"), 
    t_targLoc = c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), t_targID = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "L", class = "factor"), 
    t_targShape = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L), .Label = "diamond", class = "factor"), t_singLoc = c(5L, 
    5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), t_singPres = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "P", class = "factor"), 
    t_singDist = c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), t_singAngle = c(120L, 
    120L, 120L, 120L, 120L, 120L, 120L, 120L, 120L, 120L), t_targAngle = c(120L, 
    120L, 120L, 120L, 120L, 120L, 120L, 120L, 120L, 120L), t_RESP = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "L", class = "factor"), 
    t_ACC = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), t_RT = c(686.1062, 
    686.1062, 686.1062, 686.1062, 686.1062, 686.1062, 686.1062, 
    686.1062, 686.1062, 686.1062), TRIAL_START_TIME = c(1031031L, 
    1031031L, 1031031L, 1031031L, 1031031L, 1031031L, 1031031L, 
    1031031L, 1031031L, 1031031L), TIMESTAMP = c(1031030, 1031032, 
    1031034, 1031036, 1031038, 1031040, 1031042, 1031044, 1031046, 
    1031048), IP_START_TIME = c(1031031L, 1031031L, 1031031L, 
    1031031L, 1031031L, 1031031L, 1031031L, 1031031L, 1031031L, 
    1031031L), currtime = c(0, 2, 4, 6, 8, 10, 12, 14, 16, 18 
    ), currsamp = c(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), gdist_T = c(0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0), gdist_S = c(0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0), gdist_NS = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0)), .Names = c("RIGHT_GAZE_X", 
"RIGHT_GAZE_Y", "RECORDING_SESSION_LABEL", "t_block", "t_trialNum", 
"t_subjNum", "t_colCond", "t_targLoc", "t_targID", "t_targShape", 
"t_singLoc", "t_singPres", "t_singDist", "t_singAngle", "t_targAngle", 
"t_RESP", "t_ACC", "t_RT", "TRIAL_START_TIME", "TIMESTAMP", "IP_START_TIME", 
"currtime", "currsamp", "gdist_T", "gdist_S", "gdist_NS"), row.names = 53170:53179, class = "data.frame") 
+1

你其實不必上傳所有的行,也許只上傳前 10 行就夠了? –

+1

「這裏的最後一行導致R崩潰。」 -> 什麼錯誤信息?這行代碼本身是可以的,完全等同於循環(所以兩者要麼都能運行,要麼都會失敗)。 –

回答

2

看起來你想取 gazedist 中第 i 行、第 sdata$t_targLoc[i] 列的元素。R 有內置的方法可以做到這一點。使用:

# Index with a two-column (row, column) matrix to pull exactly one cell per row.
# seq_len() gives an empty index for a zero-row matrix, where 1:nrow() would
# yield c(1, 0).
sdata$gdist_T <- gazedist[cbind(seq_len(nrow(gazedist)), sdata$t_targLoc)]

下面是一個例子:

# Demo: a two-column index matrix selects one (row, column) cell per row.
m <- matrix(1:25, ncol = 5)
m
#  [,1] [,2] [,3] [,4] [,5] 
# [1,] 1 6 11 16 21 
# [2,] 2 7 12 17 22 
# [3,] 3 8 13 18 23 
# [4,] 4 9 14 19 24 
# [5,] 5 10 15 20 25 
# Column to take from each of the five rows.
v <- c(1, 3, 5, 2, 4)
m[cbind(seq_len(nrow(m)), v)]
# [1] 1 12 23 9 20 

順便說一句,你的第一個 for 循環可以被替換爲:

# Vectorised replacement for the loop: xyrotate() is element-wise, so passing a
# vector of angles rotates the first location by every step in one call.
loc <- as.data.frame(xyrotate(ctrX, ctrY - eccent, ctrX, ctrY,
                              seq_len(setsize - 1) * 360 / setsize))
# Prepend the unrotated first location as row 1.
loc <- rbind(c(X = ctrX, Y = ctrY - eccent), loc)
# xyrotate() names its results "X"/"Y"; use lower-case names so that code
# reading loc$x / loc$y keeps working.
names(loc) <- c("x", "y")

你的第二個 for 循環也可以被替換,例如:

# Distance from every gaze sample in sdata to the single point (x, y).
# Reads sdata from the enclosing environment.
f <- function(x, y) {
  dx <- sdata$RIGHT_GAZE_X - x
  dy <- sdata$RIGHT_GAZE_Y - y
  sqrt(dx^2 + dy^2)
}
# One call of f per location; mapply() binds the resulting vectors into the
# columns of a matrix.
ggazedist <- mapply(f, loc$x, loc$y)
identical(gazedist, ggazedist)
# [1] TRUE 
0

可以,但我懷疑這樣並不能節省多少時間,除非你想出一個完全向量化的方法。換句話說,你必須避免使用像 apply 或 sapply 這樣的函數:它們都基於 C 中的 for 循環,因此即使比顯式循環快,也快不了多少。

# Return 1 for each element of x that is a multiple of 100 between 0 and 50000
# (inclusive), and 0 otherwise.
someFunction <- function(x) {
  hundreds <- seq(from = 0, to = 50000, by = 100)
  ifelse(x %in% hundreds, 1, 0)
}

# NOTE(review): `diamonds` is not defined in this snippet — presumably the
# ggplot2 example dataset; confirm it is loaded before running.

# Here you have "vectorized" the indexing
system.time(sapply(seq_len(nrow(diamonds)), someFunction))
# 2.6 elapsed secs

## vs here where you're just using a for loop

# Preallocate the result vector: the loop assigns into k[i], which fails if k
# does not exist yet.
k <- numeric(nrow(diamonds))
system.time(
for(i in seq_len(nrow(diamonds))) {
    k[i] <- someFunction(i)
}
)
# 2.7 elapsed secs
0

是的,我認爲這是可行的。基本上你想從矩陣中取出一個向量,其中行索引是 1:nrow(sdata),列索引是 sdata$t_targLoc。據我所知,這沒有內置的方法,但我們可以把矩陣轉換爲向量,再取出正確的值。

# A matrix is stored column by column, so cell (row, col) sits at position
# (col - 1) * nrow + row of the flattened vector.
gazedist_vals <- as.vector(gazedist)
# The data frame is called `sdata`; use that name consistently.
rows <- seq_len(nrow(sdata))
cols <- sdata$t_targLoc
indices <- (cols - 1) * nrow(gazedist) + rows
sdata$gdist_T <- gazedist_vals[indices]

我認爲那會做你想做的。

# Demo of the linear-index trick on a small 2 x 3 matrix.
x <- matrix(c(5, 2, 65, 8, 4, 2), nrow = 2)
x
#  [,1] [,2] [,3] 
# [1,] 5 65 4 
# [2,] 2 8 2 
# Flattening goes down the columns.
as.vector(x)
# [1] 5 2 65 8 4 2 
# Wanted cells: (1, 3), (1, 2) and (2, 1).
rows <- c(1, 1, 2)
cols <- c(3, 2, 1)
inds <- (cols - 1) * nrow(x) + rows
as.vector(x)[inds]
# [1] 4 65 2 

一個評論:

上面給出了一個小示例,因爲在你的數據摘錄中 t_targLoc 總是 3,不能很好地說明這一點。你的問題寫得很詳盡,但最小化的示例更可取。你向我們提供了 26 列數據,而其中只需要一列。你給了我們計算距離的函數和代碼,其實你可以直接給出一個距離矩陣(這樣就只需要一個數據列)。像我的 x 矩陣以及 rows、cols 向量這樣的小例子,可能就足以展示你的問題了。