2013-01-08 85 views
2

我試圖連續加入多個數據集並標記來自第一個數據集的觀察結果,這些數據集在隨後的數據集中找不到匹配項。下面是一個例子,我模擬原始數據集加上三個加法加入。目前的代碼做我想要的,但效率非常低。對於大數據集,可能需要幾天時間。是否可以通過應用或其他功能來完成此任務?在R中合併大數據集並標記不匹配

#Toy datasets: x, y, z and w 

#dataset X 
id <- c(1:10, 1:100) 
X1 <- rnorm(110, mean = 0, sd = 1) 
year <- c("2004","2005","2006","2001","2002") 
year <- rep(year, 22) 

month = c("Jul","Aug","Sep","Oct","Nov","Dec","Jan","Feb","Mar","Apr") 
month <- rep(month, 11) 

x <- data.frame(id, X1, month, year) 

#dataset Y 
id2 <- c(1:10, 41:110) 
Y1 <- rnorm(80, mean = 0 , sd = 1) 
year <- c("2004","2005","2006","2001") 
year <- rep(year, 20) 

month = c("Jul","Aug","Sep","Oct","Nov","Dec","Jan","Feb","Mar","Apr") 
month <- rep(month, 8) 

y <- data.frame(id2,Y1, year,month) 


#dataset z 
id3 = c(1:60, 401:10000) 
Z1 = rpois(9660, 10) 
year = c('2004','2005','2006','2002') 
year = rep(year, 2415) 

month = c("Jul","Aug","Sep","Oct","Nov","Dec","Jan","Feb","Mar","Apr") 
month <- rep(month, 966) 

z = data.frame(id3,Z1,year,month) 

#dataset w 
id4 = c(1:300, 20:29) 
W1 = rnorm(310, 20, 36) 
year = c('2004','2005','2006','2000','2002') 
year = rep(year, 62) 

month = c("Jul","Aug","Sep","Oct","Nov","Dec","Jan","Feb","Mar","Apr") 
month <- rep(month, 31) 

w = data.frame(id4, W1, year, month) 


x$id2 = x$yflag = x$zflag = x$wflag = rep(NA, nrow(x)) 


y.index = rep(NA, nrow(x)) 
z.index = rep(NA, nrow(x)) 
w.index = rep(NA, nrow(x)) 

for(i in 1:nrow(x)) { 

    #compare to dataset y, insert yflag == 1 if the same ID, month, year is in x, otherwise 0 
    y.index = which(as.character(y$id2) == as.character(x$id[i]) 
        & as.character(y$year) == as.character(x$year[i]) 
        & as.character(y$month) == as.character(x$month[i])) 
    x$yflag[i] = ifelse(length(y.index==1), 1, 0) 
    x$id2[i] = ifelse(length(y.index) == 1, y$id2[y.index], x$id[i]) 

    ## compare to dataset z, insert zflag == 1 if the same ID, month, year is in x, otherwise 0 
    z.index <- which(as.character(z$id3) == as.character(x$id[i]) 
        & as.character(z$month) == as.character(x$month[i]) 
        & as.character(z$year) == as.character(x$year[i])) 
    x$zflag[i] <- ifelse(length(z.index == 1), 1, 0) 


    ## compare to dataset w, insert wflag == 1 if the same ID, month, year is in x, otherwise 0 
    w.index <- which(as.character(w$id4) == as.character(x$id[i]) 
        & as.character(w$month) == as.character(x$month[i]) 
        & as.character(w$year) == as.character(x$year[i])) 
    x$wflag[i] <- ifelse(length(w.index == 1), 1, 0) 
} 

print(x) 
+0

你試過'合併()'? – Andrie

+0

合併不會正確標記觀察結果,它會拋出信息,因爲我無法看到標誌的等價物。 'test.merge = merge(x,y,by.x ='id',by.y ='id2')'例如並不能解決問題。當然,我可能沒有正確實施它 –

+0

你試過'match()'函數嗎? –

回答

2

在衆多的解決方案:
創建四個data.frames

x$match.idx <- do.call(paste, c(x[,c("id", "month", "year")], sep=":")) 
y$match.idx <- do.call(paste, c(y[,c("id2", "month", "year")], sep=":")) 
z$match.idx <- do.call(paste, c(z[,c("id3", "month", "year")], sep=":")) 
w$match.idx <- do.call(paste, c(w[,c("id4", "month", "year")], sep=":")) 

xy.m <- match(x$match.idx, y$match.idx) 
xz.m <- match(x$match.idx, z$match.idx) 
xw.m <- match(x$match.idx, w$match.idx) 
x$yflag <- x$zflag <- x$wflag <- 0 
x$yflag[which(!is.na(xy.m))] <- 1 
x$zflag[which(!is.na(xz.m))] <- 1 
x$wflag[which(!is.na(xw.m))] <- 1 

x <- subset(x, select=-c(match.idx)) 
> head(x) 

    id   X1 month year wflag zflag yflag 
1 1 -0.2470932 Jul 2004  1  1  1 
2 2 0.2262816 Aug 2005  1  1  1 
3 3 0.8473442 Sep 2006  1  1  1 
4 4 0.9338628 Oct 2001  0  0  1 
5 5 -0.1385540 Nov 2002  1  0  0 
6 6 0.7825385 Dec 2004  1  0  0 
+0

將索引列與'paste()'組合起來的好主意! +1 –

+1

不錯。 'paste()'實際上比我使用的'interaction()'快得多。這裏有一個'data.table'的方法,假設你已經爲你所有的'data.frame'(例如DTx,DTy ...)創建了'data.table'。我發現語法* much *更清晰:'temp < - DTx [,paste(id,year,month)]; DTx [,':='(wflag = as.numeric(temp%in%DTw [,paste(id4,year,month)]),zflag = as.numeric(temp%in%DTz [,paste(id3,year ,month)]),yflag = as.numeric(temp%in%DTy [,paste(id2,year,month)]))]' – A5C1D2H2I1M1N2O1R2T1

+0

非常感謝!這非常有幫助。 –

1

我建議結合within()interaction()如下:

output <- within(x, { 
    temp <- interaction(id, month, year) # Something to match to 
    # The actual matching takes place here 
    # The `+0` at the end is a lazy way to convert 
    # TRUE and FALSE logical values to numeric 1 and 0 
    wflag <- temp %in% with(w, interaction(id4, month, year)) + 0 
    zflag <- temp %in% with(z, interaction(id3, month, year)) + 0 
    yflag <- temp %in% with(y, interaction(id2, month, year)) + 0 
    # Remove the temp variable that we created 
    # since it's no longer required. 
    rm(temp) 
}) 

head(output) 
# id   X1 month year yflag zflag wflag 
# 1 1 -0.03595218 Jul 2004  1  1  1 
# 2 2 0.56329165 Aug 2005  1  1  1 
# 3 3 0.74372988 Sep 2006  1  1  1 
# 4 4 1.49634088 Oct 2001  1  0  0 
# 5 5 0.23107131 Nov 2002  0  0  1 
# 6 6 0.15121196 Dec 2004  0  0  1 
tail(output) 
#  id   X1 month year yflag zflag wflag 
# 105 95 -0.0911546 Nov 2002  0  0  1 
# 106 96 -0.4140724 Dec 2004  0  0  1 
# 107 97 -0.1477702 Jan 2005  0  0  1 
# 108 98 -0.3164388 Feb 2006  0  0  1 
# 109 99 -0.5082118 Mar 2001  0  0  0 
# 110 100 -0.6072856 Apr 2002  0  0  1