2016-05-15 52 views
2

面板數據幀我有兩個數據幀:創建基於另外兩個dataframes

ANNUALSALARY <- structure(list(FIRM = structure(1:3, .Label = c("A", "B", "C"), class = "factor"), SLY_ADMIN = c(0.1, 0.2, 0.3), SLY_MKT = c(0.5, 0.003,0.3), SLY_FIN = c(0.11, 0.12, 0.03)), .Names = c("FIRM", "SLY_ADMIN", "SLY_MKT", "SLY_FIN"), row.names = c(NA, -3L), class = "data.frame") 

和:

WEEKLYPRODUCTIVITY <- structure(list(FIRM = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), .Label = c("A", "B", "C"), class = "factor"), WEEKS = structure(c(1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L), .Label = c("1", "2", "3", "4", "5"), class = "factor"), PR_ADMIN = c(1, 5, 4, 3, 2, 1, 4, 2, 4, 2, 3, 1, 4, 5, 5), Z_ADMIN = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6), PR_MKT = c(0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2), Z_MKT = c(9, 8, 7, 6, 5, 4, 3, 2, 1, 9, 8, 7, 6, 5, 4), PR_FIN = c(5, 4, 3, 2, 1, 5, 4, 3, 2, 1, 5, 4, 3, 2, 1), Z_FIN = c(1, 2, 3, 4, 5, 5, 4, 3, 2, 1, 1, 2, 3, 4, 5)), .Names = c("FIRM", "WEEKS", "PR_ADMIN", "Z_ADMIN", "PR_MKT", "Z_MKT", "PR_FIN", "Z_FIN"), row.names = c(NA, 15L), class = c("plm.dim", "data.frame")) 

我有興趣創建一個數據幀需要的最低出SLY_ADMINSLY_MKTSLY_FIN每個FIRM。然後從PR_ADMIN,PR_MKTPR_FIN以及Z_ADMIN,Z_MKTZ_FIN中得到相應的值。例如如果SLY_MKT是FIRM A的最小值,那麼它將返回PR_MKTZ_MKT 5周。面板數據幀看起來像這樣(我已經手動創建):

REQUIRED <- structure(list(FIRM = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), .Label = c("A", "B", "C"), class = "factor"),WEEKS = structure(c(1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L), .Label = c("1", "2", "3", "4", "5"), class = "factor"), PR = c(1, 5, 4, 3, 2, 5, 0, 1, 2, 3, 5, 4, 3, 2, 1), MIN_SLY = c(0.1, 0.1, 0.1, 0.1, 0.1, 0.003, 0.003, 0.003, 0.003, 0.003, 0.03, 0.03, 0.03, 0.03, 0.03), SLY_DEPT = structure(c(1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L), .Label = c("SLY_ADMIN", "SLY_FIN", "SLY_MKT"), class = "factor"), Z = c(1, 2, 3, 4, 5, 4, 3, 2, 1, 9, 1, 2, 3, 4, 5)), .Names = c("FIRM", "WEEKS", "PR", "MIN_SLY", "SLY_DEPT", "Z"), row.names = c(NA, 15L), class = c("plm.dim", "data.frame")) 

請幫忙。謝謝

回答

3

我們可以使用data.table。使用max.col獲取「ANNUALSALARY」中數值列最​​小值的索引。然後,我們轉換「data.frame」到「data.table」和「寬」到「長」格式melt它,得到了「MIN_SLY」和「S

library(data.table) 
i1 <- max.col(-1*ANNUALSALARY[-1]) 
dN <- melt(setDT(ANNUALSALARY), id.var = "FIRM", value.name = "MIN_SLY", 
    variable.name = "SLY_DEPT")[ , .SD[which.min(MIN_SLY)], by = FIRM] 
setDT(WEEKLYPRODUCTIVITY) 

代替melt ING或者,我們可以創建「data.table」使用「I1」

dN <- data.table(FIRM= ANNUALSALARY$FIRM, 
       MIN_SLY=as.data.frame(ANNUALSALARY)[-1][cbind(1:nrow(ANNUALSALARY), i1)], 
       SLY_DEPT = names(ANNUALSALARY)[-1][i1]) 

然後,我們通過「WEEKLYPRODUCTIVITY」和melt基於在列名patterns「長」格式join的「DN」。我們order按'FIRM','variable','WEEKS',根據「WEEKS」值創建一個分組變量('gr1'),由'FIRM'分組。

dN2 <- melt(dN[WEEKLYPRODUCTIVITY, on = "FIRM"], measure = patterns("^PR", "^Z"), 
    value.name = c("PR", "Z"))[order(FIRM, variable, WEEKS) 
     ][, gr1 := cumsum(WEEKS==1), FIRM][] 

最後,我們加入了data.table I1」使用創造',on‘企業’,子集,其中的行‘GR1’等於‘I1’,並選擇感興趣的列。

res <- data.table(FIRM= ANNUALSALARY$FIRM, i1)[dN2, on = "FIRM" 
      ][gr1==i1][,names(REQUIRED), with = FALSE] 

all.equal(as.data.frame(res), REQUIRED, check.attributes=FALSE) 
#[1] TRUE 
res 
# FIRM WEEKS PR MIN_SLY SLY_DEPT Z 
# 1: A  1 1 0.100 SLY_ADMIN 1 
# 2: A  2 5 0.100 SLY_ADMIN 2 
# 3: A  3 4 0.100 SLY_ADMIN 3 
# 4: A  4 3 0.100 SLY_ADMIN 4 
# 5: A  5 2 0.100 SLY_ADMIN 5 
# 6: B  1 5 0.003 SLY_MKT 4 
# 7: B  2 0 0.003 SLY_MKT 3 
# 8: B  3 1 0.003 SLY_MKT 2 
# 9: B  4 2 0.003 SLY_MKT 1 
#10: B  5 3 0.003 SLY_MKT 9 
#11: C  1 5 0.030 SLY_FIN 1 
#12: C  2 4 0.030 SLY_FIN 2 
#13: C  3 3 0.030 SLY_FIN 3 
#14: C  4 2 0.030 SLY_FIN 4 
#15: C  5 1 0.030 SLY_FIN 5 
+0

感謝答案 –

+0

當我運行DN2行:在'[.data.table'錯誤(DN,WEEKLYPRODUCTIVITY,上= 「FIRM」): 邏輯錯誤。我不是一個data.table,但提供了'on'參數。 –

+0

@PolarBear我沒有收到任何錯誤。我使用'data.table_1.9.6' – akrun

3

一種不同的方法,而且還使用data.table包:

library(data.table) 
# convert the dataframes to datatables (which is an enhanced form of dataframe) 
setDT(ANNUALSALARY) 
setDT(WEEKLYPRODUCTIVITY) 

# join them on 'FIRM' 
res <- WEEKLYPRODUCTIVITY[ANNUALSALARY, on = 'FIRM'] 
# create a convenience vector with the columnnames starting with 'SLY_ 
sly.cols <- grep('^SLY_', names(res), value = TRUE) 

# create the 'MIN_SLY' & 'SLY_DEPT' columns 
res[, `:=` (MIN_SLY = min(.SD), 
      SLY_DEPT = sly.cols[which.min(.SD)]), 
    by = 1:nrow(res), .SDcols = sly.cols][] 

# melt it in log format and create the 'PR' & 'Z' column 
res2 <- melt(res, id = c('FIRM','WEEKS','MIN_SLY','SLY_DEPT'), 
      measure.vars = patterns('^PR_','^Z_'), 
      value.name = c('PR','Z'))[, variable := c('ADMIN','MKT','FIN')[variable] 
             ][, `:=` (PR = PR[sub('^SLY_','',SLY_DEPT) == variable], 
               Z = Z[sub('^SLY_','',SLY_DEPT) == variable]), 
             by = .(FIRM,WEEKS) 
             ][, variable := NULL] 

# removing the duplicates 
res2 <- res2[!duplicated(res2)] 

導致:

> res2 
    FIRM WEEKS MIN_SLY SLY_DEPT PR Z 
1: A  1 0.100 SLY_ADMIN 1 1 
2: A  2 0.100 SLY_ADMIN 5 2 
3: A  3 0.100 SLY_ADMIN 4 3 
4: A  4 0.100 SLY_ADMIN 3 4 
5: A  5 0.100 SLY_ADMIN 2 5 
6: B  1 0.003 SLY_MKT 5 4 
7: B  2 0.003 SLY_MKT 0 3 
8: B  3 0.003 SLY_MKT 1 2 
9: B  4 0.003 SLY_MKT 2 1 
10: B  5 0.003 SLY_MKT 3 9 
11: C  1 0.030 SLY_FIN 5 1 
12: C  2 0.030 SLY_FIN 4 2 
13: C  3 0.030 SLY_FIN 3 3 
14: C  4 0.030 SLY_FIN 2 4 
15: C  5 0.030 SLY_FIN 1 5 
+0

謝謝。這很漂亮,更簡單一些。 –

+0

您是否知道當plm包使用數據表時遇到任何已知問題? –

+0

@PolarBear我從來沒有使用過這個包。你遇到過問題嗎?如果是這樣,最好問一個新問題。 – Jaap

2

這是一個棘手的問題!我想出了一個基於max.col(),merge()和索引矩陣的基礎R解決方案。

請注意,爲簡潔起見,我使用變量名稱salprod

sufs <- c('ADMIN','MKT','FIN'); 
slys <- paste0('SLY_',sufs); 
mins <- max.col(-sal[slys]); 
res <- merge(prod[,c('FIRM','WEEKS')],cbind(sal[,'FIRM',drop=F],SLY_DEPT=slys[mins],MIN_SLY=sal[slys][cbind(seq_len(nrow(sal)),mins)])); 
res.sufs <- sub('.*_','',res$SLY_DEPT); 
for (pre in c('PR','Z')) { pre.cns <- paste0(pre,'_',sufs); res[[pre]] <- prod[pre.cns][cbind(seq_len(nrow(prod)),match(paste0(pre,'_',res.sufs),pre.cns))]; }; 

res; 
## FIRM WEEKS SLY_DEPT MIN_SLY PR Z 
## 1  A  1 SLY_ADMIN 0.100 1 1 
## 2  A  2 SLY_ADMIN 0.100 5 2 
## 3  A  3 SLY_ADMIN 0.100 4 3 
## 4  A  4 SLY_ADMIN 0.100 3 4 
## 5  A  5 SLY_ADMIN 0.100 2 5 
## 6  B  1 SLY_MKT 0.003 5 4 
## 7  B  2 SLY_MKT 0.003 0 3 
## 8  B  3 SLY_MKT 0.003 1 2 
## 9  B  4 SLY_MKT 0.003 2 1 
## 10 B  5 SLY_MKT 0.003 3 9 
## 11 C  1 SLY_FIN 0.030 5 1 
## 12 C  2 SLY_FIN 0.030 4 2 
## 13 C  3 SLY_FIN 0.030 3 3 
## 14 C  4 SLY_FIN 0.030 2 4 
## 15 C  5 SLY_FIN 0.030 1 5 

標杆
## libraries 
library(data.table); 
library(microbenchmark); 

## define inputs, including data.table instances for akrun and maximus solutions 
sal <- structure(list(FIRM = structure(1:3, .Label = c("A", "B", "C"), class = "factor"), SLY_ADMIN = c(0.1, 0.2, 0.3), SLY_MKT = c(0.5, 0.003,0.3), SLY_FIN = c(0.11, 0.12, 0.03)), .Names = c("FIRM", "SLY_ADMIN", "SLY_MKT", "SLY_FIN"), row.names = c(NA, -3L), class = "data.frame"); 
prod <- structure(list(FIRM = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), .Label = c("A", "B", "C"), class = "factor"), WEEKS = structure(c(1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L), .Label = c("1", "2", "3", "4", "5"), class = "factor"), PR_ADMIN = c(1, 5, 4, 3, 2, 1, 4, 2, 4, 2, 3, 1, 4, 5, 5), Z_ADMIN = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6), PR_MKT = c(0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2), Z_MKT = c(9, 8, 7, 6, 5, 4, 3, 2, 1, 9, 8, 7, 6, 5, 4), PR_FIN = c(5, 4, 3, 2, 1, 5, 4, 3, 2, 1, 5, 4, 3, 2, 1), Z_FIN = c(1, 2, 3, 4, 5, 5, 4, 3, 2, 1, 1, 2, 3, 4, 5)), .Names = c("FIRM", "WEEKS", "PR_ADMIN", "Z_ADMIN", "PR_MKT", "Z_MKT", "PR_FIN", "Z_FIN"), row.names = c(NA, 15L), class = c("plm.dim", "data.frame")); 
sal.dt <- as.data.table(sal); 
prod.dt <- as.data.table(prod); 

## solutions 
bgoldst <- function(sal,prod) { sufs <- c('ADMIN','MKT','FIN'); slys <- paste0('SLY_',sufs); mins <- max.col(-sal[slys]); res <- merge(prod[,c('FIRM','WEEKS')],cbind(sal[,'FIRM',drop=F],SLY_DEPT=slys[mins],MIN_SLY=sal[slys][cbind(seq_len(nrow(sal)),mins)])); res.sufs <- sub('.*_','',res$SLY_DEPT); for (pre in c('PR','Z')) { pre.cns <- paste0(pre,'_',sufs); res[[pre]] <- prod[pre.cns][cbind(seq_len(nrow(prod)),match(paste0(pre,'_',res.sufs),pre.cns))]; }; res; }; 
akrun <- function(ANNUALSALARY,WEEKLYPRODUCTIVITY) { i1 <- max.col(-1*ANNUALSALARY[,-1,with=F]); dN <- data.table(FIRM= ANNUALSALARY$FIRM, MIN_SLY=as.data.frame(ANNUALSALARY)[-1][cbind(1:nrow(ANNUALSALARY), i1)], SLY_DEPT = names(ANNUALSALARY)[-1][i1]); dN2 <- melt(dN[WEEKLYPRODUCTIVITY, on = "FIRM"], measure = patterns("^PR", "^Z"), value.name = c("PR", "Z"))[order(FIRM, variable, WEEKS)][, gr1 := cumsum(WEEKS==1), FIRM][]; res <- data.table(FIRM= ANNUALSALARY$FIRM, i1)[dN2, on = "FIRM"][gr1==i1]; res[,!names(res)%in%c('i1','variable','gr1'),with=F]; }; 
maximus <- function(ANNUALSALARY,WEEKLYPRODUCTIVITY) { res <- WEEKLYPRODUCTIVITY[ANNUALSALARY, on = 'FIRM']; sly.cols <- grep('^SLY_', names(res), value = TRUE); res[, `:=` (MIN_SLY = min(.SD), SLY_DEPT = sly.cols[which.min(.SD)]), by = 1:nrow(res), .SDcols = sly.cols][]; res2 <- melt(res, id = c('FIRM','WEEKS','MIN_SLY','SLY_DEPT'), measure.vars = patterns('^PR_','^Z_'), value.name = c('PR','Z'))[, variable := c('ADMIN','MKT','FIN')[variable]][, `:=` (PR = PR[sub('^SLY_','',SLY_DEPT) == variable], Z = Z[sub('^SLY_','',SLY_DEPT) == variable]), by = .(FIRM,WEEKS)][, variable := NULL]; res2 <- res2[!duplicated(res2)]; }; 

## proofs of equivalence 
ex <- bgoldst(sal,prod); co <- names(ex); 
identical(ex,transform(as.data.frame(akrun(sal.dt,prod.dt))[co],SLY_DEPT=factor(SLY_DEPT))); 
## [1] TRUE 
identical(ex,transform(as.data.frame(maximus(sal.dt,prod.dt))[co],SLY_DEPT=factor(SLY_DEPT))); 
## [1] TRUE 

## benchmark 
microbenchmark(bgoldst(sal,prod),akrun(sal.dt,prod.dt),maximus(sal.dt,prod.dt)); 
## Unit: milliseconds 
##      expr  min  lq  mean median  uq  max neval 
##  bgoldst(sal, prod) 1.639193 1.730070 1.883285 1.807047 1.881031 3.230917 100 
## akrun(sal.dt, prod.dt) 6.392125 6.666251 7.744077 6.901033 7.230752 53.621663 100 
## maximus(sal.dt, prod.dt) 5.002254 5.229979 5.853681 5.423492 6.034609 12.182544 100 
+0

感謝您的基地R解決方案。數據表與plm包一起使用時,通常有什麼問題嗎?我一直在思考這個問題很長時間。 –

+0

不客氣。我從來沒有使用過plm包,所以我是個錯誤的人。你可以試着問阿克魯恩,他對R很熟悉。 – bgoldst

+2

很好的R基礎解決方案! – Jaap