2013-07-19 61 views
1

我想確定哪些組包含某個指標。在下面的示例中,我想找出包含 county == 'other' 的 district。如果某個 district 中有任何一行 county == 'other',那麼我希望該 district 中每一行的指示變量都爲 1,否則爲 0。下面是我用 split、lapply 和 any 做的幾次嘗試,但它們都不起作用。也許我可以提取所有 county == 'other' 的行,爲該子集定義一個指示變量,然後把該子集合併回原始數據集,但我一直覺得應該有更簡單的方法。感謝您的任何建議。(標題:標識包含指標的組)

# Example input: one row per county, with the state and district it belongs
# to and an `apples` value. Districts C and WC each contain a county named
# 'other'; district EC does not.
df.1 <- read.table(text = ' 

    state district county apples 
     AA   EC  AB  100 
     AA   EC  BC  10 
     AA   EC  DC  150 
     AA   C  FG  200 
     AA   C  other  20 
     AA   C  HC  250 
     AA   WC  RT  300 
     AA   WC  TT  30 
     AA   WC  other  350 

', header=TRUE, stringsAsFactors = FALSE) 

# Target output: the same rows as df.1 plus an `indicator` column that is 1
# on every row of a district containing county 'other' (C and WC) and 0 on
# every row of the remaining district (EC).
desired.result <- read.table(text = ' 

    state district county apples indicator 
     AA   EC  AB  100   0 
     AA   EC  BC  10   0 
     AA   EC  DC  150   0 
     AA   C  FG  200   1 
     AA   C  other  20   1 
     AA   C  HC  250   1 
     AA   WC  RT  300   1 
     AA   WC  TT  30   1 
     AA   WC  other  350   1 

', header=TRUE, stringsAsFactors = FALSE) 

# various attempts that do not work 

# Fails: any() is called on the character vector x (it needs logical input),
# and the comparison `any(x)=='county'` is then used as the target of `<-`,
# which is not a valid left-hand side for assignment.
with(df.1, lapply(split(county, district), function(x) {any(x)=='county' <- 1})) 
# Fails: any(x) is evaluated on the character vector before the comparison;
# the comparison has to go inside any(), i.e. any(x == 'other').
with(df.1, lapply(split(county, district), function(x) {ifelse(any(x)=='other', 1, 0)})) 
# Fails for the same reason: any() receives the character vector itself.
with(df.1, lapply(split(county, district), function(x) {any(x)=='other'})) 
# Same problem after splitting the whole data frame: x$county is still a
# character vector handed directly to any().
with(df.1, lapply(split(df.1 , district), function(x) {any(x$county)=='other'})) 
# Runs, but returns the element-wise TRUE/FALSE vector for each district
# instead of a single per-district flag.
with(df.1, lapply(split(county, district), function(x) {x=='other'})) 

編輯

這是我上面提到的取子集/合併的方法。我更喜歡使用基礎 R(base R)。

回答

2
library(data.table)

# Copy df.1 into a data.table, then add `indicator` by reference.
# Within each district, any(county == "other") yields one TRUE/FALSE that is
# recycled to every row of the group; multiplying by 1 turns it into 1/0.
dt <- data.table(df.1)
dt[, indicator := 1 * any(county == "other"), by = "district"]

dt
#   state district county apples indicator 
#1:    AA       EC     AB    100         0 
#2:    AA       EC     BC     10         0 
#3:    AA       EC     DC    150         0 
#4:    AA        C     FG    200         1 
#5:    AA        C  other     20         1 
#6:    AA        C     HC    250         1 
#7:    AA       WC     RT    300         1 
#8:    AA       WC     TT     30         1 
#9:    AA       WC  other    350         1 
  

這裏有一個基礎的解決方案

# Base R subset-and-merge approach.
# 1. Build a lookup of the (state, district) pairs that contain an "other"
#    county. Columns are selected by name, and unique() keeps one row per
#    pair so that a district with several "other" rows does not duplicate
#    rows in the merge below.
df.indicator <- unique(df.1[df.1$county == "other", c("state", "district")])
# rep() rather than a bare scalar so this also works when no row matches
# (assigning a length-1 value to a zero-row data frame is an error).
df.indicator$indicator <- rep(1, nrow(df.indicator))

# 2. Left-join the lookup onto the full data. Districts without a match come
#    back as NA, so fill those with 0 to get a proper 0/1 indicator.
#    Note that merge() sorts the result by the key columns.
df.merged <- merge(df.1, df.indicator, by = c("state", "district"),
                   all.x = TRUE)
df.merged$indicator[is.na(df.merged$indicator)] <- 0
df.merged

- 這種做法慢得多,也更醜,但如果這正是 OP 想要的,那也無妨 :)

# Per-district flag via ave(). ave() writes the group results back into a
# copy of its first argument (the character vector `county`), so the 1/0
# flags come back as strings and are converted with as.numeric().
df.1$indicator <- as.numeric(
  ave(df.1$county, df.1$district, FUN = function(v) 1 * any(v == "other"))
)

或者

# max() of the TRUE/FALSE comparison within each district is 1 as soon as
# one row has county "other", and 0 otherwise.
df.1$indicator <- ave(df.1$county == "other", df.1$district, FUN = max)

或者

# any() gives one logical per district, spread over its rows by ave();
# adding 0L converts TRUE/FALSE to integer 1/0.
df.1$indicator <- ave(df.1$county == "other", df.1$district, FUN = any) + 0L
0

這是我目前使用 apply 函數家族所能想出的最好辦法:

# Sample data: one row per county within a district.
df.1 <- read.table(text = ' 

    state district county apples 
     AA   EC  AB  100 
     AA   EC  BC  10 
     AA   EC  DC  150 
     AA   C  FG  200 
     AA   C  other  20 
     AA   C  HC  250 
     AA   WC  RT  300 
     AA   WC  TT  30 
     AA   WC  other  350 

', header = TRUE, stringsAsFactors = FALSE)

# Split the rows by district. For each piece, compute a 0/1 flag for the
# presence of county "other" and attach it with merge(). The piece and the
# flag share no column names, so merge() pairs every row with the single
# flag value, which lands in a column named `y`.
z <- lapply(split(df.1, df.1$district), function(piece) {
  flag <- if ("other" %in% piece$county) 1 else 0
  merge(piece, flag, all = TRUE)
})
z

# Stack the per-district pieces into one data frame and reset the row names
# that rbind() derives from the list names.
df.2 <- do.call(rbind, z)
rownames(df.2) <- NULL
df.2

給予:

state district county apples y 
1 AA  C  FG 200 1 
2 AA  C other  20 1 
3 AA  C  HC 250 1 
4 AA  EC  AB 100 0 
5 AA  EC  BC  10 0 
6 AA  EC  DC 150 0 
7 AA  WC  RT 300 1 
8 AA  WC  TT  30 1 
9 AA  WC other 350 1 
0

當我試圖用實際數據來實現上述兩個答案時,我意識到必須考慮一個新的變量 df.1$year,並且在指示變量取 1 之前需要滿足一個更復雜的條件:在同一 district 和 year 的組合內,存在 df.1$county == 'other' & is.na(df.1$apples) 的行。以下是修訂後的數據集 df.1,以及爲實現這些新條件而修訂的 lapply 語句。我還沒能讓 ave 處理這些新條件,不過我確實借用了 eddi 的部分代碼。

這個修改後的場景似乎與我原來的問題關係過於密切,不值得另外發布一個新問題。

# Identify district/year combinations containing >= 1 row in which
# county == 'other' and apples is NA.

df.1 <- read.table(text = ' 

    state district county year apples 
     AA   EC  A 1980  100 
     AA   EC  B 1980  10 
     AA   EC  C 1980  150 
     AA   C  G 1980  200 
     AA   C other 1980  20 
     AA   C  I 1980  250 
     AA   WC  R 1980  300 
     AA   WC  S 1980  NA 
     AA   WC other 1980  350 
     AA   EC  A 1999 1100 
     AA   EC  D 1999  NA 
     AA   EC  E 1999 1150 
     AA   C  H 1999 1200 
     AA   C  I 1999  120 
     AA   C  J 1999 1250 
     AA   WC  R 1999 1300 
     AA   WC other 1999  NA 
     AA   WC  T 1999 1350 

', header = TRUE, stringsAsFactors = FALSE)

# Single grouping key formed by gluing district and year together
# (e.g. "WC1999"); no separator is inserted between the two parts.
df.1$my.grouping <- paste0(df.1$district, df.1$year)

# For each district/year piece, flag whether any row is both county "other"
# and missing its apples value, then attach that 1/0 to every row of the
# piece via merge() (the flag ends up in a column named `y`).
z <- lapply(split(df.1, df.1$my.grouping), function(piece) {
  hit <- piece$county == "other" & is.na(piece$apples)
  merge(piece, 1 * any(hit), all = TRUE)
})

# Recombine the pieces and reset the row names.
df.2 <- do.call(rbind, z)
rownames(df.2) <- NULL
df.2

    state district county year apples my.grouping y 
1  AA  C  G 1980 200  C1980 0 
2  AA  C other 1980  20  C1980 0 
3  AA  C  I 1980 250  C1980 0 
4  AA  C  H 1999 1200  C1999 0 
5  AA  C  I 1999 120  C1999 0 
6  AA  C  J 1999 1250  C1999 0 
7  AA  EC  A 1980 100  EC1980 0 
8  AA  EC  B 1980  10  EC1980 0 
9  AA  EC  C 1980 150  EC1980 0 
10 AA  EC  A 1999 1100  EC1999 0 
11 AA  EC  D 1999  NA  EC1999 0 
12 AA  EC  E 1999 1150  EC1999 0 
13 AA  WC  R 1980 300  WC1980 0 
14 AA  WC  S 1980  NA  WC1980 0 
15 AA  WC other 1980 350  WC1980 0 
16 AA  WC  R 1999 1300  WC1999 1 
17 AA  WC other 1999  NA  WC1999 1 
18 AA  WC  T 1999 1350  WC1999 1