2013-10-25 142 views
2

我想知道是否有更簡潔的方法來替換數據幀中某個變量的 NA 值。下面的代碼似乎比我認爲在 R 中應有的寫法更冗長；例如，也許有一些我不知道的軟件包或工具可以更簡潔地做到這一點。（標題：按「ID」用來自其他數據幀的值替換數據幀變量中的 NA 值）

有沒有方法可以只在值爲 NA 時才進行替換或合併？在使用 all.x = T 合併兩個數據幀後，我得到了一些 NA 值，我想以一個公共變量作爲鏈接，用另一個數據幀中的信息來替換這些 NA 值。

# Example data ----

# breaks: lookup table giving the fallback Value for each Break ID.
breaks <- data.frame(
  Break = 1:11,
  Value = c(2L, 13L, 7L, 9L, 40L, 21L, 10L, 37L, 7L, 26L, 42L)
)

# fsites: one row per (Site, Plot) pair, each tagged with a Break ID.
fsites <- data.frame(
  Site = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L),
  Plot = c(0L, 1L, 2L, 3L, 4L, 0L, 1L, 2L, 0L, 1L, 2L, 3L, 4L, 5L),
  Break = c(1L, 5L, 7L, 8L, 11L, 1L, 6L, 11L, 1L, 4L, 6L, 8L, 9L, 11L)
)

# bps: measured Value for some (but not all) of the (Site, Plot) pairs.
bps <- data.frame(
  Site = c(1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 3L, 3L, 3L),
  Plot = c(0L, 1L, 2L, 3L, 1L, 2L, 0L, 1L, 2L, 3L, 4L),
  Value = c(
    0.393309653, 0.12465733, 0.27380161, 0.027288989, 0.439712533,
    0.289724079, 0.036429062, 0.577460008, 0.820375917, 0.323217357,
    0.28637503
  )
)

# Merge and backfill ----

# Left join: keep every row of fsites; (Site, Plot) pairs that are absent
# from bps get NA in Value. TRUE is spelled out because T can be reassigned.
df1 <- merge(fsites, bps, by = c("Site", "Plot"), all.x = TRUE)

# Attach the fallback value for each row via the shared "Break" ID.
# Both inputs carry a "Value" column, so merge() suffixes them:
# Value.x comes from df1 (may be NA), Value.y comes from breaks.
df2 <- merge(df1, breaks, by = "Break")

# Create a new column 'Value' that uses Value.x unless it is NA,
# in which case it falls back to Value.y.
df3 <- df2
df3$Value <- df2$Value.x
df2.na <- is.na(df2$Value.x)
df3$Value[df2.na] <- df2$Value.y[df2.na]

# Drop the intermediate Value.x / Value.y columns. Selecting by name
# rather than by position keeps this correct if the column order changes.
cols <- c("Break", "Site", "Plot", "Value")
df4 <- df3[, cols]
+0

「更好」對你來說意味着什麼？是在大型數據集上運行得更快、代碼更簡單/更短/更簡潔，還是其他方面？ – gung

回答

5

在只有 `breaks`、`fsites`、`bps` 和 `df1` 的階段（也就是還不需要再做第二次合併時）：

# Backfill NA entries of df1$Value from the breaks lookup table.
# match() gives, for every row of df1, the position of its Break ID in
# breaks$Break, so fallback_value is aligned row-for-row with df1.
value_is_missing <- is.na(df1$Value)
fallback_value <- breaks$Value[match(df1$Break, breaks$Break)]

# Take the fallback where Value is NA, otherwise keep the existing Value.
df1$Value <- ifelse(value_is_missing, fallback_value, df1$Value)

#> df1 
# Site Plot Break  Value 
#1  1 0  1 0.39330965 
#2  1 1  5 0.12465733 
#3  1 2  7 0.27380161 
#4  1 3  8 0.02728899 
#5  1 4 11 42.00000000 
#6  2 0  1 2.00000000 
#7  2 1  6 0.43971253 
#8  2 2 11 0.28972408 
#9  3 0  1 0.03642906 
#10 3 1  4 0.57746001 
#11 3 2  6 0.82037592 
#12 3 3  8 0.32321736 
#13 3 4  9 0.28637503 
#14 3 5 11 42.00000000 

#just to test with your `df4` 
> sort(df1$Value) == sort(df4$Value) 
[1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE