2017-04-20 54 views
2

我有三個數據集需要合併。它們包含4年級和5年級的學校數據以及閱讀/數學成績。其中一個是長格式的數據集,部分變量有很多缺失值(是的,我確實需要長格式數據);另外兩個是寬格式的數據集,包含完整的數據。所有這些數據框都有一列,存放數據庫中每個人的唯一ID號。我想把含有NA的長格式數據與寬格式的完整數據合併,用完整數據覆蓋這些NA。

下面是一個完整的可重現示例,用來生成與我實際使用的data.frame類型相同的小例子。我需要使用以下三個數據框:`school_lf`、`school4`和`school5`。`school_lf`是含有NA的長格式數據,`school4`和`school5`則是我需要用來填充長格式數據中NA的數據(按`id`和`grade`匹配)。

# ---- Example data: wide scores with missing values, then long form ----
# Built with base R only, so the example runs without extra packages.

set.seed(890)
n_students <- 100

# One row per student. The sample() calls are kept in this exact order so the
# seeded random stream (and therefore every generated value) is reproducible.
school <- data.frame(id = sample(102938:999999, n_students))
school$selected <- sample(0:1, n_students, replace = TRUE)
school$math4 <- sample(400:500, n_students)
school$math5 <- sample(400:500, n_students)
school$read4 <- sample(400:500, n_students)
school$read5 <- sample(400:500, n_students)

# Replace `k` randomly chosen values of `x` with NA. The scores are drawn
# without replacement, so every value is unique and exactly `k` are blanked.
blank_at_random <- function(x, k) {
  x[x %in% sample(x, k)] <- NA
  x
}

# Delete observations at random from the school df
school$math4 <- blank_at_random(school$math4, 25)
school$math5 <- blank_at_random(school$math5, 50)
school$read4 <- blank_at_random(school$read4, 70)
school$read5 <- blank_at_random(school$read5, 81)

# Long form: one row per student and grade. The grade-4 rows are stacked on
# top of the grade-5 rows, so math and reading scores stay aligned by row.
school_lf <- data.frame(
  id = rep(school$id, times = 2),
  selected = rep(school$selected, times = 2),
  mathscore = c(school$math4, school$math5),
  grade = rep(c(4, 5), each = n_students),
  readscore = c(school$read4, school$read5)
)

############### 


# Generate the 2 data frames with IDs that have the full data 

# Complete grade-4 scores in wide form (no missing values).
# Re-seeding with 890 and drawing id/selected first reproduces the same ids
# and flags as in `school`, so the rows can be matched by id.
set.seed(890)
school4 <- data.frame(id = sample(102938:999999, 100))
school4$selected <- sample(0:1, 100, replace = TRUE)
school4$math4 <- sample(400:500, 100)
school4$read4 <- sample(400:500, 100)
school4$grade <- 4


# Complete grade-5 scores in wide form (no missing values).
# Re-seeding with 890 and drawing id/selected first reproduces the same ids
# and flags as in `school`, so the rows can be matched by id.
set.seed(890)
school5 <- data.frame(id = sample(102938:999999, 100))
school5$selected <- sample(0:1, 100, replace = TRUE)
school5$math5 <- sample(400:500, 100)
school5$read5 <- sample(400:500, 100)
school5$grade <- 5

我需要把寬格式的數據框合併到長格式數據中,用實際值替換NA。我嘗試了下面的代碼,但它多出了好幾列,而不是把閱讀分數和數學分數填入NA的位置。我只需要一列閱讀分數和一列數學分數,而不是六個單獨的列(read.x、read.y、math.x、math.y、mathscore和readscore)。

# Attempted merge: full outer joins on the shared key columns.
# The score columns are named differently in each input (mathscore/readscore
# vs. math4/read4 vs. math5/read5), so merge() keeps them side by side
# instead of filling the NAs in mathscore/readscore.
merge_keys <- c("id", "grade", "selected")
sch <- merge(school_lf, school4, by = merge_keys, all = TRUE)
sch <- merge(sch, school5, by = merge_keys, all = TRUE)

任何幫助,不勝感激!我一直試圖解決這個問題,現在已經好幾個小時了,還沒有取得任何進展(所以我想問一下)

回答

0

您可以使用dplyr中的`coalesce`函數。如果第一個向量中的值是NA,它會查看第二個向量中相同位置上的值,若該值不是NA就選用它;如果仍然是NA,則繼續查看第三個向量。

library(dplyr)

# For each row, take the first non-missing value among the long-form score
# and the grade-specific wide columns, then keep the columns from id through
# readscore.
sch %>%
  mutate(
    mathscore = coalesce(mathscore, math4, math5),
    readscore = coalesce(readscore, read4, read5)
  ) %>%
  select(id:readscore)
+0

感謝您的答覆!這個函數看起來很簡潔,但我不太理解它在做什麼……我試着用示例數據運行它,結果報了這個錯誤:`Error in mutate_impl(.data, dots) : object 'math4' not found` – rowbust

+0

那麼你可能是在另一個數據集上運行了它;這個錯誤只是說明math4不在你應用該函數的數據集中。 – Edwin

0

編輯:我剛剛在實際數據上嘗試了這種做法,但行不通,因爲用於替換的數據本身也有一些NA,結果我想要`coalesce`的幾個數據框行數不同……又回到了原點。

我用下面的代碼解決了這個問題(雖然它不是最優雅、最直接的做法),@Edwin的回答幫我指明了正確的方向。歡迎提出更高效的寫法!

# Idea: bring both complete data sets into the long layout of school_lf
# (one row per id and grade), stack them, and fill the NAs by key.
library(dplyr)

# The wide frames name their score columns per grade (math4/read4 and
# math5/read5); rename them to the shared long-form names. Each frame holds a
# single score per subject, so no melt() is needed to reshape it.
sch_full_4 <- data.frame(
  id = school4$id,
  grade = school4$grade,
  readscore = school4$read4,
  mathscore = school4$math4
)

sch_full_5 <- data.frame(
  id = school5$id,
  grade = school5$grade,
  readscore = school5$read5,
  mathscore = school5$math5
)

# Stack together
sch_full <- rbind(sch_full_4, sch_full_5)

# Expect one replacement row per (id, grade); duplicates would make the join
# below multiply rows of school_lf.
stopifnot(anyDuplicated(sch_full[c("id", "grade")]) == 0L)

# MERGE together: match on id and grade instead of row position, so this
# still works when sch_full has a different number or order of rows.
# Existing scores are kept; only NAs are filled from the complete data.
final_df <- school_lf %>%
  left_join(sch_full, by = c("id", "grade"), suffix = c("", "_full")) %>%
  mutate(
    mathscore = coalesce(mathscore, mathscore_full),
    readscore = coalesce(readscore, readscore_full)
  ) %>%
  select(-mathscore_full, -readscore_full)