2012-10-09 68 views
1

我有一個數據框,其中包含課程ID、學生ID、週數(第一週爲1,第二週爲2,依此類推),以及每個學生每週在每門課程中所做事情的一些信息。如果教師在某一週對該學生進行了「干預」,那麼數據框的最後兩列爲非NA,否則爲NA。我想比較每個學生在第一次干預之前的行爲。(標題:數據框中的特定聚合)

所以我想要創建一個名爲 'HasIntervened' 的列:當週數小於該學生第一次干預所在的週時爲 FALSE,當週數大於或等於該週時爲 TRUE。但我費了很大的勁也沒能創建出這個簡單的列。我相當肯定 aggregate 會是解決之道,但我就是沒能以正確的方式思考這個問題。

以下是數據框前60行的 dput 輸出(5名學生的數據):

structure(list(UserID = c(4188948L, 4188948L, 4188948L, 4188948L, 
4188948L, 4188948L, 4735684L, 4735684L, 4735684L, 4735684L, 4735684L, 
4735684L, 6292486L, 6292486L, 6292486L, 6292486L, 6292486L, 6292486L, 
6469671L, 6469671L, 6469671L, 6469671L, 6469671L, 6469671L, 6538263L, 
6538263L, 6538263L, 6538263L, 6538263L, 6538263L, 6621258L, 6621258L, 
6621258L, 6621258L, 6621258L, 6621258L, 6891869L, 6891869L, 6891869L, 
6891869L, 6891869L, 6891869L, 6891869L, 6891869L, 6891869L, 6891869L, 
6891869L, 6891869L, 6978155L, 6978155L, 6978155L, 6978155L, 6978155L, 
6978155L, 7195846L, 7195846L, 7195846L, 7195846L, 7195846L, 7195846L 
), CourseID = c(6567871L, 6567871L, 6567871L, 6567871L, 6567871L, 
6567871L, 6567168L, 6567168L, 6567168L, 6567168L, 6567168L, 6567168L, 
6567864L, 6567864L, 6567864L, 6567864L, 6567864L, 6567864L, 6567159L, 
6567159L, 6567159L, 6567159L, 6567159L, 6567159L, 6567162L, 6567162L, 
6567162L, 6567162L, 6567162L, 6567162L, 6567853L, 6567853L, 6567853L, 
6567853L, 6567853L, 6567853L, 6567159L, 6567159L, 6567159L, 6567159L, 
6567159L, 6567159L, 6567864L, 6567864L, 6567864L, 6567864L, 6567864L, 
6567864L, 6567873L, 6567873L, 6567873L, 6567873L, 6567873L, 6567873L, 
6567859L, 6567859L, 6567859L, 6567859L, 6567859L, 6567859L), 
WeekInCourse = c(1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 
3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 
4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 
5, 6, 1, 2, 3, 4, 5, 6), WeekPostCount = c(1L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 5L, 3L, 4L, 3L, 3L, 0L, 4L, 
0L, 0L, 0L, 0L, 0L, 3L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 2L, 
2L, 0L, 0L, 4L, 0L, 3L, 0L, 3L, 0L, 0L, 0L), WeekLoginCount = c(2L, 
1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 4L, 4L, 1L, 0L, 
0L, 0L, 3L, 3L, 1L, 0L, 0L, 0L, 2L, 1L, 0L, 0L, 0L, 0L, 1L, 
1L, 0L, 0L, 0L, 0L, 3L, 0L, 0L, 0L, 0L, 0L, 4L, 1L, 0L, 0L, 
0L, 0L, 3L, 3L, 1L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L), 
WeekPointsPercent = c(0, 0, 0, 0, 0, 0, 0, 0.185714285714286, 
0.375, 0.2, 0, 0, 0, 0.85, 0.7, 0.4, 0.7, 0.7, 0, 0.857142857142857, 
0.35, 0, 0, 0.712765957446808, 0, 1, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.25, 0, 0, 0, 0, 0, 0.5, 0.5, 
0, 0, 0.7, 1, 1, 0.375, 0.723076923076923, 0, 0.738636363636364 
), CumulativePointsPercent = c(0, 0, 0, 0, 0, 0, 0, 0.185714285714286, 
0.254545454545455, 0.235294117647059, 0.235294117647059, 
0.10958904109589, 0, 0.85, 0.8, 0.533333333333333, 0.55, 
0.563636363636364, 0, 0.857142857142857, 0.623076923076923, 
0.476470588235294, 0.476470588235294, 0.600558659217877, 
0, 1, 0.0666666666666667, 0.0666666666666667, 0.0461538461538462, 
0.0461538461538462, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0.25, 0.166666666666667, 0.0555555555555556, 0.05, 0.0454545454545455, 
0, 0.5, 0.5, 0.166666666666667, 0.15, 0.2, 1, 1, 0.615384615384615, 
0.669230769230769, 0.621428571428571, 0.666666666666667), 
RiskEstimate = c(0.627717786405816, 0.986868933315635, 0.986687587608184, 
0.993909863003438, 0.997123961252086, 0.995862152216296, 
0.914011371723269, 0.925359536086114, 0.902625588346349, 
0.956922151061089, 0.977244888475535, 0.975006380719003, 
0.215420992232115, 0.174623555825523, 0.241380495376484, 
0.699712463799006, 0.692014530298594, 0.697966901130338, 
0.765071150059092, 0.763071307309743, 0.767261726128078, 
0.835918063362269, 0.854949153314029, 0.805318343915736, 
0.792873572656207, 0.790581615380765, 0.82622599277251, 0.9330287497742, 
0.965763061363497, 0.951226314109191, 0.851355921713566, 
0.991081300877175, 0.989671569185701, 0.995402298000919, 
0.997671718747865, 0.996593366142757, 0.738690043138604, 
0.865412845144037, 0.831369850200541, 0.93845410260835, 0.968400480533385, 
0.9533338828382, 0.624930735381371, 0.981915016747928, 0.985037736895337, 
0.994680902796769, 0.996907588471311, 0.995388109404559, 
0.887995464972052, 0.970620002831325, 0.97136665697772, 0.992618626388727, 
0.99543249839328, 0.992149889176406, 0.923802324633255, 0.984464950934932, 
0.978726967214146, 0.971473084822075, 0.97886220009245, 0.979311013989987 
), RiskBin = c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L), InterventionID = c(NA, 26L, NA, NA, NA, 
NA, NA, NA, NA, NA, 50L, NA, NA, NA, NA, NA, 73L, NA, NA, 
NA, NA, NA, 56L, NA, NA, NA, NA, 46L, NA, NA, NA, 33L, NA, 
NA, NA, NA, 15L, NA, NA, 43L, 53L, NA, NA, NA, NA, NA, 71L, 
NA, NA, NA, NA, NA, 78L, NA, NA, 36L, NA, NA, 80L, NA), InterventionType = structure(c(NA, 
2L, NA, NA, NA, NA, NA, NA, NA, NA, 3L, NA, NA, NA, NA, NA, 
2L, NA, NA, NA, NA, NA, 3L, NA, NA, NA, NA, 3L, NA, NA, NA, 
2L, NA, NA, NA, NA, 3L, NA, NA, 3L, 2L, NA, NA, NA, NA, NA, 
2L, NA, NA, NA, NA, NA, 2L, NA, NA, 3L, NA, NA, 3L, NA), .Label = c("", 
"At-Risk Form", "Email", "Other", "Phone"), class = "factor")), .Names = c("UserID", 
"CourseID", "WeekInCourse", "WeekPostCount", "WeekLoginCount", 
"WeekPointsPercent", "CumulativePointsPercent", "RiskEstimate", 
"RiskBin", "InterventionID", "InterventionType"), row.names = c(NA, 
60L), class = "data.frame") 
+0

我相當確定,這裡你應該改爲使用 'ave'。 –

回答

2
# Flag every row at or after the first intervention for each user/course.
# Within a group the running count of non-NA InterventionID values is 0 until
# the first intervention and positive from then on, so coercing that count to
# logical gives the flag. Assumes rows are already ordered by week within each
# user/course group -- TODO confirm for the full data set.
courses$HasIntervened <- with(courses, {
  seen_so_far <- ave(
    InterventionID,
    UserID, CourseID, # grouping factors
    FUN = function(ids) cumsum(!is.na(ids))
  )
  as.logical(seen_so_far)
})
+0

我很確定這將是我的解決方案,但是'with'部分是什麼?我正在查看它的文檔,我不明白爲什麼它在這裏有幫助。 –

+1

它允許您直接把列名當作變量使用,使代碼更具可讀性。否則你就得寫成:'ave(courses$InterventionID, courses$UserID, courses$CourseID, FUN = function(x) {...})'(注意 'FUN' 必須按名稱傳入,否則會被當成分組因子)。從技術上講,您是在創建一個「環境」,其中各列都是具名對象;而在 .GlobalEnv 中,這些值無法直接按名稱訪問。 –

+0

啊,好吧......這有點像是臨時調用了一次 'attach'。非常感謝!我就希望會有某個我沒聽說過、但應該使用的函數,看來就是它了。 –

1

試試這個:

# Add `HasIntervened` to `your.data`: TRUE for every week at or after the first
# week in which the student received an intervention in that course, FALSE
# before it (and FALSE throughout when there was never an intervention).
#
# Fixes relative to the merge-based version:
#   * groups by UserID *and* CourseID, so a student enrolled in two courses
#     gets a separate first-intervention week per course;
#   * computes the per-group minimum with ave(), which returns values in the
#     original row order (merge() sorts by key and breaks row alignment);
#   * does not depend on the rows being sorted.

# One group id per user/course pair; drop = TRUE avoids empty combinations,
# which would otherwise make min() warn about zero-length input.
user_course <- interaction(your.data$UserID, your.data$CourseID, drop = TRUE)

# Week number on intervention rows, Inf elsewhere, so that min() picks the
# earliest intervention week and yields Inf when a group has none.
intervention_week <- ifelse(
  is.na(your.data$InterventionID),
  Inf,
  your.data$WeekInCourse
)

first_week <- ave(
  intervention_week,
  user_course,
  FUN = function(w) min(w, na.rm = TRUE)
)

your.data$HasIntervened <- your.data$WeekInCourse >= first_week
0

這應該工作:

library(plyr)

# Return `df` with a logical `HasIntervened` column: TRUE from the week of the
# first intervention onwards, FALSE before it, computed separately for every
# user/course pair.
#
# Fixes relative to the which.min() version:
#   * which.min(InterventionID) locates the smallest intervention *ID*, not
#     the earliest intervention; the first week is now taken from WeekInCourse;
#   * a group with no interventions made which.min() return integer(0) and
#     `if (i > 1)` fail; such groups are now FALSE throughout;
#   * grouping includes CourseID so courses of the same student do not mix;
#   * the result no longer depends on row order within a group.
ddply(df, .(UserID, CourseID), function(x) {
  intervened <- !is.na(x$InterventionID)
  first_week <- if (any(intervened)) {
    min(x$WeekInCourse[intervened])
  } else {
    Inf # never intervened: no week can be >= Inf
  }
  x$HasIntervened <- x$WeekInCourse >= first_week
  x
})
1

一種使用 data.table 的方法,代碼優雅且內存效率高:

library(data.table)

# Work on a data.table copy of the data frame, which is assumed to be in DF.
DT <- as.data.table(DF)

# Keying sorts the table by user, course and week, so the running count below
# walks each user/course group in week order.
setkeyv(DT, c("UserID", "CourseID", "WeekInCourse"))

# Add the flag by reference: TRUE once at least one non-NA InterventionID has
# been seen within the course/user group.
DT[,
  hasIntervened := cumsum(!is.na(InterventionID)) >= 1L,
  by = .(CourseID, UserID)
]

data.table 語法可以直接引用列名,因此不需要使用 'with'。