在R中根據匹配條件從數據集中刪除行

我做了一個有54名參與者的詞彙學習實驗。實驗設計是每位參與者學習12個同源詞和12個非同源詞。但是，我不得不從數據集中刪除一些觀測值，最終得到1591個同源詞觀測值和1816個非同源詞觀測值。

現在，我想計算同源詞得分與非同源詞得分之間的相關性。這意味着我必須把1816個非同源詞觀測值減少到1591個，因爲計算相關性時兩組樣本的數量必須相同。

我可以直接刪除第1592至1816行，但這並不理想，因爲這樣會丟失排在後面的參與者的所有非同源詞數據（數據按參與者編號排序）。

我更願意做的是遍歷所有參與者，並爲每個參與者刪除相應數量的「多餘」非同源詞，使該參與者的同源詞和非同源詞數量相等。

這是2名參與者數據的一個例子（注意每個單詞在多個時間點進行了測試，理想情況下刪除數據時也應考慮到這一點）：

# Example data (dput() output) for two participants, factor levels "2" and "3".
# Columns:
#   Participant   - factor; only levels "2" and "3" occur in these rows
#   Word          - factor of the tested words
#   Cognate       - factor with levels "Cognate" / "Non-cognate"
#   TestingMoment - factor with levels "Main2", "Main4", "Post", "FollowUp"
#   Score         - numeric score; all values shown lie between 0 and 1
# Row names are the original row numbers (97 to 264); the gaps mark
# observations that were already removed from the data set.
# The expression is not assigned here; assign it to a variable before use
# (the answers further down expect it in `df` or `data`).
structure(list(Participant = structure(c(2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("1", 
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", 
"14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", 
"25", "26", "27", "28", "29", "30", "31", "34", "35", "36", "37", 
"38", "39", "40", "41", "42", "43", "44", "45", "46", "47", "48", 
"49", "50", "51", "52", "54", "55", "56", "57"), class = "factor"), 
    Word = structure(c(5L, 77L, 23L, 40L, 30L, 8L, 73L, 28L, 
    48L, 44L, 58L, 69L, 50L, 57L, 45L, 6L, 56L, 53L, 63L, 65L, 
    77L, 5L, 40L, 23L, 30L, 8L, 28L, 73L, 48L, 58L, 44L, 50L, 
    69L, 57L, 45L, 56L, 6L, 63L, 53L, 65L, 23L, 30L, 40L, 5L, 
    8L, 77L, 73L, 48L, 28L, 57L, 69L, 58L, 50L, 45L, 44L, 53L, 
    65L, 6L, 63L, 56L, 5L, 40L, 8L, 77L, 30L, 23L, 28L, 48L, 
    73L, 57L, 45L, 50L, 69L, 58L, 44L, 63L, 53L, 56L, 6L, 16L, 
    13L, 81L, 82L, 52L, 1L, 12L, 75L, 55L, 78L, 70L, 66L, 80L, 
    83L, 64L, 68L, 25L, 47L, 11L, 26L, 4L, 19L, 36L, 13L, 16L, 
    82L, 81L, 52L, 1L, 75L, 12L, 78L, 55L, 70L, 80L, 66L, 64L, 
    83L, 68L, 25L, 11L, 47L, 4L, 26L, 19L, 36L, 13L, 16L, 1L, 
    82L, 52L, 81L, 78L, 12L, 75L, 55L, 70L, 80L, 66L, 64L, 83L, 
    68L, 25L, 4L, 11L, 47L, 36L, 19L, 26L), .Label = c("aambeeld", 
    "bezem", "brandblusser", "broodrooster", "buis", "citruspers", 
    "dienblad", "dobber", "dweil", "emmer", "garde", "gesp", 
    "gieter", "gum", "heggenschaar", "hengel", "hes", "kaars", 
    "kapstok", "keppel", "kist", "klapper", "klos", "knikker", 
    "knuffel", "kooi", "kous", "kraag", "kroon", "kruiwagen", 
    "kruk", "kurk", "kussen", "kwast", "lantaarn", "lessenaar", 
    "mijter", "onderzetter", "pak", "passer", "peddel", "pet", 
    "pruik", "puntenslijper", "rammelaar", "reddingsvest", "rietje", 
    "rits", "romper", "sambabal", "schort", "schroef", "servet", 
    "skelter", "slab", "slang", "slinger", "speen", "speldje", 
    "spijker", "spuit", "staf", "stamper", "stelt", "stofzuiger", 
    "stokpaard", "stolp", "tamboerijn", "tol", "tooi", "toverstaf", 
    "tuinbroek", "tulband", "vergiet", "veter", "vijl", "vijzel", 
    "waaier", "wafelijzer", "wip", "zaag", "zeis", "zwemvleugel" 
    ), class = "factor"), Cognate = structure(c(2L, 2L, 2L, 2L, 
    2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 
    1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 
    2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 
    2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 
    2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 
    1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 
    2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 
    2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 
    1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Cognate", 
    "Non-cognate"), class = "factor"), TestingMoment = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 
    3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 
    4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
    4L, 4L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
    3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("Main2", 
    "Main4", "Post", "FollowUp"), class = "factor"), Score = c(0, 
    1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0.71, 1, 1, 0.86, 
    1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 
    1, 0.86, 1, 0, 0, 0, 0, 1, 0, 0.43, 1, 1, 0, 0, 0, 0, 1, 
    1, 0.86, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0.75, 0, 0, 0, 0.57, 
    0, 0, 0, 0.45, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 
    1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 
    0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0.8, 
    1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 
    0, 0)), .Names = c("Participant", "Word", "Cognate", "TestingMoment", 
"Score"), row.names = c(97L, 98L, 99L, 100L, 101L, 102L, 103L, 
104L, 105L, 109L, 110L, 111L, 112L, 113L, 114L, 115L, 116L, 117L, 
118L, 120L, 121L, 122L, 123L, 124L, 125L, 126L, 127L, 128L, 130L, 
133L, 134L, 135L, 136L, 137L, 138L, 139L, 140L, 141L, 142L, 144L, 
145L, 146L, 147L, 148L, 149L, 150L, 152L, 154L, 155L, 157L, 158L, 
159L, 160L, 161L, 162L, 163L, 164L, 165L, 166L, 168L, 169L, 170L, 
171L, 172L, 173L, 174L, 175L, 178L, 180L, 181L, 182L, 183L, 184L, 
185L, 186L, 188L, 189L, 190L, 191L, 193L, 194L, 195L, 196L, 197L, 
198L, 199L, 200L, 201L, 202L, 204L, 205L, 206L, 207L, 208L, 209L, 
210L, 211L, 212L, 213L, 214L, 215L, 216L, 217L, 218L, 219L, 220L, 
221L, 222L, 223L, 224L, 225L, 226L, 228L, 229L, 230L, 231L, 232L, 
233L, 234L, 235L, 236L, 237L, 238L, 239L, 240L, 241L, 242L, 243L, 
244L, 245L, 246L, 247L, 248L, 250L, 251L, 252L, 253L, 254L, 255L, 
256L, 257L, 258L, 259L, 260L, 261L, 262L, 263L, 264L), class = "data.frame")

最好的做法是什麼？

來源

2017-04-25 Johanna

我不明白你想刪除什麼非同源。 – TheBiro

好吧，基本上我想保留在參與者編號和測試時間點上相匹配的非同源詞。在此之後，具體刪除哪些並不重要（也許我可以保留那些在長度上與同源詞最匹配的詞，但現在這並不那麼重要）。 – Johanna

會不會有些參與者的同源詞數量 > 非同源詞數量？ – G5W

一個稍微簡短一些的答案。

# Balance the number of observations in the two `Cognate` categories within
# each participant by randomly discarding surplus rows from the larger
# category.
#
# Input:  `df`  - data frame with columns `Participant` and `Cognate`;
#                 `Cognate` must have exactly two categories.
# Output: `df2` - copy of `df` in which every participant has equally many
#                 rows in both categories.
#
# The rows to drop are drawn with sample.int(), so call set.seed() beforehand
# if the result has to be reproducible. Balancing is done per participant
# only; TestingMoment is not taken into account.
df2 <- df  # preserve the original data

# Coerce once so the code works for character as well as factor columns:
# levels() returns NULL for a character vector, which would make a loop over
# levels(df2$Participant) silently do nothing.
cognate <- as.factor(df2$Cognate)
participant <- as.character(df2$Participant)
stopifnot(nlevels(cognate) == 2L)

# Row indices of `df2`, grouped by participant (rows with NA are ignored).
rows_by_participant <- split(seq_len(nrow(df2)), participant)

# For each participant, pick the surplus rows of the larger category.
surplus_rows <- lapply(rows_by_participant, function(rows) {
  counts <- table(cognate[rows])
  n_surplus <- max(counts) - min(counts)
  if (n_surplus == 0L) {
    return(integer(0))
  }
  larger <- names(counts)[which.max(counts)]
  candidates <- rows[which(cognate[rows] == larger)]
  candidates[sample.int(length(candidates), n_surplus)]
})
drop_rows <- unlist(surplus_rows, use.names = FALSE)

# Negative indexing with an empty vector would select zero rows, so only
# subset when there is something to remove.
if (length(drop_rows) > 0L) {
  df2 <- df2[-drop_rows, , drop = FALSE]
}
table(df2$Cognate)

來源

2017-04-25 14:42:58 G5W

不錯！在我的代碼中，我添加了一個if來檢查同源詞和非同源詞的數量是否相等，但我用的是「break」。它爲什麼會導致循環中斷？這就是你用「next」的原因嗎？ – TheBiro

@TheBiro `next` 會立即跳到 `for` 循環（參與者）的下一個值，而不執行循環體的其餘部分。我認爲你的 `break` 實際上可能是一個錯誤：`break` 會完全終止 `for` 循環的執行，不再處理後面的參與者。可惜我們只有一個參與者的數據可供測試 :-) – G5W

如果你有興趣的話，我已經添加了兩位參與者的示例數據。 – Johanna

這是我的答案，它又長又怪，但它會遍歷每個參與者，檢查Cognate和Non-cognate哪個出現得更頻繁，並刪除多餘的行直到兩者數量相等（記得把你的數據賦值給data變量）：

# Balance "Cognate" and "Non-cognate" rows within each participant.
#
# Input:  `data`       - data frame with columns `Participant` and `Cognate`
#                        (`Cognate` coded as "Cognate" / "Non-cognate").
# Output: `final_data` - the per-participant subsets, each trimmed so that
#                        both categories have the same number of rows,
#                        stacked in order of first appearance of the
#                        participant (NULL when `data` has no rows).
#
# For every participant the first surplus rows of the larger category are
# removed, so the result is deterministic.
participants <- as.character(unique(data$Participant))
balanced <- vector("list", length(participants))

for (i in seq_along(participants)) {
  # Subset to the current participant
  new_data <- data[which(data$Participant == participants[i]), ]

  non_cognate_rows <- which(new_data$Cognate == "Non-cognate")
  cognate_rows <- which(new_data$Cognate == "Cognate")
  surplus <- length(non_cognate_rows) - length(cognate_rows)

  # A participant that is already balanced (surplus == 0) must be kept
  # unchanged. A `break` here would end the loop and lose this and every
  # remaining participant; `next` would still skip storing this one.
  if (surplus > 0) {
    # More non-cognates: drop the first `surplus` non-cognate rows
    new_data <- new_data[-non_cognate_rows[seq_len(surplus)], ]
  } else if (surplus < 0) {
    # More cognates: drop the first `-surplus` cognate rows
    new_data <- new_data[-cognate_rows[seq_len(-surplus)], ]
  }

  balanced[[i]] <- new_data
}

# Bind once at the end instead of growing the data frame inside the loop.
final_data <- do.call(rbind, balanced)

來源

2017-04-25 14:27:13 TheBiro

在R中根據匹配條件從數據集中刪除行

回答

相關問題