2017-06-08 55 views
1

我想編寫一個函數,將數據框分割爲訓練集、交叉驗證集和測試集。(標題:在 R 中將數據集分割爲訓練、交叉驗證和測試數據集時,ifelse 返回意外結果)

我的代碼如下,並附上一個小數據集的例子:

# Reproduction script from the question: pick training and test row ids
# from the ISLR `Auto` data set.
library(ISLR) 
    library(data.table) 
    data <- Auto 

    # seed:  value passed to set.seed() so the sampling is repeatable.
    # train: fraction of all rows sampled into the training set.
    # test:  fraction of the remaining (non-training) rows sampled into the
    #        test set; the value 1 is meant to select all remaining rows.
    seed <- 12 
    train <- 0.7 
    test <- 0.6 

    # Function_split_test_train_regression <- function(data, train, test, seed){ 

     set.seed(seed) 
     # Convert `data` to a data.table so the `:=` column assignment below works.
     setDT(data) 
     # Store the row names in an `index` column; these character ids are what
     # gets sampled.
     data[, index := row.names(data)] 
     train_index <- sample(data$index, train * nrow(data)) 
     # NOTE(review): ifelse() returns a value with the same shape as its first
     # argument. `test == 1` is a single logical, so only the first element of
     # the selected branch is kept and `test_index` ends up with length 1.
     # A scalar condition calls for plain if/else instead.
     test_index <- ifelse(test == 1, setdiff(data$index, train_index), 
             sample(setdiff(data$index, train_index), test * length(setdiff(data$index, train_index)))) 
    # etc 
    #} 

此時我做了一些檢查,得到了一個令我詫異的結果:

 > test == 1 
     [1] FALSE 
     > sample(setdiff(data$index, train_index), 
       test * length(setdiff(data$index, train_index))) 
     [1] "225" "186" "41" "381" "356" "178" "147" "158" "21" "259" "207" "159" "250" "167" "128" "218" "271" "197" "376" "19" "77" 
     [22] "205" "46" "3" "212" "238" "61" "11" "68" "130" "200" "274" "127" "305" "201" "32" "48" "184" "290" "349" "155" "370" 
     [43] "366" "333" "243" "161" "108" "65" "125" "306" "357" "189" "337" "118" "364" "6" "149" "87" "252" "194" "362" "383" "93" 
     [64] "38" "18" "322" "220" "307" "60" "353" 
     > test_index <- ifelse(test == 1, setdiff(data$index, train_index), 
    sample(setdiff(data$index, train_index), 
      test * length(setdiff(data$index, train_index)))) 
     > test_index 
     [1] "219" 

爲什麼 ifelse 返回 "219",而不是第二個分支(`no` 參數)的值(因爲條件 `test == 1` 的計算結果爲 FALSE)?

您的建議將不勝感激。

============================================== ==================================

編輯

按照評論中提出的建議,我把代碼中的變量名 test 改成了 test_fraction,但問題依然存在。新代碼:

# Second reproduction script from the question: same logic as before, with
# the fractions renamed to `train_fraction` / `test_fraction`.
library(ISLR) 
library(data.table) 
data <- Auto 

# seed:           value passed to set.seed() so the sampling is repeatable.
# train_fraction: fraction of all rows sampled into the training set.
# test_fraction:  fraction of the remaining (non-training) rows sampled into
#                 the test set; the value 1 is meant to select all of them.
seed <- 12 
train_fraction <- 0.7 
test_fraction <- 0.6 

# Function_split_test_crossval_train_regr <- function(data, train, test, seed){ 

    set.seed(seed) 
    # Convert `data` to a data.table so the `:=` column assignment below works.
    setDT(data) 
    # Store the row names in an `index` column; these character ids are what
    # gets sampled.
    data[, index := row.names(data)] 
    train_index <- sample(data$index, train_fraction * nrow(data)) 
    # NOTE(review): renaming the variable does not change the outcome.
    # ifelse() returns a value shaped like its first argument, and
    # `test_fraction == 1` has length 1, so only the first element of the
    # selected branch is kept. A scalar condition calls for plain if/else.
    test_index <- ifelse(test_fraction == 1, setdiff(data$index, train_index), sample(setdiff(data$index, train_index), 
                test_fraction * length(setdiff(data$index, train_index)))) 
#} 

結果:

> train_index 
    [1] "119" "118" "143" "344" "293" "341" "305" "95" "82" "58" "226" "35" "363" "111" "84" "137" "24" "151" "381" "110" "93" 
[22] "198" "133" "6" "112" "228" "62" "36" "165" "353" "271" "385" "322" "291" "316" "268" "333" "37" "377" "176" "343" "281" 
[43] "245" "75" "238" "183" "215" "68" "274" "64" "224" "391" "26" "83" "66" "308" "1" "372" "161" "170" "300" "52" "30" 
[64] "15" "57" "148" "312" "311" "194" "367" "27" "342" "260" "181" "163" "171" "193" "210" "327" "248" "172" "263" "47" "351" 
[85] "166" "292" "278" "61" "116" "204" "309" "200" "96" "330" "383" "346" "249" "368" "41" "38" "235" "4" "77" "273" "191" 
[106] "212" "99" "31" "286" "79" "184" "284" "267" "374" "355" "358" "124" "114" "335" "70" "203" "379" "14" "287" "67" "34" 
[127] "340" "127" "91" "222" "240" "387" "357" "242" "310" "347" "142" "103" "105" "117" "189" "361" "177" "126" "392" "5" "317" 
[148] "174" "352" "87" "234" "147" "202" "261" "277" "214" "290" "339" "109" "43" "120" "169" "318" "56" "94" "115" "314" "320" 
[169] "276" "237" "296" "307" "23" "186" "360" "146" "313" "152" "206" "328" "60" "195" "69" "107" "97" "92" "325" "20" "362" 
[190] "157" "101" "10" "192" "134" "251" "259" "2" "29" "265" "331" "144" "63" "384" "81" "338" "364" "213" "380" "150" "48" 
[211] "54" "354" "187" "283" "356" "389" "72" "32" "121" "376" "33" "359" "349" "239" "241" "232" "196" "74" "156" "201" "390" 
[232] "326" "285" "51" "131" "304" "85" "45" "336" "280" "178" "128" "98" "275" "246" "65" "39" "188" "55" "90" "197" "9" 
[253] "173" "40" "295" "149" "230" "140" "135" "236" "21" "369" "301" "220" "122" "253" "208" "388" "159" "282" "88" "158" "167" 
[274] "257" 
> sample(setdiff(data$index, train_index), 
+              test_fraction * length(setdiff(data$index, train_index))) 
[1] "337" "378" "164" "225" "16" "44" "221" "179" "25" "28" "324" "175" "139" "154" "17" "252" "211" "155" "233" "162" "130" 
[22] "216" "255" "190" "365" "373" "73" "207" "42" "3" "348" "227" "49" "12" "53" "315" "199" "256" "129" "375" "205" "18" 
[43] "289" "168" "264" "160" "145" "382" "136" "302" "185" "323" "100" "270" "113" "294" "247" "345" "209" "104" "321" "7" "138" 
[64] "78" "386" "366" "298" "231" "86" "19" 
> test_fraction == 1 
[1] FALSE 
> test_index <- ifelse(test_fraction == 1, setdiff(data$index, train_index), sample(setdiff(data$index, train_index), 
+              test_fraction * length(setdiff(data$index, train_index)))) 
> test_index 
[1] "28" 
+1

'test' 是函數 'ifelse' 的一個參數名,與您的變量同名,所以您需要寫成 'test = test == 1'。我建議您更改變量的名稱。 – TheBiro

回答

0

我不知道爲什麼會這樣,希望有人能給出解釋。

但我找到了解決問題的辦法:您需要在 ifelse() 的分支內部把結果賦值給對象:

# Choose the test-set row ids from the rows left over after training.
#
# `test_fraction == 1` is a single TRUE/FALSE, so plain if/else is the right
# construct: it returns the chosen branch in full. ifelse() is vectorised and
# returns a value shaped like its `test` argument, so with a length-1
# condition it keeps only the first element of the selected branch.
#
# Assigning to `test_index` inside the ifelse() branches also produces the
# full vector (the assignment runs before ifelse() truncates its own return
# value), but that relies on a side effect of argument evaluation; the
# if/else form below gives the same `test_index` without it.
remaining_index <- setdiff(data$index, train_index)
test_index <- if (test_fraction == 1) {
  # Use every remaining row.
  remaining_index
} else {
  # Sample test_fraction of the remaining rows (without replacement).
  sample(remaining_index, test_fraction * length(remaining_index))
}

我不知道這是否是不好的做法,但它確實有效。它也可以用於在條件中指定多個條件,如我的答案 here 所示。

相關問題