2016-08-22 18 views
2

我想要執行 t.test 以獲得指定向量之間的 p 值(題目:從數據幀開始的向量之間的 T 檢驗)。讓我們用下面的數據爲例:

# Example data, written as a dput()-style literal: a data.frame with 32 rows
# (row.names = c(NA, -32L)) and 12 columns.
# `carb` holds character labels of the form "<letter>_PP" or "<letter>_MM";
# `Name` holds the group label (Mark, Tom, Tim, Greg).
# NOTE(review): the expression is not assigned to a name here; the snippets
# further down the page refer to this data as `df` or `d`.
structure(list(mpg = c(21, 21, 22.8, 21.4, 18.7, 18.1, 14.3, 
         24.4, 22.8, 19.2, 17.8, 16.4, 17.3, 15.2, 10.4, 10.4, 14.7, 32.4, 
         30.4, 33.9, 21.5, 15.5, 15.2, 13.3, 19.2, 27.3, 26, 30.4, 15.8, 
         19.7, 15, 21.4), cyl = c(6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 
               8, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 8, 6, 8, 4), 
       disp = c(160, 160, 108, 258, 360, 225, 360, 146.7, 140.8, 
         167.6, 167.6, 275.8, 275.8, 275.8, 472, 460, 440, 78.7, 75.7, 
         71.1, 120.1, 318, 304, 350, 400, 79, 120.3, 95.1, 351, 145, 
         301, 121), hp = c(110, 110, 93, 110, 175, 105, 245, 62, 95, 
              123, 123, 180, 180, 180, 205, 215, 230, 66, 52, 65, 97, 150, 
              150, 245, 175, 66, 91, 113, 264, 175, 335, 109), drat = c(3.9, 
                            3.9, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92, 3.92, 
                            3.07, 3.07, 3.07, 2.93, 3, 3.23, 4.08, 4.93, 4.22, 3.7, 2.76, 
                            3.15, 3.73, 3.08, 4.08, 4.43, 3.77, 4.22, 3.62, 3.54, 4.11 
             ), wt = c(2.62, 2.875, 2.32, 3.215, 3.44, 3.46, 3.57, 3.19, 
                3.15, 3.44, 3.44, 4.07, 3.73, 3.78, 5.25, 5.424, 5.345, 2.2, 
                1.615, 1.835, 2.465, 3.52, 3.435, 3.84, 3.845, 1.935, 2.14, 
                1.513, 3.17, 2.77, 3.57, 2.78), qsec = c(16.46, 17.02, 18.61, 
                          19.44, 17.02, 20.22, 15.84, 20, 22.9, 18.3, 18.9, 17.4, 17.6, 
                          18, 17.98, 17.82, 17.42, 19.47, 18.52, 19.9, 20.01, 16.87, 
                          17.3, 15.41, 17.05, 18.9, 16.7, 16.9, 14.5, 15.5, 14.6, 18.6 
                ), vs = c(0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 
                   0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1), am = c(1, 
                                 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 
                                 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1), gear = c(4, 4, 4, 3, 
                                            3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 
                                            3, 3, 4, 5, 5, 5, 5, 5, 4), carb = c("M_PP", "O_PP", "C_PP", "K_MM", 
                                                     "T_MM", "C_MM", "R_PP", "E_PP", "W_PP", "Q_PP", "R_MM", "T_MM", 
                                                     "V_MM", "Q_MM", "F_PP", "D_PP", "S_PP", "Z_PP", "K_PP", "G_PP", "F_MM", 
                                                     "D_MM", "S_MM", "Z_MM", "K_MM", "F_MM", "A_PP", "D_PP", "T_PP", 
                                                     "R_MM", "D_MM", "T_MM"), Name = c("Mark", "Mark", "Mark", "Mark", 
                                                            "Mark", "Mark", "Tom", "Tom", "Tom", "Tom", "Tom", "Tom", 
                                                            "Tom", "Tom", "Tim", "Tim", "Tim", "Tim", "Tim", "Tim", "Tim", 
                                                            "Tim", "Tim", "Tim", "Tim", "Tim", "Greg", "Greg", "Greg", 
                                                            "Greg", "Greg", "Greg")), .Names = c("mpg", "cyl", "disp", 
                                                                      "hp", "drat", "wt", "qsec", "vs", "am", "gear", "carb", "Name" 
                                                            ), row.names = c(NA, -32L), class = "data.frame") 

下面是可以從這個數據幀中區分出來的其中一個組:

mpg cyl disp hp drat wt qsec vs am gear carb Name 
1 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 M_PP Mark 
2 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 O_PP Mark 
3 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 C_PP Mark 
4 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 K_MM Mark 
5 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 T_MM Mark 
6 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 C_MM Mark 

所以,我想在 Mark 的 PP 與 MM 兩個子組(由 carb 列的後綴決定)之間執行 t.test。我感興趣的列是 gear。我想知道,這兩個子組之間齒輪數量的差異在統計上是否顯著。

應該對這個數據中的所有組(例如 Mark)執行同樣的分析。

結果(pvalues)可以存儲在附加列中的同一數據框中。這意味着將在屬於同一組的所有行中重複pvalues。

+1

你有什麼已經嘗試過?你想如何處理M/O/C ...前綴? –

+0

我試圖在循環中使用函數 'grepl' 來完成它。不幸的是,我無法確定它對整個數據是否有效,因爲數據太大,我的電腦無法在循環中處理完…… –

回答

1

使用 dplyr 時,這相當直接:

library(dplyr)

# Within each Name, run a two-sample t.test on `cyl` between the rows whose
# `carb` label ends in PP and those ending in MM, and repeat the resulting
# p-value on every row of that Name.
df %>%
  group_by(Name) %>%
  # Drop everything up to and including the last underscore, leaving PP / MM.
  mutate(carb1 = gsub(".*_", "", carb)) %>%
  mutate(
    p_values = t.test(cyl[carb1 == "PP"], cyl[carb1 == "MM"])$p.value
  ) %>%
  # carb1 was only a helper for the split; remove it from the result.
  select(-carb1)

#Source: local data frame [32 x 13] 
#Groups: Name [4] 

#  mpg cyl disp hp drat wt qsec vs am gear carb Name p_values 
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>  <dbl> 
#1 21.0  6 160.0 110 3.90 2.620 16.46  0  1  4 M_PP Mark 0.2301996 
#2 21.0  6 160.0 110 3.90 2.875 17.02  0  1  4 O_PP Mark 0.2301996 
#3 22.8  4 108.0 93 3.85 2.320 18.61  1  1  4 C_PP Mark 0.2301996 
#4 21.4  6 258.0 110 3.08 3.215 19.44  1  0  3 K_MM Mark 0.2301996 
#5 18.7  8 360.0 175 3.15 3.440 17.02  0  0  3 T_MM Mark 0.2301996 
#6 18.1  6 225.0 105 2.76 3.460 20.22  1  0  3 C_MM Mark 0.2301996 
#7 14.3  8 360.0 245 3.21 3.570 15.84  0  0  3 R_PP Tom 0.1294094 
#8 24.4  4 146.7 62 3.69 3.190 20.00  1  0  4 E_PP Tom 0.1294094 
#9 22.8  4 140.8 95 3.92 3.150 22.90  1  0  4 W_PP Tom 0.1294094 
#10 19.2  6 167.6 123 3.92 3.440 18.30  1  0  4 Q_PP Tom 0.1294094 

注:我使用了 cyl,因爲 gear 會引發錯誤:

Error: data are essentially constant

1

我打算使用cyl而不是gear,因爲t.test在使用gear時會拋出「數據本質上不變」的錯誤。假設數據是一個名爲d的對象。

我們分幾步進行。

  1. 我們通過 split(d, d$Name) 按 Name 拆分數據幀
  2. 我們使用 carb 的後綴創建一個新變量 group
  3. 我們在每個子集內對 cyl 執行 t.test

    # For every Name, compare `cyl` between the PP and MM rows with t.test().
    # D ends up as a list of p-values, one element per Name.
    D <- lapply(split(d, d$Name), function(by_name) {
      # Keep only the trailing PP / MM marker of each carb label.
      suffix <- factor(gsub(".*(PP|MM)", "\\1", by_name$carb))
      cyl_pp <- by_name$cyl[suffix == "PP"]
      cyl_mm <- by_name$cyl[suffix == "MM"]
      t.test(cyl_pp, cyl_mm)$p.value
    })
    

到目前爲止輸出:

D 
# $Greg 
# [1] 0.7250302 

# $Mark 
# [1] 0.2301996 

# $Tim 
# [1] 0.5995106 

# $Tom 
# [1] 0.1294094 

最後,我們把 D 重塑成數據幀,並將它與原始數據幀合併:

# Flatten the per-Name list of p-values into a lookup table with one row per
# Name, then attach the p-value to every row of the original data.
D <- data.frame(
  Name = names(D),
  pvalue = unlist(D, use.names = TRUE)
)
# `Name` is the only column the two tables have in common, so naming it
# explicitly gives the same join as merge()'s default key selection.
merge(x = d, y = D, by = "Name")
# Name mpg cyl disp hp drat wt qsec vs am  gear carb pvalue 
# 1 Greg 26.0 4 120.3 91 4.43 2.140 16.70 0 1 0.37495820 A_PP 0.7250302 
# 2 Greg 30.4 4 95.1 113 3.77 1.513 16.90 1 1 -2.07140903 D_PP 0.7250302 
# 3 Greg 15.8 8 351.0 264 4.22 3.170 14.50 0 1 -0.73900855 T_PP 0.7250302 
# 4 Greg 19.7 6 145.0 175 3.62 2.770 15.50 0 1 -0.09174744 R_MM 0.7250302 
# 5 Greg 15.0 8 301.0 335 3.54 3.570 14.60 0 1 -1.55889142 D_MM 0.7250302 
# 6 Greg 21.4 4 121.0 109 4.11 2.780 18.60 1 1 0.78601261 T_MM 0.7250302 
# 7 Mark 21.0 6 160.0 110 3.90 2.620 16.46 0 1 1.60209096 M_PP 0.2301996 
# 8 Mark 21.0 6 160.0 110 3.90 2.875 17.02 0 1 0.25393125 O_PP 0.2301996 
# 9 Mark 22.8 4 108.0 93 3.85 2.320 18.61 1 1 -1.14837484 C_PP 0.2301996 
# 10 Mark 21.4 6 258.0 110 3.08 3.215 19.44 1 0 0.68440881 K_MM 0.2301996 
# 11 Mark 18.7 8 360.0 175 3.15 3.440 17.02 0 0 -1.04994050 T_MM 0.2301996 
# 12 Mark 18.1 6 225.0 105 2.76 3.460 20.22 1 0 -2.18665934 C_MM 0.2301996 
# ..snip.. 
相關問題