2016-03-24 64 views
4

這是我嘗試製作可重複問題的第一步。試圖成爲SO的更好成員。由於意外的結果大小,功能停止運行

這裏是數據集

reach.dat <- structure(list(stream = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), .Label = c("Brooks Creek", "Buncombe Hollow Creek", 
"Bypass Channel", "Cape Horn Creek", "Cougar Creek", "Dog Creek", 
"Indian George Creek", "Jim Creek", "Lower Speelyai", "North Siouxon Creek", 
"Ole Creek", "Panamaker Creek", "Siouxon Creek", "Speelyai Creek", 
"West Fork Speelyai Creek", "West Tributary Speelyai Creek"), class = "factor"), 
    reach = structure(c(21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 
    29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 1L, 2L, 3L, 
    4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 
    17L, 18L, 19L, 20L, 46L, 47L, 38L, 39L, 40L, 41L, 42L, 43L, 
    44L, 45L), .Label = c("BNH_0001", "BNH_0002", "BNH_0003", 
    "BNH_0004", "BNH_0005", "BNH_0006", "BNH_0007", "BNH_0008", 
    "BNH_0009", "BPC_0001", "BPC_0002", "BPC_0003", "BPC_0004", 
    "BPC_0005", "BPC_0006", "BPC_0007", "BPC_0008", "BPC_0009", 
    "BPC_0010", "BPC_0011", "BRK_001", "BRK_002", "BRK_003", 
    "BRK_004", "BRK_005", "BRK_006", "BRK_007", "BRK_008", "BRK_009", 
    "BRK_010", "BRK_011", "BRK_012", "BRK_013", "BRK_014", "BRK_015", 
    "BRK_016", "BRK_017", "CGR_0001", "CGR_0002", "CGR_0003", 
    "CGR_0004", "CGR_0005", "CGR_0006", "CGR_0007", "CGR_0008", 
    "CHC_0001", "CHC_0002", "DOG_0001", "ING_0001", "ING_0002", 
    "ING_0003", "ING_0004", "ING_0005", "ING_0006", "ING_0007", 
    "JMC_0001", "JMC_0002", "LSPL_0001", "NSX_0001", "NSX_0002", 
    "OLE_0001", "OLE_0002", "OLE_0003", "OLE_0004", "OLE_0005", 
    "OLE_0006", "PMR_0001", "PMR_0002", "SPL_0001", "SPL_0002", 
    "SPL_0003", "SPL_0004", "SPL_0005", "SPL_0006", "SPL_0007", 
    "SPL_0008", "SPL_0009", "SPL_0010", "SPL_0011", "SPL_0012", 
    "SPL_0013", "SPL_0014", "SPL_0015", "SPL_0016", "SPL_0017", 
    "SPL_0018", "SPL_0019", "SPL_0020", "SPL_0021", "SPL_0022", 
    "SPL_0023", "SXN_0001", "SXN_0002", "SXN_0003", "SXN_0004", 
    "SXN_0005", "SXN_0006", "SXN_0007", "SXN_0008", "WFSPL_0001", 
    "WFSPL_0002", "WFSPL_0003", "WFSPL_0004", "WFSPL_0005", "WTSPL_0001", 
    "WTSPL_0002", "WTSPL_0003", "WTSPL_0004", "WTSPL_0005"), class = "factor"), 
    length = c(178.9, 170.1, 185.8, 199.7, 190.3, 190, 172.3, 
    196.4, 179, 183.4, 182.4, 196.7, 188.4, 181.5, 178.9, 196.8, 
    181.2, 116.2, 121.4, 124.2, 180, 111.7, 130.3, 127.5, 143.4, 
    75.7, 591.1, 640.9, 582.2, 659, 534.9, 621.9, 636, 564.2, 
    547.1, 537.2, 589.6, 329.5, 192.7, 454.4, 464.5, 477.6, 544.8, 
    500.1, 473.1, 481.7, 506.4), SUM_LWD = c(0.288093907, 1.324221046, 
    1.763186222, 0.774161242, 0.391907514, 0.889842105, 0, 1.5200611, 
    1.097765363, 0.573173391, 0, 0.622267412, 0.427070064, 1.43338843, 
    1.151928452, 1.935365854, 1.856622517, 0.326333907, 0.07199341, 
    0.037520129, 0, 0, 0, 0.169647059, 0.138493724, 0.098678996, 
    0, 0, 0.217279285, 0.179044006, 0.097868761, 0.210644798, 
    0, 0.708259482, 0.145567538, 0.03877513, 0.026611262, 2.302579666, 
    2.125583809, 0.092033451, 0.176533907, 0.955904523, 0.175624082, 
    1.434553089, 1.038575354, 0.198048578, 1.042061611), Avg_RPD = c(0.352, 
    0.343333333, 0.325, 0.34, 0.225, 0.2475, 0.254, 0.2125, 0.276666667, 
    0.22, 0.32875, 0.23125, 0.215, 0.268, 0.305, 0.243636, 0.335714286, 
    0.2, 0.216666667, 0.24, 0.243333333, 0.56, 0.2575, 0.208, 
    0.335833333, 0.376666667, 0.175, 0.695, 0.75, 0.546666667, 
    1.188333333, 1.58, 0.885, 0.448571429, NA, NA, NA, 0.611818182, 
    0.50875, NA, 0.77, 0.49875, NA, 0.544, 1.017777778, 0.9175, 
    0.623333333), Avg_Substrate_Pools = c(86.10955531, 104.0366873, 
    83.94224019, 88.24737809, 88.93432719, 82.59257502, 84.02947757, 
    81.71253918, 82.76023619, 82.88298227, 84.91750332, 81.21887219, 
    85.05625823, 82.04273063, 84.01489539, 92.41003006, 117.3416424, 
    88.61396078, 88.86511047, 83.38603629, 71.73828707, 76.57563745, 
    86.2069123, 83.62789949, 81.19546417, 80.89002676, 182.5586, 
    160.63761, 134.82532, 123.1769494, 112.0805653, 93.59420959, 
    180.5731068, 82.91509529, NA, 61.9283, NA, 111.7153167, 83.79134743, 
    61.9283, 89.51477005, NA, NA, 86.08708892, 85.3472397, 110.8504333, 
    91.00371559)), .Names = c("stream", "reach", "length", "SUM_LWD", 
"Avg_RPD", "Avg_Substrate_Pools"), row.names = c(NA, 47L), class = "data.frame") 

這是一小片數據幀的,我已經除去因子的變量(在此情況下的流)的一個的幾個等級。所以你會在腳本的開頭看到,我添加了更多的列,刪除NA(這是我相信我的問題的來源,因爲此代碼適用於沒有任何NA單元的變量)。然後我放下關卡,因爲我剛纔提到的原始數據集有更多的流。當我運行ht.means.xx時,錯誤發生在腳本的末尾,出現錯誤Error: result size (5), expected 4 or 1。我認爲5是正確的數字,因爲有5個不同的流名稱。我沒有通過刪除具有NA值的行來移除整個流。每行實際上是特定流的到達範圍,並且每個流都具有超過1個範圍。因此,即使達到(行)因爲具有NA值而被移除,由於該流的其他達到(行)不具有NA值,所以流仍應該存在。

這是腳本。這有點令人困惑,我正在盡我所能。我認爲,如果你閱讀我的描述並查看所創建的data.frame(它的結構),這將是有道理的。

# Getting Started --------------------------------------------------------- 

require(dplyr) 
require(sampling) 
require(xtable) 
require(car) 
require(lattice) 

## Read in Data on reaches for generating 1st order probabilities 

#add a column for stream ID 
reach.dat$streamID <- as.numeric(reach.dat$stream) 

#add a column for reach ID 
reach.dat$reachID <- 1:47 

#add probabilities of selecting each reach, within each stream (prop to length) 
length.totals <- reach.dat %>% 
    group_by(stream) %>% 
    summarise(totals = sum(length)) 

reach.dat <- merge(reach.dat, length.totals, by = "stream") 

reach.dat$length.probs <- with(reach.dat, length/totals) 

#reorder columns for organization 
reach.dat <- reach.dat[, c("stream","streamID","reach","reachID", "length", 
          "length.probs", "totals", "Avg_RPD", "Avg_Substrate_Pools", "SUM_LWD")] 
reach.dat <- reach.dat[,1:8] 

# Remove the NA values in specific columns. Specify Column 
completeFun <- function(reach.dat, desiredCols) { 
    completeVec <- complete.cases(reach.dat[, desiredCols]) 
    return(reach.dat[completeVec, ]) 
} 
reach.dat <- completeFun(reach.dat, "Avg_RPD") 

droplevels(reach.dat) 

nsim <- 100 

# Running Simulations ----------------------------------------------------- 


#this function draws a unequal probability sample within each stream 
sample.fun.pi <- function(stream.no, n.vec){ 
    sample <- 
    #ifelse statement to deal with the streams that have only 1 reach 
    ifelse(length(reach.dat[reach.dat$streamID==stream.no, "reachID"]) == 1, 
      #this line says to sample that one reach, if the stream only has one reach 
      reach.dat[which(reach.dat$streamID==stream.no), "reachID"], 
      #if more than one reach, draw a uneq prob sample of size n.vec 
      list(sample(subset(reach.dat, streamID == stream.no)$reachID, 
         size = n.vec[stream.no], 
         #probabilities proportional to length 
         prob = reach.dat[reach.dat$streamID == stream.no, "length.probs"], 
         #without replacement 
         replace = FALSE))) 
    return(unlist(sample)) 
} 

strata.uneq.pi <- function(n.vec, nsim){ 
    #store nsim samples in a matrix called sample.all 
    sample.all <- matrix(NA, nrow = sum(n.vec), ncol = nsim) 
    for(i in 1:nsim){ 
    #for each sample, apply the above function to all the streams 
    sample.all[, i] <- unlist(apply(cbind(1:length(unique(reach.dat$streamID))), 1, 
            sample.fun.pi, n.vec)) 
    } 
    return(sample.all) 
} 
#define sample sizes 
n1.6<-c(1,1,1,1,1) 
n7.8<-c(1,1,1,1,1) 
n9.10<-c(2,1,1,1,1) 

#Set seed 
set.seed(303) 

#build a matrix of samples for every sample size 
sample.1.6 <- strata.uneq.pi(n1.6, nsim) 
sample.7.8 <- strata.uneq.pi(n7.8, nsim) 
sample.9.10 <- strata.uneq.pi(n9.10, nsim) 


#for each sample size, calculate the inclusion probabilities for each reach by counting the number of times that reach is 
# selected out of the total number of simulations 

pi.1.6 <- numeric(length(reach.dat$reach)) 
for(i in 1:length(reach.dat$reach)) { 
    #total number of times the reach is sampled out of the total num of sims 
    pi.1.6[i] <- sum(sample.1.6==i)/nsim 
} 

pi.7.8 <- numeric(length(reach.dat$reach)) 
for(i in 1:length(reach.dat$reach)) { 
    #total number of times the reach is sampled out of the total num of sims 
    pi.7.8[i] <- sum(sample.7.8==i)/nsim 
} 

pi.9.10 <- numeric(length(reach.dat$reach)) 
for(i in 1:length(reach.dat$reach)) { 
    #total number of times the reach is sampled out of the total num of sims 
    pi.9.10[i] <- sum(sample.9.10==i)/nsim 
} 


# Use this to enter the variable values of choice 
reach.dat$y<-(reach.dat$Avg_RPD) 
#find number of reaches in each stream for calculating ht estimator 
reach.dat$stream <- as.factor(as.character(reach.dat$stream)) 
num.reaches <- unlist(with(reach.dat, tapply(reach, stream, length))) 

#find true mean of responses within each stream 
true_y <- reach.dat %>% 
    group_by(stream) %>% 
    summarise(true_y=mean(y)) %>% 
    ungroup() 

#this function calculates the ht estimator for each simulated sample from above 
#uses the estimated inclusion probabilities 
ht.fun <- function(simnum, sample.n, pi.n){ 
    #reaches that were selected in the sample 
    reach <- sample.n[, simnum] 
    #estimated inclusion probabilities for those reaches 
    pi <- pi.n[sample.n[, simnum]] 
    #y-values that were selected 
    y <- reach.dat[sample.n[, simnum], "y"] 
    #the streams that the selected reaches belong to 
    stream <- reach.dat[sample.n[, simnum], "stream"] 
    #organize into a dataframe so we can use dplyr on it 
    data <- cbind.data.frame(reach, pi, y, stream) 
    #use dplyr to calculate the ht estimates of the total and the mean 
    ht.strata <- data %>% 
    tbl_df %>% 
    group_by(stream) %>% 
    summarise(total = sum(y/pi)) %>% 
    mutate(size = num.reaches) %>% 
    mutate(mean = total/size) 
    return(ht.strata[, "mean"]) 
} 

#apply to every one of the nsim samples for each sampling rate 

ht.means.1.6 <- data.frame(apply(cbind(1:nsim), 1, ht.fun, sample.n = sample.1.6, 
           pi.n = pi.1.6)) 


ht.means.7.8 <- data.frame(apply(cbind(1:nsim), 1, ht.fun, sample.n = sample.7.8, 
           pi.n = pi.7.8)) 


ht.means.9.10 <- data.frame(apply(cbind(1:nsim), 1, ht.fun, sample.n = sample.9.10, 
            pi.n = pi.9.10)) 
+5

你舉的例子是可重複的(嗚!),但沒有真正_minimal_,這是另一部分[問一個偉大的R第( http://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example/5963610#5963610)。更短,更簡潔的問題更有可能得到答案,因爲通過對很多代碼進行排序以查看它的工作原理是一種痛苦。 – alistaire

+0

我覺得主要是模擬的代碼與問題非常相關。我可能會錯過一些東西。而且由於某些功能彼此相互構成,我覺得我應該包括所有功能。我覺得這是最小考慮我開始...... – Christopher

+0

這一行產生一個錯誤'reach.dat $ length.probs < - with(reach.dat,length/totals)'。沒有稱爲「總計」的列。請再次運行您的代碼,並確保我們可以在沒有錯誤的情況下結束。你有'totals.x'和'totals.y'這是你比較'長度'? –

回答

1

這裏有一些變化,你可以做:

#add a column for stream ID 
reach.dat$streamID <- as.numeric(reach.dat$stream) 

#add a column for reach ID 
reach.dat$reachID <- 1:nrow(reach.dat) 

#add probabilities of selecting each reach, within each stream (prop to length) 
reach.dat <- reach.dat %>% 
    group_by(stream) %>% 
    mutate(totals = sum(length), 
     length.probs=length/totals) 

#reorder columns for organization 
reach.dat <- reach.dat[, c("stream","streamID","reach","reachID", "length", 
          "length.probs", "totals", "Avg_RPD", "Avg_Substrate_Pools", "SUM_LWD")] 
reach.dat <- reach.dat[,1:8] 
reach.dat <- reach.dat[!is.na(reach.dat$Avg_RPD),] 
reach.dat <- droplevels(reach.dat) 

#add a column for reach ID (Again) 
reach.dat$reachID <- 1:nrow(reach.dat) 

nsim <- 10 

# Running Simulations ----------------------------------------------------- 
#define sample sizes 
sizes <- list(
    n1.6=c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1), 
    n7.8=c(1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1), 
    n9.10=c(2,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1), 
    n11.13=c(2,1,1,1,1,1,1,1,1,1,1,1,1,3,1,1), 
    n14=c(2,1,2,1,1,1,1,1,1,1,1,1,1,3,1,1), 
    n15=c(3,1,2,1,1,1,1,1,1,1,1,1,1,3,1,1), 
    n16=c(3,1,2,1,1,1,1,1,1,1,1,1,1,4,1,1), 
    n17.18=c(3,2,2,1,1,1,1,1,1,1,1,1,1,4,1,1), 
    n19=c(3,2,2,1,2,1,1,1,1,1,1,1,2,4,1,1), 
    n20=c(3,2,2,1,2,1,1,1,1,1,1,1,2,5,1,1), 
    n21=c(4,2,2,1,2,1,1,1,1,1,1,1,2,5,1,1), 
    n22=c(4,2,2,1,2,1,2,1,1,1,1,1,2,5,1,1), 
    n23=c(4,2,3,1,2,1,2,1,1,1,1,1,2,5,1,1), 
    n24=c(4,2,3,1,2,1,2,1,1,1,1,1,2,6,1,1), 
    n25.26=c(4,2,3,1,2,1,2,1,1,1,2,1,2,6,1,1), 
    n27=c(5,2,3,1,2,1,2,1,1,1,2,1,2,6,1,1), 
    n28=c(5,3,3,1,2,1,2,1,1,1,2,1,2,6,1,1), 
    n29=c(5,3,3,1,2,1,2,1,1,1,2,1,2,7,1,1), 
    n30.31=c(5,3,3,1,2,1,2,1,1,1,2,1,2,7,2,2), 
    n32=c(5,3,4,1,3,1,2,1,1,1,2,1,3,7,2,2), 
    n33.35=c(6,3,4,1,3,1,2,1,1,1,2,1,3,8,2,2), 
    n36=c(6,3,4,1,3,1,3,1,1,1,2,1,3,8,2,2), 
    n37.38=c(6,3,4,1,3,1,3,1,1,1,2,1,3,9,2,2), 
    n39.40=c(7,4,4,1,3,1,3,1,1,1,2,1,3,9,2,2), 
    n41=c(7,4,5,1,3,1,3,1,1,1,2,1,3,9,2,2), 
    n42.43=c(7,4,5,1,3,1,3,1,1,1,3,1,3,10,2,2), 
    n44=c(7,4,5,1,4,1,3,1,1,1,3,1,4,10,2,2), 
    n45=c(8,4,5,1,4,1,3,1,1,1,3,1,4,10,2,2), 
    n46.49=c(8,4,5,1,4,1,3,1,1,1,3,1,4,11,2,2), 
    n50=c(9,5,6,1,4,1,4,1,1,1,3,1,4,12,3,3)) 

#Set seed 
set.seed(303) 

s <- split(reach.dat, reach.dat[,"streamID"]) 
s.reach <- lapply(s, '[[', "reachID") 
s.probs <- lapply(s, '[[', "length.probs") 
ind1 <- lengths(s.probs) == 1 
strata.uneq.pi <- function(n.vec, nsim) { 
    replicate(nsim, {out <- vector("list", length(n.vec)) 
    out[ind1] <- s.reach[ind1] 
    out[!ind1] <- Map(sample, x=s.reach[!ind1], 
          size=n.vec[!ind1], 
          prob=s.probs[!ind1], 
          replace=FALSE) 
    unlist(out) 
    }) 
} 


#build a matrix of samples for every sample size 
samples <- lapply(sizes, function(x) strata.uneq.pi(x, nsim)) 


#for each sample size, calculate the inclusion probabilities for each reach by counting the number of times that reach is 
# selected out of the total number of simulations 
getpi <- function(smple, nsim) { 
    len <- length(reach.dat$reach) 
    pi <- numeric(len) 
    for(i in 1:len) pi[i] <- sum(smple == i)/nsim 
    pi 
} 

pi.lst <- lapply(samples, getpi, nsim=nsim) 

# Use this to enter the variable values of choice 
reach.dat$y <- reach.dat$Avg_RPD 

#find number of reaches in each stream for calculating ht estimator 
num.reaches <- table(reach.dat$stream) 

#find true mean of responses within each stream 
true_y <- reach.dat %>% 
    group_by(stream) %>% 
    summarise(true_y=mean(y)) 
reach.dat$reachID 
as.data.frame(reach.dat) 
#this function calculates the ht estimator for each simulated sample from above 
#uses the estimated inclusion probabilities 

ht.fun <- function(sample.n, pi.n) { 
    apply(sample.n, 2, function(ind) { 
    df <- data.frame(pi=pi.n[ind], 
        reach.dat[ind, "y"], 
        reach.dat[ind, "stream"], stringsAsFactors = FALSE) 
    #dim(df) 
    unique(df$stream) 
    df$size <- num.reaches[df$stream] 
    df <- na.omit(df %>% group_by(stream) %>% 
     summarise(total=sum(y/pi), 
      mean=total/size[1])) 
    df$mean} 
) 
} 

#apply to every one of the nsim samples for each sampling rate 
ht.lst <- Map(ht.fun, samples, pi.lst) 


n <- rep(names(sizes), each=16) 

stream <- as.character(rep(sort(unique(reach.dat$stream)), 30)) 

true_mean <- rep(true_y$true_y,30) 

sim_dat <- data.frame(cbind(stream, n, true_mean, do.call(rbind, ht.lst))) 
+0

這確實回答了原來的問題!謝謝皮埃爾! – Christopher

+0

不客氣!任何時候 –