這是我嘗試製作可重複問題的第一步。試圖成爲SO的更好成員。由於意外的結果大小,功能停止運行
這裏是數據集
reach.dat <- structure(list(stream = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), .Label = c("Brooks Creek", "Buncombe Hollow Creek",
"Bypass Channel", "Cape Horn Creek", "Cougar Creek", "Dog Creek",
"Indian George Creek", "Jim Creek", "Lower Speelyai", "North Siouxon Creek",
"Ole Creek", "Panamaker Creek", "Siouxon Creek", "Speelyai Creek",
"West Fork Speelyai Creek", "West Tributary Speelyai Creek"), class = "factor"),
reach = structure(c(21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L,
29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L,
17L, 18L, 19L, 20L, 46L, 47L, 38L, 39L, 40L, 41L, 42L, 43L,
44L, 45L), .Label = c("BNH_0001", "BNH_0002", "BNH_0003",
"BNH_0004", "BNH_0005", "BNH_0006", "BNH_0007", "BNH_0008",
"BNH_0009", "BPC_0001", "BPC_0002", "BPC_0003", "BPC_0004",
"BPC_0005", "BPC_0006", "BPC_0007", "BPC_0008", "BPC_0009",
"BPC_0010", "BPC_0011", "BRK_001", "BRK_002", "BRK_003",
"BRK_004", "BRK_005", "BRK_006", "BRK_007", "BRK_008", "BRK_009",
"BRK_010", "BRK_011", "BRK_012", "BRK_013", "BRK_014", "BRK_015",
"BRK_016", "BRK_017", "CGR_0001", "CGR_0002", "CGR_0003",
"CGR_0004", "CGR_0005", "CGR_0006", "CGR_0007", "CGR_0008",
"CHC_0001", "CHC_0002", "DOG_0001", "ING_0001", "ING_0002",
"ING_0003", "ING_0004", "ING_0005", "ING_0006", "ING_0007",
"JMC_0001", "JMC_0002", "LSPL_0001", "NSX_0001", "NSX_0002",
"OLE_0001", "OLE_0002", "OLE_0003", "OLE_0004", "OLE_0005",
"OLE_0006", "PMR_0001", "PMR_0002", "SPL_0001", "SPL_0002",
"SPL_0003", "SPL_0004", "SPL_0005", "SPL_0006", "SPL_0007",
"SPL_0008", "SPL_0009", "SPL_0010", "SPL_0011", "SPL_0012",
"SPL_0013", "SPL_0014", "SPL_0015", "SPL_0016", "SPL_0017",
"SPL_0018", "SPL_0019", "SPL_0020", "SPL_0021", "SPL_0022",
"SPL_0023", "SXN_0001", "SXN_0002", "SXN_0003", "SXN_0004",
"SXN_0005", "SXN_0006", "SXN_0007", "SXN_0008", "WFSPL_0001",
"WFSPL_0002", "WFSPL_0003", "WFSPL_0004", "WFSPL_0005", "WTSPL_0001",
"WTSPL_0002", "WTSPL_0003", "WTSPL_0004", "WTSPL_0005"), class = "factor"),
length = c(178.9, 170.1, 185.8, 199.7, 190.3, 190, 172.3,
196.4, 179, 183.4, 182.4, 196.7, 188.4, 181.5, 178.9, 196.8,
181.2, 116.2, 121.4, 124.2, 180, 111.7, 130.3, 127.5, 143.4,
75.7, 591.1, 640.9, 582.2, 659, 534.9, 621.9, 636, 564.2,
547.1, 537.2, 589.6, 329.5, 192.7, 454.4, 464.5, 477.6, 544.8,
500.1, 473.1, 481.7, 506.4), SUM_LWD = c(0.288093907, 1.324221046,
1.763186222, 0.774161242, 0.391907514, 0.889842105, 0, 1.5200611,
1.097765363, 0.573173391, 0, 0.622267412, 0.427070064, 1.43338843,
1.151928452, 1.935365854, 1.856622517, 0.326333907, 0.07199341,
0.037520129, 0, 0, 0, 0.169647059, 0.138493724, 0.098678996,
0, 0, 0.217279285, 0.179044006, 0.097868761, 0.210644798,
0, 0.708259482, 0.145567538, 0.03877513, 0.026611262, 2.302579666,
2.125583809, 0.092033451, 0.176533907, 0.955904523, 0.175624082,
1.434553089, 1.038575354, 0.198048578, 1.042061611), Avg_RPD = c(0.352,
0.343333333, 0.325, 0.34, 0.225, 0.2475, 0.254, 0.2125, 0.276666667,
0.22, 0.32875, 0.23125, 0.215, 0.268, 0.305, 0.243636, 0.335714286,
0.2, 0.216666667, 0.24, 0.243333333, 0.56, 0.2575, 0.208,
0.335833333, 0.376666667, 0.175, 0.695, 0.75, 0.546666667,
1.188333333, 1.58, 0.885, 0.448571429, NA, NA, NA, 0.611818182,
0.50875, NA, 0.77, 0.49875, NA, 0.544, 1.017777778, 0.9175,
0.623333333), Avg_Substrate_Pools = c(86.10955531, 104.0366873,
83.94224019, 88.24737809, 88.93432719, 82.59257502, 84.02947757,
81.71253918, 82.76023619, 82.88298227, 84.91750332, 81.21887219,
85.05625823, 82.04273063, 84.01489539, 92.41003006, 117.3416424,
88.61396078, 88.86511047, 83.38603629, 71.73828707, 76.57563745,
86.2069123, 83.62789949, 81.19546417, 80.89002676, 182.5586,
160.63761, 134.82532, 123.1769494, 112.0805653, 93.59420959,
180.5731068, 82.91509529, NA, 61.9283, NA, 111.7153167, 83.79134743,
61.9283, 89.51477005, NA, NA, 86.08708892, 85.3472397, 110.8504333,
91.00371559)), .Names = c("stream", "reach", "length", "SUM_LWD",
"Avg_RPD", "Avg_Substrate_Pools"), row.names = c(NA, 47L), class = "data.frame")
這是一小片數據幀的,我已經除去因子的變量(在此情況下的流)的一個的幾個等級。所以你會在腳本的開頭看到,我添加了更多的列,刪除NA(這是我相信我的問題的來源,因爲此代碼適用於沒有任何NA單元的變量)。然後我放下關卡,因爲我剛纔提到的原始數據集有更多的流。當我運行ht.means.xx時,錯誤發生在腳本的末尾,出現錯誤Error: result size (5), expected 4 or 1
。我認爲5是正確的數字,因爲有5個不同的流名稱。我沒有通過刪除具有NA值的行來移除整個流。每行實際上是特定流的到達範圍,並且每個流都具有超過1個範圍。因此,即使達到(行)因爲具有NA值而被移除,由於該流的其他達到(行)不具有NA值,所以流仍應該存在。
這是腳本。這有點令人困惑,我正在盡我所能。我認爲,如果你閱讀我的描述並查看所創建的data.frame(它的結構),這將是有道理的。
# Getting Started ---------------------------------------------------------
require(dplyr)
require(sampling)
require(xtable)
require(car)
require(lattice)
## Read in Data on reaches for generating 1st order probabilities
#add a column for stream ID
reach.dat$streamID <- as.numeric(reach.dat$stream)
#add a column for reach ID
reach.dat$reachID <- 1:47
#add probabilities of selecting each reach, within each stream (prop to length)
length.totals <- reach.dat %>%
group_by(stream) %>%
summarise(totals = sum(length))
reach.dat <- merge(reach.dat, length.totals, by = "stream")
reach.dat$length.probs <- with(reach.dat, length/totals)
#reorder columns for organization
reach.dat <- reach.dat[, c("stream","streamID","reach","reachID", "length",
"length.probs", "totals", "Avg_RPD", "Avg_Substrate_Pools", "SUM_LWD")]
reach.dat <- reach.dat[,1:8]
# Remove the NA values in specific columns. Specify Column
completeFun <- function(reach.dat, desiredCols) {
completeVec <- complete.cases(reach.dat[, desiredCols])
return(reach.dat[completeVec, ])
}
reach.dat <- completeFun(reach.dat, "Avg_RPD")
droplevels(reach.dat)
nsim <- 100
# Running Simulations -----------------------------------------------------
#this function draws a unequal probability sample within each stream
sample.fun.pi <- function(stream.no, n.vec){
sample <-
#ifelse statement to deal with the streams that have only 1 reach
ifelse(length(reach.dat[reach.dat$streamID==stream.no, "reachID"]) == 1,
#this line says to sample that one reach, if the stream only has one reach
reach.dat[which(reach.dat$streamID==stream.no), "reachID"],
#if more than one reach, draw a uneq prob sample of size n.vec
list(sample(subset(reach.dat, streamID == stream.no)$reachID,
size = n.vec[stream.no],
#probabilities proportional to length
prob = reach.dat[reach.dat$streamID == stream.no, "length.probs"],
#without replacement
replace = FALSE)))
return(unlist(sample))
}
strata.uneq.pi <- function(n.vec, nsim){
#store nsim samples in a matrix called sample.all
sample.all <- matrix(NA, nrow = sum(n.vec), ncol = nsim)
for(i in 1:nsim){
#for each sample, apply the above function to all the streams
sample.all[, i] <- unlist(apply(cbind(1:length(unique(reach.dat$streamID))), 1,
sample.fun.pi, n.vec))
}
return(sample.all)
}
#define sample sizes
n1.6<-c(1,1,1,1,1)
n7.8<-c(1,1,1,1,1)
n9.10<-c(2,1,1,1,1)
#Set seed
set.seed(303)
#build a matrix of samples for every sample size
sample.1.6 <- strata.uneq.pi(n1.6, nsim)
sample.7.8 <- strata.uneq.pi(n7.8, nsim)
sample.9.10 <- strata.uneq.pi(n9.10, nsim)
#for each sample size, calculate the inclusion probabilities for each reach by counting the number of times that reach is
# selected out of the total number of simulations
pi.1.6 <- numeric(length(reach.dat$reach))
for(i in 1:length(reach.dat$reach)) {
#total number of times the reach is sampled out of the total num of sims
pi.1.6[i] <- sum(sample.1.6==i)/nsim
}
pi.7.8 <- numeric(length(reach.dat$reach))
for(i in 1:length(reach.dat$reach)) {
#total number of times the reach is sampled out of the total num of sims
pi.7.8[i] <- sum(sample.7.8==i)/nsim
}
pi.9.10 <- numeric(length(reach.dat$reach))
for(i in 1:length(reach.dat$reach)) {
#total number of times the reach is sampled out of the total num of sims
pi.9.10[i] <- sum(sample.9.10==i)/nsim
}
# Use this to enter the variable values of choice
reach.dat$y<-(reach.dat$Avg_RPD)
#find number of reaches in each stream for calculating ht estimator
reach.dat$stream <- as.factor(as.character(reach.dat$stream))
num.reaches <- unlist(with(reach.dat, tapply(reach, stream, length)))
#find true mean of responses within each stream
true_y <- reach.dat %>%
group_by(stream) %>%
summarise(true_y=mean(y)) %>%
ungroup()
#this function calculates the ht estimator for each simulated sample from above
#uses the estimated inclusion probabilities
ht.fun <- function(simnum, sample.n, pi.n){
#reaches that were selected in the sample
reach <- sample.n[, simnum]
#estimated inclusion probabilities for those reaches
pi <- pi.n[sample.n[, simnum]]
#y-values that were selected
y <- reach.dat[sample.n[, simnum], "y"]
#the streams that the selected reaches belong to
stream <- reach.dat[sample.n[, simnum], "stream"]
#organize into a dataframe so we can use dplyr on it
data <- cbind.data.frame(reach, pi, y, stream)
#use dplyr to calculate the ht estimates of the total and the mean
ht.strata <- data %>%
tbl_df %>%
group_by(stream) %>%
summarise(total = sum(y/pi)) %>%
mutate(size = num.reaches) %>%
mutate(mean = total/size)
return(ht.strata[, "mean"])
}
#apply to every one of the nsim samples for each sampling rate
ht.means.1.6 <- data.frame(apply(cbind(1:nsim), 1, ht.fun, sample.n = sample.1.6,
pi.n = pi.1.6))
ht.means.7.8 <- data.frame(apply(cbind(1:nsim), 1, ht.fun, sample.n = sample.7.8,
pi.n = pi.7.8))
ht.means.9.10 <- data.frame(apply(cbind(1:nsim), 1, ht.fun, sample.n = sample.9.10,
pi.n = pi.9.10))
你舉的例子是可重複的(嗚!),但沒有真正_minimal_,這是另一部分[問一個偉大的R第( http://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example/5963610#5963610)。更短,更簡潔的問題更有可能得到答案,因爲通過對很多代碼進行排序以查看它的工作原理是一種痛苦。 – alistaire
我覺得主要是模擬的代碼與問題非常相關。我可能會錯過一些東西。而且由於某些功能彼此相互構成,我覺得我應該包括所有功能。我覺得這是最小考慮我開始...... – Christopher
這一行產生一個錯誤'reach.dat $ length.probs < - with(reach.dat,length/totals)'。沒有稱爲「總計」的列。請再次運行您的代碼,並確保我們可以在沒有錯誤的情況下結束。你有'totals.x'和'totals.y'這是你比較'長度'? –