2017-09-06 72 views
0

我正在嵌套邏輯迴歸模型用3次的成果表示選擇A的適當的規格,B或C的第一級表示A和B或C之間的選擇,並且所述第二級別代表B和C之間的選擇。某些組成數據的代碼位於下方,但我不確定是否正確指定了模型。根據定義,B或C的概率大於B的概率,但是對於非常小的樣本量,當從後驗採樣時,「BorC」可能小於B.這樣的小樣本大小可能不會在實際數據中出現我很感興趣,但事實發生了這種事實讓我覺得我做錯了什麼。謝謝!在pymc3嵌套logit模型

import numpy as np 
import pandas as pd 
import pymc3 as pm 
from scipy.special import logit 
import matplotlib.pyplot as plt 
from theano import shared 

import cPickle as pickle # python 2 

def hierarchical_normal(name, shape, mu=0.,cs=5.): 
    delta = pm.Normal('delta_{}'.format(name), 0., 1., shape=shape) 
    sigma = pm.HalfCauchy('sigma_{}'.format(name), cs) 

    return pm.Deterministic(name, mu + delta * sigma) 

NUTS_KWARGS = {'target_accept': 0.99} 
SEED = 4260026 # from random.org, for reproducibility 

np.random.seed(SEED) 

ndraws = 1000 

counts =[[19, 50, 37], 
     [21, 67, 55], 
     [11, 53, 38], 
     [17, 54, 45], 
     [24, 93, 66], 
     [27, 53, 70]] 

counts = pd.DataFrame(counts,columns=["A","B","C"]) 

counts["BorC"] = counts[["B","C"]].sum(axis=1) 
counts["n"] = counts[["A","B","C"]].sum(axis=1) 
print counts 

group = counts.index.values 
n_group = np.unique(group).size 

obs_B = counts.B.values 
obs_BorC = counts.BorC.values 
obs_n = counts.n.values 

obs_n_ = shared(obs_n) 

with pm.Model() as model: 
    beta0  = pm.Normal('beta0', 0.,sd=5.) 
    beta_group = hierarchical_normal('beta_group', n_group) 

    eta_BorC = beta0 + beta_group[group] 
    p_BorC  = pm.math.sigmoid(eta_BorC) 
    like_BorC = pm.Binomial('obs_BorC', obs_n_, p_BorC, observed=obs_BorC) 

    alpha0  = pm.Normal('alpha0', 0.,sd=5.) 
    alpha_group = hierarchical_normal('alpha_group', n_group) 

    eta_BgivenBorC = alpha0 + alpha_group[group] 
    p_BgivenBorC = pm.math.sigmoid(eta_BgivenBorC) 
    like_BgivenBorC = pm.Binomial('obs_BgivenBorC', obs_BorC, p_BgivenBorC, observed=obs_B) 

    p_B = p_BgivenBorC*p_BorC 
    like_B = pm.Binomial('obs_B', obs_n_, p_B, observed=obs_B) 

    trace = pm.sample(draws=ndraws, random_seed=SEED,nuts_kwargs=NUTS_KWARGS) 

obs_n_.set_value(np.array([10]*6)) 

pp_trace = pm.sample_ppc(trace, model=model) 
C = pp_trace['obs_BorC']-pp_trace['obs_B'] 
print np.min(np.min(C)) 
B = pp_trace['obs_B'] 
C = np.sum(C,axis=1) 
B = np.sum(B,axis=1) 
diff = C-B 
plt.figure() 
plt.hist(diff,50) 
plt.show() 

編輯:我從瀏覽,日誌似然自動通過,這意味着我的like_B的規格是多餘的所有變量總結的pymc3代碼中看到。在這種情況下,我想我只需要弄清楚如何正確設置obs_BorC來進行後採樣。

回答

0

這是一個嘗試的解決方案,如果沒有更好的解決方案,我認爲這是可行的解決方法。我只是做了一個自定義後驗採樣器,其中obs_BorC每次迭代都會被重置。

import numpy as np 
import pandas as pd 
import pymc3 as pm 
from scipy.special import logit 
import matplotlib.pyplot as plt 
from theano import shared 
from collections import defaultdict 

from scipy.stats import norm 

def hierarchical_normal(name, shape, mu=0.,cs=5.): 
    delta = pm.Normal('delta_{}'.format(name), 0., 1., shape=shape) 
    sigma = pm.HalfCauchy('sigma_{}'.format(name), cs) 

    return pm.Deterministic(name, mu + delta * sigma) 

NUTS_KWARGS = {'target_accept': 0.99} 
SEED = 4260026 # from random.org, for reproducibility 

np.random.seed(SEED) 

ndraws = 1000 

counts =[[19, 50, 37], 
     [21, 67, 55], 
     [11, 53, 38], 
     [17, 54, 45], 
     [24, 93, 66], 
     [27, 53, 70]] 

counts = pd.DataFrame(counts,columns=["A","B","C"]) 

counts["BorC"] = counts[["B","C"]].sum(axis=1) 
counts["n"] = counts[["A","B","C"]].sum(axis=1) 
print counts 

group = counts.index.values 
n_group = np.unique(group).size 

obs_B = counts.B.values 
obs_BorC = counts.BorC.values 
obs_n = counts.n.values 

obs_n_ = shared(obs_n) 
obs_BorC_ = shared(obs_BorC) 

with pm.Model() as model: 
    beta0  = pm.Normal('beta0', 0.,sd=5.) 
    beta_group = hierarchical_normal('beta_group', n_group) 

    eta_BorC = beta0 + beta_group[group] 
    p_BorC  = pm.math.sigmoid(eta_BorC) 
    like_BorC = pm.Binomial('obs_BorC', obs_n_, p_BorC, observed=obs_BorC) 

    alpha0  = pm.Normal('alpha0', 0.,sd=5.) 
    alpha_group = hierarchical_normal('alpha_group', n_group) 

    eta_BgivenBorC = alpha0 + alpha_group[group] 
    p_BgivenBorC = pm.math.sigmoid(eta_BgivenBorC) 
    like_BgivenBorC = pm.Binomial('obs_BgivenBorC', obs_BorC_, p_BgivenBorC, observed=obs_B) 

    trace = pm.sample(draws=ndraws, random_seed=SEED,nuts_kwargs=NUTS_KWARGS) 

#plt.figure() 
#axs = pm.forestplot(trace,varnames=['beta0','beta_group','alpha0','alpha_group']) 
#plt.savefig("Forest.png") 
#plt.close() 

obs_n_.set_value(np.array([10000]*6)) 
indices = np.random.randint(0, len(trace), 1000) 
ppc = defaultdict(list) 
for idx in indices: 
    param = trace[idx] 
    n_BorC = model["obs_BorC"].distribution.random(point=param,size=None) 
    obs_BorC_.set_value(np.array(n_BorC)) 

    n_B = model["obs_BgivenBorC"].distribution.random(point=param,size=None) 
    ppc["obs_BorC"].append(n_BorC) 
    ppc["obs_B"].append(n_B) 
pp_trace = {k: np.asarray(v) for k, v in ppc.items()} 
#pp_trace = pm.sample_ppc(trace, model=model,samples=20000) 
C = pp_trace['obs_BorC']-pp_trace['obs_B'] 
print np.min(np.min(C)) 
B = pp_trace['obs_B'] 
C = np.sum(C,axis=1) 
B = np.sum(B,axis=1) 
diff = (C-B)/60000. 
plt.figure() 
x = np.linspace(np.min(diff),np.max(diff),200) 
mean = np.mean(diff) 
std = np.std(diff) 
plt.hist(diff,50,normed=True) 
plt.plot(x,norm.pdf(x,mean,std)) 
plt.plot() 
plt.show()