
I am trying to implement my own LSTM network. I have implemented the backpropagation algorithm, but it does not pass gradient checking, and I cannot find where the error is. Please help with this LSTM backprop gradient-check problem.

Here is the code in question:

def backward_propagation(self, x, y, cache):
    # T - the length of the sequence
    T = len(y)
    # perform forward propagation
    cache = self.forward_propagation(x)

    # ...

    # delta for output layer
    dy = cache['y'].copy()
    dy[np.arange(len(y)), y] -= 1.  # softmax loss gradient
    dhtmp = np.zeros((1, self.hidden_dim))
    dctmp = np.zeros((1, self.hidden_dim))

    for t in np.arange(T)[::-1]:
        dV += np.outer(dy[t], h[t].T)
        dhtmp = self.V.T.dot(dy[t])

        for bptt_step in np.arange(0, t+1)[::-1]:
            # add to gradients at each previous step
            do[bptt_step] = dhtmp * ct[bptt_step]
            dct[bptt_step] = dhtmp * o[bptt_step]

            dctmp += dct[bptt_step] * (1.0 - ct[bptt_step]**2)

            di[bptt_step] = dctmp * g[bptt_step]
            df[bptt_step] = dctmp * c[bptt_step-1]
            dg[bptt_step] = dctmp * i[bptt_step]

            # backprop activation functions
            diga[bptt_step] = di[bptt_step] * i[bptt_step] * (1.0 - i[bptt_step])
            dfga[bptt_step] = df[bptt_step] * f[bptt_step] * (1.0 - f[bptt_step])
            doga[bptt_step] = do[bptt_step] * o[bptt_step] * (1.0 - o[bptt_step])
            dgga[bptt_step] = dg[bptt_step] * (1.0 - g[bptt_step] ** 2)

            # backprop matrix multiply
            dWi += np.outer(diga[bptt_step], h[bptt_step-1])
            dWf += np.outer(dfga[bptt_step], h[bptt_step-1])
            dWo += np.outer(doga[bptt_step], h[bptt_step-1])
            dWg += np.outer(dgga[bptt_step], h[bptt_step-1])

            dUi[:, x[bptt_step]] += diga[bptt_step]
            dUf[:, x[bptt_step]] += dfga[bptt_step]
            dUo[:, x[bptt_step]] += doga[bptt_step]
            dUg[:, x[bptt_step]] += dgga[bptt_step]

            # update deltas for next step
            # here dh is accumulated as shared variable
            dhtmp = np.dot(self.Wi, diga[bptt_step])
            # dhtmp += np.dot(self.Wf, dfga[bptt_step]) <- is it needed to accumulate other dhtmp's?
            # dhtmp += np.dot(self.Wo, doga[bptt_step])
            # dhtmp += np.dot(self.Wg, dgga[bptt_step])
            dctmp = dctmp * f[bptt_step]

    return [dV, dWi, dWf, dWo, dWg, dUi, dUf, dUo, dUg]

I suspect I have made a mistake somewhere in the matrix-vector multiplications, or in how dhtmp and dctmp are updated.
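
For context, the gradient check I am using compares these analytic gradients against centered finite differences. Below is a minimal sketch of that kind of check (the loss_fn closure and the eps/tol values are illustrative assumptions, not my actual code):

import numpy as np

def gradient_check(param, analytic_grad, loss_fn, eps=1e-5, tol=1e-6):
    # numerically estimate d(loss)/d(param) with centered differences
    # and compare against the analytic gradient element by element
    num_grad = np.zeros_like(param)
    it = np.nditer(param, flags=['multi_index'])
    while not it.finished:
        idx = it.multi_index
        old = param[idx]
        param[idx] = old + eps
        loss_plus = loss_fn()    # loss with param[idx] nudged up
        param[idx] = old - eps
        loss_minus = loss_fn()   # loss with param[idx] nudged down
        param[idx] = old         # restore the original value
        num_grad[idx] = (loss_plus - loss_minus) / (2.0 * eps)
        it.iternext()
    # relative error between the analytic and numerical gradients
    denom = np.maximum(np.abs(analytic_grad) + np.abs(num_grad), 1e-8)
    rel_err = np.max(np.abs(analytic_grad - num_grad) / denom)
    return rel_err < tol, rel_err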

Answer


OK, after a while I finally figured it out: there was an extra inner loop. The code below works correctly (a short note on what changed follows it):

def backward_propagation(self, x, y, cache):
    # T - the length of the sequence
    T = len(y)
    # perform forward propagation
    cache = self.forward_propagation(x)

    # ...

    # delta for output layer
    dy = cache['y'].copy()
    dy[np.arange(len(y)), y] -= 1.0  # softmax loss gradient
    # print("dy: ", dy)
    dhtmp = np.zeros((1, self.hidden_dim))
    dh_prev = np.zeros((1, self.hidden_dim))
    dctmp = np.zeros((1, self.hidden_dim))

    for t in np.arange(T)[::-1]:
        dV += np.outer(dy[t], h[t].T)
        dhtmp = self.V.T.dot(dy[t]) + dh_prev

        # add to gradients at each step
        do[t] = dhtmp * ct[t]
        dct[t] = dhtmp * o[t]

        dctmp += dct[t] * (1.0 - ct[t]**2)

        di[t] = dctmp * g[t]
        df[t] = dctmp * c[t-1]
        dg[t] = dctmp * i[t]

        # backprop activation functions
        diga[t] = di[t] * i[t] * (1.0 - i[t])
        dfga[t] = df[t] * f[t] * (1.0 - f[t])
        doga[t] = do[t] * o[t] * (1.0 - o[t])
        dgga[t] = dg[t] * (1.0 - g[t] ** 2)

        # backprop matrix multiply
        dWi += np.outer(diga[t], h[t-1])
        dWf += np.outer(dfga[t], h[t-1])
        dWo += np.outer(doga[t], h[t-1])
        dWg += np.outer(dgga[t], h[t-1])

        dUi[:, x[t]] += diga[t]
        dUf[:, x[t]] += dfga[t]
        dUo[:, x[t]] += doga[t]
        dUg[:, x[t]] += dgga[t]

        # update deltas for next step
        # here dh is accumulated as shared variable
        dh_prev = np.dot(self.Wi.T, diga[t])
        dh_prev += np.dot(self.Wf.T, dfga[t])
        dh_prev += np.dot(self.Wo.T, doga[t])
        dh_prev += np.dot(self.Wg.T, dgga[t])
        dctmp = dctmp * f[t]

    return [dV, dWi, dWf, dWo, dWg, dUi, dUf, dUo, dUg]
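
Compared with my first attempt, the gradients are now computed in a single backward pass over t, dh_prev accumulates the contributions flowing back through all four gates (so the commented-out lines in my original code were indeed needed), and the cell-state gradient dctmp is carried to the previous step through the forget gate via dctmp * f[t].

To verify it, I run a check along the lines of the gradient_check sketch above for every parameter matrix; the wiring below is only a hypothetical illustration (model and model.compute_loss are placeholders for my own code):

# `model` is an instance of the LSTM class; model.compute_loss(x, y) stands in
# for the scalar softmax loss of the forward pass
grads = model.backward_propagation(x, y, None)
params = [model.V, model.Wi, model.Wf, model.Wo, model.Wg,
          model.Ui, model.Uf, model.Uo, model.Ug]
for p, analytic in zip(params, grads):
    passed, rel_err = gradient_check(p, analytic, lambda: model.compute_loss(x, y))
    print(passed, rel_err)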

Hope someone finds this answer useful.