0

技術信息:Theano GRU RNN 的 Adam 優化

操作系統:Mac OS X 10.9.5

IDE:Eclipse的Mars.1版本(4.5.1),用的PyDev和蟒蛇解釋器(語法版本3.4)

GPU:NVIDIA GeForce GT 650M

庫(Libs):numpy、aeosa、Sphinx-1.3.1、theano 0.7、NLTK-3.1

我的背景:我對 theano 和 numpy 都很陌生,也沒有上過機器學習或離散數學方面的正式課程。

我目前使用的自然語言處理循環神經網絡(RNN)取自這裏:

https://github.com/dennybritz/rnn-tutorial-gru-lstm/blob/master/gru_theano.py

對這個文件所做的唯一改動,是把對 theano.config.floatX 的引用替換爲字符串 'float32'。

我也使用存儲庫中包含的utils.py和train.py模塊,只做了很小的修改。

我計劃用來取代示例庫中自帶的 SGD / RMSprop 代碼的 Adam 優化器可以在這裏找到:3210

轉載如下(同樣把對 .config.floatX 的引用替換爲硬編碼的 'float32'):

import theano as th
from theano import shared as thsh
from theano import tensor as T
import numpy as np

def adam(loss, all_params, learning_rate=0.001, b1=0.9, b2=0.999, e=1e-8, gamma=1-1e-8):
    """Build the Theano update list for one ADAM optimisation step.

    Default values are taken from [Kingma2014].

    Args:
        loss: symbolic scalar expression to minimise.
        all_params: list of shared variables to optimise; each must support
            ``get_value()`` so its shape can be read.
        learning_rate: step size (``alpha`` in the paper).
        b1: decay rate of the first-moment running average.
        b2: decay rate of the second-moment running average.
        e: small constant added to the denominator for numerical stability.
        gamma: per-step decay applied to ``b1``.

    Returns:
        A list of ``(shared_variable, new_expression)`` pairs suitable for the
        ``updates`` argument of ``theano.function``. For every parameter it
        contains the first-moment, second-moment and parameter update, in
        that order, followed by a single update of the step counter.

    References:
        [Kingma2014] Kingma, Diederik, and Jimmy Ba.
        "Adam: A Method for Stochastic Optimization."
        arXiv preprint arXiv:1412.6980 (2014).
        http://arxiv.org/pdf/1412.6980v4.pdf
    """
    updates = []
    all_grads = th.grad(loss, all_params)
    alpha = learning_rate
    # Step counter kept in a shared variable so it survives between calls.
    t = thsh(np.float32(1))
    b1_t = b1*gamma**(t-1)  # Decay the first moment running average coefficient

    for theta_previous, g in zip(all_params, all_grads):
        # ``shape`` is a plain tuple and has no ``astype``; the dtype must be
        # given to ``np.zeros`` itself.
        shape = theta_previous.get_value().shape
        m_previous = thsh(np.zeros(shape, dtype='float32'))
        v_previous = thsh(np.zeros(shape, dtype='float32'))

        m = b1_t*m_previous + (1 - b1_t)*g  # Update biased first moment estimate
        v = b2*v_previous + (1 - b2)*g**2   # Update biased second raw moment estimate
        m_hat = m/(1 - b1**t)               # Bias-corrected first moment estimate
        v_hat = v/(1 - b2**t)               # Bias-corrected second raw moment estimate
        theta = theta_previous - (alpha * m_hat)/(T.sqrt(v_hat) + e)  # Update parameters

        updates.append((m_previous, m))
        updates.append((v_previous, v))
        updates.append((theta_previous, theta))
    updates.append((t, t + 1.))
    return updates

我的問題是這樣的:

你會如何修改 GRUTheano 模塊,讓它使用上面的 Adam 方法來代替內置的 SGD / rmsprop 功能?

看起來主要的改動會在 GRUTheano 的第 99-126 行:

# Excerpt from the body of a GRUTheano method: builds the rmsprop training step.
# NOTE(review): x, y, E, U, W, V, b, c, the gradients dE..dc and the caches
# self.mE..self.mc are defined outside this excerpt - presumably the symbolic
# inputs, the shared parameters, their gradients and shared cache variables.
# SGD parameters 
    # Symbolic scalars supplied by the caller on every training step.
    learning_rate = T.scalar('learning_rate') 
    decay = T.scalar('decay') 

    # rmsprop cache updates 
    # Each cache becomes a decayed running average of the squared gradient:
    # new_cache = decay * old_cache + (1 - decay) * grad ** 2
    mE = decay * self.mE + (1 - decay) * dE ** 2 
    mU = decay * self.mU + (1 - decay) * dU ** 2 
    mW = decay * self.mW + (1 - decay) * dW ** 2 
    mV = decay * self.mV + (1 - decay) * dV ** 2 
    mb = decay * self.mb + (1 - decay) * db ** 2 
    mc = decay * self.mc + (1 - decay) * dc ** 2 

    # Compiled training step: takes (x, y, learning_rate[, decay=0.9]), returns
    # no outputs, and works purely through the updates below.
    self.sgd_step = theano.function(
     [x, y, learning_rate, theano.Param(decay, default=0.9)], 
     [], 
     # Parameter updates: step = learning_rate * grad / sqrt(new_cache + 1e-6);
     # the 1e-6 keeps the denominator away from zero.
     updates=[(E, E - learning_rate * dE/T.sqrt(mE + 1e-6)), 
       (U, U - learning_rate * dU/T.sqrt(mU + 1e-6)), 
       (W, W - learning_rate * dW/T.sqrt(mW + 1e-6)), 
       (V, V - learning_rate * dV/T.sqrt(mV + 1e-6)), 
       (b, b - learning_rate * db/T.sqrt(mb + 1e-6)), 
       (c, c - learning_rate * dc/T.sqrt(mc + 1e-6)), 
       # Store the new caches back for use in the next step.
       (self.mE, mE), 
       (self.mU, mU), 
       (self.mW, mW), 
       (self.mV, mV), 
       (self.mb, mb), 
       (self.mc, mc) 
       ]) 

回答

0

我沒有測試過這段代碼,但你唯一需要改的就是讓 updates 使用 adam(..),而不是這裏原有的更新規則,所以像下面這樣應該可行(完整代碼如下;我們需要去掉 rmsprop 相關的部分):

import numpy as np 
import theano as theano 
import theano.tensor as T 
from theano.gradient import grad_clip 
import time 
import operator 

class GRUTheano(object):
    """Two-layer GRU language model built with Theano and trained with ADAM.

    The constructor allocates the parameters as Theano shared variables and
    then compiles the prediction, loss, gradient and training functions in
    ``__theano_build__``.

    Attributes:
        word_dim: vocabulary size.
        hidden_dim: size of each GRU hidden state.
        bptt_truncate: forwarded to ``theano.scan`` as ``truncate_gradient``.
        E: embedding matrix, shape (hidden_dim, word_dim).
        U, W: input and recurrent weights, shape (6, hidden_dim, hidden_dim);
            indices 0-2 are used by GRU layer 1 and 3-5 by GRU layer 2.
        V: output projection, shape (word_dim, hidden_dim).
        b: gate biases, shape (6, hidden_dim).
        c: output bias, shape (word_dim,).
        params: list ``[E, U, W, V, b, c]`` handed to the optimiser.
        predict, predict_class, ce_error, bptt, sgd_step: compiled Theano
            functions created by ``__theano_build__``.
    """

    def __init__(self, word_dim, hidden_dim=128, bptt_truncate=-1):
        """Initialise the parameters and compile the Theano functions.

        Args:
            word_dim: vocabulary size.
            hidden_dim: size of each GRU hidden state.
            bptt_truncate: passed to ``theano.scan`` as ``truncate_gradient``.
        """
        # Assign instance variables
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate
        # Initialize the network parameters: uniform in +/- sqrt(1/fan),
        # biases start at zero.
        E = np.random.uniform(-np.sqrt(1./word_dim), np.sqrt(1./word_dim), (hidden_dim, word_dim))
        U = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (6, hidden_dim, hidden_dim))
        W = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (6, hidden_dim, hidden_dim))
        V = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (word_dim, hidden_dim))
        b = np.zeros((6, hidden_dim))
        c = np.zeros(word_dim)
        # Theano: create shared variables
        self.E = theano.shared(name='E', value=E.astype(theano.config.floatX))
        self.U = theano.shared(name='U', value=U.astype(theano.config.floatX))
        self.W = theano.shared(name='W', value=W.astype(theano.config.floatX))
        self.V = theano.shared(name='V', value=V.astype(theano.config.floatX))
        self.b = theano.shared(name='b', value=b.astype(theano.config.floatX))
        self.c = theano.shared(name='c', value=c.astype(theano.config.floatX))
        # We store the Theano graph here
        self.theano = {}
        self.__theano_build__()

    def __theano_build__(self):
        """Build the symbolic graph and compile the Theano functions.

        Sets ``predict``, ``predict_class``, ``ce_error``, ``bptt``,
        ``params`` and ``sgd_step`` on the instance. ``sgd_step`` takes only
        ``(x, y)``; the step size is whatever ``adam`` uses by default.
        """
        E, V, U, W, b, c = self.E, self.V, self.U, self.W, self.b, self.c

        x = T.ivector('x')
        y = T.ivector('y')

        def forward_prop_step(x_t, s_t1_prev, s_t2_prev):
            """Compute one time step: output distribution and both new states."""
            # Word embedding layer
            x_e = E[:, x_t]

            # GRU Layer 1: update gate z, reset gate r, candidate state c
            z_t1 = T.nnet.hard_sigmoid(U[0].dot(x_e) + W[0].dot(s_t1_prev) + b[0])
            r_t1 = T.nnet.hard_sigmoid(U[1].dot(x_e) + W[1].dot(s_t1_prev) + b[1])
            c_t1 = T.tanh(U[2].dot(x_e) + W[2].dot(s_t1_prev * r_t1) + b[2])
            s_t1 = (T.ones_like(z_t1) - z_t1) * c_t1 + z_t1 * s_t1_prev

            # GRU Layer 2: same structure, fed by the layer-1 state
            z_t2 = T.nnet.hard_sigmoid(U[3].dot(s_t1) + W[3].dot(s_t2_prev) + b[3])
            r_t2 = T.nnet.hard_sigmoid(U[4].dot(s_t1) + W[4].dot(s_t2_prev) + b[4])
            c_t2 = T.tanh(U[5].dot(s_t1) + W[5].dot(s_t2_prev * r_t2) + b[5])
            s_t2 = (T.ones_like(z_t2) - z_t2) * c_t2 + z_t2 * s_t2_prev

            # Final output calculation
            # Theano's softmax returns a matrix with one row, we only need the row
            o_t = T.nnet.softmax(V.dot(s_t2) + c)[0]

            return [o_t, s_t1, s_t2]

        # Run the step over the whole input sequence; both hidden states
        # start at zero and the output has no recurrence of its own.
        [o, _s1, _s2], _scan_updates = theano.scan(
            forward_prop_step,
            sequences=x,
            truncate_gradient=self.bptt_truncate,
            outputs_info=[None,
                          dict(initial=T.zeros(self.hidden_dim)),
                          dict(initial=T.zeros(self.hidden_dim))])

        prediction = T.argmax(o, axis=1)
        o_error = T.sum(T.nnet.categorical_crossentropy(o, y))

        # Total cost (could add regularization here)
        cost = o_error

        # Gradients
        dE = T.grad(cost, E)
        dU = T.grad(cost, U)
        dW = T.grad(cost, W)
        db = T.grad(cost, b)
        dV = T.grad(cost, V)
        dc = T.grad(cost, c)

        # Assign functions
        self.predict = theano.function([x], o)
        self.predict_class = theano.function([x], prediction)
        self.ce_error = theano.function([x, y], cost)
        self.bptt = theano.function([x, y], [dE, dU, dW, db, dV, dc])

        self.params = [self.E, self.U, self.W, self.V, self.b, self.c]

        # Training step: no outputs, all work happens through the ADAM updates.
        adam_updates = adam(cost, self.params)
        self.sgd_step = theano.function(
            inputs=[x, y],
            outputs=[],
            updates=adam_updates
        )

    def calculate_total_loss(self, X, Y):
        """Return the summed cross-entropy error over all (x, y) pairs."""
        return np.sum([self.ce_error(x, y) for x, y in zip(X, Y)])

    def calculate_loss(self, X, Y):
        """Return the total loss divided by the number of target words."""
        # Divide calculate_loss by the number of words
        num_words = np.sum([len(y) for y in Y])
        return self.calculate_total_loss(X, Y)/float(num_words)


def adam(loss, all_params, learning_rate=0.001, b1=0.9, b2=0.999, e=1e-8,
         gamma=1-1e-8):
    """Return the Theano updates that perform one ADAM step on ``all_params``.

    Defaults follow Kingma & Ba, "Adam: A Method for Stochastic
    Optimization", arXiv:1412.6980 (2014),
    http://arxiv.org/pdf/1412.6980v4.pdf

    Args:
        loss: symbolic scalar expression to minimise.
        all_params: list of shared variables to optimise.
        learning_rate: step size.
        b1: decay rate of the first-moment average.
        b2: decay rate of the second-moment average.
        e: constant added to the denominator.
        gamma: per-step decay applied to ``b1``.

    Returns:
        List of ``(shared_variable, new_expression)`` pairs: for every
        parameter its first moment, second moment and new value, and finally
        the incremented step counter.
    """
    gradients = theano.grad(loss, all_params)
    step = theano.shared(np.float32(1))
    # First-moment coefficient, decayed a little more on every step.
    beta1_now = b1 * gamma ** (step - 1)

    def _zero_buffer(param):
        # Shared accumulator with the parameter's shape, stored as floatX.
        shape = param.get_value().shape
        return theano.shared(np.zeros(shape, dtype=theano.config.floatX))

    updates = []
    for param, grad in zip(all_params, gradients):
        first_moment = _zero_buffer(param)
        second_moment = _zero_buffer(param)

        # Biased running estimates of the gradient and its square.
        new_first = beta1_now * first_moment + (1 - beta1_now) * grad
        new_second = b2 * second_moment + (1 - b2) * grad ** 2
        # Bias-corrected estimates.
        first_corrected = new_first / (1 - b1 ** step)
        second_corrected = new_second / (1 - b2 ** step)
        new_param = param - (learning_rate * first_corrected) / (T.sqrt(second_corrected) + e)

        updates.extend([
            (first_moment, new_first),
            (second_moment, new_second),
            (param, new_param),
        ])

    updates.append((step, step + 1.))
    return updates