GRU implementation in Theano

Based on the LSTM code provided in the official Theano tutorial (http://deeplearning.net/tutorial/code/lstm.py), I changed the LSTM layer code (i.e., the functions lstm_layer() and param_init_lstm()) to implement a GRU. The provided LSTM code trains well, but not the GRU I coded: with the LSTM, the accuracy on the training set rises to 1 (training cost = 0), while with the GRU it stagnates at 0.7 (training cost stagnates around 0.3).

Below is the code I use for the GRU. I kept the same function names as in the tutorial so that the code can be copy-pasted directly into it. What could explain the poor performance of the GRU?

import numpy as np
import theano
import theano.tensor as tensor
from theano import config
# ortho_weight(), _p() and numpy_floatX() are defined in the tutorial's lstm.py


def param_init_lstm(options, params, prefix='lstm'):
    """
    GRU
    """
    W = np.concatenate([ortho_weight(options['dim_proj']),   # weight matrix for the input in the reset gate
                        ortho_weight(options['dim_proj']),   # weight matrix for the input in the update gate
                        ortho_weight(options['dim_proj'])],  # weight matrix for the input in the candidate hidden state
                       axis=1)
    params[_p(prefix, 'W')] = W

    U = np.concatenate([ortho_weight(options['dim_proj']),   # weight matrix for the previous hidden state in the reset gate
                        ortho_weight(options['dim_proj']),   # weight matrix for the previous hidden state in the update gate
                        ortho_weight(options['dim_proj'])],  # weight matrix for the previous hidden state in the candidate hidden state
                       axis=1)
    params[_p(prefix, 'U')] = U

    b = np.zeros((3 * options['dim_proj'],))  # biases for the reset gate, update gate and candidate hidden state
    params[_p(prefix, 'b')] = b.astype(config.floatX)
    return params


def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None):
    nsteps = state_below.shape[0]
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]

    def _step(m_, x_, h_):
        preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
        preact += x_

        r = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj']))  # reset gate
        u = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj']))  # update gate

        U_h_t = _slice(tparams[_p(prefix, 'U')], 2, options['dim_proj'])
        x_h_t = _slice(x_, 2, options['dim_proj'])

        h_t_temp = tensor.tanh(tensor.dot(r * h_, U_h_t) + x_h_t)  # candidate hidden state
        h = (1. - u) * h_ + u * h_t_temp
        h = m_[:, None] * h + (1. - m_)[:, None] * h_  # keep the previous state where the mask is 0

        return h

    state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) +
                   tparams[_p(prefix, 'b')])

    dim_proj = options['dim_proj']
    rval, updates = theano.scan(_step,
                                sequences=[mask, state_below],
                                outputs_info=[tensor.alloc(numpy_floatX(0.),
                                                           n_samples,
                                                           dim_proj)],
                                name=_p(prefix, '_layers'),
                                n_steps=nsteps)

    return rval[0]
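
For reference, here is a plain-NumPy sketch of what a single _step() call above computes (a restatement of the code, assuming W and U have shape (dim, 3*dim) and b has shape (3*dim,), with the columns laid out as [reset | update | candidate]):

import numpy as np

def sigmoid(z):
    return 1. / (1. + np.exp(-z))

def gru_step(x_t, h_prev, W, U, b, dim):
    x_proj = np.dot(x_t, W) + b                       # precomputed outside scan as state_below
    preact = np.dot(h_prev, U) + x_proj
    r = sigmoid(preact[0 * dim:1 * dim])              # reset gate
    u = sigmoid(preact[1 * dim:2 * dim])              # update gate
    h_tilde = np.tanh(np.dot(r * h_prev, U[:, 2 * dim:3 * dim]) + x_proj[2 * dim:3 * dim])
    return (1. - u) * h_prev + u * h_tilde            # interpolate between previous state and candidate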

Answer


The problem comes from the last line, return rval[0]: it should be return rval instead.

The LSTM code provided in the official Theano tutorial (http://deeplearning.net/tutorial/code/lstm.py) uses return rval[0] because its outputs_info contains 2 elements:

rval, updates = theano.scan(_step,
                            sequences=[mask, state_below],
                            outputs_info=[tensor.alloc(numpy_floatX(0.),
                                                       n_samples,
                                                       dim_proj),
                                          tensor.alloc(numpy_floatX(0.),
                                                       n_samples,
                                                       dim_proj)],
                            name=_p(prefix, '_layers'),
                            n_steps=nsteps)
return rval[0]

In the GRU, outputs_info contains only one element:

outputs_info=[tensor.alloc(numpy_floatX(0.),
                           n_samples,
                           dim_proj)],

In that case, despite the brackets, scan does not return a list of Theano variables representing the scan outputs, but directly a single Theano variable. rval is then fed to a pooling layer (in this case, a mean pooling layer):

(figure: the tutorial's model graph, with rval feeding into the mean pooling layer)
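
That pooling step is a masked mean of the hidden states over the time dimension; a sketch of the operation (illustrative, not a verbatim quote from lstm.py):

# rval: (nsteps, n_samples, dim_proj), mask: (nsteps, n_samples)
proj = (rval * mask[:, :, None]).sum(axis=0) / mask.sum(axis=0)[:, None]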

By taking rval[0] in the GRU code, since rval is a single Theano variable and not a list of Theano variables, you remove the part shown in the red rectangle:

(figure: the same graph, with the part removed by rval[0] outlined in red)

This means you end up trying to perform the sentence classification using only the first word.
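
The inconsistency is easy to reproduce outside the tutorial; here is a minimal, self-contained sketch (the toy step functions are illustrative, not from the tutorial):

import theano
from theano import tensor

x = tensor.matrix('x')  # (nsteps, dim)

# two recurrent outputs -> rval is a Python list of two Theano variables
rval, _ = theano.scan(lambda x_, h_, c_: (h_ + x_, c_ - x_),
                      sequences=x,
                      outputs_info=[tensor.zeros_like(x[0]),
                                    tensor.zeros_like(x[0])])
print(type(rval))   # <class 'list'>; rval[0] selects the first output sequence

# one recurrent output -> rval is a single (nsteps, dim) Theano variable
rval, _ = theano.scan(lambda x_, h_: h_ + x_,
                      sequences=x,
                      outputs_info=[tensor.zeros_like(x[0])])
print(type(rval))   # a TensorVariable; rval[0] is only the first timestep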


Here is another GRU implementation, which can be plugged into the LSTM tutorial:

import numpy
import theano
import theano.tensor as tensor
# ortho_weight() and _p() are defined in the tutorial's lstm.py

# weight initializer, normal by default
def norm_weight(nin, nout=None, scale=0.01, ortho=True):
    if nout is None:
        nout = nin
    if nout == nin and ortho:
        W = ortho_weight(nin)
    else:
        W = scale * numpy.random.randn(nin, nout)
    return W.astype('float32')


def param_init_lstm(options, params, prefix='lstm'):
    """
    GRU. Source: https://github.com/kyunghyuncho/dl4mt-material/blob/master/session0/lm.py
    """
    nin = options['dim_proj']
    dim = options['dim_proj']

    # embedding to gates transformation weights, biases
    W = numpy.concatenate([norm_weight(nin, dim),
                           norm_weight(nin, dim)], axis=1)
    params[_p(prefix, 'W')] = W
    params[_p(prefix, 'b')] = numpy.zeros((2 * dim,)).astype('float32')

    # recurrent transformation weights for gates
    U = numpy.concatenate([ortho_weight(dim),
                           ortho_weight(dim)], axis=1)
    params[_p(prefix, 'U')] = U

    # embedding to hidden state proposal weights, biases
    Wx = norm_weight(nin, dim)
    params[_p(prefix, 'Wx')] = Wx
    params[_p(prefix, 'bx')] = numpy.zeros((dim,)).astype('float32')

    # recurrent transformation weights for hidden state proposal
    Ux = ortho_weight(dim)
    params[_p(prefix, 'Ux')] = Ux
    return params


def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None):
    nsteps = state_below.shape[0]

    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    dim = tparams[_p(prefix, 'Ux')].shape[1]

    if mask is None:
        mask = tensor.alloc(1., state_below.shape[0], 1)

    # utility function to slice a tensor
    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]

    # state_below is the input word embeddings
    # input to the gates, concatenated
    state_below_ = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) +
                    tparams[_p(prefix, 'b')])
    # input to compute the hidden state proposal
    state_belowx = (tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) +
                    tparams[_p(prefix, 'bx')])

    # step function to be used by scan
    # arguments: | sequences | outputs-info | non-seqs |
    def _step_slice(m_, x_, xx_, h_, U, Ux):
        preact = tensor.dot(h_, U)
        preact += x_

        # reset and update gates
        r = tensor.nnet.sigmoid(_slice(preact, 0, dim))
        u = tensor.nnet.sigmoid(_slice(preact, 1, dim))

        # compute the hidden state proposal
        preactx = tensor.dot(h_, Ux)
        preactx = preactx * r
        preactx = preactx + xx_

        # hidden state proposal
        h = tensor.tanh(preactx)

        # leaky integrate and obtain next hidden state
        h = u * h_ + (1. - u) * h
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        return h

    # prepare scan arguments
    seqs = [mask, state_below_, state_belowx]
    _step = _step_slice
    shared_vars = [tparams[_p(prefix, 'U')],
                   tparams[_p(prefix, 'Ux')]]

    init_state = tensor.unbroadcast(tensor.alloc(0., n_samples, dim), 0)

    rval, updates = theano.scan(_step,
                                sequences=seqs,
                                outputs_info=[init_state],
                                non_sequences=shared_vars,
                                name=_p(prefix, '_layers'),
                                n_steps=nsteps,
                                strict=True)
    return rval

As a side note, Keras handles this Theano API inconsistency as follows:

results, _ = theano.scan(
    _step, 
    sequences=inputs, 
    outputs_info=[None] + initial_states, 
    go_backwards=go_backwards) 

# deal with Theano API inconsistency 
if type(results) is list: 
    outputs = results[0] 
    states = results[1:] 
else: 
    outputs = results 
    states = []
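
The same guard could be added to the GRU layer itself; a minimal sketch, replacing the final return statement of lstm_layer() above:

# guard against the scan return type: a list when outputs_info has several entries,
# a single variable when it has only one (as in the GRU)
if isinstance(rval, list):
    return rval[0]
return rval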