
I want to implement layer normalization for a fully connected neural network with Keras. The problem I am running into is that the loss is always NaN and the network does not learn. Here is my code:

class DenseLN(Layer):
    def __init__(self, output_dim, init='glorot_uniform', activation='linear', weights=None,
                 W_regularizer=None, b_regularizer=None, activity_regularizer=None,
                 W_constraint=None, b_constraint=None, bias=True, input_dim=None, **kwargs):
        self.init = initializations.get(init)
        self.activation = activations.get(activation)
        self.output_dim = output_dim
        self.input_dim = input_dim
        self.epsilon = 1e-5

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)
        self.activity_regularizer = regularizers.get(activity_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.initial_weights = weights
        self.input_spec = [InputSpec(ndim=2)]

        if self.input_dim:
            kwargs['input_shape'] = (self.input_dim,)
        super(DenseLN, self).__init__(**kwargs)

    def ln(self, x):
        # layer normalization function
        m = K.mean(x, axis=0)
        std = K.sqrt(K.var(x, axis=0) + self.epsilon)
        x_normed = (x - m) / (std + self.epsilon)
        x_normed = self.gamma * x_normed + self.beta
        return x_normed

    def build(self, input_shape):
        assert len(input_shape) == 2
        input_dim = input_shape[1]
        self.input_spec = [InputSpec(dtype=K.floatx(),
                                     shape=(None, input_dim))]

        self.gamma = K.variable(np.ones(self.output_dim) * 0.2, name='{}_gamma'.format(self.name))
        self.beta = K.zeros((self.output_dim,), name='{}_beta'.format(self.name))

        self.W = self.init((input_dim, self.output_dim),
                           name='{}_W'.format(self.name))
        if self.bias:
            self.b = K.zeros((self.output_dim,),
                             name='{}_b'.format(self.name))
            self.trainable_weights = [self.W, self.gamma, self.beta, self.b]
        else:
            self.trainable_weights = [self.W, self.gamma, self.beta]

        self.regularizers = []
        if self.W_regularizer:
            self.W_regularizer.set_param(self.W)
            self.regularizers.append(self.W_regularizer)

        if self.bias and self.b_regularizer:
            self.b_regularizer.set_param(self.b)
            self.regularizers.append(self.b_regularizer)

        if self.activity_regularizer:
            self.activity_regularizer.set_layer(self)
            self.regularizers.append(self.activity_regularizer)

        self.constraints = {}
        if self.W_constraint:
            self.constraints[self.W] = self.W_constraint
        if self.bias and self.b_constraint:
            self.constraints[self.b] = self.b_constraint

        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
            del self.initial_weights

    def call(self, x, mask=None):
        output = K.dot(x, self.W)
        output = self.ln(output)
        #print (theano.tensor.shape(output))
        if self.bias:
            output += self.b
        return self.activation(output)

    def get_output_shape_for(self, input_shape):
        assert input_shape and len(input_shape) == 2
        return (input_shape[0], self.output_dim)

model = Sequential() 
model.add(Dense(12, activation='sigmoid', input_dim=12)) 
model.add(DenseLN(98, activation='sigmoid')) 
model.add(DenseLN(108, activation='sigmoid')) 
model.add(DenseLN(1)) 
adadelta = Adadelta(lr=0.1, rho=0.95, epsilon=1e-08) 
adagrad = Adagrad(lr=0.003, epsilon=1e-08) 

model.compile(loss='poisson',
              optimizer=adagrad,
              metrics=['accuracy'])

model.fit(X_train_scale,
          Y_train,
          batch_size=3000,
          callbacks=[history],
          nb_epoch=300)

Do you know what is wrong here and how I can fix it? Thanks in advance!

EDIT:

I have also tried some combinations of layers and noticed something weird. If both the input and output layers are plain Dense layers, the accuracy is very low, almost zero. But if the input layer is DenseLN, i.e. my custom layer, the accuracy starts at 0.6+ and then, after dozens of iterations, drops back to zero. In fact, I copied most of the code from the Dense layer; the only differences are the ln function and the self.ln(output) call inside call. Besides that, I also added gamma and beta to trainable_weights.
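To make the difference concrete, here is a small sketch (the tensor x is just a toy example, not my real data): my ln computes per-unit statistics across the batch (axis=0), while the layer normalization paper computes per-sample statistics across the units (axis=-1):

import numpy as np
from keras import backend as K

x = K.variable(np.random.rand(3, 5))          # toy batch: 3 samples, 5 units

# what ln() above does: mean of each unit taken across the batch
m_batch = K.mean(x, axis=0)                   # shape (5,)

# what the layer normalization paper describes: mean of each sample across its units
m_layer = K.mean(x, axis=-1, keepdims=True)   # shape (3, 1)

print(K.eval(m_batch).shape, K.eval(m_layer).shape)   # (5,) (3, 1)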

Any help is appreciated!


The problem was the objective.... when I changed it to 'binary entropy' it was fixed. – user5779223
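(In Keras that presumably means swapping the loss in compile for 'binary_crossentropy', e.g.:)

model.compile(loss='binary_crossentropy',   # instead of 'poisson'
              optimizer=adagrad,
              metrics=['accuracy'])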


I'd suggest you implement this as a standalone operation, separate from the Dense layer, similar to how batch normalization layers are usually implemented. It would also make the whole code simpler, since the layer wouldn't have any parameters of its own. I suggest you look at how BatchNorm is implemented in Keras: https://github.com/fchollet/keras/blob/master/keras/layers/normalization.py –

Answer


It is cleaner and more flexible if you implement this as a separate layer. Something like this should work:

# imports assumed for this snippet (Keras 2)
from keras import backend as K
from keras import initializers
from keras.layers import Layer

class LayerNorm(Layer):
    """ Layer Normalization in the style of https://arxiv.org/abs/1607.06450 """
    def __init__(self, scale_initializer='ones', bias_initializer='zeros', **kwargs):
        super(LayerNorm, self).__init__(**kwargs)
        self.epsilon = 1e-6
        self.scale_initializer = initializers.get(scale_initializer)
        self.bias_initializer = initializers.get(bias_initializer)

    def build(self, input_shape):
        self.scale = self.add_weight(shape=(input_shape[-1],),
                                     initializer=self.scale_initializer,
                                     trainable=True,
                                     name='{}_scale'.format(self.name))
        self.bias = self.add_weight(shape=(input_shape[-1],),
                                    initializer=self.bias_initializer,
                                    trainable=True,
                                    name='{}_bias'.format(self.name))
        self.built = True

    def call(self, x, mask=None):
        # normalize each sample across its features, then rescale and shift
        mean = K.mean(x, axis=-1, keepdims=True)
        std = K.std(x, axis=-1, keepdims=True)
        norm = (x - mean) * (1 / (std + self.epsilon))
        return norm * self.scale + self.bias

    def compute_output_shape(self, input_shape):
        return input_shape
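For example, a minimal usage sketch (assuming Keras 2; the layer sizes mirror the model in the question, and the normalization is applied before each activation, as DenseLN does):

from keras.models import Sequential
from keras.layers import Dense, Activation

model = Sequential()
model.add(Dense(98, input_dim=12))   # linear projection, like the W/b part of DenseLN
model.add(LayerNorm())               # normalize each sample across its 98 units
model.add(Activation('sigmoid'))
model.add(Dense(108))
model.add(LayerNorm())
model.add(Activation('sigmoid'))
model.add(Dense(1, activation='sigmoid'))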