我修改了MNIST(28x28)Convnet教程代碼以接受更大的圖像(150x150)。但是,當我嘗試訓練我收到此錯誤(請參閱完整的堆棧跟蹤結束):爲什麼我的TensorFlow Convnet(嘗試)培訓會導致NaN漸變?
W tensorflow/core/common_runtime/executor.cc:1076] 0x2e97d30 Compute status: Invalid argument: ReluGrad input is not finite. : Tensor had NaN values
這裏是我的代碼。令人擔憂的是,當使用磁盤中的圖像數據時,會產生與生成嘈雜的紅色/藍色/綠色方塊並嘗試按顏色對它們進行分類相同的錯誤。生成RGB數據的代碼與掃描JPG圖像數據目錄的代碼不同。因此,無論是在我自己的數據中加載的系統錯誤的方式,還是我提出的架構都有問題。 (我可以包括這些模塊,但我擔心它可能會使這篇文章難以理解的長。)
編輯:我已經嘗試過這個相同的代碼適度更大的圖像(30x30),它工作。所以也許這個錯誤與(150x150)問題的高維度有關?
import tensorflow as tf
import numpy as np
import data.image_loader
###############################
##### Set hyperparameters #####
###############################
num_epochs = 2
width = 150
height = 150
num_categories = 2
num_channels = 3
batch_size = 100 # for my sanity
num_training_examples = 2000
num_test_examples = 200
num_batches = num_training_examples/batch_size
####################################################################################
##### It's convenient to define some methods to perform frequent routine tasks #####
####################################################################################
def weight_variable(shape):
'''
Generates a TensorFlow Tensor. This Tensor gets initialized with values sampled from the truncated normal
distribution. Its purpose will be to store model parameters.
:param shape: The dimensions of the desired Tensor
:return: The initialized Tensor
'''
initial = tf.truncated_normal(shape, stddev=0.1)
return tf.Variable(initial)
def bias_variable(shape):
'''
Generates a TensorFlow Tensor. This Tensor gets initialized with values sampled from <some?> distribution.
Its purpose will be to store bias values.
:param shape: The dimensions of the desired Tensor
:return: The initialized Tensor
'''
initial = tf.constant(0.1, shape=shape)
return tf.Variable(initial)
def conv2d(x, W):
'''
Generates a conv2d TensorFlow Op. This Op flattens the weight matrix (filter) down to 2D, then "strides" across the
input Tensor x, selecting windows/patches. For each little_patch, the Op performs a right multiply:
W . little_patch
and stores the result in the output layer of feature maps.
:param x: a minibatch of images with dimensions [batch_size, height, width, 3]
:param W: a "filter" with dimensions [window_height, window_width, input_channels, output_channels]
e.g. for the first conv layer:
input_channels = 3 (RGB)
output_channels = number_of_desired_feature_maps
:return: A TensorFlow Op that convolves the input x with the filter W.
'''
return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')
def max_pool_2x2(x):
'''
Genarates a max-pool TensorFlow Op. This Op "strides" a window across the input x. In each window, the maximum value
is selected and chosen to represent that region in the output Tensor. Hence the size/dimensionality of the problem
is reduced.
:param x: A Tensor with dimensions [batch_size, height, width, 3]
:return: A TensorFlow Op that max-pools the input Tensor, x.
'''
return tf.nn.max_pool(x, ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1], padding='SAME')
############################
##### Set up the model #####
############################
x = tf.placeholder("float", shape=[None, height, width, num_channels])
x_image = tf.reshape(x, [-1, width, height, num_channels])
y_ = tf.placeholder("float", shape=[None, num_categories])
#1st conv layer
W_conv1 = weight_variable([5, 5, num_channels, 32]) #5x5 conv window, 3 colour channels, 32 outputted feature maps
b_conv1 = bias_variable([32])
h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
h_pool1 = max_pool_2x2(h_conv1)
#2nd conv layer
W_conv2 = weight_variable([5, 5, 32, 64])
b_conv2 = bias_variable([64])
h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
h_pool2 = max_pool_2x2(h_conv2)
#fully connected layer
W_fc1 = weight_variable([38 * 38 * 64, 1024])
b_fc1 = bias_variable([1024])
h_pool2_flat = tf.reshape(h_pool2, [-1, 38*38*64])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)
#droupout
keep_prob = tf.placeholder("float")
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
#softmax output layer
W_fc2 = weight_variable([1024, num_categories])
b_fc2 = bias_variable([num_categories])
y_conv=tf.nn.softmax(tf.matmul(h_fc1_drop, W_fc2) + b_fc2)
#saving model
saver = tf.train.Saver()
###################################
##### Load data from the disk #####
###################################
dataset = data.image_loader.ImageLoad(base_path="/home/hal9000/Datasets/id_dataset3",
num_categories=num_categories,
width=width,
height=height)
data_training = np.asarray(np.split(dataset.data_training, num_batches))
labels_training = np.asarray(np.split(dataset.labels_training, num_batches))
data_test = np.split(dataset.data_test, 1)
labels_test = np.split(dataset.labels_test, 1)
####################################################
##### Train the model and evaluate performance #####
####################################################
with tf.Session() as sess:
cross_entropy = -tf.reduce_sum(y_*tf.log(y_conv))
#train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)
train_step = tf.train.AdamOptimizer(0.0005).minimize(cross_entropy)
correct_prediction = tf.equal(tf.argmax(y_conv,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
sess.run(tf.initialize_all_variables())
for j in range(num_epochs):
for i in range(num_batches):
train_step.run(feed_dict={x: np.asarray(data_training[i]), y_: np.asarray(labels_training[i]), keep_prob: 0.5})
print "=== EPOCH: " + str(j) + " ==="
print "test accuracy: %g \n"%accuracy.eval(feed_dict={x: data_test[i], y_: labels_test[i], keep_prob: 1.0})
saver.save(sess, "saved_models/convnet_image" + str(j) + ".ckpt")
錯誤:
I tensorflow/core/common_runtime/local_device.cc:40] Local device intra op parallelism threads: 8
I tensorflow/core/common_runtime/direct_session.cc:58] Direct session inter op parallelism threads: 8
W tensorflow/core/common_runtime/executor.cc:1076] 0xc8991e0 Compute status: Invalid argument: ReluGrad input is not finite. : Tensor had NaN values
[[Node: gradients/Relu_grad/Relu/CheckNumerics = CheckNumerics[T=DT_FLOAT, message="ReluGrad input is not finite.", _device="/job:localhost/replica:0/task:0/cpu:0"](add)]]
W tensorflow/core/common_runtime/executor.cc:1076] 0xc8991e0 Compute status: Invalid argument: ReluGrad input is not finite. : Tensor had NaN values
[[Node: gradients/Relu_1_grad/Relu_1/CheckNumerics = CheckNumerics[T=DT_FLOAT, message="ReluGrad input is not finite.", _device="/job:localhost/replica:0/task:0/cpu:0"](add_1)]]
W tensorflow/core/common_runtime/executor.cc:1076] 0xc8991e0 Compute status: Invalid argument: ReluGrad input is not finite. : Tensor had NaN values
[[Node: gradients/Relu_2_grad/Relu_2/CheckNumerics = CheckNumerics[T=DT_FLOAT, message="ReluGrad input is not finite.", _device="/job:localhost/replica:0/task:0/cpu:0"](add_2)]]
Traceback (most recent call last):
File "/home/hal9000/PycharmProjects/TensorFlow_Experiments_0.4/neural_nets/image_convnet.py", line 137, in <module>
train_step.run(feed_dict={x: np.asarray(data_training[i]), y_: np.asarray(labels_training[i]), keep_prob: 0.5})
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1325, in run
_run_using_default_session(self, feed_dict, self.graph, session)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 2945, in _run_using_default_session
session.run(operation, feed_dict)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 368, in run
results = self._do_run(target_list, unique_fetch_targets, feed_dict_string)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 444, in _do_run
e.code)
tensorflow.python.framework.errors.InvalidArgumentError: ReluGrad input is not finite. : Tensor had NaN values
[[Node: gradients/Relu_grad/Relu/CheckNumerics = CheckNumerics[T=DT_FLOAT, message="ReluGrad input is not finite.", _device="/job:localhost/replica:0/task:0/cpu:0"](add)]]
Caused by op u'gradients/Relu_grad/Relu/CheckNumerics', defined at:
File "/home/hal9000/PycharmProjects/TensorFlow_Experiments_0.4/neural_nets/image_convnet.py", line 131, in <module>
train_step = tf.train.AdamOptimizer(0.0005).minimize(cross_entropy)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/optimizer.py", line 186, in minimize
aggregation_method=aggregation_method)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/optimizer.py", line 232, in compute_gradients
aggregation_method=aggregation_method)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gradients.py", line 445, in gradients
in_grads = _AsList(grad_fn(op_wrapper, *out_grads))
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/nn_grad.py", line 126, in _ReluGrad
t = _VerifyTensor(op.inputs[0], op.name, "ReluGrad input is not finite.")
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/nn_grad.py", line 119, in _VerifyTensor
verify_input = array_ops.check_numerics(t, message=msg)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gen_array_ops.py", line 48, in check_numerics
name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/op_def_library.py", line 664, in apply_op
op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1834, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1043, in __init__
self._traceback = _extract_stack()
...which was originally created as op u'Relu', defined at:
File "/home/hal9000/PycharmProjects/TensorFlow_Experiments_0.4/neural_nets/image_convnet.py", line 82, in <module>
h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gen_nn_ops.py", line 547, in relu
return _op_def_lib.apply_op("Relu", features=features, name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/op_def_library.py", line 664, in apply_op
op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1834, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1043, in __init__
self._traceback = _extract_stack()
Process finished with exit code 1
嗨,我發現你的答案似乎適用於我。這是我關於這個問題:https://stackoverflow.com/questions/47247675/difference-between-distribute-and-local-machine-version-when-training-by-tensorf – yanachen
但我很困惑,爲什麼發生,可以如果有,你會提供一些提示或理解嗎?非常感謝。 – yanachen