0
tensorflow:tensorflow-GPU 0.12tensorflow執行python崩潰
蟒:anaconda4.2.9(python3.5)
GPU:Nvidia的940M(筆記本)(2GB)
OS:win7-64bit SP1
Cuda的:8.0
cudnn:5.0
IDE:PYC危害
MNIST測試下GPU(細胞神經網絡),但,當涉及到我自己的項目,蟒蛇crashes.I調試我的代碼,並發現功能「**session.run()**
」導致這種problem.The錯誤OK是:
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:586] Could not identify NUMA node of /job:localhost/replica:0/task:0/gpu:0, defaulting to 0. Your kernel may not have been built with NUMA support.
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\cuda\cuda_event.cc:49] Error polling for event status: failed to query event: CUDA_ERROR_LAUNCH_FAILED
F c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_event_mgr.cc:198] Unexpected Event status: 1
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\cuda\cuda_dnn.cc:385] could not create cudnn handle: **CUDNN_STATUS_INTERNAL_ERROR**
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\cuda\cuda_dnn.cc:352] could not destroy cudnn handle: **CUDNN_STATUS_BAD_PARAM**
F c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\kernels\conv_ops.cc:532] **Check failed: stream->parent()->GetConvolveAlgorithms(&algorithms)**
由於Mnist運行正常,因此我的GPU驅動程序,cuda和cudnn沒有任何缺陷。我真的不知道這個問題是如何發生的。
這是我的代碼:
import cv2
import os
import tensorflow as tf
import data_trans as dt
with tf.variable_scope('weights'):
weights={
# 60*60*3->60*60*32->30*30*32
'conv1':tf.get_variable('conv1',[5,5,3,32],initializer=tf.contrib.layers.xavier_initializer_conv2d()),
# 30*30*32->30*30*64->15*15*64
'conv2':tf.get_variable('conv2',[5,5,32,64],initializer=tf.contrib.layers.xavier_initializer_conv2d()),
# 15*15*64->12*12*128->6*6*128
'conv3':tf.get_variable('conv3',[4,4,64,128],initializer=tf.contrib.layers.xavier_initializer_conv2d()),
# 6*6*128->256
'fc1':tf.get_variable('fc1',[6*6*128,256],initializer=tf.contrib.layers.xavier_initializer()),
# 256->2
'fc2':tf.get_variable('fc2',[256,2],initializer=tf.contrib.layers.xavier_initializer())
}
with tf.variable_scope('biases'):
biases = {
'conv1':tf.get_variable('conv1',[32,],initializer=tf.constant_initializer(value=0.0,dtype=tf.float32)),
'conv2':tf.get_variable('conv2',[64,],initializer=tf.constant_initializer(value=0.0,dtype=tf.float32)),
'conv3':tf.get_variable('conv3',[128,],initializer=tf.constant_initializer(value=0.0,dtype=tf.float32)),
'fc1':tf.get_variable('fc1',[256,],initializer=tf.constant_initializer(value=0.0,dtype=tf.float32)),
'fc2':tf.get_variable('fc2',[2,],initializer=tf.constant_initializer(value=0.0,dtype=tf.float32))
}
def inference(images):
images = (tf.cast(images,tf.float32)/255)
conv1 = tf.nn.bias_add(tf.nn.conv2d(images,weights['conv1'],strides=[1,1,1,1],padding='SAME'),biases['conv1'])
relu1 = tf.nn.relu(conv1)
pool1 = tf.nn.max_pool(relu1,ksize=[1,2,2,1],strides=[1,2,2,1],padding='SAME')
conv2 = tf.nn.bias_add(tf.nn.conv2d(pool1,weights['conv2'],strides=[1,1,1,1],padding='SAME'),biases['conv2'])
relu1 = tf.nn.relu(conv2)
pool2 = tf.nn.max_pool(relu1,ksize=[1,2,2,1],strides=[1,2,2,1],padding='SAME')
conv3 = tf.nn.bias_add(tf.nn.conv2d(pool2,weights['conv3'],strides=[1,1,1,1],padding='VALID'),biases['conv3'])
relu3 = tf.nn.relu(conv3)
pool3 = tf.nn.max_pool(relu3,ksize=[1,2,2,1],strides=[1,2,2,1],padding='VALID')
flatten = tf.reshape(pool3,[-1,weights['fc1'].get_shape().as_list()[0]])
drop = tf.nn.dropout(flatten,0.5)
fc1 = tf.matmul(drop,weights['fc1']) + biases['fc1']
fc_relu1 = tf.nn.relu(fc1)
fc2 = tf.matmul(fc_relu1,weights['fc2']) + biases['fc2']
return fc2
def train():
dt.encode_to_tfrecords('../train_data/train.txt','../train_data','data.tfrecords',(60,60))
image,label = dt.decode_from_tfrecords('../train_data/data.tfrecords')
batch_image,batch_label = dt.get_batch(image,label,batch_size=10,crop_size=60)
inf = inference(batch_image)
predicts = tf.nn.softmax(inf)
cross_entropy = -tf.reduce_mean(batch_label * tf.log(predicts))
train_step = tf.train.GradientDescentOptimizer(1e-2).minimize(cross_entropy)
correct_prediction = tf.equal(tf.argmax(predicts, 1), tf.argmax(batch_label, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(coord=coord)
#if os.path.exists(os.path.join('model','model.ckpt')) is True:
# tf.train.Saver(max_to_keep=None).restore(sess,os.path.join('model','model.ckpt'))
for epcho in range(8):
print(sess.run(accuracy))
print('here!')
coord.request_stop()
coord.join(threads)
train()
data_trans.py包含三個功能用變換圖像tfrecords:
import cv2
import tensorflow as tf
def encode_to_tfrecords(label_file,data_root,new_name='data.tfrecords',resize=None):
writer = tf.python_io.TFRecordWriter(data_root + '/' + new_name)
num_example = 0
with open(label_file,'r') as f:
for l in f.readlines():
l = l.split()
path = data_root+'/'+l[0]
image = cv2.imread(path)
if resize is not None:
image = cv2.resize(image,resize)
height,width,nchannel = image.shape
label = int(l[1])
example = tf.train.Example(features=tf.train.Features(feature={
'height':tf.train.Feature(int64_list=tf.train.Int64List(value=[height])),
'width':tf.train.Feature(int64_list=tf.train.Int64List(value=[width])),
'nchannel':tf.train.Feature(int64_list=tf.train.Int64List(value=[nchannel])),
'image':tf.train.Feature(bytes_list=tf.train.BytesList(value=[image.tobytes()])),
'label':tf.train.Feature(int64_list=tf.train.Int64List(value=[label]))
}))
serialized = example.SerializeToString()
writer.write(serialized)
num_example += 1
print(label_file,'Sample_Num:',num_example)
writer.close()
#encode_to_tfrecords('../train_data/train.txt','../train_data')
def decode_from_tfrecords(filename,num_epoch=None):
filename_queue = tf.train.string_input_producer([filename],num_epoch)
reader = tf.TFRecordReader()
_,serialized = reader.read(filename_queue)
example = tf.parse_single_example(serialized,features={
'height':tf.FixedLenFeature([],tf.int64),
'width':tf.FixedLenFeature([],tf.int64),
'nchannel':tf.FixedLenFeature([],tf.int64),
'image':tf.FixedLenFeature([],tf.string),
'label':tf.FixedLenFeature([],tf.int64)
})
label = tf.cast(example['label'],tf.int32)
image = tf.decode_raw(example['image'],tf.uint8)
image = tf.reshape(image,tf.stack([
tf.cast(example['height'],tf.int32),
tf.cast(example['width'],tf.int32),
tf.cast(example['nchannel'],tf.int32)
]))
return image, label
#encode_to_tfrecords("../train_data/train.txt","../train_data",'data.tfrecords')
#image,label=decode_from_tfrecords('../train_data/data.tfrecords')
#print image[0]
def get_batch(image,label,batch_size,crop_size):
distorted_image = tf.random_crop(image,[crop_size, crop_size, 3])
distorted_image = tf.image.random_flip_up_down(distorted_image)
images,label_batch = tf.train.shuffle_batch([distorted_image,label],batch_size=batch_size,capacity=130,min_after_dequeue=100)
return images,tf.one_hot(tf.reshape(label_batch,[batch_size]), 2)
你安裝了GPU版本或CPU版本? – Eliethesaiyan
它似乎也可能是由cuda和tensorflow的版本引起的https://github.com/tensorflow/tensorflow/issues/2033 – Eliethesaiyan
感謝您的回覆。什麼是GPU版本?如果我的cuda或tensorflow版本不合適,Mnist如何正確運行? – spring