Tensorflow Python has stopped working, running out of memory

2017-04-12

I am training a TensorFlow MLP on the CIFAR-100 dataset in Python, but when I execute the code, all I get is Windows 10 telling me "Python has stopped working". Can someone help me get batch_ys fed into the y placeholder and the code running? Here is the code (8-3.py):

import tensorflow as tf 
import numpy as np 
import matplotlib.pyplot as plt 
import time 
import os 
from read import unpickle 
dir = os.path.dirname(os.path.realpath(__file__)) 
from read_label import read_label 
current_batch = 0 
t1 = time.time() 
# MNIST loader left over from the tutorial (unused; CIFAR-100 is read via unpickle/read_label)

from tensorflow.examples.tutorials.mnist import input_data 
#mnist = input_data.read_data_sets(dir + "/MNIST_data/", one_hot=True) 

# Learning Parameters 
learning_rate = 0.001 
training_epochs = 1500 
batch_size = 5500 
display_step = 1 

# Network Parameters 
n_hidden_1 = 1024 # 1st layer num features 
n_hidden_2 = 1024 # 2nd layer num features 
n_hidden_3 = 1024 
n_hidden_4 = 1024 
n_input = 3072 # CIFAR-100 data input (img shape: 32*32*3)
n_classes = 100 # CIFAR-100 total classes

# tf Graph input 
x = tf.placeholder("float", [None, 3072]) 
y = tf.placeholder("float", [None, 100]) 

#weights layer 1 
h = tf.Variable(tf.random_normal([n_input, n_hidden_1])) 
#bias layer 1 
bias_layer_1 = tf.Variable(tf.random_normal([n_hidden_1])) 
#layer 1 
layer_1 = tf.nn.relu(tf.add(tf.matmul(x,h),bias_layer_1)) 

#weights layer 2 
w = tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])) 
#bias layer 2 
bias_layer_2 = tf.Variable(tf.random_normal([n_hidden_2])) 
#layer 2 
layer_2 = tf.nn.relu(tf.add(tf.matmul(layer_1,w),bias_layer_2)) 

h1 = tf.Variable(tf.random_normal([n_hidden_2, n_hidden_3])) 
bias_layer_3 = tf.Variable(tf.random_normal([n_hidden_3])) 
layer_3 = tf.nn.relu(tf.add(tf.matmul(layer_2, h1), bias_layer_3)) 

w1 = tf.Variable(tf.random_normal([n_hidden_3, n_hidden_4])) 
bias_layer_4 = tf.Variable(tf.random_normal([n_hidden_4])) 
layer_4 = tf.nn.relu(tf.add(tf.matmul(layer_3, w1), bias_layer_4)) 

#weights output layer 
output = tf.Variable(tf.random_normal([n_hidden_4, n_classes])) 
#bias output layer 
bias_output = tf.Variable(tf.random_normal([n_classes])) 
#output layer 
output_layer = tf.matmul(layer_4, output) + bias_output 

# cost function 
# cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(output_layer, y)) 
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=output_layer, labels=y)) 

# optimizer 
optimizer=tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost) 
#optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(cost) 

#Plot settings 
avg_set = [] 
epoch_set=[] 

# Initializing the variables 
# init = tf.initialize_all_variables() 
init = tf.global_variables_initializer() 

# Launch the graph 
with tf.Session() as sess: 

    sess.run(init) 

    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.0
        # trchou
        total_batch = int(50000/batch_size)
        # total_batch = 2000
        # Loop over all batches
        for i in range(total_batch):
            # trchou
            #batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            batch_xs = unpickle('train')[b'data'][current_batch:batch_size, :]
            #batch_ys = read_label('train')[current_batch:batch_size]
            batch_ys = tf.one_hot(read_label('train')[current_batch:batch_size], depth=100, dtype="float").eval()
            print(x)
            print(batch_ys)
            #print(read_label('train').shape)
            # Fit training using batch data
            sess.run(optimizer, feed_dict={x: batch_xs, y: batch_ys})
            # Compute average loss
            avg_cost += sess.run(cost, feed_dict={x: batch_xs, y: batch_ys})/total_batch
            current_batch += batch_size
        # Display logs per epoch step
        if epoch % display_step == 0:
            print("Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f}".format(avg_cost))
        avg_set.append(avg_cost)
        epoch_set.append(epoch+1)
        '''
        if(cost == 0.000000000):
            print("The cost value of this training has reached 0, exit? (y/n)")
            a = input()
            if(a == 'y'):
                print("You chose to break it.")
                break
            elif(a == 'n'):
                print("Training will continue.")
        '''
    t2 = time.time() 
    t_min = int((t2-t1)/60) 
    t_sec = int((t2-t1)%60) 
    print("Training phase finished, time elapsed {:d}min {:d}secs.".format(t_min, t_sec)) 

    # Plot the learning curve 
    plt.plot(epoch_set,avg_set, 'o', label='MLP Training phase') 
    plt.ylabel('cost') 
    plt.xlabel('epoch') 
    plt.legend() 
    plt.show() 

    # Save the model after learning 
    model_saver = tf.train.Saver() 
    model_saver.save(sess, "C:/cifar-model/my_model_mlp.chkp") 

    # Testing cycle 
    correct_prediction = tf.equal(tf.argmax(output_layer, 1), tf.argmax(y, 1)) 
    # Calculate accuracy 
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float")) 
    print("Model accuracy:", accuracy.eval({x: unpickle('test')[b'data'], y: read_label('test')})) 
''' 
# Restore model & testing 
with tf.Session() as sess: 

    model_saver.restore(sess, "C:/model-batchsize_55000_epoch_500_4_hiddens_learningrate_0.001/my_model_mlp.chkp") 
    print("Model restored.") 
    print("Initialized") 
    # Test model 
    correct_prediction = tf.equal(tf.argmax(output_layer, 1), tf.argmax(y, 1)) 
    # Calculate accuracy 
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float")) 
    print("Model accuracy:", accuracy.eval({x: batch_xs, y: batch_ys})) 
''' 

Traceback:

C:\Users\Administrator\learn_tensorflow\cifar-100-python>python 8-3.py 
I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\dso_loader.cc:135] 
successfully opened CUDA library cublas64_80.dll locally 
I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\dso_loader.cc:135] 
successfully opened CUDA library cudnn64_5.dll locally 
I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\dso_loader.cc:135] 
successfully opened CUDA library cufft64_80.dll locally 
I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\dso_loader.cc:135] 
successfully opened CUDA library nvcuda.dll locally 
I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\dso_loader.cc:135] 
successfully opened CUDA library curand64_80.dll locally 
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\framework\op_kernel.cc:943] 
OpKernel ('op: "BestSplits" device_type: "CPU"') for unknown op: BestSplits 
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\framework\op_kernel.cc:943] 
OpKernel ('op: "CountExtremelyRandomStats" device_type: "CPU"') for unknown op: CountExtremelyRandomStats 
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\framework\op_kernel.cc:943] 
OpKernel ('op: "FinishedNodes" device_type: "CPU"') for unknown op: FinishedNodes 
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\framework\op_kernel.cc:943] 
OpKernel ('op: "GrowTree" device_type: "CPU"') for unknown op: GrowTree 
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\framework\op_kernel.cc:943] 
OpKernel ('op: "ReinterpretStringToFloat" device_type: "CPU"') for unknown op: ReinterpretStringToFloat 
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\framework\op_kernel.cc:943] 
OpKernel ('op: "SampleInputs" device_type: "CPU"') for unknown op: SampleInputs 
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\framework\op_kernel.cc:943] 
OpKernel ('op: "ScatterAddNdim" device_type: "CPU"') for unknown op: ScatterAddNdim 
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\framework\op_kernel.cc:943] OpKernel ('op: "TopNInsert" device_type: "CPU"') for unknown op: TopNInsert 
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\framework\op_kernel.cc:943] 
OpKernel ('op: "TopNRemove" device_type: "CPU"') for unknown op: TopNRemove 
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\framework\op_kernel.cc:943] 
OpKernel ('op: "TreePredictions" device_type: "CPU"') for unknown op: TreePredictions 
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\framework\op_kernel.cc:943] 
OpKernel ('op: "UpdateFertileSlots" device_type: "CPU"') for unknown op: UpdateFertileSlots 
I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:885] Found device 0 with properties: 
name: GeForce GT 730 
major: 3 minor: 5 memoryClockRate (GHz) 0.9015 
pciBusID 0000:01:00.0 
Total memory: 2.00GiB 
Free memory: 1.66GiB 
I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:906] DMA: 0 
I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:916] 0: Y 
I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:975] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce GT 730, pci bus id: 0000:01:00.0) 
<class 'list'> 
Tensor("Placeholder:0", shape=(?, 3072), dtype=float32) 
<class 'list'> 
Tensor("Placeholder:0", shape=(?, 3072), dtype=float32) 
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\bfc_allocator.cc:244] tried to allocate 0 bytes 
W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\allocator_retry.cc:32] Request to allocate 0 bytes 
F c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:104] EigenAllocator for GPU ran out of memory when allocating 0. See error logs for more detailed info. 

C:\Users\Administrator\learn_tensorflow\cifar-100-python> 

Can anyone tell me what information is needed here to get an answer? – Cro

Answer


You are running out of memory, which means you are trying to feed too much data at each step.

That is because your batch_size is too high. Try a smaller value (say 32) and see whether it works; you can try higher values afterwards for performance reasons.
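As an illustration, here is a minimal sketch of such a loop (a hypothetical rewrite, not the asker's exact code). train_data and train_onehot stand for NumPy arrays holding all 50000 training images and their one-hot labels, built once before the loop, e.g. from unpickle('train')[b'data'] and read_label('train'):

batch_size = 32                    # far smaller than 5500
total_batch = 50000 // batch_size

for i in range(total_batch):
    start = i * batch_size         # walk through the training set
    end = start + batch_size       # in steps of batch_size samples
    batch_xs = train_data[start:end, :]      # shape (32, 3072)
    batch_ys = train_onehot[start:end, :]    # shape (32, 100)
    sess.run(optimizer, feed_dict={x: batch_xs, y: batch_ys})

Building the arrays once, outside the loop, also avoids re-reading the pickle file and re-evaluating tf.one_hot on every batch, as the posted code currently does.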


Batching is used to increase performance by parallelizing operations; in other words, you load more data at once to make things go faster. But loading more data has a cost in terms of memory: you have to fit it into GPU RAM.

You have two scenarios:

  • It is too slow and I am not using all of my GPU RAM => increase batch_size
  • I am running out of RAM => decrease batch_size

If you reach the point where even batch_size=1 still triggers OOM, then there is not enough RAM on your GPU:

  • try a simpler task with fewer dimensions
  • use the CPU instead of the GPU; you will then use CPU RAM, and you usually have more CPU RAM than GPU RAM (see the sketch after this list)
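If you want to try the CPU route, one way in TensorFlow 1.x (a sketch, not something from the original answer) is to create the session with no GPU devices visible:

# Hide all GPUs from this session; ops then run on the CPU
# and allocate from ordinary system RAM instead of GPU RAM.
config = tf.ConfigProto(device_count={'GPU': 0})

with tf.Session(config=config) as sess:
    sess.run(init)
    # ... same training loop as before ...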

batch_size 500 does not work, and batch_size 5 does not work either. – Cro


The output says the error is allocating 0 bytes of memory. – Cro


Tried that; it did nothing. – Cro