圖形卡的OOM CUDA/Tensorflow

嗨，我在訓練一個模型時遇到了下面的錯誤信息。如果我理解正確，它是說顯存（VRAM）不足，但我用的是華碩 GTX 1080 A8G Gaming，顯存應該足夠。之前一切都運行正常，但突然就不能用了。我的深度網絡代碼如下：

# -*- coding: utf-8 -*- 
""" 
Created on Thu Jun 29 11:52:11 2017 
@author: tobia 
""" 
#importing pre_processing libaries 
import numpy as np 
from keras.models import load_model 
import os 
#importing Deep Learning Libaries 
from keras import layers 
from keras.models import Sequential 
from keras.callbacks import TensorBoard 
from keras.layers import Flatten,Dense,Conv2D,MaxPooling2D,Dropout,BatchNormalization,Activation 
def load_data():
    """Load every recorded chunk from disk and stack them into two arrays.

    The number of chunks is the number of entries in ``data/key_values``.
    Chunk ``i`` (1-based) is read from ``data/key_values/values_{i}.npy``
    and ``data/video/video_{i}.npy``.

    Returns:
        tuple: ``(key_values, picture_data)``. ``key_values`` is all key
        chunks concatenated along axis 0, starting from an empty ``(0, 8)``
        uint8 array. ``picture_data`` is all frame chunks concatenated along
        axis 0 and reshaped to ``(len(key_values), 60, 80, 1)``.

    Raises:
        ValueError: if a chunk's shape cannot be concatenated with the
            others, or the frame count does not match ``len(key_values)``
            in the final reshape.
    """
    chunk_count = len(os.listdir('data/key_values'))

    # Seed each list with the empty uint8 array the result is built from, so
    # an empty directory still yields correctly shaped arrays and the dtype
    # promotion matches appending onto that array.
    key_chunks = [np.empty((0, 8), dtype='uint8')]
    frame_chunks = [np.empty((0, 60, 80), dtype='uint8')]

    for index in range(1, chunk_count + 1):
        key_chunks.append(
            np.load('data/key_values/values_{0}.npy'.format(index)))
        frame_chunks.append(
            np.load('data/video/video_{}.npy'.format(index)))

    # Concatenate once: calling np.append per file copies the whole
    # accumulated array on every iteration.
    key_values = np.concatenate(key_chunks, axis=0)
    picture_data = np.concatenate(frame_chunks, axis=0)

    # Add the trailing single-channel axis expected by the Conv2D input.
    picture_data = picture_data.reshape((len(key_values), 60, 80, 1))
    return key_values, picture_data
class Network:
    """Single-convolution Keras classifier over 60x80x1 frames with 8 outputs."""

    def __init__(self):
        # Most recent model trained by start() or supplied to predict_key().
        self.model = None

    def model_1(self, picture_data, key_values):
        """Build, compile and summarise the network.

        Args:
            picture_data: unused; kept so existing callers keep working.
            key_values: unused; kept so existing callers keep working.

        Returns:
            The compiled ``Sequential`` model.
        """
        model = Sequential()
        # With Keras' default 'valid' padding the 11x11 convolution turns a
        # 60x80 frame into 50x70x96, and the stride-1 3x3 pooling into
        # 48x68x96 = 313344 features per sample. Those activations dominate
        # GPU memory: at float32 the convolution output alone is about
        # 1.3 GB for a batch of 1000 samples.
        model.add(Conv2D(96, 11, input_shape=(60, 80, 1), activation="relu"))
        model.add(MaxPooling2D(pool_size=3, strides=1))
        # NOTE(review): axis=1 is the height axis for this channels-last
        # input shape; axis=-1 would normalise per channel - confirm intent.
        model.add(BatchNormalization(axis=1))
        model.add(Flatten())
        model.add(Dense(units=8, activation="softmax"))
        model.compile(optimizer='adam', loss='categorical_crossentropy',
                      metrics=['accuracy'])

        model.summary()
        return model

    def start(self, picture_data, key_values, batch_size=1000, epochs=10):
        """Build a fresh model and train it with TensorBoard logging.

        Args:
            picture_data: training inputs passed to ``model.fit``.
            key_values: training targets passed to ``model.fit``.
            batch_size: samples per gradient step. Defaults to the previously
                hard-coded 1000; pass a smaller value if training fails with
                ``ResourceExhaustedError`` (GPU out of memory).
            epochs: number of training epochs (default 10, as before).

        Returns:
            The trained model, also stored on ``self.model``, so the caller
            can save it or use it for prediction.
        """
        model = self.model_1(picture_data, key_values)
        tbCallBack = TensorBoard(log_dir="./logs", histogram_freq=0,
                                 write_graph=True, write_images=True)
        model.fit(picture_data, key_values, batch_size=batch_size,
                  epochs=epochs, validation_split=0.1,
                  callbacks=[tbCallBack])
        self.model = model
        return model

    def predict_key(self, live_image, model):
        """Store ``model`` on the instance and return its prediction.

        Args:
            live_image: input passed to ``model.predict``.
            model: object exposing ``predict(data, batch_size=...)``.

        Returns:
            Whatever ``model.predict(live_image, batch_size=3)`` returns.
        """
        self.model = model
        return self.model.predict(live_image, batch_size=3)

# Entry point: ask whether to train a new model (N) or to continue training
# the model stored in Models/Modell.h5 (C).
input_k = input("Start new Training press: N or to contiune learning press C")
# Accept lower-case letters and surrounding whitespace instead of silently
# doing nothing when the answer is not exactly 'N' or 'C'.
choice = input_k.strip().upper()
if choice == 'N':
    key_values, picture_data = load_data()
    test = Network()
    # NOTE(review): nothing in this branch writes Models/Modell.h5, so the
    # 'C' option only works if that file already exists.
    test.start(picture_data, key_values)
elif choice == 'C':
    # Load the model first so a missing file fails before the data is read.
    model = load_model('Models/Modell.h5')
    visual = TensorBoard(log_dir="./logs", histogram_freq=0,
                         write_graph=True, write_images=True)
    key_values, picture_data = load_data()
    model.fit(picture_data, key_values, batch_size=1000, epochs=1,
              validation_split=0.1, callbacks=[visual])
    model.save("Models/Modell.h5")
else:
    print("Unrecognised choice {!r}; expected N or C.".format(input_k))

錯誤消息：

File "<ipython-input-1-73951c078cac>", line 1, in <module> 
    runfile('C:/Users/tobia/Desktop/Ai_Star/ai_train.py', wdir='C:/Users/tobia/Desktop/Ai_Star') 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\spyder\utils\site\sitecustomize.py", line 880, in runfile 
    execfile(filename, namespace) 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\spyder\utils\site\sitecustomize.py", line 102, in execfile 
    exec(compile(f.read(), filename, 'exec'), namespace) 
    File "C:/Users/tobia/Desktop/Ai_Star/ai_train.py", line 81, in <module> 
    test.start(picture_data,key_values) 
    File "C:/Users/tobia/Desktop/Ai_Star/ai_train.py", line 66, in start 
    model.fit(picture_data,key_values,batch_size = 1000,epochs =10,validation_split = 0.1,callbacks = [tbCallBack]) 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\keras\models.py", line 870, in fit 
    initial_epoch=initial_epoch) 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\keras\engine\training.py", line 1507, in fit 
    initial_epoch=initial_epoch) 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\keras\engine\training.py", line 1156, in _fit_loop 
    outs = f(ins_batch) 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\keras\backend\tensorflow_backend.py", line 2269, in __call__ 
    **self.session_kwargs) 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\client\session.py", line 789, in run 
    run_metadata_ptr) 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\client\session.py", line 997, in _run 
    feed_dict_string, options, run_metadata) 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\client\session.py", line 1132, in _do_run 
    target_list, options, run_metadata) 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\client\session.py", line 1152, in _do_call 
    raise type(e)(node_def, op, message) 
ResourceExhaustedError: OOM when allocating tensor with shape[313344,8] 
[[Node: gradients/dense_1/MatMul_grad/MatMul_1 = MatMul[T=DT_FLOAT, _class=["loc:@dense_1/MatMul"], transpose_a=true, transpose_b=false, _device="/job:localhost/replica:0/task:0/gpu:0"](flatten_1/Reshape, gradients/dense_1/Softmax_grad/mul_1)]] 
Caused by op 'gradients/dense_1/MatMul_grad/MatMul_1', defined at: 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\spyder\utils\ipython\start_kernel.py", line 231, in <module> 
    main() 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\spyder\utils\ipython\start_kernel.py", line 227, in main 
    kernel.start() 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\ipykernel\kernelapp.py", line 477, in start 
    ioloop.IOLoop.instance().start() 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\zmq\eventloop\ioloop.py", line 177, in start 
    super(ZMQIOLoop, self).start() 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tornado\ioloop.py", line 888, in start 
    handler_func(fd_obj, events) 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper 
    return fn(*args, **kwargs) 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\zmq\eventloop\zmqstream.py", line 440, in _handle_events 
    self._handle_recv() 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\zmq\eventloop\zmqstream.py", line 472, in _handle_recv 
    self._run_callback(callback, msg) 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\zmq\eventloop\zmqstream.py", line 414, in _run_callback 
    callback(*args, **kwargs) 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper 
    return fn(*args, **kwargs) 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\ipykernel\kernelbase.py", line 283, in dispatcher 
    return self.dispatch_shell(stream, msg) 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\ipykernel\kernelbase.py", line 235, in dispatch_shell 
    handler(stream, idents, msg) 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\ipykernel\kernelbase.py", line 399, in execute_request 
    user_expressions, allow_stdin) 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\ipykernel\ipkernel.py", line 196, in do_execute 
    res = shell.run_cell(code, store_history=store_history, silent=silent) 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\ipykernel\zmqshell.py", line 533, in run_cell 
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs) 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\IPython\core\interactiveshell.py", line 2698, in run_cell 
    interactivity=interactivity, compiler=compiler, result=result) 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\IPython\core\interactiveshell.py", line 2808, in run_ast_nodes 
    if self.run_code(code, result): 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\IPython\core\interactiveshell.py", line 2862, in run_code 
    exec(code_obj, self.user_global_ns, self.user_ns) 
    File "<ipython-input-1-73951c078cac>", line 1, in <module> 
    runfile('C:/Users/tobia/Desktop/Ai_Star/ai_train.py', wdir='C:/Users/tobia/Desktop/Ai_Star') 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\spyder\utils\site\sitecustomize.py", line 880, in runfile 
    execfile(filename, namespace) 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\spyder\utils\site\sitecustomize.py", line 102, in execfile 
    exec(compile(f.read(), filename, 'exec'), namespace) 
    File "C:/Users/tobia/Desktop/Ai_Star/ai_train.py", line 81, in <module> 
    test.start(picture_data,key_values) 
    File "C:/Users/tobia/Desktop/Ai_Star/ai_train.py", line 66, in start 
    model.fit(picture_data,key_values,batch_size = 1000,epochs =10,validation_split = 0.1,callbacks = [tbCallBack]) 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\keras\models.py", line 870, in fit 
    initial_epoch=initial_epoch) 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\keras\engine\training.py", line 1490, in fit 
    self._make_train_function() 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\keras\engine\training.py", line 1014, in _make_train_function 
    self.total_loss) 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\keras\optimizers.py", line 405, in get_updates 
    grads = self.get_gradients(loss, params) 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\keras\optimizers.py", line 71, in get_gradients 
    grads = K.gradients(loss, params) 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\keras\backend\tensorflow_backend.py", line 2307, in gradients 
    return tf.gradients(loss, variables, colocate_gradients_with_ops=True) 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\ops\gradients_impl.py", line 540, in gradients 
    grad_scope, op, func_call, lambda: grad_fn(op, *out_grads)) 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\ops\gradients_impl.py", line 346, in _MaybeCompile 
    return grad_fn() # Exit early 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\ops\gradients_impl.py", line 540, in <lambda> 
    grad_scope, op, func_call, lambda: grad_fn(op, *out_grads)) 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\ops\math_grad.py", line 825, in _MatMulGrad 
    grad_b = math_ops.matmul(a, grad, transpose_a=True) 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\ops\math_ops.py", line 1816, in matmul 
    a, b, transpose_a=transpose_a, transpose_b=transpose_b, name=name) 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\ops\gen_math_ops.py", line 1217, in _mat_mul 
    transpose_b=transpose_b, name=name) 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 767, in apply_op 
    op_def=op_def) 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\framework\ops.py", line 2506, in create_op 
    original_op=self._default_original_op, op_def=op_def) 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\framework\ops.py", line 1269, in __init__ 
    self._traceback = _extract_stack() 
...which was originally created as op 'dense_1/MatMul', defined at: 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\spyder\utils\ipython\start_kernel.py", line 231, in <module> 
    main() 
[elided 20 identical lines from previous traceback] 
    File "C:/Users/tobia/Desktop/Ai_Star/ai_train.py", line 81, in <module> 
    test.start(picture_data,key_values) 
    File "C:/Users/tobia/Desktop/Ai_Star/ai_train.py", line 64, in start 
    model = self.model_1(picture_data,key_values) 
    File "C:/Users/tobia/Desktop/Ai_Star/ai_train.py", line 57, in model_1 
    model.add(Dense(units = 8, activation ="softmax")) 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\keras\models.py", line 476, in add 
    output_tensor = layer(self.outputs[0]) 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\keras\engine\topology.py", line 596, in __call__ 
    output = self.call(inputs, **kwargs) 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\keras\layers\core.py", line 843, in call 
    output = K.dot(inputs, self.kernel) 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\keras\backend\tensorflow_backend.py", line 976, in dot 
    out = tf.matmul(x, y) 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\ops\math_ops.py", line 1816, in matmul 
    a, b, transpose_a=transpose_a, transpose_b=transpose_b, name=name) 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\ops\gen_math_ops.py", line 1217, in _mat_mul 
    transpose_b=transpose_b, name=name) 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 767, in apply_op 
    op_def=op_def) 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\framework\ops.py", line 2506, in create_op 
    original_op=self._default_original_op, op_def=op_def) 
    File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\framework\ops.py", line 1269, in __init__ 
    self._traceback = _extract_stack() 
ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[313344,8] 
[[Node: gradients/dense_1/MatMul_grad/MatMul_1 = MatMul[T=DT_FLOAT, _class=["loc:@dense_1/MatMul"], transpose_a=true, transpose_b=false, _device="/job:localhost/replica:0/task:0/gpu:0"](flatten_1/Reshape, gradients/dense_1/Softmax_grad/mul_1)]]

來源

2017-06-29 DrDeep

關於「之前一切都運行正常，但突然就不能用了」：也許有殭屍進程仍佔用著你的GPU顯存。通常可以用 `nvidia-smi` 查看。在這種情況下，一個可能的解決辦法是重新啓動。 –

您可能想看看https://stackoverflow.com/documentation/tensorflow/10621/tensorflow-gpu-setup/31879/control-the-gpu-memory-allocation#t=201706291458179435665來管理CUDA內存。 – npf

請重新啓動 Python 後再試一次。除非在代碼中明確指定，否則GPU顯存不會被釋放。有時候，在同一個 Python shell 中再次運行深度學習程序而沒有指定要使用的顯存量，就會出現這個OOM錯誤。請參閱這篇文章：

How to prevent tensorflow from allocating the totality of a GPU memory?

來源

2017-07-01 10:03:27 Azad

圖形卡的OOM CUDA/Tensorflow

回答

相關問題