ResourceExhaustedError during training
17 May 2019

I am training a neural network to segment volumetric medical images. Each image is 192x192x160 (one channel) and I train with a batch size of 2. My problem is that I get a ResourceExhaustedError at random points during training (sometimes after 4 epochs, sometimes after 10, and sometimes no error is raised at all). I am using Keras 2.1.1 with TensorFlow 1.3.0 on Ubuntu 16.04.5 LTS (Xenial Xerus). The code runs on an NVIDIA GeForce GTX 1080 Ti GPU.

In the code below, train_images is a 100x192x192x160 numpy array, train_masks is a 100x192x192x160x3 numpy array, val_images is a 30x192x192x160 numpy array, and val_masks is a 30x192x192x160x3 numpy array.

How can I make sure my program is not stopped by this ResourceExhaustedError? Feel free to ask if you think any additional information could help!

I have already tried killing all PIDs running on my GPU (as listed by nvidia-smi) from the command line before starting training.
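For reference, this is how the session could be configured so GPU memory is allocated on demand instead of being reserved up front (a minimal sketch, assuming TF 1.x with the Keras TensorFlow backend; I have not verified that this prevents the error):

    import tensorflow as tf
    from keras import backend as K

    # Ask TensorFlow to grow its GPU allocation as needed instead of all at once
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    K.set_session(tf.Session(config=config))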

history = model.fit(train_images,
                        train_masks,
                        batch_size=params['batch_size'],
                        epochs=params['epochs'],
                        verbose=1,
                        shuffle=True,
                        validation_data = (val_images, val_masks))
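
A rough back-of-the-envelope check of the tensor named in the traceback below, assuming float32 activations, shows that it alone is about 1.4 GiB:

    # Size of the [2, 32, 192, 192, 160] activation tensor from the OOM message,
    # assuming 4-byte float32 values
    elements = 2 * 32 * 192 * 192 * 160
    print(elements * 4 / 2**30)  # ~1.41 GiB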

ResourceExhaustedError                    Traceback (most recent call last)
/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
   1326     try:
-> 1327       return fn(*args)
   1328     except errors.OpError as e:

/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/client/session.py in _run_fn(session, feed_dict, fetch_list, target_list, options, run_metadata)
   1305                                    feed_dict, fetch_list, target_list,
-> 1306                                    status, run_metadata)
   1307 

/export/share/anaconda3/lib/python3.6/contextlib.py in __exit__(self, type, value, traceback)
     87             try:
---> 88                 next(self.gen)
     89             except StopIteration:

/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/errors_impl.py in raise_exception_on_not_ok_status()
    465           compat.as_text(pywrap_tensorflow.TF_Message(status)),
--> 466           pywrap_tensorflow.TF_GetCode(status))
    467   finally:

ResourceExhaustedError: OOM when allocating tensor with shape[2,32,192,192,160]
     [[Node: training/Adam/gradients/conv3d_17/convolution_grad/Conv3DBackpropInputV2 = Conv3DBackpropInputV2[T=DT_FLOAT, _class=["loc:@conv3d_17/convolution"], data_format="NDHWC", padding="SAME", strides=[1, 1, 1, 1, 1], _device="/job:localhost/replica:0/task:0/gpu:0"](training/Adam/gradients/conv3d_17/convolution_grad/Shape, conv3d_17/kernel/read, training/Adam/gradients/conv3d_17/add_grad/Reshape)]]

During handling of the above exception, another exception occurred:

ResourceExhaustedError                    Traceback (most recent call last)
<ipython-input-9-6fa9876b44a1> in <module>()
----> 1 train(images[0:params['n_images'],:,:,:], masks[0:params['n_images'],:,:,:,:], images[100:130,:,:,:], masks[100:130,:,:,:,:], params)

<ipython-input-7-6118d16d7eba> in train(train_images, train_masks, val_images, val_masks, params)
     78                         verbose=1,
     79                         shuffle=True,
---> 80                         validation_data = (val_images, val_masks))
     81                         #callbacks=[model_checkpoint, earlystopping])
     82     model.save('results/' + params2name(params) + '/weights.h5')

/export/share/anaconda3/lib/python3.6/site-packages/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, **kwargs)
   1648                               initial_epoch=initial_epoch,
   1649                               steps_per_epoch=steps_per_epoch,
-> 1650                               validation_steps=validation_steps)
   1651 
   1652     def evaluate(self, x=None, y=None,

/export/share/anaconda3/lib/python3.6/site-packages/keras/engine/training.py in _fit_loop(self, f, ins, out_labels, batch_size, epochs, verbose, callbacks, val_f, val_ins, shuffle, callback_metrics, initial_epoch, steps_per_epoch, validation_steps)
   1211                     batch_logs['size'] = len(batch_ids)
   1212                     callbacks.on_batch_begin(batch_index, batch_logs)
-> 1213                     outs = f(ins_batch)
   1214                     if not isinstance(outs, list):
   1215                         outs = [outs]

/export/share/anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py in __call__(self, inputs)
   2350         session = get_session()
   2351         updated = session.run(fetches=fetches, feed_dict=feed_dict,
-> 2352                               **self.session_kwargs)
   2353         return updated[:len(self.outputs)]
   2354 

/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/client/session.py in run(self, fetches, feed_dict, options, run_metadata)
    893     try:
    894       result = self._run(None, fetches, feed_dict, options_ptr,
--> 895                          run_metadata_ptr)
    896       if run_metadata:
    897         proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)

/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/client/session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
   1122     if final_fetches or final_targets or (handle and feed_dict_tensor):
   1123       results = self._do_run(handle, final_targets, final_fetches,
-> 1124                              feed_dict_tensor, options, run_metadata)
   1125     else:
   1126       results = []

/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/client/session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
   1319     if handle is None:
   1320       return self._do_call(_run_fn, self._session, feeds, fetches, targets,
-> 1321                            options, run_metadata)
   1322     else:
   1323       return self._do_call(_prun_fn, self._session, handle, feeds, fetches)

/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
   1338         except KeyError:
   1339           pass
-> 1340       raise type(e)(node_def, op, message)
   1341 
   1342   def _extend_graph(self):

ResourceExhaustedError: OOM when allocating tensor with shape[2,32,192,192,160]
     [[Node: training/Adam/gradients/conv3d_17/convolution_grad/Conv3DBackpropInputV2 = Conv3DBackpropInputV2[T=DT_FLOAT, _class=["loc:@conv3d_17/convolution"], data_format="NDHWC", padding="SAME", strides=[1, 1, 1, 1, 1], _device="/job:localhost/replica:0/task:0/gpu:0"](training/Adam/gradients/conv3d_17/convolution_grad/Shape, conv3d_17/kernel/read, training/Adam/gradients/conv3d_17/add_grad/Reshape)]]

Caused by op 'training/Adam/gradients/conv3d_17/convolution_grad/Conv3DBackpropInputV2', defined at:
  File "/export/share/anaconda3/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/export/share/anaconda3/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/export/share/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/export/share/anaconda3/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/export/share/anaconda3/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 477, in start
    ioloop.IOLoop.instance().start()
  File "/export/share/anaconda3/lib/python3.6/site-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "/export/share/anaconda3/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/export/share/anaconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 450, in _handle_events
    self._handle_recv()
  File "/export/share/anaconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 480, in _handle_recv
    self._run_callback(callback, msg)
  File "/export/share/anaconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 432, in _run_callback
    callback(*args, **kwargs)
  File "/export/share/anaconda3/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/export/share/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/export/share/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 235, in dispatch_shell
    handler(stream, idents, msg)
  File "/export/share/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/export/share/anaconda3/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/export/share/anaconda3/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/export/share/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2698, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/export/share/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2808, in run_ast_nodes
    if self.run_code(code, result):
  File "/export/share/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2862, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-9-6fa9876b44a1>", line 1, in <module>
    train(images[0:params['n_images'],:,:,:], masks[0:params['n_images'],:,:,:,:], images[100:130,:,:,:], masks[100:130,:,:,:,:], params)
  File "<ipython-input-7-6118d16d7eba>", line 80, in train
    validation_data = (val_images, val_masks))
  File "/export/share/anaconda3/lib/python3.6/site-packages/keras/engine/training.py", line 1627, in fit
    self._make_train_function()
  File "/export/share/anaconda3/lib/python3.6/site-packages/keras/engine/training.py", line 990, in _make_train_function
    loss=self.total_loss)
  File "/export/share/anaconda3/lib/python3.6/site-packages/keras/legacy/interfaces.py", line 87, in wrapper
    return func(*args, **kwargs)
  File "/export/share/anaconda3/lib/python3.6/site-packages/keras/optimizers.py", line 415, in get_updates
    grads = self.get_gradients(loss, params)
  File "/export/share/anaconda3/lib/python3.6/site-packages/keras/optimizers.py", line 73, in get_gradients
    grads = K.gradients(loss, params)
  File "/export/share/anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py", line 2389, in gradients
    return tf.gradients(loss, variables, colocate_gradients_with_ops=True)
  File "/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py", line 542, in gradients
    grad_scope, op, func_call, lambda: grad_fn(op, *out_grads))
  File "/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py", line 348, in _MaybeCompile
    return grad_fn()  # Exit early
  File "/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py", line 542, in <lambda>
    grad_scope, op, func_call, lambda: grad_fn(op, *out_grads))
  File "/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/nn_grad.py", line 80, in _Conv3DGrad
    data_format=data_format),
  File "/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/gen_nn_ops.py", line 664, in conv3d_backprop_input_v2
    name=name)
  File "/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 767, in apply_op
    op_def=op_def)
  File "/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 2630, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1204, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

...which was originally created as op 'conv3d_17/convolution', defined at:
  File "/export/share/anaconda3/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
[elided 18 identical lines from previous traceback]
  File "<ipython-input-9-6fa9876b44a1>", line 1, in <module>
    train(images[0:params['n_images'],:,:,:], masks[0:params['n_images'],:,:,:,:], images[100:130,:,:,:], masks[100:130,:,:,:,:], params)
  File "<ipython-input-7-6118d16d7eba>", line 54, in train
    model = unet(params,imsz)
  File "/DATA/jeaneliott/bladderectum/utils.py", line 115, in unet
    conv9 = Conv3D(params['n_feat_maps'], (3, 3, 3), activation='relu', padding='same')(up9)
  File "/export/share/anaconda3/lib/python3.6/site-packages/keras/engine/topology.py", line 603, in __call__
    output = self.call(inputs, **kwargs)
  File "/export/share/anaconda3/lib/python3.6/site-packages/keras/layers/convolutional.py", line 172, in call
    dilation_rate=self.dilation_rate)
  File "/export/share/anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py", line 3364, in conv3d
    data_format=tf_data_format)
  File "/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/nn_ops.py", line 672, in convolution
    op=op)
  File "/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/nn_ops.py", line 338, in with_space_to_batch
    return op(input, num_spatial_dims, padding)
  File "/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/nn_ops.py", line 664, in op
    name=name)
  File "/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/nn_ops.py", line 146, in _non_atrous_convolution
    name=name)
  File "/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/gen_nn_ops.py", line 524, in conv3d
    data_format=data_format, name=name)
  File "/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 767, in apply_op
    op_def=op_def)
  File "/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 2630, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1204, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[2,32,192,192,160]
     [[Node: training/Adam/gradients/conv3d_17/convolution_grad/Conv3DBackpropInputV2 = Conv3DBackpropInputV2[T=DT_FLOAT, _class=["loc:@conv3d_17/convolution"], data_format="NDHWC", padding="SAME", strides=[1, 1, 1, 1, 1], _device="/job:localhost/replica:0/task:0/gpu:0"](training/Adam/gradients/conv3d_17/convolution_grad/Shape, conv3d_17/kernel/read, training/Adam/gradients/conv3d_17/add_grad/Reshape)]]
...