Получение NotFoundError при обучении модели с использованием TPU на Kaggle - PullRequest
1 голос
/ 03 апреля 2020

Возникла следующая ошибка, когда я пытался обучить модель с использованием TPU.

TPU был предоставлен Kaggle, и я писал код во встроенной онлайн-IDE Kaggle.

Я использовал набор данных TensorFlow (tf.data.Dataset) вместе с генератором изображений (ImageDataGenerator).

Генератор:

# Image augmentation generator configured with rotation, zoom, and
# width/height shift ranges. ImageDataGenerator is imported outside this
# snippet — presumably the Keras preprocessing class; confirm.
generator = ImageDataGenerator(rotation_range=10, zoom_range = 0.10, width_shift_range=0.1, height_shift_range=0.1)

Наборы данных:

# Training input pipeline built from the augmenting generator's flow() method.
# NOTE(review): from_generator presumably executes the Python callable through
# a PyFunc op; the traceback quoted in this post reports that no 'PyFunc'
# kernel is registered, so this call looks like the source of the error — confirm.
train_dataset = (tf.data.Dataset.from_generator(
    generator.flow,
    args=[x_train, y_train],
    output_types=(tf.float32, tf.float32),
    # Declared element shapes: the arrays' shapes without their leading axis.
    output_shapes=(x_train.shape[1:], y_train.shape[1:]))
    .repeat()
    # NOTE(review): cache() is applied after repeat() — confirm this ordering is intended.
    .cache()
    # NOTE(review): the shuffle buffer size is axis 1 of x_train, not the
    # leading (presumably sample) axis — confirm.
    .shuffle(x_train.shape[1])
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

# Validation input pipeline; it reuses the same augmenting generator, so the
# test images go through the same random transforms.
# NOTE(review): from_generator presumably executes the Python callable through
# a PyFunc op, which the traceback quoted in this post reports as having no
# registered kernel — confirm.
test_dataset = (tf.data.Dataset.from_generator(
    generator.flow,
    args=[x_test, y_test],
    output_types=(tf.float32, tf.float32),
    # Declared element shapes: the arrays' shapes without their leading axis.
    output_shapes=(x_test.shape[1:], y_test.shape[1:]))
    .repeat()
    # NOTE(review): cache() is applied after repeat() — confirm this ordering is intended.
    .cache()
    # NOTE(review): the shuffle buffer size is axis 1 of x_test, not the
    # leading (presumably sample) axis — confirm.
    .shuffle(x_test.shape[1])
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

Модель:

# Build and compile the classifier inside the distribution strategy scope.
# `strategy` is defined outside this snippet — presumably a TPU strategy; confirm.
with strategy.scope():
    model = Sequential()

    # Backbone: EfficientNetB7 without its top layers, ImageNet weights,
    # pooling='max' on the output.
    # NOTE: the trailing comma turns this statement into a one-element tuple
    # expression; the add() call still runs.
    model.add(EfficientNetB7(include_top=False, weights='imagenet', input_shape=x_train.shape[1:], pooling='max')),

    # Output layer: 4 units with softmax activation.
    model.add(Dense(4, activation='softmax'))

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    model.summary()

Обучение модели:

# Train for up to 50 epochs, validating on 4 batches of test_dataset per epoch.
# steps_per_epoch is the length of y_train's leading axis divided by BATCH_SIZE.
# The traceback quoted in this post shows fit_generator passing through a
# deprecation wrapper and forwarding to Model.fit.
# `es` is defined outside this snippet — presumably an early-stopping callback; confirm.
history = model.fit_generator(train_dataset, epochs=50, steps_per_epoch=y_train.shape[0]//BATCH_SIZE, validation_data=test_dataset, validation_steps=4, callbacks=[es])

Ошибка:

---------------------------------------------------------------------------
NotFoundError                             Traceback (most recent call last)
<ipython-input-49-1319d7727fb8> in <module>
----> 1 history = model.fit_generator(train_dataset, epochs=50, steps_per_epoch=y_train.shape[0]//BATCH_SIZE, validation_data=test_dataset, validation_steps=4, callbacks=[es])

/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/util/deprecation.py in new_func(*args, **kwargs)
    322               'in a future version' if date is None else ('after %s' % date),
    323               instructions)
--> 324       return func(*args, **kwargs)
    325     return tf_decorator.make_decorator(
    326         func, new_func, 'deprecated',

/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training.py in fit_generator(self, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, validation_freq, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
   1304         use_multiprocessing=use_multiprocessing,
   1305         shuffle=shuffle,
-> 1306         initial_epoch=initial_epoch)
   1307 
   1308   @deprecation.deprecated(

/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
    817         max_queue_size=max_queue_size,
    818         workers=workers,
--> 819         use_multiprocessing=use_multiprocessing)
    820 
    821   def evaluate(self,

/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training_v2.py in fit(self, model, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
    340                 mode=ModeKeys.TRAIN,
    341                 training_context=training_context,
--> 342                 total_epochs=epochs)
    343             cbks.make_logs(model, epoch_logs, training_result, ModeKeys.TRAIN)
    344 

/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training_v2.py in run_one_epoch(model, iterator, execution_function, dataset_size, batch_size, strategy, steps_per_epoch, num_samples, mode, training_context, total_epochs)
    126         step=step, mode=mode, size=current_batch_size) as batch_logs:
    127       try:
--> 128         batch_outs = execution_function(iterator)
    129       except (StopIteration, errors.OutOfRangeError):
    130         # TODO(kaftan): File bug about tf function and errors.OutOfRangeError?

/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training_v2_utils.py in execution_function(input_fn)
     96     # `numpy` translates Tensors to values in Eager mode.
     97     return nest.map_structure(_non_none_constant_value,
---> 98                               distributed_function(input_fn))
     99 
    100   return execution_function

/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/util/nest.py in map_structure(func, *structure, **kwargs)
    566 
    567   return pack_sequence_as(
--> 568       structure[0], [func(*x) for x in entries],
    569       expand_composites=expand_composites)
    570 

/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/util/nest.py in <listcomp>(.0)
    566 
    567   return pack_sequence_as(
--> 568       structure[0], [func(*x) for x in entries],
    569       expand_composites=expand_composites)
    570 

/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training_v2_utils.py in _non_none_constant_value(v)
    128 
    129 def _non_none_constant_value(v):
--> 130   constant_value = tensor_util.constant_value(v)
    131   return constant_value if constant_value is not None else v
    132 

/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/framework/tensor_util.py in constant_value(tensor, partial)
    820   """
    821   if isinstance(tensor, ops.EagerTensor):
--> 822     return tensor.numpy()
    823   if not is_tensor(tensor):
    824     return tensor

/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py in numpy(self)
    940     """
    941     # TODO(slebedev): Consider avoiding a copy for non-CPU or remote tensors.
--> 942     maybe_arr = self._numpy()  # pylint: disable=protected-access
    943     return maybe_arr.copy() if isinstance(maybe_arr, np.ndarray) else maybe_arr
    944 

/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py in _numpy(self)
    908       return self._numpy_internal()
    909     except core._NotOkStatusException as e:
--> 910       six.raise_from(core._status_to_exception(e.code, e.message), None)
    911 
    912   @property

/opt/conda/lib/python3.6/site-packages/six.py in raise_from(value, from_value)

NotFoundError: 7 root error(s) found.
  (0) Not found: {{function_node __inference_distributed_function_662787}} No registered 'PyFunc' OpKernel for 'CPU' devices compatible with node {{node PyFunc}}
    .  Registered:  <no registered kernels>

     [[PyFunc]]
     [[MultiDeviceIteratorGetNextFromShard]]
     [[RemoteCall]]
     [[IteratorGetNextAsOptional_6]]
  (1) Cancelled: {{function_node __inference_distributed_function_662787}} Function was cancelled before it was started
  (2) Cancelled: {{function_node __inference_distributed_function_662787}} Function was cancelled before it was started
  (3) Cancelled: {{function_node __inference_distributed_function_662787}} Function was cancelled before it was started
  (4) Cancelled: {{function_node __inference_distributed_function_662787}} Function was cancelled before it was started
  (5) Cancelled: {{function_node __inference_distributed_function_662787}} Function was cancelled before it was started
  (6) Cancelled: {{function_node __inference_distributed_function_662787}} Function was cancelled before it was started
0 successful operations.
2 derived errors ignored.
...