Возникла следующая ошибка, когда я пытался обучить модель с использованием TPU.
TPU был предоставлен Kaggle , и я создал свой код в собственной онлайн-среде IDE.
Я использовал набор данных с тензорным потоком с генератором изображений.
Генератор:
generator = ImageDataGenerator(rotation_range=10, zoom_range = 0.10, width_shift_range=0.1, height_shift_range=0.1)
Наборы данных:
train_dataset = (tf.data.Dataset.from_generator(
generator.flow,
args=[x_train, y_train],
output_types=(tf.float32, tf.float32),
output_shapes=(x_train.shape[1:], y_train.shape[1:]))
.repeat()
.cache()
.shuffle(x_train.shape[1])
.batch(BATCH_SIZE)
.prefetch(AUTO)
)
test_dataset = (tf.data.Dataset.from_generator(
generator.flow,
args=[x_test, y_test],
output_types=(tf.float32, tf.float32),
output_shapes=(x_test.shape[1:], y_test.shape[1:]))
.repeat()
.cache()
.shuffle(x_test.shape[1])
.batch(BATCH_SIZE)
.prefetch(AUTO)
)
Модель:
with strategy.scope():
model = Sequential()
model.add(EfficientNetB7(include_top=False, weights='imagenet', input_shape=x_train.shape[1:], pooling='max')),
model.add(Dense(4, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()
Модель обучения:
history = model.fit_generator(train_dataset, epochs=50, steps_per_epoch=y_train.shape[0]//BATCH_SIZE, validation_data=test_dataset, validation_steps=4, callbacks=[es])
Ошибка:
---------------------------------------------------------------------------
NotFoundError Traceback (most recent call last)
<ipython-input-49-1319d7727fb8> in <module>
----> 1 history = model.fit_generator(train_dataset, epochs=50, steps_per_epoch=y_train.shape[0]//BATCH_SIZE, validation_data=test_dataset, validation_steps=4, callbacks=[es])
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/util/deprecation.py in new_func(*args, **kwargs)
322 'in a future version' if date is None else ('after %s' % date),
323 instructions)
--> 324 return func(*args, **kwargs)
325 return tf_decorator.make_decorator(
326 func, new_func, 'deprecated',
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training.py in fit_generator(self, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, validation_freq, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
1304 use_multiprocessing=use_multiprocessing,
1305 shuffle=shuffle,
-> 1306 initial_epoch=initial_epoch)
1307
1308 @deprecation.deprecated(
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
817 max_queue_size=max_queue_size,
818 workers=workers,
--> 819 use_multiprocessing=use_multiprocessing)
820
821 def evaluate(self,
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training_v2.py in fit(self, model, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
340 mode=ModeKeys.TRAIN,
341 training_context=training_context,
--> 342 total_epochs=epochs)
343 cbks.make_logs(model, epoch_logs, training_result, ModeKeys.TRAIN)
344
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training_v2.py in run_one_epoch(model, iterator, execution_function, dataset_size, batch_size, strategy, steps_per_epoch, num_samples, mode, training_context, total_epochs)
126 step=step, mode=mode, size=current_batch_size) as batch_logs:
127 try:
--> 128 batch_outs = execution_function(iterator)
129 except (StopIteration, errors.OutOfRangeError):
130 # TODO(kaftan): File bug about tf function and errors.OutOfRangeError?
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training_v2_utils.py in execution_function(input_fn)
96 # `numpy` translates Tensors to values in Eager mode.
97 return nest.map_structure(_non_none_constant_value,
---> 98 distributed_function(input_fn))
99
100 return execution_function
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/util/nest.py in map_structure(func, *structure, **kwargs)
566
567 return pack_sequence_as(
--> 568 structure[0], [func(*x) for x in entries],
569 expand_composites=expand_composites)
570
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/util/nest.py in <listcomp>(.0)
566
567 return pack_sequence_as(
--> 568 structure[0], [func(*x) for x in entries],
569 expand_composites=expand_composites)
570
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training_v2_utils.py in _non_none_constant_value(v)
128
129 def _non_none_constant_value(v):
--> 130 constant_value = tensor_util.constant_value(v)
131 return constant_value if constant_value is not None else v
132
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/framework/tensor_util.py in constant_value(tensor, partial)
820 """
821 if isinstance(tensor, ops.EagerTensor):
--> 822 return tensor.numpy()
823 if not is_tensor(tensor):
824 return tensor
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py in numpy(self)
940 """
941 # TODO(slebedev): Consider avoiding a copy for non-CPU or remote tensors.
--> 942 maybe_arr = self._numpy() # pylint: disable=protected-access
943 return maybe_arr.copy() if isinstance(maybe_arr, np.ndarray) else maybe_arr
944
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py in _numpy(self)
908 return self._numpy_internal()
909 except core._NotOkStatusException as e:
--> 910 six.raise_from(core._status_to_exception(e.code, e.message), None)
911
912 @property
/opt/conda/lib/python3.6/site-packages/six.py in raise_from(value, from_value)
NotFoundError: 7 root error(s) found.
(0) Not found: {{function_node __inference_distributed_function_662787}} No registered 'PyFunc' OpKernel for 'CPU' devices compatible with node {{node PyFunc}}
. Registered: <no registered kernels>
[[PyFunc]]
[[MultiDeviceIteratorGetNextFromShard]]
[[RemoteCall]]
[[IteratorGetNextAsOptional_6]]
(1) Cancelled: {{function_node __inference_distributed_function_662787}} Function was cancelled before it was started
(2) Cancelled: {{function_node __inference_distributed_function_662787}} Function was cancelled before it was started
(3) Cancelled: {{function_node __inference_distributed_function_662787}} Function was cancelled before it was started
(4) Cancelled: {{function_node __inference_distributed_function_662787}} Function was cancelled before it was started
(5) Cancelled: {{function_node __inference_distributed_function_662787}} Function was cancelled before it was started
(6) Cancelled: {{function_node __inference_distributed_function_662787}} Function was cancelled before it was started
0 successful operations.
2 derived errors ignored.