Я пытаюсь обучить нейронную сеть, используя TPU на Google Cloud Platform (GCP).
Я сохранил свои файлы в формате TFRecord локально и открыл Jupyter Notebook, работающий на виртуальной машине (Compute Engine), где пишу код для обучения.
Мой код выполняется, пока не начнется обучение. Затем я получаю сообщение об ошибке:
NotFoundError: тип операции «ParallelInterleaveDataset» не зарегистрирован в
двоичном файле, выполняющемся на n-b2696fa0-w-0. Убедитесь, что Op и Kernel
зарегистрированы в двоичном файле, выполняющемся в этом процессе. Обратите внимание: если вы
загружаете сохранённый граф, в котором использовались операции из tf.contrib, то обращение (например) к
`tf.contrib.resampler`
должно быть выполнено до импорта графа, так как
операции contrib регистрируются лениво — при первом обращении к модулю.
Я немного поискал в интернете и наткнулся на страницу Google о недоступных операциях TensorFlow (unavailable TensorFlow ops). В ней говорится, что некоторые операции недоступны в коде для TPU.
Однако я никогда не использую функцию с именем «ParallelInterleaveDataset». Мой вопрос:
В чем может быть причина этой проблемы и что я могу сделать, чтобы решить ее и обучить свою сеть на TPU?
-
Для полноты привожу сообщение об ошибке целиком:
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:TPU job name tpu_worker
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Error recorded from training_loop: Op type not registered 'ParallelInterleaveDataset' in binary running on n-b2696fa0-w-0. Make sure the Op and Kernel are registered in the binary running in this process. Note that if you are loading a saved graph which used ops from tf.contrib, accessing (e.g.) `tf.contrib.resampler` should be done before importing the graph, as contrib ops are lazily registered when the module is first accessed.
INFO:tensorflow:training_loop marked as finished
WARNING:tensorflow:Reraising captured error
---------------------------------------------------------------------------
NotFoundError Traceback (most recent call last)
~/yes/lib/python3.6/site-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
1333 try:
-> 1334 return fn(*args)
1335 except errors.OpError as e:
~/yes/lib/python3.6/site-packages/tensorflow/python/client/session.py in _run_fn(feed_dict, fetch_list, target_list, options, run_metadata)
1316 # Ensure any changes to the graph are reflected in the runtime.
-> 1317 self._extend_graph()
1318 return self._call_tf_sessionrun(
~/yes/lib/python3.6/site-packages/tensorflow/python/client/session.py in _extend_graph(self)
1351 with self._graph._session_run_lock(): # pylint: disable=protected-access
-> 1352 tf_session.ExtendSession(self._session)
1353
NotFoundError: Op type not registered 'ParallelInterleaveDataset' in binary running on n-b2696fa0-w-0. Make sure the Op and Kernel are registered in the binary running in this process. Note that if you are loading a saved graph which used ops from tf.contrib, accessing (e.g.) `tf.contrib.resampler` should be done before importing the graph, as contrib ops are lazily registered when the module is first accessed.
During handling of the above exception, another exception occurred:
NotFoundError Traceback (most recent call last)
<ipython-input-115-ee69fe04790e> in <module>
----> 1 tpu_estimator.train(input_fn=train_input_fn, steps=1)
~/yes/lib/python3.6/site-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py in train(self, input_fn, hooks, steps, max_steps, saving_listeners)
2407 if ctx.is_running_on_cpu(is_export_mode=False):
2408 with ops.device('/device:CPU:0'):
-> 2409 return input_fn(**kwargs)
2410
2411 # For TPU computation, input_fn should be invoked in a tf.while_loop for
~/yes/lib/python3.6/site-packages/tensorflow/contrib/tpu/python/tpu/error_handling.py in raise_errors(self, timeout_sec)
126 else:
127 logging.warn('Reraising captured error')
--> 128 six.reraise(typ, value, traceback)
129
130 for k, (typ, value, traceback) in kept_errors:
~/yes/lib/python3.6/site-packages/six.py in reraise(tp, value, tb)
691 if value.__traceback__ is not tb:
692 raise value.with_traceback(tb)
--> 693 raise value
694 finally:
695 value = None
~/yes/lib/python3.6/site-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py in train(self, input_fn, hooks, steps, max_steps, saving_listeners)
2401 if batch_size_for_input_fn is not None:
2402 _add_item_to_params(kwargs['params'], _BATCH_SIZE_KEY,
-> 2403 batch_size_for_input_fn)
2404
2405 # For export_savedmodel, input_fn is never passed to Estimator. So,
~/yes/lib/python3.6/site-packages/tensorflow/python/estimator/estimator.py in train(self, input_fn, hooks, steps, max_steps, saving_listeners)
~/yes/lib/python3.6/site-packages/tensorflow/python/estimator/estimator.py in _train_model(self, input_fn, hooks, saving_listeners)
~/yes/lib/python3.6/site-packages/tensorflow/python/estimator/estimator.py in _train_model_default(self, input_fn, hooks, saving_listeners)
~/yes/lib/python3.6/site-packages/tensorflow/python/estimator/estimator.py in _train_with_estimator_spec(self, estimator_spec, worker_hooks, hooks, global_step_tensor, saving_listeners)
~/yes/lib/python3.6/site-packages/tensorflow/python/training/monitored_session.py in MonitoredTrainingSession(master, is_chief, checkpoint_dir, scaffold, hooks, chief_only_hooks, save_checkpoint_secs, save_summaries_steps, save_summaries_secs, config, stop_grace_period_secs, log_step_count_steps, max_wait_secs, save_checkpoint_steps, summary_dir)
502
503 if hooks:
--> 504 all_hooks.extend(hooks)
505 return MonitoredSession(
506 session_creator=session_creator,
~/yes/lib/python3.6/site-packages/tensorflow/python/training/monitored_session.py in __init__(self, session_creator, hooks, stop_grace_period_secs)
919 * it cannot be sent to tf.train.start_queue_runners.
920
--> 921 Args:
922 session_creator: A factory object to create session. Typically a
923 `ChiefSessionCreator` which is the default one.
~/yes/lib/python3.6/site-packages/tensorflow/python/training/monitored_session.py in __init__(self, session_creator, hooks, should_recover, stop_grace_period_secs)
641
642 # Create the session.
--> 643 self._coordinated_creator = self._CoordinatedSessionCreator(
644 session_creator=session_creator or ChiefSessionCreator(),
645 hooks=self._hooks,
~/yes/lib/python3.6/site-packages/tensorflow/python/training/monitored_session.py in __init__(self, sess_creator)
1105
1106 Calls to `run()` are delegated to the wrapped session. If a call raises the
-> 1107 exception `tf.errors.AbortedError` or `tf.errors.UnavailableError`, the
1108 wrapped session is closed, and a new one is created by calling the factory
1109 again.
~/yes/lib/python3.6/site-packages/tensorflow/python/training/monitored_session.py in _create_session(self)
1110 """
1111
-> 1112 def __init__(self, sess_creator):
1113 """Create a new `_RecoverableSession`.
1114
~/yes/lib/python3.6/site-packages/tensorflow/python/training/monitored_session.py in create_session(self)
798 self.coord = None
799 self.tf_sess = None
--> 800 self._stop_grace_period_secs = stop_grace_period_secs
801
802 def create_session(self):
~/yes/lib/python3.6/site-packages/tensorflow/python/training/monitored_session.py in create_session(self)
564 self._master,
565 saver=self._scaffold.saver,
--> 566 checkpoint_dir=self._checkpoint_dir,
567 checkpoint_filename_with_path=self._checkpoint_filename_with_path,
568 config=self._config,
~/yes/lib/python3.6/site-packages/tensorflow/python/training/session_manager.py in prepare_session(self, master, init_op, saver, checkpoint_dir, checkpoint_filename_with_path, wait_for_checkpoint, max_wait_secs, config, init_feed_dict, init_fn)
292 if not local_init_success:
293 raise RuntimeError(
--> 294 "Init operations did not make model ready for local_init. "
295 "Init op: %s, init fn: %s, error: %s" % (_maybe_name(init_op),
296 init_fn,
~/yes/lib/python3.6/site-packages/tensorflow/python/client/session.py in run(self, fetches, feed_dict, options, run_metadata)
927 try:
928 result = self._run(None, fetches, feed_dict, options_ptr,
--> 929 run_metadata_ptr)
930 if run_metadata:
931 proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)
~/yes/lib/python3.6/site-packages/tensorflow/python/client/session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
1150 if final_fetches or final_targets or (handle and feed_dict_tensor):
1151 results = self._do_run(handle, final_targets, final_fetches,
-> 1152 feed_dict_tensor, options, run_metadata)
1153 else:
1154 results = []
~/yes/lib/python3.6/site-packages/tensorflow/python/client/session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
1326 if handle is None:
1327 return self._do_call(_run_fn, feeds, fetches, targets, options,
-> 1328 run_metadata)
1329 else:
1330 return self._do_call(_prun_fn, handle, feeds, fetches)
~/yes/lib/python3.6/site-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
1346 pass
1347 message = error_interpolation.interpolate(message, self._graph)
-> 1348 raise type(e)(node_def, op, message)
1349
1350 def _extend_graph(self):
NotFoundError: Op type not registered 'ParallelInterleaveDataset' in binary running on n-b2696fa0-w-0. Make sure the Op and Kernel are registered in the binary running in this process. Note that if you are loading a saved graph which used ops from tf.contrib, accessing (e.g.) `tf.contrib.resampler` should be done before importing the graph, as contrib ops are lazily registered when the module is first accessed.