TensorFlow saver.save() работает нестабильно: приводит к дампу ядра и ошибке «terminate called after throwing an instance of 'std::length_error'» - PullRequest
0 голосов
/ 27 апреля 2020

Я обучаю модель с использованием TensorFlow, и для этого требуется огромный объём памяти, поэтому я использую цикл for, в котором поочерёдно загружаю новые порции обучающих данных. В конце каждой итерации цикла я сохраняю модель методом model.save() следующим образом:

 def save(self, sess=None):
     """Write this model's variables to the checkpoint ``./tmp/<self.name>.ckpt``.

     Args:
         sess: TensorFlow session used to save the variables in ``self.vars``.

     Raises:
         AttributeError: If no session is provided.
     """
     # Local import: the file's top-level import block is not visible here.
     import os

     if not sess:
         raise AttributeError("TensorFlow session not provided.")
     # Saver.save() refuses to write when the parent directory of the
     # checkpoint prefix is missing, so make sure ./tmp exists first.
     os.makedirs("./tmp", exist_ok=True)
     # NOTE(review): constructing a Saver adds save/restore ops to the graph,
     # so calling save() repeatedly on the same graph presumably keeps
     # enlarging it (and the .meta file exported below) - confirm whether the
     # caller resets the graph between training iterations.
     saver = tf.train.Saver(self.vars)
     save_path = saver.save(sess, "./tmp/%s.ckpt" % self.name)
     print("Model saved in file: %s" % save_path)

Затем на следующей итерации цикла я заново загружаю модель: создаю экземпляр объекта модели и вызываю model.load(), который выглядит следующим образом:

    def load(self, sess=None):
        """Restore this model's checkpoint ``./tmp/<self.name>.ckpt`` into ``sess``.

        Args:
            sess: TensorFlow session the checkpointed variables are restored in.

        Raises:
            AttributeError: If no session is provided.
        """
        if not sess:
            raise AttributeError("TensorFlow session not provided.")
        save_path = "./tmp/%s.ckpt" % self.name
        # NOTE(review): import_meta_graph adds the saved graph's nodes to the
        # current default graph on every call; if load() runs once per training
        # iteration in the same process the graph presumably keeps growing -
        # confirm whether the caller resets the default graph in between.
        saver = tf.train.import_meta_graph(save_path + ".meta")
        # Restore this model's own checkpoint instead of whatever checkpoint
        # was written to ./tmp/ most recently (which may belong to a model with
        # a different name); this also matches the path reported below.
        saver.restore(sess, save_path)
        print("Model restored from file: %s" % save_path)

Обычно модель сохраняется и загружается без проблем, пока я в цикле прохожу по наборам обучающих данных.

Однако на некоторых итерациях цикла при сохранении модели появляется следующая ошибка:

terminate called after throwing an instance of 'std::length_error'
what():  basic_string::append
Fatal Python error: Aborted

Thread 0x00007fd3a5080700 (most recent call first):
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/threading.py", line 295 in wait
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/queue.py", line 164 in get
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/site-packages/tensorflow_core/python/summary/writer/event_file_writer.py", line 159 in run
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/threading.py", line 916 in _bootstrap_inner
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/threading.py", line 884 in _bootstrap

Thread 0x00007fd55f4ed700 (most recent call first):
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/threading.py", line 295 in wait
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/queue.py", line 164 in get
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/site-packages/tensorflow_core/python/summary/writer/event_file_writer.py", line 159 in run
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/threading.py", line 916 in _bootstrap_inner
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/threading.py", line 884 in _bootstrap

Thread 0x00007fd499374700 (most recent call first):
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/threading.py", line 295 in wait
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/queue.py", line 164 in get
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/site-packages/tensorflow_core/python/summary/writer/event_file_writer.py", line 159 in run
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/threading.py", line 916 in _bootstrap_inner
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/threading.py", line 884 in _bootstrap

Thread 0x00007fd57cf17700 (most recent call first):
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/threading.py", line 295 in wait
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/queue.py", line 164 in get
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/site-packages/tensorflow_core/python/summary/writer/event_file_writer.py", line 159 in run
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/threading.py", line 916 in _bootstrap_inner
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/threading.py", line 884 in _bootstrap

Thread 0x00007fd14affd700 (most recent call first):
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/threading.py", line 295 in wait
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/queue.py", line 164 in get
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/site-packages/tensorflow_core/python/summary/writer/event_file_writer.py", line 159 in run
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/threading.py", line 916 in _bootstrap_inner
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/threading.py", line 884 in _bootstrap

Thread 0x00007fd14b7fe700 (most recent call first):
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/threading.py", line 295 in wait
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/queue.py", line 164 in get
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/site-packages/tensorflow_core/python/summary/writer/event_file_writer.py", line 159 in run
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/threading.py", line 916 in _bootstrap_inner
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/threading.py", line 884 in _bootstrap

Thread 0x00007fd62af20700 (most recent call first):
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/threading.py", line 299 in wait
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/threading.py", line 551 in wait
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/site-packages/tqdm/_monitor.py", line 69 in run
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/threading.py", line 916 in _bootstrap_inner
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/threading.py", line 884 in _bootstrap

Current thread 0x00007fd63926c700 (most recent call first):
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py", line 3166 in _as_graph_def
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py", line 3238 in as_graph_def
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/site-packages/tensorflow_core/python/training/saver.py", line 1246 in export_meta_graph
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/site-packages/tensorflow_core/python/training/saver.py", line 1203 in save
File "/home/dxcl/graph_model/models.py", line 85 in save
File "/home/dxcl/graph_model/unsupervised_train.py", line 351 in train
File "/home/dxcl/graph_model/unsupervised_train.py", line 362 in main
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/site-packages/absl/app.py", line 250 in _run_main
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/site-packages/absl/app.py", line 299 in run
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/site-packages/tensorflow_core/python/platform/app.py", line 40 in run
File "/home/dxcl/graph_model/unsupervised_train.py", line 365 in <module>
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/runpy.py", line 85 in _run_code
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/runpy.py", line 193 in _run_module_as_main
Aborted (core dumped)

Я заметил, что этого не происходит, когда я уменьшаю количество обучающих данных, подаваемых на каждой итерации цикла, так что это может быть связано с памятью. Я просто не понимаю, почему в одних случаях это работает, а в других нет.

(Я работаю с tensorflow-gpu 1.15 и запускаю всё это в терминале JupyterLab. С удовольствием предоставлю любую другую необходимую информацию или код!)

...