Я обучаю модель с помощью TensorFlow; ей требуются огромные объёмы памяти, поэтому я использую цикл for для итеративной загрузки новых обучающих данных. В конце каждой итерации цикла я сохраняю модель методом model.save() следующим образом:
def save(self, sess=None, write_meta_graph=True):
    """Save this model's variables to ``./tmp/<name>.ckpt``.

    Args:
        sess: TensorFlow session that holds the variable values to write.
        write_meta_graph: Forwarded to ``Saver.save``. When true (the
            default, matching TensorFlow's own default) a ``.ckpt.meta``
            file containing the serialized graph is written next to the
            checkpoint. Pass ``False`` to write only the variable values
            and skip serializing the graph; only do so if the loader does
            not need the ``.meta`` file (``tf.train.import_meta_graph``
            does need it).

    Raises:
        AttributeError: If no session is provided.
    """
    if not sess:
        raise AttributeError("TensorFlow session not provided.")
    # NOTE(review): constructing a Saver adds save/restore ops to the default
    # graph, so calling this repeatedly on one graph makes the graph grow --
    # confirm the training loop resets or reuses the graph between iterations.
    saver = tf.train.Saver(self.vars)
    save_path = saver.save(
        sess,
        "./tmp/%s.ckpt" % self.name,
        # Exporting the meta graph serializes the entire graph; for a very
        # large graph that step can be skipped by the caller.
        write_meta_graph=write_meta_graph,
    )
    print("Model saved in file: %s" % save_path)
Затем на следующей итерации цикла я заново загружаю модель: создаю экземпляр объекта модели и вызываю model.load(), который выглядит следующим образом:
def load(self, sess=None):
    """Restore this model's variables from ``./tmp/<name>.ckpt``.

    Args:
        sess: TensorFlow session into which the variable values are restored.

    Raises:
        AttributeError: If no session is provided.
    """
    if not sess:
        raise AttributeError("TensorFlow session not provided.")
    save_path = "./tmp/%s.ckpt" % self.name
    # Build the saver from the variables this instance already owns rather
    # than from the ``.meta`` file: ``tf.train.import_meta_graph`` imports a
    # further copy of the saved graph into the default graph on every call,
    # so the graph keeps growing from one load to the next.
    # Assumes ``self.vars`` is populated before ``load`` is called, as it must
    # be for ``save`` -- TODO confirm against the model constructor.
    saver = tf.train.Saver(self.vars)
    # Restore exactly the checkpoint this model writes. The newest checkpoint
    # in ./tmp/ may belong to a different model name, and the message below
    # reports ``save_path``.
    saver.restore(sess, save_path)
    print("Model restored from file: %s" % save_path)
Обычно модель сохраняется и загружается без проблем, пока я прохожу в цикле по наборам обучающих данных.
Однако на некоторых итерациях цикла при сохранении модели появляется следующая ошибка:
terminate called after throwing an instance of 'std::length_error'
what(): basic_string::append
Fatal Python error: Aborted
Thread 0x00007fd3a5080700 (most recent call first):
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/threading.py", line 295 in wait
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/queue.py", line 164 in get
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/site-packages/tensorflow_core/python/summary/writer/event_file_writer.py", line 159 in run
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/threading.py", line 916 in _bootstrap_inner
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/threading.py", line 884 in _bootstrap
Thread 0x00007fd55f4ed700 (most recent call first):
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/threading.py", line 295 in wait
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/queue.py", line 164 in get
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/site-packages/tensorflow_core/python/summary/writer/event_file_writer.py", line 159 in run
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/threading.py", line 916 in _bootstrap_inner
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/threading.py", line 884 in _bootstrap
Thread 0x00007fd499374700 (most recent call first):
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/threading.py", line 295 in wait
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/queue.py", line 164 in get
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/site-packages/tensorflow_core/python/summary/writer/event_file_writer.py", line 159 in run
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/threading.py", line 916 in _bootstrap_inner
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/threading.py", line 884 in _bootstrap
Thread 0x00007fd57cf17700 (most recent call first):
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/threading.py", line 295 in wait
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/queue.py", line 164 in get
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/site-packages/tensorflow_core/python/summary/writer/event_file_writer.py", line 159 in run
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/threading.py", line 916 in _bootstrap_inner
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/threading.py", line 884 in _bootstrap
Thread 0x00007fd14affd700 (most recent call first):
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/threading.py", line 295 in wait
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/queue.py", line 164 in get
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/site-packages/tensorflow_core/python/summary/writer/event_file_writer.py", line 159 in run
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/threading.py", line 916 in _bootstrap_inner
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/threading.py", line 884 in _bootstrap
Thread 0x00007fd14b7fe700 (most recent call first):
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/threading.py", line 295 in wait
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/queue.py", line 164 in get
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/site-packages/tensorflow_core/python/summary/writer/event_file_writer.py", line 159 in run
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/threading.py", line 916 in _bootstrap_inner
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/threading.py", line 884 in _bootstrap
Thread 0x00007fd62af20700 (most recent call first):
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/threading.py", line 299 in wait
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/threading.py", line 551 in wait
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/site-packages/tqdm/_monitor.py", line 69 in run
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/threading.py", line 916 in _bootstrap_inner
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/threading.py", line 884 in _bootstrap
Current thread 0x00007fd63926c700 (most recent call first):
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py", line 3166 in _as_graph_def
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py", line 3238 in as_graph_def
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/site-packages/tensorflow_core/python/training/saver.py", line 1246 in export_meta_graph
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/site-packages/tensorflow_core/python/training/saver.py", line 1203 in save
File "/home/dxcl/graph_model/models.py", line 85 in save
File "/home/dxcl/graph_model/unsupervised_train.py", line 351 in train
File "/home/dxcl/graph_model/unsupervised_train.py", line 362 in main
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/site-packages/absl/app.py", line 250 in _run_main
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/site-packages/absl/app.py", line 299 in run
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/site-packages/tensorflow_core/python/platform/app.py", line 40 in run
File "/home/dxcl/graph_model/unsupervised_train.py", line 365 in <module>
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/runpy.py", line 85 in _run_code
File "/home/dxcl/conda_envs/py_env_2/lib/python3.6/runpy.py", line 193 in _run_module_as_main
Aborted (core dumped)
Я заметил, что этого не происходит, когда я уменьшаю количество обучающих данных, подаваемых на каждой итерации цикла, так что проблема может быть связана с памятью. Я просто не понимаю, почему это работает в одних случаях, но не в других?
(Я работаю с tensorflow-gpu 1.15 и запускаю всё это в терминале JupyterLab. С удовольствием предоставлю любую другую нужную информацию или код!)