Описание
- Я тренирую модель
keras
, используя tensorflow 2.x
с multiworker distributed stragtegy
. - И теперь я хочу сохранить обученная модель с функцией
model.save
. - Каждый работник имеет код python
model.save(model_dir)
. - . После обучения рабочие
all
пытаются сохранить модель в том же каталоге, и иногда вызывает некоторые исключения. - По моему мнению,
save operation
должен делать cheif
работник, а не весь работник.
Показаны фрагменты кода работника ниже:
import sys
import os
import tensorflow as tf
from tensorflow.keras import layers
from absl import app, flags
import numpy as np
import json
FLAGS = flags.FLAGS
flags.DEFINE_string("logs", "logs", "log dir")
tf_config = {
"task": {
"index": 0,
"type": "worker"
},
"cluster": {
"worker": ["localhost:21834", "localhost:27271"],
}
}
os.environ["TF_CONFIG"] = json.dumps(tf_config)
print(json.loads(os.environ["TF_CONFIG"]))
def main(argv):
del argv # Unused
BATCH_SIZE = 100
SAMPLE_SIZE = 50000
strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
with strategy.scope():
model = tf.keras.Sequential([
layers.Dense(64, activation='relu'),
layers.Dense(32, activation='relu'),
layers.Dense(1, activation='sigmoid')
])
model.compile(optimizer=tf.keras.optimizers.Adam(),
loss=tf.keras.losses.BinaryCrossentropy(),
metrics=[tf.keras.metrics.AUC()])
log_dir = FLAGS.logs
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir,
histogram_freq=1,
update_freq='epoch')
train_dataset = tf.data.Dataset.from_tensor_slices(
(np.random.randint(1000, size=(SAMPLE_SIZE, 31)),
np.random.randint(2, size=(SAMPLE_SIZE, 1))))
train_dataset = train_dataset.batch(BATCH_SIZE)
validation_dataset = tf.data.Dataset.from_tensor_slices(
(np.random.randint(1000, size=(SAMPLE_SIZE, 31)),
np.random.randint(2, size=(SAMPLE_SIZE, 1))))
validation_dataset = validation_dataset.batch(BATCH_SIZE)
options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.DATA
train_dataset = train_dataset.with_options(options)
validation_dataset = validation_dataset.with_options(options)
model.fit(train_dataset,
epochs=5,
steps_per_epoch=10,
validation_data=validation_dataset,
validation_steps=5)
model_dir = FLAGS.logs + '/models'
model.save(model_dir)
if __name__ == '__main__':
app.run(main)
Журналы ошибок
I0414 10:49:00.277214 18928 builder_impl.py:775] Assets written to: logs/models\assets
Traceback (most recent call last):
File "worker1.py", line 75, in <module>
app.run(main)
File "D:\Program Files\anaconda3\envs\tf-nightly-cpu\lib\site-packages\absl\app.py", line 299, in run
_run_main(main, args)
File "D:\Program Files\anaconda3\envs\tf-nightly-cpu\lib\site-packages\absl\app.py", line 250, in _run_main
sys.exit(main(argv))
File "worker1.py", line 71, in main
model.save(model_dir)
File "D:\Program Files\anaconda3\envs\tf-nightly-cpu\lib\site-packages\tensorflow\python\keras\engine\network.py", line 1062, in save
signatures, options)
File "D:\Program Files\anaconda3\envs\tf-nightly-cpu\lib\site-packages\tensorflow\python\keras\saving\save.py", line 134, in save_model
signatures, options)
File "D:\Program Files\anaconda3\envs\tf-nightly-cpu\lib\site-packages\tensorflow\python\keras\saving\saved_model\save.py", line 78, in save
save_lib.save(model, filepath, signatures, options)
File "D:\Program Files\anaconda3\envs\tf-nightly-cpu\lib\site-packages\tensorflow\python\saved_model\save.py", line 969, in save
path, saved_model.SerializeToString(deterministic=True))
File "D:\Program Files\anaconda3\envs\tf-nightly-cpu\lib\site-packages\tensorflow\python\lib\io\file_io.py", line 532, in atomic_write_string_to_file
rename(temp_pathname, filename, overwrite)
File "D:\Program Files\anaconda3\envs\tf-nightly-cpu\lib\site-packages\tensorflow\python\lib\io\file_io.py", line 491, in rename
rename_v2(oldname, newname, overwrite)
File "D:\Program Files\anaconda3\envs\tf-nightly-cpu\lib\site-packages\tensorflow\python\lib\io\file_io.py", line 508, in rename_v2
compat.as_bytes(src), compat.as_bytes(dst), overwrite)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xbe in position 114: invalid start byte
2020-04-14 10:49:00.770267: W tensorflow/core/common_runtime/eager/context.cc:491] Unable to destroy server_ object, so releasing instead. Servers don't support clean shutdown.
Вопросы
- Это ошибка тензор потока?
- Если 1 нет. Должен ли я сменить директора на разных работников? Если да, то какая модель является последней моделью для сервировки?
- Если 1 - нет. Есть ли способ, которым
chief worker
сохраняет только модель?