В TF2 я пытаюсь обучить нейронную сеть распределённым способом, используя Keras. Я следую этому руководству: https://www.tensorflow.org/alpha/tutorials/distribute/keras
Стратегия, используемая в этом руководстве, — MirroredStrategy. Но я хочу запустить код на нескольких машинах. В настоящее время в TF2 для распределённого обучения с Keras поддерживаются только стратегии Mirrored и Parameter-Server. Поэтому я просто попытался изменить стратегию на Parameter-Server (с Mirrored всё работает нормально).
Я получаю следующую ошибку:
RuntimeError: Error copying tensor to device: /job:worker/replica:0/task:0/device:CPU:0. /job:worker/replica:0/task:0/device:CPU:0 unknown device.
Ещё одна вещь, которая не относится напрямую к этому сообщению. В TF2 для пользовательских циклов обучения (без использования Keras или Estimator) поддерживается только MirroredStrategy. Кто-нибудь знает, можно ли как-то распределить обучение по нескольким машинам с помощью пользовательских циклов обучения?
Заранее спасибо!
Вот код для 1-го воркера. Он тот же, что и в руководстве, с добавлением TF_CONFIG и изменением стратегии (TensorFlow 2.0 alpha).
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
import tensorflow_datasets as tfds
import os, json
# Load MNIST through tensorflow_datasets together with its metadata (`info`),
# then pick the train and test splits out of the returned mapping.
datasets, info = tfds.load(name='mnist', with_info=True, as_supervised=True)
mnist_train, mnist_test = datasets['train'], datasets['test']
# Describe the cluster in the TF_CONFIG environment variable before the
# strategy is created: two "worker" tasks and one "ps" task, with this
# process declared as worker 0.
# NOTE(review): the x.y.z.w* addresses look like placeholders - confirm.
# NOTE(review): presumably the strategy below picks this up from the
# environment, since it is constructed without arguments - verify.
os.environ['TF_CONFIG'] = json.dumps({
"cluster": {
"worker": ["x.y.z.w1:12345", "x.y.z.w2:12345"],
"ps": ["x.y.z.w3:12345"]
},
"task": {"type": "worker", "index": 0}
})
strategy = tf.distribute.experimental.ParameterServerStrategy()
print('Number of devices: {}'.format(strategy.num_replicas_in_sync))
# You can also do info.splits.total_num_examples to get the total
# number of examples in the dataset.
num_train_examples = info.splits['train'].num_examples
num_test_examples = info.splits['test'].num_examples
# Shuffle buffer size used when building the training pipeline.
BUFFER_SIZE = 10000
# The global batch size is the per-replica batch size multiplied by the
# number of replicas the strategy reports.
BATCH_SIZE_PER_REPLICA = 64
BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync
def scale(image, label):
    """Cast an image to float32 and divide it by 255.

    Args:
        image: Image tensor to convert.
        label: Label belonging to the image; returned unchanged.

    Returns:
        An ``(image, label)`` tuple holding the rescaled float32 image and
        the original label.
    """
    # NOTE(review): dividing by 255 presumably maps 8-bit pixel values into
    # [0, 1] - confirm the input dtype/range of the dataset.
    image = tf.cast(image, tf.float32)
    image /= 255
    return image, label
# Training pipeline: rescale every example with scale(), shuffle with a
# BUFFER_SIZE-element buffer, then group into batches of BATCH_SIZE.
train_dataset = mnist_train.map(scale).shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
# Evaluation pipeline: same rescaling and batching, but no shuffling.
eval_dataset = mnist_test.map(scale).batch(BATCH_SIZE)
# Build and compile the model inside the strategy scope, so that the model's
# variables are created through the distribution strategy.
with strategy.scope():
    # Small convolutional classifier: 28x28x1 input, 10-way softmax output.
    model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(32, 3, activation='relu',
                               input_shape=(28, 28, 1)),
        tf.keras.layers.MaxPooling2D(),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(10, activation='softmax')
    ])
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=tf.keras.optimizers.Adam(),
                  metrics=['accuracy'])
# Define the checkpoint directory to store the checkpoints
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
# NOTE(review): "{epoch}" is left as a literal placeholder here; presumably
# the checkpoint callback fills it in when saving - confirm.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
# Function for decaying the learning rate.
# You can define any decay function you need.
def decay(epoch):
    """Return the learning rate to use for the given epoch index.

    Args:
        epoch: Epoch index (compared against the thresholds 3 and 7).

    Returns:
        1e-3 while ``epoch < 3``, 1e-4 while ``3 <= epoch < 7`` and
        1e-5 for every later epoch.
    """
    if epoch < 3:
        return 1e-3
    # Reaching this point already implies epoch >= 3.
    if epoch < 7:
        return 1e-4
    return 1e-5
# Callback for printing the LR at the end of each epoch.
class PrintLR(tf.keras.callbacks.Callback):
    """Keras callback that prints the optimizer's learning rate per epoch."""

    def on_epoch_end(self, epoch, logs=None):
        """Print the current learning rate once an epoch has finished.

        Args:
            epoch: Index of the epoch that just ended; printed as
                ``epoch + 1``.
            logs: Metrics passed in by Keras; unused here.
        """
        # Use the model Keras attached to this callback rather than the
        # module-level `model` global, so the class does not depend on
        # script-level state.
        print('\nLearning rate for epoch {} is {}'.format(
            epoch + 1, self.model.optimizer.lr.numpy()))
# Callbacks handed to fit(): TensorBoard logging into ./logs, weights-only
# checkpoints written under checkpoint_prefix, the learning-rate schedule
# defined by decay(), and the learning-rate printer.
callbacks = [
tf.keras.callbacks.TensorBoard(log_dir='./logs'),
tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix,
save_weights_only=True),
tf.keras.callbacks.LearningRateScheduler(decay),
PrintLR()
]
# Train for 12 epochs on the training pipeline.
model.fit(train_dataset, epochs=12, callbacks=callbacks)
# Reload the latest checkpoint found in checkpoint_dir, then evaluate the
# model on the evaluation pipeline and report loss and accuracy.
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
eval_loss, eval_acc = model.evaluate(eval_dataset)
print('Eval loss: {}, Eval Accuracy: {}'.format(eval_loss, eval_acc))
Ожидаемый результат должен быть примерно таким:
Epoch 1/12
938/Unknown - 11s 11ms/step - loss: 0.2151 - accuracy: 0.9380
Learning rate for epoch 1 is 0.0010000000474974513
938/938 [==============================] - 11s 11ms/step - loss: 0.2151 - accuracy: 0.9380
Epoch 2/12
937/938 [============================>.] - ETA: 0s - loss: 0.0678 - accuracy: 0.9801
Learning rate for epoch 2 is 0.0010000000474974513
938/938 [==============================] - 7s 7ms/step - loss: 0.0678 - accuracy: 0.9801
Epoch 3/12
935/938 [============================>.] - ETA: 0s - loss: 0.0473 - accuracy: 0.9865
Learning rate for epoch 3 is 0.0010000000474974513
938/938 [==============================] - 7s 8ms/step - loss: 0.0473 - accuracy: 0.9865
Epoch 4/12
931/938 [============================>.] - ETA: 0s - loss: 0.0258 - accuracy: 0.9931
Learning rate for epoch 4 is 9.999999747378752e-05
938/938 [==============================] - 7s 7ms/step - loss: 0.0258 - accuracy: 0.9931
Epoch 5/12
934/938 [============================>.] - ETA: 0s - loss: 0.0229 - accuracy: 0.9944
Learning rate for epoch 5 is 9.999999747378752e-05
938/938 [==============================] - 7s 8ms/step - loss: 0.0228 - accuracy: 0.9944
Epoch 6/12
928/938 [============================>.] - ETA: 0s - loss: 0.0211 - accuracy: 0.9947
Learning rate for epoch 6 is 9.999999747378752e-05
938/938 [==============================] - 7s 7ms/step - loss: 0.0211 - accuracy: 0.9947
Epoch 7/12
928/938 [============================>.] - ETA: 0s - loss: 0.0196 - accuracy: 0.9952
Learning rate for epoch 7 is 9.999999747378752e-05
938/938 [==============================] - 7s 8ms/step - loss: 0.0195 - accuracy: 0.9952
Epoch 8/12
928/938 [============================>.] - ETA: 0s - loss: 0.0169 - accuracy: 0.9961
Learning rate for epoch 8 is 9.999999747378752e-06
938/938 [==============================] - 7s 8ms/step - loss: 0.0168 - accuracy: 0.9961
Epoch 9/12
925/938 [============================>.] - ETA: 0s - loss: 0.0167 - accuracy: 0.9962
Learning rate for epoch 9 is 9.999999747378752e-06
938/938 [==============================] - 7s 8ms/step - loss: 0.0166 - accuracy: 0.9962
Epoch 10/12
935/938 [============================>.] - ETA: 0s - loss: 0.0164 - accuracy: 0.9962
Learning rate for epoch 10 is 9.999999747378752e-06
938/938 [==============================] - 7s 8ms/step - loss: 0.0164 - accuracy: 0.9962
Epoch 11/12
932/938 [============================>.] - ETA: 0s - loss: 0.0163 - accuracy: 0.9963
Learning rate for epoch 11 is 9.999999747378752e-06
938/938 [==============================] - 7s 7ms/step - loss: 0.0162 - accuracy: 0.9963
Epoch 12/12
933/938 [============================>.] - ETA: 0s - loss: 0.0161 - accuracy: 0.9963
Learning rate for epoch 12 is 9.999999747378752e-06
938/938 [==============================] - 7s 7ms/step - loss: 0.0160 - accuracy: 0.9964
И что я получаю:
Traceback (most recent call last):
File "keras_dist.py", line 58, in <module>
metrics=['accuracy'])
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/distribute/distribute_lib.py", line 189, in __exit__
self._device_scope.__exit__(exception_type, exception_value, traceback)
File "/usr/lib/python3.5/contextlib.py", line 77, in __exit__
self.gen.throw(type, value, traceback)
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/eager/context.py", line 603, in device
yield
File "keras_dist.py", line 53, in <module>
tf.keras.layers.Dense(10, activation='softmax')
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/training/tracking/base.py", line 456, in _method_wrapper
result = method(self, *args, **kwargs)
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/keras/engine/sequential.py", line 108, in __init__
self.add(layer)
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/training/tracking/base.py", line 456, in _method_wrapper
result = method(self, *args, **kwargs)
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/keras/engine/sequential.py", line 169, in add
layer(x)
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/keras/engine/base_layer.py", line 662, in __call__
self._set_mask_metadata(inputs, outputs, previous_mask)
File "/usr/lib/python3.5/contextlib.py", line 77, in __exit__
self.gen.throw(type, value, traceback)
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/keras/engine/base_layer_utils.py", line 391, in call_context
yield
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/keras/engine/base_layer.py", line 653, in __call__
self._set_inputs(inputs, outputs)
File "/usr/lib/python3.5/contextlib.py", line 77, in __exit__
self.gen.throw(type, value, traceback)
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/func_graph.py", line 319, in inner_cm
self._graph_key = old_graph_key
File "/usr/lib/python3.5/contextlib.py", line 77, in __exit__
self.gen.throw(type, value, traceback)
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 5486, in get_controller
yield g
File "/usr/lib/python3.5/contextlib.py", line 77, in __exit__
self.gen.throw(type, value, traceback)
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 5293, in get_controller
yield default
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 5486, in get_controller
yield g
File "/usr/lib/python3.5/contextlib.py", line 77, in __exit__
self.gen.throw(type, value, traceback)
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/eager/context.py", line 485, in _mode
yield
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 5486, in get_controller
yield g
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/func_graph.py", line 314, in inner_cm
yield g
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/keras/engine/base_layer.py", line 653, in __call__
self._set_inputs(inputs, outputs)
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 6361, in __exit__
self._name_scope.__exit__(type_arg, value_arg, traceback_arg)
File "/usr/lib/python3.5/contextlib.py", line 77, in __exit__
self.gen.throw(type, value, traceback)
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4183, in name_scope
yield "" if new_stack is None else new_stack + "/"
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/keras/engine/base_layer.py", line 594, in __call__
self._maybe_build(inputs)
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/keras/engine/base_layer.py", line 1713, in _maybe_build
self.build(input_shapes)
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/keras/layers/convolutional.py", line 165, in build
dtype=self.dtype)
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/keras/engine/base_layer.py", line 377, in add_weight
aggregation=aggregation)
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/training/tracking/base.py", line 622, in _add_variable_with_custom_getter
**kwargs_for_getter)
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/keras/engine/base_layer_utils.py", line 152, in make_variable
aggregation=aggregation)
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/ops/variables.py", line 212, in __call__
return cls._variable_v1_call(*args, **kwargs)
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/ops/variables.py", line 175, in _variable_v1_call
aggregation=aggregation)
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/ops/variables.py", line 58, in getter
return captured_getter(captured_previous, **kwargs)
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/distribute/distribute_lib.py", line 823, in creator_with_resource_vars
return self._create_variable(*args, **kwargs)
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/distribute/parameter_server_strategy.py", line 361, in _create_variable
return var_creator(*args, **kwargs)
File "/usr/lib/python3.5/contextlib.py", line 77, in __exit__
self.gen.throw(type, value, traceback)
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4269, in _colocate_with_for_gradient
yield
File "/usr/lib/python3.5/contextlib.py", line 77, in __exit__
self.gen.throw(type, value, traceback)
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4333, in colocate_with
yield
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4269, in _colocate_with_for_gradient
yield
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/distribute/parameter_server_strategy.py", line 361, in _create_variable
return var_creator(*args, **kwargs)
File "/usr/lib/python3.5/contextlib.py", line 77, in __exit__
self.gen.throw(type, value, traceback)
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4411, in device
yield
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/distribute/parameter_server_strategy.py", line 361, in _create_variable
return var_creator(*args, **kwargs)
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/ops/variables.py", line 154, in <lambda>
previous_getter = lambda **kwargs: default_variable_creator(None, **kwargs)
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/ops/variable_scope.py", line 2492, in default_variable_creator
import_scope=import_scope, distribute_strategy=distribute_strategy)
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/ops/variables.py", line 216, in __call__
return super(VariableMetaclass, cls).__call__(*args, **kwargs)
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/ops/resource_variable_ops.py", line 422, in __init__
constraint=constraint)
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/ops/resource_variable_ops.py", line 619, in _init_from_args
ops.add_to_collections(ops.GraphKeys.GLOBAL_STEP, self)
File "/usr/lib/python3.5/contextlib.py", line 77, in __exit__
self.gen.throw(type, value, traceback)
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 5622, in init_scope
yield
File "/usr/lib/python3.5/contextlib.py", line 77, in __exit__
self.gen.throw(type, value, traceback)
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/eager/context.py", line 485, in _mode
yield
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 5622, in init_scope
yield
File "/usr/lib/python3.5/contextlib.py", line 77, in __exit__
self.gen.throw(type, value, traceback)
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/eager/tape.py", line 122, in stop_recording
yield
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 5622, in init_scope
yield
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/ops/resource_variable_ops.py", line 551, in _init_from_args
graph_mode=self._in_graph_mode)
File "/usr/lib/python3.5/contextlib.py", line 77, in __exit__
self.gen.throw(type, value, traceback)
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4775, in _attr_scope
yield # The code within the context runs here.
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/ops/resource_variable_ops.py", line 545, in _init_from_args
initial_value() if init_from_fn else initial_value,
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/keras/engine/base_layer_utils.py", line 134, in <lambda>
init_val = lambda: initializer(shape, dtype=dtype)
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/ops/init_ops_v2.py", line 434, in __call__
return self._random_generator.random_uniform(shape, -limit, limit, dtype)
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/ops/init_ops_v2.py", line 797, in random_uniform
shape=shape, minval=minval, maxval=maxval, dtype=dtype, seed=self.seed)
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/ops/random_ops.py", line 240, in random_uniform
minval = ops.convert_to_tensor(minval, dtype=dtype, name="min")
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1050, in convert_to_tensor
return convert_to_tensor_v2(value, dtype, preferred_dtype, name)
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1108, in convert_to_tensor_v2
as_ref=False)
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1186, in internal_convert_to_tensor
ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/constant_op.py", line 304, in _constant_tensor_conversion_function
return constant(v, dtype=dtype, name=name)
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/constant_op.py", line 245, in constant
allow_broadcast=True)
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/constant_op.py", line 253, in _constant_impl
t = convert_to_eager_tensor(value, ctx, dtype)
File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/constant_op.py", line 110, in convert_to_eager_tensor
t = ops.EagerTensor(value, handle, device, dtype)
RuntimeError: Error copying tensor to device: /job:worker/replica:0/task:0/device:CPU:0. /job:worker/replica:0/task:0/device:CPU:0 unknown device.