TF 2.0 alpha: проблема с запуском распределённого обучения Keras со стратегией ParameterServerStrategy - PullRequest
0 голосов
/ 31 мая 2019

В TF 2.0 я пытаюсь обучить нейронную сеть распределённо, используя Keras. Я следую этому руководству: https://www.tensorflow.org/alpha/tutorials/distribute/keras

В этом руководстве используется стратегия MirroredStrategy. Но я хочу запускать код на нескольких машинах. В настоящее время в TF 2.0 для распределённого обучения Keras поддерживаются только стратегии MirroredStrategy и ParameterServerStrategy. Поэтому я просто попробовал заменить стратегию на ParameterServerStrategy (с MirroredStrategy всё работает нормально).

Я получаю следующую ошибку:

RuntimeError: Error copying tensor to device: /job:worker/replica:0/task:0/device:CPU:0. /job:worker/replica:0/task:0/device:CPU:0 unknown device.

Ещё один вопрос, не относящийся к этому сообщению. В TF 2.0 для пользовательских циклов обучения (без использования Keras или Estimator) поддерживается только MirroredStrategy. Кто-нибудь знает, можно ли как-то распределить обучение по нескольким машинам с помощью пользовательских циклов обучения?

Заранее спасибо!

Вот код для первого воркера. Он такой же, как в руководстве, но с добавлением TF_CONFIG и заменой стратегии (TensorFlow 2.0 alpha).

from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf
import tensorflow_datasets as tfds

import os, json

# Load MNIST through tensorflow_datasets. `with_info=True` also returns the
# dataset metadata (`info`), which is used below for the example counts.
datasets, info = tfds.load(name='mnist', with_info=True, as_supervised=True)

# Pick the train and test splits out of the returned mapping.
mnist_train, mnist_test = datasets['train'], datasets['test']

# Describe the cluster through the TF_CONFIG environment variable: two
# "worker" addresses and one "ps" address, with this process declared as
# worker index 0. The x.y.z.w* host names are placeholders.
os.environ['TF_CONFIG'] = json.dumps({
    "cluster": {
        "worker": ["x.y.z.w1:12345", "x.y.z.w2:12345"],
        "ps": ["x.y.z.w3:12345"]
    },
    "task": {"type": "worker", "index": 0}
})

# NOTE(review): TF_CONFIG is set before the strategy is constructed,
# presumably because the strategy reads it at construction time -- confirm
# against the TensorFlow documentation.
strategy = tf.distribute.experimental.ParameterServerStrategy()

print('Number of devices: {}'.format(strategy.num_replicas_in_sync))

# You can also do info.splits.total_num_examples to get the total
# number of examples in the dataset.

# Per-split example counts taken from the dataset metadata.
num_train_examples = info.splits['train'].num_examples
num_test_examples = info.splits['test'].num_examples

# Buffer size handed to Dataset.shuffle() when the training pipeline is built.
BUFFER_SIZE = 10000

# The global batch size is the per-replica size multiplied by the number of
# replicas the strategy reports.
BATCH_SIZE_PER_REPLICA = 64
BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync

def scale(image, label):
  """Cast `image` to float32 and divide it by 255; return `label` unchanged.

  Presumably maps 8-bit pixel values into the [0, 1] range -- the input
  dtype is not visible here, so confirm against the dataset.

  Returns:
    A `(scaled_image, label)` tuple.
  """
  scaled = tf.cast(image, tf.float32) / 255
  return scaled, label

# Training pipeline: rescale each example, shuffle, then batch.
train_dataset = mnist_train.map(scale).shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
# Evaluation pipeline: rescale and batch only (no shuffling).
eval_dataset = mnist_test.map(scale).batch(BATCH_SIZE)

# Build and compile the model inside the strategy scope, so that the model's
# variables are created while the distribution strategy is active.
with strategy.scope():
  # Small CNN: one 3x3 convolution with 32 filters over 28x28x1 inputs,
  # max-pooling, then a 64-unit hidden layer and a 10-way softmax output.
  model = tf.keras.Sequential([
      tf.keras.layers.Conv2D(32, 3, activation='relu', input_shape=(28, 28, 1)),
      tf.keras.layers.MaxPooling2D(),
      tf.keras.layers.Flatten(),
      tf.keras.layers.Dense(64, activation='relu'),
      tf.keras.layers.Dense(10, activation='softmax')
  ])

  # Sparse categorical cross-entropy: labels are expected as integer class
  # ids rather than one-hot vectors (per the Keras loss of that name).
  model.compile(loss='sparse_categorical_crossentropy',
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy'])

# Define the checkpoint directory to store the checkpoints

checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
# The `{epoch}` placeholder is kept literal here; presumably the
# ModelCheckpoint callback fills it in when saving -- confirm in Keras docs.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Function for decaying the learning rate.
# You can define any decay function you need.
def decay(epoch):
  """Return the learning rate to use for the given epoch number.

  Args:
    epoch: Epoch number to look up.

  Returns:
    1e-3 while `epoch` is below 3, 1e-4 while it is below 7, and 1e-5 for
    every later epoch.
  """
  # Thresholds are tested in ascending order, so each check only needs the
  # upper bound of its range.
  if epoch < 3:
    return 1e-3
  if epoch < 7:
    return 1e-4
  return 1e-5

# Callback for printing the LR at the end of each epoch.
class PrintLR(tf.keras.callbacks.Callback):
  """Keras callback that prints the optimizer's learning rate after each epoch."""

  def on_epoch_end(self, epoch, logs=None):
    """Print the current learning rate; `logs` is accepted but not used."""
    # `epoch` is shifted by one for display.
    epoch_number = epoch + 1
    # Reads the module-level `model` object rather than `self.model`.
    current_lr = model.optimizer.lr.numpy()
    message = '\nLearning rate for epoch {} is {}'.format(epoch_number,
                                                          current_lr)
    print(message)

# Callbacks handed to model.fit(): TensorBoard logging to ./logs, per-epoch
# weight checkpoints, the `decay` learning-rate schedule, and the
# learning-rate printer defined above.
callbacks = [
    tf.keras.callbacks.TensorBoard(log_dir='./logs'),
    # save_weights_only=True: store only the weights, not the full model.
    tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix,
                                       save_weights_only=True),
    tf.keras.callbacks.LearningRateScheduler(decay),
    PrintLR()
]

# Train for 12 epochs with the callbacks configured above.
model.fit(train_dataset, epochs=12, callbacks=callbacks)

# Restore the most recent checkpoint written during training.
# NOTE(review): tf.train.latest_checkpoint presumably returns None when the
# directory holds no checkpoint -- confirm, since that case is not handled.
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

# Evaluate the restored weights on the test split.
eval_loss, eval_acc = model.evaluate(eval_dataset)

print('Eval loss: {}, Eval Accuracy: {}'.format(eval_loss, eval_acc))

Ожидаемый результат должен быть примерно таким:

Epoch 1/12
    938/Unknown - 11s 11ms/step - loss: 0.2151 - accuracy: 0.9380
Learning rate for epoch 1 is 0.0010000000474974513
938/938 [==============================] - 11s 11ms/step - loss: 0.2151 - accuracy: 0.9380
Epoch 2/12
937/938 [============================>.] - ETA: 0s - loss: 0.0678 - accuracy: 0.9801
Learning rate for epoch 2 is 0.0010000000474974513
938/938 [==============================] - 7s 7ms/step - loss: 0.0678 - accuracy: 0.9801
Epoch 3/12
935/938 [============================>.] - ETA: 0s - loss: 0.0473 - accuracy: 0.9865
Learning rate for epoch 3 is 0.0010000000474974513
938/938 [==============================] - 7s 8ms/step - loss: 0.0473 - accuracy: 0.9865
Epoch 4/12
931/938 [============================>.] - ETA: 0s - loss: 0.0258 - accuracy: 0.9931
Learning rate for epoch 4 is 9.999999747378752e-05
938/938 [==============================] - 7s 7ms/step - loss: 0.0258 - accuracy: 0.9931
Epoch 5/12
934/938 [============================>.] - ETA: 0s - loss: 0.0229 - accuracy: 0.9944
Learning rate for epoch 5 is 9.999999747378752e-05
938/938 [==============================] - 7s 8ms/step - loss: 0.0228 - accuracy: 0.9944
Epoch 6/12
928/938 [============================>.] - ETA: 0s - loss: 0.0211 - accuracy: 0.9947
Learning rate for epoch 6 is 9.999999747378752e-05
938/938 [==============================] - 7s 7ms/step - loss: 0.0211 - accuracy: 0.9947
Epoch 7/12
928/938 [============================>.] - ETA: 0s - loss: 0.0196 - accuracy: 0.9952
Learning rate for epoch 7 is 9.999999747378752e-05
938/938 [==============================] - 7s 8ms/step - loss: 0.0195 - accuracy: 0.9952
Epoch 8/12
928/938 [============================>.] - ETA: 0s - loss: 0.0169 - accuracy: 0.9961
Learning rate for epoch 8 is 9.999999747378752e-06
938/938 [==============================] - 7s 8ms/step - loss: 0.0168 - accuracy: 0.9961
Epoch 9/12
925/938 [============================>.] - ETA: 0s - loss: 0.0167 - accuracy: 0.9962
Learning rate for epoch 9 is 9.999999747378752e-06
938/938 [==============================] - 7s 8ms/step - loss: 0.0166 - accuracy: 0.9962
Epoch 10/12
935/938 [============================>.] - ETA: 0s - loss: 0.0164 - accuracy: 0.9962
Learning rate for epoch 10 is 9.999999747378752e-06
938/938 [==============================] - 7s 8ms/step - loss: 0.0164 - accuracy: 0.9962
Epoch 11/12
932/938 [============================>.] - ETA: 0s - loss: 0.0163 - accuracy: 0.9963
Learning rate for epoch 11 is 9.999999747378752e-06
938/938 [==============================] - 7s 7ms/step - loss: 0.0162 - accuracy: 0.9963
Epoch 12/12
933/938 [============================>.] - ETA: 0s - loss: 0.0161 - accuracy: 0.9963
Learning rate for epoch 12 is 9.999999747378752e-06
938/938 [==============================] - 7s 7ms/step - loss: 0.0160 - accuracy: 0.9964

И что я получаю:

Traceback (most recent call last):
  File "keras_dist.py", line 58, in <module>
    metrics=['accuracy'])
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/distribute/distribute_lib.py", line 189, in __exit__
    self._device_scope.__exit__(exception_type, exception_value, traceback)
  File "/usr/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/eager/context.py", line 603, in device
    yield
  File "keras_dist.py", line 53, in <module>
    tf.keras.layers.Dense(10, activation='softmax')
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/training/tracking/base.py", line 456, in _method_wrapper
    result = method(self, *args, **kwargs)
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/keras/engine/sequential.py", line 108, in __init__
    self.add(layer)
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/training/tracking/base.py", line 456, in _method_wrapper
    result = method(self, *args, **kwargs)
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/keras/engine/sequential.py", line 169, in add
    layer(x)
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/keras/engine/base_layer.py", line 662, in __call__
    self._set_mask_metadata(inputs, outputs, previous_mask)
  File "/usr/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/keras/engine/base_layer_utils.py", line 391, in call_context
    yield
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/keras/engine/base_layer.py", line 653, in __call__
    self._set_inputs(inputs, outputs)
  File "/usr/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/func_graph.py", line 319, in inner_cm
    self._graph_key = old_graph_key
  File "/usr/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 5486, in get_controller
    yield g
  File "/usr/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 5293, in get_controller
    yield default
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 5486, in get_controller
    yield g
  File "/usr/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/eager/context.py", line 485, in _mode
    yield
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 5486, in get_controller
    yield g
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/func_graph.py", line 314, in inner_cm
    yield g
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/keras/engine/base_layer.py", line 653, in __call__
    self._set_inputs(inputs, outputs)
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 6361, in __exit__
    self._name_scope.__exit__(type_arg, value_arg, traceback_arg)
  File "/usr/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4183, in name_scope
    yield "" if new_stack is None else new_stack + "/"
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/keras/engine/base_layer.py", line 594, in __call__
    self._maybe_build(inputs)
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/keras/engine/base_layer.py", line 1713, in _maybe_build
    self.build(input_shapes)
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/keras/layers/convolutional.py", line 165, in build
    dtype=self.dtype)
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/keras/engine/base_layer.py", line 377, in add_weight
    aggregation=aggregation)
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/training/tracking/base.py", line 622, in _add_variable_with_custom_getter
    **kwargs_for_getter)
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/keras/engine/base_layer_utils.py", line 152, in make_variable
    aggregation=aggregation)
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/ops/variables.py", line 212, in __call__
    return cls._variable_v1_call(*args, **kwargs)
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/ops/variables.py", line 175, in _variable_v1_call
    aggregation=aggregation)
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/ops/variables.py", line 58, in getter
    return captured_getter(captured_previous, **kwargs)
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/distribute/distribute_lib.py", line 823, in creator_with_resource_vars
    return self._create_variable(*args, **kwargs)
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/distribute/parameter_server_strategy.py", line 361, in _create_variable
    return var_creator(*args, **kwargs)
  File "/usr/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4269, in _colocate_with_for_gradient
    yield
  File "/usr/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4333, in colocate_with
    yield
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4269, in _colocate_with_for_gradient
    yield
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/distribute/parameter_server_strategy.py", line 361, in _create_variable
    return var_creator(*args, **kwargs)
  File "/usr/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4411, in device
    yield
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/distribute/parameter_server_strategy.py", line 361, in _create_variable
    return var_creator(*args, **kwargs)
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/ops/variables.py", line 154, in <lambda>
    previous_getter = lambda **kwargs: default_variable_creator(None, **kwargs)
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/ops/variable_scope.py", line 2492, in default_variable_creator
    import_scope=import_scope, distribute_strategy=distribute_strategy)
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/ops/variables.py", line 216, in __call__
    return super(VariableMetaclass, cls).__call__(*args, **kwargs)
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/ops/resource_variable_ops.py", line 422, in __init__
    constraint=constraint)
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/ops/resource_variable_ops.py", line 619, in _init_from_args
    ops.add_to_collections(ops.GraphKeys.GLOBAL_STEP, self)
  File "/usr/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 5622, in init_scope
    yield
  File "/usr/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/eager/context.py", line 485, in _mode
    yield
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 5622, in init_scope
    yield
  File "/usr/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/eager/tape.py", line 122, in stop_recording
    yield
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 5622, in init_scope
    yield
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/ops/resource_variable_ops.py", line 551, in _init_from_args
    graph_mode=self._in_graph_mode)
  File "/usr/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4775, in _attr_scope
    yield  # The code within the context runs here.
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/ops/resource_variable_ops.py", line 545, in _init_from_args
    initial_value() if init_from_fn else initial_value,
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/keras/engine/base_layer_utils.py", line 134, in <lambda>
    init_val = lambda: initializer(shape, dtype=dtype)
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/ops/init_ops_v2.py", line 434, in __call__
    return self._random_generator.random_uniform(shape, -limit, limit, dtype)
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/ops/init_ops_v2.py", line 797, in random_uniform
    shape=shape, minval=minval, maxval=maxval, dtype=dtype, seed=self.seed)
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/ops/random_ops.py", line 240, in random_uniform
    minval = ops.convert_to_tensor(minval, dtype=dtype, name="min")
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1050, in convert_to_tensor
    return convert_to_tensor_v2(value, dtype, preferred_dtype, name)
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1108, in convert_to_tensor_v2
    as_ref=False)
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1186, in internal_convert_to_tensor
    ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/constant_op.py", line 304, in _constant_tensor_conversion_function
    return constant(v, dtype=dtype, name=name)
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/constant_op.py", line 245, in constant
    allow_broadcast=True)
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/constant_op.py", line 253, in _constant_impl
    t = convert_to_eager_tensor(value, ctx, dtype)
  File "/home/ubuntu/.local/lib/python3.5/site-packages/tensorflow/python/framework/constant_op.py", line 110, in convert_to_eager_tensor
    t = ops.EagerTensor(value, handle, device, dtype)
RuntimeError: Error copying tensor to device: /job:worker/replica:0/task:0/device:CPU:0. /job:worker/replica:0/task:0/device:CPU:0 unknown device.
Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...