Custom training loop with tensorflow-gpu 1.14 and tf.distribute.MirroredStrategy() results in a ValueError
1 vote
01 July 2019

I am trying to run a custom training loop on multiple GPUs using tf.distribute.MirroredStrategy(). While the training loop works fine on a single GPU, a ValueError: 'handle' is not available outside the replica context or a 'tf.distribute.Strategy.update()' call is raised as soon as I try to use multiple GPUs. I am using tensorflow 1.14 and Python 3.7.3.

I have included a minimal example of what I tried below. The custom training loop runs without problems on a single GPU, but my attempt to use tf.distribute.MirroredStrategy() across multiple GPUs fails with the following error message (full output):

ValueError                                Traceback (most recent call last)
<ipython-input-11-3fda5d330457> in <module>
      1 with mirrored_strategy.scope():
----> 2     model, train_op, X1_in, X2_in = create_model_and_train_op()
      3     with tf.Session() as sess:
      4         sess.run(tf.global_variables_initializer())
      5         for sample_ind in range(n_samples):

<ipython-input-7-8f5b3971bbe2> in create_model_and_train_op()
      6 
      7     model = Model(name='BNN',inputs=[X1_in,X2_in], outputs=[loss])
----> 8     train_op = tf.train.AdamOptimizer().minimize(loss)
      9 
     10     return model, train_op, X1_in, X2_in

~/.local/share/virtualenvs/keras_bnn_lv-NP1oBJBi/lib/python3.7/site-packages/tensorflow/python/training/optimizer.py in minimize(self, loss, global_step, var_list, gate_gradients, aggregation_method, colocate_gradients_with_ops, name, grad_loss)
    401         aggregation_method=aggregation_method,
    402         colocate_gradients_with_ops=colocate_gradients_with_ops,
--> 403         grad_loss=grad_loss)
    404 
    405     vars_with_grad = [v for g, v in grads_and_vars if g is not None]

~/.local/share/virtualenvs/keras_bnn_lv-NP1oBJBi/lib/python3.7/site-packages/tensorflow/python/training/optimizer.py in compute_gradients(self, loss, var_list, gate_gradients, aggregation_method, colocate_gradients_with_ops, grad_loss)
    510         gate_gradients=(gate_gradients == Optimizer.GATE_OP),
    511         aggregation_method=aggregation_method,
--> 512         colocate_gradients_with_ops=colocate_gradients_with_ops)
    513     if gate_gradients == Optimizer.GATE_GRAPH:
    514       grads = control_flow_ops.tuple(grads)

~/.local/share/virtualenvs/keras_bnn_lv-NP1oBJBi/lib/python3.7/site-packages/tensorflow/python/ops/gradients_impl.py in gradients(ys, xs, grad_ys, name, colocate_gradients_with_ops, gate_gradients, aggregation_method, stop_gradients, unconnected_gradients)
    156         ys, xs, grad_ys, name, colocate_gradients_with_ops,
    157         gate_gradients, aggregation_method, stop_gradients,
--> 158         unconnected_gradients)
    159   # pylint: enable=protected-access
    160 

~/.local/share/virtualenvs/keras_bnn_lv-NP1oBJBi/lib/python3.7/site-packages/tensorflow/python/ops/gradients_util.py in _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops, gate_gradients, aggregation_method, stop_gradients, unconnected_gradients, src_graph)
    595     xs = [
    596         x.handle if resource_variable_ops.is_resource_variable(x) else x
--> 597         for x in xs
    598     ]
    599     xs = ops.internal_convert_n_to_tensor_or_indexed_slices(

~/.local/share/virtualenvs/keras_bnn_lv-NP1oBJBi/lib/python3.7/site-packages/tensorflow/python/ops/gradients_util.py in <listcomp>(.0)
    595     xs = [
    596         x.handle if resource_variable_ops.is_resource_variable(x) else x
--> 597         for x in xs
    598     ]
    599     xs = ops.internal_convert_n_to_tensor_or_indexed_slices(

~/.local/share/virtualenvs/keras_bnn_lv-NP1oBJBi/lib/python3.7/site-packages/tensorflow/python/distribute/values.py in handle(self)
    641       device = distribute_lib.get_update_device()
    642       if device is None:
--> 643         raise ValueError("`handle` is not available outside the replica context"
    644                          " or a `tf.distribute.Strategy.update()` call.")
    645     return self.get(device=device).handle

ValueError: `handle` is not available outside the replica context or a `tf.distribute.Strategy.update()` call.

The only fix Google turns up is upgrading to version 2.0.0-beta. I am wondering whether there is a way to solve this in 1.14 as well.

Here is the minimal example of what I tried:

import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Concatenate
from tensorflow.keras.models import Model

import sys
print (sys.version)
print(tf.__version__)

input_dim = 42
n_samples = 10000

x1_data = np.random.rand(n_samples,input_dim)
x2_data = np.random.rand(n_samples,input_dim)

def create_model_and_train_op():
    X1_in = Input(shape=(input_dim,))
    X2_in = Input(shape=(input_dim,))
    XY = Concatenate(axis=-1)([X1_in,X2_in])
    loss = Dense(1)(XY)
    model = Model(name='BNN',inputs=[X1_in,X2_in], outputs=[loss])

    # Error message is thrown in the following line if using MirroredStrategy()
    train_op = tf.train.AdamOptimizer().minimize(loss)

    return model, train_op, X1_in, X2_in


##### Single GPU: Runs without problems
model, train_op, X1_in, X2_in = create_model_and_train_op()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for sample_ind in range(n_samples):
        sess.run(train_op, feed_dict = {X1_in : x1_data[sample_ind].reshape(1,input_dim) , X2_in : x2_data[sample_ind].reshape(1,input_dim) })


##### Multiple GPU: Results in error message
mirrored_strategy = tf.distribute.MirroredStrategy()
print('Number of devices: {}'.format(mirrored_strategy.num_replicas_in_sync))

with mirrored_strategy.scope():
    model, train_op, X1_in, X2_in = create_model_and_train_op()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for sample_ind in range(n_samples):
            sess.run(train_op, feed_dict = {X1_in : x1_data[sample_ind].reshape(1,input_dim) , X2_in : x2_data[sample_ind].reshape(1,input_dim) })
...
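
From reading the error message and the tf.distribute docs for 1.x, my guess is that the train op has to be built inside a replica context (e.g. via strategy.extended.call_for_each_replica()) rather than directly under the scope. Below is a rough, untested sketch of what I think that would look like in 1.14. The call_for_each_replica()/unwrap() usage is my assumption, and this version still feeds every replica the same batch, so it is not real data parallelism yet:

# Reuses the imports, input_dim, n_samples, x1_data and x2_data from above.
mirrored_strategy = tf.distribute.MirroredStrategy()

with mirrored_strategy.scope():
    # Build the Keras model once under the scope; its weights become
    # MirroredVariables.
    X1_in = Input(shape=(input_dim,))
    X2_in = Input(shape=(input_dim,))
    XY = Concatenate(axis=-1)([X1_in, X2_in])
    out = Dense(1)(XY)
    model = Model(name='BNN', inputs=[X1_in, X2_in], outputs=[out])

    optimizer = tf.train.AdamOptimizer()

    # Plain placeholders used for feeding; in this sketch every replica
    # sees the same batch.
    X1_ph = tf.placeholder(tf.float32, shape=(None, input_dim))
    X2_ph = tf.placeholder(tf.float32, shape=(None, input_dim))

    def step_fn():
        # Everything here runs in a replica context, so minimize() should
        # be allowed to touch the mirrored variables' handles.
        per_replica_out = model([X1_ph, X2_ph])
        per_replica_loss = tf.reduce_mean(per_replica_out)
        return optimizer.minimize(per_replica_loss)

    # My assumption: call_for_each_replica() builds the step once per GPU
    # and unwrap() returns the per-replica update ops, which I then group.
    per_replica_op = mirrored_strategy.extended.call_for_each_replica(step_fn)
    train_op = tf.group(*mirrored_strategy.unwrap(per_replica_op))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for sample_ind in range(n_samples):
        sess.run(train_op, feed_dict={
            X1_ph: x1_data[sample_ind].reshape(1, input_dim),
            X2_ph: x2_data[sample_ind].reshape(1, input_dim)})

Is something along these lines the intended way in 1.14, or is there a cleaner option (for example a tf.data input pipeline with strategy.experimental_run) that would also split the data across the GPUs?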