Я пытаюсь запустить пользовательский цикл обучения на нескольких графических процессорах, используя tf.distribute.MirroredStrategy().
На одном графическом процессоре цикл обучения отлично работает, но при попытке использовать несколько графических процессоров выбрасывается исключение ValueError: `handle` is not available outside the replica context or a `tf.distribute.Strategy.update()` call.
Я использую TensorFlow 1.14 и Python 3.7.3.
Ниже я привёл минимальный пример того, что я пробовал. Пользовательский цикл обучения работает без проблем на одном графическом процессоре, но попытка использовать tf.distribute.MirroredStrategy()
для нескольких графических процессоров завершается следующей ошибкой (полный вывод):
ValueError Traceback (most recent call last)
<ipython-input-11-3fda5d330457> in <module>
1 with mirrored_strategy.scope():
----> 2 model, train_op, X1_in, X2_in = create_model_and_train_op()
3 with tf.Session() as sess:
4 sess.run(tf.global_variables_initializer())
5 for sample_ind in range(n_samples):
<ipython-input-7-8f5b3971bbe2> in create_model_and_train_op()
6
7 model = Model(name='BNN',inputs=[X1_in,X2_in], outputs=[loss])
----> 8 train_op = tf.train.AdamOptimizer().minimize(loss)
9
10 return model, train_op, X1_in, X2_in
~/.local/share/virtualenvs/keras_bnn_lv-NP1oBJBi/lib/python3.7/site-packages/tensorflow/python/training/optimizer.py in minimize(self, loss, global_step, var_list, gate_gradients, aggregation_method, colocate_gradients_with_ops, name, grad_loss)
401 aggregation_method=aggregation_method,
402 colocate_gradients_with_ops=colocate_gradients_with_ops,
--> 403 grad_loss=grad_loss)
404
405 vars_with_grad = [v for g, v in grads_and_vars if g is not None]
~/.local/share/virtualenvs/keras_bnn_lv-NP1oBJBi/lib/python3.7/site-packages/tensorflow/python/training/optimizer.py in compute_gradients(self, loss, var_list, gate_gradients, aggregation_method, colocate_gradients_with_ops, grad_loss)
510 gate_gradients=(gate_gradients == Optimizer.GATE_OP),
511 aggregation_method=aggregation_method,
--> 512 colocate_gradients_with_ops=colocate_gradients_with_ops)
513 if gate_gradients == Optimizer.GATE_GRAPH:
514 grads = control_flow_ops.tuple(grads)
~/.local/share/virtualenvs/keras_bnn_lv-NP1oBJBi/lib/python3.7/site-packages/tensorflow/python/ops/gradients_impl.py in gradients(ys, xs, grad_ys, name, colocate_gradients_with_ops, gate_gradients, aggregation_method, stop_gradients, unconnected_gradients)
156 ys, xs, grad_ys, name, colocate_gradients_with_ops,
157 gate_gradients, aggregation_method, stop_gradients,
--> 158 unconnected_gradients)
159 # pylint: enable=protected-access
160
~/.local/share/virtualenvs/keras_bnn_lv-NP1oBJBi/lib/python3.7/site-packages/tensorflow/python/ops/gradients_util.py in _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops, gate_gradients, aggregation_method, stop_gradients, unconnected_gradients, src_graph)
595 xs = [
596 x.handle if resource_variable_ops.is_resource_variable(x) else x
--> 597 for x in xs
598 ]
599 xs = ops.internal_convert_n_to_tensor_or_indexed_slices(
~/.local/share/virtualenvs/keras_bnn_lv-NP1oBJBi/lib/python3.7/site-packages/tensorflow/python/ops/gradients_util.py in <listcomp>(.0)
595 xs = [
596 x.handle if resource_variable_ops.is_resource_variable(x) else x
--> 597 for x in xs
598 ]
599 xs = ops.internal_convert_n_to_tensor_or_indexed_slices(
~/.local/share/virtualenvs/keras_bnn_lv-NP1oBJBi/lib/python3.7/site-packages/tensorflow/python/distribute/values.py in handle(self)
641 device = distribute_lib.get_update_device()
642 if device is None:
--> 643 raise ValueError("`handle` is not available outside the replica context"
644 " or a `tf.distribute.Strategy.update()` call.")
645 return self.get(device=device).handle
ValueError: `handle` is not available outside the replica context or a `tf.distribute.Strategy.update()` call.
Единственное исправление, которое предложил Google, — обновиться до версии 2.0.0-beta. Интересно, есть ли способ решить эту проблему и в 1.14?
Вот минимальный пример того, что я пробовал:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Concatenate
from tensorflow.keras.models import Model
import sys
# Report the interpreter and TensorFlow versions used for this run.
print(sys.version)
print(tf.__version__)

# Width of each input vector and number of training samples.
input_dim, n_samples = 42, 10000

# Two random input matrices with one row per sample; x1_data is drawn
# before x2_data.
x1_data = np.random.rand(n_samples, input_dim)
x2_data = np.random.rand(n_samples, input_dim)
def create_model_and_train_op():
    """Build the two-input Keras model together with its Adam training op.

    Two inputs of shape ``(input_dim,)`` are concatenated along the last
    axis and passed through a single ``Dense(1)`` layer. The output of that
    layer is the tensor handed to the optimizer.

    Returns:
        A tuple ``(model, train_op, X1_in, X2_in)``: the Keras ``Model``
        named ``'BNN'``, the op returned by ``AdamOptimizer().minimize``,
        and the two input tensors.
    """
    X1_in = Input(shape=(input_dim,))
    X2_in = Input(shape=(input_dim,))

    merged = Concatenate(axis=-1)([X1_in, X2_in])
    loss = Dense(1)(merged)

    model = Model(name='BNN', inputs=[X1_in, X2_in], outputs=[loss])

    # Error message is thrown in the minimize() call below if using
    # MirroredStrategy()
    optimizer = tf.train.AdamOptimizer()
    train_op = optimizer.minimize(loss)

    return model, train_op, X1_in, X2_in
##### Single GPU: Runs without problems
model, train_op, X1_in, X2_in = create_model_and_train_op()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # One training step per sample; each row is reshaped to a batch of one.
    for sample_ind in range(n_samples):
        sess.run(
            train_op,
            feed_dict={
                X1_in: x1_data[sample_ind].reshape(1, input_dim),
                X2_in: x2_data[sample_ind].reshape(1, input_dim),
            },
        )
##### Multiple GPU: Results in error message
mirrored_strategy = tf.distribute.MirroredStrategy()
print('Number of devices: {}'.format(mirrored_strategy.num_replicas_in_sync))

# NOTE(review): the original paste lost its indentation; the session is
# assumed to be nested inside the strategy scope -- confirm.
with mirrored_strategy.scope():
    # The reported ValueError is raised while this call builds the train op.
    model, train_op, X1_in, X2_in = create_model_and_train_op()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # One training step per sample; each row is reshaped to a batch of one.
        for sample_ind in range(n_samples):
            sess.run(
                train_op,
                feed_dict={
                    X1_in: x1_data[sample_ind].reshape(1, input_dim),
                    X2_in: x2_data[sample_ind].reshape(1, input_dim),
                },
            )