Я пытаюсь обучить основному CNN, используя смешанную точность, т.е. используя тензорные ядра RTX 2080. Код прекрасно работает с tenorflow 13.1 в Ubuntu, но завершается неудачно с вилкой tenorflow от nvidia, говоря «Переменные не найдены». Понятия не имею почему. Кто-нибудь может помочь? Подробности ниже.
Код взят от NVidia Docs, т.е. Часть 3 на https://devblogs.nvidia.com/video-mixed-precision-techniques-tensor-cores-deep-learning/
import tensorflow as tf
import numpy as np
def float32_variable_storage_getter(getter, name, shape=None, dtype=None,
initializer=None, regularizer=None,
trainable=True,
*args, **kwargs):
storage_dtype = tf.float32 if trainable else dtype
variable = getter(name, shape, dtype=storage_dtype,
initializer=initializer, regularizer=regularizer,
trainable=trainable,
*args, **kwargs)
if trainable and dtype != tf.float32: variable = tf.cast(variable, dtype)
return variable
def bm(inputs):
_,_,h,w=inputs.get_shape().as_list()
top_layer =tf.layers.conv2d(inputs,64,7,use_bias=False,data_format='channels_first',padding='SAME')
top_layer=tf.contrib.layers.batch_norm(top_layer,data_format="NCHW",fused=True)
top_layer=tf.layers.max_pooling2d(top_layer,2,2,data_format='channels_first')
top_layer=tf.reshape(top_layer,(-1,64*(h//2)*(w//2)))
top_layer=tf.layers.dense(top_layer,128,activation=tf.nn.relu)
return top_layer
def btm(inputs,labels,nlabel):
inputs=tf.cast(inputs,tf.float16)
with tf.device('/gpu:0'),tf.variable_scope('fp32_vars', custom_getter=float32_variable_storage_getter):
top_layer=bm(inputs)
logits=tf.layers.dense(top_layer,nlabel,activation=None)
logits=tf.cast(logits, tf.float32)
loss = tf.losses.sparse_softmax_cross_entropy( logits=logits,labels=labels)
optimizer = tf.train.MomentumOptimizer(learning_rate=0.01, momentum=0.9)
loss_scale=128.0
grads, varis = zip(*optimizer.compute_gradients(loss * loss_scale))
grads,_ =tf.clip_by_global_norm(grads,5.0)
grads = [grad / loss_scale for grad in grads]
#gradvars= optimizer.compute_gradients(loss)
train_op = optimizer.apply_gradients(zip(grads,varis))#gradvars)
return inputs,labels,loss,train_op
nchan,h,w,nlabel=3,224,224,100
inputs=tf.placeholder(tf.float32,(None,nchan,h,w))
labels=tf.placeholder(tf.int32,(None,))
inputs,labels,loss,train_op=btm(inputs,labels,nlabel)
from tensorflow import ConfigProto
config = ConfigProto()
config.gpu_options.allow_growth = True
sess=tf.Session(config=config)
bs=128
inputs_np=np.random.random(size=(bs,nchan,h,w)).astype(np.float32)
labels_np=np.random.randint(nlabel,size=(bs,)).astype(np.int32)
sess.run(tf.global_variables_initializer())
for step in range(20):
loss_np,_=sess.run([loss,train_op],{inputs:inputs_np,labels:labels_np})
print("Loss",loss_np)
Вывод с использованием официального TF 13.1. все в порядке:
...
Loss 5.3065777
Loss 5.251279
Loss 5.1554813
Loss 5.036022
Loss 4.9095006
Loss 4.788646
Loss 4.680414
Loss 4.584101
Loss 4.5076714
Loss 4.4439754
Loss 4.3807573
Loss 4.3181067
Loss 4.253341
Loss 4.1884956
Loss 4.1250153
Loss 4.0654
Loss 4.0059204
Loss 3.9472775
Loss 3.8848066
Loss 3.8182044
Process finished with exit code 0
Используя распределение тензорного потока NVidias, я получаю (https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow)
WARNING:tensorflow:From test.py:20: conv2d (from tensorflow.python.layers.convolutional) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.conv2d instead.
WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.
WARNING: The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
* https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
* https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.
WARNING:tensorflow:From test.py:22: max_pooling2d (from tensorflow.python.layers.pooling) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.max_pooling2d instead.
WARNING:tensorflow:From test.py:24: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.dense instead.
WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/losses/losses_impl.py:209: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.
Traceback (most recent call last):
File "test.py", line 47, in <module>
inputs,labels,loss,train_op=btm(inputs,labels,nlabel)
File "test.py", line 41, in btm
train_op = optimizer.apply_gradients(zip(grads,varis))#gradvars)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/optimizer.py", line 604, in apply_gradients
name=name)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/util/deprecation.py", line 507, in new_func
return func(*args, **kwargs)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/control_flow_ops.py", line 2097, in cond
orig_res_t, res_t = context_t.BuildCondBranch(true_fn)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/control_flow_ops.py", line 1941, in BuildCondBranch
original_result = fn()
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/optimizer.py", line 597, in do_update
name+'-apply')
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/optimizer.py", line 629, in _apply_gradients_helper
raise ValueError("No variables provided.")
ValueError: No variables provided.