Why does basic mixed-precision CNN training fail on TensorFlow (NVIDIA fork)?
0 votes
07 May 2019

I am trying to train a basic CNN with mixed precision, i.e. using the Tensor Cores of an RTX 2080. The code works fine with stock TensorFlow 1.13.1 on Ubuntu, but fails with NVIDIA's fork of TensorFlow with "ValueError: No variables provided." I have no idea why. Can anyone help? Details below.

The code is taken from the NVIDIA docs, i.e. Part 3 of https://devblogs.nvidia.com/video-mixed-precision-techniques-tensor-cores-deep-learning/ : it keeps fp32 master copies of the weights via a custom variable getter and applies loss scaling around the gradient computation.
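
As background on why the code multiplies the loss by loss_scale = 128 and divides the gradients by the same factor afterwards: very small gradient values underflow to zero in fp16, and scaling the loss shifts them back into representable range. A quick NumPy illustration of the underflow (my own sketch, not from the NVIDIA post):

import numpy as np

g = np.float32(1e-8)            # a tiny but plausible gradient magnitude
print(np.float16(g))            # 0.0: this value underflows in fp16
print(np.float16(g * 128.0))    # nonzero (~1.25e-06) once scaled by 128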

import tensorflow as tf
import numpy as np



def float32_variable_storage_getter(getter, name, shape=None, dtype=None,
                                    initializer=None, regularizer=None,
                                    trainable=True,
                                    *args, **kwargs):
    # Custom getter: store trainable variables in fp32 ("master" weights),
    # but return them cast to the requested compute dtype (fp16 here).
    storage_dtype = tf.float32 if trainable else dtype
    variable = getter(name, shape, dtype=storage_dtype,
                      initializer=initializer, regularizer=regularizer,
                      trainable=trainable,
                      *args, **kwargs)
    if trainable and dtype != tf.float32:
        variable = tf.cast(variable, dtype)
    return variable

def bm(inputs):
    # Minimal conv net: conv -> batch norm -> max pool -> flatten -> dense.
    _, _, h, w = inputs.get_shape().as_list()
    top_layer = tf.layers.conv2d(inputs, 64, 7, use_bias=False,
                                 data_format='channels_first', padding='SAME')
    top_layer = tf.contrib.layers.batch_norm(top_layer, data_format='NCHW', fused=True)
    top_layer = tf.layers.max_pooling2d(top_layer, 2, 2, data_format='channels_first')
    top_layer = tf.reshape(top_layer, (-1, 64 * (h // 2) * (w // 2)))
    top_layer = tf.layers.dense(top_layer, 128, activation=tf.nn.relu)
    return top_layer


def btm(inputs, labels, nlabel):
    # Compute in fp16 on the GPU, with fp32 master weights via the custom getter.
    inputs = tf.cast(inputs, tf.float16)
    with tf.device('/gpu:0'), tf.variable_scope('fp32_vars', custom_getter=float32_variable_storage_getter):
        top_layer = bm(inputs)
        logits = tf.layers.dense(top_layer, nlabel, activation=None)
    # Compute the loss in fp32 for numerical stability.
    logits = tf.cast(logits, tf.float32)
    loss = tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=labels)
    optimizer = tf.train.MomentumOptimizer(learning_rate=0.01, momentum=0.9)
    # Loss scaling: scale the loss up before backprop, scale the gradients back down.
    loss_scale = 128.0
    grads, varis = zip(*optimizer.compute_gradients(loss * loss_scale))
    grads, _ = tf.clip_by_global_norm(grads, 5.0)
    grads = [grad / loss_scale for grad in grads]
    train_op = optimizer.apply_gradients(zip(grads, varis))
    return inputs, labels, loss, train_op

# Build the graph: 100-way classification of random 3x224x224 NCHW images.
nchan, h, w, nlabel = 3, 224, 224, 100
inputs = tf.placeholder(tf.float32, (None, nchan, h, w))
labels = tf.placeholder(tf.int32, (None,))
inputs, labels, loss, train_op = btm(inputs, labels, nlabel)

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)

# Run a few training steps on random data.
bs = 128
inputs_np = np.random.random(size=(bs, nchan, h, w)).astype(np.float32)
labels_np = np.random.randint(nlabel, size=(bs,)).astype(np.int32)
sess.run(tf.global_variables_initializer())
for step in range(20):
    loss_np, _ = sess.run([loss, train_op], {inputs: inputs_np, labels: labels_np})
    print("Loss", loss_np)

The output with the official TF 1.13.1 is fine:

...
Loss 5.3065777
Loss 5.251279
Loss 5.1554813
Loss 5.036022
Loss 4.9095006
Loss 4.788646
Loss 4.680414
Loss 4.584101
Loss 4.5076714
Loss 4.4439754
Loss 4.3807573
Loss 4.3181067
Loss 4.253341
Loss 4.1884956
Loss 4.1250153
Loss 4.0654
Loss 4.0059204
Loss 3.9472775
Loss 3.8848066
Loss 3.8182044

Process finished with exit code 0

With NVIDIA's TensorFlow distribution (https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow) I get:

WARNING:tensorflow:From test.py:20: conv2d (from tensorflow.python.layers.convolutional) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.conv2d instead.
WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.

WARNING: The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

WARNING:tensorflow:From test.py:22: max_pooling2d (from tensorflow.python.layers.pooling) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.max_pooling2d instead.
WARNING:tensorflow:From test.py:24: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.dense instead.
WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/losses/losses_impl.py:209: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.
Traceback (most recent call last):
  File "test.py", line 47, in <module>
    inputs,labels,loss,train_op=btm(inputs,labels,nlabel)
  File "test.py", line 41, in btm
    train_op = optimizer.apply_gradients(zip(grads,varis))#gradvars)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/optimizer.py", line 604, in apply_gradients
    name=name)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/util/deprecation.py", line 507, in new_func
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/control_flow_ops.py", line 2097, in cond
    orig_res_t, res_t = context_t.BuildCondBranch(true_fn)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/control_flow_ops.py", line 1941, in BuildCondBranch
    original_result = fn()
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/optimizer.py", line 597, in do_update
    name+'-apply')
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/optimizer.py", line 629, in _apply_gradients_helper
    raise ValueError("No variables provided.")
ValueError: No variables provided.
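
Since the exception is raised inside apply_gradients, one check (my own debugging sketch, not from the NVIDIA post) is to materialize the (gradient, variable) pairs before handing them over. Note that zip() in Python 3 returns a one-shot iterator, so if the fork's optimizer were to iterate over the argument more than once, a second pass would see it empty:

# Debugging sketch: turn the one-shot zip iterator into a list, so that it
# can be printed here and safely consumed again inside apply_gradients.
gradvars = list(zip(grads, varis))
print(len(gradvars), [v.name for _, v in gradvars])
train_op = optimizer.apply_gradients(gradvars)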



...