Не удаётся задействовать TensorCore по руководству Tensorflow Mixed Precision - PullRequest
0 голосов
/ 06 мая 2020

Я следовал руководству по смешанной точности от Tensorflow: https://www.tensorflow.org/guide/keras/mixed_precision, но, видимо, у меня не получается задействовать TensorCore. Моя конфигурация:

Поскольку я не получил никакого ускорения (на самом деле после включения смешанной точности обучение стало даже медленнее), я решил использовать TensorBoard, чтобы посмотреть, выполняются ли вообще какие-либо 16-битные вычисления. Для этого я немного изменил свой код, чтобы было больше итераций и часть из них записывалась профилировщиком. Я убедился, что действительно получаю 0% 16-битных вычислений.

Поскольку, судя по руководству по смешанной точности, задействовать тензорные ядра несложно, мне интересно, в чём может быть проблема в моём случае. Чего мне не хватает на Windows? Буду благодарен за любую помощь.

Мой код (возможно, его потребуется запустить от имени администратора, чтобы получить права на запись):

import tensorflow as tf

import datetime

import os.path


from tensorflow.keras import datasets, layers, models

from tensorflow.keras.mixed_precision import experimental as mixed_precision

import matplotlib.pyplot as plt



# Build the 'mixed_float16' policy and install it through set_policy so the
# layers created later in this script pick it up.
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_policy(policy)

# Report the dtypes the policy exposes for computation and for variables.
compute_dtype = policy.compute_dtype
variable_dtype = policy.variable_dtype
print('Compute dtype: %s' % compute_dtype)
print('Variable dtype: %s' % variable_dtype)



# Load the MNIST train/test splits through the Keras dataset helper.
mnist = tf.keras.datasets.mnist
train_split, test_split = mnist.load_data()
train_images, train_labels = train_split
test_images, test_labels = test_split

# Flatten every image to a 784-element row, convert to float32 and
# normalize pixel values to be between 0 and 1.
train_images = train_images.reshape(60000, 784)
train_images = train_images.astype('float32') / 255
test_images = test_images.reshape(10000, 784)
test_images = test_images.astype('float32') / 255



# Build the classifier with the Keras functional API, starting from a
# 784-feature input named 'digits'.
inputs = tf.keras.Input(shape=(784,), name='digits')

# Choose the hidden-layer width depending on whether TensorFlow reports a GPU.
gpu_devices = tf.config.list_physical_devices('GPU')
if not gpu_devices:
    # Use fewer units on CPUs so the model finishes in a reasonable amount of time
    print('The model will run with 64 units on a CPU')
    num_units = 64
else:
    print('The model will run with 4096 units on a GPU')
    num_units = 4096

# Two ReLU hidden layers of equal width; dense1 keeps its own name so that
# its kernel variable can be inspected below.
dense1 = layers.Dense(num_units, activation='relu', name='dense_1')
dense2 = layers.Dense(num_units, activation='relu', name='dense_2')
x = dense2(dense1(inputs))

# Show the dtype of the hidden activations next to the dtype of the stored
# weights ('kernel' is dense1's variable).
print('x.dtype: %s' % x.dtype.name)
print('dense1.kernel.dtype: %s' % dense1.kernel.dtype.name)

# 10-way logits followed by a softmax layer that is explicitly created with
# dtype='float32', overriding whatever the global policy would pick.
logits_layer = layers.Dense(10, name='dense_logits')
x = logits_layer(x)
softmax_layer = layers.Activation('softmax', dtype='float32', name='predictions')
outputs = softmax_layer(x)
print('Outputs dtype: %s' % outputs.dtype.name)

model = tf.keras.Model(inputs=inputs, outputs=outputs)



# Configure training: RMSprop with its default settings, sparse categorical
# cross-entropy as the loss, and accuracy as the only reported metric.
rmsprop = tf.keras.optimizers.RMSprop()
model.compile(
    optimizer=rmsprop,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)



# Each run writes to its own timestamped directory under logs/fit_MNIST.
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = os.path.join("logs", "fit_MNIST", run_stamp)

# TensorBoard callback with profiling requested for the batch range
# '400, 600'.
# NOTE(review): histogram_freq=1 presumably logs histograms every epoch;
# confirm against the TensorBoard callback documentation.
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
    profile_batch='400, 600',
)



# Snapshot of the model weights taken before training starts; nothing in
# the visible part of the script reads it back.
initial_weights = model.get_weights()

# Train for 5 epochs with batches of 64, validating on the test split and
# logging through the TensorBoard callback.
# NOTE(review): a batch of 64 may be too small to keep the GPU busy, and the
# TensorFlow mixed precision guide appears to use a much larger batch on
# GPUs; confirm before comparing float16 and float32 timings.
fit_options = dict(
    batch_size=64,
    epochs=5,
    validation_data=(test_images, test_labels),
    callbacks=[tensorboard_callback],
)
history = model.fit(train_images, train_labels, **fit_options)



# Evaluate on the held-out test split. The model is compiled with a loss
# and a single 'accuracy' metric, so evaluate() returns the pair
# [loss, accuracy]; unpack it so that test_acc really is the accuracy
# scalar instead of the whole list.
test_loss, test_acc = model.evaluate(test_images, test_labels, verbose=2)

# Print only the test accuracy.
print(test_acc)

И журнал выполнения:

C:\Users\Michael\PycharmProjects\BVS>py -3.7 -m MNIST.train
2020-05-06 09:53:59.760686: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cudart64_101.dll
2020-05-06 09:54:01.903395: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library nvcuda.dll
2020-05-06 09:54:01.940372: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1544] Found device 0 with properties:
pciBusID: 0000:65:00.0 name: GeForce RTX 2080 Ti computeCapability: 7.5
coreClock: 1.545GHz coreCount: 68 deviceMemorySize: 11.00GiB deviceMemoryBandwidth: 573.69GiB/s
2020-05-06 09:54:01.953851: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cudart64_101.dll
2020-05-06 09:54:01.967405: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cublas64_10.dll
2020-05-06 09:54:01.982849: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cufft64_10.dll
2020-05-06 09:54:01.989259: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library curand64_10.dll
2020-05-06 09:54:02.000515: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cusolver64_10.dll
2020-05-06 09:54:02.008599: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cusparse64_10.dll
2020-05-06 09:54:02.036045: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cudnn64_7.dll
2020-05-06 09:54:02.042418: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1686] Adding visible gpu devices: 0
2020-05-06 09:54:02.048313: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with Intel(R) MKL-DNN to use the following CPU instructions in performance-critical operations:  AVX2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2020-05-06 09:54:02.081395: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x17acc96b860 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2020-05-06 09:54:02.087398: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
2020-05-06 09:54:02.093847: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1544] Found device 0 with properties:
pciBusID: 0000:65:00.0 name: GeForce RTX 2080 Ti computeCapability: 7.5
coreClock: 1.545GHz coreCount: 68 deviceMemorySize: 11.00GiB deviceMemoryBandwidth: 573.69GiB/s
2020-05-06 09:54:02.104982: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cudart64_101.dll
2020-05-06 09:54:02.109817: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cublas64_10.dll
2020-05-06 09:54:02.115502: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cufft64_10.dll
2020-05-06 09:54:02.119405: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library curand64_10.dll
2020-05-06 09:54:02.125012: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cusolver64_10.dll
2020-05-06 09:54:02.129275: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cusparse64_10.dll
2020-05-06 09:54:02.135712: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cudnn64_7.dll
2020-05-06 09:54:02.140090: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1686] Adding visible gpu devices: 0
2020-05-06 09:54:02.818915: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1085] Device interconnect StreamExecutor with strength 1 edge matrix:
2020-05-06 09:54:02.826267: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1091]      0
2020-05-06 09:54:02.832173: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1104] 0:   N
2020-05-06 09:54:02.837490: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1230] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 8584 MB memory) -> physical GPU (device: 0, name: GeForce RTX 2080 Ti, pci bus id: 0000:65:00.0, compute capability: 7.5)
2020-05-06 09:54:02.860430: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x17aee2fe5b0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2020-05-06 09:54:02.867611: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): GeForce RTX 2080 Ti, Compute Capability 7.5
Compute dtype: float16
Variable dtype: float32
The model will run with 4096 units on a GPU
x.dtype: float16
dense1.kernel.dtype: float32
Outputs dtype: float32
2020-05-06 09:54:03.793536: I tensorflow/core/profiler/lib/profiler_session.cc:154] Profiler session started.
2020-05-06 09:54:03.798555: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1372] Profiler found 1 GPUs
2020-05-06 09:54:03.806491: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cupti64_101.dll
2020-05-06 09:54:03.933366: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1494] CUPTI activity buffer flushed
Epoch 1/5
2020-05-06 09:54:04.667227: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cublas64_10.dll
397/938 [===========>..................] - ETA: 5s - loss: 0.3610 - accuracy: 0.90472020-05-06 09:54:09.555345: I tensorflow/core/profiler/lib/profiler_session.cc:154] Profiler session started.
596/938 [==================>...........] - ETA: 3s - loss: 0.3028 - accuracy: 0.9194WARNING:tensorflow:From C:\Users\Michael\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\ops\summary_ops_v2.py:1277: stop (from tensorflow.python.eager.profiler) is deprecated and will be removed after 2020-07-01.
Instructions for updating:
use `tf.profiler.experimental.stop` instead.
2020-05-06 09:54:12.391419: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1494] CUPTI activity buffer flushed
2020-05-06 09:54:12.449130: I tensorflow/core/profiler/internal/gpu/device_tracer.cc:217]  GpuTracer has collected 32964 callback api events and 32964 activity events.
2020-05-06 09:54:13.421652: I tensorflow/core/profiler/rpc/client/save_profile.cc:168] Creating directory: logs\fit_MNIST\20200506-095403\train\plugins\profile\2020_05_06_07_54_12
2020-05-06 09:54:13.918583: I tensorflow/core/profiler/rpc/client/save_profile.cc:174] Dumped gzipped tool data for trace.json.gz to logs\fit_MNIST\20200506-095403\train\plugins\profile\2020_05_06_07_54_12\DESKTOP-AEJL5TN.trace.json.gz
2020-05-06 09:54:14.366916: I tensorflow/core/profiler/utils/event_span.cc:288] Generation of step-events took 0.508 ms

2020-05-06 09:54:14.414129: I tensorflow/python/profiler/internal/profiler_wrapper.cc:110] Creating directory: logs\fit_MNIST\20200506-095403\train\plugins\profile\2020_05_06_07_54_12Dumped tool data for overview_page.pb to logs\fit_MNIST\20200506-095403\train\plugins\profile\2020_05_06_07_54_12\DESKTOP-AEJL5TN.overview_page.pb
Dumped tool data for input_pipeline.pb to logs\fit_MNIST\20200506-095403\train\plugins\profile\2020_05_06_07_54_12\DESKTOP-AEJL5TN.input_pipeline.pb
Dumped tool data for tensorflow_stats.pb to logs\fit_MNIST\20200506-095403\train\plugins\profile\2020_05_06_07_54_12\DESKTOP-AEJL5TN.tensorflow_stats.pb
Dumped tool data for kernel_stats.pb to logs\fit_MNIST\20200506-095403\train\plugins\profile\2020_05_06_07_54_12\DESKTOP-AEJL5TN.kernel_stats.pb

938/938 [==============================] - 15s 16ms/step - loss: 0.2555 - accuracy: 0.9325 - val_loss: 0.1904 - val_accuracy: 0.9584
Epoch 2/5
938/938 [==============================] - 12s 13ms/step - loss: 0.1268 - accuracy: 0.9696 - val_loss: 0.2004 - val_accuracy: 0.9634
Epoch 3/5
938/938 [==============================] - 12s 13ms/step - loss: 0.0966 - accuracy: 0.9776 - val_loss: 0.1817 - val_accuracy: 0.9666
Epoch 4/5
938/938 [==============================] - 12s 12ms/step - loss: 0.0850 - accuracy: 0.9813 - val_loss: 0.1250 - val_accuracy: 0.9748
Epoch 5/5
938/938 [==============================] - 12s 13ms/step - loss: 0.0710 - accuracy: 0.9848 - val_loss: 0.1980 - val_accuracy: 0.9714
313/313 - 1s - loss: 0.1980 - accuracy: 0.9714
0.9714000225067139
...