TensorFlow 2 metrics give wrong results with two GPUs
1 vote / June 16, 2020

I took this code snippet from the TensorFlow documentation on distributed training with a custom loop, https://www.tensorflow.org/tutorials/distribute/custom_training, adapted it to use tf.keras.metrics.AUC, and ran it on 2 GPUs (2 Nvidia V100s on a DGX machine).

# Import TensorFlow
import tensorflow as tf

# Helper libraries
import numpy as np


print(tf.__version__)


fashion_mnist = tf.keras.datasets.fashion_mnist
(train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()

# Adding a dimension to the array -> new shape == (28, 28, 1)
# We are doing this because the first layer in our model is a convolutional
# layer and it requires a 4D input (batch_size, height, width, channels).
# batch_size dimension will be added later on.
train_images = train_images[..., None]
test_images = test_images[..., None]

# One hot
train_labels = tf.keras.utils.to_categorical(train_labels, 10)
test_labels = tf.keras.utils.to_categorical(test_labels, 10)

# Getting the images in [0, 1] range.
train_images = train_images / np.float32(255)
test_images = test_images / np.float32(255)

# If the list of devices is not specified in the
# `tf.distribute.MirroredStrategy` constructor, it will be auto-detected.
GPUS = [0, 1]
devices = ["/gpu:" + str(gpu_id) for gpu_id in GPUS]
strategy = tf.distribute.MirroredStrategy(devices=devices)

print('Number of devices: {}'.format(strategy.num_replicas_in_sync))


BUFFER_SIZE = len(train_images)

BATCH_SIZE_PER_REPLICA = 64
GLOBAL_BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync
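# The datasets below are batched with the GLOBAL batch size; MirroredStrategy
# then splits each global batch across the replicas, so every GPU processes
# BATCH_SIZE_PER_REPLICA examples per step.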

EPOCHS = 10


train_dataset = tf.data.Dataset.from_tensor_slices((train_images, train_labels)).shuffle(BUFFER_SIZE).batch(GLOBAL_BATCH_SIZE)
test_dataset = tf.data.Dataset.from_tensor_slices((test_images, test_labels)).batch(GLOBAL_BATCH_SIZE)

train_dist_dataset = strategy.experimental_distribute_dataset(train_dataset)
test_dist_dataset = strategy.experimental_distribute_dataset(test_dataset)


def create_model():
  model = tf.keras.Sequential([
      tf.keras.layers.Conv2D(32, 3, activation='relu'),
      tf.keras.layers.MaxPooling2D(),
      tf.keras.layers.Conv2D(64, 3, activation='relu'),
      tf.keras.layers.MaxPooling2D(),
      tf.keras.layers.Flatten(),
      tf.keras.layers.Dense(64, activation='relu'),
      tf.keras.layers.Dense(10, activation='softmax')
    ])

  return model


with strategy.scope():
  # Set reduction to `none` so we can do the reduction afterwards and divide by
  # global batch size.
  loss_object = tf.keras.losses.CategoricalCrossentropy(
      # The model's last layer applies softmax, so it outputs probabilities,
      # not logits.
      from_logits=False,
      reduction=tf.keras.losses.Reduction.NONE)
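
  # tf.nn.compute_average_loss divides the summed per-example loss by the
  # GLOBAL batch size. Each replica only sees GLOBAL_BATCH_SIZE /
  # num_replicas_in_sync examples per step, so the per-replica results must
  # later be SUM-reduced across replicas (see distributed_train_step) to
  # recover the true mean over the global batch.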
  def compute_loss(labels, predictions):
    per_example_loss = loss_object(labels, predictions)
    return tf.nn.compute_average_loss(per_example_loss, global_batch_size=GLOBAL_BATCH_SIZE)


with strategy.scope():
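  # Metrics are created inside the scope so their state is distributed across
  # replicas and can be updated from within each replica's train/test step.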
  test_loss = tf.keras.metrics.Mean(name='test_loss')

  train_accuracy = tf.keras.metrics.CategoricalAccuracy(
      name='train_accuracy')
  test_accuracy = tf.keras.metrics.CategoricalAccuracy(
      name='test_accuracy')
  train_auc = tf.keras.metrics.AUC(name='train_auc')
  test_auc = tf.keras.metrics.AUC(name='test_auc')


# model, optimizer, and checkpoint must be created under `strategy.scope`.
with strategy.scope():
  model = create_model()

  optimizer = tf.keras.optimizers.Adam()


def train_step(inputs):
  images, labels = inputs

  with tf.GradientTape() as tape:
    predictions = model(images, training=True)
    loss = compute_loss(labels, predictions)

  gradients = tape.gradient(loss, model.trainable_variables)
  optimizer.apply_gradients(zip(gradients, model.trainable_variables))

  # The metric objects are updated here, inside the replica function, so
  # update_state() runs once per replica per step.
  train_accuracy(labels, predictions)
  train_auc(labels, predictions)
  return loss

def test_step(inputs):
  images, labels = inputs

  predictions = model(images, training=False)
  t_loss = loss_object(labels, predictions)

  test_loss.update_state(t_loss)
  test_accuracy(labels, predictions)
  test_auc(labels, predictions)


# `run` replicates the provided computation and runs it
# with the distributed input; `ReduceOp.SUM` then combines the scaled
# per-replica losses into the mean loss over the global batch.
@tf.function
def distributed_train_step(dataset_inputs):
  per_replica_losses = strategy.run(train_step, args=(dataset_inputs,))
  return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses,
                         axis=None)

@tf.function
def distributed_test_step(dataset_inputs):
  return strategy.run(test_step, args=(dataset_inputs,))


for epoch in range(EPOCHS):
  # TRAIN LOOP
  total_loss = 0.0
  num_batches = 0
  for x in train_dist_dataset:
    total_loss += distributed_train_step(x)
    num_batches += 1
  train_loss = total_loss / num_batches

  # TEST LOOP
  for x in test_dist_dataset:
    distributed_test_step(x)

  template = ("Epoch {}, Loss: {}, Accuracy: {}, AUC: {},"
              "Test Loss: {}, Test Accuracy: {}, Test AUC: {}")
  print(template.format(epoch+1,
                         train_loss, train_accuracy.result()*100, train_auc.result()*100,
                         test_loss.result(), test_accuracy.result()*100, test_auc.result()*100))

  test_loss.reset_states()
  train_accuracy.reset_states()
  test_accuracy.reset_states()
  train_auc.reset_states()
  test_auc.reset_states()

The problem is that the AUC value is definitely wrong, because it exceeds its valid range (it should be between 0 and 100 here, since it is multiplied by 100), and these are the results I got from a single run of the code above:

Epoch 1, Loss: 1.8061423301696777, Accuracy: 66.00833892822266, AUC: 321.8688659667969,Test Loss: 1.742477536201477, Test Accuracy: 72.0999984741211, Test AUC: 331.33709716796875
Epoch 2, Loss: 1.7129968404769897, Accuracy: 74.9816665649414, AUC: 337.37017822265625,Test Loss: 1.7084736824035645, Test Accuracy: 75.52999877929688, Test AUC: 337.1878967285156
Epoch 3, Loss: 1.643971562385559, Accuracy: 81.83333587646484, AUC: 355.96209716796875,Test Loss: 1.6072628498077393, Test Accuracy: 85.3499984741211, Test AUC: 370.603759765625
Epoch 4, Loss: 1.5887378454208374, Accuracy: 87.27833557128906, AUC: 373.6204528808594,Test Loss: 1.5906082391738892, Test Accuracy: 87.13999938964844, Test AUC: 371.9998474121094
Epoch 5, Loss: 1.581775426864624, Accuracy: 88.0, AUC: 373.9468994140625,Test Loss: 1.5964380502700806, Test Accuracy: 86.68000030517578, Test AUC: 371.0227355957031
Epoch 6, Loss: 1.5764907598495483, Accuracy: 88.49166870117188, AUC: 375.2404479980469,Test Loss: 1.5832056999206543, Test Accuracy: 87.94000244140625, Test AUC: 373.41998291015625
Epoch 7, Loss: 1.5698528289794922, Accuracy: 89.19166564941406, AUC: 376.473876953125,Test Loss: 1.5770654678344727, Test Accuracy: 88.58000183105469, Test AUC: 375.5516662597656
Epoch 8, Loss: 1.564456820487976, Accuracy: 89.71833801269531, AUC: 377.8564758300781,Test Loss: 1.5792100429534912, Test Accuracy: 88.27000427246094, Test AUC: 373.1791687011719
Epoch 9, Loss: 1.5612279176712036, Accuracy: 90.02000427246094, AUC: 377.9949645996094,Test Loss: 1.5729509592056274, Test Accuracy: 88.9800033569336, Test AUC: 375.5257263183594
Epoch 10, Loss: 1.5562015771865845, Accuracy: 90.54000091552734, AUC: 378.9789123535156,Test Loss: 1.56815767288208, Test Accuracy: 89.3499984741211, Test AUC: 375.8636474609375

Accuracy looks fine, but it seems to be the only metric that behaves well. I tried other metrics as well, and they are not computed correctly either. The problems seem to appear only when more than one GPU is used: when I run this code with a single GPU, it produces correct results.
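
One way to isolate the problem (a sketch under assumptions, not a verified fix) is to keep an extra metric outside strategy.scope(), return the per-replica labels and predictions from the test step, gather them with strategy.experimental_local_results, and update the metric exactly once per global batch. The names eval_auc, gathering_test_step and distributed_gathering_test_step below are illustrative, not part of the original code:

# Hypothetical diagnostic: a metric created OUTSIDE strategy.scope(), so its
# state lives in ordinary, non-distributed variables.
eval_auc = tf.keras.metrics.AUC(name='eval_auc')

def gathering_test_step(inputs):
  images, labels = inputs
  predictions = model(images, training=False)
  return labels, predictions

@tf.function
def distributed_gathering_test_step(dataset_inputs):
  labels, preds = strategy.run(gathering_test_step, args=(dataset_inputs,))
  # Concatenate the per-replica shards back into single tensors.
  labels = tf.concat(strategy.experimental_local_results(labels), axis=0)
  preds = tf.concat(strategy.experimental_local_results(preds), axis=0)
  # The metric now sees every example exactly once, on a single copy of its
  # state.
  eval_auc.update_state(labels, preds)

If the AUC computed this way stays within its range while the in-scope metrics do not, the per-replica update_state() calls on distributed metric state are the likely culprit.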

...