Low validation accuracy when training alexnet_v2 (slim) from scratch on ImageNet

I tried to train the alexnet_v2 model from scratch on the ImageNet dataset in distributed mode (Horovod + TensorFlow). In alexnet_v2 all fully connected layers are replaced with conv2d layers. I ran into some strange issues. At first the loss of the original alexnet_v2 would hardly decrease, so I added a batch-norm layer after every conv2d layer. After 100 epochs the training accuracy is almost 0.6, while the validation accuracy is only 0.35, and the validation loss is much larger than the training loss. Here is my alexnet_v2:

import tensorflow as tf
import tensorflow.contrib.slim as slim

trunc_normal = lambda stddev: tf.truncated_normal_initializer(0.0, stddev)

def alexnet_v2_arg_scope(weight_decay=0.02, is_training=True):
  batch_norm_params = {
      'is_training': is_training,
      'decay': 0.9,
      'epsilon': 0.0001,
      'updates_collections': tf.GraphKeys.UPDATE_OPS,
  }
  with slim.arg_scope([slim.conv2d, slim.fully_connected],
                  activation_fn=tf.nn.relu,
                  biases_initializer=tf.constant_initializer(0.1),
                  weights_regularizer=slim.l2_regularizer(weight_decay),
                  normalizer_fn=slim.batch_norm,
                  normalizer_params=batch_norm_params
                  ):
    with slim.arg_scope([slim.conv2d], padding='SAME'):
      with slim.arg_scope([slim.max_pool2d], padding='VALID') as arg_sc:
        return arg_sc

def alexnet_v2(inputs,
         num_classes=1000,
         is_training=True,
         dropout_keep_prob=0.5,
         spatial_squeeze=False,
         scope='alexnet_v2',
         global_pool=False):
  with tf.variable_scope(scope, 'alexnet_v2', [inputs], reuse=tf.AUTO_REUSE) as sc:
    end_points_collection = sc.original_name_scope + '_end_points'
    # Collect outputs for conv2d, fully_connected and max_pool2d.
    with slim.arg_scope([slim.conv2d, 
                         slim.fully_connected,slim.max_pool2d],
                         outputs_collections=[end_points_collection]                    
                       ):
      net = slim.conv2d(inputs, 64, [11, 11], 4, padding='VALID', scope='conv1')
      net = slim.max_pool2d(net, [3, 3], 2, scope='pool1')
      net = slim.conv2d(net, 192, [5, 5], scope='conv2')
      net = slim.max_pool2d(net, [3, 3], 2, scope='pool2')
      net = slim.conv2d(net, 384, [3, 3], scope='conv3')
      net = slim.conv2d(net, 384, [3, 3], scope='conv4')
      net = slim.conv2d(net, 256, [3, 3], scope='conv5')
      net = slim.max_pool2d(net, [3, 3], 2, scope='pool5')
      # Use conv2d instead of fully_connected layers.
      with slim.arg_scope([slim.conv2d],
                      weights_initializer=trunc_normal(0.005),
                      biases_initializer=tf.constant_initializer(0.1),
                      normalizer_fn=None         
                          ):
        net = slim.conv2d(net, 4096, [5, 5], padding='VALID',
                      scope='fc6')
        net = slim.dropout(net, dropout_keep_prob, is_training=is_training,scope='dropout6')
        net = slim.conv2d(net, 4096, [1, 1], scope='fc7')
        # Convert end_points_collection into a end_point dict.
        end_points = slim.utils.convert_collection_to_dict(
          end_points_collection)
        if global_pool:
          net = tf.reduce_mean(net, [1, 2], keep_dims=True,name='global_pool')
        end_points['global_pool'] = net
        if num_classes:
          net = slim.dropout(net,dropout_keep_prob,is_training=is_training,scope='dropout7')
          net = slim.conv2d(net, num_classes, [1, 1],
                            activation_fn=None,
                            normalizer_fn=None,
                            biases_initializer=tf.zeros_initializer(),
                            scope='fc8')
          if spatial_squeeze:
            net = tf.squeeze(net, [1, 2], name='fc8/squeezed')
          end_points[sc.name + '/fc8'] = net
  return net, end_points
alexnet_v2.default_image_size = 224
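
As a sanity check on the shapes (a minimal standalone sketch, not part of the training script; the batch size and placeholder are only illustrative): with a 224x224 input, pool5 comes out 5x5, the 5x5 VALID conv in fc6 reduces it to 1x1, so fc8 is [batch, 1, 1, num_classes] and spatial_squeeze=True yields [batch, num_classes].

# Build the model once on a dummy placeholder just to inspect the fc8 output shape.
images = tf.placeholder(tf.float32, [32, 224, 224, 3])
with slim.arg_scope(alexnet_v2_arg_scope(weight_decay=0.02, is_training=False)):
  logits, _ = alexnet_v2(images, num_classes=1000,
                         is_training=False, spatial_squeeze=True)
print(logits.shape)  # expected: (32, 1000)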

and here is train.py:

def train_data_generator():
  # (tf.data pipeline that builds `ds` is omitted)
  return ds

def validation_data_generator():
  # (tf.data pipeline that builds `ds` is omitted)
  return ds

def main(argv=None):
  num_classes = 1001
  opt_gpu = opt(lr)  # base optimizer and learning rate are defined elsewhere (omitted)
  opt = hvd.DistributedOptimizer(opt_gpu)

  train_dataset = train_data_generator()
  validation_dataset = validation_data_generator()
  iterator = tf.data.Iterator.from_structure(
      output_types=train_dataset.output_types,
      output_shapes=train_dataset.output_shapes)
  train_init_op = iterator.make_initializer(train_dataset)
  validation_init_op = iterator.make_initializer(validation_dataset)

  labels, images, f = iterator.get_next()
  images = tf.reshape(images, shape=[batch_size, height, width, 3])
  labels = tf.reshape(labels, [batch_size])

  is_training = tf.placeholder(tf.bool)
  dropout_keep_prob = tf.placeholder(tf.float32)
  weight_decay = tf.placeholder(tf.float32)

  with slim.arg_scope(alexnet.alexnet_v2_arg_scope(
       weight_decay=weight_decay,is_training=is_training)):
    pred, _ = alexnet.alexnet_v2(images, num_classes, spatial_squeeze=True, is_training=is_training, dropout_keep_prob=dropout_keep_prob) 

  cross_entropy = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pred, labels=labels, name='cross-entropy'))
  l2_loss = tf.add_n(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
  loss = cross_entropy + l2_loss
  pred_soft = tf.nn.softmax(pred)
  top_1 = tf.reduce_mean(tf.cast(tf.nn.in_top_k(pred_soft, labels, 1),dtype=tf.float32),name='top_1')
  top_5 = tf.reduce_mean(tf.cast(tf.nn.in_top_k(pred_soft, labels, 5),dtype=tf.float32),name='top_5')

  update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
  with tf.control_dependencies([tf.group(*update_ops)]):
    train_step = opt.minimize(loss, global_step=global_steps)
  with tf.control_dependencies([train_step,loss,top_1,top_5]):
    train_op = tf.no_op(name='train_op')
  with tf.control_dependencies([loss,top_1,top_5]):
    eval_op = tf.no_op(name='eval_op')

  with tf.train.MonitoredTrainingSession(checkpoint_dir = checkpoint_dir,
                                     config = config,
                                     hooks = hooks 
                                    ) as mon_sess:   
    global_step = 0
    mon_sess._coordinated_creator.tf_sess.run(train_init_op)
    while global_step < max_steps:
      global_step, _ = mon_sess.run([global_steps, train_op],feed_dict={weight_decay:0.02,is_training:True,dropout_keep_prob:0.5})

    mon_sess._coordinated_creator.tf_sess.run(validation_init_op)
    t1,t5,el = [],[],[]
    for i in range(eval_steps):
      _, l, top1,top5 = mon_sess.run([eval_op, loss, top_1, top_5],feed_dict={weight_decay:0.02,is_training:True,dropout_keep_prob:0.5})
      t1.append(top1)
      t5.append(top5)
      el.append(l)
    import numpy
    ac1 = numpy.mean(t1)
    ac5 = numpy.mean(t5)
    evalloss = numpy.mean(el)
    print('validation done, top1 accuracy: %f, top5 accuracy: %f, validation loss: %f' % (ac1, ac5, evalloss))
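
For completeness: hooks, config and checkpoint_dir are not shown above; they are just the standard Horovod (TF1) boilerplate, roughly like the sketch below (names and values here are illustrative, not a verbatim copy of my script).

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()

# Pin each worker process to a single GPU.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = str(hvd.local_rank())

max_steps = 100000  # illustrative value; set from the real training schedule

# Broadcast the initial variables from rank 0 so all workers start identically,
# and stop the session after max_steps.
hooks = [
    hvd.BroadcastGlobalVariablesHook(0),
    tf.train.StopAtStepHook(last_step=max_steps),
]

# Only rank 0 writes checkpoints.
checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None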

I think something is wrong with one of the parameters, but I cannot find it. Any advice is appreciated.

...