Модель переподготовки с одним дополнительным классом - PullRequest
0 голосов
/ 09 марта 2020

Я пытаюсь переобучить модель (Pointnet ++), добавив один дополнительный класс (т.е. моя модель была обучена с 6 классами, но теперь я добавляю еще один класс, так что теперь общее количество классов равно 7). Я посмотрел на похожие вопросы через inte rnet, но я не могу понять, в чем проблема в моем коде. Может кто-нибудь, пожалуйста, направьте меня .. Ниже приведен код для справки

Определение модели в файле с именем MODEL:

def placeholder_inputs(batch_size, num_point):
   pointclouds_pl = tf.placeholder(tf.float32, shape=(batch_size, num_point, 3))
   labels_pl = tf.placeholder(tf.int32, shape=(batch_size, num_point))
   smpws_pl = tf.placeholder(tf.float32, shape=(batch_size, num_point))
   return pointclouds_pl, labels_pl, smpws_pl


def get_model(point_cloud, is_training, num_class, bn_decay=None):
""" Semantic segmentation PointNet, input is BxNx3, output Bxnum_class """
   batch_size = point_cloud.get_shape()[0].value
   num_point = point_cloud.get_shape()[1].value
   end_points = {}
   l0_xyz = point_cloud
   l0_points = None
   end_points['l0_xyz'] = l0_xyz

# Layer 1
   l1_xyz, l1_points, l1_indices = pointnet_sa_module(l0_xyz, l0_points, npoint=1024, radius=2, nsample=32, mlp=[32,32,64], mlp2=None, group_all=False, is_training=is_training, bn_decay=bn_decay, scope='layer1')
   l2_xyz, l2_points, l2_indices = pointnet_sa_module(l1_xyz, l1_points, npoint=256, radius=4, nsample=32, mlp=[64,64,128], mlp2=None, group_all=False, is_training=is_training, bn_decay=bn_decay, scope='layer2')
   l3_xyz, l3_points, l3_indices = pointnet_sa_module(l2_xyz, l2_points, npoint=64, radius=8, nsample=32, mlp=[128,128,256], mlp2=None, group_all=False, is_training=is_training, bn_decay=bn_decay, scope='layer3')
   l4_xyz, l4_points, l4_indices = pointnet_sa_module(l3_xyz, l3_points, npoint=16, radius=16, nsample=32, mlp=[256,256,512], mlp2=None, group_all=False, is_training=is_training, bn_decay=bn_decay, scope='layer4')

# Feature Propagation layers
   l3_points = pointnet_fp_module(l3_xyz, l4_xyz, l3_points, l4_points, [256,256], is_training, bn_decay, scope='fa_layer1')
   l2_points = pointnet_fp_module(l2_xyz, l3_xyz, l2_points, l3_points, [256,256], is_training, bn_decay, scope='fa_layer2')
   l1_points = pointnet_fp_module(l1_xyz, l2_xyz, l1_points, l2_points, [256,128], is_training, bn_decay, scope='fa_layer3')
   l0_points = pointnet_fp_module(l0_xyz, l1_xyz, l0_points, l1_points, [128,128,128], is_training, bn_decay, scope='fa_layer4')

# FC layers
   net = tf_util.conv1d(l0_points, 128, 1, padding='VALID', bn=True, is_training=is_training, scope='fc1', bn_decay=bn_decay)
   end_points['feats'] = net 
   net = tf_util.dropout(net, keep_prob=0.5, is_training=is_training, scope='dp1')
   net = tf_util.conv1d(net, num_class, 1, padding='VALID', activation_fn=None, scope='fc2')

   return net, end_points


def get_loss(pred, label, smpw):
""" pred: BxNxC,
    label: BxN, 
   smpw: BxN """
   classify_loss = tf.losses.sparse_softmax_cross_entropy(labels=label, logits=pred, weights=smpw)
   tf.summary.scalar('classify loss', classify_loss)
   tf.add_to_collection('losses', classify_loss)
   return classify_loss

Код, где я пытаюсь Чтобы восстановить и переобучить модель:

def train():
with tf.Graph().as_default():
    with tf.device('/gpu:'+str(GPU_INDEX)):
        pointclouds_pl, labels_pl, smpws_pl = MODEL.placeholder_inputs(BATCH_SIZE, NUM_POINT)
        is_training_pl = tf.placeholder(tf.bool, shape=())
        print (is_training_pl)

        batch = tf.Variable(0)
        bn_decay = get_bn_decay(batch)
        tf.summary.scalar('bn_decay', bn_decay)
        print ("--- Get model and loss")
        # Get model and loss 
        pred, end_points = MODEL.get_model(pointclouds_pl, is_training_pl, NUM_CLASSES, bn_decay=bn_decay)
        loss = MODEL.get_loss(pred, labels_pl, smpws_pl)
        tf.summary.scalar('loss', loss)

        correct = tf.equal(tf.argmax(pred, 2), tf.to_int64(labels_pl))
        accuracy = tf.reduce_sum(tf.cast(correct, tf.float32)) / float(BATCH_SIZE*NUM_POINT)
        tf.summary.scalar('accuracy', accuracy)

        print("--- Get training operator")
        # Get training operator
        learning_rate = get_learning_rate(batch)
        tf.summary.scalar('learning_rate', learning_rate)
        if OPTIMIZER == 'momentum':
            optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=MOMENTUM)
        elif OPTIMIZER == 'adam':
            optimizer = tf.train.AdamOptimizer(learning_rate)
        train_op = optimizer.minimize(loss, global_step=batch)

        # Add ops to save and restore all the variables.
        saver = tf.train.Saver()

    # Create a session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True
    config.log_device_placement = False
    sess = tf.Session(config=config)

    # Add summary writers
    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(os.path.join(LOG_DIR, 'train'), sess.graph)
    test_writer = tf.summary.FileWriter(os.path.join(LOG_DIR, 'test'), sess.graph)

    #sess.run(init, {is_training_pl: True})            
    ###############################################################################################
    log_string("Model restoring.")
    saver.restore(sess, MODEL_PATH)
    log_string("Model restored.")  ###################################################################

    # Init variables
    init = tf.global_variables_initializer()
    sess.run(init)

    ops = {'pointclouds_pl': pointclouds_pl,
           'labels_pl': labels_pl,
           'smpws_pl': smpws_pl,
           'is_training_pl': is_training_pl,
           'pred': pred,
           'loss': loss,
           'train_op': train_op,
           'merged': merged,
           'step': batch,
           'end_points': end_points}

    best_acc = -1
    for epoch in range(MAX_EPOCH):
        log_string('**** EPOCH %03d ****' % (epoch))
        sys.stdout.flush()

        train_one_epoch(sess, ops, train_writer, TRAIN_DATASET)
        if epoch%5==0:
            acc = eval_one_epoch(sess, ops, test_writer, TEST_DATASET)
            acc = eval_whole_scene_one_epoch(sess, ops, test_writer, TEST_DATASET_WHOLE_SCENE)
        if acc > best_acc:
            best_acc = acc
            save_path = saver.save(sess, os.path.join(LOG_DIR, "best_model_retrained_epoch_%03d"%(epoch)))
            log_string("Model saved in file: %s" % save_path)

        # Save the variables to disk.
        if epoch % 10 == 0:
            save_path = saver.save(sess, os.path.join(LOG_DIR, "model_retrained"))
            log_string("Model saved in file: %s" % save_path)

Когда я запустил приведенный выше код, я получил ошибку ниже:

Caused by op 'save/Assign_143', defined at:
File "train_2_retrain_4.1_water.py", line 448, in <module>
   train()
File "train_2_retrain_4.1_water.py", line 144, in train
   saver = tf.train.Saver()
File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/tensorflow/python/training/saver.py", line 
1338, in __init__
   self.build()
File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/tensorflow/python/training/saver.py", line 
1347, in build 
   self._build(self._filename, build_save=True, build_restore=True)
File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/tensorflow/python/training/saver.py", line 
1384, in _build
   build_save=build_save, build_restore=build_restore)
File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/tensorflow/python/training/saver.py", line 
835, in _build_internal
   restore_sequentially, reshape)
File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/tensorflow/python/training/saver.py", line 
494, in _AddRestoreOps
   assign_ops.append(saveable.restore(saveable_tensors, shapes))
File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/tensorflow/python/training/saver.py", line 
185, in restore
   self.op.get_shape().is_fully_defined())
File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/state_ops.py", line 
283, in assign
   validate_shape=validate_shape)
File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/gen_state_ops.py", line 60, in assign
   use_locking=use_locking, name=name)
File "/home/ubuntu/anaconda3/lib/python3.6/site- 
packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
op_def=op_def)
File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 3392, in create_op
op_def=op_def)
File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1718, in __init__
self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

InvalidArgumentError (see above for traceback): Assign requires shapes of both tensors to match. lhs shape= [7] rhs shape= [6]
 [[Node: save/Assign_143 = Assign[T=DT_FLOAT, _class=["loc:@fc2/biases"], use_locking=true, validate_shape=true, _device="/job:localhost/replica:0/task:0/device:CPU:0"](fc2/biases, save/RestoreV2:143)]]
 [[Node: save/RestoreV2/_267 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device_incarnation=1, tensor_name="edge_508_save/RestoreV2", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:GPU:0"]()]]

Любая помощь очень ценится.

Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...