Я пытался обучить классификатор изображений на основе MobileNetV2, но функция потерь не сходится; я не уверен, правильно ли я использую TensorFlow - PullRequest
0 голосов
/ 02 ноября 2019

Я пытался использовать модель MobileNetV2 в качестве классификатора изображений. Используются 10 категорий, случайным образом выбранных из классификационного набора данных ImageNet. Но функция потерь не сходится во время обучения: она всё время колеблется около большого значения. Я не знаю, правильно ли я использую API TensorFlow, потому что я не очень хорошо разбираюсь в режиме немедленного выполнения (eager execution) в новой версии. Может кто-нибудь помочь мне? Большое спасибо.

Я использую версию 1.14. Я также попытался запустить тот же код на версии 2.0 и получил аналогичные результаты. Модель, которую я использую: https://github.com/qxde01/keras-alchemy/blob/master/models/mobilenet_v2.py

Ниже приведены мой код обучения и код обработки набора данных:

# train code
from __future__ import absolute_import, division, print_function, unicode_literals
import argparse
import logging
import numpy as np
import os
import random
import tensorflow as tf

from config import cfg
from mobile_net_v2.dataset import ImageNet_Dataset
from mobile_net_v2 import mobilenet_v2

# Module-level logger for the training script; INFO level so that the
# per-step loss messages emitted from main() are not filtered out.
logger = logging.getLogger("train")
logger.setLevel(logging.INFO)

# Switch TensorFlow to eager execution: main() reads tensor values directly
# with .numpy(). This runs at import time, before any tensor is created.
tf.compat.v1.enable_eager_execution()
# Reset Keras' global state before the model is built.
tf.keras.backend.clear_session()


def set_random_seed(seed):
    """Seed every random number source this script relies on.

    Seeds Python's ``random`` module, NumPy's global generator and
    TensorFlow's global seed with ``seed``, and exports it as the
    ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed shared by all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (random.seed, np.random.seed, tf.compat.v1.set_random_seed)
    for apply_seed in seeders:
        apply_seed(seed)

def _merge_gradients(accumulated, new):
    """Add one batch's gradient to a running sum, treating ``None`` as "no gradient"."""
    if new is None:
        return accumulated
    if accumulated is None:
        return new
    return accumulated + new


def main():
    """Train the MobileNetV2 classifier described by the configuration file.

    Reads the module-level ``args`` namespace created in the ``__main__``
    block (``cfg``, ``ckp_dir``, ``num_of_ckp_to_keep``, ``tsb_dir``,
    ``ckp_save_interval``), builds the input pipeline, restores the latest
    checkpoint if one exists, and runs 10 epochs of training while logging
    the loss to the logger and to TensorBoard.

    Raises:
        RuntimeError: If the dataset index contains no images.
    """
    cfg.merge_from_file(args.cfg)

    train_dataset = ImageNet_Dataset(**cfg.DATASET.IMGNET)

    def get_train_data_by_cv2(index):
        # Executed through tf.py_function, so the image is decoded by the
        # Python/OpenCV code of the dataset object.
        img, cls_label, path = train_dataset[index]
        return img, cls_label, path

    index_sum = len(train_dataset.imgs_path_list)
    if index_sum == 0:
        raise RuntimeError('Dataset information read error')

    # Shuffle the cheap integer indices with a full-size buffer *before*
    # decoding and batching: every epoch then sees a fresh sample order and a
    # fresh batch composition. Shuffling after batch() only reorders whole
    # batches and keeps decoded image batches in the shuffle buffer.
    dataset = tf.data.Dataset.from_tensor_slices(list(range(index_sum)))
    dataset = dataset.shuffle(index_sum)
    dataset = dataset.map(
        lambda index: tf.py_function(
            get_train_data_by_cv2, [index], [tf.float32, tf.int32, tf.string]),
        num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.batch(32).repeat(10)

    MNV2_Model = mobilenet_v2.MobileNetV2(
        include_top=True, input_shape=(127, 127, 3), alpha=0.5,
        classes=cfg.DATASET.IMGNET.MAX_NUM_OF_CLASS)

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.005)
    global_step = tf.Variable(0, dtype=tf.int64)

    checkpoint = tf.train.Checkpoint(
        Model=MNV2_Model, Global_Step=global_step, Optimizer=optimizer)
    ckp_manager = tf.train.CheckpointManager(
        checkpoint, directory=args.ckp_dir, max_to_keep=args.num_of_ckp_to_keep)
    if ckp_manager.latest_checkpoint:
        print("Restored from {}".format(ckp_manager.latest_checkpoint))
        checkpoint.restore(ckp_manager.latest_checkpoint)
    else:
        print("Initializing from scratch.")

    # tensorboard
    summary_writer = tf.compat.v2.summary.create_file_writer(args.tsb_dir)

    # Number of *additional* batches whose gradients are summed before each
    # optimizer update; 0 means one update per batch (no accumulation).
    extra_accum_batches = 0
    batches_accumulated = 0
    grads_cache = []
    for batch_index, (img, label_cls, _path) in enumerate(dataset):
        # NOTE(review): the dataset returns raw 0-255 float32 pixels with no
        # scaling; confirm the model does not expect normalized inputs.
        with tf.GradientTape() as tape:
            # training=True puts BatchNormalization in training mode (batch
            # statistics + moving-average updates). Without it the loss
            # stays at ~ln(num_classes), as seen in the posted log.
            cls = MNV2_Model(img, training=True)
            label_cls = tf.reshape(label_cls, [-1])
            # Assumes the model's top layer already outputs softmax
            # probabilities (from_logits is left at its default) — TODO confirm.
            cls_loss = tf.reduce_mean(
                tf.keras.losses.sparse_categorical_crossentropy(
                    y_true=label_cls, y_pred=cls))

        gradients = tape.gradient(cls_loss, MNV2_Model.trainable_variables)

        if batches_accumulated == 0:
            # First batch of an accumulation window: report the loss.
            logger.info("train step = {}, cls_loss = {:.5f}".format(global_step.numpy(), cls_loss.numpy()))
            with summary_writer.as_default():
                tf.compat.v2.summary.scalar("cls_loss", cls_loss, step=global_step)
                tf.compat.v2.summary.scalar("LR", optimizer.learning_rate, step=global_step)
            grads_cache = list(gradients)
        else:
            grads_cache = [_merge_gradients(total, grad)
                           for total, grad in zip(grads_cache, gradients)]

        if batches_accumulated == extra_accum_batches:
            if extra_accum_batches:
                # Average the summed gradients over the whole window.
                window = extra_accum_batches + 1.
                grads_cache = [None if grad is None else grad / window
                               for grad in grads_cache]
            optimizer.apply_gradients(zip(grads_cache, MNV2_Model.trainable_variables))
            batches_accumulated = 0
        else:
            batches_accumulated += 1

        # Update the variable in place: rebinding the Python name
        # (global_step = global_step + 1) would leave the checkpointed
        # variable stuck at 0.
        global_step.assign_add(1)
        if batch_index % args.ckp_save_interval == 0:
            ckp_manager.save()

    # Persist the final state so the steps after the last periodic save are
    # not lost.
    ckp_manager.save()



def _positive_int(text):
    """argparse type: parse ``text`` as an integer that must be > 0."""
    value = int(text)
    if value <= 0:
        raise argparse.ArgumentTypeError(
            "expected a positive integer, got {!r}".format(text))
    return value


def build_arg_parser():
    """Build the command-line parser of the training script.

    The numeric options are declared with an explicit integer type so that
    values given on the command line are not left as strings (the checkpoint
    interval is used in a modulo expression by ``main``).

    Returns:
        argparse.ArgumentParser: The configured parser.
    """
    parser = argparse.ArgumentParser()
    req_grp = parser.add_argument_group('required')
    req_grp.add_argument('--ckp_dir', default="./backbone_cp", help='directory to save checkpoints to')
    req_grp.add_argument('--ckp_save_interval', type=_positive_int, default=2000,
                         help='interval between model checkpoints')
    req_grp.add_argument('--num_of_ckp_to_keep', type=_positive_int, default=5,
                         help='the number of checkpoints to keep')
    req_grp.add_argument('--tsb_dir', default="./backbone_tsb", help='directory to save tensorboard info to')
    req_grp.add_argument('--cfg', type=str, default='../config_yaml/mobilenetv2_lt_config.yaml',
                         help='configuration of tracking')
    parser.add_argument('--seed', type=int, default=123456, help='random seed')
    return parser


if __name__ == "__main__":
    # `args` is intentionally module-level: main() reads it as a global.
    args = build_arg_parser().parse_args()

    logger.info("train start")
    # Create the output directories on first use.
    for kind, directory in (("checkpoint", args.ckp_dir), ("tensorboard", args.tsb_dir)):
        if not os.path.exists(directory):
            abs_dir = os.path.abspath(directory)
            logger.warning(kind + " directory does not exist , now we will crate this directory : " + abs_dir)
            os.makedirs(abs_dir, exist_ok=True)
    # Explicit exception instead of assert, which is stripped under -O.
    if not os.path.exists(args.cfg):
        raise FileNotFoundError(
            'The configuration file does not exist, and the path may be filled incorrectly.')

    set_random_seed(args.seed)
    main()

# ILSVRC2012_CLASS dataset code
# -*- coding: UTF-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals

import os
import cv2
import numpy as np
import tensorflow as tf
import random
import scipy.io
from config import cfg

class ImageNet_Dataset:
    """Index of an ILSVRC2012-style image folder with integer class labels.

    The constructor loads the synset table from a MATLAB meta file, builds
    (or re-reads) ``anno.txt`` under ``ROOT`` listing ``<image path> <label>``
    pairs, and optionally restricts the index to a random subset of classes
    whose labels are renumbered ``0..N-1``.

    Attributes:
        WIND_To_ID_Dict: Maps a synset directory name (presumably the WordNet
            ID, e.g. ``n01440764``) to its 1-based ID from the meta file; after
            class selection it maps the chosen names to their new 0-based label.
        imgs_path_list: Image file paths.
        img_label_list: 0-based class label of each path (meta ID minus one,
            or the renumbered label after class selection).
    """

    def __init__(self, ROOT, WIND_ROOT, MAX_NUM_OF_CLASS=1000, ):
        """Build the image index.

        Args:
            ROOT: Directory containing one sub-directory per class (names
                starting with ``n``); annotation files are written here.
            WIND_ROOT: Path of the ``.mat`` file holding the ``synsets`` table.
            MAX_NUM_OF_CLASS: Number of classes to keep. Any value other than
                1000 triggers a random selection among labels ``0..999``.
        """
        self.WIND_To_ID_Dict = {}
        synsets = scipy.io.loadmat(WIND_ROOT)['synsets']
        for synset in synsets:
            # Plain str/int instead of NumPy scalars keeps later arithmetic,
            # hashing and formatting predictable.
            self.WIND_To_ID_Dict[str(synset[0][1][0])] = int(synset[0][0][0][0])  # WIND=ID
        self.imgs_path_list = []
        self.img_label_list = []
        anno_file = os.path.join(ROOT, 'anno.txt')
        if not os.path.exists(anno_file):
            print("Tag file does not exist, create")
            self._scan_class_dirs(ROOT)
            self._write_pairs(anno_file, zip(self.imgs_path_list, self.img_label_list))
        else:
            print("Tag file exists, read")
            self._read_annotations(anno_file)
        # Crop data set
        if MAX_NUM_OF_CLASS != 1000:
            self._select_classes(ROOT, MAX_NUM_OF_CLASS)

    @staticmethod
    def _write_pairs(file_path, pairs):
        """Write ``<key> <value>`` lines as UTF-8 (the encoding used when reading)."""
        with open(file_path, 'w', encoding='utf-8') as out:
            out.writelines("{} {}\n".format(key, value) for key, value in pairs)

    def _scan_class_dirs(self, ROOT):
        """Fill the path/label lists from the class sub-directories of ``ROOT``."""
        count = 0
        # Sorted listings make the index independent of the file system's
        # enumeration order.
        for file_name in sorted(os.listdir(ROOT)):
            path = os.path.join(ROOT, file_name)
            if os.path.isdir(path) and file_name.startswith("n"):
                img_names = sorted(os.listdir(path))
                self.imgs_path_list += [os.path.join(path, img_name) for img_name in img_names]
                # Labels are 0-based, meta IDs are 1-based.
                self.img_label_list += [self.WIND_To_ID_Dict[file_name] - 1] * len(img_names)
                count += 1
            print("finish:", path, "count", count)

    def _read_annotations(self, anno_file):
        """Load the path/label lists from an existing annotation file."""
        with open(anno_file, encoding='utf-8') as anno:
            for line in anno:
                line = line.rstrip("\n")
                if not line:
                    continue
                # Split on the last space only so paths containing spaces survive.
                img_path, label = line.rsplit(' ', 1)
                self.imgs_path_list.append(img_path)
                self.img_label_list.append(int(label))
        seen = set()
        for label in self.img_label_list:
            if label not in seen:
                seen.add(label)
                if label >= 1000:
                    print("Outlier:", label)
        print("A total of {} classes".format(len(seen)))

    def _select_classes(self, ROOT, MAX_NUM_OF_CLASS):
        """Keep a random subset of classes and renumber their labels from 0."""
        # Randomly select N classes (a set gives O(1) membership tests).
        chosen_labels = set(random.sample(range(1000), MAX_NUM_OF_CLASS))
        kept = [(path, label)
                for path, label in zip(self.imgs_path_list, self.img_label_list)
                if label in chosen_labels]

        # Reverse lookup: 1-based meta ID -> directory name (first one wins).
        id_to_wind = {}
        for wind, synset_id in self.WIND_To_ID_Dict.items():
            id_to_wind.setdefault(synset_id, wind)

        # Recode tags and WIND. New labels are assigned in first-seen order
        # through a dict, so images of one class need not be contiguous.
        new_label_of = {}
        choosed_WIND = {}
        self.imgs_path_list = []
        self.img_label_list = []
        for path, label in kept:
            if label not in new_label_of:
                new_label_of[label] = len(new_label_of)
                # Labels are (meta ID - 1), hence the +1 for the lookup;
                # comparing the raw label to the ID picks the wrong class.
                wind = id_to_wind.get(label + 1)
                if wind is not None:
                    choosed_WIND[wind] = new_label_of[label]
                    print("Selected class:", wind)
            self.imgs_path_list.append(path)
            self.img_label_list.append(new_label_of[label])
        self.WIND_To_ID_Dict = choosed_WIND
        print("Final selection of {} classes".format(len(self.WIND_To_ID_Dict)))
        # Save the new WIND and ID table
        self._write_pairs(os.path.join(ROOT, 'anno_select.txt'), self.WIND_To_ID_Dict.items())
        self._write_pairs(os.path.join(ROOT, 'new_anno.txt'),
                          zip(self.imgs_path_list, self.img_label_list))

    def __getitem__(self, index):
        """Return ``(image, label, path)`` for the sample at ``index``.

        The image is read with OpenCV, resized to 127x127 and converted to
        float32; pixel values are not rescaled.

        Raises:
            IOError: If OpenCV cannot read the image file.
        """
        path = self.imgs_path_list[index]
        image = cv2.imread(path)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise IOError("Failed to read image: {}".format(path))
        label = self.img_label_list[index]
        image_resize = cv2.resize(image, (127, 127)).astype(np.float32)
        return image_resize, label, path


if __name__ == "__main__":
    # Manual smoke test: build the index and display a few samples.
    dataset = ImageNet_Dataset(**cfg.DATASET.IMGNET)
    total = len(dataset.imgs_path_list)
    print("Total number of training pictures", total)
    check_list = [100, 1500, 2000, 1300, 188, 1503]
    for ind in check_list:
        if ind >= total:
            # The sample indices are hard-coded; skip those a small
            # dataset does not have.
            continue
        # __getitem__ returns (image, label, path).
        im, label, _ = dataset[ind]
        # The image is float32 in the 0-255 range; imshow expects 8-bit
        # data for that range, otherwise the picture renders as white.
        cv2.imshow("label {} ind {}".format(label, ind), im.astype(np.uint8))
    cv2.waitKey()

Ниже приведён вывод, напечатанный в процессе обучения:

2019-11-02 09:16:17.201821: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library nvcuda.dll
2019-11-02 09:16:17.289288: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1640] Found device 0 with properties: 
name: GeForce GTX 1050 Ti major: 6 minor: 1 memoryClockRate(GHz): 1.493
pciBusID: 0000:01:00.0
2019-11-02 09:16:17.289592: I tensorflow/stream_executor/platform/default/dlopen_checker_stub.cc:25] GPU libraries are statically linked, skip dlopen check.
2019-11-02 09:16:17.291786: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1763] Adding visible gpu devices: 0
2019-11-02 09:16:17.292160: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2
2019-11-02 09:16:17.295064: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1640] Found device 0 with properties: 
name: GeForce GTX 1050 Ti major: 6 minor: 1 memoryClockRate(GHz): 1.493
pciBusID: 0000:01:00.0
2019-11-02 09:16:17.295369: I tensorflow/stream_executor/platform/default/dlopen_checker_stub.cc:25] GPU libraries are statically linked, skip dlopen check.
2019-11-02 09:16:17.297488: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1763] Adding visible gpu devices: 0
2019-11-02 09:16:17.872697: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1181] Device interconnect StreamExecutor with strength 1 edge matrix:
2019-11-02 09:16:17.872919: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1187]      0 
2019-11-02 09:16:17.873052: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1200] 0:   N 
2019-11-02 09:16:17.875796: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1326] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 3001 MB memory) -> physical GPU (device: 0, name: GeForce GTX 1050 Ti, pci bus id: 0000:01:00.0, compute capability: 6.1)
W1102 09:16:18.032456 14192 deprecation.py:323] From C:\software\Anaconda3\envs\CV_env\lib\site-packages\tensorflow\python\data\util\random_seed.py:58: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Initializing from scratch.
2019-11-02 09:16:20.087193: W .\tensorflow/core/framework/model.h:213] Encountered a stop event that was not preceded by a start event.
W1102 09:16:20.077111 15760 backprop.py:820] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.int32
W1102 09:16:20.077111 16592 backprop.py:820] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.int32
W1102 09:16:20.077111  6140 backprop.py:820] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.int32
W1102 09:16:20.077111  1564 backprop.py:820] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.int32
W1102 09:16:20.092706 15760 backprop.py:820] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.int32
I1102 09:16:22.997265 14192 train_backbone.py:97] train step = 0, cls_loss = 2.30259
I1102 09:16:23.513478 14192 train_backbone.py:97] train step = 1, cls_loss = 2.30025
I1102 09:16:23.750537 14192 train_backbone.py:97] train step = 2, cls_loss = 2.29853
I1102 09:16:23.980576 14192 train_backbone.py:97] train step = 3, cls_loss = 2.24046
I1102 09:16:24.480434 14192 train_backbone.py:97] train step = 4, cls_loss = 2.41200
I1102 09:16:24.714752 14192 train_backbone.py:97] train step = 5, cls_loss = 2.36562
I1102 09:16:24.949074 14192 train_backbone.py:97] train step = 6, cls_loss = 2.37916
I1102 09:16:25.183419 14192 train_backbone.py:97] train step = 7, cls_loss = 2.29218
I1102 09:16:25.402090 14192 train_backbone.py:97] train step = 8, cls_loss = 2.28306
I1102 09:16:25.637276 14192 train_backbone.py:97] train step = 9, cls_loss = 2.28749
I1102 09:16:25.887218 14192 train_backbone.py:97] train step = 10, cls_loss = 2.29699
I1102 09:16:26.121537 14192 train_backbone.py:97] train step = 11, cls_loss = 2.26547
I1102 09:16:26.355856 14192 train_backbone.py:97] train step = 12, cls_loss = 2.35975
I1102 09:16:26.590177 14192 train_backbone.py:97] train step = 13, cls_loss = 2.35546
I1102 09:16:26.813820 14192 train_backbone.py:97] train step = 14, cls_loss = 2.23688
I1102 09:16:27.049402 14192 train_backbone.py:97] train step = 15, cls_loss = 2.35449
I1102 09:16:27.314964 14192 train_backbone.py:97] train step = 16, cls_loss = 2.32679
I1102 09:16:27.564905 14192 train_backbone.py:97] train step = 17, cls_loss = 2.26078
I1102 09:16:27.799224 14192 train_backbone.py:97] train step = 18, cls_loss = 2.31926
I1102 09:16:28.028345 14192 train_backbone.py:97] train step = 19, cls_loss = 2.37051
I1102 09:16:28.262664 14192 train_backbone.py:97] train step = 20, cls_loss = 2.33416
I1102 09:16:28.481362 14192 train_backbone.py:97] train step = 21, cls_loss = 2.31699
I1102 09:16:28.715682 14192 train_backbone.py:97] train step = 22, cls_loss = 2.31238
I1102 09:16:28.965623 14192 train_backbone.py:97] train step = 23, cls_loss = 2.29348
I1102 09:16:29.371894 14192 train_backbone.py:97] train step = 24, cls_loss = 2.31252
I1102 09:16:29.624755 14192 train_backbone.py:97] train step = 25, cls_loss = 2.29014
I1102 09:16:29.868103 14192 train_backbone.py:97] train step = 26, cls_loss = 2.29235
I1102 09:16:30.104471 14192 train_backbone.py:97] train step = 27, cls_loss = 2.29885
I1102 09:16:30.343830 14192 train_backbone.py:97] train step = 28, cls_loss = 2.30771

(ellipsis)

I1102 09:31:27.488112 14192 train_backbone.py:97] train step = 3781, cls_loss = 2.29822
I1102 09:31:27.694538 14192 train_backbone.py:97] train step = 3782, cls_loss = 2.28998
I1102 09:31:27.898992 14192 train_backbone.py:97] train step = 3783, cls_loss = 2.29968
I1102 09:31:28.107434 14192 train_backbone.py:97] train step = 3784, cls_loss = 2.31657
I1102 09:31:28.313882 14192 train_backbone.py:97] train step = 3785, cls_loss = 2.33363
I1102 09:31:28.525316 14192 train_backbone.py:97] train step = 3786, cls_loss = 2.30077
I1102 09:31:28.735753 14192 train_backbone.py:97] train step = 3787, cls_loss = 2.29594
I1102 09:31:28.965138 14192 train_backbone.py:97] train step = 3788, cls_loss = 2.28513
I1102 09:31:29.176574 14192 train_backbone.py:97] train step = 3789, cls_loss = 2.30248
I1102 09:31:29.386014 14192 train_backbone.py:97] train step = 3790, cls_loss = 2.28849
I1102 09:31:29.595453 14192 train_backbone.py:97] train step = 3791, cls_loss = 2.28419
I1102 09:31:29.821846 14192 train_backbone.py:97] train step = 3792, cls_loss = 2.24379
I1102 09:31:30.029292 14192 train_backbone.py:97] train step = 3793, cls_loss = 2.31936
I1102 09:31:30.236737 14192 train_backbone.py:97] train step = 3794, cls_loss = 2.29435
I1102 09:31:30.442188 14192 train_backbone.py:97] train step = 3795, cls_loss = 2.29231
I1102 09:31:30.648634 14192 train_backbone.py:97] train step = 3796, cls_loss = 2.30401
I1102 09:31:30.855082 14192 train_backbone.py:97] train step = 3797, cls_loss = 2.28225
I1102 09:31:31.057540 14192 train_backbone.py:97] train step = 3798, cls_loss = 2.30102
I1102 09:31:31.268974 14192 train_backbone.py:97] train step = 3799, cls_loss = 2.27844
I1102 09:31:31.469438 14192 train_backbone.py:97] train step = 3800, cls_loss = 2.31878
I1102 09:31:31.676883 14192 train_backbone.py:97] train step = 3801, cls_loss = 2.31835
I1102 09:31:31.890312 14192 train_backbone.py:97] train step = 3802, cls_loss = 2.25618
I1102 09:31:32.096760 14192 train_backbone.py:97] train step = 3803, cls_loss = 2.31030
I1102 09:31:32.302210 14192 train_backbone.py:97] train step = 3804, cls_loss = 2.30390
I1102 09:31:32.510651 14192 train_backbone.py:97] train step = 3805, cls_loss = 2.29039
I1102 09:31:32.731063 14192 train_backbone.py:97] train step = 3806, cls_loss = 2.27882
I1102 09:31:32.961446 14192 train_backbone.py:97] train step = 3807, cls_loss = 2.33232
I1102 09:31:33.165899 14192 train_backbone.py:97] train step = 3808, cls_loss = 2.28104
I1102 09:31:33.379328 14192 train_backbone.py:97] train step = 3809, cls_loss = 2.28902

Process finished with exit code 0

1 Ответ

1 голос
/ 04 ноября 2019

Отвечу на свой вопрос сам. Сегодня я обнаружил причину проблемы: слои BN (batch normalization) не обновлялись. Я виню в этом команду TensorFlow за то, что она не предоставила подробную документацию. В более старых версиях слой BN нужно было обновлять вручную. В режиме немедленного выполнения (eager execution) этот код ручного обновления не работает. Нужно вызвать tf.keras.backend.set_learning_phase(True) перед началом цикла обучения, и обучение будет работать нормально. Перед циклом тестирования значение нужно изменить на False.

...