I have been trying to use MobileNetV2 as an image classifier. There are 10 categories, chosen at random from the ImageNet classification dataset. But the loss does not converge during training; it just oscillates back and forth around a large value the whole time. I am not sure whether I am using the TensorFlow API correctly, because I am not very familiar with eager mode in the newer versions. Can anyone help me? Thanks a lot.
I am using version 1.14. I also tried running the code directly on version 2.0 and got similar results. The model I am using is https://github.com/qxde01/keras-alchemy/blob/master/models/mobilenet_v2.py
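For reference, the basic eager-mode pattern I am trying to follow is roughly the sketch below. This is not my real code: the model is a throwaway stand-in and the data is random; it is just the skeleton (GradientTape, manual gradients, Adam) that I think my script should match.

# eager training loop sketch (dummy model and data, not my real setup)
import numpy as np
import tensorflow as tf

tf.compat.v1.enable_eager_execution()  # already the default on TF 2.x

# Dummy stand-in for my real pipeline: 10 classes, 127x127x3 images.
images = np.random.rand(64, 127, 127, 3).astype(np.float32)
labels = np.random.randint(0, 10, size=(64,)).astype(np.int32)
dataset = tf.data.Dataset.from_tensor_slices((images, labels)).shuffle(64).batch(8)

# Tiny stand-in model; in my script this is MobileNetV2(include_top=True, classes=10).
model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(127, 127, 3)),
    tf.keras.layers.Dense(10)  # raw logits, no softmax
])
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

for step, (img, label) in enumerate(dataset):
    with tf.GradientTape() as tape:
        logits = model(img)
        loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(labels=label, logits=logits))
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    print("step", step, "loss", float(loss))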
Below are my training code and the dataset-processing code:
# train code
from __future__ import absolute_import, division, print_function, unicode_literals
import argparse
import logging
import numpy as np
import os
import random
import tensorflow as tf
from config import cfg
from mobile_net_v2.dataset import ImageNet_Dataset
from mobile_net_v2 import mobilenet_v2

logger = logging.getLogger("train")
logger.setLevel(logging.INFO)
tf.compat.v1.enable_eager_execution()
tf.keras.backend.clear_session()


def set_random_seed(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    tf.compat.v1.set_random_seed(seed)


def main():
    cfg.merge_from_file(args.cfg)
    train_dataset = ImageNet_Dataset(**cfg.DATASET.IMGNET)

    def get_train_data_by_cv2(index):
        img, cls_label, path = train_dataset[index]
        return img, cls_label, path

    index_sum = len(train_dataset.imgs_path_list)
    assert index_sum != 0, 'Dataset information read error'
    index = list(range(0, index_sum))
    random.shuffle(index)
    dataset = tf.data.Dataset.from_tensor_slices(index)
    dataset = dataset.map(lambda index: tf.py_function(
        get_train_data_by_cv2, [index], [tf.float32, tf.int32, tf.string]),
        num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.batch(32).repeat(10).shuffle(32)

    MNV2_Model = mobilenet_v2.MobileNetV2(include_top=True, input_shape=(127, 127, 3), alpha=0.5,
                                          classes=cfg.DATASET.IMGNET.MAX_NUM_OF_CLASS)
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.005)
    # optimizer = tf.keras.optimizers.Adam(learning_rate=cfg.TRAIN.BASE_LR)
    global_step = tf.Variable(0, dtype=tf.int64)
    checkpoint = tf.train.Checkpoint(Model=MNV2_Model, Global_Step=global_step, Optimizer=optimizer)
    ckp_manager = tf.train.CheckpointManager(checkpoint, directory=args.ckp_dir,
                                             max_to_keep=args.num_of_ckp_to_keep)
    if ckp_manager.latest_checkpoint:
        print("Restored from {}".format(ckp_manager.latest_checkpoint))
        checkpoint.restore(ckp_manager.latest_checkpoint)
    else:
        print("Initializing from scratch.")

    # tensorboard
    summary_writer = tf.compat.v2.summary.create_file_writer(args.tsb_dir)

    batch_count_end = tf.convert_to_tensor(0, dtype=tf.float32)
    batch_count = 0
    grads_cache = []
    for batch_index, (img, label_cls, path) in enumerate(dataset):
        with tf.GradientTape() as tape:
            cls = MNV2_Model(img)
            batch, c = cls.shape
            # batch, h, w, c = cls.shape
            cls_reshape = tf.reshape(cls, [batch, -1])
            label_cls = tf.reshape(label_cls, [-1])
            # cls_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=cls_reshape, labels=label_cls)
            cls_loss = tf.keras.losses.sparse_categorical_crossentropy(y_pred=cls_reshape, y_true=label_cls)
            cls_loss = tf.reduce_mean(cls_loss)
        # trainable_variables = MNV2_Model.trainable_variables
        # tape_watched_variables = tape.watched_variables()
        gradients = tape.gradient(cls_loss, MNV2_Model.trainable_variables)
        len_ = len(gradients)
        if batch_count_end.numpy() == 0:
            optimizer.apply_gradients(zip(gradients, MNV2_Model.trainable_variables))
            logger.info("train step = {}, cls_loss = {:.5f}".format(global_step.numpy(), cls_loss.numpy()))
            with summary_writer.as_default():
                tf.compat.v2.summary.scalar("cls_loss", cls_loss, step=global_step)
                tf.compat.v2.summary.scalar("LR", optimizer.learning_rate, step=global_step)
        else:
            if batch_count == 0:
                grads_cache = gradients
                batch_count += 1
                logger.info("train step = {}, cls_loss = {:.5f}".format(global_step.numpy(), cls_loss.numpy()))
                with summary_writer.as_default():
                    tf.compat.v2.summary.scalar("cls_loss", cls_loss, step=global_step)
                    tf.compat.v2.summary.scalar("LR", optimizer.learning_rate, step=global_step)
            else:
                for ind in range(len_):
                    if gradients[ind] is None:
                        continue
                    grads_cache[ind] += gradients[ind]
                if batch_count == batch_count_end.numpy():
                    batch_count = 0
                    for ind in range(len_):
                        if grads_cache[ind] is None:
                            continue
                        grads_cache[ind] = grads_cache[ind] / (batch_count_end + 1.)
                    optimizer.apply_gradients(zip(grads_cache, MNV2_Model.trainable_variables))
                else:
                    batch_count += 1
        if batch_index % args.ckp_save_interval == 0:
            ckp_manager.save()
        global_step = global_step + 1


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    req_grp = parser.add_argument_group('required')
    req_grp.add_argument('--ckp_dir', default="./backbone_cp", help='directory to save checkpoints to')
    req_grp.add_argument('--ckp_save_interval', default=2000, help='interval between model checkpoints')
    req_grp.add_argument('--num_of_ckp_to_keep', default=5, help='the number of checkpoints to keep')
    req_grp.add_argument('--tsb_dir', default="./backbone_tsb", help='directory to save tensorboard info to')
    req_grp.add_argument('--cfg', type=str, default='../config_yaml/mobilenetv2_lt_config.yaml',
                         help='configuration of tracking')
    parser.add_argument('--seed', type=int, default=123456, help='random seed')
    args = parser.parse_args()
    logger.info("train start")
    if not os.path.exists(args.ckp_dir):
        abs_dir = os.path.abspath(args.ckp_dir)
        logger.warning("checkpoint directory does not exist, creating it now: " + abs_dir)
        os.makedirs(abs_dir)
    if not os.path.exists(args.tsb_dir):
        abs_dir = os.path.abspath(args.tsb_dir)
        logger.warning("tensorboard directory does not exist, creating it now: " + abs_dir)
        os.makedirs(abs_dir)
    assert os.path.exists(args.cfg), 'The configuration file does not exist; the path may be wrong.'
    set_random_seed(args.seed)
    main()
# ILSVRC2012_CLASS dataset code
# -*- coding: UTF-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
import os
import cv2
import numpy as np
import tensorflow as tf
import random
import scipy.io
from config import cfg


class ImageNet_Dataset:
    def __init__(self, ROOT, WIND_ROOT, MAX_NUM_OF_CLASS=1000):
        self.WIND_To_ID_Dict = {}
        synsets = scipy.io.loadmat(WIND_ROOT)['synsets']
        for synset in synsets:
            self.WIND_To_ID_Dict[synset[0][1][0]] = synset[0][0][0][0]  # WIND=ID
        self.imgs_path_list = []
        self.img_label_list = []
        anno_file = os.path.join(ROOT, 'anno.txt')
        if not os.path.exists(anno_file):
            print("Tag file does not exist, create")
            count = 0
            for file_name in os.listdir(ROOT):
                path = os.path.join(ROOT, file_name)
                if os.path.isdir(path) and file_name[0] == "n":
                    this_class_img_list = os.listdir(path)
                    self.imgs_path_list += [os.path.join(path, img_name) for img_name in this_class_img_list]
                    this_img_label = self.WIND_To_ID_Dict[file_name] - 1
                    this_img_label_list = [this_img_label] * len(this_class_img_list)
                    self.img_label_list += this_img_label_list
                    count += 1
                    print("finish:", path, "count", count)
            file = open(os.path.join(ROOT, 'anno.txt'), 'w')
            for img_path, label in zip(self.imgs_path_list, self.img_label_list):
                file.writelines(img_path + " " + "{}".format(label) + "\n")
            file.close()
        else:
            print("Tag file exists, read")
            for line in open(anno_file, encoding='utf-8'):
                path_and_label = line.strip("\n").split(' ')
                self.imgs_path_list += [path_and_label[0]]
                self.img_label_list += [int(path_and_label[1])]
        count_temp = 0
        val_mem = -1
        for index, label in enumerate(self.img_label_list):
            if label != val_mem:
                val_mem = label
                count_temp += 1
            if label >= 1000:
                print("Outlier:", label)
        print("A total of {} classes".format(count_temp))
        # Crop data set
        if MAX_NUM_OF_CLASS != 1000:
            # Randomly select N classes
            temp = list(np.arange(1000))
            random_choose_label = random.sample(temp, MAX_NUM_OF_CLASS)
            # Select class and path
            label_list_copy = self.img_label_list[:]
            path_list_copy = self.imgs_path_list[:]
            self.img_label_list = []
            self.imgs_path_list = []
            for index, label in enumerate(label_list_copy):
                if label in random_choose_label:
                    self.img_label_list.append(label_list_copy[index])
                    self.imgs_path_list.append(path_list_copy[index])
            # Select WIND
            choosed_WIND = {}
            # Recode tags and WIND
            val_mem = -1
            count_temp = -1
            for index, val in enumerate(self.img_label_list):
                if val != val_mem:
                    val_mem = val
                    count_temp += 1
                    for WIND in self.WIND_To_ID_Dict.keys():
                        if self.WIND_To_ID_Dict[WIND] == val:
                            choosed_WIND[WIND] = count_temp
                            print("Selected class:", WIND)
                            break
                self.img_label_list[index] = count_temp
            self.WIND_To_ID_Dict = choosed_WIND
            print("Final selection of {} classes".format(len(self.WIND_To_ID_Dict)))
            # Save the new WIND and ID table
            file = open(os.path.join(ROOT, 'anno_select.txt'), 'w')
            for KIND in self.WIND_To_ID_Dict.keys():
                file.writelines(KIND + " " + "{}".format(self.WIND_To_ID_Dict[KIND]) + "\n")
            file.close()
            file = open(os.path.join(ROOT, 'new_anno.txt'), 'w')
            for path, label in zip(self.imgs_path_list, self.img_label_list):
                file.writelines(path + " " + "{}".format(label) + "\n")
            file.close()

    def __getitem__(self, index):
        image = cv2.imread(self.imgs_path_list[index])
        label = self.img_label_list[index]
        image_resize = cv2.resize(image, (127, 127)).astype(np.float32)
        return image_resize, label, self.imgs_path_list[index]
if __name__ == "__main__":
    dataset = ImageNet_Dataset(**cfg.DATASET.IMGNET)
    path_list = dataset.imgs_path_list
    label_list = dataset.img_label_list
    WIND_ID = dataset.WIND_To_ID_Dict
    print("Total number of training pictures", len(dataset.imgs_path_list))
    check_list = [100, 1500, 2000, 1300, 188, 1503]
    for ind in check_list:
        im, label, _ = dataset[ind]  # __getitem__ returns (image, label, path)
        cv2.imshow("label {} ind {}".format(label, ind), im)
        cv2.waitKey()
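The part of the training script I am least sure about is the gradient-accumulation branch (the grads_cache logic). In the run shown below batch_count_end is 0, so only the plain per-batch update path executes, but what that branch is meant to do is roughly the following sketch. It reuses the generic model, optimizer and dataset names from the sketch near the top of this question, not my real objects:

# gradient accumulation sketch (same dummy names as the first sketch)
# Sum gradients over accum_steps mini-batches, average them, then apply one update.
accum_steps = 4
grads_cache = None

for step, (img, label) in enumerate(dataset):
    with tf.GradientTape() as tape:
        logits = model(img)
        loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(labels=label, logits=logits))
    grads = tape.gradient(loss, model.trainable_variables)

    if grads_cache is None:
        # Initialise the cache on the first micro-batch of a window.
        grads_cache = [tf.zeros_like(g) if g is not None else None for g in grads]
    grads_cache = [c + g if g is not None else c for c, g in zip(grads_cache, grads)]

    if (step + 1) % accum_steps == 0:
        averaged = [(c / accum_steps, v)
                    for c, v in zip(grads_cache, model.trainable_variables)
                    if c is not None]
        optimizer.apply_gradients(averaged)
        grads_cache = None  # start a new accumulation window
        print("applied averaged update at step", step, "loss", float(loss))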
Below is the console output from the training run:
2019-11-02 09:16:17.201821: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library nvcuda.dll
2019-11-02 09:16:17.289288: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1640] Found device 0 with properties:
name: GeForce GTX 1050 Ti major: 6 minor: 1 memoryClockRate(GHz): 1.493
pciBusID: 0000:01:00.0
2019-11-02 09:16:17.289592: I tensorflow/stream_executor/platform/default/dlopen_checker_stub.cc:25] GPU libraries are statically linked, skip dlopen check.
2019-11-02 09:16:17.291786: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1763] Adding visible gpu devices: 0
2019-11-02 09:16:17.292160: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2
2019-11-02 09:16:17.295064: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1640] Found device 0 with properties:
name: GeForce GTX 1050 Ti major: 6 minor: 1 memoryClockRate(GHz): 1.493
pciBusID: 0000:01:00.0
2019-11-02 09:16:17.295369: I tensorflow/stream_executor/platform/default/dlopen_checker_stub.cc:25] GPU libraries are statically linked, skip dlopen check.
2019-11-02 09:16:17.297488: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1763] Adding visible gpu devices: 0
2019-11-02 09:16:17.872697: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1181] Device interconnect StreamExecutor with strength 1 edge matrix:
2019-11-02 09:16:17.872919: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1187] 0
2019-11-02 09:16:17.873052: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1200] 0: N
2019-11-02 09:16:17.875796: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1326] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 3001 MB memory) -> physical GPU (device: 0, name: GeForce GTX 1050 Ti, pci bus id: 0000:01:00.0, compute capability: 6.1)
W1102 09:16:18.032456 14192 deprecation.py:323] From C:\software\Anaconda3\envs\CV_env\lib\site-packages\tensorflow\python\data\util\random_seed.py:58: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Initializing from scratch.
2019-11-02 09:16:20.087193: W .\tensorflow/core/framework/model.h:213] Encountered a stop event that was not preceded by a start event.
W1102 09:16:20.077111 15760 backprop.py:820] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.int32
W1102 09:16:20.077111 16592 backprop.py:820] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.int32
W1102 09:16:20.077111 6140 backprop.py:820] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.int32
W1102 09:16:20.077111 1564 backprop.py:820] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.int32
W1102 09:16:20.092706 15760 backprop.py:820] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.int32
I1102 09:16:22.997265 14192 train_backbone.py:97] train step = 0, cls_loss = 2.30259
I1102 09:16:23.513478 14192 train_backbone.py:97] train step = 1, cls_loss = 2.30025
I1102 09:16:23.750537 14192 train_backbone.py:97] train step = 2, cls_loss = 2.29853
I1102 09:16:23.980576 14192 train_backbone.py:97] train step = 3, cls_loss = 2.24046
I1102 09:16:24.480434 14192 train_backbone.py:97] train step = 4, cls_loss = 2.41200
I1102 09:16:24.714752 14192 train_backbone.py:97] train step = 5, cls_loss = 2.36562
I1102 09:16:24.949074 14192 train_backbone.py:97] train step = 6, cls_loss = 2.37916
I1102 09:16:25.183419 14192 train_backbone.py:97] train step = 7, cls_loss = 2.29218
I1102 09:16:25.402090 14192 train_backbone.py:97] train step = 8, cls_loss = 2.28306
I1102 09:16:25.637276 14192 train_backbone.py:97] train step = 9, cls_loss = 2.28749
I1102 09:16:25.887218 14192 train_backbone.py:97] train step = 10, cls_loss = 2.29699
I1102 09:16:26.121537 14192 train_backbone.py:97] train step = 11, cls_loss = 2.26547
I1102 09:16:26.355856 14192 train_backbone.py:97] train step = 12, cls_loss = 2.35975
I1102 09:16:26.590177 14192 train_backbone.py:97] train step = 13, cls_loss = 2.35546
I1102 09:16:26.813820 14192 train_backbone.py:97] train step = 14, cls_loss = 2.23688
I1102 09:16:27.049402 14192 train_backbone.py:97] train step = 15, cls_loss = 2.35449
I1102 09:16:27.314964 14192 train_backbone.py:97] train step = 16, cls_loss = 2.32679
I1102 09:16:27.564905 14192 train_backbone.py:97] train step = 17, cls_loss = 2.26078
I1102 09:16:27.799224 14192 train_backbone.py:97] train step = 18, cls_loss = 2.31926
I1102 09:16:28.028345 14192 train_backbone.py:97] train step = 19, cls_loss = 2.37051
I1102 09:16:28.262664 14192 train_backbone.py:97] train step = 20, cls_loss = 2.33416
I1102 09:16:28.481362 14192 train_backbone.py:97] train step = 21, cls_loss = 2.31699
I1102 09:16:28.715682 14192 train_backbone.py:97] train step = 22, cls_loss = 2.31238
I1102 09:16:28.965623 14192 train_backbone.py:97] train step = 23, cls_loss = 2.29348
I1102 09:16:29.371894 14192 train_backbone.py:97] train step = 24, cls_loss = 2.31252
I1102 09:16:29.624755 14192 train_backbone.py:97] train step = 25, cls_loss = 2.29014
I1102 09:16:29.868103 14192 train_backbone.py:97] train step = 26, cls_loss = 2.29235
I1102 09:16:30.104471 14192 train_backbone.py:97] train step = 27, cls_loss = 2.29885
I1102 09:16:30.343830 14192 train_backbone.py:97] train step = 28, cls_loss = 2.30771
(ellipsis)
I1102 09:31:27.488112 14192 train_backbone.py:97] train step = 3781, cls_loss = 2.29822
I1102 09:31:27.694538 14192 train_backbone.py:97] train step = 3782, cls_loss = 2.28998
I1102 09:31:27.898992 14192 train_backbone.py:97] train step = 3783, cls_loss = 2.29968
I1102 09:31:28.107434 14192 train_backbone.py:97] train step = 3784, cls_loss = 2.31657
I1102 09:31:28.313882 14192 train_backbone.py:97] train step = 3785, cls_loss = 2.33363
I1102 09:31:28.525316 14192 train_backbone.py:97] train step = 3786, cls_loss = 2.30077
I1102 09:31:28.735753 14192 train_backbone.py:97] train step = 3787, cls_loss = 2.29594
I1102 09:31:28.965138 14192 train_backbone.py:97] train step = 3788, cls_loss = 2.28513
I1102 09:31:29.176574 14192 train_backbone.py:97] train step = 3789, cls_loss = 2.30248
I1102 09:31:29.386014 14192 train_backbone.py:97] train step = 3790, cls_loss = 2.28849
I1102 09:31:29.595453 14192 train_backbone.py:97] train step = 3791, cls_loss = 2.28419
I1102 09:31:29.821846 14192 train_backbone.py:97] train step = 3792, cls_loss = 2.24379
I1102 09:31:30.029292 14192 train_backbone.py:97] train step = 3793, cls_loss = 2.31936
I1102 09:31:30.236737 14192 train_backbone.py:97] train step = 3794, cls_loss = 2.29435
I1102 09:31:30.442188 14192 train_backbone.py:97] train step = 3795, cls_loss = 2.29231
I1102 09:31:30.648634 14192 train_backbone.py:97] train step = 3796, cls_loss = 2.30401
I1102 09:31:30.855082 14192 train_backbone.py:97] train step = 3797, cls_loss = 2.28225
I1102 09:31:31.057540 14192 train_backbone.py:97] train step = 3798, cls_loss = 2.30102
I1102 09:31:31.268974 14192 train_backbone.py:97] train step = 3799, cls_loss = 2.27844
I1102 09:31:31.469438 14192 train_backbone.py:97] train step = 3800, cls_loss = 2.31878
I1102 09:31:31.676883 14192 train_backbone.py:97] train step = 3801, cls_loss = 2.31835
I1102 09:31:31.890312 14192 train_backbone.py:97] train step = 3802, cls_loss = 2.25618
I1102 09:31:32.096760 14192 train_backbone.py:97] train step = 3803, cls_loss = 2.31030
I1102 09:31:32.302210 14192 train_backbone.py:97] train step = 3804, cls_loss = 2.30390
I1102 09:31:32.510651 14192 train_backbone.py:97] train step = 3805, cls_loss = 2.29039
I1102 09:31:32.731063 14192 train_backbone.py:97] train step = 3806, cls_loss = 2.27882
I1102 09:31:32.961446 14192 train_backbone.py:97] train step = 3807, cls_loss = 2.33232
I1102 09:31:33.165899 14192 train_backbone.py:97] train step = 3808, cls_loss = 2.28104
I1102 09:31:33.379328 14192 train_backbone.py:97] train step = 3809, cls_loss = 2.28902
Process finished with exit code 0