I am training an SSD (Single Shot Detector) in PyTorch, but my training loss does not decrease ... I have searched for and tried various solutions for a week, but the problem still remains.
What should I do? Is my loss function wrong?
Here is my SSD300 model:
SSD300(
  (feature_layers): ModuleDict(
    (conv1_1): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (relu1_1): ReLU()
    (conv1_2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (relu1_2): ReLU()
    (pool1): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
    (conv2_1): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (relu2_1): ReLU()
    (conv2_2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (relu2_2): ReLU()
    (pool2): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
    (conv3_1): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (relu3_1): ReLU()
    (conv3_2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (relu3_2): ReLU()
    (conv3_3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (relu3_3): ReLU()
    (pool3): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=True)
    (conv4_1): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (relu4_1): ReLU()
    (conv4_2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (relu4_2): ReLU()
    (conv4_3): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (relu4_3): ReLU()
    (pool4): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
    (conv5_1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (relu5_1): ReLU()
    (conv5_2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (relu5_2): ReLU()
    (conv5_3): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (relu5_3): ReLU()
    (pool5): MaxPool2d(kernel_size=(3, 3), stride=(1, 1), padding=1, dilation=1, ceil_mode=False)
    (conv6): Conv2d(512, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(6, 6), dilation=(6, 6))
    (relu6): ReLU()
    (conv7): Conv2d(1024, 1024, kernel_size=(1, 1), stride=(1, 1))
    (relu7): ReLU()
    (conv8_1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1))
    (relu8_1): ReLU()
    (conv8_2): Conv2d(256, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (relu8_2): ReLU()
    (conv9_1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1))
    (relu9_1): ReLU()
    (conv9_2): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (relu9_2): ReLU()
    (conv10_1): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1))
    (relu10_1): ReLU()
    (conv10_2): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1))
    (relu10_2): ReLU()
    (conv11_1): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1))
    (relu11_1): ReLU()
    (conv11_2): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1))
    (relu11_2): ReLU()
  )
  (localization_layers): ModuleDict(
    (loc1): Sequential(
      (l2norm_loc1): L2Normalization()
      (conv_loc1): Conv2d(512, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (relu_loc1): ReLU()
    )
    (loc2): Sequential(
      (conv_loc2): Conv2d(1024, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (relu_loc2): ReLU()
    )
    (loc3): Sequential(
      (conv_loc3): Conv2d(512, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (relu_loc3): ReLU()
    )
    (loc4): Sequential(
      (conv_loc4): Conv2d(256, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (relu_loc4): ReLU()
    )
    (loc5): Sequential(
      (conv_loc5): Conv2d(256, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (relu_loc5): ReLU()
    )
    (loc6): Sequential(
      (conv_loc6): Conv2d(256, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (relu_loc6): ReLU()
    )
  )
  (confidence_layers): ModuleDict(
    (conf1): Sequential(
      (l2norm_conf1): L2Normalization()
      (conv_conf1): Conv2d(512, 84, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (relu_conf1): ReLU()
    )
    (conf2): Sequential(
      (conv_conf2): Conv2d(1024, 126, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (relu_conf2): ReLU()
    )
    (conf3): Sequential(
      (conv_conf3): Conv2d(512, 126, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (relu_conf3): ReLU()
    )
    (conf4): Sequential(
      (conv_conf4): Conv2d(256, 126, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (relu_conf4): ReLU()
    )
    (conf5): Sequential(
      (conv_conf5): Conv2d(256, 84, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (relu_conf5): ReLU()
    )
    (conf6): Sequential(
      (conv_conf6): Conv2d(256, 84, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (relu_conf6): ReLU()
    )
  )
  (predictor): Predictor()
)
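The Predictor at the end flattens the six localization/confidence heads and concatenates them into the single (batch, total_dbox_nums, 4 + class_nums) tensor that the loss consumes. Roughly, it does the equivalent of the following (a simplified sketch; flatten_heads is just an illustration, not my exact code):

import torch

def flatten_heads(head_outputs, k):
    # each head output has shape (batch, num_dboxes_per_cell * k, H, W);
    # k is 4 for the localization heads and 21 for the confidence heads
    flat = []
    for out in head_outputs:
        out = out.permute(0, 2, 3, 1).contiguous()   # (batch, H, W, boxes * k)
        flat.append(out.view(out.shape[0], -1, k))   # (batch, H * W * boxes, k)
    return torch.cat(flat, dim=1)                    # (batch, total_dbox_nums, k)

# predicts = torch.cat([flatten_heads(locs, 4), flatten_heads(confs, 21)], dim=-1)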
My loss function is defined as:
import torch
import torch.nn as nn


class SSDLoss(nn.Module):
    def __init__(self, alpha=1, matching_func=None, loc_loss=None, conf_loss=None):
        super().__init__()
        self.alpha = alpha
        # fall back to the module-level defaults when no custom callables are given
        self.matching_strategy = matching_strategy if matching_func is None else matching_func
        self.loc_loss = LocalizationLoss() if loc_loss is None else loc_loss
        self.conf_loss = ConfidenceLoss() if conf_loss is None else conf_loss

    def forward(self, predicts, gts, dboxes):
        """
        :param predicts: Tensor, shape is (batch, total_dbox_nums, 4 + class_nums) = (cx, cy, w, h, p_class, ...)
        :param gts: Tensor, shape is (batch * bbox_nums, 1 + 4 + class_nums) = [[img_ind, cx, cy, w, h, p_class, ...], ...]
        :param dboxes: Tensor, shape is (total_dbox_nums, 4) = (cx, cy, w, h)
        :return:
            loss: float
        """
        # split predictions into localization and confidence parts
        pred_loc, pred_conf = predicts[:, :, :4], predicts[:, :, 4:]

        # match ground-truth boxes to default boxes (IoU threshold 0.5)
        pos_indicator, gt_loc, gt_conf = self.matching_strategy(gts, dboxes, batch_num=predicts.shape[0], threshold=0.5)

        # encode ground-truth boxes as offsets relative to the default boxes
        gt_loc = gt_loc_converter(gt_loc, dboxes)

        # localization loss (positives only)
        loc_loss = self.loc_loss(pos_indicator, pred_loc, gt_loc)

        # confidence loss (positives + hard-mined negatives)
        conf_loss = self.conf_loss(pos_indicator, pred_conf, gt_conf)

        return conf_loss + self.alpha * loc_loss


class LocalizationLoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.smoothL1Loss = nn.SmoothL1Loss(reduction='none')

    def forward(self, pos_indicator, predicts, gts):
        N = pos_indicator.sum()  # number of matched (positive) default boxes
        total_loss = self.smoothL1Loss(predicts, gts).sum(dim=-1)  # shape = (batch num, dboxes num)
        loss = total_loss.masked_select(pos_indicator)  # keep positives only
        return loss.sum() / N


class ConfidenceLoss(nn.Module):
    def __init__(self, neg_factor=3):
        """
        :param neg_factor: int, the ratio (1 (pos) : neg_factor) of positives to negatives for hard negative mining
        """
        super().__init__()
        self.logsoftmax = nn.LogSoftmax(dim=-1)
        self._neg_factor = neg_factor

    def forward(self, pos_indicator, predicts, gts):
        # cross-entropy per default box
        loss = (-gts * self.logsoftmax(predicts)).sum(dim=-1)  # shape = (batch num, dboxes num)

        N = pos_indicator.sum()
        neg_indicator = torch.logical_not(pos_indicator)

        pos_loss = loss.masked_select(pos_indicator)
        neg_loss = loss.masked_select(neg_indicator)

        # hard negative mining: keep at most neg_factor * N highest-loss negatives
        neg_num = neg_loss.shape[0]
        neg_num = min(neg_num, self._neg_factor * int(N))
        _, topk_indices = torch.topk(neg_loss, neg_num)
        neg_loss = neg_loss.index_select(dim=0, index=topk_indices)

        return (pos_loss.sum() + neg_loss.sum()) / N
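The helpers matching_strategy and gt_loc_converter are defined elsewhere in my code; gt_loc_converter encodes the matched ground-truth boxes as offsets relative to the default boxes, following the standard SSD parameterization. A condensed sketch of what it does (variance scaling omitted here):

def gt_loc_converter(gt_loc, dboxes):
    # boxes are (cx, cy, w, h); offsets follow the SSD paper:
    # t_cx = (g_cx - d_cx) / d_w, t_cy = (g_cy - d_cy) / d_h,
    # t_w = log(g_w / d_w), t_h = log(g_h / d_h)
    cxcy = (gt_loc[..., :2] - dboxes[..., :2]) / dboxes[..., 2:]
    wh = torch.log(gt_loc[..., 2:] / dboxes[..., 2:])
    return torch.cat([cxcy, wh], dim=-1)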
The loss output is below:
Training... Epoch: 1, Iter: 1, [32/21503 (0%)] Loss: 28.804445
Training... Epoch: 1, Iter: 10, [320/21503 (1%)] Loss: 12.880742
Training... Epoch: 1, Iter: 20, [640/21503 (3%)] Loss: 15.932519
Training... Epoch: 1, Iter: 30, [960/21503 (4%)] Loss: 14.624641
Training... Epoch: 1, Iter: 40, [1280/21503 (6%)] Loss: 16.301014
Training... Epoch: 1, Iter: 50, [1600/21503 (7%)] Loss: 15.710087
Training... Epoch: 1, Iter: 60, [1920/21503 (9%)] Loss: 12.441727
Training... Epoch: 1, Iter: 70, [2240/21503 (10%)] Loss: 12.283393
Training... Epoch: 1, Iter: 80, [2560/21503 (12%)] Loss: 12.272835
Training... Epoch: 1, Iter: 90, [2880/21503 (13%)] Loss: 12.273635
Training... Epoch: 1, Iter: 100, [3200/21503 (15%)] Loss: 12.273409
Training... Epoch: 1, Iter: 110, [3520/21503 (16%)] Loss: 12.266172
Training... Epoch: 1, Iter: 120, [3840/21503 (18%)] Loss: 12.272820
Training... Epoch: 1, Iter: 130, [4160/21503 (19%)] Loss: 12.274920
Training... Epoch: 1, Iter: 140, [4480/21503 (21%)] Loss: 12.275247
Training... Epoch: 1, Iter: 150, [4800/21503 (22%)] Loss: 12.273258
Training... Epoch: 1, Iter: 160, [5120/21503 (24%)] Loss: 12.277486
Training... Epoch: 1, Iter: 170, [5440/21503 (25%)] Loss: 12.266512
Training... Epoch: 1, Iter: 180, [5760/21503 (27%)] Loss: 12.265674
Training... Epoch: 1, Iter: 190, [6080/21503 (28%)] Loss: 12.265306
Training... Epoch: 1, Iter: 200, [6400/21503 (30%)] Loss: 12.269717
Training... Epoch: 1, Iter: 210, [6720/21503 (31%)] Loss: 12.274122
Training... Epoch: 1, Iter: 220, [7040/21503 (33%)] Loss: 12.263970
Training... Epoch: 1, Iter: 230, [7360/21503 (34%)] Loss: 12.267252
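One thing I noticed: the plateau at ~12.27 is very close to the loss you would get if the confidences were simply a uniform distribution over the 21 classes (20 object classes + background). A quick check:

import math

uniform_ce = -math.log(1.0 / 21)  # per-box cross-entropy for a uniform softmax, ~3.045
print((1 + 3) * uniform_ce)       # N positives + 3N hard negatives, divided by N -> ~12.18

So the network seems to be stuck predicting (almost) the same distribution for every default box.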