Question

Я пытаюсь обучить эту модель keras-yolov3 из keras-yolov3 в моем настраиваемом наборе данных (который имеет только один класс для «персонажа») с помощью Google Colab.После первых 50 эпох использования полного Yolo с замороженными выходными слоями я получаю ошибку RunOutOfMemory памяти GPU с Tesla T4 GPU.Итак, я попытался обучить модель до первых 50 эпох, затем сохранил вес модели в logs / 000 / train_weights_stage_1.h5, а затем перезагрузил модель с новыми весами.

Это код изtrain.py, который я использовал для обучения первых 50 эпох.

"""
Retrain the YOLO model for your own dataset.
"""

"""
I am removing the checkpointing, since it takes a lot of space and training 
Yolov3 model doesn't take much time, just saving end weights
"""

import numpy as np
import keras.backend as K
from keras.layers import Input, Lambda
from keras.models import Model
from keras.optimizers import Adam
from keras.callbacks import TensorBoard, ModelCheckpoint, ReduceLROnPlateau, EarlyStopping

from yolo3.model import preprocess_true_boxes, yolo_body, tiny_yolo_body, yolo_loss
from yolo3.utils import get_random_data


def _main():
    annotation_path = 'model_data/annotations.txt'
    log_dir = 'logs/000/'
    classes_path = 'model_data/people_tracking_classes.txt'
    anchors_path = 'model_data/yolo_anchors.txt'
    class_names = get_classes(classes_path)
    num_classes = len(class_names)
    anchors = get_anchors(anchors_path)

    input_shape = (416,416) # multiple of 32, hw

    is_tiny_version = len(anchors)==6 # default setting
    if is_tiny_version:
        model = create_tiny_model(input_shape, anchors, num_classes,
            freeze_body=2, weights_path='model_data/tiny_yolo.h5')
    else:
        model = create_model(input_shape, anchors, num_classes,
            freeze_body=2, weights_path='model_data/yolo.h5') # make sure you know what you freeze

    logging = TensorBoard(log_dir=log_dir) #SEE THIS
    checkpoint = ModelCheckpoint(log_dir + 'ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5',
        monitor='val_loss', save_weights_only=True, save_best_only=True, period=3, verbose=0)
    #verbose=1 isn't that beneficial.
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1)
    early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=1)

    val_split = 0.1
    with open(annotation_path) as f:
        lines = f.readlines()
    np.random.seed(10101)
    np.random.shuffle(lines)
    print(lines)
    np.random.seed(None)
    num_val = int(len(lines)*val_split)
    num_train = len(lines) - num_val

    # Train with frozen layers first, to get a stable loss.
    # Adjust num epochs to your dataset. This step is enough to obtain a not bad model.
    if True:
        model.compile(optimizer=Adam(lr=1e-3), loss={
            # use custom yolo_loss Lambda layer.
            'yolo_loss': lambda y_true, y_pred: y_pred})



        batch_size = 32
        print('Train on {} samples, val on {} samples, with batch size {}.'.format(num_train, num_val, batch_size))
        history = model.fit_generator(data_generator_wrapper(lines[:num_train], batch_size, input_shape, anchors, num_classes),
                steps_per_epoch=max(1, num_train//batch_size),
                validation_data=data_generator_wrapper(lines[num_train:], batch_size, input_shape, anchors, num_classes),
                validation_steps=max(1, num_val//batch_size),
                epochs=50,
                initial_epoch=0,
                callbacks=[logging]) #checkpoint])
        model.save_weights(log_dir + 'trained_weights_stage_1.h5')
        model.save(log_dir + 'trained_model_stage_1.h5')
        json_string = model.to_json()
        print('Saved Model Architecture to:  {}'.format(json_string))


    # Unfreeze and continue training, to fine-tune.
    # Train longer if the result is not good.
    """if True:
        for i in range(len(model.layers)):
            model.layers[i].trainable = True
        model.compile(optimizer=Adam(lr=1e-4), loss={'yolo_loss': lambda y_true, y_pred: y_pred}) # recompile to apply the change
        print('Unfreeze all of the layers.')

        batch_size = 32 # note that more GPU memory is required after unfreezing the body
        print('Train on {} samples, val on {} samples, with batch size {}.'.format(num_train, num_val, batch_size))
        #import pdb; pdb.set_trace()
        history = model.fit_generator(data_generator_wrapper(lines[:num_train], batch_size, input_shape, anchors, num_classes),
                      steps_per_epoch=max(1, num_train//batch_size),
                      validation_data=data_generator_wrapper(lines[num_train:], batch_size, input_shape, anchors, num_classes),
                      validation_steps=max(1, num_val//batch_size),
                      epochs=100,
                      initial_epoch=50,
                      callbacks=[logging, reduce_lr, early_stopping]) #removed checkpointing.
        model.save_weights(log_dir + 'trained_weights_final.h5')"""

    # Further training if needed.


def get_classes(classes_path):
    '''loads the classes'''
    with open(classes_path) as f:
        class_names = f.readlines()
    class_names = [c.strip() for c in class_names]
    return class_names

def get_anchors(anchors_path):
    '''loads the anchors from a file'''
    with open(anchors_path) as f:
        anchors = f.readline()
    anchors = [float(x) for x in anchors.split(',')]
    return np.array(anchors).reshape(-1, 2)


def create_model(input_shape, anchors, num_classes, load_pretrained=True, freeze_body=2,
            weights_path='model_data/yolo.h5'):
    '''create the training model'''
    K.clear_session() # get a new session
    image_input = Input(shape=(None, None, 3))
    h, w = input_shape
    num_anchors = len(anchors)

    #This I don't know what's happening, can understand the python, but not the
    # Logic behind y_true, but seems like 3 output layers of yolov3
    y_true = [Input(shape=(h//{0:32, 1:16, 2:8}[l], w//{0:32, 1:16, 2:8}[l], \
        num_anchors//3, num_classes+5)) for l in range(3)]

    model_body = yolo_body(image_input, num_anchors//3, num_classes)
    print('Create YOLOv3 model with {} anchors and {} classes.'.format(num_anchors, num_classes))



    if load_pretrained:
      """model.load_weights(filepath, by_name=False) loads the weights of 
      the model from a HDF5 file (created by save_weights). By default, the
      architecture is expected to be unchanged. To load weights into a different
      architecture (with some layers in common), use by_name=True to load only 
      those layers with the same name.
      """
      model_body.load_weights(weights_path, by_name=True, skip_mismatch=True)
      #skip_mismatch
      print(len(model_body.layers)) # To find out the number of layers, should
      # be 185.

      print('Load weights {}.'.format(weights_path))
      if freeze_body in [1, 2]:
          # Freeze darknet53 body or freeze all but 3 output layers.
          num = (185, len(model_body.layers)-3)[freeze_body-1]
          for i in range(num): model_body.layers[i].trainable = False
          print('Freeze the first {} layers of total {} layers.'.format(num, len(model_body.layers)))

    model_loss = Lambda(yolo_loss, output_shape=(1,), name='yolo_loss',
        arguments={'anchors': anchors, 'num_classes': num_classes, 'ignore_thresh': 0.5})(
        [*model_body.output, *y_true])
    model = Model([model_body.input, *y_true], model_loss)

    return model

def create_tiny_model(input_shape, anchors, num_classes, load_pretrained=True, freeze_body=2,
            weights_path='model_data/tiny_yolo.h5'):
    '''create the training model, for Tiny YOLOv3'''
    K.clear_session() # get a new session
    image_input = Input(shape=(None, None, 3))
    h, w = input_shape
    num_anchors = len(anchors)

    y_true = [Input(shape=(h//{0:32, 1:16}[l], w//{0:32, 1:16}[l], \
        num_anchors//2, num_classes+5)) for l in range(2)]

    model_body = tiny_yolo_body(image_input, num_anchors//2, num_classes)
    print('Create Tiny YOLOv3 model with {} anchors and {} classes.'.format(num_anchors, num_classes))

    if load_pretrained:
        model_body.load_weights(weights_path, by_name=True, skip_mismatch=True)
        print('Load weights {}.'.format(weights_path))
        if freeze_body in [1, 2]:
            # Freeze the darknet body or freeze all but 2 output layers.
            num = (20, len(model_body.layers)-2)[freeze_body-1]
            for i in range(num): model_body.layers[i].trainable = False
            print('Freeze the first {} layers of total {} layers.'.format(num, len(model_body.layers)))

    model_loss = Lambda(yolo_loss, output_shape=(1,), name='yolo_loss',
        arguments={'anchors': anchors, 'num_classes': num_classes, 'ignore_thresh': 0.7})(
        [*model_body.output, *y_true])
    model = Model([model_body.input, *y_true], model_loss)

    return model

def data_generator(annotation_lines, batch_size, input_shape, anchors, num_classes):
    '''data generator for fit_generator'''
    n = len(annotation_lines)
    i = 0
    while True:
        image_data = []
        box_data = []
        for b in range(batch_size):
            if i==0:
                np.random.shuffle(annotation_lines)
            image, box = get_random_data(annotation_lines[i], input_shape, random=True)
            image_data.append(image)
            box_data.append(box)
            i = (i+1) % n
        image_data = np.array(image_data)
        box_data = np.array(box_data)
        y_true = preprocess_true_boxes(box_data, input_shape, anchors, num_classes)
        yield [image_data, *y_true], np.zeros(batch_size)

def data_generator_wrapper(annotation_lines, batch_size, input_shape, anchors, num_classes):
    n = len(annotation_lines)
    if n==0 or batch_size<=0: return None
    return data_generator(annotation_lines, batch_size, input_shape, anchors, num_classes)

if __name__ == '__main__':
    _main()


# Explaining the Skipping loading of weights warning below.


"""These layers are the three output layers of the yolo network. You probably 
got the message because you changed the number of classes and therefore the 
shape of the last conv layers will change aswell. So you can just ignore the 
message, the network will work just fine."""

После завершения первых 50 эпох, как я уже говорил, я загрузил тренировочные веса в модель и попытался отрегулировать ее, разморозив все слои.Как можно видеть ниже:

from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
import numpy as np
from keras.layers import Input, Lambda




from yolo3.model import preprocess_true_boxes, yolo_body, tiny_yolo_body, yolo_loss
from yolo3.utils import get_random_data


import keras.losses
keras.losses.custom_loss = yolo_loss

log_dir = 'logs/000/'
annotation_path = 'model_data/annotations.txt'
val_split = 0.1

with open(annotation_path) as f:
    lines = f.readlines()
np.random.seed(10101)
np.random.shuffle(lines)
print(lines)
np.random.seed(None)
num_val = int(len(lines)*val_split)
num_train = len(lines) - num_val

checkpoint = ModelCheckpoint(log_dir + 'ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5',
        monitor='val_loss', save_weights_only=True, save_best_only=True, period=10, verbose=0)



classes_path = 'model_data/new_classes.txt'


def get_classes(classes_path):
    '''loads the classes'''
    with open(classes_path) as f:
        class_names = f.readlines()
    class_names = [c.strip() for c in class_names]
    return class_names

class_names = get_classes(classes_path)  
num_classes = len(class_names)


from yolo3.model import yolo_head, yolo_body
from keras.layers import Input, Lambda
from keras.utils import CustomObjectScope


model = yolo_body(Input(shape=(None, None, 3)), 3, num_classes)
model.load_weights('logs/000/trained_weights_stage_1.h5', by_name=True) 


# Unfreeze and continue training, to fine-tune.
# Train longer if the result is not good.
if True:
    for i in range(len(model.layers)):
        model.layers[i].trainable = True
    model.compile(optimizer=Adam(lr=1e-4), loss={'yolo_loss': lambda y_true, y_pred: y_pred}) # recompile to apply the change
    print('Unfreeze all of the layers.')

    batch_size = 32 # note that more GPU memory is required after unfreezing the body
    print('Train on {} samples, val on {} samples, with batch size {}.'.format(num_train, num_val, batch_size))
    #import pdb; pdb.set_trace()
    history = model.fit_generator(data_generator_wrapper(lines[:num_train], batch_size, input_shape, anchors, num_classes),
                  steps_per_epoch=max(1, num_train//batch_size),
                  validation_data=data_generator_wrapper(lines[num_train:], batch_size, input_shape, anchors, num_classes),
                  validation_steps=max(1, num_val//batch_size),
                  epochs=100,
                  initial_epoch=50,
                  callbacks=[logging, reduce_lr, early_stopping]) #removed checkpointing.
    model.save_weights(log_dir + 'trained_weights_final.h5')

  # Further training if needed.`

Я получаю следующее ValueError при запуске кода выше:

ValueError                                Traceback (most recent call last)
<ipython-input-30-d6473c9875b2> in <module>()
     61     for i in range(len(model.layers)):
     62         model.layers[i].trainable = True
---> 63     model.compile(optimizer=Adam(lr=1e-4), loss={'yolo_loss': lambda y_true, y_pred: y_pred}) # recompile to apply the change
     64     print('Unfreeze all of the layers.')
     65 

/usr/local/lib/python3.6/dist-packages/keras/engine/training.py in compile(self, optimizer, loss, metrics, loss_weights, sample_weight_mode, weighted_metrics, target_tensors, **kwargs)
    117                                      'dictionary: "' + name + '". '
    118                                      'Only expected the following keys: ' +
--> 119                                      str(self.output_names))
    120             loss_functions = []
    121             for name in self.output_names:

ValueError: Unknown entry in loss dictionary: "yolo_loss". Only expected the following keys: ['conv2d_209', 'conv2d_217', 'conv2d_225']

В недоумении, как избавиться от этой ошибки и обучитьполная модель с сохраненными весами из первой части.Кроме того, я попытался загрузить полную модель из model.save('trained_model_stage_1.h5') и использовать model.load_model('trained_model_stage_1.h5', custom_objects={'yolo_loss' : yolo_loss}), но это также не работает для меня, и я получаю ошибку yolo_head is not defined.

ValueError: Неизвестная запись в словаре потерь: "yolo_loss". Ожидаются только следующие ключи: ['conv2d_209', 'conv2d_217', 'conv2d_225'] для Yolo

Пожалуйста, войдите или зарегистрируйтесь чтобы ответить на этот вопрос.

Ответы [ 0 ]

ValueError: Неизвестная запись в словаре потерь: "yolo_loss". Ожидаются только следующие ключи: ['conv2d_209', 'conv2d_217', 'conv2d_225'] для Yolo

Пожалуйста, войдите или зарегистрируйтесь чтобы ответить на этот вопрос.

Ответы [ 0 ]

Похожие темы