Pytorch model (transfer learning) is not learning

Asked 16 June 2019

I am following the transfer learning tutorial in Pytorch: https://pytorch.org/tutorials/beginner/finetuning_torchvision_models_tutorial.html

I took most of the code from the transfer learning tutorial and made some changes so that val_acc is printed more often and the code fits my dataset.

Although the original tutorial code works (the Densenet model trains), my modified version no longer learns, and I cannot figure out why. I have played with learning rates, so far with no change.
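In minimal form, the change I am trying to make is the usual "validate every N batches" pattern. The evaluate helper below is only my sketch of the idea (the name is mine, it is not from the tutorial):

import torch

# Sketch of the intended change: periodically switch to eval mode, measure
# loss/accuracy on the validation loader, then resume training.
def evaluate(model, loader, criterion, device):
    model.eval()
    loss_sum, correct, seen = 0.0, 0, 0
    with torch.no_grad():  # no gradients needed for validation
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss_sum += criterion(outputs, labels).item() * inputs.size(0)
            correct += (outputs.argmax(1) == labels).sum().item()
            seen += inputs.size(0)
    model.train()  # back to training mode
    return loss_sum / seen, correct / seen

My full version: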

from __future__ import print_function, division

import datetime
import time
import os
import copy

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt




def prepare_dataset_from_folder(data_dir, size, batch_size):
    """Prepare the image datasets and wrap them in dataloaders.

    Args:
        data_dir: root directory holding the 'training' and 'validation' folders
        size: side length that images are resized to
        batch_size: batch size for both dataloaders
    """

    data_transforms = {
        'training': transforms.Compose([
            transforms.Resize((size, size)),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
        'validation': transforms.Compose([
            transforms.Resize((size, size)),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
    }


    image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),
                                              data_transforms[x])
                      for x in ['training', 'validation']}


    dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=batch_size,
                                                  shuffle=True, num_workers=1)  # higher num_workers caused errors for me
                   for x in ['training', 'validation']}
    dataset_sizes = {x: len(image_datasets[x]) for x in ['training', 'validation']}
    class_names = image_datasets['training'].classes

    return dataloaders, dataset_sizes, class_names





# note: best_acc is compared against val_acc below, which is a fraction in [0, 1]
def train_model(model, criterion, optimizer, scheduler, num_epochs=3, best_acc=0.80, batch_size=5):
    since = time.time()
    dataloaders, dataset_sizes, class_names = prepare_dataset_from_folder(data_dir, size, batch_size)


    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        running_loss = 0.0
        running_corrects = 0

        for num, data in enumerate(dataloaders["training"]):

            if num % 100 == 0:
                val_running_loss = 0.0
                val_running_corrects = 0
                model.eval()

                for val_num, val_data in enumerate(dataloaders["validation"]):
                    inputs, labels = val_data
                    inputs = inputs.to(device)
                    labels = labels.to(device)
                    with torch.set_grad_enabled(False):
                        outputs = model(inputs)
                        _, preds = torch.max(outputs, 1)
                        loss = criterion(outputs, labels)

                    val_running_loss += loss.item() * inputs.size(0)
                    val_running_corrects += torch.sum(preds == labels.data)

                print(dataset_sizes['validation'])
                val_loss = val_running_loss / dataset_sizes['validation']
                val_acc = val_running_corrects.double() / dataset_sizes['validation']
                print('val_loss  {:.4f} val_acc: {:.4f}'.format(val_loss, val_acc))
                if val_acc > best_acc:
                    print("Saving due to high val accuracy")
                    best_acc = val_acc  # raise the bar so the same accuracy is not saved twice
                    time_stamp_str = str(datetime.datetime.now()).split(' ')[0] + "_"
                    save_path = os.path.join(SAVE_DIR, f"{time_stamp_str}val_acc_{val_acc:.4f}.pt")
                    torch.save(model, save_path)

            scheduler.step()  # note: this steps the LR scheduler on every batch, not once per epoch
            model.train()
            inputs, labels = data
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()

            with torch.set_grad_enabled(True):
                outputs = model(inputs)
                _, preds = torch.max(outputs, 1)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

            running_loss += loss.item() * inputs.size(0)
            running_corrects += torch.sum(preds == labels.data)


            if num % 100 == 0:
                print("100 batches")
                # rough running training accuracy over the batches seen so far
                temp_running_corrects = (running_corrects.double() / (len(inputs) * num + 1)).cpu().numpy()
                print(temp_running_corrects)

        epoch_loss = running_loss / dataset_sizes['training']
        epoch_acc = running_corrects.double() / dataset_sizes['training']


        print('Training Loss: {:.4f} Acc: {:.4f}'.format(epoch_loss, epoch_acc))

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
    time_elapsed // 60, time_elapsed % 60))


    return model








if __name__ == "__main__":

    data_dir = r'TEST 10'
    SAVE_DIR = "."  # directory where checkpoints are saved
    device = torch.device("cuda:0")
    PATH = " "
    size = 224
    batch_size = 2



    # pick your model
    #model_ft = torch.load(PATH)
    #model_ft = models.resnet18(pretrained=True)
    model_ft = models.densenet201(pretrained=True)



    #num_ftrs = model_ft.fc.in_features #in features
    num_ftrs = 1920 #densenet 201
    #https://discuss.pytorch.org/t/what-does-the-fc-in-feature-mean/4889


    model_ft.fc = nn.Linear(num_ftrs, 2)

    model_ft = model_ft.to(device)

    criterion = nn.CrossEntropyLoss()

    # Observe that all parameters are being optimized
    optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.0001, momentum=0.9)

    # Decay LR by a factor of 0.1 every 7 epochs
    exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=2, gamma=0.1)


    model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler, num_epochs=3, batch_size=batch_size)

Here is the output:

Training Loss: 8.5898 Acc: 0.0102
Epoch 1/2
val_loss 8.5163 val_acc: 0.0191
val_loss 8.9667 val_acc: 0.0255
val_loss 9.3078 val_acc: 0.0127
val_loss 8.7855 val_acc: 0.0318
val_loss 8.6217 val_acc: 0.0255
val_loss 9.0631 val_acc: 0.0191
val_loss 8.5167 val_acc: 0.0255
val_loss 9.0499 val_acc: 0.0191
val_loss 9.0549 val_acc: 0.0255
val_loss 8.8373 val_acc: 0.0191
val_loss 8.9288 val_acc: 0.0191
val_loss 8.9968 val_acc: 0.0127
val_loss 9.2790 val_acc: 0.0127
val_loss 9.4389 val_acc: 0.0191
val_loss 8.6907 val_acc: 0.0318
val_loss 9.0903 val_acc: 0.0191
val_loss 9.0093 val_acc: 0.0191
val_loss 9.4387 val_acc: 0.0127
val_loss 9.1059 val_acc: 0.0191
val_loss 9.3480 val_acc: 0.0127
val_loss 8.9435 val_acc: 0.0191
val_loss 8.4412 val_acc: 0.0318
val_loss 8.8712 val_acc: 0.0382
val_loss 8.9125 val_acc: 0.0191
val_loss 9.3815 val_acc: 0.0127
val_loss 9.0214 val_acc: 0.0191
val_loss 9.4234 val_acc: 0.0127
val_loss 9.1625 val_acc: 0.0191

However, when I use the original tutorial structure, the model does seem to learn.
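The main structural difference I can see between the two loops is where scheduler.step() is called. The tutorial steps the scheduler once per epoch, while my version calls it inside the batch loop, so with StepLR(step_size=2, gamma=0.1) the learning rate would be cut 10x every two batches instead of every two epochs, which would drive it toward zero almost immediately. I am not sure if that alone explains the difference. A minimal illustration of the per-epoch pattern (toy model, just to show the LR schedule):

import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler

model = nn.Linear(10, 2)  # stand-in for the real network
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
scheduler = lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.1)

for epoch in range(6):
    # ... run every training batch for this epoch, calling optimizer.step() per batch ...
    scheduler.step()  # one scheduler step per epoch, as in the tutorial
    print(epoch, optimizer.param_groups[0]['lr'])  # lr drops 10x every 2 epochs

Here is the tutorial-based version that does learn: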

from __future__ import print_function, division
import pretrainedmodels as ptmodels

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy



def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['training', 'validation']:
            if phase == 'training':
                scheduler.step()  # step the LR scheduler once per epoch
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for num, data in enumerate(dataloaders[phase]):
                inputs, labels = data
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'training'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'training':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

                if num % 100 == 0:
                    print("500 batches")  # stale label: this actually prints every 100 batches
                    # rough running accuracy over the batches seen so far
                    temp_running_corrects = (running_corrects.double() / (len(inputs) * num + 1)).cpu().numpy()
                    print(temp_running_corrects)

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # deep copy the model
            if phase == 'validation' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    save_path = os.path.join(SAVE_DIR, f"val_acc_{best_acc:.4f}.pt")
    torch.save(model, save_path)
    return model


if __name__=="__main__":

    img_size = 224
    data_transforms = {
        'training': transforms.Compose([
            transforms.Resize((img_size, img_size)),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
        'validation': transforms.Compose([
            transforms.Resize((img_size, img_size)),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
    }
    data_dir = r'TEST 10'
    SAVE_DIR = "."  # directory where checkpoints are saved

    image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),
                                              data_transforms[x])
                      for x in ['training', 'validation']}
    dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=2,
                                                  shuffle=True, num_workers=4)
                   for x in ['training', 'validation']}
    dataset_sizes = {x: len(image_datasets[x]) for x in ['training', 'validation']}
    class_names = image_datasets['training'].classes

    device = torch.device("cuda:0")

    #model_ft = models.resnet18(pretrained=True)
    #model_ft = models.resnet152(pretrained = True)
    #model_ft = ptmodels.__dict__['polynet'](num_classes=1000, pretrained='imagenet')
    model_ft = models.densenet201(pretrained=True)
    #model_ft = models.resnext101_32x8d(pretrained=True).fc.in_features
    print(model_ft.features)

    #num_ftrs = model_ft.fc.in_features #in features
    num_ftrs = 1920 #densenet 201
    #num_ftrs = 2048
    #https://discuss.pytorch.org/t/what-does-the-fc-in-feature-mean/4889


    model_ft.fc = nn.Linear(num_ftrs, 2)
    #model_ft.last_linear = nn.Linear(num_ftrs, 2)

    model_ft = model_ft.to(device)

    criterion = nn.CrossEntropyLoss()

    # Observe that all parameters are being optimized
    optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)

    # Decay LR by a factor of 0.1 every 7 epochs
    exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=2, gamma=0.1)



    model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler,
                           num_epochs=3)

(The code above is adapted from https://pytorch.org/tutorials/beginner/finetuning_torchvision_models_tutorial.html.) Its output:

Training Loss:
Epoch 0/2

500 batches
0.0
500 batches
0.681592039800995
500 batches
0.6359102244389028
500 batches
0.6422628951747088
500 batches
0.6541822721598003
500 batches
0.6763236763236763
500 batches
0.694421315570358
500 batches
0.7059243397573162
500 batches
0.7108057464084947
500 batches
0.7096057745696835
500 batches
0.7126436781609196
500 batches
0.7142208087233075
500 batches
0.7251145356101625
500 batches
0.7293348712033834
500 batches
0.729382363441628
500 batches
0.7340886371209597
500 batches
0.7363323961262105

I have been stuck on this for a couple of days and cannot work it out. All I am trying to do is print the validation accuracy more often.
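One thing I have also started to wonder about: torchvision's DenseNet exposes its final layer as classifier, not fc, so I am not sure my model_ft.fc = nn.Linear(num_ftrs, 2) line replaces anything the forward pass actually uses. A quick check (my own snippet, not from the tutorial):

import torch.nn as nn
from torchvision import models

m = models.densenet201(pretrained=False)  # weights do not matter for this check
print(hasattr(m, 'fc'))                   # False: DenseNet has no 'fc' attribute
print(m.classifier)                       # Linear(in_features=1920, out_features=1000, bias=True)

m.fc = nn.Linear(1920, 2)                 # attaches a new submodule, but forward() never calls it
print(m.classifier.out_features)          # still 1000

m.classifier = nn.Linear(1920, 2)         # this is what would actually replace the head

If that is right, both of my versions are training the original 1000-way head on 2-class labels, though that alone would not explain why only the restructured loop fails to learn.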

Thank you.
