Я работаю над созданием генератора изображений, используя условный GAN в качестве базовой модели. Я столкнулся с ошибкой, которую не понимаю, как отлаживать, даже после поиска решений в Интернете. Я не уверен, следует ли мне изменять настройки для тренировки или делать какие-то корректировки моей модели, или что-то еще. Приветствуется любая помощь в том, что делать.
Модель CGAN, которую я использую:
class Generator(nn.Module):
def __init__(self, classes, channels, img_size, latent_dim):
super(Generator, self).__init__()
self.classes = classes
self.channels = channels
self.img_size = img_size
self.latent_dim = latent_dim
self.img_shape = (self.channels, self.img_size, self.img_size)
self.label_embedding = nn.Embedding(self.classes, self.classes) # process label information, behave as a lookup table
self.model = nn.Sequential(
*self._create_layer_1(self.latent_dim + self.classes, 128, False),
*self._create_layer_2(128, 256),
*self._create_layer_2(256, 512),
*self._create_layer_2(512, 1024),
nn.Linear(1024, int(np.prod(self.img_shape))),
nn.Tanh()
)
def _create_layer_1(self, size_in, size_out, normalize=True):
layers = [nn.Linear(size_in, size_out)]
if normalize:
layers.append(nn.BatchNorm1d(size_out))
layers.append(nn.LeakyReLU(0.2, inplace=True))
return layers
def _create_layer_2(self, size_in, size_out, normalize=True):
layers = [nn.ConvTranspose2d(size_in, size_out, 4, 2, 1, bias=False)]
if normalize:
layers.append(nn.BatchNorm1d(size_out))
layers.append(nn.LeakyReLU(0.2, inplace=True))
return layers
def forward(self, noise, labels):
z = torch.cat((self.label_embedding(labels), noise), -1)
x = self.model(z)
x = x.view(x.size(0), *self.img_shape)
return x
class Discriminator(nn.Module):
def __init__(self, classes, channels, img_size, latent_dim):
super(Discriminator, self).__init__()
self.classes = classes
self.channels = channels
self.img_size = img_size
self.latent_dim = latent_dim
self.img_shape = (self.channels, self.img_size, self.img_size)
self.label_embedding = nn.Embedding(self.classes, self.classes)
self.adv_loss = torch.nn.BCELoss()
self.model = nn.Sequential(
*self._create_layer_1(self.classes + int(np.prod(self.img_shape)), 1024, False, True),
*self._create_layer_2(1024, 512, True, True),
*self._create_layer_2(512, 256, True, True),
*self._create_layer_2(256, 128, False, False),
*self._create_layer_1(128, 1, False, False),
nn.Sigmoid()
)
def _create_layer_1(self, size_in, size_out, drop_out=True, act_func=True):
layers = [nn.Linear(size_in, size_out)]
if drop_out:
layers.append(nn.Dropout(0.4))
if act_func:
layers.append(nn.LeakyReLU(0.2, inplace=True))
return layers
def _create_layer_2(self, size_in, size_out, drop_out=True, act_func=True):
layers = [nn.Conv2d(size_in, size_out, 4, 2, 1, bias=False)]
if drop_out:
layers.append(nn.Dropout(0.4))
if act_func:
layers.append(nn.LeakyReLU(0.2, inplace=True))
return layers
def forward(self, image, labels):
x = torch.cat((image.view(image.size(0), -1), self.label_embedding(labels)), -1)
return self.model(x)
def loss(self, output, label):
return self.adv_loss(output, label)
Код для инициализации модели:
class Model(object):
def __init__(self,
name,
device,
data_loader,
classes,
channels,
img_size,
latent_dim,
style_dim=3):
self.name = name
self.device = device
self.data_loader = data_loader
self.classes = classes
self.channels = channels
self.img_size = img_size
self.latent_dim = latent_dim
self.style_dim = style_dim
self.netG = cganG(self.classes, self.channels, self.img_size, self.latent_dim)
self.netG.to(self.device)
self.netD = cganD(self.classes, self.channels, self.img_size, self.latent_dim)
self.netD.to(self.device)
self.optim_G = None
self.optim_D = None
@property
def generator(self):
return self.netG
@property
def discriminator(self):
return self.netD
def create_optim(self, lr, alpha=0.5, beta=0.999):
self.optim_G = torch.optim.Adam(filter(lambda p: p.requires_grad,
self.netG.parameters()),
lr=lr,
betas=(alpha, beta))
self.optim_D = torch.optim.Adam(filter(lambda p: p.requires_grad,
self.netD.parameters()),
lr=lr,
betas=(alpha, beta))
def _to_onehot(self, var, dim):
res = torch.zeros((var.shape[0], dim), device=self.device)
res[range(var.shape[0]), var] = 1.
return res
def train(self,
epochs,
log_interval=100,
out_dir='',
verbose=True):
self.netG.train()
self.netD.train()
viz_z = torch.zeros((self.data_loader.batch_size, self.latent_dim), device=self.device)
viz_noise = torch.randn(self.data_loader.batch_size, self.latent_dim, device=self.device)
nrows = self.data_loader.batch_size // 8
viz_label = torch.LongTensor(np.array([num for _ in range(nrows) for num in range(8)])).to(self.device)
viz_onehot = self._to_onehot(viz_label, dim=self.classes)
viz_style = torch.zeros((self.data_loader.batch_size, self.style_dim), device=self.device)
total_time = time.time()
for epoch in range(epochs):
batch_time = time.time()
for batch_idx, (data, target) in enumerate(self.data_loader):
data, target = data.to(self.device), target.to(self.device)
batch_size = data.size(0)
real_label = torch.full((batch_size, 1), 1., device=self.device)
fake_label = torch.full((batch_size, 1), 0., device=self.device)
# Train G
self.netG.zero_grad()
z_noise = torch.randn(batch_size, self.latent_dim, device=self.device)
x_fake_labels = torch.randint(0, self.classes, (batch_size,), device=self.device)
x_fake = self.netG(z_noise, x_fake_labels)
y_fake_g = self.netD(x_fake, x_fake_labels)
g_loss = self.netD.loss(y_fake_g, real_label)
g_loss.backward()
self.optim_G.step()
# Train D
self.netD.zero_grad()
y_real = self.netD(data, target)
d_real_loss = self.netD.loss(y_real, real_label)
y_fake_d = self.netD(x_fake.detach(), x_fake_labels)
d_fake_loss = self.netD.loss(y_fake_d, fake_label)
d_loss = (d_real_loss + d_fake_loss) / 2
d_loss.backward()
self.optim_D.step()
if verbose and batch_idx % log_interval == 0 and batch_idx > 0:
print('Epoch {} [{}/{}] loss_D: {:.4f} loss_G: {:.4f} time: {:.2f}'.format(
epoch, batch_idx, len(self.data_loader),
d_loss.mean().item(),
g_loss.mean().item(),
time.time() - batch_time))
vutils.save_image(data, os.path.join(out_dir, 'real_samples.png'), normalize=True)
with torch.no_grad():
viz_sample = self.netG(viz_noise, viz_label)
vutils.save_image(viz_sample, os.path.join(out_dir, 'fake_samples_{}.png'.format(epoch)), nrow=8, normalize=True)
batch_time = time.time()
torch.save(self.netG.state_dict(), os.path.join(out_dir, 'netG_{}.pth'.format(epoch)))
torch.save(self.netD.state_dict(), os.path.join(out_dir, 'netD_{}.pth'.format(epoch)))
self.save_to(path=out_dir, name=self.name, verbose=False)
if verbose:
print('Total train time: {:.2f}'.format(time.time() - total_time))
Код для настройки до обучения:
def main():
device = torch.device("cuda:0" if FLAGS.cuda else "cpu")
if FLAGS.train:
dataloader = torch.utils.data.DataLoader(
dset.ImageFolder(FLAGS.data_dir, transforms.Compose([
transforms.Resize(FLAGS.img_size),
transforms.CenterCrop(FLAGS.img_size),
transforms.ToTensor()
])),
batch_size=FLAGS.batch_size,
shuffle=True,
num_workers=4,
pin_memory=True
)
model = Model(FLAGS.model, device, dataloader, FLAGS.classes, FLAGS.channels, FLAGS.img_size, FLAGS.latent_dim)
model.create_optim(FLAGS.lr)
# Train
print("Start training...\n")
model.train(FLAGS.epochs, FLAGS.log_interval, FLAGS.out_dir, True)
if __name__ == '__main__':
from utils import boolean_string
parser.add_argument('--cuda', type=boolean_string, default=True, help='enable CUDA.')
parser.add_argument('--train', type=boolean_string, default=True, help='train mode or eval mode.')
parser.add_argument('--data_dir', type=str, default='../datasets', help='Directory for dataset.')
parser.add_argument('--out_dir', type=str, default='output', help='Directory for output.')
parser.add_argument('--epochs', type=int, default=800, help='number of epochs')
parser.add_argument('--batch_size', type=int, default=32, help='size of batches')
parser.add_argument('--lr', type=float, default=0.0002, help='learning rate')
parser.add_argument('--latent_dim', type=int, default=62, help='latent space dimension')
parser.add_argument('--classes', type=int, default=25, help='number of classes')
parser.add_argument('--img_size', type=int, default=128, help='size of images')
parser.add_argument('--channels', type=int, default=3, help='number of image channels')
Настройки:
PyTorch version: 1.1.0
CUDA version: 9.0.176
Args | Type | Value
--------------------------------------------------
cuda | bool | True
train | bool | True
resume | bool | False
data_dir | str | ../datasets
out_dir | str | output
epochs | int | 800
batch_size | int | 32
lr | float | 0.0002
latent_dim | int | 62
classes | int | 25
img_size | int | 128
channels | int | 3
Размер изображения на входе:
torch.Size([32, 3, 128, 128])
Структура модели:
Generator(
(label_embedding): Embedding(25, 25)
(model): Sequential(
(0): Linear(in_features=87, out_features=128, bias=True)
(1): LeakyReLU(negative_slope=0.2, inplace)
(2): ConvTranspose2d(128, 256, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
(3): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(4): LeakyReLU(negative_slope=0.2, inplace)
(5): ConvTranspose2d(256, 512, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
(6): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(7): LeakyReLU(negative_slope=0.2, inplace)
(8): ConvTranspose2d(512, 1024, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
(9): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(10): LeakyReLU(negative_slope=0.2, inplace)
(11): Linear(in_features=1024, out_features=49152, bias=True)
(12): Tanh()
)
)
Discriminator(
(label_embedding): Embedding(25, 25)
(adv_loss): BCELoss()
(model): Sequential(
(0): Linear(in_features=49177, out_features=1024, bias=True)
(1): LeakyReLU(negative_slope=0.2, inplace)
(2): Conv2d(1024, 512, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
(3): Dropout(p=0.4)
(4): LeakyReLU(negative_slope=0.2, inplace)
(5): Conv2d(512, 256, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
(6): Dropout(p=0.4)
(7): LeakyReLU(negative_slope=0.2, inplace)
(8): Conv2d(256, 128, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
(9): Linear(in_features=128, out_features=1, bias=True)
(10): Sigmoid()
)
)
Я получил ошибку:
File "main.py", line 121, in <module>
main()
File "main.py", line 56, in main
model.train(FLAGS.epochs, FLAGS.log_interval, FLAGS.out_dir, True)
File "build_gan.py", line 123, in train
x_fake = self.netG(z_noise, x_fake_labels)
File "anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 493, in __call__
result = self.forward(*input, **kwargs)
File "cgan.py", line 42, in forward
x = self.model(z)
File "anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 493, in __call__
result = self.forward(*input, **kwargs)
File "anaconda3/lib/python3.6/site-packages/torch/nn/modules/container.py", line 92, in forward
input = module(input)
File "anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 493, in __call__
result = self.forward(*input, **kwargs)
File "anaconda3/lib/python3.6/site-packages/torch/nn/modules/conv.py", line 796, in forward
output_padding, self.groups, self.dilation)
RuntimeError: Expected 4-dimensional input for 4-dimensional weight 128 256, but got 2-dimensional input of size [32, 128] instead
Я использую свой собственный набор данных изображений с 3 каналами и 25 классами. Я пытался изменить размер изображения и размер ядра, но все равно получил ту же ошибку. Любая помощь в том, что мне делать для отладки, будет очень признательна.