I have a deep autoencoder network, and I notice that when I use nn.Sequential the generalization performance is better than when I don't use it (i.e., when I pass the input through the layers explicitly). Has anyone else noticed this behavior, or can anyone explain why this happens? Does PyTorch handle regularization differently inside a Sequential block?
Here is a code snippet; the full code is attached at the end for reference. I switch between using nn.Sequential and not using it with the use_sequential flag. I have found that when I do not use the sequential module, my test accuracy is always worse than when I do.
class Net(nn.Module):
    def __init__(self, hidden_dim, in_dim, use_sequential):
        super(Net, self).__init__()
        self.use_sequential = use_sequential
        self.in_dim = in_dim
        self.hidden_dim = hidden_dim
        self.sig = nn.Sigmoid()
        self.encoder = nn.Sequential(
            nn.Linear(in_dim, in_dim),
            nn.BatchNorm1d(in_dim),
            nn.Linear(in_dim, hidden_dim)
        )
        self.decoder = nn.Sequential(
            nn.Linear(hidden_dim, in_dim),
            nn.BatchNorm1d(in_dim),
            nn.Linear(in_dim, in_dim)
        )

    def encode(self, x):
        if self.use_sequential:
            x = self.encoder(x)
        else:
            x = self.lin1(x)
            x = self.batchnorm(x)
            x = self.lin2(x)
        return x

    def decode(self, x):
        if self.use_sequential:
            x = self.decoder(x)
        else:
            x = self.lin3(x)
            x = self.batchnorm(x)
            x = self.lin4(x)
        return x

    def forward(self, x):
        x = self.encode(x)
        x = self.decode(x)
        x = self.sig(x)  # Sigmoid for BCELoss
        return x
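One sanity check I would expect to pass (this snippet is not part of the training script; it reuses Net, init_weights, and the constants from the full code at the end): with identical initialization, both configurations should produce the same forward output on a single batch while in train() mode, which would match the nearly identical TRAIN losses in the output below.

# Sanity-check sketch: with identical weights, both paths should give the same
# forward output on one batch while both models are in train() mode.
torch.manual_seed(0)
batch = torch.randint(2, (BATCH_SIZE, IN_DIM)).float()

m_seq = Net(hidden_dim=HIDDEN_DIM, in_dim=IN_DIM, use_sequential=True)
m_non = Net(hidden_dim=HIDDEN_DIM, in_dim=IN_DIM, use_sequential=False)
m_seq.apply(init_weights)
m_non.apply(init_weights)

m_seq.train()
m_non.train()
with torch.no_grad():
    # should print True if the two paths really are equivalent in train mode
    print(torch.allclose(m_seq(batch), m_non(batch)))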
Here is the output of the script. As you can see, the non-sequential model's test loss/accuracy gets worse as the models train, even though both models' train loss/accuracy stays essentially the same (I initialize all the weights myself; see the full code at the end):
SEQUENTIAL TRAIN, Epoch 0: loss=0.7185, acc=0.51
NONSEQUENTIAL TRAIN, Epoch 0: loss=0.7185, acc=0.51
---> SEQUENTIAL TEST: Epoch 0: loss=0.7240, acc=0.50080
---> NONSEQUENTIAL TEST: Epoch 0: loss=0.7240, acc=0.50080
SEQUENTIAL TRAIN, Epoch 1: loss=0.7192, acc=0.49
NONSEQUENTIAL TRAIN, Epoch 1: loss=0.7192, acc=0.49
---> SEQUENTIAL TEST: Epoch 1: loss=0.7226, acc=0.49920
---> NONSEQUENTIAL TEST: Epoch 1: loss=0.7221, acc=0.49920
SEQUENTIAL TRAIN, Epoch 2: loss=0.7207, acc=0.50
NONSEQUENTIAL TRAIN, Epoch 2: loss=0.7208, acc=0.50
---> SEQUENTIAL TEST: Epoch 2: loss=0.7183, acc=0.56120
---> NONSEQUENTIAL TEST: Epoch 2: loss=0.7186, acc=0.49920
SEQUENTIAL TRAIN, Epoch 3: loss=0.7032, acc=0.54
NONSEQUENTIAL TRAIN, Epoch 3: loss=0.7033, acc=0.54
---> SEQUENTIAL TEST: Epoch 3: loss=0.7104, acc=0.56120
---> NONSEQUENTIAL TEST: Epoch 3: loss=0.7153, acc=0.49920
SEQUENTIAL TRAIN, Epoch 4: loss=0.7002, acc=0.56
NONSEQUENTIAL TRAIN, Epoch 4: loss=0.7002, acc=0.56
---> SEQUENTIAL TEST: Epoch 4: loss=0.7006, acc=0.56120
---> NONSEQUENTIAL TEST: Epoch 4: loss=0.7119, acc=0.49920
SEQUENTIAL TRAIN, Epoch 5: loss=0.6906, acc=0.55
NONSEQUENTIAL TRAIN, Epoch 5: loss=0.6907, acc=0.55
---> SEQUENTIAL TEST: Epoch 5: loss=0.6903, acc=0.56120
---> NONSEQUENTIAL TEST: Epoch 5: loss=0.7088, acc=0.49920
SEQUENTIAL TRAIN, Epoch 6: loss=0.6807, acc=0.54
NONSEQUENTIAL TRAIN, Epoch 6: loss=0.6811, acc=0.54
---> SEQUENTIAL TEST: Epoch 6: loss=0.6815, acc=0.56120
---> NONSEQUENTIAL TEST: Epoch 6: loss=0.7058, acc=0.49920
SEQUENTIAL TRAIN, Epoch 7: loss=0.6698, acc=0.52
NONSEQUENTIAL TRAIN, Epoch 7: loss=0.6702, acc=0.52
---> SEQUENTIAL TEST: Epoch 7: loss=0.6729, acc=0.56120
---> NONSEQUENTIAL TEST: Epoch 7: loss=0.7033, acc=0.49920
SEQUENTIAL TRAIN, Epoch 8: loss=0.6710, acc=0.67
NONSEQUENTIAL TRAIN, Epoch 8: loss=0.6722, acc=0.60
---> SEQUENTIAL TEST: Epoch 8: loss=0.6643, acc=0.56120
---> NONSEQUENTIAL TEST: Epoch 8: loss=0.7014, acc=0.49920
SEQUENTIAL TRAIN, Epoch 9: loss=0.6642, acc=0.71
NONSEQUENTIAL TRAIN, Epoch 9: loss=0.6659, acc=0.65
---> SEQUENTIAL TEST: Epoch 9: loss=0.6612, acc=0.68860
---> NONSEQUENTIAL TEST: Epoch 9: loss=0.6999, acc=0.49920
SEQUENTIAL TRAIN, Epoch 10: loss=0.6593, acc=0.68
NONSEQUENTIAL TRAIN, Epoch 10: loss=0.6613, acc=0.68
---> SEQUENTIAL TEST: Epoch 10: loss=0.6570, acc=0.68860
---> NONSEQUENTIAL TEST: Epoch 10: loss=0.6988, acc=0.49920
SEQUENTIAL TRAIN, Epoch 11: loss=0.6522, acc=0.68
NONSEQUENTIAL TRAIN, Epoch 11: loss=0.6541, acc=0.68
---> SEQUENTIAL TEST: Epoch 11: loss=0.6540, acc=0.68860
---> NONSEQUENTIAL TEST: Epoch 11: loss=0.6978, acc=0.49920
SEQUENTIAL TRAIN, Epoch 12: loss=0.6651, acc=0.67
NONSEQUENTIAL TRAIN, Epoch 12: loss=0.6679, acc=0.67
---> SEQUENTIAL TEST: Epoch 12: loss=0.6511, acc=0.68860
---> NONSEQUENTIAL TEST: Epoch 12: loss=0.6971, acc=0.49920
SEQUENTIAL TRAIN, Epoch 13: loss=0.6617, acc=0.67
NONSEQUENTIAL TRAIN, Epoch 13: loss=0.6640, acc=0.67
---> SEQUENTIAL TEST: Epoch 13: loss=0.6494, acc=0.68860
---> NONSEQUENTIAL TEST: Epoch 13: loss=0.6964, acc=0.49920
SEQUENTIAL TRAIN, Epoch 14: loss=0.6506, acc=0.67
NONSEQUENTIAL TRAIN, Epoch 14: loss=0.6527, acc=0.67
---> SEQUENTIAL TEST: Epoch 14: loss=0.6470, acc=0.68860
---> NONSEQUENTIAL TEST: Epoch 14: loss=0.6961, acc=0.49920
SEQUENTIAL TRAIN, Epoch 15: loss=0.6479, acc=0.69
NONSEQUENTIAL TRAIN, Epoch 15: loss=0.6500, acc=0.69
---> SEQUENTIAL TEST: Epoch 15: loss=0.6453, acc=0.68860
---> NONSEQUENTIAL TEST: Epoch 15: loss=0.6954, acc=0.49920
SEQUENTIAL TRAIN, Epoch 16: loss=0.6441, acc=0.70
NONSEQUENTIAL TRAIN, Epoch 16: loss=0.6461, acc=0.70
---> SEQUENTIAL TEST: Epoch 16: loss=0.6445, acc=0.68860
---> NONSEQUENTIAL TEST: Epoch 16: loss=0.6950, acc=0.49920
...
I found that the problem is caused by the BatchNorm1d layer, because when I remove it from the model the problem disappears. Does BatchNorm1d behave differently inside a sequential block? Or have I made a mistake that I'm missing? Thanks in advance for any help!
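In case it helps with debugging, here is a small inspection snippet (it assumes the trained model_seq and model_non from the full code below) that prints the running statistics of every BatchNorm1d module after training, to see whether the two configurations end up with different statistics:

# Inspection sketch (run after the training loop): print the running statistics
# that each BatchNorm1d module has accumulated during training.
for name, model in (("SEQUENTIAL", model_seq), ("NONSEQUENTIAL", model_non)):
    print(name)
    for mod_name, mod in model.named_modules():
        if isinstance(mod, nn.BatchNorm1d):
            print(' ', mod_name,
                  'running_mean=', mod.running_mean.tolist(),
                  'running_var=', mod.running_var.tolist())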
Here is the full code:
import torch
import torch.nn as nn
from torch.utils import data
from torch.optim import Adam
from tqdm import tqdm
class Dataset(data.Dataset):
    'Characterizes a dataset for PyTorch'
    def __init__(self, data, labels):
        self.labels = labels
        self.data = data

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.labels)

    def __getitem__(self, index):
        'Generates one sample of data'
        # Load data and get label
        X = self.data[index]
        y = self.labels[index]
        return X, y
class Net(nn.Module):
    def __init__(self, hidden_dim, in_dim, use_sequential):
        super(Net, self).__init__()
        self.use_sequential = use_sequential
        self.in_dim = in_dim
        self.hidden_dim = hidden_dim
        self.sig = nn.Sigmoid()
        self.encoder = nn.Sequential(
            nn.Linear(in_dim, in_dim),
            nn.BatchNorm1d(in_dim),
            nn.Linear(in_dim, hidden_dim)
        )
        self.decoder = nn.Sequential(
            nn.Linear(hidden_dim, in_dim),
            nn.BatchNorm1d(in_dim),
            nn.Linear(in_dim, in_dim)
        )
        self.lin1 = nn.Linear(in_dim, in_dim)
        self.lin1.weight.data.fill_(0.01)
        self.lin1.bias.data.fill_(0.01)
        self.batchnorm = nn.BatchNorm1d(in_dim)
        self.batchnorm.weight.data.fill_(0.01)
        self.batchnorm.bias.data.fill_(0.01)
        self.lin2 = nn.Linear(in_dim, hidden_dim)
        self.lin2.weight.data.fill_(0.01)
        self.lin2.bias.data.fill_(0.01)
        self.lin3 = nn.Linear(hidden_dim, in_dim)
        self.lin3.weight.data.fill_(0.01)
        self.lin3.bias.data.fill_(0.01)
        self.lin4 = nn.Linear(in_dim, in_dim)
        self.lin4.weight.data.fill_(0.01)
        self.lin4.bias.data.fill_(0.01)

    def encode(self, x):
        if self.use_sequential:
            x = self.encoder(x)
        else:
            x = self.lin1(x)
            x = self.batchnorm(x)
            x = self.lin2(x)
        return x

    def decode(self, x):
        if self.use_sequential:
            x = self.decoder(x)
        else:
            x = self.lin3(x)
            x = self.batchnorm(x)
            x = self.lin4(x)
        return x

    def forward(self, x):
        x = self.encode(x)
        x = self.decode(x)
        x = self.sig(x)  # Sigmoid for BCELoss
        return x
def accuracy(preds, labels):
    acc2 = 1 - torch.sum(torch.abs(preds - labels)).item() / (list(preds.size())[0] * list(preds.size())[1])
    return acc2

def generate_data(block_size):
    train_data = torch.randint(2, (10000, block_size)).float()
    test_data = torch.randint(2, (2500, block_size)).float()
    train_labels = train_data
    test_labels = test_data
    return train_data, train_labels, test_data, test_labels

def init_weights(m):
    if type(m) == nn.Linear or type(m) == nn.BatchNorm1d:
        m.weight.data.fill_(0.01)
        m.bias.data.fill_(0.01)
    if type(m) == nn.PReLU:
        m.weight.data.fill_(0.01)
########################## Train code ####################
IN_DIM = 4
HIDDEN_DIM = 32
EPOCHS = 200
BATCH_SIZE = 256

# Generate data
train_data, train_labels, test_data, test_labels = generate_data(IN_DIM)

# Data loading
params = {'batch_size': BATCH_SIZE,
          'shuffle': True,
          'num_workers': 8}
training_set = Dataset(train_data, train_labels)
training_loader = torch.utils.data.DataLoader(training_set, **params)

# Sequential and non-sequential models
model_seq = Net(hidden_dim=HIDDEN_DIM, in_dim=IN_DIM, use_sequential=True)
model_non = Net(hidden_dim=HIDDEN_DIM, in_dim=IN_DIM, use_sequential=False)
model_seq.apply(init_weights)
model_non.apply(init_weights)

loss_fn = nn.BCEWithLogitsLoss()
optimizer_seq = Adam(model_seq.parameters(), lr=0.001)
optimizer_non = Adam(model_non.parameters(), lr=0.001)
# Training
for epoch in range(EPOCHS):
    model_seq.train()
    model_non.train()
    for batch_idx, (batch, labels) in enumerate(training_loader):
        # Train the sequential model on this batch
        output_seq = model_seq(batch)
        loss_seq = loss_fn(output_seq, labels)
        optimizer_seq.zero_grad()
        loss_seq.backward()
        optimizer_seq.step()

        # Train the non-sequential model on this batch
        output_non = model_non(batch)
        loss_non = loss_fn(output_non, labels)
        optimizer_non.zero_grad()
        loss_non.backward()
        optimizer_non.step()

        if batch_idx % (BATCH_SIZE - 1) == 0:
            pred_seq = torch.round(output_seq)
            acc_seq = accuracy(pred_seq, labels)
            print('SEQUENTIAL TRAIN, Epoch %2d: loss=%.4f, acc=%.2f' % (epoch, loss_seq.item(), acc_seq))
            pred_non = torch.round(output_non)
            acc_non = accuracy(pred_non, labels)
            print('NONSEQUENTIAL TRAIN, Epoch %2d: loss=%.4f, acc=%.2f' % (epoch, loss_non.item(), acc_non))

    # Sequential validation
    model_seq.eval()
    val_output_seq = model_seq(test_data)
    val_loss_seq = loss_fn(val_output_seq, test_labels)
    val_pred_seq = torch.round(val_output_seq)
    val_acc_seq = accuracy(val_pred_seq, test_labels)
    print('---> SEQUENTIAL TEST: Epoch %2d: loss=%.4f, acc=%.5f' % (epoch, val_loss_seq.item(), val_acc_seq))
    model_seq.train()

    # Non-sequential validation
    model_non.eval()
    val_output_non = model_non(test_data)
    val_loss_non = loss_fn(val_output_non, test_labels)
    val_pred_non = torch.round(val_output_non)
    val_acc_non = accuracy(val_pred_non, test_labels)
    print('---> NONSEQUENTIAL TEST: Epoch %2d: loss=%.4f, acc=%.5f' % (epoch, val_loss_non.item(), val_acc_non))
    model_non.train()

    print('\n')
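After training, a comparison like the following (my addition, not run as part of the script above) might help narrow down where the two models diverge. Both variants register the same submodules in __init__, so their state_dicts have identical keys:

# Compare the two trained models entry by entry. Parameters that match but
# buffers (e.g. BatchNorm running statistics) that differ would only affect
# behavior in eval() mode.
sd_seq = model_seq.state_dict()
sd_non = model_non.state_dict()
for key in sd_seq:
    if not torch.allclose(sd_seq[key].float(), sd_non[key].float()):
        print('differs:', key)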