I'm trying to train an actor-critic model, but when I get to the backprop step for the critic I get this error: RuntimeError: invalid gradient at index 0 - expected type torch.cuda.FloatTensor but got torch.FloatTensor
I can't figure out which gradient the error refers to. Can anyone help?
Here is the stack trace:
Traceback (most recent call last):
  File "train.py", line 338, in <module>
    main()
  File "train.py", line 327, in main
    reinforce_trainer.train(opt.start_reinforce, opt.start_reinforce + opt.critic_pretrain_epochs - 1, True, start_time)
  File "/home/fbommfim/init-tests/treeLSTM/lib/train/reinforce_trainer.py", line 56, in train
    train_reward, critic_loss = self.train_epoch(epoch, pretrain_critic, no_update)
  File "/home/fbommfim/init-tests/treeLSTM/lib/train/reinforce_trainer.py", line 153, in train_epoch
    critic_loss = self.critic.backward(baselines.cuda(), rewards, critic_weights.cuda(), num_words, self.critic_loss_func, regression=True)
  File "/home/fbommfim/init-tests/treeLSTM/lib/model/encoder_decoder/hybrid2seq_model.py", line 67, in backward
    outputs.backward(grad_output)
  File "/home/linuxbrew/.linuxbrew/Cellar/python/3.7.6_1/lib/python3.7/site-packages/torch/tensor.py", line 195, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph)
  File "/home/linuxbrew/.linuxbrew/Cellar/python/3.7.6_1/lib/python3.7/site-packages/torch/autograd/__init__.py", line 99, in backward
    allow_unreachable=True)  # allow_unreachable flag
RuntimeError: invalid gradient at index 0 - expected type torch.cuda.FloatTensor but got torch.FloatTensor
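As far as I understand, the message is about a device mismatch between a tensor and the gradient passed to its backward(). A minimal standalone sketch like this (my own snippet, not from the project, and it assumes a CUDA device is available) should raise the same kind of error:

import torch

# Minimal sketch (assumes CUDA is available): calling backward() on a CUDA
# tensor while passing a CPU gradient argument should trigger the same kind
# of "expected type torch.cuda.FloatTensor" error (message may vary by version).
x = torch.randn(4, 3, device='cuda', requires_grad=True)
y = x * 2                    # graph lives on the GPU
grad = torch.ones(4, 3)      # plain torch.FloatTensor on the CPU
y.backward(grad)             # RuntimeError: invalid gradient at index 0 ...

But in my own code I can't tell which tensor ends up on the CPU.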
And here is the corresponding code, train_epoch from reinforce_trainer:
def train_epoch(self, epoch, pretrain_critic, no_update):
    self.actor.train()  # may also have self.critic.train()?
    total_reward, report_reward = 0, 0
    total_critic_loss, report_critic_loss = 0, 0
    total_sents, report_sents = 0, 0
    total_words, report_words = 0, 0
    last_time = time.time()
    batch_count = len(self.train_data)
    batch_order = torch.randperm(batch_count)

    with tqdm(total=batch_count) as prog:
        for i in range(batch_count):
            batch = self.train_data[i]  # batch_order[i]
            if self.opt.data_type == 'code':
                targets = batch[2]
                attention_mask = batch[1][2][0].data.eq(lib.Constants.PAD).t()
            elif self.opt.data_type == 'text':
                targets = batch[2]
                attention_mask = batch[0][0].data.eq(lib.Constants.PAD).t()
            elif self.opt.data_type == 'hybrid':
                targets = batch[2]
                attention_mask_code = batch[1][2][0].data.eq(lib.Constants.PAD).t()
                attention_mask_txt = batch[0][0].data.eq(lib.Constants.PAD).t()
            batch_size = targets.size(1)

            self.actor.zero_grad()
            self.critic.zero_grad()

            # Sample translations
            if self.opt.has_attn:
                if self.opt.data_type == 'code' or self.opt.data_type == 'text':
                    self.actor.decoder.attn.applyMask(attention_mask)
                elif self.opt.data_type == 'hybrid':
                    self.actor.decoder.attn.applyMask(attention_mask_code, attention_mask_txt)
            samples, outputs = self.actor.sample(batch, self.max_length)

            # Calculate rewards
            rewards, samples = self.sent_reward_func(samples.t().tolist(), targets.data.t().tolist())
            reward = sum(rewards)

            # Perturb rewards (if specified).
            if self.pert_func is not None:
                rewards = self.pert_func(rewards)

            samples = torch.LongTensor(samples).t().contiguous()
            rewards = torch.FloatTensor([rewards] * samples.size(0)).contiguous()
            if self.opt.cuda:
                samples = samples.cuda()
                rewards = rewards.cuda()

            # Update critic.
            critic_weights = samples.ne(lib.Constants.PAD).float()
            num_words = critic_weights.data.sum()
            if not no_update:
                if self.opt.data_type == 'code':
                    baselines = self.critic((batch[0], batch[1], samples, batch[3]), eval=False, regression=True)
                elif self.opt.data_type == 'text':
                    baselines = self.critic((batch[0], batch[1], samples, batch[3]), eval=False, regression=True)
                elif self.opt.data_type == 'hybrid':
                    baselines = self.critic((batch[0], batch[1], samples, batch[3]), eval=False, regression=True)
                critic_loss = self.critic.backward(baselines, rewards, critic_weights, num_words, self.critic_loss_func, regression=True)
                self.critic_optim.step()
            else:
                critic_loss = 0

            # Update actor
            if not pretrain_critic and not no_update:
                # Subtract baseline from reward
                norm_rewards = (rewards - baselines).data
                actor_weights = norm_rewards * critic_weights
                # TODO: can use PyTorch reinforce() here but that function is a black box.
                # This is an alternative way where you specify an objective that gives the same gradient
                # as the policy gradient's objective, which looks much like weighted log-likelihood.
                actor_loss = self.actor.backward(outputs, samples, actor_weights, 1, self.actor_loss_func)
                self.optim.step()
            else:
                actor_loss = 0

            # Gather stats
            total_reward += reward
            report_reward += reward
            total_sents += batch_size
            report_sents += batch_size
            total_critic_loss += critic_loss
            report_critic_loss += critic_loss
            total_words += num_words
            report_words += num_words
            self.opt.iteration += 1

            print("iteration: %s, loss: %s " % (self.opt.iteration, actor_loss))
            print("iteration: %s, reward: %s " % (self.opt.iteration, (report_reward / report_sents) * 100))

            if i % self.opt.log_interval == 0 and i > 0:
                print("""Epoch %3d, %6d/%d batches; actor reward: %.4f; critic loss: %f; %5.0f tokens/s; %s elapsed""" %
                      (epoch, i, batch_count, (report_reward / report_sents) * 100,
                       report_critic_loss / report_words,
                       report_words / (time.time() - last_time),
                       str(datetime.timedelta(seconds=int(time.time() - self.start_time)))))
                report_reward = report_sents = report_critic_loss = report_words = 0
                last_time = time.time()

            prog.update(1)

    return total_reward / total_sents, total_critic_loss / total_words
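To narrow down which tensor is on which device, I'm thinking of adding a check like this right before the self.critic.backward(...) call above (a debug sketch; it only uses names already defined in the code):

# Debug sketch: print the types of the tensors going into the critic's
# backward() right before the failing call in train_epoch.
for name, t in [('baselines', baselines), ('rewards', rewards),
                ('critic_weights', critic_weights)]:
    print(name, t.type())   # torch.cuda.FloatTensor vs. torch.FloatTensor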
And backward from hybrid2seq_model.py:
def backward(self, outputs, targets, weights, normalizer, criterion, regression=False):
    grad_output, loss = self.generator.backward(outputs, targets, weights, normalizer, criterion, regression)
    outputs.cuda()
    grad_output.cuda()
    outputs.backward(grad_output)
    return loss
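Inside backward itself I could also check both sides of the failing call before it runs, something along these lines (again just a debug sketch):

# Debug sketch for hybrid2seq_model.backward: compare the two tensors involved
# in the failing call. If grad_output is still a torch.FloatTensor while
# outputs is a torch.cuda.FloatTensor, this is where the mismatch shows up.
print('outputs:', outputs.type())
print('grad_output:', grad_output.type())
assert outputs.type() == grad_output.type(), "type/device mismatch between outputs and grad_output"

But I still don't understand which gradient the error message is actually complaining about. Any pointers are appreciated.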