I am trying to implement the paper "SegBot: A Generic Neural Text Segmentation Model with Pointer Network", in which the authors perform topic segmentation and elementary discourse unit segmentation. I used the code below, but I get the same output at every decoder time step. I have an encoder followed by a decoder, and I then use attention over the encoder states to produce a distribution over the input positions.
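For context, this is the scoring step I understand from the paper: each encoder state is blended with the current decoder state, reduced to a scalar, and a softmax over input positions gives the pointer distribution. A standalone sketch of just that step (toy sizes and variable names are my own, not from the paper):

import torch
import torch.nn as nn
import torch.nn.functional as F

# Toy sizes (mine, not from the paper): L input positions, batch bs,
# encoder width 2*H (bidirectional), blend width W.
L, bs, H, W = 7, 2, 256, 300
enc_states = torch.randn(L, bs, 2 * H)  # one encoder state per input position
dec_hidden = torch.randn(bs, H)         # decoder state at a single time step

W1 = nn.Linear(2 * H, W, bias=False)
W2 = nn.Linear(H, W, bias=False)
vt = nn.Linear(W, 1, bias=False)

# score each position j: v^T tanh(W1 e_j + W2 d_i), then softmax over j
scores = vt(torch.tanh(W1(enc_states) + W2(dec_hidden))).squeeze(-1)  # (L, bs)
dist = F.softmax(scores.transpose(0, 1), dim=-1)                      # (bs, L)
print(dist.shape)  # (2, 7): a distribution over input positions per batch item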
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

def to_var(x):
    if torch.cuda.is_available():
        x = x.cuda()
    return Variable(x)
class PointerNetwork(nn.Module):
    def __init__(self, input_size, weight_size, train_y, answer_seq_len, hidden_size=256, is_GRU=True):
        super(PointerNetwork, self).__init__()
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.answer_seq_len = answer_seq_len
        self.weight_size = weight_size
        self.is_GRU = is_GRU
        self.train_y = train_y

        self.emb = nn.Embedding(input_size, emb_size)  # embed inputs; emb_size comes from the enclosing script
        if is_GRU:
            self.enc = nn.GRU(input_size, hidden_size, batch_first=True, bidirectional=True)
            self.dec = nn.GRUCell(emb_size, hidden_size)  # GRUCell's input is always batch first
        else:
            self.enc = nn.LSTM(emb_size, hidden_size, batch_first=True)
            self.dec = nn.LSTMCell(emb_size, hidden_size)  # LSTMCell's input is always batch first

        self.W1 = nn.Linear(hidden_size * 2, weight_size, bias=False)  # blend encoder states (2*H: bidirectional)
        self.W2 = nn.Linear(hidden_size, weight_size, bias=False)      # blend decoder state
        self.vt = nn.Linear(weight_size, 1, bias=False)                # scale the blended sum by v^T
    def forward(self, input_, decoder_input, test=False):
        batch_size = input_.size(0)
        encoder_states, hc = self.enc(input_)            # encoder_states: (bs, L, 2H)
        encoder_states = encoder_states.transpose(1, 0)  # (L, bs, 2H)
        hidden = to_var(hc[0])                           # (bs, H)
        cell_state = encoder_states[-1]                  # (bs, 2H)

        probs = []
        for i in range(self.answer_seq_len):  # range(M)
            if self.is_GRU:
                if test:
                    hidden = self.dec(decoder_input[i], hidden)  # (bs, H)
                else:
                    hidden, cell_state = self.dec(decoder_input, (hidden, cell_state))  # (bs, H), (bs, H)
            blend1 = self.W1(encoder_states)                 # (L, bs, W)
            blend2 = self.W2(hidden)                         # (bs, W)
            blend_sum = F.tanh(blend1 + blend2)              # (L, bs, W)
            out = self.vt(blend_sum).squeeze()               # (L, bs)
            out = F.log_softmax(out.transpose(0, 1).contiguous(), -1)  # (bs, L)
            _v, indices = torch.max(out, 0)
            probs.append(out)
        probs = torch.stack(probs, dim=1)  # (bs, M, L)
        return probs
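A quick shape check I run on random tensors (toy sizes chosen by me; test=True so the decoder consumes one input per step):

# Toy sanity check with random data (sizes are mine, not from the paper).
emb_size = 300  # the class reads this global in __init__
toy_model = PointerNetwork(input_size=300, weight_size=300, train_y=None, answer_seq_len=4)
if torch.cuda.is_available():
    toy_model.cuda()
x = to_var(torch.randn(5, 109, 300))     # (bs, L, input_size) floats for the encoder GRU
dec_in = to_var(torch.randn(4, 5, 300))  # (M, bs, emb_size): one decoder input per step
probs = toy_model(x, dec_in, test=True)
print(probs.shape)                        # I expect (5, 4, 109) = (bs, M, L)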
total_size = train_x.shape[0]
weight_size = 300
emb_size = 300
batch_size = 5
n_epochs = 10
answer_seq_len = 10
input_seq_len = 109
inp_size = input_seq_len

data_split = int(total_size * 0.9)
print(data_split)
train_X = train_x[:data_split]
train_Y = train_y[:data_split]
test_X = train_x[data_split:]
test_Y = train_y[data_split:]
train_decoder_X = train_decoder[:data_split]
train_decoder_Y = train_decoder[data_split:]
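For reference, these are the shapes I believe my arrays have at this point, written as assertions (my own assumptions, implied by the model code above rather than fixed by the paper):

# My shape assumptions (train_x / train_y / train_decoder are my own arrays):
assert train_X.shape[1:] == (input_seq_len, emb_size)           # (N, L, 300) precomputed vectors
assert train_Y.shape[1] == answer_seq_len                       # (N, M) pointer targets
assert train_decoder_X.shape[1:] == (answer_seq_len, emb_size)  # (N, M, 300) decoder inputs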
def train(model, X, Y, train_decoder_X, batch_size, n_epochs):
    model.train()
    optimizer = optim.Adam(model.parameters())
    N = X.size(0)
    L = X.size(1)
    for epoch in range(n_epochs + 1):
        for i in range(0, N - batch_size, batch_size):
            x = X[i:i + batch_size]  # (bs, L)
            y = Y[i:i + batch_size]  # (bs, M)
            decoder_input = train_decoder_X[i:i + batch_size]
            # (bs, M, emb) -> (M, bs, emb) so step i can be indexed first
            decoder_input = decoder_input.reshape((decoder_input.shape[1], decoder_input.shape[0], decoder_input.shape[2]))
            probs = model(x, decoder_input)  # (bs, M, L)
            outputs = probs.view(-1, L)      # (bs*M, L)
            y = y.view(-1)                   # (bs*M)
            loss = F.nll_loss(outputs, y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if epoch % 2 == 0:
            print('epoch: {}, Loss: {:.5f}'.format(epoch, loss.item()))
            test(model, X, Y, train_decoder_X)
def test(model, X, Y, decoder_input):
    # (N, M, emb) -> (M, N, emb) so step i can be indexed first
    decoder_input = decoder_input.reshape((decoder_input.shape[1], decoder_input.shape[0], decoder_input.shape[2]))
    probs = model(X, decoder_input, test=True)  # (bs, M, L)
    _v, indices = torch.max(probs, 2)           # (bs, M)
    print(probs[0][0])
    print(indices.shape)
    print(Y.shape)
    for ind, y in zip(indices, Y):
        print(ind.data, y.data)
    correct_count = sum([1 if torch.equal(ind.data, y.data) else 0 for ind, y in zip(indices, Y)])
    print('Acc: {:.2f}% ({}/{})'.format(correct_count / len(X) * 100, correct_count, len(X)))
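Because exact match over the whole pointer sequence is strict, I also look at per-position accuracy with this small helper (my own diagnostic, not part of the SegBot paper):

def per_position_acc(indices, Y):
    # Fraction of individual pointer predictions that match the target.
    # My own diagnostic helper, not from the paper.
    return (indices == Y).sum().item() / Y.numel()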
model = PointerNetwork(emb_size, train_y=train_decoder_X, weight_size=weight_size, answer_seq_len=answer_seq_len)
if torch.cuda.is_available():
    model.cuda()
train(model, train_X, train_Y, train_decoder_X, batch_size, n_epochs)

print('----Test result----')
test(model, test_X, test_Y, train_decoder_Y)
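As described above, the printed distributions (probs[0][0] and the predicted indices) come out identical at every decoder time step. What could be causing the decoder to produce the same distribution at each step?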