получение одинакового вывода для каждой временной метки в GRU Decoder - PullRequest
0 голосов
/ 09 января 2020

Я пытаюсь реализовать статью «SegBot A Generi c Модель сегментации нейронного текста с сетью указателей», в этой статье они выполняют сегментацию topi c и элементарную дискурсивную сегментацию. Я использовал следующий код, но я получаю один и тот же вывод для каждого временного шага декодера. У меня есть кодер, а затем декодер, после чего я использую внимание для распределения по входу.

def to_var(x):
    if torch.cuda.is_available():
        x = x.cuda()
    return Variable(x)
class PointerNetwork(nn.Module):
    def __init__(self, input_size,weight_size, train_y,answer_seq_len, hidden_size=256, is_GRU=True):

        super(PointerNetwork, self).__init__()

        self.hidden_size = hidden_size
        self.input_size = input_size
        self.answer_seq_len = answer_seq_len
        self.weight_size = weight_size
        self.is_GRU = is_GRU
        self.train_y=train_y

        self.emb = nn.Embedding(input_size, emb_size)  # embed inputs

        if is_GRU:
            self.enc = nn.GRU(input_size, hidden_size, batch_first=True,bidirectional=True)
            self.dec = nn.GRUCell(emb_size, hidden_size) # GRUCell's input is always batch first
        else:
            self.enc = nn.LSTM(emb_size, hidden_size, batch_first=True)

            self.dec = nn.LSTMCell(emb_size, hidden_size) # LSTMCell's input is always batch first

        self.W1 = nn.Linear(hidden_size*2, weight_size, bias=False) # blending encoder
        self.W2 = nn.Linear(hidden_size, weight_size, bias=False) # blending decoder
        self.vt = nn.Linear(weight_size, 1, bias=False) # scaling sum of enc and dec by v.T

    def forward(self, input_,decoder_input,test=False):

        batch_size = input_.size(0)
        encoder_states, hc = self.enc(input_) # encoder_state: (bs, L, H)

        encoder_states = encoder_states.transpose(1, 0) # (L, bs, H)

        hidden = to_var(hc[0])   # (bs, h)
        cell_state = encoder_states[-1]                                # (bs, h)


        probs = []

        for i in range(self.answer_seq_len): # range(M)
            if self.is_GRU:
                if test:
                    hidden = self.dec(decoder_input[i], hidden) # (bs, h), (bs, h)
            else:
                hidden, cell_state = self.dec(decoder_input,(hidden, cell_state))#(bs,h),(bs, h)

            blend1 = self.W1(encoder_states)          # (L, bs, W)
            blend2 = self.W2(hidden)                  # (bs, W)
            blend_sum = F.tanh(blend1 + blend2)    # (L, bs, W)
            out = self.vt(blend_sum).squeeze()        # (L, bs)
            out = F.log_softmax(out.transpose(0, 1).contiguous(), -1) # (bs, L)
            _v, indices = torch.max(out, 0)

            probs.append(out)
        probs = torch.stack(probs, dim=1)           # (bs, M, L)
        return probs 
total_size = train_x.shape[0]
weight_size = 300
emb_size = 300
batch_size = 5
n_epochs = 10
answer_seq_len=10
input_seq_len = 109
# input, targets = make_seq_data(total_size, input_seq_len)
inp_size = input_seq_len

# # Convert to torch tensors
# input = to_var(torch.LongTensor(input))     # (N, L)
# targets = to_var(torch.LongTensor(targets)) # (N, L)

data_split = (int)(total_size * 0.9)
print (data_split)
train_X = train_x[:data_split]
train_Y = train_y[:data_split]
test_X = train_x[data_split:]
test_Y = train_y[data_split:]
train_decoder_X = train_decoder[:data_split]
train_decoder_Y = train_decoder[data_split:]


# from pointer_network import PointerNetwork
def train(model, X, Y, train_decoder_X,batch_size, n_epochs):
    model.train()
    optimizer = optim.Adam(model.parameters())
    N = X.size(0)
    L = X.size(1)
    # M = Y.size(1)
    for epoch in range(n_epochs + 1):
        # for i in range(len(train_batches))
        for i in range(0, N-batch_size, batch_size):
            x = X[i:i+batch_size] # (bs, L)
            y = Y[i:i+batch_size] # (bs, M)

#             print ("shape of X",x.shape)
#             print ("shape of Y",y.shape)

            decoder_input=train_decoder_X[i:i+batch_size]
            decoder_input=decoder_input.reshape((decoder_input.shape[1],decoder_input.shape[0],decoder_input.shape[2]))
            probs = model(x,decoder_input) # (bs, M, L)
            outputs = probs.view(-1, L) # (bs*M, L)

            y = y.view(-1) # (bs*M)
            loss = F.nll_loss(outputs, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if epoch % 2 == 0:
            print('epoch: {}, Loss: {:.5f}'.format(epoch, loss.item()))

decoder_input=decoder_input.reshape((decoder_input.shape[1],decoder_input.shape[0],decoder_input.shape[2]))
#             print (decoder_input.shape)
#             print (X.shape)
#             print (Y.shape)

            test(model, X, Y,train_decoder_X)


def test(model, X, Y,decoder_input):

    decoder_input=decoder_input.reshape((decoder_input.shape[1],decoder_input.shape[0],decoder_input.shape[2]))
    probs = model(X,decoder_input,test=True) # (bs, M, L)
#     print (probs.shape)
    _v, indices = torch.max(probs, 2) # (bs, M)
    print (probs[0][0])
    print (indices.shape)
    print (Y.shape)
#     print (indices.shape)
#     print (probs.shape)
#     print ("predicted",indices[0][0])
#     print ("True",Y[0][0])
    for ind, y in zip(indices, Y):

        print (ind.data,y.data) 

    correct_count = sum([1 if torch.equal(ind.data, y.data) else 0 for ind, y in zip(indices, Y)])
    print('Acc: {:.2f}% ({}/{})'.format(correct_count/len(X)*100, correct_count, len(X)))

model = PointerNetwork(emb_size, train_y=train_decoder_X,weight_size=weight_size, answer_seq_len=answer_seq_len)
if torch.cuda.is_available():
    model.cuda()
train(model, train_X,train_Y, train_decoder_X, batch_size, n_epochs)
print('----Test result--- a')
# print (test_X.shape)
# print (test_Y.shape)
# print (train_decoder_Y.shape)
test(model, test_X, test_Y,train_decoder_Y)

...