PyTorch, преобразующий код LSTM для приема пакетов, не дает возможности прогнозировать - PullRequest
1 голос
/ 10 июня 2019

У меня проблема с тем, что если я буду подавать данные построчно, то моя модель двоичной классификации LSTM получит AUC 0,9+ в течение нескольких эпох. Когда я изменил код, чтобы он принимал пакеты, AUC застрял на 0,5, несмотря на уменьшение потерь.

Я создал игрушечный пример модели LSTM, чтобы попытаться проверить эту проблему. Я подозреваю, что моя архитектура модели передает неверную информацию, потому что в какой-то момент она предсказывает все как «положительное», но я не знаю, где. Код ниже:

Импорт и вспомогательные функции:

import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve, auc, roc_auc_score

import multiprocessing

import pickle
import ast 
import s3fs
import json

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.nn.utils.rnn as rnn_utils

import random
from random import randint
import os
import math
import time

# returns the softmax output into a readable prediction
def get_max_prob_result(input, ix_to_word):
    return ix_to_word[get_index_of_max(input)] 

# calculates the roc_auc score for the predictions
def get_auroc(truth, pred):
    assert len(truth) == len(pred)
    auc_score = roc_auc_score(np.array(truth),np.array(pred))
    return auc_score

def reorder_list(list, new_index_list):
    new_list = []
    for index in new_index_list:
        new_list.append(list[index])
    return new_list

def grab_batch(batch_size):
    seq=[]
    freq=[]
    target=[]
    time = []
    for k in range(batch_size):
        tseq, tfreq, ttime, ttarget = generate_patient()
        seq.append(tseq)
        freq.append(tfreq)
        time.append(ttime)
        target.append(ttarget)

    return seq, freq, time, target

Функция для случайного генерирования данных со структурой, которая соответствует моему реальному варианту использования (обратите внимание, что я создал правило, чтобы определить, когда цель положительна, что модель будет изучать):

events_to_ix = {'<PAD>':0,'non':1,'othernon':2,'neutral':3,'trigger':4}

final_seq = []
final_freq = []
final_time = []
final_target = []

dict_keys = list(events_to_ix.keys())[1:]

def generate_patient():
    num_seq = randint(1,100)
    patient_seq = []
    patient_freq = []
    patient_time = []
    patient_target = 0


    final_seq = []
    final_freq = []
    final_time = []

    for i in range(num_seq):
        step_seq = []
        step_freq = []
        step_time = []
        seq_length = randint(1,10)
        for k in range(seq_length):
            event = random.choice(dict_keys)
            if events_to_ix[event] in step_seq:
                continue
            step_seq.append(events_to_ix[event])
            step_freq.append(randint(1, 17))
            step_time.append(randint(0,(5+seq_length-k)))

        patient_seq.append(step_seq)
        patient_freq.append(step_freq)
        patient_time.append(step_time)

    for index, item in enumerate(patient_seq[-1]): 
        if item == 4 and patient_freq[-1][index] > 15 and patient_time[-1][index] < 3:
            patient_target = 1
            break

    # Loop through each concet in each timestep
    for step_idx, step in enumerate(patient_seq):
        concepts = [0]*len(events_to_ix)
        frequencies = [0]*len(events_to_ix)
        times = [0]*len(events_to_ix)
        for event_idx, event in enumerate(step):

            # Convert the textual concepts into their index representation
            concepts[event] = event

            # Append the frequencies and time encodings to their appropriate position in the list of 0's
            frequencies[event] = patient_freq[step_idx][event_idx]
            times[event] = patient_time[step_idx][event_idx]

        final_seq.append(concepts)
        final_freq.append(frequencies)
        final_time.append(times)
        #final_static.append(patient_static)

    final_seq = torch.LongTensor(final_seq)
    final_freq = torch.FloatTensor(final_freq).view(-1,len(events_to_ix),1)
    final_time = torch.FloatTensor(final_time).view(-1,len(events_to_ix),1)
    #final_static = torch.FloatTensor(final_static)

    return final_seq, final_freq, final_time, patient_target

Модель LSTM:

# Class containing the LSTM model initialization and feed-forward logic
class LSTMClassifier(nn.Module):
    # LSTM initialization
    def __init__(self, embedding_dim, hidden_dim, vocab_size, label_size):
        super(LSTMClassifier, self).__init__()

        # Setting the hidden layer dimension of the LSTM
        self.hidden_dim = hidden_dim
        # Initializing the embedding layer
        self.embeddings = nn.Embedding(vocab_size, embedding_dim-2)
        # Initializing the LSTM layer with one hidden layer 
        self.lstm = nn.LSTM(((embedding_dim*vocab_size)), hidden_dim, num_layers = 1, batch_first=False)
        # Initializing linear linear that takes the hidden layer output
        self.hidden2label = nn.Linear(hidden_dim, label_size)


    # Defining the hidden state of the LSTM
    def init_hidden(self,batch_size):
        # the first is the hidden h
        # the second is the cell  c
        return [autograd.Variable(torch.zeros(1,batch_size, self.hidden_dim).cuda()),
                autograd.Variable(torch.zeros(1,batch_size, self.hidden_dim).cuda())]

    # Defining the feed forward logic of the LSTM. It contains:
    # 1. The embedding layer
    # 2. The LSTM layer with one hidden layer
    # 3. The softmax layer
    def forward(self, seq, freq, time_data, seq_lengths):

        # Grab the mini-batch length and max sequence length (pre-ordered)
        # (need to do this in the forward logic because of data parallelism and how the GPU's will split up the batch)
        sequence_length = seq.size()[1]
        batch_length = seq.size()[0]

        # reset the LSTM hidden state. 
        # Must be done before you run a new batch. Otherwise the LSTM will treat a new batch as a continuation of a sequence
        self.hidden = self.init_hidden(batch_length)

        # This is the pass to the embedding layer. 
        # The sequence is of dimension N and the output is N x Demb
        embeds = self.embeddings(seq)

        # Concatenate the embedding output with the time and frequency vectors
        embeds = torch.cat((embeds,freq), dim=3)
        embeds = torch.cat((embeds,time_data), dim=3)

        # Because the LSTM excepts a dimension of (sequence length, batch size, feature size), and we have (batch size, seq length, feature size),
        # we need to switch the first and second dimension so that we get the correct input format
        embeds = torch.transpose(embeds, 0, 1)

        # Flatten the embedding dimension so that the input to the LSTM remains 3D rather than 4D
        x = embeds.view(sequence_length, batch_length, -1) 

        # pack the padded sequence so that paddings are ignored
        packed_x = torch.nn.utils.rnn.pack_padded_sequence(x, seq_lengths, batch_first=False)

        # Feed to the LSTM layer
        lstm_out, self.hidden = self.lstm(packed_x, self.hidden)

        # Feed the last layer of the LSTM into the linear layer
        y = self.hidden2label(self.hidden[0][-1])

        # Produce the softmax probabilities
        log_probs = F.log_softmax(y)

        return log_probs

Функция для запуска одной эпохи:

def train_epoch(model, loss_function, optimizer,batch_size, i):
    # Set model to training mode and initialize variables
    model.train()
    avg_loss = 0.0
    count = -1
    truth_res = []
    pred_res = []

    # Group the dataframe into dataframe chunks of length batch size and loop through each batch
    for j in range(1000):
        count += 1

        seq, freq, time_data, target = grab_batch(batch_size)

        # Sort the batches by descending size
        final_seq_ordered = sorted(enumerate(seq), key=lambda x: len(x[-1]), reverse=True) 

        # Grab the original indices from final_seq and see how they are now ordered
        final_seq_indices = [item[0] for item in final_seq_ordered]

        # Grab the actual values from the tupled master list of concepts
        seq = [item[1] for item in final_seq_ordered]
        freq = reorder_list(freq, final_seq_indices)
        time_data = reorder_list(time_data, final_seq_indices)
        target = reorder_list(target, final_seq_indices)

        # Grab the list of lengths of sequences, for the purpose of packing the padded sequenes
        seq_lengths = torch.LongTensor(list(map(len, seq)))

        # Grab the targets into a list and append it into the truth_res list in order to measure AUC performance
        truth_res.extend(target)

        # Pad the sequences
        seq = rnn_utils.pad_sequence(seq, batch_first = True)
        freq = rnn_utils.pad_sequence(freq, batch_first = True)
        time_data = rnn_utils.pad_sequence(time_data, batch_first = True)

        # Put the padded sequences into Variable and Cuda cores
        seq = autograd.Variable(seq.cuda())
        freq = autograd.Variable(freq.cuda())
        time_data = autograd.Variable(time_data.cuda())
        target = autograd.Variable(torch.LongTensor(target).cuda())

        # Feed the tensor Variables into the model
        pred = model(seq,freq,time_data,seq_lengths)
        # Append the predictions into a list for future AUC evaluation
        pred_label = pred.detach().max(1)[1].cpu().numpy()
        pred_res.extend(pred_label)

        # Reset the model gradient
        model.zero_grad()
        # Compute the loss
        loss = loss_function(pred, target)
        # Backpropagate
        loss.backward()
        # Update weights
        optimizer.step()

        # Computes the average loss
        avg_loss += loss.detach().item()

    # Computes the AUC score
    auc_score = get_auroc(truth_res, pred_res)     
    avg_loss /= (1000/batch_size)
    print('epoch: %d done! \n train avg_loss:%g , auc:%g' % (i, avg_loss, auc_score))

Основной цикл обучения:

#############################
### Set hyper parameters ###
############################
EMBEDDING_DIM = 32
HIDDEN_DIM = 50
EPOCH = 10
BATCH_SIZE = 16
best_val_auc = 0.5

model = LSTMClassifier(embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM, vocab_size=(len(events_to_ix)), label_size=2)
model = torch.nn.DataParallel(model.cuda())

weights = [(26/1000), 1]
class_weights = torch.FloatTensor(weights).cuda()
loss_function = nn.NLLLoss(weight=class_weights,reduction="sum",ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=0.001)

no_up = 0

#####################################################
### Set loop to determine number of EPOCHs to run ###
#####################################################
for i in range(EPOCH):
    #############################################
    ### Run the training on the training data ###
    #############################################
    print('epoch: %d start!' % i)
    start = time.time()

    # Perform the training on the epoch
    train_epoch(model,loss_function, optimizer,BATCH_SIZE, i)

    print("1 epoch length of time")
    print(time.time() - start)
...