TensorFlow GPU 2.0: GPU utilization is 0

I am trying to run neural machine translation with an attention model (see the tutorial link and the code below): https://www.tensorflow.org/tutorials/text/nmt_with_attention

import os
import re
import numpy as np
import tensorflow as tf
from collections import Counter
import warnings
import gzip
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

def preprocess_sentence(sentence):
    # Wrap each sentence with the start/end tokens expected by the model.
    sentence = "<start> " + sentence + " <end>"
    return sentence


def parse_data(file_name):
    # Read one sentence per line from a UTF-8 text file.
    with open(file_name, 'r', encoding='utf8') as f:
        sentence_data = f.read().split("\n")
        return sentence_data


def tokenizer(language):
    language_tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=None, filters="", split=" "
    )
    # Build the vocabulary from word frequencies.
    language_tokenizer.fit_on_texts(language)
    # Convert the texts to sequences of word ids.
    tensor = language_tokenizer.texts_to_sequences(language)
    # Pad each sentence with zeros at the end.
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding="post")
    return language_tokenizer, tensor


def max_length(tensor):
    return max(len(t) for t in tensor)



def make_dataset(input_tensor, output_tensor, batch_size, epochs, shuffle):
    # Build a tf.data pipeline of (input, target) pairs.
    dataset = tf.data.Dataset.from_tensor_slices((input_tensor, output_tensor))
    if shuffle:
        dataset = dataset.shuffle(len(input_tensor))
    dataset = dataset.repeat(epochs).batch(batch_size=batch_size, drop_remainder=True)
    return dataset


strategy = tf.distribute.MirroredStrategy()

with strategy.scope():
    class Encoder(tf.keras.layers.Layer):
        def __init__(self, vocab_size, embedding_units, encoding_units, batch_size):
            super(Encoder, self).__init__()
            self.batch_size = batch_size
            self.encoding_units = encoding_units
            self.embedding = tf.keras.layers.Embedding(
                vocab_size, embedding_units)

            self.gru = tf.keras.layers.GRU(self.encoding_units, return_sequences=True, return_state=True,
                                           recurrent_initializer='glorot_uniform')


        def call(self, x, hidden):  
            x = self.embedding(x)
            output, state = self.gru(x, initial_state=hidden)
            return output, state

        def initialize_hidden_state(self):
            return tf.zeros((self.batch_size, self.encoding_units))


    class Attention(tf.keras.layers.Layer):
        def __init__(self, units):
            super(Attention, self).__init__()
            self.w1 = tf.keras.layers.Dense(units)
            self.w2 = tf.keras.layers.Dense(units)
            self.v = tf.keras.layers.Dense(1)

        def call(self, decoder_hidden, encoder_outputs):
            # decoder_hidden.shape:(batch_size, units)
            # encoder_outputs.shape:(batch_size, length, units)
            # decoder_hidden_with_time_axis.shape:(batch_size, 1, units)
            decoder_hidden_with_time_axis = tf.keras.backend.expand_dims(decoder_hidden, 1)
            # v_before:(batch_size, length, units)
            # v_after:(batch_size, length, 1)
            score = self.v(tf.keras.activations.tanh(self.w1(encoder_outputs) + self.w2(decoder_hidden_with_time_axis)))

            attention_weights = tf.keras.activations.softmax(score, axis=1)

            context_vector = attention_weights * encoder_outputs

            context_vector = tf.reduce_sum(context_vector, axis=1)
            return context_vector, attention_weights  # attention_weights are returned to make visualization easier


    class Decoder(tf.keras.layers.Layer):
        def __init__(self, vocab_size, embedding_units, decoding_units, batch_size):
            super(Decoder, self).__init__()
            self.batch_size = batch_size
            self.decoding_units = decoding_units
            self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_units)
            self.gru = tf.keras.layers.GRU(self.decoding_units, return_sequences=True, return_state=True,
                                           recurrent_initializer='glorot_uniform')
            self.fc = tf.keras.layers.Dense(vocab_size)  
            self.attention = Attention(self.decoding_units)

        def call(self, x, hidden, encoding_outputs):

            context_vector, attention_weights = self.attention(hidden, encoding_outputs)

            x = self.embedding(x)

            combined_x = tf.concat([tf.keras.backend.expand_dims(context_vector, 1), x], axis=-1)

            output, state = self.gru(combined_x)
            # output.shape after the GRU: (batch_size, 1, decoding_units)
            output = tf.reshape(output, (-1, output.shape[2]))
            # output.shape after the dense layer: (batch_size, vocab_size)
            output = self.fc(output)
            return output, state, attention_weights


    class Seq2Seq(tf.keras.Model):
        def __init__(self, input_vocab_size, output_vocab_size, embedding_units, units, batch_size):
            super(Seq2Seq, self).__init__()
            self.encoder = Encoder(input_vocab_size, embedding_units, units, batch_size)
            self.decoder = Decoder(output_vocab_size, embedding_units, units, batch_size)

        def call(self, inp, targ, encoding_hidden):
            encoding_outputs, encoding_hidden = self.encoder(inp, encoding_hidden)
            decoding_hidden = encoding_hidden
            predictions = []
            for t in range(0, targ.shape[1] - 1):
                decoding_input = tf.expand_dims(targ[:, t], 1)
                prediction, decoding_hidden, _ = self.decoder(decoding_input,
                                                              decoding_hidden,
                                                              encoding_outputs)
                predictions.append(prediction)
            return predictions


with strategy.scope():
    embedding_units = 256  
    units = 1024  

    max_input = -1
    max_output = -1

    input_tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=None, filters="", split=" "
    )
    output_tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=None, filters="", split=" "
    )

    for i in range(0, 33):
        en_file = "training-giga-fren/data/en_" + str(i) + ".txt"
        fr_file = "training-giga-fren/data/fr_" + str(i) + ".txt"
        en_data = parse_data(en_file)
        fr_data = parse_data(fr_file)

        input_tokenizer.fit_on_texts(en_data)
        output_tokenizer.fit_on_texts(fr_data)

        input_tensor = input_tokenizer.texts_to_sequences(en_data)
        input_tensor = tf.keras.preprocessing.sequence.pad_sequences(input_tensor, padding="post")
        output_tensor = output_tokenizer.texts_to_sequences(fr_data)
        output_tensor = tf.keras.preprocessing.sequence.pad_sequences(output_tensor, padding="post")

        tmp_max_input = max_length(input_tensor)
        tmp_max_output = max_length(output_tensor)
        if tmp_max_input > max_input:
            max_input = tmp_max_input
        if tmp_max_output > max_output:
            max_output = tmp_max_output

        print("Count {} max_input {} max_output {} input_vocab_size {} output_vocab_size {}"
              .format(i, max_input, max_output,
                      len(input_tokenizer.word_index) + 1,
                      len(output_tokenizer.word_index) + 1))

    batch_size_per_replica = 2
    batch_size = batch_size_per_replica * len(gpus)
    input_vocab_size = len(input_tokenizer.word_index) + 1
    output_vocab_size = len(output_tokenizer.word_index) + 1
    print("max_input {} max_output {}".format(max_input, max_output))
    print("input_vocab_size {} output_vocab_size {}".format(input_vocab_size, output_vocab_size))

    seq2seq = Seq2Seq(input_vocab_size, output_vocab_size, embedding_units, units, batch_size_per_replica)

with strategy.scope():
    def loss_function(real, pred):

        mask = tf.math.logical_not(tf.math.equal(real, 0))
        loss = loss_object(real, pred)

        mask = tf.cast(mask, dtype=loss.dtype)

        loss *= mask
        return tf.nn.compute_average_loss(loss, global_batch_size=batch_size)



    def train_step(inp, targ, encoding_hidden):
        loss = 0
        with tf.GradientTape() as tape:
            predictions = seq2seq(inp, targ, encoding_hidden)
            for t in range(0, targ.shape[1] - 1):
                prediction = predictions[t]
                loss += loss_function(targ[:, t + 1], prediction)
        batch_loss = loss / int(targ.shape[0])
        variables = seq2seq.encoder.trainable_variables + seq2seq.decoder.trainable_variables
        gradients = tape.gradient(loss, variables)
        optimizer.apply_gradients((zip(gradients, variables)))
        return batch_loss


    def test_step(inp, targ, encoding_hidden):
        loss = 0
        predictions = seq2seq(inp, targ, encoding_hidden)
        for t in range(0, targ.shape[1] - 1):
            prediction = predictions[t]
            loss += loss_function(targ[:, t + 1], prediction)
        test_loss_object.update_state(loss)

    @tf.function
    def distribute_train_step(inp, targ, encoding_hidden):
        batch_loss = strategy.experimental_run_v2(train_step, args=(inp, targ, encoding_hidden,))

        return strategy.reduce(tf.distribute.ReduceOp.SUM, batch_loss, axis=None)


    @tf.function
    def distribute_test_step(inp, targ, encoding_hidden):
        return strategy.experimental_run_v2(test_step, args=(inp, targ, encoding_hidden,))

with strategy.scope():
    optimizer = tf.keras.optimizers.Adam()  
    # from_logits: Boolean, whether `output` is the result of a softmax, or is a tensor of logits.

    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none")

    test_loss_object = tf.keras.metrics.Mean(name="test_loss")

    checkpoint_dir = './training_checkpoints'
    checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
    checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                     seq2seq=seq2seq)

    # Restore the latest checkpoint if one exists; latest_checkpoint returns None otherwise.
    latest_ckpt = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_ckpt:
        checkpoint.restore(latest_ckpt)
    else:
        print("no latest model")

with strategy.scope():
    for i in range(0, 33):

        en_file = "training-giga-fren/data/en_" + str(i) + ".txt"
        fr_file = "training-giga-fren/data/fr_" + str(i) + ".txt"
        en_data = parse_data(en_file)
        fr_data = parse_data(fr_file)
        input_tensor = input_tokenizer.texts_to_sequences(en_data)
        input_tensor = tf.keras.preprocessing.sequence.pad_sequences(input_tensor, padding="post")
        output_tensor = output_tokenizer.texts_to_sequences(fr_data)
        output_tensor = tf.keras.preprocessing.sequence.pad_sequences(output_tensor, padding="post")


        epochs = 1  # repeat count inside the tf.data pipeline (not training epochs)
        input_train, input_test, output_train, output_test = train_test_split(input_tensor,
                                                                              output_tensor,
                                                                              test_size=0.2)
        train_dataset = make_dataset(input_train, output_train, batch_size, epochs, True)
        test_dataset = make_dataset(input_test, output_test, batch_size, epochs, False)

        dist_train_dataset = strategy.experimental_distribute_dataset(train_dataset)
        dist_test_dataset = strategy.experimental_distribute_dataset(test_dataset)

        epochs = 50  # training epochs for this data shard
        for epoch in range(epochs):
            encoder_hidden = seq2seq.encoder.initialize_hidden_state()
            total_loss = 0
            num_batch = 0
            for (batch, (inp, targ)) in enumerate(dist_train_dataset):
                batch_loss = distribute_train_step(inp, targ, encoder_hidden)
                total_loss += batch_loss
                num_batch += 1
                if (batch % 100 == 0):
                    print("Epoch {} Batch {} Loss {:.4f}".format(epoch + 1, batch, batch_loss.numpy()))
            for (batch, (inp, targ)) in enumerate(dist_test_dataset):
                distribute_test_step(inp, targ, encoder_hidden)
            print("Epoch {} Train Loss {:.4f} Test Loss {:.4f}".format(epoch + 1, total_loss.numpy() / num_batch,
                                                                       test_loss_object.result()))
            test_loss_object.reset_states()
            if (epoch + 1) % 2 == 0:
                checkpoint.save(file_prefix=checkpoint_prefix)


def evaluate(sentence):
    attention_matrix = np.zeros((max_output, max_input))
    sentence = preprocess_sentence(sentence)
    inputs = [input_tokenizer.word_index[token] for token in sentence.split(" ")]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs], maxlen=max_input,
                                                           padding="post")
    inputs = tf.convert_to_tensor(inputs)
    result = ""
    encoding_hidden = tf.zeros((1, units))
    encoding_outputs, encoding_hidden = seq2seq.encoder(inputs, encoding_hidden)
    decoding_hidden = encoding_hidden
    decoding_input = tf.expand_dims([output_tokenizer.word_index["<start>"]], 0)
    for t in range(max_output):
        # attention_weights.shape:(batch_size, input_length, 1)
        predictions, decoding_hidden, attention_weights = seq2seq.decoder(
            decoding_input, decoding_hidden, encoding_outputs
        )
        attention_weights = tf.reshape(attention_weights, (-1,))
        attention_matrix[t] = attention_weights.numpy()
        # predictions.shape:(batch_size, vocab_size)
        predictions_id = tf.argmax(predictions[0]).numpy()
        if (output_tokenizer.index_word[predictions_id] == "<end>"):
            return result, sentence, attention_matrix
        else:
            result += output_tokenizer.index_word[predictions_id] + " "
        decoding_input = tf.expand_dims([predictions_id], 0)
    return result, sentence, attention_matrix


def translation(input):
    results, input, attention_matrix = evaluate(input)

    print("input:", input)
    print("translation:", results)


with open(r"training-giga-fren/data/en_1.txt") as f_test:
    sentence = f_test.readlines()
    cnt = 0
    for test_sentence in sentence:
        translation(test_sentence)
        cnt += 1
        if (cnt == 100):
            break

In this code I need to go through 33 sets of English-French data (each set contains 50,000 sentences). Because of the large number of sentences I tried to use the GPUs to speed up the computation, but I found that the program runs very slowly, and the process was even killed by the system (I did not see RAM or GPU memory usage going out of bounds; see the GPU memory usage below).
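A rough host-memory check I can add (my own debugging sketch using only the standard-library resource module; the helper name is mine) to see whether the 16 GB of system RAM is what runs out before the process gets killed:

import resource

def print_peak_rss(tag):
    # On Linux, ru_maxrss is reported in kilobytes.
    peak_gib = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / (1024 * 1024)
    print("{}: peak RSS {:.2f} GiB".format(tag, peak_gib))

# For example, call this after loading each data shard and after each epoch.
print_peak_rss("after loading shard 0")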
In these datasets:
max_input 3434
max_output 3540
input_vocab_size 171871
output_vocab_size 201755
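Given these numbers, I suspect the raw sequence lengths and vocabulary sizes are a large part of the problem: with max_output at 3540, the decoder loop in Seq2Seq.call unrolls thousands of time steps per batch. Below is a sketch of how I could cap both, using the same Keras preprocessing calls the code already uses (this is my own assumption, not part of the tutorial; MAX_LEN and VOCAB_LIMIT are hypothetical values, shown here for one shard's en_data):

MAX_LEN = 50         # hypothetical cap on tokens per sentence
VOCAB_LIMIT = 30000  # hypothetical cap on vocabulary size

input_tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=VOCAB_LIMIT, filters="", split=" ")
input_tokenizer.fit_on_texts(en_data)

input_tensor = input_tokenizer.texts_to_sequences(en_data)
# Truncate long sentences and pad the rest to a fixed, small length.
input_tensor = tf.keras.preprocessing.sequence.pad_sequences(
    input_tensor, maxlen=MAX_LEN, truncating="post", padding="post")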

(screenshot: GPU memory usage)

My system configuration:
Ubuntu 16.04, CUDA 10.1
Intel(R) Core(TM) i5-9600K CPU @ 3.70 GHz (6 cores)
GeForce RTX 2080, 8 GB (two cards)
16 GB RAM
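A quick sanity check on this setup is to verify that TensorFlow sees both RTX 2080 cards and that MirroredStrategy actually creates two replicas (a minimal sketch, assuming the standard TF 2.x APIs):

import tensorflow as tf

print(tf.config.experimental.list_physical_devices('GPU'))

strategy = tf.distribute.MirroredStrategy()
print("Replicas in sync:", strategy.num_replicas_in_sync)  # expecting 2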

My questions:
1. GPU memory is occupied, but GPU utilization is always 0. How can I increase GPU utilization?
2. How can I speed up my program?
3. Why was the process killed?
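For question 1, a rough check I can run (a sketch with assumptions: device-placement logging is very verbose, so I would only run it for a few steps) is to confirm that ops are placed on the GPUs at all and to time individual distributed steps while watching nvidia-smi in another terminal:

import time
import tensorflow as tf

tf.debugging.set_log_device_placement(True)  # log which device each op runs on (very verbose)

for step, (inp, targ) in enumerate(dist_train_dataset):
    start = time.time()
    batch_loss = distribute_train_step(inp, targ, encoder_hidden)
    print("step {} took {:.2f}s, loss {:.4f}".format(
        step, time.time() - start, batch_loss.numpy()))
    if step == 2:
        break

If the first step takes far longer than the later ones, I assume most of the time goes into tracing the tf.function, since the decoder loop unrolls targ.shape[1] - 1 iterations in Python, and the GPUs would sit idle while that graph is being built.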

...