Cannot restore variables properly, TensorFlow 1.6, Python 3.5
0 votes
11 October 2018

I have been testing a hierarchical LSTM on the Text8 text corpus. I built the network in TensorFlow, trained it on the training set, evaluated it on the test set, and everything went fine. Then I commented out the training code, restored the previously trained variables, and tested again, and this time something went wrong: the loss is much worse than in the first test, even though I expected it to be similar. So my question is: why is the second test so much worse than the first, when the network uses the same variables?
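
To make the workflow concrete, this is essentially the pattern I am following (a stripped-down, self-contained TF 1.x sketch with made-up shapes and a made-up checkpoint path, not my real model):

import numpy as np
import tensorflow as tf

ckpt = 'weights/demo'   # hypothetical checkpoint prefix

# Run 1: build a tiny graph, train briefly, save.
tf.reset_default_graph()
x = tf.placeholder(tf.float32, [None, 4])
y = tf.placeholder(tf.float32, [None, 1])
w = tf.get_variable('w', [4, 1])
loss = tf.reduce_mean(tf.square(tf.matmul(x, w) - y))
train_op = tf.train.AdamOptimizer(1e-3).minimize(loss)

data_x = np.random.rand(32, 4).astype(np.float32)
data_y = np.random.rand(32, 1).astype(np.float32)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(100):
        sess.run(train_op, {x: data_x, y: data_y})
    loss_after_training = sess.run(loss, {x: data_x, y: data_y})
    tf.train.Saver().save(sess, ckpt)

# Run 2: skip training, restore from the checkpoint, evaluate again
# (in my real code this is a second run with the training part commented out).
with tf.Session() as sess:
    tf.train.Saver().restore(sess, ckpt)
    loss_after_restore = sess.run(loss, {x: data_x, y: data_y})

print(loss_after_training, loss_after_restore)   # in this toy case the two losses match

In my real code the same pattern is wrapped inside train_on_generator / test_on_generator, yet the loss after restoring is far from the loss at the end of training.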

More specifically, the first time I save the variables with tf.train.Saver().save() after training and restore them with tf.train.Saver().restore() before testing, and the resulting loss is reasonable (close to the last training loss). The second time I build the same network, skip the training step, restore the variables from the same file created the first time, and test on the same data, which gives a much worse loss than the first time. Here is the code:

network = HMLSTMNetwork(output_size=vocab_size, input_size=vocab_size, num_layers=num_layers, embed_size=1024,
                    out_hidden_size=512, hidden_state_sizes=512, task=task)

# Train a little:
generator = Generator(train_data_, batch_size, num_steps, num_epochs)
losses = network.train_on_generator(generator, variable_path='weights/{}_{}data_{}epochs'.format(task, train_data_length, num_epochs),
                                    load_weights=False)

# Test:
generator = Generator(test_data, batch_size, num_steps, num_epochs=1)
_, predictions, truths, indicators = network.test_on_generator(generator, variable_path='weights/{}_{}data_{}epochs'.format(task, train_data_length, num_epochs))
print_text(predictions[0][0], truths[0][0], indicators[0][0], vocab_list=vocab_list)
network.print_variable()

Here is the result when I run the code above: test after training (link to screenshot)

Here is the result when I re-run the code with the training step commented out: test again after restoring the variables (link to screenshot)

As you can see, I printed two variables from the network and they appear identical, which should prove that the network successfully restored the variables trained the first time. So why do the same variables and the same network give such different results? It is really confusing. Apart from the loss, the predictions printed in the screenshots are also different.

This question really matters to me, because I want to reuse the saved variables not just once but many times. I have asked around and searched Stack Overflow without finding a suitable answer; maybe I did not search properly. Please tell me how to solve this problem, or how to find the right answer. Thanks a lot.
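
For anyone who might help with the diagnosis: beyond printing those two variables, here is the kind of check I can run right after restoring, comparing every variable in the live session against the checkpoint file itself (just a sketch; variable_path is the same checkpoint prefix I pass to the saver):

import numpy as np
import tensorflow as tf

def compare_session_to_checkpoint(sess, variable_path):
    """Print, for every global variable, the max absolute difference between
    its value in the session and its value stored in the checkpoint."""
    reader = tf.train.NewCheckpointReader(variable_path)
    saved = reader.get_variable_to_shape_map()
    for var in tf.global_variables():
        name = var.name.split(':')[0]
        if name not in saved:
            print('not in checkpoint:', name)
            continue
        diff = np.max(np.abs(sess.run(var) - reader.get_tensor(name)))
        print(name, 'max |diff| =', diff)

If every difference is zero right after restoring (including the Adam slot variables), then the saved weights themselves are fine and the discrepancy must come from something that is not a saved variable.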

I am new to Stack Overflow, so I cannot embed the figures; you need to click the links to see them. Sorry about that.

The two main Python files are attached at the end.

main.py

from hmlstm import HMLSTMNetwork, prepare_inputs, get_text, viz_char_boundaries, plot_losses, Generator, print_text
import matplotlib.pyplot as plt
import tensorflow as tf

all_data_file = 'text8.txt'
seleted_data_file = 'text8_selected.txt'

with open(all_data_file, 'r') as f:
    all_data = f.read()
all_data_length = len(all_data)
train_data_length = all_data_length // 10
test_data_length = all_data_length // 5000 + 8
selected_data_length = train_data_length + test_data_length
print('All data length: ', all_data_length)
print('Selected data length: ', selected_data_length)
print('Train data length: ', train_data_length)
print('Test data length: ', test_data_length)
selected_data = all_data[: selected_data_length]
del all_data
# with open(seleted_data_file, 'w') as f:
#     f.write(seleted_data)

vocab = set(selected_data)
vocab_size = len(vocab)
idx_to_vocab = dict(enumerate(vocab))
vocab_to_idx = dict(zip(idx_to_vocab.values(), idx_to_vocab.keys()))
vocab_list = [idx_to_vocab[i] for i in range(vocab_size)]
train_data = [vocab_to_idx[c] for c in selected_data[: train_data_length]]
test_data = [vocab_to_idx[c] for c in selected_data[-test_data_length:]]
train_data_ = [vocab_to_idx[c] for c in selected_data[: 200000]]
del selected_data

num_steps = 500
batch_size = 8
num_classes = vocab_size
learning_rate = 1e-3
num_layers = 3
num_epochs = 1

task = 'classification'
network = HMLSTMNetwork(output_size=vocab_size, input_size=vocab_size, num_layers=num_layers, embed_size=1024,
                        out_hidden_size=512, hidden_state_sizes=512, task=task)

# # Train:
# generator = Generator(train_data, batch_size, num_steps, num_epochs)
# losses = network.train_on_generator(generator, variable_path='weights/{}_{}data_{}epochs'.format(task, train_data_length, num_epochs),
#                                     load_weights=False)
# plot_losses(losses)
# plt.savefig('figs/losses_of_{}_{}data_{}epochs'.format(task, train_data_length, num_epochs))
# # plt.show()

# Train a little:
generator = Generator(train_data_, batch_size, num_steps, num_epochs)
losses = network.train_on_generator(generator, variable_path='weights/{}_{}data_{}epochs'.format(task, train_data_length, num_epochs),
                                    load_weights=False)

# Test:
generator = Generator(test_data, batch_size, num_steps, num_epochs=1)
_, predictions, truths, indicators = network.test_on_generator(generator, variable_path='weights/{}_{}data_{}epochs'.format(task, train_data_length, num_epochs))
print_text(predictions[0][0], truths[0][0], indicators[0][0], vocab_list=vocab_list)
network.print_variable()

network.py

import numpy as np
import tensorflow as tf
from .hmlstm_cell_ import HMLSTMCell, HMLSTMState
from .multi_hmlstm_cell_ import MultiHMLSTMCell

class HMLSTMNetwork(object):
    def __init__(self,
                 input_size=1,
                 output_size=1,
                 num_layers=3,
                 hidden_state_sizes=50,
                 out_hidden_size=100,
                 embed_size=100,
                 task='regression'):
        """
        HMLSTMNetwork is a class representing hierarchical multiscale
        long short-term memory network.

        params:
        ---
        input_size: integer, the size of an input at one timestep
        output_size: integer, the size of an output at one timestep
        num_layers: integer, the number of layers in the hmlstm
        hidden_state_sizes: integer or list of integers. If it is an integer,
            it is the size of the hidden state for each layer of the hmlstm.
            If it is a list, it must have length equal to the number of layers,
            and each integer of the list is the size of the hidden state for
            the layer corresponding to its index.
        out_hidden_size: integer, the size of the two hidden layers in the
            output network.
        embed_size: integer, the size of the embedding in the output network.
        task: string, either 'regression' or 'classification'.
        """
        self._out_hidden_size = out_hidden_size
        self._embed_size = embed_size
        self._num_layers = num_layers
        self._input_size = input_size
        self._output_size = output_size
        self._session = None
        self._graph = None
        self._task = task

        tf.reset_default_graph()

        if type(hidden_state_sizes) is list and len(hidden_state_sizes) != num_layers:
            raise ValueError('The number of hidden states provided must be the' +
                             ' same as the number of layers.')

        if type(hidden_state_sizes) is int:
            self._hidden_state_sizes = [hidden_state_sizes] * self._num_layers
        else:
            self._hidden_state_sizes = hidden_state_sizes

        if task == 'classification':
            self._loss_function = tf.nn.softmax_cross_entropy_with_logits
            self.batch_in = tf.placeholder(tf.int32, shape=[None, None], name='batch_in')
            self.batch_out = tf.placeholder(tf.int32, shape=[None, None], name='batch_out')
        elif task == 'regression':
            self._loss_function = lambda logits, labels: tf.square((logits - labels))
            batch_in_shape = (None, None, self._input_size)                 # (T, B, I)
            batch_out_shape = (None, None, self._output_size)               # (T, B, O)
            self.batch_in = tf.placeholder(tf.float32, shape=batch_in_shape, name='batch_in')
            self.batch_out = tf.placeholder(tf.float32, shape=batch_out_shape, name='batch_out')

        self._optimizer = tf.train.AdamOptimizer(1e-3)
        # initialize the parameter variables
        self._initialize_output_variables()
        self._initialize_gate_variables()
        self._initialize_embedding_variables()

    def _initialize_gate_variables(self):
        with tf.variable_scope('gates_vars'):
            for l in range(self._num_layers):
                tf.get_variable('gate_{}'.format(l), [sum(self._hidden_state_sizes), 1], dtype=tf.float32)
                # each gate weight matrix has shape (sum(self._hidden_state_sizes), 1)

    def _initialize_embedding_variables(self):
        with tf.variable_scope('embedding_vars'):
            embed_shape = [sum(self._hidden_state_sizes), self._embed_size]
            tf.get_variable('embed_weights', embed_shape, dtype=tf.float32)

    def _initialize_output_variables(self):
        with tf.variable_scope('output_module_vars'):
            tf.get_variable('b1', [1, self._out_hidden_size], dtype=tf.float32)
            tf.get_variable('b2', [1, self._out_hidden_size], dtype=tf.float32)
            tf.get_variable('b3', [1, self._output_size], dtype=tf.float32)
            tf.get_variable('w1', [self._embed_size, self._out_hidden_size], dtype=tf.float32)
            tf.get_variable('w2', [self._out_hidden_size, self._out_hidden_size], dtype=tf.float32)
            tf.get_variable('w3', [self._out_hidden_size, self._output_size], dtype=tf.float32)

    def load_variables(self, path=None):
        saver = tf.train.Saver()
        print('loading variables...')
        assert path is not None
        saver.restore(self._session, path)

    def save_variables(self, path=None):
        saver = tf.train.Saver()
        print('saving variables...')
        assert path is not None
        saver.save(self._session, path)

    def gate_input(self, hidden_states):
        '''
        gate the incoming hidden states
        hidden_states: [B, sum(h_l)]

        gated_input: [B, sum(h_l)]
        '''
        with tf.variable_scope('gates_vars', reuse=True):
            gates = []# [[B, 1] for l in range(L)]
            for l in range(self._num_layers):
                weights = tf.get_variable('gate_{}'.format(l), dtype=tf.float32)
                gates.append(tf.sigmoid(tf.matmul(hidden_states, weights)))

            split = tf.split(value=hidden_states,
                                    num_or_size_splits=self._hidden_state_sizes,
                                    axis=1)

            gated_list = []# [[B, h_l] for l in range(L)]
            for gate, hidden_state in zip(gates, split):
                gated_list.append(tf.multiply(gate, hidden_state))
            gated_input = tf.concat(gated_list, axis=1)# [B, sum(h_l)]
        return gated_input

    def embed_input(self, gated_input):
        '''
        gated_input: [B, sum(h_l)]

        embedding: [B, E], i.e. [B, embed_size]
        '''
        with tf.variable_scope('embedding_vars', reuse=True):
            embed_weights = tf.get_variable('embed_weights', dtype=tf.float32)
            prod = tf.matmul(gated_input, embed_weights)
            embedding = tf.nn.relu(prod)
        return embedding

    def output_module(self, embedding, outcome):
        '''
        embedding: [B, E]
        outcome: [B, output_size]

        loss: [B, output_size] or [B, 1]
        prediction: [B, output_size]
        '''
        with tf.variable_scope('output_module_vars', reuse=True):
            b1 = tf.get_variable('b1')
            b2 = tf.get_variable('b2')
            b3 = tf.get_variable('b3')
            w1 = tf.get_variable('w1')
            w2 = tf.get_variable('w2')
            w3 = tf.get_variable('w3')

            # feed forward network
            # first layer
            l1 = tf.nn.tanh(tf.matmul(embedding, w1) + b1)

            # second layer
            l2 = tf.nn.tanh(tf.matmul(l1, w2) + b2)

            # output
            prediction = tf.add(tf.matmul(l2, w3), b3, name='prediction')

            loss_args = {'logits': prediction, 'labels': outcome}
            loss = self._loss_function(**loss_args)

            if self._task == 'classification':
                loss = tf.expand_dims(loss, -1) # tf.nn.softmax_cross_entropy_with_logits returns a loss with one
                                                # dimension fewer than logits/labels, so expand_dims brings it back to (B, 1).

        return loss, prediction

    def create_multicell(self, batch_size, reuse):

        def hmlstm_cell(layer):
            if layer == 0:
                h_below_size = self._input_size
            else:
                h_below_size = self._hidden_state_sizes[layer - 1]

            if layer == self._num_layers - 1:
                # doesn't matter, all zeros, but for convenience with summing
                # so the sum of ha sizes is just sum of hidden states
                # i.e. set the top layer's h_above_size to the bottom layer's hidden state size
                h_above_size = self._hidden_state_sizes[0]
            else:
                h_above_size = self._hidden_state_sizes[layer + 1]

            return HMLSTMCell(self._hidden_state_sizes[layer], batch_size, h_below_size,
                              h_above_size, reuse)

        hmlstm = MultiHMLSTMCell([hmlstm_cell(l) for l in range(self._num_layers)], reuse)
        return hmlstm

    def split_out_cell_states(self, accum):
        '''
        accum: [B, H], i.e. [B, sum(h_l + h_l + 1)]

        cell_states: a list of ([B, h_l], [B, h_l], [B, 1]), with length L
        '''
        splits = []
        for size in self._hidden_state_sizes:
            splits += [size, size, 1]
        split_states = tf.split(value=accum, num_or_size_splits=splits, axis=1)
        cell_states = []
        for l in range(self._num_layers):
            c = split_states[(l * 3)]
            h = split_states[(l * 3) + 1]
            z = split_states[(l * 3) + 2]
            cell_states.append(HMLSTMState(c=c, h=h, z=z))# (cell_state, hidden_state, boundary_detector)
        return cell_states

    def get_h_aboves(self, hidden_states, batch_size, hmlstm):
        '''
        hidden_states: [[B, h_l] for l in range(L)]

        h_aboves: [B, sum(ha_l)], ha denotes h_above
        '''
        concated_hs = tf.concat(hidden_states[1:], axis=1)
        h_above_for_last_layer = tf.zeros([batch_size, hmlstm._cells[-1]._h_above_size], dtype=tf.float32)
        h_aboves = tf.concat([concated_hs, h_above_for_last_layer], axis=1)
        return h_aboves

    def network(self, reuse):
        if self._task == 'classification':
            batch_in_oh = tf.one_hot(self.batch_in, self._input_size)
            batch_in_trans = tf.transpose(batch_in_oh, [1, 0, 2])
            batch_out_oh = tf.one_hot(self.batch_out, self._output_size)
            batch_out_trans = tf.transpose(batch_out_oh, [1, 0, 2])
        elif self._task == 'regression':
            batch_in_trans = self.batch_in
            batch_out_trans = self.batch_out
            batch_out_oh = self.batch_out
        else:
            raise ValueError('Wrong task name!')

        batch_size = tf.shape(batch_in_trans)[1]
        hmlstm = self.create_multicell(batch_size, reuse)

        def scan_rnn(accum, elem):  # accum: tensor stacking c, h, z of every layer, shape == [B, H]
                                    # elem: the input of one batch at one time step, shape == [B, I]
            cell_states = self.split_out_cell_states(accum)     # [B, H] -> [([B, h_l], [B, h_l], [B, 1]) for l in L]
            h_aboves = self.get_h_aboves([cs.h for cs in cell_states], batch_size, hmlstm)  # [B, sum(ha_l)]
            hmlstm_in = tf.concat((elem, h_aboves), axis=1)     # [B, I] + [B, sum(ha_l)] -> [B, I + sum(ha_l)]
            _, state = hmlstm(hmlstm_in, cell_states)
            # a list of (c=[B, h_l], h=[B, h_l], z=[B, 1])  ->  a list of (c=[B, h_l], h=[B, h_l], z=[B, 1])
            concated_states = [tf.concat(tuple(s), axis=1) for s in state]
            return tf.concat(concated_states, axis=1)                       # [B, H]

        elem_len = (sum(self._hidden_state_sizes) * 2) + self._num_layers
        initial = tf.zeros([batch_size, elem_len])                          # [B, H], H = elem_len
        states = tf.scan(scan_rnn, batch_in_trans, initial)            # [T, B, H]

        def map_indicators(elem):
            state = self.split_out_cell_states(elem)
            return tf.concat([l.z for l in state], axis=1)
        raw_indicators = tf.map_fn(map_indicators, states)                  # [T, B, L]
        indicators = tf.transpose(raw_indicators, [1, 2, 0])                # [B, L, T]
        to_map = tf.concat((states, batch_out_trans), axis=2)          # [T, B, H+O]

        def map_output(elem):
            splits = tf.constant([elem_len, self._output_size])
            cell_states, outcome = tf.split(value=elem, num_or_size_splits=splits, axis=1)
            hs = [s.h for s in self.split_out_cell_states(cell_states)]
            gated = self.gate_input(tf.concat(hs, axis=1))      # [B, sum(h_l)]
            embeded = self.embed_input(gated)                   # [B, E], E = embeded_size
            loss, prediction = self.output_module(embeded, outcome)
                                                                # [B, 1], [B, O] for classification
                                                                # [B, O], [B, O] for regression(usually O == 1)
            return tf.concat((loss, prediction), axis=1)        # [B, 1+O] or [B, 2*output_size]

        mapped = tf.map_fn(map_output, to_map)                  # [T, B, 1+O] or [T, B, 2*O]
        loss = tf.reduce_mean(mapped[:, :, :-self._output_size])# scalar
        predictions_ = mapped[:, :, -self._output_size:]
        predictions = tf.nn.softmax(predictions_, axis=-1)      # [T, B, O]
        train = self._optimizer.minimize(loss)

        return train, loss, indicators, predictions, batch_out_oh

    def _get_graph(self):
        if self._graph is None:
            self._graph = self.network(reuse=False)
            print('Constructing network...')
        return self._graph

    def train(self, batches_in, batches_out, variable_path='weights/unnamed_weights', load_weights=False, epochs=1):
        """
        Train the network.

        params:
        ---
        batches_in: a 4 dimensional numpy array. The dimensions should be
            [num_batches, batch_size, num_timesteps, input_size]
            These represent the input at each time step for each batch.
        batches_out: a 4 dimensional numpy array. The dimensions should be
            [num_batches, batch_size, num_timesteps, output_size]
            These represent the output at each time step for each batch.
        variable_path: the path to which variable values will be saved and/or
            loaded
        load_weights: bool, whether to load variables prior to training
            (variables are always saved to variable_path after training)
        epochs: integer, number of epochs
        """

        train, loss, _, _, _ = self._get_graph()
        losses = []

        self._session = tf.Session()
        print('Start a session...')

        if not load_weights:
            init = tf.global_variables_initializer()
            self._session.run(init)
        else:
            self.load_variables(variable_path)

        for epoch in range(epochs):
            print('Epoch {}'.format(epoch))
            for batch_in, batch_out in zip(batches_in, batches_out):
                fetches = [train, loss]
                feed_dict = {
                    self.batch_in: batch_in,     # (T, B, I)
                    self.batch_out: batch_out    # (T, B, O)
                }
                _, _loss = self._session.run(fetches, feed_dict)
                print('loss:', _loss)
                losses.append(_loss)

        self.save_variables(variable_path)

        self._session.close()
        print('Close session...')

        return losses

    def train_on_generator(self, generator, variable_path='weights/unnamed_weights', load_weights=False):
        train, loss, _, _, _ = self._get_graph()
        losses = []

        self._session = tf.Session()
        print('Start a session...')

        if not load_weights:
            init = tf.global_variables_initializer()
            self._session.run(init)
        else:
            self.load_variables(variable_path)

        for idx, epoch in enumerate(generator.gen_epochs()):
            print('\nEpoch {}'.format(idx))
            for batch_idx, (batch_in, batch_out) in enumerate(epoch):
                fetches = [train, loss]
                feed_dict = {
                    self.batch_in: batch_in,    # (B, T)
                    self.batch_out: batch_out   # (B, T)
                }
                _, _loss = self._session.run(fetches, feed_dict)
                print('Loss of batch {}: '.format(batch_idx), _loss)
                losses.append(_loss)

        self.save_variables(variable_path)

        self._session.close()
        print('Close session...')

        return losses

    def predict(self, batch, variable_path=None, return_gradients=False):
        """
        Make predictions.

        params:
        ---
        batch: batch for which to make predictions. should have dimensions
            [batch_size, num_timesteps, output_size]
        variable_path: string. If there is no active session in the network
            object (i.e. it has not yet been used to train or predict, or the
            tensorflow session has been manually closed), variables will be
            loaded from the provided path. Otherwise variables already present
            in the session will be used.

        returns:
        ---
        predictions for the batch
        """

        batch = np.array(batch)
        _, _, _, predictions, _ = self._get_graph()

        self._session = tf.Session()
        print('Start a session...')

        self.load_variables(variable_path)

        batch_out_size = (batch.shape[1], batch.shape[0], self._output_size)
        gradients = tf.gradients(predictions[-1:, :], self.batch_in)
        feed_dict = {self.batch_in: np.swapaxes(batch, 0, 1),
                     self.batch_out: np.zeros(batch_out_size)}
        _predictions, _gradients = self._session.run([predictions, gradients], feed_dict)

        if return_gradients:
            return tuple(np.swapaxes(r, 0, 1) for r in (_predictions, _gradients[0]))

        self._session.close()
        print('Close session...')

        return np.swapaxes(_predictions, 0, 1)

    def test_on_generator(self, generator, variable_path=None, return_gradients=False):
        _, loss, indicators, predictions, batch_out_oh = self._get_graph()
        losses = []
        predictionss = []
        truths = []
        indicatorss = []

        self._session = tf.Session()
        print('Start a session...')

        self.load_variables(variable_path)

        for idx, epoch in enumerate(generator.gen_epochs()):
            print('\nEpoch {}'.format(idx))
            for batch_idx, (batch_in, batch_out) in enumerate(epoch):
                feed_dict = {
                    self.batch_in: batch_in,    # (B, T)
                    self.batch_out: batch_out   # (B, T)
                }
                _loss, _predictions, _indicators, _batch_out_oh = self._session.run(
                    [loss, predictions, indicators, batch_out_oh], feed_dict
                )
                print('Loss of batch {}: '.format(batch_idx), _loss)
                losses.append(_loss)                                    # list of float
                predictionss.append(np.swapaxes(_predictions, 0, 1))    # list of [B, T, O]
                truths.append(_batch_out_oh)                            # list of [B, T, O]
                indicatorss.append(np.array(_indicators))               # list of [B, L, T]

        avg_loss = np.mean(np.array(losses))
        print('Average loss in testing: {}'.format(avg_loss))

        # self._session.close()
        # print('Close session...')

        return losses, predictionss, truths, indicatorss

    def predict_boundaries(self, batch, variable_path=None):
        """
        Find indicator values for every layer at every timestep.

        params:
        ---
        batch: batch for which to make predictions. should have dimensions
            [batch_size, num_timesteps, output_size]
        variable_path: string. If there is no active session in the network
            object (i.e. it has not yet been used to train or predict, or the
            tensorflow session has been manually closed), variables will be
            loaded from the provided path. Otherwise variables already present
            in the session will be used.

        returns:
        ---
        indicator values for every layer at every timestep
        """

        batch = np.array(batch)
        _, _, indicators, _, _ = self._get_graph()

        self._session = tf.Session()
        print('Start a session...')

        self.load_variables(variable_path)

        feed_dict = {self.batch_in: np.swapaxes(batch, 0, 1)}
        _indicators = self._session.run(indicators, feed_dict)

        self._session.close()
        print('Close session...')

        return np.array(_indicators)

    def print_variable(self):
        w1_output = tf.get_default_graph().get_tensor_by_name("output_module_vars/w1:0")
        w1_lstm = tf.get_default_graph().get_tensor_by_name("multi_hmlstm_cell/cell_0/hmlstm_cell/dense/kernel:0")
        print('w1_output: \n', self._session.run(w1_output))
        print('w1_lstm: \n', self._session.run(w1_lstm))
        self._session.close()

        # for v in tf.trainable_variables():
        #     print(v.name)
...