Я тестировал иерархический LSTM с использованием текстовых корпусов Text8.Я построил сеть, используя тензор потока, а затем обучился на тренировочном наборе, проверил на тестовом наборе , и все прошло хорошо.Затем я закомментировал обучающий код, восстановил обученные ранее переменные и протестировал, на этот раз что-то пошло не так.Потеря намного хуже, чем первый тест, который, я думаю, будет похожим.Вот мой вопрос, почему результат второго теста намного хуже, чем в первый раз, когда сеть использует те же переменные?
Более конкретно, в первый раз я сохраняю переменные с помощью tf.train.Saver () .save () после тренировки и восстановите переменные с помощью tf.train.Saver (). restore () перед тестированием, итоговая потеря является разумной (близкой к последней потере в тренировке).Во второй раз я создаю одну и ту же сеть, пропускаю этап обучения, восстанавливаю переменную из того же файла переменных, который был создан в первый раз, и проверяю те же данные, что приводит к гораздо худшим потерям, чем в первый раз.Вот код:
network = HMLSTMNetwork(output_size=vocab_size, input_size=vocab_size, num_layers=num_layers, embed_size=1024,
out_hidden_size=512, hidden_state_sizes=512, task=task)
# Train a little:
generator = Generator(train_data_, batch_size, num_steps, num_epochs)
losses = network.train_on_generator(generator, variable_path='weights/{}_{}data_{}epochs'.format(task, train_data_length, num_epochs),
# Test:
generator = Generator(test_data, batch_size, num_steps, num_epochs=1)
_, predictions, truths, indicators = network.test_on_generator(generator, variable_path='weights/{}_{}data_{}epochs'.format(task, train_data_length, num_epochs))
print_text(predictions[0][0], truths[0][0], indicators[0][0], vocab_list=vocab_list)
Вот результат, когда я запускаю приведенный выше код: тест после тренировки
Вот результат, когда я повторно запускаю код послезакомментированный шаг обучения: протестируйте снова после восстановления переменных
Как видите, я напечатал 2 переменные в сети, они кажутся одинаковыми, это должно доказать, что сеть успешно восстановилаПеременные обучаются в первый раз.Так почему же одна и та же переменная и одна и та же сеть дали очень разные результаты?Это действительно сбивает с толку.За исключением потери, результаты прогноза, напечатанные на фиг. 6, также различны.
Этот вопрос действительно имеет значение, потому что я хочу использовать сохраненную переменную не один раз, а несколько раз.Я спрашивал окружающих и искал переполнение стека, не нашел подходящего ответа, возможно, я не выполнил поиск должным образом.Подскажите пожалуйста как решить эту проблему или как найти правильный ответ.Большое спасибо.
Добавление 2 основных файлов Python в конце.
from hmlstm import HMLSTMNetwork, prepare_inputs, get_text, viz_char_boundaries, plot_losses, Generator, print_text
import matplotlib.pyplot as plt
import tensorflow as tf
all_data_file = 'text8.txt'
seleted_data_file = 'text8_selected.txt'
with open(all_data_file, 'r') as f:
all_data = f.read()
all_data_length = len(all_data)
train_data_length = all_data_length // 10
test_data_length = all_data_length // 5000 + 8
selected_data_length = train_data_length + test_data_length
print('All data length: ', all_data_length)
print('Seleted data length: ', selected_data_length)
print('Train data length: ', train_data_length)
print('Test data length: ', test_data_length)
selected_data = all_data[: selected_data_length]
del all_data
# with open(seleted_data_file, 'w') as f:
# f.write(seleted_data)
vocab = set(selected_data)
vocab_size = len(vocab)
idx_to_vocab = dict(enumerate(vocab))
vocab_to_idx = dict(zip(idx_to_vocab.values(), idx_to_vocab.keys()))
vocab_list = [idx_to_vocab[i] for i in range(vocab_size)]
train_data = [vocab_to_idx[c] for c in selected_data[: train_data_length]]
test_data = [vocab_to_idx[c] for c in selected_data[-test_data_length:]]
train_data_ = [vocab_to_idx[c] for c in selected_data[: 200000]]
del selected_data
num_steps = 500
batch_size = 8
num_classes = vocab_size
learning_rate = 1e-3
num_layers = 3
num_epochs = 1
task = 'classification'
network = HMLSTMNetwork(output_size=vocab_size, input_size=vocab_size, num_layers=num_layers, embed_size=1024,
out_hidden_size=512, hidden_state_sizes=512, task=task)
# # Train:
# generator = Generator(train_data, batch_size, num_steps, num_epochs)
# losses = network.train_on_generator(generator, variable_path='weights/{}_{}data_{}epochs'.format(task, train_data_length, num_epochs),
# load_weights=False)
# plot_losses(losses)
# plt.savefig('figs/losses_of_{}_{}data_{}epochs'.format(task, train_data_length, num_epochs))
# # plt.show()
# Train a little:
generator = Generator(train_data_, batch_size, num_steps, num_epochs)
losses = network.train_on_generator(generator, variable_path='weights/{}_{}data_{}epochs'.format(task, train_data_length, num_epochs),
# Test:
generator = Generator(test_data, batch_size, num_steps, num_epochs=1)
_, predictions, truths, indicators = network.test_on_generator(generator, variable_path='weights/{}_{}data_{}epochs'.format(task, train_data_length, num_epochs))
print_text(predictions[0][0], truths[0][0], indicators[0][0], vocab_list=vocab_list)
import numpy as np
import tensorflow as tf
from .hmlstm_cell_ import HMLSTMCell, HMLSTMState
from .multi_hmlstm_cell_ import MultiHMLSTMCell
class HMLSTMNetwork(object):
def __init__(self,
HMLSTMNetwork is a class representing hierarchical multiscale
long short-term memory network.
input_size: integer, the size of an input at one timestep
output_size: integer, the size of an output at one timestep
num_layers: integer, the number of layers in the hmlstm
hidden_state_size: integer or list of integers. If it is an integer,
it is the size of the hidden state for each layer of the hmlstm.
If it is a list, it must have length equal to the number of layers,
and each integer of the list is the size of the hidden state for
the layer corresponding to its index.
out_hidden_size: integer, the size of the two hidden layers in the
output network.
embed_size: integer, the size of the embedding in the output network.
task: string, one of 'regression' and 'classification'.
self._out_hidden_size = out_hidden_size
self._embed_size = embed_size
self._num_layers = num_layers
self._input_size = input_size
self._output_size = output_size
self._session = None
self._graph = None
self._task = task
if type(hidden_state_sizes) is list and len(hidden_state_sizes) != num_layers:
raise ValueError('The number of hidden states provided must be the' +
' same as the number of layers.')
if type(hidden_state_sizes) is int:
self._hidden_state_sizes = [hidden_state_sizes] * self._num_layers
self._hidden_state_sizes = hidden_state_sizes
if task == 'classification':
self._loss_function = tf.nn.softmax_cross_entropy_with_logits
self.batch_in = tf.placeholder(tf.int32, shape=[None, None], name='batch_in')
self.batch_out = tf.placeholder(tf.int32, shape=[None, None], name='batch_out')
elif task == 'regression':
self._loss_function = lambda logits, labels: tf.square((logits - labels))
batch_in_shape = (None, None, self._input_size) # (T, B, I)
batch_out_shape = (None, None, self._output_size) # (T, B, O)
self.batch_in = tf.placeholder(tf.float32, shape=batch_in_shape, name='batch_in')
self.batch_out = tf.placeholder(tf.float32, shape=batch_out_shape, name='batch_out')
self._optimizer = tf.train.AdamOptimizer(1e-3)
# 初始化参数
def _initialize_gate_variables(self):
with tf.variable_scope('gates_vars'):
for l in range(self._num_layers):
tf.get_variable('gate_{}'.format(l), [sum(self._hidden_state_sizes), 1], dtype=tf.float32)
# 这里每一层的gate.shape==(self._hidden_state_sizes[l], 1)
def _initialize_embedding_variables(self):
with tf.variable_scope('embedding_vars'):
embed_shape = [sum(self._hidden_state_sizes), self._embed_size]
tf.get_variable('embed_weights', embed_shape, dtype=tf.float32)
def _initialize_output_variables(self):
with tf.variable_scope('output_module_vars'):
tf.get_variable('b1', [1, self._out_hidden_size], dtype=tf.float32)
tf.get_variable('b2', [1, self._out_hidden_size], dtype=tf.float32)
tf.get_variable('b3', [1, self._output_size], dtype=tf.float32)
tf.get_variable('w1', [self._embed_size, self._out_hidden_size], dtype=tf.float32)
tf.get_variable('w2', [self._out_hidden_size, self._out_hidden_size], dtype=tf.float32)
tf.get_variable('w3', [self._out_hidden_size, self._output_size], dtype=tf.float32)
def load_variables(self, path=None):
saver = tf.train.Saver()
print('loading variables...')
assert path is not None
saver.restore(self._session, path)
def save_variables(self, path=None):
saver = tf.train.Saver()
print('saving variables...')
assert path is not None
saver.save(self._session, path)
def gate_input(self, hidden_states):
gate the incoming hidden states
hidden_states: [B, sum(h_l)]
gated_input: [B, sum(h_l)]
with tf.variable_scope('gates_vars', reuse=True):
gates = []# [[B, 1] for l in range(L)]
for l in range(self._num_layers):
weights = tf.get_variable('gate_{}'.format(l), dtype=tf.float32)
gates.append(tf.sigmoid(tf.matmul(hidden_states, weights)))
split = tf.split(value=hidden_states,
gated_list = []# [[B, h_l] for l in range(L)]
for gate, hidden_state in zip(gates, split):
gated_list.append(tf.multiply(gate, hidden_state))
gated_input = tf.concat(gated_list, axis=1)# [B, sum(h_l)]
return gated_input
def embed_input(self, gated_input):
gated_input: [B, sum(h_l)]
embedding: [B, E], i.e. [B, embed_size]
with tf.variable_scope('embedding_vars', reuse=True):
embed_weights = tf.get_variable('embed_weights', dtype=tf.float32)
prod = tf.matmul(gated_input, embed_weights)
embedding = tf.nn.relu(prod)
return embedding
def output_module(self, embedding, outcome):
embedding: [B, E]
outcome: [B, output_size]
loss: [B, output_size] or [B, 1]
prediction: [B, output_size]
with tf.variable_scope('output_module_vars', reuse=True):
b1 = tf.get_variable('b1')
b2 = tf.get_variable('b2')
b3 = tf.get_variable('b3')
w1 = tf.get_variable('w1')
w2 = tf.get_variable('w2')
w3 = tf.get_variable('w3')
# feed forward network
# first layer
l1 = tf.nn.tanh(tf.matmul(embedding, w1) + b1)
# second layer
l2 = tf.nn.tanh(tf.matmul(l1, w2) + b2)
# output
prediction = tf.add(tf.matmul(l2, w3), b3, name='prediction')
loss_args = {'logits': prediction, 'labels': outcome}
loss = self._loss_function(**loss_args)
if self._task == 'classification':
loss = tf.expand_dims(loss, -1) # 这里由于tf.nn.softmax_cross_entropy_with_logits输出loss会比logits或者
# labels降一维,因此expand_dims让loss回到(B, 1)。
return loss, prediction
def create_multicell(self, batch_size, reuse):
def hmlstm_cell(layer):
if layer == 0:
h_below_size = self._input_size
h_below_size = self._hidden_state_sizes[layer - 1]
if layer == self._num_layers - 1:
# doesn't matter, all zeros, but for convenience with summing
# so the sum of ha sizes is just sum of hidden states
# 意思是将最上层的h_above_size设为最下层的hidden_state_size
h_above_size = self._hidden_state_sizes[0]
h_above_size = self._hidden_state_sizes[layer + 1]
return HMLSTMCell(self._hidden_state_sizes[layer], batch_size, h_below_size,
h_above_size, reuse)
hmlstm = MultiHMLSTMCell([hmlstm_cell(l) for l in range(self._num_layers)], reuse)
return hmlstm
def split_out_cell_states(self, accum):
accum: [B, H], i.e. [B, sum(h_l + h_l + 1)]
cell_states: a list of ([B, h_l], [B, h_l], [B, 1]), with length L
splits = []
for size in self._hidden_state_sizes:
splits += [size, size, 1]
split_states = tf.split(value=accum, num_or_size_splits=splits, axis=1)
cell_states = []
for l in range(self._num_layers):
c = split_states[(l * 3)]
h = split_states[(l * 3) + 1]
z = split_states[(l * 3) + 2]
cell_states.append(HMLSTMState(c=c, h=h, z=z))# (cell_state, hidden_state, boundary_detector)
return cell_states
def get_h_aboves(self, hidden_states, batch_size, hmlstm):
hidden_states: [[B, h_l] for l in range(L)]
h_aboves: [B, sum(ha_l)], ha denotes h_above
concated_hs = tf.concat(hidden_states[1:], axis=1)
h_above_for_last_layer = tf.zeros([batch_size, hmlstm._cells[-1]._h_above_size], dtype=tf.float32)
h_aboves = tf.concat([concated_hs, h_above_for_last_layer], axis=1)
return h_aboves
def network(self, reuse):
if self._task == 'classification':
batch_in_oh = tf.one_hot(self.batch_in, self._input_size)
batch_in_trans = tf.transpose(batch_in_oh, [1, 0, 2])
batch_out_oh = tf.one_hot(self.batch_out, self._output_size)
batch_out_trans = tf.transpose(batch_out_oh, [1, 0, 2])
elif self._task == 'regression':
batch_in_trans = self.batch_in
batch_out_trans = self.batch_out
batch_out_oh = self.batch_out
raise ValueError('Wrong task name!')
batch_size = tf.shape(batch_in_trans)[1]
hmlstm = self.create_multicell(batch_size, reuse)
def scan_rnn(accum, elem): # accum: 堆叠所有层的c,h,z的值组成的tensor, shape==[B, H]
# elem: 一个batch, 一个时间步长的输入, shape==[B, I]
cell_states = self.split_out_cell_states(accum) # [B, H] -> [([B, h_l], [B, h_l], [B, 1]) for l in L]
h_aboves = self.get_h_aboves([cs.h for cs in cell_states], batch_size, hmlstm) # [B, sum(ha_l)]
hmlstm_in = tf.concat((elem, h_aboves), axis=1) # [B, I] + [B, sum(ha_l)] -> [B, I + sum(ha_l)]
_, state = hmlstm(hmlstm_in, cell_states)
# a list of (c=[B, h_l], h=[B, h_l], z=[B, 1]) -> a list of (c=[B, h_l], h=[B, h_l], z=[B, 1])
concated_states = [tf.concat(tuple(s), axis=1) for s in state]
return tf.concat(concated_states, axis=1) # [B, H]
elem_len = (sum(self._hidden_state_sizes) * 2) + self._num_layers
initial = tf.zeros([batch_size, elem_len]) # [B, H], H = elem_len
states = tf.scan(scan_rnn, batch_in_trans, initial) # [T, B, H]
def map_indicators(elem):
state = self.split_out_cell_states(elem)
return tf.concat([l.z for l in state], axis=1)
raw_indicators = tf.map_fn(map_indicators, states) # [T, B, L]
indicators = tf.transpose(raw_indicators, [1, 2, 0]) # [B, L, T]
to_map = tf.concat((states, batch_out_trans), axis=2) # [T, B, H+O]
def map_output(elem):
splits = tf.constant([elem_len, self._output_size])
cell_states, outcome = tf.split(value=elem, num_or_size_splits=splits, axis=1)
hs = [s.h for s in self.split_out_cell_states(cell_states)]
gated = self.gate_input(tf.concat(hs, axis=1)) # [B, sum(h_l)]
embeded = self.embed_input(gated) # [B, E], E = embeded_size
loss, prediction = self.output_module(embeded, outcome)
# [B, 1], [B, O] for classification
# [B, O], [B, O] for regression(usually O == 1)
return tf.concat((loss, prediction), axis=1) # [B, 1+O] or [B, 2*output_size]
mapped = tf.map_fn(map_output, to_map) # [T, B, 1+O] or [T, B, 2*O]
loss = tf.reduce_mean(mapped[:, :, :-self._output_size])# scalar
predictions_ = mapped[:, :, -self._output_size:]
predictions = tf.nn.softmax(predictions_, axis=-1) # [T, B, O]
train = self._optimizer.minimize(loss)
return train, loss, indicators, predictions, batch_out_oh
def _get_graph(self):
if self._graph is None:
self._graph = self.network(reuse=False)
print('Instructing network...')
return self._graph
def train(self, batches_in, batches_out, variable_path='weights/unnamed_weights', load_weights=False, epochs=1):
Train the network.
batches_in: a 4 dimensional numpy array. The dimensions should be
[num_batches, batch_size, num_timesteps, input_size]
These represent the input at each time step for each batch.
batches_out: a 4 dimensional numpy array. The dimensions should be
[num_batches, batch_size, num_timesteps, output_size]
These represent the output at each time step for each batch.
variable_path: the path to which variable values will be saved and/or
load_weights: bool, whether to load variables prior to training
save_weights: bool, whether to save variables after training
epochs: integer, number of epochs
train, loss, _, _ = self._get_graph()
losses = []
self._session = tf.Session()
print('Start a session...')
if not load_weights:
init = tf.global_variables_initializer()
for epoch in range(epochs):
print('Epoch {}'.format(epoch))
for batch_in, batch_out in zip(batches_in, batches_out):
fetches = [train, loss]
feed_dict = {
self.batch_in: batch_in, # (T, B, I)
self.batch_out: batch_out # (T, B, O)
_, _loss = self._session.run(fetches, feed_dict)
print('loss:', _loss)
print('Close session...')
return losses
def train_on_generator(self, generator, variable_path='weights/unnamed_weights', load_weights=False):
train, loss, _, _, _ = self._get_graph()
losses = []
self._session = tf.Session()
print('Start a session...')
if not load_weights:
init = tf.global_variables_initializer()
for idx, epoch in enumerate(generator.gen_epochs()):
print('\nEpoch {}'.format(idx))
for batch_idx, (batch_in, batch_out) in enumerate(epoch):
fetches = [train, loss]
feed_dict = {
self.batch_in: batch_in, # (B, T)
self.batch_out: batch_out # (B, T)
_, _loss = self._session.run(fetches, feed_dict)
print('Loss of batch {}: '.format(batch_idx), _loss)
print('Close session...')
return losses
def predict(self, batch, variable_path=None, return_gradients=False):
Make predictions.
batch: batch for which to make predictions. should have dimensions
[batch_size, num_timesteps, output_size]
variable_path: string. If there is no active session in the network
object (i.e. it has not yet been used to train or predict, or the
tensorflow session has been manually closed), variables will be
loaded from the provided path. Otherwise variables already present
in the session will be used.
predictions for the batch
batch = np.array(batch)
_, _, _, predictions, _ = self._get_graph()
self._session = tf.Session()
print('Start a session...')
batch_out_size = (batch.shape[1], batch.shape[0], self._output_size)
gradients = tf.gradients(predictions[-1:, :], self.batch_in)
feed_dict = {self.batch_in: np.swapaxes(batch, 0, 1),
self.batch_out: np.zeros(batch_out_size)}
_predictions, _gradients = self._session.run([predictions, gradients], feed_dict)
if return_gradients:
return tuple(np.swapaxes(r, 0, 1) for r in (_predictions, _gradients[0]))
print('Close session...')
return np.swapaxes(_predictions, 0, 1)
def test_on_generator(self, generator, variable_path=None, return_gradients=False):
_, loss, indicators, predictions, batch_out_oh = self._get_graph()
losses = []
predictionss = []
truths = []
indicatorss = []
self._session = tf.Session()
print('Start a session...')
for idx, epoch in enumerate(generator.gen_epochs()):
print('\nEpoch {}'.format(idx))
for batch_idx, (batch_in, batch_out) in enumerate(epoch):
feed_dict = {
self.batch_in: batch_in, # (B, T)
self.batch_out: batch_out # (B, T)
_loss, _predictions, _indicators, _batch_out_oh = self._session.run(
[loss, predictions, indicators, batch_out_oh], feed_dict
print('Loss of batch {}: '.format(batch_idx), _loss)
losses.append(_loss) # list of float
predictionss.append(np.swapaxes(_predictions, 0, 1)) # list of [B, T, O]
truths.append(_batch_out_oh) # list of [B, T, O]
indicatorss.append(np.array(_indicators)) # list of [B, L, T]
avg_loss = np.mean(np.array(losses))
print('Average loss in testing: {}'.format(avg_loss))
# self._session.close()
# print('Close session...')
return losses, predictionss, truths, indicatorss
def predict_boundaries(self, batch, variable_path=None):
Find indicator values for every layer at every timestep.
batch: batch for which to make predictions. should have dimensions
[batch_size, num_timesteps, output_size]
variable_path: string. If there is no active session in the network
object (i.e. it has not yet been used to train or predict, or the
tensorflow session has been manually closed), variables will be
loaded from the provided path. Otherwise variables already present
in the session will be used.
indicator values for ever layer at every timestep
batch = np.array(batch)
_, _, indicators, _, _ = self._get_graph()
self._session = tf.Session()
print('Start a session...')
feed_dict = {self.batch_in: np.swapaxes(batch, 0, 1)}
_indicators = self._session.run(indicators, feed_dict)
print('Close session...')
return np.array(_indicators)
def print_variable(self):
w1_output = tf.get_default_graph().get_tensor_by_name("output_module_vars/w1:0")
w1_lstm = tf.get_default_graph().get_tensor_by_name("multi_hmlstm_cell/cell_0/hmlstm_cell/dense/kernel:0")
print('w1_output: \n', self._session.run(w1_output))
print('w1_lstm: \n', self._session.run(w1_lstm))
# for v in tf.trainable_variables():
# print(v.name)