I'm having trouble saving and restoring a TensorFlow model I trained. The visible symptom is that the restored model performs poorly because it uses random weights instead of the trained ones. Others have run into this problem, but the root cause in my case seems to be different, possibly duplicated variable names.
I verified that the weights in the checkpoint file match the weights used by the trained model, and that the weights after restoring are completely different. I also noticed duplicated variables ending in _1 before saving. Even more unusual, after restoring there are repeated variables with identical names.
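For reference, a comparison along those lines can be done with tf.train.NewCheckpointReader (a minimal sketch, not my exact check; save_path is the checkpoint prefix returned by saver.save in the code below, and sess is the live session):

import tensorflow as tf

# read a tensor directly out of the checkpoint file
reader = tf.train.NewCheckpointReader(save_path)
ckpt_w = reader.get_tensor("Wconv1")
# fetch the same variable from the running session
live_w = sess.run("Wconv1:0")
# near zero right after training/saving; large after restoring
print(abs(ckpt_w - live_w).max())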
I made sure to run tf.reset_default_graph() before first building the graph and again before restoring it. One potential problem may be that I run global_variables_initializer() before restoring the variables; otherwise I get an uninitialized-variable error when running the restored model.
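Concretely, the ordering I am describing looks like this (a stripped-down sketch of the restore block shown further down):

with tf.Session() as sess:
    # ...rebuild the model graph here...
    # initialize first; skipping this step raises an uninitialized-variable
    # error (FailedPreconditionError) when the restored model is run
    sess.run(tf.global_variables_initializer())
    new_saver = tf.train.import_meta_graph(save_path + '.meta')
    # restoring should overwrite the random initialization, but apparently does not
    new_saver.restore(sess, save_path)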
I am using the latest GPU-enabled version of TensorFlow. Here is the code, split into building, training/saving, and restoring:
# build model
import math
import os
import numpy as np
import tensorflow as tf

def model_ten_layer_unet(X, y, is_training):
    # define our weights (e.g. init_two_layer_convnet)
    M = 4
    # setup variables
    Wconv1 = tf.get_variable("Wconv1", shape=[3, 3, 3, 1, M*32])  # SAME: never change output size
    bconv1 = tf.get_variable("bconv1", shape=[M*32])
    Wconv2 = tf.get_variable("Wconv2", shape=[3, 3, 3, M*32, M*32])
    bconv2 = tf.get_variable("bconv2", shape=[M*32])
    ...
    #Wconv12 = tf.get_variable("Wconv12", shape=[3, 3, 3, 1, M*32])  # SAME: final layer combines stacked outputs
    #bconv12 = tf.get_variable("bconv12", shape=[1])
    # define our graph (e.g. two_layer_convnet)
    d1 = tf.nn.max_pool3d(tf.expand_dims(X, -1), ksize=[1, 2, 2, 2, 1], strides=[1, 2, 2, 2, 1], padding='VALID')
    print('d1 shape: ', d1.shape)
    ...
    a12 = tf.layers.conv3d_transpose(c10, filters=1, kernel_size=[3, 3, 3], strides=[2, 2, 2], padding='SAME')
    print('a12 shape: ', a12.shape)
    y_out = tf.add(tf.squeeze(a12, -1), X)
    print('y_out shape: ', y_out.shape)
    return y_out
def run_model(session, predict, loss_val, Xd, yd, mean_image,
              epochs=1, batch_size=64, print_every=100,
              training=None, plot_losses=False):
    # have tensorflow compute accuracy
    mse = tf.reduce_mean(tf.square(tf.subtract(predict, y)))
    accuracy = tf.sqrt(mse) / tf.reduce_mean(tf.cast(mean_image, tf.float32))
    # shuffle indices
    train_indicies = np.arange(Xd.shape[0])
    np.random.shuffle(train_indicies)
    training_now = training is not None
    # set up the variables we want to compute (and the optimizer op);
    # if we have a training function, add that to the things we compute.
    # mse is only a placeholder slot; it is replaced by the train op below
    variables = [mean_loss, accuracy, mse]
    if training_now:
        variables[-1] = training
    # counter
    iter_cnt = 0
    for e in range(epochs):
        # keep track of losses and accuracy
        losses = []
        accuracies = []
        # make sure we iterate over the dataset once
        for i in range(int(math.ceil(Xd.shape[0] / batch_size))):
            # generate indices for the batch
            start_idx = (i * batch_size) % Xd.shape[0]
            idx = train_indicies[start_idx:start_idx + batch_size]
            # create a feed dictionary for this batch
            feed_dict = {X: Xd[idx, :],
                         y: yd[idx],
                         is_training: training_now}
            # get batch size
            actual_batch_size = yd[idx].shape[0]
            # have tensorflow compute loss and accuracy,
            # and (if given) perform a training step
            loss, accuracy, _ = session.run(variables, feed_dict=feed_dict)
            # aggregate performance stats
            losses.append(loss)
            accuracies.append(accuracy)
            # print every now and then
            if training_now and (iter_cnt % print_every) == 0:
                print("Iteration {0}: with minibatch training loss = {1:.3g} and accuracy of {2:.2g}"
                      .format(iter_cnt, loss, accuracy))
            iter_cnt += 1
        total_accuracy = np.mean(accuracies)
        total_loss = np.mean(losses)
        ...
    return total_loss, total_accuracy, y_out
tf.reset_default_graph()
X = tf.placeholder(tf.float32, [None, 64, 64, 64])
y = tf.placeholder(tf.float32, [None, 64, 64, 64])
is_training = tf.placeholder(tf.bool)
y_out = model_ten_layer_unet(X,y,is_training)
total_loss = tf.square(tf.subtract(y_out, y))
mean_loss = tf.reduce_mean(total_loss)
global_step = tf.Variable(0, trainable=False)
boundaries = [5000,10000,15000]
values = [4e-4,2e-4,5e-5,1e-5]
learning_rate = tf.train.piecewise_constant(global_step, boundaries, values)
optimizer = tf.train.AdamOptimizer(learning_rate) # select optimizer and set learning rate
# enable saving and loading models
saver = tf.train.Saver()
#set up path for saving models
home = os.getenv("HOME")
work_dir = 'assignment2'
model_dir = 'models'
model_name = 'unet1'
model_path = os.path.join(home,work_dir,model_dir)
if not os.path.exists(model_path):
    os.makedirs(model_path)
model_pathname = os.path.join(model_path,model_name)
# batch normalization in tensorflow requires this extra dependency
extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(extra_update_ops):
    train_step = optimizer.minimize(mean_loss, global_step=global_step)
# train model
with tf.Session() as sess:
    #with tf.device("/gpu:0"):  # "/cpu:0" or "/gpu:0"
    sess.run(tf.global_variables_initializer())
    #print(sess.run('Wconv1:0'))
    for e in range(1):
        print('Training epoch ', e+1)
        run_model(sess, y_out, mean_loss, X_train, y_train, mean_image, 1, 32, 1, train_step, False)
        print('Validation epoch ', e+1)
        _, _, y_hat = run_model(sess, y_out, mean_loss, X_val, y_val, mean_image, 1, 32, 1)
    print('Global Variables')
    for v in tf.global_variables():
        print(v.name)
    print('Local Variables')
    for v in tf.local_variables():
        print(v.name)
    saver.export_meta_graph(model_pathname, clear_devices=True)
    save_path = saver.save(sess, model_pathname, global_step=global_step)
    print("Model saved in path: %s" % save_path)
#Output
Training epoch 1
Iteration 0: with minibatch training loss = 7.95e+03 and accuracy of 0.2
...
Iteration 27: with minibatch training loss = 7.51e+03 and accuracy of 0.2
Global Variables
Wconv1:0
bconv1:0
Wconv2:0
bconv2:0
Wconv3:0
bconv3:0
conv3d_transpose/kernel:0
conv3d_transpose/bias:0
conv3d_transpose_1/kernel:0
conv3d_transpose_1/bias:0
Variable:0
beta1_power:0
beta2_power:0
Wconv1/Adam:0
Wconv1/Adam_1:0 **Duplicated variables with _1 before saving**
bconv1/Adam:0
bconv1/Adam_1:0
Wconv2/Adam:0
Wconv2/Adam_1:0
bconv2/Adam:0
bconv2/Adam_1:0
Wconv3/Adam:0
Wconv3/Adam_1:0
bconv3/Adam:0
bconv3/Adam_1:0
conv3d_transpose/kernel/Adam:0
conv3d_transpose/kernel/Adam_1:0
...
conv3d_transpose_1/bias/Adam:0
conv3d_transpose_1/bias/Adam_1:0
# restore model
tf.reset_default_graph()
# Later, launch the model, use the saver to restore variables from disk, and
# do some work with the model.
with tf.Session() as sess:
    X = tf.placeholder(tf.float32, [None, 64, 64, 64])
    y = tf.placeholder(tf.float32, [None, 64, 64, 64])
    is_training = tf.placeholder(tf.bool)
    y_out = model_ten_layer_unet(X, y, is_training)
    total_loss = tf.square(tf.subtract(y_out, y))
    mean_loss = tf.reduce_mean(total_loss)
    sess.run(tf.global_variables_initializer())
    new_saver = tf.train.import_meta_graph(save_path + '.meta')
    # Restore variables from disk.
    new_saver.restore(sess, save_path)
    print("Model restored.")
    # Load output images
    print('Global Variables')
    for v in tf.global_variables():
        print(v.name)
    print('Local Variables')
    for v in tf.local_variables():
        print(v.name)
    _, _, y_hat = run_model(sess, y_out, mean_loss, X_val, y_val, mean_image, 1, 32, 1)
    y_hat = y_hat.eval(feed_dict={X: X_val[0:9, ], is_training: False})
#Output
d1 shape: (?, 32, 32, 32, 1)
...
y_out shape: (?, 64, 64, 64)
Global Variables
Wconv1:0
bconv1:0
Wconv2:0
bconv2:0
Wconv3:0
bconv3:0
conv3d_transpose/kernel:0
conv3d_transpose/bias:0
conv3d_transpose_1/kernel:0
conv3d_transpose_1/bias:0
Wconv1:0 **Repeated variables with the same name**
bconv1:0
Wconv2:0
bconv2:0
Wconv3:0
bconv3:0
conv3d_transpose/kernel:0
conv3d_transpose/bias:0
conv3d_transpose_1/kernel:0
conv3d_transpose_1/bias:0
Variable:0
beta1_power:0
beta2_power:0
Wconv1/Adam:0
Wconv1/Adam_1:0
bconv1/Adam:0
bconv1/Adam_1:0
Wconv2/Adam:0
Wconv2/Adam_1:0
bconv2/Adam:0
bconv2/Adam_1:0
Wconv3/Adam:0
Wconv3/Adam_1:0
bconv3/Adam:0
bconv3/Adam_1:0
conv3d_transpose/kernel/Adam:0
conv3d_transpose/kernel/Adam_1:0
...
conv3d_transpose_1/bias/Adam:0
conv3d_transpose_1/bias/Adam_1:0
Local Variables
Epoch 1, Overall loss = 9.01e+03 and accuracy of 0.214 **Poor validation performance**