Трансферное обучение - PullRequest
0 голосов
/ 24 апреля 2018

Я пытаюсь следовать этому руководству введите описание ссылки здесь при обучении передаче, я использовал свой собственный набор данных, и я пытаюсь использовать MobileNet вместо этого, проблема в моделях MobileNet Есть 3 файла контрольных точек: mobilenet_v1_0.5_128.ckpt.data-00000-of-00001 mobilenet_v1_0.5_128.ckpt.index mobilenet_v1_0.5_128.ckpt.meta когда я использую один из них получил эту ошибку: NotFoundError (see above for traceback): Unsuccessful TensorSliceReader constructor: Failed to find any matching files for C://Users//hp//PycharmProjects//tfSlim/mobilenet_v1_0.5_128//mobilenet_v1_0.5_128.ckpt.meta [[Node: save/RestoreV2_139 = RestoreV2[dtypes=[DT_INT32], _device="/job:localhost/replica:0/task:0/device:CPU:0"](_arg_save/Const_0_0, save/RestoreV2_139/tensor_names, save/RestoreV2_139/shape_and_slices)]]


import tensorflow as tf
from tensorflow.contrib.framework.python.ops.variables import get_or_create_global_step
from tensorflow.python.platform import tf_logging as logging
#from inception_resnet_v2 import inception_resnet_v2, inception_resnet_v2_arg_scope
from models.research.slim.nets.mobilenet_v1 import mobilenet_v1, mobilenet_v1_arg_scope
import os
import time
import h5py
import numpy as np

slim = tf.contrib.slim

# ================ DATASET INFORMATION ======================
# State dataset directory where the tfrecord files are located
dataset_dir = 'C://Nassima//lymphoma//subs3'

# State where your log file is at. If it doesn't exist, create it.
log_dir = './log'

# State where your checkpoint file is
checkpoint_file = 'C://Users//hp//PycharmProjects//tfSlim/mobilenet_v1_0.5_128//mobilenet_v1_0.5_128.ckpt.meta'



# State the image size you're resizing your images to. We will use the default inception size of 299.
#image_size = 299
#image_size = 128

# State the number of classes to predict:
num_classes = 3
# State the labels file and read it
labels_file = 'C://Nassima//lymphoma//subs3//labels.txt'
labels = open(labels_file, 'r')

# Create a dictionary to refer each label to their string name
labels_to_name = {}
for line in labels:
    label, string_name = line.split(':')
    string_name = string_name[:-1]  # Remove newline
    labels_to_name[int(label)] = string_name
print(labels_to_name)
# Create the file pattern of your TFRecord files so that it could be recognized later on
"""
file_pattern = 'flowers_%s_*.tfrecord'
"""

# Create a dictionary that will help people understand your dataset better. This is required by the Dataset class later.
items_to_descriptions = {
    'image': 'A 3-channel RGB coloured lymphoma image that is either CLL, FL, MCL.',
    'label': 'A label that is as such -- 0:CLL, 1:FL, 2:MCL'
}

# ================= TRAINING INFORMATION ==================
# State the number of epochs to train
num_epochs = 1

# State your batch size
#batch_size = 8
file_mean = "C://Nassima//lymphoma//subs3//train//mean.hdf5"
TRAINING_SET_SIZE = 41860
BATCH_SIZE = 128
IMAGE_SIZE =  144
IMAGE_RESIZE = 128
# Learning rate information and configuration (Up to you to experiment)
initial_learning_rate = 0.0002
learning_rate_decay_factor = 0.7
num_epochs_before_decay = 2

def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))
def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
class _image_object: # image object from protobuf
    def __init__(self):
        self.image = tf.Variable([], dtype=tf.string)
        self.height = tf.Variable([], dtype=tf.int64)
        self.width = tf.Variable([], dtype=tf.int64)
        self.filename = tf.Variable([], dtype=tf.string)
        self.label = tf.Variable([], dtype=tf.int32)
def read_and_decode(filename_queue, mean):
    reader = tf.TFRecordReader()
    _, serialized_example = reader.read(filename_queue)
    features = tf.parse_single_example(serialized_example, features = {
        "image/encoded": tf.FixedLenFeature([], tf.string),
        "image/height": tf.FixedLenFeature([], tf.int64),
        "image/width": tf.FixedLenFeature([], tf.int64),
        "image/filename": tf.FixedLenFeature([], tf.string),
        "image/class/label": tf.FixedLenFeature([], tf.int64),})
    image_encoded = features["image/encoded"]
    image_raw = tf.decode_raw(image_encoded, tf.float32)
    image_object = _image_object()
    #image_object.image = tf.image.resize_image_with_crop_or_pad(image_raw, IMAGE_SIZE, IMAGE_SIZE)
    image_r = tf.reshape(image_raw, [IMAGE_SIZE, IMAGE_SIZE, 3])
    #added
    image_r = image_r - mean
    image_r = tf.random_crop(image_r ,[IMAGE_RESIZE ,IMAGE_RESIZE ,3], seed = 0, name = None)
    image_object.image = image_r
    image_object.height = features["image/height"]
    image_object.width = features["image/width"]
    image_object.filename = features["image/filename"]
    image_object.label = tf.cast(features["image/class/label"], tf.int64)
    return image_object
def flower_input(mean, if_random = True, if_training = True):
    if(if_training):
        filenames = [os.path.join(dataset_dir, "lymphoma_train_0000%d-of-00005.tfrecord" % i) for i in range(0, 5)]
    else:
        filenames = [os.path.join(dataset_dir, "lymphoma_validation_0000%d-of-00005.tfrecord" % i) for i in range(0, 5)]
    for f in filenames:
        if not tf.gfile.Exists(f):
            raise ValueError("Failed to find file: " + f)
    filename_queue = tf.train.string_input_producer(filenames)
    image_object = read_and_decode(filename_queue, mean)
    image = tf.image.per_image_standardization(image_object.image)
#    image = image_object.image
#    image = tf.image.adjust_gamma(tf.cast(image_object.image, tf.float32), gamma=1, gain=1) # Scale image to (0, 1)
    filename = image_object.filename
    label = image_object.label
    if(if_random):
        min_fraction_of_examples_in_queue = 0.4
        min_queue_examples = int(TRAINING_SET_SIZE * min_fraction_of_examples_in_queue)
        print("Filling queue with %d images before starting to train. " "This will take a few minutes." % min_queue_examples)
        num_preprocess_threads = 1
        image_batch, label_batch, filename_batch = tf.train.shuffle_batch(
            [image, label, filename],
            batch_size=BATCH_SIZE,
            num_threads=num_preprocess_threads,
            capacity=min_queue_examples + 3 * BATCH_SIZE,
            min_after_dequeue=min_queue_examples)
        return image_batch, label_batch, filename_batch
    else:
        image_batch, label_batch, filename_batch = tf.train.batch(
            [image, label, filename],
            batch_size=BATCH_SIZE,
            num_threads=1)
        return image_batch, label_batch, filename_batch

"""
# ============== DATASET LOADING ======================
"""

def run():
    # Create the log directory here. Must be done here otherwise import will activate this unneededly.
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)

    # ======================= TRAINING PROCESS =========================
    # Now we start to construct the graph and build our model
    with tf.Graph().as_default() as graph:
        tf.logging.set_verbosity(tf.logging.INFO)  # Set the verbosity to INFO level
        # ajouter le mean de l'image
        hdf5_file = h5py.File(file_mean, "r")
        # subtract the training mean
        mm = hdf5_file["train_mean"][0, ...]
        mm = mm[np.newaxis, ...]
        # Total number of samples
        mean = tf.convert_to_tensor(mm, np.float32)
        # First create the dataset and load one batch
        images, labels, _ = flower_input(mean, if_random=True, if_training=True)
        # Know the number steps to take before decaying the learning rate and batches per epoch
        num_batches_per_epoch = int(TRAINING_SET_SIZE / BATCH_SIZE)
        num_steps_per_epoch = num_batches_per_epoch  # Because one step is one batch processed
        decay_steps = int(num_epochs_before_decay * num_steps_per_epoch)

        # Create the model inference
        with slim.arg_scope(mobilenet_v1_arg_scope()):
            logits, end_points = mobilenet_v1(images, num_classes= num_classes, is_training=True)

        # Define the scopes that you want to exclude for restoration
        #exclude = ['InceptionResnetV2/Logits', 'InceptionResnetV2/AuxLogits']
        exclude = ['MobilenetV1/Logits', 'MobilenetV1/AuxLogits']
        #exclude = ["MobilenetV1/Logits/Conv2d_1c_1x1"]
        #exclude = []
        variables_to_restore = slim.get_variables_to_restore(exclude=exclude)

        # Perform one-hot-encoding of the labels (Try one-hot-encoding within the load_batch function!)
        one_hot_labels = slim.one_hot_encoding(labels, num_classes)

        # Performs the equivalent to tf.nn.sparse_softmax_cross_entropy_with_logits but enhanced with checks
        loss = tf.losses.softmax_cross_entropy(onehot_labels=one_hot_labels, logits=logits)
        total_loss = tf.losses.get_total_loss()  # obtain the regularization losses as well

        # Create the global step for monitoring the learning_rate and training.
        global_step = get_or_create_global_step()

        # Define your exponentially decaying learning rate
        lr = tf.train.exponential_decay(
            learning_rate=initial_learning_rate,
            global_step=global_step,
            decay_steps=decay_steps,
            decay_rate=learning_rate_decay_factor,
            staircase=True)

        # Now we can define the optimizer that takes on the learning rate
        optimizer = tf.train.AdamOptimizer(learning_rate=lr)

        # Create the train_op.
        train_op = slim.learning.create_train_op(total_loss, optimizer)

        # State the metrics that you want to predict. We get a predictions that is not one_hot_encoded.
        predictions = tf.argmax(end_points['Predictions'], 1)
        probabilities = end_points['Predictions']
        accuracy, accuracy_update = tf.contrib.metrics.streaming_accuracy(predictions, labels)
        metrics_op = tf.group(accuracy_update, probabilities)

        # Now finally create all the summaries you need to monitor and group them into one summary op.
        tf.summary.scalar('losses/Total_Loss', total_loss)
        tf.summary.scalar('accuracy', accuracy)
        tf.summary.scalar('learning_rate', lr)
        my_summary_op = tf.summary.merge_all()

        # Now we need to create a training step function that runs both the train_op, metrics_op and updates the global_step concurrently.
        def train_step(sess, train_op, global_step):
            '''
            Simply runs a session for the three arguments provided and gives a logging on the time elapsed for each global step
            '''
            # Check the time for each sess run
            start_time = time.time()
            total_loss, global_step_count, _ = sess.run([train_op, global_step, metrics_op])
            time_elapsed = time.time() - start_time
            # Run the logging to print some results
            logging.info('global step %s: loss: %.4f (%.2f sec/step)', global_step_count, total_loss, time_elapsed)
            return total_loss, global_step_count
        # Now we create a saver function that actually restores the variables from a checkpoint file in a sess
        saver = tf.train.Saver(variables_to_restore)
        saver = tf.train.import_meta_graph(checkpoint_file)
        #added
        def restore_fn(sess):
            return saver.restore(sess, 'C://Users//hp//PycharmProjects//tfSlim/mobilenet_v1_0.5_128//mobilenet_v1_0.5_128.ckpt')
        # Define your supervisor for running a managed session. Do not run the summary_op automatically or else it will consume too much memory
        sv = tf.train.Supervisor(logdir=log_dir, summary_op=None, init_fn=restore_fn)
        # Run the managed session
        with sv.managed_session() as sess:
            for step in range(num_steps_per_epoch * num_epochs):
                # At the start of every epoch, show the vital information:
                if step % num_batches_per_epoch == 0:
                    logging.info('Epoch %s/%s', step / num_batches_per_epoch + 1, num_epochs)
                    learning_rate_value, accuracy_value = sess.run([lr, accuracy])
                    logging.info('Current Learning Rate: %s', learning_rate_value)
                    logging.info('Current Streaming Accuracy: %s', accuracy_value)

                    # optionally, print your logits and predictions for a sanity check that things are going fine.
                    logits_value, probabilities_value, predictions_value, labels_value = sess.run(
                        [logits, probabilities, predictions, labels])
                    print
                    'logits: \n', logits_value
                    print
                    'Probabilities: \n', probabilities_value
                    print
                    'predictions: \n', predictions_value
                    print
                    'Labels:\n:', labels_value
                # Log the summaries every 10 step.
                if step % 10 == 0:
                    loss, _ = train_step(sess, train_op, sv.global_step)
                    summaries = sess.run(my_summary_op)
                    sv.summary_computed(sess, summaries)
                # If not, simply run the training step
                else:
                    loss, _ = train_step(sess, train_op, sv.global_step)

            # We log the final training loss and accuracy
            logging.info('Final Loss: %s', loss)
            logging.info('Final Accuracy: %s', sess.run(accuracy))
            # Once all the training has been done, save the log files and checkpoint model
            logging.info('Finished training! Saving model to disk now.')
            # saver.save(sess, "./flowers_model.ckpt")
            #sv.saver.save(sess, sv.save_path, global_step=sv.global_step)



if __name__ == '__main__':
    run()

и ошибка

File "C:/Users/hp/PycharmProjects/tfSlim/lympho_mobileNet/train_lymphoma2.py", line 272, in <module>
    run()
  File "C:/Users/hp/PycharmProjects/tfSlim/lympho_mobileNet/train_lymphoma2.py", line 230, in run
    sv = tf.train.Supervisor(logdir=log_dir, summary_op=None, init_fn=restore_fn)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\supervisor.py", line 300, in __init__
    self._init_saver(saver=saver)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\supervisor.py", line 448, in _init_saver
    saver = saver_mod.Saver()
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\saver.py", line 1218, in __init__
    self.build()
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\saver.py", line 1227, in build
    self._build(self._filename, build_save=True, build_restore=True)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\saver.py", line 1263, in _build
    build_save=build_save, build_restore=build_restore)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\saver.py", line 729, in _build_internal
    saveables = self._ValidateAndSliceInputs(names_to_saveables)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\saver.py", line 582, in _ValidateAndSliceInputs
    names_to_saveables = BaseSaverBuilder.OpListToDict(names_to_saveables)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\saver.py", line 554, in OpListToDict
    name)
ValueError: At least two variables have the same name: MobilenetV1/Conv2d_7_depthwise/BatchNorm/gamma

думаю из-за исключенных слоев или инструкции

tf.train.import_meta_graph(checkpoint_file)

1 Ответ

0 голосов
/ 22 мая 2018

Вы загружаете метафайл, в то время как вы должны указать только путь к mobilenet_v1_0.5_128.ckpt

...