SGD - Tensorflow против Matlab, колебания в точности проверки - PullRequest
0 голосов
/ 05 октября 2019

Я тренирую CNN для классификации CIFAR-10, как в Matlab, так и в Tensorflow: CNN выглядят одинаково для меня:

Tensorflow

import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Conv2D, BatchNormalization, Activation, Dropout, MaxPool2D, Flatten, Dense
import numpy as np
import datetime
import os   
cifar10 = tf.keras.datasets.cifar10

(x_train, y_train), (x_test, y_test) = cifar10.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0    
base_lr = 0.005    
for k in range(1):    
    lr = base_lr * (0.9 ** k) # lr = base_lr   
    model = Sequential()    
    # Layer 1
    model.add(Conv2D(input_shape=(32, 32, 3), filters=128, kernel_size=5, strides=(1, 1), padding="valid",
              kernel_initializer=tf.random_normal_initializer(mean=0.0, stddev=0.01),
              bias_initializer=tf.random_normal_initializer(mean=0.0, stddev=0.01)))
    model.add(BatchNormalization(epsilon=1e-5, momentum=0.0))
    model.add(Activation("relu"))
    model.add(MaxPool2D(pool_size=(3, 3), strides=(1, 1), padding="valid"))

    # Layer 2
    model.add(Conv2D(filters=128, kernel_size=5, strides=(1, 1), padding="valid",
              kernel_initializer=tf.random_normal_initializer(mean=0.0, stddev=0.01),
              bias_initializer=tf.random_normal_initializer(mean=0.0, stddev=0.01)))
    model.add(BatchNormalization(epsilon=1e-5, momentum=0.0))
    model.add(Activation("relu"))
    model.add(MaxPool2D(pool_size=(3, 3), strides=(1, 1), padding="valid"))

    #Layer 3 
    model.add(Conv2D(filters=128, kernel_size=5, strides=(1, 1), padding="valid",
              kernel_initializer=tf.random_normal_initializer(mean=0.0, stddev=0.01),
              bias_initializer=tf.random_normal_initializer(mean=0.0, stddev=0.01)))
    model.add(BatchNormalization(epsilon=1e-5, momentum=0.0))
    model.add(Activation("relu"))
    model.add(MaxPool2D(pool_size=(2, 2), strides=(2, 2), padding="valid"))

    # Fully Connected layer
    model.add(Flatten())
    layer_length = model.layers[-1].output_shape[-1]#layer_length=8192
    model.add(Dense(10, kernel_initializer=tf.random_normal_initializer(mean=0.0, stddev=1/np.sqrt(layer_length)),
                    bias_initializer=tf.random_normal_initializer(mean=0.0, stddev=1/np.sqrt(10)), activation="softmax"))

    model.compile(optimizer=tf.keras.optimizers.SGD(lr, 0.0), loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                  metrics=['accuracy'])
    sub_dir = datetime.datetime.strftime(datetime.datetime.now(), "%H_%M_%S_%m_%d_%y")
    log_dir = "D:\\DataSet\\CIFAR10\\TensorBoard\\" + sub_dir
    os.mkdir(log_dir)
    model.fit(x_train, y_train, batch_size=256, epochs=100, validation_data=(x_test,  y_test), verbose=2,
              callbacks=[tf.keras.callbacks.TensorBoard(log_dir=log_dir)])

MATLAB

Specs.Conv1Channels =  128;
Specs.Conv2Channels = 128;
Specs.Conv3Channels = 128;
Specs.DropOutProb =  0;%no dropout
Specs.Momentum =  0;%no momentum    
ImSize = [32 32 3];
MaxEpochs = 100;
num_of_batches = 5;
B = cell(num_of_batches,1);
for kBatch = 1:num_of_batches
  B{kBatch} = load(['D:\DataSet\CIFAR10\cifar-10-batches-mat\data_batch_',num2str(kBatch),'.mat']);
end
C = cell2mat(B(:));
X = double(rot90(reshape(cell2mat({C.data}')',ImSize(1),ImSize(2),ImSize(3),[]),-1))/255;
Y = categorical(cell2mat({C.labels}'));
T = {load('D:\DataSet\CIFAR10\cifar-10-batches-mat\test_batch.mat')};
C = cell2mat(T(:));
Xt = double(rot90(reshape(cell2mat({C.data}')',ImSize(1),ImSize(2),ImSize(3),[]),-1))/255;
Yt = categorical(cell2mat({C.labels}'));

MiniBatchSizes = 256;
lrs = 0.005;
for lr = 1:length(lrs)
for mbs = 1:length(MiniBatchSizes)      

  IterPerEpoch = ceil(size(X,4) / MiniBatchSizes(mbs));

  layers(1) = imageInputLayer(ImSize,'Name','input','Normalization','none');
  % Layer 1
  Conv1KernelSize = 5;
  layers(2) = convolution2dLayer(Conv1KernelSize,Specs.Conv1Channels,'Name','conv1');
  LOutSize = [ImSize(1) - Conv1KernelSize + 1 ImSize(2) - Conv1KernelSize + 1 ImSize(3) Specs.Conv1Channels];
  layers(3) = batchNormalizationLayer;
  layers(4) = reluLayer('Name','relu1');
  layers(5) = dropoutLayer(Specs.DropOutProb);
  layers(6) = maxPooling2dLayer(3,'Stride',1,'Name','pool1');
  LOutSize  = LOutSize - [2 2 0 0];

  %Layer 2
  Conv2KernelSize = 5;
  layers(7) = convolution2dLayer(Conv2KernelSize,Specs.Conv2Channels,'Name','conv2');
  LOutSize = [LOutSize(1) - Conv2KernelSize + 1 LOutSize(2) - Conv2KernelSize + 1 LOutSize(4) Conv2KernelSize];
  layers(8) = batchNormalizationLayer;
  layers(9) = reluLayer('Name','relu2');
  layers(10) = dropoutLayer(Specs.DropOutProb);
  layers(11) = maxPooling2dLayer(3,'Stride',1,'Name','pool2');
  LOutSize  = LOutSize - [2 2 0 0];

  %Layer 3

  Conv3KernelSize = 5;
  layers(12) = convolution2dLayer(Conv3KernelSize,Specs.Conv3Channels,'Name','conv3');
  LOutSize = [LOutSize(1) - Conv3KernelSize + 1 LOutSize(2) - Conv3KernelSize + 1 LOutSize(4) Conv3KernelSize];
  layers(13) = batchNormalizationLayer;
  layers(14) = reluLayer('Name','relu3');
  layers(15) = dropoutLayer(Specs.DropOutProb);
  layers(16) = maxPooling2dLayer(2,'Stride',2,'Name','pool3');
  LOutSize = [LOutSize(1:2) / 2 LOutSize(3:4)];

  %FC Layer 
  layers(17) = fullyConnectedLayer(10,'Name','fc4');
  layers(18) = softmaxLayer('Name','softmax');
  layers(19) = classificationLayer('Name','outlayer');

  %Initialization

  layers(2).Weights = 0.01 * randn([Conv1KernelSize Conv1KernelSize 3 Specs.Conv1Channels]);
  layers(2).Bias = 0.01 * randn([1 1 Specs.Conv1Channels]);
  layers(7).Weights = 0.01 * randn([Conv2KernelSize Conv2KernelSize Specs.Conv1Channels Specs.Conv2Channels]);
  layers(7).Bias = 0.01 * randn([1 1 Specs.Conv2Channels]);
  layers(12).Weights = 0.01 * randn([Conv3KernelSize Conv3KernelSize Specs.Conv2Channels Specs.Conv3Channels]);
  layers(12).Bias = 0.01 * randn([1 1 Specs.Conv3Channels]);
  %LOutSize(1) * LOutSize(2) * Specs.Conv3Channels = 8192
  layers(17).Weights = 1/ sqrt(LOutSize(1) * LOutSize(2) * Specs.Conv3Channels) * randn(10,LOutSize(1) * LOutSize(2) * Specs.Conv3Channels);
  layers(17).Bias = 1/ sqrt(10) * randn(10,1);
  options = trainingOptions('sgdm', ...
    'MaxEpochs',MaxEpochs,
    'InitialLearnRate',lrs(lr),...
    'MiniBatchSize',MiniBatchSizes(mbs),...
    'Momentum',Specs.Momentum,...
    'Shuffle','every-epoch', ...
    'ValidationData',{Xt,Yt},...
    'ValidationFrequency',1 * IterPerEpoch,...
    'VerboseFrequency',1 * IterPerEpoch,...
    'Plots','none',...
    'LearnRateSchedule','none',...
    'ExecutionEnvironment','gpu',...
    'L2Regularization',0.0);%no l2 reg.

     trainedNet = trainNetwork(X,Y,layers,options);
end
end

По какой-то причине я не могу понять, что точность проверки на TF сильно колеблется, в то время как точность проверки MATLAB выглядит красиво и плавно во всех отношениях:

Matlab Vs. TF validation accuracy

Что мне здесь не хватает ???

...