TensorFlow fit () и GradientTape - количество эпох различно - PullRequest
0 голосов
/ 24 января 2020

Я определяю архитектуру нейронной сети, используя только полносвязные (Dense) слои, и обучаю две модели: одну с помощью model.fit(), другую — с помощью GradientTape. Оба метода обучения используют одинаковую архитектуру модели.

Обе модели стартуют с одних и тех же случайно инициализированных весов, и все остальные параметры — оптимизатор, функция потерь и метрики — также одинаковы.

Размеры обучающей и тестовой выборок: X_train = (960, 4), y_train = (960,), X_test = (412, 4) и y_test = (412,)

import pandas as pd, numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import tensorflow_model_optimization as tfmot
from tensorflow_model_optimization.sparsity import keras as sparsity

def create_nn():
    """
    Function to create a
    Neural Network

    Builds and compiles a small fully connected binary classifier:
    4 input features -> Dense(4, relu) -> Dense(3, relu) -> Dense(1, sigmoid).

    Returns:
        The compiled `Sequential` model.
    """
    model = Sequential()

    model.add(
        Dense(
            units = 4, activation = 'relu',
            kernel_initializer = tf.keras.initializers.GlorotNormal(),
            input_shape = (4,)
        )
    )

    model.add(
        Dense(
            units = 3, activation = 'relu',
            kernel_initializer = tf.keras.initializers.GlorotNormal()
        )
    )

    model.add(
        Dense(
            units = 1, activation = 'sigmoid'
        )
    )

    # Compile the defined NN model above-
    model.compile(
        loss = 'binary_crossentropy',
        # `learning_rate` is the supported argument name; `lr` is a
        # deprecated alias.
        optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001),
        # NOTE(review): the metrics argument was lost from the listing;
        # 'accuracy' is assumed because the manual loop tracks accuracy -
        # confirm against the original script.
        metrics = ['accuracy']
    )

    return model

# Instantiate a model-
model = create_nn()

# Save weights for fair comparison-
model.save_weights("Random_Weights.h5", overwrite=True)

# Create datasets to be used for GradientTape-
# Use tf.data to batch and shuffle the dataset
# NOTE(review): shuffle(100) uses a buffer smaller than the 960-sample
# training set, so this is only a partial shuffle; model.fit() on arrays
# shuffles the whole training set each epoch by default. This is one source
# of difference between the two training runs - confirm whether intended.
train_ds = tf.data.Dataset.from_tensor_slices(
    (X_train, y_train)).shuffle(100).batch(32)

test_ds = tf.data.Dataset.from_tensor_slices(
    (X_test, y_test)).shuffle(100).batch(32)

# Define early stopping-
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=3,
    min_delta = 0.001, mode = 'min' )

# Train defined model-
history_orig = model.fit(
    x = X_train, y = y_train,
    batch_size = 32, epochs = 500,
    validation_data = (X_test, y_test),
    callbacks = [callback],
    verbose = 1 )

# Instantiate a model-
model_gt = create_nn()

# Restore random weights as used by the previous model for fair comparison-
model_gt.load_weights("Random_Weights.h5")

# Choose an optimizer and loss function for training-
loss_fn = tf.keras.losses.BinaryCrossentropy()
# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001)

# Select metrics to measure the error & accuracy of model.
# These metrics accumulate values across the batches they are called on
# until they are reset; `.result()` returns the running aggregate-
train_loss = tf.keras.metrics.Mean(name = 'train_loss')
train_accuracy = tf.keras.metrics.BinaryAccuracy(name = 'train_accuracy')

test_loss = tf.keras.metrics.Mean(name = 'test_loss')
# Named 'test_accuracy' so it is not confused with the training metric.
test_accuracy = tf.keras.metrics.BinaryAccuracy(name = 'test_accuracy')

# Use tf.GradientTape to train the model-

@tf.function
def train_step(data, labels):
    """
    Function to perform one step of Gradient
    Descent optimization

    Runs a forward pass of `model_gt` on one batch, applies the gradients of
    `loss_fn` with `optimizer`, and updates the running `train_loss` and
    `train_accuracy` metrics.

    Args:
        data: Batch of input features.
        labels: Batch of target labels matching `data`.
    """

    # Record the forward pass so the loss can be differentiated.
    with tf.GradientTape() as tape:
        predictions = model_gt(data)
        loss = loss_fn(labels, predictions)

    gradients = tape.gradient(loss, model_gt.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model_gt.trainable_variables))

    # Feed the batch loss into the running mean; without this update
    # `train_loss.result()` never reflects any training batch.
    train_loss(loss)
    train_accuracy(labels, predictions)

@tf.function
def test_step(data, labels):
    """
    Function to test model performance
    on testing dataset

    Runs a forward pass of `model_gt` on one batch (no weight update) and
    updates the running `test_loss` and `test_accuracy` metrics.

    Args:
        data: Batch of input features.
        labels: Batch of target labels matching `data`.
    """

    predictions = model_gt(data)
    t_loss = loss_fn(labels, predictions)

    # Feed the batch loss into the running mean; without this update
    # `test_loss.result()` never reflects any evaluation batch.
    test_loss(t_loss)
    test_accuracy(labels, predictions)

EPOCHS = 100

# User input-
minimum_delta = 0.001
patience = 3

patience_val = np.zeros(patience)

# Dictionary to hold scalar metrics-
# Entries for epochs that never run (after early stopping) stay at 0.
history = {}

history['accuracy'] = np.zeros(EPOCHS)
history['val_accuracy'] = np.zeros(EPOCHS)
history['loss'] = np.zeros(EPOCHS)
history['val_loss'] = np.zeros(EPOCHS)

# The report format does not change between epochs, so build it once.
template = 'Epoch {0}, Loss: {1:.4f}, Accuracy: {2:.4f}, Test Loss: {3:.4f}, Test Accuracy: {4:.4f}'

for epoch in range(EPOCHS):
    # Reset the metrics at the start of the next epoch, so that each
    # epoch's results are not averaged together with earlier epochs.
    # NOTE: newer TensorFlow releases name this method `reset_state()`.
    train_loss.reset_states()
    train_accuracy.reset_states()
    test_loss.reset_states()
    test_accuracy.reset_states()

    for x, y in train_ds:
        train_step(x, y)

    for x_t, y_t in test_ds:
        test_step(x_t, y_t)

    history['accuracy'][epoch] = train_accuracy.result()
    history['loss'][epoch] = train_loss.result()
    history['val_loss'][epoch] = test_loss.result()
    history['val_accuracy'][epoch] = test_accuracy.result()

    print(template.format(epoch + 1, 
                          train_loss.result(), train_accuracy.result()*100,
                          test_loss.result(), test_accuracy.result()*100))

    # Early stopping needs `patience` recorded validation losses.
    if epoch >= patience - 1:
        # Absolute differences between the last `patience` consecutive
        # validation losses, including the one just recorded for this epoch.
        recent_val_loss = history['val_loss'][epoch - patience + 1:epoch + 1]
        differences = np.abs(np.diff(recent_val_loss, n = 1))

        # Stop when none of the consecutive changes exceeds 'minimum_delta'.
        # NOTE: this is not the same rule as tf.keras.callbacks.EarlyStopping,
        # which compares each epoch against the best value seen so far.
        if not np.any(differences > minimum_delta):
            print("\n\nEarlyStopping Evoked! Stopping training\n\n")
            break

При обучении с помощью model.fit() требуется около 82 эпох, а при обучении с помощью GradientTape — 52 эпохи.

Почему существует такое расхождение в количестве эпох?


Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.