Мне нужна помощь по поведению кода MNIST, отличающемуся от Ускоренный курс машинного обучения Google
Следующий код скопирован из примера MNIST для ускоренного курса обучения в Google: Упражнение по программированию: классификация рукописных цифр с помощью нейронных сетей .
Однако обучение расходится на локальном ПК (Windows или Linux)
Дивергенция в ПК с Linux
Тот же код работает нормально в Google Colab
Конвергенция в Google Colab
Посоветуйте, пожалуйста, как отлаживать.
from __future__ import print_function
import glob
import os
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
import tensorflow as tf
from tensorflow.python.data import Dataset
tf.logging.set_verbosity(tf.logging.INFO)
pd.options.display.max_rows = 10
pd.options.display.float_format = '{:.1f}'.format
mnist_dataframe = pd.read_csv(
# "https://download.mlcc.google.com/mledu-datasets/mnist_train_small.csv",
"mnist_train_small.csv",
sep=",",
header=None)
# Use just the first 10,000 records for training/validation.
mnist_dataframe = mnist_dataframe.head(10000)
mnist_dataframe = mnist_dataframe.reindex(np.random.permutation(mnist_dataframe.index))
def parse_labels_and_features(dataset):
labels = dataset[0]
features = dataset.loc[:,1:784]
features = features / 255
return labels, features
training_targets, training_examples = parse_labels_and_features(mnist_dataframe[:7500])
validation_targets, validation_examples = parse_labels_and_features(mnist_dataframe[7500:10000])
def construct_feature_columns():
return set([tf.feature_column.numeric_column('pixels', shape=784)])
def create_training_input_fn(features, labels, batch_size, num_epochs=None, shuffle=True):
def _input_fn(num_epochs=None, shuffle=True):
idx = np.random.permutation(features.index)
raw_features = {"pixels":features.reindex(idx)}
raw_targets = np.array(labels[idx])
ds = Dataset.from_tensor_slices((raw_features,raw_targets)) # warning: 2GB limit
ds = ds.batch(batch_size).repeat(num_epochs)
if shuffle:
ds = ds.shuffle(10000)
# Return the next batch of data.
feature_batch, label_batch = ds.make_one_shot_iterator().get_next()
return feature_batch, label_batch
return _input_fn
def create_predict_input_fn(features, labels, batch_size):
def _input_fn():
raw_features = {"pixels": features.values}
raw_targets = np.array(labels)
ds = Dataset.from_tensor_slices((raw_features, raw_targets)) # warning: 2GB limit
ds = ds.batch(batch_size)
# Return the next batch of data.
feature_batch, label_batch = ds.make_one_shot_iterator().get_next()
return feature_batch, label_batch
return _input_fn
def train_nn_classification_model(
learning_rate,
steps,
batch_size,
hidden_units,
training_examples,
training_targets,
validation_examples,
validation_targets):
periods = 3
steps_per_period = steps / periods
# Create the input functions.
predict_training_input_fn = create_predict_input_fn(
training_examples, training_targets, batch_size)
predict_validation_input_fn = create_predict_input_fn(
validation_examples, validation_targets, batch_size)
training_input_fn = create_training_input_fn(
training_examples, training_targets, batch_size)
# Create feature columns.
feature_columns = [tf.feature_column.numeric_column('pixels', shape=784)]
# Create a DNNClassifier object.
my_optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate)
my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)
classifier = tf.estimator.DNNClassifier(
feature_columns=feature_columns,
n_classes=10,
hidden_units=hidden_units,
optimizer=my_optimizer,
# config=tf.contrib.learn.RunConfig(keep_checkpoint_max=1),
model_dir='./mdd'
)
# Train the model, but do so inside a loop so that we can periodically assess
# loss metrics.
print("Training model...")
print("LogLoss error (on validation data):")
training_errors = []
validation_errors = []
for period in range (0, periods):
# Train the model, starting from the prior state.
classifier.train(
input_fn=training_input_fn,
steps=steps_per_period
)
# Take a break and compute probabilities.
training_predictions = list(classifier.predict(input_fn=predict_training_input_fn))
training_probabilities = np.array([item['probabilities'] for item in training_predictions])
training_pred_class_id = np.array([item['class_ids'][0] for item in training_predictions])
training_pred_one_hot = tf.keras.utils.to_categorical(training_pred_class_id,10)
validation_predictions = list(classifier.predict(input_fn=predict_validation_input_fn))
validation_probabilities = np.array([item['probabilities'] for item in validation_predictions])
validation_pred_class_id = np.array([item['class_ids'][0] for item in validation_predictions])
validation_pred_one_hot = tf.keras.utils.to_categorical(validation_pred_class_id,10)
# Compute training and validation errors.
training_log_loss = metrics.log_loss(training_targets, training_pred_one_hot)
validation_log_loss = metrics.log_loss(validation_targets, validation_pred_one_hot)
# Occasionally print the current loss.
print(" period %02d : %0.2f" % (period, validation_log_loss))
# Add the loss metrics from this period to our list.
training_errors.append(training_log_loss)
validation_errors.append(validation_log_loss)
print("Model training finished.")
# Remove event files to save disk space.
#_ = map(os.remove, glob.glob(os.path.join(classifier.model_dir, 'events.out.tfevents*')))
# Calculate final predictions (not probabilities, as above).
final_predictions = classifier.predict(input_fn=predict_validation_input_fn)
final_predictions = np.array([item['class_ids'][0] for item in final_predictions])
accuracy = metrics.accuracy_score(validation_targets, final_predictions)
print("Final accuracy (on validation data): %0.2f" % accuracy)
# Output a graph of loss metrics over periods.
plt.ylabel("LogLoss")
plt.xlabel("Periods")
plt.title("LogLoss vs. Periods")
plt.plot(training_errors, label="training")
plt.plot(validation_errors, label="validation")
plt.legend()
plt.show()
# Output a plot of the confusion matrix.
cm = metrics.confusion_matrix(validation_targets, final_predictions)
# Normalize the confusion matrix by row (i.e by the number of samples
# in each class).
cm_normalized = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis]
ax = sns.heatmap(cm_normalized, cmap="bone_r")
ax.set_aspect(1)
plt.title("Confusion matrix")
plt.ylabel("True label")
plt.xlabel("Predicted label")
plt.show()
return classifier
classifier = train_nn_classification_model(
learning_rate=0.05,
steps=300,
batch_size=30,
hidden_units=[100, 100],
training_examples=training_examples,
training_targets=training_targets,
validation_examples=validation_examples,
validation_targets=validation_targets)
mnist_test_dataframe = pd.read_csv(
# "https://download.mlcc.google.com/mledu-datasets/mnist_test.csv",
"mnist_test.csv",
sep=",",
header=None)
test_targets, test_examples = parse_labels_and_features(mnist_test_dataframe)
test_examples.describe()
predict_test_input_fn = create_predict_input_fn(
test_examples, test_targets, batch_size=100)
test_predictions = classifier.predict(input_fn=predict_test_input_fn)
test_predictions = np.array([item['class_ids'][0] for item in test_predictions])
accuracy = metrics.accuracy_score(test_targets, test_predictions)
print("Accuracy on test data: %0.2f" % accuracy)
[EDIT] Кажется, проблема заключается в np.random.permutation
. Следующий код может показать разницу двух платформ. Есть ли у ПК версия Numpy / Pandas?
import pandas as pd
import numpy as np
a = pd.DataFrame([[1,2],[3,4]])
print(a)
print(list(a.index))
while 1:
idx = np.random.permutation(a.index)
if idx[0] == 1:
break
print(a)
print(list(a.index))
print(idx)
Выводы из Google Colab
0 1
0 1 2
1 3 4
[0, 1]
0 1
0 1 2
1 3 4
[0, 1]
[1 0]
Выходы с ПК с Linux:
0 1
0 1 2
1 3 4
[0, 1]
0 1
1 1 2
0 3 4
[0, 1]
[1 0]
Так что, похоже, np.random.permutation(a.index)
ведет себя по-разному на двух платформах. Это ошибка?