Изменения, которые необходимо внести, уже упомянуты в ссылке https://github.com/horovod/horovod.. В основном это сводится к изменениям эпох, скорости обучения, обратных вызовов, оптимизатора, сохранению контрольных точек и некоторым настройкам инициализации и конфигурации. Ниже приведена версия вашего кода с поддержкой horovod:
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K
from keras.regularizers import l2
from keras.layers import BatchNormalization, Activation, ZeroPadding2D
import math
import tensorflow as tf
import horovod.keras as hvd
# Horovod: initialize Horovod.
hvd.init()
# Horovod: pin GPU to be used to process local rank (one GPU per process)
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = str(hvd.local_rank())
K.set_session(tf.Session(config=config))
def alexnet_model(img_shape=(28, 28, 1), n_classes=10, l2_reg=0.):
alexnet = Sequential()
alexnet.add(Conv2D(96, (11, 11), input_shape=img_shape,
padding='same', kernel_regularizer=l2(l2_reg)))
alexnet.add(BatchNormalization())
alexnet.add(Activation('relu'))
alexnet.add(MaxPooling2D(pool_size=(2, 2)))
alexnet.add(Conv2D(256, (5, 5), padding='same'))
alexnet.add(BatchNormalization())
alexnet.add(Activation('relu'))
alexnet.add(MaxPooling2D(pool_size=(2, 2)))
alexnet.add(ZeroPadding2D((1, 1)))
alexnet.add(Conv2D(512, (3, 3), padding='same'))
alexnet.add(BatchNormalization())
alexnet.add(Activation('relu'))
alexnet.add(MaxPooling2D(pool_size=(2, 2)))
alexnet.add(ZeroPadding2D((1, 1)))
alexnet.add(Conv2D(1024, (3, 3), padding='same'))
alexnet.add(BatchNormalization())
alexnet.add(Activation('relu'))
alexnet.add(ZeroPadding2D((1, 1)))
alexnet.add(Conv2D(1024, (3, 3), padding='same'))
alexnet.add(BatchNormalization())
alexnet.add(Activation('relu'))
alexnet.add(MaxPooling2D(pool_size=(2, 2)))
alexnet.add(Flatten())
alexnet.add(Dense(3072))
alexnet.add(BatchNormalization())
alexnet.add(Activation('relu'))
alexnet.add(Dropout(0.5))
alexnet.add(Dense(4096))
alexnet.add(BatchNormalization())
alexnet.add(Activation('relu'))
alexnet.add(Dropout(0.5))
alexnet.add(Dense(n_classes))
alexnet.add(BatchNormalization())
alexnet.add(Activation('softmax'))
return alexnet
if __name__ == "__main__":
batch_size = 32
num_classes = 10
epochs = int(math.ceil(3.0 / hvd.size()))
# input image dimensions
img_rows, img_cols = 28, 28
# the data, split between train and test sets
(x_train, y_train), (x_test, y_test) = mnist.load_data()
if K.image_data_format() == 'channels_first':
x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
input_shape = (1, img_rows, img_cols)
else:
x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
input_shape = (img_rows, img_cols, 1)
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255
print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')
# convert class vectors to binary class matrices
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
model = alexnet_model()
model.compile(loss=keras.losses.categorical_crossentropy,
optimizer=hvd.DistributedOptimizer(keras.optimizers.Adadelta(1.0 * hvd.size())),
metrics=['accuracy'])
callbacks=[hvd.callbacks.BroadcastGlobalVariablesCallback(0)]
# Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
if hvd.rank() == 0:
callbacks.append(keras.callbacks.ModelCheckpoint(filepath='alexnet_mnist_checkpoint.hdf5'))
history= model.fit(x_train, y_train,
batch_size=batch_size,
epochs=epochs,
verbose=1,
validation_data=(x_test, y_test),
callbacks=callbacks)
score = model.evaluate(x_test, y_test, verbose=0)
После изменения кода, пожалуйста, запустите его с помощью mpirun / mpiexec, чтобы horovod мог эффективно взаимодействовать со своими процессами. Например: mpirun -hosts <hostname0,hostname1> -np 2 -genv OMP_NUM_THREADS 4 python alexnet_mnist_horovod.py
. Обратите внимание, что в вашем кластере должен быть уже установлен OpenMPI / IntelMPI. Пример приведен для IntelMPI. Пожалуйста, измените соответственно, если вы используете OpenMPI