Ядро ноутбука Jupyter умирает при запуске UNet - PullRequest
1 голос
/ 21 февраля 2020

Я запускал свой код с tensorflow=2.0.0 для запуска UNet с помощью ноутбука Jupyter. Когда я обновлюсь до tensorflow=2.1.0, мой код запустится. До окончания первой эпохи я получаю сообщение The kernel appears to have died. It will restart automatically.. UNet использует MirroredStrategy() для распределения UNet на графические процессоры. Я не получаю эту ошибку, когда я удаляю MirroredStrategy() и запускаю на одном GPU. Любая идея, почему tensorflow=2.1.0 может вызвать эту проблему?

def get_model(optimizer, loss_metric, metrics, lr=1e-4):
    with tf.device('/job:localhost/replica:0/task:0/device:GPU:0'):
        inputs = Input((sample_width, sample_height, sample_depth, 1))
        conv1 = Conv3D(32, (3, 3, 3), activation='relu', padding='same')(inputs)
        conv1 = Conv3D(32, (3, 3, 3), activation='relu', padding='same')(conv1)
        pool1 = MaxPooling3D(pool_size=(2, 2, 2))(conv1)
        drop1 = Dropout(0.5)(pool1)
        conv2 = Conv3D(64, (3, 3, 3), activation='relu', padding='same')(drop1)
        conv2 = Conv3D(64, (3, 3, 3), activation='relu', padding='same')(conv2)
        pool2 = MaxPooling3D(pool_size=(2, 2, 2))(conv2)
        drop2 = Dropout(0.5)(pool2)
        conv3 = Conv3D(128, (3, 3, 3), activation='relu', padding='same')(drop2)
        conv3 = Conv3D(128, (3, 3, 3), activation='relu', padding='same')(conv3)
        pool3 = MaxPooling3D(pool_size=(2, 2, 2))(conv3)
        drop3 = Dropout(0.3)(pool3)
        conv4 = Conv3D(256, (3, 3, 3), activation='relu', padding='same')(drop3)
        conv4 = Conv3D(256, (3, 3, 3), activation='relu', padding='same')(conv4)
        pool4 = MaxPooling3D(pool_size=(2, 2, 2))(conv4)
        drop4 = Dropout(0.3)(pool4)
        conv5 = Conv3D(512, (3, 3, 3), activation='relu', padding='same')(drop4)
        conv5 = Conv3D(512, (3, 3, 3), activation='relu', padding='same')(conv5)
    with tf.device('/job:localhost/replica:0/task:0/device:GPU:1'): 
        up6 = concatenate([Conv3DTranspose(256, (2, 2, 2), strides=(2, 2, 2), padding='same')(conv5), conv4], axis=4)
        conv6 = Conv3D(256, (3, 3, 3), activation='relu', padding='same')(up6)
        conv6 = Conv3D(256, (3, 3, 3), activation='relu', padding='same')(conv6)
        up7 = concatenate([Conv3DTranspose(128, (2, 2, 2), strides=(2, 2, 2), padding='same')(conv6), conv3], axis=4)
        conv7 = Conv3D(128, (3, 3, 3), activation='relu', padding='same')(up7)
        conv7 = Conv3D(128, (3, 3, 3), activation='relu', padding='same')(conv7)
        up8 = concatenate([Conv3DTranspose(64, (2, 2, 2), strides=(2, 2, 2), padding='same')(conv7), conv2], axis=4)
        conv8 = Conv3D(64, (3, 3, 3), activation='relu', padding='same')(up8)
        conv8 = Conv3D(64, (3, 3, 3), activation='relu', padding='same')(conv8)
        up9 = concatenate([Conv3DTranspose(32, (2, 2, 2), strides=(2, 2, 2), padding='same')(conv8), conv1], axis=4)
        conv9 = Conv3D(32, (3, 3, 3), activation='relu', padding='same')(up9)
        conv9 = Conv3D(32, (3, 3, 3), activation='relu', padding='same')(conv9)
        conv10 = Conv3D(1, (1, 1, 1), activation='sigmoid')(conv9)
        model = Model(inputs=[inputs], outputs=[conv10])
        model.compile(optimizer=optimizer(lr=lr), loss=loss_metric, metrics=metrics)
        return model

smooth = 1.
def dice_coef(y_true, y_pred):
    y_true_f = K.flatten(y_true)
    y_pred_f = K.flatten(y_pred)
    intersection = K.sum(y_true_f * y_pred_f)
    return (2. * intersection + smooth) / (K.sum(y_true_f) + K.sum(y_pred_f) + smooth)
def dice_coef_loss(y_true, y_pred):
    return -dice_coef(y_true, y_pred)

mirrored_strategy = tf.distribute.MirroredStrategy()
with mirrored_strategy.scope():
    model = get_model(optimizer=Adam, loss_metric=dice_coef_loss, metrics=[dice_coef], lr=1e-4)
observe_var = 'dice_coef'
strategy = 'max'
model_checkpoint = ModelCheckpoint('{epoch:04}model', monitor=observe_var, save_best_only=True)
model.fit(train_x, train_y, batch_size = 2, epochs= 1000, verbose=1, shuffle=True, validation_split=.2, callbacks=[model_checkpoint])
model.save( 'finalmodel')
...