Вкратце, я столкнулся со странной разницей в производительности при равных реализациях Глубокая встроенная кластеризация (DE C) в R , в которую я включил ссылки на реализацию в следующем.
Мой вопрос. Согласно следующим результатам и цифрам (включена реализация R ), код в R покрывает условие останова, помечая все наблюдения (т.е. выборки данных) в один кластер (см. рисунок 2).
Рисунок 1. Маркировка после первой инициализации с помощью kmeans
Figure2 Конвергенция в одном кластере после условия остановки (окончательная маркировка)
- вот и все код в R:
library(keras)
K <- keras::backend()
library(MLmetrics)
library(aricode)
#' Clustering layer for Deep Embedded Clustering -----------------------------------------------------------
createAutoencoderModel <- function( numberOfUnitsPerLayer,
activation = 'relu',
initializer = 'glorot_uniform' )
{
numberOfEncodingLayers <- length( numberOfUnitsPerLayer ) - 1
# input of AE
inputs <- layer_input( shape = numberOfUnitsPerLayer[1],name = 'input' )
encoder <- inputs
# internal layers in encoder
for( i in seq_len( numberOfEncodingLayers - 1 ) )
{
encoder <- encoder %>%
layer_dense( units = numberOfUnitsPerLayer[i+1],
activation = activation, kernel_initializer = initializer )
}
# hidden layer
encoder <- encoder %>%
layer_dense( units = tail( numberOfUnitsPerLayer, 1 ) )
autoencoder <- encoder
# internal layers in decoder
for( i in seq( from = numberOfEncodingLayers, to = 2, by = -1 ) )
{
autoencoder <- autoencoder %>%
layer_dense( units = numberOfUnitsPerLayer[i],
activation = activation, kernel_initializer = initializer )
}
# output
autoencoder <- autoencoder %>%
layer_dense( numberOfUnitsPerLayer[1], kernel_initializer = initializer, name = 'decoder' )
return( list(
autoencoderModel = keras_model( inputs = inputs, outputs = autoencoder ),
encoderModel = keras_model( inputs = inputs, outputs = encoder ) ) )
}
# Defination of Clustering layer ---------------------------------------------------------------------------
ClusteringLayer <- R6::R6Class( "ClusteringLayer",
inherit = KerasLayer,
lock_objects = FALSE,
public = list(
numberOfClusters = 10,
initialClusterWeights = NULL,
alpha = 1.0,
name = '',
initialize = function( numberOfClusters,
initialClusterWeights = NULL, alpha = 1.0, name = '' )
{
self$numberOfClusters <- numberOfClusters
self$initialClusterWeights <- initialClusterWeights
self$alpha <- alpha
self$name <- name
},
build = function( input_shape )
{
if( length( input_shape ) != 2 )
{
stop( paste0( "input_shape is not of length 2." ) )
}
self$clusters <- self$add_weight(
shape = list( self$numberOfClusters, input_shape[[2]] ),
initializer = 'glorot_uniform', name = 'clusters' )
if( ! is.null( self$initialClusterWeights ) )
{
self$set_weights( self$initialClusterWeights )
self$initialClusterWeights <- NULL
}
self$built <- TRUE
},
call = function( inputs, mask = NULL )
{
# Uses Student t-distribution (same as t-SNE)
# inputs are the variable containing the data, shape = ( numberOfSamples, numberOfFeatures )
K <- keras::backend()
q <- 1.0 / ( 1.0 + ( K$sum( K$square(
K$expand_dims( inputs, axis = 1L ) - self$clusters ), axis = 2L ) / self$alpha ) )
q <- q^( ( self$alpha + 1.0 ) / 2.0 )
q <- K$transpose( K$transpose( q ) / K$sum( q, axis = 1L ) )
return( q )
},
compute_output_shape = function( input_shape )
{
return( list( input_shape[[1]], self$numberOfClusters ) )
}
)
)
layer_clustering <- function( object,
numberOfClusters, initialClusterWeights = NULL,
alpha = 1.0, name = '' )
{
create_layer( ClusteringLayer, object,
list( numberOfClusters = numberOfClusters,
initialClusterWeights = initialClusterWeights,
alpha = alpha, name = name )
)
}
#' Deep embedded clustering (DEC) model class --------------------------------------------------------------
DeepEmbeddedClusteringModel <- R6::R6Class( "DeepEmbeddedClusteringModel",
inherit = NULL,
lock_objects = FALSE,
public = list(
numberOfUnitsPerLayer = NULL,
numberOfClusters = 10,
alpha = 1.0,
initializer = 'glorot_uniform',
convolutional = FALSE,
inputImageSize = NULL,
initialize = function( numberOfUnitsPerLayer,
numberOfClusters, alpha = 1.0, initializer = 'glorot_uniform',
convolutional = FALSE, inputImageSize = NULL )
{
self$numberOfUnitsPerLayer <- numberOfUnitsPerLayer
self$numberOfClusters <- numberOfClusters
self$alpha <- alpha
self$initializer <- initializer
self$convolutional <- convolutional
self$inputImageSize <- inputImageSize
ae <- createAutoencoderModel( self$numberOfUnitsPerLayer,
initializer = self$initializer )
self$autoencoder <- ae$autoencoderModel
self$encoder <- ae$encoderModel
# prepare DEC model
clusteringLayer <- self$encoder$output %>%
layer_clustering( self$numberOfClusters, name = "clustering" )
self$model <- keras_model( inputs = self$encoder$input, outputs = clusteringLayer )
},
pretrain = function( x, optimizer = 'adam', epochs = 200L, batchSize = 256L )
{
self$autoencoder$compile( optimizer = optimizer, loss = 'mse' )
self$autoencoder$fit( x, x, batch_size = batchSize, epochs = epochs )
},
loadWeights = function( weights )
{
self$model$load_weights( weights )
},
extractFeatures = function( x )
{
self$encoder$predict( x, verbose = 0 )
},
predictClusterLabels = function( x )
{
clusterProbabilities <- self$model$predict( x, verbose = 0 )
return( max.col( clusterProbabilities ) )
},
targetDistribution = function( q )
{
weight <- q^2 / colSums( q )
p <- t( t( weight ) / rowSums( weight ) )
return( p )
},
compile = function( optimizer = 'sgd', loss = 'kld', lossWeights = NULL )
{
self$model$compile( optimizer = optimizer, loss = loss, loss_weights = lossWeights )
},
fit = function( x, maxNumberOfIterations = 2e4, batchSize = 256L, tolerance = 1e-3, updateInterval = 10)
{
# Initialize clusters using k-means
km <- stats::kmeans( self$encoder$predict( x, verbose = 0 ),
centers = self$numberOfClusters, nstart = 20 )
currentPrediction <- km$cluster # fitted( km )
previousPrediction <- currentPrediction
self$model$get_layer( name = 'clustering' )$set_weights( list( km$centers ) )
# Deep clustering
loss <- 10000
index <- 0
indexArray <- 1:( dim( x )[1] )
for( i in seq_len( maxNumberOfIterations ) )
{
if( i %% updateInterval == 1 )
{
q <- self$model$predict( x, verbose = 0 )
p <- self$targetDistribution( q )
# Met stopping criterion
currentPrediction <- max.col( q )
plot(currentPrediction, col="blue")
title(main = 'Current prediction')
deltaLabel <- sum( currentPrediction != previousPrediction ) / length( currentPrediction )
cat( "Itr", i, ": ( out of", maxNumberOfIterations,
"): loss = [", unlist( loss ), "], deltaLabel =", deltaLabel,
", ACC= ", Accuracy(previousPrediction, currentPrediction),
", NMI= ", NMI(previousPrediction, currentPrediction), "\n", sep = ' ' )
previousPrediction <- currentPrediction
if( i > 1 && deltaLabel < tolerance )
{
print('Reached tolerance threshold. Stopping training......')
break
}
}
# train on batch
batchIndices <- indexArray[( index * batchSize + 1 ):min( ( index + 1 ) * batchSize, dim( x )[1] )]
loss <- self$model$train_on_batch( x = x[batchIndices,], y = p[batchIndices,] )
if( ( index + 1 ) * batchSize + 1 <= dim( x )[1] )
{
index <- index + 1
} else {
index <- 0
}
}
return( currentPrediction )
}
)
)
# loading dataset ---------------------------------------------------------------------------------------------
fmnist <- dataset_fashion_mnist()
numberOfTrainingData <- length( fmnist$train$y )
numberOfTestingData <- length( fmnist$test$y )
numberOfPixels <- prod( dim( fmnist$test$x[1,,] ) )
3
fmnist$train$xreshaped <- array_reshape( fmnist$train$x,
dim = c( numberOfTrainingData, numberOfPixels ), order = "C" )
fmnist$test$xreshaped <- array_reshape( fmnist$test$x,
dim = c( numberOfTestingData, numberOfPixels ), order = "C" )
x <- rbind( fmnist$test$xreshaped, fmnist$train$xreshaped )/255.0
y <- c( fmnist$test$y, fmnist$train$y )
numberOfClusters <- length( unique( fmnist$train$y ) )
initializer <- initializer_variance_scaling(
scale = 1/3, mode = 'fan_in', distribution = 'uniform' )
pretrainOptimizer <- optimizer_sgd( lr = 1.0, momentum = 0.9 )
decModel <- DeepEmbeddedClusteringModel$new(
numberOfUnitsPerLayer = c( numberOfPixels, 32, 32, 256, 10 ),
numberOfClusters = numberOfClusters, initializer = initializer )
decModel$pretrain( x = x, optimizer = optimizer_sgd( lr = 1.0, momentum = 0.9 ),
epochs = 10L, batchSize = 256L )
decModel$compile( optimizer = optimizer_sgd( lr = 1.0, momentum = 0.9 ), loss = 'kld' )
yPredicted <- decModel$fit( x, maxNumberOfIterations = 2e4, batchSize = 256,
tolerance = 1e-3, updateInterval = 10 )
Train on 70000 samples
Epoch 1/10
70000/70000 [==============================] - 4s 60us/sample - loss: 0.0795
Epoch 2/10
70000/70000 [==============================] - 3s 45us/sample - loss: 0.0651
Epoch 3/10
70000/70000 [==============================] - 3s 46us/sample - loss: 0.0470
Epoch 4/10
70000/70000 [==============================] - 3s 45us/sample - loss: 0.0430
Epoch 5/10
70000/70000 [==============================] - 3s 45us/sample - loss: 0.0402
Epoch 6/10
70000/70000 [==============================] - 3s 45us/sample - loss: 0.0359
Epoch 7/10
70000/70000 [==============================] - 3s 45us/sample - loss: 0.0345
Epoch 8/10
70000/70000 [==============================] - 3s 45us/sample - loss: 0.0337
Epoch 9/10
70000/70000 [==============================] - 3s 45us/sample - loss: 0.0326
Epoch 10/10
70000/70000 [==============================] - 3s 45us/sample - loss: 0.0311
<tensorflow.python.keras.callbacks.History>
Рисунок 3 Обучение с 10 эпохами
Itr 1 : ( out of 20000 ): loss = [ 10000 ], deltaLabel = 0 , ACC= 1 , NMI= 1
Itr 11 : ( out of 20000 ): loss = [ 1.02756 ], deltaLabel = 0.8403571 , ACC= 0.1596429 , NMI= 0.2638058
Itr 21 : ( out of 20000 ): loss = [ 1.016267 ], deltaLabel = 0.3924 , ACC= 0.6076 , NMI= 0
Itr 31 : ( out of 20000 ): loss = [ 1.467916 ], deltaLabel = 0 , ACC= 1 , NMI= NaN
[1] "Reached tolerance threshold. Stopping training......"
Рисунок 4, соответствующий модели DE C
Как это было соответственно относительно рисунка 5 и рисунка 6 в Python
Рисунок 5 Инициализация меток с помощью kmeans
Рисунок 6 окончательная маркировка после охвата модели (ось X - это образцы, а ось Y - это метки)
Не могли бы вы дать мне знаете, почему это происходит? Я пробовал другие функции потери, все же, подобные явления случаются (например, "categoryorical_crossentropy")
Информация о машине:
- Python 3.7, запущенная Spyder 4.1
- Rstodio Версия 1.2.5033
- Набор данных " mnist " для обеих реализаций
Реализация в Python: https://www.dropbox.com/s/ii3k7rklz7z6446/DEC_original.py?dl=0