Why does Deep Embedded Clustering (DEC) converge to a single cluster in my R implementation?
27 March 2020

In short, I have run into a strange performance difference between what should be equivalent implementations of Deep Embedded Clustering (DEC) in R and Python; links to both implementations are included below.

My question: as the results and figures below show (the full R implementation is included), the R code reaches the stopping condition with all observations (i.e., data samples) assigned to a single cluster (see Figure 2).


Figure 1. Labeling after the first initialization with k-means


Figure 2. Convergence to a single cluster after the stopping condition (final labeling)

Here is the full R code:

library(keras)
K <- keras::backend()
library(MLmetrics)
library(aricode)

#' Autoencoder model for Deep Embedded Clustering (DEC) -----------------------------------------------------

createAutoencoderModel <- function( numberOfUnitsPerLayer,
                                    activation = 'relu',
                                    initializer = 'glorot_uniform' )
{
  numberOfEncodingLayers <- length( numberOfUnitsPerLayer ) - 1
  # input of AE
  inputs <- layer_input( shape = numberOfUnitsPerLayer[1],name = 'input' )
  encoder <- inputs

  # internal layers in encoder

  for( i in seq_len( numberOfEncodingLayers - 1 ) )
  {
    encoder <- encoder %>%
      layer_dense( units = numberOfUnitsPerLayer[i+1],
                   activation = activation, kernel_initializer = initializer )
  }
  # hidden layer
  encoder <- encoder %>%
    layer_dense( units = tail( numberOfUnitsPerLayer, 1 ) )

  autoencoder <- encoder

  # internal layers in decoder
  for( i in seq( from = numberOfEncodingLayers, to = 2, by = -1 ) )
  {
    autoencoder <- autoencoder %>%
      layer_dense( units = numberOfUnitsPerLayer[i],
                   activation = activation, kernel_initializer = initializer )
  }

  # output 
  autoencoder <- autoencoder %>%
    layer_dense( numberOfUnitsPerLayer[1], kernel_initializer = initializer, name = 'decoder' )

  return( list(
    autoencoderModel = keras_model( inputs = inputs, outputs = autoencoder ),
    encoderModel = keras_model( inputs = inputs, outputs = encoder ) ) )
}
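As a quick sanity check of the architecture (a minimal sketch; the layer sizes mirror the call further down in this script), the encoder should end in a 10-dimensional embedding and the autoencoder in a 784-dimensional reconstruction:

# Minimal sketch: build the autoencoder with the sizes used below and
# inspect the layer shapes.
ae <- createAutoencoderModel( c( 784, 32, 32, 256, 10 ) )
summary( ae$encoderModel )      # final layer: 10-d embedding
summary( ae$autoencoderModel )  # final layer: 784-d reconstruction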



# Definition of the clustering layer ------------------------------------------------------------------------

ClusteringLayer <- R6::R6Class( "ClusteringLayer",
                                inherit = KerasLayer,
                                lock_objects = FALSE,
                                public = list(
                                  numberOfClusters = 10,
                                  initialClusterWeights = NULL,
                                  alpha = 1.0,
                                  name = '',

                                  initialize = function( numberOfClusters,
                                                         initialClusterWeights = NULL, alpha = 1.0, name = '' )
                                  {
                                    self$numberOfClusters <- numberOfClusters
                                    self$initialClusterWeights <- initialClusterWeights
                                    self$alpha <- alpha
                                    self$name <- name
                                  },

                                  build = function( input_shape )
                                  {
                                    if( length( input_shape ) != 2 )
                                    {
                                      stop( paste0( "input_shape is not of length 2." ) )
                                    }

                                    self$clusters <- self$add_weight(
                                      shape = list( self$numberOfClusters, input_shape[[2]] ),
                                      initializer = 'glorot_uniform', name = 'clusters' )

                                    if( ! is.null( self$initialClusterWeights ) )
                                    {
                                      self$set_weights( self$initialClusterWeights )
                                      self$initialClusterWeights <- NULL
                                    }
                                    self$built <- TRUE
                                  },

                                  call = function( inputs, mask = NULL )
                                  {
                                    # Uses Student t-distribution (same as t-SNE)
                                    # inputs are the variable containing the data, shape = ( numberOfSamples, numberOfFeatures )

                                    K <- keras::backend()

                                    q <- 1.0 / ( 1.0 + ( K$sum( K$square(
                                      K$expand_dims( inputs, axis = 1L ) - self$clusters ), axis = 2L ) / self$alpha ) )
                                    q <- q^( ( self$alpha + 1.0 ) / 2.0 )
                                    q <- K$transpose( K$transpose( q ) / K$sum( q, axis = 1L ) )

                                    return( q )
                                  },

                                  compute_output_shape = function( input_shape )
                                  {
                                    return( list( input_shape[[1]], self$numberOfClusters ) )
                                  }
                                )
)

layer_clustering <- function( object,
                              numberOfClusters, initialClusterWeights = NULL,
                              alpha = 1.0, name = '' )
{
  create_layer( ClusteringLayer, object,
                list( numberOfClusters = numberOfClusters,
                      initialClusterWeights = initialClusterWeights,
                      alpha = alpha, name = name )
  )
}
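For reference, call() above implements the soft assignment from the DEC paper: q_ij = (1 + ||z_i - mu_j||^2 / alpha)^(-(alpha + 1)/2), normalized so each row of q sums to 1. A minimal base-R sketch of the same computation on made-up toy values:

# Toy check of the Student-t soft assignment, outside of Keras:
z     <- matrix( rnorm( 5 * 3 ), nrow = 5 )   # 5 samples in a 3-d embedding
mu    <- matrix( rnorm( 2 * 3 ), nrow = 2 )   # 2 cluster centers
alpha <- 1.0
d2 <- outer( seq_len( nrow( z ) ), seq_len( nrow( mu ) ),
             Vectorize( function( i, j ) sum( ( z[i, ] - mu[j, ] )^2 ) ) )
q <- ( 1 + d2 / alpha )^( -( alpha + 1 ) / 2 )
q <- q / rowSums( q )   # soft memberships; each row sums to 1
rowSums( q )            # all 1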

#' Deep embedded clustering (DEC) model class --------------------------------------------------------------

DeepEmbeddedClusteringModel <- R6::R6Class( "DeepEmbeddedClusteringModel",
                                            inherit = NULL,
                                            lock_objects = FALSE,
                                            public = list(
                                              numberOfUnitsPerLayer = NULL,
                                              numberOfClusters = 10,
                                              alpha = 1.0,
                                              initializer = 'glorot_uniform',
                                              convolutional = FALSE,
                                              inputImageSize = NULL,

                                              initialize = function( numberOfUnitsPerLayer,
                                                                     numberOfClusters, alpha = 1.0, initializer = 'glorot_uniform',
                                                                     convolutional = FALSE, inputImageSize = NULL )
                                              {
                                                self$numberOfUnitsPerLayer <- numberOfUnitsPerLayer
                                                self$numberOfClusters <- numberOfClusters
                                                self$alpha <- alpha
                                                self$initializer <- initializer
                                                self$convolutional <- convolutional
                                                self$inputImageSize <- inputImageSize

                                                ae <- createAutoencoderModel( self$numberOfUnitsPerLayer,
                                                                              initializer = self$initializer )
                                                self$autoencoder <- ae$autoencoderModel
                                                self$encoder <- ae$encoderModel


                                                # prepare DEC model

                                                clusteringLayer <- self$encoder$output %>%
                                                  layer_clustering( self$numberOfClusters, name = "clustering" )

                                                self$model <- keras_model( inputs = self$encoder$input, outputs = clusteringLayer )

                                              },
                                              pretrain = function( x, optimizer = 'adam', epochs = 200L, batchSize = 256L )
                                              {
                                                self$autoencoder$compile( optimizer = optimizer, loss = 'mse' )
                                                self$autoencoder$fit( x, x, batch_size = batchSize, epochs = epochs )
                                              },

                                              loadWeights = function( weights )
                                              {
                                                self$model$load_weights( weights )
                                              },

                                              extractFeatures = function( x )
                                              {
                                                self$encoder$predict( x, verbose = 0 )
                                              },

                                              predictClusterLabels = function( x )
                                              {
                                                clusterProbabilities <- self$model$predict( x, verbose = 0 )
                                                return( max.col( clusterProbabilities ) )
                                              },

                                              targetDistribution = function( q )
                                              {
                                                # Python original: weight = q**2 / q.sum(0)
                                                #                  p = (weight.T / weight.sum(1)).T
                                                # sweep() makes the division genuinely column-wise and
                                                # row-wise; plain matrix / vector arithmetic in R
                                                # recycles the vector column-major instead.
                                                weight <- sweep( q^2, 2, colSums( q ), FUN = "/" )
                                                p <- sweep( weight, 1, rowSums( weight ), FUN = "/" )
                                                return( p )
                                              },

                                              compile = function( optimizer = 'sgd', loss = 'kld', lossWeights = NULL )
                                              {
                                                self$model$compile( optimizer = optimizer, loss = loss, loss_weights = lossWeights )
                                              },

                                              fit = function( x, maxNumberOfIterations = 2e4, batchSize = 256L, tolerance = 1e-3, updateInterval = 10)
                                              {
                                                # Initialize clusters using k-means

                                                km <- stats::kmeans( self$encoder$predict( x, verbose = 0 ),
                                                                     centers = self$numberOfClusters, nstart = 20 )
                                                currentPrediction <- km$cluster # fitted( km )
                                                previousPrediction <- currentPrediction

                                                self$model$get_layer( name = 'clustering' )$set_weights( list( km$centers ) )

                                                # Deep clustering

                                                loss <- 10000
                                                index <- 0
                                                indexArray <- 1:( dim( x )[1] )

                                                for( i in seq_len( maxNumberOfIterations ) )
                                                {
                                                  if( i %% updateInterval == 1 )
                                                  {
                                                    q <- self$model$predict( x, verbose = 0 )
                                                    p <- self$targetDistribution( q )

                                                    # Met stopping criterion

                                                    currentPrediction <- max.col( q )

                                                    plot(currentPrediction, col="blue")
                                                    title(main = 'Current prediction')

                                                    deltaLabel <- sum( currentPrediction != previousPrediction ) / length( currentPrediction )

                                                    cat( "Itr", i, ": ( out of", maxNumberOfIterations,
                                                         "): loss = [", unlist( loss ), "], deltaLabel =", deltaLabel,
                                                         ", ACC= ", Accuracy(previousPrediction, currentPrediction),
                                                         ", NMI= ", NMI(previousPrediction, currentPrediction), "\n", sep = ' ' )

                                                    previousPrediction <- currentPrediction

                                                    if( i > 1 && deltaLabel < tolerance )
                                                    {
                                                      print('Reached tolerance threshold. Stopping training......')
                                                      break
                                                    }
                                                  }


                                                  # train on batch

                                                  batchIndices <- indexArray[( index * batchSize + 1 ):min( ( index + 1 ) * batchSize, dim( x )[1] )]
                                                  loss <- self$model$train_on_batch( x = x[batchIndices,], y = p[batchIndices,] )

                                                  if( ( index + 1 ) * batchSize + 1 <= dim( x )[1] )
                                                  {
                                                    index <- index + 1
                                                  } else {
                                                    index <- 0
                                                  }
                                                }
                                                return( currentPrediction )
                                              }
                                            )
)
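A note on targetDistribution() above: the Python original computes weight = q**2 / q.sum(0), i.e., each column of q**2 is divided by its column sum, and then each row of weight by its row sum. In R, matrix / vector arithmetic recycles the vector column-major, so an expression like q^2 / colSums( q ) would not perform column-wise division; sweep() (or the transpose idiom) is needed. A tiny demonstration:

# R recycles vectors column-major, so matrix / vector is row-wise division
# only when length( vector ) == nrow( matrix ):
m <- matrix( 1:6, nrow = 2 )       # 2 x 3 matrix
m / colSums( m )                   # NOT column-wise division (misaligned recycling)
sweep( m, 2, colSums( m ), "/" )   # correct: divide each column by its sum
t( t( m ) / colSums( m ) )         # equivalent transpose idiom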

# loading dataset  ---------------------------------------------------------------------------------------------
fmnist <- dataset_fashion_mnist()

numberOfTrainingData <- length( fmnist$train$y )
numberOfTestingData <- length( fmnist$test$y )

numberOfPixels <- prod( dim( fmnist$test$x[1,,] ) )
fmnist$train$xreshaped <- array_reshape( fmnist$train$x,
                                         dim = c( numberOfTrainingData, numberOfPixels ), order = "C" )
fmnist$test$xreshaped <- array_reshape( fmnist$test$x,
                                        dim = c( numberOfTestingData, numberOfPixels ), order = "C" )

x <- rbind( fmnist$test$xreshaped, fmnist$train$xreshaped )/255.0
y <- c( fmnist$test$y, fmnist$train$y )

numberOfClusters <- length( unique( fmnist$train$y ) )

initializer <- initializer_variance_scaling(
  scale = 1/3, mode = 'fan_in', distribution = 'uniform' )
pretrainOptimizer <- optimizer_sgd( lr = 1.0, momentum = 0.9 )

decModel <- DeepEmbeddedClusteringModel$new(
  numberOfUnitsPerLayer = c( numberOfPixels, 32, 32, 256, 10 ),
  numberOfClusters = numberOfClusters, initializer = initializer )

decModel$pretrain( x = x, optimizer = pretrainOptimizer,
                   epochs = 10L, batchSize = 256L )

decModel$compile( optimizer = optimizer_sgd( lr = 1.0, momentum = 0.9 ), loss = 'kld' )

yPredicted <- decModel$fit( x, maxNumberOfIterations = 2e4, batchSize = 256,
                            tolerance = 1e-3, updateInterval = 10 )
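
A side note on evaluation: Accuracy() and NMI() inside fit() compare previousPrediction against currentPrediction, so they measure label churn between update intervals rather than agreement with the true classes. To score the final clustering against the held-out labels y, one option is unsupervised clustering accuracy via the Hungarian algorithm; a hedged sketch using clue::solve_LSAP (the clue package must be installed separately, and the helper name clusterAccuracy is mine, not part of the code above):

library( clue )  # provides solve_LSAP() (Hungarian assignment)

# Hypothetical helper: find the best one-to-one mapping from cluster ids to
# class ids, then report the fraction of correctly mapped samples.
clusterAccuracy <- function( yTrue, yPred )
{
  confusion <- table( yPred, yTrue )   # rows: clusters, columns: classes
  mapping <- solve_LSAP( confusion, maximum = TRUE )
  sum( confusion[cbind( seq_along( mapping ), as.integer( mapping ) )] ) /
    length( yTrue )
}

clusterAccuracy( y, yPredicted )  # label values need not match; the mapping handles it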


Train on 70000 samples
Epoch 1/10
70000/70000 [==============================] - 4s 60us/sample - loss: 0.0795
Epoch 2/10
70000/70000 [==============================] - 3s 45us/sample - loss: 0.0651
Epoch 3/10
70000/70000 [==============================] - 3s 46us/sample - loss: 0.0470
Epoch 4/10
70000/70000 [==============================] - 3s 45us/sample - loss: 0.0430
Epoch 5/10
70000/70000 [==============================] - 3s 45us/sample - loss: 0.0402
Epoch 6/10
70000/70000 [==============================] - 3s 45us/sample - loss: 0.0359
Epoch 7/10
70000/70000 [==============================] - 3s 45us/sample - loss: 0.0345
Epoch 8/10
70000/70000 [==============================] - 3s 45us/sample - loss: 0.0337
Epoch 9/10
70000/70000 [==============================] - 3s 45us/sample - loss: 0.0326
Epoch 10/10
70000/70000 [==============================] - 3s 45us/sample - loss: 0.0311
<tensorflow.python.keras.callbacks.History>

Figure 3. Autoencoder pre-training for 10 epochs

Itr 1 : ( out of 20000 ): loss = [ 10000 ], deltaLabel = 0 , ACC=  1 , NMI=  1 
Itr 11 : ( out of 20000 ): loss = [ 1.02756 ], deltaLabel = 0.8403571 , ACC=  0.1596429 , NMI=  0.2638058 
Itr 21 : ( out of 20000 ): loss = [ 1.016267 ], deltaLabel = 0.3924 , ACC=  0.6076 , NMI=  0 
Itr 31 : ( out of 20000 ): loss = [ 1.467916 ], deltaLabel = 0 , ACC=  1 , NMI=  NaN 
[1] "Reached tolerance threshold. Stopping training......"

Figure 4. Output of fitting the DEC model

For comparison, Figures 5 and 6 show the corresponding results from the Python implementation:


Figure 5. Label initialization with k-means (Python)


Figure 6. Final labeling after the model converges (x-axis: samples; y-axis: labels)

Could you let me know why this happens? I have tried other loss functions (e.g., 'categorical_crossentropy'), but the same behavior occurs.

Machine information:

  1. Python 3.7, run from Spyder 4.1
  2. RStudio version 1.2.5033
  3. The Fashion-MNIST dataset (dataset_fashion_mnist) for both implementations

The Python implementation: https://www.dropbox.com/s/ii3k7rklz7z6446/DEC_original.py?dl=0

...