Custom sklearn estimator - ST-DBSCAN: RandomizedSearchCV error
0 votes
/ 31 March 2020

I am trying to use a custom scikit-learn estimator to run ST-DBSCAN, a spatio-temporal DBSCAN clustering algorithm. I have a dataset, and to find out how well ST-DBSCAN clusters the data points, there is a 'cid' attribute that is the ground-truth label (cluster ID) for each data point. The idea is therefore to compare the predicted label against the ground-truth label for each data point and thereby assign an accuracy score to the 'ST-DBSCAN' clustering algorithm.
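To illustrate what I mean by "accuracy" here (a minimal, hypothetical example with made-up arrays, not my real data), the per-point comparison would look like the following. Note that it implicitly assumes the predicted cluster IDs use the same numbering as 'cid':

import numpy as np
from sklearn.metrics import accuracy_score

# Hypothetical example: ground-truth cluster IDs ('cid') and the labels
# predicted by ST-DBSCAN for the same five data points (-1 = noise)
y_true = np.array([0, 0, 1, 1, -1])
y_pred = np.array([0, 0, 1, 1, -1])

# Per-point exact match of cluster IDs; only meaningful when the predicted
# numbering happens to coincide with the ground-truth numbering
print(accuracy_score(y_true, y_pred))
# 1.0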

The code I have is as follows:

from sklearn.base import BaseEstimator, ClusterMixin
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform
from sklearn.metrics import accuracy_score, precision_score, recall_score
import time
from sklearn.model_selection import RandomizedSearchCV




class ST_DBSCAN(BaseEstimator, ClusterMixin):
    """
    Spatio-Temporal DBSCAN algorithm for scikit-learn compatibility
    # eps1 spatial neighborhood
    # eps2 Time Neighborhood
    # minPts the minimum number of points satisfying the double neighborhood


    All estimators must have 'get_params()'' and 'set_params()' functions. They are inherited
    when you subclass 'BaseEstimator' and the recommendation is not to override these function
    (just not state them in definition of your classifier).
    """

    def __init__(self, eps1=0.5, eps2=10, minPts=5):
        '''
        All arguments must have default values so that the clustering object
        can be initialized without any parameters. Do not take data as an
        argument here; it belongs in the fit() method. Parameters must have
        the same names as the attributes they are stored in.
        '''
        self.eps1 = eps1
        self.eps2 = eps2
        self.minPts = minPts
        self.predicted_labels = None


    def compute_squared_EDM(self, X):
        # Return the pairwise Euclidean distance matrix of X
        return squareform(pdist(X, metric='euclidean'))



    def fit(self, X, y=None):
        '''
        All the hard work happens here. First, validate the parameters; then
        take and process the data. You will almost surely want to create new
        attributes on the object inside fit(); their names should end with an
        underscore, e.g. self.fitted_.

        Finally, return 'self', again for compatibility with the common
        scikit-learn interface.
        '''


        # Number of samples 'n' and number of columns 'm'
        self.n_, self.m_ = X.shape

        # Pairwise temporal distance matrix (column 0 of X holds the 'frame' time)
        self.timeDisMat_ = self.compute_squared_EDM(X[:, 0].reshape(self.n_, 1))

        # Pairwise spatial distance matrix (the remaining columns hold 'x' and 'y')
        self.disMat_ = self.compute_squared_EDM(X[:, 1:])

        # Mark matrix entries that lie within both the spatial (eps1) and
        # temporal (eps2) neighborhoods with 1 and all others with 0, sum each
        # row to count every point's neighbors, and take the indices of rows
        # with at least minPts neighbors: these are the core points.
        self.core_points_index_ = np.where(np.sum(np.where((self.disMat_ <= self.eps1) &
            (self.timeDisMat_ <= self.eps2), 1, 0), axis=1) >= self.minPts)[0]

        # Initialize the labels; -1 means unclassified (noise)
        self.labels_ = np.full((self.n_,), -1)
        self.clusterId_ = 0

        # Iterate over all core points
        for pointId in self.core_points_index_:
            # If this core point is not yet classified, use it as a seed point
            # and start growing the corresponding cluster
            if (self.labels_[pointId] == -1):
                # Assign pointId to the current cluster
                self.labels_[pointId] = self.clusterId_

                # Find the still-unclassified points in the spatio-temporal
                # neighborhood of the seed point and put them into the seed set
                self.neighbour_ = np.where((self.disMat_[:, pointId] <= self.eps1) &
                    (self.timeDisMat_[:, pointId] <= self.eps2) & (self.labels_ == -1))[0]
                self.seeds_ = set(self.neighbour_)

                # Grow the cluster from the seed points, collecting density-reachable
                # points until the seed set is empty; one cluster is then complete
                while len(self.seeds_) > 0:
                    # Pop a new seed point
                    newPoint = self.seeds_.pop()

                    # Assign newPoint to the current cluster
                    self.labels_[newPoint] = self.clusterId_

                    # Find the spatio-temporal neighborhood of newPoint (including itself)
                    self.queryResults_ = set(np.where((self.disMat_[:, newPoint] <= self.eps1) &
                        (self.timeDisMat_[:, newPoint] <= self.eps2))[0])

                    # If newPoint is itself a core point, it can be expanded,
                    # i.e. further points are density-reachable through it
                    if len(self.queryResults_) >= self.minPts:
                        # Push the unclassified points of the neighborhood into the seed set
                        for resultPoint in self.queryResults_:
                            if self.labels_[resultPoint] == -1:
                                self.seeds_.add(resultPoint)

                # The cluster has finished growing; move on to the next cluster ID
                self.clusterId_ = self.clusterId_ + 1

        self.predicted_labels = self.labels_

        # Return 'self' (not the labels) for scikit-learn compatibility
        return self


    def score(self, X, y):
        return accuracy_score(y, self.predicted_labels)


    def get_labels(self):
        return self.predicted_labels
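
As a sanity check of scikit-learn compatibility (RandomizedSearchCV relies on this machinery to clone the estimator with sampled parameters), 'get_params()' inherited from 'BaseEstimator' should report the constructor arguments:

stdb_check = ST_DBSCAN(eps1=0.1, eps2=60, minPts=5)

# 'get_params()' and 'set_params()' are inherited from BaseEstimator and are
# what RandomizedSearchCV uses to build candidate estimators
print(stdb_check.get_params())
# {'eps1': 0.1, 'eps2': 60, 'minPts': 5}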

The sample dataset I have contains the following attributes - 'frame' (for time), 'x' and 'y' (for the spatial aspects), and 'cid' (the ground truth for each data point). The code:

# Read in the CSV file
data = pd.read_csv("Clustering_Ground_Truth_Data.csv")

# Take the first 1000 records
data_mod = data.loc[:1000, ['frame', 'x', 'y']]

data_mod.shape
# (1001, 3)

# Get numpy values instead of a Pandas DataFrame
data_mod = data_mod.values

X = data.loc[:1000, ['frame', 'x', 'y']]
y = data.loc[:1000, 'cid']

X = X.values
y = y.values

# Get shapes
X.shape, y.shape
# ((1001, 3), (1001,))

To use the clustering code above -

# Initialize an instance of the 'ST_DBSCAN' class
stdb = ST_DBSCAN(0.1, 60, 5)

# Perform ST-DBSCAN clustering and store a label for each data point
stdb.fit(X, y)

labels = stdb.get_labels()
# Cluster labels. Noisy samples are given the label -1
# Cluster labels. Noisy samples are given the label -1


# Get unique elements and their counts
unique, counts = np.unique(labels, return_counts=True)

# Create a dictionary mapping element -> count
element_count = dict(zip(unique, counts))

print("\nPredictions of data points using ST-DBSCAN, element_count:\n{0}\n\n".format(element_count))
print("\nAccuracy score: {0:.4f}\n".format(stdb.score(X, y)))

This code works fine. However, performing a hyperparameter search with RandomizedSearchCV throws an error:

# RandomizedSearchCV parameters
random_params = {
    'eps1': [0.1, 0.01, 0.001],
    'eps2': [x for x in range(40, 101, 5)]
    }


rf_st_dbscan = RandomizedSearchCV(
    estimator = stdb,
    param_distributions = random_params
    # scoring = 'accuracy'
    )


# This line gives the error
rf_st_dbscan.fit(X, y)

The error is as follows:

ValueError                                Traceback (most recent call last)
<ipython-input-15-90f41752b564> in <module>
----> 1 rf_st_dbscan.fit(X, y)

~/.local/lib/python3.8/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
    708                 return results
    709 
--> 710             self._run_search(evaluate_candidates)
    711 
    712         # For multi-metric evaluation, store the best_index_, best_params_ and

~/.local/lib/python3.8/site-packages/sklearn/model_selection/_search.py in _run_search(self, evaluate_candidates)
   1480     def _run_search(self, evaluate_candidates):
   1481         """Search n_iter candidates from param_distributions"""
-> 1482         evaluate_candidates(ParameterSampler(
   1483             self.param_distributions, self.n_iter,
   1484             random_state=self.random_state))

~/.local/lib/python3.8/site-packages/sklearn/model_selection/_search.py in evaluate_candidates(candidate_params)
    680                               n_splits, n_candidates, n_candidates * n_splits))
    681 
--> 682                 out = parallel(delayed(_fit_and_score)(clone(base_estimator),
    683                                                        X, y,
    684                                                        train=train, test=test,

~/.local/lib/python3.8/site-packages/joblib/parallel.py in __call__(self, iterable)
   1002             # remaining jobs.
   1003             self._iterating = False
-> 1004             if self.dispatch_one_batch(iterator):
   1005                 self._iterating = self._original_iterator is not None
   1006 

~/.local/lib/python3.8/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
    833                 return False
    834             else:
--> 835                 self._dispatch(tasks)
    836                 return True
    837 

~/.local/lib/python3.8/site-packages/joblib/parallel.py in _dispatch(self, batch)
    752         with self._lock:
    753             job_idx = len(self._jobs)
--> 754             job = self._backend.apply_async(batch, callback=cb)
    755             # A job can complete so quickly than its callback is
    756             # called before we get here, causing self._jobs to

~/.local/lib/python3.8/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
    207     def apply_async(self, func, callback=None):
    208         """Schedule a func to be run"""
--> 209         result = ImmediateResult(func)
    210         if callback:
    211             callback(result)

~/.local/lib/python3.8/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
    588         # Don't delay the application, to avoid keeping the input
    589         # arguments in memory
--> 590         self.results = batch()
    591 
    592     def get(self):

~/.local/lib/python3.8/site-packages/joblib/parallel.py in __call__(self)
    253         # change the default number of processes to -1
    254         with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 255             return [func(*args, **kwargs)
    256                     for func, args, kwargs in self.items]
    257 

~/.local/lib/python3.8/site-packages/joblib/parallel.py in <listcomp>(.0)
    253         # change the default number of processes to -1
    254         with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 255             return [func(*args, **kwargs)
    256                     for func, args, kwargs in self.items]
    257 

~/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
    542     else:
    543         fit_time = time.time() - start_time
--> 544         test_scores = _score(estimator, X_test, y_test, scorer)
    545         score_time = time.time() - start_time - fit_time
    546         if return_train_score:

~/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py in _score(estimator, X_test, y_test, scorer)
    589         scores = scorer(estimator, X_test)
    590     else:
--> 591         scores = scorer(estimator, X_test, y_test)
    592 
    593     error_msg = ("scoring must return a number, got %s (%s) "

~/.local/lib/python3.8/site-packages/sklearn/metrics/_scorer.py in __call__(self, estimator, *args, **kwargs)
     87                                       *args, **kwargs)
     88             else:
---> 89                 score = scorer(estimator, *args, **kwargs)
     90             scores[name] = score
     91         return scores

~/.local/lib/python3.8/site-packages/sklearn/metrics/_scorer.py in _passthrough_scorer(estimator, *args, **kwargs)
    369 def _passthrough_scorer(estimator, *args, **kwargs):
    370     """Function that wraps estimator.score"""
--> 371     return estimator.score(*args, **kwargs)
    372 
    373 

~/University_of_Konstanz/Hiwi/Unsupervised_Learning_Works/Spatio-temporal-Clustering-master/Custom_Estimator_scikit_learn_Tutorials.py in score(self, X, y)
    134 
    135         def score(self, X, y):
--> 136                 return accuracy_score(y, self.predicted_labels)
    137 
    138 

~/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py in accuracy_score(y_true, y_pred, normalize, sample_weight)
    183 
    184     # Compute accuracy for each possible representation
--> 185     y_type, y_true, y_pred = _check_targets(y_true, y_pred)
    186     check_consistent_length(y_true, y_pred, sample_weight)
    187     if y_type.startswith('multilabel'):

~/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py in _check_targets(y_true, y_pred)
     78     y_pred : array or indicator matrix
     79     """
---> 80     check_consistent_length(y_true, y_pred)
     81     type_true = type_of_target(y_true)
     82     type_pred = type_of_target(y_pred)

~/.local/lib/python3.8/site-packages/sklearn/utils/validation.py in check_consistent_length(*arrays)
    209     uniques = np.unique(lengths)
    210     if len(uniques) > 1:
--> 211         raise ValueError("Found input variables with inconsistent numbers of"
    212                          " samples: %r" % [int(l) for l in lengths])
    213 

ValueError: Found input variables with inconsistent numbers of samples: [201, 800]

Why am I getting this 'ValueError'?
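
My suspicion is the following: the two sample counts match the default 5-fold cross-validation split of my 1001 data points (800 training samples, 201 test samples), and 'score()' compares the 201-element 'y_test' against 'self.predicted_labels', which still holds the 800 labels produced by 'fit()' on the training fold. A minimal sketch of a length-consistent 'score()' (assuming it is acceptable to re-cluster whatever X is passed in) might be:

    def score(self, X, y):
        # Sketch only: cluster the X that scoring actually receives, so the
        # number of predicted labels always matches len(y) on every CV fold.
        # Caveat: DBSCAN-style cluster IDs are arbitrary, so plain accuracy
        # against ground-truth IDs is only meaningful if the numbering agrees.
        self.fit(X)
        return accuracy_score(y, self.labels_)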

...