Извлечение базовых точек модели DBSCAN в Python 3 - PullRequest
0 голосов
/ 06 января 2020

Я пытаюсь извлечь основные точки, используя алгоритм DBSCAN. Проблема с кодом заключается в том, что он маркирует каждую точку (включая основные точки) и не предоставляет способ доступа к основным или пограничным точкам для данного кластера.

Код с пояснениями, которые я использовал, следующим образом:

def DBSCAN_Algo(D, eps, min_pts):
    """
    Cluster dataset 'D' using DBSCAN algorithm

    Input:
    eps is the epsilon threshold radius
    min_points is the minimum number of points required
    to form a core point

    Returns a list of cluster labels.
    Cluster labels start from 1 and onwards.
    Label of -1 means noise
    '''

    # List to hold final cluster assignment for each point in 'D'
    # There are 2 fixed values-
    # -1 means a noise point
    # 0 means the point has not been visited yet
    """

    n, m = D.shape

    # Initially, all labels are zeros-
    labels = np.zeros(n)

    # 'C' is the current cluster ID-
    C = 0

    # Outer loop to pick new seed points - a point from which to
    # grow a new cluster
    # Once a valid seed point is found, a new cluster is created
    # The cluster expansion/growth is handled by 'expandCluster()'

    # 'p' is the index of data point, rather than the data point itself
    for p in range(0, n):
        # Only points that have NOT been visited before can be picked
        # as new seed points
        # If the point's label is not 0, continue to next point
        if not (labels[p] == 0):    # if labels[p] != 0:
            continue

        # Find all neighbors to point 'p'-
        neighbor_pts = regionQuery(D, p, eps)

        # If number of points within epsilon distance is less than
        # min_pts, label point 'p' as noise-
        if len(neighbor_pts) < min_pts:
            labels[p] = -1

        # Otherwise, if this is a core point, use this point as seed
        # for a new cluster
        else:
            # Get the next cluster ID-
            C += 1

            # Assign the cluster ID to seed point-
            labels[p] = C

            # Grow/expand the cluster using this seed poit-
            growCluster(D, labels, p, C, eps, min_pts)

    # All data points have been clustered-
    return labels


def growCluster(D, labels, P, C, eps, min_pts):
    """
    Expand a new cluster with ID 'C' from the seed point 'P'

    This function searches through dataset 'D' to find all points
    belonging to this new cluster 'C'.
    When this function returns, cluster 'C' is completey
    expanded/grown.

    D:          Dataset
    labels:     List storing cluster labels/IDs for all dataset points
    P:          Index of seed point for this new cluster
    C:          Label for this new cluster
    eps:        Threshold distance
    min_pts:    Minimum number of neibhoring points required
    """
    # 'SearchQueue' is a FIFO queue of points to evaluate.
    # It will only ever contain points which belong to cluster
    # 'C' (and have already been labeled as such).

    # The points are represented by their index values (not
    # the actual vector).

    # The FIFO queue behavior is accomplished by appending new
    # points to the end of the list, and using a while-loop
    # rather than a for-loop.
    SearchQueue = [P]

    # For each point in the queue-
    # 1.) Determine whether it's a border point or a noise point
    # 2.) For border points, add their unvisited neighbors to
    # 'SearchQueue'
    i = 0

    while i < len(SearchQueue):
        # Get the next point from queue-
        P = SearchQueue[i]

        # Find all neighbors of 'P'-
        neighbor_pts = regionQuery(D, P, eps)

        # If the number of neighbors < min_pts, then this is
        # a border point and we move on to the next point in
        # the queue-
        if len(neighbor_pts) < min_pts:
            i += 1
            continue

        # Otherwise, we have the minimum number of neighbors,
        # and this is a core point-

        # for each of the neighbors-
        for Pn in neighbor_pts:
            # If 'Pn' was labelled as NOISE during the seed search,
            # then we konw it's not a core point (as it does not
            # have enough number of neighbors), so make it a a noise
            # point of cluster 'C' and continue-
            if labels[Pn] == -1:
                labels[Pn] = C

            # Otherwise, if 'Pn' isn't already visited, claim it
            # as part of 'C' and add it to search queue-
            elif labels[Pn] == 0:
                # Add 'Pn' to cluster 'C'-
                labels[Pn] = C

                # Add 'Pn' to 'SearchQueue'-
                SearchQueue.append(Pn)

        # Advance to next data point in FIFO queue-
        i += 1


def regionQuery(D, P, eps):
    """
    Function to find all data points in dataset 'D' within
    'eps' distance of point 'P'

    This function calculates the distance between a point 'P'
    and all other points in dataset 'D', and then returns
    only those data points which are within the threshold
    'eps' radius
    """

    neighbors = []

    # For each point in dataset 'D'-
    for Pn in range(0, D.shape[0]):
        # If distance < eps, add the point to the list-
        if np.linalg.norm(D[P] - D[Pn], ord = 2) < eps:
            neighbors.append(Pn)

    return neighbors


# Read first 1000 rows of CSV file-
data = pd.read_csv("some_file.csv", nrows = 5000)

# Only get the 'x' & 'y' attributes-
data = data.loc[:, ['x', 'y']]

# Convert from Pandas to numpy-
data_values = data.values

data_values.shape
# (1000, 2)

# Scale data-
data_values = StandardScaler().fit_transform(data_values)

# Visualize data distribution-
# plt.scatter(data_values[:, 0], data_values[:, 1])


# Use our implementation of DBSCAN algorithm-
labels_algo = DBSCAN_Algo(data_values, eps = 0.03, min_pts = 5)

type(labels_algo)                                                      
# numpy.ndarray

labels_algo.shape                                                      
# (1000,)

labels_algo[:20]                                                       
"""
array([76., -1.,  1.,  2., -1., -1., 77., 78.,  3.,  4.,  5., 79.,  6.,
        7., -1., -1.,  8., 94.,  9., 80.])
"""

len(set(labels_algo))                                                  
# 122

# Data points having label of -1 are noise-
np.count_nonzero(labels_algo == -1)                                    
# 200

# Get cluster labels and the number of elements it contains-
unique, counts = np.unique(labels_algo, return_counts=True) 

# Create a dictionary such that- 
# cluster_label: count of elements in it
element_count = dict(zip(unique, counts))

Это показывает, что для нашего набора данных из 1000 точек данных, используя параметры eps = 0.03, min_pts = 5, реализация алгоритма DBSCAN сгруппировала 1000 точек данных в (122 - 1 ) = 121 метка или кластер, поскольку метка -1 означает шум.

Есть ли способ, которым я могу хранить базовые точки и граничные точки в отдельной структуре данных, скажем, как отображение между базовой точкой и всеми Точки данных присваиваются ему с помощью словаря или списка?

Спасибо!

...