kneeliverse.clustering

The following module provides a 1D version of hierarchical clustering, optimized for linear complexity due to the use of a single dimension.

  1# coding: utf-8
  2
  3'''
  4The following module provides a 1D version 
  5of hierarchical clustering, optimized for 
  6linear complexity due to the use of a 
  7single dimension.
  8'''
  9
 10__author__ = 'Mário Antunes'
 11__version__ = '1.0'
 12__email__ = 'mario.antunes@ua.pt'
 13__status__ = 'Development'
 14__license__ = 'MIT'
 15__copyright__ = '''
 16Copyright (c) 2021-2023 Stony Brook University
 17Copyright (c) 2021-2023 The Research Foundation of SUNY
 18'''
 19
 20
 21import math
 22import numpy as np
 23
 24
 25def single_linkage(points: np.ndarray, t: float = 0.01) -> np.ndarray:
 26    """
 27    Computes the 1D clustering of the input points.
 28
 29    Efficient implementation that uses a single pass to compute the clusters.
 30    Computes the single linkage clustering based only on the x axis:
 31    $$
 32        D(C_1, C_2) = \\min_{c_1  \\in C_1, c_2 \\in C_2} d(c_1, c_2)
 33    $$
 34
 35    Args:
 36        points (np.ndarray): numpy array with the points (x, y)
 37        t (float): the threshold for merging (in percentage, default 0.01)
 38
 39    Returns:
 40        np.ndarray: the clusters ids
 41    """
 42    clusters = []
 43    cluster_index = 0
 44    length = points[-1, 0] - points[0, 0]
 45
 46    # First Point is a cluster
 47    clusters.append(cluster_index)
 48
 49    for i in range(1, len(points)):
 50        distance = math.fabs(points[i][0]-points[i-1][0])/length
 51        if distance >= t:
 52            cluster_index += 1
 53        clusters.append(cluster_index)
 54
 55    return np.array(clusters)
 56
 57
 58def complete_linkage(points: np.ndarray, t: float = 0.01) -> np.ndarray:
 59    """
 60    Computes the 1D clustering of the input points.
 61
 62    Efficient implementation that uses a single pass to compute the clusters.
 63    Computes the complete linkage clustering based only on the x axis:
 64    $$
 65        D(C_1, C_2) = \\max_{c_1  \\in C_1, c_2 \\in C_2} d(c_1, c_2)
 66    $$
 67
 68    Args:
 69        points (np.ndarray): numpy array with the points (x, y)
 70        t (float): the threshold for merging (in percentage, default 0.01)
 71
 72    Returns:
 73        np.ndarray: the clusters ids
 74    """
 75    clusters = []
 76    cluster_index = 0
 77    length = points[-1, 0] - points[0, 0]
 78
 79    # First Point is a cluster
 80    clusters.append(cluster_index)
 81    cluster_point_idx = 0
 82
 83    for i in range(1, len(points)):
 84        distance = math.fabs(points[i][0]-points[cluster_point_idx][0])/length
 85        if distance >= t:
 86            cluster_index += 1
 87            cluster_point_idx = i
 88        clusters.append(cluster_index)
 89
 90    return np.array(clusters)
 91
 92
 93def centroid_linkage(points: np.ndarray, t: float = 0.01) -> np.ndarray:
 94    """
 95    Computes the 1D clustering of the input points.
 96
 97    Efficient implementation that uses a single pass to compute the clusters.
 98    Computes the centroid linkage clustering based only on the x axis:
 99    $$
100        D(C_1, C_2) = ||c_1 - c_2 ||
101    $$
102
103    Args:
104        points (np.ndarray): numpy array with the points (x, y)
105        t (float): the threshold for merging (in percentage, default 0.01)
106
107    Returns:
108        np.ndarray: the clusters ids
109    """
110    clusters = []
111    cluster_index = 0
112    length = points[-1, 0] - points[0, 0]
113
114    # First Point is a cluster
115    clusters.append(cluster_index)
116    cluster_center = points[0, 0]
117    cluster_size = 1
118
119    for i in range(1, len(points)):
120        distance = math.fabs(points[i][0] - cluster_center)/length
121        if distance < t:
122            clusters.append(cluster_index)
123            # Update center
124            cluster_center = (cluster_size/(cluster_size+1)) * \
125            cluster_center + (1/(cluster_size+1)) * points[i][0]
126            cluster_size += 1
127        else:
128            cluster_index += 1
129            clusters.append(cluster_index)
130            cluster_center = points[i, 0]
131            cluster_size = 1
132
133    return np.array(clusters)
134
135
136def average_linkage(points: np.ndarray, t: float = 0.01) -> np.ndarray:
137    """
138    Computes the 1D clustering of the input points.
139
140    Efficient implementation that uses a single pass to compute the clusters.
141    Computes the average linkage clustering based only on the x axis:
142    $$
143        D(C_1, C_2) = \\frac{1}{|C_1|}\\sum_{c_1 \\in C_1}d(c_1, C_2)
144    $$
145
146    Args:
147        points (np.ndarray): numpy array with the points (x, y)
148        t (float): the threshold for merging (in percentage, default 0.01)
149
150    Returns:
151        np.ndarray: the clusters ids
152    """
153    clusters = []
154    cluster_index = 0
155    length = points[-1, 0] - points[0, 0]
156
157    # First Point is a cluster
158    clusters.append(cluster_index)
159    idx = 0
160
161    for i in range(1, len(points)):
162        cluster_points = points[idx:i, 0]
163        distances = np.abs(cluster_points - points[i][0])
164        distance = np.sum(distances)/(len(cluster_points)*length)
165        if distance >= t:
166            cluster_index += 1
167            idx = i
168        clusters.append(cluster_index)
169
170    return np.array(clusters)
def single_linkage(points: numpy.ndarray, t: float = 0.01) -> numpy.ndarray:
26def single_linkage(points: np.ndarray, t: float = 0.01) -> np.ndarray:
27    """
28    Computes the 1D clustering of the input points.
29
30    Efficient implementation that uses a single pass to compute the clusters.
31    Computes the single linkage clustering based only on the x axis:
32    $$
33        D(C_1, C_2) = \\min_{c_1  \\in C_1, c_2 \\in C_2} d(c_1, c_2)
34    $$
35
36    Args:
37        points (np.ndarray): numpy array with the points (x, y)
38        t (float): the threshold for merging (in percentage, default 0.01)
39
40    Returns:
41        np.ndarray: the clusters ids
42    """
43    clusters = []
44    cluster_index = 0
45    length = points[-1, 0] - points[0, 0]
46
47    # First Point is a cluster
48    clusters.append(cluster_index)
49
50    for i in range(1, len(points)):
51        distance = math.fabs(points[i][0]-points[i-1][0])/length
52        if distance >= t:
53            cluster_index += 1
54        clusters.append(cluster_index)
55
56    return np.array(clusters)

Computes the 1D clustering of the input points.

Efficient implementation that uses a single pass to compute the clusters. Computes the single linkage clustering based only on the x axis: $$ D(C_1, C_2) = \min_{c_1 \in C_1, c_2 \in C_2} d(c_1, c_2) $$

Arguments:
  • points (np.ndarray): numpy array with the points (x, y)
  • t (float): the threshold for merging (in percentage, default 0.01)
Returns:

np.ndarray: the clusters ids

def complete_linkage(points: numpy.ndarray, t: float = 0.01) -> numpy.ndarray:
59def complete_linkage(points: np.ndarray, t: float = 0.01) -> np.ndarray:
60    """
61    Computes the 1D clustering of the input points.
62
63    Efficient implementation that uses a single pass to compute the clusters.
64    Computes the complete linkage clustering based only on the x axis:
65    $$
66        D(C_1, C_2) = \\max_{c_1  \\in C_1, c_2 \\in C_2} d(c_1, c_2)
67    $$
68
69    Args:
70        points (np.ndarray): numpy array with the points (x, y)
71        t (float): the threshold for merging (in percentage, default 0.01)
72
73    Returns:
74        np.ndarray: the clusters ids
75    """
76    clusters = []
77    cluster_index = 0
78    length = points[-1, 0] - points[0, 0]
79
80    # First Point is a cluster
81    clusters.append(cluster_index)
82    cluster_point_idx = 0
83
84    for i in range(1, len(points)):
85        distance = math.fabs(points[i][0]-points[cluster_point_idx][0])/length
86        if distance >= t:
87            cluster_index += 1
88            cluster_point_idx = i
89        clusters.append(cluster_index)
90
91    return np.array(clusters)

Computes the 1D clustering of the input points.

Efficient implementation that uses a single pass to compute the clusters. Computes the complete linkage clustering based only on the x axis: $$ D(C_1, C_2) = \max_{c_1 \in C_1, c_2 \in C_2} d(c_1, c_2) $$

Arguments:
  • points (np.ndarray): numpy array with the points (x, y)
  • t (float): the threshold for merging (in percentage, default 0.01)
Returns:

np.ndarray: the clusters ids

def centroid_linkage(points: numpy.ndarray, t: float = 0.01) -> numpy.ndarray:
 94def centroid_linkage(points: np.ndarray, t: float = 0.01) -> np.ndarray:
 95    """
 96    Computes the 1D clustering of the input points.
 97
 98    Efficient implementation that uses a single pass to compute the clusters.
 99    Computes the centroid linkage clustering based only on the x axis:
100    $$
101        D(C_1, C_2) = ||c_1 - c_2 ||
102    $$
103
104    Args:
105        points (np.ndarray): numpy array with the points (x, y)
106        t (float): the threshold for merging (in percentage, default 0.01)
107
108    Returns:
109        np.ndarray: the clusters ids
110    """
111    clusters = []
112    cluster_index = 0
113    length = points[-1, 0] - points[0, 0]
114
115    # First Point is a cluster
116    clusters.append(cluster_index)
117    cluster_center = points[0, 0]
118    cluster_size = 1
119
120    for i in range(1, len(points)):
121        distance = math.fabs(points[i][0] - cluster_center)/length
122        if distance < t:
123            clusters.append(cluster_index)
124            # Update center
125            cluster_center = (cluster_size/(cluster_size+1)) * \
126            cluster_center + (1/(cluster_size+1)) * points[i][0]
127            cluster_size += 1
128        else:
129            cluster_index += 1
130            clusters.append(cluster_index)
131            cluster_center = points[i, 0]
132            cluster_size = 1
133
134    return np.array(clusters)

Computes the 1D clustering of the input points.

Efficient implementation that uses a single pass to compute the clusters. Computes the centroid linkage clustering based only on the x axis: $$ D(C_1, C_2) = ||c_1 - c_2 || $$

Arguments:
  • points (np.ndarray): numpy array with the points (x, y)
  • t (float): the threshold for merging (in percentage, default 0.01)
Returns:

np.ndarray: the clusters ids

def average_linkage(points: numpy.ndarray, t: float = 0.01) -> numpy.ndarray:
137def average_linkage(points: np.ndarray, t: float = 0.01) -> np.ndarray:
138    """
139    Computes the 1D clustering of the input points.
140
141    Efficient implementation that uses a single pass to compute the clusters.
142    Computes the average linkage clustering based only on the x axis:
143    $$
144        D(C_1, C_2) = \\frac{1}{|C_1|}\\sum_{c_1 \\in C_1}d(c_1, C_2)
145    $$
146
147    Args:
148        points (np.ndarray): numpy array with the points (x, y)
149        t (float): the threshold for merging (in percentage, default 0.01)
150
151    Returns:
152        np.ndarray: the clusters ids
153    """
154    clusters = []
155    cluster_index = 0
156    length = points[-1, 0] - points[0, 0]
157
158    # First Point is a cluster
159    clusters.append(cluster_index)
160    idx = 0
161
162    for i in range(1, len(points)):
163        cluster_points = points[idx:i, 0]
164        distances = np.abs(cluster_points - points[i][0])
165        distance = np.sum(distances)/(len(cluster_points)*length)
166        if distance >= t:
167            cluster_index += 1
168            idx = i
169        clusters.append(cluster_index)
170
171    return np.array(clusters)

Computes the 1D clustering of the input points.

Efficient implementation that uses a single pass to compute the clusters. Computes the average linkage clustering based only on the x axis: $$ D(C_1, C_2) = \frac{1}{|C_1|}\sum_{c_1 \in C_1}d(c_1, C_2) $$

Arguments:
  • points (np.ndarray): numpy array with the points (x, y)
  • t (float): the threshold for merging (in percentage, default 0.01)
Returns:

np.ndarray: the clusters ids