kneeliverse.zmethod

The following module provides knee detection method based on Z-method algorithm.

View Source

  1# coding: utf-8
  2
  3'''
  4The following module provides knee detection method
  5based on Z-method algorithm.
  6'''
  7
  8__author__ = 'Tyler Estro'
  9__version__ = '1.0'
 10__email__ = 'testro@cs.stonybrook.edu'
 11__status__ = 'Development'
 12__license__ = 'MIT'
 13__copyright__ = '''
 14Copyright (c) 2021-2023 Stony Brook University
 15Copyright (c) 2021-2023 The Research Foundation of SUNY
 16'''
 17
 18
 19import enum
 20import math
 21import logging
 22import numpy as np
 23import kneeliverse.postprocessing as pp
 24
 25
 26import uts.gradient as grad
 27import uts.zscore as uzscore #import zscore_array
 28#import uts.thresholding as uthres #import isodata
 29
 30
 31logger = logging.getLogger(__name__)
 32
 33
 34class Outlier(enum.Enum):
 35    """
 36    """
 37    zscore = 'zscore'
 38    iqr = 'iqr'
 39    hampel = 'hampel'
 40
 41    def __str__(self):
 42        return self.value
 43
 44
 45def map_index(a:np.ndarray, b:np.ndarray) -> np.ndarray:
 46    """
 47    Maps the knee points into indexes.
 48
 49    Args:
 50        a (np.ndarray): numpy array with the points (x)
 51        b (np.ndarray): numpy array with the knee points points (x)
 52    Returns:
 53        np.ndarray: The knee indexes
 54    """
 55    sort_idx = np.argsort(a)
 56    out = sort_idx[np.searchsorted(a, b, sorter=sort_idx)]
 57    return out
 58
 59
 60def knees2(points:np.ndarray, dx:float=0.05, dy:float=0.05, out:Outlier=Outlier.iqr):
 61    x = points[:, 0]
 62    y = points[:, 1]
 63
 64    logger.info(f'Number of points {len(points)}')
 65    
 66    # get the step for the space contrains (BB)
 67    x_step = (max(x) - min (x))*dx
 68    y_step = (max(y) - min (y))*dy
 69    
 70    # get the z-score from the second derivative
 71    yd2 = grad.csd(x, y)
 72
 73    if out is Outlier.iqr:
 74        # interquartile range method
 75        q1, q3 = np.percentile(yd2, [25, 75])
 76        iqr = q3 - q1
 77        outlier_z = q3 + (1.5 * iqr)
 78        candidates = [i for i in range(len(yd2)) if yd2[i] >= outlier_z]
 79    elif out is Outlier.hampel:
 80        # Hampel method
 81        med = np.median(yd2)
 82        t = np.abs(yd2-med)
 83        outlier_z = np.median(t) * 4.5
 84        candidates = [i for i in range(len(yd2)) if yd2[i] >= outlier_z]
 85    else:
 86        # Z-score
 87        z_yd2 = uzscore.zscore_array(x, yd2) # <-- TODO T-score or other
 88        outlier_z = np.median(z_yd2) # <-- TODO outher metric
 89        candidates = [i for i in range(len(z_yd2)) if z_yd2[i] >= outlier_z]
 90    
 91    logger.info(f'Candidates: {candidates}({len(candidates)})')
 92    
 93    # filter worst and corner points
 94    candidates = pp.filter_worst_knees(points, candidates)
 95    candidates = pp.filter_corner_knees(points, candidates, t=0.3)
 96
 97    counter = 0
 98    done = False
 99    while not done:
100        counter += 1
101        logger.info(f'Round {counter}')
102        logger.info(f'Current candidates {candidates}')
103
104        best_candidates = []
105        # cluster points based on dx and dy
106        for i in candidates:
107            c = points[i]
108            n = [candidates[j] for j in range(len(candidates)) if (math.fabs(points[candidates[j],0] - c[0])<=x_step) and (math.fabs(points[candidates[j],1] - c[1])<=y_step)]
109            logger.info(f'Candidate {c}/{i} neighbourhood {n}')
110
111            if len(n) == 1 and n[0] == i:
112                best_candidates.append(i)
113                logger.info(f'found best knee...')
114            elif len(n) > 1:
115                # find the best candidate from the remaining list
116                r = pp.rank_corners(points, n)
117                logger.info(f'{n} -> rank {r}')
118                if n[np.argmax(r)] == i:
119                    best_candidates.append(i)
120                    logger.info(f'found best knee...')
121            else:
122                logger.info(f'Ups...')
123        logger.info(f'Candidates {candidates}({len(candidates)}) best candidates {best_candidates}({len(best_candidates)})')
124        if np.array_equal(best_candidates, candidates):
125            done = True
126        else:
127            candidates = best_candidates
128
129    logger.info(f'Final list {candidates}')
130
131    return np.array(candidates)
132
133
134def knees(points:np.ndarray, dx:float=0.05, dy:float=0.05, dz:float=0.05, x_max:int=None, y_range:list=None) -> np.ndarray:
135    """
136    Given an array of points, it computes the knees.
137
138    Args:
139        points (np.ndarray): numpy array with the points (x, y)
140        dx (float): % of max cache size between points (default 0.05)
141        dy (float): % of max - min miss ratio between points (default 0.05)
142        dz (float): amount we decrease outlier_z every iteration (default 0.05)
143        x_max (int): max cache size of original (pre-RDP) MRC  (default None)
144        y_max (list): [max, min] miss ratio of original (pre-RDP) MRC (default None)
145
146    Returns:
147        np.ndarray: The knee points on the curve
148    """
149    x = points[:, 0]
150    rv = getPoints(points, dx, dy, dz, False, x_max, y_range)
151    # convert x points into indexes:
152    return map_index(x, np.array(rv))
153
154
155def getPoints(points: np.ndarray, dx:float=0.05, dy:float=0.05, dz:float=0.05, plot:bool=False, x_max:int=None, y_range:list=None) -> np.ndarray:
156    """
157    Use our outlier method to find interesting points in an MRC.
158    
159    Args:
160        points (np.ndarray): numpy array with the points (x, y)
161        dx (float): % of max cache size between points (default 0.05)
162        dy (float): % of max - min miss ratio between points (default 0.05)
163        dz (float): amount we decrease outlier_z every iteration (default 0.05)
164        plot (bool): set True if you want to return data useful for plotting
165        x_max (int): max cache size of original (pre-RDP) MRC (default None)
166        y_max (list): [max, min] miss ratio of original (pre-RDP) MRC (default None)
167
168    Returns:
169        list: list with the knees x coordinate
170    """
171    
172    # in case we use RDP, we need the original MRC x/y ranges: x_max,y_range vars
173    x_max = x_max if x_max else len(points)
174    if y_range:
175        y_max,y_min = y_range
176    else:
177        y_max,y_min = (points[:,1].max(),points[:,1].min())
178
179    if len(points) < 4:
180        logger.debug('pointSelector: < 4 unique requests in workload')
181        return []
182
183    if y_min == 1:
184        logger.debug('pointSelector: workload completely random (dont bother caching)')
185        return []
186
187    # get absolute x and y distances
188    x_width = max(1, int(x_max * dx))
189    y_height = (y_max - y_min) * dy
190    
191    #logger.info(f'X width {x_width} y height {y_height}')
192
193    # get z-score
194    x = points[:, 0]
195    y = points[:, 1]
196    yd2 = grad.csd(x, y)
197    z_yd2 = uzscore.zscore_array(x, yd2)
198    min_zscore = min(z_yd2)
199
200    #logger.info(f'zscore [{min_zscore} {max(z_yd2)}]')
201
202    # stack the 2nd derivative zscore with the points
203    points = np.column_stack((points, z_yd2))
204
205    # outlier_points holds our final selected points
206    outlier_points = np.empty((0,2))
207
208    #logger.info(f'outlier points {outlier_points}')
209    
210    # main loop. start with outliers >= 3 z-score
211    # TODO: Magic number
212    outlier_z = 3
213    #logger.info(f' Initial outlier Z value: {outlier_z}')
214    while True:
215        points_added = 0
216        # candidate points have a zscore >= outlier_z
217        candidates = points[points[:,2] >= outlier_z]
218        #print('Candidates: ' + str(len(candidates)) + ' Points: ' + str(len(points)) + ' Outlier_Points: ' +
219        #        str(len(outlier_points)) + ' Outlier_Z: '  + str(round(outlier_z,3)))
220        
221        
222
223        if len(candidates) > 0:
224            x_diff = np.argwhere(np.diff(candidates, axis=0)[:,0] >= x_width).flatten()
225            
226            if len(x_diff) == 0:
227                outlier_best = candidates[np.argmin(candidates[:,1])] # best miss ratio in range
228                if all(abs(outlier_best[1]-i) >= y_height for i in outlier_points[:,1]):
229                    outlier_points = np.append(outlier_points, [[outlier_best[0], outlier_best[1]]], axis=0)
230                    points = points[np.where(((points[:,0] <= (outlier_best[0] - x_width)) | (points[:,0] >= (outlier_best[0] + x_width))) & \
231                                ((points[:,1] <= (outlier_best[1] - y_height)) | (points[:,1] >= (outlier_best[1] + y_height))))]
232                    points_added += 1
233            else:
234                candidate_outliers = np.empty((0,3))
235                x_diff = np.hstack(([0], x_diff,[len(candidates)-1]))
236                
237                # first create an array of candidate outliers
238                for i in range(0, len(x_diff)-1):
239                    # points in this form (0, 1) [1,2) ... [n,End)
240                    if i == 0:
241                        x_range = candidates[candidates[:,0] <= candidates[x_diff[i+1]][0]]
242                    else:
243                        x_range = candidates[(candidates[:,0] > candidates[x_diff[i]][0]) & (candidates[:,0] <= candidates[x_diff[i+1]][0])]
244                    
245                    outlier_best = x_range[np.argmin(x_range[:,1])] # point with best miss ratio in range
246                    outlier_best_z = x_range[np.argmin(x_range[:,2])][2] # best z-score in range
247                    outlier_best[2] = outlier_best_z
248                    candidate_outliers = np.append(candidate_outliers, [outlier_best], axis=0)
249                
250                # sort all the candidate outliers by z-score in descending order
251                candidate_outliers = candidate_outliers[np.argsort(candidate_outliers[:,2])][::-1]
252                for outlier_best in candidate_outliers:
253                    if all(abs(outlier_best[1]-i) >= y_height for i in outlier_points[:,1]):
254                        outlier_points = np.append(outlier_points, [[outlier_best[0], outlier_best[1]]], axis=0)
255                        points = points[np.where(((points[:,0] <= (outlier_best[0] - x_width)) | (points[:,0] >= (outlier_best[0] + x_width))) & \
256                                    ((points[:,1] <= (outlier_best[1] - y_height)) | (points[:,1] >= (outlier_best[1] + y_height))))]
257                        points_added += 1
258
259        # terminating conditions (i think len(points) == 0 is all we need now)
260        if len(points) == 0 or ((outlier_z <= min_zscore) and points_added == 0):
261            break
262
263        outlier_z -= dz
264        
265        #logger.info(f'Outlier Z value: {outlier_z}')
266
267
268    # TODO: what is this step
269    # sweep through and points to avoid picking concavity issues
270    outlier_min_mr = 1.0
271    
272    # convert to a dict so we can delete in-place
273    #logger.info(f'Outlier points {outlier_points}')
274    outlier_points = {int(x[0]):x[1] for x in outlier_points}
275    #logger.info(f'Outlier points {outlier_points}')
276    outlier_keys = list(sorted(outlier_points.keys()))
277    for k in outlier_keys:
278        if outlier_points[k] > outlier_min_mr:
279            del outlier_points[k]
280        else:
281            outlier_min_mr = outlier_points[k]
282
283    # returns sorted list of cache sizes
284    if not plot:
285        #return map_index(points, outlier_points)
286        return np.array(list(sorted(outlier_points.keys())))
287    else:
288        return (outlier_points, z_yd2)

logger = <Logger kneeliverse.zmethod (WARNING)>

class Outlier(enum.Enum): View Source

35class Outlier(enum.Enum):
36    """
37    """
38    zscore = 'zscore'
39    iqr = 'iqr'
40    hampel = 'hampel'
41
42    def __str__(self):
43        return self.value

zscore = <Outlier.zscore: 'zscore'>

iqr = <Outlier.iqr: 'iqr'>

hampel = <Outlier.hampel: 'hampel'>

Inherited Members

enum.Enum: name; value

def map_index(a: numpy.ndarray, b: numpy.ndarray) -> numpy.ndarray: View Source

46def map_index(a:np.ndarray, b:np.ndarray) -> np.ndarray:
47    """
48    Maps the knee points into indexes.
49
50    Args:
51        a (np.ndarray): numpy array with the points (x)
52        b (np.ndarray): numpy array with the knee points points (x)
53    Returns:
54        np.ndarray: The knee indexes
55    """
56    sort_idx = np.argsort(a)
57    out = sort_idx[np.searchsorted(a, b, sorter=sort_idx)]
58    return out

Maps the knee points into indexes.

Arguments:

a (np.ndarray): numpy array with the points (x)
b (np.ndarray): numpy array with the knee points points (x)

Returns:

np.ndarray: The knee indexes

def knees2( points: numpy.ndarray, dx: float = 0.05, dy: float = 0.05, out: Outlier = <Outlier.iqr: 'iqr'>): View Source

 61def knees2(points:np.ndarray, dx:float=0.05, dy:float=0.05, out:Outlier=Outlier.iqr):
 62    x = points[:, 0]
 63    y = points[:, 1]
 64
 65    logger.info(f'Number of points {len(points)}')
 66    
 67    # get the step for the space contrains (BB)
 68    x_step = (max(x) - min (x))*dx
 69    y_step = (max(y) - min (y))*dy
 70    
 71    # get the z-score from the second derivative
 72    yd2 = grad.csd(x, y)
 73
 74    if out is Outlier.iqr:
 75        # interquartile range method
 76        q1, q3 = np.percentile(yd2, [25, 75])
 77        iqr = q3 - q1
 78        outlier_z = q3 + (1.5 * iqr)
 79        candidates = [i for i in range(len(yd2)) if yd2[i] >= outlier_z]
 80    elif out is Outlier.hampel:
 81        # Hampel method
 82        med = np.median(yd2)
 83        t = np.abs(yd2-med)
 84        outlier_z = np.median(t) * 4.5
 85        candidates = [i for i in range(len(yd2)) if yd2[i] >= outlier_z]
 86    else:
 87        # Z-score
 88        z_yd2 = uzscore.zscore_array(x, yd2) # <-- TODO T-score or other
 89        outlier_z = np.median(z_yd2) # <-- TODO outher metric
 90        candidates = [i for i in range(len(z_yd2)) if z_yd2[i] >= outlier_z]
 91    
 92    logger.info(f'Candidates: {candidates}({len(candidates)})')
 93    
 94    # filter worst and corner points
 95    candidates = pp.filter_worst_knees(points, candidates)
 96    candidates = pp.filter_corner_knees(points, candidates, t=0.3)
 97
 98    counter = 0
 99    done = False
100    while not done:
101        counter += 1
102        logger.info(f'Round {counter}')
103        logger.info(f'Current candidates {candidates}')
104
105        best_candidates = []
106        # cluster points based on dx and dy
107        for i in candidates:
108            c = points[i]
109            n = [candidates[j] for j in range(len(candidates)) if (math.fabs(points[candidates[j],0] - c[0])<=x_step) and (math.fabs(points[candidates[j],1] - c[1])<=y_step)]
110            logger.info(f'Candidate {c}/{i} neighbourhood {n}')
111
112            if len(n) == 1 and n[0] == i:
113                best_candidates.append(i)
114                logger.info(f'found best knee...')
115            elif len(n) > 1:
116                # find the best candidate from the remaining list
117                r = pp.rank_corners(points, n)
118                logger.info(f'{n} -> rank {r}')
119                if n[np.argmax(r)] == i:
120                    best_candidates.append(i)
121                    logger.info(f'found best knee...')
122            else:
123                logger.info(f'Ups...')
124        logger.info(f'Candidates {candidates}({len(candidates)}) best candidates {best_candidates}({len(best_candidates)})')
125        if np.array_equal(best_candidates, candidates):
126            done = True
127        else:
128            candidates = best_candidates
129
130    logger.info(f'Final list {candidates}')
131
132    return np.array(candidates)

def knees( points: numpy.ndarray, dx: float = 0.05, dy: float = 0.05, dz: float = 0.05, x_max: int = None, y_range: list = None) -> numpy.ndarray: View Source

135def knees(points:np.ndarray, dx:float=0.05, dy:float=0.05, dz:float=0.05, x_max:int=None, y_range:list=None) -> np.ndarray:
136    """
137    Given an array of points, it computes the knees.
138
139    Args:
140        points (np.ndarray): numpy array with the points (x, y)
141        dx (float): % of max cache size between points (default 0.05)
142        dy (float): % of max - min miss ratio between points (default 0.05)
143        dz (float): amount we decrease outlier_z every iteration (default 0.05)
144        x_max (int): max cache size of original (pre-RDP) MRC  (default None)
145        y_max (list): [max, min] miss ratio of original (pre-RDP) MRC (default None)
146
147    Returns:
148        np.ndarray: The knee points on the curve
149    """
150    x = points[:, 0]
151    rv = getPoints(points, dx, dy, dz, False, x_max, y_range)
152    # convert x points into indexes:
153    return map_index(x, np.array(rv))

Given an array of points, it computes the knees.

Arguments:

points (np.ndarray): numpy array with the points (x, y)
dx (float): % of max cache size between points (default 0.05)
dy (float): % of max - min miss ratio between points (default 0.05)
dz (float): amount we decrease outlier_z every iteration (default 0.05)
x_max (int): max cache size of original (pre-RDP) MRC (default None)
y_max (list): [max, min] miss ratio of original (pre-RDP) MRC (default None)

Returns:

np.ndarray: The knee points on the curve

def getPoints( points: numpy.ndarray, dx: float = 0.05, dy: float = 0.05, dz: float = 0.05, plot: bool = False, x_max: int = None, y_range: list = None) -> numpy.ndarray: View Source

156def getPoints(points: np.ndarray, dx:float=0.05, dy:float=0.05, dz:float=0.05, plot:bool=False, x_max:int=None, y_range:list=None) -> np.ndarray:
157    """
158    Use our outlier method to find interesting points in an MRC.
159    
160    Args:
161        points (np.ndarray): numpy array with the points (x, y)
162        dx (float): % of max cache size between points (default 0.05)
163        dy (float): % of max - min miss ratio between points (default 0.05)
164        dz (float): amount we decrease outlier_z every iteration (default 0.05)
165        plot (bool): set True if you want to return data useful for plotting
166        x_max (int): max cache size of original (pre-RDP) MRC (default None)
167        y_max (list): [max, min] miss ratio of original (pre-RDP) MRC (default None)
168
169    Returns:
170        list: list with the knees x coordinate
171    """
172    
173    # in case we use RDP, we need the original MRC x/y ranges: x_max,y_range vars
174    x_max = x_max if x_max else len(points)
175    if y_range:
176        y_max,y_min = y_range
177    else:
178        y_max,y_min = (points[:,1].max(),points[:,1].min())
179
180    if len(points) < 4:
181        logger.debug('pointSelector: < 4 unique requests in workload')
182        return []
183
184    if y_min == 1:
185        logger.debug('pointSelector: workload completely random (dont bother caching)')
186        return []
187
188    # get absolute x and y distances
189    x_width = max(1, int(x_max * dx))
190    y_height = (y_max - y_min) * dy
191    
192    #logger.info(f'X width {x_width} y height {y_height}')
193
194    # get z-score
195    x = points[:, 0]
196    y = points[:, 1]
197    yd2 = grad.csd(x, y)
198    z_yd2 = uzscore.zscore_array(x, yd2)
199    min_zscore = min(z_yd2)
200
201    #logger.info(f'zscore [{min_zscore} {max(z_yd2)}]')
202
203    # stack the 2nd derivative zscore with the points
204    points = np.column_stack((points, z_yd2))
205
206    # outlier_points holds our final selected points
207    outlier_points = np.empty((0,2))
208
209    #logger.info(f'outlier points {outlier_points}')
210    
211    # main loop. start with outliers >= 3 z-score
212    # TODO: Magic number
213    outlier_z = 3
214    #logger.info(f' Initial outlier Z value: {outlier_z}')
215    while True:
216        points_added = 0
217        # candidate points have a zscore >= outlier_z
218        candidates = points[points[:,2] >= outlier_z]
219        #print('Candidates: ' + str(len(candidates)) + ' Points: ' + str(len(points)) + ' Outlier_Points: ' +
220        #        str(len(outlier_points)) + ' Outlier_Z: '  + str(round(outlier_z,3)))
221        
222        
223
224        if len(candidates) > 0:
225            x_diff = np.argwhere(np.diff(candidates, axis=0)[:,0] >= x_width).flatten()
226            
227            if len(x_diff) == 0:
228                outlier_best = candidates[np.argmin(candidates[:,1])] # best miss ratio in range
229                if all(abs(outlier_best[1]-i) >= y_height for i in outlier_points[:,1]):
230                    outlier_points = np.append(outlier_points, [[outlier_best[0], outlier_best[1]]], axis=0)
231                    points = points[np.where(((points[:,0] <= (outlier_best[0] - x_width)) | (points[:,0] >= (outlier_best[0] + x_width))) & \
232                                ((points[:,1] <= (outlier_best[1] - y_height)) | (points[:,1] >= (outlier_best[1] + y_height))))]
233                    points_added += 1
234            else:
235                candidate_outliers = np.empty((0,3))
236                x_diff = np.hstack(([0], x_diff,[len(candidates)-1]))
237                
238                # first create an array of candidate outliers
239                for i in range(0, len(x_diff)-1):
240                    # points in this form (0, 1) [1,2) ... [n,End)
241                    if i == 0:
242                        x_range = candidates[candidates[:,0] <= candidates[x_diff[i+1]][0]]
243                    else:
244                        x_range = candidates[(candidates[:,0] > candidates[x_diff[i]][0]) & (candidates[:,0] <= candidates[x_diff[i+1]][0])]
245                    
246                    outlier_best = x_range[np.argmin(x_range[:,1])] # point with best miss ratio in range
247                    outlier_best_z = x_range[np.argmin(x_range[:,2])][2] # best z-score in range
248                    outlier_best[2] = outlier_best_z
249                    candidate_outliers = np.append(candidate_outliers, [outlier_best], axis=0)
250                
251                # sort all the candidate outliers by z-score in descending order
252                candidate_outliers = candidate_outliers[np.argsort(candidate_outliers[:,2])][::-1]
253                for outlier_best in candidate_outliers:
254                    if all(abs(outlier_best[1]-i) >= y_height for i in outlier_points[:,1]):
255                        outlier_points = np.append(outlier_points, [[outlier_best[0], outlier_best[1]]], axis=0)
256                        points = points[np.where(((points[:,0] <= (outlier_best[0] - x_width)) | (points[:,0] >= (outlier_best[0] + x_width))) & \
257                                    ((points[:,1] <= (outlier_best[1] - y_height)) | (points[:,1] >= (outlier_best[1] + y_height))))]
258                        points_added += 1
259
260        # terminating conditions (i think len(points) == 0 is all we need now)
261        if len(points) == 0 or ((outlier_z <= min_zscore) and points_added == 0):
262            break
263
264        outlier_z -= dz
265        
266        #logger.info(f'Outlier Z value: {outlier_z}')
267
268
269    # TODO: what is this step
270    # sweep through and points to avoid picking concavity issues
271    outlier_min_mr = 1.0
272    
273    # convert to a dict so we can delete in-place
274    #logger.info(f'Outlier points {outlier_points}')
275    outlier_points = {int(x[0]):x[1] for x in outlier_points}
276    #logger.info(f'Outlier points {outlier_points}')
277    outlier_keys = list(sorted(outlier_points.keys()))
278    for k in outlier_keys:
279        if outlier_points[k] > outlier_min_mr:
280            del outlier_points[k]
281        else:
282            outlier_min_mr = outlier_points[k]
283
284    # returns sorted list of cache sizes
285    if not plot:
286        #return map_index(points, outlier_points)
287        return np.array(list(sorted(outlier_points.keys())))
288    else:
289        return (outlier_points, z_yd2)

Use our outlier method to find interesting points in an MRC.

Arguments:

points (np.ndarray): numpy array with the points (x, y)
dx (float): % of max cache size between points (default 0.05)
dy (float): % of max - min miss ratio between points (default 0.05)
dz (float): amount we decrease outlier_z every iteration (default 0.05)
plot (bool): set True if you want to return data useful for plotting
x_max (int): max cache size of original (pre-RDP) MRC (default None)
y_max (list): [max, min] miss ratio of original (pre-RDP) MRC (default None)

Returns:

list: list with the knees x coordinate