kneeliverse.zmethod
The following module provides knee detection method based on Z-method algorithm.
1# coding: utf-8 2 3''' 4The following module provides knee detection method 5based on Z-method algorithm. 6''' 7 8__author__ = 'Tyler Estro' 9__version__ = '1.0' 10__email__ = 'testro@cs.stonybrook.edu' 11__status__ = 'Development' 12__license__ = 'MIT' 13__copyright__ = ''' 14Copyright (c) 2021-2023 Stony Brook University 15Copyright (c) 2021-2023 The Research Foundation of SUNY 16''' 17 18 19import enum 20import math 21import logging 22import numpy as np 23import kneeliverse.postprocessing as pp 24 25 26import uts.gradient as grad 27import uts.zscore as uzscore #import zscore_array 28#import uts.thresholding as uthres #import isodata 29 30 31logger = logging.getLogger(__name__) 32 33 34class Outlier(enum.Enum): 35 """ 36 """ 37 zscore = 'zscore' 38 iqr = 'iqr' 39 hampel = 'hampel' 40 41 def __str__(self): 42 return self.value 43 44 45def map_index(a:np.ndarray, b:np.ndarray) -> np.ndarray: 46 """ 47 Maps the knee points into indexes. 48 49 Args: 50 a (np.ndarray): numpy array with the points (x) 51 b (np.ndarray): numpy array with the knee points points (x) 52 Returns: 53 np.ndarray: The knee indexes 54 """ 55 sort_idx = np.argsort(a) 56 out = sort_idx[np.searchsorted(a, b, sorter=sort_idx)] 57 return out 58 59 60def knees2(points:np.ndarray, dx:float=0.05, dy:float=0.05, out:Outlier=Outlier.iqr): 61 x = points[:, 0] 62 y = points[:, 1] 63 64 logger.info(f'Number of points {len(points)}') 65 66 # get the step for the space contrains (BB) 67 x_step = (max(x) - min (x))*dx 68 y_step = (max(y) - min (y))*dy 69 70 # get the z-score from the second derivative 71 yd2 = grad.csd(x, y) 72 73 if out is Outlier.iqr: 74 # interquartile range method 75 q1, q3 = np.percentile(yd2, [25, 75]) 76 iqr = q3 - q1 77 outlier_z = q3 + (1.5 * iqr) 78 candidates = [i for i in range(len(yd2)) if yd2[i] >= outlier_z] 79 elif out is Outlier.hampel: 80 # Hampel method 81 med = np.median(yd2) 82 t = np.abs(yd2-med) 83 outlier_z = np.median(t) * 4.5 84 candidates = [i for i in range(len(yd2)) if yd2[i] >= outlier_z] 85 else: 86 # Z-score 87 z_yd2 = uzscore.zscore_array(x, yd2) # <-- TODO T-score or other 88 outlier_z = np.median(z_yd2) # <-- TODO outher metric 89 candidates = [i for i in range(len(z_yd2)) if z_yd2[i] >= outlier_z] 90 91 logger.info(f'Candidates: {candidates}({len(candidates)})') 92 93 # filter worst and corner points 94 candidates = pp.filter_worst_knees(points, candidates) 95 candidates = pp.filter_corner_knees(points, candidates, t=0.3) 96 97 counter = 0 98 done = False 99 while not done: 100 counter += 1 101 logger.info(f'Round {counter}') 102 logger.info(f'Current candidates {candidates}') 103 104 best_candidates = [] 105 # cluster points based on dx and dy 106 for i in candidates: 107 c = points[i] 108 n = [candidates[j] for j in range(len(candidates)) if (math.fabs(points[candidates[j],0] - c[0])<=x_step) and (math.fabs(points[candidates[j],1] - c[1])<=y_step)] 109 logger.info(f'Candidate {c}/{i} neighbourhood {n}') 110 111 if len(n) == 1 and n[0] == i: 112 best_candidates.append(i) 113 logger.info(f'found best knee...') 114 elif len(n) > 1: 115 # find the best candidate from the remaining list 116 r = pp.rank_corners(points, n) 117 logger.info(f'{n} -> rank {r}') 118 if n[np.argmax(r)] == i: 119 best_candidates.append(i) 120 logger.info(f'found best knee...') 121 else: 122 logger.info(f'Ups...') 123 logger.info(f'Candidates {candidates}({len(candidates)}) best candidates {best_candidates}({len(best_candidates)})') 124 if np.array_equal(best_candidates, candidates): 125 done = True 126 else: 127 candidates = best_candidates 128 129 logger.info(f'Final list {candidates}') 130 131 return np.array(candidates) 132 133 134def knees(points:np.ndarray, dx:float=0.05, dy:float=0.05, dz:float=0.05, x_max:int=None, y_range:list=None) -> np.ndarray: 135 """ 136 Given an array of points, it computes the knees. 137 138 Args: 139 points (np.ndarray): numpy array with the points (x, y) 140 dx (float): % of max cache size between points (default 0.05) 141 dy (float): % of max - min miss ratio between points (default 0.05) 142 dz (float): amount we decrease outlier_z every iteration (default 0.05) 143 x_max (int): max cache size of original (pre-RDP) MRC (default None) 144 y_max (list): [max, min] miss ratio of original (pre-RDP) MRC (default None) 145 146 Returns: 147 np.ndarray: The knee points on the curve 148 """ 149 x = points[:, 0] 150 rv = getPoints(points, dx, dy, dz, False, x_max, y_range) 151 # convert x points into indexes: 152 return map_index(x, np.array(rv)) 153 154 155def getPoints(points: np.ndarray, dx:float=0.05, dy:float=0.05, dz:float=0.05, plot:bool=False, x_max:int=None, y_range:list=None) -> np.ndarray: 156 """ 157 Use our outlier method to find interesting points in an MRC. 158 159 Args: 160 points (np.ndarray): numpy array with the points (x, y) 161 dx (float): % of max cache size between points (default 0.05) 162 dy (float): % of max - min miss ratio between points (default 0.05) 163 dz (float): amount we decrease outlier_z every iteration (default 0.05) 164 plot (bool): set True if you want to return data useful for plotting 165 x_max (int): max cache size of original (pre-RDP) MRC (default None) 166 y_max (list): [max, min] miss ratio of original (pre-RDP) MRC (default None) 167 168 Returns: 169 list: list with the knees x coordinate 170 """ 171 172 # in case we use RDP, we need the original MRC x/y ranges: x_max,y_range vars 173 x_max = x_max if x_max else len(points) 174 if y_range: 175 y_max,y_min = y_range 176 else: 177 y_max,y_min = (points[:,1].max(),points[:,1].min()) 178 179 if len(points) < 4: 180 logger.debug('pointSelector: < 4 unique requests in workload') 181 return [] 182 183 if y_min == 1: 184 logger.debug('pointSelector: workload completely random (dont bother caching)') 185 return [] 186 187 # get absolute x and y distances 188 x_width = max(1, int(x_max * dx)) 189 y_height = (y_max - y_min) * dy 190 191 #logger.info(f'X width {x_width} y height {y_height}') 192 193 # get z-score 194 x = points[:, 0] 195 y = points[:, 1] 196 yd2 = grad.csd(x, y) 197 z_yd2 = uzscore.zscore_array(x, yd2) 198 min_zscore = min(z_yd2) 199 200 #logger.info(f'zscore [{min_zscore} {max(z_yd2)}]') 201 202 # stack the 2nd derivative zscore with the points 203 points = np.column_stack((points, z_yd2)) 204 205 # outlier_points holds our final selected points 206 outlier_points = np.empty((0,2)) 207 208 #logger.info(f'outlier points {outlier_points}') 209 210 # main loop. start with outliers >= 3 z-score 211 # TODO: Magic number 212 outlier_z = 3 213 #logger.info(f' Initial outlier Z value: {outlier_z}') 214 while True: 215 points_added = 0 216 # candidate points have a zscore >= outlier_z 217 candidates = points[points[:,2] >= outlier_z] 218 #print('Candidates: ' + str(len(candidates)) + ' Points: ' + str(len(points)) + ' Outlier_Points: ' + 219 # str(len(outlier_points)) + ' Outlier_Z: ' + str(round(outlier_z,3))) 220 221 222 223 if len(candidates) > 0: 224 x_diff = np.argwhere(np.diff(candidates, axis=0)[:,0] >= x_width).flatten() 225 226 if len(x_diff) == 0: 227 outlier_best = candidates[np.argmin(candidates[:,1])] # best miss ratio in range 228 if all(abs(outlier_best[1]-i) >= y_height for i in outlier_points[:,1]): 229 outlier_points = np.append(outlier_points, [[outlier_best[0], outlier_best[1]]], axis=0) 230 points = points[np.where(((points[:,0] <= (outlier_best[0] - x_width)) | (points[:,0] >= (outlier_best[0] + x_width))) & \ 231 ((points[:,1] <= (outlier_best[1] - y_height)) | (points[:,1] >= (outlier_best[1] + y_height))))] 232 points_added += 1 233 else: 234 candidate_outliers = np.empty((0,3)) 235 x_diff = np.hstack(([0], x_diff,[len(candidates)-1])) 236 237 # first create an array of candidate outliers 238 for i in range(0, len(x_diff)-1): 239 # points in this form (0, 1) [1,2) ... [n,End) 240 if i == 0: 241 x_range = candidates[candidates[:,0] <= candidates[x_diff[i+1]][0]] 242 else: 243 x_range = candidates[(candidates[:,0] > candidates[x_diff[i]][0]) & (candidates[:,0] <= candidates[x_diff[i+1]][0])] 244 245 outlier_best = x_range[np.argmin(x_range[:,1])] # point with best miss ratio in range 246 outlier_best_z = x_range[np.argmin(x_range[:,2])][2] # best z-score in range 247 outlier_best[2] = outlier_best_z 248 candidate_outliers = np.append(candidate_outliers, [outlier_best], axis=0) 249 250 # sort all the candidate outliers by z-score in descending order 251 candidate_outliers = candidate_outliers[np.argsort(candidate_outliers[:,2])][::-1] 252 for outlier_best in candidate_outliers: 253 if all(abs(outlier_best[1]-i) >= y_height for i in outlier_points[:,1]): 254 outlier_points = np.append(outlier_points, [[outlier_best[0], outlier_best[1]]], axis=0) 255 points = points[np.where(((points[:,0] <= (outlier_best[0] - x_width)) | (points[:,0] >= (outlier_best[0] + x_width))) & \ 256 ((points[:,1] <= (outlier_best[1] - y_height)) | (points[:,1] >= (outlier_best[1] + y_height))))] 257 points_added += 1 258 259 # terminating conditions (i think len(points) == 0 is all we need now) 260 if len(points) == 0 or ((outlier_z <= min_zscore) and points_added == 0): 261 break 262 263 outlier_z -= dz 264 265 #logger.info(f'Outlier Z value: {outlier_z}') 266 267 268 # TODO: what is this step 269 # sweep through and points to avoid picking concavity issues 270 outlier_min_mr = 1.0 271 272 # convert to a dict so we can delete in-place 273 #logger.info(f'Outlier points {outlier_points}') 274 outlier_points = {int(x[0]):x[1] for x in outlier_points} 275 #logger.info(f'Outlier points {outlier_points}') 276 outlier_keys = list(sorted(outlier_points.keys())) 277 for k in outlier_keys: 278 if outlier_points[k] > outlier_min_mr: 279 del outlier_points[k] 280 else: 281 outlier_min_mr = outlier_points[k] 282 283 # returns sorted list of cache sizes 284 if not plot: 285 #return map_index(points, outlier_points) 286 return np.array(list(sorted(outlier_points.keys()))) 287 else: 288 return (outlier_points, z_yd2)
logger =
<Logger kneeliverse.zmethod (WARNING)>
class
Outlier(enum.Enum):
35class Outlier(enum.Enum): 36 """ 37 """ 38 zscore = 'zscore' 39 iqr = 'iqr' 40 hampel = 'hampel' 41 42 def __str__(self): 43 return self.value
zscore =
<Outlier.zscore: 'zscore'>
iqr =
<Outlier.iqr: 'iqr'>
hampel =
<Outlier.hampel: 'hampel'>
Inherited Members
- enum.Enum
- name
- value
def
map_index(a: numpy.ndarray, b: numpy.ndarray) -> numpy.ndarray:
46def map_index(a:np.ndarray, b:np.ndarray) -> np.ndarray: 47 """ 48 Maps the knee points into indexes. 49 50 Args: 51 a (np.ndarray): numpy array with the points (x) 52 b (np.ndarray): numpy array with the knee points points (x) 53 Returns: 54 np.ndarray: The knee indexes 55 """ 56 sort_idx = np.argsort(a) 57 out = sort_idx[np.searchsorted(a, b, sorter=sort_idx)] 58 return out
Maps the knee points into indexes.
Arguments:
- a (np.ndarray): numpy array with the points (x)
- b (np.ndarray): numpy array with the knee points points (x)
Returns:
np.ndarray: The knee indexes
def
knees2( points: numpy.ndarray, dx: float = 0.05, dy: float = 0.05, out: Outlier = <Outlier.iqr: 'iqr'>):
61def knees2(points:np.ndarray, dx:float=0.05, dy:float=0.05, out:Outlier=Outlier.iqr): 62 x = points[:, 0] 63 y = points[:, 1] 64 65 logger.info(f'Number of points {len(points)}') 66 67 # get the step for the space contrains (BB) 68 x_step = (max(x) - min (x))*dx 69 y_step = (max(y) - min (y))*dy 70 71 # get the z-score from the second derivative 72 yd2 = grad.csd(x, y) 73 74 if out is Outlier.iqr: 75 # interquartile range method 76 q1, q3 = np.percentile(yd2, [25, 75]) 77 iqr = q3 - q1 78 outlier_z = q3 + (1.5 * iqr) 79 candidates = [i for i in range(len(yd2)) if yd2[i] >= outlier_z] 80 elif out is Outlier.hampel: 81 # Hampel method 82 med = np.median(yd2) 83 t = np.abs(yd2-med) 84 outlier_z = np.median(t) * 4.5 85 candidates = [i for i in range(len(yd2)) if yd2[i] >= outlier_z] 86 else: 87 # Z-score 88 z_yd2 = uzscore.zscore_array(x, yd2) # <-- TODO T-score or other 89 outlier_z = np.median(z_yd2) # <-- TODO outher metric 90 candidates = [i for i in range(len(z_yd2)) if z_yd2[i] >= outlier_z] 91 92 logger.info(f'Candidates: {candidates}({len(candidates)})') 93 94 # filter worst and corner points 95 candidates = pp.filter_worst_knees(points, candidates) 96 candidates = pp.filter_corner_knees(points, candidates, t=0.3) 97 98 counter = 0 99 done = False 100 while not done: 101 counter += 1 102 logger.info(f'Round {counter}') 103 logger.info(f'Current candidates {candidates}') 104 105 best_candidates = [] 106 # cluster points based on dx and dy 107 for i in candidates: 108 c = points[i] 109 n = [candidates[j] for j in range(len(candidates)) if (math.fabs(points[candidates[j],0] - c[0])<=x_step) and (math.fabs(points[candidates[j],1] - c[1])<=y_step)] 110 logger.info(f'Candidate {c}/{i} neighbourhood {n}') 111 112 if len(n) == 1 and n[0] == i: 113 best_candidates.append(i) 114 logger.info(f'found best knee...') 115 elif len(n) > 1: 116 # find the best candidate from the remaining list 117 r = pp.rank_corners(points, n) 118 logger.info(f'{n} -> rank {r}') 119 if n[np.argmax(r)] == i: 120 best_candidates.append(i) 121 logger.info(f'found best knee...') 122 else: 123 logger.info(f'Ups...') 124 logger.info(f'Candidates {candidates}({len(candidates)}) best candidates {best_candidates}({len(best_candidates)})') 125 if np.array_equal(best_candidates, candidates): 126 done = True 127 else: 128 candidates = best_candidates 129 130 logger.info(f'Final list {candidates}') 131 132 return np.array(candidates)
def
knees( points: numpy.ndarray, dx: float = 0.05, dy: float = 0.05, dz: float = 0.05, x_max: int = None, y_range: list = None) -> numpy.ndarray:
135def knees(points:np.ndarray, dx:float=0.05, dy:float=0.05, dz:float=0.05, x_max:int=None, y_range:list=None) -> np.ndarray: 136 """ 137 Given an array of points, it computes the knees. 138 139 Args: 140 points (np.ndarray): numpy array with the points (x, y) 141 dx (float): % of max cache size between points (default 0.05) 142 dy (float): % of max - min miss ratio between points (default 0.05) 143 dz (float): amount we decrease outlier_z every iteration (default 0.05) 144 x_max (int): max cache size of original (pre-RDP) MRC (default None) 145 y_max (list): [max, min] miss ratio of original (pre-RDP) MRC (default None) 146 147 Returns: 148 np.ndarray: The knee points on the curve 149 """ 150 x = points[:, 0] 151 rv = getPoints(points, dx, dy, dz, False, x_max, y_range) 152 # convert x points into indexes: 153 return map_index(x, np.array(rv))
Given an array of points, it computes the knees.
Arguments:
- points (np.ndarray): numpy array with the points (x, y)
- dx (float): % of max cache size between points (default 0.05)
- dy (float): % of max - min miss ratio between points (default 0.05)
- dz (float): amount we decrease outlier_z every iteration (default 0.05)
- x_max (int): max cache size of original (pre-RDP) MRC (default None)
- y_max (list): [max, min] miss ratio of original (pre-RDP) MRC (default None)
Returns:
np.ndarray: The knee points on the curve
def
getPoints( points: numpy.ndarray, dx: float = 0.05, dy: float = 0.05, dz: float = 0.05, plot: bool = False, x_max: int = None, y_range: list = None) -> numpy.ndarray:
156def getPoints(points: np.ndarray, dx:float=0.05, dy:float=0.05, dz:float=0.05, plot:bool=False, x_max:int=None, y_range:list=None) -> np.ndarray: 157 """ 158 Use our outlier method to find interesting points in an MRC. 159 160 Args: 161 points (np.ndarray): numpy array with the points (x, y) 162 dx (float): % of max cache size between points (default 0.05) 163 dy (float): % of max - min miss ratio between points (default 0.05) 164 dz (float): amount we decrease outlier_z every iteration (default 0.05) 165 plot (bool): set True if you want to return data useful for plotting 166 x_max (int): max cache size of original (pre-RDP) MRC (default None) 167 y_max (list): [max, min] miss ratio of original (pre-RDP) MRC (default None) 168 169 Returns: 170 list: list with the knees x coordinate 171 """ 172 173 # in case we use RDP, we need the original MRC x/y ranges: x_max,y_range vars 174 x_max = x_max if x_max else len(points) 175 if y_range: 176 y_max,y_min = y_range 177 else: 178 y_max,y_min = (points[:,1].max(),points[:,1].min()) 179 180 if len(points) < 4: 181 logger.debug('pointSelector: < 4 unique requests in workload') 182 return [] 183 184 if y_min == 1: 185 logger.debug('pointSelector: workload completely random (dont bother caching)') 186 return [] 187 188 # get absolute x and y distances 189 x_width = max(1, int(x_max * dx)) 190 y_height = (y_max - y_min) * dy 191 192 #logger.info(f'X width {x_width} y height {y_height}') 193 194 # get z-score 195 x = points[:, 0] 196 y = points[:, 1] 197 yd2 = grad.csd(x, y) 198 z_yd2 = uzscore.zscore_array(x, yd2) 199 min_zscore = min(z_yd2) 200 201 #logger.info(f'zscore [{min_zscore} {max(z_yd2)}]') 202 203 # stack the 2nd derivative zscore with the points 204 points = np.column_stack((points, z_yd2)) 205 206 # outlier_points holds our final selected points 207 outlier_points = np.empty((0,2)) 208 209 #logger.info(f'outlier points {outlier_points}') 210 211 # main loop. start with outliers >= 3 z-score 212 # TODO: Magic number 213 outlier_z = 3 214 #logger.info(f' Initial outlier Z value: {outlier_z}') 215 while True: 216 points_added = 0 217 # candidate points have a zscore >= outlier_z 218 candidates = points[points[:,2] >= outlier_z] 219 #print('Candidates: ' + str(len(candidates)) + ' Points: ' + str(len(points)) + ' Outlier_Points: ' + 220 # str(len(outlier_points)) + ' Outlier_Z: ' + str(round(outlier_z,3))) 221 222 223 224 if len(candidates) > 0: 225 x_diff = np.argwhere(np.diff(candidates, axis=0)[:,0] >= x_width).flatten() 226 227 if len(x_diff) == 0: 228 outlier_best = candidates[np.argmin(candidates[:,1])] # best miss ratio in range 229 if all(abs(outlier_best[1]-i) >= y_height for i in outlier_points[:,1]): 230 outlier_points = np.append(outlier_points, [[outlier_best[0], outlier_best[1]]], axis=0) 231 points = points[np.where(((points[:,0] <= (outlier_best[0] - x_width)) | (points[:,0] >= (outlier_best[0] + x_width))) & \ 232 ((points[:,1] <= (outlier_best[1] - y_height)) | (points[:,1] >= (outlier_best[1] + y_height))))] 233 points_added += 1 234 else: 235 candidate_outliers = np.empty((0,3)) 236 x_diff = np.hstack(([0], x_diff,[len(candidates)-1])) 237 238 # first create an array of candidate outliers 239 for i in range(0, len(x_diff)-1): 240 # points in this form (0, 1) [1,2) ... [n,End) 241 if i == 0: 242 x_range = candidates[candidates[:,0] <= candidates[x_diff[i+1]][0]] 243 else: 244 x_range = candidates[(candidates[:,0] > candidates[x_diff[i]][0]) & (candidates[:,0] <= candidates[x_diff[i+1]][0])] 245 246 outlier_best = x_range[np.argmin(x_range[:,1])] # point with best miss ratio in range 247 outlier_best_z = x_range[np.argmin(x_range[:,2])][2] # best z-score in range 248 outlier_best[2] = outlier_best_z 249 candidate_outliers = np.append(candidate_outliers, [outlier_best], axis=0) 250 251 # sort all the candidate outliers by z-score in descending order 252 candidate_outliers = candidate_outliers[np.argsort(candidate_outliers[:,2])][::-1] 253 for outlier_best in candidate_outliers: 254 if all(abs(outlier_best[1]-i) >= y_height for i in outlier_points[:,1]): 255 outlier_points = np.append(outlier_points, [[outlier_best[0], outlier_best[1]]], axis=0) 256 points = points[np.where(((points[:,0] <= (outlier_best[0] - x_width)) | (points[:,0] >= (outlier_best[0] + x_width))) & \ 257 ((points[:,1] <= (outlier_best[1] - y_height)) | (points[:,1] >= (outlier_best[1] + y_height))))] 258 points_added += 1 259 260 # terminating conditions (i think len(points) == 0 is all we need now) 261 if len(points) == 0 or ((outlier_z <= min_zscore) and points_added == 0): 262 break 263 264 outlier_z -= dz 265 266 #logger.info(f'Outlier Z value: {outlier_z}') 267 268 269 # TODO: what is this step 270 # sweep through and points to avoid picking concavity issues 271 outlier_min_mr = 1.0 272 273 # convert to a dict so we can delete in-place 274 #logger.info(f'Outlier points {outlier_points}') 275 outlier_points = {int(x[0]):x[1] for x in outlier_points} 276 #logger.info(f'Outlier points {outlier_points}') 277 outlier_keys = list(sorted(outlier_points.keys())) 278 for k in outlier_keys: 279 if outlier_points[k] > outlier_min_mr: 280 del outlier_points[k] 281 else: 282 outlier_min_mr = outlier_points[k] 283 284 # returns sorted list of cache sizes 285 if not plot: 286 #return map_index(points, outlier_points) 287 return np.array(list(sorted(outlier_points.keys()))) 288 else: 289 return (outlier_points, z_yd2)
Use our outlier method to find interesting points in an MRC.
Arguments:
- points (np.ndarray): numpy array with the points (x, y)
- dx (float): % of max cache size between points (default 0.05)
- dy (float): % of max - min miss ratio between points (default 0.05)
- dz (float): amount we decrease outlier_z every iteration (default 0.05)
- plot (bool): set True if you want to return data useful for plotting
- x_max (int): max cache size of original (pre-RDP) MRC (default None)
- y_max (list): [max, min] miss ratio of original (pre-RDP) MRC (default None)
Returns:
list: list with the knees x coordinate