Source code for hdbscan.prediction

# Support various prediction methods for predicting cluster membership
# of new or unseen points. There are several ways to interpret how
# to do this correctly, so we provide several methods for
# the different use cases that may arise.

import numpy as np

from sklearn.neighbors import KDTree, BallTree
from .dist_metrics import DistanceMetric
from ._hdbscan_tree import recurse_leaf_dfs
from ._prediction_utils import (get_tree_row_with_child,
                                dist_membership_vector,
                                outlier_membership_vector,
                                prob_in_some_cluster,
                                all_points_dist_membership_vector,
                                all_points_outlier_membership_vector,
                                all_points_prob_in_some_cluster)
from warnings import warn


class PredictionData(object):
    """
    Extra data that allows for faster prediction if cached.

    Parameters
    ----------
    data : array (n_samples, n_features)
        The original data set that was clustered.

    condensed_tree : CondensedTree
        The condensed tree object created by a clustering.

    min_samples : int
        The min_samples value used in clustering.

    tree_type : string, optional
        Which type of space tree to use for core distance computation.
        One of:
            * ``kdtree``
            * ``balltree``

    metric : string, optional
        The metric used to determine distance for the clustering.
        This is the metric that will be used for the space tree to
        determine core distances etc.

    **kwargs :
        Any further arguments to the metric.

    Attributes
    ----------
    raw_data : array (n_samples, n_features)
        The original data set that was clustered.

    tree : KDTree or BallTree
        A space partitioning tree that can be queried for nearest neighbors.

    core_distances : array (n_samples,)
        The core distances for every point in the original data set.

    cluster_map : dict
        A dictionary mapping cluster numbers in the condensed tree to labels
        in the final selected clustering.

    cluster_tree : structured array
        A version of the condensed tree that only contains clusters, not
        individual points.

    max_lambdas : dict
        A dictionary mapping cluster numbers in the condensed tree to the
        maximum lambda value seen in that cluster.
    """
    _tree_type_map = {'kdtree': KDTree, 'balltree': BallTree}

    def _clusters_below(self, cluster):
        result = []
        to_process = [cluster]

        while to_process:
            result.extend(to_process)
            to_process = \
                self.cluster_tree['child'][np.in1d(self.cluster_tree['parent'],
                                                   to_process)]
            to_process = to_process.tolist()

        return result

    def _recurse_leaf_dfs(self, current_node):
        children = self.cluster_tree[self.cluster_tree['parent'] ==
                                     current_node]['child']
        if len(children) == 0:
            return [current_node, ]
        else:
            return sum(
                [recurse_leaf_dfs(self.cluster_tree, child)
                 for child in children], [])

    def __init__(self, data, condensed_tree, min_samples,
                 tree_type='kdtree', metric='euclidean', **kwargs):
        self.raw_data = data.astype(np.float64)
        self.tree = self._tree_type_map[tree_type](self.raw_data,
                                                   metric=metric, **kwargs)
        self.core_distances = self.tree.query(data, k=min_samples)[0][:, -1]
        self.dist_metric = DistanceMetric.get_metric(metric, **kwargs)

        selected_clusters = sorted(condensed_tree._select_clusters())
        # raw_condensed_tree = condensed_tree.to_numpy()
        raw_condensed_tree = condensed_tree._raw_tree

        self.cluster_map = {c: n for n, c in
                            enumerate(sorted(list(selected_clusters)))}
        self.reverse_cluster_map = {n: c for c, n in self.cluster_map.items()}

        self.cluster_tree = raw_condensed_tree[
            raw_condensed_tree['child_size'] > 1]
        self.max_lambdas = {}
        self.leaf_max_lambdas = {}
        self.exemplars = []

        all_clusters = set(np.hstack([self.cluster_tree['parent'],
                                      self.cluster_tree['child']]))

        for cluster in all_clusters:
            self.leaf_max_lambdas[cluster] = raw_condensed_tree['lambda_val'][
                raw_condensed_tree['parent'] == cluster].max()

        for cluster in selected_clusters:
            self.max_lambdas[cluster] = \
                raw_condensed_tree['lambda_val'][
                    raw_condensed_tree['parent'] == cluster].max()

            for sub_cluster in self._clusters_below(cluster):
                self.cluster_map[sub_cluster] = self.cluster_map[cluster]
                self.max_lambdas[sub_cluster] = self.max_lambdas[cluster]

            cluster_exemplars = np.array([], dtype=np.int64)
            for leaf in self._recurse_leaf_dfs(cluster):
                leaf_max_lambda = raw_condensed_tree['lambda_val'][
                    raw_condensed_tree['parent'] == leaf].max()
                points = raw_condensed_tree['child'][
                    (raw_condensed_tree['parent'] == leaf) &
                    (raw_condensed_tree['lambda_val'] == leaf_max_lambda)]
                cluster_exemplars = np.hstack([cluster_exemplars, points])

            self.exemplars.append(self.raw_data[cluster_exemplars])
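
A ``PredictionData`` object is not usually constructed directly; fitting ``HDBSCAN`` with ``prediction_data=True`` (or calling ``generate_prediction_data()`` after the fact) caches one on the clusterer. A minimal sketch, using illustrative two-blob data:

import numpy as np
import hdbscan

rng = np.random.RandomState(42)
X = np.vstack([rng.normal(0, 1, (50, 2)), rng.normal(10, 1, (50, 2))])
clusterer = hdbscan.HDBSCAN(min_cluster_size=10, prediction_data=True).fit(X)

pdata = clusterer.prediction_data_
print(pdata.raw_data.shape)        # (100, 2): cached copy of the training data
print(pdata.core_distances.shape)  # (100,): core distance of each training point
print(len(pdata.exemplars))        # one array of exemplar points per cluster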
def _find_neighbor_and_lambda(neighbor_indices, neighbor_distances,
                              core_distances, min_samples):
    """
    Find the nearest mutual reachability neighbor of a point, and compute
    the associated lambda value for the point, given the mutual reachability
    distance to a nearest neighbor.

    Parameters
    ----------
    neighbor_indices : array (2 * min_samples, )
        An array of raw distance based nearest neighbor indices.

    neighbor_distances : array (2 * min_samples, )
        An array of raw distances to the nearest neighbors.

    core_distances : array (n_samples, )
        An array of core distances for all points.

    min_samples : int
        The min_samples value used to generate core distances.

    Returns
    -------
    neighbor : int
        The index into the full raw data set of the nearest mutual
        reachability distance neighbor of the point.

    lambda_ : float
        The lambda value at which this point joins/merges with `neighbor`.
    """
    neighbor_core_distances = core_distances[neighbor_indices]
    point_core_distances = neighbor_distances[min_samples] * np.ones(
        neighbor_indices.shape[0])
    mr_distances = np.vstack((
        neighbor_core_distances,
        point_core_distances,
        neighbor_distances
    )).max(axis=0)

    nn_index = mr_distances.argmin()

    nearest_neighbor = neighbor_indices[nn_index]
    if mr_distances[nn_index] > 0.0:
        lambda_ = 1. / mr_distances[nn_index]
    else:
        lambda_ = np.finfo(np.double).max

    return nearest_neighbor, lambda_


def _extend_condensed_tree(tree, neighbor_indices, neighbor_distances,
                           core_distances, min_samples):
    """
    Create a new condensed tree with an additional point added, allowing for
    computations as if this point had been part of the original tree. Note
    that this makes as little change to the tree as possible, with no
    re-optimizing/re-condensing so that the selected clusters remain
    effectively unchanged.

    Parameters
    ----------
    tree : structured array
        The raw format condensed tree to update.

    neighbor_indices : array (2 * min_samples, )
        An array of raw distance based nearest neighbor indices.

    neighbor_distances : array (2 * min_samples, )
        An array of raw distances to the nearest neighbors.

    core_distances : array (n_samples, )
        An array of core distances for all points.

    min_samples : int
        The min_samples value used to generate core distances.

    Returns
    -------
    new_tree : structured array
        The original tree with an extra row providing the parent cluster
        and lambda information for a new point given index -1.
    """
    tree_root = tree['parent'].min()

    nearest_neighbor, lambda_ = _find_neighbor_and_lambda(neighbor_indices,
                                                          neighbor_distances,
                                                          core_distances,
                                                          min_samples)

    neighbor_tree_row = get_tree_row_with_child(tree, nearest_neighbor)
    potential_cluster = neighbor_tree_row['parent']

    if neighbor_tree_row['lambda_val'] <= lambda_:
        # New point departs with the old
        new_tree_row = (potential_cluster, -1, 1,
                        neighbor_tree_row['lambda_val'])
    else:
        # Find appropriate cluster based on lambda of new point
        while potential_cluster > tree_root and \
                tree[tree['child'] ==
                     potential_cluster]['lambda_val'] >= lambda_:
            potential_cluster = tree['parent'][
                tree['child'] == potential_cluster][0]

        new_tree_row = (potential_cluster, -1, 1, lambda_)

    return np.append(tree, new_tree_row)


def _find_cluster_and_probability(tree, cluster_tree, neighbor_indices,
                                  neighbor_distances, core_distances,
                                  cluster_map, max_lambdas, min_samples):
    """
    Return the cluster label (of the original clustering) and membership
    probability of a new data point.

    Parameters
    ----------
    tree : CondensedTree
        The condensed tree associated with the clustering.

    cluster_tree : structured array
        The raw form of the condensed tree with only cluster information
        (no data on individual points). This is significantly more compact.

    neighbor_indices : array (2 * min_samples, )
        An array of raw distance based nearest neighbor indices.

    neighbor_distances : array (2 * min_samples, )
        An array of raw distances to the nearest neighbors.

    core_distances : array (n_samples, )
        An array of core distances for all points.

    cluster_map : dict
        A dictionary mapping cluster numbers in the condensed tree to labels
        in the final selected clustering.

    max_lambdas : dict
        A dictionary mapping cluster numbers in the condensed tree to the
        maximum lambda value seen in that cluster.

    min_samples : int
        The min_samples value used to generate core distances.

    Returns
    -------
    cluster_label : int
        The label of the cluster the new point is assigned to, or -1
        for noise.

    prob : float
        The probability of membership in the assigned cluster.
    """
    raw_tree = tree._raw_tree
    tree_root = cluster_tree['parent'].min()

    nearest_neighbor, lambda_ = _find_neighbor_and_lambda(neighbor_indices,
                                                          neighbor_distances,
                                                          core_distances,
                                                          min_samples)

    neighbor_tree_row = get_tree_row_with_child(raw_tree, nearest_neighbor)
    potential_cluster = neighbor_tree_row['parent']

    if neighbor_tree_row['lambda_val'] > lambda_:
        # Find appropriate cluster based on lambda of new point
        while potential_cluster > tree_root and \
                cluster_tree['lambda_val'][
                    cluster_tree['child'] == potential_cluster] >= lambda_:
            potential_cluster = cluster_tree['parent'][
                cluster_tree['child'] == potential_cluster][0]

    if potential_cluster in cluster_map:
        cluster_label = cluster_map[potential_cluster]
    else:
        cluster_label = -1

    if cluster_label >= 0:
        max_lambda = max_lambdas[potential_cluster]

        if max_lambda > 0.0:
            lambda_ = min(max_lambda, lambda_)
            prob = (lambda_ / max_lambda)
        else:
            prob = 1.0
    else:
        prob = 0.0

    return cluster_label, prob
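
The arithmetic in ``_find_neighbor_and_lambda`` is easy to check by hand. A standalone sketch with made-up distances and ``min_samples=2``: the mutual reachability distance to each neighbor is the elementwise maximum of the two core distances and the raw distance, and the lambda value is the reciprocal of the smallest of these.

import numpy as np

min_samples = 2
neighbor_distances = np.array([0.5, 0.8, 1.1, 1.4])  # 2 * min_samples raw distances
neighbor_core = np.array([0.9, 0.6, 1.0, 0.7])       # neighbors' core distances

# The new point's core distance is its distance to its min_samples-th neighbor.
point_core = neighbor_distances[min_samples]          # 1.1

# Mutual reachability distance to each neighbor.
mr = np.maximum.reduce([neighbor_core,
                        np.full_like(neighbor_core, point_core),
                        neighbor_distances])
print(mr)              # [1.1 1.1 1.1 1.4]
print(1.0 / mr.min())  # ~0.909: the lambda at which this point joins the tree

The nearest mutual reachability neighbor is then ``neighbor_indices[mr.argmin()]``, exactly as in the function above.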
def approximate_predict(clusterer, points_to_predict):
    """Predict the cluster label of new points. The returned labels
    will be those of the original clustering found by ``clusterer``,
    and therefore are not (necessarily) the cluster labels that would
    be found by clustering the original data combined with
    ``points_to_predict``, hence the 'approximate' label.

    If you simply wish to assign new points to an existing clustering
    in the 'best' way possible, this is the function to use. If you
    want to predict how ``points_to_predict`` would cluster with
    the original data under HDBSCAN, the most efficient existing approach
    is to simply recluster with the new point(s) added to the original
    dataset.

    Parameters
    ----------
    clusterer : HDBSCAN
        A clustering object that has been fit to the data and
        either had ``prediction_data=True`` set, or called the
        ``generate_prediction_data`` method after the fact.

    points_to_predict : array, or array-like (n_samples, n_features)
        The new data points to predict cluster labels for. They should
        have the same dimensionality as the original dataset over which
        clusterer was fit.

    Returns
    -------
    labels : array (n_samples,)
        The predicted labels of the ``points_to_predict``

    probabilities : array (n_samples,)
        The soft cluster scores for each of the ``points_to_predict``

    See Also
    --------
    :py:func:`hdbscan.prediction.membership_vector`
    :py:func:`hdbscan.prediction.all_points_membership_vectors`
    """
    if clusterer.prediction_data_ is None:
        raise ValueError('Clusterer does not have prediction data!'
                         ' Try fitting with prediction_data=True set,'
                         ' or run generate_prediction_data on the clusterer')

    points_to_predict = np.asarray(points_to_predict)

    if points_to_predict.shape[1] != \
            clusterer.prediction_data_.raw_data.shape[1]:
        raise ValueError('New points dimension does not match fit data!')

    if clusterer.prediction_data_.cluster_tree.shape[0] == 0:
        warn('Clusterer does not have any defined clusters, new data'
             ' will be automatically predicted as noise.')
        labels = -1 * np.ones(points_to_predict.shape[0], dtype=np.int32)
        probabilities = np.zeros(points_to_predict.shape[0], dtype=np.float32)
        return labels, probabilities

    labels = np.empty(points_to_predict.shape[0], dtype=np.int32)
    probabilities = np.empty(points_to_predict.shape[0], dtype=np.float64)

    min_samples = clusterer.min_samples or clusterer.min_cluster_size
    neighbor_distances, neighbor_indices = \
        clusterer.prediction_data_.tree.query(points_to_predict,
                                              k=2 * min_samples)

    for i in range(points_to_predict.shape[0]):
        label, prob = _find_cluster_and_probability(
            clusterer.condensed_tree_,
            clusterer.prediction_data_.cluster_tree,
            neighbor_indices[i],
            neighbor_distances[i],
            clusterer.prediction_data_.core_distances,
            clusterer.prediction_data_.cluster_map,
            clusterer.prediction_data_.max_lambdas,
            min_samples
        )
        labels[i] = label
        probabilities[i] = prob

    return labels, probabilities
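
A typical round trip with ``approximate_predict``, on illustrative two-blob data: fit once with prediction data cached, then assign labels to points that arrive later. The exact labels depend on the fit; the comments indicate the expected pattern.

import numpy as np
import hdbscan

rng = np.random.RandomState(0)
X = np.vstack([rng.normal(0, 1, (50, 2)), rng.normal(10, 1, (50, 2))])
clusterer = hdbscan.HDBSCAN(min_cluster_size=10, prediction_data=True).fit(X)

new_points = np.array([[0.2, -0.1], [10.3, 9.8], [5.0, 5.0]])
labels, probabilities = hdbscan.approximate_predict(clusterer, new_points)
print(labels)         # e.g. [0 1 -1]: the mid-point is likely labelled noise
print(probabilities)  # membership strength of each assignment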
def approximate_predict_scores(clusterer, points_to_predict):
    """Predict the outlier score of new points. The returned scores
    will be based on the original clustering found by ``clusterer``,
    and therefore are not (necessarily) the outlier scores that would
    be found by clustering the original data combined with
    ``points_to_predict``, hence the 'approximate' label.

    If you simply wish to calculate the outlier scores for new points
    in the 'best' way possible, this is the function to use. If you
    want to predict the outlier score of ``points_to_predict`` with
    the original data under HDBSCAN, the most efficient existing approach
    is to simply recluster with the new point(s) added to the original
    dataset.

    Parameters
    ----------
    clusterer : HDBSCAN
        A clustering object that has been fit to the data and
        either had ``prediction_data=True`` set, or called the
        ``generate_prediction_data`` method after the fact.

    points_to_predict : array, or array-like (n_samples, n_features)
        The new data points to predict outlier scores for. They should
        have the same dimensionality as the original dataset over which
        clusterer was fit.

    Returns
    -------
    scores : array (n_samples,)
        The predicted scores of the ``points_to_predict``

    See Also
    --------
    :py:func:`hdbscan.prediction.membership_vector`
    :py:func:`hdbscan.prediction.all_points_membership_vectors`
    """
    try:
        clusterer.prediction_data_
    except AttributeError:
        raise ValueError('Clusterer does not have prediction data!'
                         ' Try fitting with prediction_data=True set,'
                         ' or run generate_prediction_data on the clusterer')

    points_to_predict = np.asarray(points_to_predict)

    if points_to_predict.shape[1] != \
            clusterer.prediction_data_.raw_data.shape[1]:
        raise ValueError('New points dimension does not match fit data!')

    if clusterer.prediction_data_.cluster_tree.shape[0] == 0:
        warn('Clusterer does not have any defined clusters, new data'
             ' will be automatically predicted as outliers.')
        # All points get the maximal outlier score of 1.0
        scores = np.ones(points_to_predict.shape[0], dtype=np.float64)
        return scores

    scores = np.empty(points_to_predict.shape[0], dtype=np.float64)

    min_samples = clusterer.min_samples or clusterer.min_cluster_size
    neighbor_distances, neighbor_indices = \
        clusterer.prediction_data_.tree.query(points_to_predict,
                                              k=2 * min_samples)

    tree = clusterer.condensed_tree_._raw_tree

    parent_array = tree['parent']

    tree_root = parent_array.min()
    max_lambdas = {}
    for parent in np.unique(tree['parent']):
        max_lambdas[parent] = \
            tree[tree['parent'] == parent]['lambda_val'].max()

    for n in np.argsort(parent_array):
        cluster = tree['child'][n]
        if cluster < tree_root:
            break

        parent = parent_array[n]
        if max_lambdas[cluster] > max_lambdas[parent]:
            max_lambdas[parent] = max_lambdas[cluster]

    for i in range(points_to_predict.shape[0]):
        neigh, lambda_ = _find_neighbor_and_lambda(
            neighbor_indices[i],
            neighbor_distances[i],
            clusterer.prediction_data_.core_distances,
            min_samples
        )

        neighbor_tree_row = get_tree_row_with_child(tree, neigh)
        potential_cluster = neighbor_tree_row['parent']

        if neighbor_distances[i].min() == 0:
            # The point is in the dataset, fix lambda for rounding errors
            lambda_ = neighbor_tree_row['lambda_val']

        max_lambda = max_lambdas[potential_cluster]
        if max_lambda > 0.0:
            scores[i] = (max_lambda - lambda_) / max_lambda
        else:
            scores[i] = 0.0

    return scores
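
The same setup works for outlier scores. This sketch reuses the illustrative two-blob fit from the previous example; scores near 0 indicate points deep inside a cluster, scores near 1 indicate likely outliers.

import numpy as np
import hdbscan
from hdbscan.prediction import approximate_predict_scores

rng = np.random.RandomState(0)
X = np.vstack([rng.normal(0, 1, (50, 2)), rng.normal(10, 1, (50, 2))])
clusterer = hdbscan.HDBSCAN(min_cluster_size=10, prediction_data=True).fit(X)

new_points = np.array([[0.1, 0.2], [5.0, 5.0]])
scores = approximate_predict_scores(clusterer, new_points)
print(scores)  # low score for the in-cluster point, high for the stray one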
def membership_vector(clusterer, points_to_predict):
    """Predict soft cluster membership. The result produces a vector
    for each point in ``points_to_predict`` that gives a probability that
    the given point is a member of a cluster for each of the selected
    clusters of the ``clusterer``.

    Parameters
    ----------
    clusterer : HDBSCAN
        A clustering object that has been fit to the data and
        either had ``prediction_data=True`` set, or called the
        ``generate_prediction_data`` method after the fact.

    points_to_predict : array, or array-like (n_samples, n_features)
        The new data points to predict cluster labels for. They should
        have the same dimensionality as the original dataset over which
        clusterer was fit.

    Returns
    -------
    membership_vectors : array (n_samples, n_clusters)
        The probability that point ``i`` is a member of cluster ``j`` is
        in ``membership_vectors[i, j]``.

    See Also
    --------
    :py:func:`hdbscan.prediction.approximate_predict`
    :py:func:`hdbscan.prediction.all_points_membership_vectors`
    """
    points_to_predict = points_to_predict.astype(np.float64)
    clusters = np.array(
        sorted(list(clusterer.condensed_tree_._select_clusters()))
    ).astype(np.intp)

    result = np.empty((points_to_predict.shape[0], clusters.shape[0]),
                      dtype=np.float64)

    min_samples = clusterer.min_samples or clusterer.min_cluster_size
    neighbor_distances, neighbor_indices = \
        clusterer.prediction_data_.tree.query(points_to_predict,
                                              k=2 * min_samples)

    for i in range(points_to_predict.shape[0]):

        # We need to find where in the tree the new point would go
        # for the purposes of outlier membership approximation
        nearest_neighbor, lambda_ = \
            _find_neighbor_and_lambda(
                neighbor_indices[i],
                neighbor_distances[i],
                clusterer.prediction_data_.core_distances,
                min_samples)

        neighbor_tree_row = get_tree_row_with_child(
            clusterer.condensed_tree_._raw_tree, nearest_neighbor)

        if neighbor_tree_row['lambda_val'] <= lambda_:
            lambda_ = neighbor_tree_row['lambda_val']

        distance_vec = dist_membership_vector(
            points_to_predict[i],
            clusterer.prediction_data_.exemplars,
            clusterer.prediction_data_.dist_metric)
        outlier_vec = outlier_membership_vector(
            nearest_neighbor,
            lambda_,
            clusters,
            clusterer.condensed_tree_._raw_tree,
            clusterer.prediction_data_.leaf_max_lambdas,
            clusterer.prediction_data_.cluster_tree)

        result[i] = distance_vec ** 0.5 * outlier_vec ** 2.0
        result[i] /= result[i].sum()

        result[i] *= prob_in_some_cluster(
            nearest_neighbor,
            lambda_,
            clusters,
            clusterer.condensed_tree_._raw_tree,
            clusterer.prediction_data_.leaf_max_lambdas,
            clusterer.prediction_data_.cluster_tree)

    return result
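
A short sketch of soft membership prediction on the same kind of illustrative data. Each row of the result sums to at most 1; the shortfall is the estimated probability that the point belongs to no cluster at all.

import numpy as np
import hdbscan

rng = np.random.RandomState(0)
X = np.vstack([rng.normal(0, 1, (50, 2)), rng.normal(10, 1, (50, 2))])
clusterer = hdbscan.HDBSCAN(min_cluster_size=10, prediction_data=True).fit(X)

new_points = np.array([[0.0, 0.0], [5.0, 5.0]])
vectors = hdbscan.membership_vector(clusterer, new_points)
print(vectors.shape)        # (2, n_clusters)
print(vectors.sum(axis=1))  # <= 1 per row; the remainder is the noise mass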
def all_points_membership_vectors(clusterer):
    """Predict soft cluster membership vectors for all points in the
    original dataset the clusterer was trained on. This function is more
    efficient by making use of the fact that all points are already in the
    condensed tree, and processing in bulk.

    Parameters
    ----------
    clusterer : HDBSCAN
        A clustering object that has been fit to the data and
        either had ``prediction_data=True`` set, or called the
        ``generate_prediction_data`` method after the fact.
        This method does not work if the clusterer was trained
        with ``metric='precomputed'``.

    Returns
    -------
    membership_vectors : array (n_samples, n_clusters)
        The probability that point ``i`` of the original dataset is a member
        of cluster ``j`` is in ``membership_vectors[i, j]``.

    See Also
    --------
    :py:func:`hdbscan.prediction.approximate_predict`
    :py:func:`hdbscan.prediction.membership_vector`
    """
    clusters = np.array(
        sorted(list(clusterer.condensed_tree_._select_clusters()))
    ).astype(np.intp)
    all_points = clusterer.prediction_data_.raw_data

    # When no clusters found, return array of 0's
    if clusters.size == 0:
        return np.zeros(all_points.shape[0])

    distance_vecs = all_points_dist_membership_vector(
        all_points,
        clusterer.prediction_data_.exemplars,
        clusterer.prediction_data_.dist_metric)
    outlier_vecs = all_points_outlier_membership_vector(
        clusters,
        clusterer.condensed_tree_._raw_tree,
        clusterer.prediction_data_.leaf_max_lambdas,
        clusterer.prediction_data_.cluster_tree)
    in_cluster_probs = all_points_prob_in_some_cluster(
        clusters,
        clusterer.condensed_tree_._raw_tree,
        clusterer.prediction_data_.leaf_max_lambdas,
        clusterer.prediction_data_.cluster_tree)

    result = distance_vecs * outlier_vecs
    row_sums = result.sum(axis=1)
    result = result / row_sums[:, np.newaxis]
    result *= in_cluster_probs[:, np.newaxis]

    return result
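
The bulk variant applied to the training data itself, again on illustrative data. Taking the ``argmax`` of each row yields a 'soft' relabeling of the original points, which can differ from ``clusterer.labels_`` for borderline points.

import numpy as np
import hdbscan

rng = np.random.RandomState(0)
X = np.vstack([rng.normal(0, 1, (50, 2)), rng.normal(10, 1, (50, 2))])
clusterer = hdbscan.HDBSCAN(min_cluster_size=10, prediction_data=True).fit(X)

membership = hdbscan.all_points_membership_vectors(clusterer)
print(membership.shape)                  # (100, n_clusters)
soft_labels = membership.argmax(axis=1)  # most probable cluster per point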