Source code for hdbscan.plots

# -*- coding: utf-8 -*-
# Author: Leland McInnes <leland.mcinnes@gmail.com>
#
# License: BSD 3 clause

import numpy as np

from scipy.cluster.hierarchy import dendrogram
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import pairwise_distances
from warnings import warn
from ._hdbscan_tree import compute_stability, labelling_at_cut, recurse_leaf_dfs

CB_LEFT = 0
CB_RIGHT = 1
CB_BOTTOM = 2
CB_TOP = 3


def _bfs_from_cluster_tree(tree, bfs_root):
    """
    Perform a breadth first search on a tree in condensed tree format
    """

    result = []
    to_process = [bfs_root]

    while to_process:
        result.extend(to_process)
        to_process = tree['child'][np.in1d(tree['parent'], to_process)].tolist()

    return result

def _recurse_leaf_dfs(cluster_tree, current_node):
    children = cluster_tree[cluster_tree['parent'] == current_node]['child']
    if len(children) == 0:
        return [current_node,]
    else:
        return sum([recurse_leaf_dfs(cluster_tree, child) for child in children], [])

def _get_leaves(condensed_tree):
    cluster_tree = condensed_tree[condensed_tree['child_size'] > 1]
    if cluster_tree.shape[0] == 0:
        # Return the only cluster, the root
        return [condensed_tree['parent'].min()]

    root = cluster_tree['parent'].min()
    return _recurse_leaf_dfs(cluster_tree, root)


[docs]
class CondensedTree(object):
    """The condensed tree structure, which provides a simplified or smoothed version
    of the :class:`~hdbscan.plots.SingleLinkageTree`.

    Parameters
    ----------
    condensed_tree_array : numpy recarray from :class:`~hdbscan.HDBSCAN`
        The raw numpy rec array version of the condensed tree as produced
        internally by hdbscan.

    cluster_selection_method : string, optional (default 'eom')
        The method of selecting clusters. One of 'eom' or 'leaf'

    allow_single_cluster : Boolean, optional (default False)
        Whether to allow the root cluster as the only selected cluster

    """
    def __init__(self, condensed_tree_array, cluster_selection_method='eom',
                 allow_single_cluster=False):
        self._raw_tree = condensed_tree_array
        self.cluster_selection_method = cluster_selection_method
        self.allow_single_cluster = allow_single_cluster


[docs]
    def get_plot_data(self,
                      leaf_separation=1,
                      log_size=False,
                      max_rectangle_per_icicle=20):
        """Generates data for use in plotting the 'icicle plot' or dendrogram
        plot of the condensed tree generated by HDBSCAN.

        Parameters
        ----------
        leaf_separation : float, optional
                          How far apart to space the final leaves of the
                          dendrogram. (default 1)

        log_size : boolean, optional
                   Use log scale for the 'size' of clusters (i.e. number of
                   points in the cluster at a given lambda value).
                   (default False)

        max_rectangles_per_icicle : int, optional
            To simplify the plot this method will only emit
            ``max_rectangles_per_icicle`` bars per branch of the dendrogram.
            This ensures that we don't suffer from massive overplotting in
            cases with a lot of data points.

        Returns
        -------
        plot_data : dict
                    Data associated to bars in a bar plot:
                        `bar_centers` x coordinate centers for bars
                        `bar_tops` heights of bars in lambda scale
                        `bar_bottoms` y coordinate of bottoms of bars
                        `bar_widths` widths of the bars (in x coord scale)
                        `bar_bounds` a 4-tuple of [left, right, bottom, top]
                                     giving the bounds on a full set of
                                     cluster bars
                    Data associates with cluster splits:
                        `line_xs` x coordinates for horizontal dendrogram lines
                        `line_ys` y coordinates for horizontal dendrogram lines
        """
        leaves = _get_leaves(self._raw_tree)
        last_leaf = self._raw_tree['parent'].max()
        root = self._raw_tree['parent'].min()

        # We want to get the x and y coordinates for the start of each cluster
        # Initialize the leaves, since we know where they go, the iterate
        # through everything from the leaves back, setting coords as we go
        if isinstance(leaves, np.int64):
            cluster_x_coords = {leaves: leaf_separation}
        else:
            cluster_x_coords = dict(zip(leaves, [leaf_separation * x
                                                 for x in range(len(leaves))]))
        cluster_y_coords = {root: 0.0}

        for cluster in range(last_leaf, root - 1, -1):
            split = self._raw_tree[['child', 'lambda_val']]
            split = split[(self._raw_tree['parent'] == cluster) &
                          (self._raw_tree['child_size'] > 1)]
            if len(split['child']) > 1:
                left_child, right_child = split['child']
                cluster_x_coords[cluster] = np.mean([cluster_x_coords[left_child],
                                                     cluster_x_coords[right_child]])
                cluster_y_coords[left_child] = split['lambda_val'][0]
                cluster_y_coords[right_child] = split['lambda_val'][1]

        # We use bars to plot the 'icicles', so we need to generate centers, tops,
        # bottoms and widths for each rectangle. We can go through each cluster
        # and do this for each in turn.
        bar_centers = []
        bar_tops = []
        bar_bottoms = []
        bar_widths = []

        cluster_bounds = {}

        scaling = np.sum(self._raw_tree[self._raw_tree['parent'] == root]['child_size'])

        if log_size:
            scaling = np.log(scaling)

        for c in range(last_leaf, root - 1, -1):

            cluster_bounds[c] = [0, 0, 0, 0]

            c_children = self._raw_tree[self._raw_tree['parent'] == c]
            current_size = np.sum(c_children['child_size'])
            current_lambda = cluster_y_coords[c]
            cluster_max_size = current_size
            cluster_max_lambda = c_children['lambda_val'].max()
            cluster_min_size = np.sum(
                c_children[c_children['lambda_val'] ==
                           cluster_max_lambda]['child_size'])

            if log_size:
                current_size = np.log(current_size)
                cluster_max_size = np.log(cluster_max_size)
                cluster_min_size = np.log(cluster_min_size)

            total_size_change = float(cluster_max_size - cluster_min_size)
            step_size_change = total_size_change / max_rectangle_per_icicle

            cluster_bounds[c][CB_LEFT] = cluster_x_coords[c] * scaling - (current_size / 2.0)
            cluster_bounds[c][CB_RIGHT] = cluster_x_coords[c] * scaling + (current_size / 2.0)
            cluster_bounds[c][CB_BOTTOM] = cluster_y_coords[c]
            cluster_bounds[c][CB_TOP] = np.max(c_children['lambda_val'])

            last_step_size = current_size
            last_step_lambda = current_lambda

            for i in np.argsort(c_children['lambda_val']):
                row = c_children[i]
                if row['lambda_val'] != current_lambda and \
                        (last_step_size - current_size > step_size_change
                        or row['lambda_val'] == cluster_max_lambda):
                    bar_centers.append(cluster_x_coords[c] * scaling)
                    bar_tops.append(row['lambda_val'] - last_step_lambda)
                    bar_bottoms.append(last_step_lambda)
                    bar_widths.append(last_step_size)
                    last_step_size = current_size
                    last_step_lambda = current_lambda
                if log_size:
                    exp_size = np.exp(current_size) - row['child_size']
                    # Ensure we don't try to take log of zero
                    if exp_size > 0.01:
                        current_size = np.log(np.exp(current_size) - row['child_size'])
                    else:
                        current_size = 0.0
                else:
                    current_size -= row['child_size']
                current_lambda = row['lambda_val']

        # Finally we need the horizontal lines that occur at cluster splits.
        line_xs = []
        line_ys = []

        for row in self._raw_tree[self._raw_tree['child_size'] > 1]:
            parent = row['parent']
            child = row['child']
            child_size = row['child_size']
            if log_size:
                child_size = np.log(child_size)
            sign = np.sign(cluster_x_coords[child] - cluster_x_coords[parent])
            line_xs.append([
                cluster_x_coords[parent] * scaling,
                cluster_x_coords[child] * scaling + sign * (child_size / 2.0)
            ])
            line_ys.append([
                cluster_y_coords[child],
                cluster_y_coords[child]
            ])

        return {
            'bar_centers': bar_centers,
            'bar_tops': bar_tops,
            'bar_bottoms': bar_bottoms,
            'bar_widths': bar_widths,
            'line_xs': line_xs,
            'line_ys': line_ys,
            'cluster_bounds': cluster_bounds
        }


    def _select_clusters(self):
        if self.cluster_selection_method == 'eom':
            stability = compute_stability(self._raw_tree)
            if self.allow_single_cluster:
                node_list = sorted(stability.keys(), reverse=True)
            else:
                node_list = sorted(stability.keys(), reverse=True)[:-1]
            cluster_tree = self._raw_tree[self._raw_tree['child_size'] > 1]
            is_cluster = {cluster: True for cluster in node_list}

            for node in node_list:
                child_selection = (cluster_tree['parent'] == node)
                subtree_stability = np.sum([stability[child] for
                                            child in cluster_tree['child'][child_selection]])

                if subtree_stability > stability[node]:
                    is_cluster[node] = False
                    stability[node] = subtree_stability
                else:
                    for sub_node in _bfs_from_cluster_tree(cluster_tree, node):
                        if sub_node != node:
                            is_cluster[sub_node] = False

            return sorted([cluster
                           for cluster in is_cluster
                           if is_cluster[cluster]])

        elif self.cluster_selection_method == 'leaf':
            return _get_leaves(self._raw_tree)
        else:
            raise ValueError('Invalid Cluster Selection Method: %s\n'
                             'Should be one of: "eom", "leaf"\n')


[docs]
    def plot(self, leaf_separation=1, cmap='viridis', select_clusters=False,
             label_clusters=False, selection_palette=None,
             axis=None, colorbar=True, log_size=False,
             max_rectangles_per_icicle=20):
        """Use matplotlib to plot an 'icicle plot' dendrogram of the condensed tree.

        Effectively this is a dendrogram where the width of each cluster bar is
        equal to the number of points (or log of the number of points) in the cluster
        at the given lambda value. Thus bars narrow as points progressively drop
        out of clusters. The make the effect more apparent the bars are also colored
        according the the number of points (or log of the number of points).

        Parameters
        ----------
        leaf_separation : float, optional (default 1)
                          How far apart to space the final leaves of the
                          dendrogram.

        cmap : string or matplotlib colormap, optional (default viridis)
               The matplotlib colormap to use to color the cluster bars.


        select_clusters : boolean, optional (default False)
                          Whether to draw ovals highlighting which cluster
                          bar represent the clusters that were selected by
                          HDBSCAN as the final clusters.

        label_clusters : boolean, optional (default False)
                         If select_clusters is True then this determines
                         whether to draw text labels on the clusters.

        selection_palette : list of colors, optional (default None)
                            If not None, and at least as long as
                            the number of clusters, draw ovals
                            in colors iterating through this palette.
                            This can aid in cluster identification
                            when plotting.

        axis : matplotlib axis or None, optional (default None)
               The matplotlib axis to render to. If None then a new axis
               will be generated. The rendered axis will be returned.


        colorbar : boolean, optional (default True)
                   Whether to draw a matplotlib colorbar displaying the range
                   of cluster sizes as per the colormap.

        log_size : boolean, optional (default False)
                   Use log scale for the 'size' of clusters (i.e. number of
                   points in the cluster at a given lambda value).


        max_rectangles_per_icicle : int, optional (default 20)
            To simplify the plot this method will only emit
            ``max_rectangles_per_icicle`` bars per branch of the dendrogram.
            This ensures that we don't suffer from massive overplotting in
            cases with a lot of data points.

         Returns
        -------
        axis : matplotlib axis
               The axis on which the 'icicle plot' has been rendered.
        """
        try:
            import matplotlib.pyplot as plt
        except ImportError:
            raise ImportError(
                'You must install the matplotlib library to plot the condensed tree.'
                'Use get_plot_data to calculate the relevant data without plotting.')

        plot_data = self.get_plot_data(leaf_separation=leaf_separation,
                                       log_size=log_size,
                                       max_rectangle_per_icicle=max_rectangles_per_icicle)

        if cmap != 'none':
            sm = plt.cm.ScalarMappable(cmap=cmap,
                                       norm=plt.Normalize(0, max(plot_data['bar_widths'])))
            sm.set_array(plot_data['bar_widths'])
            bar_colors = [sm.to_rgba(x) for x in plot_data['bar_widths']]
        else:
            bar_colors = 'black'

        if axis is None:
            axis = plt.gca()

        axis.bar(
            plot_data['bar_centers'],
            plot_data['bar_tops'],
            bottom=plot_data['bar_bottoms'],
            width=plot_data['bar_widths'],
            color=bar_colors,
            align='center',
            linewidth=0
        )

        drawlines = []
        for xs, ys in zip(plot_data['line_xs'], plot_data['line_ys']):
            drawlines.append(xs)
            drawlines.append(ys)
        axis.plot(*drawlines, color='black', linewidth=1)
        # for xs, ys in zip(plot_data['line_xs'], plot_data['line_ys']):
        #     axis.plot(xs, ys, color='black', linewidth=1)

        if select_clusters:
            try:
                from matplotlib.patches import Ellipse
            except ImportError:
                raise ImportError('You must have matplotlib.patches available to plot selected clusters.')

            chosen_clusters = self._select_clusters()
            
            # Extract the chosen cluster bounds. If enough duplicate data points exist in the
            # data the lambda value might be infinite. This breaks labeling and highlighting
            # the chosen clusters.
            cluster_bounds = np.array([ plot_data['cluster_bounds'][c] for c in chosen_clusters ])
            if not np.isfinite(cluster_bounds).all():
                warn('Infinite lambda values encountered in chosen clusters.'
                     ' This might be due to duplicates in the data.')

            # Extract the plot range of the y-axis and set default center and height values for ellipses.
            # Extremly dense clusters might result in near infinite lambda values. Setting max_height
            # based on the percentile should alleviate the impact on plotting.
            plot_range = np.hstack([plot_data['bar_tops'], plot_data['bar_bottoms']])
            plot_range = plot_range[np.isfinite(plot_range)]
            mean_y_center = np.mean([np.max(plot_range), np.min(plot_range)])
            max_height = np.diff(np.percentile(plot_range, q=[10,90]))

            for i, c in enumerate(chosen_clusters):
                c_bounds = plot_data['cluster_bounds'][c]
                width = (c_bounds[CB_RIGHT] - c_bounds[CB_LEFT])
                height = (c_bounds[CB_TOP] - c_bounds[CB_BOTTOM])
                center = (
                    np.mean([c_bounds[CB_LEFT], c_bounds[CB_RIGHT]]),
                    np.mean([c_bounds[CB_TOP], c_bounds[CB_BOTTOM]]),
                )
                
                # Set center and height to default values if necessary
                if not np.isfinite(center[1]):
                    center = (center[0], mean_y_center)
                if not np.isfinite(height):
                    height = max_height

                # Ensure the ellipse is visible
                min_height = 0.1*max_height
                if height < min_height:
                    height = min_height

                if selection_palette is not None and \
                        len(selection_palette) >= len(chosen_clusters):
                    oval_color = selection_palette[i]
                else:
                    oval_color = 'r'

                box = Ellipse(
                    center,
                    2.0 * width,
                    1.2 * height,
                    facecolor='none',
                    edgecolor=oval_color,
                    linewidth=2
                )

                if label_clusters:
                    axis.annotate(str(i), xy=center,
                                  xytext=(center[0] - 4.0 * width, center[1] + 0.65 * height),
                                  horizontalalignment='left',
                                  verticalalignment='bottom')

                axis.add_artist(box)

        if colorbar:
            cb = plt.colorbar(sm, ax=axis)
            if log_size:
                cb.ax.set_ylabel('log(Number of points)')
            else:
                cb.ax.set_ylabel('Number of points')

        axis.set_xticks([])
        for side in ('right', 'top', 'bottom'):
            axis.spines[side].set_visible(False)
        axis.invert_yaxis()
        axis.set_ylabel('$\lambda$ value')

        return axis



[docs]
    def to_numpy(self):
        """Return a numpy structured array representation of the condensed tree.
        """
        return self._raw_tree.copy()



[docs]
    def to_pandas(self):
        """Return a pandas dataframe representation of the condensed tree.

        Each row of the dataframe corresponds to an edge in the tree.
        The columns of the dataframe are `parent`, `child`, `lambda_val`
        and `child_size`.

        The `parent` and `child` are the ids of the
        parent and child nodes in the tree. Node ids less than the number
        of points in the original dataset represent individual points, while
        ids greater than the number of points are clusters.

        The `lambda_val` value is the value (1/distance) at which the `child`
        node leaves the cluster.

        The `child_size` is the number of points in the `child` node.
        """
        try:
            from pandas import DataFrame, Series
        except ImportError:
            raise ImportError('You must have pandas installed to export pandas DataFrames')

        result = DataFrame(self._raw_tree)

        return result



[docs]
    def to_networkx(self):
        """Return a NetworkX DiGraph object representing the condensed tree.

        Edge weights in the graph are the lamba values at which child nodes
        'leave' the parent cluster.

        Nodes have a `size` attribute attached giving the number of points
        that are in the cluster (or 1 if it is a singleton point) at the
        point of cluster creation (fewer points may be in the cluster at
        larger lambda values).
        """
        try:
            from networkx import DiGraph, set_node_attributes
        except ImportError:
            raise ImportError('You must have networkx installed to export networkx graphs')

        result = DiGraph()
        for row in self._raw_tree:
            result.add_edge(row['parent'], row['child'], weight=row['lambda_val'])

        set_node_attributes(result, dict(self._raw_tree[['child', 'child_size']]), 'size')

        return result




def _get_dendrogram_ordering(parent, linkage, root):

    if parent < root:
        return []

    return _get_dendrogram_ordering(int(linkage[parent-root][0]), linkage, root) + \
            _get_dendrogram_ordering(int(linkage[parent-root][1]), linkage, root) + [parent]

def _calculate_linewidths(ordering, linkage, root):

    linewidths = []

    for x in ordering:
        if linkage[x - root][0] >= root:
            left_width = linkage[int(linkage[x - root][0]) - root][3]
        else:
            left_width = 1

        if linkage[x - root][1] >= root:
            right_width = linkage[int(linkage[x - root][1]) - root][3]
        else:
            right_width = 1

        linewidths.append((left_width, right_width))

    return linewidths



[docs]
class SingleLinkageTree(object):
    """A single linkage format dendrogram tree, with plotting functionality
    and networkX support.

    Parameters
    ----------
    linkage : ndarray (n_samples, 4)
        The numpy array that holds the tree structure. As output by
        scipy.cluster.hierarchy, hdbscan, of fastcluster.

    """
    def __init__(self, linkage):
        self._linkage = linkage


[docs]
    def plot(self, axis=None, truncate_mode=None, p=0, vary_line_width=True,
             cmap='viridis', colorbar=True):
        """Plot a dendrogram of the single linkage tree.

        Parameters
        ----------
        truncate_mode : str, optional
                        The dendrogram can be hard to read when the original
                        observation matrix from which the linkage is derived
                        is large. Truncation is used to condense the dendrogram.
                        There are several modes:

        ``None/'none'``
                No truncation is performed (Default).

        ``'lastp'``
                The last p non-singleton formed in the linkage are the only
                non-leaf nodes in the linkage; they correspond to rows
                Z[n-p-2:end] in Z. All other non-singleton clusters are
                contracted into leaf nodes.

        ``'level'/'mtica'``
                No more than p levels of the dendrogram tree are displayed.
                This corresponds to Mathematica(TM) behavior.

        p : int, optional
            The ``p`` parameter for ``truncate_mode``.

        vary_line_width : boolean, optional
            Draw downward branches of the dendrogram with line thickness that
            varies depending on the size of the cluster.

        cmap : string or matplotlib colormap, optional
               The matplotlib colormap to use to color the cluster bars.
               A value of 'none' will result in black bars.
               (default 'viridis')

        colorbar : boolean, optional
                   Whether to draw a matplotlib colorbar displaying the range
                   of cluster sizes as per the colormap. (default True)

        Returns
        -------
        axis : matplotlib axis
               The axis on which the dendrogram plot has been rendered.

        """
        dendrogram_data = dendrogram(self._linkage, p=p, truncate_mode=truncate_mode, no_plot=True)
        X = dendrogram_data['icoord']
        Y = dendrogram_data['dcoord']

        try:
            import matplotlib.pyplot as plt
        except ImportError:
            raise ImportError('You must install the matplotlib library to plot the single linkage tree.')

        if axis is None:
            axis = plt.gca()

        if vary_line_width:
            dendrogram_ordering = _get_dendrogram_ordering(2 * len(self._linkage), self._linkage, len(self._linkage) + 1)
            linewidths = _calculate_linewidths(dendrogram_ordering, self._linkage, len(self._linkage) + 1)
        else:
            linewidths = [(1.0, 1.0)] * len(Y)

        if cmap != 'none':
            color_array = np.log2(np.array(linewidths).flatten())
            sm = plt.cm.ScalarMappable(cmap=cmap,
                                       norm=plt.Normalize(0, color_array.max()))
            sm.set_array(color_array)

        for x, y, lw in zip(X, Y, linewidths):
            left_x = x[:2]
            right_x = x[2:]
            left_y = y[:2]
            right_y = y[2:]
            horizontal_x = x[1:3]
            horizontal_y = y[1:3]

            if cmap != 'none':
                axis.plot(left_x, left_y, color=sm.to_rgba(np.log2(lw[0])),
                          linewidth=np.log2(1 + lw[0]),
                          solid_joinstyle='miter', solid_capstyle='butt')
                axis.plot(right_x, right_y, color=sm.to_rgba(np.log2(lw[1])),
                          linewidth=np.log2(1 + lw[1]),
                          solid_joinstyle='miter', solid_capstyle='butt')
            else:
                axis.plot(left_x, left_y, color='k',
                          linewidth=np.log2(1 + lw[0]),
                          solid_joinstyle='miter', solid_capstyle='butt')
                axis.plot(right_x, right_y, color='k',
                          linewidth=np.log2(1 + lw[1]),
                          solid_joinstyle='miter', solid_capstyle='butt')

            axis.plot(horizontal_x, horizontal_y, color='k', linewidth=1.0,
                      solid_joinstyle='miter', solid_capstyle='butt')

        if colorbar:
            cb = plt.colorbar(sm, ax=axis)
            cb.ax.set_ylabel('log(Number of points)')

        axis.set_xticks([])
        for side in ('right', 'top', 'bottom'):
            axis.spines[side].set_visible(False)
        axis.set_ylabel('distance')

        return axis



[docs]
    def to_numpy(self):
        """Return a numpy array representation of the single linkage tree.

        This representation conforms to the scipy.cluster.hierarchy notion
        of a single linkage tree, and can be used with all the associated
        scipy tools. Please see the scipy documentation for more details
        on the format.
        """
        return self._linkage.copy()




[docs]
    def to_pandas(self):
        """Return a pandas dataframe representation of the single linkage tree.

        Each row of the dataframe corresponds to an edge in the tree.
        The columns of the dataframe are `parent`, `left_child`,
        `right_child`, `distance` and `size`.

        The `parent`, `left_child` and `right_child` are the ids of the
        parent and child nodes in the tree. Node ids less than the number
        of points in the original dataset represent individual points, while
        ids greater than the number of points are clusters.

        The `distance` value is the at which the child nodes merge to form
        the parent node.

        The `size` is the number of points in the `parent` node.
        """
        try:
            from pandas import DataFrame, Series
        except ImportError:
            raise ImportError('You must have pandas installed to export pandas DataFrames')

        max_node = 2 * self._linkage.shape[0]
        num_points = max_node - (self._linkage.shape[0] - 1)

        parent_array = np.arange(num_points, max_node + 1)

        result = DataFrame({
            'parent': parent_array,
            'left_child': self._linkage.T[0],
            'right_child': self._linkage.T[1],
            'distance': self._linkage.T[2],
            'size': self._linkage.T[3]
        })[['parent', 'left_child', 'right_child', 'distance', 'size']]

        return result



[docs]
    def to_networkx(self):
        """Return a NetworkX DiGraph object representing the single linkage tree.

        Edge weights in the graph are the distance values at which child nodes
        merge to form the parent cluster.

        Nodes have a `size` attribute attached giving the number of points
        that are in the cluster.
        """
        try:
            from networkx import DiGraph, set_node_attributes
        except ImportError:
            raise ImportError('You must have networkx installed to export networkx graphs')

        max_node = 2 * self._linkage.shape[0]
        num_points = max_node - (self._linkage.shape[0] - 1)

        result = DiGraph()
        for parent, row in enumerate(self._linkage, num_points):
            result.add_edge(parent, row[0], weight=row[2])
            result.add_edge(parent, row[1], weight=row[2])

        size_dict = {parent: row[3] for parent, row in enumerate(self._linkage, num_points)}
        set_node_attributes(result, size_dict, 'size')

        return result



[docs]
    def get_clusters(self, cut_distance, min_cluster_size=5):
        """Return a flat clustering from the single linkage hierarchy.

        This represents the result of selecting a cut value for robust single linkage
        clustering. The `min_cluster_size` allows the flat clustering to declare noise
        points (and cluster smaller than `min_cluster_size`).

        Parameters
        ----------

        cut_distance : float
            The mutual reachability distance cut value to use to generate a flat clustering.

        min_cluster_size : int, optional
            Clusters smaller than this value with be called 'noise' and remain unclustered
            in the resulting flat clustering.

        Returns
        -------

        labels : array [n_samples]
            An array of cluster labels, one per datapoint. Unclustered points are assigned
            the label -1.
        """
        return labelling_at_cut(self._linkage, cut_distance, min_cluster_size)





[docs]
class MinimumSpanningTree(object):
    def __init__(self, mst, data):
        self._mst = mst
        self._data = data


[docs]
    def plot(self, axis=None, node_size=40, node_color='k',
             node_alpha=0.8, edge_alpha=0.5, edge_cmap='viridis_r',
             edge_linewidth=2, vary_line_width=True, colorbar=True):
        """Plot the minimum spanning tree (as projected into 2D by t-SNE if required).

        Parameters
        ----------

        axis : matplotlib axis, optional
               The axis to render the plot to

        node_size : int, optional
                The size of nodes in the plot (default 40).

        node_color : matplotlib color spec, optional
                The color to render nodes (default black).

        node_alpha : float, optional
                The alpha value (between 0 and 1) to render nodes with
                (default 0.8).

        edge_cmap : matplotlib colormap, optional
                The colormap to color edges by (varying color by edge
                    weight/distance). Can be a cmap object or a string
                    recognised by matplotlib. (default `viridis_r`)

        edge_alpha : float, optional
                The alpha value (between 0 and 1) to render edges with
                (default 0.5).

        edge_linewidth : float, optional
                The linewidth to use for rendering edges (default 2).

        vary_line_width : bool, optional
                Edge width is proportional to (log of) the inverse of the
                mutual reachability distance. (default True)

        colorbar : bool, optional
                Whether to draw a colorbar. (default True)

        Returns
        -------

        axis : matplotlib axis
                The axis used the render the plot.
        """
        try:
            import matplotlib.pyplot as plt
            from matplotlib.collections import LineCollection
        except ImportError:
            raise ImportError('You must install the matplotlib library to plot the minimum spanning tree.')

        if self._data.shape[0] > 32767:
            warn('Too many data points for safe rendering of an minimal spanning tree!')
            return None

        if axis is None:
            axis = plt.gca()

        if self._data.shape[1] > 2:
            # Get a 2D projection; if we have a lot of dimensions use PCA first
            if self._data.shape[1] > 32:
                # Use PCA to get down to 32 dimension
                data_for_projection = PCA(n_components=32).fit_transform(self._data)
            else:
                data_for_projection = self._data.copy()

            projection = TSNE().fit_transform(data_for_projection)
        else:
            projection = self._data.copy()

        if vary_line_width:
            line_width = edge_linewidth * (np.log(self._mst.T[2].max() / self._mst.T[2]) + 1.0)
        else:
            line_width = edge_linewidth

        line_coords = projection[self._mst[:, :2].astype(int)]
        line_collection = LineCollection(line_coords, linewidth=line_width,
                                         cmap=edge_cmap, alpha=edge_alpha)
        line_collection.set_array(self._mst[:, 2].T)

        axis.add_artist(line_collection)
        axis.scatter(projection.T[0], projection.T[1], c=node_color, alpha=node_alpha, s=node_size)
        axis.set_xticks([])
        axis.set_yticks([])

        if colorbar:
            cb = plt.colorbar(line_collection, ax=axis)
            cb.ax.set_ylabel('Mutual reachability distance')

        return axis



[docs]
    def to_numpy(self):
        """Return a numpy array of weighted edges in the minimum spanning tree
        """
        return self._mst.copy()



[docs]
    def to_pandas(self):
        """Return a Pandas dataframe of the minimum spanning tree.

        Each row is an edge in the tree; the columns are `from`,
        `to`, and `distance` giving the two vertices of the edge
        which are indices into the dataset, and the distance
        between those datapoints.
        """
        try:
            from pandas import DataFrame
        except ImportError:
            raise ImportError('You must have pandas installed to export pandas DataFrames')

        result = DataFrame({'from': self._mst.T[0].astype(int),
                            'to': self._mst.T[1].astype(int),
                            'distance': self._mst.T[2]})
        return result



[docs]
    def to_networkx(self):
        """Return a NetworkX Graph object representing the minimum spanning tree.

        Edge weights in the graph are the distance between the nodes they connect.

        Nodes have a `data` attribute attached giving the data vector of the
        associated point.
        """
        try:
            from networkx import Graph, set_node_attributes
        except ImportError:
            raise ImportError('You must have networkx installed to export networkx graphs')

        result = Graph()
        for row in self._mst:
            result.add_edge(row[0], row[1], weight=row[2])

        data_dict = {index: tuple(row) for index, row in enumerate(self._data)}
        set_node_attributes(result, data_dict, 'data')

        return result