# -*- coding: utf-8 -*-
# Author: Leland McInnes <leland.mcinnes@gmail.com>
#
# License: BSD 3 clause
import numpy as np
from scipy.cluster.hierarchy import dendrogram
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import pairwise_distances
from warnings import warn
from ._hdbscan_tree import compute_stability, labelling_at_cut, recurse_leaf_dfs
CB_LEFT = 0
CB_RIGHT = 1
CB_BOTTOM = 2
CB_TOP = 3
def _bfs_from_cluster_tree(tree, bfs_root):
"""
Perform a breadth first search on a tree in condensed tree format
"""
result = []
to_process = [bfs_root]
while to_process:
result.extend(to_process)
to_process = tree['child'][np.in1d(tree['parent'], to_process)].tolist()
return result
def _recurse_leaf_dfs(cluster_tree, current_node):
children = cluster_tree[cluster_tree['parent'] == current_node]['child']
if len(children) == 0:
return [current_node,]
else:
return sum([recurse_leaf_dfs(cluster_tree, child) for child in children], [])
def _get_leaves(condensed_tree):
cluster_tree = condensed_tree[condensed_tree['child_size'] > 1]
if cluster_tree.shape[0] == 0:
# Return the only cluster, the root
return [condensed_tree['parent'].min()]
root = cluster_tree['parent'].min()
return _recurse_leaf_dfs(cluster_tree, root)
[docs]
class CondensedTree(object):
"""The condensed tree structure, which provides a simplified or smoothed version
of the :class:`~hdbscan.plots.SingleLinkageTree`.
Parameters
----------
condensed_tree_array : numpy recarray from :class:`~hdbscan.HDBSCAN`
The raw numpy rec array version of the condensed tree as produced
internally by hdbscan.
cluster_selection_method : string, optional (default 'eom')
The method of selecting clusters. One of 'eom' or 'leaf'
allow_single_cluster : Boolean, optional (default False)
Whether to allow the root cluster as the only selected cluster
"""
def __init__(self, condensed_tree_array, cluster_selection_method='eom',
allow_single_cluster=False):
self._raw_tree = condensed_tree_array
self.cluster_selection_method = cluster_selection_method
self.allow_single_cluster = allow_single_cluster
[docs]
def get_plot_data(self,
leaf_separation=1,
log_size=False,
max_rectangle_per_icicle=20):
"""Generates data for use in plotting the 'icicle plot' or dendrogram
plot of the condensed tree generated by HDBSCAN.
Parameters
----------
leaf_separation : float, optional
How far apart to space the final leaves of the
dendrogram. (default 1)
log_size : boolean, optional
Use log scale for the 'size' of clusters (i.e. number of
points in the cluster at a given lambda value).
(default False)
max_rectangles_per_icicle : int, optional
To simplify the plot this method will only emit
``max_rectangles_per_icicle`` bars per branch of the dendrogram.
This ensures that we don't suffer from massive overplotting in
cases with a lot of data points.
Returns
-------
plot_data : dict
Data associated to bars in a bar plot:
`bar_centers` x coordinate centers for bars
`bar_tops` heights of bars in lambda scale
`bar_bottoms` y coordinate of bottoms of bars
`bar_widths` widths of the bars (in x coord scale)
`bar_bounds` a 4-tuple of [left, right, bottom, top]
giving the bounds on a full set of
cluster bars
Data associates with cluster splits:
`line_xs` x coordinates for horizontal dendrogram lines
`line_ys` y coordinates for horizontal dendrogram lines
"""
leaves = _get_leaves(self._raw_tree)
last_leaf = self._raw_tree['parent'].max()
root = self._raw_tree['parent'].min()
# We want to get the x and y coordinates for the start of each cluster
# Initialize the leaves, since we know where they go, the iterate
# through everything from the leaves back, setting coords as we go
if isinstance(leaves, np.int64):
cluster_x_coords = {leaves: leaf_separation}
else:
cluster_x_coords = dict(zip(leaves, [leaf_separation * x
for x in range(len(leaves))]))
cluster_y_coords = {root: 0.0}
for cluster in range(last_leaf, root - 1, -1):
split = self._raw_tree[['child', 'lambda_val']]
split = split[(self._raw_tree['parent'] == cluster) &
(self._raw_tree['child_size'] > 1)]
if len(split['child']) > 1:
left_child, right_child = split['child']
cluster_x_coords[cluster] = np.mean([cluster_x_coords[left_child],
cluster_x_coords[right_child]])
cluster_y_coords[left_child] = split['lambda_val'][0]
cluster_y_coords[right_child] = split['lambda_val'][1]
# We use bars to plot the 'icicles', so we need to generate centers, tops,
# bottoms and widths for each rectangle. We can go through each cluster
# and do this for each in turn.
bar_centers = []
bar_tops = []
bar_bottoms = []
bar_widths = []
cluster_bounds = {}
scaling = np.sum(self._raw_tree[self._raw_tree['parent'] == root]['child_size'])
if log_size:
scaling = np.log(scaling)
for c in range(last_leaf, root - 1, -1):
cluster_bounds[c] = [0, 0, 0, 0]
c_children = self._raw_tree[self._raw_tree['parent'] == c]
current_size = np.sum(c_children['child_size'])
current_lambda = cluster_y_coords[c]
cluster_max_size = current_size
cluster_max_lambda = c_children['lambda_val'].max()
cluster_min_size = np.sum(
c_children[c_children['lambda_val'] ==
cluster_max_lambda]['child_size'])
if log_size:
current_size = np.log(current_size)
cluster_max_size = np.log(cluster_max_size)
cluster_min_size = np.log(cluster_min_size)
total_size_change = float(cluster_max_size - cluster_min_size)
step_size_change = total_size_change / max_rectangle_per_icicle
cluster_bounds[c][CB_LEFT] = cluster_x_coords[c] * scaling - (current_size / 2.0)
cluster_bounds[c][CB_RIGHT] = cluster_x_coords[c] * scaling + (current_size / 2.0)
cluster_bounds[c][CB_BOTTOM] = cluster_y_coords[c]
cluster_bounds[c][CB_TOP] = np.max(c_children['lambda_val'])
last_step_size = current_size
last_step_lambda = current_lambda
for i in np.argsort(c_children['lambda_val']):
row = c_children[i]
if row['lambda_val'] != current_lambda and \
(last_step_size - current_size > step_size_change
or row['lambda_val'] == cluster_max_lambda):
bar_centers.append(cluster_x_coords[c] * scaling)
bar_tops.append(row['lambda_val'] - last_step_lambda)
bar_bottoms.append(last_step_lambda)
bar_widths.append(last_step_size)
last_step_size = current_size
last_step_lambda = current_lambda
if log_size:
exp_size = np.exp(current_size) - row['child_size']
# Ensure we don't try to take log of zero
if exp_size > 0.01:
current_size = np.log(np.exp(current_size) - row['child_size'])
else:
current_size = 0.0
else:
current_size -= row['child_size']
current_lambda = row['lambda_val']
# Finally we need the horizontal lines that occur at cluster splits.
line_xs = []
line_ys = []
for row in self._raw_tree[self._raw_tree['child_size'] > 1]:
parent = row['parent']
child = row['child']
child_size = row['child_size']
if log_size:
child_size = np.log(child_size)
sign = np.sign(cluster_x_coords[child] - cluster_x_coords[parent])
line_xs.append([
cluster_x_coords[parent] * scaling,
cluster_x_coords[child] * scaling + sign * (child_size / 2.0)
])
line_ys.append([
cluster_y_coords[child],
cluster_y_coords[child]
])
return {
'bar_centers': bar_centers,
'bar_tops': bar_tops,
'bar_bottoms': bar_bottoms,
'bar_widths': bar_widths,
'line_xs': line_xs,
'line_ys': line_ys,
'cluster_bounds': cluster_bounds
}
def _select_clusters(self):
if self.cluster_selection_method == 'eom':
stability = compute_stability(self._raw_tree)
if self.allow_single_cluster:
node_list = sorted(stability.keys(), reverse=True)
else:
node_list = sorted(stability.keys(), reverse=True)[:-1]
cluster_tree = self._raw_tree[self._raw_tree['child_size'] > 1]
is_cluster = {cluster: True for cluster in node_list}
for node in node_list:
child_selection = (cluster_tree['parent'] == node)
subtree_stability = np.sum([stability[child] for
child in cluster_tree['child'][child_selection]])
if subtree_stability > stability[node]:
is_cluster[node] = False
stability[node] = subtree_stability
else:
for sub_node in _bfs_from_cluster_tree(cluster_tree, node):
if sub_node != node:
is_cluster[sub_node] = False
return sorted([cluster
for cluster in is_cluster
if is_cluster[cluster]])
elif self.cluster_selection_method == 'leaf':
return _get_leaves(self._raw_tree)
else:
raise ValueError('Invalid Cluster Selection Method: %s\n'
'Should be one of: "eom", "leaf"\n')
[docs]
def plot(self, leaf_separation=1, cmap='viridis', select_clusters=False,
label_clusters=False, selection_palette=None,
axis=None, colorbar=True, log_size=False,
max_rectangles_per_icicle=20):
"""Use matplotlib to plot an 'icicle plot' dendrogram of the condensed tree.
Effectively this is a dendrogram where the width of each cluster bar is
equal to the number of points (or log of the number of points) in the cluster
at the given lambda value. Thus bars narrow as points progressively drop
out of clusters. The make the effect more apparent the bars are also colored
according the the number of points (or log of the number of points).
Parameters
----------
leaf_separation : float, optional (default 1)
How far apart to space the final leaves of the
dendrogram.
cmap : string or matplotlib colormap, optional (default viridis)
The matplotlib colormap to use to color the cluster bars.
select_clusters : boolean, optional (default False)
Whether to draw ovals highlighting which cluster
bar represent the clusters that were selected by
HDBSCAN as the final clusters.
label_clusters : boolean, optional (default False)
If select_clusters is True then this determines
whether to draw text labels on the clusters.
selection_palette : list of colors, optional (default None)
If not None, and at least as long as
the number of clusters, draw ovals
in colors iterating through this palette.
This can aid in cluster identification
when plotting.
axis : matplotlib axis or None, optional (default None)
The matplotlib axis to render to. If None then a new axis
will be generated. The rendered axis will be returned.
colorbar : boolean, optional (default True)
Whether to draw a matplotlib colorbar displaying the range
of cluster sizes as per the colormap.
log_size : boolean, optional (default False)
Use log scale for the 'size' of clusters (i.e. number of
points in the cluster at a given lambda value).
max_rectangles_per_icicle : int, optional (default 20)
To simplify the plot this method will only emit
``max_rectangles_per_icicle`` bars per branch of the dendrogram.
This ensures that we don't suffer from massive overplotting in
cases with a lot of data points.
Returns
-------
axis : matplotlib axis
The axis on which the 'icicle plot' has been rendered.
"""
try:
import matplotlib.pyplot as plt
except ImportError:
raise ImportError(
'You must install the matplotlib library to plot the condensed tree.'
'Use get_plot_data to calculate the relevant data without plotting.')
plot_data = self.get_plot_data(leaf_separation=leaf_separation,
log_size=log_size,
max_rectangle_per_icicle=max_rectangles_per_icicle)
if cmap != 'none':
sm = plt.cm.ScalarMappable(cmap=cmap,
norm=plt.Normalize(0, max(plot_data['bar_widths'])))
sm.set_array(plot_data['bar_widths'])
bar_colors = [sm.to_rgba(x) for x in plot_data['bar_widths']]
else:
bar_colors = 'black'
if axis is None:
axis = plt.gca()
axis.bar(
plot_data['bar_centers'],
plot_data['bar_tops'],
bottom=plot_data['bar_bottoms'],
width=plot_data['bar_widths'],
color=bar_colors,
align='center',
linewidth=0
)
drawlines = []
for xs, ys in zip(plot_data['line_xs'], plot_data['line_ys']):
drawlines.append(xs)
drawlines.append(ys)
axis.plot(*drawlines, color='black', linewidth=1)
# for xs, ys in zip(plot_data['line_xs'], plot_data['line_ys']):
# axis.plot(xs, ys, color='black', linewidth=1)
if select_clusters:
try:
from matplotlib.patches import Ellipse
except ImportError:
raise ImportError('You must have matplotlib.patches available to plot selected clusters.')
chosen_clusters = self._select_clusters()
# Extract the chosen cluster bounds. If enough duplicate data points exist in the
# data the lambda value might be infinite. This breaks labeling and highlighting
# the chosen clusters.
cluster_bounds = np.array([ plot_data['cluster_bounds'][c] for c in chosen_clusters ])
if not np.isfinite(cluster_bounds).all():
warn('Infinite lambda values encountered in chosen clusters.'
' This might be due to duplicates in the data.')
# Extract the plot range of the y-axis and set default center and height values for ellipses.
# Extremly dense clusters might result in near infinite lambda values. Setting max_height
# based on the percentile should alleviate the impact on plotting.
plot_range = np.hstack([plot_data['bar_tops'], plot_data['bar_bottoms']])
plot_range = plot_range[np.isfinite(plot_range)]
mean_y_center = np.mean([np.max(plot_range), np.min(plot_range)])
max_height = np.diff(np.percentile(plot_range, q=[10,90]))
for i, c in enumerate(chosen_clusters):
c_bounds = plot_data['cluster_bounds'][c]
width = (c_bounds[CB_RIGHT] - c_bounds[CB_LEFT])
height = (c_bounds[CB_TOP] - c_bounds[CB_BOTTOM])
center = (
np.mean([c_bounds[CB_LEFT], c_bounds[CB_RIGHT]]),
np.mean([c_bounds[CB_TOP], c_bounds[CB_BOTTOM]]),
)
# Set center and height to default values if necessary
if not np.isfinite(center[1]):
center = (center[0], mean_y_center)
if not np.isfinite(height):
height = max_height
# Ensure the ellipse is visible
min_height = 0.1*max_height
if height < min_height:
height = min_height
if selection_palette is not None and \
len(selection_palette) >= len(chosen_clusters):
oval_color = selection_palette[i]
else:
oval_color = 'r'
box = Ellipse(
center,
2.0 * width,
1.2 * height,
facecolor='none',
edgecolor=oval_color,
linewidth=2
)
if label_clusters:
axis.annotate(str(i), xy=center,
xytext=(center[0] - 4.0 * width, center[1] + 0.65 * height),
horizontalalignment='left',
verticalalignment='bottom')
axis.add_artist(box)
if colorbar:
cb = plt.colorbar(sm, ax=axis)
if log_size:
cb.ax.set_ylabel('log(Number of points)')
else:
cb.ax.set_ylabel('Number of points')
axis.set_xticks([])
for side in ('right', 'top', 'bottom'):
axis.spines[side].set_visible(False)
axis.invert_yaxis()
axis.set_ylabel('$\lambda$ value')
return axis
[docs]
def to_numpy(self):
"""Return a numpy structured array representation of the condensed tree.
"""
return self._raw_tree.copy()
[docs]
def to_pandas(self):
"""Return a pandas dataframe representation of the condensed tree.
Each row of the dataframe corresponds to an edge in the tree.
The columns of the dataframe are `parent`, `child`, `lambda_val`
and `child_size`.
The `parent` and `child` are the ids of the
parent and child nodes in the tree. Node ids less than the number
of points in the original dataset represent individual points, while
ids greater than the number of points are clusters.
The `lambda_val` value is the value (1/distance) at which the `child`
node leaves the cluster.
The `child_size` is the number of points in the `child` node.
"""
try:
from pandas import DataFrame, Series
except ImportError:
raise ImportError('You must have pandas installed to export pandas DataFrames')
result = DataFrame(self._raw_tree)
return result
[docs]
def to_networkx(self):
"""Return a NetworkX DiGraph object representing the condensed tree.
Edge weights in the graph are the lamba values at which child nodes
'leave' the parent cluster.
Nodes have a `size` attribute attached giving the number of points
that are in the cluster (or 1 if it is a singleton point) at the
point of cluster creation (fewer points may be in the cluster at
larger lambda values).
"""
try:
from networkx import DiGraph, set_node_attributes
except ImportError:
raise ImportError('You must have networkx installed to export networkx graphs')
result = DiGraph()
for row in self._raw_tree:
result.add_edge(row['parent'], row['child'], weight=row['lambda_val'])
set_node_attributes(result, dict(self._raw_tree[['child', 'child_size']]), 'size')
return result
def _get_dendrogram_ordering(parent, linkage, root):
if parent < root:
return []
return _get_dendrogram_ordering(int(linkage[parent-root][0]), linkage, root) + \
_get_dendrogram_ordering(int(linkage[parent-root][1]), linkage, root) + [parent]
def _calculate_linewidths(ordering, linkage, root):
linewidths = []
for x in ordering:
if linkage[x - root][0] >= root:
left_width = linkage[int(linkage[x - root][0]) - root][3]
else:
left_width = 1
if linkage[x - root][1] >= root:
right_width = linkage[int(linkage[x - root][1]) - root][3]
else:
right_width = 1
linewidths.append((left_width, right_width))
return linewidths
[docs]
class SingleLinkageTree(object):
"""A single linkage format dendrogram tree, with plotting functionality
and networkX support.
Parameters
----------
linkage : ndarray (n_samples, 4)
The numpy array that holds the tree structure. As output by
scipy.cluster.hierarchy, hdbscan, of fastcluster.
"""
def __init__(self, linkage):
self._linkage = linkage
[docs]
def plot(self, axis=None, truncate_mode=None, p=0, vary_line_width=True,
cmap='viridis', colorbar=True):
"""Plot a dendrogram of the single linkage tree.
Parameters
----------
truncate_mode : str, optional
The dendrogram can be hard to read when the original
observation matrix from which the linkage is derived
is large. Truncation is used to condense the dendrogram.
There are several modes:
``None/'none'``
No truncation is performed (Default).
``'lastp'``
The last p non-singleton formed in the linkage are the only
non-leaf nodes in the linkage; they correspond to rows
Z[n-p-2:end] in Z. All other non-singleton clusters are
contracted into leaf nodes.
``'level'/'mtica'``
No more than p levels of the dendrogram tree are displayed.
This corresponds to Mathematica(TM) behavior.
p : int, optional
The ``p`` parameter for ``truncate_mode``.
vary_line_width : boolean, optional
Draw downward branches of the dendrogram with line thickness that
varies depending on the size of the cluster.
cmap : string or matplotlib colormap, optional
The matplotlib colormap to use to color the cluster bars.
A value of 'none' will result in black bars.
(default 'viridis')
colorbar : boolean, optional
Whether to draw a matplotlib colorbar displaying the range
of cluster sizes as per the colormap. (default True)
Returns
-------
axis : matplotlib axis
The axis on which the dendrogram plot has been rendered.
"""
dendrogram_data = dendrogram(self._linkage, p=p, truncate_mode=truncate_mode, no_plot=True)
X = dendrogram_data['icoord']
Y = dendrogram_data['dcoord']
try:
import matplotlib.pyplot as plt
except ImportError:
raise ImportError('You must install the matplotlib library to plot the single linkage tree.')
if axis is None:
axis = plt.gca()
if vary_line_width:
dendrogram_ordering = _get_dendrogram_ordering(2 * len(self._linkage), self._linkage, len(self._linkage) + 1)
linewidths = _calculate_linewidths(dendrogram_ordering, self._linkage, len(self._linkage) + 1)
else:
linewidths = [(1.0, 1.0)] * len(Y)
if cmap != 'none':
color_array = np.log2(np.array(linewidths).flatten())
sm = plt.cm.ScalarMappable(cmap=cmap,
norm=plt.Normalize(0, color_array.max()))
sm.set_array(color_array)
for x, y, lw in zip(X, Y, linewidths):
left_x = x[:2]
right_x = x[2:]
left_y = y[:2]
right_y = y[2:]
horizontal_x = x[1:3]
horizontal_y = y[1:3]
if cmap != 'none':
axis.plot(left_x, left_y, color=sm.to_rgba(np.log2(lw[0])),
linewidth=np.log2(1 + lw[0]),
solid_joinstyle='miter', solid_capstyle='butt')
axis.plot(right_x, right_y, color=sm.to_rgba(np.log2(lw[1])),
linewidth=np.log2(1 + lw[1]),
solid_joinstyle='miter', solid_capstyle='butt')
else:
axis.plot(left_x, left_y, color='k',
linewidth=np.log2(1 + lw[0]),
solid_joinstyle='miter', solid_capstyle='butt')
axis.plot(right_x, right_y, color='k',
linewidth=np.log2(1 + lw[1]),
solid_joinstyle='miter', solid_capstyle='butt')
axis.plot(horizontal_x, horizontal_y, color='k', linewidth=1.0,
solid_joinstyle='miter', solid_capstyle='butt')
if colorbar:
cb = plt.colorbar(sm, ax=axis)
cb.ax.set_ylabel('log(Number of points)')
axis.set_xticks([])
for side in ('right', 'top', 'bottom'):
axis.spines[side].set_visible(False)
axis.set_ylabel('distance')
return axis
[docs]
def to_numpy(self):
"""Return a numpy array representation of the single linkage tree.
This representation conforms to the scipy.cluster.hierarchy notion
of a single linkage tree, and can be used with all the associated
scipy tools. Please see the scipy documentation for more details
on the format.
"""
return self._linkage.copy()
[docs]
def to_pandas(self):
"""Return a pandas dataframe representation of the single linkage tree.
Each row of the dataframe corresponds to an edge in the tree.
The columns of the dataframe are `parent`, `left_child`,
`right_child`, `distance` and `size`.
The `parent`, `left_child` and `right_child` are the ids of the
parent and child nodes in the tree. Node ids less than the number
of points in the original dataset represent individual points, while
ids greater than the number of points are clusters.
The `distance` value is the at which the child nodes merge to form
the parent node.
The `size` is the number of points in the `parent` node.
"""
try:
from pandas import DataFrame, Series
except ImportError:
raise ImportError('You must have pandas installed to export pandas DataFrames')
max_node = 2 * self._linkage.shape[0]
num_points = max_node - (self._linkage.shape[0] - 1)
parent_array = np.arange(num_points, max_node + 1)
result = DataFrame({
'parent': parent_array,
'left_child': self._linkage.T[0],
'right_child': self._linkage.T[1],
'distance': self._linkage.T[2],
'size': self._linkage.T[3]
})[['parent', 'left_child', 'right_child', 'distance', 'size']]
return result
[docs]
def to_networkx(self):
"""Return a NetworkX DiGraph object representing the single linkage tree.
Edge weights in the graph are the distance values at which child nodes
merge to form the parent cluster.
Nodes have a `size` attribute attached giving the number of points
that are in the cluster.
"""
try:
from networkx import DiGraph, set_node_attributes
except ImportError:
raise ImportError('You must have networkx installed to export networkx graphs')
max_node = 2 * self._linkage.shape[0]
num_points = max_node - (self._linkage.shape[0] - 1)
result = DiGraph()
for parent, row in enumerate(self._linkage, num_points):
result.add_edge(parent, row[0], weight=row[2])
result.add_edge(parent, row[1], weight=row[2])
size_dict = {parent: row[3] for parent, row in enumerate(self._linkage, num_points)}
set_node_attributes(result, size_dict, 'size')
return result
[docs]
def get_clusters(self, cut_distance, min_cluster_size=5):
"""Return a flat clustering from the single linkage hierarchy.
This represents the result of selecting a cut value for robust single linkage
clustering. The `min_cluster_size` allows the flat clustering to declare noise
points (and cluster smaller than `min_cluster_size`).
Parameters
----------
cut_distance : float
The mutual reachability distance cut value to use to generate a flat clustering.
min_cluster_size : int, optional
Clusters smaller than this value with be called 'noise' and remain unclustered
in the resulting flat clustering.
Returns
-------
labels : array [n_samples]
An array of cluster labels, one per datapoint. Unclustered points are assigned
the label -1.
"""
return labelling_at_cut(self._linkage, cut_distance, min_cluster_size)
[docs]
class MinimumSpanningTree(object):
def __init__(self, mst, data):
self._mst = mst
self._data = data
[docs]
def plot(self, axis=None, node_size=40, node_color='k',
node_alpha=0.8, edge_alpha=0.5, edge_cmap='viridis_r',
edge_linewidth=2, vary_line_width=True, colorbar=True):
"""Plot the minimum spanning tree (as projected into 2D by t-SNE if required).
Parameters
----------
axis : matplotlib axis, optional
The axis to render the plot to
node_size : int, optional
The size of nodes in the plot (default 40).
node_color : matplotlib color spec, optional
The color to render nodes (default black).
node_alpha : float, optional
The alpha value (between 0 and 1) to render nodes with
(default 0.8).
edge_cmap : matplotlib colormap, optional
The colormap to color edges by (varying color by edge
weight/distance). Can be a cmap object or a string
recognised by matplotlib. (default `viridis_r`)
edge_alpha : float, optional
The alpha value (between 0 and 1) to render edges with
(default 0.5).
edge_linewidth : float, optional
The linewidth to use for rendering edges (default 2).
vary_line_width : bool, optional
Edge width is proportional to (log of) the inverse of the
mutual reachability distance. (default True)
colorbar : bool, optional
Whether to draw a colorbar. (default True)
Returns
-------
axis : matplotlib axis
The axis used the render the plot.
"""
try:
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
except ImportError:
raise ImportError('You must install the matplotlib library to plot the minimum spanning tree.')
if self._data.shape[0] > 32767:
warn('Too many data points for safe rendering of an minimal spanning tree!')
return None
if axis is None:
axis = plt.gca()
if self._data.shape[1] > 2:
# Get a 2D projection; if we have a lot of dimensions use PCA first
if self._data.shape[1] > 32:
# Use PCA to get down to 32 dimension
data_for_projection = PCA(n_components=32).fit_transform(self._data)
else:
data_for_projection = self._data.copy()
projection = TSNE().fit_transform(data_for_projection)
else:
projection = self._data.copy()
if vary_line_width:
line_width = edge_linewidth * (np.log(self._mst.T[2].max() / self._mst.T[2]) + 1.0)
else:
line_width = edge_linewidth
line_coords = projection[self._mst[:, :2].astype(int)]
line_collection = LineCollection(line_coords, linewidth=line_width,
cmap=edge_cmap, alpha=edge_alpha)
line_collection.set_array(self._mst[:, 2].T)
axis.add_artist(line_collection)
axis.scatter(projection.T[0], projection.T[1], c=node_color, alpha=node_alpha, s=node_size)
axis.set_xticks([])
axis.set_yticks([])
if colorbar:
cb = plt.colorbar(line_collection, ax=axis)
cb.ax.set_ylabel('Mutual reachability distance')
return axis
[docs]
def to_numpy(self):
"""Return a numpy array of weighted edges in the minimum spanning tree
"""
return self._mst.copy()
[docs]
def to_pandas(self):
"""Return a Pandas dataframe of the minimum spanning tree.
Each row is an edge in the tree; the columns are `from`,
`to`, and `distance` giving the two vertices of the edge
which are indices into the dataset, and the distance
between those datapoints.
"""
try:
from pandas import DataFrame
except ImportError:
raise ImportError('You must have pandas installed to export pandas DataFrames')
result = DataFrame({'from': self._mst.T[0].astype(int),
'to': self._mst.T[1].astype(int),
'distance': self._mst.T[2]})
return result
[docs]
def to_networkx(self):
"""Return a NetworkX Graph object representing the minimum spanning tree.
Edge weights in the graph are the distance between the nodes they connect.
Nodes have a `data` attribute attached giving the data vector of the
associated point.
"""
try:
from networkx import Graph, set_node_attributes
except ImportError:
raise ImportError('You must have networkx installed to export networkx graphs')
result = Graph()
for row in self._mst:
result.add_edge(row[0], row[1], weight=row[2])
data_dict = {index: tuple(row) for index, row in enumerate(self._data)}
set_node_attributes(result, data_dict, 'data')
return result