Source code for ims.hca
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram
[docs]class HCA:
"""
Hierarchical cluster analysis with scikit-learn AgglomerativeClustering
and scipy dendrogram.
Parameters
----------
dataset : ims.Dataset, optional
Dataset with GC-IMS data is needed for sample and label names
in dendrogram. If not set uses leaves as labels instead,
by default None.
affinity : str, optional
Metric used to compute the linkage.
Can be "euclidean", "l1", "l2", "manhattan" or "cosine".
If linkage is set to "ward" only "euclidean" is accepted,
by default "euclidean".
linkage : str, optional
Linkage criterion which determines which distance to use.
"ward", "complete", "average" or "single" are accepted,
by default "ward".
Attributes
----------
clustering : sklearn.cluster.AgglomerativeClustering
Scikit-learn algorithm used for the clustering.
See the original documentation for details about attributes.
linkage_matrix : numpy.ndarray
Clustering results encoded as linkage matrix.
R : dict
scipy dendrogram output as dictionary.
Example
-------
>>> import ims
>>> ds = ims.Dataset.read_mea("IMS_data")
>>> X, _ = ds.get_xy()
>>> hca = ims.HCA(ds, linkage="ward", affinity="euclidean")
>>> hca.fit(X)
>>> hca.plot_dendrogram()
"""
def __init__(
self,
dataset=None,
affinity="euclidean",
linkage="ward",
):
self.dataset = dataset
self.linkage = linkage
self.affinity = affinity
self.clustering = AgglomerativeClustering(
distance_threshold=0, n_clusters=None, affinity=affinity, linkage=linkage
)
[docs] def fit(self, X):
"""
Fit the model from features.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Training features to cluster.
"""
self.clustering.fit(X)
self.linkage_matrix = self._get_linkage_matrix()
self.R = dendrogram(self.linkage_matrix, no_plot=True)
self.labels = self.clustering.labels_
[docs] def plot_dendrogram(self, width=6, height=6, orientation="right", **kwargs):
"""
Plots clustering results as dendrogram.
Parameters
----------
width : int, optional
Width of the figure in inches, by default 8
height : int, optional
Width of the figure in inches, by default 8
orientation : str, optional
Root position of the clustering tree, by default "right"
**kwargs
See scipy.cluster.hierarchy.dendrogram documentation
for information about valid keyword arguments.
Returns
-------
matplotlib.pyplot.axes
"""
_, ax = plt.subplots(figsize=(width, height))
if self.dataset is not None:
labels = self.dataset.labels
else:
labels = self.R["leaves"]
dendrogram(
self.linkage_matrix, ax=ax, orientation=orientation, labels=labels, **kwargs
)
plt.xlabel(f"Distances ({self.affinity} method)")
return ax
def _get_linkage_matrix(self):
"""Builds linkage matrix from AgglomerativeClustering output."""
counts = np.zeros(self.clustering.children_.shape[0])
n_samples = len(self.clustering.labels_)
for i, merge in enumerate(self.clustering.children_):
current_count = 0
for child_idx in merge:
if child_idx < n_samples:
current_count += 1 # leaf node
else:
current_count += counts[child_idx - n_samples]
counts[i] = current_count
return np.column_stack(
[self.clustering.children_, self.clustering.distances_, counts]
).astype(float)