Source code for ims.pca

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import MaxNLocator, AutoMinorLocator
from sklearn.decomposition import PCA
from scipy.stats import f


[docs]class PCA_Model:
    """
    PCA_Model is a wrapper class around the scikit-learn PCA implementation
    and provides prebuilt plots for GC-IMS datasets.

    See the original scikit-learn documentation for a detailed description:
    https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html

    Parameters
    ----------
    dataset : ims.Dataset
        The dataset is needed for the retention and drift time
        coordinates.

    n_components : int or float, optional
        Number of components to keep. If None all components are kept,
        by default None.

    svd_solver : str, optional
        "auto", "full", "arpack" or "randomised" are valid,
        by default "auto".

    **kwargs: optional
        Additional key word arguments are passed to the scikit-learn PCA.
        See the original documentation for valid parameters.

    Attributes
    ----------
    scores : numpy.ndarray of shape (n_samples, n_features)
        X with dimension reduction applied.

    loadings : numpy.ndarray of shape (n_components, n_features)
        PCA loadings already corrected when a scaling method was
        applied on the dataset.

    explainded_variance : numpy.ndarray of shape (n_components,)
        The amount of variance explained by each component.

    explained_variance_ratio : numpy.nd_array of shape (n_components,)
        Percentage of variance explained by each component.

    singular_values : numpy.ndarray of shape (n_components,)
        The singular values corresponding to each component.

    mean : numpy.ndarray of shape (n_features,)
        Per feature mean estimated from training data.

    Example
    -------
    >>> import ims
    >>> ds = ims.Dataset.read_mea("IMS_data")
    >>> X, _ = ds.get_xy()
    >>> pca = ims.PCA_Model(ds, n_components=20)
    >>> pca.fit(X)
    >>> pca.plot()
    """

    def __init__(self, dataset, n_components=None, svd_solver="auto", **kwargs):
        self.dataset = dataset
        if n_components is None:
            self.n_components = len(self.dataset)
        else:
            self.n_components = n_components
        self.svd_solver = svd_solver
        self._sk_pca = PCA(n_components, svd_solver=svd_solver, **kwargs)

[docs]    def fit(self, X_train):
        """
        Fit the PCA model with training data.

        Parameters
        ----------
        X_train : numpy.ndarray of shape (n_samples, n_features)
            The training data.

        Returns
        -------
        self
            The fitted model.
        """
        self._sk_pca.fit(X_train)
        self.scores = self._sk_pca.transform(X_train)
        self.explained_variance = self._sk_pca.explained_variance_
        self.explained_variance_ratio = self._sk_pca.explained_variance_ratio_
        self.singular_values = self._sk_pca.singular_values_
        self.mean = self._sk_pca.mean_
        self.loadings = self._sk_pca.components_
        self.residuals = X_train - self.scores @ self.loadings
        self.Q = np.sum(self.residuals**2, axis=1)
        self.Tsq = np.sum((self.scores / np.std(self.scores, axis=0)) ** 2, axis=1)
        self.Tsq_conf = (
            f.ppf(q=0.95, dfn=self.n_components, dfd=self.scores.shape[0])
            * self.n_components
            * (self.scores.shape[0] - 1)
            / (self.scores.shape[0] - self.n_components)
        )
        self.Q_conf = np.quantile(self.Q, q=0.95)
        return self

[docs]    def plot(self, PC_x=1, PC_y=2, annotate=False):
        """
        Scatter plot of selected principal components.

        Parameters
        ----------
        PC_x : int, optional
            PC x axis, by default 1.

        PC_y : int, optional
            PC y axis, by default 2.

        annotate : bool, optional
            label data points with sample name,
            by default False.

        Returns
        -------
        matplotlib.pyplot.axes
        """
        expl_var = []
        for i in range(1, self.n_components + 1):
            expl_var.append(round(self.explained_variance_ratio[i - 1] * 100, 1))

        pc_df = pd.DataFrame(
            data=self.scores,
            columns=[f"PC {x}" for x in range(1, self.n_components + 1)],
        )

        if hasattr(self.dataset, "train_index"):
            pc_df["Sample"] = self.dataset[self.dataset.train_index].samples
            pc_df["Label"] = self.dataset[self.dataset.train_index].labels
        else:
            pc_df["Sample"] = self.dataset.samples
            pc_df["Label"] = self.dataset.labels

        ax = sns.scatterplot(
            x=f"PC {PC_x}",
            y=f"PC {PC_y}",
            data=pc_df,
            hue="Label",
            style="Label",
        )

        plt.legend()
        plt.xlabel(f"PC {PC_x} ({expl_var[PC_x-1]} % of variance)")
        plt.ylabel(f"PC {PC_y} ({expl_var[PC_y-1]} % of variance)")

        if annotate:
            for i, point in pc_df.iterrows():
                ax.text(point[f"PC {PC_x}"], point[f"PC {PC_y}"], point["Sample"])

        return ax

[docs]    def plot_loadings(self, PC=1, color_range=0.1, width=6, height=6):
        """
        Plots loadings of a principle component with the original retention
        and drift time coordinates.

        Parameters
        ----------
        PC : int, optional
            principal component, by default 1.

        color_range : int, optional
            color_scale ranges from - color_range to + color_range
            centered at 0.

        width : int or float, optional
            plot width in inches, by default 9.

        height : int or float, optional
            plot height in inches, by default 10.

        Returns
        -------
        matplotlib.pyplot.axes
        """
        # use retention and drift time axis from the first spectrum
        ret_time = self.dataset[0].ret_time
        drift_time = self.dataset[0].drift_time

        loading_pc = self.loadings[PC - 1, :].reshape(len(ret_time), len(drift_time))

        expl_var = []
        for i in range(1, self.n_components + 1):
            expl_var.append(round(self.explained_variance_ratio[i - 1] * 100, 1))

        _, ax = plt.subplots(figsize=(width, height))

        plt.imshow(
            loading_pc,
            origin="lower",
            aspect="auto",
            cmap="RdBu_r",
            vmin=(-color_range),
            vmax=color_range,
            extent=(min(drift_time), max(drift_time), min(ret_time), max(ret_time)),
        )

        plt.colorbar().set_label("PCA loadings")

        ax.xaxis.set_minor_locator(AutoMinorLocator())
        ax.yaxis.set_minor_locator(AutoMinorLocator())

        plt.xlabel(self.dataset[0]._drift_time_label)
        plt.ylabel("Retention time [s]")
        plt.title(f"PCA loadings of PC {PC} ({expl_var[PC-1]} % of variance)")
        return ax

[docs]    def scree_plot(self):
        """
        Plots the explained variance ratio per principal component
        and cumulatively.

        Returns
        -------
        matplotlib.pyplot.axes
        """
        x = [*range(1, self.n_components + 1)]
        y = self.explained_variance_ratio

        _, ax = plt.subplots()

        for axis in [ax.xaxis, ax.yaxis]:
            axis.set_major_locator(MaxNLocator(integer=True))

        plt.xticks(x)
        plt.xlabel("Principal component")
        plt.ylabel("Explainded variance ratio [%]")

        ax.plot(x, np.cumsum(y) * 100, label="cumulative")
        ax.plot(x, y * 100, label="per PC")

        plt.legend()
        return ax

[docs]    def Tsq_Q_plot(self, annotate=False):
        """
        Plots T square values and Q residuals with 95 % confidence limits
        for outlier detection.
        Q confidence limit is determined empirically,
        the T square limit is calculated using the f distribution.

        Parameters
        ----------
        annotate : bool, optional
            Annotates markers with sample names when True,
            by default False.

        Returns
        -------
        matplotlib.pyplot.axes
        """
        ax = sns.scatterplot(
            x=self.Q, y=self.Tsq, style=self.dataset.labels, hue=self.dataset.labels
        )

        ax.axhline(self.Tsq_conf, c="tab:red", linestyle=":")
        ax.axvline(self.Q_conf, c="tab:red", linestyle=":")
        ax.set_xlabel("Q")
        ax.set_ylabel("$T^2$", rotation=0, labelpad=15)

        if annotate:
            for x, y, sample in zip(self.Q, self.Tsq, self.dataset.samples):
                plt.text(x, y, sample)

        return ax