# Source code for ims.plsda

import ims
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import AutoMinorLocator
from matplotlib.colors import CenteredNorm
import seaborn as sns
from sklearn.cross_decomposition import PLSRegression
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import accuracy_score


class PLS_DA:
    """
    PLS-DA classifier built using the scikit-learn PLSRegression implementation.
    Provides prebuilt plots and feature selection via variable importance in
    projection (VIP) scores.

    See the scikit-learn documentation for more details:
    https://scikit-learn.org/stable/modules/generated/sklearn.cross_decomposition.PLSRegression.html

    Parameters
    ----------
    dataset : ims.Dataset
        Needed for the retention and drift time coordinates in the plots.

    n_components : int, optional
        Number of components to keep, by default 2.

    kwargs : optional
        Additional key word arguments are passed on to the scikit-learn
        PLSRegression. ``scale`` defaults to False unless given explicitly.

    Attributes
    ----------
    groups : numpy.ndarray
        Sorted unique class labels seen during fit.

    x_scores : numpy.ndarray of shape (n_samples, n_components)
        X scores of the training data.

    y_scores : numpy.ndarray of shape (n_samples, n_components)
        y scores of the training data.

    x_scores_pred : numpy.ndarray of shape (n_samples, n_components)
        X scores of the data passed to the last call of predict.
        Only available after predict (or score) has been called.

    x_weights : numpy.ndarray of shape (n_features, n_components)
        The left singular vectors of the cross-covariance matrices of each iteration.

    y_weights : numpy.ndarray of shape (n_targets, n_components)
        The right singular vectors of the cross-covariance matrices of each iteration.

    x_loadings : numpy.ndarray of shape (n_features, n_components)
        The loadings of X. When scaling was applied on the dataset,
        corrects the loadings using the weights.

    y_loadings : numpy.ndarray of shape (n_targets, n_components)
        The loadings of y.

    coefficients : numpy.ndarray of shape (n_features, n_targets)
        The coefficients of the linear model. Always stored in this
        orientation, independent of the installed scikit-learn version.

    vip_scores : numpy.ndarray of shape (n_features,)
        Variable importance in projection (VIP) scores.

    Example
    -------
    >>> import ims
    >>> ds = ims.Dataset.read_mea("IMS_data")
    >>> X_train, X_test, y_train, y_test = ds.train_test_split()
    >>> model = ims.PLS_DA(ds, n_components=5)
    >>> model.fit(X_train, y_train)
    >>> model.predict(X_test)
    >>> model.plot()
    """

    def __init__(self, dataset, n_components=2, **kwargs):
        self.dataset = dataset
        self.n_components = n_components
        # scale=False is only a default: an explicit ``scale`` in kwargs takes
        # precedence instead of raising a duplicate keyword TypeError.
        pls_kwargs = {"scale": False, **kwargs}
        self.pls = PLSRegression(n_components=n_components, **pls_kwargs)
        self._binarizer = LabelBinarizer()
        self._fitted = False
        self._validated = False

    def _check_fitted(self):
        """Raises a ValueError if the model has not been fitted yet."""
        if not self._fitted:
            raise ValueError(
                "This model is not fitted yet! Call 'fit' with appropriate arguments before plotting."
            )

    def _component_index(self, component):
        """
        Converts a 1-based component number into a 0-based column index.

        Raises a ValueError for numbers outside 1..n_components, because
        e.g. 0 would otherwise silently wrap around to the last component.
        """
        n_available = self.x_scores.shape[1]
        if not 1 <= component <= n_available:
            raise ValueError(
                f"Component must be between 1 and {n_available}, got {component}."
            )
        return component - 1

    def _resolve_group(self, group):
        """
        Resolves a group given by name or index.

        Returns a tuple of the non-negative index into self.groups
        and the group name.
        """
        groups = np.asarray(self.groups)

        if isinstance(group, str):
            # self.groups is a numpy array (from np.unique) and has no
            # list-style ``index`` method, hence the element wise comparison.
            matches = np.flatnonzero(groups == group)
            if matches.size == 0:
                raise ValueError(
                    f"Unknown group {group!r}. Available groups: {groups.tolist()}"
                )
            index = int(matches[0])
        elif isinstance(group, (int, np.integer)) and not isinstance(group, bool):
            index = int(group)
            if not -len(groups) <= index < len(groups):
                raise IndexError(
                    f"Group index {index} is out of range for {len(groups)} groups."
                )
            # Normalize negative indices so callers can compare positions.
            index %= len(groups)
        else:
            raise TypeError("group must be the index (int) or the name (str) of a group.")

        return index, groups[index]

    @staticmethod
    def _orient_coefficients(coef, n_features):
        """
        Returns the coefficient matrix with shape (n_features, n_targets).

        scikit-learn changed the orientation of PLSRegression.coef_ to
        (n_targets, n_features) in version 1.3, so the matrix is transposed
        when the first axis does not match the number of features.
        """
        coef = np.asarray(coef)
        if coef.ndim == 2 and coef.shape[0] != n_features and coef.shape[1] == n_features:
            return coef.T
        return coef

    def _train_sample_names(self):
        """
        Returns the sample names belonging to the training data.

        Uses dataset.train_index if the dataset has one, otherwise all
        samples of the dataset. Falls back to the class labels when the
        number of names does not match the training data.
        """
        if hasattr(self.dataset, "train_index"):
            names = self.dataset[self.dataset.train_index].samples
        else:
            names = self.dataset.samples

        if len(names) != len(self.y_train):
            return self.y_train
        return names

    def _plot_image(self, matrix, width, height, title, colorbar_label, **imshow_kwargs):
        """
        Draws a feature matrix as image with drift time on the x axis
        and retention time on the y axis. Shared by all image plots.

        Additional key word arguments are passed on to imshow.
        """
        spectrum = self.dataset[0]
        ret_time = spectrum.ret_time
        drift_time = spectrum.drift_time

        fig, ax = plt.subplots(figsize=(width, height))

        image = ax.imshow(
            matrix,
            cmap="RdBu_r",
            origin="lower",
            aspect="auto",
            extent=(
                np.min(drift_time),
                np.max(drift_time),
                np.min(ret_time),
                np.max(ret_time),
            ),
            **imshow_kwargs,
        )

        fig.colorbar(image, ax=ax, label=colorbar_label)
        ax.set_title(title)
        ax.set_xlabel(spectrum._drift_time_label)
        ax.set_ylabel("Retention time [s]")
        ax.xaxis.set_minor_locator(AutoMinorLocator())
        ax.yaxis.set_minor_locator(AutoMinorLocator())
        return ax

    def fit(self, X_train, y_train):
        """
        Fits the model with training data.
        Converts the labels into a binary matrix.

        Parameters
        ----------
        X_train : numpy.ndarray of shape (n_samples, n_features)
            Training vectors with features.

        y_train : numpy.ndarray of shape (n_samples,)
            True class labels for training data.

        Returns
        -------
        self
            Fitted model.
        """
        self.groups = np.unique(y_train)
        self.y_train = y_train
        y_binary = self._binarizer.fit_transform(y_train)
        self.pls.fit(X_train, y_binary)
        self.x_scores, self.y_scores = self.pls.transform(X_train, y_binary)
        self.x_weights = self.pls.x_weights_
        self.x_loadings = self.pls.x_loadings_
        self.y_weights = self.pls.y_weights_
        self.y_loadings = self.pls.y_loadings_
        self.coefficients = self._orient_coefficients(
            self.pls.coef_, self.x_weights.shape[0]
        )
        self.vip_scores = ims.utils.vip_scores(
            self.x_weights, self.x_scores, self.y_loadings
        )
        self._fitted = True
        # Scores from an earlier predict call belong to the previous fit
        # and must not show up in plots of the refitted model.
        self._validated = False
        return self

    def predict(self, X_test):
        """
        Predicts class labels for test data. Converts back from binary
        labels matrix to class names. Also stores the X scores of the
        test data for the plot method.

        Parameters
        ----------
        X_test : numpy.ndarray of shape (n_samples, n_features)
            Feature vectors of test dataset.

        Returns
        -------
        numpy.ndarray of shape (n_samples,)
            Predicted class labels.
        """
        y_pred = self._binarizer.inverse_transform(self.pls.predict(X_test))
        self.x_scores_pred = self.pls.transform(X_test)
        self._validated = True
        return y_pred

    def transform(self, X, y=None):
        """
        Apply the dimensionality reduction.

        Parameters
        ----------
        X : numpy.ndarray of shape (n_samples, n_features)
            Feature matrix.

        y : numpy.ndarray of shape (n_samples, n_targets), optional
            Dependent variables, by default None.
            Passed on unchanged to PLSRegression.transform.

        Returns
        -------
        numpy.ndarray or tuple
            Result of PLSRegression.transform: X_scores if y is None,
            otherwise (X_scores, y_scores).
        """
        return self.pls.transform(X, y)

    def score(self, X_test, y_test, sample_weight=None):
        """
        Calculates accuracy score for predicted data.
        Calls predict internally.

        Parameters
        ----------
        X_test : numpy.ndarray of shape (n_samples, n_features)
            Feature vectors of the test data.

        y_test : numpy.ndarray of shape (n_samples,)
            True classification labels.

        sample_weight : array-like of shape (n_samples,), optional
            Sample weights, by default None.

        Returns
        -------
        score : float
            Mean accuracy score.
        """
        y_pred = self.predict(X_test)
        return accuracy_score(y_test, y_pred, sample_weight=sample_weight)

    def plot(self, x_comp=1, y_comp=2, annotate=False):
        """
        Plots PLS components as scatter plot.

        If predict was called before, training and validation data are
        plotted together. The validation labels and sample names are then
        read from the dataset via dataset.test_index.

        Parameters
        ----------
        x_comp : int, optional
            Component x axis, by default 1.

        y_comp : int, optional
            Component y axis, by default 2.

        annotate : bool, optional
            If True adds sample names to markers,
            by default False.

        Returns
        -------
        matplotlib.pyplot.axes

        Raises
        ------
        ValueError
            If the model is not fitted or a component does not exist.
        """
        self._check_fitted()
        x_idx = self._component_index(x_comp)
        y_idx = self._component_index(y_comp)

        if self._validated:
            test_set = self.dataset[self.dataset.test_index]
            X = np.concatenate(
                (self.x_scores[:, x_idx], self.x_scores_pred[:, x_idx])
            )
            Y = np.concatenate(
                (self.x_scores[:, y_idx], self.x_scores_pred[:, y_idx])
            )
            hue = list(self.y_train) + list(test_set.labels)
            style = ["Training"] * len(self.y_train) + ["Validation"] * len(
                self.dataset.test_index
            )
            if hasattr(self.dataset, "train_index"):
                sample_names = list(
                    self.dataset[self.dataset.train_index].samples
                ) + list(test_set.samples)
            else:
                sample_names = self.dataset.samples

        else:
            X = self.x_scores[:, x_idx]
            Y = self.x_scores[:, y_idx]
            hue = self.y_train
            style = self.y_train
            # Only look up the sample names when they are actually drawn.
            sample_names = self._train_sample_names() if annotate else self.y_train

        ax = sns.scatterplot(x=X, y=Y, hue=hue, style=style)

        ax.legend()

        ax.set_xlabel(f"Latent variable {x_comp}")
        ax.set_ylabel(f"Latent variable {y_comp}")

        if annotate:
            for x, y, name in zip(X, Y, sample_names):
                ax.text(x, y, name)

        return ax

    def plot_loadings(self, component=1, color_range=0.02, width=6, height=6):
        """
        Plots PLS x loadings as image with retention and drift
        time coordinates.

        Parameters
        ----------
        component : int, optional
            Component to plot, by default 1.

        color_range : float, optional
            Minimum and Maximum to adjust to different scaling methods,
            by default 0.02.

        width : int or float, optional
            Width of the plot in inches,
            by default 6.

        height : int or float, optional
            Height of the plot in inches,
            by default 6.

        Returns
        -------
        matplotlib.pyplot.axes

        Raises
        ------
        ValueError
            If the model is not fitted or the component does not exist.
        """
        self._check_fitted()
        index = self._component_index(component)

        loadings = self.x_loadings[:, index].reshape(self.dataset[0].values.shape)

        return self._plot_image(
            loadings,
            width,
            height,
            title=f"PLS-DA loadings of component {component}",
            colorbar_label="PLS-DA loadings",
            vmin=(-color_range),
            vmax=color_range,
        )

    def plot_coefficients(self, group=0, width=6, height=6):
        """
        Plots PLS coefficients of selected group as image
        with retention and drift time axis.

        Parameters
        ----------
        group : int or str, optional
            Index or name of group, by default 0.

        width : int or float, optional
            Width of the plot in inches,
            by default 6.

        height : int or float, optional
            Height of the plot in inches,
            by default 6.

        Returns
        -------
        matplotlib.pyplot.axes

        Raises
        ------
        ValueError
            If the model is not fitted or the group name is unknown.

        IndexError
            If the group index is out of range.

        TypeError
            If group is neither an int nor a str.
        """
        self._check_fitted()
        group_index, group_name = self._resolve_group(group)

        coefficients = self.coefficients
        if coefficients.shape[1] == 1 and len(self.groups) == 2:
            # With two classes LabelBinarizer produces a single indicator
            # column for the second class. The indicator of the first class
            # is its complement, so its coefficients are the negated column.
            column = coefficients[:, 0]
            if group_index == 0:
                column = -column
        else:
            column = coefficients[:, group_index]

        coef = column.reshape(self.dataset[0].values.shape)

        return self._plot_image(
            coef,
            width,
            height,
            title=f"PLS-DA coefficients of {group_name}",
            colorbar_label="PLS-DA coefficients",
            norm=CenteredNorm(0),
        )

    def plot_vip_scores(self, threshold=None, width=6, height=6):
        """
        Plots VIP scores as image with retention and drift time axis.

        Parameters
        ----------
        threshold : int or float, optional
            Only plots VIP scores above threshold if set.
            Values below are displayed as 0,
            by default None.

        width : int or float, optional
            Width of the plot in inches,
            by default 6.

        height : int or float, optional
            Height of the plot in inches,
            by default 6.

        Returns
        -------
        matplotlib.pyplot.axes

        Raises
        ------
        ValueError
            If the model is not fitted.
        """
        self._check_fitted()

        if threshold is None:
            vips = self.vip_scores
        else:
            # Keep scores above the threshold, everything else becomes 0.
            vips = np.where(self.vip_scores > threshold, self.vip_scores, 0)

        vip_matrix = vips.reshape(self.dataset[0].values.shape)

        return self._plot_image(
            vip_matrix,
            width,
            height,
            title="PLS-DA VIP scores",
            colorbar_label="VIP scores",
        )