Source code for ims.plsr

import ims
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import AutoMinorLocator
from matplotlib.colors import CenteredNorm
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_squared_error, r2_score
from ims.utils import vip_scores


[docs]class PLSR:
    """
    Applies a scikit-learn PLSRegression to GC-IMS data and provides prebuilt
    plots as well as a feature selection via variable importance in projection (VIP)
    scores.

    See the scikit-learn documentation for more details:
    https://scikit-learn.org/stable/modules/generated/sklearn.cross_decomposition.PLSRegression.html

    Parameters
    ----------
    dataset : ims.Dataset
        Needed for the retention and drift time coordinates in the plots.

    n_components : int, optional
        Number of components to keep, by default 2.

    kwargs : optional
        Additional key word arguments are passed on to the scikit-learn PLSRegression.

    Attributes
    ----------
    x_scores : numpy.ndarray of shape (n_samples, n_components)
        X scores.

    y_scores : numpy.ndarray of shape (n_samples, n_components)
        y scores.

    x_weights : numpy.ndarray of shape (n_features, n_components)
        The left singular vectors of the cross-covariance matrices of each iteration.

    y_weights : numpy.ndarray of shape (n_targets, n_components)
        The right singular vectors of the cross-covariance matrices of each iteration.

    x_loadings : numpy.ndarray of shape (n_features, n_components)
        The loadings of X. When scaling was applied on the dataset,
        corrects the loadings using the weights.

    y_loadings : numpy.ndarray of shape (n_targes, n_components)
        The loadings of y.

    coefficients : numpy.ndarray of shape (n_features, n_targets)
        The coefficients of the linear model.

    y_pred_train : numpy.ndarray
        Stores the predicted values from the training data for the plot method.

    Example
    -------
    >>> import ims
    >>> import pandas as pd
    >>> ds = ims.Dataset.read_mea("IMS_data")
    >>> responses = pd.read_csv("responses.csv")
    >>> ds.labels = responses
    >>> X_train, X_test, y_train, y_test = ds.train_test_split()
    >>> model = ims.PLSR(ds, n_components=5)
    >>> model.fit(X_train, y_train)
    >>> model.predict(X_test, y_test)
    >>> model.plot()
    """

    def __init__(self, dataset, n_components=2, **kwargs):
        self.dataset = dataset
        self.n_components = n_components
        self.pls = PLSRegression(n_components=n_components, scale=False, **kwargs)
        self._fitted = False
        self._validated = False

[docs]    def fit(self, X_train, y_train):
        """
        Fits the model with training data.

        Parameters
        ----------
        X_train : numpy.ndarray of targets (n_samples, n_features)
            Training vectors with features.

        y_train : numpy.ndarray of shape (n_samples, n_targets)
            Target vectors with response variables.

        Returns
        -------
        self
            Fitted model.
        """
        self.pls.fit(X_train, y_train)
        self.x_scores, self.y_scores = self.pls.transform(X_train, y_train)
        self.x_weights = self.pls.x_weights_
        self.x_loadings = self.pls.x_loadings_
        self.y_weights = self.pls.y_weights_
        self.y_loadings = self.pls.y_loadings_
        self.coefficients = self.pls.coef_
        self.y_pred_train = self.pls.predict(X_train).flatten()
        self.y_train = y_train
        self.vip_scores = ims.utils.vip_scores(
            self.x_weights, self.x_scores, self.y_loadings
        )
        self.selectivity_ratio = ims.utils.selectivity_ratio(X_train, self.coefficients)
        self._fitted = True
        return self

[docs]    def predict(self, X_test, y_test=None):
        """
        Predicts responses for features of the test data.

        Parameters
        ----------
        X_test : numpy.ndarray of shape (n_samples, n_features)
            Features of test data.

        y_train : numpy.ndarray of shape (n_samples, n_targets), optional
            True labels for test data.
            If set allows automatic plotting of validation data,
            by default None.

        Returns
        -------
        numpy.ndarray of shape (n_samples, n_targets)
            Predicted responses for test data.
        """
        y_pred = self.pls.predict(X_test)
        self.y_pred = y_pred
        self.y_test = y_test
        if y_test is not None:
            self.rmse = round(mean_squared_error(y_test, self.y_pred, squared=False), 2)
        self._validated = True
        return y_pred

[docs]    def score(self, X_test, y_test, sample_weight=None):
        """
        Calculates R^2 score score for predicted data.

        Parameters
        ----------
        X_test : numpy.ndarray of shape (n_samples, n_features)
            Feature vectors of the test data.

        y_test : numpy.ndarray of shape (n_samples, n_targets)
            True regression responses.

        sample_weight : array-like of shape (n_samples,), optional
            Sample weights, by default None.

        Returns
        -------
        score : float
            R^2 score.
        """
        y_pred = self.predict(X_test)
        return r2_score(y_test, y_pred, sample_weight=sample_weight)

[docs]    def transform(self, X, y=None):
        """
        Apply the dimensionality reduction.

        Parameters
        ----------
        X : numpy.ndarray of shape (n_samples, n_features)
            Feature matrix.
        y : numpy.ndarray of shape (n_samples, n_targtets), optional
            Dependend variables, by default None

        Returns
        -------
        X_scores
        """
        return self.pls.transform(X, y)

[docs]    def plot(self, annotate=False):
        """
        Plots predicted vs actual values and shows regression line.
        Recommended to predict with test data first.

        annotate : bool, optional
            If True annotates plot with sample names,
            by default False.

        Returns
        -------
        matplotlib.pyplot.axes
        """
        if not self._fitted:
            raise ValueError(
                "This model is not fitted yet! Call 'fit' with appropriate arguments before plotting."
            )

        if self._validated and hasattr(self, "y_test"):
            ax = sns.scatterplot(x=self.y_train, y=self.y_pred_train, label="Training")
            sns.scatterplot(
                ax=ax,
                x=self.y_test,
                y=self.y_pred.flatten(),
                label=f"Validation: RMSE {self.rmse}",
            )
        else:
            ax = sns.scatterplot(x=self.y_train, y=self.y_pred_train, label="Training")

        z = np.polyfit(self.y_train, self.y_pred_train, 1)
        sns.lineplot(
            ax=ax,
            x=np.polyval(z, self.y_train),
            y=self.y_train,
            c="tab:blue",
            label="Trend line",
        )

        z = np.polyfit(self.y_train, self.y_train, 1)
        sns.lineplot(
            ax=ax,
            x=np.polyval(z, self.y_train),
            y=self.y_train,
            c="tab:red",
            linestyle=":",
            label="Perfect fit",
        )

        if annotate:
            if hasattr(self.dataset, "train_index"):
                sample_names = self.dataset[self.dataset.train_index].samples
                X = self.y_train
                Y = self.y_pred_train
            if self._validated and hasattr(self, "y_test"):
                sample_names = (
                    sample_names + self.dataset[self.dataset.test_index].samples
                )
                X = np.concatenate((X, self.y_test))
                Y = np.concatenate((Y, self.y_pred.flatten()))
            for x, y, sample in zip(X, Y, sample_names):
                plt.text(x, y, sample)

        plt.xlabel("Observed response")
        plt.ylabel("Predicted response")
        plt.legend()
        return ax

[docs]    def plot_loadings(self, component=1, color_range=0.01, width=6, height=6):
        """
        Plots PLS x loadings as image with retention and drift
        time coordinates.

        Parameters
        ----------
        component : int, optional
            Component to plot, by default 1.

        color_range : float, optional
            Minimum and maximum to adjust to different scaling methods,
            by default 0.02.

        width : int or float, optional
            Width of the plot in inches,
            by default 8.

        height : int or float, optional
            Height of the plot in inches,
            by default 8.

        Returns
        -------
        matplotlib.pyplot.axes
        """
        if not self._fitted:
            raise ValueError(
                "This model is not fitted yet! Call 'fit' with appropriate arguments before plotting."
            )

        loadings = self.x_loadings[:, component - 1].reshape(self.dataset[0].shape)

        ret_time = self.dataset[0].ret_time
        drift_time = self.dataset[0].drift_time

        _, ax = plt.subplots(figsize=(width, height))

        plt.imshow(
            loadings,
            cmap="RdBu_r",
            vmin=(-color_range),
            vmax=color_range,
            origin="lower",
            aspect="auto",
            extent=(min(drift_time), max(drift_time), min(ret_time), max(ret_time)),
        )

        plt.colorbar(label="PLS loadings")
        plt.title(f"PLS loadings of component {component}")
        plt.xlabel(self.dataset[0]._drift_time_label)
        plt.ylabel("Retention time [s]")
        ax.xaxis.set_minor_locator(AutoMinorLocator())
        ax.yaxis.set_minor_locator(AutoMinorLocator())
        return ax

[docs]    def plot_coefficients(self, width=6, height=6):
        """
        Plots PLS coefficients as image with retention and drift time axis.

        Parameters
        ----------
        width : int or float, optional
            Width of the plot in inches,
            by default 8.

        height : int or float, optional
            Height of the plot in inches,
            by default 8.

        Returns
        -------
        matplotlib.pyplot.axes
        """
        if not self._fitted:
            raise ValueError(
                "This model is not fitted yet! Call 'fit' with appropriate arguments before plotting."
            )

        coef = self.coefficients.reshape(self.dataset[0].values.shape)

        ret_time = self.dataset[0].ret_time
        drift_time = self.dataset[0].drift_time

        _, ax = plt.subplots(figsize=(width, height))

        plt.imshow(
            coef,
            cmap="RdBu_r",
            norm=CenteredNorm(0),
            origin="lower",
            aspect="auto",
            extent=(min(drift_time), max(drift_time), min(ret_time), max(ret_time)),
        )

        plt.colorbar(label="PLS coefficients")
        plt.title("PLS coefficients")
        plt.xlabel(self.dataset[0]._drift_time_label)
        plt.ylabel("Retention time [s]")
        ax.xaxis.set_minor_locator(AutoMinorLocator())
        ax.yaxis.set_minor_locator(AutoMinorLocator())
        return ax

[docs]    def plot_vip_scores(self, threshold=None, width=6, height=6):
        """
        Plots VIP scores as image with retention and drift time axis.

        Parameters
        ----------
        threshold : int
            Only plots VIP scores above threshold if set.
            Values below are displayed as 0,
            by default None.

        width : int or float, optional
            Width of the plot in inches,
            by default 8.

        height : int or float, optional
            Height of the plot in inches,
            by default 8.

        Returns
        -------
        matplotlib.pyplot.axes

        Raises
        ------
        ValueError
            If VIP scores have not been calculated prior.
        """
        if not self._fitted:
            raise ValueError(
                "This model is not fitted yet! Call 'fit' with appropriate arguments before plotting."
            )

        if threshold is None:
            vip_matrix = self.vip_scores.reshape(self.dataset[0].values.shape)
        else:
            vips = np.zeros_like(self.vip_scores)
            i = np.where(self.vip_scores > threshold)
            vips[i] = self.vip_scores[i]
            vip_matrix = vips.reshape(self.dataset[0].values.shape)

        ret_time = self.dataset[0].ret_time
        drift_time = self.dataset[0].drift_time

        _, ax = plt.subplots(figsize=(width, height))

        plt.imshow(
            vip_matrix,
            cmap="RdBu_r",
            origin="lower",
            aspect="auto",
            extent=(min(drift_time), max(drift_time), min(ret_time), max(ret_time)),
        )

        plt.colorbar(label="VIP scores")
        plt.title(f"PLS VIP scores")
        plt.xlabel(self.dataset[0]._drift_time_label)
        plt.ylabel("Retention time [s]")
        ax.xaxis.set_minor_locator(AutoMinorLocator())
        ax.yaxis.set_minor_locator(AutoMinorLocator())
        return ax

[docs]    def plot_selectivity_ratio(self, threshold=None, width=6, height=6):
        """
        Plots VIP scores as image with retention and drift time axis.

        Parameters
        ----------
        threshold : int
            Only plots VIP scores above threshold if set.
            Values below are displayed as 0,
            by default None.

        width : int or float, optional
            Width of the plot in inches,
            by default 8.

        height : int or float, optional
            Height of the plot in inches,
            by default 8.

        Returns
        -------
        matplotlib.pyplot.axes
        """
        if not self._fitted:
            raise ValueError(
                "This model is not fitted yet! Call 'fit' with appropriate arguments before plotting."
            )

        if threshold is None:
            sr_matrix = self.vip_scores.reshape(self.dataset[0].values.shape)
        else:
            sratio = np.zeros_like(self.vip_scores)
            i = np.where(self.vip_scores > threshold)
            sratio[i] = self.vip_scores[i]
            sr_matrix = sratio.reshape(self.dataset[0].values.shape)

        ret_time = self.dataset[0].ret_time
        drift_time = self.dataset[0].drift_time

        _, ax = plt.subplots(figsize=(width, height))

        plt.imshow(
            sr_matrix,
            cmap="RdBu_r",
            origin="lower",
            aspect="auto",
            extent=(min(drift_time), max(drift_time), min(ret_time), max(ret_time)),
        )

        plt.colorbar(label="Selectivity ratio")
        plt.title(f"PLS selectivity ratio")
        plt.xlabel(self.dataset[0]._drift_time_label)
        plt.ylabel("Retention time [s]")
        ax.xaxis.set_minor_locator(AutoMinorLocator())
        ax.yaxis.set_minor_locator(AutoMinorLocator())
        return ax