Source code for ims.simca

#%%
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
from scipy.stats import f
from sklearn.decomposition import PCA
import scipy
import pandas as pd
from matplotlib.ticker import AutoMinorLocator


[docs]
class OneClassSIMCA:
    """
    Implements soft independed modelling of class analogies (SIMCA) for one target class.

    Parameters
    ----------
    n_components : int, optional
        Number of PCA components, by default 2.

    q : float, optional
        q-value: false discovery rate, by default 0.95.

    Attributes
    ---------
    n_components : int
        Parameter set during initialization.

    q : float
        Parameter set during initialization.

    pca : sklearn.decomposition.PCA
        The underlying PCA model.

    target : str
        Name of the target class. Parameter set with 'fit' method.

    Q_target : numpy.ndarray of shape (n_samples,)
        Q residuals of target class. Calculated in 'fit' method.

    Q_conf : float
        Q confidence limit of target class. Calculated in 'fit' method.

    Q_test : numpy.ndarray of shape (n_samples,)
        Q residuals of test samples. Calculated in 'predict' method.

    Tsq_target : numpy.ndarray of shape (n_samples,)
        T square values of target class. Calculated in 'fit' method.

    Tsq_conf : float
        T square confidence limit of target class. Calculated in 'fit' method.

    Tsq_test : numpy.ndarray of shape (n_samples,)
        T square of test samples. Calculated in 'predict' method.

    Example
    -------
    >>> import numpy as np
    >>> from sklearn.datasets import load_iris
    >>> from sklearn.utils import resample
    >>> from sklearn.metrics import accuracy_score
    >>>
    >>> # load example data and set target (class label)
    >>> X, y = load_iris(return_X_y=True)
    >>> target = 0
    >>>
    >>> # Find target data to train model and draw random test data
    >>> X_target = X[np.where(y == target)]
    >>> X_test, y_test = resample(X, y n_samples=50)
    >>>
    >>> # instantiate a model and fit target data
    >>> model = OneClassSIMCA(n_components=2, q=0.95)
    >>> model.fit(X_target, target)
    >>>
    >>> # make prediction on test data and calculate accuracy
    >>> y_pred = model.predict(X_test)
    >>> y_true = y_test == target
    >>> print(accuracy_score(y_true, y_pred))
    >>>
    >>> # visualize results
    >>> model.plot(hue=y_test)
    """

    def __init__(self, n_components=2, q=0.95):
        self.n_components = n_components
        self.q = q
        self.pca = PCA(n_components=n_components)
        self._fitted = False
        self._validated = False


[docs]
    def fit(self, X_target, target):
        """
        Fit SIMCA model with data from target class.

        Parameters
        ----------
        X_target : numpy.ndarray of shape (n_samples, n_features)
            Training data for target class.

        target : str
            Name of the target class as identifier of the model and for plotting.
        """
        # set target as identifier and title in plot method
        self.target = target
        self.pca.fit(X_target)

        self.scores_target = self.pca.transform(X_target)
        self.loadings = self.pca.components_
        self.residuals_target = X_target - self.pca.inverse_transform(self.scores_target)

        self.Q_target = np.sum(self.residuals_target**2, axis=1)
        
        lambda_values = PCA().fit(X_target).explained_variance_[self.n_components:]
        theta1 = np.sum(lambda_values)
        theta2 = np.sum(lambda_values**2)
        theta3 = np.sum(lambda_values**3)
        h0 = 1 - 2 * theta1 * theta3 / (3 * theta2**2)
        calpha = scipy.stats.norm.ppf(self.q)
        fraction1 = calpha * np.sqrt(2 * theta2 * h0**2) / theta1
        fraction2 = theta2*h0*(h0-1) / theta1**2
        self.Q_conf = theta1 * (1 + fraction1 + fraction2)**(1/h0)

        self.Tsq_target = np.sum(
            (self.scores_target / np.std(self.scores_target, axis=0)) ** 2, axis=1
        )
        self.Tsq_conf = (
            f.ppf(q=0.95, dfn=self.n_components, dfd=self.scores_target.shape[0])
            * self.n_components
            * (self.scores_target.shape[0] - 1)
            / (self.scores_target.shape[0] - self.n_components)
        )

        self._fitted = True
        
        return self



[docs]
    def predict(self, X_test, decision_rule="both"):
        """
        Applies fitted SIMCA model to test data and makes a prediction
        about the target class membership.

        Parameters
        ----------
        X_test : numpy.ndarray of shape (n_samples, n_features)
            Test data as feature matrix.

        decision_rule : str, optional
            Prediction based on either 'Q', 'Tsq' or 'both',
            by default "both".

        Returns
        -------
        numpy.ndarray of shape (n_samples,)
            Boolean result array.

        Raises
        ------
        ValueError
            If invalid value is given for decision_rule argument.
        """
        # set as attributes for plot method
        
        self.scores_test = self.pca.transform(X_test)
        self.residuals_test = X_test - self.pca.inverse_transform(self.scores_test)
        self.Q_test = np.sum(self.residuals_test**2, axis=1)
        self.Tsq_test = np.sum(
            (self.scores_test / np.std(self.scores_test, axis=0)) ** 2, axis=1
        )

        pred_Q = self.Q_test < self.Q_conf
        pred_Tsq = self.Tsq_test < self.Tsq_conf
        self._validated = True

        if decision_rule == "both":
            return np.logical_and(pred_Q, pred_Tsq)
        elif decision_rule == "Q":
            return pred_Q
        elif decision_rule == "Tsq":
            return pred_Tsq
        else:
            raise ValueError("Invalid decision method, use 'Q', 'Tsq' or 'both'.")



[docs]
    def Tsq_Q_plot(self, hue=None, annotate=None):
        """
        Visualizes fitted SIMCA model as T square Q plot.
        Test data is included when the 'predict' method was used prior.

        Parameters
        ----------
        hue : iterable, optional
            Iterable with test data class labels to color markers by class.
            Only labels for test data is needed, labels for target class are already known.
            Ignores if 'predict' method was not used prior,
            by default None.

        annotate : iterable, optional
            Iterable with sample names to annotate markers with,
            by default None.

        Returns
        -------
        matplotlib.pyplot.axes

        Raises
        ------
        ValueError
            If instance was not fitted before calling 'plot'.
        """
        if not self._fitted:
            raise ValueError(
                "This model is not fitted yet! Call 'fit' with appropriate arguments before plotting."
            )

        if self._validated:
            X = np.concatenate((self.Q_target, self.Q_test))
            Y = np.concatenate((self.Tsq_target, self.Tsq_test))
            style = ["Training"] * len(self.Q_target) + ["Validation"] * len(
                self.Q_test
            )
            if hue is not None:
                hue = [self.target] * len(self.Q_target) + list(hue)
        else:
            X = self.Q_target
            Y = self.Tsq_target
            style = ["Training"] * len(self.Q_target)
            hue = None

        ax = sns.scatterplot(x=X, y=Y, style=style, hue=hue)

        if annotate is not None:
            for x, y, name in zip(X, Y, annotate):
                ax.text(x, y, name)

        if self._validated:
            ax.legend()

        ax.axhline(self.Tsq_conf, c="tab:red", linestyle=":")
        ax.axvline(self.Q_conf, c="tab:red", linestyle=":")
        ax.set_xlabel("Q")
        ax.set_ylabel("$T^2$", rotation=0, labelpad=15)
        ax.set_title(f"SIMCA model for target: {self.target}")
        return ax

    

[docs]
    def scores_plot(self, y_train, y_test, x_comp=1, y_comp=2):
        """
        Visualizes scores of fitted SIMCA model as Scores Plot.

        Parameters:
        ---------- 
        y_train : numpy.array of shape(n_samples,)
            True class labels for training data.    
        
        y_test : numpy.array of shape(n_samples,)
            True class labels for test data.
            
        x_comp : int, optional
            Component x axis, by default 1.

        y_comp : int, optional
            Component y axis, by default 2.
            
        Returns
        -------
        matplotlib.pyplot.axes         
        """        
        self.y_train = y_train
        self.y_test = y_test
        
        if not self._fitted:
            raise ValueError(
                "This model is not fitted yet! Call 'fit' with appropriate arguments before plotting."
            )

        if self._validated:
            X = np.concatenate(
                (self.scores_target[:, x_comp - 1], self.scores_test[:, x_comp - 1])
            )
            Y = np.concatenate(
                (self.scores_target[:, y_comp - 1], self.scores_test[:, y_comp - 1])
            )
            hue = list(self.y_train) + list(self.y_test)
            style = ["Training"] * len(self.y_train) + ["Validation"] * len(
                self.y_test
            )

        ax = sns.scatterplot(x=X, y=Y, hue=hue, style=style)

        ax.legend()

        plt.xlabel(f"PC {x_comp}")
        plt.ylabel(f"PC {y_comp}")

        return ax

    

[docs]
    def plot_loadings(self, dataset, PC=1, color_range=0.1, width=6, height=6):
        """
        Plots loadings of a principle component with the original retention
        and drift time coordinates.

        Parameters
        ----------
        dataset : ims.Dataset
            The dataset is needed for the retention and drift time
            coordinates. 
        
        PC : int, optional
            principal component, by default 1.

        color_range : int, optional
            color_scale ranges from - color_range to + color_range
            centered at 0.

        width : int or float, optional
            plot width in inches, by default 9.

        height : int or float, optional
            plot height in inches, by default 10.

        Returns
        -------
        matplotlib.pyplot.axes
        """

        # use retention and drift time axis from the first spectrum
        ret_time = dataset[0].ret_time
        drift_time = dataset[0].drift_time

        loading_pc = self.loadings[PC - 1, :].reshape(len(ret_time), len(drift_time))

        expl_var = []
        for i in range(1, self.n_components + 1):
            expl_var.append(round(self.pca.explained_variance_ratio_[i - 1] * 100, 1))

        _, ax = plt.subplots(figsize=(width, height))

        plt.imshow(
            loading_pc,
            origin="lower",
            aspect="auto",
            cmap="RdBu_r",
            vmin=(-color_range),
            vmax=color_range,
            extent=(min(drift_time), max(drift_time), min(ret_time), max(ret_time)),
        )

        plt.colorbar().set_label("PCA loadings")

        ax.xaxis.set_minor_locator(AutoMinorLocator())
        ax.yaxis.set_minor_locator(AutoMinorLocator())

        plt.xlabel(dataset[0]._drift_time_label)
        plt.ylabel("Retention time [s]")
        plt.title(f"PCA loadings of PC {PC} ({expl_var[PC-1]} % of variance)")
        return ax







[docs]
class MultiClassSIMCA:
    """
    Builds soft independent modelling of class analogies model out off OneClassSIMCA models for each class.

    Parameters
    ----------
    models : list
        OneClassSIMCA model per class.

    Attributes
    ---------
    nclasses : int
        Number of OneClasSIMCA models in models.
        
    classnames : list
        Names of targets in OneClasSIMCA models.
        
    res_df : pd.DataFrame of shape (n_targets, n_samples)
        Binary representation of classification.
        Calculated in the predict method.

    Raises
    ------
    ValueError
        If instance was not fitted in 'OneClassSimca' class.

    Example
    -------
    >>> import numpy as np
    >>> from sklearn.datasets import load_iris
    >>> from sklearn.utils import resample
    >>> from sklearn.metrics import accuracy_score
    >>>
    >>> # load example data
    >>> X, y = load_iris(return_X_y=True)
    >>>
    >>> # instantiate a model and fit target data
    >>> model = MultiClassSIMCA()
    >>> model.fit(X, y, n_components=2, q=0.95)
    >>>
    >>> # make prediction on test data and calculate accuracy
    >>> y_pred = model.predict(X_test)
    >>> y_true = y_test == target
    >>> print(accuracy_score(y_true, y_pred))
    >>>
    >>> # visualize results
    >>> model.plot()
    """
    
    def __init__(self, models=[]):
        self.models = models
        self.nclasses = len(models)
        self.classnames = [list(model.target) for model in models]
        for model in models:
            if not model._fitted:
                raise ValueError(
                f"OneClassSIMCA {model.target} is not fitted yet! Call 'fit' in OneClassSIMCA with appropriate arguments."
                ) 
            

[docs]
    def fit(self, X_train, y_train, n_components=2, p=0.95):
        """
        Fit OneClasSIMCA models with training data.
        
        Parameters
        ----------
        X_train : numpy.ndarray of shape (n_samples, n_features)
            Training data as feature matrix.
        
        y_train : numpy.array of shape(n_samples,)
            True class labels for training data.
            
        n_components : int, optional
            Number of PCA components, by default 2.

        q : float, optional
            False discovery rate, by default 0.95.
        """
        
        self.models = []
        target_list = np.unique(y_train)
        for target in target_list:
            X_target = X_train[np.where(y_train == target)]
            model = OneClassSIMCA(n_components, p)
            model.fit(X_target, target)
            self.models.append(model)
            self.classnames.append(target)
            
        return self



[docs]
    def predict(self, X_test, y_test=None):
        """
        Applies fitted SIMCA models to test data and makes a prediction
        about the class memberships.

        Parameters
        ----------
        X_test : numpy.ndarray of shape (n_samples, n_features)
            Test data as feature matrix.
            
        y_test : numpy.array of shape(n_samples,), optional
            True class labels for test data, by default None

        Returns
        -------
        pd.DataFrame of shape (n_targets, n_samples)
        """
        
        y_pred_class = []
        for model in self.models:
            y_pred = model.predict(X_test)
            y_pred_class.append(y_pred.astype('int'))
            
        self.res_df = pd.DataFrame(
            data=np.array(y_pred_class).T,
            columns=[model.target for model in self.models]
        )
        
        if y_test is not None:
            self.res_df.insert(loc=0, column="Sample name", value=list(y_test))
            
        return self.res_df

    

[docs]
    def plot(self):
        
        """Visualizes the classification of test data.

        Returns
        -------
        matplotlib.pyplot.axes
        """
        
        self.res_df = self.res_df.set_index("Sample name").sort_index()
        
        # Define colors
        colors = ["tab:grey", "tab:cyan"]
        cmap = mcolors.LinearSegmentedColormap.from_list('Custom', colors, len(colors))

        ax = sns.heatmap(self.res_df, cmap=cmap)

        # Set the colorbar labels
        colorbar = ax.collections[0].colorbar
        colorbar.set_ticks([0.25,0.75])
        colorbar.set_ticklabels(['False', 'True'])
        
        ax.set(xlabel="Predicted label", ylabel="Sample name")
        
        return ax