Source code for ims.simca

#%%
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
from scipy.stats import f
from sklearn.decomposition import PCA
import scipy
import pandas as pd
from matplotlib.ticker import AutoMinorLocator

[docs] class OneClassSIMCA: """ Implements soft independed modelling of class analogies (SIMCA) for one target class. Parameters ---------- n_components : int, optional Number of PCA components, by default 2. q : float, optional q-value: false discovery rate, by default 0.95. Attributes --------- n_components : int Parameter set during initialization. q : float Parameter set during initialization. pca : sklearn.decomposition.PCA The underlying PCA model. target : str Name of the target class. Parameter set with 'fit' method. Q_target : numpy.ndarray of shape (n_samples,) Q residuals of target class. Calculated in 'fit' method. Q_conf : float Q confidence limit of target class. Calculated in 'fit' method. Q_test : numpy.ndarray of shape (n_samples,) Q residuals of test samples. Calculated in 'predict' method. Tsq_target : numpy.ndarray of shape (n_samples,) T square values of target class. Calculated in 'fit' method. Tsq_conf : float T square confidence limit of target class. Calculated in 'fit' method. Tsq_test : numpy.ndarray of shape (n_samples,) T square of test samples. Calculated in 'predict' method. Example ------- >>> import numpy as np >>> from sklearn.datasets import load_iris >>> from sklearn.utils import resample >>> from sklearn.metrics import accuracy_score >>> >>> # load example data and set target (class label) >>> X, y = load_iris(return_X_y=True) >>> target = 0 >>> >>> # Find target data to train model and draw random test data >>> X_target = X[np.where(y == target)] >>> X_test, y_test = resample(X, y n_samples=50) >>> >>> # instantiate a model and fit target data >>> model = OneClassSIMCA(n_components=2, q=0.95) >>> model.fit(X_target, target) >>> >>> # make prediction on test data and calculate accuracy >>> y_pred = model.predict(X_test) >>> y_true = y_test == target >>> print(accuracy_score(y_true, y_pred)) >>> >>> # visualize results >>> model.plot(hue=y_test) """ def __init__(self, n_components=2, q=0.95): self.n_components = n_components self.q = q self.pca = PCA(n_components=n_components) self._fitted = False self._validated = False
[docs] def fit(self, X_target, target): """ Fit SIMCA model with data from target class. Parameters ---------- X_target : numpy.ndarray of shape (n_samples, n_features) Training data for target class. target : str Name of the target class as identifier of the model and for plotting. """ # set target as identifier and title in plot method self.target = target self.pca.fit(X_target) self.scores_target = self.pca.transform(X_target) self.loadings = self.pca.components_ self.residuals_target = X_target - self.pca.inverse_transform(self.scores_target) self.Q_target = np.sum(self.residuals_target**2, axis=1) lambda_values = PCA().fit(X_target).explained_variance_[self.n_components:] theta1 = np.sum(lambda_values) theta2 = np.sum(lambda_values**2) theta3 = np.sum(lambda_values**3) h0 = 1 - 2 * theta1 * theta3 / (3 * theta2**2) calpha = scipy.stats.norm.ppf(self.q) fraction1 = calpha * np.sqrt(2 * theta2 * h0**2) / theta1 fraction2 = theta2*h0*(h0-1) / theta1**2 self.Q_conf = theta1 * (1 + fraction1 + fraction2)**(1/h0) self.Tsq_target = np.sum( (self.scores_target / np.std(self.scores_target, axis=0)) ** 2, axis=1 ) self.Tsq_conf = ( f.ppf(q=0.95, dfn=self.n_components, dfd=self.scores_target.shape[0]) * self.n_components * (self.scores_target.shape[0] - 1) / (self.scores_target.shape[0] - self.n_components) ) self._fitted = True return self
[docs] def predict(self, X_test, decision_rule="both"): """ Applies fitted SIMCA model to test data and makes a prediction about the target class membership. Parameters ---------- X_test : numpy.ndarray of shape (n_samples, n_features) Test data as feature matrix. decision_rule : str, optional Prediction based on either 'Q', 'Tsq' or 'both', by default "both". Returns ------- numpy.ndarray of shape (n_samples,) Boolean result array. Raises ------ ValueError If invalid value is given for decision_rule argument. """ # set as attributes for plot method self.scores_test = self.pca.transform(X_test) self.residuals_test = X_test - self.pca.inverse_transform(self.scores_test) self.Q_test = np.sum(self.residuals_test**2, axis=1) self.Tsq_test = np.sum( (self.scores_test / np.std(self.scores_test, axis=0)) ** 2, axis=1 ) pred_Q = self.Q_test < self.Q_conf pred_Tsq = self.Tsq_test < self.Tsq_conf self._validated = True if decision_rule == "both": return np.logical_and(pred_Q, pred_Tsq) elif decision_rule == "Q": return pred_Q elif decision_rule == "Tsq": return pred_Tsq else: raise ValueError("Invalid decision method, use 'Q', 'Tsq' or 'both'.")
[docs] def Tsq_Q_plot(self, hue=None, annotate=None): """ Visualizes fitted SIMCA model as T square Q plot. Test data is included when the 'predict' method was used prior. Parameters ---------- hue : iterable, optional Iterable with test data class labels to color markers by class. Only labels for test data is needed, labels for target class are already known. Ignores if 'predict' method was not used prior, by default None. annotate : iterable, optional Iterable with sample names to annotate markers with, by default None. Returns ------- matplotlib.pyplot.axes Raises ------ ValueError If instance was not fitted before calling 'plot'. """ if not self._fitted: raise ValueError( "This model is not fitted yet! Call 'fit' with appropriate arguments before plotting." ) if self._validated: X = np.concatenate((self.Q_target, self.Q_test)) Y = np.concatenate((self.Tsq_target, self.Tsq_test)) style = ["Training"] * len(self.Q_target) + ["Validation"] * len( self.Q_test ) if hue is not None: hue = [self.target] * len(self.Q_target) + list(hue) else: X = self.Q_target Y = self.Tsq_target style = ["Training"] * len(self.Q_target) hue = None ax = sns.scatterplot(x=X, y=Y, style=style, hue=hue) if annotate is not None: for x, y, name in zip(X, Y, annotate): ax.text(x, y, name) if self._validated: ax.legend() ax.axhline(self.Tsq_conf, c="tab:red", linestyle=":") ax.axvline(self.Q_conf, c="tab:red", linestyle=":") ax.set_xlabel("Q") ax.set_ylabel("$T^2$", rotation=0, labelpad=15) ax.set_title(f"SIMCA model for target: {self.target}") return ax
[docs] def scores_plot(self, y_train, y_test, x_comp=1, y_comp=2): """ Visualizes scores of fitted SIMCA model as Scores Plot. Parameters: ---------- y_train : numpy.array of shape(n_samples,) True class labels for training data. y_test : numpy.array of shape(n_samples,) True class labels for test data. x_comp : int, optional Component x axis, by default 1. y_comp : int, optional Component y axis, by default 2. Returns ------- matplotlib.pyplot.axes """ self.y_train = y_train self.y_test = y_test if not self._fitted: raise ValueError( "This model is not fitted yet! Call 'fit' with appropriate arguments before plotting." ) if self._validated: X = np.concatenate( (self.scores_target[:, x_comp - 1], self.scores_test[:, x_comp - 1]) ) Y = np.concatenate( (self.scores_target[:, y_comp - 1], self.scores_test[:, y_comp - 1]) ) hue = list(self.y_train) + list(self.y_test) style = ["Training"] * len(self.y_train) + ["Validation"] * len( self.y_test ) ax = sns.scatterplot(x=X, y=Y, hue=hue, style=style) ax.legend() plt.xlabel(f"PC {x_comp}") plt.ylabel(f"PC {y_comp}") return ax
[docs] def plot_loadings(self, dataset, PC=1, color_range=0.1, width=6, height=6): """ Plots loadings of a principle component with the original retention and drift time coordinates. Parameters ---------- dataset : ims.Dataset The dataset is needed for the retention and drift time coordinates. PC : int, optional principal component, by default 1. color_range : int, optional color_scale ranges from - color_range to + color_range centered at 0. width : int or float, optional plot width in inches, by default 9. height : int or float, optional plot height in inches, by default 10. Returns ------- matplotlib.pyplot.axes """ # use retention and drift time axis from the first spectrum ret_time = dataset[0].ret_time drift_time = dataset[0].drift_time loading_pc = self.loadings[PC - 1, :].reshape(len(ret_time), len(drift_time)) expl_var = [] for i in range(1, self.n_components + 1): expl_var.append(round(self.pca.explained_variance_ratio_[i - 1] * 100, 1)) _, ax = plt.subplots(figsize=(width, height)) plt.imshow( loading_pc, origin="lower", aspect="auto", cmap="RdBu_r", vmin=(-color_range), vmax=color_range, extent=(min(drift_time), max(drift_time), min(ret_time), max(ret_time)), ) plt.colorbar().set_label("PCA loadings") ax.xaxis.set_minor_locator(AutoMinorLocator()) ax.yaxis.set_minor_locator(AutoMinorLocator()) plt.xlabel(dataset[0]._drift_time_label) plt.ylabel("Retention time [s]") plt.title(f"PCA loadings of PC {PC} ({expl_var[PC-1]} % of variance)") return ax
[docs] class MultiClassSIMCA: """ Builds soft independent modelling of class analogies model out off OneClassSIMCA models for each class. Parameters ---------- models : list OneClassSIMCA model per class. Attributes --------- nclasses : int Number of OneClasSIMCA models in models. classnames : list Names of targets in OneClasSIMCA models. res_df : pd.DataFrame of shape (n_targets, n_samples) Binary representation of classification. Calculated in the predict method. Raises ------ ValueError If instance was not fitted in 'OneClassSimca' class. Example ------- >>> import numpy as np >>> from sklearn.datasets import load_iris >>> from sklearn.utils import resample >>> from sklearn.metrics import accuracy_score >>> >>> # load example data >>> X, y = load_iris(return_X_y=True) >>> >>> # instantiate a model and fit target data >>> model = MultiClassSIMCA() >>> model.fit(X, y, n_components=2, q=0.95) >>> >>> # make prediction on test data and calculate accuracy >>> y_pred = model.predict(X_test) >>> y_true = y_test == target >>> print(accuracy_score(y_true, y_pred)) >>> >>> # visualize results >>> model.plot() """ def __init__(self, models=[]): self.models = models self.nclasses = len(models) self.classnames = [list(model.target) for model in models] for model in models: if not model._fitted: raise ValueError( f"OneClassSIMCA {model.target} is not fitted yet! Call 'fit' in OneClassSIMCA with appropriate arguments." )
[docs] def fit(self, X_train, y_train, n_components=2, p=0.95): """ Fit OneClasSIMCA models with training data. Parameters ---------- X_train : numpy.ndarray of shape (n_samples, n_features) Training data as feature matrix. y_train : numpy.array of shape(n_samples,) True class labels for training data. n_components : int, optional Number of PCA components, by default 2. q : float, optional False discovery rate, by default 0.95. """ self.models = [] target_list = np.unique(y_train) for target in target_list: X_target = X_train[np.where(y_train == target)] model = OneClassSIMCA(n_components, p) model.fit(X_target, target) self.models.append(model) self.classnames.append(target) return self
[docs] def predict(self, X_test, y_test=None): """ Applies fitted SIMCA models to test data and makes a prediction about the class memberships. Parameters ---------- X_test : numpy.ndarray of shape (n_samples, n_features) Test data as feature matrix. y_test : numpy.array of shape(n_samples,), optional True class labels for test data, by default None Returns ------- pd.DataFrame of shape (n_targets, n_samples) """ y_pred_class = [] for model in self.models: y_pred = model.predict(X_test) y_pred_class.append(y_pred.astype('int')) self.res_df = pd.DataFrame( data=np.array(y_pred_class).T, columns=[model.target for model in self.models] ) if y_test is not None: self.res_df.insert(loc=0, column="Sample name", value=list(y_test)) return self.res_df
[docs] def plot(self): """Visualizes the classification of test data. Returns ------- matplotlib.pyplot.axes """ self.res_df = self.res_df.set_index("Sample name").sort_index() # Define colors colors = ["tab:grey", "tab:cyan"] cmap = mcolors.LinearSegmentedColormap.from_list('Custom', colors, len(colors)) ax = sns.heatmap(self.res_df, cmap=cmap) # Set the colorbar labels colorbar = ax.collections[0].colorbar colorbar.set_ticks([0.25,0.75]) colorbar.set_ticklabels(['False', 'True']) ax.set(xlabel="Predicted label", ylabel="Sample name") return ax