Source code for ims.pca

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import MaxNLocator, AutoMinorLocator
from sklearn.decomposition import PCA
from scipy.stats import f


class PCA_Model:
    """
    PCA_Model is a wrapper class around the scikit-learn PCA implementation
    and provides prebuilt plots for GC-IMS datasets.

    See the original scikit-learn documentation for a detailed description:
    https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html

    Parameters
    ----------
    dataset : ims.Dataset
        The dataset is needed for the retention and drift time coordinates.

    n_components : int or float, optional
        Number of components to keep. If None all components are kept,
        by default None.

    svd_solver : str, optional
        "auto", "full", "arpack" or "randomized" are valid,
        by default "auto".

    **kwargs : optional
        Additional keyword arguments are passed to the scikit-learn PCA.
        See the original documentation for valid parameters.

    Attributes
    ----------
    scores : numpy.ndarray of shape (n_samples, n_components)
        X with dimension reduction applied.

    loadings : numpy.ndarray of shape (n_components, n_features)
        PCA loadings already corrected when a scaling method
        was applied on the dataset.

    explained_variance : numpy.ndarray of shape (n_components,)
        The amount of variance explained by each component.

    explained_variance_ratio : numpy.ndarray of shape (n_components,)
        Percentage of variance explained by each component.

    singular_values : numpy.ndarray of shape (n_components,)
        The singular values corresponding to each component.

    mean : numpy.ndarray of shape (n_features,)
        Per feature mean estimated from training data.

    Example
    -------
    >>> import ims
    >>> ds = ims.Dataset.read_mea("IMS_data")
    >>> X, _ = ds.get_xy()
    >>> pca = ims.PCA_Model(ds, n_components=20)
    >>> pca.fit(X)
    >>> pca.plot()
    """

    def __init__(self, dataset, n_components=None, svd_solver="auto", **kwargs):
        self.dataset = dataset
        if n_components is None:
            self.n_components = len(self.dataset)
        else:
            self.n_components = n_components
        self.svd_solver = svd_solver
        self._sk_pca = PCA(n_components, svd_solver=svd_solver, **kwargs)

    def fit(self, X_train):
        """
        Fit the PCA model with training data.

        Parameters
        ----------
        X_train : numpy.ndarray of shape (n_samples, n_features)
            The training data.

        Returns
        -------
        self
            The fitted model.
        """
        self._sk_pca.fit(X_train)
        self.scores = self._sk_pca.transform(X_train)
        self.explained_variance = self._sk_pca.explained_variance_
        self.explained_variance_ratio = self._sk_pca.explained_variance_ratio_
        self.singular_values = self._sk_pca.singular_values_
        self.mean = self._sk_pca.mean_
        self.loadings = self._sk_pca.components_

        # Q residuals: per sample sum of squared reconstruction errors.
        # The reconstruction adds the mean back, because scikit-learn
        # centers the data internally before the decomposition.
        self.residuals = X_train - (self.scores @ self.loadings + self.mean)
        self.Q = np.sum(self.residuals**2, axis=1)

        # Hotelling's T squared: sum of squared standardized scores per sample.
        self.Tsq = np.sum((self.scores / np.std(self.scores, axis=0)) ** 2, axis=1)

        # 95 % confidence limits: the T squared limit follows the standard
        # formula A(n-1)/(n-A) * F_0.95(A, n-A) with A components and
        # n samples; the Q limit is the empirical 95 % quantile.
        n_samples = self.scores.shape[0]
        self.Tsq_conf = (
            f.ppf(q=0.95, dfn=self.n_components, dfd=n_samples - self.n_components)
            * self.n_components
            * (n_samples - 1)
            / (n_samples - self.n_components)
        )
        self.Q_conf = np.quantile(self.Q, q=0.95)
        return self
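
    # A minimal usage sketch for fit (illustrative only; the folder name,
    # component count, and variable names follow the class docstring example
    # and are assumptions, not part of this module):
    #
    #     ds = ims.Dataset.read_mea("IMS_data")
    #     X, _ = ds.get_xy()
    #     pca = PCA_Model(ds, n_components=5).fit(X)
    #     pca.scores.shape        # (n_samples, 5)
    #     pca.Q, pca.Tsq          # per sample outlier statistics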

    def plot(self, PC_x=1, PC_y=2, annotate=False):
        """
        Scatter plot of selected principal components.

        Parameters
        ----------
        PC_x : int, optional
            PC x axis, by default 1.

        PC_y : int, optional
            PC y axis, by default 2.

        annotate : bool, optional
            Label data points with sample name, by default False.

        Returns
        -------
        matplotlib.pyplot.axes
        """
        expl_var = []
        for i in range(1, self.n_components + 1):
            expl_var.append(round(self.explained_variance_ratio[i - 1] * 100, 1))

        pc_df = pd.DataFrame(
            data=self.scores,
            columns=[f"PC {x}" for x in range(1, self.n_components + 1)],
        )
        if hasattr(self.dataset, "train_index"):
            pc_df["Sample"] = self.dataset[self.dataset.train_index].samples
            pc_df["Label"] = self.dataset[self.dataset.train_index].labels
        else:
            pc_df["Sample"] = self.dataset.samples
            pc_df["Label"] = self.dataset.labels

        ax = sns.scatterplot(
            x=f"PC {PC_x}",
            y=f"PC {PC_y}",
            data=pc_df,
            hue="Label",
            style="Label",
        )
        plt.legend()
        plt.xlabel(f"PC {PC_x} ({expl_var[PC_x-1]} % of variance)")
        plt.ylabel(f"PC {PC_y} ({expl_var[PC_y-1]} % of variance)")

        if annotate:
            for i, point in pc_df.iterrows():
                ax.text(point[f"PC {PC_x}"], point[f"PC {PC_y}"], point["Sample"])

        return ax
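
    # A short sketch for plot (assumes the fitted pca from the sketch
    # above; the axis choice and file name are arbitrary):
    #
    #     ax = pca.plot(PC_x=1, PC_y=3, annotate=True)
    #     ax.figure.savefig("pca_scores.png", dpi=300)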

    def plot_loadings(self, PC=1, color_range=0.1, width=6, height=6):
        """
        Plots loadings of a principal component with the original
        retention and drift time coordinates.

        Parameters
        ----------
        PC : int, optional
            Principal component, by default 1.

        color_range : float, optional
            Color scale ranges from -color_range to +color_range
            centered at 0, by default 0.1.

        width : int or float, optional
            Plot width in inches, by default 6.

        height : int or float, optional
            Plot height in inches, by default 6.

        Returns
        -------
        matplotlib.pyplot.axes
        """
        # use retention and drift time axis from the first spectrum
        ret_time = self.dataset[0].ret_time
        drift_time = self.dataset[0].drift_time

        loading_pc = self.loadings[PC - 1, :].reshape(len(ret_time), len(drift_time))

        expl_var = []
        for i in range(1, self.n_components + 1):
            expl_var.append(round(self.explained_variance_ratio[i - 1] * 100, 1))

        _, ax = plt.subplots(figsize=(width, height))
        plt.imshow(
            loading_pc,
            origin="lower",
            aspect="auto",
            cmap="RdBu_r",
            vmin=(-color_range),
            vmax=color_range,
            extent=(min(drift_time), max(drift_time), min(ret_time), max(ret_time)),
        )
        plt.colorbar().set_label("PCA loadings")

        ax.xaxis.set_minor_locator(AutoMinorLocator())
        ax.yaxis.set_minor_locator(AutoMinorLocator())

        plt.xlabel(self.dataset[0]._drift_time_label)
        plt.ylabel("Retention time [s]")
        plt.title(f"PCA loadings of PC {PC} ({expl_var[PC-1]} % of variance)")
        return ax
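
    # A short sketch for plot_loadings (same fitted pca assumed; a
    # tighter color_range emphasizes weaker loadings):
    #
    #     ax = pca.plot_loadings(PC=2, color_range=0.05, width=9, height=10)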

    def scree_plot(self):
        """
        Plots the explained variance ratio per principal component
        and cumulatively.

        Returns
        -------
        matplotlib.pyplot.axes
        """
        x = [*range(1, self.n_components + 1)]
        y = self.explained_variance_ratio

        _, ax = plt.subplots()
        for axis in [ax.xaxis, ax.yaxis]:
            axis.set_major_locator(MaxNLocator(integer=True))
        plt.xticks(x)
        plt.xlabel("Principal component")
        plt.ylabel("Explained variance ratio [%]")
        ax.plot(x, np.cumsum(y) * 100, label="cumulative")
        ax.plot(x, y * 100, label="per PC")
        plt.legend()
        return ax
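
    # A sketch for choosing n_components from the scree data (assumes the
    # fitted pca; the 95 % threshold is an illustrative choice):
    #
    #     pca.scree_plot()
    #     cumulative = np.cumsum(pca.explained_variance_ratio) * 100
    #     n_opt = int(np.argmax(cumulative >= 95)) + 1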

    def Tsq_Q_plot(self, annotate=False):
        """
        Plots Hotelling's T squared values and Q residuals with 95 %
        confidence limits for outlier detection.
        The Q confidence limit is determined empirically,
        while the T squared limit is calculated using the F distribution.

        Parameters
        ----------
        annotate : bool, optional
            Annotates markers with sample names when True,
            by default False.

        Returns
        -------
        matplotlib.pyplot.axes
        """
        ax = sns.scatterplot(
            x=self.Q, y=self.Tsq, style=self.dataset.labels, hue=self.dataset.labels
        )
        ax.axhline(self.Tsq_conf, c="tab:red", linestyle=":")
        ax.axvline(self.Q_conf, c="tab:red", linestyle=":")
        ax.set_xlabel("Q")
        ax.set_ylabel("$T^2$", rotation=0, labelpad=15)

        if annotate:
            for x, y, sample in zip(self.Q, self.Tsq, self.dataset.samples):
                plt.text(x, y, sample)

        return ax
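
# A minimal end-to-end sketch of the outlier-detection workflow. Assumptions:
# an "IMS_data" folder readable by ims.Dataset.read_mea and more samples than
# components; all variable names below are illustrative, not part of the API.
if __name__ == "__main__":
    import ims

    ds = ims.Dataset.read_mea("IMS_data")
    X, _ = ds.get_xy()

    pca = PCA_Model(ds, n_components=5).fit(X)

    # Samples beyond both 95 % confidence limits are likely outliers.
    suspects = [
        sample
        for sample, q, tsq in zip(ds.samples, pca.Q, pca.Tsq)
        if q > pca.Q_conf and tsq > pca.Tsq_conf
    ]
    print("Potential outliers:", suspects)

    pca.scree_plot()
    pca.plot(PC_x=1, PC_y=2)
    pca.plot_loadings(PC=1)
    pca.Tsq_Q_plot(annotate=True)
    plt.show()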