# Source code for simba.preprocessing._pca

"""Principal component analysis"""

import numpy as np
from sklearn.decomposition import TruncatedSVD
from ._utils import (
    locate_elbow,
)


def pca(adata,
        n_components=50,
        algorithm='randomized',
        n_iter=5,
        random_state=2021,
        tol=0.0,
        feature=None,
        **kwargs,
        ):
    """Perform Principal Component Analysis (PCA) with truncated SVD.

    A `TruncatedSVD` estimator is fitted on `adata.X` (or on the columns
    selected by `feature`) and the results are written back into `adata`.
    This function applies no centering or scaling of its own before the fit.

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix.
    n_components: `int`, optional (default: 50)
        Desired dimensionality of output data.
    algorithm: `str`, optional (default: 'randomized')
        SVD solver to use. Choose from {'arpack', 'randomized'}.
    n_iter: `int`, optional (default: 5)
        Number of iterations for randomized SVD solver.
        Not used by ARPACK.
    random_state: `int`, optional (default: 2021)
        Seed forwarded to `TruncatedSVD()`.
    tol: `float`, optional (default: 0.0)
        Tolerance for ARPACK. 0 means machine precision.
        Ignored by randomized SVD solver.
    feature: `str`, optional (default: None)
        Name of the column in `.var` used to select the features PCA is
        performed on. The data type of `.var[feature]` needs to be `bool`.
        If None, adata.X will be used.
    kwargs:
        Other keyword arguments are passed down to `TruncatedSVD()`.

    Returns
    -------
    None. Updates `adata` with the following fields:

    `.obsm['X_pca']` : `array`
        PCA transformed X.
    `.uns['pca']['n_pcs']` : `int`
        Number of components, initialised to `n_components`.
    `.uns['pca']['PCs']` : `array`
        Principal components in feature space,
        representing the directions of maximum variance in the data.
        One row per feature that entered the fit, one column per component.
    `.uns['pca']['variance']` : `array`
        The variance of the training samples transformed by a
        projection to each component.
    `.uns['pca']['variance_ratio']` : `array`
        Percentage of variance explained by each of the selected components.
    """
    if feature is None:
        X = adata.X.copy()
    else:
        # Restrict the fit to the features flagged in `.var[feature]`.
        # NOTE(review): the rows of 'PCs' then index this subset rather
        # than the full `adata.var` -- confirm downstream consumers
        # account for that.
        mask = adata.var[feature]
        X = adata[:, mask].X.copy()

    svd = TruncatedSVD(n_components=n_components,
                       algorithm=algorithm,
                       n_iter=n_iter,
                       random_state=random_state,
                       tol=tol,
                       **kwargs)
    svd.fit(X)
    adata.obsm['X_pca'] = svd.transform(X)

    # Any previous PCA results stored under 'pca' are replaced wholesale.
    adata.uns['pca'] = {
        'n_pcs': n_components,
        'PCs': svd.components_.T,
        'variance': svd.explained_variance_,
        'variance_ratio': svd.explained_variance_ratio_,
    }
def select_pcs(adata,
               n_pcs=None,
               S=1,
               curve='convex',
               direction='decreasing',
               online=False,
               min_elbow=None,
               **kwargs):
    """Select top PCs based on variance_ratio.

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix. `.uns['pca']` must already exist; when
        `n_pcs` is None, `.obsm['X_pca']` and
        `.uns['pca']['variance_ratio']` are read as well.
    n_pcs: `int`, optional (default: None)
        If n_pcs is None,
        the number of PCs will be automatically selected with
        "`kneed <https://kneed.readthedocs.io/>`__".
        Otherwise the given value is stored as is.
    S : `float`, optional (default: 1)
        Sensitivity
    min_elbow: `int`, optional (default: None)
        The minimum elbow location.
        By default, it is n_components/10
    curve: `str`, optional (default: 'convex')
        Choose from {'convex','concave'}
        If 'concave', algorithm will detect knees,
        If 'convex', algorithm will detect elbows.
    direction: `str`, optional (default: 'decreasing')
        Choose from {'decreasing','increasing'}
    online: `bool`, optional (default: False)
        kneed will correct old knee points if True,
        kneed will return first knee if False.
    **kwargs: `dict`, optional
        Extra arguments to KneeLocator.

    Returns
    -------
    None. Updates `adata` with the following field:

    `.uns['pca']['n_pcs']` : `int`
        The selected number of principal components.
    """
    if n_pcs is None:
        # No explicit choice: locate the elbow of the variance-ratio curve.
        n_components = adata.obsm['X_pca'].shape[1]
        if min_elbow is None:
            min_elbow = n_components/10
        n_pcs = locate_elbow(range(n_components),
                             adata.uns['pca']['variance_ratio'],
                             S=S,
                             curve=curve,
                             min_elbow=min_elbow,
                             direction=direction,
                             online=online,
                             **kwargs)
    # Explicit and automatic selection are stored the same way.
    adata.uns['pca']['n_pcs'] = n_pcs
def select_pcs_features(adata,
                        S=1,
                        curve='convex',
                        direction='decreasing',
                        online=False,
                        min_elbow=None,
                        **kwargs):
    """Select features that contribute to the top PCs.

    For each of the first `.uns['pca']['n_pcs']` components, features are
    ranked by the absolute value of their loading and the ones ahead of the
    elbow of that ranked curve are kept.

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix. `.uns['pca']['PCs']` and
        `.uns['pca']['n_pcs']` must already exist, and 'PCs' must have one
        row per entry of `adata.var_names`.
    S : `float`, optional (default: 1)
        Sensitivity
    min_elbow: `int`, optional (default: None)
        The minimum elbow location.
        By default, it is #features/6
    curve: `str`, optional (default: 'convex')
        Choose from {'convex','concave'}
        If 'concave', algorithm will detect knees,
        If 'convex', algorithm will detect elbows.
    direction: `str`, optional (default: 'decreasing')
        Choose from {'decreasing','increasing'}
    online: `bool`, optional (default: False)
        kneed will correct old knee points if True,
        kneed will return first knee if False.
    **kwargs: `dict`, optional
        Extra arguments to KneeLocator.

    Returns
    -------
    None. Updates `adata` with the following fields:

    `.uns['pca']['features']` : `dict`
        Maps `'pc_<i>'` to the list of feature indices selected for the
        i-th component, ordered by decreasing absolute loading.
    `.var['top_pcs']` : `bool`
        True for every feature selected by at least one component.

    Raises
    ------
    ValueError
        If the number of rows of `.uns['pca']['PCs']` differs from the
        number of features in `adata`.
    """
    loadings = adata.uns['pca']['PCs']
    n_pcs = adata.uns['pca']['n_pcs']
    n_features = loadings.shape[0]

    # Row indices of the loadings are used below as positions into
    # `adata.var_names`. If the two differ in length the positions would
    # silently point at the wrong features, so fail loudly instead.
    n_vars = len(adata.var_names)
    if n_features != n_vars:
        raise ValueError(
            f"`.uns['pca']['PCs']` has {n_features} rows but `adata` has "
            f"{n_vars} features; the loadings cannot be mapped back onto "
            "`adata.var`.")

    if min_elbow is None:
        min_elbow = n_features/6

    adata.uns['pca']['features'] = dict()
    ids_features = list()
    for i in range(n_pcs):
        # Rank the features once; the same ordering supplies both the
        # sorted curve for elbow detection and the selected indices.
        abs_loadings = np.abs(loadings[:, i])
        order = np.argsort(abs_loadings)[::-1]
        elbow = locate_elbow(range(n_features),
                             abs_loadings[order],
                             S=S,
                             min_elbow=min_elbow,
                             curve=curve,
                             direction=direction,
                             online=online,
                             **kwargs)
        ids_features_i = list(order[:elbow])
        adata.uns['pca']['features'][f'pc_{i}'] = ids_features_i
        ids_features.extend(ids_features_i)
        print(f'#features selected from PC {i}: {len(ids_features_i)}')

    adata.var['top_pcs'] = False
    # An explicit integer dtype keeps the index valid when nothing was
    # selected (an empty list would otherwise become a float array).
    selected = np.unique(np.asarray(ids_features, dtype=np.intp))
    adata.var.loc[adata.var_names[selected], 'top_pcs'] = True
    print(f'#features in total: {adata.var["top_pcs"].sum()}')