"""Principal component analysis"""
import numpy as np
from sklearn.decomposition import TruncatedSVD
from ._utils import (
locate_elbow,
)
[docs]
def pca(adata,
n_components=50,
algorithm='randomized',
n_iter=5,
random_state=2021,
tol=0.0,
feature=None,
**kwargs,
):
"""perform Principal Component Analysis (PCA)
Parameters
----------
adata: AnnData
Annotated data matrix.
n_components: `int`, optional (default: 50)
Desired dimensionality of output data
algorithm: `str`, optional (default: 'randomized')
SVD solver to use. Choose from {'arpack', 'randomized'}.
n_iter: `int`, optional (default: '5')
Number of iterations for randomized SVD solver.
Not used by ARPACK.
tol: `float`, optional (default: 0)
Tolerance for ARPACK. 0 means machine precision.
Ignored by randomized SVD solver.
feature: `str`, optional (default: None)
Feature used to perform PCA.
The data type of `.var[feature]` needs to be `bool`
If None, adata.X will be used.
kwargs:
Other keyword arguments are passed down to `TruncatedSVD()`
Returns
-------
updates `adata` with the following fields:
`.obsm['X_pca']` : `array`
PCA transformed X.
`.uns['pca']['PCs']` : `array`
Principal components in feature space,
representing the directions of maximum variance in the data.
`.uns['pca']['variance']` : `array`
The variance of the training samples transformed by a
projection to each component.
`.uns['pca']['variance_ratio']` : `array`
Percentage of variance explained by each of the selected components.
"""
if feature is None:
X = adata.X.copy()
else:
mask = adata.var[feature]
X = adata[:, mask].X.copy()
svd = TruncatedSVD(n_components=n_components,
algorithm=algorithm,
n_iter=n_iter,
random_state=random_state,
tol=tol,
**kwargs)
svd.fit(X)
adata.obsm['X_pca'] = svd.transform(X)
adata.uns['pca'] = dict()
adata.uns['pca']['n_pcs'] = n_components
adata.uns['pca']['PCs'] = svd.components_.T
adata.uns['pca']['variance'] = svd.explained_variance_
adata.uns['pca']['variance_ratio'] = svd.explained_variance_ratio_
[docs]
def select_pcs(adata,
n_pcs=None,
S=1,
curve='convex',
direction='decreasing',
online=False,
min_elbow=None,
**kwargs):
"""select top PCs based on variance_ratio
Parameters
----------
n_pcs: `int`, optional (default: None)
If n_pcs is None,
the number of PCs will be automatically selected with "`kneed
<https://kneed.readthedocs.io/>`__"
S : `float`, optional (default: 1)
Sensitivity
min_elbow: `int`, optional (default: None)
The minimum elbow location
By default, it is n_components/10
curve: `str`, optional (default: 'convex')
Choose from {'convex','concave'}
If 'concave', algorithm will detect knees,
If 'convex', algorithm will detect elbows.
direction: `str`, optional (default: 'decreasing')
Choose from {'decreasing','increasing'}
online: `bool`, optional (default: False)
kneed will correct old knee points if True,
kneed will return first knee if False.
**kwargs: `dict`, optional
Extra arguments to KneeLocator.
Returns
"""
if n_pcs is None:
n_components = adata.obsm['X_pca'].shape[1]
if min_elbow is None:
min_elbow = n_components/10
n_pcs = locate_elbow(range(n_components),
adata.uns['pca']['variance_ratio'],
S=S,
curve=curve,
min_elbow=min_elbow,
direction=direction,
online=online,
**kwargs)
adata.uns['pca']['n_pcs'] = n_pcs
else:
adata.uns['pca']['n_pcs'] = n_pcs
[docs]
def select_pcs_features(adata,
S=1,
curve='convex',
direction='decreasing',
online=False,
min_elbow=None,
**kwargs):
"""select features that contribute to the top PCs
Parameters
----------
S : `float`, optional (default: 10)
Sensitivity
min_elbow: `int`, optional (default: None)
The minimum elbow location.
By default, it is #features/6
curve: `str`, optional (default: 'convex')
Choose from {'convex','concave'}
If 'concave', algorithm will detect knees,
If 'convex', algorithm will detect elbows.
direction: `str`, optional (default: 'decreasing')
Choose from {'decreasing','increasing'}
online: `bool`, optional (default: False)
kneed will correct old knee points if True,
kneed will return first knee if False.
**kwargs: `dict`, optional
Extra arguments to KneeLocator.
Returns
-------
"""
n_pcs = adata.uns['pca']['n_pcs']
n_features = adata.uns['pca']['PCs'].shape[0]
if min_elbow is None:
min_elbow = n_features/6
adata.uns['pca']['features'] = dict()
ids_features = list()
for i in range(n_pcs):
elbow = locate_elbow(range(n_features),
np.sort(
np.abs(adata.uns['pca']['PCs'][:, i],))[::-1],
S=S,
min_elbow=min_elbow,
curve=curve,
direction=direction,
online=online,
**kwargs)
ids_features_i = \
list(np.argsort(np.abs(
adata.uns['pca']['PCs'][:, i],))[::-1][:elbow])
adata.uns['pca']['features'][f'pc_{i}'] = ids_features_i
ids_features = ids_features + ids_features_i
print(f'#features selected from PC {i}: {len(ids_features_i)}')
adata.var['top_pcs'] = False
adata.var.loc[adata.var_names[np.unique(ids_features)], 'top_pcs'] = True
print(f'#features in total: {adata.var["top_pcs"].sum()}')