Source code for simba.tools._post_training

"""Functions and classes for the analysis after PBG training"""

import numpy as np
import pandas as pd
import anndata as ad
from scipy.stats import entropy
from sklearn.neighbors import KDTree
from scipy.spatial import distance
# import faiss

from ._utils import _gini



[docs]
def softmax(adata_ref,
            adata_query,
            T=0.5,
            n_top=None,
            percentile=0):
    """Softmax-based transformation

    This will transform query data to reference-comparable data

    Parameters
    ----------
    adata_ref: `AnnData`
        Reference anndata.
    adata_query: `list`
        Query anndata objects
    T: `float`
        Temperature parameter.
        It controls the output probability distribution.
        When T goes to inf, it becomes a discrete uniform distribution,
        each query becomes the average of reference;
        When T goes to zero, softargmax converges to arg max,
        each query is approximately the best of reference.
    n_top: `float` (default: None)
        The number of top reference entities to include when transforming
        entities in query data.
        It is used to filter out low-probability reference entities.
        All reference entities used by default.
    percentile: `int` (default: 0)
        Percentile (0-100) of reference entities to exclude when transforming
        entities in query data.
        Only valid when `n_top` is not specified.
        It is used to filter out low-probability reference entities.
        All reference entities used by default.

    Returns
    -------
    updates `adata_query` with the following field.
    softmax: `array_like` (`.layers['softmax']`)
        Store #observations × #dimensions softmax transformed data matrix.
    """

    scores_ref_query = np.matmul(adata_ref.X, adata_query.X.T)
    # avoid overflow encountered
    scores_ref_query = scores_ref_query - scores_ref_query.max()
    scores_softmax = np.exp(scores_ref_query/T) / \
        (np.exp(scores_ref_query/T).sum(axis=0))[None, :]
    if n_top is None:
        thresh = np.percentile(scores_softmax, q=percentile, axis=0)
    else:
        thresh = (np.sort(scores_softmax, axis=0)[::-1, :])[n_top-1, ]
    mask = scores_softmax < thresh[None, :]
    scores_softmax[mask] = 0
    # rescale to make scores add up to 1
    scores_softmax = scores_softmax/scores_softmax.sum(axis=0, keepdims=1)
    X_query = np.dot(scores_softmax.T, adata_ref.X)
    adata_query.layers['softmax'] = X_query



class SimbaEmbed:
    """A class used to represent post-training embedding analyis

    Attributes
    ----------

    Methods
    -------

    """

    def __init__(self,
                 adata_ref,
                 list_adata_query,
                 T=0.5,
                 list_T=None,
                 percentile=50,
                 n_top=None,
                 list_percentile=None,
                 use_precomputed=True,
                 ):
        """
        Parameters
        ----------
        adata_ref: `AnnData`
            Reference anndata.
        list_adata_query: `list`
            A list query anndata objects
        T: `float`
            Temperature parameter shared by all query adata objects.
            It controls the output probability distribution.
            when T goes to inf, it becomes a discrete uniform distribution,
            each query becomes the average of reference;
            when T goes to zero, softargmax converges to arg max,
            each query is approximately the best of reference.
        list_T: `list`, (default: None)
            A list of temperature parameters.
            It should correspond to each of query data.
            Once it's specified, it will override `T`.
        n_top: `float` (default: None)
            The number of top reference entities to include
            when transforming entities in query data.
            It is used to filter out low-probability reference entities.
            All reference entities used by default.
        percentile: `int` (default: 0)
            Percentile (0-100) of reference entities to exclude
            when transforming entities in query data.
            It is used to filter out low-probability reference entities.
            All reference entities used by default.
        list_percentile: `list`, (default: None)
            A list of percentile parameters.
            It corresponds to each of query data.
            Once it's specified, it will override `percentile`.
        """
        assert isinstance(list_adata_query, list), \
            "`list_adata_query` must be list"
        if list_T is not None:
            assert isinstance(list_T, list), \
                "`list_T` must be list"
        self.adata_ref = adata_ref
        self.list_adata_query = list_adata_query
        self.T = T
        self.list_T = list_T
        self.percentile = percentile
        self.n_top = n_top
        self.list_percentile = list_percentile
        self.use_precomputed = use_precomputed

    def embed(self):
        """Embed a list of query datasets along with reference dataset
        into the same space

        Returns
        -------
        adata_all: `AnnData`
            Store #entities × #dimensions.
        """
        adata_ref = self.adata_ref
        list_adata_query = self.list_adata_query
        use_precomputed = self.use_precomputed
        T = self.T
        list_T = self.list_T
        n_top = self.n_top
        percentile = self.percentile
        list_percentile = self.list_percentile
        X_all = adata_ref.X.copy()
        # obs_all = pd.DataFrame(
        #     data=['ref']*adata_ref.shape[0],
        #     index=adata_ref.obs.index,
        #     columns=['id_dataset'])
        obs_all = adata_ref.obs.copy()
        obs_all['id_dataset'] = ['ref']*adata_ref.shape[0]
        for i, adata_query in enumerate(list_adata_query):
            if list_T is not None:
                param_T = list_T[i]
            else:
                param_T = T
            if list_percentile is not None:
                param_percentile = list_percentile[i]
            else:
                param_percentile = percentile
            if use_precomputed:
                if 'softmax' in adata_query.layers.keys():
                    print(f'Reading in precomputed softmax-transformed matrix '
                          f'for query data {i};')
                else:
                    print(f'No softmax-transformed matrix exists '
                          f'for query data {i}')
                    print("Performing softmax transformation;")
                    softmax(
                        adata_ref,
                        adata_query,
                        T=param_T,
                        percentile=param_percentile,
                        n_top=n_top,
                    )
            else:
                print(f'Performing softmax transformation '
                      f'for query data {i};')
                softmax(
                    adata_ref,
                    adata_query,
                    T=param_T,
                    percentile=param_percentile,
                    n_top=n_top,
                    )
            X_all = np.vstack((X_all, adata_query.layers['softmax']))
            # obs_all = obs_all.append(
            #     pd.DataFrame(
            #         data=[f'query_{i}']*adata_query.shape[0],
            #         index=adata_query.obs.index,
            #         columns=['id_dataset'])
            #         )
            obs_query = adata_query.obs.copy()
            obs_query['id_dataset'] = [f'query_{i}']*adata_query.shape[0]
            obs_all = pd.concat(
                [obs_all, obs_query], ignore_index=False)
        adata_all = ad.AnnData(X=X_all,
                               obs=obs_all)
        return adata_all



[docs]
def embed(adata_ref,
          list_adata_query,
          T=0.5,
          list_T=None,
          percentile=0,
          n_top=None,
          list_percentile=None,
          use_precomputed=False):
    """Embed a list of query datasets along with reference dataset
    into the same space

    Parameters
    ----------
    adata_ref: `AnnData`
        Reference anndata.
    list_adata_query: `list`
        A list query anndata objects
    T: `float`
        Temperature parameter shared by all query adata objects.
        It controls the output probability distribution.
        when T goes to inf, it becomes a discrete uniform distribution,
        each query becomes the average of reference;
        when T goes to zero, softargmax converges to arg max,
        each query is approximately the best of reference.
    list_T: `list`, (default: None)
        A list of temperature parameters.
        It corresponds to each of query data.
        Once it's specified, it will override `T`.
    n_top: `float` (default: None)
        The number of top reference entities to include when transforming
        entities in query data.
        It is used to filter out low-probability reference entities.
        All reference entities used by default.
    percentile: `int` (default: 0)
        Percentile (0-100) of reference entities to exclude when transforming
        entities in query data.
        It is used to filter out low-probability reference entities.
        All reference entities used by default.
    list_percentile: `list`, (default: None)
        A list of percentile parameters.
        It corresponds to each of query data.
        Once it's specified, it will override `percentile`.

    Returns
    -------
    adata_all: `AnnData`
        Store #entities × #dimensions.
    updates `adata_query` with the following field.
    softmax: `array_like` (`.layers['softmax']`)
        Store #observations × #dimensions softmax transformed data matrix.
    """
    SE = SimbaEmbed(adata_ref,
                    list_adata_query,
                    T=T,
                    list_T=list_T,
                    percentile=percentile,
                    n_top=n_top,
                    list_percentile=list_percentile,
                    use_precomputed=use_precomputed)
    adata_all = SE.embed()
    return adata_all




[docs]
def compare_entities(adata_ref,
                     adata_query,
                     n_top=50,
                     T=1):
    """Compare the embeddings of two entities by calculating

    the following values between reference and query entities:

    - dot product
    - normalized dot product
    - softmax probability

    and the following metrics for each query entity:

    - max (The average maximum dot product of top-rank reference entities,
      based on normalized dot product)
    - std (standard deviation of reference entities,
      based on dot product)
    - gini (Gini coefficients of reference entities,
      based on softmax probability)
    - entropy (The entropy of reference entities,
      based on softmax probability)

    Parameters
    ----------
    adata_ref: `AnnData`
        Reference entity anndata.
    adata_query: `list`
        Query entity anndata.
    n_top: `int`, optional (default: 50)
        The number of reference entities to consider
        when calculating the metric 'max'
    T: `float`
        Temperature parameter for softmax.
        It controls the output probability distribution.
        When T goes to inf, it becomes a discrete uniform distribution,
        each query becomes the average of reference;
        When T goes to zero, softargmax converges to arg max,
        each query is approximately the best of reference.

    Returns
    -------
    adata_cmp: `AnnData`
        Store reference entity as observations and query entity as variables
    """
    X_ref = adata_ref.X
    X_query = adata_query.X
    X_cmp = np.matmul(X_ref, X_query.T)
    adata_cmp = ad.AnnData(X=X_cmp,
                           obs=adata_ref.obs,
                           var=adata_query.obs)
    adata_cmp.layers['norm'] = X_cmp \
        - np.log(np.exp(X_cmp).mean(axis=0)).reshape(1, -1)
    adata_cmp.layers['softmax'] = np.exp(X_cmp/T) \
        / np.exp(X_cmp/T).sum(axis=0).reshape(1, -1)
    adata_cmp.var['max'] = \
        np.clip(np.sort(adata_cmp.layers['norm'], axis=0)[-n_top:, ],
                a_min=0,
                a_max=None).mean(axis=0)
    adata_cmp.var['std'] = np.std(X_cmp, axis=0, ddof=1)
    adata_cmp.var['gini'] = np.array([_gini(adata_cmp.layers['softmax'][:, i])
                                      for i in np.arange(X_cmp.shape[1])])
    adata_cmp.var['entropy'] = entropy(adata_cmp.layers['softmax'])
    return adata_cmp




[docs]
def query(adata,
          obsm='X_umap',
          layer=None,
          metric='euclidean',
          anno_filter=None,
          filters=None,
          entity=None,
          pin=None,
          k=20,
          use_radius=False,
          r=None,
          **kwargs
          ):
    """Query the "database" of entites

    Parameters
    ----------
    adata : `AnnData`
        Anndata object to query.
    obsm : `str`, optional (default: "X_umap")
        The multi-dimensional annotation to use for calculating the distance.
    layer : `str`, optional (default: None)
        The layer to use for calculating the distance.
    metric : `str`, optional (default: "euclidean")
        The distance metric to use.
        More metrics can be found at "`DistanceMetric class
        <https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.DistanceMetric.html>`__"
    anno_filter : `str`, optional (default: None)
        The annotation of filter to use.
        It should be one of ``adata.obs_keys()``
    filters : `list`, optional (default: None)
        The filters to use.
        It should be a list of values in ``adata.obs[anno_filter]``
    entity : `list`, optional (default: None)
        Query entity. It needs to be in ``adata.obs_names()``
    k : `int`, optional (default: 20)
        The number of nearest neighbors to return.
        Only valid if ``use_radius`` is False
    use_radius : `bool`, optional (default: False)
        If True, query for neighbors within a given radius
    r: `float`, optional (default: None)
        Distance within which neighbors are returned.
        If None, it will be estimated based the range of the space.
    **kwargs: `dict`, optional
        Extra arguments to ``sklearn.neighbors.KDTree``.

    Returns
    -------
    updates `adata` with the following fields.

    params: `dict`, (`adata.uns['query']['params']`)
        Parameters used for the query
    output: `pandas.DataFrame`, (`adata.uns['query']['output']`)
        Query result.
    """
    if sum(list(map(lambda x: x is None,
                    [entity, pin]))) == 2:
        raise ValueError("One of `entity` and `pin` must be specified")
    if sum(list(map(lambda x: x is not None,
                    [entity, pin]))) == 2:
        print("`entity` will be ignored.")
    if entity is not None:
        entity = np.array(entity).flatten()

    if sum(list(map(lambda x: x is not None,
                    [layer, obsm]))) == 2:
        raise ValueError("Only one of `layer` and `obsm` can be used")
    elif obsm is not None:
        X = adata.obsm[obsm].copy()
        if pin is None:
            pin = adata[entity, :].obsm[obsm].copy()
    elif layer is not None:
        X = adata.layers[layer].copy()
        if pin is None:
            pin = adata[entity, :].layers[layer].copy()
    else:
        X = adata.X.copy()
        if pin is None:
            pin = adata[entity, :].X.copy()
    pin = np.reshape(np.array(pin), [-1, X.shape[1]])

    if use_radius:
        kdt = KDTree(X, metric=metric, **kwargs)
        if r is None:
            r = np.mean(X.max(axis=0) - X.min(axis=0))/5
        ind, dist = kdt.query_radius(pin,
                                     r=r,
                                     sort_results=True,
                                     return_distance=True)
        df_output = pd.DataFrame()
        for ii in np.arange(pin.shape[0]):
            df_output_ii = adata.obs.iloc[ind[ii], ].copy()
            df_output_ii['distance'] = dist[ii]
            if entity is not None:
                df_output_ii['query'] = entity[ii]
            else:
                df_output_ii['query'] = ii
            df_output = pd.concat(
                [df_output, df_output_ii], ignore_index=False)
        if anno_filter is not None:
            if anno_filter in adata.obs_keys():
                if filters is None:
                    filters = df_output[anno_filter].unique().tolist()
                df_output.query(f'{anno_filter} == @filters', inplace=True)
            else:
                raise ValueError(f'could not find {anno_filter}')
        df_output = df_output.sort_values(by='distance')
    else:
        # assert (metric in ['euclidean', 'dot_product']),\
        #             "`metric` must be one of ['euclidean','dot_product']"
        if anno_filter is not None:
            if anno_filter in adata.obs_keys():
                if filters is None:
                    filters = adata.obs[anno_filter].unique().tolist()
                ids_filters = \
                    np.where(np.isin(adata.obs[anno_filter], filters))[0]
            else:
                raise ValueError(f'could not find {anno_filter}')
        else:
            ids_filters = np.arange(X.shape[0])
        kdt = KDTree(X[ids_filters, :], metric=metric, **kwargs)
        dist, ind = kdt.query(pin,
                              k=k,
                              sort_results=True,
                              return_distance=True)

        # use faiss
        # X = X.astype('float32')
        # if metric == 'euclidean':
        #     faiss_index = faiss.IndexFlatL2(X.shape[1])   # build the index
        #     faiss_index.add(X[ids_filters, :])
        #     dist, ind = faiss_index.search(pin, k)
        # if metric == 'dot_product':
        #     faiss_index = faiss.IndexFlatIP(X.shape[1])   # build the index
        #     faiss_index.add(X[ids_filters, :])
        #     sim, ind = faiss_index.search(pin, k)
        #     dist = -sim

        df_output = pd.DataFrame()
        for ii in np.arange(pin.shape[0]):
            df_output_ii = \
                adata.obs.iloc[ids_filters, ].iloc[ind[ii, ], ].copy()
            df_output_ii['distance'] = dist[ii, ]
            if entity is not None:
                df_output_ii['query'] = entity[ii]
            else:
                df_output_ii['query'] = ii
            df_output = pd.concat(
                [df_output, df_output_ii], ignore_index=False)
        df_output = df_output.sort_values(by='distance')

    adata.uns['query'] = dict()
    adata.uns['query']['params'] = {'obsm': obsm,
                                    'layer': layer,
                                    'entity': entity,
                                    'pin': pin,
                                    'k': k,
                                    'use_radius': use_radius,
                                    'r': r}
    adata.uns['query']['output'] = df_output.copy()
    return df_output




[docs]
def find_master_regulators(adata_all,
                           list_tf_motif=None,
                           list_tf_gene=None,
                           metric='euclidean',
                           anno_filter='entity_anno',
                           filter_gene='gene',
                           metrics_gene=None,
                           metrics_motif=None,
                           cutoff_gene_max=1.5,
                           cutoff_gene_gini=0.3,
                           cutoff_gene_std=None,
                           cutoff_gene_entropy=None,
                           cutoff_motif_max=1.5,
                           cutoff_motif_gini=0.3,
                           cutoff_motif_std=None,
                           cutoff_motif_entropy=None,
                           ):
    """Find all the master regulators

    Parameters
    ----------
    adata_all : `AnnData`
        Anndata object storing SIMBA embedding of all entities.
    list_tf_motif : `list`
        A list of TF motifs. They should match TF motifs in `list_tf_gene`.
    list_tf_gene : `list`
        A list TF genes. They should match TF motifs in `list_tf_motif`.
    metric : `str`, optional (default: "euclidean")
        The distance metric to use. It can be ‘braycurtis’, ‘canberra’,
        ‘chebyshev’, ‘cityblock’, ‘correlation’, ‘cosine’, ‘dice’, ‘euclidean’,
        ‘hamming’, ‘jaccard’, ‘jensenshannon’, ‘kulsinski’, ‘mahalanobis’,
        ‘matching’, ‘minkowski’, ‘rogerstanimoto’, ‘russellrao’, ‘seuclidean’,
        ‘sokalmichener’, ‘sokalsneath’, ‘sqeuclidean’, ‘wminkowski’, ‘yule’.
    anno_filter : `str`, optional (default: None)
        The annotation of filter to use.
        It should be one of ``adata.obs_keys()``
    filter_gene : `str`, optional (default: None)
        The filter for gene.
        It should be in ``adata.obs[anno_filter]``
    metrics_gene : `pandas.DataFrame`, optional (default: None)
        SIMBA metrics for genes.
    metrics_motif : `pandas.DataFrame`, optional (default: None)
        SIMBA metrics for motifs.
    cutoff_gene_max, cutoff_motif_max: `float`
        cutoff of SIMBA metric `max value` for genes and motifs
    cutoff_gene_gini,  cutoff_motif_gini: `float`
        cutoff of SIMBA metric `Gini index` for genes and motifs
    cutoff_gene_gini,  cutoff_motif_gini: `float`
        cutoff of SIMBA metric `Gini index` for genes and motifs
    cutoff_gene_std,  cutoff_motif_std: `float`
        cutoff of SIMBA metric `standard deviation` for genes and motifs
    cutoff_gene_entropy,  cutoff_motif_entropy: `float`
        cutoff of SIMBA metric `entropy` for genes and motifs

    Returns
    -------
    df_MR: `pandas.DataFrame`
        Dataframe of master regulators
    """
    if sum(list(map(lambda x: x is None,
                    [list_tf_motif, list_tf_gene]))) > 0:
        return "Please specify both `list_tf_motif` and `list_tf_gene`"

    assert isinstance(list_tf_motif, list), \
        "`list_tf_motif` must be list"
    assert isinstance(list_tf_gene, list), \
        "`list_tf_gene` must be list"
    assert len(list_tf_motif) == len(list_tf_gene), \
        "`list_tf_motif` and `list_tf_gene` must have the same length"
    assert len(list_tf_motif) == len(set(list_tf_motif)), \
        "Duplicates are found in `list_tf_motif`"

    genes = adata_all[adata_all.obs[anno_filter] == filter_gene].\
        obs_names.tolist().copy()
    # Master Regulator
    df_MR = pd.DataFrame(list(zip(list_tf_motif, list_tf_gene)),
                         columns=['motif', 'gene'])

    if metrics_motif is not None:
        print('Adding motif metrics ...')
        assert isinstance(metrics_motif, pd.DataFrame), \
            "`metrics_motif` must be pd.DataFrame"
        df_metrics_motif = metrics_motif.loc[list_tf_motif, ].copy()
        df_metrics_motif.columns = df_metrics_motif.columns + '_motif'
        df_MR = df_MR.merge(df_metrics_motif,
                            how='left',
                            left_on='motif',
                            right_index=True)

    if metrics_gene is not None:
        print('Adding gene metrics ...')
        assert isinstance(metrics_gene, pd.DataFrame), \
            "`metrics_gene` must be pd.DataFrame"
        df_metrics_gene = metrics_gene.loc[list_tf_gene, ].copy()
        df_metrics_gene.index = list_tf_motif  # avoid duplicate genes
        df_metrics_gene.columns = df_metrics_gene.columns + '_gene'
        df_MR = df_MR.merge(df_metrics_gene,
                            how='left',
                            left_on='motif',
                            right_index=True)
    print('Computing distances between TF motifs and genes ...')
    dist_MG = distance.cdist(adata_all[df_MR['motif'], ].X,
                             adata_all[genes, ].X,
                             metric=metric)
    dist_MG = pd.DataFrame(dist_MG,
                           index=df_MR['motif'].tolist(),
                           columns=genes)
    df_MR.insert(2, 'rank', -1)
    df_MR.insert(3, 'dist', -1)
    for i in np.arange(df_MR.shape[0]):
        x_motif = df_MR['motif'].iloc[i]
        x_gene = df_MR['gene'].iloc[i]
        df_MR.loc[i, 'rank'] = dist_MG.loc[x_motif, ].rank()[x_gene]
        df_MR.loc[i, 'dist'] = dist_MG.loc[x_motif, x_gene]

    if metrics_gene is not None:
        print('filtering master regulators based on gene metrics:')
        if cutoff_gene_entropy is not None:
            print('entropy')
            df_MR = df_MR[df_MR['entropy_gene'] > cutoff_gene_entropy]
        if cutoff_gene_gini is not None:
            print('Gini index')
            df_MR = df_MR[df_MR['gini_gene'] > cutoff_gene_gini]
        if cutoff_gene_max is not None:
            print('max')
            df_MR = df_MR[df_MR['max_gene'] > cutoff_gene_max]
        if cutoff_gene_std is not None:
            print('standard deviation')
            df_MR = df_MR[df_MR['std_gene'] > cutoff_gene_std]
    if metrics_motif is not None:
        print('filtering master regulators based on motif metrics:')
        if cutoff_motif_entropy is not None:
            print('entropy')
            df_MR = df_MR[df_MR['entropy_motif'] > cutoff_motif_entropy]
        if cutoff_motif_gini is not None:
            print('Gini index')
            df_MR = df_MR[df_MR['gini_motif'] > cutoff_motif_gini]
        if cutoff_motif_max is not None:
            print('max')
            df_MR = df_MR[df_MR['max_motif'] > cutoff_motif_max]
        if cutoff_motif_std is not None:
            print('standard deviation')
            df_MR = df_MR[df_MR['std_motif'] > cutoff_motif_std]
    df_MR = df_MR.sort_values(by='rank', ignore_index=True)
    return df_MR




[docs]
def find_target_genes(adata_all,
                      adata_PM,
                      list_tf_motif=None,
                      list_tf_gene=None,
                      adata_CP=None,
                      metric='euclidean',
                      anno_filter='entity_anno',
                      filter_peak='peak',
                      filter_gene='gene',
                      n_genes=200,
                      cutoff_gene=None,
                      cutoff_peak=1000,
                      use_precomputed=True,
                      ):
    """For a given TF, infer its target genes

    Parameters
    ----------
    adata_all : `AnnData`
        Anndata object storing SIMBA embedding of all entities.
    adata_PM : `AnnData`
        Peaks-by-motifs anndata object.
    list_tf_motif : `list`
        A list of TF motifs. They should match TF motifs in `list_tf_gene`.
    list_tf_gene : `list`
        A list TF genes. They should match TF motifs in `list_tf_motif`.
    adata_CP : `AnnData`, optional (default: None)
        When ``use_precomputed`` is True, it can be set None
    metric : `str`, optional (default: "euclidean")
        The distance metric to use. It can be ‘braycurtis’, ‘canberra’,
        ‘chebyshev’, ‘cityblock’, ‘correlation’, ‘cosine’, ‘dice’, ‘euclidean’,
        ‘hamming’, ‘jaccard’, ‘jensenshannon’, ‘kulsinski’, ‘mahalanobis’,
        ‘matching’, ‘minkowski’, ‘rogerstanimoto’, ‘russellrao’, ‘seuclidean’,
        ‘sokalmichener’, ‘sokalsneath’, ‘sqeuclidean’, ‘wminkowski’, ‘yule’.
    anno_filter : `str`, optional (default: None)
        The annotation of filter to use.
        It should be one of ``adata.obs_keys()``
    filter_gene : `str`, optional (default: None)
        The filter for gene.
        It should be in ``adata.obs[anno_filter]``
    filter_peak : `str`, optional (default: None)
        The filter for peak.
        It should be in ``adata.obs[anno_filter]``
    n_genes : `int`, optional (default: 200)
        The number of neighbor genes to consider initially
        around TF gene or TF motif
    cutoff_gene : `float`, optional (default: None)
        Cutoff of "average_rank"
    cutoff_peak : `int`, optional (default: 1000)
        Cutoff for peaks-associated ranks, including
        "rank_peak_to_gene" and "rank_peak_to_TFmotif".
    use_precomputed : `bool`, optional (default: True)
        Distances calculated between genes, peaks, and motifs
        (stored in `adata.uns['tf_targets']`) will be imported

    Returns
    -------
    dict_tf_targets : `dict`
        Target genes for each TF.

    updates `adata` with the following fields.

    tf_targets: `dict`, (`adata.uns['tf_targets']`)
        Distances calculated between genes, peaks, and motifs
    """
    if sum(list(map(lambda x: x is None,
                    [list_tf_motif, list_tf_gene]))) > 0:
        return "Please specify both `list_tf_motif` and `list_tf_gene`"

    assert isinstance(list_tf_motif, list), \
        "`list_tf_motif` must be list"
    assert isinstance(list_tf_gene, list), \
        "`list_tf_gene` must be list"
    assert len(list_tf_motif) == len(list_tf_gene), \
        "`list_tf_motif` and `list_tf_gene` must have the same length"

    def isin(a, b):
        return np.array([item in b for item in a])

    print('Preprocessing ...')
    if use_precomputed and 'tf_targets' in adata_all.uns_keys():
        print('importing precomputed variables ...')
        genes = adata_all.uns['tf_targets']['genes']
        peaks = adata_all.uns['tf_targets']['peaks']
        peaks_in_genes = adata_all.uns['tf_targets']['peaks_in_genes']
        dist_PG = adata_all.uns['tf_targets']['dist_PG']
        overlap_PG = adata_all.uns['tf_targets']['overlap']
    else:
        assert (adata_CP is not None), \
            '`adata_CP` needs to be specified '\
            'when no precomputed variable is stored'
        if 'gene_scores' not in adata_CP.uns_keys():
            print('Please run "si.tl.gene_scores(adata_CP)" first.')
        else:
            overlap_PG = adata_CP.uns['gene_scores']['overlap'].copy()
            overlap_PG['peak'] = \
                overlap_PG[['chr_p', 'start_p', 'end_p']].apply(
                    lambda row: '_'.join(row.values.astype(str)), axis=1)
            tuples = list(zip(overlap_PG['symbol_g'], overlap_PG['peak']))
            multi_indices = pd.MultiIndex.from_tuples(
                tuples, names=["gene", "peak"])
            overlap_PG.index = multi_indices

        genes = adata_all[adata_all.obs[anno_filter] == filter_gene].\
            obs_names.tolist().copy()
        peaks = adata_all[adata_all.obs[anno_filter] == filter_peak].\
            obs_names.tolist().copy()
        peaks_in_genes = list(set(overlap_PG['peak']))

        print(f'#genes: {len(genes)}')
        print(f'#peaks: {len(peaks)}')
        print(f'#genes-associated peaks: {len(peaks_in_genes)}')
        print('computing distances between genes '
              'and genes-associated peaks ...')
        dist_PG = distance.cdist(
            adata_all[peaks_in_genes, ].X,
            adata_all[genes, ].X,
            metric=metric,
            )
        dist_PG = pd.DataFrame(dist_PG, index=peaks_in_genes, columns=[genes])
        print("Saving variables into `.uns['tf_targets']` ...")
        adata_all.uns['tf_targets'] = dict()
        adata_all.uns['tf_targets']['overlap'] = overlap_PG
        adata_all.uns['tf_targets']['dist_PG'] = dist_PG
        adata_all.uns['tf_targets']['peaks_in_genes'] = peaks_in_genes
        adata_all.uns['tf_targets']['genes'] = genes
        adata_all.uns['tf_targets']['peaks'] = peaks
        adata_all.uns['tf_targets']['peaks_in_genes'] = peaks_in_genes

    dict_tf_targets = dict()
    for tf_motif, tf_gene in zip(list_tf_motif, list_tf_gene):

        print(f'searching for target genes of {tf_motif}')
        motif_peaks = adata_PM.obs_names[adata_PM[:, tf_motif].X.nonzero()[0]]
        motif_genes = list(
            set(overlap_PG[isin(overlap_PG['peak'], motif_peaks)]['symbol_g'])
            .intersection(genes))

        # rank of the distances from genes to tf_motif
        dist_GM_motif = distance.cdist(adata_all[genes, ].X,
                                       adata_all[tf_motif, ].X,
                                       metric=metric)
        dist_GM_motif = pd.DataFrame(dist_GM_motif,
                                     index=genes,
                                     columns=[tf_motif])
        rank_GM_motif = dist_GM_motif.rank(axis=0)

        # rank of the distances from genes to tf_gene
        dist_GG_motif = distance.cdist(adata_all[genes, ].X,
                                       adata_all[tf_gene, ].X,
                                       metric=metric)
        dist_GG_motif = pd.DataFrame(dist_GG_motif,
                                     index=genes,
                                     columns=[tf_gene])
        rank_GG_motif = dist_GG_motif.rank(axis=0)

        # rank of the distances from peaks to tf_motif
        dist_PM_motif = distance.cdist(
            adata_all[peaks_in_genes, ].X,
            adata_all[tf_motif, ].X,
            metric=metric)
        dist_PM_motif = pd.DataFrame(dist_PM_motif,
                                     index=peaks_in_genes,
                                     columns=[tf_motif])
        rank_PM_motif = dist_PM_motif.rank(axis=0)

        # rank of the distances from peaks to candidate genes
        cand_genes = \
            dist_GG_motif[tf_gene].nsmallest(n_genes).index.tolist()\
            + dist_GM_motif[tf_motif].nsmallest(n_genes).index.tolist()
        print(f'#candinate genes is {len(cand_genes)}')
        print('removing duplicate genes ...')
        print('removing genes that do not contain TF motif ...')
        cand_genes = list(set(cand_genes).intersection(set(motif_genes)))
        print(f'#candinate genes is {len(cand_genes)}')
        dist_PG_motif = distance.cdist(
            adata_all[peaks_in_genes, ].X,
            adata_all[cand_genes, ].X,
            metric=metric
            )
        dist_PG_motif = pd.DataFrame(dist_PG_motif,
                                     index=peaks_in_genes,
                                     columns=cand_genes)
        rank_PG_motif = dist_PG_motif.rank(axis=0)

        df_tf_targets = pd.DataFrame(index=cand_genes)
        df_tf_targets['average_rank'] = -1
        df_tf_targets['has_motif'] = 'no'
        df_tf_targets['rank_gene_to_TFmotif'] = -1
        df_tf_targets['rank_gene_to_TFgene'] = -1
        df_tf_targets['rank_peak_to_TFmotif'] = -1
        df_tf_targets['rank_peak2_to_TFmotif'] = -1
        df_tf_targets['rank_peak_to_gene'] = -1
        df_tf_targets['rank_peak2_to_gene'] = -1
        for i, g in enumerate(cand_genes):
            g_peaks = list(set(overlap_PG.loc[[g]]['peak']))
            g_motif_peaks = list(set(g_peaks).intersection(motif_peaks))
            if len(g_motif_peaks) > 0:
                df_tf_targets.loc[g, 'has_motif'] = 'yes'
                df_tf_targets.loc[g, 'rank_gene_to_TFmotif'] = \
                    rank_GM_motif[tf_motif][g]
                df_tf_targets.loc[g, 'rank_gene_to_TFgene'] = \
                    rank_GG_motif[tf_gene][g]
                df_tf_targets.loc[g, 'rank_peak_to_TFmotif'] = \
                    rank_PM_motif.loc[g_peaks, tf_motif].min()
                df_tf_targets.loc[g, 'rank_peak2_to_TFmotif'] = \
                    rank_PM_motif.loc[g_motif_peaks, tf_motif].min()
                df_tf_targets.loc[g, 'rank_peak_to_gene'] = \
                    rank_PG_motif.loc[g_peaks, g].min()
                df_tf_targets.loc[g, 'rank_peak2_to_gene'] = \
                    rank_PG_motif.loc[g_peaks, g].min()
            if i % int(len(cand_genes)/5) == 0:
                print(f'completed: {i/len(cand_genes):.1%}')
        df_tf_targets['average_rank'] = \
            df_tf_targets[['rank_gene_to_TFmotif',
                           'rank_gene_to_TFgene']].mean(axis=1)
        if cutoff_peak is not None:
            print('Pruning candidate genes based on nearby peaks ...')
            df_tf_targets = df_tf_targets[
                (df_tf_targets[[
                    'rank_peak_to_TFmotif',
                    'rank_peak_to_gene']]
                    < cutoff_peak).sum(axis=1) > 0]
        if cutoff_gene is not None:
            print('Pruning candidate genes based on average rank ...')
            df_tf_targets = df_tf_targets[
                df_tf_targets['average_rank'] < cutoff_gene]
        dict_tf_targets[tf_motif] = \
            df_tf_targets.sort_values(by='average_rank').copy()
    return dict_tf_targets