Source code for simba.tools._post_training

"""Functions and classes for the analysis after PBG training"""

import numpy as np
import pandas as pd
import anndata as ad
from scipy.stats import entropy
from sklearn.neighbors import KDTree
from scipy.spatial import distance
# import faiss  # optional nearest-neighbor backend; see commented block in `query`

from ._utils import _gini


def softmax(adata_ref, adata_query, T=0.5, n_top=None, percentile=0):
    """Softmax-based transformation

    This will transform query data to reference-comparable data.

    Parameters
    ----------
    adata_ref: `AnnData`
        Reference anndata.
    adata_query: `AnnData`
        Query anndata.
    T: `float`
        Temperature parameter.
        It controls the output probability distribution.
        When T goes to inf, the distribution becomes discrete uniform
        and each query becomes the average of the reference;
        when T goes to zero, softargmax converges to arg max
        and each query is approximately its best-matching reference.
    n_top: `int` (default: None)
        The number of top reference entities to include
        when transforming entities in query data.
        It is used to filter out low-probability reference entities.
        All reference entities are used by default.
    percentile: `int` (default: 0)
        Percentile (0-100) of reference entities to exclude
        when transforming entities in query data.
        Only valid when `n_top` is not specified.
        It is used to filter out low-probability reference entities.
        All reference entities are used by default.

    Returns
    -------
    updates `adata_query` with the following field.

    softmax: `array_like` (`.layers['softmax']`)
        Store #observations × #dimensions softmax-transformed data matrix.
    """
    scores_ref_query = np.matmul(adata_ref.X, adata_query.X.T)
    # subtract the global max to avoid overflow in np.exp
    scores_ref_query = scores_ref_query - scores_ref_query.max()
    scores_softmax = np.exp(scores_ref_query/T) / \
        (np.exp(scores_ref_query/T).sum(axis=0))[None, :]
    if n_top is None:
        thresh = np.percentile(scores_softmax, q=percentile, axis=0)
    else:
        thresh = (np.sort(scores_softmax, axis=0)[::-1, :])[n_top-1, ]
    # zero out low-probability reference entities
    mask = scores_softmax < thresh[None, :]
    scores_softmax[mask] = 0
    # rescale so the weights of each query entity add up to 1
    scores_softmax = scores_softmax/scores_softmax.sum(axis=0, keepdims=1)
    # each query entity becomes a weighted average of reference entities
    X_query = np.dot(scores_softmax.T, adata_ref.X)
    adata_query.layers['softmax'] = X_query
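# Example (illustrative sketch only; `adata_C` and `adata_G` are assumed to
# be cell and gene embedding AnnData objects loaded from a trained PBG
# model, and are not defined in this module):
#
#     softmax(adata_C, adata_G, T=0.5, percentile=50)
#     adata_G.layers['softmax']  # genes x dims, now comparable to cells
#
# Lowering `T` sharpens the weights so each gene lands near its
# best-matching cells; raising `T` flattens them toward the cell average.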
class SimbaEmbed:
    """A class used to represent post-training embedding analysis

    Attributes
    ----------

    Methods
    -------

    """

    def __init__(self,
                 adata_ref,
                 list_adata_query,
                 T=0.5,
                 list_T=None,
                 percentile=50,
                 n_top=None,
                 list_percentile=None,
                 use_precomputed=True,
                 ):
        """
        Parameters
        ----------
        adata_ref: `AnnData`
            Reference anndata.
        list_adata_query: `list`
            A list of query anndata objects.
        T: `float`
            Temperature parameter shared by all query adata objects.
            It controls the output probability distribution.
            When T goes to inf, the distribution becomes discrete uniform
            and each query becomes the average of the reference;
            when T goes to zero, softargmax converges to arg max
            and each query is approximately its best-matching reference.
        list_T: `list`, (default: None)
            A list of temperature parameters.
            It should correspond to each query dataset.
            Once it's specified, it will override `T`.
        n_top: `int` (default: None)
            The number of top reference entities to include
            when transforming entities in query data.
            It is used to filter out low-probability reference entities.
            All reference entities are used by default.
        percentile: `int` (default: 50)
            Percentile (0-100) of reference entities to exclude
            when transforming entities in query data.
            It is used to filter out low-probability reference entities.
            All reference entities are used by default.
        list_percentile: `list`, (default: None)
            A list of percentile parameters.
            It corresponds to each query dataset.
            Once it's specified, it will override `percentile`.
        use_precomputed: `bool` (default: True)
            If True, reuse a previously computed `.layers['softmax']`
            of each query anndata when available.
        """
        assert isinstance(list_adata_query, list), \
            "`list_adata_query` must be list"
        if list_T is not None:
            assert isinstance(list_T, list), \
                "`list_T` must be list"
        self.adata_ref = adata_ref
        self.list_adata_query = list_adata_query
        self.T = T
        self.list_T = list_T
        self.percentile = percentile
        self.n_top = n_top
        self.list_percentile = list_percentile
        self.use_precomputed = use_precomputed

    def embed(self):
        """Embed a list of query datasets along with the reference dataset
        into the same space

        Returns
        -------
        adata_all: `AnnData`
            Store #entities × #dimensions.
        """
        adata_ref = self.adata_ref
        list_adata_query = self.list_adata_query
        use_precomputed = self.use_precomputed
        T = self.T
        list_T = self.list_T
        n_top = self.n_top
        percentile = self.percentile
        list_percentile = self.list_percentile

        X_all = adata_ref.X.copy()
        obs_all = adata_ref.obs.copy()
        obs_all['id_dataset'] = ['ref']*adata_ref.shape[0]
        for i, adata_query in enumerate(list_adata_query):
            if list_T is not None:
                param_T = list_T[i]
            else:
                param_T = T
            if list_percentile is not None:
                param_percentile = list_percentile[i]
            else:
                param_percentile = percentile
            if use_precomputed:
                if 'softmax' in adata_query.layers.keys():
                    print(f'Reading in precomputed softmax-transformed '
                          f'matrix for query data {i};')
                else:
                    print(f'No softmax-transformed matrix exists '
                          f'for query data {i}')
                    print("Performing softmax transformation;")
                    softmax(adata_ref,
                            adata_query,
                            T=param_T,
                            percentile=param_percentile,
                            n_top=n_top,
                            )
            else:
                print(f'Performing softmax transformation '
                      f'for query data {i};')
                softmax(adata_ref,
                        adata_query,
                        T=param_T,
                        percentile=param_percentile,
                        n_top=n_top,
                        )
            X_all = np.vstack((X_all, adata_query.layers['softmax']))
            obs_query = adata_query.obs.copy()
            obs_query['id_dataset'] = [f'query_{i}']*adata_query.shape[0]
            obs_all = pd.concat([obs_all, obs_query],
                                ignore_index=False)
        adata_all = ad.AnnData(X=X_all, obs=obs_all)
        return adata_all
def embed(adata_ref,
          list_adata_query,
          T=0.5,
          list_T=None,
          percentile=0,
          n_top=None,
          list_percentile=None,
          use_precomputed=False):
    """Embed a list of query datasets along with the reference dataset
    into the same space

    Parameters
    ----------
    adata_ref: `AnnData`
        Reference anndata.
    list_adata_query: `list`
        A list of query anndata objects.
    T: `float`
        Temperature parameter shared by all query adata objects.
        It controls the output probability distribution.
        When T goes to inf, the distribution becomes discrete uniform
        and each query becomes the average of the reference;
        when T goes to zero, softargmax converges to arg max
        and each query is approximately its best-matching reference.
    list_T: `list`, (default: None)
        A list of temperature parameters.
        It corresponds to each query dataset.
        Once it's specified, it will override `T`.
    n_top: `int` (default: None)
        The number of top reference entities to include
        when transforming entities in query data.
        It is used to filter out low-probability reference entities.
        All reference entities are used by default.
    percentile: `int` (default: 0)
        Percentile (0-100) of reference entities to exclude
        when transforming entities in query data.
        It is used to filter out low-probability reference entities.
        All reference entities are used by default.
    list_percentile: `list`, (default: None)
        A list of percentile parameters.
        It corresponds to each query dataset.
        Once it's specified, it will override `percentile`.
    use_precomputed: `bool` (default: False)
        If True, reuse a previously computed `.layers['softmax']`
        of each query anndata when available.

    Returns
    -------
    adata_all: `AnnData`
        Store #entities × #dimensions.

    updates `adata_query` with the following field.

    softmax: `array_like` (`.layers['softmax']`)
        Store #observations × #dimensions softmax-transformed data matrix.
    """
    SE = SimbaEmbed(adata_ref,
                    list_adata_query,
                    T=T,
                    list_T=list_T,
                    percentile=percentile,
                    n_top=n_top,
                    list_percentile=list_percentile,
                    use_precomputed=use_precomputed)
    adata_all = SE.embed()
    return adata_all
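# Example (illustrative sketch; hypothetical objects as above):
#
#     adata_all = embed(adata_ref=adata_C, list_adata_query=[adata_G])
#     adata_all.obs['id_dataset'].value_counts()
#     # 'ref' rows are cells; 'query_0' rows are softmax-projected genes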
def compare_entities(adata_ref, adata_query, n_top=50, T=1):
    """Compare the embeddings of two sets of entities by calculating
    the following values between reference and query entities:

    - dot product
    - normalized dot product
    - softmax probability

    and the following metrics for each query entity:

    - max (the mean of the `n_top` largest normalized dot products
      with reference entities, clipped at zero)
    - std (standard deviation of dot products with reference entities)
    - gini (Gini coefficient of softmax probabilities
      over reference entities)
    - entropy (entropy of softmax probabilities over reference entities)

    Parameters
    ----------
    adata_ref: `AnnData`
        Reference entity anndata.
    adata_query: `AnnData`
        Query entity anndata.
    n_top: `int`, optional (default: 50)
        The number of reference entities to consider
        when calculating the metric 'max'.
    T: `float`
        Temperature parameter for softmax.
        It controls the output probability distribution.
        When T goes to inf, the distribution becomes discrete uniform
        and each query becomes the average of the reference;
        when T goes to zero, softargmax converges to arg max
        and each query is approximately its best-matching reference.

    Returns
    -------
    adata_cmp: `AnnData`
        Stores reference entities as observations
        and query entities as variables.
    """
    X_ref = adata_ref.X
    X_query = adata_query.X
    X_cmp = np.matmul(X_ref, X_query.T)
    adata_cmp = ad.AnnData(X=X_cmp,
                           obs=adata_ref.obs,
                           var=adata_query.obs)
    # dot product normalized by the log-mean over reference entities
    adata_cmp.layers['norm'] = X_cmp \
        - np.log(np.exp(X_cmp).mean(axis=0)).reshape(1, -1)
    # softmax probability over reference entities (columns sum to 1)
    adata_cmp.layers['softmax'] = np.exp(X_cmp/T) \
        / np.exp(X_cmp/T).sum(axis=0).reshape(1, -1)
    adata_cmp.var['max'] = \
        np.clip(np.sort(adata_cmp.layers['norm'], axis=0)[-n_top:, ],
                a_min=0, a_max=None).mean(axis=0)
    adata_cmp.var['std'] = np.std(X_cmp, axis=0, ddof=1)
    adata_cmp.var['gini'] = np.array(
        [_gini(adata_cmp.layers['softmax'][:, i])
         for i in np.arange(X_cmp.shape[1])])
    adata_cmp.var['entropy'] = entropy(adata_cmp.layers['softmax'])
    return adata_cmp
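# Example (illustrative sketch; hypothetical `adata_C`/`adata_G` as above):
#
#     adata_cmp = compare_entities(adata_ref=adata_C, adata_query=adata_G)
#     adata_cmp.var[['max', 'gini']].sort_values('gini', ascending=False)
#
# Genes with high 'max' and high 'gini' score highly against only a small
# subset of cells, i.e. they are likely cell-type-specific markers.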
def query(adata,
          obsm='X_umap',
          layer=None,
          metric='euclidean',
          anno_filter=None,
          filters=None,
          entity=None,
          pin=None,
          k=20,
          use_radius=False,
          r=None,
          **kwargs
          ):
    """Query the "database" of entities

    Parameters
    ----------
    adata : `AnnData`
        Anndata object to query.
    obsm : `str`, optional (default: "X_umap")
        The multi-dimensional annotation to use
        for calculating the distance.
    layer : `str`, optional (default: None)
        The layer to use for calculating the distance.
    metric : `str`, optional (default: "euclidean")
        The distance metric to use.
        More metrics can be found at "`DistanceMetric class
        <https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.DistanceMetric.html>`__"
    anno_filter : `str`, optional (default: None)
        The annotation of filter to use.
        It should be one of ``adata.obs_keys()``.
    filters : `list`, optional (default: None)
        The filters to use.
        It should be a list of values in ``adata.obs[anno_filter]``.
    entity : `list`, optional (default: None)
        Query entities. They need to be in ``adata.obs_names``.
    pin : `array_like`, optional (default: None)
        Coordinates of query points.
        Once specified, the coordinates of `entity` will be ignored.
    k : `int`, optional (default: 20)
        The number of nearest neighbors to return.
        Only valid if `use_radius` is False.
    use_radius : `bool`, optional (default: False)
        If True, query for neighbors within a given radius.
    r : `float`, optional (default: None)
        Distance within which neighbors are returned.
        If None, it will be estimated based on the range of the space.
    **kwargs : `dict`, optional
        Extra arguments to ``sklearn.neighbors.KDTree``.

    Returns
    -------
    updates `adata` with the following fields.

    params: `dict`, (`adata.uns['query']['params']`)
        Parameters used for the query.
    output: `pandas.DataFrame`, (`adata.uns['query']['output']`)
        Query result.
    """
    if entity is None and pin is None:
        raise ValueError("One of `entity` and `pin` must be specified")
    if entity is not None and pin is not None:
        print("`entity` will be ignored.")
    if entity is not None:
        entity = np.array(entity).flatten()
    if layer is not None and obsm is not None:
        raise ValueError("Only one of `layer` and `obsm` can be used")
    elif obsm is not None:
        X = adata.obsm[obsm].copy()
        if pin is None:
            pin = adata[entity, :].obsm[obsm].copy()
    elif layer is not None:
        X = adata.layers[layer].copy()
        if pin is None:
            pin = adata[entity, :].layers[layer].copy()
    else:
        X = adata.X.copy()
        if pin is None:
            pin = adata[entity, :].X.copy()
    pin = np.reshape(np.array(pin), [-1, X.shape[1]])

    if use_radius:
        kdt = KDTree(X, metric=metric, **kwargs)
        if r is None:
            # default radius: one fifth of the average range of the space
            r = np.mean(X.max(axis=0) - X.min(axis=0))/5
        ind, dist = kdt.query_radius(pin, r=r,
                                     sort_results=True,
                                     return_distance=True)
        df_output = pd.DataFrame()
        for ii in np.arange(pin.shape[0]):
            df_output_ii = adata.obs.iloc[ind[ii], ].copy()
            df_output_ii['distance'] = dist[ii]
            if entity is not None:
                df_output_ii['query'] = entity[ii]
            else:
                df_output_ii['query'] = ii
            df_output = pd.concat([df_output, df_output_ii],
                                  ignore_index=False)
        if anno_filter is not None:
            if anno_filter in adata.obs_keys():
                if filters is None:
                    filters = df_output[anno_filter].unique().tolist()
                df_output.query(f'{anno_filter} == @filters', inplace=True)
            else:
                raise ValueError(f'could not find {anno_filter}')
        df_output = df_output.sort_values(by='distance')
    else:
        if anno_filter is not None:
            if anno_filter in adata.obs_keys():
                if filters is None:
                    filters = adata.obs[anno_filter].unique().tolist()
                ids_filters = \
                    np.where(np.isin(adata.obs[anno_filter],
                                     filters))[0]
            else:
                raise ValueError(f'could not find {anno_filter}')
        else:
            ids_filters = np.arange(X.shape[0])
        kdt = KDTree(X[ids_filters, :], metric=metric, **kwargs)
        dist, ind = kdt.query(pin, k=k,
                              sort_results=True,
                              return_distance=True)
        # alternative backend using faiss:
        # X = X.astype('float32')
        # if metric == 'euclidean':
        #     faiss_index = faiss.IndexFlatL2(X.shape[1])  # build the index
        #     faiss_index.add(X[ids_filters, :])
        #     dist, ind = faiss_index.search(pin, k)
        # if metric == 'dot_product':
        #     faiss_index = faiss.IndexFlatIP(X.shape[1])  # build the index
        #     faiss_index.add(X[ids_filters, :])
        #     sim, ind = faiss_index.search(pin, k)
        #     dist = -sim
        df_output = pd.DataFrame()
        for ii in np.arange(pin.shape[0]):
            df_output_ii = \
                adata.obs.iloc[ids_filters, ].iloc[ind[ii, ], ].copy()
            df_output_ii['distance'] = dist[ii, ]
            if entity is not None:
                df_output_ii['query'] = entity[ii]
            else:
                df_output_ii['query'] = ii
            df_output = pd.concat([df_output, df_output_ii],
                                  ignore_index=False)
        df_output = df_output.sort_values(by='distance')
    adata.uns['query'] = dict()
    adata.uns['query']['params'] = {'obsm': obsm,
                                    'layer': layer,
                                    'entity': entity,
                                    'pin': pin,
                                    'k': k,
                                    'use_radius': use_radius,
                                    'r': r}
    adata.uns['query']['output'] = df_output.copy()
    return df_output
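# Example (illustrative sketch; `adata_all` as returned by `embed` above,
# and 'GATA1' is a hypothetical entity name):
#
#     # 20 nearest neighbors of 'GATA1' in the SIMBA embedding space
#     df_nn = query(adata_all, obsm=None, entity=['GATA1'], k=20)
#     # restrict the neighbors to genes only, assuming an 'entity_anno'
#     # column exists in `adata_all.obs`
#     df_genes = query(adata_all, obsm=None, entity=['GATA1'],
#                      anno_filter='entity_anno', filters=['gene'], k=20)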
def find_master_regulators(adata_all,
                           list_tf_motif=None,
                           list_tf_gene=None,
                           metric='euclidean',
                           anno_filter='entity_anno',
                           filter_gene='gene',
                           metrics_gene=None,
                           metrics_motif=None,
                           cutoff_gene_max=1.5,
                           cutoff_gene_gini=0.3,
                           cutoff_gene_std=None,
                           cutoff_gene_entropy=None,
                           cutoff_motif_max=1.5,
                           cutoff_motif_gini=0.3,
                           cutoff_motif_std=None,
                           cutoff_motif_entropy=None,
                           ):
    """Find all the master regulators

    Parameters
    ----------
    adata_all : `AnnData`
        Anndata object storing the SIMBA embedding of all entities.
    list_tf_motif : `list`
        A list of TF motifs. They should match TF genes in `list_tf_gene`.
    list_tf_gene : `list`
        A list of TF genes. They should match TF motifs in `list_tf_motif`.
    metric : `str`, optional (default: "euclidean")
        The distance metric to use.
        It can be 'braycurtis', 'canberra', 'chebyshev', 'cityblock',
        'correlation', 'cosine', 'dice', 'euclidean', 'hamming', 'jaccard',
        'jensenshannon', 'kulsinski', 'mahalanobis', 'matching',
        'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean',
        'sokalmichener', 'sokalsneath', 'sqeuclidean', 'wminkowski',
        'yule'.
    anno_filter : `str`, optional (default: 'entity_anno')
        The annotation of filter to use.
        It should be one of ``adata.obs_keys()``.
    filter_gene : `str`, optional (default: 'gene')
        The filter for genes. It should be in ``adata.obs[anno_filter]``.
    metrics_gene : `pandas.DataFrame`, optional (default: None)
        SIMBA metrics for genes.
    metrics_motif : `pandas.DataFrame`, optional (default: None)
        SIMBA metrics for motifs.
    cutoff_gene_max, cutoff_motif_max : `float`
        Cutoff of SIMBA metric `max value` for genes and motifs.
    cutoff_gene_gini, cutoff_motif_gini : `float`
        Cutoff of SIMBA metric `Gini index` for genes and motifs.
    cutoff_gene_std, cutoff_motif_std : `float`
        Cutoff of SIMBA metric `standard deviation` for genes and motifs.
    cutoff_gene_entropy, cutoff_motif_entropy : `float`
        Cutoff of SIMBA metric `entropy` for genes and motifs.

    Returns
    -------
    df_MR: `pandas.DataFrame`
        Dataframe of master regulators.
    """
    if list_tf_motif is None or list_tf_gene is None:
        raise ValueError(
            "Please specify both `list_tf_motif` and `list_tf_gene`")
    assert isinstance(list_tf_motif, list), \
        "`list_tf_motif` must be list"
    assert isinstance(list_tf_gene, list), \
        "`list_tf_gene` must be list"
    assert len(list_tf_motif) == len(list_tf_gene), \
        "`list_tf_motif` and `list_tf_gene` must have the same length"
    assert len(list_tf_motif) == len(set(list_tf_motif)), \
        "Duplicates are found in `list_tf_motif`"

    genes = adata_all[adata_all.obs[anno_filter] == filter_gene].\
        obs_names.tolist()
    # master regulators
    df_MR = pd.DataFrame(list(zip(list_tf_motif, list_tf_gene)),
                         columns=['motif', 'gene'])
    if metrics_motif is not None:
        print('Adding motif metrics ...')
        assert isinstance(metrics_motif, pd.DataFrame), \
            "`metrics_motif` must be pd.DataFrame"
        df_metrics_motif = metrics_motif.loc[list_tf_motif, ].copy()
        df_metrics_motif.columns = df_metrics_motif.columns + '_motif'
        df_MR = df_MR.merge(df_metrics_motif,
                            how='left',
                            left_on='motif',
                            right_index=True)
    if metrics_gene is not None:
        print('Adding gene metrics ...')
        assert isinstance(metrics_gene, pd.DataFrame), \
            "`metrics_gene` must be pd.DataFrame"
        df_metrics_gene = metrics_gene.loc[list_tf_gene, ].copy()
        # reindex by motif to avoid duplicate genes
        df_metrics_gene.index = list_tf_motif
        df_metrics_gene.columns = df_metrics_gene.columns + '_gene'
        df_MR = df_MR.merge(df_metrics_gene,
                            how='left',
                            left_on='motif',
                            right_index=True)

    print('Computing distances between TF motifs and genes ...')
    dist_MG = distance.cdist(adata_all[df_MR['motif'], ].X,
                             adata_all[genes, ].X,
                             metric=metric)
    dist_MG = pd.DataFrame(dist_MG,
                           index=df_MR['motif'].tolist(),
                           columns=genes)
    df_MR.insert(2, 'rank', -1)
    df_MR.insert(3, 'dist', -1)
    for i in np.arange(df_MR.shape[0]):
        x_motif = df_MR['motif'].iloc[i]
        x_gene = df_MR['gene'].iloc[i]
        # rank of the TF gene among all genes, by distance to its motif
        df_MR.loc[i, 'rank'] = dist_MG.loc[x_motif, ].rank()[x_gene]
        df_MR.loc[i, 'dist'] = dist_MG.loc[x_motif, x_gene]

    if metrics_gene is not None:
        print('Filtering master regulators based on gene metrics:')
        if cutoff_gene_entropy is not None:
            print('entropy')
            df_MR = df_MR[df_MR['entropy_gene'] > cutoff_gene_entropy]
        if cutoff_gene_gini is not None:
            print('Gini index')
            df_MR = df_MR[df_MR['gini_gene'] > cutoff_gene_gini]
        if cutoff_gene_max is not None:
            print('max')
            df_MR = df_MR[df_MR['max_gene'] > cutoff_gene_max]
        if cutoff_gene_std is not None:
            print('standard deviation')
            df_MR = df_MR[df_MR['std_gene'] > cutoff_gene_std]
    if metrics_motif is not None:
        print('Filtering master regulators based on motif metrics:')
        if cutoff_motif_entropy is not None:
            print('entropy')
            df_MR = df_MR[df_MR['entropy_motif'] > cutoff_motif_entropy]
        if cutoff_motif_gini is not None:
            print('Gini index')
            df_MR = df_MR[df_MR['gini_motif'] > cutoff_motif_gini]
        if cutoff_motif_max is not None:
            print('max')
            df_MR = df_MR[df_MR['max_motif'] > cutoff_motif_max]
        if cutoff_motif_std is not None:
            print('standard deviation')
            df_MR = df_MR[df_MR['std_motif'] > cutoff_motif_std]
    df_MR = df_MR.sort_values(by='rank', ignore_index=True)
    return df_MR
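# Example (illustrative sketch; `adata_all` holds all entity embeddings
# and `adata_cmp_gene`/`adata_cmp_motif` are hypothetical outputs of
# `compare_entities`; the motif/gene names are also hypothetical):
#
#     df_MR = find_master_regulators(
#         adata_all,
#         list_tf_motif=['MA0035.4_GATA1'],
#         list_tf_gene=['GATA1'],
#         metrics_gene=adata_cmp_gene.var,
#         metrics_motif=adata_cmp_motif.var)
#
# A small 'rank' means the TF gene is among the nearest genes to its own
# motif, the signature expected of a master regulator.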
def find_target_genes(adata_all,
                      adata_PM,
                      list_tf_motif=None,
                      list_tf_gene=None,
                      adata_CP=None,
                      metric='euclidean',
                      anno_filter='entity_anno',
                      filter_peak='peak',
                      filter_gene='gene',
                      n_genes=200,
                      cutoff_gene=None,
                      cutoff_peak=1000,
                      use_precomputed=True,
                      ):
    """For a given TF, infer its target genes

    Parameters
    ----------
    adata_all : `AnnData`
        Anndata object storing the SIMBA embedding of all entities.
    adata_PM : `AnnData`
        Peaks-by-motifs anndata object.
    list_tf_motif : `list`
        A list of TF motifs. They should match TF genes in `list_tf_gene`.
    list_tf_gene : `list`
        A list of TF genes. They should match TF motifs in `list_tf_motif`.
    adata_CP : `AnnData`, optional (default: None)
        Cells-by-peaks anndata object.
        When `use_precomputed` is True, it can be set to None.
    metric : `str`, optional (default: "euclidean")
        The distance metric to use.
        It can be 'braycurtis', 'canberra', 'chebyshev', 'cityblock',
        'correlation', 'cosine', 'dice', 'euclidean', 'hamming', 'jaccard',
        'jensenshannon', 'kulsinski', 'mahalanobis', 'matching',
        'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean',
        'sokalmichener', 'sokalsneath', 'sqeuclidean', 'wminkowski',
        'yule'.
    anno_filter : `str`, optional (default: 'entity_anno')
        The annotation of filter to use.
        It should be one of ``adata.obs_keys()``.
    filter_peak : `str`, optional (default: 'peak')
        The filter for peaks. It should be in ``adata.obs[anno_filter]``.
    filter_gene : `str`, optional (default: 'gene')
        The filter for genes. It should be in ``adata.obs[anno_filter]``.
    n_genes : `int`, optional (default: 200)
        The number of neighbor genes to consider initially
        around the TF gene or TF motif.
    cutoff_gene : `float`, optional (default: None)
        Cutoff of "average_rank".
    cutoff_peak : `int`, optional (default: 1000)
        Cutoff for peak-associated ranks,
        including "rank_peak_to_gene" and "rank_peak_to_TFmotif".
    use_precomputed : `bool`, optional (default: True)
        If True, distances calculated between genes, peaks, and motifs
        (stored in ``adata.uns['tf_targets']``) will be imported.

    Returns
    -------
    dict_tf_targets : `dict`
        Target genes for each TF.

    updates `adata` with the following fields.

    tf_targets: `dict`, (`adata.uns['tf_targets']`)
        Distances calculated between genes, peaks, and motifs.
    """
    if list_tf_motif is None or list_tf_gene is None:
        raise ValueError(
            "Please specify both `list_tf_motif` and `list_tf_gene`")
    assert isinstance(list_tf_motif, list), \
        "`list_tf_motif` must be list"
    assert isinstance(list_tf_gene, list), \
        "`list_tf_gene` must be list"
    assert len(list_tf_motif) == len(list_tf_gene), \
        "`list_tf_motif` and `list_tf_gene` must have the same length"

    def isin(a, b):
        return np.array([item in b for item in a])

    print('Preprocessing ...')
    if use_precomputed and 'tf_targets' in adata_all.uns_keys():
        print('importing precomputed variables ...')
        genes = adata_all.uns['tf_targets']['genes']
        peaks = adata_all.uns['tf_targets']['peaks']
        peaks_in_genes = adata_all.uns['tf_targets']['peaks_in_genes']
        dist_PG = adata_all.uns['tf_targets']['dist_PG']
        overlap_PG = adata_all.uns['tf_targets']['overlap']
    else:
        assert (adata_CP is not None), \
            '`adata_CP` needs to be specified '\
            'when no precomputed variable is stored'
        if 'gene_scores' not in adata_CP.uns_keys():
            raise ValueError(
                'Please run "si.tl.gene_scores(adata_CP)" first.')
        overlap_PG = adata_CP.uns['gene_scores']['overlap'].copy()
        overlap_PG['peak'] = \
            overlap_PG[['chr_p', 'start_p', 'end_p']].apply(
                lambda row: '_'.join(row.values.astype(str)), axis=1)
        tuples = list(zip(overlap_PG['symbol_g'], overlap_PG['peak']))
        multi_indices = pd.MultiIndex.from_tuples(
            tuples, names=["gene", "peak"])
        overlap_PG.index = multi_indices
        genes = adata_all[adata_all.obs[anno_filter] == filter_gene].\
            obs_names.tolist()
        peaks = adata_all[adata_all.obs[anno_filter] == filter_peak].\
            obs_names.tolist()
        peaks_in_genes = list(set(overlap_PG['peak']))
        print(f'#genes: {len(genes)}')
        print(f'#peaks: {len(peaks)}')
        print(f'#gene-associated peaks: {len(peaks_in_genes)}')
        print('computing distances between genes '
              'and gene-associated peaks ...')
        dist_PG = distance.cdist(
            adata_all[peaks_in_genes, ].X,
            adata_all[genes, ].X,
            metric=metric,
        )
        dist_PG = pd.DataFrame(dist_PG,
                               index=peaks_in_genes,
                               columns=genes)
        print("Saving variables into `.uns['tf_targets']` ...")
        adata_all.uns['tf_targets'] = dict()
        adata_all.uns['tf_targets']['overlap'] = overlap_PG
        adata_all.uns['tf_targets']['dist_PG'] = dist_PG
        adata_all.uns['tf_targets']['genes'] = genes
        adata_all.uns['tf_targets']['peaks'] = peaks
        adata_all.uns['tf_targets']['peaks_in_genes'] = peaks_in_genes

    dict_tf_targets = dict()
    for tf_motif, tf_gene in zip(list_tf_motif, list_tf_gene):
        print(f'searching for target genes of {tf_motif}')
        # peaks containing the TF motif
        motif_peaks = adata_PM.obs_names[
            adata_PM[:, tf_motif].X.nonzero()[0]]
        # genes whose associated peaks contain the TF motif
        motif_genes = list(
            set(overlap_PG[isin(overlap_PG['peak'],
                                motif_peaks)]['symbol_g'])
            .intersection(genes))

        # rank of the distances from genes to tf_motif
        dist_GM_motif = distance.cdist(adata_all[genes, ].X,
                                       adata_all[tf_motif, ].X,
                                       metric=metric)
        dist_GM_motif = pd.DataFrame(dist_GM_motif,
                                     index=genes,
                                     columns=[tf_motif])
        rank_GM_motif = dist_GM_motif.rank(axis=0)

        # rank of the distances from genes to tf_gene
        dist_GG_motif = distance.cdist(adata_all[genes, ].X,
                                       adata_all[tf_gene, ].X,
                                       metric=metric)
        dist_GG_motif = pd.DataFrame(dist_GG_motif,
                                     index=genes,
                                     columns=[tf_gene])
        rank_GG_motif = dist_GG_motif.rank(axis=0)

        # rank of the distances from peaks to tf_motif
        dist_PM_motif = distance.cdist(
            adata_all[peaks_in_genes, ].X,
            adata_all[tf_motif, ].X,
            metric=metric)
        dist_PM_motif = pd.DataFrame(dist_PM_motif,
                                     index=peaks_in_genes,
                                     columns=[tf_motif])
        rank_PM_motif = dist_PM_motif.rank(axis=0)

        # candidate genes: neighbors of tf_gene and of tf_motif
        cand_genes = \
            dist_GG_motif[tf_gene].nsmallest(n_genes).index.tolist()\
            + dist_GM_motif[tf_motif].nsmallest(n_genes).index.tolist()
        print(f'#candidate genes is {len(cand_genes)}')
        print('removing duplicate genes ...')
        print('removing genes that do not contain the TF motif ...')
        cand_genes = list(set(cand_genes).intersection(set(motif_genes)))
        print(f'#candidate genes is {len(cand_genes)}')

        # rank of the distances from peaks to candidate genes
        dist_PG_motif = distance.cdist(
            adata_all[peaks_in_genes, ].X,
            adata_all[cand_genes, ].X,
            metric=metric)
        dist_PG_motif = pd.DataFrame(dist_PG_motif,
                                     index=peaks_in_genes,
                                     columns=cand_genes)
        rank_PG_motif = dist_PG_motif.rank(axis=0)

        df_tf_targets = pd.DataFrame(index=cand_genes)
        df_tf_targets['average_rank'] = -1
        df_tf_targets['has_motif'] = 'no'
        df_tf_targets['rank_gene_to_TFmotif'] = -1
        df_tf_targets['rank_gene_to_TFgene'] = -1
        df_tf_targets['rank_peak_to_TFmotif'] = -1
        df_tf_targets['rank_peak2_to_TFmotif'] = -1
        df_tf_targets['rank_peak_to_gene'] = -1
        df_tf_targets['rank_peak2_to_gene'] = -1
        for i, g in enumerate(cand_genes):
            g_peaks = list(set(overlap_PG.loc[[g]]['peak']))
            # peaks of gene g that contain the TF motif
            g_motif_peaks = list(set(g_peaks).intersection(motif_peaks))
            if len(g_motif_peaks) > 0:
                df_tf_targets.loc[g, 'has_motif'] = 'yes'
            df_tf_targets.loc[g, 'rank_gene_to_TFmotif'] = \
                rank_GM_motif[tf_motif][g]
            df_tf_targets.loc[g, 'rank_gene_to_TFgene'] = \
                rank_GG_motif[tf_gene][g]
            df_tf_targets.loc[g, 'rank_peak_to_TFmotif'] = \
                rank_PM_motif.loc[g_peaks, tf_motif].min()
            df_tf_targets.loc[g, 'rank_peak2_to_TFmotif'] = \
                rank_PM_motif.loc[g_motif_peaks, tf_motif].min()
            df_tf_targets.loc[g, 'rank_peak_to_gene'] = \
                rank_PG_motif.loc[g_peaks, g].min()
            # restrict to motif-containing peaks for the "peak2" rank
            df_tf_targets.loc[g, 'rank_peak2_to_gene'] = \
                rank_PG_motif.loc[g_motif_peaks, g].min()
            if i % max(1, len(cand_genes)//5) == 0:
                print(f'completed: {i/len(cand_genes):.1%}')
        df_tf_targets['average_rank'] = \
            df_tf_targets[['rank_gene_to_TFmotif',
                           'rank_gene_to_TFgene']].mean(axis=1)
        if cutoff_peak is not None:
            print('Pruning candidate genes based on nearby peaks ...')
            df_tf_targets = df_tf_targets[
                (df_tf_targets[['rank_peak_to_TFmotif',
                                'rank_peak_to_gene']]
                 < cutoff_peak).sum(axis=1) > 0]
        if cutoff_gene is not None:
            print('Pruning candidate genes based on average rank ...')
            df_tf_targets = df_tf_targets[
                df_tf_targets['average_rank'] < cutoff_gene]
        dict_tf_targets[tf_motif] = \
            df_tf_targets.sort_values(by='average_rank').copy()
    return dict_tf_targets
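# Example (illustrative sketch; `adata_all`, `adata_PM` (peaks-by-motifs)
# and `adata_CP` (cells-by-peaks) are assumed inputs; names hypothetical):
#
#     dict_targets = find_target_genes(
#         adata_all, adata_PM,
#         list_tf_motif=['MA0035.4_GATA1'],
#         list_tf_gene=['GATA1'],
#         adata_CP=adata_CP)
#     dict_targets['MA0035.4_GATA1'].head()  # candidates by average_rank
#
# On the first call the gene/peak/motif distances are cached in
# `adata_all.uns['tf_targets']`, so later calls with
# `use_precomputed=True` skip the expensive preprocessing.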