Source code for simba.tools._general

"""General-purpose tools"""

import numpy as np
from sklearn.cluster import KMeans


def discretize(adata, layer=None, n_bins=5, max_bins=100):
    """Discretize continuous values.

    The stored entries of the input matrix (its ``.data`` attribute) are
    first summarised by a histogram with `max_bins` bins. The histogram
    bin centres, weighted by their counts, are then clustered into
    `n_bins` groups with k-means, and the midpoints between consecutive
    sorted cluster centres become the final bin edges.

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix.
    layer: `str`, optional (default: None)
        The layer used to perform discretization.
        If None, `adata.X` is used.
    n_bins: `int`, optional (default: 5)
        The number of bins to produce.
        It must not exceed `max_bins`.
    max_bins: `int`, optional (default: 100)
        The number of bins used in the initial approximation.
        i.e. the number of bins to cluster.

    Returns
    -------
    updates `adata` with the following fields
    `.layers['simba']` : `array_like`
        The matrix of discretized values to build SIMBA graph.
        Every stored entry is replaced by its bin label in
        ``1..n_bins``.
    `.uns['disc']` : `dict`
        `bin_edges`: The edges of each bin (``n_bins + 1`` values).
        `bin_count`: The number of values in each bin (``n_bins``
        values, aligned with `bin_edges`; empty bins are reported as 0).
        `hist_edges`: The edges of each bin
        in the initial approximation.
        `hist_count`: The number of values in each bin
        for the initial approximation.

    Raises
    ------
    ValueError
        If `n_bins` is larger than `max_bins`.
    """
    # k-means cannot produce more clusters than there are histogram bins
    # to cluster, so reject the request before doing any work.
    if n_bins > max_bins:
        raise ValueError(
            f"`n_bins` ({n_bins}) must not exceed `max_bins` ({max_bins})")

    X = adata.X if layer is None else adata.layers[layer]
    # NOTE(review): only `X.data` is discretized, so X is presumably a
    # scipy sparse matrix -- confirm against callers.
    nonzero_cont = X.data

    # Coarse summary of the value distribution.
    hist_count, hist_edges = np.histogram(
        nonzero_cont, bins=max_bins, density=False)
    hist_centroids = (hist_edges[:-1] + hist_edges[1:]) / 2

    # Cluster the histogram bin centres, weighting each one by the number
    # of values it holds; the seed is fixed so results are reproducible.
    kmeans = KMeans(n_clusters=n_bins, random_state=2021, n_init='auto').fit(
        hist_centroids.reshape(-1, 1),
        sample_weight=hist_count)
    cluster_centers = np.sort(kmeans.cluster_centers_.flatten())

    # Inner edges are the midpoints between neighbouring cluster centres.
    # The outer edges are pushed slightly beyond the observed range so the
    # minimum and maximum values fall strictly inside the first/last bin.
    padding = (hist_edges[-1] - hist_edges[0]) / (max_bins * 10)
    bin_edges = np.array(
        [hist_edges[0] - padding]
        + list((cluster_centers[:-1] + cluster_centers[1:]) / 2)
        + [hist_edges[-1] + padding])

    # Bin labels run from 1 to n_bins because of the padded outer edges.
    nonzero_disc = np.digitize(nonzero_cont, bin_edges).reshape(-1,)

    # Count per bin label; `minlength` keeps empty bins as explicit zeros
    # so that bin_count[i] always describes bin_edges[i]..bin_edges[i+1].
    # Index 0 (values below the first edge) is never used and is dropped.
    bin_count = np.bincount(nonzero_disc, minlength=n_bins + 1)[1:]

    # Same sparsity structure as X, with the values swapped for labels.
    adata.layers['simba'] = X.copy()
    adata.layers['simba'].data = nonzero_disc

    adata.uns['disc'] = {
        'bin_edges': bin_edges,
        'bin_count': bin_count,
        'hist_edges': hist_edges,
        'hist_count': hist_count,
    }