"""General-purpose tools"""
import numpy as np
from sklearn.cluster import KMeans
[docs]
def discretize(adata,
               layer=None,
               n_bins=5,
               max_bins=100):
    """Discretize continuous values

    The stored values of the selected matrix are first summarised with a
    histogram of `max_bins` bins. The histogram bin centres, weighted by
    their counts, are then clustered into `n_bins` groups with k-means, and
    the midpoints between adjacent sorted cluster centres are used as the
    final bin edges.

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix.
    layer: `str`, optional (default: None)
        The layer used to perform discretization.
        If None, `adata.X` is used.
        NOTE(review): the matrix is presumably a SciPy sparse matrix, since
        its `.data` attribute is read and overwritten -- confirm with
        callers.
    n_bins: `int`, optional (default: 5)
        The number of bins to produce.
        It must be at least 1 and should be smaller than `max_bins`.
    max_bins: `int`, optional (default: 100)
        The number of bins used in the initial approximation.
        i.e. the number of bins to cluster.

    Returns
    -------
    updates `adata` with the following fields
    `.layers['simba']` : `array_like`
        The matrix of discretized values to build SIMBA graph.
    `.uns['disc']` : `dict`
        `bin_edges`: The edges of each bin.
        `bin_count`: The number of values in each bin
        (one entry per bin, 0 for a bin that received no values).
        `hist_edges`: The edges of each bin
        in the initial approximation.
        `hist_count`: The number of values in each bin
        for the initial approximation.

    Raises
    ------
    ValueError
        If `n_bins` is smaller than 1 or larger than `max_bins`.
    TypeError
        If the selected matrix is a dense `numpy.ndarray`.
    """
    # Fail early with a clear message instead of deep inside KMeans.fit:
    # k-means cannot form more clusters than there are histogram bins.
    if n_bins < 1 or n_bins > max_bins:
        raise ValueError(
            "`n_bins` must be between 1 and `max_bins` "
            f"(got n_bins={n_bins}, max_bins={max_bins}).")

    X = adata.X if layer is None else adata.layers[layer]
    # The code below reads the stored values through `.data` and later
    # overwrites `.data` on a copy; a dense array does not offer that.
    if isinstance(X, np.ndarray):
        raise TypeError(
            "discretize expects a sparse matrix exposing its stored "
            "values via `.data`; got a dense numpy array.")
    nonzero_cont = X.data

    # Initial fine-grained approximation of the value distribution.
    hist_count, hist_edges = np.histogram(
        nonzero_cont,
        bins=max_bins,
        density=False)
    hist_centroids = (hist_edges[:-1] + hist_edges[1:]) / 2

    # Cluster the histogram bin centres, weighting each centre by how many
    # values fell into its bin.
    kmeans = KMeans(n_clusters=n_bins, random_state=2021, n_init='auto').fit(
        hist_centroids.reshape(-1, 1),
        sample_weight=hist_count)
    cluster_centers = np.sort(kmeans.cluster_centers_.flatten())

    # Inner edges are the midpoints between neighbouring cluster centres.
    # The outer edges are pushed slightly beyond the histogram range so that
    # the minimum and maximum values fall strictly inside the outer bins.
    padding = (hist_edges[-1] - hist_edges[0]) / (max_bins * 10)
    inner_edges = (cluster_centers[:-1] + cluster_centers[1:]) / 2
    bin_edges = np.array(
        [hist_edges[0] - padding] +
        list(inner_edges) +
        [hist_edges[-1] + padding])

    # Bin labels; with the padded outer edges they lie in 1..n_bins.
    nonzero_disc = np.digitize(nonzero_cont, bin_edges).reshape(-1,)
    # Count per label with a fixed length so that empty bins are reported
    # as 0. np.unique would omit labels that never occur, leaving fewer
    # counts than bins and breaking the alignment with `bin_edges`.
    bin_count = np.bincount(nonzero_disc, minlength=n_bins + 1)[1:n_bins + 1]

    # Same matrix as the input, with the stored values replaced by their
    # bin labels.
    adata.layers['simba'] = X.copy()
    adata.layers['simba'].data = nonzero_disc
    adata.uns['disc'] = {
        'bin_edges': bin_edges,
        'bin_count': bin_count,
        'hist_edges': hist_edges,
        'hist_count': hist_count,
    }