# Source code for simba.datasets._datasets

import urllib.request
from tqdm import tqdm
import os

from .._settings import settings
from ..readwrite import read_h5ad


class DownloadProgressBar(tqdm):
    """tqdm progress bar whose `update_to` method can be used as the
    ``reporthook`` callback of ``urllib.request.urlretrieve``.
    """

    def update_to(self, b=1, bsize=1, tsize=None):
        """Move the bar to the absolute position ``b * bsize``.

        Parameters
        ----------
        b: `int`, optional (default: 1)
            Number of blocks transferred so far.
        bsize: `int`, optional (default: 1)
            Size of each block.
        tsize: `int`, optional (default: None)
            Total size. When it is not None, it replaces ``self.total``.
        """
        if tsize is not None:
            self.total = tsize
        transferred = b * bsize
        # `update` expects an increment, so subtract what the bar has
        # already counted (`self.n`) from the absolute position.
        self.update(transferred - self.n)


def download_url(url,
                 output_path,
                 desc=None):
    """Download `url` to `output_path` while displaying a progress bar.

    The data is first written to a temporary ``<output_path>.part`` file
    and only moved to `output_path` once the transfer has completed.
    An interrupted or failed download therefore never leaves a truncated
    file at `output_path`, which callers that test for the existence of
    `output_path` would otherwise mistake for a complete download.

    Parameters
    ----------
    url: `str`
        Address of the file to download.
    output_path: `str`
        Path of the file to write.
    desc: `str`, optional (default: None)
        Label shown next to the progress bar.
        If None, the last '/'-separated component of `url` is used.

    Returns
    -------
    None
    """
    if desc is None:
        desc = url.split('/')[-1]
    tmp_path = os.fspath(output_path) + '.part'
    try:
        with DownloadProgressBar(
            unit='B',
            unit_scale=True,
            miniters=1,
            desc=desc
        ) as t:
            urllib.request.urlretrieve(
                url,
                filename=tmp_path,
                reporthook=t.update_to)
        # Reached only when the transfer succeeded: publish the file.
        os.replace(tmp_path, output_path)
    finally:
        # After a successful `os.replace` the temporary file is gone, so
        # this only removes the partial data of a failed download.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


def rna_10xpmbc3k():
    """10X human peripheral blood mononuclear cells (PBMCs) scRNA-seq data

    The file is stored in the 'data' folder under `settings.workdir`
    and is only downloaded when it does not exist there yet.

    Returns
    -------
    adata: `AnnData`
        Anndata object
    """
    url = 'https://www.dropbox.com/s/087wuliddmbp3oe/rna_seq.h5ad?dl=1'
    filename = 'rna_10xpmbc3k.h5ad'
    filepath = os.path.join(settings.workdir, 'data')
    fullpath = os.path.join(filepath, filename)
    # Reuse the local copy when one is already present.
    if not os.path.exists(fullpath):
        print('Downloading data ...')
        os.makedirs(filepath, exist_ok=True)
        download_url(url,
                     fullpath,
                     desc=filename)
        print(f'Downloaded to {filepath}.')
    adata = read_h5ad(fullpath)
    return adata
def rna_han2018():
    """single-cell microwell-seq mouse cell atlas data

    ref: Han, X. et al. Mapping the mouse cell atlas by microwell-seq.
    Cell 172, 1091-1107. e1017 (2018).

    The file is stored in the 'data' folder under `settings.workdir`
    and is only downloaded when it does not exist there yet.

    Returns
    -------
    adata: `AnnData`
        Anndata object
    """
    url = 'https://www.dropbox.com/s/nxbszjbir44g99n/rna_seq_mi.h5ad?dl=1'
    filename = 'rna_han2018.h5ad'
    filepath = os.path.join(settings.workdir, 'data')
    fullpath = os.path.join(filepath, filename)
    # Reuse the local copy when one is already present.
    if not os.path.exists(fullpath):
        print('Downloading data ...')
        os.makedirs(filepath, exist_ok=True)
        download_url(url,
                     fullpath,
                     desc=filename)
        print(f'Downloaded to {filepath}.')
    adata = read_h5ad(fullpath)
    return adata
def rna_tmc2018():
    """single-cell Smart-Seq2 mouse cell atlas data

    ref: Tabula Muris Consortium. Single-cell transcriptomics of 20 mouse
    organs creates a Tabula Muris. Nature 562, 367-372 (2018).

    The file is stored in the 'data' folder under `settings.workdir`
    and is only downloaded when it does not exist there yet.

    Returns
    -------
    adata: `AnnData`
        Anndata object
    """
    url = 'https://www.dropbox.com/s/rnpyp6vfpuiptkz/rna_seq_sm.h5ad?dl=1'
    filename = 'rna_tmc2018.h5ad'
    filepath = os.path.join(settings.workdir, 'data')
    fullpath = os.path.join(filepath, filename)
    # Reuse the local copy when one is already present.
    if not os.path.exists(fullpath):
        print('Downloading data ...')
        os.makedirs(filepath, exist_ok=True)
        download_url(url,
                     fullpath,
                     desc=filename)
        print(f'Downloaded to {filepath}.')
    adata = read_h5ad(fullpath)
    return adata
def rna_baron2016():
    """single-cell RNA-seq human pancreas data

    ref: Baron, M. et al. A single-cell transcriptomic map of the human
    and mouse pancreas reveals inter-and intra-cell population structure.
    Cell systems 3, 346-360. e344 (2016)

    The file is stored in the 'data' folder under `settings.workdir`
    and is only downloaded when it does not exist there yet.

    Returns
    -------
    adata: `AnnData`
        Anndata object
    """
    url = 'https://www.dropbox.com/s/bvziclu6d3fdzow/rna_seq_baron.h5ad?dl=1'
    filename = 'rna_baron2016.h5ad'
    filepath = os.path.join(settings.workdir, 'data')
    fullpath = os.path.join(filepath, filename)
    # Reuse the local copy when one is already present.
    if not os.path.exists(fullpath):
        print('Downloading data ...')
        os.makedirs(filepath, exist_ok=True)
        download_url(url,
                     fullpath,
                     desc=filename)
        print(f'Downloaded to {filepath}.')
    adata = read_h5ad(fullpath)
    return adata
def rna_muraro2016():
    """single-cell RNA-seq human pancreas data

    ref: Muraro, M.J. et al. A single-cell transcriptome atlas of the
    human pancreas. Cell systems 3, 385-394. e383 (2016).

    The file is stored in the 'data' folder under `settings.workdir`
    and is only downloaded when it does not exist there yet.

    Returns
    -------
    adata: `AnnData`
        Anndata object
    """
    url = 'https://www.dropbox.com/s/ginc9rbo4qmobwx/rna_seq_muraro.h5ad?dl=1'
    filename = 'rna_muraro2016.h5ad'
    filepath = os.path.join(settings.workdir, 'data')
    fullpath = os.path.join(filepath, filename)
    # Reuse the local copy when one is already present.
    if not os.path.exists(fullpath):
        print('Downloading data ...')
        os.makedirs(filepath, exist_ok=True)
        download_url(url,
                     fullpath,
                     desc=filename)
        print(f'Downloaded to {filepath}.')
    adata = read_h5ad(fullpath)
    return adata
def rna_segerstolpe2016():
    """single-cell RNA-seq human pancreas data

    ref: Segerstolpe, Å. et al. Single-cell transcriptome profiling of
    human pancreatic islets in health and type 2 diabetes.
    Cell metabolism 24, 593-607 (2016).

    The file is stored in the 'data' folder under `settings.workdir`
    and is only downloaded when it does not exist there yet.

    Returns
    -------
    adata: `AnnData`
        Anndata object
    """
    url = 'https://www.dropbox.com/s/qomnf4860jwm9pd/rna_seq_segerstolpe.h5ad?dl=1'
    filename = 'rna_segerstolpe2016.h5ad'
    filepath = os.path.join(settings.workdir, 'data')
    fullpath = os.path.join(filepath, filename)
    # Reuse the local copy when one is already present.
    if not os.path.exists(fullpath):
        print('Downloading data ...')
        os.makedirs(filepath, exist_ok=True)
        download_url(url,
                     fullpath,
                     desc=filename)
        print(f'Downloaded to {filepath}.')
    adata = read_h5ad(fullpath)
    return adata
def rna_wang2016():
    """single-cell RNA-seq human pancreas data

    ref: Wang, Y.J. et al. Single-cell transcriptomics of the human
    endocrine pancreas. Diabetes 65, 3028-3038 (2016).

    The file is stored in the 'data' folder under `settings.workdir`
    and is only downloaded when it does not exist there yet.

    Returns
    -------
    adata: `AnnData`
        Anndata object
    """
    url = 'https://www.dropbox.com/s/9tv44nugwpx9t4c/rna_seq_wang.h5ad?dl=1'
    filename = 'rna_wang2016.h5ad'
    filepath = os.path.join(settings.workdir, 'data')
    fullpath = os.path.join(filepath, filename)
    # Reuse the local copy when one is already present.
    if not os.path.exists(fullpath):
        print('Downloading data ...')
        os.makedirs(filepath, exist_ok=True)
        download_url(url,
                     fullpath,
                     desc=filename)
        print(f'Downloaded to {filepath}.')
    adata = read_h5ad(fullpath)
    return adata
def rna_xin2016():
    """single-cell RNA-seq human pancreas data

    ref: Xin, Y. et al. RNA sequencing of single human islet cells
    reveals type 2 diabetes genes. Cell metabolism 24, 608-615 (2016).

    The file is stored in the 'data' folder under `settings.workdir`
    and is only downloaded when it does not exist there yet.

    Returns
    -------
    adata: `AnnData`
        Anndata object
    """
    url = 'https://www.dropbox.com/s/j483i47mxty6rzo/rna_seq_xin.h5ad?dl=1'
    filename = 'rna_xin2016.h5ad'
    filepath = os.path.join(settings.workdir, 'data')
    fullpath = os.path.join(filepath, filename)
    # Reuse the local copy when one is already present.
    if not os.path.exists(fullpath):
        print('Downloading data ...')
        os.makedirs(filepath, exist_ok=True)
        download_url(url,
                     fullpath,
                     desc=filename)
        print(f'Downloaded to {filepath}.')
    adata = read_h5ad(fullpath)
    return adata
def atac_buenrostro2018():
    """single cell ATAC-seq human blood data

    ref: Buenrostro, J.D. et al. Integrated Single-Cell Analysis Maps the
    Continuous Regulatory Landscape of Human Hematopoietic Differentiation.
    Cell 173, 1535-1548 e1516 (2018).

    The file is stored in the 'data' folder under `settings.workdir`
    and is only downloaded when it does not exist there yet.

    Returns
    -------
    adata: `AnnData`
        Anndata object
    """
    url = 'https://www.dropbox.com/s/7hxjqgdxtbna1tm/atac_seq.h5ad?dl=1'
    filename = 'atac_buenrostro2018.h5ad'
    filepath = os.path.join(settings.workdir, 'data')
    fullpath = os.path.join(filepath, filename)
    # Reuse the local copy when one is already present.
    if not os.path.exists(fullpath):
        print('Downloading data ...')
        os.makedirs(filepath, exist_ok=True)
        download_url(url,
                     fullpath,
                     desc=filename)
        print(f'Downloaded to {filepath}.')
    adata = read_h5ad(fullpath)
    return adata
def atac_10xpbmc5k():
    """10X human peripheral blood mononuclear cells (PBMCs) scATAC-seq data

    The file is stored in the 'data' folder under `settings.workdir`
    and is only downloaded when it does not exist there yet.

    Returns
    -------
    adata: `AnnData`
        Anndata object
    """
    url = 'https://www.dropbox.com/s/xa8u7rlskc5h7iv/atac_seq.h5ad?dl=1'
    filename = 'atac_10xpbmc5k.h5ad'
    filepath = os.path.join(settings.workdir, 'data')
    fullpath = os.path.join(filepath, filename)
    # Reuse the local copy when one is already present.
    if not os.path.exists(fullpath):
        print('Downloading data ...')
        os.makedirs(filepath, exist_ok=True)
        download_url(url,
                     fullpath,
                     desc=filename)
        print(f'Downloaded to {filepath}.')
    adata = read_h5ad(fullpath)
    return adata
def atac_cusanovich2018_subset():
    """downsampled sci-ATAC-seq mouse tissue data

    ref: Cusanovich, D.A. et al. A Single-Cell Atlas of In Vivo Mammalian
    Chromatin Accessibility. Cell 174, 1309-1324 e1318 (2018).

    The file is stored in the 'data' folder under `settings.workdir`
    and is only downloaded when it does not exist there yet.

    Returns
    -------
    adata: `AnnData`
        Anndata object
    """
    url = 'https://www.dropbox.com/s/e8iqwm93m33i5wt/atac_seq.h5ad?dl=1'
    filename = 'atac_cusanovich2018_subset.h5ad'
    filepath = os.path.join(settings.workdir, 'data')
    fullpath = os.path.join(filepath, filename)
    # Reuse the local copy when one is already present.
    if not os.path.exists(fullpath):
        print('Downloading data ...')
        os.makedirs(filepath, exist_ok=True)
        download_url(url,
                     fullpath,
                     desc=filename)
        print(f'Downloaded to {filepath}.')
    adata = read_h5ad(fullpath)
    return adata
def atac_chen2019():
    """simulated scATAC-seq bone marrow data

    with a noise level of 0.4 and a coverage of 2500 fragments

    ref: Chen, H. et al. Assessment of computational methods for the
    analysis of single-cell ATAC-seq data. Genome Biology 20, 241 (2019).

    The file is stored in the 'data' folder under `settings.workdir`
    and is only downloaded when it does not exist there yet.

    Returns
    -------
    adata: `AnnData`
        Anndata object
    """
    url = 'https://www.dropbox.com/s/fthhh3mz5b39d4y/atac_seq.h5ad?dl=1'
    filename = 'atac_chen2019.h5ad'
    filepath = os.path.join(settings.workdir, 'data')
    fullpath = os.path.join(filepath, filename)
    # Reuse the local copy when one is already present.
    if not os.path.exists(fullpath):
        print('Downloading data ...')
        os.makedirs(filepath, exist_ok=True)
        download_url(url,
                     fullpath,
                     desc=filename)
        print(f'Downloaded to {filepath}.')
    adata = read_h5ad(fullpath)
    return adata
def multiome_ma2020_fig4():
    """single cell multiome mouse skin data (SHARE-seq)

    ref: Ma, S. et al. Chromatin Potential Identified by Shared
    Single-Cell Profiling of RNA and Chromatin. Cell (2020).

    Both files are stored in the 'data' folder under `settings.workdir`;
    each one is only downloaded when it does not exist there yet.

    Returns
    -------
    dict_adata: `dict`
        A dictionary of anndata objects with the keys 'rna' and 'atac'
    """
    url_rna = 'https://www.dropbox.com/s/gmmf77l8kzle6o7/rna_seq_fig4.h5ad?dl=1'
    url_atac = 'https://www.dropbox.com/s/ts0v2y2m5fcumcb/atac_seq_fig4.h5ad?dl=1'
    filename_rna = 'multiome_ma2020_fig4_rna.h5ad'
    filename_atac = 'multiome_ma2020_fig4_atac.h5ad'
    filepath = os.path.join(settings.workdir, 'data')
    fullpath_rna = os.path.join(filepath, filename_rna)
    fullpath_atac = os.path.join(filepath, filename_atac)

    # Fetch the RNA file first, then the ATAC file, skipping whichever
    # is already present locally.
    for url, fullpath, filename in (
        (url_rna, fullpath_rna, filename_rna),
        (url_atac, fullpath_atac, filename_atac),
    ):
        if not os.path.exists(fullpath):
            print('Downloading data ...')
            os.makedirs(filepath, exist_ok=True)
            download_url(url,
                         fullpath,
                         desc=filename)
            print(f'Downloaded to {filepath}.')

    adata_rna = read_h5ad(fullpath_rna)
    adata_atac = read_h5ad(fullpath_atac)
    dict_adata = {'rna': adata_rna,
                  'atac': adata_atac}
    return dict_adata
def multiome_chen2019():
    """single cell multiome neonatal mouse cerebral cortex data (SNARE-seq)

    ref: Chen, S., Lake, B.B. & Zhang, K. High-throughput sequencing of
    the transcriptome and chromatin accessibility in the same cell.
    Nat Biotechnol (2019).

    Both files are stored in the 'data' folder under `settings.workdir`;
    each one is only downloaded when it does not exist there yet.

    Returns
    -------
    dict_adata: `dict`
        A dictionary of anndata objects with the keys 'rna' and 'atac'
    """
    url_rna = 'https://www.dropbox.com/s/b1bbcs500q0pigt/rna_seq.h5ad?dl=1'
    url_atac = 'https://www.dropbox.com/s/ljepkfber68pdvc/atac_seq.h5ad?dl=1'
    filename_rna = 'multiome_chen2019_rna.h5ad'
    filename_atac = 'multiome_chen2019_atac.h5ad'
    filepath = os.path.join(settings.workdir, 'data')
    fullpath_rna = os.path.join(filepath, filename_rna)
    fullpath_atac = os.path.join(filepath, filename_atac)

    # Fetch the RNA file first, then the ATAC file, skipping whichever
    # is already present locally.
    for url, fullpath, filename in (
        (url_rna, fullpath_rna, filename_rna),
        (url_atac, fullpath_atac, filename_atac),
    ):
        if not os.path.exists(fullpath):
            print('Downloading data ...')
            os.makedirs(filepath, exist_ok=True)
            download_url(url,
                         fullpath,
                         desc=filename)
            print(f'Downloaded to {filepath}.')

    adata_rna = read_h5ad(fullpath_rna)
    adata_atac = read_h5ad(fullpath_atac)
    dict_adata = {'rna': adata_rna,
                  'atac': adata_atac}
    return dict_adata
def multiome_10xpbmc10k():
    """single cell 10X human peripheral blood mononuclear cells (PBMCs)
    multiome data

    Both files are stored in the 'data' folder under `settings.workdir`;
    each one is only downloaded when it does not exist there yet.

    Returns
    -------
    dict_adata: `dict`
        A dictionary of anndata objects with the keys 'rna' and 'atac'
    """
    url_rna = 'https://www.dropbox.com/s/zwlim6vljnbfp43/rna_seq.h5ad?dl=1'
    url_atac = 'https://www.dropbox.com/s/163msz0k9hkfrt7/atac_seq.h5ad?dl=1'
    filename_rna = 'multiome_10xpbmc10k_rna.h5ad'
    filename_atac = 'multiome_10xpbmc10k_atac.h5ad'
    filepath = os.path.join(settings.workdir, 'data')
    fullpath_rna = os.path.join(filepath, filename_rna)
    fullpath_atac = os.path.join(filepath, filename_atac)

    # Fetch the RNA file first, then the ATAC file, skipping whichever
    # is already present locally.
    for url, fullpath, filename in (
        (url_rna, fullpath_rna, filename_rna),
        (url_atac, fullpath_atac, filename_atac),
    ):
        if not os.path.exists(fullpath):
            print('Downloading data ...')
            os.makedirs(filepath, exist_ok=True)
            download_url(url,
                         fullpath,
                         desc=filename)
            print(f'Downloaded to {filepath}.')

    adata_rna = read_h5ad(fullpath_rna)
    adata_atac = read_h5ad(fullpath_atac)
    dict_adata = {'rna': adata_rna,
                  'atac': adata_atac}
    return dict_adata