# file: fgread/__init__.py
# coding: utf-8
"""Module for reading datasets shared on FASTGenomics."""

# blog url, referenced in the readme and in error messages; it must be defined
# before the .read import below, which imports it from this module
BLOGURL = "https://www.fastgenomics.org/blog_posts/readers/"

from .read import get_datasets, read_dataset, read_datasets
from get_version import get_version

__version__ = get_version(__file__)
__author__ = "FASTGenomics"
del get_version


# file: fgread/dataset.py
import json
import re
from pathlib import Path

DATASET_INFO_FILE = "dataset_info.json"


class DataSet(object):
    """Represents a dataset on FASTGenomics, including its location and the contents
    of its ``dataset_info.json`` file.

    :param path: absolute path to a dataset folder, for example
        ``/fastgenomics/data/dataset_0001``
    """

    def __init__(self, path: str):
        self.path = Path(path)
        if not self.path.exists():
            raise FileNotFoundError(self.path)

        self.metadata = self.read_metadata()
        self.format = self.metadata["format"]
        self.title = self.metadata["title"]
        self.file = self.path / self.metadata["file"]
        # the numeric id is the trailing part of the folder name, e.g. 1 for "dataset_0001"
        self.id = int(self.path.name.split("_")[-1])

    def read_metadata(self):
        with open(self.path / DATASET_INFO_FILE) as f:
            return json.load(f)

    def __repr__(self):
        return (
            f"id: {self.id}\n"
            f"title: {self.title}\n"
            f"format: {self.format}\n"
            f"path: {self.path}\n"
            f'file: {self.metadata["file"]}'
        )


class DatasetDict(dict):
    """A dictionary of :py:class:`~DataSet` objects.

    A single dataset can be selected by its ID (``DatasetDict[ID]``), several datasets
    by a list of IDs (``DatasetDict[[ID1, ID3, ID4]]``), and a range of datasets by a
    slice (``DatasetDict[1:3]``).  Note that both the lower and the upper bound of a
    slice are inclusive and that you pass dataset IDs, not indices (hence they start
    at 1).
    """

    def __getitem__(self, select):
        if isinstance(select, slice):
            # make the upper bound inclusive and translate the slice into the list of
            # dataset IDs that actually exist in this dict
            stop = select.stop + 1 if select.stop else None
            newkey = slice(select.start, stop, select.step)
            keys = list(self.keys())
            keys.append(max(self.keys()) + 1)
            idx = list(range(max(keys))[newkey])
            select = list(set(idx) & set(keys))
        if isinstance(select, list):
            # keep the original (integer) IDs as keys of the returned DatasetDict
            return DatasetDict({sel: self[sel] for sel in sorted(select)})
        return dict.__getitem__(self, select)

    def __repr__(self):
        ds_list = [
            f"Dataset: {id}\n{indent_multiline(str(ds))}" for id, ds in self.items()
        ]
        return "\n\n".join(ds_list)


def indent_multiline(ml_str, tabs=1):
    """Indents a multiline string.

    :param ml_str: a multiline string
    :param tabs: the number of tabs (indents) to add to each line
    :return: the indented multiline string
    """
    return re.sub(r"^", "\t" * tabs, ml_str, flags=re.M)
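# --- usage sketch (not part of the package) -------------------------------------
# A minimal illustration of the DatasetDict selection semantics described above,
# assuming the class is importable as fgread.dataset.DatasetDict.  Plain strings
# stand in for DataSet objects here; a real DatasetDict comes from
# fgread.get_datasets().
from fgread.dataset import DatasetDict

demo = DatasetDict({1: "first dataset", 2: "second dataset", 3: "third dataset"})
demo[2]        # -> "second dataset"  (single entry, selected by dataset ID)
demo[[1, 3]]   # -> DatasetDict with the datasets 1 and 3
demo[1:2]      # -> DatasetDict with the datasets 1 and 2
               #    (slice bounds are dataset IDs and both are inclusive)
# ---------------------------------------------------------------------------------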
# file: fgread/read.py
import logging
import re
from pathlib import Path

from . import BLOGURL, readers
from .dataset import DataSet, DatasetDict

# configure logging
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
logger.addHandler(ch)

DEFAULT_READERS = {
    "Loom": readers.read_loom_to_anndata,
    "Seurat Object": readers.read_seurat_to_anndata,
    "AnnData": readers.read_anndata_to_anndata,
    "10x (hdf5)": readers.read_10xhdf5_to_anndata,
    "10x (mtx)": readers.read_10xmtx_to_anndata,
    "tab-separated text": readers.read_densetsv_to_anndata,
    "comma-separated text": readers.read_densecsv_to_anndata,
}

DATA_DIR = "/fastgenomics/data"


def read_dataset(dataset: DataSet, additional_readers={}):
    """Reads a single dataset, dispatching to a specific reader based on the value of
    ``dataset.format``.

    :param dataset: Object of class :py:class:`~.dataset.DataSet` to be read.
    :param additional_readers: Used to specify your own readers for a specific dataset
        format.  Highly experimental and not tested.

    :returns: AnnData object containing the loaded dataset.
    """
    format = dataset.format
    title = dataset.title
    path = dataset.path
    readers = {**DEFAULT_READERS, **additional_readers}

    if format in readers:
        logger.info(
            f'Loading dataset "{title}" in format "{format}" from directory "{path}"...\n'
        )
        adata = readers[format](dataset)
        adata.uns["metadata"] = dataset.metadata
        adata.obs["fg_title"] = dataset.title
        adata.obs["fg_id"] = dataset.id
        n_genes = adata.shape[1]
        n_cells = adata.shape[0]
        logger.info(
            f'Loaded dataset "{title}" with {n_cells} cells and {n_genes} genes.\n'
            f"==================================================================\n"
        )
        return adata
    elif format == "Other":
        raise NotImplementedError(
            f'The format of the dataset "{title}" is "{format}".  Datasets in the '
            f'"{format}" format are not supported by this module and have to be loaded '
            f"manually.\nSee {BLOGURL} for more information."
        )
    elif format == "Not set":
        raise ValueError(
            f'The format of the dataset "{title}" was not defined.  If you can modify '
            f"the dataset, please specify its format on its details page; otherwise ask "
            f"the dataset owner to do so.\nSee {BLOGURL} for more information."
        )
    else:
        raise KeyError(
            f'Unsupported format "{format}", use one of {list(readers)} or implement '
            f"your own reading function.\nSee {BLOGURL} for more information."
        )


def get_datasets(data_dir=DATA_DIR):
    """Gets all available datasets.

    This convenience function gathers all dataset information available in the
    FASTGenomics environment.  The returned value can either be used to load datasets
    manually or be passed to :py:func:`read_dataset` or :py:func:`read_datasets`.

    :param data_dir: Specify the main data directory.  Useful for testing the module;
        defaults to the FASTGenomics path ``/fastgenomics/data``.

    :returns: A :py:class:`~.dataset.DatasetDict` whose keys are dataset ids (the
        ``xxxx`` part of ``/fastgenomics/data/dataset_xxxx``) and whose values are the
        corresponding :py:class:`~.dataset.DataSet` objects.
    """
    data_dir = Path(data_dir)
    paths = [
        subdir
        for subdir in sorted(data_dir.iterdir())
        if subdir.is_dir() and re.match(r"^dataset_\d{4}$", subdir.name)
    ]
    datasets = DatasetDict({dataset.id: dataset for dataset in map(DataSet, paths)})
    return datasets


def read_datasets(datasets=None, additional_readers={}, data_dir=DATA_DIR):
    """Reads all specified datasets and returns them as AnnData objects.  Uses
    :py:func:`read_dataset` internally.

    :param datasets: If specified, read the datasets from this
        :py:class:`~.dataset.DatasetDict` or single :py:class:`~.dataset.DataSet`.
        Can be useful for e.g. filtering out some dataset types.
    :param additional_readers: Used to specify your own readers for a specific dataset
        format.  Highly experimental and not tested.
    :param data_dir: Specify the main data directory.  Only used when ``datasets`` is
        not given.  Useful for testing the module; defaults to the FASTGenomics path
        ``/fastgenomics/data``.

    :returns: If a :py:class:`~.dataset.DatasetDict` is given (or ``datasets`` is
        omitted), a :py:class:`~.dataset.DatasetDict` mapping dataset ids to the
        corresponding AnnData objects.  If a single :py:class:`~.dataset.DataSet` is
        passed, the corresponding AnnData object.
    """
    datasets = datasets or get_datasets(data_dir)

    if isinstance(datasets, DatasetDict):
        return DatasetDict(
            {
                dataset_id: read_dataset(
                    datasets[dataset_id], additional_readers=additional_readers
                )
                for dataset_id in sorted(datasets.keys())
            }
        )
    elif isinstance(datasets, DataSet):
        return read_dataset(datasets, additional_readers=additional_readers)
    else:
        raise TypeError(
            f'The type of "datasets" has to be a DatasetDict or a single DataSet.  '
            f'Use "fgread.get_datasets()" to create it.\n'
            f"See {BLOGURL} for more information."
        )
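# --- usage sketch (not part of the package) -------------------------------------
# Typical use inside a FASTGenomics analysis, assuming the default
# /fastgenomics/data layout with at least one attached dataset.  The "Other"
# reader below is a hypothetical example of the additional_readers hook; it is
# only used for datasets whose format is set to "Other".
import anndata
import fgread


def read_other(dataset):
    # hypothetical custom reader: assumes dataset.file points at an .h5ad file
    return anndata.read_h5ad(dataset.file)


datasets = fgread.get_datasets()            # DatasetDict: dataset id -> DataSet
print(datasets)                             # overview of all attached datasets

adata = fgread.read_dataset(datasets[1])    # one dataset as AnnData; the contents of
                                            # dataset_info.json end up in adata.uns["metadata"]

adatas = fgread.read_datasets(              # all datasets at once:
    datasets, additional_readers={"Other": read_other}
)                                           # DatasetDict: dataset id -> AnnData
# ---------------------------------------------------------------------------------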
# file: fgread/readers.py
import anndata
import numpy as np
import pandas as pd
import scipy.sparse as sp
import scanpy as sc

from . import BLOGURL
from .dataset import DataSet


def read_loom_to_anndata(dataset: DataSet):
    """Reads a dataset in the Loom format into the AnnData format."""
    adata = anndata.read_loom(dataset.file)
    return adata


def read_seurat_to_anndata(dataset: DataSet):
    """Reads a dataset in the Seurat format into the AnnData format (not implemented)."""
    raise NotImplementedError(
        f"Reading of Seurat files is not implemented.\nSee {BLOGURL} for more information."
    )


def read_anndata_to_anndata(dataset: DataSet):
    """Reads a dataset in the AnnData format into the AnnData format."""
    adata = anndata.read_h5ad(dataset.file)
    return adata


def read_10xhdf5_to_anndata(dataset: DataSet):
    """Reads a dataset in the 10x hdf5 format into the AnnData format."""
    adata = sc.read_10x_h5(dataset.file)
    return adata


def read_10xmtx_to_anndata(dataset: DataSet):
    """Reads a dataset in the 10x mtx format into the AnnData format."""
    adata = sc.read_10x_mtx(dataset.path)
    return adata


def read_densetsv_to_anndata(dataset: DataSet):
    """Reads a dense tab-separated text file into the AnnData format."""
    return read_densemat_to_anndata(dataset, sep="\t")


def read_densecsv_to_anndata(dataset: DataSet):
    """Reads a dense comma-separated text file into the AnnData format."""
    return read_densemat_to_anndata(dataset, sep=",")


def read_densemat_to_anndata(dataset: DataSet, sep=None):
    """Helper for reading dense text files in tsv or csv format.  The separator (tab
    or comma) is passed in by the corresponding wrapper function.  The file is expected
    to have a header row with cell IDs and one row per gene, with the gene ID in the
    first column."""
    file = dataset.file

    # the header row holds the cell IDs; use the second row to determine how many
    # value columns there are, in case the header has no label for the gene column
    with open(file) as f:
        cells = f.readline().strip().replace('"', "").split(sep)
        nextline = f.readline().strip().replace('"', "").split(sep)
    n_cells = len(nextline) - 1
    cells = cells[-n_cells:]

    # gene IDs from the first column
    genes = pd.read_csv(
        file, sep=sep, skiprows=1, usecols=(0,), header=None, names=["GeneID"]
    ).set_index("GeneID")

    # the value columns, transposed to cells x genes and stored as a sparse matrix
    X = np.loadtxt(
        file,
        delimiter=sep,
        skiprows=1,
        usecols=range(1, len(cells) + 1),
        dtype=np.float32,
    ).T
    X = sp.csr_matrix(X)

    var = genes
    obs = pd.DataFrame(
        cells, columns=["sample"], index=pd.Series(cells, name="CellID")
    )
    adata = anndata.AnnData(X=X, var=var, obs=obs)
    return adata


# file: fgread-0.2.1.dist-info/LICENSE

MIT License

Copyright (c) 2019 FASTGenomics

Permission is hereby granted, free of charge, to any person obtaining a copy of this
software and associated documentation files (the "Software"), to deal in the Software
without restriction, including without limitation the rights to use, copy, modify,
merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to the following
conditions:

The above copyright notice and this permission notice shall be included in all copies
or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
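# --- usage sketch (not part of the package) -------------------------------------
# End-to-end illustration of the conventions assumed by this package: a data
# directory containing folders named dataset_XXXX, each with a dataset_info.json
# and the data file it points to.  All paths, titles and values below are made up;
# the csv layout (header row with cell IDs, one row per gene) matches what
# read_densecsv_to_anndata parses.
import json
from pathlib import Path

import fgread

data_dir = Path("/tmp/fg_demo")                   # hypothetical test directory
ds_dir = data_dir / "dataset_0001"
ds_dir.mkdir(parents=True, exist_ok=True)

(ds_dir / "dataset_info.json").write_text(json.dumps({
    "title": "Demo counts",
    "format": "comma-separated text",
    "file": "counts.csv",
}))
(ds_dir / "counts.csv").write_text(
    "GeneID,cell_1,cell_2\n"
    "GeneA,1,0\n"
    "GeneB,3,2\n"
)

datasets = fgread.get_datasets(data_dir=data_dir)  # {1: <DataSet "Demo counts">}
adata = fgread.read_datasets(datasets)[1]          # AnnData with 2 cells x 2 genes
# ---------------------------------------------------------------------------------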