PK1~O`  fgread/__init__.py# coding: utf-8 """Module for reading files shared on FASTGenomics""" from .read import print_datasets, get_datasets, read_dataset, read_datasets from get_version import get_version __version__ = get_version(__file__) __author__ = "FASTGenomics" del get_version PK1~O@fgread/dataset.pyimport json from pathlib import Path DATASET_INFO_FILE = "dataset_info.json" class DataSet(object): """Represents a dataset on FASTGenomics, including the relative location and the contents of the ``metadata.json`` file. :param path: absolute path to a dataset folder, for example ``/fastgenomics/data/dataset_0001`` """ def __init__(self, path: str): self.path = Path(path) if not self.path.exists(): raise FileNotFoundError(self.path) self.metadata = self.read_metadata() self.format = self.metadata["format"] self.title = self.metadata["title"] self.file = self.path / self.metadata["file"] self.id = int(self.path.name.split("_")[-1]) def read_metadata(self): with open(self.path / DATASET_INFO_FILE) as f: return json.load(f) def __repr__(self): return f""" id: {self.id} title: {self.title} format: {self.format} path: {self.path} """ PK1~Ofgread/read.pyimport re from pathlib import Path from . import readers from .dataset import DataSet DEFAULT_READERS = { "Loom": readers.read_loom_to_anndata, "Seurat Object": readers.read_seurat_to_anndata, "AnnData": readers.read_anndata_to_anndata, "10x (hdf5)": readers.read_10xhdf5_to_anndata, "Drop-Seq (tsv)": readers.read_dropseqtsv_to_anndata, } DATA_DIR = "/fastgenomics/data" def read_dataset(dataset: DataSet, additional_readers={}): """Reads a single dataset. Dispatches to specific readers based on the value of the ``dataset.format``. :param dataset: Object of class :py:class:`~.dataset.DataSet` to be read. :param additional_readers: Used to specify your own readers for the specific data set format. Highly experimental and not tested. :returns: AnnData object containing the loaded dataset. """ format = dataset.format title = dataset.title path = dataset.path readers = {**DEFAULT_READERS, **additional_readers} if format == "Other": raise NotImplementedError( f'The format of the dataset "{title}" is "{format}". Datasets with the "{format}" format are unsupported by this module and have to be loaded manually.' ) elif format == "Not set": raise KeyError( f'The format of the dataset "{title}" was not defined. If you can modify the dataset please specify its format in its Details page, otherwise ask the dataset owner to do that.' ) elif format in readers: print( f'Loading dataset "{title}" in format "{format}" from directory "{path}".' ) adata = readers[format](dataset) adata.uns["metadata"] = dataset.metadata adata.obs["fg_title"] = dataset.title adata.obs["fg_id"] = dataset.id return adata else: raise KeyError(f'Unsupported format "{format}", use one of {readers}') def get_datasets(data_dir=DATA_DIR): """Lists all available datasets. This is a convenience function used to gather all information specified in the FASTGenomics environment. The returned value can be either used to manually load datasets or passed to the :py:func:`read_dataset` or :py:func:`read_datasets` functions. :param data_dir: Specify the main data directory. Useful for testing the module, defaults to the FASTGenomics path ``/fastgenomics/data``. :returns: A dictionary where keys are dataset ids (the ``xxxx`` part of ``/fastgenomics/data/dataset_xxxx``) and values are the corresponding :py:class:`~dataset.DataSet` objects. """ data_dir = Path(data_dir) paths = [ subdir for subdir in data_dir.iterdir() if subdir.is_dir() and re.match(r"^dataset_\d{4}$", subdir.name) ] return {dataset.id: dataset for dataset in map(DataSet, paths)} def print_datasets(data_dir=DATA_DIR): """prints the list of available datasets :param data_dir: Specify the main data directory. Useful for testing the module, defaults to the FASTGenomics path ``/fastgenomics/data``. """ datasets = get_datasets(data_dir=data_dir) for index, ds in datasets.items(): print(f"Dataset: {index}:", ds) print() def read_datasets(datasets=None, additional_readers={}, data_dir=DATA_DIR): """Reads all datasets and returns them as AnnData objects. Internally uses :py:func:`read_dataset` to read the datasets. :param datasets: If specified, read the datasets from this dictionary. Can be useful for e.g. filtering some dataset types. :param additional_readers: Used to specify your own readers for the specific data set format. Highly experimental and not tested. :param data_dir: Specify the main data directory. Only used when ``datasets==None``. Useful for testing the module, defaults to the FASTGenomics path ``/fastgenomics/data``. :returns: A dictionary of dataset objects, where the keys are dataset ids and the values are the corresponding AnnData objects. """ datasets = datasets or get_datasets(data_dir) return { dataset.id: read_dataset(dataset, additional_readers=additional_readers) for dataset in datasets.values() } PK1~ORfgread/readers.pyimport anndata import re import numpy as np import pandas as pd import scipy.sparse as sp from .scanpy_read_10x import read_10x_h5 from .dataset import DataSet def read_loom_to_anndata(dataset: DataSet): """Reads a dataset in the loom format into the AnnData format.""" adata = anndata.read_loom(dataset.file) return adata def read_seurat_to_anndata(dataset: DataSet): """Reads a dataset in the Seurat format into the AnnData format (not implemented).""" raise NotImplementedError("Reading of Seurat files not implemented.") def read_anndata_to_anndata(dataset: DataSet): """Reads a dataset in the AnnData format into the AnnData format.""" adata = anndata.read_h5ad(dataset.file) return adata def read_10xhdf5_to_anndata(dataset: DataSet): """Reads a dataset in the 10x hdf5 format into the AnnData format.""" # todo replace with anndata.read_10x_h5 once read_10x_h5 is moved to anndata (if # ever) adata = read_10x_h5(dataset.file) return adata def read_dropseqtsv_to_anndata(dataset: DataSet): """Reads a dataset in the DropSeq format into the AnnData format.""" file = dataset.file with open(file) as f: cells = f.readline().replace('"', "").split("\t") samples = [re.search("(.*)_", c).group(1) for c in cells] genes = pd.read_csv( file, sep="\t", skiprows=1, usecols=(0,), header=None, names=["GeneID"] ).set_index("GeneID") X = np.loadtxt( file, delimiter="\t", skiprows=1, usecols=range(1, len(cells) + 1), dtype=np.float32, ).T X = sp.csr_matrix(X) var = genes obs = pd.DataFrame( samples, columns=["sample"], index=pd.Series(cells, name="CellID") ) adata = anndata.AnnData(X=X, var=var, obs=obs) return adata PK1~Ofgread/scanpy_read_10x.py"""Taken from scanpy/scanpy/readwrite.py as a temporary fix to the lack of read_10x_h5 in the anndata package. """ from anndata import AnnData from typing import Union, Optional from pathlib import Path import numpy as np import tables def read_10x_h5( filename: Union[str, Path], genome: Optional[str] = None, gex_only: bool = True ) -> AnnData: """\ Read 10x-Genomics-formatted hdf5 file. Parameters ---------- filename Filename. genome Filter expression to this genes within this genome. For legacy 10x h5 files, this must be provided if the data contains more than one genome. gex_only Only keep 'Gene Expression' data and ignore other feature types, e.g. 'Antibody Capture', 'CRISPR Guide Capture', or 'Custom' Returns ------- Annotated data matrix, where obsevations/cells are named by their barcode and variables/genes by gene name. The data matrix is stored in `adata.X`, cell names in `adata.obs_names` and gene names in `adata.var_names`. The gene IDs are stored in `adata.var['gene_ids']`. The feature types are stored in `adata.var['feature_types']` """ with tables.open_file(str(filename), "r") as f: v3 = "/matrix" in f if v3: adata = _read_v3_10x_h5(filename) if genome: if genome not in adata.var["genome"].values: raise ValueError( f"Could not find data corresponding to genome '{genome}' in '{filename}'. " f'Available genomes are: {list(adata.var["genome"].unique())}.' ) adata = adata[:, list(map(lambda x: x == str(genome), adata.var["genome"]))] if gex_only: adata = adata[ :, list(map(lambda x: x == "Gene Expression", adata.var["feature_types"])), ] return adata else: return _read_legacy_10x_h5(filename, genome=genome) def _read_legacy_10x_h5(filename, *, genome=None, start=None): """ Read hdf5 file from Cell Ranger v2 or earlier versions. """ with tables.open_file(str(filename), "r") as f: try: children = [x._v_name for x in f.list_nodes(f.root)] if not genome: if len(children) > 1: raise ValueError( f"'{filename}' contains more than one genome. For legacy 10x h5 " "files you must specify the genome if more than one is present. " f"Available genomes are: {children}" ) genome = children[0] elif genome not in children: raise ValueError( f"Could not find genome '{genome}' in '{filename}'. " f"Available genomes are: {children}" ) dsets = {} for node in f.walk_nodes("/" + genome, "Array"): dsets[node.name] = node.read() # AnnData works with csr matrices # 10x stores the transposed data, so we do the transposition right away from scipy.sparse import csr_matrix M, N = dsets["shape"] data = dsets["data"] if dsets["data"].dtype == np.dtype("int32"): data = dsets["data"].view("float32") data[:] = dsets["data"] matrix = csr_matrix((data, dsets["indices"], dsets["indptr"]), shape=(N, M)) # the csc matrix is automatically the transposed csr matrix # as scanpy expects it, so, no need for a further transpostion adata = AnnData( matrix, dict(obs_names=dsets["barcodes"].astype(str)), dict( var_names=dsets["gene_names"].astype(str), gene_ids=dsets["genes"].astype(str), ), ) return adata except KeyError: raise Exception("File is missing one or more required datasets.") def _read_v3_10x_h5(filename, *, start=None): """ Read hdf5 file from Cell Ranger v3 or later versions. """ with tables.open_file(str(filename), "r") as f: try: dsets = {} for node in f.walk_nodes("/matrix", "Array"): dsets[node.name] = node.read() from scipy.sparse import csr_matrix M, N = dsets["shape"] data = dsets["data"] if dsets["data"].dtype == np.dtype("int32"): data = dsets["data"].view("float32") data[:] = dsets["data"] matrix = csr_matrix((data, dsets["indices"], dsets["indptr"]), shape=(N, M)) adata = AnnData( matrix, dict(obs_names=dsets["barcodes"].astype(str)), dict( var_names=dsets["name"].astype(str), gene_ids=dsets["id"].astype(str), feature_types=dsets["feature_type"].astype(str), genome=dsets["genome"].astype(str), ), ) return adata except KeyError: raise Exception("File is missing one or more required datasets.") PK1~O1--fgread-0.1.4.dist-info/LICENSEMIT License Copyright (c) 2019 FASTGenomics Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. PK!HMuSafgread-0.1.4.dist-info/WHEEL HM K-*ϳR03rOK-J,/RH,szd&Y)r$[)T&UD"PK!H4 fgread-0.1.4.dist-info/METADATARMs0W-vz0B azU_ۑb7q8yFo>_r'74:tL\AF7.yslM tc~:'W VeOVp# (.F#/P?5$g#d~dpbH1Y6lF,D!̽>Ϥnjrc25]%mW#=}'iIױpj>kcm#Ǔc<1/h08szO™&1GBnLѨǸtvx=V LA\}Qzt3ӡ4 4lx^5 η'y}oq]T 77Et6m{z>=e Ԇ e_7PK!Hs2fgread-0.1.4.dist-info/RECORDu͒0< q =$ rPA$>Ԯ[^t髮WfIRVe$FHHeUUC9yNoͽ[ǪսF@E':́v#==b<&[^(qMrKv1"wl@PK1~O`  fgread/__init__.pyPK1~O@;fgread/dataset.pyPK1~Oofgread/read.pyPK1~ORhfgread/readers.pyPK1~Ofgread/scanpy_read_10x.pyPK1~O1--f2fgread-0.1.4.dist-info/LICENSEPK!HMuSa6fgread-0.1.4.dist-info/WHEELPK!H4 \7fgread-0.1.4.dist-info/METADATAPK!Hs2q9fgread-0.1.4.dist-info/RECORDPK o`;