PK&SOYfgread/__init__.py# coding: utf-8 """Module for reading files shared on FASTGenomics""" from .read import list_datasets, read_dataset, read_datasets from get_version import get_version __version__ = get_version(__file__) __author__ = "FASTGenomics" del get_version PK&SO{,ʘ_ _ fgread/read.pyimport re from pathlib import Path import json from . import readers DEFAULT_READERS = { "Loom": readers.read_loom_to_anndata, "Seurat Object": readers.read_seurat_to_anndata, "AnnData": readers.read_anndata_to_anndata, "10x (hdf5)": readers.read_10xhdf5_to_anndata, "Drop-Seq (tsv)": readers.read_dropseqtsv_to_anndata, } DATA_DIR = "/fastgenomics/data" DATASET_INFO_FILE = "dataset_info.json" class DataSet(object): """Represents a data set on FASTGenomics, including the relative location and the contents of the metadata.json file. """ def __init__(self, path): self.path = path if not self.path.exists(): raise FileNotFoundError(filename=self.path) self.metadata = self.read_metadata() self.format = self.metadata["format"] self.title = self.metadata["title"] self.file = self.path / self.metadata["file"] self.id = int(self.path.name.split("_")[-1]) def read_metadata(self): with open(self.path / DATASET_INFO_FILE) as f: return json.load(f) def __repr__(self): return "\n".join( [ f"id: {self.id}", f"title: {self.title}", f"format: {self.format}", f"path: {self.path}", ] ) def read_dataset(dataset: DataSet, additional_readers={}): """Reads a single data set. Dispatches to specific readers based on the contents of the `dataset.format`. """ format = dataset.format title = dataset.title path = dataset.path readers = {**DEFAULT_READERS, **additional_readers} if format == "Other": raise NotImplementedError( f'The format of the data set "{title}" is "{format}". Data sets with the "{format}" format are unsupported by this module and have to be loaded manually.' ) elif format == "Not set": raise KeyError( f'The format of the data set "{title}" was not defined. If you can modify the data set please specify its format in its Details page, otherwise ask the data set owner to do that.' ) elif format in readers: print( f'Loading data set "{title}" in format "{format}" from directory "{path}".' ) adata = readers[format](dataset) adata.uns["metadata"] = dataset.metadata adata.var["fg_title"] = dataset.title adata.var["fg_id"] = dataset.id return adata else: raise KeyError(f'Unsupported format "{format}", use one of {readers}') def list_datasets(data_dir=DATA_DIR): """Lists available data sets.""" data_dir = Path(data_dir) paths = [ f for f in data_dir.iterdir() if f.is_dir() and re.match(r"^dataset_\d{4}$", f.name) ] return {dataset.id: dataset for dataset in map(DataSet, paths)} def read_datasets(datasets=None, additional_readers={}, data_dir=DATA_DIR): """Reads all data sets.""" datasets = datasets or list_datasets(data_dir) return { id: read_dataset(dataset, additional_readers=additional_readers) for id, dataset in datasets.items() } PK&SO.rYYfgread/readers.pyimport anndata import re import numpy as np import pandas as pd import scipy.sparse as sp from .scanpy_read_10x import read_10x_h5 def read_loom_to_anndata(dataset): """Reads a data set in the loom format.""" adata = anndata.read_loom(dataset.file) return adata def read_seurat_to_anndata(dataset): """Reads a data set in the Seurat format (not implemented).""" raise NotImplementedError("Reading of Seurat files not implemented.") def read_anndata_to_anndata(dataset): """Reads a data set in the AnnData format.""" adata = anndata.read_h5ad(dataset.file) return adata def read_10xhdf5_to_anndata(dataset): """Reads a data set in the 10x hdf5 format.""" # todo replace with anndata.read_10x_h5 once read_10x_h5 is moved to anndata (if # ever) adata = read_10x_h5(dataset.file) return adata def read_dropseqtsv_to_anndata(dataset): """Reads a data set in the DropSeq format.""" file = dataset.file with open(file) as f: cells = f.readline().replace('"', "").split("\t") samples = [re.search("(.*)_", c).group(1) for c in cells] genes = pd.read_csv( file, sep="\t", skiprows=1, usecols=(0,), header=None, names=["GeneID"] ).set_index("GeneID") X = np.loadtxt( file, delimiter="\t", skiprows=1, usecols=range(1, len(cells) + 1), dtype=np.float32, ).T X = sp.csr_matrix(X) var = genes obs = pd.DataFrame( samples, columns=["sample"], index=pd.Series(cells, name="CellID") ) adata = anndata.AnnData(X=X, var=var, obs=obs) return adata PK&SOfgread/scanpy_read_10x.py"""Taken from scanpy/scanpy/readwrite.py as a temporary fix to the lack of read_10x_h5 in the anndata package. """ from anndata import AnnData from typing import Union, Optional from pathlib import Path import numpy as np import tables def read_10x_h5( filename: Union[str, Path], genome: Optional[str] = None, gex_only: bool = True ) -> AnnData: """\ Read 10x-Genomics-formatted hdf5 file. Parameters ---------- filename Filename. genome Filter expression to this genes within this genome. For legacy 10x h5 files, this must be provided if the data contains more than one genome. gex_only Only keep 'Gene Expression' data and ignore other feature types, e.g. 'Antibody Capture', 'CRISPR Guide Capture', or 'Custom' Returns ------- Annotated data matrix, where obsevations/cells are named by their barcode and variables/genes by gene name. The data matrix is stored in `adata.X`, cell names in `adata.obs_names` and gene names in `adata.var_names`. The gene IDs are stored in `adata.var['gene_ids']`. The feature types are stored in `adata.var['feature_types']` """ with tables.open_file(str(filename), "r") as f: v3 = "/matrix" in f if v3: adata = _read_v3_10x_h5(filename) if genome: if genome not in adata.var["genome"].values: raise ValueError( f"Could not find data corresponding to genome '{genome}' in '{filename}'. " f'Available genomes are: {list(adata.var["genome"].unique())}.' ) adata = adata[:, list(map(lambda x: x == str(genome), adata.var["genome"]))] if gex_only: adata = adata[ :, list(map(lambda x: x == "Gene Expression", adata.var["feature_types"])), ] return adata else: return _read_legacy_10x_h5(filename, genome=genome) def _read_legacy_10x_h5(filename, *, genome=None, start=None): """ Read hdf5 file from Cell Ranger v2 or earlier versions. """ with tables.open_file(str(filename), "r") as f: try: children = [x._v_name for x in f.list_nodes(f.root)] if not genome: if len(children) > 1: raise ValueError( f"'{filename}' contains more than one genome. For legacy 10x h5 " "files you must specify the genome if more than one is present. " f"Available genomes are: {children}" ) genome = children[0] elif genome not in children: raise ValueError( f"Could not find genome '{genome}' in '{filename}'. " f"Available genomes are: {children}" ) dsets = {} for node in f.walk_nodes("/" + genome, "Array"): dsets[node.name] = node.read() # AnnData works with csr matrices # 10x stores the transposed data, so we do the transposition right away from scipy.sparse import csr_matrix M, N = dsets["shape"] data = dsets["data"] if dsets["data"].dtype == np.dtype("int32"): data = dsets["data"].view("float32") data[:] = dsets["data"] matrix = csr_matrix((data, dsets["indices"], dsets["indptr"]), shape=(N, M)) # the csc matrix is automatically the transposed csr matrix # as scanpy expects it, so, no need for a further transpostion adata = AnnData( matrix, dict(obs_names=dsets["barcodes"].astype(str)), dict( var_names=dsets["gene_names"].astype(str), gene_ids=dsets["genes"].astype(str), ), ) return adata except KeyError: raise Exception("File is missing one or more required datasets.") def _read_v3_10x_h5(filename, *, start=None): """ Read hdf5 file from Cell Ranger v3 or later versions. """ with tables.open_file(str(filename), "r") as f: try: dsets = {} for node in f.walk_nodes("/matrix", "Array"): dsets[node.name] = node.read() from scipy.sparse import csr_matrix M, N = dsets["shape"] data = dsets["data"] if dsets["data"].dtype == np.dtype("int32"): data = dsets["data"].view("float32") data[:] = dsets["data"] matrix = csr_matrix((data, dsets["indices"], dsets["indptr"]), shape=(N, M)) adata = AnnData( matrix, dict(obs_names=dsets["barcodes"].astype(str)), dict( var_names=dsets["name"].astype(str), gene_ids=dsets["id"].astype(str), feature_types=dsets["feature_type"].astype(str), genome=dsets["genome"].astype(str), ), ) return adata except KeyError: raise Exception("File is missing one or more required datasets.") PK!HMuSafgread-0.1.1.dist-info/WHEEL HM K-*ϳR03rOK-J,/RH,szd&Y)r$[)T&UD"PK!H`#fgread-0.1.1.dist-info/METADATARMs0W-vz0B azU_ۑb7q8yFo>_r'74:tL\AF7.ylM tc~:'W VeOVp# (.F#/P?5$g#d~dpbH1Y6lF,D!̽>Ϥnjrc25]%mW#=}'iIױp1ʶ14u9='dV}#!nghTHc\ ;}N+kYg@gak.J`>[^(=:HPLud6it.\n]HG:JJͶ= Z]e܃nyjÅ/ePK!HVfgread-0.1.1.dist-info/RECORDuMO0񻟥U+/u&#. 2* )e]̓~Pd;+Y * 2CzXrA+91Z_VNb0hTGy="M2at#~\) S1чjlJ*>t?K\wi;۽g+|ڤ޴k8W2|Mr*qjN䘬Awˮ2>'8=B"y&?J'f7No[})mgZU&ݜIl'BޮXIޱ?_PTavmwK>P8au/1HL.jVx