PK!+U00 diamond_add_taxonomy/__init__.py__version__ = '0.1.0' __all__ = ['taxid_utils'] PK!ssdiamond_add_taxonomy/cli.pyimport csv from typing import Optional, TextIO import re import sys import click from .taxid_utils import TaxIDExpander @click.command(short_help='Annotate diamond output with taxonomy names') @click.option('--taxdump_filename', type=click.Path(exists=True, file_okay=True), help='Path to local copy of NCBI taxdump.tar.gz file') @click.option('--taxdb_filename', type=click.Path(), help='Name for the processed database, will be loaded if it exists') @click.option('--diamond_output_format', default='6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore slen qlen qcovhsp stitle staxids', show_default=True, help='Output format used by DIAMOND (most include staxids)') @click.option('--output_file', type=click.File('w'), default=sys.stdout, help='Output file to write output with expanded taxonomy information (TSV format)') @click.argument('diamond_output_file', type=click.File()) def annotate_diamond(diamond_output_file: TextIO, diamond_output_format: str, output_file: TextIO, taxdump_filename: Optional[str] = None, taxdb_filename: Optional[str] = None): """annotate_diamond - add lineage info to DIAMOND output file that includes staxids A new output file is created with 7 extra columns on the right hand side that contain the standard ranks superkingdom, phylym, class, order, family, genus and species corresponding to the NCBI taxid in the staxids column. The taxonomy lookup is performed using the NCBI taxonomy database via ete3 NCBITaxa. If either a saved copy of the taxdump.tar.gz file or the sqlite3 db generated by NCBITaxa is available these can be provided to reduce network usage and speed up processing. \b Args: diamond_output_file(file) - file containing output from DIAMOND diamond_ouput_format(str) - format used for --outfmt with DIAMOND, must contain staxids field output_file(file) - file to write output to (default is sys.stdout) taxdump_filename(str) - path to NCBI taxdump.tar.gz file for the taxonomy resolver (optional) taxdb_filename(str) - path to a sqlite3 db created from NCBI taxdump.tar.gz by ete3 NCBITaxa""" taxid_re = re.compile(r'^(\d+?)*$') annotater = TaxIDExpander(taxdb_filename=taxdb_filename, taxdump_filename=taxdump_filename) assert 'staxids' in diamond_output_format, "The DIAMOND output format must include the staxids column" taxid_column = diamond_output_format.split().index('staxids') - 1 # the column position, minus 1 to ignore '6' output = csv.writer(output_file, delimiter='\t') for row in csv.reader(diamond_output_file, delimiter='\t'): try: taxid = row[taxid_column] except IndexError: sys.exit(f"Caught an IndexError trying to retrieve taxon info from column {taxid_column}, did you use the correct --diamond_output_format?") if taxid_re.match(taxid) is None: sys.exit(f"Taxid f{taxid} does not match expected format taxon id format, did you use the correct --diamond_output_format?") if taxid == '': # this entry is missing taxonomy id info lineage_info = [('', 'UNKNOWN')] * 7 elif ';' in taxid: # this is an entry from multiples taxons, no clean way to handle that lineage_info = [('', 'UNKNOWN/MULTIPLE')] * 7 else: taxid = int(taxid) lineage_info_std = annotater.get_lineage(taxid, only_standard_ranks=True) # we only include the 7 standard ranks lineage_dict = dict(lineage_info_std) lineage_info = [] for rank in ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']: lineage_info.append((rank, lineage_dict.get(rank, 'UNKNOWN'))) output_row = row + [taxon_info[1] for taxon_info in lineage_info] output.writerow(output_row) if diamond_output_file != sys.stdout: diamond_output_file.close() if __name__ == '__main__': annotate_diamond() # pylint: disable=E1120 PK!V'P! ! #diamond_add_taxonomy/taxid_utils.pyfrom pathlib import Path from typing import Optional, List, Tuple from ete3 import NCBITaxa class TaxIDExpander(object): def __init__(self, taxdump_filename: str = None, taxdb_filename: str = None) -> 'TaxIDExpander': """Constructor for TaxIDExpander Args: taxdump_filename(str): if specified, refers to a local copy of the NCBI taxdump.tar.gz file taxdb_filename(str): if specified will be used to look for a db containing the NCBI database to load. if both taxdump_filename and taxdb_filename are set, save to taxdb_filename """ if taxdump_filename is not None: taxdump_path = Path(taxdump_filename) if not (taxdump_path.exists() and taxdump_path.is_file()): raise ValueError(f'{taxdump_filename} must be a readable file') if taxdb_filename is not None: # we have both a taxdump file and a taxdb file # this means we load from taxdump file and save to taxdb file self.ncbi = NCBITaxa(taxdump_file=taxdump_filename, dbfile=taxdb_filename) else: # we have a taxdump file and no taxdb file # this means we load from the taxdump file and let ete3 save to its default location self.ncbi = NCBITaxa(taxdump_file=taxdump_filename) else: if taxdb_filename is not None: # we have a taxdb file and no taxdump file # this means we load the database from the taxdb file taxdb_path = Path(taxdb_filename) if not (taxdb_path.exists() and taxdb_path.is_file()): raise ValueError(f'{taxdb_filename} must be a readable file') self.ncbi = NCBITaxa(dbfile=taxdb_filename) else: # we have neither a taxdump file nor a taxdb file # this means ete3 loads the database over the network (and cache in local directory) # and let ete3 save the taxdb to its default location self.ncbi = NCBITaxa() def get_lineage(self, taxid: str, only_standard_ranks: Optional[bool] = False) -> List[Tuple[str, str]]: """Return lineage for a given taxonomy ID Raises ValueError if taxonomy ID is not found. Args: taxid(str): NCBI taxonomy ID only_standard_ranks(bool): if True only return superkingdom, phylum, class, order, family, genus and species ranks Returns: list of tuples where the tuples have members (taxon rank, taxon name)""" lineage_ids = self.ncbi.get_lineage(taxid) names = self.ncbi.get_taxid_translator(lineage_ids) ranks = self.ncbi.get_rank(lineage_ids) standard_ranks = set(['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']) lineage = [] for id in lineage_ids: rank = ranks[id] if only_standard_ranks and rank not in standard_ranks: continue lineage.append((ranks[id], names[id])) return lineage def get_scientific_name(self, taxid: str): results = self.ncbi.translate_to_names([taxid]) if not results: return 'UNKNOWN' else: return results[0] PK!H)pȥ=R5diamond_add_taxonomy-0.1.1.dist-info/entry_points.txtN+I/N.,()JLKOLI/Iϭ&i_XUPK!3D)),diamond_add_taxonomy-0.1.1.dist-info/LICENSEMIT License Copyright (c) 2018 pvanheus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. PK!H\TT*diamond_add_taxonomy-0.1.1.dist-info/WHEEL 1 0 нR \I$ơ7.ZON `h6oi14m,b4>4ɛpK>X;baP>PK!Hwc-diamond_add_taxonomy-0.1.1.dist-info/METADATAIO0sRQdD"uSYh !^.z( ԛ޼7cOcNhJPZ1EqKX?hwni|'ђbe7zeӬXeϬQUԸ>cm"rR.(]4j|-ytp}%&`եC sђ^aŋ]œ);OX]x̀sf025=IVl3AU6a̖tjO<;=X1q,"K3tai ۶>D[ $y %v=UPK!Hkӓ+diamond_add_taxonomy-0.1.1.dist-info/RECORD=0~ e[ *2dą(௿6laTChpFpJoO)e"U?+/VA Sb9DRTp.U^Un~W٬ }g_Ц^dNi{b(n>%*hiN-qQ99ì;e_^WQd%M3^ee#2e9<1ю)D 9 CYN7/-Ɲ~ ;@ޜ){j{{w:7>ky]a >x6}5 $E{Vh~LΛSD u؆L=Gz ȐҼOb3~=KdH#ƾd|^P>PK!+U00 diamond_add_taxonomy/__init__.pyPK!ssndiamond_add_taxonomy/cli.pyPK!V'P! ! #diamond_add_taxonomy/taxid_utils.pyPK!H)pȥ=R5|diamond_add_taxonomy-0.1.1.dist-info/entry_points.txtPK!3D)), diamond_add_taxonomy-0.1.1.dist-info/LICENSEPK!H\TT*#diamond_add_taxonomy-0.1.1.dist-info/WHEELPK!Hwc-$diamond_add_taxonomy-0.1.1.dist-info/METADATAPK!Hkӓ+%diamond_add_taxonomy-0.1.1.dist-info/RECORDPK'