PK!NNgcgc/__init__.py# (c) Copyright 2018 Trent Hauck # All Rights Reserved __version__ = "0.2.2" PK!AJ0gcgc/alphabet/__init__.py# (c) Copyright 2018 Trent Hauck # All Rights Reserved from gcgc.alphabet.iupac import ExtendedIUPACProteinEncoding from gcgc.alphabet.iupac import ExtendedIUPACDNAEncoding from gcgc.alphabet.iupac import IUPACUnambiguousDNAEncoding from gcgc.alphabet.iupac import IUPACUnambiguousRNAEncoding from gcgc.alphabet.iupac import IUPACAmbiguousDNAEncoding from gcgc.alphabet.iupac import IUPACAmbiguousRNAEncoding from gcgc.alphabet.iupac import IUPACProteinEncoding __all__ = [ "ExtendedIUPACProteinEncoding", "ExtendedIUPACDNAEncoding", "IUPACUnambiguousDNAEncoding", "IUPACUnambiguousRNAEncoding", "IUPACAmbiguousDNAEncoding", "IUPACAmbiguousRNAEncoding", "IUPACProteinEncoding", ] PK!&gcgc/alphabet/base.py# (c) Copyright 2018 Trent Hauck # All Rights Reserved from typing import Sequence from abc import ABC class EncodingAlphabet(ABC): """ The Encoding Alphabet is meant to be a baseclass for other alphabets. This is similar to vocabularies often found in NLP packages, though the extent of the vocabulary is known upfront. """ START = ">" END = "<" PADDING = "|" def __init__(self): """ Create the EncodingAlphabet object. """ self.letters_and_tokens = self.letters + self.START + self.END + self.PADDING self.encoding_index = {letter: idx for idx, letter in enumerate(self.letters_and_tokens)} self.decoding_index = {idx: letter for letter, idx in self.encoding_index.items()} def encode_token(self, token: str) -> int: """ Given a particular token, return the integer representation. Args: token: The token to convert. Returns: The integer representing the token. """ return self.encoding_index[token] def decode_token(self, int_token: int) -> str: """ Decode a token. This is the inverse of encode_token. Args: int_token: The integer representation of a token. Returns: The str which was encoded by the integer. """ return self.decoding_index[int_token] def integer_encode(self, seq: str) -> Sequence[int]: """ Integer encode the sequence. Args: seq: The sequence to pad. Returns: The integer sequence representation of the sequence. """ return [self.encoding_index[s] for s in seq] def integer_decode(self, int_seq: Sequence[int]) -> str: """ Given a sequence of integers, convert it to a string. Args: int_seq: A sequence of integers to convert. Returns: The string version of the integers list. """ return "".join(self.decode_token(s) for s in int_seq) PK!hd))gcgc/alphabet/iupac.py# (c) Copyright 2018 Trent Hauck # All Rights Reserved from Bio.Alphabet import IUPAC from gcgc.alphabet.base import EncodingAlphabet class ExtendedIUPACProteinEncoding(EncodingAlphabet, IUPAC.ExtendedIUPACProtein): """ Implements an encoding alphabet using the IUPAC extended protein letters. """ class ExtendedIUPACDNAEncoding(EncodingAlphabet, IUPAC.ExtendedIUPACDNA): """ Implements an encoding alphabet using the IUPAC extended dna letters. """ class IUPACUnambiguousDNAEncoding(EncodingAlphabet, IUPAC.IUPACUnambiguousDNA): """ Implements an encoding alphabet using the IUPAC unambiguous dna letters. """ class IUPACUnambiguousRNAEncoding(EncodingAlphabet, IUPAC.IUPACUnambiguousRNA): """ Implements an encoding alphabet using the IUPAC unambiguous rna letters. """ class IUPACAmbiguousDNAEncoding(EncodingAlphabet, IUPAC.IUPACAmbiguousDNA): """ Implements an encoding alphabet using the IUPAC unambiguous dna letters. """ class IUPACAmbiguousRNAEncoding(EncodingAlphabet, IUPAC.IUPACAmbiguousRNA): """ Implements an encoding alphabet using the IUPAC unambiguous rna letters. """ class IUPACProteinEncoding(EncodingAlphabet, IUPAC.IUPACProtein): """ Implements an encoding alphabet using the IUPAC protein letters. """ PK!տ gcgc/alphabet/utils.py# (c) Copyright 2018 Trent Hauck # All Rights Reserved from gcgc import alphabet from gcgc.exceptions import GCGCAlphabetException def biopython_alphabet_to_gcgc_alphabet(biopython_alphabet_instance): """ Args: biopython_alphabet_instance """ # Might also try just creating the type on the fly. for gcgc_alphabet in alphabet.__all__: klass = getattr(alphabet, gcgc_alphabet) instance = klass() if isinstance(instance, type(biopython_alphabet_instance)): return instance msg = f"No instance of {type(biopython_alphabet_instance)} among {alphabet.__all__}." raise GCGCAlphabetException(msg) PK!zh%{{ gcgc/cli.py# (c) Copyright 2018 Trent Hauck # All Rights Reserved import sys import click import pathlib import logging from Bio import SeqIO import tensorflow as tf from gcgc import __version__ from gcgc.encoded_seq import EncodedSeq from gcgc.third_party.tensorflow_utils import record logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @click.group() def main(args=None): """Console script for gcgc.""" return 0 @main.command() def version(): print(__version__) def to_path(ctx, param, value) -> pathlib.Path: """ Simple callback to convert the string in the option to a command. """ return pathlib.Path(value) @main.command() @click.argument("filename", callback=to_path) @click.argument("format") @click.option("-e", "--encapsulate", is_flag=True) @click.option("-c", "--conform-to-length", "conform_to", type=int) @click.option("-p", "--pad-to-length", "pad_to", type=int) def convert_file_to_tf_records(filename, format, encapsulate, conform_to, pad_to): output_file = filename.with_suffix(".tf-records") logger.info(f"Reading from {filename} in format {format} and writing to {output_file}.") writer = tf.python_io.TFRecordWriter(str(output_file)) try: with open(filename, "rU") as handle: for seq_record in SeqIO.parse(handle, format): encoded_seq = EncodedSeq.from_seq(seq_record.seq) if encapsulate: encoded_seq = encoded_seq.encapsulate() if conform_to: encoded_seq = encoded_seq.conform(conform_to) if pad_to: encoded_seq = encoded_seq.pad(pad_to) example = record.to_tensorflow_record(encoded_seq) writer.write(example.SerializeToString()) finally: writer.close() if __name__ == "__main__": sys.exit(main()) # pragma: no cover PK!Bgcgc/encoded_seq/__init__.py# (c) Copyright 2018 Trent Hauck # All Rights Reserved from gcgc.encoded_seq.encoded_seq import EncodedSeq __all__ = ["EncodedSeq"] PK!Sq gcgc/encoded_seq/encoded_seq.py# (c) Copyright 2018 Trent Hauck # All Rights Reserved from typing import Sequence from Bio.Seq import Seq import numpy as np from gcgc.alphabet.base import EncodingAlphabet from gcgc.exceptions import GCGCAlphabetException from gcgc.alphabet.utils import biopython_alphabet_to_gcgc_alphabet class EncodedSeq(Seq): def __init__(self, data, alphabet) -> None: if not isinstance(alphabet, EncodingAlphabet): raise GCGCAlphabetException(f"Cannot use alphabet of type {type(alphabet)}.") super().__init__(data, alphabet) @classmethod def from_seq(cls, seq: Seq): """ Instantiate an EncodedSeq object from a Seq object. """ gcgc_alphabet = biopython_alphabet_to_gcgc_alphabet(seq.alphabet) return cls(str(seq), gcgc_alphabet) def encapsulate(self) -> "EncodedSeq": return self.alphabet.START + self + self.alphabet.END def pad(self, pad_to: int = 50) -> "EncodedSeq": """ Pad a sequence up to `pad_to` characters. """ seq_len = len(self) if seq_len < pad_to: n_extra_chars = pad_to - seq_len extra_chars = self.alphabet.PADDING * n_extra_chars else: extra_chars = "" return self + extra_chars def conform(self, conform_to: int = 50) -> "EncodedSeq": seq_len = len(self) if seq_len == conform_to: return self elif seq_len < conform_to: return self.pad(pad_to=conform_to) else: return self[:conform_to] @property def integer_encoded(self): return self.alphabet.integer_encode(self) @property def one_hot_encode_sequence(self) -> Sequence[Sequence[int]]: """ Encodes D x N where D is the size of the alphabet and N is the padding. Returns: A one hot encoded matrix representing the sequence. """ encoded_sequence = self.alphabet.integer_encode(self) encoded_len = len(encoded_sequence) letters_len = len(self.alphabet.letters_and_tokens) one_hot_seq = np.zeros((encoded_len, letters_len), dtype=np.int) one_hot_seq[np.arange(encoded_len), encoded_sequence] = 1 return one_hot_seq.tolist() def __add__(self, other) -> "EncodedSeq": """ Add two enccoded sequences together. """ added_seq = super().__add__(other) return self.from_seq(added_seq) def __getitem__(self, index) -> "EncodedSeq": got_item = super().__getitem__(index) if isinstance(index, int): return got_item else: return self.from_seq(got_item) PK!@jjgcgc/exceptions.py# (c) Copyright 2018 Trent Hauck # All Rights Reserved class GCGCAlphabetException(Exception): pass PK!j.77gcgc/tests/__init__.py# (c) Copyright 2018 Trent Hauck # All Rights Reserved PK!gcgc/tests/alphabet/__init__.pyPK!!gcgc/tests/alphabet/test_utils.py# (c) Copyright 2018 Trent Hauck # All Rights Reserved import unittest from Bio.Alphabet import IUPAC from gcgc.alphabet.utils import biopython_alphabet_to_gcgc_alphabet from gcgc.alphabet import IUPACUnambiguousDNAEncoding class TestUtils(unittest.TestCase): def test_biopython_alphabet_to_gcgc_alphabet(self): dna = IUPAC.IUPACUnambiguousDNA() klass = biopython_alphabet_to_gcgc_alphabet(dna) self.assertIsInstance(klass, IUPACUnambiguousDNAEncoding) PK!gcgc/tests/cli/__init__.pyPK! vgcgc/tests/cli/test_cli.py# (c) Copyright 2018 Trent Hauck # All Rights Reserved import unittest from click.testing import CliRunner from pathlib import Path from Bio.SeqRecord import SeqRecord from Bio import SeqIO from gcgc.encoded_seq import EncodedSeq from gcgc.alphabet.iupac import ExtendedIUPACDNAEncoding from gcgc import cli from gcgc.tests.fixtures import P53_HUMAN class TestGcgc(unittest.TestCase): """Tests for `gcgc` package.""" def setUp(self): """Set up test fixtures, if any.""" def tearDown(self): """Tear down test fixtures, if any.""" def test_command_line_interface(self): """Test the CLI.""" runner = CliRunner() result = runner.invoke(cli.main) assert result.exit_code == 0 help_result = runner.invoke(cli.main, ["--help"]) assert help_result.exit_code == 0 assert "--help Show this message and exit." in help_result.output @unittest.skip("Test failing as invoke isn't writing the file for some reason.") def test_convert_to_tf_records(self): runner = CliRunner() fmt = P53_HUMAN.suffix.lstrip(".") runner.invoke(cli.convert_file_to_tf_records, str(P53_HUMAN), fmt) PK!gcgc/tests/cli.pyPK!"gcgc/tests/encoded_seq/__init__.pyPK! . *gcgc/tests/encoded_seq/test_encoded_seq.py# (c) Copyright 2018 Trent Hauck # All Rights Reserved import unittest from numpy.testing import assert_array_equal from Bio.Seq import Seq from Bio.Alphabet import IUPAC from gcgc.encoded_seq import EncodedSeq from gcgc.alphabet.iupac import ExtendedIUPACDNAEncoding from gcgc.exceptions import GCGCAlphabetException class TestEncodedSeq(unittest.TestCase): def test_raises_for_bad_alphabet(self): with self.assertRaises(GCGCAlphabetException): EncodedSeq("ATCG", 1) def test_pad_to(self): pad_to = 10 es = EncodedSeq("ATCG", ExtendedIUPACDNAEncoding()).pad(pad_to) self.assertEqual(len(es), pad_to) def test_pad_to_over(self): pad_to = 2 es = EncodedSeq("ATCG", ExtendedIUPACDNAEncoding()) new_es = es.pad(pad_to) self.assertEqual(len(new_es), len(es)) def test_encapsulate(self): es = EncodedSeq("ATCG", ExtendedIUPACDNAEncoding()) new_es = es.encapsulate() self.assertEqual(new_es[0], new_es.alphabet.START) self.assertEqual(new_es[-1], new_es.alphabet.END) def test_conform(self): length = 5 es = EncodedSeq("A", ExtendedIUPACDNAEncoding()) new_es = es.encapsulate().conform(length) self.assertEqual(len(new_es), length) self.assertIsInstance(new_es, EncodedSeq) es = EncodedSeq("ATCGGCG", ExtendedIUPACDNAEncoding()) new_es = es.encapsulate().conform(length) self.assertEqual(len(new_es), length) self.assertIsInstance(new_es, EncodedSeq) es = EncodedSeq("ATC", ExtendedIUPACDNAEncoding()) new_es = es.encapsulate().conform(length) self.assertEqual(len(new_es), length) self.assertIsInstance(new_es, EncodedSeq) def test_one_hot_encoding(self): expected = [ [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], ] es = EncodedSeq("ATCG", ExtendedIUPACDNAEncoding()) assert_array_equal(es.one_hot_encode_sequence, expected) def test_from_seq_bad_alphabet(self): seq = Seq("ATCG", None) with self.assertRaises(GCGCAlphabetException): EncodedSeq.from_seq(seq) def test_from_seq(self): alphabet = IUPAC.IUPACUnambiguousDNA() seq = Seq("ATCG", alphabet) encoded_seq = EncodedSeq.from_seq(seq) self.assertEqual(seq, encoded_seq) PK!i+ޞgcgc/tests/fixtures/__init__.py# (c) Copyright 2018 Trent Hauck # All Rights Reserved import pathlib _PATH = pathlib.Path(__file__).parent P53_HUMAN = _PATH / 'p53_human/p53_human.fasta' PK!S9-gcgc/tests/fixtures/p53_human/p53_human.fasta>sp|P04637|P53_HUMAN Cellular tumor antigen p53 OS=Homo sapiens OX=9606 GN=TP53 PE=1 SV=4 MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGP DEAPRMPEAAPPVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAK SVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHE RCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNS SCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELP PGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAGKEPG GSRAHSSHLKSKKGQSTSRHKKLMFKTEGPDSD PK!IM((2gcgc/tests/fixtures/p53_human/p53_human.tf_recordslo  m alphabet_lettersY W A C D E F G H I K L M N P Q R S T V W Y B X Z J U O > < |  integer_encoded                                                                          ĐͼPK!תrgcgc/tests/test_gcgc.py#!/usr/bin/env python # -*- coding: utf-8 -*- """Tests for `gcgc` package.""" import unittest from click.testing import CliRunner from gcgc import cli class TestGcgc(unittest.TestCase): """Tests for `gcgc` package.""" def setUp(self): """Set up test fixtures, if any.""" def tearDown(self): """Tear down test fixtures, if any.""" def test_000_something(self): """Test something.""" def test_command_line_interface(self): """Test the CLI.""" runner = CliRunner() result = runner.invoke(cli.main) assert result.exit_code == 0 help_result = runner.invoke(cli.main, ["--help"]) assert help_result.exit_code == 0 assert "--help Show this message and exit." in help_result.output PK!3gcgc/tests/third_party/tensorflow_utils/__init__.pyPK!g;gcgc/tests/third_party/tensorflow_utils/test_gcgc_record.py# (c) Copyright 2018 Trent Hauck # All Rights Reserved import unittest from Bio import SeqIO import tensorflow as tf from gcgc.encoded_seq import EncodedSeq from gcgc.third_party.tensorflow_utils import record as gcgc_record from gcgc.tests.fixtures import P53_HUMAN class TestTFRecordSerialization(unittest.TestCase): def setUp(self): pass def tearDown(self): pass def test_read_tf_records(self): with open(P53_HUMAN) as f: sr = SeqIO.read(f, "fasta") encoded_seq = EncodedSeq.from_seq(sr.seq) filenames = [str(P53_HUMAN.with_suffix(".tf_records"))] dataset = tf.data.TFRecordDataset(filenames) dataset = dataset.map(gcgc_record.from_tensorflow_example) dataset_iterator = dataset.make_one_shot_iterator() next_item = dataset_iterator.get_next() session = tf.Session() seq = session.run(next_item) self.assertEqual(encoded_seq.integer_encoded, seq.integer_encoded.tolist()) PK!gcgc/third_party/__init__.pyPK!-gcgc/third_party/tensorflow_utils/__init__.pyPK!ν{BB+gcgc/third_party/tensorflow_utils/record.py# (c) Copyright 2018 Trent Hauck # All Rights Reserved from typing import NamedTuple import tensorflow as tf import numpy as np def to_tensorflow_record(encoded_seq) -> tf.train.Example: """ Convert the sequence to a tensorflow record. """ integer_encoded = tf.train.Feature( int64_list=tf.train.Int64List(value=encoded_seq.integer_encoded) ) alphabet_letters = tf.train.Feature( bytes_list=tf.train.BytesList( value=[m.encode("utf-8") for m in encoded_seq.alphabet.letters_and_tokens] ) ) feature_arg = {"integer_encoded": integer_encoded, "alphabet_letters": alphabet_letters} example = tf.train.Example(features=tf.train.Features(feature=feature_arg)) return example # TODO: Should this be SequenceRecordExample?? I.e. probably want to have the actual id of the # sequence for later use. class ParsedEncodedSequence(NamedTuple): integer_encoded: np.ndarray alphabet_letters: np.ndarray def from_tensorflow_example(example: tf.train.Example) -> ParsedEncodedSequence: """ Given the example return an encoded sequence with the integer encoding and alphabet. """ features = { "integer_encoded": tf.FixedLenSequenceFeature((), tf.int64, allow_missing=True), "alphabet_letters": tf.FixedLenSequenceFeature((), tf.string, allow_missing=True), } parsed_features = tf.parse_single_example(example, features) return ParsedEncodedSequence( integer_encoded=parsed_features["integer_encoded"], alphabet_letters=parsed_features["alphabet_letters"], ) PK!HX%&%gcgc-0.2.2.dist-info/entry_points.txtN+I/N.,()JONOz9Vy\\PK!oB..gcgc-0.2.2.dist-info/LICENSEMIT License Copyright (c) 2018, Trent Hauck Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. PK!HlUTgcgc-0.2.2.dist-info/WHEEL HM K-*ϳR03rOK-J,/R(O-)T0343 /, (-JLR()*M IL*4KM̫PK!H| gcgc-0.2.2.dist-info/METADATAPKO1WE!ih0D\Pv5{`d=f[J3PВ%dxd t:PS`:dl}+-@$~C6Nז\l-5򩺩^*Ԓ\\5i烀ǐ-F~aYF@jCݵ4޲VjdG3g39VS ^`N59 lm7lδPK!HVx= gcgc-0.2.2.dist-info/RECORDGF[`L,@ H҆BE@ޚyƚg؋W.6ݽNsϭ&O(*r Eԟf Aҏ֥ ]T~rqB6&">ϛB6ag77Y/齈f_Z6(xG>԰\3$ИZvs.']qvj O>>s$؞ģbn,]$3)$̭H8s]ădzk%GK;`,2enHw%`+F$z%5FN ÇDBqnVaJrICܗ~N ~DWzz{TuzD>F}