PKQMBgHdogma/__init__.py"Collection of objects related by the central dogma of biology." __version__ = '0.0.1' name = 'dogma' from dogma.nbci import ( NBCI_GENETIC_CODE_NAMES, NCBI_NUCLEOTIDES, NCBI_CODONS, NCBI_AMINO_ACIDS, ) from dogma.utils import ( DEFAULT_DECIMAL_PRECISION, STANDARD_NUCLEOTIDES, DEGENERATE_NUCLEOTIDE_CODE, DEGENERATE_NUCLEOTIDES, DEGENERATE_NUCLEOTIDE_CODE_REVERSED, DEGENERATE_NUCLEOTIDE_CODE_COMPOSITION, NUCLEOTIDE_BASE_PAIRS, DEGENERATE_NUCLEOTIDE_PAIRS, DEFAULT_NUCLEOTIDE_LABEL, DEFAULT_OLIGONUCLEOTIDE_LABEL, STANDARD_CODONS, DEGENERATE_CODONS, DEFAULT_CODON_LABEL, STANDARD_AMINO_ACIDS, DEFAULT_AMINO_ACID_LABEL, STOP_LABEL, DEFAULT_RESIDUE_LABEL, rescale, get_frequency_dictionary, get_random_oligonucleotide ) from dogma.genetic_codes import ( GeneticCode, DEFAULT_GENETIC_CODE, translate) from dogma.nucleotides import ( Nucleotide, combine_nucleotides, is_valid_nucleotide_string ) from dogma.codons import ( Codon, degenerate_codon_string_to_standard_members, nucleotide_string_to_codons, combine_codons, combine_codon_labels, merge_codons ) from dogma.oligonucleotides import ( Oligonucleotide, reverse_complement ) from dogma.amino_acids import AminoAcid from dogma.proteins import ( Protein, calculate_protein_degeneracy ) PK~QMH dogma/amino_acids.pyfrom dogma import ( GeneticCode, DEFAULT_GENETIC_CODE, Codon, ) class AminoAcid: def __init__(self, data=None, genetic_code=None, triplet_string_is_dna=True): if not isinstance(genetic_code, GeneticCode): genetic_code = DEFAULT_GENETIC_CODE self.genetic_code = genetic_code # input data is string of length 1, an amino acid letter # AminoAcid('A') if isinstance(data, str) and len(data) == 1: self.label = data self.members = [data] self.proportions = [1] self.codon_labels = self.get_synonymous_codons() self.codons = [self.get_synonymous_codons('Codon')] self.composition = dict(zip(self.members, self.proportions)) # input data is a string of length 3, a codon label # AminoAcid('GGG') elif (isinstance(data, str) and len(data) == 3 and triplet_string_is_dna): self.codons = [Codon(data, genetic_code)] self.codon_labels = self.codons[0].members self.composition = self.codons[0].translate() self.members = sorted(self.composition.keys()) self.proportions = [self.composition[_] for _ in self.members] self.label = genetic_code[data] # input data is a Codon object # AminoAcid(Codon('GGG')) elif isinstance(data, Codon): self.codons = [data] self.codon_labels = self.codons[0].members self.composition = self.codons[0].translate() self.members = sorted(self.composition.keys()) self.proportions = [self.composition[_] for _ in self.members] self.label = genetic_code[data.label] self.degenerate = self.is_degenerate() self.equimolar = self.is_equimolar() def get_synonymous_codons(self, output_type='string'): """ Returns list of triplet strings or Codon instances. Each synonymous codon encodes a nonzero amino acid. """ codon_strings = [] for member in self.members: codon_strings += self.genetic_code.code_reversed[member] if output_type == 'string': return codon_strings elif output_type == 'Codon': return Codon(codon_strings, self.genetic_code) elif output_type == 'Codons': return [Codon(_, self.genetic_code) for _ in codon_strings] return None def is_degenerate(self): """ Amino acid is degenerate if there are multiple amino acid members have nonzero composition values. """ return len(self.members) def is_equimolar(self): """ Amino acid is equimolar if all nonzero amino acid members have equal abundance. """ return max(self.proportions) == min(self.proportions) def __str__(self): print(self.label) print(self.composition) def __repr__(self): l, c = self.label, self.composition return f'AminoAcid(label="{l}", composition={c})' PK~QM Vdogma/codons.pyfrom itertools import product from collections import defaultdict import decimal from random import choices from dogma import ( DEFAULT_DECIMAL_PRECISION, GeneticCode, DEFAULT_GENETIC_CODE, Nucleotide, DEGENERATE_NUCLEOTIDE_CODE, DEGENERATE_NUCLEOTIDE_CODE_REVERSED, is_valid_nucleotide_string, combine_nucleotides, STANDARD_CODONS, DEFAULT_CODON_LABEL ) decimal.getcontext().prec = DEFAULT_DECIMAL_PRECISION class Codon: """ A sequence of three nucleotides. """ def __init__(self, data=None, genetic_code=None): """ Multiple input formats are acceptable n = Nucleotide('N') Codon([n,n,n]) == Codon((n, n, n)) """ if not isinstance(genetic_code, GeneticCode): genetic_code = DEFAULT_GENETIC_CODE self.genetic_code = genetic_code # aaa = Codon('AAA') # naa = Codon('NAA') # Xaa = combine_codons(aaa, naa) # is dict # Codon(Xaa) if (isinstance(data, dict) and all([_ in STANDARD_CODONS for _ in data])): self.bases = None self.label = DEFAULT_CODON_LABEL self.members = list(data.keys()) self.proportions = [data[_] for _ in self.members] # Codon(['AAA', 'GGG']) elif (isinstance(data, list) and all([isinstance(_, str) for _ in data]) and all([len(_) == 3 for _ in data]) and all([_ in STANDARD_CODONS for _ in data])): self.bases = None self.label = DEFAULT_CODON_LABEL self.members = list(set(data)) self.proportions = [data.count(_) for _ in self.members] # Codon('AAA') or Codon('NNK') or ... # Codon([Nucleotide('A'),]*3) or ... # data as iterable of length 3 , or Nuc(), # or parameters that generates valid Nuc() elif (isinstance(data, (str, list, tuple)) and len(data) == 3): self.bases = [_ if isinstance(_, Nucleotide) else Nucleotide(_) for _ in data] self.label = ''.join([_.label for _ in self.bases]) self.members = self.get_members() self.proportions = self.get_proportions() # Codon() --> Codon([Nucleotide(),]*3) or ... else: raise 'Error, Codon() parameters not valid' self.composition = dict(zip(self.members, self.proportions)) self.degenerate = self.is_degenerate() self.equimolar = self.is_equimolar() def get_members(self): """ Returns a list of codons (as triplet strings) with nonzero proportion. """ b1, b2, b3 = [b.get_members() for b in self.bases] return [''.join(_) for _ in product(b1, b2, b3)] def get_proportions(self): """ Returns a list of codon proportions, aligned with self.members """ data = [] for member in self.members: proportion = 1 for m, b in zip(member, self.bases): proportion *= b.composition[m] data.append(proportion) return data def translate(self, dtype=None): """ Returns a dictionary mapping amino acids (as single-letter codes) with proportions. """ data = defaultdict(float) for c, p in zip(self.members, self.proportions): a = self.genetic_code[c] data[a] += p if dtype == 'decimal': data = {k: decimal.Decimal(v) for k, v in data.items()} elif dtype == 'float': data = {k: float(v) for k, v in data.items()} elif dtype == 'int': data = {k: int(v) for k, v in data.items()} elif dtype == 'object': pass elif dtype is None: data = {k: v for k, v in data.items()} else: # print('error', dtype, data) pass return data def samples(self, k=1): return [self.sample() for _ in range(k)] def sample(self): """ Returns a single non-degenerate oligonucleotide string. """ return choices(self.members, self.proportions)[0] def is_degenerate(self): """ Codon is degenerate if there are multiple nonzero standard codons in composition. """ return len(self.members) > 1 def is_equimolar(self): """ Codon is equimolar if all nonzero standard codons in composition have equal abundance. """ return max(self.proportions) == min(self.proportions) def copy(self): return Codon([b.copy() for b in self.bases], self.genetic_code) def __str__(self): b0, b1, b2 = self.bases # return f'Codon(\n\t{b0}\n\t{b1}\n\t{b2}\n)' return f'Codon(\n\t{b0}\n\t{b1}\n\t{b2})' def combine_codons(*codons, proportions=None): """ codons unpacked to standard codons and proportions added ex: AAA + CCC + GGG + TTT --> [AAA, CCC, GGG, TTT] ex: NNN + AAA --> {AAA: 2 , AAC:1, AAG:1,.... TTT:1} """ genetic_code = codons[0].genetic_code if proportions is None: proportions = [1, ] * len(codons) data = defaultdict(float) for codon, p in zip(codons, proportions): for sub_codon, sub_proportion in codon.composition.items(): data[sub_codon] += p * sub_proportion return Codon(data, genetic_code) def nucleotide_string_to_codons(s, output='Codon'): if is_valid_nucleotide_string(s): codon_strings = [s[3 * i:3 * i + 3] for i in range(len(s) // 3)] if output == 'Codon': return [Codon(*_) for _ in codon_strings] elif output == 'strings': return codon_strings def degenerate_codon_string_to_standard_members(c): """ Converts degenerate codon string to list of standard codons. """ b1, b2, b3 = [DEGENERATE_NUCLEOTIDE_CODE[b] for b in c] return [''.join(_) for _ in product(b1, b2, b3)] def combine_codon_labels(c1, c2): """ Combines codon labels, assuming bases mixed randomly Ex: AAA + CCC + GGG + TTT -> NNN = (AAA, AAC, ....) Ex: ABC + CCC --> MBC """ letters = [] for b1, b2 in zip(c1, c2): b1 = b1.upper() b2 = b2.upper() _ = DEGENERATE_NUCLEOTIDE_CODE[b1] + DEGENERATE_NUCLEOTIDE_CODE[b2] base_letters = ''.join(sorted(list(set(_)))) letters.append(DEGENERATE_NUCLEOTIDE_CODE_REVERSED[base_letters]) return ''.join(letters) def merge_codons(c1, c2, proportions=[1, 1], genetic_code=None): if genetic_code is None: genetic_code = DEFAULT_GENETIC_CODE nucs = [combine_nucleotides(n1, n2, proportions=proportions) for n1, n2 in zip(c1.bases, c2.bases)] return Codon(nucs, genetic_code) PKyQMB,,dogma/genetic_codes.pyfrom dogma import ( NBCI_GENETIC_CODE_NAMES, NCBI_CODONS, NCBI_AMINO_ACIDS ) class GeneticCode: """ System for translating codons into amino acids. Parameters ---------- ncbi_id: integer witin NBCI_GENETIC_CODE_NAMES keys, default is 1, standard genetic code updated_mappings: dictionary stop_symbol: string with length 1 error_symbol: string with length 1 name: str """ def __init__(self, ncbi_id=1, updated_mappings={}, stop_symbol='*', error_symbol='_', name=''): if ncbi_id in NBCI_GENETIC_CODE_NAMES: name = NBCI_GENETIC_CODE_NAMES[ncbi_id] codons = NCBI_CODONS amino_acids = NCBI_AMINO_ACIDS[ncbi_id] else: codons = [] amino_acids = '' if updated_mappings: assert isinstance(updated_mappings, dict) assert all([(len(c) == 3 and isinstance(c, str)) for c in updated_mappings.keys()]) assert all([(len(a) == 1 and isinstance(a, str)) for a in updated_mappings.values()]) if error_symbol is not '_': assert isinstance(error_symbol, str) and len(error_symbol) == 1 self.error_symbol = error_symbol if stop_symbol is not '*': assert isinstance(stop_symbol, str) and len(stop_symbol) == 1 amino_acids = amino_acids.replace('*', stop_symbol) self.stop_symbol = stop_symbol if isinstance(name, str): self.name = name # define genetic code mapping from codons to amino_acids self.code = dict(zip(codons, amino_acids)) # overwrite specified codon to amino acid mappings for codon, amino_acid in updated_mappings.items(): self.code[codon] = amino_acid # create list of codons and amino acids self.codons = codons self.amino_acids = amino_acids # define reversed genetic code, where each amino acids # maps to a list of codons self.code_reversed = {a: [_ for _ in codons if self.code[_] == a] for a in amino_acids} def update(self, updated_mappings={}, expand=False): """ Updates genetic code, reprogramming codons to amino acids Parameters ---------- updated_mappings: dict expand: Boolean if True, genetic code is 'expanded' to include new codons if False, only keys in GeneticCode.code.keys() will be remapped, others silently ignored. Default is False Usage ----- standard = GeneticCode() # default parameter ncbi_id = 1 # corresponds to 'Standard' code supE = GeneticCode(updated_mappings={'TAG': 'Q'}) """ assert isinstance(updated_mappings, dict) assert all([(len(c) == 3 and isinstance(c, str)) for c in updated_mappings.keys()]) assert all([(len(a) == 1 and isinstance(a, str)) for a in updated_mappings.values()]) if not expand: updated_mappings = {k: v for k, v in updated_mappings.items() if k in self.code} for codon, amino_acid in updated_mappings.items(): self.code[codon] = amino_acid def get_codons(self): """ Returns sorted list of codons in genetic code. """ return sorted(list(self.code.keys())) def get_amino_acids(self): """ Returns list of amino acids, ordered by sorted codons. """ return [self.code[codon] for codon in self.get_codons()] def __getitem__(self, codon): """ Returns value of code dictionary for specified codon key. Parameter --------- codon: string within code dictionary Returns ------- One-letter amino acid string Usage ----- standard = GeneticCode(1) standard['GGG'] --> 'G' """ codon = codon.upper().replace('U', 'T') return self.code.get(codon, self.error_symbol) def __setitem__(self, codon, amino_acid): """ Sets value of code dictionary for specified codon key to specified amino acid. Parameter --------- codon: string within genetic code dictionary Usage ----- >> std = GeneticCode() >> std['GGG'] --> 'G' >> std['GGG'] = 'Z' >> std['GGG'] --> 'Z' """ codon = codon.upper().replace('U', 'T') if codon not in self.code: raise "Attempting to set codon not in genetic code." else: self.code[codon] = amino_acid def __str__(self): """ Returns readable genetic code list """ codons = self.get_codons() amino_acids = self.get_amino_acids() return '\n'.join([' '.join([c, a]) for c, a in zip(codons, amino_acids)]) def __repr__(self): """ Returns string that would evaluate to an equivalent GeneticCode instance. """ codons = self.get_codons() amino_acids = self.get_amino_acids() return f'GeneticCode(codons={codons}, amino_acids={amino_acids})' DEFAULT_GENETIC_CODE = GeneticCode() SUPE = GeneticCode(updated_mappings={'TAG': 'Q'}, name='supE') def translate(dna, genetic_code=None): """ Translates a DNA string into amino acid string. """ assert isinstance(dna, str) # process string by replacing 'U's with 'T's and uppercasing dna = dna.upper().replace('U', 'T') if not isinstance(genetic_code, GeneticCode): genetic_code = DEFAULT_GENETIC_CODE codon_strings = [dna[3 * i: 3 * i + 3] for i in range(len(dna) // 3)] return ''.join([genetic_code[_] for _ in codon_strings]) def test(): for k, v in SUPE.code.items(): print(f"'{k}': '{v}',") if __name__ == '__main__': test() PK |QMHAi dogma/nbci.py""" :Reference: Nomenclature for Incompletely Specified Bases in Nucleic Acid Sequences http://www.sbcs.qmul.ac.uk/iubmb/misc/naseq.html """ from itertools import product # dictionary of NCBI genetic code ID numbers and full names NBCI_GENETIC_CODE_NAMES = { 1: 'Standard', 2: 'Vertebrate Mitochondrial', 3: 'Yeast Mitochondrial', 4: 'Mold Mitochondrial; Protozoan Mitochondrial; Coelenterate;' 'Mitochondrial; Mycoplasma; Spiroplasma', 5: 'Invertebrate Mitochondrial', 6: 'Ciliate Nuclear; Dasycladacean Nuclear; Hexamita Nuclear', 9: 'Echinoderm Mitochondrial; Flatworm Mitochondrial', 10: 'Euplotid Nuclear', 11: 'Bacterial, Archaeal and Plant Plastid', 12: 'Alternative Yeast Nuclear', 13: 'Ascidian Mitochondrial', 14: 'Alternative Flatworm Mitochondrial', 15: 'Blepharisma Macronuclear', 16: 'Chlorophycean Mitochondrial', 21: 'Trematode Mitochondrial', 22: 'Scenedesmus obliquus Mitochondrial', 23: 'Thraustochytrium Mitochondrial', 24: 'Pterobranchia Mitochondrial', 25: 'Candidate Division SR1 and Gracilibacteria', 26: 'Pachysolen tannophilus Nuclear Code' } # string of one-letter codes for the four standard nucleotides NCBI_NUCLEOTIDES = 'ACGT' # list of all 64 standard codons ['AAA', 'AAC', ... 'TTG', 'TTT'] NCBI_CODONS = [''.join(_) for _ in product(NCBI_NUCLEOTIDES, repeat=3)] # string of one-letter amino acid codes aligned with alphabetized codon list NCBI_AMINO_ACIDS = { 1: 'KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF', 2: 'KNKNTTTT*S*SMIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF', 3: 'KNKNTTTTRSRSMIMIQHQHPPPPRRRRTTTTEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF', 4: 'KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF', 5: 'KNKNTTTTSSSSMIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF', 6: 'KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVVQYQYSSSS*CWCLFLF', 9: 'NNKNTTTTSSSSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF', 10: 'KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSCCWCLFLF', 11: 'KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF', 12: 'KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLSLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF', 13: 'KNKNTTTTGSGSMIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF', 14: 'NNKNTTTTSSSSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVVYY*YSSSSWCWCLFLF', 15: 'KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*YQYSSSS*CWCLFLF', 16: 'KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*YLYSSSS*CWCLFLF', 21: 'NNKNTTTTSSSSMIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF', 22: 'KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*YLY*SSS*CWCLFLF', 23: 'KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWC*FLF', 24: 'KNKNTTTTSSKSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF', 25: 'KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSGCWCLFLF', 26: 'FFLLSSSSYY**CC*WLLLAPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG' } PK~QMfQQdogma/nucleotides.pyfrom random import choices from dogma import ( rescale, STANDARD_NUCLEOTIDES, DEFAULT_NUCLEOTIDE_LABEL, DEGENERATE_NUCLEOTIDES, DEGENERATE_NUCLEOTIDE_CODE_COMPOSITION, DEGENERATE_NUCLEOTIDE_CODE_REVERSED ) class Nucleotide: """ Nucleotides are the building blocks of oligonucleotides. Parameters ---------- data: multiple inputs available, see self._process_input() Attributes ---------- label: str, len=1 single letter code for nucleotide composition: dictionary keys are single letters refering to undegenerate nucleotides values are numerical values related to abundance proportion members sorted list of nonzero nucleotides (as single letter codes) proportions list of member proportions, aligned with members degenerate: Boolean whether multiple standard nucleotides are nonzero equimolar: Boolean whether all nonzero standard nucleotide members have equal proportions Usage ----- a = Nucleotide('A') a.label --> 'A' a.composition --> {'A':1} a.members --> ['A'] a.proportions --> [1] a.degenerate --> False a.equimolar --> True a = Nucleotide({'A':1}) n = Nucleotide('N') a.label --> 'N' a.composition --> {'A':1, 'C':1, 'G':1, 'T':1} a.members --> ['A', 'C', 'G', 'T'] a.proportions --> [1, 1, 1, 1] a.degenerate --> True a.equimolar --> True n = Nucleotide({'A':0.1, 'C':0.5, 'G':0.4}) a.label --> 'v' a.composition --> {'A':0.1, 'C':0.5, 'G':0.4} a.members --> ['A', 'C', 'G', 'T'] a.proportions --> [0.1, 0.5, 0.4] a.degenerate --> True a.equimolar --> False """ def __init__(self, data=None): # define self.label and self.composition by processing # data input parameter self._process_input(data) self.members = self.get_members() self.proportions = self.get_proportions() self.degenerate = self.is_degenerate() self.equimolar = self.is_equimolar() def _process_input(self, data): """ Various input data formats are supported: Nucleotide('A') Nucleotide({'A': 3, 'C': 1}) """ # ex: Nucleotide('A') if isinstance(data, str) and len(data) == 1: label = data.upper().replace('U', 'T') composition = DEGENERATE_NUCLEOTIDE_CODE_COMPOSITION[label] # ex: Nucleotide({'A':3, 'C':1}) elif (isinstance(data, dict) and is_valid_nucleotide_string(''.join(data.keys())) and all([isinstance(_, (int, float)) for _ in data.values()])): composition = data label = nucleotide_composition_to_letter(composition) else: label = DEFAULT_NUCLEOTIDE_LABEL composition = DEGENERATE_NUCLEOTIDE_CODE_COMPOSITION[label] self.label = label self.composition = composition def get_members(self): """ Returns a sorted string of nucleotide members that are have nonzero frequency within nucleotide. """ nonzero_members = [k for k, v in self.composition.items() if v > 0] return ''.join(sorted(nonzero_members)) def get_proportions(self): """ Returns a list of proportions aligned with self.members """ return [self.composition[_] for _ in self.members] def is_degenerate(self): """ Nucleotide is degenerate if multiple standard nucleotides have nonzero proportions. """ return len(self.members) > 1 def is_equimolar(self): """ Nucleotide is equimolar if the proportion of each nonzero nucleotide values are equal. """ nonzero_values = [_ for _ in self.composition.values() if _ > 0] return max(nonzero_values) == min(nonzero_values) def samples(self, k=1): """ Randomly selects `k` members based on Nucleotide composition. """ return [self.sample() for _ in range(k)] def sample(self): """ Randomly selects a members based on Nucleotide composition. """ return choices(self.members, self.proportions)[0] def copy(self): return Nucleotide(self.composition) def __str__(self): return f'Nucleotide(label={self.label}, '\ f'composition={self.composition})' def __repr__(self): return f'Nucleotide(label={self.label}, '\ f'composition={self.composition})' def is_nondegenerate_nucleotide_string(n): n = n.upper().replace('U', 'T') return all([_ in STANDARD_NUCLEOTIDES for _ in n]) def is_degenerate_nucleotide_string(n): if not is_nondegenerate_nucleotide_string(n): n = n.upper().replace('U', 'T') return all([_ in DEGENERATE_NUCLEOTIDES for _ in n]) return False def is_valid_nucleotide_string(s): """ Checks if string is composed of valid nucleotide members. input is first capitalized and U's are replaced with T's each letter must be a standard or degenerate letter """ s = s.upper().replace('U', 'T') return all([_ in DEGENERATE_NUCLEOTIDES for _ in s]) def combine_nucleotides(*nucleotides, proportions=None): """ Combines nucleotides, creating a new Nucleotide instance with a nucleotide composition as a weighted sum input compositions. Usage ----- c = Nucleotide('C') t = Nucleotide('T') c_plus_t = combine_nucleotides(c, t) print(c_plus_t) --> "Nucleotide('N', " \ "composition={'A':0, 'C':0.5, 'G':0, 'T':0.5})" "" c_plus_ttt = combine_nucleotides(c, t, proportions=[1, 3]) print(c_plus_ttt) --> "Nucleotide('n', "\ "composition={'A':0, 'C':0.25, 'G':0, 'T':0.75})" """ if (proportions is None) or (len(nucleotides) != len(proportions)): proportions = [1, ] * len(nucleotides) data = {} for n in STANDARD_NUCLEOTIDES: data[n] = 0 for nuc, p in zip(nucleotides, proportions): data[n] += nuc.composition.get(n, 0) * p return Nucleotide(rescale(data)) def nucleotide_composition_to_letter(composition): """ Converts dictionary of {nucleotide letter: proportions} pairs to IUPAC degenerate DNA letter. Usage: c = {'A': 1} print(nucleotide_composition_to_letter(c)) --> 'A' c = dict(zip('ACGT', [1, 1, 1, 1])) print(nucleotide_composition_to_letter(c)) --> 'N' c = dict(zip('ACGT', [1, 1, 2, 1])) print(nucleotide_composition_to_letter(c)) --> 'n' """ nonzero_nucleotides = ''.join(sorted([n for n, v in composition.items() if v > 0])) nonzero_proportions = [composition[n] for n in nonzero_nucleotides] equimolar = min(nonzero_proportions) == max(nonzero_proportions) letter = DEGENERATE_NUCLEOTIDE_CODE_REVERSED.get(nonzero_nucleotides, DEFAULT_NUCLEOTIDE_LABEL) if equimolar: return letter return letter.lower() PK~QM"*99dogma/oligonucleotides.pyimport decimal import math import pandas as pd from dogma import ( DEFAULT_DECIMAL_PRECISION, GeneticCode, DEFAULT_GENETIC_CODE, translate, Nucleotide, Codon, ) decimal.getcontext().prec = DEFAULT_DECIMAL_PRECISION class Oligonucleotide: """ A sequence of Nucleotides objects. 5' --> 3' list """ def __init__(self, data, genetic_code=None, auto_run=True): """ Multiple input formats are acceptable Oligonucleotide('AAANNK') Oligonucleotide(list('AAANNK')) Oligonucleotide(list(map(Nucleotide, list('AAANNK')))) Oligonucleotide([{'A':1}, {'A':0.5, 'C':0.5}] Oligonucleotide(list(map(Nucleotide, list('AAANNK')))) """ if not isinstance(genetic_code, GeneticCode): genetic_code = DEFAULT_GENETIC_CODE self.genetic_code = genetic_code if isinstance(data, (str, list)): if isinstance(data[0], Codon): self.bases = [] for c in data: self.bases += c.bases else: self.bases = [_ if isinstance(_, Nucleotide) else Nucleotide(_) for _ in data] elif isinstance(data, Codon): self.bases = data.bases self.genetic_code = data.genetic_code else: self.bases = None self.length = self.get_length() self.label = self.get_label() self.compact_label = self.get_compact_label() self.codon_strings = self.get_codon_strings() self.codons = self.get_codons() self.get_base_profile() self.get_codon_profile() self.get_amino_acid_profile() if auto_run: self.assess_degeneracy() def get_bases(self): """ Returns list of Nucleotides objects """ return self.bases def get_codons(self): """ Returns list of Codon objects derived from oligonucleotide bases """ base_triplets = [self.bases[3 * i: 3 * i + 3] for i in range(self.length // 3)] return [Codon(_, self.genetic_code) for _ in base_triplets] def get_length(self): """ Returns integer length of oligonucleotide, number of bases """ return len(self.bases) def get_label(self): """ Returns a string of nucleotide labels. """ return ''.join(_.label for _ in self.bases) def get_codon_strings(self): """ Returns list of codon strings. """ return [self.label[3 * i: 3 * i + 3] for i in range(self.length // 3)] def get_compact_label(self): """ Returns shorthand label Attempts to reduce identical codon labels with numbers Ex: NNKNNKNNKAAANNK --> NNK<3>AAANNK """ count = 1 previous_string = '' output = [] for c in self.get_codon_strings(): if c == previous_string: count += 1 else: if count > 1: output.append(f'<{count}>') previous_string = c output.append(c) count = 1 if count > 1: output.append(f'<{count}>') return ''.join(output) def translate(self, genetic_code=None): """ Translates oligonucleotide into protein. Parameters: genetic_code: GeneticCode object """ if not isinstance(genetic_code, GeneticCode): genetic_code = self.genetic_code return ''.join([genetic_code[_] for _ in self.get_codon_strings()]) def samples(self, k=1, output='oligonucleotide', by_codon=True): return [self.sample(output=output, by_codon=by_codon) for _ in range(k)] def sample(self, output='oligonucleotide', by_codon=True): """ Returns a single non-degenerate oligonucleotide string. If by_codon is True, sampling is based on the codons attribute, otherwise sampling is performed on each base individually. """ if by_codon: oligonucleotide_string = ''.join(c.sample() for c in self.codons) else: oligonucleotide_string = ''.join([b.sample() for b in self.bases]) if output == 'protein': return translate(oligonucleotide_string, self.genetic_code) elif output == 'both': return oligonucleotide_string, translate(oligonucleotide_string, self.genetic_code) else: return oligonucleotide_string def assess_degeneracy(self): """ Performs the sequential and potentially computationally and memory intensive calculations related to generating the protein abundance profile. """ self.get_amino_acid_degeneracy_profile() self.get_protein_degeneracy_table() self.get_degeneracy_table() self.get_size() def get_base_profile(self): """ Generates and returns base profile Usage ----- >> nnk = Codon('NNK') >> oligo = Oligonucleotide(nnk) >> oligo.get_base_profile() {0: {'A':1, 'C':1 , 'G':1. 'T':1}, 1: {'A':1, 'C':1 , 'G':1. 'T':1}, 2: { 'G':1. 'T':1}} """ self.base_profile = [b.composition for b in self.bases] return self.base_profile def get_codon_profile(self): """ Generates and returns codon profile Usage ----- >> nnk = Codon('NNK') >> oligo = Oligonucleotide(nnk) >> oligo.get_codon_profile() {0: {'AAA':1, 'AAC':1, AAG':1, 'AAT':1, 'ACA':1, ...}, 1: {'AAA':1, 'AAC':1, AAG':1, 'AAT':1, 'ACA':1, ...}, 2: { 'AAG':1, 'AAT':1, ...}} """ self.codon_profile = [c.composition for c in self.codons] return self.codon_profile def get_amino_acid_profile(self): """ Generates and returns amino acid profile. List of dictionaries mapping amino acid to degeneracy (redundance) Indicies of list reflects position within protein Usage ----- >> supE = GeneticCode(1, {'TAG': 'Q'}) >> oligo = Oligonucleotide('NNKGCC', supE) >> oligo.get_amino_acid_profile() [{'A':2, 'C':1, D':1, 'E':1, 'F':1, ...}, {'A':1 }] """ self.amino_acid_profile = [c.translate(dtype='decimal') for c in self.codons] return self.amino_acid_profile def get_amino_acid_degeneracy_profile(self): """ Generates and returns amino acid degeneracy profile Usage ----- >> supE = GeneticCode(1, {'TAG': 'Q'}) >> oligo = Oligonucleotide('NNKAAA', supE) >> oligo.get_amino_acid_degeneracy_profile() [{1:11, 2:6 , 3:3}, {1:1 }] Explanation of profile (for 'NNKAAA' oligo with supE translation) ---------------------- The 0 index refers to the first amino acid (or codon) position in oligo This NNK codon encodes 20 (11 + 6 + 3) amino acids 11 of these amino acids (CDEFHIKMNQ) are encoded by 1 codon 6 (AGPQTV) 2 codons 3 (LRS) 3 codons The 1 index refers to the next position, where 'AAA' encodes for 'K' 1 amino acid (K) 1 codon """ self.amino_acid_degeneracy_profile = [] for d in self.amino_acid_profile: self.amino_acid_degeneracy_profile.append( {v: list(d.values()).count(v) for v in set(d.values())}) return self.amino_acid_degeneracy_profile def get_protein_degeneracy_table(self): """ Generates and returns protein degeneracy profile Protein degeneracy is a dictionary where keys are all possible degeneracies of proteins derived from the degenerate oligonucleotide, and where values are the number of proteins in this group. Dictionary values reflect number of proteins within Oligonucleotid with a given degeneracy. Usage ----- >> supE = GeneticCode(1, {'TAG': 'Q'}) >> oligo = Oligonucleotide('NNK'*2, supE) >> oligo.get_protein_degeneracy_table() {1: 121, # key=1*1, value=11*11 2: 132, # key=1*2, value=11*6*2 (*2 because key=1*2 and 2*1) 3: 66, 4: 36, 6: 36, 9: 9} Explanation of profile (for 'NNKNNK' oligo with supE translation) ---------------------- 121 proteins (CC, CD, CE, ...) have a degeneracy of 1 131 proteins (AC, AD, AE, ...) have a degeneracy of 2 ... There are 20*20 = 400 = (121 + 132 + 66 + 36 + 36 + 9) unique protein members of this oligonucleotide ensemble. Multiplying the degeneracy by the protein counts give the number of unique DNA counts """ # for each df, create new dfs for each row of next df, # then concat, repeat dfs = [pd.DataFrame({'Degeneracy': list(_.keys()), 'Proteins': list(map(decimal.Decimal, _.values()))}, dtype='object') for _ in self.amino_acid_degeneracy_profile] if not dfs: return pd.DataFrame() df = dfs[0] for next_df in dfs[1:]: df = pd.concat([df * next_df.iloc[i] for i in range(len(next_df))]) df = df.groupby(['Degeneracy'], sort=False).sum().reset_index() self.protein_degeneracy_table = df.sort_values(by=['Degeneracy']) \ .reset_index(drop=True) return self.protein_degeneracy_table def get_degeneracy_table(self): """ Expands protein degeneracy table """ df = self.protein_degeneracy_table.copy() df['Oligonucleotides'] = df.Degeneracy * df.Proteins df['DNA_Quantile'] = \ df.Oligonucleotides.cumsum() / sum(df.Oligonucleotides) df['Protein_Quantile'] = \ df.Proteins.cumsum() / sum(df.Proteins) self.degeneracy_table = df # defining additional aliases self.protein_quantiles = df['Protein_Quantile'].tolist() self.dna_quantiles = df['DNA_Quantile'].tolist() self.degeneracies = df['Degeneracy'].tolist() self.df = df def get_size(self, output='dna'): """ Calculates and returns size of oligonucleotide ensemble """ self.size_oligonucleotides = self.df['Oligonucleotides'].sum() self.size_proteins = self.df['Proteins'].sum() self.size = self.size_oligonucleotides if output.startswith('d'): return self.size_oligonucleotides elif output.startswith('p'): return self.size_proteins else: return (self.size_oligonucleotides, self.size_proteins) def get_gini_index(self): """ Calculates Gini Index of protein ensemble. """ x = [0] + self.protein_quantiles y = [0] + self.dna_quantiles n = len(self.protein_quantiles) gini = decimal.Decimal(1 / 2) for i in range(n): gini -= (x[i + 1] - x[i]) * (y[i + 1] + y[i]) / 2 self.gini_index = 2 * gini return self.gini_index def get_makowski_diversity(self): """ **INCORRECT** Calculates diversity as define by Makowski and Soares https://doi.org/10.1093/bioinformatics/btg013 d = 1/(N*SUM(Pi^2)) """ d = decimal.Decimal(0) P = self.size_proteins degeneracy = self.degeneracy_table['Degeneracy'] protein_counts = self.degeneracy_table['Proteins'] for x, y in zip(protein_counts, degeneracy): p = y / self.size_oligonucleotides d += x * (p ** 2) self.makowski_diversity = 1 / (P * d) return self.makowski_diversity def _get_alternative_makowski_diversity(self): """ EQUIVALENT ANSWER AS self.get_makowski_diversity() Calculates diversity as define by Makowski and Soares https://doi.org/10.1093/bioinformatics/btg013 Equation (2) d = 1/(N*PRODUCTi{SUMj{Pij^2}}) Where i is position protein, j is each amino acids, Pij is prob of amino acid j at i """ d = decimal.Decimal(1) for aa_dict in self.amino_acid_profile: tot = sum(aa_dict.values()) d *= sum([(aa / tot)**2 for aa in aa_dict.values()]) self._makowski_diversity = 1 / (self.size_proteins * d) return self._makowski_diversity def get_entropy(self, base=None): """ Calculates entropy of abundance profile s = -SUM(pi*ln(pi)) """ degeneracy = self.degeneracy_table['Degeneracy'] protein_counts = self.degeneracy_table['Proteins'] print(degeneracy) print(protein_counts) return 0 entropy = decimal.Decimal(0) # for each degeneracy group for x, y in zip(protein_counts, degeneracy): p = y / self.size_proteins entropy += x * p * decimal.Decimal(math.log(p)) if base is not None: entropy /= math.log(base) self.entropy = -entropy return self.entropy def __repr__(self): return f'Oligonucleotide({self.label})' def reverse_complement(oligo): """ """ if isinstance(oligo, Oligonucleotide): oligo = oligo.label # TODO! def get_nnk(n=1): """ Simple helper function to easily generate NNK-based oligos. """ supE = GeneticCode(1, {'TAG': 'Q'}) oligo = Oligonucleotide('NNK' * n, supE) return oligo PK~QM!!!dogma/proteins.pyfrom functools import reduce from random import choices from dogma import ( GeneticCode, DEFAULT_GENETIC_CODE, Oligonucleotide, Codon, AminoAcid, ) class Protein: """ A sequence of AminoAcid objects. N-terminal --> C-terminal list Parameters ---------- data: genetic_code: GeneticCode object data_is_dna: flag for processing data Attributes --------- oligonucleotide: (opt) Oligonucleotide object. If not exactly determined, consist of all possible dna """ def __init__(self, data, genetic_code=None, data_is_dna=False): if not isinstance(genetic_code, GeneticCode): genetic_code = DEFAULT_GENETIC_CODE self.genetic_code = genetic_code if all([isinstance(_, str) for _ in data]): # Protein('NNK', data_is_dna=True) if data_is_dna: self.oligonucleotide = Oligonucleotide(data, genetic_code) self.codons = oligonucleotide_to_codons(self.oligonucleotide) self.residues = [AminoAcid(c.translate()) for c in self.codons] # Protein('NNK') else: self.residues = [AminoAcid(_, genetic_code) for _ in data] self.codons = [a.get_synonymous_codons(Codon) for a in self.residue] self.oligonucleotide = None # Protein(Oligonucleotide('NNK')) elif isinstance(data, Oligonucleotide): self.oligonucleotide = data self.codons = oligonucleotide_to_codons(self.oligonucleotide) self.residues = [AminoAcid(c.translate()) for c in self.codons] # Protein(AminoAcid('A')) elif isinstance(data, AminoAcid): self.residues = [data] self.oligonucleotide = None self.codons = oligonucleotide_to_codons(self.oligonucleotide) else: self.residues = [] self.oligonucleotide = None self.codons = [] self.label = ''.join([_.label for _ in self.residues]) self.length = len(self.residues) self.degenerate = self.is_degenerate() def is_degenerate(self): return any([a.is_degenerate() for a in self.residues]) def get_synonymous_codons(self): return [codon for codon in [a.get_synonymous_codons() for a in self.amino_acids]] def samples(self, k=1): return [self.sample() for _ in range(k)] def sample(self): return choices(self.letters, self.proportions) def __str__(self): return self.label def __repr__(self): return f'Protein(label={self.label}, composition={self.composition})' def calculate_protein_degeneracy(protein, oligonucleotide=None): """ Calculates degeneracy of protein sequences for a specified oligonucleotide design. If no oligonucleotide is specified, defaults to a fully randomized ('NNN') scheme with a length 3 times as large as the protein, and the default genetic code is used. """ if isinstance(protein, Protein): protein = protein.label # how to handle '_'? if oligonucleotide is None: oligonucleotide = Oligonucleotide('NNN' * len(protein)) data = oligonucleotide.amino_acid_profile return reduce(lambda x, y: x * y, [aa_composition[aa] for aa_composition, aa in zip(data, protein)],) def oligonucleotide_to_codons(oligo): return None PKsMM- dogma/utils.py"""Package-wide variables and helper functions.""" from itertools import product from random import choice DEFAULT_DECIMAL_PRECISION = 200 STANDARD_NUCLEOTIDES = 'ACGT' DEGENERATE_NUCLEOTIDE_CODE = { 'A': 'A', 'C': 'C', 'G': 'G', 'T': 'T', 'R': 'AG', 'Y': 'CT', 'S': 'CG', 'W': 'AT', 'K': 'GT', 'M': 'AC', 'B': 'CGT', 'D': 'AGT', 'H': 'ACT', 'V': 'ACG', 'N': 'ACGT' } DEGENERATE_NUCLEOTIDES = sorted(DEGENERATE_NUCLEOTIDE_CODE.keys()) # includes standard nucleotides (ACGT) DEGENERATE_NUCLEOTIDE_CODE_REVERSED = {v: k for k, v in DEGENERATE_NUCLEOTIDE_CODE.items()} DEGENERATE_NUCLEOTIDE_CODE_COMPOSITION = {code: {n: int(n in _) for n in STANDARD_NUCLEOTIDES} for code, _ in DEGENERATE_NUCLEOTIDE_CODE.items()} NUCLEOTIDE_BASE_PAIRS = dict(zip('ACGT', 'TGCA')) _D = DEGENERATE_NUCLEOTIDE_CODE_REVERSED DEGENERATE_NUCLEOTIDE_PAIRS = {k: _D[''.join(sorted( [NUCLEOTIDE_BASE_PAIRS[_] for _ in v]))] for k, v in DEGENERATE_NUCLEOTIDE_CODE.items()} DEFAULT_NUCLEOTIDE_LABEL = 'N' DEGENERATE_NUCLEOTIDE_PAIRS_ = { 'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'R': 'Y', 'Y': 'R', 'S': 'S', 'W': 'W', 'K': 'M', 'M': 'K', 'B': 'V', 'D': 'H', 'H': 'D', 'V': 'B', 'N': 'N' } DEFAULT_OLIGONUCLEOTIDE_LABEL = 'NNN' STANDARD_CODONS = [''.join(_) for _ in product(STANDARD_NUCLEOTIDES, repeat=3)] DEGENERATE_CODONS = [''.join(_) for _ in product(DEGENERATE_NUCLEOTIDES, repeat=3)] DEFAULT_CODON_LABEL = 'NNN' STANDARD_AMINO_ACIDS = 'ACDEFGHIKLMNPQRSTVWY' DEFAULT_AMINO_ACID_LABEL = 'X' STOP_LABEL = '*' DEFAULT_RESIDUE_LABEL = 'X' def rescale(data, total=1): """ Rescales numerical values in lists and dictionaries to sum to specified total. """ if isinstance(data, list): input_total = sum(data) assert input_total != 0, 'Error in doe.rescale(), input_total == 0' return [_ / input_total * total for _ in data] elif isinstance(data, dict): input_total = sum(data.values()) assert input_total != 0, 'Error in doe.rescale(), input_total == 0' return {k: v / input_total * total for k, v in data.items()} def get_frequency_dictionary(data): """ Takes a string or list of strings and returns a dictionary of unique members and their abundance. """ return {k: data.count(k) for k in set(data)} def get_random_oligonucleotide(length=3, letters='ACGTRYSWKMBDHVN'): """ Returns string of random degenerate nucleotides. Each of the 15 letters 'ACGTRYSWKMBDHVN' are equally likely. Alternatively, use letters='ACGT' for standard oligo strings """ return ''.join(choice(list(letters)) for _ in range(length)) def test_get_random_oligonucleotide(): print(get_random_oligonucleotide()) print(get_random_oligonucleotide(12)) print(get_random_oligonucleotide(12, 'ACGT')) def tests(): test_get_random_oligonucleotide() def main(): tests() if __name__ == '__main__': main() PK mQM/qCCdogma-0.0.1.dist-info/LICENSEMIT License Copyright (c) 2018 Griffin Clausen Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.PK!Hd BUcdogma-0.0.1.dist-info/WHEEL HM K-*ϳR03rOK-J,/RH,rzd&Y)r$[)T&UD"PK!HF/dogma-0.0.1.dist-info/METADATAUn0 y Ӫ+=LȔ`J&;,>)eϣB. vN#CV4oY@FI6k<vKi8R1=ȔmU9/ܔ-%~lΙj+.G}/_2p~A}Vw]b=)kr=~76?PK!H:Z-Wc=`~u1ξ-sLۤ4 w0_zԐd\c7kR={5gBPxꙇ{UY2oPKQMBgHdogma/__init__.pyPK~QMH dogma/amino_acids.pyPK~QM Vdogma/codons.pyPKyQMB,,.dogma/genetic_codes.pyPK |QMHAi 6Hdogma/nbci.pyPK~QMfQQ_Tdogma/nucleotides.pyPK~QM"*99qdogma/oligonucleotides.pyPK~QM!!!ȫdogma/proteins.pyPKsMM- dogma/utils.pyPK mQM/qCCdogma-0.0.1.dist-info/LICENSEPK!Hd BUcgdogma-0.0.1.dist-info/WHEELPK!HF/dogma-0.0.1.dist-info/METADATAPK!H: