PK2JU $DeFCoM-1.0.1.data/scripts/predict.py#!python from ConfigParser import SafeConfigParser from defcom.alignment import Alignments import defcom.motif_sites as ms import numpy as np import os from collections import Counter from defcom.features import Features import numpy as np import pickle import sys ######## # TO DO: REFACTOR CODE ######## def main(): """Run the classification phase of DeFCoM. Args: sys.argv[1] = A python configuration file. For configuration file details see https://docs.python.org/2/library/configparser.html. """ #Load parameters from config file config_file = sys.argv[1] parser = SafeConfigParser(os.environ) parser.read(config_file) candidate_sites_file = parser.get('data', 'candidate_sites_file') bam_file = parser.get('data', 'candidate_bam_file') f_offset = parser.getint('options', 'f_offset') r_offset = parser.getint('options', 'r_offset') flank_size = parser.getint('options', 'flank_size') combine_strands = parser.getboolean('options', 'combine_strands') bias_correction = parser.getboolean('options', 'bias_correction') model_data_file = parser.get('options', 'model_data_file') results_file = parser.get('options', 'results_file') #Create data objects bam_data = Alignments(bam_file) candidate_data = ms.MotifSites(candidate_sites_file) model_data = pickle.load(open(model_data_file, 'rb')) svm = model_data["svm"] feature_scaler = model_data["feature_scaler"] #Prepare data candidate_sites = candidate_data.get_all_sites() cut_profiles = Features.convert_to_cuts(candidate_sites, bam_data, flank_size, combine_strands, f_offset, r_offset, bias_correction) candidate_sites = candidate_data.get_all_sites() pwm_scores = Features.get_pwm_scores(candidate_sites) #Apply square root transformation cut_profiles = np.sqrt(cut_profiles) #Apply feature extraction partition_fn = np.vectorize(lambda x:x**2 + 5) split_pts = Features.get_splits(cut_profiles[0].size, partition_fn) center = cut_profiles[0].size/2 #For regional feature center size left = center - 15 #For regional feature center size right = center + 15 #For regional feature center size candidate_features = [] for i in xrange(cut_profiles.shape[0]): row = cut_profiles[i] #Get local features row_split = np.split(row, split_pts) local_slopes = Features.get_slopes(row_split) local_means = np.vectorize(np.mean)(row_split) #Get regional features row_split = np.array([row[(left-30):left], row[left:right], row[right:(right+30)]]) regional_slopes = Features.get_slopes(row_split) regional_means = np.mean(row_split, axis=1) global_mean = [np.mean(row[(center-75):(center+75)])] row_features = np.concatenate((local_slopes, local_means, regional_slopes, regional_means, global_mean, [pwm_scores[i]])) candidate_features.append(row_features) #Scale features candidate_features = feature_scaler.transform(candidate_features) #Predict and output to file scores = svm.decision_function(candidate_features) fout = open(results_file, 'w') i = 0 candidate_sites = candidate_data.get_all_sites() for site in candidate_sites: updated_row = list(site) + [str(scores[i])] fout.write("\t".join(updated_row) + "\n") i += 1 fout.close() if __name__ == '__main__': sys.exit(main()) PK2J))"DeFCoM-1.0.1.data/scripts/train.py#!python from ConfigParser import SafeConfigParser from defcom.alignment import Alignments import defcom.motif_sites as ms import numpy as np import os from collections import Counter from defcom.features import Features from sklearn import preprocessing, model_selection, svm, metrics import defcom.performance_stats as performance_stats from scipy import stats import numpy as np import pickle import sys ######## # TO DO: REFACTOR CODE ######## def main(): """Run the training phase of DeFCoM. Args: sys.argv[1] = A python configuration file. For configuration file details see https://docs.python.org/2/library/configparser.html. """ #Load parameters from config file config_file = sys.argv[1] parser = SafeConfigParser(os.environ) parser.read(config_file) active_sites_file = parser.get('data', 'active_sites_file') inactive_sites_file = parser.get('data', 'inactive_sites_file') bam_file = parser.get('data', 'training_bam_file') f_offset = parser.getint('options', 'f_offset') r_offset = parser.getint('options', 'r_offset') flank_size = parser.getint('options', 'flank_size') combine_strands = parser.getboolean('options', 'combine_strands') bias_correction = parser.getboolean('options', 'bias_correction') bootstrap_iterations = parser.getint('options', 'bootstrap_iterations') bootstrap_active_set_size = parser.getint('options', 'bootstrap_active_set_size') bootstrap_inactive_set_size = parser.getint('options', 'bootstrap_inactive_set_size') training_active_set_size = parser.getint('options', 'training_active_set_size') training_inactive_set_size = parser.getint('options', 'training_inactive_set_size') memory = parser.getint('options', 'memory_limit') model_data_file = parser.get('options', 'model_data_file') print "making data" #Create data objects bam_data = Alignments(bam_file) active_sites = ms.MotifSites(active_sites_file) inactive_sites = ms.MotifSites(inactive_sites_file) print "begin bootstrap" #Execute parameter estimation via bootstrapping gammas = [] c_rbf = [] c_linear = [] rbf_pauc = [] lin_pauc = [] active_bootstrap = active_sites.subsample_sites( bootstrap_active_set_size, bootstrap_iterations) inactive_bootstrap = active_sites.subsample_sites( bootstrap_inactive_set_size, bootstrap_iterations) for i in range(bootstrap_iterations): temp_active = active_bootstrap[i] temp_inactive = inactive_bootstrap[i] print "to cuts" active_cuts = Features.convert_to_cuts(temp_active, bam_data, flank_size, combine_strands, f_offset, r_offset, bias_correction) inactive_cuts = Features.convert_to_cuts(temp_inactive, bam_data, flank_size, combine_strands, f_offset, r_offset, bias_correction) active_pwm = Features.get_pwm_scores(temp_active) inactive_pwm = Features.get_pwm_scores(temp_inactive) print "sqrt transform" #Apply square root transformation active_cuts = np.sqrt(active_cuts) inactive_cuts = np.sqrt(inactive_cuts) #Apply feature extraction partition_fn = np.vectorize(lambda x:x**2 + 5) split_pts = Features.get_splits(active_cuts[0].size, partition_fn) center = active_cuts[0].size/2 #For regional feature center size left = center - 15 #For regional feature center size right = center + 15 #For regional feature center size active_features = [] inactive_features = [] print "feature extraction" for i in xrange(active_cuts.shape[0]): row = active_cuts[i] #Get local features row_split = np.split(row, split_pts) local_slopes = Features.get_slopes(row_split) local_means = np.vectorize(np.mean)(row_split) #Get regional features row_split = np.array([row[(left-30):left], row[left:right], row[right:(right+30)]]) regional_slopes = Features.get_slopes(row_split) regional_means = np.mean(row_split, axis=1) global_mean = [np.mean(row[(center-75):(center+75)])] row_features = np.concatenate((local_slopes, local_means, regional_slopes, regional_means, global_mean, [active_pwm[i]])) active_features.append(row_features) for i in xrange(inactive_cuts.shape[0]): row = inactive_cuts[i] #Get local features row_split = np.split(row, split_pts) local_slopes = Features.get_slopes(row_split) local_means = np.vectorize(np.mean)(row_split) #Get regional features row_split = np.array([row[(left-30):left], row[left:right], row[right:(right+30)]]) regional_slopes = Features.get_slopes(row_split) regional_means = np.mean(row_split, axis=1) global_mean = [np.mean(row[(center-75):(center+75)])] row_features = np.concatenate((local_slopes, local_means, regional_slopes, regional_means, global_mean, [inactive_pwm[i]])) inactive_features.append(row_features) combined_features = np.concatenate((active_features, inactive_features), axis=0) feature_scaler = preprocessing.MaxAbsScaler() training_set = feature_scaler.fit_transform(combined_features) active_labels = [1] * len(active_features) inactive_labels = [-1] * len(inactive_features) training_labels = active_labels + inactive_labels print "cross-validation" #Perform cross-validation gamma_range = np.logspace(-6,3,10) c_range = np.logspace(-3,4,8) parameters = {'C':c_range, 'gamma':gamma_range} pauc_scorer = metrics.make_scorer(performance_stats.pAUC, needs_threshold=True, fpr_cutoff=0.05) #RBF kernel SVM gridCV = model_selection.GridSearchCV( svm.SVC(cache_size=memory, kernel="rbf"), parameters, n_jobs=3, pre_dispatch=3, cv=5, scoring=pauc_scorer, refit=False) gridCV.fit(training_set, training_labels) c_rbf.append(gridCV.best_params_["C"]) gammas.append(gridCV.best_params_["gamma"]) rbf_pauc.append(gridCV.best_score_) #Linear SVM parameters = {'C':c_range} gridCV = model_selection.GridSearchCV(svm.SVC(kernel="linear"), parameters, n_jobs=3, pre_dispatch=3, cv=5, scoring=pauc_scorer, refit=False) gridCV.fit(training_set, training_labels) c_linear.append(gridCV.best_params_["C"]) lin_pauc.append(gridCV.best_score_) print "t-test" #Use t-test to determine better model pval = stats.ttest_ind(lin_pauc, rbf_pauc)[1] if pval > 0.01: best_c = Counter(c_linear).most_common(1)[0][0] final_model = svm.SVC(kernel="linear", C=best_c) elif np.mean(lin_pauc) > np.mean(rbf_pauc): best_c = Counter(c_linear).most_common(1)[0][0] final_model = svm.SVC(kernel="linear", C=best_c) else: best_c = Counter(c_rbf).most_common(1)[0][0] best_gamma = Counter(gammas).most_common(1)[0][0] final_model = svm.SVC(kernel="linear", C=best_c, gamma=best_gamma) print "final model training" #Train final model active_set = active_sites.subsample_sites(training_active_set_size)[0] inactive_set = inactive_sites.subsample_sites(training_inactive_set_size)[0] active_cuts = Features.convert_to_cuts(active_set, bam_data, flank_size, combine_strands, f_offset, r_offset, bias_correction) inactive_cuts = Features.convert_to_cuts(inactive_set, bam_data, flank_size, combine_strands, f_offset, r_offset, bias_correction) active_pwm = Features.get_pwm_scores(active_set) inactive_pwm = Features.get_pwm_scores(inactive_set) #Apply square root transformation active_cuts = np.sqrt(active_cuts) inactive_cuts = np.sqrt(inactive_cuts) print "feature extraction" #Apply feature extraction partition_fn = np.vectorize(lambda x:x**2 + 5) split_pts = Features.get_splits(active_cuts[0].size, partition_fn) center = active_cuts[0].size/2 #For regional feature center size left = center - 15 #For regional feature center size right = center + 15 #For regional feature center size active_features = [] inactive_features = [] for i in xrange(active_cuts.shape[0]): row = active_cuts[i] #Get local features row_split = np.split(row, split_pts) local_slopes = Features.get_slopes(row_split) local_means = np.vectorize(np.mean)(row_split) #Get regional features row_split = np.array([row[(left-30):left], row[left:right], row[right:(right+30)]]) regional_slopes = Features.get_slopes(row_split) regional_means = np.mean(row_split, axis=1) global_mean = [np.mean(row[(center-75):(center+75)])] row_features = np.concatenate((local_slopes, local_means, regional_slopes, regional_means, global_mean, [active_pwm[i]])) active_features.append(row_features) for i in xrange(inactive_cuts.shape[0]): row = inactive_cuts[i] #Get local features row_split = np.split(row, split_pts) local_slopes = Features.get_slopes(row_split) local_means = np.vectorize(np.mean)(row_split) #Get regional features row_split = np.array([row[(left-30):left], row[left:right], row[right:(right+30)]]) regional_slopes = Features.get_slopes(row_split) regional_means = np.mean(row_split, axis=1) global_mean = [np.mean(row[(center-75):(center+75)])] row_features = np.concatenate((local_slopes, local_means, regional_slopes, regional_means, global_mean, [inactive_pwm[i]])) inactive_features.append(row_features) combined_features = np.concatenate((active_features, inactive_features), axis=0) feature_scaler = preprocessing.MaxAbsScaler() training_set = feature_scaler.fit_transform(combined_features) active_labels = [1] * len(active_features) inactive_labels = [-1] * len(inactive_features) training_labels = active_labels + inactive_labels print "final model fit" final_model.fit(training_set, training_labels) model_data = {"svm": final_model, "feature_scaler": feature_scaler} print "pickling" pickle.dump(model_data, open(model_data_file, 'wb')) if __name__ == '__main__': sys.exit(main()) PKˤ0Jgr##defcom/motif_sites.py#--------------------------------------------------------------------- # DeFCoM: A supervised learning genomic footprinter # Copyright (C) 2016 Bryan Quach # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . #---------------------------------------------------------------------- from bisect import bisect_left from os import path import pysam import random class MotifSites(object): """Access and process pseudo-BED annotations for motif sites. Provides functionality for working with motif predictions. Assumes that coordinates are 0-based [start, end). Attributes: _motif_sites: A pysam Tabixfile object. filename: The filename associated with the Tabixfile object. num_sites: Number of motif site annotations in the given file Args: motif_file: A string specifying the path and filename of the pseudo-BED file containing motif sites. The file must be gzipped and have the '.gz' file extension. It should also have an index file with the same name but with the '.tbi' extension appended. For example, '/path/file.bed' should be gzipped to '/path/file.bed.gz' and have an index file called '/path/file.bed.gz.tbi'. Raises: IOError: File could not be found or opened. IOError: File index could not be found. IOError: Gzipped version of file could not be found. """ def __init__(self, motif_file): """Initialize MotifSites class.""" self._motif_sites = self.__load_motif_sites(motif_file) self.filename = motif_file self.num_sites = len(tuple( self._motif_sites.fetch(multiple_iterators=True))) def __del__(self): """Deconstructor for MotifSites. Closes the Tabixfile object in '_motif_sites'. """ try: self._motif_sites.close() except: pass def __load_motif_sites(self, filename): """Load the pseudo-BED file data into a Tabixfile object. Args: filename: A string for the psuedo-BED file. The path should be prepended if the file is located outside the working directory. Returns: A pysam Tabixfile object. Raises: IOError: File could not be found or opened. IOError: File index could not be found. IOError: Gzipped version of file could not be found. """ print "Loading BED file %s..." % path.basename(filename) if not path.isfile(filename): error_msg = "%s could not be found." % filename raise IOError(error_msg) if not path.isfile(filename + ".gz"): error_msg = "%s could not be found." % (filename + ".gz") raise IOError(error_msg) if not path.isfile(filename + ".gz.tbi"): error_msg = "Index file %s not found." % (filename + ".gz.tbi") raise IOError(error_msg) tabixfile = pysam.Tabixfile(filename + ".gz", "r") return tabixfile def get_all_sites(self): """Retrieve all motif sites in the MotifSites object. Retrieves all the motif sites for the MotifSites object and stores them in an iterator. Returns: A pysam iterator containing motif site data. """ return self._motif_sites.fetch(parser=pysam.asTuple(), multiple_iterators=True) def get_chr_sites(self, chrom, multi_iter=False): """Retrieve all motif sites on a specified chromosome. Retrieves all the motif sites in the MotifSites object from a specified chromosome and stores them in an iterator. Args: chrom: A string specifying the chromosome of interest. Must follow the naming convention specified by the pseudo-BED file. multi_iter: A boolean indicating whether to enable multipe iterators. This should be 'True' if multiple calls are made to this method concurrently for the same MotifSites object. Returns: A pysam iterator that contains motif site data. Raises: ValueError - The genomic coordinates are out of range or invalid. """ return self._motif_sites.fetch(reference=chrom, parser=pysam.asTuple(), multiple_iterators=multi_iter) def get_sites(self, chrom, start, end, multi_iter=False): """Retrieve motif sites overlapping a genomic interval. Retrieves the motif sites in the MotifSites object that overlap with a specified genomic interval. Does not require complete overlap. Args: chrom: A string specifying the chromosome of interest. Must follow the naming convention specified by the pseudo-BED file. start: An int for the 0-based start position of the genomic region. end: An int for the 0-based end position of the genomic region. The position 'end' denotes one position past the region of interest. multi_iter: A boolean indicating whether to enable multipe iterators. This should be 'True' if multiple calls are made to this method concurrently for the same MotifSites object. Returns: A pysam iterator that contains motif site data. Raises: ValueError - The genomic coordinates are out of range or invalid. """ return self._motif_sites.fetch(chrom, start, end, parser=pysam.asTuple(), multiple_iterators=multi_iter) def subsample_sites(self, n_samples, iterations=1): """Choose a random subset of motif sites. Chooses a random subset of motif sites assuming that n_samples is less than the total number of sites. If n_samples is greater than or equal to the total number of sites, then the whole set is returned. Args: n_samples: The number of sites to subsample. Returns: A 2D list containing pseudo-BED format data for the sites selected. """ ######## # TO DO: APPLY PARALLEL PROCESSING FOR OBTAINING SAMPLES ######## def binary_search(a, x, lo=0, hi=None): """Binary search algorithm.""" hi = hi if hi is not None else len(a) pos = bisect_left(a,x,lo,hi) return (True if pos != hi and a[pos] == x else False) counter = 0 site_list = [[] for i in xrange(iterations)] indices = range(self.num_sites) #Apply random sampling if needed sample_idx = [] for i in xrange(iterations): if self.num_sites <= n_samples: n_samples = self.num_sites subset = indices else: subset = random.sample(indices, n_samples) subset.sort() sample_idx.append(subset) motif_sites = self._motif_sites.fetch(parser=pysam.asTuple(), multiple_iterators=True) for site in motif_sites: for i in xrange(iterations): idx = sample_idx[i] if binary_search(idx, counter, hi=n_samples): site_list[i].append(site) counter += 1 return site_list PKˤ0J#w$w$defcom/features.py#--------------------------------------------------------------------- # DeFCoM: A supervised learning genomic footprinter # Copyright (C) 2016 Bryan Quach # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . #---------------------------------------------------------------------- from types import * import numpy as np import math class Features(object): """Functions for applying feature extraction.""" @classmethod def get_splits(self, length, partition_fn): """Get indices for dividing a vector into multiple parts. Gets the indices for partitioning a vector into multiple subvectors based on a supplied partition function. If the partition function does not evenly divide the vector then the segments furthest from the vector center will include the remainder. This function is designed to easily create partitions whose sizes are approximately symmetric about the vector center but may vary in length otherwise. The list of indices matches the format required as an argument for numpy.split(). Args: length: An int specifying the size of the vector to partition. partition_fn: A vectorized callable that accepts and returns int types. A passed int value denotes the distance from the center of the vector in partition function distance. A returned int should specify the size of the partition at the provided partition distance. We define a partition distance unit to be the number of partitions away from a specified point where 0 denotes an initial partition that borders a specified point. Returns: An array containing the indices at which a vector of the size specified would be split based on the given partition function. Raises: AssertionError: The length supplied is not positive. TypeError: Partition function is not numpy vectorized. TypeError: The 'length' argument is not the correct type. ValueError: Non-positive value returned by partition function. """ if type(length) is not IntType: error_msg = "Vector supplied must be an array-like or numpy array" raise TypeError(error_msg) assert length > 0, "Length value is not positive" if type(partition_fn) is not np.lib.function_base.vectorize: error_msg = "Partition function is not vectorized" raise TypeError(error_msg) #Get the number of partitions needed to span the vector pdist = 0 error_msg = "Non-positive value returned by partition function" if partition_fn(pdist) < 1: raise ValueError(error_msg) windows_sum = partition_fn(pdist) * 2 while windows_sum <= length: pdist += 1 if partition_fn(pdist) < 1: raise ValueError(error_msg) windows_sum = partition_fn(np.arange(pdist + 1)).sum() * 2 #Get the window sizes to use right_windows = partition_fn(np.arange(pdist)) left_windows = partition_fn(np.arange(pdist - 1, -1, -1)) #Update the first window size to include the remainder remainder = length - left_windows.sum() * 2 left_remainder = int(math.floor(remainder / 2.0)) left_windows[0] += left_remainder #Make partitions left_splits = [left_windows[:(i+1)].sum() for i in range(left_windows.size)] center = left_windows.sum() right_splits = [right_windows[:(i+1)].sum() + center for i in range(right_windows.size-1)] splits = np.append(left_splits, right_splits) return splits @classmethod def slope(self, x, y): """Calculate a slope. Args: x = An array-like of x-coordinates. y = An array-like of y-coordinates. Returns: A numpy float64 value for the slope. Raises: AssertError: Insufficient number of coordinates given. ValueError: x and y coordinate vectors not equal in length. """ assert len(y) > 1, "Insufficient number of coordinates given." if type(x) is not np.ndarray: x = np.array(x) if type(y) is not np.ndarray: y = np.array(y) numerator = (x * y).mean() - x.mean() * y.mean() denominator = (x**2).mean() - x.mean()**2 return numerator/denominator @classmethod def get_slopes(self, vector): """Extract slope values for segments of a given vector. Computes the slope assuming that the segments of the given vector represent y-coordinate values and x-coordinate values are spaced 1 unit apart. Args: vector: An array-like of numpy arrays with numeric values. Returns: A numpy array with slope values for each segment in the given vector. Raises: AttributeError: Segments of vector are not numpy arrays. """ get_x = lambda x: np.arange(x.size) x_vals = [get_x(y) for y in vector] return np.array([self.slope(x,y) for x,y in zip(x_vals, vector)]) @classmethod def get_pwm_scores(self, motif_sites, score_column=4): """Retrieve PWM scores given a list of motif_sites. Gets PWM scores from a 2D list of motif site data in Pseudo-BED format and stores them into a list. Args: motif_sites: A 2D list or iterator of motif sites in Pseudo-BED format. score_column: The 2nd dimension index in "motif_sites" that specifies the position in the list that contains the PWM score. By default it is assumed that this is position 4. """ scores = [] for site in motif_sites: scores.append(float(site[score_column])) return scores @classmethod def convert_to_cuts(self, motif_sites, aln_data, flank_size, combine_strands=True, f_offset=0, r_offset=0, use_weights=False): """Convert motif sites into a matrix of DNaseI digestion site counts. Retrieves a vector of DNaseI digestion sites for each motif site in the 'motif_sites' list. Puts all the vectors into a 2D list. Args: motif_sites: A 2D list or iterator of motif sites in Pseudo-BED format. aln_data: An Alignment object. flank_size: An int specifying how many bases upstream and downstream of the motif site center to include. The value specifies one flank. The total included bases is flank_size*2. combine_strands: A boolean indicating if DNaseI digestion site counts should be aggregated across strands. If 'False' then the forward and reverse strand digestion count vectors are concatenated with the forward strand being first. f_offset: An int denoting the offset downstream from the 5' end of a forward (+) strand read. Equivalent to read shifting. r_offset: An int denoting the offset upstream from the 5' end of a reverse (-) strand read. Equivalent to read shifting. use_weights: A boolean indicating whether bias correction weights should be used for each read. Returns: A 2D list where each inner list contains DNaseI digestion counts per base within and/or around a motif site. Each index of the first dimension corresponds to a motif site. Raises: ValueError - The genomic coordinates are out of range or invalid. """ cut_matrix = [] for motif_site in motif_sites: chrom = motif_site[0] start = int(motif_site[1]) end = int(motif_site[2]) midpoint = (start+end)/2 #Extend region by flank size start = midpoint - flank_size end = midpoint + flank_size #Get cut sites if combine_strands: cut_sites = aln_data.get_cut_sites(chrom, start, end, strand="combined", f_offset=f_offset, r_offset=r_offset, use_weights=use_weights, multi_iter=False) else: cut_sites = aln_data.get_cut_sites(chrom, start, end, strand=None, f_offset=f_offset, r_offset=r_offset, use_weights=use_weights, multi_iter=False) cut_matrix.append(cut_sites) return cut_matrix PK2J?x"SSdefcom/performance_stats.py#--------------------------------------------------------------------- # DeFCoM: A supervised learning genomic footprinter # Copyright (C) 2016 Bryan Quach # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . #---------------------------------------------------------------------- import uuid import subprocess from numpy import concatenate from sklearn.metrics import roc_curve, auc """Methods for computing performance statistics.""" def pAUC(y_true, y_score, fpr_cutoff): """ Calculate partial Area Under ROC (pAUC). Computes a pAUC value given a specified false positive rate (FPR) cutoff. It is important to note that the exact pAUC cannot be computed. The accuracy of the calculation depends on the resolution of data points produced by an intermediary ROC curve. The FPR data point closest to and greater than the cutoff specified will be used for interpolation to determine the pAUC at the specified FPR cutoff. For these FPR values, the highest associated TPR values are used. Args: y_true: Array-like of true binary class labels in range {0, 1} or {-1, 1} corresponding to y_score. The larger value represents the positive class. y_score: Array-like of target scores with higher scores indicating more confidence in the positive class. fpr_cutoff: A float specifying the FPR cutoff to use in computing the pAUC. Must be in the interval (0,1). Returns: A float representing the pAUC value. Raises: AssertionError: The FPR cutoff is not in the interval (0,1) """ error_msg = "FPR cutoff must be in (0,1)" assert fpr_cutoff > 0.0 and fpr_cutoff < 1.0 fpr, tpr, trash = roc_curve(y_true, y_score, drop_intermediate=False) index_low = len([1 for i in fpr if i < fpr_cutoff])-1 index_high = index_low + 1 #Get interpolated TPR value from FPR cutoff if index_low == -1: #No ROC data points lower than cutoff x0 = fpr[0] try: x1 = min([xv for (c,xv) in zip([x>x0 for x in fpr],fpr) if c]) except ValueError: x1 = x0 y0 = max([yv for (c,xv,yv) in zip([x==x0 for x in fpr],fpr,tpr) if c]) y1 = max([yv for (c,xv,yv) in zip([x==x1 for x in fpr],fpr,tpr) if c]) #Apply line derived from two closest points from FPR cutoff tpr_cutoff = fpr_cutoff*((y1-y0)/(x1-x0)) + ((x1*y0-x0*y1)/(x1-x0)) #Segment full ROC to get partial ROC fpr = [0.0] + [fpr_cutoff] tpr = [0.0] + [tpr_cutoff] elif index_high == len(fpr): #No ROC data points higher than cutoff try: x0 = max([xv for (c,xv) in zip([x. #---------------------------------------------------------------------- from os import path from pysam import FastaFile class Genome(object): """Access and process genomic sequence information. Provides functionality to retrieve and process genomic sequence data. The class acts as a wrapper around the pysam FastaFile class. Attributes: _genome: A pysam Fastafile object. Args: fasta_file: A string containing the path and filename for a genome FASTA file. The file must have an index file with the same name but with the '.fai' extension appended. For example, the index file for '/path/file.fa' should be '/path/file.fa.fai'. Raises: IOError: FASTA file could not be found or opened. IOError: FASTA file index could not be found. """ def __init__(self, fasta_file): """Initialize Genome class.""" self._genome = self._load_fasta(fasta_file) def __del__(self): """Deconstructor for the Genome class.""" try: self._genome.close() except: pass def _load_fasta(self, filename): """Load the FASTA file data into a FastaFile object. Args: filename: A string for the FASTA file to load. The path should be prepended if the file is located outside the working directory. Returns: A pysam FastaFile object. Raises: IOError: FASTA file could not be found or opened. IOError: FASTA file index could not be found. """ print "Loading FASTA file %s..." % path.basename(filename) if not path.isfile(filename): error_msg = "%s could not be found." % filename raise IOError(error_msg) if not path.isfile(filename + ".fai"): error_msg = "Index file %s not found." % (filename + ".fai") raise IOError(error_msg) return FastaFile(filename) def get_genome_filename(self): """Get the associated FASTA file name. Returns: A string for the FASTA file name. """ return path.basename(self._genome.filename) def get_chr_names(self): """Return the chromosome names associated with the genome. Returns: A tuple containing chromosome names as strings. """ return self._genome.references def get_chr_length(self, chrom): """Get the length of specified chromosome. Args: chrom: A string specifying the chromosome of interest. Must follow the naming convention specified by the FASTA file. Returns: An int value representing the length of the chromosome. Raises: KeyError: Chromosome name could not be found. """ try: return self._genome.get_reference_length(chrom) except KeyError: print ("Genome has the following chromosome/reference names:\n") for i in self._genome.references: print i error_msg = "\n%s not found in reference list!" % chrom raise KeyError(error_msg) def get_sequence(self, chrom, start, end, strand="+"): """Get nucleotide sequence from genomic coordinates. Retrieves the nucleotide sequence for a specified genomic region using BED format chromosome coordinates i.e., 0-based indexing [start, end). Args: chrom: A string specifying the chromosome of interest. Must follow the naming convention specified by the FASTA file. start: An int for the 0-based start position of the genomic region. end: An int for the 0-based end position of the genomic region. The position 'end' denotes one position past the region of interest. strand: The strand from which to obtain the cut counts. Must be either '+' or '-'. Returns: A string of uppercase nucleotides corresponding to specified strand of the genomic region supplied. Raises: KeyError - The chromosome name is specified incorrectly. ValueError - The genomic coordinates are out of range, are invalid, or the file does not permit random access. ValueError - 'strand' is not specified correctly. """ if strand == "+": return self._genome.fetch(chrom, start, end).upper() elif strand == "-": dna = self._genome.fetch(chrom, start+1, end+1).upper() return self.reverse_complement(dna) else: raise ValueError("strand must be '+' or '-'") @classmethod def complement(self, sequence): """Get the complement of the given DNA sequence. Args: sequence: A string from alphabet {A,C,G,T,N,a,c,g,t,n}. Returns: A string with the complementary DNA sequence of the sequence provided. Raises: KeyError: The string provided contains characters other than 'A', 'C', 'G', 'T', 'N', 'a', 'c', 'g', 't', and 'n'. """ seq_dict = {'A':'T', 'T':'A', 'G':'C', 'C':'G', 'N':'N', 'a':'T', 't':'A', 'g':'C', 'c':'G', 'n':'N',} try: return "".join([seq_dict[i] for i in sequence.upper()]) except KeyError: error_msg = "DNA sequence must be from {A,C,G,T,N,a,c,g,t,n}" raise KeyError(error_msg) @classmethod def reverse_complement(self, sequence): """Reverse complement the given DNA sequence. Args: sequence: A string from alphabet {A,C,G,T,N,a,c,g,t,n}. Returns: A string with the reverse complement DNA sequence of the sequence provided. Raises: KeyError: The string provided contains characters other than 'A', 'C', 'G', 'T', and 'N'. """ return self.complement(sequence)[::-1] PKˤ0Jbbdefcom/__init__.py#--------------------------------------------------------------------- # DeFCoM: A supervised learning genomic footprinter # Copyright (C) 2016 Bryan Quach # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . #---------------------------------------------------------------------- PKˤ0J.f@@defcom/alignment.py#--------------------------------------------------------------------- # DeFCoM: A supervised learning genomic footprinter # Copyright (C) 2016 Bryan Quach # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . #---------------------------------------------------------------------- from os import path import pysam class Alignments(object): """Access and process sequence alignment data. The Alignment class acts as a wrapper class for pysam to provide additional footprinting specific data processing functionality. Information about pysam can be found at http://pysam.readthedocs.org/. Attributes: _alignment: A pysam AlignmentFile object. Args: aln_file: A string containing the path and filename for a sorted BAM file. The file must have an index file with the same name but with the '.bai' extension appended. For example, the index file for '/path/file.bam' should be '/path/file.bam.bai'. Raises: IOError: BAM file could not be found. AssertionError: File is not a BAM file. IOError: BAM file index could not be found. """ def __init__(self, aln_file): """Inits the Alignment class.""" self._alignment = self._load_alignment(aln_file) def __del__(self): """Deconstructor. Closes the AlignmentFile object in '_alignment'. """ try: self._alignment.close() except: pass def _load_alignment(self, filename): """Load the BAM data into an AlignmentFile object. Args: filename: A string for the BAM file to load. The path should be prepended if the file is located outside the working directory. Returns: A pysam AlignmentFile object. Raises: IOError: BAM file could not be found. AssertionError: File is not a BAM file. IOError: BAM file index could not be found. """ if not path.isfile(filename): error_msg = "%s could not be found." % filename raise IOError(error_msg) if path.splitext(filename)[1] != ".bam": error_msg = "%s is not a BAM file." % path.basename(filename) raise AssertionError(error_msg) if not path.isfile(filename + ".bai"): error_msg = "Index file %s not found." % (filename + ".bai") raise IOError(error_msg) print "Loading BAM file %s..." % path.basename(filename) return pysam.AlignmentFile(filename, "rb") def get_mapped_read_count(self): """Get the number of mapped reads in the AlignmentFile object. Returns: A long int corresponding to the number of mapped reads in the '_alignment' object. """ return self._alignment.mapped def get_unmapped_read_count(self): """Get the number of unmapped reads in the AlignmentFile object. Returns: A long int corresponding to the number of unmapped reads in the '_alignment' object. """ return self._alignment.unmapped def get_total_read_count(self): """Get the total number of reads in the Samfile object. Returns: A long int corresponding to the number of total reads in the '_alignment' object. """ return self._alignment.mapped + self._alignment.unmapped def get_reads(self, chrom, start, end, multi_iter=False): """Get reads overlapping a genomic region. Retrieves the set of reads overlapping a specified genomic region using BED format chromosome coordinates i.e., 0-based indexing [start, end). Args: chrom: A string for the name of the chromosome for the desired genomic region. Names are derived from the BAM header. start: An int for the 0-based start position of the genomic region. end: An int for the 0-based end position of the genomic region. The position 'end' denotes one position past the region of interest. multi_iter: A boolean indicating whether to enable multipe iterators. This should be 'True' if multiple calls are made to this method for the same Alignments object and the iterators are used concurrently. Returns: An iterator of AlignedSegment objects. Raises: ValueError - The genomic coordinates are out of range, are invalid, or the file does not permit random access. """ return self._alignment.fetch(chrom, start, end, multiple_iterators=multi_iter) def get_read_density(self, chrom, start, end, strand=None, use_weights=False, multi_iter=False): """Compute the total number of reads overlapping a genomic region. Calculates the read density within a genomic region specified using BED format chromosome coordinates i.e., 0-based indexing [start, end). Strand-specific read density is computed if specified. It is important to note that a read is counted merely if it overlaps the specified region. The read does not have to be fully contained in the region. Args: chrom: A string for the name of the chromosome for the desired genomic region. Names are derived from the BAM header. start: An int for the 0-based start position of the genomic region. end: An int for the 0-based end position of the genomic region. The position 'end' denotes one position past the region of interest. strand: The strand from which to obtain the read density. Must be one of '+', '-', or 'None' (default). If 'None' is specified then reads from both strands are included. multi_iter: A boolean indicating whether to enable multipe iterators. This should be 'True' if multiple calls are made to this method concurrently for the same Alignments object. Returns: A float value corresponding to the total read count observed in the genomic interval. Raises: KeyError - The tag field 'XW' cannot be found for a read. ValueError - The genomic coordinates are out of range, are invalid, or the file does not permit random access. ValueError - 'strand' is not specified correctly. """ reads = self._alignment.fetch(chrom, start, end, multiple_iterators=multi_iter) read_data = [] if use_weights: for read in reads: read_data.append((read.is_reverse, read.is_unmapped, read.get_tag('XW'))) if strand == "+": read_count = 0.0 for i in read_data: if not i[0] and not i[1]: read_count += i[2] elif strand == "-": read_count = 0.0 for i in read_data: if i[0] and not i[1]: read_count += i[2] elif strand == None: read_count = sum([i[2] for i in read_data if not i[1]]) else: raise ValueError("strand must be '+', '-', or None") else: read_data = [(read.is_reverse, read.is_unmapped) for read in reads] if strand == "+": read_count = 0.0 for i in read_data: if not i[0] and not i[1]: read_count += 1.0 elif strand == "-": read_count = 0.0 for i in read_data: if i[0] and not i[1]: read_count += 1.0 elif strand == None: read_count = sum([1.0 for i in read_data if not i[1]]) else: raise ValueError("strand must be '+', '-', or None") return read_count def get_weights(self, chrom, start, end, strand=None, f_offset=0, r_offset=0, multi_iter=False): """Retrieve bias correction weights for a given region. Retrieves the per position bias correction weights within the specified genomic interval. The per read weights for a position are totaled to get the per position weight. The BAM file must contain bias correction information or else this method will raise an error. Args: chrom: A string for the name of the chromosome for the desired genomic region. Names are derived from the BAM header. start: An int for the 0-based start position of the genomic region. end: An int for the 0-based end position of the genomic region. The position 'end' denotes one position past the region of interest. strand: The strand from which to obtain the weights. Must be one of '+', '-', or 'None' (default). If 'None' is specified then reads from both strands are included. f_offset: An int denoting the offset downstream from the 5' end of a forward (+) strand read. Equivalent to read shifting. r_offset: An int denoting the offset upstream from the 5' end of a reverse (-) strand read. Equivalent to read shifting. multi_iter: A boolean indicating whether to enable multipe iterators. This should be 'True' if multiple calls are made to this method concurrently for the same Alignments object. Returns: A 2-D list of bias correction weights where the first dimension represents a genomic position. Index 0 corresponds to 'start'. The second dimension is a list of all the read weights at a position. Raises: KeyError - The tag field 'XW' cannot be found for a read. ValueError - The genomic coordinates are out of range, are invalid, or the file does not permit random access. ValueError - 'strand' is not specified correctly. """ reads = self._alignment.fetch(chrom, start, end, multiple_iterators=multi_iter) pos_weights = {} neg_weights = {} for read in reads: if read.is_unmapped: continue weight = read.get_tag('XW') if read.is_reverse: cut_site = read.reference_end - r_offset - 1 if cut_site >= start and cut_site < end: current_weight = neg_weights.get(cut_site, 0.0) neg_weights[cut_site] = current_weight + weight else: cut_site = read.reference_start + f_offset if cut_site >= start and cut_site < end: current_weight = pos_weights.get(cut_site, 0.0) pos_weights[cut_site] = current_weight + weight if strand is None: pos_weights = [pos_weights.get(x, 0.0) for x in range(start, end)] neg_weights = [neg_weights.get(x, 0.0) for x in range(start, end)] combined_weights = [] for x, y in zip(pos_weights, neg_weights): combined_weights.append(x + y) return combined_weights elif strand == '+': return [pos_weights.get(x, 0.0) for x in range(start, end)] elif strand == '-': return [neg_weights.get(x, 0.0) for x in range(start, end)] else: raise ValueError("strand must be '+', '-', or None") def get_cut_sites(self, chrom, start, end, strand=None, f_offset=0, r_offset=0, use_weights=False, multi_iter=False): """Retrieve DNaseI digestion sites for a genomic region. Makes a list of counts relative to the start coordinate of the genomic region specified using BED format chromosome coordinates i.e., 0-based indexing [start, end). Cuts are assumed to be 5' read ends unless offsets are specified. Args: chrom: A string for the name of the chromosome for the desired genomic region. Names are derived from the BAM header. start: An int for the 0-based start position of the genomic region. end: An int for the 0-based end position of the genomic region. The position 'end' denotes one position past the region of interest. strand: The strand from which to obtain the cut counts. Must be one of '+', '-', 'combined' or 'None' (default). If 'combined' is specified then reads from both strands are aggregated. If 'None' is specified the read vectors for each strand are concatenated with the '+' strand vector being first. f_offset: An int denoting the offset downstream from the 5' end of a forward (+) strand read. Equivalent to read shifting. r_offset: An int denoting the offset upstream from the 5' end of a reverse (-) strand read. Equivalent to read shifting. use_weights: A boolean indicating whether bias correction weights should be used for each read. The weights are expected to be stored as an optional tag 'XW' for each read. multi_iter: A boolean indicating whether to enable multipe iterators. This should be 'True' if multiple calls are made to this method concurrently for the same Alignments object. Returns: A list of counts as floats with positions corresponding to the specified genomic interval. Raises: KeyError - The tag field 'XW' cannot be found for a read. ValueError - The genomic coordinates are out of range, are invalid, or the file does not permit random access. ValueError - 'strand' is not specified correctly. """ if use_weights: return self.get_weights(chrom, start, end, strand, f_offset, r_offset, multi_iter) reads = self._alignment.fetch(chrom, start, end, multiple_iterators=multi_iter) pos_cuts = {} neg_cuts = {} for read in reads: if read.is_unmapped: continue if read.is_reverse: cut_site = read.reference_end - r_offset - 1 if cut_site >= start and cut_site < end: neg_cuts[cut_site] = neg_cuts.get(cut_site, 0.0) + 1.0 else: cut_site = read.reference_start + f_offset if cut_site >= start and cut_site < end: pos_cuts[cut_site] = pos_cuts.get(cut_site, 0.0) + 1.0 if strand is None: pos_cuts = [pos_cuts.get(x, 0.0) for x in range(start, end)] neg_cuts = [neg_cuts.get(x, 0.0) for x in range(start, end)] combined_cuts = ([x + y for x, y in zip(pos_cuts, neg_cuts)]) return combined_cuts elif strand == "combined": pos_cuts = [pos_cuts.get(x, 0.0) for x in range(start, end)] neg_cuts = [neg_cuts.get(x, 0.0) for x in range(start, end)] concat_cuts = pos_cuts + neg_cuts return concat_cuts elif strand == '+': return [pos_cuts.get(x, 0.0) for x in range(start, end)] elif strand == '-': return [neg_cuts.get(x, 0.0) for x in range(start, end)] else: err_msg = "strand must be '+', '-', 'combined', or None (default)" raise ValueError(err_msg) PKf 2J^- &DeFCoM-1.0.1.dist-info/DESCRIPTION.rstUNKNOWN PKe 2JpQb+DeFCoM-1.0.1.dist-info/dependency_links.txthttps://github.com/pysam-developers/pysam https://sourceforge.net/projects/numpy/ https://github.com/scipy/scipy/releases https://github.com/scikit-learn/scikit-learn/releases PKf 2J mr$DeFCoM-1.0.1.dist-info/metadata.json{"extensions": {"python.details": {"contacts": [{"email": "bryancquach@gmail.com", "name": "Bryan Quach", "role": "author"}], "document_names": {"description": "DESCRIPTION.rst"}, "project_urls": {"Home": "https://bitbucket.org/bryancquach/defcom"}}}, "extras": [], "generator": "bdist_wheel (0.26.0)", "keywords": ["defcom", "genomics", "footprinter", "footprinting"], "license": "PSF", "metadata_version": "2.0", "name": "DeFCoM", "run_requires": [{"requires": ["Cython (>0.20.0)", "docutils (>=0.3)", "numpy (>=0.18)", "pysam (>=0.9)", "scikit-learn (>=0.16)", "scipy (>=0.14)"]}], "summary": "A supervised learning genomic footprinter", "version": "1.0.1"}PKe 2J@$DeFCoM-1.0.1.dist-info/top_level.txtdefcom PKf 2J''\\DeFCoM-1.0.1.dist-info/WHEELWheel-Version: 1.0 Generator: bdist_wheel (0.26.0) Root-Is-Purelib: true Tag: py2-none-any PKf 2JPDeFCoM-1.0.1.dist-info/METADATAMetadata-Version: 2.0 Name: DeFCoM Version: 1.0.1 Summary: A supervised learning genomic footprinter Home-page: https://bitbucket.org/bryancquach/defcom Author: Bryan Quach Author-email: bryancquach@gmail.com License: PSF Keywords: defcom genomics footprinter footprinting Platform: UNKNOWN Requires-Dist: Cython (>0.20.0) Requires-Dist: docutils (>=0.3) Requires-Dist: numpy (>=0.18) Requires-Dist: pysam (>=0.9) Requires-Dist: scikit-learn (>=0.16) Requires-Dist: scipy (>=0.14) UNKNOWN PKf 2JDeFCoM-1.0.1.dist-info/RECORDDeFCoM-1.0.1.data/scripts/predict.py,sha256=0aLgjMMfriBEf_sBo1vOexNAmsAfUKjwWNwfF88zdGI,3503 DeFCoM-1.0.1.data/scripts/train.py,sha256=bJA5kDJA0fv6qdodWs3QnW6ZUxMK3Laa51B4iJg9Aek,10747 DeFCoM-1.0.1.dist-info/DESCRIPTION.rst,sha256=OCTuuN6LcWulhHS3d5rfjdsQtW22n7HENFRh6jC6ego,10 DeFCoM-1.0.1.dist-info/METADATA,sha256=HUzBa5NeXdG9KtFIdghbcw6H3RAks2Jo4TogBXOxf1A,492 DeFCoM-1.0.1.dist-info/RECORD,, DeFCoM-1.0.1.dist-info/WHEEL,sha256=JTb7YztR8fkPg6aSjc571Q4eiVHCwmUDlX8PhuuqIIE,92 DeFCoM-1.0.1.dist-info/dependency_links.txt,sha256=9rwmV2Ab53SsVQsDUt6KHmKn0pLXCSddSZjK2-istSM,176 DeFCoM-1.0.1.dist-info/metadata.json,sha256=7ljBNApUpzB8FflxEPbKOezcIRn4rYmEI3soTlCX8zU,660 DeFCoM-1.0.1.dist-info/top_level.txt,sha256=ZecTXQW5bcxY2cPBZY9iFVn5ZEULnjeL4hq-QzeofYg,7 defcom/__init__.py,sha256=XYwd8cunol02f3JOtZyn7HfYqG6pZSqRlzsM8sPY4TE,866 defcom/alignment.py,sha256=cxGd-TBpJlFXtIV78YBbxvL8Pi12BSkoFSoxqPiMQn0,16407 defcom/features.py,sha256=qU1pSmYdAInGmCosVb1lH_E4BHY6dUv8neMY3E1BU0w,9335 defcom/genome.py,sha256=ER0wy4ZEFRv4kokCG6lL-JvFuQCUoiq_GWyabP00JUs,6919 defcom/motif_sites.py,sha256=t3crtNMwmwMT-2XAPBPpASjIAY-INSPdL5vYrt61rGk,7971 defcom/performance_stats.py,sha256=s7I2FRVl8xxnoMpdqUfd_limOXWcRrZscXAEXYoDzao,4435 PK2JU $DeFCoM-1.0.1.data/scripts/predict.pyPK2J))" DeFCoM-1.0.1.data/scripts/train.pyPKˤ0Jgr##,8defcom/motif_sites.pyPKˤ0J#w$w$Wdefcom/features.pyPK2J?x"SS)|defcom/performance_stats.pyPKˤ0J_defcom/genome.pyPKˤ0Jbbdefcom/__init__.pyPKˤ0J.f@@|defcom/alignment.pyPKf 2J^- &DeFCoM-1.0.1.dist-info/DESCRIPTION.rstPKe 2JpQb+DeFCoM-1.0.1.dist-info/dependency_links.txtPKf 2J mr$ DeFCoM-1.0.1.dist-info/metadata.jsonPKe 2J@$DeFCoM-1.0.1.dist-info/top_level.txtPKf 2J''\\*DeFCoM-1.0.1.dist-info/WHEELPKf 2JPDeFCoM-1.0.1.dist-info/METADATAPKf 2JDeFCoM-1.0.1.dist-info/RECORDPK`