[med-svn] [fitgpc] 01/02: Imported Upstream version 0.0.20130418
Andreas Tille
tille at debian.org
Fri Feb 7 16:00:13 UTC 2014
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository fitgpc.
commit 915099fd9d23f17c4898b29b97f02e83a29b132c
Author: Andreas Tille <tille at debian.org>
Date: Fri Feb 7 17:03:52 2014 +0100
Imported Upstream version 0.0.20130418
---
README.txt | 79 ++++++
fitGCP.py | 795 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 874 insertions(+)
diff --git a/README.txt b/README.txt
new file mode 100644
index 0000000..747b68d
--- /dev/null
+++ b/README.txt
@@ -0,0 +1,79 @@
+Help on python script fitGCP.py
+
+Fits mixtures of probability distributions to genome coverage profiles using an
+EM-like iterative algorithm.
+
+The script uses a SAM file as input and parses the mapping information and
+creates a Genome Coverage Profile (GCP). The GCP is written to a file, such that
+this step can be skipped the next time.
+The user provides a mixture model that is fitted to the GCP. Furthermore, the
+user may specify initial parameters for each model.
+
+As output, the script generates a text file containing the final set of fit
+parameters and additional information about the fitting process. A log file
+contains the current set of parameters in each step of the iteration. If
+requested, a plot of the GCP and the fitted distributions can be created.
+
+REQUIREMENTS:
+-------------------------------------------------------------------------------
+Python 2.7
+Python packages numpy, scipy, pysam
+
+
+USAGE:
+-------------------------------------------------------------------------------
+fitGCP runs on the command line. The following command describes the general
+structure:
+
+python fitGCP.py [options] NAME
+
+fitGCP fits mixtures of probability distributions to genome coverage profiles using
+an EM-like iterative algorithm.
+
+The script uses a SAM file as input and parses the mapping information and
+creates a Genome Coverage Profile (GCP). The GCP is written to a file, such that
+this step can be skipped the next time.
+The user provides a mixture model that is fitted to the GCP. Furthermore, the
+user may specify initial parameters for each model.
+
+As output, the script generates a text file containing the final set of fit
+parameters and additional information about the fitting process. A log file
+contains the current set of parameters in each step of the iteration. If
+requested, a plot of the GCP and the fitted distributions can be created.
+
+Parameters:
+NAME: Name of SAM file to analyze.
+
+
+Options:
+ -h, --help show this help message and exit
+ -d DIST, --distributions=DIST
+ Distributions to fit. z->zero; n: nbinom (MOM); N:
+ nbinom (MLE); p:binom; t: tail. Default: zn
+ -i STEPS, --iterations=STEPS
+ Maximum number of iterations. Default: 50
+ -t THR, --threshold=THR
+ Set the convergence threshold for the iteration. Stop
+ if the change between two iterations is less than THR.
+ Default: 0.01
+ -c CUTOFF, --cutoff=CUTOFF
+ Specifies a coverage cutoff quantile such that only
+ coverage values below this quantile are considered.
+ Default: 0.95
+ -p, --plot Create a plot of the fitted mixture model. Default:
+ False
+ -m MEAN, --means=MEAN
+ Specifies the initial values for the mean of each
+ Poisson or Negative Binomial distribution. Usage: -m
+ 12.4 -m 16.1 will specify the means for the first two
+ non-zero/tail distributions. The default is calculated
+ from the data.
+ -a ALPHA, --alpha=ALPHA
+ Specifies the initial values for the proportion alpha
+ of each distribution. Usage: For three distributions
+ -a 0.3 -a 0.3 specifies the proportions 0.3, 0.3 and
+ 0.4. The default is equal proportions for all
+ distributions.
+ -l, --log Enable logging. Default: False
+ --view Only view the GCP. Do not fit any distribution.
+ Respects cutoff (-c). Default: False
\ No newline at end of file
diff --git a/fitGCP.py b/fitGCP.py
new file mode 100644
index 0000000..2613daa
--- /dev/null
+++ b/fitGCP.py
@@ -0,0 +1,795 @@
+#!/usr/bin/python
+"""
+Created on Fri Aug 31 2012 14:05
+
+Copyright (c) 2012, Martin S. Lindner and Maximilian Kollock, LindnerM at rki.de,
+Robert Koch-Institut, Germany,
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * The name of the author may not be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL MARTIN S. LINDNER OR MAXIMILIAN KOLLOCK BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+"""
+
+# Command-line help text handed to OptionParser below; optparse expands
+# "%prog" to the script name at runtime.  (Runtime string -- duplicated,
+# with the same wording, in README.txt.)
+usage = """
+%prog [options] NAME
+
+Help on python script fitGCP.py
+
+Fits mixtures of probability distributions to genome coverage profiles using an
+EM-like iterative algorithm.
+
+The script uses a SAM file as input and parses the mapping information and
+creates a Genome Coverage Profile (GCP). The GCP is written to a file, such that
+this step can be skipped the next time.
+The user provides a mixture model that is fitted to the GCP. Furthermore, the
+user may specify initial parameters for each model.
+
+As output, the script generates a text file containing the final set of fit
+parameters and additional information about the fitting process. A log file
+contains the the current set of parameters in each step of the iteration. If
+requested, a plot of the GCP and the fitted distributions can be created.
+
+PARAMETER:
+
+NAME: Name of SAM file to analyze.
+"""
+
+
+import pysam
+import numpy as np
+import scipy.stats as stats
+import sys
+import os
+from collections import namedtuple
+import time
+import cPickle
+from optparse import OptionParser
+from scipy.special import digamma, betainc
+from scipy.optimize import newton
+
+
+"""----------------------------------------------------------------------------
+ Define the distributions
+----------------------------------------------------------------------------"""
+
+class Distribution:
+    """ Abstract base class of all mixture components.  Concrete
+    distributions override pmf() and estimate_par(); the EM-style fitting
+    loop only talks to this interface. """
+    _p1 = None   # first shape parameter (meaning depends on subclass)
+    _p2 = None   # second shape parameter (may stay None for some subclasses)
+    _name = "General Distribution"
+    _dof = 0 # Number of degrees of freedom
+
+    alpha = 1.   # mixture proportion of this component
+    def __str__(self):
+        """ return the name of the distribution """
+        return self._name
+
+    def set_par(self,p1=None, p2=None):
+        """ explicitly set a parameter """
+        # NOTE(review): "p1 != None" should idiomatically be "p1 is not None"
+        # (also numpy-safe); behavior left untouched here.
+        if p1 != None:
+            self._p1 = p1
+        if p2 != None:
+            self._p2 = p2
+
+    def pmf(self, x):
+        """ return the value of the probability mass function at x """
+        # base class: zero mass everywhere (works for scalars and arrays)
+        return x*0.
+
+    def estimate_par(self, data=None, weights=None):
+        """ estimate the distribution parameters from the data and the weights
+        (if provided) """
+        pass
+
+    def init_par(self, mean=None, var=None):
+        """ estimate initial distribution parameters given the mean and
+        variance of the data"""
+        pass
+
+    def report_stats(self, width=20):
+        """ return a string that reports information about the distribution """
+        return str(self._name).ljust(width) + str(self.alpha).ljust(width) + \
+            str(self._p1).ljust(width) + str(self._p2).ljust(width)
+
+
+class Zero(Distribution):
+    """ Point mass at coverage 0 (models the uncovered genome fraction). """
+    _name = "Zero"
+    _dof = 1
+
+    def pmf(self,x):
+        # vectorized answer for arrays, plain float for scalars
+        if isinstance(x,np.ndarray):
+            return (x==0).astype(np.float)
+        else:
+            return float(x==0)
+
+
+class NBinom(Distribution):
+    """ Negative binomial component; _p1/_p2 are the (n, p) parameters as
+    used by scipy.stats.nbinom. """
+    _name = "NBinom"
+    _dof = 3
+    _use_MOM = False   # True: method of moments; False: MLE via Newton
+
+    def pmf(self, x):
+        return stats.nbinom.pmf(x,self._p1,self._p2)
+
+    def estimate_par(self, data, weights=None):
+        # NOTE(review): with a numpy array, "weights == None" relies on the
+        # old scalar comparison semantics; "weights is None" is the safe form.
+        if weights == None:
+            weights = data*0. + 1.
+        # weighted sample mean/variance; tiny epsilon guards div-by-zero
+        norm = np.sum(weights)
+        mean = np.sum(data*weights)/(norm + 10**(-25))
+        var = np.sum((data - mean)**2 * weights) / (norm + 10**(-25))
+
+        if self._use_MOM:
+            # method-of-moments requires overdispersion (var > mean)
+            if var < mean:
+                print "Warning: var < mean"
+                var = 1.01*mean
+            self._p1 = mean**2 / (var - mean)
+            self._p2 = mean / var
+        else:
+            # MLE: root of the derivative of the weighted log-likelihood
+            # with respect to parameter 1
+            def dp1_llnbinom(param,obs,obs_w):
+                # param: parameter 1
+                # obs: observed values
+                # obs_w: weight of each value
+                N = np.sum(obs_w)
+                data_mean = np.sum(obs*obs_w)/(N)
+                return np.sum(digamma(obs+param)*obs_w) - N*digamma(param) + \
+                    N*np.log(data_mean/(param+data_mean))
+            try:
+                self._p1 = newton(dp1_llnbinom,self._p1, args=(data,weights),
+                    maxiter=10000)
+                self._p2 = (self._p1)/(self._p1+mean)
+            except:
+                # NOTE(review): bare except -- deliberately falls back to
+                # method of moments whenever Newton's method fails, but it
+                # also swallows KeyboardInterrupt etc.
+                print "Warning: MLE for negative binomial failed. Using MOM."
+                if var < mean:
+                    print "Warning: var < mean"
+                    var = 1.01*mean
+                self._p1 = mean**2 / (var - mean)
+                self._p2 = mean / var
+
+
+
+    def report_stats(self, width=20):
+        """ return a string that reports information about the distribution """
+        return str(self._name).ljust(width) + str(self.alpha).ljust(width) + \
+            str(self._p1).ljust(width) + str(self._p2).ljust(width) + \
+            str(stats.nbinom.mean(self._p1,self._p2)).ljust(width)
+
+
+
+class Poisson(Distribution):
+    """ Poisson component; _p1 is the rate (mean) parameter. """
+    _name = "Poisson"
+    _dof = 2
+
+    def pmf(self,x):
+        return stats.poisson.pmf(x,self._p1)
+
+    def estimate_par(self, data, weights=None):
+        # weighted mean of the data = MLE of the Poisson rate
+        # NOTE(review): unlike NBinom.estimate_par, weights=None is not
+        # handled here and would raise -- confirm callers always pass weights.
+        mean = np.sum(data*weights) / (np.sum(weights) )
+        self._p1 = mean
+
+
+
+class TailDistribution(Distribution):
+    """ Base class for "tail" components.  A tail does not estimate its own
+    parameters; it mirrors those of a parent distribution (set in the
+    subclass constructors) and redistributes the parent's upper-tail mass. """
+    _name = "Tail"
+    _dof = 1
+    _norm = False # normalization; recalculate only if necessary
+    _parent = None
+    _active = True # switch tail on/off
+
+    def set_par(self,p1=None, p2=None):
+        """ explicitly set a parameter """
+        if p1 != None:
+            self._p1 = p1
+        if p2 != None:
+            self._p2 = p2
+        # parameters changed -> any cached normalization is stale
+        self._norm = False
+
+    def estimate_par(self, data=None, weights=None):
+        """ Do not estimate parameters, but obtain parameters from parent
+        distribution """
+        self._p1 = self._parent._p1
+        self._p2 = self._parent._p2
+        self._norm = False
+
+
+
+class NbTail(TailDistribution):
+    """ Tail component coupled to a NBinom parent. """
+    _name = "Tail"
+    _dof = 1
+    def __init__(self, nbinom):
+        """ NbTail distribution must be connected to a negative binomial"""
+        if isinstance(nbinom, NBinom):
+            self._parent = nbinom
+        else:
+            raise(Exception("NbTail must be connected to a NBinom object"))
+
+    def pmf(self, x):
+        # the tail never puts mass on coverage 0
+        if np.isscalar(x) and x == 0:
+            return 0.
+
+        if self._active == False:
+            return 0*x
+        if stats.nbinom.mean(self._p1, self._p2) < 1.:
+            self._active = False # switch tail permanently off
+
+        # regularized incomplete beta = nbinom survival function at x
+        def betaincreg(x,p1,p2):
+            return 1-betainc(p1,x+1,p2)
+
+        # calculate normalization up to certain precision
+        # NOTE(review): the computed norm is never written back to
+        # self._norm, so it is recomputed on every call and the
+        # "else" branch below appears unreachable -- cache ineffective.
+        if not self._norm:
+            ks = int(max(2,stats.nbinom.ppf(0.999999,self._p1,self._p2)))
+            norm = np.sum(betaincreg(np.arange(1,ks),self._p1,self._p2))
+        else:
+            norm = self._norm
+        # now return the value(s)
+        ret = betaincreg(x,self._p1,self._p2) / norm
+        if not np.isscalar(x):
+            ret[np.where(x==0)] = 0.
+        return ret
+
+
+
+class PoissonTail(TailDistribution):
+    """ Tail component coupled to a Poisson parent. """
+    _name = "Tail of Poisson"
+    _dof = 1
+    def __init__(self, poisson):
+        """ PoissonTail distribution must be connected to a poisson
+        distribution """
+        if isinstance(poisson, Poisson):
+            self._parent = poisson
+        else:
+            raise(Exception("PoissonTail must be connected to a Poisson object"))
+
+    def pmf(self, x):
+        # switch off permanently once the parent's mean drops below 1
+        if self._p1 < 1.:
+            self._active = False # switch tail permanently off
+        if self._active == False:
+            return 0*x
+
+        # the tail never puts mass on coverage 0
+        if np.isscalar(x) and x == 0:
+            return 0.
+
+        # tabulate the tail pmf up to the parent's 0.999999 quantile
+        xmax = int(max(np.max(x),5,stats.poisson.ppf(0.999999,self._p1)))+1
+        xs = np.arange(0,xmax, dtype=np.float)
+        #backward cumulative sum
+        # (pmf(xs)/xs is inf/nan at xs=0, but tx[0] is overwritten below)
+        tx = np.cumsum((stats.poisson.pmf(xs, self._p1)/xs)[::-1])[::-1]
+        tx[0] = 0
+        tx /= np.sum(tx)
+        # NOTE(review): tx[x] requires x to be integer-valued; the coverage
+        # array built from a SAM file is float -- works on old numpy only.
+        return tx[x]
+
+
+def build_mixture_model(dist_str):
+    """ Build a mixture model from a string
+
+    dist_str: one character per component -- z: Zero, n: NBinom (MOM),
+    N: NBinom (MLE), p: Poisson, t: tail of the immediately preceding
+    NBinom/Poisson.  Returns a numpy object array of Distribution
+    instances with equal initial mixture proportions. """
+    num_dist = len(dist_str)
+    # pre-fill with Zero() placeholders; non-z slots are overwritten below
+    mm = np.array([Zero()]*num_dist)
+    it = 0
+    for dist in dist_str:
+        if dist == "z":
+            it += 1
+        elif dist=="n":
+            mm[it] = NBinom()
+            mm[it]._use_MOM = True
+            it += 1
+        elif dist=="N":
+            mm[it] = NBinom()
+            mm[it]._use_MOM = False
+            it += 1
+        elif dist == "t":
+            # a tail is only valid directly after its parent distribution
+            if it > 0 and isinstance(mm[it-1], NBinom):
+                mm[it] = NbTail(mm[it-1])
+                it += 1
+            elif it > 0 and isinstance(mm[it-1], Poisson):
+                mm[it] = PoissonTail(mm[it-1])
+                it += 1
+            else:
+                raise Exception("Error: wrong input '%s'. A Tail distribution "
+                    "(t) must follow a Negative Binomial (n|N) or "
+                    "Poisson (p)."%dist_str)
+        elif dist == "p":
+            mm[it] = Poisson()
+            it+=1
+        else:
+            raise Exception("Input Error: distribution %s not recognized!"%dist)
+
+    # initialize all distributions with equal weights
+    alpha = 1./len(mm)
+    for dist in mm:
+        dist.alpha = alpha
+
+    return mm
+
+
+def init_gamma(mixture_model, dataset):
+    """ create initial responsibilities gamma. The probability that a coverage
+    belongs to a distribution is equal for all distributions."""
+    N_mm = len(mixture_model)
+    N_cov = len(dataset.cov)
+
+    # shape (N_cov, N_mm): one row per unique coverage value
+    return np.array([[1./N_mm for d in mixture_model] for i in range(N_cov)])
+
+
+
+"""----------------------------------------------------------------------------
+ Define DataSet to store all relevant information (incl. GCP)
+----------------------------------------------------------------------------"""
+
+class DataSet:
+    """ Holds a Genome Coverage Profile (GCP) plus mapping statistics, with
+    pickle-backed caching so the SAM file is only parsed once. """
+    fname = "" # path to original SAM file
+    cov = np.array([],dtype=np.int) # observed coverage values
+    count = np.array([],dtype=np.int) # number of observations for each cov
+    rlen = 0 # average read length of mapped reads
+    rds = 0 # total number of reads
+    glen = 0 # length of the genome
+
+    def read_from_pickle(self,filename):
+        """ read a genome coverage profile from a pickle file. """
+        # NOTE(review): cPickle.load executes arbitrary code from the file;
+        # only load cache files this script wrote itself.
+        data = cPickle.load(open(filename,'r'))
+
+        self.cov = np.array(data[0],dtype=np.int)
+        self.count = np.array(data[1],dtype=np.int)
+        self.rlen = data[2]
+        self.rds = data[3]
+        self.glen = data[4]
+        # older cache files (5 entries) have no filename stored
+        if len(data) == 6:
+            self.fname = data[5]
+
+
+    def read_from_sam(self, filename):
+        """ extract a genome coverage profile from a sam file. """
+        sf = pysam.Samfile(filename,'r')
+
+        # one flat coverage array over all reference sequences, with
+        # start_pos[i] = offset of reference i in the concatenation
+        cov = np.zeros((sum(sf.lengths),))
+        start_pos = np.zeros((len(sf.lengths),))
+        start_pos[1:] = np.cumsum(sf.lengths[:-1])
+
+        read_length = 0
+        num_reads = 0
+        for read in sf:
+            if not read.is_unmapped:
+                r_start = start_pos[read.tid] + read.pos # start position
+                r_end = start_pos[read.tid] + read.pos + read.qlen # end
+                # NOTE(review): r_start/r_end are numpy floats used as slice
+                # bounds -- accepted by old numpy, rejected by modern numpy.
+                cov[r_start:r_end] += 1
+                num_reads += 1
+                read_length += r_end-r_start
+        self.fname = filename
+        self.rds = num_reads
+        self.rlen = read_length/float(num_reads) # average length mapped reads
+        self.glen = len(cov)
+        # collapse per-position coverage into (unique value, count) pairs
+        self.cov = np.unique(cov)
+        self.count = np.array( [np.sum(cov==ucov) for ucov in self.cov] )
+
+    def write_to_pickle(self, filename):
+        """ store dataset in a pickle file. """
+        return cPickle.dump([self.cov, self.count, self.rlen, self.rds,
+                             self.glen, self.fname], open(filename,'w'))
+
+
+
+"""----------------------------------------------------------------------------
+ Iterative Method function definitions
+----------------------------------------------------------------------------"""
+
+
+def update_gamma(data_set, mixture_model, gamma):
+    """ Update the probability gamma_i(x), that a position with coverage x
+    belongs to distribution i
+
+    E-step of the EM-like iteration: mutates gamma in place; the return
+    value (always 1) is ignored by callers. """
+    num_dist = len(mixture_model)
+    for it in range(len(data_set.cov)):
+        # calculate probability that coverages[it] belongs to a distribution
+        prob = [dist.alpha*dist.pmf(data_set.cov[it]) for dist in mixture_model]
+        # no component gives this coverage any mass -> zero responsibilities
+        if sum(prob) <= 0:
+            gamma[it,:] = 0.
+        else:
+            gamma[it,:] = np.array([prob[i]/sum(prob) for i in range(num_dist)])
+
+    return 1
+
+
+def update_alpha(data_set, mixture_model, gamma):
+    """ Update alpha_i, the proportion of data that belongs to
+    distribution i
+
+    M-step for the mixture proportions: mutates the model in place. """
+
+    for i in range(len(mixture_model)-1):
+        # responsibility-weighted share of all observed positions
+        w_probs = np.array([p*w for p,w in zip(gamma[:,i],data_set.count)])
+        mixture_model[i].alpha = np.sum(w_probs) / np.sum(data_set.count)
+
+    # last component takes the remainder so the alphas always sum to 1
+    mixture_model[-1].alpha = 1 - sum([d.alpha for d in mixture_model[:-1]])
+
+    return 1
+
+
+def iterative_fitting(data_set, mixture_model, gamma, iterations):
+    """ Generator function to run the iterative method. Operates directly on
+    the data structures mixture_model and gamma.
+
+    Yields (iteration index, max. relative parameter change, max. absolute
+    CDF difference between model and data) after each EM step; the caller
+    decides when to stop. """
+
+    for i in range(iterations):
+        # Expectation-step: update gammas
+        update_gamma(data_set, mixture_model, gamma)
+
+        # temporarily store old parameters ("or 0" maps None -> 0)
+        old_p1 = np.array([d._p1 or 0 for d in mixture_model])
+        old_p2 = np.array([d._p2 or 0 for d in mixture_model])
+        old_alpha = np.array([d.alpha for d in mixture_model])
+
+        # Parameter estimation step
+        for j,dist in enumerate(mixture_model):
+            dist_weights = gamma[:,j]*data_set.count
+            dist.estimate_par(data_set.cov, dist_weights)
+        update_alpha(data_set, mixture_model, gamma)
+
+        # calculate relative change of the parameters
+        new_p1 = np.array([d._p1 or 0 for d in mixture_model])
+        new_p2 = np.array([d._p2 or 0 for d in mixture_model])
+        new_alpha = np.array([d.alpha for d in mixture_model])
+
+        rel_p1 = np.max(np.abs(new_p1-old_p1) / (new_p1 + 1e-20))
+        rel_p2 = np.max(np.abs(new_p2-old_p2) / (new_p2 + 1e-20))
+        rel_alpha = np.max(np.abs(new_alpha-old_alpha) / (new_alpha + 1e-20))
+
+        max_change = np.max([rel_p1, rel_p2, rel_alpha])
+
+        # maximum CDF difference (Kolmogorov-Smirnov-style goodness of fit)
+        xs = np.arange(np.max(data_set.cov)+1)
+        ref_pdf = np.zeros((np.max(data_set.cov)+1,))
+        for dist in mixture_model:
+            ref_pdf += dist.pmf(xs)*dist.alpha
+
+        obs_pdf = np.zeros((np.max(data_set.cov)+1,))
+        obs_pdf[data_set.cov.astype(np.int)] = data_set.count/float(np.sum(data_set.count))
+        max_cdf_diff = np.max(np.abs(np.cumsum(ref_pdf)-np.cumsum(obs_pdf)))
+
+        yield i, max_change, max_cdf_diff
+
+
+"""----------------------------------------------------------------------------
+ Auxiliary code
+----------------------------------------------------------------------------"""
+
+class Logger:
+    """ simple logger class
+
+    With filename=None, log() only prints to the console; the log file (if
+    any) stays open for the lifetime of the process. """
+    log_file = None
+    def __init__(self, filename=None):
+        if filename:
+            self.log_file = open(filename,'w')
+
+    def log(self, s, c=True):
+        """ write string s to log file and print to console (if c) """
+        if c:
+            print s
+        if self.log_file:
+            self.log_file.write(s+'\n')
+
+
<br/>
+def create_plot(ds, mm, filename):
+    """ create a plot showing the data and the fitted distributions
+
+    NOTE(review): relies on the global name "plt", which is only imported
+    inside the __main__ block -- calling this without matplotlib having been
+    imported there raises NameError. """
+    plt.figure()
+    xmax = np.max(ds.cov)+1
+    plotXs = np.arange(xmax)
+    # empirical GCP, normalized by the number of genome positions
+    plt.plot(ds.cov[np.where(ds.cov<xmax)],ds.count[np.where(ds.cov<xmax)]/
+        float(ds.glen),'k', lw=2, label="GCP")
+    plotY_total = np.zeros((len(plotXs),))
+    for dist in mm:
+        # pick line style/color by component type; order matters because
+        # NbTail/PoissonTail are checked via TailDistribution before Zero
+        if isinstance(dist,Poisson):
+            format_string = "--"
+            color = 'r'
+        elif isinstance(dist,NBinom):
+            format_string = "-."
+            color = 'b'
+        elif isinstance(dist,TailDistribution):
+            format_string = ":"
+            color = 'Purple'
+        elif isinstance(dist,Zero):
+            format_string = "-"
+            color = 'YellowGreen'
+
+        plotYs = dist.pmf(plotXs)*dist.alpha
+        plotY_total += plotYs
+        plt.plot(plotXs,plotYs, format_string, color=color, lw=2, label=dist._name)
+    plt.plot(plotXs,plotY_total,'--',color='gray',lw=2, label="Mixture")
+    plt.xlabel('coverage')
+    plt.ylabel('')
+    plt.xlim(xmin=0, xmax=xmax)
+    plt.legend()
+    plt.savefig(filename, dpi=300, bbox_inches='tight')
+
+
+
+"""----------------------------------------------------------------------------
+ Main function
+----------------------------------------------------------------------------"""
+
+if __name__ == "__main__":
+
+    # Define command line interface using OptionParser
+    parser = OptionParser(usage=usage)
+
+    # NOTE(review): "p:binom" in this help text actually selects a Poisson
+    # component (see build_mixture_model).
+    parser.add_option("-d", "--distributions", dest="dist",
+    help="Distributions to fit. z->zero; n: nbinom (MOM); N: nbinom (MLE); "
+    "p:binom; t: tail. Default: %default", default="zn")
+
+    parser.add_option("-i", "--iterations", dest="steps", type='int',
+    help="Maximum number of iterations. Default: %default", default=50)
+
+    parser.add_option("-t", "--threshold", dest="thr", type='float', help="Set"
+    " the convergence threshold for the iteration. Stop if the change between "
+    "two iterations is less than THR. Default: %default", default=0.01)
+
+    parser.add_option("-c", "--cutoff", dest="cutoff", type='float',
+    help="Specifies a coverage cutoff quantile such that only coverage values"
+    " below this quantile are considered. Default: %default", default=0.95)
+
+    parser.add_option("-p", "--plot", action="store_true", dest="plot",
+    help="Create a plot of the fitted mixture model. Default: %default",
+    default=False)
+
+    parser.add_option("-m", "--means", dest="mean", type='float',
+    action="append", help="Specifies the initial values for the mean of each "
+    "Poisson or Negative Binomial distribution. Usage: -m 12.4 -m 16.1 will "
+    "specify the means for the first two non-zero/tail distributions. The "
+    "default is calculated from the data.", default=None)
+
+    # NOTE(review): unlike -m, this option has no type='float', so the
+    # collected values are strings; "sum(options.alpha)" further down would
+    # raise a TypeError -- confirm and add type='float'.
+    parser.add_option("-a", "--alpha", dest="alpha", action="append",
+    help="Specifies the initial values for the proportion alpha of each "
+    "distribution. Usage: For three distributions -a 0.3 -a 0.3 specifies the "
+    "proportions 0.3, 0.3 and 0.4. The default is "
+    "equal proportions for all distributions.", default=None)
+
+    parser.add_option("-l", "--log", action="store_true", dest="log",
+    help="Enable logging. Default: %default", default=False)
+
+    parser.add_option("--view", action="store_true", dest="view",
+    help="Only view the GCP. Do not fit any distribution. Respects cutoff "
+    "(-c). Default: %default", default=False)
+
+    (options, args) = parser.parse_args()
+
+    # exactly one positional argument (the SAM file) is required
+    if len(args) != 1:
+        parser.print_help()
+        sys.exit(1)
+
+
+    # process command line input
+
+    # check input file name
+    name = args[0] # filename is the first (and only) argument
+    if not os.path.exists(name):
+        raise Exception("Could not find file with name '%s'."%name)
+    # strip a trailing ".sam"; "name" is the basename for all output files
+    # NOTE(review): if the argument lacks the .sam suffix, existence is
+    # checked on "name" but the parser below reads "name.sam" -- confirm.
+    if name.endswith('.sam'):
+        name = name[:-4]
+
+
+    # enable logging (log file is keyed to the distribution string)
+    if options.log and not options.view:
+        logfile = name + '_%s_log.txt'%options.dist
+    else:
+        logfile = None
+    lg = Logger(logfile)
+
+
+    # load data: reuse the pickled GCP cache if present, otherwise parse
+    # the SAM file and write the cache for next time
+    DS = DataSet()
+    if os.path.exists(name+'.pcl'):
+        print 'found pickle file'
+        DS.read_from_pickle(name+'.pcl')
+    else:
+        DS.read_from_sam(name+'.sam')
+        DS.write_to_pickle(name+'.pcl')
+
+
+    # weight cutoff. Coverages above this value have no weight in parameter estimation
+    # (cutoff = coverage value at the requested quantile, at least 10;
+    # the bare except falls back to "keep everything")
+    try:
+        cutoff = max(int(np.max(DS.cov[np.where(np.cumsum(DS.count)<= \
+            options.cutoff*DS.glen)])+1),10)
+    except:
+        cutoff = np.max(DS.cov)+1
+    lg.log("Using coverage cutoff: %i"%cutoff)
+
+    DS.count = DS.count[np.where(DS.cov < cutoff)]
+    DS.cov = DS.cov[np.where(DS.cov < cutoff)]
+    num_unique = len(DS.cov)
+    # NOTE(review): DS.glen is repurposed here -- from genome length to the
+    # number of positions kept after the cutoff; later "Genome length"
+    # output reports this truncated value.  num_unique appears unused.
+    DS.glen = np.sum(DS.count)
+    mean_cov = np.sum(DS.cov*DS.count)/np.sum(DS.count).astype(np.float)
+
+
+    # only view the GCP (no fitting): plot and exit
+    if options.view:
+        try:
+            import matplotlib.pyplot as plt
+        except:
+            raise Exception("Error: could not import matplotlib.")
+
+        # create empty mixture model
+        MM = np.array([])
+        create_plot(DS,MM,name+'.png')
+        print "Wrote GCP plot to file %s."%(name+'.png')
+        sys.exit(0)
+
+
+    # Plotting: check if matplotlib is installed
+    plot = options.plot
+    if plot:
+        try:
+            import matplotlib.pyplot as plt
+        except:
+            lg.log("Warning: could not import matplotlib. Plotting disabled.")
+            plot = False
+
+    # create the mixture model
+    # (typo "disribution" is in a runtime error string -- left untouched)
+    if options.dist.count('z') > 1:
+        raise Exception("Error: only one Zero disribution is allowed!")
+
+    if options.dist.count("t") > 1:
+        lg.log("Warning: more than one tail distribution may yield inaccurate "
+            "estimates!")
+
+    MM = build_mixture_model(options.dist)
+    num_dist = len(MM)
+    # index of the Zero component; str.find returns -1 when no 'z' given
+    the_zero = options.dist.find('z')
+
+    if options.alpha != None:
+        if len(options.alpha) >= num_dist:
+            # NOTE(review): bug -- the format tuple is misplaced inside
+            # len(): should read %(len(options.alpha),num_dist).  As written
+            # this line raises TypeError instead of the intended message.
+            raise Exception("Error: the number of alpha values (%i) exceeds "
+                "the number of distributions (%i)!"%(len(options.alpha,num_dist)))
+
+
+    # set initial proportions for each distribution
+    if options.alpha:
+        alpha = options.alpha
+        if len(options.alpha) < len(MM):
+            rest_alpha = 1. - sum(options.alpha)
+            rest_models = len(MM) - len(options.alpha)
+            # NOTE(review): bug -- append() adds the whole list as one
+            # element; extend() is needed, otherwise float(a) below fails
+            # on the nested list.
+            alpha.append([rest_alpha / rest_models] * rest_models)
+
+        for dist,a in zip(MM,alpha):
+            dist.alpha = float(a)
+
+    # initialize distribution parameters: spread default initial means over
+    # the non-zero/non-tail components, derived from the data
+    n_dist = options.dist.count('n') + options.dist.count('N') + \
+        options.dist.count('p')
+    factors = np.linspace(-n_dist,n_dist,n_dist)
+    data_mean = np.power(np.abs(factors),np.sign(factors))*np.mean(DS.cov[1:]*\
+        DS.count[1:]/np.mean(DS.count))
+
+    if options.mean:
+        # use mean values provided by the user, where possible
+        for i,m in enumerate(options.mean):
+            data_mean[i] = m
+
+    # per-component variance estimate around each candidate mean
+    data_var = np.array([np.sum((DS.cov-m)**2*DS.count)/DS.glen \
+        for m in data_mean])
+
+    # "it" indexes into data_mean/data_var for the non-zero/non-tail
+    # components only
+    it = 0
+    for dist in MM:
+        if isinstance(dist,Zero) or isinstance(dist,NbTail) \
+            or isinstance(dist,PoissonTail):
+            # tails just copy their parent's (already initialized) parameters
+            dist.estimate_par()
+            continue
+        if isinstance(dist,NBinom):
+            m,v = data_mean[it],data_var[it]
+            # negative binomial needs overdispersion (v > m)
+            if m > v:
+                v = 1.01*m
+            dist.set_par(m**2 / (v - m) , m / v )
+            it += 1
+            continue
+        if isinstance(dist,Poisson):
+            dist.set_par(data_mean[it])
+            it += 1
+
+
+    # set initial values for the responsibilities gamma
+    GAMMA = init_gamma(MM,DS)
+
+
+    # write initial values to log file (console suppressed via c=False)
+    lg.log('Initial values',False)
+    lg.log('distribution'.ljust(20)+'alpha'.ljust(20)+'parameter 1'.ljust(20)+
+        'parameter 2',False)
+    for i in range(num_dist):
+        lg.log(str(MM[i]).ljust(20)+str(MM[i].alpha).ljust(20)+
+            str(MM[i]._p1).ljust(20)+str(MM[i]._p2),False)
+    lg.log('\n',False)
+
+
+
+    # run the iteration, repeatedly update the variables MM and GAMMA
+    t_start = time.time()
+    for i,change,diff in iterative_fitting(DS, MM, GAMMA, options.steps):
+        # progress indicator on the console
+        print i
+        t_step = time.time() - t_start
+        lg.log(('step#: '+str(i+1)).ljust(20)+'time: '+str(round(t_step,1))+'s'
+            +'\n',False)
+        lg.log('distribution'.ljust(20)+'alpha'.ljust(20)+
+            'parameter 1'.ljust(20)+'parameter 2',False)
+        for d in MM:
+            lg.log(d.report_stats(20),False)
+        lg.log('Max. CDF diff:'.ljust(20)+'%f'%diff,False)
+        lg.log('\n',False)
+
+        # stop once the relative parameter change drops below the threshold
+        if change < options.thr:
+            break
+        t_start = time.time()
+
+
+
+    # Estimate genome fragmentation and correct the zero-coverage estimate,
+    # if a tail distribution was used
+    # NOTE(review): if no 'z' was given, the_zero is -1 and this silently
+    # reads the LAST component's alpha -- confirm intended.  zero_frac
+    # itself appears unused after this section.
+    zero_frac = MM[the_zero].alpha
+    tail = False
+    for dist in MM:
+        # pick the tail's proportion and its parent's proportion and mean
+        # coverage; with several tails, the last one wins
+        if isinstance(dist,PoissonTail) and dist.alpha > 0:
+            p_tail = dist.alpha
+            p_parent = dist._parent.alpha
+            part_cov = dist._parent._p1
+        elif isinstance(dist,NbTail) and dist.alpha > 0:
+            p_tail = dist.alpha
+            p_parent = dist._parent.alpha
+            part_cov = stats.nbinom.mean(dist._parent._p1,dist._parent._p2)
+        else:
+            continue
+
+        tail=True
+        # estimated number of genome fragments
+        gfrag = DS.glen/(1.+p_parent/p_tail)/2./DS.rlen*(p_parent+p_tail)
+        
+        # probability that a read starts within a fragment boundary window
+        start_prob = 1. - stats.poisson.cdf(0,part_cov/float(DS.rlen))
+        zero_corr = (2*gfrag-1)*(min(stats.nbinom.mean(1,start_prob),DS.rlen)
+            - start_prob/3.*4./9.)
+        zero_est = DS.glen*MM[the_zero].alpha
+        zero_frac = (zero_est-zero_corr)/float(DS.glen)
+
+
+    # write results to file
+    # NOTE(review): "diff" (and MM state) come from the last loop iteration;
+    # with --iterations 0 the loop body never runs and "diff" is undefined.
+    # "Genome length" reports the post-cutoff position count (see above).
+    res = open(name+'_'+str(options.dist)+'_results.txt','w')
+    res.write('Genome length: %i\n'%DS.glen)
+    res.write('Number of reads: %i\n'%DS.rds)
+    res.write('Average read length: %i\n'%DS.rlen)
+    #res.write('Average Coverage: %f\n'%mean_cov)
+    res.write('Max. CDF difference: %f\n\n'%diff)
+    res.write('Distribution'.ljust(20)+'alpha'.ljust(20)+
+        'parameter 1'.ljust(20)+'parameter 2'.ljust(20)+'[Mean]\n')
+    for d in MM:
+        res.write(d.report_stats(20)+'\n')
+
+    # tail-specific extras: fragmentation and corrected zero fraction
+    if tail:
+        res.write('\nNum. Fragments:'.ljust(20)+'%.01f'%gfrag+'\n')
+        res.write('Observed Zeros:'.ljust(20)+
+            '%.04f'%(zero_est/float(DS.glen))+'\n')
+        res.write('Corrected Zeros:'.ljust(20)+
+            '%.04f'%((zero_est-zero_corr)/float(DS.glen))+'\n')
+    res.close()
+
+    # mirror the same summary on the console/log
+    lg.log('\nFinal Results:\n')
+    #lg.log('Average Coverage: %f'%mean_cov)
+    lg.log('Max. CDF difference: %f\n'%diff)
+    lg.log('Distribution'.ljust(20)+'alpha'.ljust(20)+'parameter 1'.ljust(20)+
+        'parameter 2'.ljust(20)+'[Mean]')
+    for d in MM:
+        lg.log(d.report_stats(20))
+
+    if tail:
+        lg.log('Num. Fragments:'.ljust(20)+'%.01f'%gfrag)
+        lg.log('Observed Zeros:'.ljust(20)+'%.04f'%(zero_est/float(DS.glen)))
+        lg.log('Corrected Zeros:'.ljust(20)+'%.04f'%((zero_est-zero_corr)/
+            float(DS.glen)))
+
+
+    # plot the figure, if requested
+    if plot:
+        create_plot(DS,MM,name+'_'+options.dist+'.png')
+
+    print "\nFinished."
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/fitgpc.git
More information about the debian-med-commit
mailing list