[med-svn] [mapdamage] 01/02: Imported Upstream version 2.0.6+dfsg

Fri Jul 29 12:46:36 UTC 2016

This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository mapdamage.

commit cc6aebcb3f9f3c220b52fbd93dd18af62f7948ed
Author: Andreas Tille <tille at debian.org>
Date:   Fri Jul 29 14:45:20 2016 +0200

    Imported Upstream version 2.0.6+dfsg
---
 LICENSE.txt                                        |  18 +
 MANIFEST                                           |  13 +
 MANIFEST.in                                        |   2 +
 README.md                                          |  32 +
 bin/mapDamage                                      | 382 ++++++++++++
 mapdamage/Rscripts/lengths.R                       | 109 ++++
 mapdamage/Rscripts/mapDamage.R                     | 180 ++++++
 mapdamage/Rscripts/stats/checkLibraries.R          |  18 +
 mapdamage/Rscripts/stats/data.R                    |  55 ++
 mapdamage/Rscripts/stats/function.R                | 694 +++++++++++++++++++++
 mapdamage/Rscripts/stats/main.R                    | 283 +++++++++
 mapdamage/Rscripts/stats/postConditonal.R          | 185 ++++++
 mapdamage/Rscripts/stats/priorPropose.R            | 125 ++++
 mapdamage/Rscripts/stats/runGeneral.R              |  95 +++
 mapdamage/Rscripts/stats/start.R                   | 107 ++++
 mapdamage/__init__.py                              |  10 +
 mapdamage/align.py                                 | 134 ++++
 mapdamage/composition.py                           |  84 +++
 mapdamage/parseoptions.py                          | 264 ++++++++
 mapdamage/rescale.py                               | 352 +++++++++++
 mapdamage/rescale_test.py                          |  57 ++
 mapdamage/rescale_test/long_align/pe.sam           |   3 +
 .../pe_output/Stats_out_MCMC_correct_prob.csv      |   5 +
 mapdamage/rescale_test/long_align/ref.fa           |   2 +
 mapdamage/rescale_test/pe_test/pe.sam              |   8 +
 .../pe_output/Stats_out_MCMC_correct_prob.csv      |   5 +
 .../rescale_test/pe_test/pe_rescaled_correct.sam   |   8 +
 mapdamage/rescale_test/pe_test/ref.fa              |   2 +
 mapdamage/rscript.py                               | 153 +++++
 mapdamage/seq.py                                   | 108 ++++
 mapdamage/tables.py                                | 116 ++++
 mapdamage/version.py                               |   6 +
 setup.py                                           |  68 ++
 33 files changed, 3683 insertions(+)

diff --git a/LICENSE.txt b/LICENSE.txt
new file mode 100644
index 0000000..ed6195c
--- /dev/null
+++ b/LICENSE.txt
@@ -0,0 +1,18 @@
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 
+# copies of the Software, and to permit persons to whom the Software is 
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 
+# SOFTWARE.
+
diff --git a/MANIFEST b/MANIFEST
new file mode 100644
index 0000000..e217680
--- /dev/null
+++ b/MANIFEST
@@ -0,0 +1,13 @@
+# file GENERATED by distutils, do NOT edit
+README.txt
+setup.py
+bin/mapDamage
+mapdamage/__init__.py
+mapdamage/align.py
+mapdamage/composition.py
+mapdamage/parseoptions.py
+mapdamage/rscript.py
+mapdamage/seq.py
+mapdamage/tables.py
+mapdamage/version.py
+mapdamage/Rscripts/mapDamage.R
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..3381982
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,2 @@
+include mapdamage/Rscripts/mapDamage.R
+include mapdamage/Rscripts/stats/*R
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..f908dba
--- /dev/null
+++ b/README.md
@@ -0,0 +1,32 @@
+### Important
+Users with versions dating prior to June 12 2013 please update. A nasty bug that caused the statistical part of mapDamage to use half of the data for estimation of the damage parameters, sorry for the inconvenience.
+
+### Introduction
+Complete documentation, instructions, examples, screenshots and FAQ are available at [this address](http://ginolhac.github.io/mapDamage/).
+
+[mapDamage2](http://geogenetics.ku.dk/publications/mapdamage2.0/) is a computational framework written in Python and R, which tracks and quantifies DNA damage patterns 
+among ancient DNA sequencing reads generated by Next-Generation Sequencing platforms. 
+
+Mapdamage is developed at the [Centre for GeoGenetics](http://geogenetics.ku.dk/) by the [Orlando Group ](http://geogenetics.ku.dk/research/research_groups/palaeomix_group/).
+
+
+
+
+### Citation
+If you use this program, please cite the following publication:  
+Jónsson H, Ginolhac A, Schubert M, Johnson P, Orlando L.
+[mapDamage2.0: fast approximate Bayesian estimates of ancient DNA damage parameters.
+_Bioinformatics_ 23rd April 2013. doi: 10.1093/bioinformatics/btt193](http://bioinformatics.oxfordjournals.org/content/early/2013/05/17/bioinformatics.btt193)
+
+
+The original mapDamage1 is described in the following article:  
+Ginolhac A, Rasmussen M, Gilbert MT, Willerslev E, Orlando L.
+[mapDamage: testing for damage patterns in ancient DNA sequences. _Bioinformatics_ 2011 **27**(15):2153-5
+http://bioinformatics.oxfordjournals.org/content/27/15/2153](http://bioinformatics.oxfordjournals.org/content/27/15/2153)
+
+
+
+### Contact
+Please report bugs and suggest possible improvements to Aurélien Ginolhac, Mikkel Schubert or Hákon Jónsson by email:
+aginolhac at snm.ku.dk, MSchubert at snm.ku.dk or jonsson.hakon at gmail.com.
+
diff --git a/bin/mapDamage b/bin/mapDamage
new file mode 100755
index 0000000..d13ee33
--- /dev/null
+++ b/bin/mapDamage
@@ -0,0 +1,382 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+from __future__ import print_function
+
+import logging
+import random
+import time
+import sys
+import os
+
+""" Copyright (c) 2012  Aurélien Ginolhac, Mikkel Schubert, Hákon Jónsson
+and Ludovic Orlando
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, sublicense,
+and/or sell copies of the Software, and to permit persons to whom
+the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
+OR OTHER DEALINGS IN THE SOFTWARE.
+
+plot and quantify damage patterns from a SAM/BAM file
+
+:Authors: Aurélien Ginolhac, Mikkel Schubert, Hákon Jónsson, Ludovic Orlando
+:Contact: aginolhac at snm.ku.dk, MSchubert at snm.ku.dk, jonsson.hakon at gmail.com
+:Date: November 2012
+:Type: tool
+:Input: SAM/BAM
+:Output: tabulated tables, pdf 
+"""
+
+# check if pysam if available
+MODULE = "pysam"
+URL = "http://code.google.com/p/pysam/"
+try:
+    __import__(MODULE)
+except ImportError, e:
+    sys.stderr.write("Error: Could not import required module '%s':\n\t- %s\n" % (MODULE, e))
+    sys.stderr.write("       If module is not installed, please download from '%s'.\n" % URL)
+    sys.stderr.write("       A local install may be performed using the following command:\n")
+    sys.stderr.write("       $ python setup.py install --user\n\n")
+    sys.exit(1)
+
+import pysam
+
+
+_BAM_UNMAPPED  = 0x4
+_BAM_SECONDARY = 0x100
+_BAM_FAILED_QC = 0x200
+_BAM_PCR_DUPE  = 0x400
+_BAM_CHIMERIC  = 0x800
+
+def _filter_reads(bamfile):
+    filtered_flags = _BAM_UNMAPPED | \
+      _BAM_SECONDARY | \
+      _BAM_FAILED_QC | \
+      _BAM_PCR_DUPE | \
+      _BAM_CHIMERIC
+
+    for read in bamfile:
+        if not (read.flag & filtered_flags):
+            yield read
+
+
+def _downsample_to_fraction(bamfile, options):
+    if options.verbose:
+        print("\tDownsampling %.2f fraction of the original file" % options.downsample)
+    assert options.downsample > 0, "Downsample fraction must be positive, not %s" % options.downsample
+
+    rand = random.Random(options.downsample_seed)
+    for read in _filter_reads(bamfile):
+        if rand.random() < options.downsample:
+            yield read
+
+
+def _downsample_to_fixed_number(bamfile, options):
+    if options.verbose:
+        print("\tDownsampling %d random reads from the original file" % options.downsample)
+
+    # use reservoir sampling
+    downsample_to = int(options.downsample)
+    rand = random.Random(options.downsample_seed)
+    sample = [None] * downsample_to
+    for (index, record) in enumerate(_filter_reads(bamfile)):
+        if index >= downsample_to:
+            index = rand.randint(0, index)
+            if index >= downsample_to:
+                continue
+        sample[index] = record
+    return filter(None, sample)
+
+
+def _read_bamfile(bamfile, options):
+    """
+    Takes a subset of the bamfile. Can use a approximate fraction of the
+    hits or specific number of reads using reservoir sampling. Returns
+    a list in the last case otherwise a iterator.
+    """
+    if options.downsample is None:
+        return _filter_reads(bamfile)
+    elif options.downsample < 1:
+        return _downsample_to_fraction(bamfile, options)
+    else:
+        return _downsample_to_fixed_number(bamfile, options)
+
+def _check_mm_option():
+    """
+    As the user can override the system wide mapdamage modules with 
+    the --mapdamage-modules, it has to happen before option parsing 
+    in mapdamage.parseoptions
+    """
+    path_to_mm = None
+    for nr,arg in zip(xrange(len(sys.argv)),sys.argv):
+        if arg.startswith("--mapdamage-modules"):
+            try:
+                if "=" in arg:
+                    # the option is of the format --mapdamage-modules=AAAA
+                    arg_p = arg.split("=")
+                    path_to_mm = arg_p[1]
+                else:
+                    # the option is of the format --mapdamage-modules AAAA
+                    path_to_mm = sys.argv[nr+1] 
+                break
+            except IndexError as e:
+                raise SystemExit("Must specify a path to --mapdamage-modules")
+    if path_to_mm != None:
+        if not os.path.isdir(path_to_mm):
+            raise SystemExit("The --mapdamage-modules option must be a valid path (path=%s)" % path_to_mm)
+        if not os.path.isdir(os.path.join(path_to_mm,"mapdamage")):
+            raise SystemExit("The --mapdamage-modules path (path=%s) must contain the mapdamage module" % path_to_mm)
+    return path_to_mm
+
+
+def main():
+    start_time = time.time()
+
+    # the user can override the system wide mapdamage modules
+    path_to_mm = _check_mm_option()
+    if path_to_mm != None:
+        sys.path.insert(0,path_to_mm)
+    import mapdamage
+
+    fpath_seqtk = mapdamage.rscript.construct_path("seqtk", folder="seqtk")
+    if not (os.path.isfile(fpath_seqtk) and os.access(fpath_seqtk, os.X_OK)):
+        sys.stderr.write("Seqtk executable not accessible; mapDamage has not\n"
+                         "been intalled properly or current user does not\n"
+                         "sufficient permissions. Expected executable at\n   "
+                         "'%s'\n" % (fpath_seqtk,))
+        return 1
+
+    options = mapdamage.parseoptions.options()
+    if options == None:
+        sys.stderr.write("Option parsing failed, terminating the program\n")
+        return 1
+
+    logging.basicConfig(filename = os.path.join(options.folder, "Runtime_log.txt"),
+                        format   = '%(asctime)s\t%(levelname)s\t%(name)s: %(message)s',
+                        level    = logging.DEBUG)
+    handler = logging.StreamHandler()
+    handler.setLevel(logging.INFO)
+    logging.getLogger().addHandler(handler)
+
+    logger = logging.getLogger("main")
+    logger.info("Started with the command: " + " ".join(sys.argv))
+
+    # plot using R if results folder already done
+    if options.plot_only:
+        if options.no_r:
+            logger.error("Cannot use plot damage patterns if R is missing, terminating the program")
+            return 1
+        else:
+            mapdamage.rscript.plot(options)
+            mapdamage.rscript.opt_plots(options)
+            return 0
+
+    # run the Bayesian estimation if the matrix construction is done
+    if options.stats_only:
+        # does not work for very low damage levels
+        if mapdamage.tables.check_table_and_warn_if_dmg_freq_is_low(options.folder):
+            logger.error("Cannot use the Bayesian estimation, terminating the program")
+            return 1
+        else:
+            # before running the Bayesian estimation get the base composition
+            path_to_basecomp = os.path.join(options.folder,"dnacomp_genome.csv")
+            if os.path.isfile(path_to_basecomp):
+                #Try to read the base composition file
+                mapdamage.composition.read_base_comp(path_to_basecomp)
+            else:
+                #Construct the base composition file
+                mapdamage.composition.get_base_comp(options.ref,path_to_basecomp)
+            mapdamage.rscript.run_stats(options)
+            return 0
+    
+    # fetch all references and associated lengths in nucleotides
+    try:
+        ref = pysam.Fastafile(options.ref)
+    except IOError as e:
+        # Couldn't open the reference file
+        logger.error("Couldn't open the reference file "+options.ref+\
+            " or there are permission problems writing the "+options.ref+".fai file")
+        raise e
+
+    # rescale the qualities
+    if options.rescale_only:
+        if not options.quiet:
+            print("Starting rescaling...")
+        mapdamage.rescale.rescale_qual(ref, options)
+        return 0
+
+    # open SAM/BAM file
+    if options.filename == "-":
+        in_bam = pysam.Samfile("-", 'rb')
+        # disabling rescaling if reading from standard input since we need
+        # to read it twice
+        if options.rescale:
+            print("Warning, reading from standard input, rescaling is disabled")
+            options.rescale = False
+    else:
+        in_bam = pysam.Samfile(options.filename)
+
+
+    reflengths = dict(zip(in_bam.references, in_bam.lengths))
+    # check if references in SAM/BAM are the same in the fasta reference file
+    fai_lengths = mapdamage.seq.read_fasta_index(options.ref + ".fai")
+    if not fai_lengths:
+        return 1
+    elif not mapdamage.seq.compare_sequence_dicts(fai_lengths, reflengths):
+        return 1
+    elif (len(reflengths) >= 1000) and not options.merge_reference_sequences:
+        print("WARNING: Alignment contains a large number of reference sequences (%i)!" % len(reflengths))
+        print("  This may lead to excessive memory/disk usage.")
+        print("  Consider using --merge-reference-sequences")
+        print()
+
+    refnames = in_bam.references
+    if options.merge_reference_sequences:
+        refnames = ['*']
+
+    # for misincorporation patterns, record mismatches
+    misincorp = mapdamage.tables.initialize_mut(refnames, options.length)
+    # for fragmentation patterns, record base compositions
+    dnacomp =  mapdamage.tables.initialize_comp(refnames, options.around, options.length)
+    # for length distributions
+    lgdistrib =  mapdamage.tables.initialize_lg()
+
+    if not options.quiet:
+        print("\tReading from '%s'" % options.filename)
+        if  options.minqual != 0:
+            print("\tFiltering out bases with a Phred score < %d" % options.minqual)
+        if options.verbose:
+            print("\t%d references are assumed in SAM/BAM file, for a total of %d nucleotides" % (len(reflengths), sum(reflengths.values())))
+        print("\tWriting results to '%s/'" % options.folder)
+
+
+    # open file handler to write alignments in fasta format
+    if options.fasta:
+        # use name of the SAM/BAM filename without extension
+        ffasta = os.path.splitext(os.path.basename(options.filename))[0]+'.fasta'
+        if not options.quiet:
+            print("\tWriting alignments in '%s'" % ffasta)
+        fhfasta = open(options.folder+"/"+ffasta,"w")
+
+    counter = 0
+
+    # main loop
+    for read in _read_bamfile(in_bam, options):
+        counter += 1
+
+        # external coordinates 5' and 3' , 3' is 1-based offset
+        coordinate = mapdamage.align.get_coordinates(read)
+        # record aligned length for single-end reads
+        lgdistrib = mapdamage.seq.record_lg(read, coordinate, lgdistrib)
+        # fetch reference name, chromosome or contig names
+        chrom = in_bam.getrname(read.tid)
+
+        (before, after) = mapdamage.align.get_around(coordinate, chrom, reflengths, options.around, ref)
+        refseq = ref.fetch(chrom, min(coordinate), max(coordinate)).upper()
+        # read.query contains aligned sequences while read.seq is the read itself
+        seq = read.query
+
+        # add gaps according to the cigar string, do it for qualities if filtering options is on
+        if not (options.minqual and read.qual):
+            if options.minqual:
+                logger.warning("Cannot filter bases by PHRED scores for read '%s'; no scores available." % read.qname)
+
+            (seq, refseq) = mapdamage.align.align(read.cigar, seq, refseq)
+        else:
+            # add gaps to qualities and mask read and reference nucleotides if below desired threshold
+            (seq, _, refseq) = mapdamage.align.align_with_qual(read.cigar, seq, read.qqual, options.minqual, refseq)
+
+        # reverse complement read and reference when mapped reverse strand
+        if read.is_reverse:
+            refseq = mapdamage.seq.revcomp(refseq)
+            seq = mapdamage.seq.revcomp(seq)
+            beforerev = mapdamage.seq.revcomp(after)
+            after = mapdamage.seq.revcomp(before)
+            before = beforerev
+
+        if options.merge_reference_sequences:
+            chrom = '*'
+
+        # record soft clipping when present
+        mapdamage.align.record_soft_clipping(read, misincorp[chrom], options.length)
+
+        # count misincorparations by comparing read and reference base by base
+        mapdamage.align.get_mis(read, seq, refseq, chrom, options.length, misincorp, '5p')
+        # do the same with sequences align to 3'-ends
+        mapdamage.align.get_mis(read, seq[::-1], refseq[::-1], chrom, options.length, misincorp, '3p')
+        # compute base composition for reads
+        mapdamage.composition.count_read_comp(read, chrom, options.length, dnacomp)
+
+        # compute base composition for genomic regions
+        mapdamage.composition.count_ref_comp(read, chrom, before, after, dnacomp)
+        
+        if options.fasta:
+            mapdamage.seq.write_fasta(read, chrom, seq, refseq, min(coordinate), max(coordinate), before, after, fhfasta)
+
+        if options.verbose:
+            if counter % 50000 == 0:
+                print("\t%10d filtered alignments processed" % (counter,))
+
+    if options.verbose:
+        print("\tDone. %d filtered alignments processed" % (counter,))
+    logger.debug("BAM read in %f seconds" % (time.time() - start_time,))
+
+    # close file handlers
+    in_bam.close()
+    if options.fasta:
+        fhfasta.close()
+
+    # output results, write summary tables to disk
+    with open(options.folder+"/"+"misincorporation.txt", 'w') as fmut:
+        mapdamage.tables.print_mut(misincorp, options, fmut)
+    with open(options.folder+"/"+"dnacomp.txt", 'w') as fcomp:
+        mapdamage.tables.print_comp(dnacomp, options, fcomp)
+    with open(options.folder+"/"+"lgdistribution.txt", 'w') as flg:
+        mapdamage.tables.print_lg(lgdistrib, options, flg)
+
+
+    # plot using R
+    if not options.no_r:
+        mapdamage.rscript.plot(options)
+        mapdamage.rscript.opt_plots(options)
+
+    # raises a warning for very low damage levels
+    if mapdamage.tables.check_table_and_warn_if_dmg_freq_is_low(options.folder):
+        options.no_stats = True
+
+   
+    # run the Bayesian estimation
+    if not options.no_stats:
+        # before running the Bayesian estimation get the base composition
+        mapdamage.composition.get_base_comp(options.ref,os.path.join(options.folder,"dnacomp_genome.csv"))
+        mapdamage.rscript.run_stats(options)
+
+    # rescale the qualities
+    if options.rescale:
+        mapdamage.rescale.rescale_qual(ref, options)
+
+    # need the fasta reference still open for rescaling
+    ref.close()
+
+    # log the time it took
+    logger.info("Successful run")
+    logger.debug("Run completed in %f seconds" % (time.time() - start_time,))
+
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/mapdamage/Rscripts/lengths.R b/mapdamage/Rscripts/lengths.R
new file mode 100644
index 0000000..8955ace
--- /dev/null
+++ b/mapdamage/Rscripts/lengths.R
@@ -0,0 +1,109 @@
+# Enable full backtraces on errors
+on_error <- function(e)
+  {
+    traceback(2)
+    quit(status = 1)
+  }
+options(error = on_error)
+
+
+args <- commandArgs(trailingOnly = TRUE)
+OPT.LGDIST    <- args[1]
+OPT.PDFOUT    <- args[2]
+OPT.MISINCORP <- args[3]
+OPT.LENGTH    <- args[4]
+OPT.TITLE     <- args[5]
+OPT.VERSION   <- args[6]
+OPT.QUIET     <- args[7]
+
+MISMATCHES  <- c("C>T", "G>A")
+
+calculate.mutation.table <- function(filename){
+
+    tbl <- read.table(file = filename, sep = "\t", header = TRUE, check.names = FALSE)
+    tbl <- aggregate(tbl[, MISMATCHES], tbl[, c("End", "Std", "Pos")], sum)
+    return(tbl)
+}
+
+
+plot.length <- function(tbl, title, color) {
+    table <- aggregate(tbl$Occurences, by=list(tbl$Length), FUN=sum)
+    names(table)<- c("Length", "Occurences")
+    plot(table$Length, table$Occurences, type="h",
+       col = color, main = title, cex.axis = 0.8, las = 2,
+       xlab = "", ylab = "", axes = FALSE)
+ 
+    mtext("Occurences", side = 2, line = 2.5, cex = 0.7)
+    mtext("Read length", side = 1, line = 2, cex = 0.7)
+    xcoord = seq(min(table$Length), max(table$Length), 10)
+    axis(side = 1, labels = xcoord, at = xcoord, las = 2, cex.axis = 0.6)
+    axis(side = 2, labels = TRUE, las = 2, cex.axis = 0.6)
+}
+
+plot.lengthStd <- function(tbl, title) {
+    subplus = subset(tbl, Std == "+")
+    subminus = subset(tbl, Std == "-")
+    plot(subplus$Length, subplus$Occurences, type="h", col=rgb(1,0,0,1/2), main = title, axes = FALSE)
+    lines(subminus$Length, subminus$Occurences, type="h", col=rgb(0,0,1,1/2))
+    mtext("Occurences", side = 4, line = 2.5, cex = 0.7)
+    mtext("Read length", side = 1, line = 2, cex = 0.7)
+    xcoord = seq(min(subplus$Length), max(subplus$Length), 10)
+    axis(side = 1, labels = xcoord, at = xcoord, las = 2, cex.axis = 0.6)
+    axis(side = 4, labels = TRUE, las = 2, cex.axis = 0.6)
+    legend('topright',c('+ strand','- strand'),
+       fill = rgb(1:0,0,0:1,0.4), bty = 'n',
+       border = NA)
+}
+
+plot.cumul.mutation <- function(tbl, end, mut, sid) {
+    subplus = subset(tbl, Std == "+" & End == end)
+    subminus = subset(tbl, Std == "-" & End == end)
+    plot(c(0, cumsum(subplus[, mut]/sum(subplus[, mut]))), 
+	 type = "l", col = rgb(1,0,0,1/2), lwd = 2, axes = FALSE)
+    lines(c(0, cumsum(subminus[, mut]/sum(subminus[, mut]))), 
+	 col = rgb(0,0,1,1/2), lwd = 2)
+    axis(side = 1, labels = TRUE, las = 2, cex.axis = 0.6)
+    axis(side = sid, labels = seq(0,1,0.1), at = seq(0,1,0.1), las = 2, cex.axis = 0.6)
+    mtext(mut, side = 3, line = 2, cex = 0.8)
+    mtext("Read position", side = 1, line = 1.8, cex = 0.7)
+    mtext("Cumulative frequencies", side = sid, line = 2.5, cex = 0.7)
+    legend('topleft',c('+ strand','- strand'),
+       	  fill = rgb(1:0,0,0:1,0.4), bty = 'n',
+          border = NA)
+}
+
+lg <- read.table(file = OPT.LGDIST, sep = "\t", header = TRUE, as.is = TRUE)
+if(nrow(lg) == 0){
+    write("No length distributions are available, plotting length distribution only works for single-end reads", stderr())
+}else{
+
+    pdf(file = OPT.PDFOUT, title = paste("mapDamage-", OPT.VERSION, sep=""))
+    par(oma = c(4,2,2,2), mar = c(1,2,1,2))
+    layout(matrix(c(1,1,  # Title
+                    2,3,  # lengths
+                    4,5), # Cumulative mutation
+                    3, 2, byrow = TRUE),
+           heights = c(3, 20, 20))
+    
+    # Plot title
+    plot(0, type = "n", xaxt = "n", yaxt = "n", bty = "n", xlab = "", ylab = "")
+    mtext(OPT.TITLE, 3, cex = 1.3)
+    
+    # Base compositions
+    plot.length(lg, "Single-end read length distribution", "black")
+    plot.lengthStd(lg, "Single-end read length per strand")
+    
+    # Misincorporation patterns
+    mut <- calculate.mutation.table(OPT.MISINCORP)
+    
+    par(mar = c(1, 2, 7, 1))
+    plot.cumul.mutation(mut, "5p", "C>T", 2)
+    par(mar = c(1, 1, 7, 2))
+    plot.cumul.mutation(mut, "3p", "G>A", 4)
+    
+    # graphics.off() calls dev.off() for all devices but doesn't return anything (avoid null device message)
+    graphics.off()
+    if(OPT.QUIET == 0){
+        cat(paste("additional", OPT.PDFOUT, "generated\n"))
+    }
+}
diff --git a/mapdamage/Rscripts/mapDamage.R b/mapdamage/Rscripts/mapDamage.R
new file mode 100644
index 0000000..542c846
--- /dev/null
+++ b/mapdamage/Rscripts/mapDamage.R
@@ -0,0 +1,180 @@
+# R script mapdamage
+# Enable full backtraces on errors
+on_error <- function(e)
+  {
+    traceback(2)
+    quit(status = 1)
+  }
+options(error = on_error)
+
+
+NUCLEOTIDES <- c("A", "C", "G", "T", "Total")
+MISMATCHES  <- c("A>C", "A>G", "A>T", "C>A", "C>G", "C>T", "G>A", "G>C", "G>T", "T>A", "T>C", "T>G")
+INSERTIONS  <- c("->A", "->C", "->G", "->T")
+DELETIONS   <- c("A>-", "C>-", "G>-", "T>-")
+CLIPPING    <- c("S")
+EVERYTHING  <- c(NUCLEOTIDES, MISMATCHES, INSERTIONS, DELETIONS, CLIPPING)
+
+args <- commandArgs(trailingOnly = TRUE)
+OPT.COMP      <- args[1]
+OPT.PDFOUT    <- args[2]
+OPT.AROUND    <- as.numeric(args[3])
+OPT.MISINCORP <- args[4]
+OPT.LENGTH    <- as.numeric(args[5])
+OPT.YMAX      <- as.numeric(args[6])
+OPT.FOLDER    <- args[7]
+OPT.TITLE     <- args[8]
+OPT.VERSION   <- args[9]
+
+
+draw.open.rect <- function(xleft, ybottom, xright, ytop, padding = 0) {
+  if (xleft < 0) {
+    xpoints <- c(xleft, xright + padding, xright + padding, xleft)
+  } else {
+    xpoints <- c(xright, xleft - padding, xleft - padding, xright)
+  }
+    
+  lines(xpoints, c(ytop, ytop, ybottom, ybottom), col = "darkgrey")
+}
+
+
+plot.base.composition <- function(tbl, base, color, around, ylabels.at = c(), xlabels = FALSE) {
+  xcoords <- c(-around:-1, 1:around)
+  
+  plot.axis <- function(yaxis.at) {
+    axis(side = yaxis.at, labels = (yaxis.at == ylabels.at), line = 0, las = 2, cex.axis = 0.8)
+    if ((yaxis.at == 2) && (yaxis.at == ylabels.at)) {
+      mtext("Frequency", side = 2, line = 2.5, cex = 0.6)
+    }
+  
+    if (xlabels) {
+      axis(side = 1, labels = xcoords, at = xcoords, las = 2, cex.axis = 0.6)
+    } else {
+      axis(side = 1, labels = FALSE,   at = xcoords)
+    }
+  }
+
+  plot.frequencies <- function(end) {
+    subtbl <- subset(tbl, End == end)
+    plot(subtbl$Pos, subtbl[, base] / subtbl$Total, pch = '.',
+       xlim = c(-around, around), ylim = c(0, 0.5),
+       col = color, main = base, cex.axis = 0.8, las = 2,
+       xlab = "", ylab = "", lab = c(2 * around, 6, 0.2),
+       axes = FALSE)
+
+    ycoords <- NULL
+    for (i in xcoords) {
+      ycoords <- append(ycoords, mean(subtbl[(subtbl$Pos==i), base]/subtbl$Total[(subtbl$Pos==i)],na.rm=T)) 
+    }
+    points(xcoords, ycoords, pch = 20, col = color, type = "b")
+  }
+
+  # 5p end
+  par(mar = c(1, 2, 1, 0.25))
+  plot.frequencies("5p")
+  plot.axis(2)
+  draw.open.rect(0, 0, around, 0.5)
+
+  # 3p end
+  par(mar = c(1, 0.25, 1, 2))
+  plot.frequencies("3p")
+  plot.axis(4)
+  draw.open.rect(-around, 0, 0, 0.5)
+  
+  par(mar = c(1, 2, 1, 2))
+}
+
+
+calculate.mutation.table <- function(filename, length)
+  {
+    tbl <- read.table(file = filename, sep = "\t", header = TRUE, check.names = FALSE)
+    tbl <- aggregate(tbl[, EVERYTHING], tbl[, c("End", "Pos")], sum)
+    for (mismatch in MISMATCHES) {
+      tbl[, mismatch] <- tbl[, mismatch] / tbl[, substr(mismatch, 1, 1)]
+    }
+
+    for (mismatch in c(INSERTIONS, DELETIONS, CLIPPING)) {
+      tbl[, mismatch] <- tbl[, mismatch] / tbl[, "Total"]
+    }
+
+    return(tbl[tbl$Pos <= length, ])
+  }
+
+
+write.mutation.table <- function(tbl, end, mismatch, filename)
+  {
+    columns <- c("pos", sprintf("5p%s", mismatch))
+    tbl[is.na(tbl)] <- 0 # Python doesn't like NAs  
+    write.table(tbl[tbl$End == end, c("Pos", mismatch)],
+                row.names = FALSE, col.names = columns,
+                sep = "\t", quote = FALSE,
+                file = filename)
+  }
+
+
+plot.mutations <- function(end, axis.at, start.i, end.i, modifier) {
+  do.plot <- function(tbl, end, modifier, mismatches, color, width) {
+    subtable <- tbl[tbl$End == end, c("Pos", mismatches)]
+    rates    <- rowSums(subtable) - subtable$Pos
+    subtable <- aggregate(list(Rate = rates), list(Pos = subtable$Pos), sum)
+    
+    lines(subtable$Pos * modifier, subtable$Rate,
+          xlim = c(1, OPT.LENGTH), ylim = c(0, OPT.YMAX),
+          col = color, lwd = width)
+  }
+  
+  plot(NA, xlim = c(start.i, end.i), ylim=c(0, OPT.YMAX), col="grey", lwd = 1, type = "l", xlab = "", ylab = "", axes = FALSE)
+  axis(side = 1, labels = start.i:end.i, at = start.i:end.i, las = 2, cex.axis = 0.8)
+  axis(side = axis.at, labels = TRUE, las = 2, cex.axis = 0.8)
+  if (end == "5p") {
+      mtext("Frequency", side = 2, line = 2.5, cex = 0.6)
+  }
+  draw.open.rect(start.i, OPT.YMAX, end.i, -0.01, padding = 0.5)
+  
+  for (mismatch in MISMATCHES) {
+    do.plot(mut, end, modifier, mismatch, "grey", 1)
+  }
+  
+  do.plot(mut, end, modifier, CLIPPING,   "orange", 1)
+  do.plot(mut, end, modifier, DELETIONS,  "green", 1)
+  do.plot(mut, end, modifier, INSERTIONS, "purple", 1)
+  do.plot(mut, end, modifier, "G>A",      "blue", 2)
+  do.plot(mut, end, modifier, "C>T",      "red", 2)
+}
+
+
+pdf(file = OPT.PDFOUT, title = paste("mapDamage-", OPT.VERSION, " plot"))
+par(oma = c(4,2,2,2), mar = c(1,2,1,2))
+layout(matrix(c(1,1,   1,1,    # Title
+                2,3,   4,5,    # A,  C
+                6,7,   8,9,    # G,  T
+                10,10, 11,11), # Mismatches 5p, 3p
+              4, 4, byrow = TRUE),
+       heights = c(3, 20, 20, 20))
+
+# Plot title
+plot(0, type = "n", xaxt = "n", yaxt = "n", bty = "n", xlab = "", ylab = "")
+mtext(OPT.TITLE, 3, cex = 1.3)
+
+# Base compositions
+com <- read.table(file = OPT.COMP, sep = "\t", header = TRUE, as.is = TRUE)
+plot.base.composition(com, "A", "blue",  OPT.AROUND, xlabels = FALSE, ylabels.at = c(2, 4))
+plot.base.composition(com, "C", "green", OPT.AROUND, xlabels = FALSE, ylabels.at = c(4))
+plot.base.composition(com, "G", "black", OPT.AROUND, xlabels = TRUE,  ylabels.at = c(2, 4))
+plot.base.composition(com, "T", "red",   OPT.AROUND, xlabels = TRUE,  ylabels.at = c(4))
+
+
+# Misincorporation patterns
+mut <- calculate.mutation.table(OPT.MISINCORP, OPT.LENGTH)
+# Write table of post-morten damage frequencies to tables
+write.mutation.table(mut, "5p", "C>T", paste(OPT.FOLDER, "/5pCtoT_freq.txt", sep = ""))
+write.mutation.table(mut, "3p", "G>A", paste(OPT.FOLDER, "/3pGtoA_freq.txt", sep = ""))
+
+par(mar = c(1, 2, 1, 1))
+plot.mutations("5p", 2, 1, OPT.LENGTH, 1)
+
+par(mar = c(1, 1, 1, 2))
+plot.mutations("3p", 4, -OPT.LENGTH, -1, -1)
+
+# graphics.off() calls dev.off() for all devices but doesn't return anything (avoid null device message)
+graphics.off()
diff --git a/mapdamage/Rscripts/stats/checkLibraries.R b/mapdamage/Rscripts/stats/checkLibraries.R
new file mode 100755
index 0000000..458b633
--- /dev/null
+++ b/mapdamage/Rscripts/stats/checkLibraries.R
@@ -0,0 +1,18 @@
+#!/usr/bin/Rscript
+#Simple script to check if the necessary packages are installed.
+args <- commandArgs(TRUE)
+whichProgram = args[2]
+
+if (whichProgram=="inline"){
+    library(inline)
+} else if (whichProgram=="ggplot2"){
+    library(ggplot2)
+} else if (whichProgram=="Rcpp"){
+    library(Rcpp)
+} else if  (whichProgram=="gam"){
+    library(gam) 
+} else if  (whichProgram=="RcppGSL"){
+    library(RcppGSL)
+} else {
+    stop(paste("Error in checkLibraries.R, no need to check this library",whichProgram))
+}
diff --git a/mapdamage/Rscripts/stats/data.R b/mapdamage/Rscripts/stats/data.R
new file mode 100644
index 0000000..910201b
--- /dev/null
+++ b/mapdamage/Rscripts/stats/data.R
@@ -0,0 +1,55 @@
+#Functions to load the raw data
+sumTheChr <- function(da,co){
+    #Sum up the columns for the different chromosomes
+    return(tapply(da[[co]],da$Pos,sum))
+}
+
+readMapDamData <- function(folder,forward=1){
+    #Reads in the data from mapdamage
+    #Sums up the position counts for the different chromosomes
+    fil <- "misincorporation.txt"
+    raw_dat <- read.table(paste(folder,fil,sep=""),header=TRUE)
+    if (forward==1){
+        raw_dat_plus <- raw_dat[raw_dat$End=="5p" & raw_dat$Std=="+",] 
+        raw_dat_minus <- raw_dat[raw_dat$End=="5p" & raw_dat$Std=="-",]
+    } else {
+        raw_dat_plus <- raw_dat[raw_dat$End=="3p" & raw_dat$Std=="+",]
+        raw_dat_minus <- raw_dat[raw_dat$End=="3p" & raw_dat$Std=="-",] 
+    }
+    dat <- matrix(nrow=max(raw_dat_plus$Pos),ncol=length(colnames(raw_dat_plus))-3)
+    colnames(dat) <- colnames(raw_dat_plus)[c(-1,-2,-3)]
+    dat[,"Pos"] <- seq(from=1,to=nrow(dat),by=1)
+    if (forward!=1){
+        dat[,"Pos"] <- - dat[,"Pos"]
+    }
+    for (i in colnames(raw_dat)[c(-1,-2,-3,-4)]){
+        dat[,i] <- sumTheChr(raw_dat_plus,i)+sumTheChr(raw_dat_minus,i)
+    }
+    return(dat)
+}
+
+joinFowAndRev <- function(fo,re,nrPos){
+    #Joins the 5' and 3' end with nrPos bases for each end
+    te <- fo[1:nrPos,]
+    te2 <- re[nrPos:1,]
+    out <- rbind(te,te2)
+    out[,"Pos"] <- c(1:nrPos,-c(nrPos:1))
+    return(out)
+}
+
+getSeqLen <- function(pa){
+    #Path to the mapDamage folder to get the length distribution
+    le_dat_min <- read.table(paste(pa,"lengthDistribStrd-.txt",sep=""),header=TRUE)
+    le_dat_plu <- read.table(paste(pa,"lengthDistribStrd+.txt",sep=""),header=TRUE)
+    les <- list(Length=le_dat_min$Length,Occurences=le_dat_min$Occurences+le_dat_plu$Occurences)
+    les$Length <- les$Length[les$Occurences!=0]
+    les$Occurences <- les$Occurences[les$Occurences!=0]/sum(les$Occurences)
+    return(les)
+}
+
+readBaseFreqs <- function(fol){
+    #Get the nucleotide composition of the genome
+    fil <- "dnacomp_genome.csv"
+    dat <- read.csv(paste(fol,fil,sep=""),header=TRUE)
+    return(c(dat$A,dat$C,dat$G,dat$T))
+}
diff --git a/mapdamage/Rscripts/stats/function.R b/mapdamage/Rscripts/stats/function.R
new file mode 100644
index 0000000..50174e6
--- /dev/null
+++ b/mapdamage/Rscripts/stats/function.R
@@ -0,0 +1,694 @@
+#Various useful functions
+getPmat <- function(tmu,tv_ti_ratio,acgt){
+    #Returns the evolutionary substitution matrix  
+    if (sum(acgt>=1)!=0 || sum(acgt<=0)!=0){
+        write("The ACGT frequencies must be in the range 0 to 1",stderr())
+        stop()       
+    }
+    if (all.equal(sum(acgt),1)!=TRUE){
+        write("The ACGT frequencies do not sum to 1",stderr())
+        stop()       
+    }
+    if (tv_ti_ratio<=0){
+        write("The transversion and transtition ratio cannot go under 0",stderr())
+        stop()       
+    }
+    #Returns the substitution probability matrix.
+    if (identical(tv_ti_ratio,1) && identical(acgt,c(.25,.25,.25,.25))){
+        return(jukesCantorPmat(tmu))
+    }else{
+        Q <- qmatHKY85(tmu,tv_ti_ratio,acgt)
+        r  <- eigen(Q)
+        B <- r$vectors
+        E  <- diag(exp(r$values))
+
+        #      Q        The eigen vector change of basis
+        #  M  ->   M   
+        #
+        #  ^       ^
+        #  |B^-1   |B
+        #     Eig
+        #  N  ->   N
+        out <- solve(a=t(B),b= E %*% t(B))#Little trick to avoid numerical difficulties
+        rownames(out) <- c("A","C","G","T")
+        colnames(out) <- c("A","C","G","T")
+        return(out) 
+    } 
+}
+
+jukesCantorPmat <- function(tmu){
+    #Using the Juke-Cantor model
+    return(matrix(rep(1/4-exp(-tmu)/4,16),nrow=4,ncol=4,dimnames=list(c("A","C","G","T"),c("A","C","G","T")))+diag(rep(exp(-tmu),4)))
+}
+
+qmatHKY85 <- function(tmu,tv_ti,acgt){
+    #HKY85 model
+    #               |    sum_1         pi_c * tv_ti      pi_g           pi_t * tv_ti  |
+    #               | pi_a * tv_ti     sum_2             pi_g * tv_ti   pi_t          |
+    #  Q = tmu  *   |    pi_a          pi_c * tv_ti      sum_3          pi_t * tv_ti  |
+    #               |    pi_a*tv_ti    pi_c              pi_g * tv_ti   sum_4         |
+    #returns this matrix
+    #This could be replaced with the analytical formulas for the substitutions probabilities.
+    
+    Qmat <- matrix(rep(acgt,4),ncol=4,byrow=TRUE)
+    diag(Qmat) <- 0
+
+    #Adjusting the transversions versus transitions
+    #-  *  -  *    
+    #*  -  *  -    
+    #-  *  -  *    
+    #*  -  *  -    
+
+    Qmat[1,2] <- tv_ti*Qmat[1,2]
+    Qmat[1,4] <- tv_ti*Qmat[1,4]
+
+    Qmat[2,1] <- tv_ti*Qmat[2,1]
+    Qmat[2,3] <- tv_ti*Qmat[2,3]
+    
+    Qmat[3,2] <- tv_ti*Qmat[3,2]
+    Qmat[3,4] <- tv_ti*Qmat[3,4]
+
+    Qmat[4,1] <- tv_ti*Qmat[4,1]
+    Qmat[4,3] <- tv_ti*Qmat[4,3]
+
+    diag(Qmat) <- -apply(Qmat,1,sum)
+
+    Qmat <- tmu * Qmat 
+    return(Qmat)
+}
+
+metroDesc <- function(lpr,lol){
+    # the logic in the Metropolis-Hastings step 
+    stopifnot(!is.na(lpr))
+    stopifnot(!is.na(lol))
+    if (log(runif(1))<lpr-lol){
+        return(1)
+    }else {
+        return(0)
+    }
+}
+
+genOverHang <- function(la){
+    #Currently not used in the main code but could used to generate 
+    #overhangs like Philip
+    r <- runif(1)
+    i <- -1
+    p <- 0
+    while (p< r){
+        if (i==-1){
+            term <- 1
+        }else {
+            term  <- 0
+        }
+        p <-p+ (la*((1-la)**(i+1))+term)/2
+        i <- i+1
+    }
+    return(i)
+}
+
+sampleHJ <- function(x,size,prob){
+    #Convenience function since sample wasn't good enough
+    if (length(prob)==1){
+        return(rep(x,size))
+    }else {
+        return(sample(x,size=size,prob=prob,replace=TRUE))
+    }
+}
+
+
+seqProbVecLambda <- function(lambda,lambda_disp,m,fo_only=NA,re_only=NA){
+    #Returns the position specific probability of being in an overhang
+    if (is.na(fo_only) || is.na(re_only)){
+        write("Must give parameters to fo_only or re_only",stderr())
+        stop()
+    }
+    psum <- matrix(ncol=1,nrow=m)
+    pvals <- dnbinom(c(1:m)-1,prob=lambda,size=lambda_disp)
+    for (i in 1:m){
+        psum[i,1] <- (1-sum(pvals[1:i]))/2
+    }
+    if (fo_only){
+        #Only the forward part
+        return(c(psum))
+    }else if (re_only && fo_only){
+        write("Shouldn't call this function with forward and reverse only.",stderr())
+        stop()
+    }else if (re_only) {
+        #The reverse part
+        return(rev(c(psum)))
+    }else{
+        #Both ends
+        psum <- c(psum[1:(m/2),1],rev(psum[1:(m/2),1]))
+        return(c(psum))
+    }
+}
+
+#The following is an MC simulation code to mimic the 
+#nick frequency part in the model from Philip
+seqProbVecNuWithLengths<- cxxfunction( signature(
+                                      I_la="numeric",
+                                      I_la_disp="numeric",
+                                      I_nu="numeric",
+                                      I_m="numeric",
+                                      I_lengths="numeric",
+                                      I_mLe="numeric",
+                                      I_fo="numeric",
+                                      I_iter="numeric",
+                                      I_ds_protocol="numeric"
+                             ) ,includes='
+                  #include <gsl/gsl_randist.h>
+                  int genOverHang(double la,double la_disp)
+                  {
+                      double r = ((double) rand() / (RAND_MAX));
+                      int i = -1;
+                      double p = 0;
+                      double term = -500; 
+                      while (p<r){
+                          if (i==-1){
+                              term = 1;
+                          }else {
+                              term = 0;
+                          }
+                          p = p + (gsl_ran_negative_binomial_pdf(i+1,la,la_disp)+term)/2;
+                          i++;
+                      }
+                      return(i);
+                  }
+                  ',body= '
+              srand(time(0)); 
+        Rcpp::NumericVector la(I_la);
+        Rcpp::NumericVector la_disp(I_la_disp);
+        Rcpp::NumericVector nu(I_nu);
+        Rcpp::NumericVector m(I_m);
+        Rcpp::NumericVector les(I_lengths);
+        Rcpp::NumericVector mLe(I_mLe);
+        Rcpp::NumericVector fo(I_fo);
+        Rcpp::NumericVector iter(I_iter);
+        Rcpp::NumericVector ds_protocol(I_ds_protocol);
+//
+        Rcpp::NumericVector output(mLe[0]);
+        Rcpp::NumericVector reduced_output(m[0]);
+        if (ds_protocol[0]==0){
+            for (int j = 0; j < m[0];j++){
+                reduced_output(j) = 1;
+            }
+            return(reduced_output);
+        }else {
+            for (int i = 0; i < iter[0];i++ ){
+//
+                double left_o_hang = genOverHang(la[0],la_disp[0]);
+                double right_o_hang = genOverHang(la[0],la_disp[0]);
+                double o_hang  = left_o_hang+right_o_hang;
+//              
+                if (o_hang>=les(i)){
+                    //Single stranded sequence
+                    for (int j = 0; j <les(i);j++){
+                        output(j) = output(j)+1;
+                    }
+                } else {
+                    Rcpp::NumericVector r = runif(1);
+                    if (r[0]< (1-nu[0])/((les[i]-o_hang-1)*nu[0]+(1-nu[0]))){
+                        for (int j = 0; j <(les[i]-right_o_hang);j++){
+                            output(j) = output(j)+1;
+                        }
+                        //The right overhang is always G>A for the double stranded but 
+                        //Here we will make the assumption that the pattern is symmetric 
+                        //for practical reasons We can\'t  do that ....
+                    }else {
+                        Rcpp::NumericVector sa = floor(runif(1,0,les[i]-o_hang))+left_o_hang;
+                        for (int j = 0; j <=sa[0];j++){
+                            output(j) = output(j)+1;
+                        }
+                    }
+                }
+            }
+            if (fo(0)){
+                //Only considering the forward part
+                for (int j = 0; j < m[0];j++){
+                    reduced_output(j) = output(j)/iter(0);
+                }
+            }else {
+                for (int j = 0; j < m[0]/2;j++){
+                    reduced_output(j) = output(j)/iter(0);
+                    reduced_output(m[0]-j-1) = 1-output(j)/iter(0);
+                }
+            }
+            return(reduced_output);
+        }
+',plugin='RcppGSL' )
+
+
+
+
+pDam <- function(th,ded,des,la,nu,lin){
+    #The damage and mutation matrix multiplied together
+    pct <- nu*(la*des+ded*(1-la))
+    pga <- (1-nu)*(la*des+ded*(1-la))
+    return(
+           c(
+             th[lin,1]*1+th[lin,3]*pga
+             ,
+             th[lin,2]*(1-pct)
+             ,
+             th[lin,3]*(1-pga)
+             ,
+             th[lin,2]*pct+th[lin,4]*1
+             )
+           )
+}
+
+logLikFunOneBaseSlow <- function(Gen,S,Theta,deltad,deltas,laVec,nuVec,m,lin){
+    #Calculates the log likelihood using R, a C++ version is available  
+    ll <- 0
+    for (i in 1:length(laVec)){
+        #Get the damage probabilities
+        pd <- pDam(Theta,deltad,deltas,laVec[i],nuVec[i],lin)
+        ll <- ll + dmultinom(S[i,],Gen[i],pd,log=TRUE)
+    }
+    return(ll)
+}
+
+#The same logic as in logLikFunOneBaseSlow except using a compiled code 
+#to do the hard work
+logLikFunOneBaseFast <- cxxfunction(signature(
+                                      I_Gen="numeric",
+                                      I_S="numeric",
+                                      I_Theta="numeric",
+                                      I_deltad="numeric",
+                                      I_deltas="numeric",
+                                      I_laVec="numeric",
+                                      I_nuVec="numeric",
+                                      I_m="numeric",
+                                      I_lin="numeric"
+                                      ), body = '
+Rcpp::NumericMatrix S(I_S);
+Rcpp::NumericMatrix th(I_Theta);
+
+Rcpp::NumericVector Gen(I_Gen);
+
+Rcpp::NumericVector Vded(I_deltad);
+double ded = Vded[0];
+
+Rcpp::NumericVector Vdes(I_deltas);
+double des = Vdes[0];
+
+Rcpp::NumericVector laVec(I_laVec);
+
+Rcpp::NumericVector nuVec(I_nuVec);
+
+Rcpp::NumericVector Vm(I_m);
+int m = Vm[0];
+
+Rcpp::NumericVector Vlin(I_lin);
+int lin = Vlin[0];
+
+Rcpp::NumericVector pDam(4);
+
+Rcpp::NumericVector ret(1);
+ret[0] = 0;
+
+for (int i = 0; i<laVec.size();i++){
+    double la = laVec[i];
+    double nu = nuVec[i];
+    double pct = nu*(la*des+ded*(1-la));
+    double pga = (1-nu)*(la*des+ded*(1-la));
+    pDam[0] = th(lin-1,0)*1+th(lin-1,2)*pga;
+    pDam[1] = th(lin-1,1)*(1-pct);
+    pDam[2] = th(lin-1,2)*(1-pga);
+    pDam[3] = th(lin-1,1)*pct+th(lin-1,3)*1;
+    double p1 = gsl_sf_lnfact(Gen(i))
+               -gsl_sf_lnfact(S(i,0))
+               -gsl_sf_lnfact(S(i,1))
+               -gsl_sf_lnfact(S(i,2))
+               -gsl_sf_lnfact(S(i,3));
+    double p2 = S(i,0)*log(pDam[0])
+               +S(i,1)*log(pDam[1])
+               +S(i,2)*log(pDam[2])
+               +S(i,3)*log(pDam[3]);
+    ret[0] = ret[0] + p1 + p2;
+}
+return(ret);
+', plugin="RcppGSL",include="#include <gsl/gsl_sf_gamma.h>")
+
+
+
+logLikAll <- function(dat,Theta,deltad,deltas,laVec,nuVec,m){
+    #Calculates the logLikelihood for all the bases by calling 
+    # logLikFunOneBaseFast for each base
+    if (deltad<0 || deltad>1 || deltas<0 || deltas>1  ){
+        return(-Inf)
+    }
+    #A,C,G and T
+
+    deb <- 0
+    
+    Asub <- dat[,"A.C"]+dat[,"A.G"]+dat[,"A.T"]    
+    ALL <- logLikFunOneBaseFast(dat[,"A"],cbind(dat[,"A"]-Asub,dat[,"A.C"],dat[,"A.G"],dat[,"A.T"]),Theta,deltad,deltas,laVec,nuVec,m,1)
+    if (deb){
+        ALLSlow <- logLikFunOneBaseSlow(dat[,"A"],cbind(dat[,"A"]-Asub,dat[,"A.C"],dat[,"A.G"],dat[,"A.T"]),Theta,deltad,deltas,laVec,nuVec,m,1)
+        stopifnot(all.equal(ALL,ALLSlow))
+    }
+
+    Csub <- dat[,"C.A"]+dat[,"C.G"]+dat[,"C.T"]
+    CLL <- logLikFunOneBaseFast(dat[,"C"],cbind(dat[,"C.A"],dat[,"C"]-Csub,dat[,"C.G"],dat[,"C.T"]),Theta,deltad,deltas,laVec,nuVec,m,2)
+    if (deb){
+        CLLSlow <- logLikFunOneBaseSlow(dat[,"C"],cbind(dat[,"C.A"],dat[,"C"]-Csub,dat[,"C.G"],dat[,"C.T"]),Theta,deltad,deltas,laVec,nuVec,m,2)
+        stopifnot(all.equal(CLL,CLLSlow))
+    }
+    
+
+    Gsub <- dat[,"G.A"]+dat[,"G.C"]+dat[,"G.T"]
+    GLL <- logLikFunOneBaseFast(dat[,"G"],cbind(dat[,"G.A"],dat[,"G.C"],dat[,"G"]-Gsub,dat[,"G.T"]),Theta,deltad,deltas,laVec,nuVec,m,3)
+    if (deb){
+        GLLSlow <- logLikFunOneBaseSlow(dat[,"G"],cbind(dat[,"G.A"],dat[,"G.C"],dat[,"G"]-Gsub,dat[,"G.T"]),Theta,deltad,deltas,laVec,nuVec,m,3)
+        stopifnot(all.equal(CLL,CLLSlow))
+    }
+    
+    Tsub <- dat[,"T.A"]+dat[,"T.C"]+dat[,"T.G"]
+    TLL <- logLikFunOneBaseFast(dat[,"T"],cbind(dat[,"T.A"],dat[,"T.C"],dat[,"T.G"],dat[,"T"]-Tsub),Theta,deltad,deltas,laVec,nuVec,m,4)
+    if (deb){
+        TLLSlow <- logLikFunOneBaseSlow(dat[,"T"],cbind(dat[,"T.A"],dat[,"T.C"],dat[,"T.G"],dat[,"T"]-Tsub),Theta,deltad,deltas,laVec,nuVec,m,4)
+        stopifnot(all.equal(TLL,TLLSlow))
+    }
+    return(ALL+CLL+GLL+TLL)
+}
+
+
+getParams <- function(cp){
+    #Utility function nice to update the MCMC iterations matrix 
+    return(c(cp$Theta,cp$Rho,cp$DeltaD,cp$DeltaS,cp$Lambda,cp$LambdaRight,cp$LambdaDisp,cp$Nu))
+}
+
+plotTrace<- function(dat,main,k=111){
+    #Running median of the MCMC iterations
+    plot(1:length(dat),dat,xlab="Iteration",ylab="",main=main,type="l")
+}
+
+plotEverything <- function(mcmcOut,hi=0,pl,thin=100){
+    #Plots the MCMC traceplot in the form of a running median and 
+    #histogram of the MCMC iterations
+    if (sum(c(cu_pa$same_overhangs==FALSE,
+                    cu_pa$fix_disp==FALSE,
+                    cu_pa$fix_ti_tv==FALSE,
+                    cu_pa$nuSamples!=0))>1){
+        #Check if I need to add a extra row
+        a_extra_row <- 1
+    }else {
+        a_extra_row <- 0
+    }
+    par(mfrow=c(3,2+a_extra_row))
+    if(hi){
+        hist(mcmcOut$out[,"Theta"],main=expression(theta),xlab="",freq=FALSE)
+        if (!mcmcOut$cu_pa$fix_ti_tv){
+            hist(mcmcOut$out[,"Rho"],main=expression(rho),xlab="",freq=FALSE)
+        }
+        hist(mcmcOut$out[,"DeltaD"],main=expression(delta[d]),xlab="",freq=FALSE)
+        hist(mcmcOut$out[,"DeltaS"],main=expression(delta[s]),xlab="",freq=FALSE)
+        hist(mcmcOut$out[,"Lambda"],main=expression(lambda),xlab="",freq=FALSE)
+        if (!mcmcOut$cu_pa$same_overhangs){
+            hist(mcmcOut$out[,"LambdaRight"],main=expression(lambda[r]),xlab="",freq=FALSE)
+        }
+        if (!mcmcOut$cu_pa$fix_disp){
+            hist(mcmcOut$out[,"LambdaDisp"],main=expression(sigma[lambda]),xlab="",freq=FALSE)
+        }
+        if (mcmcOut$cu_pa$nuSamples!=0){
+            hist(mcmcOut$out[,"Nu"],main=expression(nu),xlab="",freq=FALSE)
+        }
+        hist(mcmcOut$out[,"LogLik"],main="LogLik",xlab="",freq=FALSE)
+    }else {
+        plotTrace(mcmcOut$out[,"Theta"],main=expression(theta))
+        if (!mcmcOut$cu_pa$fix_ti_tv){
+            plotTrace(mcmcOut$out[,"Rho"],main=expression(theta))
+        }
+        plotTrace(mcmcOut$out[,"DeltaD"],main=expression(delta[d]))
+        plotTrace(mcmcOut$out[,"DeltaS"],main=expression(delta[s]))
+        plotTrace(mcmcOut$out[,"Lambda"],main=expression(lambda))
+        if (!mcmcOut$cu_pa$same_overhangs){
+            plotTrace(mcmcOut$out[,"LambdaRight"],main=expression(lambda[r]))
+        }
+        if (!mcmcOut$cu_pa$fix_disp){
+            plotTrace(mcmcOut$out[,"LambdaDisp"],main=expression(sigma[lambda]))
+        }
+        if (mcmcOut$cu_pa$nuSamples!=0){
+            plotTrace(mcmcOut$out[,"Nu"],main=expression(nu))
+        }
+        plotTrace(mcmcOut$out[,"LogLik"],main="LogLik")
+    }
+    par(mfrow=c(1,1))
+}
+
+accRat <- function(da){
+    #A rough measure of the acceptance ratio 
+    return(length(unique(da))/length(da))
+}
+
+adjustPropVar <- function(mcmc,propVar){
+    #Adjust the proposal variance to get something near .22
+    for (i in colnames(mcmc$out)){
+        if (i=="LogLik"){
+            next
+        } else if (i=="LambdaRight" & mcmc$cu_pa$same_overhangs){
+            next
+        } else if (i=="Nu" & mcmc$cu_pa$nuSamples==0){
+            next
+        } else if (i=="LambdaDisp" & mcmc$cu_pa$fix_disp){
+            next
+        } else if (i=="Rho" & mcmc$cu_pa$fix_ti_tv){
+            next
+        }
+        rat <- accRat(mcmc$out[,i])
+        if (rat<0.1){
+            propVar[[i]] <- propVar[[i]]/2
+        } else if (rat>0.3) {
+            propVar[[i]] <- propVar[[i]]*2
+        }
+    }
+    return(propVar)
+}
+
+runGibbs <- function(cu_pa,iter){
+    #Sampling over the conditional posterior distribution 
+    #for the parameters.
+    esti <- matrix(nrow=iter,ncol=9)
+    colnames(esti) <- c("Theta","Rho","DeltaD","DeltaS","Lambda","LambdaRight","LambdaDisp","Nu","LogLik")
+    for (i in 1:iter){
+        cu_pa<-updateTheta(cu_pa)
+        if (!cu_pa$fix_ti_tv){
+            #Fix the transition and transversion ratio
+            cu_pa<-updateRho(cu_pa)
+        }
+        cu_pa<-updateDeltaD(cu_pa)
+        cu_pa<-updateDeltaS(cu_pa)
+        cu_pa<-updateLambda(cu_pa)
+        if (!cu_pa$same_overhangs){
+            #Not the same overhangs update lambda right
+            cu_pa <- updateLambdaRight(cu_pa)
+        }
+        if (!cu_pa$fix_disp){
+            #Allowing dispersion in the overhangs
+            cu_pa<-updateLambdaDisp(cu_pa)
+        }
+        if (cu_pa$nuSamples!=0){
+            #Update the nu parameter by via MC estimation
+            cu_pa<-updateNu(cu_pa)
+        }
+        esti[i,c(1:8)] <- getParams(cu_pa) 
+        esti[i,"LogLik"] <- logLikAll(cu_pa$dat,cu_pa$ThetaMat,cu_pa$DeltaD,cu_pa$DeltaS,cu_pa$laVec,cu_pa$nuVec,cu_pa$m)
+        if (! (i %% 1000) && cu_pa$verbose){
+            cat("MCMC-Iter\t",i,"\t",esti[i,"LogLik"],"\n")
+        }
+    }
+    return(list(out=esti,cu_pa=cu_pa))
+}
+
+
+simPredCheck <- function(da,output){
+    #Simulates one draw from the posterior predictive distribution
+    #and the probability of a C>T substitution because of a cytosine 
+    #demnation.
+    bases <- da[,c("A","C","G","T")]
+    #Constructing the lambda vector
+    if (output$cu_pa$same_overhangs){
+        laVec <- seqProbVecLambda(sample(output$out[,"Lambda"],1),
+                                  sample(output$out[,"LambdaDisp"],1),
+                                  output$cu_pa$m,
+                                  output$cu_pa$forward_only,
+                                  cu_pa$reverse_only)
+    }else {
+        laVecLeft <- seqProbVecLambda(sample(output$out[,"Lambda"],1),
+                                      sample(output$out[,"LambdaDisp"],1),
+                                      output$cu_pa$m,
+                                      0,
+                                      0)
+        laVecRight <- seqProbVecLambda(sample(output$out[,"LambdaRight"],1),
+                                       sample(output$out[,"LambdaDisp"],1),
+                                       output$cu_pa$m,
+                                       0,
+                                       0)
+        laVec <- c(laVecLeft[1:(output$cu_pa$m/2)],laVecRight[(output$cu_pa$m/2+1):output$cu_pa$m])
+    }
+    #Constructing the nu vector
+    if (output$cu_pa$nuSamples !=0){
+        write("The MC sampling for the nu vector hasn't gone through a extensive testing procedure",stderr())
+        stop()
+        nuVec <- seqProbVecNuWithLengths(sample(output$out[,"Lambda"],1),
+                                         sample(output$out[,"LambdaDisp"],1),
+                                         sample(output$out[,"Nu"],1),
+                                         nrow(cu_pa$dat),
+                                         sampleHJ(output$cu_pa$lengths$Length,
+                                                 size=output$cu_pa$laSamples,
+                                                 prob=output$cu_pa$lengths$Occurences),
+                                         output$cu_pa$mLe,
+                                         output$cu_pa$forward_only,
+                                         output$cu_pa$nuSamples,
+                                         output$cu_pa$ds_protocol) 
+        nuVec <- c(nuVec,rev(1-nuVec))
+    }else {
+        nuVec <- output$cu_pa$nuVec
+    }
+    
+    #Sample the other parameters
+    des <- sample(output$out[,"DeltaS"],1)
+    ded <- sample(output$out[,"DeltaD"],1)
+    the <- sample(output$out[,"Theta"],1)
+    rho <- sample(output$out[,"Rho"],1)
+    pmat <- getPmat(the,rho,output$cu_pa$acgt)
+    ptransCT <- pmat["C","T"]
+    ptransCC <- pmat["C","C"]
+    ptransGA <- pmat["G","A"]
+    ptransGG <- pmat["G","G"]
+    #
+    coln <- c("A.C","A.G","A.T","C.A","C.G","C.T","G.A","G.C","G.T","T.A","T.C","T.G")
+    subs <- matrix(NA,nrow=nrow(output$cu_pa$dat),ncol=4+length(coln))
+    colnames(subs) <- c("A","C","G","T",coln)
+    #
+    damProb <- rep(NA,nrow(output$cu_pa$dat)) 
+    damProbGA <- damProb 
+    for (i in 1:nrow(output$cu_pa$dat)){
+        #Construct the site specific probabilities 
+        pct <- nuVec[i]*(laVec[i]*des+ded*(1-laVec[i]))
+        pga <- (1-nuVec[i])*(laVec[i]*des+ded*(1-laVec[i]))
+        pDamMat <- matrix(c(
+                         1,0,0,0,
+                         0,1-pct,0,pct,
+                         pga,0,1-pga,0,
+                         0,0,0,1
+                         ),nrow=4,byrow=TRUE)
+        ThetapDam <- pDamMat %*% pmat 
+        #Calculate the probability C.T due to cytosine demanation
+        damProb[i] <- ptransCC*pct/(ptransCC*pct+ptransCT) 
+        #Do not forget the reverse complement 
+        damProbGA[i] <- ptransGG*pga/(ptransGG*pga+ptransGA) 
+        #Then draw from a multinomial distribution
+        subs[i,c("A.C","A.G","A.T")] <- t(rmultinom(1,output$cu_pa$dat[i,"A"],ThetapDam[1,]))[-1]/output$cu_pa$dat[i,"A"]
+        subs[i,c("C.A","C.G","C.T")] <- t(rmultinom(1,output$cu_pa$dat[i,"C"],ThetapDam[2,]))[-2]/output$cu_pa$dat[i,"C"]
+        subs[i,c("G.A","G.C","G.T")] <- t(rmultinom(1,output$cu_pa$dat[i,"G"],ThetapDam[3,]))[-3]/output$cu_pa$dat[i,"G"]
+        subs[i,c("T.A","T.C","T.G")] <- t(rmultinom(1,output$cu_pa$dat[i,"T"],ThetapDam[4,]))[-4]/output$cu_pa$dat[i,"T"]
+    }
+    return(list(subs=subs,damProb=damProb,damProbGA=damProbGA))
+}
+
+calcSampleStats <- function(da,X){
+    #Summary statistics of the posterior distributions
+    return(data.frame(
+                      x=1:nrow(da),
+                      pos=da[,"Pos"],
+                      mea=apply(X,1,mean),
+                      med=apply(X,1,median),
+                      loCI=apply(X,1,quantile,c(0.025)),
+                      hiCI=apply(X,1,quantile,c(0.975))
+                      ))
+}
+
+postPredCheck <- function(da,output,samples=10000){
+    #Plots the 95% posterior predictive intervals with the data as lines.
+    #Returns the site specific probability of a C>T or G>A substitution 
+    #because of a cytosine demnation.
+    CTs <- matrix(NA,nrow=nrow(da),ncol=samples)
+    GAs <- matrix(NA,nrow=nrow(da),ncol=samples)
+    REs <- matrix(NA,nrow=nrow(da),ncol=samples)
+    C2TProbs <- matrix(NA,nrow=nrow(da),ncol=samples)
+    G2AProbs <- matrix(NA,nrow=nrow(da),ncol=samples)
+    #Two indices here, 1-based and the relative one
+    da <- cbind(1:(nrow(da)),da)
+    colnames(da)[1] <- "oneBased"
+    #Get the breaks depending on parameters for pretty plots
+    if (!output$cu_pa$forward_only || !output$cu_pa$reverse_only){
+        bres <- c(seq(from=1,to=floor(nrow(da)/2),by=2),rev(seq(from=nrow(dat),to=floor(nrow(da)/2)+1,by=-2)))
+    }else if (output$cu_pa$forward_only) {
+        bres <- seq(from=1,to=nrow(da),by=2)
+    } else if (output$cu_pa$reverse_only){
+        bres <- seq(from=nrow(dat),to=1,by=-2)
+    }else {
+        write("There is something fishy with the options",stderr())
+        stop()
+    }
+    labs <- dat[bres,"Pos"]
+    #Sample from the posterior predicitive distibution
+    for (i in 1:samples){
+        sam <- simPredCheck(da,output)
+        C2TProbs[,i] <- sam$damProb
+        G2AProbs[,i] <- sam$damProbGA
+        CTs[,i] <- sam$subs[,"C.T"]
+        GAs[,i] <- sam$subs[,"G.A"]
+        REs[,i] <- apply(sam$subs[,c("A.C","A.G","A.T","C.A","C.G","G.C","G.T","T.A","T.C","T.G")],1,mean)
+    }
+    CTsStats <- calcSampleStats(da,CTs)
+    GAsStats <- calcSampleStats(da,GAs)
+    REsStats <- calcSampleStats(da,REs) 
+    #Plotting the posterior predictive intervals
+    p <- ggplot()+
+         geom_point(aes(x,mea,colour="C->T",aes_string="subs"),data=CTsStats)+
+         geom_point(aes(x,mea,colour="G->A"),data=GAsStats)+
+         geom_point(aes(x,mea,colour="Others"),data=REsStats)+
+         geom_errorbar(aes(x=x,y=med,ymin=loCI,ymax=hiCI,color="C->T"),data=CTsStats)+
+         geom_errorbar(aes(x=x,y=med,ymin=loCI,ymax=hiCI,color="G->A"),data=GAsStats)+
+         geom_errorbar(aes(x=x,y=med,ymin=loCI,ymax=hiCI,color="Others"),data=REsStats)+
+         geom_line(aes(oneBased,C.T/C),color="red",data=data.frame(da))+
+         geom_line(aes(oneBased,G.A/G),color="green",data=data.frame(da))+
+         geom_line(aes(oneBased,((A.C+A.G+A.T)/A+(C.A+C.G)/C+(G.C+G.T)/G+(T.A+T.C+T.G)/T)/10),color="blue",data=data.frame(da))+
+         ylab("Substitution rate")+
+         xlab("Relative position")+
+         scale_x_continuous(breaks=bres,labels=labs)+
+         labs(colour = "Subs. type")+
+         ggtitle("Posterior prediction intervals")
+    if (output$cu_pa$use_bw_theme){
+        p <- p+theme_bw()
+    }
+    plot(p)
+    #The correcting probabilities 
+    coProbs <- cbind(da[,"Pos"],apply(C2TProbs,1,mean),apply(G2AProbs,1,mean))
+    colnames(coProbs) <- c("Position","C.T","G.A")
+    return(coProbs)
+}
+
+
+writeMCMC <- function(out,filename){
+    #Writes the posterior samples to a file
+    parameters <- c("Theta","DeltaD","DeltaS","Lambda")
+    if (!out$cu_pa$fix_ti_tv){
+        parameters <- c(parameters,"Rho")
+    }
+    if (!out$cu_pa$same_overhangs){
+        parameters <- c(parameters,"LambdaRight")
+    }
+    if (!out$cu_pa$fix_disp){
+        parameters <- c(parameters,"LambdaDisp")
+    }
+    if (out$cu_pa$nuSamples!=0){
+        parameters <- c(parameters,"Nu")
+    }
+    parameters <- c(parameters,"LogLik")
+    write.csv(out$out[,parameters],paste(filename,".csv",sep=""))
+    #Now calculate summary statistic of the posterior distributions
+    mea <- apply(out$out[,parameters],2,mean)
+    std <- apply(out$out[,parameters],2,sd)
+    qua <- apply(out$out[,parameters],2,quantile,seq(from=0,to=1,by=.025))
+    acc <- apply(out$out[,parameters],2,accRat)
+    summStat <- rbind(mea,std,acc,qua)
+    rownames(summStat)[1] <- "Mean" 
+    rownames(summStat)[2] <- "Std." 
+    rownames(summStat)[3] <- "Acceptance ratio" 
+    write.csv(summStat,paste(filename,"_summ_stat.csv",sep=""))
+}
+
diff --git a/mapdamage/Rscripts/stats/main.R b/mapdamage/Rscripts/stats/main.R
new file mode 100644
index 0000000..cfc48cd
--- /dev/null
+++ b/mapdamage/Rscripts/stats/main.R
@@ -0,0 +1,283 @@
+#The main work flow of the package
+#Load the libraries
+suppressMessages(library(inline))  #Already checked the libraries 
+suppressMessages(library(ggplot2)) #thus ignoring any messages from them
+suppressMessages(library(Rcpp))
+suppressMessages(library(gam)) 
+suppressMessages(library(RcppGSL))
+
+#Miscellaneous functions 
+source(paste(path_to_mapDamage_stats,"function.R",sep=""))
+
+#Prior and proposal distributions for the parameters
+source(paste(path_to_mapDamage_stats,"priorPropose.R",sep=""))
+
+#Update functions for the gibbs sampler
+source(paste(path_to_mapDamage_stats,"postConditonal.R",sep=""))
+
+#functions for the grid search
+source(paste(path_to_mapDamage_stats,"start.R",sep=""))
+
+#functions for loading the data 
+source(paste(path_to_mapDamage_stats,"data.R",sep=""))
+
+
+#######################################################
+#
+#              Loading the data
+#
+#######################################################
+
+#This used for the Briggs MC estimation
+if (nu_samples!=0){
+    start_vals$lengths <- getSeqLen(path_to_dat)
+    start_vals$le  <- floor(weighted.mean(x=start_vals$lengths$Length,w=start_vals$lengths$Occurences))
+}
+
+if (forward_only && reverse_only){
+    write("Cannot specify using only the 5' end and the 3' end which makes no sense",stderr())
+    stop()
+}
+
+fow_dat <-readMapDamData(path_to_dat) 
+rev_dat <-readMapDamData(path_to_dat,forward=0)
+if (forward_only){
+    #Taking only the forward part
+    dat <- fow_dat[1:sub_length,] 
+}else if (reverse_only){
+    #Taking only the reverse part
+    dat <- rev_dat[sub_length:1,] 
+}else {
+    #Using both ends
+    dat <- joinFowAndRev(fow_dat,rev_dat,sub_length)
+}
+
+#Getting everything ready for the mutation model
+if (jukes_cantor){
+    acgt <- c(0.25,0.25,0.25,0.25)
+    fix_ti_tv <- TRUE
+}else {
+    acgt <- readBaseFreqs(path_to_dat)
+    fix_ti_tv <- FALSE
+}
+
+#A list that keeps everything between the iterations
+cu_pa <- list(
+              dat=dat,
+              ThetaMat=NA,
+              Theta=-log((-start_vals$ptrans+.25)*4),
+              Rho=start_vals$rho,
+              DeltaD=start_vals$deltad,
+              DeltaS=start_vals$deltas,
+              Lambda=start_vals$lambda,
+              LambdaRight=start_vals$lambda_right,
+              LambdaDisp=start_vals$lambda_disp,
+              Nu=start_vals$nu,
+              m=nrow(dat),
+              meanLength=NA,
+              lengths=NA,
+              mLe=NA,
+              laVec=NA,
+              nuVec=NA,
+              iter=iterations,
+              nuSamples=nu_samples,
+              fix_nu=fix_nu,
+              ds_protocol=ds_protocol,
+              forward_only=forward_only,
+              reverse_only=reverse_only,
+              fix_disp= fix_disp,
+              same_overhangs= same_overhangs,
+              fix_ti_tv = fix_ti_tv,
+              acgt = acgt,
+              old_lik=-Inf,
+              verbose=verbose,
+              quiet=quiet,
+              use_raw_nick_freq = use_raw_nick_freq,
+              use_bw_theme = use_bw_theme
+              )
+
+if (nu_samples!=0){
+    #Need the lengths for the Briggs MC estimation
+    cu_pa$meanLength <- start_vals$le
+    cu_pa$lengths  <- start_vals$lengths
+    cu_pa$mLe  <- max(start_vals$lengths$Length)
+}
+
+cu_pa$ThetaMat <- getPmat(cu_pa$Theta,cu_pa$Rho,cu_pa$acgt)
+
+#######################################################
+#
+#              Setting the lambda vector 
+#
+#######################################################
+
+cu_pa$laVec <- seqProbVecLambda(cu_pa$Lambda,cu_pa$LambdaDisp,nrow(cu_pa$dat),cu_pa$forward_only,cu_pa$reverse_only)
+
+if (!cu_pa$same_overhangs){
+    #The overhangs are not the same
+    if (cu_pa$forward_only){
+        write("Cannot use different overhangs with only the 5' end",stderr())
+        stop()
+    } else if (cu_pa$reverse_only){
+        write("Cannot use different overhangs with only the 3' end",stderr())
+        stop()
+    }
+    cu_pa$laVecRight <- seqProbVecLambda(cu_pa$LambdaRight,cu_pa$LambdaDisp,nrow(cu_pa$dat),cu_pa$forward_only,cu_pa$reverse_only)
+}
+
+#######################################################
+#
+#              Setting the nu vector 
+#
+#######################################################
+
+if (!cu_pa$ds_protocol & cu_pa$nuSamples!=0){
+    #Using the single stranded protocol with nu samples which makes 
+    #no sense
+    write("Silly to use the MC estimation for nu_i if using the single stranded protocol", stderr())
+    stop()
+}else if (cu_pa$nuSamples!=0 & cu_pa$fix_nu) {
+    #Using the fixed nu parameter with nu samples which makes no sense 
+    write("Silly to use the MC estimation for nu_i and use fix_nu ", stderr())
+    stop()
+}else if (cu_pa$nuSamples!=0){
+    #Using the nu vector
+    #IF ds protocol we set nu_vector as 1
+    cu_pa$nuVec <- seqProbVecNuWithLengths(cu_pa$Lambda,
+                                       cu_pa$LambdaDisp,
+                               cu_pa$Nu,nrow(cu_pa$dat),
+                               sampleHJ(cu_pa$lengths$Length,size=cu_pa$nuSamples,prob=cu_pa$lengths$Occurences),
+                                              cu_pa$mLe,
+                                     cu_pa$forward_only,
+                                        cu_pa$nuSamples,
+                                      cu_pa$ds_protocol)
+}else if (!cu_pa$ds_protocol){
+    #The single stranded protocol
+    cu_pa$nuVec <- rep(1,nrow(dat))
+}else if (fix_nu){
+    #Ones at the 5' end and zeros at the 3' end
+    if (cu_pa$forward_only){
+        cu_pa$nuVec <- rep(1,nrow(dat))
+    }else if (cu_pa$reverse_only){
+        cu_pa$nuVec <- rep(0,nrow(dat))
+    }else {
+        cu_pa$nuVec <- c(rep(1,nrow(dat)/2),rep(0,nrow(dat)/2))
+    }
+}else {
+    #This is for a non linear nick frequency, assumes the G>T and G>A are 
+    #mostly due to DNA damage patterns do not use for low damage datasets
+    te<-(dat[,"C.T"]/dat[,"C"])/(dat[,"G.A"]/dat[,"G"]+dat[,"C.T"]/dat[,"C"])
+    if (sum(is.na(te) )!=0 ){
+        write("Warning, To few substitutions to assess the nick frequency, using constant nick frequency instead", stderr())
+        if (cu_pa$forward_only){
+            cu_pa$nuVec <- rep(1,nrow(dat))
+        }else if (cu_pa$reverse_only){
+            cu_pa$nuVec <- rep(0,nrow(dat))
+        }else {
+            cu_pa$nuVec <- c(rep(1,nrow(dat)/2),rep(0,nrow(dat)/2))
+        }
+    } else {
+        #The substitutes seem to be okay estimate the nick frequency using GAM 
+        if (cu_pa$forward_only || cu_pa$reverse_only){
+            if (cu_pa$use_raw_nick_freq){
+                #Use the frequency
+                cu_pa$nuVec <- te
+            }else{
+                #Use a smoother
+                cu_pa$nuVec  <- predict(gam(te~s(1:(cu_pa$m))))
+            }
+        }else {
+            if (cu_pa$use_raw_nick_freq){
+                cu_pa$nuVec  <- c(te[1:(cu_pa$m/2)],te[(cu_pa$m/2+1):cu_pa$m])
+            }else{
+                cu_pa$nuVec  <- c(predict(gam(te[1:(cu_pa$m/2)]~s(1:(cu_pa$m/2)))),
+                                  predict(gam(te[(cu_pa$m/2+1):cu_pa$m]~s(1:(cu_pa$m/2)))))
+            }
+        }
+        #This shouldn't happen but for sanity check
+        cu_pa$nuVec[cu_pa$nuVec>1] <- 1
+        cu_pa$nuVec[cu_pa$nuVec<0] <- 0
+    }
+    rm(te)
+}
+#######################################################
+#
+#          Finding an "optimal" starting place 
+#
+#######################################################
+
+if (grid_iter!=0){
+    #Start at random places and optimize the likelihood function
+    cu_pa <- gridSearch(cu_pa,grid_iter) 
+} 
+
+#Calculate the log likelihood in the beginning
+if (cu_pa$same_overhangs){
+    te_laVec <- cu_pa$laVec
+}else {
+    te_laVec <- c(cu_pa$laVec[1:(cu_pa$m/2)],cu_pa$laVecRight[(cu_pa$m/2+1):cu_pa$m])
+}
+cu_pa$old_lik <- logLikAll(cu_pa$dat,
+                      cu_pa$ThetaMat,
+                        cu_pa$DeltaD,
+                        cu_pa$DeltaS,
+                         te_laVec,
+                         cu_pa$nuVec,
+                             cu_pa$m
+                  )
+
+
+if (adjust_iter==0){
+    #Single burning period
+    if (!cu_pa$quiet){
+        cat("Single burn in period\n")
+    }
+    mcmcOut <- runGibbs(cu_pa,burn_in)
+    cu_pa <- mcmcOut$cu_pa
+}else {
+    for (i in 1:adjust_iter){
+        if (!cu_pa$quiet){
+            cat("Adjusting the proposal variance iteration ",i," \n")
+        }
+        mcmcOut <- runGibbs(cu_pa,burn_in)
+        cu_pa <- mcmcOut$cu_pa
+        proposeParameters <- adjustPropVar(mcmcOut,proposeParameters)
+    }
+}
+
+if (!cu_pa$quiet){
+    cat("Done burning, starting the iterations\n")
+}
+mcmcOut <- runGibbs(cu_pa,iterations)
+cu_pa <- mcmcOut$cu_pa
+
+if (!cu_pa$quiet){
+    cat("Done with the iterations, finishing up\n")
+}
+
+if (out_file_base!=""){
+    cat("Writing and plotting to files\n")
+    #Print everything to file
+    writeMCMC(mcmcOut,paste(out_file_base,"_MCMC_iter",sep=""))
+    #Trace plot
+    pdf(paste(out_file_base,"_MCMC_trace.pdf",sep=""))
+    plotEverything(mcmcOut,hi=0)
+    dev.off()
+    #histogram of the conditional distributions
+    pdf(paste(out_file_base,"_MCMC_hist.pdf",sep=""))
+    plotEverything(mcmcOut,hi=1)
+    dev.off()
+    #Posterior predictive plot 
+    pdf(paste(out_file_base,"_MCMC_post_pred.pdf",sep=""))
+    siteProb <- postPredCheck(dat,mcmcOut)
+    dev.off()
+    #Write correcting probabilities 
+    write.csv(siteProb,paste(out_file_base,"_MCMC_correct_prob.csv",sep=""))
+} else {
+    cat("Plotting\n")
+    plotEverything(mcmcOut,hi=0)
+    X11()
+    plotEverything(mcmcOut,hi=1)
+    X11()
+    postPredCheck(dat,mcmcOut)
+}
diff --git a/mapdamage/Rscripts/stats/postConditonal.R b/mapdamage/Rscripts/stats/postConditonal.R
new file mode 100644
index 0000000..e64d92b
--- /dev/null
+++ b/mapdamage/Rscripts/stats/postConditonal.R
@@ -0,0 +1,185 @@
+#The posterior conditional function utilized by the 
+# Gibbs sampler in function.R. They have all the 
+#same form maybe they should be implemented in a 
+#smarter way.
+
+#The basic structure is the following 
+
+#1. Get the old parameter 
+#2. Propose a jump
+#3. Accept it using the MH ratio
+#4. Return the old or new value based on the MH ratio
+
+updateTheta <- function(cp){
+    old_lik  <- cp$old_lik+priorTheta(cp$Theta)
+    theta_star <- proposeTheta(cp$Theta,1)
+    if (theta_star<0 ){
+        new_lik<- -Inf
+    } else {
+        theta_star_mat  <- getPmat(theta_star,cp$Rho,cp$acgt)
+        new_lik_func  <-  logLikAll(cp$dat,theta_star_mat,cp$DeltaD,cp$DeltaS,cp$laVec,cp$nuVec,cp$m)
+        new_lik <- new_lik_func+priorTheta(theta_star)
+    }
+    if (metroDesc(new_lik,old_lik)) {
+        #Accept
+        cp$Theta  <- theta_star
+        cp$ThetaMat  <- theta_star_mat
+        cp$old_lik <- new_lik_func
+    }
+    return(cp)
+}
+
+updateRho <- function(cp){
+    old_lik  <- cp$old_lik+priorRho(cp$Rho)
+    rho_star <- proposeRho(cp$Rho,1)
+    if (rho_star<=0 ){
+        new_lik<- -Inf
+    } else {
+        rho_star_mat  <- getPmat(cp$Theta,rho_star,cp$acgt)
+        new_lik_func  <-  logLikAll(cp$dat,rho_star_mat,cp$DeltaD,cp$DeltaS,cp$laVec,cp$nuVec,cp$m)
+        new_lik <- new_lik_func+priorRho(rho_star)
+    }
+    if (metroDesc(new_lik,old_lik)) {
+        #Accept
+        cp$Rho  <- rho_star
+        cp$ThetaMat  <- rho_star_mat
+        cp$old_lik <- new_lik_func
+    }
+    return(cp)
+}
+
+updateDeltaD <- function(cp){
+    old_lik <- cp$old_lik+priorDeltaD(cp$DeltaD)
+    deltad_star <- proposeDeltaD(cp$DeltaD,1)
+    if (deltad_star<0 || deltad_star>1){
+        new_lik <- -Inf
+    }else {
+        new_lik_func <- logLikAll(cp$dat,cp$ThetaMat,deltad_star,cp$DeltaS,cp$laVec,cp$nuVec,cp$m)
+        new_lik <- new_lik_func+priorDeltaD(deltad_star)
+    }
+    if (metroDesc(new_lik,old_lik)) {
+        #Accept
+        cp$DeltaD  <- deltad_star
+        cp$old_lik <- new_lik_func
+    }
+    return(cp)
+}
+
+updateDeltaS <- function(cp){
+    old_lik  <- cp$old_lik+priorDeltaS(cp$DeltaS)
+    deltas_star <- proposeDeltaS(cp$DeltaS,1)
+    if (deltas_star<0|| deltas_star>1){
+        new_lik <- -Inf
+    }else {
+        new_lik_func  <- logLikAll(cp$dat,cp$ThetaMat,cp$DeltaD,deltas_star,cp$laVec,cp$nuVec,cp$m)
+        new_lik  <- new_lik_func+priorDeltaS(deltas_star)
+    }
+    if (metroDesc(new_lik,old_lik)) {
+        #Accept
+        cp$DeltaS  <- deltas_star
+        cp$old_lik <- new_lik_func
+    }
+    return(cp)
+}
+
+updateLambda <- function(cp){
+    old_lik  <- cp$old_lik+priorLambda(cp$Lambda)
+    lambda_star <- proposeLambda(cp$Lambda,1)
+    if (lambda_star<0 || lambda_star>1){
+        return(cp)
+    } 
+    laVecStarLeft  <- seqProbVecLambda(lambda_star,cp$LambdaDisp,cp$m,cp$forward_only,cp$reverse_only)
+    if (!cp$same_overhangs){
+        #The left and right overhangs are not the same!
+        laVecStar <- c(laVecStarLeft[1:(cp$m/2)],cp$laVecRight[(cp$m/2+1):cp$m])
+    } else {
+        #It left and right are the same
+        laVecStar <- laVecStarLeft
+    }
+    new_lik_func  <- logLikAll(cp$dat,cp$ThetaMat,cp$DeltaD,cp$DeltaS,laVecStar,cp$nuVec,cp$m)
+    new_lik  <- new_lik_func+priorLambda(lambda_star)
+    if (metroDesc(new_lik,old_lik)) {
+        #Accept
+        cp$Lambda  <- lambda_star
+        cp$laVec  <- laVecStar
+        cp$old_lik <- new_lik_func
+    }
+    return(cp)
+}
+
+updateLambdaRight <- function(cp){
+    old_lik  <- cp$old_lik+priorLambdaRight(cp$LambdaRight)
+    lambda_right_star <- proposeLambdaRight(cp$LambdaRight,1)
+    if (lambda_right_star<0 || lambda_right_star>1){
+        #Reject this right away
+        return(cp)
+    } 
+    laVecStarRight  <- seqProbVecLambda(lambda_right_star,cp$LambdaDisp,cp$m,cp$forward_only,cp$reverse_only)
+    if (!cp$same_overhangs){
+        #The left and right overhangs are not the same!
+        laVecStar <- c(cp$laVec[1:(cp$m/2)],laVecStarRight[(cp$m/2+1):cp$m])
+    } else {
+        #left and right are the same
+        print("You shouldn't be calling this function if the overhangs are the same")
+        stop()
+    }
+    if (cp$forward_only){
+        print("You shouldn't be calling this function if you are only considering the forward part")
+        stop()
+    }
+    if (cp$reverse_only){
+        print("You shouldn't be calling this function if you are only considering the reverse part")
+        stop()
+    }
+    new_lik_func  <- logLikAll(cp$dat,cp$ThetaMat,cp$DeltaD,cp$DeltaS,laVecStar,cp$nuVec,cp$m)
+    new_lik  <- new_lik_func+priorLambdaRight(lambda_right_star)
+    if (metroDesc(new_lik,old_lik)) {
+        #Accept
+        cp$LambdaRight  <- lambda_right_star
+        cp$laVecRight  <- laVecStar
+        cp$old_lik <- new_lik_func
+    }
+    return(cp)
+}
+
+updateLambdaDisp <- function(cp){
+    old_lik  <- cp$old_lik+priorLambdaDisp(cp$LambdaDisp)
+    lambda_disp_star <- proposeLambdaDisp(cp$LambdaDisp,1)
+    if (lambda_disp_star<0 ){
+        return(cp)
+    } 
+    if (!cp$same_overhangs){
+        leftLaVecStar  <- seqProbVecLambda(cp$Lambda,lambda_disp_star,cp$m,cp$forward_only,cp$reverse_only)
+        rightLaVecStar  <- seqProbVecLambda(cp$LambdaRight,lambda_disp_star,cp$m,cp$forward_only,cp$reverse_only)
+        laVecStar <- c(leftLaVecStar[1:(cp$m/2)],rightLaVecStar[(cp$m/2+1):cp$m])
+    }else {
+        laVecStar  <- seqProbVecLambda(cp$Lambda,lambda_disp_star,cp$m,cp$forward_only,cp$reverse_only)
+    }
+    new_lik_func  <- logLikAll(cp$dat,cp$ThetaMat,cp$DeltaD,cp$DeltaS,laVecStar,cp$nuVec,cp$m)
+    new_lik  <- new_lik_func+priorLambdaDisp(lambda_disp_star)
+    if (metroDesc(new_lik,old_lik)) {
+        #Accept
+        cp$LambdaDisp  <- lambda_disp_star
+        cp$laVec  <- laVecStar
+        cp$old_lik <- new_lik_func
+    }
+    return(cp)
+}
+
+updateNu <- function(cp){
+    old_lik  <- cp$old_lik+priorNu(cp$Nu)
+    nu_star <- proposeNu(cp$Nu,1)
+    if (nu_star<0 || nu_star>1){
+        return(cp)
+    }
+    nu_Vec_star <- seqProbVecNuWithLengths(cp$Lambda,cp$LambdaDisp,nu_star,cp$m,sampleHJ(cp$lengths$Length,size=cp$nuSamples,prob=cp$lengths$Occurences),cp$mLe,cp$forward_only,cu_pa$nuSamples,cu_pa$ds_protocol)
+    new_lik_func  <- logLikAll(cp$dat,cp$ThetaMat,cp$DeltaD,cp$DeltaS,cp$laVec,nu_Vec_star,cp$m)
+    new_lik  <- new_lik_func+priorNu(nu_star)
+    if (metroDesc(new_lik,old_lik)) {
+        #Accept
+        cp$Nu  <- nu_star
+        cp$nuVec  <- nu_Vec_star
+        cp$old_lik <- new_lik_func
+    }
+    return(cp)
+}
diff --git a/mapdamage/Rscripts/stats/priorPropose.R b/mapdamage/Rscripts/stats/priorPropose.R
new file mode 100644
index 0000000..2baff55
--- /dev/null
+++ b/mapdamage/Rscripts/stats/priorPropose.R
@@ -0,0 +1,125 @@
+#Prior and proposal distributions for the parameter,
+#they are all uninformative.
+
+priorTheta <- function(x){
+    return(dnorm(x=x,mean=1,sd=500,log=TRUE))
+}
+
+priorRho <- function(x){
+    return(dnorm(x=x,mean=1,sd=500,log=TRUE))
+}
+
+priorDeltaD <- function(x){
+    if (x<0 || x>1){
+        return(-Inf)
+    }
+    return(dbeta(x=x,shape1=1,shape2=1,log=TRUE))
+}
+
+priorDeltaS <- function(x){
+    if (x<0 || x>1){
+        return(-Inf)
+    }
+    return(dbeta(x=x,shape1=1,shape2=1,log=TRUE))
+}
+
+priorLambda <- function(x){
+    if (x<0 || x>1){
+        return(-Inf)
+    }
+    return(dbeta(x=x,shape1=1,shape2=1,log=TRUE))
+}
+
+priorLambdaRight <- function(x){
+    if (x<0 || x>1){
+        return(-Inf)
+    }
+    return(dbeta(x=x,shape1=1,shape2=1,log=TRUE))
+}
+
+priorLambdaDisp <- function(x){
+    if (x<0){
+        return(-Inf)
+    }
+    #2 times since we truncate it at zero
+    return(log(2)+dnorm(x=x,mean=0,sd=100,log=TRUE))
+}
+
+priorNu <- function(x){
+    if (x<0 || x>1){
+        return(-Inf)
+    }
+    return(dbeta(x=x,shape1=1,shape2=1,log=TRUE))
+}
+
+proposeTheta <- function(x=NA,ra=NA){
+    sh1 <- proposeParameters$Theta 
+    if (!is.na(ra)){
+        return(rnorm(1,mean=x,sd=sh1))
+    } else {
+        return(0)
+    }
+}
+
+proposeRho <- function(x=NA,ra=NA){
+    sh1 <- proposeParameters$Rho 
+    if (!is.na(ra)){
+        return(rnorm(1,mean=x,sd=sh1))
+    } else {
+        return(0)
+    }
+}
+
+proposeDeltaD <- function(x=NA,ra=NA){
+    sh1 <- proposeParameters$DeltaD
+    if (!is.na(ra)){
+        return(rnorm(1,mean=x,sd=sh1))
+    } else {
+        return(0)#Okay since norm is symmetric
+    }
+}
+
+proposeDeltaS <- function(x=NA,ra=NA){
+    sh1 <- proposeParameters$DeltaS 
+    if (!is.na(ra)){
+        return(rnorm(1,mean=x,sd=sh1))
+    } else {
+        return(0)
+    }
+}
+
+proposeLambda <- function(x=NA,ra=NA){
+    sh1 <- proposeParameters$Lambda
+    if (!is.na(ra)){
+        return(rnorm(1,mean=x,sd=sh1))
+    } else {
+        return(0)
+    }
+}
+
+proposeLambdaRight <- function(x=NA,ra=NA){
+    sh1 <- proposeParameters$LambdaRight
+    if (!is.na(ra)){
+        return(rnorm(1,mean=x,sd=sh1))
+    } else {
+        return(0)
+    }
+}
+
+proposeLambdaDisp <- function(x=NA,ra=NA){
+    sh1 <- proposeParameters$LambdaDisp
+    if (!is.na(ra)){
+        return(rnorm(1,mean=x,sd=sh1))
+    } else {
+        return(0)
+    }
+}
+
+proposeNu <- function(x=NA,ra=NA){
+    sh1 <- proposeParameters$Nu
+    if (!is.na(ra)){
+        return(rnorm(1,mean=x,sd=sh1))
+    } else {
+        return(0)
+    }
+}
diff --git a/mapdamage/Rscripts/stats/runGeneral.R b/mapdamage/Rscripts/stats/runGeneral.R
new file mode 100755
index 0000000..03d4e52
--- /dev/null
+++ b/mapdamage/Rscripts/stats/runGeneral.R
@@ -0,0 +1,95 @@
+#! /usr/bin/Rscript
+#Parses the command line arguments and calls the main program
+
+#Enable full backtraces on errors
+on_error <- function(e)
+  {
+    traceback(2)
+    quit(status = 1)
+  }
+options(error = on_error)
+
+rm(list=ls())
+graphics.off()
+
+argsList<-commandArgs(TRUE) #Assuming the user is not using this script directly
+argsList <- argsList[-1]  #Skip --args
+
+
+#######################################################
+#
+#                  Proposal parameters 
+#
+#######################################################
+
+proposeParameters <- list( 
+                          Theta=0.0003,
+                          Rho=0.001,
+                          DeltaD=0.001,
+                          DeltaS=0.009,
+                          Lambda=0.008,
+                          LambdaRight=0.008,
+                          LambdaDisp=0.015,
+                          Nu=0.001
+                          )
+
+#######################################################
+#
+#                  Initial values
+#
+#######################################################
+
+
+start_vals <- list(
+                   ptrans = 0.00396/3,
+                   rho = 1,
+                   deltad = 0.0285,
+                   deltas = 0.269,
+                   lambda = 0.27,
+                   lambda_right = 0.27,
+                   lambda_disp = 1,
+                   len = 60,
+                   nu = 0.0645
+                   )
+
+
+#######################################################
+#
+#                   Various parameters
+#
+#######################################################
+
+grid_iter <- as.integer(argsList[1])                 # Number of random starting points for the grid search
+burn_in <- as.integer(argsList[2])                   # Burn in period
+adjust_iter <- as.integer(argsList[3])               # Adjust proposal variance parameters iterations
+iterations <- as.integer(argsList[4])                # Iterations
+
+forward_only <- as.logical(as.numeric(argsList[5]))  # Taking only the 5' end of the seqs 
+reverse_only <- as.logical(as.numeric(argsList[6]))  # Taking only the 3' end of the seqs 
+fix_disp <- as.logical(as.numeric(argsList[7]))      # Geom instead of neg Bin 
+same_overhangs <- as.logical(as.numeric(argsList[8]))# The overhangs are the same on both sides
+
+nu_samples <- as.integer(argsList[9])                # Estimate the nu vector using the Briggs model (Should be similiar ammount to the number of sequences use this on your own risk....)
+fix_nu <- as.logical(as.numeric(argsList[10]))       # Set 1 at 5' end and 0 at 3' end or else estimates it with GAM
+ds_protocol <- as.logical(as.numeric(argsList[11]))  # Single stranded protocol C>T at both sides
+sub_length <- as.integer(argsList[12])               # How long sequence to use from each side 
+
+path_to_dat <- argsList[13]                          # Absolute path to the dataset
+path_to_mapDamage_stats <- argsList[14]              # Absolute path to the mapDamage-stats folder
+out_file_base  <- argsList[15]                       # Base file name of the output
+verbose <- as.logical(as.numeric(argsList[16]))      # These options control the volume of the output
+quiet <- as.logical(as.numeric(argsList[17]))
+
+jukes_cantor <- as.logical(as.numeric(argsList[18])) # Fix the transition and transversion ratio and acgt frequencies are equal
+path_to_acgt <- argsList[19] 
+use_raw_nick_freq <- as.logical(as.numeric(argsList[20]))
+use_bw_theme <- as.logical(as.numeric(argsList[21]))
+
+
+#######################################################
+#
+#                   Run the program
+#
+#######################################################
+
+source(paste(path_to_mapDamage_stats,"main.R",sep=""))
diff --git a/mapdamage/Rscripts/stats/start.R b/mapdamage/Rscripts/stats/start.R
new file mode 100644
index 0000000..a71f034
--- /dev/null
+++ b/mapdamage/Rscripts/stats/start.R
@@ -0,0 +1,107 @@
+#Runs likelihood optimization in the beginning to 
+#start in a better place.
+
+logLikAllOptimize  <- function(x,cp){
+    #Optimizer wrapper for the log likelihood function
+    Theta      <- x["Theta"]       
+    DeltaD      <- x["DeltaD"]       
+    DeltaS      <- x["DeltaS"]
+    Lambda      <- x["Lambda"]
+    LambdaRight <- x["LambdaRight"]
+    LambdaDisp  <- x["LambdaDisp"]
+    Rho         <- x["Rho"]
+    if (sum(c(DeltaD,DeltaS,Lambda,LambdaRight)>1)>0 || sum(c(Theta,DeltaD,DeltaS,Lambda,LambdaRight,Rho)<0)>0){
+        #This very convenient for lazy people, using a unbounded optimization method Nelder-Mead 
+        #just returning Inf when we are outside the boundary.
+        return(Inf)
+    }
+    if(cp$fix_ti_tv){
+        theta_mat  <- getPmat(Theta,cp$Rho,cu_pa$acgt)
+    }else {
+        theta_mat  <- getPmat(Theta,Rho,cu_pa$acgt)
+    }
+    if (cp$fix_disp){
+        disp_to_use =  cp$LambdaDisp
+    }else {
+        disp_to_use = LambdaDisp
+    }
+    leftLaVec  <- seqProbVecLambda(Lambda,disp_to_use,cu_pa$m,cu_pa$forward_only,cu_pa$reverse_only)
+    rightLaVec  <- seqProbVecLambda(LambdaRight,disp_to_use,cu_pa$m,cu_pa$forward_only,cu_pa$reverse_only)
+    if (cp$same_overhangs){
+        rightLaVec <- leftLaVec
+    }
+    if (cp$forward_only){
+        #Only using the forward end
+        laVec <- leftLaVec
+    }else if (cp$reverse_only){
+        #Only using the backward end
+        laVec <- rightLaVec
+    }else {
+        #Both ends
+        laVec <- c(leftLaVec[1:(cu_pa$m/2)],rightLaVec[(cu_pa$m/2+1):cu_pa$m])
+    }
+    return(-logLikAll(cp$dat,theta_mat,DeltaD,DeltaS,laVec,cp$nuVec,cp$m))
+}
+
+gridSearch <- function(cp,iter){
+    #Starts the Markov chain in local maxima for "quicker" burn-in period
+    #No theoretical reasoning behind this but seems to work well in practice.
+    if (cp$nuSamples!=0){
+        write("Can't use the grid search with simulated nu vector",stderr())
+        stop()
+    }
+    if(!cp$quiet){
+        cat("Starting grid search, starting from random values\n")
+    }
+    minVal <- Inf
+    minParams <- list()
+    control  <- list(maxit=5000)
+    for (i in 1:iter){
+        out <- optim(
+                     c(Theta=runif(1),
+                       DeltaD=runif(1),
+                       DeltaS=runif(1),
+                       Lambda=runif(1),
+                       LambdaRight=runif(1),
+                       LambdaDisp=sample(c(0.5,1,2,3,4,50,100,150,400),1),
+                       Rho=sample(c(0.5,.75,1,1.25,1.5),1)
+                      ),
+                     logLikAllOptimize,
+                     NULL,
+                     cp=cp,
+                     method="Nelder-Mead",
+                     control=control
+                     )
+        if (out$value<minVal){
+            minParams <- out
+            minVal <- out$value
+        }
+        if (cp$verbose){
+            cat("Iteration\t",i,"\t",-minVal,"\n")
+        }
+    }
+    #These parameters are always optimized
+    cp$Theta <- minParams$par["Theta"]
+    cp$DeltaD <- minParams$par["DeltaD"]
+    cp$DeltaS <- minParams$par["DeltaS"]
+    cp$Lambda <- minParams$par["Lambda"]
+    
+    #Only update the other ones if the user requested for them
+    if (!cp$fix_ti_tv){
+        cp$Rho  <- minParams$par["Rho"]
+    }
+    if (!cp$fix_disp){
+        cp$LamdaDisp  <- minParams$par["LambdaDisp"]
+    }
+    if (!cp$same_overhangs){
+        cp$LamdaRight  <- minParams$par["LambdaRight"]
+    }
+
+    #Now calculating matrix and vectors accompanying the optimal values
+    cp$laVec <- seqProbVecLambda(cp$Lambda,cp$LambdaDisp,nrow(cp$dat),cp$forward_only,cp$reverse_only)
+    cp$laVecRight <- seqProbVecLambda(cp$Lambda,cp$LambdaDisp,nrow(cp$dat),cp$forward_only,cp$reverse_only)
+    cp$ThetaMat <- getPmat(cp$Theta,cp$Rho,cp$acgt)
+    cp$old_lik <- - minVal
+
+    return(cp)
+}
diff --git a/mapdamage/__init__.py b/mapdamage/__init__.py
new file mode 100644
index 0000000..ee20c07
--- /dev/null
+++ b/mapdamage/__init__.py
@@ -0,0 +1,10 @@
+#!/usr/bin/env python
+
+import mapdamage.parseoptions
+import mapdamage.seq
+import mapdamage.align
+import mapdamage.tables
+import mapdamage.composition
+import mapdamage.rscript
+import mapdamage.rescale
+import mapdamage.version
diff --git a/mapdamage/align.py b/mapdamage/align.py
new file mode 100644
index 0000000..6abe523
--- /dev/null
+++ b/mapdamage/align.py
@@ -0,0 +1,134 @@
+#!/usr/bin/env python
+
+import itertools
+import mapdamage
+
+# from Martin Kircher, description of CIGAR operations
+#O BAM Description
+#M 0 alignment match (can be a sequence match or mismatch)
+#I 1 insertion to the reference
+#D 2 deletion from the reference
+#N 3 skipped region from the reference
+#S 4 soft clipping (clipped sequences present in SEQ)
+#H 5 hard clipping (clipped sequences NOT present in SEQ)
+#P 6 padding (silent deletion from padded reference)
+#= 7 sequence match
+#X 8 sequence mismatch
+
+
+def get_coordinates(read):  
+  """ return external coordinates of aligned read bases """
+  fivep = read.aend if read.is_reverse else read.pos
+  threep = read.pos if read.is_reverse else read.aend
+  
+  return fivep, threep
+
+
+def get_around(coord, chrom, reflengths, length, ref):
+  """ return reference sequences before and after the read
+  check for extremities and return what is available """
+  coord_min = min(coord)
+  coord_max = max(coord)
+
+  pos_before = max(0, coord_min - length)
+  pos_after  = min(reflengths[chrom], coord_max + length)
+
+  # Uppercased, to be sure that we don't compare A,C,G,T and a,c,g,t
+  before = ref.fetch(chrom, pos_before, coord_min).upper()
+  after = ref.fetch(chrom, coord_max, pos_after).upper()
+
+  return before, after
+
+
+def align(cigarlist, seq, ref):
+  """ insert gaps according to the cigar string 
+  deletion: gaps to be inserted into read sequences, 
+  insertions: gaps to be inserted into reference sequence """  
+  lref = list(ref)
+  for nbr, idx in parse_cigar(cigarlist, 1):
+    lref[idx:idx] = ["-"] * nbr
+
+  lread = list(seq)
+  for nbr, idx in parse_cigar(cigarlist, 2):
+    lread[idx:idx] = ["-"] * nbr
+
+  return "".join(lread), "".join(lref)
+
+
+def align_with_qual(cigarlist, seq, qual, threshold, ref):
+  """ insert gaps according to the cigar string 
+  deletion: gaps to be inserted into read sequences and qualities, 
+  insertions: gaps to be inserted into reference sequence """  
+  lref = list(ref)
+  for nbr, idx in parse_cigar(cigarlist, 1):
+    lref[idx:idx] = ["-"] * nbr
+
+  lread = list(seq)
+  lqual = list(qual)
+  for nbr, idx in parse_cigar(cigarlist, 2):
+    lread[idx:idx] = ["-"] * nbr
+    lqual[idx:idx] = ["-"] * nbr
+
+  for idx, score in enumerate(lqual):
+    # mask read and reference nucleotides (so not gaps) if below desired threshold
+    if (ord(score)-33) < threshold and lread[idx] != "-":
+      lread[idx] = "N"
+      lref[idx]  = "N"
+
+  return "".join(lread), "".join(lqual), "".join(lref)
+
+
+def get_mis(read, seq, refseq, ref, length, tab, end):
+  """ count mismatches using aligned reference and read,
+  must be redone since N in reference were randomly replaced by any bases """
+  std = '-' if read.is_reverse else '+'
+  subtable = tab[ref][end][std]
+
+  for (i, nt_seq, nt_ref) in itertools.izip(xrange(length), seq, refseq):
+    if (nt_seq in "ACGT-") and (nt_ref in "ACGT-"):
+      if nt_ref != "-":
+        # record base composition in the reference, only A, C, G, T
+        subtable[nt_ref][i] += 1
+
+      # Most ref/seq pairs will be identical
+      if (nt_ref != nt_seq):
+        mut = "%s>%s" % (nt_ref, nt_seq)
+        subtable[mut][i] += 1
+
+
+def parse_cigar(cigarlist, ope):
+  """ for a specific operation (mismach, match, insertion, deletion... see above)
+  return occurences and index in the alignment """
+  tlength = 0
+  coordinate = []
+  # count matches, indels and mismatches
+  oplist = (0, 1, 2, 7, 8)
+  for operation, length in cigarlist:
+    if operation == ope:
+        coordinate.append([length, tlength])
+    if operation in oplist: 
+        tlength += length
+  return coordinate
+
+
+def record_soft_clipping(read, tab, length):
+  """ record soft clipped bases at extremities """
+  def update_table(end, std, bases):
+    for i in range(0, min(bases, length)):
+      tab[end][std]['S'][i] += 1
+
+  strand = '-' if read.is_reverse else '+'
+  for (nbases, idx) in mapdamage.align.parse_cigar(read.cigar, 4):
+    if idx == 0:
+      # Soft-clipping at the left side of the alignment
+      end = '3p' if read.is_reverse else '5p'
+    else:
+      # Soft-clipping at the right side of the alignment
+      end = '5p' if read.is_reverse else '3p'
+
+    update_table(end, strand, nbases)
+
+
+
+
+
diff --git a/mapdamage/composition.py b/mapdamage/composition.py
new file mode 100644
index 0000000..882139a
--- /dev/null
+++ b/mapdamage/composition.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python
+
+import mapdamage
+import itertools
+import csv
+import subprocess
+import sys
+
+def count_ref_comp(read, chrom, before, after, comp):
+  """ record basae composition in external genomic regions """
+  std = '-' if read.is_reverse else '+'
+
+  _update_table(comp[chrom]['5p'][std], before, xrange(-len(before), 0))
+  _update_table(comp[chrom]['3p'][std], after,  xrange(1, len(after) + 1))
+
+
+def count_read_comp(read, chrom, length, comp):
+  """ record base composition of read, discard marked nucleotides """
+  std, seq = '+', read.query
+  if read.is_reverse:
+    std, seq = '-', mapdamage.seq.revcomp(seq)
+
+  _update_table(comp[chrom]['5p'][std], seq,           xrange(1, length + 1))
+  _update_table(comp[chrom]['3p'][std], reversed(seq), xrange(-1, - length - 1, -1))
+
+
+def _update_table(table, sequence, indices):
+  for (index, nt) in itertools.izip(indices, sequence):
+    if nt in table:
+      table[nt][index] += 1
+
+
+def get_base_comp(filename,destination=False):
+    """
+    Gets the basecomposition of all the sequences in filename
+    and returns the value to destination if given.
+    """
+    path_to_seqtk = mapdamage.rscript.construct_path("seqtk",folder="seqtk") 
+    bases = {"A":0,"C":0,"G":0,"T":0}
+    alp = ["A","C","G","T"]
+    try:
+        proc = subprocess.Popen([path_to_seqtk,"comp",filename],stdout=subprocess.PIPE)
+        out = proc.communicate()[0]
+        for li in out.splitlines():
+            tabs = li.split() # 1 is the ref, 2 is the total and then the base counts A, C, G and T.
+            bases["A"] = bases["A"] + int(tabs[2])
+            bases["C"] = bases["C"] + int(tabs[3])
+            bases["G"] = bases["G"] + int(tabs[4])
+            bases["T"] = bases["T"] + int(tabs[5])
+    except (OSError, ValueError), error:
+        sys.stderr.write("Error: Seqtk failed: %s\n" % (error,))
+        sys.exit(1)
+    # get the base frequencies
+    ba_su = sum(bases.values())
+    for ba in alp:
+        bases[ba] = float(bases[ba])/float(ba_su)
+    if (destination==False):
+        return bases
+    else:
+        # write the results
+        fo = open(destination,"w")
+        vals = [str(bases[i]) for i in alp]
+        fo.write(",".join(alp)+"\n")
+        fo.write(",".join(vals)+"\n")
+        fo.close()
+
+def read_base_comp(filename):
+    """
+    Read the base compition from a file created by get_base_comp
+    """
+    fh = open(filename)
+    first_line = True
+    for li in fh:
+        li = li.rstrip()
+        lp = li.split()
+        if first_line:
+            header = lp
+            first_line = False 
+        else:
+            body = lp
+    bases = {}
+    for ba,per in zip(header,body):
+        bases[ba] = per
+    return bases
diff --git a/mapdamage/parseoptions.py b/mapdamage/parseoptions.py
new file mode 100644
index 0000000..7102cf2
--- /dev/null
+++ b/mapdamage/parseoptions.py
@@ -0,0 +1,264 @@
+#!/usr/bin/env python
+
+from optparse import OptionParser, OptionGroup, SUPPRESS_HELP
+import os
+import sys
+
+from mapdamage.version import __version__
+from mapdamage.rscript import check_R_lib
+
+def file_exist(filename):
+    if os.path.exists(filename) and not os.path.isdir(filename):
+        return True
+    elif filename == "-":
+        return True
+    else:
+        sys.stderr.write("Error: '%s' is not a valid file\n" % (filename))
+        return None
+
+
+def whereis(program):
+    for path in os.environ.get('PATH', '').split(':'):
+        if os.path.exists(os.path.join(path, program)) and \
+            not os.path.isdir(os.path.join(path, program)):
+            return os.path.join(path, program)
+    return None
+
+
+
+def check_py_version():
+    req_version = (2, 6)
+    cur_version = sys.version_info
+
+    if cur_version >= req_version:
+        return True
+    else:
+        sys.stderr.write("Your Python interpreter is too old."\
+            "Please consider upgrading to at least %d.%d\n" % (req_version[0], req_version[1]))
+        return None
+
+
+def options():  
+    parser = OptionParser("%prog [options] -i BAMfile -r reference.fasta\n\nUse option -h or --help for help", version=__version__, \
+            epilog="report bugs to aginolhac at snm.ku.dk, MSchubert at snm.ku.dk or jonsson.hakon at gmail.com")
+
+    args = OptionGroup(parser, "Input files")
+    args.add_option("-i", "--input", help="SAM/BAM file, must contain a valid header, use '-' for reading a BAM from stdin", \
+          action="store", type="string", dest="filename")
+    args.add_option("-r", "--reference", help="Reference file in FASTA format", \
+          action="store", dest="ref")
+
+    parser.add_option_group(args)
+    group = OptionGroup(parser, "General options")
+    group.add_option("-n", "--downsample", help = "Downsample to a randomly selected fraction of the reads (if 0 < DOWNSAMPLE < 1), or " \
+                     "a fixed number of randomly selected reads (if DOWNSAMPLE >= 1). By default, no downsampling is performed.",
+                     type = float, default = None)
+    group.add_option("--downsample-seed", help = "Seed value to use for downsampling. See documentation for py module 'random' for default behavior.",
+                     type = int, default = None)
+    group.add_option("--merge-reference-sequences", help = "Ignore referece sequence names when tabulating reads (using '*' instead). "
+                     "Useful for alignments with a large number of reference sequnces, which may otherwise result in excessive "
+                     "memory or disk usage due to the number of tables generated.",
+                     default = False, action = "store_true")
+    group.add_option("-l", "--length", dest="length", help="read length, in nucleotides to consider [%default]", \
+            type = int, default=70,action="store")
+    group.add_option("-a", "--around", dest="around", help="nucleotides to retrieve before/after reads [%default]", \
+            type = int, default=10,action="store")
+    group.add_option("-Q", "--min-basequal", dest="minqual", help="minimun base quality Phred score considered, Phred-33 assumed [%default]", \
+            type = int, default=0, action="store")
+    group.add_option("-d", "--folder", help="folder name to store results [results_FILENAME]", \
+          action="store", type="string", dest="folder")
+    group.add_option("-f", "--fasta", dest="fasta", help="Write alignments in a FASTA file", \
+          default=False,action="store_true")
+    group.add_option("--plot-only", dest="plot_only", help="Run only plotting from a valid result folder", \
+          default=False,action="store_true")
+    group.add_option("-q", "--quiet", dest="quiet", help="Disable any output to stdout", \
+          default=False,action="store_true")
+    group.add_option("-v", "--verbose", dest="verbose", help="Display progression information during parsing", \
+          default=False,action="store_true")
+    group.add_option("--mapdamage-modules", dest="mapdamage_modules", help="Override the system wide installed mapDamage module", \
+          default=None)
+    group.add_option("--no-plot", dest="no_r", help=SUPPRESS_HELP, default=False, action="store_true")
+    parser.add_option_group(group)
+
+    # options for plotting damage patterns
+    group2 = OptionGroup(parser, "Options for graphics")
+    group2.add_option("-y", "--ymax", dest="ymax", \
+           help="graphical y-axis limit for nucleotide misincorporation frequencies [%default]", type = float, \
+           default=0.3,action="store")
+    group2.add_option("-m", "--readplot", dest="readplot", \
+           help="read length, in nucleotides, considered for plotting nucleotide misincorporations [%default]", \
+           type = int, default=25, action="store")
+    group2.add_option("-b", "--refplot", dest="refplot", \
+          help="the number of reference nucleotides to consider for ploting base composition in the region located upstream "
+          "and downstream of every read [%default]", type= int, default=10, action="store")
+    group2.add_option("-t", "--title", dest="title", \
+          help="title used for plots [%default]", \
+          type="string", default="",action="store")
+    parser.add_option_group(group2)
+
+    # Then the plethora of optional options for the statistical estimation ..
+    group3 = OptionGroup(parser,"Options for the statistical estimation")
+    group3.add_option("", "--rand", dest="rand", \
+            help="Number of random starting points for the likelihood optimization  [%default]", type = int, default=30, action="store")
+    group3.add_option("", "--burn", dest="burn", \
+            help="Number of burnin iterations  [%default]", type = int, default=10000,action="store")
+    group3.add_option("", "--adjust", dest="adjust", \
+            help="Number of adjust proposal variance parameters iterations  [%default]", type = int, default=10, action="store")
+    group3.add_option("", "--iter", dest="iter", \
+            help="Number of final MCMC iterations  [%default]", type = int, default=50000, action="store")
+    group3.add_option("", "--forward", dest="forward", \
+            help="Using only the 5' end of the seqs  [%default]", default=False, action="store_true")
+    group3.add_option("", "--reverse", dest="reverse", \
+            help="Using only the 3' end of the seqs  [%default]", default=False, action="store_true")
+    group3.add_option("", "--var-disp", dest="var_disp", \
+            help="Variable dispersion in the overhangs  [%default]", default=False,action="store_true")
+    group3.add_option("", "--jukes-cantor", dest="jukes_cantor", \
+            help="Use Jukes Cantor instead of HKY85  [%default]", default=False,action="store_true")
+    group3.add_option("", "--diff-hangs", dest="diff_hangs", \
+            help="The overhangs are different for 5' and 3'  [%default]", default=False, action="store_true")
+    group3.add_option("", "--fix-nicks" , dest="fix_nicks", \
+            help="Fix the nick frequency vector (Only C.T from the 5' end and G.A from the 3' end)  [%default]", default=False, action="store_true")
+    group3.add_option("", "--use-raw-nick-freq" , dest="use_raw_nick_freq", \
+            help="Use the raw nick frequency vector without smoothing  [%default]", default=False, action="store_true")
+    group3.add_option("", "--single-stranded", dest="single_stranded", \
+            help="Single stranded protocol [%default]", default=False, action="store_true")
+    group3.add_option("", "--theme-bw", dest="theme_bw", \
+            help="Use black and white theme in post. pred. plot [%default]", default=False, action="store_true")
+    group3.add_option("", "--seq-length", dest="seq_length", \
+            help="How long sequence to use from each side [%default]", type = int, default=12, action="store")
+    group3.add_option("--stats-only", dest="stats_only", help="Run only statistical estimation from a valid result folder", \
+          default=False, action="store_true")
+    group3.add_option("--rescale", dest="rescale", help="Rescale the quality scores in the BAM file using the output from the statistical estimation", \
+          default=False, action="store_true")
+    group3.add_option("--rescale-only", dest="rescale_only", help="Run only rescaling from a valid result folder", \
+          default=False, action="store_true")
+    group3.add_option("--rescale-out", dest="rescale_out", help="Write the rescaled BAM to this file", \
+          default=None, action="store")
+    group3.add_option("--no-stats", help="Disabled statistical estimation, active by default", default=False, action="store_true")
+    group3.add_option("--check-R-packages", help="Check if the R modules are working", default=False, action="store_true")
+
+    parser.add_option_group(group3)
+
+    #Parse the arguments
+    (options, args) = parser.parse_args()
+    
+    # check python version
+    if not check_py_version():
+        return None
+
+    # if the user wants to check the R packages then do that before the option parsing 
+    if options.check_R_packages:
+        if check_R_lib():
+            sys.exit(1)
+        else:
+            print("All R packages are present")
+            sys.exit(0)
+
+    # check general arguments
+    if not (options.plot_only or options.stats_only) and not options.filename:
+        parser.error('SAM/BAM file not given (-i)')
+    if not (options.plot_only or options.ref):
+        parser.error('Reference file not given (-r)')
+    if not options.plot_only and not options.stats_only:
+        if not file_exist(options.filename) or not file_exist(options.ref):
+            return None
+    if options.downsample is not None:
+        if options.downsample <= 0:
+            parser.error("-n/--downsample must be a positive value")
+        elif options.downsample >= 1:
+            options.downsample = int(options.downsample)
+
+    if options.plot_only and not options.folder:
+        parser.error('Folder not provided, required with --plot-only')
+    if options.stats_only and not options.folder:
+        parser.error('Folder not provided, required with --stats-only')
+    if options.rescale_only and not options.folder:
+        parser.error('Folder not provided, required with --rescale-only')
+    if options.rescale_only and not options.filename:
+        parser.error('Input bam not provided, required with --rescale-only')
+    if options.rescale_only and not options.ref:
+        parser.error('Reference not provided, required with --rescale-only')
+
+    if options.verbose and options.quiet:
+        parser.error('Cannot use verbose and quiet option at the same time')
+
+    # check options
+    if options.length < 0:
+        parser.error('length (-l) must be a positive integrer')
+    if options.around < 0:
+        parser.error('around (-a) must be a positive integrer')
+    if options.ymax <= 0 or options.ymax > 1:
+        parser.error('ymax (-b) must be an real number beetween 0 and 1')
+    if options.readplot < 0:
+        parser.error('readplot (-m) must be a positive integrer')
+    if options.refplot < 0:
+        parser.error('refplot (-b) must be a positive integrer')
+    if options.refplot > options.around and not options.plot_only:
+        parser.error('refplot (-b) must be inferior to around (-a)')
+    if options.readplot > options.length:
+        parser.error('readplot (-m) must be inferior to length (-l)')
+    if options.minqual < 0 or  options.minqual > 41:
+        parser.error('minimal base quality, Phred score, must be within this range: 0 - 41')
+
+    # check statistic options
+    if options.forward and options.reverse:
+        parser.error('Cannot use only forward end and only reverse end for the statistics')
+
+    # use filename as default for plot titles if not set
+    if options.title == "" and options.filename:
+        options.title = os.path.splitext(os.path.basename(options.filename))[0]
+    # for --plot-only, use the folder name, without results_ as title
+    if options.title == "" and not options.filename and options.folder:
+        options.title = os.path.splitext(os.path.basename(options.folder))[0].replace("results_", "")
+
+    # check folder
+    if not options.folder and options.filename:
+        options.folder = "results_"+os.path.splitext(os.path.basename(options.filename))[0]
+
+    # check destination for rescaled bam
+    if not options.rescale_out and (options.rescale or options.rescale_only):
+        # if there are mulitiple bam files to rescale then pick first one as 
+        # the name of the rescaled file
+        if isinstance(options.filename,list):
+            basename = os.path.basename(options.filename[0])
+        else:
+            basename = os.path.basename(options.filename)
+        with_ext = os.path.splitext(basename)[0] + ".rescaled.bam"
+        options.rescale_out = os.path.join(options.folder, with_ext)
+
+    if os.path.isdir(options.folder):
+        if not options.quiet and not options.plot_only:
+            print("Warning, %s already exists" % options.folder)
+        if options.plot_only:
+            if not file_exist(options.folder+"/dnacomp.txt") or not file_exist(options.folder+"/misincorporation.txt"):
+                parser.error('folder %s is not a valid result folder' % options.folder)
+    else:
+        os.makedirs(options.folder, mode = 0750)
+        if options.plot_only or options.stats_only or options.rescale_only:
+            sys.stderr.write("Error, %s does not exist while plot/stats/rescale only was used\n" % options.folder)
+            return None
+
+
+    # check if the Rscript executable is present on the system
+    if not whereis('Rscript'):
+        print("Warning, Rscript is not in your PATH, plotting is disabled")
+        options.no_r = True
+
+    # check the nick frequencies options
+    if (options.use_raw_nick_freq + options.fix_nicks + options.single_stranded)>1:
+        parser.error("The options --use-raw-nick-freq, --fix-nicks and --single-stranded are mutually exclusive.")
+
+    if check_R_lib():
+        # check for R libraries
+        print("The Bayesian estimation has been disabled\n")
+        options.no_stats = True
+        if options.stats_only:
+            sys.exit("Cannot use --stats-only with missing R libraries")
+        if options.rescale:
+            sys.exit("Cannot use --rescale with missing R libraries")
+        if options.rescale_only:
+            sys.exit("Cannot use --rescale-only with missing R libraries")
+
+
+    return options
+
diff --git a/mapdamage/rescale.py b/mapdamage/rescale.py
new file mode 100644
index 0000000..789d21e
--- /dev/null
+++ b/mapdamage/rescale.py
@@ -0,0 +1,352 @@
+import csv
+import sys
+import os
+import mapdamage
+import pysam
+import itertools
+import math
+import logging
+import time
+
+
+def phred_pval_to_char(pval):
+    """Transforming error rate to ASCII character using the Phred scale"""
+    return chr(int(round(-10*math.log10(abs(pval)))+33))
+
+def phred_char_to_pval(ch):
+    """Transforming ASCII character in the Phred scale to the error rate"""
+    return 10**(-(float(ord(ch))-float(33))/10)
+
+def get_corr_prob(folder):
+    """
+    Reads the damage probability correction file, returns a 
+    dictionary with this structure 
+    position (one based)  -  CT  -  probability
+                          -  GA  -  probability
+    """
+    full_path = os.path.join(folder,"Stats_out_MCMC_correct_prob.csv")
+    if not os.path.isfile(full_path):
+        sys.exit("Missing file, the file \n\tStats_out_MCMC_correct_prob.csv\nshould be in the folder\n\t"+folder+"\nDid you run the MCMC estimates of the parameters?")
+    try:
+        fi_handle = csv.DictReader(open(full_path))
+        corr_prob = {}
+        for line in fi_handle:
+            if (corr_prob.has_key(line["Position"])):
+                sys.exit('This file has multiple position definitions %s, line %d: %s' % \
+                    (folder, fi_handle.line_num, corr_prob[line["Position"]]))
+            else:
+                corr_prob[int(line["Position"])] = {'C.T':float(line["C.T"]), 'G.A':float(line["G.A"])}
+        return corr_prob
+    except csv.Error as e:
+        sys.exit('File %s, line %d: %s' % (os.path.join(folder,"Stats_out_MCMC_correct_prob.csv"), \
+            fi_handle.line_num, e))
+
+
+def corr_this_base(corr_prob, nt_seq, nt_ref, pos, length,direction="both"):
+    """
+    The position specific damaging correction, using the input 
+    corr_prob dictionary holding the damage correcting values 
+    nt_seq nucleotide in the sequence 
+    nt_ref nucleotide in the reference
+    pos relative position from the 5' end 
+    length length of the sequence
+    direction which end to consider the rescaling
+    returns the correction probability for this particular set
+    """
+    if (pos == 0):
+        # not using 0 based indexing
+        raise SystemError
+    if ( nt_seq == "T" and nt_ref == "C" ):
+        # an C to T transition
+        subs = "C.T"
+    elif( nt_seq == "A" and nt_ref == "G" ):
+        # an G to A transition
+        subs = "G.A"
+    else:
+        # other transitions/transversions are not affected by damage
+        return 0
+    
+    back_pos = pos-length-1 
+    # position from 3' end
+
+    if corr_prob.has_key(pos):
+        p5_corr = corr_prob[pos][subs]
+        # correction from 5' end
+    else:
+        p5_corr = 0
+
+    if corr_prob.has_key(back_pos):
+        p3_corr = corr_prob[back_pos][subs]
+        # correction from 3' end
+    else:
+        p3_corr = 0
+
+    if direction == "forward":
+        return p5_corr
+    elif direction == "backward":
+        return p3_corr 
+    elif direction == "both":
+        if pos < abs(back_pos) :
+            # then we use the forward correction
+            return p5_corr
+        else :
+            # else the backward correction
+            return p3_corr
+    else:
+        # this should not happen
+        raise SystemExit("Abnormal direction in the rescaling procedure")
+
+def initialize_subs():
+    """Initialize a substitution table, to track the expected substitution counts"""
+    per_qual = dict(zip(range(0,130),[0]*130))
+    subs = {"CT-before":per_qual.copy(),\
+            "TC-before":per_qual.copy(),\
+            "GA-before":per_qual.copy(),\
+            "AG-before":per_qual.copy(),\
+            "CT-after":per_qual.copy(),\
+            "TC-after":per_qual.copy(),\
+            "GA-after":per_qual.copy(),\
+            "AG-after":per_qual.copy(),\
+            "A":0,\
+            "C":0,\
+            "G":0,\
+            "T":0,\
+            "CT-pvals":0.0,\
+            "CT-pvals_before":0.0,\
+            "TC-pvals":0.0,\
+            "GA-pvals":0.0,\
+            "GA-pvals_before":0.0,\
+            "AG-pvals":0.0,\
+            }
+    return subs
+
+
+
+def record_subs(subs,nt_seq,nt_ref,nt_qual,nt_newqual,prob_corr):
+    """ record the expected substitution change, prob_corr is the excact version for nt_qual"""
+    if ( nt_seq == "T" and nt_ref == "C"):
+        sub_type = "CT"
+        subs["CT-pvals"] += prob_corr
+        subs["CT-pvals_before"] += 1-phred_char_to_pval(nt_qual)
+    elif ( nt_seq == "A" and nt_ref == "G"):
+        sub_type = "GA"
+        subs["GA-pvals"] += prob_corr
+        subs["GA-pvals_before"] += 1-phred_char_to_pval(nt_qual)
+    elif ( nt_seq == "C" and nt_ref == "T"):
+        sub_type = "TC"
+        subs["TC-pvals"] += 1-phred_char_to_pval(nt_qual)
+        if (nt_qual != nt_newqual):
+            raise SystemError("Internal error: rescaling qualities for the wrong transitions")
+    elif ( nt_seq == "G" and nt_ref == "A"):
+        sub_type = "AG"
+        subs["AG-pvals"] += 1-phred_char_to_pval(nt_qual)
+        if (nt_qual != nt_newqual):
+            raise SystemError("Internal error: rescaling qualities for the wrong transitions")
+    else:
+        sub_type = "NN"
+    if (sub_type != "NN"):
+        # record only transitions 
+        subs[sub_type+"-before"][int(ord(nt_qual))-33] += 1
+        subs[sub_type+"-after"][int(ord(nt_newqual))-33] += 1
+    if (nt_ref in ["A","C","G","T"]):
+        subs[nt_ref] += 1
+
+def qual_summary_subs(subs):
+    """Calculates summary statistics for the substition table subs"""
+    for i in ["CT-before","TC-before","GA-before","AG-before","CT-after","TC-after","GA-after","AG-after"]:
+        for lv in [0,10,20,30,40]:
+            for qv in subs[i]:
+                if qv >= lv :
+                    key = i+"-Q"+str(lv)
+                    if subs.has_key(key):
+                        subs[key] += subs[i][qv]
+                    else:
+                        subs[key] = subs[i][qv]
+
+def print_subs(subs):
+    """Print the substition table"""
+    print("\tThe expected substition frequencies before and after scaling using the scaled qualities as probalities:")
+    if subs["C"]!=0:
+        # the special case of no substitutions
+        print("\tCT\t"+str(subs["CT-pvals_before"]/subs["C"])+"\t\t"+str(subs["CT-pvals"]/subs["C"]))
+    else: 
+        print("\tCT\tNA\t\tNA")
+    if subs["T"]!=0:
+        print("\tTC\t"+str(subs["TC-pvals"]/subs["T"])+"\t\t"+str(subs["TC-pvals"]/subs["T"]))
+    else:
+        print("\tTC\tNA\t\tNA")
+    if subs["G"]!=0:
+        print("\tGA\t"+str(subs["GA-pvals_before"]/subs["G"])+"\t\t"+str(subs["GA-pvals"]/subs["G"]))
+    else:
+        print("\tGA\tNA\t\tNA")
+    if subs["A"]!=0:
+        print("\tAG\t"+str(subs["AG-pvals"]/subs["A"])+"\t\t"+str(subs["AG-pvals"]/subs["A"]))
+    else:
+        print("\tAG\tNA\t\tNA")
+    print("\tQuality metrics before and after scaling")
+    print("\tCT-Q0 \t"+str(subs["CT-before-Q0"])+"\t\t"+str(subs["CT-after-Q0"]))
+    print("\tCT-Q10 \t"+str(subs["CT-before-Q10"])+"\t\t"+str(subs["CT-after-Q10"]))
+    print("\tCT-Q20 \t"+str(subs["CT-before-Q20"])+"\t\t"+str(subs["CT-after-Q20"]))
+    print("\tCT-Q30 \t"+str(subs["CT-before-Q30"])+"\t\t"+str(subs["CT-after-Q30"]))
+    print("\tCT-Q40 \t"+str(subs["CT-before-Q40"])+"\t\t"+str(subs["CT-after-Q40"]))
+    print("\tGA-Q0 \t"+str(subs["GA-before-Q0"])+"\t\t"+str(subs["GA-after-Q0"]))
+    print("\tGA-Q10 \t"+str(subs["GA-before-Q10"])+"\t\t"+str(subs["GA-after-Q10"]))
+    print("\tGA-Q20 \t"+str(subs["GA-before-Q20"])+"\t\t"+str(subs["GA-after-Q20"]))
+    print("\tGA-Q30 \t"+str(subs["GA-before-Q30"])+"\t\t"+str(subs["GA-after-Q30"]))
+    print("\tGA-Q40 \t"+str(subs["GA-before-Q40"])+"\t\t"+str(subs["GA-after-Q40"]))
+    
+
+def rescale_qual_read(bam, read, ref, corr_prob,subs, debug = False,direction="both"):
+    """
+    bam              a pysam bam object
+    read             a pysam read object
+    ref              a pysam fasta ref file
+    reflengths       a dictionary holding the length of the references 
+    subs             a dictionary holding the corrected number of substition before and after scaling 
+    corr_prob dictionary from get_corr_prob
+    returns a read with rescaled quality score
+    
+    Iterates through the read and reference, rescales the quality 
+    according to corr_prob
+    """
+    if not debug:
+        # no need to log when unit testing
+        logger = logging.getLogger(__name__)
+    raw_seq = read.query
+    # external coordinates 5' and 3' , 0-based offset
+    coordinate = mapdamage.align.get_coordinates(read)
+    # fetch reference name, chromosome or contig names
+    chrom = bam.getrname(read.tid)
+    refseq = ref.fetch(chrom, min(coordinate), max(coordinate)).upper()
+    # add gaps to qualities and mask read and reference nucleotides if below desired threshold
+    (seq, qual, refseq) = mapdamage.align.align_with_qual(read.cigar, \
+        raw_seq, read.qqual, -100, refseq)
+    length_read = len(raw_seq)
+    length_align = len(seq)
+    # reverse complement read and reference when mapped reverse strand
+    if read.is_reverse:
+        refseq = mapdamage.seq.revcomp(refseq)
+        seq = mapdamage.seq.revcomp(seq)
+        qual = qual[::-1]
+    new_qual = [-100]*length_read
+    pos_on_read = 0
+    number_of_rescaled_bases = 0.0
+    for (i, nt_seq, nt_ref, nt_qual) in itertools.izip(xrange(length_align), seq, refseq, qual):
+        # rescale the quality according to the triplet position, 
+        # pair of the reference and the sequence
+        if ((nt_seq == "T" and nt_ref =="C") or (nt_seq == "A" and nt_ref =="G")):
+            # need to rescale this subs.
+            pdam = 1 - corr_this_base(corr_prob, nt_seq, nt_ref, pos_on_read + 1, length_read,direction=direction)
+            pseq = 1 - phred_char_to_pval(nt_qual)
+            newp = pdam*pseq # this could be numerically unstable
+            newq = phred_pval_to_char(1-newp)
+            number_of_rescaled_bases += 1-pdam
+        else:
+            # don't rescale, other bases
+            newp = 1 - phred_char_to_pval(nt_qual)
+            newq = nt_qual 
+        if pos_on_read < length_read:
+            new_qual[pos_on_read] = newq 
+            record_subs(subs,nt_seq,nt_ref,nt_qual,new_qual[pos_on_read],newp)
+            if nt_seq != "-":
+                pos_on_read += 1
+            # done with the aligned portion of the read 
+        else:
+            if not debug:
+                logger.warning("Warning: The aligment of the read is longer than the actual read %s",(read.qname))
+            break
+    new_qual = "".join(new_qual)
+
+    if read.is_reverse:
+        new_qual = new_qual[::-1]
+    if (read.cigar[0][0] == 4):
+        # check for soft clipping at forward end 
+        new_qual = read.qual[0:read.cigar[0][1]] + new_qual
+    if (read.cigar[-1][0] == 4):
+        # the same backwards
+        new_qual = new_qual + read.qual[-read.cigar[-1][1]:]
+
+    read.qual = new_qual
+    # truncate this to 5 digits
+    number_of_rescaled_bases = float("%.5f" % number_of_rescaled_bases)
+    # check if the read has a MR tag 
+    for tag in read.tags:
+        if tag[0] == "MR":
+            raise SystemExit("Read: %s already has a MR tag, can't rescale" % (read))
+    read.tags = read.tags + [("MR:f",number_of_rescaled_bases)]
+    return read
+
+
+def rescale_qual(ref, options,debug=False):
+    """    
+    ref                a pysam fasta ref file
+    bam_filename       name of a BAM/SAM file to read
+    fi                 file containing the csv with correction probabilities
+    reflengths         dictionary with the reference lengths
+    options            options from the command line parsing
+
+    Iterates through BAM file, makes a new BAM file with rescaled qualities.
+    """
+    if not debug:
+        # no need to log when unit testing
+        logger = logging.getLogger(__name__)
+        logger.info("Rescaling BAM: '%s' -> '%s'" % (options.filename, options.rescale_out))
+    start_time = time.time()
+
+    # open SAM/BAM files
+    bam = pysam.Samfile(options.filename)
+    if debug:
+        write_mode = "wh"
+    else:
+        write_mode = "wb"
+    bam_out = pysam.Samfile(options.rescale_out,write_mode, template = bam)
+    corr_prob = get_corr_prob(options.folder)
+    subs = initialize_subs()
+    first_pair = True
+    number_of_non_proper_pairs = 0
+
+    for hit in bam:
+        if hit.is_unmapped:
+            pass
+        elif not hit.qual and not debug:
+            logger.warning("Cannot rescale base PHRED scores for read '%s'; no scores assigned." % hit.qname)
+        elif hit.is_paired : 
+            if first_pair and not debug:
+                # assuming the ends are non-overlapping 
+                logger.warning("Warning! Assuming the pairs are non-overlapping, facing inwards and correctly paired.")
+                first_pair=False
+            #5p --------------> 3p
+            #3p <-------------- 5p
+            # pair 1 (inwards)
+            #5p ----> 
+            #             <---- 5p
+            #     A         B
+            # pair 2 (outwards), this happens if the reference is RC this is not supported 
+            #             ----> 3p
+            #3p <----         
+            #     A         B
+            # Correct outwards pairs from the 3p and inwards pairs with the 5p end
+            if ((not hit.is_reverse) and hit.mate_is_reverse and (hit.pnext>hit.pos) and hit.tid==hit.mrnm):
+                # the inwards case mate A
+                hit = rescale_qual_read(bam, hit, ref, corr_prob,subs,direction="forward",debug=debug)
+            elif (hit.is_reverse and (not hit.mate_is_reverse) and (hit.pnext<hit.pos) and hit.tid==hit.mrnm):
+                # the inwards case mate B
+                hit = rescale_qual_read(bam, hit, ref, corr_prob,subs,direction="forward",debug=debug)
+            else:
+                number_of_non_proper_pairs += 1
+                # cannot do much with conflicting pairing information
+        else:
+            hit = rescale_qual_read(bam, hit, ref, corr_prob,subs,debug=debug)
+
+        bam_out.write(hit)
+    if  number_of_non_proper_pairs!=0 and not debug:
+        logger.warning("Number of non-rescaled reads due to improper pairing:  %d" % number_of_non_proper_pairs)
+    if (subs["TC-before"] != subs["TC-after"] or subs["AG-before"] != subs["AG-after"]):
+        sys.exit("Qualities for T.C and A.G transitions should not change in the rescaling, please contact the authors.")
+    qual_summary_subs(subs)
+    bam.close()
+    bam_out.close()
+    if not options.quiet:
+        print_subs(subs)
+    if not debug:
+        logger.debug("Rescaling completed in %f seconds" % (time.time() - start_time,))
diff --git a/mapdamage/rescale_test.py b/mapdamage/rescale_test.py
new file mode 100644
index 0000000..df86575
--- /dev/null
+++ b/mapdamage/rescale_test.py
@@ -0,0 +1,57 @@
+import unittest
+import optparse
+import rescale
+import pysam
+import filecmp
+
+def mock_options(filename,rescale_out,folder):
+    """Make the options object with nice values for testing"""
+    return optparse.Values({
+        "filename":filename,
+        "rescale_out":rescale_out,
+        "verbose":True,
+        "folder":folder,
+        "quiet":True
+        })
+
+
+class testPairedFile(unittest.TestCase):
+    """Tests if rescaling of a paired end file"""
+    def test_paired_end_file(self):
+        """Test, paired end SAM file"""
+        # here is the example
+        # >ref
+        # 5p CGA AAA CGA 3p
+        # 3p GCT TTT GCT 5p
+        # >r001/1
+        # CGA
+        # >r001/2
+        # GCA
+        # The sam file looks like this
+        #@HD VN:1.5 SO:coordinate
+        #@SQ SN:ref LN:9
+        #r001 163 ref 1 30 3M = 7 9 CGA III MR:f:0 #the normal ones
+        #r001 83 ref 7 30 3M = 1 -9 TCG III MR:f:0
+        #r002 163 ref 1 30 3M = 7 9 TGA III MR:f:0.9 #With one dam subs
+        #r002 83 ref 7 30 3M = 1 -9 CAA III MR:f:0.009
+        #r003 83 ref 1 30 3M = 7 9 TGA III #The reverse complement, should not rescale (thus no flags)
+        #r003 163 ref 7 30 3M = 1 -9 CAA III
+        # 
+        #hand calculated the correct rescaled sam file in pe_rescaled_correct.sam
+        options = mock_options("rescale_test/pe_test/pe.sam","rescale_test/pe_test/pe_rescaled.sam","rescale_test/pe_test/pe_output/")
+        ref = pysam.Fastafile("rescale_test/pe_test/ref.fa")
+        rescale.rescale_qual(ref,options,debug=True)
+        self.assertTrue(filecmp.cmp("rescale_test/pe_test/pe_rescaled.sam","rescale_test/pe_test/pe_rescaled_correct.sam"))
+
+class testCases(unittest.TestCase):
+    """Various cases that failed"""
+    def test_longalignshortread(self):
+        """Check if fails on an aligment longer than the read"""
+        prefix="rescale_test/long_align/"
+        options = mock_options(prefix+"pe.sam",prefix+"pe_out.sam",prefix+"pe_output/")
+        ref = pysam.Fastafile("rescale_test/pe_test/ref.fa")
+        rescale.rescale_qual(ref,options,debug=True)
+        #Should run without an error
+
+if  __name__=='__main__':
+    unittest.main()
diff --git a/mapdamage/rescale_test/long_align/pe.sam b/mapdamage/rescale_test/long_align/pe.sam
new file mode 100644
index 0000000..92c9427
--- /dev/null
+++ b/mapdamage/rescale_test/long_align/pe.sam
@@ -0,0 +1,3 @@
+ at HD	VN:1.5	SO:coordinate
+ at SQ	SN:ref	LN:9
+r001	163	ref	1	30	3M2D	=	7	9	CGA	III
diff --git a/mapdamage/rescale_test/long_align/pe_output/Stats_out_MCMC_correct_prob.csv b/mapdamage/rescale_test/long_align/pe_output/Stats_out_MCMC_correct_prob.csv
new file mode 100644
index 0000000..e295f35
--- /dev/null
+++ b/mapdamage/rescale_test/long_align/pe_output/Stats_out_MCMC_correct_prob.csv
@@ -0,0 +1,5 @@
+"","Position","C.T","G.A"
+"1",1,0.9,0
+"2",2,0.009,0
+"3",-2,0,0.09
+"4",-1,0,0.9
diff --git a/mapdamage/rescale_test/long_align/ref.fa b/mapdamage/rescale_test/long_align/ref.fa
new file mode 100644
index 0000000..29e24ff
--- /dev/null
+++ b/mapdamage/rescale_test/long_align/ref.fa
@@ -0,0 +1,2 @@
+>ref
+CGAAAACGA
diff --git a/mapdamage/rescale_test/pe_test/pe.sam b/mapdamage/rescale_test/pe_test/pe.sam
new file mode 100644
index 0000000..5910991
--- /dev/null
+++ b/mapdamage/rescale_test/pe_test/pe.sam
@@ -0,0 +1,8 @@
+ at HD	VN:1.5	SO:coordinate
+ at SQ	SN:ref	LN:9
+r001	163	ref	1	30	3M	=	7	9	CGA	III
+r001	83	ref	7	30	3M	=	1	-9	CGA	III
+r002	163	ref	1	30	3M	=	7	9	TGA	III
+r002	83	ref	7	30	3M	=	1	-9	CAA	III
+r003	83	ref	1	30	3M	=	7	9	TGA	III
+r003	163	ref	7	30	3M	=	1	-9	CAA	III
diff --git a/mapdamage/rescale_test/pe_test/pe_output/Stats_out_MCMC_correct_prob.csv b/mapdamage/rescale_test/pe_test/pe_output/Stats_out_MCMC_correct_prob.csv
new file mode 100644
index 0000000..e295f35
--- /dev/null
+++ b/mapdamage/rescale_test/pe_test/pe_output/Stats_out_MCMC_correct_prob.csv
@@ -0,0 +1,5 @@
+"","Position","C.T","G.A"
+"1",1,0.9,0
+"2",2,0.009,0
+"3",-2,0,0.09
+"4",-1,0,0.9
diff --git a/mapdamage/rescale_test/pe_test/pe_rescaled_correct.sam b/mapdamage/rescale_test/pe_test/pe_rescaled_correct.sam
new file mode 100644
index 0000000..9b45850
--- /dev/null
+++ b/mapdamage/rescale_test/pe_test/pe_rescaled_correct.sam
@@ -0,0 +1,8 @@
+ at HD	VN:1.5	SO:coordinate
+ at SQ	SN:ref	LN:9
+r001	163	ref	1	30	3M	=	7	9	CGA	III	MR:f:0
+r001	83	ref	7	30	3M	=	1	-9	CGA	III	MR:f:0
+r002	163	ref	1	30	3M	=	7	9	TGA	!II	MR:f:0.9
+r002	83	ref	7	30	3M	=	1	-9	CAA	I5I	MR:f:0.009
+r003	83	ref	1	30	3M	=	7	9	TGA	III
+r003	163	ref	7	30	3M	=	1	-9	CAA	III
diff --git a/mapdamage/rescale_test/pe_test/ref.fa b/mapdamage/rescale_test/pe_test/ref.fa
new file mode 100644
index 0000000..29e24ff
--- /dev/null
+++ b/mapdamage/rescale_test/pe_test/ref.fa
@@ -0,0 +1,2 @@
+>ref
+CGAAAACGA
diff --git a/mapdamage/rscript.py b/mapdamage/rscript.py
new file mode 100644
index 0000000..9019aeb
--- /dev/null
+++ b/mapdamage/rscript.py
@@ -0,0 +1,153 @@
+#!/usr/bin/env python
+
+import os
+import subprocess
+from subprocess import CalledProcessError, check_call
+from mapdamage.version import __version__
+import mapdamage
+import logging
+import time
+
+
+def construct_path(name,folder="Rscripts"):
+    """Construct a path to the mapdamage package data given the name"""
+    return(os.path.join(mapdamage.__path__[0], folder, name))
+
+def plot(opt):
+  """
+  Calls the R script to make the plots, takes in the arguments 
+  from parameter processing 
+  """
+  # comp<-args[1]
+  # pdfout<-args[2]
+  # around<-as.numeric(args[3])
+  # misincorp<-args[4]
+  # lg<-as.numeric(args[5])
+  # ymax<-as.numeric(args[6])
+  # folder<-args[7]
+  # title<-args[8]
+  # version<-args[9]
+
+  fmut = opt.folder+"/"+"misincorporation.txt"
+  fcomp = opt.folder+"/"+"dnacomp.txt"
+  title = opt.folder+"/"+"Fragmisincorporation_plot.pdf"
+
+  script = construct_path("mapDamage.R") 
+  call = ["Rscript", script, fcomp, title, opt.refplot, fmut, opt.readplot, \
+      opt.ymax, opt.folder, opt.title, __version__]
+  code = subprocess.call(map(str, call))
+
+  if code == 0:
+    if not opt.quiet:
+      print("pdf %s generated" % title)
+    return 0
+  else:
+    print("Error: plotting with R failed")
+    return 1
+
+
+        
+def opt_plots(opt):       
+  """ optional plots for length distribution
+  and cumulative C>T mutations, per strand """
+  
+  fmut = opt.folder+"/"+"misincorporation.txt"
+  flength = opt.folder+"/"+"lgdistribution.txt"
+  output = opt.folder+"/"+"Length_plot.pdf"
+
+  script = construct_path("lengths.R") 
+  call = ["Rscript", script, flength, output, fmut, opt.length, \
+      opt.title, __version__, bool_to_int(opt.quiet)]
+  code = subprocess.call(map(str, call))
+  if code == 0:
+    return 0
+  else:
+    print("Error: plotting with R failed")
+    return 1
+
+
+def check_R_one_lib(name):
+    """Checks if a necessary R library is here."""
+    with open(os.devnull, "w") as devnull:
+        rpa = construct_path("stats/checkLibraries.R")
+        return subprocess.call(["Rscript", rpa, "--args", name],
+                               stdout = devnull,
+                               stderr = devnull)
+
+
+def check_R_lib():
+    """
+    Checks if the necessary R libraries are here, signal 
+    otherwise
+    """
+    libs = ["inline", "ggplot2", "gam", "Rcpp", "RcppGSL"]
+    missing_lib = []
+    for lib in libs:
+        # Check the libraries
+        if check_R_one_lib(lib):
+            #found a missing library
+            missing_lib.append(lib)
+    if len(missing_lib) > 1:
+        # Grammar Nazi has arrived
+        last_ele = missing_lib.pop()
+        print ("Missing the following R libraries '" + "', '".join(missing_lib) \
+            + "' and '" + last_ele + "'")
+        return 1
+    elif len(missing_lib) == 1:
+        print ("Missing the following R library "+missing_lib[0])
+        return 1
+    else :
+        # No missing libraries
+        return 0
+
+
+def run_stats(opt):
+    """
+    Runs the Bayesian estimation program, using the options opt
+    """
+    arg = ["Rscript",                            \
+         construct_path("stats/runGeneral.R"), \
+         "--args",                               \
+         opt.rand,                               \
+         opt.burn,                               \
+         opt.adjust,                             \
+         opt.iter,                               \
+         bool_to_int(opt.forward),               \
+         bool_to_int(opt.reverse),               \
+         bool_to_int(not opt.var_disp),              \
+         bool_to_int(not opt.diff_hangs),            \
+         0,                                      \
+         bool_to_int(opt.fix_nicks),             \
+         bool_to_int(not opt.single_stranded),       \
+         opt.seq_length,                         \
+         opt.folder+"/",                         \
+         construct_path("stats/"),             \
+         opt.folder+"/Stats_out",                \
+         int(opt.verbose),                       \
+         int(opt.quiet),                         \
+         int(opt.jukes_cantor),                       \
+         opt.folder+"/acgt_ratio.csv",            \
+         int(opt.use_raw_nick_freq),                   \
+         int(opt.theme_bw)                   \
+         ]
+    arg = [str(i) for i in arg]
+
+    logger = logging.getLogger(__name__)
+    logger.info("Performing Bayesian estimates")
+    logger.debug("Call: %s" % (" ".join(arg),))
+    start_time = time.time()
+
+    try:
+        check_call(arg)
+    except CalledProcessError as e:
+        logger.error("The Bayesian statistics program failed to finish")
+        raise e
+
+    logger.debug("Bayesian estimates completed in %f seconds" % (time.time() - start_time,))
+    return 0
+
+
+def bool_to_int(boolean):
+    """ return 0 for False and 1 for True """
+    result = 1 if boolean else 0
+    return result
diff --git a/mapdamage/seq.py b/mapdamage/seq.py
new file mode 100644
index 0000000..a3d2d58
--- /dev/null
+++ b/mapdamage/seq.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python
+import sys
+import string
+
+# from Martin Kircher, to complement DNA
+TABLE = string.maketrans('TGCAMRWSYKVHDBtgcamrwsykvhdb', \
+    'ACGTKYWSRMBDHVacgtkywsrmbdhv')
+
+LETTERS = ("A", "C", "G", "T")
+MUTATIONS = ('G>A', 'C>T', 'A>G', 'T>C', 'A>C', 'A>T', 'C>G', 'C>A', 'T>G',
+             'T>A', 'G>C', 'G>T', 'A>-', 'T>-', 'C>-', 'G>-', '->A', '->T', 
+             '->C', '->G', 'S')
+HEADER = LETTERS + ("Total", ) + MUTATIONS
+
+
+def write_fasta(read, ref, seq, refseq, start, end, before, after, fout):
+  std = '-' if read.is_reverse else '+'
+
+  # output coordinate in 1-based offset for 5'
+  if len(before) > 0:
+    fout.write(">%s\n%s\n" % (ref, before))
+  fout.write(">%s:%d-%d\n%s\n>%s_%s\n%s\n" % (ref, start+1, end, refseq, read.qname, std, seq))
+  if len(after) > 0:
+    fout.write(">%s\n%s\n" % (ref, after))
+
+
+def revcomp(seq):
+  """ return reverse complemented string """
+  return seq.translate(TABLE)[::-1]
+
+
+def record_lg(read, coordinate, tab):
+  """ record global length distribution
+  don't record paired reads as they are normally not used for aDNA """
+  std = '-' if read.is_reverse else '+'
+  
+  length = (max(coordinate) - min(coordinate))
+  if not read.is_paired:
+    tab[std][length] = tab[std][length] + 1
+    
+  return tab
+
+
+def read_fasta_index(filename):
+  """ from a fasta index file, fai, return dictionary of references:lengths """
+  def print_err(msg, filename, line):
+    sys.stderr.write("Error: %s\n" % msg)
+    sys.stderr.write("       Filename: %s\n" % filename)
+    sys.stderr.write("       Line:     %s\n" % repr(line))
+  
+  fai = {}
+  with open(filename, 'r') as handle:
+    for line in handle:
+      ref = line.split("\t")
+      if len(ref) != 5:
+        print_err("Line in fasta index contains wrong number of fields, found %i, expected 5:" \
+            % len(ref), filename, line)
+        return None
+
+      try:
+        fai[ref[0]] = int(ref[1])
+      except ValueError:
+        print_err("Column 2 in FASTA index did not contain a number, found '%s':" % ref[1], filename, line)
+        return None
+
+  return fai
+
+
+def compare_sequence_dicts(fasta_dict, bam_dict):
+  """Compares a FASTA and BAM sequence dictionary, and prints any differences.
+  Returns true if all required sequences are found, false otherwise."""
+  if fasta_dict == bam_dict:
+    return True
+
+  sys.stderr.write("Sequence dictionaries in FASTA/BAM files differ:\n")
+  common = set(fasta_dict) & set(bam_dict)
+  if not common:
+    sys.stderr.write("FATAL ERROR: No sequences in common!\n")
+    return False
+
+  # Check that the lengths of required sequences match (fatal error)
+  different = []
+  for key in sorted(common):
+    if fasta_dict[key] != bam_dict[key]:
+      different.append((key, fasta_dict[key], bam_dict[key]))
+
+  if different:
+    sys.stderr.write("FATAL ERROR: Length of required sequences differ:\n")
+    for values in different:
+      sys.stderr.write("    - %s: %i bp vs %i bp\n" % values)
+
+  # Check for sequences only found in the BAM file (fatal errors)
+  bam_only = set(bam_dict) - common
+  if bam_only:
+    sys.stderr.write("FATAL ERROR: Sequences missing from FASTA dictionary:\n")
+    for key in bam_only:
+      sys.stderr.write("    - %s = %i bp\n" % (key, bam_dict[key]))
+
+  # Check for sequences only found in the BAM file (fatal errors)
+  fasta_only = set(fasta_dict) - common
+  if fasta_only:
+    sys.stderr.write("WARNING: FASTA file contains extra sequences:\n")
+    for key in fasta_only:
+      sys.stderr.write("    - %s = %i bp\n" % (key, fasta_dict[key]))
+
+  sys.stderr.write("\n")
+
+  return not (different or bam_only)
diff --git a/mapdamage/tables.py b/mapdamage/tables.py
new file mode 100644
index 0000000..04fdeef
--- /dev/null
+++ b/mapdamage/tables.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python
+
+import os
+import collections
+
+import mapdamage
+from mapdamage.version import __version__
+
+
+def initialize_mut(ref, length):  
+  tab = {}
+  for contig in ref:
+    tab_contig = tab[contig] = {}
+    for end in ('5p','3p'):
+      tab_end = tab_contig[end] = {}
+      for std in ('+','-'):
+        tab_std = tab_end[std] = {}
+        for mut in mapdamage.seq.HEADER:
+          tab_std[mut] = dict.fromkeys(xrange(length), 0)
+  
+  return tab
+
+
+def print_mut(mut, opt, out):
+  _print_freq_table(mut, mapdamage.seq.HEADER, opt, out, offset = 1)
+
+
+def initialize_comp(ref, around, length):
+  keys = {"3p" : range(-length, 0) + range(1, around + 1),
+          "5p" : range(-around, 0) + range(1, length + 1)}
+
+  tab = {}
+  for contig in ref:
+    tab_contig = tab[contig] = {}
+    for end in ('5p','3p'):
+      tab_end = tab_contig[end] = {}
+      for std in ('+','-'):
+        tab_std = tab_end[std] = {}
+        for letters in mapdamage.seq.LETTERS:
+          tab_std[letters] = dict.fromkeys(keys[end], 0)
+
+  return tab
+
+
+def print_comp(comp, opt, out):
+  columns = mapdamage.seq.LETTERS + ("Total",)
+  _print_freq_table(comp, columns, opt, out)
+
+
+def initialize_lg():
+  tab = {}
+  for std in ('+','-'):
+    tab[std] = collections.defaultdict(int)
+
+  return tab 
+
+
+def print_lg(tab, opt, out):
+  out.write("# table produced by mapDamage version %s\n" % __version__)
+  out.write("# using mapped file %s and %s as reference file\n" % (opt.filename, opt.ref))
+  if opt.minqual != 0:
+    out.write("# Quality filtering of bases with a Phred score < %d\n" % opt.minqual)
+  out.write("# Std: strand of reads\n")
+  out.write("Std\tLength\tOccurences \n")
+  for std in tab:
+    for i in tab[std]:
+      out.write("%s\t%d\t%d\n" % (std, i, tab[std][i]))
+
+
+def check_table_and_warn_if_dmg_freq_is_low(folder):
+  """ Returns true if the damage frequencies are too low to allow
+  Bayesian estimation of DNA damages, i.e < 1% at first position.
+  """
+  total = 0.0
+  for filename in ("5pCtoT_freq.txt", "3pGtoA_freq.txt"):
+    if not os.path.exists(folder+"/"+filename):
+      print("Error: Required table has not been created ('%s'), bayesian computation cannot be performed" \
+            % filename)
+      return True
+
+    with open(os.path.join(folder, filename)) as handle:
+      for line in handle:
+        freq = line.strip().split('\t')
+        if freq[0] == "1":
+          total += float(freq[1])
+          break
+      else:
+        print("Error: Could not find pos = 1 in table '%s', bayesian computation cannot be performed" \
+              % filename)
+        return True
+
+  if total < 0.01:
+    print("Warning: DNA damage levels are too low, the Bayesian computation should not be performed (%f < 0.01)\n" % total)
+
+  return False
+
+
+def _print_freq_table(table, columns, opt, out, offset = 0):
+  out.write("# table produced by mapDamage version %s\n" % __version__)
+  out.write("# using mapped file %s and %s as reference file\n" % (opt.filename, opt.ref))
+  if opt.minqual != 0:
+    out.write("# Bases with a Phred score < %d were filtered out\n" % opt.minqual)
+  out.write("# Chr: reference from sam/bam header, End: from which termini of DNA sequences, Std: strand of reads\n")
+  out.write("Chr\tEnd\tStd\tPos\t%s\n" % ("\t".join(columns)))
+
+  for (reference, ends) in sorted(table.iteritems()):
+    for (end, strands) in sorted(ends.iteritems()):
+      for (strand, subtable) in sorted(strands.iteritems()):
+        subtable["Total"] = {}
+        for index in sorted(subtable[columns[0]]):
+          subtable["Total"][index] = sum(subtable[letter][index] for letter in mapdamage.seq.LETTERS)
+
+          out.write("%s\t%s\t%s\t%d" % (reference, end, strand, index + offset))
+          for base in columns:
+            out.write("\t%d" % subtable[base][index])
+          out.write("\n")
diff --git a/mapdamage/version.py b/mapdamage/version.py
new file mode 100644
index 0000000..4e95d9f
--- /dev/null
+++ b/mapdamage/version.py
@@ -0,0 +1,6 @@
+#!/usr/bin/env python
+try:
+    from mapdamage._version import __version__
+except ImportError:
+    __version__ = "2.0.6"
+
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..b166396
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,68 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from distutils.core import setup
+from distutils.command.install import install as DistutilsInstall
+import os
+import subprocess
+
+def compile_seqtk():
+    """Compiling the seqtk toolkit"""
+    old_wd = os.getcwd()
+    new_wd = os.path.join(old_wd,"mapdamage","seqtk")
+    os.chdir(new_wd)
+    if not os.path.isfile("seqtk.c"):
+        raise SystemExit("Cannot find seqtk.c")
+    if (os.path.isfile("seqtk")):
+        os.system("rm seqtk")
+    xs = os.system("make -f Makefile")
+    os.chdir(old_wd)
+    if (xs != 0):
+        raise SystemExit("Cannot compile seqtk")
+
+
+def setup_version():
+    if not os.path.exists(".git"):
+        # Release version, no .git folder
+        return
+
+    try:
+        version = subprocess.check_output(("git", "describe", "--always", "--tags", "--dirty"))
+        with open(os.path.join("mapdamage", "_version.py"), "w") as handle:
+            handle.write("#!/usr/bin/env python\n")
+            handle.write("__version__ = %r\n" % (version.strip(),))
+    except (subprocess.CalledProcessError, OSError), error:
+        raise SystemExit("Could not determine mapDamage version: %s" % (error,))
+
+
+class compileInstall(DistutilsInstall):
+    # extension of the class to account for an extra compiling step
+    def run(self):
+        self.record=""
+        setup_version()
+        compile_seqtk()
+        DistutilsInstall.run(self)
+        # fixing the permission problem of seqtk
+        files = self.get_outputs()
+        for fi in files:
+            if fi.endswith("seqtk/seqtk"):
+                os.chmod(fi,0755)
+
+
+
+
+
+setup(
+    cmdclass={'install': compileInstall},
+    name='mapdamage',
+    version='2.0',
+    author='Aurélien Ginolhac, Mikkel Schubert, Hákon Jónsson',
+    author_email='MSchubert at snm.ku.dk, jonsson.hakon at gmail.com',
+    packages=['mapdamage'],
+    package_data={'mapdamage': ['Rscripts/*.R','Rscripts/stats/*.R','seqtk/seqtk']},
+    scripts=['bin/mapDamage'],
+    url='https://github.com/ginolhac/mapDamage',
+    license='LICENSE.txt',
+    description='mapDamage tracks and quantify DNA damage pattern among ancient DNA sequencing reads generated by Next-Generation Sequencing platforms',
+    long_description=open('README.md').read()
+)

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/mapdamage.git