[med-svn] [mapdamage] 01/02: Imported Upstream version 2.0.6+dfsg
Andreas Tille
tille at debian.org
Fri Jul 29 12:46:36 UTC 2016
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository mapdamage.
commit cc6aebcb3f9f3c220b52fbd93dd18af62f7948ed
Author: Andreas Tille <tille at debian.org>
Date: Fri Jul 29 14:45:20 2016 +0200
Imported Upstream version 2.0.6+dfsg
---
LICENSE.txt | 18 +
MANIFEST | 13 +
MANIFEST.in | 2 +
README.md | 32 +
bin/mapDamage | 382 ++++++++++++
mapdamage/Rscripts/lengths.R | 109 ++++
mapdamage/Rscripts/mapDamage.R | 180 ++++++
mapdamage/Rscripts/stats/checkLibraries.R | 18 +
mapdamage/Rscripts/stats/data.R | 55 ++
mapdamage/Rscripts/stats/function.R | 694 +++++++++++++++++++++
mapdamage/Rscripts/stats/main.R | 283 +++++++++
mapdamage/Rscripts/stats/postConditonal.R | 185 ++++++
mapdamage/Rscripts/stats/priorPropose.R | 125 ++++
mapdamage/Rscripts/stats/runGeneral.R | 95 +++
mapdamage/Rscripts/stats/start.R | 107 ++++
mapdamage/__init__.py | 10 +
mapdamage/align.py | 134 ++++
mapdamage/composition.py | 84 +++
mapdamage/parseoptions.py | 264 ++++++++
mapdamage/rescale.py | 352 +++++++++++
mapdamage/rescale_test.py | 57 ++
mapdamage/rescale_test/long_align/pe.sam | 3 +
.../pe_output/Stats_out_MCMC_correct_prob.csv | 5 +
mapdamage/rescale_test/long_align/ref.fa | 2 +
mapdamage/rescale_test/pe_test/pe.sam | 8 +
.../pe_output/Stats_out_MCMC_correct_prob.csv | 5 +
.../rescale_test/pe_test/pe_rescaled_correct.sam | 8 +
mapdamage/rescale_test/pe_test/ref.fa | 2 +
mapdamage/rscript.py | 153 +++++
mapdamage/seq.py | 108 ++++
mapdamage/tables.py | 116 ++++
mapdamage/version.py | 6 +
setup.py | 68 ++
33 files changed, 3683 insertions(+)
diff --git a/LICENSE.txt b/LICENSE.txt
new file mode 100644
index 0000000..ed6195c
--- /dev/null
+++ b/LICENSE.txt
@@ -0,0 +1,18 @@
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
diff --git a/MANIFEST b/MANIFEST
new file mode 100644
index 0000000..e217680
--- /dev/null
+++ b/MANIFEST
@@ -0,0 +1,13 @@
+# file GENERATED by distutils, do NOT edit
+README.txt
+setup.py
+bin/mapDamage
+mapdamage/__init__.py
+mapdamage/align.py
+mapdamage/composition.py
+mapdamage/parseoptions.py
+mapdamage/rscript.py
+mapdamage/seq.py
+mapdamage/tables.py
+mapdamage/version.py
+mapdamage/Rscripts/mapDamage.R
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..3381982
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,2 @@
+include mapdamage/Rscripts/mapDamage.R
+include mapdamage/Rscripts/stats/*R
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..f908dba
--- /dev/null
+++ b/README.md
@@ -0,0 +1,32 @@
+### Important
+Users with versions dating prior to June 12 2013 please update. A nasty bug that caused the statistical part of mapDamage to use half of the data for estimation of the damage parameters, sorry for the inconvenience.
+
+### Introduction
+Complete documentation, instructions, examples, screenshots and FAQ are available at [this address](http://ginolhac.github.io/mapDamage/).
+
+[mapDamage2](http://geogenetics.ku.dk/publications/mapdamage2.0/) is a computational framework written in Python and R, which tracks and quantifies DNA damage patterns
+among ancient DNA sequencing reads generated by Next-Generation Sequencing platforms.
+
+Mapdamage is developed at the [Centre for GeoGenetics](http://geogenetics.ku.dk/) by the [Orlando Group ](http://geogenetics.ku.dk/research/research_groups/palaeomix_group/).
+
+
+
+
+### Citation
+If you use this program, please cite the following publication:
+Jónsson H, Ginolhac A, Schubert M, Johnson P, Orlando L.
+[mapDamage2.0: fast approximate Bayesian estimates of ancient DNA damage parameters.
+_Bioinformatics_ 23rd April 2013. doi: 10.1093/bioinformatics/btt193](http://bioinformatics.oxfordjournals.org/content/early/2013/05/17/bioinformatics.btt193)
+
+
+The original mapDamage1 is described in the following article:
+Ginolhac A, Rasmussen M, Gilbert MT, Willerslev E, Orlando L.
+[mapDamage: testing for damage patterns in ancient DNA sequences. _Bioinformatics_ 2011 **27**(15):2153-5
+http://bioinformatics.oxfordjournals.org/content/27/15/2153](http://bioinformatics.oxfordjournals.org/content/27/15/2153)
+
+
+
+### Contact
+Please report bugs and suggest possible improvements to Aurélien Ginolhac, Mikkel Schubert or Hákon Jónsson by email:
+aginolhac at snm.ku.dk, MSchubert at snm.ku.dk or jonsson.hakon at gmail.com.
+
diff --git a/bin/mapDamage b/bin/mapDamage
new file mode 100755
index 0000000..d13ee33
--- /dev/null
+++ b/bin/mapDamage
@@ -0,0 +1,382 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+from __future__ import print_function
+
+import logging
+import random
+import time
+import sys
+import os
+
+""" Copyright (c) 2012 Aurélien Ginolhac, Mikkel Schubert, Hákon Jónsson
+and Ludovic Orlando
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, sublicense,
+and/or sell copies of the Software, and to permit persons to whom
+the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
+OR OTHER DEALINGS IN THE SOFTWARE.
+
+plot and quantify damage patterns from a SAM/BAM file
+
+:Authors: Aurélien Ginolhac, Mikkel Schubert, Hákon Jónsson, Ludovic Orlando
+:Contact: aginolhac at snm.ku.dk, MSchubert at snm.ku.dk, jonsson.hakon at gmail.com
+:Date: November 2012
+:Type: tool
+:Input: SAM/BAM
+:Output: tabulated tables, pdf
+"""
+
+# check if pysam if available
+MODULE = "pysam"
+URL = "http://code.google.com/p/pysam/"
+try:
+ __import__(MODULE)
+except ImportError, e:
+ sys.stderr.write("Error: Could not import required module '%s':\n\t- %s\n" % (MODULE, e))
+ sys.stderr.write(" If module is not installed, please download from '%s'.\n" % URL)
+ sys.stderr.write(" A local install may be performed using the following command:\n")
+ sys.stderr.write(" $ python setup.py install --user\n\n")
+ sys.exit(1)
+
+import pysam
+
+
+_BAM_UNMAPPED = 0x4
+_BAM_SECONDARY = 0x100
+_BAM_FAILED_QC = 0x200
+_BAM_PCR_DUPE = 0x400
+_BAM_CHIMERIC = 0x800
+
+def _filter_reads(bamfile):
+ filtered_flags = _BAM_UNMAPPED | \
+ _BAM_SECONDARY | \
+ _BAM_FAILED_QC | \
+ _BAM_PCR_DUPE | \
+ _BAM_CHIMERIC
+
+ for read in bamfile:
+ if not (read.flag & filtered_flags):
+ yield read
+
+
+def _downsample_to_fraction(bamfile, options):
+ if options.verbose:
+ print("\tDownsampling %.2f fraction of the original file" % options.downsample)
+ assert options.downsample > 0, "Downsample fraction must be positive, not %s" % options.downsample
+
+ rand = random.Random(options.downsample_seed)
+ for read in _filter_reads(bamfile):
+ if rand.random() < options.downsample:
+ yield read
+
+
+def _downsample_to_fixed_number(bamfile, options):
+ if options.verbose:
+ print("\tDownsampling %d random reads from the original file" % options.downsample)
+
+ # use reservoir sampling
+ downsample_to = int(options.downsample)
+ rand = random.Random(options.downsample_seed)
+ sample = [None] * downsample_to
+ for (index, record) in enumerate(_filter_reads(bamfile)):
+ if index >= downsample_to:
+ index = rand.randint(0, index)
+ if index >= downsample_to:
+ continue
+ sample[index] = record
+ return filter(None, sample)
+
+
+def _read_bamfile(bamfile, options):
+ """
+ Takes a subset of the bamfile. Can use a approximate fraction of the
+ hits or specific number of reads using reservoir sampling. Returns
+ a list in the last case otherwise a iterator.
+ """
+ if options.downsample is None:
+ return _filter_reads(bamfile)
+ elif options.downsample < 1:
+ return _downsample_to_fraction(bamfile, options)
+ else:
+ return _downsample_to_fixed_number(bamfile, options)
+
+def _check_mm_option():
+ """
+ As the user can override the system wide mapdamage modules with
+ the --mapdamage-modules, it has to happen before option parsing
+ in mapdamage.parseoptions
+ """
+ path_to_mm = None
+ for nr,arg in zip(xrange(len(sys.argv)),sys.argv):
+ if arg.startswith("--mapdamage-modules"):
+ try:
+ if "=" in arg:
+ # the option is of the format --mapdamage-modules=AAAA
+ arg_p = arg.split("=")
+ path_to_mm = arg_p[1]
+ else:
+ # the option is of the format --mapdamage-modules AAAA
+ path_to_mm = sys.argv[nr+1]
+ break
+ except IndexError as e:
+ raise SystemExit("Must specify a path to --mapdamage-modules")
+ if path_to_mm != None:
+ if not os.path.isdir(path_to_mm):
+ raise SystemExit("The --mapdamage-modules option must be a valid path (path=%s)" % path_to_mm)
+ if not os.path.isdir(os.path.join(path_to_mm,"mapdamage")):
+ raise SystemExit("The --mapdamage-modules path (path=%s) must contain the mapdamage module" % path_to_mm)
+ return path_to_mm
+
+
+def main():
+ start_time = time.time()
+
+ # the user can override the system wide mapdamage modules
+ path_to_mm = _check_mm_option()
+ if path_to_mm != None:
+ sys.path.insert(0,path_to_mm)
+ import mapdamage
+
+ fpath_seqtk = mapdamage.rscript.construct_path("seqtk", folder="seqtk")
+ if not (os.path.isfile(fpath_seqtk) and os.access(fpath_seqtk, os.X_OK)):
+ sys.stderr.write("Seqtk executable not accessible; mapDamage has not\n"
+ "been intalled properly or current user does not\n"
+ "sufficient permissions. Expected executable at\n "
+ "'%s'\n" % (fpath_seqtk,))
+ return 1
+
+ options = mapdamage.parseoptions.options()
+ if options == None:
+ sys.stderr.write("Option parsing failed, terminating the program\n")
+ return 1
+
+ logging.basicConfig(filename = os.path.join(options.folder, "Runtime_log.txt"),
+ format = '%(asctime)s\t%(levelname)s\t%(name)s: %(message)s',
+ level = logging.DEBUG)
+ handler = logging.StreamHandler()
+ handler.setLevel(logging.INFO)
+ logging.getLogger().addHandler(handler)
+
+ logger = logging.getLogger("main")
+ logger.info("Started with the command: " + " ".join(sys.argv))
+
+ # plot using R if results folder already done
+ if options.plot_only:
+ if options.no_r:
+ logger.error("Cannot use plot damage patterns if R is missing, terminating the program")
+ return 1
+ else:
+ mapdamage.rscript.plot(options)
+ mapdamage.rscript.opt_plots(options)
+ return 0
+
+ # run the Bayesian estimation if the matrix construction is done
+ if options.stats_only:
+ # does not work for very low damage levels
+ if mapdamage.tables.check_table_and_warn_if_dmg_freq_is_low(options.folder):
+ logger.error("Cannot use the Bayesian estimation, terminating the program")
+ return 1
+ else:
+ # before running the Bayesian estimation get the base composition
+ path_to_basecomp = os.path.join(options.folder,"dnacomp_genome.csv")
+ if os.path.isfile(path_to_basecomp):
+ #Try to read the base composition file
+ mapdamage.composition.read_base_comp(path_to_basecomp)
+ else:
+ #Construct the base composition file
+ mapdamage.composition.get_base_comp(options.ref,path_to_basecomp)
+ mapdamage.rscript.run_stats(options)
+ return 0
+
+ # fetch all references and associated lengths in nucleotides
+ try:
+ ref = pysam.Fastafile(options.ref)
+ except IOError as e:
+ # Couldn't open the reference file
+ logger.error("Couldn't open the reference file "+options.ref+\
+ " or there are permission problems writing the "+options.ref+".fai file")
+ raise e
+
+ # rescale the qualities
+ if options.rescale_only:
+ if not options.quiet:
+ print("Starting rescaling...")
+ mapdamage.rescale.rescale_qual(ref, options)
+ return 0
+
+ # open SAM/BAM file
+ if options.filename == "-":
+ in_bam = pysam.Samfile("-", 'rb')
+ # disabling rescaling if reading from standard input since we need
+ # to read it twice
+ if options.rescale:
+ print("Warning, reading from standard input, rescaling is disabled")
+ options.rescale = False
+ else:
+ in_bam = pysam.Samfile(options.filename)
+
+
+ reflengths = dict(zip(in_bam.references, in_bam.lengths))
+ # check if references in SAM/BAM are the same in the fasta reference file
+ fai_lengths = mapdamage.seq.read_fasta_index(options.ref + ".fai")
+ if not fai_lengths:
+ return 1
+ elif not mapdamage.seq.compare_sequence_dicts(fai_lengths, reflengths):
+ return 1
+ elif (len(reflengths) >= 1000) and not options.merge_reference_sequences:
+ print("WARNING: Alignment contains a large number of reference sequences (%i)!" % len(reflengths))
+ print(" This may lead to excessive memory/disk usage.")
+ print(" Consider using --merge-reference-sequences")
+ print()
+
+ refnames = in_bam.references
+ if options.merge_reference_sequences:
+ refnames = ['*']
+
+ # for misincorporation patterns, record mismatches
+ misincorp = mapdamage.tables.initialize_mut(refnames, options.length)
+ # for fragmentation patterns, record base compositions
+ dnacomp = mapdamage.tables.initialize_comp(refnames, options.around, options.length)
+ # for length distributions
+ lgdistrib = mapdamage.tables.initialize_lg()
+
+ if not options.quiet:
+ print("\tReading from '%s'" % options.filename)
+ if options.minqual != 0:
+ print("\tFiltering out bases with a Phred score < %d" % options.minqual)
+ if options.verbose:
+ print("\t%d references are assumed in SAM/BAM file, for a total of %d nucleotides" % (len(reflengths), sum(reflengths.values())))
+ print("\tWriting results to '%s/'" % options.folder)
+
+
+ # open file handler to write alignments in fasta format
+ if options.fasta:
+ # use name of the SAM/BAM filename without extension
+ ffasta = os.path.splitext(os.path.basename(options.filename))[0]+'.fasta'
+ if not options.quiet:
+ print("\tWriting alignments in '%s'" % ffasta)
+ fhfasta = open(options.folder+"/"+ffasta,"w")
+
+ counter = 0
+
+ # main loop
+ for read in _read_bamfile(in_bam, options):
+ counter += 1
+
+ # external coordinates 5' and 3' , 3' is 1-based offset
+ coordinate = mapdamage.align.get_coordinates(read)
+ # record aligned length for single-end reads
+ lgdistrib = mapdamage.seq.record_lg(read, coordinate, lgdistrib)
+ # fetch reference name, chromosome or contig names
+ chrom = in_bam.getrname(read.tid)
+
+ (before, after) = mapdamage.align.get_around(coordinate, chrom, reflengths, options.around, ref)
+ refseq = ref.fetch(chrom, min(coordinate), max(coordinate)).upper()
+ # read.query contains aligned sequences while read.seq is the read itself
+ seq = read.query
+
+ # add gaps according to the cigar string, do it for qualities if filtering options is on
+ if not (options.minqual and read.qual):
+ if options.minqual:
+ logger.warning("Cannot filter bases by PHRED scores for read '%s'; no scores available." % read.qname)
+
+ (seq, refseq) = mapdamage.align.align(read.cigar, seq, refseq)
+ else:
+ # add gaps to qualities and mask read and reference nucleotides if below desired threshold
+ (seq, _, refseq) = mapdamage.align.align_with_qual(read.cigar, seq, read.qqual, options.minqual, refseq)
+
+ # reverse complement read and reference when mapped reverse strand
+ if read.is_reverse:
+ refseq = mapdamage.seq.revcomp(refseq)
+ seq = mapdamage.seq.revcomp(seq)
+ beforerev = mapdamage.seq.revcomp(after)
+ after = mapdamage.seq.revcomp(before)
+ before = beforerev
+
+ if options.merge_reference_sequences:
+ chrom = '*'
+
+ # record soft clipping when present
+ mapdamage.align.record_soft_clipping(read, misincorp[chrom], options.length)
+
+ # count misincorparations by comparing read and reference base by base
+ mapdamage.align.get_mis(read, seq, refseq, chrom, options.length, misincorp, '5p')
+ # do the same with sequences align to 3'-ends
+ mapdamage.align.get_mis(read, seq[::-1], refseq[::-1], chrom, options.length, misincorp, '3p')
+ # compute base composition for reads
+ mapdamage.composition.count_read_comp(read, chrom, options.length, dnacomp)
+
+ # compute base composition for genomic regions
+ mapdamage.composition.count_ref_comp(read, chrom, before, after, dnacomp)
+
+ if options.fasta:
+ mapdamage.seq.write_fasta(read, chrom, seq, refseq, min(coordinate), max(coordinate), before, after, fhfasta)
+
+ if options.verbose:
+ if counter % 50000 == 0:
+ print("\t%10d filtered alignments processed" % (counter,))
+
+ if options.verbose:
+ print("\tDone. %d filtered alignments processed" % (counter,))
+ logger.debug("BAM read in %f seconds" % (time.time() - start_time,))
+
+ # close file handlers
+ in_bam.close()
+ if options.fasta:
+ fhfasta.close()
+
+ # output results, write summary tables to disk
+ with open(options.folder+"/"+"misincorporation.txt", 'w') as fmut:
+ mapdamage.tables.print_mut(misincorp, options, fmut)
+ with open(options.folder+"/"+"dnacomp.txt", 'w') as fcomp:
+ mapdamage.tables.print_comp(dnacomp, options, fcomp)
+ with open(options.folder+"/"+"lgdistribution.txt", 'w') as flg:
+ mapdamage.tables.print_lg(lgdistrib, options, flg)
+
+
+ # plot using R
+ if not options.no_r:
+ mapdamage.rscript.plot(options)
+ mapdamage.rscript.opt_plots(options)
+
+ # raises a warning for very low damage levels
+ if mapdamage.tables.check_table_and_warn_if_dmg_freq_is_low(options.folder):
+ options.no_stats = True
+
+
+ # run the Bayesian estimation
+ if not options.no_stats:
+ # before running the Bayesian estimation get the base composition
+ mapdamage.composition.get_base_comp(options.ref,os.path.join(options.folder,"dnacomp_genome.csv"))
+ mapdamage.rscript.run_stats(options)
+
+ # rescale the qualities
+ if options.rescale:
+ mapdamage.rescale.rescale_qual(ref, options)
+
+ # need the fasta reference still open for rescaling
+ ref.close()
+
+ # log the time it took
+ logger.info("Successful run")
+ logger.debug("Run completed in %f seconds" % (time.time() - start_time,))
+
+ return 0
+
+
+if __name__ == '__main__':
+ sys.exit(main())
diff --git a/mapdamage/Rscripts/lengths.R b/mapdamage/Rscripts/lengths.R
new file mode 100644
index 0000000..8955ace
--- /dev/null
+++ b/mapdamage/Rscripts/lengths.R
@@ -0,0 +1,109 @@
+# Enable full backtraces on errors
+on_error <- function(e)
+ {
+ traceback(2)
+ quit(status = 1)
+ }
+options(error = on_error)
+
+
+args <- commandArgs(trailingOnly = TRUE)
+OPT.LGDIST <- args[1]
+OPT.PDFOUT <- args[2]
+OPT.MISINCORP <- args[3]
+OPT.LENGTH <- args[4]
+OPT.TITLE <- args[5]
+OPT.VERSION <- args[6]
+OPT.QUIET <- args[7]
+
+MISMATCHES <- c("C>T", "G>A")
+
+calculate.mutation.table <- function(filename){
+
+ tbl <- read.table(file = filename, sep = "\t", header = TRUE, check.names = FALSE)
+ tbl <- aggregate(tbl[, MISMATCHES], tbl[, c("End", "Std", "Pos")], sum)
+ return(tbl)
+}
+
+
+plot.length <- function(tbl, title, color) {
+ table <- aggregate(tbl$Occurences, by=list(tbl$Length), FUN=sum)
+ names(table)<- c("Length", "Occurences")
+ plot(table$Length, table$Occurences, type="h",
+ col = color, main = title, cex.axis = 0.8, las = 2,
+ xlab = "", ylab = "", axes = FALSE)
+
+ mtext("Occurences", side = 2, line = 2.5, cex = 0.7)
+ mtext("Read length", side = 1, line = 2, cex = 0.7)
+ xcoord = seq(min(table$Length), max(table$Length), 10)
+ axis(side = 1, labels = xcoord, at = xcoord, las = 2, cex.axis = 0.6)
+ axis(side = 2, labels = TRUE, las = 2, cex.axis = 0.6)
+}
+
+plot.lengthStd <- function(tbl, title) {
+ subplus = subset(tbl, Std == "+")
+ subminus = subset(tbl, Std == "-")
+ plot(subplus$Length, subplus$Occurences, type="h", col=rgb(1,0,0,1/2), main = title, axes = FALSE)
+ lines(subminus$Length, subminus$Occurences, type="h", col=rgb(0,0,1,1/2))
+ mtext("Occurences", side = 4, line = 2.5, cex = 0.7)
+ mtext("Read length", side = 1, line = 2, cex = 0.7)
+ xcoord = seq(min(subplus$Length), max(subplus$Length), 10)
+ axis(side = 1, labels = xcoord, at = xcoord, las = 2, cex.axis = 0.6)
+ axis(side = 4, labels = TRUE, las = 2, cex.axis = 0.6)
+ legend('topright',c('+ strand','- strand'),
+ fill = rgb(1:0,0,0:1,0.4), bty = 'n',
+ border = NA)
+}
+
+plot.cumul.mutation <- function(tbl, end, mut, sid) {
+ subplus = subset(tbl, Std == "+" & End == end)
+ subminus = subset(tbl, Std == "-" & End == end)
+ plot(c(0, cumsum(subplus[, mut]/sum(subplus[, mut]))),
+ type = "l", col = rgb(1,0,0,1/2), lwd = 2, axes = FALSE)
+ lines(c(0, cumsum(subminus[, mut]/sum(subminus[, mut]))),
+ col = rgb(0,0,1,1/2), lwd = 2)
+ axis(side = 1, labels = TRUE, las = 2, cex.axis = 0.6)
+ axis(side = sid, labels = seq(0,1,0.1), at = seq(0,1,0.1), las = 2, cex.axis = 0.6)
+ mtext(mut, side = 3, line = 2, cex = 0.8)
+ mtext("Read position", side = 1, line = 1.8, cex = 0.7)
+ mtext("Cumulative frequencies", side = sid, line = 2.5, cex = 0.7)
+ legend('topleft',c('+ strand','- strand'),
+ fill = rgb(1:0,0,0:1,0.4), bty = 'n',
+ border = NA)
+}
+
+lg <- read.table(file = OPT.LGDIST, sep = "\t", header = TRUE, as.is = TRUE)
+if(nrow(lg) == 0){
+ write("No length distributions are available, plotting length distribution only works for single-end reads", stderr())
+}else{
+
+ pdf(file = OPT.PDFOUT, title = paste("mapDamage-", OPT.VERSION, sep=""))
+ par(oma = c(4,2,2,2), mar = c(1,2,1,2))
+ layout(matrix(c(1,1, # Title
+ 2,3, # lengths
+ 4,5), # Cumulative mutation
+ 3, 2, byrow = TRUE),
+ heights = c(3, 20, 20))
+
+ # Plot title
+ plot(0, type = "n", xaxt = "n", yaxt = "n", bty = "n", xlab = "", ylab = "")
+ mtext(OPT.TITLE, 3, cex = 1.3)
+
+ # Base compositions
+ plot.length(lg, "Single-end read length distribution", "black")
+ plot.lengthStd(lg, "Single-end read length per strand")
+
+ # Misincorporation patterns
+ mut <- calculate.mutation.table(OPT.MISINCORP)
+
+ par(mar = c(1, 2, 7, 1))
+ plot.cumul.mutation(mut, "5p", "C>T", 2)
+ par(mar = c(1, 1, 7, 2))
+ plot.cumul.mutation(mut, "3p", "G>A", 4)
+
+ # graphics.off() calls dev.off() for all devices but doesn't return anything (avoid null device message)
+ graphics.off()
+ if(OPT.QUIET == 0){
+ cat(paste("additional", OPT.PDFOUT, "generated\n"))
+ }
+}
diff --git a/mapdamage/Rscripts/mapDamage.R b/mapdamage/Rscripts/mapDamage.R
new file mode 100644
index 0000000..542c846
--- /dev/null
+++ b/mapdamage/Rscripts/mapDamage.R
@@ -0,0 +1,180 @@
+# R script mapdamage
+# Enable full backtraces on errors
+on_error <- function(e)
+ {
+ traceback(2)
+ quit(status = 1)
+ }
+options(error = on_error)
+
+
+NUCLEOTIDES <- c("A", "C", "G", "T", "Total")
+MISMATCHES <- c("A>C", "A>G", "A>T", "C>A", "C>G", "C>T", "G>A", "G>C", "G>T", "T>A", "T>C", "T>G")
+INSERTIONS <- c("->A", "->C", "->G", "->T")
+DELETIONS <- c("A>-", "C>-", "G>-", "T>-")
+CLIPPING <- c("S")
+EVERYTHING <- c(NUCLEOTIDES, MISMATCHES, INSERTIONS, DELETIONS, CLIPPING)
+
+args <- commandArgs(trailingOnly = TRUE)
+OPT.COMP <- args[1]
+OPT.PDFOUT <- args[2]
+OPT.AROUND <- as.numeric(args[3])
+OPT.MISINCORP <- args[4]
+OPT.LENGTH <- as.numeric(args[5])
+OPT.YMAX <- as.numeric(args[6])
+OPT.FOLDER <- args[7]
+OPT.TITLE <- args[8]
+OPT.VERSION <- args[9]
+
+
+draw.open.rect <- function(xleft, ybottom, xright, ytop, padding = 0) {
+ if (xleft < 0) {
+ xpoints <- c(xleft, xright + padding, xright + padding, xleft)
+ } else {
+ xpoints <- c(xright, xleft - padding, xleft - padding, xright)
+ }
+
+ lines(xpoints, c(ytop, ytop, ybottom, ybottom), col = "darkgrey")
+}
+
+
+plot.base.composition <- function(tbl, base, color, around, ylabels.at = c(), xlabels = FALSE) {
+ xcoords <- c(-around:-1, 1:around)
+
+ plot.axis <- function(yaxis.at) {
+ axis(side = yaxis.at, labels = (yaxis.at == ylabels.at), line = 0, las = 2, cex.axis = 0.8)
+ if ((yaxis.at == 2) && (yaxis.at == ylabels.at)) {
+ mtext("Frequency", side = 2, line = 2.5, cex = 0.6)
+ }
+
+ if (xlabels) {
+ axis(side = 1, labels = xcoords, at = xcoords, las = 2, cex.axis = 0.6)
+ } else {
+ axis(side = 1, labels = FALSE, at = xcoords)
+ }
+ }
+
+ plot.frequencies <- function(end) {
+ subtbl <- subset(tbl, End == end)
+ plot(subtbl$Pos, subtbl[, base] / subtbl$Total, pch = '.',
+ xlim = c(-around, around), ylim = c(0, 0.5),
+ col = color, main = base, cex.axis = 0.8, las = 2,
+ xlab = "", ylab = "", lab = c(2 * around, 6, 0.2),
+ axes = FALSE)
+
+ ycoords <- NULL
+ for (i in xcoords) {
+ ycoords <- append(ycoords, mean(subtbl[(subtbl$Pos==i), base]/subtbl$Total[(subtbl$Pos==i)],na.rm=T))
+ }
+ points(xcoords, ycoords, pch = 20, col = color, type = "b")
+ }
+
+ # 5p end
+ par(mar = c(1, 2, 1, 0.25))
+ plot.frequencies("5p")
+ plot.axis(2)
+ draw.open.rect(0, 0, around, 0.5)
+
+ # 3p end
+ par(mar = c(1, 0.25, 1, 2))
+ plot.frequencies("3p")
+ plot.axis(4)
+ draw.open.rect(-around, 0, 0, 0.5)
+
+ par(mar = c(1, 2, 1, 2))
+}
+
+
+calculate.mutation.table <- function(filename, length)
+ {
+ tbl <- read.table(file = filename, sep = "\t", header = TRUE, check.names = FALSE)
+ tbl <- aggregate(tbl[, EVERYTHING], tbl[, c("End", "Pos")], sum)
+ for (mismatch in MISMATCHES) {
+ tbl[, mismatch] <- tbl[, mismatch] / tbl[, substr(mismatch, 1, 1)]
+ }
+
+ for (mismatch in c(INSERTIONS, DELETIONS, CLIPPING)) {
+ tbl[, mismatch] <- tbl[, mismatch] / tbl[, "Total"]
+ }
+
+ return(tbl[tbl$Pos <= length, ])
+ }
+
+
+write.mutation.table <- function(tbl, end, mismatch, filename)
+ {
+ columns <- c("pos", sprintf("5p%s", mismatch))
+ tbl[is.na(tbl)] <- 0 # Python doesn't like NAs
+ write.table(tbl[tbl$End == end, c("Pos", mismatch)],
+ row.names = FALSE, col.names = columns,
+ sep = "\t", quote = FALSE,
+ file = filename)
+ }
+
+
+plot.mutations <- function(end, axis.at, start.i, end.i, modifier) {
+ do.plot <- function(tbl, end, modifier, mismatches, color, width) {
+ subtable <- tbl[tbl$End == end, c("Pos", mismatches)]
+ rates <- rowSums(subtable) - subtable$Pos
+ subtable <- aggregate(list(Rate = rates), list(Pos = subtable$Pos), sum)
+
+ lines(subtable$Pos * modifier, subtable$Rate,
+ xlim = c(1, OPT.LENGTH), ylim = c(0, OPT.YMAX),
+ col = color, lwd = width)
+ }
+
+ plot(NA, xlim = c(start.i, end.i), ylim=c(0, OPT.YMAX), col="grey", lwd = 1, type = "l", xlab = "", ylab = "", axes = FALSE)
+ axis(side = 1, labels = start.i:end.i, at = start.i:end.i, las = 2, cex.axis = 0.8)
+ axis(side = axis.at, labels = TRUE, las = 2, cex.axis = 0.8)
+ if (end == "5p") {
+ mtext("Frequency", side = 2, line = 2.5, cex = 0.6)
+ }
+ draw.open.rect(start.i, OPT.YMAX, end.i, -0.01, padding = 0.5)
+
+ for (mismatch in MISMATCHES) {
+ do.plot(mut, end, modifier, mismatch, "grey", 1)
+ }
+
+ do.plot(mut, end, modifier, CLIPPING, "orange", 1)
+ do.plot(mut, end, modifier, DELETIONS, "green", 1)
+ do.plot(mut, end, modifier, INSERTIONS, "purple", 1)
+ do.plot(mut, end, modifier, "G>A", "blue", 2)
+ do.plot(mut, end, modifier, "C>T", "red", 2)
+}
+
+
+pdf(file = OPT.PDFOUT, title = paste("mapDamage-", OPT.VERSION, " plot"))
+par(oma = c(4,2,2,2), mar = c(1,2,1,2))
+layout(matrix(c(1,1, 1,1, # Title
+ 2,3, 4,5, # A, C
+ 6,7, 8,9, # G, T
+ 10,10, 11,11), # Mismatches 5p, 3p
+ 4, 4, byrow = TRUE),
+ heights = c(3, 20, 20, 20))
+
+# Plot title
+plot(0, type = "n", xaxt = "n", yaxt = "n", bty = "n", xlab = "", ylab = "")
+mtext(OPT.TITLE, 3, cex = 1.3)
+
+# Base compositions
+com <- read.table(file = OPT.COMP, sep = "\t", header = TRUE, as.is = TRUE)
+plot.base.composition(com, "A", "blue", OPT.AROUND, xlabels = FALSE, ylabels.at = c(2, 4))
+plot.base.composition(com, "C", "green", OPT.AROUND, xlabels = FALSE, ylabels.at = c(4))
+plot.base.composition(com, "G", "black", OPT.AROUND, xlabels = TRUE, ylabels.at = c(2, 4))
+plot.base.composition(com, "T", "red", OPT.AROUND, xlabels = TRUE, ylabels.at = c(4))
+
+
+# Misincorporation patterns
+mut <- calculate.mutation.table(OPT.MISINCORP, OPT.LENGTH)
+# Write table of post-morten damage frequencies to tables
+write.mutation.table(mut, "5p", "C>T", paste(OPT.FOLDER, "/5pCtoT_freq.txt", sep = ""))
+write.mutation.table(mut, "3p", "G>A", paste(OPT.FOLDER, "/3pGtoA_freq.txt", sep = ""))
+
+par(mar = c(1, 2, 1, 1))
+plot.mutations("5p", 2, 1, OPT.LENGTH, 1)
+
+par(mar = c(1, 1, 1, 2))
+plot.mutations("3p", 4, -OPT.LENGTH, -1, -1)
+
+# graphics.off() calls dev.off() for all devices but doesn't return anything (avoid null device message)
+graphics.off()
diff --git a/mapdamage/Rscripts/stats/checkLibraries.R b/mapdamage/Rscripts/stats/checkLibraries.R
new file mode 100755
index 0000000..458b633
--- /dev/null
+++ b/mapdamage/Rscripts/stats/checkLibraries.R
@@ -0,0 +1,18 @@
+#!/usr/bin/Rscript
+#Simple script to check if the necessary packages are installed.
+args <- commandArgs(TRUE)
+whichProgram = args[2]
+
+if (whichProgram=="inline"){
+ library(inline)
+} else if (whichProgram=="ggplot2"){
+ library(ggplot2)
+} else if (whichProgram=="Rcpp"){
+ library(Rcpp)
+} else if (whichProgram=="gam"){
+ library(gam)
+} else if (whichProgram=="RcppGSL"){
+ library(RcppGSL)
+} else {
+ stop(paste("Error in checkLibraries.R, no need to check this library",whichProgram))
+}
diff --git a/mapdamage/Rscripts/stats/data.R b/mapdamage/Rscripts/stats/data.R
new file mode 100644
index 0000000..910201b
--- /dev/null
+++ b/mapdamage/Rscripts/stats/data.R
@@ -0,0 +1,55 @@
+#Functions to load the raw data
+sumTheChr <- function(da,co){
+ #Sum up the columns for the different chromosomes
+ return(tapply(da[[co]],da$Pos,sum))
+}
+
+readMapDamData <- function(folder,forward=1){
+ #Reads in the data from mapdamage
+ #Sums up the position counts for the different chromosomes
+ fil <- "misincorporation.txt"
+ raw_dat <- read.table(paste(folder,fil,sep=""),header=TRUE)
+ if (forward==1){
+ raw_dat_plus <- raw_dat[raw_dat$End=="5p" & raw_dat$Std=="+",]
+ raw_dat_minus <- raw_dat[raw_dat$End=="5p" & raw_dat$Std=="-",]
+ } else {
+ raw_dat_plus <- raw_dat[raw_dat$End=="3p" & raw_dat$Std=="+",]
+ raw_dat_minus <- raw_dat[raw_dat$End=="3p" & raw_dat$Std=="-",]
+ }
+ dat <- matrix(nrow=max(raw_dat_plus$Pos),ncol=length(colnames(raw_dat_plus))-3)
+ colnames(dat) <- colnames(raw_dat_plus)[c(-1,-2,-3)]
+ dat[,"Pos"] <- seq(from=1,to=nrow(dat),by=1)
+ if (forward!=1){
+ dat[,"Pos"] <- - dat[,"Pos"]
+ }
+ for (i in colnames(raw_dat)[c(-1,-2,-3,-4)]){
+ dat[,i] <- sumTheChr(raw_dat_plus,i)+sumTheChr(raw_dat_minus,i)
+ }
+ return(dat)
+}
+
+joinFowAndRev <- function(fo,re,nrPos){
+ #Joins the 5' and 3' end with nrPos bases for each end
+ te <- fo[1:nrPos,]
+ te2 <- re[nrPos:1,]
+ out <- rbind(te,te2)
+ out[,"Pos"] <- c(1:nrPos,-c(nrPos:1))
+ return(out)
+}
+
+getSeqLen <- function(pa){
+ #Path to the mapDamage folder to get the length distribution
+ le_dat_min <- read.table(paste(pa,"lengthDistribStrd-.txt",sep=""),header=TRUE)
+ le_dat_plu <- read.table(paste(pa,"lengthDistribStrd+.txt",sep=""),header=TRUE)
+ les <- list(Length=le_dat_min$Length,Occurences=le_dat_min$Occurences+le_dat_plu$Occurences)
+ les$Length <- les$Length[les$Occurences!=0]
+ les$Occurences <- les$Occurences[les$Occurences!=0]/sum(les$Occurences)
+ return(les)
+}
+
+readBaseFreqs <- function(fol){
+ #Get the nucleotide composition of the genome
+ fil <- "dnacomp_genome.csv"
+ dat <- read.csv(paste(fol,fil,sep=""),header=TRUE)
+ return(c(dat$A,dat$C,dat$G,dat$T))
+}
diff --git a/mapdamage/Rscripts/stats/function.R b/mapdamage/Rscripts/stats/function.R
new file mode 100644
index 0000000..50174e6
--- /dev/null
+++ b/mapdamage/Rscripts/stats/function.R
@@ -0,0 +1,694 @@
+#Various useful functions
+getPmat <- function(tmu,tv_ti_ratio,acgt){
+ #Returns the evolutionary substitution matrix
+ if (sum(acgt>=1)!=0 || sum(acgt<=0)!=0){
+ write("The ACGT frequencies must be in the range 0 to 1",stderr())
+ stop()
+ }
+ if (all.equal(sum(acgt),1)!=TRUE){
+ write("The ACGT frequencies do not sum to 1",stderr())
+ stop()
+ }
+ if (tv_ti_ratio<=0){
+ write("The transversion and transtition ratio cannot go under 0",stderr())
+ stop()
+ }
+ #Returns the substitution probability matrix.
+ if (identical(tv_ti_ratio,1) && identical(acgt,c(.25,.25,.25,.25))){
+ return(jukesCantorPmat(tmu))
+ }else{
+ Q <- qmatHKY85(tmu,tv_ti_ratio,acgt)
+ r <- eigen(Q)
+ B <- r$vectors
+ E <- diag(exp(r$values))
+
+ # Q The eigen vector change of basis
+ # M -> M
+ #
+ # ^ ^
+ # |B^-1 |B
+ # Eig
+ # N -> N
+ out <- solve(a=t(B),b= E %*% t(B))#Little trick to avoid numerical difficulties
+ rownames(out) <- c("A","C","G","T")
+ colnames(out) <- c("A","C","G","T")
+ return(out)
+ }
+}
+
+jukesCantorPmat <- function(tmu){
+ #Using the Juke-Cantor model
+ return(matrix(rep(1/4-exp(-tmu)/4,16),nrow=4,ncol=4,dimnames=list(c("A","C","G","T"),c("A","C","G","T")))+diag(rep(exp(-tmu),4)))
+}
+
+qmatHKY85 <- function(tmu,tv_ti,acgt){
+ #HKY85 model
+ # | sum_1 pi_c * tv_ti pi_g pi_t * tv_ti |
+ # | pi_a * tv_ti sum_2 pi_g * tv_ti pi_t |
+ # Q = tmu * | pi_a pi_c * tv_ti sum_3 pi_t * tv_ti |
+ # | pi_a*tv_ti pi_c pi_g * tv_ti sum_4 |
+ #returns this matrix
+ #This could be replaced with the analytical formulas for the substitutions probabilities.
+
+ Qmat <- matrix(rep(acgt,4),ncol=4,byrow=TRUE)
+ diag(Qmat) <- 0
+
+ #Adjusting the transversions versus transitions
+ #- * - *
+ #* - * -
+ #- * - *
+ #* - * -
+
+ Qmat[1,2] <- tv_ti*Qmat[1,2]
+ Qmat[1,4] <- tv_ti*Qmat[1,4]
+
+ Qmat[2,1] <- tv_ti*Qmat[2,1]
+ Qmat[2,3] <- tv_ti*Qmat[2,3]
+
+ Qmat[3,2] <- tv_ti*Qmat[3,2]
+ Qmat[3,4] <- tv_ti*Qmat[3,4]
+
+ Qmat[4,1] <- tv_ti*Qmat[4,1]
+ Qmat[4,3] <- tv_ti*Qmat[4,3]
+
+ diag(Qmat) <- -apply(Qmat,1,sum)
+
+ Qmat <- tmu * Qmat
+ return(Qmat)
+}
+
+metroDesc <- function(lpr,lol){
+ # the logic in the Metropolis-Hastings step
+ stopifnot(!is.na(lpr))
+ stopifnot(!is.na(lol))
+ if (log(runif(1))<lpr-lol){
+ return(1)
+ }else {
+ return(0)
+ }
+}
+
+genOverHang <- function(la){
+ #Currently not used in the main code but could used to generate
+ #overhangs like Philip
+ r <- runif(1)
+ i <- -1
+ p <- 0
+ while (p< r){
+ if (i==-1){
+ term <- 1
+ }else {
+ term <- 0
+ }
+ p <-p+ (la*((1-la)**(i+1))+term)/2
+ i <- i+1
+ }
+ return(i)
+}
+
+sampleHJ <- function(x,size,prob){
+ #Convenience function since sample wasn't good enough
+ if (length(prob)==1){
+ return(rep(x,size))
+ }else {
+ return(sample(x,size=size,prob=prob,replace=TRUE))
+ }
+}
+
+
+seqProbVecLambda <- function(lambda,lambda_disp,m,fo_only=NA,re_only=NA){
+ #Returns the position specific probability of being in an overhang
+ if (is.na(fo_only) || is.na(re_only)){
+ write("Must give parameters to fo_only or re_only",stderr())
+ stop()
+ }
+ psum <- matrix(ncol=1,nrow=m)
+ pvals <- dnbinom(c(1:m)-1,prob=lambda,size=lambda_disp)
+ for (i in 1:m){
+ psum[i,1] <- (1-sum(pvals[1:i]))/2
+ }
+ if (fo_only){
+ #Only the forward part
+ return(c(psum))
+ }else if (re_only && fo_only){
+ write("Shouldn't call this function with forward and reverse only.",stderr())
+ stop()
+ }else if (re_only) {
+ #The reverse part
+ return(rev(c(psum)))
+ }else{
+ #Both ends
+ psum <- c(psum[1:(m/2),1],rev(psum[1:(m/2),1]))
+ return(c(psum))
+ }
+}
+
+#The following is an MC simulation code to mimic the
+#nick frequency part in the model from Philip
+seqProbVecNuWithLengths<- cxxfunction( signature(
+ I_la="numeric",
+ I_la_disp="numeric",
+ I_nu="numeric",
+ I_m="numeric",
+ I_lengths="numeric",
+ I_mLe="numeric",
+ I_fo="numeric",
+ I_iter="numeric",
+ I_ds_protocol="numeric"
+ ) ,includes='
+ #include <gsl/gsl_randist.h>
+ int genOverHang(double la,double la_disp)
+ {
+ double r = ((double) rand() / (RAND_MAX));
+ int i = -1;
+ double p = 0;
+ double term = -500;
+ while (p<r){
+ if (i==-1){
+ term = 1;
+ }else {
+ term = 0;
+ }
+ p = p + (gsl_ran_negative_binomial_pdf(i+1,la,la_disp)+term)/2;
+ i++;
+ }
+ return(i);
+ }
+ ',body= '
+ srand(time(0));
+ Rcpp::NumericVector la(I_la);
+ Rcpp::NumericVector la_disp(I_la_disp);
+ Rcpp::NumericVector nu(I_nu);
+ Rcpp::NumericVector m(I_m);
+ Rcpp::NumericVector les(I_lengths);
+ Rcpp::NumericVector mLe(I_mLe);
+ Rcpp::NumericVector fo(I_fo);
+ Rcpp::NumericVector iter(I_iter);
+ Rcpp::NumericVector ds_protocol(I_ds_protocol);
+//
+ Rcpp::NumericVector output(mLe[0]);
+ Rcpp::NumericVector reduced_output(m[0]);
+ if (ds_protocol[0]==0){
+ for (int j = 0; j < m[0];j++){
+ reduced_output(j) = 1;
+ }
+ return(reduced_output);
+ }else {
+ for (int i = 0; i < iter[0];i++ ){
+//
+ double left_o_hang = genOverHang(la[0],la_disp[0]);
+ double right_o_hang = genOverHang(la[0],la_disp[0]);
+ double o_hang = left_o_hang+right_o_hang;
+//
+ if (o_hang>=les(i)){
+ //Single stranded sequence
+ for (int j = 0; j <les(i);j++){
+ output(j) = output(j)+1;
+ }
+ } else {
+ Rcpp::NumericVector r = runif(1);
+ if (r[0]< (1-nu[0])/((les[i]-o_hang-1)*nu[0]+(1-nu[0]))){
+ for (int j = 0; j <(les[i]-right_o_hang);j++){
+ output(j) = output(j)+1;
+ }
+ //The right overhang is always G>A for the double stranded but
+ //Here we will make the assumption that the pattern is symmetric
+ //for practical reasons We can\'t do that ....
+ }else {
+ Rcpp::NumericVector sa = floor(runif(1,0,les[i]-o_hang))+left_o_hang;
+ for (int j = 0; j <=sa[0];j++){
+ output(j) = output(j)+1;
+ }
+ }
+ }
+ }
+ if (fo(0)){
+ //Only considering the forward part
+ for (int j = 0; j < m[0];j++){
+ reduced_output(j) = output(j)/iter(0);
+ }
+ }else {
+ for (int j = 0; j < m[0]/2;j++){
+ reduced_output(j) = output(j)/iter(0);
+ reduced_output(m[0]-j-1) = 1-output(j)/iter(0);
+ }
+ }
+ return(reduced_output);
+ }
+',plugin='RcppGSL' )
+
+
+
+
+pDam <- function(th,ded,des,la,nu,lin){
+ #The damage and mutation matrix multiplied together
+ pct <- nu*(la*des+ded*(1-la))
+ pga <- (1-nu)*(la*des+ded*(1-la))
+ return(
+ c(
+ th[lin,1]*1+th[lin,3]*pga
+ ,
+ th[lin,2]*(1-pct)
+ ,
+ th[lin,3]*(1-pga)
+ ,
+ th[lin,2]*pct+th[lin,4]*1
+ )
+ )
+}
+
+logLikFunOneBaseSlow <- function(Gen,S,Theta,deltad,deltas,laVec,nuVec,m,lin){
+ #Calculates the log likelihood using R, a C++ version is available
+ ll <- 0
+ for (i in 1:length(laVec)){
+ #Get the damage probabilities
+ pd <- pDam(Theta,deltad,deltas,laVec[i],nuVec[i],lin)
+ ll <- ll + dmultinom(S[i,],Gen[i],pd,log=TRUE)
+ }
+ return(ll)
+}
+
+#The same logic as in logLikFunOneBaseSlow except using a compiled code
+#to do the hard work
+logLikFunOneBaseFast <- cxxfunction(signature(
+ I_Gen="numeric",
+ I_S="numeric",
+ I_Theta="numeric",
+ I_deltad="numeric",
+ I_deltas="numeric",
+ I_laVec="numeric",
+ I_nuVec="numeric",
+ I_m="numeric",
+ I_lin="numeric"
+ ), body = '
+Rcpp::NumericMatrix S(I_S);
+Rcpp::NumericMatrix th(I_Theta);
+
+Rcpp::NumericVector Gen(I_Gen);
+
+Rcpp::NumericVector Vded(I_deltad);
+double ded = Vded[0];
+
+Rcpp::NumericVector Vdes(I_deltas);
+double des = Vdes[0];
+
+Rcpp::NumericVector laVec(I_laVec);
+
+Rcpp::NumericVector nuVec(I_nuVec);
+
+Rcpp::NumericVector Vm(I_m);
+int m = Vm[0];
+
+Rcpp::NumericVector Vlin(I_lin);
+int lin = Vlin[0];
+
+Rcpp::NumericVector pDam(4);
+
+Rcpp::NumericVector ret(1);
+ret[0] = 0;
+
+for (int i = 0; i<laVec.size();i++){
+ double la = laVec[i];
+ double nu = nuVec[i];
+ double pct = nu*(la*des+ded*(1-la));
+ double pga = (1-nu)*(la*des+ded*(1-la));
+ pDam[0] = th(lin-1,0)*1+th(lin-1,2)*pga;
+ pDam[1] = th(lin-1,1)*(1-pct);
+ pDam[2] = th(lin-1,2)*(1-pga);
+ pDam[3] = th(lin-1,1)*pct+th(lin-1,3)*1;
+ double p1 = gsl_sf_lnfact(Gen(i))
+ -gsl_sf_lnfact(S(i,0))
+ -gsl_sf_lnfact(S(i,1))
+ -gsl_sf_lnfact(S(i,2))
+ -gsl_sf_lnfact(S(i,3));
+ double p2 = S(i,0)*log(pDam[0])
+ +S(i,1)*log(pDam[1])
+ +S(i,2)*log(pDam[2])
+ +S(i,3)*log(pDam[3]);
+ ret[0] = ret[0] + p1 + p2;
+}
+return(ret);
+', plugin="RcppGSL",include="#include <gsl/gsl_sf_gamma.h>")
+
+
+
+logLikAll <- function(dat,Theta,deltad,deltas,laVec,nuVec,m){
+ #Calculates the logLikelihood for all the bases by calling
+ # logLikFunOneBaseFast for each base
+ if (deltad<0 || deltad>1 || deltas<0 || deltas>1 ){
+ return(-Inf)
+ }
+ #A,C,G and T
+
+ deb <- 0
+
+ Asub <- dat[,"A.C"]+dat[,"A.G"]+dat[,"A.T"]
+ ALL <- logLikFunOneBaseFast(dat[,"A"],cbind(dat[,"A"]-Asub,dat[,"A.C"],dat[,"A.G"],dat[,"A.T"]),Theta,deltad,deltas,laVec,nuVec,m,1)
+ if (deb){
+ ALLSlow <- logLikFunOneBaseSlow(dat[,"A"],cbind(dat[,"A"]-Asub,dat[,"A.C"],dat[,"A.G"],dat[,"A.T"]),Theta,deltad,deltas,laVec,nuVec,m,1)
+ stopifnot(all.equal(ALL,ALLSlow))
+ }
+
+ Csub <- dat[,"C.A"]+dat[,"C.G"]+dat[,"C.T"]
+ CLL <- logLikFunOneBaseFast(dat[,"C"],cbind(dat[,"C.A"],dat[,"C"]-Csub,dat[,"C.G"],dat[,"C.T"]),Theta,deltad,deltas,laVec,nuVec,m,2)
+ if (deb){
+ CLLSlow <- logLikFunOneBaseSlow(dat[,"C"],cbind(dat[,"C.A"],dat[,"C"]-Csub,dat[,"C.G"],dat[,"C.T"]),Theta,deltad,deltas,laVec,nuVec,m,2)
+ stopifnot(all.equal(CLL,CLLSlow))
+ }
+
+
+ Gsub <- dat[,"G.A"]+dat[,"G.C"]+dat[,"G.T"]
+ GLL <- logLikFunOneBaseFast(dat[,"G"],cbind(dat[,"G.A"],dat[,"G.C"],dat[,"G"]-Gsub,dat[,"G.T"]),Theta,deltad,deltas,laVec,nuVec,m,3)
+ if (deb){
+ GLLSlow <- logLikFunOneBaseSlow(dat[,"G"],cbind(dat[,"G.A"],dat[,"G.C"],dat[,"G"]-Gsub,dat[,"G.T"]),Theta,deltad,deltas,laVec,nuVec,m,3)
+ stopifnot(all.equal(CLL,CLLSlow))
+ }
+
+ Tsub <- dat[,"T.A"]+dat[,"T.C"]+dat[,"T.G"]
+ TLL <- logLikFunOneBaseFast(dat[,"T"],cbind(dat[,"T.A"],dat[,"T.C"],dat[,"T.G"],dat[,"T"]-Tsub),Theta,deltad,deltas,laVec,nuVec,m,4)
+ if (deb){
+ TLLSlow <- logLikFunOneBaseSlow(dat[,"T"],cbind(dat[,"T.A"],dat[,"T.C"],dat[,"T.G"],dat[,"T"]-Tsub),Theta,deltad,deltas,laVec,nuVec,m,4)
+ stopifnot(all.equal(TLL,TLLSlow))
+ }
+ return(ALL+CLL+GLL+TLL)
+}
+
+
+getParams <- function(cp){
+ #Utility function nice to update the MCMC iterations matrix
+ return(c(cp$Theta,cp$Rho,cp$DeltaD,cp$DeltaS,cp$Lambda,cp$LambdaRight,cp$LambdaDisp,cp$Nu))
+}
+
+plotTrace<- function(dat,main,k=111){
+ #Running median of the MCMC iterations
+ plot(1:length(dat),dat,xlab="Iteration",ylab="",main=main,type="l")
+}
+
+plotEverything <- function(mcmcOut,hi=0,pl,thin=100){
+ #Plots the MCMC traceplot in the form of a running median and
+ #histogram of the MCMC iterations
+ if (sum(c(cu_pa$same_overhangs==FALSE,
+ cu_pa$fix_disp==FALSE,
+ cu_pa$fix_ti_tv==FALSE,
+ cu_pa$nuSamples!=0))>1){
+ #Check if I need to add a extra row
+ a_extra_row <- 1
+ }else {
+ a_extra_row <- 0
+ }
+ par(mfrow=c(3,2+a_extra_row))
+ if(hi){
+ hist(mcmcOut$out[,"Theta"],main=expression(theta),xlab="",freq=FALSE)
+ if (!mcmcOut$cu_pa$fix_ti_tv){
+ hist(mcmcOut$out[,"Rho"],main=expression(rho),xlab="",freq=FALSE)
+ }
+ hist(mcmcOut$out[,"DeltaD"],main=expression(delta[d]),xlab="",freq=FALSE)
+ hist(mcmcOut$out[,"DeltaS"],main=expression(delta[s]),xlab="",freq=FALSE)
+ hist(mcmcOut$out[,"Lambda"],main=expression(lambda),xlab="",freq=FALSE)
+ if (!mcmcOut$cu_pa$same_overhangs){
+ hist(mcmcOut$out[,"LambdaRight"],main=expression(lambda[r]),xlab="",freq=FALSE)
+ }
+ if (!mcmcOut$cu_pa$fix_disp){
+ hist(mcmcOut$out[,"LambdaDisp"],main=expression(sigma[lambda]),xlab="",freq=FALSE)
+ }
+ if (mcmcOut$cu_pa$nuSamples!=0){
+ hist(mcmcOut$out[,"Nu"],main=expression(nu),xlab="",freq=FALSE)
+ }
+ hist(mcmcOut$out[,"LogLik"],main="LogLik",xlab="",freq=FALSE)
+ }else {
+ plotTrace(mcmcOut$out[,"Theta"],main=expression(theta))
+ if (!mcmcOut$cu_pa$fix_ti_tv){
+ plotTrace(mcmcOut$out[,"Rho"],main=expression(theta))
+ }
+ plotTrace(mcmcOut$out[,"DeltaD"],main=expression(delta[d]))
+ plotTrace(mcmcOut$out[,"DeltaS"],main=expression(delta[s]))
+ plotTrace(mcmcOut$out[,"Lambda"],main=expression(lambda))
+ if (!mcmcOut$cu_pa$same_overhangs){
+ plotTrace(mcmcOut$out[,"LambdaRight"],main=expression(lambda[r]))
+ }
+ if (!mcmcOut$cu_pa$fix_disp){
+ plotTrace(mcmcOut$out[,"LambdaDisp"],main=expression(sigma[lambda]))
+ }
+ if (mcmcOut$cu_pa$nuSamples!=0){
+ plotTrace(mcmcOut$out[,"Nu"],main=expression(nu))
+ }
+ plotTrace(mcmcOut$out[,"LogLik"],main="LogLik")
+ }
+ par(mfrow=c(1,1))
+}
+
+accRat <- function(da){
+ #A rough measure of the acceptance ratio
+ return(length(unique(da))/length(da))
+}
+
+adjustPropVar <- function(mcmc,propVar){
+ #Adjust the proposal variance to get something near .22
+ for (i in colnames(mcmc$out)){
+ if (i=="LogLik"){
+ next
+ } else if (i=="LambdaRight" & mcmc$cu_pa$same_overhangs){
+ next
+ } else if (i=="Nu" & mcmc$cu_pa$nuSamples==0){
+ next
+ } else if (i=="LambdaDisp" & mcmc$cu_pa$fix_disp){
+ next
+ } else if (i=="Rho" & mcmc$cu_pa$fix_ti_tv){
+ next
+ }
+ rat <- accRat(mcmc$out[,i])
+ if (rat<0.1){
+ propVar[[i]] <- propVar[[i]]/2
+ } else if (rat>0.3) {
+ propVar[[i]] <- propVar[[i]]*2
+ }
+ }
+ return(propVar)
+}
+
+runGibbs <- function(cu_pa,iter){
+ #Sampling over the conditional posterior distribution
+ #for the parameters.
+ esti <- matrix(nrow=iter,ncol=9)
+ colnames(esti) <- c("Theta","Rho","DeltaD","DeltaS","Lambda","LambdaRight","LambdaDisp","Nu","LogLik")
+ for (i in 1:iter){
+ cu_pa<-updateTheta(cu_pa)
+ if (!cu_pa$fix_ti_tv){
+ #Fix the transition and transversion ratio
+ cu_pa<-updateRho(cu_pa)
+ }
+ cu_pa<-updateDeltaD(cu_pa)
+ cu_pa<-updateDeltaS(cu_pa)
+ cu_pa<-updateLambda(cu_pa)
+ if (!cu_pa$same_overhangs){
+ #Not the same overhangs update lambda right
+ cu_pa <- updateLambdaRight(cu_pa)
+ }
+ if (!cu_pa$fix_disp){
+ #Allowing dispersion in the overhangs
+ cu_pa<-updateLambdaDisp(cu_pa)
+ }
+ if (cu_pa$nuSamples!=0){
+ #Update the nu parameter by via MC estimation
+ cu_pa<-updateNu(cu_pa)
+ }
+ esti[i,c(1:8)] <- getParams(cu_pa)
+ esti[i,"LogLik"] <- logLikAll(cu_pa$dat,cu_pa$ThetaMat,cu_pa$DeltaD,cu_pa$DeltaS,cu_pa$laVec,cu_pa$nuVec,cu_pa$m)
+ if (! (i %% 1000) && cu_pa$verbose){
+ cat("MCMC-Iter\t",i,"\t",esti[i,"LogLik"],"\n")
+ }
+ }
+ return(list(out=esti,cu_pa=cu_pa))
+}
+
+
+simPredCheck <- function(da,output){
+ #Simulates one draw from the posterior predictive distribution
+ #and the probability of a C>T substitution because of a cytosine
+ #demnation.
+ bases <- da[,c("A","C","G","T")]
+ #Constructing the lambda vector
+ if (output$cu_pa$same_overhangs){
+ laVec <- seqProbVecLambda(sample(output$out[,"Lambda"],1),
+ sample(output$out[,"LambdaDisp"],1),
+ output$cu_pa$m,
+ output$cu_pa$forward_only,
+ cu_pa$reverse_only)
+ }else {
+ laVecLeft <- seqProbVecLambda(sample(output$out[,"Lambda"],1),
+ sample(output$out[,"LambdaDisp"],1),
+ output$cu_pa$m,
+ 0,
+ 0)
+ laVecRight <- seqProbVecLambda(sample(output$out[,"LambdaRight"],1),
+ sample(output$out[,"LambdaDisp"],1),
+ output$cu_pa$m,
+ 0,
+ 0)
+ laVec <- c(laVecLeft[1:(output$cu_pa$m/2)],laVecRight[(output$cu_pa$m/2+1):output$cu_pa$m])
+ }
+ #Constructing the nu vector
+ if (output$cu_pa$nuSamples !=0){
+ write("The MC sampling for the nu vector hasn't gone through a extensive testing procedure",stderr())
+ stop()
+ nuVec <- seqProbVecNuWithLengths(sample(output$out[,"Lambda"],1),
+ sample(output$out[,"LambdaDisp"],1),
+ sample(output$out[,"Nu"],1),
+ nrow(cu_pa$dat),
+ sampleHJ(output$cu_pa$lengths$Length,
+ size=output$cu_pa$laSamples,
+ prob=output$cu_pa$lengths$Occurences),
+ output$cu_pa$mLe,
+ output$cu_pa$forward_only,
+ output$cu_pa$nuSamples,
+ output$cu_pa$ds_protocol)
+ nuVec <- c(nuVec,rev(1-nuVec))
+ }else {
+ nuVec <- output$cu_pa$nuVec
+ }
+
+ #Sample the other parameters
+ des <- sample(output$out[,"DeltaS"],1)
+ ded <- sample(output$out[,"DeltaD"],1)
+ the <- sample(output$out[,"Theta"],1)
+ rho <- sample(output$out[,"Rho"],1)
+ pmat <- getPmat(the,rho,output$cu_pa$acgt)
+ ptransCT <- pmat["C","T"]
+ ptransCC <- pmat["C","C"]
+ ptransGA <- pmat["G","A"]
+ ptransGG <- pmat["G","G"]
+ #
+ coln <- c("A.C","A.G","A.T","C.A","C.G","C.T","G.A","G.C","G.T","T.A","T.C","T.G")
+ subs <- matrix(NA,nrow=nrow(output$cu_pa$dat),ncol=4+length(coln))
+ colnames(subs) <- c("A","C","G","T",coln)
+ #
+ damProb <- rep(NA,nrow(output$cu_pa$dat))
+ damProbGA <- damProb
+ for (i in 1:nrow(output$cu_pa$dat)){
+ #Construct the site specific probabilities
+ pct <- nuVec[i]*(laVec[i]*des+ded*(1-laVec[i]))
+ pga <- (1-nuVec[i])*(laVec[i]*des+ded*(1-laVec[i]))
+ pDamMat <- matrix(c(
+ 1,0,0,0,
+ 0,1-pct,0,pct,
+ pga,0,1-pga,0,
+ 0,0,0,1
+ ),nrow=4,byrow=TRUE)
+ ThetapDam <- pDamMat %*% pmat
+ #Calculate the probability C.T due to cytosine demanation
+ damProb[i] <- ptransCC*pct/(ptransCC*pct+ptransCT)
+ #Do not forget the reverse complement
+ damProbGA[i] <- ptransGG*pga/(ptransGG*pga+ptransGA)
+ #Then draw from a multinomial distribution
+ subs[i,c("A.C","A.G","A.T")] <- t(rmultinom(1,output$cu_pa$dat[i,"A"],ThetapDam[1,]))[-1]/output$cu_pa$dat[i,"A"]
+ subs[i,c("C.A","C.G","C.T")] <- t(rmultinom(1,output$cu_pa$dat[i,"C"],ThetapDam[2,]))[-2]/output$cu_pa$dat[i,"C"]
+ subs[i,c("G.A","G.C","G.T")] <- t(rmultinom(1,output$cu_pa$dat[i,"G"],ThetapDam[3,]))[-3]/output$cu_pa$dat[i,"G"]
+ subs[i,c("T.A","T.C","T.G")] <- t(rmultinom(1,output$cu_pa$dat[i,"T"],ThetapDam[4,]))[-4]/output$cu_pa$dat[i,"T"]
+ }
+ return(list(subs=subs,damProb=damProb,damProbGA=damProbGA))
+}
+
+calcSampleStats <- function(da,X){
+ #Summary statistics of the posterior distributions
+ return(data.frame(
+ x=1:nrow(da),
+ pos=da[,"Pos"],
+ mea=apply(X,1,mean),
+ med=apply(X,1,median),
+ loCI=apply(X,1,quantile,c(0.025)),
+ hiCI=apply(X,1,quantile,c(0.975))
+ ))
+}
+
+postPredCheck <- function(da,output,samples=10000){
+ #Plots the 95% posterior predictive intervals with the data as lines.
+ #Returns the site specific probability of a C>T or G>A substitution
+ #because of a cytosine demnation.
+ CTs <- matrix(NA,nrow=nrow(da),ncol=samples)
+ GAs <- matrix(NA,nrow=nrow(da),ncol=samples)
+ REs <- matrix(NA,nrow=nrow(da),ncol=samples)
+ C2TProbs <- matrix(NA,nrow=nrow(da),ncol=samples)
+ G2AProbs <- matrix(NA,nrow=nrow(da),ncol=samples)
+ #Two indices here, 1-based and the relative one
+ da <- cbind(1:(nrow(da)),da)
+ colnames(da)[1] <- "oneBased"
+ #Get the breaks depending on parameters for pretty plots
+ if (!output$cu_pa$forward_only || !output$cu_pa$reverse_only){
+ bres <- c(seq(from=1,to=floor(nrow(da)/2),by=2),rev(seq(from=nrow(dat),to=floor(nrow(da)/2)+1,by=-2)))
+ }else if (output$cu_pa$forward_only) {
+ bres <- seq(from=1,to=nrow(da),by=2)
+ } else if (output$cu_pa$reverse_only){
+ bres <- seq(from=nrow(dat),to=1,by=-2)
+ }else {
+ write("There is something fishy with the options",stderr())
+ stop()
+ }
+ labs <- dat[bres,"Pos"]
+ #Sample from the posterior predicitive distibution
+ for (i in 1:samples){
+ sam <- simPredCheck(da,output)
+ C2TProbs[,i] <- sam$damProb
+ G2AProbs[,i] <- sam$damProbGA
+ CTs[,i] <- sam$subs[,"C.T"]
+ GAs[,i] <- sam$subs[,"G.A"]
+ REs[,i] <- apply(sam$subs[,c("A.C","A.G","A.T","C.A","C.G","G.C","G.T","T.A","T.C","T.G")],1,mean)
+ }
+ CTsStats <- calcSampleStats(da,CTs)
+ GAsStats <- calcSampleStats(da,GAs)
+ REsStats <- calcSampleStats(da,REs)
+ #Plotting the posterior predictive intervals
+ p <- ggplot()+
+ geom_point(aes(x,mea,colour="C->T",aes_string="subs"),data=CTsStats)+
+ geom_point(aes(x,mea,colour="G->A"),data=GAsStats)+
+ geom_point(aes(x,mea,colour="Others"),data=REsStats)+
+ geom_errorbar(aes(x=x,y=med,ymin=loCI,ymax=hiCI,color="C->T"),data=CTsStats)+
+ geom_errorbar(aes(x=x,y=med,ymin=loCI,ymax=hiCI,color="G->A"),data=GAsStats)+
+ geom_errorbar(aes(x=x,y=med,ymin=loCI,ymax=hiCI,color="Others"),data=REsStats)+
+ geom_line(aes(oneBased,C.T/C),color="red",data=data.frame(da))+
+ geom_line(aes(oneBased,G.A/G),color="green",data=data.frame(da))+
+ geom_line(aes(oneBased,((A.C+A.G+A.T)/A+(C.A+C.G)/C+(G.C+G.T)/G+(T.A+T.C+T.G)/T)/10),color="blue",data=data.frame(da))+
+ ylab("Substitution rate")+
+ xlab("Relative position")+
+ scale_x_continuous(breaks=bres,labels=labs)+
+ labs(colour = "Subs. type")+
+ ggtitle("Posterior prediction intervals")
+ if (output$cu_pa$use_bw_theme){
+ p <- p+theme_bw()
+ }
+ plot(p)
+ #The correcting probabilities
+ coProbs <- cbind(da[,"Pos"],apply(C2TProbs,1,mean),apply(G2AProbs,1,mean))
+ colnames(coProbs) <- c("Position","C.T","G.A")
+ return(coProbs)
+}
+
+
+writeMCMC <- function(out,filename){
+ #Writes the posterior samples to a file
+ parameters <- c("Theta","DeltaD","DeltaS","Lambda")
+ if (!out$cu_pa$fix_ti_tv){
+ parameters <- c(parameters,"Rho")
+ }
+ if (!out$cu_pa$same_overhangs){
+ parameters <- c(parameters,"LambdaRight")
+ }
+ if (!out$cu_pa$fix_disp){
+ parameters <- c(parameters,"LambdaDisp")
+ }
+ if (out$cu_pa$nuSamples!=0){
+ parameters <- c(parameters,"Nu")
+ }
+ parameters <- c(parameters,"LogLik")
+ write.csv(out$out[,parameters],paste(filename,".csv",sep=""))
+ #Now calculate summary statistic of the posterior distributions
+ mea <- apply(out$out[,parameters],2,mean)
+ std <- apply(out$out[,parameters],2,sd)
+ qua <- apply(out$out[,parameters],2,quantile,seq(from=0,to=1,by=.025))
+ acc <- apply(out$out[,parameters],2,accRat)
+ summStat <- rbind(mea,std,acc,qua)
+ rownames(summStat)[1] <- "Mean"
+ rownames(summStat)[2] <- "Std."
+ rownames(summStat)[3] <- "Acceptance ratio"
+ write.csv(summStat,paste(filename,"_summ_stat.csv",sep=""))
+}
+
diff --git a/mapdamage/Rscripts/stats/main.R b/mapdamage/Rscripts/stats/main.R
new file mode 100644
index 0000000..cfc48cd
--- /dev/null
+++ b/mapdamage/Rscripts/stats/main.R
@@ -0,0 +1,283 @@
+#The main work flow of the package
+#Load the libraries
+suppressMessages(library(inline)) #Already checked the libraries
+suppressMessages(library(ggplot2)) #thus ignoring any messages from them
+suppressMessages(library(Rcpp))
+suppressMessages(library(gam))
+suppressMessages(library(RcppGSL))
+
+#Miscellaneous functions
+source(paste(path_to_mapDamage_stats,"function.R",sep=""))
+
+#Prior and proposal distributions for the parameters
+source(paste(path_to_mapDamage_stats,"priorPropose.R",sep=""))
+
+#Update functions for the gibbs sampler
+source(paste(path_to_mapDamage_stats,"postConditonal.R",sep=""))
+
+#functions for the grid search
+source(paste(path_to_mapDamage_stats,"start.R",sep=""))
+
+#functions for loading the data
+source(paste(path_to_mapDamage_stats,"data.R",sep=""))
+
+
+#######################################################
+#
+# Loading the data
+#
+#######################################################
+
+#This used for the Briggs MC estimation
+if (nu_samples!=0){
+ start_vals$lengths <- getSeqLen(path_to_dat)
+ start_vals$le <- floor(weighted.mean(x=start_vals$lengths$Length,w=start_vals$lengths$Occurences))
+}
+
+if (forward_only && reverse_only){
+ write("Cannot specify using only the 5' end and the 3' end which makes no sense",stderr())
+ stop()
+}
+
+fow_dat <-readMapDamData(path_to_dat)
+rev_dat <-readMapDamData(path_to_dat,forward=0)
+if (forward_only){
+ #Taking only the forward part
+ dat <- fow_dat[1:sub_length,]
+}else if (reverse_only){
+ #Taking only the reverse part
+ dat <- rev_dat[sub_length:1,]
+}else {
+ #Using both ends
+ dat <- joinFowAndRev(fow_dat,rev_dat,sub_length)
+}
+
+#Getting everything ready for the mutation model
+if (jukes_cantor){
+ acgt <- c(0.25,0.25,0.25,0.25)
+ fix_ti_tv <- TRUE
+}else {
+ acgt <- readBaseFreqs(path_to_dat)
+ fix_ti_tv <- FALSE
+}
+
+#A list that keeps everything between the iterations
+cu_pa <- list(
+ dat=dat,
+ ThetaMat=NA,
+ Theta=-log((-start_vals$ptrans+.25)*4),
+ Rho=start_vals$rho,
+ DeltaD=start_vals$deltad,
+ DeltaS=start_vals$deltas,
+ Lambda=start_vals$lambda,
+ LambdaRight=start_vals$lambda_right,
+ LambdaDisp=start_vals$lambda_disp,
+ Nu=start_vals$nu,
+ m=nrow(dat),
+ meanLength=NA,
+ lengths=NA,
+ mLe=NA,
+ laVec=NA,
+ nuVec=NA,
+ iter=iterations,
+ nuSamples=nu_samples,
+ fix_nu=fix_nu,
+ ds_protocol=ds_protocol,
+ forward_only=forward_only,
+ reverse_only=reverse_only,
+ fix_disp= fix_disp,
+ same_overhangs= same_overhangs,
+ fix_ti_tv = fix_ti_tv,
+ acgt = acgt,
+ old_lik=-Inf,
+ verbose=verbose,
+ quiet=quiet,
+ use_raw_nick_freq = use_raw_nick_freq,
+ use_bw_theme = use_bw_theme
+ )
+
+if (nu_samples!=0){
+ #Need the lengths for the Briggs MC estimation
+ cu_pa$meanLength <- start_vals$le
+ cu_pa$lengths <- start_vals$lengths
+ cu_pa$mLe <- max(start_vals$lengths$Length)
+}
+
+cu_pa$ThetaMat <- getPmat(cu_pa$Theta,cu_pa$Rho,cu_pa$acgt)
+
+#######################################################
+#
+# Setting the lambda vector
+#
+#######################################################
+
+cu_pa$laVec <- seqProbVecLambda(cu_pa$Lambda,cu_pa$LambdaDisp,nrow(cu_pa$dat),cu_pa$forward_only,cu_pa$reverse_only)
+
+if (!cu_pa$same_overhangs){
+ #The overhangs are not the same
+ if (cu_pa$forward_only){
+ write("Cannot use different overhangs with only the 5' end",stderr())
+ stop()
+ } else if (cu_pa$reverse_only){
+ write("Cannot use different overhangs with only the 3' end",stderr())
+ stop()
+ }
+ cu_pa$laVecRight <- seqProbVecLambda(cu_pa$LambdaRight,cu_pa$LambdaDisp,nrow(cu_pa$dat),cu_pa$forward_only,cu_pa$reverse_only)
+}
+
+#######################################################
+#
+# Setting the nu vector
+#
+#######################################################
+
+if (!cu_pa$ds_protocol & cu_pa$nuSamples!=0){
+ #Using the single stranded protocol with nu samples which makes
+ #no sense
+ write("Silly to use the MC estimation for nu_i if using the single stranded protocol", stderr())
+ stop()
+}else if (cu_pa$nuSamples!=0 & cu_pa$fix_nu) {
+ #Using the fixed nu parameter with nu samples which makes no sense
+ write("Silly to use the MC estimation for nu_i and use fix_nu ", stderr())
+ stop()
+}else if (cu_pa$nuSamples!=0){
+ #Using the nu vector
+ #IF ds protocol we set nu_vector as 1
+ cu_pa$nuVec <- seqProbVecNuWithLengths(cu_pa$Lambda,
+ cu_pa$LambdaDisp,
+ cu_pa$Nu,nrow(cu_pa$dat),
+ sampleHJ(cu_pa$lengths$Length,size=cu_pa$nuSamples,prob=cu_pa$lengths$Occurences),
+ cu_pa$mLe,
+ cu_pa$forward_only,
+ cu_pa$nuSamples,
+ cu_pa$ds_protocol)
+}else if (!cu_pa$ds_protocol){
+ #The single stranded protocol
+ cu_pa$nuVec <- rep(1,nrow(dat))
+}else if (fix_nu){
+ #Ones at the 5' end and zeros at the 3' end
+ if (cu_pa$forward_only){
+ cu_pa$nuVec <- rep(1,nrow(dat))
+ }else if (cu_pa$reverse_only){
+ cu_pa$nuVec <- rep(0,nrow(dat))
+ }else {
+ cu_pa$nuVec <- c(rep(1,nrow(dat)/2),rep(0,nrow(dat)/2))
+ }
+}else {
+ #This is for a non linear nick frequency, assumes the G>T and G>A are
+ #mostly due to DNA damage patterns do not use for low damage datasets
+ te<-(dat[,"C.T"]/dat[,"C"])/(dat[,"G.A"]/dat[,"G"]+dat[,"C.T"]/dat[,"C"])
+ if (sum(is.na(te) )!=0 ){
+ write("Warning, To few substitutions to assess the nick frequency, using constant nick frequency instead", stderr())
+ if (cu_pa$forward_only){
+ cu_pa$nuVec <- rep(1,nrow(dat))
+ }else if (cu_pa$reverse_only){
+ cu_pa$nuVec <- rep(0,nrow(dat))
+ }else {
+ cu_pa$nuVec <- c(rep(1,nrow(dat)/2),rep(0,nrow(dat)/2))
+ }
+ } else {
+ #The substitutes seem to be okay estimate the nick frequency using GAM
+ if (cu_pa$forward_only || cu_pa$reverse_only){
+ if (cu_pa$use_raw_nick_freq){
+ #Use the frequency
+ cu_pa$nuVec <- te
+ }else{
+ #Use a smoother
+ cu_pa$nuVec <- predict(gam(te~s(1:(cu_pa$m))))
+ }
+ }else {
+ if (cu_pa$use_raw_nick_freq){
+ cu_pa$nuVec <- c(te[1:(cu_pa$m/2)],te[(cu_pa$m/2+1):cu_pa$m])
+ }else{
+ cu_pa$nuVec <- c(predict(gam(te[1:(cu_pa$m/2)]~s(1:(cu_pa$m/2)))),
+ predict(gam(te[(cu_pa$m/2+1):cu_pa$m]~s(1:(cu_pa$m/2)))))
+ }
+ }
+ #This shouldn't happen but for sanity check
+ cu_pa$nuVec[cu_pa$nuVec>1] <- 1
+ cu_pa$nuVec[cu_pa$nuVec<0] <- 0
+ }
+ rm(te)
+}
+#######################################################
+#
+# Finding an "optimal" starting place
+#
+#######################################################
+
+if (grid_iter!=0){
+ #Start at random places and optimize the likelihood function
+ cu_pa <- gridSearch(cu_pa,grid_iter)
+}
+
+#Calculate the log likelihood in the beginning
+if (cu_pa$same_overhangs){
+ te_laVec <- cu_pa$laVec
+}else {
+ te_laVec <- c(cu_pa$laVec[1:(cu_pa$m/2)],cu_pa$laVecRight[(cu_pa$m/2+1):cu_pa$m])
+}
+cu_pa$old_lik <- logLikAll(cu_pa$dat,
+ cu_pa$ThetaMat,
+ cu_pa$DeltaD,
+ cu_pa$DeltaS,
+ te_laVec,
+ cu_pa$nuVec,
+ cu_pa$m
+ )
+
+
+if (adjust_iter==0){
+ #Single burning period
+ if (!cu_pa$quiet){
+ cat("Single burn in period\n")
+ }
+ mcmcOut <- runGibbs(cu_pa,burn_in)
+ cu_pa <- mcmcOut$cu_pa
+}else {
+ for (i in 1:adjust_iter){
+ if (!cu_pa$quiet){
+ cat("Adjusting the proposal variance iteration ",i," \n")
+ }
+ mcmcOut <- runGibbs(cu_pa,burn_in)
+ cu_pa <- mcmcOut$cu_pa
+ proposeParameters <- adjustPropVar(mcmcOut,proposeParameters)
+ }
+}
+
+if (!cu_pa$quiet){
+ cat("Done burning, starting the iterations\n")
+}
+mcmcOut <- runGibbs(cu_pa,iterations)
+cu_pa <- mcmcOut$cu_pa
+
+if (!cu_pa$quiet){
+ cat("Done with the iterations, finishing up\n")
+}
+
+if (out_file_base!=""){
+ cat("Writing and plotting to files\n")
+ #Print everything to file
+ writeMCMC(mcmcOut,paste(out_file_base,"_MCMC_iter",sep=""))
+ #Trace plot
+ pdf(paste(out_file_base,"_MCMC_trace.pdf",sep=""))
+ plotEverything(mcmcOut,hi=0)
+ dev.off()
+ #histogram of the conditional distributions
+ pdf(paste(out_file_base,"_MCMC_hist.pdf",sep=""))
+ plotEverything(mcmcOut,hi=1)
+ dev.off()
+ #Posterior predictive plot
+ pdf(paste(out_file_base,"_MCMC_post_pred.pdf",sep=""))
+ siteProb <- postPredCheck(dat,mcmcOut)
+ dev.off()
+ #Write correcting probabilities
+ write.csv(siteProb,paste(out_file_base,"_MCMC_correct_prob.csv",sep=""))
+} else {
+ cat("Plotting\n")
+ plotEverything(mcmcOut,hi=0)
+ X11()
+ plotEverything(mcmcOut,hi=1)
+ X11()
+ postPredCheck(dat,mcmcOut)
+}
diff --git a/mapdamage/Rscripts/stats/postConditonal.R b/mapdamage/Rscripts/stats/postConditonal.R
new file mode 100644
index 0000000..e64d92b
--- /dev/null
+++ b/mapdamage/Rscripts/stats/postConditonal.R
@@ -0,0 +1,185 @@
+#The posterior conditional function utilized by the
+# Gibbs sampler in function.R. They have all the
+#same form maybe they should be implemented in a
+#smarter way.
+
+#The basic structure is the following
+
+#1. Get the old parameter
+#2. Propose a jump
+#3. Accept it using the MH ratio
+#4. Return the old or new value based on the MH ratio
+
+updateTheta <- function(cp){
+ old_lik <- cp$old_lik+priorTheta(cp$Theta)
+ theta_star <- proposeTheta(cp$Theta,1)
+ if (theta_star<0 ){
+ new_lik<- -Inf
+ } else {
+ theta_star_mat <- getPmat(theta_star,cp$Rho,cp$acgt)
+ new_lik_func <- logLikAll(cp$dat,theta_star_mat,cp$DeltaD,cp$DeltaS,cp$laVec,cp$nuVec,cp$m)
+ new_lik <- new_lik_func+priorTheta(theta_star)
+ }
+ if (metroDesc(new_lik,old_lik)) {
+ #Accept
+ cp$Theta <- theta_star
+ cp$ThetaMat <- theta_star_mat
+ cp$old_lik <- new_lik_func
+ }
+ return(cp)
+}
+
+updateRho <- function(cp){
+ old_lik <- cp$old_lik+priorRho(cp$Rho)
+ rho_star <- proposeRho(cp$Rho,1)
+ if (rho_star<=0 ){
+ new_lik<- -Inf
+ } else {
+ rho_star_mat <- getPmat(cp$Theta,rho_star,cp$acgt)
+ new_lik_func <- logLikAll(cp$dat,rho_star_mat,cp$DeltaD,cp$DeltaS,cp$laVec,cp$nuVec,cp$m)
+ new_lik <- new_lik_func+priorRho(rho_star)
+ }
+ if (metroDesc(new_lik,old_lik)) {
+ #Accept
+ cp$Rho <- rho_star
+ cp$ThetaMat <- rho_star_mat
+ cp$old_lik <- new_lik_func
+ }
+ return(cp)
+}
+
+updateDeltaD <- function(cp){
+ old_lik <- cp$old_lik+priorDeltaD(cp$DeltaD)
+ deltad_star <- proposeDeltaD(cp$DeltaD,1)
+ if (deltad_star<0 || deltad_star>1){
+ new_lik <- -Inf
+ }else {
+ new_lik_func <- logLikAll(cp$dat,cp$ThetaMat,deltad_star,cp$DeltaS,cp$laVec,cp$nuVec,cp$m)
+ new_lik <- new_lik_func+priorDeltaD(deltad_star)
+ }
+ if (metroDesc(new_lik,old_lik)) {
+ #Accept
+ cp$DeltaD <- deltad_star
+ cp$old_lik <- new_lik_func
+ }
+ return(cp)
+}
+
+updateDeltaS <- function(cp){
+ old_lik <- cp$old_lik+priorDeltaS(cp$DeltaS)
+ deltas_star <- proposeDeltaS(cp$DeltaS,1)
+ if (deltas_star<0|| deltas_star>1){
+ new_lik <- -Inf
+ }else {
+ new_lik_func <- logLikAll(cp$dat,cp$ThetaMat,cp$DeltaD,deltas_star,cp$laVec,cp$nuVec,cp$m)
+ new_lik <- new_lik_func+priorDeltaS(deltas_star)
+ }
+ if (metroDesc(new_lik,old_lik)) {
+ #Accept
+ cp$DeltaS <- deltas_star
+ cp$old_lik <- new_lik_func
+ }
+ return(cp)
+}
+
+updateLambda <- function(cp){
+ old_lik <- cp$old_lik+priorLambda(cp$Lambda)
+ lambda_star <- proposeLambda(cp$Lambda,1)
+ if (lambda_star<0 || lambda_star>1){
+ return(cp)
+ }
+ laVecStarLeft <- seqProbVecLambda(lambda_star,cp$LambdaDisp,cp$m,cp$forward_only,cp$reverse_only)
+ if (!cp$same_overhangs){
+ #The left and right overhangs are not the same!
+ laVecStar <- c(laVecStarLeft[1:(cp$m/2)],cp$laVecRight[(cp$m/2+1):cp$m])
+ } else {
+ #It left and right are the same
+ laVecStar <- laVecStarLeft
+ }
+ new_lik_func <- logLikAll(cp$dat,cp$ThetaMat,cp$DeltaD,cp$DeltaS,laVecStar,cp$nuVec,cp$m)
+ new_lik <- new_lik_func+priorLambda(lambda_star)
+ if (metroDesc(new_lik,old_lik)) {
+ #Accept
+ cp$Lambda <- lambda_star
+ cp$laVec <- laVecStar
+ cp$old_lik <- new_lik_func
+ }
+ return(cp)
+}
+
+updateLambdaRight <- function(cp){
+ old_lik <- cp$old_lik+priorLambdaRight(cp$LambdaRight)
+ lambda_right_star <- proposeLambdaRight(cp$LambdaRight,1)
+ if (lambda_right_star<0 || lambda_right_star>1){
+ #Reject this right away
+ return(cp)
+ }
+ laVecStarRight <- seqProbVecLambda(lambda_right_star,cp$LambdaDisp,cp$m,cp$forward_only,cp$reverse_only)
+ if (!cp$same_overhangs){
+ #The left and right overhangs are not the same!
+ laVecStar <- c(cp$laVec[1:(cp$m/2)],laVecStarRight[(cp$m/2+1):cp$m])
+ } else {
+ #left and right are the same
+ print("You shouldn't be calling this function if the overhangs are the same")
+ stop()
+ }
+ if (cp$forward_only){
+ print("You shouldn't be calling this function if you are only considering the forward part")
+ stop()
+ }
+ if (cp$reverse_only){
+ print("You shouldn't be calling this function if you are only considering the reverse part")
+ stop()
+ }
+ new_lik_func <- logLikAll(cp$dat,cp$ThetaMat,cp$DeltaD,cp$DeltaS,laVecStar,cp$nuVec,cp$m)
+ new_lik <- new_lik_func+priorLambdaRight(lambda_right_star)
+ if (metroDesc(new_lik,old_lik)) {
+ #Accept
+ cp$LambdaRight <- lambda_right_star
+ cp$laVecRight <- laVecStar
+ cp$old_lik <- new_lik_func
+ }
+ return(cp)
+}
+
+updateLambdaDisp <- function(cp){
+ old_lik <- cp$old_lik+priorLambdaDisp(cp$LambdaDisp)
+ lambda_disp_star <- proposeLambdaDisp(cp$LambdaDisp,1)
+ if (lambda_disp_star<0 ){
+ return(cp)
+ }
+ if (!cp$same_overhangs){
+ leftLaVecStar <- seqProbVecLambda(cp$Lambda,lambda_disp_star,cp$m,cp$forward_only,cp$reverse_only)
+ rightLaVecStar <- seqProbVecLambda(cp$LambdaRight,lambda_disp_star,cp$m,cp$forward_only,cp$reverse_only)
+ laVecStar <- c(leftLaVecStar[1:(cp$m/2)],rightLaVecStar[(cp$m/2+1):cp$m])
+ }else {
+ laVecStar <- seqProbVecLambda(cp$Lambda,lambda_disp_star,cp$m,cp$forward_only,cp$reverse_only)
+ }
+ new_lik_func <- logLikAll(cp$dat,cp$ThetaMat,cp$DeltaD,cp$DeltaS,laVecStar,cp$nuVec,cp$m)
+ new_lik <- new_lik_func+priorLambdaDisp(lambda_disp_star)
+ if (metroDesc(new_lik,old_lik)) {
+ #Accept
+ cp$LambdaDisp <- lambda_disp_star
+ cp$laVec <- laVecStar
+ cp$old_lik <- new_lik_func
+ }
+ return(cp)
+}
+
+updateNu <- function(cp){
+ old_lik <- cp$old_lik+priorNu(cp$Nu)
+ nu_star <- proposeNu(cp$Nu,1)
+ if (nu_star<0 || nu_star>1){
+ return(cp)
+ }
+ nu_Vec_star <- seqProbVecNuWithLengths(cp$Lambda,cp$LambdaDisp,nu_star,cp$m,sampleHJ(cp$lengths$Length,size=cp$nuSamples,prob=cp$lengths$Occurences),cp$mLe,cp$forward_only,cu_pa$nuSamples,cu_pa$ds_protocol)
+ new_lik_func <- logLikAll(cp$dat,cp$ThetaMat,cp$DeltaD,cp$DeltaS,cp$laVec,nu_Vec_star,cp$m)
+ new_lik <- new_lik_func+priorNu(nu_star)
+ if (metroDesc(new_lik,old_lik)) {
+ #Accept
+ cp$Nu <- nu_star
+ cp$nuVec <- nu_Vec_star
+ cp$old_lik <- new_lik_func
+ }
+ return(cp)
+}
diff --git a/mapdamage/Rscripts/stats/priorPropose.R b/mapdamage/Rscripts/stats/priorPropose.R
new file mode 100644
index 0000000..2baff55
--- /dev/null
+++ b/mapdamage/Rscripts/stats/priorPropose.R
@@ -0,0 +1,125 @@
+#Prior and proposal distributions for the parameter,
+#they are all uninformative.
+
+priorTheta <- function(x){
+ return(dnorm(x=x,mean=1,sd=500,log=TRUE))
+}
+
+priorRho <- function(x){
+ return(dnorm(x=x,mean=1,sd=500,log=TRUE))
+}
+
+priorDeltaD <- function(x){
+ if (x<0 || x>1){
+ return(-Inf)
+ }
+ return(dbeta(x=x,shape1=1,shape2=1,log=TRUE))
+}
+
+priorDeltaS <- function(x){
+ if (x<0 || x>1){
+ return(-Inf)
+ }
+ return(dbeta(x=x,shape1=1,shape2=1,log=TRUE))
+}
+
+priorLambda <- function(x){
+ if (x<0 || x>1){
+ return(-Inf)
+ }
+ return(dbeta(x=x,shape1=1,shape2=1,log=TRUE))
+}
+
+priorLambdaRight <- function(x){
+ if (x<0 || x>1){
+ return(-Inf)
+ }
+ return(dbeta(x=x,shape1=1,shape2=1,log=TRUE))
+}
+
+priorLambdaDisp <- function(x){
+ if (x<0){
+ return(-Inf)
+ }
+ #2 times since we truncate it at zero
+ return(log(2)+dnorm(x=x,mean=0,sd=100,log=TRUE))
+}
+
+priorNu <- function(x){
+ if (x<0 || x>1){
+ return(-Inf)
+ }
+ return(dbeta(x=x,shape1=1,shape2=1,log=TRUE))
+}
+
+proposeTheta <- function(x=NA,ra=NA){
+ sh1 <- proposeParameters$Theta
+ if (!is.na(ra)){
+ return(rnorm(1,mean=x,sd=sh1))
+ } else {
+ return(0)
+ }
+}
+
+proposeRho <- function(x=NA,ra=NA){
+ sh1 <- proposeParameters$Rho
+ if (!is.na(ra)){
+ return(rnorm(1,mean=x,sd=sh1))
+ } else {
+ return(0)
+ }
+}
+
+proposeDeltaD <- function(x=NA,ra=NA){
+ sh1 <- proposeParameters$DeltaD
+ if (!is.na(ra)){
+ return(rnorm(1,mean=x,sd=sh1))
+ } else {
+ return(0)#Okay since norm is symmetric
+ }
+}
+
+proposeDeltaS <- function(x=NA,ra=NA){
+ sh1 <- proposeParameters$DeltaS
+ if (!is.na(ra)){
+ return(rnorm(1,mean=x,sd=sh1))
+ } else {
+ return(0)
+ }
+}
+
+proposeLambda <- function(x=NA,ra=NA){
+ sh1 <- proposeParameters$Lambda
+ if (!is.na(ra)){
+ return(rnorm(1,mean=x,sd=sh1))
+ } else {
+ return(0)
+ }
+}
+
+proposeLambdaRight <- function(x=NA,ra=NA){
+ sh1 <- proposeParameters$LambdaRight
+ if (!is.na(ra)){
+ return(rnorm(1,mean=x,sd=sh1))
+ } else {
+ return(0)
+ }
+}
+
+proposeLambdaDisp <- function(x=NA,ra=NA){
+ sh1 <- proposeParameters$LambdaDisp
+ if (!is.na(ra)){
+ return(rnorm(1,mean=x,sd=sh1))
+ } else {
+ return(0)
+ }
+}
+
+proposeNu <- function(x=NA,ra=NA){
+ sh1 <- proposeParameters$Nu
+ if (!is.na(ra)){
+ return(rnorm(1,mean=x,sd=sh1))
+ } else {
+ return(0)
+ }
+}
diff --git a/mapdamage/Rscripts/stats/runGeneral.R b/mapdamage/Rscripts/stats/runGeneral.R
new file mode 100755
index 0000000..03d4e52
--- /dev/null
+++ b/mapdamage/Rscripts/stats/runGeneral.R
@@ -0,0 +1,95 @@
+#! /usr/bin/Rscript
+#Parses the command line arguments and calls the main program
+
+#Enable full backtraces on errors
+on_error <- function(e)
+ {
+ traceback(2)
+ quit(status = 1)
+ }
+options(error = on_error)
+
+rm(list=ls())
+graphics.off()
+
+argsList<-commandArgs(TRUE) #Assuming the user is not using this script directly
+argsList <- argsList[-1] #Skip --args
+
+
+#######################################################
+#
+# Proposal parameters
+#
+#######################################################
+
+proposeParameters <- list(
+ Theta=0.0003,
+ Rho=0.001,
+ DeltaD=0.001,
+ DeltaS=0.009,
+ Lambda=0.008,
+ LambdaRight=0.008,
+ LambdaDisp=0.015,
+ Nu=0.001
+ )
+
+#######################################################
+#
+# Initial values
+#
+#######################################################
+
+
+start_vals <- list(
+ ptrans = 0.00396/3,
+ rho = 1,
+ deltad = 0.0285,
+ deltas = 0.269,
+ lambda = 0.27,
+ lambda_right = 0.27,
+ lambda_disp = 1,
+ len = 60,
+ nu = 0.0645
+ )
+
+
+#######################################################
+#
+# Various parameters
+#
+#######################################################
+
+grid_iter <- as.integer(argsList[1]) # Number of random starting points for the grid search
+burn_in <- as.integer(argsList[2]) # Burn in period
+adjust_iter <- as.integer(argsList[3]) # Adjust proposal variance parameters iterations
+iterations <- as.integer(argsList[4]) # Iterations
+
+forward_only <- as.logical(as.numeric(argsList[5])) # Taking only the 5' end of the seqs
+reverse_only <- as.logical(as.numeric(argsList[6])) # Taking only the 3' end of the seqs
+fix_disp <- as.logical(as.numeric(argsList[7])) # Geom instead of neg Bin
+same_overhangs <- as.logical(as.numeric(argsList[8]))# The overhangs are the same on both sides
+
+nu_samples <- as.integer(argsList[9]) # Estimate the nu vector using the Briggs model (Should be similiar ammount to the number of sequences use this on your own risk....)
+fix_nu <- as.logical(as.numeric(argsList[10])) # Set 1 at 5' end and 0 at 3' end or else estimates it with GAM
+ds_protocol <- as.logical(as.numeric(argsList[11])) # Single stranded protocol C>T at both sides
+sub_length <- as.integer(argsList[12]) # How long sequence to use from each side
+
+path_to_dat <- argsList[13] # Absolute path to the dataset
+path_to_mapDamage_stats <- argsList[14] # Absolute path to the mapDamage-stats folder
+out_file_base <- argsList[15] # Base file name of the output
+verbose <- as.logical(as.numeric(argsList[16])) # These options control the volume of the output
+quiet <- as.logical(as.numeric(argsList[17]))
+
+jukes_cantor <- as.logical(as.numeric(argsList[18])) # Fix the transition and transversion ratio and acgt frequencies are equal
+path_to_acgt <- argsList[19]
+use_raw_nick_freq <- as.logical(as.numeric(argsList[20]))
+use_bw_theme <- as.logical(as.numeric(argsList[21]))
+
+
+#######################################################
+#
+# Run the program
+#
+#######################################################
+
+source(paste(path_to_mapDamage_stats,"main.R",sep=""))
diff --git a/mapdamage/Rscripts/stats/start.R b/mapdamage/Rscripts/stats/start.R
new file mode 100644
index 0000000..a71f034
--- /dev/null
+++ b/mapdamage/Rscripts/stats/start.R
@@ -0,0 +1,107 @@
+#Runs likelihood optimization in the beginning to
+#start in a better place.
+
+logLikAllOptimize <- function(x,cp){
+ #Optimizer wrapper for the log likelihood function
+ Theta <- x["Theta"]
+ DeltaD <- x["DeltaD"]
+ DeltaS <- x["DeltaS"]
+ Lambda <- x["Lambda"]
+ LambdaRight <- x["LambdaRight"]
+ LambdaDisp <- x["LambdaDisp"]
+ Rho <- x["Rho"]
+ if (sum(c(DeltaD,DeltaS,Lambda,LambdaRight)>1)>0 || sum(c(Theta,DeltaD,DeltaS,Lambda,LambdaRight,Rho)<0)>0){
+ #This very convenient for lazy people, using a unbounded optimization method Nelder-Mead
+ #just returning Inf when we are outside the boundary.
+ return(Inf)
+ }
+ if(cp$fix_ti_tv){
+ theta_mat <- getPmat(Theta,cp$Rho,cu_pa$acgt)
+ }else {
+ theta_mat <- getPmat(Theta,Rho,cu_pa$acgt)
+ }
+ if (cp$fix_disp){
+ disp_to_use = cp$LambdaDisp
+ }else {
+ disp_to_use = LambdaDisp
+ }
+ leftLaVec <- seqProbVecLambda(Lambda,disp_to_use,cu_pa$m,cu_pa$forward_only,cu_pa$reverse_only)
+ rightLaVec <- seqProbVecLambda(LambdaRight,disp_to_use,cu_pa$m,cu_pa$forward_only,cu_pa$reverse_only)
+ if (cp$same_overhangs){
+ rightLaVec <- leftLaVec
+ }
+ if (cp$forward_only){
+ #Only using the forward end
+ laVec <- leftLaVec
+ }else if (cp$reverse_only){
+ #Only using the backward end
+ laVec <- rightLaVec
+ }else {
+ #Both ends
+ laVec <- c(leftLaVec[1:(cu_pa$m/2)],rightLaVec[(cu_pa$m/2+1):cu_pa$m])
+ }
+ return(-logLikAll(cp$dat,theta_mat,DeltaD,DeltaS,laVec,cp$nuVec,cp$m))
+}
+
+gridSearch <- function(cp,iter){
+ #Starts the Markov chain in local maxima for "quicker" burn-in period
+ #No theoretical reasoning behind this but seems to work well in practice.
+ if (cp$nuSamples!=0){
+ write("Can't use the grid search with simulated nu vector",stderr())
+ stop()
+ }
+ if(!cp$quiet){
+ cat("Starting grid search, starting from random values\n")
+ }
+ minVal <- Inf
+ minParams <- list()
+ control <- list(maxit=5000)
+ for (i in 1:iter){
+ out <- optim(
+ c(Theta=runif(1),
+ DeltaD=runif(1),
+ DeltaS=runif(1),
+ Lambda=runif(1),
+ LambdaRight=runif(1),
+ LambdaDisp=sample(c(0.5,1,2,3,4,50,100,150,400),1),
+ Rho=sample(c(0.5,.75,1,1.25,1.5),1)
+ ),
+ logLikAllOptimize,
+ NULL,
+ cp=cp,
+ method="Nelder-Mead",
+ control=control
+ )
+ if (out$value<minVal){
+ minParams <- out
+ minVal <- out$value
+ }
+ if (cp$verbose){
+ cat("Iteration\t",i,"\t",-minVal,"\n")
+ }
+ }
+ #These parameters are always optimized
+ cp$Theta <- minParams$par["Theta"]
+ cp$DeltaD <- minParams$par["DeltaD"]
+ cp$DeltaS <- minParams$par["DeltaS"]
+ cp$Lambda <- minParams$par["Lambda"]
+
+ #Only update the other ones if the user requested for them
+ if (!cp$fix_ti_tv){
+ cp$Rho <- minParams$par["Rho"]
+ }
+ if (!cp$fix_disp){
+ cp$LamdaDisp <- minParams$par["LambdaDisp"]
+ }
+ if (!cp$same_overhangs){
+ cp$LamdaRight <- minParams$par["LambdaRight"]
+ }
+
+ #Now calculating matrix and vectors accompanying the optimal values
+ cp$laVec <- seqProbVecLambda(cp$Lambda,cp$LambdaDisp,nrow(cp$dat),cp$forward_only,cp$reverse_only)
+ cp$laVecRight <- seqProbVecLambda(cp$Lambda,cp$LambdaDisp,nrow(cp$dat),cp$forward_only,cp$reverse_only)
+ cp$ThetaMat <- getPmat(cp$Theta,cp$Rho,cp$acgt)
+ cp$old_lik <- - minVal
+
+ return(cp)
+}
diff --git a/mapdamage/__init__.py b/mapdamage/__init__.py
new file mode 100644
index 0000000..ee20c07
--- /dev/null
+++ b/mapdamage/__init__.py
@@ -0,0 +1,10 @@
+#!/usr/bin/env python
+
+import mapdamage.parseoptions
+import mapdamage.seq
+import mapdamage.align
+import mapdamage.tables
+import mapdamage.composition
+import mapdamage.rscript
+import mapdamage.rescale
+import mapdamage.version
diff --git a/mapdamage/align.py b/mapdamage/align.py
new file mode 100644
index 0000000..6abe523
--- /dev/null
+++ b/mapdamage/align.py
@@ -0,0 +1,134 @@
+#!/usr/bin/env python
+
+import itertools
+import mapdamage
+
+# from Martin Kircher, description of CIGAR operations
+#O BAM Description
+#M 0 alignment match (can be a sequence match or mismatch)
+#I 1 insertion to the reference
+#D 2 deletion from the reference
+#N 3 skipped region from the reference
+#S 4 soft clipping (clipped sequences present in SEQ)
+#H 5 hard clipping (clipped sequences NOT present in SEQ)
+#P 6 padding (silent deletion from padded reference)
+#= 7 sequence match
+#X 8 sequence mismatch
+
+
+def get_coordinates(read):
+ """ return external coordinates of aligned read bases """
+ fivep = read.aend if read.is_reverse else read.pos
+ threep = read.pos if read.is_reverse else read.aend
+
+ return fivep, threep
+
+
+def get_around(coord, chrom, reflengths, length, ref):
+ """ return reference sequences before and after the read
+ check for extremities and return what is available """
+ coord_min = min(coord)
+ coord_max = max(coord)
+
+ pos_before = max(0, coord_min - length)
+ pos_after = min(reflengths[chrom], coord_max + length)
+
+ # Uppercased, to be sure that we don't compare A,C,G,T and a,c,g,t
+ before = ref.fetch(chrom, pos_before, coord_min).upper()
+ after = ref.fetch(chrom, coord_max, pos_after).upper()
+
+ return before, after
+
+
+def align(cigarlist, seq, ref):
+ """ insert gaps according to the cigar string
+ deletion: gaps to be inserted into read sequences,
+ insertions: gaps to be inserted into reference sequence """
+ lref = list(ref)
+ for nbr, idx in parse_cigar(cigarlist, 1):
+ lref[idx:idx] = ["-"] * nbr
+
+ lread = list(seq)
+ for nbr, idx in parse_cigar(cigarlist, 2):
+ lread[idx:idx] = ["-"] * nbr
+
+ return "".join(lread), "".join(lref)
+
+
+def align_with_qual(cigarlist, seq, qual, threshold, ref):
+ """ insert gaps according to the cigar string
+ deletion: gaps to be inserted into read sequences and qualities,
+ insertions: gaps to be inserted into reference sequence """
+ lref = list(ref)
+ for nbr, idx in parse_cigar(cigarlist, 1):
+ lref[idx:idx] = ["-"] * nbr
+
+ lread = list(seq)
+ lqual = list(qual)
+ for nbr, idx in parse_cigar(cigarlist, 2):
+ lread[idx:idx] = ["-"] * nbr
+ lqual[idx:idx] = ["-"] * nbr
+
+ for idx, score in enumerate(lqual):
+ # mask read and reference nucleotides (so not gaps) if below desired threshold
+ if (ord(score)-33) < threshold and lread[idx] != "-":
+ lread[idx] = "N"
+ lref[idx] = "N"
+
+ return "".join(lread), "".join(lqual), "".join(lref)
+
+
+def get_mis(read, seq, refseq, ref, length, tab, end):
+ """ count mismatches using aligned reference and read,
+ must be redone since N in reference were randomly replaced by any bases """
+ std = '-' if read.is_reverse else '+'
+ subtable = tab[ref][end][std]
+
+ for (i, nt_seq, nt_ref) in itertools.izip(xrange(length), seq, refseq):
+ if (nt_seq in "ACGT-") and (nt_ref in "ACGT-"):
+ if nt_ref != "-":
+ # record base composition in the reference, only A, C, G, T
+ subtable[nt_ref][i] += 1
+
+ # Most ref/seq pairs will be identical
+ if (nt_ref != nt_seq):
+ mut = "%s>%s" % (nt_ref, nt_seq)
+ subtable[mut][i] += 1
+
+
+def parse_cigar(cigarlist, ope):
+ """ for a specific operation (mismach, match, insertion, deletion... see above)
+ return occurences and index in the alignment """
+ tlength = 0
+ coordinate = []
+ # count matches, indels and mismatches
+ oplist = (0, 1, 2, 7, 8)
+ for operation, length in cigarlist:
+ if operation == ope:
+ coordinate.append([length, tlength])
+ if operation in oplist:
+ tlength += length
+ return coordinate
+
+
+def record_soft_clipping(read, tab, length):
+ """ record soft clipped bases at extremities """
+ def update_table(end, std, bases):
+ for i in range(0, min(bases, length)):
+ tab[end][std]['S'][i] += 1
+
+ strand = '-' if read.is_reverse else '+'
+ for (nbases, idx) in mapdamage.align.parse_cigar(read.cigar, 4):
+ if idx == 0:
+ # Soft-clipping at the left side of the alignment
+ end = '3p' if read.is_reverse else '5p'
+ else:
+ # Soft-clipping at the right side of the alignment
+ end = '5p' if read.is_reverse else '3p'
+
+ update_table(end, strand, nbases)
+
+
+
+
+
diff --git a/mapdamage/composition.py b/mapdamage/composition.py
new file mode 100644
index 0000000..882139a
--- /dev/null
+++ b/mapdamage/composition.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python
+
+import mapdamage
+import itertools
+import csv
+import subprocess
+import sys
+
+def count_ref_comp(read, chrom, before, after, comp):
+ """ record basae composition in external genomic regions """
+ std = '-' if read.is_reverse else '+'
+
+ _update_table(comp[chrom]['5p'][std], before, xrange(-len(before), 0))
+ _update_table(comp[chrom]['3p'][std], after, xrange(1, len(after) + 1))
+
+
+def count_read_comp(read, chrom, length, comp):
+ """ record base composition of read, discard marked nucleotides """
+ std, seq = '+', read.query
+ if read.is_reverse:
+ std, seq = '-', mapdamage.seq.revcomp(seq)
+
+ _update_table(comp[chrom]['5p'][std], seq, xrange(1, length + 1))
+ _update_table(comp[chrom]['3p'][std], reversed(seq), xrange(-1, - length - 1, -1))
+
+
+def _update_table(table, sequence, indices):
+ for (index, nt) in itertools.izip(indices, sequence):
+ if nt in table:
+ table[nt][index] += 1
+
+
+def get_base_comp(filename,destination=False):
+ """
+ Gets the basecomposition of all the sequences in filename
+ and returns the value to destination if given.
+ """
+ path_to_seqtk = mapdamage.rscript.construct_path("seqtk",folder="seqtk")
+ bases = {"A":0,"C":0,"G":0,"T":0}
+ alp = ["A","C","G","T"]
+ try:
+ proc = subprocess.Popen([path_to_seqtk,"comp",filename],stdout=subprocess.PIPE)
+ out = proc.communicate()[0]
+ for li in out.splitlines():
+ tabs = li.split() # 1 is the ref, 2 is the total and then the base counts A, C, G and T.
+ bases["A"] = bases["A"] + int(tabs[2])
+ bases["C"] = bases["C"] + int(tabs[3])
+ bases["G"] = bases["G"] + int(tabs[4])
+ bases["T"] = bases["T"] + int(tabs[5])
+ except (OSError, ValueError), error:
+ sys.stderr.write("Error: Seqtk failed: %s\n" % (error,))
+ sys.exit(1)
+ # get the base frequencies
+ ba_su = sum(bases.values())
+ for ba in alp:
+ bases[ba] = float(bases[ba])/float(ba_su)
+ if (destination==False):
+ return bases
+ else:
+ # write the results
+ fo = open(destination,"w")
+ vals = [str(bases[i]) for i in alp]
+ fo.write(",".join(alp)+"\n")
+ fo.write(",".join(vals)+"\n")
+ fo.close()
+
+def read_base_comp(filename):
+ """
+ Read the base compition from a file created by get_base_comp
+ """
+ fh = open(filename)
+ first_line = True
+ for li in fh:
+ li = li.rstrip()
+ lp = li.split()
+ if first_line:
+ header = lp
+ first_line = False
+ else:
+ body = lp
+ bases = {}
+ for ba,per in zip(header,body):
+ bases[ba] = per
+ return bases
diff --git a/mapdamage/parseoptions.py b/mapdamage/parseoptions.py
new file mode 100644
index 0000000..7102cf2
--- /dev/null
+++ b/mapdamage/parseoptions.py
@@ -0,0 +1,264 @@
+#!/usr/bin/env python
+
+from optparse import OptionParser, OptionGroup, SUPPRESS_HELP
+import os
+import sys
+
+from mapdamage.version import __version__
+from mapdamage.rscript import check_R_lib
+
+def file_exist(filename):
+ if os.path.exists(filename) and not os.path.isdir(filename):
+ return True
+ elif filename == "-":
+ return True
+ else:
+ sys.stderr.write("Error: '%s' is not a valid file\n" % (filename))
+ return None
+
+
+def whereis(program):
+ for path in os.environ.get('PATH', '').split(':'):
+ if os.path.exists(os.path.join(path, program)) and \
+ not os.path.isdir(os.path.join(path, program)):
+ return os.path.join(path, program)
+ return None
+
+
+
+def check_py_version():
+ req_version = (2, 6)
+ cur_version = sys.version_info
+
+ if cur_version >= req_version:
+ return True
+ else:
+ sys.stderr.write("Your Python interpreter is too old."\
+ "Please consider upgrading to at least %d.%d\n" % (req_version[0], req_version[1]))
+ return None
+
+
+def options():
+ parser = OptionParser("%prog [options] -i BAMfile -r reference.fasta\n\nUse option -h or --help for help", version=__version__, \
+ epilog="report bugs to aginolhac at snm.ku.dk, MSchubert at snm.ku.dk or jonsson.hakon at gmail.com")
+
+ args = OptionGroup(parser, "Input files")
+ args.add_option("-i", "--input", help="SAM/BAM file, must contain a valid header, use '-' for reading a BAM from stdin", \
+ action="store", type="string", dest="filename")
+ args.add_option("-r", "--reference", help="Reference file in FASTA format", \
+ action="store", dest="ref")
+
+ parser.add_option_group(args)
+ group = OptionGroup(parser, "General options")
+ group.add_option("-n", "--downsample", help = "Downsample to a randomly selected fraction of the reads (if 0 < DOWNSAMPLE < 1), or " \
+ "a fixed number of randomly selected reads (if DOWNSAMPLE >= 1). By default, no downsampling is performed.",
+ type = float, default = None)
+ group.add_option("--downsample-seed", help = "Seed value to use for downsampling. See documentation for py module 'random' for default behavior.",
+ type = int, default = None)
+ group.add_option("--merge-reference-sequences", help = "Ignore referece sequence names when tabulating reads (using '*' instead). "
+ "Useful for alignments with a large number of reference sequnces, which may otherwise result in excessive "
+ "memory or disk usage due to the number of tables generated.",
+ default = False, action = "store_true")
+ group.add_option("-l", "--length", dest="length", help="read length, in nucleotides to consider [%default]", \
+ type = int, default=70,action="store")
+ group.add_option("-a", "--around", dest="around", help="nucleotides to retrieve before/after reads [%default]", \
+ type = int, default=10,action="store")
+ group.add_option("-Q", "--min-basequal", dest="minqual", help="minimun base quality Phred score considered, Phred-33 assumed [%default]", \
+ type = int, default=0, action="store")
+ group.add_option("-d", "--folder", help="folder name to store results [results_FILENAME]", \
+ action="store", type="string", dest="folder")
+ group.add_option("-f", "--fasta", dest="fasta", help="Write alignments in a FASTA file", \
+ default=False,action="store_true")
+ group.add_option("--plot-only", dest="plot_only", help="Run only plotting from a valid result folder", \
+ default=False,action="store_true")
+ group.add_option("-q", "--quiet", dest="quiet", help="Disable any output to stdout", \
+ default=False,action="store_true")
+ group.add_option("-v", "--verbose", dest="verbose", help="Display progression information during parsing", \
+ default=False,action="store_true")
+ group.add_option("--mapdamage-modules", dest="mapdamage_modules", help="Override the system wide installed mapDamage module", \
+ default=None)
+ group.add_option("--no-plot", dest="no_r", help=SUPPRESS_HELP, default=False, action="store_true")
+ parser.add_option_group(group)
+
+ # options for plotting damage patterns
+ group2 = OptionGroup(parser, "Options for graphics")
+ group2.add_option("-y", "--ymax", dest="ymax", \
+ help="graphical y-axis limit for nucleotide misincorporation frequencies [%default]", type = float, \
+ default=0.3,action="store")
+ group2.add_option("-m", "--readplot", dest="readplot", \
+ help="read length, in nucleotides, considered for plotting nucleotide misincorporations [%default]", \
+ type = int, default=25, action="store")
+ group2.add_option("-b", "--refplot", dest="refplot", \
+ help="the number of reference nucleotides to consider for ploting base composition in the region located upstream "
+ "and downstream of every read [%default]", type= int, default=10, action="store")
+ group2.add_option("-t", "--title", dest="title", \
+ help="title used for plots [%default]", \
+ type="string", default="",action="store")
+ parser.add_option_group(group2)
+
+ # Then the plethora of optional options for the statistical estimation ..
+ group3 = OptionGroup(parser,"Options for the statistical estimation")
+ group3.add_option("", "--rand", dest="rand", \
+ help="Number of random starting points for the likelihood optimization [%default]", type = int, default=30, action="store")
+ group3.add_option("", "--burn", dest="burn", \
+ help="Number of burnin iterations [%default]", type = int, default=10000,action="store")
+ group3.add_option("", "--adjust", dest="adjust", \
+ help="Number of adjust proposal variance parameters iterations [%default]", type = int, default=10, action="store")
+ group3.add_option("", "--iter", dest="iter", \
+ help="Number of final MCMC iterations [%default]", type = int, default=50000, action="store")
+ group3.add_option("", "--forward", dest="forward", \
+ help="Using only the 5' end of the seqs [%default]", default=False, action="store_true")
+ group3.add_option("", "--reverse", dest="reverse", \
+ help="Using only the 3' end of the seqs [%default]", default=False, action="store_true")
+ group3.add_option("", "--var-disp", dest="var_disp", \
+ help="Variable dispersion in the overhangs [%default]", default=False,action="store_true")
+ group3.add_option("", "--jukes-cantor", dest="jukes_cantor", \
+ help="Use Jukes Cantor instead of HKY85 [%default]", default=False,action="store_true")
+ group3.add_option("", "--diff-hangs", dest="diff_hangs", \
+ help="The overhangs are different for 5' and 3' [%default]", default=False, action="store_true")
+ group3.add_option("", "--fix-nicks" , dest="fix_nicks", \
+ help="Fix the nick frequency vector (Only C.T from the 5' end and G.A from the 3' end) [%default]", default=False, action="store_true")
+ group3.add_option("", "--use-raw-nick-freq" , dest="use_raw_nick_freq", \
+ help="Use the raw nick frequency vector without smoothing [%default]", default=False, action="store_true")
+ group3.add_option("", "--single-stranded", dest="single_stranded", \
+ help="Single stranded protocol [%default]", default=False, action="store_true")
+ group3.add_option("", "--theme-bw", dest="theme_bw", \
+ help="Use black and white theme in post. pred. plot [%default]", default=False, action="store_true")
+ group3.add_option("", "--seq-length", dest="seq_length", \
+ help="How long sequence to use from each side [%default]", type = int, default=12, action="store")
+ group3.add_option("--stats-only", dest="stats_only", help="Run only statistical estimation from a valid result folder", \
+ default=False, action="store_true")
+ group3.add_option("--rescale", dest="rescale", help="Rescale the quality scores in the BAM file using the output from the statistical estimation", \
+ default=False, action="store_true")
+ group3.add_option("--rescale-only", dest="rescale_only", help="Run only rescaling from a valid result folder", \
+ default=False, action="store_true")
+ group3.add_option("--rescale-out", dest="rescale_out", help="Write the rescaled BAM to this file", \
+ default=None, action="store")
+ group3.add_option("--no-stats", help="Disabled statistical estimation, active by default", default=False, action="store_true")
+ group3.add_option("--check-R-packages", help="Check if the R modules are working", default=False, action="store_true")
+
+ parser.add_option_group(group3)
+
+ #Parse the arguments
+ (options, args) = parser.parse_args()
+
+ # check python version
+ if not check_py_version():
+ return None
+
+ # if the user wants to check the R packages then do that before the option parsing
+ if options.check_R_packages:
+ if check_R_lib():
+ sys.exit(1)
+ else:
+ print("All R packages are present")
+ sys.exit(0)
+
+ # check general arguments
+ if not (options.plot_only or options.stats_only) and not options.filename:
+ parser.error('SAM/BAM file not given (-i)')
+ if not (options.plot_only or options.ref):
+ parser.error('Reference file not given (-r)')
+ if not options.plot_only and not options.stats_only:
+ if not file_exist(options.filename) or not file_exist(options.ref):
+ return None
+ if options.downsample is not None:
+ if options.downsample <= 0:
+ parser.error("-n/--downsample must be a positive value")
+ elif options.downsample >= 1:
+ options.downsample = int(options.downsample)
+
+ if options.plot_only and not options.folder:
+ parser.error('Folder not provided, required with --plot-only')
+ if options.stats_only and not options.folder:
+ parser.error('Folder not provided, required with --stats-only')
+ if options.rescale_only and not options.folder:
+ parser.error('Folder not provided, required with --rescale-only')
+ if options.rescale_only and not options.filename:
+ parser.error('Input bam not provided, required with --rescale-only')
+ if options.rescale_only and not options.ref:
+ parser.error('Reference not provided, required with --rescale-only')
+
+ if options.verbose and options.quiet:
+ parser.error('Cannot use verbose and quiet option at the same time')
+
+ # check options
+ if options.length < 0:
+ parser.error('length (-l) must be a positive integrer')
+ if options.around < 0:
+ parser.error('around (-a) must be a positive integrer')
+ if options.ymax <= 0 or options.ymax > 1:
+ parser.error('ymax (-b) must be an real number beetween 0 and 1')
+ if options.readplot < 0:
+ parser.error('readplot (-m) must be a positive integrer')
+ if options.refplot < 0:
+ parser.error('refplot (-b) must be a positive integrer')
+ if options.refplot > options.around and not options.plot_only:
+ parser.error('refplot (-b) must be inferior to around (-a)')
+ if options.readplot > options.length:
+ parser.error('readplot (-m) must be inferior to length (-l)')
+ if options.minqual < 0 or options.minqual > 41:
+ parser.error('minimal base quality, Phred score, must be within this range: 0 - 41')
+
+ # check statistic options
+ if options.forward and options.reverse:
+ parser.error('Cannot use only forward end and only reverse end for the statistics')
+
+ # use filename as default for plot titles if not set
+ if options.title == "" and options.filename:
+ options.title = os.path.splitext(os.path.basename(options.filename))[0]
+ # for --plot-only, use the folder name, without results_ as title
+ if options.title == "" and not options.filename and options.folder:
+ options.title = os.path.splitext(os.path.basename(options.folder))[0].replace("results_", "")
+
+ # check folder
+ if not options.folder and options.filename:
+ options.folder = "results_"+os.path.splitext(os.path.basename(options.filename))[0]
+
+ # check destination for rescaled bam
+ if not options.rescale_out and (options.rescale or options.rescale_only):
+ # if there are mulitiple bam files to rescale then pick first one as
+ # the name of the rescaled file
+ if isinstance(options.filename,list):
+ basename = os.path.basename(options.filename[0])
+ else:
+ basename = os.path.basename(options.filename)
+ with_ext = os.path.splitext(basename)[0] + ".rescaled.bam"
+ options.rescale_out = os.path.join(options.folder, with_ext)
+
+ if os.path.isdir(options.folder):
+ if not options.quiet and not options.plot_only:
+ print("Warning, %s already exists" % options.folder)
+ if options.plot_only:
+ if not file_exist(options.folder+"/dnacomp.txt") or not file_exist(options.folder+"/misincorporation.txt"):
+ parser.error('folder %s is not a valid result folder' % options.folder)
+ else:
+ os.makedirs(options.folder, mode = 0750)
+ if options.plot_only or options.stats_only or options.rescale_only:
+ sys.stderr.write("Error, %s does not exist while plot/stats/rescale only was used\n" % options.folder)
+ return None
+
+
+ # check if the Rscript executable is present on the system
+ if not whereis('Rscript'):
+ print("Warning, Rscript is not in your PATH, plotting is disabled")
+ options.no_r = True
+
+ # check the nick frequencies options
+ if (options.use_raw_nick_freq + options.fix_nicks + options.single_stranded)>1:
+ parser.error("The options --use-raw-nick-freq, --fix-nicks and --single-stranded are mutually exclusive.")
+
+ if check_R_lib():
+ # check for R libraries
+ print("The Bayesian estimation has been disabled\n")
+ options.no_stats = True
+ if options.stats_only:
+ sys.exit("Cannot use --stats-only with missing R libraries")
+ if options.rescale:
+ sys.exit("Cannot use --rescale with missing R libraries")
+ if options.rescale_only:
+ sys.exit("Cannot use --rescale-only with missing R libraries")
+
+
+ return options
+
diff --git a/mapdamage/rescale.py b/mapdamage/rescale.py
new file mode 100644
index 0000000..789d21e
--- /dev/null
+++ b/mapdamage/rescale.py
@@ -0,0 +1,352 @@
+import csv
+import sys
+import os
+import mapdamage
+import pysam
+import itertools
+import math
+import logging
+import time
+
+
+def phred_pval_to_char(pval):
+ """Transforming error rate to ASCII character using the Phred scale"""
+ return chr(int(round(-10*math.log10(abs(pval)))+33))
+
+def phred_char_to_pval(ch):
+ """Transforming ASCII character in the Phred scale to the error rate"""
+ return 10**(-(float(ord(ch))-float(33))/10)
+
+def get_corr_prob(folder):
+ """
+ Reads the damage probability correction file, returns a
+ dictionary with this structure
+ position (one based) - CT - probability
+ - GA - probability
+ """
+ full_path = os.path.join(folder,"Stats_out_MCMC_correct_prob.csv")
+ if not os.path.isfile(full_path):
+ sys.exit("Missing file, the file \n\tStats_out_MCMC_correct_prob.csv\nshould be in the folder\n\t"+folder+"\nDid you run the MCMC estimates of the parameters?")
+ try:
+ fi_handle = csv.DictReader(open(full_path))
+ corr_prob = {}
+ for line in fi_handle:
+ if (corr_prob.has_key(line["Position"])):
+ sys.exit('This file has multiple position definitions %s, line %d: %s' % \
+ (folder, fi_handle.line_num, corr_prob[line["Position"]]))
+ else:
+ corr_prob[int(line["Position"])] = {'C.T':float(line["C.T"]), 'G.A':float(line["G.A"])}
+ return corr_prob
+ except csv.Error as e:
+ sys.exit('File %s, line %d: %s' % (os.path.join(folder,"Stats_out_MCMC_correct_prob.csv"), \
+ fi_handle.line_num, e))
+
+
+def corr_this_base(corr_prob, nt_seq, nt_ref, pos, length,direction="both"):
+ """
+ The position specific damaging correction, using the input
+ corr_prob dictionary holding the damage correcting values
+ nt_seq nucleotide in the sequence
+ nt_ref nucleotide in the reference
+ pos relative position from the 5' end
+ length length of the sequence
+ direction which end to consider the rescaling
+ returns the correction probability for this particular set
+ """
+ if (pos == 0):
+ # not using 0 based indexing
+ raise SystemError
+ if ( nt_seq == "T" and nt_ref == "C" ):
+ # an C to T transition
+ subs = "C.T"
+ elif( nt_seq == "A" and nt_ref == "G" ):
+ # an G to A transition
+ subs = "G.A"
+ else:
+ # other transitions/transversions are not affected by damage
+ return 0
+
+ back_pos = pos-length-1
+ # position from 3' end
+
+ if corr_prob.has_key(pos):
+ p5_corr = corr_prob[pos][subs]
+ # correction from 5' end
+ else:
+ p5_corr = 0
+
+ if corr_prob.has_key(back_pos):
+ p3_corr = corr_prob[back_pos][subs]
+ # correction from 3' end
+ else:
+ p3_corr = 0
+
+ if direction == "forward":
+ return p5_corr
+ elif direction == "backward":
+ return p3_corr
+ elif direction == "both":
+ if pos < abs(back_pos) :
+ # then we use the forward correction
+ return p5_corr
+ else :
+ # else the backward correction
+ return p3_corr
+ else:
+ # this should not happen
+ raise SystemExit("Abnormal direction in the rescaling procedure")
+
+def initialize_subs():
+ """Initialize a substitution table, to track the expected substitution counts"""
+ per_qual = dict(zip(range(0,130),[0]*130))
+ subs = {"CT-before":per_qual.copy(),\
+ "TC-before":per_qual.copy(),\
+ "GA-before":per_qual.copy(),\
+ "AG-before":per_qual.copy(),\
+ "CT-after":per_qual.copy(),\
+ "TC-after":per_qual.copy(),\
+ "GA-after":per_qual.copy(),\
+ "AG-after":per_qual.copy(),\
+ "A":0,\
+ "C":0,\
+ "G":0,\
+ "T":0,\
+ "CT-pvals":0.0,\
+ "CT-pvals_before":0.0,\
+ "TC-pvals":0.0,\
+ "GA-pvals":0.0,\
+ "GA-pvals_before":0.0,\
+ "AG-pvals":0.0,\
+ }
+ return subs
+
+
+
+def record_subs(subs,nt_seq,nt_ref,nt_qual,nt_newqual,prob_corr):
+ """ record the expected substitution change, prob_corr is the excact version for nt_qual"""
+ if ( nt_seq == "T" and nt_ref == "C"):
+ sub_type = "CT"
+ subs["CT-pvals"] += prob_corr
+ subs["CT-pvals_before"] += 1-phred_char_to_pval(nt_qual)
+ elif ( nt_seq == "A" and nt_ref == "G"):
+ sub_type = "GA"
+ subs["GA-pvals"] += prob_corr
+ subs["GA-pvals_before"] += 1-phred_char_to_pval(nt_qual)
+ elif ( nt_seq == "C" and nt_ref == "T"):
+ sub_type = "TC"
+ subs["TC-pvals"] += 1-phred_char_to_pval(nt_qual)
+ if (nt_qual != nt_newqual):
+ raise SystemError("Internal error: rescaling qualities for the wrong transitions")
+ elif ( nt_seq == "G" and nt_ref == "A"):
+ sub_type = "AG"
+ subs["AG-pvals"] += 1-phred_char_to_pval(nt_qual)
+ if (nt_qual != nt_newqual):
+ raise SystemError("Internal error: rescaling qualities for the wrong transitions")
+ else:
+ sub_type = "NN"
+ if (sub_type != "NN"):
+ # record only transitions
+ subs[sub_type+"-before"][int(ord(nt_qual))-33] += 1
+ subs[sub_type+"-after"][int(ord(nt_newqual))-33] += 1
+ if (nt_ref in ["A","C","G","T"]):
+ subs[nt_ref] += 1
+
+def qual_summary_subs(subs):
+ """Calculates summary statistics for the substition table subs"""
+ for i in ["CT-before","TC-before","GA-before","AG-before","CT-after","TC-after","GA-after","AG-after"]:
+ for lv in [0,10,20,30,40]:
+ for qv in subs[i]:
+ if qv >= lv :
+ key = i+"-Q"+str(lv)
+ if subs.has_key(key):
+ subs[key] += subs[i][qv]
+ else:
+ subs[key] = subs[i][qv]
+
+def print_subs(subs):
+ """Print the substition table"""
+ print("\tThe expected substition frequencies before and after scaling using the scaled qualities as probalities:")
+ if subs["C"]!=0:
+ # the special case of no substitutions
+ print("\tCT\t"+str(subs["CT-pvals_before"]/subs["C"])+"\t\t"+str(subs["CT-pvals"]/subs["C"]))
+ else:
+ print("\tCT\tNA\t\tNA")
+ if subs["T"]!=0:
+ print("\tTC\t"+str(subs["TC-pvals"]/subs["T"])+"\t\t"+str(subs["TC-pvals"]/subs["T"]))
+ else:
+ print("\tTC\tNA\t\tNA")
+ if subs["G"]!=0:
+ print("\tGA\t"+str(subs["GA-pvals_before"]/subs["G"])+"\t\t"+str(subs["GA-pvals"]/subs["G"]))
+ else:
+ print("\tGA\tNA\t\tNA")
+ if subs["A"]!=0:
+ print("\tAG\t"+str(subs["AG-pvals"]/subs["A"])+"\t\t"+str(subs["AG-pvals"]/subs["A"]))
+ else:
+ print("\tAG\tNA\t\tNA")
+ print("\tQuality metrics before and after scaling")
+ print("\tCT-Q0 \t"+str(subs["CT-before-Q0"])+"\t\t"+str(subs["CT-after-Q0"]))
+ print("\tCT-Q10 \t"+str(subs["CT-before-Q10"])+"\t\t"+str(subs["CT-after-Q10"]))
+ print("\tCT-Q20 \t"+str(subs["CT-before-Q20"])+"\t\t"+str(subs["CT-after-Q20"]))
+ print("\tCT-Q30 \t"+str(subs["CT-before-Q30"])+"\t\t"+str(subs["CT-after-Q30"]))
+ print("\tCT-Q40 \t"+str(subs["CT-before-Q40"])+"\t\t"+str(subs["CT-after-Q40"]))
+ print("\tGA-Q0 \t"+str(subs["GA-before-Q0"])+"\t\t"+str(subs["GA-after-Q0"]))
+ print("\tGA-Q10 \t"+str(subs["GA-before-Q10"])+"\t\t"+str(subs["GA-after-Q10"]))
+ print("\tGA-Q20 \t"+str(subs["GA-before-Q20"])+"\t\t"+str(subs["GA-after-Q20"]))
+ print("\tGA-Q30 \t"+str(subs["GA-before-Q30"])+"\t\t"+str(subs["GA-after-Q30"]))
+ print("\tGA-Q40 \t"+str(subs["GA-before-Q40"])+"\t\t"+str(subs["GA-after-Q40"]))
+
+
+def rescale_qual_read(bam, read, ref, corr_prob,subs, debug = False,direction="both"):
+ """
+ bam a pysam bam object
+ read a pysam read object
+ ref a pysam fasta ref file
+ reflengths a dictionary holding the length of the references
+ subs a dictionary holding the corrected number of substition before and after scaling
+ corr_prob dictionary from get_corr_prob
+ returns a read with rescaled quality score
+
+ Iterates through the read and reference, rescales the quality
+ according to corr_prob
+ """
+ if not debug:
+ # no need to log when unit testing
+ logger = logging.getLogger(__name__)
+ raw_seq = read.query
+ # external coordinates 5' and 3' , 0-based offset
+ coordinate = mapdamage.align.get_coordinates(read)
+ # fetch reference name, chromosome or contig names
+ chrom = bam.getrname(read.tid)
+ refseq = ref.fetch(chrom, min(coordinate), max(coordinate)).upper()
+ # add gaps to qualities and mask read and reference nucleotides if below desired threshold
+ (seq, qual, refseq) = mapdamage.align.align_with_qual(read.cigar, \
+ raw_seq, read.qqual, -100, refseq)
+ length_read = len(raw_seq)
+ length_align = len(seq)
+ # reverse complement read and reference when mapped reverse strand
+ if read.is_reverse:
+ refseq = mapdamage.seq.revcomp(refseq)
+ seq = mapdamage.seq.revcomp(seq)
+ qual = qual[::-1]
+ new_qual = [-100]*length_read
+ pos_on_read = 0
+ number_of_rescaled_bases = 0.0
+ for (i, nt_seq, nt_ref, nt_qual) in itertools.izip(xrange(length_align), seq, refseq, qual):
+ # rescale the quality according to the triplet position,
+ # pair of the reference and the sequence
+ if ((nt_seq == "T" and nt_ref =="C") or (nt_seq == "A" and nt_ref =="G")):
+ # need to rescale this subs.
+ pdam = 1 - corr_this_base(corr_prob, nt_seq, nt_ref, pos_on_read + 1, length_read,direction=direction)
+ pseq = 1 - phred_char_to_pval(nt_qual)
+ newp = pdam*pseq # this could be numerically unstable
+ newq = phred_pval_to_char(1-newp)
+ number_of_rescaled_bases += 1-pdam
+ else:
+ # don't rescale, other bases
+ newp = 1 - phred_char_to_pval(nt_qual)
+ newq = nt_qual
+ if pos_on_read < length_read:
+ new_qual[pos_on_read] = newq
+ record_subs(subs,nt_seq,nt_ref,nt_qual,new_qual[pos_on_read],newp)
+ if nt_seq != "-":
+ pos_on_read += 1
+ # done with the aligned portion of the read
+ else:
+ if not debug:
+ logger.warning("Warning: The aligment of the read is longer than the actual read %s",(read.qname))
+ break
+ new_qual = "".join(new_qual)
+
+ if read.is_reverse:
+ new_qual = new_qual[::-1]
+ if (read.cigar[0][0] == 4):
+ # check for soft clipping at forward end
+ new_qual = read.qual[0:read.cigar[0][1]] + new_qual
+ if (read.cigar[-1][0] == 4):
+ # the same backwards
+ new_qual = new_qual + read.qual[-read.cigar[-1][1]:]
+
+ read.qual = new_qual
+ # truncate this to 5 digits
+ number_of_rescaled_bases = float("%.5f" % number_of_rescaled_bases)
+ # check if the read has a MR tag
+ for tag in read.tags:
+ if tag[0] == "MR":
+ raise SystemExit("Read: %s already has a MR tag, can't rescale" % (read))
+ read.tags = read.tags + [("MR:f",number_of_rescaled_bases)]
+ return read
+
+
+def rescale_qual(ref, options,debug=False):
+ """
+ ref a pysam fasta ref file
+ bam_filename name of a BAM/SAM file to read
+ fi file containing the csv with correction probabilities
+ reflengths dictionary with the reference lengths
+ options options from the command line parsing
+
+ Iterates through BAM file, makes a new BAM file with rescaled qualities.
+ """
+ if not debug:
+ # no need to log when unit testing
+ logger = logging.getLogger(__name__)
+ logger.info("Rescaling BAM: '%s' -> '%s'" % (options.filename, options.rescale_out))
+ start_time = time.time()
+
+ # open SAM/BAM files
+ bam = pysam.Samfile(options.filename)
+ if debug:
+ write_mode = "wh"
+ else:
+ write_mode = "wb"
+ bam_out = pysam.Samfile(options.rescale_out,write_mode, template = bam)
+ corr_prob = get_corr_prob(options.folder)
+ subs = initialize_subs()
+ first_pair = True
+ number_of_non_proper_pairs = 0
+
+ for hit in bam:
+ if hit.is_unmapped:
+ pass
+ elif not hit.qual and not debug:
+ logger.warning("Cannot rescale base PHRED scores for read '%s'; no scores assigned." % hit.qname)
+ elif hit.is_paired :
+ if first_pair and not debug:
+ # assuming the ends are non-overlapping
+ logger.warning("Warning! Assuming the pairs are non-overlapping, facing inwards and correctly paired.")
+ first_pair=False
+ #5p --------------> 3p
+ #3p <-------------- 5p
+ # pair 1 (inwards)
+ #5p ---->
+ # <---- 5p
+ # A B
+ # pair 2 (outwards), this happens if the reference is RC this is not supported
+ # ----> 3p
+ #3p <----
+ # A B
+ # Correct outwards pairs from the 3p and inwards pairs with the 5p end
+ if ((not hit.is_reverse) and hit.mate_is_reverse and (hit.pnext>hit.pos) and hit.tid==hit.mrnm):
+ # the inwards case mate A
+ hit = rescale_qual_read(bam, hit, ref, corr_prob,subs,direction="forward",debug=debug)
+ elif (hit.is_reverse and (not hit.mate_is_reverse) and (hit.pnext<hit.pos) and hit.tid==hit.mrnm):
+ # the inwards case mate B
+ hit = rescale_qual_read(bam, hit, ref, corr_prob,subs,direction="forward",debug=debug)
+ else:
+ number_of_non_proper_pairs += 1
+ # cannot do much with conflicting pairing information
+ else:
+ hit = rescale_qual_read(bam, hit, ref, corr_prob,subs,debug=debug)
+
+ bam_out.write(hit)
+ if number_of_non_proper_pairs!=0 and not debug:
+ logger.warning("Number of non-rescaled reads due to improper pairing: %d" % number_of_non_proper_pairs)
+ if (subs["TC-before"] != subs["TC-after"] or subs["AG-before"] != subs["AG-after"]):
+ sys.exit("Qualities for T.C and A.G transitions should not change in the rescaling, please contact the authors.")
+ qual_summary_subs(subs)
+ bam.close()
+ bam_out.close()
+ if not options.quiet:
+ print_subs(subs)
+ if not debug:
+ logger.debug("Rescaling completed in %f seconds" % (time.time() - start_time,))
diff --git a/mapdamage/rescale_test.py b/mapdamage/rescale_test.py
new file mode 100644
index 0000000..df86575
--- /dev/null
+++ b/mapdamage/rescale_test.py
@@ -0,0 +1,57 @@
+import unittest
+import optparse
+import rescale
+import pysam
+import filecmp
+
+def mock_options(filename,rescale_out,folder):
+ """Make the options object with nice values for testing"""
+ return optparse.Values({
+ "filename":filename,
+ "rescale_out":rescale_out,
+ "verbose":True,
+ "folder":folder,
+ "quiet":True
+ })
+
+
+class testPairedFile(unittest.TestCase):
+ """Tests if rescaling of a paired end file"""
+ def test_paired_end_file(self):
+ """Test, paired end SAM file"""
+ # here is the example
+ # >ref
+ # 5p CGA AAA CGA 3p
+ # 3p GCT TTT GCT 5p
+ # >r001/1
+ # CGA
+ # >r001/2
+ # GCA
+ # The sam file looks like this
+ #@HD VN:1.5 SO:coordinate
+ #@SQ SN:ref LN:9
+ #r001 163 ref 1 30 3M = 7 9 CGA III MR:f:0 #the normal ones
+ #r001 83 ref 7 30 3M = 1 -9 TCG III MR:f:0
+ #r002 163 ref 1 30 3M = 7 9 TGA III MR:f:0.9 #With one dam subs
+ #r002 83 ref 7 30 3M = 1 -9 CAA III MR:f:0.009
+ #r003 83 ref 1 30 3M = 7 9 TGA III #The reverse complement, should not rescale (thus no flags)
+ #r003 163 ref 7 30 3M = 1 -9 CAA III
+ #
+ #hand calculated the correct rescaled sam file in pe_rescaled_correct.sam
+ options = mock_options("rescale_test/pe_test/pe.sam","rescale_test/pe_test/pe_rescaled.sam","rescale_test/pe_test/pe_output/")
+ ref = pysam.Fastafile("rescale_test/pe_test/ref.fa")
+ rescale.rescale_qual(ref,options,debug=True)
+ self.assertTrue(filecmp.cmp("rescale_test/pe_test/pe_rescaled.sam","rescale_test/pe_test/pe_rescaled_correct.sam"))
+
+class testCases(unittest.TestCase):
+ """Various cases that failed"""
+ def test_longalignshortread(self):
+ """Check if fails on an aligment longer than the read"""
+ prefix="rescale_test/long_align/"
+ options = mock_options(prefix+"pe.sam",prefix+"pe_out.sam",prefix+"pe_output/")
+ ref = pysam.Fastafile("rescale_test/pe_test/ref.fa")
+ rescale.rescale_qual(ref,options,debug=True)
+ #Should run without an error
+
+if __name__=='__main__':
+ unittest.main()
diff --git a/mapdamage/rescale_test/long_align/pe.sam b/mapdamage/rescale_test/long_align/pe.sam
new file mode 100644
index 0000000..92c9427
--- /dev/null
+++ b/mapdamage/rescale_test/long_align/pe.sam
@@ -0,0 +1,3 @@
+ at HD VN:1.5 SO:coordinate
+ at SQ SN:ref LN:9
+r001 163 ref 1 30 3M2D = 7 9 CGA III
diff --git a/mapdamage/rescale_test/long_align/pe_output/Stats_out_MCMC_correct_prob.csv b/mapdamage/rescale_test/long_align/pe_output/Stats_out_MCMC_correct_prob.csv
new file mode 100644
index 0000000..e295f35
--- /dev/null
+++ b/mapdamage/rescale_test/long_align/pe_output/Stats_out_MCMC_correct_prob.csv
@@ -0,0 +1,5 @@
+"","Position","C.T","G.A"
+"1",1,0.9,0
+"2",2,0.009,0
+"3",-2,0,0.09
+"4",-1,0,0.9
diff --git a/mapdamage/rescale_test/long_align/ref.fa b/mapdamage/rescale_test/long_align/ref.fa
new file mode 100644
index 0000000..29e24ff
--- /dev/null
+++ b/mapdamage/rescale_test/long_align/ref.fa
@@ -0,0 +1,2 @@
+>ref
+CGAAAACGA
diff --git a/mapdamage/rescale_test/pe_test/pe.sam b/mapdamage/rescale_test/pe_test/pe.sam
new file mode 100644
index 0000000..5910991
--- /dev/null
+++ b/mapdamage/rescale_test/pe_test/pe.sam
@@ -0,0 +1,8 @@
+ at HD VN:1.5 SO:coordinate
+ at SQ SN:ref LN:9
+r001 163 ref 1 30 3M = 7 9 CGA III
+r001 83 ref 7 30 3M = 1 -9 CGA III
+r002 163 ref 1 30 3M = 7 9 TGA III
+r002 83 ref 7 30 3M = 1 -9 CAA III
+r003 83 ref 1 30 3M = 7 9 TGA III
+r003 163 ref 7 30 3M = 1 -9 CAA III
diff --git a/mapdamage/rescale_test/pe_test/pe_output/Stats_out_MCMC_correct_prob.csv b/mapdamage/rescale_test/pe_test/pe_output/Stats_out_MCMC_correct_prob.csv
new file mode 100644
index 0000000..e295f35
--- /dev/null
+++ b/mapdamage/rescale_test/pe_test/pe_output/Stats_out_MCMC_correct_prob.csv
@@ -0,0 +1,5 @@
+"","Position","C.T","G.A"
+"1",1,0.9,0
+"2",2,0.009,0
+"3",-2,0,0.09
+"4",-1,0,0.9
diff --git a/mapdamage/rescale_test/pe_test/pe_rescaled_correct.sam b/mapdamage/rescale_test/pe_test/pe_rescaled_correct.sam
new file mode 100644
index 0000000..9b45850
--- /dev/null
+++ b/mapdamage/rescale_test/pe_test/pe_rescaled_correct.sam
@@ -0,0 +1,8 @@
+ at HD VN:1.5 SO:coordinate
+ at SQ SN:ref LN:9
+r001 163 ref 1 30 3M = 7 9 CGA III MR:f:0
+r001 83 ref 7 30 3M = 1 -9 CGA III MR:f:0
+r002 163 ref 1 30 3M = 7 9 TGA !II MR:f:0.9
+r002 83 ref 7 30 3M = 1 -9 CAA I5I MR:f:0.009
+r003 83 ref 1 30 3M = 7 9 TGA III
+r003 163 ref 7 30 3M = 1 -9 CAA III
diff --git a/mapdamage/rescale_test/pe_test/ref.fa b/mapdamage/rescale_test/pe_test/ref.fa
new file mode 100644
index 0000000..29e24ff
--- /dev/null
+++ b/mapdamage/rescale_test/pe_test/ref.fa
@@ -0,0 +1,2 @@
+>ref
+CGAAAACGA
diff --git a/mapdamage/rscript.py b/mapdamage/rscript.py
new file mode 100644
index 0000000..9019aeb
--- /dev/null
+++ b/mapdamage/rscript.py
@@ -0,0 +1,153 @@
+#!/usr/bin/env python
+
+import os
+import subprocess
+from subprocess import CalledProcessError, check_call
+from mapdamage.version import __version__
+import mapdamage
+import logging
+import time
+
+
+def construct_path(name,folder="Rscripts"):
+ """Construct a path to the mapdamage package data given the name"""
+ return(os.path.join(mapdamage.__path__[0], folder, name))
+
+def plot(opt):
+ """
+ Calls the R script to make the plots, takes in the arguments
+ from parameter processing
+ """
+ # comp<-args[1]
+ # pdfout<-args[2]
+ # around<-as.numeric(args[3])
+ # misincorp<-args[4]
+ # lg<-as.numeric(args[5])
+ # ymax<-as.numeric(args[6])
+ # folder<-args[7]
+ # title<-args[8]
+ # version<-args[9]
+
+ fmut = opt.folder+"/"+"misincorporation.txt"
+ fcomp = opt.folder+"/"+"dnacomp.txt"
+ title = opt.folder+"/"+"Fragmisincorporation_plot.pdf"
+
+ script = construct_path("mapDamage.R")
+ call = ["Rscript", script, fcomp, title, opt.refplot, fmut, opt.readplot, \
+ opt.ymax, opt.folder, opt.title, __version__]
+ code = subprocess.call(map(str, call))
+
+ if code == 0:
+ if not opt.quiet:
+ print("pdf %s generated" % title)
+ return 0
+ else:
+ print("Error: plotting with R failed")
+ return 1
+
+
+
+def opt_plots(opt):
+ """ optional plots for length distribution
+ and cumulative C>T mutations, per strand """
+
+ fmut = opt.folder+"/"+"misincorporation.txt"
+ flength = opt.folder+"/"+"lgdistribution.txt"
+ output = opt.folder+"/"+"Length_plot.pdf"
+
+ script = construct_path("lengths.R")
+ call = ["Rscript", script, flength, output, fmut, opt.length, \
+ opt.title, __version__, bool_to_int(opt.quiet)]
+ code = subprocess.call(map(str, call))
+ if code == 0:
+ return 0
+ else:
+ print("Error: plotting with R failed")
+ return 1
+
+
+def check_R_one_lib(name):
+ """Checks if a necessary R library is here."""
+ with open(os.devnull, "w") as devnull:
+ rpa = construct_path("stats/checkLibraries.R")
+ return subprocess.call(["Rscript", rpa, "--args", name],
+ stdout = devnull,
+ stderr = devnull)
+
+
+def check_R_lib():
+ """
+ Checks if the necessary R libraries are here, signal
+ otherwise
+ """
+ libs = ["inline", "ggplot2", "gam", "Rcpp", "RcppGSL"]
+ missing_lib = []
+ for lib in libs:
+ # Check the libraries
+ if check_R_one_lib(lib):
+ #found a missing library
+ missing_lib.append(lib)
+ if len(missing_lib) > 1:
+ # Grammar Nazi has arrived
+ last_ele = missing_lib.pop()
+ print ("Missing the following R libraries '" + "', '".join(missing_lib) \
+ + "' and '" + last_ele + "'")
+ return 1
+ elif len(missing_lib) == 1:
+ print ("Missing the following R library "+missing_lib[0])
+ return 1
+ else :
+ # No missing libraries
+ return 0
+
+
+def run_stats(opt):
+ """
+ Runs the Bayesian estimation program, using the options opt
+ """
+ arg = ["Rscript", \
+ construct_path("stats/runGeneral.R"), \
+ "--args", \
+ opt.rand, \
+ opt.burn, \
+ opt.adjust, \
+ opt.iter, \
+ bool_to_int(opt.forward), \
+ bool_to_int(opt.reverse), \
+ bool_to_int(not opt.var_disp), \
+ bool_to_int(not opt.diff_hangs), \
+ 0, \
+ bool_to_int(opt.fix_nicks), \
+ bool_to_int(not opt.single_stranded), \
+ opt.seq_length, \
+ opt.folder+"/", \
+ construct_path("stats/"), \
+ opt.folder+"/Stats_out", \
+ int(opt.verbose), \
+ int(opt.quiet), \
+ int(opt.jukes_cantor), \
+ opt.folder+"/acgt_ratio.csv", \
+ int(opt.use_raw_nick_freq), \
+ int(opt.theme_bw) \
+ ]
+ arg = [str(i) for i in arg]
+
+ logger = logging.getLogger(__name__)
+ logger.info("Performing Bayesian estimates")
+ logger.debug("Call: %s" % (" ".join(arg),))
+ start_time = time.time()
+
+ try:
+ check_call(arg)
+ except CalledProcessError as e:
+ logger.error("The Bayesian statistics program failed to finish")
+ raise e
+
+ logger.debug("Bayesian estimates completed in %f seconds" % (time.time() - start_time,))
+ return 0
+
+
+def bool_to_int(boolean):
+ """ return 0 for False and 1 for True """
+ result = 1 if boolean else 0
+ return result
diff --git a/mapdamage/seq.py b/mapdamage/seq.py
new file mode 100644
index 0000000..a3d2d58
--- /dev/null
+++ b/mapdamage/seq.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python
+import sys
+import string
+
+# from Martin Kircher, to complement DNA
+TABLE = string.maketrans('TGCAMRWSYKVHDBtgcamrwsykvhdb', \
+ 'ACGTKYWSRMBDHVacgtkywsrmbdhv')
+
+LETTERS = ("A", "C", "G", "T")
+MUTATIONS = ('G>A', 'C>T', 'A>G', 'T>C', 'A>C', 'A>T', 'C>G', 'C>A', 'T>G',
+ 'T>A', 'G>C', 'G>T', 'A>-', 'T>-', 'C>-', 'G>-', '->A', '->T',
+ '->C', '->G', 'S')
+HEADER = LETTERS + ("Total", ) + MUTATIONS
+
+
+def write_fasta(read, ref, seq, refseq, start, end, before, after, fout):
+ std = '-' if read.is_reverse else '+'
+
+ # output coordinate in 1-based offset for 5'
+ if len(before) > 0:
+ fout.write(">%s\n%s\n" % (ref, before))
+ fout.write(">%s:%d-%d\n%s\n>%s_%s\n%s\n" % (ref, start+1, end, refseq, read.qname, std, seq))
+ if len(after) > 0:
+ fout.write(">%s\n%s\n" % (ref, after))
+
+
+def revcomp(seq):
+ """ return reverse complemented string """
+ return seq.translate(TABLE)[::-1]
+
+
+def record_lg(read, coordinate, tab):
+ """ record global length distribution
+ don't record paired reads as they are normally not used for aDNA """
+ std = '-' if read.is_reverse else '+'
+
+ length = (max(coordinate) - min(coordinate))
+ if not read.is_paired:
+ tab[std][length] = tab[std][length] + 1
+
+ return tab
+
+
+def read_fasta_index(filename):
+ """ from a fasta index file, fai, return dictionary of references:lengths """
+ def print_err(msg, filename, line):
+ sys.stderr.write("Error: %s\n" % msg)
+ sys.stderr.write(" Filename: %s\n" % filename)
+ sys.stderr.write(" Line: %s\n" % repr(line))
+
+ fai = {}
+ with open(filename, 'r') as handle:
+ for line in handle:
+ ref = line.split("\t")
+ if len(ref) != 5:
+ print_err("Line in fasta index contains wrong number of fields, found %i, expected 5:" \
+ % len(ref), filename, line)
+ return None
+
+ try:
+ fai[ref[0]] = int(ref[1])
+ except ValueError:
+ print_err("Column 2 in FASTA index did not contain a number, found '%s':" % ref[1], filename, line)
+ return None
+
+ return fai
+
+
+def compare_sequence_dicts(fasta_dict, bam_dict):
+ """Compares a FASTA and BAM sequence dictionary, and prints any differences.
+ Returns true if all required sequences are found, false otherwise."""
+ if fasta_dict == bam_dict:
+ return True
+
+ sys.stderr.write("Sequence dictionaries in FASTA/BAM files differ:\n")
+ common = set(fasta_dict) & set(bam_dict)
+ if not common:
+ sys.stderr.write("FATAL ERROR: No sequences in common!\n")
+ return False
+
+ # Check that the lengths of required sequences match (fatal error)
+ different = []
+ for key in sorted(common):
+ if fasta_dict[key] != bam_dict[key]:
+ different.append((key, fasta_dict[key], bam_dict[key]))
+
+ if different:
+ sys.stderr.write("FATAL ERROR: Length of required sequences differ:\n")
+ for values in different:
+ sys.stderr.write(" - %s: %i bp vs %i bp\n" % values)
+
+ # Check for sequences only found in the BAM file (fatal errors)
+ bam_only = set(bam_dict) - common
+ if bam_only:
+ sys.stderr.write("FATAL ERROR: Sequences missing from FASTA dictionary:\n")
+ for key in bam_only:
+ sys.stderr.write(" - %s = %i bp\n" % (key, bam_dict[key]))
+
+ # Check for sequences only found in the BAM file (fatal errors)
+ fasta_only = set(fasta_dict) - common
+ if fasta_only:
+ sys.stderr.write("WARNING: FASTA file contains extra sequences:\n")
+ for key in fasta_only:
+ sys.stderr.write(" - %s = %i bp\n" % (key, fasta_dict[key]))
+
+ sys.stderr.write("\n")
+
+ return not (different or bam_only)
diff --git a/mapdamage/tables.py b/mapdamage/tables.py
new file mode 100644
index 0000000..04fdeef
--- /dev/null
+++ b/mapdamage/tables.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python
+
+import os
+import collections
+
+import mapdamage
+from mapdamage.version import __version__
+
+
+def initialize_mut(ref, length):
+ tab = {}
+ for contig in ref:
+ tab_contig = tab[contig] = {}
+ for end in ('5p','3p'):
+ tab_end = tab_contig[end] = {}
+ for std in ('+','-'):
+ tab_std = tab_end[std] = {}
+ for mut in mapdamage.seq.HEADER:
+ tab_std[mut] = dict.fromkeys(xrange(length), 0)
+
+ return tab
+
+
+def print_mut(mut, opt, out):
+ _print_freq_table(mut, mapdamage.seq.HEADER, opt, out, offset = 1)
+
+
+def initialize_comp(ref, around, length):
+ keys = {"3p" : range(-length, 0) + range(1, around + 1),
+ "5p" : range(-around, 0) + range(1, length + 1)}
+
+ tab = {}
+ for contig in ref:
+ tab_contig = tab[contig] = {}
+ for end in ('5p','3p'):
+ tab_end = tab_contig[end] = {}
+ for std in ('+','-'):
+ tab_std = tab_end[std] = {}
+ for letters in mapdamage.seq.LETTERS:
+ tab_std[letters] = dict.fromkeys(keys[end], 0)
+
+ return tab
+
+
+def print_comp(comp, opt, out):
+ columns = mapdamage.seq.LETTERS + ("Total",)
+ _print_freq_table(comp, columns, opt, out)
+
+
+def initialize_lg():
+ tab = {}
+ for std in ('+','-'):
+ tab[std] = collections.defaultdict(int)
+
+ return tab
+
+
+def print_lg(tab, opt, out):
+ out.write("# table produced by mapDamage version %s\n" % __version__)
+ out.write("# using mapped file %s and %s as reference file\n" % (opt.filename, opt.ref))
+ if opt.minqual != 0:
+ out.write("# Quality filtering of bases with a Phred score < %d\n" % opt.minqual)
+ out.write("# Std: strand of reads\n")
+ out.write("Std\tLength\tOccurences \n")
+ for std in tab:
+ for i in tab[std]:
+ out.write("%s\t%d\t%d\n" % (std, i, tab[std][i]))
+
+
+def check_table_and_warn_if_dmg_freq_is_low(folder):
+ """ Returns true if the damage frequencies are too low to allow
+ Bayesian estimation of DNA damages, i.e < 1% at first position.
+ """
+ total = 0.0
+ for filename in ("5pCtoT_freq.txt", "3pGtoA_freq.txt"):
+ if not os.path.exists(folder+"/"+filename):
+ print("Error: Required table has not been created ('%s'), bayesian computation cannot be performed" \
+ % filename)
+ return True
+
+ with open(os.path.join(folder, filename)) as handle:
+ for line in handle:
+ freq = line.strip().split('\t')
+ if freq[0] == "1":
+ total += float(freq[1])
+ break
+ else:
+ print("Error: Could not find pos = 1 in table '%s', bayesian computation cannot be performed" \
+ % filename)
+ return True
+
+ if total < 0.01:
+ print("Warning: DNA damage levels are too low, the Bayesian computation should not be performed (%f < 0.01)\n" % total)
+
+ return False
+
+
+def _print_freq_table(table, columns, opt, out, offset = 0):
+ out.write("# table produced by mapDamage version %s\n" % __version__)
+ out.write("# using mapped file %s and %s as reference file\n" % (opt.filename, opt.ref))
+ if opt.minqual != 0:
+ out.write("# Bases with a Phred score < %d were filtered out\n" % opt.minqual)
+ out.write("# Chr: reference from sam/bam header, End: from which termini of DNA sequences, Std: strand of reads\n")
+ out.write("Chr\tEnd\tStd\tPos\t%s\n" % ("\t".join(columns)))
+
+ for (reference, ends) in sorted(table.iteritems()):
+ for (end, strands) in sorted(ends.iteritems()):
+ for (strand, subtable) in sorted(strands.iteritems()):
+ subtable["Total"] = {}
+ for index in sorted(subtable[columns[0]]):
+ subtable["Total"][index] = sum(subtable[letter][index] for letter in mapdamage.seq.LETTERS)
+
+ out.write("%s\t%s\t%s\t%d" % (reference, end, strand, index + offset))
+ for base in columns:
+ out.write("\t%d" % subtable[base][index])
+ out.write("\n")
diff --git a/mapdamage/version.py b/mapdamage/version.py
new file mode 100644
index 0000000..4e95d9f
--- /dev/null
+++ b/mapdamage/version.py
@@ -0,0 +1,6 @@
+#!/usr/bin/env python
+try:
+ from mapdamage._version import __version__
+except ImportError:
+ __version__ = "2.0.6"
+
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..b166396
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,68 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from distutils.core import setup
+from distutils.command.install import install as DistutilsInstall
+import os
+import subprocess
+
+def compile_seqtk():
+ """Compiling the seqtk toolkit"""
+ old_wd = os.getcwd()
+ new_wd = os.path.join(old_wd,"mapdamage","seqtk")
+ os.chdir(new_wd)
+ if not os.path.isfile("seqtk.c"):
+ raise SystemExit("Cannot find seqtk.c")
+ if (os.path.isfile("seqtk")):
+ os.system("rm seqtk")
+ xs = os.system("make -f Makefile")
+ os.chdir(old_wd)
+ if (xs != 0):
+ raise SystemExit("Cannot compile seqtk")
+
+
+def setup_version():
+ if not os.path.exists(".git"):
+ # Release version, no .git folder
+ return
+
+ try:
+ version = subprocess.check_output(("git", "describe", "--always", "--tags", "--dirty"))
+ with open(os.path.join("mapdamage", "_version.py"), "w") as handle:
+ handle.write("#!/usr/bin/env python\n")
+ handle.write("__version__ = %r\n" % (version.strip(),))
+ except (subprocess.CalledProcessError, OSError), error:
+ raise SystemExit("Could not determine mapDamage version: %s" % (error,))
+
+
+class compileInstall(DistutilsInstall):
+ # extension of the class to account for an extra compiling step
+ def run(self):
+ self.record=""
+ setup_version()
+ compile_seqtk()
+ DistutilsInstall.run(self)
+ # fixing the permission problem of seqtk
+ files = self.get_outputs()
+ for fi in files:
+ if fi.endswith("seqtk/seqtk"):
+ os.chmod(fi,0755)
+
+
+
+
+
+setup(
+ cmdclass={'install': compileInstall},
+ name='mapdamage',
+ version='2.0',
+ author='Aurélien Ginolhac, Mikkel Schubert, Hákon Jónsson',
+ author_email='MSchubert at snm.ku.dk, jonsson.hakon at gmail.com',
+ packages=['mapdamage'],
+ package_data={'mapdamage': ['Rscripts/*.R','Rscripts/stats/*.R','seqtk/seqtk']},
+ scripts=['bin/mapDamage'],
+ url='https://github.com/ginolhac/mapDamage',
+ license='LICENSE.txt',
+ description='mapDamage tracks and quantify DNA damage pattern among ancient DNA sequencing reads generated by Next-Generation Sequencing platforms',
+ long_description=open('README.md').read()
+)
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/mapdamage.git
More information about the debian-med-commit
mailing list