[med-svn] [reapr] 01/05: Imported Upstream version 1.0.18+dfsg
Sascha Steinbiss
sascha-guest at moszumanska.debian.org
Sun Dec 13 18:45:05 UTC 2015
This is an automated email from the git hooks/post-receive script.
sascha-guest pushed a commit to branch master
in repository reapr.
commit bb85a15019796819770544938adb0731286734b8
Author: Sascha Steinbiss <sascha at steinbiss.name>
Date: Sun Dec 13 13:25:18 2015 +0000
Imported Upstream version 1.0.18+dfsg
---
README | 86 ++++
changelog.txt | 168 +++++++
install.sh | 169 +++++++
licence.txt | 674 ++++++++++++++++++++++++++
src/.gitignore | 16 +
src/Makefile | 125 +++++
src/bam2fcdEstimate.cpp | 224 +++++++++
src/bam2fragCov.cpp | 195 ++++++++
src/bam2insert.cpp | 190 ++++++++
src/bam2perfect.cpp | 224 +++++++++
src/bamtools | 1 +
src/coveragePlot.cpp | 45 ++
src/coveragePlot.h | 38 ++
src/errorWindow.cpp | 61 +++
src/errorWindow.h | 40 ++
src/fa2gaps.cpp | 47 ++
src/fa2gc.cpp | 187 ++++++++
src/fasta.cpp | 131 +++++
src/fasta.h | 50 ++
src/findknownsnps | 1 +
src/histogram.cpp | 401 ++++++++++++++++
src/histogram.h | 102 ++++
src/make_plots.cpp | 124 +++++
src/n50.cpp | 178 +++++++
src/reapr.pl | 101 ++++
src/samtools | 1 +
src/scaff2contig.cpp | 92 ++++
src/smalt | 1 +
src/tabix | 1 +
src/task_break.cpp | 439 +++++++++++++++++
src/task_facheck.pl | 104 ++++
src/task_fcdrate.cpp | 380 +++++++++++++++
src/task_gapresize.cpp | 239 +++++++++
src/task_perfectfrombam.pl | 211 ++++++++
src/task_perfectmap.pl | 143 ++++++
src/task_pipeline.pl | 135 ++++++
src/task_plots.pl | 130 +++++
src/task_preprocess.pl | 484 +++++++++++++++++++
src/task_score.cpp | 1148 ++++++++++++++++++++++++++++++++++++++++++++
src/task_seqrename.pl | 74 +++
src/task_smaltmap.pl | 190 ++++++++
src/task_stats.cpp | 953 ++++++++++++++++++++++++++++++++++++
src/task_summary.pl | 339 +++++++++++++
src/trianglePlot.cpp | 397 +++++++++++++++
src/trianglePlot.h | 93 ++++
src/utils.cpp | 110 +++++
src/utils.h | 50 ++
47 files changed, 9292 insertions(+)
diff --git a/README b/README
new file mode 100644
index 0000000..38391fe
--- /dev/null
+++ b/README
@@ -0,0 +1,86 @@
+REAPR version 1.0.18 README file
+
+
+_____________________________ INSTALLATION ___________________________________
+
+Prerequisites:
+ - R installed and in your path (http://www.r-project.org/)
+ - The following Perl modules need to be installed:
+ File::Basename
+ File::Copy
+ File::Spec
+ File::Spec::Link
+ Getopt::Long
+ List::Util
+
+
+To install REAPR, run
+
+ ./install.sh
+
+Note that (depending on your system) this could take quite a long time
+because there are several third-party tools that need to be compiled.
+Once it has finished, add ./reapr to your $PATH, or call it explicitly with
+/path/to/your/installation/directory/reapr
+
+Optionally, you might want to have Artemis installed
+(http://www.sanger.ac.uk/resources/software/artemis/) to view the output.
+It must be at least Artemis version 15.0.0.
+
+
+______________________________ RUN THE TEST __________________________________
+
+If you want to check the installation is ok, you can run a test of the
+whole REAPR pipeline.
+
+IMPORTANT: the test assumes that 'reapr' has been added to your $PATH.
+
+The test data are available as a separate download. These are solely to check
+that the installation runs, using a cut-down dataset. As such, the results
+are not the same as those that would be obtained when running on the full
+dataset. The data contains reads from the ENA (accession number SRR022865).
+The remainder of the test data is from the GAGE paper
+(Salzberg et al. 2011), using the Velvet assembly of S. aureus
+(see http://gage.cbcb.umd.edu/data/index.html).
+
+Download the test data tarball, unpack it and run the
+test script from inside the test directory:
+
+./test.sh
+
+
+____________________________ MAPPING READS ___________________________________
+
+SMALT (http://www.sanger.ac.uk/resources/software/smalt/) is recommended to
+map the reads. REAPR has been tested using versions 0.6.4 and 0.7.0.1 of
+SMALT, but we have noticed issues with writing a BAM file directly
+(with the SMALT option -f bam). Please use -f samsoft when running SMALT, then
+import to BAM with samtools or Picard.
+
+REAPR can map reads for you using SMALT - you can run 'reapr smaltmap' with
+the -x option to print a list of the mapping commands that REAPR would
+run to produce a BAM for input to the REAPR pipeline.
+
+
+________________________________ GET HELP ____________________________________
+
+Please read the manual: manual.pdf
+
+
+_______________________________ BRIEF USAGE __________________________________
+
+All REAPR tasks are run via a call to
+
+ reapr
+
+Call with no arguments for a list of tasks. Call any task with no arguments
+to get the usage for that task. To run the entire pipeline, run
+
+ reapr pipeline
+
+Full instructions can be found in the manual: manual.pdf.
+
+
+__________________________ BUG REPORTS/QUESTIONS/COMMENTS ____________________
+
+Please email Martin Hunt, mh12 at sanger.ac.uk.
diff --git a/changelog.txt b/changelog.txt
new file mode 100644
index 0000000..5fe5c32
--- /dev/null
+++ b/changelog.txt
@@ -0,0 +1,168 @@
+______ 1.0.17 -> 1.0.18 ______
+
+* Bug fix for rare cases when sampling to get coverage/GC stats.
+
+* When writing broken assembly, no by default include all bin
+contigs >= 1000 bases long in the main assembly. Can change this
+cutoff with -m option. Short contigs still written to the bin.
+
+______ 1.0.16 -> 1.0.17 ______
+
+* Expose number of threads option in smalt map
+
+* Expose all task options when running the pipeline
+
+* Bug fix with some options not working in stats task.
+
+* Add comma to the list of bad characters in facheck
+
+* Update manual to reflect changes and change examples
+to have what to do with just one library. Fix a couple
+of typos in examples.
+
+* Speed up fa2gc stage of preprocess by about 4 times.
+
+
+______ 1.0.15 -> 1.0.16 ______
+
+* Added option -t to task 'break'. This can be used to trim
+bases off contig ends, wherever a contig is broken from an
+FCD error with the -a option.
+
+* Change default of -l option of break to output sequences that are at
+least 100bp long (default was 1bp).
+
+* install.sh now checks that the required Perl modules are installed, and
+checks that R is in the path.
+
+* install.sh checks that the OS appears to be Linux and dies if
+it's not. Added option to try to force the install anyway regardless of OS.
+
+* Bug fix in smaltmap: depending on the OS, bam header was not getting
+made correctly.
+
+* smaltmap now starts by running samtools faidx on the assembly fasta file.
+A common cause of the pipeline falling over is a fasta file that makes
+samtools faidx segfault. Print a nice error message about this
+if samtools faidx ends badly.
+
+______ 1.0.14 -> 1.0.15 ______
+
+* Added task 'seqrename' to rename all the sequences
+in a BAM file. This saves remapping the reads to make a
+new BAM that will be OK for the pipeline.
+
+* Added task 'smaltmap' to map reads using SMALT.
+
+* Updated the plots task to make tabix indexed plots, since
+Artemis (version 15.0.0) can now read these.
+
+* Bug fix in task 'break', where the -l option for min length
+of sequence to output didn't always work.
+
+______ 1.0.13 -> 1.0.14 ______
+
+* Fixed Makefiles for tabix and reapr because they didn't work
+on some systems (e.g. Ubuntu).
+
+* Change sequence names output by break: use underscores instead
+of : and -, so that the output is compatible with REAPR itself.
+
+* Added -b option to break, which will ignore FCD and low fragment
+coverage errors within contigs (i.e. those that don't contain a
+gap)
+
+______ 1.0.12 -> 1.0.13 ______
+
+* Bug fix: off by one error in coordinates in
+errors gff file made by 'score'.
+
+* pipeline now starts by running facheck
+on the assembly.
+
+* pipeline changed so that it writes a bash
+script of all the commands it's going to run,
+then runs that bash script. Useful if it dies
+and you want to know the commands needed to finish
+the pipeline.
+
+* Change in perfectmap: added --variance to be
+0.5 * fragment size in the call to snpomatic.
+The previous default was 0.25 * fragment size.
+
+* Added -a option to 'break' for aggressive breaking:
+it breaks contigs at errors (as well as breaking at
+gaps).
+
+
+______ 1.0.11 -> 1.0.12 ______
+
+* Bug fix where in rare cases the 'break'
+task would incorrectly make a broken fasta file
+with duplicated sequences, or sequences continuing
+right through to the end of the scaffold, instead of
+stopping at the appropriate gap.
+
+* Prefix the name of every bin contig
+made when running break with 'REAPR_bin.'.
+
+* In facheck, added brackets (){} and various
+other characters to the list of characters that
+break the pipeline.
+
+* More verbose error message in preprocess when
+something goes wrong at the point of sampling the
+fragment coverage vs GC content.
+
+* Fix typo in report.txt file made by summary, should be
+'low score' not 'high score'. Also now writes the same
+information in a report.tsv file, for ease of putting
+results into spreadsheets.
+
+______ 1.0.10 -> 1.0.11 ______
+
+* Switch meaning of score to be more intuitive,
+so that a score of 1 means perfect, down to
+0 for bad. Give all gaps a score of -1.
+
+
+______ 1.0.9 -> 1.0.10 ______
+
+* Bug fix with counting perfect bases. It was slightly
+overestimating, by counting gaps which were too long
+to call as perfect.
+
+
+______ 1.0.8 -> 1.0.9 ______
+
+* Added task 'perfectfrombam' to use as an alternative to
+perfectmap. perfectmap maps reads with SNP-o-matic, which
+is very fast but also very high memory. perfectfrombam
+takes a BAM file as input, and generates a file of perfect
+and uniquely mapped reads, same format as for perfectmap,
+for use in the REAPR pipeline. Intended use case is
+large genomes.
+
+* Fix bug where facheck was writing .fa and .info files
+when just an assembly fasta was given as input, with no
+output files prefix.
+
+* Bug fix of link reporting. The coords needed 1 adding
+to them in the Note... section of the gff file made by score.
+
+* Remove superfluous double-quotes in the note section
+of the gff errors file made by score.
+
+* For each plot file, now additionally writes data in a .dat file,
+(the R plots truncate the x axis and so the .R files don't
+have all the data in them, but the .dat files do
+have all the data in them, should anyone want it).
+
+* Add option -u to stats task, to just run on a given
+list of chromosomes.
+
+* Added -f to every system call to tabix
+
+* 'break' now also outputs a prefix.broken_assembly_bin.fa
+fasta file of the parts of the genomes which were replaced
+with Ns.
diff --git a/install.sh b/install.sh
new file mode 100755
index 0000000..a4fa316
--- /dev/null
+++ b/install.sh
@@ -0,0 +1,169 @@
+#!/usr/bin/env bash
+set -e
+rootdir=$PWD
+
+unamestr=$(uname)
+
+# Check if the OS seems to be Linux. This should be informative to
+# Windows/Mac users that are trying to install this, despite it being
+# developed for Linux. There is a VM available for Mac/Windows.
+if [ "$unamestr" != "Linux" ]
+then
+ echo "Operating system does not appear to be Linux: uname says it's '$unamestr'"
+
+ if [ "$1" = "force" ]
+ then
+ echo "'force' option used, carrying on anyway. Best of luck to you..."
+ else
+ echo "
+If you *really* want to try installing anyway, run this:
+./install.sh force
+
+If you're using a Mac or Windows, then the recommended way to run REAPR is
+to use the virtual machine."
+ exit 1
+ fi
+fi
+
+
+echo "------------------------------------------------------------------------------
+ Checking prerequisites
+------------------------------------------------------------------------------
+"
+echo "Checking Perl modules..."
+
+modules_ok=1
+
+for module in File::Basename File::Copy File::Spec File::Spec::Link Getopt::Long List::Util
+do
+ set +e
+ perl -M$module -e 1 2>/dev/null
+
+ if [ $? -eq 0 ]
+ then
+ echo " OK $module"
+ else
+ echo " NOT FOUND: $module"
+ modules_ok=0
+ fi
+ set -e
+done
+
+if [ $modules_ok -ne 1 ]
+then
+ echo "Some Perl modules were not found - please install them. Cannot continue"
+ exit 1
+else
+ echo "... Perl modules all OK"
+fi
+
+
+echo
+echo "Looking for R..."
+
+if type -P R
+then
+ echo "... found R OK"
+else
+ echo "Didn't find R. It needs to be installed and in your path. Cannot continue"
+fi
+
+
+cd third_party
+
+echo "
+------------------------------------------------------------------------------
+ Compiling cmake
+------------------------------------------------------------------------------
+"
+cd cmake
+./bootstrap --prefix $PWD
+make
+cd ..
+echo "
+------------------------------------------------------------------------------
+ cmake compiled
+------------------------------------------------------------------------------
+"
+
+echo "
+------------------------------------------------------------------------------
+ Compiling Bamtools
+------------------------------------------------------------------------------
+"
+cd bamtools
+mkdir build
+cd build
+$rootdir/third_party/cmake/bin/cmake ..
+make
+cd $rootdir
+echo "
+------------------------------------------------------------------------------
+ Bamtools compiled
+------------------------------------------------------------------------------
+"
+
+echo "
+------------------------------------------------------------------------------
+ Compiling Tabix
+------------------------------------------------------------------------------
+"
+cd third_party/tabix
+make
+cd ..
+echo "
+------------------------------------------------------------------------------
+ Tabix compiled
+------------------------------------------------------------------------------
+"
+
+echo "
+------------------------------------------------------------------------------
+ Compiling snpomatic
+------------------------------------------------------------------------------
+"
+cd snpomatic
+make
+cd ..
+echo "
+------------------------------------------------------------------------------
+ snpomatic compiled
+------------------------------------------------------------------------------
+"
+
+echo "
+------------------------------------------------------------------------------
+ Compiling samtools
+------------------------------------------------------------------------------
+"
+cd samtools
+make
+echo "
+------------------------------------------------------------------------------
+ samtools compiled
+------------------------------------------------------------------------------
+"
+
+echo "
+------------------------------------------------------------------------------
+ Compiling Reapr
+------------------------------------------------------------------------------
+"
+cd $rootdir/src
+make
+cd ..
+ln -s src/reapr.pl reapr
+echo "
+Reapr compiled
+
+All done!
+
+Run
+./reapr
+for usage.
+
+Read the manual
+manual.pdf
+for full instructions
+"
+
diff --git a/licence.txt b/licence.txt
new file mode 100644
index 0000000..94a9ed0
--- /dev/null
+++ b/licence.txt
@@ -0,0 +1,674 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+ The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works. By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users. We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors. You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+ To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights. Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received. You must make sure that they, too, receive
+or can get the source code. And you must show them these terms so they
+know their rights.
+
+ Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+ For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software. For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+ Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so. This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software. The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable. Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products. If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+ Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary. To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ TERMS AND CONDITIONS
+
+ 0. Definitions.
+
+ "This License" refers to version 3 of the GNU General Public License.
+
+ "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+ "The Program" refers to any copyrightable work licensed under this
+License. Each licensee is addressed as "you". "Licensees" and
+"recipients" may be individuals or organizations.
+
+ To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy. The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+ A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+ To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy. Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+ To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies. Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+ An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License. If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+ 1. Source Code.
+
+ The "source code" for a work means the preferred form of the work
+for making modifications to it. "Object code" means any non-source
+form of a work.
+
+ A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+ The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form. A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+ The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities. However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work. For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+ The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+ The Corresponding Source for a work in source code form is that
+same work.
+
+ 2. Basic Permissions.
+
+ All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met. This License explicitly affirms your unlimited
+permission to run the unmodified Program. The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work. This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+ You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force. You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright. Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+ Conveying under any other circumstances is permitted solely under
+the conditions stated below. Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+ No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+ When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+ 4. Conveying Verbatim Copies.
+
+ You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+ You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+ 5. Conveying Modified Source Versions.
+
+ You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+ a) The work must carry prominent notices stating that you modified
+ it, and giving a relevant date.
+
+ b) The work must carry prominent notices stating that it is
+ released under this License and any conditions added under section
+ 7. This requirement modifies the requirement in section 4 to
+ "keep intact all notices".
+
+ c) You must license the entire work, as a whole, under this
+ License to anyone who comes into possession of a copy. This
+ License will therefore apply, along with any applicable section 7
+ additional terms, to the whole of the work, and all its parts,
+ regardless of how they are packaged. This License gives no
+ permission to license the work in any other way, but it does not
+ invalidate such permission if you have separately received it.
+
+ d) If the work has interactive user interfaces, each must display
+ Appropriate Legal Notices; however, if the Program has interactive
+ interfaces that do not display Appropriate Legal Notices, your
+ work need not make them do so.
+
+ A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit. Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+ 6. Conveying Non-Source Forms.
+
+ You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+ a) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by the
+ Corresponding Source fixed on a durable physical medium
+ customarily used for software interchange.
+
+ b) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by a
+ written offer, valid for at least three years and valid for as
+ long as you offer spare parts or customer support for that product
+ model, to give anyone who possesses the object code either (1) a
+ copy of the Corresponding Source for all the software in the
+ product that is covered by this License, on a durable physical
+ medium customarily used for software interchange, for a price no
+ more than your reasonable cost of physically performing this
+ conveying of source, or (2) access to copy the
+ Corresponding Source from a network server at no charge.
+
+ c) Convey individual copies of the object code with a copy of the
+ written offer to provide the Corresponding Source. This
+ alternative is allowed only occasionally and noncommercially, and
+ only if you received the object code with such an offer, in accord
+ with subsection 6b.
+
+ d) Convey the object code by offering access from a designated
+ place (gratis or for a charge), and offer equivalent access to the
+ Corresponding Source in the same way through the same place at no
+ further charge. You need not require recipients to copy the
+ Corresponding Source along with the object code. If the place to
+ copy the object code is a network server, the Corresponding Source
+ may be on a different server (operated by you or a third party)
+ that supports equivalent copying facilities, provided you maintain
+ clear directions next to the object code saying where to find the
+ Corresponding Source. Regardless of what server hosts the
+ Corresponding Source, you remain obligated to ensure that it is
+ available for as long as needed to satisfy these requirements.
+
+ e) Convey the object code using peer-to-peer transmission, provided
+ you inform other peers where the object code and Corresponding
+ Source of the work are being offered to the general public at no
+ charge under subsection 6d.
+
+ A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+ A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling. In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage. For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product. A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+ "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source. The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+ If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information. But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+ The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed. Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+ Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+ 7. Additional Terms.
+
+ "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law. If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+ When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it. (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.) You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+ Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+ a) Disclaiming warranty or limiting liability differently from the
+ terms of sections 15 and 16 of this License; or
+
+ b) Requiring preservation of specified reasonable legal notices or
+ author attributions in that material or in the Appropriate Legal
+ Notices displayed by works containing it; or
+
+ c) Prohibiting misrepresentation of the origin of that material, or
+ requiring that modified versions of such material be marked in
+ reasonable ways as different from the original version; or
+
+ d) Limiting the use for publicity purposes of names of licensors or
+ authors of the material; or
+
+ e) Declining to grant rights under trademark law for use of some
+ trade names, trademarks, or service marks; or
+
+ f) Requiring indemnification of licensors and authors of that
+ material by anyone who conveys the material (or modified versions of
+ it) with contractual assumptions of liability to the recipient, for
+ any liability that these contractual assumptions directly impose on
+ those licensors and authors.
+
+ All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10. If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term. If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+ If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+ Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+ 8. Termination.
+
+ You may not propagate or modify a covered work except as expressly
+provided under this License. Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+ However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+ Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+ Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License. If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+ 9. Acceptance Not Required for Having Copies.
+
+ You are not required to accept this License in order to receive or
+run a copy of the Program. Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance. However,
+nothing other than this License grants you permission to propagate or
+modify any covered work. These actions infringe copyright if you do
+not accept this License. Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+ 10. Automatic Licensing of Downstream Recipients.
+
+ Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License. You are not responsible
+for enforcing compliance by third parties with this License.
+
+ An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations. If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+ You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License. For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+ 11. Patents.
+
+ A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based. The
+work thus licensed is called the contributor's "contributor version".
+
+ A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version. For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+ In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement). To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+ If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients. "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+ If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+ A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License. You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+ Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+ 12. No Surrender of Others' Freedom.
+
+ If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all. For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+ 13. Use with the GNU Affero General Public License.
+
+ Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work. The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+ 14. Revised Versions of this License.
+
+ The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+ Each version is given a distinguishing version number. If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation. If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+ If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+ Later license versions may give you additional or different
+permissions. However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+ 15. Disclaimer of Warranty.
+
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+ 16. Limitation of Liability.
+
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+ 17. Interpretation of Sections 15 and 16.
+
+ If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+ If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+ <program> Copyright (C) <year> <name of author>
+ This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+ You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<http://www.gnu.org/licenses/>.
+
+ The GNU General Public License does not permit incorporating your program
+into proprietary programs. If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License. But first, please read
+<http://www.gnu.org/philosophy/why-not-lgpl.html>.
diff --git a/src/.gitignore b/src/.gitignore
new file mode 100644
index 0000000..cde0e74
--- /dev/null
+++ b/src/.gitignore
@@ -0,0 +1,16 @@
+*.o
+bam2fcdEstimate
+bam2fragCov
+bam2insert
+bam2perfect
+fa2gaps
+fa2gc
+make_plots
+n50
+scaff2contig
+scaff2contig.cpp
+task_break
+task_fcdrate
+task_gapresize
+task_score
+task_stats
diff --git a/src/Makefile b/src/Makefile
new file mode 100644
index 0000000..5fc2223
--- /dev/null
+++ b/src/Makefile
@@ -0,0 +1,125 @@
+# Location of the bundled bamtools checkout (headers and libbamtools)
+BAMTOOLS_ROOT = $(CURDIR)/bamtools
+# NOTE(review): CC is conventionally the C compiler; this project uses it for g++
+CC = g++
+# Compile AND link flags in one variable: warnings, -O3, bamtools include/lib
+# paths, and an rpath so the executables find libbamtools at run time
+CFLAGS = -Wl,-rpath,$(BAMTOOLS_ROOT)/lib -Wall -O3 -I $(BAMTOOLS_ROOT)/include -L $(BAMTOOLS_ROOT)/lib
+# Extra objects/libraries needed by targets that link the bundled tabix
+TABIX = tabix/tabix.o -L./tabix -ltabix -lz
+# Per-executable object dependency lists
+STATS_OBJS = trianglePlot.o coveragePlot.o fasta.o histogram.o utils.o
+SCORE_OBJS = errorWindow.o utils.o histogram.o
+BREAK_OBJS = fasta.o utils.o
+BAM2INSERT_OBJS = histogram.o utils.o
+BAM2PERFECT_OBJS = utils.o
+BAM2COV_OBJS = coveragePlot.o utils.o
+BAM2FCDESTIMATE_OBJS = histogram.o utils.o trianglePlot.o
+FA2GAPS_OBJS = fasta.o
+SCAFF2CONTIG_OBJS = fasta.o
+FA2GC_OBJS = fasta.o
+GAPRESIZE_OBJS = utils.o trianglePlot.o fasta.o
+FCDRATE_OBJS = histogram.o utils.o
+N50_OBJS = fasta.o
+# Everything built by 'all' and removed by 'clean'
+EXECUTABLES = task_score task_stats task_break bam2fragCov bam2insert bam2fcdEstimate make_plots fa2gaps fa2gc scaff2contig n50 task_gapresize task_fcdrate bam2perfect
+
+all: $(EXECUTABLES)
+
+errorWindow.o: errorWindow.cpp
+ $(CC) $(CFLAGS) -c errorWindow.cpp
+
+trianglePlot.o: trianglePlot.cpp utils.o
+ $(CC) $(CFLAGS) -c trianglePlot.cpp
+
+coveragePlot.o: coveragePlot.cpp
+ $(CC) $(CFLAGS) -c coveragePlot.cpp
+
+fasta.o: fasta.cpp
+ $(CC) $(CFLAGS) -c fasta.cpp
+
+histogram.o: histogram.cpp
+ $(CC) $(CFLAGS) -c histogram.cpp
+
+utils.o: utils.cpp
+ $(CC) $(CFLAGS) -lbamtools -c utils.cpp
+
+task_stats: task_stats.o $(STATS_OBJS)
+ $(CC) $(CFLAGS) task_stats.o $(STATS_OBJS) -lbamtools -o task_stats $(TABIX)
+
+task_stats.o: task_stats.cpp $(STATS_OBJS)
+ $(CC) $(CFLAGS) -c task_stats.cpp
+
+task_score: task_score.o $(SCORE_OBJS)
+ $(CC) $(CFLAGS) task_score.o $(SCORE_OBJS) -lbamtools -o task_score $(TABIX)
+
+task_score.o: task_score.cpp $(SCORE_OBJS)
+ $(CC) $(CFLAGS) -c task_score.cpp
+
+task_break: task_break.o $(BREAK_OBJS)
+ $(CC) $(CFLAGS) task_break.o $(BREAK_OBJS) -lbamtools -o task_break $(TABIX)
+
+task_break.o: task_break.cpp $(BREAK_OBJS)
+ $(CC) $(CFLAGS) -c task_break.cpp
+
+bam2fragCov: bam2fragCov.o $(BAM2COV_OBJS)
+ $(CC) $(CFLAGS) bam2fragCov.o $(BAM2COV_OBJS) -lbamtools -o bam2fragCov $(TABIX)
+
+bam2fragCov.o: bam2fragCov.cpp $(BAM2COV_OBJS)
+ $(CC) $(CFLAGS) -c bam2fragCov.cpp
+
+bam2insert: bam2insert.o $(BAM2INSERT_OBJS)
+ $(CC) $(CFLAGS) bam2insert.o $(BAM2INSERT_OBJS) -lbamtools -o bam2insert $(TABIX)
+
+bam2insert.o: bam2insert.cpp $(BAM2INSERT_OBJS)
+ $(CC) $(CFLAGS) -c bam2insert.cpp
+
+bam2perfect: bam2perfect.o $(BAM2PERFECT_OBJS)
+ $(CC) $(CFLAGS) bam2perfect.o $(BAM2PERFECT_OBJS) -lbamtools -o bam2perfect $(TABIX)
+
+bam2perfect.o: bam2perfect.cpp $(BAM2PERFECT_OBJS)
+ $(CC) $(CFLAGS) -c bam2perfect.cpp
+
+bam2fcdEstimate: bam2fcdEstimate.o $(BAM2FCDESTIMATE_OBJS)
+ $(CC) $(CFLAGS) bam2fcdEstimate.o $(BAM2FCDESTIMATE_OBJS) -lbamtools -o bam2fcdEstimate $(TABIX)
+
+bam2fcdEstimate.o: bam2fcdEstimate.cpp $(BAM2FCDESTIMATE_OBJS)
+ $(CC) $(CFLAGS) -c bam2fcdEstimate.cpp
+
+make_plots: make_plots.o
+ $(CC) $(CFLAGS) make_plots.o -o make_plots
+
+make_plots.o: make_plots.cpp
+ $(CC) $(CFLAGS) -c make_plots.cpp
+
+fa2gc: fa2gc.o $(FA2GC_OBJS)
+ $(CC) $(CFLAGS) fa2gc.o $(FA2GC_OBJS) -o fa2gc
+
+fa2gc.o: fa2gc.cpp $(FA2GC_OBJS)
+ $(CC) $(CFLAGS) -c fa2gc.cpp
+
+fa2gaps: fa2gaps.o $(FA2GAPS_OBJS)
+ $(CC) $(CFLAGS) fa2gaps.o $(FA2GAPS_OBJS) -o fa2gaps
+
+fa2gaps.o: fa2gaps.cpp $(FA2GAPS_OBJS)
+ $(CC) $(CFLAGS) -c fa2gaps.cpp
+
+n50: n50.o $(N50_OBJS)
+ $(CC) $(CFLAGS) n50.o $(N50_OBJS) -o n50
+
+n50.o: n50.cpp $(N50_OBJS)
+ $(CC) $(CFLAGS) -c n50.cpp
+
+scaff2contig: scaff2contig.o $(SCAFF2CONTIG_OBJS)
+ $(CC) $(CFLAGS) scaff2contig.o $(SCAFF2CONTIG_OBJS) -o scaff2contig
+
+scaff2contig.o: scaff2contig.cpp $(SCAFF2CONTIG_OBJS)
+ $(CC) $(CFLAGS) -c scaff2contig.cpp
+
+task_gapresize: task_gapresize.o $(GAPRESIZE_OBJS)
+ $(CC) $(CFLAGS) task_gapresize.o $(GAPRESIZE_OBJS) -lbamtools -o task_gapresize $(TABIX)
+
+task_gapresize.o: task_gapresize.cpp $(GAPRESIZE_OBJS)
+ $(CC) $(CFLAGS) -c task_gapresize.cpp
+
+task_fcdrate: task_fcdrate.o $(FCDRATE_OBJS)
+ $(CC) $(CFLAGS) task_fcdrate.o $(FCDRATE_OBJS) -lbamtools -o task_fcdrate $(TABIX)
+
+task_fcdrate.o: task_fcdrate.cpp $(FCDRATE_OBJS)
+ $(CC) $(CFLAGS) -c task_fcdrate.cpp
+
+clean:
+ rm -rf *.o $(EXECUTABLES)
diff --git a/src/bam2fcdEstimate.cpp b/src/bam2fcdEstimate.cpp
new file mode 100644
index 0000000..0a1e6d5
--- /dev/null
+++ b/src/bam2fcdEstimate.cpp
@@ -0,0 +1,224 @@
+#include <iostream>
+#include <cstring>
+#include <fstream>
+#include <string>
+#include <vector>
+
+#include "utils.h"
+#include "histogram.h"
+#include "trianglePlot.h"
+#include "api/BamMultiReader.h"
+#include "api/BamReader.h"
+
+using namespace BamTools;
+using namespace std;
+
+const string ERROR_PREFIX = "[REAPR bam2fcdEstimate] ";
+
+// Parsed command line for bam2fcdEstimate (filled by parseOptions).
+struct CmdLineOptions
+{
+    long maxInsert;            // positional arg: maximum allowed insert size
+    unsigned long sampleStep;  // -s: sample every sampleStep'th base (default derived from maxInsert)
+    bool verbose;              // -v: report progress on stderr
+    string bamInfile;          // positional arg: input BAM
+    string gapsInfile;         // positional arg: assembly gaps file (passed to loadGaps)
+    string outfile;            // positional arg: output file of mean FCD heights
+    unsigned long maxSamples;  // -m: maximum number of FCDs to analyse [100000]
+};
+
+
+void initialiseFCDHists(vector<Histogram>& v, CmdLineOptions& ops);
+void updateFCDHists(vector<Histogram>& hists, vector<double>& heights);
+
+// deals with command line options: fills the options struct
+void parseOptions(int argc, char** argv, CmdLineOptions& ops);
+
+// Samples fragment-coverage-distribution (FCD) curves from a
+// coordinate-sorted BAM: at regularly spaced centre positions a triangle
+// plot of spanning fragments is evaluated and its left/right heights are
+// accumulated into per-offset histograms; the mean height at each offset
+// is written to the output file.  Returns 0 on success, 1 on I/O error.
+int main(int argc, char* argv[])
+{
+    CmdLineOptions options;
+    parseOptions(argc, argv, options);
+    BamReader bamReader;
+    BamAlignment bamAlign;
+    SamHeader header;
+    RefVector references;
+    TrianglePlot triplot(options.maxInsert);
+    unsigned long histCounter = 0;   // number of positions sampled so far
+    int32_t currentRefID = -1;       // reference sequence currently in scope
+    bool firstRecord = true;
+    // (start, end) coordinates of pending fragments, ordered by start
+    multiset<pair<unsigned long, unsigned long> > fragments;
+    // one histogram per base offset, for the left/right halves of the plot
+    vector<Histogram> fcdLHS, fcdRHS;
+    initialiseFCDHists(fcdLHS, options);
+    initialiseFCDHists(fcdRHS, options);
+
+    // NOTE(review): globalGaps is loaded but never read again in this
+    // function -- presumably kept for parity with sibling tools; confirm.
+    map<string, list<pair<unsigned long, unsigned long> > > globalGaps;
+    loadGaps(options.gapsInfile, globalGaps);
+
+    if (!bamReader.Open(options.bamInfile))
+    {
+        cerr << "Error opening bam file '" << options.bamInfile << "'" << endl;
+        return 1;
+    }
+
+    header = bamReader.GetHeader();
+    references = bamReader.GetReferenceData();
+
+    while (bamReader.GetNextAlignmentCore(bamAlign) && histCounter < options.maxSamples)
+    {
+        // Keep only mapped, non-duplicate reads with a positive, plausible
+        // insert size (so each template is counted from one of its reads).
+        if (!bamAlign.IsMapped() || bamAlign.IsDuplicate()
+            || bamAlign.InsertSize <= 0
+            || bamAlign.InsertSize > options.maxInsert) continue;
+
+        // New reference sequence: reset the plot and pending fragments.
+        if (currentRefID != bamAlign.RefID)
+        {
+            if (firstRecord)
+            {
+                firstRecord = false;
+            }
+            else
+            {
+                triplot.clear(0);
+                fragments.clear();
+            }
+            currentRefID = bamAlign.RefID;
+        }
+        // The BAM is coordinate sorted, so once a read starts beyond the
+        // plot centre no further fragment can span it: sample the plot at
+        // the current centre, then slide it along by sampleStep.
+        while (bamAlign.Position > triplot.centreCoord())
+        {
+            triplot.add(fragments);
+            if (triplot.depth() > 0)
+            {
+                vector<double> leftHeights;
+                vector<double> rightHeights;
+                triplot.getHeights(options.maxInsert, leftHeights, rightHeights);
+                updateFCDHists(fcdLHS, leftHeights);
+                updateFCDHists(fcdRHS, rightHeights);
+                histCounter++;
+                if (options.verbose && histCounter % 100 == 0)
+                {
+                    cerr << ERROR_PREFIX << "progress:" << histCounter << endl;
+                }
+            }
+            triplot.shift(options.sampleStep);
+        }
+
+        // Only correctly oriented ("innie") pairs contribute a fragment.
+        short pairOrientation = getPairOrientation(bamAlign);
+
+        if (pairOrientation == INNIE)
+        {
+            fragments.insert(fragments.end(), make_pair(bamAlign.Position, bamAlign.Position + bamAlign.InsertSize - 1));
+        }
+    }
+
+    ofstream ofs(options.outfile.c_str());
+
+    if (!ofs.good())
+    {
+        cerr << "Error opening file '" << options.outfile << "'" << endl;
+        return 1;
+    }
+
+    ofs << "#position\theight" << endl;
+
+    // Report, per offset, the average of the left and right mean heights,
+    // rescaled back from the 0.001 integer units used by updateFCDHists.
+    for (unsigned int i = 0; i < fcdLHS.size(); i++)
+    {
+        double leftMean, rightMean;
+        double stddev;   // required by meanAndStddev's interface; unused
+        fcdLHS[i].meanAndStddev(leftMean, stddev);
+        fcdRHS[i].meanAndStddev(rightMean, stddev);
+        leftMean = (fcdLHS[i].size() == 0) ? 0 : 0.001 * (leftMean - 0.5);
+        rightMean = (fcdRHS[i].size() == 0) ? 0 : 0.001 * (rightMean - 0.5);
+        ofs << i << '\t' << 0.5 * (leftMean + rightMean) << endl;
+    }
+
+    ofs.close();
+    return 0;
+}
+
+
+// Parse the command line into 'ops'.  Positional (required) arguments:
+//   <in.bam> <assembly.gaps.gz> <max insert size> <output file>
+// Options: -m <int> max samples, -s <int> sample step, -v verbose.
+// Prints usage and exits with status 1 on any parsing error.
+void parseOptions(int argc, char** argv, CmdLineOptions& ops)
+{
+    string usage;
+    short requiredArgs = 4;
+    int i;
+    ops.sampleStep = 0;   // 0 == "not given"; default derived from maxInsert below
+    ops.verbose = false;
+    ops.maxSamples = 100000;
+
+    usage = "[options] <in.bam> <assembly.gaps.gz> <max insert size> <output file>\n\n\
+options:\n\n\
+-m <int>\n\tMaximum number of FCDs to analyse [100000]\n\
+-s <int>\n\tSample every n^th base [max(insert_size / 2, 500)]\n\
+";
+
+    if (argc == 2 && strcmp(argv[1], "--wrapperhelp") == 0)
+    {
+        cerr << usage << endl;
+        exit(1);
+    }
+    else if (argc < requiredArgs)
+    {
+        // Bug fix: this line previously printed "bam2insert", a copy/paste
+        // from a sibling tool; this program is bam2fcdEstimate.
+        cerr << "usage:\nbam2fcdEstimate " << usage;
+        exit(1);
+    }
+
+    for (i = 1; i < argc - requiredArgs; i++)
+    {
+        // boolean switches take no value
+        if (strcmp(argv[i], "-v") == 0)
+        {
+            ops.verbose = true;
+            continue;
+        }
+
+        // non booleans are of form -option value, so check
+        // next value in array is there before using it!
+        if (strcmp(argv[i], "-m") == 0)
+        {
+            ops.maxSamples = atoi(argv[i+1]);
+        }
+        else if (strcmp(argv[i], "-s") == 0)
+        {
+            ops.sampleStep = atoi(argv[i+1]);
+        }
+        else
+        {
+            cerr << "Error! Switch not recognised: " << argv[i] << endl;
+            exit(1);
+        }
+        i++;   // skip the value consumed above
+    }
+
+    if (argc - i != requiredArgs || argv[i+1][0] == '-')
+    {
+        cerr << usage;
+        exit(1);
+    }
+
+    ops.bamInfile = argv[i];
+    ops.gapsInfile = argv[i+1];
+    ops.maxInsert = atoi(argv[i+2]);
+    ops.outfile = argv[i+3];
+    if(ops.sampleStep == 0)
+    {
+        ops.sampleStep = max(ops.maxInsert / 2, (long) 500);
+    }
+}
+
+// Append one histogram (bin width 1) per base offset 0..ops.maxInsert.
+// Bug fix: the loop counter was 'unsigned long' compared against the
+// signed 'ops.maxInsert' (a -Wall sign-compare warning); a negative
+// maxInsert would convert to a huge unsigned value and loop ~forever.
+// Using a signed counter makes a negative maxInsert simply add nothing.
+void initialiseFCDHists(vector<Histogram>& v, CmdLineOptions& ops)
+{
+    for (long i = 0; i <= ops.maxInsert; i++)
+    {
+        v.push_back(Histogram(1));
+    }
+}
+
+
+
+// Fold one sampled FCD curve into the per-offset histograms: each height
+// is scaled to integer units of 0.001 and counted once in its histogram.
+void updateFCDHists(vector<Histogram>& hists, vector<double>& heights)
+{
+    vector<Histogram>::size_type nHists = hists.size();
+    for (vector<Histogram>::size_type pos = 0; pos != nHists; pos++)
+    {
+        hists[pos].add(1000 * heights[pos], 1);
+    }
+}
diff --git a/src/bam2fragCov.cpp b/src/bam2fragCov.cpp
new file mode 100644
index 0000000..738f62c
--- /dev/null
+++ b/src/bam2fragCov.cpp
@@ -0,0 +1,195 @@
+#include <stdio.h>
+#include <math.h>
+#include <stdlib.h>
+#include <iostream>
+#include <cstring>
+#include <fstream>
+#include <map>
+#include <set>
+#include <string>
+#include <utility>
+#include <algorithm>
+#include <iomanip>
+
+#include "coveragePlot.h"
+#include "utils.h"
+#include "api/BamMultiReader.h"
+#include "api/BamReader.h"
+
+using namespace BamTools;
+using namespace std;
+
+
+// Parsed command line for bam2fragCov (filled by parseOptions).
+struct CmdLineOptions
+{
+    string bamInfile;        // positional arg: input BAM
+    long minInsert;          // positional arg: minimum insert size cutoff
+    long maxInsert;          // positional arg: maximum insert size cutoff
+    uint16_t minMapQuality;  // reads below this mapping quality are skipped (default 0)
+    unsigned long sample;    // -s: number of bases to report before stopping
+};
+
+void updateCovInfo(unsigned long pos, CoveragePlot& covPlot, multiset<pair<unsigned long, unsigned long> >& frags, vector<unsigned long>& covVals);
+
+// deals with command line options: fills the options struct
+void parseOptions(int argc, char** argv, CmdLineOptions& ops);
+
+// Prints per-base "inner" fragment coverage (reference name, 1-based
+// position, depth) from a coordinate-sorted BAM, stopping after
+// options.sample bases.  Returns 0 on success, 1 if the BAM cannot open.
+int main(int argc, char* argv[])
+{
+    CmdLineOptions options;
+    parseOptions(argc, argv, options);
+    BamReader bamReader;
+    BamAlignment bamAlign;
+    SamHeader header;
+    RefVector references;
+    int32_t currentRefID = -1;       // -1 until the first usable record is seen
+    string currentRefIDstring = "";
+    bool firstRecord = true;
+    vector<unsigned long> fragCoverageVals;
+    CoveragePlot fragCovPlot(options.maxInsert, 0);
+    multiset<pair<unsigned long, unsigned long> > fragments; // fragment positions sorted by start position
+    unsigned long baseCounter = 0;   // total bases reported so far
+
+    // Go through input bam file getting insert coverage
+    if (!bamReader.Open(options.bamInfile))
+    {
+        cerr << "Error opening bam file '" << options.bamInfile << "'" << endl;
+        return 1;
+    }
+
+    header = bamReader.GetHeader();
+    references = bamReader.GetReferenceData();
+
+    while (bamReader.GetNextAlignmentCore(bamAlign))
+    {
+        if (!bamAlign.IsMapped()
+            || bamAlign.IsDuplicate()
+            || bamAlign.MapQuality < options.minMapQuality)
+        {
+            continue;
+        }
+
+        // Deal with the case when we find a new reference sequence in the bam
+        if (currentRefID != bamAlign.RefID)
+        {
+            if (firstRecord)
+            {
+                firstRecord = false;
+            }
+            else
+            {
+                // flush and report coverage for the sequence just finished
+                updateCovInfo(references[currentRefID].RefLength, fragCovPlot, fragments, fragCoverageVals);
+                for (unsigned long i = 0; i < fragCoverageVals.size(); i++)
+                {
+                    cout << currentRefIDstring << '\t' << i + 1 << '\t' << fragCoverageVals[i] << '\n';
+                    if (baseCounter > options.sample) return 0;
+                    baseCounter++;
+                }
+            }
+
+            currentRefID = bamAlign.RefID;
+            currentRefIDstring = references[bamAlign.RefID].RefName;
+            fragCovPlot = CoveragePlot(options.maxInsert, 0);
+            fragments.clear();
+            fragCoverageVals.clear();
+        }
+
+        short pairOrientation = getPairOrientation(bamAlign);
+        updateCovInfo(bamAlign.Position, fragCovPlot, fragments, fragCoverageVals);
+
+        // if correct orienation (but insert size could be good or bad)
+        if (pairOrientation == INNIE)
+        {
+            // update fragment coverage. We only want
+            // to count each fragment once: in a sorted bam the first appearance
+            // of a fragment is when the insert size is positive
+            if (options.minInsert <= bamAlign.InsertSize && bamAlign.InsertSize <= options.maxInsert)
+            {
+                int64_t fragStart = bamAlign.GetEndPosition() + 1;
+                int64_t fragEnd = bamAlign.MatePosition - 1;
+                if (fragStart <= fragEnd) fragments.insert(fragments.end(), make_pair(fragStart, fragEnd));
+            }
+        }
+    }
+
+    // print the remaining stats from the last ref sequence in the bam.
+    // Bug fix: if no record passed the filters, currentRefID is still -1
+    // and indexing 'references' with it is undefined behaviour, so only
+    // flush when at least one reference was actually processed.
+    if (currentRefID != -1)
+    {
+        updateCovInfo(references[currentRefID].RefLength, fragCovPlot, fragments, fragCoverageVals);
+        for (unsigned long i = 0; i < fragCoverageVals.size(); i++)
+        {
+            cout << currentRefIDstring << '\t' << i + 1 << '\t' << fragCoverageVals[i] << '\n';
+            if (baseCounter > options.sample) return 0;
+            baseCounter++;
+        }
+    }
+
+    return 0;
+}
+
+
+// Parse the command line into 'ops'.  Positional (required) arguments:
+//   <in.bam> <min insert> <max insert>
+// Option: -s <int> number of bases to sample.
+// Prints usage and exits with status 1 on any parsing error.
+void parseOptions(int argc, char** argv, CmdLineOptions& ops)
+{
+    string usage;
+    short requiredArgs = 3;
+    int i;
+
+    // Bug fix: 'usage' was never assigned, so the final argument check
+    // printed an empty string.  Build the message once and use it in
+    // every error path.  The documented -s default is also corrected to
+    // match the value actually set below (1000000, not 2000000).
+    usage = "usage:\nbam2fragCov [options] <in.bam> <min insert> <max insert>\n\n\
+Gets fragment coverage from a BAM file. Uses 'inner' fragments, i.e. the\n\
+inner mate pair distance (or the fragment size, minus the length of reads)\n\
+min/max insert should be cutoffs for the insert size.\n\
+options:\n\n\
+-s <int>\n\tNumber of bases to sample [1000000]\n\
+";
+
+    if (argc < requiredArgs)
+    {
+        cerr << usage;
+        exit(1);
+    }
+
+    // set defaults
+    ops.minMapQuality = 0;
+    ops.sample = 1000000;
+
+    for (i = 1; i < argc - requiredArgs; i++)
+    {
+        // deal with booleans (none at present)
+
+        // non booleans are of form -option value, so check
+        // next value in array is there before using it!
+        if (strcmp(argv[i], "-s") == 0)
+        {
+            ops.sample = atoi(argv[i+1]);
+        }
+        else
+        {
+            cerr << "Error! Switch not recognised: " << argv[i] << endl;
+            exit(1);
+        }
+        i++;   // skip the value consumed above
+    }
+
+    if (argc - i != requiredArgs || argv[i+1][0] == '-')
+    {
+        cerr << usage;
+        exit(1);
+    }
+
+    ops.bamInfile = argv[i];
+    ops.minInsert = atoi(argv[i+1]);
+    ops.maxInsert = atoi(argv[i+2]);
+}
+
+
+// Advance the coverage plot one base at a time up to (but not including)
+// 'pos', appending the fragment depth at each base to covVals.  Fragments
+// whose start coordinate is reached are consumed from 'frags' and added
+// to the plot; 'frags' is ordered by start, so only the front is checked.
+void updateCovInfo(unsigned long pos, CoveragePlot& covPlot, multiset<pair<unsigned long, unsigned long> >& frags, vector<unsigned long>& covVals)
+{
+    unsigned long base = covVals.size();
+    while (base < pos)
+    {
+        covPlot.increment();
+        // pull in every fragment that begins at this base, if any
+        while (!frags.empty() && frags.begin()->first == base)
+        {
+            covPlot.add(frags.begin()->second);
+            frags.erase(frags.begin());
+        }
+        covVals.push_back(covPlot.depth());
+        base++;
+    }
+}
+
diff --git a/src/bam2insert.cpp b/src/bam2insert.cpp
new file mode 100644
index 0000000..4d957c2
--- /dev/null
+++ b/src/bam2insert.cpp
@@ -0,0 +1,190 @@
+#include <iostream>
+#include <cstring>
+#include <fstream>
+#include <string>
+
+#include "utils.h"
+#include "histogram.h"
+#include "api/BamMultiReader.h"
+#include "api/BamReader.h"
+
+using namespace BamTools;
+using namespace std;
+
// Parsed command line options for bam2insert
struct CmdLineOptions
{
    unsigned int binWidth;   // histogram bin width (-b)
    long minInsert;          // ignore fragments shorter than this (-m)
    long maxInsert;          // ignore fragments longer than this (-n)
    unsigned long sample;    // stop after this many fragments; 0 = no limit (-s)
    string bamInfile;        // input BAM file
    string faiInfile;        // fasta index (.fai) of the assembly
    string outprefix;        // prefix for all output files
};
+
+// deals with command line options: fills the options struct
+void parseOptions(int argc, char** argv, CmdLineOptions& ops);
+
// Collects insert-size histograms (innies/outies/same-direction pairs) from a
// BAM file, plots each, and writes summary stats for the innies to
// <outprefix>.stats.txt.
int main(int argc, char* argv[])
{
    CmdLineOptions options;
    parseOptions(argc, argv, options);
    BamReader bamReader;
    BamAlignment bamAlign;
    SamHeader header;
    RefVector references;
    Histogram innies(options.binWidth);   // insert sizes of pairs pointing at each other
    Histogram outies(options.binWidth);   // ... pointing away from each other
    Histogram samies(options.binWidth);   // ... pointing in the same direction
    string out_stats = options.outprefix + ".stats.txt";
    unsigned long fragCounter = 0;        // fragments seen so far, for -s sampling
    vector<pair< string, unsigned long> > sequencesAndLengths;
    orderedSeqsFromFai(options.faiInfile, sequencesAndLengths);

    if (!bamReader.Open(options.bamInfile))
    {
        cerr << "Error opening bam file '" << options.bamInfile << "'" << endl;
        return 1;
    }

    header = bamReader.GetHeader();
    references = bamReader.GetReferenceData();

    // walk the reference sequences in .fai order, collecting insert sizes
    for (vector<pair< string, unsigned long> >:: iterator iter = sequencesAndLengths.begin(); iter != sequencesAndLengths.end(); iter++)
    {
        // NOTE(review): GetReferenceID returns -1 for a sequence missing from
        // the BAM header and SetRegion's return value is unchecked; the region
        // also starts at 1 although BamTools coordinates are 0-based, which
        // looks like it skips alignments starting at the first base -- confirm.
        int id = bamReader.GetReferenceID(iter->first);
        bamReader.SetRegion(id, 1, id, iter->second);

        while (bamReader.GetNextAlignmentCore(bamAlign))
        {
            // skip unmapped/duplicate reads and inserts outside [min, max]
            if (!bamAlign.IsMapped() || bamAlign.IsDuplicate()
                || bamAlign.InsertSize > options.maxInsert || bamAlign.InsertSize < options.minInsert) continue;

            short pairOrientation = getPairOrientation(bamAlign);

            // -s sampling: only count fragments when a limit was requested
            fragCounter += options.sample ? 1 : 0;
            if (options.sample && fragCounter > options.sample) break;

            if (pairOrientation == DIFF_CHROM || pairOrientation == UNPAIRED) continue;
            // keep only reads whose mate starts at/after this read's end and
            // whose insert size is positive, so each pair is counted once
            if (bamAlign.MatePosition < bamAlign.GetEndPosition() || bamAlign.InsertSize <= 0) continue;

            if (pairOrientation == SAME)
            {
                samies.add(bamAlign.InsertSize, 1);
            }
            else if (pairOrientation == INNIE)
            {
                innies.add(bamAlign.InsertSize, 1);
            }
            else if (pairOrientation == OUTTIE)
            {
                outies.add(bamAlign.InsertSize, 1);
            }
        }

        if (options.sample && fragCounter > options.sample) break;
    }

    // Make a plot of each histogram
    innies.plot(options.outprefix + ".in", "pdf", "Insert size", "Frequency");
    outies.plot(options.outprefix + ".out", "pdf", "Insert size", "Frequency");
    samies.plot(options.outprefix + ".same", "pdf", "Insert size", "Frequency");

    // print some stats for the innies
    ofstream ofs(out_stats.c_str());
    if (!ofs.good())
    {
        cerr << "Error opening file '" << out_stats << "'" << endl;
        return 1;
    }

    double mean, sd, pc1, pc99;
    unsigned long x;
    ofs.precision(0);
    fixed(ofs);   // fixed-point, zero decimals, for all numbers below
    innies.meanAndStddev(mean, sd);
    innies.endPercentiles(pc1, pc99);
    ofs << "mean\t" << mean << "\n"
        << "sd\t" << sd << "\n"
        << "mode\t" << innies.mode(x) << "\n"
        << "pc1\t" << pc1 << "\n"
        << "pc99\t" << pc99 << "\n";

    ofs.close();
    return 0;
}
+
+
+void parseOptions(int argc, char** argv, CmdLineOptions& ops)
+{
+ string usage;
+ short requiredArgs = 3;
+ int i;
+
+ usage = "[options] <in.bam> <assembly.fa.fai> <outfiles prefix>\n\n\
+options:\n\n\
+-b <int>\n\tBin width to use when making histograms [10]\n\
+-m <int>\n\tMin insert size [0]\n\
+-n <int>\n\tMax insert size [20000]\n\
+-s <int>\n\tMax number of fragments to sample. Using -s N\n\
+\twill take the first N fragments from the BAM file [no max]\n\
+";
+
+ if (argc == 2 && strcmp(argv[1], "--wrapperhelp") == 0)
+ {
+ cerr << usage << endl;
+ exit(1);
+ }
+ else if (argc < requiredArgs)
+ {
+ cerr << "usage:\nbam2insert " << usage;
+ exit(1);
+ }
+
+ // set defaults
+ ops.binWidth = 10;
+ ops.minInsert = 0;
+ ops.maxInsert = 20000;
+ ops.sample = 0;
+
+ for (i = 1; i < argc - requiredArgs; i++)
+ {
+ // deal with booleans (there aren't any (yet...))
+
+ // non booleans are of form -option value, so check
+ // next value in array is there before using it!
+ if (strcmp(argv[i], "-b") == 0)
+ {
+ ops.binWidth = atoi(argv[i+1]);
+ }
+ else if (strcmp(argv[i], "-m") == 0)
+ {
+ ops.minInsert = atoi(argv[i+1]);
+ }
+ else if (strcmp(argv[i], "-n") == 0)
+ {
+ ops.maxInsert = atoi(argv[i+1]);
+ }
+ else if (strcmp(argv[i], "-s") == 0)
+ {
+ ops.sample = atoi(argv[i+1]);
+ }
+ else
+ {
+ cerr << "Error! Switch not recognised: " << argv[i] << endl;
+ exit(1);
+ }
+ i++;
+ }
+
+ if (argc - i != requiredArgs || argv[i+1][0] == '-')
+ {
+ cerr << usage;
+ exit(1);
+ }
+
+ ops.bamInfile = argv[i];
+ ops.faiInfile = argv[i+1];
+ ops.outprefix = argv[i+2];
+}
+
diff --git a/src/bam2perfect.cpp b/src/bam2perfect.cpp
new file mode 100644
index 0000000..2127a41
--- /dev/null
+++ b/src/bam2perfect.cpp
@@ -0,0 +1,224 @@
+#include <stdio.h>
+#include <math.h>
+#include <stdlib.h>
+#include <iostream>
+#include <cstring>
+#include <fstream>
+#include <map>
+#include <set>
+#include <string>
+#include <utility>
+#include <algorithm>
+#include <iomanip>
+
+#include "utils.h"
+#include "api/BamMultiReader.h"
+#include "api/BamReader.h"
+#include "api/BamWriter.h"
+
+using namespace BamTools;
+using namespace std;
+
+
// Parsed command line arguments for bam2perfect
struct CmdLineOptions
{
    string bamInfile;                    // input BAM file
    string outPrefix;                    // prefix for <prefix>.perfect.bam / .repetitive.bam
    long minInsert;                      // smallest insert size a perfect pair may have
    long maxInsert;                      // largest insert size a perfect pair may have
    uint16_t maxRepetitiveQuality;       // mapped reads with quality <= this go to the repetitive BAM
    uint16_t minPerfectQuality;          // perfect reads need mapping quality >= this
    unsigned long perfectAlignmentScore; // perfect reads need an AS tag >= this
};
+
+
+// deals with command line options: fills the options struct
+void parseOptions(int argc, char** argv, CmdLineOptions& ops);
+
+// Writes alignments, up to the position endPos
+void writeAlignments(multimap<unsigned long, BamAlignment>& alignments, BamWriter& bamWriterPerfect, unsigned long endPos);
+
+int main(int argc, char* argv[])
+{
+ CmdLineOptions options;
+ parseOptions(argc, argv, options);
+ BamReader bamReader;
+ BamAlignment bamAlign;
+ SamHeader header;
+ RefVector references;
+ int32_t currentRefID = -1;
+ bool firstRecord = true;
+ map<string, BamAlignment> perfectBamAlignments;
+ multimap<unsigned long, BamAlignment> alignmentsToWritePerfect; // kept in reference pos order
+ BamWriter bamWriterPerfect, bamWriterRepetitive;
+ string perfectOut = options.outPrefix + ".perfect.bam";
+ string repetitiveOut = options.outPrefix + ".repetitive.bam";
+
+ if (!bamReader.Open(options.bamInfile))
+ {
+ cerr << "Error opening bam file '" << options.bamInfile << "'" << endl;
+ return 1;
+ }
+
+ header = bamReader.GetHeader();
+ references = bamReader.GetReferenceData();
+ bamWriterPerfect.Open(perfectOut, header, references);
+ bamWriterRepetitive.Open(repetitiveOut, header, references);
+
+ while (bamReader.GetNextAlignmentCore(bamAlign))
+ {
+ if (currentRefID != bamAlign.RefID)
+ {
+ if (firstRecord)
+ {
+ firstRecord = false;
+ }
+ else
+ {
+ writeAlignments(alignmentsToWritePerfect, bamWriterPerfect, references[currentRefID].RefLength);
+ perfectBamAlignments.clear();
+ alignmentsToWritePerfect.clear();
+ }
+
+ currentRefID = bamAlign.RefID;
+ }
+
+ if (bamAlign.IsMapped() && bamAlign.Position > 2 * options.maxInsert)
+ {
+ writeAlignments(alignmentsToWritePerfect, bamWriterPerfect, bamAlign.Position - 2 * options.maxInsert);
+ }
+
+ if (bamAlign.IsDuplicate()) continue;
+
+ bamAlign.BuildCharData();
+ bool alignmentIsPerfect = true;
+
+ if (!bamAlign.IsMapped()
+ || !bamAlign.IsMateMapped()
+ || bamAlign.MapQuality < options.minPerfectQuality)
+ {
+ alignmentIsPerfect = false;
+ }
+ else
+ {
+ uint32_t alignmentScore;
+ if (!bamAlign.GetTag("AS", alignmentScore))
+ {
+ cerr << "Read " << bamAlign.Name << " doesn't have an alignment score AS:... Cannot continue" << endl;
+ return(1);
+ }
+ if (alignmentScore < options.perfectAlignmentScore)
+ {
+ alignmentIsPerfect = false;
+ }
+ else
+ {
+ short pairOrientation = getPairOrientation(bamAlign);
+
+ if (pairOrientation != INNIE
+ || options.minInsert > abs(bamAlign.InsertSize)
+ || abs(bamAlign.InsertSize) > options.maxInsert)
+ {
+ alignmentIsPerfect = false;
+ }
+ }
+ }
+
+ multimap<string, BamAlignment>::iterator iter = perfectBamAlignments.find(bamAlign.Name);
+ if (alignmentIsPerfect)
+ {
+ bamAlign.SetIsProperPair(1);
+ if (iter == perfectBamAlignments.end())
+ {
+ perfectBamAlignments[bamAlign.Name] = bamAlign;
+ }
+ else
+ {
+ alignmentsToWritePerfect.insert(make_pair(iter->second.Position, iter->second));
+ perfectBamAlignments.erase(iter);
+ alignmentsToWritePerfect.insert(make_pair(bamAlign.Position, bamAlign));
+ }
+ }
+ else
+ {
+ if (iter != perfectBamAlignments.end())
+ {
+ perfectBamAlignments.erase(iter);
+ }
+ }
+
+ if (bamAlign.IsMapped() && bamAlign.MapQuality <= options.maxRepetitiveQuality)
+ {
+ bamWriterRepetitive.SaveAlignment(bamAlign);
+ }
+ }
+
+ writeAlignments(alignmentsToWritePerfect, bamWriterPerfect, references[currentRefID].RefLength);
+
+ bamWriterPerfect.Close();
+ bamWriterRepetitive.Close();
+ return 0;
+}
+
+
+void parseOptions(int argc, char** argv, CmdLineOptions& ops)
+{
+ string usage;
+ short requiredArgs = 7;
+ int i;
+
+ if (argc < requiredArgs)
+ {
+ cerr << "usage:\nbam2fragCov [options] <in.bam> <out prefix> <min insert> <max insert> <repetitive qual max> <perfect qual min> <min alignment score>\n\n\
+options:\n\n\
+-q <int>\n\tMinimum mapping quality [10]\n\n\
+Writes new BAM file contining only read pairs which:\n\
+ - point towards each other\n\
+ - are in the given insert size range\n\
+ - both have at least the chosen mapping quality (set by -q)\n\
+ - Have an alignment score >= the chosen value\n\n\
+It ignores whether or not the reads are flagged as proper pairs. All reads in the\n\
+output BAM will be set to be proper pairs (0x0002)\n\
+";
+ exit(1);
+ }
+
+ for (i = 1; i < argc - requiredArgs; i++)
+ {
+ // fill this in if options get added...
+ }
+
+ if (argc - i != requiredArgs || argv[i+1][0] == '-')
+ {
+ cerr << usage;
+ exit(1);
+ }
+
+ ops.bamInfile = argv[i];
+ ops.outPrefix = argv[i+1];
+ ops.minInsert = atoi(argv[i+2]);
+ ops.maxInsert = atoi(argv[i+3]);
+ ops.maxRepetitiveQuality = atoi(argv[i+4]);
+ ops.minPerfectQuality = atoi(argv[i+5]);
+ ops.perfectAlignmentScore = atoi(argv[i+6]);
+}
+
+
+void writeAlignments(multimap<unsigned long, BamAlignment>& alignments, BamWriter& bamWriterPerfect, unsigned long endPos)
+{
+ set<unsigned long> toErase;
+
+ for (map<unsigned long, BamAlignment>::iterator p = alignments.begin(); p != alignments.end(); p++)
+ {
+ if (p->second.Position >= endPos) break;
+ bamWriterPerfect.SaveAlignment(p->second);
+ toErase.insert(p->first);
+ }
+
+ for (set<unsigned long>::iterator p = toErase.begin(); p != toErase.end(); p++)
+ {
+ alignments.erase(*p);
+ }
+
+}
+
diff --git a/src/bamtools b/src/bamtools
new file mode 120000
index 0000000..cfb8ee8
--- /dev/null
+++ b/src/bamtools
@@ -0,0 +1 @@
+../third_party/bamtools
\ No newline at end of file
diff --git a/src/coveragePlot.cpp b/src/coveragePlot.cpp
new file mode 100644
index 0000000..f9211e0
--- /dev/null
+++ b/src/coveragePlot.cpp
@@ -0,0 +1,45 @@
+#include "coveragePlot.h"
+
+CoveragePlot::CoveragePlot()
+{
+ CoveragePlot(0, 0);
+}
+
+CoveragePlot::CoveragePlot(unsigned long n, unsigned long pos) : coord_(pos), depth_(0)
+{
+ depthDiff_ = deque<unsigned long>(n + 1, 0);
+}
+
+
// Advance the current position one base to the right; reads whose final base
// has now been passed are removed from the depth.
void CoveragePlot::increment()
{
    coord_++;
    depth_ -= depthDiff_[0];
    depthDiff_.pop_front();
    depthDiff_.push_back(0);
}

// Depth (number of overlapping reads) at the current position.
unsigned long CoveragePlot::depth()
{
    return depth_;
}


// Count in the head slot of the diff queue, i.e. the number of reads whose
// final base is at position coord_ - 1.
unsigned long CoveragePlot::front()
{
    return depthDiff_[0];
}


// Register a read whose final base is at position n.
// NOTE(review): no bounds checking -- n must satisfy
// coord_ - 1 <= n <= coord_ + depthDiff_.size() - 2, otherwise the deque is
// indexed out of range.
void CoveragePlot::add(unsigned long n)
{
    depth_++;
    depthDiff_[n - coord_ + 1]++;
}


// Current reference position of the plot.
unsigned long CoveragePlot::position()
{
    return coord_;
}
+
diff --git a/src/coveragePlot.h b/src/coveragePlot.h
new file mode 100644
index 0000000..e643838
--- /dev/null
+++ b/src/coveragePlot.h
@@ -0,0 +1,38 @@
+#ifndef COVERAGEPLOT_H
+#define COVERAGEPLOT_H
+
+#include <deque>
+
+
+using namespace std;
+
+
// Running read/fragment depth at a single moving reference position.
// Reads are registered with their end coordinate; advancing the position
// drops each read from the depth once its end has been passed.
class CoveragePlot
{
public:
    // default constructor
    CoveragePlot();
    // construct a coverage plot, with max read length n and position pos
    CoveragePlot(unsigned long n, unsigned long pos);

    // move the coord of the plot 1 base to the right
    void increment();

    // returns the coverage at the current base
    unsigned long depth();

    // return the zero element depth (reads ending just before the current base)
    unsigned long front();

    // add a read that ends at position n
    void add(unsigned long n);

    // returns the position of the plot
    unsigned long position();

private:
    unsigned long coord_;              // current reference position
    unsigned long depth_;              // depth at coord_
    deque<unsigned long> depthDiff_;   // depthDiff_[k]: reads leaving the depth after k more increments
};
+
#endif // COVERAGEPLOT_H
diff --git a/src/errorWindow.cpp b/src/errorWindow.cpp
new file mode 100644
index 0000000..9efbbee
--- /dev/null
+++ b/src/errorWindow.cpp
@@ -0,0 +1,61 @@
+#include "errorWindow.h"
+
+
+ErrorWindow::ErrorWindow() {}
+
+ErrorWindow::ErrorWindow(unsigned long coord, double min, double max, unsigned long minLength, double minPC, bool useMin, bool useMax) : coord_(coord), min_(min), max_(max), minLength_(minLength), minPC_(minPC), useMin_(useMin), useMax_(useMax) {}
+
+
// Reset the window so it starts again from the given position.
void ErrorWindow::clear(unsigned long coord)
{
    coord_ = coord;
    passOrFail_.clear();
    failCoords_.clear();
}

// Position of the first failing value (or the window start when none failed).
unsigned long ErrorWindow::start()
{
    return failCoords_.size() ? failCoords_.front() : coord_;
}

// Position of the last failing value (or the window start when none failed).
unsigned long ErrorWindow::end()
{
    return failCoords_.size() ? failCoords_.back() : coord_;
}

// True iff the stretch spanned by the failures is at least minLength_ long
// and at least a fraction minPC_ of its positions failed.
bool ErrorWindow::fail()
{
    unsigned long size = end() - start() + 1;
    return size >= minLength_ && 1.0 * failCoords_.size() / size >= minPC_;
}


// True iff the most recently added value failed the min/max test.
bool ErrorWindow::lastFail()
{
    return passOrFail_.size() && passOrFail_.back();
}
+
// Append the value observed at position 'pos'.  A value fails when it is
// below min_ (if useMin_) or above max_ (if useMax_).  Once more than
// minLength_ values are held, the oldest is discarded so the window slides.
// NOTE(review): positions are assumed to be consecutive -- coord_ is simply
// incremented when the window slides, and 'pos' itself is only stored for
// failures.
void ErrorWindow::add(unsigned long pos, double val)
{
    if ( (useMin_ && val < min_) || (useMax_ && val > max_) )
    {
        failCoords_.push_back(pos);
        passOrFail_.push_back(true);
    }
    else
    {
        passOrFail_.push_back(false);
    }

    // slide: drop the oldest observation (and its fail record, if any)
    if (passOrFail_.size() > minLength_)
    {
        if (passOrFail_.front())
        {
            failCoords_.pop_front();
        }
        passOrFail_.pop_front();
        coord_++;
    }
}
+
+
diff --git a/src/errorWindow.h b/src/errorWindow.h
new file mode 100644
index 0000000..be69fc5
--- /dev/null
+++ b/src/errorWindow.h
@@ -0,0 +1,40 @@
+#ifndef ERRORWINDOW_H
+#define ERRORWINDOW_H
+
+#include <deque>
+
+using namespace std;
+
// Sliding window over per-base values, used to find stretches where too many
// positions fall outside an accepted [min, max] range.
class ErrorWindow
{
public:
    // default constructor
    ErrorWindow();

    // construct a window starting at 'coord'; values outside [min, max]
    // (subject to useMin/useMax) count as failures, and fail() reports
    // stretches of at least minLength positions of which at least a
    // fraction minPC failed
    ErrorWindow(unsigned long coord, double min, double max, unsigned long minLength, double minPC, bool useMin, bool useMax);

    void clear(unsigned long start); // resets to the given position

    unsigned long start(); // returns start position of window
    unsigned long end(); // returns end position of window

    bool fail(); // returns true iff window is bad

    bool lastFail(); // returns true iff last element added failed the test

    // add a number to the end of the window
    void add(unsigned long pos, double val);

private:
    unsigned long coord_;
    double min_, max_; // values in [min_, max_] are OK
    unsigned long minLength_; // min length of window to consider
    double minPC_; // min % of positions to be failures across min length of window
    bool useMin_, useMax_;
    deque<bool> passOrFail_; // 1 == fail
    deque<unsigned long> failCoords_; // positions of the failures currently in the window

};
+
#endif // ERRORWINDOW_H
diff --git a/src/fa2gaps.cpp b/src/fa2gaps.cpp
new file mode 100644
index 0000000..f3506f3
--- /dev/null
+++ b/src/fa2gaps.cpp
@@ -0,0 +1,47 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <iostream>
+#include <cstring>
+#include <fstream>
+#include <string>
+#include <list>
+
+#include "fasta.h"
+
+using namespace std;
+
+int main(int argc, char* argv[])
+{
+ if (argc != 2)
+ {
+ cerr << "usage:\nfa2gaps <in.fasta>" << endl;
+ exit(1);
+ }
+
+ Fasta fa;
+ string infile = argv[1];
+
+ ifstream ifs(infile.c_str());
+
+ if (!ifs.good())
+ {
+ cerr << "Error opening file '" << infile << "'" << endl;
+ exit(1);
+ }
+
+ // for each sequence in the input file, print the gap locations
+ while (fa.fillFromFile(ifs))
+ {
+ list<pair<unsigned long, unsigned long> > gaps;
+ fa.findGaps(gaps);
+
+ for(list<pair<unsigned long, unsigned long> >::iterator p = gaps.begin(); p != gaps.end(); p++)
+ {
+ cout << fa.id << '\t' << p->first + 1 << '\t' << p->second + 1 << '\n';
+ }
+ }
+
+ ifs.close();
+ return 0;
+}
+
diff --git a/src/fa2gc.cpp b/src/fa2gc.cpp
new file mode 100644
index 0000000..66d1e1c
--- /dev/null
+++ b/src/fa2gc.cpp
@@ -0,0 +1,187 @@
+#include <stdio.h>
+#include <math.h>
+#include <stdlib.h>
+#include <iostream>
+#include <cstring>
+#include <fstream>
+#include <map>
+#include <string>
+#include <utility>
+#include <algorithm>
+#include <iomanip>
+
+#include "fasta.h"
+
+using namespace std;
+
+
// Parsed command line options for fa2gc
struct CmdLineOptions
{
    unsigned long windowWidth;   // -w: GC window width (forced to an odd number)
    string fastaInfile;          // input fasta file
};
+
+// deals with command line options: fills the options struct
+void parseOptions(int argc, char** argv, CmdLineOptions& ops);
+
+
// For every sequence in the input fasta, prints one line per base:
//   <seq id> TAB <1-based position> TAB <GC percent of the surrounding window>
// Gap (N) runs are excluded: GC is computed per gap-free section, and
// positions inside a gap are reported with GC 0.
int main(int argc, char* argv[])
{
    CmdLineOptions options;
    parseOptions(argc, argv, options);
    Fasta fa;
    ifstream ifs(options.fastaInfile.c_str());
    cout.precision(0);   // GC is printed as a whole-number percentage

    if (!ifs.good())
    {
        cerr << "Error opening file '" << options.fastaInfile << "'" << endl;
        exit(1);
    }

    // for each sequence in the input file, print
    // gc around each base
    while (fa.fillFromFile(ifs))
    {
        transform(fa.seq.begin(), fa.seq.end(), fa.seq.begin(), ::toupper);
        list<pair<unsigned long, unsigned long> > gaps;
        list<pair<unsigned long, unsigned long> > sections;  // gap-free stretches

        fa.findGaps(gaps);
        unsigned long halfWindow = options.windowWidth / 2;
        list<pair<unsigned long, unsigned long> >::iterator gapsIter;

        // split the sequence into gap-free sections (0-based inclusive coords)
        if (gaps.empty())
        {
            sections.push_back(make_pair(0, fa.length() - 1));
        }
        else
        {
            unsigned long previousPos = 0;

            for (gapsIter = gaps.begin(); gapsIter != gaps.end(); gapsIter++)
            {
                sections.push_back(make_pair(previousPos, gapsIter->first - 1));
                previousPos = gapsIter->second + 1;
            }

            sections.push_back(make_pair(gaps.back().second + 1, fa.length() - 1));
        }

        gapsIter = gaps.begin(); // this will point at the gap immediately after the current section
                                 // in the following loop

        for (list<pair<unsigned long, unsigned long> >::iterator sectionIter = sections.begin(); sectionIter != sections.end(); sectionIter++)
        {
            list<char> window;
            unsigned long window_size = 0; // keep track because list.size() is "up to linear", which seems to mean "linear".
            unsigned long gcCount = 0;
            unsigned long sectionLength = sectionIter->second - sectionIter->first + 1;
            double gc;

            // fill list up to window width (if section is not too short) and count the GC
            // NOTE(review): the <= bounds make the initial window hold
            // windowWidth + 1 characters (and gc is averaged over that many);
            // confirm whether this off-by-one is intended.
            while (window_size <= options.windowWidth && window_size <= sectionLength)
            {
                window.push_back(fa.seq[window_size + sectionIter->first]);
                window_size++;
                gcCount += window.back() == 'G' || window.back() == 'C' ? 1 : 0;
            }

            gc = 100.0 * gcCount / window_size;

            if (sectionLength <= options.windowWidth)
            {
                // short section: every base gets the section-wide GC
                for (unsigned long i = 0; i < sectionLength; i++)
                {
                    cout << fixed << fa.id << '\t' << sectionIter->first + i + 1 << '\t' << gc << '\n';
                }
            }
            else
            {
                // print the start: everything before half a window width
                for (unsigned long i = 0; i < halfWindow; i++)
                {
                    cout << fixed << fa.id << '\t' << sectionIter->first + i + 1 << '\t' << gc << '\n';
                }

                // print the middle part of the section, sliding the window
                // one base at a time
                for (unsigned long i = halfWindow; i + halfWindow < sectionLength; i++)
                {
                    window.push_back(fa.seq[sectionIter->first + i]);
                    gcCount += window.back() == 'G' || window.back() == 'C' ? 1 : 0;
                    gcCount -= window.front() == 'G' || window.front() == 'C' ? 1 : 0;
                    window.pop_front();
                    cout << fixed << fa.id << '\t' << sectionIter->first + i + 1 << '\t' << 100.0 * gcCount / window_size << '\n';
                }

                // print the end part of the section
                gc = 100.0 * gcCount / window_size;

                for (unsigned long i = halfWindow + 1; i < options.windowWidth; i++)
                {
                    cout << fixed << fa.id << '\t' << sectionIter->first + sectionLength - options.windowWidth + i + 1 << '\t' << gc << '\n';
                }
            }
            // do the GC over the next gap (if there is one)
            if (gapsIter != gaps.end())
            {
                unsigned long gapEnd = gapsIter->second;
                gapsIter++;

                // print the GC content over the gap (reported as 0)
                for (unsigned long i = sectionIter->second + 1; i <= gapEnd; i++)
                {
                    cout << fixed << fa.id << '\t' << i + 1 << '\t' << 0 << '\n';

                }
            }

        }
    }

    ifs.close();

    return 0;
}
+
+
+void parseOptions(int argc, char** argv, CmdLineOptions& ops)
+{
+ string usage;
+ short requiredArgs = 1;
+ int i;
+
+ usage = "[options] <in.fasta>\n\n\
+options:\n\n\
+-w <int>\n\tWindow width around each base used to calculate GC content [101]\n\
+";
+
+ if (argc <= requiredArgs)
+ {
+ cerr << "usage:\nfa2gc " << usage;
+ exit(1);
+ }
+
+ ops.windowWidth = 101;
+
+ for (i = 1; i < argc - requiredArgs; i++)
+ {
+ if (strcmp(argv[i], "-w") == 0)
+ {
+ ops.windowWidth = atoi(argv[i+1]);
+ }
+ else
+ {
+ cerr << "Error! Switch not recognised: " << argv[i] << endl;
+ exit(1);
+ }
+ i++;
+ }
+
+ // round up window to nearest odd number
+ ops.windowWidth += ops.windowWidth % 2 ? 0 : 1;
+
+ ops.fastaInfile = argv[i];
+}
+
+
diff --git a/src/fasta.cpp b/src/fasta.cpp
new file mode 100644
index 0000000..84083ec
--- /dev/null
+++ b/src/fasta.cpp
@@ -0,0 +1,131 @@
+#include "fasta.h"
+
// Default: empty id and empty sequence.
Fasta::Fasta() : id(""), seq("") {}
// Construct from a header name and a sequence (both copied).
Fasta::Fasta(string& name, string& s) : id(name), seq(s) {}
+
+
+void Fasta::print(ostream& outStream, unsigned int lineWidth) const
+{
+ if (lineWidth == 0)
+ {
+ outStream << ">" << id << endl << seq << endl;
+ }
+ else
+ {
+ outStream << ">" << id << endl;
+
+ for (unsigned int i = 0; i < length(); i += lineWidth)
+ {
+ outStream << seq.substr(i, lineWidth) << endl;
+ }
+ }
+}
+
+
// Returns a new record with the same id whose sequence is the 0-based
// inclusive range [start, end] of this one.  No bounds checking is done.
Fasta Fasta::subseq(unsigned long start, unsigned long end)
{
    Fasta fa;
    fa.id = id;
    fa.seq = seq.substr(start, end - start + 1);
    return fa;
}


// Number of characters (bases) in the sequence.
unsigned long Fasta::length() const
{
    return seq.length();
}
+
+
// Fills lIn with the 0-based inclusive (start, end) coordinates of every run
// of N/n characters in the sequence.  Any previous contents are discarded.
void Fasta::findGaps(list<pair<unsigned long, unsigned long> >& lIn)
{
    unsigned long pos = seq.find_first_of("nN");
    lIn.clear();

    while (pos != string::npos)
    {
        unsigned long start = pos;
        pos = seq.find_first_not_of("nN", pos);
        if (pos == string::npos)
        {
            // gap runs to the end of the sequence
            lIn.push_back(make_pair(start, seq.length() - 1));
        }
        else
        {
            lIn.push_back(make_pair(start, pos - 1));
            pos = seq.find_first_of("nN", pos);
        }
    }
}

// Number of distinct runs of Ns in the sequence.
unsigned long Fasta::getNumberOfGaps()
{
    list<pair<unsigned long, unsigned long> > gaps;
    findGaps(gaps);
    return gaps.size();
}
+
// Reads the next fasta record from the stream into id/seq, concatenating all
// sequence lines.  Returns true on success, false when the stream is
// exhausted; exits the program when the next line is not a '>' header.
// NOTE(review): indexing line[0] on an empty line, and the eof test straight
// after the first getline, make blank lines and files without a trailing
// newline edge cases -- the input is assumed to be well-formed fasta.
bool Fasta::fillFromFile(istream& inStream)
{
    string line;
    seq = "";
    id = "";
    getline(inStream, line);

    // check if we're at the end of the file
    if (inStream.eof())
    {
        return false;
    }
    // Expecting a header line. If not, abort
    else if (line[0] == '>')
    {
        id = line.substr(1);
    }
    else
    {
        cerr << "Error reading fasta file!" << endl
             << "Expected line starting with '>', but got this:" << endl
             << line << endl;
        exit(1);
    }

    // Next lines should be sequence, up to next header, or end of file
    while ((inStream.peek() != '>') && (!inStream.eof()))
    {
        getline(inStream, line);
        seq += line;
    }

    return true;
}
+
+
+unsigned long Fasta::nCount()
+{
+ return count(seq.begin(), seq.end(), 'n') + count(seq.begin(), seq.end(), 'N');
+}
+
+
// Removes runs of Ns from the start and end of the sequence.  startBases and
// endBases are set to the number of characters removed from each end.
// NOTE(review): a sequence consisting entirely of Ns is matched by both
// branches, so startBases + endBases can exceed the original length.
void Fasta::trimNs(unsigned long& startBases, unsigned long& endBases)
{
    list<pair<unsigned long, unsigned long> > gaps;
    findGaps(gaps);
    startBases = endBases = 0;
    if (gaps.size())
    {
        // trim the tail first, so the front gap's coordinates stay valid
        if (gaps.back().second == length() - 1)
        {
            endBases = gaps.back().second - gaps.back().first + 1;
            seq.resize(gaps.back().first);
        }

        if (gaps.front().first == 0)
        {
            startBases = gaps.front().second + 1;
            seq.erase(0, startBases);
        }
    }
}
+
+
diff --git a/src/fasta.h b/src/fasta.h
new file mode 100644
index 0000000..fad65de
--- /dev/null
+++ b/src/fasta.h
@@ -0,0 +1,50 @@
+#ifndef FASTA_H
+#define FASTA_H
+
+#include <iostream>
+#include <string>
+#include <fstream>
+#include <algorithm>
+#include <list>
+
+
+using namespace std;
+
// A single fasta record: a header (id) plus its sequence.
class Fasta
{
public:
    Fasta();
    Fasta(string& name, string& s);

    // prints the sequence to outStream, with lineWidth bases on each
    // line. If lineWidth = 0, no linebreaks are printed in the sequence.
    void print(ostream& outStream, unsigned int lineWidth = 60) const;

    // returns number of bases in sequence
    unsigned long length() const;

    // returns a new fasta object which is subseq of the original, using
    // 0-based inclusive coordinates [start, end].
    // ID will be same as original
    Fasta subseq(unsigned long start, unsigned long end);

    // fills list with 0-based inclusive (start, end) positions of each gap
    // (run of N/n characters) in the sequence
    void findGaps(list<pair<unsigned long, unsigned long> >& lIn);

    // returns the number of gaps (runs of Ns) in the sequence
    unsigned long getNumberOfGaps();

    // reads next sequence from file, filling contents appropriately
    // Returns true if worked ok, false if at end of file
    bool fillFromFile(istream& inStream);

    // Returns the number of 'N' + 'n' in the sequence
    unsigned long nCount();

    // Removes any Ns off the start/end of the sequence. Sets startBases and endBases to number
    // of bases trimmed off each end
    void trimNs(unsigned long& startBases, unsigned long& endBases);

    string id;   // header line, without the leading '>'
    string seq;  // the sequence itself
};
+
+#endif // FASTA_H
diff --git a/src/findknownsnps b/src/findknownsnps
new file mode 120000
index 0000000..4a6e636
--- /dev/null
+++ b/src/findknownsnps
@@ -0,0 +1 @@
+../third_party/snpomatic/findknownsnps
\ No newline at end of file
diff --git a/src/histogram.cpp b/src/histogram.cpp
new file mode 100644
index 0000000..b548410
--- /dev/null
+++ b/src/histogram.cpp
@@ -0,0 +1,401 @@
+#include "histogram.h"
+
+Histogram::Histogram(unsigned int bin)
+{
+ binWidth_ = bin;
+ plotLabelXcoordsLeftBin_ = false;
+ plotXdivide_ = 0;
+ plotTrim_ = 4;
+}
+
+
// Draws a random value from the distribution held in the histogram by
// inverting the CDF.  The inverse CDF (one entry per integer percentile,
// 0..100) is built lazily on first call and cached in reverseCDF_.
// NOTE(review): uses rand() without seeding and with % 101, so the sequence
// is deterministic per run and has a slight modulo bias.
unsigned long Histogram::sample()
{
    // work out the reverse CDF if not already known
    if (reverseCDF_.empty())
    {
        map<unsigned long, vector<unsigned long> > cdfMap;
        unsigned long sum = 0;
        unsigned long total = size();

        for (unsigned int i = 0; i < 101; i++)
        {
            reverseCDF_.push_back(0);
        }

        // first work out the cdf: bucket the bin keys by their (integer)
        // cumulative percentile
        for(map<unsigned long, unsigned long>::iterator p = bins_.begin(); p != bins_.end(); p++)
        {
            sum += p->second;
            unsigned long key = floor(100.0 * sum / total);
            cdfMap[key].push_back(p->first);
        }

        // each percentile gets the mean of the bin keys that mapped to it
        for(map<unsigned long, vector<unsigned long> >::iterator p = cdfMap.begin(); p != cdfMap.end(); p++)
        {
            // work out the mean of current vector
            unsigned long s = 0;
            for (unsigned long i = 0; i < p->second.size(); i++)
            {
                s+= p->second[i];
            }
            reverseCDF_[p->first] = floor(s / p->second.size());
        }

        // interpolate the missing (==0) values
        // NOTE(review): if index 0 itself is missing, "index - 1" wraps and
        // the inner loop is skipped (leading zeros stay zero); a run of zeros
        // at the very end would walk nextNonzero past the vector.  Neither
        // appears to occur for well-formed histograms, but confirm before
        // reusing this elsewhere.
        unsigned int index = 0;
        while (index < reverseCDF_.size())
        {
            if (reverseCDF_[index] == 0)
            {
                // find next non-zero value
                unsigned int nextNonzero = index;
                while (reverseCDF_[nextNonzero] == 0) {nextNonzero++;}

                // interpolate the missing values
                for (unsigned int k = index - 1; k < nextNonzero; k++)
                {
                    reverseCDF_[k] = floor(reverseCDF_[index - 1] + 1.0 * (reverseCDF_[nextNonzero] - reverseCDF_[index - 1]) * (k - index + 1) / (nextNonzero - index + 1));
                }
                index = nextNonzero + 1;
            }
            else
            {
                index++;
            }
        }
    }

    // pick a percentile at random and return its (approximate) bin value
    return reverseCDF_[rand() % 101];
}
+
+
+
// Add 'count' observations of value 'val'.  val is floored to the left edge
// of its bin, which is the key used in bins_.
void Histogram::add(unsigned long val, unsigned long count)
{
    val = binWidth_ * (val / binWidth_);
    bins_[val] += count;
}
+
+
+void Histogram::clear()
+{
+ bins_.clear();
+}
+
+
// True iff no values have been added.
bool Histogram::empty()
{
    return bins_.empty();
}


// The bin width the histogram was constructed with.
unsigned int Histogram::binWidth()
{
    return binWidth_;
}


// Count stored in the bin that 'pos' falls into (0 for an unused bin).
// NOTE(review): pos is signed but the bin keys are unsigned; a negative pos
// truncates towards zero before conversion -- callers pass non-negatives.
unsigned long Histogram::get(long pos)
{
    pos = binWidth_ * (pos / binWidth_);
    map<unsigned long, unsigned long>::iterator p = bins_.find(pos);
    return p == bins_.end() ? 0 : p->second;
}
+
+unsigned long Histogram::size()
+{
+ unsigned long size = 0;
+
+ for (map<unsigned long, unsigned long>::iterator iter = bins_.begin(); iter != bins_.end(); iter++)
+ {
+ size += iter->second;
+ }
+ return size;
+}
+
+
// Returns the midpoint of the fullest bin (0 when the histogram is empty);
// 'val' is set to that bin's count.  Ties resolve to the smallest bin key.
// NOTE(review): 'val' is left untouched when the histogram is empty.
double Histogram::mode(unsigned long& val)
{
    if (bins_.size() == 0) return 0;

    val = 0;
    unsigned long mode = 0;

    for (map<unsigned long, unsigned long>::iterator iter = bins_.begin(); iter != bins_.end(); iter++)
    {
        if (val < iter->second)
        {
            mode = iter->first;
            val = iter->second;
        }
    }

    return mode + 0.5 * binWidth_;
}


// Returns the mode restricted to bins within one standard deviation of the
// mean, falling back to the global mode when that is already near the mean.
double Histogram::modeNearMean()
{
    if (bins_.size() == 0) return 0;
    unsigned long val;
    double mean, testMode, stddev;
    testMode = mode(val);
    meanAndStddev(mean, stddev);

    // is the real mode ok?
    if (abs(mean - testMode) < stddev) return testMode;
// NOTE(review): unconditional trace to stderr -- looks like leftover debugging
cerr << "[modeNearMean] looking for mode near mean. testMode=" << testMode << ". mean=" << mean << ". stddev=" << stddev << endl;
    // look for a mode nearer the mean
    val = 0;
    testMode = 0;
    for (map<unsigned long, unsigned long>::iterator iter = bins_.begin(); iter != bins_.end(); iter++)
    {
        if (abs(iter->first + 0.5 * binWidth_ - mean) < stddev && val < iter->second)
        {
            testMode = iter->first;
            val = iter->second;
        }
    }

    return testMode + 0.5 * binWidth_;
}
+
// Returns (approximately) the value below which a fraction p (in [0,1]) of
// the observations left of the mode lie; used to characterise the left tail.
double Histogram::leftPercentile(double p)
{
    if (bins_.size() == 0) return 0;

    unsigned long sum = 0;

    // get the sum, up to the mode
    // NOTE(review): if the mode is the *last* bin this loop walks off the end
    // of the map and dereferences end() -- relies on the mode not being final.
    unsigned long total = 0;
    unsigned long modeValue;
    double thisMode = mode(modeValue);
    for (map<unsigned long, unsigned long>::iterator iter = bins_.begin(); (double) iter->first <= thisMode; iter++)
    {
        total += iter->second;
    }

    // only count half of the mode bin itself
    total -= modeValue / 2;

    for (map<unsigned long, unsigned long>::iterator iter = bins_.begin(); iter != bins_.end(); iter++)
    {
        if (sum + iter->second > total * p)
        {
            // interpolate between the two bins.
            // NOTE(review): yDiff is a fraction in [0,1]; dividing (rather
            // than multiplying) by binWidth_ makes the interpolation step
            // tiny -- confirm whether "yDiff * binWidth_" was intended.
            double yDiff = 1.0 * (1.0 * total * p - sum) / iter->second;
            return 1.0 * iter->first + 1.0 * yDiff / binWidth_;
        }
        sum += iter->second;
    }

    // fell through: return the midpoint of the last bin
    map<unsigned long, unsigned long>::iterator i = bins_.end();
    i--;
    return i->first + 0.5 * binWidth_;
}
+
+
// Sets first/last to the bin midpoints at roughly the 1st and 99th
// percentiles of the distribution.  Does nothing when the histogram is
// empty, so callers must pre-initialise the outputs.
void Histogram::endPercentiles(double& first, double& last)
{
    if (bins_.size() == 0) return;
    unsigned long sum = 0;
    unsigned long total = size();
    first = bins_.begin()->first + 0.5 * binWidth_;

    // walk up from the bottom until 1% of the mass has been passed
    for (map<unsigned long, unsigned long>::iterator iter = bins_.begin(); iter != bins_.end(); iter++)
    {
        sum += iter->second;
        if (sum > total * 0.01)
        {
            first = iter->first + 0.5 * binWidth_;
            break;
        }
    }

    sum = 0;
    last = bins_.rbegin()->first + 0.5 * binWidth_;

    // and down from the top for the 99th percentile
    for (map<unsigned long, unsigned long>::reverse_iterator iter = bins_.rbegin(); iter != bins_.rend(); iter++)
    {
        sum += iter->second;
        if (sum > total * 0.01)
        {
            last = iter->first + 0.5 * binWidth_;
            break;
        }
    }
}
+
// Midpoint of the lowest occupied bin, or -1 when the histogram is empty.
double Histogram::minimumBin()
{
    if (bins_.size() == 0)
        return -1;

    return bins_.begin()->first + 0.5 * binWidth_;
}

// const iterator over (bin left edge, count) pairs, in ascending key order
map<unsigned long, unsigned long>::const_iterator Histogram::begin()
{
    return bins_.begin();
}


// end iterator matching begin()
map<unsigned long, unsigned long>::const_iterator Histogram::end()
{
    return bins_.end();
}
+
+
// Computes the mean and the (population) standard deviation, treating every
// observation in a bin as sitting at the bin midpoint.  Leaves the outputs
// untouched when the histogram is empty.
void Histogram::meanAndStddev(double& mean, double& stddev)
{
    if (bins_.size() == 0) return;

    double sum = 0;
    unsigned long count = 0;

    // first work out the mean
    for (map<unsigned long, unsigned long>::iterator p = bins_.begin(); p != bins_.end(); p++)
    {
        sum += p->second * (p->first + 0.5 * binWidth_);
        count += p->second;
    }

    mean = sum / count;

    // now do standard deviation (second pass over the bins)
    count = 0;
    sum = 0;

    for (map<unsigned long, unsigned long>::iterator p = bins_.begin(); p != bins_.end(); p++)
    {
        sum += p->second * ( 0.5 * binWidth_ + p->first - mean) * ( 0.5 * binWidth_ + p->first - mean);
        count += p->second;
    }

    stddev = sqrt(sum / (1.0 * count));
}
+
// Plot x range becomes [mean - trim*sd, mean + trim*sd]; 0 disables trimming.
void Histogram::setPlotOptionTrim(double trim)
{
    plotTrim_ = trim;
}

// Divide every plotted x value by 'divisor'; 0 disables the division.
void Histogram::setPlotOptionXdivide(double divisor)
{
    plotXdivide_ = divisor;
}

// Use each bin's left edge (instead of its midpoint) as the plotted x coord.
void Histogram::setPlotOptionUseLeftOfBins(bool useLeft)
{
    plotLabelXcoordsLeftBin_ = useLeft;
}


// Add a dashed vertical marker line at x = d to subsequent plots.
void Histogram::setPlotOptionAddVline(double d)
{
    plotVlines_.push_back(d);
}
+
+
+void Histogram::plot(string outprefix, string ext, string xlabel, string ylabel)
+{
+ if (bins_.size() == 0)
+ {
+ return;
+ }
+
+ string outfile_plot = outprefix + '.' + ext;
+ string outfile_R = outprefix + ".R";
+ ofstream ofs(outfile_R.c_str());
+
+ if (!ofs.good())
+ {
+ cerr << "Error opening file '" << outfile_R << "'" << endl;
+ exit(1);
+ }
+
+ unsigned long currentBin;
+ double mean, stddev;
+ meanAndStddev(mean, stddev);
+ mean = mean ? mean : 1;
+
+ stringstream xCoords;
+ stringstream yCoords;
+
+ currentBin = bins_.begin()->first;
+
+ for (map<unsigned long, unsigned long>::iterator p = bins_.begin(); p != bins_.end(); p++)
+ {
+ double xval = p->first;
+ if (!plotLabelXcoordsLeftBin_) xval += 0.5 * binWidth_;
+
+ if (plotTrim_ && xval < mean - plotTrim_ * stddev)
+ {
+ continue;
+ }
+ else if (plotTrim_ && xval > mean + plotTrim_ * stddev)
+ {
+ break;
+ }
+
+ if (plotXdivide_) xval /= plotXdivide_;
+
+
+ while (currentBin < p->first)
+ {
+ double x = currentBin;
+ if (!plotLabelXcoordsLeftBin_) currentBin += 0.5 * binWidth_;
+ if (plotXdivide_) x /= plotXdivide_;
+ xCoords << ',' << x;
+ yCoords << ",0";
+ currentBin += binWidth_;
+ }
+
+ xCoords << ',' << xval;
+ yCoords << ',' << p->second;
+ currentBin += binWidth_;
+ }
+
+ string x = xCoords.str();
+ string y = yCoords.str();
+
+ if (x[x.size() - 1] == ',')
+ x.resize(x.size() - 1);
+
+ if (y[y.size() - 1] == ',')
+ y.resize(y.size() - 1);
+
+ ofs << "x = c(" << x.substr(1, x.size() - 1) << ')' << endl
+ << "y = c(" << y.substr(1, y.size() - 1) << ')' << endl
+ << ext << "(\"" << outfile_plot << "\")" << endl
+ << "plot(x, y, xlab=\"" << xlabel << "\", ylab=\"" << ylabel << "\", type=\"l\")" << endl;
+
+ for (vector<double>::iterator i = plotVlines_.begin(); i != plotVlines_.end(); i++)
+ {
+ ofs << "abline(v=" << *i << ", col=\"red\", lty=2)" << endl;
+ }
+
+ ofs << "dev.off()" << endl;
+ ofs.close();
+ systemCall("R CMD BATCH " + outfile_R + " " + outfile_R + "out");
+}
+
+
+void Histogram::writeToFile(string fname, double offset, double xMultiplier)
+{
+ ofstream ofs(fname.c_str());
+ if (!ofs.good())
+ {
+ cerr << "Error opening file for writing '" << fname << "'" << endl;
+ exit(1);
+ }
+
+ offset = offset == -1 ? 0.5 * binWidth_ : offset;
+
+ for (map<unsigned long, unsigned long>::iterator p = bins_.begin(); p != bins_.end(); p++)
+ {
+ ofs << xMultiplier * (offset + p->first) << '\t' << p->second << endl;;
+ }
+ ofs.close();
+}
diff --git a/src/histogram.h b/src/histogram.h
new file mode 100644
index 0000000..bb150d1
--- /dev/null
+++ b/src/histogram.h
@@ -0,0 +1,102 @@
+#ifndef HISTOGRAM_H
+#define HISTOGRAM_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <iostream>
+#include <fstream>
+#include <map>
+#include <string>
+#include <cmath>
+#include <vector>
+#include <ctype.h>
+#include "utils.h"
+
+using namespace std;
+
+// Binned histogram of unsigned long values, keyed by bin start coordinate.
+// Also knows how to plot itself by generating and running an R script.
+class Histogram
+{
+public:
+ // Construct a histogram with the given bin width
+ Histogram(unsigned int bin = 50);
+
+ // picks a value at random from the histogram
+ unsigned long sample();
+
+ // adds count to the bin which value falls into
+ void add(unsigned long val, unsigned long count);
+
+ // clears the histogram
+ void clear();
+
+ // returns true iff the histogram is empty
+ bool empty();
+
+ // returns the bin width of the histogram
+ unsigned int binWidth();
+
+ // get value of the bin which position falls into
+ unsigned long get(long pos);
+
+ // returns a count of the numbers in the histogram
+ // (i.e. sum of size of each bin)
+ unsigned long size();
+
+ // returns the mode of the histogram (returns zero if the histogram
+ // is empty, so check if it's empty first!)
+ // 'val' will be set to the value at the mode
+ double mode(unsigned long& val);
+
+ // returns the mode value within one standard deviation of the mean
+ double modeNearMean();
+
+ // gets the mean and standard deviation (does nothing if the histogram is empty)
+ void meanAndStddev(double& mean, double& stddev);
+
+ // gets the first and last percentiles
+ void endPercentiles(double& first, double& last);
+
+
+ // p in [0,1]
+ double leftPercentile(double p);
+
+ // presumably the x value of the lowest occupied bin — TODO confirm against histogram.cpp
+ double minimumBin();
+
+ // trim determines the x range when plotting, which will be [mode - trim * stddev, mode + trim * stddev]
+ void setPlotOptionTrim(double trim);
+
+ // when plotting, divide every x value by this number (default is to do no dividing).
+ // This is handy when using a Histogram store non-integers (rounded a bit, obviously)
+ void setPlotOptionXdivide(double divisor);
+
+ // When plotting, default is to use middle of each bin as the x coords.
+ // Set this to use the left of the bin instead
+ void setPlotOptionUseLeftOfBins(bool useLeft);
+
+ // When plotting, add in a vertical dashed red line at the given value
+ void setPlotOptionAddVline(double d);
+
+ // make a plot of the histogram. file extension 'ext' must be pdf or png
+ // xlabel, ylabel = labels for x and y axes respectively. plotExtra is added
+ // to tha call to plot in R
+ void plot(string outprefix, string ext, string xlabel, string ylabel);
+
+ // write data to file, tab-delimited per line: bin count.
+ // offset is amount to add to x (bin) values. -1 means
+ // add 1/2 bin width.
+ void writeToFile(string fname, double offset, double xMultiplier);
+
+ // const iterators over the (bin start, count) pairs, in ascending bin order
+ map<unsigned long, unsigned long>::const_iterator begin();
+ map<unsigned long, unsigned long>::const_iterator end();
+
+private:
+ unsigned int binWidth_; // width of every bin
+ map<unsigned long, unsigned long> bins_; // bin start -> count in that bin
+ vector<unsigned long> reverseCDF_; // presumably cumulative counts used by sample() — TODO confirm
+ bool plotLabelXcoordsLeftBin_; // plot x coords at left of each bin instead of the middle
+ double plotXdivide_; // divide plot x values by this (0 = don't divide)
+ double plotTrim_; // trim the plotted x range (0 = no trimming); see setPlotOptionTrim
+ vector<double> plotVlines_; // x positions of dashed red vertical lines to draw on plots
+};
+
+#endif // HISTOGRAM_H
diff --git a/src/make_plots.cpp b/src/make_plots.cpp
new file mode 100644
index 0000000..0177120
--- /dev/null
+++ b/src/make_plots.cpp
@@ -0,0 +1,124 @@
+#include <stdio.h>
+#include <math.h>
+#include <stdlib.h>
+#include <iostream>
+#include <cstring>
+#include <fstream>
+#include <string>
+#include <vector>
+#include <sstream>
+#include <map>
+
+using namespace std;
+
+// Column indices of the tab-delimited per-base stats lines read on stdin.
+const short CHR = 0;
+const short POS = 1;
+const short PERFECT_COV = 2;
+// read coverage / proportion / orphan / insert-size / bad-orientation,
+// forward (F) then reverse (R) strand
+const short READ_F = 3;
+const short READ_PROP_F = 4;
+const short READ_ORPHAN_F = 5;
+const short READ_ISIZE_F = 6;
+const short READ_BADORIENT_F = 7;
+const short READ_R = 8;
+const short READ_PROP_R = 9;
+const short READ_ORPHAN_R = 10;
+const short READ_ISIZE_R = 11;
+const short READ_BADORIENT_R = 12;
+const short FRAG_COV = 13;
+const short FRAG_COV_CORRECT = 14;
+const short FCD_MEAN = 15;
+// soft-clipping counts: forward/reverse strand, left/right end of read
+const short CLIP_FL = 16;
+const short CLIP_RL = 17;
+const short CLIP_FR = 18;
+const short CLIP_RR = 19;
+const short FCD_ERR = 20;
+
+
+// Reads tab-delimited per-base stats lines from stdin and splits selected
+// columns into one Artemis-style .plot file per plot type, named
+// <argv[1]>.<plotname>.plot.
+// NOTE(review): argv[1] is dereferenced without checking argc — presumably
+// the reapr wrapper always supplies it; confirm before calling directly.
+int main(int argc, char* argv[])
+{
+ map<string, vector<short> > plots; // plot name -> input columns written to that plot
+ map<string, ofstream*> plot_handles; // plot name -> open output stream (closed below, never deleted; freed at process exit)
+ bool firstLine = true;
+ string line;
+ string preout = argv[1];
+
+ plots["read_cov"].push_back(READ_F);
+ plots["read_cov"].push_back(READ_R);
+ plots["frag_cov"].push_back(FRAG_COV);
+ plots["frag_cov_cor"].push_back(FRAG_COV_CORRECT);
+ plots["FCD_err"].push_back(FCD_ERR);
+
+ plots["read_ratio_f"].push_back(READ_PROP_F);
+ plots["read_ratio_f"].push_back(READ_ORPHAN_F);
+ plots["read_ratio_f"].push_back(READ_ISIZE_F);
+ plots["read_ratio_f"].push_back(READ_BADORIENT_F);
+
+ plots["read_ratio_r"].push_back(READ_PROP_R);
+ plots["read_ratio_r"].push_back(READ_ORPHAN_R);
+ plots["read_ratio_r"].push_back(READ_ISIZE_R);
+ plots["read_ratio_r"].push_back(READ_BADORIENT_R);
+
+ plots["clip"].push_back(CLIP_FL);
+ plots["clip"].push_back(CLIP_RL);
+ plots["clip"].push_back(CLIP_FR);
+ plots["clip"].push_back(CLIP_RR);
+
+ while (getline(cin, line) && !cin.eof())
+ {
+ vector<string> data;
+ string tmp;
+
+ // split the line into a vector
+ stringstream ss(line);
+ data.clear();
+
+ while(getline(ss, tmp, '\t'))
+ {
+ data.push_back(tmp);
+ }
+
+ // open all the files, if it's the first line of the file
+ if (firstLine)
+ {
+ // check if need to make perfect read mapping plot file
+ // (compare() != 0 means column 2 is not "-1", i.e. perfect coverage data exists)
+ if (data[2].compare("-1")) plots["perfect_cov"].push_back(PERFECT_COV);
+
+ // open the plot files
+ for (map<string, vector<short> >::iterator p = plots.begin(); p != plots.end(); p++)
+ {
+ string fname = preout + "." + p->first + ".plot";
+ plot_handles[p->first] = new ofstream(fname.c_str());
+
+ if (! plot_handles[p->first]->good())
+ {
+ cerr << "Error opening file " << fname << endl;
+ return 1;
+ }
+ }
+
+ firstLine = false;
+ }
+
+ // write data to output files: chromosome, position, then the columns
+ // belonging to each plot
+ for (map<string, vector<short> >::iterator p = plots.begin(); p != plots.end(); p++)
+ {
+ *(plot_handles[p->first]) << data[0] << '\t' << data[1] << '\t' << data[p->second.front()];
+
+ for (vector<short>::iterator i = p->second.begin() + 1; i < p->second.end(); i++)
+ {
+ *plot_handles[p->first] << "\t" + data[*i];
+ }
+
+ *plot_handles[p->first] << "\n";
+ }
+ }
+
+ // close the plot files
+ for (map<string, vector<short> >::iterator p = plots.begin(); p != plots.end(); p++)
+ {
+ plot_handles[p->first]->close();
+ }
+
+ return 0;
+}
+
diff --git a/src/n50.cpp b/src/n50.cpp
new file mode 100644
index 0000000..2db5f52
--- /dev/null
+++ b/src/n50.cpp
@@ -0,0 +1,178 @@
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <string>
+#include <cstring>
+
+#include "fasta.h"
+
+using namespace std;
+
+
+// Summary statistics for one fasta file, filled in by file2stats().
+struct Stats
+{
+ double mean; // mean sequence length
+ unsigned long n50[9]; // n50[k] = N(10*(k+1)) length, so n50[0] is N10 and n50[4] is N50
+ unsigned long n50n[9]; // n50n[k] = number of sequences making up n50[k]
+ unsigned long longest; // longest sequence length
+ unsigned long shortest; // shortest sequence length
+ unsigned long number; // number of sequences counted
+ unsigned long totalLength; // sum of counted sequence lengths
+ unsigned long nCount; // total number of N bases
+ unsigned long gapCount; // total number of gaps
+};
+
+
+// Parsed command line options.
+struct CmdLineOps
+{
+ unsigned long minLength; // skip sequences shorter than this (-l)
+ int infileStartIndex; // index into argv of the first input filename
+};
+
+
+// fills ops from the command line; prints usage and exits on error
+void parseOptions(int argc, char** argv, CmdLineOps& ops);
+// gathers Stats from one fasta file
+Stats file2stats(string filename, CmdLineOps& ops);
+
+// pretty-prints the stats of one file to stdout
+void print_stats(string fname, Stats& s);
+
+// Prints assembly stats (N50 etc.) for each fasta file on the command line,
+// separating the per-file reports with a dashed line.
+int main(int argc, char* argv[])
+{
+ CmdLineOps ops;
+ parseOptions(argc, argv, ops);
+ bool first = true;
+
+ for (int i = ops.infileStartIndex; i < argc; i++)
+ {
+ Stats s = file2stats(argv[i], ops);
+ if (!first) cout << "-------------------------------" << endl;
+ first = false;
+ print_stats(argv[i], s);
+ }
+
+ return 0;
+}
+
+
+// Parses the command line: any leading "-l <int>" pairs, then input filenames.
+// Sets ops.infileStartIndex to the first non-option argument.
+// NOTE(review): the while loop assumes a non-option argument follows the
+// options; if only "-l N" is given, argv[ops.infileStartIndex] is read past
+// the end of argv — confirm callers always supply a filename.
+void parseOptions(int argc, char** argv, CmdLineOps& ops)
+{
+ string usage;
+ ops.minLength = 1;
+ ops.infileStartIndex = 1;
+
+ usage = "usage: stats [options] list of fasta files\n\n\
+options:\n\
+-l <int>\n\tMinimum length cutoff for each sequence [1]\n";
+
+ if (argc < 2)
+ {
+ cerr << usage;
+ exit(1);
+ }
+
+ while (argv[ops.infileStartIndex][0] == '-')
+ {
+ if (strcmp(argv[ops.infileStartIndex], "-l") == 0)
+ {
+ // -l consumes the next argument as its value
+ ops.minLength = atoi(argv[ops.infileStartIndex + 1]);
+ ops.infileStartIndex += 2;
+ }
+ else
+ {
+ cerr << "error parsing options, somewhere around this: " << argv[ops.infileStartIndex] << endl;
+ exit(1);
+ }
+ }
+}
+
+
+// Reads every sequence of a fasta file and returns its Stats, counting only
+// sequences of length >= ops.minLength. Exits the process if the file
+// cannot be opened.
+Stats file2stats(string filename, CmdLineOps& ops)
+{
+ Stats s;
+ vector<unsigned long> seqLengths;
+ unsigned long cumulativeLength = 0;
+ ifstream ifs(filename.c_str());
+ Fasta fa;
+
+ if (!ifs.good())
+ {
+ cerr << "[n50] Error opening file '" << filename << "'" << endl;
+ exit(1);
+ }
+
+ s.totalLength = 0;
+ s.nCount = 0;
+ s.gapCount = 0;
+
+ // collect lengths and totals of all sequences passing the length cutoff
+ while(fa.fillFromFile(ifs))
+ {
+ if (fa.length() >= ops.minLength)
+ {
+ unsigned long l = fa.length();
+ seqLengths.push_back(l);
+ s.totalLength += l;
+ s.nCount += fa.nCount();
+ s.gapCount += fa.getNumberOfGaps();
+ }
+ }
+
+ ifs.close();
+
+ for (unsigned long i = 0; i < 9; i++)
+ {
+ s.n50[i] = 0;
+ s.n50n[i] = 0;
+ }
+
+ // no sequences passed the cutoff: return zeroed stats
+ if (seqLengths.size() == 0)
+ {
+ s.longest = 0;
+ s.shortest = 0;
+ s.number = 0;
+ s.mean = 0;
+ s.totalLength = 0;
+ return s;
+ }
+
+ sort(seqLengths.begin(), seqLengths.end());
+ s.longest = seqLengths.back();
+ s.shortest = seqLengths.front();
+ s.number = seqLengths.size();
+ s.mean = 1.0 * s.totalLength / s.number;
+
+ unsigned long k = 0;
+
+ // Walk the lengths from longest to shortest. When the cumulative length
+ // first reaches 10*(k+1)% of the total, record N(10*(k+1)) and the
+ // number of sequences used to reach it.
+ for (unsigned long j = 0; j < seqLengths.size(); j++)
+ {
+ unsigned long i = seqLengths.size() - 1 - j;
+ cumulativeLength += seqLengths[i];
+
+ while (k < 9 && cumulativeLength >= s.totalLength * 0.1 * (k + 1))
+ {
+ s.n50[k] = seqLengths[i];
+ s.n50n[k] = seqLengths.size() - i;
+ k++;
+ }
+ }
+
+ return s;
+}
+
+
+// Prints the stats of one file to stdout, one "key<TAB>value" pair per line.
+// The mean is printed with two fixed decimal places. N100 is just the
+// shortest sequence, and gaps_bases reports the total N count.
+void print_stats(string fname, Stats& s)
+{
+ cout.precision(2);
+
+ cout << "filename\t" << fname << endl
+ << "bases\t" << s.totalLength << endl
+ << "sequences\t" << s.number << endl
+ << "mean_length\t" << fixed << s.mean << endl
+ << "longest\t" << s.longest << endl
+ << "N50\t" << s.n50[4] << endl << "N50_n\t" << s.n50n[4] << endl
+ << "N60\t" << s.n50[5] << endl << "N60_n\t" << s.n50n[5] << endl
+ << "N70\t" << s.n50[6] << endl << "N70_n\t" << s.n50n[6] << endl
+ << "N80\t" << s.n50[7] << endl << "N80_n\t" << s.n50n[7] << endl
+ << "N90\t" << s.n50[8] << endl << "N90_n\t" << s.n50n[8] << endl
+ << "N100\t" << s.shortest << endl << "N100_n\t" << s.number << endl
+ << "gaps\t" << s.gapCount << endl
+ << "gaps_bases\t" << s.nCount << endl;
+}
diff --git a/src/reapr.pl b/src/reapr.pl
new file mode 100755
index 0000000..2d9df53
--- /dev/null
+++ b/src/reapr.pl
@@ -0,0 +1,101 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+use File::Basename;
+use File::Spec;
+use File::Spec::Link;
+
+# Wrapper script: dispatches "reapr <task> [options]" to the per-task script
+# or binary installed alongside this script.
+# Resolve symlinks so the helpers are found relative to the real install
+# directory, not a symlink used to invoke reapr.
+my $reapr = File::Spec->rel2abs($0);
+my $this_script = File::Spec::Link->resolve($0);
+$this_script = File::Spec->rel2abs($this_script);
+my ($scriptname, $scriptdir) = fileparse($this_script);
+$scriptdir = File::Spec->rel2abs($scriptdir);
+my $tabix = File::Spec->catfile($scriptdir, 'tabix/tabix');
+my $bgzip = File::Spec->catfile($scriptdir, 'tabix/bgzip');
+my $version = '1.0.18';
+
+# No arguments: print the task list and exit non-zero.
+if ($#ARGV == -1) {
+    die qq/REAPR version: $version
+Usage:
+    reapr <task> [options]
+
+Common tasks:
+    facheck    - checks IDs in fasta file
+    smaltmap   - map read pairs using SMALT: makes a BAM file to be used as
+                 input to the pipeline
+    perfectmap - make perfect uniquely mapping plot files
+    pipeline   - runs the REAPR pipeline, using an assembly and mapped reads
+                 as input, and optionally results of perfectmap.
+                 (It runs facheck, preprocess, stats, fcdrate, score,
+                 summary and break)
+    plots      - makes Artemis plot files for a given contig, using results
+                 from stats (and optionally results from score)
+    seqrename  - renames all sequences in a BAM file: use this if you already
+                 mapped your reads but then found facheck failed - saves
+                 remapping the reads so that pipeline can be run
+
+Advanced tasks:
+    preprocess - preprocess files: necessary for running stats
+    stats      - generates stats from a BAM file
+    fcdrate    - estimates FCD cutoff for score, using results from stats
+    score      - calculates scores and assembly errors, using results from stats
+    summary    - make summary stats file, using results from score
+    break      - makes broken assembly, using results from score
+    gapresize  - experimental, calculates gap sizes based on read mapping
+    perfectfrombam - generate perfect mapping plots from a bam file (alternative
+                     to using perfectmap for large genomes)
+/;
+}
+
+# Maps each task name to the script or binary implementing it.
+my %tasks = (
+    'perfectmap' => "task_perfectmap.pl",
+    'smaltmap' => "task_smaltmap.pl",
+    'preprocess' => "task_preprocess.pl",
+    'stats' => "task_stats",
+    'break' => "task_break",
+    'score' => "task_score",
+    'plots' => "task_plots.pl",
+    'pipeline' => "task_pipeline.pl",
+    'seqrename' => "task_seqrename.pl",
+    'summary' => "task_summary.pl",
+    'facheck' => 'task_facheck.pl',
+    'gapresize' => 'task_gapresize',
+    'fcdrate' => 'task_fcdrate',
+    'perfectfrombam' => 'task_perfectfrombam.pl',
+);
+
+# turn each task into an absolute path under the install directory
+for my $k (keys %tasks) {
+    $tasks{$k} = File::Spec->catfile($scriptdir, $tasks{$k});
+}
+
+
+if ($tasks{$ARGV[0]}) {
+    if ($#ARGV == 0) {
+        # task given with no arguments: show that task's own usage message
+        print STDERR "usage:\nreapr $ARGV[0] ";
+        exec "$tasks{$ARGV[0]} --wrapperhelp" or die;
+    }
+    else {
+        my $cmd;
+
+        # score and stats stream per-base output to stdout: compress it with
+        # bgzip and index it with tabix so it can be queried by region.
+        # The last command line argument is assumed to be the output prefix.
+        if ($ARGV[0] eq "score") {
+            my $score_out = "$ARGV[-1].per_base.gz";
+            my $errors_out = "$ARGV[-1].errors.gff";
+            $cmd = "$tasks{$ARGV[0]} " . join(" ", @ARGV[1..$#ARGV]) . " | $bgzip -f -c > $score_out && $tabix -f -b 2 -e 2 $score_out && $bgzip -f $errors_out && $tabix -f -p gff $errors_out.gz";
+        }
+        else {
+            $cmd = "$tasks{$ARGV[0]} " . join(" ", @ARGV[1..$#ARGV]);
+
+            if ($ARGV[0] eq "stats") {
+                my $outfile = "$ARGV[-1].per_base.gz";
+                $cmd .= " | $bgzip -f -c > $outfile && $tabix -f -b 2 -e 2 $outfile";
+            }
+        }
+
+        # replace this process with the task command (runs via the shell)
+        exec $cmd or die;
+    }
+}
+else {
+    die qq/Task "$ARGV[0]" not recognised.\n/;
+}
diff --git a/src/samtools b/src/samtools
new file mode 120000
index 0000000..8fb3c67
--- /dev/null
+++ b/src/samtools
@@ -0,0 +1 @@
+../third_party/samtools/samtools
\ No newline at end of file
diff --git a/src/scaff2contig.cpp b/src/scaff2contig.cpp
new file mode 100755
index 0000000..b8af400
--- /dev/null
+++ b/src/scaff2contig.cpp
@@ -0,0 +1,92 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <iostream>
+#include <cstring>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <list>
+
+#include "fasta.h"
+
+using namespace std;
+
+// Splits each scaffold of the input fasta at its gaps and prints the
+// resulting contigs to stdout as fasta. Each contig is named
+// <scaffold_id>_<counter>_<start>_<end> with 1-based coordinates.
+// Sequences with no gaps are printed unchanged; contigs shorter than the
+// optional minimum length are skipped.
+int main(int argc, char* argv[])
+{
+ if (argc < 2)
+ {
+ cerr << "usage:\nscaff2contig <in.fasta> [min length, default = 1]" << endl;
+ exit(1);
+ }
+
+ Fasta fa;
+ string infile = argv[1];
+ unsigned long minLength = argc == 3 ? atoi(argv[2]) : 1;
+
+ ifstream ifs(infile.c_str());
+
+ if (!ifs.good())
+ {
+ cerr << "Error opening file '" << infile << "'" << endl;
+ exit(1);
+ }
+
+ // for each sequence in the input file, find the gaps, print contigs
+ while (fa.fillFromFile(ifs))
+ {
+ // gaps as (start, end) pairs; presumably 0-based inclusive — TODO confirm against fasta.cpp
+ list<pair<unsigned long, unsigned long> > gaps;
+ fa.findGaps(gaps);
+
+ if (gaps.size() == 0)
+ {
+ fa.print(cout);
+ continue;
+ }
+
+ unsigned long counter = 1;
+ unsigned long previousGapEnd = 0;
+ bool first = true;
+
+ // print the contig preceding each gap
+ for(list<pair<unsigned long, unsigned long> >::iterator p = gaps.begin(); p != gaps.end(); p++)
+ {
+ unsigned long startCoord;
+
+ if (first)
+ {
+ startCoord = 0;
+ first = false;
+ }
+ else
+ {
+ startCoord = previousGapEnd + 1;
+ }
+
+ previousGapEnd = p->second;
+ if (p->first - startCoord < minLength) continue;
+
+ stringstream ss;
+ ss << fa.id << "_" << counter << "_" << startCoord + 1 << "_" << p->first;
+ string id = ss.str();
+ string seq = fa.seq.substr(startCoord, p->first - startCoord);
+ Fasta ctg(id, seq);
+ ctg.print(cout);
+ counter++;
+ }
+
+ // print the contig after the final gap, if it is long enough
+ unsigned long lastLength = fa.length() - gaps.back().second - 1;
+ if (lastLength >= minLength)
+ {
+ stringstream ss;
+ ss << fa.id << "_" << counter << "_" << gaps.back().second + 2 << "_" << fa.length();
+ string id = ss.str();
+ string seq = fa.seq.substr(gaps.back().second + 1, lastLength);
+ Fasta ctg(id, seq);
+ ctg.print(cout);
+ }
+
+ }
+
+ ifs.close();
+ return 0;
+}
+
diff --git a/src/smalt b/src/smalt
new file mode 120000
index 0000000..632d3e3
--- /dev/null
+++ b/src/smalt
@@ -0,0 +1 @@
+../third_party/smalt_x86_64
\ No newline at end of file
diff --git a/src/tabix b/src/tabix
new file mode 120000
index 0000000..e8165e0
--- /dev/null
+++ b/src/tabix
@@ -0,0 +1 @@
+../third_party/tabix
\ No newline at end of file
diff --git a/src/task_break.cpp b/src/task_break.cpp
new file mode 100644
index 0000000..a88f5c8
--- /dev/null
+++ b/src/task_break.cpp
@@ -0,0 +1,439 @@
+#include <iostream>
+#include <cstdlib>
+#include <string>
+#include <cstring>
+#include <sstream>
+#include <fstream>
+#include <map>
+#include <set>
+#include <vector>
+#include <assert.h>
+#include "fasta.h"
+#include "utils.h"
+#include "tabix/tabix.hpp"
+
+using namespace std;
+
+// prefix for all error/warning messages from this program
+const string ERROR_PREFIX = "[REAPR break] ";
+
+// converts an int to its decimal string form (defined at end of file)
+string int2string(int number);
+
+// NOTE(review): this struct appears unused in this file
+struct Breakpoint
+{
+ unsigned long start;
+ unsigned long end;
+ short type;
+};
+
+// Parsed command line options for task_break.
+struct CmdLineOptions
+{
+ double minTriError; // -e: minimum FCD error to act on
+ unsigned long minScaffLength; // -l: minimum sequence length to output
+ unsigned long minMainScaffLength; // -m: bin sequences longer than this go to the main assembly
+ bool breakContigs; // -a: break contigs at FCD/low-frag errors instead of masking with Ns
+ unsigned long breakContigsTrim; // -t: bases trimmed off each new contig end around an -a break
+ bool ignoreContigErrors; // -b: leave FCD/low-frag errors without a gap untouched
+ string fastaIn; // input assembly fasta
+ string outprefix; // prefix of output files
+ string gff; // errors gff.gz made by 'score'
+};
+
+// deals with command line options: fills the options struct
+void parseOptions(int argc, char** argv, CmdLineOptions& ops);
+
+// Reads the errors gff made by 'score' and writes a broken version of the
+// assembly: scaffolds are split at error-flagged gaps; FCD/low-fragment
+// errors inside contigs are (by default) masked with Ns, with the masked
+// sequence written to a separate "bin" fasta. All region coordinates are
+// converted from the gff's 1-based values to 0-based inclusive.
+int main(int argc, char** argv)
+{
+ CmdLineOptions options;
+ parseOptions(argc, argv, options);
+ Fasta seq;
+ string line;
+ map<string, set<pair<unsigned long, unsigned long> > > gapsToBreak; // seq name -> gap regions to break at
+ map<string, set<pair<unsigned long, unsigned long> > > badRegions; // seq name -> contig regions to replace with Ns
+ map<string, set<pair<unsigned long, unsigned long> > > contigBreakFlanks; // seq name -> regions to N-out around -a breaks (only with -a and -t)
+ ifstream inStream;
+ ofstream outStreamFasta, outStreamBin;
+ Tabix ti(options.gff);
+ string fastaOut = options.outprefix + ".broken_assembly.fa";
+ string binOut = options.outprefix + ".broken_assembly_bin.fa";
+ vector<pair< string, unsigned long> > refLengths; // NOTE(review): appears unused in this function
+ string binPrefix = "REAPR_bin.";
+
+ // For each chromosome, get an (ordered by coord) list of the gaps to be broken and the
+ // sections to be replaced by Ns. Some of these regions can intersect.
+ while (ti.getNextLine(line))
+ {
+ vector<string> v;
+ split(line, '\t', v);
+ string id = v[0];
+
+ // we break when errors are called over a gap, replace with Ns when
+ // error called in a contig (i.e. not over a gap)
+ if (v[2].compare("Frag_cov_gap") == 0)
+ {
+ // scaffold19_size430204 REAPR Frag_cov_gap 362796 363397 0.00166113 . . Note=Error: Fragment coverage too low over gap 363104-363113;colour=12
+ // pull the comma-separated list of gap ranges out of the Note attribute
+ vector<string> a,b,c;
+ split(v.back(), ';', a);
+ split(a[a.size() - 2], ' ', b);
+ split(b.back(), ',', c);
+ for (unsigned long i = 0; i < c.size(); i++)
+ {
+ vector<string> d;
+ split(c[i], '-', d);
+ unsigned long start = atoi(d[0].c_str()) - 1; // NOTE(review): atoi truncates coords on sequences > ~2^31 bases
+ unsigned long end = atoi(d[1].c_str()) - 1;
+ gapsToBreak[id].insert(make_pair(start, end));
+ }
+ }
+ else if (v[2].compare("FCD_gap") == 0 && atof(v[5].c_str()) >= options.minTriError)
+ {
+ // scaffold7_size612284 REAPR FCD_gap 547103 553357 0.781932 . . Note=Error: FCD failure over gap 550388-550397,552547-552686;colour=16
+ vector<string> a,b,c;
+ split(v.back(), ';', a);
+ split(a[a.size() - 2], ' ', b);
+ split(b.back(), ',', c);
+ for (unsigned long i = 0; i < c.size(); i++)
+ {
+ vector<string> d;
+ split(c[i], '-', d);
+ unsigned long start = atoi(d[0].c_str()) - 1;
+ unsigned long end = atoi(d[1].c_str()) - 1;
+ gapsToBreak[id].insert(make_pair(start, end));
+ }
+ }
+ else if (!options.ignoreContigErrors &&
+ (v[2].compare("Frag_cov") == 0 || (v[2].compare("FCD") == 0 && atof(v[5].c_str()) >= options.minTriError)) )
+ {
+ // scaffold6_size716595 REAPR Frag_cov 600296 601454 0 . . Note=Error: Fragment coverage too low;color=15
+ unsigned long start = atoi(v[3].c_str()) - 1;
+ unsigned long end = atoi(v[4].c_str()) - 1;
+ if (options.breakContigs)
+ {
+ // -a: break at the midpoint of the error instead of masking
+ unsigned long middle = 0.5 * (start + end);
+ gapsToBreak[id].insert(make_pair(middle, middle));
+ if (options.breakContigsTrim)
+ {
+ start = middle >= options.breakContigsTrim ? middle - options.breakContigsTrim : 0;
+ end = middle + options.breakContigsTrim;
+ contigBreakFlanks[id].insert(make_pair(start, end));
+ }
+ }
+ else
+ {
+ badRegions[id].insert(make_pair(start, end));
+ }
+ }
+ }
+
+
+
+ // for each chromosome, the replace by Ns errors could intersect. Take the union of them each time two intersect
+ // (relies on the set ordering regions by start coordinate, so overlapping regions are adjacent)
+ for (map<string, set<pair<unsigned long, unsigned long> > >::iterator namesIter = badRegions.begin(); namesIter != badRegions.end(); namesIter++)
+ {
+ set<pair<unsigned long, unsigned long> > newRegions;
+
+ for (set<pair<unsigned long, unsigned long> >::iterator posIter = namesIter->second.begin(); posIter != namesIter->second.end(); posIter++)
+ {
+ if (newRegions.size() == 0)
+ {
+ newRegions.insert(*posIter);
+ }
+ else
+ {
+ set<pair<unsigned long, unsigned long> >::iterator lastRegion = newRegions.end();
+ lastRegion--;
+ // if last region added intersects with the current region
+ if (lastRegion->first <= posIter->second && posIter->first <= lastRegion->second)
+ {
+ unsigned long start = min(lastRegion->first, posIter->first);
+ unsigned long end = max(lastRegion->second, posIter->second);
+ newRegions.erase(lastRegion);
+ newRegions.insert(make_pair(start, end));
+ }
+ else // no overlap
+ {
+ newRegions.insert(*posIter);
+ }
+ }
+ }
+
+ badRegions[namesIter->first] = newRegions;
+ }
+
+ inStream.open(options.fastaIn.c_str());
+
+ if (! inStream.is_open())
+ {
+ cerr << ERROR_PREFIX << "Error opening file '" << options.fastaIn << "'" << endl;
+ return 1;
+ }
+
+ outStreamFasta.open(fastaOut.c_str());
+
+ if (! outStreamFasta.is_open())
+ {
+ cerr << ERROR_PREFIX << "Error opening file '" << fastaOut << "'" << endl;
+ return 1;
+ }
+
+ outStreamBin.open(binOut.c_str());
+
+ if (! outStreamBin.is_open())
+ {
+ cerr << ERROR_PREFIX << "Error opening file '" << binOut << "'" << endl;
+ return 1;
+ }
+ // do the breaking
+ while (seq.fillFromFile(inStream))
+ {
+ map<string, set<pair<unsigned long, unsigned long> > >::iterator gapsToBreakNamesIter = gapsToBreak.find(seq.id);
+ map<string, set<pair<unsigned long, unsigned long> > >::iterator badRegionsNamesIter = badRegions.find(seq.id);
+ map<string, set<pair<unsigned long, unsigned long> > >::iterator contigBreakFlanksIter = contigBreakFlanks.find(seq.id);
+
+ // replace each region flanking a contig error with Ns. This is only
+ // relevant if -a and -t were used.
+ if (contigBreakFlanksIter != contigBreakFlanks.end())
+ {
+ for(set<pair<unsigned long, unsigned long> >::iterator p = contigBreakFlanksIter->second.begin(); p != contigBreakFlanksIter->second.end(); p++)
+ {
+ unsigned long start = p->first;
+ unsigned long end = min(p->second, seq.length() - 1); // clamp to sequence end
+ seq.seq.replace(start, end - start + 1, end - start + 1, 'N');
+ }
+ }
+
+ // replace the bad regions with Ns, write these sequences out to the "bin" assembly.
+ // Each of these sequences is broken at any bad gaps flagged. (Sometimes the errors
+ // over and not over a gap can overlap.)
+ if (badRegionsNamesIter != badRegions.end())
+ {
+ set<pair<unsigned long, unsigned long> >::iterator gapsIter;
+ if (gapsToBreakNamesIter != gapsToBreak.end())
+ {
+ gapsIter = gapsToBreakNamesIter->second.begin();
+ }
+
+ for (set<pair<unsigned long, unsigned long> >::iterator p = badRegionsNamesIter->second.begin(); p != badRegionsNamesIter->second.end(); p++)
+ {
+ // binRegions holds alternating start/end coordinates delimiting the
+ // pieces of this bad region, split at any flagged gaps inside it
+ vector<unsigned long> binRegions;
+ binRegions.push_back(p->first);
+
+ if (gapsToBreakNamesIter != gapsToBreak.end())
+ {
+ // skip gaps that end before this bad region starts
+ while (gapsIter != gapsToBreakNamesIter->second.end() && gapsIter->second < p->first)
+ {
+ gapsIter++;
+ }
+
+ // add a split for every gap overlapping this bad region
+ while (gapsIter != gapsToBreakNamesIter->second.end() && gapsIter->first <= p->second)
+ {
+ if (binRegions.size())
+ {
+ assert(binRegions.back() < gapsIter->first);
+ }
+ binRegions.push_back(max(p->first, gapsIter->first));
+ binRegions.push_back(min(gapsIter->second, p->second));
+ gapsIter++;
+ }
+ }
+
+ binRegions.push_back(p->second);
+
+ // write each piece, trimmed of flanking Ns, to the main or bin output
+ for (unsigned long i = 0; i < binRegions.size(); i+= 2)
+ {
+ Fasta contig = seq.subseq(binRegions[i], binRegions[i+1]);
+ stringstream ss;
+ unsigned long startBasesTrimmed, endBasesTrimmed;
+ contig.trimNs(startBasesTrimmed, endBasesTrimmed);
+ ss << binRegions[i] + 1 + startBasesTrimmed << '_' << binRegions[i+1] - endBasesTrimmed + 1;
+ contig.id = binPrefix + contig.id + "_" + ss.str();
+ if (contig.length() > options.minMainScaffLength)
+ {
+ // long enough to keep in the main assembly (see -m)
+ contig.print(outStreamFasta, 60);
+ }
+ else if (contig.length() >= options.minScaffLength)
+ {
+ contig.print(outStreamBin, 60);
+ }
+ }
+
+ // mask the whole bad region in the scaffold itself
+ seq.seq.replace(p->first, p->second - p->first + 1, p->second - p->first + 1, 'N');
+ }
+ }
+
+ // if there's no breaks to be made at gaps
+ if (gapsToBreakNamesIter == gapsToBreak.end())
+ {
+ unsigned long startBasesTrimmed, endBasesTrimmed;
+ seq.trimNs(startBasesTrimmed, endBasesTrimmed);
+
+ if (startBasesTrimmed || endBasesTrimmed)
+ {
+ // record the trimmed coordinates in the sequence name
+ stringstream ss;
+ ss << startBasesTrimmed + 1 << '_' << seq.length() - endBasesTrimmed - startBasesTrimmed;
+ seq.id += "_" + ss.str();
+ if (seq.length() >= options.minScaffLength)
+ {
+ seq.print(outStreamFasta, 60);
+ }
+ }
+ else
+ {
+ if (seq.length() >= options.minScaffLength)
+ {
+ seq.print(outStreamFasta, 60);
+ }
+ }
+ }
+ else // gaps to be broken
+ {
+ // build alternating start/end breakpoints: sequence start, then the
+ // base before/after each flagged gap, then the sequence end
+ set<pair<unsigned long, unsigned long> > contigCoords;
+ vector<unsigned long> breakpoints;
+ breakpoints.push_back(0);
+
+ for (set<pair<unsigned long, unsigned long> >::iterator gapsIter = gapsToBreakNamesIter->second.begin(); gapsIter != gapsToBreakNamesIter->second.end(); gapsIter++)
+ {
+ if (breakpoints.size())
+ {
+ assert(breakpoints.back() <= gapsIter->first);
+ }
+
+ breakpoints.push_back(gapsIter->first == 0 ? 0 : gapsIter->first - 1);
+ breakpoints.push_back(gapsIter->second + 1);
+ }
+
+ breakpoints.push_back(seq.length() - 1);
+
+ for (unsigned long i = 0; i < breakpoints.size(); i+= 2)
+ {
+ Fasta contig = seq.subseq(breakpoints[i], breakpoints[i+1]);
+ stringstream ss;
+ unsigned long startBasesTrimmed, endBasesTrimmed;
+ contig.trimNs(startBasesTrimmed, endBasesTrimmed);
+ ss << breakpoints[i] + 1 + startBasesTrimmed << '_' << breakpoints[i+1] - endBasesTrimmed + 1;
+ contig.id += "_" + ss.str();
+ if (contig.length() >= options.minScaffLength)
+ {
+ contig.print(outStreamFasta, 60);
+ }
+ }
+ }
+ }
+
+ inStream.close();
+ outStreamFasta.close();
+ outStreamBin.close();
+
+ return 0;
+}
+
+// Parses the task_break command line: optional switches followed by exactly
+// three positional arguments (assembly fasta, errors gff.gz, output prefix).
+// Prints usage and exits on error; also rejects the incompatible -a/-b pair.
+void parseOptions(int argc, char** argv, CmdLineOptions& ops)
+{
+ string usage;
+ short requiredArgs = 3;
+ int i;
+ usage = "\
+where 'errors.gff.gz' is the errors gff file made when running score.\n\n\
+Options:\n\
+-a\n\tAgressive breaking: break contigs at any FCD or low_frag error, as\n\
+\topposed to the default of replacing with Ns. Contigs are broken at the\n\
+\tmidpoint of each error. Also see option -t. Incompatible with -b\n\
+-b\n\tIgnore FCD and low fragment coverage errors that do not contain\n\
+\ta gap (the default is to replace these with Ns). incompatible with -a\n\
+-e <float>\n\tMinimum FCD error [0]\n\
+-l <int>\n\tMinimum sequence length to output [100]\n\
+-m <int>\n\tMax sequence length to write to the bin. Sequences longer\n\
+\tthan this are written to the main assembly output. This is to stop\n\
+\tlong stretches of sequence being lost [999]\n\
+-t <int>\n\tWhen -a is used, use this option to specify how many bases\n\
+\tare trimmed off the end of each new contig around a break.\n\
+\t-t N means that, at an FCD error, a contig is broken at the middle\n\
+\tcoordinate of the error, then N bases are\n\
+\ttrimmed off each new contig end [0]\n\
+";
+
+ // --wrapperhelp: print usage for the reapr wrapper script and exit
+ if (argc == 2 && strcmp(argv[1], "--wrapperhelp") == 0)
+ {
+ usage = "[options] <assembly.fa> <errors.gff.gz> <outfiles prefix>\n\n" + usage;
+ cerr << usage << endl;
+ exit(1);
+ }
+ else if (argc < requiredArgs)
+ {
+ usage = "[options] <assembly.fa> <errors.gff.gz> <outfiles prefix>\n\n" + usage;
+ cerr << "usage: task_break " << usage;
+ exit(1);
+ }
+
+ // defaults
+ ops.minTriError = 0;
+ ops.minScaffLength = 100;
+ ops.minMainScaffLength = 999;
+ ops.breakContigs = false;
+ ops.ignoreContigErrors = false;
+ ops.breakContigsTrim = 0;
+
+ // parse switches; value-taking switches consume argv[i+1] (extra i++ below)
+ for (i = 1; i < argc - requiredArgs; i++)
+ {
+ if (strcmp(argv[i], "-a") == 0)
+ {
+ ops.breakContigs = true;
+ continue;
+ }
+ if (strcmp(argv[i], "-b") == 0)
+ {
+ ops.ignoreContigErrors = true;
+ continue;
+ }
+
+ if (strcmp(argv[i], "-e") == 0)
+ {
+ ops.minTriError = atof(argv[i+1]);
+ }
+ else if (strcmp(argv[i], "-l") == 0)
+ {
+ ops.minScaffLength = atoi(argv[i+1]);
+ }
+ else if (strcmp(argv[i], "-m") == 0)
+ {
+ ops.minMainScaffLength = atoi(argv[i+1]);
+ }
+ else if (strcmp(argv[i], "-t") == 0)
+ {
+ ops.breakContigsTrim = atoi(argv[i+1]);
+ }
+ else
+ {
+ cerr << ERROR_PREFIX << "Error! Switch not recognised: " << argv[i] << endl;
+ exit(1);
+ }
+ i++;
+ }
+
+ // exactly three positional args must remain, and the gff must not look like a switch
+ if (argc - i != requiredArgs || argv[i+1][0] == '-')
+ {
+ cerr << usage;
+ exit(1);
+ }
+
+ if (ops.ignoreContigErrors && ops.breakContigs)
+ {
+ cerr << ERROR_PREFIX << "Options -a and -b are incompatible. Cannot continue" << endl;
+ exit(1);
+ }
+
+ if (ops.breakContigsTrim && !ops.breakContigs)
+ {
+ cerr << ERROR_PREFIX << "Warning: ignoring -t " << ops.breakContigsTrim << " because -a was not used" << endl;
+ }
+
+ ops.fastaIn = argv[i];
+ ops.gff = argv[i+1];
+ ops.outprefix = argv[i+2];
+}
+
+
+// Converts an int to its decimal string representation.
+string int2string(int number)
+{
+ stringstream ss;
+ ss << number;
+ return ss.str();
+}
diff --git a/src/task_facheck.pl b/src/task_facheck.pl
new file mode 100755
index 0000000..6e37cfc
--- /dev/null
+++ b/src/task_facheck.pl
@@ -0,0 +1,104 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use File::Spec;
+use File::Basename;
+use Getopt::Long;
+
+my ($scriptname, $scriptdir) = fileparse($0);
+my %options;
+my $usage = qq/<in.fa> [out_prefix]
+
+Checks that the names in the fasta file are ok. Things like
+trailing whitespace or characters |':- could break the pipeline.
+
+If out_prefix is not given, it will die when a bad ID is found.
+No output means everything is OK.
+
+If out_prefix is given, writes a new fasta file out_prefix.fa with new
+names, and out_prefix.info which has the mapping from old name to new name.
+/;
+
+
+my $ERROR_PREFIX = '[REAPR facheck]';
+
+my $ops_ok = GetOptions(
+    \%options,
+    'wrapperhelp',
+);
+
+# --wrapperhelp: the reapr wrapper asks for the usage message
+if ($options{wrapperhelp}) {
+    print STDERR "$usage\n";
+    exit(1);
+}
+
+if ($#ARGV < 0 or $#ARGV > 1 or !($ops_ok)) {
+    print STDERR "usage:\n$scriptname $usage\n";
+    exit(1);
+}
+
+my $fasta_in = $ARGV[0];
+# empty out_prefix means check-only mode: die on the first bad ID
+my $out_prefix = $#ARGV == 1 ? $ARGV[1] : "";
+-e $fasta_in or die "$ERROR_PREFIX Cannot find file '$fasta_in'\n";
+my %new_id_counts;
+open FIN, $fasta_in or die "$ERROR_PREFIX Error opening '$fasta_in'\n";
+
+if ($out_prefix) {
+    my $fasta_out = "$out_prefix.fa";
+    my $info_out = "$out_prefix.info";
+    open FA_OUT, ">$fasta_out" or die "$ERROR_PREFIX Error opening '$fasta_out'\n";
+    open INFO_OUT, ">$info_out" or die "$ERROR_PREFIX Error opening '$info_out'\n";
+    print INFO_OUT "#old_name\tnew_name\n";
+}
+
+while (<FIN>) {
+    chomp;
+
+    if (/^>/) {
+        # take the header up to the first tab, then strip the leading '>'
+        my ($name) = split /\t/;
+        $name =~ s/^.//;
+        my $new_name = check_name($name, \%new_id_counts, $out_prefix);
+        if ($out_prefix) {
+            print FA_OUT ">$new_name\n";
+            print INFO_OUT "$name\t$new_name\n";
+        }
+    }
+    elsif ($out_prefix) {
+        # sequence line: copy through unchanged
+        print FA_OUT "$_\n";
+    }
+}
+
+close FIN;
+
+if ($out_prefix) {
+    close FA_OUT;
+    close INFO_OUT;
+}
+
+# checks if the given name is OK: problem characters are replaced with '_',
+# and duplicate sanitized names get a '.N' suffix (N = 2, 3, ...).
+# In check-only mode (empty prefix) a changed name is fatal.
+# arg 0 = name to be checked
+# arg 1 = reference to hash of new id counts
+# arg 2 = output files prefix
+# Returns new name
+sub check_name {
+    my $old_id = shift;
+    my $h = shift;
+    my $pre = shift;
+    my $new_id = $old_id;
+    $new_id =~ s/[,;'|:\+\-\s\(\)\{\}\[\]]/_/g;
+
+    if ($old_id ne $new_id and $pre eq "") {
+        print "Sequence name '$old_id' not OK and will likely break pipeline\n";
+        exit(1);
+    }
+    $h->{$new_id}++;
+
+    if ($h->{$new_id} == 1) {
+        return $new_id;
+    }
+    else {
+        return "$new_id." . $h->{$new_id};
+    }
+}
diff --git a/src/task_fcdrate.cpp b/src/task_fcdrate.cpp
new file mode 100644
index 0000000..77277b8
--- /dev/null
+++ b/src/task_fcdrate.cpp
@@ -0,0 +1,380 @@
#include <stdio.h>
#include <stdlib.h>
#include <cmath>
#include <cstring>
#include <algorithm>
#include <fstream>
#include <iostream>
#include <list>
#include <sstream>
#include <string>
#include <vector>

#include "utils.h"
#include "histogram.h"
#include "tabix/tabix.hpp"
+
+using namespace std;
+
+
// Command line options for task_fcdrate, filled in by parseOptions().
struct CmdLineOptions
{
    string statsInfile;            // <stats prefix>.per_base.gz, tabix-indexed
    string preprocessDir;          // directory made by 'reapr preprocess'
    unsigned long windowWidth;     // -l; defaults from fragmentLength if 0
    unsigned long windowStep;      // -s; bases stepped between sampled windows
    unsigned long maxWindows;      // -w; stop after sampling this many windows
    unsigned long windowPercentCutoff;  // -p; percentile of FCD errors taken per window
    unsigned long fragmentLength;  // sample_ave_fragment_length from global_stats.txt
    bool debug;                    // -d; verbose progress output
    string outprefix;              // prefix of output files
};
+
+const short FCD_ERR_COLUMN = 20;
+const string ERROR_PREFIX = "[REAPR fcdrate] ";
+
+void parseOptions(int argc, char** argv, CmdLineOptions& options);
+
+void getGradient(vector<double>& x, vector<double>& y, vector<double>& d1x, vector<double>& d1y);
+
+void printVectors(string prefix, vector<double>& x, vector<double>& y);
+
+string vector2Rstring(vector<double>& x);
+
+unsigned long normalise(vector<double> & v);
+
+unsigned long fragmentLengthFromFile(string fname);
+
+
+int main(int argc, char* argv[])
+{
+ string line;
+ CmdLineOptions options;
+ parseOptions(argc, argv, options);
+ Tabix ti(options.statsInfile);
+ ti.getNextLine(line); // ignore the header line
+ unsigned long windowCount = 0;
+ list<double> fcdErrors;
+ double maxCutoff = 5;
+ unsigned long fcdAccuracy = 50;
+ unsigned long stepCounter = 0;
+ vector<unsigned long> fcdCutoffs(maxCutoff * fcdAccuracy + 1, 0);
+ vector<pair< string, unsigned long> > sequencesAndLengths;
+ orderedSeqsFromFai(options.preprocessDir + "/00.assembly.fa.fai", sequencesAndLengths);
+
+ for (vector<pair< string, unsigned long> >:: iterator iter = sequencesAndLengths.begin(); iter != sequencesAndLengths.end() && iter->second > 2 * options.fragmentLength + options.windowWidth && windowCount < options.maxWindows ; iter++)
+ {
+ stringstream regionSS;
+ regionSS << iter->first << ':' << options.fragmentLength << '-' << iter->second - options.fragmentLength;
+ string region(regionSS.str());
+ if (options.debug) cerr << regionSS.str() << endl;
+ ti.setRegion(region);
+ fcdErrors.clear();
+ stepCounter = 0;
+
+ while (ti.getNextLine(line) && windowCount < options.maxWindows)
+ {
+ string tmp;
+ stringstream ss(line);
+
+
+ for (short i = 0; i <= FCD_ERR_COLUMN; i++)
+ {
+ getline(ss, tmp, '\t');
+ }
+
+ double fcdError = atof(tmp.c_str());
+
+ if (fcdError == -1)
+ {
+ fcdErrors.clear();
+ stepCounter = 0;
+ }
+ else if (fcdErrors.size() < options.windowWidth)
+ {
+ fcdErrors.push_back(fcdError);
+ }
+ else
+ {
+ if (stepCounter < options.windowStep)
+ {
+ fcdErrors.push_back(fcdError);
+ fcdErrors.pop_front();
+ stepCounter++;
+ }
+
+ if (stepCounter == options.windowStep)
+ {
+ if (options.debug && windowCount %100 == 0)
+ cerr << "windowCount\t" << windowCount << endl;
+ vector<double> errs(fcdErrors.begin(), fcdErrors.end());
+ sort(errs.begin(), errs.end());
+ double ninetiethValue = min(maxCutoff, errs[errs.size() * options.windowPercentCutoff / 100]);
+ fcdCutoffs[ninetiethValue * fcdAccuracy]++;
+ windowCount++;
+ stepCounter = 0;
+ }
+ }
+
+ }
+
+ }
+
+ vector<double> cumulativeErrorCountsXvals;
+ vector<double> cumulativeErrorCountsYvals;
+
+ unsigned long total = 0;
+
+ for (unsigned long i = 0; i < fcdCutoffs.size(); i++)
+ {
+ cumulativeErrorCountsXvals.push_back(1.0 * i / fcdAccuracy);
+ cumulativeErrorCountsYvals.push_back(1.0 * (windowCount - fcdCutoffs[i] - total) / windowCount);
+ total += fcdCutoffs[i];
+ }
+
+
+ vector<double> cumulativeErrorCountsD1Xvals;
+ vector<double> cumulativeErrorCountsD1Yvals;
+ vector<double> cumulativeErrorCountsD2Xvals;
+ vector<double> cumulativeErrorCountsD2Yvals;
+
+
+ getGradient(cumulativeErrorCountsXvals, cumulativeErrorCountsYvals, cumulativeErrorCountsD1Xvals, cumulativeErrorCountsD1Yvals);
+ unsigned long minValueIndexD1 = normalise(cumulativeErrorCountsD1Yvals);
+ getGradient(cumulativeErrorCountsD1Xvals, cumulativeErrorCountsD1Yvals, cumulativeErrorCountsD2Xvals, cumulativeErrorCountsD2Yvals);
+ normalise(cumulativeErrorCountsD2Yvals);
+
+ unsigned long cutoffIndex;
+
+ for (cutoffIndex = cumulativeErrorCountsD2Yvals.size(); cutoffIndex > max(minValueIndexD1,minValueIndexD1); cutoffIndex--)
+ {
+ if (cumulativeErrorCountsD1Yvals[cutoffIndex] < -0.05 && cumulativeErrorCountsD2Yvals[cutoffIndex] > 0.05)
+ {
+ cutoffIndex++;
+ break;
+ }
+ }
+
+
+ if (options.debug)
+ {
+ for (unsigned long i = 0; i < fcdCutoffs.size(); i++)
+ {
+ cout << "fcd\t" << 1.0 * i / fcdAccuracy << '\t' << fcdCutoffs[i] << endl;
+ }
+ printVectors("d0", cumulativeErrorCountsXvals, cumulativeErrorCountsYvals);
+ printVectors("d1", cumulativeErrorCountsD1Xvals, cumulativeErrorCountsD1Yvals);
+ printVectors("d2", cumulativeErrorCountsD2Xvals, cumulativeErrorCountsD2Yvals);
+ }
+
+ double fcdCutoff = 0.5 * (cumulativeErrorCountsD1Xvals[cutoffIndex] + cumulativeErrorCountsD1Xvals[cutoffIndex+1]);
+
+ string outfile = options.outprefix + ".info.txt";
+ ofstream ofs(outfile.c_str());
+ if (!ofs.good())
+ {
+ cerr << ERROR_PREFIX << "Error opening file '" << outfile << "'" << endl;
+ return 1;
+ }
+
+ ofs << "#fcd_cutoff\twindow_length\twindows_sampled\tpercent_cutoff" << endl
+ << fcdCutoff << '\t'
+ << options.windowWidth << '\t'
+ << windowCount << '\t'
+ << options.windowPercentCutoff << endl;
+ ofs.close();
+
+ outfile = options.outprefix + ".plot.R";
+ ofs.open(outfile.c_str());
+ if (!ofs.good())
+ {
+ cerr << ERROR_PREFIX << "Error opening file '" << outfile << "'" << endl;
+ return 1;
+ }
+
+ ofs << "fcd_cutoff = " << fcdCutoff << endl
+ << "x=" << vector2Rstring(cumulativeErrorCountsXvals) << endl
+ << "y=" << vector2Rstring(cumulativeErrorCountsYvals) << endl
+ << "xd1=" << vector2Rstring(cumulativeErrorCountsD1Xvals) << endl
+ << "yd1=" << vector2Rstring(cumulativeErrorCountsD1Yvals) << endl
+ << "xd2=" << vector2Rstring(cumulativeErrorCountsD2Xvals) << endl
+ << "yd2=" << vector2Rstring(cumulativeErrorCountsD2Yvals) << endl
+ << "pdf(\"" << options.outprefix + ".plot.pdf\")" << endl
+ << "plot(x, y, xlab=\"FCD cutoff\", ylim=c(-1,1), xlim=c(0," << fcdCutoff + 0.5 << "), ylab=\"Proportion of failed windows\", type=\"l\")" << endl
+ << "abline(v=fcd_cutoff, col=\"red\")" << endl
+ << "lines(xd2, yd2, col=\"blue\", lty=2)" << endl
+ << "lines(xd1, yd1, col=\"green\", lty=2)" << endl
+ << "text(fcd_cutoff+0.02, 0.8, labels=c(paste(\"y =\", fcd_cutoff)), col=\"red\", adj=c(0,0))" << endl
+ << "dev.off()" << endl;
+
+ ofs.close();
+ systemCall("R CMD BATCH --no-save " + outfile + " " + outfile + "out");
+ return 0;
+}
+
+
+
+void parseOptions(int argc, char** argv, CmdLineOptions& ops)
+{
+ string usage;
+ short requiredArgs = 3;
+ int i;
+
+ usage = "[options] <preprocess directory> <stats prefix> <prefix of outut files>\n\n\
+where 'stats prefix' is output files prefix used when stats was run\n\n\
+Options:\n\n\
+-l <int>\n\tWindow length [insert_size / 2] (insert_size is taken to be\n\
+\tsample_ave_fragment_length in the file global_stats.txt file made by stats)\n\
+-p <int>\n\tPercent of bases in window > fcd cutoff to call as error [80]\n\
+-s <int>\n\tStep length for window sampling [100]\n\
+-w <int>\n\tMax number of windows to sample [100000]\n\
+";
+
+ if (argc == 2 && strcmp(argv[1], "--wrapperhelp") == 0)
+ {
+ cerr << usage << endl;
+ exit(1);
+ }
+ else if (argc < requiredArgs)
+ {
+ cerr << "usage:\ntask_fcdrate " << usage;
+ exit(1);
+ }
+
+ // set defaults
+ ops.windowWidth = 0;
+ ops.windowPercentCutoff = 80;
+ ops.maxWindows = 100000;
+ ops.windowStep = 100;
+ ops.debug = false;
+
+ for (i = 1; i < argc - requiredArgs; i++)
+ {
+ // deal with booleans
+ if (strcmp(argv[i], "-d") == 0)
+ {
+ ops.debug = true;
+ continue;
+ }
+
+ // non booleans are of form -option value, so check
+ // next value in array is there before using it!
+ if (strcmp(argv[i], "-l") == 0)
+ {
+ ops.windowWidth = atoi(argv[i+1]);
+ }
+ else if (strcmp(argv[i], "-p") == 0)
+ {
+ ops.windowPercentCutoff = atoi(argv[i+1]);
+ }
+ else if (strcmp(argv[i], "-s") == 0)
+ {
+ ops.windowStep = atoi(argv[i+1]);
+ }
+ else if (strcmp(argv[i], "-w") == 0)
+ {
+ ops.maxWindows = atoi(argv[i+1]);
+ }
+ else
+ {
+ cerr << ERROR_PREFIX << "Error! Switch not recognised: " << argv[i] << endl;
+ exit(1);
+ }
+ i++;
+ }
+
+ if (argc - i != requiredArgs || argv[i+1][0] == '-')
+ {
+ cerr << usage;
+ exit(1);
+ }
+
+ ops.preprocessDir = argv[i];
+ string statsPrefix = argv[i+1];
+ ops.outprefix = argv[i+2];
+ ops.statsInfile = statsPrefix + ".per_base.gz";
+ ops.fragmentLength = fragmentLengthFromFile(statsPrefix + ".global_stats.txt");
+
+ if (ops.windowWidth == 0)
+ ops.windowWidth = ops.fragmentLength > 1000 ? ops.fragmentLength / 2 : ops.fragmentLength;
+}
+
+
// Approximates dy/dx with a difference spanning pointSkip points, appending
// the midpoints to d1x and the gradients to d1y (outputs have
// x.size() - pointSkip elements; nothing is appended for shorter inputs).
// The loop bound is written as i + pointSkip < x.size() rather than
// i < x.size() - pointSkip: the latter underflows (size_t wraparound) when
// fewer than pointSkip points are supplied, causing out-of-bounds reads.
void getGradient(vector<double>& x, vector<double>& y, vector<double>& d1x, vector<double>& d1y)
{
    unsigned short pointSkip = 2;
    for (unsigned long i = 0; i + pointSkip < x.size(); i++)
    {
        d1x.push_back(0.5 * (x[i+pointSkip] + x[i]));
        d1y.push_back( (y[i+pointSkip] - y[i]) / (x[i+pointSkip] - x[i]) );
    }
}
+
+
// Serialises a vector as an R c(...) expression, e.g. {1,2,3} -> "c(1,2,3)".
// An empty vector now yields the valid R expression "c()"; the original
// produced the malformed string "c)".
string vector2Rstring(vector<double>& x)
{
    if (x.empty())
    {
        return "c()";
    }

    stringstream ss;
    ss << "c(";

    for (unsigned long i = 0; i < x.size(); i++)
    {
        ss << x[i] << ",";
    }

    // swap the trailing comma for the closing parenthesis
    string out = ss.str();
    return out.substr(0, out.size() - 1) + ")";
}
+
+
// Scales v in place so the element of largest magnitude becomes +/-1, and
// returns that element's index.  Uses fabs: the original called abs(),
// which can resolve to the integer abs(int) from <stdlib.h> and silently
// truncate the doubles before comparison.  Empty and all-zero vectors are
// returned unchanged (the original divided by zero for all-zero input).
unsigned long normalise(vector<double> & v)
{
    if (v.empty())
    {
        return 0;
    }

    unsigned long maxValueIndex = 0;

    for (unsigned long i = 1; i < v.size(); i++)
    {
        if ( fabs(v[i]) > fabs(v[maxValueIndex]) )
        {
            maxValueIndex = i;
        }
    }

    // all-zero vector: scaling would divide by zero
    if (v[maxValueIndex] == 0)
    {
        return maxValueIndex;
    }

    double scaleFactor = fabs(1.0 / v[maxValueIndex]);

    for (unsigned long i = 0; i < v.size(); i++)
    {
        v[i] *= scaleFactor;
    }

    return maxValueIndex;
}
+
+
// Dumps paired (x, y) values to stdout as tab-separated lines, each line
// tagged with the given prefix (used for the -d debug output).
void printVectors(string prefix, vector<double>& x, vector<double>& y)
{
    for (unsigned long idx = 0; idx != x.size(); ++idx)
    {
        cout << prefix << '\t' << x[idx] << '\t' << y[idx] << endl;
    }
}
+
// Scans a global_stats.txt file (tab-separated key/value lines) for the
// 'sample_ave_fragment_length' entry and returns its value.  Exits the
// program if the file cannot be opened or the key is not found.
unsigned long fragmentLengthFromFile(string fname)
{
    ifstream ifs(fname.c_str());

    if (!ifs.good())
    {
        cerr << ERROR_PREFIX << "Error opening file '" << fname << "'" << endl;
        exit(1);
    }

    string line;

    while (getline(ifs, line))
    {
        vector<string> v;
        split(line, '\t', v);
        // NOTE(review): assumes split() from utils.h always yields at least
        // one field -- a blank line would make v[0] undefined otherwise;
        // confirm against the implementation in utils.cpp
        if (v[0].compare("sample_ave_fragment_length") == 0)
            return (unsigned long)atoi(v[1].c_str());
    }

    // key never seen: report and abort (the close is only reached here)
    ifs.close();
    cerr << ERROR_PREFIX << "Error getting fragment length from file '" << fname << "'" << endl;
    exit(1);
}
diff --git a/src/task_gapresize.cpp b/src/task_gapresize.cpp
new file mode 100644
index 0000000..28b01e4
--- /dev/null
+++ b/src/task_gapresize.cpp
@@ -0,0 +1,239 @@
+#include <stdio.h>
+#include <math.h>
+#include <stdlib.h>
+#include <iostream>
+#include <cstring>
+#include <fstream>
+#include <map>
+#include <string>
+#include <utility>
+#include <algorithm>
+#include <iomanip>
+#include <list>
+
+#include "trianglePlot.h"
+#include "utils.h"
+#include "fasta.h"
+#include "api/BamMultiReader.h"
+#include "api/BamReader.h"
+
+using namespace BamTools;
+using namespace std;
+
+const string ERROR_PREFIX = "[REAPR gapresize] ";
+
// Command line options for task_gapresize, filled in by parseOptions().
struct CmdLineOptions
{
    string bamInfile;              // BAM of reads mapped to the assembly
    string assemblyInfile;         // assembly fasta whose gaps get resized
    string outprefix;              // prefix of output files (.info, .fasta)
    unsigned long minGapToResize;  // -g; skip gaps shorter than this
    unsigned long maxFragLength;   // maximum fragment length considered
    unsigned long aveFragLength;   // average fragment length
};
+
+
+// deals with command line options: fills the options struct
+void parseOptions(int argc, char** argv, CmdLineOptions& ops);
+
// For every gap in every sequence of the input assembly: collect the innie
// read pairs whose fragments span the gap, feed them to a TrianglePlot, ask
// it for the gap length that minimises the FCD error, and write a resized
// copy of the assembly plus a per-gap .info report.
int main(int argc, char* argv[])
{
    CmdLineOptions options;
    parseOptions(argc, argv, options);
    BamReader bamReader;
    BamAlignment bamAlign;
    SamHeader header;
    RefVector references;
    // NOTE(review): currentRefIDstring is never read in this function
    string currentRefIDstring = "";

    Fasta fa;
    ifstream ifs(options.assemblyInfile.c_str());

    if (!ifs.good())
    {
        cerr << "Error opening file '" << options.assemblyInfile << "'" << endl;
        exit(1);
    }

    string info_outfile = options.outprefix + ".info";
    ofstream ofs_info(info_outfile.c_str());
    if (!ofs_info.good())
    {
        cerr << ERROR_PREFIX << "Error opening file '" << info_outfile << "'" << endl;
        exit(1);
    }

    ofs_info << "#chr\toriginal_coords\toriginal_lgth\toriginal_fcd_err\tnew_coords\tnew_lgth\tnew_fcd_err\tfragment_depth" << endl;

    string fasta_outfile = options.outprefix + ".fasta";
    ofstream ofs_fasta(fasta_outfile.c_str());
    if (!ofs_fasta.good())
    {
        cerr << ERROR_PREFIX << "Error opening file '" << fasta_outfile << "'" << endl;
        exit(1);
    }

    if (!bamReader.Open(options.bamInfile))
    {
        cerr << ERROR_PREFIX << "Error opening bam file '" << options.bamInfile << "'" << endl;
        return 1;
    }

    // random access into the BAM needs an index
    if (!bamReader.LocateIndex())
    {
        cerr << ERROR_PREFIX << "Couldn't find index for bam file '" << options.bamInfile << "'!" << endl;
        exit(1);
    }

    if (!bamReader.HasIndex())
    {
        cerr << ERROR_PREFIX << "No index for bam file '" << options.bamInfile << "'!" << endl;
        exit(1);
    }

    header = bamReader.GetHeader();
    references = bamReader.GetReferenceData();
    TrianglePlot triplot(0);

    // Go through input fasta file, checking each gap in each sequence
    while (fa.fillFromFile(ifs))
    {
        // running difference between new and original coordinates, updated
        // as earlier gaps in this sequence are grown or shrunk; signed
        // because gaps can shrink
        long basesOffset = 0;
        list<pair<unsigned long, unsigned long> > gaps;
        fa.findGaps(gaps);

        for(list<pair<unsigned long, unsigned long> >::iterator gapIter = gaps.begin(); gapIter != gaps.end(); gapIter++)
        {
            // set the range of the bam reader. We need fragments within max insert size of either end
            // of the gap, and spanning the gap.
            unsigned long rangeStart = gapIter->first <= options.maxFragLength ? 1 : gapIter->first - options.maxFragLength;
            int id = bamReader.GetReferenceID(fa.id);
            bamReader.SetRegion(id, rangeStart, id, gapIter->first);
            unsigned long oldGapLength = gapIter->second - gapIter->first + 1;
            triplot.clear(gapIter->first);
            bool considerThisGap = (gapIter->second - gapIter->first + 1 >= options.minGapToResize);

            if (considerThisGap)
            {
                // put all the fragments into the triangle plot
                while (bamReader.GetNextAlignmentCore(bamAlign))
                {
                    if (!bamAlign.IsMapped() || bamAlign.IsDuplicate())
                    {
                        continue;
                    }

                    // keep forward reads of proper 'innie' pairs whose
                    // fragment ends beyond the gap but within maxFragLength
                    short pairOrientation = getPairOrientation(bamAlign);
                    int64_t fragEnd = bamAlign.Position + bamAlign.InsertSize - 1;
                    if (!bamAlign.IsReverseStrand() && pairOrientation == INNIE && bamAlign.InsertSize > 0 && gapIter->second < fragEnd && fragEnd <= gapIter->second + options.maxFragLength)
                    {
                        pair<unsigned long, unsigned long> fragment(bamAlign.Position, fragEnd);
                        triplot.add(fragment);
                    }
                }
            }

            if (triplot.depth() > 0)
            {
                // at least one spanning fragment: ask the triangle plot for
                // the gap length that minimises the FCD error, then splice
                // the resized run of Ns into the sequence
                unsigned long bestGapLength = 0;
                double minimumError = -1;
                triplot.optimiseGap(options.maxFragLength, options.aveFragLength, gapIter->first, gapIter->second, bestGapLength, minimumError);
                unsigned long newGapStart = gapIter->first + basesOffset;
                unsigned long newGapEnd = newGapStart + bestGapLength;
                fa.seq.replace(gapIter->first + basesOffset, oldGapLength, bestGapLength, 'N');

                // NOTE(review): this branch prints new_coords as
                // newGapStart+1 '-' newGapEnd, while the no-coverage branch
                // below prints ...'-' newGapEnd+1 -- looks inconsistent by
                // one; confirm which convention downstream tools expect
                ofs_info << fa.id
                    << '\t' << gapIter->first + 1 << '-' << gapIter->second + 1
                    << '\t' << oldGapLength
                    << '\t' << triplot.areaError(options.maxFragLength, options.aveFragLength, true, gapIter->first, gapIter->second)
                    << '\t' << newGapStart + 1 << '-' << newGapEnd
                    << '\t' << bestGapLength
                    << '\t' << minimumError
                    << '\t' << triplot.depth()
                    << endl;

                basesOffset += bestGapLength;
                basesOffset -= oldGapLength;
            }
            else
            {
                // no usable fragments (or gap skipped by -g): keep the gap
                // as-is and report '.' for the unknowable columns
                unsigned long newGapStart = gapIter->first + basesOffset;
                unsigned long newGapEnd = gapIter->second + basesOffset;

                ofs_info << fa.id
                    << '\t' << gapIter->first + 1 << '-' << gapIter->second + 1
                    << '\t' << oldGapLength
                    << '\t' << '.'
                    << '\t' << newGapStart + 1 << '-' << newGapEnd + 1
                    << '\t' << newGapEnd - newGapStart + 1
                    << '\t' << '.';

                if (considerThisGap)
                    ofs_info << '\t' << triplot.depth() << endl;
                else
                    ofs_info << '\t' << '.' << endl;
            }
        }

        fa.print(ofs_fasta);
    }

    ifs.close();
    ofs_info.close();
    ofs_fasta.close();
    return 0;
}
+
+
+void parseOptions(int argc, char** argv, CmdLineOptions& ops)
+{
+ string usage;
+ short requiredArgs = 5;
+ int i;
+ ops.minGapToResize = 1;
+
+ usage = "<assembly.fasta> <in.bam> <ave fragment length> <max fragment length> <prefix of outfiles>\n\n\
+Options:\n\n\
+-g <int>\n\tOnly consider gaps of at least this length [1]\n\n\
+";
+
+ if (argc == 2 && strcmp(argv[1], "--wrapperhelp") == 0)
+ {
+ cerr << usage << endl;
+ exit(1);
+ }
+ else if (argc < requiredArgs)
+ {
+ cerr << "usage:\ntask_gapresize " << usage;
+ exit(1);
+ }
+
+ for (i = 1; i < argc - requiredArgs; i++)
+ {
+ if (strcmp(argv[i], "-g") == 0)
+ {
+ ops.minGapToResize = atoi(argv[i+1]);;
+ }
+ else
+ {
+ cerr << ERROR_PREFIX << "Error! Switch not recognised: " << argv[i] << endl;
+ exit(1);
+ }
+
+ i++;
+ }
+
+ if (argc - i != requiredArgs || argv[i+1][0] == '-')
+ {
+ cerr << usage;
+ exit(1);
+ }
+
+ ops.assemblyInfile = argv[i];
+ ops.bamInfile = argv[i+1];
+ ops.aveFragLength = atoi(argv[i+2]);
+ ops.maxFragLength = atoi(argv[i+3]);
+ ops.outprefix = argv[i+4];
+}
+
diff --git a/src/task_perfectfrombam.pl b/src/task_perfectfrombam.pl
new file mode 100755
index 0000000..ffce475
--- /dev/null
+++ b/src/task_perfectfrombam.pl
@@ -0,0 +1,211 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use File::Spec;
+use File::Basename;
+use Getopt::Long;
+
+my ($scriptname, $scriptdir) = fileparse($0);
+my %options;
+my $usage = qq/[options] <in.bam> <prefix of output files> <min insert> <max insert> <repetitive max qual> <perfect min qual> <perfect min alignment score>
+
+Options:
+ -noclean
+ Use this to not delete the temporary bam file
+
+Alternative to using perfectmap, for large genomes.
+
+Takes a BAM, which must have AS:... tags in each line. Makes file
+of perfect mapping depth, for use with the REAPR pipeline. Recommended to
+use 'reapr perfectmap' instead, unless your genome is large (more than ~300MB),
+since although very fast to run, 'reapr perfectmap' uses a lot of memory.
+
+A BAM file made by 'reapr smaltmap' is suitable input.
+
+Reads in pair pointing towards each other, with the given minimum
+alignment score and mapping quality and within the given insert size range
+are used to generate the coverage across the genome.
+
+Additionally, regions with repetitive coverage are called, by taking read
+pairs where at least one read of the pair (is mapped and) has mapping
+quality less than or equal to <repetitive max qual>.
+/;
+
+
+my $ops_ok = GetOptions(
+ \%options,
+ 'wrapperhelp',
+ 'noclean',
+);
+
+if ($options{wrapperhelp}) {
+ print STDERR "$usage\n";
+ exit(1);
+}
+
+if ($#ARGV != 6 or !($ops_ok)) {
+ print STDERR "usage:\n$scriptname $usage\n";
+ exit(1);
+}
+
+my $bam_in = $ARGV[0];
+my $out_prefix = $ARGV[1];
+my $min_insert = $ARGV[2];
+my $max_insert = $ARGV[3];
+my $max_repeat_map_qual = $ARGV[4];
+my $min_perfect_map_qual = $ARGV[5];
+my $min_align_score = $ARGV[6];
+my $bam2perfect = File::Spec->catfile($scriptdir, 'bam2perfect');
+my $bgzip = File::Spec->catfile($scriptdir, 'tabix/bgzip');
+my $tabix = File::Spec->catfile($scriptdir, 'tabix/tabix');
+my $ERROR_PREFIX = '[REAPR perfectfrombam]';
+my $perfect_bam = "$out_prefix.tmp.perfect.bam";
+my $repetitive_bam = "$out_prefix.tmp.repetitive.bam";
+my $samtools = File::Spec->catfile($scriptdir, 'samtools');
+my %seq_lengths;
+my %used_seqs;
+my $hist_file = "$out_prefix.hist";
+my $perfect_cov_out = "$out_prefix.perfect_cov.gz";
+my $repetitive_regions_out = "$out_prefix.repetitive_regions.gz";
+my @coverage = (0) x 101;
+
+
+# Make a new BAM with just the perfect + uniquely mapped reads
+system_call("$bam2perfect $bam_in $out_prefix.tmp $min_insert $max_insert $max_repeat_map_qual $min_perfect_map_qual $min_align_score");
+
+# Get sequence length info from bam header
+open F, "$samtools view -H $bam_in |" or die "$ERROR_PREFIX Error reading header of '$bam_in'";
+while (<F>) {
+ if (/^\@SQ/) {
+ my $id;
+ my $length;
+ if (/\tSN:(.*?)[\t\n]/) {
+ $id = $1;
+ }
+ if (/\tLN:(.*)[\t\n]/) {
+ $length = $1;
+ }
+
+ unless (defined $id and defined $length) {
+ die "Error parsing \@SQ line from header of bam at this line:\n$_";
+ }
+
+ $seq_lengths{$id} = $length;
+ }
+}
+
+close F or die $!;
+
+# run samtools mpileup on the perfect coverage BAM, writing the coverage to a new file.
+# Have to be careful because mpileup only reports bases with != coverage.
+open FIN, "$samtools mpileup $perfect_bam | cut -f 1,2,4|" or die "$ERROR_PREFIX Error running samtools mpileup on '$perfect_bam'";
+open FOUT, "| $bgzip -c > $perfect_cov_out" or die "$ERROR_PREFIX Error opening '$perfect_cov_out'";
+my $current_ref = "";
+my $current_pos = 1;
+
+
+while (<FIN>) {
+ chomp;
+ my ($ref, $pos, $cov) = split /\t/;
+
+ if ($current_ref ne $ref) {
+ if ($current_ref ne "") {
+ while ($current_pos <= $seq_lengths{$current_ref}) {
+ print FOUT "$current_ref\t$current_pos\t0\n";
+ $coverage[0]++;
+ $current_pos++;
+ }
+ }
+ $used_seqs{$ref} = 1;
+ $current_ref = $ref;
+ $current_pos = 1;
+ }
+
+ while ($current_pos < $pos) {
+ print FOUT "$ref\t$current_pos\t0\n";
+ $coverage[0]++;
+ $current_pos++;
+ }
+
+ print FOUT "$ref\t$pos\t$cov\n";
+ $coverage[$cov > 100 ? 100 : $cov]++;
+ $current_pos++;
+}
+
+
+while ($current_pos <= $seq_lengths{$current_ref}) {
+ print FOUT "$current_ref\t$current_pos\t0\n";
+ $coverage[0]++;
+ $current_pos++;
+}
+
+close FIN or die $!;
+close FOUT or die $!;
+system_call("$tabix -f -b 2 -e 2 $perfect_cov_out");
+
+
+# make histogram of coverage file. First need to account
+# for the sequences that had no coverage
+for my $seq (keys %seq_lengths) {
+ unless (exists $used_seqs{$seq}) {
+ $coverage[0] += $seq_lengths{$seq};
+ }
+}
+
+open F, ">$hist_file" or die "$ERROR_PREFIX Error opening file '$hist_file'";
+print F "#coverage\tnumber_of_bases\n";
+
+for my $i (0..$#coverage) {
+ print F "$i\t$coverage[$i]\n";
+}
+
+close F;
+
+# get the regions of nonzero repetitive coverage from the
+# repetitive BAM
+open FIN, "$samtools mpileup -A -C 0 $repetitive_bam | cut -f 1,2,4|" or die "$ERROR_PREFIX Error running samtools mpileup on '$repetitive_bam'";
+open FOUT, "| $bgzip -c > $repetitive_regions_out" or die "$ERROR_PREFIX Error opening '$repetitive_regions_out'";
+$current_ref = "";
+my $interval_start = -1;
+my $interval_end = -1;
+
+while (<FIN>) {
+ chomp;
+ my ($ref, $pos, $cov) = split /\t/;
+
+ if ($current_ref ne $ref) {
+ if ($current_ref ne "") {
+ print FOUT "$current_ref\t$interval_start\t$interval_end\n";
+ }
+ $current_ref = $ref;
+ $interval_start = $interval_end = $pos;
+ }
+ else {
+ if ($pos == $interval_end + 1) {
+ $interval_end++;
+ }
+ else {
+ print FOUT "$current_ref\t$interval_start\t$interval_end\n";
+ $interval_start = $interval_end = $pos;
+ }
+ }
+}
+
+print FOUT "$current_ref\t$interval_start\t$interval_end\n";
+close FIN or die $!;
+close FOUT or die $!;
+
+unless ($options{noclean}) {
+ unlink $perfect_bam or die $!;
+ unlink $repetitive_bam or die $!;
+}
+
sub system_call {
    # Run a shell command; if it exits non-zero, report the failing command
    # on STDERR and abort the whole script.
    my $cmd = shift;
    return unless system($cmd);
    print STDERR "$ERROR_PREFIX Error in system call:\n$cmd\n";
    exit(1);
}
+
diff --git a/src/task_perfectmap.pl b/src/task_perfectmap.pl
new file mode 100755
index 0000000..5faba51
--- /dev/null
+++ b/src/task_perfectmap.pl
@@ -0,0 +1,143 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use File::Spec;
+use File::Basename;
+use Getopt::Long;
+
+my ($scriptname, $scriptdir) = fileparse($0);
+my %options;
+my $usage = qq/<assembly.fa> <reads_1.fastq> <reads_2.fastq> <ave insert size> <prefix of output files>
+
+Note: the reads can be gzipped. If the extension is '.gz', then they
+are assumed to be gzipped and dealt with accordingly (i.e. called something
+like reads_1.fastq.gz reads_2.fastq.gz).
+
+IMPORTANT: all reads must be the same length.
+/;
+
+
+my $ops_ok = GetOptions(
+ \%options,
+ 'wrapperhelp',
+);
+
+if ($options{wrapperhelp}) {
+ print STDERR "$usage\n";
+ exit(1);
+}
+
+if ($#ARGV != 4 or !($ops_ok)) {
+ print STDERR "usage:\n$scriptname $usage\n";
+ exit(1);
+}
+
+my $ref_fa = $ARGV[0];
+my $reads_1 = $ARGV[1];
+my $reads_2 = $ARGV[2];
+my $fragsize = $ARGV[3];
+my $preout = $ARGV[4];
+my $findknownsnps = File::Spec->catfile($scriptdir, 'findknownsnps');
+my $ERROR_PREFIX = '[REAPR perfect_map]';
+my $raw_coverage_file = "$preout.tmp.cov.txt";
+my $tmp_bin = "$preout.tmp.bin";
+my $tmp_bin_single_match = "$tmp_bin\_single_match.fastq";
+my $tabix = File::Spec->catfile($scriptdir, 'tabix/tabix');
+my $bgzip = File::Spec->catfile($scriptdir, 'tabix/bgzip');
+my $samtools = File::Spec->catfile($scriptdir, 'samtools');
+my $all_bases_outfile = "$preout.perfect_cov.gz";
+my $hist_outfile = "$preout.hist";
+my @coverage = (0) x 101;
+my %ids_with_coverage;
+my $reads_for_snpomatic_1 = "$preout.tmp.reads_1.fq";
+my $reads_for_snpomatic_2 = "$preout.tmp.reads_2.fq";
+my $frag_variance = int(0.5 * $fragsize);
+
+# we want an indexed fasta file
+unless (-e "$ref_fa.fai"){
+ system_call("$samtools faidx $ref_fa");
+}
+
+# snp-o-matic can't take gzipped reads, so decompress them first if necessary
+if ($reads_1 =~ /\.gz$/) {
+ system_call("gunzip -c $reads_1 > $reads_for_snpomatic_1");
+}
+else {
+ symlink($reads_1, $reads_for_snpomatic_1) or die "Error making symlink $reads_1, $reads_for_snpomatic_1";
+}
+
+if ($reads_2 =~ /\.gz$/) {
+ system_call("gunzip -c $reads_2 > $reads_for_snpomatic_2");
+}
+else {
+ symlink($reads_2, $reads_for_snpomatic_2) or die "Error making symlink $reads_2, $reads_for_snpomatic_2";
+}
+
+# get the read length
+open F, "$reads_for_snpomatic_1" or die "$ERROR_PREFIX Error opening file '$reads_for_snpomatic_1'";
+<F>; # first ID line @... of fastq file
+my $line = <F>;
+chomp $line;
+my $read_length = length $line;
+close F;
+
+# do the mapping with snpomatic, makes file of perfect read coverage at each position of the genome
+system_call("$findknownsnps --genome=$ref_fa --fastq=$reads_for_snpomatic_1 --fastq2=$reads_for_snpomatic_2 --bins=$tmp_bin --binmask=0100 --fragment=$fragsize --variance=$frag_variance --chop=10");
+system_call("$findknownsnps --genome=$ref_fa --fastq=$tmp_bin_single_match --pair=$read_length --fragment=$fragsize --variance=$frag_variance --coverage=$raw_coverage_file --chop=10");
+
+unlink $reads_for_snpomatic_1 or die "Error deleting $reads_for_snpomatic_1";
+unlink $reads_for_snpomatic_2 or die "Error deleting $reads_for_snpomatic_2";
+
+# use perfect coverage file to make a tabixed file of the perfect coverage
+open FIN, "$raw_coverage_file" or die "$ERROR_PREFIX Error opening '$raw_coverage_file'";
+open FOUT, "| $bgzip -c >$all_bases_outfile" or die "$ERROR_PREFIX Error opening '$all_bases_outfile'";
+print FOUT "#chr\tposition\tcoverage\n";
+<FIN>;
+while (<FIN>) {
+ my @a = split /\t/;
+ my $s = $a[3] + $a[4] + $a[5] + $a[6];
+ print FOUT "$a[0]\t$a[1]\t$s\n";
+ $coverage[$s > 100 ? 100 : $s]++;
+ unless (exists $ids_with_coverage{$a[0]}) {
+ $ids_with_coverage{$a[0]} = 1;
+ }
+}
+
+close FIN;
+close FOUT;
+
+# sequences with no coverage at all are not in the snpomatic output,
+# so check against the fai file to mop up the zero coverage base count
+open FIN, "$ref_fa.fai" or die "$ERROR_PREFIX Error opening $ref_fa.fai";
+while (<FIN>) {
+ my @a = split /\t/;
+ unless (exists $ids_with_coverage{$a[0]}) {
+ $coverage[0] += $a[1];
+ }
+}
+close FIN;
+
+
+open FOUT, ">$hist_outfile" or die "$ERROR_PREFIX Error opening '$hist_outfile'";
+print FOUT "#coverage\t#number_of_bases\n";
+
+for my $i (0..$#coverage) {
+ print FOUT "$i\t$coverage[$i]\n";
+}
+
+close FOUT;
+
+
+system_call("$tabix -f -b 2 -e 2 $all_bases_outfile");
+unlink $tmp_bin_single_match or die "$ERROR_PREFIX Error deleting file $tmp_bin_single_match";
+unlink $raw_coverage_file or die "$ERROR_PREFIX Error deleting file $raw_coverage_file";
+
+
sub system_call {
    # Execute a shell command; print the offending command line to STDERR
    # and exit if it returns a non-zero status.
    my $cmd = shift;
    my $status = system($cmd);
    if ($status != 0) {
        print STDERR "$ERROR_PREFIX Error in system call:\n$cmd\n";
        exit(1);
    }
}
diff --git a/src/task_pipeline.pl b/src/task_pipeline.pl
new file mode 100755
index 0000000..e3c4a17
--- /dev/null
+++ b/src/task_pipeline.pl
@@ -0,0 +1,135 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use File::Spec;
+use File::Basename;
+use Getopt::Long;
+use Cwd 'abs_path';
+
+my ($scriptname, $scriptdir) = fileparse($0);
+my $reapr_dir = abs_path(File::Spec->catfile($scriptdir, File::Spec->updir()));
+my $reapr = File::Spec->catfile($reapr_dir, 'reapr');
+
+my %options = (fcdcut => 0);
+
+my $usage = qq/[options] <assembly.fa> <in.bam> <out directory> [perfectmap prefix]
+
+where 'perfectmap prefix' is optional and should be the prefix used when task
+perfectmap was run.
+
+It is assumed that reads in in.bam are 'innies', i.e. the correct orientation
+is reads in a pair pointing towards each other (---> <---).
+
+Options:
+
+-stats|fcdrate|score|break option=value
+\tYou can pass options to stats, fcdrate, score or break
+\tif you want to change the default settings. These
+\tcan be used multiple times to use more than one option. e.g.:
+\t\t-stats i=100 -stats j=1000
+\tIf an option has no value, use 1. e.g.
+\t\t-break b=1
+-fcdcut <float>
+\tSet the fcdcutoff used when running score. Default is to
+\trun fcdrate to determine the cutoff. Using this option will
+\tskip fcdrate and use the given value.
+-x
+\tBy default, a bash script is written to run all
+\tthe pipeline stages. Using this option stops the
+\tscript from being run.
+/;
+
+my $ERROR_PREFIX = '[REAPR pipeline]';
+
+my $ops_ok = GetOptions(
+ \%options,
+ 'wrapperhelp',
+ 'stats=s%',
+ 'fcdrate=s%',
+ 'score=s%',
+ 'break=s%',
+ 'x',
+ 'fcdcut=f',
+);
+
+if ($options{wrapperhelp}) {
+ print STDERR "$usage\n";
+ exit(1);
+}
+
+if (!($ops_ok) or $#ARGV < 2) {
+ print STDERR "usage:\n$scriptname $usage\n";
+ exit(1);
+}
+
+my $ref = $ARGV[0];
+my $bam = $ARGV[1];
+my $dir = $ARGV[2];
+my $version = '1.0.18';
+my $bash_script = "$dir.run-pipeline.sh";
+my $stats_prefix = '01.stats';
+my $fcdrate_prefix = '02.fcdrate';
+my $score_prefix = '03.score';
+my $break_prefix = '04.break';
+my $summary_prefix = '05.summary';
+
+my $perfect_prefix = "";
+if ($#ARGV == 3) {
+ $perfect_prefix = File::Spec->rel2abs($ARGV[3]);
+}
+
+# make a bash script that runs all the pipeline commands
+my %commands;
+$commands{facheck} = "$reapr facheck $ref";
+$commands{preprocess} = "$reapr preprocess $ref $bam $dir\n"
+. "cd $dir";
+
+if ($perfect_prefix) {
+ $commands{stats} = "$reapr stats " . hash_to_ops($options{stats}) . " -p $perfect_prefix.perfect_cov.gz ./ $stats_prefix";
+ $commands{score} = "$reapr score " . hash_to_ops($options{score}) . " -P 5 00.assembly.fa.gaps.gz 00.in.bam $stats_prefix \$fcdcutoff $score_prefix";
+}
+else {
+ $commands{stats} = "$reapr stats " . hash_to_ops($options{stats}) . " ./ $stats_prefix";
+ $commands{score} = "$reapr score " . hash_to_ops($options{score}) . " 00.assembly.fa.gaps.gz 00.in.bam $stats_prefix \$fcdcutoff $score_prefix";
+}
+
+if ($options{fcdcut} == 0) {
+ $commands{fcdrate} = "$reapr fcdrate " . hash_to_ops($options{fcdrate}) . " ./ $stats_prefix $fcdrate_prefix\n"
+ . "fcdcutoff=`tail -n 1 $fcdrate_prefix.info.txt | cut -f 1`";
+}
+else {
+ $commands{fcdrate} = "echo \"$ERROR_PREFIX ... skipping. User provided cutoff: $options{fcdcut}\"\n"
+ . "fcdcutoff=$options{fcdcut}";
+}
+
+my $break_ops = hash_to_ops($options{break});
+$break_ops =~ s/\-a 1/-a/;
+$break_ops =~ s/-b 1/-b/;
+$commands{break} = "$reapr break $break_ops 00.assembly.fa $score_prefix.errors.gff.gz $break_prefix";
+$commands{summary} = "$reapr summary 00.assembly.fa $score_prefix $break_prefix $summary_prefix";
+
+open F, ">$bash_script" or die "$ERROR_PREFIX Error opening file for writing '$bash_script'";
+print F "set -e\n"
+. "echo \"Running reapr version $version pipeline:\"\n"
+. "echo \"$reapr " . join(' ', @ARGV) . "\"\n\n";
+
+for my $task (qw/facheck preprocess stats fcdrate score break summary/) {
+ print F "echo \"$ERROR_PREFIX Running $task\"\n"
+ . "$commands{$task}\n\n";
+}
+
+close F;
+
+$options{x} or exec "bash $bash_script" or die $!;
+
sub hash_to_ops {
    # Turn an option hash into a command-line fragment: each key/value pair
    # becomes "-key value", pairs separated by single spaces (no leading
    # space; empty hash gives the empty string).
    my $h = shift;
    return join ' ', map { "-$_ " . $h->{$_} } keys %{$h};
}
+
diff --git a/src/task_plots.pl b/src/task_plots.pl
new file mode 100755
index 0000000..75fc7d4
--- /dev/null
+++ b/src/task_plots.pl
@@ -0,0 +1,130 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use Getopt::Long;
+use File::Basename;
+use File::Spec;
+
+my ($scriptname, $scriptdir) = fileparse($0);
+my %options;
+
+my $usage = qq/[options] <in.stats.gz> <out prefix> <assembly.fa> <contig id>
+
+Options:
+
+-s <scores prefix>
+\tThis should be the outfiles prefix used when score was run
+/;
+
+my $ops_ok = GetOptions(
+ \%options,
+ 'score_prefix:s',
+ 'wrapperhelp',
+);
+
+if ($options{wrapperhelp}) {
+ die $usage;
+}
+
+if ($#ARGV != 3 or !($ops_ok)) {
+ die "usage:\n$scriptname $usage";
+}
+
+my $stats = File::Spec->rel2abs($ARGV[0]);
+my $outprefix = File::Spec->rel2abs($ARGV[1]);
+my $ref_fa = File::Spec->rel2abs($ARGV[2]);
+my $ref_id = $ARGV[3];
+my $ERROR_PREFIX = '[REAPR plots]';
+
+# check input files exist
+unless (-e $stats) {
+ print STDERR "$ERROR_PREFIX Can't find stats file '$stats'\n";
+ exit(1);
+}
+
+
+unless (-e $ref_fa) {
+ print STDERR "$ERROR_PREFIX Can't find assmbly fasta file '$ref_fa'\n";
+ exit(1);
+}
+
+
+my $tabix = File::Spec->catfile($scriptdir, 'tabix/tabix');
+my $bgzip = File::Spec->catfile($scriptdir, 'tabix/bgzip');
+my $samtools = File::Spec->catfile($scriptdir, 'samtools');
+my @plot_list = ('frag_cov', 'frag_cov_cor', 'read_cov', 'read_ratio_f', 'read_ratio_r', 'clip', 'FCD_err');
+my @file_list;
+my $fa_out = "$outprefix.ref.fa";
+my $gff_out;
+
+foreach (@plot_list) {
+ push @file_list, "$outprefix.$_.plot";
+}
+
+# make the standard plot files
+my $plot_prog = File::Spec->catfile($scriptdir, "make_plots");
+system_call("$tabix $stats '$ref_id' | $plot_prog $outprefix");
+
+# if requested, make a scores plot file and gff errors file
+if ($options{score_prefix}) {
+ # scores
+ my $scores_in = $options{score_prefix} . ".per_base.gz";
+ my $score_plot = "$outprefix.score.plot";
+ system_call("$tabix $scores_in '$ref_id' > $score_plot");
+ push @file_list, $score_plot;
+
+ # gff
+ my $gff_in = $options{score_prefix} . ".errors.gff.gz";
+ $gff_out = "$outprefix.errors.gff.gz";
+ system_call("$tabix $gff_in '$ref_id' | $bgzip -c > $gff_out");
+}
+
+# check if a perfect_cov plot file was made
+my $perfect_plot = "$outprefix.perfect_cov.plot";
+if (-e $perfect_plot) {push @file_list, $perfect_plot};
+
+# get the reference sequence from the fasta file
+system_call("$samtools faidx $ref_fa '$ref_id' > $fa_out");
+
+# bgzip the plots
+foreach (@file_list) {
+ system_call("$bgzip $_");
+ if (/\.gff$/) {
+ system_call("$tabix -p gff $_.gz");
+ }
+ else {
+ system_call("$tabix -b 2 -e 2 $_.gz");
+ }
+ $_ .= ".gz";
+
+}
+
+
+
+# write shell script to start artemis
+my $bash_script = "$outprefix.run_art.sh";
+open FILE, ">$bash_script" or die $!;
+print FILE "#!/usr/bin/env bash
+set -e
+art -Duserplot='";
+print FILE join (",", sort @file_list);
+if ($options{score_prefix}) {
+ print FILE "' $fa_out + $gff_out\n";
+}
+else {
+ print FILE "' $fa_out\n";
+}
+close FILE;
+chmod 0755, $bash_script;
+
+
+# usage: system_call(string)
+# Runs the string as a system call, dies if call returns nonzero error code
+sub system_call {
+ my $cmd = shift;
+ if (system($cmd)) {
+ print STDERR "Error in system call:\n$cmd\n";
+ exit(1);
+ }
+}
diff --git a/src/task_preprocess.pl b/src/task_preprocess.pl
new file mode 100755
index 0000000..1b60927
--- /dev/null
+++ b/src/task_preprocess.pl
@@ -0,0 +1,484 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use File::Spec;
+use File::Basename;
+use Getopt::Long;
+use List::Util qw[min max];
+use File::Copy;
+
+my ($scriptname, $scriptdir) = fileparse($0);
+my %options;
+my $usage = qq/<assembly.fa> <in.bam> <output directory>
+/;
+
+my $ERROR_PREFIX = '[REAPR preprocess]';
+
+my $ops_ok = GetOptions(
+ \%options,
+ 'wrapperhelp',
+);
+
+if ($options{wrapperhelp}) {
+ print STDERR "$usage\n";
+ exit(1);
+}
+
+if ($#ARGV != 2 or !($ops_ok)) {
+ print STDERR "usage:\n$scriptname $usage\n";
+ exit(1);
+}
+
+my $fasta_in = $ARGV[0];
+my $bam_in = $ARGV[1];
+my $outdir = $ARGV[2];
+
+-e $fasta_in or die "$ERROR_PREFIX Cannot find file '$fasta_in'. Aborting\n";
+-e $bam_in or die "$ERROR_PREFIX Cannot find file '$bam_in'. Aborting\n";
+
+my $prefix = '00';
+my $ref = "$prefix.assembly.fa";
+my $bam = "$prefix.in.bam";
+my $gaps_file = "$prefix.assembly.fa.gaps.gz";
+my $gc_file = "$prefix.assembly.fa.gc.gz";
+my $bases_to_sample = 1000000;
+my $total_frag_sample_bases = 4000000;
+my $sample_dir = "$prefix.Sample";
+my $fcd_file = File::Spec->catfile($sample_dir, 'fcd.txt');
+my $insert_prefix = File::Spec->catfile($sample_dir, 'insert');
+my $frag_cov_file = File::Spec->catfile($sample_dir, 'fragCov.gz');
+my $gc_vs_cov_data_file = File::Spec->catfile($sample_dir, 'gc_vs_cov.dat');
+my $ideal_fcd_file = File::Spec->catfile($sample_dir, 'ideal_fcd.txt');
+my $lowess_prefix = File::Spec->catfile($sample_dir, 'gc_vs_cov.lowess');
+my $r_script = File::Spec->catfile($sample_dir, 'gc_vs_cov.R');
+my $tabix = File::Spec->catfile($scriptdir, 'tabix/tabix');
+my $bgzip = File::Spec->catfile($scriptdir, 'tabix/bgzip');
+my $samtools = File::Spec->catfile($scriptdir, 'samtools');
+
+# make directory and soft links to required files
+$fasta_in = File::Spec->rel2abs($fasta_in);
+$bam_in = File::Spec->rel2abs($bam_in);
+
+if (-d $outdir) {
+ die "$ERROR_PREFIX Directory '$outdir' already exists. Cannot continue\n";
+}
+
+mkdir $outdir or die $!;
+chdir $outdir or die $!;
+symlink $fasta_in, $ref or die $!;
+symlink $bam_in, $bam or die $!;
+mkdir $sample_dir or die $!;
+
+# we want indexed fasta and bam files.
+# Check if they are already indexed, run the indexing if needed or soft link index files
+if (-e "$fasta_in.fai") {
+ symlink "$fasta_in.fai", "$ref.fai";
+}
+else {
+ system_call("$samtools faidx $ref");
+}
+
+if (-e "$bam_in.bai") {
+ symlink "$bam_in.bai", "$bam.bai";
+}
+else {
+ system_call("$samtools index $bam");
+}
+
+# make gaps file of gaps in reference
+my $fa2gaps = File::Spec->catfile($scriptdir, 'fa2gaps');
+system_call("$fa2gaps $ref | $bgzip -c > $gaps_file");
+system_call("$tabix -f -b 2 -e 3 $gaps_file");
+
+# get insert distribution from a sample of the BAM
+my $bam2insert = File::Spec->catfile($scriptdir, 'bam2insert');
+system_call("$bam2insert -n 50000 -s 2000000 $bam $ref.fai $insert_prefix");
+
+# get the insert stats from output file from bam2insert
+my %insert_stats = (
+ mode => -1,
+ mean => -1,
+ pc1 => -1,
+ pc99 => -1,
+ sd => -1,
+);
+
+open F, "$insert_prefix.stats.txt" or die $!;
+while (<F>) {
+ chomp;
+ my ($stat, $val) = split /\t/;
+ $insert_stats{$stat} = $val;
+}
+close F;
+
+foreach (keys %insert_stats) {
+ if ($insert_stats{$_} == -1) {
+ print STDERR "$ERROR_PREFIX Error getting insert stat $_ from file $insert_prefix.stats.txt\n";
+ exit(1);
+ }
+}
+
+# with large insert libraries, it can happen that the mode is very small,
+# even though the mean is something like 30k. Check for this by looking for
+# the mode within one standard deviation of the mean.
+my $ave_insert = get_mean($insert_stats{mode}, $insert_stats{mean}, $insert_stats{sd}, "$insert_prefix.in");
+
+# update the insert stats file with the 'average insert size'
+open F, ">>$insert_prefix.stats.txt" or die $!;
+print F "ave\t$ave_insert\n";
+close F;
+
+# get GC content across the genome
+if (-e "$fasta_in.gc.gz") {
+ symlink "$fasta_in.gc.gz", "$gc_file";
+ symlink "$fasta_in.gc.gz.tbi", "$gc_file.tbi";
+}
+else {
+ my $fa2gc = File::Spec->catfile($scriptdir, 'fa2gc');
+ system_call("$fa2gc -w $ave_insert $ref | $bgzip -c > $gc_file");
+ system_call("$tabix -f -b 2 -e 2 $gc_file");
+}
+
+
+my %ref_lengths;
+my @ref_seqs; # we need them in order as well as hashed with name->length
+open F, "$ref.fai" or die $!;
+while (<F>) {
+ chomp;
+ my ($chrom, $length) = split;
+ $ref_lengths{$chrom} = $length;
+ push @ref_seqs, $chrom;
+}
+close F;
+
+# work out how far to go into BAM file when getting a sample of fragment coverage.
+# This depends on gaps in the reference
+my %regions_to_sample;
+my @gaps;
+my $sampled_bases = 0;
+my $current_id = "";
+my $ref_seqs_index = 0;
+my $last_pos_for_frag_sample = 0;
+my $frag_sample_bases = 0; # number of bases to use when sampling fragment coverage
+
+while ($sampled_bases < $bases_to_sample and $ref_seqs_index <= $#ref_seqs) {
+ # skip sequences that are too short
+ if ($ref_lengths{$ref_seqs[$ref_seqs_index]} < 3 * $ave_insert) {
+ $frag_sample_bases += $ref_lengths{$ref_seqs[$ref_seqs_index]};
+ $ref_seqs_index++;
+ next;
+ }
+
+ @gaps = ();
+ push @gaps, [0, $ave_insert];
+
+ # get gaps for current ref seq
+ open F, "$tabix $gaps_file $ref_seqs[$ref_seqs_index] | " or die $!;
+ while (<F>) {
+ chomp;
+ my ($chr, $start, $end) = split;
+
+ if ($gaps[-1][1] >= $start) {
+ $gaps[-1][1] = $end;
+ }
+ else {
+ push @gaps, [$start, $end];
+ }
+ }
+
+ close F;
+
+ # add fake gap at end of current seq, the length of the insert size
+ my $insert_from_end_pos = $ref_lengths{$ref_seqs[$ref_seqs_index]} - $ave_insert;
+
+ # need to first remove any gaps which are completely contained in where the
+ # new gap would be
+ while ($gaps[-1][0] >= $insert_from_end_pos) {
+ pop @gaps;
+ }
+
+ # extend the last gap
+ if ($gaps[-1][1] >= $insert_from_end_pos) {
+ $gaps[-1][1] = $ref_lengths{$ref_seqs[$ref_seqs_index]} - 1;
+ }
+ # add the final gap after the existing last gap
+ else {
+ push @gaps, [$insert_from_end_pos, $ref_lengths{$ref_seqs[$ref_seqs_index]} - 1];
+ }
+
+ # if there's only one gap then skip this sequence (the whole thing is pretty much Ns)
+ if ($#gaps == 0) {
+ $ref_seqs_index++;
+ next;
+ }
+
+ $regions_to_sample{$ref_seqs[$ref_seqs_index]} = ();
+ my $last_end = 0;
+
+ # update regions of interest for sampling
+ my $new_bases = 0;
+ for my $i (0..($#gaps - 1)){
+ my $start = $gaps[$i][1] + 1;
+ my $end = $gaps[$i+1][0] - 1;
+ my $region_length = $end - $start + 1;
+
+ # if region gets us enough sampled bases
+ if ($sampled_bases + $region_length >= $bases_to_sample) {
+ $region_length = $bases_to_sample - $sampled_bases;
+ $end = $start + $region_length - 1;
+ }
+
+ push @{$regions_to_sample{$ref_seqs[$ref_seqs_index]}}, [$start, $end];
+ $sampled_bases += $region_length;
+ $frag_sample_bases += $gaps[$i+1][0] - $gaps[$i][0] + 1;
+
+ if ($sampled_bases >= $bases_to_sample) {
+ last;
+ }
+ elsif ($i == $#gaps - 1) {
+ $frag_sample_bases += $gaps[$i+1][1] - $gaps[$i+1][0] + 1;
+ }
+ }
+
+ $ref_seqs_index++;
+}
+
+$frag_sample_bases += 1000; # just for paranoia with off by one errors
+
+# get fragment coverage for a sample of the genome
+my $bam2fragCov = File::Spec->catfile($scriptdir, 'bam2fragCov');
+system_call("$bam2fragCov -s $frag_sample_bases $bam $insert_stats{pc1} $insert_stats{pc99} | $bgzip -c > $frag_cov_file");
+system_call("$tabix -f -b 2 -e 2 $frag_cov_file");
+
+# now get the GC vs coverage data, and also work out the mean fragment coverage
+my $frag_cov_total = 0;
+my $frag_cov_base_count = 0;
+open F, ">$gc_vs_cov_data_file" or die $!;
+
+for my $chr (keys %regions_to_sample) {
+ for my $ar (@{$regions_to_sample{$chr}}) {
+ my ($start, $end) = @{$ar};
+ my $region = "$chr:$start-$end";
+ print "$ERROR_PREFIX sampling region $region. Already sampled $frag_cov_base_count bases\n";
+ open GC, "$tabix $gc_file $region |" or die $!;
+ open COV, "$tabix $frag_cov_file $region |" or die $!;
+
+ while (my $gc_line = <GC>) {
+ my $cov_line = <COV>;
+ # might have been no coverage of this in the bam file, in which case there will be nothing in
+ # the sample coverage file, so skip it
+ unless ($cov_line) {
+ print STDERR "$ERROR_PREFIX No coverage in $region. Skipping\n";
+ last;
+ }
+ chomp $cov_line or die "$ERROR_PREFIX Error reading sample coverage file, opened with: tabix $frag_cov_file $region";
+ chomp $gc_line or die "$ERROR_PREFIX Error reading sample GC file, opened with: tabix $gc_file $region";
+ my (undef, undef, $cov) = split /\t/, $cov_line;
+ my (undef, undef, $gc) = split /\t/, $gc_line;
+ print F "$gc\t$cov\n";
+ $frag_cov_total += $cov;
+ $frag_cov_base_count++;
+ }
+
+ close GC;
+ close COV;
+ }
+}
+
+close F;
+
+if ($frag_cov_base_count == 0) {
+ print STDERR qq{$ERROR_PREFIX Error sampling from files '$frag_cov_file', '$gc_file'
+Most likely causes are:
+1. A character in the assembly sequence names that
+ broke tabix, such as :,| or -. You can check for this by running
+ reapr facheck
+2. A mismatch of names in the input BAM and assembly fasta files.
+ A common cause is trailing whitespace in the fasta file, or
+ everything after the first whitesace character in a name being
+ removed by the mapper, so the name is different in the BAM file.
+3. There is not enough fragment coverage because the assembly is
+ too fragmented. You may want to compare your mean contig length with
+ the insert size of the reads. Also have a look at this plot of the
+ insert size distribution:
+ 00.Sample/insert.in.pdf
+};
+ exit(1);
+}
+
+open F, ">>$insert_prefix.stats.txt" or die $!;
+print F "inner_mean_cov\t" . ($frag_cov_total / $frag_cov_base_count) . "\n";
+close F;
+
+if ($frag_cov_total == 0) {
+ print STDERR "$ERROR_PREFIX Something went wrong sampling fragment coverage - didn't get any coverage.\n";
+ exit(1);
+}
+
+
+if ($sampled_bases == 0) {
+ print STDERR "$ERROR_PREFIX Something went wrong sampling bases for GC/coverage estimation\n";
+ exit(1);
+}
+else {
+ print "$ERROR_PREFIX Sampled $frag_cov_base_count bases for GC/coverage estimation\n";
+}
+
+# Do some R stuff: plot cov vs GC, run lowess and make a file
+# of the lowess numbers
+open F, ">$r_script" or die $!;
+print F qq/data=read.csv(file="$gc_vs_cov_data_file", colClasses=c("numeric", "integer"), header=F, sep="\t", comment.char="")
+l=lowess(data)
+data_out=unique(data.frame(l\$x,l\$y))
+write(t(data_out), sep="\t", ncolumns=2, file="$lowess_prefix.dat.tmp")
+pdf("$lowess_prefix.pdf")
+ smoothScatter(data, xlab="GC", ylab="Coverage")
+ lines(data_out)
+dev.off()
+/;
+
+close F;
+
+system_call("R CMD BATCH $r_script " . $r_script . "out");
+
+# We really want a value for each GC value in 0..100, so interpolate
+# the values made by R.
+# Any not found values will be set to -1
+open F, "$lowess_prefix.dat.tmp" or die $!;
+
+
+my @gc2cov = (-1) x 101;
+
+while (<F>) {
+ chomp;
+ my ($gc, $cov) = split;
+ next if $cov < 0;
+ $gc2cov[$gc] = $cov;
+}
+
+close F;
+
+unlink "$lowess_prefix.dat.tmp" or die $!;
+
+# interpolate any missing values
+my $first_known = 0;
+my $last_known = 100;
+while ($gc2cov[$first_known] == -1) {$first_known++}
+while ($gc2cov[$last_known] == -1) {$last_known--}
+
+for my $i ($first_known..$last_known) {
+ if ($gc2cov[$i] == -1) {
+ my $left = $i;
+ while ($gc2cov[$left] == -1) {$left--}
+ my $right = $i;
+ while ($gc2cov[$right] == -1) {$right++}
+
+ for my $j ($left + 1 .. $right - 1) {
+ $gc2cov[$j] = $gc2cov[$left] + ($gc2cov[$right] - $gc2cov[$left]) * ($j - $left) / ($right - $left);
+ }
+ }
+}
+
+# linearly extrapolate the missing values at the start, using first two
+# values that we do have for gc vs coverage, but we don't want negative values.
+if ($gc2cov[0] == -1) {
+ my $i = 0;
+ while ($gc2cov[$i] == -1) {$i++};
+ die "Error in getting GC vs coverage. Not enough data?" if ($i > 99 or $gc2cov[$i + 1] == -1);
+
+ my $diff = $gc2cov[$i + 1] - $gc2cov[$i];
+ $i--;
+ while ($i >= 0) {
+ $gc2cov[$i] = $gc2cov[$i+1] - $diff < 0 ? $gc2cov[$i+1] : $gc2cov[$i+1] - $diff;
+ $i--;
+ }
+}
+
+# linearly extrapolate the missing values at the end, using last two
+# values that we do have for gc vs coverage. but we don't want negative values
+if ($gc2cov[-1] == -1) {
+ my $i = 100;
+ while ($i > 0 and $gc2cov[$i] == -1) {
+ $i--;
+ }
+
+ my $diff = $gc2cov[$i-1] - $gc2cov[$i];
+ $i++;
+ while ($i <= 100) {
+ $gc2cov[$i] = $gc2cov[$i-1] - $diff < 0 ? $gc2cov[$i-1] : $gc2cov[$i-1] - $diff;
+ $i++;
+ }
+}
+
+
+# write the gc -> coverage to a file
+open F, ">$lowess_prefix.dat" or die $!;
+
+for my $i (0..100){
+ print F "$i\t$gc2cov[$i]\n";
+}
+
+close F;
+
+
+sub system_call {
+ my $cmd = shift;
+ if (system($cmd)) {
+ print STDERR "$ERROR_PREFIX Error in system call:\n$cmd\n";
+ exit(1);
+ }
+}
+
+
+sub get_mean {
+ my $mode = shift;
+ my $mean = shift;
+ my $sd = shift;
+ my $fname_prefix = shift;
+
+ my $ave = -1;
+
+ if (abs($mode - $mean) > $sd) {
+ print STDERR "$ERROR_PREFIX Warning: mode insert size $mode is > a standard deviation from the mean $mean\n";
+ print STDERR "$ERROR_PREFIX ... looking for new mode nearer to the mean ...\n";
+ my $max_count = -1;
+ my $range_min = $mean - $sd;
+ my $range_max = $mean + $sd;
+
+ open F, "$fname_prefix.R" or die "$ERROR_PREFIX Error opening file '$fname_prefix.R'";
+ my $xvals = <F>;
+ my $yvals = <F>;
+ close F;
+
+ $xvals = substr($xvals, 6, length($xvals) - 8);
+ print "$xvals\n";
+ my @x_r_vector = split(/,/, $xvals);
+ $yvals = substr($yvals, 6, length($yvals) - 8);
+ my @y_r_vector = split(/,/, $yvals);
+
+ for my $i (0..$#x_r_vector) {
+ my $isize = $x_r_vector[$i];
+ my $count = $y_r_vector[$i];
+
+ if ($range_min <= $isize and $isize <= $range_max and $count > $max_count) {
+ $max_count = $count;
+ $ave = $isize;
+ }
+ }
+
+ if ($ave == -1) {
+ print STDERR "$ERROR_PREFIX ... error getting new mode. Cannot continue.\n",
+ "$ERROR_PREFIX You might want to have a look at the insert plot $fname_prefix.pdf\n";
+ exit(1);
+ }
+ else {
+ print STDERR "$ERROR_PREFIX ... got new mode $ave\n",
+ "$ERROR_PREFIX ... you might want to sanity check this by inspecting the insert plot $fname_prefix.pdf\n";
+ }
+ }
+ else {
+ $ave = $mode;
+ }
+ return $ave;
+}
diff --git a/src/task_score.cpp b/src/task_score.cpp
new file mode 100644
index 0000000..12dce9f
--- /dev/null
+++ b/src/task_score.cpp
@@ -0,0 +1,1148 @@
+#include <stdio.h>
+#include <math.h>
+#include <stdlib.h>
+#include <iostream>
+#include <cstring>
+#include <fstream>
+#include <string>
+#include <vector>
+#include <sstream>
+#include <map>
+#include <list>
+#include <algorithm>
+#include <cmath>
+#include <set>
+
+#include "utils.h"
+#include "errorWindow.h"
+#include "histogram.h"
+#include "api/BamMultiReader.h"
+#include "api/BamReader.h"
+#include "tabix/tabix.hpp"
+
+using namespace BamTools;
+using namespace std;
+
+const string ERROR_PREFIX = "[REAPR score] ";
+const string TOOL_NAME = "REAPR";
+const short CHR = 0;
+const short POS = 1;
+const short PERFECT_COV = 2;
+const short READ_F = 3;
+const short READ_PROP_F = 4;
+const short READ_ORPHAN_F = 5;
+const short READ_ISIZE_F = 6;
+const short READ_BADORIENT_F = 7;
+const short READ_R = 8;
+const short READ_PROP_R = 9;
+const short READ_ORPHAN_R = 10;
+const short READ_ISIZE_R = 11;
+const short READ_BADORIENT_R = 12;
+const short FRAG_COV = 13;
+const short FRAG_COV_CORRECT = 14;
+const short FCD_MEAN = 15;
+const short CLIP_FL = 16;
+const short CLIP_RL = 17;
+const short CLIP_FR = 18;
+const short CLIP_RR = 19;
+const short FCD_ERR = 20;
+const short CLIP_FAIL = 42;
+const short GAP = 43;
+const short READ_COV = 44;
+
+
+
+struct CmdLineOptions
+{
+ string bamInfile;
+ string gapsInfile;
+ string globalStatsInfile; // the one made by 'stats'
+ string outprefix;
+ string statsInfile;
+ int64_t minInsert; // to count as a proper read pair
+ int64_t maxInsert; // to count as a proper read pair
+ unsigned long fragMin; // minimum inner fragment coverage
+ unsigned long windowLength;
+ unsigned long readCovWinLength;
+ unsigned long usePerfect; // min perfect coverage (if being used)
+ bool perfectWins;
+ double readRatioMax;
+ unsigned long minReadCov;
+ double windowPercent;
+ double clipCutoff; // used to call pileup of soft-clipping errors
+ unsigned long maxGap; // max gap length to call errors over
+ unsigned long outerInsertSize; // ave outer fragment size. Get from stats file made by stats (which got it from preprocess
+ // stats file)
+ ofstream ofs_breaks;
+ unsigned long minMapQuality; // ignore reads with mapping qualiy less than this
+ float minReportScore; // cutoff for reporting high score regions
+ unsigned long minScoreReportLength;
+ float maxFragCorrectCov; // cutoff in relative error of fragment coverage for repeat calling
+ unsigned long minRepeatLength; // min repeat length to report
+ short readType;
+ double fcdCutoff;
+ unsigned long fcdWindow;
+ float scoreDivider;
+ bool verbose;
+ bool debug;
+ bool callRepeats;
+};
+
+
+struct BAMdata
+{
+ BamReader bamReader;
+ SamHeader header;
+ RefVector references;
+};
+
+
+struct Link
+{
+ string id;
+ string hitId;
+ unsigned long start;
+ unsigned long end;
+ unsigned long hitStart;
+ unsigned long hitEnd;
+};
+
+
+// for sorting by start/end position
+bool compare_link (Link first, Link second)
+{
+ return first.start < second.start;
+}
+
+template <class T>
+inline string toString(const T& t)
+{
+ stringstream ss;
+ ss << t;
+ return ss.str();
+}
+
+
+struct Error
+{
+ unsigned long start;
+ unsigned long end;
+ short type;
+};
+
+
+
+string getNearbyGaps(list<Error>::iterator p, list<pair<unsigned long, unsigned long > >& gaps, list<pair<unsigned long, unsigned long > >::iterator gapIter);
+
+
+// use to sort by start position
+bool compareErrors(const Error& e, const Error& f);
+
+// deals with command line options: fills the options struct
+void parseOptions(int argc, char** argv, CmdLineOptions& ops, map<short, ErrorWindow>& windows);
+
+void updateErrorList(list<Error>& l, Error& e);
+
+void scoreAndFindBreaks(CmdLineOptions& ops, map<short, list<Error> >& errors_map, list<pair<unsigned long, unsigned long > > &gaps, unsigned long seqLength, string& seqName, vector<float>& scores, vector<bool>& perfectCov, BAMdata& bamData);
+
+void updateScoreHist(map<float, unsigned long>& hist, vector<float>& scores);
+
+
+void bam2possibleLink(CmdLineOptions& ops, string& refID, unsigned long start, unsigned long end, string& hitName, unsigned long& hitStart, unsigned long& hitEnd, BAMdata& bamData);
+
+double region2meanScore(CmdLineOptions& ops, string& seqID, unsigned long start, unsigned long end, short column);
+
+
+
+int main(int argc, char* argv[])
+{
+ map<short, ErrorWindow> windows;
+ CmdLineOptions options;
+ parseOptions(argc, argv, options, windows);
+ map<string, list<pair<unsigned long, unsigned long> > > globalGaps;
+ map<short, list<Error> > errors;
+ loadGaps(options.gapsInfile, globalGaps);
+ string line;
+ string currentRefID = "";
+ string fout_breaks(options.outprefix + ".errors.gff");
+ map<float, unsigned long> scoreHist;
+
+ options.ofs_breaks.open(fout_breaks.c_str());
+
+ if (!options.ofs_breaks.good())
+ {
+ cerr << ERROR_PREFIX << "Error opening '" << fout_breaks << "'" << endl;
+ exit(1);
+ }
+
+ unsigned long lastPos = 0;
+ vector<float> scores;
+ vector<bool> perfectCov;
+ BAMdata bamData;
+ Tabix ti(options.statsInfile);
+
+ // open bam file ready for when we look for links to other regions
+ if (!bamData.bamReader.Open(options.bamInfile))
+ {
+ cerr << ERROR_PREFIX << "Error opening bam file " << options.bamInfile << endl;
+ exit(1);
+ }
+
+ if (!bamData.bamReader.LocateIndex())
+ {
+ cerr << ERROR_PREFIX << "Couldn't find index for bam file '" << options.bamInfile << "'!" << endl;
+ exit(1);
+ }
+
+ if (!bamData.bamReader.HasIndex())
+ {
+ cerr << ERROR_PREFIX << "No index for bam file '" << options.bamInfile << "'!" << endl;
+ exit(1);
+ }
+
+ bamData.header = bamData.bamReader.GetHeader();
+ bamData.references = bamData.bamReader.GetReferenceData();
+
+ if (bamData.header.Sequences.Size() == 0)
+ {
+ cerr << ERROR_PREFIX << "Error reading header of BAM file. Didn't find any sequences" << endl;
+ return(1);
+ }
+
+ while (ti.getNextLine(line))
+ {
+ if (line[0] == '#') continue;
+
+ float currentScore;
+ vector<string> data;
+ string tmp;
+ split(line, '\t', data);
+
+ if (options.verbose && lastPos % 100000 == 0)
+ {
+ cerr << ERROR_PREFIX << "progress" << '\t' << data[CHR] << '\t' << lastPos << endl;
+ }
+
+ if (data[CHR].compare(currentRefID))
+ {
+ if (currentRefID.size() != 0)
+ {
+ // just got to new ref ID, so need to print out stuff from last one
+ scoreAndFindBreaks(options, errors, globalGaps[currentRefID], lastPos, currentRefID, scores, perfectCov, bamData);
+ updateScoreHist(scoreHist, scores);
+ }
+
+ currentRefID = data[CHR];
+ map<string, list<pair<unsigned long, unsigned long> > >::iterator p = globalGaps.find(currentRefID);
+ errors.clear();
+ scores.clear();
+ perfectCov.clear();
+
+ for (map<short, ErrorWindow>::iterator p = windows.begin(); p != windows.end(); p++)
+ {
+ p->second.clear(atoi(data[POS].c_str()));
+ }
+ }
+
+ // update perfect cov
+ if (options.usePerfect)
+ {
+ if (atoi(data[PERFECT_COV].c_str()) > 0)
+ {
+ perfectCov.push_back(true);
+ }
+ else
+ {
+ perfectCov.push_back(false);
+ }
+ }
+
+
+ // update the windows
+ for (map<short, ErrorWindow>::iterator p = windows.begin(); p != windows.end(); p++)
+ {
+ if (p->second.fail())
+ {
+ Error tmp;
+ tmp.start = p->second.start();
+ tmp.end = p->second.end();
+ tmp.type = p->first;
+ updateErrorList(errors[p->first], tmp);
+ }
+
+ if (p->first == READ_COV)
+ {
+ p->second.add( atoi(data[POS].c_str()), atoi(data[READ_F].c_str()) + atoi(data[READ_R].c_str()) );
+ }
+ else
+ {
+ p->second.add(atoi(data[POS].c_str()), atof(data[p->first].c_str()) );
+ }
+ }
+
+ // update the score
+ currentScore = (options.usePerfect && windows[PERFECT_COV].lastFail()) ? 1 : 0;
+ currentScore += windows[FCD_ERR].lastFail() ? 1 : 0;
+
+ if (!options.perfectWins || currentScore)
+ {
+ currentScore += windows[READ_F].lastFail() ? 0.5 : 0;
+ currentScore += windows[READ_R].lastFail() ? 0.5 : 0;
+ currentScore += windows[READ_PROP_F].lastFail() ? 0.5 : 0;
+ currentScore += windows[READ_PROP_R].lastFail() ? 0.5 : 0;
+ //currentScore += windows[FCD_ERR].lastFail() ? 1 : 0;
+ }
+
+ // Even if we have perfect coverage, could still have a collapsed repeat
+ if (options.callRepeats)
+ {
+ currentScore += windows[FRAG_COV_CORRECT].lastFail() ? 1 : 0;
+ }
+
+ // too much soft clipping?
+ unsigned long depthFwd = atoi(data[READ_F].c_str());
+ unsigned long depthRev = atoi(data[READ_R].c_str());
+ bool fl = depthFwd && atof(data[CLIP_FL].c_str()) / depthFwd >= options.clipCutoff;
+ bool fr = depthFwd && atof(data[CLIP_FR].c_str()) / depthFwd >= options.clipCutoff;
+ bool rl = depthRev && atof(data[CLIP_RL].c_str()) / depthRev >= options.clipCutoff;
+ bool rr = depthRev && atof(data[CLIP_RR].c_str()) / depthRev >= options.clipCutoff;
+
+ if ((fl && rl) || (fr && rr))
+ {
+ Error err;
+ err.start = err.end = atoi(data[POS].c_str());
+ err.type = CLIP_FAIL;
+ updateErrorList(errors[CLIP_FAIL], err);
+ if (!options.perfectWins || (options.usePerfect && windows[PERFECT_COV].lastFail())) currentScore++;
+ }
+
+ scores.push_back(1.0 * currentScore / options.scoreDivider);
+ lastPos = atoi(data[POS].c_str());
+ }
+
+ // sort out the final chromosome from the input stats
+ scoreAndFindBreaks(options, errors, globalGaps[currentRefID], lastPos, currentRefID, scores, perfectCov, bamData);
+ updateScoreHist(scoreHist, scores);
+ options.ofs_breaks.close();
+ string scoreHistFile(options.outprefix + ".score_histogram.dat");
+ ofstream ofsScore(scoreHistFile.c_str());
+ if (!ofsScore.good())
+ {
+ cerr << ERROR_PREFIX << "error opening file '" << scoreHistFile << "'" << endl;
+ exit(1);
+ }
+
+ for (map<float, unsigned long>::iterator i = scoreHist.begin(); i != scoreHist.end(); i++)
+ {
+ ofsScore << i->first << '\t' << i->second << endl;
+ }
+
+ ofsScore.close();
+
+ return 0;
+}
+
+
+
+void parseOptions(int argc, char** argv, CmdLineOptions& ops, map<short, ErrorWindow>& windows)
+{
+ string usage;
+ short requiredArgs = 5;
+ int i;
+ usage = "\
+where 'stats prefix' is the output prefix used when stats was run\n\n\
+Options:\n\n\
+-f <int>\n\tMinimum inner fragment coverage [1]\n\
+-g <int>\n\tMax gap length to call over [0.5 * outer_mean_insert_size]\n\
+-l <int>\n\tLength of window [100]\n\
+-p <int>\n\tUse perfect mapping reads score with given min coverage.\n\
+\tIncompatible with -P.\n\
+-P <int>\n\tSame as -p, but force the score to be zero at any position with\n\
+\tat least the given coverage of perfect mapping reads and which has an\n\
+\tOK insert plot, , i.e. perfect mapping reads + insert distribution\n\
+\toverride all other tests when calculating the score.\n\
+\tIncompatible with -p.\n\
+-q <float>\n\tMax bad read ratio [0.33]\n\
+-r <int>\n\tMin read coverage [max(1, mean_read_cov - 4 * read_cov_stddev)]\n\
+-R <float>\n\tRepeat calling cutoff. -R N means call a repeat if fragment\n\
+\tcoverage is >= N * (expected coverage).\n\
+\tUse -R 0 to not call repeats [2]\n\
+-s <int>\n\tMin score to report in errors file [0.4]\n\
+-u <int>\n\tFCD error window length for error calling [insert_size / 2]\n\
+-w <float>\n\tMin \% of bases in window needed to call as bad [0.8]\n\n\
+";
+
+ if (argc == 2 && strcmp(argv[1], "--wrapperhelp") == 0)
+ {
+ usage = "[options] <assembly.fa.gaps.gz> <in.bam> <stats prefix> <FCD cutoff> <prefix of output files>\n\n" + usage;
+ cerr << usage << endl;
+ exit(1);
+ }
+ else if (argc < requiredArgs)
+ {
+ usage = "[options] <assembly.fa.gaps.gz> <in.bam> <stats prefix> <FCD cutoff> <prefix of output files> | bgzip -c > out.scores.gz\n\n" + usage;
+ cerr << "usage: task_score " << usage;
+ exit(1);
+ }
+
+ string statsPrefix = argv[argc-3];
+ ops.globalStatsInfile = statsPrefix + ".global_stats.txt";
+ ifstream ifs(ops.globalStatsInfile.c_str());
+ if (!ifs.good())
+ {
+ cerr << ERROR_PREFIX << "error opening file '" << ops.globalStatsInfile << "'" << endl;
+ exit(1);
+ }
+
+ string line;
+ double readCovMean = -1;
+ double readCovSd = -1;
+ double fragCovMean = -1;
+ double fragCovSd = -1;
+ long usePerfect = -1;
+ bool perfectWins = false;
+
+ while (getline(ifs, line))
+ {
+ vector<string> v;
+ split(line, '\t', v);
+
+ if (v[0].compare("read_cov_mean") == 0)
+ {
+ readCovMean = atof(v[1].c_str());
+ }
+ else if (v[0].compare("read_cov_sd") == 0)
+ {
+ readCovSd = atof(v[1].c_str());
+ }
+ else if (v[0].compare("fragment_cov_mean") == 0)
+ {
+ fragCovMean = atof(v[1].c_str());
+ }
+ else if (v[0].compare("fragment_cov_sd") == 0)
+ {
+ fragCovSd = atof(v[1].c_str());
+ }
+ else if (v[0].compare("fragment_length_min") == 0)
+ {
+ ops.minInsert = atoi(v[1].c_str());
+ }
+ else if (v[0].compare("fragment_length_max") == 0)
+ {
+ ops.maxInsert = atoi(v[1].c_str());
+ }
+ else if (v[0].compare("use_perfect") == 0)
+ {
+ usePerfect = atoi(v[1].c_str()) == 1 ? 5 : 0;
+ perfectWins = usePerfect == 0 ? false : true;
+ }
+ else if (v[0].compare("sample_ave_fragment_length") == 0)
+ {
+ ops.outerInsertSize = atoi(v[1].c_str());
+ }
+ }
+
+ if (readCovMean == -1 || readCovSd == -1 || fragCovMean == -1 || fragCovSd == -1 || usePerfect == -1)
+ {
+ cerr << ERROR_PREFIX << "Error getting stats from '" << ops.globalStatsInfile << "'" << endl;
+ exit(1);
+ }
+
+ ifs.close();
+
+ // set defaults
+ ops.clipCutoff = 0.5;
+ ops.fragMin = 1;
+ ops.minReadCov = readCovMean > 4 * readCovSd ? readCovMean - 4 * readCovSd : 1;
+ ops.minReportScore = 0.5;
+ ops.minScoreReportLength = 10;
+ ops.maxGap = 0;
+ ops.perfectWins = false;
+ ops.readRatioMax = 0.5;
+ ops.usePerfect = 0;
+ ops.windowLength = 0;
+ ops.windowPercent = 0.8;
+ ops.verbose = false;
+ ops.fcdWindow = 0;
+ ops.callRepeats = true;
+ ops.debug = false;
+ ops.maxFragCorrectCov = 2;
+
+ for (i = 1; i < argc - requiredArgs; i++)
+ {
+ // deal with booleans
+ if (strcmp(argv[i], "-d") == 0)
+ {
+ ops.debug = true;
+ ops.verbose =true;
+ continue;
+ }
+ if (strcmp(argv[i], "-v") == 0)
+ {
+ ops.verbose = true;
+ continue;
+ }
+
+ // non booleans ....
+ if (strcmp(argv[i], "-f") == 0)
+ {
+ ops.fragMin = atoi(argv[i+1]);
+ }
+ if (strcmp(argv[i], "-g") == 0)
+ {
+ ops.maxGap = atoi(argv[i+1]);
+ }
+ else if (strcmp(argv[i], "-l") == 0)
+ {
+ ops.windowLength = atoi(argv[i+1]);
+ }
+ else if (strcmp(argv[i], "-p") == 0)
+ {
+ if (ops.usePerfect)
+ {
+ cerr << ERROR_PREFIX << "Error! both -p and -P used. Cannot continue" << endl;
+ exit(1);
+ }
+ ops.usePerfect = atoi(argv[i+1]);
+ }
+ else if (strcmp(argv[i], "-P") == 0)
+ {
+ if (ops.usePerfect)
+ {
+ cerr << ERROR_PREFIX << "Error! both -p and -P used. Cannot continue" << endl;
+ exit(1);
+ }
+ ops.usePerfect = atoi(argv[i+1]);
+ ops.perfectWins = true;
+ }
+ else if (strcmp(argv[i], "-q") == 0)
+ {
+ ops.readRatioMax = atof(argv[i+1]);
+ }
+ else if (strcmp(argv[i], "-R") == 0)
+ {
+ ops.maxFragCorrectCov = atof(argv[i+1]);
+ if (ops.maxFragCorrectCov == 0) ops.callRepeats = false;
+ }
+ else if (strcmp(argv[i], "-r") == 0)
+ {
+ ops.minReadCov = atoi(argv[i+1]);
+ }
+ else if (strcmp(argv[i], "-s") == 0)
+ {
+ ops.minReportScore = atof(argv[i+1]);
+ }
+ else if (strcmp(argv[i], "-u") == 0)
+ {
+ ops.fcdWindow = atoi(argv[i+1]);
+ }
+ else if (strcmp(argv[i], "-w") == 0)
+ {
+ ops.windowPercent = atof(argv[i+1]);
+ }
+ else
+ {
+ cerr << ERROR_PREFIX << "Error! Switch not recognised: " << argv[i] << endl;
+ exit(1);
+ }
+ i++;
+ }
+
+ if (argc - i != requiredArgs || argv[i+1][0] == '-')
+ {
+ cerr << usage;
+ exit(1);
+ }
+
+ ops.gapsInfile = argv[i];
+ ops.bamInfile = argv[i+1];
+ ops.fcdCutoff = atof(argv[i+3]);
+ ops.outprefix = argv[i+4];
+ ops.statsInfile = statsPrefix + ".per_base.gz";
+
+ // If user didn't specify whether or not to use perfect reads, use the global stats file to decide
+ if (ops.usePerfect == 0 && perfectWins)
+ {
+ ops.usePerfect = usePerfect;
+ ops.perfectWins = true;
+ }
+
+ //if (!ops.maxGap) ops.maxGap = ops.outerInsertSize * 8 / 10;
+ if (!ops.maxGap) ops.maxGap = ops.outerInsertSize / 2;
+ if (!ops.fcdWindow) ops.fcdWindow = ops.outerInsertSize > 1000 ? ops.outerInsertSize / 2 : ops.outerInsertSize;
+ if (!ops.windowLength) ops.windowLength = 100;
+ ops.readCovWinLength = 5;
+ ops.minMapQuality = 0;
+ ops.minRepeatLength = ops.windowLength;
+ ops.scoreDivider = ops.usePerfect ? 7.0 : 6.0;
+
+ // set up the error windows
+ if (ops.usePerfect)
+ {
+ windows[PERFECT_COV] = ErrorWindow(1, ops.usePerfect, 0, 30, 1, true, false);
+ }
+
+ windows[READ_F] = ErrorWindow(1, max(ops.minReadCov / 2, (unsigned long) 1), 0, ops.readCovWinLength, ops.windowPercent, true, false);
+ windows[READ_PROP_F] = ErrorWindow(1, 0.66, 0, ops.windowLength, ops.windowPercent, true, false);
+ windows[READ_ORPHAN_F] = ErrorWindow(1, 0, ops.readRatioMax, ops.windowLength, ops.windowPercent, false, true);
+ windows[READ_ISIZE_F] = ErrorWindow(1, 0, ops.readRatioMax, ops.windowLength, ops.windowPercent, false, true);
+ windows[READ_BADORIENT_F] = ErrorWindow(1, 0, ops.readRatioMax, ops.windowLength, ops.windowPercent, false, true);
+ windows[READ_R] = ErrorWindow(1, max(ops.minReadCov / 2, (unsigned long) 1), 0, ops.readCovWinLength, ops.windowPercent, true, false);
+ windows[READ_PROP_R] = ErrorWindow(1, 0.66, 0, ops.windowLength, ops.windowPercent, true, false);
+ windows[READ_ORPHAN_R] = ErrorWindow(1, 0, ops.readRatioMax, ops.windowLength, ops.windowPercent, false, true);
+ windows[READ_ISIZE_R] = ErrorWindow(1, 0, ops.readRatioMax, ops.windowLength, ops.windowPercent, false, true);
+ windows[READ_BADORIENT_R] = ErrorWindow(1, 0, ops.readRatioMax, ops.windowLength, ops.windowPercent, false, true);
+ windows[FRAG_COV] = ErrorWindow(1, ops.fragMin, 0, 1, 1, true, false);
+ windows[READ_COV] = ErrorWindow(1, ops.minReadCov, 0, ops.windowLength, ops.windowPercent, true, false);
+ if (ops.callRepeats)
+ {
+ windows[FRAG_COV_CORRECT] = ErrorWindow(1, 0, ops.maxFragCorrectCov, ops.minRepeatLength, 0.95, false, true);
+ }
+ else
+ {
+ ops.scoreDivider--;
+ }
+ windows[FCD_ERR] = ErrorWindow(1, 0, ops.fcdCutoff, ops.fcdWindow, ops.windowPercent, false, true);
+
+ string parametersFile = ops.outprefix + ".parameters.txt";
+ ofstream ofs(parametersFile.c_str());
+ if (!ofs.good())
+ {
+ cerr << ERROR_PREFIX << "Error opening file '" << parametersFile << "'" << endl;
+ exit(1);
+ }
+
+ // write the parameters to stderr
+ ofs << "fragment_length_min\t" << ops.minInsert << endl
+ << "fragment_length_ave\t" << ops.outerInsertSize << endl
+ << "fragment_length_max\t" << ops.maxInsert << endl
+ << "min_inner_fragment_coverage\t" << ops.fragMin << endl
+ << "max_inner_fragment_coverage_relative_error\t" << (ops.callRepeats ? ops.maxFragCorrectCov : -1) << endl
+ << "window_length\t" << ops.windowLength << endl
+ << "window_percent\t" << ops.windowPercent << endl
+ << "read_coverage_window\t" << ops.readCovWinLength << endl
+ << "use_perfect\t" << ops.usePerfect << endl
+ << "perfect_wins\t" << ops.perfectWins << endl
+ << "read_ratio_max\t" << ops.readRatioMax << endl
+ << "min_read_coverage\t" << ops.minReadCov << endl
+ << "clip_cutoff\t" << ops.clipCutoff << endl
+ << "max_callable_gap_length\t" << ops.maxGap << endl
+ << "FCD_err_cutoff\t" << ops.fcdCutoff << endl
+ << "FCD_err_window\t" << ops.fcdWindow << endl
+ << "min_map_qual\t" << ops.minMapQuality << endl
+ << "min_report_score\t" << ops.minReportScore << endl;
+
+ ofs.close();
+}
+
+
+void updateErrorList(list<Error>& l, Error& e)
+{
+ if (l.empty())
+ {
+ l.push_back(e);
+ }
+ else
+ {
+ if (e.start - 1 <= l.back().end)
+ {
+ l.back().end = e.end;
+ }
+ else
+ {
+ l.push_back(e);
+ }
+ }
+}
+
+
// Convert the per-base error lists for one sequence into GFF records and
// per-base scores.
//
// Parameters:
//   ops        - global options; GFF is written to ops.ofs_breaks.
//   errors_map - error-type code -> sorted list of error intervals found by
//                the windowing stage; entries are erased as they are reported.
//   gaps       - (start, end) gap coordinates for this sequence; assumed
//                sorted by start (iterators below only ever move forward).
//   seqLength  - length of the sequence.
//   seqName    - name of the sequence (first GFF column).
//   scores     - per-base scores; positions under gaps are overwritten with -1.
//   perfectCov - NOTE(review): not referenced anywhere in this function;
//                presumably kept for interface compatibility -- confirm.
//   bamData    - open BAM reader, used to chase mate positions for link calls.
//
// GFF lines are accumulated in a map keyed by start position so they are
// emitted in coordinate order; scores go to stdout.
void scoreAndFindBreaks(CmdLineOptions& ops, map<short, list<Error> >& errors_map, list<pair<unsigned long, unsigned long > > &gaps, unsigned long seqLength, string& seqName, vector<float>& scores, vector<bool>& perfectCov, BAMdata& bamData)
{
    if (ops.verbose)
    {
        cerr << ERROR_PREFIX << "scoring and error calling on sequence " << seqName << endl;
    }

    unsigned long scoreIndex = 0;
    map<unsigned long, string> gff_out; // start position -> gff line

    // Pull out all the regions of high fragment coverage
    if (ops.callRepeats)
    {
        for (list<Error>::iterator p = errors_map[FRAG_COV_CORRECT].begin(); p != errors_map[FRAG_COV_CORRECT].end(); p++)
        {
            gff_out[p->start + 1] += seqName + "\t" + TOOL_NAME + "\tRepeat\t" + toString(p->start + 1) + "\t" + toString(p->end + 1) + "\t" + toString(region2meanScore(ops, seqName, p->start, p->end, FRAG_COV_CORRECT)) + "\t.\t.\tNote=Warning: Collapsed repeat;colour=6\n";
        }

        errors_map.erase(FRAG_COV_CORRECT);
    }

    // Pull out all the soft clipping errors, but don't count those next to a gap
    // (clipping is expected at gap edges), nor within 10bp of a contig end.
    list<pair<unsigned long, unsigned long > >::iterator gapsIter = gaps.begin();
    unsigned long gapLimit = 5; // max distance from a gap for a clip to be ignored

    for (list<Error>::iterator clipIter = errors_map[CLIP_FAIL].begin(); clipIter != errors_map[CLIP_FAIL].end(); clipIter++)
    {
        while (gapsIter != gaps.end() && clipIter->start > gapsIter->second + gapLimit)
        {
            gapsIter++;
        }
        if (clipIter->start >= 10 && clipIter->start + 10 < seqLength && (gapsIter == gaps.end() || clipIter->start + gapLimit < gapsIter->first)) {
            gff_out[clipIter->start] += seqName + "\t" + TOOL_NAME + "\tClip\t" + toString(clipIter->start) + '\t' + toString(clipIter->end) + "\t.\t.\t.\tNote=Warning: Soft clip failure;colour=7\n";
        }
    }

    errors_map.erase(CLIP_FAIL);

    // Pull out all the low read coverage errors. Don't include gaps.
    // NOTE(review): fwdReadcovErrs is declared but never used in this function.
    vector<pair <unsigned long, unsigned long> > fwdReadcovErrs;
    gapsIter = gaps.begin();

    for (list<Error>::iterator readIter = errors_map[READ_COV].begin(); readIter != errors_map[READ_COV].end(); readIter++)
    {
        while (gapsIter != gaps.end() && readIter->start > gapsIter->second)
        {
            gapsIter++;
        }
        if (gapsIter == gaps.end() || readIter->end < gapsIter->first) {
            gff_out[readIter->start + 1] += seqName + "\t" + TOOL_NAME + "\tRead_cov\t" + toString(readIter->start + 1) + '\t' + toString(readIter->end + 1) + "\t.\t.\t.\tNote=Warning: Low read coverage;colour=8\n";
        }
    }
    errors_map.erase(READ_COV);

    // Pull out all the not-enough-perfect-coverage regions. Don't include gaps
    if (ops.usePerfect)
    {
        list<pair<unsigned long, unsigned long > >::iterator gapsIter = gaps.begin();

        for (list<Error>::iterator perfIter = errors_map[PERFECT_COV].begin(); perfIter != errors_map[PERFECT_COV].end(); perfIter++)
        {
            while (gapsIter != gaps.end() && perfIter->start > gapsIter->second)
            {
                gapsIter++;
            }
            if (gapsIter == gaps.end() || perfIter->end < gapsIter->first) {
                gff_out[perfIter->start + 1] += seqName + "\t" + TOOL_NAME + "\tPerfect_cov\t" + toString(perfIter->start + 1) + '\t' + toString(perfIter->end + 1) + "\t.\t.\t.\tNote=Warning: Low perfect unique coverage;colour=9\n";
            }
        }
        errors_map.erase(PERFECT_COV);
    }

    // can't call a score over a gap, so set them all to -1
    list<pair<unsigned long, unsigned long> >::iterator gapIter;
    for (gapIter = gaps.begin(); gapIter != gaps.end(); gapIter++)
    {
        for (unsigned long i = gapIter->first; i <= gapIter->second; i++)
        {
            scores[i] = -1;
        }
    }

    // look for run of high scores
    ErrorWindow scoreWindow(1, 0, ops.minReportScore, ops.minScoreReportLength, ops.windowPercent, false, true);
    list<Error> scoreErrors;

    // Note: the window is checked for failure *before* the current position is
    // added, i.e. each failure reflects positions up to and excluding i.
    for (unsigned int i = 0; i< scores.size(); i++)
    {
        if (scoreWindow.fail())
        {
            Error err;
            err.start = scoreWindow.start();
            err.end = scoreWindow.end();
            err.type = 1; // type is irrelevant here, we know it's a score failure
            updateErrorList(scoreErrors, err);
        }
        scoreWindow.add(i + 1, scores[i]);
    }


    // update the gff output with the bad score errors
    for (list<Error>::iterator p = scoreErrors.begin(); p != scoreErrors.end(); p++)
    {
        // not optimal, but get the max error in this window by going back to the scores vector
        float maxScore = 0;

        // NOTE(review): loop bound 'i < p->end - 1' looks like it skips the
        // final base of the window (and p->end - 1 underflows if end == 0) --
        // confirm intended range.
        for (unsigned int i = p->start - 1; i < p->end - 1; i++)
        {
            maxScore = max(scores[i], maxScore);
        }
        gff_out[p->start + 1] += seqName + '\t' + TOOL_NAME + "\tLow_score\t" + toString(p->start + 1) + '\t' + toString(p->end + 1) + '\t' + toString(1 - maxScore) + "\t.\t.\tNote=Warning: Low score;colour=10\n";
    }

    scoreWindow.clear(1);
    scoreErrors.clear();

    // Pull out all the runs of not proper pair reads, looking for links to elsewhere in the genome.
    vector<short> indexes;
    indexes.push_back(READ_ORPHAN_F);
    indexes.push_back(READ_ISIZE_F);
    indexes.push_back(READ_BADORIENT_F);
    indexes.push_back(READ_ORPHAN_R);
    indexes.push_back(READ_ISIZE_R);
    indexes.push_back(READ_BADORIENT_R);
    list<pair<unsigned long, unsigned long> > links;


    for (unsigned long i = 0; i < indexes.size(); i++)
    {
        for (list<Error>::iterator p = errors_map[indexes[i]].begin(); p != errors_map[indexes[i]].end(); p++)
        {
            links.push_back(make_pair(p->start, p->end));
        }

        errors_map.erase(indexes[i]);
    }

    // sort the links and merge the overlaps
    // NOTE(review): each interval is only merged with its immediate successor;
    // a chain of 3+ mutually overlapping intervals may not be fully merged --
    // confirm whether that matters for downstream link calling.
    links.sort();
    for (list<pair<unsigned long, unsigned long> >::iterator p = links.begin(); p != links.end(); p++)
    {
        list<pair<unsigned long, unsigned long> >::iterator nextLink = p;
        nextLink++;

        if (nextLink != links.end())
        {
            if (nextLink->first <= p->second)
            {
                p->second = nextLink->second;
                links.erase(nextLink);
            }
        }
    }

    // update the gff errors with the merge links, when we get a hit.
    // If no hit, then just report a warning of bad read orientation (as long as
    // it's not at a contig end)
    for (list<pair<unsigned long, unsigned long> >::iterator p = links.begin(); p != links.end(); p++)
    {
        unsigned long hitStart = 0;
        unsigned long hitEnd = 0;
        string hitName;

        bam2possibleLink(ops, seqName, p->first, p->second, hitName, hitStart, hitEnd, bamData);
        // ignore self-hits that lie within one max insert size of the region
        if (hitName.size() && !(hitName.compare(seqName) == 0 && p->first <= hitEnd + ops.maxInsert && hitStart <= p->second + ops.maxInsert))
        {
            gff_out[p->first + 1] += seqName + "\t" + TOOL_NAME + "\tLink\t" + toString(p->first + 1) + '\t' + toString(p->second + 1) + "\t.\t.\t.\tNote=Warning: Link " + hitName + ":" + toString(hitStart+1) + "-" + toString(hitEnd+1) + ";colour=11\n";
        }
        else if (p->first > ops.outerInsertSize && p->second + ops.outerInsertSize < seqLength)
        {
            gff_out[p->first + 1] += seqName + "\t" + TOOL_NAME + "\tRead_orientation\t" + toString(p->first + 1) + '\t' + toString(p->second + 1) + "\t.\t.\t.\tNote=Warning: Bad read orientation;colour=1\n";
        }
    }

    // we don't want to call an insert size failure next to a gap which is too long, since
    // we wouldn't expect proper fragment coverage next to a gap which can't be spanned by read pairs.
    // So, get the coords of all the flanking regions of the long gaps
    list<pair< unsigned long, unsigned long> > longGapFlanks;
    longGapFlanks.push_back(make_pair(0, ops.outerInsertSize)); // always exclude the sequence start

    for (gapIter = gaps.begin(); gapIter != gaps.end(); gapIter++)
    {
        if (gapIter->second - gapIter->first + 1 > ops.maxGap)
        {
            // add region to left of gap
            if (gapIter->first > 1)
            {
                unsigned long start = gapIter->first < ops.outerInsertSize ? 0 : gapIter->first - ops.outerInsertSize;
                unsigned long end = gapIter->first - 1;
                // NOTE(review): this branch *shrinks* the previous flank's end
                // when it already extends past this one; possibly the condition
                // was meant to compare against 'start' to merge flanks -- confirm.
                if (longGapFlanks.size() && longGapFlanks.back().second >= end)
                {
                    longGapFlanks.back().second = end;
                }
                else
                {
                    longGapFlanks.push_back(make_pair(start, end));
                }
            }

            // add region to right of gap
            unsigned long start = gapIter->second + 1;
            unsigned long end = min(gapIter->second + ops.outerInsertSize, seqLength - 1);
            longGapFlanks.push_back(make_pair(start, end));
        }
    }

    // add a fake bit to stop calls at the end of the sequence
    if (longGapFlanks.back().first + ops.outerInsertSize > seqLength)
    {
        longGapFlanks.back().first = seqLength > ops.outerInsertSize ? seqLength - ops.outerInsertSize : 0;
    }

    if (longGapFlanks.back().second + ops.outerInsertSize > seqLength)
    {
        longGapFlanks.back().second = seqLength - 1;
    }
    else if (seqLength > ops.outerInsertSize)
    {
        longGapFlanks.push_back(make_pair(seqLength - ops.outerInsertSize, seqLength - 1));
    }

    // look for fragment coverage too low failures (check if over a gap or near contig end)
    gapIter = gaps.begin();
    list<pair< unsigned long, unsigned long> >::iterator gapFlankIter = longGapFlanks.begin();
    list<pair< unsigned long, unsigned long> > lowFragCovGaps; // keep track to stop calling insert dist error over low coverage gaps
    unsigned long gapExtra = 5; // padding added around low-frag-cov gap calls

    for (list<Error>::iterator p = errors_map[FRAG_COV].begin(); p != errors_map[FRAG_COV].end(); p++)
    {
        // update iterators so that they are not to the left of the current tri area error
        while (gapIter != gaps.end() && gapIter->second < p->start) gapIter++;
        while (gapFlankIter != longGapFlanks.end() && gapFlankIter->second < p->start) gapFlankIter++;

        // check we're not at a scaffold end
        if (p->end <= ops.outerInsertSize || p->start + ops.outerInsertSize > seqLength)
        {
            continue;
        }

        // if overlaps a gap that is short enough
        if (gapIter != gaps.end() && gapIter->first <= p->end && gapIter->second - gapIter->first + 1 <= ops.maxGap)
        {
            gff_out[p->start + 1] += seqName + "\t" + TOOL_NAME + "\tFrag_cov_gap\t" + toString(p->start + 1) + "\t" + toString(p->end + 1) + "\t" + toString(region2meanScore(ops, seqName, p->start + 1, p->end + 1, FRAG_COV)) + "\t.\t.\tNote=Error: Fragment coverage too low over gap " + getNearbyGaps(p, gaps, gapIter) + ";colour=12\n";
            if (p->start < gapExtra)
            {
                lowFragCovGaps.push_back(make_pair(0, p->end + gapExtra));
            }
            else
            {
                lowFragCovGaps.push_back(make_pair(p->start - gapExtra, p->end + gapExtra));
            }
        }
        // if this error doesn't overlap a gap and is not too near a large gap
        else if ( (gapIter == gaps.end() || gapIter->first > p->end) && (gapFlankIter == longGapFlanks.end() || gapFlankIter->first > p->end) )
        {
            // NOTE(review): this line uses "color=15" where every other record
            // writes "colour=" -- likely a typo in the GFF attribute; confirm
            // what downstream plotting expects.
            gff_out[p->start + 1] += seqName + "\t" + TOOL_NAME + "\tFrag_cov\t" + toString(p->start + 1) + "\t" + toString(p->end + 1) + "\t" + toString(region2meanScore(ops, seqName, p->start + 1, p->end + 1, FRAG_COV)) + "\t.\t.\tNote=Error: Fragment coverage too low;color=15\n";
        }
    }

    // look for triangle plot failures (check if over a gap or near contig end)
    gapIter = gaps.begin();
    gapFlankIter = longGapFlanks.begin();
    list<pair< unsigned long, unsigned long> >::iterator gapLowCovIter = lowFragCovGaps.begin();

    for (list<Error>::iterator p = errors_map[FCD_ERR].begin(); p != errors_map[FCD_ERR].end(); p++)
    {
        // update iterators so that they are not to the left of the current tri area error
        while (gapIter != gaps.end() && gapIter->second < p->start) gapIter++;
        while (gapFlankIter != longGapFlanks.end() && gapFlankIter->second < p->start) gapFlankIter++;
        while (gapLowCovIter != lowFragCovGaps.end() && gapLowCovIter->second < p->start) gapLowCovIter++;

        // check if we're next to a gap that has already been called
        if (gapLowCovIter != lowFragCovGaps.end() && gapLowCovIter->first <= p->end)
        {
            continue;
        }

        // check we're not at a scaffold end
        if (p->end <= ops.outerInsertSize || p->start + ops.outerInsertSize > seqLength)
        {
            continue;
        }

        // if overlaps a gap that is short enough
        if (gapIter != gaps.end() && gapIter->first <= p->end && gapIter->second - gapIter->first + 1 <= ops.maxGap)
        {
            gff_out[p->start + 1] += seqName + "\t" + TOOL_NAME + "\tFCD_gap\t" + toString(p->start + 1) + "\t" + toString(p->end + 1) + "\t" + toString(region2meanScore(ops, seqName, p->start + 1, p->end + 1, FCD_ERR)) + "\t.\t.\tNote=Error: FCD failure over gap " + getNearbyGaps(p, gaps, gapIter) + ";colour=16\n";
        }
        // if this error doesn't overlap a gap and is not too near a large gap
        else if ( (gapIter == gaps.end() || gapIter->first > p->end) && (gapFlankIter == longGapFlanks.end() || gapFlankIter->first > p->end) )
        {
            gff_out[p->start + 1] += seqName + "\t" + TOOL_NAME + "\tFCD\t" + toString(p->start + 1) + "\t" + toString(p->end + 1) + "\t" + toString(region2meanScore(ops, seqName, p->start + 1, p->end + 1, FCD_ERR)) + "\t.\t.\tNote=Error: FCD failure;colour=17\n";
        }
    }

    // write the remaining scores to file (stdout), inverting so that higher
    // output values mean better; gap positions stay at -1
    while (scoreIndex < scores.size())
    {
        double s = scores[scoreIndex] == -1 ? -1 : 1 - scores[scoreIndex];
        cout << seqName << '\t' << scoreIndex + 1 << '\t' << s << '\n';
        scoreIndex++;
    }

    // write the gff lines (map iteration order == coordinate order)
    for(map<unsigned long, string>::iterator p = gff_out.begin(); p != gff_out.end(); p++)
    {
        ops.ofs_breaks << p->second;
    }
    ops.ofs_breaks.flush();
}
+
+
// Tally every value in scores into hist (score value -> occurrence count).
// Values not yet present in hist are value-initialised to 0 by operator[]
// before being incremented.
void updateScoreHist(map<float, unsigned long>& hist, vector<float>& scores)
{
    for (size_t idx = 0; idx < scores.size(); ++idx)
    {
        ++hist[scores[idx]];
    }
}
+
+
+bool compareErrors(const Error& e, const Error& f)
+{
+ return e.start < f.start;
+}
+
+
// Search for a region elsewhere in the genome that reads mapped in
// refID:start-end are linked to via their mates.
//
// Reads in the region whose mate is on a different chromosome, or an innie
// pair with an out-of-range insert size, vote for their mate's position in a
// coarse histogram (one histogram per target sequence). If more than half of
// those reads agree on one location, the hit is refined and returned via
// hitName/hitStart/hitEnd; otherwise hitName is left empty.
//
// Exits the program if the region cannot be set in the BAM file.
void bam2possibleLink(CmdLineOptions& ops, string& refID, unsigned long start, unsigned long end, string& hitName, unsigned long& hitStart, unsigned long& hitEnd, BAMdata& bamData)
{
    BamAlignment bamAlign;
    map<string, list<unsigned long> > hitPositions;  // target seq -> mate positions
    map<string, Histogram> histsBigBins;             // target seq -> coarse histogram of mate positions
    hitName = "";
    unsigned long readCount = 0; // reads that voted for some mate location
    unsigned long readTotal = 0; // NOTE(review): counted but never read afterwards
    // bins are half the window of interest, so a hit spans at most 3 bins
    unsigned long binWidth = (end - start + 2 * ops.maxInsert) / 2;

    // Set the region in the BAM file to be read
    if (bamData.header.Sequences.Contains(refID))
    {
        int id = bamData.bamReader.GetReferenceID(refID);
        if (!bamData.bamReader.SetRegion(id, start - 1, id, end - 1))
        {
            cerr << ERROR_PREFIX << "Error jumping to region " << refID << ":" << start << "-" << end << " in bam file" << endl;
            exit(1);
        }
    }
    else
    {
        cerr << ERROR_PREFIX << "Error. Sequence '" << refID << "' not found in bam file" << endl;
        exit(1);
    }
    // parse BAM, looking for hits in common
    while (bamData.bamReader.GetNextAlignmentCore(bamAlign))
    {
        readTotal++;
        if (!bamAlign.IsMapped() || bamAlign.IsDuplicate() || bamAlign.MapQuality < ops.minMapQuality)
        {
            continue;
        }

        short pairOrientation = getPairOrientation(bamAlign);

        // want paired reads where the mate is outside insert size range, or on a different chr
        if (pairOrientation == DIFF_CHROM // || pairOrientation == OUTTIE || pairOrientation == SAME
            || (pairOrientation == INNIE && (abs(bamAlign.InsertSize) < ops.minInsert || abs(bamAlign.InsertSize) > ops.maxInsert)))
        {
            string id = bamData.references[bamAlign.MateRefID].RefName;
            hitPositions[id].push_back(bamAlign.MatePosition);
            map<string, Histogram>::iterator p = histsBigBins.find(id);
            if (p == histsBigBins.end())
            {
                histsBigBins[id] = Histogram(binWidth);
            }
            histsBigBins[id].add(bamAlign.MatePosition, 1);
            readCount++;
        }
    }

    // work out if they mostly hit in the same place
    if (histsBigBins.size() == 0) return;

    // find the (sequence, bin) with the most votes across all histograms
    string modeID;
    unsigned long maxModeVal = 0;
    unsigned long maxMode = 0;

    for (map<string, Histogram>::iterator p = histsBigBins.begin(); p != histsBigBins.end(); p++)
    {
        unsigned long modeVal;
        unsigned long mode = p->second.mode(modeVal);

        if (modeVal > maxModeVal)
        {
            maxModeVal = modeVal;
            maxMode = mode;
            modeID = p->first;
        }
    }

    // The big bins were half the width of our window of interest, so to get all
    // the hits, we need to check the bin either side of the mode
    unsigned long totalHits = maxModeVal;
    totalHits += histsBigBins[modeID].get(maxMode - binWidth);
    totalHits += histsBigBins[modeID].get(maxMode + binWidth);

    // get more accurate position of the hits: only report when a majority
    // (> 50%) of the voting reads agree on this location
    if (1.0 * totalHits / readCount > 0.5)
    {
        unsigned long startCoord = binWidth > maxMode ? 0 : maxMode - binWidth;
        unsigned long endCoord = maxMode + binWidth;

        // first mate position at/after startCoord becomes hitStart;
        // last mate position at/before endCoord becomes hitEnd
        hitPositions[modeID].sort();
        list<unsigned long>::iterator p = hitPositions[modeID].begin();
        while (p != hitPositions[modeID].end() && *p < startCoord)
        {
            p++;
        }
        if (p == hitPositions[modeID].end()) p--;
        hitStart = *p;
        p = hitPositions[modeID].end();
        p--;
        while (p != hitPositions[modeID].begin() && *p > endCoord)
        {
            p--;
        }
        hitEnd = *p;
        hitName = modeID;
    }
}
+
+double region2meanScore(CmdLineOptions& ops, string& seqID, unsigned long start, unsigned long end, short column)
+{
+ Tabix tbx(ops.statsInfile);
+ double total = 0;
+ string line;
+ vector<string> data;
+ stringstream ss;
+ ss << seqID << ':' << start << '-' << end;
+ string region(ss.str());
+ tbx.setRegion(region);
+
+ while (tbx.getNextLine(line)) {
+ split(line, '\t', data);
+ total += atof(data[column].c_str());
+ }
+
+ return total / (end - start + 1);
+}
+
+
+string getNearbyGaps(list<Error>::iterator p, list<pair<unsigned long, unsigned long > >& gaps, list<pair<unsigned long, unsigned long > >::iterator gapIter)
+{
+ list<pair<unsigned long, unsigned long > >::iterator iter = gapIter;
+ vector<unsigned long> v;
+
+ while (iter->second >= p->start)
+ {
+ v.push_back(iter->first + 1);
+ v.push_back(iter->second + 1);
+ if (iter == gaps.begin())
+ {
+ break;
+ }
+ else
+ {
+ iter--;
+ }
+ }
+
+ iter = gapIter;
+ iter++;
+ while (iter != gaps.end() && iter->first <= p->end)
+ {
+ v.push_back(iter->first + 1);
+ v.push_back(iter->second + 1);
+ iter++;
+ }
+
+ sort(v.begin(), v.end());
+
+ stringstream ss;
+
+ for (unsigned int i = 0; i < v.size(); i+=2)
+ {
+ ss << ',' << v[i] << '-' << v[i+1];
+ }
+
+ string s = ss.str();
+ return s.size() ? s.substr(1) : "";
+}
diff --git a/src/task_seqrename.pl b/src/task_seqrename.pl
new file mode 100755
index 0000000..0603656
--- /dev/null
+++ b/src/task_seqrename.pl
@@ -0,0 +1,74 @@
#!/usr/bin/env perl
# Rename the sequences in a BAM file, using the old-name -> new-name mapping
# produced by 'reapr facheck' (the *.info file). The BAM is streamed through
# samtools: header @SQ lines and each alignment's RNAME/RNEXT columns are
# rewritten, everything else passes through unchanged.

use strict;
use warnings;
use File::Spec;
use File::Basename;
use Getopt::Long;

my ($scriptname, $scriptdir) = fileparse($0);
# use the samtools binary bundled next to this script
my $samtools = File::Spec->catfile($scriptdir, 'samtools');
my %options;
my $usage = qq/<rename file> <in.bam> <out.bam>

where <rename file> is the file *.info made by 'facheck', which
contains the mapping of old name to new name
/;

my $ERROR_PREFIX = '[REAPR seqrename]';

my $ops_ok = GetOptions(
    \%options,
    'wrapperhelp',
);

if ($options{wrapperhelp}) {
    print STDERR "$usage\n";
    exit(1);
}

if (!($ops_ok) or $#ARGV != 2) {
    print STDERR "usage:\n$scriptname $usage\n";
    exit(1);
}


my $old2new_file = $ARGV[0];
my $bam_in = $ARGV[1];
my $bam_out = $ARGV[2];

# hash the old -> new names
# NOTE(review): assumes every line of the .info file is "old<TAB>new"; a line
# without a tab leaves $new undef and would produce warnings/corrupt output --
# confirm facheck always writes two columns.
my %old2new;
open F, $old2new_file or die $!;

while (<F>) {
    chomp;
    my ($old, $new) = split /\t/;
    $old =~ s/\s+$//;
    $old2new{$old} = $new;
}
close F;
# '*' (unmapped) and '=' (same as RNAME) are SAM placeholders, map to themselves
$old2new{'*'} = '*';
$old2new{'='} = '=';

# make the new bam file: decode with samtools view -h, rewrite names,
# re-encode with samtools view -bS
open FIN, "$samtools view -h $bam_in|" or die $!;
open FOUT, "| $samtools view -bS - > $bam_out" or die $!;

while (<FIN>) {
    chomp;
    my @a =split /\t/;

    if ($a[0] =~ /^\@SQ/) {
        # header sequence line: replace the name after the "SN:" tag
        $a[1] = "SN:" . $old2new{substr($a[1], 3)};
    }
    elsif ($a[0] !~ /^\@/) {
        # alignment line: rewrite RNAME (col 3) and RNEXT (col 7)
        $a[2] = $old2new{$a[2]};
        $a[6] = $old2new{$a[6]};
    }

    print FOUT join("\t", @a), "\n";
}

close FIN;
close FOUT;
diff --git a/src/task_smaltmap.pl b/src/task_smaltmap.pl
new file mode 100755
index 0000000..b0fdd7a
--- /dev/null
+++ b/src/task_smaltmap.pl
@@ -0,0 +1,190 @@
#!/usr/bin/env perl
# Map a read pair library to an assembly with SMALT and produce a final BAM
# that is coordinate-sorted, duplicate-removed, reheadered (so Bamtools sees
# an @HD line) and indexed. Runs the whole pipeline as a sequence of shell
# commands; -x prints the commands instead of running them.

use strict;
use warnings;
use Getopt::Long;
use File::Basename;
use File::Spec;
use File::Spec::Link;

my ($scriptname, $scriptdir) = fileparse($0);
# defaults for the SMALT indexing/mapping parameters (documented in $usage)
my %options = (
    k => 13,
    s => 2,
    y => 0.5,
    u => 1000,
    n => 1,
);

my $usage = qq/[options] <assembly.fa> <reads_1.fastq> <reads_2.fastq> <out.bam>

Maps read pairs to an asembly with SMALT, making a final BAM file that
is sorted, indexed and has duplicates removed.

The n^th read in reads_1.fastq should be the mate of the n^th read in
reads_2.fastq.

It is assumed that reads are 'innies', i.e. the correct orientation
is reads in a pair pointing towards each other (---> <---).

Options:

-k <int>
\tThe -k option (kmer hash length) when indexing the genome
\twith 'smalt index' [$options{k}]
-s <int>
\tThe -s option (step length) when indexing the genome
\twith 'smalt index' [$options{s}]
-m <int>
\tThe -m option when mapping reads with 'smalt map' [not used by default]
-n <int>
\tThe number of threads used when running 'smalt map' [$options{n}]
-y <float>
\tThe -y option when mapping reads with 'smalt map'.
\tThe default of 0.5 means that at least 50% of each read must map
\tperfectly. Depending on the quality of your reads, you may want to
\tincrease this to be more stringent (or consider using -m) [$options{y}]
-x
\tUse this to just print the commands that will be run, instead of
\tactually running them
-u <int>
\tThe -u option of 'smalt sample'. This is used to estimate the insert
\tsize from a sample of the reads, by mapping every n^th read pair [$options{u}]
/;

my $ops_ok = GetOptions(
    \%options,
    'wrapperhelp',
    'k=i',
    's=i',
    'y=f',
    'm:i',
    'n=i',
    'u=i',
    'x',
);

if ($options{wrapperhelp}) {
    die $usage;
}

if ($#ARGV != 3 or !($ops_ok)) {
    die "usage:\n$scriptname $usage";
}

my $assembly = $ARGV[0];
my $reads_1 = $ARGV[1];
my $reads_2 = $ARGV[2];
my $final_bam = $ARGV[3];
my $ERROR_PREFIX = '[REAPR smaltmap]';
# use the samtools/smalt binaries bundled next to this script
my $samtools = File::Spec->catfile($scriptdir, 'samtools');
my $smalt = File::Spec->catfile($scriptdir, 'smalt');
# all intermediate files share this prefix (includes the PID) so they can be
# removed with one 'rm' at the end
my $tmp_prefix = "$final_bam.tmp.$$.smaltmap";
my $smalt_index = "$tmp_prefix.smalt_index";
my $smalt_sample = "$tmp_prefix.smalt_sample";
my $raw_bam = "$tmp_prefix.raw.bam";
my $rmdup_bam = "$tmp_prefix.rmdup.bam";
my $header = "$tmp_prefix.header";

# check input files exist
foreach my $f ($assembly, $reads_1, $reads_2) {
    unless (-e $f) {
        print STDERR "$ERROR_PREFIX Can't find file '$f'\n";
        exit(1);
    }
}

# Run facheck on the input assembly and die if it doesn't like it.
my $this_script = File::Spec::Link->resolve($0);
$this_script = File::Spec->rel2abs($this_script);
my $reapr = File::Spec->catfile($scriptdir, 'reapr.pl');
my $cmd = "$reapr facheck $assembly";
if (system($cmd)) {
    print STDERR "
$ERROR_PREFIX reapr facheck failed - there is at least one sequence name
the input file '$assembly' that will break the pipeline.

Please make a new fasta file using:
reapr facheck $assembly $assembly.facheck
";
    exit(1);
}

# Common reason for this pipeline failing is a fasta file that samtools
# doesn't like. Try running faidx on it (if .fai file doesn't exist
# already). The .fai file would get made in the samtools view -T .. call
# anyway.
unless (-e "$assembly.fai") {
    my $cmd = "$samtools faidx $assembly";
    if (system($cmd)) {
        print STDERR "$ERROR_PREFIX Error in system call:\n$cmd\n\n";
        print STDERR "This means samtools is unhappy with the assembly
fasta file '$assembly'.

Common causes are empty lines in the file, or inconsistent line lengths
(all sequence lines must have the same length, except the last line of
any sequence which can be shorter). Cannot continue.
";
        # remove the partial index so a rerun starts clean
        unlink "$assembly.fai" or die $!;
        exit(1);
    }
}

# make a list of commands to be run
my @commands;

# index the genome
push @commands, "$smalt index -k $options{k} -s $options{s} $smalt_index $assembly";

# estimate the insert size from a sample of read pairs (every u^th pair)
push @commands, "$smalt sample -u $options{u} -o $smalt_sample $smalt_index $reads_1 $reads_2";

# run the mapping
my $m_option = '';
if ($options{m}) {
    $m_option = "-m $options{m}";
}

my $n_option = '';
if ($options{n} > 1) {
    $n_option = "-n $options{n}";
}

push @commands, "$smalt map -r 0 -x -y $options{y} $n_option $m_option -g $smalt_sample -f samsoft $smalt_index $reads_1 $reads_2"
    . q{ | awk '$1!~/^#/' } # SMALT writes some stuff to do with the sampling to stdout
    . " | $samtools view -S -T $assembly -b - > $raw_bam";

# sort the bam by coordinate
# NOTE(review): "sort <in> <out-prefix>" is the legacy (samtools <= 0.1.x)
# syntax -- relies on the samtools binary bundled with this package.
push @commands, "$samtools sort $raw_bam $raw_bam.sort";

# remove duplicates
push @commands, "$samtools rmdup $raw_bam.sort.bam $rmdup_bam";

# Bamtools needs the @HD line of the BAM to be present.
# So need to make new header for the BAM

# need to get tab characters printed properly. Can't rely on echo -e working, so pipe through awk
push @commands, q~echo "@HD VN:1.0 SO:coordinate" | awk '{OFS="\t"; $1=$1; print}' > ~ . "$header";
push @commands, "$samtools view -H $rmdup_bam >> $header";
push @commands, "$samtools reheader $header $rmdup_bam > $final_bam";

# index the BAM
push @commands, "$samtools index $final_bam";

# clean up temp files
push @commands, "rm $tmp_prefix.*";

# run the commands (or just print them when -x was given); abort on the
# first failure
foreach my $cmd (@commands) {
    if ($options{x}) {
        print "$cmd\n";
    }
    else {
        print "$ERROR_PREFIX Running: $cmd\n";
        if (system($cmd)) {
            print STDERR "$ERROR_PREFIX Error in system call:\n$cmd\n";
            exit(1);
        }
    }
}
diff --git a/src/task_stats.cpp b/src/task_stats.cpp
new file mode 100644
index 0000000..803a6fc
--- /dev/null
+++ b/src/task_stats.cpp
@@ -0,0 +1,953 @@
+#include <stdio.h>
+#include <math.h>
+#include <stdlib.h>
+#include <iostream>
+#include <cstring>
+#include <fstream>
+#include <map>
+#include <string>
+#include <utility>
+#include <algorithm>
+#include <iomanip>
+#include <list>
+
+#include "fasta.h"
+#include "trianglePlot.h"
+#include "coveragePlot.h"
+#include "histogram.h"
+#include "utils.h"
+#include "api/BamMultiReader.h"
+#include "api/BamReader.h"
+#include "tabix/tabix.hpp"
+
+using namespace BamTools;
+using namespace std;
+
+const string ERROR_PREFIX = "[REAPR stats] ";
+
+
+struct Region
+{
+ string id;
+ unsigned long start;
+ unsigned long end;
+};
+
+
+struct CmdLineOptions
+{
+ unsigned int binWidth;
+ unsigned long minInsert; // cutoff for outer frag size to count as proper pair
+ unsigned long maxInsert; // cutoff for outer frag size to count as proper pair
+ uint16_t minMapQuality; // ignore reads mapped with quality less than this
+ vector<Region> regions;
+ string bamInfile;
+ string gapsInfile;
+ string outprefix;
+ string perfectFile;
+ string refGCfile;
+ string gc2covFile;
+ string statsInfile; // insert.stats.txt made by preprocess
+ unsigned long printPlotsSkip;
+ string printPlotsChr;
+ unsigned long printPlotsStart;
+ unsigned long printPlotsEnd;
+ unsigned long plotCount;
+ string rangeID;
+ unsigned long aveFragmentLength;
+ float innerAveFragCov; // mean coverage of inner fragments. Used to normalize fragment coverage
+ unsigned long rangeStart;
+ unsigned long rangeEnd;
+ unsigned long maxReadLength; // for allocating memory
+ ofstream plot_ofs;
+ unsigned long areaSkip; // calculate triangle error every n^th base. This is the skip
+ unsigned long samples; // when simulating triangle error, use this many iterations
+ unsigned long maxOuterFragCov; // only used when simulating the triangle error
+};
+
+
+struct Stats
+{
+ TrianglePlot innerTriplot;
+ TrianglePlot outerTriplot;
+ CoveragePlot covProper;
+ CoveragePlot covOrphan;
+ CoveragePlot covBadInsert;
+ CoveragePlot covWrongOrient;
+ CoveragePlot covProperRev;
+ CoveragePlot covOrphanRev;
+ CoveragePlot covBadInsertRev;
+ CoveragePlot covWrongOrientRev;
+ CoveragePlot softClipFwdLeft;
+ CoveragePlot softClipRevLeft;
+ CoveragePlot softClipFwdRight;
+ CoveragePlot softClipRevRight;
+ vector<unsigned long> perfectCov;
+ Histogram globalReadCov;
+ Histogram globalFragmentCov;
+ Histogram globalFragmentLength;
+ Histogram globalFCDerror;
+ Histogram clipProportion;
+ vector<unsigned int> refGC;
+ vector<float> gc2cov;
+ double lastTriArea;
+
+ // in sorted BAM file, the reads are in order, which means the fragments, where
+ // we include the reads as part of the fragments, are ordered by start position.
+ // But they are not in order if we don't include the reads, since reads could
+ // be of different lengths or clipped. Use a multiset so that fragments are automatically
+ // kept in order
+ multiset<pair<unsigned long, unsigned long> > outerFragments; // fragment positions, including reads
+ multiset<pair<unsigned long, unsigned long> > innerFragments; // fragment positions, not including reads
+};
+
+
+
+// deals with command line options: fills the options struct
+void parseOptions(int argc, char** argv, CmdLineOptions& ops);
+
+// Fills vector with perfect mapping coverage from file.
+// File expected to be one number (= coverage) per line
+void getPerfectMapping(Tabix* ti, string& refname, vector<unsigned long>& v_in, unsigned long refLength);
+
+// Prints all stats up to (not including) the given base to os.
+// Updates values in stats accordingly.
+void printStats(CmdLineOptions& ops, string& refname, unsigned long pos, Stats& stats, map<string, list<pair<unsigned long, unsigned long> > >& gaps);
+
+// Loads the contents of file into vector. The file is expected to be
+// of the form:
+// GC coverage
+// (tab separated), where GC is all integers in [0,100] in ascending numerical order.
+void loadGC2cov(string& filename, vector<float>& v_in);
+
+// Loads the GC from bgzipped tabixed file, for just the given reference sequence.
+// Fills the vector with GC content at each position of the sequence (v_in[n] = GC at position n (zero based))
+void loadGC(Tabix& ti, string& refname, vector<unsigned int>& v_in);
+
+void setBamReaderRegion(BamReader& bamReader, Region& region, SamHeader& samHeader);
+
+
+int main(int argc, char* argv[])
+{
+ CmdLineOptions options;
+ parseOptions(argc, argv, options);
+ BamReader bamReader;
+ BamAlignment bamAlign;
+ SamHeader header;
+ RefVector references;
+ int32_t currentRefID = -1;
+ string currentRefIDstring = "";
+ bool firstRecord = true;
+ string out_stats = options.outprefix + ".global_stats.txt";
+ Stats stats;
+ stats.globalFragmentCov = Histogram(1);
+ stats.globalReadCov = Histogram(1);
+ stats.globalFCDerror = Histogram(1);
+ stats.globalFragmentLength = Histogram(1);
+ stats.clipProportion = Histogram(1);
+
+ map<string, list<pair<unsigned long, unsigned long> > > gaps;
+ Tabix ti_gc(options.refGCfile);
+ Tabix* ti_perfect;
+
+ if (options.perfectFile.size())
+ ti_perfect = new Tabix(options.perfectFile);
+
+
+ cout << "#chr\tpos\tperfect_cov\tread_cov\tprop_cov\torphan_cov\tbad_insert_cov\tbad_orient_cov\tread_cov_r\tprop_cov_r\torphan_cov_r\tbad_insert_cov_r\tbad_orient_cov_r\tfrag_cov\tfrag_cov_err\tFCD_mean\tclip_fl\tclip_rl\tclip_fr\tclip_rr\tFCD_err\tmean_frag_length\n";
+
+ loadGaps(options.gapsInfile, gaps);
+ loadGC2cov(options.gc2covFile, stats.gc2cov);
+ if (options.printPlotsSkip)
+ {
+ string plots_outfile = options.outprefix + ".plots";
+ options.plot_ofs.open(plots_outfile.c_str());
+
+ if (!options.plot_ofs.good())
+ {
+ cerr << ERROR_PREFIX << "Error opening file '" << plots_outfile << "'" << endl;
+ return 1;
+ }
+ options.plotCount = 0;
+ }
+
+ // Go through input bam file getting local stats
+ if (!bamReader.Open(options.bamInfile))
+ {
+ cerr << ERROR_PREFIX << "Error opening bam file '" << options.bamInfile << "'" << endl;
+ return 1;
+ }
+
+ header = bamReader.GetHeader();
+ references = bamReader.GetReferenceData();
+
+ // If we're only looking at some regions, check the bam file is indexed
+ if (options.regions.size())
+ {
+ if (!bamReader.LocateIndex())
+ {
+ cerr << ERROR_PREFIX << "Couldn't find index for bam file '" << options.bamInfile << "'!" << endl;
+ exit(1);
+ }
+
+ if (!bamReader.HasIndex())
+ {
+ cerr << ERROR_PREFIX << "No index for bam file '" << options.bamInfile << "'!" << endl;
+ exit(1);
+ }
+
+ setBamReaderRegion(bamReader, options.regions[0], header);
+/*
+ if (header.Sequences.Contains(options.rangeID))
+ {
+ int id = bamReader.GetReferenceID(options.rangeID);
+ int left = options.rangeStart ? options.rangeStart : -1;
+ int right = options.rangeEnd ? options.rangeEnd : -1;
+ bamReader.SetRegion(id, left, id, right);
+ }
+ else
+ {
+ cerr << ERROR_PREFIX << "Error. " << options.rangeID << " not found in bam file" << endl;
+ return 1;
+ }
+*/
+ }
+
+
+
+
+ unsigned long regionsIndex = 0;
+ bool incrementedRegionsIndex = false;
+
+ while (1)
+ {
+ if (!bamReader.GetNextAlignmentCore(bamAlign))
+ {
+ incrementedRegionsIndex = true;
+ if (regionsIndex + 1 < options.regions.size())
+ {
+ regionsIndex++;
+ setBamReaderRegion(bamReader, options.regions[regionsIndex], header);
+ continue;
+ }
+ else
+ {
+ break;
+ }
+ }
+
+ if (!bamAlign.IsMapped()
+ || bamAlign.IsDuplicate()
+ || bamAlign.MapQuality < options.minMapQuality)
+ {
+ continue;
+ }
+
+ // Deal with the case when we find a new reference sequence in the bam
+ if (currentRefID != bamAlign.RefID || incrementedRegionsIndex)
+ {
+ if (firstRecord)
+ {
+ firstRecord = false;
+ }
+ else
+ {
+ printStats(options, currentRefIDstring, references[currentRefID].RefLength, stats, gaps);
+ }
+
+ currentRefID = bamAlign.RefID;
+ currentRefIDstring = references[bamAlign.RefID].RefName;
+
+ if (options.perfectFile.size())
+ {
+ getPerfectMapping(ti_perfect, currentRefIDstring, stats.perfectCov, references[currentRefID].RefLength);
+ }
+
+
+
+ unsigned int startPos = options.regions.size() && options.regions[regionsIndex].start > 0 ? options.regions[regionsIndex].start - 1: 0;
+
+ stats.innerTriplot.clear(startPos);
+ stats.outerTriplot.clear(startPos);
+ stats.covProper = CoveragePlot(options.maxReadLength, startPos);
+ stats.covOrphan = CoveragePlot(options.maxReadLength, startPos);
+ stats.covBadInsert = CoveragePlot(options.maxReadLength, startPos);
+ stats.covWrongOrient = CoveragePlot(options.maxReadLength, startPos);
+ stats.covProperRev = CoveragePlot(options.maxReadLength, startPos);
+ stats.covOrphanRev = CoveragePlot(options.maxReadLength, startPos);
+ stats.covBadInsertRev = CoveragePlot(options.maxReadLength, startPos);
+ stats.covWrongOrientRev = CoveragePlot(options.maxReadLength, startPos);
+ stats.softClipFwdLeft = CoveragePlot(options.maxReadLength, startPos);
+ stats.softClipRevLeft = CoveragePlot(options.maxReadLength, startPos);
+ stats.softClipFwdRight = CoveragePlot(options.maxReadLength, startPos);
+ stats.softClipRevRight = CoveragePlot(options.maxReadLength, startPos);
+ stats.innerFragments.clear();
+ stats.outerFragments.clear();
+ loadGC(ti_gc, currentRefIDstring, stats.refGC);
+ incrementedRegionsIndex = false;
+ }
+
+ int64_t readEnd = bamAlign.GetEndPosition();
+
+ // print all stats to left of current read
+ printStats(options, currentRefIDstring, bamAlign.Position, stats, gaps);
+
+ // update count of soft-clipped bases. Don't care what type of read pair this is
+ if (bamAlign.CigarData[0].Type == 'S')
+ {
+ if (bamAlign.IsReverseStrand())
+ {
+ stats.softClipRevLeft.add(bamAlign.Position - 1);
+ }
+ else
+ {
+ stats.softClipFwdLeft.add(bamAlign.Position - 1);
+ }
+ }
+
+ if (bamAlign.CigarData.back().Type == 'S')
+ {
+ if (bamAlign.IsReverseStrand())
+ {
+ stats.softClipRevRight.add(readEnd);
+ }
+ else
+ {
+ stats.softClipFwdRight.add(readEnd);
+ }
+ }
+
+ // Work out what kind of read this is and update the right read coverage
+ // histogram
+ short pairOrientation = getPairOrientation(bamAlign);
+
+ // if an orphaned read
+ if (!bamAlign.IsMateMapped() || pairOrientation == UNPAIRED || pairOrientation == DIFF_CHROM)
+ {
+ if (bamAlign.IsReverseStrand()) stats.covOrphanRev.add(readEnd);
+ else stats.covOrphan.add(readEnd);
+ }
+ // correct orientation (but insert size could be good or bad)
+ else if (pairOrientation == INNIE)
+ {
+ int64_t fragStart = readEnd + 1;
+ int64_t fragEnd = bamAlign.MatePosition - 1;
+ if (0 < bamAlign.InsertSize && abs(bamAlign.InsertSize) < options.maxInsert * 5) stats.globalFragmentLength.add(bamAlign.InsertSize, 1);
+
+ // if insert size is bad
+ if (abs(bamAlign.InsertSize) < options.minInsert || abs(bamAlign.InsertSize) > options.maxInsert)
+ {
+ if (bamAlign.IsReverseStrand()) stats.covBadInsertRev.add(readEnd);
+ else stats.covBadInsert.add(readEnd);
+
+ }
+ // otherwise, this is a proper pair
+ else
+ {
+ if (bamAlign.IsReverseStrand()) stats.covProperRev.add(readEnd);
+ else stats.covProper.add(readEnd);
+
+ // update insert size distribution and fragment coverage. We only want
+ // to count each fragment once: in a sorted bam the first appearance
+ // of a fragment is when the insert size is positive
+ if (bamAlign.InsertSize < 0)
+ {
+ continue;
+ }
+
+ stats.outerFragments.insert(stats.outerFragments.end(), make_pair(bamAlign.Position, bamAlign.Position + bamAlign.InsertSize - 1));
+
+ // update inner fragment stuff
+ if (fragStart < fragEnd)
+ {
+ stats.innerFragments.insert(stats.innerFragments.end(), make_pair(fragStart, fragEnd));
+ }
+ }
+ }
+ // wrong orientation
+ else if (pairOrientation == SAME || pairOrientation == OUTTIE)
+ {
+ if (bamAlign.IsReverseStrand()) stats.covWrongOrientRev.add(readEnd);
+ else stats.covWrongOrient.add(readEnd);
+ }
+ else
+ {
+ cerr << ERROR_PREFIX << "Didn't expect this to happen..." << endl;
+ }
+ }
+
+ // print the remaining stats from the last ref sequence in the bam
+ //unsigned long endCoord = options.rangeID.size() && options.rangeEnd ? options.rangeEnd : references[currentRefID].RefLength;
+ unsigned long endCoord = options.regions.size() && options.regions.back().end != 0 ? options.regions.back().end : references[currentRefID].RefLength;
+ printStats(options, currentRefIDstring, endCoord, stats, gaps);
+
+ // Make some global plots
+ stats.globalReadCov.plot(options.outprefix + ".read_coverage", "pdf", "Read coverage", "Frequency");
+ stats.globalFragmentCov.plot(options.outprefix + ".fragment_coverage", "pdf", "Fragment coverage", "Frequency");
+ stats.globalFragmentLength.plot(options.outprefix + ".fragment_length", "pdf", "Fragment length", "Frequency");
+
+ stats.globalReadCov.writeToFile(options.outprefix + ".read_coverage.dat", 0, 1);
+ stats.globalFragmentCov.writeToFile(options.outprefix + ".fragment_coverage.dat", 0, 1);
+ stats.globalFragmentLength.writeToFile(options.outprefix + ".fragment_length.dat", 0, 1);
+
+ // print some global stats
+ ofstream ofs(out_stats.c_str());
+ if (!ofs.good())
+ {
+ cerr << ERROR_PREFIX << "Error opening file '" << out_stats << "'" << endl;
+ return 1;
+ }
+
+ double mean, sd;
+ unsigned long x;
+ ofs.precision(2);
+ fixed(ofs);
+ stats.globalReadCov.meanAndStddev(mean, sd);
+ ofs << "read_cov_mean\t" << mean << "\n"
+ << "read_cov_sd\t" << sd << "\n"
+ << "read_cov_mode\t" << stats.globalReadCov.mode(x) << "\n";
+
+ stats.globalFragmentCov.meanAndStddev(mean, sd);
+ ofs << "fragment_cov_mean\t" << mean << "\n"
+ << "fragment_cov_sd\t" << sd << "\n"
+ << "fragment_cov_mode\t" << stats.globalFragmentCov.mode(x) << "\n";
+
+ stats.globalFragmentLength.meanAndStddev(mean, sd);
+ ofs << "fragment_length_mean\t" << mean << "\n"
+ << "fragment_length_sd\t" << sd << "\n"
+ << "fragment_length_mode\t" << stats.globalFragmentLength.mode(x) << "\n";
+
+ ofs << "fragment_length_min\t" << options.minInsert << "\n"
+ << "fragment_length_max\t" << options.maxInsert << "\n";
+
+ if (options.perfectFile.size())
+ {
+ ofs << "use_perfect\t1\n";
+ }
+ else
+ {
+ ofs << "use_perfect\t0\n";
+ }
+
+ ofs << "sample_ave_fragment_length\t" << options.aveFragmentLength << "\n";
+ ofs << "fcd_skip\t" << options.areaSkip << endl;
+ ofs.close();
+
+ stats.globalFCDerror.setPlotOptionTrim(6);
+ stats.globalFCDerror.setPlotOptionXdivide(100);
+ stats.globalFCDerror.setPlotOptionUseLeftOfBins(true);
+ stats.globalFCDerror.plot(options.outprefix + ".FCDerror", "pdf", "FCD Error", "Frequency");
+ stats.globalFCDerror.writeToFile(options.outprefix + ".FCDerror.dat", 0, 0.01);
+ if (options.perfectFile.size())
+ delete ti_perfect;
+
+ return 0;
+}
+
+
+void parseOptions(int argc, char** argv, CmdLineOptions& ops)
+{
+ string usage;
+ short requiredArgs = 2;
+ int i;
+
+ usage = "[options] <preprocess output directory> <outfiles prefix>\n\n\
+options:\n\n\
+-f <int>\n\tInsert size [ave from stats.txt]\n\
+-i <int>\n\tMinimum insert size [pc1 from stats.txt]\n\
+-j <int>\n\tMaximum insert size [pc99 from stats.txt]\n\
+-m <int>\n\tMaximum read length (this doesn't need to be exact, it just\n\
+\tdetermines memory allocation, so must be >= max read length) [2000]\n\
+-p <string>\n\tName of .gz perfect mapping file made by 'perfectmap'\n\
+-q <int>\n\tIgnore reads with mapping quality less than this [0]\n\
+-s <int>\n\tCalculate FCD error every n^th base\n\
+\t[ceil((fragment size) / 1000)]\n\
+-u <string>\n\tFile containing list of chromosomes to look at\n\
+\t(one per line)\n\
+";
+/*
+-r id[:start-end]\n\tOnly look at the ref seq with this ID, and\n\
+\toptionally in the given base range\n\
+-t <int>\n\t-t N will make file of triangle plot data at every\n\
+\tN^th position. This file could be big!\n\
+\tRecommended usage is in conjunction with -r option\n\
+";
+*/
+ if (argc == 2 && strcmp(argv[1], "--wrapperhelp") == 0)
+ {
+ cerr << usage << endl;
+ exit(1);
+ }
+ else if (argc < requiredArgs)
+ {
+ cerr << "usage:\ntask_stats " << usage;
+ exit(1);
+ }
+
+
+ // set defaults
+ ops.maxReadLength = 2000;
+ ops.binWidth = 10;
+ ops.minMapQuality = 0;
+ ops.perfectFile = "";
+ ops.printPlotsSkip = 0;
+ ops.rangeID = "";
+ ops.rangeStart = 0;
+ ops.rangeEnd = 0;
+ ops.areaSkip = 0;
+ ops.minInsert = 0;
+ ops.aveFragmentLength = 0;
+ ops.maxInsert = 0;
+
+ for (i = 1; i < argc - requiredArgs; i++)
+ {
+ // deal with booleans
+
+ // non booleans are of form -option value, so check
+ // next value in array is there before using it!
+ if (strcmp(argv[i], "-b") == 0)
+ ops.binWidth = atoi(argv[i+1]);
+ else if (strcmp(argv[i], "-g") == 0)
+ ops.aveFragmentLength = atoi(argv[i+1]);
+ else if (strcmp(argv[i], "-i") == 0)
+ ops.minInsert = atoi(argv[i+1]);
+ else if (strcmp(argv[i], "-j") == 0)
+ ops.maxInsert = atoi(argv[i+1]);
+ else if (strcmp(argv[i], "-m") == 0)
+ ops.maxReadLength = atoi(argv[i+1]);
+ else if (strcmp(argv[i], "-p") == 0)
+ ops.perfectFile = argv[i+1];
+ else if (strcmp(argv[i], "-q") == 0)
+ ops.minMapQuality = atof(argv[i+1]);
+ else if (strcmp(argv[i], "-r") == 0)
+ {
+ string locus(argv[i+1]);
+ size_t pos_colon = locus.rfind(':');
+ if (pos_colon == string::npos)
+ {
+ ops.rangeID = locus;
+ }
+ else
+ {
+ ops.rangeID = locus.substr(0, pos_colon);
+ size_t pos_dash = locus.find('-', pos_colon);
+ if (pos_dash == string::npos)
+ {
+ cerr << ERROR_PREFIX << "Error getting coords from this input: " << locus << endl;
+ exit(1);
+ }
+
+ ops.rangeStart = atoi(locus.substr(pos_colon + 1, pos_dash - pos_colon).c_str());
+ ops.rangeEnd = atoi(locus.substr(pos_dash + 1).c_str());
+
+
+ if (ops.rangeEnd < ops.rangeStart)
+ {
+ cerr << ERROR_PREFIX << "Range end < range start. Cannot continue" << endl;
+ exit(1);
+ }
+ }
+ }
+ else if (strcmp(argv[i], "-s") == 0)
+ {
+ ops.areaSkip = atoi(argv[i+1]);
+ }
+ else if (strcmp(argv[i], "-t") == 0)
+ {
+ ops.printPlotsSkip = atoi(argv[i+1]);
+ ops.printPlotsChr = argv[i+2];
+ ops.printPlotsStart = atoi(argv[i+3]);
+ ops.printPlotsEnd = atoi(argv[i+4]);
+ i += 3;
+ }
+ else if (strcmp(argv[i], "-u") == 0)
+ {
+ // load regions from file into vector
+ string fname = argv[i+1];
+ ifstream ifs(fname.c_str());
+ if (!ifs.good())
+ {
+ cerr << ERROR_PREFIX << "Error opening file '" << fname << "'" << endl;
+ exit(1);
+ }
+
+ string line;
+
+ while (getline(ifs, line))
+ {
+ vector<string> v;
+ split(line, '\t', v);
+ Region r;
+ r.id = v[0];
+/*
+ if (v.size() > 1)
+ {
+ r.start = atoi(v[1].c_str());
+ r.end = atoi(v[2].c_str());
+ }
+ else
+ {
+ r.start = 0;
+ r.end = 0;
+ }
+*/
+ // doing regions within a chromosome is buggy, so don't do it for now
+ r.start = r.end = 0;
+ ops.regions.push_back(r);
+ }
+ ifs.close();
+ }
+ else
+ {
+ cerr << ERROR_PREFIX << "Error! Switch not recognised: " << argv[i] << endl;
+ exit(1);
+ }
+ i++;
+ }
+
+ if (argc - i != requiredArgs || argv[i+1][0] == '-')
+ {
+ cerr << usage;
+ exit(1);
+ }
+
+ string preprocessDirectory = argv[i];
+ ops.gapsInfile = preprocessDirectory + "/00.assembly.fa.gaps.gz";
+ ops.refGCfile = preprocessDirectory + "/00.assembly.fa.gc.gz";
+ ops.gc2covFile = preprocessDirectory + "/00.Sample/gc_vs_cov.lowess.dat";
+ ops.bamInfile = preprocessDirectory + "/00.in.bam";
+ ops.statsInfile = preprocessDirectory + "/00.Sample/insert.stats.txt";
+ ops.outprefix = argv[i+1];
+
+ ops.innerAveFragCov = 0;
+ ops.samples = 100000;
+ ops.maxOuterFragCov = 2000;
+
+ // get stats from file
+ ifstream ifs(ops.statsInfile.c_str());
+ if (!ifs.good())
+ {
+ cerr << "Error opening file '" << ops.statsInfile << "'" << endl;
+ exit(1);
+ }
+ string line;
+
+ while (getline(ifs, line))
+ {
+ vector<string> v;
+ split(line, '\t', v);
+
+ if (v[0].compare("pc1") == 0 && ops.minInsert == 0)
+ {
+ ops.minInsert = atoi(v[1].c_str());
+ }
+ else if (v[0].compare("ave") == 0 && ops.aveFragmentLength == 0)
+ {
+ ops.aveFragmentLength = atoi(v[1].c_str());
+ }
+ else if (v[0].compare("pc99") == 0 && ops.maxInsert == 0)
+ {
+ ops.maxInsert = atoi(v[1].c_str());
+ }
+ else if (v[0].compare("inner_mean_cov") == 0)
+ {
+ ops.innerAveFragCov = atof(v[1].c_str());
+ }
+ }
+
+ if (ops.minInsert == 0 || ops.aveFragmentLength == 0 || ops.maxInsert == 0 || ops.innerAveFragCov == 0)
+ {
+ cerr << ERROR_PREFIX << "Error getting stats from file '" << ops.statsInfile << "'" << endl;
+ exit(1);
+ }
+
+ // set area skip in terms of ave insert size, unless user already chose it
+ if (!ops.areaSkip)
+ {
+ ops.areaSkip = ceil(1.0 * ops.aveFragmentLength / 1000);
+ }
+}
+
+
+void printStats(CmdLineOptions& ops, string& refname, unsigned long pos, Stats& stats, map<string, list<pair<unsigned long, unsigned long> > >& gaps)
+{
+if (pos < stats.outerTriplot.centreCoord()) return;
+
+ // see if any gaps in this reference sequence
+ map<string, list<pair<unsigned long, unsigned long> > >::iterator gapsIter = gaps.find(refname);
+ list<pair<unsigned long, unsigned long> >::iterator iter;
+
+ if (gapsIter != gaps.end())
+ {
+ iter = gapsIter->second.begin();
+ }
+
+ for (unsigned long i = stats.outerTriplot.centreCoord(); i < pos; i++)
+ {
+ unsigned long gapStart = 0;
+ unsigned long gapEnd = 0;
+ unsigned long leftGapDistance = 0;
+ unsigned long rightGapDistance = 0;
+ bool inGap = false;
+ bool nearGap = false;
+
+ // update position of gaps iterator, so i is either to left of
+ // next gap, or i lies in a gap pointed to by iter
+ if (gapsIter != gaps.end())
+ {
+ while (iter != gapsIter->second.end() && iter->second < i)
+ {
+ iter++;
+ }
+
+ // if i lies in a gap
+ if (iter != gapsIter->second.end() && i >= iter->first)
+ {
+ inGap = true;
+ gapStart = iter->first;
+ gapEnd = iter->second;
+ }
+ else
+ {
+ if (iter != gapsIter->second.begin())
+ {
+ iter--;
+ leftGapDistance = i - iter->second;
+ iter++;
+ }
+ if (iter != gapsIter->second.end())
+ {
+ rightGapDistance = iter->first - i;
+ }
+
+ if (leftGapDistance != 0 && (leftGapDistance < rightGapDistance || rightGapDistance == 0) && leftGapDistance < ops.aveFragmentLength - 1)
+ {
+ iter--;
+ nearGap = true;
+ gapStart = iter->first;
+ gapEnd = iter->second;
+ iter++;
+ }
+ else if (rightGapDistance != 0 && (rightGapDistance < leftGapDistance || leftGapDistance == 0) && rightGapDistance < ops.aveFragmentLength - 1)
+ {
+ nearGap = true;
+ gapStart = iter->first;
+ gapEnd = iter->second;
+ }
+ }
+ }
+ unsigned long readCov = stats.covProper.depth() + stats.covOrphan.depth() + stats.covBadInsert.depth() + stats.covWrongOrient.depth();
+ unsigned long readCovRev = stats.covProperRev.depth() + stats.covOrphanRev.depth() + stats.covBadInsertRev.depth() + stats.covWrongOrientRev.depth();
+
+
+ cout << refname << "\t"
+ << i + 1 << "\t";
+
+ if (ops.perfectFile.size())
+ {
+ if (stats.perfectCov.size())
+ {
+ cout << stats.perfectCov[i] << "\t";
+ }
+ else
+ {
+ cout << 0 << "\t";
+ }
+ }
+ else
+ {
+ cout << "-1\t";
+ }
+
+ unsigned long innerFragCov = stats.innerTriplot.depth();
+ float innerFragCovCorrected = inGap ? 0 : 1.0 * (innerFragCov - stats.gc2cov[stats.refGC[i]]) / ops.innerAveFragCov;
+
+ double triArea;
+
+ if (i % ops.areaSkip == 0)
+ {
+ triArea = stats.outerTriplot.areaError(ops.maxInsert, ops.aveFragmentLength, inGap || nearGap, gapStart, gapEnd);
+ stats.lastTriArea = triArea;
+ if (triArea > -1)
+ {
+ stats.globalFCDerror.add((long) 100 * triArea,1);
+ }
+ }
+ else
+ {
+ triArea = stats.lastTriArea;
+ }
+
+ cout << readCov << "\t"
+ << (readCov ? 1.0 * stats.covProper.depth() / readCov : 0) << "\t"
+ << (readCov ? 1.0 * stats.covOrphan.depth() / readCov : 0) << "\t"
+ << (readCov ? 1.0 * stats.covBadInsert.depth() / readCov : 0) << "\t"
+ << (readCov ? 1.0 * stats.covWrongOrient.depth() / readCov : 0) << "\t"
+ << readCovRev << "\t"
+ << (readCovRev ? 1.0 * stats.covProperRev.depth() / readCovRev : 0) << "\t"
+ << (readCovRev ? 1.0 * stats.covOrphanRev.depth() / readCovRev : 0) << "\t"
+ << (readCovRev ? 1.0 * stats.covBadInsertRev.depth() / readCovRev : 0) << "\t"
+ << (readCovRev ? 1.0 * stats.covWrongOrientRev.depth() / readCovRev : 0) << "\t"
+ << innerFragCov << "\t"
+ << innerFragCovCorrected << "\t"
+ << stats.outerTriplot.mean() << "\t"
+ << stats.softClipFwdLeft.front() << "\t"
+ << stats.softClipRevLeft.front() << "\t"
+ << stats.softClipFwdRight.front() << "\t"
+ << stats.softClipRevRight.front() << "\t"
+ << triArea << "\t"
+ << stats.outerTriplot.meanFragLength() << "\n";
+
+
+ if (readCov)
+ {
+ stats.clipProportion.add((long) 100 * stats.softClipFwdLeft.front() / readCov, 1);
+ stats.clipProportion.add((long) 100 * stats.softClipFwdRight.front() / readCov, 1);
+ }
+ if (readCovRev)
+ {
+ stats.clipProportion.add((long) 100 * stats.softClipRevLeft.front() / readCovRev, 1);
+ stats.clipProportion.add((long) 100 * stats.softClipRevRight.front() / readCovRev, 1);
+ }
+
+ if (ops.printPlotsSkip && refname.compare(ops.printPlotsChr) == 0 && ops.printPlotsStart <= i + 1 && i + 1 <= ops.printPlotsEnd)
+ {
+ if (ops.plotCount % ops.printPlotsSkip == 0) {
+ string plot = stats.outerTriplot.toString(ops.maxInsert);
+ ops.plot_ofs << refname << "\t" << i + 1 << "\t" << ops.maxInsert;
+
+ if (plot.size())
+ {
+ ops.plot_ofs << "\t" << stats.outerTriplot.toString(ops.maxInsert);
+ }
+ ops.plot_ofs << "\n";
+ }
+
+ ops.plotCount++;
+ }
+ else if (ops.printPlotsSkip && refname.compare(ops.printPlotsChr) == 0 && ops.printPlotsEnd < i + 1)
+ {
+ ops.plot_ofs.close();
+ exit(0);
+ }
+
+ stats.innerTriplot.shift(1);
+ stats.innerTriplot.add(stats.innerFragments);
+ stats.outerTriplot.shift(1);
+ stats.outerTriplot.add(stats.outerFragments);
+ stats.covProper.increment();
+ stats.covOrphan.increment();
+ stats.covBadInsert.increment();
+ stats.covWrongOrient.increment();
+ stats.covProperRev.increment();
+ stats.covOrphanRev.increment();
+ stats.covBadInsertRev.increment();
+ stats.covWrongOrientRev.increment();
+ stats.softClipFwdLeft.increment();
+ stats.softClipRevLeft.increment();
+ stats.softClipFwdRight.increment();
+ stats.softClipRevRight.increment();
+ if (stats.covProper.depth() + stats.covProperRev.depth() ) stats.globalReadCov.add(stats.covProper.depth() + stats.covProperRev.depth(), 1);
+ if (stats.outerTriplot.depth()) stats.globalFragmentCov.add(stats.outerTriplot.depth(), 1);
+ }
+}
+
+
+void getPerfectMapping(Tabix* ti, string& refname, vector<unsigned long>& v_in, unsigned long refLength)
+{
+ string line;
+ vector<string> data;
+ v_in.clear();
+ if(ti->setRegion(refname))
+ {
+ while (ti->getNextLine(line)) {
+ split(line, '\t', data);
+ v_in.push_back(atoi(data[2].c_str()));
+ }
+ }
+
+ if (v_in.size() == 0)
+ {
+ cerr << ERROR_PREFIX << "Warning: didn't get any perfect mapping info for '" << refname << "'. Assuming zero perfect coverage" << endl;
+ }
+ else if (v_in.size() != refLength)
+ {
+ cerr << ERROR_PREFIX << "Mismatch in sequence length when getting perfect mapping coverage." << endl
+ << "I found " << v_in.size() << " lines for sequence '" << refname << "'. Expected " << refLength << endl;
+ exit(1);
+ }
+}
+
+
+
+void loadGC2cov(string& filename, vector<float>& v_in)
+{
+ ifstream ifs(filename.c_str());
+ string line;
+
+ if (!ifs.good())
+ {
+ cerr << ERROR_PREFIX << "Error opening file '" << filename << "'" << endl;
+ exit(1);
+ }
+
+ while (getline(ifs, line))
+ {
+ vector<string> tmp;
+ split(line, '\t', tmp);
+ unsigned int gc = atoi(tmp[0].c_str());
+
+ // sanity check we've got the right GC
+ if (gc != v_in.size())
+ {
+ cerr << ERROR_PREFIX << "Error in GC to coverage file '" << filename << "'." << endl
+ << "Need GC in numerical order from 0 to 100. Problem around this line:" << endl
+ << line << endl;
+ exit(1);
+ }
+
+ v_in.push_back(atof(tmp[1].c_str()));
+ }
+
+ ifs.close();
+}
+
+
+
+void loadGC(Tabix& ti, string& refname, vector<unsigned int>& v_in)
+{
+ v_in.clear();
+ ti.setRegion(refname);
+ vector<string> tmp;
+ string line;
+
+ // load the GC into vector
+ while (ti.getNextLine(line))
+ {
+ split(line, '\t', tmp);
+ v_in.push_back(atoi(tmp[2].c_str()));
+ }
+}
+
+
+
+void setBamReaderRegion(BamReader& bamReader, Region& region, SamHeader& header)
+{
+ if (header.Sequences.Contains(region.id))
+ {
+ int id = bamReader.GetReferenceID(region.id);
+ if (region.start == 0 && region.end == 0)
+ {
+ bamReader.SetRegion(id, 1, id, bamReader.GetReferenceData()[id].RefLength);
+ }
+ else // this currently can't happen. It's buggy.
+ {
+ bamReader.SetRegion(id, region.start - 1, id, region.end - 1);
+ }
+ }
+ else
+ {
+ cerr << ERROR_PREFIX << "Error. " << region.id << " not found in bam file" << endl;
+ exit(1);
+ }
+}
diff --git a/src/task_summary.pl b/src/task_summary.pl
new file mode 100755
index 0000000..1530807
--- /dev/null
+++ b/src/task_summary.pl
@@ -0,0 +1,339 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use File::Spec;
+use File::Basename;
+use Getopt::Long;
+use List::Util qw[min max];
+
+my ($scriptname, $scriptdir) = fileparse($0);
+my %options = ('min_insert_error' => 0);
+my $usage = qq/[options] <assembly.fa> <score prefix> <break prefix> <outfiles prefix>
+
+where 'score prefix' is the outfiles prefix used when score was run, and
+'break prefix' is the outfiles prefix used when break was run.
+
+Options:
+
+-e <float>
+\tMinimum FCD error [0]
+/;
+
+my $ERROR_PREFIX = '[REAPR summary]';
+
+my $ops_ok = GetOptions(
+ \%options,
+ 'min_insert_error|e=f',
+ 'wrapperhelp',
+);
+
+if ($options{wrapperhelp}) {
+ print STDERR "$usage\n";
+ exit(1);
+}
+
+if ($#ARGV != 3 or !($ops_ok)) {
+ print STDERR "usage:\n$scriptname $usage\n";
+ exit(1);
+}
+
+my $ref_fa = $ARGV[0];
+my $score_prefix = $ARGV[1];
+my $break_prefix = $ARGV[2];
+my $out_prefix = $ARGV[3];
+my $ref_broken_fa = "$break_prefix.broken_assembly.fa";
+my $errors_gff = "$score_prefix.errors.gff.gz";
+my $score_dat_file = "$score_prefix.score_histogram.dat";
+my $ref_fai = "$ref_fa.fai";
+my $out_tsv = "$out_prefix.stats.tsv";
+my $out_report = "$out_prefix.report.txt";
+my $out_report_tsv = "$out_prefix.report.tsv";
+
+my @n50_stats = qw(bases
+sequences
+mean_length
+longest
+N50
+N50_n
+N60
+N60_n
+N70
+N70_n
+N80
+N80_n
+N90
+N90_n
+N100
+N100_n
+gaps
+gaps_bases);
+
+
+my @stats_keys = qw(
+length
+errors
+errors_length
+perfect_cov
+perfect_cov_length
+repeat
+repeat_length
+clip
+clip_length
+score
+score_length
+frag_dist
+frag_dist_length
+frag_dist_gap
+frag_dist_gap_length
+frag_cov
+frag_cov_length
+frag_cov_gap
+frag_cov_gap_length
+read_cov
+read_cov_length
+link
+link_length
+read_orientation
+read_orientation_length
+);
+
+
+my @stats_keys_for_report = qw(
+perfect_bases
+frag_dist
+frag_dist_gap
+frag_cov
+frag_cov_gap
+score
+link
+clip
+repeat
+read_cov
+perfect_cov
+read_orientation
+);
+
+
+my %stats_keys_for_report_to_outstring = (
+ 'perfect_bases' => 'error_free',
+ 'perfect_cov' => 'low_perfect_cov',
+ 'repeat' => 'collapsed_repeat',
+ 'clip' => 'soft_clipped',
+ 'score' => 'low_score',
+ 'frag_dist' => 'FCD',
+ 'frag_dist_gap' => 'FCD_gap',
+ 'frag_cov' => 'frag_cov',
+ 'frag_cov_gap' => 'frag_cov_gap',
+ 'read_cov' => 'read_cov',
+ 'link' => 'link',
+ 'read_orientation' => 'read_orientation',
+);
+
+
+
+
+
+my %gff2stat = (
+ 'Repeat' => 'repeat',
+ 'Low_score' => 'score',
+ 'Clip' => 'clip',
+ 'FCD' => 'frag_dist',
+ 'FCD_gap' => 'frag_dist_gap',
+ 'Frag_cov' => 'frag_cov',
+ 'Frag_cov_gap' => 'frag_cov_gap',
+ 'Read_cov' => 'read_cov',
+ 'Link' => 'link',
+ 'Perfect_cov' => 'perfect_cov',
+ 'Read_orientation' => 'read_orientation',
+);
+
+
+# Per-sequence TSV of error counts: one row per sequence, one column per stat.
+open OUT, ">$out_tsv" or die "$ERROR_PREFIX Error opening file $out_tsv";
+print OUT "#id\t" . join("\t", @stats_keys) . "\n";
+
+# Read sequence names and lengths from the fasta index.
+open FAI, $ref_fai or die "$ERROR_PREFIX Error opening file $ref_fai";
+my %ref_lengths;
+my @ref_ids; # need these because some sequences are not in the errors file and
+             # might as well preserve the order of the sequences
+
+while (<FAI>) {
+    chomp;
+    my ($id, $length) = split /\t/;
+    $ref_lengths{$id} = $length;
+    push @ref_ids, $id;
+}
+
+close FAI;
+
+# error counts, keyed on sequence name then stat name
+my %error_stats;
+
+open GFF, "gunzip -c $errors_gff |" or die "$ERROR_PREFIX Error opening file $errors_gff";
+my $ref_id_index = 0;
+
+while (<GFF>) {
+    chomp;
+    my @gff = split /\t/;
+
+    # make new hash if it's the first time we've seen this sequence
+    unless (exists $error_stats{$gff[0]}) {
+        $error_stats{$gff[0]} = {};
+
+        foreach (@stats_keys) {
+            $error_stats{$gff[0]}{$_} = 0;
+        }
+        $error_stats{$gff[0]}{length} = $ref_lengths{$gff[0]};
+    }
+
+    # update the error counts for this sequence
+    exists $gff2stat{$gff[2]} or die "$ERROR_PREFIX Didn't recognise type of error '$gff[2]' from gff file. Cannot continue";
+
+    # NOTE(review): this filter can never fire as written -- any 'Insert*' type
+    # would already have died at the exists check above, since %gff2stat has no
+    # Insert keys. Confirm whether it was meant to run before that check.
+    next if ($gff[2] =~ /^Insert/ and $gff[5] <= $options{min_insert_error});
+
+    # count the event, and the number of bases it spans (gff coords inclusive)
+    $error_stats{$gff[0]}{$gff2stat{$gff[2]}}++;
+    $error_stats{$gff[0]}{"$gff2stat{$gff[2]}_length"} += $gff[4] - $gff[3] + 1;
+    $error_stats{$gff[0]}{errors}++;
+    $error_stats{$gff[0]}{errors_length} += $gff[4] - $gff[3] + 1;
+}
+
+close GFF;
+
+# print the output, adding up total stats for whole assembly along the way
+my %whole_assembly_stats;
+foreach (@stats_keys) {
+    $whole_assembly_stats{$_} = 0;
+}
+foreach my $id (@ref_ids) {
+    if (exists $error_stats{$id}) {
+        my $outstring = "$id\t";
+        # (bug fix) was: $error_stats{length} = ..., which set a stray
+        # top-level key instead of this sequence's length entry
+        $error_stats{$id}{length} = $ref_lengths{$id};
+
+        foreach (@stats_keys) {
+            $outstring .= $error_stats{$id}{$_} . "\t";
+            $whole_assembly_stats{$_} += $error_stats{$id}{$_};
+        }
+        # (bug fix) strip the trailing tab; chomp only removes newlines, so the
+        # previous version left a trailing tab on every row
+        $outstring =~ s/\t$//;
+        print OUT "$outstring\n";
+    }
+    else {
+        # sequence had no reported errors: length followed by a zero per stat
+        print OUT "$id\t$ref_lengths{$id}\t" . join("\t", (0) x 22) . "\n";
+        $whole_assembly_stats{length} += $ref_lengths{$id};
+    }
+}
+
+print OUT "WHOLE_ASSEMBLY";
+foreach (@stats_keys) {
+    print OUT "\t", $whole_assembly_stats{$_};
+}
+print OUT "\n";
+close OUT;
+
+
+# Human-readable summary report.
+open my $fh, ">$out_report" or die "$ERROR_PREFIX Error opening file '$out_report'";
+my $n50_exe = File::Spec->catfile($scriptdir, 'n50');
+
+print $fh "Stats for original assembly '$ref_fa':\n";
+my %stats_original;
+get_n50($n50_exe, $ref_fa, \%stats_original);
+print $fh n50_to_readable_string(\%stats_original);
+
+my $total_bases = $stats_original{'bases'};
+
+# get the % perfect bases from the histogram written by score
+# (the bin for score 0 holds the count of error-free bases)
+my $perfect_bases = 0;
+open F, "$score_dat_file" or die "$ERROR_PREFIX Error opening file $score_dat_file";
+while (<F>) {
+    chomp;
+    my ($score, $count) = split /\t/;
+    if ($score == 0) {
+        $perfect_bases = $count;
+        last;
+    }
+}
+
+close F;
+
+$total_bases != 0 or die "Error getting %perfect bases. Total base count of zero from file '$score_dat_file'\n";
+my $percent_perfect = sprintf("%.2f", 100 * $perfect_bases / $total_bases);
+# Errors (FCD and fragment coverage) are reported separately from warnings.
+print $fh "Error free bases: $percent_perfect\% ($perfect_bases of $total_bases bases)\n",
+    "\n" , $whole_assembly_stats{frag_dist} + $whole_assembly_stats{frag_dist_gap} + $whole_assembly_stats{frag_cov} + $whole_assembly_stats{frag_cov_gap}, " errors:\n",
+    "FCD errors within a contig: $whole_assembly_stats{frag_dist}\n",
+    "FCD errors over a gap: $whole_assembly_stats{frag_dist_gap}\n",
+    "Low fragment coverage within a contig: $whole_assembly_stats{frag_cov}\n",
+    "Low fragment coverage over a gap: $whole_assembly_stats{frag_cov_gap}\n",
+    "\n", $whole_assembly_stats{score} + $whole_assembly_stats{link} + $whole_assembly_stats{clip} + $whole_assembly_stats{repeat} + $whole_assembly_stats{read_cov} + $whole_assembly_stats{perfect_cov} + $whole_assembly_stats{read_orientation} , " warnings:\n",
+    "Low score regions: $whole_assembly_stats{score}\n",
+    "Links: $whole_assembly_stats{link}\n",
+    "Soft clip: $whole_assembly_stats{clip}\n",
+    "Collapsed repeats: $whole_assembly_stats{repeat}\n",
+    "Low read coverage: $whole_assembly_stats{read_cov}\n",
+    "Low perfect coverage: $whole_assembly_stats{perfect_cov}\n",
+    "Wrong read orientation: $whole_assembly_stats{read_orientation}\n";
+
+
+print $fh "\nStats for broken assembly '$ref_broken_fa':\n";
+my %stats_broken;
+get_n50($n50_exe, $ref_broken_fa, \%stats_broken);
+print $fh n50_to_readable_string(\%stats_broken);
+close $fh;
+
+$whole_assembly_stats{perfect_bases} = $perfect_bases;
+
+# Machine-readable report: one header row, then a single data row with the
+# original assembly's N50 stats, the broken assembly's ('_br' suffix), and
+# the whole-assembly error counts.
+open F, ">$out_report_tsv" or die "$ERROR_PREFIX Error opening file '$out_report_tsv'";
+print F "#filename\t" . join("\t", @n50_stats) . "\t";
+
+foreach(@n50_stats) {
+    print F $_ . "_br\t";
+}
+
+
+print F make_tsv_string(\@stats_keys_for_report, \%stats_keys_for_report_to_outstring) . "\n"
+    . File::Spec->rel2abs($ref_fa) . "\t"
+    . make_tsv_string(\@n50_stats, \%stats_original) . "\t"
+    . make_tsv_string(\@n50_stats, \%stats_broken) . "\t"
+    . make_tsv_string(\@stats_keys_for_report, \%whole_assembly_stats) . "\n";
+close F;
+
+
+# Runs the n50 executable on a fasta file and fills the given hashref with the
+# stat_name => value pairs it prints (one pair per line, whitespace separated).
+# Dies if the command cannot be started or exits non-zero.
+sub get_n50 {
+    my $exe = shift;
+    my $infile = shift;
+    my $hash_ref = shift;
+
+    # List-form pipe open: no shell is involved, so filenames containing
+    # spaces or metacharacters are passed through safely (the old two-arg
+    # "cmd|" form went via sh). A lexical filehandle avoids clobbering the
+    # global F handle used elsewhere in this script.
+    open my $fh, '-|', $exe, $infile or die "$ERROR_PREFIX Error getting N50 from $infile";
+    while (<$fh>) {
+        chomp;
+        my ($stat, $value) = split;
+        $hash_ref->{$stat} = $value;
+    }
+    close $fh or die $!;
+}
+
+# Formats the hash filled by get_n50() as a human-readable multi-line string.
+# The interpolated literal embeds real newlines, so the layout below is the
+# exact output format -- do not re-indent it.
+sub n50_to_readable_string {
+    my $h = shift;
+    return "Total length: $h->{bases}
+Number of sequences: $h->{sequences}
+Mean sequence length: $h->{mean_length}
+Length of longest sequence: $h->{longest}
+N50 = $h->{N50}, n = $h->{N50_n}
+N60 = $h->{N60}, n = $h->{N60_n}
+N70 = $h->{N70}, n = $h->{N70_n}
+N80 = $h->{N80}, n = $h->{N80_n}
+N90 = $h->{N90}, n = $h->{N90_n}
+N100 = $h->{N100}, n = $h->{N100_n}
+Number of gaps: $h->{gaps}
+Total gap length: $h->{gaps_bases}
+";
+}
+
+# Joins the values of %$h, looked up in the order given by @$keys, with tabs.
+sub make_tsv_string {
+    my ($keys, $h) = @_;
+    return join("\t", map { $h->{$_} } @{$keys});
+}
diff --git a/src/trianglePlot.cpp b/src/trianglePlot.cpp
new file mode 100644
index 0000000..1d359bb
--- /dev/null
+++ b/src/trianglePlot.cpp
@@ -0,0 +1,397 @@
+#include "trianglePlot.h"
+
+
+// Construct an empty plot centred on centreCoord; all running totals start at zero.
+TrianglePlot::TrianglePlot(unsigned long centreCoord) : totalDepth_(0), depthSum_(0), centreCoord_(centreCoord), totalFragLength_(0) {}
+
+
+// Moves the centre of the plot n bases to the right, dropping fragments that
+// no longer cover the new centre and updating the running totals
+// (depthSum_ / totalDepth_ / totalFragLength_) incrementally.
+void TrianglePlot::shift(unsigned long n)
+{
+    // total depth is unchanged.
+    centreCoord_ += n;
+    depthSum_ -= n * totalDepth_;
+
+    // remove fragments which end before the new centre position;
+    // fragments_ is ordered by end coordinate, so stop at the first keeper
+    while (fragments_.size())
+    {
+        multiset< pair<unsigned long, unsigned long> >::iterator p = fragments_.begin();
+
+        // remove this coord, if necessary
+        if (p->second < centreCoord_)
+        {
+            // translate the coords (both negative: fragment is left of centre)
+            long start = p->first - centreCoord_;
+            long end = p->second - centreCoord_;
+
+            // update the depths: reverses the contribution made in add()
+            depthSum_ += ( start * (start - 1) - end * (end + 1) ) / 2;
+            totalDepth_ += start - end - 1;
+
+            // update the total frag length
+            totalFragLength_ -= end - start + 1;
+
+            // forget about this fragment: it doesn't cover the centre position
+            fragments_.erase(p);
+        }
+        else
+        {
+            break;
+        }
+    }
+}
+
+
+// Moves the centre of the plot to absolute position n.
+// NOTE(review): when n < centreCoord_ the subtraction wraps (unsigned); the
+// modular arithmetic inside shift() appears to compensate, but confirm
+// callers only ever move the plot rightwards.
+void TrianglePlot::move(unsigned long n)
+{
+    shift(n - centreCoord_);
+}
+
+
+// Tries to add the fragment [first, second] (inclusive genome coordinates).
+// Returns true if it was added, OR if it ends before the centre (it could
+// never be added later, so the caller may discard it). Returns false only
+// when the fragment starts beyond the centre.
+bool TrianglePlot::add(pair<unsigned long, unsigned long>& fragment)
+{
+    // add the fragment, if possible
+    if (fragment.first <= centreCoord_ && centreCoord_ <= fragment.second)
+    {
+        // translate the coords so the centre is zero (start <= 0 <= end)
+        long start = fragment.first - centreCoord_;
+        long end = fragment.second - centreCoord_;
+
+        // update the depths: each covered offset x in [start, end] contributes
+        // x to depthSum_ (sum of consecutive integers) and 1 to totalDepth_
+        depthSum_ += ( end * (end + 1) - start * (start - 1)) / 2;
+        totalDepth_ += end - start + 1;
+        totalFragLength_ += end - start + 1;
+
+        // update the list of fragments covering this plot
+        fragments_.insert(fragments_.end(), fragment);
+
+        return true;
+    }
+    else if (fragment.second < centreCoord_)
+    {
+        // fragment lies entirely left of the centre: report it as consumed
+        return true;
+    }
+    // if can't add the fragment, nothing to do
+    else
+    {
+        return false;
+    }
+}
+
+// Drains fragments from 'frags' (ordered by end coordinate) into the plot:
+// each accepted fragment is removed from the set; iteration stops at the
+// first fragment that starts beyond the centre of the plot.
+void TrianglePlot::add(multiset<pair<unsigned long, unsigned long> >& frags)
+{
+    while (!frags.empty())
+    {
+        multiset<pair<unsigned long, unsigned long> >::iterator it = frags.begin();
+        pair<unsigned long, unsigned long> fragment = *it;
+
+        if (!add(fragment))
+        {
+            break;
+        }
+
+        frags.erase(it);
+    }
+}
+
+
+// Mean centre-relative offset of all covered bases; 0 for an empty plot.
+double TrianglePlot::mean()
+{
+    if (totalDepth_ == 0)
+    {
+        return 0;
+    }
+
+    return static_cast<double>(depthSum_) / totalDepth_;
+}
+
+
+// Returns the genome coordinate the plot is centred on.
+unsigned long TrianglePlot::centreCoord()
+{
+    return centreCoord_;
+}
+
+
+// Number of fragments currently covering the centre position (note: this is
+// the fragment count, not totalDepth_, which counts covered bases).
+unsigned long TrianglePlot::depth()
+{
+    return fragments_.size();
+}
+
+// Mean length of the fragments currently covering the plot, or 0 if empty.
+// (bug fix: the division was previously done in integer arithmetic before
+// the result was converted to double, silently truncating the mean)
+double TrianglePlot::meanFragLength()
+{
+    return fragments_.size() ? 1.0 * totalFragLength_ / fragments_.size() : 0;
+}
+
+
+// Forget all fragments and re-centre the (now empty) plot at position n.
+void TrianglePlot::clear(unsigned long n)
+{
+    centreCoord_ = n;
+    totalDepth_ = 0;
+    depthSum_ = 0;
+    totalFragLength_ = 0;
+    fragments_.clear();
+}
+
+// Orders fragments by end coordinate. Duplicates the fragcomp functor used
+// by fragments_, but as a plain member function.
+bool TrianglePlot::comparePairBySecond(pair<unsigned long, unsigned long>& i, pair<unsigned long, unsigned long>& j)
+{
+    return i.second < j.second;
+}
+
+
+// Fills leftHeights/rightHeights with the plot's per-offset coverage profile,
+// normalised by the number of fragments. Leaves the vectors untouched when
+// the plot is empty.
+void TrianglePlot::getHeights(unsigned long maxInsert, vector<double>& leftHeights, vector<double>& rightHeights)
+{
+    if (fragments_.size() == 0)
+        return;
+
+    leftHeights.clear();
+    for (unsigned long i = 0; i <= maxInsert; i++)
+    {
+        leftHeights.push_back(0);
+    }
+    rightHeights.clear();
+    for (unsigned long i = 0; i <= maxInsert; i++)
+    {
+        rightHeights.push_back(0);
+    }
+
+    unsigned long rightHeight = 0;
+
+    // histogram of fragment extents either side of the centre.
+    // NOTE(review): unlike areaError(), these indices are not clamped to
+    // maxInsert -- a fragment extending further than maxInsert from the
+    // centre would index out of bounds; confirm callers guarantee this.
+    for (multiset< pair<unsigned long, unsigned long> >:: iterator p = fragments_.begin(); p != fragments_.end(); p++)
+    {
+        rightHeights[p->second - centreCoord_]++;
+        rightHeight++;
+        leftHeights[centreCoord_ - p->first]++;
+    }
+
+    unsigned long leftHeight = fragments_.size();
+
+    // convert counts in place to cumulative fractions of fragments still
+    // covering each offset.
+    // NOTE(review): writing index i-1 while subtracting the count at index i
+    // looks off by one relative to the equivalent passes in toString() and
+    // areaError() (the count at index 0 is never subtracted); verify against
+    // the intended plot shape before reusing this method.
+    for(unsigned long i = 1; i < maxInsert - 1; i++)
+    {
+        leftHeights[i-1] = 1.0 * leftHeight / fragments_.size();
+        leftHeight -= leftHeights[i];
+        rightHeights[i-1] = 1.0 * rightHeight / fragments_.size();
+        rightHeight -= rightHeights[i];
+    }
+}
+
+
+// Renders the plot as space-separated coverage values for integer offsets
+// from -maxInsert to +maxInsert around the centre (see header comment for an
+// example). Returns "" for an empty plot.
+string TrianglePlot::toString(unsigned long maxInsert)
+{
+    if (fragments_.size() == 0)
+    {
+        return "";
+    }
+
+    stringstream ss;
+    vector<unsigned long> leftHeights(maxInsert, 0);
+    vector<unsigned long> rightHeights(maxInsert, 0);
+    unsigned long rightHeight = 0;
+
+    // histogram of fragment extents either side of the centre.
+    // NOTE(review): indices are not clamped to maxInsert here (areaError()
+    // clamps with min()); confirm fragments cannot extend past maxInsert.
+    for (multiset< pair<unsigned long, unsigned long> >:: iterator p = fragments_.begin(); p != fragments_.end(); p++)
+    {
+        rightHeights[p->second - centreCoord_]++;
+        rightHeight++;
+        leftHeights[centreCoord_ - p->first]++;
+    }
+
+    unsigned long height = 0;
+
+    // left side: accumulate from the far edge in towards the centre
+    for (unsigned long i = maxInsert - 1; i > 0; i--)
+    {
+        height += leftHeights[i];
+        ss << height << " ";
+    }
+
+    ss << 0 << " " << fragments_.size() << " ";
+
+    // right side: start with all fragments and drop those whose right
+    // extent falls below each successive offset
+    for (unsigned long i = 1; i < maxInsert; i++)
+    {
+        ss << rightHeight << " ";
+        rightHeight -= rightHeights[i];
+    }
+
+    ss << rightHeight;
+    return ss.str();
+}
+
+
+// Theoretical (normalised, in [0,1]) FCD plot height at 'position' for a plot
+// centred on centreCoord_, given a gap [gapStart, gapEnd] and the library's
+// mean insert size. All coordinates are translated so the centre is zero.
+// Aborts if the gap geometry matches none of the expected cases.
+double TrianglePlot::getTheoryHeight(unsigned long gapStart, unsigned long gapEnd, unsigned long position, unsigned long meanInsert)
+{
+    long s = (long) gapStart - (long) centreCoord_; // gap start centred on zero
+    long e = (long) gapEnd - (long) centreCoord_; // gap end centred on zero
+    long p = (long) position - (long) centreCoord_; // position centred at zero
+    long i = (long) meanInsert;
+
+    // beyond one insert length from the centre the expected height is zero
+    if (p <= -i || p >= i)
+    {
+        return 0;
+    }
+
+    // theory height depends on where the gap is relative to the centre of the plot.
+    if (s <= 0 && 0 <= e)
+    {
+        // gap spans the centre: plateau of height 1 across the gap,
+        // linear ramps on either side
+        if (p <= e - i || s + i <= p)
+            return 0;
+        else if (p < s)
+            return 1.0 * (p - e + i) / (s + i - e);
+        else if (p <= e)
+            return 1.0;
+        else
+            return -1.0 * (p - s - i) / (s + i - e);
+    }
+    else if (s > 0 && e > i)
+    {
+        // gap entirely right of the centre, ending beyond one insert length
+        if (p < s - i)
+            return 1.0 * (p + i) / s;
+        else if (p <= 0)
+            return 1.0;
+        else if (p < s)
+            return -1.0 * (p - s) / s;
+        else
+            return 0;
+    }
+    else if (e < 0 && s < -i)
+    {
+        // mirror image: gap entirely left of the centre, starting beyond one
+        // insert length
+        if (p <= e)
+            return 0;
+        else if (p < 0)
+            return -1.0 * (p - e) / e;
+        else if (p <= e + i)
+            return 1.0;
+        else
+            return 1.0 * (p - i) / e;
+    }
+    else
+    {
+        // remaining case: gap lies wholly within one insert length of the
+        // centre. A gap on the left is reflected to the right so a single
+        // set of formulas handles both sides.
+        if (-i <= s && s <= e && e < 0)
+        {
+            s += (long) i;
+            e += (long) i;
+        }
+
+        if (!(0 <= s && s <= e && e <= i))
+        {
+            cerr << "Unexpected error in FCD theory height estimation. Abort!" << endl
+                 << "gapStart=" << gapStart << ". gapEnd=" << gapEnd << ". position=" << position << ". centre=" << centreCoord_ << ". s=" << s << ". e=" << e << ". i=" << i << endl;
+            exit(1);
+        }
+
+        if (p < s - i)
+            return 1.0 * (p + i) / (s + i - e);
+        else if (p <= e - i)
+            return 1.0 * s / (s + i - e);
+        else if (p <= 0)
+            return 1.0 * (1.0 + 1.0 * p / (s + i - e));
+        else if (p < s)
+            return 1.0 * (1.0 - 1.0 * p / (s + i - e));
+        else if (p <= e)
+            return 1.0 - 1.0 * s / (s + i - e);
+        else
+            return -1.0 * (p - i) / (s + i - e);
+    }
+}
+
+
+// FCD error: the area between the observed (normalised) fragment coverage
+// plot and the theoretical plot, summed over offsets 1..maxInsert-2 on both
+// sides of the centre, divided by meanInsert and capped at 5.0.
+// Returns -1 when the plot has no fragments. When gapCorrect is true the
+// theoretical shape accounts for the gap [gapStart, gapEnd].
+double TrianglePlot::areaError(unsigned long maxInsert, unsigned long meanInsert, bool gapCorrect, unsigned long gapStart, unsigned long gapEnd)
+{
+    if (fragments_.size() == 0) return -1;
+    double area = 0;
+    vector<unsigned long> leftHeights(maxInsert + 1, 0);
+    vector<unsigned long> rightHeights(maxInsert + 1, 0);
+    unsigned long rightHeight = 0;
+
+    // histogram of fragment extents either side of the centre, clamped
+    // to maxInsert so long fragments cannot index out of bounds
+    for (multiset< pair<unsigned long, unsigned long> >:: iterator p = fragments_.begin(); p != fragments_.end(); p++)
+    {
+        rightHeights[min(p->second - centreCoord_, maxInsert)]++;
+        rightHeight++;
+        leftHeights[min(centreCoord_ - p->first, maxInsert)]++;
+    }
+
+    unsigned long leftHeight = fragments_.size();
+
+    for (unsigned long i = 1; i < maxInsert - 1; i++)
+    {
+        // number of fragments still covering offset i on each side
+        leftHeight -= leftHeights[i];
+        rightHeight -= rightHeights[i];
+        double theoryLeftHeight;
+        double theoryRightHeight;
+        if (gapCorrect)
+        {
+            theoryLeftHeight = getTheoryHeight(gapStart, gapEnd, centreCoord_ - i, meanInsert);
+            theoryRightHeight = getTheoryHeight(gapStart, gapEnd, centreCoord_ + i, meanInsert);
+        }
+        else
+        {
+            // without a gap the expected plot is a symmetric triangle of
+            // height 1, falling to 0 at one mean insert from the centre
+            theoryLeftHeight = theoryRightHeight = max(0.0, 1.0 - 1.0 * i / meanInsert);
+        }
+
+        area += abs(theoryLeftHeight - 1.0 * leftHeight / fragments_.size());
+        area += abs(theoryRightHeight - 1.0 * rightHeight / fragments_.size());
+    }
+
+    return min(5.0, area / meanInsert);
+}
+
+
+// Estimates the most likely length of the gap [gapStart, gapEnd]: candidate
+// lengths are tried by rescaling each fragment's end coordinate to the
+// resized gap and choosing the length that minimises the FCD error. A coarse
+// scan in steps of 10 is refined with a step of 1 around the best hit.
+// Results are returned through bestGapLength/bestError; the plot's state is
+// restored before returning. Aborts if the centre is outside the gap.
+void TrianglePlot::optimiseGap(unsigned long maxInsert, unsigned long meanInsert, unsigned long gapStart, unsigned long gapEnd, unsigned long& bestGapLength, double& bestError)
+{
+    // we can only do this if the centre coord of the plot is inside the gap
+    if (centreCoord_ < gapStart || centreCoord_ > gapEnd)
+    {
+        cerr << "Error in TrianglePlot::optimiseGap. coord=" << centreCoord_ << " is not in gap " << gapStart << "-" << gapEnd << endl;
+        exit(1);
+    }
+
+    // save the current state so it can be restored at the end
+    unsigned long originalCentreCoord = centreCoord_;
+    multiset< pair<unsigned long, unsigned long>, fragcomp> originalFragments(fragments_);
+    unsigned long originalTotalDepth_ = totalDepth_;
+    // bug fix: depthSum_ is a signed long; saving it in an unsigned long
+    // relied on implementation-defined wrap-around for negative values
+    long originalDepthSum = depthSum_;
+    unsigned long originalTotalFragLength = totalFragLength_;
+    bestGapLength = 0;
+    bestError = 999999;
+    unsigned long bigStep = 10;
+
+    // coarse scan over candidate gap lengths
+    for (unsigned long gapLength = 1; gapLength < maxInsert / 2; gapLength += bigStep)
+    {
+        clear();
+        unsigned long thisGapEnd = gapStart + gapLength - 1;
+        centreCoord_ = gapStart + (thisGapEnd - gapStart) / 2;
+
+        // shift each fragment's end coordinate to match the resized gap
+        for (multiset< pair<unsigned long, unsigned long> >:: iterator p = originalFragments.begin(); p != originalFragments.end(); p++)
+        {
+            pair<unsigned long, unsigned long> fragment(p->first, thisGapEnd + p->second - gapEnd);
+            add(fragment);
+        }
+
+        double error = areaError(maxInsert, meanInsert, true, gapStart, thisGapEnd);
+        if (error < bestError)
+        {
+            bestError = error;
+            bestGapLength = gapLength;
+        }
+    }
+
+    // fine scan of +/- one coarse step around the best coarse candidate
+    unsigned long windowStart = bestGapLength < bigStep ? 1 : bestGapLength - bigStep;
+
+    for (unsigned long gapLength = windowStart; gapLength < min(maxInsert / 2, bestGapLength + bigStep - 1); gapLength++)
+    {
+        clear();
+        unsigned long thisGapEnd = gapStart + gapLength - 1;
+        centreCoord_ = gapStart + (thisGapEnd - gapStart) / 2;
+
+        for (multiset< pair<unsigned long, unsigned long> >:: iterator p = originalFragments.begin(); p != originalFragments.end(); p++)
+        {
+            pair<unsigned long, unsigned long> fragment(p->first, thisGapEnd + p->second - gapEnd);
+            add(fragment);
+        }
+
+        double error = areaError(maxInsert, meanInsert, true, gapStart, thisGapEnd);
+
+        if (error < bestError)
+        {
+            bestError = error;
+            bestGapLength = gapLength;
+        }
+    }
+
+    // restore the original state of the plot
+    fragments_ = originalFragments;
+    centreCoord_ = originalCentreCoord;
+    totalDepth_ = originalTotalDepth_;
+    depthSum_ = originalDepthSum;
+    totalFragLength_ = originalTotalFragLength;
+}
+
diff --git a/src/trianglePlot.h b/src/trianglePlot.h
new file mode 100644
index 0000000..4011b05
--- /dev/null
+++ b/src/trianglePlot.h
@@ -0,0 +1,93 @@
+#ifndef TRIANGLEPLOT_H
+#define TRIANGLEPLOT_H
+
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <fstream>
+#include <map>
+#include <algorithm>
+#include <set>
+#include <vector>
+#include <cmath>
+
+#include "utils.h"
+
+using namespace std;
+
+class TrianglePlot
+{
+public:
+    // construct a triangle plot, centred at the given position
+    TrianglePlot(unsigned long centreCoord = 0);
+
+    // move the centre of the plot to the right n bases
+    void shift(unsigned long n);
+
+    // move the centre to the given position
+    void move(unsigned long n);
+
+    // Updates the plot, by adding the given fragment at <start, end>, if that
+    // fragment covers the centre of the plot.
+    // Returns true iff the fragment could be added, or ends before the centre
+    // (and so could never be added later).
+    bool add(pair<unsigned long, unsigned long>& fragment);
+
+    // adds all possible fragments from the set to the plot.
+    // Each fragment that is added gets deleted from the list
+    void add(multiset<pair<unsigned long, unsigned long> >& frags);
+
+    // Returns the mean of the plot. Returns zero if the plot has no fragments - you
+    // can check this with a call to depth().
+    double mean();
+
+    // returns the centre position of the plot
+    unsigned long centreCoord();
+
+    // returns the depth of the plot, i.e. number of fragments covering its position
+    unsigned long depth();
+
+    // returns the mean fragment length
+    double meanFragLength();
+
+    // empties the plot and sets the centre coord to n
+    void clear(unsigned long n = 0);
+
+    // Returns the y values of the plot in the form:
+    // y1 y2 ...
+    // y values are space separated. So if maxInsert was 5, would have 11 values
+    // for plot (for x an int in [-5,5]), e.g:
+    // 0 1 2 2 2 3 3 2 0 0 0
+    string toString(unsigned long maxInsert);
+
+    // fills the vectors with the plot's normalised left/right coverage profiles
+    void getHeights(unsigned long maxInsert, vector<double>& leftHeights, vector<double>& rightHeights);
+
+    // FCD error: area between observed and theoretical plots, capped at 5;
+    // returns -1 for an empty plot
+    double areaError(unsigned long maxInsert, unsigned long meanInsert, bool gapCorrect = false, unsigned long gapStart = 0, unsigned long gapEnd = 0);
+
+    // estimates the gap length minimising the FCD error; the plot's state is
+    // restored before returning
+    void optimiseGap(unsigned long maxInsert, unsigned long meanInsert, unsigned long gapStart, unsigned long gapEnd, unsigned long& bestGapLength, double& bestError);
+
+
+private:
+    unsigned long totalDepth_;      // number of covered bases summed over all fragments
+    long depthSum_;                 // sum of centre-relative offsets of all covered bases
+    unsigned long centreCoord_;     // genome coordinate the plot is centred on
+    unsigned long totalFragLength_; // sum of lengths of fragments covering the centre
+
+    // orders fragments by end coordinate, so expired fragments are cheap to pop
+    struct fragcomp
+    {
+        bool operator() (const pair<unsigned long, unsigned long>& i, const pair<unsigned long, unsigned long>& j) const
+        {
+            return i.second < j.second;
+        }
+    };
+
+    // stores fragments sorted by end position, coords zero-based relative to the genome, not this plot
+    multiset< pair<unsigned long, unsigned long>, fragcomp> fragments_;
+
+    // returns the theoretical height of a triangle plot at 'position', a gap from 'gapStart' to 'gapEnd'
+    double getTheoryHeight(unsigned long gapStart, unsigned long gapEnd, unsigned long position, unsigned long meanInsert);
+
+    // compare a pair by their second elements
+    bool comparePairBySecond(pair<unsigned long, unsigned long>& first, pair<unsigned long, unsigned long>& second);
+};
+
+#endif // TRIANGLEPLOT_H
diff --git a/src/utils.cpp b/src/utils.cpp
new file mode 100644
index 0000000..56d5aa2
--- /dev/null
+++ b/src/utils.cpp
@@ -0,0 +1,110 @@
+#include "utils.h"
+
+
+// Classifies the relative orientation of a BAM read pair.
+// Returns UNPAIRED if the read is unpaired or either mate is unmapped,
+// DIFF_CHROM if the mates map to different reference sequences, SAME if both
+// map to the same strand, otherwise INNIE (reads pointing towards each other)
+// or OUTTIE (pointing apart). 'reverse' swaps INNIE/OUTTIE, for libraries
+// whose reads point outwards.
+short getPairOrientation(BamAlignment& al, bool reverse)
+{
+    if (!(al.IsPaired() && al.IsMapped() && al.IsMateMapped()))
+    {
+        return UNPAIRED;
+    }
+    else if (al.RefID != al.MateRefID)
+    {
+        return DIFF_CHROM;
+    }
+    else if (al.IsReverseStrand() == al.IsMateReverseStrand())
+    {
+        return SAME;
+    }
+    // leftmost read forward and rightmost read reverse => pointing inwards
+    else if ((al.Position <= al.MatePosition) == al.IsMateReverseStrand())
+    {
+        return reverse ? OUTTIE : INNIE;
+    }
+    else if ((al.Position > al.MatePosition) == al.IsMateReverseStrand())
+    {
+        return reverse ? INNIE : OUTTIE;
+    }
+    // logically impossible for this to happen...
+    else
+    {
+        cerr << "Unexpected error in getPairOrientation(). Aborting." << endl;
+        exit(1);
+    }
+}
+
+
+// Fills gaps[sequence_name] with zero-based inclusive (start, end) pairs read
+// from a bgzipped, tabix-indexed TSV of sequence_name<TAB>start<TAB>end
+// (1-based coordinates in the file). Aborts on a malformed line.
+void loadGaps(string fname, map<string, list<pair<unsigned long, unsigned long > > >& gaps)
+{
+    Tabix ti(fname);
+    string line;
+
+    while (ti.getNextLine(line))
+    {
+        vector<string> data;
+        split(line, '\t', data);
+        if (data.size() < 3)
+        {
+            cerr << "Error: bad line in gaps file '" << fname << "': " << line << endl;
+            exit(1);
+        }
+        // strtoul, not atoi: coordinates are unsigned long and can exceed
+        // INT_MAX on very large scaffolds
+        gaps[data[0]].push_back( make_pair(strtoul(data[1].c_str(), NULL, 10) - 1, strtoul(data[2].c_str(), NULL, 10) - 1) );
+    }
+}
+
+
+// Splits s on delim, overwriting elems with the resulting fields.
+// Note getline semantics: a trailing delimiter yields no empty final field,
+// and an empty input yields no fields at all.
+void split(const string &s, char delim, vector<string> &elems)
+{
+    elems.clear();
+    stringstream tokens(s);
+
+    for (string field; getline(tokens, field, delim); )
+    {
+        elems.push_back(field);
+    }
+}
+
+
+// Runs cmd through the shell, aborting the program if no shell is available
+// or if the command exits with a non-zero status.
+void systemCall(string cmd)
+{
+    // system(NULL) is the portable probe for shell availability
+    if (!system(NULL))
+    {
+        cerr << "Error in system call. Shell not available" << endl;
+        exit(1);
+    }
+
+    int retcode = system(cmd.c_str());
+
+    if (retcode)
+    {
+        cerr << "Error in system call. Error code=" << retcode << ". Command was:"
+             << endl << cmd << endl;
+        exit(1);
+    }
+}
+
+
+// Comparator for (name, length) pairs: true when p1 is longer than p2, so
+// sorting with it puts the longest sequences first.
+bool sortByLength(const pair< string, unsigned long>& p1, const pair< string, unsigned long>& p2)
+{
+    return p1.second > p2.second;
+}
+
+
+// Fills 'seqs' with (name, length) pairs read from a samtools .fai index
+// (tab separated: name, length, ...), sorted by decreasing length.
+// Aborts if the file cannot be opened or a line is malformed.
+void orderedSeqsFromFai(string faiFile, vector<pair< string, unsigned long> >& seqs)
+{
+    ifstream ifs(faiFile.c_str());
+    if (!ifs.good())
+    {
+        cerr << "Error opening file '" << faiFile << "'" << endl;
+        exit(1);
+    }
+
+    string line;
+
+    while (getline(ifs, line))
+    {
+        vector<string> tmp;
+        split(line, '\t', tmp);
+        if (tmp.size() < 2)
+        {
+            cerr << "Error: bad line in fai file '" << faiFile << "': " << line << endl;
+            exit(1);
+        }
+        // strtoul, not atoi: lengths are unsigned long and can exceed INT_MAX
+        seqs.push_back(make_pair(tmp[0], strtoul(tmp[1].c_str(), NULL, 10)));
+    }
+
+    ifs.close();
+    sort(seqs.begin(), seqs.end(), sortByLength);
+}
+
diff --git a/src/utils.h b/src/utils.h
new file mode 100644
index 0000000..3d5412d
--- /dev/null
+++ b/src/utils.h
@@ -0,0 +1,50 @@
+#ifndef UTILS_H
+#define UTILS_H
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <list>
+#include <string>
+#include <cstring>
+#include <vector>
+#include <algorithm>
+
+#include "api/BamMultiReader.h"
+#include "api/BamReader.h"
+#include "tabix/tabix.hpp"
+
+// Read-pair orientation codes returned by getPairOrientation()
+const short INNIE = 1;      // reads point towards each other (standard paired-end)
+const short OUTTIE = 2;     // reads point away from each other
+const short SAME = 3;       // both reads map to the same strand
+const short DIFF_CHROM = 4; // mates map to different reference sequences
+const short UNPAIRED = 5;   // unpaired read, or read/mate unmapped
+
+
+using namespace BamTools;
+// NOTE(review): a using-directive in a header leaks into every includer;
+// confirm nothing clashes before tightening this.
+using namespace std;
+
+// Returns the orientation of read pair in bam alignment, as
+// one of the constants INNIE, OUTTIE, SAME, DIFF_CHROM, UNPAIRED.
+// Setting reverse=true will swap whether INNIE or OUTTIE are returned:
+// useful if you have reads pointing outwards instead of in.
+short getPairOrientation(BamAlignment& al, bool reverse=false);
+
+// fills each list with (start, end) positions of gaps in the input file, of the form:
+// sequence_name<TAB>start<TAB>end
+// and this file is expected to be bgzipped and tabixed.
+// Coordinates in the file are 1-based; the stored pairs are zero-based.
+// Map key = sequence_name
+void loadGaps(string fname, map<string, list<pair<unsigned long, unsigned long > > >& gaps);
+
+// splits the string on delimiter, filling vector with the result
+void split(const string& s, char delim, vector<string>& elems);
+
+// Does a system call. Dies if command returns non-zero error code
+void systemCall(string cmd);
+
+// Fills vector with sequence names from fai file, in decreasing size order
+void orderedSeqsFromFai(string faiFile, vector<pair< string, unsigned long> >& seqs);
+
+
+// comparator: true when p1's length is greater than p2's (longest-first sort)
+bool sortByLength(const pair< string, unsigned long>& p1, const pair< string, unsigned long>& p2);
+
+#endif // UTILS_H
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/reapr.git
More information about the debian-med-commit
mailing list