[med-svn] [nanook] 01/02: New upstream version 1.26+dfsg
Andreas Tille
tille at debian.org
Fri Sep 1 13:01:58 UTC 2017
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository nanook.
commit c0318156f21b8879a586c9dd1039247c443af7ba
Author: Andreas Tille <tille at debian.org>
Date: Fri Sep 1 15:00:54 2017 +0200
New upstream version 1.26+dfsg
---
.gitignore | 8 +
Dockerfile | 26 +
HDF5License.txt | 92 +++
LICENSE | 674 ++++++++++++++++
README.md | 5 +
bin/get_contig_stats.pl | 253 ++++++
bin/nanook | 21 +
bin/nanook_get_read_stats.pl | 59 ++
bin/nanook_get_tracking.pl | 157 ++++
bin/nanook_plot_comparison.R | 100 +++
bin/nanook_plot_comparison_reference.R | 215 +++++
bin/nanook_plot_lengths.R | 87 ++
bin/nanook_plot_reference.R | 475 +++++++++++
bin/nanook_split_fasta | 83 ++
bin/slurmit | 62 ++
src/nanook/Alignment.java | 146 ++++
src/nanook/AlignmentFileParser.java | 83 ++
src/nanook/AlignmentFileStats.java | 24 +
src/nanook/AlignmentInfo.java | 178 ++++
src/nanook/AlignmentMerger.java | 411 ++++++++++
src/nanook/AlignmentsTableFile.java | 146 ++++
src/nanook/BLASRParser.java | 50 ++
src/nanook/BWAParser.java | 71 ++
src/nanook/BlastHandler.java | 202 +++++
src/nanook/BlastMerger.java | 86 ++
src/nanook/CIGARString.java | 285 +++++++
src/nanook/ComparisonReportWriter.java | 196 +++++
src/nanook/DirectoryWatcher.java | 184 +++++
src/nanook/DirectoryWatcherNative.java | 219 +++++
src/nanook/Fast5File.java | 492 +++++++++++
src/nanook/FastAQBlastMerger.java | 138 ++++
src/nanook/FastAQFile.java | 103 +++
src/nanook/FileWatcher.java | 177 ++++
src/nanook/FileWatcherItem.java | 31 +
src/nanook/GCCounter.java | 93 +++
src/nanook/GraphMapParser.java | 75 ++
src/nanook/KmerAbundance.java | 59 ++
src/nanook/KmerMotifStatistic.java | 211 +++++
src/nanook/KmerTable.java | 75 ++
src/nanook/LastParser.java | 70 ++
src/nanook/MAFAlignmentLine.java | 98 +++
src/nanook/MAFParser.java | 131 +++
src/nanook/MarginAlignParser.java | 71 ++
src/nanook/MergedFastAQFile.java | 45 ++
src/nanook/MotifStatistics.java | 217 +++++
src/nanook/NanoOK.java | 428 ++++++++++
src/nanook/NanoOKLog.java | 76 ++
src/nanook/NanoOKOptions.java | 1389 ++++++++++++++++++++++++++++++++
src/nanook/OverallStats.java | 39 +
src/nanook/ParserRunnable.java | 163 ++++
src/nanook/ProcessLogger.java | 202 +++++
src/nanook/RGraphPlotter.java | 145 ++++
src/nanook/RGraphRunnable.java | 69 ++
src/nanook/ReadAligner.java | 226 ++++++
src/nanook/ReadExtractor.java | 189 +++++
src/nanook/ReadExtractorRunnable.java | 83 ++
src/nanook/ReadFileMerger.java | 95 +++
src/nanook/ReadLengthsSummaryFile.java | 64 ++
src/nanook/ReadParser.java | 123 +++
src/nanook/ReadProcessor.java | 263 ++++++
src/nanook/ReadProcessorRunnable.java | 369 +++++++++
src/nanook/ReadSet.java | 353 ++++++++
src/nanook/ReadSetStats.java | 625 ++++++++++++++
src/nanook/ReadStats.java | 6 +
src/nanook/ReferenceSequence.java | 190 +++++
src/nanook/ReferenceSequenceStats.java | 548 +++++++++++++
src/nanook/References.java | 345 ++++++++
src/nanook/SAMParser.java | 236 ++++++
src/nanook/SampleChecker.java | 281 +++++++
src/nanook/SampleComparer.java | 154 ++++
src/nanook/SampleReportWriter.java | 822 +++++++++++++++++++
src/nanook/SequenceCoverage.java | 139 ++++
src/nanook/SequenceLogo.java | 133 +++
src/nanook/SequenceReader.java | 319 ++++++++
src/nanook/SystemCommandRunnable.java | 93 +++
src/nanook/WatcherLog.java | 65 ++
src/nanook/WatcherRunnable.java | 106 +++
77 files changed, 15022 insertions(+)
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e4fecd6
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,8 @@
+build
+build.xml
+manifest.mf
+nbproject
+dist/README.TXT
+
+
+install-packages.R
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..19ca29c
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,26 @@
+# NanoOK Dockerfile
+FROM ubuntu:14.04
+MAINTAINER Richard Leggett <richard.leggett at earlham.ac.uk>
+
+RUN echo "deb http://cran.cnr.berkeley.edu/bin/linux/ubuntu trusty/" >> /etc/apt/sources.list
+RUN apt-get update
+RUN apt-get install -y --force-yes r-base
+RUN apt-get install -y --force-yes r-cran-ggplot2
+RUN apt-get install -y hdf5-tools
+RUN apt-get install -y texlive
+RUN apt-get install -y texlive-latex-extra
+RUN apt-get install -y default-jre
+RUN apt-get install -y git
+ADD http://last.cbrc.jp/last-761.zip /usr/
+RUN cd /usr ; unzip last-761 ; cd last-761 ; make ; make install
+RUN cd /usr ; git clone https://github.com/lh3/bwa.git
+RUN cd /usr/bwa ; make ; cp bwa /usr/local/bin
+RUN cd /usr ; git clone https://github.com/TGAC/NanoOK
+ENV NANOOK_DIR="/usr/NanoOK"
+RUN echo "export PATH=/usr/NanoOK/bin:${PATH}" >> ~/.bashrc
+RUN Rscript -e "install.packages('ggplot2', repos='https://cran.ma.imperial.ac.uk/')"
+RUN Rscript -e "install.packages('ggmap', repos='https://cran.ma.imperial.ac.uk/')"
+RUN Rscript -e "install.packages('plyr', repos='https://cran.ma.imperial.ac.uk/')"
+RUN Rscript -e "install.packages('scales', repos='https://cran.ma.imperial.ac.uk/')"
+RUN Rscript -e "install.packages('gridExtra', repos='https://cran.ma.imperial.ac.uk/')"
+RUN Rscript -e "install.packages('reshape', repos='https://cran.ma.imperial.ac.uk/')"
\ No newline at end of file
diff --git a/HDF5License.txt b/HDF5License.txt
new file mode 100644
index 0000000..b6eee1e
--- /dev/null
+++ b/HDF5License.txt
@@ -0,0 +1,92 @@
+
+Copyright Notice and License Terms for
+HDF5 (Hierarchical Data Format 5) Software Library and Utilities
+-----------------------------------------------------------------------------
+
+HDF5 (Hierarchical Data Format 5) Software Library and Utilities
+Copyright 2006-2015 by The HDF Group.
+
+NCSA HDF5 (Hierarchical Data Format 5) Software Library and Utilities
+Copyright 1998-2006 by the Board of Trustees of the University of Illinois.
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted for any purpose (including commercial purposes)
+provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice,
+ this list of conditions, and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions, and the following disclaimer in the documentation
+ and/or materials provided with the distribution.
+
+3. In addition, redistributions of modified forms of the source or binary
+ code must carry prominent notices stating that the original code was
+ changed and the date of the change.
+
+4. All publications or advertising materials mentioning features or use of
+ this software are asked, but not required, to acknowledge that it was
+ developed by The HDF Group and by the National Center for Supercomputing
+ Applications at the University of Illinois at Urbana-Champaign and
+ credit the contributors.
+
+5. Neither the name of The HDF Group, the name of the University, nor the
+ name of any Contributor may be used to endorse or promote products derived
+ from this software without specific prior written permission from
+ The HDF Group, the University, or the Contributor, respectively.
+
+DISCLAIMER:
+THIS SOFTWARE IS PROVIDED BY THE HDF GROUP AND THE CONTRIBUTORS
+"AS IS" WITH NO WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED. In no
+event shall The HDF Group or the Contributors be liable for any damages
+suffered by the users arising out of the use of this software, even if
+advised of the possibility of such damage.
+
+-----------------------------------------------------------------------------
+-----------------------------------------------------------------------------
+
+Contributors: National Center for Supercomputing Applications (NCSA) at
+the University of Illinois, Fortner Software, Unidata Program Center (netCDF),
+The Independent JPEG Group (JPEG), Jean-loup Gailly and Mark Adler (gzip),
+and Digital Equipment Corporation (DEC).
+
+-----------------------------------------------------------------------------
+
+Portions of HDF5 were developed with support from the Lawrence Berkeley
+National Laboratory (LBNL) and the United States Department of Energy
+under Prime Contract No. DE-AC02-05CH11231.
+
+-----------------------------------------------------------------------------
+
+Portions of HDF5 were developed with support from the University of
+California, Lawrence Livermore National Laboratory (UC LLNL).
+The following statement applies to those portions of the product and must
+be retained in any redistribution of source code, binaries, documentation,
+and/or accompanying materials:
+
+ This work was partially produced at the University of California,
+ Lawrence Livermore National Laboratory (UC LLNL) under contract
+ no. W-7405-ENG-48 (Contract 48) between the U.S. Department of Energy
+ (DOE) and The Regents of the University of California (University)
+ for the operation of UC LLNL.
+
+ DISCLAIMER:
+ This work was prepared as an account of work sponsored by an agency of
+ the United States Government. Neither the United States Government nor
+ the University of California nor any of their employees, makes any
+ warranty, express or implied, or assumes any liability or responsibility
+ for the accuracy, completeness, or usefulness of any information,
+ apparatus, product, or process disclosed, or represents that its use
+ would not infringe privately- owned rights. Reference herein to any
+ specific commercial products, process, or service by trade name,
+ trademark, manufacturer, or otherwise, does not necessarily constitute
+ or imply its endorsement, recommendation, or favoring by the United
+ States Government or the University of California. The views and
+ opinions of authors expressed herein do not necessarily state or reflect
+ those of the United States Government or the University of California,
+ and shall not be used for advertising or product endorsement purposes.
+-----------------------------------------------------------------------------
+
+
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..70566f2
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,674 @@
+GNU GENERAL PUBLIC LICENSE
+ Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+ The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works. By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users. We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors. You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+ To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights. Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received. You must make sure that they, too, receive
+or can get the source code. And you must show them these terms so they
+know their rights.
+
+ Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+ For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software. For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+ Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so. This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software. The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable. Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products. If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+ Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary. To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ TERMS AND CONDITIONS
+
+ 0. Definitions.
+
+ "This License" refers to version 3 of the GNU General Public License.
+
+ "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+ "The Program" refers to any copyrightable work licensed under this
+License. Each licensee is addressed as "you". "Licensees" and
+"recipients" may be individuals or organizations.
+
+ To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy. The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+ A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+ To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy. Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+ To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies. Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+ An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License. If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+ 1. Source Code.
+
+ The "source code" for a work means the preferred form of the work
+for making modifications to it. "Object code" means any non-source
+form of a work.
+
+ A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+ The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form. A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+ The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities. However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work. For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+ The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+ The Corresponding Source for a work in source code form is that
+same work.
+
+ 2. Basic Permissions.
+
+ All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met. This License explicitly affirms your unlimited
+permission to run the unmodified Program. The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work. This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+ You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force. You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright. Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+ Conveying under any other circumstances is permitted solely under
+the conditions stated below. Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+ No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+ When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+ 4. Conveying Verbatim Copies.
+
+ You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+ You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+ 5. Conveying Modified Source Versions.
+
+ You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+ a) The work must carry prominent notices stating that you modified
+ it, and giving a relevant date.
+
+ b) The work must carry prominent notices stating that it is
+ released under this License and any conditions added under section
+ 7. This requirement modifies the requirement in section 4 to
+ "keep intact all notices".
+
+ c) You must license the entire work, as a whole, under this
+ License to anyone who comes into possession of a copy. This
+ License will therefore apply, along with any applicable section 7
+ additional terms, to the whole of the work, and all its parts,
+ regardless of how they are packaged. This License gives no
+ permission to license the work in any other way, but it does not
+ invalidate such permission if you have separately received it.
+
+ d) If the work has interactive user interfaces, each must display
+ Appropriate Legal Notices; however, if the Program has interactive
+ interfaces that do not display Appropriate Legal Notices, your
+ work need not make them do so.
+
+ A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit. Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+ 6. Conveying Non-Source Forms.
+
+ You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+ a) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by the
+ Corresponding Source fixed on a durable physical medium
+ customarily used for software interchange.
+
+ b) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by a
+ written offer, valid for at least three years and valid for as
+ long as you offer spare parts or customer support for that product
+ model, to give anyone who possesses the object code either (1) a
+ copy of the Corresponding Source for all the software in the
+ product that is covered by this License, on a durable physical
+ medium customarily used for software interchange, for a price no
+ more than your reasonable cost of physically performing this
+ conveying of source, or (2) access to copy the
+ Corresponding Source from a network server at no charge.
+
+ c) Convey individual copies of the object code with a copy of the
+ written offer to provide the Corresponding Source. This
+ alternative is allowed only occasionally and noncommercially, and
+ only if you received the object code with such an offer, in accord
+ with subsection 6b.
+
+ d) Convey the object code by offering access from a designated
+ place (gratis or for a charge), and offer equivalent access to the
+ Corresponding Source in the same way through the same place at no
+ further charge. You need not require recipients to copy the
+ Corresponding Source along with the object code. If the place to
+ copy the object code is a network server, the Corresponding Source
+ may be on a different server (operated by you or a third party)
+ that supports equivalent copying facilities, provided you maintain
+ clear directions next to the object code saying where to find the
+ Corresponding Source. Regardless of what server hosts the
+ Corresponding Source, you remain obligated to ensure that it is
+ available for as long as needed to satisfy these requirements.
+
+ e) Convey the object code using peer-to-peer transmission, provided
+ you inform other peers where the object code and Corresponding
+ Source of the work are being offered to the general public at no
+ charge under subsection 6d.
+
+ A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+ A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling. In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage. For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product. A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+ "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source. The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+ If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information. But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+ The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed. Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+ Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+ 7. Additional Terms.
+
+ "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law. If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+ When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it. (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.) You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+ Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+ a) Disclaiming warranty or limiting liability differently from the
+ terms of sections 15 and 16 of this License; or
+
+ b) Requiring preservation of specified reasonable legal notices or
+ author attributions in that material or in the Appropriate Legal
+ Notices displayed by works containing it; or
+
+ c) Prohibiting misrepresentation of the origin of that material, or
+ requiring that modified versions of such material be marked in
+ reasonable ways as different from the original version; or
+
+ d) Limiting the use for publicity purposes of names of licensors or
+ authors of the material; or
+
+ e) Declining to grant rights under trademark law for use of some
+ trade names, trademarks, or service marks; or
+
+ f) Requiring indemnification of licensors and authors of that
+ material by anyone who conveys the material (or modified versions of
+ it) with contractual assumptions of liability to the recipient, for
+ any liability that these contractual assumptions directly impose on
+ those licensors and authors.
+
+ All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10. If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term. If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+ If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+ Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+ 8. Termination.
+
+ You may not propagate or modify a covered work except as expressly
+provided under this License. Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+ However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+ Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+ Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License. If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+ 9. Acceptance Not Required for Having Copies.
+
+ You are not required to accept this License in order to receive or
+run a copy of the Program. Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance. However,
+nothing other than this License grants you permission to propagate or
+modify any covered work. These actions infringe copyright if you do
+not accept this License. Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+ 10. Automatic Licensing of Downstream Recipients.
+
+ Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License. You are not responsible
+for enforcing compliance by third parties with this License.
+
+ An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations. If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+ You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License. For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+ 11. Patents.
+
+ A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based. The
+work thus licensed is called the contributor's "contributor version".
+
+ A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version. For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+ In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement). To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+ If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients. "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+ If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+ A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License. You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+ Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+ 12. No Surrender of Others' Freedom.
+
+ If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all. For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+ 13. Use with the GNU Affero General Public License.
+
+ Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work. The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+ 14. Revised Versions of this License.
+
+ The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+ Each version is given a distinguishing version number. If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation. If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+ If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+ Later license versions may give you additional or different
+permissions. However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+ 15. Disclaimer of Warranty.
+
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+ 16. Limitation of Liability.
+
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+ 17. Interpretation of Sections 15 and 16.
+
+ If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ {one line to give the program's name and a brief idea of what it does.}
+ Copyright (C) {year} {name of author}
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+ If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+ {project} Copyright (C) {year} {fullname}
+ This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+ You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<http://www.gnu.org/licenses/>.
+
+ The GNU General Public License does not permit incorporating your program
+into proprietary programs. If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License. But first, please read
+<http://www.gnu.org/philosophy/why-not-lgpl.html>.
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..9a5aa1d
--- /dev/null
+++ b/README.md
@@ -0,0 +1,5 @@
+![NanoOK](https://documentation.tgac.ac.uk/download/thumbnails/7209095/nanook-01.jpg?version=1&modificationDate=1447675247000&api=v2)
+
+Full documentation can be found at https://documentation.tgac.ac.uk/display/NANOOK/NanoOK
+
+Contact richard.leggett at earlham.ac.uk for more information or for comments/bug reports.
diff --git a/bin/get_contig_stats.pl b/bin/get_contig_stats.pl
new file mode 100755
index 0000000..11d9f27
--- /dev/null
+++ b/bin/get_contig_stats.pl
@@ -0,0 +1,253 @@
+#!/usr/bin/perl -w
+
+# Script: get_contig_stats.pl
+# Purpose: Calculate mean, N50 and related stats for a FASTA/FASTQ file
+# Author: Richard Leggett
+
+use warnings;
+use strict;
+use Getopt::Long;
+
+my $inputfile;
+my $longerthan;
+my %contig_lengths;
+my $type = 0;
+my $id = "";
+my $contig_length = 0;
+my $total_length = 0;
+my $shortest;
+my $longest;
+my $cumulative = 0;
+my $counter = 0;
+my $n50;
+my $n50count;
+my $n90;
+my $n90count;
+my @lengths;
+my @length_counts;
+my $help_requested;
+my $is_fasta;
+my $is_fastq;
+my $histogram;
+my $length_file;
+my $length_fh;
+my %hist_counts;
+
+&GetOptions(
+'i|input:s' => \$inputfile,
+'g|histogram:s' => \$histogram,
+'h|help' => \$help_requested,
+'l|longerthan:s' => \$longerthan,
+'a|fasta' => \$is_fasta,
+'q|fastq' => \$is_fastq,
+'r|lengthfile:s' => \$length_file
+);
+
+if (defined $help_requested) {
+ print "\nGet contig stats on a FASTA file.\n\n";
+ print "Usage: get_contig_stats.pl <-f filename> [-l lengths]\n\n";
+ print "Options:\n";
+ print " -i | -input input file\n";
+ print " -a | -fasta input file is FASTA (default)\n";
+ print " -g | -histogram filename to output length histogram\n";
+ print " -q | -fastq input file is FASTQ\n";
+ print " -r | -lengthfile filename to output lengths\n";
+ print " -l | -longerthan list of comma separated lengths for which you wish\n";
+ print " to know number of contigs >= to - eg. 76,151\n\n";
+
+ exit;
+}
+
+die "You must specify -input or -i\n" if not defined $inputfile;
+
+$is_fasta = 1 if not defined $is_fastq;
+
+if (defined $length_file) {
+ open($length_fh, ">".$length_file) or die "Can't open $length_file\n";
+}
+
+if (defined $longerthan) {
+ @lengths = split(/,/, $longerthan);
+ for (my $i=0; $i<@lengths; $i++) {
+ $length_counts[$i] = 0;
+ }
+}
+
+if (defined $is_fastq) {
+ read_fastq($inputfile);
+} else {
+ read_fasta($inputfile);
+}
+
+if (defined $length_fh) {
+ close($length_fh);
+}
+
+foreach $id (sort {$contig_lengths{$b} <=> $contig_lengths{$a}} keys %contig_lengths)
+{
+ my $contig_length = $contig_lengths{$id};
+ $cumulative += $contig_length;
+ $counter++;
+
+ if (not defined $n50) {
+ if ($cumulative >= ($total_length * 0.5)) {
+ $n50 = $contig_length;
+ $n50count = $counter;
+ }
+ }
+
+ if (not defined $n90) {
+ if ($cumulative >= ($total_length * 0.9)) {
+ $n90 = $contig_length;
+ $n90count = $counter;
+ }
+ }
+}
+
+my $mean = $cumulative / $counter;
+
+my $header_string="NumContigs\tTotalSum\tMeanLength\tShortest\tLongest\tN50Length\tN50Count\tN90Length\tN90Count";
+
+print "NumContigs:\t", $counter, "\n";
+print "TotalSum:\t", $cumulative, "\n";
+printf "MeanLength:\t%.2f\n", $mean;
+print "Shortest:\t", $shortest, "\n";
+print "Longest:\t", $longest, "\n";
+print "N50Length:\t", $n50, "\n";
+print "N50Count:\t", $n50count, "\n";
+print "N90Length:\t", $n90, "\n";
+print "N90Count:\t", $n90count, "\n";
+
+if (defined $longerthan) {
+ for (my $i=0; $i<@lengths; $i++) {
+ $header_string = $header_string."\tGE".$lengths[$i]."Count";
+ print "GE",$lengths[$i],"Count:\t", $length_counts[$i], "\n";
+ }
+}
+
+print "Headings:\t", $header_string, "\n";
+print "AllFields:\t", $counter, "\t", $cumulative, "\t";
+printf "%.2f\t", $mean;
+print $shortest, "\t", $longest, "\t", $n50, "\t", $n50count, "\t", $n90, "\t", $n90count;
+
+if (defined $longerthan) {
+ for (my $i=0; $i<@lengths; $i++) {
+ print "\t", $length_counts[$i];
+ }
+}
+
+print "\n";
+
+if (defined $histogram) {
+ output_histogram();
+}
+
+
+sub read_fasta
+{
+ my $filename = $_[0];
+ open(INPUTFILE, $filename) or die "Can't open $filename\n";
+
+
+ while(<INPUTFILE>) {
+ chomp(my $line = $_);
+
+ if ($line =~ /^>(\S+)/) {
+ if ($contig_length > 0) {
+ store_length($id, $contig_length);
+ }
+
+ $contig_length = 0;
+ $id = $1;
+ } else {
+ $contig_length += length($line);
+ }
+ }
+
+ if ($contig_length > 0) {
+ store_length($id, $contig_length);
+ }
+
+ close(INPUTFILE);
+}
+
+sub read_fastq
+{
+ my $filename = $_[0];
+ open(INPUTFILE, $filename) or die "Can't open $filename\n";
+
+ while(<INPUTFILE>) {
+ chomp(my $seq_header = $_);
+ chomp(my $sequence = <INPUTFILE>);
+ chomp(my $qual_header = <INPUTFILE>);
+ chomp(my $qualities = <INPUTFILE>);
+ my @fields = split(/ /, $seq_header);
+ my $id = substr $fields[0], 1;
+ my $contig_length = length($sequence);
+
+ store_length($id, $contig_length);
+ }
+
+ close(INPUTFILE);
+}
+
+sub store_length
+{
+ my $id = $_[0];
+ my $contig_length = $_[1];
+
+ if (defined $contig_lengths{$id}) {
+ my $new_id = $id;
+ my $counter = 0;
+ do {
+ $counter++;
+ $new_id = $id."_duplicate_".$counter;
+ } while (defined $contig_lengths{$new_id});
+ $id = $new_id;
+ print "Found duplicate ID - used new ID $id\n";
+ }
+
+ $contig_lengths{$id} = $contig_length;
+ $total_length += $contig_length;
+
+ if ((not defined $longest) || ($contig_length > $longest)) {
+ $longest = $contig_length;
+ }
+
+ if ((not defined $shortest) || ($contig_length < $shortest)) {
+ $shortest = $contig_length;
+ }
+
+ if (defined $longerthan) {
+ for (my $i=0; $i<@lengths; $i++) {
+ if ($contig_length >= $lengths[$i]) {
+ $length_counts[$i]++;
+ }
+ }
+ }
+
+ if (defined $hist_counts{$contig_length}) {
+ $hist_counts{$contig_length}++;
+ } else {
+ $hist_counts{$contig_length}=1;
+ }
+
+ if (defined $length_fh) {
+ print $length_fh $id, "\t", $contig_length, "\n";
+ }
+}
+
+sub output_histogram
+{
+ open(my $output_fh, ">".$histogram) or die "Can't open $histogram\n";
+
+ for (my $i=1; $i<=$longest; $i++) {
+ if (defined $hist_counts{$i}) {
+ print $output_fh $i, "\t", $hist_counts{$i}, "\n";
+ } else {
+ print $output_fh $i, "\t0\n";
+ }
+ }
+
+ close($output_fh);
+}
diff --git a/bin/nanook b/bin/nanook
new file mode 100755
index 0000000..16fe060
--- /dev/null
+++ b/bin/nanook
@@ -0,0 +1,21 @@
+#!/bin/sh
+
+JAVA_ARGS="-Xmx2048m"
+
+if [ -z "$NANOOK_DIR" ] ; then
+ echo "Error: You must set NANOOK_DIR before running."
+ exit 1
+fi
+
+JARFILE=${NANOOK_DIR}/dist/NanoOK.jar
+
+if [ ! -f ${JARFILE} ] ; then
+ echo "Error: Can't find NanoOK.jar - it needs to be inside the dist subdirectory of the directory pointed to by NANOOK_DIR which is currently ${NANOOK_DIR}"
+ exit 1
+fi
+
+# If your library path (DYLD_LIBRARY_PATH on MAC) doesn't include HDF5 libraries,
+# you can manually set it here by adding a -Djava.library.path=/path/to/lib/dir
+# to the java command...
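+# A minimal sketch of that variant (the /usr/local/hdf5/lib path is purely
+# illustrative - substitute the directory containing your HDF5 libraries):
+#   exec java ${JAVA_ARGS} -Djava.library.path=/usr/local/hdf5/lib -jar ${JARFILE} "$@"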
+
+exec java ${JAVA_ARGS} -jar ${JARFILE} "$@"
diff --git a/bin/nanook_get_read_stats.pl b/bin/nanook_get_read_stats.pl
new file mode 100755
index 0000000..06be937
--- /dev/null
+++ b/bin/nanook_get_read_stats.pl
@@ -0,0 +1,59 @@
+#!/usr/bin/perl
+#
+# Program: nanook_get_read_stats.pl
+# Author: Richard Leggett
+# Contact: richard.leggett at tgac.ac.uk
+
+use strict;
+use warnings;
+use Getopt::Long;
+
+my $sample;
+my $help_requested;
+my $basedir="/Users/leggettr/Documents/Projects/Nanopore/";
+
+&GetOptions(
+'b|basedir:s' => \$basedir,
+'s|sample:s' => \$sample,
+'h|help' => \$help_requested
+);
+
+print "\nnanotools_get_read_stats\n\n";
+
+if (defined $help_requested) {
+ print "Get stats on read lengths.\n\n";
+ print "Usage: nanotools_get_read_stats.pl <-s sample> [-b directory]\n\n";
+ print "Options:\n";
+ print " -s | -sample Sample name\n";
+ print " -b | -basedir Base directory containing all sample directories\n";
+ print "\n";
+
+ exit;
+}
+
+die "You must specify a sample name" if not defined $sample;
+
+print "Base directory: $basedir\n";
+print "Sample: $sample\n";
+
+print "Merging template reads\n";
+system("find ${basedir}/${sample}/fasta/Template -name '*BaseCalled_Template.fasta' | xargs cat > ${basedir}/${sample}/fasta/all_Template.fasta");
+print "Merging complement reads\n";
+system("find ${basedir}/${sample}/fasta/Complement -name '*BaseCalled_Complement.fasta' | xargs cat > ${basedir}/${sample}/fasta/all_Complement.fasta");
+print "Merging 2D reads\n";
+system("find ${basedir}/${sample}/fasta/2D -name '*BaseCalled_2D.fasta' | xargs cat > ${basedir}/${sample}/fasta/all_2D.fasta");
+
+print "Generating stats for template reads\n";
+system("get_contig_stats.pl -i ${basedir}/${sample}/fasta/all_Template.fasta -a -g ${basedir}/${sample}/analysis/all_Template_fasta_hist.txt -r ${basedir}/${sample}/analysis/all_Template_lengths.txt -l 500,1000,1500,2000,2500,3000,3500,4000,4500,5000 > ${basedir}/${sample}/analysis/all_Template_stats.txt");
+print "Generating stats for complement reads\n";
+system("get_contig_stats.pl -i ${basedir}/${sample}/fasta/all_Complement.fasta -a -g ${basedir}/${sample}/analysis/all_Complement_fasta_hist.txt -r ${basedir}/${sample}/analysis/all_Complement_lengths.txt -l 500,1000,1500,2000,2500,3000,3500,4000,4500,5000 > ${basedir}/${sample}/analysis/all_Complement_stats.txt");
+print "Generating stats for 2D reads\n";
+system("get_contig_stats.pl -i ${basedir}/${sample}/fasta/all_2D.fasta -a -g ${basedir}/${sample}/analysis/all_2D_fasta_hist.txt -r ${basedir}/${sample}/analysis/all_2D_lengths.txt -l 500,1000,1500,2000,2500,3000,3500,4000,4500,5000 > ${basedir}/${sample}/analysis/all_2D_stats.txt");
+
+system("echo \"\" >> ${basedir}/${sample}/summary.txt");
+system("cat ${basedir}/${sample}/all_Template_stats.txt | grep 'Headings:' | sed 's/Headings:/ ReadType/' >> ${basedir}/${sample}/summary.txt");
+system("cat ${basedir}/${sample}/all_Template_stats.txt | grep 'AllFields:' | sed 's/AllFields:/ Template/' >> ${basedir}/${sample}/summary.txt");
+system("cat ${basedir}/${sample}/all_Complement_stats.txt | grep 'AllFields:' | sed 's/AllFields:/Complement/' >> ${basedir}/${sample}/summary.txt");
+system("cat ${basedir}/${sample}/all_2D_stats.txt | grep 'AllFields:' | sed 's/AllFields:/ 2D/' >> ${basedir}/${sample}/summary.txt");
+
+print "DONE\n";
\ No newline at end of file
diff --git a/bin/nanook_get_tracking.pl b/bin/nanook_get_tracking.pl
new file mode 100644
index 0000000..32297e5
--- /dev/null
+++ b/bin/nanook_get_tracking.pl
@@ -0,0 +1,157 @@
+#!/usr/bin/perl
+#
+# Program: nanook_get_tracking.pl
+# Author: Richard Leggett
+# Contact: richard.leggett at tgac.ac.uk
+
+use strict;
+use warnings;
+use Getopt::Long;
+
+my $sample;
+my $help_requested;
+my $basedir="/Users/leggettr/Documents/Projects/Nanopore/";
+my @channels;
+my @count_in_time;
+
+&GetOptions(
+'b|basedir:s' => \$basedir,
+'s|sample:s' => \$sample,
+'h|help' => \$help_requested
+);
+
+print "\nnanotools_extract_reads\n\n";
+
+if (defined $help_requested) {
+ print "Get tracking information.\n\n";
+ print "Usage: nanotools_get_tracking.pl <-s sample> [-b directory]\n\n";
+ print "Options:\n";
+ print " -s | -sample Sample name\n";
+ print " -b | -basedir Base directory containing all sample directories\n";
+ print "\n";
+ print "Sample directories should be inside the base directory. Within each sample\n";
+ print "directory, there should be a fast5 directory containing the input files.\n";
+ print "\n";
+
+ exit;
+}
+
+die "You must specify a sample name" if not defined $sample;
+
+print "Base directory: $basedir\n";
+print "Sample: $sample\n";
+
+my $in_dir = $basedir."/".$sample."/fast5";
+my $out_fasta;
+my $out_fastq;
+
+
+if ((-d $in_dir."/pass") && (-d $in_dir."/fail")) {
+ print "Got pass/fail directory\n";
+ process_directory($in_dir."/pass");
+ #process_directory($in_dir."/fail");
+} else {
+ print "Got all-in-one directory\n";
+ process_directory($in_dir);
+}
+
+print "\nAnalysing...\n\n";
+
+sub process_directory {
+ my $input_dir = $_[0];
+ my $total_reads = 0;
+ my $total_2d = 0;
+ my $total_template = 0;
+ my $total_complement = 0;
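+    # HDF5 paths to the basecalled FASTQ datasets (declared but not used below)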
+ my $datatype_2d = "\/Analyses\/Basecall\_2D\_000\/BaseCalled\_2D\/Fastq";
+ my $datatype_template = "\/Analyses\/Basecall\_2D\_000\/BaseCalled\_template\/Fastq";
+ my $datatype_complement = "\/Analyses\/Basecall\_2D\_000\/BaseCalled\_complement\/Fastq";
+
+ print "Processing reads\n";
+ print " In: ", $input_dir, "\n";
+
+ opendir(DIR, $input_dir) or die $!;
+ while (my $file = readdir(DIR)) {
+ next unless ($file =~ m/\.fast5$/);
+ my $channel;
+ my $template_time;
+ my $complement_time;
+
+ print "Extracting $file\n";
+
+ if ($file =~ /_ch(\d+)_/) {
+ $channel = $1;
+ }
+
+ my $pathname = "${input_dir}/${file}";
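+        # h5dump reports the template start_time attribute on a line of the form "(0): <value>"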
+ my @dump = `h5dump -a /Analyses/Basecall_2D_000/BaseCalled_template/Events/start_time ${pathname}`;
+ for (my $i=0; $i<@dump; $i++) {
+ if ($dump[$i] =~ /\(0\)\: (\S+)/) {
+ $template_time = $1;
+ }
+ }
+
+ #@dump = `h5dump -a /Analyses/Basecall_2D_000/BaseCalled_complement/Events/start_time ${pathname}`;
+ #for (my $i=0; $i<@dump; $i++) {
+ # if ($dump[$i] =~ /\(0\)\: (\S+)/) {
+ # $complement_time = $1;
+ # }
+ #}
+
+ #if ((defined $channel) && (defined $template_time)) {
+ # print $channel, "\t", $template_time, "\n";
+ #}
+
+ if (defined $channels[$channel]) {
+ $channels[$channel] = $channels[$channel].",".$template_time;
+ } else {
+ $channels[$channel] = $template_time;
+ }
+
+ }
+ closedir(DIR);
+}
+
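+# For each channel, sort the read start times, print the gap between consecutive reads and the
+# running mean gap, and count reads starting within the first 12 hours (start times assumed to be in seconds)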
+print "Channel\tTime\tDifference\tMean\n";
+
+for (my $c=0; $c<512; $c++) {
+ if (defined $channels[$c]) {
+ my @times = split(/,/, $channels[$c]);
+ my @sorted_times = sort {$a <=> $b} @times;
+ my $total;
+ my $count = 0;
+
+ for (my $i=0; $i<@times; $i++) {
+ my $difference = 0;
+ my $mean = 0;
+
+ if ($i > 0) {
+ $difference = $sorted_times[$i] - $sorted_times[$i-1];
+ $total += $difference;
+ $mean = $total / $i;
+ }
+
+ if ($sorted_times[$i] < (60*60*12)) {
+ $count++;
+ }
+
+ print $c, "\t", $sorted_times[$i], "\t", $difference, "\t", $mean, "\n";
+ }
+
+ $count_in_time[$c] = $count;
+ }
+}
+
+print "\n";
+print "Channel\tCount\n";
+my $total = 0;
+my $n = 0;
+for (my $c=0; $c<512; $c++) {
+ if (defined $channels[$c]) {
+ print $c, "\t", $count_in_time[$c], "\n";
+ $total += $count_in_time[$c];
+ $n++;
+ }
+}
+
+print "\nMean: ".($total / $n)."\n";
\ No newline at end of file
diff --git a/bin/nanook_plot_comparison.R b/bin/nanook_plot_comparison.R
new file mode 100755
index 0000000..e3cd79d
--- /dev/null
+++ b/bin/nanook_plot_comparison.R
@@ -0,0 +1,100 @@
+library(ggplot2)
+library(scales)
+library(grid)
+library(gridExtra)
+library(reshape2)
+
+# Filenames
+args <- commandArgs(TRUE)
+analysisdir <- args[1];
+graphsdir <- args[2];
+samplelist <- args[3];
+outdir <- args[4];
+format <- args[5];
+
+types = c("2D", "Template", "Complement");
+colours = c("#68B5B9", "#CF746D", "#91A851");
+
+if (format=="png") {
+ textsize <- c(40)
+ pointsize <- c(5)
+ pointalpha <- c(0.5)
+ pointshape <- c(1)
+ pointwidth <- c(3)
+ xvjust <- c(1.2)
+ yvjust <- c(1.8)
+} else {
+ textsize <- c(10)
+ pointsize <- c(2)
+ pointalpha <-c(0.4)
+ pointshape <- c(1)
+ pointwidth <- c(1)
+ xvjust <- c(0.2)
+ yvjust <- c(0.8)
+}
+
+
+data_samples = read.table(samplelist, header=TRUE);
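+# The sample list is a whitespace-delimited table with (at least) SampleDir and SampleName columns, one row per sample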
+
+for (t in 1:3) {
+ colourcode = colours[t];
+
+ # Gather data for box plots of length
+ df <- data.frame();
+ listOfDataFrames <- NULL;
+ count <- c(1);
+ for (i in 1:nrow(data_samples)) {
+ type = types[t];
+ sampledir <- data_samples[i, "SampleDir"];
+ filename_lengths <- paste(sampledir, "/", analysisdir, "/", "all_",type,"_lengths.txt", sep="");
+ data_lengths = read.table(filename_lengths, col.name=c("name", "length"));
+ #df$size <- data_lengths$length;
+ thisid <- data_samples[i, "SampleName"];
+ #paste(data_samples[i, "SampleName"], type, sep="_");
+ thisid
+ listOfDataFrames[[count]] <- data.frame(Sample=thisid, Length=data_lengths$length);
+ count <- count + 1;
+ }
+
+ # Read lengths
+ imagewidth <- 1 + (nrow(data_samples) * 0.5);
+ df <- do.call("rbind", listOfDataFrames);
+ output_file <- paste(graphsdir, "/", type, "_lengths.pdf", sep="");
+ message(output_file);
+ pdf(output_file, width=imagewidth, height = 4);
+ print(ggplot(df, aes(x=Sample, y=Length, fill=Sample)) + geom_boxplot() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + guides(fill=FALSE) + theme(text = element_text(size=textsize)) + ggtitle(types[t]));
+ garbage <- dev.off();
+
+ # Bar stacked plot of mapping
+ imagewidth <- 1 + (nrow(data_samples) * 0.5) + 1.5;
+ filename_maps <- paste(outdir, "/", type,"_map_summary.txt", sep="");
+ message(filename_maps)
+ #filename_maps <- c("~/temp/2D_map_summary.txt");
+ data_maps = read.table(filename_maps, header=TRUE);
+ df <- melt(data_maps, id.var="Sample")
+ output_file <- paste(graphsdir, "/", type, "_maps.pdf", sep="");
+ message(output_file);
+ pdf(output_file, width=imagewidth, height = 4);
+ print(ggplot(df, aes(x = Sample, y = value, fill = variable)) + geom_bar(stat = "identity") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + theme(text = element_text(size=textsize)) + ggtitle(types[t]) + ylab("%"));
+ garbage <- dev.off();
+
+ imagewidth <- 1 + (nrow(data_samples) * 0.5);
+
+ # Number of reads
+ filename_comparison <- paste(outdir, "/", type,"_comparison.txt", sep="");
+ data_comparison = read.table(filename_comparison, header=TRUE);
+ output_file <- paste(graphsdir, "/", type, "_number_of_reads.pdf", sep="");
+ message(output_file);
+ pdf(output_file, width=imagewidth, height = 4);
+ print(ggplot(data_comparison, aes(x=data_comparison$Name, y=data_comparison$NumReads)) + geom_bar(stat="identity", fill=colourcode) + ggtitle(type) + theme(text = element_text(size=textsize)) + xlab("Sample") + ylab("Number of reads") + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.y=element_text(vjust=0.2)) + theme(axis.title.x=element_text(vjust=-0.2)) + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.x=element_text(vjust=-x [...]
+ garbage <- dev.off();
+
+ # Total bases
+ data_comparison = read.table(filename_comparison, header=TRUE);
+ output_file <- paste(graphsdir, "/", type, "_total_bases.pdf", sep="");
+ message(output_file);
+ pdf(output_file, width=imagewidth, height = 4);
+ print(ggplot(data_comparison, aes(x=data_comparison$Name, y=data_comparison$TotalBases)) + geom_bar(stat="identity", fill=colourcode) + ggtitle(type) + theme(text = element_text(size=textsize)) + xlab("Sample") + ylab("Total bases") + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.y=element_text(vjust=0.2)) + theme(axis.title.x=element_text(vjust=-0.2)) + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.x=element_text(vjust=-xvj [...]
+ garbage <- dev.off();
+
+}
diff --git a/bin/nanook_plot_comparison_reference.R b/bin/nanook_plot_comparison_reference.R
new file mode 100755
index 0000000..103ab22
--- /dev/null
+++ b/bin/nanook_plot_comparison_reference.R
@@ -0,0 +1,215 @@
+library(ggplot2)
+library(scales)
+library(grid)
+library(gridExtra)
+
+# Filenames
+args <- commandArgs(TRUE)
+analysisdir <- args[1];
+graphsdir <- args[2];
+samplelist <- args[3];
+outdir <- args[4];
+reference <- args[5];
+format <- args[6];
+
+types = c("2D", "Template", "Complement");
+colours = c("#68B5B9", "#CF746D", "#91A851");
+
+if (format=="png") {
+ textsize <- c(40)
+ pointsize <- c(5)
+ pointalpha <- c(0.5)
+ pointshape <- c(1)
+ pointwidth <- c(3)
+ xvjust <- c(1.2)
+ yvjust <- c(1.8)
+} else {
+ textsize <- c(10)
+ pointsize <- c(2)
+ pointalpha <-c(0.4)
+ pointshape <- c(1)
+ pointwidth <- c(1)
+ xvjust <- c(0.2)
+ yvjust <- c(0.8)
+}
+
+
+data_samples = read.table(samplelist, header=TRUE);
+
+imagewidth <- 1 + nrow(data_samples) * 0.5;
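+# Plot width (in inches) scales with the number of samples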
+
+# Query identity
+for (t in 1:3) {
+ df <- data.frame();
+ listOfDataFrames <- NULL;
+ count <- c(1);
+ for (i in 1:nrow(data_samples)) {
+ type = types[t];
+ sampledir <- data_samples[i, "SampleDir"];
+ filename_data <- paste(sampledir, "/", analysisdir, "/", reference, "/", reference, "_",type,"_alignments.txt", sep="");
+ message(filename_data);
+ if (file.exists(filename_data)) {
+ data_field = read.table(filename_data, header=TRUE);
+ message(nrow(data_field));
+ if (nrow(data_field) > 0) {
+ thisid <- data_samples[i, "SampleName"];
+ listOfDataFrames[[count]] <- data.frame(Sample=thisid, Variable=data_field$QueryPercentIdentity);
+ count <- count + 1;
+ }
+ }
+ }
+
+ df <- do.call("rbind", listOfDataFrames);
+ output_file <- paste(graphsdir, "/", reference, "_", type, "_query_identity.pdf", sep="");
+ message(output_file);
+ pdf(output_file, width=imagewidth, height = 4);
+ print(ggplot(df, aes(x=Sample, y=Variable, fill=Sample)) + geom_boxplot() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + guides(fill=FALSE) + theme(text = element_text(size=textsize)) + ggtitle(types[t]) + ylab("Read identity %"));
+ garbage <- dev.off();
+
+ output_file <- paste(graphsdir, "/", reference, "_", type, "_query_identity_zoom.pdf", sep="");
+ message(output_file);
+ pdf(output_file, width=imagewidth, height = 4);
+ print(ggplot(df, aes(x=Sample, y=Variable, fill=Sample)) + geom_boxplot() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + guides(fill=FALSE) + theme(text = element_text(size=textsize)) + ggtitle(types[t]) + ylab("Read identity %") + scale_y_continuous(limits=c(60, 100)));
+ garbage <- dev.off();
+}
+
+# Query GC
+for (t in 1:3) {
+ df <- data.frame();
+ listOfDataFrames <- NULL;
+ count <- c(1);
+ for (i in 1:nrow(data_samples)) {
+ type = types[t];
+ sampledir <- data_samples[i, "SampleDir"];
+ filename_data <- paste(sampledir, "/", analysisdir ,"/", reference, "/", reference, "_",type,"_alignments.txt", sep="");
+ if (file.exists(filename_data)) {
+ data_field = read.table(filename_data, header=TRUE);
+ message(nrow(data_field));
+ if (nrow(data_field) > 0) {
+ thisid <- data_samples[i, "SampleName"];
+ listOfDataFrames[[count]] <- data.frame(Sample=thisid, Variable=data_field$QueryGC);
+ count <- count + 1;
+ }
+ }
+ }
+
+ df <- do.call("rbind", listOfDataFrames);
+ output_file <- paste(graphsdir, "/", reference, "_", type, "_query_gc.pdf", sep="");
+ message(output_file);
+ pdf(output_file, width=imagewidth, height = 4);
+ print(ggplot(df, aes(x=Sample, y=Variable, fill=Sample)) + geom_boxplot() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + guides(fill=FALSE) + theme(text = element_text(size=textsize)) + ggtitle(types[t]) + ylab("Read GC %"));
+ garbage <- dev.off();
+}
+
+# Best perfect kmer
+for (t in 1:3) {
+ df <- data.frame();
+ listOfDataFrames <- NULL;
+ count <- c(1);
+ for (i in 1:nrow(data_samples)) {
+ type = types[t];
+ sampledir <- data_samples[i, "SampleDir"];
+ filename_data <- paste(sampledir, "/", analysisdir, "/", reference, "/", reference, "_",type,"_alignments.txt", sep="");
+ if (file.exists(filename_data)) {
+ data_field = read.table(filename_data, header=TRUE);
+ if (nrow(data_field) > 0) {
+ thisid <- data_samples[i, "SampleName"];
+ listOfDataFrames[[count]] <- data.frame(Sample=thisid, Variable=data_field$LongestPerfectKmer);
+ count <- count + 1;
+ }
+ }
+ }
+
+ df <- do.call("rbind", listOfDataFrames);
+ output_file <- paste(graphsdir, "/", reference, "_", type, "_best_perfect_kmer.pdf", sep="");
+ message(output_file);
+ pdf(output_file, width=imagewidth, height = 4);
+ print(ggplot(df, aes(x=Sample, y=Variable, fill=Sample)) + geom_boxplot() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + guides(fill=FALSE) + theme(text = element_text(size=textsize)) + ggtitle(types[t]) + ylab("Best perfect kmer"));
+ garbage <- dev.off();
+}
+
+#PercentQueryAligned
+for (t in 1:3) {
+ df <- data.frame();
+ listOfDataFrames <- NULL;
+ count <- c(1);
+ for (i in 1:nrow(data_samples)) {
+ type = types[t];
+ sampledir <- data_samples[i, "SampleDir"];
+ filename_data <- paste(sampledir, "/", analysisdir, "/", reference, "/", reference, "_",type,"_alignments.txt", sep="");
+ if (file.exists(filename_data)) {
+ data_field = read.table(filename_data, header=TRUE);
+ if (nrow(data_field) > 0) {
+ thisid <- data_samples[i, "SampleName"];
+ listOfDataFrames[[count]] <- data.frame(Sample=thisid, Variable=data_field$PercentQueryAligned);
+ count <- count + 1;
+ }
+ }
+ }
+
+ df <- do.call("rbind", listOfDataFrames);
+ output_file <- paste(graphsdir, "/", reference, "_", type, "_percent_query_aligned.pdf", sep="");
+ message(output_file);
+ pdf(output_file, width=imagewidth, height = 4);
+ print(ggplot(df, aes(x=Sample, y=Variable, fill=Sample)) + geom_boxplot() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + guides(fill=FALSE) + theme(text = element_text(size=textsize)) + ggtitle(types[t]) + ylab("% read aligned"));
+ garbage <- dev.off();
+
+ output_file <- paste(graphsdir, "/", reference, "_", type, "_percent_query_aligned_zoom.pdf", sep="");
+ message(output_file);
+ pdf(output_file, width=imagewidth, height = 4);
+ print(ggplot(df, aes(x=Sample, y=Variable, fill=Sample)) + geom_boxplot() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + guides(fill=FALSE) + theme(text = element_text(size=textsize)) + ggtitle(types[t]) + ylab("% read aligned") + scale_y_continuous(limits=c(75, 100)));
+ garbage <- dev.off();
+}
+
+#AlignmentSize
+for (t in 1:3) {
+ df <- data.frame();
+ listOfDataFrames <- NULL;
+ count <- c(1);
+ for (i in 1:nrow(data_samples)) {
+ type = types[t];
+ sampledir <- data_samples[i, "SampleDir"];
+ filename_data <- paste(sampledir, "/", analysisdir, "/", reference, "/", reference, "_",type,"_alignments.txt", sep="");
+ if (file.exists(filename_data)) {
+ data_field = read.table(filename_data, header=TRUE);
+ if (nrow(data_field) > 0) {
+ thisid <- data_samples[i, "SampleName"];
+ listOfDataFrames[[count]] <- data.frame(Sample=thisid, Variable=data_field$AlignmentSize);
+ count <- count + 1;
+ }
+ }
+ }
+
+ df <- do.call("rbind", listOfDataFrames);
+ output_file <- paste(graphsdir, "/", reference, "_", type, "_alignment_size.pdf", sep="");
+ message(output_file);
+ pdf(output_file, width=imagewidth, height = 4);
+ print(ggplot(df, aes(x=Sample, y=Variable, fill=Sample)) + geom_boxplot() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + guides(fill=FALSE) + theme(text = element_text(size=textsize)) + ggtitle(types[t]) + ylab("Alignment size"));
+ garbage <- dev.off();
+}
+
+#AlignmentPercentIdentity
+#for (t in 1:3) {
+# df <- data.frame();
+# listOfDataFrames <- NULL;
+# count <- c(1);
+# for (i in 1:nrow(data_samples)) {
+# type = types[t];
+# sampledir <- data_samples[i, "SampleDir"];
+# filename_data <- paste(sampledir, "/analysis/", reference, "/", reference, "_",type,"_alignments.txt", sep="");
+# if (file.exists(filename_data)) {
+# data_field = read.table(filename_data, header=TRUE);
+# thisid <- data_samples[i, "SampleName"];
+# message(thisid);
+# listOfDataFrames[[count]] <- data.frame(Sample=thisid, Variable=data_field$AlignmentPercentIdentity);
+# count <- count + 1;
+# }
+# }
+#
+# df <- do.call("rbind", listOfDataFrames);
+# output_file <- paste(outdir, "/graphs/", reference, "_", type, "_alignment_identity.pdf", sep="");
+# message(output_file);
+# pdf(output_file, width=imagewidth, height = 4);
+# print(ggplot(df, aes(x=Sample, y=Variable, fill=Sample)) + geom_boxplot() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + guides(fill=FALSE) + theme(text = element_text(size=textsize)) + ggtitle(types[t]) + ylab("Alignment identity %"));
+# garbage <- dev.off();
+#}
diff --git a/bin/nanook_plot_lengths.R b/bin/nanook_plot_lengths.R
new file mode 100755
index 0000000..7c237c2
--- /dev/null
+++ b/bin/nanook_plot_lengths.R
@@ -0,0 +1,87 @@
+library(ggplot2)
+library(scales)
+library(grid)
+
+args <- commandArgs(TRUE)
+analysisdir <- args[1];
+graphsdir <- args[2];
+format <- args[3];
+
+types = c("2D", "Template", "Complement");
+colours = c("#68B5B9", "#CF746D", "#91A851");
+
+if (format=="png") {
+ textsize <- c(40)
+ pointsize <- c(5)
+ pointalpha <- c(0.5)
+ pointshape <- c(1)
+ pointwidth <- c(3)
+ xvjust <- c(1.2)
+ yvjust <- c(1.8)
+} else {
+ textsize <- c(14)
+ pointsize <- c(2)
+ pointalpha <- c(0.4)
+ pointshape <- c(1)
+ pointwidth <- c(1)
+ xvjust <- c(0.2)
+ yvjust <- c(0.8)
+}
+
+for (t in 1:3) {
+ type = types[t];
+ colourcode = colours[t];
+ #cat(type, " ", colourcode, "\n");
+
+ # Count vs length
+ filename_lengths <- paste(analysisdir, "/", "all_",type,"_lengths.txt", sep="");
+ filename_kmers <- paste(analysisdir, "/", "all_",type,"_kmers.txt", sep="");
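+    # Expected inputs: all_<type>_lengths.txt (read name and length) and all_<type>_kmers.txt (table with Length and nk21 columns)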
+
+ if (file.exists(filename_lengths)) {
+ data_lengths = read.table(filename_lengths, col.name=c("name", "length"))
+
+ if (nrow(data_lengths) > 1) {
+ if (format=="png") {
+ lengths_png <- paste(graphsdir, "/", "all_",type,"_lengths.png", sep="");
+ png(lengths_png, width=1200, height=800)
+ print(ggplot(data_lengths, aes(x=data_lengths$length), xlab="Length") + geom_histogram(binwidth=1000, fill=colourcode) + xlab("Length") +ylab("Count") + scale_x_continuous(limits=c(0, 35000)) + ggtitle(type) + theme(text = element_text(size=textsize)) + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.x=element_text(vjust=-xvjust)) + theme(axis.title.y=element_text(vjust=yvjust)))
+ } else {
+ lengths_pdf <- paste(graphsdir, "/", "all_",type,"_lengths.pdf", sep="");
+ pdf(lengths_pdf, width=6, height=4)
+ print(ggplot(data_lengths, aes(x=data_lengths$length), xlab="Length") + geom_histogram(binwidth=1000, fill=colourcode) + xlab("Length") +ylab("Count") + scale_x_continuous(limits=c(0, 35000)) + ggtitle(type) + theme(text = element_text(size=textsize)) + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.x=element_text(vjust=-xvjust)) + theme(axis.title.y=element_text(vjust=yvjust)))
+ }
+ garbage <- dev.off()
+ } else {
+ cat("WARNING: No data in ", filename_lengths, "\n");
+ }
+ } else {
+ cat("WARNING: Couldn't find", filename_lengths, "\n");
+ }
+
+    # Number of perfect 21mers versus length scatter
+ if (file.exists(filename_kmers)) {
+ data_kmers = try(read.table(filename_kmers, header=TRUE), silent=TRUE)
+
+ if (inherits(data_kmers, "try-error")) {
+ cat("WARNING: Couldn't read", filename_kmers,"\n");
+ } else {
+ if (nrow(data_kmers) > 1) {
+ if (format=="png") {
+ kmers_png <- paste(graphsdir, "/", "all_",type,"_21mers.png", sep="");
+ png(kmers_png, width=1200, height=800)
+ print(ggplot(data_kmers, aes(x=data_kmers$Length, y=data_kmers$nk21), xlab="Read length") + geom_point(shape=pointshape, size=pointsize, alpha=pointalpha, color=colourcode) + xlab("Read length") +ylab("Number of perfect 21mers") + ggtitle(type) + theme(text = element_text(size=textsize)) + scale_x_continuous(breaks=seq(0, 40000, 4000)) + scale_y_continuous(breaks=seq(0, 400, 20)) + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.x=element_t [...]
+ #grid.edit("geom_point.points", grep = TRUE, gp = gpar(lwd = pointwidth))
+ } else {
+ kmers_pdf <- paste(graphsdir, "/", "all_",type,"_21mers.pdf", sep="");
+ pdf(kmers_pdf, width=6, height=4)
+ print(ggplot(data_kmers, aes(x=data_kmers$Length, y=data_kmers$nk21), xlab="Read length") + geom_point(shape=pointshape, alpha=pointalpha, color=colourcode) + xlab("Read length") +ylab("Number of perfect 21mers") + ggtitle(type) + theme(text = element_text(size=textsize)) + scale_x_continuous(breaks=seq(0, 40000, 4000)) + scale_y_continuous(breaks=seq(0, 400, 20)) + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.x=element_text(vjust=-xvjus [...]
+ }
+ garbage <- dev.off()
+ } else {
+ cat("WARNING: No data in ", filename_kmers, "\n");
+ }
+ }
+ } else {
+ cat("WARNING: Couldn't find", filename_kmers, "\n");
+ }
+}
diff --git a/bin/nanook_plot_reference.R b/bin/nanook_plot_reference.R
new file mode 100755
index 0000000..43ef583
--- /dev/null
+++ b/bin/nanook_plot_reference.R
@@ -0,0 +1,475 @@
+library(ggplot2)
+library(scales)
+library(grid)
+library(gridExtra)
+
+# Filenames
+args <- commandArgs(TRUE)
+analysisdir <- args[1];
+graphsdir <- args[2];
+refid <- args[3];
+format <- args[4];
+
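+# Round x up to the next multiple of 'to'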
+roundUp <- function(x,to=10)
+{
+ to*(x%/%to + as.logical(x%%to))
+}
+
+types = c("Template", "Complement", "2D");
+colours = c("#CF746D", "#91A851", "#68B5B9");
+
+if (format=="png") {
+ textsize <- c(40)
+ pointsize <- c(5)
+ pointalpha <- c(0.5)
+ pointshape <- c(1)
+ pointwidth <- c(3)
+ xvjust <- c(1.2)
+ yvjust <- c(1.8)
+} else {
+ textsize <- c(14)
+ pointsize <- c(2)
+ pointalpha <-c(0.4)
+ pointshape <- c(1)
+ pointwidth <- c(1)
+ xvjust <- c(0.2)
+ yvjust <- c(0.8)
+}
+
+# Plot GC% vs position
+data_gc_filename <- paste(analysisdir, "/", refid, "/", refid, "_gc.txt", sep="");
+
+if (file.exists(data_gc_filename)) {
+ data_gc = read.table(data_gc_filename, col.name=c("Position", "Coverage"))
+
+ if (nrow(data_gc) > 1) {
+ if (format=="png") {
+ png_gc <- paste(graphsdir, "/", refid, "/", refid, "_gc.png", sep="");
+ cat("Writing", png_gc, "\n");
+ png(png_gc, width=1600, height=400)
+ print(ggplot(data_gc, aes(x=data_gc$Position, y=data_gc$Coverage)) + geom_line(color="black") + ggtitle("GC content") + theme(text = element_text(size=textsize)) + xlab("Position") + ylab("GC %") + scale_y_continuous(limits=c(0, 100)) + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.x=element_text(vjust=-xvjust)) + theme(axis.title.y=element_text(vjust=yvjust)))
+ } else {
+ pdf_gc <- paste(graphsdir, "/", refid, "/", refid, "_gc.pdf", sep="");
+ cat("Writing", pdf_gc, "\n");
+ pdf(pdf_gc, width=16, height=4)
+ print(ggplot(data_gc, aes(x=data_gc$Position, y=data_gc$Coverage)) + geom_line(color="black") + ggtitle("GC content") + theme(text = element_text(size=textsize)) + xlab("Position") + ylab("GC %") + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.y=element_text(vjust=0.2)) + theme(axis.title.x=element_text(vjust=-0.2)) + scale_y_continuous(limits=c(0, 100)) + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.x=element_text( [...]
+ }
+ garbage <- dev.off()
+ } else {
+ cat("WARNING: No data in ", data_gc_filename, "\n");
+ }
+} else {
+ cat("WARNING: Couldn't find", data_gc_filename, "\n");
+}
+
+listOfDataFrames <- NULL;
+count <-c(1);
+
+# Work out longest kmer
+cum_maxk <- 0;
+maxk <- 0;
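+# maxk = largest kmer size present in any table; cum_maxk = largest size still reached by more than 5% of reads (both used as axis limits below)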
+for (t in 1:3) {
+ type = types[t];
+ data_perfect_cumulative_filename <- paste(analysisdir, "/", refid, "/", refid, "_",type,"_cumulative_perfect_kmers.txt", sep="");
+ if (file.exists(data_perfect_cumulative_filename)) {
+ cat("Reading", data_perfect_cumulative_filename, "\n");
+ data_perfect_cumulative = read.table(data_perfect_cumulative_filename, col.name=c("Size", "n", "Perfect"))
+ cat("Read", nrow(data_perfect_cumulative), "rows\n");
+
+ if (nrow(data_perfect_cumulative) > 0) {
+ for (i in 1:length(data_perfect_cumulative$Perfect)) {
+ if (data_perfect_cumulative$Size[i] > maxk) {
+ maxk <- data_perfect_cumulative$Size[i];
+ }
+ if (data_perfect_cumulative$Size[i] > cum_maxk) {
+ if (data_perfect_cumulative$Perfect[i] > 5) {
+ cum_maxk <- data_perfect_cumulative$Size[i];
+ }
+ }
+ }
+ } else {
+ cat("WARNING: No data in ", data_perfect_cumulative_filename, "\n");
+ }
+ }
+}
+maxk <- maxk + 10;
+cum_maxk <- roundUp(cum_maxk);
+cat("max k", maxk, "\n");
+cat("cum_max k", cum_maxk, "\n");
+
+
+for (t in 1:3) {
+ type = types[t];
+ colourcode = colours[t];
+
+ # Plot coverage vs position
+ data_coverage_filename <- paste(analysisdir, "/", refid, "/", refid, "_",type,"_coverage.txt", sep="");
+ cat("Reading", data_coverage_filename, "\n");
+
+ if (file.exists(data_coverage_filename)) {
+ data_coverage = read.table(data_coverage_filename, col.name=c("Position", "Coverage"))
+ if (nrow(data_coverage) > 0) {
+ if (format=="png") {
+ png_coverage <- paste(graphsdir, "/", refid, "/", refid, "_",type,"_coverage.png", sep="");
+ cat("Writing", png_coverage, "\n");
+ png(png_coverage, width=1600, height=400)
+ print(ggplot(data_coverage, aes(x=data_coverage$Position, y=data_coverage$Coverage)) + geom_line(color=colourcode) + ggtitle(type) + theme(text = element_text(size=textsize)) + xlab("Position") + ylab("Mean coverage") + expand_limits(y = 0) + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.x=element_text(vjust=-xvjust)) + theme(axis.title.y=element_text(vjust=yvjust)))
+ } else {
+ pdf_coverage <- paste(graphsdir, "/", refid, "/", refid, "_",type,"_coverage.pdf", sep="");
+ cat("Writing", pdf_coverage, "\n");
+ pdf(pdf_coverage, width=16, height=4)
+ print(ggplot(data_coverage, aes(x=data_coverage$Position, y=data_coverage$Coverage)) + geom_line(color=colourcode) + ggtitle(type) + theme(text = element_text(size=textsize)) + xlab("Position") + ylab("Mean coverage") + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.y=element_text(vjust=0.2)) + theme(axis.title.x=element_text(vjust=-0.2)) + expand_limits(y = 0) + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.x=ele [...]
+ }
+ garbage <- dev.off()
+ } else {
+ cat("WARNING: No data in ", data_coverage_filename, "\n");
+ }
+ } else {
+ cat("WARNING: Couldn't find", data_coverage_filename, "\n");
+ }
+
+ # Plot % reads with perfect kmer vs kmer size
+ data_perfect_cumulative_filename <- paste(analysisdir, "/", refid, "/", refid, "_",type,"_cumulative_perfect_kmers.txt", sep="");
+
+ if (file.exists(data_perfect_cumulative_filename)) {
+ cat("Reading", data_perfect_cumulative_filename, "\n");
+ data_perfect_cumulative = read.table(data_perfect_cumulative_filename, col.name=c("Size", "n", "Perfect"))
+ if (nrow(data_perfect_cumulative) > 0) {
+ if (format=="png") {
+ png_perfect_cumulative <- paste(graphsdir, "/", refid, "/", refid, "_",type,"_cumulative_perfect_kmers.png", sep="");
+ cat("Writing", png_perfect_cumulative, "\n");
+ png(png_perfect_cumulative, width=1200, height=800)
+ print(ggplot(data_perfect_cumulative, aes(x=data_perfect_cumulative$Size, y=data_perfect_cumulative$Perfect)) + geom_bar(stat="identity", width=0.7, fill=colourcode) + ggtitle(type) + theme(text = element_text(size=textsize)) + xlab("kmer size") + ylab("% reads with perfect kmer") + scale_x_continuous(limits=c(0, cum_maxk), breaks=seq(0,cum_maxk,by=50)) + theme(text = element_text(size=textsize)) + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.titl [...]
+ } else {
+ pdf_perfect_cumulative <- paste(graphsdir, "/", refid, "/", refid, "_",type,"_cumulative_perfect_kmers.pdf", sep="");
+ cat("Writing", pdf_perfect_cumulative, "\n");
+ pdf(pdf_perfect_cumulative, width=6, height=4)
+ print(ggplot(data_perfect_cumulative, aes(x=data_perfect_cumulative$Size, y=data_perfect_cumulative$Perfect)) + geom_bar(stat="identity", width=0.7, fill=colourcode) + ggtitle(type) + theme(text = element_text(size=textsize)) + xlab("kmer size") + ylab("% reads with perfect kmer") + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.y=element_text(vjust=0.2)) + theme(axis.title.x=element_text(vjust=-0.2)) + scale_x_continuous(limits=c(0, cum_maxk) [...]
+ }
+ garbage <- dev.off()
+ } else {
+ cat("WARNING: No data in ", data_perfect_cumulative_filename, "\n");
+ }
+ } else {
+ cat("WARNING: Couldn't find", data_perfect_cumulative_filename, "\n");
+ }
+
+ # Plot %reads vs best perfect kmer
+ #data_perfect_best_filename <- paste(analysisdir, "/", refid, "/", refid, "_",type,"_best_perfect_kmers.txt", sep="");
+ #if (format=="png") {
+ # png_perfect_best <- paste(graphsdir, "/", refid, "/", refid, "_",type,"_best_perfect_kmers.png", sep="");
+ # png(png_perfect_best, width=1200, height=800)
+ # data_perfect_best = read.table(data_perfect_best_filename, col.name=c("Size", "n", "Perfect"))
+ # print(ggplot(data_perfect_best, aes(x=data_perfect_best$Size, y=data_perfect_best$Perfect)) + geom_bar(stat="identity", width=0.7, fill=colourcode) + ggtitle(type) + theme(text = element_text(size=textsize)) + xlab("Best perfect kmer") + ylab("% reads") + scale_x_continuous(limits=c(0, 140)) + theme(text = element_text(size=textsize)) + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.x=element_text(vjust=-xvjust)) + theme(axis.title.y=element_text(vju [...]
+ #} else {
+ # pdf_perfect_best <- paste(graphsdir, "/", refid, "/", refid, "_",type,"_best_perfect_kmers.pdf", sep="");
+ # pdf(pdf_perfect_best, width=6, height=4)
+ # data_perfect_best = read.table(data_perfect_best_filename, col.name=c("Size", "n", "Perfect"))
+ # print(ggplot(data_perfect_best, aes(x=data_perfect_best$Size, y=data_perfect_best$Perfect)) + geom_bar(stat="identity", width=0.7, fill=colourcode) + ggtitle(type) + theme(text = element_text(size=textsize)) + xlab("Best perfect kmer") + ylab("% reads") + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.y=element_text(vjust=0.2)) + theme(axis.title.x=element_text(vjust=-0.2)) + scale_x_continuous(limits=c(0, 140)) + theme(text = element_text(size=texts [...]
+ #}
+ #garbage <- dev.off()
+
+ # ========== Indels files ==========
+
+ # Insertions
+ data_insertions_filename <- paste(analysisdir, "/", refid, "/", refid, "_",type,"_insertions.txt", sep="");
+ cat("Reading", data_insertions_filename, "\n");
+ if (file.exists(data_insertions_filename)) {
+ data_insertions = read.table(data_insertions_filename, col.name=c("Size", "Percent"))
+ if (nrow(data_insertions) > 0) {
+ if (format=="png") {
+ png_insertions <- paste(graphsdir, "/", refid, "/", refid, "_",type,"_insertions.png", sep="");
+ cat("Writing", png_insertions, "\n");
+ png(png_insertions, width=1200, height=800)
+ print(ggplot(data_insertions, aes(x=data_insertions$Size, y=data_insertions$Percent)) + geom_bar(stat="identity", fill=colourcode) + ggtitle(type) + theme(text = element_text(size=textsize)) + xlab("Insertion size") + ylab("%") + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.x=element_text(vjust=-xvjust)) + theme(axis.title.y=element_text(vjust=yvjust)))
+ } else {
+ pdf_insertions <- paste(graphsdir, "/", refid, "/", refid, "_",type,"_insertions.pdf", sep="");
+ cat("Writing", pdf_insertions, "\n");
+ pdf(pdf_insertions, width=6, height=4)
+ print(ggplot(data_insertions, aes(x=data_insertions$Size, y=data_insertions$Percent)) + geom_bar(stat="identity", fill=colourcode) + ggtitle(type) + theme(text = element_text(size=textsize)) + xlab("Insertion size") + ylab("%") + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.y=element_text(vjust=0.2)) + theme(axis.title.x=element_text(vjust=-0.2)) + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.x=element_text(vju [...]
+ }
+ garbage <- dev.off()
+ } else {
+ cat("WARNING: No data in ", data_insertions_filename, "\n");
+ }
+ } else {
+ cat("WARNING: Couldn't find", data_insertions_filename, "\n");
+ }
+
+ # Deletions
+ data_deletions_filename <- paste(analysisdir, "/", refid, "/", refid, "_",type,"_deletions.txt", sep="");
+ cat("Reading", data_deletions_filename, "\n");
+ if (file.exists(data_deletions_filename)) {
+ data_deletions = read.table(data_deletions_filename, col.name=c("Size", "Percent"))
+ if (nrow(data_deletions) > 0) {
+ if (format=="png") {
+ png_deletions <- paste(graphsdir, "/", refid, "/", refid, "_",type,"_deletions.png", sep="");
+ cat("Writing", png_deletions, "\n");
+ png(png_deletions, width=1200, height=800)
+ print(ggplot(data_deletions, aes(x=data_deletions$Size, y=data_deletions$Percent)) + geom_bar(stat="identity", fill=colourcode) + ggtitle(type) + theme(text = element_text(size=textsize)) + xlab("Deletion size") + ylab("%") + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.x=element_text(vjust=-xvjust)) + theme(axis.title.y=element_text(vjust=yvjust)))
+ } else {
+ pdf_deletions <- paste(graphsdir, "/", refid, "/", refid, "_",type,"_deletions.pdf", sep="");
+ cat("Writing", pdf_deletions, "\n");
+ pdf(pdf_deletions, width=6, height=4)
+ print(ggplot(data_deletions, aes(x=data_deletions$Size, y=data_deletions$Percent)) + geom_bar(stat="identity", fill=colourcode) + ggtitle(type) + theme(text = element_text(size=textsize)) + xlab("Deletion size") + ylab("%") + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.y=element_text(vjust=0.2)) + theme(axis.title.x=element_text(vjust=-0.2)) + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.x=element_text(vjust=- [...]
+ }
+ garbage <- dev.off()
+ } else {
+ cat("WARNING: No data in ", data_deletions_filename, "\n");
+ }
+ } else {
+ cat("WARNING: Couldn't find", data_deletions_filename, "\n");
+ }
+
+ # ========== Alignments file ==========
+
+ input_filename <- paste(analysisdir, "/", refid, "/", refid, "_",type,"_alignments.txt", sep="");
+ cat("Reading", input_filename, "\n");
+
+ if (file.exists(input_filename)) {
+ data_alignments = read.table(input_filename, header=TRUE);
+ if (nrow(data_alignments) > 1) {
+ # Length vs Identity histograms
+ if (format=="png") {
+ identity_hist_png <- paste(graphsdir, "/", refid, "/", refid, "_",type,"_length_vs_identity_hist.png", sep="")
+ cat("Writing", identity_hist_png, "\n");
+ png(identity_hist_png, width=1200, height=800)
+ print(ggplot(data_alignments, aes(x=data_alignments$QueryPercentIdentity)) + geom_histogram(fill=colourcode) + xlab("Read identity %") +ylab("Count") + ggtitle(type) + theme(text = element_text(size=textsize)) + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.x=element_text(vjust=-xvjust)) + theme(axis.title.y=element_text(vjust=yvjust)) )
+ } else {
+ identity_hist_pdf <- paste(graphsdir, "/", refid, "/", refid, "_",type,"_length_vs_identity_hist.pdf", sep="")
+ cat("Writing", identity_hist_pdf, "\n");
+ pdf(identity_hist_pdf, width=6, height=4)
+ print(ggplot(data_alignments, aes(x=data_alignments$QueryPercentIdentity)) + geom_histogram(fill=colourcode) + xlab("Read identity %") +ylab("Count") + ggtitle(type) + theme(text = element_text(size=textsize)) + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.x=element_text(vjust=-xvjust)) + theme(axis.title.y=element_text(vjust=yvjust)))
+ }
+ garbage <- dev.off()
+
+ # GC histogram
+ if (format=="png") {
+ identity_hist_png <- paste(graphsdir, "/", refid, "/", refid, "_",type,"_GC_hist.png", sep="")
+ cat("Writing", identity_hist_png, "\n");
+ png(identity_hist_png, width=1200, height=800)
+ print(ggplot(data_alignments, aes(x=data_alignments$QueryGC)) + geom_histogram(fill=colourcode, binwidth=1) + xlab("GC %") +ylab("Read count") + ggtitle(type) + theme(text = element_text(size=textsize)) + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.x=element_text(vjust=-xvjust)) + theme(axis.title.y=element_text(vjust=yvjust)) + scale_x_continuous(limits=c(0, 100)) )
+ } else {
+ identity_hist_pdf <- paste(graphsdir, "/", refid, "/", refid, "_",type,"_GC_hist.pdf", sep="")
+ cat("Writing", identity_hist_pdf, "\n");
+ pdf(identity_hist_pdf, width=6, height=4)
+ print(ggplot(data_alignments, aes(x=data_alignments$QueryGC)) + geom_histogram(fill=colourcode, binwidth = 1) + xlab("GC %") +ylab("Read count") + ggtitle(type) + theme(text = element_text(size=textsize)) + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.x=element_text(vjust=-xvjust)) + theme(axis.title.y=element_text(vjust=yvjust)) + scale_x_continuous(limits=c(0, 100)))
+ }
+ garbage <- dev.off()
+
+ # Identity vs Length Scatter plots
+ if (format=="png") {
+ identity_scatter_pdf <- paste(graphsdir, "/", refid, "/", refid, "_",type,"_length_vs_identity_scatter.png", sep="");
+ cat("Writing", identity_scatter_pdf, "\n");
+ png(identity_scatter_pdf, width=1200, height=800)
+ print(ggplot(data_alignments, aes(x=data_alignments$QueryLength, y=data_alignments$QueryPercentIdentity)) + geom_point(shape=pointshape, size=pointsize, alpha=pointalpha, color=colourcode) + xlab("Length") +ylab("Read identity %") + ggtitle(type) + theme(text = element_text(size=textsize)) + scale_y_continuous(limits=c(0, 100)) + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.x=element_text(vjust=-xvjust)) + theme(axis.title.y=element_text(vju [...]
+ #grid.edit("geom_point.points", grep = TRUE, gp = gpar(lwd = pointwidth))
+ } else {
+ identity_scatter_pdf <- paste(graphsdir, "/", refid, "/", refid, "_",type,"_length_vs_identity_scatter.pdf", sep="");
+ cat("Writing", identity_scatter_pdf, "\n");
+ pdf(identity_scatter_pdf, width=6, height=4)
+ print(ggplot(data_alignments, aes(x=data_alignments$QueryLength, y=data_alignments$QueryPercentIdentity)) + geom_point(shape=pointshape, alpha=pointalpha, color=colourcode) + xlab("Length") +ylab("Read identity %") + ggtitle(type) + theme(text = element_text(size=textsize)) + scale_y_continuous(limits=c(0, 100)) + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.x=element_text(vjust=-xvjust)) + theme(axis.title.y=element_text(vjust=yvjust)))
+ }
+ garbage <- dev.off()
+
+ # Identity vs Length heatmap
+ if (format=="png") {
+ identity_heatmap_png <- paste(graphsdir, "/", refid, "/", refid, "_",type,"_length_vs_identity_heatmap.png", sep="");
+ cat("Writing", identity_heatmap_png, "\n");
+ png(identity_heatmap_png, width=1200, height=800)
+ print(ggplot(data_alignments, aes(x=data_alignments$QueryLength, y=data_alignments$QueryPercentIdentity)) + geom_bin2d(drop=TRUE, binwidth=c(500,2)) + xlab("Length") +ylab("Read identity %") + ggtitle(type) + theme(text = element_text(size=(textsize*0.75))) + scale_y_continuous(limits=c(0, 100)) + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.x=element_text(vjust=-xvjust)) + theme(axis.title.y=element_text(vjust=yvjust)) + scale_fill_gradient [...]
+ #limits=c(0,50), breaks=seq(0, 40, by=10), colours=rainbow(4)
+ } else {
+ identity_heatmap_pdf <- paste(graphsdir, "/", refid, "/", refid, "_",type,"_length_vs_identity_heatmap.pdf", sep="");
+ cat("Writing", identity_heatmap_pdf, "\n");
+ pdf(identity_heatmap_pdf, width=6, height=4)
+ print(ggplot(data_alignments, aes(x=data_alignments$QueryLength, y=data_alignments$QueryPercentIdentity)) + geom_bin2d(drop=TRUE, binwidth=c(500,2)) + xlab("Length") +ylab("Read identity %") + ggtitle(type) + theme(text = element_text(size=(textsize*0.75))) + scale_y_continuous(limits=c(0, 100)) + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.x=element_text(vjust=-xvjust)) + theme(axis.title.y=element_text(vjust=yvjust))+ scale_fill_gradientn [...]
+ }
+ garbage <- dev.off()
+
+ # Identity vs Length heatmap zoomed
+ if (format=="png") {
+ identity_heatmap_png <- paste(graphsdir, "/", refid, "/", refid, "_",type,"_length_vs_identity_heatmap_zoom.png", sep="");
+ cat("Writing", identity_heatmap_png, "\n");
+ png(identity_heatmap_png, width=1200, height=800)
+ print(ggplot(data_alignments, aes(x=data_alignments$QueryLength, y=data_alignments$QueryPercentIdentity)) + geom_bin2d(drop=TRUE, binwidth=c(500,1)) + xlab("Length") +ylab("Read identity %") + ggtitle(type) + theme(text = element_text(size=(textsize*0.75))) + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.x=element_text(vjust=-xvjust)) + theme(axis.title.y=element_text(vjust=yvjust)) + scale_fill_gradientn(colours=rev(rainbow(n=30, end=4/6)), [...]
+ #limits=c(0,50), breaks=seq(0, 40, by=10), colours=rainbow(4)
+ } else {
+ identity_heatmap_pdf <- paste(graphsdir, "/", refid, "/", refid, "_",type,"_length_vs_identity_heatmap_zoom.pdf", sep="");
+ cat("Writing", identity_heatmap_pdf, "\n");
+ pdf(identity_heatmap_pdf, width=6, height=4)
+ print(ggplot(data_alignments, aes(x=data_alignments$QueryLength, y=data_alignments$QueryPercentIdentity)) + geom_bin2d(drop=TRUE, binwidth=c(500,1)) + xlab("Length") +ylab("Read identity %") + ggtitle(type) + theme(text = element_text(size=(textsize*0.75))) + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.x=element_text(vjust=-xvjust)) + theme(axis.title.y=element_text(vjust=yvjust))+ scale_fill_gradientn(colours=rev(rainbow(n=30, end=4/6)), n [...]
+ }
+ garbage <- dev.off()
+
+ # Identity boxplot
+ listOfDataFrames[[count]] <- data.frame(Readset=type, Variable=data_alignments$QueryPercentIdentity);
+ count <- count + 1;
+
+ # Alignment identity vs. Fraction of read aligned scatter plots
+ if (format=="png") {
+ aid_scatter_png <- paste(graphsdir, "/", refid, "/", refid, "_",type,"_read_fraction_vs_alignment_identity_scatter.png", sep="");
+ cat("Writing", aid_scatter_png, "\n");
+ png(aid_scatter_png, width=1200, height=800)
+ print(ggplot(data_alignments, aes(x=data_alignments$PercentQueryAligned, y=data_alignments$AlignmentPercentIdentity)) + geom_point(shape=pointshape, size=pointsize, alpha=pointalpha, color=colourcode) + xlab("Percentage of read aligned") +ylab("Alignment identity %") + ggtitle(type) + theme(text = element_text(size=textsize)) + scale_x_continuous(limits=c(0, 105)) + scale_y_continuous(limits=c(0, 100)) + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axi [...]
+ #grid.edit("geom_point.points", grep = TRUE, gp = gpar(lwd = pointwidth))
+ } else {
+ aid_scatter_pdf <- paste(graphsdir, "/", refid, "/", refid, "_",type,"_read_fraction_vs_alignment_identity_scatter.pdf", sep="");
+ cat("Writing", aid_scatter_pdf, "\n");
+ pdf(aid_scatter_pdf, width=6, height=4)
+ print(ggplot(data_alignments, aes(x=data_alignments$PercentQueryAligned, y=data_alignments$AlignmentPercentIdentity)) + geom_point(shape=pointshape, alpha=0.4, color=colourcode) + xlab("Percentage of read aligned") +ylab("Alignment identity %") + ggtitle(type) + theme(text = element_text(size=textsize)) + scale_x_continuous(limits=c(0, 105)) + scale_y_continuous(limits=c(0, 100)) + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.x=element_text( [...]
+ }
+ garbage <- dev.off()
+
+ # Query identity vs. Fraction of read aligned scatter plots
+ if (format=="png") {
+ qid_scatter_png <- paste(graphsdir, "/", refid, "/", refid, "_",type,"_read_fraction_vs_query_identity_scatter.png", sep="");
+ cat("Writing", qid_scatter_png, "\n");
+ png(qid_scatter_png, width=1200, height=800)
+ print(ggplot(data_alignments, aes(x=data_alignments$PercentQueryAligned, y=data_alignments$QueryPercentIdentity)) + geom_point(shape=pointshape, size=pointsize, alpha=pointalpha, color=colourcode) + xlab("Percentage of read aligned") +ylab("Alignment identity %") + ggtitle(type) + theme(text = element_text(size=textsize)) + scale_x_continuous(limits=c(0, 105)) + scale_y_continuous(limits=c(0, 100)) + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.ti [...]
+ #grid.edit("geom_point.points", grep = TRUE, gp = gpar(lwd = pointwidth))
+ } else {
+ qid_scatter_pdf <- paste(graphsdir, "/", refid, "/", refid, "_",type,"_read_fraction_vs_query_identity_scatter.pdf", sep="");
+ cat("Writing", qid_scatter_pdf, "\n");
+ pdf(qid_scatter_pdf, width=6, height=4)
+ print(ggplot(data_alignments, aes(x=data_alignments$PercentQueryAligned, y=data_alignments$QueryPercentIdentity)) + geom_point(shape=pointshape, alpha=pointalpha, color=colourcode) + xlab("Percentage of read aligned") +ylab("Alignment identity %") + ggtitle(type) + theme(text = element_text(size=textsize)) + scale_x_continuous(limits=c(0, 105)) + scale_y_continuous(limits=c(0, 100)) + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.x=element_te [...]
+ }
+ garbage <- dev.off()
+
+ # Best perfect sequence vs. length scatters
+ if (format=="png") {
+ best_perf_scatter_png <- paste(graphsdir, "/", refid, "/", refid, "_",type,"_longest_perfect_vs_length_scatter.png", sep="");
+ cat("Writing", best_perf_scatter_png, "\n");
+ png(best_perf_scatter_png, width=1200, height=800)
+ print(ggplot(data_alignments, aes(x=data_alignments$QueryLength, y=data_alignments$LongestPerfectKmer)) + geom_point(shape=pointshape, size=pointsize, alpha=pointalpha, color=colourcode) + xlab("Read length") +ylab("Longest perfect kmer") + ggtitle(type) + theme(text = element_text(size=textsize)) + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.x=element_text(vjust=-xvjust)) + theme(axis.title.y=element_text(vjust=yvjust)))
+ #grid.edit("geom_point.points", grep = TRUE, gp = gpar(lwd = pointwidth))
+ } else {
+ best_perf_scatter_pdf <- paste(graphsdir, "/", refid, "/", refid, "_",type,"_longest_perfect_vs_length_scatter.pdf", sep="");
+ cat("Writing", best_perf_scatter_pdf, "\n");
+ pdf(best_perf_scatter_pdf, width=6, height=4)
+ print(ggplot(data_alignments, aes(x=data_alignments$QueryLength, y=data_alignments$LongestPerfectKmer)) + geom_point(shape=pointshape, alpha=pointalpha, color=colourcode) + xlab("Read length") +ylab("Longest perfect kmer") + ggtitle(type) + theme(text = element_text(size=textsize)) + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.x=element_text(vjust=-xvjust)) + theme(axis.title.y=element_text(vjust=yvjust)))
+ }
+ garbage <- dev.off()
+
+ # Best perfect sequence vs. length scatters zoomed
+ if (format=="png") {
+ best_perf_zoom_scatter_png <- paste(graphsdir, "/", refid, "/", refid, "_",type,"_longest_perfect_vs_length_zoom_scatter.png", sep="");
+ cat("Writing", best_perf_zoom_scatter_png, "\n");
+ png(best_perf_zoom_scatter_png, width=1200, height=800)
+ print(ggplot(data_alignments, aes(x=data_alignments$QueryLength, y=data_alignments$LongestPerfectKmer)) + geom_point(shape=pointshape, size=pointsize, alpha=pointalpha, color=colourcode) + xlab("Read length") +ylab("Longest perfect kmer") + ggtitle(type) + theme(text = element_text(size=textsize)) + scale_x_continuous(limits=c(0, 10000)) + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.x=element_text(vjust=-xvjust)) + theme(axis.title.y=elemen [...]
+ #grid.edit("geom_point.points", grep = TRUE, gp = gpar(lwd = pointwidth))
+ } else {
+ best_perf_zoom_scatter_pdf <- paste(graphsdir, "/", refid, "/", refid, "_",type,"_longest_perfect_vs_length_zoom_scatter.pdf", sep="");
+ cat("Writing", best_perf_zoom_scatter_pdf, "\n");
+ pdf(best_perf_zoom_scatter_pdf, width=6, height=4)
+ print(ggplot(data_alignments, aes(x=data_alignments$QueryLength, y=data_alignments$LongestPerfectKmer)) + geom_point(shape=pointshape, alpha=pointalpha, color=colourcode) + xlab("Read length") +ylab("Longest perfect kmer") + ggtitle(type) + theme(text = element_text(size=textsize)) + scale_x_continuous(limits=c(0, 10000)) + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.x=element_text(vjust=-xvjust)) + theme(axis.title.y=element_text(vjust=yvjust)))
+ }
+ garbage <- dev.off()
+
+ # Plot %reads vs best perfect kmer
+ cat(data_alignments$LongestPerfectKmer);
+
+ hdf <- hist(breaks=seq(0,maxk,by=10), x=data_alignments$LongestPerfectKmer, plot=FALSE, right=FALSE); # bins are 0-9, 10-19, 20-29 etc.
+ hdf$density = hdf$counts/sum(hdf$counts)*100
+ tdf <- data.frame(Pos=hdf$mids, Counts=hdf$density);
+
+ if (format=="png") {
+ png_perfect_best <- paste(graphsdir, "/", refid, "/", refid, "_",type,"_best_perfect_kmers.png", sep="");
+ cat("Writing", png_perfect_best, "\n");
+ png(png_perfect_best, width=1200, height=800)
+ print(ggplot(tdf, aes(Pos, Counts)) + geom_bar(stat="identity", fill=colourcode) + ggtitle(type) + theme(text = element_text(size=textsize)) + xlab("Best perfect kmer") + ylab("% reads") + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.y=element_text(vjust=0.2)) + theme(axis.title.x=element_text(vjust=-0.2)) + scale_x_continuous(limits=c(0, cum_maxk), breaks=seq(0,cum_maxk,by=50)) + theme(text = element_text(size=textsize)) + theme(plot.margin [...]
+ } else {
+ pdf_perfect_best <- paste(graphsdir, "/", refid, "/", refid, "_",type,"_best_perfect_kmers.pdf", sep="");
+ cat("Writing", pdf_perfect_best, "\n");
+ pdf(pdf_perfect_best, width=6, height=4)
+ print(ggplot(tdf, aes(Pos, Counts)) + geom_bar(stat="identity", fill=colourcode) + ggtitle(type) + theme(text = element_text(size=textsize)) + xlab("Best perfect kmer") + ylab("% reads") + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.y=element_text(vjust=0.2)) + theme(axis.title.x=element_text(vjust=-0.2)) + scale_x_continuous(limits=c(0, cum_maxk), breaks=seq(0,cum_maxk,by=50)) + theme(text = element_text(size=textsize)) + theme(plot.margin [...]
+ }
+ garbage <- dev.off()
+
+
+            # Number of perfect 21mers versus length scatter
+ if (format=="png") {
+ nk21_scatter_png <- paste(graphsdir, "/", refid, "/", refid, "_",type,"_nk21_vs_length_scatter.png", sep="");
+ cat("Writing", nk21_scatter_png, "\n");
+ png(nk21_scatter_png, width=1200, height=800)
+ print(ggplot(data_alignments, aes(x=data_alignments$QueryLength, y=data_alignments$nk21)) + geom_point(shape=pointshape, size=pointsize, alpha=pointalpha, color=colourcode) + xlab("Read length") +ylab("Number of perfect 21mers") + ggtitle(type) + theme(text = element_text(size=textsize)) + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.x=element_text(vjust=-xvjust)) + theme(axis.title.y=element_text(vjust=yvjust)))
+ #grid.edit("geom_point.points", grep = TRUE, gp = gpar(lwd = pointwidth))
+ } else {
+ nk21_scatter_pdf <- paste(graphsdir, "/", refid, "/", refid, "_",type,"_nk21_vs_length_scatter.pdf", sep="");
+ cat("Writing", nk21_scatter_pdf, "\n");
+ pdf(nk21_scatter_pdf, width=6, height=4)
+ print(ggplot(data_alignments, aes(x=data_alignments$QueryLength, y=data_alignments$nk21)) + geom_point(shape=pointshape, alpha=pointalpha, color=colourcode) + xlab("Read length") +ylab("Number of perfect 21mers") + ggtitle(type) + theme(text = element_text(size=textsize)) + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.x=element_text(vjust=-xvjust)) + theme(axis.title.y=element_text(vjust=yvjust)))
+ }
+ garbage <- dev.off()
+
+ # Mean perfect sequence vs. length scatters
+ #mean_perf_scatter_pdf <- paste(graphsdir, "/", refid, "/", refid, "_",type,"_mean_perfect_vs_length_scatter.pdf", sep="");
+ #pdf(mean_perf_scatter_pdf, height=4, width=6)
+ #print(ggplot(data_alignments, aes(x=data_alignments$QueryLength, y=data_alignments$MeanPerfectKmer), xlab="Read length") + geom_point(shape=pointshape, alpha=pointalpha) + xlab("Read length") +ylab("Mean perfect kmer") + ggtitle(type) + theme(text = element_text(size=textsize)) + scale_x_continuous(limits=c(0, 10000)))
+ #garbage <- dev.off()
+
+ # Percentage of read aligned vs read length
+ if (format=="png") {
+ output_png <- paste(graphsdir, "/", refid, "/", refid, "_",type,"_percent_aligned_vs_length_scatter.png", sep="");
+ cat("Writing", output_png, "\n");
+ png(output_png, width=1200, height=800)
+ print(ggplot(data_alignments, aes(x=data_alignments$QueryLength, y=data_alignments$PercentQueryAligned)) + geom_point(shape=pointshape, size=pointsize, alpha=pointalpha, color=colourcode) + xlab("Read length") +ylab("Percentage of read aligned") + ggtitle(type) + theme(text = element_text(size=textsize)) + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.x=element_text(vjust=-xvjust)) + theme(axis.title.y=element_text(vjust=yvjust)))
+ #grid.edit("geom_point.points", grep = TRUE, gp = gpar(lwd = pointwidth))
+ } else {
+ output_pdf <- paste(graphsdir, "/", refid, "/", refid, "_",type,"_percent_aligned_vs_length_scatter.pdf", sep="");
+ cat("Writing", output_pdf, "\n");
+ pdf(output_pdf, width=6, height=4)
+ print(ggplot(data_alignments, aes(x=data_alignments$QueryLength, y=data_alignments$PercentQueryAligned)) + geom_point(shape=pointshape, alpha=pointalpha, color=colourcode) + xlab("Read length") +ylab("Percentage of read aligned") + ggtitle(type) + theme(text = element_text(size=textsize)) + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.x=element_text(vjust=-xvjust)) + theme(axis.title.y=element_text(vjust=yvjust)))
+ }
+ garbage <- dev.off()
+ } else {
+ cat("WARNING: No data in ", input_filename, "\n");
+ }
+ } else {
+ cat("WARNING: Couldn't find", input_filename, "\n");
+ }
+
+ # ========== Kmer file ==========
+
+ # Kmer abundance with labels
+ input_kmers <- paste(analysisdir, "/", refid, "/", refid, "_",type,"_kmers.txt", sep="");
+ cat("Reading", input_kmers, "\n");
+
+ if (file.exists(input_kmers)) {
+ data_kmers = read.table(input_kmers, header=TRUE);
+ if (nrow(data_kmers) > 1) {
+ if (format=="png") {
+ kmer_scatter_png <- paste(graphsdir, "/", refid, "/", refid, "_",type,"_kmer_scatter.png", sep="");
+ cat("Writing", kmer_scatter_png, "\n");
+ png(kmer_scatter_png, width=1200, height=1200)
+ print(ggplot(data_kmers, aes(x=data_kmers$RefPc, y=data_kmers$ReadPc)) + geom_point(shape=pointshape, size=pointsize, alpha=pointalpha, color=colourcode) + xlab("Reference abundance %") +ylab("Reads abundance %") + ggtitle(type) + theme(text = element_text(size=textsize)) + scale_x_continuous(limits=c(0, 0.3)) + scale_y_continuous(limits=c(0, 0.3)) + geom_text(aes(label=data_kmers$Kmer), size=4) + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title [...]
+ #grid.edit("geom_point.points", grep = TRUE, gp = gpar(lwd = pointwidth))
+ } else {
+ kmer_scatter_pdf <- paste(graphsdir, "/", refid, "/", refid, "_",type,"_kmer_scatter.pdf", sep="");
+ cat("Writing", kmer_scatter_pdf, "\n");
+ pdf(kmer_scatter_pdf, width=6, height=6)
+ print(ggplot(data_kmers, aes(x=data_kmers$RefPc, y=data_kmers$ReadPc)) + geom_point(shape=pointshape, alpha=pointalpha, color=colourcode) + xlab("Reference abundance %") +ylab("Reads abundance %") + ggtitle(type) + theme(text = element_text(size=textsize)) + scale_x_continuous(limits=c(0, 0.3)) + scale_y_continuous(limits=c(0, 0.3)) + geom_text(aes(label=data_kmers$Kmer), size=1) + theme(plot.margin = unit(c(0.02,0.02,0.04,0.02), "npc")) + theme(axis.title.x=element_text( [...]
+ }
+ garbage <- dev.off()
+ } else {
+ cat("WARNING: No data in ", input_kmers, "\n");
+ }
+ } else {
+ cat("WARNING: Couldn't find", input_kmers, "\n");
+ }
+}
+
+# Identity boxplot
+df <- do.call("rbind", listOfDataFrames);
+if (format=="png") {
+ output_file <- paste(graphsdir, "/", refid, "/", refid, "_query_identity_boxplot.png", sep="");
+ cat("Writing", output_file, "\n");
+ png(output_file, width=1200, height=800);
+ print(ggplot(df, aes(x=Readset, y=Variable, fill=Readset)) + geom_boxplot() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + guides(fill=FALSE) + theme(text = element_text(size=textsize)) + ggtitle(types[t]) + ylab("Read identity %") + scale_y_continuous(limits=c(70, 100)));
+} else {
+ output_file <- paste(graphsdir, "/", refid, "/", refid, "_query_identity_boxplot.pdf", sep="");
+ cat("Writing", output_file, "\n");
+ pdf(output_file, width=6, height = 4);
+ print(ggplot(df, aes(x=Readset, y=Variable, fill=Readset)) + geom_boxplot() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + guides(fill=FALSE) + theme(text = element_text(size=textsize)) + ggtitle(types[t]) + ylab("Read identity %") + scale_y_continuous(limits=c(70, 100)));
+}
+garbage <- dev.off();
diff --git a/bin/nanook_split_fasta b/bin/nanook_split_fasta
new file mode 100755
index 0000000..6e474e9
--- /dev/null
+++ b/bin/nanook_split_fasta
@@ -0,0 +1,83 @@
+#!/usr/bin/perl
+#
+# Program: nanotools_split_fasta
+# Purpose: Split FASTA file into separate files for each read
+# Author: Richard Leggett
+# Contact: richard.leggett at tgac.ac.uk
+
+use strict;
+use warnings;
+use Getopt::Long;
+
+my $input_file;
+my $output_dir;
+my $help_requested;
+my %ids;
+my $count = 0;
+
+&GetOptions(
+'i|input:s' => \$input_file,
+'o|outputdir:s' => \$output_dir,
+'h|help' => \$help_requested
+);
+
+if (defined $help_requested) {
+ print "\nnanotools_split_fasta\n\n";
+ print "Split a multi-read FASTA into separate files.\n\n";
+ print "Usage: nanotools_split_fasta.pl <-i input> [-o output_dir]\n\n";
+ print "Options:\n";
+ print " -i | -input Input FASTA file\n";
+ print " -o | -outputdir Output directory\n";
+ print "\n";
+
+ exit;
+}
+
+die "You must specify an input file\n" if not defined $input_file;
+die "You must specify an output directory\n" if not defined $output_dir;
+
+my $fh;
+
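+# Unbuffer STDOUT so the progress counter below updates immediately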
+local $| = 1;
+
+open(INPUTFILE, $input_file) or die "Can't open input ".$input_file."\n";
+
+while(<INPUTFILE>) {
+ my $line = $_;
+
+ if ($line =~ /^>(\S+)/) {
+ my $id = $1;
+
+ if (not defined $ids{$id}) {
+ $ids{$id} = 1;
+
+ if (defined $fh) {
+ close($fh);
+ }
+
+ my $out_filename = $output_dir."/".$id.".fasta";
+ $count++;
+ #print "Writing $out_filename\n";
+
+ if (($count % 10) == 0) {
+ print "\r$count";
+ }
+
+ open($fh, ">".$out_filename) or die "Can't open output ".$out_filename."\n";
+ } else {
+ print "WARNING: Repeat ID $id\n";
+ }
+ }
+
+ if (defined $fh) {
+ print $fh $line;
+ } else {
+ print "Eeek\n";
+ }
+}
+
+if (defined $fh) {
+ close($fh);
+}
+
+close(INPUTFILE);
diff --git a/bin/slurmit b/bin/slurmit
new file mode 100644
index 0000000..7d07a88
--- /dev/null
+++ b/bin/slurmit
@@ -0,0 +1,62 @@
+#!/bin/bash
+
+commandtorun=""
+nodes=1
+ntasks=1
+mem=2G
+maxtime="6-23:00"
+outfile=""
+partition=""
+cpuspertask=1
+
+function usage
+{
+cat << EOF
+
+Submit commands to SLURM
+
+Usage: slurmit [options] "command to execute"
+
+Submission script for SLURM
+
+OPTIONS:
+ -h Show this message
+ -c Number of processors per task (--cpus-per-task parameter) (default 1)
+ -m Memory required per node (--mem parameter) (default "2G")
+ -n Maximum number of tasks (--ntasks parameter) (default 1)
+ -o Output file (stdout and stderr) (default undefined)
+ -p Partition (e.g. "tgac-medium") (default undefined)
+ -t Time limit (--time parameter) (default "6-23:00")
+ -N Minimum number of nodes (--nodes parameter) (default 1)
+
+Example: slurmit -o logfile.txt "ls -l"
+
+Don't forget to backslash dollar variables, as appropriate.
+
+EOF
+}
+
+
+while getopts c:hm:n:o:p:t:N: OPTION
+do
+ case $OPTION in
+ c) cpuspertask=$OPTARG;;
+ h) usage ; exit 1 ;;
+ m) mem=$OPTARG;;
+ n) ntasks=$OPTARG;;
+ o) outfile=" -o $OPTARG";;
+ p) partition=" -p $OPTARG";;
+ t) maxtime=$OPTARG;;
+ N) nodes=$OPTARG;;
+ esac
+done
+shift $((OPTIND-1))
+
+commandtorun=$@
+
+if [ "$commandtorun" == "" ] ; then
+ echo "You must specify a command to run"
+ exit
+fi
+
+sbatch --nodes ${nodes} --cpus-per-task=${cpuspertask} --ntasks ${ntasks} --time ${maxtime} --mem ${mem}${outfile}${partition} --wrap="echo \"SLURM job output\" ; echo "" ; echo \"Command: ${commandtorun}\" ; echo \"Job ID: \${SLURM_JOB_ID}\" ; echo -n \"Start time: \" ; date ; printf \"%0.s-\" {1..70} ; printf \"\n\n\" ; ${commandtorun} ; printf \"\n\" ; printf \"%0.s-\" {1..70} ; printf \"\n\n\" ; sstat -j \${SLURM_JOB_ID}.batch ; printf \"\n\" ; echo \"SLURM ended\"; echo -n \"End tim [...]
diff --git a/src/nanook/Alignment.java b/src/nanook/Alignment.java
new file mode 100644
index 0000000..f790fdb
--- /dev/null
+++ b/src/nanook/Alignment.java
@@ -0,0 +1,146 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett
+ *
+ * Copyright 2015 The Genome Analysis Centre (TGAC)
+ */
+
+package nanook;
+
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintWriter;
+
+/**
+ * Generic class to represent alignment
+ * @author Richard Leggett
+ */
+public class Alignment implements Comparable {
+ private int score;
+ private String queryName;
+ private int querySequenceSize;
+ private int queryStart;
+ private int queryAlignmentSize;
+ private int queryEnd;
+ private String queryStrand;
+ private String hitName;
+ private int hitSequenceSize;
+ private int hitStart;
+ private int hitAlignmentSize;
+ private int hitEnd;
+ private String hitStrand;
+ private String queryString;
+ private String hitString;
+ boolean fIsCIGAR;
+
+ public Alignment(int s, String qName, int qSize, int qStart, int qAlnSize, String qs, String hName, int hSize, int hStart, int hAlnSize, String hs, boolean cigar) {
+ score = s;
+ queryName = qName;
+ querySequenceSize = qSize;
+ queryStart = qStart;
+ queryAlignmentSize = qAlnSize;
+ queryEnd = qStart + qAlnSize - 1;
+ queryString = qs;
+ hitName = hName;
+ hitSequenceSize = hSize;
+ hitStart = hStart;
+ hitAlignmentSize = hAlnSize;
+ hitEnd = hStart + hAlnSize - 1;
+ hitString = hs;
+ fIsCIGAR = cigar;
+ queryStrand = "+";
+ hitStrand = "+";
+ }
+
+ public void setQueryStrand(String s) {
+ queryStrand = s;
+ }
+
+ public void setHitStrand(String s) {
+ hitStrand = s;
+ }
+
+ public String getQueryStrand() {
+ return queryStrand;
+ }
+
+ public String getHitStrand() {
+ return hitStrand;
+ }
+
+ public int getScore() {
+ return score;
+ }
+
+ public String getQueryName() {
+ return queryName;
+ }
+
+ public int getQuerySequenceSize() {
+ return querySequenceSize;
+ }
+
+ public int getQueryStart() {
+ return queryStart;
+ }
+
+ public int getQueryAlignmentSize() {
+ return queryAlignmentSize;
+ }
+
+ public int getQuertEnd() {
+ return queryEnd;
+ }
+
+ public String getQueryString() {
+ return queryString;
+ }
+
+ public String getHitName() {
+ return hitName;
+ }
+
+ public int getHitSequenceSize() {
+ return hitSequenceSize;
+ }
+
+ public int getHitStart() {
+ return hitStart;
+ }
+
+ public int getHitAlignmentSize() {
+ return hitAlignmentSize;
+ }
+
+ public int getHitEnd() {
+ return hitEnd;
+ }
+
+ public String getHitString() {
+ return hitString;
+ }
+
+ public boolean isCIGAR() {
+ return fIsCIGAR;
+ }
+
+ public void writeMafFile(String filename) {
+ try {
+ PrintWriter pw = new PrintWriter(new FileWriter(filename));
+ pw.printf("s %24s %5d %5d %s %5d %s", hitName, hitStart, hitAlignmentSize, hitStrand, hitSequenceSize, hitString);
+ pw.println("");
+ pw.printf("s %24s %5d %5d %s %5d %s", queryName, queryStart, queryAlignmentSize, queryStrand, querySequenceSize, queryString);
+ pw.println("");
+ pw.close();
+ } catch (IOException e) {
+ System.out.println("Alignment.writeMafFile exception:");
+ e.printStackTrace();
+ System.exit(1);
+ }
+ }
+
+ @Override
+ public int compareTo(Object o) {
+ return ((Alignment)o).getScore() - score;
+ }
+}
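
As a quick aside on the Comparable implementation above: compareTo() subtracts this alignment's score from the other's, so sorting a list places the highest-scoring alignment first. A minimal, hypothetical sketch (class name, read/reference names and sequences are invented for illustration, not taken from the upstream source):

    package nanook;

    import java.util.ArrayList;
    import java.util.Collections;
    import java.util.List;

    // Hypothetical example, not part of the upstream source.
    public class AlignmentSortExample {
        public static void main(String[] args) {
            List<Alignment> alignments = new ArrayList<Alignment>();
            // Args: score, query name/size/start/aligned size/string,
            //       hit name/size/start/aligned size/string, CIGAR flag
            alignments.add(new Alignment(150, "read_1", 1000, 10, 800, "ACGT", "ref_A", 50000, 2000, 810, "ACGT", false));
            alignments.add(new Alignment(420, "read_1", 1000,  5, 950, "ACGT", "ref_B", 48000,  100, 955, "ACGT", false));
            Collections.sort(alignments);
            // compareTo() orders by descending score, so the best hit comes first.
            System.out.println("Best hit: " + alignments.get(0).getHitName()
                    + " (score " + alignments.get(0).getScore() + ")");   // ref_B (score 420)
        }
    }
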
diff --git a/src/nanook/AlignmentFileParser.java b/src/nanook/AlignmentFileParser.java
new file mode 100644
index 0000000..27d01fb
--- /dev/null
+++ b/src/nanook/AlignmentFileParser.java
@@ -0,0 +1,83 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett
+ *
+ * Copyright 2015 The Genome Analysis Centre (TGAC)
+ */
+
+package nanook;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Interface for parsers of alignment files.
+ *
+ * @author Richard Leggett
+ */
+
+public interface AlignmentFileParser {
+ /**
+ * Get identifier for the alignment program
+ * @return ID in lower case e.g. "last"
+ */
+ public String getProgramID();
+
+ /**
+ * Get file extension of alignments
+ * @return file extension used for this aligner's output files
+ */
+ public String getAlignmentFileExtension();
+
+ /**
+ * Get format of input reads expected
+ * @return NanoOKOptions.FASTA or NanoOKOptions.FASTQ
+ */
+ public int getReadFormat();
+
+ /**
+ * Set alignment parameters used when running the aligner executable
+ * @param p parameter string to pass to the aligner
+ */
+ public void setAlignmentParams(String p);
+
+ /**
+ * Get command to run aligner
+ * @param query query file
+ * @param output output file
+ * @param reference reference file
+ * @return command line to run the aligner
+ */
+ public String getRunCommand(String query, String output, String reference);
+
+ /**
+ * Parse an alignment file.
+ * @param filename the filename of the alignments file
+ * @param summaryFile the alignments table summary file to write
+ * @param overallStats overall read set statistics to update
+ * @return number of alignments parsed
+ */
+ int parseFile(String filename, AlignmentsTableFile summaryFile, ReadSetStats overallStats);
+
+ /**
+ * Sort alignments by score
+ */
+ void sortAlignments();
+
+ /**
+ * Get highest scoring set of alignments (i.e. those to the highest scoring reference)
+ * @return a List of Alignment objects
+ */
+ List<Alignment> getHighestScoringSet();
+
+ /**
+ * Return true if this aligner writes to stdout rather than to a file
+ * @return true or false
+ */
+ public boolean outputsToStdout();
+
+ /**
+ * Check index files are present before aligning
+ * @param referenceFile name of FASTA file
+ */
+ public void checkForIndex(String referenceFile);
+}
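
To make the contract above concrete, here is a hypothetical, non-functional skeleton of an aligner wrapper. The aligner name, file extension and command line are invented placeholders; the real reference points are the concrete wrappers later in this commit:

    package nanook;

    import java.util.ArrayList;
    import java.util.Collections;
    import java.util.List;

    // Hypothetical skeleton, not part of the upstream source.
    public class ExampleParser implements AlignmentFileParser {
        private String alignmentParams = "";
        private ArrayList<Alignment> alignments = new ArrayList<Alignment>();

        public String getProgramID() { return "examplealigner"; }
        public String getAlignmentFileExtension() { return ".txt"; }
        public int getReadFormat() { return NanoOKOptions.FASTA; }
        public void setAlignmentParams(String p) { alignmentParams = p; }
        public boolean outputsToStdout() { return false; }
        public void checkForIndex(String referenceFile) { /* nothing to check for this aligner */ }

        public String getRunCommand(String query, String output, String reference) {
            return "examplealigner " + alignmentParams + " -r " + reference + " -q " + query + " -o " + output;
        }

        public int parseFile(String filename, AlignmentsTableFile summaryFile, ReadSetStats overallStats) {
            // A real parser would read the file, build Alignment objects, append rows
            // to summaryFile and update overallStats; here we just report what we hold.
            return alignments.size();
        }

        public void sortAlignments() {
            Collections.sort(alignments);
        }

        public List<Alignment> getHighestScoringSet() {
            return alignments;
        }
    }
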
diff --git a/src/nanook/AlignmentFileStats.java b/src/nanook/AlignmentFileStats.java
new file mode 100644
index 0000000..0c152fd
--- /dev/null
+++ b/src/nanook/AlignmentFileStats.java
@@ -0,0 +1,24 @@
+package nanook;
+
+import java.io.File;
+
+public class AlignmentFileStats {
+ private String alignmentPathname;
+ private int nAlignments = 0;
+
+ public AlignmentFileStats(String p) {
+ alignmentPathname = p;
+ }
+
+ public void markNoAlignments() {
+ nAlignments = 0;
+ }
+
+ public void legacyActions(AlignmentsTableFile nonAlignedSummaryFile, ReadSetStats overallStats) {
+ if (nAlignments == 0) {
+ String leafName = new File(alignmentPathname).getName();
+ nonAlignedSummaryFile.writeNoAlignmentMessage(leafName);
+ overallStats.addReadWithoutAlignment();
+ }
+ }
+}
diff --git a/src/nanook/AlignmentInfo.java b/src/nanook/AlignmentInfo.java
new file mode 100644
index 0000000..230ef7b
--- /dev/null
+++ b/src/nanook/AlignmentInfo.java
@@ -0,0 +1,178 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett
+ *
+ * Copyright 2015 The Genome Analysis Centre (TGAC)
+ */
+
+package nanook;
+
+/**
+ * Class to hold information about an alignment.
+ *
+ * @author Richard Leggett
+ */
+public class AlignmentInfo {
+ private String hitName;
+ private int hitSize;
+ private String queryName;
+ private int querySize;
+ private int identicalBases;
+ private int longest;
+ private double meanPerfectKmer;
+ private int total;
+ private int count;
+ private int alignmentSize;
+ private int alignmentSizeMinusIndels;
+ private double queryIdentity;
+ private double alignmentIdentity;
+ private double alignmentIdentityMinusIndels;
+ private double percentQueryAligned;
+ private int queryAlignmentSize;
+ int kSizes[];
+ int kCounts[];
+ int nk;
+
+
+ /**
+ * Constructor.
+ *
+ * @param hn hit name
+ * @param hs hit size
+ * @param qn query name
+ * @param qs query size
+ * @param ib number of identical bases
+ * @param l longest perfect kmer
+ * @param t sum of perfect kmers
+ * @param c count of perfect kmers
+ * @param as alignment size
+ * @param ami alignment size minus indels
+ * @param qas query alignment size
+ */
+ public AlignmentInfo(String hn, int hs, String qn, int qs, int ib, int l, int t, int c, int as, int ami, int qas) {
+ hitName = hn;
+ hitSize = hs;
+ querySize = qs;
+ queryName = qn;
+ identicalBases = ib;
+ longest = l;
+ total = t;
+ count = c;
+ meanPerfectKmer = (double)t / (double)c;
+ alignmentSize = as;
+ alignmentSizeMinusIndels = ami;
+ queryAlignmentSize = qas;
+ queryIdentity = (100.0 * (double)identicalBases) / (double)querySize;
+ alignmentIdentity = (100.0 * (double)identicalBases) / (double)alignmentSize;
+ alignmentIdentityMinusIndels = (100.0 * (double)identicalBases) / (double)alignmentSizeMinusIndels;
+ //percentQueryAligned = (100.0 * (double)alignmentSize) / (double)querySize;
+ percentQueryAligned = (100.0 * (double)queryAlignmentSize) / (double)querySize;
+ }
+
+ /**
+ * Get identical bases count.
+ * @return number of identical bases
+ */
+ public int getIdenticalBases() {
+ return identicalBases;
+ }
+
+ /**
+ * Get longest perfect kmer.
+ * @return longest perfect kmer
+ */
+ public int getLongestPerfectKmer() {
+ return longest;
+ }
+
+ /**
+ * Get alignment size.
+ * @return alignment size, in bases
+ */
+ public int getAlignmentSize() {
+ return alignmentSize;
+ }
+
+ /**
+ * Get query identity.
+ * @return query identity percent
+ */
+ public double getQueryId() {
+ return queryIdentity;
+ }
+
+ public String getQueryName() {
+ return queryName;
+ }
+
+ public String getHitName() {
+ return hitName;
+ }
+
+ /**
+ * Get alignment identity.
+ * @return alignment identity percent
+ */
+ public double getAlignmentId() {
+ return alignmentIdentity;
+ }
+
+ /**
+ * Get alignment identity excluding indels.
+ * @return alignment identity percent, with indels excluded from the denominator
+ */
+ public double getAlignmentIdMinusIndels() {
+ return alignmentIdentityMinusIndels;
+ }
+
+ /**
+ * Get mean perfect kmer size.
+ * @return mean perfect kmer size, in bases
+ */
+ public double getMeanPerfectKmer() {
+ return meanPerfectKmer;
+ }
+
+ /**
+ * Get query size.
+ * @return size of query, in bases.
+ */
+ public int getQuerySize() {
+ return querySize;
+ }
+
+ /**
+ * Get hit size.
+ * @return size of hit, in bases
+ */
+ public int getHitSize() {
+ return hitSize;
+ }
+
+ /**
+ * Get percentage of query aligned.
+ * @return percentage of the query sequence aligned
+ */
+ public double getPercentQueryAligned() {
+ return percentQueryAligned;
+ }
+
+ public void addkCounts(int n, int[] s, int[] c) {
+ nk = n;
+ kSizes = s;
+ kCounts = c;
+ }
+
+ public String getkCounts() {
+ String s="";
+
+ for (int i=0; i<nk; i++) {
+ s = s + Integer.toString(kCounts[i]);
+ if (i != (nk-1)) {
+ s = s + "\t";
+ }
+ }
+
+ return s;
+ }
+}
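
A worked example of how the derived metrics fall out of the constructor arguments; the class name and all numbers below are made up purely for illustration:

    package nanook;

    // Hypothetical example, not part of the upstream source; all numbers are invented.
    public class AlignmentInfoExample {
        public static void main(String[] args) {
            // Hit "chr1" (5,000,000 bp), query "read_0001" (1,000 bp), 850 identical bases,
            // longest perfect kmer 42, perfect kmer lengths summing to 800 over 40 kmers,
            // alignment size 1,100 (980 without indels), 950 query bases aligned.
            AlignmentInfo ai = new AlignmentInfo("chr1", 5000000, "read_0001", 1000,
                                                 850, 42, 800, 40, 1100, 980, 950);
            System.out.println(ai.getQueryId());                // 100*850/1000 = 85.0
            System.out.println(ai.getAlignmentId());            // 100*850/1100 ~ 77.3
            System.out.println(ai.getAlignmentIdMinusIndels()); // 100*850/980  ~ 86.7
            System.out.println(ai.getPercentQueryAligned());    // 100*950/1000 = 95.0
            System.out.println(ai.getMeanPerfectKmer());        // 800/40       = 20.0
        }
    }
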
diff --git a/src/nanook/AlignmentMerger.java b/src/nanook/AlignmentMerger.java
new file mode 100644
index 0000000..cf41c91
--- /dev/null
+++ b/src/nanook/AlignmentMerger.java
@@ -0,0 +1,411 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett
+ *
+ * Copyright 2015 The Genome Analysis Centre (TGAC)
+ */
+
+package nanook;
+
+/**
+ * Class to merge alignments
+ *
+ * @author Richard Leggett
+ */
+public class AlignmentMerger {
+ private ReferenceSequence reference;
+ private ReadSetStats overallStats;
+ private NanoOKOptions options;
+ private int readLength;
+ private int[] covered;
+ private int deletionSize = 0;
+ private int insertionSize = 0;
+ private String errorKmer = "";
+ private int type;
+ private int kmerTotal = 0;
+ private int kmerCount = 0;
+ private int currentPerfectKmerSize = 0;
+ private int longestPerfectKmer = 0;
+ private int overallQueryStart = -1;
+ private int overallQueryEnd = -1;
+ private int overallHitStart = -1;
+ private int overallHitEnd = -1;
+ private int hitSeqSize = 0;
+ private int querySeqSize = 0;
+ private String queryName = null;
+ private String hitName = null;
+ private int identicalBases = 0;
+ private int alignmentSize = 0;
+ private int alignmentSizeWithoutIndels = 0;
+
+ // Bodge for speed - need to change way AlignmentInfo works
+ int kSizes[] = {15, 17, 19, 21, 23, 25};
+ int kCounts[] = {0, 0, 0, 0, 0, 0};
+ int nk = 6;
+
+ /**
+ * Constructor
+ * @param r the reference, as a ReferenceSequence object
+ * @param l the read length
+ * @param s the read set stats for this read set
+ * @param t the type number of read (defined in NanoOKOptions)
+ */
+ public AlignmentMerger(NanoOKOptions o, ReferenceSequence r, int l, ReadSetStats s, int t) {
+ options = o;
+ reference = r;
+ readLength = l;
+ overallStats = s;
+ type = t;
+
+ covered = new int[readLength];
+
+ options.getLog().println("");
+ options.getLog().println("New AlignmentMerger");
+ options.getLog().println("");
+ }
+
+ /**
+ * Helper method to store any pending insertion or deletion against the reference,
+ * along with the perfect kmer (errorKmer) that preceded the error.
+ */
+ private void checkStoreInsertionsOrDeletions() {
+ if (deletionSize > 0) {
+ reference.getStatsByType(type).addDeletionError(deletionSize, errorKmer, overallStats); // Reference
+ deletionSize = 0;
+ }
+
+ if (insertionSize > 0) {
+ reference.getStatsByType(type).addInsertionError(insertionSize, errorKmer, overallStats); // Reference
+ insertionSize = 0;
+ }
+
+ errorKmer = "";
+ }
+
+ private void storePerfectKmerLength() {
+ // Store perfect kmers
+ if (currentPerfectKmerSize > 0) {
+ reference.getStatsByType(type).addPerfectKmer(currentPerfectKmerSize); // Reference
+
+ // Bodge - need to change
+ for (int l=0; l<nk; l++) {
+ if (currentPerfectKmerSize >= kSizes[l]) {
+ kCounts[l]++;
+ }
+ }
+
+ kmerTotal+=currentPerfectKmerSize;
+ kmerCount++;
+
+ if (currentPerfectKmerSize > longestPerfectKmer) {
+ longestPerfectKmer = currentPerfectKmerSize;
+ }
+
+ currentPerfectKmerSize = 0;
+ }
+ }
+
+ /**
+ * Merge in a new alignment
+ * @param a an Alignment to merge in
+ */
+ public void addAlignment(Alignment a) {
+ String hitSeq = a.getHitString();
+ String querySeq = a.getQueryString();
+ int hitSize = hitSeq.length();
+ int querySize = querySeq.length();
+ int loopFrom = 0;
+ int loopTo = hitSize <= querySize ? hitSize:querySize;
+ int queryPos = a.getQueryStart();
+ int hitPos = a.getHitStart();
+ String currentKmer = "";
+ AlignmentInfo ai;
+ boolean mergeAlignment = true;
+
+ // Deal with hit and query names
+ if (queryName == null) {
+ queryName = a.getQueryName();
+ hitName = a.getHitName();
+ querySeqSize = a.getQuerySequenceSize();
+ hitSeqSize = a.getHitSequenceSize();
+ }
+
+ if (! hitName.equals(a.getHitName())) {
+ System.out.println("Hit name ("+hitName+") doesn't match ("+a.getHitName()+")!");
+ System.exit(1);
+ }
+
+ if (! queryName.equals(a.getQueryName())) {
+ System.out.println("Query name ("+queryName+") doesn't match ("+a.getQueryName()+")!");
+ System.exit(1);
+ }
+
+ options.getLog().println("Merging new block");
+ options.getLog().println(" queryPos = "+queryPos);
+ options.getLog().println(" hitPos = "+hitPos);
+ options.getLog().println(" querySize = "+querySize);
+ options.getLog().println(" hitSize = "+hitSize);
+
+ // Check for new block too far from current block
+ if ((overallHitStart != -1) && (hitPos < overallHitStart)) {
+ int remainingQuerySequence = a.getQuerySequenceSize() - (overallQueryEnd - overallQueryStart);
+ int maximumDistance = remainingQuerySequence * 2;
+ if ((overallHitStart - hitPos) > maximumDistance) {
+ options.getLog().println("WARNING: hitPos too far (>"+maximumDistance+") from overallHitStart ("+overallHitStart+")");
+ mergeAlignment = false;
+ }
+ }
+
+ if ((overallHitEnd != -1) && (hitPos > overallHitEnd)) {
+ int remainingQuerySequence = a.getQuerySequenceSize() - (overallQueryEnd - overallQueryStart);
+ int maximumDistance = remainingQuerySequence * 2;
+
+ if ((hitPos - overallHitEnd) > maximumDistance) {
+ options.getLog().println("WARNING: hitPos too far (>"+maximumDistance+") from overallHitEnd ("+overallHitEnd+")");
+ mergeAlignment = false;
+ }
+ }
+
+ if (overallQueryStart >= 0) {
+ int queryDistanceFromStart = Math.abs(queryPos - overallQueryStart);
+ int hitDistanceFromStart = Math.abs(hitPos - overallHitStart);
+ int difference = Math.abs(queryDistanceFromStart - hitDistanceFromStart);
+ options.getLog().println("queryDistanceFromStart = "+queryDistanceFromStart);
+ options.getLog().println("hitDistanceFromStart = "+hitDistanceFromStart);
+ options.getLog().println("difference = " + difference);
+
+ if (difference > (queryDistanceFromStart * 0.2)) {
+ options.getLog().println("WARNING: query offset too far from hit offset - extra alignment ignored");
+ mergeAlignment = false;
+ }
+ }
+
+ if (mergeAlignment) {
+ // Store alignment size
+ if ((overallQueryStart == -1) || (queryPos < overallQueryStart)) {
+ overallQueryStart = queryPos;
+ options.getLog().println("Modifying overallQueryStart = "+overallQueryStart);
+ }
+ if ((overallHitStart == -1) || (hitPos < overallHitStart)) {
+ overallHitStart = hitPos;
+ options.getLog().println("Modifying overallHitStart = "+overallHitStart);
+ }
+
+ // Expect these to be equal
+ if (hitSize != querySize) {
+ System.out.println("hitSize not equal to querySize");
+ }
+
+ currentPerfectKmerSize = 0;
+ insertionSize = 0;
+ deletionSize = 0;
+ errorKmer = "";
+
+ // If alignment starts in middle of area already covered, move to end
+ if (covered[queryPos] == 1) {
+ while((loopFrom < loopTo) && (covered[queryPos] == 1)) {
+ if (hitSeq.charAt(loopFrom)== '-') {
+ queryPos++;
+ } else if (querySeq.charAt(loopFrom) == '-') {
+ hitPos++;
+ } else {
+ queryPos++;
+ hitPos++;
+ }
+ loopFrom++;
+ }
+ }
+
+ options.getLog().println(" loopFrom = "+loopFrom);
+ options.getLog().println(" loopTo = "+loopTo);
+
+ for (int i=loopFrom; i<loopTo; i++) {
+ // If we've ventured into previously covered territory, break
+ if (covered[queryPos] == 1) {
+ break;
+ }
+
+ // Identical bases
+ if (hitSeq.charAt(i) == querySeq.charAt(i)) {
+ // Check if there are any insertions or deletions to store
+ checkStoreInsertionsOrDeletions();
+
+ currentPerfectKmerSize++;
+ currentKmer += querySeq.charAt(i);
+
+ // If reached end, store perfect sequence length
+ if (i == (loopTo-1)) {
+ storePerfectKmerLength();
+ }
+
+ // Mark this position and move on
+ identicalBases++;
+ covered[queryPos]= 1;
+ queryPos++;
+ hitPos++;
+ alignmentSizeWithoutIndels++;
+ } else {
+ // An insertion or deletion or substitution, so store perfect sequence length, if we have some
+ if (currentPerfectKmerSize > 0) {
+ storePerfectKmerLength();
+ }
+
+ // Insertion
+ if (hitSeq.charAt(i) == '-') {
+ // If new insertion, check if we have a previous deletion we were tracking
+ // And store the current perfect kmer as the one associated with this insertion
+ if (insertionSize == 0) {
+ checkStoreInsertionsOrDeletions();
+ errorKmer = currentKmer;
+ }
+
+ // Keep track of insertion size
+ insertionSize++;
+
+ // Keep track of position
+ queryPos++;
+ }
+
+ // Deletion
+ else if (querySeq.charAt(i) == '-') {
+ // If new deletion, check if we have a previous insertion we were tracking
+ // And store the current perfect kmer as the one associated with this deletion
+ if (deletionSize == 0) {
+ checkStoreInsertionsOrDeletions();
+ errorKmer = currentKmer;
+ }
+
+ // Keep track of size
+ deletionSize++;
+
+ // Keep track of position
+ hitPos++;
+ }
+
+ // Substitution
+ else {
+ // Check if previous insertion or deletion we were tracking
+ checkStoreInsertionsOrDeletions();
+
+ // Store current perfect kmer associated with this substitution
+ errorKmer = currentKmer;
+
+ // Store substitution
+ reference.getStatsByType(type).addSubstitutionError(errorKmer, hitSeq.charAt(i), querySeq.charAt(i), overallStats); // Reference
+
+ // Mark this position and move on
+ covered[queryPos] = 1;
+ queryPos++;
+ hitPos++;
+ alignmentSizeWithoutIndels++;
+ }
+
+ // Reset current kmer
+ currentKmer = "";
+ }
+
+ alignmentSize++;
+ }
+
+ options.getLog().println(" queryPos = " + queryPos);
+ options.getLog().println(" hitPos = " + hitPos);
+
+ if ((overallQueryEnd == -1) || (queryPos > overallQueryEnd)) {
+ overallQueryEnd = queryPos;
+ options.getLog().println("Modifying overallQueryEnd = "+overallQueryEnd);
+ }
+ if ((overallHitEnd == -1) || (hitPos > overallHitEnd)) {
+ overallHitEnd = hitPos;
+ options.getLog().println("Modifying overallHitEnd = "+overallHitEnd);
+ }
+
+ //reference.getStatsByType(type).addCoverage(a.getHitStart(), a.getHitAlignmentSize()); // Reference
+ }
+ }
+
+ /**
+ * Declare end of alignment merge
+ * @return an AlignmentInfo object
+ */
+ public AlignmentInfo endMergeAndStoreStats() {
+ AlignmentInfo ai = new AlignmentInfo(hitName,
+ hitSeqSize,
+ queryName,
+ querySeqSize,
+ identicalBases,
+ longestPerfectKmer,
+ kmerTotal,
+ kmerCount,
+ alignmentSize,
+ alignmentSizeWithoutIndels,
+ overallQueryEnd - overallQueryStart);
+
+ ai.addkCounts(nk, kSizes, kCounts);
+
+ overallStats.writekCounts(queryName, querySeqSize, nk, kSizes, kCounts); // ReadSetStats
+ overallStats.addReadWithAlignment(); // ReadSetStats
+ overallStats.addReadBestKmer(longestPerfectKmer); // ReadSetStats
+
+ reference.getStatsByType(type).addAlignmentStats(querySeqSize, alignmentSize, alignmentSizeWithoutIndels, identicalBases, "?", "?"); // Reference
+ reference.getStatsByType(type).addReadBestKmer(longestPerfectKmer); // Reference
+
+ return ai;
+ }
+
+ /**
+ * Get query start position of merged alignment
+ * @return start position
+ */
+ public int getOverallQueryStart() {
+ return overallQueryStart;
+ }
+
+ /**
+ * Get query end position of merged alignment
+ * @return end position
+ */
+ public int getOverallQueryEnd() {
+ return overallQueryEnd;
+ }
+
+ /**
+ * Get hit start position of merged alignment
+ * @return start position
+ */
+ public int getOverallHitStart() {
+ return overallHitStart;
+ }
+
+ /**
+ * Get hit end position of merged alignment
+ * @return end position
+ */
+ public int getOverallHitEnd() {
+ return overallHitEnd;
+ }
+
+ /**
+ * Get size of query covered by merged alignment
+ * @return size of alignment
+ */
+ public int getOverallQuerySize() {
+ return overallQueryEnd - overallQueryStart;
+ }
+
+ /**
+ * Get size of hit covered by merged alignment
+ * @return size of hit alignment
+ */
+ public int getOverallHitSize() {
+ return overallHitEnd - overallHitStart;
+ }
+
+ /**
+ * Get size of alignment without indels
+ * @return size
+ */
+ public int getAlignmentSize() {
+ return alignmentSizeWithoutIndels;
+ }
+}
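
A hypothetical helper sketching the intended call sequence for the merger: construct it for one read/reference pair, feed it each alignment of the chosen set, then collect the merged statistics. The class, method and parameter names are invented, and the options, reference, stats and alignment list are assumed to exist already:

    package nanook;

    import java.util.List;

    // Hypothetical helper, not part of the upstream source.
    public class AlignmentMergeExample {
        public static AlignmentInfo mergeBestAlignments(NanoOKOptions options,
                                                        ReferenceSequence reference,
                                                        ReadSetStats stats,
                                                        int readLength,
                                                        int type,
                                                        List<Alignment> bestAlignmentSet) {
            // One merger per read/reference pair and read type.
            AlignmentMerger merger = new AlignmentMerger(options, reference, readLength, stats, type);

            // Feed in each alignment block of the chosen set; overlapping or
            // implausibly distant blocks are handled inside addAlignment().
            for (Alignment a : bestAlignmentSet) {
                merger.addAlignment(a);
            }

            // Flush the accumulated statistics and return the merged summary.
            return merger.endMergeAndStoreStats();
        }
    }
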
diff --git a/src/nanook/AlignmentsTableFile.java b/src/nanook/AlignmentsTableFile.java
new file mode 100644
index 0000000..fd258ee
--- /dev/null
+++ b/src/nanook/AlignmentsTableFile.java
@@ -0,0 +1,146 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett
+ *
+ * Copyright 2015 The Genome Analysis Centre (TGAC)
+ */
+
+package nanook;
+
+import java.io.*;
+
+/**
+ * Represents the alignment summary file written by the tool and used for graph plotting.
+ *
+ * @author Richard Leggett
+ */
+public class AlignmentsTableFile implements Serializable {
+ private static final long serialVersionUID = NanoOK.SERIAL_VERSION;
+ private String filename;
+ private transient PrintWriter pw = null;
+ private int count = 0;
+
+ /**
+ * Constructor.
+ * @param f filename of output file
+ */
+ public AlignmentsTableFile(String f) {
+ filename = f;
+ writeHeader();
+ }
+
+ private synchronized void openFile(boolean append) {
+ try {
+ pw = new PrintWriter(new FileWriter(filename, append));
+ } catch (IOException e) {
+ System.out.println("AlignmentsTableFile exception");
+ e.printStackTrace();
+ }
+ }
+
+ /**
+ * Write header row to file.
+ */
+ private synchronized void writeHeader() {
+ openFile(false);
+ pw.print("Filename\t");
+ pw.print("QueryName\t");
+ pw.print("QueryGC\t");
+ pw.print("QueryStart\t");
+ pw.print("QueryBasesCovered\t");
+ pw.print("QueryStrand\t");
+ pw.print("QueryLength\t");
+ pw.print("HitName\t");
+ pw.print("HitStart\t");
+ pw.print("HitBasesCovered\t");
+ pw.print("HitStrand\t");
+ pw.print("HitLength\t");
+ pw.print("AlignmentSize\t");
+ pw.print("IdenticalBases\t");
+ pw.print("AlignmentPercentIdentity\t");
+ pw.print("QueryPercentIdentity\t");
+ pw.print("LongestPerfectKmer\t");
+ pw.print("MeanPerfectKmer\t");
+ pw.print("PercentQueryAligned\t");
+ pw.print("nk15\tnk17\tnk19\tnk21\tnk23\tnk25");
+ pw.println("");
+ pw.close();
+ }
+
+ /**
+ * Write an alignment line.
+ * @param stats read set statistics (used to look up GC content)
+ * @param alignmentFilename filename of alignment
+ * @param hitLine hit object
+ * @param queryLine query object
+ * @param ais AlignmentInfo statistics
+ */
+ public synchronized void writeAlignment(ReadSetStats stats, String alignmentFilename, MAFAlignmentLine hitLine, MAFAlignmentLine queryLine, AlignmentInfo ais) {
+ String outputLine = String.format("%s\t%s\t%.2f\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%d\t%d\t%.2f\t%.2f\t%d\t%.2f\t%.2f\t%s",
+ alignmentFilename,
+ queryLine.getName(),
+ stats.getGC(alignmentFilename, ais.getQueryName()),
+ queryLine.getStart(),
+ queryLine.getAlnSize(),
+ queryLine.getStrand(),
+ queryLine.getSeqSize(),
+ hitLine.getName(),
+ hitLine.getStart(),
+ hitLine.getAlnSize(),
+ hitLine.getStrand(),
+ hitLine.getSeqSize(),
+ ais.getAlignmentSize(),
+ ais.getIdenticalBases(),
+ ais.getAlignmentId(),
+ ais.getQueryId(),
+ ais.getLongestPerfectKmer(),
+ ais.getMeanPerfectKmer(),
+ ais.getPercentQueryAligned(),
+ ais.getkCounts());
+
+ openFile(true);
+ pw.println(outputLine);
+ pw.close();
+
+ count++;
+ }
+
+ public synchronized void writeMergedAlignment(ReadSetStats stats, String alignmentFilename, AlignmentMerger merger, AlignmentInfo ais) {
+ String outputLine = String.format("%s\t%s\t%.2f\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%d\t%d\t%.2f\t%.2f\t%d\t%.2f\t%.2f\t%s",
+ alignmentFilename,
+ ais.getQueryName(),
+ stats.getGC(alignmentFilename, ais.getQueryName()),
+ merger.getOverallQueryStart(),
+ merger.getOverallQuerySize(),
+ "+",
+ ais.getQuerySize(),
+ ais.getHitName(),
+ merger.getOverallHitStart(),
+ merger.getOverallHitSize(),
+ "+",
+ ais.getHitSize(),
+ ais.getAlignmentSize(),
+ ais.getIdenticalBases(),
+ ais.getAlignmentId(),
+ ais.getQueryId(),
+ ais.getLongestPerfectKmer(),
+ ais.getMeanPerfectKmer(),
+ ais.getPercentQueryAligned(),
+ ais.getkCounts());
+
+ openFile(true);
+ pw.println(outputLine);
+ pw.close();
+
+ count++;
+ }
+
+ /**
+ * Used when no alignment found for this query.
+ * @param alignmentFilename - alignment filename
+ */
+ public synchronized void writeNoAlignmentMessage(String alignmentFilename) {
+ openFile(true);
+ pw.println(alignmentFilename+"\tNO ALIGNMENTS");
+ pw.close();
+ }
+}
diff --git a/src/nanook/BLASRParser.java b/src/nanook/BLASRParser.java
new file mode 100644
index 0000000..727ede5
--- /dev/null
+++ b/src/nanook/BLASRParser.java
@@ -0,0 +1,50 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett
+ *
+ * Copyright 2015 The Genome Analysis Centre (TGAC)
+ */
+
+package nanook;
+
+/**
+ * Parser for BLASR files
+ * @author Richard Leggett
+ */
+public class BLASRParser extends SAMParser implements AlignmentFileParser {
+ private String alignmentParams = "";
+
+ public BLASRParser(NanoOKOptions o, References r) {
+ super(o, r);
+ }
+
+ public String getProgramID() {
+ return "blasr";
+ }
+
+ public int getReadFormat() {
+ return NanoOKOptions.FASTA;
+ }
+
+ public void setAlignmentParams(String p) {
+ alignmentParams = p;
+ }
+
+ public String getRunCommand(String query, String output, String reference) {
+ String command = "blasr " + query + " " + reference + " -sam -out " + output;
+
+ if (alignmentParams.length() > 0) {
+ command = command + " " + alignmentParams;
+ }
+
+ return command;
+ }
+
+ public boolean outputsToStdout() {
+ return false;
+ }
+
+ public void checkForIndex(String referenceFile) {
+ return;
+ }
+}
diff --git a/src/nanook/BWAParser.java b/src/nanook/BWAParser.java
new file mode 100644
index 0000000..eff6fae
--- /dev/null
+++ b/src/nanook/BWAParser.java
@@ -0,0 +1,71 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett
+ *
+ * Copyright 2015 The Genome Analysis Centre (TGAC)
+ */
+
+package nanook;
+
+import java.io.File;
+
+/**
+ * Parser for BWA files
+ * @author Richard Leggett
+ */
+public class BWAParser extends SAMParser implements AlignmentFileParser {
+ private String alignmentParams = "-x ont2d";
+ private NanoOKOptions options;
+
+ public BWAParser(NanoOKOptions o, References r) {
+ super(o, r);
+ options = o;
+ }
+
+ public String getProgramID() {
+ return "bwa";
+ }
+
+ public int getReadFormat() {
+ int or = options.getReadFormat();
+ return or;
+
+ //return NanoOKOptions.FASTA;
+ }
+
+ public void setAlignmentParams(String p) {
+ alignmentParams = p;
+ }
+
+ public boolean outputsToStdout() {
+ return true;
+ }
+
+ public String getRunCommand(String query, String output, String reference) {
+ //reference = reference.replaceAll("\\.fasta$", "");
+ //reference = reference.replaceAll("\\.fa$", "");
+
+ return "bwa mem " + alignmentParams + " " + reference + " " + query;
+ }
+
+ public void checkForIndex(String referenceFile) {
+ String[] files = {referenceFile + ".fasta.bwt",
+ referenceFile + ".fasta.pac"};
+
+ for (int i=0; i<files.length; i++) {
+ File f = new File(files[i]);
+
+ if (!f.exists()) {
+ System.out.println("");
+ System.out.println("Error:");
+ System.out.println("Can't find file " + f.getPath());
+ System.out.println("Have you indexed the reference with bwa index?");
+ System.out.println("Will continue but anticipate failure at analyse stage.");
+ System.out.println("");
+ return;
+ }
+ }
+
+ return;
+ }
+}
diff --git a/src/nanook/BlastHandler.java b/src/nanook/BlastHandler.java
new file mode 100644
index 0000000..544f8c4
--- /dev/null
+++ b/src/nanook/BlastHandler.java
@@ -0,0 +1,202 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett (richard.leggett at earlham.ac.uk)
+ *
+ * Copyright 2015-17 Earlham Institute
+ */
+
+package nanook;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+
+public class BlastHandler {
+ private NanoOKOptions options = null;
+ private int type;
+ private int passfail;
+ private int nSeqs = 0;
+ private int fileCounter = 0;
+ private ArrayList<String> mergeList = new ArrayList<String>();
+
+ public BlastHandler(NanoOKOptions o, int t, int pf) {
+ options = o;
+ type = t;
+ passfail = pf;
+ if (options.getFileCounterOffset() > 0) {
+ fileCounter = options.getFileCounterOffset();
+ System.out.println("File offset "+fileCounter);
+ }
+ }
+
+ private void writeMeganFile() {
+ ArrayList<String> blastProcesses = options.getBlastProcesses();
+ String meganDir = options.getSampleDirectory() + File.separator + "megan";
+ File f = new File(meganDir);
+
+ if (!f.exists()) {
+ f.mkdir();
+ }
+
+ for (int i=0; i<blastProcesses.size(); i++) {
+ String[] params = blastProcesses.get(i).split(",");
+ if (params.length == 5) {
+ String blastName = params[0];
+ String blastTool = params[1];
+ String blastDb = params[2];
+ String memory = params[3];
+ String queue = params[4];
+ String cmdPathname = meganDir + File.separator + "all_" + NanoOKOptions.getTypeFromInt(type) +
+ "_" + NanoOKOptions.getPassFailFromInt(passfail) + "_" + Integer.toString(fileCounter) + ".cmds";
+ String meganPathname = meganDir + File.separator + "all_" + NanoOKOptions.getTypeFromInt(type) +
+ "_" + NanoOKOptions.getPassFailFromInt(passfail) + "_" + Integer.toString(fileCounter) + ".rma";
+ String slurmPathname = meganDir + File.separator + "all_" + NanoOKOptions.getTypeFromInt(type) +
+ "_" + NanoOKOptions.getPassFailFromInt(passfail) + "_" + Integer.toString(fileCounter) + ".slurm.sh";
+ String slurmLogname = meganDir + File.separator + "all_" + NanoOKOptions.getTypeFromInt(type) +
+ "_" + NanoOKOptions.getPassFailFromInt(passfail) + "_" + Integer.toString(fileCounter) + ".slurm.log";
+
+ try {
+ options.getLog().println("Writing MEGAN command file " + cmdPathname);
+ PrintWriter pw = new PrintWriter(new FileWriter(cmdPathname));
+ pw.println("setprop MaxNumberCores=4;");
+ String blastFileString="";
+ String fastaFileString="";
+
+ for (int fc=0; fc<=fileCounter; fc++) {
+ String fileName = "all_" + NanoOKOptions.getTypeFromInt(type) + "_" + NanoOKOptions.getPassFailFromInt(passfail) + "_" + Integer.toString(fc);
+ String fastaPathname = options.getReadDir() + "_chunks" + File.separator + fileName + (options.getReadFormat() == NanoOKOptions.FASTA ? ".fasta":".fastq");
+ String blastPathname = options.getSampleDirectory() + File.separator +
+ blastTool + "_" + blastName + File.separator +
+ fileName + "_" + blastTool + "_" + blastName + ".txt";
+ if (!blastFileString.isEmpty()) {
+ blastFileString += ",";
+ fastaFileString += ",";
+ }
+ fastaFileString = fastaFileString + "'" + fastaPathname + "'";
+ blastFileString = blastFileString + "'" + blastPathname + "'";
+ }
+
+ pw.print("import blastFile="+blastFileString+" fastaFile="+fastaFileString +" meganFile="+meganPathname);
+ pw.println(" maxMatches=100 maxExpected=0.001 minSupport=1 minComplexity=0;");
+ pw.println("quit;");
+ pw.close();
+
+ pw = new PrintWriter(new FileWriter(slurmPathname));
+ pw.print("slurmit -p TempProject4 -c 4 -o " + slurmLogname + " -m \"8G\" \"source MEGAN-5.11.3 ; ");
+ pw.println("xvfb-run -d MEGAN -g -c " + cmdPathname + " -L /tgac/workarea/group-si/BAMBI_Pt1/megan_support/MEGAN5-academic-license.txt\"");
+ pw.close();
+ } catch (Exception e) {
+ System.out.println("writeMeganFile exception");
+ e.printStackTrace();
+ }
+ }
+ }
+ }
+
+ private void runBlasts(String inputPathname) {
+ String formatString = "'6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore stitle'";
+ ArrayList<String> blastProcesses = options.getBlastProcesses();
+ File iff = new File(inputPathname);
+ String fileName = iff.getName();
+ String filePrefix = fileName;
+
+ if (filePrefix.contains(".")) {
+ filePrefix = fileName.substring(0, fileName.lastIndexOf('.'));
+ }
+
+ for (int i=0; i<blastProcesses.size(); i++) {
+ String[] params = blastProcesses.get(i).split(",");
+ if (params.length == 5) {
+ String blastName = params[0];
+ String blastTool = params[1];
+ String blastDb = params[2];
+ String memory = params[3];
+ String queue = params[4];
+ String outputBlast = options.getSampleDirectory() + File.separator +
+ blastTool + "_" + blastName + File.separator +
+ filePrefix + "_" + blastTool + "_" + blastName + ".txt";
+ String commandFile = options.getSampleDirectory() + File.separator +
+ blastTool + "_" + blastName + File.separator +
+ filePrefix + "_" + blastTool + "_" + blastName + ".sh";
+ String logFile = options.getLogsDir() + File.separator +
+ blastTool + "_" + blastName + File.separator +
+ filePrefix + "_" + blastTool + "_" + blastName + ".log";
+
+ options.getLog().println(" BLAST input: " + inputPathname);
+ options.getLog().println(" BLAST output: " + outputBlast);
+ options.getLog().println("BLAST command: " + commandFile);
+ options.getLog().println(" BLAST log: " + logFile);
+
+ try {
+ options.getLog().println("Writing blast command file "+commandFile);
+ PrintWriter pw = new PrintWriter(new FileWriter(commandFile));
+ // TODO: -task option shouldn't be hardcoded
+ pw.write(blastTool + " -db " + blastDb + " -query " + inputPathname + " -evalue 0.001 -show_gis -task blastn -out " + outputBlast + " -outfmt "+formatString);
+ pw.close();
+
+ options.getLog().println("Submitting blast command file to SLURM "+commandFile);
+ ProcessLogger pl = new ProcessLogger();
+ String[] commands = {"slurmit",
+ "-o", logFile,
+ "-p", queue,
+ "-m", memory,
+ "sh "+commandFile};
+ pl.runCommandToLog(commands, options.getLog());
+ } catch (IOException e) {
+ System.out.println("runBlast exception");
+ e.printStackTrace();
+ }
+ } else {
+ System.out.println("Badly formatted BLAST process: "+blastProcesses.get(i));
+ }
+ }
+ }
+
+ private String mergeInputFiles() {
+ String mergedPathname = options.getReadDir() +
+ "_chunks" + File.separator +
+ "all_" + NanoOKOptions.getTypeFromInt(type) + "_" + NanoOKOptions.getPassFailFromInt(passfail) + "_" +
+ Integer.toString(fileCounter) +
+ (options.getReadFormat() == NanoOKOptions.FASTA ? ".fasta":".fastq");
+
+ options.getLog().println("Writing merged file "+mergedPathname);
+
+ try {
+ PrintWriter pw = new PrintWriter(new FileWriter(mergedPathname));
+
+ for (int i=0; i<mergeList.size(); i++) {
+ BufferedReader br = new BufferedReader(new FileReader(mergeList.get(i)));
+ String line;
+ while ((line = br.readLine()) != null) {
+ pw.println(line);
+ }
+ br.close();
+ }
+ pw.close();
+ } catch (IOException e) {
+ System.out.println("mergeFiles exception");
+ e.printStackTrace();
+ }
+ return mergedPathname;
+ }
+
+ public synchronized void addRead(String readFilename) {
+ mergeList.add(readFilename);
+ nSeqs++;
+ if (nSeqs == options.getReadsPerBlast()) {
+ options.getLog().println("Merging files (nSeqs = "+nSeqs+")");
+ String mergedPathname = mergeInputFiles();
+ runBlasts(mergedPathname);
+ writeMeganFile();
+
+ //options.getThreadExecutor().execute(new FastAQMerger(options, mergedFilename, mergeList, fileCounter));
+ mergeList = new ArrayList();
+ fileCounter++;
+ nSeqs = 0;
+ }
+ }
+}
diff --git a/src/nanook/BlastMerger.java b/src/nanook/BlastMerger.java
new file mode 100644
index 0000000..6443ba0
--- /dev/null
+++ b/src/nanook/BlastMerger.java
@@ -0,0 +1,86 @@
+/*
+ * To change this license header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package nanook;
+
+import java.io.BufferedReader;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.text.DateFormat;
+import java.text.SimpleDateFormat;
+import java.util.Date;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+
+public class BlastMerger {
+ private transient PrintWriter pw = null;
+ private String filename = null;
+
+ public BlastMerger(NanoOKOptions options) {
+ }
+
+ public synchronized void open(String f, boolean clearLogs) {
+ if (clearLogs) {
+ filename = f + ".blast.txt";
+ } else {
+ DateFormat df = new SimpleDateFormat("ddMMyy_HHmmss");
+ Date dateobj = new Date();
+ filename = f + "_" + df.format(dateobj).toString()+".blast.txt";
+ }
+ System.out.println("Opening "+filename);
+
+ try {
+ pw = new PrintWriter(new FileWriter(filename, true));
+ } catch (IOException e) {
+ System.out.println("BlastMerger exception");
+ e.printStackTrace();
+ }
+ }
+
+ public synchronized void mergeFile(String fileToMerge) {
+ try {
+ String line;
+ BufferedReader br = new BufferedReader(new FileReader(fileToMerge));
+ while ((line = br.readLine()) != null) {
+ if (!line.startsWith("#")) {
+ pw.println(line);
+ }
+ }
+ pw.flush();
+ br.close();
+ } catch (Exception e) {
+ System.out.println("BlastMerger exception");
+ e.printStackTrace();
+ }
+ }
+
+ public synchronized void close() {
+ if (pw != null) {
+ pw.close();
+ }
+ }
+
+ public synchronized void print(String s) {
+ if (pw != null) {
+ pw.print(s);
+ pw.flush();
+ }
+ }
+
+ public synchronized void println(String s) {
+ if (pw != null) {
+ pw.println(s);
+ pw.flush();
+ }
+ }
+
+ public synchronized PrintWriter getPrintWriter() {
+ return pw;
+ }
+}
diff --git a/src/nanook/CIGARString.java b/src/nanook/CIGARString.java
new file mode 100644
index 0000000..733baa7
--- /dev/null
+++ b/src/nanook/CIGARString.java
@@ -0,0 +1,285 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett
+ *
+ * Copyright 2015 The Genome Analysis Centre (TGAC)
+ */
+
+package nanook;
+
+import java.util.ArrayList;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Represent and parse a CIGAR string
+ *
+ * @author Richard Leggett
+ */
+public class CIGARString {
+ private StringBuilder queryString = new StringBuilder("");
+ private StringBuilder hitString = new StringBuilder("");
+ private String hitFilename;
+ private String cigarString;
+ private String querySeq;
+ private String queryFilename;
+ private String queryID;
+ private ReferenceSequence hitReference;
+ private int queryStart;
+ private int hitStart;
+ private int queryAlnSize;
+ private int hitAlnSize;
+
+ /**
+ * Constructor
+ * @param cs CIGAR string
+ * @param qseq query sequence
+ * @param qf query filename
+ * @param qi query ID
+ * @param hs hit start position (0-based)
+ * @param hf hit (reference) FASTA filename
+ * @param hr hit ReferenceSequence
+ */
+ public CIGARString(String cs, String qseq, String qf, String qi, int hs, String hf, ReferenceSequence hr) {
+ cigarString = cs;
+ querySeq = qseq;
+ queryFilename = qf;
+ queryID = qi;
+ hitStart = hs;
+ hitFilename = hf;
+ hitReference = hr;
+ queryStart = 0;
+
+ //trimCIGAR(cs, qseq);
+ }
+
+ /**
+ * Attempt at handling marginAlign CIGAR strings
+ * Needs work!
+ * @param cs
+ * @param qseq
+ * @return
+ */
+ private String trimCIGAR(String cs, String qseq) {
+ //System.out.println("Old cigar: "+cs);
+
+ boolean foundStart = false;
+ int trimQueryStart = 0;
+ int trimCigarStart = 0;
+ int trimCigarEnd = 0;
+ int trimQueryEnd = 0;
+ Pattern outPattern = Pattern.compile("(\\d+)\\S");
+ Matcher outMatcher = outPattern.matcher(cs);
+ ArrayList<String> tags = new ArrayList();
+ while (outMatcher.find()) {
+ tags.add(outMatcher.group(0));
+ }
+
+
+ for (int i=0; i<tags.size(); i++) {
+ String tag = tags.get(i);
+ int n = Integer.parseInt(tag.substring(0, tag.length()-1));
+ String c = tag.substring(tag.length()-1);
+
+ if (c.equals("I")) {
+ queryStart += n;
+ trimQueryStart += n;
+ trimCigarStart += tag.length();
+ } else if (c.equals("D")) {
+ hitStart += n;
+ trimCigarStart += tag.length();
+ } else {
+ break;
+ }
+ }
+
+ for (int i=tags.size()-1; i>0; i--) {
+ String tag = tags.get(i);
+ int n = Integer.parseInt(tag.substring(0, tag.length()-1));
+ String c = tag.substring(tag.length()-1);
+
+ if (c.equals("I")) {
+ trimQueryEnd += n;
+ trimCigarEnd += tag.length();
+ } else if (c.equals("D")) {
+ trimCigarEnd += tag.length();
+ } else {
+ break;
+ }
+ }
+
+ cigarString = cs.substring(trimCigarStart, cs.length()-trimCigarEnd);
+ querySeq = qseq.substring(trimQueryStart, qseq.length()-trimQueryEnd);
+
+ //System.out.println("New cigar: "+cigarString);
+ return cigarString;
+ }
+
+ public boolean processString() {
+ String value = "";
+ SequenceReader r = new SequenceReader(true);
+ r.indexFASTAFile(hitFilename, null, true);
+ int l = 3*querySeq.length();
+ String hitSeq = r.getSubSequence(hitReference.getId(), hitStart, hitStart+l);
+ int hitPtr = 0;
+ int queryPtr = 0;
+ boolean displayResult = false;
+ boolean donePreClipping = false;
+ int tagCtr = 0;
+ int i = 0;
+ boolean continueParsing = true;
+ int totalCount = 0;
+ int delCount = 0;
+ int insCount = 0;
+ int matchCount = 0;
+ boolean processed = true;
+
+ //System.out.println("Query filename: "+queryFilename);
+ //System.out.println("CIGAR: "+cigarString);
+ //System.out.println(" Hit: "+hitSeq.length()+" "+hitSeq);
+ //System.out.println("Query: "+querySeq.length()+" "+querySeq);
+
+ hitAlnSize = 0;
+ queryAlnSize = 0;
+ while ((i<cigarString.length()) && (continueParsing)) {
+ //for (int i=0; i<cigarString.length(); i++) {
+ //System.out.println("hitPtr="+hitPtr+" queryPtr="+queryPtr);
+ //System.out.println("Query: " + queryString.toString());
+ //System.out.println(" Hit: " + hitString.toString());
+ char c = cigarString.charAt(i);
+
+ if (Character.isDigit(c)) {
+ value = value + c;
+ } else {
+ int n = Integer.parseInt(value);
+ totalCount += n;
+ //System.out.println(n + " " + c);
+ switch(c) {
+ case 'M':
+ case '=':
+ case 'X':
+ //System.out.println(hitString.length() + " " + hitPtr);
+ //System.out.println("Hit up: " + hitSeq.substring(hitPtr));
+ queryString.append(querySeq.substring(queryPtr, queryPtr + n));
+ hitString.append(hitSeq.substring(hitPtr, hitPtr + n));
+ queryPtr += n;
+ hitPtr += n;
+ queryAlnSize += n;
+ hitAlnSize += n;
+ donePreClipping = true;
+ matchCount+=n;
+ break;
+ case 'I':
+ if (n > 100) {
+ // DEBUG MODE TURNS OFF THIS
+ System.out.println("");
+ System.out.println("Error: large I ("+n+") - read "+queryID+" ignored");
+ processed = false;
+ continueParsing = false;
+ } else {
+ queryString.append(querySeq.substring(queryPtr, queryPtr + n));
+ for (int j=0; j<n; j++) {
+ hitString.append('-');
+ }
+ queryPtr += n;
+ queryAlnSize += n;
+ }
+ donePreClipping = true;
+ insCount+=n;
+ break;
+ case 'D':
+ if (n > 100) {
+ System.out.println("Error: large D ("+n+") - read "+queryID+" ignored");
+ processed = false;
+ continueParsing = false;
+ } else {
+ hitString.append(hitSeq.substring(hitPtr, hitPtr + n));
+ for (int j=0; j<n; j++) {
+ queryString.append('-');
+ }
+ hitPtr += n;
+ hitAlnSize += n;
+ }
+ donePreClipping = true;
+ delCount+=n;
+ break;
+ case 'N':
+ System.out.println("Warning: encountered N in CIGAR format!");
+ System.out.println("");
+ displayResult = true;
+ hitString.append(hitSeq.substring(hitPtr, hitPtr + n));
+ for (int j=0; j<n; j++) {
+ queryString.append('-');
+ }
+ queryPtr += n;
+ hitPtr += n;
+ donePreClipping = true;
+ break;
+ case 'S':
+ //System.out.println("Warning: encountered S in CIGAR format!");
+ queryPtr += n;
+ if (!donePreClipping) {
+ queryStart += n;
+ }
+ displayResult = true;
+ break;
+ case 'H':
+ //System.out.println("Warning: encountered H in CIGAR format!");
+ if (!donePreClipping) {
+ queryStart += n;
+ } else {
+ //System.out.println("Warning: hard clipping at end");
+ }
+ displayResult = true;
+ break;
+ case 'P':
+ System.out.println("Warning: encountered P in CIGAR format!");
+ System.out.println("");
+ displayResult = true;
+ donePreClipping = true;
+ break;
+ default:
+ System.out.println("Unrecognised character in CIGAR string: "+c);
+ processed = false;
+ break;
+ }
+ value="";
+ tagCtr++;
+ //System.out.println("qseq="+querySeq.length()+" matchCount="+matchCount+" insCount="+insCount+" delCount="+delCount+" totalCount="+totalCount);
+ //System.out.println("Query: "+queryString.toString());
+ //System.out.println(" Hit: "+hitString.toString());
+ }
+
+ i++;
+ //System.out.println("i="+i+" and length="+cigarString.length());
+ }
+
+ //if (displayResult) {
+ //System.out.println(queryFilename);
+ //System.out.println("Query: " + queryString.toString());
+ //System.out.println(" Hit: " + hitString.toString());
+ //System.exit(1);
+ //}
+ return processed;
+ }
+
+ public int getQueryStart() {
+ return queryStart;
+ }
+
+ public int getQueryAlnSize() {
+ return queryAlnSize;
+ }
+
+ public int getHitAlnSize() {
+ return hitAlnSize;
+ }
+
+ public String getQueryString() {
+ return queryString.toString();
+ }
+
+ public String getHitString() {
+ return hitString.toString();
+ }
+}
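
processString() above reconstructs gapped query/hit strings from a CIGAR string and the indexed reference. Below is a stripped-down, self-contained illustration of that expansion under stated assumptions: the class name and toy sequences are invented, and only the M/=/X, I, D and S operations are handled:

    package nanook;

    // Hypothetical example, not part of the upstream source; sequences are toy data.
    public class CigarExpansionExample {
        public static void main(String[] args) {
            String cigar = "4M1I3M2D2M";
            String query = "ACGTAGCATT";   // consumed by M and I operations (10 bases)
            String hit   = "ACGTGCAGGTT";  // consumed by M and D operations (11 bases)
            StringBuilder q = new StringBuilder();
            StringBuilder h = new StringBuilder();
            int qp = 0, hp = 0, n = 0;
            for (char c : cigar.toCharArray()) {
                if (Character.isDigit(c)) {
                    n = n * 10 + (c - '0');   // accumulate multi-digit lengths
                    continue;
                }
                for (int i = 0; i < n; i++) {
                    switch (c) {
                        case 'M': case '=': case 'X':  // aligned base in both sequences
                            q.append(query.charAt(qp++)); h.append(hit.charAt(hp++)); break;
                        case 'I':                      // base in the read, gap in the reference
                            q.append(query.charAt(qp++)); h.append('-'); break;
                        case 'D':                      // base in the reference, gap in the read
                            q.append('-'); h.append(hit.charAt(hp++)); break;
                        case 'S':                      // soft clip: consumes query only
                            qp++; break;
                        default:                       // N, H, P not handled in this sketch
                            break;
                    }
                }
                n = 0;
            }
            System.out.println("Query: " + q);   // ACGTAGCA--TT
            System.out.println("  Hit: " + h);   // ACGT-GCAGGTT
        }
    }
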
diff --git a/src/nanook/ComparisonReportWriter.java b/src/nanook/ComparisonReportWriter.java
new file mode 100644
index 0000000..fc8a84d
--- /dev/null
+++ b/src/nanook/ComparisonReportWriter.java
@@ -0,0 +1,196 @@
+package nanook;
+
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+
+/**
+ *
+ * @author Richard Leggett
+ */
+public class ComparisonReportWriter {
+ private NanoOKOptions options;
+ private PrintWriter pw = null;
+ private SampleComparer sampleComparer = null;
+
+ public ComparisonReportWriter(NanoOKOptions o, SampleComparer sc) {
+ options = o;
+ sampleComparer = sc;
+ }
+
+ /**
+ * Check if graphic file exists and only insert if it does
+ * @param preTex LaTeX before filename
+ * @param filename the file
+ * @param postTex LaTeX after filename
+ */
+ private void includeGraphicsIfExists(int type, String preTex, String filename, String postTex) {
+ if (options.isProcessingReadType(type)) {
+ String fullFilename = filename + "." + options.getImageFormat();
+ File f = new File(fullFilename);
+
+ if (f.exists()) {
+ pw.print(preTex);
+ pw.print(fullFilename);
+ pw.println(postTex);
+ } else {
+ pw.print(" ");
+ }
+ }
+ }
+
+ /**
+ * Open the .tex file.
+ */
+ public void open() {
+ try {
+ pw = new PrintWriter(new FileWriter(options.getLatexDir() + File.separator + "comparison.tex"));
+ writeLaTeXHeader();
+ } catch (IOException e) {
+ System.out.println("ReportWriter exception:");
+ e.printStackTrace();
+ System.exit(1);
+ }
+ }
+
+ /**
+ * Write end of LaTeX file.
+ */
+ private void writeLaTeXFooter() {
+ pw.println("\\end{document}");
+ }
+
+ /**
+ * Close the .tex file.
+ */
+ public void close() {
+ writeLaTeXFooter();
+ pw.close();
+ }
+
+ /**
+ * Write the top of the LaTeX document.
+ */
+ private void writeLaTeXHeader() {
+ pw.println("\\documentclass[a4paper,11pt,oneside]{article}");
+ pw.println("\\usepackage{graphicx}");
+ pw.println("\\usepackage{url}");
+ pw.println("\\usepackage{multirow}");
+ pw.println("\\usepackage{rotating}");
+ pw.println("\\usepackage{color}");
+ pw.println("\\usepackage[compact]{titlesec}");
+ pw.println("\\usepackage[portrait,top=1cm, bottom=2cm, left=1cm, right=1cm]{geometry}");
+ pw.println("\\usepackage{float}");
+ pw.println("\\restylefloat{table}");
+ pw.println("\\begin{document}");
+ pw.println("\\renewcommand*{\\familydefault}{\\sfdefault}");
+ pw.println("\\section*{\\large{NanoOK comparison report}}");
+ }
+
+ private void writeLengthSection() {
+ String graphSize = "height=5.2cm";
+ int type = options.getSpecifiedType();
+
+ pw.println("\\subsection*{Read lengths}");
+ includeGraphicsIfExists(type, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + NanoOKOptions.getTypeFromInt(type)+"_lengths", "} \\\\");
+
+ pw.println("\\vspace{-3mm}");
+ pw.println("\\subsection*{Number of reads}");
+ includeGraphicsIfExists(type, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + NanoOKOptions.getTypeFromInt(type)+"_number_of_reads", "} \\\\");
+
+ pw.println("\\vspace{-3mm}");
+ pw.println("\\subsection*{Total bases}");
+ includeGraphicsIfExists(type, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + NanoOKOptions.getTypeFromInt(type)+"_total_bases", "} \\\\");
+
+ pw.println("\\vspace{-3mm}");
+ pw.println("\\subsection*{Alignment summary}");
+ includeGraphicsIfExists(type, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + NanoOKOptions.getTypeFromInt(type)+"_maps", "} \\\\");
+ }
+
+ public void writeReferenceSection(ReferenceSequence refSeq) {
+ String id = refSeq.getName().replaceAll("_", " ");
+ String graphSize = "height=6cm";
+ int type = options.getSpecifiedType();
+
+ pw.println("\\clearpage");
+ pw.println("\\subsection*{" + id + " identity}");
+ includeGraphicsIfExists(type, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + refSeq.getName() + "_" + NanoOKOptions.getTypeFromInt(type)+"_query_identity", "} \\\\");
+ includeGraphicsIfExists(type, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + refSeq.getName() + "_" + NanoOKOptions.getTypeFromInt(type)+"_query_identity_zoom", "} \\\\");
+ includeGraphicsIfExists(type, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + refSeq.getName() + "_" + NanoOKOptions.getTypeFromInt(type)+"_percent_query_aligned", "} \\\\");
+ includeGraphicsIfExists(type, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + refSeq.getName() + "_" + NanoOKOptions.getTypeFromInt(type)+"_percent_query_aligned_zoom", "} \\\\");
+
+ pw.println("\\subsection*{" + id + " best perfect kmer}");
+ includeGraphicsIfExists(type, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + refSeq.getName() + "_" + NanoOKOptions.getTypeFromInt(type)+"_best_perfect_kmer", "} \\\\");
+
+ pw.println("\\subsection*{" + id + " GC}");
+ includeGraphicsIfExists(type, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + refSeq.getName() + "_" + NanoOKOptions.getTypeFromInt(type)+"_query_gc", "} \\\\");
+
+ for (int ou=0; ou<2; ou++) {
+ if (ou == 0) {
+ pw.println("\\subsection*{" + id + " " + NanoOKOptions.getTypeFromInt(type) + " Over-represented 5-mers}");
+ } else {
+ pw.println("\\subsection*{" + id + " " + NanoOKOptions.getTypeFromInt(type) + " Under-represented 5-mers}");
+ }
+
+ pw.println("\\vspace{-3mm}");
+ pw.println("\\begin{table}[H]");
+ pw.println("{\\footnotesize");
+ pw.println("\\fontsize{8pt}{10pt}\\selectfont");
+ pw.println("\\tabcolsep=0.15cm");
+ pw.println("\\begin{tabular}{|c|c c c c c c c c c c|}");
+ pw.println("\\cline{1-11}");
+ pw.println("Sample & 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10 \\\\");
+ pw.println("\\cline{1-11}");
+ for (int i=0; i<sampleComparer.getNumberOfSamples(); i++) {
+ OverallStats os = sampleComparer.getSample(i);
+ ReferenceSequence rs = os.getStatsByType(type).getOptions().getReferences().getReferenceById(refSeq.getId());
+ rs.getStatsByType(type).sortKmerAbundance();
+ ArrayList<KmerAbundance> ka = rs.getStatsByType(type).getKmerAbundance();
+ pw.print(sampleComparer.getSampleName(i).replaceAll("_", "\\\\_"));
+ for (int j=0; j<10; j++) {
+ KmerAbundance ko;
+
+ if (ou == 0) {
+ ko = ka.get(j);
+ } else {
+ ko = ka.get(ka.size() - 1 - j);
+ }
+ pw.print(" & "+ko.getKmer());
+ }
+ pw.println(" \\\\");
+ }
+ pw.println("\\cline{1-11}");
+ pw.println("\\end{tabular}");
+ pw.println("}");
+ pw.println("\\end{table}");
+ }
+ }
+
+ private void writeReferenceSection() {
+ ArrayList<ReferenceSequence> sortedRefs = options.getReferences().getSortedReferences();
+ for (int i=0; i<sortedRefs.size(); i++) {
+ ReferenceSequence rs = sortedRefs.get(i);
+
+ if ((options.debugMode() && (!rs.getName().equalsIgnoreCase("DNA_CS")))) {
+ writeReferenceSection(rs);
+ }
+ }
+ }
+
+ public void writeReport() {
+ open();
+ writeLengthSection();
+ writeReferenceSection();
+ close();
+ }
+
+ public void makePDF() {
+ ProcessLogger pl = new ProcessLogger();
+ String command = "pdflatex -interaction=nonstopmode -output-directory " + options.getLatexDir() + " " + options.getLatexDir() + File.separator + "comparison.tex";
+ String logFilename = options.getLogsDir() + File.separator + "pdflatex_output_log_comparison.txt";
+ System.out.println("pdflatex output " + logFilename);
+ pl.runAndLogCommand(command, logFilename, false);
+ }
+}
diff --git a/src/nanook/DirectoryWatcher.java b/src/nanook/DirectoryWatcher.java
new file mode 100644
index 0000000..8f2d1b4
--- /dev/null
+++ b/src/nanook/DirectoryWatcher.java
@@ -0,0 +1,184 @@
+package nanook;
+
+import java.io.*;
+import java.util.*;
+import java.io.File;
+import java.nio.file.*;
+import static java.nio.file.StandardWatchEventKinds.*;
+import static java.nio.file.LinkOption.*;
+import java.nio.file.attribute.*;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.concurrent.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import org.apache.commons.io.monitor.FileAlterationListener;
+import org.apache.commons.io.monitor.FileAlterationListenerAdaptor;
+import org.apache.commons.io.monitor.FileAlterationMonitor;
+import org.apache.commons.io.monitor.FileAlterationObserver;
+
+
+public class DirectoryWatcher implements FileAlterationListener {
+ private WatchService watcher = null;
+ private final Map<WatchKey,Path> keys;
+ private NanoOKOptions options;
+ private ReadAligner aligner;
+ private AlignmentFileParser parser;
+ private boolean keepWatching = true;
+
+ public DirectoryWatcher(NanoOKOptions o, ReadAligner a, AlignmentFileParser p) {
+ options = o;
+ aligner = a;
+ parser = p;
+
+ keys = new HashMap<WatchKey,Path>();
+
+
+ }
+
+ public void onStop(FileAlterationObserver observer) {};
+ public void onStart(FileAlterationObserver observer) {};
+ public void onFileDelete(File file) {};
+ public void onFileChange(File file) {};
+ public void onDirectoryDelete(File directory) {};
+ public void onDirectoryCreate(File directory) {};
+ public void onDirectoryChange(File directory) {};
+
+ public void onFileCreate(File file) {
+ Path child = file.toPath();
+
+ if (file.getName().toString().equals("stop")) {
+ keepWatching = false;
+ System.out.println("Stopping...");
+ } else if (file.getName().toString().endsWith(".fast5")) {
+ // print out event
+ System.out.println("Got new file " + file.getName());
+ String pf = child.getName(child.getNameCount() - 2).toString();
+ String fastaqDir = child.getParent().getParent().getParent().toString() + File.separator + "fasta" + File.separator + pf;
+ String alignDir = options.getAlignerDir() + File.separator + pf;
+ String logDir = options.getLogsDir() + File.separator + options.getAligner() + File.separator + pf;
+
+ options.getThreadExecutor().execute(new WatcherRunnable(options, child.getParent().toString(), child.getFileName().toString(), pf, fastaqDir, alignDir, parser));
+ }
+
+ }
+
+ private void checkAndMakeDirectory(String dir) {
+ File f = new File(dir);
+ if (f.exists()) {
+ if (!f.isDirectory()) {
+ System.out.println("Error: " + dir + " is a file, not a directory!");
+ System.exit(1);
+ }
+ } else {
+ System.out.println("Making directory " + dir);
+ f.mkdir();
+ }
+ }
+
+ private void makeDirs(String pf) {
+ checkAndMakeDirectory(options.getReadDir() + File.separator + pf);
+ checkAndMakeDirectory(options.getLogsDir() + File.separator + "blastn_card" + File.separator + pf);
+ checkAndMakeDirectory(options.getLogsDir() + File.separator + "blastn_nt" + File.separator + pf);
+ checkAndMakeDirectory(options.getSampleDirectory() + File.separator + "blastn_card" + File.separator + pf);
+ checkAndMakeDirectory(options.getSampleDirectory() + File.separator + "blastn_nt" + File.separator + pf);
+
+ // Make output Template, Complement and 2D directories
+ for (int t=0; t<3; t++) {
+ if (options.isProcessingReadType(t)) {
+ checkAndMakeDirectory(options.getReadDir() + File.separator + pf + File.separator + NanoOKOptions.getTypeFromInt(t));
+ checkAndMakeDirectory(options.getSampleDirectory() + File.separator + "blastn_card" + File.separator + pf + File.separator + NanoOKOptions.getTypeFromInt(t));
+ checkAndMakeDirectory(options.getSampleDirectory() + File.separator + "blastn_nt" + File.separator + pf + File.separator + NanoOKOptions.getTypeFromInt(t));
+ checkAndMakeDirectory(options.getLogsDir() + File.separator + "blastn_card" + File.separator + pf + File.separator + NanoOKOptions.getTypeFromInt(t));
+ checkAndMakeDirectory(options.getLogsDir() + File.separator + "blastn_nt" + File.separator + pf + File.separator + NanoOKOptions.getTypeFromInt(t));
+ }
+ }
+ }
+
+ public void watch() {
+ checkAndMakeDirectory(options.getFast5Dir());
+ checkAndMakeDirectory(options.getFast5Dir() + File.separator + "pass");
+ checkAndMakeDirectory(options.getFast5Dir() + File.separator + "fail");
+ checkAndMakeDirectory(options.getLogsDir());
+ checkAndMakeDirectory(options.getReadDir());
+ checkAndMakeDirectory(options.getLogsDir() + File.separator + "blastn_card");
+ checkAndMakeDirectory(options.getLogsDir() + File.separator + "blastn_bacteria");
+ checkAndMakeDirectory(options.getLogsDir() + File.separator + "blastn_nt");
+ checkAndMakeDirectory(options.getSampleDirectory() + File.separator + "blastn_card");
+ checkAndMakeDirectory(options.getSampleDirectory() + File.separator + "blastn_bacteria");
+ checkAndMakeDirectory(options.getSampleDirectory() + File.separator + "blastn_nt");
+
+ System.out.println("Opening logs");
+ options.getWatcherReadLog().open(options.getLogsDir() + File.separator + "watcher_reads", options.clearLogsOnStart());
+ options.getWatcherCardFileLog().open(options.getLogsDir() + File.separator + "watcher_CARD_files", options.clearLogsOnStart());
+ options.getWatcherCardCommandLog().open(options.getLogsDir() + File.separator + "watcher_CARD_commands", options.clearLogsOnStart());
+ options.getWatcherntFileLog().open(options.getLogsDir() + File.separator + "watcher_nt_files", options.clearLogsOnStart());
+ options.getWatcherntCommandLog().open(options.getLogsDir() + File.separator + "watcher_nt_commands", options.clearLogsOnStart());
+
+ options.getMergerCardPass().open(options.getSampleDirectory() + File.separator + "blastn_card" + File.separator + "all_pass_blastn_card", options.clearLogsOnStart());
+ options.getMergerCardFail().open(options.getSampleDirectory() + File.separator + "blastn_card" + File.separator + "all_fail_blastn_card", options.clearLogsOnStart());
+ //options.getMergerntPass().open(options.getSampleDirectory() + File.separator + "blastn_nt" + File.separator + "all_pass_blastn_nt", options.clearLogsOnStart());
+ //options.getMergerntFail().open(options.getSampleDirectory() + File.separator + "blastn_nt" + File.separator + "all_fail_blastn_nt", options.clearLogsOnStart());
+
+ System.out.println("Watching for new files...");
+ try {
+ FileAlterationMonitor monitor = new FileAlterationMonitor(500);
+
+ watcher = FileSystems.getDefault().newWatchService();
+
+ if (options.isProcessingPassReads()) {
+ String dirName = options.getFast5Dir() + File.separator + "pass";
+ Path passDir = Paths.get(dirName);
+
+ options.openMergedFile(options.getReadDir() + File.separator + options.getSample()+"_pass_1d", NanoOKOptions.TYPE_TEMPLATE, NanoOKOptions.READTYPE_PASS);
+ options.openMergedFile(options.getReadDir() + File.separator + options.getSample()+"_pass_2d", NanoOKOptions.TYPE_2D, NanoOKOptions.READTYPE_PASS);
+
+ FileAlterationObserver observer = new FileAlterationObserver(dirName);
+ observer.addListener(this);
+ monitor.addObserver(observer);
+
+ System.out.println("Watching "+dirName);
+ makeDirs("pass");
+ }
+
+ if (options.isProcessingFailReads()) {
+ String dirName = options.getFast5Dir() + File.separator + "fail";
+ Path failDir = Paths.get(dirName);
+
+ options.openMergedFile(options.getReadDir() + File.separator + options.getSample()+"_fail_1d", NanoOKOptions.TYPE_TEMPLATE, NanoOKOptions.READTYPE_FAIL);
+ options.openMergedFile(options.getReadDir() + File.separator + options.getSample()+"_fail_2d", NanoOKOptions.TYPE_2D, NanoOKOptions.READTYPE_FAIL);
+
+ FileAlterationObserver observer = new FileAlterationObserver(dirName);
+ observer.addListener(this);
+ monitor.addObserver(observer);
+
+ System.out.println("Watching "+dirName);
+ makeDirs("fail");
+ }
+
+ System.out.println("Waiting...\n");
+ monitor.start();
+ while (keepWatching) {
+ Thread.sleep(1000);
+ }
+ monitor.stop();
+ } catch (Exception e) {
+ System.out.println("ReadExtractor exception:");
+ e.printStackTrace();
+ System.exit(1);
+ }
+
+ System.out.println("Closing logs");
+ options.getWatcherReadLog().close();
+ options.getWatcherCardFileLog().close();
+ options.getWatcherCardCommandLog().close();
+ options.getWatcherntFileLog().close();
+ options.getWatcherntCommandLog().close();
+ options.getMergerCardPass().close();
+ options.getMergerntPass().close();
+ options.getMergerCardFail().close();
+ options.getMergerntFail().close();
+ }
+}
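DirectoryWatcher polls the fast5 pass/fail directories every 500 ms through commons-io's FileAlterationMonitor; each new .fast5 file is handed to a WatcherRunnable on the options thread pool, and a sentinel file literally named "stop" ends the loop. As an illustrative example (the path is a placeholder):

    touch <sampleDir>/fast5/pass/stop    # onFileCreate() sets keepWatching to false and the monitor stops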
diff --git a/src/nanook/DirectoryWatcherNative.java b/src/nanook/DirectoryWatcherNative.java
new file mode 100644
index 0000000..7dc10b7
--- /dev/null
+++ b/src/nanook/DirectoryWatcherNative.java
@@ -0,0 +1,219 @@
+package nanook;
+
+import java.io.*;
+import java.util.*;
+import java.io.File;
+import java.nio.file.*;
+import static java.nio.file.StandardWatchEventKinds.*;
+import static java.nio.file.LinkOption.*;
+import java.nio.file.attribute.*;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.concurrent.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import org.apache.commons.io.monitor.FileAlterationListener;
+import org.apache.commons.io.monitor.FileAlterationListenerAdaptor;
+import org.apache.commons.io.monitor.FileAlterationMonitor;
+import org.apache.commons.io.monitor.FileAlterationObserver;
+
+
+public class DirectoryWatcherNative implements FileAlterationListener {
+ private WatchService watcher = null;
+ private final Map<WatchKey,Path> keys;
+ private NanoOKOptions options;
+ private ReadAligner aligner;
+ private AlignmentFileParser parser;
+ private boolean keepWatching = true;
+
+ public DirectoryWatcherNative(NanoOKOptions o, ReadAligner a, AlignmentFileParser p) {
+ options = o;
+ aligner = a;
+ parser = p;
+
+ keys = new HashMap<WatchKey,Path>();
+ }
+
+ @SuppressWarnings("unchecked")
+ static <T> WatchEvent<T> cast(WatchEvent<?> event) {
+ return (WatchEvent<T>)event;
+ }
+
+ public void onFileCreate(File file) {
+ Path child = file.toPath();
+ System.out.println("Created: "+file.getName());
+
+ if (file.getName().toString().equals("stop")) {
+ keepWatching = false;
+ System.out.println("Stopping...");
+ } else if (file.getName().toString().endsWith(".fast5")) {
+ // print out event
+ System.out.println("Got new file " + file.getName());
+ String pf = child.getName(child.getNameCount() - 2).toString();
+ String fastaqDir = child.getParent().getParent().getParent().toString() + File.separator + "fasta" + File.separator + pf;
+ String alignDir = options.getAlignerDir() + File.separator + pf;
+ String logDir = options.getLogsDir() + File.separator + options.getAligner() + File.separator + pf;
+
+ //executor.execute(new WatcherRunnable(options, child.getParent().toString(), child.getFileName().toString(), fastaqDir, alignDir, parser));
+ }
+
+ }
+
+ /**
+ * Process all events for keys queued to the watcher
+ */
+ void processEvents() {
+ System.out.println("Waiting...\n");
+ while (keepWatching) {
+
+ // wait for key to be signalled
+ WatchKey key;
+ try {
+ key = watcher.take();
+ } catch (InterruptedException x) {
+ return;
+ }
+
+ Path dir = keys.get(key);
+ if (dir == null) {
+ System.err.println("WatchKey not recognized!!");
+ continue;
+ }
+
+ for (WatchEvent<?> event: key.pollEvents()) {
+ WatchEvent.Kind kind = event.kind();
+
+ // Context for directory entry event is the file name of entry
+ WatchEvent<Path> ev = cast(event);
+ Path name = ev.context();
+ Path child = dir.resolve(name);
+
+ System.out.println("File "+child.getFileName().toString());
+ if (child.getFileName().toString().equals("stop")) {
+ keepWatching = false;
+ System.out.println("Stopping...");
+ } else if (child.getFileName().toString().endsWith(".fast5")) {
+ // print out event
+ System.out.println("Got new file " + child);
+ String pf = child.getName(child.getNameCount() - 2).toString();
+ String fastaqDir = child.getParent().getParent().getParent().toString() + File.separator + "fasta" + File.separator + pf;
+ String alignDir = options.getAlignerDir() + File.separator + pf;
+ String logDir = options.getLogsDir() + File.separator + options.getAligner() + File.separator + pf;
+
+ //executor.execute(new WatcherRunnable(options, child.getParent().toString(), child.getFileName().toString(), fastaqDir, alignDir, parser));
+ }
+ }
+
+ // reset key and remove from set if directory no longer accessible
+ boolean valid = key.reset();
+ if (!valid) {
+ keys.remove(key);
+
+ // all directories are inaccessible
+ if (keys.isEmpty()) {
+ break;
+ }
+ }
+ }
+ }
+
+ private void checkAndMakeDirectory(String dir) {
+ File f = new File(dir);
+ if (f.exists()) {
+ if (!f.isDirectory()) {
+ System.out.println("Error: " + dir + " is a file, not a directory!");
+ System.exit(1);
+ }
+ } else {
+ System.out.println("Making directory " + dir);
+ f.mkdir();
+ }
+ }
+
+ private void makeDirs(String pf) {
+ checkAndMakeDirectory(options.getReadDir());
+ checkAndMakeDirectory(options.getReadDir() + File.separator + pf);
+ checkAndMakeDirectory(options.getLogsDir());
+ checkAndMakeDirectory(options.getLogsDir() + File.separator + "blastn_card");
+ checkAndMakeDirectory(options.getLogsDir() + File.separator + "blastn_nt");
+ checkAndMakeDirectory(options.getLogsDir() + File.separator + "blastn_card" + File.separator + pf);
+ checkAndMakeDirectory(options.getLogsDir() + File.separator + "blastn_nt" + File.separator + pf);
+ checkAndMakeDirectory(options.getSampleDirectory() + File.separator + "blastn_card");
+ checkAndMakeDirectory(options.getSampleDirectory() + File.separator + "blastn_card" + File.separator + pf);
+ checkAndMakeDirectory(options.getSampleDirectory() + File.separator + "blastn_nt");
+ checkAndMakeDirectory(options.getSampleDirectory() + File.separator + "blastn_nt" + File.separator + pf);
+
+ // Make output Template, Complement and 2D directories
+ for (int t=0; t<3; t++) {
+ if (options.isProcessingReadType(t)) {
+ checkAndMakeDirectory(options.getReadDir() + File.separator + pf + File.separator + NanoOKOptions.getTypeFromInt(t));
+ checkAndMakeDirectory(options.getSampleDirectory() + File.separator + "blastn_card" + File.separator + pf + File.separator + NanoOKOptions.getTypeFromInt(t));
+ checkAndMakeDirectory(options.getSampleDirectory() + File.separator + "blastn_nt" + File.separator + pf + File.separator + NanoOKOptions.getTypeFromInt(t));
+ checkAndMakeDirectory(options.getLogsDir() + File.separator + "blastn_card" + File.separator + pf + File.separator + NanoOKOptions.getTypeFromInt(t));
+ checkAndMakeDirectory(options.getLogsDir() + File.separator + "blastn_nt" + File.separator + pf + File.separator + NanoOKOptions.getTypeFromInt(t));
+ }
+ }
+ }
+
+ public void watch() {
+ System.out.println("Opening logs");
+ options.getWatcherReadLog().open(options.getLogsDir() + File.separator + "watcher_reads", options.clearLogsOnStart());
+ options.getWatcherCardFileLog().open(options.getLogsDir() + File.separator + "watcher_CARD_files", options.clearLogsOnStart());
+ options.getWatcherCardCommandLog().open(options.getLogsDir() + File.separator + "watcher_CARD_commands", options.clearLogsOnStart());
+ options.getWatcherntFileLog().open(options.getLogsDir() + File.separator + "watcher_nt_files", options.clearLogsOnStart());
+ options.getWatcherntCommandLog().open(options.getLogsDir() + File.separator + "watcher_nt_commands", options.clearLogsOnStart());
+
+ System.out.println("Watching for new files...");
+ try {
+ watcher = FileSystems.getDefault().newWatchService();
+
+ if (options.isProcessingPassReads()) {
+ String dirName = options.getFast5Dir() + File.separator + "pass";
+ Path passDir = Paths.get(dirName);
+
+ FileAlterationObserver observer = new FileAlterationObserver(dirName);
+ FileAlterationMonitor monitor = new FileAlterationMonitor(500);
+ observer.addListener(this);
+ monitor.addObserver(observer);
+ monitor.start();
+
+ System.out.println("Watching "+dirName);
+ makeDirs("pass");
+ //WatchKey passKey = passDir.register(watcher, ENTRY_CREATE);
+ //keys.put(passKey, passDir);
+ }
+
+ if (options.isProcessingFailReads()) {
+ String dirName = options.getFast5Dir() + File.separator + "fail";
+ Path failDir = Paths.get(dirName);
+ System.out.println("Watching "+dirName);
+ makeDirs("fail");
+ //WatchKey failKey = failDir.register(watcher, ENTRY_CREATE);
+ //keys.put(failKey, failDir);
+ }
+
+ this.processEvents();
+ } catch (Exception e) {
+ System.out.println("ReadExtractor exception:");
+ e.printStackTrace();
+ System.exit(1);
+ }
+
+ System.out.println("Closing logs");
+ options.getWatcherReadLog().close();
+ options.getWatcherCardFileLog().close();
+ options.getWatcherCardCommandLog().close();
+ options.getWatcherntFileLog().close();
+ options.getWatcherntCommandLog().close();
+ }
+
+ public void onStop(FileAlterationObserver observer) {};
+ public void onStart(FileAlterationObserver observer) {};
+ public void onFileDelete(File file) {};
+ public void onFileChange(File file) {};
+ public void onDirectoryDelete(File directory) {};
+ public void onDirectoryCreate(File directory) {};
+ public void onDirectoryChange(File directory) {};
+}
diff --git a/src/nanook/Fast5File.java b/src/nanook/Fast5File.java
new file mode 100644
index 0000000..209e699
--- /dev/null
+++ b/src/nanook/Fast5File.java
@@ -0,0 +1,492 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett
+ *
+ * Copyright 2015 The Genome Analysis Centre (TGAC)
+ */
+
+package nanook;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.Hashtable;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Representation of a FAST5 file
+ * @author leggettr
+ */
+public class Fast5File {
+ private NanoOKOptions options;
+ private String filename = null;
+ private HashSet<String> groups = new HashSet();
+ private HashSet<String> datasets = new HashSet();
+ private NanoOKLog log;
+ private boolean oldFormat = false;
+ private boolean isCorrupt = false;
+ private int highestBasecall1D = -1;
+ private int highestBasecall2D = -1;
+ private double meanQScore = 0;
+
+ /**
+ * Constructor
+ * @param f
+ */
+ public Fast5File(NanoOKOptions o, String f) {
+ options = o;
+ filename = f;
+ log = options.getLog();
+ indexFile();
+ }
+
+ /**
+ * Index groups and datasets
+ */
+ public void indexFile() {
+ boolean[] typesAvailable = new boolean[3];
+ ProcessLogger pl = new ProcessLogger();
+ ArrayList<String> response;
+
+ log.println("Indexing file "+filename);
+
+ response = pl.getCommandOutput("h5dump -n "+filename, true, true);
+ for (int i=0; i<response.size(); i++) {
+ String s = response.get(i).trim();
+ String[] cols = s.split("(\\s+)");
+ if (cols[0].equals("dataset")) {
+ datasets.add(cols[1]);
+ } else if (cols[0].equals("group")) {
+ groups.add(cols[1]);
+ if (cols[1].startsWith("/Analyses/Basecall_2D_")) {
+ Pattern outPattern = Pattern.compile("^/Analyses/Basecall_2D_(\\d+)$");
+ Matcher outMatcher = outPattern.matcher(cols[1]);
+ if (outMatcher.find()) {
+ int index = Integer.parseInt(outMatcher.group(1));
+ if (index > highestBasecall2D) {
+ highestBasecall2D = index;
+ }
+ }
+ } else if (cols[1].startsWith("/Analyses/Basecall_1D_")) {
+ Pattern outPattern = Pattern.compile("^/Analyses/Basecall_1D_(\\d+)$");
+ Matcher outMatcher = outPattern.matcher(cols[1]);
+ if (outMatcher.find()) {
+ int index = Integer.parseInt(outMatcher.group(1));
+ if (index > highestBasecall1D) {
+ highestBasecall1D = index;
+ }
+ }
+ }
+ }
+ }
+
+ // Old format files did not have separate Basecall_1D section
+ if ((highestBasecall1D == -1) && (highestBasecall2D == -1)) {
+ isCorrupt = true;
+ log.println("Error: couldn't find Basecall_1D or Basecall_2D in "+filename);
+ } else if ((highestBasecall1D == -1) && (highestBasecall2D >= 0)) {
+ oldFormat = true;
+ highestBasecall1D = highestBasecall2D;
+ } else {
+ if ((highestBasecall1D >=0) && (highestBasecall2D >=0)) {
+ if (highestBasecall1D != highestBasecall2D) {
+ //isCorrupt = true;
+ log.println("Warning: Basecall_1D and Basecall_2D highest indicies not the same in "+filename);
+ }
+ }
+ }
+
+ log.println(" Highest1D: "+highestBasecall1D+" Highest2D: "+highestBasecall2D);
+ }
+
+ public double getMeanQAttribute(String attribute) {
+ ProcessLogger pl = new ProcessLogger();
+ ArrayList<String> response = pl.getCommandOutput("h5dump -a "+attribute+" "+filename, true, true);
+ double meanq = 0;
+
+ // Look for value beginning (0):
+ int l;
+ for (l=0; l<response.size(); l++) {
+ String line = response.get(l);
+ if (line.contains("(0):")) {
+ meanq = Double.parseDouble(line.substring(line.indexOf("(0):") + 5));
+ }
+ }
+
+ return meanq;
+ }
+
+
+ /**
+ * Get the FASTQ data out of the dataset
+ *
+ * @param inputFilename
+ * @param dataset
+ * @return
+ */
+ public FastAQFile getFastqFromDataset(String dataset) {
+ ProcessLogger pl = new ProcessLogger();
+ ArrayList<String> response = pl.getCommandOutput("h5dump -d "+dataset+" "+filename, true, true);
+ FastAQFile ff = null;
+
+ // Look for start of FASTQ section
+ int l;
+ for (l=0; l<response.size(); l++) {
+ if (response.get(l).contains("\"@")) {
+ break;
+ }
+ }
+
+ // Parse FASTQ portion with regex
+ if (l < response.size()) {
+ String id = null;
+ String seq = null;
+ String qual = null;
+
+ // Header row
+ Pattern outPattern = Pattern.compile("@(.+)");
+ Matcher outMatcher = outPattern.matcher(response.get(l));
+ if (outMatcher.find()) {
+ id = outMatcher.group(1);
+ }
+
+ // Sequence
+ outPattern = Pattern.compile("(\\s*)(\\S+)");
+ outMatcher = outPattern.matcher(response.get(l+1));
+ if (outMatcher.find()) {
+ seq = outMatcher.group(2);
+ }
+
+ // Qualities
+ outPattern = Pattern.compile("(\\s*)(\\S+)");
+ outMatcher = outPattern.matcher(response.get(l+3));
+ if (outMatcher.find()) {
+ qual = outMatcher.group(2);
+ }
+
+ // Fix IDs
+ if (id != null) {
+ outPattern = Pattern.compile("00000000-0000-0000-0000-000000000000(.+)");
+ outMatcher = outPattern.matcher(id);
+ if (outMatcher.find()) {
+ if (options.fixIDs()) {
+ id = id.replaceAll("^00000000-0000-0000-0000-000000000000_", "");
+ id = id.replaceAll(" ", "");
+ } else {
+ System.out.println("Warning: " + id + " is non-unqiue. Recommend re-running with -fixids option.");
+ System.out.println("");
+ }
+ }
+ }
+
+ if ((id != null) && (seq != null) && (qual != null)) {
+ ff = new FastAQFile(id, seq, qual);
+ }
+ }
+
+ return ff;
+ }
+
+ public double getMeanQ(int index, int type) {
+ String meanQAttributePath = null;
+ String indexString;
+ double meanQ = 0;
+
+ log.println(" Trying to get mean Q type "+type+" from "+filename+" with index "+index);
+
+ if (!isCorrupt) {
+ if (index == -1) {
+ if (type == NanoOKOptions.TYPE_2D) {
+ index = highestBasecall2D;
+ } else {
+ index = highestBasecall1D;
+ }
+ } else {
+ int highestIndex = highestBasecall2D;
+
+ if (type != NanoOKOptions.TYPE_2D) {
+ highestIndex = highestBasecall1D;
+ }
+
+ if (index > highestIndex) {
+ log.println("Error: index higher than highest Basecall available");
+ isCorrupt = true;
+ }
+ }
+ }
+
+ if (!isCorrupt) {
+ // Make string for group
+ indexString = String.format("%03d", index);
+
+ // Build path to dataset
+ if (type == NanoOKOptions.TYPE_2D) {
+ meanQAttributePath = "/Analyses/Basecall_2D_"+indexString+"/Summary/basecall_2d/mean_qscore";
+ } else {
+ // Now look if we are new format (with Basecall_1D_XXX)
+ if (oldFormat) {
+ // Old format
+ if (type == NanoOKOptions.TYPE_TEMPLATE) {
+ meanQAttributePath = "/Analyses/Basecall_2D_"+indexString+"/Summary/basecall_1d_template/mean_qscore";
+ } else if (type == NanoOKOptions.TYPE_COMPLEMENT) {
+ meanQAttributePath = "/Analyses/Basecall_2D_"+indexString+"/Summary/basecall_1d_complement/mean_qscore";
+ } else {
+ System.out.println("Error: bad type in getFastq");
+ System.exit(1);
+ }
+ } else {
+ // New format
+ if (type == NanoOKOptions.TYPE_TEMPLATE) {
+ meanQAttributePath = "/Analyses/Basecall_1D_"+indexString+"/Summary/basecall_1d_template/mean_qscore";
+ } else if (type == NanoOKOptions.TYPE_COMPLEMENT) {
+ meanQAttributePath = "/Analyses/Basecall_1D_"+indexString+"/Summary/basecall_1d_complement/mean_qscore";
+ } else {
+ System.out.println("Error: bad type in getFastq");
+ System.exit(1);
+ }
+ }
+ }
+ }
+
+ if (meanQAttributePath != null) {
+ log.println(" Path: "+meanQAttributePath);
+ meanQ = getMeanQAttribute(meanQAttributePath);
+ log.println(" MeanQ: "+meanQ);
+ }
+
+ return meanQ;
+ }
+
+ /**
+ * Get a FastQ/A file for given (Basecall_) index and type (2D/Template/Complement)
+ * @param index
+ * @param type
+ * @return
+ */
+ public FastAQFile getFastq(int index, int type) {
+ String fastqDatasetPath = null;
+ String indexString;
+ FastAQFile ff = null;
+
+ log.println(" Trying to get FASTQ type "+type+" from "+filename+" with index "+index);
+
+ if (!isCorrupt) {
+ if (index == -1) {
+ if (type == NanoOKOptions.TYPE_2D) {
+ index = highestBasecall2D;
+ } else {
+ index = highestBasecall1D;
+ }
+ } else {
+ int highestIndex = highestBasecall2D;
+
+ if (type != NanoOKOptions.TYPE_2D) {
+ highestIndex = highestBasecall1D;
+ }
+
+ if (index > highestIndex) {
+ log.println("Error: index higher than highest Basecall available");
+ isCorrupt = true;
+ }
+ }
+ }
+
+ if (!isCorrupt) {
+ // Make string for group
+ indexString = String.format("%03d", index);
+
+ // Build path to dataset
+ if (type == NanoOKOptions.TYPE_2D) {
+ fastqDatasetPath = "/Analyses/Basecall_2D_"+indexString+"/BaseCalled_2D/Fastq";
+ } else {
+ // Now look if we are new format (with Basecall_1D_XXX)
+ if (oldFormat) {
+ // Old format
+ if (type == NanoOKOptions.TYPE_TEMPLATE) {
+ fastqDatasetPath = "/Analyses/Basecall_2D_"+indexString+"/BaseCalled_template/Fastq";
+ } else if (type == NanoOKOptions.TYPE_COMPLEMENT) {
+ fastqDatasetPath = "/Analyses/Basecall_2D_"+indexString+"/BaseCalled_complement/Fastq";
+ } else {
+ System.out.println("Error: bad type in getFastq");
+ System.exit(1);
+ }
+ } else {
+ // New format
+ if (type == NanoOKOptions.TYPE_TEMPLATE) {
+ fastqDatasetPath = "/Analyses/Basecall_1D_"+indexString+"/BaseCalled_template/Fastq";
+ } else if (type == NanoOKOptions.TYPE_COMPLEMENT) {
+ fastqDatasetPath = "/Analyses/Basecall_1D_"+indexString+"/BaseCalled_complement/Fastq";
+ } else {
+ System.out.println("Error: bad type in getFastq");
+ System.exit(1);
+ }
+ }
+ }
+ }
+
+ if (fastqDatasetPath != null) {
+ log.println(" Path: "+fastqDatasetPath);
+ if (datasets.contains(fastqDatasetPath)) {
+ log.println(" Found data: "+fastqDatasetPath);
+ ff = getFastqFromDataset(fastqDatasetPath);
+ } else {
+ log.println(" Not there: "+fastqDatasetPath);
+ }
+ }
+
+ return ff;
+ }
+
+ /**
+ * Print list of groups
+ */
+ public void printGroups() {
+ for (String s : groups) {
+ System.out.println(s);
+ }
+ }
+
+// JNI Library version
+// /**
+// * Get FASTQ section out of FAST5 file
+// * @param pathname path to FAST5 file
+// * @param type type of read
+// * @return multi-line String
+// */
+// public String getFastq(String pathname, int type) {
+// H5File file = null;
+// String[] fastq = null;
+//
+// // Open a file using default properties.
+// try {
+// file = new H5File(pathname, FileFormat.READ);
+//
+// // Find basecall group
+// H5Group grp;
+// String groupPath = new String();
+// String datasetPath = null;
+// String indexString;
+// int index = -1;
+// int i = 0;
+//
+// // Default behaviour is to find latest
+// if (options.getBasecallIndex() == -1) {
+// do {
+// indexString = String.format("%03d", i);
+// grp = (H5Group)file.get("/Analyses/Basecall_2D_" + indexString);
+// if (grp != null) {
+// index=i;
+// i++;
+// }
+// } while (grp != null);
+// } else {
+// // User has specified index - check it exists
+// indexString = String.format("%03d", options.getBasecallIndex());
+// grp = (H5Group)file.get("/Analyses/Basecall_2D_" + indexString);
+// if (grp != null) {
+// index=i;
+// }
+// }
+//
+// // index will = -1 if we didn't find any group
+// if (index >=0) {
+// // Make string for group
+// indexString = String.format("%03d", index);
+//
+// // Build path to dataset
+// if (type == NanoOKOptions.TYPE_2D) {
+// datasetPath = "/Analyses/Basecall_2D_"+indexString+"/BaseCalled_2D/Fastq";
+// } else {
+// // Now look if we are new format (with Basecall_1D_XXX)
+// grp = (H5Group)file.get("/Analyses/Basecall_1D_"+indexString);
+// if (grp == null) {
+// // Old format
+// if (type == NanoOKOptions.TYPE_TEMPLATE) {
+// datasetPath = "/Analyses/Basecall_2D_"+indexString+"/BaseCalled_template/Fastq";
+// } else if (type == NanoOKOptions.TYPE_COMPLEMENT) {
+// datasetPath = "/Analyses/Basecall_2D_"+indexString+"/BaseCalled_complement/Fastq";
+// } else {
+// System.out.println("Error: bad type in getFastq");
+// System.exit(1);
+// }
+// } else {
+// // New format
+// if (type == NanoOKOptions.TYPE_TEMPLATE) {
+// datasetPath = "/Analyses/Basecall_1D_"+indexString+"/BaseCalled_template/Fastq";
+// } else if (type == NanoOKOptions.TYPE_COMPLEMENT) {
+// datasetPath = "/Analyses/Basecall_1D_"+indexString+"/BaseCalled_complement/Fastq";
+// } else {
+// System.out.println("Error: bad type in getFastq");
+// System.exit(1);
+// }
+// }
+// }
+//
+// //System.out.println("Path: "+datasetPath);
+// Dataset ds = (Dataset)file.get(datasetPath);
+// if (ds == null) {
+// System.out.println("No dataset at "+datasetPath);
+// } else {
+// fastq = (String[])ds.getData();
+// }
+// }
+//
+// file.close();
+// } catch (Exception e) {
+// e.printStackTrace();
+// }
+//
+// if (fastq == null) {
+// return null;
+// } else {
+// return fastq[0];
+// }
+// }
+// /**
+// * Dump an individual read
+// * @param inputFilename filename of FAST5 file
+// * @param type type of read
+// */
+// private void dumpRead(String inputFilename, int type, String outputDir) {
+// String outName = new File(inputFilename).getName();
+//
+// String fastqDatafield = null; //getFastq(inputFilename, type);
+// if (fastqDatafield != null) {
+// String [] lines = fastqDatafield.split("\n");
+//
+// String id = null;
+// String seq = lines[1];
+// String qual = lines[3];
+//
+// if (lines[0].startsWith("@")) {
+// id = lines[0].substring(1);
+//
+// // Fix IDs
+// Pattern outPattern = Pattern.compile("00000000-0000-0000-0000-000000000000(.+)");
+// Matcher outMatcher = outPattern.matcher(id);
+// if (outMatcher.find()) {
+// if (options.fixIDs()) {
+// id = id.replaceAll("^00000000-0000-0000-0000-000000000000_", "");
+// id = id.replaceAll(" ", "");
+// } else {
+// System.out.println("Warning: " + id + " is non-unqiue. Recommend re-running with -fixids option.");
+// System.out.println("");
+// }
+// }
+// } else {
+// System.out.println("Couldn't parse "+inputFilename);
+// }
+//
+// if (id != null) {
+// if (options.getReadFormat() == NanoOKOptions.FASTA) {
+// writeFastaFile(id, seq, outputDir + File.separator + NanoOKOptions.getTypeFromInt(type) + File.separator + outName + "_BaseCalled_" + NanoOKOptions.getTypeFromInt(type) + ".fasta");
+// } else if (options.getReadFormat() == NanoOKOptions.FASTQ) {
+// writeFastqFile(id, seq, qual, outputDir + File.separator + NanoOKOptions.getTypeFromInt(type) + File.separator + outName + "_BaseCalled_" + NanoOKOptions.getTypeFromInt(type) + ".fastq");
+// }
+// }
+// } else {
+// System.out.println("Error: couldn't find payload in " + inputFilename);
+// }
+// }
+}
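Fast5File avoids the HDF5 Java bindings (the JNI-based route is retained above only as commented-out code) and instead shells out to h5dump. With <file> standing in for the FAST5 path, the three invocations it builds are roughly:

    h5dump -n <file>                  # indexFile(): list groups and datasets
    h5dump -a <attribute> <file>      # getMeanQAttribute(): read a mean_qscore attribute
    h5dump -d <dataset> <file>        # getFastqFromDataset(): dump a .../Fastq dataset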
diff --git a/src/nanook/FastAQBlastMerger.java b/src/nanook/FastAQBlastMerger.java
new file mode 100644
index 0000000..eb55c86
--- /dev/null
+++ b/src/nanook/FastAQBlastMerger.java
@@ -0,0 +1,138 @@
+package nanook;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+
+public class FastAQBlastMerger implements Runnable {
+ private NanoOKOptions options;
+ private ArrayList<String> listOfFiles;
+ private String mergedFilePrefix;
+ private int fileCounter;
+
+ public FastAQBlastMerger(NanoOKOptions o, String m, ArrayList a, int fc) {
+ options = o;
+ mergedFilePrefix = m;
+ listOfFiles = a;
+ fileCounter = fc;
+ }
+
+ private void runBlastBacteria() {
+ File iff = new File(mergedFilePrefix);
+ String inputFasta = mergedFilePrefix + "_" + fileCounter + ".fasta";
+ String outputBlast = options.getSampleDirectory() + File.separator + "blastn_bacteria" + File.separator + iff.getName() + "_" + fileCounter + "_blast_bacteria.txt";
+ String commandFile = options.getSampleDirectory() + File.separator + "blastn_bacteria" + File.separator + iff.getName() + "_" + fileCounter + "_blast_bacteria.sh";
+ String logFile = options.getSampleDirectory() + File.separator + "blastn_bacteria" + File.separator + iff.getName() + "_" + fileCounter + "_blast_bacteria.log";
+ String formatString = "'6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore stitle'";
+
+ try {
+ System.out.println("Writing blast command file "+commandFile);
+ PrintWriter pw = new PrintWriter(new FileWriter(commandFile));
+ pw.write("blastn -db "+options.getBacteriaPath()+" -query " + inputFasta + " -evalue 0.001 -show_gis -out " + outputBlast + " -outfmt "+formatString);
+ pw.close();
+
+ options.getLog().println("Submitting blast command file to SLURM "+commandFile);
+ ProcessLogger pl = new ProcessLogger();
+ String[] commands = {"slurmit",
+ "-o", logFile,
+ "-p", "Nanopore",
+ "-m", "8G",
+ "sh "+commandFile};
+ pl.runCommandToLog(commands, options.getLog());
+ } catch (IOException e) {
+ System.out.println("runBlast exception");
+ e.printStackTrace();
+ }
+ }
+
+ private void runBlastnt() {
+ File iff = new File(mergedFilePrefix);
+ String inputFasta = mergedFilePrefix + "_" + fileCounter + ".fasta";
+ String outputBlast = options.getSampleDirectory() + File.separator + "blastn_nt" + File.separator + iff.getName() + "_" + fileCounter + "_blast_nt.txt";
+ String commandFile = options.getSampleDirectory() + File.separator + "blastn_nt" + File.separator + iff.getName() + "_" + fileCounter + "_blast_nt.sh";
+ String logFile = options.getSampleDirectory() + File.separator + "blastn_nt" + File.separator + iff.getName() + "_" + fileCounter + "_blast_nt.log";
+ String formatString = "'6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore stitle'";
+
+ try {
+ System.out.println("Writing blast command file "+commandFile);
+ PrintWriter pw = new PrintWriter(new FileWriter(commandFile));
+ pw.write("blastn -db "+options.getntPath()+" -query " + inputFasta + " -evalue 0.001 -show_gis -out " + outputBlast + " -outfmt "+formatString);
+ pw.close();
+
+ options.getLog().println("Submitting blast command file to SLURM "+commandFile);
+ ProcessLogger pl = new ProcessLogger();
+ String[] commands = {"slurmit",
+ "-o", logFile,
+ "-p", "tgac-medium",
+ "-m", "16G",
+ "sh "+commandFile};
+ pl.runCommandToLog(commands, options.getLog());
+ } catch (IOException e) {
+ System.out.println("runBlast exception");
+ e.printStackTrace();
+ }
+ }
+
+ private void runBlastCard() {
+ File iff = new File(mergedFilePrefix);
+ String inputFasta = mergedFilePrefix + "_" + fileCounter + ".fasta";
+ String outputBlast = options.getSampleDirectory() + File.separator + "blastn_card" + File.separator + iff.getName() + "_" + fileCounter + "_blast_card.txt";
+ String commandFile = options.getSampleDirectory() + File.separator + "blastn_card" + File.separator + iff.getName() + "_" + fileCounter + "_blast_card.sh";
+ String logFile = options.getSampleDirectory() + File.separator + "blastn_card" + File.separator + iff.getName() + "_" + fileCounter + "_blast_card.log";
+ String formatString = "'6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore stitle'";
+
+ try {
+ System.out.println("Writing blast command file "+commandFile);
+ PrintWriter pw = new PrintWriter(new FileWriter(commandFile));
+ pw.write("blastn -db "+options.getCardPath()+" -query " + inputFasta + " -evalue 0.001 -show_gis -out " + outputBlast + " -outfmt "+formatString);
+ pw.close();
+
+ options.getLog().println("Submitting blast command file to SLURM "+commandFile);
+ ProcessLogger pl = new ProcessLogger();
+ String[] commands = {"slurmit",
+ "-o", logFile,
+ "-p", "Nanopore",
+ "-m", "4G",
+ "sh "+commandFile};
+ pl.runCommandToLog(commands, options.getLog());
+ } catch (IOException e) {
+ System.out.println("runBlast exception");
+ e.printStackTrace();
+ }
+ }
+
+ private void mergeFiles() {
+ String mergedFile = mergedFilePrefix + "_" + fileCounter + ".fasta";
+
+ options.getLog().println("Writing merged file "+mergedFile);
+
+ try {
+ PrintWriter pw = new PrintWriter(new FileWriter(mergedFile));
+
+ for (int i=0; i<listOfFiles.size(); i++) {
+ BufferedReader br = new BufferedReader(new FileReader(listOfFiles.get(i)));
+ String line;
+ while ((line = br.readLine()) != null) {
+ pw.println(line);
+ }
+ br.close();
+ }
+ pw.close();
+ } catch (IOException e) {
+ System.out.println("mergeFiles exception");
+ e.printStackTrace();
+ }
+
+ }
+
+ public void run() {
+ mergeFiles();
+ runBlastBacteria();
+ runBlastCard();
+ runBlastnt();
+ }
+}
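Each run*() method writes a one-line shell script and submits it with the bundled slurmit wrapper. For the CARD database the generated script looks roughly like this (paths are placeholders):

    blastn -db <cardPath> -query <mergedPrefix>_<N>.fasta -evalue 0.001 -show_gis \
        -out <sampleDir>/blastn_card/<mergedPrefix>_<N>_blast_card.txt \
        -outfmt '6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore stitle'

and is submitted as: slurmit -o <logFile> -p Nanopore -m 4G sh <commandFile>. The nt and bacteria variants differ only in database path, SLURM partition and memory request.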
diff --git a/src/nanook/FastAQFile.java b/src/nanook/FastAQFile.java
new file mode 100644
index 0000000..039e7fc
--- /dev/null
+++ b/src/nanook/FastAQFile.java
@@ -0,0 +1,103 @@
+/*
+ * To change this license header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package nanook;
+
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintWriter;
+
+/**
+ * Represents a single FASTA/FASTQ read (ID, sequence and qualities)
+ * @author leggettr
+ */
+public class FastAQFile {
+ private String id;
+ private String sequence;
+ private String qualities;
+
+ /**
+ * Constructor
+ * @param i id
+ * @param s sequence string
+ * @param q qualities string
+ */
+ public FastAQFile(String i, String s, String q) {
+ id = i;
+ sequence = s;
+ qualities = q;
+ }
+
+ public void writeFastqToHandle(PrintWriter pw) {
+ pw.print("@");
+ pw.println(id);
+ pw.println(sequence);
+ pw.println("+");
+ pw.println(qualities);
+ }
+
+ /**
+ * Write as FASTQ file
+ * @param filename output filename
+ */
+ public synchronized void writeFastq(String filename) {
+ PrintWriter pw;
+
+ try {
+ pw = new PrintWriter(new FileWriter(filename));
+ pw.print("@");
+ pw.println(id);
+ pw.println(sequence);
+ pw.println("+");
+ pw.println(qualities);
+ pw.close();
+ } catch (IOException e) {
+ System.out.println("writeFastaFile exception");
+ e.printStackTrace();
+ }
+ }
+
+ public void writeFastaToHandle(PrintWriter pw, String fast5Path) {
+ pw.print(">");
+ pw.print(id);
+ if (fast5Path != null) {
+ pw.print(" "+fast5Path);
+ }
+ pw.println("");
+ pw.println(sequence);
+ }
+
+ /**
+ * Write as FASTA file
+ *
+ * @param filename output filename
+ */
+ public void writeFasta(String filename, String fast5Path) {
+ PrintWriter pw;
+
+ try {
+ pw = new PrintWriter(new FileWriter(filename));
+ pw.print(">");
+ pw.print(id);
+ if (fast5Path != null) {
+ pw.print(" "+fast5Path);
+ }
+ pw.println("");
+ pw.println(sequence);
+ pw.close();
+ } catch (IOException e) {
+ System.out.println("writeFastaFile exception");
+ e.printStackTrace();
+ }
+ }
+
+ public int getLength() {
+ return sequence.length();
+ }
+
+ public String getID() {
+ return id;
+ }
+}
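A minimal usage sketch (values are illustrative):

    FastAQFile read = new FastAQFile("read_001", "ACGTACGT", "$$%%&&((");
    read.writeFastq("read_001.fastq");                              // four-line FASTQ record
    read.writeFasta("read_001.fasta", "/path/to/read_001.fast5");   // header becomes ">read_001 /path/to/read_001.fast5"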
diff --git a/src/nanook/FileWatcher.java b/src/nanook/FileWatcher.java
new file mode 100644
index 0000000..0c28e98
--- /dev/null
+++ b/src/nanook/FileWatcher.java
@@ -0,0 +1,177 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett (richard.leggett at earlham.ac.uk)
+ *
+ * Copyright 2015-17 Earlham Institute
+ */
+
+package nanook;
+
+import java.io.File;
+import java.util.*;
+import java.util.Hashtable;
+import java.util.LinkedList;
+import java.util.concurrent.ThreadPoolExecutor;
+
+public class FileWatcher {
+ private NanoOKOptions options;
+ private int filesToProcess = 0;
+ private int filesProcessed = 0;
+ private int lastCompleted = -1;
+ private long lastFileTime = System.nanoTime();
+ private long secsSinceLast = 0;
+ private ArrayList<FileWatcherItem> batchContainersToWatch = new ArrayList();
+ private ArrayList<FileWatcherItem> fileDirsToWatch = new ArrayList();
+ private Hashtable<String, Integer> batchDirs = new Hashtable();
+ private Hashtable<String, Integer> allFiles = new Hashtable();
+ private LinkedList<FileWatcherItem> pendingFiles = new LinkedList<FileWatcherItem>();
+
+ public FileWatcher(NanoOKOptions o) {
+ options = o;
+ }
+
+ //public FileWatcher(NanoOKOptions o, String d) {
+ // options = o;
+ // fileDirsToWatch.add(new FileWatcherDir(d, pf));
+ //}
+
+ public void addBatchContainer(String d, int pf) {
+ options.getLog().println("Added batch dir: "+d);
+ batchContainersToWatch.add(new FileWatcherItem(d, pf));
+ }
+
+ public void addWatchDir(String d, int pf) {
+ options.getLog().println("Added watch dir: "+d);
+ fileDirsToWatch.add(new FileWatcherItem(d, pf));
+ }
+
+ public synchronized void addPendingFile(String s, int pf) {
+ pendingFiles.add(new FileWatcherItem(s, pf));
+ filesToProcess++;
+ }
+
+ public synchronized FileWatcherItem getPendingFile() {
+ if (pendingFiles.size() > 0) {
+ filesProcessed++;
+ return pendingFiles.removeFirst();
+ } else {
+ return null;
+ }
+ }
+
+ public void writeProgress() {
+ long e = 0;
+ long s = NanoOKOptions.PROGRESS_WIDTH;
+
+ if (filesToProcess > 0) {
+ e = NanoOKOptions.PROGRESS_WIDTH * filesProcessed / filesToProcess;
+ s = NanoOKOptions.PROGRESS_WIDTH - e;
+ }
+
+ System.out.print("\rProcessing [");
+ for (int i=0; i<e; i++) {
+ System.out.print("=");
+ }
+ for (int i=0; i<s; i++) {
+ System.out.print(" ");
+ }
+ System.out.print("] " + filesProcessed +"/" + filesToProcess);
+ lastCompleted = filesProcessed;
+ }
+
+ private void checkForNewBatchDirs() {
+ int count = 0;
+ for (int i=0; i<batchContainersToWatch.size(); i++) {
+ FileWatcherItem dir = batchContainersToWatch.get(i);
+ int pf = dir.getPassOrFail();
+ String dirName = dir.getPathname();
+ File d = new File(dirName);
+ File[] listOfFiles = d.listFiles();
+
+ options.getLog().println("Scanning for new batch dirs "+dirName);
+
+ if (listOfFiles == null) {
+ options.getLog().println("Directory "+dirName+" doesn't exist");
+ } else if (listOfFiles.length <= 0) {
+ options.getLog().println("Directory "+dirName+" empty");
+ } else {
+ for (File file : listOfFiles) {
+ if (file.isDirectory()) {
+ if (!file.getName().startsWith(("."))) {
+ if (!batchDirs.containsKey(file.getPath())) {
+ count++;
+ options.getLog().println("Got batch dir "+file.getPath());
+ batchDirs.put(file.getPath(), 1);
+ fileDirsToWatch.add(new FileWatcherItem(file.getPath(), pf));
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ public void scan() {
+ int count = 0;
+
+ if (options.usingBatchDirs()) {
+ checkForNewBatchDirs();
+ }
+
+ for (int i=0; i<fileDirsToWatch.size(); i++) {
+ FileWatcherItem dir = fileDirsToWatch.get(i);
+ String dirName = dir.getPathname();
+ File d = new File(dirName);
+ File[] listOfFiles = d.listFiles();
+
+ options.getLog().println("Scanning "+dirName);
+
+ if (listOfFiles == null) {
+ options.getLog().println("Directory "+dirName+" doesn't exist");
+ } else if (listOfFiles.length <= 0) {
+ options.getLog().println("Directory "+dirName+" empty");
+ } else {
+ for (File file : listOfFiles) {
+ if (file.isFile()) {
+ if (!file.getName().startsWith(("."))) {
+ if (!allFiles.containsKey(file.getPath())) {
+ count++;
+ options.getLog().println("Got file "+file.getPath());
+ allFiles.put(file.getPath(), 1);
+ this.addPendingFile(file.getPath(), dir.getPassOrFail());
+ }
+ }
+ }
+ }
+ }
+ }
+
+ options.getLog().println("Found "+count + " new files.");
+
+ if (count == 0) {
+ long timeSince = System.nanoTime() - lastFileTime;
+ secsSinceLast = timeSince / 1000000000;
+ options.getLog().println("Not seen file for " + (secsSinceLast) + "s");
+ } else {
+ lastFileTime = System.nanoTime();
+ }
+ }
+
+ public long getSecsSinceLastFile() {
+ return secsSinceLast;
+ }
+
+ public int getPendingFiles() {
+ return pendingFiles.size();
+ }
+
+ public boolean timedOut() {
+ if (pendingFiles.size() == 0) {
+ if (secsSinceLast >= options.getFileWatcherTimeout()) {
+ return true;
+ }
+ }
+
+ return false;
+ }
+}
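A sketch of the polling loop a caller might drive (the real driver lives elsewhere in NanoOK; names here are illustrative):

    FileWatcher watcher = new FileWatcher(options);
    watcher.addWatchDir(options.getFast5Dir() + File.separator + "pass", NanoOKOptions.READTYPE_PASS);
    while (!watcher.timedOut()) {
        watcher.scan();                                    // register any new files
        FileWatcherItem item;
        while ((item = watcher.getPendingFile()) != null) {
            // pass item.getPathname() and item.getPassOrFail() to a worker thread
        }
        watcher.writeProgress();                           // the real caller also sleeps between scans
    }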
diff --git a/src/nanook/FileWatcherItem.java b/src/nanook/FileWatcherItem.java
new file mode 100644
index 0000000..844bbc4
--- /dev/null
+++ b/src/nanook/FileWatcherItem.java
@@ -0,0 +1,31 @@
+package nanook;
+
+public class FileWatcherItem {
+ private String pathname;
+ private int passOrFail;
+
+ public FileWatcherItem(String p, int pf) {
+ pathname = p;
+ passOrFail = pf;
+ }
+
+ public String getPathname() {
+ return pathname;
+ }
+
+ public int getPassOrFail() {
+ return passOrFail;
+ }
+
+ public boolean isPass() {
+ return passOrFail == NanoOKOptions.READTYPE_PASS ? true: false;
+ }
+
+ public boolean isFail() {
+ return passOrFail == NanoOKOptions.READTYPE_FAIL ? true: false;
+ }
+
+ public boolean isCombined() {
+ return passOrFail == NanoOKOptions.READTYPE_COMBINED ? true: false;
+ }
+}
diff --git a/src/nanook/GCCounter.java b/src/nanook/GCCounter.java
new file mode 100644
index 0000000..ac38869
--- /dev/null
+++ b/src/nanook/GCCounter.java
@@ -0,0 +1,93 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett
+ *
+ * Copyright 2015 The Genome Analysis Centre (TGAC)
+ */
+
+package nanook;
+
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+/**
+ * Count GC content in references
+ *
+ * @author Richard Leggett
+ */
+public class GCCounter {
+ PrintWriter pw = null;
+ int binSize = 0;
+ int currentGCPosition = 0;
+ int currentGCCounter = 0;
+ int currentGC = 0;
+ int counts[];
+
+ public GCCounter(int bs, String outputFilename) {
+ binSize = bs;
+ counts = new int[binSize*2];
+ currentGCPosition = binSize;
+
+ try {
+ pw = new PrintWriter(new FileWriter(outputFilename));
+ } catch (IOException e) {
+ System.out.println("GCCounter exception:");
+ e.printStackTrace();
+ System.exit(1);
+ }
+ }
+
+ /**
+ * Store GC
+ */
+ private void storeGC() {
+ int gc = 0;
+ double pc;
+
+ for (int i=0; i<binSize; i++) {
+ gc += counts[i];
+ }
+
+ pc = (100.0 * (double)gc) / (double)binSize;
+ if (pw != null) {
+ pw.println(currentGCPosition + "\t" + pc);
+ }
+ }
+
+ /**
+ * Close file
+ */
+ public void closeFile() {
+ pw.close();
+ }
+
+ /**
+ * Process sequence string
+ * @param line
+ */
+ public void addString(String line) {
+ for (int i=0; i<line.length(); i++) {
+ if ((line.charAt(i) == 'G') || (line.charAt(i) == 'C') || (line.charAt(i) == 'g') || (line.charAt(i) == 'c')) {
+ counts[currentGCCounter] = 1;
+ currentGC++;
+ } else {
+ counts[currentGCCounter] = 0;
+ }
+ currentGCCounter++;
+
+ if (currentGCCounter == (binSize*2)) {
+ storeGC();
+
+ currentGCCounter = 0;
+ for (int j=binSize; j<(binSize*2); j++) {
+ counts[currentGCCounter++] = counts[j];
+ }
+ currentGCPosition += binSize;
+ currentGC = 0;
+ }
+ }
+ }
+}
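A minimal sketch of feeding the counter (filename illustrative); each completed bin yields one tab-separated position / %GC row in the output file:

    GCCounter gc = new GCCounter(100, "reference_gc.txt");   // 100 bp bins
    gc.addString("ACGTGCGCATTA");                            // call once per sequence line read
    gc.closeFile();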
diff --git a/src/nanook/GraphMapParser.java b/src/nanook/GraphMapParser.java
new file mode 100644
index 0000000..46ba9fd
--- /dev/null
+++ b/src/nanook/GraphMapParser.java
@@ -0,0 +1,75 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett
+ *
+ * Copyright 2015 The Genome Analysis Centre (TGAC)
+ */
+
+package nanook;
+
+import java.io.File;
+
+/**
+ * Parser for GraphMap alignment files
+ * @author Richard Leggett
+ */
+public class GraphMapParser extends SAMParser implements AlignmentFileParser {
+ private String alignmentParams = "";
+ private NanoOKOptions options;
+
+ public GraphMapParser(NanoOKOptions o, References r) {
+ super(o, r);
+ options = o;
+ }
+
+ public String getProgramID() {
+ return "graphmap";
+ }
+
+ public int getReadFormat() {
+ int or = options.getReadFormat();
+ return or;
+
+ //return NanoOKOptions.FASTA;
+ }
+
+ public void setAlignmentParams(String p) {
+ alignmentParams = p;
+ }
+
+ public boolean outputsToStdout() {
+ return false;
+ }
+
+ public String getRunCommand(String query, String output, String reference) {
+ //reference = reference.replaceAll("\\.fasta$", "");
+ //reference = reference.replaceAll("\\.fa$", "");
+ String command = "graphmap align -v 0 -r " + reference + " -d " + query + " -o " + output;
+ if (alignmentParams.length() > 0 ) {
+ command = command + " " + alignmentParams;
+ }
+
+ return command;
+ }
+
+ public void checkForIndex(String referenceFile) {
+ /*String[] files = {referenceFile + ".fasta.bwt",
+ referenceFile + ".fasta.pac"};
+
+ for (int i=0; i<files.length; i++) {
+ File f = new File(files[i]);
+
+ if (!f.exists()) {
+ System.out.println("");
+ System.out.println("Error:");
+ System.out.println("Can't find file " + f.getPath());
+ System.out.println("Have you indexed the reference with bwa index?");
+ System.out.println("Will continue but anticipate failure at analyse stage.");
+ System.out.println("");
+ return;
+ }
+ }*/
+
+ return;
+ }
+}
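An illustrative call showing the command getRunCommand() produces (paths are placeholders):

    GraphMapParser gm = new GraphMapParser(options, references);
    String cmd = gm.getRunCommand("sample_Template.fastq", "sample_Template.sam", "references/lambda.fasta");
    // cmd is "graphmap align -v 0 -r references/lambda.fasta -d sample_Template.fastq -o sample_Template.sam"
    // plus any extra parameters supplied via setAlignmentParams()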
diff --git a/src/nanook/KmerAbundance.java b/src/nanook/KmerAbundance.java
new file mode 100644
index 0000000..c17af9d
--- /dev/null
+++ b/src/nanook/KmerAbundance.java
@@ -0,0 +1,59 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett
+ *
+ * Copyright 2015 The Genome Analysis Centre (TGAC)
+ */
+
+package nanook;
+
+import java.io.Serializable;
+
+/**
+ * Represent abundance of a kmer
+ *
+ * @author Richard Leggett
+ */
+public class KmerAbundance implements Comparable, Serializable {
+ private static final long serialVersionUID = NanoOK.SERIAL_VERSION;
+ private String kmer;
+ private double refAbundance;
+ private double readAbundance;
+ private double difference;
+
+ public KmerAbundance(String k, double ref, double read) {
+ kmer = k;
+ refAbundance = ref;
+ readAbundance = read;
+ difference = read - ref;
+ }
+
+ public double getDifference() {
+ return difference;
+ }
+
+ public int compareTo(Object o) {
+ double d = ((KmerAbundance)o).getDifference() - difference;
+ int r = 0;
+
+ if (d < 0) {
+ r = -1;
+ } else if (d > 0) {
+ r = 1;
+ }
+
+ return r;
+ }
+
+ public String getKmer() {
+ return kmer;
+ }
+
+ public double getRefAbundance() {
+ return refAbundance;
+ }
+
+ public double getReadAbundance() {
+ return readAbundance;
+ }
+}
diff --git a/src/nanook/KmerMotifStatistic.java b/src/nanook/KmerMotifStatistic.java
new file mode 100644
index 0000000..474c4d9
--- /dev/null
+++ b/src/nanook/KmerMotifStatistic.java
@@ -0,0 +1,211 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett
+ *
+ * Copyright 2015 The Genome Analysis Centre (TGAC)
+ */
+
+package nanook;
+
+import java.io.Serializable;
+import java.util.*;
+
+/**
+ * Class to store kmer motif statistics.
+ *
+ * @author Richard Leggett
+ */
+public class KmerMotifStatistic implements Serializable {
+ private static final long serialVersionUID = NanoOK.SERIAL_VERSION;
+ public final static int TYPE_TOP = 1;
+ public final static int TYPE_BOTTOM = 2;
+ private int kSize;
+ private Hashtable<String, Integer> motifs = new Hashtable();
+ private Hashtable<String, Double> motifsPercent = new Hashtable();
+ private int totalCount = 0;
+ private int[][] baseCounts;
+
+ /**
+ * Constructor
+ * @param s - kmer size
+ */
+ public KmerMotifStatistic(int s) {
+ kSize = s;
+ baseCounts = new int[4][kSize];
+ }
+
+ /**
+ * Add a motif to store.
+ * @param kmer motif to store
+ */
+ public void addMotif(String kmer) {
+ Integer currentCount = motifs.get(kmer);
+
+ if (currentCount == null) {
+ currentCount = new Integer(1);
+ } else {
+ currentCount++;
+ }
+
+ motifs.put(kmer, currentCount);
+
+ totalCount++;
+
+ //System.out.println("Adding motif "+kmer+" to size "+kSize);
+ }
+
+ /**
+ * Parse motif, updating count of bases seen at each position.
+ * @param motif - kmer motif
+ * @param count - count of number of times seen
+ */
+ private void updateBaseCounts(String motif, int count) {
+ for (int i=0; i<motif.length(); i++) {
+ switch(motif.charAt(i)) {
+ case 'A': baseCounts[0][i]+=count; break;
+ case 'C': baseCounts[1][i]+=count; break;
+ case 'G': baseCounts[2][i]+=count; break;
+ case 'T': baseCounts[3][i]+=count; break;
+ }
+ }
+ }
+
+ /**
+ * Calculate percent each motif has been seen.
+ */
+ public void calculateMotifs() {
+ Set<String> keys = motifs.keySet();
+
+ for(String motif : keys) {
+ int count = motifs.get(motif);
+ double percent = (100.0 * (double)count) / (double)totalCount;
+ motifsPercent.put(motif, percent);
+ //updateBaseCounts(motif, count);
+ }
+ }
+
+ /**
+ * Update motif base counts for top 10 motifs
+ */
+ public void calculateTopBaseCounts() {
+ ArrayList<Map.Entry<String, Integer>> list = getSortedMotifCounts();
+
+ if (list.size() < 10) {
+ System.out.println("Error: motif list smaller than 10");
+ return;
+ }
+
+ for (int i=0; i<10; i++) {
+ if (i < list.size()) {
+ String motif = list.get(i).getKey();
+ updateBaseCounts(motif, list.get(i).getValue());
+ }
+ }
+ }
+
+ /**
+ * Update motif bases counts for bottom 10 motifs
+ */
+ public void calculateBottomBaseCounts() {
+ ArrayList<Map.Entry<String, Integer>> list = getSortedMotifCounts();
+
+ if (list.size() < 10) {
+ System.out.println("Error: motif list smaller than 10");
+ return;
+ }
+
+ for (int i=0; i<10; i++) {
+ if (i < list.size()) {
+ String motif = list.get(list.size() - 1 - i).getKey();
+ updateBaseCounts(motif, list.get(list.size() - 1 - i).getValue());
+ }
+ }
+ }
+
+ /**
+ * Write a top 10 or bottom 10 logo image.
+ * @param type TYPE_TOP for Top 10 or TYPE_BOTTOM for bottom 10
+ * @param filename PNG output filename
+ */
+ public void writeLogoImage(int type, String filename) {
+ baseCounts = new int[4][kSize];
+ if (type == TYPE_TOP) {
+ calculateTopBaseCounts();
+ } else if (type == TYPE_BOTTOM) {
+ calculateBottomBaseCounts();
+ } else {
+ System.out.println("Error: wrong type to writeLogoImgae");
+ System.exit(1);
+ }
+
+ SequenceLogo sl = new SequenceLogo(kSize);
+ for (int i=0; i<kSize; i++) {
+ sl.addBase(i, baseCounts[0][i], baseCounts[1][i], baseCounts[2][i], baseCounts[3][i]);
+ }
+ sl.drawImage();
+ sl.saveImage(filename);
+ }
+
+ /**
+ * Return ArrayList of sorted motif counts.
+ * @return sorted motifs
+ */
+ public ArrayList<Map.Entry<String, Integer>> getSortedMotifCounts() {
+ ArrayList<Map.Entry<String, Integer>>list = new ArrayList(motifs.entrySet());
+
+ Collections.sort(list, new Comparator<Map.Entry<String, Integer>>() {
+ public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2){
+ return o2.getValue().compareTo(o1.getValue());
+ }});
+
+ return list;
+ }
+
+ /**
+ * Return ArrayList of sorted motif percentages.
+ * @return sorted motifs
+ */
+ public ArrayList<Map.Entry<String, Double>> getSortedMotifPercentages() {
+ if (motifsPercent.size() == 0) {
+ calculateMotifs();
+ }
+
+ ArrayList<Map.Entry<String, Double>>list = new ArrayList(motifsPercent.entrySet());
+
+ Collections.sort(list, new Comparator<Map.Entry<String, Double>>() {
+ public int compare(Map.Entry<String, Double> o1, Map.Entry<String, Double> o2){
+ return o2.getValue().compareTo(o1.getValue());
+ }});
+
+ //for (int i=1; i<=10; i++) {
+ // if (list.size() >= i) {
+ // System.out.println(i + ". " + list.get(i-1).getKey() + "\t" + list.get(i-1).getValue());
+ // }
+ //}
+
+ return list;
+ }
+
+ /**
+ * Write motif counts to stdout.
+ */
+ public void outputMotifCounts() {
+ ArrayList<Map.Entry<String, Integer>>list = getSortedMotifCounts();
+
+ for (int i=1; i<=10; i++) {
+ if (list.size() >= i) {
+ System.out.println(i + ". " + list.get(i-1).getKey() + "\t" + list.get(i-1).getValue());
+ }
+ }
+
+ System.out.println(list);
+ }
+
+ /**
+ * Get total motif count.
+ * @return total motif count
+ */
+ public int getTotalMotifCount() {
+ return totalCount;
+ }
+}
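A small usage sketch (motifs and filename illustrative; the logo methods expect at least 10 distinct motifs):

    KmerMotifStatistic stat = new KmerMotifStatistic(5);
    stat.addMotif("AAGCT");                 // called once per observed motif
    stat.addMotif("AAGCT");
    stat.addMotif("TTGCA");
    ArrayList<Map.Entry<String, Integer>> ranked = stat.getSortedMotifCounts();   // descending by count
    stat.writeLogoImage(KmerMotifStatistic.TYPE_TOP, "motifs_top10_logo.png");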
diff --git a/src/nanook/KmerTable.java b/src/nanook/KmerTable.java
new file mode 100644
index 0000000..afffa3b
--- /dev/null
+++ b/src/nanook/KmerTable.java
@@ -0,0 +1,75 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett
+ *
+ * Copyright 2015 The Genome Analysis Centre (TGAC)
+ */
+
+package nanook;
+
+import java.io.Serializable;
+import java.util.Hashtable;
+import java.util.Set;
+
+/**
+ * KmerTable used for 5-mer comparison
+ *
+ * @author Richard Leggett
+ */
+public class KmerTable implements Serializable {
+ private static final long serialVersionUID = NanoOK.SERIAL_VERSION;
+ private int kmerSize = 5;
+ private Hashtable<String, Integer> counts = new Hashtable();
+
+ public KmerTable(int k) {
+ kmerSize = k;
+ }
+
+ public synchronized void countKmer(String kmer) {
+ int count = 0;
+
+ if (counts.containsKey(kmer)) {
+ count = counts.get(kmer);
+ }
+
+ count++;
+
+ counts.put(kmer, count);
+ }
+
+ public void writeKmerTable() {
+ Set<String> keys = counts.keySet();
+
+ System.out.println("");
+ System.out.println("Writing kmer table...");
+
+ for(String kmer : keys) {
+ int count = counts.get(kmer);
+ System.out.println(kmer + "\t" + count);
+ }
+
+ System.out.println("");
+ }
+
+ public int getKmerSize() {
+ return kmerSize;
+ }
+
+ public Set<String> getKeys() {
+ return counts.keySet();
+ }
+
+ public int get(String kmer) {
+ int value = 0;
+
+ if (counts.containsKey(kmer)) {
+ value = counts.get(kmer);
+ }
+
+ return value;
+ }
+
+ public Hashtable getTable() {
+ return counts;
+ }
+}
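Usage is a simple synchronized tally (values illustrative):

    KmerTable table = new KmerTable(5);
    table.countKmer("AAGCT");
    table.countKmer("AAGCT");
    int seen = table.get("AAGCT");      // 2
    int unseen = table.get("CCCCC");    // 0 for kmers never counted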
diff --git a/src/nanook/LastParser.java b/src/nanook/LastParser.java
new file mode 100644
index 0000000..1581e5b
--- /dev/null
+++ b/src/nanook/LastParser.java
@@ -0,0 +1,70 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett
+ *
+ * Copyright 2015 The Genome Analysis Centre (TGAC)
+ */
+
+package nanook;
+
+import java.io.File;
+
+/**
+ * Parser for LAST alignments
+ *
+ * @author Richard Leggett
+ */
+public class LastParser extends MAFParser implements AlignmentFileParser {
+ private String alignmentParams = "-s 2 -T 0 -Q 0 -a 1";
+
+ public LastParser(NanoOKOptions o, References r) {
+ super(o, r);
+ }
+
+ public String getProgramID() {
+ return "last";
+ }
+
+ public int getReadFormat() {
+ return NanoOKOptions.FASTA;
+ }
+
+ public void setAlignmentParams(String p) {
+ alignmentParams = p;
+ }
+
+ public String getRunCommand(String query, String output, String reference) {
+ reference = reference.replaceAll("\\.fasta$", "");
+ reference = reference.replaceAll("\\.fa$", "");
+
+ return "lastal " + alignmentParams + " " + reference + " " + query;
+ //return "lastal -o "+ output + " " + alignmentParams + " " + reference + " " + query;
+ }
+
+ public boolean outputsToStdout() {
+ return true;
+ }
+
+ public void checkForIndex(String referenceFile) {
+ String[] files = {referenceFile + ".bck",
+ referenceFile + ".suf"};
+
+ for (int i=0; i<files.length; i++) {
+ File f = new File(files[i]);
+
+ if (!f.exists()) {
+ System.out.println("");
+ System.out.println("Error:");
+ System.out.println("Can't find file " + f.getPath());
+ System.out.println("1. Have you indexed the reference with lastdb?");
+ System.out.println("2. Have you made sure that the output prefix is the same name as the reference file, apart from the .fasta or .fa extension?");
+ System.out.println(" e.g. lastdb -Q 0 referencename referencename.fasta");
+ System.out.println("Will continue but anticipate failure at analyse stage.");
+ System.out.println("");
+ return;
+ }
+ }
+
+ return;
+ }
+}
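+// Usage sketch (paths hypothetical): getRunCommand strips the .fasta/.fa
+// suffix so the lastdb index prefix appears on the lastal command line,
+// and checkForIndex expects the .bck/.suf files produced by lastdb.
+//
+//   LastParser p = new LastParser(options, references);
+//   String cmd = p.getRunCommand("reads.fasta", "out.maf", "refs/lambda.fasta");
+//   // cmd -> "lastal -s 2 -T 0 -Q 0 -a 1 refs/lambda reads.fasta"
+//   p.checkForIndex("refs/lambda");  // looks for refs/lambda.bck and refs/lambda.suf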
diff --git a/src/nanook/MAFAlignmentLine.java b/src/nanook/MAFAlignmentLine.java
new file mode 100644
index 0000000..ab3f84e
--- /dev/null
+++ b/src/nanook/MAFAlignmentLine.java
@@ -0,0 +1,98 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett
+ *
+ * Copyright 2015 The Genome Analysis Centre (TGAC)
+ */
+
+package nanook;
+
+/**
+ * Class representing an alignment line in a LAST file.
+ *
+ * @author Richard Leggett
+ */
+public class MAFAlignmentLine {
+ private String name;
+ private int start;
+ private int alnSize;
+ private String strand;
+ private int seqSize;
+ private String alignment;
+
+ /**
+ * Constructor.
+ * @param s - alignment line string
+ */
+ public MAFAlignmentLine(String s) {
+ String[] parts = s.split("\\s+");
+
+ if (parts.length == 7) {
+ name = parts[1];
+ start = Integer.parseInt(parts[2]);
+ alnSize = Integer.parseInt(parts[3]);
+ strand = parts[4];
+ seqSize = Integer.parseInt(parts[5]);
+ alignment = parts[6];
+ } else {
+ System.out.println("Error: can't understand alignment file format.");
+ System.exit(1);
+ }
+ }
+
+ /**
+ * Get name (ID) of sequence.
+ * @return name, as String
+ */
+ public String getName() {
+ return name;
+ }
+
+ /**
+ * Get start position of alignment.
+ * @return start position
+ */
+ public int getStart() {
+ return start;
+ }
+
+ /**
+ * Get end position of alignment
+ * @return end position
+ */
+ public int getEnd() {
+ return start + alnSize - 1;
+ }
+
+ /**
+ * Get alignment size.
+ * @return alignment size, in bases
+ */
+ public int getAlnSize() {
+ return alnSize;
+ }
+
+ /**
+ * Get strand.
+ * @return strand, "+" or "-"
+ */
+ public String getStrand() {
+ return strand;
+ }
+
+ /**
+ * Get sequence size.
+ * @return sequence size, in bases.
+ */
+ public int getSeqSize() {
+ return seqSize;
+ }
+
+ /**
+ * Get alignment string.
+ * @return alignment string
+ */
+ public String getAlignment() {
+ return alignment;
+ }
+}
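+// Parsing sketch: a MAF "s" line has seven whitespace-separated fields
+// (the example values below are made up).
+//
+//   MAFAlignmentLine line = new MAFAlignmentLine("s read_0042 100 250 + 1000 ACGTACGT");
+//   // line.getName()    -> "read_0042"
+//   // line.getStart()   -> 100
+//   // line.getAlnSize() -> 250
+//   // line.getStrand()  -> "+"
+//   // line.getSeqSize() -> 1000
+//   // line.getEnd()     -> 349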
diff --git a/src/nanook/MAFParser.java b/src/nanook/MAFParser.java
new file mode 100644
index 0000000..1379cf1
--- /dev/null
+++ b/src/nanook/MAFParser.java
@@ -0,0 +1,131 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett
+ *
+ * Copyright 2015 The Genome Analysis Centre (TGAC)
+ */
+
+package nanook;
+
+import java.io.*;
+import java.util.ArrayList;
+import java.util.Collections;
+
+/**
+ * Parser for LAST aligner files.
+ *
+ * @author Richard Leggett
+ */
+public abstract class MAFParser {
+ private NanoOKOptions options;
+ private References references;
+ private SampleReportWriter report;
+ ArrayList<Alignment> alignments;
+ String leafName;
+
+ /**
+ * Constructor.
+ * @param o NanoOKOptions object
+ * @param r References object
+ */
+ public MAFParser(NanoOKOptions o, References r) {
+ options = o;
+ references = r;
+ }
+
+ /**
+ * Get file extension of alignment files.
+ * @return file extension (".maf")
+ */
+ public String getAlignmentFileExtension() {
+ return ".maf";
+ }
+
+ /**
+ * Parse a LAST file.
+ * @param filename filename to parse
+ * @param nonAlignedSummaryFile an AlignmentsTableFile in which to record reads that didn't align
+ * @return number of alignments parsed
+ */
+ public int parseFile(String filename, AlignmentsTableFile nonAlignedSummaryFile, ReadSetStats overallStats) {
+ alignments = new ArrayList<>();
+ leafName = new File(filename).getName();
+
+ // Read all alignments and put them into an ArrayList
+ try
+ {
+ BufferedReader br = new BufferedReader(new FileReader(filename));
+ String line;
+
+ do {
+ line = br.readLine();
+ if (line != null) {
+ if (line.startsWith("a score=")) {
+ String[] fields = line.substring(8).split(" ");
+ int score = Integer.parseInt(fields[0]);
+ MAFAlignmentLine hitLine = new MAFAlignmentLine(br.readLine());
+ MAFAlignmentLine queryLine = new MAFAlignmentLine(br.readLine());
+ Alignment al = new Alignment(score,
+ queryLine.getName(),
+ queryLine.getSeqSize(),
+ queryLine.getStart(),
+ queryLine.getAlnSize(),
+ queryLine.getAlignment(),
+ hitLine.getName(),
+ hitLine.getSeqSize(),
+ hitLine.getStart(),
+ hitLine.getAlnSize(),
+ hitLine.getAlignment(),
+ false);
+ alignments.add(al);
+ }
+ }
+ } while (line != null);
+ br.close();
+
+ if (alignments.size() == 0) {
+ nonAlignedSummaryFile.writeNoAlignmentMessage(leafName);
+ overallStats.addReadWithoutAlignment();
+ }
+
+ } catch (Exception e) {
+ System.out.println("parseFile Exception:");
+ e.printStackTrace();
+ options.getLog().println("Exception parsing "+filename);
+ options.getLog().close();
+ System.exit(1);
+ }
+
+ return alignments.size();
+ }
+
+ /**
+ * Sort alignments in order of score
+ */
+ public void sortAlignments() {
+ if (alignments.size() > 0) {
+ Collections.sort(alignments);
+ }
+ }
+
+ /**
+ * Get the set of alignments that match the highest scoring reference
+ */
+ public ArrayList<Alignment> getHighestScoringSet() {
+ ArrayList<Alignment> hss = new ArrayList<>();
+
+ if (alignments.size() > 0) {
+ String readReferenceName = alignments.get(0).getHitName();
+ ReferenceSequence readReference = references.getReferenceById(readReferenceName);
+ for (int i=0; i<alignments.size(); i++) {
+ Alignment a = alignments.get(i);
+ if (a.getHitName().equals(readReferenceName)) {
+ hss.add(a);
+ }
+ }
+ }
+
+ return hss;
+ }
+}
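+// Input sketch: parseFile expects LAST MAF output in which each alignment is
+// an "a score=" line followed by two "s" lines, reference (hit) first and
+// query (read) second, for example (values made up):
+//
+//   a score=512
+//   s chromosome 11400 190 + 48502 ACGTACGT...
+//   s read_0042      5 185 +   220 ACGTACGT...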
diff --git a/src/nanook/MarginAlignParser.java b/src/nanook/MarginAlignParser.java
new file mode 100644
index 0000000..7f82b99
--- /dev/null
+++ b/src/nanook/MarginAlignParser.java
@@ -0,0 +1,71 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett
+ *
+ * Copyright 2015 The Genome Analysis Centre (TGAC)
+ */
+
+package nanook;
+
+import java.io.File;
+
+/**
+ * Parser for marginAlign files
+ *
+ * @author Richard Leggett
+ */
+public class MarginAlignParser extends SAMParser implements AlignmentFileParser {
+ private String alignmentParams="";
+
+ public MarginAlignParser(NanoOKOptions o, References r) {
+ super(o, r);
+ }
+
+ public String getProgramID() {
+ return "marginalign";
+ }
+
+ public int getReadFormat() {
+ return NanoOKOptions.FASTQ;
+ }
+
+ public void setAlignmentParams(String p) {
+ alignmentParams = p;
+ }
+
+ private void removeJobTree(String dirName) {
+ File jt = new File(dirName);
+
+ if (jt.exists()) {
+ if (jt.isDirectory()) {
+ System.out.println("Removing "+dirName);
+ String command = "rm -rf "+dirName;
+ ProcessLogger pl = new ProcessLogger();
+ pl.runCommand(command);
+ }
+ }
+ }
+
+ public String getRunCommand(String query, String output, String reference) {
+ String jobtree = output + ".jobTree";
+ String command = "marginAlign ";
+
+ removeJobTree(jobtree);
+
+ if (alignmentParams.length() > 0) {
+ command = command + " " + alignmentParams + " ";
+ }
+
+ command = command + query + " " + reference + " " + output + " --jobTree " + jobtree;
+
+ return command;
+ }
+
+ public boolean outputsToStdout() {
+ return false;
+ }
+
+ public void checkForIndex(String referenceFile) {
+ return;
+ }
+}
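+// Command sketch (filenames hypothetical): getRunCommand removes any stale
+// jobTree directory and then assembles a marginAlign call such as
+//
+//   getRunCommand("reads.fastq", "out.sam", "ref.fasta")
+//   // -> "marginAlign reads.fastq ref.fasta out.sam --jobTree out.sam.jobTree"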
diff --git a/src/nanook/MergedFastAQFile.java b/src/nanook/MergedFastAQFile.java
new file mode 100644
index 0000000..18eb424
--- /dev/null
+++ b/src/nanook/MergedFastAQFile.java
@@ -0,0 +1,45 @@
+/*
+ * To change this license header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package nanook;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+
+/**
+ * Represents a merged FASTA/FASTQ file, built up in batches of reads for BLASTing
+ * @author leggettr
+ */
+public class MergedFastAQFile {
+ private NanoOKOptions options;
+ private String mergedFilename;
+ private int nSeqs = 0;
+ private int seqsPerFile = 500;
+ private int fileCounter = 0;
+ private ArrayList<String> mergeList = new ArrayList<>();
+
+ public MergedFastAQFile(NanoOKOptions o, String f) {
+ mergedFilename = f;
+ options = o;
+ seqsPerFile = options.getReadsPerBlast();
+ }
+
+ public synchronized void addFile(String readFilename, String fast5Path) {
+ mergeList.add(readFilename);
+ nSeqs++;
+ if (nSeqs == seqsPerFile) {
+ System.out.println("Adding new thread...");
+ options.getThreadExecutor().execute(new FastAQBlastMerger(options, mergedFilename, mergeList, fileCounter));
+ mergeList = new ArrayList<>();
+ fileCounter++;
+ nSeqs = 0;
+ }
+ }
+}
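+// Behaviour sketch (paths hypothetical): addFile buffers read filenames and,
+// once getReadsPerBlast() files have accumulated, submits the batch to a
+// FastAQBlastMerger on the shared thread pool and starts a new batch.
+//
+//   MergedFastAQFile merged = new MergedFastAQFile(options, "merged_2D_pass.fasta");
+//   merged.addFile("fasta/pass/2D/read_1.fasta", "fast5/pass/read_1.fast5");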
diff --git a/src/nanook/MotifStatistics.java b/src/nanook/MotifStatistics.java
new file mode 100644
index 0000000..99bdac7
--- /dev/null
+++ b/src/nanook/MotifStatistics.java
@@ -0,0 +1,217 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett
+ *
+ * Copyright 2015 The Genome Analysis Centre (TGAC)
+ */
+
+package nanook;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Hashtable;
+import java.util.Map;
+import static nanook.NanoOKOptions.TYPE_2D;
+import static nanook.NanoOKOptions.TYPE_COMPLEMENT;
+import static nanook.NanoOKOptions.TYPE_TEMPLATE;
+
+/**
+ * Store all motif statistics (i.e. insertion, deletion, substitution) at a range
+ * of sizes (3, 4, 5) for a single read type (Template, Complement or 2D).
+ *
+ * @author Richard Leggett
+ */
+public class MotifStatistics implements Serializable {
+ private static final long serialVersionUID = NanoOK.SERIAL_VERSION;
+ private KmerMotifStatistic[] insertionMotifs = new KmerMotifStatistic[3];
+ private KmerMotifStatistic[] deletionMotifs = new KmerMotifStatistic[3];
+ private KmerMotifStatistic[] substitutionMotifs = new KmerMotifStatistic[3];
+
+ /**
+ * Constructor
+ */
+ public MotifStatistics() {
+ for (int k=0; k<3; k++) {
+ insertionMotifs[k] = new KmerMotifStatistic(k+3);
+ deletionMotifs[k] = new KmerMotifStatistic(k+3);
+ substitutionMotifs[k] = new KmerMotifStatistic(k+3);
+ }
+ }
+
+ /**
+ * Given a stretch of perfect sequence, store motifs at all k sizes.
+ * @param motif array of KmerMotifStatistic objects to add to
+ * @param kmer perfect sequence to get motifs from
+ */
+ public void addMotifs(KmerMotifStatistic[] motif, String kmer) {
+ if (kmer.length() < 3) {
+ return;
+ }
+
+ for (int k=3; k<=5; k++) {
+ if (kmer.length() > k) {
+ motif[k-3].addMotif(kmer.substring(kmer.length() - k));
+ }
+ }
+ }
+
+ /**
+ * Add an insertion motif.
+ * @param kmer motif to add
+ */
+ public void addInsertionMotifs(String kmer) {
+ addMotifs(insertionMotifs, kmer);
+ }
+
+ /**
+ * Add a deletion motif.
+ * @param kmer motif to add
+ */
+ public void addDeletionMotifs(String kmer) {
+ addMotifs(deletionMotifs, kmer);
+ }
+
+ /**
+ * Add a substitution motif
+ * @param kmer motif to add
+ */
+ public void addSubstitutionMotifs(String kmer) {
+ addMotifs(substitutionMotifs, kmer);
+ }
+
+ /**
+ * Output motif counts to screen (debugging).
+ * @param motif KmerMotifStatistic object to get counts from
+ */
+ private void outputMotifCounts(KmerMotifStatistic[] motif) {
+ for (int k=3; k<=5; k++) {
+ System.out.println("k="+k);
+ motif[k-3].outputMotifCounts();
+ }
+ }
+
+ /**
+ * Output motif counts for all types (debugging).
+ */
+ public void outputAllMotifCounts() {
+ System.out.println("Outputtng motif data");
+ System.out.println("Insertions");
+ outputMotifCounts(insertionMotifs);
+ System.out.println("Deletions");
+ outputMotifCounts(deletionMotifs);
+ System.out.println("Substitutions");
+ outputMotifCounts(substitutionMotifs);
+ }
+
+ /**
+ * Get a sorted list of insertion motif counts at given kmer size.
+ * @param k kmer size required
+ * @return ArrayList of counts.
+ */
+ public ArrayList<Map.Entry<String, Integer>> getSortedInsertionMotifCounts(int k) {
+ return insertionMotifs[k-3].getSortedMotifCounts();
+ }
+
+ /**
+ * Get a sorted list of deletion motif counts at given kmer size.
+ * @param k kmer size required
+ * @return ArrayList of counts.
+ */
+ public ArrayList<Map.Entry<String, Integer>> getSortedDeletionMotifCounts(int k) {
+ return deletionMotifs[k-3].getSortedMotifCounts();
+ }
+
+ /**
+ * Get a sorted list of substitution motif counts at given kmer size.
+ * @param k kmer size required
+ * @return ArrayList of counts.
+ */
+ public ArrayList<Map.Entry<String, Integer>> getSortedSubstitutionMotifCounts(int k) {
+ return substitutionMotifs[k-3].getSortedMotifCounts();
+ }
+
+ /**
+ * Get a sorted list of insertion motif percentages at given kmer size.
+ * @param k kmer size required
+ * @return ArrayList of counts.
+ */
+ public ArrayList<Map.Entry<String, Double>> getSortedInsertionMotifPercentages(int k) {
+ return insertionMotifs[k-3].getSortedMotifPercentages();
+ }
+
+ /**
+ * Get a sorted list of deletion motif percentages at given kmer size.
+ * @param k kmer size required
+ * @return ArrayList of counts.
+ */
+ public ArrayList<Map.Entry<String, Double>> getSortedDeletionMotifPercentages(int k) {
+ return deletionMotifs[k-3].getSortedMotifPercentages();
+ }
+
+ /**
+ * Get a sorted list of substitution motif percentages at given kmer size.
+ * @param k kmer size required
+ * @return ArrayList of counts.
+ */
+ public ArrayList<Map.Entry<String, Double>> getSortedSubstitutionMotifPercentages(int k) {
+ return substitutionMotifs[k-3].getSortedMotifPercentages();
+ }
+
+ /**
+ * Write insertion logo image (via KmerMotifStatistic object)
+ * @param type either TYPE_TOP or TYPE_BOTTOM (Top 10 or bottom 10)
+ * @param filename image filename
+ * @param k kmer size
+ */
+ public void writeInsertionLogoImage(int type, String filename, int k) {
+ insertionMotifs[k-3].writeLogoImage(type, filename);
+ }
+
+ /**
+ * Write deletion logo image (via KmerMotifStatistic object)
+ * @param type either TYPE_TOP or TYPE_BOTTOM (Top 10 or bottom 10)
+ * @param filename image filename
+ * @param k kmer size
+ */
+ public void writeDeletionLogoImage(int type, String filename, int k) {
+ deletionMotifs[k-3].writeLogoImage(type, filename);
+ }
+
+ /**
+ * Write substitution logo image (via KmerMotifStatistic object)
+ * @param type either TYPE_TOP or TYPE_BOTTOM (Top 10 or bottom 10)
+ * @param filename image filename
+ * @param k kmer size
+ */
+ public void writeSubstitutionLogoImage(int type, String filename, int k) {
+ substitutionMotifs[k-3].writeLogoImage(type, filename);
+ }
+
+ /**
+ * Get total count of motifs seen
+ * @param errorType type of error - TYPE_INSERTION etc.
+ * @param k kmer size
+ * @return count
+ */
+ public int getTotalMotifCounts(int errorType, int k) {
+ int count = 0;
+
+ switch(errorType) {
+ case NanoOKOptions.TYPE_INSERTION:
+ count = insertionMotifs[k-3].getTotalMotifCount();
+ break;
+ case NanoOKOptions.TYPE_DELETION:
+ count = deletionMotifs[k-3].getTotalMotifCount();
+ break;
+ case NanoOKOptions.TYPE_SUBSTITUTION:
+ count = substitutionMotifs[k-3].getTotalMotifCount();
+ break;
+ default:
+ System.out.println("Error: bad error type in getTotalMotifCounts");
+ System.exit(1);
+ break;
+ }
+
+ return count;
+ }
+}
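+// Usage sketch: the argument to addInsertionMotifs/addDeletionMotifs/
+// addSubstitutionMotifs is a stretch of perfect (error-free) sequence;
+// its last 3, 4 and 5 bases are recorded as motifs.
+//
+//   MotifStatistics stats = new MotifStatistics();
+//   stats.addInsertionMotifs("AAGGCTT");   // records "CTT", "GCTT" and "GGCTT"
+//   int seen = stats.getTotalMotifCounts(NanoOKOptions.TYPE_INSERTION, 5);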
diff --git a/src/nanook/NanoOK.java b/src/nanook/NanoOK.java
new file mode 100644
index 0000000..3150627
--- /dev/null
+++ b/src/nanook/NanoOK.java
@@ -0,0 +1,428 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett (richard.leggett at earlham.ac.uk)
+ *
+ * Copyright 2015-17 Earlham Institute
+ */
+
+package nanook;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.FileReader;
+import java.io.ObjectOutputStream;
+import java.util.ArrayList;
+import java.util.Locale;
+import java.util.Set;
+import java.util.concurrent.ThreadPoolExecutor;
+//import ncsa.hdf.object.FileFormat;
+//import ncsa.hdf.object.h5.H5File;
+
+/**
+ * Entry class for tool.
+ *
+ * @author Richard Leggett
+ */
+public class NanoOK {
+ public final static String VERSION_STRING = "v1.26";
+ public final static long SERIAL_VERSION = 3L;
+
+ /**
+ * Check for program dependencies - Rscript, pdflatex, h5dump
+ */
+ public static void checkDependencies() {
+ ProcessLogger pl = new ProcessLogger();
+ ArrayList<String> response;
+ String rVersion = null;
+ String pdflatexVersion = null;
+ String hVersion = null;
+
+ response = pl.checkCommand("Rscript --version");
+ if (response != null) {
+ for (int i=0; i<response.size(); i++) {
+ String s = response.get(i);
+ if (s.startsWith("R scripting front-end")) {
+ rVersion = s;
+ }
+ }
+ }
+
+ if (rVersion == null) {
+ System.out.println("*** ERROR: Couldn't find Rscript - is R installed? ***");
+ } else {
+ System.out.println(rVersion);
+ }
+
+ response = pl.checkCommand("pdflatex --version");
+ if (response != null) {
+ for (int i=0; i<response.size(); i++) {
+ String s = response.get(i);
+ if (s.contains("pdfTeX")) {
+ pdflatexVersion = s;
+ break;
+ }
+ }
+ }
+
+ if (pdflatexVersion == null) {
+ System.out.println("*** ERROR: Couldn't find pdflatex - is TeX installed? ***");
+ } else {
+ System.out.println(pdflatexVersion);
+ }
+
+ response = pl.checkCommand("h5dump --version");
+ if (response != null) {
+ for (int i=0; i<response.size(); i++) {
+ String s = response.get(i);
+ if (s.startsWith("h5dump")) {
+ hVersion = s;
+ }
+ }
+ }
+
+ if (hVersion == null) {
+ System.out.println("*** ERROR: Couldn't find h5dump - is H5 Tools installed? ***");
+ } else {
+ System.out.println(hVersion);
+ }
+
+ //try {
+ // H5File file = new H5File();
+ //} catch (NoClassDefFoundError | UnsatisfiedLinkError e) {
+ // e.printStackTrace();
+ // System.out.println("");
+ // System.out.println("Error: Could not initialise HDF5 classes. Check that the HDF libraries are correctly installed (and pointed to by LD_LIBRARY_PATH or DYLD_LIBRARY_PATH).");
+ // System.out.println("Consult HDF documentation and/or NanoOK documentation.");
+ // System.out.println("");
+ // System.exit(1);
+ //}
+
+ System.out.println("");
+ }
+
+ /**
+ * Test logo plotting
+ */
+ public static void testLogo() {
+ SequenceLogo logo = new SequenceLogo();
+ logo.drawImage();
+ logo.saveImage("/Users/leggettr/Desktop/logo.png");
+ }
+
+ /**
+ * Test SequenceReader class
+ */
+ public static void testSequenceReader() {
+ SequenceReader r = new SequenceReader(true);
+ r.indexFASTAFile("/Users/leggettr/Documents/Projects/Nanopore/test.fasta", null, true);
+ String s = r.getSubSequence("gi|223667766|ref|NZ_DS264586.1|", 0, 499);
+ System.out.println("String (0,499) = ["+s+"]");
+ s = r.getSubSequence("gi|223667766|ref|NZ_DS264586.1|", 0, 9);
+ System.out.println("String (0,9) = ["+s+"]");
+ s = r.getSubSequence("gi|223667766|ref|NZ_DS264586.1|", 200, 209);
+ System.out.println("String (200,209) = ["+s+"]");
+ s = r.getSubSequence("gi|223667766|ref|NZ_DS264586.1|", 200, 214);
+ System.out.println("String (200,214) = ["+s+"]");
+ }
+
+ public static void testSamToLast(NanoOKOptions options, References references) {
+ BWAParser parser = new BWAParser(options, references);
+ AlignmentsTableFile nonAlignedSummaryFile = new AlignmentsTableFile("atf.txt");
+ ReadSetStats readSetStats = new ReadSetStats(options, NanoOKOptions.TYPE_2D);
+ options.getReferences().loadReferences();
+ parser.parseFile("/Users/leggettr/Desktop/test.fasta.sam", nonAlignedSummaryFile, readSetStats);
+ }
+
+ /**
+ * Test parser
+ * @param options
+ * @param overallStats
+ * @param references
+ */
+ public static void testParser(NanoOKOptions options, OverallStats overallStats, References references) {
+ AlignmentFileParser p = new LastParser(options, references);
+ AlignmentsTableFile nonAlignedSummary = new AlignmentsTableFile("blob.txt");
+ //p.parseFile("/Users/leggettr/Documents/Projects/Nanopore/N79681_EvenMC_R7_06082014/last/2D/N79681_EvenMC_R7_0608215_5314_1_ch319_file116_strand.fast5_BaseCalled_2D.fasta.maf", nonAlignedSummary, overallStats);
+ //System.exit(0);
+ }
+
+ /**
+ * Test HDF5 library
+ */
+ public static void testHDF(NanoOKOptions options) {
+ //ReadExtractorRunnable r = new ReadExtractorRunnable(options, null, null, null);
+ //String fastq = r.getFastq("/Users/leggettr/Desktop/TEST12345_ch1_file0.fast5", NanoOKOptions.TYPE_TEMPLATE);
+
+ Fast5File f = new Fast5File(options, "/Users/leggettr/Desktop/TEST12345_ch1_file0.fast5");
+ Fast5File g = new Fast5File(options, "/Users/leggettr/Documents/Projects/Nanopore/NanoOK_lambda_test/fast5/pass/N79596_Lambda8kbp_LCv4_test_3559_1_ch37_file38_strand.fast5");
+ FastAQFile ff = f.getFastq(-1, NanoOKOptions.TYPE_TEMPLATE);
+ FastAQFile fg = g.getFastq(-1, NanoOKOptions.TYPE_TEMPLATE);
+ if (ff != null) {
+ ff.writeFastq("ff.fq");
+ }
+
+ if (fg != null) {
+ fg.writeFastq("fg.fq");
+ }
+ //f.printGroups();
+ System.exit(0);
+ }
+
+ private static void analyse(NanoOKOptions options) throws InterruptedException {
+ OverallStats overallStats = new OverallStats(options);
+ options.getReferences().setOverallStats(overallStats);
+
+ options.getSampleChecker().checkReadDirectory();
+
+ // Load reference data
+ options.getReferences().loadReferences();
+ options.setReadFormat(options.getParser().getReadFormat());
+ options.initialiseAlignmentSummaryFile();
+
+ System.out.println("");
+
+ // Parse all reads sets
+ if (options.doParseAlignments()) {
+ ReadLengthsSummaryFile summary = new ReadLengthsSummaryFile(options.getLengthSummaryFilename());
+ summary.open(options.getSample());
+
+ for (int type = 0; type<3; type++) {
+ if (options.isProcessingReadType(type)) {
+ System.out.println("Parsing " + NanoOKOptions.getTypeFromInt(type));
+ ReadSet readSet = new ReadSet(type, options, overallStats.getStatsByType(type));
+ int nReads = readSet.processReads();
+
+ if (nReads < 1) {
+ System.out.println("Error: unable to find any " + NanoOKOptions.getTypeFromInt(type) + " reads to process.");
+ System.out.println("");
+ System.exit(1);
+ }
+
+ int nReadsWithAlignments = readSet.getStats().getNumberOfReadsWithAlignments();
+ if (nReadsWithAlignments < 1) {
+ System.out.println("");
+ System.out.println("Error: unable to find any " + NanoOKOptions.getTypeFromInt(type) + " alignments to process.");
+ System.out.println("Common reasons for this:");
+ System.out.println("1. Failure to index the reference with the alignment tool, resulting in alignment files of 0 bytes");
+ System.out.println("2. Wrong reference specified to the align stage, resulting in no alignments");
+ System.out.println("3. When indexing with LAST, the output prefix needs to be the same as the reference FASTA file, minus the .fasta extension");
+ System.out.println(" e.g. lastdb -Q 0 referencename referencename.fasta");
+ System.out.println("");
+ System.exit(1);
+ } else if (nReadsWithAlignments < 400) {
+ System.out.println("Warning: not many alignments ("+nReadsWithAlignments+") found to process.");
+ }
+
+ summary.addReadSetStats(overallStats.getStatsByType(type));
+ overallStats.getStatsByType(type).closeKmersFile();
+ overallStats.getStatsByType(type).writeSubstitutionStats();
+ overallStats.getStatsByType(type).writeErrorMotifStats();
+
+ int ignoredDuplicates = overallStats.getStatsByType(type).getIgnoredDuplicates();
+ if (ignoredDuplicates > 0) {
+ System.out.println(ignoredDuplicates + " ignored duplicate read IDs.");
+ }
+
+ System.out.println("");
+
+ }
+ }
+ summary.close();
+
+ // Write files
+ System.out.println("Writing analysis files");
+ Set<String> ids = options.getReferences().getAllIds();
+ int allCount = 3; //ids.size() * 3;
+ int counter = 1;
+ for (int type=0; type<3; type++) {
+ long completed = counter;
+ long total = allCount;
+ long e = 0;
+ long s = NanoOKOptions.PROGRESS_WIDTH;
+
+ if (total > 0) {
+ e = NanoOKOptions.PROGRESS_WIDTH * completed / total;
+ s = NanoOKOptions.PROGRESS_WIDTH - e;
+ }
+
+ System.out.print("\r[");
+ for (int i=0; i<e; i++) {
+ System.out.print("=");
+ }
+ for (int i=0; i<s; i++) {
+ System.out.print(" ");
+ }
+ System.out.print("] " + completed +"/" + total);
+ options.getReferences().writeReferenceStatFiles(type);
+ options.getReferences().writeReferenceSummary(type);
+ counter++;
+ }
+ System.out.println("");
+
+ System.out.println("Writing object");
+ try {
+ FileOutputStream fos = new FileOutputStream(options.getAnalysisDir() + File.separator + "OverallStats.ser");
+ ObjectOutputStream oos = new ObjectOutputStream(fos);
+ oos.writeObject(overallStats);
+ oos.close();
+ } catch (Exception e) {
+ System.out.println("Exception trying to write object:");
+ e.printStackTrace();
+ }
+
+ }
+
+ // Plot graphs
+ if (options.doPlotGraphs()) {
+ System.out.println("");
+ System.out.println("Plotting graphs");
+ RGraphPlotter plotter = new RGraphPlotter(options);
+ plotter.plot(false);
+ }
+
+ // Make report
+ if (options.doMakeReport()) {
+ System.out.println("");
+ System.out.println("Making report");
+ SampleReportWriter rw = new SampleReportWriter(options, overallStats);
+ rw.writeReport();
+
+ if (options.doMakePDF()) {
+ System.out.println("");
+ System.out.println("Making PDF");
+ rw.makePDF();
+ }
+ }
+
+ System.out.println("");
+ System.out.println("Done");
+ }
+
+ private static void extract(NanoOKOptions options) throws InterruptedException {
+ ReadExtractor re = new ReadExtractor(options);
+ re.createDirectories();
+ re.extract();
+ }
+
+ private static void align(NanoOKOptions options) throws InterruptedException {
+ AlignmentFileParser parser = options.getParser();
+ parser.checkForIndex(options.getReferenceFile().substring(0, options.getReferenceFile().lastIndexOf('.')));
+ ReadAligner aligner = new ReadAligner(options, parser);
+ options.setReadFormat(parser.getReadFormat());
+ aligner.createDirectories();
+ aligner.align();
+ }
+
+ private static void compare(NanoOKOptions options) throws InterruptedException {
+ System.out.println("Comparing");
+ SampleComparer comparer = new SampleComparer(options);
+ comparer.loadSamples();
+ comparer.compareSamples();
+
+ options.setReferences(comparer.getSample(0).getStatsByType(0).getOptions().getReferences());
+
+ System.out.println("");
+ System.out.println("Plotting graphs");
+ RGraphPlotter plotter = new RGraphPlotter(options);
+ plotter.plot(true);
+
+ System.out.println("");
+ System.out.println("Making PDF");
+ ComparisonReportWriter crw = new ComparisonReportWriter(options, comparer);
+ crw.writeReport();
+ crw.makePDF();
+ }
+
+ private static void watch(NanoOKOptions options) throws InterruptedException {
+ AlignmentFileParser parser = options.getParser();
+ parser.checkForIndex(options.getReferenceFile().substring(0, options.getReferenceFile().lastIndexOf('.')));
+ ReadAligner aligner = new ReadAligner(options, parser);
+ options.setReadFormat(parser.getReadFormat());
+ aligner.createDirectories();
+
+ DirectoryWatcher dw = new DirectoryWatcher(options, aligner, parser);
+ dw.watch();
+ }
+
+ private static void process(NanoOKOptions options) throws InterruptedException {
+ ReadProcessor rp = new ReadProcessor(options);
+ options.makeDirectories();
+ options.initialiseReadMerger();
+ rp.process();
+ }
+
+ private static void memoryReport() {
+ Runtime runtime = Runtime.getRuntime();
+ long mb = 1024 * 1024;
+ long totalMem = runtime.totalMemory() / mb;
+ long maxMem = runtime.maxMemory() / mb;
+ long freeMem = runtime.freeMemory() / mb;
+ System.out.println("totalMem: " + totalMem + "Mb");
+ System.out.println(" maxMem: " + maxMem + "Mb");
+ System.out.println(" freeMem: " + freeMem + "Mb");
+ }
+
+ /**
+ * Entry to tool.
+ * @param args command line arguments
+ */
+ public static void main(String[] args) throws InterruptedException {
+ System.out.println("");
+ System.out.println("NanoOK " + VERSION_STRING);
+ System.out.println("");
+ System.out.println("Comments/bugs to: richard.leggett at earlham.ac.uk");
+ System.out.println("Follow NanoOK on twitter: @NanoOK_Software");
+ System.out.println("");
+
+ NanoOKOptions options = new NanoOKOptions();
+
+ Locale.setDefault(new Locale("en", "US"));
+
+ // Parse command line
+ options.parseArgs(args);
+
+ // Check dependencies
+ System.out.println("");
+ System.out.println("Checking dependencies");
+ checkDependencies();
+
+ //testHDF(options);
+ //System.exit(0);
+
+ File logsDir = new File(options.getLogsDir());
+ if (!logsDir.exists()) {
+ logsDir.mkdir();
+ }
+
+
+ if (options.getRunMode() == NanoOKOptions.MODE_EXTRACT) {
+ //extract(options);
+ process(options);
+ } else if (options.getRunMode() == NanoOKOptions.MODE_ALIGN) {
+ //align(options);
+ process(options);
+ } else if (options.getRunMode() == NanoOKOptions.MODE_ANALYSE) {
+ options.checkAnalysisDirectoryStructure();
+ analyse(options);
+ //scan(options);
+ } else if (options.getRunMode() == NanoOKOptions.MODE_COMPARE) {
+ compare(options);
+ } else if (options.getRunMode() == NanoOKOptions.MODE_WATCH) {
+ watch(options);
+ } else if (options.getRunMode() == NanoOKOptions.MODE_PROCESS) {
+ process(options);
+ }
+
+ //memoryReport();
+
+ options.getLog().close();
+
+ options.getThreadExecutor().shutdown();
+
+ if (options.getReturnValue() != 0) {
+ System.out.println("Exiting with error code");
+ System.exit(options.getReturnValue());
+ }
+ }
+}
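+// Typical invocation sketch (sample and reference paths hypothetical),
+// following the extract / align / analyse modes dispatched in main():
+//
+//   nanook extract -s MySample
+//   nanook align   -s MySample -r references/lambda.fasta
+//   nanook analyse -s MySample -r references/lambda.fasta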
diff --git a/src/nanook/NanoOKLog.java b/src/nanook/NanoOKLog.java
new file mode 100644
index 0000000..16c6290
--- /dev/null
+++ b/src/nanook/NanoOKLog.java
@@ -0,0 +1,76 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett (richard.leggett at earlham.ac.uk)
+ *
+ * Copyright 2015-17 Earlham Institute
+ */
+
+package nanook;
+
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.Serializable;
+import java.util.Calendar;
+import java.util.GregorianCalendar;
+
+/**
+ * Logging
+ *
+ * @author Richard Leggett
+ */
+public class NanoOKLog implements Serializable {
+ private static final long serialVersionUID = NanoOK.SERIAL_VERSION;
+ private transient PrintWriter pw = null;
+
+ public NanoOKLog() {
+ }
+
+ public synchronized void open(String filename) {
+ try {
+ pw = new PrintWriter(new FileWriter(filename, false));
+ } catch (IOException e) {
+ System.out.println("NanoOKLog exception");
+ e.printStackTrace();
+ }
+ }
+
+ public synchronized void close() {
+ if (pw != null) {
+ pw.close();
+ }
+ }
+
+ public String getTime() {
+ GregorianCalendar timeNow = new GregorianCalendar();
+ String s = String.format("%d/%d/%d %02d:%02d:%02d",
+ timeNow.get(Calendar.DAY_OF_MONTH),
+ timeNow.get(Calendar.MONTH)+1,
+ timeNow.get(Calendar.YEAR),
+ timeNow.get(Calendar.HOUR_OF_DAY),
+ timeNow.get(Calendar.MINUTE),
+ timeNow.get(Calendar.SECOND));
+ return s;
+ }
+
+ public synchronized void writeTimeStamp() {
+ if (pw != null) {
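+ // No-op at present: print() and println() prepend the timestamp themselves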
+ }
+ }
+
+ public synchronized void print(String s) {
+ if (pw != null) {
+ pw.print(getTime() + " " + s);
+ }
+ }
+
+ public synchronized void println(String s) {
+ if (pw != null) {
+ pw.println(getTime() + " " + s);
+ }
+ }
+
+ public synchronized PrintWriter getPrintWriter() {
+ return pw;
+ }
+}
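+// Format sketch: log lines are stamped day/month/year hh:mm:ss, e.g.
+//
+//   options.getLog().println("Parsing file");
+//   // writes a line such as "1/9/2017 13:01:58 Parsing file"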
diff --git a/src/nanook/NanoOKOptions.java b/src/nanook/NanoOKOptions.java
new file mode 100644
index 0000000..de8376e
--- /dev/null
+++ b/src/nanook/NanoOKOptions.java
@@ -0,0 +1,1389 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett (richard.leggett at earlham.ac.uk)
+ *
+ * Copyright 2015-17 Earlham Institute
+ */
+
+package nanook;
+
+import java.io.*;
+import java.util.ArrayList;
+import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.ThreadPoolExecutor;
+import java.util.concurrent.TimeUnit;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+/**
+ * Representation of program options and some global constants.
+ *
+ * @author Richard Leggett
+ */
+public class NanoOKOptions implements Serializable {
+ private static final long serialVersionUID = NanoOK.SERIAL_VERSION;
+ public final static int MAX_KMER = 20000;
+ public final static int MAX_READ_LENGTH = 1000000;
+ public final static int MAX_READS = 1000000;
+ public final static int MODE_EXTRACT = 1;
+ public final static int MODE_ALIGN = 2;
+ public final static int MODE_ANALYSE = 3;
+ public final static int MODE_COMPARE = 4;
+ public final static int MODE_WATCH = 5;
+ public final static int MODE_PROCESS = 6;
+ public final static int FASTA = 1;
+ public final static int FASTQ = 2;
+ public final static int TYPE_TEMPLATE = 0;
+ public final static int TYPE_COMPLEMENT = 1;
+ public final static int TYPE_2D = 2;
+ public final static int TYPE_ALL = -1;
+ public final static int TYPE_INSERTION = 0;
+ public final static int TYPE_DELETION = 1;
+ public final static int TYPE_SUBSTITUTION = 2;
+ public final static int READTYPE_COMBINED = 0;
+ public final static int READTYPE_PASS = 1;
+ public final static int READTYPE_FAIL = 2;
+ public final static int MIN_ALIGNMENTS = 10;
+ public final static int PROGRESS_WIDTH = 50;
+ private References references = new References(this);
+ private String referenceFile=null;
+ private String sampleDirectory = null;
+ private String sampleName = null;
+ private String scriptsDir="/Users/leggettr/Documents/github/nanotools/scripts";
+ private String aligner="last";
+ private String alignerParams="";
+ private String scheduler="system";
+ private String sampleList = null;
+ private String comparisonDir = null;
+ private String bacteriaPath = null;
+ private String ntPath = null;
+ private String cardPath = null;
+ private String processFile = null;
+ private int coverageBinSize = 100;
+ private boolean processPassReads = true;
+ private boolean processFailReads = true;
+ private boolean parseAlignments = true;
+ private boolean plotGraphs = true;
+ private boolean makeReport = true;
+ private boolean makePDF = true;
+ private int maxReads = 0;
+ private boolean process2DReads = true;
+ private boolean processTemplateReads = true;
+ private boolean processComplementReads = true;
+ private boolean fixIDs = false;
+ private boolean fixRandom = false;
+ private boolean doKmerCounting = true;
+ private boolean showAlignerCommand = false;
+ private boolean extractingReads = false;
+ private boolean aligningReads = false;
+ private boolean parsingReads = false;
+ private boolean blastingReads = false;
+ private boolean mergeFastaFiles = false;
+ private boolean force = false;
+ private double minQForPass = -1;
+ private int runMode = 0;
+ private int readFormat = FASTA;
+ private int numThreads = 1;
+ private int fileWatcherTimeout = 10;
+ private String jobQueue = "";
+ private NanoOKLog logFile = new NanoOKLog();
+ private String imageFormat = "pdf";
+ private int specifiedType = TYPE_2D;
+ private String readsDir = "fast5";
+ private int returnValue = 0;
+ private int basecallIndex = -1;
+ private boolean outputFast5Path = true;
+ private int readsPerBlast = 500;
+ private boolean clearLogsOnStart = true;
+ private transient WatcherLog watcherReadLog = new WatcherLog(this);
+ private transient WatcherLog watcherCardFileLog = new WatcherLog(this);
+ private transient WatcherLog watcherntFileLog = new WatcherLog(this);
+ private transient WatcherLog watcherCardCommandLog = new WatcherLog(this);
+ private transient WatcherLog watcherntCommandLog = new WatcherLog(this);
+ private transient BlastMerger mergerCardPass = new BlastMerger(this);
+ private transient BlastMerger mergerntPass = new BlastMerger(this);
+ private transient BlastMerger mergerCardFail = new BlastMerger(this);
+ private transient BlastMerger mergerntFail = new BlastMerger(this);
+ private transient MergedFastAQFile mergedPass2D;
+ private transient MergedFastAQFile mergedPass1D;
+ private transient MergedFastAQFile mergedFail1D;
+ private transient MergedFastAQFile mergedFail2D;
+ private transient ThreadPoolExecutor executor;
+ private transient BlastHandler[][] blastHandlers = new BlastHandler[3][2];
+ private transient ArrayList<String> blastProcesses = new ArrayList<String>();
+ private int fileCounterOffset = 0;
+ private transient ReadFileMerger readFileMerger;
+ private transient SampleChecker sampleChecker = new SampleChecker(this);
+
+ public NanoOKOptions() {
+ String value = System.getenv("NANOOK_DIR");
+
+ if (value != null) {
+ scriptsDir = value + File.separator + "bin";
+ } else {
+ System.out.println("*** WARNING: You should set NANOOK_DIR. Default value unlikely to work. ***");
+ System.out.println("");
+ }
+
+ System.out.println("Scripts dir: "+scriptsDir);
+ }
+
+ public References getReferences() {
+ return references;
+ }
+
+ public void setReferences(References r) {
+ references = r;
+ }
+
+ public void setReturnValue(int r) {
+ returnValue = r;
+ }
+
+ public int getReturnValue() {
+ return returnValue;
+ }
+
+ /**
+ * Parse command line arguments.
+ * @param args array of command line arguments
+ */
+ public void parseArgs(String[] args) {
+ int i=0;
+
+ if (args.length <= 1) {
+ System.out.println("");
+ System.out.println("Syntax nanook <extract|align|analyse|compare|process> [options]");
+ System.out.println("");
+ System.out.println("extract options:");
+ System.out.println(" -s|-sample <dir> specifies sample directory");
+ System.out.println(" -f|-reads specifies alternative dir for FAST5 files (default fast5)");
+ System.out.println(" Can be absolute (beginning with /) or relative");
+ System.out.println(" e.g. -f reads/downloads if replicating Metrichor file structure");
+ System.out.println(" -a|-fasta specifies FASTA file extraction (default)");
+ System.out.println(" -q|-fastq specifies FASTQ file extraction");
+ System.out.println(" -basecallindex specifies the index of the analysis (default: latest)");
+ //System.out.println(" -printpath to output FAST5 path in FASTA read header");
+ System.out.println(" -mergereads to generate merged FASTA files in addition to single read files");
+ System.out.println(" -minquality <value> to set the minimum quality for a 'pass' read");
+ System.out.println("");
+ System.out.println("align options:");
+ System.out.println(" -s|-sample <dir> specifies sample directory");
+ System.out.println(" -r|-reference <path> specifies path to reference database");
+ System.out.println(" -aligner <name> specifies the aligner (default last)");
+ System.out.println(" -alignerparams <params> specifies paramters to the aligner");
+ System.out.println(" -showaligns echoes aligner commands to screen");
+ System.out.println("");
+ System.out.println("analyse options:");
+ System.out.println(" -s|-sample <dir> specifies sample directory");
+ System.out.println(" -r|-reference <path> specifies path to reference database");
+ System.out.println(" -aligner <name> specifies the aligner (default last)");
+ System.out.println(" -coveragebin <int> specifies coverage bin size (default 100)");
+ System.out.println(" -bitmaps to output bitmap PNG graphs instead of PDF");
+ System.out.println("");
+ System.out.println("compare options:");
+ System.out.println(" -l|-samplelist <file> specifies a sample list file");
+ System.out.println(" -o|-outputdir <directory> specifies an output directory");
+ System.out.println(" -type <2d|template|complement> specifies an output directory");
+ System.out.println("");
+ System.out.println("process options:");
+ System.out.println(" -process <file> specifies a process file");
+ System.out.println("");
+ //System.out.println("Sample type options:");
+ //System.out.println(" -barcoding if reads are barcoded and sorted into subdirs");
+ //System.out.println(" -batchdirs if using MinKNOW 1.4.2 or above with separate batch_ directories");
+ //System.out.println("");
+ System.out.println("Read type options:");
+ System.out.println(" -passonly to analyse only pass reads");
+ System.out.println(" -failonly to analyse only fail reads");
+ System.out.println(" -2donly to analyse only 2D reads");
+ System.out.println(" -templateonly to analyse just Template reads");
+ System.out.println(" -complementonly to analyse just Complement reads");
+ System.out.println("");
+ System.out.println("Other options:");
+ System.out.println(" -t|-numthreads <number> specifies the number of threads to use (default 1)");
+ System.out.println(" -log <filename> enables debug logging to file");
+ System.out.println(" -force to force NanoOK to ignore warnings");
+ System.out.println(" -timeout to set the number of seconds before giving up waiting for new reads (default 2)");
+ System.out.println("");
+ System.out.println("Valid aligners: last, bwa, blasr, marginalign, graphmap");
+ System.out.println("");
+ System.exit(0);
+ }
+
+ parseAlignments = true;
+ plotGraphs = true;
+ makeReport = true;
+
+ if (args[i].equals("extract")) {
+ runMode = MODE_EXTRACT;
+ extractingReads = true;
+ aligningReads = false;
+ parsingReads = false;
+ blastingReads = false;
+ mergeFastaFiles = false;
+ fileWatcherTimeout = 2;
+ } else if (args[i].equals("align")) {
+ runMode = MODE_ALIGN;
+ extractingReads = false;
+ aligningReads = true;
+ parsingReads = false;
+ blastingReads = false;
+ mergeFastaFiles = false;
+ fileWatcherTimeout = 2;
+ } else if (args[i].equals("analyse") || args[i].equals("analyze")) {
+ runMode = MODE_ANALYSE;
+ extractingReads = false;
+ aligningReads = false;
+ parsingReads = true;
+ blastingReads = false;
+ mergeFastaFiles = false;
+ fileWatcherTimeout = 2;
+ } else if (args[i].equals("compare")) {
+ runMode = MODE_COMPARE;
+ } else if (args[i].equals("watch")) {
+ runMode = MODE_WATCH;
+ } else if ((args[i].equals("process")) || (args[i].equals("scan"))) {
+ runMode = MODE_PROCESS;
+ } else {
+ System.out.println("Unknonwn mode " + args[i] + " - must be extract, align or analyse");
+ System.exit(1);
+ }
+ i++;
+
+ while (i < (args.length)) {
+ if (args[i].equalsIgnoreCase("-coveragebin")) {
+ coverageBinSize = Integer.parseInt(args[i+1]);
+ i+=2;
+ } else if (args[i].equalsIgnoreCase("-batchdirs")) {
+ System.out.println("-batchdirs option ignore - now detected automatically.");
+ i++;
+ } else if (args[i].equalsIgnoreCase("-timeout")) {
+ fileWatcherTimeout = Integer.parseInt(args[i+1]);
+ i+=2;
+ } else if (args[i].equalsIgnoreCase("-fileoffset")) {
+ fileCounterOffset = Integer.parseInt(args[i+1]);
+ i+=2;
+ } else if (args[i].equalsIgnoreCase("-reference") || args[i].equalsIgnoreCase("-r")) {
+ referenceFile = args[i+1];
+ i+=2;
+ } else if (args[i].equalsIgnoreCase("-process")) {
+ processFile = args[i+1];
+ readProcessFile();
+ i+=2;
+ } else if (args[i].equalsIgnoreCase("-bacteria")) {
+ bacteriaPath = args[i+1];
+ i+=2;
+ } else if (args[i].equalsIgnoreCase("-nt")) {
+ ntPath = args[i+1];
+ i+=2;
+ } else if (args[i].equalsIgnoreCase("-card")) {
+ cardPath = args[i+1];
+ i+=2;
+ } else if (args[i].equalsIgnoreCase("-force")) {
+ force = true;
+ i++;
+ } else if (args[i].equalsIgnoreCase("-sample") | args[i].equalsIgnoreCase("-s")) {
+ sampleDirectory = args[i+1];
+ i+=2;
+ } else if (args[i].equalsIgnoreCase("-samplelist") | args[i].equalsIgnoreCase("-l")) {
+ sampleList = args[i+1];
+ i+=2;
+ } else if (args[i].equalsIgnoreCase("-outputdir") | args[i].equalsIgnoreCase("-o")) {
+ comparisonDir = args[i+1];
+ i+=2;
+ } else if (args[i].equalsIgnoreCase("-reads") | args[i].equalsIgnoreCase("-f")) {
+ readsDir = args[i+1];
+ if (readsDir.endsWith("/")) {
+ readsDir = readsDir.substring(0, readsDir.length()-1);
+ }
+ i+=2;
+ } else if (args[i].equalsIgnoreCase("-maxreads")) {
+ maxReads = Integer.parseInt(args[i+1]);
+ i+=2;
+ } else if (args[i].equalsIgnoreCase("-log")) {
+ logFile.open(args[i+1]);
+ i+=2;
+ } else if (args[i].equalsIgnoreCase("-type")) {
+ if (args[i+1].equalsIgnoreCase("template")) {
+ specifiedType = TYPE_TEMPLATE;
+ } else if (args[i+1].equalsIgnoreCase("complement")) {
+ specifiedType = TYPE_COMPLEMENT;
+ } else if (args[i+1].equalsIgnoreCase("2d")) {
+ specifiedType = TYPE_2D;
+ }
+ i+=2;
+ } else if (args[i].equalsIgnoreCase("-nofail") || args[i].equalsIgnoreCase("-passonly")) {
+ processPassReads = true;
+ processFailReads = false;
+ i++;
+ } else if (args[i].equalsIgnoreCase("-nopass") || args[i].equalsIgnoreCase("-failonly")) {
+ processPassReads = false;
+ processFailReads = true;
+ i++;
+ } else if (args[i].equalsIgnoreCase("-fasta") || args[i].equalsIgnoreCase("-a")) {
+ //if (runMode == MODE_EXTRACT) {
+ readFormat = FASTA;
+ //}
+ i++;
+ } else if (args[i].equalsIgnoreCase("-fastq") || args[i].equalsIgnoreCase("-q")) {
+ //if (runMode == MODE_EXTRACT) {
+ readFormat = FASTQ;
+ //}
+ i++;
+ } else if (args[i].equalsIgnoreCase("-2donly")) {
+ process2DReads = true;
+ processTemplateReads = false;
+ processComplementReads = false;
+ i++;
+ } else if ((args[i].equalsIgnoreCase("-1d")) ||
+ (args[i].equalsIgnoreCase("-templateonly")) ) {
+ process2DReads = false;
+ processTemplateReads = true;
+ processComplementReads = false;
+ i++;
+ } else if (args[i].equalsIgnoreCase("-complementonly")) {
+ process2DReads = false;
+ processTemplateReads = false;
+ processComplementReads = true;
+ i++;
+ } else if (args[i].equalsIgnoreCase("-printpath")) {
+ outputFast5Path = true;
+ i++;
+ } else if (args[i].equalsIgnoreCase("-bitmaps")) {
+ imageFormat = "png";
+ i++;
+ } else if (args[i].equalsIgnoreCase("-fixids")) {
+ fixIDs = true;
+ i++;
+ } else if (args[i].equalsIgnoreCase("-showaligns")) {
+ showAlignerCommand = true;
+ i++;
+ } else if (args[i].equalsIgnoreCase("-deterministic")) {
+ fixRandom = true;
+ i++;
+ } else if (args[i].equalsIgnoreCase("-aligner")) {
+ aligner = args[i+1];
+ i+=2;
+ } else if (args[i].equalsIgnoreCase("-alignerparams")) {
+ alignerParams = args[i+1];
+ System.out.println("Alignment parameters: "+alignerParams);
+ i+=2;
+ } else if (args[i].equalsIgnoreCase("-scheduler")) {
+ scheduler = args[i+1];
+ i+=2;
+ } else if (args[i].equalsIgnoreCase("-queue")) {
+ jobQueue = args[i+1];
+ i+=2;
+ } else if (args[i].equalsIgnoreCase("-readsperblast")) {
+ readsPerBlast = Integer.parseInt(args[i+1]);
+ i+=2;
+ } else if (args[i].equalsIgnoreCase("-basecallindex")) {
+ basecallIndex = Integer.parseInt(args[i+1]);
+ i+=2;
+ } else if (args[i].equalsIgnoreCase("-numthreads") || args[i].equalsIgnoreCase("-t")) {
+ numThreads = Integer.parseInt(args[i+1]);
+ i+=2;
+ } else if (args[i].equalsIgnoreCase("-subdirs") || args[i].equalsIgnoreCase("-barcoding")) {
+ System.out.println("-barcoding option ignore - now detected automatically.");
+ i++;
+ } else if (args[i].equalsIgnoreCase("-keeplogs")) {
+ clearLogsOnStart = false;
+ i++;
+ } else if (args[i].equalsIgnoreCase("-mergereads")) {
+ mergeFastaFiles = true;
+ i++;
+ } else if (args[i].equalsIgnoreCase("-minquality")) {
+ minQForPass = Double.parseDouble(args[i+1]);
+ i+=2;
+ } else {
+ System.out.println("Unknown parameter: " + args[i]);
+ System.exit(0);
+ }
+ }
+
+ if ((runMode == MODE_ALIGN) || (runMode == MODE_ANALYSE)) {
+ if (referenceFile == null) {
+ System.out.println("Error: You must specify a reference");
+ System.exit(1);
+ }
+ if (!referenceFile.endsWith(".fa") && !referenceFile.endsWith(".fasta")) {
+ System.out.println("Error: reference must specify a .fa or .fasta file");
+ System.exit(1);
+ }
+ }
+
+ if (runMode == MODE_PROCESS) {
+ if (processFile == null) {
+ System.out.println("Error: you must specify a process file");
+ System.exit(1);
+ }
+ }
+
+ if (runMode == MODE_COMPARE) {
+ if (comparisonDir == null) {
+ System.out.println("Error: you must specify an output dir for the comparison");
+ System.exit(1);
+ } else {
+ checkAndMakeComparisonDirs();
+ }
+ } else {
+ if (sampleDirectory == null) {
+ System.out.println("Error: You must specify a sample");
+ System.exit(1);
+ } else {
+ File s = new File(sampleDirectory);
+ if (!s.exists()) {
+ System.out.println("Error: sample directory doesn't exist");
+ System.exit(1);
+ }
+
+ if (!s.isDirectory()) {
+ System.out.println("Error: sample doesn't point to a directory");
+ System.exit(1);
+ }
+
+ sampleDirectory = s.getAbsolutePath();
+
+ sampleName = s.getName();
+ }
+ }
+
+ initialiseBlastHandlers();
+
+ System.out.println("Number of cores: "+Runtime.getRuntime().availableProcessors());
+
+ executor = new ThreadPoolExecutor(numThreads, numThreads, 10, TimeUnit.SECONDS, new LinkedBlockingQueue<Runnable>());
+ }
+
+ public String getAligner() {
+ return aligner;
+ }
+
+ public String getAlignerParams() {
+ return alignerParams;
+ }
+
+ public void setReadFormat(int f) {
+ readFormat = f;
+ //System.out.println("Read format "+f);
+ }
+
+ /**
+ * Get sample name.
+ * @return name String
+ */
+ public String getSample() {
+ return sampleName;
+ }
+
+ /**
+ * Get name of references file.
+ * @return filename String
+ */
+ public String getReferenceFile() {
+ return referenceFile;
+ }
+
+ /**
+ * Get coverage graph bin size.
+ * @return bin size
+ */
+ public int getCoverageBinSize() {
+ return coverageBinSize;
+ }
+
+ /**
+ * Get a type string (Template, Complement, 2D) from an integer.
+ * @param n integer to convert
+ * @return type String
+ */
+ public static String getTypeFromInt(int n) {
+ String typeString;
+
+ switch(n) {
+ case TYPE_TEMPLATE: typeString = "Template"; break;
+ case TYPE_COMPLEMENT: typeString = "Complement"; break;
+ case TYPE_2D: typeString = "2D"; break;
+ default: typeString = "Unknown"; break;
+ }
+
+ return typeString;
+ }
+
+ public static String getPassFailFromInt(int n) {
+ String typeString;
+
+ switch(n) {
+ case READTYPE_PASS: typeString = "pass"; break;
+ case READTYPE_FAIL: typeString = "fail"; break;
+ default: typeString = "Unknown"; break;
+ }
+
+ return typeString;
+ }
+
+ /**
+ * Get an error type string (Insertion, Deletion, Substitution) from an integer.
+ * @param n error type integer
+ * @return type String
+ */
+ public static String getErrorTypeFromInt(int n) {
+ String typeString;
+
+ switch(n) {
+ case TYPE_INSERTION: typeString = "Insertion"; break;
+ case TYPE_DELETION: typeString = "Deletion"; break;
+ case TYPE_SUBSTITUTION: typeString = "Substitution"; break;
+ default: typeString = "Unknown"; break;
+ }
+
+ return typeString;
+ }
+
+ /**
+ * Check if various required directories exist and create if not.
+ */
+ public void checkAnalysisDirectoryStructure() {
+ File analysisDir = new File(getAnalysisDir());
+ File unalignedAnalysisDir = new File(getAnalysisDir()+File.separator+"Unaligned");
+ File graphsDir = new File(getGraphsDir());
+ File motifsDir = new File(getGraphsDir() + File.separator + "motifs");
+ File latexDir = new File(getLatexDir());
+
+ if (!analysisDir.exists()) {
+ analysisDir.mkdir();
+ }
+
+ if (!unalignedAnalysisDir.exists()) {
+ unalignedAnalysisDir.mkdir();
+ }
+
+ if (!graphsDir.exists()) {
+ graphsDir.mkdir();
+ }
+
+ if (!motifsDir.exists()) {
+ motifsDir.mkdir();
+ }
+
+ if (!latexDir.exists()) {
+ latexDir.mkdir();
+ }
+ }
+
+ public void checkAndMakeComparisonDirs() {
+ File f = new File(comparisonDir);
+ if (!f.exists()) {
+ f.mkdir();
+ }
+
+ f = new File(this.getGraphsDir());
+ if (!f.exists()) {
+ f.mkdir();
+ }
+
+ f = new File(this.getLatexDir());
+ if (!f.exists()) {
+ f.mkdir();
+ }
+
+ f = new File(this.getLogsDir());
+ if (!f.exists()) {
+ f.mkdir();
+ }
+
+ f = new File(this.getLogsDir()+File.separator+"R");
+ if (!f.exists()) {
+ f.mkdir();
+ }
+ }
+
+ /**
+ * Check if an analysis reference directory exists and make if not.
+ * @param reference name of reference
+ */
+ public void checkAndMakeReferenceAnalysisDir(String reference) {
+ File analysisDir = new File(getAnalysisDir() + File.separator + reference);
+ File graphsDir = new File(getGraphsDir() + File.separator + reference);
+
+ if (!analysisDir.exists()) {
+ analysisDir.mkdir();
+ }
+ if (!graphsDir.exists()) {
+ graphsDir.mkdir();
+ }
+
+ }
+
+ /**
+ * Create a new alignment summary file.
+ */
+ public void initialiseAlignmentSummaryFile() {
+ try {
+ PrintWriter pw = new PrintWriter(new FileWriter(this.getAlignmentSummaryFilename()));
+ pw.close();
+ } catch (IOException e) {
+ System.out.println("initialiseAlignmentSummaryFile exception:");
+ e.printStackTrace();
+ System.exit(1);
+ }
+ }
+
+ /**
+ * Get filename of alignment summary file.
+ * @return filename String
+ */
+ public String getAlignmentSummaryFilename() {
+ return getAnalysisDir() + File.separator + "all_summary.txt";
+ }
+
+ /**
+ * Get filename of length summary file.
+ * @return filename String
+ */
+ public String getLengthSummaryFilename() {
+ return getAnalysisDir() + File.separator + "length_summary.txt";
+ }
+
+ /**
+ * Get scripts directory.
+ * @return directory name as String
+ */
+ public String getScriptsDir() {
+ return scriptsDir;
+ }
+
+ public String getSampleDirectory() {
+ return sampleDirectory;
+ }
+
+ /**
+ * Get graphs directory.
+ * @return directory name as String
+ */
+ public String getGraphsDir() {
+ if (runMode == MODE_COMPARE) {
+ return comparisonDir + File.separator + "graphs";
+ } else {
+ return sampleDirectory + File.separator + "graphs" + getAnalysisSuffix();
+ }
+ }
+
+ public String getFastaDir() {
+ return sampleDirectory + File.separator + "fasta";
+ }
+
+ public String getFastqDir() {
+ return sampleDirectory + File.separator + "fastq";
+ }
+
+ public String getFast5Dir() {
+ // Check for full path
+ if ((readsDir.startsWith("/")) || (readsDir.startsWith("~")) || (readsDir.startsWith("."))) {
+ return readsDir;
+ } else {
+ return sampleDirectory + File.separator + readsDir;
+ }
+ }
+
+ /**
+ * Get read directory (FASTA or FASTQ, depending on read format).
+ * @return directory name as String
+ */
+ public String getReadDir() {
+ String dir;
+
+ if (readFormat == FASTQ) {
+ dir = getFastqDir();
+ } else {
+ dir = getFastaDir();
+ }
+
+ return dir;
+ }
+
+ public String getExpectedReadFormat() {
+ String format;
+
+ if (readFormat == FASTQ) {
+ format = "FASTQ";
+ } else {
+ format = "FASTA";
+ }
+
+ return format;
+ }
+
+ /**
+ * Get aligner output directory.
+ * @return directory name as String
+ */
+ public String getAlignerDir() {
+ return sampleDirectory + File.separator + aligner;
+ }
+
+ /**
+ * Get aligner parse directory.
+ * @return directory name as String
+ */
+ public String getParserDir() {
+ return sampleDirectory + File.separator + aligner+"_parse";
+ }
+
+ /**
+ * Get LaTeX directory.
+ * @return directory name as String
+ */
+ public String getLatexDir() {
+ if (runMode == MODE_COMPARE) {
+ return comparisonDir + File.separator + "latex" + getAnalysisSuffix();
+ } else {
+ return sampleDirectory + File.separator + "latex" + getAnalysisSuffix();
+ }
+ }
+
+ /**
+ * Get logs directory.
+ * @return directory name as String
+ */
+ public String getLogsDir() {
+ if (runMode == MODE_COMPARE) {
+ return comparisonDir + File.separator + "logs";
+ } else {
+ return sampleDirectory + File.separator + "logs";
+ }
+ }
+
+ //public boolean isPassFailFast5Dir() {
+ // File passDir = new File(getFast5Dir() + File.separator + "pass");
+ // File failDir = new File(getFast5Dir() + File.separator + "fail");
+ // boolean rc = false;
+
+ // if (((passDir.exists() && passDir.isDirectory()) || (failDir.exists() && failDir.isDirectory()))) {
+ // rc = true;
+ // }
+
+ // return rc;
+ //}
+
+
+
+ //public boolean isPassFailReadDir() {
+ // File passDir = new File(getReadDir() + File.separator + "pass");
+ // File failDir = new File(getReadDir() + File.separator + "pass");
+ // boolean rc = false;
+ //
+ // if (((passDir.exists() && passDir.isDirectory()) || failDir.exists() && failDir.isDirectory())) {
+ // rc = true;
+ // }
+ //
+ //
+ //
+ // return rc;
+ //}
+
+ public String getAnalysisSuffix() {
+ String s = "_" + aligner;
+ if (processPassReads && processFailReads) {
+ s += "_passfail";
+ } else if (processPassReads) {
+ s += "_passonly";
+ } else if (processFailReads) {
+ s += "_failonly";
+ }
+
+ if (!processTemplateReads && !processComplementReads) {
+ s += "_2donly";
+ }
+
+ return s;
+ }
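+ // Example: with the defaults (aligner "last", pass and fail reads, all read
+ // types) this returns "_last_passfail", so getAnalysisDir() below resolves
+ // to <sample>/analysis_last_passfail.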
+
+
+ /**
+ * Get analysis directory.
+ * @return directory name as String
+ */
+ public String getAnalysisDir() {
+ return sampleDirectory + File.separator + "analysis" + getAnalysisSuffix();
+ }
+
+ /**
+ * Get LaTeX filename.
+ * @return filename as String
+ */
+ public String getTexFilename() {
+ return sampleDirectory + File.separator + "latex" + getAnalysisSuffix() + File.separator + sampleName + ".tex";
+ }
+
+ /**
+ * Check if processing "pass" reads.
+ * @return true to process
+ */
+ public boolean isProcessingPassReads() {
+ return processPassReads;
+ }
+
+ /**
+ * Check if processing "fail" reads.
+ * @return true to process
+ */
+ public boolean isProcessingFailReads() {
+ return processFailReads;
+ }
+
+ public boolean isProcessingComplementReads() {
+ return processComplementReads;
+ }
+
+ public boolean isProcessingTemplateReads() {
+ return processTemplateReads;
+ }
+
+ public boolean isProcessing2DReads() {
+ return process2DReads;
+ }
+
+ public boolean isProcessingReadType(int type) {
+ boolean r = false;
+
+ switch(type) {
+ case TYPE_ALL:
+ r = true;
+ break;
+ case TYPE_TEMPLATE:
+ r = processTemplateReads;
+ break;
+ case TYPE_COMPLEMENT:
+ r = processComplementReads;
+ break;
+ case TYPE_2D:
+ r = process2DReads;
+ break;
+ }
+
+ return r;
+ }
+
+ public int getNumberOfTypes() {
+ int t = 0;
+ if (processTemplateReads) t++;
+ if (processComplementReads) t++;
+ if (process2DReads) t++;
+ return t;
+ }
+
+ /**
+ * Check if to parse alignments or not
+ * @return true to parse
+ */
+ public boolean doParseAlignments() {
+ return parseAlignments;
+ }
+
+ /**
+ * Check if to plot graphs or not
+ * @return true to plot
+ */
+ public boolean doPlotGraphs() {
+ return plotGraphs;
+ }
+
+ /**
+ * Check if to make report or not
+ * @return true to make report
+ */
+ public boolean doMakeReport() {
+ return makeReport;
+ }
+
+ /**
+ * Check if to make PDF or not
+ * @return true to make PDF
+ */
+ public boolean doMakePDF() {
+ return makePDF;
+ }
+
+ /**
+ * Get maximum number of reads (used for debugging)
+ * @return maximum number of reads
+ */
+ public int getMaxReads() {
+ return maxReads;
+ }
+
+ public int getReadFormat() {
+ return readFormat;
+ }
+
+ public int getRunMode() {
+ return runMode;
+ }
+
+ public boolean fixIDs() {
+ return fixIDs;
+ }
+
+ public boolean fixRandom() {
+ return fixRandom;
+ }
+
+ public String getScheduler() {
+ return scheduler;
+ }
+
+ public int getNumberOfThreads() {
+ return numThreads;
+ }
+
+ public String getQueue() {
+ return jobQueue;
+ }
+
+ public NanoOKLog getLog() {
+ return logFile;
+ }
+
+ public boolean isBarcoded() {
+ return sampleChecker.usingBarcodes();
+ }
+
+ /**
+ * Get the right alignment file parser for the configured aligner.
+ * @return an AlignmentFileParser instance for the chosen aligner
+ */
+ public AlignmentFileParser getParser() {
+ AlignmentFileParser parser = null;
+
+ switch(aligner) {
+ case "last":
+ parser = new LastParser(this, references);
+ break;
+ case "bwa":
+ parser = new BWAParser(this, references);
+ break;
+ case "blasr":
+ parser = new BLASRParser(this, references);
+ break;
+ case "marginalign":
+ parser = new MarginAlignParser(this, references);
+ break;
+ case "graphmap":
+ parser = new GraphMapParser(this, references);
+ break;
+ default:
+ System.out.println("Aligner unknown!");
+ System.out.println("");
+ System.exit(1);
+ break;
+ }
+
+ if (alignerParams != null && !alignerParams.isEmpty()) {
+ parser.setAlignmentParams(alignerParams);
+ }
+
+ return parser;
+ }
+
+ public boolean doKmerCounting() {
+ return doKmerCounting;
+ }
+
+ public String getImageFormat() {
+ return imageFormat;
+ }
+
+ public String getSampleList() {
+ return sampleList;
+ }
+
+ public String getComparisonDir() {
+ return comparisonDir;
+ }
+
+ public int getSpecifiedType() {
+ return specifiedType;
+ }
+
+ public boolean showAlignerCommand() {
+ return showAlignerCommand;
+ }
+
+ public int getBasecallIndex() {
+ return basecallIndex;
+ }
+
+ public boolean outputFast5Path() {
+ return outputFast5Path;
+ }
+
+ public String getBacteriaPath() {
+ if (bacteriaPath == null) {
+ System.out.println("Error: no nt path set.\n");
+ System.exit(1);
+ }
+
+ return bacteriaPath;
+ }
+
+ public String getntPath() {
+ if (ntPath == null) {
+ System.out.println("Error: no nt path set.\n");
+ System.exit(1);
+ }
+
+ return ntPath;
+ }
+
+ public String getCardPath() {
+ if (cardPath == null) {
+ System.out.println("Error: no CARD path set.\n");
+ System.exit(1);
+ }
+
+ return cardPath;
+ }
+
+ public WatcherLog getWatcherReadLog() {
+ return watcherReadLog;
+ }
+
+ public WatcherLog getWatcherCardFileLog() {
+ return watcherCardFileLog;
+ }
+
+ public WatcherLog getWatcherCardCommandLog() {
+ return watcherCardCommandLog;
+ }
+
+ public WatcherLog getWatcherntFileLog() {
+ return watcherntFileLog;
+ }
+
+ public WatcherLog getWatcherntCommandLog() {
+ return watcherntCommandLog;
+ }
+
+ public BlastMerger getMergerCardPass() {
+ return mergerCardPass;
+ }
+
+ public BlastMerger getMergerntPass() {
+ return mergerntPass;
+ }
+
+ public BlastMerger getMergerCardFail() {
+ return mergerCardFail;
+ }
+
+ public BlastMerger getMergerntFail() {
+ return mergerntFail;
+ }
+
+
+ public boolean clearLogsOnStart() {
+ return clearLogsOnStart;
+ }
+
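+ // Note: merged read files are only tracked for Template (1D) and 2D reads below; Complement
+ // reads are not merged.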
+ public void openMergedFile(String filename, int type, int pf) {
+ if (pf == NanoOKOptions.READTYPE_PASS) {
+ if (type == NanoOKOptions.TYPE_TEMPLATE) {
+ mergedPass1D = new MergedFastAQFile(this, filename);
+ } else if (type == NanoOKOptions.TYPE_2D) {
+ mergedPass2D = new MergedFastAQFile(this, filename);
+ }
+ } else if (pf == NanoOKOptions.READTYPE_FAIL) {
+ if (type == NanoOKOptions.TYPE_TEMPLATE) {
+ mergedFail1D = new MergedFastAQFile(this, filename);
+ } else if (type == NanoOKOptions.TYPE_2D) {
+ mergedFail2D = new MergedFastAQFile(this, filename);
+ }
+ }
+ }
+
+ public MergedFastAQFile getMergedFile(int type, int pf) {
+ if (pf == NanoOKOptions.READTYPE_PASS) {
+ if (type == NanoOKOptions.TYPE_TEMPLATE) {
+ return mergedPass1D;
+ } else if (type == NanoOKOptions.TYPE_2D) {
+ return mergedPass2D;
+ }
+ } else if (pf == NanoOKOptions.READTYPE_FAIL) {
+ if (type == NanoOKOptions.TYPE_TEMPLATE) {
+ return mergedFail1D;
+ } else if (type == NanoOKOptions.TYPE_2D) {
+ return mergedFail2D;
+ }
+ }
+
+ return null;
+ }
+
+ public int getReadsPerBlast() {
+ return readsPerBlast;
+ }
+
+ public ThreadPoolExecutor getThreadExecutor() {
+ return executor;
+ }
+
+ public boolean keepRunning() {
+ return true;
+ }
+
+ public boolean isExtractingReads() {
+ return extractingReads;
+ }
+
+ public boolean isAligningRead() {
+ return aligningReads;
+ }
+
+ public boolean isParsingRead() {
+ return parsingReads;
+ }
+
+ public boolean isBlastingRead() {
+ return blastingReads;
+ }
+
+ public int getFileWatcherTimeout() {
+ return fileWatcherTimeout;
+ }
+
+ private void checkAndMakeDirectory(String dir) {
+ File f = new File(dir);
+ if (f.exists()) {
+ if (!f.isDirectory()) {
+ System.out.println("Error: " + dir + " is a file, not a directory!");
+ System.exit(1);
+ }
+ } else {
+ System.out.println("Making directory " + dir);
+ f.mkdir();
+ }
+ }
+
+ private void checkAndMakeDirectoryWithChildren(String dirname) {
+ checkAndMakeDirectory(dirname);
+ for (int t=0; t<3; t++) {
+ if (this.isProcessingReadType(t)) {
+ checkAndMakeDirectory(dirname + File.separator + NanoOKOptions.getTypeFromInt(t));
+ }
+ }
+ }
+
+ // Directory structure
+ // fast5
+ // - pass
+ // - BC01
+ // - BC02
+ // - fail
+ // - unaligned
+ // fasta
+ // - pass
+ // - BC01
+ // - 2D
+ // - Template
+ // - Complement
+ // - BC02
+ // ...
+ // - fail
+ public void makeDirectories() {
+ checkAndMakeDirectory(this.getLogsDir());
+
+ if (this.isExtractingReads()) {
+ checkAndMakeDirectory(this.getReadDir());
+
+ if (this.isBlastingRead()) {
+ checkAndMakeDirectory(this.getReadDir() + "_chunks");
+ }
+ //if (this.isNewStyleDir()) {
+ // for (int i=READTYPE_PASS; i<=READTYPE_FAIL; i++) {
+ // String pf = NanoOKOptions.getPassFailFromInt(i);
+ // checkAndMakeDirectoryWithChildren(this.getReadDir() + File.separator + pf);
+ // if (this.processSubdirs()) {
+ // File inputDir = new File(this.getFast5Dir());
+ // File[] listOfFiles = inputDir.listFiles();
+ // for (File file : listOfFiles) {
+ // if (file.isDirectory()) {
+ // checkAndMakeDirectoryWithChildren(this.getReadDir() + File.separator + file.getName());
+ // }
+ // }
+ // }
+ // }
+ //}
+ }
+
+ if (this.isAligningRead()) {
+ checkAndMakeDirectory(this.getAlignerDir());
+ checkAndMakeDirectory(this.getLogsDir() + File.separator + this.getAligner());
+ //if (this.isNewStyleReadDir()) {
+ // for (int i=READTYPE_PASS; i<=READTYPE_FAIL; i++) {
+ // String pf = NanoOKOptions.getPassFailFromInt(i);
+ // checkAndMakeDirectoryWithChildren(this.getAlignerDir() + File.separator + pf);
+ // checkAndMakeDirectoryWithChildren(this.getLogsDir() + File.separator + this.getAligner() + File.separator + pf);
+ // if (this.processSubdirs()) {
+ // File inputDir = new File(this.getReadDir());
+ // File[] listOfFiles = inputDir.listFiles();
+ // for (File file : listOfFiles) {
+ // if (file.isDirectory()) {
+ // checkAndMakeDirectoryWithChildren(this.getAlignerDir() + File.separator + file.getName());
+ // checkAndMakeDirectoryWithChildren(this.getLogsDir() + File.separator + this.getAligner() + File.separator + file.getName());
+ // }
+ // }
+ // }
+ // }
+ //}
+ }
+
+ if (this.isParsingRead()) {
+ checkAndMakeDirectory(this.getParserDir());
+ //if (this.isNewStyleReadDir()) {
+ // for (int i=READTYPE_PASS; i<=READTYPE_FAIL; i++) {
+ // String pf = NanoOKOptions.getPassFailFromInt(i);
+ // checkAndMakeDirectoryWithChildren(this.getParserDir() + File.separator + pf);
+ // if (this.processSubdirs()) {
+ // File inputDir = new File(this.getReadDir());
+ // File[] listOfFiles = inputDir.listFiles();
+ // for (File file : listOfFiles) {
+ // if (file.isDirectory()) {
+ // checkAndMakeDirectoryWithChildren(this.getParserDir() + File.separator + file.getName());
+ // }
+ // }
+ // }
+ // }
+ //}
+ }
+
+ if (this.isBlastingRead()) {
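+ // Each Blast process entry is expected as "name,tool,database,memory,queue", for example
+ // "nt,blastn,/path/to/nt,8G,myqueue" (the values here are illustrative only).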
+ for (int i=0; i<blastProcesses.size(); i++) {
+ String[] params = blastProcesses.get(i).split(",");
+ if (params.length == 5) {
+ String blastName = params[0];
+ String blastTool = params[1];
+ String blastDb = params[2];
+ String memory = params[3];
+ String queue = params[4];
+ checkAndMakeDirectory(getSampleDirectory() + File.separator + blastTool + "_" + blastName + File.separator);
+ checkAndMakeDirectory(getLogsDir() + File.separator + blastTool + "_" + blastName + File.separator);
+ }
+ }
+ }
+ }
+
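+ // A minimal sketch of the process file format consumed below. The token names are the ones
+ // handled in readProcessFile(); the values are hypothetical:
+ //
+ // Sample:/path/to/sample
+ // Fast5Dir:fast5
+ // Reference:references/lambda.fasta
+ // Extract:fastq
+ // Aligner:last
+ // Analysis:default
+ // Blast:nt,blastn,/path/to/nt,8G,myqueue
+ // ReadsPerBlast:500
+ // # lines starting with '#' are ignored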
+ void readProcessFile() {
+ BufferedReader br;
+
+ System.out.println("\nReading process file "+processFile);
+ try {
+ br = new BufferedReader(new FileReader(processFile));
+ String line;
+
+ do {
+ line = br.readLine();
+ if (line != null) {
+ if (line.length() > 1) {
+ String[] tokens = line.split(":");
+ if (tokens[0].compareToIgnoreCase("Extract") == 0) {
+ extractingReads = true;
+ System.out.println(" Extract "+tokens[1]);
+ } else if (tokens[0].compareToIgnoreCase("Fast5Dir") == 0) {
+ readsDir = tokens[1];
+ System.out.println(" Fast5Dir "+tokens[1]);
+ } else if (tokens[0].compareToIgnoreCase("Aligner") == 0) {
+ aligningReads = true;
+ System.out.println(" Aligner "+tokens[1]);
+ } else if (tokens[0].compareToIgnoreCase("Reference") == 0) {
+ referenceFile = tokens[1];
+ System.out.println(" Reference "+tokens[1]);
+ } else if (tokens[0].compareToIgnoreCase("Sample") == 0) {
+ sampleDirectory = tokens[1];
+ System.out.println(" Sample "+tokens[1]);
+ } else if (tokens[0].compareToIgnoreCase("Analysis") == 0) {
+ parsingReads = true;
+ System.out.println(" Analysis "+tokens[1]);
+ } else if (tokens[0].compareToIgnoreCase("Aligner") == 0) {
+ aligner = tokens[1];
+ System.out.println(" Aligner "+tokens[1]);
+ } else if (tokens[0].compareToIgnoreCase("Blast") == 0) {
+ blastingReads = true;
+ blastProcesses.add(tokens[1]);
+ System.out.println(" Blast "+tokens[1]);
+ } else if (tokens[0].compareToIgnoreCase("ReadsPerBlast") == 0) {
+ readsPerBlast = Integer.parseInt(tokens[1]);
+ System.out.println(" ReadsPerBlast "+readsPerBlast);
+ } else if (!tokens[0].startsWith("#")) {
+ System.out.println("Unknown token "+tokens[0]);
+ }
+ }
+ }
+ } while (line != null);
+ } catch (Exception e) {
+ System.out.println("readProcessFile Exception:");
+ e.printStackTrace();
+ System.exit(1);
+ }
+
+ System.out.println("");
+ }
+
+ public void initialiseBlastHandlers() {
+ for (int t=0; t<3; t++) {
+ for (int pf=NanoOKOptions.READTYPE_PASS; pf<=NanoOKOptions.READTYPE_FAIL; pf++) {
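+ // pf runs from READTYPE_PASS to READTYPE_FAIL; subtract 1 for a zero-based index into blastHandlers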
+ blastHandlers[t][pf-1] = new BlastHandler(this, t, pf);
+ }
+ }
+ }
+
+ public BlastHandler getBlastHandler(int t, int pf) {
+ return blastHandlers[t][pf-1];
+ }
+
+ public ArrayList<String> getBlastProcesses() {
+ return blastProcesses;
+ }
+
+ public int getFileCounterOffset() {
+ return fileCounterOffset;
+ }
+
+ public boolean mergeFastaFiles() {
+ return mergeFastaFiles;
+ }
+
+ public ReadFileMerger getReadFileMerger() {
+ return readFileMerger;
+ }
+
+ public boolean usingBatchDirs() {
+ return sampleChecker.usingBatchDirs();
+ }
+
+ public boolean doForce() {
+ return force;
+ }
+
+ public SampleChecker getSampleChecker() {
+ return sampleChecker;
+ }
+
+ public boolean usingPassFailDirs() {
+ return sampleChecker.usingPassFailDirs();
+ }
+
+ public double getMinQ() {
+ return minQForPass;
+ }
+
+ public void initialiseReadMerger() {
+ readFileMerger = new ReadFileMerger(this);
+ }
+
+ public boolean debugMode() {
+ return false;
+ }
+}
diff --git a/src/nanook/OverallStats.java b/src/nanook/OverallStats.java
new file mode 100644
index 0000000..db5e183
--- /dev/null
+++ b/src/nanook/OverallStats.java
@@ -0,0 +1,39 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett
+ *
+ * Copyright 2015 The Genome Analysis Centre (TGAC)
+ */
+
+package nanook;
+
+import java.io.Serializable;
+
+/**
+ * Represents overall (as opposed to per reference) stats for Template, Complement and 2D reads.
+ *
+ * @author Richard Leggett
+ */
+public class OverallStats implements Serializable {
+ private static final long serialVersionUID = NanoOK.SERIAL_VERSION;
+ private ReadSetStats[] readStats = new ReadSetStats[3];
+
+ /**
+ * Constructor.
+ * @param o NanoOKOptions structure
+ */
+ public OverallStats(NanoOKOptions o) {
+ for (int t=0; t<3; t++) {
+ readStats[t] = new ReadSetStats(o, t);
+ }
+ }
+
+ /**
+ * Get a set of stats (for either Template, Complement or 2D reads)
+ * @param t integer type - see defs in NanoOKOptions
+ * @return ReadSetStats object
+ */
+ public ReadSetStats getStatsByType(int t) {
+ return readStats[t];
+ }
+}
diff --git a/src/nanook/ParserRunnable.java b/src/nanook/ParserRunnable.java
new file mode 100644
index 0000000..7bc2826
--- /dev/null
+++ b/src/nanook/ParserRunnable.java
@@ -0,0 +1,163 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett
+ *
+ * Copyright 2015 The Genome Analysis Centre (TGAC)
+ */
+
+package nanook;
+
+import java.io.File;
+import java.util.List;
+import java.util.Random;
+
+/**
+ * Enables multi-threading of parsing
+ *
+ * @author Richard Leggett
+ */
+public class ParserRunnable implements Runnable
+{
+ private NanoOKOptions options;
+ private ReadSetStats stats;
+ private String readPath;
+ private String alignmentPath;
+ private AlignmentsTableFile nonAlignedSummary;
+ private ReferenceSequence readReference = null;
+ private SequenceReader sr;
+ private int type;
+ private int passfail;
+
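+ /**
+ * Constructor.
+ * @param o program options
+ * @param s read set statistics to update
+ * @param rp pathname of the FASTA/FASTQ read file
+ * @param ap pathname of the corresponding alignment file
+ * @param t read type (Template, Complement or 2D)
+ * @param pf pass/fail classification of the read
+ * @param nas summary table used to record non-aligned reads
+ */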
+ public ParserRunnable(NanoOKOptions o, ReadSetStats s, String rp, String ap, int t, int pf, AlignmentsTableFile nas) {
+ options = o;
+ readPath = rp;
+ alignmentPath = ap;
+ stats = s;
+ type = t;
+ passfail = pf;
+ nonAlignedSummary = nas;
+ }
+
+ /**
+ * Pick top alignment from sorted list. List is sorted in order of score, but if there are
+ * matching scores, we pick one at random.
+ * @param al list of alignments
+ * @return index
+ */
+ private int pickTopAlignment(List<Alignment> al) {
+ int index = 0;
+ int topScore = al.get(0).getScore();
+ int countSame = 0;
+
+ if (!options.fixRandom()) {
+ //for (int i=0; i<al.size(); i++) {
+ // System.out.println(i+" = "+al.get(i).getScore());
+ //}
+
+ // Find out how many have the same score
+ while ((countSame < al.size()) && (al.get(countSame).getScore() == topScore)) {
+ countSame++;
+ }
+
+ if (countSame > 1) {
+ Random rn = new Random();
+ index = rn.nextInt(countSame);
+ }
+
+ //System.out.println("Index chosen ("+countSame+") "+index);
+ }
+
+ return index;
+ }
+
+ /**
+ * Parse alignment
+ */
+ private void parseAlignment()
+ {
+ try {
+ File file = new File(alignmentPath);
+ AlignmentFileParser parser = options.getParser();
+
+ options.getLog().println("");
+ options.getLog().println("> New file " + file.getName());
+ options.getLog().println("");
+
+ int nAlignments = parser.parseFile(alignmentPath, nonAlignedSummary, stats);
+
+ if (nAlignments > 0) {
+ parser.sortAlignments();
+ List<Alignment> al = parser.getHighestScoringSet();
+ int topAlignment = pickTopAlignment(al);
+ String readReferenceName = al.get(topAlignment).getHitName();
+
+ options.getLog().println("Query size = " + al.get(topAlignment).getQuerySequenceSize());
+ options.getLog().println(" Hit size = " + al.get(topAlignment).getHitSequenceSize());
+
+ readReference = options.getReferences().getReferenceById(readReferenceName);
+ AlignmentMerger merger = new AlignmentMerger(options, readReference, al.get(topAlignment).getQuerySequenceSize(), stats, stats.getType());
+ for (int i=topAlignment; i<al.size(); i++) {
+ Alignment a = al.get(i);
+ merger.addAlignment(a);
+ }
+ AlignmentInfo ais = merger.endMergeAndStoreStats();
+ readReference.getStatsByType(stats.getType()).addCoverage(merger.getOverallHitStart(), merger.getOverallHitEnd()-merger.getOverallHitStart()+1);
+ readReference.getStatsByType(stats.getType()).getAlignmentsTableFile().writeMergedAlignment(stats, file.getName(), merger, ais);
+ }
+ } catch (Exception e) {
+ System.out.println("Error parsing alignment "+ alignmentPath);
+ options.setReturnValue(1);
+ options.getLog().println("Error parsing alignment " + alignmentPath);
+ e.printStackTrace();
+ }
+ }
+
+ /**
+ * Parse a FASTA or FASTQ file, noting length of reads etc.
+ */
+ private void readQueryFile() {
+ int nReadsInFile;
+
+ sr = new SequenceReader(true);
+
+ if (options.getReadFormat() == NanoOKOptions.FASTQ) {
+ nReadsInFile = sr.indexFASTQFile(readPath);
+ } else {
+ nReadsInFile = sr.indexFASTAFile(readPath, null, true);
+ }
+
+ if (nReadsInFile > 1) {
+ System.out.println("Warning: File "+readPath+" has more than 1 read. NanoOK can't currently handle this.");
+ }
+
+ for (int i=0; i<sr.getSequenceCount(); i++) {
+ String id = sr.getID(i);
+
+ if (id.startsWith("00000000-0000-0000-0000-000000000000")) {
+ System.out.println("Error:");
+ System.out.println(readPath);
+ System.out.println("The reads in this file do not have unique IDs because they were generated when MinKNOW was producing UUIDs, but Metrichor was not using them. To fix, run nanook_extract_reads with the -fixids option.");
+ System.exit(1);
+ }
+
+ if (id.length() > 100) {
+ System.out.println("Problem id " + id);
+ System.out.println("Read path: " + readPath);
+ System.exit(1);
+ }
+ stats.addLength(readPath, id, sr.getLength(i), sr.getGC(i));
+ }
+ }
+
+ /**
+ * Entry point to thread
+ */
+ public void run() {
+ readQueryFile();
+ stats.addReadFile(passfail);
+ parseAlignment();
+ if ((readReference != null) && (options.doKmerCounting())) {
+ sr.storeKmers(0, readReference.getStatsByType(type).getReadKmerTable());
+ }
+ }
+}
diff --git a/src/nanook/ProcessLogger.java b/src/nanook/ProcessLogger.java
new file mode 100644
index 0000000..a2b65b0
--- /dev/null
+++ b/src/nanook/ProcessLogger.java
@@ -0,0 +1,202 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett
+ *
+ * Copyright 2015 The Genome Analysis Centre (TGAC)
+ */
+
+package nanook;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileWriter;
+import java.io.InputStreamReader;
+import java.io.PrintWriter;
+import java.lang.ProcessBuilder.Redirect;
+import java.util.ArrayList;
+
+/**
+ * Execute a system process and log result to a file
+ *
+ * @author Richard Leggett
+ */
+public class ProcessLogger {
+ private boolean writeStdio = true;
+ private boolean writeStderr = true;
+ private boolean writeHeadings = true;
+
+ public ArrayList<String> getCommandOutput(String[] command, boolean stdin, boolean stderr) {
+ ArrayList<String> outputLines = new ArrayList<String>();
+
+ try {
+ Process p = Runtime.getRuntime().exec(command);
+ // ?? p.waitFor();
+
+ if (stdin) {
+ BufferedReader stdInput = new BufferedReader(new InputStreamReader(p.getInputStream()));
+ String s = null;
+ while ((s = stdInput.readLine()) != null) {
+ outputLines.add(s);
+ }
+ }
+
+ if (stderr) {
+ BufferedReader stdError = new BufferedReader(new InputStreamReader(p.getErrorStream()));
+ String s = null;
+ while ((s = stdError.readLine()) != null) {
+ outputLines.add(s);
+ }
+ }
+ } catch (Exception e) {
+ System.out.println("\nProcessLogger exception:");
+ e.printStackTrace();
+ System.exit(1);
+ }
+
+ return outputLines;
+ }
+
+ public void runCommandToLog(String[] command, NanoOKLog log) {
+ ArrayList<String> response = getCommandOutput(command, true, true);
+ for (int i=0; i<response.size(); i++) {
+ log.println(response.get(i));
+ }
+ }
+
+ public void runCommand(String[] command) {
+ ArrayList<String> response = getCommandOutput(command, true, true);
+ for (int i=0; i<response.size(); i++) {
+ System.out.println(response.get(i));
+ }
+ }
+
+ public ArrayList<String> getCommandOutput(String command, boolean stdin, boolean stderr) {
+ ArrayList<String> outputLines = new ArrayList<String>();
+
+ try {
+ Process p = Runtime.getRuntime().exec(command);
+ // ?? p.waitFor();
+
+ if (stdin) {
+ BufferedReader stdInput = new BufferedReader(new InputStreamReader(p.getInputStream()));
+ String s = null;
+ while ((s = stdInput.readLine()) != null) {
+ outputLines.add(s);
+ }
+ }
+
+ if (stderr) {
+ BufferedReader stdError = new BufferedReader(new InputStreamReader(p.getErrorStream()));
+ String s = null;
+ while ((s = stdError.readLine()) != null) {
+ outputLines.add(s);
+ }
+ }
+ } catch (Exception e) {
+ System.out.println("ProcessLogger exception:");
+ e.printStackTrace();
+ System.exit(1);
+ }
+
+ return outputLines;
+ }
+
+ public void runCommand(String command) {
+ ArrayList<String> response = getCommandOutput(command, true, true);
+ for (int i=0; i<response.size(); i++) {
+ System.out.println(response.get(i));
+ }
+ }
+
+ public ArrayList checkCommand(String command) {
+ ArrayList outputLines;
+ boolean isOk = true;
+
+ try {
+ Process p = Runtime.getRuntime().exec(command);
+ BufferedReader stdInput = new BufferedReader(new InputStreamReader(p.getInputStream()));
+ BufferedReader stdError = new BufferedReader(new InputStreamReader(p.getErrorStream()));
+ String s = null;
+
+ outputLines = new ArrayList();
+ while ((s = stdInput.readLine()) != null) {
+ outputLines.add(s);
+ }
+ while ((s = stdError.readLine()) != null) {
+ outputLines.add(s);
+ }
+ } catch (Exception e) {
+ outputLines = null;
+ }
+
+ return outputLines;
+ }
+
+ private synchronized void writeLog(Process p, String command, String logFilename, boolean fAppend) {
+ try {
+ PrintWriter pw = new PrintWriter(new FileWriter(logFilename, fAppend));
+ BufferedReader stdInput = new BufferedReader(new InputStreamReader(p.getInputStream()));
+ BufferedReader stdError = new BufferedReader(new InputStreamReader(p.getErrorStream()));
+
+ if (fAppend && writeHeadings) {
+ pw.println("");
+ pw.println("---");
+ pw.println("");
+ }
+
+ if (writeHeadings) {
+ pw.println("Running "+command);
+ }
+
+ // read the output from the command
+ if (writeHeadings) {
+ pw.println("");
+ pw.println("Stdout:");
+ }
+
+ if (writeStdio) {
+ String s = null;
+ while ((s = stdInput.readLine()) != null) {
+ pw.println(s);
+ }
+ }
+
+ // read any errors from the attempted command
+ if (writeHeadings) {
+ pw.println("");
+ pw.println("Stderr:");
+ }
+
+ if (writeStderr) {
+ String s = null;
+ while ((s = stdError.readLine()) != null) {
+ pw.println(s);
+ }
+ }
+
+ pw.close();
+ } catch (Exception e) {
+ System.out.println("ProcessLogger exception:");
+ e.printStackTrace();
+ System.exit(1);
+ }
+ }
+
+ public void runAndLogCommand(String command, String logFilename, boolean fAppend) {
+ try {
+ Process p = Runtime.getRuntime().exec(command);
+ writeLog(p, command, logFilename, fAppend);
+ p.waitFor();
+ } catch (Exception e) {
+ System.out.println("ProcessLogger exception:");
+ e.printStackTrace();
+ System.exit(1);
+ }
+ }
+
+ public void setWriteFormat(boolean headings, boolean io, boolean err) {
+ writeHeadings = headings;
+ writeStdio = io;
+ writeStderr = err;
+ }
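+
+ // Illustrative usage only (the command and log pathname are hypothetical):
+ // ProcessLogger pl = new ProcessLogger();
+ // pl.setWriteFormat(true, true, true);
+ // pl.runAndLogCommand("some_tool --version", "logs/some_tool.log", false);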
+}
diff --git a/src/nanook/RGraphPlotter.java b/src/nanook/RGraphPlotter.java
new file mode 100644
index 0000000..bacaa11
--- /dev/null
+++ b/src/nanook/RGraphPlotter.java
@@ -0,0 +1,145 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett
+ *
+ * Copyright 2015 The Genome Analysis Centre (TGAC)
+ */
+
+package nanook;
+
+import java.io.*;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.ThreadPoolExecutor;
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Executes command to plot graphs with R.
+ *
+ * @author Richard Leggett
+ */
+public class RGraphPlotter {
+ private ThreadPoolExecutor executor;
+ private NanoOKOptions options;
+ private long lastCompleted = -1;
+ private String logDirectory;
+
+ /**
+ * Constructor.
+ * @param o NanoOKOptions object
+ */
+ public RGraphPlotter(NanoOKOptions o) {
+ options = o;
+ executor = new ThreadPoolExecutor(options.getNumberOfThreads(), options.getNumberOfThreads(), 10, TimeUnit.SECONDS, new LinkedBlockingQueue<Runnable>());
+ logDirectory = options.getLogsDir() + File.separator + "R" + options.getAnalysisSuffix();
+ File f = new File(logDirectory);
+ if (!f.exists()) {
+ f.mkdir();
+ options.getLog().println("Made directory " + logDirectory);
+ }
+ }
+
+ /**
+ * Write progress
+ */
+ private void writeProgress() {
+ long completed = executor.getCompletedTaskCount();
+ long total = executor.getTaskCount();
+ long e = 0;
+ long s = NanoOKOptions.PROGRESS_WIDTH;
+
+ if (total > 0) {
+ e = NanoOKOptions.PROGRESS_WIDTH * completed / total;
+ s = NanoOKOptions.PROGRESS_WIDTH - e;
+ }
+
+
+ if (completed != lastCompleted) {
+ System.out.print("\r[");
+ for (int i=0; i<e; i++) {
+ System.out.print("=");
+ }
+ for (int i=0; i<s; i++) {
+ System.out.print(" ");
+ }
+ System.out.print("] " + completed +"/" + total);
+ lastCompleted = completed;
+ }
+ }
+
+ public void runScript(boolean fComparison, String scriptName, String logPrefix, String refName) {
+ ArrayList<String> args = new ArrayList<String>();
+ String logFilename = logDirectory + File.separator + logPrefix;
+
+ args.add("Rscript");
+ args.add(options.getScriptsDir() + File.separator + scriptName);
+
+ if (fComparison) {
+ File f = new File(options.getAnalysisDir());
+ args.add(f.getName());
+ } else {
+ args.add(options.getAnalysisDir());
+ }
+
+ args.add(options.getGraphsDir());
+
+ if (fComparison) {
+ args.add(options.getSampleList());
+ args.add(options.getComparisonDir());
+ }
+
+ if (refName != null) {
+ args.add(refName);
+ logFilename = logFilename + "_"+refName;
+ }
+
+ args.add(options.getImageFormat());
+
+ //System.out.println(args);
+
+ options.getLog().println("Running Rscript "+scriptName);
+ options.getLog().println("Log file is "+logFilename);
+ executor.execute(new RGraphRunnable("Rscript", args, logFilename + ".txt"));
+ writeProgress();
+ }
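+
+ // For a non-comparison run this builds an invocation roughly like (paths illustrative):
+ // Rscript <scriptsDir>/nanook_plot_lengths.R <analysisDir> <graphsDir> <imageFormat>
+ // with the reference name inserted before the image format when refName is supplied.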
+
+ /**
+ * Execute plot commands.
+ * @param references References object containing all references
+ */
+ public void plot(boolean fComparison) throws InterruptedException {
+ String s = null;
+
+ if (fComparison) {
+ runScript(fComparison, "nanook_plot_comparison.R", "plot_lengths", null);
+ } else {
+ runScript(fComparison, "nanook_plot_lengths.R", "plot_lengths", null);
+ }
+
+ Set<String> ids = options.getReferences().getAllIds();
+ for (String id : ids) {
+ ReferenceSequence rs = options.getReferences().getReferenceById(id);
+ String name = rs.getName();
+ if (fComparison) {
+ runScript(fComparison, "nanook_plot_comparison_reference.R", "plot_reference", name);
+ } else {
+ if (rs.getTotalNumberOfAlignments() > NanoOKOptions.MIN_ALIGNMENTS) {
+ runScript(fComparison, "nanook_plot_reference.R", "plot_reference", name);
+ }
+ }
+ writeProgress();
+ }
+
+ // That's all - wait for all threads to finish
+ executor.shutdown();
+ while (!executor.isTerminated()) {
+ writeProgress();
+ Thread.sleep(100);
+ }
+
+ writeProgress();
+ System.out.println("");
+ }
+}
diff --git a/src/nanook/RGraphRunnable.java b/src/nanook/RGraphRunnable.java
new file mode 100644
index 0000000..c16dd3a
--- /dev/null
+++ b/src/nanook/RGraphRunnable.java
@@ -0,0 +1,69 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett
+ *
+ * Copyright 2015 The Genome Analysis Centre (TGAC)
+ */
+
+package nanook;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.util.List;
+
+/**
+ * Enable multi-threading of R plotting
+ *
+ * @author Richard Leggett
+ */
+public class RGraphRunnable implements Runnable {
+ private String command;
+ private String logFilename;
+ private List<String> args;
+
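+ /**
+ * Constructor.
+ * @param cmd command name (stored for reference; the args list is what gets executed)
+ * @param a full argument list, including the executable, passed to ProcessBuilder
+ * @param log pathname of the log file output is redirected to
+ */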
+ public RGraphRunnable(String cmd, List<String> a, String log) {
+ command = cmd;
+ args = a;
+ logFilename = log;
+ }
+
+ public void checkLogForErrors(String filename) {
+ try
+ {
+ BufferedReader br = new BufferedReader(new FileReader(filename));
+ String line = null;
+
+ do {
+ line = br.readLine();
+ if (line != null) {
+ if (line.contains("there is no package called")) {
+ System.out.println("");
+ System.out.println("R error - have you installed all the package dependencies? See documentation for help.");
+ System.out.println(line);
+ }
+ }
+ } while (line != null);
+ br.close();
+ } catch (Exception e) {
+ System.out.println("Exception:");
+ e.printStackTrace();
+ System.exit(1);
+ }
+ }
+
+ public void run() {
+ try {
+ ProcessBuilder pb = new ProcessBuilder(args);
+ pb.redirectErrorStream(true);
+ pb.redirectOutput(ProcessBuilder.Redirect.to(new File(logFilename))); // Redirect.to overwrites the log; Redirect.appendTo would append instead
+ Process p = pb.start();
+ p.waitFor();
+ checkLogForErrors(logFilename);
+ } catch (Exception e) {
+ System.out.println("RGraphRunnable exception:");
+ e.printStackTrace();
+ System.exit(1);
+ }
+ }
+}
diff --git a/src/nanook/ReadAligner.java b/src/nanook/ReadAligner.java
new file mode 100644
index 0000000..db06c74
--- /dev/null
+++ b/src/nanook/ReadAligner.java
@@ -0,0 +1,226 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett (richard.leggett at earlham.ac.uk)
+ *
+ * Copyright 2015-17 Earlham Institute
+ */
+
+package nanook;
+
+import java.io.File;
+import java.io.FileWriter;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.concurrent.*;
+
+/**
+ * Align reads
+ *
+ * @author Richard Leggett
+ */
+public class ReadAligner {
+ private NanoOKOptions options;
+ private AlignmentFileParser parser;
+ private ThreadPoolExecutor executor;
+ private long lastCompleted = -1;
+
+ /**
+ * Constructor
+ * @param o program options
+ */
+ public ReadAligner(NanoOKOptions o, AlignmentFileParser afp) {
+ options = o;
+ parser = afp;
+
+ executor = new ThreadPoolExecutor(options.getNumberOfThreads(), options.getNumberOfThreads(), 10, TimeUnit.SECONDS, new LinkedBlockingQueue<Runnable>());
+ }
+
+ /**
+ * Write progress
+ */
+ private void writeProgress() {
+ long completed = executor.getCompletedTaskCount();
+ long total = executor.getTaskCount();
+ long e = 0;
+ long s = NanoOKOptions.PROGRESS_WIDTH;
+
+ if (total > 0) {
+ e = NanoOKOptions.PROGRESS_WIDTH * completed / total;
+ s = NanoOKOptions.PROGRESS_WIDTH - e;
+ }
+
+ if (completed != lastCompleted) {
+ System.out.print("\rAlignment [");
+ for (int i=0; i<e; i++) {
+ System.out.print("=");
+ }
+ for (int i=0; i<s; i++) {
+ System.out.print(" ");
+ }
+ System.out.print("] " + completed +"/" + total);
+ lastCompleted = completed;
+ }
+ }
+
+ private void checkAndMakeDir(String dir) {
+ File f = new File(dir);
+ if (f.exists()) {
+ if (!f.isDirectory()) {
+ System.out.println("Error: " + dir + " is a file, not a directory!");
+ System.exit(1);
+ }
+ } else {
+ //System.out.println("Making directory " + dir);
+ f.mkdir();
+ }
+ }
+
+ /**
+ * Create directories for output
+ */
+ public void createDirectories() {
+ checkAndMakeDir(options.getAlignerDir());
+ checkAndMakeDir(options.getLogsDir());
+ checkAndMakeDir(options.getLogsDir() + File.separator + options.getAligner());
+ }
+
+ public boolean isValidReadFile(String filename) {
+ boolean isValid = false;
+
+ //System.out.println(filename);
+
+ if (parser.getReadFormat() == NanoOKOptions.FASTA) {
+ if (filename.endsWith(".fa") || filename.endsWith(".fasta")) {
+ isValid = true;
+ }
+ } else if (parser.getReadFormat() == NanoOKOptions.FASTQ) {
+ if (filename.endsWith(".fq") || filename.endsWith(".fastq")) {
+ isValid = true;
+ }
+ }
+
+ return isValid;
+ }
+
+ private void checkReferenceSizesFile(String referenceFile) {
+ String sizesFilename = referenceFile + ".sizes";
+ File sizesFile = new File(sizesFilename);
+ if (!sizesFile.exists()) {
+ System.out.println("");
+ System.out.println("Generating .sizes file for reference. You may want to edit the display names.");
+ SequenceReader sr = new SequenceReader(false);
+ sr.indexFASTAFile(referenceFile, sizesFilename , false);
+ }
+ }
+
+ private void processDirectory(String readsDir, String alignDir, String logDirName, boolean allowSubdir, boolean processThisDir) {
+ String reference = options.getReferenceFile();
+
+ checkReferenceSizesFile(reference);
+ checkAndMakeDir(alignDir);
+ checkAndMakeDir(logDirName);
+
+ if (allowSubdir) {
+ File inputDir = new File(readsDir);
+ File[] listOfFiles = inputDir.listFiles();
+ for (File file : listOfFiles) {
+ if (file.isDirectory() &&
+ (!file.getName().equals("2D")) &&
+ (!file.getName().equals("Template")) &&
+ (!file.getName().equals("Complement"))) {
+ processDirectory(readsDir + File.separator + file.getName(),
+ alignDir + File.separator + file.getName(),
+ logDirName + File.separator + file.getName(),
+ false,
+ true);
+ }
+ }
+ }
+
+ if (processThisDir) {
+ for (int t=0; t<3; t++) {
+ if (options.isProcessingReadType(t)) {
+ String inputDirName = readsDir + File.separator + NanoOKOptions.getTypeFromInt(t);
+ String outputDirName = alignDir + File.separator + NanoOKOptions.getTypeFromInt(t);
+
+ checkAndMakeDir(outputDirName);
+
+ File inputDir = new File(inputDirName);
+ File[] listOfFiles = inputDir.listFiles();
+
+ if (listOfFiles == null) {
+ System.out.println("");
+ System.out.println("Directory "+inputDirName+" doesn't exist. Have you extracted reads as "+options.getExpectedReadFormat()+ " (some aligners require FASTA, some FASTQ)?");
+ } else if (listOfFiles.length <= 0) {
+ System.out.println("");
+ System.out.println("Directory "+inputDirName+" empty. Have you extracted reads as "+options.getExpectedReadFormat()+ " (some aligners require FASTA, some FASTQ)?");
+ } else {
+ int readCount = 0;
+ for (File file : listOfFiles) {
+ if (file.isFile()) {
+ if (isValidReadFile(file.getName())) {
+ String inPath = inputDirName + File.separator + file.getName();
+ String outPath = outputDirName + File.separator + file.getName() + parser.getAlignmentFileExtension();
+ String logFile = logDirName + File.separator + file.getName() + ".log";
+ String command = parser.getRunCommand(inPath, outPath, reference);
+ if (options.showAlignerCommand()) {
+ System.out.println("Running: " + command);
+ }
+ executor.execute(new SystemCommandRunnable(options, null, command, parser.outputsToStdout() ? outPath:null, logFile));
+ writeProgress();
+ readCount++;
+ }
+ }
+ }
+
+ if (readCount == 0) {
+ System.out.print("Error: unable to find any ");
+ if (parser.getReadFormat() == NanoOKOptions.FASTA) {
+ System.out.print("FASTA");
+ } else if (parser.getReadFormat() == NanoOKOptions.FASTQ) {
+ System.out.print("FASTQ");
+ }
+ System.out.println(" files to align");
+ System.out.println("");
+ System.exit(1);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ public void align() throws InterruptedException {
+ if (options.usingPassFailDirs()) {
+ if (options.isProcessingPassReads()) {
+ processDirectory(options.getReadDir() + File.separator + "pass",
+ options.getAlignerDir() + File.separator + "pass",
+ options.getLogsDir() + File.separator + options.getAligner() + File.separator + "pass",
+ options.isBarcoded(),
+ options.isBarcoded() ? false:true);
+ }
+
+ if (options.isProcessingFailReads()) {
+ processDirectory(options.getReadDir() + File.separator + "fail",
+ options.getAlignerDir() + File.separator + "fail",
+ options.getLogsDir() + File.separator + options.getAligner() + File.separator + "fail",
+ options.isBarcoded(),
+ true);
+ }
+ } else {
+ processDirectory(options.getReadDir(), options.getAlignerDir(), options.getLogsDir() + File.separator + options.getAligner(), false, true);
+ }
+
+ // That's all - wait for all threads to finish
+ executor.shutdown();
+ while (!executor.isTerminated()) {
+ writeProgress();
+ Thread.sleep(100);
+ }
+
+ writeProgress();
+ System.out.println("");
+ System.out.println("");
+ System.out.println("DONE");
+ }
+}
diff --git a/src/nanook/ReadExtractor.java b/src/nanook/ReadExtractor.java
new file mode 100644
index 0000000..30dd1f5
--- /dev/null
+++ b/src/nanook/ReadExtractor.java
@@ -0,0 +1,189 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett (richard.leggett at earlham.ac.uk)
+ *
+ * Copyright 2015-17 Earlham Institute
+ */
+
+package nanook;
+
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.concurrent.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Read extractor
+ *
+ * @author Richard Leggett
+ */
+public class ReadExtractor {
+ private NanoOKOptions options;
+ private ThreadPoolExecutor executor;
+ private long lastCompleted = -1;
+
+ /**
+ * Constructor
+ * @param o program options
+ */
+ public ReadExtractor(NanoOKOptions o) {
+ options = o;
+
+ System.out.println("ERROR: ReadExtractor class deprecated.");
+ System.exit(1);
+
+ //executor = Executors.newFixedThreadPool(options.getNumberOfThreads());
+ executor = new ThreadPoolExecutor(options.getNumberOfThreads(), options.getNumberOfThreads(), 10, TimeUnit.SECONDS, new LinkedBlockingQueue<Runnable>());
+ }
+
+ /**
+ * Write progress of extraction
+ */
+ private void writeProgress() {
+ long completed = executor.getCompletedTaskCount();
+ long total = executor.getTaskCount();
+ long e = 0;
+ long s = NanoOKOptions.PROGRESS_WIDTH;
+
+ if (total > 0) {
+ e = NanoOKOptions.PROGRESS_WIDTH * completed / total;
+ s = NanoOKOptions.PROGRESS_WIDTH - e;
+ }
+
+ if (completed != lastCompleted) {
+ System.out.print("\rExtraction [");
+ for (int i=0; i<e; i++) {
+ System.out.print("=");
+ }
+ for (int i=0; i<s; i++) {
+ System.out.print(" ");
+ }
+ System.out.print("] " + completed +"/" + total);
+ lastCompleted = completed;
+ }
+ }
+
+ /**
+ * Create directories for output
+ */
+ public void createDirectories() {
+ File f = new File(options.getReadDir());
+ if (f.exists()) {
+ if (!f.isDirectory()) {
+ System.out.println("Error: "+options.getReadDir()+" is a file, not a directory!");
+ System.exit(1);
+ }
+ } else {
+ //System.out.println("Making directory "+options.getReadDir());
+ f.mkdir();
+ }
+
+ }
+
+ /**
+ * Process a directory and extract reads
+ * @param inputDirName input directory name
+ * @param outputDirName output directory name
+ */
+ private void processDirectory(String inputDirName, String outputDirName, boolean allowSubdir, boolean processThisDir) {
+ File f = new File(outputDirName);
+
+ options.getLog().println("Processing directory");
+ options.getLog().println("Input dir name: "+inputDirName);
+ options.getLog().println("Output dir name: "+outputDirName);
+ options.getLog().println("allowSubdir: "+allowSubdir);
+ options.getLog().println("processThisDir: "+processThisDir);
+
+ // Make directory
+ if (! f.exists()) {
+ f.mkdir();
+ }
+
+ if (processThisDir) {
+ // Make output Template, Complement and 2D directories
+ for (int t=0; t<3; t++) {
+ if (options.isProcessingReadType(t)) {
+ f = new File(outputDirName + File.separator + NanoOKOptions.getTypeFromInt(t));
+ if (! f.exists()) {
+ f.mkdir();
+ }
+ }
+ }
+ }
+
+ File inputDir = new File(inputDirName);
+ File[] listOfFiles = inputDir.listFiles();
+
+ if (listOfFiles == null) {
+ System.out.println("");
+ System.out.println("Directory "+inputDirName+" doesn't exist");
+ } else if (listOfFiles.length <= 0) {
+ System.out.println("");
+ System.out.println("Directory "+inputDirName+" empty");
+ } else {
+ for (File file : listOfFiles) {
+ if (file.isFile() && processThisDir) {
+ if (file.getName().endsWith(".fast5")) {
+ options.getLog().println("Got file "+file.getName());
+ executor.execute(new ReadExtractorRunnable(options, inputDirName, file.getName(), outputDirName));
+ writeProgress();
+ }
+ } else if (file.isDirectory() && allowSubdir) {
+ processDirectory(inputDirName + File.separator + file.getName(),
+ outputDirName + File.separator + file.getName(),
+ false,
+ true);
+ }
+ }
+ }
+ }
+
+ /**
+ * Extract reads
+ */
+ public void extract() throws InterruptedException {
+ if (options.usingPassFailDirs()) {
+ if (options.isProcessingPassReads()) {
+ processDirectory(options.getFast5Dir() + File.separator + "pass",
+ options.getReadDir() + File.separator + "pass",
+ options.isBarcoded(),
+ options.isBarcoded() ? false:true);
+ }
+
+ if (options.isProcessingFailReads()) {
+ processDirectory(options.getFast5Dir() + File.separator + "fail",
+ options.getReadDir() + File.separator + "fail",
+ options.isBarcoded(),
+ true);
+ }
+ } else {
+ processDirectory(options.getFast5Dir(), options.getReadDir(), false, true);
+ }
+
+ // That's all - wait for all threads to finish
+ executor.shutdown();
+ while (!executor.isTerminated()) {
+ writeProgress();
+ Thread.sleep(100);
+ }
+
+ writeProgress();
+
+ System.out.println("");
+
+ options.getReadFileMerger().closeFiles();
+ if (options.mergeFastaFiles()) {
+ System.out.println("");
+ options.getReadFileMerger().writeMergedFiles();
+ }
+
+ System.out.println("");
+ System.out.println("DONE");
+ }
+}
+
+
diff --git a/src/nanook/ReadExtractorRunnable.java b/src/nanook/ReadExtractorRunnable.java
new file mode 100644
index 0000000..72fb7c7
--- /dev/null
+++ b/src/nanook/ReadExtractorRunnable.java
@@ -0,0 +1,83 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett
+ *
+ * Copyright 2015 The Genome Analysis Centre (TGAC)
+ */
+
+package nanook;
+
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.EnumSet;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Enable multi-threading of read extraction
+ *
+ * @author Richard Leggett
+ */
+public class ReadExtractorRunnable implements Runnable {
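+ // HDF5 dataset paths within a FAST5 file where Metrichor-style 2D basecalls store their FASTQ data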
+ public final static String TYPE_STRING_TEMPLATE = "/Analyses/Basecall_2D_000/BaseCalled_template/Fastq";
+ public final static String TYPE_STRING_COMPLEMENT = "/Analyses/Basecall_2D_000/BaseCalled_complement/Fastq";
+ public final static String TYPE_STRING_2D = "/Analyses/Basecall_2D_000/BaseCalled_2D/Fastq";
+ private String[] typeStrings = {TYPE_STRING_TEMPLATE, TYPE_STRING_COMPLEMENT, TYPE_STRING_2D};
+ public NanoOKOptions options;
+ public String inDir;
+ public String filename;
+ public String outDir;
+
+ public ReadExtractorRunnable(NanoOKOptions o, String in, String file, String out) {
+ options = o;
+ inDir = in;
+ filename = file;
+ outDir = out;
+
+ System.out.println("Error: Entered deprecated ReadExtractorRunnable!");
+ System.exit(1);
+ }
+
+ /**
+ * Extract reads of each type from the FAST5 file passed to the constructor and write
+ * them out in the configured read format.
+ */
+ public void run() {
+ String inputPathname = inDir + File.separator + filename;
+ Fast5File inputFile = new Fast5File(options, inputPathname);
+ //String outName = new File(inputPathname).getName();
+ String filePrefix = ReadProcessorRunnable.getFilePrefixFromPathname(inputPathname);
+
+ for (int t=0; t<3; t++) {
+ if (options.isProcessingReadType(t)) {
+ FastAQFile ff = inputFile.getFastq(options.getBasecallIndex(), t);
+ if (ff != null) {
+ if (options.getReadFormat() == NanoOKOptions.FASTA) {
+ //String fastaqPathname = outDir + File.separator + NanoOKOptions.getTypeFromInt(t) + File.separator + outName + "_BaseCalled_" + NanoOKOptions.getTypeFromInt(t) + ".fasta";
+ String fastaqPathname = outDir + File.separator + NanoOKOptions.getTypeFromInt(t) + File.separator + filePrefix + "_BaseCalled_" + NanoOKOptions.getTypeFromInt(t) + ".fasta";
+ ff.writeFasta(fastaqPathname, options.outputFast5Path() ? inputPathname:null);
+ if (options.mergeFastaFiles()) {
+ options.getReadFileMerger().addReadFile(fastaqPathname, t, 0, "", 0, 0);
+ }
+ } else if (options.getReadFormat() == NanoOKOptions.FASTQ) {
+ //String fastaqPathname = outDir + File.separator + NanoOKOptions.getTypeFromInt(t) + File.separator + outName + "_BaseCalled_" + NanoOKOptions.getTypeFromInt(t) + ".fastq";
+ String fastaqPathname = outDir + File.separator + NanoOKOptions.getTypeFromInt(t) + File.separator + filePrefix + "_BaseCalled_" + NanoOKOptions.getTypeFromInt(t) + ".fastq";
+ ff.writeFastq(fastaqPathname);
+ if (options.mergeFastaFiles()) {
+ options.getReadFileMerger().addReadFile(fastaqPathname, t, 0, "", 0, 0);
+ }
+ }
+ }
+ }
+ }
+ }
+}
diff --git a/src/nanook/ReadFileMerger.java b/src/nanook/ReadFileMerger.java
new file mode 100644
index 0000000..d998856
--- /dev/null
+++ b/src/nanook/ReadFileMerger.java
@@ -0,0 +1,95 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett
+ *
+ * Copyright 2015-2017 The Earlham Institute (formerly The Genome Analysis Centre)
+ */
+
+package nanook;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+
+public class ReadFileMerger {
+ private NanoOKOptions options;
+ private ArrayList<String>[][] readFiles = new ArrayList[2][3];
+ private PrintWriter[][] summaryFiles = new PrintWriter[2][3];
+
+ public ReadFileMerger(NanoOKOptions o) {
+ options = o;
+
+ try {
+ for (int pf = 0; pf<2; pf++) {
+ for (int type=0; type<3; type++) {
+ readFiles[pf][type] = new ArrayList<String>();
+
+ String pathname = options.getReadDir() + File.separator +
+ options.getSample() + "_all_" +
+ NanoOKOptions.getTypeFromInt(type) + "_" +
+ NanoOKOptions.getPassFailFromInt(pf + 1) +
+ ".stats";
+
+ options.getLog().println("Opening stats file "+pathname);
+ summaryFiles[pf][type] = new PrintWriter(new FileWriter(pathname));
+ }
+ }
+ } catch (Exception e) {
+ System.out.println("ReadFileMerger exception");
+ e.printStackTrace();
+ }
+ }
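+
+ // The per-category stats files created above are named along the lines of
+ // <readDir>/<sample>_all_2D_pass.stats (the exact type and pass/fail strings come from
+ // NanoOKOptions.getTypeFromInt and getPassFailFromInt).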
+
+ public synchronized void addReadFile(String pathname, int type, int pf, String readID, int readLength, double meanQ) {
+ if (options.mergeFastaFiles()) {
+ readFiles[pf-1][type].add(pathname);
+ }
+
+ summaryFiles[pf-1][type].println(pathname+"\t"+readID+"\t"+readLength+"\t"+meanQ);
+ }
+
+ public void writeMergedFiles() {
+ for (int pf = 0; pf<2; pf++) {
+ for (int type=0; type<3; type++) {
+ if (readFiles[pf][type].size() > 0) {
+ String outputPathname = options.getReadDir() + File.separator +
+ options.getSample() + "_all_" +
+ NanoOKOptions.getTypeFromInt(type) + "_" +
+ NanoOKOptions.getPassFailFromInt(pf + 1) +
+ (options.getReadFormat() == NanoOKOptions.FASTA ? ".fasta":".fastq");
+
+ System.out.println("Writing " + outputPathname);
+
+ try {
+ PrintWriter pw = new PrintWriter(new FileWriter(outputPathname));
+ for (int i=0; i<readFiles[pf][type].size(); i++) {
+ BufferedReader br = new BufferedReader(new FileReader(readFiles[pf][type].get(i)));
+ String line;
+ while ((line = br.readLine()) != null) {
+ pw.println(line);
+ }
+ br.close();
+ }
+ pw.close();
+ } catch (IOException e) {
+ System.out.println("writeMergedFiles exception");
+ e.printStackTrace();
+ }
+ }
+ }
+ }
+ }
+
+ public void closeFiles() {
+ options.getLog().println("Closing stats files");
+ for (int pf = 0; pf<2; pf++) {
+ for (int type=0; type<3; type++) {
+ summaryFiles[pf][type].close();
+ }
+ }
+ }
+}
diff --git a/src/nanook/ReadLengthsSummaryFile.java b/src/nanook/ReadLengthsSummaryFile.java
new file mode 100644
index 0000000..17e3486
--- /dev/null
+++ b/src/nanook/ReadLengthsSummaryFile.java
@@ -0,0 +1,64 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett
+ *
+ * Copyright 2015 The Genome Analysis Centre (TGAC)
+ */
+
+package nanook;
+
+import java.io.*;
+
+/**
+ * Represents a summary file containing basic information on read lengths, N50 etc. for the three different read types.
+ *
+ * @author Richard Leggett
+ */
+public class ReadLengthsSummaryFile {
+ private PrintWriter pw;
+ private String filename;
+
+ /**
+ * Constructor.
+ * @param f filename of output file
+ */
+ public ReadLengthsSummaryFile(String f) {
+ filename = f;
+ }
+
+ /**
+ * Open output file.
+ * @param sample sample name
+ */
+ public void open(String sample) {
+ try {
+ pw = new PrintWriter(new FileWriter(filename));
+ pw.println("Nanotools report - "+sample);
+ pw.println("");
+ pw.println("Length summary");
+ pw.println("");
+ pw.printf("%-10s %-8s %-10s %-10s %-8s %-8s %-8s %-8s %-8s %-8s", "Type", "NumReads", "TotalBases", "Mean", "Long", "Short", "N50", "N50Count", "N90", "N90Count");
+ pw.println("");
+ } catch (IOException e) {
+ System.out.println("ReadLengthsSummaryFile exception:");
+ e.printStackTrace();
+ System.exit(1);
+ }
+ }
+
+ /**
+ * Output read stats for a particular type (Template, Complement, 2D).
+ * @param r ReadSetStats object for the type
+ */
+ public void addReadSetStats(ReadSetStats r) {
+ pw.printf("%-10s %-8d %-10d %-10.2f %-8d %-8d %-8d %-8d %-8d %-8d", r.getTypeString(), r.getNumReads(), r.getTotalBases(), r.getMeanLength(), r.getLongest(), r.getShortest(), r.getN50(), r.getN50Count(), r.getN90(), r.getN90Count());
+ pw.println("");
+ }
+
+ /**
+ * Close output file.
+ */
+ public void close() {
+ pw.close();
+ }
+}
diff --git a/src/nanook/ReadParser.java b/src/nanook/ReadParser.java
new file mode 100644
index 0000000..80c4ce3
--- /dev/null
+++ b/src/nanook/ReadParser.java
@@ -0,0 +1,123 @@
+
+/*
+ TO DO:
+ - Use the AlignmentFileStats structure to store ALL alignment stats and write this to the separate alignment files.
+ - This requires rewriting the current parsers and methods.
+*/
+package nanook;
+
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.List;
+
+/**
+ *
+ * @author leggettr
+ */
+public class ReadParser {
+ private NanoOKOptions options;
+ private SequenceReader sr;
+
+
+ public ReadParser(NanoOKOptions o) {
+ options = o;
+ }
+
+ /**
+ * Parse a FASTA or FASTQ file, noting length of reads etc.
+ */
+ private void readQueryFile(String readPath, PrintWriter pw) {
+ int nReadsInFile;
+
+ sr = new SequenceReader(true);
+
+ if (options.getReadFormat() == NanoOKOptions.FASTQ) {
+ nReadsInFile = sr.indexFASTQFile(readPath);
+ } else {
+ nReadsInFile = sr.indexFASTAFile(readPath, null, true);
+ }
+
+ if (nReadsInFile > 1) {
+ System.out.println("Warning: File "+readPath+" has more than 1 read. NanoOK can't currently handle this.");
+ }
+
+ for (int i=0; i<sr.getSequenceCount(); i++) {
+ String id = sr.getID(i);
+
+ if (id.startsWith("00000000-0000-0000-0000-000000000000")) {
+ System.out.println("Error:");
+ System.out.println(readPath);
+ System.out.println("The reads in this file do not have unique IDs because they were generated when MinKNOW was producing UUIDs, but Metrichor was not using them. To fix, run nanook_extract_reads with the -fixids option.");
+ System.exit(1);
+ }
+
+ //stats.addLength(readPath, id, sr.getLength(i), sr.getGC(i));
+ pw.printf("Read:%s\t%d\t%.2f\n", id, sr.getLength(i), sr.getGC(i));
+ }
+ }
+
+ /**
+ * Parse alignment
+ */
+ private void parseAlignment(String alignmentPath)
+ {
+// try {
+// File file = new File(alignmentPath);
+// AlignmentFileParser parser = options.getParser();
+//
+// options.getLog().println("");
+// options.getLog().println("> New file " + file.getName());
+// options.getLog().println("");
+//
+// int nAlignments = parser.parseFile(alignmentPath, nonAlignedSummary, stats);
+//
+// if (nAlignments > 0) {
+// parser.sortAlignments();
+// List<Alignment> al = parser.getHighestScoringSet();
+// int topAlignment = pickTopAlignment(al);
+// String readReferenceName = al.get(topAlignment).getHitName();
+//
+// options.getLog().println("Query size = " + al.get(topAlignment).getQuerySequenceSize());
+// options.getLog().println(" Hit size = " + al.get(topAlignment).getHitSequenceSize());
+//
+// readReference = options.getReferences().getReferenceById(readReferenceName);
+// AlignmentMerger merger = new AlignmentMerger(options, readReference, al.get(topAlignment).getQuerySequenceSize(), stats, stats.getType());
+// for (int i=topAlignment; i<al.size(); i++) {
+// Alignment a = al.get(i);
+// merger.addAlignment(a);
+// }
+// AlignmentInfo ais = merger.endMergeAndStoreStats();
+// readReference.getStatsByType(stats.getType()).getAlignmentsTableFile().writeMergedAlignment(stats, file.getName(), merger, ais);
+// }
+// } catch (Exception e) {
+// System.out.println("Error parsing alignment "+ alignmentPath);
+// options.setReturnValue(1);
+// options.getLog().println("Error parsing alignment " + alignmentPath);
+// e.printStackTrace();
+// }
+ }
+
+ public void parse(String fastaqPathname, String alignmentPathname, String parserPathname) {
+ try {
+ PrintWriter pw = new PrintWriter(new FileWriter(parserPathname, false));
+ pw.println("NanoOKVersion:"+NanoOK.VERSION_STRING);
+ pw.println("FastAQPath:"+fastaqPathname);
+ pw.println("AlignmentPath:"+alignmentPathname);
+ pw.println("Aligner:"+options.getAligner());
+
+ readQueryFile(fastaqPathname, pw);
+ //stats.addReadFile(passfail);
+ //parseAlignment();
+ //if ((readReference != null) && (options.doKmerCounting())) {
+ // sr.storeKmers(0, readReference.getStatsByType(type).getReadKmerTable());
+ //}
+
+ pw.close();
+ } catch (IOException e) {
+ System.out.println("parseAlignment exception");
+ e.printStackTrace();
+ }
+ }
+}
diff --git a/src/nanook/ReadProcessor.java b/src/nanook/ReadProcessor.java
new file mode 100644
index 0000000..d3b72f0
--- /dev/null
+++ b/src/nanook/ReadProcessor.java
@@ -0,0 +1,263 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett (richard.leggett at earlham.ac.uk)
+ *
+ * Copyright 2015-17 Earlham Institute
+ */
+
+package nanook;
+
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.concurrent.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Read extractor
+ *
+ * @author Richard Leggett
+ */
+public class ReadProcessor {
+ private NanoOKOptions options;
+ private ThreadPoolExecutor executor;
+ private long lastCompleted = -1;
+ FileWatcher fw = null;
+
+ /**
+ * Constructor
+ * @param o program options
+ */
+ public ReadProcessor(NanoOKOptions o) {
+ options = o;
+ fw = new FileWatcher(options);
+
+ //executor = Executors.newFixedThreadPool(options.getNumberOfThreads());
+ executor = new ThreadPoolExecutor(options.getNumberOfThreads(), options.getNumberOfThreads(), 10, TimeUnit.SECONDS, new LinkedBlockingQueue<Runnable>());
+ }
+
+ /**
+ * Write progress of extraction
+ */
+ private void writeProgress() {
+ long completed = executor.getCompletedTaskCount();
+ long total = executor.getTaskCount();
+ long e = 0;
+ long s = NanoOKOptions.PROGRESS_WIDTH;
+
+ if (total > 0) {
+ e = NanoOKOptions.PROGRESS_WIDTH * completed / total;
+ s = NanoOKOptions.PROGRESS_WIDTH - e;
+ }
+
+ if (completed != lastCompleted) {
+ System.out.print("\rExtraction [");
+ for (int i=0; i<e; i++) {
+ System.out.print("=");
+ }
+ for (int i=0; i<s; i++) {
+ System.out.print(" ");
+ }
+ System.out.print("] " + completed +"/" + total);
+ lastCompleted = completed;
+ }
+ }
+
+ /**
+ * Process a directory and extract reads
+ * @param inputDirName input directory name
+ * @param outputDirName output directory name
+ */
+ private void processDirectory(String inputDirName, boolean allowSubdir, boolean processThisDir, int pf) {
+ options.getLog().println("Processing directory");
+ options.getLog().println("Input dir name: "+inputDirName);
+ options.getLog().println("allowSubdir: "+allowSubdir);
+ options.getLog().println("processThisDir: "+processThisDir);
+
+ if (processThisDir) {
+ if (options.usingBatchDirs()) {
+ fw.addBatchContainer(inputDirName, pf);
+ } else {
+ fw.addWatchDir(inputDirName, pf);
+ }
+ } else {
+ File inputDir = new File(inputDirName);
+ File[] listOfFiles = inputDir.listFiles();
+
+ if (listOfFiles == null) {
+ options.getLog().println("Directory "+inputDirName+" doesn't exist");
+ } else if (listOfFiles.length <= 0) {
+ options.getLog().println("Directory "+inputDirName+" empty");
+ } else {
+ for (File file : listOfFiles) {
+ if (file.isDirectory() && allowSubdir) {
+ processDirectory(inputDirName + File.separator + file.getName(),
+ false,
+ true,
+ pf);
+ }
+ }
+ }
+ }
+ }
+
+ private void addDirsForExtract() {
+ if (options.usingPassFailDirs()) {
+ if (options.isProcessingPassReads()) {
+ processDirectory(options.getFast5Dir() + File.separator + "pass",
+ options.isBarcoded(),
+ options.isBarcoded() ? false:true,
+ NanoOKOptions.READTYPE_PASS);
+ }
+
+ if (options.isProcessingFailReads()) {
+ processDirectory(options.getFast5Dir() + File.separator + "fail",
+ options.isBarcoded(),
+ true,
+ NanoOKOptions.READTYPE_FAIL);
+ }
+ } else {
+ processDirectory(options.getFast5Dir(),
+ options.isBarcoded(),
+ options.isBarcoded() ? false:true,
+ NanoOKOptions.READTYPE_COMBINED);
+ }
+ }
+
+ private void addDirsForAlign() {
+ // If using batch dirs, then we go sample/fasta/2D/pass/batch_XXX
+ // or sample/fasta/2D/pass/barcodeXXX/batch_XXX
+ // If using old style, then we go sample/fasta/pass/2D
+ // or sample/fasta/pass/2D/barcodeXXX/barcodeXXX
+
+ for (int t=0; t<3; t++) {
+ if (options.isProcessingReadType(t)) {
+ if (options.usingPassFailDirs()) {
+ if (options.isProcessingPassReads()) {
+ //if (options.usingBatchDirs()) {
+ processDirectory(options.getReadDir() + File.separator + "pass" + File.separator + NanoOKOptions.getTypeFromInt(t),
+ options.isBarcoded(),
+ options.isBarcoded() ? false:true,
+ NanoOKOptions.READTYPE_PASS);
+ //} else {
+ // processDirectory(options.getReadDir() + File.separator + "pass" + File.separator + NanoOKOptions.getTypeFromInt(t),
+ // options.isBarcoded(),
+ // options.isBarcoded() ? false:true);
+ //}
+ }
+
+ if (options.isProcessingFailReads()) {
+ //if (options.usingBatchDirs()) {
+ processDirectory(options.getReadDir() + File.separator + "fail" + File.separator + NanoOKOptions.getTypeFromInt(t),
+ options.isBarcoded(),
+ options.isBarcoded() ? false:true,
+ NanoOKOptions.READTYPE_FAIL);
+ //} else {
+ // processDirectory(options.getReadDir() + File.separator + "fail" + File.separator + NanoOKOptions.getTypeFromInt(t),
+ // options.isBarcoded(),
+ // true);
+ //}
+ }
+ } else {
+ processDirectory(options.getReadDir() + File.separator + NanoOKOptions.getTypeFromInt(t),
+ options.isBarcoded(),
+ options.isBarcoded() ? false:true,
+ NanoOKOptions.READTYPE_COMBINED);
+ }
+ }
+ }
+ }
+
+ private void addDirsForParse() {
+ // If using batch dirs, then we go sample/last/2D/pass/batch_XXX
+ // If using old style, then we go sample/last/pass/2D
+ for (int t=0; t<3; t++) {
+ if (options.isProcessingReadType(t)) {
+ if (options.usingPassFailDirs()) {
+ if (options.isProcessingPassReads()) {
+ //if (options.usingBatchDirs()) {
+ processDirectory(options.getAlignerDir() + File.separator + "pass" + File.separator + NanoOKOptions.getTypeFromInt(t),
+ options.isBarcoded(),
+ options.isBarcoded() ? false:true,
+ NanoOKOptions.READTYPE_PASS);
+ //} else {
+ // processDirectory(options.getAlignerDir() + File.separator + "pass" + File.separator + NanoOKOptions.getTypeFromInt(t),
+ // options.isBarcoded(),
+ // options.isBarcoded() ? false:true);
+ //}
+ }
+
+ if (options.isProcessingFailReads()) {
+ //if (options.usingBatchDirs()) {
+                    processDirectory(options.getAlignerDir() + File.separator + "fail" + File.separator + NanoOKOptions.getTypeFromInt(t),
+ options.isBarcoded(),
+ options.isBarcoded() ? false:true,
+ NanoOKOptions.READTYPE_FAIL);
+ //} else {
+ // processDirectory(options.getAlignerDir() + File.separator + "fail" + File.separator + NanoOKOptions.getTypeFromInt(t),
+ // options.isBarcoded(),
+ // true);
+ //}
+ }
+ } else {
+ processDirectory(options.getAlignerDir() + File.separator + NanoOKOptions.getTypeFromInt(t),
+ options.isBarcoded(),
+ options.isBarcoded() ? false:true,
+ NanoOKOptions.READTYPE_COMBINED);
+ }
+ }
+ }
+ }
+
+ /**
+     * Process reads: extract, align and/or parse, depending on the options selected
+ */
+ public void process() throws InterruptedException {
+ String baseDir = "";
+
+ if (options.isExtractingReads()) {
+ options.getSampleChecker().checkFast5Directory();
+ addDirsForExtract();
+ } else if (options.isAligningRead()) {
+ options.getSampleChecker().checkReadDirectory();
+ addDirsForAlign();
+ } else if (options.isParsingRead()) {
+ options.getSampleChecker().checkReadDirectory();
+ addDirsForParse();
+ }
+
+
+ for (int i=0; i<options.getNumberOfThreads(); i++) {
+ executor.execute(new ReadProcessorRunnable(options, fw));
+ }
+
+ // Now keep scanning
+ while (!fw.timedOut()) {
+ fw.scan();
+ fw.writeProgress();
+ Thread.sleep(500);
+ fw.writeProgress();
+ Thread.sleep(500);
+ }
+ fw.writeProgress();
+
+ // That's all - wait for all threads to finish
+ executor.shutdown();
+
+ options.getReadFileMerger().closeFiles();
+ if (options.mergeFastaFiles()) {
+ System.out.println("");
+ options.getReadFileMerger().writeMergedFiles();
+ }
+
+ //writeProgress();
+ System.out.println("");
+ System.out.println("");
+ System.out.println("DONE");
+ }
+}
+
+
diff --git a/src/nanook/ReadProcessorRunnable.java b/src/nanook/ReadProcessorRunnable.java
new file mode 100644
index 0000000..29b2959
--- /dev/null
+++ b/src/nanook/ReadProcessorRunnable.java
@@ -0,0 +1,369 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett (richard.leggett at earlham.ac.uk)
+ *
+ * Copyright 2015-17 Earlham Institute
+ */
+
+package nanook;
+
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.EnumSet;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+/**
+ * Enable multi-threading of read extraction, alignment and alignment parsing
+ *
+ * @author Richard Leggett
+ */
+public class ReadProcessorRunnable implements Runnable {
+ public final static String TYPE_STRING_TEMPLATE = "/Analyses/Basecall_2D_000/BaseCalled_template/Fastq";
+ public final static String TYPE_STRING_COMPLEMENT = "/Analyses/Basecall_2D_000/BaseCalled_complement/Fastq";
+ public final static String TYPE_STRING_2D = "/Analyses/Basecall_2D_000/BaseCalled_2D/Fastq";
+ private String[] typeStrings = {TYPE_STRING_TEMPLATE, TYPE_STRING_COMPLEMENT, TYPE_STRING_2D};
+ public NanoOKOptions options;
+ public FileWatcher fileWatcher;
+ public boolean isNewStyleDir;
+
+ public ReadProcessorRunnable(NanoOKOptions o, FileWatcher f) {
+ options = o;
+ fileWatcher = f;
+ }
+
+    //private String getFastaqDirFromFast5Name(String fast5Pathname, int type) {
+ // File f = new File(fast5Pathname);
+ // String inDir = f.getParent();
+ // String outDir = options.getReadDir();
+
+ // if (!inDir.startsWith(options.getFast5Dir())) {
+ // System.out.println("Something wrong with fast5 filename - shouldn't get to this code. Please contact richard.leggett at earlham.ac.uk");
+ // System.exit(1);
+ // }
+
+ // If using batch dirs, then we go sample/fasta/2D/pass/batch_XXX
+ // If using old style, then we go sample/fasta/pass/2D
+ //if (options.usingBatchDirs()) {
+ // outDir = outDir + File.separator + NanoOKOptions.getTypeFromInt(type) + inDir.substring(options.getFast5Dir().length());
+ //} else {
+ // outDir = outDir + inDir.substring(options.getFast5Dir().length()) + File.separator + NanoOKOptions.getTypeFromInt(type);
+ //}
+
+ //options.getLog().println(" In: "+fast5Pathname);
+ //options.getLog().println(" OutDir: "+outDir);
+
+ // return outDir;
+    //}
+
+ private String getAlignmentPathnameFromFastaqName(String fastaqPathname) {
+ File f = new File(fastaqPathname);
+ String inDir = f.getPath();
+ String outPathname;
+
+ if (!fastaqPathname.startsWith(options.getReadDir())) {
+ System.out.println("Something wrong with read filename - shouldn't get to this code. Please contact richard.leggett at earlham.ac.uk");
+ System.out.println("FastaPathname: "+fastaqPathname);
+ System.out.println("ReadDir: "+options.getReadDir());
+ System.exit(1);
+ }
+
+ outPathname = options.getAlignerDir() + inDir.substring(options.getReadDir().length());
+ File outFile = new File(outPathname);
+ File parent = new File(outFile.getParent());
+
+ if (!parent.exists()) {
+ options.getLog().println("Making directory " + parent.getPath());
+ parent.mkdirs();
+ }
+
+ return outPathname;
+ }
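+
+    // Hypothetical example of the mapping above (directory names are
+    // illustrative): if getReadDir() were sample/fasta and getAlignerDir()
+    // were sample/last, then
+    //   sample/fasta/pass/2D/batch_000/reads.fasta
+    // maps to
+    //   sample/last/pass/2D/batch_000/reads.fasta
+    // and the caller (runAlign) appends the aligner's file extension.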
+
+ private String getAlignmentLogPathnameFromFastaqName(String fastaqPathname) {
+ File f = new File(fastaqPathname);
+ String inDir = f.getPath();
+ String outPathname;
+
+ if (!fastaqPathname.startsWith(options.getReadDir())) {
+ System.out.println("Something wrong with read filename - shouldn't get to this code. Please contact richard.leggett at earlham.ac.uk");
+ System.out.println("FastaPathname: "+fastaqPathname);
+ System.out.println("ReadDir: "+options.getReadDir());
+ System.exit(1);
+ }
+
+ outPathname = options.getLogsDir() + File.separator + options.getAligner() + inDir.substring(options.getReadDir().length());
+ File outFile = new File(outPathname);
+ File parent = new File(outFile.getParent());
+
+ if (!parent.exists()) {
+ options.getLog().println("Making directory " + parent.getPath());
+ parent.mkdirs();
+ }
+
+ return outPathname;
+ }
+
+ private String getParserPathnameFromAlignmentName(String alignmentPathname) {
+ File f = new File(alignmentPathname);
+ String inDir = f.getPath();
+ String outPathname;
+
+ if (!alignmentPathname.startsWith(options.getAlignerDir())) {
+ System.out.println("Something wrong with read filename - shouldn't get to this code. Please contact richard.leggett at earlham.ac.uk");
+ System.exit(1);
+ }
+
+ outPathname = options.getParserDir() + inDir.substring(options.getAlignerDir().length());
+ File outFile = new File(outPathname);
+ File parent = new File(outFile.getParent());
+
+ if (!parent.exists()) {
+ options.getLog().println("Making directory " + parent.getPath());
+ parent.mkdirs();
+ }
+
+ return outPathname;
+ }
+
+ private String getFastaqPathnameFromAlignmentName(String alignmentPathname) {
+ File f = new File(alignmentPathname);
+ String inDir = f.getPath();
+ String outPathname;
+
+ if (!alignmentPathname.startsWith(options.getAlignerDir())) {
+ System.out.println("Something wrong with read filename - shouldn't get to this code. Please contact richard.leggett at earlham.ac.uk");
+ System.exit(1);
+ }
+
+ outPathname = options.getReadDir() + inDir.substring(options.getAlignerDir().length(),inDir.lastIndexOf('.'));
+
+ return outPathname;
+ }
+
+ public static String getFilePrefixFromPathname(String pathname) {
+ File f = new File(pathname);
+ String inName = f.getName();
+ int suffixPos = inName.lastIndexOf(".");
+ String outName;
+
+ if (suffixPos > 0) {
+ outName = inName.substring(0, suffixPos);
+ } else {
+ outName = inName;
+ }
+
+ //options.getLog().println("OutName: "+outName);
+
+ return outName;
+ }
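+
+    // Example (path is illustrative): getFilePrefixFromPathname("/data/run1/read_42.fast5")
+    // returns "read_42"; a leafname with no '.' suffix is returned unchanged.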
+
+ private void runCommandLocal(String command, String outPath) {
+ ProcessLogger pl = new ProcessLogger();
+
+ // outPath only non-null if aligner will only write to screen (yes, BWA, I'm talking about you)
+ if (outPath != null) {
+ pl.setWriteFormat(false, true, false);
+ pl.runAndLogCommand(command, outPath, false);
+ } else {
+ pl.runCommand(command);
+ }
+ }
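+
+    // Usage sketch for the method above (commands are hypothetical, not real
+    // aligner invocations):
+    //   runCommandLocal("aligner --out out.maf ref.fa reads.fasta", null);  // aligner writes its own output file
+    //   runCommandLocal("aligner ref.fa reads.fastq", "out.sam");           // aligner prints to stdout, captured into out.sam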
+
+ public void runParse(String alignmentPathname) {
+ String parsedPathname = getParserPathnameFromAlignmentName(alignmentPathname) + ".txt";
+ String fastaqPathname = getFastaqPathnameFromAlignmentName(alignmentPathname);
+ ReadParser rp = new ReadParser(options);
+
+ options.getLog().println("Parsing file "+ alignmentPathname);
+ options.getLog().println(" to "+ parsedPathname);
+
+ rp.parse(fastaqPathname, alignmentPathname, parsedPathname);
+ }
+
+ public void runAlign(String fastaqPathname) {
+ String reference = options.getReferenceFile();
+ AlignmentFileParser parser = options.getParser();
+
+ String filePrefix = getFilePrefixFromPathname(fastaqPathname);
+ String alignmentPathname = getAlignmentPathnameFromFastaqName(fastaqPathname) + parser.getAlignmentFileExtension();
+ String alignmentLogPathname = getAlignmentLogPathnameFromFastaqName(fastaqPathname);
+
+ options.getLog().println("Aligning file "+fastaqPathname);
+ options.getLog().println(" to "+alignmentPathname);
+ options.getLog().println(" with log "+alignmentLogPathname);
+
+ String command = parser.getRunCommand(fastaqPathname, alignmentPathname, reference);
+ if (options.showAlignerCommand()) {
+ System.out.println("Running: " + command);
+ }
+ runCommandLocal(command, parser.outputsToStdout() ? alignmentPathname:null);
+ if (options.isParsingRead()) {
+ runParse(alignmentPathname);
+ }
+ }
+
+ public void addToBlast(String fastaqPathname, int type) {
+ int pf = NanoOKOptions.READTYPE_PASS;
+
+ if (fastaqPathname.contains("/fail/")) {
+ pf = NanoOKOptions.READTYPE_FAIL;
+ }
+
+ options.getBlastHandler(type, pf).addRead(fastaqPathname);
+ }
+
+ private String getFastaqFilename(String fast5Pathname, int t, int inputPF, int outputPF) {
+ File f = new File(fast5Pathname);
+ String inDir = f.getParent();
+ String suffixDirs;
+
+ if (!inDir.startsWith(options.getFast5Dir())) {
+ System.out.println("Something wrong with fast5 filename - shouldn't get to this code. Please contact richard.leggett at earlham.ac.uk");
+ System.exit(1);
+ }
+
+ if (inputPF == NanoOKOptions.READTYPE_COMBINED) {
+ suffixDirs = inDir.substring(options.getFast5Dir().length());
+ } else {
+ // +5 for /pass or /fail
+ suffixDirs = inDir.substring(options.getFast5Dir().length() + 5);
+ }
+
+ String fastaqDir = options.getReadDir() + File.separator;
+ if (outputPF == NanoOKOptions.READTYPE_FAIL) {
+ fastaqDir += "fail";
+ } else {
+ fastaqDir += "pass";
+ }
+
+ fastaqDir += File.separator + NanoOKOptions.getTypeFromInt(t) + suffixDirs;
+ File dir = new File(fastaqDir);
+
+ String filePrefix = getFilePrefixFromPathname(fast5Pathname);
+ String fileExtension = options.getReadFormat() == NanoOKOptions.FASTA ? ".fasta":".fastq";
+
+ if (!dir.exists()) {
+ options.getLog().println("Making directory " + fastaqDir);
+ dir.mkdirs();
+ }
+
+ String fastaqPathname = fastaqDir + File.separator + filePrefix + "_BaseCalled_" + NanoOKOptions.getTypeFromInt(t) + fileExtension;
+
+ return fastaqPathname;
+ }
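+
+    // Hypothetical example of the name mapping above: with getFast5Dir() =
+    // sample/fast5 and getReadDir() = sample/fasta, the input
+    //   sample/fast5/pass/batch_000/read_7.fast5    (inputPF = pass)
+    // becomes, for the Template read type in FASTA mode,
+    //   sample/fasta/pass/Template/batch_000/read_7_BaseCalled_Template.fasta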
+
+ public void runExtract(String fast5Pathname, int inputPF) {
+ Fast5File inputFile = new Fast5File(options, fast5Pathname);
+ int outputPF;
+
+ options.getLog().println("Extracting file "+fast5Pathname);
+
+ for (int t=0; t<3; t++) {
+ if (options.isProcessingReadType(t)) {
+ FastAQFile ff = inputFile.getFastq(options.getBasecallIndex(), t);
+ double meanQ = 0;
+
+ if (ff != null) {
+ // If pass/fail not assigned, default to pass directory output
+ if (inputPF == NanoOKOptions.READTYPE_COMBINED) {
+ outputPF = NanoOKOptions.READTYPE_PASS;
+ } else {
+ outputPF = inputPF;
+ }
+
+ // Have we set a min quality threshold? In which case, test...
+ meanQ = inputFile.getMeanQ(options.getBasecallIndex(), t);
+ if (options.getMinQ() >= 0) {
+ if (meanQ == 0) {
+ options.getLog().println(" Couldn't get mean quality value");
+ } else {
+ if (meanQ >= options.getMinQ()) {
+ outputPF = NanoOKOptions.READTYPE_PASS;
+ } else {
+ outputPF = NanoOKOptions.READTYPE_FAIL;
+ }
+ }
+ options.getLog().println(" Mean quality " + meanQ + " output class " + (outputPF == NanoOKOptions.READTYPE_PASS ? "pass":"fail"));
+ }
+
+ String fastaqPathname = getFastaqFilename(fast5Pathname, t, inputPF, outputPF);
+ options.getLog().println(" Writing "+fastaqPathname);
+
+ options.getReadFileMerger().addReadFile(fastaqPathname, t, outputPF, ff.getID(), ff.getLength(), meanQ);
+
+ if (options.getReadFormat() == NanoOKOptions.FASTA) {
+ ff.writeFasta(fastaqPathname, options.outputFast5Path() ? fast5Pathname:null);
+ } else {
+ ff.writeFastq(fastaqPathname);
+ }
+
+ if (options.isBlastingRead()) {
+ addToBlast(fastaqPathname, t);
+ }
+
+ if (options.isAligningRead()) {
+ runAlign(fastaqPathname);
+ }
+ }
+ }
+ }
+ }
+
+ public void run() {
+ while (!fileWatcher.timedOut()) {
+ FileWatcherItem fwi = null;
+ String fastaqPathname = null;
+ String alignmentPathname = null;
+ String parsedPathname = null;
+ String alignmentLogPathname = null;
+
+ // Get next file to process
+ while ((fwi == null) && !fileWatcher.timedOut()) {
+ fwi = fileWatcher.getPendingFile();
+ if (fwi == null) {
+ try {
+ Thread.sleep(500);
+ } catch (InterruptedException ex) {
+ Logger.getLogger(ReadProcessorRunnable.class.getName()).log(Level.SEVERE, null, ex);
+ }
+ }
+ }
+
+ if (fwi != null) {
+ String nextPathname = fwi.getPathname();
+ int pf = fwi.getPassOrFail();
+
+ // Check valid filename
+ if (options.isExtractingReads()) {
+ if (nextPathname.toLowerCase().endsWith(".fast5")) {
+ runExtract(nextPathname, pf);
+ } else {
+ options.getLog().println("Invalid "+nextPathname);
+ }
+ } else if (options.isAligningRead()) {
+ if (nextPathname.toLowerCase().endsWith(".fasta") ||
+ nextPathname.toLowerCase().endsWith(".fastq")) {
+ runAlign(nextPathname);
+ }
+ } else if (options.isParsingRead()) {
+ if (nextPathname.toLowerCase().endsWith(options.getParser().getAlignmentFileExtension())) {
+ alignmentPathname = nextPathname;
+ runParse(nextPathname);
+ }
+ }
+ }
+ }
+
+ options.getLog().println("Thread exiting");
+ }
+}
diff --git a/src/nanook/ReadSet.java b/src/nanook/ReadSet.java
new file mode 100644
index 0000000..bc6981b
--- /dev/null
+++ b/src/nanook/ReadSet.java
@@ -0,0 +1,353 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett (richard.leggett at earlham.ac.uk)
+ *
+ * Copyright 2015-17 Earlham Institute
+ */
+
+package nanook;
+
+import java.io.BufferedReader;
+import java.io.*;
+import java.util.*;
+import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.ThreadPoolExecutor;
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Represents a read set (Template reads, Complement reads, or 2D reads).
+ *
+ * @author Richard Leggett
+ */
+public class ReadSet {
+ public final static int MAX_READ_DIRS = 1000;
+ private ThreadPoolExecutor queryExecutor;
+ private NanoOKOptions options;
+ private ReadSetStats stats;
+ private int type;
+ private int nFastaFiles=0;
+ private String typeString;
+ private long lastCompleted = -1;
+
+
+ /**
+ * Constructor
+     * @param t type (defined in NanoOKOptions)
+ * @param o NanoOKOptions object
+ * @param s set of stats to associate with this read set
+ */
+ public ReadSet(int t, NanoOKOptions o, ReadSetStats s) {
+ options = o;
+ type = t;
+        stats = s;
+        typeString = NanoOKOptions.getTypeFromInt(t);
+
+ queryExecutor = new ThreadPoolExecutor(options.getNumberOfThreads(), options.getNumberOfThreads(), 10, TimeUnit.SECONDS, new LinkedBlockingQueue<Runnable>());
+ }
+
+ /**
+ * Write progress
+ */
+ private void writeProgress(ThreadPoolExecutor tpe) {
+ long completed = tpe.getCompletedTaskCount();
+ long total = tpe.getTaskCount();
+ long e = 0;
+ long s = NanoOKOptions.PROGRESS_WIDTH;
+
+ if (total > 0) {
+ e = NanoOKOptions.PROGRESS_WIDTH * completed / total;
+ s = NanoOKOptions.PROGRESS_WIDTH - e;
+ }
+
+ if (completed != lastCompleted) {
+ System.out.print("\r[");
+ for (int i=0; i<e; i++) {
+ System.out.print("=");
+ }
+ for (int i=0; i<s; i++) {
+ System.out.print(" ");
+ }
+ System.out.print("] " + completed +"/" + total);
+ lastCompleted = completed;
+ }
+ }
+
+ /**
+ * Check if filename has valid read extension
+     * @param f filename
+     * @return true if valid for the chosen read format
+ */
+ private boolean isValidReadExtension(String f) {
+ boolean r = false;
+
+ if (options.getReadFormat() == NanoOKOptions.FASTQ) {
+ if ((f.endsWith(".fastq")) || (f.endsWith(".fq"))) {
+ r = true;
+ }
+ } else {
+ if ((f.endsWith(".fasta")) || (f.endsWith(".fa"))) {
+ r = true;
+ }
+ }
+
+ return r;
+ }
+
+ /**
+ * Gather length statistics on reads and parse alignments
+ */
+ public int processReadsOld() throws InterruptedException {
+ AlignmentFileParser parser = options.getParser();
+ String[] readDirs = new String[MAX_READ_DIRS];
+ String[] alignerDirs = new String[MAX_READ_DIRS];
+ int readTypes[] = new int[MAX_READ_DIRS];
+ int nDirs = 0;
+ int maxReads = options.getMaxReads();
+ String outputFilename = options.getAnalysisDir() + File.separator + "Unaligned" + File.separator + options.getTypeFromInt(type) + "_nonaligned.txt";
+ AlignmentsTableFile nonAlignedSummary = new AlignmentsTableFile(outputFilename);
+
+ nFastaFiles=0;
+
+ stats.openLengthsFile();
+
+ if (options.usingPassFailDirs()) {
+ for (int pf=NanoOKOptions.READTYPE_PASS; pf<=NanoOKOptions.READTYPE_FAIL; pf++) {
+ String passOrFail="";
+
+ if ((pf == NanoOKOptions.READTYPE_PASS) && (options.isProcessingPassReads())) {
+ passOrFail="pass";
+ } else if ((pf == NanoOKOptions.READTYPE_FAIL) && (options.isProcessingFailReads())) {
+ passOrFail="fail";
+ }
+
+ if (passOrFail != "") {
+ if (options.isBarcoded()) {
+ File inputDir = new File(options.getReadDir() + File.separator + passOrFail + File.separator + options.getTypeFromInt(type));
+ File[] listOfFiles = inputDir.listFiles();
+ for (File file : listOfFiles) {
+ if (file.isDirectory()) {
+ if (nDirs == MAX_READ_DIRS) {
+ System.out.println("Error: too many directories.\n");
+ System.exit(1);
+ }
+ readDirs[nDirs] = inputDir.getPath() + File.separator + file.getName();
+ alignerDirs[nDirs] = options.getAlignerDir() + File.separator + passOrFail + File.separator + options.getTypeFromInt(type) + File.separator + file.getName();
+ readTypes[nDirs++] = pf;
+ }
+ }
+ } else {
+ readDirs[nDirs] = options.getReadDir() + File.separator + passOrFail + File.separator + options.getTypeFromInt(type);
+ alignerDirs[nDirs] = options.getAlignerDir() + File.separator + passOrFail + File.separator + options.getTypeFromInt(type);
+ readTypes[nDirs++] = pf;
+ }
+
+ }
+ }
+ } else {
+ readDirs[nDirs] = options.getReadDir();
+ alignerDirs[nDirs] = options.getAlignerDir();
+ readTypes[nDirs] = NanoOKOptions.READTYPE_COMBINED;
+ nDirs++;
+ }
+
+ for (int dirIndex=0; dirIndex<nDirs; dirIndex++) {
+ String inputDir = readDirs[dirIndex];
+ String alignDir = alignerDirs[dirIndex];
+ File folder = new File(inputDir);
+ File[] listOfFiles = folder.listFiles();
+
+ if (listOfFiles == null) {
+ System.out.println("");
+ System.out.println("Directory "+inputDir+" doesn't exist");
+ } else if (listOfFiles.length <= 0) {
+ System.out.println("");
+ System.out.println("Directory "+inputDir+" empty");
+ } else {
+ for (File file : listOfFiles) {
+ if (file.isFile()) {
+ if (isValidReadExtension(file.getName())) {
+ String alignmentFilename = alignDir + File.separator + file.getName() + parser.getAlignmentFileExtension();
+ //System.out.println(alignmentFilename);
+ //options.getLog().println("File: " + alignmentFilename);
+ if (new File(alignmentFilename).exists()) {
+ queryExecutor.execute(new ParserRunnable(options, stats, file.getAbsolutePath(), alignmentFilename, type, readTypes[dirIndex], nonAlignedSummary));
+ writeProgress(queryExecutor);
+
+ nFastaFiles++;
+ if ((maxReads > 0) && (nFastaFiles >= maxReads)) {
+ break;
+ }
+
+ } else {
+ System.out.println("Error: Read ignored, can't find alignment "+alignmentFilename);
+ }
+ }
+ }
+ }
+ }
+ }
+
+
+
+ // That's all - wait for all threads to finish
+ queryExecutor.shutdown();
+ while (!queryExecutor.isTerminated()) {
+ writeProgress(queryExecutor);
+ Thread.sleep(100);
+ }
+
+ writeProgress(queryExecutor);
+ System.out.println("");
+
+ stats.closeLengthsFile();
+ stats.writeSummaryFile();
+ stats.calculateStats();
+
+ return nFastaFiles;
+ }
+
+ /**
+ * Gather length statistics on reads and parse alignments
+ */
+ public int processReadsBatch() throws InterruptedException {
+ AlignmentFileParser parser = options.getParser();
+ String[] readDirs = new String[MAX_READ_DIRS];
+ String[] alignerDirs = new String[MAX_READ_DIRS];
+ int readTypes[] = new int[MAX_READ_DIRS];
+ int nDirs = 0;
+ int maxReads = options.getMaxReads();
+ String outputFilename = options.getAnalysisDir() + File.separator + "Unaligned" + File.separator + options.getTypeFromInt(type) + "_nonaligned.txt";
+ AlignmentsTableFile nonAlignedSummary = new AlignmentsTableFile(outputFilename);
+
+ nFastaFiles=0;
+
+ typeString = options.getTypeFromInt(type);
+
+ stats.openLengthsFile();
+
+ for (int pf=NanoOKOptions.READTYPE_PASS; pf<=NanoOKOptions.READTYPE_FAIL; pf++) {
+ String passOrFail="";
+
+ if ((pf == NanoOKOptions.READTYPE_PASS) && (options.isProcessingPassReads())) {
+ passOrFail="pass";
+ } else if ((pf == NanoOKOptions.READTYPE_FAIL) && (options.isProcessingFailReads())) {
+ passOrFail="fail";
+ }
+
+ if (passOrFail != "") {
+ if (options.isBarcoded()) {
+ File inputDir = new File(options.getReadDir() + File.separator + passOrFail + File.separator + typeString);
+ File[] listOfFiles = inputDir.listFiles();
+ for (File file : listOfFiles) {
+ if (file.isDirectory()) {
+ if (nDirs == MAX_READ_DIRS) {
+ System.out.println("Error: too many directories.\n");
+ System.exit(1);
+ }
+ readDirs[nDirs] = options.getReadDir() + File.separator + passOrFail + File.separator + typeString + File.separator + file.getName();
+ alignerDirs[nDirs] = options.getAlignerDir() + File.separator + passOrFail + File.separator + typeString + File.separator + file.getName();
+ readTypes[nDirs++] = pf;
+ }
+ }
+ } else {
+ readDirs[nDirs] = options.getReadDir() + File.separator + passOrFail + File.separator + typeString;
+ alignerDirs[nDirs] = options.getAlignerDir() + File.separator + passOrFail + File.separator + typeString;
+ readTypes[nDirs++] = pf;
+ }
+
+ }
+ }
+
+ // Dirs should be e.g.
+        // inputDir = sample/fasta/pass/Template
+        // alignDir = sample/last/pass/Template
+ for (int dirIndex=0; dirIndex<nDirs; dirIndex++) {
+ String inputDir = readDirs[dirIndex];
+ String alignDir = alignerDirs[dirIndex];
+ File folder = new File(inputDir);
+ File[] listOfFilesTop = folder.listFiles();
+
+ options.getLog().println("Input: "+inputDir);
+ options.getLog().println("Align: "+alignDir);
+
+ // Now list of files should contain batch_XXX directories
+ if (listOfFilesTop == null) {
+ System.out.println("");
+ System.out.println("Directory "+inputDir+" doesn't exist");
+ } else if (listOfFilesTop.length <= 0) {
+ System.out.println("");
+ System.out.println("Directory "+inputDir+" empty");
+ } else {
+ for (File topLevelFile : listOfFilesTop) {
+ options.getLog().println(" Got dir "+ topLevelFile.getName());
+ if (topLevelFile.isDirectory()) {
+ // Now go through reads in directory
+ File[] listOfFiles = topLevelFile.listFiles();
+ for (File file : listOfFiles) {
+ if (file.isFile()) {
+ if (isValidReadExtension(file.getName())) {
+ String alignmentFilename = alignDir + File.separator + topLevelFile.getName() + File.separator + file.getName() + parser.getAlignmentFileExtension();
+ //System.out.println(alignmentFilename);
+ //options.getLog().println("File: " + alignmentFilename);
+ if (new File(alignmentFilename).exists()) {
+ queryExecutor.execute(new ParserRunnable(options, stats, file.getAbsolutePath(), alignmentFilename, type, readTypes[dirIndex], nonAlignedSummary));
+ writeProgress(queryExecutor);
+
+ nFastaFiles++;
+ if ((maxReads > 0) && (nFastaFiles >= maxReads)) {
+ break;
+ }
+
+ } else {
+ System.out.println("Error: Read ignored, can't find alignment "+alignmentFilename);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+
+
+ // That's all - wait for all threads to finish
+ queryExecutor.shutdown();
+ while (!queryExecutor.isTerminated()) {
+ writeProgress(queryExecutor);
+ Thread.sleep(100);
+ }
+
+ writeProgress(queryExecutor);
+ System.out.println("");
+
+ stats.closeLengthsFile();
+ stats.writeSummaryFile();
+ stats.calculateStats();
+
+ return nFastaFiles;
+ }
+
+ public int processReads() throws InterruptedException {
+ if (options.usingBatchDirs()) {
+ return processReadsBatch();
+ } else {
+ return processReadsOld();
+ }
+ }
+
+ /**
+ * Get type of this read set.
+ * @return a String (e.g. "Template")
+ */
+ public String getTypeString() {
+ return typeString;
+ }
+
+ /**
+ * Get stats object.
+ * @return a ReadSetStats object
+ */
+ public ReadSetStats getStats() {
+ return stats;
+ }
+}
diff --git a/src/nanook/ReadSetStats.java b/src/nanook/ReadSetStats.java
new file mode 100644
index 0000000..ab85a65
--- /dev/null
+++ b/src/nanook/ReadSetStats.java
@@ -0,0 +1,625 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett
+ *
+ * Copyright 2015 The Genome Analysis Centre (TGAC)
+ */
+
+package nanook;
+
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Hashtable;
+import java.util.Map;
+
+/**
+ * Represents statistics about a read set (for example, the Template read set).
+ *
+ * @author Richard Leggett
+ */
+public class ReadSetStats implements Serializable {
+ private static final long serialVersionUID = NanoOK.SERIAL_VERSION;
+ private NanoOKOptions options;
+ private transient PrintWriter pwLengths = null;
+ private transient PrintWriter pwKmers = null;
+ private String typeString = "";
+ private int longest = 0;
+ private int shortest = NanoOKOptions.MAX_READ_LENGTH;
+ private long basesSum = 0;
+ private double meanLength = 0;
+ private int n50 = 0;
+ private int n50Count = 0;
+ private int n90 = 0;
+ private int n90Count = 0;
+ private int[] lengths = new int[NanoOKOptions.MAX_READ_LENGTH];
+ private Hashtable<String,Integer> readLengths = new Hashtable();
+ private Hashtable<String,Double> readGC = new Hashtable();
+ private int nReads = 0;
+ private int nReadFiles = 0;
+ private int nPassFiles = 0;
+ private int nFailFiles = 0;
+ private int nReadsWithAlignments = 0;
+ private int nReadsWithoutAlignments = 0;
+ private int[] readBestPerfectKmer = new int[NanoOKOptions.MAX_KMER];
+ private int[] readCumulativeBestPerfectKmer = new int[NanoOKOptions.MAX_KMER];
+ private MotifStatistics motifStats = new MotifStatistics();
+ private int substitutionErrors[][] = new int[4][4];
+ private int nSubstitutions = 0;
+ private int nInsertions = 0;
+ private int nDeletions = 0;
+ private int ignoredDuplicates = 0;
+ private int type;
+
+ /**
+ * Constructor
+ * @param o NanoOKOptions object
+ * @param t Type integer (defined in NanoOKOptions)
+ */
+ public ReadSetStats(NanoOKOptions o, int t) {
+ options=o;
+ type = t;
+ typeString = NanoOKOptions.getTypeFromInt(type);
+ for (int i=0; i<NanoOKOptions.MAX_KMER; i++) {
+ readBestPerfectKmer[i] = 0;
+ readCumulativeBestPerfectKmer[i] = 0;
+ }
+ }
+
+ /**
+     * Open text files to store read lengths and per-read kmer counts.
+ */
+ public void openLengthsFile() {
+ String lengthsFilename = options.getAnalysisDir() + File.separator + "all_" + typeString + "_lengths.txt";
+ String kmersFilename = options.getAnalysisDir() + File.separator + "all_" + typeString + "_kmers.txt";
+
+ options.getLog().println("Opening "+lengthsFilename);
+ options.getLog().println("Opening "+kmersFilename);
+
+ try {
+ pwLengths = new PrintWriter(new FileWriter(lengthsFilename));
+ pwKmers = new PrintWriter(new FileWriter(kmersFilename));
+ pwKmers.write("Id\tLength\tnk15\tnk17\tnk19\tnk21\tnk23\tnk25");
+ pwKmers.println("");
+ } catch (IOException e) {
+ System.out.println("openLengthsFile exception:");
+ e.printStackTrace();
+ System.exit(1);
+ }
+ }
+
+ /**
+ * Close the read lengths file.
+ */
+ public void closeLengthsFile() {
+ pwLengths.close();
+ }
+
+ /**
+ * Close the kmers file
+ */
+ public void closeKmersFile() {
+ pwKmers.close();
+ }
+
+ /**
+ * Calculate various statistics, e.g. N50 etc.
+ */
+ public void calculateStats() {
+ int total = 0;
+ int c = 0;
+
+ meanLength = (double)basesSum / (double)nReads;
+
+ for (int i=longest; i>0; i--) {
+ for (int j=0; j<lengths[i]; j++) {
+ total += i;
+ c++;
+
+ if ((n50 == 0) && ((double)total >= ((double)basesSum * 0.5))) {
+ n50 = i;
+ n50Count = c;
+ }
+
+ if ((n90 == 0) && ((double)total >= ((double)basesSum * 0.9))) {
+ n90 = i;
+ n90Count = c;
+ }
+ }
+ }
+
+ }
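+
+    // Worked example (illustrative): for read lengths {10, 8, 6, 2},
+    // basesSum = 26 and meanLength = 6.5. Walking from the longest read down,
+    // the running total first reaches 50% of basesSum (13) at length 8, so
+    // n50 = 8 and n50Count = 2; it first reaches 90% (23.4) at length 6, so
+    // n90 = 6 and n90Count = 3.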
+
+ /**
+ * Update count of read files.
+     * @param type read type (pass or fail)
+ */
+ public synchronized void addReadFile(int type) {
+ nReadFiles++;
+
+ if (type == NanoOKOptions.READTYPE_PASS) {
+ nPassFiles++;
+ } else if (type == NanoOKOptions.READTYPE_FAIL) {
+ nFailFiles++;
+ }
+ }
+
+ /**
+ * Get number of read files in pass directory
+ * @return Number of files in pass directory
+ */
+ public synchronized int getNumberOfPassFiles() {
+ return nPassFiles;
+ }
+
+ /**
+ * Get number of read files in fail directory
+ * @return Number of files in fail directory
+ */
+ public synchronized int getNumberOfFailFiles() {
+ return nFailFiles;
+ }
+
+ /**
+ * Get type
+ * @return type
+ */
+ public int getType() {
+ return type;
+ }
+
+ /**
+ * Get type as a string.
+ * @return type String
+ */
+ public String getTypeString() {
+ return typeString;
+ }
+
+ /**
+ * Get mean length of reads in this read set.
+ * @return length
+ */
+ public synchronized double getMeanLength() {
+ return meanLength;
+ }
+
+ /**
+ * Get longest read in this read set.
+ * @return length
+ */
+ public synchronized int getLongest() {
+ return longest;
+ }
+
+ /**
+ * Get shortest read in this read set.
+ * @return length
+ */
+ public synchronized int getShortest() {
+ return shortest;
+ }
+
+ /**
+ * Get N50 for this read set.
+ * @return N50 length
+ */
+ public synchronized int getN50() {
+ return n50;
+ }
+
+ /**
+ * Get N50 count - number of reads of length N50 or greater.
+ * @return count
+ */
+ public synchronized int getN50Count() {
+ return n50Count;
+ }
+
+ /**
+ * Get N90 for this read set.
+ * @return N90 length
+ */
+ public synchronized int getN90() {
+ return n90;
+ }
+
+ /**
+ * Get N90 count - number of reads of length N90 or greater.
+ * @return count
+ */
+ public synchronized int getN90Count() {
+ return n90Count;
+ }
+
+ /**
+ * Get number of reads.
+ * @return number of reads
+ */
+ public synchronized int getNumReads() {
+ return nReads;
+ }
+
+ /**
+ * Get total bases represented by read set.
+ * @return number of bases
+ */
+ public synchronized long getTotalBases() {
+ return basesSum;
+ }
+
+ /**
+ * Get number of read files.
+ * @return number of files
+ */
+ public synchronized int getNumReadFiles() {
+ return nReadFiles;
+ }
+
+ private String getPrefix(String path) {
+ String leafname = new File(path).getName();
+        leafname = leafname.replaceAll(":", "_");
+ return leafname.substring(0, leafname.indexOf(".fa"));
+ }
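+
+    // Example (path is illustrative): getPrefix("/reads/batch_007.fastq")
+    // returns "batch_007", i.e. everything before the first ".fa" after any
+    // ':' characters have been replaced with '_'.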
+
+    /**
+     * Store a read length in the array of read lengths.
+     * @param readPath path of the read file containing the read
+     * @param id ID of read
+     * @param l length
+     * @param gc GC percentage of the read
+     */
+ public synchronized void addLength(String readPath, String id, int l, double gc) {
+ pwLengths.println(id + "\t" + l);
+ id = getPrefix(readPath) + ":"+id;
+
+ if (readLengths.containsKey(id)) {
+ System.out.println("Error: Read ID "+id+" already seen. This occurrance ignored.");
+ ignoredDuplicates++;
+ } else {
+ readLengths.put(id, l);
+ readGC.put(id, gc);
+ }
+
+ if (l < NanoOKOptions.MAX_READ_LENGTH) {
+ lengths[l]++;
+ if (l > longest) {
+ longest = l;
+ }
+
+ if (l < shortest) {
+ shortest = l;
+ }
+ } else {
+ System.out.println("Error: unexpectedly long ("+l+") read ignored - "+readPath);
+ }
+
+ basesSum += l;
+ nReads++;
+ }
+
+ /**
+ * Get length of read
+     * @param alignmentFile alignment filename, used to derive the read file prefix
+     * @param id ID of read
+ * @return length, in bases
+ */
+ public synchronized int getReadLength(String alignmentFile, String id) {
+ int length = -1;
+
+ id = getPrefix(alignmentFile) + ":"+id;
+
+ Integer l = readLengths.get(id);
+
+ if (l != null) {
+ length = l.intValue();
+ }
+
+ return length;
+ }
+
+ /**
+ * Get GC of read
+     * @param alignmentFile alignment filename, used to derive the read file prefix
+     * @param id ID of read
+ * @return GC percent
+ */
+ public synchronized double getGC(String alignmentFile, String id) {
+ double gc = -1;
+
+ id = getPrefix(alignmentFile) + ":"+id;
+
+ Double g = readGC.get(id);
+
+ if (g == null) {
+ g = 50.0;
+ System.out.println("Warning: couldn't get GC from " + alignmentFile + " - assumed 50%");
+ }
+
+ //if (g != null) {
+ // gc = g.intValue();
+ //}
+
+ //return gc;
+ return g;
+ }
+
+ /**
+ * Store a read with an alignment.
+ */
+ public synchronized void addReadWithAlignment() {
+ nReadsWithAlignments++;
+ }
+
+ /**
+ * Store a read without an alignment.
+ */
+ public synchronized void addReadWithoutAlignment() {
+ nReadsWithoutAlignments++;
+ }
+
+ /**
+ * Store best perfect kmers for each read.
+ * @param bestKmer length of best perfect kmer
+ */
+ public synchronized void addReadBestKmer(int bestKmer) {
+ if (bestKmer >= NanoOKOptions.MAX_KMER) {
+ System.out.println("Error: the unlikely event of a best kmer size of "+bestKmer+" has happened! (Max "+NanoOKOptions.MAX_KMER+")");
+ System.exit(1);
+ }
+
+ readBestPerfectKmer[bestKmer]++;
+
+ for (int i=0; i<bestKmer; i++) {
+ readCumulativeBestPerfectKmer[i]++;
+ }
+ }
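+
+    // Illustrative example: a call with bestKmer = 5 increments
+    // readBestPerfectKmer[5] and readCumulativeBestPerfectKmer[0..4], so each
+    // cumulative entry counts the reads whose best perfect kmer is longer
+    // than that index.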
+
+ /**
+ * Get number of reads in this read set.
+ * @return number of reads.
+ */
+ public synchronized int getNumberOfReads() {
+ return nReads;
+ }
+
+ /**
+ * Get number of reads with alignments in this read set.
+ * @return number of reads
+ */
+ public synchronized int getNumberOfReadsWithAlignments() {
+ return nReadsWithAlignments;
+ }
+
+ /**
+ * Get number of reads without alignments in this read set.
+ * @return number of reads
+ */
+ public synchronized int getNumberOfReadsWithoutAlignments() {
+ return nReadsWithoutAlignments;
+ }
+
+ /**
+ * Get percentage of reads with alignments
+ * @return percentage of reads
+ */
+ public synchronized double getPercentOfReadsWithAlignments() {
+ return (100.0 * (double)nReadsWithAlignments) / (double)nReads;
+ }
+
+ /**
+ * Get percentage of reads without alignments
+ * @return percentage of reads
+ */
+ public synchronized double getPercentOfReadsWithoutAlignments() {
+ return (100.0 * (double)nReadsWithoutAlignments) / (double)nReads;
+ }
+
+ /**
+ * Print statistics to screen.
+ */
+ public synchronized void printStats() {
+ System.out.println("Parse " + typeString + " alignments");
+ System.out.println(typeString + " reads: " + nReads);
+ System.out.println(typeString + " reads with alignments: " + nReadsWithAlignments);
+ System.out.println(typeString + " reads without alignments: " + nReadsWithoutAlignments);
+ }
+
+ /**
+ * Write a short summary file for this read set.
+ */
+ public void writeSummaryFile() {
+ String filename = options.getAlignmentSummaryFilename();
+ try {
+ PrintWriter pw = new PrintWriter(new FileWriter(filename, true));
+ pw.println("");
+ pw.printf("%s alignments", typeString);
+ pw.println("");
+ pw.println("");
+ pw.printf("Num reads: %d", nReads);
+ pw.println("");
+ pw.printf("Num reads with alignments: %d", nReadsWithAlignments);
+ pw.println("");
+ pw.printf("Num reads without alignments: %d", nReadsWithoutAlignments);
+ pw.println("");
+ pw.close();
+ } catch (IOException e) {
+ System.out.println("writeSummaryFile exception:");
+ e.printStackTrace();
+ System.exit(1);
+ }
+ }
+
+ /**
+ * Store a deletion error.
+ * @param size size of deletion
+ * @param kmer kmer prior to error
+ */
+ public synchronized void addDeletionError(int size, String kmer) {
+ motifStats.addDeletionMotifs(kmer);
+ nDeletions++;
+ }
+
+ /**
+ * Store an insertion error.
+ * @param size size of insertion
+ * @param kmer kmer prior to error
+ */
+ public synchronized void addInsertionError(int size, String kmer) {
+ motifStats.addInsertionMotifs(kmer);
+ nInsertions++;
+ }
+
+ /**
+ * Store a substitution error.
+ * @param kmer kmer prior to error
+ * @param refChar reference base
+ * @param subChar substituted base
+ */
+ public synchronized void addSubstitutionError(String kmer, char refChar, char subChar) {
+ int r = -1;
+ int s = -1;
+
+ motifStats.addSubstitutionMotifs(kmer);
+
+ switch(refChar) {
+ case 'A': r=0; break;
+ case 'C': r=1; break;
+ case 'G': r=2; break;
+ case 'T': r=3; break;
+ default: break; //System.out.println("Warning: Unknown base ("+refChar+") in reference"); break;
+ }
+
+ switch(subChar) {
+ case 'A': s=0; break;
+ case 'C': s=1; break;
+ case 'G': s=2; break;
+ case 'T': s=3; break;
+ default: System.out.println("Warning: Unknown base ("+refChar+") in read"); break;
+ }
+
+ if ((r >= 0) && (s >= 0)) {
+ nSubstitutions++;
+ substitutionErrors[r][s]++;
+ }
+ }
+
+ /**
+ * Get substitution error matrix (A, C, G, T vs A, C, G, T).
+ * @return Substitution error matrix
+ */
+ public synchronized int[][] getSubstitutionErrors() {
+ return substitutionErrors;
+ }
+
+ /**
+ * Get number of substitutions.
+ * @return number
+ */
+ public synchronized int getNumberOfSubstitutions() {
+ return nSubstitutions;
+ }
+
+ /**
+ * Write motif stats to screen.
+ */
+ public synchronized void outputMotifStats() {
+ motifStats.outputAllMotifCounts();
+ }
+
+ /**
+ * Get motif statistics.
+ * @return MotifStatistics object
+ */
+ public synchronized MotifStatistics getMotifStatistics() {
+ return motifStats;
+ }
+
+ public synchronized void writekCounts(String id, int length, int nk, int[] s, int[] kCounts) {
+ pwKmers.print(id+"\t"+Integer.toString(length));
+ for (int i=0; i<nk; i++) {
+ pwKmers.print("\t"+Integer.toString(kCounts[i]));
+ }
+ pwKmers.println("");
+ }
+
+ /**
+ * Get options
+ */
+ public NanoOKOptions getOptions() {
+ return options;
+ }
+
+ /**
+ * Write substitution stats to a file
+ */
+ public void writeSubstitutionStats() {
+ String filenamePc = options.getAnalysisDir() + File.separator + "all_" + NanoOKOptions.getTypeFromInt(type) + "_substitutions_percent.txt";
+ String bases[] = {"A","C","G","T"};
+ try {
+ PrintWriter pwPc = new PrintWriter(new FileWriter(filenamePc));
+ pwPc.println("\tSubA\tSubC\tSubG\tSubT");
+ for (int r=0; r<4; r++) {
+ pwPc.print("Ref"+bases[r]);
+ for (int s=0; s<4; s++) {
+ double pc = 0;
+
+ if (substitutionErrors[r][s] > 0) {
+ pc = (100.0 * (double)substitutionErrors[r][s]) / nSubstitutions;
+ }
+ pwPc.printf("\t%.2f", pc);
+ }
+ pwPc.println("");
+ }
+ pwPc.close();
+ } catch (IOException e) {
+ System.out.println("writeSubstitutionStats exception:");
+ e.printStackTrace();
+ System.exit(1);
+ }
+ }
+
+ /**
+ * Write error motif stats to a file
+ */
+ public void writeErrorMotifStats() {
+ try {
+ for (int t=0; t<3; t++) {
+ for (int n=3; n<=5; n++) {
+ ArrayList<Map.Entry<String, Double>> motifs = null;
+ String typeString = "";
+ String filename = "";
+
+ if (t == 0) {
+ typeString = "insertion";
+ motifs = motifStats.getSortedInsertionMotifPercentages(n);
+ } else if (t == 1) {
+ typeString = "deletion";
+ motifs = motifStats.getSortedDeletionMotifPercentages(n);
+ } else {
+ typeString = "substitution";
+ motifs = motifStats.getSortedSubstitutionMotifPercentages(n);
+ }
+
+ filename = options.getAnalysisDir() + File.separator + "all_" + NanoOKOptions.getTypeFromInt(type) + "_"+typeString+"_"+n+"mer_motifs.txt";
+ PrintWriter pw = new PrintWriter(new FileWriter(filename));
+ pw.println("Kmer\tPercentage");
+
+ for (int i=0; i<motifs.size(); i++) {
+ pw.printf("%s\t%.4f", motifs.get(i).getKey(), motifs.get(i).getValue());
+ pw.println("");
+ }
+ pw.close();
+ }
+ }
+ } catch (IOException e) {
+ System.out.println("writeSubstitutionStats exception:");
+ e.printStackTrace();
+ System.exit(1);
+ }
+ }
+
+ public int getIgnoredDuplicates() {
+ return ignoredDuplicates;
+ }
+}
\ No newline at end of file
diff --git a/src/nanook/ReadStats.java b/src/nanook/ReadStats.java
new file mode 100644
index 0000000..3856b9c
--- /dev/null
+++ b/src/nanook/ReadStats.java
@@ -0,0 +1,6 @@
+
+package nanook;
+
+public class ReadStats {
+
+}
diff --git a/src/nanook/ReferenceSequence.java b/src/nanook/ReferenceSequence.java
new file mode 100644
index 0000000..f9e3f46
--- /dev/null
+++ b/src/nanook/ReferenceSequence.java
@@ -0,0 +1,190 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett
+ *
+ * Copyright 2015 The Genome Analysis Centre (TGAC)
+ */
+
+package nanook;
+
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.Serializable;
+import java.util.HashSet;
+import java.util.Hashtable;
+import java.util.Set;
+
+/**
+ * Represents a sequence (contig) within a reference.
+ *
+ * @author Richard Leggett
+ */
+public class ReferenceSequence implements Comparable, Serializable {
+ private static final long serialVersionUID = NanoOK.SERIAL_VERSION;
+ private String id = null;
+ private String name = null;
+ private int size = 0;
+ private int binSize = 500;
+ private ReferenceSequenceStats referenceStats[] = new ReferenceSequenceStats[3];
+ private KmerTable refKmerTable = new KmerTable(5);
+
+ /**
+ * Constructor
+ * @param i sequence ID
+ * @param s size (length) of sequence
+     * @param n display name (may be different from the ID in the file)
+ */
+ public ReferenceSequence(String i, int s, String n) {
+ id = i;
+ size = s;
+ name = n;
+
+ float b = size / 100;
+
+ // Make a multiple of 10, 100 or 500...
+ if (size < 50000) {
+ binSize = 10 * (1 + Math.round(b / 10));
+ } else if (size < 500000) {
+ binSize = 100 * (1 + Math.round(b / 100));
+ } else {
+ binSize = 500 * (1 + Math.round(b / 500));
+ }
+
+ for (int t=0; t<3; t++) {
+ referenceStats[t] = new ReferenceSequenceStats(size, name);
+ }
+ }
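+
+    // Worked examples of the bin size calculation above (sizes illustrative):
+    //   size = 3000    -> b = 30    -> binSize = 10  * (1 + round(30/10))    = 40
+    //   size = 250000  -> b = 2500  -> binSize = 100 * (1 + round(2500/100)) = 2600
+    // Note that b = size / 100 is an integer division before the float conversion.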
+
+ /**
+ * Open alignment summary files for each reference for each type (Template, Complement, 2D).
+ *
+     * @param options NanoOK options, used to locate the analysis directory and the read types to process
+ */
+ public void openAlignmentSummaryFiles(NanoOKOptions options) {
+ for (int t=0; t<3; t++) {
+ if (options.isProcessingReadType(t)) {
+ referenceStats[t].openAlignmentsTableFile(options.getAnalysisDir() + File.separator + name + File.separator + name + "_" + NanoOKOptions.getTypeFromInt(t) + "_alignments.txt");
+ }
+ }
+ }
+
+ /**
+ * Get stats for a particular type (Template, Complement, 2D).
+ * @param t integer type
+ * @return ReferenceSequenceStats object
+ */
+ public ReferenceSequenceStats getStatsByType(int t) {
+ return referenceStats[t];
+ }
+
+ /**
+ * Get ID for this sequence.
+ * @return ID String
+ */
+ public String getId() {
+ return id;
+ }
+
+ /**
+ * Get display name for this sequence.
+ * @return name String
+ */
+ public String getName() {
+ return name;
+ }
+
+ /**
+ * Get size (length) of this sequence.
+ * @return length
+ */
+ public int getSize() {
+ return size;
+ }
+
+ /**
+ * Get bin size for graph plotting
+ * @return size (nt)
+ */
+ public int getBinSize() {
+ return binSize;
+ }
+
+ public int compareTo(Object o) {
+ ReferenceSequence r = (ReferenceSequence)o;
+ return name.compareTo(r.getName());
+ }
+
+ /**
+ * Get kmer table
+ * @return
+ */
+ public KmerTable getKmerTable() {
+ return refKmerTable;
+ }
+
+    /**
+     * Write a comparison of kmer abundance in the reference and in the reads.
+     * @param type read type (as defined in NanoOKOptions)
+     * @param filename output filename
+     */
+ public void writeKmerFile(int type, String filename) {
+ KmerTable readKmerTable = referenceStats[type].getReadKmerTable();
+
+ try {
+ PrintWriter pw = new PrintWriter(new FileWriter(filename));
+ pw.println("Kmer\tRefCount\tReadCount\tRefPc\tReadPc");
+
+ Set<String> refKeys = refKmerTable.getKeys();
+ Set<String> readKeys = readKmerTable.getKeys();
+ HashSet<String> allKeys = new HashSet();
+ int refTotal = 0;
+ int readTotal = 0;
+
+ for (String kmer : refKeys) {
+ refTotal += refKmerTable.get(kmer);
+ allKeys.add(kmer);
+ }
+
+ int count = 0;
+ for (String kmer : readKeys) {
+ readTotal += readKmerTable.get(kmer);
+ if (! allKeys.contains(kmer)) {
+ allKeys.add(kmer);
+ count++;
+ }
+ }
+
+ for (String kmer : allKeys) {
+ int refCount = refKmerTable.get(kmer);
+ int readCount = readKmerTable.get(kmer);
+ double refPc = 0;
+ double readPc = 0;
+
+ if (refCount > 0) {
+ refPc = (100 * refCount) / (double)refTotal;
+ }
+
+ if (readCount > 0) {
+ readPc = (100 * readCount) / (double)readTotal;
+ }
+
+ referenceStats[type].addKmerAbundance(kmer, refPc, readPc);
+
+ pw.printf("%s\t%d\t%d\t%.4f\t%.4f", kmer, refCount, readCount, refPc, readPc);
+ pw.println("");
+ }
+
+ pw.close();
+ } catch (IOException e) {
+ System.out.println("Exception:");
+ e.printStackTrace();
+ System.exit(1);
+ }
+ }
+
+ public int getTotalNumberOfAlignments() {
+ return referenceStats[0].getNumberOfReadsWithAlignments() +
+ referenceStats[1].getNumberOfReadsWithAlignments() +
+ referenceStats[2].getNumberOfReadsWithAlignments();
+ }
+}
diff --git a/src/nanook/ReferenceSequenceStats.java b/src/nanook/ReferenceSequenceStats.java
new file mode 100644
index 0000000..39133e8
--- /dev/null
+++ b/src/nanook/ReferenceSequenceStats.java
@@ -0,0 +1,548 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett
+ *
+ * Copyright 2015 The Genome Analysis Centre (TGAC)
+ */
+
+package nanook;
+
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Collections;
+
+/**
+ * Stores stats for each reference sequence, one object per read type (Template, Complement, 2D).
+ *
+ * @author Richard Leggett
+ */
+public class ReferenceSequenceStats implements Serializable {
+ private static final long serialVersionUID = NanoOK.SERIAL_VERSION;
+ private static final int MAX_INDEL = 100;
+ private int size;
+ private String name;
+ private SequenceCoverage cov;
+ //int[] coverage;
+ private int[] perfectKmerCounts = new int[NanoOKOptions.MAX_KMER];
+ private int[] readBestPerfectKmer = new int[NanoOKOptions.MAX_KMER];
+ private int[] readCumulativeBestPerfectKmer = new int[NanoOKOptions.MAX_KMER];
+ private int longestPerfectKmer = 0;
+ private int nReadsWithAlignments = 0;
+ private int totalReadBases = 0;
+ private int totalAlignedBases = 0;
+ private int totalAlignedBasesWithoutIndels = 0;
+ private int totalIdentical = 0;
+ private int nDeletionErrors = 0;
+ private int nInsertionErrors = 0;
+ private int nSubstitutionErrors = 0;
+ private int nInsertedBases = 0;
+ private int nDeletedBases = 0;
+ private int largestInsertion = 0;
+ private int largestDeletion = 0;
+ private int insertionSizes[] = new int[MAX_INDEL];
+ private int deletionSizes[] = new int[MAX_INDEL];
+ private int alignedPositiveStrand = 0;
+ private int alignedNegativeStrand = 0;
+ private long totalBases = 0;
+ private long totalReads = 0;
+ private KmerTable readKmerTable = new KmerTable(5);
+ private AlignmentsTableFile atf;
+ private ArrayList<KmerAbundance> kmerAbundance = new ArrayList();
+
+ /**
+ * Constructor.
+     * @param s size (length) of reference
+ * @param n name of reference
+ */
+ public ReferenceSequenceStats(int s, String n) {
+ size = s;
+ name = n;
+ cov = new SequenceCoverage(size);
+ //coverage = new int[size];
+ }
+
+ /**
+ * Create an alignments table file.
+     * @param filename filename
+ */
+ public void openAlignmentsTableFile(String filename) {
+ atf = new AlignmentsTableFile(filename);
+ }
+
+ /**
+ * Get the associated AlignmentsTableFile object
+ * @return an AlignmentsTableFile
+ */
+ public AlignmentsTableFile getAlignmentsTableFile() {
+ return atf;
+ }
+
+ /**
+ * Get number of reads with alignments.
+ * @return number of reads
+ */
+ public synchronized int getNumberOfReadsWithAlignments() {
+ return nReadsWithAlignments;
+ }
+
+ /**
+ * Get longest perfect kmer length.
+ * @return length longest perfect kmer, in bases
+ */
+ public synchronized int getLongestPerfectKmer() {
+ return longestPerfectKmer;
+ }
+
+ /**
+ * Store all perfect kmer sizes for later analysis.
+ * @param size size of kmer
+ */
+ public synchronized void addPerfectKmer(int size) {
+ if (size >= NanoOKOptions.MAX_KMER) {
+ System.out.println("Error: very unlikely situation with perfect kmer of size " + size + " (Max " + NanoOKOptions.MAX_KMER + ")");
+ System.exit(1);
+ }
+
+ perfectKmerCounts[size]++;
+
+ if (size > longestPerfectKmer) {
+ longestPerfectKmer = size;
+ }
+ }
+
+ /**
+ * Increment coverage between two points.
+ * @param start start position
+ * @param size size
+ */
+ public synchronized void addCoverage(int start, int size) {
+ cov.addCoverage(start, size);
+ //for (int i=start; i<(start+size); i++) {
+ // coverage[i]++;
+ //}
+ }
+
+ /**
+ * Store best perfect kmer length for each read.
+ * @param bestKmer length of best perfect kmer
+ */
+ public synchronized void addReadBestKmer(int bestKmer) {
+ readBestPerfectKmer[bestKmer]++;
+
+ for (int i=1; i<=bestKmer; i++) {
+ readCumulativeBestPerfectKmer[i]++;
+ }
+
+ nReadsWithAlignments++;
+ }
+
+ /**
+ * Write coverage file for later graph plotting.
+ * @param filename output filename
+ * @param binSize bin size
+ */
+ public void writeCoverageData(String filename, int binSize) {
+ cov.writeCoverageData(filename, binSize);
+// try {
+// PrintWriter pw = new PrintWriter(new FileWriter(filename));
+// for (int i=0; i<(size-binSize); i+=binSize) {
+// int count = 0;
+// for (int j=0; j<binSize; j++) {
+// count += coverage[i+j];
+// }
+// pw.printf("%d\t%.2f", i, ((double)count / (double)binSize));
+// pw.println("");
+// }
+// pw.close();
+// } catch (IOException e) {
+// System.out.println("writeCoverageData exception:");
+// e.printStackTrace();
+// System.exit(1);
+// }
+ }
+
+ /**
+ * Write data for perfect kmer histogram.
+ * @param filename output filename
+ */
+ public void writePerfectKmerHist(String filename) {
+ try {
+ PrintWriter pw = new PrintWriter(new FileWriter(filename));
+ for (int i=1; i<=longestPerfectKmer; i++) {
+ pw.printf("%d\t%d", i, perfectKmerCounts[i]);
+ pw.println("");
+ }
+ pw.close();
+ } catch (IOException e) {
+ System.out.println("writePerfectKmerHist exception:");
+ e.printStackTrace();
+ System.exit(1);
+ }
+ }
+
+ /**
+ * Write data for best perfect kmer histogram.
+ * @param filename output filename
+ */
+ public void writeBestPerfectKmerHist(String filename) {
+ try {
+ PrintWriter pw = new PrintWriter(new FileWriter(filename));
+ for (int i=1; i<=longestPerfectKmer; i++) {
+ double pc = 0;
+
+ if ((readBestPerfectKmer[i] > 0) && (nReadsWithAlignments > 0)) {
+ pc = ((double)100.0 * readBestPerfectKmer[i]) / (double)nReadsWithAlignments;
+ }
+
+ pw.printf("%d\t%d\t%.2f", i, readBestPerfectKmer[i], pc);
+ pw.println("");
+ }
+ pw.close();
+ } catch (IOException e) {
+ System.out.println("writeBestPerfectKmerHist exception:");
+ e.printStackTrace();
+ System.exit(1);
+ }
+ }
+
+ /**
+ * Write data for best perfect kmer cumulative histogram.
+ * @param filename output filename
+ */
+ public void writeBestPerfectKmerHistCumulative(String filename) {
+ int nr = 0;
+
+ for (int i=1; i<=longestPerfectKmer; i++) {
+ nr += readBestPerfectKmer[i];
+ }
+
+ if (nReadsWithAlignments != nr) {
+ System.out.println("Discrepancy: "+nr+" not equal to "+nReadsWithAlignments);
+ }
+
+ try {
+ PrintWriter pw = new PrintWriter(new FileWriter(filename));
+ for (int i=1; i<=longestPerfectKmer; i++) {
+ double pc = 0;
+
+ if ((readCumulativeBestPerfectKmer[i]> 0) && (nReadsWithAlignments > 0)){
+ pc = ((double)100.0 * readCumulativeBestPerfectKmer[i]) / (double)nr; //(double)nReadsWithAlignments;
+ }
+
+ pw.printf("%d\t%d\t%.2f", i, readCumulativeBestPerfectKmer[i], pc);
+ pw.println("");
+ }
+ pw.close();
+ } catch (IOException e) {
+ System.out.println("writeBestPerfectKmerHistCumulative exception:");
+ e.printStackTrace();
+ System.exit(1);
+ }
+ }
+ /**
+ * Write a line to the reference sequence summary file.
+ * @param pw PrintWriter object to write with
+ * @param format format string for output
+ */
+ public void writeSummary(PrintWriter pw, String format) {
+ pw.printf(format, name, size, nReadsWithAlignments, longestPerfectKmer);
+ pw.println("");
+ }
+
+ /**
+ * Get mean read length
+ * @return mean read length
+ */
+ public synchronized double getMeanReadLength() {
+ if (nReadsWithAlignments > 0) {
+ return (double)totalReadBases / (double)nReadsWithAlignments;
+ } else {
+ return 0.0;
+ }
+ }
+
+ /**
+ * Store alignment stats.
+     * @param querySize query size
+     * @param alignedSize number of aligned bases
+     * @param alignedSizeMinusIndels number of aligned bases excluding indels
+     * @param identicalBases number of identical bases
+     * @param hitStrand strand of the hit ("+" or "-")
+     * @param queryStrand strand of the query ("+" or "-")
+ */
+ public synchronized void addAlignmentStats(int querySize, int alignedSize, int alignedSizeMinusIndels, int identicalBases, String hitStrand, String queryStrand) {
+ totalAlignedBases += alignedSize;
+ //System.out.println("\nAlignedBases " + alignedSize);
+ totalAlignedBasesWithoutIndels += alignedSizeMinusIndels;
+ totalReadBases += querySize;
+ totalIdentical += identicalBases;
+
+ if (hitStrand.equals("+")) {
+ if (queryStrand.equals("+")) {
+ alignedPositiveStrand++;
+ } else if (queryStrand.equals("-")) {
+ alignedNegativeStrand++;
+ }
+ }
+ }
+
+ /**
+ * Store a deletion error.
+ * @param size - size of deletion
+ * @param kmer - kmer before error
+ * @param stats - ReadSetStats associated with the error
+ */
+ public synchronized void addDeletionError(int size, String kmer, ReadSetStats stats) {
+ //System.out.println("Delete " + size);
+ if (size >= MAX_INDEL) {
+ System.out.println("Error: indel much larger than expected ("+size+") - possible parsing error");
+ System.out.println("");
+ } else {
+ nDeletionErrors++;
+ nDeletedBases += size;
+ deletionSizes[size]++;
+ if (size > largestDeletion) {
+ largestDeletion = size;
+ }
+ stats.addDeletionError(size, kmer);
+ }
+ }
+
+ /**
+ * Store an insertion error.
+ * @param size - size of insertion
+ * @param kmer - kmer before error
+ * @param stats - ReadSetStats associated with the error
+ */
+ public synchronized void addInsertionError(int size, String kmer, ReadSetStats stats) {
+ //System.out.println("Insert " + size);
+ if (size >= MAX_INDEL) {
+ System.out.println("Error: indel much larger than expected ("+size+") - possible parsing error");
+ System.out.println("");
+ } else {
+ nInsertionErrors++;
+ nInsertedBases += size;
+ insertionSizes[size]++;
+ if (size > largestInsertion) {
+ largestInsertion = size;
+ }
+ stats.addInsertionError(size, kmer);
+ }
+ }
+
+ /**
+ * Get the mean deletion size
+ * @return size, as double
+ */
+ public synchronized double getMeanDeletionSize() {
+ return (nDeletionErrors == 0) ? 0.0 : (double)nDeletedBases / (double)nDeletionErrors;
+ }
+
+ /**
+ * Get the mean insertion size
+ * @return size, as double
+ */
+ public synchronized double getMeanInsertionSize() {
+ return (nInsertionErrors == 0) ? 0.0 : (double)nInsertedBases / (double)nInsertionErrors;
+ }
+
+ /**
+ * Store a substitution error.
+ * @param kmer - kmer before error
+ * @param refChar - reference base
+ * @param subChar - substituted base
+ * @param stats - ReadSetStats associated with the error
+ */
+ public synchronized void addSubstitutionError(String kmer, char refChar, char subChar, ReadSetStats stats) {
+ nSubstitutionErrors++;
+ //System.out.println("Kmer before substitution "+kmer);
+ stats.addSubstitutionError(kmer, refChar, subChar);
+ }
+
+ /**
+ * Get percent identity of aligned bases.
+ * @return identity
+ */
+ public synchronized double getAlignedPercentIdentical() {
+ if ((totalIdentical == 0) || (totalAlignedBases == 0)) {
+ return 0;
+ } else {
+ return (100.0 * totalIdentical) / totalAlignedBases;
+ }
+ }
+
+ /**
+ * Get percent identity of aligned bases, excluding indels.
+ * @return identity
+ */
+ public synchronized double getAlignedPercentIdenticalWithoutIndels() {
+ if ((totalIdentical == 0) || (totalAlignedBasesWithoutIndels == 0)) {
+ return 0;
+ } else {
+ return (100.0 * totalIdentical) / totalAlignedBasesWithoutIndels;
+ }
+ }
+
+ /**
+ * Get percent identity of read.
+ * @return identity
+ */
+ public synchronized double getReadPercentIdentical() {
+ if ((totalIdentical == 0) || (totalReadBases == 0)) {
+ return 0;
+ } else {
+ return (100.0 * totalIdentical) / totalReadBases;
+ }
+ }
+
+ /**
+ * Get number of insertion errors.
+ * @return number
+ */
+ public synchronized int getNumberOfInsertionErrors() {
+ return nInsertionErrors;
+ }
+
+ /**
+ * Get number of deletion errors.
+ * @return number
+ */
+ public synchronized int getNumberOfDeletionErrors() {
+ return nDeletionErrors;
+ }
+
+ /**
+ * Get number of substitution errors.
+ * @return number
+ */
+ public synchronized int getNumberOfSubstitutionErrors() {
+ return nSubstitutionErrors;
+ }
+
+ /**
+ * Get percentage of insertion errors
+ * @return percentage
+ */
+ public synchronized double getPercentInsertionErrors() {
+ if ((nInsertedBases == 0) || (totalAlignedBases == 0)) {
+ return 0;
+ } else {
+ return (100.0 * nInsertedBases) / (totalAlignedBases);
+ }
+ }
+
+ /**
+ * Get percentage of deletion errors
+ * @return percentage
+ */
+ public synchronized double getPercentDeletionErrors() {
+ if ((nDeletedBases == 0) || (totalAlignedBases == 0)) {
+ return 0;
+ } else {
+ return (100.0 * nDeletedBases) / (totalAlignedBases);
+ }
+ }
+
+ /**
+ * Get percentage of substitution errors
+ * @return percentage
+ */
+ public synchronized double getPercentSubstitutionErrors() {
+ if ((nSubstitutionErrors == 0) || (totalAlignedBases == 0)) {
+ return 0;
+ } else {
+ return (100.0 * nSubstitutionErrors) / (totalAlignedBases);
+ }
+ }
+
+ /**
+ * Get the number of aligned bases
+ * @return number of bases
+ */
+ public synchronized int getTotalAlignedBases() {
+ return totalAlignedBases;
+ }
+
+ /**
+ * Write a file of insertion stats for plotting.
+ * @param filename output filename
+ */
+ public void writeInsertionStats(String filename) {
+ try {
+ PrintWriter pw = new PrintWriter(new FileWriter(filename));
+ for (int i=1; i<=largestInsertion; i++) {
+ //pw.println(i + "\t" + insertionSizes[i]);
+ pw.printf("%d\t%.4f", i, (100.0 * (double)insertionSizes[i]/(double)nInsertionErrors));
+ pw.println("");
+ }
+ pw.close();
+ } catch (IOException e) {
+ System.out.println("writeInsertionStats exception:");
+ e.printStackTrace();
+ System.exit(1);
+ }
+ }
+
+ /**
+ * Write a file of deletion stats for plotting.
+ * @param filename output filename
+ */
+ public void writeDeletionStats(String filename) {
+ try {
+ PrintWriter pw = new PrintWriter(new FileWriter(filename));
+ for (int i=1; i<=largestDeletion; i++) {
+ //pw.println(i + "\t" + deletionSizes[i]);
+ pw.printf("%d\t%.4f", i, (100.0 * (double)deletionSizes[i]/(double)nDeletionErrors));
+ pw.println("");
+ }
+ pw.close();
+ } catch (IOException e) {
+ System.out.println("writeDeletionStats exception:");
+ e.printStackTrace();
+ System.exit(1);
+ }
+ }
+
+ /**
+ * Get percent of reads aligned on +ve strand
+ * @return percentage
+ */
+ public synchronized double getAlignedPositiveStrandPercent() {
+ if (alignedPositiveStrand > 0) {
+ return (100.0 * (double)alignedPositiveStrand)/(double)(alignedPositiveStrand + alignedNegativeStrand);
+ } else {
+ return 0;
+ }
+ }
+
+ /**
+ * Get percent of reads aligned on -ve strand
+ * @return percentage
+ */
+ public synchronized double getAlignedNegativeStrandPercent() {
+ if (alignedNegativeStrand > 0) {
+ return (100.0 * (double)alignedNegativeStrand)/(double)(alignedPositiveStrand + alignedNegativeStrand);
+ } else {
+ return 0;
+ }
+ }
+
+ public KmerTable getReadKmerTable() {
+ return readKmerTable;
+ }
+
+ public void addKmerAbundance(String kmer, double refAbundance, double readAbundance) {
+ kmerAbundance.add(new KmerAbundance(kmer, refAbundance, readAbundance));
+ }
+
+ public void sortKmerAbundance() {
+ Collections.sort(kmerAbundance);
+ }
+
+ public ArrayList getKmerAbundance() {
+ return kmerAbundance;
+ }
+}
diff --git a/src/nanook/References.java b/src/nanook/References.java
new file mode 100644
index 0000000..a48d80e
--- /dev/null
+++ b/src/nanook/References.java
@@ -0,0 +1,345 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett
+ *
+ * Copyright 2015 The Genome Analysis Centre (TGAC)
+ */
+
+package nanook;
+
+import java.io.*;
+import java.util.*;
+
+/**
+ * Represents the set of references (sequences) used for the analysis.
+ *
+ * @author Richard Leggett
+ */
+public class References implements Serializable {
+ private static final long serialVersionUID = NanoOK.SERIAL_VERSION;
+ private NanoOKOptions options;
+ private File sizesFile;
+ private Hashtable<String,ReferenceSequence> referenceSeqIds = new Hashtable<String,ReferenceSequence>();
+ private Hashtable<String,ReferenceSequence> referenceSeqNames = new Hashtable<String,ReferenceSequence>();
+ private int longestId = 0;
+ private OverallStats overallStats = null;
+
+ /**
+ * Constructor
+ * @param o a NanoOKOptions object
+ */
+ public References(NanoOKOptions o)
+ {
+ options = o;
+ }
+
+ public void setOverallStats(OverallStats s) {
+ overallStats = s;
+ }
+
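+ /**
+ * Locate (or generate) the .sizes index file for the reference and read it, creating a
+ * ReferenceSequence and an analysis directory for each entry. As parsed below, each line
+ * contains three tab-separated columns: sequence ID, size and display name; lines starting
+ * with "#" or "SequenceID" are treated as headers. Illustrative (hypothetical) line:
+ * "NC_001416.1\t48502\tLambda_phage".
+ */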
+ public void readSizesFile() {
+ sizesFile = new File(options.getReferenceFile()+".sizes");
+
+ if (sizesFile.exists()) {
+ System.out.println("Using .sizes file "+sizesFile.getName());
+ System.out.println("Note: if you have changed the reference file, you need to delete the .sizes file and re-run.\n");
+ } else {
+ int extensionIndex = options.getReferenceFile().lastIndexOf('.');
+ if (extensionIndex > 0) {
+ String minusExtension = options.getReferenceFile().substring(0, extensionIndex);
+ sizesFile = new File(minusExtension + ".sizes");
+ }
+ }
+
+ if (!sizesFile.exists()) {
+ System.out.println("Error: can't read sizes file.");
+ System.out.println("Generating .sizes file for reference. You may want to edit the display names.");
+ SequenceReader sr = new SequenceReader(false);
+ sr.indexFASTAFile(options.getReferenceFile(), options.getReferenceFile()+".sizes" , false);
+ sizesFile = new File(options.getReferenceFile()+".sizes");
+ }
+
+ System.out.println("Reading reference sizes and making directories");
+
+ try
+ {
+ BufferedReader br = new BufferedReader(new FileReader(sizesFile));
+ String line = br.readLine();
+ while (line != null) {
+ if (!line.startsWith("#") && (!line.startsWith("SequenceID"))) {
+ String[] values = line.split("\\t");
+ int size = Integer.parseInt(values[1]);
+
+ ReferenceSequence refSeqById = referenceSeqIds.get(values[0]);
+ if (refSeqById != null) {
+ System.out.println("Error: reference contig ID "+values[0]+" occurs more than once.");
+ System.exit(1);
+ }
+
+ ReferenceSequence refSeqByName = referenceSeqNames.get(values[2]);
+ if (refSeqByName != null) {
+ System.out.println("Error: reference contig name "+values[2]+" occurs more than once.");
+ System.exit(1);
+ }
+
+ System.out.println("\t" + values[2] + "\t" + size);
+
+ refSeqById = new ReferenceSequence(values[0], size, values[2]);
+ options.checkAndMakeReferenceAnalysisDir(refSeqById.getName());
+ referenceSeqIds.put(values[0], refSeqById);
+ referenceSeqNames.put(values[2], refSeqById);
+ refSeqById.openAlignmentSummaryFiles(options);
+
+ if (values[0].length() > longestId) {
+ longestId = values[0].length();
+ }
+ }
+
+ line = br.readLine();
+ }
+ br.close();
+ } catch (Exception e) {
+ System.out.println("NanotoolsReferences Exception:");
+ e.printStackTrace();
+ System.exit(1);
+ }
+ }
+
+ /**
+ * Read reference FASTA file
+ */
+ private void readReferenceFile() {
+ ReferenceSequence currentRef = null;
+ KmerTable refKmerTable = null;
+ GCCounter gcc = null;
+
+ try
+ {
+ BufferedReader br = new BufferedReader(new FileReader(options.getReferenceFile()));
+ String line;
+ String id = null;
+ String name = null;
+ String seq = "";
+ String previousKmerString = "";
+
+ System.out.println("");
+ System.out.println("Calculating reference GC");
+
+ do {
+ line = br.readLine();
+ if (line != null) {
+ line = line.trim();
+ }
+
+ // New ID
+ if ((line == null) || (line.startsWith(">"))) {
+ if (id != null) {
+ if (gcc != null) {
+ gcc.closeFile();
+ }
+ }
+
+ if (line != null) {
+ String[] parts = line.substring(1).split("(\\s+)");
+ id = parts[0];
+ currentRef = getReferenceById(id);
+ System.out.println("\t" + currentRef.getName());
+ refKmerTable = currentRef.getKmerTable();
+ gcc = new GCCounter(currentRef.getBinSize(), options.getAnalysisDir() + File.separator + currentRef.getName() + File.separator + currentRef.getName() + "_gc.txt");
+ }
+ }
+ // Continuing sequence read
+ else if ((line != null) && (currentRef != null)) {
+ if (!line.equals("")) {
+ String kmerSeq = previousKmerString + line;
+ int k = refKmerTable.getKmerSize();
+
+ // Store kmers
+ for (int o=0; o<kmerSeq.length() - k; o++) {
+ refKmerTable.countKmer(kmerSeq.substring(o, o+k));
+ }
+
+ // Store end k-1 bases for start of next kmer
+ if (line.length() > k) {
+ previousKmerString = line.substring(line.length() - k + 1);
+ } else {
+ previousKmerString = "";
+ }
+
+ // Now for GC graph
+ gcc.addString(line);
+ }
+ }
+ } while (line != null);
+
+ br.close();
+ } catch (Exception e) {
+ System.out.println("readFasta Exception:");
+ e.printStackTrace();
+ System.exit(1);
+ }
+
+ }
+
+ /**
+ * Load references
+ */
+ public void loadReferences() {
+ readSizesFile();
+ readReferenceFile();
+ }
+
+ /**
+ * Get a ReferenceSequence object from a sequence ID.
+ * @param id the sequence ID
+ * @return the matching ReferenceSequence
+ */
+ public ReferenceSequence getReferenceById(String id) {
+ ReferenceSequence r = referenceSeqIds.get(id);
+
+ if (r == null) {
+ System.out.println("");
+ System.out.println("Error: Couldn't find reference for "+id + ". This can occur if you have changed the refernce file, but not deleted the .sizes file associated with it. Try deleting reference.fasta.sizes and re-running.");
+ System.exit(1);
+ }
+
+ return r;
+ }
+
+ /**
+ * Return set of all reference sequence IDs.
+ * @return a String set
+ */
+ public Set<String> getAllIds() {
+ return referenceSeqIds.keySet();
+ }
+
+ /**
+ * Return a sorted list of all reference sequences.
+ * @return an ArrayList of ReferenceSequence objects
+ */
+ public ArrayList getSortedReferences() {
+ ArrayList sortedReferences = new ArrayList();
+ Set<String> keys = referenceSeqIds.keySet();
+
+ for(String id : keys) {
+ sortedReferences.add(referenceSeqIds.get(id));
+ }
+ Collections.sort(sortedReferences);
+
+ return sortedReferences;
+ }
+
+
+ /**
+ * Initiate writing of all statistics data files used to generate graphs.
+ * @param type a type, as defined in NanoOKOptions (for example TYPE_TEMPLATE)
+ */
+ public void writeReferenceStatFiles(int type) {
+ Set<String> keys = referenceSeqIds.keySet();
+
+ for(String id : keys) {
+ ReferenceSequence ref = referenceSeqIds.get(id);
+ ref.getStatsByType(type).writeCoverageData(options.getAnalysisDir() + File.separator + ref.getName() + File.separator + ref.getName() + "_" + options.getTypeFromInt(type) + "_coverage.txt", ref.getBinSize());
+ ref.getStatsByType(type).writePerfectKmerHist(options.getAnalysisDir() + File.separator + ref.getName() + File.separator + ref.getName() + "_" + options.getTypeFromInt(type) + "_all_perfect_kmers.txt");
+ ref.getStatsByType(type).writeBestPerfectKmerHist(options.getAnalysisDir() + File.separator + ref.getName() + File.separator + ref.getName() + "_" + options.getTypeFromInt(type) + "_best_perfect_kmers.txt");
+ ref.getStatsByType(type).writeBestPerfectKmerHistCumulative(options.getAnalysisDir() + File.separator + ref.getName() + File.separator + ref.getName() + "_" + options.getTypeFromInt(type) + "_cumulative_perfect_kmers.txt");
+ ref.getStatsByType(type).writeInsertionStats(options.getAnalysisDir() + File.separator + ref.getName() + File.separator + ref.getName() + "_" + options.getTypeFromInt(type) + "_insertions.txt");
+ ref.getStatsByType(type).writeDeletionStats(options.getAnalysisDir() + File.separator + ref.getName() + File.separator + ref.getName() + "_" + options.getTypeFromInt(type) + "_deletions.txt");
+ ref.writeKmerFile(type, options.getAnalysisDir() + File.separator + ref.getName() + File.separator + ref.getName() + "_" + options.getTypeFromInt(type) + "_kmers.txt");
+ }
+ }
+
+ /**
+ * Get the length of the longest ID - used for formatting output.
+ * @return length of longest sequence ID
+ */
+ public int getLongestIdLength() {
+ return longestId;
+ }
+
+ /**
+ * Get number of references.
+ * @return number of references
+ */
+ public int getNumberOfReferences() {
+ return referenceSeqIds.size();
+ }
+
+ /**
+ * Write reference summary text file.
+ * @param type type from NanoOKOptions
+ */
+ public void writeReferenceSummary(int type) {
+ try {
+ String filename = options.getAnalysisDir() + File.separator + "all_" + NanoOKOptions.getTypeFromInt(type) + "_alignment_summary.txt";
+ PrintWriter pw = new PrintWriter(new FileWriter(filename));
+ String formatString = "%-"+longestId+"s %-12s %-10s %-10s %-10s %-12s %-10s %-10s";
+ //pw.printf(formatString, "ID", "Size", "ReadsAlign", "PcReads", "MeanLen", "TotalBases", "MeanCov", "LongPerfKm");
+ pw.print("ID\tSize\tReadsAlign\tPcReads\tMeanLen\tTotalBases\tMeanCov\tLongPerfKm");
+ pw.println("");
+
+ //List<String> keys = new ArrayList<String>(referenceSeqIds.keySet());
+ //Collections.sort(keys);
+ //for(String id : keys) {
+ // referenceSeqIds.get(id).getStatsByType(type).writeSummary(pw, "%-"+longestId+"s %-12d %-10d %-10.2f %-10d");
+ //}
+
+ formatString = "%s\t%d\t%d\t%.2f\t%.2f\t%d\t%.2f\t%d";
+ ArrayList<ReferenceSequence> sortedRefs = getSortedReferences();
+ for (int i=0; i<sortedRefs.size(); i++) {
+ ReferenceSequence r = sortedRefs.get(i);
+ ReferenceSequenceStats refStats = r.getStatsByType(type);
+ pw.printf(formatString,
+ r.getName(),
+ r.getSize(),
+ refStats.getNumberOfReadsWithAlignments(),
+ 100.0 * (double)refStats.getNumberOfReadsWithAlignments() / (double)overallStats.getStatsByType(type).getNumberOfReads(),
+ refStats.getMeanReadLength(),
+ refStats.getTotalAlignedBases(),
+ (double)refStats.getTotalAlignedBases() / r.getSize(),
+ refStats.getLongestPerfectKmer());
+ pw.println("");
+ }
+
+
+ pw.close();
+ } catch (IOException e) {
+ System.out.println("writeReferenceSummary exception:");
+ e.printStackTrace();
+ System.exit(1);
+ }
+ }
+
+// /**
+// * Write reference summary to LaTeX report.
+// * @param type type from NanoOKOptions
+// * @param pw handle to LaTeX file
+// */
+// public void writeTexSummary(int type, PrintWriter pw) {
+// pw.println("\\begin{table}[H]");
+// pw.println("{\\footnotesize");
+// pw.println("\\fontsize{9pt}{11pt}\\selectfont");
+// pw.println("\\begin{tabular}{l c c c c c c c}");
+// pw.println(" & & {\\bf Number of} & {\\bf \\% of} & {\\bf Mean read} & {\\bf Aligned} & {\\bf Mean} & {\\bf Longest} \\\\");
+// pw.println("{\\bf ID} & {\\bf Size} & {\\bf Reads} & {\\bf Reads} & {\\bf length} & {\\bf bases} & {\\bf coverage} & {\\bf Perf Kmer} \\\\");
+// ArrayList<ReferenceSequence> sortedRefs = getSortedReferences();
+// for (int i=0; i<sortedRefs.size(); i++) {
+// ReferenceSequence r = sortedRefs.get(i);
+// ReferenceSequenceStats refStats = r.getStatsByType(type);
+// if ((sortedRefs.size() < 100) || (refStats.getNumberOfReadsWithAlignments() > 0)) {
+// pw.printf("%s & %d & %d & %.2f & %.2f & %d & %.2f & %d \\\\",
+// r.getName().replaceAll("_", " "),
+// r.getSize(),
+// refStats.getNumberOfReadsWithAlignments(),
+// 100.0 * (double)refStats.getNumberOfReadsWithAlignments() / (double)overallStats.getStatsByType(type).getNumberOfReads(),
+// refStats.getMeanReadLength(),
+// refStats.getTotalAlignedBases(),
+// (double)refStats.getTotalAlignedBases() / r.getSize(),
+// refStats.getLongestPerfectKmer());
+// pw.println("");
+// }
+// }
+// pw.println("\\end{tabular}");
+// pw.println("}");
+// pw.println("\\end{table}");
+// }
+}
diff --git a/src/nanook/SAMParser.java b/src/nanook/SAMParser.java
new file mode 100644
index 0000000..75f020e
--- /dev/null
+++ b/src/nanook/SAMParser.java
@@ -0,0 +1,236 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett
+ *
+ * Copyright 2015 The Genome Analysis Centre (TGAC)
+ */
+
+package nanook;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Hashtable;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Parser for SAM format files.
+ *
+ * @author Richard Leggett
+ */
+public abstract class SAMParser {
+ private NanoOKOptions options;
+ private References references;
+ private SampleReportWriter report;
+ private String programID = null;
+ ArrayList<Alignment> alignments;
+ private Hashtable<String,Integer> referenceSizes;
+ String leafName;
+
+ /**
+ * Constructor.
+ * @param o a NanoOKOptions object
+ * @param r the References object for this analysis
+ */
+ public SAMParser(NanoOKOptions o, References r) {
+ options = o;
+ references = r;
+ }
+
+ /**
+ * Get file extension of alignment files
+ * @return the file extension, ".sam"
+ */
+ public String getAlignmentFileExtension() {
+ return ".sam";
+ }
+
+
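+ /**
+ * Process an @SQ header tag from the SAM file, recording the reference name and length.
+ * Expected form (per the SAM specification): "@SQ SN:<name> LN:<length>", for example
+ * (illustrative sequence name) "@SQ SN:lambda LN:48502".
+ * @param s the @SQ header line
+ */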
+ private void processReferenceTag(String s) {
+ Pattern pattern = Pattern.compile("@SQ(\\s+)SN:(\\S+)(\\s+)LN:(\\S+)");
+ Matcher matcher = pattern.matcher(s);
+ if (matcher.find()) {
+ String refID = matcher.group(2);
+ int size = Integer.parseInt(matcher.group(4));
+ if (referenceSizes.containsKey(refID)) {
+ System.out.println("Warning: Reference "+refID+" already seen.");
+ } else {
+ referenceSizes.put(refID, size);
+ }
+ } else {
+ System.out.println("Warning: Badly formated tag: " + s);
+ }
+ }
+
+ /**
+ * Process @PG tag in SAM file
+ * @param s the @PG header line
+ */
+ private void processProgramTag(String s) {
+ Pattern pattern = Pattern.compile("(\\s+)ID:(\\S+)(\\s+)");
+ Matcher matcher = pattern.matcher(s);
+ if (matcher.find()) {
+ programID = matcher.group(2);
+ }
+ }
+
+ /**
+ * Process an alignment line from a SAM file.
+ * @param alignmentFile name of the SAM file being parsed
+ * @param s the line
+ * @param outputFilename filename to write the alignment to, in MAF format
+ * @param overallStats ReadSetStats for the read set
+ * @return an Alignment object, or null if the line could not be converted
+ */
+ private Alignment processAlignmentLine(String alignmentFile, String s, String outputFilename, ReadSetStats overallStats) {
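+ // The 11 mandatory tab-separated SAM columns are read positionally below:
+ // QNAME FLAG RNAME POS MAPQ CIGAR RNEXT PNEXT TLEN SEQ QUAL.
+ // POS is converted from SAM's 1-based coordinates to the 0-based coordinates
+ // used elsewhere in NanoOK.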
+ String[] cols = s.split("\t");
+ String queryName = cols[0];
+ int flags = Integer.parseInt(cols[1]);
+ String hitName = cols[2];
+ int hitStart = Integer.parseInt(cols[3]) - 1; // SAM is 1-based, Last and NanoOK 0-based
+ int mapQuality = Integer.parseInt(cols[4]);
+ String cigar = cols[5];
+ String rNext = cols[6];
+ int pNext = Integer.parseInt(cols[7]);
+ int tLen = Integer.parseInt(cols[8]);
+ String seq = cols[9];
+ String qual = cols[10];
+ boolean mapped = (flags & 0x04) == 0; // SAM flag 0x4 = segment unmapped
+ int queryStart;
+ Alignment al = null;
+
+ //System.out.println("Alignment file "+alignmentFile);
+ //System.out.println("CIGAR string "+cigar);
+
+ if (options.getAligner().equals("blasr")) {
+ queryName = cols[0].substring(0, cols[0].lastIndexOf("/"));
+ }
+
+ if (mapped) {
+ ReferenceSequence readReference = references.getReferenceById(hitName);
+ if (readReference != null) {
+ int readLength = overallStats.getReadLength(alignmentFile, queryName);
+ if (readLength != -1) {
+ CIGARString cs = new CIGARString(cigar, seq, leafName, queryName, hitStart, options.getReferenceFile(), readReference);
+ if (cs.processString()) {
+ //System.out.println("hitName "+hitName);
+ al = new Alignment(mapQuality,
+ queryName,
+ readLength,
+ cs.getQueryStart(),
+ cs.getQueryAlnSize(),
+ cs.getQueryString(),
+ hitName,
+ readReference.getSize(),
+ hitStart,
+ cs.getHitAlnSize(),
+ cs.getHitString(),
+ false);
+
+ // Check for reverse complement
+ if ((flags & 0x10) == 0x10) {
+ al.setQueryStrand("-");
+ }
+
+ al.writeMafFile(outputFilename);
+
+ }
+
+ } else {
+ System.out.println("Error: can't find read length for ["+queryName+"]");
+ System.exit(1);
+ }
+ } else {
+ System.out.println("");
+ System.out.println("Error: Couldn't find reference "+hitName);
+ }
+ }
+
+ return al;
+ }
+
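+ /**
+ * Parse a SAM file: @SQ and @PG header tags are recorded, each alignment line is
+ * converted to an Alignment object (and written in MAF format to filename + ".last"),
+ * and reads with no alignments are recorded in the non-aligned summary file.
+ * @param filename SAM file to parse
+ * @param nonAlignedSummaryFile an AlignmentsTableFile for reads without alignments
+ * @param overallStats ReadSetStats for the read set
+ * @return number of alignments parsed
+ */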
+ public int parseFile(String filename, AlignmentsTableFile nonAlignedSummaryFile, ReadSetStats overallStats) {
+ alignments = new ArrayList();
+ referenceSizes = new Hashtable();
+ leafName = new File(filename).getName();
+
+ // Read all alignments and put them into an ArrayList
+ try
+ {
+ options.getLog().println("Got file");
+ BufferedReader br = new BufferedReader(new FileReader(filename));
+ String line;
+
+ do {
+ line = br.readLine();
+ if (line != null) {
+ if (line.startsWith("@SQ")) {
+ processReferenceTag(line);
+ } else if (line.startsWith("@PG")) {
+ processProgramTag(line);
+ } else if (!line.startsWith("@")) {
+ options.getLog().println("Got line");
+ Alignment al = processAlignmentLine(filename, line, filename+".last", overallStats);
+ if (al != null) {
+ alignments.add(al);
+ }
+ options.getLog().println("Added");
+ }
+ }
+ } while (line != null);
+ br.close();
+
+ options.getLog().println("Finished file");
+
+ if (alignments.size() == 0) {
+ nonAlignedSummaryFile.writeNoAlignmentMessage(leafName);
+ overallStats.addReadWithoutAlignment();
+ }
+
+ } catch (Exception e) {
+ System.out.println("parseFile Exception:");
+ e.printStackTrace();
+ options.getLog().println("Exception parsing "+filename);
+ options.getLog().close();
+ System.exit(1);
+ }
+
+ options.getLog().println("Returning");
+
+ return alignments.size();
+ }
+
+ /**
+ * Sort alignments in order of score
+ */
+ public void sortAlignments() {
+ if (alignments.size() > 0) {
+ Collections.sort(alignments);
+ }
+ }
+
+ /**
+ * Get the set of alignments that match the highest scoring reference.
+ * @return an ArrayList of Alignment objects
+ */
+ public ArrayList getHighestScoringSet() {
+ ArrayList hss = new ArrayList();
+
+ if (alignments.size() > 0) {
+ String readReferenceName = alignments.get(0).getHitName();
+ ReferenceSequence readReference = references.getReferenceById(readReferenceName);
+ for (int i=0; i<alignments.size(); i++) {
+ Alignment a = alignments.get(i);
+ if (a.getHitName().equals(readReferenceName)) {
+ hss.add(a);
+ }
+ }
+ }
+
+ return hss;
+ }
+}
diff --git a/src/nanook/SampleChecker.java b/src/nanook/SampleChecker.java
new file mode 100644
index 0000000..32d241a
--- /dev/null
+++ b/src/nanook/SampleChecker.java
@@ -0,0 +1,281 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett (richard.leggett at earlham.ac.uk)
+ *
+ * Copyright 2015-17 Earlham Institute
+ */
+package nanook;
+
+import java.io.File;
+import java.util.ArrayList;
+
+public class SampleChecker {
+ private NanoOKOptions options;
+ private boolean haveChecked = false;
+ private boolean usingBarcodes = false;
+ private boolean usingBatchDirs = false;
+ private boolean usingPassFailDirs = false;
+
+ public SampleChecker(NanoOKOptions o) {
+ options = o;
+ }
+
+ private boolean dirExists(String dir) {
+ File d = new File(dir);
+ return d.exists();
+ }
+
+ private boolean checkIfDirHasSubdirs(String dir) {
+ File d = new File(dir);
+ File[] listOfFiles = d.listFiles();
+ boolean contains = false;
+
+ if (listOfFiles == null) {
+ contains = false;
+ } else if (listOfFiles.length <= 0) {
+ contains = false;
+ } else {
+ for (File file : listOfFiles) {
+ if (file.isDirectory()) {
+ contains = true;
+ break;
+ }
+ }
+ }
+
+ return contains;
+ }
+
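+ /**
+ * Inspect the subdirectories of dir to work out the layout in use: subdirectories named
+ * "BC*" or "barcode*" imply barcoded output, "batch_*" implies batched output, and any
+ * other subdirectory is assumed to be a batch directory. Example (hypothetical) layout:
+ * reads/pass/barcode01/batch_0/.
+ * @param dir directory to inspect
+ */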
+ private void checkForBarcodeAndBatch(String dir) {
+ File d = new File(dir);
+ File[] listOfFiles = d.listFiles();
+ boolean contains = false;
+ if (listOfFiles == null) {
+ contains = false;
+ } else if (listOfFiles.length <= 0) {
+ contains = false;
+ } else {
+ boolean foundSubDir = false;
+ for (File file : listOfFiles) {
+ if (file.isDirectory()) {
+ if (file.getName().startsWith("BC") || file.getName().startsWith("barcode")) {
+ usingBarcodes = true;
+ if (usingBatchDirs == false) {
+ if (checkIfDirHasSubdirs(file.getPath())) {
+ usingBatchDirs = true;
+ }
+ }
+ //checkForBarcodeAndBatch(file.getPath());
+ } else if (file.getName().startsWith("batch_")) {
+ usingBatchDirs = true;
+ break;
+ } else {
+ foundSubDir = true;
+ break;
+ }
+ }
+ }
+
+ if ((usingBarcodes == false) && (usingBatchDirs == false) && (foundSubDir == true)) {
+ System.out.println("Found subdirectory, assuming batched output");
+ usingBatchDirs = true;
+ }
+ }
+ }
+
+ private void showDirectoryType() {
+ System.out.println(" Using pass/fail dirs: " + (usingPassFailDirs?"yes":"no"));
+ System.out.println(" Using batch dirs: " + (usingBatchDirs?"yes":"no"));
+ System.out.println(" Using barcodes: " + (usingBarcodes?"yes":"no"));
+ System.out.println("");
+ }
+
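+ /**
+ * Check the FAST5 directory structure: looks for pass/fail subdirectories and then for
+ * barcode/batch subdirectories beneath them, exiting if the FAST5 directory is missing.
+ */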
+ public void checkFast5Directory() {
+ String passDir = options.getFast5Dir() + File.separator + "pass";
+ String failDir = options.getFast5Dir() + File.separator + "fail";
+
+ System.out.println("Checking FAST5 directory structure...");
+
+ File f = new File(options.getFast5Dir());
+ if (!f.exists()) {
+ System.out.println("Error: can't find FAST5 directory "+options.getFast5Dir());
+ System.exit(1);
+ }
+
+ if ((options.isProcessingPassReads()) && (dirExists(passDir))) {
+ usingPassFailDirs = true;
+ checkForBarcodeAndBatch(passDir);
+ } else if ((options.isProcessingFailReads()) && (dirExists(failDir))) {
+ usingPassFailDirs = true;
+ checkForBarcodeAndBatch(failDir);
+ } else {
+ checkForBarcodeAndBatch(options.getFast5Dir());
+ //File[] listOfFiles = f.listFiles();
+ //usingPassFailDirs = false;
+ //usingBatchDirs = false;
+ //for (File file : listOfFiles) {
+ // if (file.isDirectory()) {
+ // usingBatchDirs = true;
+ // break;
+ // }
+ //}
+ }
+
+ showDirectoryType();
+ }
+
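+ /**
+ * Check the FASTA/Q read directory structure. Looks for the MinKNOW 1.4.2+ layout of
+ * pass/fail directories containing 2D, Template or Complement subdirectories, and exits
+ * with an error if no recognised layout is found.
+ */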
+ public void checkReadDirectory() {
+ boolean gotOne = false;
+ ArrayList<String> al = new ArrayList<String>();
+
+ System.out.println("Checking FASTA/Q directory structure...");
+
+ File f = new File(options.getReadDir());
+ if (!f.exists()) {
+ System.out.println("Error: can't find read directory "+options.getReadDir());
+ System.exit(1);
+ }
+
+ // Check for MinKNOW 1.4.2 and above
+ if ((options.isProcessingPassReads()) && (options.isProcessing2DReads())) {
+ al.add(new String(options.getReadDir() + File.separator + "pass" + File.separator + "2D"));
+ }
+ if ((options.isProcessingPassReads()) && (options.isProcessingTemplateReads())) {
+ al.add(new String(options.getReadDir() + File.separator + "pass" + File.separator + "Template"));
+ }
+ if ((options.isProcessingPassReads()) && (options.isProcessingComplementReads())) {
+ al.add(new String(options.getReadDir() + File.separator + "pass" + File.separator + "Complement"));
+ }
+ if ((options.isProcessingFailReads()) && (options.isProcessing2DReads())) {
+ al.add(new String(options.getReadDir() + File.separator + "fail" + File.separator + "2D"));
+ }
+ if ((options.isProcessingFailReads()) && (options.isProcessingTemplateReads())) {
+ al.add(new String(options.getReadDir() + File.separator + "fail" + File.separator + "Template"));
+ }
+ if ((options.isProcessingFailReads()) && (options.isProcessingComplementReads())) {
+ al.add(new String(options.getReadDir() + File.separator + "fail" + File.separator + "Complement"));
+ }
+ for (int i=0; i<al.size(); i++) {
+ if (dirExists(al.get(i))) {
+ gotOne = true;
+ usingPassFailDirs = true;
+ checkForBarcodeAndBatch(al.get(i));
+ }
+ }
+
+ // No recognised MinKNOW 1.4.2+ layout found - older layouts are not handled here
+ if (gotOne == false) {
+ System.out.println("Error: FASTA/Q directory structure not understood.");
+ System.out.println("This may be because it was created with an earlier version of NanoOK.");
+ System.out.println("NanoOK now expects the following structures:");
+ System.out.println(" sampledir/fasta/pass/Template/*.fast5");
+ System.out.println(" or sampledir/fasta/pass/Template/batch_XXX/*.fast5");
+ System.out.println(" or sampledir/fasta/pass/Template/0/*.fast5");
+ System.out.println("etc.");
+ System.exit(0);
+ }
+
+ showDirectoryType();
+ }
+
+ public void checkReadDirectorOld() {
+ boolean gotOne = false;
+ ArrayList<String> al = new ArrayList<String>();
+
+ System.out.println("Checking FASTA/Q directory structure...");
+
+ File f = new File(options.getReadDir());
+ if (!f.exists()) {
+ System.out.println("Error: can't find read directory "+options.getReadDir());
+ System.exit(1);
+ }
+
+ // Check for MinKNOW 1.4.2 and above
+ if ((options.isProcessingPassReads()) && (options.isProcessing2DReads())) {
+ al.add(new String(options.getReadDir() + File.separator + "2D" + File.separator + "pass"));
+ }
+ if ((options.isProcessingPassReads()) && (options.isProcessingTemplateReads())) {
+ al.add(new String(options.getReadDir() + File.separator + "Template" + File.separator + "pass"));
+ }
+ if ((options.isProcessingPassReads()) && (options.isProcessingComplementReads())) {
+ al.add(new String(options.getReadDir() + File.separator + "Complement" + File.separator + "pass"));
+ }
+ if ((options.isProcessingFailReads()) && (options.isProcessing2DReads())) {
+ al.add(new String(options.getReadDir() + File.separator + "2D" + File.separator + "fail"));
+ }
+ if ((options.isProcessingFailReads()) && (options.isProcessingTemplateReads())) {
+ al.add(new String(options.getReadDir() + File.separator + "Template" + File.separator + "fail"));
+ }
+ if ((options.isProcessingFailReads()) && (options.isProcessingComplementReads())) {
+ al.add(new String(options.getReadDir() + File.separator + "Complement" + File.separator + "fail"));
+ }
+ for (int i=0; i<al.size(); i++) {
+ if (dirExists(al.get(i))) {
+ gotOne = true;
+ usingPassFailDirs = true;
+ checkForBarcodeAndBatch(al.get(i));
+ }
+ }
+
+ // MinKNOW pre 1.4.2 and after intro of pass/fail dirs
+ // Barcode dirs will only be for pass reads
+ if (gotOne == false) {
+ if ((options.isProcessingPassReads()) && (dirExists(options.getReadDir() + File.separator + "pass"))) {
+ gotOne = true;
+ usingBatchDirs = false;
+ usingPassFailDirs = true;
+ checkForBarcodeAndBatch(options.getReadDir() + File.separator + "pass");
+ } else if ((options.isProcessingFailReads()) && (dirExists(options.getReadDir() + File.separator + "fail"))) {
+ gotOne = true;
+ usingBatchDirs = false;
+ usingPassFailDirs = true;
+ }
+ }
+
+ // Albacore - we end up with sample/fasta/2D/0 etc.
+ if (gotOne == false) {
+ al.clear();
+ if (options.isProcessing2DReads()) {
+ al.add(new String(options.getReadDir() + File.separator + "2D"));
+ }
+ if (options.isProcessingTemplateReads()) {
+ al.add(new String(options.getReadDir() + File.separator + "Template"));
+ }
+ if (options.isProcessingComplementReads()) {
+ al.add(new String(options.getReadDir() + File.separator + "Complement"));
+ }
+ for (int i=0; i<al.size(); i++) {
+ if (dirExists(al.get(i))) {
+ gotOne = true;
+ checkForBarcodeAndBatch(al.get(i));
+ }
+ }
+ }
+
+ // Original - no pass/fail dirs, no barcodes, no batch
+ // Or Albacore - with separate directories
+ if (gotOne == false) {
+ usingBatchDirs = false;
+ usingPassFailDirs = false;
+ }
+
+ showDirectoryType();
+ }
+
+ public boolean haveChecked() {
+ return haveChecked;
+ }
+
+ public boolean usingBarcodes() {
+ return usingBarcodes;
+ }
+
+ public boolean usingBatchDirs() {
+ return usingBatchDirs;
+ }
+
+ public boolean usingPassFailDirs() {
+ return usingPassFailDirs;
+ }
+}
diff --git a/src/nanook/SampleComparer.java b/src/nanook/SampleComparer.java
new file mode 100644
index 0000000..78c555c
--- /dev/null
+++ b/src/nanook/SampleComparer.java
@@ -0,0 +1,154 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett
+ *
+ * Copyright 2015 The Genome Analysis Centre (TGAC)
+ */
+
+package nanook;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.InvalidClassException;
+import java.io.ObjectInputStream;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+
+/**
+ *
+ * @author Richard Leggett
+ */
+public class SampleComparer {
+ private NanoOKOptions options;
+ private ArrayList<String> sampleNames = new ArrayList<String>();
+ private ArrayList<OverallStats> sampleStats = new ArrayList<OverallStats>();
+
+ public SampleComparer(NanoOKOptions o) {
+ options = o;
+ }
+
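+ /**
+ * Load the serialised OverallStats for one sample from
+ * <sampleDir>/analysis<suffix>/OverallStats.ser and store it under its display name.
+ * @param sample sample directory
+ * @param name display name for the sample
+ */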
+ private void readSample(String sample, String name) {
+ try {
+ FileInputStream fis = new FileInputStream(sample + File.separator + "analysis" + options.getAnalysisSuffix() + File.separator + "OverallStats.ser");
+ ObjectInputStream ois = new ObjectInputStream(fis);
+ OverallStats os = (OverallStats)ois.readObject();
+ sampleNames.add(name);
+ sampleStats.add(os);
+ ois.close();
+ } catch (Exception e) {
+ if (e instanceof InvalidClassException) {
+ System.out.println("The saved data is incompatible with this version of NanoOK. You must re-run nanook analyse on all your samples before running compare.");
+ } else {
+ System.out.println("Exception trying to read object:");
+ e.printStackTrace();
+ }
+ System.exit(1);
+ }
+ }
+
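+ /**
+ * Read the sample list file: one sample per line, two tab-separated fields (sample
+ * directory, display name); a header line starting "SampleDir" is skipped. Illustrative
+ * (hypothetical) line: "/path/to/run1\tRun 1".
+ */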
+ public void loadSamples() {
+ try
+ {
+ BufferedReader br = new BufferedReader(new FileReader(options.getSampleList()));
+ String line;
+
+ do {
+ line = br.readLine();
+ if (line != null) {
+ if (!line.startsWith("SampleDir")) {
+ String[] fields = line.split("\t");
+ if (fields.length != 2) {
+ System.out.println("Error: invalid format for sample list file. This file should be two fields, tab separated.");
+ System.exit(1);
+ } else {
+ readSample(fields[0], fields[1]);
+ }
+ }
+ }
+ } while (line != null);
+ br.close();
+ } catch (Exception e) {
+ System.out.println("parseFile Exception:");
+ e.printStackTrace();
+ System.exit(1);
+ }
+ }
+
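+ /**
+ * For each read type being processed, write two tab-separated files to the comparison
+ * directory: <type>_comparison.txt (per-sample read length statistics) and
+ * <type>_map_summary.txt (percentage of reads aligning to each reference, plus unaligned).
+ */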
+ public void compareSamples() {
+ try {
+ for (int type = 0; type<3; type++) {
+ if (options.isProcessingReadType(type)) {
+ String filename = options.getComparisonDir() + File.separator + NanoOKOptions.getTypeFromInt(type) + "_comparison.txt";
+ PrintWriter pw = new PrintWriter(new FileWriter(filename, false));
+
+ pw.println("Name\tNumReads\tTotalBases\tMeanLen\tLongest\tShortest\tN50\tN50Count\tN90\tN90Count");
+
+ for (int i=0; i<sampleStats.size(); i++) {
+ String name = sampleNames.get(i);
+ OverallStats overallStats = sampleStats.get(i);
+ ReadSetStats r = overallStats.getStatsByType(type);
+
+ pw.printf("%s\t%d\t%d\t%.2f\t%d\t%d\t%d\t%d\t%d\t%d",
+ name, r.getNumReads(), r.getTotalBases(), r.getMeanLength(), r.getLongest(), r.getShortest(), r.getN50(), r.getN50Count(), r.getN90(), r.getN90Count());
+ pw.println("");
+ }
+
+ pw.close();
+
+ filename = options.getComparisonDir() + File.separator + NanoOKOptions.getTypeFromInt(type) + "_map_summary.txt";
+ pw = new PrintWriter(new FileWriter(filename, false));
+ References refs = sampleStats.get(0).getStatsByType(type).getOptions().getReferences();
+ ArrayList<ReferenceSequence> sortedRefs = refs.getSortedReferences();
+ pw.print("Sample");
+ for (int i=0; i<sortedRefs.size(); i++) {
+ ReferenceSequence rs = sortedRefs.get(i);
+ pw.print("\t" + rs.getName());
+ }
+ pw.println("\tUnaligned");
+ for (int i=0; i<sampleStats.size(); i++) {
+ String name = sampleNames.get(i);
+ OverallStats overallStats = sampleStats.get(i);
+ pw.print(name);
+ for (int j=0; j<sortedRefs.size(); j++) {
+ ReferenceSequence rs = overallStats.getStatsByType(type).getOptions().getReferences().getReferenceById(sortedRefs.get(j).getId());
+ double value = 0.0;
+
+ if (rs.getStatsByType(type).getNumberOfReadsWithAlignments() > 0) {
+ value = 100.0 * (double)rs.getStatsByType(type).getNumberOfReadsWithAlignments() / (double)overallStats.getStatsByType(type).getNumberOfReads();
+ }
+
+ pw.printf("\t%.4f", value);
+ }
+
+ double value = 0;
+ if (overallStats.getStatsByType(type).getNumberOfReadsWithoutAlignments() > 0) {
+ value = 100.0 * (double)overallStats.getStatsByType(type).getNumberOfReadsWithoutAlignments() / (double)overallStats.getStatsByType(type).getNumberOfReads();
+ }
+ pw.printf("\t%.4f", value);
+ pw.println("");
+ }
+ pw.close();
+ }
+ }
+ } catch (IOException e) {
+ System.out.println("AlignmentsTableFile exception");
+ e.printStackTrace();
+ }
+ }
+
+ public int getNumberOfSamples() {
+ return sampleStats.size();
+ }
+
+ public OverallStats getSample(int i) {
+ return sampleStats.get(i);
+ }
+
+ public String getSampleName(int i) {
+ return sampleNames.get(i);
+ }
+}
diff --git a/src/nanook/SampleReportWriter.java b/src/nanook/SampleReportWriter.java
new file mode 100644
index 0000000..a27d76c
--- /dev/null
+++ b/src/nanook/SampleReportWriter.java
@@ -0,0 +1,822 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett
+ *
+ * Copyright 2015 The Genome Analysis Centre (TGAC)
+ */
+
+package nanook;
+
+import java.io.*;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Writes a LaTeX report file.
+ *
+ * @author Richard Leggett
+ */
+public class SampleReportWriter {
+ private static final int LONGTABLE_THRESHOLD = 25;
+ private NanoOKOptions options;
+ private References references;
+ private OverallStats overallStats;
+ private PrintWriter pw;
+ private String sample;
+
+ /**
+ * Constructor.
+ * @param o a NanoOKOptions object
+ * @param s overall statistics
+ */
+ public SampleReportWriter(NanoOKOptions o, OverallStats s) {
+ options = o;
+ references = options.getReferences();
+ overallStats = s;
+ sample = o.getSample().replaceAll("_", "\\\\_");
+ }
+
+ /**
+ * Open the .tex file.
+ */
+ public void open() {
+ try {
+ pw = new PrintWriter(new FileWriter(options.getTexFilename()));
+ writeLaTeXHeader();
+ } catch (IOException e) {
+ System.out.println("ReportWriter exception:");
+ e.printStackTrace();
+ System.exit(1);
+ }
+ }
+
+ /**
+ * Close the .tex file.
+ */
+ public void close() {
+ writeLaTeXFooter();
+ pw.close();
+ }
+
+ /**
+ * Write the top of the LaTeX document.
+ */
+ private void writeLaTeXHeader() {
+ pw.println("\\documentclass[a4paper,11pt,oneside]{article}");
+ pw.println("\\usepackage{graphicx}");
+ pw.println("\\usepackage{url}");
+ pw.println("\\usepackage{multirow}");
+ pw.println("\\usepackage{rotating}");
+ pw.println("\\usepackage{color}");
+ pw.println("\\usepackage[compact]{titlesec}");
+ pw.println("\\usepackage[portrait,top=1cm, bottom=2cm, left=1cm, right=1cm]{geometry}");
+ pw.println("\\usepackage{float}");
+ if (references.getNumberOfReferences() >= LONGTABLE_THRESHOLD) {
+ pw.println("\\usepackage{longtable}");
+ }
+ pw.println("\\restylefloat{table}");
+ pw.println("\\begin{document}");
+ pw.println("\\renewcommand*{\\familydefault}{\\sfdefault}");
+ pw.println("\\normalfont");
+ pw.println("\\section*{\\large{NanoOK report for " + sample + "}}");
+ }
+
+ /**
+ * Add the pass/fail section
+ */
+ public void addPassFailSection() {
+ if (options.usingPassFailDirs()) {
+ pw.println("\\subsection*{Pass and fail counts}");
+ pw.println("\\vspace{-3mm}");
+ pw.println("\\begin{table}[H]");
+ pw.println("{\\footnotesize");
+ pw.println("\\fontsize{9pt}{11pt}\\selectfont");
+ pw.println("\\begin{tabular}{l c c}");
+ pw.println("{\\bf Type} & {\\bf Pass} & {\\bf Fail} \\\\");
+
+ for (int type = 0; type<3; type++) {
+ if (options.isProcessingReadType(type)) {
+ ReadSetStats r = overallStats.getStatsByType(type);
+ pw.printf("%s & %d & %d \\\\", r.getTypeString(), r.getNumberOfPassFiles(), r.getNumberOfFailFiles());
+ pw.println("");
+ }
+ }
+
+ pw.println("\\end{tabular}");
+ pw.println("}");
+ pw.println("\\end{table}");
+ }
+ }
+
+ /**
+ * Add the read lengths section.
+ */
+ public void addLengthsSection() {
+ String graphWidth = "width=.3\\linewidth";
+
+ if (options.getNumberOfTypes() == 1) {
+ graphWidth = "width=.4\\linewidth";
+ }
+
+ pw.println("\\subsection*{Read lengths}");
+ pw.println("\\vspace{-3mm}");
+
+ pw.println("\\begin{table}[H]");
+ pw.println("{\\footnotesize");
+ pw.println("\\fontsize{9pt}{11pt}\\selectfont");
+ pw.println("\\begin{tabular}{l c c c c c c c c c}");
+ pw.println("{\\bf Type} & {\\bf NumReads} & {\\bf TotalBases} & {\\bf Mean} & {\\bf Longest} & {\\bf Shortest} & {\\bf N50} & {\\bf N50Count} & {\\bf N90} & {\\bf N90Count} \\\\");
+
+ for (int type = 0; type<3; type++) {
+ if (options.isProcessingReadType(type)) {
+ ReadSetStats r = overallStats.getStatsByType(type);
+ pw.printf("%s & %d & %d & %.2f & %d & %d & %d & %d & %d & %d \\\\", r.getTypeString(), r.getNumReads(), r.getTotalBases(), r.getMeanLength(), r.getLongest(), r.getShortest(), r.getN50(), r.getN50Count(), r.getN90(), r.getN90Count());
+ pw.println("");
+ }
+ }
+
+ pw.println("\\end{tabular}");
+ pw.println("}");
+ pw.println("\\end{table}");
+ pw.println("\\vspace{-10mm}");
+ pw.println("\\begin{figure}[H]");
+ pw.println("\\centering");
+
+
+
+ includeGraphicsIfExists(NanoOKOptions.TYPE_TEMPLATE, "\\includegraphics["+graphWidth+"]{", options.getGraphsDir() + File.separator + "all_Template_lengths", "}");
+ includeGraphicsIfExists(NanoOKOptions.TYPE_COMPLEMENT, "\\includegraphics["+graphWidth+"]{", options.getGraphsDir() + File.separator + "all_Complement_lengths", "}");
+ includeGraphicsIfExists(NanoOKOptions.TYPE_2D, "\\includegraphics["+graphWidth+"]{", options.getGraphsDir() + File.separator + "all_2D_lengths", "}");
+
+
+ //pw.println("\\includegraphics[width=.3\\linewidth]{" + options.getGraphsDir() + File.separator + "all_Template_lengths.pdf}");
+ //pw.println("\\includegraphics[width=.3\\linewidth]{" + options.getGraphsDir() + File.separator + "all_Complement_lengths.pdf}");
+ //pw.println("\\includegraphics[width=.3\\linewidth]{" + options.getGraphsDir() + File.separator + "all_2D_lengths.pdf}");
+ pw.println("\\end{figure}");
+
+ }
+
+ /**
+ * Write the alignments section to the report.
+ * @param stats a ReadSetStats object
+ */
+ public void writeAlignmentsSection(ReadSetStats stats) {
+ //if ((stats.getTypeString() == "Template") || (references.getNumberOfReferences() > 8)) {
+ // pw.println("\\clearpage");
+ //}
+ pw.println("\\subsection*{" + stats.getTypeString() + " alignments}");
+ pw.println("\\vspace{-3mm}");
+ pw.println("\\begin{table}[H]");
+ pw.println("{\\footnotesize");
+ pw.println("\\fontsize{9pt}{11pt}\\selectfont");
+ pw.println("\\begin{tabular}{l c c}");
+ pw.println("Number of reads & " + stats.getNumberOfReads() + " & \\\\");
+ pw.printf("Number of reads with alignments & %d & (%.2f\\%%) \\\\", stats.getNumberOfReadsWithAlignments(), stats.getPercentOfReadsWithAlignments());
+ pw.println("");
+ pw.printf("Number of reads without alignments & %d & (%.2f\\%%) \\\\", stats.getNumberOfReadsWithoutAlignments(), stats.getPercentOfReadsWithoutAlignments());
+ pw.println("");
+ pw.println("\\end{tabular}");
+ pw.println("}");
+ pw.println("\\end{table}");
+ pw.println("\\vspace{-10mm}");
+ }
+
+ /**
+ * Check if a graphic file exists and only include it if it does.
+ * @param type read type, as defined in NanoOKOptions (for example TYPE_TEMPLATE)
+ * @param preTex LaTeX before filename
+ * @param filename the file (without extension)
+ * @param postTex LaTeX after filename
+ */
+ private void includeGraphicsIfExists(int type, String preTex, String filename, String postTex) {
+ if (options.isProcessingReadType(type)) {
+ String fullFilename = filename + "." + options.getImageFormat();
+ File f = new File(fullFilename);
+
+ if (f.exists()) {
+ pw.print(preTex);
+ pw.print(fullFilename);
+ pw.println(postTex);
+ } else {
+ System.out.println("Can't find " + fullFilename);
+ pw.print(" ");
+ }
+ }
+ }
+
+ /**
+ * Write a section for a reference sequence.
+ * @param refSeq reference to write
+ */
+ public void writeReferenceSection(ReferenceSequence refSeq) {
+ String id = refSeq.getName().replaceAll("_", " ");
+ String[] lines = new String[10];
+ String newLineTag=" \\\\";
+ String graphSize;
+
+ if (options.getNumberOfTypes() == 1) {
+ newLineTag = "";
+ }
+
+ pw.println("\\subsection*{" + id + " error analysis}");
+ pw.println("\\vspace{-3mm}");
+ pw.println("\\begin{table}[H]");
+ pw.println("{\\footnotesize");
+ pw.println("\\fontsize{9pt}{11pt}\\selectfont");
+ pw.println("\\begin{tabular}{l c c c}");
+
+
+ lines[0] = "";
+ lines[1] = "Overall base identity (excluding indels)";
+ lines[2] = "Aligned base identity (excluding indels)";
+ lines[3] = "Identical bases per 100 aligned bases (including indels)";
+ lines[4] = "Inserted bases per 100 aligned bases (including indels)";
+ lines[5] = "Deleted bases per 100 aligned bases (including indels)";
+ lines[6] = "Substitutions per 100 aligned bases (including indels)";
+ lines[7] = "Mean insertion size";
+ lines[8] = "Mean deletion size";
+ for (int type=0; type<3; type++) {
+ if (options.isProcessingReadType(type)) {
+ lines[0] += " & " + NanoOKOptions.getTypeFromInt(type);
+ lines[1] += String.format(" & %.2f\\%%", refSeq.getStatsByType(type).getReadPercentIdentical());
+ lines[2] += String.format(" & %.2f\\%%", refSeq.getStatsByType(type).getAlignedPercentIdenticalWithoutIndels());
+ lines[3] += String.format(" & %.2f\\%%", refSeq.getStatsByType(type).getAlignedPercentIdentical());
+ lines[4] += String.format(" & %.2f\\%%", refSeq.getStatsByType(type).getPercentInsertionErrors());
+ lines[5] += String.format(" & %.2f\\%%", refSeq.getStatsByType(type).getPercentDeletionErrors());
+ lines[6] += String.format(" & %.2f\\%%", refSeq.getStatsByType(type).getPercentSubstitutionErrors());
+ lines[7] += String.format(" & %.2f", refSeq.getStatsByType(type).getMeanInsertionSize());
+ lines[8] += String.format(" & %.2f", refSeq.getStatsByType(type).getMeanDeletionSize());
+ }
+ }
+
+ for (int i=0; i<=8; i++) {
+ lines[i] += " \\\\";
+ pw.println(lines[i]);
+ }
+
+ pw.println("\\end{tabular}");
+ pw.println("}");
+ pw.println("\\end{table}");
+
+ if (options.getNumberOfTypes() == 1) {
+ graphSize = "width=.4\\linewidth";
+ } else {
+ graphSize = "height=3.5cm";
+ }
+
+ pw.println("\\vspace{-5mm}");
+ pw.println("\\begin{figure}[H]");
+ pw.println("\\centering");
+ includeGraphicsIfExists(NanoOKOptions.TYPE_TEMPLATE, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + refSeq.getName() + File.separator + refSeq.getName() + "_Template_insertions", "}");
+ includeGraphicsIfExists(NanoOKOptions.TYPE_COMPLEMENT, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + refSeq.getName() + File.separator + refSeq.getName() + "_Complement_insertions", "}");
+ includeGraphicsIfExists(NanoOKOptions.TYPE_2D, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + refSeq.getName() + File.separator + refSeq.getName() + "_2D_insertions", "}"+newLineTag);
+ includeGraphicsIfExists(NanoOKOptions.TYPE_TEMPLATE, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + refSeq.getName() + File.separator + refSeq.getName() + "_Template_deletions", "}");
+ includeGraphicsIfExists(NanoOKOptions.TYPE_COMPLEMENT, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + refSeq.getName() + File.separator + refSeq.getName() + "_Complement_deletions", "}");
+ includeGraphicsIfExists(NanoOKOptions.TYPE_2D, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + refSeq.getName() + File.separator + refSeq.getName() + "_2D_deletions", "}");
+ pw.println("\\end{figure}");
+
+ pw.println("\\subsection*{" + id + " read identity}");
+ pw.println("\\vspace{-3mm}");
+ pw.println("\\begin{figure}[H]");
+ pw.println("\\centering");
+ includeGraphicsIfExists(NanoOKOptions.TYPE_TEMPLATE, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + refSeq.getName() + File.separator + refSeq.getName() + "_Template_length_vs_identity_hist", "}");
+ includeGraphicsIfExists(NanoOKOptions.TYPE_COMPLEMENT, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + refSeq.getName() + File.separator + refSeq.getName() + "_Complement_length_vs_identity_hist", "}");
+ includeGraphicsIfExists(NanoOKOptions.TYPE_2D, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + refSeq.getName() + File.separator + refSeq.getName() + "_2D_length_vs_identity_hist", "}"+newLineTag);
+ includeGraphicsIfExists(NanoOKOptions.TYPE_TEMPLATE, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + refSeq.getName() + File.separator + refSeq.getName() + "_Template_length_vs_identity_scatter", "}");
+ includeGraphicsIfExists(NanoOKOptions.TYPE_COMPLEMENT, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + refSeq.getName() + File.separator + refSeq.getName() + "_Complement_length_vs_identity_scatter", "}");
+ includeGraphicsIfExists(NanoOKOptions.TYPE_2D, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + refSeq.getName() + File.separator + refSeq.getName() + "_2D_length_vs_identity_scatter", "}"+newLineTag);
+ includeGraphicsIfExists(NanoOKOptions.TYPE_TEMPLATE, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + refSeq.getName() + File.separator + refSeq.getName() + "_Template_read_fraction_vs_alignment_identity_scatter", "}");
+ includeGraphicsIfExists(NanoOKOptions.TYPE_COMPLEMENT, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + refSeq.getName() + File.separator + refSeq.getName() + "_Complement_read_fraction_vs_alignment_identity_scatter", "}");
+ includeGraphicsIfExists(NanoOKOptions.TYPE_2D, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + refSeq.getName() + File.separator + refSeq.getName() + "_2D_read_fraction_vs_alignment_identity_scatter", "}");
+ if (options.getNumberOfTypes() > 1) {
+ pw.println("\\end{figure}");
+ pw.println("\\begin{figure}[H]");
+ pw.println("\\centering");
+ }
+ includeGraphicsIfExists(NanoOKOptions.TYPE_TEMPLATE, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + refSeq.getName() + File.separator + refSeq.getName() + "_Template_percent_aligned_vs_length_scatter", "}");
+ includeGraphicsIfExists(NanoOKOptions.TYPE_COMPLEMENT, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + refSeq.getName() + File.separator + refSeq.getName() + "_Complement_percent_aligned_vs_length_scatter", "}");
+ includeGraphicsIfExists(NanoOKOptions.TYPE_2D, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + refSeq.getName() + File.separator + refSeq.getName() + "_2D_percent_aligned_vs_length_scatter", "}");
+ pw.println("\\end{figure}");
+
+ if (options.getNumberOfTypes() == 1) {
+ graphSize = "width=.4\\linewidth";
+ } else {
+ graphSize = "height=3.5cm";
+ }
+
+ pw.println("\\subsection*{" + id + " perfect kmers}");
+ pw.println("\\vspace{-3mm}");
+ pw.println("\\begin{figure}[H]");
+ pw.println("\\centering");
+ includeGraphicsIfExists(NanoOKOptions.TYPE_TEMPLATE, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + refSeq.getName() + File.separator + refSeq.getName() + "_Template_cumulative_perfect_kmers", "}");
+ includeGraphicsIfExists(NanoOKOptions.TYPE_COMPLEMENT, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + refSeq.getName() + File.separator + refSeq.getName() + "_Complement_cumulative_perfect_kmers", "}");
+ includeGraphicsIfExists(NanoOKOptions.TYPE_2D, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + refSeq.getName() + File.separator + refSeq.getName() + "_2D_cumulative_perfect_kmers", "}");
+ includeGraphicsIfExists(NanoOKOptions.TYPE_TEMPLATE, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + refSeq.getName() + File.separator + refSeq.getName() + "_Template_best_perfect_kmers", "}");
+ includeGraphicsIfExists(NanoOKOptions.TYPE_COMPLEMENT, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + refSeq.getName() + File.separator + refSeq.getName() + "_Complement_best_perfect_kmers", "}");
+ includeGraphicsIfExists(NanoOKOptions.TYPE_2D, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + refSeq.getName() + File.separator + refSeq.getName() + "_2D_best_perfect_kmers", "}");
+ includeGraphicsIfExists(NanoOKOptions.TYPE_TEMPLATE, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + refSeq.getName() + File.separator + refSeq.getName() + "_Template_longest_perfect_vs_length_scatter", "}");
+ includeGraphicsIfExists(NanoOKOptions.TYPE_COMPLEMENT, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + refSeq.getName() + File.separator + refSeq.getName() + "_Complement_longest_perfect_vs_length_scatter", "}");
+ includeGraphicsIfExists(NanoOKOptions.TYPE_2D, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + refSeq.getName() + File.separator + refSeq.getName() + "_2D_longest_perfect_vs_length_scatter", "}");
+ pw.println("\\end{figure}");
+
+ if (options.getNumberOfTypes() == 1) {
+ graphSize = "width=.7\\linewidth";
+ } else {
+ graphSize = "height=2cm";
+ }
+
+ pw.println("\\subsection*{" + id + " coverage}");
+ pw.println("\\vspace{-3mm}");
+ pw.println("\\begin{figure}[H]");
+ pw.println("\\centering");
+ includeGraphicsIfExists(NanoOKOptions.TYPE_TEMPLATE, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + refSeq.getName() + File.separator + refSeq.getName() + "_Template_coverage", "} \\\\");
+ includeGraphicsIfExists(NanoOKOptions.TYPE_COMPLEMENT, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + refSeq.getName() + File.separator + refSeq.getName() + "_Complement_coverage", "} \\\\");
+ includeGraphicsIfExists(NanoOKOptions.TYPE_2D, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + refSeq.getName() + File.separator + refSeq.getName() + "_2D_coverage", "} \\\\");
+ includeGraphicsIfExists(NanoOKOptions.TYPE_ALL, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + refSeq.getName() + File.separator + refSeq.getName() + "_gc", "}");
+ pw.println("\\end{figure}");
+
+ if (options.getNumberOfTypes() == 1) {
+ pw.println("\\clearpage");
+ }
+
+ if (options.getNumberOfTypes() == 1) {
+ graphSize = "width=.7\\linewidth";
+ } else {
+ graphSize = "height=8cm";
+ }
+
+ pw.println("\\subsection*{" + id + " 5-mer analysis}");
+
+ String[] overRepLines = new String[10];
+ String[] underRepLines = new String[10];
+ for (int i=0; i<10; i++) {
+ overRepLines[i] = Integer.toString(i+1);
+ underRepLines[i] = Integer.toString(i+1);
+ for (int type=0; type<3; type++) {
+ if (options.isProcessingReadType(type)) {
+ if (i == 0) {
+ refSeq.getStatsByType(type).sortKmerAbundance();
+ }
+
+ ArrayList<KmerAbundance> ka = refSeq.getStatsByType(type).getKmerAbundance();
+ KmerAbundance ko = ka.get(i);
+ KmerAbundance ku = ka.get(ka.size() - 1 - i);
+ overRepLines[i] += String.format(" & %s & %.3f & %.3f & %.3f", ko.getKmer(), ko.getRefAbundance(), ko.getReadAbundance(), ko.getDifference());
+ underRepLines[i] += String.format(" & %s & %.3f & %.3f & %.3f", ku.getKmer(), ku.getRefAbundance(), ku.getReadAbundance(), ku.getDifference());
+ }
+ }
+ overRepLines[i] += " \\\\";
+ underRepLines[i] += " \\\\";
+ }
+
+ pw.println("\\subsection*{Under-represented 5-mers}");
+ pw.println("\\vspace{-3mm}");
+ writeKmerTable(underRepLines);
+ pw.println("\\vspace{-3mm}");
+ pw.println("\\subsection*{Over-represented 5-mers}");
+ pw.println("\\vspace{-3mm}");
+ writeKmerTable(overRepLines);
+ pw.println("\\vspace{-8mm}");
+
+ pw.println("\\begin{figure}[H]");
+ pw.println("\\centering");
+ includeGraphicsIfExists(NanoOKOptions.TYPE_TEMPLATE, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + refSeq.getName() + File.separator + refSeq.getName() + "_Template_kmer_scatter", "} ");
+ includeGraphicsIfExists(NanoOKOptions.TYPE_COMPLEMENT, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + refSeq.getName() + File.separator + refSeq.getName() + "_Complement_kmer_scatter", "} \\\\");
+ includeGraphicsIfExists(NanoOKOptions.TYPE_2D, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + refSeq.getName() + File.separator + refSeq.getName() + "_2D_kmer_scatter", "} \\\\");
+ pw.println("\\end{figure}");
+
+ if (options.getNumberOfTypes() == 1) {
+ graphSize = "width=.4\\linewidth";
+ } else {
+ graphSize = "height=3.5cm";
+ }
+ pw.println("\\subsection*{" + id + " GC content}");
+ includeGraphicsIfExists(NanoOKOptions.TYPE_TEMPLATE, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + refSeq.getName() + File.separator + refSeq.getName() + "_Template_GC_hist", "}");
+ includeGraphicsIfExists(NanoOKOptions.TYPE_COMPLEMENT, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + refSeq.getName() + File.separator + refSeq.getName() + "_Complement_GC_hist", "}");
+ includeGraphicsIfExists(NanoOKOptions.TYPE_2D, "\\includegraphics["+graphSize+"]{", options.getGraphsDir() + File.separator + refSeq.getName() + File.separator + refSeq.getName() + "_2D_GC_hist", "}");
+ }
+
+ private void writeKmerTable(String[] lines) {
+ pw.println("\\begin{table}[H]");
+ pw.println("{\\footnotesize");
+ pw.println("\\fontsize{7pt}{9pt}\\selectfont");
+ pw.print("\\begin{tabular}{|c");
+ int colCount = 1;
+ for (int type=0; type<3; type++) {
+ if (options.isProcessingReadType(type)) {
+ pw.print("|c c c c");
+ colCount+=4;
+ }
+ }
+ pw.println("|}");
+ pw.println("\\cline{1-"+colCount+"}");
+ for (int type=0; type<3; type++) {
+ if (options.isProcessingReadType(type)) {
+ pw.print(" & \\multicolumn{4}{c|}{" + NanoOKOptions.getTypeFromInt(type) + "}");
+ }
+ }
+ pw.println(" \\\\");
+ pw.print("Rank");
+ for (int type=0; type<3; type++) {
+ if (options.isProcessingReadType(type)) {
+ pw.print(" & kmer & Ref \\% & Read \\% & Diff \\%");
+ }
+ }
+ pw.println(" \\\\");
+ pw.println("\\cline{1-"+colCount+"}");
+
+ for (int i=0; i<10; i++) {
+ pw.println(lines[i]);
+ }
+ pw.println("\\cline{1-"+colCount+"}");
+ pw.println("\\end{tabular}");
+ pw.println("}");
+ pw.println("\\end{table}");
+ }
+
+ /**
+     * Write Top 10 or Bottom 10 motif section.
+     * @param listType either TYPE_TOP or TYPE_BOTTOM
+     * @param k kmer size
+     * @param colCount number of columns in the enclosing table
+ */
+ public void writeMotifRange(int listType, int k, int colCount) {
+ ArrayList<Map.Entry<String, Double>>[] insertionMotifs = new ArrayList[3];
+ ArrayList<Map.Entry<String, Double>>[] deletionMotifs = new ArrayList[3];
+ ArrayList<Map.Entry<String, Double>>[] substitutionMotifs = new ArrayList[3];
+        String logoTypeString = "Unknown";
+
+ for (int type=0; type<3; type++) {
+ if (options.isProcessingReadType(type)) {
+ String typeString;
+ if (listType == KmerMotifStatistic.TYPE_TOP) {
+ typeString = overallStats.getStatsByType(type).getTypeString() + "_top";
+ } else if (listType == KmerMotifStatistic.TYPE_BOTTOM) {
+ typeString = overallStats.getStatsByType(type).getTypeString() + "_bottom";
+ } else {
+ typeString = overallStats.getStatsByType(type).getTypeString() + "_unknown";
+ }
+
+ insertionMotifs[type] = overallStats.getStatsByType(type).getMotifStatistics().getSortedInsertionMotifPercentages(k);
+ deletionMotifs[type] = overallStats.getStatsByType(type).getMotifStatistics().getSortedDeletionMotifPercentages(k);
+ substitutionMotifs[type] = overallStats.getStatsByType(type).getMotifStatistics().getSortedSubstitutionMotifPercentages(k);
+
+ overallStats.getStatsByType(type).getMotifStatistics().writeInsertionLogoImage(listType, options.getGraphsDir() + File.separator + "motifs" + File.separator + "logo_insertion_" + typeString + "_k" + k + ".png", k);
+ overallStats.getStatsByType(type).getMotifStatistics().writeDeletionLogoImage(listType, options.getGraphsDir() + File.separator + "motifs" + File.separator + "logo_deletion_" + typeString + "_k" + k + ".png", k);
+ overallStats.getStatsByType(type).getMotifStatistics().writeSubstitutionLogoImage(listType, options.getGraphsDir() + File.separator + "motifs" + File.separator + "logo_substitution_" + typeString + "_k" + k + ".png", k);
+ }
+ }
+
+ for (int i=0; i<10; i++) {
+ if (listType == KmerMotifStatistic.TYPE_TOP) {
+ pw.print(i+1);
+ } else {
+ pw.print("-"+(10-i));
+ }
+ for (int type=0; type<3; type++) {
+ if (options.isProcessingReadType(type)) {
+ int insertionPos = i;
+ int deletionPos = i;
+ int substitutionPos = i;
+
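+                    // For the bottom-10 list, index from the end of each sorted motif list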
+ if (listType == KmerMotifStatistic.TYPE_BOTTOM) {
+ insertionPos = insertionMotifs[type].size() - 10 + i;
+ deletionPos = deletionMotifs[type].size() - 10 + i;
+ substitutionPos = substitutionMotifs[type].size() - 10 + i;
+ }
+
+ if ((insertionMotifs[type].size() > insertionPos) && (insertionPos >=0)) {
+ pw.printf(" & %s (%.2f\\%%)", insertionMotifs[type].get(insertionPos).getKey(), insertionMotifs[type].get(insertionPos).getValue());
+ } else {
+ pw.print(" &");
+ }
+
+ if ((deletionMotifs[type].size() > deletionPos) && (deletionPos >=0)) {
+ pw.printf(" & %s (%.2f\\%%)", deletionMotifs[type].get(deletionPos).getKey(), deletionMotifs[type].get(deletionPos).getValue());
+ } else {
+ pw.print(" &");
+ }
+
+ if ((substitutionMotifs[type].size() > substitutionPos) && (substitutionPos >=0)) {
+ pw.printf(" & %s (%.2f\\%%)", substitutionMotifs[type].get(substitutionPos).getKey(), substitutionMotifs[type].get(substitutionPos).getValue());
+ } else {
+ pw.print(" &");
+ }
+ }
+ }
+
+ if (i == 0) {
+ if (listType == KmerMotifStatistic.TYPE_TOP) {
+ pw.print(" & \\multirow{10}{*}{\\rotatebox[origin=c]{90}{Most common}}");
+ } else if (listType == KmerMotifStatistic.TYPE_BOTTOM) {
+ pw.print(" & \\multirow{10}{*}{\\rotatebox[origin=c]{90}{Least common}}");
+ } else {
+ pw.print(" & \\multirow{10}{*}{\\rotatebox[origin=c]{90}{Unknown}}");
+ }
+ }
+ pw.println("\\\\");
+ }
+
+ pw.println("\\cline{1-"+colCount+"}");
+ pw.println("\\rule{0pt}{0.6cm}");
+ pw.print(" ");
+
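+        // Finish with a row of sequence logo images (insertion, deletion, substitution) for each processed read type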
+ for (int type=0; type<3; type++) {
+ if (options.isProcessingReadType(type)) {
+ String typeString;
+ if (listType == KmerMotifStatistic.TYPE_TOP) {
+ typeString = overallStats.getStatsByType(type).getTypeString() + "_top";
+ } else if (listType == KmerMotifStatistic.TYPE_BOTTOM) {
+ typeString = overallStats.getStatsByType(type).getTypeString() + "_bottom";
+ } else {
+ typeString = overallStats.getStatsByType(type).getTypeString() + "_unknown";
+ }
+
+ pw.print(" & \\includegraphics[height=0.5cm]{" + options.getGraphsDir()+File.separator + "motifs" + File.separator + "logo_insertion_" + typeString + "_k" + k + ".png}");
+ pw.print(" & \\includegraphics[height=0.5cm]{" + options.getGraphsDir()+File.separator + "motifs" + File.separator + "logo_deletion_" + typeString + "_k" + k + ".png}");
+ pw.print(" & \\includegraphics[height=0.5cm]{" + options.getGraphsDir()+File.separator + "motifs" + File.separator + "logo_substitution_" + typeString + "_k" + k + ".png}");
+ }
+ }
+
+ pw.println(" \\\\");
+ }
+
+ /**
+ * Write motif section of report.
+ */
+ public void writeMotifSection() {
+ pw.println("\\subsection*{Kmer motifs before errors}");
+
+ for (int k=3; k<=5; k++) {
+ int colCount = 1;
+
+ pw.println("\\subsection*{"+k+"-mer error motif analysis}");
+ pw.println("\\vspace{-3mm}");
+ pw.println("\\begin{table}[H]");
+ pw.println("{\\footnotesize");
+ pw.println("\\fontsize{6pt}{8pt}\\selectfont");
+ pw.println("\\tabcolsep=0.15cm");
+ pw.print("\\begin{tabular}{|c");
+ for (int type=0; type<3; type++) {
+ if (options.isProcessingReadType(type)) {
+ pw.print("|c c c");
+ colCount+=3;
+ }
+ }
+ pw.println("|c}");
+ pw.println("\\cline{1-"+colCount+"}");
+ //pw.println("& \\multicolumn{3}{c|}{Template} & \\multicolumn{3}{c|}{Complement} & \\multicolumn{3}{c|}{2D} & \\\\");
+ for (int type=0; type<3; type++) {
+ if (options.isProcessingReadType(type)) {
+ pw.print(" & \\multicolumn{3}{c|}{" + NanoOKOptions.getTypeFromInt(type) + "}");
+ }
+ }
+ pw.println(" & \\\\");
+ //pw.println("Rank & Insertion & Deletion & Substitution & Insertion & Deletion & Substitution & Insertion & Deletion & Substitution & \\\\");
+ pw.print("Rank");
+ for (int type=0; type<3; type++) {
+ if (options.isProcessingReadType(type)) {
+ pw.print(" & Insertion & Deletion & Substitution");
+ }
+ }
+ pw.println(" & \\\\");
+ pw.println("\\cline{1-"+colCount+"}");
+ writeMotifRange(KmerMotifStatistic.TYPE_TOP, k, colCount);
+ pw.println("\\cline{1-"+colCount+"}");
+ writeMotifRange(KmerMotifStatistic.TYPE_BOTTOM, k, colCount);
+ pw.println("\\cline{1-"+colCount+"}");
+ pw.println("\\end{tabular}");
+ pw.println("}");
+ pw.println("\\end{table}");
+ pw.println("\\vspace{-9mm}");
+ pw.printf("{\\fontsize{8}{8}\\textsf{Kmer space for %d-mers: %d \\hspace{5mm} Random chance for any given %d-mer: %.2f\\%%}}", k, (int)Math.pow(4, k), k, 100.0/Math.pow(4, k));
+ pw.println("");
+ pw.println("\\vspace{5mm}");
+ }
+ }
+
+ /**
+ * Convert integer (0, 1, 2, 3) to base (A, C, G, T)
+ * @param i number to convert
+ * @return base character
+ */
+ private char intToBase(int i) {
+ char c;
+
+ switch(i) {
+ case 0: c = 'A'; break;
+ case 1: c = 'C'; break;
+ case 2: c = 'G'; break;
+ case 3: c = 'T'; break;
+ default: c = 'N'; break;
+ }
+
+ return c;
+ }
+
+ /**
+ * Write section to report on substitution errors.
+ */
+ public void writeSubstitutionErrorsSection()
+ {
+ pw.println("\\subsection*{All reference substitutions}");
+ pw.println("\\vspace{-3mm}");
+
+ pw.println("\\begin{table}[H]");
+ pw.println("{\\footnotesize");
+ pw.println("\\fontsize{8pt}{10pt}\\selectfont");
+ //pw.println("\\begin{tabular}{|c c|c c c c|c c c c|c c c c|}");
+ pw.print("\\begin{tabular}{|c c");
+ for (int type=0; type<3; type++) {
+ if (options.isProcessingReadType(type)) {
+ pw.print("|c c c c");
+ }
+ }
+ pw.println("|}");
+ pw.println("\\hline");
+ //pw.println(" & & \\multicolumn{4}{c|}{Template substituted \\%} & \\multicolumn{4}{c|}{Complement substituted \\%} & \\multicolumn{4}{c|}{2D substituted \\%} \\\\");
+ pw.print(" &");
+ for (int type=0; type<3; type++) {
+ if (options.isProcessingReadType(type)) {
+ pw.print(" & \\multicolumn{4}{c|}{" + NanoOKOptions.getTypeFromInt(type) + " substituted \\%}");
+ }
+ }
+ pw.println(" \\\\");
+ //pw.println(" & & a & c & g & t & a & c & g & t & a & c & g & t \\\\");
+ pw.print(" &");
+ for (int type=0; type<3; type++) {
+ if (options.isProcessingReadType(type)) {
+ pw.print(" & a & c & g & t");
+ }
+ }
+ pw.println(" \\\\");
+
+ pw.println("\\hline");
+
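+        // One row per reference base; each processed read type contributes four columns of substitution percentages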
+ for (int r=0; r<4; r++) {
+ if (r == 0) {
+ pw.print("\\multirow{4}{*}{\\rotatebox[origin=c]{90}{Reference}} & ");
+ } else {
+ pw.print(" & ");
+ }
+ pw.print(intToBase(r));
+ for (int type=0; type<3; type++) {
+ if (options.isProcessingReadType(type)) {
+ int subs[][] = overallStats.getStatsByType(type).getSubstitutionErrors();
+ double nSubstitutions = (double)overallStats.getStatsByType(type).getNumberOfSubstitutions();
+ for (int s=0; s<4; s++) {
+ double pc = (100.0 * (double)subs[r][s]) / nSubstitutions;
+ pw.printf(" & %.2f", pc);
+ }
+ }
+ }
+ pw.println("\\\\");
+ }
+ pw.println("\\hline");
+ pw.println("\\end{tabular}");
+ pw.println("}");
+ pw.println("\\end{table}");
+ }
+
+ private void writeOverallKmerSection() {
+ String graphWidth = "width=.3\\linewidth";
+
+ if (options.getNumberOfTypes() == 1) {
+ graphWidth = "width=.5\\linewidth";
+ }
+
+ pw.println("\\subsection*{All reference 21mer analysis}");
+ pw.println("\\vspace{-3mm}");
+ pw.println("\\begin{figure}[H]");
+ pw.println("\\centering");
+ includeGraphicsIfExists(NanoOKOptions.TYPE_TEMPLATE, "\\includegraphics["+graphWidth+"]{", options.getGraphsDir() + File.separator + "all_Template_21mers", "}");
+ includeGraphicsIfExists(NanoOKOptions.TYPE_COMPLEMENT, "\\includegraphics["+graphWidth+"]{", options.getGraphsDir() + File.separator + "all_Complement_21mers", "}");
+ includeGraphicsIfExists(NanoOKOptions.TYPE_2D, "\\includegraphics["+graphWidth+"]{", options.getGraphsDir() + File.separator + "all_2D_21mers", "}");
+ pw.println("\\end{figure}");
+ }
+
+ /**
+     * Add a section for each reference sequence with sufficient alignments.
+ */
+ public void addAllReferenceSections() {
+ ArrayList<ReferenceSequence> sortedRefs = references.getSortedReferences();
+ for (int i=0; i<sortedRefs.size(); i++) {
+ ReferenceSequence rs = sortedRefs.get(i);
+
+ if (rs.getTotalNumberOfAlignments() > NanoOKOptions.MIN_ALIGNMENTS) {
+ if ((options.getNumberOfTypes() > 1) || (references.getNumberOfReferences() > 1)) {
+ pw.println("\\clearpage");
+ }
+
+ writeReferenceSection(rs);
+ }
+ }
+ }
+
+ /**
+ * Write end of LaTeX file.
+ */
+ private void writeLaTeXFooter() {
+ pw.println("\\end{document}");
+ }
+
+ /**
+ * Get handle to PrintWriter.
+ * @return a PrintWriter object
+ */
+ public PrintWriter getPrintWriter() {
+ return pw;
+ }
+
+ /**
+ * Write the LaTeX report.
+ */
+ public void writeReport() {
+ open();
+ addPassFailSection();
+ addLengthsSection();
+
+ for (int type=0; type<3; type++) {
+ if (options.isProcessingReadType(type)) {
+ writeAlignmentsSection(overallStats.getStatsByType(type));
+ // references.writeReferenceStatFiles(type);
+ // references.writeReferenceSummary(type);
+ writeAlignmentSummary(type, pw);
+ }
+ }
+
+ addAllReferenceSections();
+ //Set<String> ids = references.getAllIds();
+ //for (String id : ids) {
+ // writeReferenceSection(references.getReferenceById(id));
+ //}
+
+ if ((options.getNumberOfTypes() > 1) || (references.getNumberOfReferences() > 1)) {
+ pw.println("\\clearpage");
+ }
+ writeOverallKmerSection();
+ writeSubstitutionErrorsSection();
+ writeMotifSection();
+
+ writeLaTeXFooter();
+ close();
+ }
+
+ /**
+     * Write per-reference alignment summary table to LaTeX report.
+ * @param type type from NanoOKOptions
+ * @param pw handle to LaTeX file
+ */
+ public void writeAlignmentSummary(int type, PrintWriter pw) {
+ if (references.getNumberOfReferences() < LONGTABLE_THRESHOLD) {
+ pw.println("\\begin{table}[H]");
+ }
+ pw.println("{\\footnotesize");
+ if (references.getNumberOfReferences() < LONGTABLE_THRESHOLD) {
+ pw.println("\\fontsize{9pt}{11pt}\\selectfont");
+ pw.println("\\begin{tabular}{l c c c c c c c}");
+ } else {
+ pw.println("\\begin{longtable}[l]{l c c c c c c c}");
+ }
+ pw.println(" & & {\\bf Number of} & {\\bf \\% of} & {\\bf Mean read} & {\\bf Aligned} & {\\bf Mean} & {\\bf Longest} \\\\");
+ pw.println("{\\bf ID} & {\\bf Size} & {\\bf Reads} & {\\bf Reads} & {\\bf length} & {\\bf bases} & {\\bf coverage} & {\\bf Perf Kmer} \\\\");
+ ArrayList<ReferenceSequence> sortedRefs = references.getSortedReferences();
+ for (int i=0; i<sortedRefs.size(); i++) {
+ ReferenceSequence r = sortedRefs.get(i);
+ ReferenceSequenceStats refStats = r.getStatsByType(type);
+ if ((sortedRefs.size() < 100) || (refStats.getNumberOfReadsWithAlignments() > 0)) {
+ pw.printf("%s & %d & %d & %.2f & %.2f & %d & %.2f & %d \\\\",
+ r.getName().replaceAll("_", " "),
+ r.getSize(),
+ refStats.getNumberOfReadsWithAlignments(),
+ 100.0 * (double)refStats.getNumberOfReadsWithAlignments() / (double)overallStats.getStatsByType(type).getNumberOfReads(),
+ refStats.getMeanReadLength(),
+ refStats.getTotalAlignedBases(),
+ (double)refStats.getTotalAlignedBases() / r.getSize(),
+ refStats.getLongestPerfectKmer());
+ pw.println("");
+ }
+ }
+ if (references.getNumberOfReferences() < LONGTABLE_THRESHOLD) {
+ pw.println("\\end{tabular}");
+ } else {
+ pw.println("\\end{longtable}");
+ }
+ pw.println("}");
+ if (references.getNumberOfReferences() < LONGTABLE_THRESHOLD) {
+ pw.println("\\end{table}");
+ }
+ }
+
+
+ public void makePDF() {
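+        // Run pdflatex non-interactively on the generated .tex file and capture its output in the logs directory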
+ ProcessLogger pl = new ProcessLogger();
+ String command = "pdflatex -interaction=nonstopmode -output-directory " +options.getLatexDir() + " " + options.getLatexDir() + File.separator + options.getSample() + ".tex";
+ String logFilename = options.getLogsDir() + File.separator + "pdflatex_output_log" + options.getAnalysisSuffix() + ".txt";
+ System.out.println("pdflatex output " + logFilename);
+ pl.runAndLogCommand(command, logFilename, false);
+ }
+}
diff --git a/src/nanook/SequenceCoverage.java b/src/nanook/SequenceCoverage.java
new file mode 100644
index 0000000..e1610f1
--- /dev/null
+++ b/src/nanook/SequenceCoverage.java
@@ -0,0 +1,139 @@
+package nanook;
+
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.Serializable;
+
+/**
+ * Represent reference coverage
+ *
+ * @author Richard Leggett
+ */
+public class SequenceCoverage implements Serializable {
+ private static final long serialVersionUID = NanoOK.SERIAL_VERSION;
+ private int[] coverage;
+ private int numBins = 1000;
+ private int genomeSize = 0;
+ private int binSize = 1;
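+    // binEarly: accumulate counts per bin as coverage is added (saves memory on large sequences) rather than per base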
+ private boolean binEarly = false;
+
+ public SequenceCoverage(int s) {
+ genomeSize = s;
+
+        // Aim for a bin size giving roughly 100 bins across the sequence
+        float b = genomeSize / 100.0f;
+
+ // Make a multiple of 10, 100 or 500...
+ if (genomeSize < 50000) {
+ binSize = 10 * (1 + Math.round(b / 10));
+ } else if (genomeSize < 500000) {
+ binSize = 100 * (1 + Math.round(b / 100));
+ } else {
+ binSize = 500 * (1 + Math.round(b / 500));
+ }
+
+ //binSize=50;
+
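+        // Number of bins needed to cover the whole sequence at the chosen bin size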
+ numBins = (int) Math.ceil(genomeSize / (double)binSize);
+
+
+ // Bin early for large genomes
+ if (genomeSize < 10000000) {
+ binEarly = false;
+ } else {
+ binEarly = true;
+ }
+
+ // Force this for now
+ binEarly = true;
+
+ if (binEarly) {
+ coverage = new int[numBins];
+ } else {
+ coverage = new int[genomeSize];
+ }
+ }
+
+ /**
+ * Increment coverage between two points.
+ * @param start start position
+     * @param size number of bases to increment
+ */
+ public synchronized void addCoverage(int start, int size) {
+ for (int i=start; i<(start+size); i++) {
+ if (binEarly) {
+ int b = i/binSize;
+ if (b < numBins) {
+ coverage[b]++;
+ }
+ } else {
+ if (i < genomeSize) {
+ coverage[i]++;
+ }
+ }
+ }
+ }
+
+ /**
+ * Write coverage file for later graph plotting.
+ * @param filename output filename
+     * @param pbinSize bin size
+ */
+ private synchronized void binAndWriteCoverageData(String filename, int pbinSize) {
+ try {
+ PrintWriter pw = new PrintWriter(new FileWriter(filename));
+ for (int i=0; i<(genomeSize-pbinSize); i+=pbinSize) {
+ int count = 0;
+ for (int j=0; j<pbinSize; j++) {
+ count += coverage[i+j];
+ }
+ pw.printf("%d\t%.2f", i, ((double)count / (double)pbinSize));
+ pw.println("");
+ }
+ pw.close();
+ } catch (IOException e) {
+ System.out.println("writeCoverageData exception:");
+ e.printStackTrace();
+ System.exit(1);
+ }
+ }
+
+ /**
+     * Write pre-binned coverage data for later graph plotting.
+     * @param filename output filename
+ */
+ private synchronized void writeBinnedCoverageData(String filename) {
+ try {
+ PrintWriter pw = new PrintWriter(new FileWriter(filename));
+ for (int i=0; i<numBins-1; i++) {
+ double c = (double)coverage[i] / (double)binSize;
+ if (i == (numBins - 1)) {
+ c = (double)coverage[i] / (double)(genomeSize - (i*binSize));
+ }
+ pw.printf("%d\t%.2f", i*binSize, c);
+ pw.println("");
+ }
+ pw.close();
+ } catch (IOException e) {
+ System.out.println("writeCoverageData exception:");
+ e.printStackTrace();
+ System.exit(1);
+ }
+ }
+
+ /**
+ * Write coverage file for later graph plotting.
+ * @param filename output filename
+ * @param binSize bin size
+ */
+ public synchronized void writeCoverageData(String filename, int binSize) {
+ if (binEarly) {
+ writeBinnedCoverageData(filename);
+ } else {
+ binAndWriteCoverageData(filename, binSize);
+ }
+ }
+
+}
diff --git a/src/nanook/SequenceLogo.java b/src/nanook/SequenceLogo.java
new file mode 100644
index 0000000..5786306
--- /dev/null
+++ b/src/nanook/SequenceLogo.java
@@ -0,0 +1,133 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett
+ *
+ * Copyright 2015 The Genome Analysis Centre (TGAC)
+ */
+
+package nanook;
+
+import java.awt.Color;
+import java.awt.Font;
+import java.awt.FontMetrics;
+import java.awt.Graphics;
+import java.awt.Graphics2D;
+import java.awt.geom.AffineTransform;
+import java.awt.image.BufferedImage;
+import java.io.File;
+import javax.imageio.ImageIO;
+
+/**
+ * Create sequence logo (for error motifs etc.)
+ *
+ * @author Richard Leggett
+ */
+public class SequenceLogo {
+ private BufferedImage bImage;
+ private int size = 0;
+ private double[][] counts;
+ private String[] bases = {"A", "C", "G", "T"};
+ private Color[] baseColours = {Color.GREEN, Color.BLUE, Color.YELLOW, Color.RED};
+ private int imageWidth = 0;
+ private int imageHeight = 0;
+ private int charWidth = 0;
+ private int charHeight = 0;
+
+ /**
+ * Constructor
+     * @param s size of the logo (in bases)
+ */
+ public SequenceLogo(int s) {
+ size = s;
+ counts = new double[4][size];
+ }
+
+ /**
+ * Debugging constructor
+ */
+ public SequenceLogo() {
+ this(6);
+ this.addBase(0, 25, 25, 25, 25);
+ this.addBase(1, 25, 25, 25, 25);
+ this.addBase(2, 50, 0, 0, 50);
+ this.addBase(3, 100, 0, 0, 0);
+ this.addBase(4, 10, 10, 30, 50);
+ this.addBase(5, 33, 33, 0, 34);
+ }
+
+ /**
+ * Set relative counts at a given position in the logo.
+ * @param position position (0-offset)
+ * @param a number of As
+ * @param c number of Cs
+ * @param g number of Gs
+ * @param t number of Ts
+ */
+ public void addBase(int position, int a, int c, int g, int t) {
+ if (position < size) {
+ counts[0][position] = (double)a / (double)(a + c + g + t);
+ counts[1][position] = (double)c / (double)(a + c + g + t);
+ counts[2][position] = (double)g / (double)(a + c + g + t);
+ counts[3][position] = (double)t / (double)(a + c + g + t);
+ } else {
+ System.out.println("Warning: bad index passed to addBase.");
+ }
+ }
+
+ /**
+ * Draw the logo image.
+ */
+ public void drawImage() {
+ // Create temporary image to work out sizing
+ bImage = new BufferedImage(100, 100, BufferedImage.TYPE_INT_RGB);
+ Graphics2D g = bImage.createGraphics();
+ AffineTransform stretch;
+ Font f = new Font("Arial", Font.BOLD, 40);
+ FontMetrics metrics = g.getFontMetrics(f);
+ g.setFont(f);
+ charWidth = metrics.charWidth('G');
+        charHeight = metrics.charWidth('G');   // height approximated by the width of 'G' (treats the glyph as roughly square)
+
+ // Re-create image at right size
+ imageWidth = size * charWidth;
+ imageHeight = charHeight*4;
+ bImage = new BufferedImage(imageWidth, imageHeight, BufferedImage.TYPE_INT_RGB);
+ g = bImage.createGraphics();
+ g.setFont(f);
+ g.setColor(Color.WHITE);
+ g.fillRect(0, 0, imageWidth, imageHeight);
+ //System.out.println("imagesize " + imageWidth + ", " + imageHeight);
+
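+        // For each position, stack the four bases from the bottom up, each scaled vertically by its relative frequency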
+ for (int i=0; i<size; i++) {
+ double drawY = (double)imageHeight;
+ for (int j=0; j<4; j++) {
+ if (counts[j][i] > 0.0) {
+ double yStretch = counts[j][i] * 4;
+ int drawX = i * charWidth;
+ stretch = AffineTransform.getScaleInstance(1.0, yStretch);
+ g.setTransform(stretch);
+ g.setColor(baseColours[j]);
+ //System.out.println(bases[j] + " at "+drawX+", "+drawY+" with stretch "+yStretch);
+ g.drawString(bases[j], drawX, (int)(drawY / yStretch));
+ drawY -= (yStretch * (double)charHeight);
+ }
+ }
+ //System.out.println("");
+ }
+
+ }
+
+ /**
+ * Save the logo as an image.
+ * @param filename output filename
+ */
+ public void saveImage(String filename) {
+ try {
+ ImageIO.write(bImage, "PNG", new File(filename));
+ }
+ catch (Exception e)
+ {
+ System.out.println(e);
+ }
+ }
+}
diff --git a/src/nanook/SequenceReader.java b/src/nanook/SequenceReader.java
new file mode 100644
index 0000000..c20c76d
--- /dev/null
+++ b/src/nanook/SequenceReader.java
@@ -0,0 +1,319 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett
+ *
+ * Copyright 2015 The Genome Analysis Centre (TGAC)
+ */
+
+package nanook;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Read FASTA files
+ *
+ * @author Richard Leggett
+ */
+public class SequenceReader {
+ private ArrayList<String> seqIDs = new ArrayList();
+ private ArrayList<Integer> seqLengths = new ArrayList();
+ private ArrayList<String> sequence = new ArrayList();
+ private ArrayList<Double> gcPc = new ArrayList();
+ private int nSeqs = 0;
+ private boolean cacheSequence = false;
+ private String currentFilename;
+
+ public SequenceReader(boolean cache) {
+ cacheSequence = cache;
+ }
+
+ public int countGC(String s) {
+ int g = s.length() - s.replace("G", "").length();
+ int c = s.length() - s.replace("C", "").length();
+
+ return g + c;
+ }
+
+ public int indexFASTQFile(String filename) {
+ currentFilename = filename;
+
+ try
+ {
+ BufferedReader br = new BufferedReader(new FileReader(filename));
+ String line;
+ String id = null;
+ int contigLength = 0;
+ int readsInThisFile = 0;
+ boolean gotRead;
+ int gc = 0;
+
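+            // FASTQ records are read four lines at a time: header, sequence, '+' line, qualities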
+ do {
+ String sh = br.readLine();
+ String s = br.readLine();
+ String qh = br.readLine();
+ String q = br.readLine();
+ gotRead = false;
+                if ((sh != null) && (s != null) && (qh != null) && (q != null)) {
+ if (sh.startsWith("@")) {
+ if (qh.startsWith("+")) {
+ String sequenceHeader = sh.trim();
+ String seq = s.trim();
+ String[] parts = sequenceHeader.substring(1).split("(\\s+)");
+ id = parts[0];
+
+ if (id != null) {
+ seqIDs.add(id);
+ seqLengths.add(seq.length());
+                                gcPc.add(100.0 * (double)countGC(seq) / (double)seq.length());
+ if (cacheSequence) {
+ sequence.add(seq);
+ }
+ nSeqs++;
+ gotRead = true;
+ }
+
+ }
+ }
+ }
+ } while (gotRead);
+
+ br.close();
+ } catch (Exception e) {
+ System.out.println("readFasta Exception:");
+ e.printStackTrace();
+ System.exit(1);
+ }
+
+ return nSeqs;
+ }
+
+ private String makeName(String line, String id) {
+ String name = id;
+ Pattern p = Pattern.compile(">gi\\|(\\S+)\\|(\\S+)\\|(\\S+)\\| (\\S+) (\\S+)");
+ Matcher m = p.matcher(line);
+
+ if (m.find()) {
+ name = m.group(4) + "_" + m.group(5);
+ }
+
+ name=name.replaceAll("\\.", "_");
+ name=name.replaceAll(" ", "_");
+ name=name.replaceAll("\\|", "_");
+
+ return name;
+ }
+
+ /**
+     * Index a FASTA file, optionally writing an index file and storing sequence IDs.
+     * @param filename filename of FASTA file
+     * @param indexFilename index filename to write, or null for no index
+     * @param storeIds true to keep sequence IDs, lengths and GC content in memory
+     * @return cumulative number of sequences indexed
+ */
+ public int indexFASTAFile(String filename, String indexFilename, boolean storeIds) {
+ currentFilename = filename;
+
+ try
+ {
+ BufferedReader br = new BufferedReader(new FileReader(filename));
+ PrintWriter pw = null;
+ String line;
+ String id = null;
+ String name = null;
+ int contigLength = 0;
+ int readsInThisFile = 0;
+ //String seq = "";
+ StringBuilder seq = new StringBuilder(100000);
+ int gc = 0;
+
+ if (indexFilename != null) {
+ pw = new PrintWriter(new FileWriter(indexFilename, false));
+ }
+
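+            // Read line by line; a new '>' header (or end of file) flushes the previous record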
+ do {
+ line = br.readLine();
+ if (line != null) {
+ line = line.trim();
+ }
+
+ if ((line == null) || (line.startsWith(">"))) {
+ if (id != null) {
+ if (storeIds) {
+ double gcpc = 100.0*(double)gc / (double)contigLength;
+ seqIDs.add(id);
+ seqLengths.add(contigLength);
+                        gcPc.add(gcpc);
+ }
+
+ if (pw != null) {
+ pw.printf("%s\t%d\t%s", id, contigLength, name);
+ pw.println("");
+ }
+
+ if (cacheSequence) {
+ sequence.add(seq.toString());
+ }
+ nSeqs++;
+ seq = new StringBuilder(100000);
+ }
+
+ if (line != null) {
+ String[] parts = line.substring(1).split("(\\s+)");
+ id = parts[0];
+ name = makeName(line, id);
+ }
+
+ contigLength = 0;
+ gc = 0;
+ } else if (line != null) {
+ contigLength += line.length();
+ gc += countGC(line);
+
+ if (cacheSequence) {
+ seq.append(line);
+ //seq = seq + line;
+ }
+ }
+ } while (line != null);
+
+ br.close();
+ if (pw != null) {
+ pw.close();
+ }
+ } catch (Exception e) {
+ System.out.println("readFasta Exception:");
+ e.printStackTrace();
+ System.exit(1);
+ }
+
+ return nSeqs;
+ }
+
+ public int getSequenceCount() {
+ return nSeqs;
+ }
+
+ public String getID(int i) {
+ return seqIDs.get(i);
+ }
+
+ public int getLength(int i) {
+ return seqLengths.get(i);
+ }
+
+ public double getGC(int i) {
+ return gcPc.get(i);
+ }
+
+ public String getSubSequence(String id, int start, int end) {
+ int index = -1;
+ String seq = "";
+
+ for (int i=0; i<nSeqs; i++) {
+ if (seqIDs.get(i).equals(id)) {
+ index = i;
+ break;
+ }
+ }
+
+ if (index == -1) {
+ System.out.println("Error: can't find ID " + id);
+ System.exit(1);
+ }
+
+ if (cacheSequence) {
+ if (start < 0) {
+ System.out.println("Warning: invalid index ("+start+") in SequenceReader");
+ start = 0;
+ }
+ if (end >= sequence.get(index).length()) {
+ //System.out.println("Warning: invalid index ("+end+") in SequenceReader");
+ end = sequence.get(index).length() - 1;
+ }
+ seq = sequence.get(index).substring(start, end+1);
+ } else {
+ try
+ {
+ BufferedReader br = new BufferedReader(new FileReader(currentFilename));
+ StringBuilder ssb = new StringBuilder("");
+ String line;
+ boolean foundId = false;
+ int position = 0;
+
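+                // Stream through the FASTA file, collecting only the lines that overlap [start, end] for the requested ID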
+ do {
+ line = br.readLine();
+ if (line != null) {
+ line = line.trim();
+ }
+
+
+ if (line != null) {
+ if (line.startsWith(">")) {
+ if (foundId) {
+ // If we've found the ID we were after, then this new one means we can stop
+ break;
+ } else {
+ String[] parts = line.substring(1).split("(\\s+)");
+ String thisid = parts[0];
+
+ // Check for ID we're after
+ if (thisid.equals(id)) {
+ foundId = true;
+ }
+ }
+ } else {
+ if (foundId) {
+ int fStart = position;
+ int fEnd = position + line.length() - 1;
+
+ //System.out.println("fStart = "+fStart+" fEnd = "+fEnd);
+
+ if (fEnd >= start) {
+ int cutStart = (fStart >= start) ? 0:start-position;
+ int cutEnd = (fEnd <= end) ? (line.length() - 1):end-position;
+
+ //System.out.println(cutStart + " " +cutEnd+"["+line+"]");
+ ssb.append(line.substring(cutStart, cutEnd+1));
+
+ // Got all we wanted?
+ if (fEnd >= end) {
+ break;
+ }
+ }
+ // Keep track of position
+ position = position + line.length();
+ }
+ }
+ }
+ } while (line != null);
+
+ br.close();
+
+ seq = ssb.toString();
+ } catch (Exception e) {
+ System.out.println("readFasta Exception:");
+ e.printStackTrace();
+ System.exit(1);
+ }
+ }
+
+ return seq;
+ }
+
+ public void storeKmers(int index, KmerTable t) {
+ String seq = sequence.get(index);
+ if (seq != null) {
+ int k = t.getKmerSize();
+
+            for (int o=0; o<=seq.length() - k; o++) {
+                t.countKmer(seq.substring(o, o+k));
+            }
+ } else {
+ System.out.println("Need to handle the non-cached case");
+ }
+ }
+
+}
diff --git a/src/nanook/SystemCommandRunnable.java b/src/nanook/SystemCommandRunnable.java
new file mode 100644
index 0000000..cfef515
--- /dev/null
+++ b/src/nanook/SystemCommandRunnable.java
@@ -0,0 +1,93 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett
+ *
+ * Copyright 2015 The Genome Analysis Centre (TGAC)
+ */
+
+package nanook;
+
+import java.io.File;
+import java.util.LinkedList;
+
+/**
+ * Enable multi-threading of system commands
+ *
+ * @author Richard Leggett
+ */
+public class SystemCommandRunnable implements Runnable {
+ NanoOKOptions options;
+ private String message;
+ private String command;
+ private String logFile;
+ private String outFile;
+
+ /**
+ * Constructor
+     * @param ops program options
+     * @param msg message to print when the command is run (may be null)
+     * @param com system command to run
+     * @param out file to redirect command output to (null if not needed)
+     * @param log log file for scheduler/command output
+ */
+ public SystemCommandRunnable(NanoOKOptions ops, String msg, String com, String out, String log) {
+ options = ops;
+ message = msg;
+ command = com;
+ logFile = log;
+ outFile = out;
+ }
+
+ private void runCommandLSF(String command, String outPath, String log) {
+ // outPath only non-null if aligner will only write to screen (yes, BWA, I'm talking about you)
+ if (outPath != null) {
+ command = command + " > " + outPath;
+ }
+
+ // Make the LSF command
+ String lsfCommand = "bsub -n " + options.getNumberOfThreads() + " -q " + options.getQueue() + " -oo " + log + " -R \"rusage[mem=8000] span[hosts=1]\" \"" + command + "\"";
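+        // Note: the bsub submission below is currently commented out; only the command is echoed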
+ System.out.println(command);
+ //pl = new ProcessLogger();
+ //response = pl.getCommandOutput(lsfCommand, true, true);
+ }
+
+ private void runCommandLocal(String command, String outPath) {
+ ProcessLogger pl = new ProcessLogger();
+
+ // outPath only non-null if aligner will only write to screen (yes, BWA, I'm talking about you)
+ if (outPath != null) {
+ pl.setWriteFormat(false, true, false);
+ pl.runAndLogCommand(command, outPath, false);
+ } else {
+ pl.runCommand(command);
+ }
+ }
+
+ /**
+     * Run the command via the configured scheduler (screen, lsf or system).
+     * @param command command to run
+     * @param outPath file to redirect output to (null if not needed)
+     * @param log log filename
+ */
+ private void runCommand(String command, String outPath, String log) {
+ switch(options.getScheduler()) {
+ case "screen":
+ System.out.println(command);
+ break;
+ case "lsf":
+ runCommandLSF(command, outPath, log);
+ break;
+ case "system":
+ runCommandLocal(command, outPath);
+ break;
+ default:
+ System.out.println("Error: scheduler " + options.getScheduler() + " not recognised.");
+ System.exit(1);
+ break;
+ }
+ }
+
+ public void run() {
+ if (message != null) {
+ System.out.println(message);
+ }
+
+ runCommand(command, outFile, logFile);
+ }
+}
diff --git a/src/nanook/WatcherLog.java b/src/nanook/WatcherLog.java
new file mode 100644
index 0000000..769a1d0
--- /dev/null
+++ b/src/nanook/WatcherLog.java
@@ -0,0 +1,65 @@
+/*
+ * To change this license header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package nanook;
+
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.text.DateFormat;
+import java.text.SimpleDateFormat;
+import java.util.Date;
+
+
+public class WatcherLog {
+ private transient PrintWriter pw = null;
+ private String filename = null;
+
+ public WatcherLog(NanoOKOptions options) {
+ }
+
+ public synchronized void open(String f, boolean clearLogs) {
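+        // With clearLogs, reuse a fixed filename; otherwise timestamp the filename so earlier logs are preserved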
+ if (clearLogs) {
+ filename = f + ".log";
+ } else {
+ DateFormat df = new SimpleDateFormat("ddMMyy_HHmmss");
+ Date dateobj = new Date();
+ filename = f + "_" + df.format(dateobj).toString()+".log";
+ }
+
+ System.out.println("Opening "+filename);
+
+ try {
+ pw = new PrintWriter(new FileWriter(filename, true));
+ } catch (IOException e) {
+ System.out.println("WatcherLog exception");
+ e.printStackTrace();
+ }
+ }
+
+ public synchronized void close() {
+ if (pw != null) {
+ pw.close();
+ }
+ }
+
+ public synchronized void print(String s) {
+ if (pw != null) {
+ pw.print(s);
+ pw.flush();
+ }
+ }
+
+ public synchronized void println(String s) {
+ if (pw != null) {
+ pw.println(s);
+ pw.flush();
+ }
+ }
+
+ public synchronized PrintWriter getPrintWriter() {
+ return pw;
+ }
+}
diff --git a/src/nanook/WatcherRunnable.java b/src/nanook/WatcherRunnable.java
new file mode 100644
index 0000000..e91439c
--- /dev/null
+++ b/src/nanook/WatcherRunnable.java
@@ -0,0 +1,106 @@
+/*
+ * Program: NanoOK
+ * Author: Richard M. Leggett
+ *
+ * Copyright 2015 The Genome Analysis Centre (TGAC)
+ */
+
+package nanook;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.EnumSet;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Enable multi-threading of read extraction
+ *
+ * @author Richard Leggett
+ */
+public class WatcherRunnable implements Runnable {
+ public final static String TYPE_STRING_TEMPLATE = "/Analyses/Basecall_2D_000/BaseCalled_template/Fastq";
+ public final static String TYPE_STRING_COMPLEMENT = "/Analyses/Basecall_2D_000/BaseCalled_complement/Fastq";
+ public final static String TYPE_STRING_2D = "/Analyses/Basecall_2D_000/BaseCalled_2D/Fastq";
+ private String[] typeStrings = {TYPE_STRING_TEMPLATE, TYPE_STRING_COMPLEMENT, TYPE_STRING_2D};
+ private NanoOKOptions options;
+ private AlignmentFileParser parser;
+ private String inDir;
+ private String filename;
+ private String fastaqDir;
+ private String alignDir;
+ private String passOrFail;
+ private int readType;
+
+ public WatcherRunnable(NanoOKOptions o, String in, String file, String pf, String out, String ad, AlignmentFileParser p) {
+ options = o;
+ inDir = in;
+ filename = file;
+ passOrFail = pf;
+ fastaqDir = out;
+ alignDir = ad;
+ parser = p;
+
+ if (passOrFail.equals("pass")) {
+ readType = NanoOKOptions.READTYPE_PASS;
+ } else if (passOrFail.equals("fail")) {
+ readType = NanoOKOptions.READTYPE_FAIL;
+ } else {
+ System.out.println("Error in WatcherRunnable - not pass or fail!");
+ System.exit(1);
+ }
+ }
+
+ /**
+     * Extract reads from the FAST5 file given to the constructor and write them as FASTA or FASTQ.
+ */
+ public void run() {
+ String inputPathname = inDir + File.separator + filename;
+ Fast5File inputFile = new Fast5File(options, inputPathname);
+ String outName = new File(inputPathname).getName();
+
+ //for (int t=0; t<3; t++) {
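+        // Extract only the 2D read when 2D processing is enabled, otherwise the template read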
+ int t;
+ if (options.isProcessing2DReads()) {
+ t = NanoOKOptions.TYPE_2D;
+ } else {
+ t = NanoOKOptions.TYPE_TEMPLATE;
+ }
+
+ if (options.isProcessingReadType(t)) {
+ FastAQFile ff = inputFile.getFastq(options.getBasecallIndex(), t);
+ if (ff != null) {
+ String readFilename = null;
+ String readPathname = null;
+ if (options.getReadFormat() == NanoOKOptions.FASTA) {
+ readFilename = outName + "_BaseCalled_" + NanoOKOptions.getTypeFromInt(t) + ".fasta";
+ readPathname = fastaqDir + File.separator + NanoOKOptions.getTypeFromInt(t) + File.separator + readFilename;
+ System.out.println(" Writing "+readPathname);
+ options.getWatcherReadLog().println(readPathname);
+ ff.writeFasta(readPathname, options.outputFast5Path() ? inputPathname:null);
+ options.getMergedFile(t, readType).addFile(readPathname, options.outputFast5Path() ? inputPathname:null);
+ } else if (options.getReadFormat() == NanoOKOptions.FASTQ) {
+ readFilename = outName + "_BaseCalled_" + NanoOKOptions.getTypeFromInt(t) + ".fastq";
+ readPathname = fastaqDir + File.separator + NanoOKOptions.getTypeFromInt(t) + File.separator + readFilename;
+ System.out.println(" Writing "+readPathname);
+ options.getWatcherReadLog().println(readPathname);
+ ff.writeFastq(readPathname);
+ options.getMergedFile(t, readType).addFile(readPathname, options.outputFast5Path() ? inputPathname:null);
+ }
+ }
+ }
+ //}
+ }
+}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/nanook.git