[med-svn] [kraken] 01/02: Imported Upstream version 0.10.5~beta
Andreas Tille
tille at debian.org
Wed Sep 9 11:42:08 UTC 2015
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository kraken.
commit 54160fe0b75a7a1d83d9acf3194b791682de67b8
Author: Andreas Tille <tille at debian.org>
Date: Wed Sep 9 13:41:22 2015 +0200
Imported Upstream version 0.10.5~beta
---
CHANGELOG | 86 +++++
LICENSE | 674 ++++++++++++++++++++++++++++++++
README.md | 10 +
docs/MANUAL.html | 358 +++++++++++++++++
docs/MANUAL.markdown | 752 ++++++++++++++++++++++++++++++++++++
docs/Makefile | 9 +
docs/bar-bg.png | Bin 0 -> 188 bytes
docs/head.html | 1 +
docs/kraken.css | 45 +++
docs/top.html | 9 +
install_kraken.sh | 63 +++
scripts/add_to_library.sh | 49 +++
scripts/build_kraken_db.sh | 199 ++++++++++
scripts/check_for_jellyfish.sh | 35 ++
scripts/clean_db.sh | 37 ++
scripts/cp_into_tempfile.pl | 58 +++
scripts/download_genomic_library.sh | 119 ++++++
scripts/download_taxonomy.sh | 60 +++
scripts/kraken | 293 ++++++++++++++
scripts/kraken-build | 296 ++++++++++++++
scripts/kraken-filter | 146 +++++++
scripts/kraken-mpa-report | 221 +++++++++++
scripts/kraken-report | 179 +++++++++
scripts/kraken-translate | 157 ++++++++
scripts/krakenlib.pm | 76 ++++
scripts/read_merger.pl | 164 ++++++++
scripts/report_gi_numbers.pl | 49 +++
scripts/shrink_db.sh | 52 +++
scripts/standard_installation.sh | 42 ++
scripts/upgrade_db.sh | 85 ++++
scripts/verify_gi_numbers.pl | 54 +++
src/Makefile | 35 ++
src/classify.cpp | 449 +++++++++++++++++++++
src/db_shrink.cpp | 161 ++++++++
src/db_sort.cpp | 177 +++++++++
src/kraken_headers.hpp | 50 +++
src/krakendb.cpp | 321 +++++++++++++++
src/krakendb.hpp | 101 +++++
src/krakenutil.cpp | 181 +++++++++
src/krakenutil.hpp | 63 +++
src/make_seqid_to_taxid_map.cpp | 120 ++++++
src/quickfile.cpp | 132 +++++++
src/quickfile.hpp | 48 +++
src/seqreader.cpp | 136 +++++++
src/seqreader.hpp | 64 +++
src/set_lcas.cpp | 265 +++++++++++++
46 files changed, 6681 insertions(+)
diff --git a/CHANGELOG b/CHANGELOG
new file mode 100644
index 0000000..db8ea12
--- /dev/null
+++ b/CHANGELOG
@@ -0,0 +1,86 @@
+v0.10.5-beta:
+* fix bug in GRCh38 download to handle multi-fasta files
+* add --header-line and --intermediate-ranks options to kraken-mpa-report
+* improved support for adding multi-FASTA files with --add-to-library
+* allow assigning taxon IDs in reference sequences w/o GI numbers
+ using "kraken:taxid" code
+* included full sequence descriptions when using "--[un]classified-out"
+* reduced memory usage of db_shrink (Build step 2 / kraken-build --shrink)
+* reduced memory usage of db_sort (Build step 3)
+* reduced memory usage of set_lcas (Build step 6)
+* support added for KRAKEN_NUM_THREADS, KRAKEN_DB_PATH, and KRAKEN_DEFAULT_DB
+ env. variables
+* added kraken-translate for getting taxonomic names for each sequence
+* added a --rebuild option to kraken-build
+* turned off default name checking for PE reads; added --check-names option
+* added plasmids to --download-library options
+* added HTML manual, redirecting README to that
+
+v0.10.4-beta:
+* use GRCh38 for human genome library
+* enable input via stdin (via /dev/fd/0)
+* enable compressed (gzip/bzip2) input
+* enable auto-detection of fasta/fastq/gz/bz2
+* simplified add_to_library.sh code to speed up large additions
+* use RNA genomes for viral genome library
+* scan .ffn (RNA) files for genomic data when building databases
+* handle paired-end reads with --paired option
+* provide MetaPhlAn-compatible output with kraken-mpa-report
+* added domain/kingdom codes to kraken-report
+* added kraken-filter script for simple confidence scoring
+* added support for multi-FASTA files in custom DBs
+* fixed build_kraken_db.sh bug for k-mers w/ k < 31
+* updates to README file
+
+v0.10.3-beta:
+* remove Fatal.pm use in kraken-report
+* fixed false success message on make failure in installer
+* explicitly require g++ as C++ compiler in Makefile
+* change to quickfile.cpp to do proper syncing on close
+* fixed kraken-build bug w/ --work-on-disk (cause of some major build stalls)
+* changed hash size calculation to use Perl
+* close input files explicitly in db_sort/db_shrink to reduce reported memory
+* allow db_shrink to work in RAM
+* updates to README file
+
+v0.10.2-beta:
+* fixed kraken-report bug w/ --show-zeros
+* fixed kraken-report installation bug
+* updates to README file
+
+v0.10.1-beta:
+* fixed 2nd bug in build_kraken.sh in calculating hash size (thanks T. Antao)
+* fixed bug in add_to_library.sh for some bash versions (thanks T. Antao)
+* fixed issue where search window wasn't cached until a failure (query speedup)
+* added $KRAKEN_DIR fallback for kraken/kraken-build (thanks S. Koren)
+
+v0.10.0-beta:
+* added CHANGELOG
+* fixed quick mode hit list output
+* updated README citation
+* changed minimizer sort order (query speedup), changes database structure
+* use linear search with small windows (query speedup)
+* changed query procedure (query speedup); search w/o 1st calculating minimizer
+* changed readlink in installer to perl Cwd::abs_path (portability)
+* removed MAP_POPULATE for preloading, uses read loop instead (bugfix/port.)
+* added --work-on-disk switch to kraken-build
+* added kraken-report script
+* fixed bug in build_kraken.sh in calculating hash size (thanks T. Antao)
+
+v0.9.1b:
+* fixed bug to allow kraken-build --shrink
+
+v0.9.0b:
+* full rewrite
+* minimizers used to speed queries, prefix index removed
+
+v0.3:
+* DB build parallelized, Jellyfish removed from LCA assignment
+
+v0.2:
+* full rewrite, most progs. changed to C++
+* Jellyfish removed from classification step
+* prefix index used to speed queries
+
+v0.1:
+* initial version, mostly Perl
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..94a9ed0
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,674 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+ The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works. By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users. We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors. You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+ To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights. Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received. You must make sure that they, too, receive
+or can get the source code. And you must show them these terms so they
+know their rights.
+
+ Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+ For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software. For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+ Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so. This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software. The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable. Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products. If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+ Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary. To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ TERMS AND CONDITIONS
+
+ 0. Definitions.
+
+ "This License" refers to version 3 of the GNU General Public License.
+
+ "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+ "The Program" refers to any copyrightable work licensed under this
+License. Each licensee is addressed as "you". "Licensees" and
+"recipients" may be individuals or organizations.
+
+ To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy. The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+ A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+ To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy. Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+ To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies. Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+ An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License. If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+ 1. Source Code.
+
+ The "source code" for a work means the preferred form of the work
+for making modifications to it. "Object code" means any non-source
+form of a work.
+
+ A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+ The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form. A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+ The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities. However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work. For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+ The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+ The Corresponding Source for a work in source code form is that
+same work.
+
+ 2. Basic Permissions.
+
+ All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met. This License explicitly affirms your unlimited
+permission to run the unmodified Program. The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work. This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+ You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force. You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright. Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+ Conveying under any other circumstances is permitted solely under
+the conditions stated below. Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+ No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+ When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+ 4. Conveying Verbatim Copies.
+
+ You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+ You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+ 5. Conveying Modified Source Versions.
+
+ You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+ a) The work must carry prominent notices stating that you modified
+ it, and giving a relevant date.
+
+ b) The work must carry prominent notices stating that it is
+ released under this License and any conditions added under section
+ 7. This requirement modifies the requirement in section 4 to
+ "keep intact all notices".
+
+ c) You must license the entire work, as a whole, under this
+ License to anyone who comes into possession of a copy. This
+ License will therefore apply, along with any applicable section 7
+ additional terms, to the whole of the work, and all its parts,
+ regardless of how they are packaged. This License gives no
+ permission to license the work in any other way, but it does not
+ invalidate such permission if you have separately received it.
+
+ d) If the work has interactive user interfaces, each must display
+ Appropriate Legal Notices; however, if the Program has interactive
+ interfaces that do not display Appropriate Legal Notices, your
+ work need not make them do so.
+
+ A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit. Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+ 6. Conveying Non-Source Forms.
+
+ You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+ a) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by the
+ Corresponding Source fixed on a durable physical medium
+ customarily used for software interchange.
+
+ b) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by a
+ written offer, valid for at least three years and valid for as
+ long as you offer spare parts or customer support for that product
+ model, to give anyone who possesses the object code either (1) a
+ copy of the Corresponding Source for all the software in the
+ product that is covered by this License, on a durable physical
+ medium customarily used for software interchange, for a price no
+ more than your reasonable cost of physically performing this
+ conveying of source, or (2) access to copy the
+ Corresponding Source from a network server at no charge.
+
+ c) Convey individual copies of the object code with a copy of the
+ written offer to provide the Corresponding Source. This
+ alternative is allowed only occasionally and noncommercially, and
+ only if you received the object code with such an offer, in accord
+ with subsection 6b.
+
+ d) Convey the object code by offering access from a designated
+ place (gratis or for a charge), and offer equivalent access to the
+ Corresponding Source in the same way through the same place at no
+ further charge. You need not require recipients to copy the
+ Corresponding Source along with the object code. If the place to
+ copy the object code is a network server, the Corresponding Source
+ may be on a different server (operated by you or a third party)
+ that supports equivalent copying facilities, provided you maintain
+ clear directions next to the object code saying where to find the
+ Corresponding Source. Regardless of what server hosts the
+ Corresponding Source, you remain obligated to ensure that it is
+ available for as long as needed to satisfy these requirements.
+
+ e) Convey the object code using peer-to-peer transmission, provided
+ you inform other peers where the object code and Corresponding
+ Source of the work are being offered to the general public at no
+ charge under subsection 6d.
+
+ A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+ A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling. In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage. For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product. A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+ "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source. The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+ If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information. But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+ The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed. Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+ Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+ 7. Additional Terms.
+
+ "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law. If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+ When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it. (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.) You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+ Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+ a) Disclaiming warranty or limiting liability differently from the
+ terms of sections 15 and 16 of this License; or
+
+ b) Requiring preservation of specified reasonable legal notices or
+ author attributions in that material or in the Appropriate Legal
+ Notices displayed by works containing it; or
+
+ c) Prohibiting misrepresentation of the origin of that material, or
+ requiring that modified versions of such material be marked in
+ reasonable ways as different from the original version; or
+
+ d) Limiting the use for publicity purposes of names of licensors or
+ authors of the material; or
+
+ e) Declining to grant rights under trademark law for use of some
+ trade names, trademarks, or service marks; or
+
+ f) Requiring indemnification of licensors and authors of that
+ material by anyone who conveys the material (or modified versions of
+ it) with contractual assumptions of liability to the recipient, for
+ any liability that these contractual assumptions directly impose on
+ those licensors and authors.
+
+ All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10. If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term. If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+ If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+ Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+ 8. Termination.
+
+ You may not propagate or modify a covered work except as expressly
+provided under this License. Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+ However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+ Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+ Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License. If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+ 9. Acceptance Not Required for Having Copies.
+
+ You are not required to accept this License in order to receive or
+run a copy of the Program. Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance. However,
+nothing other than this License grants you permission to propagate or
+modify any covered work. These actions infringe copyright if you do
+not accept this License. Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+ 10. Automatic Licensing of Downstream Recipients.
+
+ Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License. You are not responsible
+for enforcing compliance by third parties with this License.
+
+ An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations. If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+ You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License. For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+ 11. Patents.
+
+ A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based. The
+work thus licensed is called the contributor's "contributor version".
+
+ A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version. For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+ In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement). To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+ If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients. "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+ If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+ A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License. You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+ Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+ 12. No Surrender of Others' Freedom.
+
+ If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all. For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+ 13. Use with the GNU Affero General Public License.
+
+ Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work. The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+ 14. Revised Versions of this License.
+
+ The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+ Each version is given a distinguishing version number. If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation. If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+ If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+ Later license versions may give you additional or different
+permissions. However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+ 15. Disclaimer of Warranty.
+
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+ 16. Limitation of Liability.
+
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+ 17. Interpretation of Sections 15 and 16.
+
+ If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+ If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+ <program> Copyright (C) <year> <name of author>
+ This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+ You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<http://www.gnu.org/licenses/>.
+
+ The GNU General Public License does not permit incorporating your program
+into proprietary programs. If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License. But first, please read
+<http://www.gnu.org/philosophy/why-not-lgpl.html>.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..c414dfe
--- /dev/null
+++ b/README.md
@@ -0,0 +1,10 @@
+Kraken taxonomic sequence classification system
+===============================================
+
+Please see the [Kraken webpage] or the [Kraken manual]
+for information on installing and operating Kraken.
+A local copy of the [Kraken manual] is also present here
+in the `docs/` directory (`MANUAL.html` and `MANUAL.markdown`).
+
+[Kraken webpage]: http://ccb.jhu.edu/software/kraken/
+[Kraken manual]: http://ccb.jhu.edu/software/kraken/MANUAL.html
diff --git a/docs/MANUAL.html b/docs/MANUAL.html
new file mode 100644
index 0000000..6829d86
--- /dev/null
+++ b/docs/MANUAL.html
@@ -0,0 +1,358 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+ <meta http-equiv="Content-Style-Type" content="text/css" />
+ <meta name="generator" content="pandoc" />
+ <title>Kraken Manual - </title>
+ <style type="text/css">code{white-space: pre;}</style>
+ <link rel="stylesheet" href="kraken.css" type="text/css" />
+ <link href='http://fonts.googleapis.com/css?family=Ubuntu:400,700,400italic' rel='stylesheet' type='text/css'>
+</head>
+<body>
+<div class="pretoc">
+ <p class="title">Kraken taxonomic sequence classification system</p>
+
+ <p class="version">Version 0.10.5-beta</p>
+
+ <p>Operating Manual</p>
+</div>
+
+<h1>Table of Contents</h1>
+<div id="TOC">
+<ul>
+<li><a href="#introduction">Introduction</a></li>
+<li><a href="#system-requirements">System Requirements</a></li>
+<li><a href="#installation">Installation</a></li>
+<li><a href="#kraken-databases">Kraken Databases</a></li>
+<li><a href="#standard-kraken-database">Standard Kraken Database</a></li>
+<li><a href="#classification">Classification</a></li>
+<li><a href="#output-format">Output Format</a></li>
+<li><a href="#custom-databases">Custom Databases</a></li>
+<li><a href="#memory-usage-and-efficiency">Memory Usage and Efficiency</a></li>
+<li><a href="#sample-reports">Sample Reports</a></li>
+<li><a href="#confidence-scoring">Confidence Scoring</a></li>
+<li><a href="#kraken-environment-variables">Kraken Environment Variables</a></li>
+<li><a href="#upgrading-databases-to-v0.10">Upgrading Databases to v0.10+</a></li>
+</ul>
+</div>
+<h1 id="introduction"><a href="#introduction">Introduction</a></h1>
+<p><a href="http://ccb.jhu.edu/software/kraken/">Kraken</a> is a taxonomic sequence classifier that assigns taxonomic labels to short DNA reads. It does this by examining the <span class="math"><em>k</em></span>-mers within a read and querying a database with those <span class="math"><em>k</em></span>-mers. This database contains a mapping of every <span class="math"><em>k</em></span>-mer in <a href="http://ccb.jhu.edu/software/kraken/">Kraken</a>'s genomic library to the lowest common a [...]
+<p>The latest released version of Kraken will be available at the <a href="http://ccb.jhu.edu/software/kraken/">Kraken website</a>, and the latest updates to the Kraken source code are available at the <a href="https://github.com/DerrickWood/kraken">Kraken GitHub repository</a>.</p>
+<p>If you use <a href="http://ccb.jhu.edu/software/kraken/">Kraken</a> in your research, please cite the <a href="http://genomebiology.com/2014/15/3/R46">Kraken paper</a>. Thank you!</p>
+<h1 id="system-requirements"><a href="#system-requirements">System Requirements</a></h1>
+<p>Note: Users concerned about the disk or memory requirements should read the paragraph about MiniKraken, below.</p>
+<ul>
+<li><p><strong>Disk space</strong>: Construction of Kraken's standard database will require at least 160 GB of disk space. Customized databases may require more or less space. Disk space used is linearly proportional to the number of distinct <span class="math"><em>k</em></span>-mers; as of Feb. 2015, Kraken's default database contains just under 6 billion (6e9) distinct <span class="math"><em>k</em></span>-mers.</p>
+<p>In addition, the disk used to store the database should be locally-attached storage. Storing the database on a network filesystem (NFS) partition can cause Kraken's operation to be very slow, or to be stopped completely. As NFS accesses are much slower than local disk accesses, both preloading and database building will be slowed by use of NFS.</p></li>
+<li><p><strong>Memory</strong>: To run efficiently, Kraken requires enough free memory to hold the database in RAM. While this can be accomplished using a ramdisk, Kraken supplies a utility for loading the database into RAM via the OS cache. The default database size is 75 GB (as of Feb. 2015), and so you will need at least that much RAM if you want to build or run with the default database.</p></li>
+<li><p><strong>Dependencies</strong>: Kraken currently makes extensive use of Linux utilities such as sed, find, and wget. Many scripts are written using the Bash shell, and the main scripts are written using Perl. Core programs needed to build the database and run the classifier are written in C++, and need to be compiled using g++. Multithreading is handled using OpenMP. Downloads of NCBI data are performed by wget and in some cases, by rsync. Most Linux systems that have any sort of d [...]
+<p>Finally, if you want to build your own database, you will need to install the <a href="http://www.cbcb.umd.edu/software/jellyfish/">Jellyfish</a> <span class="math"><em>k</em></span>-mer counter. Note that Kraken only supports use of Jellyfish version 1. Jellyfish version 2 is not yet compatible with Kraken.</p></li>
+<li><p><strong>Network connectivity</strong>: Kraken's standard database build and download commands expect unfettered FTP and rsync access to the NCBI FTP server. If you're working behind a proxy, you may need to set certain environment variables (such as <code>ftp_proxy</code> or <code>RSYNC_PROXY</code>) in order to get these commands to work properly.</p></li>
+<li><p><strong>MiniKraken</strong>: To allow users with low-memory computing environments to use Kraken, we supply a reduced standard database that can be downloaded from the Kraken web site. When Kraken is run with a reduced database, we call it MiniKraken.</p>
+<p>The database we make available is only 4 GB in size, and should run well on computers with as little as 8 GB of RAM. Disk space required for this database is also only 4 GB.</p></li>
+</ul>
+<h1 id="installation"><a href="#installation">Installation</a></h1>
+<p>To begin using Kraken, you will first need to install it, and then either download or create a database.</p>
+<p>Kraken consists of two main scripts ("<code>kraken</code>" and "<code>kraken-build</code>"), along with several programs and smaller scripts. As part of the installation process, all scripts and programs are installed in the same directory. After installation, you can move the main scripts elsewhere, but moving the other scripts and programs requires editing the scripts and changing the "<code>$KRAKEN_DIR</code>" variables.</p>
+<p>Once a directory is selected, you need to run the following command in the directory where you extracted the Kraken source:</p>
+<pre><code>./install_kraken.sh $KRAKEN_DIR</code></pre>
+<p>(Replace "<code>$KRAKEN_DIR</code>" above with the directory where you want to install Kraken's programs/directories.)</p>
+<p>The <code>install_kraken.sh</code> script should compile all of Kraken's code and setup your Kraken data directory. Installation is successful if you see the message "<code>Kraken installation complete.</code>"</p>
+<p>Once installation is complete, you may want to copy the two main Kraken scripts into a directory found in your <code>PATH</code> variable (e.g., "<code>$HOME/bin</code>"):</p>
+<pre><code>cp $KRAKEN_DIR/bin/kraken $HOME/bin
+cp $KRAKEN_DIR/bin/kraken-build $HOME/bin</code></pre>
+<p>After installation, you're ready to either create or download a database.</p>
+<h1 id="kraken-databases"><a href="#kraken-databases">Kraken Databases</a></h1>
+<p>A Kraken database is a directory containing at least 4 files:</p>
+<ul>
+<li><code>database.kdb</code>: Contains the <span class="math"><em>k</em></span>-mer to taxon mappings</li>
+<li><code>database.idx</code>: Contains minimizer offset locations in database.kdb</li>
+<li><code>taxonomy/nodes.dmp</code>: Taxonomy tree structure + ranks</li>
+<li><code>taxonomy/names.dmp</code>: Taxonomy names</li>
+</ul>
+<p>Other files may be present as part of the database build process.</p>
+<p>In interacting with Kraken, you should not have to directly reference any of these files, but rather simply provide the name of the directory in which they are stored. Kraken allows both the use of a standard database as well as custom databases; these are described in the sections <a href="#standard-kraken-database">Standard Kraken Database</a> and <a href="#custom-databases">Custom Databases</a> below, respectively.</p>
+<h1 id="standard-kraken-database"><a href="#standard-kraken-database">Standard Kraken Database</a></h1>
+<p>To create the standard Kraken database, you can use the following command:</p>
+<pre><code>kraken-build --standard --db $DBNAME</code></pre>
+<p>(Replace "<code>$DBNAME</code>" above with your preferred database name/location. Please note that the database will use approximately 160 GB of disk space during creation.)</p>
+<p>This will download NCBI taxonomic information, as well as the complete genomes in RefSeq for the bacterial, archaeal, and viral domains. After downloading all this data, the build process begins; this is the most time-consuming step. If you have multiple processing cores, you can run this process with multiple threads, e.g.:</p>
+<pre><code>kraken-build --standard --threads 16 --db $DBNAME</code></pre>
+<p>Using 16 threads on a computer with 122 GB of RAM, the build process took approximately an hour and a half (steps with an asterisk have some multi-threading enabled) in February 2015:</p>
+<pre><code> 7m48s *Step 1 (create set)
+ n/a Step 2 (reduce database, optional and skipped)
+53m16s *Step 3 (sort set)
+ 1m04s Step 4 (GI number to sequence ID map)
+ 0m27s Step 5 (Sequence ID to taxon map)
+29m20s *Step 6 (set LCA values)
+------
+91m55s Total build time</code></pre>
+<p>Note that if any step (including the initial downloads) fails, the build process will abort. However, <code>kraken-build</code> will produce checkpoints throughout the installation process, and will restart the build at the last incomplete step if you attempt to run the same command again on a partially-built database.</p>
+<p>To create a custom database, or to use a database from another source, see <a href="#custom-databases">Custom Databases</a>.</p>
+<p>Notes for users with lower amounts of RAM:</p>
+<ol style="list-style-type: decimal">
+<li><p>If you encounter problems with Jellyfish not being able to allocate enough memory on your system to run the build process, you can supply a smaller hash size to Jellyfish using <code>kraken-build</code>'s <code>--jellyfish-hash-size</code> switch. Each space in the hash table uses approximately 6.9 bytes, so using "<code>--jellyfish-hash-size 6400M</code>" will use a hash table size of 6.4 billion spaces and require 44.3 GB of RAM.</p></li>
+<li><p>Kraken's build process will normally attempt to minimize disk writing by allocating large blocks of RAM and operating within them until data needs to be written to disk. However, this extra RAM usage may exceed your capacity. In such cases, you may want to use <code>kraken-build</code>'s <code>--work-on-disk</code> switch. This will minimize the amount of RAM usage and cause Kraken's build programs to perform most operations off of disk files. This switch can also be useful for pe [...]
+</ol>
+<h1 id="classification"><a href="#classification">Classification</a></h1>
+<p>To classify a set of sequences (reads), use the <code>kraken</code> command:</p>
+<pre><code>kraken --db $DBNAME seqs.fa</code></pre>
+<p>Output will be sent to standard output by default. The files containing the sequences to be classified should be specified on the command line. Sequences can also be provided through standard input using the special filename <code>/dev/fd/0</code>.</p>
+<p>Note that to obtain optimum speeds, Kraken's database should be loaded into RAM first. This can be done through use of a ramdisk, if you have superuser permissions. Failing that, you can use the <code>--preload</code> switch to <code>kraken</code>, e.g.:</p>
+<pre><code>kraken --preload --db $DBNAME seqs.fa</code></pre>
+<p>The database files will be loaded before classification using this switch. See <a href="#memory-usage-and-efficiency">Memory Usage and Efficiency</a> for more information.</p>
+<p>The <code>kraken</code> program allows several different options:</p>
+<ul>
+<li><p><strong>Multithreading</strong>: Use the <code>--threads NUM</code> switch to use multiple threads.</p></li>
+<li><p><strong>Quick operation</strong>: Rather than searching all <span class="math"><em>k</em></span>-mers in a sequence, stop classification after the first database hit; use <code>--quick</code> to enable this mode. Note that <code>--min-hits</code> will allow you to require multiple hits before declaring a sequence classified, which can be especially useful with custom databases when testing to see if sequences either do or do not belong to a particular genome.</p></li>
+<li><p><strong>Sequence filtering</strong>: Classified or unclassified sequences can be sent to a file for later processing, using the <code>--classified-out</code> and <code>--unclassified-out</code> switches, respectively.</p></li>
+<li><p><strong>Output redirection</strong>: Output can be directed using standard shell redirection (<code>|</code> or <code>></code>), or using the <code>--output</code> switch.</p></li>
+<li><p><strong>FASTQ input</strong>: Input is normally expected to be in FASTA format, but you can classify FASTQ data using the <code>--fastq-input</code> switch.</p></li>
+<li><p><strong>Compressed input</strong>: Kraken can handle gzip and bzip2 compressed files as input by specifying the proper switch of <code>--gzip-compressed</code> or <code>--bzip2-compressed</code>.</p></li>
+<li><p><strong>Input format auto-detection</strong>: If regular files are specified on the command line as input, Kraken will attempt to determine the format of your input prior to classification. You can disable this by explicitly specifying <code>--fasta-input</code>, <code>--fastq-input</code>, <code>--gzip-compressed</code>, and/or <code>--bzip2-compressed</code> as appropriate. Note that use of the character device file <code>/dev/fd/0</code> to read from standard input (aka <code>s [...]
+<li><p><strong>Paired reads</strong>: Kraken does not query <span class="math"><em>k</em></span>-mers containing ambiguous nucleotides (non-ACGT). If you have paired reads, you can use this fact to your advantage and increase Kraken's accuracy by concatenating the pairs together with a single <code>N</code> between the sequences. Using the <code>--paired</code> option when running <code>kraken</code> will automatically do this for you; simply specify the two mate pair files on the comman [...]
+</ul>
+<p>To get a full list of options, use <code>kraken --help</code>.</p>
+<h1 id="output-format"><a href="#output-format">Output Format</a></h1>
+<p>Each sequence classified by Kraken results in a single line of output. Output lines contain five tab-delimited fields; from left to right, they are:</p>
+<ol style="list-style-type: decimal">
+<li>"C"/"U": one letter code indicating that the sequence was either classified or unclassified.</li>
+<li>The sequence ID, obtained from the FASTA/FASTQ header.</li>
+<li>The taxonomy ID Kraken used to label the sequence; this is 0 if the sequence is unclassified.</li>
+<li>The length of the sequence in bp.</li>
+<li>A space-delimited list indicating the LCA mapping of each <span class="math"><em>k</em></span>-mer in the sequence. For example, "562:13 561:4 A:31 0:1 562:3" would indicate that:
+<ul>
+<li>the first 13 <span class="math"><em>k</em></span>-mers mapped to taxonomy ID #562</li>
+<li>the next 4 <span class="math"><em>k</em></span>-mers mapped to taxonomy ID #561</li>
+<li>the next 31 <span class="math"><em>k</em></span>-mers contained an ambiguous nucleotide</li>
+<li>the next <span class="math"><em>k</em></span>-mer was not in the database</li>
+<li>the last 3 <span class="math"><em>k</em></span>-mers mapped to taxonomy ID #562</li>
+</ul></li>
+</ol>
+<p>For users who want the full taxonomic name associated with each input sequence, we provide a script named <code>kraken-translate</code> that produces two different output formats for classified sequences. The script operates on the output of <code>kraken</code>, like so:</p>
+<pre><code>kraken --db $DBNAME sequences.fa > sequences.kraken
+kraken-translate --db $DBNAME sequences.kraken > sequences.labels</code></pre>
+<p>(The same database used to run <code>kraken</code> should be used to translate the output; see <a href="#kraken-environment-variables">Kraken Environment Variables</a> below for ways to reduce redundancy on the command line.)</p>
+<p>The file <code>sequences.labels</code> generated by the above example is a text file with two tab-delimited columns, and one line for each classified sequence in <code>sequences.fa</code>; unclassified sequences are not reported by <code>kraken-translate</code>. The first column of <code>kraken-translate</code>'s output are the sequence IDs of the classified sequences, and the second column contains the taxonomy of the sequence. For example, an output line from <code>kraken</code> of:</p>
+<pre><code>C SEQ1 562 36 562:6</code></pre>
+<p>Would result in a corresponding output line from <code>kraken-translate</code> of:</p>
+<pre><code>SEQ1 root;cellular organisms;Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacteriales;Enterobacteriaceae;Escherichia;Escherichia coli</code></pre>
+<p>Alternatively, <code>kraken-translate</code> accepts the option <code>--mpa-format</code> which will report only levels of the taxonomy with standard rank assignments (superkingdom, kingdom, phylum, class, order, family, genus, species), and uses pipes to delimit the various levels of the taxonomy. For example, <code>kraken-translate --mpa-format --db $DBNAME</code> with the above example output from <code>kraken</code> would result in the following line of output:</p>
+<pre><code>SEQ1 d__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Escherichia|s__Escherichia_coli</code></pre>
+<p>Taxonomy assignments above the superkingdom (<code>d__</code>) rank are represented as just "root" when using the <code>--mpa-report</code> option with <code>kraken-translate</code>.</p>
+<h1 id="custom-databases"><a href="#custom-databases">Custom Databases</a></h1>
+<p>We realize the standard database may not suit everyone's needs. Kraken also allows creation of customized databases.</p>
+<p>To build a custom database:</p>
+<ol style="list-style-type: decimal">
+<li><p>Install a taxonomy. Usually, you will just use the NCBI taxonomy, which you can easily download using:</p>
+<pre><code>kraken-build --download-taxonomy --db $DBNAME</code></pre>
+<p>This will download the GI number to taxon map, as well as the taxonomic name and tree information from NCBI. These files can be found in <code>$DBNAME/taxonomy/</code> . If you need to modify the taxonomy, edits can be made to the <code>names.dmp</code> and <code>nodes.dmp</code> files in this directory; the <code>gi_taxid_nucl.dmp</code> file will also need to be updated appropriately.</p></li>
+<li><p>Install a genomic library. Four sets of standard genomes are made easily available through <code>kraken-build</code>:</p>
+<ul>
+<li>bacteria: RefSeq complete bacterial/archaeal genomes</li>
+<li>plasmids: RefSeq plasmid sequences</li>
+<li>viruses: RefSeq complete viral genomes</li>
+<li>human: GRCh38 human genome</li>
+</ul>
+<p>To download and install any one of these, use the <code>--download-library</code> switch, e.g.:</p>
+<pre><code>kraken-build --download-library bacteria --db $DBNAME</code></pre>
+Other genomes can also be added, but such genomes must meet certain requirements:
+<ul>
+<li>Sequences must be in a FASTA file (multi-FASTA is allowed)</li>
+<li>Each sequence's ID (the string between the <code>></code> and the first whitespace character on the header line) must contain either a GI number to allow Kraken to lookup the correct taxa, or an explicit assignment of the taxonomy ID using <code>kraken:taxid</code> (see below).</li>
+</ul>
+<p>Replicons not downloaded from NCBI may need their taxonomy information assigned explicitly. This can be done using the string <code>kraken:taxid|XXX</code> in the sequence ID, with <code>XXX</code> replaced by the desired taxon ID. For example, to put a known adapter sequence in taxon 32630 ("synthetic construct"), you could use the following:</p>
+<pre><code>>sequence16|kraken:taxid|32630 Adapter sequence
+CAAGCAGAAGACGGCATACGAGATCTTCGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA</code></pre>
+<p>The <code>kraken:taxid</code> string must begin the sequence ID or be immediately preceded by a pipe character (<code>|</code>). Explicit assignment of taxonomy IDs in this manner will override the GI number mapping provided by NCBI.</p>
+<p>If your genomes meet the requirements above, then you can add each replicon to your database's genomic library using the <code>--add-to-library</code> switch, e.g.:</p>
+<pre><code>kraken-build --add-to-library chr1.fa --db $DBNAME
+kraken-build --add-to-library chr2.fa --db $DBNAME</code></pre>
+<p>Note that if you have a list of files to add, you can do something like this in <code>bash</code>:</p>
+<pre><code>for file in chr*.fa
+do
+ kraken-build --add-to-library $file --db $DBNAME
+done</code></pre>
+<p>Or even add all <code>*.fa</code> files found in the directory <code>genomes</code>:</p>
+<pre><code>find genomes/ -name '*.fa' -print0 | \
+ xargs -0 -I{} -n1 kraken-build --add-to-library {} --db $DBNAME</code></pre>
+<p>(You may also find the <code>-P</code> option to <code>xargs</code> useful to add many files in parallel if you have multiple processors.)</p></li>
+<li><p>Once your library is finalized, you need to build the database. Depending on your size requirements, you may want to adjust the <span class="math"><em>k</em></span>-mer and/or minimizer lengths from the defaults. Except for some small bookkeeping fields, a Kraken database will use <span class="math"><em>s</em><em>D</em></span> + <span class="math">8(4<sup><em>M</em></sup>)</span> bytes, where <span class="math"><em>s</em></span> is the number of bytes used to store the <span class [...]
+<p>The minimizers serve to keep <span class="math"><em>k</em></span>-mers that are adjacent in query sequences close to each other in the database, which allows Kraken to exploit the CPU cache. Changing the value of <span class="math"><em>M</em></span> can significantly affect the speed of Kraken, and neither increasing or decreasing <span class="math"><em>M</em></span> will guarantee faster or slower speed.</p>
+<p>To build the database, you'll use the <code>--build</code> switch:</p>
+<pre><code>kraken-build --build --db $DBNAME</code></pre>
+<p>As noted above, you may want to also use any of <code>--threads</code>, <code>--kmer-len</code>, or <code>--minimizer-len</code> to adjust the database build time and/or final size.</p></li>
+<li><p>Shrinking the database: The "--shrink" task allows you to take an existing Kraken database and create a smaller MiniKraken database from it. The use of this option removes all but a specified number of <span class="math"><em>k</em></span>-mer/taxon pairs to create a new, smaller database. For example:</p>
+<pre><code>kraken-build --shrink 10000 --db $DBNAME --new-db minikraken</code></pre>
+<p>This will create a new database named <code>minikraken</code> that contains 10000 <span class="math"><em>k</em></span>-mers selected from across the original database (<code>$DBNAME</code>).</p>
+<p>The <code>--shrink</code> task is only meant to be run on a completed database. However, if you know before you create a database that you will only be able to use a certain amount of memory, you can use the <code>--max-db-size</code> switch for the <code>--build</code> task to provide a maximum size (in GB) for the database. This allows you to create a MiniKraken database without having to create a full Kraken database first.</p></li>
+</ol>
+<p>A full list of options for <code>kraken-build</code> can be obtained using <code>kraken-build --help</code>.</p>
+<p>After building a database, if you want to reduce the disk usage of the database you can use <code>kraken-build</code>'s <code>--clean</code> switch to remove all intermediate files from the database directory.</p>
+<h1 id="memory-usage-and-efficiency"><a href="#memory-usage-and-efficiency">Memory Usage and Efficiency</a></h1>
+<p>Kraken's execution requires many random accesses to a very large file. To obtain maximal speed, these accesses need to be made as quickly as possible. This means that the database must be in physical memory during execution. Although we provide the <code>--preload</code> option to Kraken for users who cannot use a ramdisk, the ramdisk is likely the simplest option, and is well-suited for installations on computers where Kraken is to be run a majority of the time. In addition, using a [...]
+<p>We also note that in some cases, <code>--preload</code> may not be needed (or even advisable). If you know that your database is already in memory (for example, if it has been recently read or unzipped, then it should be in your operating system cache, which resides in physical memory), then there is no need to perform this step. We have noticed that in low-memory (~8 GB) situations, preloading a MiniKraken DB is actually much slower than simply using <code>cat minikraken/database.* & [...]
+<p>To create a ramdisk, you will need to have superuser (root) permission. As root, you can use the following commands to create a ramdisk:</p>
+<pre><code>mkdir /ramdisk
+mount -t ramfs none /ramdisk</code></pre>
+<p>Optionally, you may have a trusted user who you want to be able to copy databases into this directory. In that case, you'll need to make that user the owner of the directory via chown.</p>
+<p>To put the database on the ramdisk, simply copy the database directory to the ramdisk directory:</p>
+<pre><code>cp -a $DBNAME /ramdisk</code></pre>
+<p>And then you can use it with Kraken by specifying the database copy on the ramdisk, e.g.:</p>
+<pre><code>kraken --db /ramdisk/$DBNAME seqs.fa</code></pre>
+<p>Note that anything copied into a ramdisk will be deleted if the ramdisk is unmounted or the computer is restarted, so make sure that you have a copy of the database on a hard disk (or other non-volatile storage).</p>
+<p>Note that when using the <code>--paired</code> option, Kraken will not (by default) make any attempt to ensure that the two files you specify are indeed matching sets of paired-end reads. To verify that the names of each read do indeed match, you can use the <code>--check-names</code> option in combination with the <code>--paired</code> option.</p>
+<h1 id="sample-reports"><a href="#sample-reports">Sample Reports</a></h1>
+<p>To get an idea as to Kraken's results across an entire sample, we provide the <code>kraken-report</code> script. It is used like this:</p>
+<pre><code>kraken-report --db $DBNAME kraken.output</code></pre>
+<p>Note that the database used must be the same as the one used to generate the output file, or the report script may encounter problems. Output is sent to standard output.</p>
+<p>The output of <code>kraken-report</code> is tab-delimited, with one line per taxon. The fields of the output, from left-to-right, are as follows:</p>
+<ol style="list-style-type: decimal">
+<li>Percentage of reads covered by the clade rooted at this taxon</li>
+<li>Number of reads covered by the clade rooted at this taxon</li>
+<li>Number of reads assigned directly to this taxon</li>
+<li>A rank code, indicating (U)nclassified, (D)omain, (K)ingdom, (P)hylum, (C)lass, (O)rder, (F)amily, (G)enus, or (S)pecies. All other ranks are simply '-'.</li>
+<li>NCBI taxonomy ID</li>
+<li>indented scientific name</li>
+</ol>
+<p>The scientific names are indented using spaces, according to the tree structure specified by the taxonomy.</p>
+<p>By default, taxa with no reads assigned to (or under) them will not have any output produced. However, if you wish to have all taxa displayed, you can use the <code>--show-zeros</code> switch to do so. This can be useful if you are looking to do further downstream analysis of the reports, and want to compare samples. Sorting by the taxonomy ID (using <code>sort -nf5</code>) can provide a consistent line ordering between reports.</p>
+<p>In addition, we also provide the program <code>kraken-mpa-report</code>; this program provides output in a format similar to MetaPhlAn's tab-delimited output. For <code>kraken-mpa-report</code>, multiple Kraken output files can be specified on the command line and each will be treated as a separate sample. For each taxon at the standard ranks (from domain to species), the count of reads in each sample assigned to any node in the clade rooted at that taxon is displayed. <code>kraken-mp [...]
+<h1 id="confidence-scoring"><a href="#confidence-scoring">Confidence Scoring</a></h1>
+<p>At present, we have not yet developed a confidence score with a solid probabilistic interpretation for Kraken. However, we have developed a simple scoring scheme that has yielded good results for us, and we've made that available in the <code>kraken-filter</code> script. The approach we use allows a user to specify a threshold score in the [0,1] interval; the <code>kraken-filter</code> script then will adjust labels up the tree until the label's score (described below) meets or exceed [...]
+<p>A sequence label's score is a fraction <span class="math"><em>C</em></span>/<span class="math"><em>Q</em></span>, where <span class="math"><em>C</em></span> is the number of <span class="math"><em>k</em></span>-mers mapped to LCA values in the clade rooted at the label, and <span class="math"><em>Q</em></span> is the number of <span class="math"><em>k</em></span>-mers in the sequence that lack an ambiguous nucleotide (i.e., they were queried against the database). Consider the example [...]
+<p>"562:13 561:4 A:31 0:1 562:3" would indicate that:</p>
+<ul>
+<li>the first 13 <span class="math"><em>k</em></span>-mers mapped to taxonomy ID #562</li>
+<li>the next 4 <span class="math"><em>k</em></span>-mers mapped to taxonomy ID #561</li>
+<li>the next 31 <span class="math"><em>k</em></span>-mers contained an ambiguous nucleotide</li>
+<li>the next <span class="math"><em>k</em></span>-mer was not in the database</li>
+<li>the last 3 <span class="math"><em>k</em></span>-mers mapped to taxonomy ID #562</li>
+</ul>
+<p>In this case, ID #561 is the parent node of #562. Here, a label of #562 for this sequence would have a score of <span class="math"><em>C</em></span>/<span class="math"><em>Q</em></span> = (13+3)/(13+4+1+3) = 16/21. A label of #561 would have a score of <span class="math"><em>C</em></span>/<span class="math"><em>Q</em></span> = (13+4+3)/(13+4+1+3) = 20/21. If a user specified a threshold over 16/21, kraken-filter would adjust the original label from #562 to #561; if the threshold was g [...]
+<p><code>kraken-filter</code> is used like this:</p>
+<pre><code>kraken-filter --db $DBNAME [--threshold NUM] kraken.output</code></pre>
+<p>If not specified, the threshold will be 0. <code>kraken-filter</code>'s output is similar to <code>kraken</code>'s, but a new field between the length and LCA mapping list is present, indicating the new label's score (or the root label's score if the sequence has become unclassified).</p>
+<p>To give some guidance toward selecting an appropriate threshold, we show here the results of different thresholds on the MiSeq metagenome from the <a href="http://genomebiology.com/2014/15/3/R46">Kraken paper</a> (see the paper for more details; note that the database used here is more recent than that used in the paper). Precision, sensitivity, and F-score are measured at the genus rank:</p>
+<div id="confidence-score-table">
+<table>
+<thead>
+<tr class="header">
+<th align="left">Thres</th>
+<th align="left">Prec</th>
+<th align="left">Sens</th>
+<th align="left">F-score</th>
+</tr>
+</thead>
+<tbody>
+<tr class="odd">
+<td align="left">0</td>
+<td align="left">95.43</td>
+<td align="left">77.32</td>
+<td align="left">85.43</td>
+</tr>
+<tr class="even">
+<td align="left">0.05</td>
+<td align="left">97.28</td>
+<td align="left">76.31</td>
+<td align="left">85.53</td>
+</tr>
+<tr class="odd">
+<td align="left">0.10</td>
+<td align="left">98.25</td>
+<td align="left">75.13</td>
+<td align="left">85.15</td>
+</tr>
+<tr class="even">
+<td align="left">0.15</td>
+<td align="left">98.81</td>
+<td align="left">73.87</td>
+<td align="left">84.54</td>
+</tr>
+<tr class="odd">
+<td align="left">0.20</td>
+<td align="left">99.13</td>
+<td align="left">72.82</td>
+<td align="left">83.96</td>
+</tr>
+<tr class="even">
+<td align="left">0.25</td>
+<td align="left">99.38</td>
+<td align="left">71.74</td>
+<td align="left">83.33</td>
+</tr>
+<tr class="odd">
+<td align="left">0.30</td>
+<td align="left">99.55</td>
+<td align="left">70.75</td>
+<td align="left">82.71</td>
+</tr>
+<tr class="even">
+<td align="left">0.35</td>
+<td align="left">99.61</td>
+<td align="left">69.53</td>
+<td align="left">81.90</td>
+</tr>
+<tr class="odd">
+<td align="left">0.40</td>
+<td align="left">99.66</td>
+<td align="left">68.35</td>
+<td align="left">81.09</td>
+</tr>
+<tr class="even">
+<td align="left">0.45</td>
+<td align="left">99.70</td>
+<td align="left">66.93</td>
+<td align="left">80.09</td>
+</tr>
+<tr class="odd">
+<td align="left">0.50</td>
+<td align="left">99.71</td>
+<td align="left">65.49</td>
+<td align="left">79.06</td>
+</tr>
+</tbody>
+</table>
+</div>
+<p>As can be seen, with no threshold (i.e., Kraken's original labels), Kraken's precision is fairly high, but it does increase with the threshold. Diminishing returns apply, however, and there is a loss in sensitivity that must be taken into account when deciding on the threshold to use for your own project.</p>
+<h1 id="kraken-environment-variables"><a href="#kraken-environment-variables">Kraken Environment Variables</a></h1>
+<p>The Kraken programs (with the exception of <code>kraken-build</code>) support the use of some environment variables to help in reducing command line lengths:</p>
+<ul>
+<li><p><strong><code>KRAKEN_NUM_THREADS</code></strong>: this variable is only used by <code>kraken</code>; if the <code>--threads</code> option is not supplied to <code>kraken</code>, then the value of this variable (if it is set) will be used as the number of threads to run <code>kraken</code>.</p></li>
+<li><p><strong><code>KRAKEN_DB_PATH</code></strong>: much like the <code>PATH</code> variable is used for executables by your shell, <code>KRAKEN_DB_PATH</code> is a colon-separated list of directories that will be searched for the database you name if the named database does not have a slash (<code>/</code>) character. By default, Kraken assumes the value of this variable is "<code>.</code>" (i.e., the current working directory). This variable can be used to create one (or mor [...]
+<pre><code>export KRAKEN_DB_PATH="/home/user/my_kraken_dbs:/data/kraken_dbs:"</code></pre>
+<p>This will cause three directories to be searched, in this order:</p>
+<ol style="list-style-type: decimal">
+<li><code>/home/user/my_kraken_dbs</code></li>
+<li><code>/data/kraken_dbs</code></li>
+<li>the current working directory (caused by the empty string as the third colon-separated field in the <code>KRAKEN_DB_PATH</code> string)</li>
+</ol>
+<p>The search for a database will stop when a name match is found; if two directories in the <code>KRAKEN_DB_PATH</code> have databases with the same name, the directory of the two that is searched first will have its database selected.</p>
+<p>If the above variable and value are used, and the databases <code>/data/kraken_dbs/mainDB</code> and <code>./mainDB</code> are present, then</p>
+<pre><code>kraken --db mainDB sequences.fa</code></pre>
+<p>will classify <code>sequences.fa</code> using <code>/data/kraken_dbs/mainDB</code>; if instead you wanted to use the <code>mainDB</code> present in the current directory, you would need to specify a directory path to that database in order to circumvent searching, e.g.:</p>
+<pre><code>kraken --db ./mainDB sequences.fa</code></pre>
+<p>Note that the <code>KRAKEN_DB_PATH</code> directory list can be skipped by the use of any absolute (beginning with <code>/</code>) or relative pathname (including at least one <code>/</code>) as the database name.</p></li>
+<li><p><strong><code>KRAKEN_DEFAULT_DB</code></strong>: if no database is supplied with the <code>--db</code> option, the database named in this variable will be used instead. Using this variable, you can avoid using <code>--db</code> if you only have a single database that you usually use, e.g. in bash:</p>
+<pre><code>export KRAKEN_DEFAULT_DB="/home/user/krakendb"
+kraken sequences.fa | kraken-report > sequences.kreport</code></pre>
+<p>This will classify <code>sequences.fa</code> using the <code>/home/user/krakendb</code> directory.</p>
+<p>Note that the value of <code>KRAKEN_DEFAULT_DB</code> will also be interpreted in the context of the value of <code>KRAKEN_DB_PATH</code> if you don't set <code>KRAKEN_DEFAULT_DB</code> to an absolute or relative pathname. Given the earlier example in this section, the following:</p>
+<pre><code>export KRAKEN_DEFAULT_DB="mainDB"
+kraken sequences.fa</code></pre>
+<p>will use <code>/data/kraken_dbs/mainDB</code> to classify <code>sequences.fa</code>.</p></li>
+</ul>
+<h1 id="upgrading-databases-to-v0.10"><a href="#upgrading-databases-to-v0.10">Upgrading Databases to v0.10+</a></h1>
+<p>The minimizer ordering in Kraken versions prior to v0.10.0-beta was a simple lexicographical ordering that provided a suboptimal distribution of k-mers within the bins. Ideally, the bin sizes would be uniform, but simple lexicographical ordering creates a bias toward low-complexity minimizers. To resolve this, the ordering is now "scrambled" by XORing all minimizers with a predefined constant to toggle half of each minimizer's bits before sorting. The more evenly distributed [...]
+<ol style="list-style-type: decimal">
+<li><p>Build a new database. This is the preferred option, as a newly-created database will have the latest genomes and NCBI taxonomy information.</p></li>
+<li><p>Re-sort an existing database. If you have a custom database, you may want to simply reformat the database to provide you with Kraken's increased speed. To do so, you'll need to do the following:</p>
+<pre><code>kraken-build --upgrade --db $DBNAME</code></pre>
+<p>(<strong>Note</strong>: the <code>--threads</code> switch is both valid and encouraged with this operation.)</p>
+<p>This command will <strong>not</strong> delete your existing <code>$DBNAME/database.*</code> files, but will simply rename them. If you're satisfied with the new database's performance, then you can use <code>kraken-build</code>'s <code>--clean</code> option to remove the old files and save space.</p>
+<p>Sorting the database is step 3 of the build process, so you should expect a database upgrade to take about as long as step 3 took when building the original database.</p></li>
+</ol>
+<p>Note that the rest of Kraken v0.10.0-beta's speed improvements are available without upgrading or changing your database.</p>
+</body>
+</html>
diff --git a/docs/MANUAL.markdown b/docs/MANUAL.markdown
new file mode 100644
index 0000000..903a1ea
--- /dev/null
+++ b/docs/MANUAL.markdown
@@ -0,0 +1,752 @@
+Introduction
+============
+
+[Kraken] is a taxonomic sequence classifier that assigns taxonomic
+labels to short DNA reads. It does this by examining the $k$-mers
+within a read and querying a database with those $k$-mers. This database
+contains a mapping of every $k$-mer in [Kraken]'s genomic library to the
+lowest common ancestor (LCA) in a taxonomic tree of all genomes that
+contain that $k$-mer. The set of LCA taxa that correspond to the $k$-mers
+in a read are then analyzed to create a single taxonomic label for the
+read; this label can be any of the nodes in the taxonomic tree.
+[Kraken] is designed to be rapid, sensitive, and highly precise. Our
+tests on various real and simulated data have shown [Kraken] to have
+sensitivity slightly lower than Megablast with precision being slightly
+higher. On a set of simulated 100 bp reads, [Kraken] processed over 1.3
+million reads per minute on a single core in normal operation, and over
+4.1 million reads per minute in quick operation.
+
+The latest released version of Kraken will be available at the
+[Kraken website], and the latest updates to the Kraken source code
+are available at the [Kraken GitHub repository].
+
+If you use [Kraken] in your research, please cite the [Kraken paper].
+Thank you!
+
+[Kraken]: http://ccb.jhu.edu/software/kraken/
+[Kraken website]: http://ccb.jhu.edu/software/kraken/
+[Kraken paper]: http://genomebiology.com/2014/15/3/R46
+[Kraken GitHub repository]: https://github.com/DerrickWood/kraken
+
+
+System Requirements
+===================
+
+Note: Users concerned about the disk or memory requirements should
+read the paragraph about MiniKraken, below.
+
+* **Disk space**: Construction of Kraken's standard database will require at
+ least 160 GB of disk space. Customized databases may require
+ more or less space. Disk space used is linearly proportional
+ to the number of distinct $k$-mers; as of Feb. 2015, Kraken's
+ default database contains just under 6 billion (6e9) distinct $k$-mers.
+
+ In addition, the disk used to store the database should be
+ locally-attached storage. Storing the database on a network
+ filesystem (NFS) partition can cause Kraken's operation to be
+ very slow, or to be stopped completely. As NFS accesses are
+ much slower than local disk accesses, both preloading and database
+ building will be slowed by use of NFS.
+
+* **Memory**: To run efficiently, Kraken requires enough free memory to
+ hold the database in RAM. While this can be accomplished using a
+ ramdisk, Kraken supplies a utility for loading the database into
+ RAM via the OS cache. The default database size is 75 GB (as of
+ Feb. 2015), and so you will need at least that much RAM if you want
+ to build or run with the default database.
+
+* **Dependencies**: Kraken currently makes extensive use of Linux utilities
+ such as sed, find, and wget. Many scripts are written using the
+ Bash shell, and the main scripts are written using Perl. Core
+ programs needed to build the database and run the classifier are
+ written in C++, and need to be compiled using g++. Multithreading
+ is handled using OpenMP. Downloads of NCBI data are performed by
+ wget and in some cases, by rsync. Most Linux systems that have any
+ sort of development package installed will have all of the above
+ listed programs and libraries available.
+
+ Finally, if you want to build your own database, you will need to
+ install the [Jellyfish] $k$-mer counter. Note that Kraken only
+ supports use of Jellyfish version 1. Jellyfish version 2 is not
+ yet compatible with Kraken.
+
+* **Network connectivity**: Kraken's standard database build and download
+ commands expect unfettered FTP and rsync access to the NCBI FTP
+ server. If you're working behind a proxy, you may need to set
+ certain environment variables (such as `ftp_proxy` or `RSYNC_PROXY`)
+ in order to get these commands to work properly.
+
+* **MiniKraken**: To allow users with low-memory computing environments to
+ use Kraken, we supply a reduced standard database that can be
+ downloaded from the Kraken web site. When Kraken is run with a
+ reduced database, we call it MiniKraken.
+
+ The database we make available is only 4 GB in size, and should
+ run well on computers with as little as 8 GB of RAM. Disk space
+ required for this database is also only 4 GB.
+
+[Jellyfish]: http://www.cbcb.umd.edu/software/jellyfish/
+
+
+Installation
+============
+
+To begin using Kraken, you will first need to install it, and then
+either download or create a database.
+
+Kraken consists of two main scripts ("`kraken`" and "`kraken-build`"),
+along with several programs and smaller scripts. As part of the
+installation process, all scripts and programs are installed in
+the same directory. After installation, you can move the main
+scripts elsewhere, but moving the other scripts and programs
+requires editing the scripts and changing the "`$KRAKEN_DIR`" variables.
+
+Once a directory is selected, you need to run the following
+command in the directory where you extracted the Kraken
+source:
+
+ ./install_kraken.sh $KRAKEN_DIR
+
+(Replace "`$KRAKEN_DIR`" above with the directory where you want to
+install Kraken's programs/directories.)
+
+The `install_kraken.sh` script should compile all of Kraken's code
+and setup your Kraken data directory. Installation is successful
+if you see the message "`Kraken installation complete.`"
+
+Once installation is complete, you may want to copy the two main
+Kraken scripts into a directory found in your `PATH` variable
+(e.g., "`$HOME/bin`"):
+
+ cp $KRAKEN_DIR/bin/kraken $HOME/bin
+ cp $KRAKEN_DIR/bin/kraken-build $HOME/bin
+
+After installation, you're ready to either create or download a
+database.
+
+
+Kraken Databases
+================
+
+A Kraken database is a directory containing at least 4 files:
+
+* `database.kdb`: Contains the $k$-mer to taxon mappings
+* `database.idx`: Contains minimizer offset locations in database.kdb
+* `taxonomy/nodes.dmp`: Taxonomy tree structure + ranks
+* `taxonomy/names.dmp`: Taxonomy names
+
+Other files may be present as part of the database build process.
+
+In interacting with Kraken, you should not have to directly reference
+any of these files, but rather simply provide the name of the directory
+in which they are stored. Kraken allows both the use of a standard
+database as well as custom databases; these are described in the sections
+[Standard Kraken Database] and [Custom Databases] below, respectively.
+
+
+Standard Kraken Database
+========================
+
+To create the standard Kraken database, you can use the following command:
+
+ kraken-build --standard --db $DBNAME
+
+(Replace "`$DBNAME`" above with your preferred database name/location.
+Please note that the database will use approximately 160 GB of
+disk space during creation.)
+
+This will download NCBI taxonomic information, as well as the
+complete genomes in RefSeq for the bacterial, archaeal, and
+viral domains. After downloading all this data, the build
+process begins; this is the most time-consuming step. If you
+have multiple processing cores, you can run this process with
+multiple threads, e.g.:
+
+ kraken-build --standard --threads 16 --db $DBNAME
+
+Using 16 threads on a computer with 122 GB of RAM, the build
+process took approximately an hour and a half (steps with an asterisk
+have some multi-threading enabled) in February 2015:
+
+ 7m48s *Step 1 (create set)
+ n/a Step 2 (reduce database, optional and skipped)
+ 53m16s *Step 3 (sort set)
+ 1m04s Step 4 (GI number to sequence ID map)
+ 0m27s Step 5 (Sequence ID to taxon map)
+ 29m20s *Step 6 (set LCA values)
+ ------
+ 91m55s Total build time
+
+Note that if any step (including the initial downloads) fails,
+the build process will abort. However, `kraken-build` will
+produce checkpoints throughout the installation process, and
+will restart the build at the last incomplete step if you
+attempt to run the same command again on a partially-built
+database.
+
+To create a custom database, or to use a database from another
+source, see [Custom Databases].
+
+Notes for users with lower amounts of RAM:
+
+1) If you encounter problems with Jellyfish not being able
+to allocate enough memory on your system to run the build
+process, you can supply a smaller hash size to Jellyfish
+using `kraken-build`'s `--jellyfish-hash-size` switch. Each space
+in the hash table uses approximately 6.9 bytes, so using
+"`--jellyfish-hash-size 6400M`" will use a hash table size of
+6.4 billion spaces and require 44.3 GB of RAM.
+
+2) Kraken's build process will normally attempt to minimize
+disk writing by allocating large blocks of RAM and operating
+within them until data needs to be written to disk. However,
+this extra RAM usage may exceed your capacity. In such cases,
+you may want to use `kraken-build`'s `--work-on-disk` switch. This
+will minimize the amount of RAM usage and cause Kraken's build
+programs to perform most operations off of disk files. This
+switch can also be useful for people building on a ramdisk or
+solid state drive. Please note that working off of disk files
+can be quite slow on some computers, causing builds to take
+several days if not weeks.
+
+
+Classification
+==============
+
+To classify a set of sequences (reads), use the `kraken` command:
+
+ kraken --db $DBNAME seqs.fa
+
+Output will be sent to standard output by default. The files
+containing the sequences to be classified should be specified
+on the command line. Sequences can also be provided through
+standard input using the special filename `/dev/fd/0`.
+
+Note that to obtain optimum speeds, Kraken's database should be
+loaded into RAM first. This can be done through use of a ramdisk,
+if you have superuser permissions. Failing that, you can use
+the `--preload` switch to `kraken`, e.g.:
+
+ kraken --preload --db $DBNAME seqs.fa
+
+The database files will be loaded before classification using this
+switch. See [Memory Usage and Efficiency] for more information.
+
+The `kraken` program allows several different options:
+
+* **Multithreading**: Use the `--threads NUM` switch to use multiple
+ threads.
+
+* **Quick operation**: Rather than searching all $k$-mers in a sequence,
+ stop classification after the first database hit; use `--quick`
+ to enable this mode. Note that `--min-hits` will allow you to
+ require multiple hits before declaring a sequence classified,
+ which can be especially useful with custom databases when testing
+ to see if sequences either do or do not belong to a particular
+ genome.
+
+* **Sequence filtering**: Classified or unclassified sequences can be
+ sent to a file for later processing, using the `--classified-out`
+ and `--unclassified-out` switches, respectively.
+
+* **Output redirection**: Output can be directed using standard shell
+ redirection (`|` or `>`), or using the `--output` switch.
+
+* **FASTQ input**: Input is normally expected to be in FASTA format, but
+ you can classify FASTQ data using the `--fastq-input` switch.
+
+* **Compressed input**: Kraken can handle gzip and bzip2 compressed
+ files as input by specifying the proper switch of `--gzip-compressed`
+ or `--bzip2-compressed`.
+
+* **Input format auto-detection**: If regular files are specified on
+ the command line as input, Kraken will attempt to determine the
+ format of your input prior to classification. You can disable this
+ by explicitly specifying `--fasta-input`, `--fastq-input`,
+ `--gzip-compressed`, and/or `--bzip2-compressed` as appropriate.
+ Note that use of the character device file `/dev/fd/0` to read
+ from standard input (aka `stdin`) will **not** allow auto-detection.
+
+* **Paired reads**: Kraken does not query $k$-mers containing ambiguous
+ nucleotides (non-ACGT). If you have paired reads, you can use this
+ fact to your advantage and increase Kraken's accuracy by concatenating
+ the pairs together with a single `N` between the sequences. Using the
+ `--paired` option when running `kraken` will automatically do this for
+ you; simply specify the two mate pair files on the command line. We
+ have found this to raise sensitivity by about 3 percentage points over
+ classifying the sequences as single-end reads.
+
+To get a full list of options, use `kraken --help`.
+
+
+Output Format
+=============
+
+Each sequence classified by Kraken results in a single line of
+output. Output lines contain five tab-delimited fields; from
+left to right, they are:
+
+1) "C"/"U": one letter code indicating that the sequence was
+ either classified or unclassified.
+2) The sequence ID, obtained from the FASTA/FASTQ header.
+3) The taxonomy ID Kraken used to label the sequence; this is
+ 0 if the sequence is unclassified.
+4) The length of the sequence in bp.
+5) A space-delimited list indicating the LCA mapping of each $k$-mer
+ in the sequence. For example, "562:13 561:4 A:31 0:1 562:3"
+ would indicate that:
+ - the first 13 $k$-mers mapped to taxonomy ID #562
+ - the next 4 $k$-mers mapped to taxonomy ID #561
+ - the next 31 $k$-mers contained an ambiguous nucleotide
+ - the next $k$-mer was not in the database
+ - the last 3 $k$-mers mapped to taxonomy ID #562
+
+For users who want the full taxonomic name associated with each input
+sequence, we provide a script named `kraken-translate` that produces two
+different output formats for classified sequences. The script operates
+on the output of `kraken`, like so:
+
+ kraken --db $DBNAME sequences.fa > sequences.kraken
+ kraken-translate --db $DBNAME sequences.kraken > sequences.labels
+
+(The same database used to run `kraken` should be used to translate the
+output; see [Kraken Environment Variables] below for ways to reduce
+redundancy on the command line.)
+
+The file `sequences.labels` generated by the above example is a text file
+with two tab-delimited columns, and one line for each classified sequence
+in `sequences.fa`; unclassified sequences are not reported by
+`kraken-translate`. The first column of `kraken-translate`'s output are the
+sequence IDs of the classified sequences, and the second column contains
+the taxonomy of the sequence. For example, an output line from `kraken` of:
+
+ C SEQ1 562 36 562:6
+
+Would result in a corresponding output line from `kraken-translate` of:
+
+ SEQ1 root;cellular organisms;Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacteriales;Enterobacteriaceae;Escherichia;Escherichia coli
+
+Alternatively, `kraken-translate` accepts the option `--mpa-format` which
+will report only levels of the taxonomy with standard rank assignments
+(superkingdom, kingdom, phylum, class, order, family, genus, species),
+and uses pipes to delimit the various levels of the taxonomy. For example,
+`kraken-translate --mpa-format --db $DBNAME` with the above example output
+from `kraken` would result in the following line of output:
+
+ SEQ1 d__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Escherichia|s__Escherichia_coli
+
+Taxonomy assignments above the superkingdom (`d__`) rank are represented as
+just "root" when using the `--mpa-report` option with `kraken-translate`.
+
+
+Custom Databases
+================
+
+We realize the standard database may not suit everyone's needs. Kraken
+also allows creation of customized databases.
+
+To build a custom database:
+
+1) Install a taxonomy. Usually, you will just use the NCBI taxonomy,
+ which you can easily download using:
+
+ kraken-build --download-taxonomy --db $DBNAME
+
+ This will download the GI number to taxon map, as well as the
+ taxonomic name and tree information from NCBI. These files can
+ be found in `$DBNAME/taxonomy/` . If you need to modify the taxonomy,
+ edits can be made to the `names.dmp` and `nodes.dmp` files in this directory;
+ the `gi_taxid_nucl.dmp` file will also need to be updated appropriately.
+
+2) Install a genomic library. Four sets of standard genomes are
+ made easily available through `kraken-build`:
+
+ - bacteria: RefSeq complete bacterial/archaeal genomes
+ - plasmids: RefSeq plasmid sequences
+ - viruses: RefSeq complete viral genomes
+ - human: GRCh38 human genome
+
+ To download and install any one of these, use the `--download-library`
+ switch, e.g.:
+
+ kraken-build --download-library bacteria --db $DBNAME
+
+ Other genomes can also be added, but such genomes must meet certain
+ requirements:
+ - Sequences must be in a FASTA file (multi-FASTA is allowed)
+ - Each sequence's ID (the string between the `>` and the first
+ whitespace character on the header line) must contain either
+ a GI number to allow Kraken to lookup the correct taxa, or an
+ explicit assignment of the taxonomy ID using `kraken:taxid` (see below).
+
+ Replicons not downloaded from NCBI may need their taxonomy information
+ assigned explicitly. This can be done using the string `kraken:taxid|XXX`
+ in the sequence ID, with `XXX` replaced by the desired taxon ID. For
+ example, to put a known adapter sequence in taxon 32630 ("synthetic
+ construct"), you could use the following:
+
+ >sequence16|kraken:taxid|32630 Adapter sequence
+ CAAGCAGAAGACGGCATACGAGATCTTCGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
+
+ The `kraken:taxid` string must begin the sequence ID or be immediately
+ preceded by a pipe character (`|`). Explicit assignment of taxonomy IDs
+ in this manner will override the GI number mapping provided by NCBI.
+
+ If your genomes meet the requirements above, then you can add each
+ replicon to your database's genomic library using the `--add-to-library`
+ switch, e.g.:
+
+ kraken-build --add-to-library chr1.fa --db $DBNAME
+ kraken-build --add-to-library chr2.fa --db $DBNAME
+
+ Note that if you have a list of files to add, you can do something like
+ this in `bash`:
+
+ for file in chr*.fa
+ do
+ kraken-build --add-to-library $file --db $DBNAME
+ done
+
+ Or even add all `*.fa` files found in the directory `genomes`:
+
+ find genomes/ -name '*.fa' -print0 | \
+ xargs -0 -I{} -n1 kraken-build --add-to-library {} --db $DBNAME
+
+ (You may also find the `-P` option to `xargs` useful to add many files in
+ parallel if you have multiple processors.)
+
+3) Once your library is finalized, you need to build the database.
+ Depending on your size requirements, you may want to adjust the
+ $k$-mer and/or minimizer lengths from the defaults. Except for some
+ small bookkeeping fields, a Kraken database will use
+ $sD$ + $8(4^{M})$
+ bytes, where $s$ is the number of bytes used to store the $k$-mer/taxon
+ pair (usually 12, but lower for smaller $k$-mers), $D$ is the number of
+ distinct $k$-mers in your library and
+ $M$ is the length (in bp) of the minimizers. Although $D$ does increase
+ as $k$ increases, it is impossible to know exactly how many distinct
+ $k$-mers will exist in a library for a given $k$ without actually
+ performing the count. By default, $k$ = 31 and $M$ = 15.
+
+ The minimizers serve to keep $k$-mers that are adjacent in query
+ sequences close to each other in the database, which allows
+ Kraken to exploit the CPU cache. Changing the value of $M$ can
+ significantly affect the speed of Kraken, and neither increasing
+ or decreasing $M$ will guarantee faster or slower speed.
+
+ To build the database, you'll use the `--build` switch:
+
+ kraken-build --build --db $DBNAME
+
+ As noted above, you may want to also use any of `--threads`,
+ `--kmer-len`, or `--minimizer-len` to adjust the database build
+ time and/or final size.
+
+4) Shrinking the database: The "--shrink" task allows you to take
+ an existing Kraken database and create a smaller MiniKraken database
+ from it. The use of this option removes all but a specified number of
+ $k$-mer/taxon pairs to create a new, smaller database. For example:
+
+ kraken-build --shrink 10000 --db $DBNAME --new-db minikraken
+
+ This will create a new database named `minikraken` that contains
+ 10000 $k$-mers selected from across the original database (`$DBNAME`).
+
+ The `--shrink` task is only meant to be run on a completed database.
+ However, if you know before you create a database that you will
+ only be able to use a certain amount of memory, you can use the
+ `--max-db-size` switch for the `--build` task to provide a maximum
+ size (in GB) for the database. This allows you to create a MiniKraken
+ database without having to create a full Kraken database first.
+
+A full list of options for `kraken-build` can be obtained using
+`kraken-build --help`.
+
+After building a database, if you want to reduce the disk usage of
+the database you can use `kraken-build`'s `--clean` switch to remove
+all intermediate files from the database directory.
+
+Memory Usage and Efficiency
+===========================
+
+Kraken's execution requires many random accesses to a very large file.
+To obtain maximal speed, these accesses need to be made as quickly as
+possible. This means that the database must be in physical memory
+during execution. Although we provide the `--preload` option to Kraken
+for users who cannot use a ramdisk, the ramdisk is likely the simplest
+option, and is well-suited for installations on computers where Kraken
+is to be run a majority of the time. In addition, using a ramdisk
+allows the initial start-up of Kraken to be accomplished much more quickly.
+If a ramdisk is used, the `--preload` switch should not be used.
+
+We also note that in some cases, `--preload` may not be needed (or even
+advisable). If you know that your database is already in memory (for
+example, if it has been recently read or unzipped, then it should be in
+your operating system cache, which resides in physical memory), then there
+is no need to perform this step. We have noticed that in low-memory (~8 GB)
+situations, preloading a MiniKraken DB is actually much slower than simply
+using `cat minikraken/database.* > /dev/null`. The selection of the best way
+to get the database into memory is dependent on several factors, including
+your total amount of RAM, operating system, and current free memory. For this
+reason, you may need to experiment with your own setup to find a good solution
+for you.
+
+To create a ramdisk, you will need to have superuser (root) permission.
+As root, you can use the following commands to create a ramdisk:
+
+ mkdir /ramdisk
+ mount -t ramfs none /ramdisk
+
+Optionally, you may have a trusted user who you want to be able to copy
+databases into this directory. In that case, you'll need to make that
+user the owner of the directory via chown.
+
+To put the database on the ramdisk, simply copy the database directory
+to the ramdisk directory:
+
+ cp -a $DBNAME /ramdisk
+
+And then you can use it with Kraken by specifying the database copy on
+the ramdisk, e.g.:
+
+ kraken --db /ramdisk/$DBNAME seqs.fa
+
+Note that anything copied into a ramdisk will be deleted if the ramdisk
+is unmounted or the computer is restarted, so make sure that you have a
+copy of the database on a hard disk (or other non-volatile storage).
+
+
+Note that when using the `--paired` option, Kraken will not (by default)
+make any attempt to ensure that the two files you specify are indeed
+matching sets of paired-end reads. To verify that the names of each
+read do indeed match, you can use the `--check-names` option in
+combination with the `--paired` option.
+
+
+Sample Reports
+==============
+
+To get an idea as to Kraken's results across an entire sample, we provide
+the `kraken-report` script. It is used like this:
+
+ kraken-report --db $DBNAME kraken.output
+
+Note that the database used must be the same as the one used to generate
+the output file, or the report script may encounter problems. Output is
+sent to standard output.
+
+The output of `kraken-report` is tab-delimited, with one line per taxon.
+The fields of the output, from left-to-right, are as follows:
+
+1) Percentage of reads covered by the clade rooted at this taxon
+2) Number of reads covered by the clade rooted at this taxon
+3) Number of reads assigned directly to this taxon
+4) A rank code, indicating (U)nclassified, (D)omain, (K)ingdom,
+ (P)hylum, (C)lass, (O)rder, (F)amily, (G)enus, or (S)pecies.
+ All other ranks are simply '-'.
+5) NCBI taxonomy ID
+6) indented scientific name
+
+The scientific names are indented using spaces, according to the tree
+structure specified by the taxonomy.
+
+By default, taxa with no reads assigned to (or under) them will not have
+any output produced. However, if you wish to have all taxa displayed,
+you can use the `--show-zeros` switch to do so. This can be useful if
+you are looking to do further downstream analysis of the reports, and
+want to compare samples. Sorting by the taxonomy ID (using `sort -nf5`)
+can provide a consistent line ordering between reports.
+
+In addition, we also provide the program `kraken-mpa-report`; this program
+provides output in a format similar to MetaPhlAn's tab-delimited output.
+For `kraken-mpa-report`, multiple Kraken output files can be specified on
+the command line and each will be treated as a separate sample. For each
+taxon at the standard ranks (from domain to species), the count of reads
+in each sample assigned to any node in the clade rooted at that taxon is
+displayed. `kraken-mpa-report` is run in the same manner as `kraken-report`,
+and its output is also sent to standard output.
+
+
+Confidence Scoring
+==================
+
+At present, we have not yet developed a confidence score with a solid
+probabilistic interpretation for Kraken. However, we have developed a
+simple scoring scheme that has yielded good results for us, and we've
+made that available in the `kraken-filter` script. The approach we use
+allows a user to specify a threshold score in the [0,1] interval; the
+`kraken-filter` script then will adjust labels up the tree until the
+label's score (described below) meets or exceeds that threshold. If
+a label at the root of the taxonomic tree would not have a score exceeding
+the threshold, the sequence is called unclassified by kraken-filter.
+
+A sequence label's score is a fraction $C$/$Q$, where $C$ is the number of
+$k$-mers mapped to LCA values in the clade rooted at the label, and $Q$ is the
+number of $k$-mers in the sequence that lack an ambiguous nucleotide (i.e.,
+they were queried against the database). Consider the example of the
+LCA mappings in Kraken's output given earlier:
+
+"562:13 561:4 A:31 0:1 562:3" would indicate that:
+
+* the first 13 $k$-mers mapped to taxonomy ID #562
+* the next 4 $k$-mers mapped to taxonomy ID #561
+* the next 31 $k$-mers contained an ambiguous nucleotide
+* the next $k$-mer was not in the database
+* the last 3 $k$-mers mapped to taxonomy ID #562
+
+In this case, ID #561 is the parent node of #562. Here, a label of #562
+for this sequence would have a score of $C$/$Q$ = (13+3)/(13+4+1+3) = 16/21.
+A label of #561 would have a score of $C$/$Q$ = (13+4+3)/(13+4+1+3) = 20/21.
+If a user specified a threshold over 16/21, kraken-filter would adjust the
+original label from #562 to #561; if the threshold was greater than 20/21,
+the sequence would become unclassified.
+
+`kraken-filter` is used like this:
+
+ kraken-filter --db $DBNAME [--threshold NUM] kraken.output
+
+If not specified, the threshold will be 0. `kraken-filter`'s output is
+similar to `kraken`'s, but a new field between the length and LCA mapping
+list is present, indicating the new label's score (or the root label's
+score if the sequence has become unclassified).
+
+To give some guidance toward selecting an appropriate threshold, we
+show here the results of different thresholds on the MiSeq metagenome
+from the [Kraken paper] \(see the paper for more details; note that the
+database used here is more recent than that used in the paper).
+Precision, sensitivity, and F-score are measured at the genus rank:
+
+<div id="confidence-score-table">
+
+ Thres Prec Sens F-score
+ ----- ----- ----- -------
+ 0 95.43 77.32 85.43
+ 0.05 97.28 76.31 85.53
+ 0.10 98.25 75.13 85.15
+ 0.15 98.81 73.87 84.54
+ 0.20 99.13 72.82 83.96
+ 0.25 99.38 71.74 83.33
+ 0.30 99.55 70.75 82.71
+ 0.35 99.61 69.53 81.90
+ 0.40 99.66 68.35 81.09
+ 0.45 99.70 66.93 80.09
+ 0.50 99.71 65.49 79.06
+
+</div>
+
+As can be seen, with no threshold (i.e., Kraken's original labels),
+Kraken's precision is fairly high, but it does increase with the
+threshold. Diminishing returns apply, however, and there is a loss
+in sensitivity that must be taken into account when deciding on the
+threshold to use for your own project.
+
+
+Kraken Environment Variables
+============================
+
+The Kraken programs (with the exception of `kraken-build`) support the
+use of some environment variables to help in reducing command line
+lengths:
+
+* **`KRAKEN_NUM_THREADS`**: this variable is only used by `kraken`; if the
+ `--threads` option is not supplied to `kraken`, then the value of this
+ variable (if it is set) will be used as the number of threads to run
+ `kraken`.
+
+* **`KRAKEN_DB_PATH`**: much like the `PATH` variable is used for executables
+ by your shell, `KRAKEN_DB_PATH` is a colon-separated list of directories
+ that will be searched for the database you name if the named database
+ does not have a slash (`/`) character. By default, Kraken assumes the
+ value of this variable is "`.`" (i.e., the current working directory).
+ This variable can be used to create one (or more) central repositories
+ of Kraken databases in a multi-user system. Example usage in bash:
+
+ export KRAKEN_DB_PATH="/home/user/my_kraken_dbs:/data/kraken_dbs:"
+
+ This will cause three directories to be searched, in this order:
+
+ 1) `/home/user/my_kraken_dbs`
+ 2) `/data/kraken_dbs`
+ 3) the current working directory (caused by the empty string as
+ the third colon-separated field in the `KRAKEN_DB_PATH` string)
+
+ The search for a database will stop when a name match is found; if
+ two directories in the `KRAKEN_DB_PATH` have databases with the same
+ name, the directory of the two that is searched first will have its
+ database selected.
+
+ If the above variable and value are used, and the databases
+ `/data/kraken_dbs/mainDB` and `./mainDB` are present, then
+
+ kraken --db mainDB sequences.fa
+
+ will classify `sequences.fa` using `/data/kraken_dbs/mainDB`; if instead
+ you wanted to use the `mainDB` present in the current directory,
+ you would need to specify a directory path to that database in order
+ to circumvent searching, e.g.:
+
+ kraken --db ./mainDB sequences.fa
+
+ Note that the `KRAKEN_DB_PATH` directory list can be skipped by the use
+ of any absolute (beginning with `/`) or relative pathname (including
+ at least one `/`) as the database name.
+
+* **`KRAKEN_DEFAULT_DB`**: if no database is supplied with the `--db` option,
+ the database named in this variable will be used instead. Using this
+ variable, you can avoid using `--db` if you only have a single database
+ that you usually use, e.g. in bash:
+
+ export KRAKEN_DEFAULT_DB="/home/user/krakendb"
+ kraken sequences.fa | kraken-report > sequences.kreport
+
+ This will classify `sequences.fa` using the `/home/user/krakendb` directory.
+
+ Note that the value of `KRAKEN_DEFAULT_DB` will also be interpreted in
+ the context of the value of `KRAKEN_DB_PATH` if you don't set
+ `KRAKEN_DEFAULT_DB` to an absolute or relative pathname. Given the earlier
+ example in this section, the following:
+
+ export KRAKEN_DEFAULT_DB="mainDB"
+ kraken sequences.fa
+
+ will use `/data/kraken_dbs/mainDB` to classify `sequences.fa`.
+
+
+Upgrading Databases to v0.10+
+=============================
+
+The minimizer ordering in Kraken versions prior to v0.10.0-beta was a
+simple lexicographical ordering that provided a suboptimal distribution
+of k-mers within the bins. Ideally, the bin sizes would be uniform,
+but simple lexicographical ordering creates a bias toward low-complexity
+minimizers. To resolve this, the ordering is now "scrambled" by XORing all
+minimizers with a predefined constant to toggle half of each minimizer's
+bits before sorting. The more evenly distributed bins provide better
+caching performance, but databases created in this way are not compatible
+with earlier versions of Kraken. Kraken versions from v0.10.0-beta up to
+(but not including) v1.0 will support the use of the older databases, but
+we nonetheless recommend one of the two following options:
+
+1) Build a new database. This is the preferred option, as a newly-created
+ database will have the latest genomes and NCBI taxonomy information.
+
+2) Re-sort an existing database. If you have a custom database, you may
+ want to simply reformat the database to provide you with Kraken's
+ increased speed. To do so, you'll need to do the following:
+
+ kraken-build --upgrade --db $DBNAME
+
+ (**Note**: the `--threads` switch is both valid and encouraged with this
+ operation.)
+
+ This command will **not** delete your existing `$DBNAME/database.*`
+ files, but will simply rename them. If you're satisfied with the new
+ database's performance, then you can use `kraken-build`'s `--clean`
+ option to remove the old files and save space.
+
+ Sorting the database is step 3 of the build process, so you should
+ expect a database upgrade to take about as long as step 3 took when
+ building the original database.
+
+Note that the rest of Kraken v0.10.0-beta's speed improvements are available
+without upgrading or changing your database.
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 0000000..c6bf65f
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,9 @@
+all:
+ pandoc --title-prefix "Kraken Manual" \
+ --include-in-header head.html \
+ --include-before-body top.html \
+ --from markdown --to html \
+ --table-of-contents \
+ --css kraken.css \
+ --output MANUAL.html \
+ < MANUAL.markdown
diff --git a/docs/bar-bg.png b/docs/bar-bg.png
new file mode 100644
index 0000000..7b9262a
Binary files /dev/null and b/docs/bar-bg.png differ
diff --git a/docs/head.html b/docs/head.html
new file mode 100644
index 0000000..20bd435
--- /dev/null
+++ b/docs/head.html
@@ -0,0 +1 @@
+<link href='http://fonts.googleapis.com/css?family=Ubuntu:400,700,400italic' rel='stylesheet' type='text/css'>
diff --git a/docs/kraken.css b/docs/kraken.css
new file mode 100644
index 0000000..3b9ad08
--- /dev/null
+++ b/docs/kraken.css
@@ -0,0 +1,45 @@
+body {
+ background: #f0f0f0 url("bar-bg.png") top left repeat-y;
+ color: #333;
+ margin: 10px;
+ margin-left: 50px;
+ margin-bottom: 20px;
+ font-family: 'Ubuntu', sans-serif;
+}
+
+a { color: #00b0f0; }
+a:visited { color: #0090f0; }
+a:hover { color: #00d0ff; }
+a:active { color: #00f0ff; }
+
+.pretoc {
+ text-align: center;
+ font-size: 1.2em;
+}
+.title {
+ font-size: 2em;
+ font-weight: bold;
+ margin-bottom: 0;
+}
+.version {
+ font-size: 0.9em;
+}
+h1 {
+ color: #0090f0;
+ border-bottom: 1px #0090f0 solid;
+ margin-left: -10px;
+ margin-bottom: 3px;
+}
+h1 a {
+ color: #0090f0;
+ text-decoration: none;
+}
+div#confidence-score-table table th {
+ width: 7em;
+}
+pre {
+ margin-left: 4em;
+}
+code {
+ font-size: 1.2em;
+}
diff --git a/docs/top.html b/docs/top.html
new file mode 100644
index 0000000..a189458
--- /dev/null
+++ b/docs/top.html
@@ -0,0 +1,9 @@
+<div class="pretoc">
+ <p class="title">Kraken taxonomic sequence classification system</p>
+
+ <p class="version">Version 0.10.5-beta</p>
+
+ <p>Operating Manual</p>
+</div>
+
+<h1>Table of Contents</h1>
diff --git a/install_kraken.sh b/install_kraken.sh
new file mode 100755
index 0000000..fff418d
--- /dev/null
+++ b/install_kraken.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+
+# Copyright 2013-2015, Derrick Wood <dwood at cs.jhu.edu>
+#
+# This file is part of the Kraken taxonomic classification system.
+#
+# Kraken is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Kraken is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Kraken. If not, see <http://www.gnu.org/licenses/>.
+
+set -e
+
+VERSION="0.10.5-beta"
+
+if [ -z "$1" ] || [ -n "$2" ]
+then
+ echo "Usage: $(basename $0) KRAKEN_DIR"
+ exit 64
+fi
+
+if [ "$1" = "KRAKEN_DIR" ]
+then
+ echo "Please replace \"KRAKEN_DIR\" with the name of the directory"
+ echo "that you want to install Kraken in."
+ exit 1
+fi
+
+# Perl cmd used to canonicalize dirname - "readlink -f" doesn't work
+# on OS X.
+export KRAKEN_DIR=$(perl -MCwd=abs_path -le 'print abs_path(shift)' "$1")
+
+mkdir -p "$KRAKEN_DIR"
+make -C src install
+for file in scripts/*
+do
+ perl -pl -e 'BEGIN { while (@ARGV) { $_ = shift; ($k,$v) = split /=/, $_, 2; $H{$k} = $v } }'\
+ -e 's/#####=(\w+)=#####/$H{$1}/g' \
+ "KRAKEN_DIR=$KRAKEN_DIR" "VERSION=$VERSION" \
+ < "$file" > "$KRAKEN_DIR/$(basename $file)"
+ if [ -x "$file" ]
+ then
+ chmod +x "$KRAKEN_DIR/$(basename $file)"
+ fi
+done
+
+echo
+echo "Kraken installation complete."
+echo
+echo "To make things easier for you, you may want to copy/symlink the following"
+echo "files into a directory in your PATH:"
+for file in $KRAKEN_DIR/kraken*
+do
+ [ -x "$file" ] && echo " $file"
+done
diff --git a/scripts/add_to_library.sh b/scripts/add_to_library.sh
new file mode 100755
index 0000000..ee50323
--- /dev/null
+++ b/scripts/add_to_library.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+# Copyright 2013-2015, Derrick Wood <dwood at cs.jhu.edu>
+#
+# This file is part of the Kraken taxonomic sequence classification system.
+#
+# Kraken is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Kraken is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Kraken. If not, see <http://www.gnu.org/licenses/>.
+
+# Copy specified file into a Kraken library
+
+set -u # Protect against uninitialized vars.
+set -e # Stop on error
+
+LIBRARY_DIR="$KRAKEN_DB_NAME/library"
+
+if [ ! -e "$1" ]
+then
+ echo "Can't add \"$1\": file does not exist"
+ exit 1
+fi
+if [ ! -f "$1" ]
+then
+ echo "Can't add \"$1\": not a regular file"
+ exit 1
+fi
+
+if ! verify_gi_numbers.pl "$1"
+then
+ echo "Can't add \"$1\": sequence is missing GI number"
+ exit 1
+fi
+
+add_dir="$LIBRARY_DIR/added"
+mkdir -p "$add_dir"
+
+filename=$(cp_into_tempfile.pl -t "XXXXXXXXXX" -d "$add_dir" -s fna "$1")
+
+echo "Added \"$1\" to library ($KRAKEN_DB_NAME)"
diff --git a/scripts/build_kraken_db.sh b/scripts/build_kraken_db.sh
new file mode 100755
index 0000000..7df4d0b
--- /dev/null
+++ b/scripts/build_kraken_db.sh
@@ -0,0 +1,199 @@
+#!/bin/bash
+
+# Copyright 2013-2015, Derrick Wood <dwood at cs.jhu.edu>
+#
+# This file is part of the Kraken taxonomic sequence classification system.
+#
+# Kraken is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Kraken is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Kraken. If not, see <http://www.gnu.org/licenses/>.
+
+# Build a Kraken database
+# Designed to be called by kraken_build
+
+set -u # Protect against uninitialized vars.
+set -e # Stop on error
+set -o pipefail # Stop on failures in non-final pipeline commands
+
+function report_time_elapsed() {
+ curr_time=$(date "+%s.%N")
+ perl -e '$time = $ARGV[1] - $ARGV[0];' \
+ -e '$sec = int($time); $nsec = $time - $sec;' \
+ -e '$min = int($sec/60); $sec %= 60;' \
+ -e '$hr = int($min/60); $min %= 60;' \
+ -e 'print "${hr}h" if $hr;' \
+ -e 'print "${min}m" if $min || $hr;' \
+ -e 'printf "%.3fs", $sec + $nsec;' \
+ $1 $curr_time
+}
+
+start_time=$(date "+%s.%N")
+
+DATABASE_DIR="$KRAKEN_DB_NAME"
+
+if [ ! -d "$DATABASE_DIR" ]
+then
+ echo "Can't find Kraken DB directory \"$KRAKEN_DB_NAME\""
+ exit 1
+fi
+cd "$DATABASE_DIR"
+
+MEMFLAG=""
+if [ -z "$KRAKEN_WORK_ON_DISK" ]
+then
+ MEMFLAG="-M"
+ echo "Kraken build set to minimize disk writes."
+else
+ echo "Kraken build set to minimize RAM usage."
+fi
+
+if [ -n "$KRAKEN_REBUILD_DATABASE" ]
+then
+ rm -f database.* *.map lca.complete
+fi
+
+if [ -e "database.jdb" ]
+then
+ echo "Skipping step 1, k-mer set already exists."
+else
+ echo "Creating k-mer set (step 1 of 6)..."
+ start_time1=$(date "+%s.%N")
+
+ check_for_jellyfish.sh
+ # Estimate hash size as 1.15 * chars in library FASTA files
+ if [ -z "$KRAKEN_HASH_SIZE" ]
+ then
+ KRAKEN_HASH_SIZE=$(find library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -printf '%s\n' | perl -nle '$sum += $_; END {print int(1.15 * $sum)}')
+ echo "Hash size not specified, using '$KRAKEN_HASH_SIZE'"
+ fi
+
+ find library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -print0 | \
+ xargs -0 cat | \
+ jellyfish count -m $KRAKEN_KMER_LEN -s $KRAKEN_HASH_SIZE -C -t $KRAKEN_THREAD_CT \
+ -o database /dev/fd/0
+
+ # Merge only if necessary
+ if [ -e "database_1" ]
+ then
+ jellyfish merge -o database.jdb.tmp database_*
+ else
+ mv database_0 database.jdb.tmp
+ fi
+
+ # Once here, DB is finalized, can put file in place.
+ mv database.jdb.tmp database.jdb
+
+ echo "K-mer set created. [$(report_time_elapsed $start_time1)]"
+fi
+
+if [ -z "$KRAKEN_MAX_DB_SIZE" ]
+then
+ echo "Skipping step 2, no database reduction requested."
+else
+ if [ -e "database.jdb.big" ]
+ then
+ echo "Skipping step 2, database reduction already done."
+ else
+ start_time1=$(date "+%s.%N")
+ kdb_size=$(stat -c '%s' database.jdb)
+ idx_size=$(echo "8 * (4 ^ $KRAKEN_MINIMIZER_LEN + 2)" | bc)
+ resize_needed=$(echo "scale = 10; ($kdb_size+$idx_size)/(2^30) > $KRAKEN_MAX_DB_SIZE" | bc)
+ if (( resize_needed == 0 ))
+ then
+ echo "Skipping step 2, database reduction unnecessary."
+ else
+ echo "Reducing database size (step 2 of 6)..."
+ max_kdb_size=$(echo "$KRAKEN_MAX_DB_SIZE*2^30 - $idx_size" | bc)
+ if (( $(echo "$max_kdb_size < 0" | bc) == 1 ))
+ then
+ echo "Maximum database size too small, aborting reduction."
+ exit 1
+ fi
+ # Key ct is 8 byte int stored 48 bytes from start of file
+ key_ct=$(perl -MFcntl -le 'open F, "database.jdb"; seek F, 48, SEEK_SET; read F, $b, 8; $a = unpack("Q", $b); print $a')
+ # key_bits is 8 bytes from start
+ key_bits=$(perl -MFcntl -le 'open F, "database.jdb"; seek F, 8, SEEK_SET; read F, $b, 8; $a = unpack("Q", $b); print $a')
+ # this is basically ceil(key_bits / 8) - why no ceiling function, bc?
+ key_len=$(echo "($key_bits + 7) / 8" | bc)
+ # val_len is 16 bytes from start
+ val_len=$(perl -MFcntl -le 'open F, "database.jdb"; seek F, 16, SEEK_SET; read F, $b, 8; $a = unpack("Q", $b); print $a')
+ record_len=$(( key_len + val_len ))
+ new_ct=$(echo "$max_kdb_size / $record_len" | bc)
+ echo "Shrinking DB to use only $new_ct of the $key_ct k-mers"
+ db_shrink -d database.jdb -o database.jdb.small -n $new_ct
+ mv database.jdb database.jdb.big.tmp
+ mv database.jdb.small database.jdb
+ mv database.jdb.big.tmp database.jdb.big
+ echo "Database reduced. [$(report_time_elapsed $start_time1)]"
+ fi
+ fi
+fi
+
+if [ -e "database.kdb" ]
+then
+ echo "Skipping step 3, k-mer set already sorted."
+else
+ echo "Sorting k-mer set (step 3 of 6)..."
+ start_time1=$(date "+%s.%N")
+ db_sort -z $MEMFLAG -t $KRAKEN_THREAD_CT -n $KRAKEN_MINIMIZER_LEN \
+ -d database.jdb -o database.kdb.tmp \
+ -i database.idx
+
+ # Once here, DB is sorted, can put file in proper place.
+ mv database.kdb.tmp database.kdb
+
+ echo "K-mer set sorted. [$(report_time_elapsed $start_time1)]"
+fi
+
+if [ -e "gi2seqid.map" ]
+then
+ echo "Skipping step 4, GI number to seqID map already complete."
+else
+ echo "Creating GI number to seqID map (step 4 of 6)..."
+ start_time1=$(date "+%s.%N")
+ find library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -print0 | \
+ xargs -0 cat | report_gi_numbers.pl > gi2seqid.map.tmp
+ mv gi2seqid.map.tmp gi2seqid.map
+
+ echo "GI number to seqID map created. [$(report_time_elapsed $start_time1)]"
+fi
+
+if [ -e "seqid2taxid.map" ]
+then
+ echo "Skipping step 5, seqID to taxID map already complete."
+else
+ echo "Creating seqID to taxID map (step 5 of 6)..."
+ start_time1=$(date "+%s.%N")
+ make_seqid_to_taxid_map taxonomy/gi_taxid_nucl.dmp gi2seqid.map \
+ > seqid2taxid.map.tmp
+ mv seqid2taxid.map.tmp seqid2taxid.map
+ line_ct=$(wc -l seqid2taxid.map | awk '{print $1}')
+
+ echo "$line_ct sequences mapped to taxa. [$(report_time_elapsed $start_time1)]"
+fi
+
+if [ -e "lca.complete" ]
+then
+ echo "Skipping step 6, LCAs already set."
+else
+ echo "Setting LCAs in database (step 6 of 6)..."
+ start_time1=$(date "+%s.%N")
+ find library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -print0 | \
+ xargs -0 cat | \
+ set_lcas $MEMFLAG -x -d database.kdb -i database.idx \
+ -n taxonomy/nodes.dmp -t $KRAKEN_THREAD_CT -m seqid2taxid.map -F /dev/fd/0
+ touch "lca.complete"
+
+ echo "Database LCAs set. [$(report_time_elapsed $start_time1)]"
+fi
+
+echo "Database construction complete. [Total: $(report_time_elapsed $start_time)]"
diff --git a/scripts/check_for_jellyfish.sh b/scripts/check_for_jellyfish.sh
new file mode 100755
index 0000000..63cc620
--- /dev/null
+++ b/scripts/check_for_jellyfish.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+# Copyright 2013-2015, Derrick Wood <dwood at cs.jhu.edu>
+#
+# This file is part of the Kraken taxonomic sequence classification system.
+#
+# Kraken is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Kraken is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Kraken. If not, see <http://www.gnu.org/licenses/>.
+
+# Check that jellyfish is executable and is proper version
+# Designed to be called by kraken-build
+
+set -u # Protect against uninitialized vars.
+set -e # Stop on error
+set -o pipefail # Stop on failures in non-final pipeline commands
+
+JELLYFISH_VERSION=$(jellyfish --version | awk '{print $2}')
+if [[ $JELLYFISH_VERSION =~ ^1\. ]]
+then
+ echo "Found jellyfish v$JELLYFISH_VERSION"
+else
+ echo "Found jellyfish v$JELLYFISH_VERSION"
+ echo "Kraken requires jellyfish version 1"
+ exit 1
+fi
diff --git a/scripts/clean_db.sh b/scripts/clean_db.sh
new file mode 100755
index 0000000..85b9526
--- /dev/null
+++ b/scripts/clean_db.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+# Copyright 2013-2015, Derrick Wood <dwood at cs.jhu.edu>
+#
+# This file is part of the Kraken taxonomic sequence classification system.
+#
+# Kraken is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Kraken is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Kraken. If not, see <http://www.gnu.org/licenses/>.
+
+# Clean unneeded files from a database
+
+set -u # Protect against uninitialized vars.
+set -e # Stop on error
+
+cd "$KRAKEN_DB_NAME"
+
+[ -e "database.kdb" ] || (echo "Incomplete database, clean aborted."; exit 1)
+[ -e "database.idx" ] || (echo "Incomplete database, clean aborted."; exit 1)
+[ -e "taxonomy/nodes.dmp" ] || (echo "Incomplete database, clean aborted."; exit 1)
+[ -e "taxonomy/names.dmp" ] || (echo "Incomplete database, clean aborted."; exit 1)
+
+rm -rf library
+rm -f database.jdb* database_* *.map lca.complete
+mkdir newtaxo
+mv taxonomy/{nodes,names}.dmp newtaxo
+rm -rf taxonomy
+mv newtaxo taxonomy
diff --git a/scripts/cp_into_tempfile.pl b/scripts/cp_into_tempfile.pl
new file mode 100755
index 0000000..4e24ff2
--- /dev/null
+++ b/scripts/cp_into_tempfile.pl
@@ -0,0 +1,58 @@
+#!/usr/bin/perl
+
+# Copyright 2013-2015, Derrick Wood <dwood at cs.jhu.edu>
+#
+# This file is part of the Kraken taxonomic sequence classification system.
+#
+# Kraken is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Kraken is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Kraken. If not, see <http://www.gnu.org/licenses/>.
+
+# Create a file in a specified directory, then copy an
+# existing file's contents into the new file. Write name of
+# new file to standard output.
+#
+# Thanks to everyone who wrote the mktemp program and couldn't be
+# bothered to standardize the behavior.
+
+use strict;
+use warnings;
+use File::Basename;
+use File::Temp 'tempfile';
+use Getopt::Std;
+
+my $PROG = basename $0;
+getopts('d:t:s:', \my %opts) or usage();
+$opts{$_} or usage() for qw/d t s/; # all switches mandatory
+my ($directory, $template, $suffix) = @opts{qw/d t s/};
+die "$PROG: '$directory' not a directory!\n" unless -d $directory;
+die "$PROG: must specify a single filename\n" unless @ARGV == 1;
+
+$suffix =~ s/^\.//;
+my $old_filename = shift @ARGV;
+open FILE, "<", $old_filename
+ or die "$PROG: can't read $old_filename: $!\n";
+
+my ($fh, $new_filename) = tempfile($template, DIR => $directory,
+ UNLINK => 0, SUFFIX => ".$suffix");
+# copy loop
+while (<FILE>) {
+ print {$fh} $_;
+}
+close FILE;
+close $fh;
+
+print "$new_filename\n";
+
+sub usage {
+ die "$PROG: <-d directory> <-t template> <-s suffix> <filename>\n";
+}
diff --git a/scripts/download_genomic_library.sh b/scripts/download_genomic_library.sh
new file mode 100755
index 0000000..ddf2ef1
--- /dev/null
+++ b/scripts/download_genomic_library.sh
@@ -0,0 +1,119 @@
+#!/bin/bash
+
+# Copyright 2013-2015, Derrick Wood <dwood at cs.jhu.edu>
+#
+# This file is part of the Kraken taxonomic sequence classification system.
+#
+# Kraken is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Kraken is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Kraken. If not, see <http://www.gnu.org/licenses/>.
+
+# Download specific genomic libraries for use with Kraken.
+# Supported choices are:
+# bacteria - NCBI RefSeq complete bacterial/archaeal genomes
+# plasmids - NCBI RefSeq plasmid sequences
+# viruses - NCBI RefSeq complete viral DNA and RNA genomes
+# human - NCBI RefSeq GRCh38 human reference genome
+
+set -u # Protect against uninitialized vars.
+set -e # Stop on error
+
+LIBRARY_DIR="$KRAKEN_DB_NAME/library"
+NCBI_SERVER="ftp.ncbi.nih.gov"
+FTP_SERVER="ftp://$NCBI_SERVER"
+RSYNC_SERVER="rsync://$NCBI_SERVER"
+THIS_DIR=$PWD
+
+case "$1" in
+ "bacteria")
+ mkdir -p $LIBRARY_DIR/Bacteria
+ cd $LIBRARY_DIR/Bacteria
+ if [ ! -e "lib.complete" ]
+ then
+ rm -f all.fna.tar.gz
+ wget $FTP_SERVER/genomes/Bacteria/all.fna.tar.gz
+ echo -n "Unpacking..."
+ tar zxf all.fna.tar.gz
+ rm all.fna.tar.gz
+ echo " complete."
+ touch "lib.complete"
+ else
+ echo "Skipping download of bacterial genomes, already downloaded here."
+ fi
+ ;;
+ "plasmids")
+ mkdir -p $LIBRARY_DIR/Plasmids
+ cd $LIBRARY_DIR/Plasmids
+ if [ ! -e "lib.complete" ]
+ then
+ rm -f plasmids.all.fna.tar.gz
+ wget $FTP_SERVER/genomes/Plasmids/plasmids.all.fna.tar.gz
+ echo -n "Unpacking..."
+ tar zxf plasmids.all.fna.tar.gz
+ rm plasmids.all.fna.tar.gz
+ echo " complete."
+ touch "lib.complete"
+ else
+ echo "Skipping download of plasmids, already downloaded here."
+ fi
+ ;;
+ "viruses")
+ mkdir -p $LIBRARY_DIR/Viruses
+ cd $LIBRARY_DIR/Viruses
+ if [ ! -e "lib.complete" ]
+ then
+ rm -f all.fna.tar.gz
+ rm -f all.ffn.tar.gz
+ wget $FTP_SERVER/genomes/Viruses/all.fna.tar.gz
+ wget $FTP_SERVER/genomes/Viruses/all.ffn.tar.gz
+ echo -n "Unpacking..."
+ tar zxf all.fna.tar.gz
+ tar zxf all.ffn.tar.gz
+ rm all.fna.tar.gz
+ rm all.ffn.tar.gz
+ echo " complete."
+ touch "lib.complete"
+ else
+ echo "Skipping download of viral genomes, already downloaded here."
+ fi
+ ;;
+ "human")
+ mkdir -p $LIBRARY_DIR/Human
+ cd $LIBRARY_DIR/Human
+ if [ ! -e "lib.complete" ]
+ then
+ # get list of CHR_* directories
+ wget --spider --no-remove-listing $FTP_SERVER/genomes/H_sapiens/
+ directories=$(perl -nle '/^d/ and /(CHR_\w+)\s*$/ and print $1' .listing)
+ rm .listing
+
+ # For each CHR_* directory, get GRCh* fasta gzip file name, d/l, unzip, and add
+ for directory in $directories
+ do
+ wget --spider --no-remove-listing $FTP_SERVER/genomes/H_sapiens/$directory/
+ file=$(perl -nle '/^-/ and /\b(hs_ref_GRCh\w+\.fa\.gz)\s*$/ and print $1' .listing)
+ [ -z "$file" ] && exit 1
+ rm .listing
+ wget $FTP_SERVER/genomes/H_sapiens/$directory/$file
+ gunzip "$file"
+ done
+
+ touch "lib.complete"
+ else
+ echo "Skipping download of human genome, already downloaded here."
+ fi
+ ;;
+ *)
+ echo "Unsupported library. Valid options are: "
+ echo " bacteria plasmids virus human"
+ ;;
+esac
diff --git a/scripts/download_taxonomy.sh b/scripts/download_taxonomy.sh
new file mode 100755
index 0000000..fa73616
--- /dev/null
+++ b/scripts/download_taxonomy.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+
+# Copyright 2013-2015, Derrick Wood <dwood at cs.jhu.edu>
+#
+# This file is part of the Kraken taxonomic sequence classification system.
+#
+# Kraken is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Kraken is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Kraken. If not, see <http://www.gnu.org/licenses/>.
+
+# Download NCBI taxonomy information for Kraken.
+# Designed to be called by kraken_build
+
+set -u # Protect against uninitialized vars.
+set -e # Stop on error
+
+TAXONOMY_DIR="$KRAKEN_DB_NAME/taxonomy"
+NCBI_SERVER="ftp.ncbi.nih.gov"
+FTP_SERVER="ftp://$NCBI_SERVER"
+THIS_DIR=$PWD
+
+mkdir -p "$TAXONOMY_DIR"
+cd "$TAXONOMY_DIR"
+
+if [ ! -e "gimap.dlflag" ]
+then
+ wget $FTP_SERVER/pub/taxonomy/gi_taxid_nucl.dmp.gz
+ touch gimap.dlflag
+ echo "Downloaded GI to taxon map"
+fi
+
+if [ ! -e "taxdump.dlflag" ]
+then
+ wget $FTP_SERVER/pub/taxonomy/taxdump.tar.gz
+ touch taxdump.dlflag
+ echo "Downloaded taxonomy tree data"
+fi
+
+if [ ! -e "gimap.flag" ]
+then
+ gunzip gi_taxid_nucl.dmp.gz
+ touch gimap.flag
+ echo "Uncompressed GI to taxon map"
+fi
+
+if [ ! -e "taxdump.flag" ]
+then
+ tar zxf taxdump.tar.gz
+ touch taxdump.flag
+ echo "Uncompressed taxonomy tree data"
+fi
diff --git a/scripts/kraken b/scripts/kraken
new file mode 100755
index 0000000..57cc717
--- /dev/null
+++ b/scripts/kraken
@@ -0,0 +1,293 @@
+#!/usr/bin/perl
+
+# Copyright 2013-2015, Derrick Wood <dwood at cs.jhu.edu>
+#
+# This file is part of the Kraken taxonomic sequence classification system.
+#
+# Kraken is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Kraken is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Kraken. If not, see <http://www.gnu.org/licenses/>.
+
+# Wrapper for Kraken's classifier
+
+use strict;
+use warnings;
+use File::Basename;
+use Getopt::Long;
+
+my $PROG = basename $0;
+my $KRAKEN_DIR = "#####=KRAKEN_DIR=#####";
+
+# Test to see if the executables got moved, try to recover if we can
+if (! -e "$KRAKEN_DIR/classify") {
+ use Cwd 'abs_path';
+ $KRAKEN_DIR = dirname abs_path($0);
+}
+
+require "$KRAKEN_DIR/krakenlib.pm";
+$ENV{"KRAKEN_DIR"} = $KRAKEN_DIR;
+$ENV{"PATH"} = "$KRAKEN_DIR:$ENV{PATH}";
+
+my $CLASSIFY = "$KRAKEN_DIR/classify";
+my $GZIP_MAGIC = chr(hex "1f") . chr(hex "8b");
+my $BZIP2_MAGIC = "BZ";
+
+my $quick = 0;
+my $min_hits = 1;
+my $fasta_input = 0;
+my $fastq_input = 0;
+my $db_prefix;
+my $threads;
+my $preload = 0;
+my $gunzip = 0;
+my $bunzip2 = 0;
+my $paired = 0;
+my $check_names = 0;
+my $only_classified_output = 0;
+my $unclassified_out;
+my $classified_out;
+my $outfile;
+
+GetOptions(
+ "help" => \&display_help,
+ "version" => \&display_version,
+ "db=s" => \$db_prefix,
+ "threads=i" => \$threads,
+ "fasta-input" => \$fasta_input,
+ "fastq-input" => \$fastq_input,
+ "quick" => \$quick,
+ "min-hits=i" => \$min_hits,
+ "unclassified-out=s" => \$unclassified_out,
+ "classified-out=s" => \$classified_out,
+ "output=s" => \$outfile,
+ "preload" => \$preload,
+ "paired" => \$paired,
+ "check-names" => \$check_names,
+ "gzip-compressed" => \$gunzip,
+ "bzip2-compressed" => \$bunzip2,
+ "only-classified-output" => \$only_classified_output,
+);
+
+if (! defined $threads) {
+ $threads = $ENV{"KRAKEN_NUM_THREADS"} || 1;
+}
+
+if (! @ARGV) {
+ print STDERR "Need to specify input filenames!\n";
+ usage();
+}
+eval { $db_prefix = krakenlib::find_db($db_prefix); };
+if ($@) {
+ die "$PROG: $@";
+}
+
+my $taxonomy = "$db_prefix/taxonomy/nodes.dmp";
+if ($quick) {
+ undef $taxonomy; # Skip loading nodes file, not needed in quick mode
+}
+
+my $kdb_file = "$db_prefix/database.kdb";
+my $idx_file = "$db_prefix/database.idx";
+if (! -e $kdb_file) {
+ die "$PROG: $kdb_file does not exist!\n";
+}
+if (! -e $idx_file) {
+ die "$PROG: $idx_file does not exist!\n";
+}
+
+if ($min_hits > 1 && ! $quick) {
+ die "$PROG: --min_hits requires --quick to be specified\n";
+}
+
+if ($paired && @ARGV != 2) {
+ die "$PROG: --paired requires exactly two filenames\n";
+}
+
+my $compressed = $gunzip || $bunzip2;
+if ($gunzip && $bunzip2) {
+ die "$PROG: can't use both gzip and bzip2 compression flags\n";
+}
+if ($fasta_input && $fastq_input) {
+ die "$PROG: can't use both FASTA and FASTQ input flags\n";
+}
+
+my $auto_detect = 1;
+if ($fasta_input || $fastq_input || $compressed) {
+ $auto_detect = 0;
+}
+if (! -f $ARGV[0]) {
+ $auto_detect = 0;
+}
+if ($auto_detect) {
+ auto_detect_file_format();
+}
+
+# set flags for classifier
+my @flags;
+push @flags, "-d", $kdb_file;
+push @flags, "-i", $idx_file;
+push @flags, "-t", $threads if $threads > 1;
+push @flags, "-n", $taxonomy if defined $taxonomy;
+push @flags, "-q" if $quick;
+push @flags, "-m", $min_hits if $min_hits > 1;
+push @flags, "-f" if $fastq_input && ! $paired; # merger always outputs FASTA
+push @flags, "-U", $unclassified_out if defined $unclassified_out;
+push @flags, "-C", $classified_out if defined $classified_out;
+push @flags, "-o", $outfile if defined $outfile;
+push @flags, "-c", if $only_classified_output;
+push @flags, "-M" if $preload;
+
+# handle piping for decompression/merging
+my @pipe_argv;
+if ($paired) {
+ my @merge_flags;
+ push @merge_flags, "--fa" if $fasta_input;
+ push @merge_flags, "--fq" if $fastq_input;
+ push @merge_flags, "--gz" if $gunzip;
+ push @merge_flags, "--bz2" if $bunzip2;
+ push @merge_flags, "--check-names" if $check_names;
+ @pipe_argv = ("read_merger.pl", @merge_flags, @ARGV);
+}
+elsif ($compressed) {
+ if ($gunzip) {
+ @pipe_argv = ("gzip", "-dc", @ARGV);
+ }
+ elsif ($bunzip2) {
+ @pipe_argv = ("bzip2", "-dc", @ARGV);
+ }
+ else {
+ die "$PROG: unrecognized compression program! This is a Kraken bug.\n";
+ }
+}
+
+# if args exist, set up the pipe/fork/exec
+if (@pipe_argv) {
+ pipe RD, WR;
+ my $pid = fork();
+ if ($pid < 0) {
+ die "$PROG: fork error: $!\n";
+ }
+ if ($pid) {
+ open STDIN, "<&RD"
+ or die "$PROG: can't dup stdin to read end of pipe: $!\n";
+ close RD;
+ close WR;
+ @ARGV = ("/dev/fd/0"); # make classifier read from pipe
+ }
+ else {
+ open STDOUT, ">&WR"
+ or die "$PROG: can't dup stdout to write end of pipe: $!\n";
+ close RD;
+ close WR;
+ exec @pipe_argv
+ or die "$PROG: can't exec $pipe_argv[0]: $!\n";
+ }
+}
+
+exec $CLASSIFY, @flags, @ARGV;
+die "$PROG: exec error: $!\n";
+
+sub usage {
+ my $exit_code = @_ ? shift : 64;
+ my $default_db = "none";
+ eval { $default_db = '"' . krakenlib::find_db() . '"'; };
+ my $def_thread_ct = exists $ENV{"KRAKEN_NUM_THREADS"} ? (0 + $ENV{"KRAKEN_NUM_THREADS"}) : 1;
+ print STDERR <<EOF;
+Usage: $PROG [options] <filename(s)>
+
+Options:
+ --db NAME Name for Kraken DB
+ (default: $default_db)
+ --threads NUM Number of threads (default: $def_thread_ct)
+ --fasta-input Input is FASTA format
+ --fastq-input Input is FASTQ format
+ --gzip-compressed Input is gzip compressed
+ --bzip2-compressed Input is bzip2 compressed
+ --quick Quick operation (use first hit or hits)
+ --min-hits NUM In quick op., number of hits req'd for classification
+ NOTE: this is ignored if --quick is not specified
+ --unclassified-out FILENAME
+ Print unclassified sequences to filename
+ --classified-out FILENAME
+ Print classified sequences to filename
+ --output FILENAME Print output to filename (default: stdout); "-" will
+ suppress normal output
+ --only-classified-output
+ Print no Kraken output for unclassified sequences
+ --preload Loads DB into memory before classification
+ --paired The two filenames provided are paired-end reads
+ --check-names Ensure each pair of reads have names that agree
+ with each other; ignored if --paired is not specified
+ --help Print this message
+ --version Print version information
+
+If none of the *-input or *-compressed flags are specified, and the
+file is a regular file, automatic format detection is attempted.
+EOF
+ exit $exit_code;
+}
+
+sub display_help {
+ usage(0);
+}
+
+sub display_version {
+ print "Kraken version #####=VERSION=#####\n";
+ print "Copyright 2013-2015, Derrick Wood (dwood\@cs.jhu.edu)\n";
+ exit 0;
+}
+
+sub auto_detect_file_format {
+ my $magic;
+ my $filename = $ARGV[0];
+
+ # read 2-byte magic number to determine type of compression (if any)
+ open FILE, "<", $filename;
+ read FILE, $magic, 2;
+ close FILE;
+ if ($magic eq $GZIP_MAGIC) {
+ $compressed = 1;
+ $gunzip = 1;
+ }
+ elsif ($magic eq $BZIP2_MAGIC) {
+ $compressed = 1;
+ $bunzip2 = 1;
+ }
+ else {
+ # if no compression, just look at first char
+ chop $magic;
+ }
+
+ # uncompress to stream and read first char
+ if ($gunzip) {
+ open FILE, "-|", "gzip", "-dc", $filename
+ or die "$PROG: can't determine format of $filename (gzip error): $!\n";
+ read FILE, $magic, 1;
+ close FILE;
+ }
+ elsif ($bunzip2) {
+ open FILE, "-|", "bzip2", "-dc", $ARGV[0]
+ or die "$PROG: can't determine format of $filename (bzip2 error): $!\n";
+ read FILE, $magic, 1;
+ close FILE;
+ }
+
+ if ($magic eq ">") {
+ $fasta_input = 1;
+ }
+ elsif ($magic eq "@") {
+ $fastq_input = 1;
+ }
+ else {
+ die "$PROG: can't determine what format $filename is!\n";
+ }
+}
diff --git a/scripts/kraken-build b/scripts/kraken-build
new file mode 100755
index 0000000..bf36ae6
--- /dev/null
+++ b/scripts/kraken-build
@@ -0,0 +1,296 @@
+#!/usr/bin/perl
+
+# Copyright 2013-2015, Derrick Wood <dwood at cs.jhu.edu>
+#
+# This file is part of the Kraken taxonomic sequence classification system.
+#
+# Kraken is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Kraken is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Kraken. If not, see <http://www.gnu.org/licenses/>.
+
+# General build process wrapper for Kraken.
+
+use strict;
+use warnings;
+use File::Basename;
+use Getopt::Long;
+
+my $PROG = basename $0;
+my $KRAKEN_DIR = "#####=KRAKEN_DIR=#####";
+
+# Test to see if the executables got moved, try to recover if we can
+if (! -e "$KRAKEN_DIR/classify") {
+ use Cwd 'abs_path';
+ $KRAKEN_DIR = dirname abs_path($0);
+}
+
+$ENV{"KRAKEN_DIR"} = $KRAKEN_DIR;
+$ENV{"PATH"} = "$KRAKEN_DIR:$ENV{PATH}";
+
+my $DEF_MINIMIZER_LEN = 15;
+my $DEF_KMER_LEN = 31;
+my $DEF_THREAD_CT = 1;
+
+my @VALID_LIBRARY_TYPES = qw/bacteria plasmids viruses human/;
+
+# Option/task option variables
+my (
+ $db,
+ $threads,
+ $minimizer_len,
+ $kmer_len,
+ $new_db,
+ $hash_size,
+ $max_db_size,
+ $work_on_disk,
+ $shrink_block_offset,
+
+ $dl_taxonomy,
+ $dl_library,
+ $add_to_library,
+ $build,
+ $rebuild,
+ $shrink,
+ $standard,
+ $upgrade,
+ $clean,
+);
+
+$threads = $DEF_THREAD_CT;
+$minimizer_len = $DEF_MINIMIZER_LEN;
+$kmer_len = $DEF_KMER_LEN;
+$work_on_disk = "";
+$hash_size = "";
+$max_db_size = "";
+
+# variables corresponding to task options
+my @TASK_LIST = (
+ \$dl_taxonomy,
+ \$dl_library,
+ \$add_to_library,
+ \$build,
+ \$rebuild,
+ \$shrink,
+ \$standard,
+ \$upgrade,
+ \$clean,
+);
+
+GetOptions(
+ "help" => \&display_help,
+ "version" => \&display_version,
+
+ "db=s" => \$db,
+ "threads=i" => \$threads,
+ "minimizer-len=i", \$minimizer_len,
+ "kmer-len=i", \$kmer_len,
+ "new-db=s", \$new_db,
+ "jellyfish-hash-size=s", \$hash_size,
+ "max-db-size=s", \$max_db_size,
+ "work-on-disk", \$work_on_disk,
+ "shrink-block-offset=i", \$shrink_block_offset,
+
+ "download-taxonomy" => \$dl_taxonomy,
+ "download-library=s" => \$dl_library,
+ "add-to-library=s" => \$add_to_library,
+ "build" => \$build,
+ "rebuild" => \$rebuild,
+ "shrink=i" => \$shrink,
+ "upgrade" => \$upgrade,
+ "standard" => \$standard,
+ "clean" => \$clean,
+) or usage();
+
+if (@ARGV) {
+ warn "Extra arguments on command line.\n";
+ usage();
+}
+my $task_options = 0;
+for my $flag_ref (@TASK_LIST) {
+ defined($$flag_ref) and $task_options++;
+}
+if ($task_options > 1) {
+ warn "More than one task option selected.\n";
+ usage();
+}
+if ($task_options == 0) {
+ warn "Must select a task option.\n";
+ usage();
+}
+
+if (! defined $db) {
+ die "Must specify a database name\n";
+}
+if ($threads <= 0) {
+ die "Can't use nonpositive thread count of $threads\n";
+}
+if ($minimizer_len >= $kmer_len) {
+ die "Minimizer length ($minimizer_len) must be less than k ($kmer_len)\n";
+}
+if ($minimizer_len <= 0) {
+ die "Can't use nonpositive minimizer length of $minimizer_len\n";
+}
+if ($kmer_len <= 2) {
+ die "Can't use k of $kmer_len (must be >= 2)\n";
+}
+if ($kmer_len > 31) {
+ die "Can't use k of $kmer_len (must be <= 31)\n";
+}
+if ($hash_size !~ /^(\d+[kKmMgG]?)?$/) {
+ die "Illegal hash size string\n";
+}
+if ($max_db_size !~ /^$/ && $max_db_size <= 0) {
+ die "Can't have negative max database size.\n";
+}
+
+$ENV{"KRAKEN_DB_NAME"} = $db;
+$ENV{"KRAKEN_THREAD_CT"} = $threads;
+$ENV{"KRAKEN_MINIMIZER_LEN"} = $minimizer_len;
+$ENV{"KRAKEN_KMER_LEN"} = $kmer_len;
+$ENV{"KRAKEN_HASH_SIZE"} = $hash_size;
+$ENV{"KRAKEN_MAX_DB_SIZE"} = $max_db_size;
+$ENV{"KRAKEN_WORK_ON_DISK"} = $work_on_disk;
+
+if ($dl_taxonomy) {
+ download_taxonomy();
+}
+elsif (defined($dl_library)) {
+ download_library($dl_library);
+}
+elsif (defined($add_to_library)) {
+ add_to_library($add_to_library);
+}
+elsif (defined($shrink)) {
+ shrink_db($shrink);
+}
+elsif ($standard) {
+ standard_installation();
+}
+elsif ($build || $rebuild) {
+ build_database();
+}
+elsif ($clean) {
+ clean_database();
+}
+elsif ($upgrade) {
+ upgrade_database();
+}
+else {
+ usage();
+}
+
+exit -1;
+# END OF MAIN CODE.
+
+sub usage {
+ my $exit_code = @_ ? shift : 64;
+ print STDERR <<EOF;
+Usage: $PROG [task option] [options]
+
+Task options (exactly one must be selected):
+ --download-taxonomy Download NCBI taxonomic information
+ --download-library TYPE Download partial library
+ (TYPE = one of "bacteria", "plasmids",
+ "viruses", "human")
+ --add-to-library FILE Add FILE to library
+ --build Create DB from library
+ (requires taxonomy d/l'ed and at least one file
+ in library)
+ --rebuild Create DB from library like --build, but remove
+ existing non-library/taxonomy files before build
+ --clean Remove unneeded files from a built database
+ --shrink NEW_CT Shrink an existing DB to have only NEW_CT k-mers
+ --standard Download and create default database
+ --upgrade Upgrade an existing older database to use scrambled
+ minimizer ordering (see README for details)
+ --help Print this message
+ --version Print version information
+
+Options:
+ --db NAME Kraken DB/library name (mandatory except for
+ --help/--version)
+ --threads # Number of threads (def: $DEF_THREAD_CT)
+ --new-db NAME New Kraken DB name (shrink task only; mandatory
+ for shrink task)
+ --kmer-len NUM K-mer length in bp (build/shrink tasks only;
+ def: $DEF_KMER_LEN)
+ --minimizer-len NUM Minimizer length in bp (build/shrink tasks only;
+ def: $DEF_MINIMIZER_LEN)
+ --jellyfish-hash-size STR Pass a specific hash size argument to jellyfish
+ when building database (build task only)
+ --max-db-size SIZE Shrink the DB before full build, making sure
+ database and index together use <= SIZE gigabytes
+ (build task only)
+ --shrink-block-offset NUM When shrinking, select the k-mer that is NUM
+ positions from the end of a block of k-mers
+ (default: 1)
+ --work-on-disk Perform most operations on disk rather than in
+ RAM (will slow down build in most cases)
+EOF
+ exit $exit_code;
+}
+
+sub display_help {
+ usage(0);
+}
+
+sub display_version {
+ print "Kraken version #####=VERSION=#####\n";
+ print "Copyright 2013-2015, Derrick Wood (dwood\@cs.jhu.edu)\n";
+ exit 0;
+}
+
+sub download_taxonomy {
+ exec "download_taxonomy.sh";
+}
+
+sub download_library {
+ my $type = shift;
+ if (! grep $type eq $_, @VALID_LIBRARY_TYPES) {
+ warn "Unknown library type \"$type\"\n";
+ usage();
+ }
+ exec "download_genomic_library.sh", $type;
+}
+
+sub add_to_library {
+ my $arg = shift;
+ exec "add_to_library.sh", $arg;
+}
+
+sub shrink_db {
+ my $new_count = shift;
+ if ($new_count <= 0) {
+ die "New DB must have at least 1 k-mer\n";
+ }
+ if (! defined($new_db)) {
+ die "Must specify new database name to perform shrink task\n";
+ }
+ exec "shrink_db.sh", $new_count, $new_db, $shrink_block_offset;
+}
+
+sub standard_installation {
+ exec "standard_installation.sh";
+}
+
+sub build_database {
+ $ENV{"KRAKEN_REBUILD_DATABASE"} = $rebuild;
+ exec "build_kraken_db.sh";
+}
+
+sub clean_database {
+ exec "clean_db.sh";
+}
+
+sub upgrade_database {
+ exec "upgrade_db.sh";
+}
diff --git a/scripts/kraken-filter b/scripts/kraken-filter
new file mode 100755
index 0000000..04dcb7c
--- /dev/null
+++ b/scripts/kraken-filter
@@ -0,0 +1,146 @@
+#!/usr/bin/perl
+
+# Copyright 2013, Derrick Wood <dwood at cs.jhu.edu>
+#
+# This file is part of the Kraken taxonomic sequence classification system.
+#
+# Kraken is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Kraken is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Kraken. If not, see <http://www.gnu.org/licenses/>.
+
+# Adjusts Kraken output and classifications to require a specified
+# proportion of k-mers map to LCAs at or below a given node.
+
+use strict;
+use warnings;
+use File::Basename;
+use Getopt::Long;
+
+my $PROG = basename $0;
+my $KRAKEN_DIR = "#####=KRAKEN_DIR=#####";
+
+# Test to see if the executables got moved, try to recover if we can
+if (! -e "$KRAKEN_DIR/classify") {
+ use Cwd 'abs_path';
+ $KRAKEN_DIR = dirname abs_path($0);
+}
+
+require "$KRAKEN_DIR/krakenlib.pm";
+
+my $threshold = 0;
+my $db_prefix;
+
+GetOptions(
+ "help" => \&display_help,
+ "version" => \&display_version,
+ "threshold=f" => \$threshold,
+ "db=s" => \$db_prefix,
+) or usage();
+
+eval { $db_prefix = krakenlib::find_db($db_prefix); };
+if ($@) {
+ die "$PROG: $@";
+}
+
+if ($threshold < 0 || $threshold > 1) {
+ print STDERR "$PROG: threshold must be in the interval [0,1].\n";
+ exit 64;
+}
+
+sub usage {
+ my $exit_code = @_ ? shift : 64;
+ print STDERR "Usage: $PROG [--db KRAKEN_DB_NAME] [--threshold NUM] <kraken output file(s)>\n";
+ my $default_db;
+ eval { $default_db = krakenlib::find_db(); };
+ if (defined $default_db) {
+ print STDERR "\n Default database is \"$default_db\"\n";
+ }
+ exit $exit_code;
+}
+
+sub display_help {
+ usage(0);
+}
+
+sub display_version {
+ print "Kraken version #####=VERSION=#####\n";
+ print "Copyright 2013-2015, Derrick Wood (dwood\@cs.jhu.edu)\n";
+ exit 0;
+}
+
+my %parent_map;
+load_taxonomy($db_prefix);
+
+while (<>) {
+ chomp;
+ my @fields = split /\t/;
+ my ($code, $seqid, $called_taxon, $len, $hit_list) = @fields;
+ my @hits = split " ", $hit_list;
+ my %hit_counts;
+ for (@hits) {
+ my ($taxid, $ct) = split /:/;
+ $hit_counts{$taxid} += $ct;
+ }
+
+ # maps nodes to count of hits at or below node
+ my %hit_sums;
+
+ my $total_kmers = 0;
+ my $total_unambig = 0;
+ my $total_hits = 0;
+ for (keys %hit_counts) {
+ my $count = $hit_counts{$_};
+ $total_kmers += $count;
+ if ($_ ne "A") {
+ $total_unambig += $count;
+ if ($_ > 0) {
+ $total_hits += $count;
+ my $taxid = $_;
+ while ($taxid > 0) {
+ $hit_sums{$taxid} += $count;
+ $taxid = $parent_map{$taxid};
+ }
+ }
+ }
+ }
+
+ my $pct = 0;
+ # starting at called node, run up tree until threshold met
+ my $new_taxon = $called_taxon;
+ while ($new_taxon > 0) {
+ $pct = $hit_sums{$new_taxon} / $total_unambig;
+ if ($pct >= $threshold - 1e-5) { # epsilon check w/ float equality
+ last;
+ }
+ $new_taxon = $parent_map{$new_taxon};
+ }
+ printf "%s\t$seqid\t$new_taxon\t$len\tP=%0.3f\t$hit_list\n",
+ $new_taxon > 0 ? "C" : "U",
+ $pct;
+}
+
+sub load_taxonomy {
+ my $prefix = shift;
+
+ open NODES, "<", "$prefix/taxonomy/nodes.dmp"
+ or die "$PROG: can't open nodes file: $!\n";
+ while (<NODES>) {
+ chomp;
+ my @fields = split /\t\|\t/;
+ my ($node_id, $parent_id) = @fields[0,1];
+ if ($node_id == 1) {
+ $parent_id = 0;
+ }
+ $parent_map{$node_id} = $parent_id;
+ }
+ close NODES;
+}
diff --git a/scripts/kraken-mpa-report b/scripts/kraken-mpa-report
new file mode 100755
index 0000000..7813569
--- /dev/null
+++ b/scripts/kraken-mpa-report
@@ -0,0 +1,221 @@
+#!/usr/bin/perl
+
+# Copyright 2013-2015, Derrick Wood <dwood at cs.jhu.edu>
+#
+# This file is part of the Kraken taxonomic sequence classification system.
+#
+# Kraken is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Kraken is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Kraken. If not, see <http://www.gnu.org/licenses/>.
+
+# Reports a summary of Kraken's results, in a MetaPhlAn-like format
+
+use strict;
+use warnings;
+use File::Basename;
+use Getopt::Long;
+
+my $PROG = basename $0;
+my $KRAKEN_DIR = "#####=KRAKEN_DIR=#####";
+
+# Test to see if the executables got moved, try to recover if we can
+if (! -e "$KRAKEN_DIR/classify") {
+ use Cwd 'abs_path';
+ $KRAKEN_DIR = dirname abs_path($0);
+}
+
+require "$KRAKEN_DIR/krakenlib.pm";
+
+my $show_zeros = 0;
+my $header_line = 0;
+my $intermediate = 0;
+my $db_prefix;
+my @RANK_CODES = qw/D K P C O F G S/;
+
+GetOptions(
+ "help" => \&display_help,
+ "version" => \&display_version,
+ "show-zeros" => \$show_zeros,
+ "header-line" => \$header_line,
+ "intermediate-ranks" => \$intermediate,
+ "db=s" => \$db_prefix,
+);
+
+eval { $db_prefix = krakenlib::find_db($db_prefix); };
+if ($@) {
+ die "$PROG: $@";
+}
+
+sub usage {
+ my $exit_code = @_ ? shift : 64;
+ my $default_db = "none";
+ eval { $default_db = '"' . krakenlib::find_db() . '"'; };
+
+ print STDERR <<__EOF__;
+Usage: $PROG [--db KRAKEN_DB_NAME] [options] <kraken output file(s)>
+
+Options:
+ --db NAME Name of Kraken database
+ (default: $default_db)
+ --show-zeros Display taxa even if they lack a read in any sample
+ --header-line Display a header line indicating sample IDs
+ (sample IDs are the filenames)
+ --intermediate-ranks Display taxa not at the standard ranks with x__ prefix
+__EOF__
+ exit $exit_code;
+}
+
+sub display_help {
+ usage(0);
+}
+
+sub display_version {
+ print "Kraken version #####=VERSION=#####\n";
+ print "Copyright 2013-2015, Derrick Wood (dwood\@cs.jhu.edu)\n";
+ exit 0;
+}
+
+my (%child_lists, %name_map, %rank_map);
+load_taxonomy($db_prefix);
+
+my @file_data;
+my %hit_taxa;
+for my $file (@ARGV) {
+ open FILE, "<", $file
+ or die "$PROG: can't open $file: $!\n";
+ my %taxo_counts;
+ while (<FILE>) {
+ my @fields = split;
+ $taxo_counts{$fields[2]}++;
+ }
+ my %clade_counts = %taxo_counts;
+ dfs_summation(1, \%clade_counts);
+ for (keys %clade_counts) {
+ if ($clade_counts{$_}) {
+ $hit_taxa{$_}++
+ }
+ }
+ push @file_data, \%clade_counts;
+}
+
+if ($intermediate) {
+ push @RANK_CODES, "X";
+}
+
+my %output_lines;
+$output_lines{$_} = "" for @RANK_CODES;
+if ($header_line) {
+ print "#Sample ID\t" . join("\t", @ARGV) . "\n";
+}
+dfs_report(1);
+print $output_lines{$_} for @RANK_CODES;
+
+sub dfs_report {
+ my $node = shift;
+ my $name = shift;
+ if (! $show_zeros) {
+ return unless $hit_taxa{$node};
+ }
+ my $code = rank_code($node);
+ if ($code ne "-" || $intermediate) {
+ $code = "X" if $code eq "-";
+ if (defined $name) {
+ $name .= "|"
+ }
+ else {
+ $name = "";
+ }
+
+ $name .= lc($code) . "__" . sanitize_name($node);
+ my $output = $name;
+ for my $file (@file_data) {
+ $output .= "\t" . ($file->{$node} || 0);
+ }
+ $output .= "\n";
+
+ $output_lines{$code} .= $output;
+ }
+ my $children = $child_lists{$node};
+ if ($children) {
+ for my $child (@$children) {
+ dfs_report($child, $name);
+ }
+ }
+}
+
+sub sanitize_name {
+ my $node = shift;
+ my $name = $name_map{$node};
+ $name =~ tr/|.//d;
+ $name =~ tr/ /_/;
+ return $name;
+}
+
+sub rank_code {
+ my $node = shift;
+ my $rank = $rank_map{$node};
+ for ($rank) {
+ $_ eq "species" and return "S";
+ $_ eq "genus" and return "G";
+ $_ eq "family" and return "F";
+ $_ eq "order" and return "O";
+ $_ eq "class" and return "C";
+ $_ eq "phylum" and return "P";
+ $_ eq "superkingdom" and return "D";
+ $_ eq "kingdom" and return "K";
+ }
+ return "-";
+}
+
+sub dfs_summation {
+ my $node = shift;
+ my $count_ref = shift;
+ my $children = $child_lists{$node};
+ if ($children) {
+ for my $child (@$children) {
+ dfs_summation($child, $count_ref);
+ $count_ref->{$node} += ($count_ref->{$child} || 0);
+ }
+ }
+}
+
+sub load_taxonomy {
+ my $prefix = shift;
+
+ open NAMES, "<", "$prefix/taxonomy/names.dmp"
+ or die "$PROG: can't open names file: $!\n";
+ while (<NAMES>) {
+ chomp;
+ s/\t\|$//;
+ my @fields = split /\t\|\t/;
+ my ($node_id, $name, $type) = @fields[0,1,3];
+ if ($type eq "scientific name") {
+ $name_map{$node_id} = $name;
+ }
+ }
+ close NAMES;
+
+ open NODES, "<", "$prefix/taxonomy/nodes.dmp"
+ or die "$PROG: can't open nodes file: $!\n";
+ while (<NODES>) {
+ chomp;
+ my @fields = split /\t\|\t/;
+ my ($node_id, $parent_id, $rank) = @fields[0,1,2];
+ if ($node_id == 1) {
+ $parent_id = 0;
+ }
+ $child_lists{$parent_id} ||= [];
+ push @{ $child_lists{$parent_id} }, $node_id;
+ $rank_map{$node_id} = $rank;
+ }
+ close NODES;
+}
diff --git a/scripts/kraken-report b/scripts/kraken-report
new file mode 100755
index 0000000..8351593
--- /dev/null
+++ b/scripts/kraken-report
@@ -0,0 +1,179 @@
+#!/usr/bin/perl
+
+# Copyright 2013-2015, Derrick Wood <dwood at cs.jhu.edu>
+#
+# This file is part of the Kraken taxonomic sequence classification system.
+#
+# Kraken is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Kraken is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Kraken. If not, see <http://www.gnu.org/licenses/>.
+
+# Reports a summary of Kraken's results.
+
+use strict;
+use warnings;
+use File::Basename;
+use Getopt::Long;
+
+my $PROG = basename $0;
+my $KRAKEN_DIR = "#####=KRAKEN_DIR=#####";
+
+# Test to see if the executables got moved, try to recover if we can
+if (! -e "$KRAKEN_DIR/classify") {
+ use Cwd 'abs_path';
+ $KRAKEN_DIR = dirname abs_path($0);
+}
+
+require "$KRAKEN_DIR/krakenlib.pm";
+
+my $show_zeros = 0;
+my $db_prefix;
+
+GetOptions(
+ "help" => \&display_help,
+ "version" => \&display_version,
+ "show-zeros" => \$show_zeros,
+ "db=s" => \$db_prefix,
+);
+
+eval { $db_prefix = krakenlib::find_db($db_prefix); };
+if ($@) {
+ die "$PROG: $@";
+}
+
+sub usage {
+ my $exit_code = @_ ? shift : 64;
+ print STDERR "Usage: $PROG [--db KRAKEN_DB_NAME] [--show-zeros] <kraken output file(s)>\n";
+ my $default_db;
+ eval { $default_db = krakenlib::find_db(); };
+ if (defined $default_db) {
+ print STDERR "\n Default database is \"$default_db\"\n";
+ }
+ exit $exit_code;
+}
+
+sub display_help {
+ usage(0);
+}
+
+sub display_version {
+ print "Kraken version #####=VERSION=#####\n";
+ print "Copyright 2013-2015, Derrick Wood (dwood\@cs.jhu.edu)\n";
+ exit 0;
+}
+
+my (%child_lists, %name_map, %rank_map);
+load_taxonomy($db_prefix);
+
+my %taxo_counts;
+my $seq_count = 0;
+$taxo_counts{0} = 0;
+while (<>) {
+ my @fields = split;
+ $taxo_counts{$fields[2]}++;
+ $seq_count++;
+}
+my $classified_count = $seq_count - $taxo_counts{0};
+
+my %clade_counts = %taxo_counts;
+dfs_summation(1);
+
+for (keys %name_map) {
+ $clade_counts{$_} ||= 0;
+}
+
+printf "%6.2f\t%d\t%d\t%s\t%d\t%s%s\n",
+ $clade_counts{0} * 100 / $seq_count,
+ $clade_counts{0}, $taxo_counts{0}, "U",
+ 0, "", "unclassified";
+dfs_report(1, 0);
+
+sub dfs_report {
+ my $node = shift;
+ my $depth = shift;
+ if (! $clade_counts{$node} && ! $show_zeros) {
+ return;
+ }
+ printf "%6.2f\t%d\t%d\t%s\t%d\t%s%s\n",
+ ($clade_counts{$node} || 0) * 100 / $seq_count,
+ ($clade_counts{$node} || 0),
+ ($taxo_counts{$node} || 0),
+ rank_code($rank_map{$node}),
+ $node,
+ " " x $depth,
+ $name_map{$node};
+ my $children = $child_lists{$node};
+ if ($children) {
+ my @sorted_children = sort { $clade_counts{$b} <=> $clade_counts{$a} } @$children;
+ for my $child (@sorted_children) {
+ dfs_report($child, $depth + 1);
+ }
+ }
+}
+
+sub rank_code {
+ my $rank = shift;
+ for ($rank) {
+ $_ eq "species" and return "S";
+ $_ eq "genus" and return "G";
+ $_ eq "family" and return "F";
+ $_ eq "order" and return "O";
+ $_ eq "class" and return "C";
+ $_ eq "phylum" and return "P";
+ $_ eq "kingdom" and return "K";
+ $_ eq "superkingdom" and return "D";
+ }
+ return "-";
+}
+
+sub dfs_summation {
+ my $node = shift;
+ my $children = $child_lists{$node};
+ if ($children) {
+ for my $child (@$children) {
+ dfs_summation($child);
+ $clade_counts{$node} += ($clade_counts{$child} || 0);
+ }
+ }
+}
+
+sub load_taxonomy {
+ my $prefix = shift;
+
+ open NAMES, "<", "$prefix/taxonomy/names.dmp"
+ or die "$PROG: can't open names file: $!\n";
+ while (<NAMES>) {
+ chomp;
+ s/\t\|$//;
+ my @fields = split /\t\|\t/;
+ my ($node_id, $name, $type) = @fields[0,1,3];
+ if ($type eq "scientific name") {
+ $name_map{$node_id} = $name;
+ }
+ }
+ close NAMES;
+
+ open NODES, "<", "$prefix/taxonomy/nodes.dmp"
+ or die "$PROG: can't open nodes file: $!\n";
+ while (<NODES>) {
+ chomp;
+ my @fields = split /\t\|\t/;
+ my ($node_id, $parent_id, $rank) = @fields[0,1,2];
+ if ($node_id == 1) {
+ $parent_id = 0;
+ }
+ $child_lists{$parent_id} ||= [];
+ push @{ $child_lists{$parent_id} }, $node_id;
+ $rank_map{$node_id} = $rank;
+ }
+ close NODES;
+}
diff --git a/scripts/kraken-translate b/scripts/kraken-translate
new file mode 100755
index 0000000..89a067a
--- /dev/null
+++ b/scripts/kraken-translate
@@ -0,0 +1,157 @@
+#!/usr/bin/perl
+
+# Copyright 2013-2015, Derrick Wood <dwood at cs.jhu.edu>
+#
+# This file is part of the Kraken taxonomic sequence classification system.
+#
+# Kraken is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Kraken is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Kraken. If not, see <http://www.gnu.org/licenses/>.
+
+# For each classified read, prints sequence ID and full taxonomy
+
+use strict;
+use warnings;
+use File::Basename;
+use Getopt::Long;
+
+my $PROG = basename $0;
+my $KRAKEN_DIR = "#####=KRAKEN_DIR=#####";
+
+# Test to see if the executables got moved, try to recover if we can
+if (! -e "$KRAKEN_DIR/classify") {
+ use Cwd 'abs_path';
+ $KRAKEN_DIR = dirname abs_path($0);
+}
+
+require "$KRAKEN_DIR/krakenlib.pm";
+
+my $db_prefix;
+my $mpa_format = 0;
+
+GetOptions(
+ "help" => \&display_help,
+ "version" => \&display_version,
+ "db=s" => \$db_prefix,
+ "mpa-format" => \$mpa_format
+);
+
+eval { $db_prefix = krakenlib::find_db($db_prefix); };
+if ($@) {
+ die "$PROG: $@";
+}
+
+sub usage {
+ my $exit_code = @_ ? shift : 64;
+ print STDERR "Usage: $PROG [--db KRAKEN_DB_NAME] [--mpa-format] <kraken output file(s)>\n";
+ my $default_db;
+ eval { $default_db = krakenlib::find_db(); };
+ if (defined $default_db) {
+ print STDERR "\n Default database is \"$default_db\"\n";
+ }
+ exit $exit_code;
+}
+
+sub display_help {
+ usage(0);
+}
+
+sub display_version {
+ print "Kraken version #####=VERSION=#####\n";
+ print "Copyright 2013-2015, Derrick Wood (dwood\@cs.jhu.edu)\n";
+ exit 0;
+}
+
+my (%parent_map, %name_map, %rank_map);
+load_taxonomy($db_prefix);
+my %known_taxonomy_strings;
+
+while (<>) {
+ next unless /^C/;
+ chomp;
+ my @fields = split;
+ my ($seqid, $taxid) = @fields[1,2];
+ my $taxonomy_str = get_taxonomy_str($taxid);
+ print "$seqid\t$taxonomy_str\n";
+}
+
+sub get_taxonomy_str {
+ my $taxid = shift;
+ if (! exists $known_taxonomy_strings{$taxid}) {
+ my @nodes;
+ while ($taxid > 0) {
+ if ($mpa_format) {
+ my $rank_code = rank_code($rank_map{$taxid});
+ my $name = $name_map{$taxid};
+ $name =~ tr/ /_/;
+ unshift @nodes, lc($rank_code) . "__" . $name if $rank_code ne "-";
+ }
+ else {
+ unshift @nodes, $name_map{$taxid};
+ }
+ $taxid = $parent_map{$taxid};
+ }
+ if ($mpa_format) {
+ $known_taxonomy_strings{$taxid} = @nodes ? join("|", @nodes) : "root";
+ }
+ else {
+ $known_taxonomy_strings{$taxid} = join(";", @nodes);
+ }
+ }
+ return $known_taxonomy_strings{$taxid};
+}
+
+sub rank_code {
+ my $rank = shift;
+ for ($rank) {
+ $_ eq "species" and return "S";
+ $_ eq "genus" and return "G";
+ $_ eq "family" and return "F";
+ $_ eq "order" and return "O";
+ $_ eq "class" and return "C";
+ $_ eq "phylum" and return "P";
+ $_ eq "kingdom" and return "K";
+ $_ eq "superkingdom" and return "D";
+ }
+ return "-";
+}
+
+sub load_taxonomy {
+ my $prefix = shift;
+
+ open NAMES, "<", "$prefix/taxonomy/names.dmp"
+ or die "$PROG: can't open names file: $!\n";
+ while (<NAMES>) {
+ chomp;
+ s/\t\|$//;
+ my @fields = split /\t\|\t/;
+ my ($node_id, $name, $type) = @fields[0,1,3];
+ if ($type eq "scientific name") {
+ $name_map{$node_id} = $name;
+ }
+ }
+ close NAMES;
+
+ open NODES, "<", "$prefix/taxonomy/nodes.dmp"
+ or die "$PROG: can't open nodes file: $!\n";
+ while (<NODES>) {
+ chomp;
+ my @fields = split /\t\|\t/;
+ my ($node_id, $parent_id, $rank) = @fields[0,1,2];
+ if ($node_id == 1) {
+ $parent_id = 0;
+ }
+ $parent_map{$node_id} = $parent_id;
+ $rank_map{$node_id} = $rank;
+ }
+ close NODES;
+}
diff --git a/scripts/krakenlib.pm b/scripts/krakenlib.pm
new file mode 100644
index 0000000..943d6c6
--- /dev/null
+++ b/scripts/krakenlib.pm
@@ -0,0 +1,76 @@
+package krakenlib;
+
+# Copyright 2013-2015, Derrick Wood <dwood at cs.jhu.edu>
+#
+# This file is part of the Kraken taxonomic sequence classification system.
+#
+# Kraken is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Kraken is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Kraken. If not, see <http://www.gnu.org/licenses/>.
+
+# Common subroutines for other Kraken scripts
+
+use strict;
+use warnings;
+
+# Input: the argument for a --db option (possibly undefined)
+# Returns: the DB to use, taking KRAKEN_DEFAULT_DB and KRAKEN_PATH
+# into account.
+sub find_db {
+ my $supplied_db_prefix = shift;
+ my $db_prefix;
+ if (! defined $supplied_db_prefix) {
+ if (! exists $ENV{"KRAKEN_DEFAULT_DB"}) {
+ die "Must specify DB with either --db or \$KRAKEN_DEFAULT_DB\n";
+ }
+ $supplied_db_prefix = $ENV{"KRAKEN_DEFAULT_DB"};
+ }
+ my @db_path = (".");
+ if (exists $ENV{"KRAKEN_DB_PATH"}) {
+ my $path_str = $ENV{"KRAKEN_DB_PATH"};
+ # Allow zero-length path to be current dir
+ $path_str =~ s/^:/.:/;
+ $path_str =~ s/:$/:./;
+ $path_str =~ s/::/:.:/;
+
+ @db_path = split /:/, $path_str;
+ }
+
+ # Use supplied DB if abs. or rel. path is given
+ if ($supplied_db_prefix =~ m|/|) {
+ $db_prefix = $supplied_db_prefix;
+ }
+ else {
+ # Check all dirs in KRAKEN_DB_PATH
+ for my $dir (@db_path) {
+ my $checked_db = "$dir/$supplied_db_prefix";
+ if (-e $checked_db && -d _) {
+ $db_prefix = $checked_db;
+ last;
+ }
+ }
+ if (! defined $db_prefix) {
+ my $printed_path = exists $ENV{"KRAKEN_DB_PATH"} ? qq|"$ENV{'KRAKEN_DB_PATH'}"| : "undefined";
+ die "unable to find $supplied_db_prefix in \$KRAKEN_DB_PATH ($printed_path)\n";
+ }
+ }
+
+ for my $file (qw/database.kdb database.idx/) {
+ if (! -e "$db_prefix/$file") {
+ die "database (\"$db_prefix\") does not contain necessary file $file\n";
+ }
+ }
+
+ return $db_prefix;
+}
+
+1;
diff --git a/scripts/read_merger.pl b/scripts/read_merger.pl
new file mode 100755
index 0000000..2d32477
--- /dev/null
+++ b/scripts/read_merger.pl
@@ -0,0 +1,164 @@
+#!/usr/bin/perl
+
+# Copyright 2013-2015, Derrick Wood <dwood at cs.jhu.edu>
+#
+# This file is part of the Kraken taxonomic sequence classification system.
+#
+# Kraken is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Kraken is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Kraken. If not, see <http://www.gnu.org/licenses/>.
+
+# Merges two files specified on the command line, print to stdout
+# Designed to be called by kraken
+
+use strict;
+use warnings;
+use File::Basename;
+use Getopt::Long;
+
+my $PROG = basename $0;
+
+my $fasta_input = 0;
+my $fastq_input = 0;
+my $gunzip = 0;
+my $bunzip2 = 0;
+my $check_names = 0;
+
+GetOptions(
+ "fa" => \$fasta_input,
+ "fq" => \$fastq_input,
+ "gz" => \$gunzip,
+ "bz2" => \$bunzip2,
+ "check-names" => \$check_names
+);
+
+if (@ARGV != 2) {
+ die "$PROG: must have exactly two filename arguments\n";
+}
+for my $file (@ARGV) {
+ if (! -e $file) {
+ die "$PROG: $file does not exist\n";
+ }
+ if (! -f $file) {
+ die "$PROG: $file is not a regular file\n";
+ }
+}
+
+my $compressed = $gunzip || $bunzip2;
+if ($gunzip && $bunzip2) {
+ die "$PROG: can't use both gzip and bzip2 compression flags\n";
+}
+if (! ($fasta_input xor $fastq_input)) {
+ die "$PROG: must specify exactly one of FASTA and FASTQ input flags\n";
+}
+
+# Open files (or pipes from decompression progs)
+my ($fh1, $fh2);
+if ($gunzip) {
+ open $fh1, "-|", "gunzip", "-c", $ARGV[0]
+ or die "$PROG: can't open gunzip pipe with $ARGV[0]: $!\n";
+ open $fh2, "-|", "gunzip", "-c", $ARGV[1]
+ or die "$PROG: can't open gunzip pipe with $ARGV[1]: $!\n";
+}
+elsif ($bunzip2) {
+ open $fh1, "-|", "bunzip2", "-c", $ARGV[0]
+ or die "$PROG: can't open bunzip2 pipe with $ARGV[0]: $!\n";
+ open $fh2, "-|", "bunzip2", "-c", $ARGV[1]
+ or die "$PROG: can't open bunzip2 pipe with $ARGV[1]: $!\n";
+}
+else {
+ open $fh1, "<", $ARGV[0]
+ or die "$PROG: can't open $ARGV[0]: $!\n";
+ open $fh2, "<", $ARGV[1]
+ or die "$PROG: can't open $ARGV[1]: $!\n";
+}
+
+# read/merge/print loop
+# make sure names match before merging
+my ($seq1, $seq2);
+while (defined($seq1 = read_sequence($fh1))) {
+ $seq2 = read_sequence($fh2);
+ if (! defined $seq2) {
+ die "$PROG: mismatched sequence counts\n";
+ }
+ if ($check_names && $seq1->{id} ne $seq2->{id}) {
+ die "$PROG: mismatched mate pair names ('$seq1->{id}' & '$seq2->{id}')\n";
+ }
+ print_merged_sequence($seq1, $seq2);
+}
+if (defined($seq2 = read_sequence($fh2))) {
+ die "$PROG: mismatched sequence counts\n";
+}
+close $fh1;
+close $fh2;
+
+{
+ my %buffers; # Needed due to fasta formatting
+ sub read_sequence {
+ my $fh = shift;
+ my $id;
+ my $seq = "";
+ if (! exists $buffers{$fh}) {
+ $buffers{$fh} = <$fh>;
+ }
+ if (! defined $buffers{$fh}) { # No more file
+ return undef;
+ }
+ if ($fasta_input) {
+ if ($buffers{$fh} =~ /^>(\S+)/) {
+ $id = $1;
+ }
+ else {
+ die "$PROG: malformed fasta file (line = '$buffers{$fh}')\n";
+ }
+ delete $buffers{$fh};
+ while (<$fh>) {
+ if (/^>/) {
+ $buffers{$fh} = $_;
+ last;
+ }
+ else {
+ chomp;
+ $seq .= $_;
+ }
+ }
+ }
+ elsif ($fastq_input) {
+ if ($buffers{$fh} =~ /^@(\S+)/) {
+ $id = $1;
+ }
+ else {
+ if ($buffers{$fh} =~ /^$/) {
+ return undef; # Some fastq files end with blank line, allow it
+ }
+ die "$PROG: malformed fastq file (line = '$buffers{$fh}')\n";
+ }
+ delete $buffers{$fh};
+ chomp($seq = <$fh>);
+ scalar <$fh>; # quality header
+ scalar <$fh>; # quality values
+ }
+ else {
+ # should never get here
+ die "$PROG: I have no idea what kind of input I'm reading!!!\n";
+ }
+
+ $id =~ s/[\/_.][12]$//; # strip /1 (or .1, _1) or /2 to help comparison
+ return { id => $id, seq => $seq };
+ }
+}
+
+sub print_merged_sequence {
+ my ($seq1, $seq2) = @_;
+ print ">" . $seq1->{id} . "\n";
+ print $seq1->{seq} . "N" . $seq2->{seq} . "\n";
+}
diff --git a/scripts/report_gi_numbers.pl b/scripts/report_gi_numbers.pl
new file mode 100755
index 0000000..ce6a0bc
--- /dev/null
+++ b/scripts/report_gi_numbers.pl
@@ -0,0 +1,49 @@
+#!/usr/bin/perl
+
+# Copyright 2013-2015, Derrick Wood <dwood at cs.jhu.edu>
+#
+# This file is part of the Kraken taxonomic sequence classification system.
+#
+# Kraken is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Kraken is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Kraken. If not, see <http://www.gnu.org/licenses/>.
+
+# Reads multi-FASTA input and for each sequence ID reports a
+# tab-delimited line:
+# <GI number> <sequence ID>
+#
+# or in the case of a sequence with Kraken taxid information:
+#
+# TAXID <taxonomy ID> <sequence ID>
+#
+# Assumes all sequence IDs actually have GI numbers or Kraken
+# taxid information.
+
+use strict;
+use warnings;
+use File::Basename;
+
+my $PROG = basename $0;
+
+while (<>) {
+ next unless /^>(\S+)/;
+ my $seq_id = $1;
+ if ($seq_id =~ /(^|\|)kraken:taxid\|(\d+)/) {
+ print "TAXID\t$2\t$seq_id\n";
+ next;
+ }
+
+ if ($seq_id !~ /(^|\|)gi\|(\d+)/) {
+ die "$PROG: sequence ID $seq_id lacks GI number, aborting.\n";
+ }
+ print "$2\t$seq_id\n";
+}
diff --git a/scripts/shrink_db.sh b/scripts/shrink_db.sh
new file mode 100755
index 0000000..a14249f
--- /dev/null
+++ b/scripts/shrink_db.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+# Copyright 2013-2015, Derrick Wood <dwood at cs.jhu.edu>
+#
+# This file is part of the Kraken taxonomic sequence classification system.
+#
+# Kraken is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Kraken is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Kraken. If not, see <http://www.gnu.org/licenses/>.
+
+# Shrink a Kraken database
+# Designed to be called by kraken_build
+
+set -u # Protect against uninitialized vars.
+set -e # Stop on error
+set -o pipefail # Stop on failures in non-final pipeline commands
+
+new_ct="$1"
+new_db="$2"
+offset="$3"
+
+OLD_DB_DIR="$KRAKEN_DB_NAME"
+NEW_DB_DIR="$new_db"
+
+if [ -e "$NEW_DB_DIR" ]
+then
+ echo "$new_db already exists ($NEW_DB_DIR), aborting shrink operation."
+ exit 1
+else
+ mkdir -p "$NEW_DB_DIR/taxonomy"
+fi
+
+cp "$OLD_DB_DIR/taxonomy/nodes.dmp" "$NEW_DB_DIR/taxonomy"
+cp "$OLD_DB_DIR/taxonomy/names.dmp" "$NEW_DB_DIR/taxonomy"
+db_shrink -n $new_ct -d "$OLD_DB_DIR/database.kdb" \
+ -o "$NEW_DB_DIR/database.jdb.tmp" -O "$offset"
+mv "$NEW_DB_DIR/database.jdb.tmp" "$NEW_DB_DIR/database.jdb"
+echo "Reduced database created, now sorting..."
+db_sort -M -t $KRAKEN_THREAD_CT -n $KRAKEN_MINIMIZER_LEN \
+ -d "$NEW_DB_DIR/database.jdb" -o "$NEW_DB_DIR/database.kdb.tmp" \
+ -i "$NEW_DB_DIR/database.idx"
+mv "$NEW_DB_DIR/database.kdb.tmp" "$NEW_DB_DIR/database.kdb"
+echo "Sort complete, database is ready."
diff --git a/scripts/standard_installation.sh b/scripts/standard_installation.sh
new file mode 100755
index 0000000..b542a4f
--- /dev/null
+++ b/scripts/standard_installation.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+# Copyright 2013-2015, Derrick Wood <dwood at cs.jhu.edu>
+#
+# This file is part of the Kraken taxonomic sequence classification system.
+#
+# Kraken is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Kraken is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Kraken. If not, see <http://www.gnu.org/licenses/>.
+
+# Build the standard Kraken database
+# Designed to be called by kraken_build
+
+set -u # Protect against uninitialized vars.
+set -e # Stop on error
+set -o pipefail # Stop on failures in non-final pipeline commands
+
+WOD_FLAG=""
+if [ -n "$KRAKEN_WORK_ON_DISK" ]
+then
+ WOD_FLAG="--work-on-disk"
+fi
+
+check_for_jellyfish.sh
+kraken-build --db $KRAKEN_DB_NAME --download-taxonomy
+kraken-build --db $KRAKEN_DB_NAME --download-library bacteria
+kraken-build --db $KRAKEN_DB_NAME --download-library viruses
+kraken-build --db $KRAKEN_DB_NAME --build --threads $KRAKEN_THREAD_CT \
+ --jellyfish-hash-size "$KRAKEN_HASH_SIZE" \
+ --max-db-size "$KRAKEN_MAX_DB_SIZE" \
+ --minimizer-len $KRAKEN_MINIMIZER_LEN \
+ --kmer-len $KRAKEN_KMER_LEN \
+ $WOD_FLAG
diff --git a/scripts/upgrade_db.sh b/scripts/upgrade_db.sh
new file mode 100755
index 0000000..b59ba63
--- /dev/null
+++ b/scripts/upgrade_db.sh
@@ -0,0 +1,85 @@
+#!/bin/bash
+
+# Copyright 2013-2015, Derrick Wood <dwood at cs.jhu.edu>
+#
+# This file is part of the Kraken taxonomic sequence classification system.
+#
+# Kraken is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Kraken is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Kraken. If not, see <http://www.gnu.org/licenses/>.
+
+# Upgrade a pre-v0.10.0-beta Kraken DB to use scrambled minimizer order
+# Designed to be called by kraken_build
+
+set -u # Protect against uninitialized vars.
+set -e # Stop on error
+set -o pipefail # Stop on failures in non-final pipeline commands
+
+function report_time_elapsed() {
+ curr_time=$(date "+%s.%N")
+ perl -e '$time = $ARGV[1] - $ARGV[0];' \
+ -e '$sec = int($time); $nsec = $time - $sec;' \
+ -e '$min = int($sec/60); $sec %= 60;' \
+ -e '$hr = int($min/60); $min %= 60;' \
+ -e 'print "${hr}h" if $hr;' \
+ -e 'print "${min}m" if $min || $hr;' \
+ -e 'printf "%.3fs", $sec + $nsec;' \
+ $1 $curr_time
+}
+
+start_time=$(date "+%s.%N")
+
+DATABASE_DIR="$KRAKEN_DB_NAME"
+
+if [ ! -d "$DATABASE_DIR" ]
+then
+ echo "Can't find Kraken DB directory \"$KRAKEN_DB_NAME\""
+ exit 1
+fi
+cd "$DATABASE_DIR"
+
+MEMFLAG=""
+if [ -n "$KRAKEN_WORK_ON_DISK" ]
+then
+ MEMFLAG="-M"
+fi
+
+if [ -e "old_database.kdb" ]
+then
+ echo "old_database.kdb found - it appears database is already upgraded."
+ exit 1
+fi
+
+if [ ! -e "database.kdb" ]
+then
+ echo "Can't find database.kdb file!"
+ exit 1
+fi
+if [ ! -e "database.idx" ]
+then
+ echo "Can't find database.idx file!"
+ exit 1
+fi
+
+idx_size=$(stat -c '%s' database.idx)
+# Calculate minimizer length based on existing index size
+minimizer_len=$(perl -le 'print int(log(shift() / 8 - 2) / log(4))' $idx_size)
+
+db_sort $MEMFLAG -t $KRAKEN_THREAD_CT -n $minimizer_len -d database.kdb \
+ -o newdb.kdb -i newdb.idx
+mv database.idx old_database.idx
+mv database.kdb old_database.kdb
+mv newdb.idx database.idx
+mv newdb.kdb database.kdb
+
+echo "Database upgrade complete. [$(report_time_elapsed $start_time)]"
+echo "Old database files are at $KRAKEN_DB_NAME/old_database.{kdb,idx}"
diff --git a/scripts/verify_gi_numbers.pl b/scripts/verify_gi_numbers.pl
new file mode 100755
index 0000000..ec616f5
--- /dev/null
+++ b/scripts/verify_gi_numbers.pl
@@ -0,0 +1,54 @@
+#!/usr/bin/perl
+
+# Copyright 2013-2015, Derrick Wood <dwood at cs.jhu.edu>
+#
+# This file is part of the Kraken taxonomic sequence classification system.
+#
+# Kraken is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Kraken is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Kraken. If not, see <http://www.gnu.org/licenses/>.
+
+# Checks each sequence header to ensure it has a GI number to
+# enable taxonomic ID lookup later. Also has some (very basic)
+# FASTA-format checking.
+
+use strict;
+use warnings;
+use File::Basename;
+
+my $PROG = basename $0;
+
+die "$PROG: must specify one filename!\n" if @ARGV != 1;
+
+my $filename = shift;
+
+open FASTA, "<", $filename
+ or die "$PROG: can't open $filename: $!\n";
+my $seq_ct = 0;
+my $errors = 0;
+while (<FASTA>) {
+ next unless /^>/;
+ $seq_ct++;
+ if (! /^>(\S+)/) {
+ $errors++;
+ warn "file $filename, line $. lacks sequence ID\n";
+ }
+ if ($1 !~ /(^|\|)(gi|kraken:taxid)\|(\d+)/) {
+ $errors++;
+ warn "file $filename, line $.: sequence ID lacks GI number\n";
+ }
+}
+close FASTA;
+
+if ($errors) {
+ exit 1;
+}
diff --git a/src/Makefile b/src/Makefile
new file mode 100644
index 0000000..2f927f6
--- /dev/null
+++ b/src/Makefile
@@ -0,0 +1,35 @@
+CXX = g++
+CXXFLAGS = -Wall -fopenmp -O3
+PROGS = db_sort set_lcas classify make_seqid_to_taxid_map db_shrink
+
+.PHONY: all install clean
+
+all: $(PROGS)
+
+install: $(PROGS)
+ cp $(PROGS) $(KRAKEN_DIR)/
+
+clean:
+ rm -f $(PROGS) *.o
+
+db_shrink: krakendb.o quickfile.o
+
+db_sort: krakendb.o quickfile.o
+
+set_lcas: krakendb.o quickfile.o krakenutil.o seqreader.o
+
+classify: krakendb.o quickfile.o krakenutil.o seqreader.o
+
+make_seqid_to_taxid_map: quickfile.o
+
+krakenutil.o: krakenutil.cpp krakenutil.hpp
+ $(CXX) $(CXXFLAGS) -c krakenutil.cpp
+
+krakendb.o: krakendb.cpp krakendb.hpp quickfile.hpp
+ $(CXX) $(CXXFLAGS) -c krakendb.cpp
+
+seqreader.o: seqreader.cpp seqreader.hpp quickfile.hpp
+ $(CXX) $(CXXFLAGS) -c seqreader.cpp
+
+quickfile.o: quickfile.cpp quickfile.hpp
+ $(CXX) $(CXXFLAGS) -c quickfile.cpp
diff --git a/src/classify.cpp b/src/classify.cpp
new file mode 100644
index 0000000..b3efb42
--- /dev/null
+++ b/src/classify.cpp
@@ -0,0 +1,449 @@
+/*
+ * Copyright 2013-2015, Derrick Wood <dwood at cs.jhu.edu>
+ *
+ * This file is part of the Kraken taxonomic sequence classification system.
+ *
+ * Kraken is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Kraken is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kraken. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "kraken_headers.hpp"
+#include "krakendb.hpp"
+#include "krakenutil.hpp"
+#include "quickfile.hpp"
+#include "seqreader.hpp"
+
+const size_t DEF_WORK_UNIT_SIZE = 500000;
+
+using namespace std;
+using namespace kraken;
+
+void parse_command_line(int argc, char **argv);
+void usage(int exit_code=EX_USAGE);
+void process_file(char *filename);
+void classify_sequence(DNASequence &dna, ostringstream &koss,
+ ostringstream &coss, ostringstream &uoss);
+string hitlist_string(vector<uint32_t> &taxa, vector<uint8_t> &ambig);
+set<uint32_t> get_ancestry(uint32_t taxon);
+void report_stats(struct timeval time1, struct timeval time2);
+
+int Num_threads = 1;
+string DB_filename, Index_filename, Nodes_filename;
+bool Quick_mode = false;
+bool Fastq_input = false;
+bool Print_classified = false;
+bool Print_unclassified = false;
+bool Print_kraken = true;
+bool Populate_memory = false;
+bool Only_classified_kraken_output = false;
+uint32_t Minimum_hit_count = 1;
+map<uint32_t, uint32_t> Parent_map;
+KrakenDB Database;
+string Classified_output_file, Unclassified_output_file, Kraken_output_file;
+ostream *Classified_output;
+ostream *Unclassified_output;
+ostream *Kraken_output;
+size_t Work_unit_size = DEF_WORK_UNIT_SIZE;
+
+uint64_t total_classified = 0;
+uint64_t total_sequences = 0;
+uint64_t total_bases = 0;
+
+int main(int argc, char **argv) {
+ #ifdef _OPENMP
+ omp_set_num_threads(1);
+ #endif
+
+ parse_command_line(argc, argv);
+ if (! Nodes_filename.empty())
+ Parent_map = build_parent_map(Nodes_filename);
+
+ if (Populate_memory)
+ cerr << "Loading database... ";
+
+ QuickFile db_file;
+ db_file.open_file(DB_filename);
+ if (Populate_memory)
+ db_file.load_file();
+ Database = KrakenDB(db_file.ptr());
+ KmerScanner::set_k(Database.get_k());
+
+ QuickFile idx_file;
+ idx_file.open_file(Index_filename);
+ if (Populate_memory)
+ idx_file.load_file();
+ KrakenDBIndex db_index(idx_file.ptr());
+ Database.set_index(&db_index);
+
+ if (Populate_memory)
+ cerr << "complete." << endl;
+
+ if (Print_classified) {
+ if (Classified_output_file == "-")
+ Classified_output = &cout;
+ else
+ Classified_output = new ofstream(Classified_output_file.c_str());
+ }
+
+ if (Print_unclassified) {
+ if (Unclassified_output_file == "-")
+ Unclassified_output = &cout;
+ else
+ Unclassified_output = new ofstream(Unclassified_output_file.c_str());
+ }
+
+ if (! Kraken_output_file.empty()) {
+ if (Kraken_output_file == "-")
+ Print_kraken = false;
+ else
+ Kraken_output = new ofstream(Kraken_output_file.c_str());
+ }
+ else
+ Kraken_output = &cout;
+
+ struct timeval tv1, tv2;
+ gettimeofday(&tv1, NULL);
+ for (int i = optind; i < argc; i++)
+ process_file(argv[i]);
+ gettimeofday(&tv2, NULL);
+
+ report_stats(tv1, tv2);
+
+ return 0;
+}
+
+void report_stats(struct timeval time1, struct timeval time2) {
+ time2.tv_usec -= time1.tv_usec;
+ time2.tv_sec -= time1.tv_sec;
+ if (time2.tv_usec < 0) {
+ time2.tv_sec--;
+ time2.tv_usec += 1000000;
+ }
+ double seconds = time2.tv_usec;
+ seconds /= 1e6;
+ seconds += time2.tv_sec;
+
+ cerr << "\r";
+ fprintf(stderr,
+ "%llu sequences (%.2f Mbp) processed in %.3fs (%.1f Kseq/m, %.2f Mbp/m).\n",
+ (unsigned long long) total_sequences, total_bases / 1.0e6, seconds,
+ total_sequences / 1.0e3 / (seconds / 60),
+ total_bases / 1.0e6 / (seconds / 60) );
+ fprintf(stderr, " %llu sequences classified (%.2f%%)\n",
+ (unsigned long long) total_classified, total_classified * 100.0 / total_sequences);
+ fprintf(stderr, " %llu sequences unclassified (%.2f%%)\n",
+ (unsigned long long) (total_sequences - total_classified),
+ (total_sequences - total_classified) * 100.0 / total_sequences);
+}
+
+void process_file(char *filename) {
+ string file_str(filename);
+ DNASequenceReader *reader;
+ DNASequence dna;
+
+ if (Fastq_input)
+ reader = new FastqReader(file_str);
+ else
+ reader = new FastaReader(file_str);
+
+ #pragma omp parallel
+ {
+ vector<DNASequence> work_unit;
+ ostringstream kraken_output_ss, classified_output_ss, unclassified_output_ss;
+
+ while (reader->is_valid()) {
+ work_unit.clear();
+ size_t total_nt = 0;
+ #pragma omp critical(get_input)
+ {
+ while (total_nt < Work_unit_size) {
+ dna = reader->next_sequence();
+ if (! reader->is_valid())
+ break;
+ work_unit.push_back(dna);
+ total_nt += dna.seq.size();
+ }
+ }
+ if (total_nt == 0)
+ break;
+
+ kraken_output_ss.str("");
+ classified_output_ss.str("");
+ unclassified_output_ss.str("");
+ for (size_t j = 0; j < work_unit.size(); j++)
+ classify_sequence( work_unit[j], kraken_output_ss,
+ classified_output_ss, unclassified_output_ss );
+
+ #pragma omp critical(write_output)
+ {
+ if (Print_kraken)
+ (*Kraken_output) << kraken_output_ss.str();
+ if (Print_classified)
+ (*Classified_output) << classified_output_ss.str();
+ if (Print_unclassified)
+ (*Unclassified_output) << unclassified_output_ss.str();
+ total_sequences += work_unit.size();
+ total_bases += total_nt;
+ cerr << "\rProcessed " << total_sequences << " sequences (" << total_bases << " bp) ...";
+ }
+ }
+ } // end parallel section
+
+ delete reader;
+}
+
+void classify_sequence(DNASequence &dna, ostringstream &koss,
+ ostringstream &coss, ostringstream &uoss) {
+ vector<uint32_t> taxa;
+ vector<uint8_t> ambig_list;
+ map<uint32_t, uint32_t> hit_counts;
+ uint64_t *kmer_ptr;
+ uint32_t taxon = 0;
+ uint32_t hits = 0; // only maintained if in quick mode
+
+ uint64_t current_bin_key;
+ int64_t current_min_pos = 1;
+ int64_t current_max_pos = 0;
+
+ if (dna.seq.size() >= Database.get_k()) {
+ KmerScanner scanner(dna.seq);
+ while ((kmer_ptr = scanner.next_kmer()) != NULL) {
+ taxon = 0;
+ if (scanner.ambig_kmer()) {
+ ambig_list.push_back(1);
+ }
+ else {
+ ambig_list.push_back(0);
+ uint32_t *val_ptr = Database.kmer_query(
+ Database.canonical_representation(*kmer_ptr),
+ ¤t_bin_key,
+ ¤t_min_pos, ¤t_max_pos
+ );
+ taxon = val_ptr ? *val_ptr : 0;
+ if (taxon) {
+ hit_counts[taxon]++;
+ if (Quick_mode && ++hits >= Minimum_hit_count)
+ break;
+ }
+ }
+ taxa.push_back(taxon);
+ }
+ }
+
+ uint32_t call = 0;
+ if (Quick_mode)
+ call = hits >= Minimum_hit_count ? taxon : 0;
+ else
+ call = resolve_tree(hit_counts, Parent_map);
+
+ if (call)
+ #pragma omp atomic
+ total_classified++;
+
+ if (Print_unclassified || Print_classified) {
+ ostringstream *oss_ptr = call ? &coss : &uoss;
+ bool print = call ? Print_classified : Print_unclassified;
+ if (print) {
+ if (Fastq_input) {
+ (*oss_ptr) << "@" << dna.header_line << endl
+ << dna.seq << endl
+ << "+" << endl
+ << dna.quals << endl;
+ }
+ else {
+ (*oss_ptr) << ">" << dna.header_line << endl
+ << dna.seq << endl;
+ }
+ }
+ }
+
+ if (! Print_kraken)
+ return;
+
+ if (call) {
+ koss << "C\t";
+ }
+ else {
+ if (Only_classified_kraken_output)
+ return;
+ koss << "U\t";
+ }
+ koss << dna.id << "\t" << call << "\t" << dna.seq.size() << "\t";
+
+ if (Quick_mode) {
+ koss << "Q:" << hits;
+ }
+ else {
+ if (taxa.empty())
+ koss << "0:0";
+ else
+ koss << hitlist_string(taxa, ambig_list);
+ }
+
+ koss << endl;
+}
+
+string hitlist_string(vector<uint32_t> &taxa, vector<uint8_t> &ambig)
+{
+ int64_t last_code;
+ int code_count = 1;
+ ostringstream hitlist;
+
+ if (ambig[0]) { last_code = -1; }
+ else { last_code = taxa[0]; }
+
+ for (size_t i = 1; i < taxa.size(); i++) {
+ int64_t code;
+ if (ambig[i]) { code = -1; }
+ else { code = taxa[i]; }
+
+ if (code == last_code) {
+ code_count++;
+ }
+ else {
+ if (last_code >= 0) {
+ hitlist << last_code << ":" << code_count << " ";
+ }
+ else {
+ hitlist << "A:" << code_count << " ";
+ }
+ code_count = 1;
+ last_code = code;
+ }
+ }
+ if (last_code >= 0) {
+ hitlist << last_code << ":" << code_count;
+ }
+ else {
+ hitlist << "A:" << code_count;
+ }
+ return hitlist.str();
+}
+
+set<uint32_t> get_ancestry(uint32_t taxon) {
+ set<uint32_t> path;
+
+ while (taxon > 0) {
+ path.insert(taxon);
+ taxon = Parent_map[taxon];
+ }
+ return path;
+}
+
+void parse_command_line(int argc, char **argv) {
+ int opt;
+ int sig;
+
+ if (argc > 1 && strcmp(argv[1], "-h") == 0)
+ usage(0);
+ while ((opt = getopt(argc, argv, "d:i:t:u:n:m:o:qfcC:U:M")) != -1) {
+ switch (opt) {
+ case 'd' :
+ DB_filename = optarg;
+ break;
+ case 'i' :
+ Index_filename = optarg;
+ break;
+ case 't' :
+ sig = atoi(optarg);
+ if (sig <= 0)
+ errx(EX_USAGE, "can't use nonpositive thread count");
+ #ifdef _OPENMP
+ Num_threads = sig;
+ omp_set_num_threads(Num_threads);
+ #endif
+ break;
+ case 'n' :
+ Nodes_filename = optarg;
+ break;
+ case 'q' :
+ Quick_mode = true;
+ break;
+ case 'm' :
+ sig = atoi(optarg);
+ if (sig <= 0)
+ errx(EX_USAGE, "can't use nonpositive minimum hit count");
+ Minimum_hit_count = sig;
+ break;
+ case 'f' :
+ Fastq_input = true;
+ break;
+ case 'c' :
+ Only_classified_kraken_output = true;
+ break;
+ case 'C' :
+ Print_classified = true;
+ Classified_output_file = optarg;
+ break;
+ case 'U' :
+ Print_unclassified = true;
+ Unclassified_output_file = optarg;
+ break;
+ case 'o' :
+ Kraken_output_file = optarg;
+ break;
+ case 'u' :
+ sig = atoi(optarg);
+ if (sig <= 0)
+ errx(EX_USAGE, "can't use nonpositive work unit size");
+ Work_unit_size = sig;
+ break;
+ case 'M' :
+ Populate_memory = true;
+ break;
+ default:
+ usage();
+ break;
+ }
+ }
+
+ if (DB_filename.empty()) {
+ cerr << "Missing mandatory option -d" << endl;
+ usage();
+ }
+ if (Index_filename.empty()) {
+ cerr << "Missing mandatory option -i" << endl;
+ usage();
+ }
+ if (Nodes_filename.empty() && ! Quick_mode) {
+ cerr << "Must specify one of -q or -n" << endl;
+ usage();
+ }
+ if (optind == argc) {
+ cerr << "No sequence data files specified" << endl;
+ }
+}
+
+void usage(int exit_code) {
+ cerr << "Usage: classify [options] <fasta/fastq file(s)>" << endl
+ << endl
+ << "Options: (*mandatory)" << endl
+ << "* -d filename Kraken DB filename" << endl
+ << "* -i filename Kraken DB index filename" << endl
+ << " -n filename NCBI Taxonomy nodes file" << endl
+ << " -o filename Output file for Kraken output" << endl
+ << " -t # Number of threads" << endl
+ << " -u # Thread work unit size (in bp)" << endl
+ << " -q Quick operation" << endl
+ << " -m # Minimum hit count (ignored w/o -q)" << endl
+ << " -C filename Print classified sequences" << endl
+ << " -U filename Print unclassified sequences" << endl
+ << " -f Input is in FASTQ format" << endl
+ << " -c Only include classified reads in output" << endl
+ << " -M Preload database files" << endl
+ << " -h Print this message" << endl
+ << endl
+ << "At least one FASTA or FASTQ file must be specified." << endl
+ << "Kraken output is to standard output by default." << endl;
+ exit(exit_code);
+}
diff --git a/src/db_shrink.cpp b/src/db_shrink.cpp
new file mode 100644
index 0000000..79feeb8
--- /dev/null
+++ b/src/db_shrink.cpp
@@ -0,0 +1,161 @@
+/*
+ * Copyright 2013-2015, Derrick Wood <dwood at cs.jhu.edu>
+ *
+ * This file is part of the Kraken taxonomic sequence classification system.
+ *
+ * Kraken is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Kraken is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kraken. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "kraken_headers.hpp"
+#include "quickfile.hpp"
+#include "krakendb.hpp"
+
+using namespace std;
+using namespace kraken;
+
+string Input_DB_filename, Output_DB_filename;
+uint64_t Output_count = 0;
+size_t Offset = 1;
+bool Operate_in_RAM = false;
+
+static void parse_command_line(int argc, char **argv);
+static void usage(int exit_code=EX_USAGE);
+
+int main(int argc, char **argv) {
+ parse_command_line(argc, argv);
+
+ uint64_t key_bits, val_len, key_count, key_len;
+ uint64_t pair_size;
+
+ // NOTE: Skipping the KrakenDB use here does make the code a
+ // bit redundant and messy, but it keeps the memory footprint
+ // low, which is important for this particular program.
+
+ // Check file magic and get metadata to calculate header size
+ char *buffer = new char[8];
+ ifstream input_file(Input_DB_filename.c_str(), std::ifstream::binary);
+ input_file.read(buffer, 8);
+ if (strncmp("JFLISTDN", buffer, 8) != 0) {
+ errx(EX_DATAERR, "input file not Jellyfish v1 database");
+ }
+ input_file.read(buffer, 8);
+ memcpy(&key_bits, buffer, 8);
+ size_t header_size = 72 + 2 * (4 + 8 * key_bits);
+ delete buffer;
+
+ // Read in header and get remaining metadata
+ buffer = new char[header_size];
+ input_file.seekg(0, ios_base::beg);
+ input_file.read(buffer, header_size);
+ memcpy(&val_len, buffer + 16, 8);
+ memcpy(&key_count, buffer + 48, 8);
+ key_len = key_bits / 8 + !! (key_bits % 8);
+ pair_size = key_len + val_len;
+
+ if (Output_count > key_count) {
+ errx(EX_DATAERR, "Requested new key count %llu larger than old key count %llu, aborting...",
+ (long long unsigned int) Output_count,
+ (long long unsigned int) key_count);
+ }
+
+ // Change key count
+ memcpy(buffer + 48, &Output_count, 8);
+
+ // Copy input header to output file
+ ofstream output_file(Output_DB_filename.c_str(), std::ofstream::binary);
+ output_file.write(buffer, header_size);
+
+ delete buffer;
+
+ // Prep buffer for scan/select loop
+ // We select one pair (the last) per "block"
+ size_t block_size = key_count / Output_count;
+ if (block_size < Offset) {
+ errx(EX_DATAERR, "offset %llu larger than block size %llu, aborting.",
+ (long long unsigned int) Offset,
+ (long long unsigned int) block_size);
+ }
+ // Some blocks have an extra element
+ size_t odd_block_count = key_count % Output_count;
+ buffer = new char[pair_size * (block_size + 1)];
+ size_t remaining_input_pairs = key_count;
+ size_t current_output_count = 0;
+
+ while (remaining_input_pairs) {
+ size_t pairs_to_read;
+ if (odd_block_count == 0) {
+ pairs_to_read = block_size;
+ }
+ else {
+ pairs_to_read = block_size + 1;
+ odd_block_count--;
+ }
+
+ input_file.read(buffer, pairs_to_read * pair_size);
+ remaining_input_pairs -= pairs_to_read;
+
+ output_file.write(buffer + pair_size * (pairs_to_read - Offset), pair_size);
+ current_output_count++;
+ if (current_output_count % 10000 == 0) {
+ cerr << "\rWritten " << current_output_count << "/" << Output_count << " k-mers to new file";
+ }
+ }
+ cerr << "\rWrote " << current_output_count << "/" << Output_count << " k-mers to new file \n";
+
+ input_file.close();
+ output_file.close();
+
+ return 0;
+}
+
+void parse_command_line(int argc, char **argv) {
+ int opt;
+ int sig;
+
+ if (argc > 1 && strcmp(argv[1], "-h") == 0)
+ usage(0);
+ while ((opt = getopt(argc, argv, "d:o:n:O:")) != -1) {
+ switch (opt) {
+ case 'n' :
+ sig = atoi(optarg);
+ if (sig < 1)
+ errx(EX_USAGE, "output count cannot be negative");
+ Output_count = sig;
+ break;
+ case 'd' :
+ Input_DB_filename = optarg;
+ break;
+ case 'o' :
+ Output_DB_filename = optarg;
+ break;
+ case 'O' :
+ sig = atoi(optarg);
+ if (sig < 1)
+ errx(EX_USAGE, "offset count cannot be negative");
+ Offset = sig;
+ break;
+ default:
+ usage();
+ break;
+ }
+ }
+
+ if (Input_DB_filename.empty() || Output_DB_filename.empty() || ! Output_count)
+ usage();
+}
+
+void usage(int exit_code) {
+ cerr << "Usage: db_shrink [-O offset] <-d input db> <-o output db> <-n output count>\n";
+ exit(exit_code);
+}
diff --git a/src/db_sort.cpp b/src/db_sort.cpp
new file mode 100644
index 0000000..f4b8aca
--- /dev/null
+++ b/src/db_sort.cpp
@@ -0,0 +1,177 @@
+/*
+ * Copyright 2013-2015, Derrick Wood <dwood at cs.jhu.edu>
+ *
+ * This file is part of the Kraken taxonomic sequence classification system.
+ *
+ * Kraken is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Kraken is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kraken. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "kraken_headers.hpp"
+#include "quickfile.hpp"
+#include "krakendb.hpp"
+
+using namespace std;
+using namespace kraken;
+
+string Input_DB_filename, Output_DB_filename, Index_filename;
+uint8_t Bin_key_nt = 15;
+int Num_threads = 1;
+bool Zero_vals = false;
+bool Operate_in_RAM = false;
+// Global until I can find a way to pass this to the sorting function
+size_t Key_len = 8;
+
+static int pair_cmp(const void *a, const void *b);
+static void parse_command_line(int argc, char **argv);
+static void bin_and_sort_data(KrakenDB &kdb, char *data, KrakenDBIndex &idx);
+static void usage(int exit_code=EX_USAGE);
+
+int main(int argc, char **argv) {
+ #ifdef _OPENMP
+ omp_set_num_threads(1);
+ #endif
+
+ parse_command_line(argc, argv);
+
+ QuickFile input_db_file(Input_DB_filename);
+ KrakenDB *input_db = new KrakenDB(input_db_file.ptr());
+ Key_len = input_db->get_key_len();
+ uint64_t val_len = input_db->get_val_len();
+ uint64_t key_ct = input_db->get_key_ct();
+ input_db->make_index(Index_filename, Bin_key_nt);
+ QuickFile index_file(Index_filename);
+ KrakenDBIndex db_index(index_file.ptr());
+
+ uint64_t skip_len = input_db->header_size();
+ char *header = new char[ skip_len ];
+ memcpy(header, input_db_file.ptr(), skip_len);
+
+ delete input_db;
+ // No longer legit for traversal, but allows access to KDB functions
+ input_db = new KrakenDB(header);
+ input_db_file.close_file(); // Stop using memory-mapped file
+
+ char *data = new char[ key_ct * (Key_len + val_len) ];
+ // Populate data w/ pairs from DB and sort bins in parallel
+ bin_and_sort_data(*input_db, data, db_index);
+
+ ofstream output_file(Output_DB_filename.c_str(), std::ofstream::binary);
+ output_file.write(header, skip_len);
+ output_file.write(data, key_ct * (Key_len + val_len));
+ output_file.close();
+
+ return 0;
+}
+
+static void bin_and_sort_data(KrakenDB &kdb, char *data, KrakenDBIndex &idx) {
+ uint8_t nt = idx.indexed_nt();
+ uint64_t *offsets = idx.get_array();
+ uint64_t entries = 1ull << (nt * 2);
+ uint64_t key_len = kdb.get_key_len();
+ uint64_t val_len = kdb.get_val_len();
+ uint64_t pair_size = key_len + val_len;
+ char pair[pair_size];
+
+ ifstream input_file(Input_DB_filename.c_str(), std::ifstream::binary);
+ input_file.seekg(kdb.header_size(), ios_base::beg);
+
+ // Create a copy of the offsets array for use as insertion positions
+ vector<uint64_t> pos(offsets, offsets + entries);
+ for (uint64_t i = 0; i < kdb.get_key_ct(); i++) {
+ input_file.read(pair, pair_size);
+ uint64_t kmer = 0;
+ memcpy(&kmer, pair, key_len);
+ uint64_t b_key = kdb.bin_key(kmer, nt);
+ char *pair_pos = data + pair_size * pos[b_key]++;
+ // Copy pair into correct bin (but not final position)
+ memcpy(pair_pos, pair, pair_size);
+ if (Zero_vals)
+ memset(pair_pos + key_len, 0, val_len);
+ }
+ input_file.close();
+
+ // Sort all bins
+ #pragma omp parallel for schedule(dynamic)
+ for (uint64_t i = 0; i < entries; i++) {
+ qsort(data + offsets[i] * pair_size,
+ offsets[i+1] - offsets[i], pair_size,
+ pair_cmp);
+ }
+}
+
+static int pair_cmp(const void *a, const void *b) {
+ uint64_t aval = 0, bval = 0;
+ memcpy(&aval, a, Key_len);
+ memcpy(&bval, b, Key_len);
+ if (aval < bval)
+ return -1;
+ else if (aval == bval)
+ return 0;
+ else
+ return 1;
+}
+
+void parse_command_line(int argc, char **argv) {
+ int opt;
+ int sig;
+
+ if (argc > 1 && strcmp(argv[1], "-h") == 0)
+ usage(0);
+ while ((opt = getopt(argc, argv, "n:d:o:i:t:zM")) != -1) {
+ switch (opt) {
+ case 'n' :
+ sig = atoi(optarg);
+ if (sig < 1 || sig > 31)
+ errx(EX_USAGE, "bin key length out of range");
+ Bin_key_nt = (uint8_t) sig;
+ break;
+ case 'd' :
+ Input_DB_filename = optarg;
+ break;
+ case 'o' :
+ Output_DB_filename = optarg;
+ break;
+ case 'i' :
+ Index_filename = optarg;
+ break;
+ case 'M' :
+ Operate_in_RAM = true;
+ break;
+ case 't' :
+ sig = atoi(optarg);
+ if (sig <= 0)
+ errx(EX_USAGE, "can't use nonpositive thread count");
+ #ifdef _OPENMP
+ Num_threads = sig;
+ omp_set_num_threads(Num_threads);
+ #endif
+ break;
+ case 'z' :
+ Zero_vals = true;
+ break;
+ default:
+ usage();
+ break;
+ }
+ }
+
+ if (Input_DB_filename.empty() || Output_DB_filename.empty()
+ || Index_filename.empty())
+ usage();
+}
+
+void usage(int exit_code) {
+ cerr << "Usage: db_sort [-z] [-M] [-t threads] [-n nt] <-d input db> <-o output db> <-i output idx>\n";
+ exit(exit_code);
+}
diff --git a/src/kraken_headers.hpp b/src/kraken_headers.hpp
new file mode 100644
index 0000000..bca9858
--- /dev/null
+++ b/src/kraken_headers.hpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2013-2015, Derrick Wood <dwood at cs.jhu.edu>
+ *
+ * This file is part of the Kraken taxonomic sequence classification system.
+ *
+ * Kraken is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Kraken is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kraken. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef KRAKEN_HEADERS_HPP
+#define KRAKEN_HEADERS_HPP
+
+#ifndef _XOPEN_SOURCE
+#define _XOPEN_SOURCE 1
+#endif
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <omp.h>
+#include <set>
+#include <sstream>
+#include <stdint.h>
+#include <string.h>
+#include <string>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sysexits.h>
+#include <unistd.h>
+#include <vector>
+
+#endif
diff --git a/src/krakendb.cpp b/src/krakendb.cpp
new file mode 100644
index 0000000..83e71b3
--- /dev/null
+++ b/src/krakendb.cpp
@@ -0,0 +1,321 @@
+/*
+ * Copyright 2013-2015, Derrick Wood <dwood at cs.jhu.edu>
+ *
+ * This file is part of the Kraken taxonomic sequence classification system.
+ *
+ * Kraken is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Kraken is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kraken. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "kraken_headers.hpp"
+#include "krakendb.hpp"
+#include "quickfile.hpp"
+
+using std::string;
+using std::vector;
+
+namespace kraken {
+
+// File type code for Jellyfish/Kraken DBs
+static const char * DATABASE_FILE_TYPE = "JFLISTDN";
+
+// File type code on Kraken DB index
+// Next byte determines # of indexed nt
+static const char * KRAKEN_INDEX_STRING = "KRAKIDX";
+
+// File type code for Kraken DB index (v2)
+// v2 index corresponds to DB sorted by scrambled order
+// Next byte determines # of indexed nt
+static const char * KRAKEN_INDEX2_STRING = "KRAKIX2";
+
+// XOR mask for minimizer bin keys (allows for better distribution)
+// scrambles minimizer sort order
+static const uint64_t INDEX2_XOR_MASK = 0xe37e28c4271b5a2dULL;
+
+// Basic constructor
+KrakenDB::KrakenDB() {
+ fptr = NULL;
+ index_ptr = NULL;
+ key_ct = 0;
+ val_len = 0;
+ key_len = 0;
+ key_bits = 0;
+ k = 0;
+}
+
+// Assumes ptr points to start of a readable mmap'ed file
+KrakenDB::KrakenDB(char *ptr) {
+ index_ptr = NULL;
+ fptr = ptr;
+ if (strncmp(ptr, DATABASE_FILE_TYPE, strlen(DATABASE_FILE_TYPE)))
+ errx(EX_DATAERR, "database in improper format");
+ memcpy(&key_bits, ptr + 8, 8);
+ memcpy(&val_len, ptr + 16, 8);
+ memcpy(&key_ct, ptr + 48, 8);
+ if (val_len != 4)
+ errx(EX_DATAERR, "can only handle 4 byte DB values");
+ k = key_bits / 2;
+ key_len = key_bits / 8 + !! (key_bits % 8);
+}
+
+// Creates an index, indicating starting positions of each bin
+// Bins contain k-mer/taxon pairs with k-mers that share a bin key
+void KrakenDB::make_index(string index_filename, uint8_t nt) {
+ uint64_t entries = 1ull << (nt * 2);
+ vector<uint64_t> bin_counts(entries);
+ char *ptr = get_pair_ptr();
+ #pragma omp parallel for schedule(dynamic,400)
+ for (uint64_t i = 0; i < key_ct; i++) {
+ uint64_t kmer = 0;
+ memcpy(&kmer, ptr + i * pair_size(), key_len);
+ uint64_t b_key = bin_key(kmer, nt);
+ #pragma omp atomic
+ bin_counts[b_key]++;
+ }
+
+ uint64_t *bin_offsets = new uint64_t[ entries + 1 ];
+ bin_offsets[0] = 0;
+ for (uint64_t i = 1; i <= entries; i++)
+ bin_offsets[i] = bin_offsets[i-1] + bin_counts[i-1];
+
+ QuickFile idx_file(index_filename, "w",
+ strlen(KRAKEN_INDEX2_STRING) + 1 + sizeof(*bin_offsets) * (entries + 1));
+ char *idx_ptr = idx_file.ptr();
+ memcpy(idx_ptr, KRAKEN_INDEX2_STRING, strlen(KRAKEN_INDEX2_STRING));
+ idx_ptr += strlen(KRAKEN_INDEX2_STRING);
+ memcpy(idx_ptr++, &nt, 1);
+ memcpy(idx_ptr, bin_offsets, sizeof(*bin_offsets) * (entries + 1));
+}
+
+// Simple accessor
+char *KrakenDB::get_ptr() {
+ return fptr;
+}
+
+// Returns start of k-mer/taxon pair array (skips header)
+char *KrakenDB::get_pair_ptr() {
+ return fptr == NULL ? NULL : fptr + header_size();
+}
+
+// Simple accessor
+KrakenDBIndex *KrakenDB::get_index() {
+ return index_ptr;
+}
+
+// Associates the index with this database
+void KrakenDB::set_index(KrakenDBIndex *i_ptr) {
+ index_ptr = i_ptr;
+}
+
+// Simple accessors/convenience methods
+uint8_t KrakenDB::get_k() { return k; }
+uint64_t KrakenDB::get_key_bits() { return key_bits; }
+uint64_t KrakenDB::get_key_len() { return key_len; }
+uint64_t KrakenDB::get_val_len() { return val_len; }
+uint64_t KrakenDB::get_key_ct() { return key_ct; }
+uint64_t KrakenDB::pair_size() { return key_len + val_len; }
+size_t KrakenDB::header_size() { return 72 + 2 * (4 + 8 * key_bits); }
+
+// Bin key: each k-mer is made of several overlapping m-mers, m < k
+// The bin key is the m-mer whose canonical representation is "smallest"
+// ("smallest" can refer to lexico. ordering or some other ordering)
+uint64_t KrakenDB::bin_key(uint64_t kmer, uint64_t idx_nt) {
+ uint8_t nt = idx_nt;
+ uint64_t xor_mask = INDEX2_XOR_MASK;
+ uint64_t mask = 1 << (nt * 2);
+ mask--;
+ xor_mask &= mask;
+ uint64_t min_bin_key = ~0;
+ for (uint64_t i = 0; i < key_bits / 2 - nt + 1; i++) {
+ uint64_t temp_bin_key = xor_mask ^ canonical_representation(kmer & mask, nt);
+ if (temp_bin_key < min_bin_key)
+ min_bin_key = temp_bin_key;
+ kmer >>= 2;
+ }
+ return min_bin_key;
+}
+
+// Separate functions to avoid a conditional in the function
+// This probably isn't necessary...
+uint64_t KrakenDB::bin_key(uint64_t kmer) {
+ uint8_t nt = index_ptr->indexed_nt();
+ uint8_t idx_type = index_ptr->index_type();
+ uint64_t xor_mask = idx_type == 1 ? 0 : INDEX2_XOR_MASK;
+ uint64_t mask = 1 << (nt * 2);
+ mask--;
+ xor_mask &= mask;
+ uint64_t min_bin_key = ~0;
+ for (uint64_t i = 0; i < key_bits / 2 - nt + 1; i++) {
+ uint64_t temp_bin_key = xor_mask ^ canonical_representation(kmer & mask, nt);
+ if (temp_bin_key < min_bin_key)
+ min_bin_key = temp_bin_key;
+ kmer >>= 2;
+ }
+ return min_bin_key;
+}
+
+// Code mostly from Jellyfish 1.6 source
+uint64_t KrakenDB::reverse_complement(uint64_t kmer, uint8_t n) {
+ kmer = ((kmer >> 2) & 0x3333333333333333UL) | ((kmer & 0x3333333333333333UL) << 2);
+ kmer = ((kmer >> 4) & 0x0F0F0F0F0F0F0F0FUL) | ((kmer & 0x0F0F0F0F0F0F0F0FUL) << 4);
+ kmer = ((kmer >> 8) & 0x00FF00FF00FF00FFUL) | ((kmer & 0x00FF00FF00FF00FFUL) << 8);
+ kmer = ((kmer >> 16) & 0x0000FFFF0000FFFFUL) | ((kmer & 0x0000FFFF0000FFFFUL) << 16);
+ kmer = ( kmer >> 32 ) | ( kmer << 32);
+ return (((uint64_t)-1) - kmer) >> (8 * sizeof(kmer) - (n << 1));
+}
+
+// Code mostly from Jellyfish 1.6 source
+uint64_t KrakenDB::reverse_complement(uint64_t kmer) {
+ kmer = ((kmer >> 2) & 0x3333333333333333UL) | ((kmer & 0x3333333333333333UL) << 2);
+ kmer = ((kmer >> 4) & 0x0F0F0F0F0F0F0F0FUL) | ((kmer & 0x0F0F0F0F0F0F0F0FUL) << 4);
+ kmer = ((kmer >> 8) & 0x00FF00FF00FF00FFUL) | ((kmer & 0x00FF00FF00FF00FFUL) << 8);
+ kmer = ((kmer >> 16) & 0x0000FFFF0000FFFFUL) | ((kmer & 0x0000FFFF0000FFFFUL) << 16);
+ kmer = ( kmer >> 32 ) | ( kmer << 32);
+ return (((uint64_t)-1) - kmer) >> (8 * sizeof(kmer) - (k << 1));
+}
+
+// Lexicographically smallest of k-mer and reverse comp. of k-mer
+uint64_t KrakenDB::canonical_representation(uint64_t kmer, uint8_t n) {
+ uint64_t revcom = reverse_complement(kmer, n);
+ return kmer < revcom ? kmer : revcom;
+}
+
+uint64_t KrakenDB::canonical_representation(uint64_t kmer) {
+ uint64_t revcom = reverse_complement(kmer, k);
+ return kmer < revcom ? kmer : revcom;
+}
+
+// perform search over last range to speed up queries
+// NOTE: retry_on_failure implies all pointer params are non-NULL
+uint32_t *KrakenDB::kmer_query(uint64_t kmer, uint64_t *last_bin_key,
+ int64_t *min_pos, int64_t *max_pos,
+ bool retry_on_failure)
+{
+ int64_t min, max, mid;
+ uint64_t comp_kmer;
+ uint64_t b_key;
+ char *ptr = get_pair_ptr();
+ size_t pair_sz = pair_size();
+
+ // Use provided values if they exist and are valid
+ if (retry_on_failure && *min_pos <= *max_pos) {
+ b_key = *last_bin_key;
+ min = *min_pos;
+ max = *max_pos;
+ }
+ else {
+ b_key = bin_key(kmer);
+ min = index_ptr->at(b_key);
+ max = index_ptr->at(b_key + 1) - 1;
+ // Invalid min/max values + retry_on_failure means min/max need to be
+ // initialized and set in caller
+ if (retry_on_failure) {
+ *last_bin_key = b_key;
+ *min_pos = min;
+ *max_pos = max;
+ }
+ }
+
+ // Binary search with large window
+ while (min + 15 <= max) {
+ mid = min + (max - min) / 2;
+ comp_kmer = 0;
+ memcpy(&comp_kmer, ptr + pair_sz * mid, key_len);
+ comp_kmer &= (1ull << key_bits) - 1; // trim any excess
+ if (kmer > comp_kmer)
+ min = mid + 1;
+ else if (kmer < comp_kmer)
+ max = mid - 1;
+ else
+ return (uint32_t *) (ptr + pair_sz * mid + key_len);
+ }
+ // Linear search once window shrinks
+ for (mid = min; mid <= max; mid++) {
+ comp_kmer = 0;
+ memcpy(&comp_kmer, ptr + pair_sz * mid, key_len);
+ comp_kmer &= (1ull << key_bits) - 1; // trim any excess
+ if (kmer == comp_kmer)
+ return (uint32_t *) (ptr + pair_sz * mid + key_len);
+ }
+
+ uint32_t *answer = NULL;
+ // ROF implies the provided values might be out of date
+ // If they are, we'll update them and search again
+ if (retry_on_failure) {
+ b_key = bin_key(kmer);
+ // If bin key hasn't changed, search fails
+ if (b_key == *last_bin_key)
+ return NULL;
+ min = index_ptr->at(b_key);
+ max = index_ptr->at(b_key + 1) - 1;
+ // Recursive call w/ adjusted search params and w/o retry
+ answer = kmer_query(kmer, &b_key, &min, &max, false);
+ // Update caller's search params due to bin key change
+ if (last_bin_key != NULL) {
+ *last_bin_key = b_key;
+ *min_pos = min;
+ *max_pos = max;
+ }
+ }
+ return answer;
+}
+
+// Binary search w/in the k-mer's bin
+uint32_t *KrakenDB::kmer_query(uint64_t kmer) {
+ return kmer_query(kmer, NULL, NULL, NULL, false);
+}
+
+KrakenDBIndex::KrakenDBIndex() {
+ fptr = NULL;
+ idx_type = 1;
+ nt = 0;
+}
+
+KrakenDBIndex::KrakenDBIndex(char *ptr) {
+ fptr = ptr;
+ idx_type = 1;
+ if (strncmp(ptr, KRAKEN_INDEX_STRING, strlen(KRAKEN_INDEX_STRING))) {
+ idx_type = 2;
+ if (strncmp(ptr, KRAKEN_INDEX2_STRING, strlen(KRAKEN_INDEX2_STRING)))
+ errx(EX_DATAERR, "illegal Kraken DB index format");
+ }
+ ptr += strlen(KRAKEN_INDEX_STRING);
+ memcpy(&nt, ptr, 1);
+}
+
+// Index version (v2 uses different minimizer sort order)
+uint8_t KrakenDBIndex::index_type() {
+ return idx_type;
+}
+
+// How long are bin keys (i.e., what is minimizer length in bp?)
+uint8_t KrakenDBIndex::indexed_nt() {
+ return nt;
+}
+
+// Return start of index array (skips header)
+uint64_t *KrakenDBIndex::get_array() {
+ return (uint64_t *) (fptr + strlen(KRAKEN_INDEX_STRING) + 1);
+}
+
+// Convenience method, allows for testing guard
+uint64_t KrakenDBIndex::at(uint64_t idx) {
+ uint64_t *array = get_array();
+ #ifdef TESTING
+ if (idx > 1 + (1ull << (nt * 2)))
+ errx(EX_SOFTWARE, "KrakenDBIndex::at() called with illegal index");
+ #endif
+ return array[idx];
+}
+
+} // namespace
diff --git a/src/krakendb.hpp b/src/krakendb.hpp
new file mode 100644
index 0000000..c30eeb5
--- /dev/null
+++ b/src/krakendb.hpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright 2013-2015, Derrick Wood <dwood at cs.jhu.edu>
+ *
+ * This file is part of the Kraken taxonomic sequence classification system.
+ *
+ * Kraken is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Kraken is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kraken. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef KRAKENDB_HPP
+#define KRAKENDB_HPP
+
+#include "kraken_headers.hpp"
+
+namespace kraken {
+ class KrakenDBIndex {
+ public:
+ KrakenDBIndex();
+ // ptr points to mmap'ed existing file opened in read or read/write mode
+ KrakenDBIndex(char *ptr);
+
+ uint8_t index_type();
+ uint8_t indexed_nt();
+ uint64_t *get_array();
+ uint64_t at(uint64_t idx);
+
+ private:
+ uint8_t idx_type;
+ char *fptr;
+ uint8_t nt;
+ };
+
+ class KrakenDB {
+ public:
+
+ char *get_ptr(); // Return the file pointer
+ char *get_pair_ptr(); // Return pointer to start of pairs
+ KrakenDBIndex *get_index(); // Return ptr to assoc'd index obj
+ uint8_t get_k(); // how many nt are in each key?
+ uint64_t get_key_bits(); // how many bits are in each key?
+ uint64_t get_key_len(); // how many bytes does each key occupy?
+ uint64_t get_val_len(); // how many bytes does each value occupy?
+ uint64_t get_key_ct(); // how many key/value pairs are there?
+ uint64_t pair_size(); // how many bytes does each pair occupy?
+
+ size_t header_size(); // Jellyfish uses variable header sizes
+ uint32_t *kmer_query(uint64_t kmer); // return ptr to pair w/ kmer
+
+ // perform search over last range to speed up queries
+ uint32_t *kmer_query(uint64_t kmer, uint64_t *last_bin_key,
+ int64_t *min_pos, int64_t *max_pos,
+ bool retry_on_failure=true);
+
+ // return "bin key" for kmer, based on index
+ // If idx_nt not specified, use index's value
+ uint64_t bin_key(uint64_t kmer, uint64_t idx_nt);
+ uint64_t bin_key(uint64_t kmer);
+
+ // Code from Jellyfish, rev. comp. of a k-mer with n nt.
+ // If n is not specified, use k in DB, otherwise use first n nt in kmer
+ uint64_t reverse_complement(uint64_t kmer, uint8_t n);
+ uint64_t reverse_complement(uint64_t kmer);
+
+ // Return lexicographically smallest of kmer/revcom(kmer)
+ // If n is not specified, use k in DB, otherwise use first n nt in kmer
+ uint64_t canonical_representation(uint64_t kmer, uint8_t n);
+ uint64_t canonical_representation(uint64_t kmer);
+
+ void make_index(std::string index_filename, uint8_t nt);
+
+ void set_index(KrakenDBIndex *i_ptr);
+
+ // Null constructor
+ KrakenDB();
+
+ // ptr points to start of mmap'ed DB in read or read/write mode
+ KrakenDB(char *ptr);
+
+ private:
+
+ char *fptr;
+ KrakenDBIndex *index_ptr;
+ uint8_t k;
+ uint64_t key_bits;
+ uint64_t key_len;
+ uint64_t val_len;
+ uint64_t key_ct;
+ };
+}
+
+#endif
diff --git a/src/krakenutil.cpp b/src/krakenutil.cpp
new file mode 100644
index 0000000..a00e6bb
--- /dev/null
+++ b/src/krakenutil.cpp
@@ -0,0 +1,181 @@
+/*
+ * Copyright 2013-2015, Derrick Wood <dwood at cs.jhu.edu>
+ *
+ * This file is part of the Kraken taxonomic sequence classification system.
+ *
+ * Kraken is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Kraken is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kraken. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "kraken_headers.hpp"
+#include "krakenutil.hpp"
+
+using namespace std;
+
+namespace kraken {
+ // Build a node->parent map from NCBI Taxonomy nodes.dmp file
+ map<uint32_t, uint32_t> build_parent_map(string filename) {
+ map<uint32_t, uint32_t> pmap;
+ uint32_t node_id, parent_id;
+ string line;
+ ifstream ifs(filename.c_str());
+ if (ifs.rdstate() & ifstream::failbit) {
+ err(EX_NOINPUT, "error opening %s", filename.c_str());
+ }
+
+ while (ifs.good()) {
+ getline(ifs, line);
+ if (line.empty())
+ break;
+ sscanf(line.c_str(), "%d\t|\t%d", &node_id, &parent_id);
+ pmap[node_id] = parent_id;
+ }
+ pmap[1] = 0;
+ return pmap;
+ }
+
+ // Return lowest common ancestor of a and b
+ // LCA(0,x) = LCA(x,0) = x
+ // Default ancestor is 1 (root of tree)
+ uint32_t lca(map<uint32_t, uint32_t> &parent_map,
+ uint32_t a, uint32_t b)
+ {
+ if (a == 0 || b == 0)
+ return a ? a : b;
+
+ set<uint32_t> a_path;
+ while (a > 0) {
+ a_path.insert(a);
+ a = parent_map[a];
+ }
+ while (b > 0) {
+ if (a_path.count(b) > 0)
+ return b;
+ b = parent_map[b];
+ }
+ return 1;
+ }
+
+ // Tree resolution: take all hit taxa (plus ancestors), then
+ // return leaf of highest weighted leaf-to-root path.
+ uint32_t resolve_tree(map<uint32_t, uint32_t> &hit_counts,
+ map<uint32_t, uint32_t> &parent_map)
+ {
+ set<uint32_t> max_taxa;
+ uint32_t max_taxon = 0, max_score = 0;
+ map<uint32_t, uint32_t>::iterator it = hit_counts.begin();
+
+ // Sum each taxon's LTR path
+ while (it != hit_counts.end()) {
+ uint32_t taxon = it->first;
+ uint32_t node = taxon;
+ uint32_t score = 0;
+ while (node > 0) {
+ score += hit_counts[node];
+ node = parent_map[node];
+ }
+
+ if (score > max_score) {
+ max_taxa.clear();
+ max_score = score;
+ max_taxon = taxon;
+ }
+ else if (score == max_score) {
+ if (max_taxa.empty())
+ max_taxa.insert(max_taxon);
+ max_taxa.insert(taxon);
+ }
+
+ it++;
+ }
+
+ // If two LTR paths are tied for max, return LCA of all
+ if (! max_taxa.empty()) {
+ set<uint32_t>::iterator sit = max_taxa.begin();
+ max_taxon = *sit;
+ for (sit++; sit != max_taxa.end(); sit++)
+ max_taxon = lca(parent_map, max_taxon, *sit);
+ }
+
+ return max_taxon;
+ }
+
+ uint8_t KmerScanner::k = 0;
+ uint64_t KmerScanner::kmer_mask = 0;
+ uint32_t KmerScanner::mini_kmer_mask = 0;
+
+ // Create a scanner for the string over the interval [start, finish)
+ KmerScanner::KmerScanner(string &seq, size_t start, size_t finish) {
+ if (! k)
+ errx(EX_SOFTWARE, "KmerScanner created w/o setting k");
+ if (finish > seq.size())
+ finish = seq.size();
+
+ kmer = 0;
+ ambig = 0;
+ str = &seq;
+ curr_pos = start;
+ pos1 = start;
+ pos2 = finish;
+ loaded_nt = 0;
+ if (pos2 - pos1 + 1 < k)
+ curr_pos = pos2;
+ }
+
+ uint8_t KmerScanner::get_k() { return k; }
+
+ void KmerScanner::set_k(uint8_t n) {
+ if (k) // Only allow one setting per execution
+ return;
+ k = n;
+ kmer_mask = ~0;
+ kmer_mask >>= sizeof(kmer_mask) * 8 - (k * 2);
+ mini_kmer_mask = ~0;
+ mini_kmer_mask >>= sizeof(mini_kmer_mask) * 8 - k;
+ }
+
+ uint64_t *KmerScanner::next_kmer() {
+ if (curr_pos >= pos2)
+ return NULL;
+ if (loaded_nt)
+ loaded_nt--;
+ while (loaded_nt < k) {
+ loaded_nt++;
+ kmer <<= 2;
+ ambig <<= 1;
+ switch ((*str)[curr_pos++]) {
+ case 'A': case 'a':
+ break;
+ case 'C': case 'c':
+ kmer |= 1;
+ break;
+ case 'G': case 'g':
+ kmer |= 2;
+ break;
+ case 'T': case 't':
+ kmer |= 3;
+ break;
+ default:
+ ambig |= 1;
+ break;
+ }
+ kmer &= kmer_mask;
+ ambig &= mini_kmer_mask;
+ }
+ return &kmer;
+ }
+
+ bool KmerScanner::ambig_kmer() {
+ return !! ambig;
+ }
+}
diff --git a/src/krakenutil.hpp b/src/krakenutil.hpp
new file mode 100644
index 0000000..196ee1d
--- /dev/null
+++ b/src/krakenutil.hpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright 2013-2015, Derrick Wood <dwood at cs.jhu.edu>
+ *
+ * This file is part of the Kraken taxonomic sequence classification system.
+ *
+ * Kraken is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Kraken is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kraken. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef KRAKENUTIL_HPP
+#define KRAKENUTIL_HPP
+
+#include "kraken_headers.hpp"
+
+namespace kraken {
+ // Build a map of node to parent from an NCBI taxonomy nodes.dmp file
+ std::map<uint32_t, uint32_t> build_parent_map(std::string filename);
+
+ // Return the lowest common ancestor of a and b, according to parent_map
+ // NOTE: LCA(0,x) = LCA(x,0) = x
+ uint32_t lca(std::map<uint32_t, uint32_t> &parent_map,
+ uint32_t a, uint32_t b);
+
+ // Resolve classification tree
+ uint32_t resolve_tree(std::map<uint32_t, uint32_t> &hit_counts,
+ std::map<uint32_t, uint32_t> &parent_map);
+
+ class KmerScanner {
+ public:
+
+ KmerScanner(std::string &seq, size_t start=0, size_t finish=~0);
+ uint64_t *next_kmer(); // NULL when seq exhausted
+ bool ambig_kmer(); // does last returned kmer have non-ACGT?
+
+
+ static uint8_t get_k();
+ // MUST be called before first invocation of KmerScanner()
+ static void set_k(uint8_t n);
+
+ private:
+ std::string *str;
+ size_t curr_pos, pos1, pos2;
+ uint64_t kmer; // the kmer, address is returned (don't share b/t thr.)
+ uint32_t ambig; // is there an ambiguous nucleotide in the kmer?
+ int64_t loaded_nt;
+
+ static uint8_t k; // init. to 0 b/c static
+ static uint64_t kmer_mask;
+ static uint32_t mini_kmer_mask;
+ };
+}
+
+#endif
diff --git a/src/make_seqid_to_taxid_map.cpp b/src/make_seqid_to_taxid_map.cpp
new file mode 100644
index 0000000..30b3091
--- /dev/null
+++ b/src/make_seqid_to_taxid_map.cpp
@@ -0,0 +1,120 @@
+/*
+ * Copyright 2013-2015, Derrick Wood <dwood at cs.jhu.edu>
+ *
+ * This file is part of the Kraken taxonomic sequence classification system.
+ *
+ * Kraken is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Kraken is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kraken. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+// Produce a mapping of sequence IDs to taxon IDs
+
+// This program's reason for being is that the gi_taxid_nucl.dmp file
+// is monstrously huge, and the only efficient way to do this task is
+// to use mmap to quickly access the file. Otherwise, I'd have just
+// used a little Perl script instead of all these strchr() calls.
+
+#include "kraken_headers.hpp"
+#include "quickfile.hpp"
+
+using namespace std;
+using namespace kraken;
+
+#define USER_SPECIFIED_FLAG "TAXID"
+
+map<string, uint64_t> user_specified_taxids;
+map<uint64_t, set<string> > requests;
+uint64_t request_count = 0;
+
+void fill_request_map(char *filename);
+void report_taxo_numbers(char *filename);
+
+int main(int argc, char **argv) {
+ if (argc < 3) {
+ cerr << "Usage: make_seqid_to_taxid_map <gi to taxid map> <gi to seqid list>"
+ << endl;
+ return 1;
+ }
+ char *map_filename = argv[1];
+ char *list_filename = argv[2];
+ fill_request_map(list_filename);
+ report_taxo_numbers(map_filename);
+
+ return 0;
+}
+
+void report_taxo_numbers(char *filename) {
+ string file_str = filename;
+ QuickFile file(file_str);
+ char *fptr, *fptr_start;
+ fptr_start = fptr = file.ptr();
+ size_t file_size = file.size();
+
+ // Line format: <gi num><tab><taxon ID>
+ while (request_count > 0 && (size_t)(fptr - fptr_start) < file_size) {
+ char *nl_ptr = strchr(fptr, '\n');
+ uint64_t gi = atoll(fptr);
+
+ if (requests.count(gi) > 0) {
+ char *tab_ptr = strchr(fptr, '\t');
+ set<string>::iterator it;
+ // Output line format: <sequence ID><tab><taxon ID>
+ for (it = requests[gi].begin(); it != requests[gi].end(); it++) {
+ cout << *it << "\t";
+ cout.write(tab_ptr + 1, nl_ptr - tab_ptr);
+ request_count--;
+ }
+ }
+
+ fptr = nl_ptr + 1;
+ }
+ file.close_file();
+
+ // Same as before - just doing the user specified sequences now
+ // Output line format: <sequence ID><tab><taxon ID>
+ map<string, uint64_t>::iterator mit = user_specified_taxids.begin();
+ while (mit != user_specified_taxids.end()) {
+ cout << mit->first << "\t" << mit->second << endl;
+ mit++;
+ }
+}
+
+void fill_request_map(char *filename) {
+ string file_str = filename;
+ QuickFile file(file_str);
+ char *fptr, *fptr_start;
+ fptr_start = fptr = file.ptr();
+ size_t file_size = file.size();
+
+ // Line format: <gi num><tab><sequence ID>
+ // OR: TAXID<tab><taxonomy ID><tab><sequence ID> (user spec'ed)
+ while ((size_t)(fptr - fptr_start) < file_size) {
+ char *nl_ptr = strchr(fptr, '\n');
+ char *sep_ptr = strchr(fptr, '\t');
+ if (strncmp(fptr, USER_SPECIFIED_FLAG, strlen(USER_SPECIFIED_FLAG)) == 0) {
+ char *sep_ptr = strchr(fptr, '\t');
+ uint64_t taxid = atoll(sep_ptr + 1);
+ sep_ptr = strchr(sep_ptr + 1, '\t');
+ string seqid(sep_ptr + 1, nl_ptr - sep_ptr - 1);
+ user_specified_taxids[seqid] = taxid;
+ }
+ else {
+ uint64_t gi = atoll(fptr);
+ requests[gi].insert(string(sep_ptr + 1, nl_ptr - sep_ptr - 1));
+ request_count++;
+ }
+ fptr = nl_ptr + 1;
+ }
+
+ file.close_file();
+}
diff --git a/src/quickfile.cpp b/src/quickfile.cpp
new file mode 100644
index 0000000..ddabe9a
--- /dev/null
+++ b/src/quickfile.cpp
@@ -0,0 +1,132 @@
+/*
+ * Copyright 2013-2015, Derrick Wood <dwood at cs.jhu.edu>
+ *
+ * This file is part of the Kraken taxonomic sequence classification system.
+ *
+ * Kraken is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Kraken is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kraken. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "kraken_headers.hpp"
+#include "quickfile.hpp"
+
+using std::string;
+
+namespace kraken {
+
+QuickFile::QuickFile() {
+ valid = false;
+ fptr = NULL;
+ filesize = 0;
+ fd = -1;
+}
+
+QuickFile::QuickFile(string filename_str, string mode, size_t size) {
+ open_file(filename_str, mode, size);
+}
+
+void QuickFile::open_file(string filename_str, string mode, size_t size) {
+ const char *filename = filename_str.c_str();
+ int o_flags = mode == "w"
+ ? O_RDWR | O_CREAT | O_TRUNC
+ : mode == "r" ? O_RDONLY : O_RDWR;
+ int m_flags = mode == "r" ? MAP_PRIVATE : MAP_SHARED;
+
+ fd = open(filename, o_flags, 0666);
+ // Second try for R/W if failure was due to non-existence
+ if (fd < 0 && mode == "rw" && errno == ENOENT) {
+ o_flags |= O_CREAT;
+ fd = open(filename, o_flags, 0666);
+ }
+ if (fd < 0)
+ err(EX_OSERR, "unable to open %s", filename);
+
+ if (o_flags & O_CREAT) {
+ if (lseek(fd, size - 1, SEEK_SET) < 0)
+ err(EX_OSERR, "unable to lseek (%s)", filename);
+ if (write(fd, "", 1) < 0)
+ err(EX_OSERR, "write error (%s)", filename);
+ filesize = size;
+ }
+ else {
+ struct stat sb;
+ if (fstat(fd, &sb) < 0)
+ err(EX_OSERR, "unable to fstat %s", filename);
+ filesize = sb.st_size;
+ }
+
+ fptr = (char *)mmap(0, filesize, PROT_READ | PROT_WRITE, m_flags, fd, 0);
+ if (fptr == MAP_FAILED)
+ err(EX_OSERR, "unable to mmap %s", filename);
+ valid = true;
+}
+
+void QuickFile::load_file() {
+ int thread_ct = 1;
+ int thread = 0;
+ #ifdef _OPENMP
+ int old_thread_ct = omp_get_max_threads();
+ if (old_thread_ct > 4)
+ omp_set_num_threads(4);
+ thread_ct = omp_get_max_threads();
+ #endif
+
+ size_t page_size = getpagesize();
+ char buf[thread_ct][page_size];
+
+ #pragma omp parallel
+ {
+ #ifdef _OPENMP
+ thread = omp_get_thread_num();
+ #endif
+
+ #pragma omp for schedule(dynamic)
+ for (size_t pos = 0; pos < filesize; pos += page_size) {
+ size_t this_page_size = filesize - pos;
+ if (this_page_size > page_size)
+ this_page_size = page_size;
+ memcpy(buf[thread], fptr + pos, this_page_size);
+ }
+ }
+
+ #ifdef _OPENMP
+ omp_set_num_threads(old_thread_ct);
+ #endif
+}
+
+char * QuickFile::ptr() {
+ return valid ? fptr : NULL;
+}
+
+size_t QuickFile::size() {
+ return valid ? filesize : 0;
+}
+
+QuickFile::~QuickFile() {
+ close_file();
+}
+
+void QuickFile::sync_file() {
+ msync(fptr, filesize, MS_SYNC);
+}
+
+void QuickFile::close_file() {
+ if (! valid)
+ return;
+ sync_file();
+ munmap(fptr, filesize);
+ close(fd);
+ valid = false;
+}
+
+} // namespace
diff --git a/src/quickfile.hpp b/src/quickfile.hpp
new file mode 100644
index 0000000..5533580
--- /dev/null
+++ b/src/quickfile.hpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2013-2015, Derrick Wood <dwood at cs.jhu.edu>
+ *
+ * This file is part of the Kraken taxonomic sequence classification system.
+ *
+ * Kraken is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Kraken is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kraken. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef QUICKFILE_HPP
+#define QUICKFILE_HPP
+
+#include "kraken_headers.hpp"
+
+namespace kraken {
+ class QuickFile {
+ public:
+
+ QuickFile();
+ QuickFile(std::string filename, std::string mode="r", size_t size=0);
+ ~QuickFile();
+ void open_file(std::string filename, std::string mode="r", size_t size=0);
+ char *ptr();
+ size_t size();
+ void load_file();
+ void sync_file();
+ void close_file();
+
+ protected:
+
+ bool valid;
+ int fd;
+ char *fptr;
+ size_t filesize;
+ };
+}
+
+#endif
diff --git a/src/seqreader.cpp b/src/seqreader.cpp
new file mode 100644
index 0000000..78c1442
--- /dev/null
+++ b/src/seqreader.cpp
@@ -0,0 +1,136 @@
+/*
+ * Copyright 2013-2015, Derrick Wood <dwood at cs.jhu.edu>
+ *
+ * This file is part of the Kraken taxonomic sequence classification system.
+ *
+ * Kraken is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Kraken is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kraken. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "kraken_headers.hpp"
+#include "seqreader.hpp"
+
+using namespace std;
+
+namespace kraken {
+ FastaReader::FastaReader(string filename) {
+ file.open(filename.c_str());
+ if (file.rdstate() & ifstream::failbit) {
+ err(EX_NOINPUT, "can't open %s", filename.c_str());
+ }
+ valid = true;
+ }
+
+ DNASequence FastaReader::next_sequence() {
+ DNASequence dna;
+
+ if (! valid || ! file.good()) {
+ valid = false;
+ return dna;
+ }
+ string line;
+
+ if (linebuffer.empty()) {
+ getline(file, line);
+ }
+ else {
+ line = linebuffer;
+ linebuffer.clear();
+ }
+
+ if (line[0] != '>') {
+ warnx("malformed fasta file - expected header char > not found");
+ valid = false;
+ return dna;
+ }
+ dna.header_line = line.substr(1);
+ istringstream seq_id(dna.header_line);
+ seq_id >> dna.id;
+
+ ostringstream seq_ss;
+
+ while (file.good()) {
+ getline(file, line);
+ if (line[0] == '>') {
+ linebuffer = line;
+ break;
+ }
+ else {
+ seq_ss << line;
+ }
+ }
+ dna.seq = seq_ss.str();
+
+ if (dna.seq.empty()) {
+ warnx("malformed fasta file - zero-length record (%s)", dna.id.c_str());
+ valid = false;
+ return dna;
+ }
+
+ return dna;
+ }
+
+ bool FastaReader::is_valid() {
+ return valid;
+ }
+
+ FastqReader::FastqReader(string filename) {
+ file.open(filename.c_str());
+ if (file.rdstate() & ifstream::failbit) {
+ err(EX_NOINPUT, "can't open %s", filename.c_str());
+ }
+ valid = true;
+ }
+
+ DNASequence FastqReader::next_sequence() {
+ DNASequence dna;
+
+ if (! valid || ! file.good()) {
+ valid = false;
+ return dna;
+ }
+
+ string line;
+ getline(file, line);
+ if (line.empty()) {
+ valid = false; // Sometimes FASTQ files have empty last lines
+ return dna;
+ }
+ if (line[0] != '@') {
+ if (line[0] != '\r')
+ warnx("malformed fastq file - sequence header (%s)", line.c_str());
+ valid = false;
+ return dna;
+ }
+ dna.header_line = line.substr(1);
+ istringstream line_ss(dna.header_line);
+
+ line_ss >> dna.id;
+ getline(file, dna.seq);
+
+ getline(file, line);
+ if (line.empty() || line[0] != '+') {
+ if (line[0] != '\r')
+ warnx("malformed fastq file - quality header (%s)", line.c_str());
+ valid = false;
+ return dna;
+ }
+ getline(file, dna.quals);
+
+ return dna;
+ }
+
+ bool FastqReader::is_valid() {
+ return valid;
+ }
+} // namespace
diff --git a/src/seqreader.hpp b/src/seqreader.hpp
new file mode 100644
index 0000000..40c1b77
--- /dev/null
+++ b/src/seqreader.hpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright 2013-2015, Derrick Wood <dwood at cs.jhu.edu>
+ *
+ * This file is part of the Kraken taxonomic sequence classification system.
+ *
+ * Kraken is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Kraken is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kraken. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef SEQREADER_HPP
+#define SEQREADER_HPP
+
+#include "kraken_headers.hpp"
+
+namespace kraken {
+ typedef struct {
+ std::string id;
+ std::string header_line; // id + optional description
+ std::string seq;
+ std::string quals;
+ } DNASequence;
+
+ class DNASequenceReader {
+ public:
+ virtual DNASequence next_sequence() = 0;
+ virtual bool is_valid() = 0;
+ virtual ~DNASequenceReader() {}
+ };
+
+ class FastaReader : public DNASequenceReader {
+ public:
+ FastaReader(std::string filename);
+ DNASequence next_sequence();
+ bool is_valid();
+
+ private:
+ std::ifstream file;
+ std::string linebuffer;
+ bool valid;
+ };
+
+ class FastqReader : public DNASequenceReader {
+ public:
+ FastqReader(std::string filename);
+ DNASequence next_sequence();
+ bool is_valid();
+
+ private:
+ std::ifstream file;
+ bool valid;
+ };
+}
+
+#endif
diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp
new file mode 100644
index 0000000..24bf0f3
--- /dev/null
+++ b/src/set_lcas.cpp
@@ -0,0 +1,265 @@
+/*
+ * Copyright 2013-2015, Derrick Wood <dwood at cs.jhu.edu>
+ *
+ * This file is part of the Kraken taxonomic sequence classification system.
+ *
+ * Kraken is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Kraken is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kraken. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "kraken_headers.hpp"
+#include "quickfile.hpp"
+#include "krakendb.hpp"
+#include "krakenutil.hpp"
+#include "seqreader.hpp"
+
+#define SKIP_LEN 50000
+
+using namespace std;
+using namespace kraken;
+
+void parse_command_line(int argc, char **argv);
+void usage(int exit_code=EX_USAGE);
+void process_files();
+void process_single_file();
+void process_file(string filename, uint32_t taxid);
+void set_lcas(uint32_t taxid, string &seq, size_t start, size_t finish);
+
+int Num_threads = 1;
+string DB_filename, Index_filename, Nodes_filename,
+ File_to_taxon_map_filename,
+ ID_to_taxon_map_filename, Multi_fasta_filename;
+bool Allow_extra_kmers = false;
+bool Operate_in_RAM = false;
+bool One_FASTA_file = false;
+map<uint32_t, uint32_t> Parent_map;
+map<string, uint32_t> ID_to_taxon_map;
+KrakenDB Database;
+
+int main(int argc, char **argv) {
+ #ifdef _OPENMP
+ omp_set_num_threads(1);
+ #endif
+
+ parse_command_line(argc, argv);
+ Parent_map = build_parent_map(Nodes_filename);
+
+ QuickFile db_file(DB_filename, "rw");
+ Database = KrakenDB(db_file.ptr());
+ KmerScanner::set_k(Database.get_k());
+
+ char *temp_ptr = NULL;
+ size_t db_file_size = db_file.size();
+ if (Operate_in_RAM) {
+ db_file.close_file();
+ temp_ptr = new char[ db_file_size ];
+ ifstream ifs(DB_filename.c_str(), ifstream::binary);
+ ifs.read(temp_ptr, db_file_size);
+ ifs.close();
+ Database = KrakenDB(temp_ptr);
+ }
+
+ QuickFile idx_file(Index_filename);
+ KrakenDBIndex db_index(idx_file.ptr());
+ Database.set_index(&db_index);
+
+ if (One_FASTA_file)
+ process_single_file();
+ else
+ process_files();
+
+ if (Operate_in_RAM) {
+ ofstream ofs(DB_filename.c_str(), ofstream::binary);
+ ofs.write(temp_ptr, db_file_size);
+ ofs.close();
+ delete temp_ptr;
+ }
+
+ return 0;
+}
+
+void process_single_file() {
+ ifstream map_file(ID_to_taxon_map_filename.c_str());
+ if (map_file.rdstate() & ifstream::failbit) {
+ err(EX_NOINPUT, "can't open %s", ID_to_taxon_map_filename.c_str());
+ }
+ string line;
+ while (map_file.good()) {
+ getline(map_file, line);
+ if (line.empty())
+ break;
+ string seq_id;
+ uint32_t taxid;
+ istringstream iss(line);
+ iss >> seq_id;
+ iss >> taxid;
+ ID_to_taxon_map[seq_id] = taxid;
+ }
+
+ FastaReader reader(Multi_fasta_filename);
+ DNASequence dna;
+ uint32_t seqs_processed = 0;
+
+ while (reader.is_valid()) {
+ dna = reader.next_sequence();
+ if (! reader.is_valid())
+ break;
+ uint32_t taxid = ID_to_taxon_map[dna.id];
+ if (taxid) {
+ #pragma omp parallel for schedule(dynamic)
+ for (size_t i = 0; i < dna.seq.size(); i += SKIP_LEN)
+ set_lcas(taxid, dna.seq, i, i + SKIP_LEN + Database.get_k() - 1);
+ }
+ cerr << "\rProcessed " << ++seqs_processed << " sequences";
+ }
+ cerr << "\r ";
+ cerr << "\rFinished processing " << seqs_processed << " sequences" << endl;
+}
+
+void process_files() {
+ ifstream map_file(File_to_taxon_map_filename.c_str());
+ if (map_file.rdstate() & ifstream::failbit) {
+ err(EX_NOINPUT, "can't open %s", File_to_taxon_map_filename.c_str());
+ }
+ string line;
+ uint32_t seqs_processed = 0;
+
+ while (map_file.good()) {
+ getline(map_file, line);
+ if (line.empty())
+ break;
+ string filename;
+ uint32_t taxid;
+ istringstream iss(line);
+ iss >> filename;
+ iss >> taxid;
+ process_file(filename, taxid);
+ cerr << "\rProcessed " << ++seqs_processed << " sequences";
+ }
+ cerr << "\r ";
+ cerr << "\rFinished processing " << seqs_processed << " sequences" << endl;
+}
+
+void process_file(string filename, uint32_t taxid) {
+ FastaReader reader(filename);
+ DNASequence dna;
+
+ // For the purposes of this program, we assume these files are
+ // single-fasta files.
+ dna = reader.next_sequence();
+
+ #pragma omp parallel for schedule(dynamic)
+ for (size_t i = 0; i < dna.seq.size(); i += SKIP_LEN)
+ set_lcas(taxid, dna.seq, i, i + SKIP_LEN + Database.get_k() - 1);
+}
+
+void set_lcas(uint32_t taxid, string &seq, size_t start, size_t finish) {
+ KmerScanner scanner(seq, start, finish);
+ uint64_t *kmer_ptr;
+ uint32_t *val_ptr;
+
+ while ((kmer_ptr = scanner.next_kmer()) != NULL) {
+ if (scanner.ambig_kmer())
+ continue;
+ val_ptr = Database.kmer_query(
+ Database.canonical_representation(*kmer_ptr)
+ );
+ if (val_ptr == NULL) {
+ if (! Allow_extra_kmers)
+ errx(EX_DATAERR, "kmer found in sequence that is not in database");
+ else
+ continue;
+ }
+ *val_ptr = lca(Parent_map, taxid, *val_ptr);
+ }
+}
+
+void parse_command_line(int argc, char **argv) {
+ int opt;
+ int sig;
+
+ if (argc > 1 && strcmp(argv[1], "-h") == 0)
+ usage(0);
+ while ((opt = getopt(argc, argv, "f:d:i:t:n:m:F:xM")) != -1) {
+ switch (opt) {
+ case 'f' :
+ File_to_taxon_map_filename = optarg;
+ break;
+ case 'd' :
+ DB_filename = optarg;
+ break;
+ case 'i' :
+ Index_filename = optarg;
+ break;
+ case 'F' :
+ Multi_fasta_filename = optarg;
+ break;
+ case 'm' :
+ ID_to_taxon_map_filename = optarg;
+ break;
+ case 't' :
+ sig = atoi(optarg);
+ if (sig <= 0)
+ errx(EX_USAGE, "can't use nonpositive thread count");
+ #ifdef _OPENMP
+ Num_threads = sig;
+ omp_set_num_threads(Num_threads);
+ #endif
+ break;
+ case 'n' :
+ Nodes_filename = optarg;
+ break;
+ case 'x' :
+ Allow_extra_kmers = true;
+ break;
+ case 'M' :
+ Operate_in_RAM = true;
+ break;
+ default:
+ usage();
+ break;
+ }
+ }
+
+ if (DB_filename.empty() || Index_filename.empty() ||
+ Nodes_filename.empty())
+ usage();
+ if (File_to_taxon_map_filename.empty() &&
+ (Multi_fasta_filename.empty() || ID_to_taxon_map_filename.empty()))
+ usage();
+
+ if (! File_to_taxon_map_filename.empty())
+ One_FASTA_file = false;
+ else
+ One_FASTA_file = true;
+}
+
+void usage(int exit_code) {
+ cerr << "Usage: set_lcas [options]" << endl
+ << endl
+ << "Options: (*mandatory)" << endl
+ << "* -d filename Kraken DB filename" << endl
+ << "* -i filename Kraken DB index filename" << endl
+ << "* -n filename NCBI Taxonomy nodes file" << endl
+ << " -t # Number of threads" << endl
+ << " -M Copy DB to RAM during operation" << endl
+ << " -x K-mers not found in DB do not cause errors" << endl
+ << " -f filename File to taxon map" << endl
+ << " -F filename Multi-FASTA file with sequence data" << endl
+ << " -m filename Sequence ID to taxon map" << endl
+ << " -h Print this message" << endl
+ << endl
+ << "-F and -m must be specified together. If -f is given, "
+ << "-F/-m are ignored." << endl;
+ exit(exit_code);
+}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/kraken.git
More information about the debian-med-commit
mailing list