[med-svn] [lambda-align] 01/15: Imported Upstream version 0.9.3+ds
Sascha Steinbiss
satta at debian.org
Fri Aug 19 16:10:04 UTC 2016
This is an automated email from the git hooks/post-receive script.
satta pushed a commit to branch master
in repository lambda-align.
commit 6eb7888e04d2e2d3dfdc6848c43936f9407f751a
Author: Sascha Steinbiss <sascha at steinbiss.name>
Date: Sun Mar 13 13:11:41 2016 +0000
Imported Upstream version 0.9.3+ds
---
.travis.yml | 64 ++
CMakeLists.txt | 45 +
COPYING.rst | 27 +
INFO | 11 +
LICENSE-GPL3.rst | 704 ++++++++++++++++
README.rst | 78 ++
src/CMakeLists.txt | 183 +++++
src/alph.hpp | 237 ++++++
src/holders.hpp | 548 +++++++++++++
src/lambda.cpp | 563 +++++++++++++
src/lambda.hpp | 1495 ++++++++++++++++++++++++++++++++++
src/lambda_indexer.cpp | 221 +++++
src/lambda_indexer.hpp | 554 +++++++++++++
src/lambda_indexer_misc.hpp | 149 ++++
src/match.hpp | 293 +++++++
src/misc.hpp | 505 ++++++++++++
src/options.hpp | 1378 +++++++++++++++++++++++++++++++
src/output.hpp | 578 +++++++++++++
src/radix_inplace.h | 497 +++++++++++
tests/CMakeLists.txt | 28 +
tests/db_nucl.fasta.gz | Bin 0 -> 122039 bytes
tests/db_nucl_fm.md5sums.gz | Bin 0 -> 600 bytes
tests/db_nucl_sa.md5sums.gz | Bin 0 -> 293 bytes
tests/db_prot.fasta.gz | Bin 0 -> 64761 bytes
tests/db_prot_fm.md5sums.gz | Bin 0 -> 701 bytes
tests/db_prot_sa.md5sums.gz | Bin 0 -> 300 bytes
tests/db_trans_fm.md5sums.gz | Bin 0 -> 729 bytes
tests/db_trans_sa.md5sums.gz | Bin 0 -> 336 bytes
tests/maintests.sh | 83 ++
tests/queries_nucl.fasta.gz | Bin 0 -> 33637 bytes
tests/queries_prot.fasta.gz | Bin 0 -> 23316 bytes
tests/search_test_outfile.md5sums.gz | Bin 0 -> 1082 bytes
32 files changed, 8241 insertions(+)
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..93578e5
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,64 @@
+sudo: required
+dist: trusty
+language: cpp
+matrix:
+ include:
+ - os: linux
+ compiler: gcc-5
+ addons:
+ apt:
+ sources: ['ubuntu-toolchain-r-test']
+ packages: ['g++-5', 'cmake', 'zlib1g-dev', 'libbz2-dev']
+ install: ['export CXX="g++-5" CC="gcc-5"' ]
+ before_script:
+ - sudo dd if=/dev/zero of=/swapfile bs=64M count=32 # cheat us some extra MEM
+ - sudo mkswap /swapfile
+ - sudo swapon /swapfile
+ after_script:
+ - sudo swapoff /swapfile
+ - sudo rm /swapfile
+
+ - os: linux
+ compiler: gcc-4.9
+ addons:
+ apt:
+ sources: ['ubuntu-toolchain-r-test']
+ packages: ['g++-4.9', 'cmake', 'zlib1g-dev', 'libbz2-dev']
+ install: ['export CXX="g++-4.9" CC="gcc-4.9"' ]
+ before_script:
+ - sudo dd if=/dev/zero of=/swapfile bs=64M count=32 # cheat us some extra MEM
+ - sudo mkswap /swapfile
+ - sudo swapon /swapfile
+ after_script:
+ - sudo swapoff /swapfile
+ - sudo rm /swapfile
+
+ - os: linux
+ compiler: clang
+ addons:
+ apt:
+ sources: ['ubuntu-toolchain-r-test']
+ packages: ['clang', 'g++-4.9', 'cmake', 'zlib1g-dev', 'libbz2-dev'] # g++ required for newer libstdc++
+
+ #- os: osx
+ #compiler: clang-3.6
+ #before_install:
+ #- sudo brew update
+ #- sudo brew tap homebrew/versions
+ #- sudo brew install llvm36
+ #install: ['export CXX="clang++-3.6" CC="clang-3.6"' ]
+
+ #- os: osx
+ #compiler: clang-3.7
+ #before_install:
+ #- sudo brew update
+ #- sudo brew tap homebrew/versions
+ #- sudo brew install llvm37
+ #install: ['export CXX="clang++-3.7" CC="clang-3.7"' ]
+
+script:
+ - mkdir -p build && cd build
+ - cmake ..
+ - make lambda_indexer
+ - travis_wait make lambda # need to prefix with travis_wait because it might take > 10min
+ - ctest .
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..afe4c1c
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,45 @@
+# ===========================================================================
+# Lambda
+# ===========================================================================
+
+# ----------------------------------------------------------------------------
+# Load SeqAn build system
+# ----------------------------------------------------------------------------
+
+project (seqan)
+cmake_minimum_required (VERSION 2.8.2)
+set (CMAKE_INCLUDE_PATH
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/seqan/include
+ ${CMAKE_INCLUDE_PATH})
+set (CMAKE_MODULE_PATH
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/seqan/util/cmake/FindTBB
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/seqan/util/cmake
+ ${CMAKE_MODULE_PATH})
+include (SeqAnContribs)
+
+# Build in release mode by default
+if (CMAKE_BUILD_TYPE STREQUAL "")
+ set(CMAKE_BUILD_TYPE "Release")
+endif()
+
+set (SEQAN_USE_SEQAN_BUILD_SYSTEM TRUE CACHE INTERNAL "Use SeqAn build system." FORCE)
+include (SeqAnBuildSystem)
+set (SEQAN_ROOT_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/include/seqan/" CACHE INTERNAL "Root source directory." FORCE)
+# include(include/seqan/CMakeLists.txt)
+seqan_build_system_init ()
+seqan_get_repository_info ()
+seqan_setup_library ()
+include (package)
+include (SeqAnCtdSetup)
+
+# ----------------------------------------------------------------------------
+# Add Lambda targets
+# ----------------------------------------------------------------------------
+
+add_subdirectory(src)
+
+# ----------------------------------------------------------------------------
+# Add Tests
+# ----------------------------------------------------------------------------
+
+add_subdirectory(tests)
\ No newline at end of file
diff --git a/COPYING.rst b/COPYING.rst
new file mode 100644
index 0000000..a1fe1fc
--- /dev/null
+++ b/COPYING.rst
@@ -0,0 +1,27 @@
+lambda copyright
+================
+Copyright (c) 2013-2015, Hannes Hauswedell, FU Berlin.
+All rights reserved.
+
+Lambda is *free software*: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+Lambda is distributed in the hope that it will be useful,
+but **without any warranty**; without even the implied warranty of
+**merchantability** or **fitness for a particular purpose**. See the
+GNU General Public License for more details.
+
+See the file `LICENSE-GPL3.rst <./LICENSE-GPL3.rst>`__ or
+http://www.gnu.org/licenses/ for a full text of the license and the
+rights and obligations implied.
+
+SeqAn copyright
+===============
+Lambda contains code of the SeqAn project in the folder
+`include/seqan <./include/seqan>`__. See
+`include/seqan/LICENSE <./include/seqan/LICENSE>`__ for the terms.
+Please note that due to technical reasons the repository contains
+other SeqAn applications, as well, that might be distributed under
+different terms.
\ No newline at end of file
diff --git a/INFO b/INFO
new file mode 100644
index 0000000..4e6dccb
--- /dev/null
+++ b/INFO
@@ -0,0 +1,11 @@
+Name: lambda
+Author: Hannes Hauswedell <hannes.hauswedell at molgen.mpg.de>
+Maintainer: Hannes Hauswedell <hannes.hauswedell at molgen.mpg.de>
+License: GPL v3
+Copyright: 2013-2014, FU Berlin
+Status: under development
+Description: NCBI-Blast compatible local aligner that is very fast.
+ LAMBDA: the Local Aligner for Massive Biological DatA is a fast local
+ aligner optimized for NGS data and protein searches, but it also works on
+ DNA-DNA searches. It is faster than most of its competitors while still being
+ rather sensitive.
diff --git a/LICENSE-GPL3.rst b/LICENSE-GPL3.rst
new file mode 100644
index 0000000..b2cade9
--- /dev/null
+++ b/LICENSE-GPL3.rst
@@ -0,0 +1,704 @@
+GNU GENERAL PUBLIC LICENSE
+==========================
+
+Version 3, 29 June 2007
+
+Copyright (C) 2007 `Free Software Foundation, Inc. <http://fsf.org/>`_
+
+Everyone is permitted to copy and distribute verbatim copies of this
+license document, but changing it is not allowed.
+
+Preamble
+--------
+
+The GNU General Public License is a free, copyleft license for software
+and other kinds of works.
+
+The licenses for most software and other practical works are designed to
+take away your freedom to share and change the works. By contrast, the
+GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users. We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors. You can apply it to
+your programs, too.
+
+When we speak of free software, we are referring to freedom, not price.
+Our General Public Licenses are designed to make sure that you have the
+freedom to distribute copies of free software (and charge for them if
+you wish), that you receive source code or can get it if you want it,
+that you can change the software or use pieces of it in new free
+programs, and that you know you can do these things.
+
+To protect your rights, we need to prevent others from denying you these
+rights or asking you to surrender the rights. Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+For example, if you distribute copies of such a program, whether gratis
+or for a fee, you must pass on to the recipients the same freedoms that
+you received. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+Developers that use the GNU GPL protect your rights with two steps:
+
+1. assert copyright on the software, and
+2. offer you this License giving you legal permission to copy,
+ distribute and/or modify it.
+
+For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software. For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so. This is fundamentally incompatible with the aim of protecting
+users' freedom to change the software. The systematic pattern of such
+abuse occurs in the area of products for individuals to use, which is
+precisely where it is most unacceptable. Therefore, we have designed
+this version of the GPL to prohibit the practice for those products. If
+such problems arise substantially in other domains, we stand ready to
+extend this provision to those domains in future versions of the GPL, as
+needed to protect the freedom of users.
+
+Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary. To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+The precise terms and conditions for copying, distribution and
+modification follow.
+
+TERMS AND CONDITIONS
+--------------------
+
+0. Definitions.
+~~~~~~~~~~~~~~~
+
+*This License* refers to version 3 of the GNU General Public License.
+
+*Copyright* also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+*The Program* refers to any copyrightable work licensed under this
+License. Each licensee is addressed as *you*. *Licensees* and
+*recipients* may be individuals or organizations.
+
+To *modify* a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy. The resulting work is called a *modified version* of the
+earlier work or a work *based on* the earlier work.
+
+A *covered work* means either the unmodified Program or a work based on
+the Program.
+
+To *propagate* a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy. Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+To *convey* a work means any kind of propagation that enables other
+parties to make or receive copies. Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+An interactive user interface displays *Appropriate Legal Notices* to
+the extent that it includes a convenient and prominently visible feature
+that
+
+1. displays an appropriate copyright notice, and
+2. tells the user that there is no warranty for the work (except to the
+ extent that warranties are provided), that licensees may convey the
+ work under this License, and how to view a copy of this License.
+
+If the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+1. Source Code.
+~~~~~~~~~~~~~~~
+
+The *source code* for a work means the preferred form of the work for
+making modifications to it. *Object code* means any non-source form of a
+work.
+
+A *Standard Interface* means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that is
+widely used among developers working in that language.
+
+The *System Libraries* of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that Major
+Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form. A *Major
+Component*, in this context, means a major essential component (kernel,
+window system, and so on) of the specific operating system (if any) on
+which the executable work runs, or a compiler used to produce the work,
+or an object code interpreter used to run it.
+
+The *Corresponding Source* for a work in object code form means all the
+source code needed to generate, install, and (for an executable work)
+run the object code and to modify the work, including scripts to control
+those activities. However, it does not include the work's System
+Libraries, or general-purpose tools or generally available free programs
+which are used unmodified in performing those activities but which are
+not part of the work. For example, Corresponding Source includes
+interface definition files associated with source files for the work,
+and the source code for shared libraries and dynamically linked
+subprograms that the work is specifically designed to require, such as
+by intimate data communication or control flow between those subprograms
+and other parts of the work.
+
+The Corresponding Source need not include anything that users can
+regenerate automatically from other parts of the Corresponding Source.
+
+The Corresponding Source for a work in source code form is that same
+work.
+
+2. Basic Permissions.
+~~~~~~~~~~~~~~~~~~~~~
+
+All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met. This License explicitly affirms your unlimited
+permission to run the unmodified Program. The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work. This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+You may make, run and propagate covered works that you do not convey,
+without conditions so long as your license otherwise remains in force.
+You may convey covered works to others for the sole purpose of having
+them make modifications exclusively for you, or provide you with
+facilities for running those works, provided that you comply with the
+terms of this License in conveying all material for which you do not
+control copyright. Those thus making or running the covered works for
+you must do so exclusively on your behalf, under your direction and
+control, on terms that prohibit them from making any copies of your
+copyrighted material outside their relationship with you.
+
+Conveying under any other circumstances is permitted solely under the
+conditions stated below. Sublicensing is not allowed; section 10 makes
+it unnecessary.
+
+3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article 11
+of the WIPO copyright treaty adopted on 20 December 1996, or similar
+laws prohibiting or restricting circumvention of such measures.
+
+When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to the
+covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+4. Conveying Verbatim Copies.
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice; keep
+intact all notices stating that this License and any non-permissive
+terms added in accord with section 7 apply to the code; keep intact all
+notices of the absence of any warranty; and give all recipients a copy
+of this License along with the Program.
+
+You may charge any price or no price for each copy that you convey, and
+you may offer support or warranty protection for a fee.
+
+5. Conveying Modified Source Versions.
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the terms
+of section 4, provided that you also meet all of these conditions:
+
+- a) The work must carry prominent notices stating that you modified it,
+and giving a relevant date. - b) The work must carry prominent notices
+stating that it is released under this License and any conditions added
+under section 7. This requirement modifies the requirement in section 4
+to *keep intact all notices*. - c) You must license the entire work, as
+a whole, under this License to anyone who comes into possession of a
+copy. This License will therefore apply, along with any applicable
+section 7 additional terms, to the whole of the work, and all its parts,
+regardless of how they are packaged. This License gives no permission to
+license the work in any other way, but it does not invalidate such
+permission if you have separately received it. - d) If the work has
+interactive user interfaces, each must display Appropriate Legal
+Notices; however, if the Program has interactive interfaces that do not
+display Appropriate Legal Notices, your work need not make them do so.
+
+A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work, and
+which are not combined with it such as to form a larger program, in or
+on a volume of a storage or distribution medium, is called an
+*aggregate* if the compilation and its resulting copyright are not used
+to limit the access or legal rights of the compilation's users beyond
+what the individual works permit. Inclusion of a covered work in an
+aggregate does not cause this License to apply to the other parts of the
+aggregate.
+
+6. Conveying Non-Source Forms.
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+You may convey a covered work in object code form under the terms of
+sections 4 and 5, provided that you also convey the machine-readable
+Corresponding Source under the terms of this License, in one of these
+ways:
+
+- a) Convey the object code in, or embodied in, a physical product
+(including a physical distribution medium), accompanied by the
+Corresponding Source fixed on a durable physical medium customarily used
+for software interchange. - b) Convey the object code in, or embodied
+in, a physical product (including a physical distribution medium),
+accompanied by a written offer, valid for at least three years and valid
+for as long as you offer spare parts or customer support for that
+product model, to give anyone who possesses the object code either
+ 1. a copy of the Corresponding Source for all the software in the
+product that is covered by this License, on a durable physical medium
+customarily used for software interchange, for a price no more than your
+reasonable cost of physically performing this conveying of source, or 2.
+access to copy the Corresponding Source from a network server at no
+charge.
+
+- c) Convey individual copies of the object code with a copy of the
+written offer to provide the Corresponding Source. This alternative is
+allowed only occasionally and noncommercially, and only if you received
+the object code with such an offer, in accord with subsection 6b. - d)
+Convey the object code by offering access from a designated place
+(gratis or for a charge), and offer equivalent access to the
+Corresponding Source in the same way through the same place at no
+further charge. You need not require recipients to copy the
+Corresponding Source along with the object code. If the place to copy
+the object code is a network server, the Corresponding Source may be on
+a different server operated by you or a third party) that supports
+equivalent copying facilities, provided you maintain clear directions
+next to the object code saying where to find the Corresponding Source.
+Regardless of what server hosts the Corresponding Source, you remain
+obligated to ensure that it is available for as long as needed to
+satisfy these requirements. - e) Convey the object code using
+peer-to-peer transmission, provided you inform other peers where the
+object code and Corresponding Source of the work are being offered to
+the general public at no charge under subsection 6d.
+
+A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be included
+in conveying the object code work.
+
+A *User Product* is either
+
+1. a *consumer product*, which means any tangible personal property
+ which is normally used for personal, family, or household purposes,
+ or
+2. anything designed or sold for incorporation into a dwelling.
+
+In determining whether a product is a consumer product, doubtful cases
+shall be resolved in favor of coverage. For a particular product
+received by a particular user, *normally used* refers to a typical or
+common use of that class of product, regardless of the status of the
+particular user or of the way in which the particular user actually
+uses, or expects or is expected to use, the product. A product is a
+consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+*Installation Information* for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product
+from a modified version of its Corresponding Source. The information
+must suffice to ensure that the continued functioning of the modified
+object code is in no case prevented or interfered with solely because
+modification has been made.
+
+If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied by
+the Installation Information. But this requirement does not apply if
+neither you nor any third party retains the ability to install modified
+object code on the User Product (for example, the work has been
+installed in ROM).
+
+The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed. Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+Corresponding Source conveyed, and Installation Information provided, in
+accord with this section must be in a format that is publicly documented
+(and with an implementation available to the public in source code
+form), and must require no special password or key for unpacking,
+reading or copying.
+
+7. Additional Terms.
+~~~~~~~~~~~~~~~~~~~~
+
+*Additional permissions* are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law. If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by this
+License without regard to the additional permissions.
+
+When you convey a copy of a covered work, you may at your option remove
+any additional permissions from that copy, or from any part of it.
+(Additional permissions may be written to require their own removal in
+certain cases when you modify the work.) You may place additional
+permissions on material, added by you to a covered work, for which you
+have or can give appropriate copyright permission.
+
+Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders
+of that material) supplement the terms of this License with terms:
+
+a. Disclaiming warranty or limiting liability differently from the terms
+ of sections 15 and 16 of this License; or
+b. Requiring preservation of specified reasonable legal notices or
+ author attributions in that material or in the Appropriate Legal
+ Notices displayed by works containing it; or
+c. Prohibiting misrepresentation of the origin of that material, or
+ requiring that modified versions of such material be marked in
+ reasonable ways as different from the original version; or
+d. Limiting the use for publicity purposes of names of licensors or
+ authors of the material; or
+e. Declining to grant rights under trademark law for use of some trade
+ names, trademarks, or service marks; or
+f. Requiring indemnification of licensors and authors of that material
+ by anyone who conveys the material (or modified versions of it) with
+ contractual assumptions of liability to the recipient, for any
+ liability that these contractual assumptions directly impose on those
+ licensors and authors.
+
+All other non-permissive additional terms are considered *further
+restrictions* within the meaning of section 10. If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term. If a license document contains a
+further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms of
+that license document, provided that the further restriction does not
+survive such relicensing or conveying.
+
+If you add terms to a covered work in accord with this section, you must
+place, in the relevant source files, a statement of the additional terms
+that apply to those files, or a notice indicating where to find the
+applicable terms.
+
+Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions; the above
+requirements apply either way.
+
+8. Termination.
+~~~~~~~~~~~~~~~
+
+You may not propagate or modify a covered work except as expressly
+provided under this License. Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+However, if you cease all violation of this License, then your license
+from a particular copyright holder is reinstated
+
+a. provisionally, unless and until the copyright holder explicitly and
+ finally terminates your license, and
+b. permanently, if the copyright holder fails to notify you of the
+ violation by some reasonable means prior to 60 days after the
+ cessation.
+
+Moreover, your license from a particular copyright holder is reinstated
+permanently if the copyright holder notifies you of the violation by
+some reasonable means, this is the first time you have received notice
+of violation of this License (for any work) from that copyright holder,
+and you cure the violation prior to 30 days after your receipt of the
+notice.
+
+Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License. If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+9. Acceptance Not Required for Having Copies.
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+You are not required to accept this License in order to receive or run a
+copy of the Program. Ancillary propagation of a covered work occurring
+solely as a consequence of using peer-to-peer transmission to receive a
+copy likewise does not require acceptance. However, nothing other than
+this License grants you permission to propagate or modify any covered
+work. These actions infringe copyright if you do not accept this
+License. Therefore, by modifying or propagating a covered work, you
+indicate your acceptance of this License to do so.
+
+10. Automatic Licensing of Downstream Recipients.
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License. You are not responsible
+for enforcing compliance by third parties with this License.
+
+An *entity transaction* is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations. If propagation of a covered work
+results from an entity transaction, each party to that transaction who
+receives a copy of the work also receives whatever licenses to the work
+the party's predecessor in interest had or could give under the previous
+paragraph, plus a right to possession of the Corresponding Source of the
+work from the predecessor in interest, if the predecessor has it or can
+get it with reasonable efforts.
+
+You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License. For example, you may not
+impose a license fee, royalty, or other charge for exercise of rights
+granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that any
+patent claim is infringed by making, using, selling, offering for sale,
+or importing the Program or any portion of it.
+
+11. Patents.
+~~~~~~~~~~~~
+
+A *contributor* is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based. The work
+thus licensed is called the contributor's *contributor version*.
+
+A contributor's *essential patent claims* are all patent claims owned or
+controlled by the contributor, whether already acquired or hereafter
+acquired, that would be infringed by some manner, permitted by this
+License, of making, using, or selling its contributor version, but do
+not include claims that would be infringed only as a consequence of
+further modification of the contributor version. For purposes of this
+definition, *control* includes the right to grant patent sublicenses in
+a manner consistent with the requirements of this License.
+
+Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to make,
+use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+In the following three paragraphs, a *patent license* is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement). To *grant* such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+If you convey a covered work, knowingly relying on a patent license, and
+the Corresponding Source of the work is not available for anyone to
+copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either
+
+1. cause the Corresponding Source to be so available, or
+2. arrange to deprive yourself of the benefit of the patent license for
+ this particular work, or
+3. arrange, in a manner consistent with the requirements of this
+ License, to extend the patent license to downstream recipients.
+
+*Knowingly relying* means you have actual knowledge that, but for the
+patent license, your conveying the covered work in a country, or your
+recipient's use of the covered work in a country, would infringe one or
+more identifiable patents in that country that you have reason to
+believe are valid.
+
+If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify or
+convey a specific copy of the covered work, then the patent license you
+grant is automatically extended to all recipients of the covered work
+and works based on it.
+
+A patent license is *discriminatory* if it does not include within the
+scope of its coverage, prohibits the exercise of, or is conditioned on
+the non-exercise of one or more of the rights that are specifically
+granted under this License. You may not convey a covered work if you are
+a party to an arrangement with a third party that is in the business of
+distributing software, under which you make payment to the third party
+based on the extent of your activity of conveying the work, and under
+which the third party grants, to any of the parties who would receive
+the covered work from you, a discriminatory patent license
+
+a. in connection with copies of the covered work conveyed by you (or
+ copies made from those copies), or
+b. primarily for and in connection with specific products or
+ compilations that contain the covered work, unless you entered into
+ that arrangement, or that patent license was granted, prior to 28
+ March 2007.
+
+Nothing in this License shall be construed as excluding or limiting any
+implied license or other defenses to infringement that may otherwise be
+available to you under applicable patent law.
+
+12. No Surrender of Others' Freedom.
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not convey it at all. For example, if you agree to terms that
+obligate you to collect a royalty for further conveying from those to
+whom you convey the Program, the only way you could satisfy both those
+terms and this License would be to refrain entirely from conveying the
+Program.
+
+13. Use with the GNU Affero General Public License.
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Notwithstanding any other provision of this License, you have permission
+to link or combine any covered work with a work licensed under version 3
+of the GNU Affero General Public License into a single combined work,
+and to convey the resulting work. The terms of this License will
+continue to apply to the part which is the covered work, but the special
+requirements of the GNU Affero General Public License, section 13,
+concerning interaction through a network will apply to the combination
+as such.
+
+14. Revised Versions of this License.
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies that a certain numbered version of the GNU General Public
+License *or any later version* applies to it, you have the option of
+following the terms and conditions either of that numbered version or of
+any later version published by the Free Software Foundation. If the
+Program does not specify a version number of the GNU General Public
+License, you may choose any version ever published by the Free Software
+Foundation.
+
+If the Program specifies that a proxy can decide which future versions
+of the GNU General Public License can be used, that proxy's public
+statement of acceptance of a version permanently authorizes you to
+choose that version for the Program.
+
+Later license versions may give you additional or different permissions.
+However, no additional obligations are imposed on any author or
+copyright holder as a result of your choosing to follow a later version.
+
+15. Disclaimer of Warranty.
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM *AS IS* WITHOUT
+WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF
+THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME
+THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+16. Limitation of Liability.
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR
+CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES
+ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT
+NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES
+SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE
+WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
+
+17. Interpretation of Sections 15 and 16.
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If the disclaimer of warranty and limitation of liability provided above
+cannot be given local legal effect according to their terms, reviewing
+courts shall apply local law that most closely approximates an absolute
+waiver of all civil liability in connection with the Program, unless a
+warranty or assumption of liability accompanies a copy of the Program in
+return for a fee.
+
+END OF TERMS AND CONDITIONS
+---------------------------
+
+How to Apply These Terms to Your New Programs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these
+terms.
+
+To do so, attach the following notices to the program. It is safest to
+attach them to the start of each source file to most effectively state
+the exclusion of warranty; and each file should have at least the
+*copyright* line and a pointer to where the full notice is found.
+
+::
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program does terminal interaction, make it output a short notice
+like this when it starts in an interactive mode:
+
+::
+
+ <program> Copyright (C) <year> <name of author>
+ This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands ``show w`` and ``show c`` should show the
+appropriate parts of the General Public License. Of course, your
+program's commands might be different; for a GUI interface, you would
+use an *about box*.
+
+You should also get your employer (if you work as a programmer) or
+school, if any, to sign a *copyright disclaimer* for the program, if
+necessary. For more information on this, and how to apply and follow the
+GNU GPL, see
+`http://www.gnu.org/licenses/ <http://www.gnu.org/licenses/>`_.
+
+The GNU General Public License does not permit incorporating your
+program into proprietary programs. If your program is a subroutine
+library, you may consider it more useful to permit linking proprietary
+applications with the library. If this is what you want to do, use the
+GNU Lesser General Public License instead of this License. But first,
+please read
+`http://www.gnu.org/philosophy/why-not-lgpl.html <http://www.gnu.org/philosophy/why-not-lgpl.html>`_.
diff --git a/README.rst b/README.rst
new file mode 100644
index 0000000..3aab8a8
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,78 @@
+Lambda: the Local Aligner for Massive Biological DatA
+-----------------------------------------------------
+
+.. image:: https://travis-ci.org/seqan/lambda.svg?branch=master
+ :alt: Travis CI build status
+ :target: https://travis-ci.org/seqan/lambda
+
+Lambda is a local aligner optimized for many query sequences and searches in protein space.
+It is compatible to BLAST, but much faster than BLAST and many other comparable tools.
+
+cite
+----------
+
+Please cite the following if you use Lambda anywhere in your academic work, also as part of pipelines
+or comparisons:
+
+*Lambda: the local aligner for massive biological data*;
+Hannes Hauswedell, Jochen Singer, Knut Reinert;
+`Bioinformatics 2014 30 (17): i349-i355 <http://bioinformatics.oxfordjournals.org/content/30/17/i349.abstract>`__;
+doi: 10.1093/bioinformatics/btu439
+
+download
+--------
+
+The latest version is available
+`here <https://github.com/seqan/lambda/releases>`__. Versions prior to 0.9.0 are available
+`here <https://github.com/h-2/seqan/releases>`__.
+
+Lambda is Free and open source software, so you can use it for any purpose, free of charge.
+However certain conditions apply when you (re-)distribute or modify Lambda, please respect the
+`license <./COPYING.rst>`__.
+
+You can also build lambda from source which will result in binaries optimized for your
+specific system (and thus faster). For instructions, please see the
+`wiki <https://github.com/seqan/lambda/wiki>`__.
+
+run
+---
+
+Optionally mask the database:
+
+::
+
+ % /path/to/segmasker -infmt fasta -in db.fasta -outfmt interval -out db.seg
+
+Run the indexer (or check the `wiki <https://github.com/seqan/lambda/wiki>`__ for pre-built indexes!):
+
+::
+
+ % bin/lambda_indexer -d db.fasta [-s db.seg]
+
+Run lambda:
+
+::
+
+ % bin/lambda -q query.fasta -d db.fasta
+
+*Please note that if you downloaded the binaries from the web-site, you might also have* ``bin/lambda-avx`` *which is
+measurably faster, so try that first!*
+
+For a list of options, see the help pages:
+
+::
+
+ % bin/lambda --help
+ % bin/lambda --full-help
+
+Or visit the Tuning-guide in the `wiki <https://github.com/seqan/lambda/wiki>`__.
+
+give feedback
+-------------
+
+Please report bugs to the `bug-tracker <https://github.com/seqan/lambda/issues>`__.
+
+Any other questions or feedback you can send to
+`Hannes Hauswedell <mailto:hannes.hauswedell@[molgen.mpg.de|fu-berlin.de]>`__.
+
+Thank you for using Lambda, we hope that it is useful to you!
\ No newline at end of file
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
new file mode 100644
index 0000000..3e42748
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,183 @@
+# ===========================================================================
+# SeqAn - The Library for Sequence Analysis
+# ===========================================================================
+# File: /sandbox/h4nn3s/apps/lambda/CMakeLists.txt
+#
+# CMakeLists.txt file for lambda.
+# ===========================================================================
+
+cmake_minimum_required (VERSION 2.8.2)
+project (lambda)
+message (STATUS "Configuring lambda")
+
+# ----------------------------------------------------------------------------
+# Dependencies
+# ----------------------------------------------------------------------------
+
+# Search SeqAn and select dependencies.
+set (SEQAN_FIND_DEPENDENCIES OpenMP ZLIB BZip2)
+find_package(SeqAn REQUIRED)
+find_package(CXX11 REQUIRED)
+find_package(CXX14)
+
+if (NOT CXX11_FOUND)
+ message (FATAL_ERROR " C++11 not found: Not building lambda.")
+ return ()
+endif (NOT CXX11_FOUND)
+
+# Warn if OpenMP was not found.
+if (NOT SEQAN_HAS_OPENMP)
+ message (WARNING "WARNING WARNING WARNING\nWARNING: OpenMP not found. Lambda will be built without multi-threading! "
+ "This is probably not what you want! Use GCC or a very, very recent version of Clang.\nWARNING WARNING WARNING")
+endif (NOT SEQAN_HAS_OPENMP)
+
+# Warn if Zlib was not found.
+if (NOT SEQAN_HAS_ZLIB)
+ message (WARNING "WARNING: Zlib not found. Building lambda without support for gzipped input and output (this includes support for .bam).")
+endif (NOT SEQAN_HAS_ZLIB)
+
+# Warn if BZip2 was not found.
+if (NOT SEQAN_HAS_BZIP2)
+ message (WARNING "WARNING: BZip2 not found. Building lambda without support for bzipped input and output.")
+endif (NOT SEQAN_HAS_BZIP2)
+
+if (COMPILER_IS_CLANG)
+ set (CXX11_CXX_FLAGS "${CXX11_CXX_FLAGS} -ftemplate-depth-1024")
+endif (COMPILER_IS_CLANG)
+
+if(CMAKE_COMPILER_IS_GNUCXX)
+ if (491 GREATER _GCC_VERSION)
+ message (FATAL_ERROR "Your GCC version is too old. Minimum version is GCC-4.9.1!")
+ return ()
+ endif(491 GREATER _GCC_VERSION)
+endif(CMAKE_COMPILER_IS_GNUCXX)
+
+# ----------------------------------------------------------------------------
+# App-Level Configuration
+# ----------------------------------------------------------------------------
+
+set (SEQAN_APP_VERSION "0.9.3")
+
+option (LAMBDA_FASTBUILD "Skip certain rarely used codepaths (speeds up build)." OFF)
+if (LAMBDA_FASTBUILD)
+ add_definitions (-DFASTBUILD=1)
+ message (STATUS "FASTBUILD set, not all command line options available.")
+endif (LAMBDA_FASTBUILD)
+
+option (LAMBDA_NATIVE_BUILD "Architecture-specific optimizations, i.e. g++ -march=native." ON)
+if (LAMBDA_NATIVE_BUILD)
+ add_definitions (-DLAMBDA_NATIVE_BUILD=1)
+ set (SEQAN_CXX_FLAGS "${SEQAN_CXX_FLAGS} -march=native")
+ if (COMPILER_IS_INTEL)
+ set (SEQAN_CXX_FLAGS "${SEQAN_CXX_FLAGS} -xHOST -ipo -no-prec-div -fp-model fast=2")
+ endif (COMPILER_IS_INTEL)
+ message (STATUS "NATIVE_BUILD set, binaries will be optimized for this CPU (and might not work on others).")
+ message (STATUS "To deactivate, pass -DLAMBDA_NATIVE_BUILD=0 to cmake.")
+endif (LAMBDA_NATIVE_BUILD)
+
+option (LAMBDA_STATIC_BUILD "Include all libraries in the binaries." OFF)
+if (LAMBDA_STATIC_BUILD)
+ add_definitions (-DLAMBDA_STATIC_BUILD=1)
+ set(CMAKE_FIND_LIBRARY_SUFFIXES ".a")
+ if (CMAKE_COMPILER_IS_GNUCXX)
+ set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libgcc -static-libstdc++")
+ endif (CMAKE_COMPILER_IS_GNUCXX)
+ # apple does not support fully static builds
+ if (NOT APPLE)
+ set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static")
+ else (NOT APPLE)
+ message (WARNING "WARNING: Builds on Mac are never fully static.")
+ endif (NOT APPLE)
+ message (STATUS "STATIC_BUILD set, binaries built with all libs included.")
+endif (LAMBDA_STATIC_BUILD)
+
+option (LAMBDA_MMAPPED_DB "Use mmapped access to the database." ON)
+if (LAMBDA_MMAPPED_DB)
+ add_definitions (-DLAMBDA_MMAPPED_DB=1)
+ message (STATUS "MMAPPED_DB set, using memory-mapped IO to access the database.")
+ message (STATUS "To deactivate, pass -DLAMBDA_MMAPPED_DB=0 to cmake.")
+endif (LAMBDA_MMAPPED_DB)
+
+# ----------------------------------------------------------------------------
+# Build Setup
+# ----------------------------------------------------------------------------
+
+# Enable global exception handler for all seqan apps.
+set (SEQAN_DEFINITIONS "${SEQAN_DEFINITIONS} -DSEQAN_GLOBAL_EXCEPTION_HANDLER")
+
+# Add include directories.
+include_directories (${SEQAN_INCLUDE_DIRS})
+
+# Add definitions set by find_package (SeqAn).
+add_definitions (${SEQAN_DEFINITIONS})
+
+# Add definitions set by the build system.
+add_definitions (-DSEQAN_VERSION_STRING="${SEQAN_VERSION_STRING}")
+add_definitions (-DSEQAN_REVISION="${SEQAN_REVISION}")
+add_definitions (-DSEQAN_DATE="${SEQAN_DATE}")
+add_definitions (-DSEQAN_APP_VERSION="${SEQAN_APP_VERSION}")
+add_definitions (-DCMAKE_BUILD_TYPE="${CMAKE_BUILD_TYPE}")
+
+# Update the list of file names below if you add source files to your application.
+add_executable (lambda lambda.cpp
+ lambda.hpp
+ misc.hpp
+ match.hpp
+ options.hpp
+ alph.hpp
+ holders.hpp
+ output.hpp)
+add_executable (lambda_indexer lambda_indexer.cpp
+ lambda_indexer.hpp
+ options.hpp
+ lambda_indexer_misc.hpp
+ radix_inplace.h)
+
+# Add dependencies found by find_package (SeqAn).
+target_link_libraries (lambda ${SEQAN_LIBRARIES})
+target_link_libraries (lambda_indexer ${SEQAN_LIBRARIES})
+
+# Add CXX flags found by find_package (SeqAn).
+set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SEQAN_CXX_FLAGS} ${CXX11_CXX_FLAGS}")
+if (NOT COMPILER_IS_INTEL)
+ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-vla")
+endif (NOT COMPILER_IS_INTEL)
+
+set(BUILD_SHARED_LIBS OFF)
+
+# ----------------------------------------------------------------------------
+# Installation
+# ----------------------------------------------------------------------------
+
+# Set variables for installing, depending on the selected build type.
+if (NOT SEQAN_PREFIX_SHARE_DOC)
+ seqan_setup_install_vars (lambda)
+endif (NOT SEQAN_PREFIX_SHARE_DOC)
+
+# Install lambda in ${PREFIX}/bin directory
+install (TARGETS lambda lambda_indexer
+ DESTINATION bin)
+
+# Install non-binary files for the package to "." for app builds and
+# ${PREFIX}/share/doc/lambda for SeqAn release builds.
+install (FILES ../LICENSE-GPL3.rst
+ ../README.rst
+ ../COPYING.rst
+ DESTINATION ${SEQAN_PREFIX_SHARE_DOC})
+# install (FILES example/fasta1.fa
+# example/fasta2.fa
+# DESTINATION ${SEQAN_PREFIX_SHARE_DOC}/example)
+
+# ----------------------------------------------------------------------------
+# CPack Install
+# ----------------------------------------------------------------------------
+
+if (SEQAN_BUILD_SYSTEM STREQUAL "APP:lambda")
+ set (CPACK_PACKAGE_NAME "lambda")
+ set (CPACK_PACKAGE_DESCRIPTION_SUMMARY "lambda")
+ set (CPACK_DEBIAN_PACKAGE_MAINTAINER "Your Name <your.email at example.net>")
+ set (CPACK_PACKAGE_VENDOR "Your Name <your.email at example.net>")
+
+ seqan_configure_cpack_app (lambda "lambda")
+endif (SEQAN_BUILD_SYSTEM STREQUAL "APP:lambda")
+
diff --git a/src/alph.hpp b/src/alph.hpp
new file mode 100644
index 0000000..e63b952
--- /dev/null
+++ b/src/alph.hpp
@@ -0,0 +1,237 @@
+// ==========================================================================
+// lambda
+// ==========================================================================
+// Copyright (c) 2013-2015, Hannes Hauswedell, FU Berlin
+// All rights reserved.
+//
+// This file is part of Lambda.
+//
+// Lambda is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// Lambda is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with Lambda. If not, see <http://www.gnu.org/licenses/>.*/
+// ==========================================================================
+// Author: Hannes Hauswedell <hannes.hauswedell @ fu-berlin.de>
+// ==========================================================================
+// alph.hpp: Functions and Metafunctions for Alphabet reduction
+// ==========================================================================
+
+#ifndef SEQAN_LAMBDA_ALPH_H_
+#define SEQAN_LAMBDA_ALPH_H_
+
+#include <seqan/basic.h>
+#include <seqan/sequence.h>
+
+// using namespace seqan;
+
+namespace seqan {
+
+// ----------------------------------------------------------------------------
+// Specialization AminoAcid10
+// ----------------------------------------------------------------------------
+
+/**
+.Spec.AminoAcid10:
+..cat:Alphabets
+..summary:Iupac code for amino acids.
+..general:Class.SimpleType
+..signature:AminoAcid10
+..remarks:
+...text:The @Metafunction.ValueSize@ of $AminoAcid10$ is 24.
+...text:The amino acids are enumerated from 0 to 15 in this order:
+...text:'A'=0, 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'=19.
+...text:The remaining 4 symbols are:
+...text: 'B'=20 (Aspartic Acid, Asparagine), 'Z'=21 (Glutamic Acid, Glutamine), 'X'=22 (unknown), '*'=23 (terminator)
+...text:Objects of type $AminoAcid10$ can be converted to $char$ and vice versa.
+Unkown values are converted to $'X'$.
+...text:$AminoAcid10$ is typedef for $SimpleType<char,AminoAcid10_>$, while $AminoAcid10_$ is a helper
+specialization tag class.
+..see:Metafunction.ValueSize
+..include:seqan/basic.h
+*/
+
+struct AminoAcid10_ {};
+typedef SimpleType<unsigned char, AminoAcid10_> AminoAcid10;
+
+template <> struct ValueSize<AminoAcid10>
+{
+ typedef __uint8 Type;
+ static const Type VALUE = 10;
+};
+
+template <> struct BitsPerValue<AminoAcid10>
+{
+ typedef __uint8 Type;
+ static const Type VALUE = 4;
+};
+
+inline AminoAcid10
+unknownValueImpl(AminoAcid10 *)
+{
+ static const AminoAcid10 _result = AminoAcid10('X');
+ return _result;
+}
+
+// --------------------------------------------------------------------------
+// Amino Acid
+// --------------------------------------------------------------------------
+
+template <typename T = void>
+struct TranslateTableAA10ToAscii_
+{
+ static char const VALUE[10];
+};
+
+template <typename T>
+char const TranslateTableAA10ToAscii_<T>::VALUE[10] =
+{
+ 'A', // 0 Acidic or Proto-Acidic
+ // 'N' Asn Asparagine, 'D' Asp Apartic Acid, 'B' Asn or Asp
+ // 'Q' Gln Glutamine, 'E' Glu Glutamic Acid, 'Z' Glu or Gln
+ 'B', // 1 BASIC
+ // 'R' Arg Arginine, 'K' Lys Lysine
+ 'C', // 2 Cys Cystine
+ 'H', // 3 His Histidine
+ 'S', // 4 SMALL UNPOLAR
+ // 'A' Ala Alanine, 'G' Gly Glycine
+ 'U', // 5 UNPOLAR
+ // 'I' Ile Isoleucine, 'L' Leu Leucine, 'V' Val Valine, 'M' Met Methionine
+ 'O', // 6 POLAR
+ // 'S' Ser Serine, 'T' Thr Threonine
+ 'P', // 7 Pro Proline
+ 'R', // 8 AROMATIC
+ //'F' Phe Phenylalanine, 'W' Trp Tryptophan, 'Y' Tyr Tyrosine
+ 'X', // 9 'X' Unknown, '*' Terminator
+};
+
+template <typename T = void>
+struct TranslateTableAsciiToAA10_
+{
+ static char const VALUE[256];
+};
+
+template <typename T>
+char const TranslateTableAsciiToAA10_<T>::VALUE[256] =
+{
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, //0
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, //1
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, //2
+// *
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, //3
+ 9, 4, 0, 2, 0, 0, 8, 4, 3, 5, 9, 1, 5, 5, 0, 9, //4
+// , A, B, C, D, E, F, G, H, I, J, K, L, M, N, O,
+
+ 7, 0, 1, 6, 6, 9, 5, 8, 9, 8, 0, 9, 9, 9, 9, 9, //5
+// P, Q, R, S, T, U, V, W, X, Y, Z, , , , , ,
+
+ 9, 4, 0, 2, 0, 0, 8, 4, 3, 5, 9, 1, 5, 5, 0, 9, //6
+// , a, b, c, d, e, f, g, h, i, j, k, l, m, n, o,
+
+ 7, 0, 1, 6, 6, 9, 5, 8, 9, 8, 0, 9, 9, 9, 9, 9, //7
+// p, q, r, s, t, u, v, w, x, y, z, , , , , ,
+
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, //8
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, //9
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, //10
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, //11
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, //12
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, //13
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, //14
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9 //15
+};
+
+template <typename T = void>
+struct TranslateTableAAToAA10_
+{
+ static char const VALUE[24];
+};
+
+template <typename T>
+char const TranslateTableAAToAA10_<T>::VALUE[24] =
+{
+ 4, 1, 0, 0, 0, 2, 0, 4, 3, 5, 5, 1, 5, 8, 7, 6, 6, 8, 8,
+// A R N D C Q E G H I L K M F P S T W Y
+ 5, 0, 0, 9, 9
+// V B Z X *
+};
+
+
+template <typename T = void>
+struct TranslateTableByteToAA10_
+{
+ static char const VALUE[256];
+};
+
+template <typename T>
+char const TranslateTableByteToAA10_<T>::VALUE[256] =
+{
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9, 9, 9, 9, 9, //0
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, //1
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, //2
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, //3
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, //4
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, //5
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, //6
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, //7
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, //8
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, //9
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, //10
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, //11
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, //12
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, //13
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, //14
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9 //15
+};
+
+
+
+inline void assign(char & c_target, AminoAcid10 const & source)
+{
+ c_target = TranslateTableAA10ToAscii_<>::VALUE[source.value];
+}
+
+
+template <>
+struct CompareType<AminoAcid10, __uint8>
+{
+ typedef AminoAcid10 Type;
+};
+
+inline void assign(AminoAcid10 & target, __uint8 c_source)
+{
+ target.value = TranslateTableByteToAA10_<>::VALUE[c_source];
+}
+
+template <>
+struct CompareType<AminoAcid10, char>
+{
+ typedef AminoAcid10 Type;
+};
+
+inline void assign(AminoAcid10 & target, char c_source)
+{
+ target.value = TranslateTableAsciiToAA10_<>::VALUE[(unsigned char) c_source];
+}
+
+template <>
+struct CompareType<AminoAcid10, AminoAcid>
+{
+ typedef AminoAcid10 Type;
+};
+
+inline void assign(AminoAcid10 & target, AminoAcid c_source)
+{
+ target.value = TranslateTableAAToAA10_<>::VALUE[c_source.value];
+}
+
+} // namespace seqan
+
+#endif // header guard
\ No newline at end of file
diff --git a/src/holders.hpp b/src/holders.hpp
new file mode 100644
index 0000000..8c1613c
--- /dev/null
+++ b/src/holders.hpp
@@ -0,0 +1,548 @@
+// ==========================================================================
+// lambda
+// ==========================================================================
+// Copyright (c) 2013-2015, Hannes Hauswedell, FU Berlin
+// All rights reserved.
+//
+// This file is part of Lambda.
+//
+// Lambda is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// Lambda is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with Lambda. If not, see <http://www.gnu.org/licenses/>.*/
+// ==========================================================================
+// Author: Hannes Hauswedell <hannes.hauswedell @ fu-berlin.de>
+// ==========================================================================
+// holders.hpp: Data container structs
+// ==========================================================================
+
+
+#ifndef SEQAN_LAMBDA_HOLDERS_H_
+#define SEQAN_LAMBDA_HOLDERS_H_
+
+
+#include "match.hpp"
+#include "options.hpp"
+
+// ============================================================================
+// Forwards
+// ============================================================================
+
+// ============================================================================
+// Metafunctions
+// ============================================================================
+
+// ============================================================================
+// Functions
+// ============================================================================
+
+// ============================================================================
+// Tags, Classes, Enums
+// ============================================================================
+
+// ----------------------------------------------------------------------------
+// struct StatsHolder
+// ----------------------------------------------------------------------------
+
+struct StatsHolder
+{
+// seeding
+ uint64_t hitsAfterSeeding;
+ uint64_t hitsMerged;
+ uint64_t hitsTooShort;
+ uint64_t hitsMasked;
+
+// pre-extension
+ uint64_t hitsFailedPreExtendTest;
+ uint64_t hitsPutativeDuplicate;
+ uint64_t hitsPutativeAbundant;
+
+// post-extension
+ uint64_t hitsFailedExtendPercentIdentTest;
+ uint64_t hitsFailedExtendEValueTest;
+ uint64_t hitsAbundant;
+ uint64_t hitsDuplicate;
+
+// final
+ uint64_t hitsFinal;
+ uint64_t qrysWithHit;
+
+ StatsHolder()
+ {
+ clear();
+ }
+
+ void clear()
+ {
+ hitsAfterSeeding = 0;
+ hitsMerged = 0;
+ hitsTooShort = 0;
+ hitsMasked = 0;
+
+ hitsFailedPreExtendTest = 0;
+ hitsPutativeDuplicate = 0;
+ hitsPutativeAbundant = 0;
+
+ hitsFailedExtendPercentIdentTest = 0;
+ hitsFailedExtendEValueTest = 0;
+ hitsAbundant = 0;
+ hitsDuplicate = 0;
+
+ hitsFinal = 0;
+ qrysWithHit = 0;
+ }
+
+ StatsHolder plus(StatsHolder const & rhs)
+ {
+ hitsAfterSeeding += rhs.hitsAfterSeeding;
+ hitsMerged += rhs.hitsMerged;
+ hitsTooShort += rhs.hitsTooShort;
+ hitsMasked += rhs.hitsMasked;
+
+ hitsFailedPreExtendTest += rhs.hitsFailedPreExtendTest;
+ hitsPutativeDuplicate += rhs.hitsPutativeDuplicate;
+ hitsPutativeAbundant += rhs.hitsPutativeAbundant;
+
+ hitsFailedExtendPercentIdentTest += rhs.hitsFailedExtendPercentIdentTest;
+ hitsFailedExtendEValueTest += rhs.hitsFailedExtendEValueTest;
+ hitsAbundant += rhs.hitsAbundant;
+ hitsDuplicate += rhs.hitsDuplicate;
+
+ hitsFinal += rhs.hitsFinal;
+ qrysWithHit += rhs.qrysWithHit;
+ return *this;
+ }
+
+ StatsHolder operator+(StatsHolder const& rhs)
+ {
+ StatsHolder tmp(*this);
+ return tmp.plus(rhs);
+ }
+
+ StatsHolder operator+=(StatsHolder const& rhs)
+ {
+ this->plus(rhs);
+ return *this;
+ }
+};
+
+
+// ----------------------------------------------------------------------------
+// struct GlobalDataHolder -- one object per program
+// ----------------------------------------------------------------------------
+
+template <typename T>
+inline T &
+_initHelper(T & t1, T &&)
+{
+ std::cout << "FOO\n";
+ return t1;
+}
+
+template <typename T, typename T2>
+inline T2 &&
+_initHelper(T &, T2 && t2)
+{
+ std::cout << "BAR\n";
+ return std::move(t2);
+}
+
+template <typename TRedAlph_,
+ typename TScoreScheme_,
+ typename TIndexSpec_,
+ typename TFileFormat,
+ BlastProgram p,
+ BlastTabularSpec h>
+class GlobalDataHolder
+{
+public:
+ using TRedAlph = RedAlph<p, TRedAlph_>; // ensures == Dna5 for BlastN
+
+ static constexpr BlastProgram blastProgram = p;
+ static constexpr bool indexIsFM = std::is_same<TIndexSpec_, TFMIndex<>>::value;
+ static constexpr bool alphReduction = !std::is_same<TransAlph<p>, TRedAlph>::value;
+
+ /* Sequence storage types */
+ using TStringTag = Alloc<>;
+#if defined(LAMBDA_MMAPPED_DB)
+ using TDirectStringTag = MMap<>;
+#else
+ using TDirectStringTag = TStringTag;
+#endif
+ using TQryTag = TStringTag;
+ using TSubjTag = TDirectStringTag; // even if subjects were translated they are now loaded from disk
+
+ /* untranslated query sequences (ONLY USED FOR SAM/BAM OUTPUT) */
+ using TUntransQrySeqs = StringSet<String<OrigQryAlph<p>, TQryTag>, Owner<ConcatDirect<>>>;
+
+ /* Possibly translated but yet unreduced sequences */
+ template <typename TSpec>
+ using TTransSeqs = StringSet<String<TransAlph<p>, TSpec>, Owner<ConcatDirect<>>>;
+ using TTransQrySeqs = TTransSeqs<TQryTag>;
+ using TTransSubjSeqs = TTransSeqs<TSubjTag>;
+ using TTransSubjReal = typename std::conditional<
+ alphReduction || indexIsFM,
+ TTransSubjSeqs, // real type
+ TTransSubjSeqs &>::type; // will be initialized in constructor
+
+ /* Reduced sequence objects, either as modstrings or as references to trans-strings */
+ template <typename TSpec>
+ using TRedAlphModString = ModifiedString<String<TransAlph<p>, TSpec>,
+ ModView<FunctorConvert<TransAlph<p>, TRedAlph>>>;
+
+ using TRedQrySeqs = typename std::conditional<
+ alphReduction,
+ StringSet<TRedAlphModString<TQryTag>, Owner<ConcatDirect<>>>, // modview
+ TTransQrySeqs &>::type; // reference to owner
+ using TRedSubjSeqs = typename std::conditional<
+ alphReduction,
+ StringSet<TRedAlphModString<TSubjTag>, Owner<ConcatDirect<>>>, // modview
+ TTransSubjSeqs &>::type; // reference to owner
+
+ /* sequence ID strings */
+ template <typename TSpec>
+ using TIds = StringSet<String<char, TSpec>, Owner<ConcatDirect<>>>;
+ using TQryIds = TIds<TQryTag>;
+ using TSubjIds = TIds<TSubjTag>;
+
+ /* indeces and their type */
+ using TIndexSpec = TIndexSpec_;
+ using TDbIndex = Index<typename std::remove_reference<TRedSubjSeqs>::type, TIndexSpec>;
+
+ /* output file */
+ using TScoreScheme = TScoreScheme_;
+ using TIOContext = BlastIOContext<TScoreScheme, p, h>;
+ using TFile = FormattedFile<TFileFormat, Output, TIOContext>;
+ using TBamFile = FormattedFile<Bam, Output, BlastTabular>;
+
+ /* misc types */
+ using TPositions = typename StringSetLimits<TTransQrySeqs>::Type;
+ using TMasking = StringSet<String<unsigned>, Owner<ConcatDirect<>>>;
+
+ /* the actual members */
+ TDbIndex dbIndex;
+
+ TUntransQrySeqs untranslatedQrySeqs; // used iff outformat is sam or bam
+
+ TTransQrySeqs qrySeqs;
+ TTransSubjReal subjSeqs;
+
+ TRedQrySeqs redQrySeqs;
+ TRedSubjSeqs redSubjSeqs;
+
+ TQryIds qryIds;
+ TSubjIds subjIds;
+
+ TFile outfile;
+ TBamFile outfileBam;
+
+ TPositions untransQrySeqLengths; // used iff qIsTranslated(p)
+ TPositions untransSubjSeqLengths; // used iff sIsTranslated(p)
+
+ TMasking segIntStarts;
+ TMasking segIntEnds;
+
+ StatsHolder stats;
+
+ GlobalDataHolder() :
+ subjSeqs(_initHelper(indexText(dbIndex), TTransSubjSeqs())),//std::integral_constant<bool, alphReduction || indexIsFM>())),// : TTransSubjSeqs()),
+ redQrySeqs(qrySeqs),
+ redSubjSeqs(subjSeqs),
+ stats()
+ {}
+};
+
+/* Documentation on the confusing type resolution used in the above class:
+ *
+ * !alphReduction && !indexIsFM e.g. BLASTN and SA-Index
+ *
+ * subjSeqs is & and initialized with indexText()
+ * redSubjSeqs is & and initialized with subjSeqs
+ * indexText(dbIndex) is non-ref owner StringSet assigned by loadDbIndexFromDisk()
+ *
+ * !alphReduction && indexIsFM e.g. BLASTN and FM-Index
+ *
+ * subjSeqs is non-ref owner StringSet and assigned in loadSubjects()
+ * redSubjSeqs is & and initialized with subjSeqs
+ * indexText(dbIndex) is non-ref owner StringSet, but never set (fmIndex doesnt need it)
+ *
+ * alphReduction && indexIsFM e.g. default
+ *
+ * subjSeqs is non-ref owner StringSet and assigned in loadSubjects()
+ * redSubjSeqs is lightweight reduced StringSet and initialized with subjSeqs
+ * indexText(dbIndex) is lightweight reduced StringSet, but never set (fmIndex doesnt need it)
+ *
+ * alphReduction && !indexIsFM e.g. default
+ *
+ * subjSeqs is non-ref owner StringSet and assigned in loadSubjects()
+ * redSubjSeqs is lightweight reduced StringSet and initialized with subjSeqs
+ * indexText(dbIndex) is lightweight reduced StringSet and assigned redSubjSeqs in loadDbIndexFromDisk
+ */
+
+// ----------------------------------------------------------------------------
+// struct LocalDataHolder -- one object per thread
+// ----------------------------------------------------------------------------
+
+template <typename TMatch,
+ typename TGlobalHolder_,
+ typename TScoreExtension>
+class LocalDataHolder
+{
+public:
+ using TGlobalHolder = TGlobalHolder_;
+ using TRedQrySeq = typename Value<typename std::remove_reference<typename TGlobalHolder::TRedQrySeqs>::type>::Type;
+ using TSeeds = StringSet<typename Infix<TRedQrySeq const>::Type>;
+ using TSeedIndex = Index<TSeeds, IndexSa<>>;
+
+
+ // references to global stuff
+ LambdaOptions const & options;
+ TGlobalHolder /*const*/ & gH;
+ static constexpr BlastProgram blastProgram = TGlobalHolder::blastProgram;
+
+ // this is the localHolder for the i-th part of the queries
+ uint64_t i;
+
+ // regarding range of queries
+ uint64_t indexBeginQry;
+ uint64_t indexEndQry;
+
+ // regarding seedingp
+ TSeeds seeds;
+ TSeedIndex seedIndex;
+// std::forward_list<TMatch> matches;
+ std::vector<TMatch> matches;
+ std::vector<typename Match::TQId> seedRefs; // mapping seed -> query
+ std::vector<uint16_t> seedRanks; // mapping seed -> relative rank
+
+ // regarding extension
+ using TAlignRow0 = Gaps<typename Infix<typename Value<typename TGlobalHolder::TTransQrySeqs>::Type>::Type,
+ ArrayGaps>;
+ using TAlignRow1 = Gaps<typename Infix<typename Value<typename TGlobalHolder::TTransSubjSeqs>::Type>::Type,
+ ArrayGaps>;
+ using TDPContext = DPContext<typename Value<typename TGlobalHolder::TScoreScheme>::Type, TScoreExtension>;
+ using TAliExtContext = AliExtContext_<TAlignRow0, TAlignRow1, TDPContext>;
+
+ TAliExtContext alignContext;
+
+
+ // regarding the gathering of stats
+ StatsHolder stats;
+
+ // progress string
+ std::stringstream statusStr;
+
+ // constructor
+ LocalDataHolder(LambdaOptions const & _options,
+ TGlobalHolder /*const*/ & _globalHolder) :
+ options(_options), gH(_globalHolder), stats()
+ {}
+
+ // copy constructor SHALLOW COPY ONLY, REQUIRED FOR firsprivate()
+ LocalDataHolder(LocalDataHolder const & rhs) :
+ options(rhs.options), gH(rhs.gH), stats()
+ {}
+
+ void init(uint64_t const _i)
+ {
+ i = _i;
+
+ if (options.doubleIndexing)
+ {
+ indexBeginQry = (length(gH.qrySeqs) / options.queryPart) * i;
+ indexEndQry = (i+1 == options.queryPart) // last interval
+ ? length(gH.qrySeqs) // reach until end
+ : (length(gH.qrySeqs) / options.queryPart) * (i+1);
+ // make sure different frames of one sequence in same interval
+ indexBeginQry -= (indexBeginQry % qNumFrames(blastProgram));
+ indexEndQry -= (indexEndQry % qNumFrames(blastProgram));
+ } else
+ {
+ indexBeginQry = qNumFrames(blastProgram) * i;
+ indexEndQry = qNumFrames(blastProgram) * (i+1);
+ }
+
+ clear(seeds);
+ clear(seedIndex);
+ matches.clear();
+ seedRefs.clear();
+ seedRanks.clear();
+// stats.clear();
+ statusStr.clear();
+ statusStr.precision(2);
+ }
+
+};
+
+// perform a fast local alignment score calculation on the seed and see if we
+// reach above threshold
+// WARNING the following function only works for hammingdistanced seeds
+template <typename TMatch,
+ typename TGlobalHolder,
+ typename TScoreExtension>
+inline bool
+seedLooksPromising(
+ LocalDataHolder<TMatch, TGlobalHolder, TScoreExtension> const & lH,
+ TMatch const & m)
+{
+ // no pre-scoring, but still filter out XXX and NNN hits
+// if (!lH.options.preScoring))
+// {
+// for (unsigned i = m.qryStart, count = 0; i < m.qryStart + lH.options.seedLength; ++i)
+// if (lH.gH.qrySeqs[m.qryId][i] == unkownValue<typename TGlobalHolder::TRedAlph>())
+// if (++count > lH.options.maxSeedDist)
+// return false;
+// return true;
+// }
+
+ int64_t effectiveQBegin = m.qryStart;
+ int64_t effectiveSBegin = m.subjStart;
+ uint64_t effectiveLength = lH.options.seedLength * lH.options.preScoring;
+ if (lH.options.preScoring > 1)
+ {
+ effectiveQBegin -= (lH.options.preScoring - 1) *
+ lH.options.seedLength / 2;
+ effectiveSBegin -= (lH.options.preScoring - 1) *
+ lH.options.seedLength / 2;
+// std::cout << effectiveQBegin << "\t" << effectiveSBegin << "\n";
+ int64_t min = std::min(effectiveQBegin, effectiveSBegin);
+ if (min < 0)
+ {
+ effectiveQBegin -= min;
+ effectiveSBegin -= min;
+ effectiveLength += min;
+ }
+
+ effectiveLength = std::min({
+ length(lH.gH.qrySeqs[m.qryId]) - effectiveQBegin,
+ length(lH.gH.subjSeqs[m.subjId]) - effectiveSBegin,
+ effectiveLength});
+// std::cout << effectiveQBegin << "\t" << effectiveSBegin << "\t"
+// << effectiveLength << "\n";
+ }
+
+ auto const & qSeq = infix(lH.gH.qrySeqs[m.qryId],
+ effectiveQBegin,
+ effectiveQBegin + effectiveLength);
+ auto const & sSeq = infix(lH.gH.subjSeqs[m.subjId],
+ effectiveSBegin,
+ effectiveSBegin + effectiveLength);
+ int maxScore = 0;
+
+ int scores[effectiveLength+1]; // C99, C++14, -Wno-vla before that
+ scores[0] = 0;
+
+ // score the diagonal
+ for (uint64_t i = 0; i < effectiveLength; ++i)
+ {
+ scores[i] += score(seqanScheme(context(lH.gH.outfile).scoringScheme), qSeq[i], sSeq[i]);
+ if (scores[i] < 0)
+ scores[i] = 0;
+ else if (scores[i] > maxScore)
+ maxScore = scores[i];
+
+// if (i < static_cast<uint64_t>(effectiveLength - 1)) // TODO remove
+ scores[i+1] = scores[i];
+ }
+
+ return (maxScore >= int(lH.options.preScoringThresh * effectiveLength));
+}
+
+template <typename TMatch,
+ typename TGlobalHolder,
+ typename TScoreExtension,
+ typename TSeedId,
+ typename TSubjOcc>
+inline void
+onFindImpl(LocalDataHolder<TMatch, TGlobalHolder, TScoreExtension> & lH,
+ TSeedId const & seedId,
+ TSubjOcc subjOcc)
+{
+ if (TGlobalHolder::indexIsFM) // positions are reversed
+ setSeqOffset(subjOcc,
+ length(lH.gH.subjSeqs[getSeqNo(subjOcc)])
+ - getSeqOffset(subjOcc)
+ - lH.options.seedLength);
+
+ Match m {static_cast<Match::TQId>(lH.seedRefs[seedId]),
+ static_cast<Match::TSId>(getSeqNo(subjOcc)),
+ static_cast<Match::TPos>(lH.seedRanks[seedId] * lH.options.seedOffset),
+ static_cast<Match::TPos>(getSeqOffset(subjOcc))};
+
+ bool discarded = false;
+ auto const halfSubjL = lH.options.seedLength / 2;
+
+ if (!sIsTranslated(lH.gH.blastProgram))
+ {
+ for (unsigned k = 0; k < length(lH.gH.segIntStarts[m.subjId]); ++k)
+ {
+ // more than half of the seed falls into masked interval
+ if (intervalOverlap(m.subjStart,
+ m.subjStart + lH.options.seedLength,
+ lH.gH.segIntStarts[m.subjId][k],
+ lH.gH.segIntEnds[m.subjId][k])
+ >= halfSubjL)
+ {
+ ++lH.stats.hitsMasked;
+ discarded = true;
+ break;
+ }
+ }
+ }
+
+ if ((!discarded) && (!seedLooksPromising(lH, m)))
+ {
+ discarded = true;
+ ++lH.stats.hitsFailedPreExtendTest;
+ }
+
+ if (!discarded)
+ lH.matches.emplace_back(m);
+}
+
+template <typename TMatch,
+ typename TGlobalHolder,
+ typename TScoreExtension,
+ typename TFinder>
+inline void
+onFindDoubleIndex(LocalDataHolder<TMatch, TGlobalHolder, TScoreExtension> & lH,
+ TFinder const & finder)
+{
+ auto qryOccs = getOccurrences(back(finder.patternStack));
+ auto subjOccs = getOccurrences(back(finder.textStack));
+
+ lH.stats.hitsAfterSeeding += length(qryOccs) * length(subjOccs);
+
+ for (unsigned i = 0; i < length(qryOccs); ++i)
+ for (unsigned j = 0; j < length(subjOccs); ++j)
+ onFindImpl(lH, getSeqNo(qryOccs[i]), subjOccs[j]);
+}
+
+template <typename TMatch,
+ typename TGlobalHolder,
+ typename TScoreExtension,
+ typename TSeedListIterator,
+ typename TIndexIterator>
+inline void
+onFindSingleIndex(LocalDataHolder<TMatch, TGlobalHolder, TScoreExtension> & lH,
+ TSeedListIterator const & seedIt,
+ TIndexIterator const & indexIt)
+{
+ auto qryOcc = position(seedIt);
+ auto subjOccs = getOccurrences(indexIt);
+
+ lH.stats.hitsAfterSeeding += length(subjOccs);
+
+ for (unsigned j = 0; j < length(subjOccs); ++j)
+ onFindImpl(lH, qryOcc, subjOccs[j]);
+}
+
+#endif // SEQAN_LAMBDA_HOLDERS_H_
diff --git a/src/lambda.cpp b/src/lambda.cpp
new file mode 100644
index 0000000..4f97ca2
--- /dev/null
+++ b/src/lambda.cpp
@@ -0,0 +1,563 @@
+// ==========================================================================
+// lambda
+// ==========================================================================
+// Copyright (c) 2013-2015, Hannes Hauswedell, FU Berlin
+// All rights reserved.
+//
+// This file is part of Lambda.
+//
+// Lambda is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// Lambda is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with Lambda. If not, see <http://www.gnu.org/licenses/>.*/
+// ==========================================================================
+// Author: Hannes Hauswedell <hannes.hauswedell @ fu-berlin.de>
+// ==========================================================================
+// lambda.cpp: Main File for Lambda
+// ==========================================================================
+
+#include <iostream>
+
+#include <seqan/basic.h>
+#include <seqan/sequence.h>
+#include <seqan/arg_parse.h>
+#include <seqan/seq_io.h>
+#include <seqan/reduced_aminoacid.h>
+#include <seqan/misc/terminal.h>
+
+#include "options.hpp"
+#include "match.hpp"
+#include "lambda.hpp"
+#include "misc.hpp"
+
+using namespace seqan;
+
+
+// inline BlastFormatFile
+// _fileType(LambdaOptions const & options)
+// {
+// if (endsWith(options.output, ".m0"))
+// return BlastFormatFile::PAIRWISE;
+// else if (endsWith(options.output, ".m8"))
+// return BlastFormatFile::TABULAR;
+// else if (endsWith(options.output, ".m9"))
+// return BlastFormatFile::TABULAR_WITH_COMMENTS;
+// else
+// return BlastFormatFile::UNKNOWN;
+// }
+
+// forwards
+
+inline int
+argConv0(LambdaOptions const & options);
+//-
+template <typename TOutFormat,
+ BlastTabularSpec h>
+inline int
+argConv05(LambdaOptions const & options,
+ TOutFormat const & /**/,
+ BlastTabularSpecSelector<h> const &);
+//-
+template <typename TOutFormat,
+ BlastTabularSpec h,
+ BlastProgram p>
+inline int
+argConv1(LambdaOptions const & options,
+ TOutFormat const & /**/,
+ BlastTabularSpecSelector<h> const &,
+ BlastProgramSelector<p> const &);
+//-
+template <typename TOutFormat,
+ typename TRedAlph,
+ BlastTabularSpec h,
+ BlastProgram p>
+inline int
+argConv2(LambdaOptions const & options,
+ TOutFormat const &,
+ BlastTabularSpecSelector<h> const &,
+ BlastProgramSelector<p> const &,
+ TRedAlph const &);
+//-
+template <typename TOutFormat,
+ typename TRedAlph,
+ typename TScoreScheme,
+ BlastTabularSpec h,
+ BlastProgram p>
+inline int
+argConv3(LambdaOptions const & options,
+ TOutFormat const &,
+ BlastTabularSpecSelector<h> const &,
+ BlastProgramSelector<p> const &,
+ TRedAlph const &,
+ TScoreScheme const &);
+//-
+template <typename TOutFormat,
+ typename TRedAlph,
+ typename TScoreScheme,
+ typename TScoreExtension,
+ BlastTabularSpec h,
+ BlastProgram p>
+inline int
+argConv4(LambdaOptions const & options,
+ TOutFormat const & /**/,
+ BlastTabularSpecSelector<h> const &,
+ BlastProgramSelector<p> const &,
+ TRedAlph const & /**/,
+ TScoreScheme const & /**/,
+ TScoreExtension const & /**/);
+//-
+template <typename TIndexSpec,
+ typename TRedAlph,
+ typename TScoreScheme,
+ typename TScoreExtension,
+ typename TOutFormat,
+ BlastProgram p,
+ BlastTabularSpec h>
+inline int
+realMain(LambdaOptions const & options,
+ TOutFormat const & /**/,
+ BlastTabularSpecSelector<h> const &,
+ BlastProgramSelector<p> const &,
+ TRedAlph const & /**/,
+ TScoreScheme const & /**/,
+ TScoreExtension const & /**/);
+
+// --------------------------------------------------------------------------
+// Function main()
+// --------------------------------------------------------------------------
+
+// Program entry point.
+
+int main(int argc, char const ** argv)
+{
+ // Parse the command line.
+ seqan::ArgumentParser parser;
+ LambdaOptions options;
+ seqan::ArgumentParser::ParseResult res = parseCommandLine(options, argc, argv);
+
+ // If there was an error parsing or built-in argument parser functionality
+ // was triggered then we exit the program. The return code is 1 if there
+ // were errors and 0 if there were none.
+ if (res != seqan::ArgumentParser::PARSE_OK)
+ return res == seqan::ArgumentParser::PARSE_ERROR;
+
+ if (std::string(CMAKE_BUILD_TYPE) != "Release")
+ std::cerr << "WARNING: This binary is not built in release mode and will be much slower than it should be!\n";
+ return argConv0(options);
+}
+
+// CONVERT Run-time options to compile-time Format-Type
+inline int
+argConv0(LambdaOptions const & options)
+{
+ CharString output = options.output;
+ if (endsWith(output, ".gz"))
+ output = prefix(output, length(output) - 3);
+ else if (endsWith(output, ".bz2"))
+ output = prefix(output, length(output) - 4);
+
+ if (endsWith(output, ".m0"))
+ return argConv05(options, BlastReport(), BlastTabularSpecSelector<BlastTabularSpec::NO_COMMENTS>());
+ else if (endsWith(output, ".m8"))
+ return argConv05(options, BlastTabular(), BlastTabularSpecSelector<BlastTabularSpec::NO_COMMENTS>());
+ else if (endsWith(output, ".m9"))
+ return argConv05(options, BlastTabular(), BlastTabularSpecSelector<BlastTabularSpec::COMMENTS>());
+ else if (endsWith(output, ".sam") || endsWith(output, ".bam")) // handled elsewhere
+ return argConv05(options, BlastTabular(), BlastTabularSpecSelector<BlastTabularSpec::COMMENTS>());
+
+ std::cerr << "ERROR: Cannot handle output extension.\n";
+ return -1;
+}
+
+template <typename TOutFormat,
+ BlastTabularSpec h>
+inline int
+argConv05(LambdaOptions const & options,
+ TOutFormat const & /**/,
+ BlastTabularSpecSelector<h> const &)
+{
+ switch(options.blastProgram)
+ {
+#ifndef FASTBUILD
+ case BlastProgram::BLASTN:
+ return argConv1(options,
+ TOutFormat(),
+ BlastTabularSpecSelector<h>(),
+ BlastProgramSelector<BlastProgram::BLASTN>());
+#endif
+ case BlastProgram::BLASTP:
+ return argConv1(options,
+ TOutFormat(),
+ BlastTabularSpecSelector<h>(),
+ BlastProgramSelector<BlastProgram::BLASTP>());
+ case BlastProgram::BLASTX:
+ return argConv1(options,
+ TOutFormat(),
+ BlastTabularSpecSelector<h>(),
+ BlastProgramSelector<BlastProgram::BLASTX>());
+#ifndef FASTBUILD
+ case BlastProgram::TBLASTN:
+ return argConv1(options,
+ TOutFormat(),
+ BlastTabularSpecSelector<h>(),
+ BlastProgramSelector<BlastProgram::TBLASTN>());
+ case BlastProgram::TBLASTX:
+ return argConv1(options,
+ TOutFormat(),
+ BlastTabularSpecSelector<h>(),
+ BlastProgramSelector<BlastProgram::TBLASTX>());
+#endif
+ default:
+ break;
+ }
+ std::cerr << "ERROR: Cannot handle program mode (perhaps you are building in FASTMODE?).\n";
+ return -1;
+}
+
+
+/// Alphabet reduction
+template <typename TOutFormat,
+ BlastTabularSpec h,
+ BlastProgram p>
+inline int
+argConv1(LambdaOptions const & options,
+ TOutFormat const & /**/,
+ BlastTabularSpecSelector<h> const &,
+ BlastProgramSelector<p> const &)
+{
+ using TUnred = typename std::conditional<p == BlastProgram::BLASTN, Dna5, AminoAcid>::type;
+ using Th = BlastTabularSpecSelector<h>;
+ using Tp = BlastProgramSelector<p>;
+ switch (options.alphReduction)
+ {
+ case 0:
+ return argConv2(options, TOutFormat(), Th(), Tp(), TUnred());
+ case 2:
+ return argConv2(options, TOutFormat(), Th(), Tp(), ReducedAminoAcid<Murphy10>());
+#if 0
+ case 10:
+ return argConv2(options, TOutFormat(), ReducedAminoAcid<ClusterReduction<10>>());
+ case 1:
+ return argConv2(options, TOutFormat(), AminoAcid10());
+ case 8:
+ return argConv2(options, TOutFormat(), ReducedAminoAcid<ClusterReduction<8>>());
+ case 12:
+ return argConv2(options, TOutFormat(), ReducedAminoAcid<ClusterReduction<12>>());
+#endif
+ default:
+ break;
+ }
+ std::cerr << "ERROR: Cannot handle the specified alphabet reduction.\n";
+ return -1;
+}
+
+/// scoring scheme
+
+template <typename TOutFormat,
+ typename TRedAlph,
+ BlastTabularSpec h,
+ BlastProgram p>
+inline int
+argConv2(LambdaOptions const & options,
+ TOutFormat const &,
+ BlastTabularSpecSelector<h> const &,
+ BlastProgramSelector<p> const &,
+ TRedAlph const &)
+{
+ using Th = BlastTabularSpecSelector<h>;
+ using Tp = BlastProgramSelector<p>;
+ switch (options.scoringMethod)
+ {
+#ifndef FASTBUILD
+ case 0:
+ return argConv3(options, TOutFormat(), Th(), Tp(), TRedAlph(), Score<int, Simple>());
+ case 45:
+ return argConv3(options, TOutFormat(), Th(), Tp(), TRedAlph(), Blosum45());
+ case 80:
+ return argConv3(options, TOutFormat(), Th(), Tp(), TRedAlph(), Blosum80());
+#endif
+ case 62:
+ return argConv3(options, TOutFormat(), Th(), Tp(), TRedAlph(), Blosum62());
+ default:
+ break;
+ }
+ std::cerr << "Unsupported Scoring Scheme selected.\n";
+ return -1;
+}
+
+template <typename TOutFormat,
+ typename TRedAlph,
+ typename TScoreScheme,
+ BlastTabularSpec h,
+ BlastProgram p>
+inline int
+argConv3(LambdaOptions const & options,
+ TOutFormat const &,
+ BlastTabularSpecSelector<h> const &,
+ BlastProgramSelector<p> const &,
+ TRedAlph const &,
+ TScoreScheme const &)
+{
+#ifndef FASTBUILD
+ if (options.gapOpen == 0)
+ return argConv4(options,
+ TOutFormat(),
+ BlastTabularSpecSelector<h>(),
+ BlastProgramSelector<p>(),
+ TRedAlph(),
+ TScoreScheme(),
+ LinearGaps());
+ else
+#endif
+ return argConv4(options,
+ TOutFormat(),
+ BlastTabularSpecSelector<h>(),
+ BlastProgramSelector<p>(),
+ TRedAlph(),
+ TScoreScheme(),
+ AffineGaps());
+}
+
+template <typename TOutFormat,
+ typename TRedAlph,
+ typename TScoreScheme,
+ typename TScoreExtension,
+ BlastTabularSpec h,
+ BlastProgram p>
+inline int
+argConv4(LambdaOptions const & options,
+ TOutFormat const & /**/,
+ BlastTabularSpecSelector<h> const &,
+ BlastProgramSelector<p> const &,
+ TRedAlph const & /**/,
+ TScoreScheme const & /**/,
+ TScoreExtension const & /**/)
+{
+ int indexType = options.dbIndexType;
+// if (indexType == -1) // autodetect
+// {
+// //TODO FIX THIS WITH NEW EXTENSIONS
+// CharString file = options.dbFile;
+// append(file, ".sa");
+// struct stat buffer;
+// if (stat(toCString(file), &buffer) == 0)
+// {
+// indexType = 0;
+// } else
+// {
+// file = options.dbFile;
+// append(file, ".sa.val"); // FM Index
+// struct stat buffer;
+// if (stat(toCString(file), &buffer) == 0)
+// {
+// indexType = 1;
+// } else
+// {
+// std::cerr << "No Index file could be found, please make sure paths "
+// << "are correct and the files are readable.\n" << std::flush;
+//
+// return -1;
+// }
+// }
+// }
+
+ if (indexType == 0)
+ return realMain<IndexSa<>>(options,
+ TOutFormat(),
+ BlastTabularSpecSelector<h>(),
+ BlastProgramSelector<p>(),
+ TRedAlph(),
+ TScoreScheme(),
+ TScoreExtension());
+ else
+ return realMain<TFMIndex<>>(options,
+ TOutFormat(),
+ BlastTabularSpecSelector<h>(),
+ BlastProgramSelector<p>(),
+ TRedAlph(),
+ TScoreScheme(),
+ TScoreExtension());
+}
+
+/// REAL MAIN
+#ifdef _OPENMP
+#define TID omp_get_thread_num()
+#else
+#define TID 0
+#endif
+template <typename TIndexSpec,
+ typename TRedAlph,
+ typename TScoreScheme,
+ typename TScoreExtension,
+ typename TOutFormat,
+ BlastProgram p,
+ BlastTabularSpec h>
+inline int
+realMain(LambdaOptions const & options,
+ TOutFormat const & /**/,
+ BlastTabularSpecSelector<h> const &,
+ BlastProgramSelector<p> const &,
+ TRedAlph const & /**/,
+ TScoreScheme const & /**/,
+ TScoreExtension const & /**/)
+{
+ using TGlobalHolder = GlobalDataHolder<TRedAlph,
+ TScoreScheme,
+ TIndexSpec,
+ TOutFormat,
+ p,
+ h>;
+ using TLocalHolder = LocalDataHolder<Match, TGlobalHolder, TScoreExtension>;
+
+ myPrint(options, 1, "LAMBDA - the Local Aligner for Massive Biological DatA"
+ "\n======================================================"
+ "\nVersion ", SEQAN_APP_VERSION, "\n\n");
+
+ if (options.verbosity >= 2)
+ printOptions<TLocalHolder>(options);
+
+ TGlobalHolder globalHolder;
+
+ int ret = prepareScoring(globalHolder, options);
+ if (ret)
+ return ret;
+
+ ret = loadSubjects(globalHolder, options);
+ if (ret)
+ return ret;
+
+ ret = loadDbIndexFromDisk(globalHolder, options);
+ if (ret)
+ return ret;
+
+ ret = loadSegintervals(globalHolder, options);
+ if (ret)
+ return ret;
+
+ ret = loadQuery(globalHolder, options);
+ if (ret)
+ return ret;
+
+// std::cout << "1st Query:\n"
+// << front(globalHolder.qrySeqs) << "\n"
+// << front(globalHolder.redQrySeqs) << "\n";
+//
+// std::cout << "last Query:\n"
+// << back(globalHolder.qrySeqs) << "\n"
+// << back(globalHolder.redQrySeqs) << "\n";
+//
+// std::cout << "1st Subject:\n"
+// << front(globalHolder.subjSeqs) << "\n"
+// << front(globalHolder.redSubjSeqs) << "\n";
+//
+// std::cout << "last Subject:\n"
+// << back(globalHolder.subjSeqs) << "\n"
+// << back(globalHolder.redSubjSeqs) << "\n";
+
+ myWriteHeader(globalHolder, options);
+
+ if (options.doubleIndexing)
+ {
+ myPrint(options, 1,
+ "Searching ",
+ options.queryPart,
+ " blocks of query with ",
+ options.threads,
+ " threads...\n");
+ if ((options.isTerm) && (options.verbosity >= 1))
+ {
+ for (unsigned char i=0; i< options.threads+3; ++i)
+ std::cout << std::endl;
+ std::cout << "\033[" << options.threads+2 << "A";
+ }
+ } else
+ {
+ myPrint(options, 1, "Searching and extending hits on-line...progress:\n"
+ "0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%\n|");
+ }
+ double start = sysTime();
+
+ // at least a block for each thread on double-indexing,
+ // otherwise a block for each original query (contains 6 queries if
+ // translation is used)
+ uint64_t nBlocks = (options.doubleIndexing
+ ? options.queryPart
+ : length(globalHolder.qryIds));
+
+ uint64_t lastPercent = 0;
+
+ SEQAN_OMP_PRAGMA(parallel)
+ {
+ TLocalHolder localHolder(options, globalHolder);
+
+ SEQAN_OMP_PRAGMA(for schedule(dynamic))
+ for (uint64_t t = 0; t < nBlocks; ++t)
+ {
+ int res = 0;
+
+ localHolder.init(t);
+
+ // seed
+ res = generateSeeds(localHolder);
+ if (res)
+ continue;
+
+ if (options.doubleIndexing)
+ {
+ res = generateTrieOverSeeds(localHolder);
+ if (res)
+ continue;
+ }
+
+ // search
+ search(localHolder);
+
+ // sort
+ sortMatches(localHolder);
+
+ // extend
+ res = iterateMatches(localHolder);
+ if (res)
+ continue;
+
+
+ if ((!options.doubleIndexing) && (TID == 0) &&
+ (options.verbosity >= 1))
+ {
+ unsigned curPercent = ((t * 50) / nBlocks) * 2; // round to even
+ printProgressBar(lastPercent, curPercent);
+ }
+
+ } // implicit thread sync here
+
+ if ((!options.doubleIndexing) && (TID == 0) && (options.verbosity >= 1))
+ printProgressBar(lastPercent, 100);
+
+ SEQAN_OMP_PRAGMA(critical(statsAdd))
+ {
+ globalHolder.stats += localHolder.stats;
+ }
+ }
+
+ if (ret)
+ return ret;
+
+ myWriteFooter(globalHolder, options);
+
+ if (!options.doubleIndexing)
+ {
+ myPrint(options, 2, "Runtime: ", sysTime() - start, "s.\n\n");
+ }
+
+ printStats(globalHolder.stats, options);
+
+ return 0;
+}
diff --git a/src/lambda.hpp b/src/lambda.hpp
new file mode 100644
index 0000000..b8ad55c
--- /dev/null
+++ b/src/lambda.hpp
@@ -0,0 +1,1495 @@
+// ==========================================================================
+// lambda
+// ==========================================================================
+// Copyright (c) 2013-2015, Hannes Hauswedell, FU Berlin
+// All rights reserved.
+//
+// This file is part of Lambda.
+//
+// Lambda is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// Lambda is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with Lambda. If not, see <http://www.gnu.org/licenses/>.
+// ==========================================================================
+// Author: Hannes Hauswedell <hannes.hauswedell @ fu-berlin.de>
+// ==========================================================================
+// lambda.hpp: contains the main progam pipeline
+// ==========================================================================
+
+
+#ifndef SEQAN_LAMBDA_LAMBDA_H_
+#define SEQAN_LAMBDA_LAMBDA_H_
+
+#include <type_traits>
+#include <iomanip>
+
+#include <seqan/basic.h>
+#include <seqan/sequence.h>
+#include <seqan/seq_io.h>
+#include <seqan/misc/terminal.h>
+
+#include <seqan/index.h>
+
+#include <seqan/translation.h>
+#include <seqan/reduced_aminoacid.h>
+
+#include <seqan/align_extend.h>
+
+#include "options.hpp"
+#include "match.hpp"
+#include "misc.hpp"
+#include "alph.hpp"
+#include "holders.hpp"
+#include "output.hpp"
+
+using namespace seqan;
+
+enum COMPUTERESULT_
+{
+ SUCCESS = 0,
+ PREEXTEND,
+ PERCENTIDENT,
+ EVALUE,
+ OTHER_FAIL
+};
+
+//TODO replace with lambda
+// comparison operator to sort SA-Values based on the strings in the SA they refer to
+template <typename TSav, typename TStringSet>
+struct Comp :
+ public ::std::binary_function < TSav, TSav, bool >
+{
+ TStringSet const & stringSet;
+
+ Comp(TStringSet const & _stringSet)
+ : stringSet(_stringSet)
+ {}
+
+ inline bool operator() (TSav const & i, TSav const & j) const
+ {
+ return (value(stringSet,getSeqNo(i)) < value(stringSet,getSeqNo(j)));
+ }
+};
+
+// ============================================================================
+// Functions
+// ============================================================================
+
+// --------------------------------------------------------------------------
+// Function prepareScoring()
+// --------------------------------------------------------------------------
+
+template <BlastTabularSpec h,
+ BlastProgram p,
+ typename TRedAlph,
+ typename TScoreScheme,
+ typename TIndexSpec,
+ typename TOutFormat>
+inline void
+prepareScoringMore(GlobalDataHolder<TRedAlph, TScoreScheme,TIndexSpec, TOutFormat, p, h> & globalHolder,
+ LambdaOptions const & options,
+ std::true_type const & /**/)
+{
+ setScoreMatch(context(globalHolder.outfile).scoringScheme, options.match);
+ setScoreMismatch(context(globalHolder.outfile).scoringScheme, options.misMatch);
+}
+
+template <BlastTabularSpec h,
+ BlastProgram p,
+ typename TRedAlph,
+ typename TScoreScheme,
+ typename TIndexSpec,
+ typename TOutFormat>
+inline void
+prepareScoringMore(GlobalDataHolder<TRedAlph, TScoreScheme, TIndexSpec, TOutFormat, p, h> & /*globalHolder*/,
+ LambdaOptions const & /*options*/,
+ std::false_type const & /**/)
+{
+}
+
+template <BlastTabularSpec h,
+ BlastProgram p,
+ typename TRedAlph,
+ typename TScoreScheme,
+ typename TIndexSpec,
+ typename TOutFormat>
+inline int
+prepareScoring(GlobalDataHolder<TRedAlph, TScoreScheme, TIndexSpec, TOutFormat, p, h> & globalHolder,
+ LambdaOptions const & options)
+{
+ setScoreGapOpenBlast(context(globalHolder.outfile).scoringScheme, options.gapOpen);
+ setScoreGapExtend(context(globalHolder.outfile).scoringScheme, options.gapExtend);
+
+ prepareScoringMore(globalHolder, options, std::is_same<TScoreScheme, Score<int, Simple>>());
+
+ if (!isValid(context(globalHolder.outfile).scoringScheme))
+ {
+ std::cerr << "Could not computer Karlin-Altschul-Values for "
+ << "Scoring Scheme. Exiting.\n";
+ return -1;
+ }
+ return 0;
+}
+
+// --------------------------------------------------------------------------
+// Function loadSubjects()
+// --------------------------------------------------------------------------
+
+template <BlastTabularSpec h,
+ BlastProgram p,
+ typename TRedAlph,
+ typename TScoreScheme,
+ typename TIndexSpec,
+ typename TOutFormat>
+inline int
+loadSubjects(GlobalDataHolder<TRedAlph, TScoreScheme, TIndexSpec, TOutFormat, p, h> & globalHolder,
+ LambdaOptions const & options)
+{
+ using TGH = GlobalDataHolder<TRedAlph, TScoreScheme, TIndexSpec, TOutFormat, p, h>;
+
+ double start, finish;
+ std::string strIdent;
+ int ret;
+ CharString _dbSeqs;
+
+ if (TGH::indexIsFM || TGH::alphReduction) // otherwise sequences are loaded as part of index
+ {
+ start = sysTime();
+ strIdent = "Loading Subj Sequences...";
+ myPrint(options, 1, strIdent);
+
+ _dbSeqs = options.dbFile;
+ append(_dbSeqs, ".");
+ append(_dbSeqs, _alphName(TransAlph<p>()));
+
+ ret = open(globalHolder.subjSeqs, toCString(_dbSeqs), OPEN_RDONLY);
+ if (ret != true)
+ {
+ std::cerr << ((options.verbosity == 0) ? strIdent : std::string())
+ << " failed.\n";
+ return 1;
+ }
+
+ if (TGH::alphReduction)
+ globalHolder.redSubjSeqs.limits = globalHolder.subjSeqs.limits;
+
+ finish = sysTime() - start;
+ myPrint(options, 1, " done.\n");
+ myPrint(options, 2, "Runtime: ", finish, "s \n", "Amount: ",
+ length(globalHolder.subjSeqs), "\n\n");
+ }
+
+ start = sysTime();
+ strIdent = "Loading Subj Ids...";
+ myPrint(options, 1, strIdent);
+
+ _dbSeqs = options.dbFile;
+ append(_dbSeqs, ".ids");
+ ret = open(globalHolder.subjIds, toCString(_dbSeqs), OPEN_RDONLY);
+ if (ret != true)
+ {
+ std::cerr << ((options.verbosity == 0) ? strIdent : std::string())
+ << " failed.\n";
+ return 1;
+ }
+ finish = sysTime() - start;
+ myPrint(options, 1, " done.\n");
+ myPrint(options, 2, "Runtime: ", finish, "s \n\n");
+
+ context(globalHolder.outfile).dbName = options.dbFile;
+
+ // if subjects where translated, we don't have the untranslated seqs at all
+ // but we still need the data for statistics and position un-translation
+ if (sIsTranslated(p))
+ {
+ start = sysTime();
+ std::string strIdent = "Loading Lengths of untranslated Subj sequences...";
+ myPrint(options, 1, strIdent);
+
+ _dbSeqs = options.dbFile;
+ append(_dbSeqs, ".untranslengths");
+ ret = open(globalHolder.untransSubjSeqLengths, toCString(_dbSeqs), OPEN_RDONLY);
+ if (ret != true)
+ {
+ std::cerr << ((options.verbosity == 0) ? strIdent : std::string())
+ << " failed.\n";
+ return 1;
+ }
+
+ finish = sysTime() - start;
+ myPrint(options, 1, " done.\n");
+ myPrint(options, 2, "Runtime: ", finish, "s \n\n");
+ }
+
+ return 0;
+}
+
+// --------------------------------------------------------------------------
+// Function loadIndexFromDisk()
+// --------------------------------------------------------------------------
+
+template <typename TGlobalHolder>
+inline int
+loadDbIndexFromDisk(TGlobalHolder & globalHolder,
+ LambdaOptions const & options)
+{
+ std::string strIdent = "Loading Database Index...";
+ myPrint(options, 1, strIdent);
+ double start = sysTime();
+ std::string path = toCString(options.dbFile);
+ path += '.' + std::string(_alphName(typename TGlobalHolder::TRedAlph()));
+ if (TGlobalHolder::indexIsFM)
+ path += ".fm";
+ else
+ path += ".sa";
+
+ // Check if the index is of the old format (pre 0.9.0) by looking for different files
+ if ((globalHolder.blastProgram != BlastProgram::BLASTN) && // BLASTN indexes are compatible
+ ((TGlobalHolder::alphReduction && fileExists(toCString(path + ".txt.concat"))) ||
+ (!TGlobalHolder::alphReduction && TGlobalHolder::indexIsFM && !fileExists(toCString(path + ".lf.drv.wtc.24")))))
+ {
+ std::cerr << ((options.verbosity == 0) ? strIdent : std::string())
+ << " failed.\n"
+ << "It appears you tried to open an old index (created before 0.9.0) which "
+ << "is not supported. Please remove the old files and create a new index with lambda_indexer!\n";
+ return 200;
+ }
+
+ int ret = open(globalHolder.dbIndex, path.c_str(), OPEN_RDONLY);
+ if (ret != true)
+ {
+ std::cerr << ((options.verbosity == 0) ? strIdent : std::string())
+ << " failed. "
+ << "Did you use the same options as with lambda_indexer?\n";
+ return 1;
+ }
+
+ // assign previously loaded sub sequences (possibly modifier-wrapped
+ // to the text-member of our new index (unless isFM, which doesnt need text)
+ if ((!TGlobalHolder::indexIsFM) && (TGlobalHolder::alphReduction))
+ indexText(globalHolder.dbIndex) = globalHolder.redSubjSeqs;
+
+ double finish = sysTime() - start;
+ myPrint(options, 1, " done.\n");
+ myPrint(options, 2, "Runtime: ", finish, "s \n", "No of Fibres: ",
+ length(indexSA(globalHolder.dbIndex)), "\n\n");
+
+ // this is actually part of prepareScoring(), but the values are just available now
+ if (sIsTranslated(globalHolder.blastProgram ))
+ {
+ // last value has sum of lengths
+ context(globalHolder.outfile).dbTotalLength = back(globalHolder.untransSubjSeqLengths);
+ context(globalHolder.outfile).dbNumberOfSeqs = length(globalHolder.untransSubjSeqLengths) - 1;
+ } else
+ {
+ context(globalHolder.outfile).dbTotalLength = length(concat(globalHolder.subjSeqs));
+ context(globalHolder.outfile).dbNumberOfSeqs = length(globalHolder.subjSeqs);
+ }
+
+ return 0;
+}
+
+// --------------------------------------------------------------------------
+// Function loadSegintervals()
+// --------------------------------------------------------------------------
+
+template <BlastTabularSpec h,
+ BlastProgram p,
+ typename TRedAlph,
+ typename TScoreScheme,
+ typename TIndexSpec,
+ typename TOutFormat>
+inline int
+loadSegintervals(GlobalDataHolder<TRedAlph, TScoreScheme, TIndexSpec, TOutFormat, p, h> & globalHolder,
+ LambdaOptions const & options)
+{
+
+ double start = sysTime();
+ std::string strIdent = "Loading Database Masking file...";
+ myPrint(options, 1, strIdent);
+
+ CharString segFileS = options.dbFile;
+ append(segFileS, ".binseg_s.concat");
+ CharString segFileE = options.dbFile;
+ append(segFileE, ".binseg_e.concat");
+ bool fail = false;
+ struct stat buffer;
+ // file exists
+ if ((stat(toCString(segFileS), &buffer) == 0) &&
+ (stat(toCString(segFileE), &buffer) == 0))
+ {
+ //cut off ".concat" again
+ resize(segFileS, length(segFileS) - 7);
+ resize(segFileE, length(segFileE) - 7);
+
+ fail = !open(globalHolder.segIntStarts, toCString(segFileS), OPEN_RDONLY);
+ if (!fail)
+ fail = !open(globalHolder.segIntEnds, toCString(segFileE), OPEN_RDONLY);
+ } else
+ {
+ fail = true;
+ }
+
+ if (fail)
+ {
+ std::cerr << ((options.verbosity == 0) ? strIdent : std::string())
+ << " failed.\n";
+ return 1;
+ }
+
+ double finish = sysTime() - start;
+ myPrint(options, 1, " done.\n");
+ myPrint(options, 2, "Runtime: ", finish, "s \n\n");
+ return 0;
+}
+
+// --------------------------------------------------------------------------
+// Function loadQuery()
+// --------------------------------------------------------------------------
+
+// BLASTX, TBLASTX
+template <typename TSourceAlph, typename TSpec1,
+ typename TTargetAlph, typename TSpec2,
+ typename TUntransLengths,
+ MyEnableIf<!std::is_same<TSourceAlph, TTargetAlph>::value> = 0>
+inline void
+loadQueryImplTrans(TCDStringSet<String<TTargetAlph, TSpec1>> & target,
+ TCDStringSet<String<TSourceAlph, TSpec2>> & source,
+ TUntransLengths & untransQrySeqLengths,
+ LambdaOptions const & options)
+{
+ myPrint(options, 1, "translating...");
+ // translate
+ translate(target,
+ source,
+ SIX_FRAME,
+ options.geneticCode);
+
+ // preserve lengths of untranslated sequences
+ resize(untransQrySeqLengths,
+ length(source.limits),
+ Exact());
+
+ SEQAN_OMP_PRAGMA(parallel for simd)
+ for (uint32_t i = 0; i < (length(untransQrySeqLengths) - 1); ++i)
+ untransQrySeqLengths[i] = source.limits[i + 1] - source.limits[i];
+
+ // save sum of lengths (both strings have n + 1 elements
+ back(source.limits) = length(source.concat);
+}
+
+// BLASTN
+template <typename TSpec1,
+ typename TSpec2,
+ typename TUntransLengths>
+inline void
+loadQueryImplTrans(TCDStringSet<String<TransAlph<BlastProgram::BLASTN>, TSpec1>> & target,
+ TCDStringSet<String<TransAlph<BlastProgram::BLASTN>, TSpec2>> & source,
+ TUntransLengths & /**/,
+ LambdaOptions const & options)
+{
+ using TAlph = TransAlph<BlastProgram::BLASTN>;
+// using TReverseCompl = ModifiedString<ModifiedString<String<TAlph>,
+// ModView<FunctorComplement<TAlph>>>, ModReverse>;
+ myPrint(options, 1, " generating reverse complements...");
+ // no need for translation, but we need reverse complements
+ resize(target.concat, length(source.concat) * 2);
+ resize(target.limits, length(source) * 2 + 1);
+
+ target.limits[0] = 0;
+ uint64_t const l = length(target.limits) - 1;
+ for (uint64_t i = 1; i < l; i+=2)
+ {
+ target.limits[i] = target.limits[i-1] + length(source[i/2]);
+ target.limits[i+1] = target.limits[i] + length(source[i/2]);
+ }
+
+ FunctorComplement<TAlph> functor;
+ uint64_t tBeg, tBegNext, len, sBeg;
+ SEQAN_OMP_PRAGMA(parallel for schedule(dynamic) private(tBeg, tBegNext, len, sBeg))
+ for (uint64_t i = 0; i < (l - 1); i+=2)
+ {
+ tBeg = target.limits[i];
+ tBegNext = target.limits[i+1];
+ len = tBegNext - tBeg;
+ sBeg = source.limits[i/2];
+ // avoid senseless copying and iterate manually
+ for (uint32_t j = 0; j < len; ++j)
+ {
+ target.concat[tBeg + j] = source.concat[sBeg + j];
+ target.concat[tBegNext + j] = functor(source.concat[sBeg+len-j-1]);
+ }
+ }
+}
+
+// BLASTP, TBLASTN
+template <typename TSpec1,
+ typename TSpec2,
+ typename TUntransLengths>
+inline void
+loadQueryImplTrans(TCDStringSet<String<TransAlph<BlastProgram::BLASTP>, TSpec1>> & target,
+ TCDStringSet<String<TransAlph<BlastProgram::BLASTP>, TSpec2>> & source,
+ TUntransLengths & /**/,
+ LambdaOptions const & /**/)
+{
+ // no need for translation, but sequences have to be in right place
+ swap(target, source);
+}
+
+template <BlastTabularSpec h,
+ BlastProgram p,
+ typename TRedAlph,
+ typename TScoreScheme,
+ typename TIndexSpec,
+ typename TOutFormat>
+inline int
+loadQuery(GlobalDataHolder<TRedAlph, TScoreScheme, TIndexSpec, TOutFormat, p, h> & globalHolder,
+ LambdaOptions const & options)
+{
+ using TGH = GlobalDataHolder<TRedAlph, TScoreScheme, TIndexSpec, TOutFormat, p, h>;
+ double start = sysTime();
+
+ std::string strIdent = "Loading Query Sequences and Ids...";
+ myPrint(options, 1, strIdent);
+
+ TCDStringSet<String<OrigQryAlph<p>, typename TGH::TQryTag>> origSeqs;
+
+// std::cout << "FOO " << toCString(options.queryFile) << " BAR" << std::endl;
+ try
+ {
+ SeqFileIn infile(toCString(options.queryFile));
+ int ret = myReadRecords(globalHolder.qryIds, origSeqs, infile);
+ if (ret)
+ return ret;
+ }
+ catch(IOError const & e)
+ {
+ std::cerr << "\nIOError thrown: " << e.what() << '\n'
+ << "Could not read the query file, make sure it exists and is readable.\n";
+ return -1;
+ }
+
+ // translate
+ loadQueryImplTrans(globalHolder.qrySeqs,
+ origSeqs,
+ globalHolder.untransQrySeqLengths,
+ options);
+
+ // sam and bam need original sequences if translation happened
+ if (qIsTranslated(globalHolder.blastProgram) && (options.outFileFormat > 0) &&
+ (options.samBamSeq > 0))
+ std::swap(origSeqs, globalHolder.untranslatedQrySeqs);
+
+ if (TGH::alphReduction)
+ globalHolder.redQrySeqs.limits = globalHolder.qrySeqs.limits;
+
+ double finish = sysTime() - start;
+ myPrint(options, 1, " done.\n");
+
+ if (options.verbosity >= 2)
+ {
+ unsigned long maxLen = 0ul;
+ for (auto const & s : globalHolder.qrySeqs)
+ if (length(s) > maxLen)
+ maxLen = length(s);
+ myPrint(options, 2, "Runtime: ", finish, "s \n",
+ "Number of effective query sequences: ",
+ length(globalHolder.qrySeqs), "\nLongest query sequence: ",
+ maxLen, "\n\n");
+ }
+ return 0;
+}
+
+/// THREAD LOCAL STUFF
+
+// --------------------------------------------------------------------------
+// Function generateSeeds()
+// --------------------------------------------------------------------------
+/*
+#define THREADLINE std::cout << "\0338" << std::endl << "Thread " << lH.i \
+<< std::endl; \
+for (unsigned char i=0; i< lH.i*20; ++i) std::cout << std::endl;*/
+
+template <typename TLocalHolder>
+inline int
+generateSeeds(TLocalHolder & lH)
+{
+ if (lH.options.doubleIndexing)
+ {
+ appendToStatus(lH.statusStr, lH.options, 1, "Block ", std::setw(4),
+ lH.i, ": Generating Seeds...");
+ if (lH.options.isTerm)
+ myPrint(lH.options, 1, lH.statusStr);
+ }
+
+ double start = sysTime();
+ for (unsigned long i = lH.indexBeginQry; i < lH.indexEndQry; ++i)
+ {
+ for (unsigned j = 0;
+ (j* lH.options.seedOffset + lH.options.seedLength)
+ <= length(value(lH.gH.redQrySeqs, i));
+ ++j)
+ {
+ appendValue(lH.seeds, infix(value(lH.gH.redQrySeqs, i),
+ j* lH.options.seedOffset,
+ j* lH.options.seedOffset
+ + lH.options.seedLength),
+ Generous());
+ appendValue(lH.seedRefs, i, Generous());
+ appendValue(lH.seedRanks, j, Generous());
+
+// std::cout << "seed: " << back(lH.seeds) << "\n";
+ }
+ }
+ double finish = sysTime() - start;
+ if (lH.options.doubleIndexing)
+ {
+ appendToStatus(lH.statusStr, lH.options, 1, " done. ");
+ appendToStatus(lH.statusStr, lH.options, 2, finish, "s. ",
+ length(lH.seeds), " seeds created.");
+
+ myPrint(lH.options, 1, lH.statusStr);
+ }
+ return 0;
+}
+
+
+// --------------------------------------------------------------------------
+// Function generateTrieOverSeeds()
+// --------------------------------------------------------------------------
+
+
+template <typename TLocalHolder>
+inline int
+generateTrieOverSeeds(TLocalHolder & lH)
+{
+ if (lH.options.doubleIndexing)
+ {
+ appendToStatus(lH.statusStr, lH.options, 1, "Generating Query-Index...");
+ if (lH.options.isTerm)
+ myPrint(lH.options, 1, lH.statusStr);
+ }
+
+ double start = sysTime();
+ // we only want full length seed sequences in index, build up manually
+ typedef typename Fibre<typename TLocalHolder::TSeedIndex, EsaSA>::Type TSa;
+ //TODO maybe swap here instead
+ lH.seedIndex = decltype(lH.seedIndex)(lH.seeds);
+ TSa & sa = indexSA(lH.seedIndex);
+ resize(sa, length(lH.seeds));
+ for (unsigned u = 0; u < length(lH.seeds); ++u)
+ {
+ assignValueI1(value(sa,u), u);
+ assignValueI2(value(sa,u), 0);
+ }
+ Comp<typename Value<TSa>::Type, typename TLocalHolder::TSeeds const> comp(lH.seeds);
+ std::sort(begin(sa, Standard()), end(sa, Standard()), comp);
+ typename Iterator<typename TLocalHolder::TSeedIndex, TopDown<> >::Type it(lH.seedIndex); // instantiate
+ double finish = sysTime() - start;
+
+ if (lH.options.doubleIndexing)
+ {
+ appendToStatus(lH.statusStr, lH.options, 1, " done. ");
+ appendToStatus(lH.statusStr, lH.options, 2, finish, "s. ",
+ length(sa), " fibres in SeedIndex. ");;
+ myPrint(lH.options, 1, lH.statusStr);
+ }
+
+ return 0;
+}
+
+// --------------------------------------------------------------------------
+// Function search()
+// --------------------------------------------------------------------------
+
+template <typename BackSpec, typename TLocalHolder>
+inline void
+__searchDoubleIndex(TLocalHolder & lH)
+{
+ appendToStatus(lH.statusStr, lH.options, 1, "Seeding...");
+ if (lH.options.isTerm)
+ myPrint(lH.options, 1, lH.statusStr);
+
+ double start = sysTime();
+
+ using LambdaFinder = Finder_<decltype(lH.gH.dbIndex),
+ decltype(lH.seedIndex),
+ typename std::conditional<std::is_same<BackSpec, Backtracking<Exact>>::value,
+ Backtracking<HammingDistance>,
+ BackSpec>::type >;
+
+ LambdaFinder finder;
+
+ auto delegate = [&lH] (LambdaFinder const & finder)
+ {
+ onFindDoubleIndex(lH, finder);
+ };
+
+ _find(finder, lH.gH.dbIndex, lH.seedIndex, lH.options.maxSeedDist, delegate);
+
+ double finish = sysTime() - start;
+
+ appendToStatus(lH.statusStr, lH.options, 1, " done. ");
+ appendToStatus(lH.statusStr, lH.options, 2, finish, "s. #hits: ",
+ length(lH.matches), " ");
+ myPrint(lH.options, 1, lH.statusStr);
+}
+
+template <typename BackSpec, typename TLocalHolder>
+inline void
+__searchSingleIndex(TLocalHolder & lH)
+{
+ typedef typename Iterator<decltype(lH.seeds) const, Rooted>::Type TSeedsIt;
+ typedef typename Iterator<decltype(lH.gH.dbIndex),TopDown<>>::Type TIndexIt;
+
+// SEQAN_OMP_PRAGMA(critical(stdout))
+// {
+// std::cout << "ReadId: " << lH.i << std::endl;
+// for (auto const & seed : lH.seeds)
+// std::cout << "\"" << toCString(CharString(seed)) << "\" ";
+// std::cout << std::endl;
+// }
+ auto delegate = [&lH] (TIndexIt & indexIt,
+ TSeedsIt const & seedsIt,
+ int /*score*/)
+ {
+ onFindSingleIndex(lH, seedsIt, indexIt);
+ };
+
+ find(lH.gH.dbIndex, lH.seeds, int(lH.options.maxSeedDist), delegate,
+ Backtracking<BackSpec>());
+}
+
+template <typename BackSpec, typename TLocalHolder>
+inline void
+__search(TLocalHolder & lH)
+{
+ if (lH.options.doubleIndexing)
+ __searchDoubleIndex<BackSpec>(lH);
+ else
+ __searchSingleIndex<BackSpec>(lH);
+}
+
+template <typename TLocalHolder>
+inline void
+search(TLocalHolder & lH)
+{
+ if (lH.options.maxSeedDist == 0)
+ __search<Backtracking<Exact>>(lH);
+ else if (lH.options.hammingOnly)
+ __search<Backtracking<HammingDistance>>(lH);
+ else
+#if 0 // reactivate if edit-distance seeding is readded
+ __search<Backtracking<EditDistance>>(lH);
+#else
+ return;
+#endif
+}
+
+// --------------------------------------------------------------------------
+// Function joinAndFilterMatches()
+// --------------------------------------------------------------------------
+
+
+template <typename TLocalHolder>
+inline void
+sortMatches(TLocalHolder & lH)
+{
+ if (lH.options.doubleIndexing)
+ {
+ appendToStatus(lH.statusStr, lH.options, 1, "Sorting hits...");
+ if (lH.options.isTerm)
+ myPrint(lH.options, 1, lH.statusStr);
+ }
+
+ double start = sysTime();
+
+// std::sort(begin(lH.matches, Standard()), end(lH.matches, Standard()));
+// std::sort(lH.matches.begin(), lH.matches.end());
+
+// if (lH.matches.size() > lH.options.maxMatches)
+// {
+// MatchSortComp comp(lH.matches);
+// std::sort(lH.matches.begin(), lH.matches.end(), comp);
+// } else
+
+ if ((lH.options.filterPutativeAbundant) &&
+ (lH.matches.size() > lH.options.maxMatches))
+ // more expensive sort to get likely targets to front
+ myHyperSortSingleIndex(lH.matches, lH.options, lH.gH);
+ else
+ std::sort(lH.matches.begin(), lH.matches.end());
+
+ double finish = sysTime() - start;
+
+ if (lH.options.doubleIndexing)
+ {
+ appendToStatus(lH.statusStr, lH.options, 1, " done. ");
+ appendToStatus(lH.statusStr, lH.options, 2, finish, "s. ");
+ myPrint(lH.options, 1, lH.statusStr);
+ }
+}
+
+template <typename TBlastMatch,
+ typename TLocalHolder>
+inline int
+computeBlastMatch(TBlastMatch & bm,
+ Match const & m,
+ TLocalHolder & lH)
+{
+ const unsigned long qryLength = length(value(lH.gH.qrySeqs, m.qryId));
+
+ SEQAN_ASSERT_LEQ(bm.qStart, bm.qEnd);
+ SEQAN_ASSERT_LEQ(bm.sStart, bm.sEnd);
+
+// auto qryInfix = infix(curQry,
+// bm.qStart,
+// bm.qEnd);
+// auto subjInfix = infix(curSubj,
+// bm.sStart,
+// bm.sEnd);
+
+// std::cout << "Query Id: " << m.qryId
+// << "\t TrueQryId: " << getTrueQryId(bm.m, lH.options, lH.gH.blastProgram)
+// << "\t length(qryIds): " << length(qryIds)
+// << "Subj Id: " << m.subjId
+// << "\t TrueSubjId: " << getTrueSubjId(bm.m, lH.options, lH.gH.blastProgram)
+// << "\t length(subjIds): " << length(subjIds) << "\n\n";
+
+ assignSource(bm.alignRow0, infix(lH.gH.qrySeqs[m.qryId], bm.qStart, bm.qEnd));
+ assignSource(bm.alignRow1, infix(lH.gH.subjSeqs[m.subjId],bm.sStart, bm.sEnd));
+
+// std::cout << "== Positions\n";
+// std::cout << " " << bm.qStart << " - " << bm.qEnd << " [before ali]\n";
+// std::cout << bm.align << std::endl;
+
+ int scr = 0;
+
+// unsigned short seedLeng = 0;
+// double seedE = 0;
+// double seedB = 0;
+
+ unsigned row0len = bm.qEnd - bm.qStart;
+ unsigned row1len = bm.sEnd - bm.sStart;
+ unsigned short band = (!lH.options.hammingOnly) * (lH.options.maxSeedDist);
+
+// // TODO FIGURE THIS OUT
+// if ((row0len > (lH.options.seedLength + band)) ||
+// (row1len > (lH.options.seedLength + band)))
+// {
+// #pragma omp atomic
+// ++mergeCount;
+// std::cout << "qrId " << m.qryId << "\tsId: " << m.subjId << "\trow0len: " << row0len << "\trow1len: " << row1len << "\n";
+// std::cout << source(row0) << "\n";
+// std::cout << source(row1) << "\n";
+// }
+
+ auto seedsInSeed = std::max(row0len, row1len) / lH.options.seedLength;
+
+ unsigned short maxDist = 0;
+ if (lH.options.maxSeedDist <= 1)
+ maxDist = std::abs(int(row1len) - int(row0len));
+ else
+ maxDist = std::abs(int(row1len) - int(row0len)) + (seedsInSeed * band);
+
+ // fast local alignment without DP-stuff
+ if (maxDist == 0)
+ {
+ int scores[row0len+1]; // C99, C++14, -Wno-vla before that
+ scores[0] = 0;
+ unsigned newEnd = 0;
+ unsigned newBeg = 0;
+ // score the diagonal
+ for (unsigned i = 0; i < row0len; ++i)
+ {
+ scores[i] += score(seqanScheme(context(lH.gH.outfile).scoringScheme),
+ source(bm.alignRow0)[i],
+ source(bm.alignRow1)[i]);
+ if (scores[i] < 0)
+ {
+ scores[i] = 0;
+ } else if (scores[i] >= scr)
+ {
+ scr = scores[i];
+ newEnd = i + 1;
+ }
+// if (i <row0len -1)
+ scores[i+1] = scores[i];
+ }
+ if (newEnd == 0) // no local alignment
+ {
+ return OTHER_FAIL; // TODO change to PREEXTEND?
+ }
+ // backtrack
+ for (unsigned i = newEnd - 1; i > 0; --i)
+ {
+ if (scores[i] == 0)
+ {
+ newBeg = i + 1;
+ break;
+ }
+ }
+ setEndPosition(bm.alignRow0, newEnd);
+ setEndPosition(bm.alignRow1, newEnd);
+ setBeginPosition(bm.alignRow0, newBeg);
+ setBeginPosition(bm.alignRow1, newBeg);
+
+ } else
+ {
+ // compute with DP-code
+// scr = localAlignment(bm.align, seqanScheme(context(lH.gH.outfile).scoringScheme), -maxDist, +maxDist);
+ scr = localAlignment2(bm.alignRow0,
+ bm.alignRow1,
+ seqanScheme(context(lH.gH.outfile).scoringScheme),
+ -maxDist,
+ +maxDist,
+ lH.alignContext);
+ }
+
+ // save new bounds of alignment
+ bm.qEnd = bm.qStart + endPosition(bm.alignRow0);
+ bm.qStart += beginPosition(bm.alignRow0);
+
+ bm.sEnd = bm.sStart + endPosition(bm.alignRow1);
+ bm.sStart += beginPosition(bm.alignRow1);
+
+// if (scr < lH.options.minSeedScore)
+// return PREEXTEND;
+
+#if 0
+// OLD WAY extension with birte's code
+ {
+ // std::cout << " " << bm.qStart << " - " << bm.qEnd << " [after ali]\n";
+ // std::cout << bm.align << std::endl;
+ decltype(seqanScheme(context(lH.gH.outfile).scoringScheme)) extScheme(seqanScheme(context(lH.gH.outfile).scoringScheme));
+ setScoreGapOpen (extScheme, -8);
+ setScoreGapExtend(extScheme, -8);
+
+ Seed<Simple> seed(bm.sStart, bm.qStart,
+ bm.sEnd, bm.qEnd);
+ extendSeed(seed,
+ curSubj,
+ curQry,
+ EXTEND_BOTH,
+ extScheme,
+// seqanScheme(context(lH.gH.outfile).scoringScheme),
+ int(lH.options.xDropOff),
+ GappedXDrop());
+
+ bm.sStart = beginPositionH(seed);
+ bm.qStart = beginPositionV(seed);
+ bm.sEnd = endPositionH(seed);
+ bm.qEnd = endPositionV(seed);
+
+ assignSource(row0, infix(curQry,
+ bm.qStart,
+ bm.qEnd));
+ assignSource(row1, infix(curSubj,
+ bm.sStart,
+ bm.sEnd));
+
+ //DEBUG
+ auto oldscr = scr;
+
+ scr = localAlignment(bm.align,
+ seqanScheme(context(lH.gH.outfile).scoringScheme),
+// alignConfig,
+ lowerDiagonal(seed)-beginDiagonal(seed),
+ upperDiagonal(seed)-beginDiagonal(seed));
+ // save new bounds of alignment
+ bm.qEnd = bm.qStart + endPosition(row0);
+ bm.qStart += beginPosition(row0);
+
+ bm.sEnd = bm.sStart + endPosition(row1);
+ bm.sStart += beginPosition(row1);
+
+ if (scr < 0) // alignment got screwed up
+ {
+ std::cout << "SCREW UP\n";
+ std::cout << "beginDiag: " << beginDiagonal(seed)
+ << "\tlowDiag: " << lowerDiagonal(seed)
+ << "\tupDiag: " << upperDiagonal(seed) << '\n';
+ std::cout << "oldscore: " << oldscr
+ << "\tseedscore: " << score(seed)
+ << "\tscore: " << scr << '\n';
+ std::cout << bm.align << '\n';
+ }
+ }
+#endif
+
+#if 0
+ // ungapped second prealign
+ {
+ Tuple<decltype(bm.qStart), 4> positions =
+ { { bm.qStart, bm.sStart, bm.qEnd, bm.sEnd} };
+
+ decltype(seqanScheme(context(lH.gH.outfile).scoringScheme)) extScheme(seqanScheme(context(lH.gH.outfile).scoringScheme));
+ setScoreGapOpen (extScheme, -100);
+ setScoreGapExtend(extScheme, -100);
+ scr = extendAlignment(bm.align,
+ lH.alignContext,
+ scr,
+ curQry,
+ curSubj,
+ positions,
+ EXTEND_BOTH,
+ 0, // band of 0 size
+ 0, // band of 0 size
+ 1, // xdrop of 1
+ extScheme);
+ bm.qStart = beginPosition(row0);
+ bm.qEnd = endPosition(row0);
+
+ bm.sStart = beginPosition(row1);
+ bm.sEnd = endPosition(row1);
+ }
+#endif
+ if (((bm.qStart > 0) && (bm.sStart > 0)) ||
+ ((bm.qEnd < qryLength - 1) && (bm.sEnd < length(lH.gH.subjSeqs[m.subjId]) -1)))
+ {
+ // we want to allow more gaps in longer query sequences
+ switch (lH.options.band)
+ {
+ case -3: maxDist = ceil(log2(qryLength)); break;
+ case -2: maxDist = floor(sqrt(qryLength)); break;
+ case -1: break;
+ default: maxDist = lH.options.band; break;
+ }
+
+ Tuple<decltype(bm.qStart), 4> positions =
+ { { bm.qStart, bm.sStart, bm.qEnd, bm.sEnd} };
+
+ if (lH.options.band != -1)
+ {
+ if (lH.options.xDropOff != -1)
+ {
+ scr = _extendAlignmentImpl(bm.alignRow0,
+ bm.alignRow1,
+ scr,
+ lH.gH.qrySeqs[m.qryId],
+ lH.gH.subjSeqs[m.subjId],
+ positions,
+ EXTEND_BOTH,
+ -maxDist,
+ +maxDist,
+ lH.options.xDropOff,
+ seqanScheme(context(lH.gH.outfile).scoringScheme),
+ True(),
+ True(),
+ lH.alignContext);
+ } else
+ {
+ scr = _extendAlignmentImpl(bm.alignRow0,
+ bm.alignRow1,
+ scr,
+ lH.gH.qrySeqs[m.qryId],
+ lH.gH.subjSeqs[m.subjId],
+ positions,
+ EXTEND_BOTH,
+ -maxDist,
+ +maxDist,
+ lH.options.xDropOff,
+ seqanScheme(context(lH.gH.outfile).scoringScheme),
+ True(),
+ False(),
+ lH.alignContext);
+ }
+ } else
+ {
+ if (lH.options.xDropOff != -1)
+ {
+ scr = _extendAlignmentImpl(bm.alignRow0,
+ bm.alignRow1,
+ scr,
+ lH.gH.qrySeqs[m.qryId],
+ lH.gH.subjSeqs[m.subjId],
+ positions,
+ EXTEND_BOTH,
+ -maxDist,
+ +maxDist,
+ lH.options.xDropOff,
+ seqanScheme(context(lH.gH.outfile).scoringScheme),
+ False(),
+ True(),
+ lH.alignContext);
+ } else
+ {
+ scr = _extendAlignmentImpl(bm.alignRow0,
+ bm.alignRow1,
+ scr,
+ lH.gH.qrySeqs[m.qryId],
+ lH.gH.subjSeqs[m.subjId],
+ positions,
+ EXTEND_BOTH,
+ -maxDist,
+ +maxDist,
+ lH.options.xDropOff,
+ seqanScheme(context(lH.gH.outfile).scoringScheme),
+ False(),
+ False(),
+ lH.alignContext);
+ }
+ }
+ bm.sStart = beginPosition(bm.alignRow1);
+ bm.qStart = beginPosition(bm.alignRow0);
+ bm.sEnd = endPosition(bm.alignRow1);
+ bm.qEnd = endPosition(bm.alignRow0);
+
+// std::cout << "AFTER:\n" << bm.align << "\n";
+ }
+
+// std::cerr << "AFTEREXT:\n "<< bm.align << "\n";
+
+ if (scr <= 0)
+ {
+// std::cout << "## LATE FAIL\n" << bm.align << '\n';
+
+ return OTHER_FAIL;
+ }
+// std::cout << "##LINE: " << __LINE__ << '\n';
+
+// std::cout << "ALIGN BEFORE STATS:\n" << bm.align << "\n";
+
+ computeAlignmentStats(bm, context(lH.gH.outfile));
+
+ if (bm.alignStats.alignmentIdentity < lH.options.idCutOff)
+ return PERCENTIDENT;
+
+// const unsigned long qryLength = length(row0);
+ computeBitScore(bm, context(lH.gH.outfile));
+
+ // the length adjustment cache must no be written to by multiple threads
+ SEQAN_OMP_PRAGMA(critical(evalue_length_adj_cache))
+ {
+ computeEValue(bm, context(lH.gH.outfile));
+ }
+
+ if (bm.eValue > lH.options.eCutOff)
+ {
+ return EVALUE;
+ }
+
+ if (qIsTranslated(TLocalHolder::TGlobalHolder::blastProgram))
+ {
+ bm.qFrameShift = (m.qryId % 3) + 1;
+ if (m.qryId % 6 > 2)
+ bm.qFrameShift = -bm.qFrameShift;
+ } else if (qHasRevComp(TLocalHolder::TGlobalHolder::blastProgram))
+ {
+ bm.qFrameShift = 1;
+ if (m.qryId % 2)
+ bm.qFrameShift = -bm.qFrameShift;
+ } else
+ {
+ bm.qFrameShift = 0;
+ }
+
+ if (sIsTranslated(TLocalHolder::TGlobalHolder::blastProgram))
+ {
+ bm.sFrameShift = (m.subjId % 3) + 1;
+ if (m.subjId % 6 > 2)
+ bm.sFrameShift = -bm.sFrameShift;
+ } else if (sHasRevComp(TLocalHolder::TGlobalHolder::blastProgram))
+ {
+ bm.sFrameShift = 1;
+ if (m.subjId % 2)
+ bm.sFrameShift = -bm.sFrameShift;
+ } else
+ {
+ bm.sFrameShift = 0;
+ }
+
+ return 0;
+}
+
+
+template <typename TLocalHolder>
+inline int
+iterateMatches(TLocalHolder & lH)
+{
+ using TGlobalHolder = typename TLocalHolder::TGlobalHolder;
+ using TPos = uint32_t; //typename Match::TPos;
+ using TBlastMatch = BlastMatch<
+ typename TLocalHolder::TAlignRow0,
+ typename TLocalHolder::TAlignRow1,
+ TPos,
+ typename Value<typename TGlobalHolder::TQryIds>::Type,// const &,
+ typename Value<typename TGlobalHolder::TSubjIds>::Type// const &,
+ >;
+ using TBlastRecord = BlastRecord<TBlastMatch>;
+
+// constexpr TPos TPosMax = std::numeric_limits<TPos>::max();
+// constexpr uint8_t qFactor = qHasRevComp(lH.gH.blastProgram) ? 3 : 1;
+// constexpr uint8_t sFactor = sHasRevComp(lH.gH.blastProgram) ? 3 : 1;
+
+ double start = sysTime();
+ if (lH.options.doubleIndexing)
+ {
+ appendToStatus(lH.statusStr, lH.options, 1,
+ "Extending and writing hits...");
+ myPrint(lH.options, 1, lH.statusStr);
+ }
+
+ //DEBUG
+// std::cout << "Length of matches: " << length(lH.matches);
+// for (auto const & m : lH.matches)
+// {
+// std::cout << m.qryId << "\t" << getTrueQryId(m,lH.options, lH.gH.blastProgram) << "\n";
+// }
+
+// double topMaxMatchesMedianBitScore = 0;
+ // outer loop over records
+ // (only one iteration if single indexing is used)
+ for (auto it = lH.matches.begin(),
+ itN = std::next(it, 1),
+ itEnd = lH.matches.end();
+ it != itEnd;
+ ++it)
+ {
+ itN = std::next(it,1);
+ auto const trueQryId = it->qryId / qNumFrames(lH.gH.blastProgram);
+
+ TBlastRecord record(lH.gH.qryIds[trueQryId]);
+
+ record.qLength = (qIsTranslated(lH.gH.blastProgram)
+ ? lH.gH.untransQrySeqLengths[trueQryId]
+ : length(lH.gH.qrySeqs[it->qryId]));
+
+// topMaxMatchesMedianBitScore = 0;
+
+ // inner loop over matches per record
+ for (; it != itEnd; ++it)
+ {
+ auto const trueSubjId = it->subjId / sNumFrames(lH.gH.blastProgram);
+ itN = std::next(it,1);
+// std::cout << "FOO\n" << std::flush;
+// std::cout << "QryStart: " << it->qryStart << "\n" << std::flush;
+// std::cout << "SubjStart: " << it->subjStart << "\n" << std::flush;
+// std::cout << "BAR\n" << std::flush;
+ if (!isSetToSkip(*it))
+ {
+ // ABUNDANCY and PUTATIVE ABUNDANCY CHECKS
+ if ((lH.options.filterPutativeAbundant) && (record.matches.size() % lH.options.maxMatches == 0))
+ {
+ if (record.matches.size() / lH.options.maxMatches == 1)
+ {
+ // numMaxMatches found the first time
+ record.matches.sort();
+ }
+ else if (record.matches.size() / lH.options.maxMatches > 1)
+ {
+ double medianTopNMatchesBefore = 0.0;
+// if (lH.options.filterPutativeAbundant)
+ {
+ medianTopNMatchesBefore =
+ (std::next(record.matches.begin(),
+ lH.options.maxMatches / 2))->bitScore;
+ }
+
+ uint64_t before = record.matches.size();
+ record.matches.sort();
+ // if we filter putative duplicates we never need to check for real duplicates
+ if (!lH.options.filterPutativeDuplicates)
+ {
+ record.matches.unique();
+ lH.stats.hitsDuplicate += before - record.matches.size();
+ before = record.matches.size();
+ }
+ if (record.matches.size() > (lH.options.maxMatches + 1))
+ // +1 so as not to trigger % == 0 in the next run
+ record.matches.resize(lH.options.maxMatches + 1);
+
+ lH.stats.hitsAbundant += before - record.matches.size();
+
+// if (lH.options.filterPutativeAbundant)
+ {
+ double medianTopNMatchesAfter =
+ (std::next(record.matches.begin(),
+ lH.options.maxMatches / 2))->bitScore;
+ // no new matches in top n/2
+ if (int(medianTopNMatchesAfter) <=
+ int(medianTopNMatchesBefore))
+ {
+ // declare all the rest as putative abundant
+ while ((it != itEnd) &&
+ (trueQryId == it->qryId / qNumFrames(lH.gH.blastProgram)))
+ {
+ // not already marked as abundant, duplicate or merged
+ if (!isSetToSkip(*it))
+ ++lH.stats.hitsPutativeAbundant;
+ ++it;
+ }
+ // move back so if-loop's increment still valid
+ std::advance(it, -1);
+ break;
+ }
+ }
+ }
+ }
+// std::cout << "BAX\n" << std::flush;
+ // create blastmatch in list without copy or move
+ record.matches.emplace_back(lH.gH.qryIds [trueQryId],
+ lH.gH.subjIds[trueSubjId]);
+
+ auto & bm = back(record.matches);
+
+ bm.qStart = it->qryStart;
+ bm.qEnd = it->qryStart + lH.options.seedLength;
+ bm.sStart = it->subjStart;
+ bm.sEnd = it->subjStart + lH.options.seedLength;
+
+ bm.qLength = record.qLength;
+ bm.sLength = sIsTranslated(lH.gH.blastProgram)
+ ? lH.gH.untransSubjSeqLengths[trueSubjId]
+ : length(lH.gH.subjSeqs[it->subjId]);
+
+ // MERGE PUTATIVE SIBLINGS INTO THIS MATCH
+ for (auto it2 = itN;
+ (it2 != itEnd) &&
+ (trueQryId == it2->qryId / qNumFrames(lH.gH.blastProgram)) &&
+ (trueSubjId == it2->subjId / sNumFrames(lH.gH.blastProgram));
+ ++it2)
+ {
+ // same frame
+ if ((it->qryId % qNumFrames(lH.gH.blastProgram) == it2->qryId % qNumFrames(lH.gH.blastProgram)) &&
+ (it->subjId % sNumFrames(lH.gH.blastProgram) == it2->subjId % sNumFrames(lH.gH.blastProgram)))
+ {
+
+// TPos const qDist = (it2->qryStart >= bm.qEnd)
+// ? it2->qryStart - bm.qEnd // upstream
+// : 0; // overlap
+//
+// TPos sDist = TPosMax; // subj match region downstream of *it
+// if (it2->subjStart >= bm.sEnd) // upstream
+// sDist = it2->subjStart - bm.sEnd;
+// else if (it2->subjStart >= it->subjStart) // overlap
+// sDist = 0;
+
+ // due to sorting it2->qryStart never <= it->qStart
+ // so subject sequences must have same order
+ if (it2->subjStart < it->subjStart)
+ continue;
+
+ long const qDist = it2->qryStart - bm.qEnd;
+ long const sDist = it2->subjStart - bm.sEnd;
+
+ if ((qDist == sDist) &&
+ (qDist <= (long)lH.options.seedGravity))
+ {
+ bm.qEnd = std::max(bm.qEnd,
+ (TPos)(it2->qryStart
+ + lH.options.seedLength));
+ bm.sEnd = std::max(bm.sEnd,
+ (TPos)(it2->subjStart
+ + lH.options.seedLength));
+ ++lH.stats.hitsMerged;
+
+ setToSkip(*it2);
+ }
+ }
+ }
+
+ // do the extension and statistics
+ int lret = computeBlastMatch(bm, *it, lH);
+
+ switch (lret)
+ {
+ case COMPUTERESULT_::SUCCESS:
+ // ++lH.stats.goodMatches;
+ if (lH.options.outFileFormat > 0)
+ {
+ bm._n_qId = it->qryId / qNumFrames(lH.gH.blastProgram);
+ bm._n_sId = it->subjId / sNumFrames(lH.gH.blastProgram);
+ }
+ break;
+ case EVALUE:
+ ++lH.stats.hitsFailedExtendEValueTest;
+ break;
+ case PERCENTIDENT:
+ ++lH.stats.hitsFailedExtendPercentIdentTest;
+ break;
+ case PREEXTEND:
+ ++lH.stats.hitsFailedPreExtendTest;
+ break;
+ default:
+ std::cerr << "Unexpected Extension Failure:\n"
+ << "qryId: " << it->qryId << "\t"
+ << "subjId: " << it->subjId << "\t"
+ << "seed qry: " << infix(lH.gH.redQrySeqs,
+ it->qryStart,
+ it->qryStart + lH.options.seedLength)
+ << "\n subj: " << infix(lH.gH.redSubjSeqs,
+ it->subjStart,
+ it->subjStart + lH.options.seedLength)
+ << "\nunred qry: " << infix(lH.gH.qrySeqs,
+ it->qryStart,
+ it->qryStart + lH.options.seedLength)
+ << "\n subj: " << infix(lH.gH.subjSeqs,
+ it->subjStart,
+ it->subjStart + lH.options.seedLength)
+ << "\nmatch qry: " << infix(lH.gH.qrySeqs,
+ bm.qStart,
+ bm.qEnd)
+ << "\n subj: " << infix(lH.gH.subjSeqs,
+ bm.sStart,
+ bm.sEnd)
+ << "\nalign: " << bm.alignRow0 << "\n " << bm.alignRow1
+ << "\n";
+ return lret;
+ break;
+ }
+
+ if (lret != 0)// discard match
+ {
+ record.matches.pop_back();
+ } else if (lH.options.filterPutativeDuplicates)
+ {
+ // PUTATIVE DUBLICATES CHECK
+ for (auto it2 = itN;
+ (it2 != itEnd) &&
+ (trueQryId == it2->qryId / qNumFrames(lH.gH.blastProgram)) &&
+ (trueSubjId == it2->subjId / sNumFrames(lH.gH.blastProgram));
+ ++it2)
+ {
+ // same frame and same range
+ if ((it->qryId == it2->qryId) &&
+ (it->subjId == it2->subjId) &&
+ (intervalOverlap(it2->qryStart,
+ it2->qryStart + lH.options.seedLength,
+ bm.qStart,
+ bm.qEnd) > 0) &&
+ (intervalOverlap(it2->subjStart,
+ it2->subjStart + lH.options.seedLength,
+ bm.sStart,
+ bm.sEnd) > 0))
+ {
+ // deactivated alignment check to get rid of
+ // duplicates early on
+// auto const & row0 = row(bm.align, 0);
+// auto const & row1 = row(bm.align, 1);
+// // part of alignment
+// if (toSourcePosition(row0,
+// toViewPosition(row1,
+// it2->subjStart
+// - bm.sStart))
+// == TPos(it2->qryStart - bm.qStart))
+// {
+ ++lH.stats.hitsPutativeDuplicate;
+ setToSkip(*it2);
+// }
+ }
+ }
+ }
+ }
+
+ // last item or new TrueQryId
+ if ((itN == itEnd) ||
+ (trueQryId != itN->qryId / qNumFrames(lH.gH.blastProgram)))
+ break;
+ }
+
+ if (length(record.matches) > 0)
+ {
+ ++lH.stats.qrysWithHit;
+ // sort and remove duplicates -> STL, yeah!
+ auto const before = record.matches.size();
+ record.matches.sort();
+ if (!lH.options.filterPutativeDuplicates)
+ {
+ record.matches.unique();
+ lH.stats.hitsDuplicate += before - record.matches.size();
+ }
+ if (record.matches.size() > lH.options.maxMatches)
+ {
+ lH.stats.hitsAbundant += record.matches.size() -
+ lH.options.maxMatches;
+ record.matches.resize(lH.options.maxMatches);
+ }
+ lH.stats.hitsFinal += record.matches.size();
+
+ myWriteRecord(lH, record);
+ }
+
+ }
+
+ if (lH.options.doubleIndexing)
+ {
+ double finish = sysTime() - start;
+
+ appendToStatus(lH.statusStr, lH.options, 1, " done. ");
+ appendToStatus(lH.statusStr, lH.options, 2, finish, "s. ");
+ myPrint(lH.options, 1, lH.statusStr);
+ }
+
+ return 0;
+}
+
+void printStats(StatsHolder const & stats, LambdaOptions const & options)
+{
+ if ((options.verbosity >= 1) && options.isTerm && options.doubleIndexing)
+ for (unsigned char i=0; i < options.threads + 3; ++i)
+ std::cout << std::endl;
+
+ if (options.verbosity >= 2)
+ {
+ unsigned long rem = stats.hitsAfterSeeding;
+ auto const w = _numberOfDigits(rem); // number of digits
+ #define R " " << std::setw(w)
+ #define RR " = " << std::setw(w)
+ #define BLANKS for (unsigned i = 0; i< w; ++i) std::cout << " ";
+ std::cout << "\033[1m HITS "; BLANKS;
+ std::cout << "Remaining\033[0m"
+ << "\n after Seeding "; BLANKS;
+ std::cout << R << rem;
+ std::cout << "\n - masked " << R << stats.hitsMasked
+ << RR << (rem -= stats.hitsMasked);
+ std::cout << "\n - merged " << R << stats.hitsMerged
+ << RR << (rem -= stats.hitsMerged);
+ std::cout << "\n - putative duplicates " << R
+ << stats.hitsPutativeDuplicate << RR
+ << (rem -= stats.hitsPutativeDuplicate);
+ std::cout << "\n - putative abundant " << R
+ << stats.hitsPutativeAbundant << RR
+ << (rem -= stats.hitsPutativeAbundant);
+ std::cout << "\n - failed pre-extend test " << R
+ << stats.hitsFailedPreExtendTest << RR
+ << (rem -= stats.hitsFailedPreExtendTest);
+ std::cout << "\n - failed %-identity test " << R
+ << stats.hitsFailedExtendPercentIdentTest << RR
+ << (rem -= stats.hitsFailedExtendPercentIdentTest);
+ std::cout << "\n - failed e-value test " << R
+ << stats.hitsFailedExtendEValueTest << RR
+ << (rem -= stats.hitsFailedExtendEValueTest);
+ std::cout << "\n - abundant " << R
+ << stats.hitsAbundant << RR
+ << (rem -= stats.hitsAbundant);
+ std::cout << "\n - duplicates " << R
+ << stats.hitsDuplicate << "\033[1m" << RR
+ << (rem -= stats.hitsDuplicate)
+ << "\033[0m\n\n";
+
+ if (rem != stats.hitsFinal)
+ std::cout << "WARNING: hits dont add up\n";
+ }
+
+ if (options.verbosity >= 1)
+ {
+ auto const w = _numberOfDigits(stats.hitsFinal);
+ std::cout << "Number of valid hits: "
+ << std::setw(w) << stats.hitsFinal
+ << "\nNumber of Queries with at least one valid hit: "
+ << std::setw(w) << stats.qrysWithHit
+ << "\n";
+ }
+
+}
+
+#endif // HEADER GUARD
diff --git a/src/lambda_indexer.cpp b/src/lambda_indexer.cpp
new file mode 100644
index 0000000..9332577
--- /dev/null
+++ b/src/lambda_indexer.cpp
@@ -0,0 +1,221 @@
+// ==========================================================================
+// lambda
+// ==========================================================================
+// Copyright (c) 2013-2015, Hannes Hauswedell, FU Berlin
+// All rights reserved.
+//
+// This file is part of Lambda.
+//
+// Lambda is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// Lambda is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with Lambda. If not, see <http://www.gnu.org/licenses/>.*/
+// ==========================================================================
+// Author: Hannes Hauswedell <hannes.hauswedell @ fu-berlin.de>
+// ==========================================================================
+// lambda.cpp: Main File for the main application
+// ==========================================================================
+
+#include <seqan/basic.h>
+
+#include <seqan/arg_parse.h>
+#include <seqan/seq_io.h>
+
+#include <iostream>
+
+#define LAMBDA_INDEXER 1 // some things are different for the indexer binary
+
+#include "lambda_indexer.hpp"
+
+using namespace seqan;
+
+// ==========================================================================
+// Forwards
+// ==========================================================================
+
+inline int
+argConv0(LambdaIndexerOptions const & options);
+
+template <BlastProgram p>
+inline int
+argConv1(LambdaIndexerOptions const & options,
+ BlastProgramSelector<p> const &);
+
+template <BlastProgram p,
+ typename TRedAlph>
+inline int
+argConv2(LambdaIndexerOptions const & options,
+ BlastProgramSelector<p> const &,
+ TRedAlph const &);
+
+template <BlastProgram p,
+ typename TRedAlph,
+ typename TIndexSpecSpec>
+inline int
+realMain(LambdaIndexerOptions const & options,
+ BlastProgramSelector<p> const &,
+ TRedAlph const &,
+ TIndexSpecSpec const &);
+
+// ==========================================================================
+// Functions
+// ==========================================================================
+// --------------------------------------------------------------------------
+// Function main()
+// --------------------------------------------------------------------------
+
+// Program entry point.
+
+int main(int argc, char const ** argv)
+{
+ // Parse the command line.
+ seqan::ArgumentParser parser;
+ LambdaIndexerOptions options;
+ seqan::ArgumentParser::ParseResult res = parseCommandLine(options, argc, argv);
+
+ // If there was an error parsing or built-in argument parser functionality
+ // was triggered then we exit the program. The return code is 1 if there
+ // were errors and 0 if there were none.
+ if (res != seqan::ArgumentParser::PARSE_OK)
+ return res == seqan::ArgumentParser::PARSE_ERROR;
+
+ if (std::string(CMAKE_BUILD_TYPE) != "Release")
+ std::cerr << "WARNING: This binary is not built in release mode and will be much slower than it should be!\n";
+ return argConv0(options);
+}
+
+inline int
+argConv0(LambdaIndexerOptions const & options)
+{
+ switch(options.blastProgram)
+ {
+ case BlastProgram::BLASTN:
+ return argConv1(options, BlastProgramSelector<BlastProgram::BLASTN>());
+ case BlastProgram::BLASTP:
+ return argConv1(options, BlastProgramSelector<BlastProgram::BLASTP>());
+ case BlastProgram::BLASTX:
+ return argConv1(options, BlastProgramSelector<BlastProgram::BLASTX>());
+ case BlastProgram::TBLASTN:
+ return argConv1(options, BlastProgramSelector<BlastProgram::TBLASTN>());
+ case BlastProgram::TBLASTX:
+ return argConv1(options, BlastProgramSelector<BlastProgram::TBLASTX>());
+ default:
+ break;
+ }
+ return -1;
+}
+
+/// Alphabet reduction
+template <BlastProgram p>
+inline int
+argConv1(LambdaIndexerOptions const & options,
+ BlastProgramSelector<p> const &)
+{
+ using TUnred = typename std::conditional<p == BlastProgram::BLASTN, Dna5, AminoAcid>::type;
+ using Tp = BlastProgramSelector<p>;
+ switch (options.alphReduction)
+ {
+ case 0:
+ return argConv2(options, Tp(), TUnred());
+ case 2:
+ return argConv2(options, Tp(), ReducedAminoAcid<Murphy10>());
+#if 0
+ case 10:
+ return argConv2(options, ReducedAminoAcid<ClusterReduction<10>>());
+ case 1:
+ return argConv2(options, AminoAcid10());
+ case 8:
+ return argConv2(options, ReducedAminoAcid<ClusterReduction<8>>());
+ case 12:
+ return argConv2(options, ReducedAminoAcid<ClusterReduction<12>>());
+#endif
+ default:
+ return -1;
+ }
+ return -1;
+}
+
+template <BlastProgram p,
+ typename TRedAlph>
+inline int
+argConv2(LambdaIndexerOptions const & options,
+ BlastProgramSelector<p> const &,
+ TRedAlph const &)
+{
+ if (options.algo == "radixsort")
+ return realMain(options, BlastProgramSelector<p>(), TRedAlph(), RadixSortSACreateTag());
+ else
+ return realMain(options, BlastProgramSelector<p>(), TRedAlph(), Nothing());
+}
+
+template <BlastProgram p,
+ typename TRedAlph,
+ typename TIndexSpecSpec>
+inline int
+realMain(LambdaIndexerOptions const & options,
+ BlastProgramSelector<p> const &,
+ TRedAlph const &,
+ TIndexSpecSpec const &)
+{
+ using TOrigSet = TCDStringSet<String<OrigSubjAlph<p>>>;
+ using TTransSet = TCDStringSet<String<TransAlph<p>>>;
+
+ TTransSet translatedSeqs;
+
+ {
+ TOrigSet originalSeqs;
+ int ret = 0;
+
+ // ids get saved to disk again immediately and are not kept in memory
+ ret = loadSubjSeqsAndIds(originalSeqs, options);
+ if (ret)
+ return ret;
+
+ // preserve lengths of untranslated sequences
+ if (sIsTranslated(p))
+ _saveOriginalSeqLengths(originalSeqs.limits, options);
+
+ // convert the seg file to seqan binary format
+ ret = convertMaskingFile(length(originalSeqs), options);
+ if (ret)
+ return ret;
+
+ // translate or swap depending on program
+ translateOrSwap(translatedSeqs, originalSeqs, options);
+ }
+
+ // dump translated and unreduced sequences (except where they are included in index)
+ if ((options.alphReduction != 0) || (options.dbIndexType != 0))
+ dumpTranslatedSeqs(translatedSeqs, options);
+
+ // see if final sequence set actually fits into index
+ if (!checkIndexSize(translatedSeqs))
+ return -1;
+
+ if (options.dbIndexType == 1)
+ {
+ using TIndexSpec = TFMIndex<TIndexSpecSpec>;
+ generateIndexAndDump<TIndexSpec,TIndexSpecSpec>(translatedSeqs,
+ options,
+ BlastProgramSelector<p>(),
+ TRedAlph());
+ } else
+ {
+ using TIndexSpec = IndexSa<TIndexSpecSpec>;
+ generateIndexAndDump<TIndexSpec,TIndexSpecSpec>(translatedSeqs,
+ options,
+ BlastProgramSelector<p>(),
+ TRedAlph());
+ }
+
+ return 0;
+}
+
diff --git a/src/lambda_indexer.hpp b/src/lambda_indexer.hpp
new file mode 100644
index 0000000..d70a0cd
--- /dev/null
+++ b/src/lambda_indexer.hpp
@@ -0,0 +1,554 @@
+// ==========================================================================
+// lambda
+// ==========================================================================
+// Copyright (c) 2013-2015, Hannes Hauswedell, FU Berlin
+// All rights reserved.
+//
+// This file is part of Lambda.
+//
+// Lambda is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// Lambda is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with Lambda. If not, see <http://www.gnu.org/licenses/>.*/
+// ==========================================================================
+// Author: Hannes Hauswedell <hannes.hauswedell @ fu-berlin.de>
+// ==========================================================================
+// lambda_indexer.hpp: Main File for the indexer application
+// ==========================================================================
+
+#ifndef SEQAN_LAMBDA_LAMBDA_INDEXER_H_
+#define SEQAN_LAMBDA_LAMBDA_INDEXER_H_
+
+#include <seqan/basic.h>
+#include <seqan/sequence.h>
+
+#include <seqan/seq_io.h>
+#include <seqan/index.h>
+// #include <seqan/index_extras.h>
+#include <seqan/translation.h>
+#include <seqan/reduced_aminoacid.h>
+
+#include "misc.hpp"
+#include "options.hpp"
+#include "alph.hpp"
+#include "radix_inplace.h"
+#include "lambda_indexer_misc.hpp"
+
+using namespace seqan;
+
+// --------------------------------------------------------------------------
+// Function loadSubj()
+// --------------------------------------------------------------------------
+
+template <typename TOrigAlph>
+inline int
+loadSubjSeqsAndIds(TCDStringSet<String<TOrigAlph>> & originalSeqs,
+ LambdaIndexerOptions const & options)
+{
+ typedef TCDStringSet<String<char, Alloc<>>> TIDs;
+ typedef TCDStringSet<String<char, Alloc<Truncate_>>> TIDsTruncated;
+ TIDs ids;
+ // difference only in name, same layout in mem, so we can hack the type:
+ TIDsTruncated* tIds = static_cast<TIDsTruncated*>((void*)&ids);
+
+ double start = sysTime();
+ myPrint(options, 1, "Loading Subject Sequences and Ids...");
+
+ SeqFileIn infile(toCString(options.dbFile));
+ int ret;
+ if (options.truncateIDs)
+ ret = myReadRecords(*tIds, originalSeqs, infile);
+ else
+ ret = myReadRecords(ids, originalSeqs, infile);
+
+ if (ret)
+ return ret;
+
+ myPrint(options, 1, " done.\n");
+ double finish = sysTime() - start;
+ myPrint(options, 2, "Runtime: ", finish, "s \n");
+
+ unsigned long maxLen = 0ul;
+ for (auto const & s : originalSeqs)
+ if (length(s) > maxLen)
+ maxLen = length(s);
+ myPrint(options, 2, "Number of sequences read: ", length(originalSeqs),
+ "\nLongest sequence read: ", maxLen, "\n\n");
+
+ myPrint(options, 1, "Dumping Subj Ids...");
+
+ //TODO save to TMPDIR instead
+ CharString _path = options.dbFile;
+ append(_path, ".ids");
+ save(ids, toCString(_path));
+
+ myPrint(options, 1, " done.\n");
+ finish = sysTime() - start;
+ myPrint(options, 2, "Runtime: ", finish, "s \n\n");
+
+ return 0;
+}
+
+// --------------------------------------------------------------------------
+// Function loadSubj()
+// --------------------------------------------------------------------------
+
+template <typename TLimits>
+inline void
+_saveOriginalSeqLengths(TLimits limits, // we want copy!
+ LambdaIndexerOptions const & options)
+{
+ for (uint32_t i = 0; i < (length(limits) - 1); ++i)
+ limits[i] = limits[i+1] - limits[i];
+ // last entry not overwritten, should be the sum of all lengths
+
+ myPrint(options, 1, " dumping untranslated subject lengths...");
+ //TODO save to TMPDIR instead
+ CharString _path = options.dbFile;
+ append(_path, ".untranslengths");
+ save(limits, toCString(_path));
+}
+
+// --------------------------------------------------------------------------
+// Function loadSubj()
+// --------------------------------------------------------------------------
+
+template <typename TTransAlph, typename TOrigAlph>
+inline void
+translateOrSwap(TCDStringSet<String<TTransAlph>> & out,
+ TCDStringSet<String<TOrigAlph>> & in,
+ LambdaIndexerOptions const & options)
+{
+ //TODO more output
+ myPrint(options, 1, "translating...");
+ translate(out,
+ in,
+ SIX_FRAME,
+ options.geneticCode);
+}
+
+template <typename TSameAlph>
+inline void
+translateOrSwap(TCDStringSet<String<TSameAlph>> & out,
+ TCDStringSet<String<TSameAlph>> & in,
+ LambdaIndexerOptions const & /**/)
+{
+ swap(out, in);
+}
+
+// --------------------------------------------------------------------------
+// Function loadSubj()
+// --------------------------------------------------------------------------
+
+template <typename TTransAlph>
+inline void
+dumpTranslatedSeqs(TCDStringSet<String<TTransAlph>> const & translatedSeqs,
+ LambdaIndexerOptions const & options)
+{
+ double start = sysTime();
+ myPrint(options, 1, "Dumping unreduced Subj Sequences...");
+
+ //TODO save to TMPDIR instead
+ std::string _path = options.dbFile + '.' + std::string(_alphName(TTransAlph()));
+ save(translatedSeqs, _path.c_str());
+
+ myPrint(options, 1, " done.\n");
+ double finish = sysTime() - start;
+ myPrint(options, 2, "Runtime: ", finish, "s \n\n");
+}
+
+// --------------------------------------------------------------------------
+// Function loadSubj()
+// --------------------------------------------------------------------------
+
+// template <typename TTransAlph, typename TRedAlph>
+// inline void
+// reduceOrSwap(TCDStringSet<String<TRedAlph>> & out,
+// TCDStringSet<String<TTransAlph>> & in)
+// {
+// //TODO more output
+// // reduce implicitly
+// myPrint(options, 1, "Reducing...");
+// out.concat = in.concat;
+// out.limits = in.limits;
+// }
+//
+// template <typename TSameAlph>
+// inline void
+// reduceOrSwap(TCDStringSet<String<TSameAlph>> & out,
+// TCDStringSet<String<TSameAlph>> & in)
+// {
+// swap(out, in);
+// }
+
+// --------------------------------------------------------------------------
+// Function loadSubj()
+// --------------------------------------------------------------------------
+
+template <typename TRedAlph>
+inline bool
+checkIndexSize(TCDStringSet<String<TRedAlph>> const & seqs)
+{
+ using SAV = typename SAValue<TCDStringSet<String<TRedAlph>>>::Type;
+ uint64_t curNumSeq = length(seqs);
+ uint64_t maxNumSeq = std::numeric_limits<typename Value<SAV, 1>::Type>::max();
+
+ if (curNumSeq >= maxNumSeq)
+ {
+ std::cerr << "Too many sequences to be indexed:\n "
+ << length(seqs) << " in file, but only "
+ << maxNumSeq << " supported by index.\n";
+ return false;
+ }
+
+ uint64_t maxLenSeq = std::numeric_limits<typename Value<SAV, 2>::Type>::max();
+ uint64_t maxLen = 0ul;
+ for (auto const & s : seqs)
+ if (length(s) > maxLen)
+ maxLen = length(s);
+
+ if (maxLen >= maxLenSeq)
+ {
+ std::cerr << "Too long sequences to be indexed:\n "
+ << "length" << maxLen << " present in file, but only "
+ << maxLenSeq << " supported by index.\n";
+ return false;
+ }
+ return true;
+}
+
+// --------------------------------------------------------------------------
+// Function loadSubj()
+// --------------------------------------------------------------------------
+
+inline int
+convertMaskingFile(uint64_t numberOfSeqs,
+ LambdaIndexerOptions const & options)
+
+{
+ StringSet<String<unsigned>, Owner<ConcatDirect<>>> segIntStarts;
+ StringSet<String<unsigned>, Owner<ConcatDirect<>>> segIntEnds;
+// resize(segIntervals, numberOfSeqs, Exact());
+
+ if (options.segFile != "")
+ {
+ myPrint(options, 1, "Constructing binary seqan masking from seg-file...");
+
+ std::ifstream stream;
+ stream.open(toCString(options.segFile));
+ if (!stream.is_open())
+ {
+ std::cerr << "ERROR: could not open seg file.\n";
+ return -1;
+ }
+
+ auto reader = directionIterator(stream, Input());
+
+// StringSet<String<Tuple<unsigned, 2>>> _segIntervals;
+// auto & _segIntervals = segIntervals;
+// resize(_segIntervals, numberOfSeqs, Exact());
+ StringSet<String<unsigned>> _segIntStarts;
+ StringSet<String<unsigned>> _segIntEnds;
+ resize(_segIntStarts, numberOfSeqs, Exact());
+ resize(_segIntEnds, numberOfSeqs, Exact());
+ CharString buf;
+// std::tuple<unsigned, unsigned> tup;
+
+// auto curSeq = begin(_segIntervals);
+ unsigned curSeq = 0;
+ while (value(reader) == '>')
+ {
+// if (curSeq == end(_segIntervals))
+// return -7;
+ if (curSeq == numberOfSeqs)
+ {
+ std::cerr << "ERROR: seg file has more entries then database.\n";
+ return -7;
+ }
+ skipLine(reader);
+ if (atEnd(reader))
+ break;
+
+ unsigned curInt = 0;
+ while ((!atEnd(reader)) && (value(reader) != '>'))
+ {
+ resize(_segIntStarts[curSeq], length(_segIntStarts[curSeq])+1);
+ resize(_segIntEnds[curSeq], length(_segIntEnds[curSeq])+1);
+ clear(buf);
+ readUntil(buf, reader, IsWhitespace());
+
+// std::get<0>(tup) = strtoumax(toCString(buf), 0, 10);
+ _segIntStarts[curSeq][curInt] = strtoumax(toCString(buf), 0, 10);
+ skipUntil(reader, IsDigit());
+
+ clear(buf);
+ readUntil(buf, reader, IsWhitespace());
+
+// std::get<1>(tup) = strtoumax(toCString(buf), 0, 10);
+ _segIntEnds[curSeq][curInt] = strtoumax(toCString(buf), 0, 10);
+
+// appendValue(*curSeq, tup);
+
+ skipLine(reader);
+ curInt++;
+ }
+ if (atEnd(reader))
+ break;
+ else
+ curSeq++;
+ }
+// if (curSeq != end(_segIntervals))
+// return -9;
+ if (curSeq != (numberOfSeqs - 1))
+ {
+ std::cerr << "ERROR: seg file has less entries (" << curSeq + 1
+ << ") than database (" << numberOfSeqs << ").\n";
+ return -9;
+ }
+
+ segIntStarts.concat = concat(_segIntStarts);
+ segIntStarts.limits = stringSetLimits(_segIntStarts);
+ segIntEnds.concat = concat(_segIntEnds);
+ segIntEnds.limits = stringSetLimits(_segIntEnds);
+// segIntEnds = _segIntEnds;
+// segIntervals = _segIntervals; // non-concatdirect to concatdirect
+
+ stream.close();
+
+ } else
+ {
+ myPrint(options, 1, "No Seg-File specified, no masking will take place.\n");
+// resize(segIntervals, numberOfSeqs, Exact());
+ resize(segIntStarts, numberOfSeqs, Exact());
+ resize(segIntEnds, numberOfSeqs, Exact());
+ }
+
+// for (unsigned u = 0; u < length(segIntStarts); ++u)
+// {
+// myPrint(options, 1,u, ": ";
+// for (unsigned v = 0; v < length(segIntStarts[u]); ++v)
+// {
+// myPrint(options, 1,'(', segIntStarts[u][v], ", ", segIntEnds[u][v], ") ";
+// }
+// myPrint(options, 1,'\n';
+// }
+ myPrint(options, 1, "Dumping binary seqan mask file...");
+ CharString _path = options.dbFile;
+ append(_path, ".binseg_s");
+ save(segIntStarts, toCString(_path));
+ _path = options.dbFile;
+ append(_path, ".binseg_e");
+ save(segIntEnds, toCString(_path));
+ myPrint(options, 1, " done.\n");
+ myPrint(options, 2, "\n");
+ return 0;
+}
+
+// --------------------------------------------------------------------------
+// Function createSuffixArray()
+// --------------------------------------------------------------------------
+
+// If there is no overload with progress function, then strip it
+template <typename TSA,
+ typename TString,
+ typename TSSetSpec,
+ typename TAlgo,
+ typename TLambda>
+inline void
+createSuffixArray(TSA & SA,
+ StringSet<TString, TSSetSpec> const & s,
+ TAlgo const &,
+ TLambda const &)
+{
+ return createSuffixArray(SA, s, TAlgo());
+}
+
+// ----------------------------------------------------------------------------
+// Function indexCreate
+// ----------------------------------------------------------------------------
+
+template <typename TText, typename TSpec, typename TConfig, typename TLambda>
+inline bool
+indexCreateProgress(Index<TText, FMIndex<TSpec, TConfig> > & index,
+ FibreSALF const &,
+ TLambda const & progressCallback)
+{
+ typedef Index<TText, FMIndex<TSpec, TConfig> > TIndex;
+ typedef typename Fibre<TIndex, FibreTempSA>::Type TTempSA;
+ typedef typename Size<TIndex>::Type TSize;
+ typedef typename DefaultIndexCreator<TIndex, FibreSA>::Type TAlgo;
+
+ TText const & text = indexText(index);
+
+ if (empty(text))
+ return false;
+
+ TTempSA tempSA;
+
+ std::cout << "Generating 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%\n"
+ " (1) SuffixArray |" << std::flush;
+ // Create the full SA.
+ resize(tempSA, lengthSum(text), Exact());
+ createSuffixArray(tempSA, text, TAlgo(), progressCallback);
+
+ std::cout << " (2) FM-Index..." << std::flush;
+ // Create the LF table.
+ createLF(indexLF(index), text, tempSA);
+
+ // Set the FMIndex LF as the CompressedSA LF.
+ setFibre(indexSA(index), indexLF(index), FibreLF());
+
+ // Create the compressed SA.
+ TSize numSentinel = countSequences(text);
+ createCompressedSa(indexSA(index), tempSA, numSentinel);
+ std::cout << " done.\n" << std::flush;
+ return true;
+}
+
+template <typename TText, typename TSpec, typename TLambda>
+inline bool
+indexCreateProgress(Index<TText, IndexSa<TSpec> > & index,
+ FibreSA const &,
+ TLambda const & progressCallback)
+{
+ typedef Index<TText, IndexSa<TSpec> > TIndex;
+ typedef typename Fibre<TIndex, FibreSA>::Type TSA;
+ typedef typename DefaultIndexCreator<TIndex, FibreSA>::Type TAlgo;
+
+ TText const & text = indexText(index);
+
+ if (empty(text))
+ return false;
+
+ TSA & sa = getFibre(index, FibreSA());
+
+ std::cout << "Generating 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%\n"
+ " SuffixArray |" << std::flush;
+ // Create the full SA.
+ resize(sa, lengthSum(text), Exact());
+ createSuffixArray(sa, text, TAlgo(), progressCallback);
+
+ return true;
+}
+
+// --------------------------------------------------------------------------
+// Function generateIndexAndDump()
+// --------------------------------------------------------------------------
+
+#ifdef _OPENMP
+#define TID omp_get_thread_num()
+#else
+#define TID 0
+#endif
+
+template <typename TIndexSpec,
+ typename TIndexSpecSpec,
+ typename TString,
+ typename TSpec,
+ typename TRedAlph_,
+ BlastProgram p>
+inline void
+generateIndexAndDump(StringSet<TString, TSpec> & seqs,
+ LambdaIndexerOptions const & options,
+ BlastProgramSelector<p> const &,
+ TRedAlph_ const &)
+{
+ using TTransSeqs = TCDStringSet<String<TransAlph<p>>>;
+
+ using TRedAlph = RedAlph<p, TRedAlph_>; // ensures == Dna5 for BlastN
+ using TRedSeqVirt = ModifiedString<String<TransAlph<p>, Alloc<>>,
+ ModView<FunctorConvert<TransAlph<p>,TRedAlph>>>;
+ using TRedSeqsVirt = StringSet<TRedSeqVirt, Owner<ConcatDirect<>>>;
+
+ static bool constexpr
+ indexIsFM = std::is_same<TIndexSpec,
+ TFMIndex<TIndexSpecSpec>
+ >::value;
+ static bool constexpr
+ alphReduction = !std::is_same<TransAlph<p>, TRedAlph>::value;
+
+ using TRedSeqs = typename std::conditional<
+ !alphReduction,
+ TTransSeqs, // owner
+ TRedSeqsVirt>::type; // modview
+ using TRedSeqsACT = typename std::conditional<
+ !alphReduction,
+ TTransSeqs &, // reference to owner
+ TRedSeqsVirt>::type; // modview
+
+ using TDbIndex = Index<TRedSeqs, TIndexSpec>;
+ using TFullFibre = typename std::conditional<indexIsFM,
+ FibreSALF,
+ FibreSA>::type;
+ static bool constexpr
+ hasProgress = std::is_same<TIndexSpecSpec, RadixSortSACreateTag>::value;
+
+ // Generate Index
+ if (!hasProgress)
+ myPrint(options, 1, "Generating Index...");
+
+ double s = sysTime();
+
+// std::cout << "indexIsFM: " << int(indexIsFM) << std::endl;
+
+ // FM-Index needs reverse input
+ if (indexIsFM)
+ reverse(seqs);
+
+ TRedSeqsACT redSubjSeqs(seqs);
+
+ TDbIndex dbIndex(redSubjSeqs);
+
+ // instantiate SA
+ if (hasProgress && (options.verbosity >= 1))
+ {
+ uint64_t _lastPercent = 0;
+ indexCreateProgress(dbIndex, TFullFibre(),
+ [&_lastPercent] (uint64_t curPerc)
+ {
+ SEQAN_OMP_PRAGMA(critical(progressBar))
+ // if (TID == 0)
+ printProgressBar(_lastPercent, curPerc);
+ });
+ }
+ else
+ {
+ indexCreate(dbIndex, TFullFibre());
+ }
+
+ // since we dumped unreduced sequences before and reduced sequences are
+ // only "virtual" we clear them before dump
+ if (alphReduction || indexIsFM)
+ clear(seqs);
+ if (alphReduction)
+ clear(redSubjSeqs.limits); // limits part is not lightweight
+
+ double e = sysTime() - s;
+ if (!hasProgress)
+ myPrint(options, 1, " done.\n");
+ myPrint(options, 2, "Runtime: ", e, "s \n\n");
+
+ // Dump Index
+ myPrint(options, 1, "Writing Index to disk...");
+ s = sysTime();
+ std::string path = toCString(options.dbFile);
+ path += '.' + std::string(_alphName(TRedAlph()));
+ if (indexIsFM)
+ path += ".fm";
+ else
+ path += ".sa";
+ save(dbIndex, path.c_str());
+ e = sysTime() - s;
+ myPrint(options, 1, " done.\n");
+ myPrint(options, 2, "Runtime: ", e, "s \n");
+}
+
+#endif // header guard
diff --git a/src/lambda_indexer_misc.hpp b/src/lambda_indexer_misc.hpp
new file mode 100644
index 0000000..51d2a0b
--- /dev/null
+++ b/src/lambda_indexer_misc.hpp
@@ -0,0 +1,149 @@
+// ==========================================================================
+// lambda
+// ==========================================================================
+// Copyright (c) 2013-2015, Hannes Hauswedell, FU Berlin
+// All rights reserved.
+//
+// This file is part of Lambda.
+//
+// Lambda is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// Lambda is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with Lambda. If not, see <http://www.gnu.org/licenses/>.*/
+// ==========================================================================
+// Author: Hannes Hauswedell <hannes.hauswedell @ fu-berlin.de>
+// ==========================================================================
+// lambda_indexer_misc.hpp: misc stuff for indexer
+// ==========================================================================
+
+#ifndef LAMBDA_INDEXER_MISC_HPP_
+#define LAMBDA_INDEXER_MISC_HPP_
+
+// ----------------------------------------------------------------------------
+// Class ComparisonCounter
+// ----------------------------------------------------------------------------
+
+template <typename TText, typename TSpec>
+struct ComparisonCounter;
+
+// no counting
+template <typename TText>
+struct ComparisonCounter<TText, Nothing>
+{
+ uint64_t _comparisons = 0;
+ uint64_t _expectedComparisons = 0;
+ uint64_t _lastPercent = 0;
+ ComparisonCounter(TText const &,
+ uint64_t expectedComparisons = 0u)
+ {
+ (void)expectedComparisons;
+ }
+
+ // may be constexpr in c++14
+ inline void inc() const
+ {}
+};
+
+// every thread counts
+#ifdef _OPENMP
+template <typename TText>
+struct ComparisonCounter<TText, std::false_type>
+#else
+template <typename TText, typename TSpec>
+struct ComparisonCounter
+#endif
+{
+ uint64_t _comparisons = 0;
+ uint64_t _expectedComparisons = 0;
+// uint64_t _twoPercent = 0;
+ uint64_t _lastPercent = 0;
+ uint64_t _checkEveryNHits = 1;
+
+ ComparisonCounter(TText const & text,
+ uint64_t expectedComparisons = 0u)
+ {
+ if (expectedComparisons == 0)
+ {
+ uint64_t l = length(concat(text));
+ _expectedComparisons = 1.2 * double(l) * std::log(l) / std::log(2);
+ } else
+ _expectedComparisons = expectedComparisons;
+
+// _twoPercent = _expectedComparisons / 50;
+ _comparisons = 0;
+ _lastPercent = 0;
+ while ((_checkEveryNHits << 1) < (_expectedComparisons / 100))
+ _checkEveryNHits <<= 1;
+ }
+
+ inline void inc()
+ {
+ uint64_t comp = ++_comparisons;
+ // it is not important that the henceforth _comparisons be actually
+ // the same value (might not be due to SMP)
+
+ // progress reporting
+ if (comp & _checkEveryNHits)
+ {
+ uint64_t curPerc = comp * 100 / _expectedComparisons;
+ if (curPerc < 100)
+ printProgressBar(_lastPercent, curPerc);
+ }
+ }
+};
+
+// only one thread counts
+#ifdef _OPENMP
+template <typename TText>
+struct ComparisonCounter<TText, std::true_type>
+{
+ uint64_t _comparisons = 0;
+ uint64_t _expectedComparisons = 0;
+// uint64_t _twoPercent = 0;
+ uint64_t _lastPercent = 0;
+ uint64_t _checkEveryNHits = 1;
+
+ ComparisonCounter(TText const & text,
+ uint64_t expectedComparisons = 0u)
+ {
+ if (expectedComparisons == 0)
+ {
+ uint64_t l = length(concat(text));
+ _expectedComparisons = 1.2 * double(l) * std::log(l) / std::log(2) /
+ omp_get_max_threads();
+ } else
+ _expectedComparisons = expectedComparisons;
+
+// _twoPercent = _expectedComparisons / 50;
+// _comparisons = 0;
+ while ((_checkEveryNHits << 1) < (_expectedComparisons / 100))
+ _checkEveryNHits <<= 1;
+ }
+
+ inline void inc()
+ {
+ if (omp_get_thread_num() == 0) // only one thread counts
+ {
+ uint64_t comp = ++_comparisons;
+
+ // progress reporting
+ if (comp & _checkEveryNHits)
+ {
+ uint64_t curPerc = comp * 100 / _expectedComparisons;
+ if (curPerc < 100)
+ printProgressBar(_lastPercent, curPerc);
+ }
+ }
+ }
+};
+#endif
+
+#endif // LAMBDA_INDEXER_MISC_HPP_
\ No newline at end of file
diff --git a/src/match.hpp b/src/match.hpp
new file mode 100644
index 0000000..83b47e7
--- /dev/null
+++ b/src/match.hpp
@@ -0,0 +1,293 @@
+// ==========================================================================
+// lambda
+// ==========================================================================
+// Copyright (c) 2013-2015, Hannes Hauswedell, FU Berlin
+// All rights reserved.
+//
+// This file is part of Lambda.
+//
+// Lambda is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// Lambda is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with Lambda. If not, see <http://www.gnu.org/licenses/>.*/
+// ==========================================================================
+// Author: Hannes Hauswedell <hannes.hauswedell @ fu-berlin.de>
+// ==========================================================================
+// match.h: Main File for the match class
+// ==========================================================================
+
+#ifndef SEQAN_LAMBDA_FINDER_H_
+#define SEQAN_LAMBDA_FINDER_H_
+
+#include <forward_list>
+#include <unordered_map>
+#include <vector>
+
+#include "misc.hpp"
+
+
+
+using namespace seqan;
+
+//-----------------------------------------------------------------------------
+// Finder
+//-----------------------------------------------------------------------------
+
+struct Match
+{
+ typedef uint32_t TQId;
+ typedef uint32_t TSId;// many suffixes in subject-index TODO can be smaller now?
+ typedef uint16_t TPos;
+ TQId qryId;
+ TSId subjId;
+ TPos qryStart;
+// TPos qryEnd;
+
+ TPos subjStart;
+// TPos subjEnd;
+
+// Match()
+// :
+// qryId(0), qryStart(0), /*qryEnd(0),*/ subjId(0), subjStart(0)/*, subjEnd(0)*/
+// {
+// }
+//
+// Match(Match const & m2)
+// {
+// qryId = m2.qryId;
+// qryStart = m2.qryStart;
+// /*qryEnd = m2.qryEnd;*/
+// subjId = m2.subjId;
+// subjStart = m2.subjStart;
+// /*subjEnd = m2.subjEnd;*/
+// }
+
+ inline bool operator== (Match const & m2) const
+ {
+ return std::tie(qryId, subjId, qryStart, subjStart/*, qryEnd, subjEnd*/)
+ == std::tie(m2.qryId, m2.subjId, m2.qryStart, m2.subjStart/*, m2.qryEnd, m2.subjEnd*/);
+ }
+ inline bool operator< (Match const & m2) const
+ {
+ return std::tie(qryId, subjId, qryStart, subjStart/*, qryEnd, subjEnd*/)
+ < std::tie(m2.qryId, m2.subjId, m2.qryStart, m2.subjStart/*, m2.qryEnd, m2.subjEnd*/);
+ }
+};
+
+inline void
+setToSkip(Match & m)
+{
+ using TPos = typename Match::TPos;
+ constexpr TPos posMax = std::numeric_limits<TPos>::max();
+ m.qryStart = posMax;
+ m.subjStart = posMax;
+}
+
+inline bool
+isSetToSkip(Match const & m)
+{
+ using TPos = typename Match::TPos;
+ constexpr TPos posMax = std::numeric_limits<TPos>::max();
+ return (m.qryStart == posMax) && (m.subjStart == posMax);
+}
+
+// struct IdPairHash
+// {
+// std::size_t
+// operator()(std::pair<Match::TQId, Match::TSId> const & pair) const
+// {
+// return std::hash<Match::TQId>()(std::get<0>(pair)) ^
+// std::hash<Match::TSId>()(std::get<1>(pair));
+// }
+// };
+//
+//
+// // this is a comparison structure that takes the relative abundance of hits
+// // on a specific subject into account thereby moving those subjects to
+// // front of a queries hits that are more likely to be top-scoring
+// struct MatchSortComp :
+// public ::std::binary_function <Match, Match, bool>
+// {
+// std::unordered_map<std::pair<Match::TQId, Match::TSId>,
+// uint64_t,
+// IdPairHash> map;
+//
+// MatchSortComp(std::vector<Match> const & matches)
+// {
+// for (Match const & m : matches)
+// ++map[std::make_pair(m.qryId, m.subjId)];
+// }
+//
+// inline bool operator() (Match const & i, Match const & j) const
+// {
+// int64_t iAb = -map.at(std::make_pair(i.qryId, i.subjId));
+// int64_t jAb = -map.at(std::make_pair(j.qryId, j.subjId));
+// return std::tie(i.qryId,
+// iAb,
+// i.qryStart,
+// i.subjStart)
+// < std::tie(j.qryId,
+// jAb,
+// j.qryStart,
+// j.subjStart);
+// }
+// };
+
+template <typename TGH>
+inline void
+myHyperSortSingleIndex(std::vector<Match> & matches,
+ LambdaOptions const & options,
+ TGH const &)
+{
+ using TId = typename Match::TQId;
+
+ // regular sort
+ std::sort(matches.begin(), matches.end());
+
+ // trueQryId, begin, end
+ std::vector<std::tuple<TId, TId, TId>> intervals;
+ for (TId i = 1; i <= length(matches); ++i)
+ {
+ if ((i == length(matches)) ||
+ (matches[i-1].qryId != matches[i].qryId) ||
+ (matches[i-1].subjId / sNumFrames(TGH::blastProgram)) !=
+ (matches[i].subjId / sNumFrames(TGH::blastProgram)))
+ {
+ if (length(intervals) == 0) // first interval
+ intervals.emplace_back(std::make_tuple(matches[i-1].qryId
+ / qNumFrames(TGH::blastProgram),
+ 0,
+ i));
+ else
+ intervals.emplace_back(std::make_tuple(matches[i-1].qryId
+ / qNumFrames(TGH::blastProgram),
+ std::get<2>(intervals.back()),
+ i));
+ }
+ }
+
+ if (options.doubleIndexing)
+ {
+ // sort by trueQryId, then lengths of interval
+ std::sort(intervals.begin(), intervals.end(),
+ [] (std::tuple<TId, TId, TId> const & i1,
+ std::tuple<TId, TId, TId> const & i2)
+ {
+ return (std::get<0>(i1) != std::get<0>(i2))
+ ? (std::get<0>(i1) < std::get<0>(i2))
+ : ((std::get<2>(i1) - std::get<1>(i1))
+ > (std::get<2>(i2) - std::get<1>(i2)));
+ });
+ } else
+ {
+ // sort by lengths of interval, since trueQryId is the same anyway
+ std::sort(intervals.begin(), intervals.end(),
+ [] (std::tuple<TId, TId, TId> const & i1,
+ std::tuple<TId, TId, TId> const & i2)
+ {
+ return (std::get<2>(i1) - std::get<1>(i1))
+ > (std::get<2>(i2) - std::get<1>(i2));
+ });
+ }
+
+ std::vector<Match> tmpVector;
+ tmpVector.resize(matches.size());
+
+ TId newIndex = 0;
+ for (auto const & i : intervals)
+ {
+ TId limit = std::get<2>(i);
+ for (TId j = std::get<1>(i); j < limit; ++j)
+ {
+ tmpVector[newIndex] = matches[j];
+ newIndex++;
+ }
+ }
+ std::swap(tmpVector, matches);
+}
+
+
+// inline bool
+// overlap(Match const & m1, Match const & m2, unsigned char d = 0)
+// {
+// SEQAN_ASSERT_EQ(m1.qryId, m2.qryId);
+// SEQAN_ASSERT_EQ(m1.subjId, m2.subjId);
+//
+// // if ( //match beginnings overlap
+// // ((m1.qryStart >= _protectUnderflow(m2.qryStart, d)) &&
+// // (m1.qryStart <= _protectOverflow( m2.qryEnd, d)) &&
+// // (m1.subjStart >= _protectUnderflow(m2.subjStart,d)) &&
+// // (m1.subjStart <= _protectOverflow( m2.subjEnd, d))
+// // ) || // OR match endings overlap
+// // ((m1.qryEnd >= _protectUnderflow(m2.qryStart, d)) &&
+// // (m1.qryEnd <= _protectOverflow( m2.qryEnd, d)) &&
+// // (m1.subjEnd >= _protectUnderflow(m2.subjStart,d)) &&
+// // (m1.subjEnd <= _protectOverflow( m2.subjEnd, d))
+// // )
+// // )
+// // return true;
+//
+// //DEBUG DEBUG DEBUG TODO TODO TODO WHEN ALIGNMENT IS FIXED
+// // also add check if diffference of overlaps is <= maximum number of gaps
+// if ((intervalOverlap(m1.qryStart, m1.qryEnd, m2.qryStart, m2.qryEnd) > -d) &&
+// (intervalOverlap(m1.subjStart, m1.subjEnd, m2.subjStart, m2.subjEnd) > -d) &&
+// ((m1.qryStart < m2.qryStart ) == (m1.subjStart < m2.subjStart))) // same order
+// return true;
+//
+// return false;
+// }
+
+// inline bool contains(Match const & m1, Match const & m2)
+// {
+// // if (m1.qryId != m2.qryId)
+// // return false;
+// SEQAN_ASSERT_EQ(m1.qryId, m2.qryId);
+// if (m1.qryStart != m2.qryStart)
+// return false;
+//
+// if (m1.qryStart > m2.qryStart)
+// return false;
+// if (m1.subjStart > m2.subjStart)
+// return false;
+// if (m1.qryEnd < m2.qryEnd)
+// return false;
+// if (m1.subjEnd < m2.subjEnd)
+// return false;
+// return true;
+// }
+//
+// inline void
+// mergeUnto(Match & m1, Match const & m2)
+// {
+// SEQAN_ASSERT_EQ(m1.qryId, m2.qryId);
+// SEQAN_ASSERT_EQ(m1.subjId, m2.subjId);
+// m1.qryStart = std::min(m1.qryStart, m2.qryStart);
+// m1.qryEnd = std::max(m1.qryEnd, m2.qryEnd);
+// m1.subjStart = std::min(m1.subjStart,m2.subjStart);
+// m1.subjEnd = std::max(m1.subjEnd, m2.subjEnd);
+// }
+
+
+// inline void
+// _printMatch(Match const & m)
+// {
+// std::cout << "MATCH Query " << m.qryId
+// << "(" << m.qryStart << ", " << m.qryEnd
+// << ") on Subject "<< m.subjId
+// << "(" << m.subjStart << ", " << m.subjEnd
+// << ")" << std::endl << std::flush;
+// }
+
+
+
+
+
+#endif // header guard
diff --git a/src/misc.hpp b/src/misc.hpp
new file mode 100644
index 0000000..da5ac74
--- /dev/null
+++ b/src/misc.hpp
@@ -0,0 +1,505 @@
+// ==========================================================================
+// lambda
+// ==========================================================================
+// Copyright (c) 2013-2015, Hannes Hauswedell, FU Berlin
+// All rights reserved.
+//
+// This file is part of Lambda.
+//
+// Lambda is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// Lambda is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with Lambda. If not, see <http://www.gnu.org/licenses/>.*/
+// ==========================================================================
+// Author: Hannes Hauswedell <hannes.hauswedell @ fu-berlin.de>
+// ==========================================================================
+// store.h: contains types and definitions for storing sequences and indices
+// ==========================================================================
+
+#ifndef SEQAN_LAMBDA_MISC_H_
+#define SEQAN_LAMBDA_MISC_H_
+
+#include <type_traits>
+#include <forward_list>
+
+#include <seqan/basic.h>
+#include <seqan/sequence.h>
+
+#include <seqan/index.h>
+
+#include <seqan/align.h>
+#include <seqan/blast.h>
+// #include <seqan/reduced_aminoacid.h>
+
+#include "options.hpp"
+
+using namespace seqan;
+
+// ============================================================================
+// Forwards
+// ============================================================================
+
+// ============================================================================
+// Metafunctions
+// ============================================================================
+
+// makes partial function specialization convenient
+template <bool condition>
+using MyEnableIf = typename std::enable_if<condition, int>::type;
+
+// ============================================================================
+// Functions for translation and retranslation
+// ============================================================================
+
+// template <typename TString, typename TSpec>
+// inline void
+// assign(StringSet<ModifiedString<TString, TSpec>, Owner<ConcatDirect<>> > & target,
+// StringSet<TString, Owner<ConcatDirect<>> > & source)
+// {
+// target.limits = source.limits;
+// target.concat._host = &source.concat;
+// }
+
+/*
+template <typename T>
+inline uint64_t
+length(std::deque<T> const & list)
+{
+ return list.size();
+}
+template <typename T>
+inline uint64_t
+length(std::forward_list<T> const & list)
+{
+ return std::distance(list.begin(), list.end());
+}*/
+
+template <typename TAlph>
+inline std::basic_ostream<char> &
+operator<<(std::basic_ostream<char> & out,
+ const Iter<const String<SimpleType<unsigned char,TAlph>,
+ seqan::Packed<> >,
+ seqan::Packed<> > it)
+{
+ out << *it;
+ return out;
+}
+
+
+template <typename T1, typename T2>
+inline uint64_t
+quickHamming(T1 const & s1, T2 const & s2)
+{
+ SEQAN_ASSERT_EQ(length(s1), length(s2));
+
+ uint64_t ret = 0;
+
+ for (uint64_t i = 0; i < length(s1); ++i)
+ if (s1[i] != s2[i])
+ ++ret;
+
+ return ret;
+}
+
+template <typename TPos>
+inline bool
+inRange(TPos const i, TPos const beg, TPos const end)
+{
+ return ((i >= beg) && (i < end));
+}
+
+inline int64_t
+intervalOverlap(uint64_t const s1, uint64_t const e1,
+ uint64_t const s2, uint64_t const e2)
+{
+ return std::min(e1, e2) - std::max(s1, s2);
+}
+
+inline void
+printProgressBar(uint64_t & lastPercent, uint64_t curPerc)
+{
+ //round down to even
+ curPerc = curPerc & ~1;
+// #pragma omp critical(stdout)
+ if ((curPerc > lastPercent) && (curPerc <= 100))
+ {
+ for (uint64_t i = lastPercent + 2; i <= curPerc; i+=2)
+ {
+ if (i == 100)
+ std::cout << "|\n" << std::flush;
+ else if (i % 10 == 0)
+ std::cout << ":" << std::flush;
+ else
+ std::cout << "." << std::flush;
+ }
+ lastPercent = curPerc;
+ }
+}
+
+
+template <typename TSource0, typename TGapsSpec0,
+ typename TSource1, typename TGapsSpec1,
+ typename TScoreValue, typename TScoreSpec,
+ typename TAlignContext>
+inline TScoreValue
+localAlignment2(Gaps<TSource0, TGapsSpec0> & row0,
+ Gaps<TSource1, TGapsSpec1> & row1,
+ Score<TScoreValue, TScoreSpec> const & scoringScheme,
+ int const lowerDiag,
+ int const upperDiag,
+ TAlignContext & alignContext)
+{
+ clear(alignContext.traceSegment);
+
+ typedef FreeEndGaps_<True, True, True, True> TFreeEndGaps;
+ typedef AlignConfig2<LocalAlignment_<>,
+ DPBand,
+ TFreeEndGaps,
+ TracebackOn<TracebackConfig_<CompleteTrace,
+ GapsLeft> > > TAlignConfig;
+
+ TScoreValue score;
+ DPScoutState_<Default> scoutState;
+ score = _setUpAndRunAlignment(alignContext.dpContext,
+ alignContext.traceSegment,
+ scoutState,
+ row0,
+ row1,
+ scoringScheme,
+ TAlignConfig(lowerDiag, upperDiag));
+
+ _adaptTraceSegmentsTo(row0, row1, alignContext.traceSegment);
+ return score;
+}
+
+// ----------------------------------------------------------------------------
+// Tag Truncate_ (private tag to signify truncating of IDs while reading)
+// ----------------------------------------------------------------------------
+
+struct Truncate_;
+
+// ----------------------------------------------------------------------------
+// Function readRecord(Fasta); an overload that truncates Ids at first Whitespace
+// ----------------------------------------------------------------------------
+
+template <typename TSeqStringSet, typename TSpec, typename TSize>
+inline void
+readRecords(TCDStringSet<String<char, Alloc<Truncate_>>> & meta,
+ TSeqStringSet & seq,
+ FormattedFile<Fastq, Input, TSpec> & file,
+ TSize maxRecords)
+{
+ typedef typename SeqFileBuffer_<TSeqStringSet, TSpec>::Type TSeqBuffer;
+
+ TSeqBuffer seqBuffer;
+ IsWhitespace func;
+
+ // reuse the memory of context(file).buffer for seqBuffer (which has a different type but same sizeof(Alphabet))
+ swapPtr(seqBuffer.data_begin, context(file).buffer[1].data_begin);
+ swapPtr(seqBuffer.data_end, context(file).buffer[1].data_end);
+ seqBuffer.data_capacity = context(file).buffer[1].data_capacity;
+
+ for (; !atEnd(file) && maxRecords > 0; --maxRecords)
+ {
+ readRecord(context(file).buffer[0], seqBuffer, file);
+ for (size_t i = 0; i < length(context(file).buffer[0]); ++i)
+ {
+ if (func(context(file).buffer[0][i]))
+ {
+ resize(context(file).buffer[0], i);
+ break;
+ }
+ }
+ appendValue(meta, context(file).buffer[0]);
+ appendValue(seq, seqBuffer);
+ }
+
+ swapPtr(seqBuffer.data_begin, context(file).buffer[1].data_begin);
+ swapPtr(seqBuffer.data_end, context(file).buffer[1].data_end);
+ context(file).buffer[1].data_capacity = seqBuffer.data_capacity;
+ seqBuffer.data_capacity = 0;
+}
+
+// ----------------------------------------------------------------------------
+// Generic Sequence loading
+// ----------------------------------------------------------------------------
+
+template <typename TSpec1,
+ typename TSpec2,
+ typename TFile>
+inline int
+myReadRecords(TCDStringSet<String<char, TSpec1>> & ids,
+ TCDStringSet<String<Dna5, TSpec2>> & seqs,
+ TFile & file)
+{
+ TCDStringSet<String<Iupac>> tmpSeqs; // all IUPAC nucleic acid characters are valid input
+ try
+ {
+ readRecords(ids, tmpSeqs, file);
+ }
+ catch(ParseError const & e)
+ {
+ std::cerr << "\nParseError thrown: " << e.what() << '\n'
+ << "Make sure that the file is standards compliant. If you get an unexpected character warning "
+ << "make sure you have set the right program parameter (-p), i.e. "
+ << "Lambda expected nucleic acid alphabet, maybe the file was protein?\n";
+ return -1;
+ }
+
+ seqs = tmpSeqs; // convert IUPAC alphabet to Dna5
+
+ return 0;
+}
+
+template <typename TSpec1,
+ typename TSpec2,
+ typename TFile>
+inline int
+myReadRecords(TCDStringSet<String<char, TSpec1>> & ids,
+ TCDStringSet<String<AminoAcid, TSpec2>> & seqs,
+ TFile & file)
+{
+ try
+ {
+ readRecords(ids, seqs, file);
+ }
+ catch(ParseError const & e)
+ {
+ std::cerr << "\nParseError thrown: " << e.what() << '\n'
+ << "Make sure that the file is standards compliant.\n";
+ return -1;
+ }
+
+ if (length(seqs) > 0)
+ {
+ // warn if sequences look like DNA
+ if (CharString(String<Dna5>(CharString(seqs[0]))) == CharString(seqs[0]))
+ std::cout << "\nWarning: The first query sequence looks like nucleic acid, but amino acid is expected.\n"
+ " Make sure you have set the right program parameter (-p).\n";
+ }
+
+ return 0;
+}
+
+// ----------------------------------------------------------------------------
+// truncate sequences
+// ----------------------------------------------------------------------------
+
+// template <typename TString, typename TSpec>
+// inline void
+// _debug_shorten(StringSet<TString, TSpec > & seqs, unsigned const len)
+// {
+// StringSet<TString, TSpec > copySeqs;
+// reserve(copySeqs.concat, length(seqs)*len, Exact());
+//
+// for (TString const & s : seqs)
+// if (length(s) >= len)
+// appendValue(copySeqs, prefix(s, len), Exact());
+//
+// clear(seqs);
+// reserve(seqs.concat, length(copySeqs)*len, Exact());
+// for (TString const & s : copySeqs)
+// appendValue(seqs, s);
+// }
+
+
+// ----------------------------------------------------------------------------
+// print if certain verbosity is set
+// ----------------------------------------------------------------------------
+
+
+template <typename T>
+inline void
+myPrintImpl(SharedOptions const & /**/,
+ T const & first)
+{
+ std::cout << first;
+}
+
+inline void
+myPrintImpl(SharedOptions const & options,
+ std::stringstream const & first)
+{
+ std::string str = first.str();
+// std::cerr << "terminal cols: " << options.terminalCols
+// << " str.size() " << str.size() << "\n";
+ if (options.isTerm && (str.size() >= (options.terminalCols -12)))
+ std::cout << str.substr(str.size()-options.terminalCols+12,
+ options.terminalCols);
+ else
+ std::cout << str;
+}
+
+template <typename T, typename ... Args>
+inline void
+myPrintImpl(SharedOptions const & options,
+ T const & first,
+ Args const & ... args)
+{
+ myPrintImpl(options, first);
+ myPrintImpl(options, args...);
+}
+
+template <typename ... Args>
+inline void
+myPrintImplThread(SharedOptions const & options,
+// T const & first,
+ Args const & ... args)
+{
+ #pragma omp critical(stdout)
+ {
+// std::cout << "\033[" << omp_get_thread_num() << "B";
+// std::cout << "\033E";
+ if (options.isTerm)
+ {
+ for (unsigned char i=0; i< omp_get_thread_num(); ++i)
+ std::cout << std::endl;
+ std::cout << "\033[K";
+ }
+ std::cout << "Thread " << std::setw(3) << omp_get_thread_num() << "| ";
+
+ myPrintImpl(options, args...);
+ std::cout << "\n" << std::flush;
+ if (options.isTerm)
+ std::cout << "\033[" << omp_get_thread_num()+1 << "A";
+ }
+}
+
+template <typename... Args>
+inline void
+myPrint(SharedOptions const & options, const int verbose, Args const &... args)
+{
+ if (options.verbosity >= verbose)
+ {
+ #if defined(_OPENMP)
+ if (omp_in_parallel())
+ myPrintImplThread(options, args...);
+ else
+ #endif
+ myPrintImpl(options, args...);
+
+ std::cout << std::flush;
+ }
+}
+
+template <typename T>
+inline void
+appendToStatusImpl(std::stringstream & status,
+ T const & first)
+{
+ status << first;
+}
+
+template <typename T, typename ... Args>
+inline void
+appendToStatusImpl(std::stringstream & status,
+ T const & first,
+ Args const & ... args)
+{
+ appendToStatusImpl(status, first);
+ appendToStatusImpl(status, args...);
+}
+
+template <typename... Args>
+inline void
+appendToStatus(std::stringstream & status,
+ LambdaOptions const & options,
+ const int verbose,
+ Args const & ... args)
+{
+ if (options.verbosity >= verbose)
+ appendToStatusImpl(status, args...);
+}
+
+// ----------------------------------------------------------------------------
+// remove tag type
+// ----------------------------------------------------------------------------
+
+// template <typename T>
+// T unTag(Tag<T> const & /**/)
+// {
+// return T();
+// }
+
+// ----------------------------------------------------------------------------
+// get plus-minus-range with bounds-checking for unsigned types
+// ----------------------------------------------------------------------------
+
+// template <typename TNum, typename TNum2>
+// inline TNum
+// _protectUnderflow(const TNum n, const TNum2 s)
+// {
+// const TNum r = n -s;
+// return std::min(r, n);
+// }
+//
+// template <typename TNum, typename TNum2>
+// inline TNum
+// _protectOverflow(const TNum n, const TNum2 s)
+// {
+// const TNum r = n + s;
+// return std::max(r, n);
+// }
+//
+// template <typename TGaps>
+// inline bool
+// _startsWithGap(TGaps const & gaps)
+// {
+// SEQAN_ASSERT(length(gaps._array) > 0);
+// return (gaps._array[0] != 0);
+// }
+//
+// template <typename TGaps>
+// inline int
+// _endsWithGap(TGaps const & gaps)
+// {
+// SEQAN_ASSERT(length(gaps._array) > 0);
+// if ((length(gaps._array)-1) % 2 == 1)
+// return -1;
+// return ((gaps._array[length(gaps._array)-1] != 0) ? 1 : 0);
+// }
+//
+// template <typename TGaps, typename TSeq>
+// inline void
+// _prependNonGaps(TGaps & gaps, TSeq const & seq)
+// {
+// if (_startsWithGap(gaps))
+// {
+// insertValue(gaps._array, 0, length(seq)); // new non-gap column
+// insertValue(gaps._array, 0, 0); // empty gaps column
+// }
+// else
+// {
+// gaps._array[1] += length(seq);
+// }
+//
+// insert(value(gaps._source), 0, seq);
+// setBeginPosition(gaps, 0);
+// }
+//
+// template <typename TGaps, typename TSeq>
+// inline void
+// _appendNonGaps(TGaps & gaps, TSeq const & seq)
+// {
+// switch (_endsWithGap(gaps))
+// {
+// case -1:
+// case 1:
+// appendValue(gaps._array, length(seq)); // new non-gap column
+// break;
+// case 0:
+// gaps._array[1] += length(seq);
+// }
+// append(value(gaps._source), seq);
+// setEndPosition(gaps, length(value(gaps._source)));
+// }
+
+#endif // header guard
diff --git a/src/options.hpp b/src/options.hpp
new file mode 100644
index 0000000..5227bb4
--- /dev/null
+++ b/src/options.hpp
@@ -0,0 +1,1378 @@
+// ==========================================================================
+// lambda
+// ==========================================================================
+// Copyright (c) 2013-2015, Hannes Hauswedell, FU Berlin
+// All rights reserved.
+//
+// This file is part of Lambda.
+//
+// Lambda is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// Lambda is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with Lambda. If not, see <http://www.gnu.org/licenses/>.
+// ==========================================================================
+// Author: Hannes Hauswedell <hannes.hauswedell @ fu-berlin.de>
+// ==========================================================================
+// options.h: contains the options and argument parser
+// ==========================================================================
+
+
+#ifndef SEQAN_LAMBDA_OPTIONS_H_
+#define SEQAN_LAMBDA_OPTIONS_H_
+
+#include <cstdio>
+#include <unistd.h>
+#include <bitset>
+
+#include <seqan/basic.h>
+#include <seqan/translation.h>
+#include <seqan/arg_parse.h>
+#include <seqan/index.h>
+#include "output.hpp"
+
+// ==========================================================================
+// Metafunctions
+// ==========================================================================
+
+// suffix array overloads
+namespace SEQAN_NAMESPACE_MAIN
+{
+
+template<typename TSpec1, typename TSpec2, typename TSpec3>
+struct SAValue<StringSet<String<ReducedAminoAcid<TSpec1>, TSpec2>, TSpec3> >
+{
+ typedef Pair<uint32_t, uint16_t, Pack> Type;
+};
+
+template<typename TSpec2, typename TSpec3, typename TFunctor>
+struct SAValue<StringSet<ModifiedString<String<AminoAcid, TSpec2>, TFunctor>, TSpec3> >
+{
+ typedef Pair<uint32_t, uint16_t, Pack> Type;
+};
+
+template<typename TSpec2, typename TSpec3>
+struct SAValue<StringSet<String<AminoAcid, TSpec2>, TSpec3> >
+{
+ typedef Pair<uint32_t, uint16_t, Pack> Type;
+};
+
+// Dna Sequences might be longer (chromosomes, genomes)
+template<typename TSpec1, typename TSpec2>
+struct SAValue<StringSet<String<Dna5, TSpec1>, TSpec2> >
+{
+ typedef Pair<uint32_t, uint32_t, Pack> Type;
+};
+
+template <typename TString, typename TSpec>
+struct DefaultIndexStringSpec<StringSet<TString, TSpec>>
+{
+#if !defined(LAMBDA_INDEXER) && defined(LAMBDA_MMAPPED_DB)
+ using Type = MMap<>;
+#else
+ using Type = Alloc<>;
+#endif
+};
+
+// our custom Bam Overload
+template <typename TDirection, typename TStorageSpec>
+struct FormattedFileContext<FormattedFile<Bam, TDirection, BlastTabular>, TStorageSpec>
+{
+ typedef StringSet<Segment<String<char, MMap<> >, InfixSegment> > TNameStore;
+ typedef NameStoreCache<TNameStore> TNameStoreCache;
+ typedef BamIOContext<TNameStore, TNameStoreCache, TStorageSpec> Type;
+};
+
+}
+
+using namespace seqan;
+
+// Index Specs
+struct LambdaFMIndexConfig
+{
+ using LengthSum = size_t;
+#if !defined(LAMBDA_INDEXER) && defined(LAMBDA_MMAPPED_DB)
+ using TAlloc = MMap<>;
+#else
+ using TAlloc = Alloc<>;
+#endif
+ using Bwt = WaveletTree<void, WTRDConfig<LengthSum, TAlloc> >;
+ using Sentinels = Levels<void, LevelsRDConfig<LengthSum, TAlloc> >;
+
+ static const unsigned SAMPLING = 10;
+};
+
+template <typename TSpec = void>
+using TFMIndex = FMIndex<TSpec, LambdaFMIndexConfig>;
+
+// lazy...
+template <typename TString>
+using TCDStringSet = StringSet<TString, Owner<ConcatDirect<> > >;
+
+template <BlastProgram p>
+using OrigQryAlph = typename std::conditional<
+ (p == BlastProgram::BLASTN) ||
+ (p == BlastProgram::BLASTX) ||
+ (p == BlastProgram::TBLASTX),
+ Dna5,
+ AminoAcid>::type;
+
+template <BlastProgram p>
+using OrigSubjAlph = typename std::conditional<
+ (p == BlastProgram::BLASTN) ||
+ (p == BlastProgram::TBLASTN) ||
+ (p == BlastProgram::TBLASTX),
+ Dna5,
+ AminoAcid>::type;
+
+template <BlastProgram p>
+using TransAlph = typename std::conditional<(p == BlastProgram::BLASTN),
+ Dna5,
+ AminoAcid>::type;
+
+
+template <BlastProgram p, typename TRedAlph_>
+using RedAlph = typename std::conditional<(p == BlastProgram::BLASTN),
+ Dna5,
+ TRedAlph_>::type;
+
+template <typename TString>
+void getCwd(TString & string)
+{
+ char cwd[1000];
+
+#ifdef PLATFORM_WINDOWS
+ _getcwd(cwd, 1000);
+#else
+ getcwd(cwd, 1000);
+#endif
+
+ assign(string, cwd);
+}
+
+template <typename TString, typename TValue>
+bool setEnv(TString const & key, TValue & value)
+{
+#ifdef PLATFORM_WINDOWS
+ return !_putenv_s(toCString(key), toCString(value));
+#else
+ return !setenv(toCString(key), toCString(value), true);
+#endif
+}
+
+// ==========================================================================
+// Classes
+// ==========================================================================
+
+// --------------------------------------------------------------------------
+// Class LambdaOptions
+// --------------------------------------------------------------------------
+
+// This struct stores the options from the command line.
+
+struct SharedOptions
+{
+ // Verbosity level. 0 -- quiet, 1 -- normal, 2 -- verbose, 3 -- very verbose.
+ int verbosity = 1;
+
+ std::string commandLine;
+
+ std::string dbFile;
+
+ int dbIndexType = 0;
+ // for indexer, the file format of database sequences
+ // for main app, the file format of query sequences
+ // 0 -- fasta, 1 -- fastq
+// int fileFormat = 0;
+
+ int alphReduction = 0;
+
+ GeneticCodeSpec geneticCode = CANONICAL;
+
+ BlastProgram blastProgram = BlastProgram::BLASTX;
+
+ bool isTerm = true;
+ unsigned terminalCols = 80;
+
+ unsigned threads = 1;
+
+ SharedOptions()
+ {
+ isTerm = isTerminal();
+ if (isTerm)
+ {
+ unsigned _rows;
+ getTerminalSize(terminalCols, _rows);
+ }
+ }
+};
+
+
+struct LambdaOptions : public SharedOptions
+{
+
+ std::string queryFile;
+ bool revComp = true;
+
+ int outFileFormat; // 0 = BLAST, 1 = SAM, 2 = BAM
+ std::string output;
+ std::vector<BlastMatchField<>::Enum> columns;
+ std::string outputBam;
+ std::bitset<64> samBamTags;
+ bool samWithRefHeader;
+ unsigned samBamSeq;
+ bool versionInformationToOutputFile;
+
+ unsigned queryPart = 0;
+
+// bool semiGlobal;
+
+ bool doubleIndexing = true;
+
+ unsigned seedLength = 0;
+ unsigned maxSeedDist = 1;
+ bool hammingOnly = true;
+
+ int seedGravity = 0;
+ unsigned seedOffset = 0;
+ unsigned minSeedLength = 0;
+
+// unsigned int minSeedEVal = 0;
+// double minSeedBitS = -1;
+
+ // 0 = manual, positive X = blosumX, negative Y = pamY
+ int scoringMethod = 62;
+ // scores
+ int gapOpen = -11;
+ int gapExtend = -1;
+ int match = 0; // only for manual
+ int misMatch = 0; // only for manual
+
+ int xDropOff = 0;
+ int band = -1;
+ double eCutOff = 0;
+ int idCutOff = 0;
+ unsigned long maxMatches = 500;
+
+ bool filterPutativeDuplicates = true;
+ bool filterPutativeAbundant = true;
+
+ int preScoring = 0; // 0 = off, 1 = seed, 2 = region (
+ double preScoringThresh = 0.0;
+
+ LambdaOptions() :
+ SharedOptions()
+ {
+ }
+};
+
+struct LambdaIndexerOptions : public SharedOptions
+{
+ std::string segFile = "";
+ std::string algo = "";
+
+ bool truncateIDs;
+
+ LambdaIndexerOptions()
+ : SharedOptions()
+ {}
+};
+
+// ==========================================================================
+// Functions
+// ==========================================================================
+
+// --------------------------------------------------------------------------
+// Function displayCopyright()
+// --------------------------------------------------------------------------
+
+void
+sharedSetup(ArgumentParser & parser)
+{
+ // Set short description, version, and date.
+ std::string versionString = std::string(SEQAN_APP_VERSION) + " (Git commit " +
+ std::string(SEQAN_REVISION) + ")";
+ setVersion(parser, versionString);
+ setDate(parser, __DATE__);
+ setShortCopyright(parser, "2013-2015 Hannes Hauswedell; released under the GNU GPL v3 (or later).");
+
+ setCitation(parser, "Hauswedell et al (2014); doi: 10.1093/bioinformatics/btu439");
+
+ setLongCopyright(parser,
+ " Copyright (c) 2013-2015, Hannes Hauswedell, FU Berlin\n"
+ " All rights reserved.\n"
+ "\n"
+ " Lambda is free software: you can redistribute it and/or modify\n"
+ " it under the terms of the GNU General Public License as published by\n"
+ " the Free Software Foundation, either version 3 of the License, or\n"
+ " (at your option) any later version.\n"
+ "\n"
+ " Lambda is distributed in the hope that it will be useful,\n"
+ " but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
+ " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"
+ " GNU General Public License for more details.\n"
+ "\n"
+ " You should have received a copy of the GNU General Public License\n"
+ " along with Lambda. If not, see <http://www.gnu.org/licenses/>.\n");
+
+ addOption(parser, ArgParseOption("v", "verbosity",
+ "Display more/less diagnostic output during operation: 0 [only errors]; 1 [default]; 2 "
+ "[+run-time, options and statistics].",
+ ArgParseArgument::INTEGER));
+ setDefaultValue(parser, "verbosity", "1");
+ setMinValue(parser, "verbosity", "0");
+ setMaxValue(parser, "verbosity", "2");
+
+ addDescription(parser, "Lambda is a local aligner optimized for many query "
+ "sequences and searches in protein space. It is compatible to BLAST, but "
+ "much faster than BLAST and many other comparable tools.");
+
+ addDescription(parser, "Detailed information is available in the wiki: "
+ "<https://github.com/seqan/lambda/wiki>");
+}
+
+// --------------------------------------------------------------------------
+// Function parseCommandLine()
+// --------------------------------------------------------------------------
+
+// SHARED
+ArgumentParser::ParseResult
+parseCommandLineShared(SharedOptions & options, ArgumentParser & parser);
+
+ArgumentParser::ParseResult
+parseCommandLine(LambdaOptions & options, int argc, char const ** argv)
+{
+ // save commandLine
+ for (int i = 0; i < argc; ++i)
+ options.commandLine += std::string(argv[i]) + " ";
+ eraseBack(options.commandLine);
+
+ // Setup ArgumentParser.
+ ArgumentParser parser("lambda");
+ // Set short description, version, and date.
+ setShortDescription(parser, "the Local Aligner for Massive Biological "
+ "DatA");
+
+ // Define usage line and long description.
+ addUsageLine(parser, "[\\fIOPTIONS\\fP] \\fI-q QUERY.fasta\\fP "
+ "\\fI-d DATABASE.fasta\\fP "
+ "[\\fI-o output.m8\\fP]");
+
+ sharedSetup(parser);
+
+ addSection(parser, "Input Options");
+ addOption(parser, ArgParseOption("q", "query",
+ "Query sequences.",
+ ArgParseArgument::INPUT_FILE,
+ "IN"));
+ setValidValues(parser, "query", toCString(concat(getFileExtensions(SeqFileIn()), ' ')));
+ setRequired(parser, "q");
+
+ addOption(parser, ArgParseOption("d", "database",
+ "Path to original database sequences (a precomputed index with .sa or .fm needs to exist!).",
+ ArgParseArgument::INPUT_FILE,
+ "IN"));
+ setValidValues(parser, "database", toCString(concat(getFileExtensions(SeqFileIn()), ' ')));
+ setRequired(parser, "d");
+
+ addOption(parser, ArgParseOption("di", "db-index-type",
+ "database index is in this format.",
+// "(auto means \"try sa first then fm\").",
+ ArgParseArgument::STRING,
+ "STR"));
+ setValidValues(parser, "db-index-type", "sa fm");
+ setDefaultValue(parser, "db-index-type", "fm");
+ setAdvanced(parser, "db-index-type");
+
+ addSection(parser, "Output Options");
+ addOption(parser, ArgParseOption("o", "output",
+ "File to hold reports on hits (.m* are blastall -m* formats; .m8 is tab-seperated, .m9 is tab-seperated with "
+ "with comments, .m0 is pairwise format).",
+ ArgParseArgument::OUTPUT_FILE,
+ "OUT"));
+ auto exts = getFileExtensions(BlastTabularFileOut<>());
+ append(exts, getFileExtensions(BlastReportFileOut<>()));
+ append(exts, getFileExtensions(BamFileOut()));
+ CharString extsConcat;
+ // remove .sam.bam, .sam.vcf.gz, .sam.tbi
+ for (auto const & ext : exts)
+ {
+ if ((!endsWith(ext, ".bam") || startsWith(ext, ".bam")) &&
+ (!endsWith(ext, ".vcf.gz")) &&
+ (!endsWith(ext, ".sam.tbi")))
+ {
+ append(extsConcat, ext);
+ appendValue(extsConcat, ' ');
+ }
+ }
+ setValidValues(parser, "output", toCString(extsConcat));
+ setDefaultValue(parser, "output", "output.m8");
+
+ addOption(parser, ArgParseOption("oc", "output-columns",
+ "Print specified column combination and/or order (.m8 and .m9 outputs only); call -oc help for more details.",
+ ArgParseArgument::STRING,
+ "STR"));
+ setDefaultValue(parser, "output-columns", "std");
+ setAdvanced(parser, "output-columns");
+
+ addOption(parser, ArgParseOption("id", "percent-identity",
+ "Output only matches above this threshold (checked before e-value "
+ "check).",
+ ArgParseArgument::INTEGER));
+ setDefaultValue(parser, "percent-identity", "0");
+ setMinValue(parser, "percent-identity", "0");
+ setMaxValue(parser, "percent-identity", "100");
+
+ addOption(parser, ArgParseOption("e", "e-value",
+ "Output only matches that score below this threshold.",
+ ArgParseArgument::DOUBLE));
+ setDefaultValue(parser, "e-value", "0.1");
+ setMinValue(parser, "e-value", "0");
+
+ addOption(parser, ArgParseOption("nm", "num-matches",
+ "Print at most this number of matches per query.",
+ ArgParseArgument::INTEGER));
+ setDefaultValue(parser, "num-matches", "500");
+ setMinValue(parser, "num-matches", "1");
+
+ addOption(parser, ArgParseOption("", "sam-with-refheader",
+ "BAM files require all subject names to be written to the header. For SAM this is not required, so Lambda does "
+ "not automatically do it to save space (especially for protein database this is a lot!). If you still want "
+ "them with SAM, e.g. for better BAM compatibility, use this option.",
+ ArgParseArgument::STRING,
+ "STR"));
+ setValidValues(parser, "sam-with-refheader", "on off");
+ setDefaultValue(parser, "sam-with-refheader", "off");
+ setAdvanced(parser, "sam-with-refheader");
+
+ addOption(parser, ArgParseOption("", "sam-bam-seq",
+ "Write matching DNA subsequence into SAM/BAM file (BLASTN). For BLASTX and TBLASTX the matching protein "
+ "sequence is \"untranslated\" and positions retransformed to the original sequence. For BLASTP and TBLASTN "
+ "there is no DNA sequence so a \"*\" is written to the SEQ column. The matching protein sequence can be "
+ "written as an optional tag, see --sam-bam-tags. If set to uniq than "
+ "the sequence is omitted iff it is identical to the previous match's subsequence.",
+ ArgParseArgument::STRING,
+ "STR"));
+ setValidValues(parser, "sam-bam-seq", "always uniq never");
+ setDefaultValue(parser, "sam-bam-seq", "uniq");
+ setAdvanced(parser, "sam-bam-seq");
+
+ addOption(parser, ArgParseOption("", "sam-bam-tags",
+ "Write the specified optional columns to the SAM/BAM file. Call --sam-bam-tags help for more details.",
+ ArgParseArgument::STRING,
+ "STR"));
+ setDefaultValue(parser, "sam-bam-tags", "AS NM ZE ZI ZF");
+ setAdvanced(parser, "sam-bam-tags");
+
+ addOption(parser, ArgParseOption("", "version-to-outputfile",
+ "Write the Lambda program tag and version number to the output file.",
+ ArgParseArgument::STRING,
+ "STR"));
+ setValidValues(parser, "version-to-outputfile", "on off");
+ setDefaultValue(parser, "version-to-outputfile", "on");
+ hideOption(parser, "version-to-outputfile");
+
+ addSection(parser, "General Options");
+#ifdef _OPENMP
+ addOption(parser, ArgParseOption("t", "threads",
+ "number of threads to run concurrently.",
+ ArgParseArgument::INTEGER));
+ setDefaultValue(parser, "threads", omp_get_max_threads());
+#else
+ addOption(parser, ArgParseOption("t", "threads",
+ "LAMBDA BUILT WITHOUT OPENMP; setting this option has no effect.",
+ ArgParseArgument::INTEGER));
+ setDefaultValue(parser, "threads", 1);
+#endif
+ setAdvanced(parser, "threads");
+
+ addOption(parser, ArgParseOption("qi", "query-index-type",
+ "controls double-indexing.",
+ ArgParseArgument::STRING));
+ setValidValues(parser, "query-index-type", "radix none");
+ setDefaultValue(parser, "query-index-type", "none");
+ setAdvanced(parser, "query-index-type");
+
+ addOption(parser, ArgParseOption("qp", "query-partitions",
+ "Divide the query into qp number of blocks before processing; should be"
+ " a multiple of the number of threads, defaults to one per thread. "
+ "Only used with double-indexing; strong influence on memory, see below.",
+ ArgParseArgument::INTEGER));
+#ifdef _OPENMP
+ setDefaultValue(parser, "query-partitions", omp_get_max_threads());
+#else
+ setDefaultValue(parser, "query-partitions", 1);
+#endif
+ hideOption(parser, "query-partitions"); // HIDDEN
+
+ addSection(parser, "Alphabets and Translation");
+ addOption(parser, ArgParseOption("p", "program",
+ "Blast Operation Mode.",
+ ArgParseArgument::STRING,
+ "STR"));
+ setValidValues(parser, "program", "blastn blastp blastx tblastn tblastx");
+ setDefaultValue(parser, "program", "blastx");
+
+// addOption(parser, ArgParseOption("qa", "query-alphabet",
+// "original alphabet of the query sequences",
+// ArgParseArgument::STRING,
+// "STR"));
+// setValidValues(parser, "query-alphabet", "dna5 aminoacid");
+// setDefaultValue(parser, "query-alphabet", "dna5");
+//
+// addOption(parser, ArgParseOption("da", "db-alphabet",
+// "original alphabet of the subject sequences",
+// ArgParseArgument::STRING,
+// "STR"));
+// setValidValues(parser, "db-alphabet", "dna5 aminoacid");
+// setDefaultValue(parser, "db-alphabet", "aminoacid");
+//
+// addOption(parser, ArgParseOption("sa", "seeding-alphabet",
+// "alphabet to use during seeding (reduction possible)",
+// ArgParseArgument::STRING,
+// "STR"));
+// setValidValues(parser, "seeding-alphabet", "dna5 aminoacid");
+// setDefaultValue(parser, "seeding-alphabet", "murphy10");
+
+ addOption(parser, ArgParseOption("g", "genetic-code",
+ "The translation table to use for nucl -> amino acid translation"
+ "(not for BlastN, BlastP). See "
+ "https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?mode=c"
+ " for ids (default is generic). Six frames are generated.",
+ ArgParseArgument::INTEGER));
+// setValidValues(parser, "alph", "0 10");
+ setDefaultValue(parser, "genetic-code", "1");
+ setAdvanced(parser, "genetic-code");
+
+ addOption(parser, ArgParseOption("ar", "alphabet-reduction",
+ "Alphabet Reduction for seeding phase (ignored for BLASTN).",
+ ArgParseArgument::STRING,
+ "STR"));
+ setValidValues(parser, "alphabet-reduction", "none murphy10");
+ setDefaultValue(parser, "alphabet-reduction", "murphy10");
+ setAdvanced(parser, "alphabet-reduction");
+
+ addSection(parser, "Seeding / Filtration");
+// addOption(parser, ArgParseOption("su",
+// "ungapped-seeds",
+// "allow only mismatches in seeds.",
+// ArgParseArgument::INTEGER));
+// setDefaultValue(parser, "ungapped-seeds", "1");
+
+ addOption(parser, ArgParseOption("sl", "seed-length",
+ "Length of the seeds (default = 14 for BLASTN).",
+ ArgParseArgument::INTEGER));
+ setDefaultValue(parser, "seed-length", "10");
+ setAdvanced(parser, "seed-length");
+
+ addOption(parser, ArgParseOption("so", "seed-offset",
+ "Offset for seeding (if unset = seed-length, non-overlapping; "
+ "default = 5 for BLASTN).",
+ ArgParseArgument::INTEGER));
+ setDefaultValue(parser, "seed-offset", "10");
+ setAdvanced(parser, "seed-offset");
+
+ addOption(parser, ArgParseOption("sd", "seed-delta",
+ "maximum seed distance.",
+ ArgParseArgument::INTEGER));
+ setDefaultValue(parser, "seed-delta", "1");
+ setAdvanced(parser, "seed-delta");
+
+ addOption(parser, ArgParseOption("sg", "seed-gravity",
+ "Seeds closer than this are merged into region (if unset = "
+ "seed-length).",
+ ArgParseArgument::INTEGER));
+ setDefaultValue(parser, "seed-gravity", "10");
+ hideOption(parser, "seed-gravity"); // HIDDEN
+
+ addOption(parser, ArgParseOption("sm", "seed-min-length",
+ "after postproc shorter seeds are discarded (if unset = seed-length).",
+ ArgParseArgument::INTEGER));
+ setDefaultValue(parser, "seed-min-length", "10");
+ hideOption(parser, "seed-min-length"); // HIDDEN
+
+ addSection(parser, "Miscellaneous Heuristics");
+
+ addOption(parser, ArgParseOption("ps", "pre-scoring",
+ "evaluate score of a region NUM times the size of the seed "
+ "before extension (0 -> no pre-scoring, 1 -> evaluate seed, n-> area "
+ "around seed, as well; default = 1 if no reduction is used).",
+ ArgParseArgument::INTEGER));
+ setMinValue(parser, "pre-scoring", "1");
+ setDefaultValue(parser, "pre-scoring", "2");
+ setAdvanced(parser, "pre-scoring");
+
+ addOption(parser, ArgParseOption("pt", "pre-scoring-threshold",
+ "minimum average score per position in pre-scoring region.",
+ ArgParseArgument::DOUBLE));
+ setDefaultValue(parser, "pre-scoring-threshold", "2");
+ setAdvanced(parser, "pre-scoring-threshold");
+
+ addOption(parser, ArgParseOption("pd", "filter-putative-duplicates",
+ "filter hits that will likely duplicate a match already found.",
+ ArgParseArgument::STRING));
+ setValidValues(parser, "filter-putative-duplicates", "on off");
+ setDefaultValue(parser, "filter-putative-duplicates", "on");
+ setAdvanced(parser, "filter-putative-duplicates");
+
+ addOption(parser, ArgParseOption("pa", "filter-putative-abundant",
+ "If the maximum number of matches per query are found already, "
+ "stop searching if the remaining realm looks unfeasable.",
+ ArgParseArgument::STRING));
+ setValidValues(parser, "filter-putative-abundant", "on off");
+ setDefaultValue(parser, "filter-putative-abundant", "on");
+ setAdvanced(parser, "filter-putative-abundant");
+
+// addOption(parser, ArgParseOption("se",
+// "seedminevalue",
+// "after postproc worse seeds are "
+// "discarded"
+// "(0 -> off).",
+// ArgParseArgument::INTEGER));
+// setDefaultValue(parser, "seedminevalue", "100000");
+
+// addOption(parser, ArgParseOption("sb",
+// "seedminbits",
+// "after postproc worse seeds are "
+// "discarded"
+// "(-1 -> off).",
+// ArgParseArgument::DOUBLE));
+// setDefaultValue(parser, "seedminbits", "-1");
+
+ addSection(parser, "Scoring");
+
+ addOption(parser, ArgParseOption("sc", "scoring-scheme",
+ "'62' for Blosum62 (default); '50' for Blosum50; '0' for manual "
+ "(default for BlastN)",
+ ArgParseArgument::INTEGER));
+ setDefaultValue(parser, "scoring-scheme", "62");
+ setAdvanced(parser, "scoring-scheme");
+
+ addOption(parser, ArgParseOption("ge", "score-gap",
+ "Score per gap character (default = -2 for BLASTN).",
+ ArgParseArgument::INTEGER));
+ setDefaultValue(parser, "score-gap", "-1");
+ setAdvanced(parser, "score-gap");
+
+ addOption(parser, ArgParseOption("go", "score-gap-open",
+ "Additional cost for opening gap (default = -5 for BLASTN).",
+ ArgParseArgument::INTEGER));
+ setDefaultValue(parser, "score-gap-open", "-11");
+ setAdvanced(parser, "score-gap-open");
+
+ addOption(parser, ArgParseOption("ma", "score-match",
+ "Match score (BLASTN or manual scoring)",
+ ArgParseArgument::INTEGER));
+ setDefaultValue(parser, "score-match", "2");
+ setAdvanced(parser, "score-match");
+
+ addOption(parser, ArgParseOption("mi", "score-mismatch",
+ "Mismatch score (BLASTN or manual scoring)",
+ ArgParseArgument::INTEGER));
+ setDefaultValue(parser, "score-mismatch", "-3");
+ setAdvanced(parser, "score-mismatch");
+
+ addSection(parser, "Extension");
+
+ addOption(parser, ArgParseOption("x", "x-drop",
+ "Stop Banded extension if score x below the maximum seen (-1 means no "
+ "xdrop).",
+ ArgParseArgument::INTEGER));
+ setDefaultValue(parser, "x-drop", "30");
+ setMinValue(parser, "x-drop", "-1");
+ setAdvanced(parser, "x-drop");
+
+ addOption(parser, ArgParseOption("b", "band",
+ "Size of the DP-band used in extension (-3 means log2 of query length; "
+ "-2 means sqrt of query length; -1 means full dp; n means band of size "
+ "2n+1)",
+ ArgParseArgument::INTEGER));
+ setDefaultValue(parser, "band", "-3");
+ setMinValue(parser, "band", "-3");
+ setAdvanced(parser, "band");
+
+ addTextSection(parser, "Tuning");
+ addText(parser, "Tuning the seeding parameters and (de)activating alphabet "
+ "reduction has a strong "
+ "influence on both speed and sensitivity. We recommend the "
+ "following alternative profiles for protein searches:");
+ addText(parser, "fast (high similarity): -ar none -sl 7 -sd 0");
+ addText(parser, "sensitive (lower similarity): -so 5");
+
+ addText(parser, "For further information see the wiki: <https://github.com/seqan/lambda/wiki>");
+// addTextSection(parser, "Speed VS memory requirements");
+// addText(parser, "Lambda requires approximately the following amount of RAM:"
+// " \033[1msize(queryFile) + size(dbIDs) + 2 * size(dbSeqs)\033[0m. "
+// "If you have more RAM, use double indexing and SA:\n"
+// "\033[1m-di sa -qi radix\033[0m "
+// "which will result in an additional speed-up of up to 30% "
+// "compared to the published version (you need to run the "
+// "indexer with \033[1m-di sa \033[0m, as well). The amount "
+// "of RAM required will be: "
+// "\033[1msize(queryFile) + size(dbIDs) + 7 * size(dbSeqs) + n\033[0m "
+// "where n grows slowly but linearly with input size. "
+// "Note that size(dbSeqs) refers to the total "
+// "sequence length and does not include IDs (so it is less "
+// "than the size of the file).");
+// addText(parser, "To save more RAM, you can define "
+// "LAMBDA_BITCOPMRESSED_STRINGS while compiling lambda. "
+// "This will reduce memory usage by about:"
+// " \033[1m0.3 * ( size(queryFile) + size(dbSeqs) )\033[0m,"
+// " but slow down lambda by about 10%.");
+
+ // Parse command line.
+ ArgumentParser::ParseResult res = parse(parser, argc, argv);
+
+ // Only extract options if the program will continue after parseCommandLine()
+ if (res != ArgumentParser::PARSE_OK)
+ return res;
+
+ // Options shared by lambda and its indexer
+ res = parseCommandLineShared(options, parser);
+ if (res != ArgumentParser::PARSE_OK)
+ return res;
+
+ std::string buffer;
+
+ // Extract option values.
+ getOptionValue(options.queryFile, parser, "query");
+// if (endsWith(options.queryFile, ".fastq") ||
+// endsWith(options.queryFile, ".fq"))
+// options.fileFormat = 1;
+// else
+// options.fileFormat = 0;
+
+ getOptionValue(options.output, parser, "output");
+ buffer = options.output;
+ if (endsWith(buffer, ".gz"))
+ buffer.resize(length(buffer) - 3);
+ else if (endsWith(buffer, ".bz2"))
+ buffer.resize(length(buffer) - 4);
+
+ if (endsWith(buffer, ".sam"))
+ options.outFileFormat = 1;
+ else if (endsWith(buffer, ".bam"))
+ options.outFileFormat = 2;
+ else
+ options.outFileFormat = 0;
+
+ clear(buffer);
+ getOptionValue(buffer, parser, "sam-with-refheader");
+ options.samWithRefHeader = (buffer == "on");
+
+ clear(buffer);
+ getOptionValue(buffer, parser, "sam-bam-seq");
+ if (buffer == "never")
+ options.samBamSeq = 0;
+ else if (buffer == "uniq")
+ options.samBamSeq = 1;
+ else
+ options.samBamSeq = 2;
+
+ clear(buffer);
+ getOptionValue(buffer, parser, "output-columns");
+ if (buffer == "help")
+ {
+ std::cout << "Please specify the columns in this format -oc 'column1 column2', i.e. space-seperated and "
+ << "enclosed in single quotes.\nThe specifiers are the same as in NCBI Blast, currently "
+ << "the following are supported:\n";
+ for (unsigned i = 0; i < length(BlastMatchField<>::implemented); ++i)
+ {
+ if (BlastMatchField<>::implemented[i])
+ {
+ std::cout << "\t" << BlastMatchField<>::optionLabels[i]
+ << (length(BlastMatchField<>::optionLabels[i]) >= 8 ? "\t" : "\t\t")
+ << BlastMatchField<>::descriptions[i] << "\n";
+ }
+ }
+ return ArgumentParser::PARSE_HELP;
+ }
+ else
+ {
+ StringSet<CharString> fields;
+ strSplit(fields, buffer, IsSpace(), false);
+ for (auto str : fields)
+ {
+ bool resolved = false;
+ for (unsigned i = 0; i < length(BlastMatchField<>::optionLabels); ++i)
+ {
+ if (BlastMatchField<>::optionLabels[i] == str)
+ {
+ appendValue(options.columns, static_cast<BlastMatchField<>::Enum>(i));
+ resolved = true;
+ break;
+ }
+ }
+ if (!resolved)
+ {
+ std::cerr << "Unknown column specifier \"" << str << "\". Please see -oc help for valid options.\n";
+ return ArgumentParser::PARSE_ERROR;
+ }
+ }
+ }
+ clear(buffer);
+
+ getOptionValue(buffer, parser, "sam-bam-tags");
+ if (buffer == "help")
+ {
+ std::cout << "Please specify the tags in this format -oc 'tag1 tag2', i.e. space-seperated and "
+ << "enclosed in quotes. The order of tags is not preserved.\nThe following specifiers are "
+ << "supported:\n";
+
+ for (auto const & c : SamBamExtraTags<>::keyDescPairs)
+ std::cout << "\t" << std::get<0>(c) << "\t" << std::get<1>(c) << "\n";
+
+ return ArgumentParser::PARSE_HELP;
+ }
+ else
+ {
+ StringSet<CharString> fields;
+ strSplit(fields, buffer, IsSpace(), false);
+ for (auto str : fields)
+ {
+ bool resolved = false;
+ for (unsigned i = 0; i < length(SamBamExtraTags<>::keyDescPairs); ++i)
+ {
+ if (std::get<0>(SamBamExtraTags<>::keyDescPairs[i]) == str)
+ {
+ options.samBamTags[i] = true;
+ resolved = true;
+ break;
+ }
+ }
+ if (!resolved)
+ {
+ std::cerr << "Unknown column specifier \"" << str
+ << "\". Please see \"--sam-bam-tags help\" for valid options.\n";
+ return ArgumentParser::PARSE_ERROR;
+ }
+ }
+ }
+ // if original is protein, than only write if explicitly asked for
+ if (((options.blastProgram == BlastProgram::BLASTP) || (options.blastProgram == BlastProgram::TBLASTN)) &&
+ (!options.samBamTags[SamBamExtraTags<>::Q_AA_CIGAR]))
+ options.samBamSeq = 0;
+
+ getOptionValue(buffer, parser, "version-to-outputfile");
+ options.versionInformationToOutputFile = (buffer == "on");
+
+ clear(buffer);
+ getOptionValue(options.seedLength, parser, "seed-length");
+ if ((!isSet(parser, "seed-length")) &&
+ (options.blastProgram == BlastProgram::BLASTN))
+ options.seedLength = 14;
+
+ if (isSet(parser, "seed-offset"))
+ getOptionValue(options.seedOffset, parser, "seed-offset");
+ else
+ options.seedOffset = options.seedLength;
+
+ if (isSet(parser, "seed-gravity"))
+ getOptionValue(options.seedGravity, parser, "seed-gravity");
+ else
+ options.seedGravity = options.seedLength;
+
+ if (isSet(parser, "seed-min-length"))
+ getOptionValue(options.minSeedLength, parser, "seed-min-length");
+ else
+ options.minSeedLength = options.seedLength;
+
+ getOptionValue(options.maxSeedDist, parser, "seed-delta");
+
+
+ getOptionValue(buffer, parser, "query-index-type");
+ options.doubleIndexing = (buffer == "radix");
+
+ getOptionValue(options.eCutOff, parser, "e-value");
+ getOptionValue(options.idCutOff, parser, "percent-identity");
+
+ getOptionValue(options.xDropOff, parser, "x-drop");
+// if ((!isSet(parser, "x-drop")) &&
+// (options.blastProgram == BlastProgram::BLASTN))
+// options.xDropOff = 16;
+
+ getOptionValue(options.band, parser, "band");
+
+ if (options.doubleIndexing)
+ {
+ if (isSet(parser, "query-partitions"))
+ getOptionValue(options.queryPart, parser, "query-partitions");
+ else
+ options.queryPart = options.threads;
+ if ((options.queryPart % options.threads) != 0)
+ std::cout << "-qp not a multiple of -t; expect suboptimal performance.\n";
+ } else
+ {
+ options.queryPart = 1;
+ }
+
+ getOptionValue(options.scoringMethod, parser, "scoring-scheme");
+ if (options.blastProgram == BlastProgram::BLASTN)
+ options.scoringMethod = 0;
+ switch (options.scoringMethod)
+ {
+ case 0:
+ getOptionValue(options.misMatch, parser, "score-mismatch");
+ getOptionValue(options.match, parser, "score-match");
+ break;
+ case 45: case 62: case 80: break;
+ default:
+ std::cerr << "Unsupported Scoring Scheme selected.\n";
+ return ArgumentParser::PARSE_ERROR;
+ }
+
+ getOptionValue(options.gapExtend, parser, "score-gap");
+ if ((!isSet(parser, "score-gap")) &&
+ (options.blastProgram == BlastProgram::BLASTN))
+ options.gapExtend = -2;
+
+ getOptionValue(options.gapOpen, parser, "score-gap-open");
+ if ((!isSet(parser, "score-gap-open")) &&
+ (options.blastProgram == BlastProgram::BLASTN))
+ options.gapOpen = -5;
+
+ getOptionValue(buffer, parser, "filter-putative-duplicates");
+ options.filterPutativeDuplicates = (buffer == "on");
+
+ getOptionValue(buffer, parser, "filter-putative-abundant");
+ options.filterPutativeAbundant = (buffer == "on");
+
+ // TODO always prescore 1
+ getOptionValue(options.preScoring, parser, "pre-scoring");
+ if ((!isSet(parser, "pre-scoring")) &&
+ (options.alphReduction == 0))
+ options.preScoring = 1;
+
+ getOptionValue(options.preScoringThresh, parser, "pre-scoring-threshold");
+// if (options.preScoring == 0)
+// options.preScoringThresh = 4;
+
+ int numbuf;
+ getOptionValue(numbuf, parser, "num-matches");
+ options.maxMatches = static_cast<unsigned long>(numbuf);
+
+ return ArgumentParser::PARSE_OK;
+}
+
+// INDEXER
+ArgumentParser::ParseResult
+parseCommandLine(LambdaIndexerOptions & options, int argc, char const ** argv)
+{
+ // Setup ArgumentParser.
+ ArgumentParser parser("lambda_indexer");
+
+ // Define usage line and long description.
+ addUsageLine(parser, "[\\fIOPTIONS\\fP] \\-d DATABASE.fasta\\fP");
+
+ sharedSetup(parser);
+
+ addDescription(parser, "This is the indexer_binary for creating a lambda-compatible databases.");
+
+ addSection(parser, "Input Options");
+ addOption(parser, ArgParseOption("d", "database",
+ "Database sequences.",
+ ArgParseArgument::INPUT_FILE,
+ "IN"));
+ setRequired(parser, "database");
+ setValidValues(parser, "database", toCString(concat(getFileExtensions(SeqFileIn()), ' ')));
+
+ addOption(parser, ArgParseOption("s",
+ "segfile",
+ "SEG intervals for database"
+ "(optional).",
+ ArgParseArgument::INPUT_FILE));
+
+ setValidValues(parser, "segfile", "seg");
+
+ addSection(parser, "Output Options");
+// addOption(parser, ArgParseOption("o",
+// "output",
+// "Index of database sequences",
+// ArgParseArgument::OUTPUT_FILE,
+// "OUT"));
+// setValidValues(parser, "output", "sa fm");
+
+ addOption(parser, ArgParseOption("di", "db-index-type",
+ "Suffix array or full-text minute space.",
+ ArgParseArgument::STRING,
+ "type"));
+ setValidValues(parser, "db-index-type", "sa fm");
+ setDefaultValue(parser, "db-index-type", "fm");
+ setAdvanced(parser, "db-index-type");
+
+ addOption(parser, ArgParseOption("", "truncate-ids",
+ "Truncate IDs at first whitespace. This saves a lot of space and is irrelevant for all LAMBDA output formats "
+ "other than BLAST Pairwise (.m0).",
+ ArgParseArgument::STRING,
+ "STR"));
+ setValidValues(parser, "truncate-ids", "on off");
+ setDefaultValue(parser, "truncate-ids", "on");
+
+ addSection(parser, "Alphabets and Translation");
+ addOption(parser, ArgParseOption("p", "program",
+ "Blast Operation Mode.",
+ ArgParseArgument::STRING,
+ "program"));
+ setValidValues(parser, "program", "blastn blastp blastx tblastn tblastx");
+ setDefaultValue(parser, "program", "blastx");
+ addOption(parser, ArgParseOption("g", "genetic-code",
+ "The translation table to use (not for BlastN, BlastP). See "
+ "https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?mode=c"
+ " for ids (default is generic).",
+ ArgParseArgument::INTEGER));
+ setDefaultValue(parser, "genetic-code", "1");
+ setAdvanced(parser, "genetic-code");
+
+ addOption(parser, ArgParseOption("ar", "alphabet-reduction",
+ "Alphabet Reduction for seeding phase (ignored for BLASTN).",
+ ArgParseArgument::STRING,
+ "STR"));
+ setValidValues(parser, "alphabet-reduction", "none murphy10");
+ setDefaultValue(parser, "alphabet-reduction", "murphy10");
+ setAdvanced(parser, "alphabet-reduction");
+
+ addSection(parser, "Algorithm");
+ addOption(parser, ArgParseOption("a", "algorithm",
+ "Algorithm for SA construction (also used for FM; see Memory "
+ " Requirements below!).",
+ ArgParseArgument::STRING,
+ "STR"));
+ setValidValues(parser, "algorithm", "mergesort quicksortbuckets quicksort radixsort skew7ext");
+ setDefaultValue(parser, "algorithm", "radixsort");
+ setAdvanced(parser, "algorithm");
+
+#ifdef _OPENMP
+ addOption(parser, ArgParseOption("t", "threads",
+ "number of threads to run concurrently (ignored if a == skew7ext).",
+ ArgParseArgument::INTEGER));
+ setDefaultValue(parser, "threads", omp_get_max_threads());
+#else
+ addOption(parser, ArgParseOption("t", "threads",
+ "LAMBDA BUILT WITHOUT OPENMP; setting this option has no effect.",
+ ArgParseArgument::INTEGER));
+ setDefaultValue(parser, "threads", 1);
+#endif
+ setAdvanced(parser, "threads");
+
+ std::string tmpdir;
+ getCwd(tmpdir);
+ addOption(parser, ArgParseOption("td", "tmp-dir",
+ "temporary directory used by skew, defaults to working directory.",
+ ArgParseArgument::STRING,
+ "STR"));
+ setDefaultValue(parser, "tmp-dir", tmpdir);
+ setAdvanced(parser, "tmp-dir");
+
+ //TODO move manual / auto-detect
+// addTextSection(parser, "Memory requirements and Speed");
+// addText(parser, "\033[1mmergesort [RAM]:\033[0m"
+// "\t14 * size(dbSeqs)");
+// addText(parser, "\033[1mmergesort [speed]:\033[0m"
+// "\tup to t threads");
+// addText(parser, "\033[1mquicksort and quicksortbuckets [RAM]:\033[0m"
+// "\t7 * size(dbSeqs)");
+// addText(parser, "\033[1mquicksort [speed]:\033[0m"
+// "\t1-2 threads");
+// addText(parser, "\033[1mquicksortbuckets [speed]:\033[0m"
+// "\t1-2 threads for initial sort, up to t for buckets");
+// addText(parser, "\033[1mskew7ext [RAM]:\033[0m"
+// "\t2 * size(dbSeqs)");
+// addText(parser, "\033[1mskew7ext [DISK]:\033[0m"
+// "\t30 * size(dbSeqs)");
+// addText(parser, "\033[1mskew7ext [speed]:\033[0m"
+// "\tnot parallelized");
+// addText(parser, "size(dbSeqs) refers to the total "
+// "sequence length and does not include IDs (which can "
+// "account for >50% of the file size for protein databases). "
+// "The space is the maximum obseverved factor, for many "
+// "databases the factor is smaller." );
+// addText(parser, "Use mergesort if you have enough memory! If not, you will "
+// "probably want to use skew. For small databases and only a "
+// "few cores the quicksorts might be a good tradeoff. "
+// "mergesort and quicksortbuckets provide a rough progress "
+// "estimate.");
+// // addText(parser, "Disk space required is in TMPDIR which you can set as "
+// // "an environment variable.");
+
+ addTextSection(parser, "Remarks");
+ addText(parser, "Please see the wiki (<https://github.com/seqan/lambda/wiki>) for more information on which indexes"
+ " to chose and which algorithms to pick.");
+
+ addText(parser, "Note that the indexes created are binary and not compatible between different CPU endiannesses. "
+ "Also the on-disk format is still subject to change between Lambda versions.");
+
+ // Parse command line.
+ ArgumentParser::ParseResult res = parse(parser, argc, argv);
+
+ // Only extract options if the program will continue after parseCommandLine()
+ if (res != ArgumentParser::PARSE_OK)
+ return res;
+
+ // Options shared by lambda and its indexer
+ res = parseCommandLineShared(options, parser);
+ if (res != ArgumentParser::PARSE_OK)
+ return res;
+
+ // Extract option values
+ getOptionValue(options.segFile, parser, "segfile");
+ getOptionValue(options.algo, parser, "algorithm");
+ if ((options.algo == "mergesort") || (options.algo == "quicksort") || (options.algo == "quicksortbuckets"))
+ {
+ std::cerr << "WARNING: " << options.algo << " tag is deprecated and superseded by \"radixsort\", please "
+ << "adapt your program calls.\n";
+ options.algo = "radixsort";
+ }
+
+ getOptionValue(tmpdir, parser, "tmp-dir");
+ setEnv("TMPDIR", tmpdir);
+
+ std::string buffer;
+ getOptionValue(buffer, parser, "truncate-ids");
+ options.truncateIDs = (buffer == "on");
+
+ return ArgumentParser::PARSE_OK;
+}
+
+// SHARED
+ArgumentParser::ParseResult
+parseCommandLineShared(SharedOptions & options, ArgumentParser & parser)
+{
+ int buf = 0;
+ std::string buffer;
+
+ getOptionValue(options.dbFile, parser, "database");
+
+ getOptionValue(buffer, parser, "db-index-type");
+ if (buffer == "sa")
+ options.dbIndexType = 0;
+ else // if fm
+ options.dbIndexType = 1;
+
+ getOptionValue(buffer, parser, "program");
+ if (buffer == "blastn")
+ options.blastProgram = BlastProgram::BLASTN;
+ else if (buffer == "blastp")
+ options.blastProgram = BlastProgram::BLASTP;
+ else if (buffer == "blastx")
+ options.blastProgram = BlastProgram::BLASTX;
+ else if (buffer == "tblastn")
+ options.blastProgram = BlastProgram::TBLASTN;
+ else if (buffer == "tblastx")
+ options.blastProgram = BlastProgram::TBLASTX;
+ else
+ return ArgumentParser::PARSE_ERROR;
+
+ getOptionValue(buffer, parser, "alphabet-reduction");
+ if ((buffer == "murphy10") &&
+ (options.blastProgram != BlastProgram::BLASTN))
+ options.alphReduction = 2;
+ else
+ options.alphReduction = 0;
+
+ getOptionValue(buf, parser, "genetic-code");
+ switch (buf)
+ {
+ case 1: case 2: case 3: case 4: case 5: case 6:
+ case 9: case 10: case 11: case 12: case 13: case 14: case 15: case 16:
+ case 21: case 22: case 23: case 24 : case 25:
+ options.geneticCode = static_cast<GeneticCodeSpec>(buf);
+ break;
+ default:
+ std::cerr << "Invalid genetic code. See trans_table vars at "
+ << "https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?mode=c"
+ << std::endl;
+ return ArgumentParser::PARSE_ERROR;
+ }
+
+#ifdef _OPENMP
+ getOptionValue(options.threads, parser, "threads");
+ omp_set_num_threads(options.threads);
+#else
+ options.threads = 1;
+#endif
+
+ getOptionValue(buf, parser, "verbosity");
+ switch(buf)
+ {
+ case 0: options.verbosity = 0; break;
+ case 2: options.verbosity = 2; break;
+ default: options.verbosity = 1; break;
+ }
+
+ return ArgumentParser::PARSE_OK;
+}
+
+constexpr const char *
+_alphName(AminoAcid const & /**/)
+{
+ return "aminoacid";
+}
+
+constexpr const char *
+_alphName(ReducedAminoAcid<Murphy10> const & /**/)
+{
+ return "murphy10";
+}
+
+// constexpr const char *
+// _alphName(ReducedAminoAcid<ClusterReduction<8>> const & /**/)
+// {
+// return "lambda08";
+// }
+//
+// constexpr const char *
+// _alphName(ReducedAminoAcid<ClusterReduction<10>> const & /**/)
+// {
+// return "lambda10";
+// }
+//
+// constexpr const char *
+// _alphName(ReducedAminoAcid<ClusterReduction<12>> const & /**/)
+// {
+// return "lambda12";
+// }
+
+constexpr const char *
+_alphName(Dna const & /**/)
+{
+ return "dna4";
+}
+
+constexpr const char *
+_alphName(Dna5 const & /**/)
+{
+ return "dna5";
+}
+
+template <typename TLH>
+inline void
+printOptions(LambdaOptions const & options)
+{
+ using TGH = typename TLH::TGlobalHolder;
+
+ std::string bandStr;
+ switch(options.band)
+ {
+ case -3: bandStr = "2 * log(queryLength) + 1"; break;
+ case -2: bandStr = "2 * sqrt(queryLength) + 1"; break;
+ case -1: bandStr = "no band"; break;
+ default: bandStr = std::to_string(2 * options.band + 1); break;
+ }
+
+ std::cout << "OPTIONS\n"
+ << " INPUT\n"
+ << " query file: " << options.queryFile << "\n"
+ << " db file: " << options.dbFile << "\n"
+ << " db index type: " << (TGH::indexIsFM
+ ? "FM-Index\n"
+ : "SA-Index\n")
+ << " OUTPUT (file)\n"
+ << " output file: " << options.output << "\n"
+ << " minimum % identity: " << options.idCutOff << "\n"
+ << " maximum e-value: " << options.eCutOff << "\n"
+ << " max #matches per query: " << options.maxMatches << "\n"
+ << " include subj names in sam:" << options.samWithRefHeader << "\n"
+ << " include seq in sam/bam: " << options.samBamSeq << "\n"
+ << " OUTPUT (stdout)\n"
+ << " stdout is terminal: " << options.isTerm << "\n"
+ << " terminal width: " << options.terminalCols << "\n"
+ << " verbosity: " << options.verbosity << "\n"
+ << " GENERAL\n"
+ << " double indexing: " << options.doubleIndexing << "\n"
+ << " threads: " << uint(options.threads) << "\n"
+ << " query partitions: " << (options.doubleIndexing
+ ? std::to_string(options.queryPart)
+ : std::string("n/a")) << "\n"
+ << " TRANSLATION AND ALPHABETS\n"
+ << " genetic code: "
+ << ((TGH::blastProgram != BlastProgram::BLASTN) &&
+ (TGH::blastProgram != BlastProgram::BLASTP)
+ ? std::to_string(options.geneticCode)
+ : std::string("n/a")) << "\n"
+ << " blast mode: " << _programTagToString(TGH::blastProgram)
+ << "\n"
+ << " original alphabet (query):" << _alphName(OrigQryAlph<TGH::blastProgram>())
+ << "\n"
+ << " original alphabet (subj): " << _alphName(OrigSubjAlph<TGH::blastProgram>())
+ << "\n"
+ << " translated alphabet: " << _alphName(TransAlph<TGH::blastProgram>())
+ << "\n"
+ << " reduced alphabet: " << _alphName(typename TGH::TRedAlph())
+ << "\n"
+ << " SEEDING\n"
+ << " seed length: " << uint(options.seedLength) << "\n"
+ << " seed offset: " << uint(options.seedOffset) << "\n"
+ << " seed delta: " << uint(options.maxSeedDist) << "\n"
+ << " seeds ungapped: " << uint(options.hammingOnly) << "\n"
+ << " seed gravity: " << uint(options.seedGravity) << "\n"
+ << " min seed length: " << uint(options.minSeedLength) << "\n"
+ << " MISCELLANEOUS HEURISTICS\n"
+ << " pre-scoring: " << (options.preScoring
+ ? std::string("on")
+ : std::string("off")) << "\n"
+ << " pre-scoring-region: " << (options.preScoring
+ ? std::to_string(
+ options.preScoring *
+ options.seedLength)
+ : std::string("n/a")) << "\n"
+ << " pre-scoring-threshold: " << (options.preScoring
+ ? std::to_string(
+ options.preScoringThresh)
+ : std::string("n/a")) << "\n"
+ << " putative-abundancy: " << (options.filterPutativeAbundant
+ ? std::string("on")
+ : std::string("off")) << "\n"
+ << " putative-duplicates: " << (options.filterPutativeDuplicates
+ ? std::string("on")
+ : std::string("off")) << "\n"
+ << " SCORING\n"
+ << " scoring scheme: " << options.scoringMethod << "\n"
+ << " score-match: " << (options.scoringMethod
+ ? std::string("n/a")
+ : std::to_string(options.match)) << "\n"
+ << " score-mismatch: " << (options.scoringMethod
+ ? std::string("n/a")
+ : std::to_string(options.misMatch)) << "\n"
+ << " score-gap: " << options.gapExtend << "\n"
+ << " score-gap-open: " << options.gapOpen << "\n"
+ << " EXTENSION\n"
+ << " x-drop: " << options.xDropOff << "\n"
+ << " band: " << bandStr << "\n"
+ << " BUILD OPTIONS:\n"
+ << " cmake_build_type: " << std::string(CMAKE_BUILD_TYPE) << "\n"
+ << " fastbuild: "
+ #if defined(FASTBUILD)
+ << "on\n"
+ #else
+ << "off\n"
+ #endif
+ << " native_build: "
+ #if defined(LAMBDA_NATIVE_BUILD)
+ << "on\n"
+ #else
+ << "off\n"
+ #endif
+ << " static_build: "
+ #if defined(LAMBDA_STATIC_BUILD)
+ << "on\n"
+ #else
+ << "off\n"
+ #endif
+ << " mmapped_db: "
+ #if defined(LAMBDA_MMAPPED_DB)
+ << "on\n"
+ #else
+ << "off\n"
+ #endif
+ << "\n";
+}
+
+
+#endif // header guard
diff --git a/src/output.hpp b/src/output.hpp
new file mode 100644
index 0000000..4c2016e
--- /dev/null
+++ b/src/output.hpp
@@ -0,0 +1,578 @@
+// ==========================================================================
+// lambda
+// ==========================================================================
+// Copyright (c) 2013-2015, Hannes Hauswedell, FU Berlin
+// All rights reserved.
+//
+// This file is part of Lambda.
+//
+// Lambda is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// Lambda is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with Lambda. If not, see <http://www.gnu.org/licenses/>.
+// ==========================================================================
+// Author: Hannes Hauswedell <hannes.hauswedell @ fu-berlin.de>
+// ==========================================================================
+// output.hpp: contains routines for file-writing
+// ==========================================================================
+
+#ifndef SEQAN_LAMBDA_OUTPUT_H_
+#define SEQAN_LAMBDA_OUTPUT_H_
+
+#include <seqan/blast.h>
+#include <seqan/bam_io.h>
+
+using namespace seqan;
+
+template <typename TVoidSpec = void>
+struct SamBamExtraTags
+{
+ enum Enum
+ {
+// Q_START,
+// S_START,
+ E_VALUE,
+ BIT_SCORE,
+ SCORE,
+ P_IDENT,
+ P_POS,
+ Q_FRAME,
+ S_FRAME,
+ Q_AA_SEQ,
+ Q_AA_CIGAR,
+ EDIT_DISTANCE,
+ MATCH_COUNT
+ };
+
+ static constexpr const std::array<std::pair<const char*, const char*>, 11> keyDescPairs
+ {
+ {
+// { "ZS", "query start (in DNA if original was DNA)" }, // Q_START,
+// { "YS", "subject start (in DNA if original was DNA)" }, // S_START,
+ { "ZE", "expect value" }, // E_VALUE,
+ { "AS", "bit score" }, // BIT_SCORE,
+ { "ZR", "raw score" }, // SCORE,
+ { "ZI", "% identity (in protein space unless BLASTN) " }, // P_IDENT,
+ { "ZP", "% positive (in protein space unless BLASTN)"}, // P_POS,
+ { "ZF", "query frame" }, // Q_FRAME,
+ { "YF", "subject frame" }, // S_FRAME,
+ { "ZQ", "query protein sequence (* for BLASTN)"}, // Q_AA_SEQ,
+ { "OC", "query protein cigar (* for BLASTN)"}, // Q_AA_CIGAR,
+ { "NM", "edit distance (in protein space unless BLASTN)"}, // EDIT_DISTANCE
+ { "IH", "number of matches this query has"}, // MATCH_COUNT
+ }
+ };
+
+};
+
+template <typename TVoidSpec>
+constexpr const std::array<std::pair<const char*, const char*>, 11> SamBamExtraTags<TVoidSpec>::keyDescPairs;
+
+// ----------------------------------------------------------------------------
+// Function _untranslatedClipPositions()
+// ----------------------------------------------------------------------------
+
+// similar to _untranslatePositions() from the blast module
+template <typename TSequence1, typename TSequence2, typename TBlastMatch>
+inline void
+_untranslateSequence(TSequence1 & target,
+ TSequence2 const & source,
+ TBlastMatch const & m)
+{
+ if (m.qFrameShift >= 0)
+ {
+ target = infix(source,
+ 3 * m.qStart + std::abs(m.qFrameShift) - 1,
+ 3 * m.qEnd + std::abs(m.qFrameShift) - 1);
+ }
+ else
+ {
+ thread_local Dna5String buf;
+
+ buf = source;
+ reverseComplement(buf);
+ target = infix(buf,
+ 3 * m.qStart + std::abs(m.qFrameShift) - 1,
+ 3 * m.qEnd + std::abs(m.qFrameShift) - 1);
+ }
+}
+
+// ----------------------------------------------------------------------------
+// Function blastMatchToCigar() convert seqan align to cigar
+// ----------------------------------------------------------------------------
+
+template <typename TCigar, typename TBlastMatch, typename TLocalHolder>
+inline void
+blastMatchOneCigar(TCigar & cigar,
+ TBlastMatch const & m,
+ TLocalHolder const & lH)
+{
+ using TCElem = typename Value<TCigar>::Type;
+
+ SEQAN_ASSERT_EQ(length(m.alignRow0), length(m.alignRow1));
+
+ // hard clipping
+ unsigned leftClip = m.qStart;
+ unsigned rightClip = m.qEnd;
+
+ // translation factor
+ unsigned transFac = 1;
+ if (qIsTranslated(lH.gH.blastProgram))
+ {
+ transFac = 3;
+ leftClip = leftClip * 3 + std::abs(m.qFrameShift) - 1;
+ rightClip = rightClip * 3 + std::abs(m.qFrameShift) - 1;
+ }
+ // we want distance to end
+ rightClip = m.qLength - rightClip;
+
+ if (leftClip > 0)
+ appendValue(cigar, TCElem('H', leftClip));
+
+ for (unsigned i = 0, count = 0; i < length(m.alignRow0); /* incremented below */)
+ {
+ // deletion in query
+ count = 0;
+ while (isGap(m.alignRow0, i) && (i < length(m.alignRow0)))
+ {
+ ++count;
+ ++i;
+ }
+ if (count > 0)
+ appendValue(cigar, TCElem('D', count * transFac));
+
+ // insertion in query
+ count = 0;
+ while (isGap(m.alignRow1, i) && (i < length(m.alignRow0)))
+ {
+ ++count;
+ ++i;
+ }
+ if (count > 0)
+ appendValue(cigar, TCElem('I', count * transFac));
+
+ // match or mismatch
+ count = 0;
+ while ((!isGap(m.alignRow0, i)) && (!isGap(m.alignRow1, i)) && (i < length(m.alignRow0)))
+ {
+ ++count;
+ ++i;
+ }
+ if (count > 0)
+ appendValue(cigar, TCElem('M', count * transFac));
+ }
+
+ if (rightClip > 0)
+ appendValue(cigar, TCElem('H', rightClip));
+
+ if (m.qFrameShift < 0)
+ reverse(cigar);
+}
+
+// translation happened and we want both cigars
+template <typename TCigar, typename TBlastMatch, typename TLocalHolder>
+inline void
+blastMatchTwoCigar(TCigar & dnaCigar,
+ TCigar & protCigar,
+ TBlastMatch const & m,
+ TLocalHolder const &)
+{
+ using TCElem = typename Value<TCigar>::Type;
+
+ SEQAN_ASSERT_EQ(length(m.alignRow0), length(m.alignRow1));
+
+ if (m.qStart > 0)
+ {
+ appendValue(dnaCigar, TCElem('H', m.qStart * 3 + std::abs(m.qFrameShift) - 1));
+ appendValue(protCigar, TCElem('H', m.qStart));
+ }
+
+ for (unsigned i = 0, count = 0; i < length(m.alignRow0); /* incremented below */)
+ {
+ // deletion in query
+ count = 0;
+ while (isGap(m.alignRow0, i) && (i < length(m.alignRow0)))
+ {
+ ++count;
+ ++i;
+ }
+ if (count > 0)
+ {
+ appendValue(dnaCigar, TCElem('D', count * 3));
+ appendValue(protCigar, TCElem('D', count));
+ }
+
+ // insertion in query
+ count = 0;
+ while (isGap(m.alignRow1, i) && (i < length(m.alignRow0)))
+ {
+ ++count;
+ ++i;
+ }
+ if (count > 0)
+ {
+ appendValue(dnaCigar, TCElem('I', count * 3));
+ appendValue(protCigar, TCElem('I', count));
+ }
+
+ // match or mismatch
+ count = 0;
+ while ((!isGap(m.alignRow0, i)) && (!isGap(m.alignRow1, i)) && (i < length(m.alignRow0)))
+ {
+ ++count;
+ ++i;
+ }
+ if (count > 0)
+ {
+ appendValue(dnaCigar, TCElem('M', count * 3));
+ appendValue(protCigar, TCElem('M', count));
+ }
+ }
+
+ unsigned rightDnaClip = m.qLength - (m.qEnd * 3 + std::abs(m.qFrameShift) - 1);
+ if (rightDnaClip > 0)
+ {
+ appendValue(dnaCigar, TCElem('H', rightDnaClip));
+ appendValue(protCigar, TCElem('H', ((m.qLength - std::abs(m.qFrameShift) + 1) / 3) - m.qEnd));
+ }
+
+ if (m.qFrameShift < 0)
+ reverse(dnaCigar);
+ // protCigar never reversed
+}
+
+// ----------------------------------------------------------------------------
+// Function myWriteHeader()
+// ----------------------------------------------------------------------------
+
+template <typename TGH, typename TLambdaOptions>
+inline void
+myWriteHeader(TGH & globalHolder, TLambdaOptions const & options)
+{
+ if (options.outFileFormat == 0) // BLAST
+ {
+ open(globalHolder.outfile, toCString(options.output));
+ context(globalHolder.outfile).fields = options.columns;
+ auto & versionString = context(globalHolder.outfile).versionString;
+ clear(versionString);
+ append(versionString, _programTagToString(globalHolder.blastProgram));
+ append(versionString, " 2.2.26+ [created by LAMBDA");
+ if (options.versionInformationToOutputFile)
+ {
+ append(versionString, "-");
+ append(versionString, SEQAN_APP_VERSION);
+ }
+ append(versionString, ", see http://seqan.de/lambda and please cite correctly in your academic work]");
+ writeHeader(globalHolder.outfile);
+ } else // SAM or BAM
+ {
+ open(globalHolder.outfileBam, toCString(options.output));
+ auto & context = seqan::context(globalHolder.outfileBam);
+ auto & subjSeqLengths = contigLengths(context);
+ auto & subjIds = contigNames(context);
+
+ // set sequence lengths
+ if (sIsTranslated(globalHolder.blastProgram))
+ {
+ //TODO can we get around a copy?
+ subjSeqLengths = globalHolder.untransSubjSeqLengths;
+ } else
+ {
+ // compute lengths ultra-fast
+ resize(subjSeqLengths, length(globalHolder.subjSeqs));
+ SEQAN_OMP_PRAGMA(parallel for simd)
+ for (unsigned i = 0; i < length(subjSeqLengths); ++i)
+ subjSeqLengths[i] = globalHolder.subjSeqs.limits[i+1] - globalHolder.subjSeqs.limits[i];
+ }
+ // set namestore
+ resize(subjIds, length(globalHolder.subjIds));
+ SEQAN_OMP_PRAGMA(parallel for)
+ for (unsigned i = 0; i < length(globalHolder.subjIds); ++i)
+ subjIds[i] = prefix(globalHolder.subjIds[i],
+ std::find(begin(globalHolder.subjIds[i], Standard()),
+ end(globalHolder.subjIds[i], Standard()),
+ ' ')
+ - begin(globalHolder.subjIds[i], Standard()));
+
+ typedef BamHeaderRecord::TTag TTag;
+
+ // CREATE HEADER
+ BamHeader header;
+ // Fill first header line.
+ BamHeaderRecord firstRecord;
+ firstRecord.type = BAM_HEADER_FIRST;
+ appendValue(firstRecord.tags, TTag("VN", "1.4"));
+// appendValue(firstRecord.tags, TTag("SO", "unsorted"));
+ appendValue(firstRecord.tags, TTag("GO", "query"));
+ appendValue(header, firstRecord);
+
+ // Fill program header line.
+ if (options.versionInformationToOutputFile)
+ {
+ BamHeaderRecord pgRecord;
+ pgRecord.type = BAM_HEADER_PROGRAM;
+ appendValue(pgRecord.tags, TTag("ID", "lambda"));
+ appendValue(pgRecord.tags, TTag("PN", "lambda"));
+ appendValue(pgRecord.tags, TTag("VN", SEQAN_APP_VERSION));
+ appendValue(pgRecord.tags, TTag("CL", options.commandLine));
+ appendValue(header, pgRecord);
+ }
+
+ // Fill homepage header line.
+ BamHeaderRecord hpRecord0;
+ hpRecord0.type = BAM_HEADER_COMMENT;
+ appendValue(hpRecord0.tags, TTag("CO", "Lambda is a high performance BLAST compatible local aligner, "
+ "please see http://seqan.de/lambda for more information."));
+ appendValue(header, hpRecord0);
+ BamHeaderRecord hpRecord1;
+ hpRecord1.type = BAM_HEADER_COMMENT;
+ appendValue(hpRecord1.tags, TTag("CO", "SAM/BAM dialect documentation is available here: "
+ "https://github.com/seqan/lambda/wiki/Output-Formats"));
+ appendValue(header, hpRecord1);
+ BamHeaderRecord hpRecord2;
+ hpRecord2.type = BAM_HEADER_COMMENT;
+ appendValue(hpRecord2.tags, TTag("CO", "If you use any results found by Lambda, please cite "
+ "Hauswedell et al. (2014) doi: 10.1093/bioinformatics/btu439"));
+ appendValue(header, hpRecord2);
+
+ // Fill extra tags header line.
+ BamHeaderRecord tagRecord;
+ tagRecord.type = BAM_HEADER_COMMENT;
+ std::string columnHeaders = "Optional tags as follow";
+ for (unsigned i = 0; i < length(SamBamExtraTags<>::keyDescPairs); ++i)
+ {
+ if (options.samBamTags[i])
+ {
+ columnHeaders += '\t';
+ columnHeaders += std::get<0>(SamBamExtraTags<>::keyDescPairs[i]);
+ columnHeaders += ':';
+ columnHeaders += std::get<1>(SamBamExtraTags<>::keyDescPairs[i]);
+ }
+ }
+ appendValue(tagRecord.tags, TTag("CO", columnHeaders));
+ appendValue(header, tagRecord);
+
+ // sam and we don't want the headers
+ if (!options.samWithRefHeader && (options.outFileFormat == 1))
+ {
+ // we only write the header records that we actually created ourselves
+ for (unsigned i = 0; i < length(header); ++i)
+ write(globalHolder.outfileBam.iter, header[i], seqan::context(globalHolder.outfileBam), Sam());
+ }
+ else
+ {
+ // ref header records are automatically added with default writeHeader()
+ writeHeader(globalHolder.outfileBam, header);
+ }
+ }
+}
+
+// ----------------------------------------------------------------------------
+// Function myWriteRecord()
+// ----------------------------------------------------------------------------
+
+template <typename TLH, typename TRecord>
+inline void
+myWriteRecord(TLH & lH, TRecord const & record)
+{
+ if (lH.options.outFileFormat == 0) // BLAST
+ {
+ SEQAN_OMP_PRAGMA(critical(filewrite))
+ {
+ writeRecord(lH.gH.outfile, record);
+ }
+ } else // SAM or BAM
+ {
+ // convert multi-match blast-record to multiple SAM/BAM-Records
+
+ std::vector<BamAlignmentRecord> bamRecords;
+ bamRecords.resize(record.matches.size());
+
+ String<CigarElement<>> protCigar;
+ std::string protCigarString = "*";
+
+ auto mIt = begin(record.matches, Standard());
+ for (auto & bamR : bamRecords)
+ {
+ // untranslate for sIsTranslated
+ if (sIsTranslated(lH.gH.blastProgram))
+ {
+ bamR.beginPos = mIt->sStart * 3 + std::abs(mIt->sFrameShift) - 1;
+ if (mIt->sFrameShift < 0)
+ bamR.beginPos = mIt->qLength - bamR.beginPos;
+ } else
+ {
+ bamR.beginPos = mIt->sStart;
+ }
+
+ bamR.flag = BAM_FLAG_SECONDARY; // all are secondary for now
+ if (mIt->qFrameShift < 0)
+ bamR.flag |= BAM_FLAG_RC;
+ // truncated query name
+ bamR.qName = prefix(mIt->qId,
+ std::find(begin(mIt->qId, Standard()),
+ end(mIt->qId, Standard()),
+ ' ')
+ - begin(mIt->qId, Standard()));
+ // reference ID
+ bamR.rID = mIt->_n_sId;
+
+ // compute cigar
+ if (lH.options.samBamTags[SamBamExtraTags<>::Q_AA_CIGAR]) // amino acid cigar, too?
+ {
+ clear(protCigar);
+ // native protein
+ if ((lH.gH.blastProgram == BlastProgram::BLASTP) || (lH.gH.blastProgram == BlastProgram::TBLASTN))
+ blastMatchOneCigar(protCigar, *mIt, lH);
+ else if (qIsTranslated(lH.gH.blastProgram)) // translated
+ blastMatchTwoCigar(bamR.cigar, protCigar, *mIt, lH);
+ else // BLASTN can't have protein sequence
+ blastMatchOneCigar(bamR.cigar, *mIt, lH);
+ }
+ else
+ {
+ if ((lH.gH.blastProgram != BlastProgram::BLASTP) && (lH.gH.blastProgram != BlastProgram::TBLASTN))
+ blastMatchOneCigar(bamR.cigar, *mIt, lH);
+ }
+ // we want to include the seq
+ bool writeSeq = false;
+ if ((lH.options.samBamSeq > 1) || (mIt == begin(record.matches, Standard())))
+ {
+ writeSeq = true;
+ }
+ else if (lH.options.samBamSeq == 1)// only uniq sequences
+ {
+ decltype(mIt) mPrevIt = mIt - 1;
+ writeSeq = ((beginPosition(mIt->alignRow0) != beginPosition(mPrevIt->alignRow0)) ||
+ (endPosition(mIt->alignRow0)) != endPosition(mPrevIt->alignRow0));
+ }
+ if (writeSeq)
+ {
+ // only dna sequences supported
+ if (lH.gH.blastProgram == BlastProgram::BLASTN)
+ bamR.seq = infix(source(mIt->alignRow0),
+ beginPosition(mIt->alignRow0),
+ endPosition(mIt->alignRow0));
+ // untranslation is ok, too
+ else if (qIsTranslated(lH.gH.blastProgram))
+ _untranslateSequence(bamR.seq,
+ lH.gH.untranslatedQrySeqs[mIt->_n_qId],
+ *mIt);
+ // else no sequence is available
+ }
+
+ // custom tags
+ //TODO untranslate?
+// if (lH.options.samBamTags[SamBamExtraTags<>::Q_START])
+// appendTagValue(bamR.tags,
+// std::get<0>(SamBamExtraTags<>::keyDescPairs[SamBamExtraTags<>::Q_START]),
+// uint32_t(mIt->qStart), 'I');
+ // case S_START:
+ if (lH.options.samBamTags[SamBamExtraTags<>::E_VALUE])
+ appendTagValue(bamR.tags,
+ std::get<0>(SamBamExtraTags<>::keyDescPairs[SamBamExtraTags<>::E_VALUE]),
+ float(mIt->eValue), 'f');
+ if (lH.options.samBamTags[SamBamExtraTags<>::BIT_SCORE])
+ appendTagValue(bamR.tags,
+ std::get<0>(SamBamExtraTags<>::keyDescPairs[SamBamExtraTags<>::BIT_SCORE]),
+ uint16_t(mIt->bitScore), 'S');
+ if (lH.options.samBamTags[SamBamExtraTags<>::SCORE])
+ appendTagValue(bamR.tags,
+ std::get<0>(SamBamExtraTags<>::keyDescPairs[SamBamExtraTags<>::SCORE]),
+ uint8_t(mIt->alignStats.alignmentScore), 'C');
+ if (lH.options.samBamTags[SamBamExtraTags<>::P_IDENT])
+ appendTagValue(bamR.tags,
+ std::get<0>(SamBamExtraTags<>::keyDescPairs[SamBamExtraTags<>::P_IDENT]),
+ uint8_t(mIt->alignStats.alignmentIdentity), 'C');
+ if (lH.options.samBamTags[SamBamExtraTags<>::P_POS])
+ appendTagValue(bamR.tags,
+ std::get<0>(SamBamExtraTags<>::keyDescPairs[SamBamExtraTags<>::P_POS]),
+ uint16_t(mIt->alignStats.alignmentSimilarity), 'S');
+ if (lH.options.samBamTags[SamBamExtraTags<>::Q_FRAME])
+ appendTagValue(bamR.tags,
+ std::get<0>(SamBamExtraTags<>::keyDescPairs[SamBamExtraTags<>::Q_FRAME]),
+ int8_t(mIt->qFrameShift), 'c');
+ if (lH.options.samBamTags[SamBamExtraTags<>::S_FRAME])
+ appendTagValue(bamR.tags,
+ std::get<0>(SamBamExtraTags<>::keyDescPairs[SamBamExtraTags<>::S_FRAME]),
+ int8_t(mIt->sFrameShift), 'c');
+ if (lH.options.samBamTags[SamBamExtraTags<>::Q_AA_SEQ])
+ {
+ if ((lH.gH.blastProgram == BlastProgram::BLASTN) || (!writeSeq))
+ appendTagValue(bamR.tags,
+ std::get<0>(SamBamExtraTags<>::keyDescPairs[SamBamExtraTags<>::Q_AA_SEQ]),
+ "*", 'Z');
+ else
+ appendTagValue(bamR.tags,
+ std::get<0>(SamBamExtraTags<>::keyDescPairs[SamBamExtraTags<>::Q_AA_SEQ]),
+ infix(source(mIt->alignRow0),
+ beginPosition(mIt->alignRow0),
+ endPosition(mIt->alignRow0)),
+ 'Z');
+ }
+ if (lH.options.samBamTags[SamBamExtraTags<>::Q_AA_CIGAR])
+ {
+ if (empty(protCigar))
+ {
+ protCigarString = "*";
+ }
+ else
+ {
+ clear(protCigarString);
+ for (unsigned i = 0; i < length(protCigar); ++i)
+ {
+ appendNumber(protCigarString, protCigar[i].count);
+ appendValue(protCigarString, protCigar[i].operation);
+ }
+
+ }
+ appendTagValue(bamR.tags,
+ std::get<0>(SamBamExtraTags<>::keyDescPairs[SamBamExtraTags<>::Q_AA_CIGAR]),
+ protCigarString, 'Z');
+ }
+ if (lH.options.samBamTags[SamBamExtraTags<>::EDIT_DISTANCE])
+ appendTagValue(bamR.tags,
+ std::get<0>(SamBamExtraTags<>::keyDescPairs[SamBamExtraTags<>::EDIT_DISTANCE]),
+ uint32_t(mIt->alignStats.alignmentLength - mIt->alignStats.numMatches), 'I');
+ if (lH.options.samBamTags[SamBamExtraTags<>::MATCH_COUNT])
+ appendTagValue(bamR.tags,
+ std::get<0>(SamBamExtraTags<>::keyDescPairs[SamBamExtraTags<>::MATCH_COUNT]),
+ uint32_t(length(record.matches)), 'I');
+
+ // goto next match
+ ++mIt;
+ }
+
+ bamRecords.front().flag -= BAM_FLAG_SECONDARY; // remove BAM_FLAG_SECONDARY for first
+
+ SEQAN_OMP_PRAGMA(critical(filewrite))
+ {
+ for (auto & r : bamRecords)
+ writeRecord(lH.gH.outfileBam, r);
+ }
+ }
+}
+
+// ----------------------------------------------------------------------------
+// Function myWriteFooter()
+// ----------------------------------------------------------------------------
+
+template <typename TGH, typename TLambdaOptions>
+inline void
+myWriteFooter(TGH & globalHolder, TLambdaOptions const & options)
+{
+ if (options.outFileFormat == 0) // BLAST
+ {
+ writeFooter(globalHolder.outfile);
+ }
+}
+
+#endif // SEQAN_LAMBDA_OUTPUT_H_
diff --git a/src/radix_inplace.h b/src/radix_inplace.h
new file mode 100644
index 0000000..2471154
--- /dev/null
+++ b/src/radix_inplace.h
@@ -0,0 +1,497 @@
+// ==========================================================================
+// radix_inplace.h
+// ==========================================================================
+// Copyright (c) 2006-2015, Knut Reinert, FU Berlin
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+// * Neither the name of Knut Reinert or the FU Berlin nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
+// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+// OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+// DAMAGE.
+//
+// ==========================================================================
+// Author: Sascha Meiers <sascha.meiers at embl.de>
+// Author: Hannes Hauswedell <hannes.hauswedell @ fu-berlin.de>
+// ==========================================================================
+// The Radix Sort functions are adapted from Martin Frith's "last"
+// tool (last.cbrc.jp), but he himself adapted the code from McIlroy, Bostic:
+// "Engineering radix sort" as well as Karkkainen, Rantala: "Engineering radix
+// sort for strings". Thanks to Martin for showing this to me.
+// ============================================================================
+
+#ifndef CORE_INCLUDE_SEQAN_INDEX_RADIX_INPLACE_H_
+#define CORE_INCLUDE_SEQAN_INDEX_RADIX_INPLACE_H_
+
+#if defined(_OPENMP) && defined(__GNUC__) && !defined(__clang__)
+#include <parallel/algorithm>
+#define SORT __gnu_parallel::sort
+#else
+#define SORT std::sort
+#endif
+//TODO(h-2): for clang use std::experimenta::parallel if available
+
+namespace SEQAN_NAMESPACE_MAIN
+{
+
+// ==========================================================================
+// Tags
+// ==========================================================================
+
+struct RadixSortSACreateTag {};
+
+// ==========================================================================
+// Metafunctions
+// ==========================================================================
+
+template <typename TText>
+struct Fibre<Index<TText, IndexSa<RadixSortSACreateTag > >, FibreTempSA>
+{
+ typedef Index<TText, IndexSa<RadixSortSACreateTag > > TIndex_;
+ typedef typename SAValue<TIndex_>::Type TSAValue_;
+
+ typedef String<TSAValue_, typename DefaultIndexStringSpec<TText>::Type> Type;
+};
+
+template <typename TText>
+struct DefaultIndexCreator<Index<TText, IndexSa<RadixSortSACreateTag> >, FibreSA>
+{
+ typedef RadixSortSACreateTag Type;
+};
+
+template <typename TText, typename TConfig>
+struct Fibre<Index<TText, FMIndex<RadixSortSACreateTag, TConfig> >, FibreTempSA>
+{
+ typedef Index<TText, FMIndex<RadixSortSACreateTag, TConfig> > TIndex_;
+ typedef typename SAValue<TIndex_>::Type TSAValue_;
+
+ typedef String<TSAValue_, typename DefaultIndexStringSpec<TText>::Type> Type;
+};
+
+template < typename TText, typename TConfig>
+struct DefaultIndexCreator<Index<TText, FMIndex<RadixSortSACreateTag, TConfig> >, FibreSA>
+{
+ typedef RadixSortSACreateTag Type;
+};
+
+// ============================================================================
+// Classes
+// ============================================================================
+
+// ----------------------------------------------------------------------------
+// struct RadixTextAccessor [String]
+// ----------------------------------------------------------------------------
+
+template <
+ typename TSAValue, // input
+ typename TString, // string object that is referenced
+ typename TSpec = void, // Suffix modifier
+ typename TSize = unsigned> // return type (ordValue)
+struct RadixTextAccessor;
+/*
+ * NOTE:
+ * These accessors cannot resolve the correct order of out-of-bound-positions,
+ * i.e. when suffixes are equal up to their last character.
+ * All these cases get collected in a 0 bucket.
+ * The InplaceRadixSorter takes care of that by calling a special
+ * sort function on the 0 buckets.
+ */
+
+
+template <typename TSAValue, typename TString, typename TSize>
+struct RadixTextAccessor<TSAValue, TString, void, TSize> :
+ public std::unary_function<TSAValue, TSize>
+{
+ TString const & text;
+ typename Size<TString>::Type const L;
+
+ RadixTextAccessor(TString const &str) : text(str), L(length(str))
+ {}
+
+ template <typename TSize2>
+ inline TSize operator()(TSAValue const &x, TSize2 depth) const
+ {
+ typename Size<TString>::Type pos = x + depth;
+ if (pos >= L) return 0;
+ TSize ret = ordValue(text[pos]);
+ return ret+1;
+ }
+};
+
+// ----------------------------------------------------------------------------
+// struct RadixTextAccessor [StringSet]
+// ----------------------------------------------------------------------------
+
+template <typename TSAValue, typename TString, typename TSetSpec, typename TSize>
+struct RadixTextAccessor<TSAValue, StringSet<TString, TSetSpec>, void, TSize> :
+ public std::unary_function<TSAValue, TSize>
+
+{
+ StringSet<TString, TSetSpec> const & text;
+ String<typename Size<TString>::Type> L;
+
+ RadixTextAccessor(StringSet<TString, TSetSpec> const &str) : text(str)
+ {
+ resize(L, length(text), Exact());
+ for(typename Size<TString>::Type i = 0; i < length(text); ++i)
+ L[i] = length(text[i]);
+ }
+
+ template <typename TSize2>
+ inline TSize operator()(TSAValue const &x, TSize2 depth) const
+ {
+ typename Size<TString>::Type pos = getSeqOffset(x) + depth;
+ typename Size<TString>::Type seq = getSeqNo(x);
+ if (pos >= L[seq]) return 0;
+ TSize ret = ordValue(text[seq][pos]);
+ return ret+1;
+ }
+};
+
+// ----------------------------------------------------------------------------
+// struct _ZeroBucketComparator [StringSet]
+// ----------------------------------------------------------------------------
+// Functors to compare suffixes from 0 bucket (suffixes that are lex. equal)
+// ----------------------------------------------------------------------------
+
+template <typename TSAValue, typename TLimitsString=Nothing const>
+struct _ZeroBucketComparator
+{
+ TLimitsString const & limits;
+ _ZeroBucketComparator(TLimitsString const & lim) : limits(lim) { /*std::cout << "limits: " << limits << std::endl;*/ }
+
+ inline bool operator()(TSAValue const & a, TSAValue const & b) const
+ {
+ typename Size<TLimitsString>::Type lena = limits[getSeqNo(a)+1]-limits[getSeqNo(a)] - getSeqOffset(a);
+ typename Size<TLimitsString>::Type lenb = limits[getSeqNo(b)+1]-limits[getSeqNo(b)] - getSeqOffset(b);
+ if (lena == lenb)
+ return getSeqNo(a) > getSeqNo(b);
+ else
+ return lena < lenb;
+ }
+};
+
+// ----------------------------------------------------------------------------
+// struct _ZeroBucketComparator [String]
+// ----------------------------------------------------------------------------
+
+template <typename TSAValue>
+struct _ZeroBucketComparator<TSAValue, Nothing const>
+{
+ _ZeroBucketComparator(Nothing const &) {}
+ _ZeroBucketComparator(Nothing &) {}
+
+
+ inline bool operator()(TSAValue const & a, TSAValue const & b) const
+ {
+ return a > b;
+ }
+};
+
+// ----------------------------------------------------------------------------
+// struct RadixSortContext_
+// ----------------------------------------------------------------------------
+
+template <typename TSAValue,
+ typename TText,
+ typename TSize, // type of depth and bucketCount a.s.o
+ unsigned Q> // alph size = ValueSize + 1
+struct RadixSortContext_
+{
+ typedef typename StringSetLimits<TText const>::Type TLimitsString; // "Nothing" for Strings
+ typedef RadixTextAccessor<TSAValue, TText> TAccessFunctor;
+ typedef _ZeroBucketComparator<TSAValue, TLimitsString> TOrderFunctor;
+ typedef typename TAccessFunctor::result_type TOrdValue;
+
+ static_assert(Q < 256, "Alphabet size must be smaller 256!"); //TODO really?
+ static const unsigned ORACLESIZE = 256;
+
+ TText const & text;
+ TAccessFunctor textAccess;
+ TOrderFunctor comp;
+
+ TSize bucketSize[Q];
+ std::array<TSAValue*,Q> bucketEnd;
+
+ RadixSortContext_(TText const & t) :
+ text(t), textAccess(t), comp(stringSetLimits(t))
+ {}
+};
+
+template <typename TSAValue,
+ typename TText,
+ typename TSize,
+ unsigned Q>
+inline void
+clear(RadixSortContext_<TSAValue, TText, TSize, Q> & context)
+{
+ memset(context.bucketSize, 0, sizeof(TSize)*Q);
+}
+
+// ==========================================================================
+// Functions
+// ==========================================================================
+
+// ----------------------------------------------------------------------------
+// Function _radixSort()
+// ----------------------------------------------------------------------------
+
+template <typename TSAValue, typename TSize,
+ typename TText, unsigned Q>
+inline void
+_radixSort(std::vector<std::tuple<TSAValue*, TSAValue*, TSize> > & stack,
+ RadixSortContext_<TSAValue, TText, TSize, Q> & context,
+ std::tuple<TSAValue*, TSAValue*, TSize> const & item)
+{
+ typedef RadixSortContext_<TSAValue, TText, TSize, Q> TContext;
+ typedef typename TContext::TOrdValue TOrdValue;
+
+ clear(context);
+
+ // get bucket sizes (i.e. letter counts):
+ // The intermediate oracle array makes it faster (see "Engineering
+ // Radix Sort for Strings" by J Karkkainen & T Rantala)
+ for(TSAValue* i = std::get<0>(item); i < std::get<1>(item); /* noop */ )
+ {
+ // buffer for the next chars
+ TOrdValue oracle [TContext::ORACLESIZE];
+ TOrdValue* oracleEnd = oracle + std::min(static_cast<std::size_t>(TContext::ORACLESIZE),
+ static_cast<std::size_t>(std::get<1>(item) - i));
+
+ for(TOrdValue* j = oracle; j < oracleEnd; ++j )
+ *j = context.textAccess(*i++, std::get<2>(item));
+
+ for(TOrdValue* j = oracle; j < oracleEnd; ++j )
+ ++context.bucketSize[*j];
+ }
+
+ // get bucket std::get<1>(item)s, and put buckets on the stack to sort within them later:
+ // EDIT: 0 bucket is not sorted here, but later.
+ TSize zeroBucketSize = context.bucketSize[0];
+ TSAValue* pos = std::get<0>(item) + context.bucketSize[0];
+ context.bucketEnd[0] = pos;
+
+ for(unsigned i = 1; i < Q; ++i )
+ {
+ TSAValue* nextPos = pos + context.bucketSize[i];
+ if (nextPos - pos > 1)
+ stack.emplace_back(pos, nextPos, std::get<2>(item)+1);
+
+ pos = nextPos;
+ context.bucketEnd[i] = pos;
+ }
+
+ // permute items into the correct buckets:
+ for(TSAValue* i = std::get<0>(item); i < std::get<1>(item); )
+ {
+ TOrdValue subset; // unsigned is faster than uchar!
+ TSAValue holdOut = *i;
+ while(--context.bucketEnd[subset = context.textAccess(holdOut, std::get<2>(item))] > i )
+ std::swap(*context.bucketEnd[subset], holdOut);
+ *i = holdOut;
+ i += context.bucketSize[subset];
+ context.bucketSize[subset] = 0; // reset it so we can reuse it //TODO check if we need this, since we clear already!
+ }
+
+ // sort the 0 bucket using std::sort
+ if(zeroBucketSize > 1)
+ std::sort(std::get<0>(item), std::get<0>(item) + zeroBucketSize, context.comp);
+}
+
+// ----------------------------------------------------------------------------
+// Function _radixSortWrapper()
+// ----------------------------------------------------------------------------
+// switch to quicksort if the interval is sufficiently small
+
+//TODO: play with this value
+#ifndef _RADIX_SORT_SWITCH_TO_QUICKSORT_AT
+#define _RADIX_SORT_SWITCH_TO_QUICKSORT_AT 100
+#endif
+
+template <typename TSAValue, typename TSize,
+ typename TText, unsigned Q>
+inline void
+_radixSortWrapper(std::vector<std::tuple<TSAValue*, TSAValue*, TSize> > & stack,
+ RadixSortContext_<TSAValue, TText, TSize, Q> & context,
+ std::tuple<TSAValue*, TSAValue*, TSize> const & i)
+{
+ if (std::get<1>(i) - std::get<0>(i) < _RADIX_SORT_SWITCH_TO_QUICKSORT_AT)
+ std::sort(std::get<0>(i), std::get<1>(i), SuffixLess_<TSAValue, TText const>(context.text, std::get<2>(i)));
+ else if (std::get<1>(i) - std::get<0>(i) >= 2)
+ _radixSort(stack, context, i);
+}
+
+// ----------------------------------------------------------------------------
+// Function inplaceFullRadixSort() [default]
+// ----------------------------------------------------------------------------
+
+#ifdef _OPENMP
+#define N_THREADS omp_get_max_threads()
+#define I_THREAD omp_get_thread_num()
+#define MIN_BUCKETS 512
+#else
+#define N_THREADS 1
+#define I_THREAD 0
+#define MIN_BUCKETS 100 // for somewhat decent progress reporting
+#endif
+
+// TODO: serial version
+// TODO: possibly quicksort directly on buckets in third steps, if buckets have been made small enough
+// TODO: double-check the effects of the new "secondStep"
+
+template <typename TSA, typename TText, typename TLambda>
+void inPlaceRadixSort(TSA & sa, TText const & text, TLambda const & progressCallback = [] (unsigned) {})
+{
+ typedef typename Value<typename Concatenator<TText>::Type>::Type TAlphabet;
+ typedef typename Value<TSA>::Type TSAValue;
+ typedef typename Size<TText>::Type TSize;
+ typedef std::tuple<TSAValue*, TSAValue*, TSize> TItem;
+
+ static const unsigned SIGMA = static_cast<unsigned>(ValueSize<TAlphabet>::VALUE) + 1;
+ SEQAN_ASSERT_LT_MSG(SIGMA, 1000u, "Attention: inplace radix sort is not suited for large alphabets");
+
+ typedef RadixSortContext_<TSAValue, TText, TSize, SIGMA> TContext;
+
+ if (empty(sa))
+ return; // otherwise access sa[0] fails
+
+ /* stacks */
+ std::vector<TItem> firstStack;
+ firstStack.reserve(SIGMA);
+ std::vector<TItem> secondStack;
+ secondStack.reserve(1000);
+ std::vector<std::vector<TItem>> lStack(N_THREADS); // one per thread
+ // reduce memory allocations in threads by reserving space
+ for (auto & stack : lStack)
+ stack.reserve(length(sa) / 1000);
+
+ /* contexts */
+ TContext firstSecondContext{text};
+ std::vector<TContext> lContext(N_THREADS, TContext{text});
+
+ // FIRST STEP
+ // sort by the first character
+ _radixSortWrapper(firstStack, firstSecondContext, TItem(&sa[0], &sa[0]+length(sa), 0));
+ progressCallback(5); // 5% progress guess after first char
+
+ // SECOND STEP
+ // sort by next n characters until the stack has reached a good size for distinct parallelization
+ // NOTE that for small alphabets in combination with small texts, this step might sort the entire SA
+ while (!firstStack.empty())
+ {
+ SEQAN_OMP_PRAGMA(parallel for schedule(dynamic))
+ for (unsigned j = 0; j < length(firstStack); ++j)
+ _radixSortWrapper(lStack[I_THREAD], lContext[I_THREAD], firstStack[j]);
+
+ // merge local stacks and clear for next round or next step
+ for (auto & stack : lStack)
+ {
+ secondStack.insert(secondStack.end(), stack.begin(), stack.end());
+ stack.clear();
+ }
+
+ // sort the stack by interval size so that large intervals are moved to front
+ // this improves parallelization of dynamic schedule
+ SORT(secondStack.begin(), secondStack.end(),
+ [] (TItem const & l, TItem const & r)
+ {
+ return (std::get<1>(l) - std::get<0>(l)) > (std::get<1>(r) - std::get<0>(r));
+ });
+
+ // check if largest interval "fits" in one thread efficiently
+ // this works independently of alphabet size and just depends on the data
+ // MIN_BUCKETS check additionally guarantees a degree of granularity
+ if ((uint64_t(std::get<1>(secondStack.front()) - std::get<0>(secondStack.front())) <= (length(sa) / N_THREADS))
+ && (secondStack.size() >= MIN_BUCKETS))
+ break;
+
+ // switch buffers for next round
+ firstStack.clear();
+ std::swap(firstStack, secondStack);
+ }
+ progressCallback(10); // 10% progress guess after second step
+
+ // THIRD STEP
+ // sort the remaining intervals distinctly; here no locking and syncing is required anymore
+ SEQAN_OMP_PRAGMA(parallel for schedule(dynamic))
+ for (unsigned j = 0; j < secondStack.size(); ++j)
+ {
+ lStack[I_THREAD].push_back(secondStack[j]);
+
+ while (!lStack[I_THREAD].empty())
+ {
+ TItem i = lStack[I_THREAD].back();
+ lStack[I_THREAD].pop_back();
+ _radixSortWrapper(lStack[I_THREAD], lContext[I_THREAD], i);
+ }
+
+ // progressCallback must be thread safe and cope with smaller numbers after big numbers
+ // remaining characters alloted 90% of total progress
+ progressCallback(10 + (j * 90) / secondStack.size());
+ }
+
+ progressCallback(100); // done
+}
+
+// ----------------------------------------------------------------------------
+// Function createSuffixArray
+// ----------------------------------------------------------------------------
+
+template <typename TSA,
+ typename TString,
+ typename TSSetSpec,
+ typename TLambda>
+inline void
+createSuffixArray(TSA & SA,
+ StringSet<TString, TSSetSpec> const & s,
+ RadixSortSACreateTag const &,
+ TLambda const & progressCallback)
+{
+ typedef typename Size<TSA>::Type TSize;
+ typedef typename Iterator<TSA, Standard>::Type TIter;
+
+ // 1. Fill suffix array with a permutation (the identity)
+ TIter it = begin(SA, Standard());
+ for(unsigned j = 0; j < length(s); ++j)
+ {
+ TSize len = length(s[j]);
+ for(TSize i = 0; i < len; ++i, ++it)
+ *it = Pair<unsigned, TSize>(j, i);
+ }
+
+ // 2. Sort suffix array with inplace radix Sort
+ inPlaceRadixSort(SA, s, progressCallback);
+}
+
+template <typename TSA,
+ typename TString,
+ typename TSSetSpec>
+inline void
+createSuffixArray(TSA & SA,
+ StringSet<TString, TSSetSpec> const & s,
+ RadixSortSACreateTag const &)
+{
+ createSuffixArray(SA, s, RadixSortSACreateTag(), [] (unsigned) {});
+}
+
+}
+
+#endif // #ifndef CORE_INCLUDE_SEQAN_INDEX_RADIX_INPLACE_H_
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
new file mode 100644
index 0000000..90b4a07
--- /dev/null
+++ b/tests/CMakeLists.txt
@@ -0,0 +1,28 @@
+
+
+cmake_minimum_required (VERSION 2.8.2)
+
+message (STATUS "Configuring tests")
+
+## basic indexer tests
+foreach(PROG blastn blastp blastx tblastn tblastx)
+ foreach(DI sa fm)
+ add_test (NAME test_mkindex_${PROG}_${DI}
+ COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/maintests.sh
+ "${CMAKE_SOURCE_DIR}" "${CMAKE_BINARY_DIR}" ${PROG} ${DI} "MKINDEX" " ")
+ endforeach()
+endforeach()
+
+## basic search tests
+foreach(PROG blastn blastp blastx tblastn tblastx)
+ foreach(DI sa fm)
+ foreach(FF m0 m8 m9 sam bam m9.gz sam.bz2)
+ add_test (NAME test_search_${PROG}_${DI}_${FF}
+ COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/maintests.sh
+ "${CMAKE_SOURCE_DIR}" "${CMAKE_BINARY_DIR}" ${PROG} ${DI} "SEARCH" ${FF})
+ endforeach()
+ endforeach()
+endforeach()
+
+## TODO remove files afterwards, remove folder hierarchy
+## only .fasta.gz and md5sums files are required
\ No newline at end of file
diff --git a/tests/db_nucl.fasta.gz b/tests/db_nucl.fasta.gz
new file mode 100644
index 0000000..8e22932
Binary files /dev/null and b/tests/db_nucl.fasta.gz differ
diff --git a/tests/db_nucl_fm.md5sums.gz b/tests/db_nucl_fm.md5sums.gz
new file mode 100644
index 0000000..ceb3113
Binary files /dev/null and b/tests/db_nucl_fm.md5sums.gz differ
diff --git a/tests/db_nucl_sa.md5sums.gz b/tests/db_nucl_sa.md5sums.gz
new file mode 100644
index 0000000..1a0ea3c
Binary files /dev/null and b/tests/db_nucl_sa.md5sums.gz differ
diff --git a/tests/db_prot.fasta.gz b/tests/db_prot.fasta.gz
new file mode 100644
index 0000000..1cae22f
Binary files /dev/null and b/tests/db_prot.fasta.gz differ
diff --git a/tests/db_prot_fm.md5sums.gz b/tests/db_prot_fm.md5sums.gz
new file mode 100644
index 0000000..9a18722
Binary files /dev/null and b/tests/db_prot_fm.md5sums.gz differ
diff --git a/tests/db_prot_sa.md5sums.gz b/tests/db_prot_sa.md5sums.gz
new file mode 100644
index 0000000..33817b0
Binary files /dev/null and b/tests/db_prot_sa.md5sums.gz differ
diff --git a/tests/db_trans_fm.md5sums.gz b/tests/db_trans_fm.md5sums.gz
new file mode 100644
index 0000000..b27f44c
Binary files /dev/null and b/tests/db_trans_fm.md5sums.gz differ
diff --git a/tests/db_trans_sa.md5sums.gz b/tests/db_trans_sa.md5sums.gz
new file mode 100644
index 0000000..cc983a5
Binary files /dev/null and b/tests/db_trans_sa.md5sums.gz differ
diff --git a/tests/maintests.sh b/tests/maintests.sh
new file mode 100755
index 0000000..809b9e0
--- /dev/null
+++ b/tests/maintests.sh
@@ -0,0 +1,83 @@
+#!/bin/sh
+
+errorout()
+{
+ echo $1 #> /dev/stderr
+ [ "$MYTMP" = "" ] || rm -r "${MYTMP}"
+ exit 1
+}
+
+[ $# -ne 6 ] && exit 1
+
+SRCDIR=$1
+BINDIR=$2
+PROG=$3
+DI=$4
+MODE=$5
+EXTENSION=$6
+
+# check existence of commands
+which openssl gunzip mktemp diff cat zcat zgrep > /dev/null
+[ $? -eq 0 ] || errorout "Not all required programs found. Needs: openssl gunzip mktemp diff cat zcat zgrep"
+
+SALPH=prot # actual subject alph
+QALPHIN=prot # query input file alph
+SALPHIN=prot # subject input file alph
+case "$PROG" in "blastn")
+ QALPHIN=nucl
+ SALPH=nucl
+ SALPHIN=nucl
+ ;;
+"blastp")
+ ;;
+"blastx")
+ QALPHIN=nucl
+ ;;
+"tblastn")
+ SALPH=trans
+ SALPHIN=nucl
+ ;;
+"tblastx")
+ SALPH=trans
+ QALPHIN=nucl
+ SALPHIN=nucl
+ ;;
+esac
+
+MYTMP="$(mktemp -q -d -t "$(basename "$0").XXXXXX" 2>/dev/null || mktemp -q -d)"
+[ $? -eq 0 ] || errorout "Could not create tmp"
+
+cd "$MYTMP"
+[ $? -eq 0 ] || errorout "Could not cd to tmp"
+
+gunzip < "${SRCDIR}/tests/db_${SALPHIN}.fasta.gz" > db.fasta
+[ $? -eq 0 ] || errorout "Could not unzip database file"
+
+${BINDIR}/bin/lambda_indexer -d db.fasta -di ${DI} -p ${PROG}
+[ $? -eq 0 ] || errorout "Could not run the indexer"
+
+openssl md5 * > md5sums
+[ $? -eq 0 ] || errorout "Could not run md5 or md5sums"
+
+gunzip < "${SRCDIR}/tests/db_${SALPH}_${DI}.md5sums.gz" > md5sums.orig
+[ $? -eq 0 ] || errorout "Could not unzip md5sums.orig"
+
+[ "$(cat md5sums)" = "$(cat md5sums.orig)" ] || errorout "$(diff -u md5sums md5sums.orig)"
+
+## INDEXER tests end here
+if [ "$MODE" = "MKINDEX" ]; then
+ rm -r "${MYTMP}"
+ exit 0
+fi
+
+gunzip < "${SRCDIR}/tests/queries_${QALPHIN}.fasta.gz" > queries.fasta
+[ $? -eq 0 ] || errorout "Could not unzip queries.fasta"
+
+${BINDIR}/bin/lambda -d db.fasta -di ${DI} -p ${PROG} -q queries.fasta -t 1 --version-to-outputfile off \
+-o output_${PROG}_${DI}.${EXTENSION}
+[ $? -eq 0 ] || errorout "Search failed."
+
+[ "$(openssl md5 output_${PROG}_${DI}.${EXTENSION})" = \
+"$(zgrep "(output_${PROG}_${DI}.${EXTENSION})" "${SRCDIR}/tests/search_test_outfile.md5sums.gz")" ] || errorout "MD5 mismatch of output file"
+
+rm -r "${MYTMP}"
diff --git a/tests/queries_nucl.fasta.gz b/tests/queries_nucl.fasta.gz
new file mode 100644
index 0000000..4915e59
Binary files /dev/null and b/tests/queries_nucl.fasta.gz differ
diff --git a/tests/queries_prot.fasta.gz b/tests/queries_prot.fasta.gz
new file mode 100644
index 0000000..954bb4e
Binary files /dev/null and b/tests/queries_prot.fasta.gz differ
diff --git a/tests/search_test_outfile.md5sums.gz b/tests/search_test_outfile.md5sums.gz
new file mode 100644
index 0000000..47e24de
Binary files /dev/null and b/tests/search_test_outfile.md5sums.gz differ
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/lambda-align.git
More information about the debian-med-commit
mailing list