[med-svn] [diamond-aligner] 01/03: New upstream version 0.9.8+dfsg
Andreas Tille
tille at debian.org
Mon Jun 19 06:43:49 UTC 2017
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository diamond-aligner.
commit e37193dbc8ab9fab4ae180e66960041e2a78f710
Author: Andreas Tille <tille at debian.org>
Date: Mon Jun 19 08:14:07 2017 +0200
New upstream version 0.9.8+dfsg
---
CMakeLists.txt | 14 +-
LICENSE | 619 +++++++++++++++++++++++++
README.rst | 3 +-
build_simple.sh | 16 +-
src/COPYING | 12 -
src/ChangeLog | 53 +++
src/align/align.cpp | 194 +++++++-
src/align/align.h | 91 +++-
src/align/align_queries.h | 26 +-
src/align/align_struct.h | 24 +-
src/align/align_target.cpp | 259 ++++++++---
src/align/extend_ungapped.h | 24 +-
src/align/match_func.h | 26 +-
src/align/query_mapper.cpp | 109 +++--
src/align/query_mapper.h | 77 ++-
src/basic/basic.cpp | 56 ++-
src/basic/config.cpp | 153 +++---
src/basic/config.h | 63 ++-
src/basic/const.h | 26 +-
src/basic/hssp.cpp | 40 +-
src/basic/masking.cpp | 94 ++++
src/basic/masking.h | 50 ++
src/basic/match.h | 75 ++-
src/basic/packed_loc.h | 26 +-
src/basic/packed_sequence.h | 25 +-
src/basic/packed_transcript.h | 45 +-
src/basic/reduction.h | 35 +-
src/basic/score_matrix.cpp | 65 ++-
src/basic/score_matrix.h | 46 +-
src/basic/seed.h | 24 +-
src/basic/seed_iterator.h | 97 +++-
src/basic/sequence.h | 45 +-
src/basic/shape.h | 58 ++-
src/basic/shape_config.h | 30 +-
src/basic/statistics.h | 77 +--
src/basic/translate.h | 26 +-
src/basic/value.h | 24 +-
src/data/count_approximate.cpp | 110 -----
src/data/frequent_seeds.cpp | 26 +-
src/data/frequent_seeds.h | 29 +-
src/data/index.cpp | 86 ----
src/data/index.h | 24 +-
src/data/load_seqs.h | 71 +--
src/data/queries.cpp | 26 +-
src/data/queries.h | 30 +-
src/data/reference.cpp | 105 +++--
src/data/reference.h | 28 +-
src/data/seed_histogram.cpp | 39 +-
src/data/seed_histogram.h | 105 ++---
src/data/seed_set.cpp | 82 ++++
src/data/seed_set.h | 52 +++
src/data/sequence_set.h | 241 +++++-----
src/data/sorted_list.cpp | 76 +++
src/data/sorted_list.h | 153 +++---
src/data/string_set.h | 38 +-
src/data/taxonomy.cpp | 66 +++
src/data/taxonomy.h | 90 ++++
src/dp/banded_sw.cpp | 316 +++++++++++++
src/dp/comp_based_stats.cpp | 33 +-
src/dp/diag_scores.cpp | 315 +++++++++++++
src/dp/dp.h | 416 +++++++++++++++--
src/dp/dp_matrix.h | 26 +-
src/dp/floating_sw.cpp | 43 +-
src/dp/floating_sw.h | 25 +-
src/dp/greedy_align.cpp | 876 ++++++++++++++++++-----------------
src/dp/growing_buffer.h | 25 +-
src/dp/needleman_wunsch.cpp | 211 +++++++--
src/dp/padded_banded_sw.cpp | 24 +-
src/dp/scalar_dp_matrix.h | 25 +-
src/dp/scalar_traceback.h | 25 +-
src/dp/score_profile.h | 30 +-
src/dp/score_vector.h | 33 +-
src/dp/smith_waterman.cpp | 24 +-
src/dp/smith_waterman.h | 26 +-
src/dp/swipe.cpp | 249 ++++++++++
src/dp/traceback.h | 28 +-
src/dp/ungapped_align.cpp | 109 ++++-
src/extra/blast_record.h | 18 +
src/extra/compare.h | 18 +
src/extra/extra.h | 24 +-
src/extra/match_file.h | 19 +
src/extra/model_sim.cpp | 24 +-
src/extra/opt.cpp | 24 +-
src/extra/roc.cpp | 25 +-
src/lib/tantan/tantan.cc | 462 ++++++++++++++++++
src/lib/tantan/tantan.hh | 120 +++++
src/output/blast_pairwise_format.cpp | 37 +-
src/output/blast_tab_format.cpp | 64 ++-
src/output/daa_file.h | 24 +-
src/output/daa_record.cpp | 29 +-
src/output/daa_record.h | 24 +-
src/output/daa_write.h | 38 +-
src/output/join_blocks.cpp | 24 +-
src/output/output.h | 36 +-
src/output/output_file.h | 64 ---
src/output/output_format.cpp | 90 +++-
src/output/output_format.h | 72 ++-
src/output/sam_format.cpp | 26 +-
src/output/view.h | 49 +-
src/run/benchmark.cpp | 356 ++++++++++++--
src/run/double_indexed.cpp | 184 +++++---
src/run/main.cpp | 43 +-
src/run/mapper.cpp | 112 -----
src/run/tools.cpp | 62 ++-
src/run/tools.h | 24 +-
src/search/align_range.h | 26 +-
src/search/collision.cpp | 34 +-
src/search/collision.h | 24 +-
src/search/hit_filter.h | 30 +-
src/search/search.cpp | 24 +-
src/search/search_query.cpp | 51 --
src/search/setup.cpp | 79 +++-
src/search/sse_dist.h | 31 +-
src/search/stage2.cpp | 63 +--
src/search/trace_pt_buffer.h | 26 +-
src/util/async_buffer.h | 86 +++-
src/util/binary_buffer.h | 25 +-
src/util/binary_file.cpp | 355 ++++++++++++++
src/util/binary_file.h | 416 ++---------------
src/util/command_line_parser.cpp | 24 +-
src/util/command_line_parser.h | 24 +-
src/util/complexity_filter.h | 26 +-
src/util/compressed_stream.cpp | 61 +--
src/util/compressed_stream.h | 26 +-
src/util/direction.h | 26 +-
src/util/double_buffer.h | 26 +-
src/util/hash_function.h | 26 +-
src/util/hash_table.h | 80 +++-
src/util/high_res_timer.h | 61 +++
src/util/log_stream.h | 28 +-
src/util/map.h | 30 +-
src/util/merge_sort.h | 26 +-
src/util/ptr_vector.h | 24 +-
src/util/radix_sort.h | 24 +-
src/util/seq_file_format.cpp | 24 +-
src/util/seq_file_format.h | 26 +-
src/util/simd.cpp | 60 +++
src/util/simd.h | 53 +--
src/util/system.h | 45 +-
src/util/system_c.h | 18 +
src/util/task_queue.h | 26 +-
src/util/temp_file.h | 24 +-
src/util/text_buffer.h | 62 ++-
src/util/thread.h | 90 +++-
src/util/tinythread.cpp | 301 ++++++------
src/util/util.cpp | 70 +--
src/util/util.h | 142 +++++-
147 files changed, 8059 insertions(+), 3819 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5de19c1..05d6c12 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -59,9 +59,6 @@ add_executable(diamond src/run/main.cpp
src/search/stage2.cpp
src/output/output_format.cpp
src/output/join_blocks.cpp
- src/run/mapper.cpp
- src/data/count_approximate.cpp
- src/data/index.cpp
src/data/frequent_seeds.cpp
src/align/query_mapper.cpp
src/align/align_target.cpp
@@ -73,12 +70,21 @@ add_executable(diamond src/run/main.cpp
src/dp/comp_based_stats.cpp
src/extra/model_sim.cpp
src/run/double_indexed.cpp
- src/search/search_query.cpp
src/search/collision.cpp
src/output/sam_format.cpp
src/align/align.cpp
src/search/setup.cpp
src/extra/opt.cpp
+ src/dp/diag_scores.cpp
+ src/data/taxonomy.cpp
+ src/lib/tantan/tantan.cc
+ src/basic/masking.cpp
+ src/dp/swipe.cpp
+ src/dp/banded_sw.cpp
+ src/data/sorted_list.cpp
+ src/data/seed_set.cpp
+ src/util/binary_file.cpp
+ src/util/simd.cpp
)
target_link_libraries(diamond ${ZLIB_LIBRARY} ${CMAKE_THREAD_LIBS_INIT})
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..4ec8c3f
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,619 @@
+ GNU AFFERO GENERAL PUBLIC LICENSE
+ Version 3, 19 November 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The GNU Affero General Public License is a free, copyleft license for
+software and other kinds of works, specifically designed to ensure
+cooperation with the community in the case of network server software.
+
+ The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works. By contrast,
+our General Public Licenses are intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+ Developers that use our General Public Licenses protect your rights
+with two steps: (1) assert copyright on the software, and (2) offer
+you this License which gives you legal permission to copy, distribute
+and/or modify the software.
+
+ A secondary benefit of defending all users' freedom is that
+improvements made in alternate versions of the program, if they
+receive widespread use, become available for other developers to
+incorporate. Many developers of free software are heartened and
+encouraged by the resulting cooperation. However, in the case of
+software used on network servers, this result may fail to come about.
+The GNU General Public License permits making a modified version and
+letting the public access it on a server without ever releasing its
+source code to the public.
+
+ The GNU Affero General Public License is designed specifically to
+ensure that, in such cases, the modified source code becomes available
+to the community. It requires the operator of a network server to
+provide the source code of the modified version running there to the
+users of that server. Therefore, public use of a modified version, on
+a publicly accessible server, gives the public access to the source
+code of the modified version.
+
+ An older license, called the Affero General Public License and
+published by Affero, was designed to accomplish similar goals. This is
+a different license, not a version of the Affero GPL, but Affero has
+released a new version of the Affero GPL which permits relicensing under
+this license.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ TERMS AND CONDITIONS
+
+ 0. Definitions.
+
+ "This License" refers to version 3 of the GNU Affero General Public License.
+
+ "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+ "The Program" refers to any copyrightable work licensed under this
+License. Each licensee is addressed as "you". "Licensees" and
+"recipients" may be individuals or organizations.
+
+ To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy. The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+ A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+ To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy. Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+ To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies. Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+ An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License. If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+ 1. Source Code.
+
+ The "source code" for a work means the preferred form of the work
+for making modifications to it. "Object code" means any non-source
+form of a work.
+
+ A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+ The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form. A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+ The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities. However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work. For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+ The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+ The Corresponding Source for a work in source code form is that
+same work.
+
+ 2. Basic Permissions.
+
+ All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met. This License explicitly affirms your unlimited
+permission to run the unmodified Program. The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work. This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+ You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force. You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright. Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+ Conveying under any other circumstances is permitted solely under
+the conditions stated below. Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+ No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+ When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+ 4. Conveying Verbatim Copies.
+
+ You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+ You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+ 5. Conveying Modified Source Versions.
+
+ You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+ a) The work must carry prominent notices stating that you modified
+ it, and giving a relevant date.
+
+ b) The work must carry prominent notices stating that it is
+ released under this License and any conditions added under section
+ 7. This requirement modifies the requirement in section 4 to
+ "keep intact all notices".
+
+ c) You must license the entire work, as a whole, under this
+ License to anyone who comes into possession of a copy. This
+ License will therefore apply, along with any applicable section 7
+ additional terms, to the whole of the work, and all its parts,
+ regardless of how they are packaged. This License gives no
+ permission to license the work in any other way, but it does not
+ invalidate such permission if you have separately received it.
+
+ d) If the work has interactive user interfaces, each must display
+ Appropriate Legal Notices; however, if the Program has interactive
+ interfaces that do not display Appropriate Legal Notices, your
+ work need not make them do so.
+
+ A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit. Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+ 6. Conveying Non-Source Forms.
+
+ You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+ a) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by the
+ Corresponding Source fixed on a durable physical medium
+ customarily used for software interchange.
+
+ b) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by a
+ written offer, valid for at least three years and valid for as
+ long as you offer spare parts or customer support for that product
+ model, to give anyone who possesses the object code either (1) a
+ copy of the Corresponding Source for all the software in the
+ product that is covered by this License, on a durable physical
+ medium customarily used for software interchange, for a price no
+ more than your reasonable cost of physically performing this
+ conveying of source, or (2) access to copy the
+ Corresponding Source from a network server at no charge.
+
+ c) Convey individual copies of the object code with a copy of the
+ written offer to provide the Corresponding Source. This
+ alternative is allowed only occasionally and noncommercially, and
+ only if you received the object code with such an offer, in accord
+ with subsection 6b.
+
+ d) Convey the object code by offering access from a designated
+ place (gratis or for a charge), and offer equivalent access to the
+ Corresponding Source in the same way through the same place at no
+ further charge. You need not require recipients to copy the
+ Corresponding Source along with the object code. If the place to
+ copy the object code is a network server, the Corresponding Source
+ may be on a different server (operated by you or a third party)
+ that supports equivalent copying facilities, provided you maintain
+ clear directions next to the object code saying where to find the
+ Corresponding Source. Regardless of what server hosts the
+ Corresponding Source, you remain obligated to ensure that it is
+ available for as long as needed to satisfy these requirements.
+
+ e) Convey the object code using peer-to-peer transmission, provided
+ you inform other peers where the object code and Corresponding
+ Source of the work are being offered to the general public at no
+ charge under subsection 6d.
+
+ A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+ A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling. In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage. For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product. A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+ "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source. The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+ If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information. But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+ The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed. Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+ Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+ 7. Additional Terms.
+
+ "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law. If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+ When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it. (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.) You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+ Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+ a) Disclaiming warranty or limiting liability differently from the
+ terms of sections 15 and 16 of this License; or
+
+ b) Requiring preservation of specified reasonable legal notices or
+ author attributions in that material or in the Appropriate Legal
+ Notices displayed by works containing it; or
+
+ c) Prohibiting misrepresentation of the origin of that material, or
+ requiring that modified versions of such material be marked in
+ reasonable ways as different from the original version; or
+
+ d) Limiting the use for publicity purposes of names of licensors or
+ authors of the material; or
+
+ e) Declining to grant rights under trademark law for use of some
+ trade names, trademarks, or service marks; or
+
+ f) Requiring indemnification of licensors and authors of that
+ material by anyone who conveys the material (or modified versions of
+ it) with contractual assumptions of liability to the recipient, for
+ any liability that these contractual assumptions directly impose on
+ those licensors and authors.
+
+ All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10. If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term. If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+ If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+ Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+ 8. Termination.
+
+ You may not propagate or modify a covered work except as expressly
+provided under this License. Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+ However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+ Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+ Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License. If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+ 9. Acceptance Not Required for Having Copies.
+
+ You are not required to accept this License in order to receive or
+run a copy of the Program. Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance. However,
+nothing other than this License grants you permission to propagate or
+modify any covered work. These actions infringe copyright if you do
+not accept this License. Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+ 10. Automatic Licensing of Downstream Recipients.
+
+ Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License. You are not responsible
+for enforcing compliance by third parties with this License.
+
+ An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations. If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+ You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License. For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+ 11. Patents.
+
+ A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based. The
+work thus licensed is called the contributor's "contributor version".
+
+ A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version. For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+ In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement). To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+ If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients. "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+ If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+ A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License. You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+ Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+ 12. No Surrender of Others' Freedom.
+
+ If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all. For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+ 13. Remote Network Interaction; Use with the GNU General Public License.
+
+ Notwithstanding any other provision of this License, if you modify the
+Program, your modified version must prominently offer all users
+interacting with it remotely through a computer network (if your version
+supports such interaction) an opportunity to receive the Corresponding
+Source of your version by providing access to the Corresponding Source
+from a network server at no charge, through some standard or customary
+means of facilitating copying of software. This Corresponding Source
+shall include the Corresponding Source for any work covered by version 3
+of the GNU General Public License that is incorporated pursuant to the
+following paragraph.
+
+ Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU General Public License into a single
+combined work, and to convey the resulting work. The terms of this
+License will continue to apply to the part which is the covered work,
+but the work with which it is combined will remain governed by version
+3 of the GNU General Public License.
+
+ 14. Revised Versions of this License.
+
+ The Free Software Foundation may publish revised and/or new versions of
+the GNU Affero General Public License from time to time. Such new versions
+will be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+ Each version is given a distinguishing version number. If the
+Program specifies that a certain numbered version of the GNU Affero General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation. If the Program does not specify a version number of the
+GNU Affero General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+ If the Program specifies that a proxy can decide which future
+versions of the GNU Affero General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+ Later license versions may give you additional or different
+permissions. However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+ 15. Disclaimer of Warranty.
+
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+ 16. Limitation of Liability.
+
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+ 17. Interpretation of Sections 15 and 16.
+
+ If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+ END OF TERMS AND CONDITIONS
diff --git a/README.rst b/README.rst
index 7b17658..7d043f3 100644
--- a/README.rst
+++ b/README.rst
@@ -8,7 +8,7 @@ Please read the `manual <https://github.com/bbuchfink/diamond/raw/master/diamond
Installing the software on your system may be done by downloading it in binary format for immediate use::
- wget http://github.com/bbuchfink/diamond/releases/download/v0.8.35/diamond-linux64.tar.gz
+ wget http://github.com/bbuchfink/diamond/releases/download/v0.9.8/diamond-linux64.tar.gz
tar xzf diamond-linux64.tar.gz
The extracted ``diamond`` binary file should be moved to a directory contained in your executable search path (PATH environment variable).
@@ -29,6 +29,7 @@ The output file here is specified with the ``–o`` option and named ``matches.m
- The program may use quite a lot of memory and also temporary disk space. Should the program fail due to running out of either one, you need to set a lower value for the block size parameter ``-b`` (see the `manual <https://github.com/bbuchfink/diamond/raw/master/diamond_manual.pdf>`_).
- The default (fast) mode was mainly designed for short reads. For longer sequences, the sensitive modes (options ``--sensitive`` or ``--more-sensitive``) are recommended.
- The runtime of the program is not linear in the size of the query file and it is much more efficient for large query files (> 1 million sequences) than for smaller ones.
+ - Low complexity masking is applied to the query and reference sequences by default. Masked residues appear in the output as X.
- The default e-value cutoff of DIAMOND is 0.001 while that of BLAST is 10, so by default the program will search a lot more stringently than BLAST and not report weak hits.
About
=====
diff --git a/build_simple.sh b/build_simple.sh
index 4d93287..17fb696 100755
--- a/build_simple.sh
+++ b/build_simple.sh
@@ -1,5 +1,5 @@
gcc -c -O3 -DNDEBUG src/blast/sm_blosum45.c src/blast/sm_blosum50.c src/blast/sm_blosum62.c src/blast/sm_blosum80.c src/blast/sm_blosum90.c src/blast/sm_pam30.c src/blast/sm_pam70.c src/blast/sm_pam250.c
-g++ -DNDEBUG -O3 -mssse3 -Wno-deprecated-declarations -std=gnu++98 $1 \
+g++ -DNDEBUG -O3 -Wno-deprecated-declarations $1 $2 $3 \
sm*.o \
src/run/main.cpp \
src/basic/config.cpp \
@@ -27,9 +27,6 @@ g++ -DNDEBUG -O3 -mssse3 -Wno-deprecated-declarations -std=gnu++98 $1 \
src/search/stage2.cpp \
src/output/output_format.cpp \
src/output/join_blocks.cpp \
- src/run/mapper.cpp \
- src/data/count_approximate.cpp \
- src/data/index.cpp \
src/data/frequent_seeds.cpp \
src/align/query_mapper.cpp \
src/align/align_target.cpp \
@@ -41,10 +38,19 @@ g++ -DNDEBUG -O3 -mssse3 -Wno-deprecated-declarations -std=gnu++98 $1 \
src/dp/comp_based_stats.cpp \
src/extra/model_sim.cpp \
src/run/double_indexed.cpp \
- src/search/search_query.cpp \
src/search/collision.cpp \
src/output/sam_format.cpp \
src/align/align.cpp \
src/search/setup.cpp \
src/extra/opt.cpp \
+ src/dp/diag_scores.cpp \
+ src/data/taxonomy.cpp \
+ src/lib/tantan/tantan.cc \
+ src/basic/masking.cpp \
+ src/dp/swipe.cpp \
+ src/dp/banded_sw.cpp \
+ src/data/sorted_list.cpp \
+ src/data/seed_set.cpp \
+ src/util/binary_file.cpp \
+ src/util/simd.cpp \
-lz -lpthread -o diamond
diff --git a/src/COPYING b/src/COPYING
deleted file mode 100644
index abdac22..0000000
--- a/src/COPYING
+++ /dev/null
@@ -1,12 +0,0 @@
-Copyright (c) 2014-2016, University of Tuebingen, Benjamin Buchfink
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
-
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROF [...]
diff --git a/src/ChangeLog b/src/ChangeLog
index 4b0ef45..b1f7103 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,56 @@
+[0.9.8]
+- Fixed a compiler error.
+
+[0.9.7]
+- Fixed compiler errors.
+- Changed XML format to print accessions in the Hit_id and Hit_accession fields.
+
+[0.9.6]
+- Fixed compiler errors.
+
+[0.9.5]
+- Added support for named pipes.
+- Added support for reading input files from stdin.
+- Added more elaborate file I/O error messages.
+
+[0.9.4]
+- Improved performance.
+- Fixed a bug in the query-indexed algorithm.
+- Empty sequences are ignored instead of generating an error.
+
+[0.9.3]
+- Fixed a bug that could cause hanging.
+- Fixed a bug that could cause an error when using the staxids output field and the --unal option.
+
+[0.9.2]
+- Fixed a compiler error.
+- Improved performance for very small query files.
+
+[0.9.1]
+- fixed a performance issue
+
+[0.9.0]
+- improved performance
+- improved support for alignments with long gaps
+- removed SEG masking
+- added low complexity masking using tantan
+- changed license to AGPL
+
+[0.8.38]
+- fixed std::exception error messages
+- fixed sequence titles in XML format
+- XML and pairwise format contain full length titles by default
+
+[0.8.37]
+- fixed a bug that would cause an error message for empty DAA files
+- all scoring matrices use the respective default gap penalties from BLAST
+- added check for SSSE3 instruction set
+- added diamond-sse2 to the binary package
+- added staxids field to the tabular format
+
+[0.8.36]
+- fixed a compiler error
+
[0.8.35]
- added a check to detect incomplete database files
- database files will be deleted in case database building fails
diff --git a/src/align/align.cpp b/src/align/align.cpp
index 3d5745a..8b6795c 100644
--- a/src/align/align.cpp
+++ b/src/align/align.cpp
@@ -1,25 +1,158 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
+#include "../basic/value.h"
+#include "align.h"
#include "align_queries.h"
#include "../data/reference.h"
#include "../output/output_format.h"
+using std::map;
+
+auto_ptr<Simple_query_queue> Simple_query_queue::instance; // definition of the static member; starts out empty, align_queries() assigns a fresh queue per batch
+auto_ptr<Output_sink> Output_sink::instance; // definition of the static member; starts out empty, align_queries() assigns a fresh sink per batch
+
+/// Claims the next query id and the run of hits that belongs to it.
+/// On success `begin`/`end` delimit that run (empty when the hit at the
+/// cursor belongs to a different query) and the query id is returned.
+/// Returns Simple_query_queue::end, leaving `begin`/`end` untouched, once
+/// the query counter has reached qend_.
+size_t Simple_query_queue::get(vector<hit>::iterator &begin, vector<hit>::iterator &end)
+{
+    mtx_.lock();
+    const unsigned contexts = align_mode.query_contexts;
+    const unsigned query = (unsigned)(next_++);
+    if (query >= qend_) {
+        // Nothing left to hand out: release the lock and report the sentinel.
+        mtx_.unlock();
+        return Simple_query_queue::end;
+    }
+    // Advance the shared cursor past every hit whose query_ / contexts
+    // equals the id that was just claimed.
+    begin = it_;
+    for (; it_ < end_ && it_->query_ / contexts == query; ++it_) {}
+    end = it_;
+    mtx_.unlock();
+    return query;
+}
+
+/// Submits the output buffer of query `n`; `buf` may be null.
+/// When `n` is the query expected next, the buffer is written immediately
+/// through flush(). Otherwise it is parked in backlog_ and its allocation
+/// size is added to size_, with max_size_ tracking the high-water mark.
+void Output_sink::push(size_t n, Text_buffer *buf)
+{
+    mtx_.lock();
+    if (n == next_) {
+        // flush() is entered with mtx_ held and unlocks it before returning.
+        flush(buf);
+        return;
+    }
+    backlog_[n] = buf;
+    if (buf)
+        size_ += buf->alloc_size();
+    max_size_ = std::max(max_size_, size_);
+    mtx_.unlock();
+}
+
+/// Writes `buf` (the output of query next_) to f_ and then drains every
+/// directly following query that is already parked in backlog_, in
+/// ascending query order. Written buffers are deleted.
+///
+/// Locking: must be entered with mtx_ held. The mutex is released while
+/// data is written to f_, re-acquired to update the counters, and is
+/// unlocked when the function returns.
+void Output_sink::flush(Text_buffer *buf)
+{
+    size_t n = next_ + 1;
+    vector<Text_buffer*> out;
+    out.push_back(buf);
+    // `buf` never went through backlog_, so it was never added to size_ and
+    // must not be subtracted from it. Only the first element of the first
+    // batch is exempt. Comparing each pointer against `buf` would be wrong
+    // here: `buf` is deleted in the first pass, and a buffer allocated later
+    // at the same address would then be skipped, letting size_ drift.
+    bool first_is_uncounted = true;
+    map<size_t, Text_buffer*>::iterator i;
+    do {
+        // Collect the consecutive run n, n+1, ... that is already waiting.
+        while ((i = backlog_.begin()) != backlog_.end() && i->first == n) {
+            out.push_back(i->second);
+            backlog_.erase(i);
+            ++n;
+        }
+        // Write outside the lock so other workers can keep pushing.
+        mtx_.unlock();
+        size_t size = 0;
+        for (vector<Text_buffer*>::iterator j = out.begin(); j != out.end(); ++j) {
+            if (!*j)
+                continue;
+            f_->write((*j)->get_begin(), (*j)->size());
+            if (!(first_is_uncounted && j == out.begin()))
+                size += (*j)->alloc_size();
+            delete *j;
+        }
+        first_is_uncounted = false;
+        out.clear();
+        mtx_.lock();
+        size_ -= size;
+        // More buffers may have arrived for the next ids while we were writing.
+    } while ((i = backlog_.begin()) != backlog_.end() && i->first == n);
+    next_ = n;
+    mtx_.unlock();
+}
+
+/// Worker loop that claims queries from Simple_query_queue one at a time,
+/// runs the extension stages selected by config.ext on each, and hands the
+/// formatted result (or a null buffer) to Output_sink under the query's id.
+/// Statistics are collected locally and added to the global `statistics`
+/// once the queue is exhausted. `thread_id` is not used by the body.
+void align_worker(size_t thread_id)
+{
+    Statistics stat;
+    vector<hit>::iterator begin, end;
+    for (;;) {
+        const size_t query = Simple_query_queue::get().get(begin, end);
+        if (query == Simple_query_queue::end)
+            break;
+
+        Text_buffer *buf = 0;
+        if (end == begin) {
+            // Query without hits: emit an intro/epilog record only when
+            // unaligned queries are to be reported in a non-DAA, non-blocked run.
+            const bool report = !blocked_processing && *output_format != Output_format::daa && config.report_unaligned != 0;
+            if (report) {
+                buf = new Text_buffer;
+                output_format->print_query_intro(query, query_ids::get()[query].c_str(), get_source_query_len((unsigned)query), *buf, true);
+                output_format->print_query_epilog(*buf, true);
+            }
+            Output_sink::get().push(query, buf);
+            continue;
+        }
+
+        Query_mapper mapper(query, begin, end);
+        mapper.init();
+        if (config.ext == Config::swipe) {
+            mapper.align_targets(stat);
+        }
+        else if (mapper.n_targets() > 0) {
+            // Stage 0: ungapped extension of every target.
+            stat.inc(Statistics::TARGET_HITS0, mapper.n_targets());
+            for (size_t t = 0; t < mapper.n_targets(); ++t)
+                mapper.ungapped_stage(t);
+            if (config.ext != Config::most_greedy) {
+                // Stage 1: rank, then greedy extension. A rank ratio of -1
+                // selects a default that depends on the query length.
+                mapper.rank_targets(config.rank_ratio == -1 ? (mapper.query_seq(0).length() > 50 ? 0.6 : 0.9) : config.rank_ratio);
+                stat.inc(Statistics::TARGET_HITS1, mapper.n_targets());
+                const int cutoff = int(mapper.raw_score_cutoff() * config.score_ratio);
+                for (size_t t = 0; t < mapper.n_targets(); ++t)
+                    mapper.greedy_stage(t, stat, cutoff);
+                // Stage 2: rank again, then the final per-target alignment.
+                mapper.rank_targets(config.rank_ratio2 == -1 ? (mapper.query_seq(0).length() > 50 ? 0.95 : 1.0) : config.rank_ratio2);
+                stat.inc(Statistics::TARGET_HITS2, mapper.n_targets());
+                for (size_t t = 0; t < mapper.n_targets(); ++t)
+                    mapper.align_target(t, stat);
+            }
+        }
+
+        if (*output_format != Output_format::null) {
+            buf = new Text_buffer;
+            if (mapper.generate_output(*buf, stat) && !config.unaligned.empty())
+                query_aligned[query] = true;
+        }
+        Output_sink::get().push(query, buf);
+    }
+    statistics += stat;
+}
+
+/// Progress reporter: polls every 10 ms until the output sink position has
+/// reached the end of the current query range, and roughly once per
+/// `interval` polls prints the queue position, the current and peak backlog
+/// size, and the first word of the title of the query the sink waits for.
+void heartbeat_worker()
+{
+    static const int interval = 100;
+    int n = 0;
+    size_t next;
+    // Take a single snapshot of the sink position per iteration. Reading
+    // Output_sink::next() a second time for the title lookup could observe a
+    // value that has meanwhile advanced to qend() and index past the range
+    // that was just checked.
+    while ((next = Output_sink::get().next()) < Simple_query_queue::get().qend()) {
+        if (n == interval) {
+            const string title(query_ids::get()[next].c_str());
+            verbose_stream << "Queries=" << Simple_query_queue::get().next() << " size=" << megabytes(Output_sink::get().size()) << " max_size=" << megabytes(Output_sink::get().max_size())
+                << " next=" << title.substr(0, title.find(' ')) << endl;
+            n = 0;
+        }
+        else
+            ++n;
+        tthread::this_thread::sleep_for(tthread::chrono::milliseconds(10));
+    }
+}
+
Query_queue query_queue;
void Query_queue::init(Trace_pt_list::iterator begin, Trace_pt_list::iterator end)
@@ -141,27 +274,44 @@ void align_worker(Output_stream *out)
statistics += stat;
}
-void align_queries(const Trace_pt_buffer &trace_pts, Output_stream* output_file)
+void align_queries(Trace_pt_buffer &trace_pts, Output_stream* output_file)
{
query_queue.last_query = (unsigned)-1;
- for (unsigned bin = 0; bin < trace_pts.bins(); ++bin) {
- log_stream << "Processing query bin " << bin + 1 << '/' << trace_pts.bins() << '\n';
+ const size_t max_size = (size_t)std::min(config.chunk_size*1e9 * 9 * 2 / config.lowmem, 2e9);
+ pair<size_t, size_t> query_range;
+ while(true) {
task_timer timer("Loading trace points", 3);
Trace_pt_list *v = new Trace_pt_list;
- statistics.max(Statistics::TEMP_SPACE, trace_pts.load(*v, bin));
+ statistics.max(Statistics::TEMP_SPACE, trace_pts.load(*v, max_size, query_range));
+ if (query_range.second - query_range.first == 0) {
+ delete v;
+ break;
+ }
timer.go("Sorting trace points");
merge_sort(v->begin(), v->end(), config.threads_);
v->init();
timer.go("Computing alignments");
- query_queue.init(v->begin(), v->end());
- Thread_pool threads;
- for (unsigned i = 0; i < config.threads_; ++i)
- threads.push_back(launch_thread(align_worker, output_file));
- threads.join_all();
+ if (config.load_balancing == Config::target_parallel) {
+ query_queue.init(v->begin(), v->end());
+ Thread_pool threads;
+ for (unsigned i = 0; i < config.threads_; ++i)
+ threads.push_back(launch_thread(static_cast<void (*)(Output_stream*)>(&align_worker), output_file));
+ threads.join_all();
+ }
+ else {
+ Simple_query_queue::instance = auto_ptr<Simple_query_queue>(new Simple_query_queue(query_range.first, query_range.second, v->begin(), v->end()));
+ Output_sink::instance = auto_ptr<Output_sink>(new Output_sink(query_range.first, output_file));
+ Thread_pool threads;
+ if (config.verbosity >= 3)
+ threads.push_back(launch_thread(heartbeat_worker));
+ for (size_t i = 0; i < (config.threads_align == 0 ? config.threads_ : config.threads_align); ++i)
+ threads.push_back(launch_thread(static_cast<void(*)(size_t)>(&align_worker), i));
+ threads.join_all();
+ }
timer.go("Deallocating buffers");
delete v;
}
- if (!blocked_processing && *output_format != Output_format::daa && config.report_unaligned != 0) {
+ if (!blocked_processing && *output_format != Output_format::daa && config.report_unaligned != 0 && config.load_balancing == Config::target_parallel) {
Text_buffer buf;
for (unsigned i = query_queue.last_query + 1; i < query_ids::get().get_length(); ++i) {
output_format->print_query_intro(i, query_ids::get()[i].c_str(), get_source_query_len(i), buf, true);
diff --git a/src/align/align.h b/src/align/align.h
index 668a4e3..33588b7 100644
--- a/src/align/align.h
+++ b/src/align/align.h
@@ -1,31 +1,34 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef ALIGN_H_
#define ALIGN_H_
+#include <memory>
#include <vector>
+#include <map>
#include "../search/trace_pt_buffer.h"
#include "../util/task_queue.h"
#include "../basic/statistics.h"
#include "align_struct.h"
using std::vector;
+using std::auto_ptr;
struct Output_writer
{
@@ -61,4 +64,68 @@ private:
Task_queue<_buffer, Output_writer> queue;
};
+/// Dispenser of consecutive query ids starting at the first constructor
+/// argument and bounded by the second, paired with a cursor into a range of
+/// hits. The currently active queue is held in the static `instance`.
+struct Simple_query_queue
+{
+    /// Sentinel returned by get(begin, end) when no query is left.
+    enum { end = 0xffffffffffffffffllu };
+
+    Simple_query_queue(size_t first_query, size_t last_query, vector<hit>::iterator hits_begin, vector<hit>::iterator hits_end):
+        next_(first_query),
+        qend_(last_query),
+        it_(hits_begin),
+        end_(hits_end)
+    {}
+
+    /// Claims the next query and its hits; defined in align.cpp.
+    size_t get(vector<hit>::iterator &begin, vector<hit>::iterator &end);
+
+    /// Id that the next call to get(begin, end) will hand out (read without locking).
+    size_t next() const { return next_; }
+
+    /// Upper bound of the query range, as passed to the constructor.
+    size_t qend() const { return qend_; }
+
+    /// Accessor for the active queue; dereferences `instance`.
+    static Simple_query_queue& get() { return *instance; }
+
+    static auto_ptr<Simple_query_queue> instance;
+
+private:
+    tthread::mutex mtx_;
+    size_t next_;
+    const size_t qend_;
+    vector<hit>::iterator it_, end_;
+};
+
+/// Collects per-query output buffers and writes them to an Output_stream.
+/// Buffers pushed for a query other than the one expected next are kept in
+/// backlog_; size_ and max_size_ track the backlog's allocation size and its
+/// high-water mark. The currently active sink is held in the static `instance`.
+struct Output_sink
+{
+    Output_sink(size_t first_query, Output_stream *stream):
+        f_(stream),
+        next_(first_query),
+        size_(0),
+        max_size_(0)
+    {}
+
+    /// Hands over the buffer for query `n`; defined in align.cpp.
+    void push(size_t n, Text_buffer *buf);
+
+    /// Current value of the backlog size counter (read without locking).
+    size_t size() const { return size_; }
+
+    /// Largest value the backlog size counter has reached so far.
+    size_t max_size() const { return max_size_; }
+
+    /// Accessor for the active sink; dereferences `instance`.
+    static Output_sink& get() { return *instance; }
+
+    /// Id of the query whose output is expected next (read without locking).
+    size_t next() const { return next_; }
+
+    static auto_ptr<Output_sink> instance;
+
+private:
+    /// Writes `buf` plus any directly following backlog entries; defined in align.cpp.
+    void flush(Text_buffer *buf);
+
+    tthread::mutex mtx_;
+    Output_stream* const f_;
+    std::map<size_t, Text_buffer*> backlog_;
+    size_t next_, size_, max_size_;
+};
+
#endif
\ No newline at end of file
diff --git a/src/align/align_queries.h b/src/align/align_queries.h
index 6dfae85..8409f46 100644
--- a/src/align/align_queries.h
+++ b/src/align/align_queries.h
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2014-2016, University of Tuebingen, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef ALIGN_QUERIES_H_
@@ -62,6 +62,6 @@ struct Query_queue
extern Query_queue query_queue;
void align_worker(Output_stream *out);
-void align_queries(const Trace_pt_buffer &trace_pts, Output_stream* output_file);
+void align_queries(Trace_pt_buffer &trace_pts, Output_stream* output_file);
#endif /* ALIGN_QUERIES_H_ */
diff --git a/src/align/align_struct.h b/src/align/align_struct.h
index bedeee8..49c96f9 100644
--- a/src/align/align_struct.h
+++ b/src/align/align_struct.h
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef ALIGN_STRUCT_H_
diff --git a/src/align/align_target.cpp b/src/align/align_target.cpp
index 573f476..43af301 100644
--- a/src/align/align_target.cpp
+++ b/src/align/align_target.cpp
@@ -1,32 +1,91 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#include "query_mapper.h"
#include "../data/reference.h"
#include "../dp/floating_sw.h"
+#include "../util/map.h"
+#include "../util/high_res_timer.h"
using std::list;
+// #define ENABLE_TIMING
+
+const bool log_ga = false;
+
+void Query_mapper::ungapped_stage(size_t idx)
+{
+ Target& target = targets[idx];
+ const string subject_id(ref_ids::get()[target.subject_id].c_str());
+ if (config.log_subject)
+ cout << "Subject = " << subject_id << endl;
+ std::sort(seed_hits.begin() + target.begin, seed_hits.begin() + target.end, Seed_hit::compare_diag);
+ typedef Map<vector<Seed_hit>::const_iterator, Seed_hit::Frame> Hit_map;
+ Hit_map hit_map(seed_hits.begin() + target.begin, seed_hits.begin() + target.end);
+ const sequence subject = ref_seqs::get()[target.subject_id];
+ for (Hit_map::Iterator it = hit_map.begin(); it.valid(); ++it) {
+ const unsigned frame = it.begin()->frame_;
+ int score = greedy_align(query_seq(frame), profile[frame], query_cb[frame], subject, it.begin(), it.end(), log_ga, target.hsps, target.ts, frame);
+ //target.filter_score = std::max(target.filter_score, (unsigned)target.traits[frame].score);
+ if (score > target.filter_score) {
+ target.filter_score = score;
+ }
+ }
+ //target.filter_time = time;
+}
+
+void Query_mapper::greedy_stage(size_t idx, Statistics &stat, int cutoff)
+{
+ Target& target = targets[idx];
+ const sequence subject = ref_seqs::get()[target.subject_id];
+ const string subject_id(ref_ids::get()[target.subject_id].c_str());
+ if (config.log_subject)
+ cout << "Subject = " << subject_id << endl;
+#ifdef ENABLE_TIMING
+ High_res_timer timer;
+#endif
+ target.filter_score = 0;
+ target.ts.sort(Hsp_traits::cmp_diag);
+ typedef Map<list<Hsp_traits>::const_iterator, Hsp_traits::Frame> Hsp_map;
+ Hsp_map ts(target.ts.begin(), target.ts.end());
+ list<Hsp_traits> t_out;
+ target.hsps.clear();
+ for (Hsp_map::Iterator it = ts.begin(); it.valid(); ++it) {
+ const unsigned frame = it.begin()->frame;
+ target.filter_score = std::max(target.filter_score, greedy_align(query_seq(frame), profile[frame], query_cb[frame], subject, log_ga, target.hsps, it.begin(), it.end(), t_out, cutoff, frame));
+ }
+ target.ts.clear();
+ target.ts.splice(target.ts.begin(), t_out);
+ //stat.inc(Statistics::TIME_GREEDY_EXT, timer.nanoseconds());
+#ifdef ENABLE_TIMING
+ target.filter_time = (float)timer.get();
+#endif
+}
+
void Query_mapper::get_prefilter_score(size_t idx)
{
static const int max_dist = 64;
+ if (config.ext == Config::greedy || config.ext == Config::more_greedy)
+ return;
+
Target& target = targets[idx];
+
const size_t n = target.end - target.begin;
vector<Seed_hit>::iterator hits = seed_hits.begin() + target.begin;
std::sort(seed_hits.begin() + target.begin, seed_hits.begin() + target.end, Seed_hit::compare_pos);
@@ -44,7 +103,7 @@ void Query_mapper::get_prefilter_score(size_t idx)
if (abs(d.ungapped.i - e.ungapped.query_last()) >= max_dist)
continue;
const int shift = d.ungapped.diag() - e.ungapped.diag();
- int gap_score = -config.gap_open - abs(shift)*config.gap_extend;
+ int gap_score = -score_matrix.gap_open() - abs(shift)*score_matrix.gap_extend();
const int space = shift > 0 ? d.ungapped.j - e.ungapped.subject_last() : d.ungapped.i - e.ungapped.query_last();
int prefix_score;
if (space <= 0)
@@ -78,66 +137,120 @@ bool is_contained(const list<Hsp_data> &hsps, const Seed_hit &hit)
return false;
}
+pair<int, int> get_diag_range(vector<Seed_hit>::const_iterator begin, vector<Seed_hit>::const_iterator end, unsigned frame)
+{
+ int d_min = std::numeric_limits<int>::max(), d_max = std::numeric_limits<int>::min();
+ for (vector<Seed_hit>::const_iterator i = begin; i < end; ++i) {
+ if (i->frame_ == frame) {
+ const int d = i->diagonal();
+ d_min = std::min(d_min, d);
+ d_max = std::max(d_max, d);
+ }
+ }
+ return pair<int, int>(d_min, d_max);
+}
+
void Query_mapper::align_target(size_t idx, Statistics &stat)
{
typedef float score_t;
Target& target = targets[idx];
- std::sort(seed_hits.begin() + target.begin, seed_hits.begin() + target.end);
const size_t n = target.end - target.begin,
max_len = query_seq(0).length() + 100 * query_seqs::get().avg_len();
size_t aligned_len = 0;
const vector<Seed_hit>::const_iterator hits = seed_hits.begin() + target.begin;
+ const sequence subject = ref_seqs::get()[hits[0].subject_];
+ if (config.log_subject)
+ cout << "Subject = " << ref_ids::get()[target.subject_id].c_str() << endl;
- for (size_t i = 0; i < n; ++i) {
- if (!is_contained(hits, i) && !is_contained(target.hsps, hits[i])) {
+ unsigned frame_mask = (1 << align_mode.query_contexts) - 1;
+ stat.inc(Statistics::CELLS, query_seq(0).length() * subject.length());
+
+ if (config.ext == Config::floating_xdrop) {
+
+ std::sort(seed_hits.begin() + target.begin, seed_hits.begin() + target.end);
+
+ for (size_t i = 0; i < n; ++i) {
const unsigned frame = hits[i].frame_;
- target.hsps.push_back(Hsp_data());
- target.hsps.back().frame = frame;
- uint64_t cell_updates;
-
- if (false && config.comp_based_stats == 1)
- floating_sw(&query_seq(frame)[hits[i].query_pos_],
- &ref_seqs::get()[hits[i].subject_][hits[i].subject_pos_],
- target.hsps.back(),
- config.read_padding(query_seq(frame).length()),
- (score_t)score_matrix.rawscore(config.gapped_xdrop),
- (score_t)(config.gap_open + config.gap_extend),
- (score_t)config.gap_extend,
- cell_updates,
- hits[i].query_pos_,
- hits[i].subject_pos_,
- query_cb[frame],
- Traceback(),
- score_t());
+ if ((frame_mask & (1u << frame)) == 0)
+ continue;
+ if (!is_contained(hits, i) && !is_contained(target.hsps, hits[i])) {
+ target.hsps.push_back(Hsp_data());
+ target.hsps.back().frame = frame;
+ uint64_t cell_updates;
+
+ if (false && config.comp_based_stats == 1)
+ floating_sw(&query_seq(frame)[hits[i].query_pos_],
+ &subject[hits[i].subject_pos_],
+ target.hsps.back(),
+ config.read_padding(query_seq(frame).length()),
+ (score_t)score_matrix.rawscore(config.gapped_xdrop),
+ (score_t)(score_matrix.gap_open() + score_matrix.gap_extend()),
+ (score_t)score_matrix.gap_extend(),
+ cell_updates,
+ hits[i].query_pos_,
+ hits[i].subject_pos_,
+ 0,
+ query_cb[frame],
+ Traceback(),
+ score_t());
+ else
+ floating_sw(&query_seq(frame)[hits[i].query_pos_],
+ &subject[hits[i].subject_pos_],
+ target.hsps.back(),
+ config.read_padding(query_seq(frame).length()),
+ score_matrix.rawscore(config.gapped_xdrop),
+ score_matrix.gap_open() + score_matrix.gap_extend(),
+ score_matrix.gap_extend(),
+ cell_updates,
+ hits[i].query_pos_,
+ hits[i].subject_pos_,
+ 0,
+ No_score_correction(),
+ Traceback(),
+ int());
+
+ if (config.comp_based_stats) {
+ const int score = (int)target.hsps.back().score + query_cb[frame](target.hsps.back());
+ target.hsps.back().score = (unsigned)std::max(0, score);
+ }
+
+ stat.inc(Statistics::OUT_HITS);
+ if (i > 0)
+ stat.inc(Statistics::SECONDARY_HITS);
+ aligned_len += target.hsps.back().length;
+ if (aligned_len > max_len)
+ break;
+ }
else
- floating_sw(&query_seq(frame)[hits[i].query_pos_],
- &ref_seqs::get()[hits[i].subject_][hits[i].subject_pos_],
- target.hsps.back(),
- config.read_padding(query_seq(frame).length()),
- score_matrix.rawscore(config.gapped_xdrop),
- config.gap_open + config.gap_extend,
- config.gap_extend,
- cell_updates,
- hits[i].query_pos_,
- hits[i].subject_pos_,
- No_score_correction(),
- Traceback(),
- int());
-
- if (config.comp_based_stats) {
- const int score = (int)target.hsps.back().score + query_cb[frame](target.hsps.back());
- target.hsps.back().score = (unsigned)std::max(0, score);
+ stat.inc(Statistics::DUPLICATES);
+ }
+ }
+ else {
+ if (target.filter_score == 0)
+ return;
+ if (config.ext == Config::more_greedy)
+ target.hsps.push_back(Hsp_data(target.filter_score));
+ else {
+ const int qlen = (int)query_seq(0).length(),
+ band_plus = qlen <= 50 ? 0 : 16;
+ target.hsps.clear();
+ for (list<Hsp_traits>::const_iterator i = target.ts.begin(); i != target.ts.end(); ++i) {
+ if (log_ga) {
+ cout << "i_begin=" << i->query_range.begin_ << " j_begin=" << i->subject_range.begin_ << " d_min=" << i->d_min << " d_max=" << i->d_max << endl;
+ }
+ target.hsps.push_back(Hsp_data());
+ target.hsps.back().frame = i->frame;
+ banded_sw(query_seq(i->frame), subject, i->d_min - band_plus, i->d_max + band_plus + 1, 0, (int)subject.length(), target.hsps.back());
+
+ if (config.comp_based_stats) {
+ const int score = (int)target.hsps.back().score + query_cb[i->frame](target.hsps.back());
+ target.hsps.back().score = (unsigned)std::max(0, score);
+ }
}
+ }
+ if (!target.hsps.empty())
stat.inc(Statistics::OUT_HITS);
- if (i > 0)
- stat.inc(Statistics::SECONDARY_HITS);
- aligned_len += target.hsps.back().length;
- if (aligned_len > max_len)
- break;
- }
- else
- stat.inc(Statistics::DUPLICATES);
}
for (list<Hsp_data>::iterator i = target.hsps.begin(); i != target.hsps.end(); ++i)
@@ -149,10 +262,36 @@ void Query_mapper::align_target(size_t idx, Statistics &stat)
else
++j;
- for (list<Hsp_data>::iterator i = target.hsps.begin(); i != target.hsps.end(); ++i)
+ //const float time = (float)timer.getElapsedTimeInMicroSec() + target.filter_time;
+
+ for (list<Hsp_data>::iterator i = target.hsps.begin(); i != target.hsps.end(); ++i) {
+ i->time = target.filter_time;
i->set_source_range(i->frame, source_query_len);
+ }
target.hsps.sort();
if(target.hsps.size() > 0)
target.filter_score = target.hsps.front().score;
+
+ if (config.use_smith_waterman && !target.hsps.empty()) {
+ int score;
+ for (unsigned f = 0; f < align_mode.query_contexts; ++f) {
+ needleman_wunsch(query_seq(f), subject, score, Local(), int());
+ target.hsps.front().sw_score = std::max((unsigned)score, target.hsps.front().sw_score);
+ }
+ stat.inc(Statistics::SQUARED_ERROR, (stat_type)pow((int)target.hsps.front().sw_score - (int)target.hsps.front().score, 2));
+ }
+}
+
+void Query_mapper::align_targets(Statistics &stat)
+{
+ const size_t n = targets.size();
+ vector<sequence> seqs(n);
+ for (size_t i = 0; i < n; ++i) {
+ seqs[i] = ref_seqs::get()[targets[i].subject_id];
+ }
+ vector<int> scores(n);
+ swipe(query_seq(0), seqs.begin(), seqs.end(), scores.begin());
+ for (size_t i = 0; i < n; ++i)
+ targets[i].hsps.push_back(Hsp_data(scores[i]));
}
\ No newline at end of file
diff --git a/src/align/extend_ungapped.h b/src/align/extend_ungapped.h
index 0aad37b..7c578aa 100644
--- a/src/align/extend_ungapped.h
+++ b/src/align/extend_ungapped.h
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef EXTEND_UNGAPPED_H_
diff --git a/src/align/match_func.h b/src/align/match_func.h
index c8ab2b0..3944586 100644
--- a/src/align/match_func.h
+++ b/src/align/match_func.h
@@ -1,21 +1,19 @@
/****
-Copyright (c) 2014, University of Tuebingen
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-****
-Author: Benjamin Buchfink
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef MATCH_FUNC_H_
diff --git a/src/align/query_mapper.cpp b/src/align/query_mapper.cpp
index 15efc45..401a52e 100644
--- a/src/align/query_mapper.cpp
+++ b/src/align/query_mapper.cpp
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#include "align_queries.h"
@@ -30,20 +30,44 @@ Query_mapper::Query_mapper() :
targets_finished(0),
next_target(0),
source_query_len(get_source_query_len(query_id)),
- unaligned_from(query_queue.last_query+1),
- seed_hits(source_hits.second - source_hits.first)
+ unaligned_from(query_queue.last_query+1)
{
query_queue.last_query = query_id;
+ seed_hits.reserve(source_hits.second - source_hits.first);
+}
+
+int Query_mapper::raw_score_cutoff() const
+{
+ return score_matrix.rawscore(config.min_bit_score == 0 ? score_matrix.bitscore(config.max_evalue, ref_header.letters, (unsigned)query_seq(0).length()) : config.min_bit_score);
+}
+
+Query_mapper::Query_mapper(size_t query_id, Trace_pt_list::iterator begin, Trace_pt_list::iterator end) :
+ source_hits(std::make_pair(begin, end)),
+ query_id((unsigned)query_id),
+ targets_finished(0),
+ next_target(0),
+ source_query_len(get_source_query_len((unsigned)query_id))
+{
+ seed_hits.reserve(source_hits.second - source_hits.first);
}
void Query_mapper::init()
{
- targets.resize(count_targets());
- load_targets();
- rank_targets();
- if(config.comp_based_stats == 1)
+ if(config.log_query)
+ cout << "Query = " << query_ids::get()[query_id].c_str() << endl;
+ if (config.comp_based_stats == 1)
for (unsigned i = 0; i < align_mode.query_contexts; ++i)
query_cb.push_back(Bias_correction(query_seq(i)));
+ if (config.ext == Config::greedy || config.ext == Config::more_greedy)
+ for (unsigned i = 0; i < align_mode.query_contexts; ++i)
+ profile.push_back(Long_score_profile(query_seq(i)));
+ //profile.push_back(Long_score_profile());
+ targets.resize(count_targets());
+ if (targets.empty())
+ return;
+ load_targets();
+ if (config.ext == Config::floating_xdrop)
+ rank_targets(config.rank_ratio == -1 ? 0.6 : config.rank_ratio);
}
pair<Trace_pt_list::iterator, Trace_pt_list::iterator> Query_mapper::get_query_data()
@@ -67,19 +91,21 @@ unsigned Query_mapper::count_targets()
unsigned n_subject = 0;
for (size_t i = 0; i < n; ++i) {
std::pair<size_t, size_t> l = ref_seqs::data_->local_position(hits[i].subject_);
- if (l.first != subject_id) {
- subject_id = l.first;
- ++n_subject;
- }
const unsigned frame = hits[i].query_ % align_mode.query_contexts;
- seed_hits[i] = Seed_hit(frame,
- (unsigned)l.first,
- (unsigned)l.second,
- hits[i].seed_offset_,
- ungapped_extension((unsigned)l.first,
+ /*const Diagonal_segment d = config.comp_based_stats ? xdrop_ungapped(query_seq(frame), query_cb[frame], ref_seqs::get()[l.first], hits[i].seed_offset_, (int)l.second)
+ : xdrop_ungapped(query_seq(frame), ref_seqs::get()[l.first], hits[i].seed_offset_, (int)l.second);*/
+ const Diagonal_segment d = xdrop_ungapped(query_seq(frame), ref_seqs::get()[l.first], hits[i].seed_offset_, (int)l.second);
+ if (d.score >= config.min_ungapped_raw_score) {
+ if (l.first != subject_id) {
+ subject_id = l.first;
+ ++n_subject;
+ }
+ seed_hits.push_back(Seed_hit(frame,
+ (unsigned)l.first,
(unsigned)l.second,
hits[i].seed_offset_,
- query_seq(frame)));
+ d));
+ }
}
return n_subject;
}
@@ -102,30 +128,34 @@ void Query_mapper::load_targets()
get_prefilter_score(n - 1);
}
-void Query_mapper::rank_targets()
+void Query_mapper::rank_targets(double ratio)
{
std::sort(targets.begin(), targets.end(), Target::compare);
- unsigned score = 0;
+ int score = 0;
if (config.toppercent < 100) {
- score = unsigned((double)targets[0].filter_score * (1.0 - config.toppercent / 100.0) * config.rank_ratio);
+ score = int((double)targets[0].filter_score * (1.0 - config.toppercent / 100.0) * ratio);
}
else {
size_t min_idx = std::min(targets.size(), (size_t)config.max_alignments);
- score = unsigned((double)targets[min_idx - 1].filter_score * config.rank_ratio);
+ score = int((double)targets[min_idx - 1].filter_score * ratio);
}
unsigned i = 0;
for (; i < targets.size(); ++i)
if (targets[i].filter_score < score)
break;
-
- targets.erase(targets.begin() + std::min((unsigned)targets.size(), std::max((unsigned)(config.max_alignments*config.rank_factor), i)), targets.end());
+
+ if (config.benchmark_ranking)
+ for (unsigned j = i; j < targets.size(); ++j)
+ targets[j].outranked = true;
+ else
+ targets.erase(targets.begin() + i, targets.end());
}
bool Query_mapper::generate_output(Text_buffer &buffer, Statistics &stat)
{
- if (!blocked_processing && *output_format != Output_format::daa && config.report_unaligned != 0) {
+ if (!blocked_processing && *output_format != Output_format::daa && config.report_unaligned != 0 && config.load_balancing == Config::target_parallel) {
for (unsigned i = unaligned_from; i < query_id; ++i) {
output_format->print_query_intro(i, query_ids::get()[i].c_str(), get_source_query_len(i), buffer, true);
output_format->print_query_epilog(buffer, true);
@@ -135,7 +165,7 @@ bool Query_mapper::generate_output(Text_buffer &buffer, Statistics &stat)
std::sort(targets.begin(), targets.end(), Target::compare);
unsigned n_hsp = 0, n_target_seq = 0, hit_hsps = 0;
- const unsigned top_score = targets[0].filter_score, query_len = (unsigned)query_seq(0).length();
+ const unsigned top_score = targets.empty() ? 0 : targets[0].filter_score, query_len = (unsigned)query_seq(0).length();
size_t seek_pos = 0;
const char *query_title = query_ids::get()[query_id].c_str();
@@ -147,6 +177,9 @@ bool Query_mapper::generate_output(Text_buffer &buffer, Statistics &stat)
if (!config.output_range(n_target_seq, targets[i].filter_score, top_score))
break;
+ if (targets[i].outranked)
+ stat.inc(Statistics::OUTRANKED_HITS);
+
const unsigned subject_len = (unsigned)ref_seqs::get()[targets[i].subject_id].length();
hit_hsps = 0;
@@ -157,7 +190,9 @@ bool Query_mapper::generate_output(Text_buffer &buffer, Statistics &stat)
if (j->id_percent() < config.min_id
|| j->query_cover_percent(source_query_len) < config.query_cover
|| j->subject_cover_percent(subject_len) < config.subject_cover
- || (config.no_self_hits && j->identities == j->length && j->query_source_range.length() == source_query_len && j->subject_range.length() == subject_len && strcmp(query_title, ref_title) == 0))
+ || (config.no_self_hits &&
+ (config.ext == Config::more_greedy || (j->identities == j->length && j->query_source_range.length() == (int)source_query_len && j->subject_range.length() == (int)subject_len))
+ && strcmp(query_title, ref_title) == 0))
continue;
if (blocked_processing) {
diff --git a/src/align/query_mapper.h b/src/align/query_mapper.h
index 7be11c3..5da19c3 100644
--- a/src/align/query_mapper.h
+++ b/src/align/query_mapper.h
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef QUERY_MAPPER_H_
@@ -32,58 +32,40 @@ using std::vector;
using std::pair;
using std::list;
-struct Seed_hit
-{
- Seed_hit()
- {}
- Seed_hit(unsigned frame, unsigned subject, unsigned subject_pos, unsigned query_pos, const Diagonal_segment &ungapped) :
- frame_(frame),
- subject_(subject),
- subject_pos_(subject_pos),
- query_pos_(query_pos),
- ungapped(ungapped),
- prefix_score(ungapped.score)
- { }
- int diagonal() const
- {
- return (int)subject_pos_ - (int)query_pos_;
- }
- bool operator<(const Seed_hit &rhs) const
- {
- return ungapped.score > rhs.ungapped.score;
- }
- static bool compare_pos(const Seed_hit &x, const Seed_hit &y)
- {
- return Diagonal_segment::cmp_subject_end(x.ungapped, y.ungapped);
- }
- unsigned frame_, subject_, subject_pos_, query_pos_;
- Diagonal_segment ungapped;
- unsigned prefix_score;
-};
-
struct Target
{
Target(size_t begin, unsigned subject_id) :
subject_id(subject_id),
filter_score(0),
+ outranked(false),
begin(begin)
{}
static bool compare(Target* lhs, Target *rhs)
{
return lhs->filter_score > rhs->filter_score;
}
- unsigned subject_id, filter_score;
+ unsigned subject_id;
+ int filter_score;
+ float filter_time;
+ bool outranked;
size_t begin, end;
list<Hsp_data> hsps;
+ list<Hsp_traits> ts;
};
struct Query_mapper
{
Query_mapper();
+ Query_mapper(size_t query_id, Trace_pt_list::iterator begin, Trace_pt_list::iterator end);
void init();
void get_prefilter_score(size_t idx);
void align_target(size_t idx, Statistics &stat);
+ void align_targets(Statistics &stat);
bool generate_output(Text_buffer &buffer, Statistics &stat);
+ void ungapped_stage(size_t idx);
+ void greedy_stage(size_t idx, Statistics &stat, int cutoff);
+ void rank_targets(double ratio);
+ int raw_score_cutoff() const;
size_t n_targets() const
{
return targets.size();
@@ -92,26 +74,27 @@ struct Query_mapper
{
return targets_finished == targets.size();
}
+ sequence query_seq(unsigned frame) const
+ {
+ return query_seqs::get()[query_id*align_mode.query_contexts + frame];
+ }
pair<Trace_pt_list::iterator, Trace_pt_list::iterator> source_hits;
unsigned query_id, targets_finished, next_target;
private:
static pair<Trace_pt_list::iterator, Trace_pt_list::iterator> get_query_data();
unsigned count_targets();
- sequence query_seq(unsigned frame) const
- {
- return query_seqs::get()[query_id*align_mode.query_contexts + frame];
- }
sequence query_source_seq() const
{
return align_mode.query_translated ? query_source_seqs::get()[query_id] : query_seqs::get()[query_id];
}
void load_targets();
- void rank_targets();
unsigned source_query_len, unaligned_from;
vector<Seed_hit> seed_hits;
Ptr_vector<Target> targets;
vector<Bias_correction> query_cb;
+ vector<Long_score_profile> profile;
+
};
#endif
\ No newline at end of file
diff --git a/src/basic/basic.cpp b/src/basic/basic.cpp
index b7627eb..aa6fab3 100644
--- a/src/basic/basic.cpp
+++ b/src/basic/basic.cpp
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#include "value.h"
@@ -22,8 +22,9 @@ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR P
#include "translate.h"
#include "statistics.h"
#include "sequence.h"
+#include "masking.h"
-const char* Const::version_string = "0.8.35";
+const char* Const::version_string = "0.9.8";
const char* Const::program_name = "diamond";
const char* Const::id_delimiters = " \a\b\f\n\r\t\v";
@@ -72,8 +73,8 @@ unsigned Align_mode::from_command(unsigned command)
Align_mode align_mode (Align_mode::blastp);
-Reduction Reduction::reduction("KREDQN C G H M F Y ILV W P STA");
-//Reduction Reduction::reduction("A KR EDNQ C G H ILVM FYW P ST"); // murphy.10
+//Reduction Reduction::reduction("KREDQN C G H M F Y ILV W P STA");
+Reduction Reduction::reduction("A KR EDNQ C G H ILVM FYW P ST"); // murphy.10
//const Reduction Reduction::reduction("G D N AEFIKLMQRVW Y H C T S P"); // gmbr.10
//const Reduction Reduction::reduction("EKQR IV LY F AM W HT C DNS"); // dssp.10
//const Reduction Reduction::reduction("K R E D Q N C G H M F Y I L V W P S T A");
@@ -81,8 +82,8 @@ Reduction Reduction::reduction("KREDQN C G H M F Y ILV W P STA");
Statistics statistics;
const char* shape_codes[][Const::max_shapes] = {
- { "111101011101111", "111011001100101111", "1111001001010001001111", "111100101000010010010111", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 4x12
- { "1111011111", // 16x9
+ { "111101011101111", "111011001100101111", "1111001001010001001111", "111100101000010010010111", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 0 4x12
+ { "1111011111", // 1 16x9
"111001101111",
"11101100101011",
"11010010111011",
@@ -99,7 +100,7 @@ const char* shape_codes[][Const::max_shapes] = {
"1101000100100000100000111",
"1110001000100000001010011" },
{
- "11001011", // 16x5
+ "11001011", // 2 16x5
"101010011",
"100110101",
"1110000101",
@@ -117,7 +118,7 @@ const char* shape_codes[][Const::max_shapes] = {
"10010000100000000000011" },
{
- "11101011", // 16x6
+ "11101011", // 3 16x6
"110100111",
"11001000111",
"1100001001011",
@@ -135,7 +136,7 @@ const char* shape_codes[][Const::max_shapes] = {
"101000010000000000010011"
},
- { "1110010111", // 16x7
+ { "1110010111", // 4 16x7
"11001101011",
"1101001000111",
"11100010010011",
@@ -154,7 +155,7 @@ const char* shape_codes[][Const::max_shapes] = {
},
{
- "101011", // 16x4
+ "101011", // 5 16x4
"110011",
"110000101",
"1001000011",
@@ -170,7 +171,18 @@ const char* shape_codes[][Const::max_shapes] = {
"101000000000000011",
"100010000000000000101",
"1000100000000000100001"
- }
+ },
+ {
+ "111010110110111","111001010101001111","1110110010001101011","11110010000100100010111",0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 6 4x11
+{ "1110101101111","1110110100010111","10110110001001000111","111010001000010010111",0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 7 4x10
+//{"1111011101011","1110100111111",0,0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 8 2x10
+//{"1110101101111","11101001100010111",0,0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 8 2x10
+ { "111101110111","111011010010111" }, // 8 2x10 iedera
+//{"111011111","11011010111","111101001011","1110011001011","1110001101101","1101000111011","1101010100111","11101001000111","110100100101011","111001010001011","110011000100111","1110100001010011","1101000100100111","11010100010001011","11100010100001011","11101000000010001011"}, // 9 16x8
+{"1011110111","110100100010111","11001011111","101110001111","11011101100001","1111010010101","111001001001011","10101001101011","111101010011","1111000010000111","1100011011011","1101010000011011","1110001010101001","110011000110011","11011010001101","1101001100010011"},// 9 16x8 iedera
+{"111111",0,0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0 }, // 10 1x6
+{"11111",0,0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0 } // 11 1x5
+
};
shape_config shapes;
diff --git a/src/basic/config.cpp b/src/basic/config.cpp
index 8b925a2..bb3fe7a 100644
--- a/src/basic/config.cpp
+++ b/src/basic/config.cpp
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#include "../util/command_line_parser.h"
@@ -31,6 +31,7 @@ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR P
#include "../data/sorted_list.h"
#include "../basic/translate.h"
#include "../dp/dp.h"
+#include "masking.h"
Config config;
@@ -53,7 +54,9 @@ Config::Config(int argc, const char **argv)
.add_command("modelsim", "")
.add_command("match-file-stat", "")
.add_command("model-seqs", "")
- .add_command("opt", "");
+ .add_command("opt", "")
+ .add_command("mask", "")
+ .add_command("fastq2fasta", "");
Options_group general("General options");
general.add()
@@ -91,6 +94,7 @@ Config::Config(int argc, const char **argv)
\tppos means Percentage of positive - scoring matches\n\
\tqframe means Query frame\n\
\tbtop means Blast traceback operations(BTOP)\n\
+\tstaxids means unique Subject Taxonomy ID(s), separated by a ';' (in numerical order)\n\
\tstitle means Subject Title\n\
\tsalltitles means All Subject Title(s), separated by a '<>'\n\
\tqcovhsp means Query Coverage Per HSP\n\
@@ -112,7 +116,7 @@ Config::Config(int argc, const char **argv)
("max-target-seqs", 'k', "maximum number of target sequences to report alignments for", max_alignments, uint64_t(25))
("top", 0, "report alignments within this percentage range of top alignment score (overrides --max-target-seqs)", toppercent, 100.0)
("compress", 0, "compression for output files (0=none, 1=gzip)", compression)
- ("evalue", 'e', "maximum e-value to report alignments", max_evalue, 0.001)
+ ("evalue", 'e', "maximum e-value to report alignments (default=0.001)", max_evalue, 0.001)
("min-score", 0, "minimum bit score to report alignments (overrides e-value setting)", min_bit_score)
("id", 0, "minimum identity% to report an alignment", min_id)
("query-cover", 0, "minimum query cover% to report an alignment", query_cover)
@@ -122,20 +126,23 @@ Config::Config(int argc, const char **argv)
("block-size", 'b', "sequence block size in billions of letters (default=2.0)", chunk_size)
("index-chunks", 'c', "number of chunks for index processing", lowmem)
("tmpdir", 't', "directory for temporary files", tmpdir)
- ("gapopen", 0, "gap open penalty (default=11 for protein)", gap_open, -1)
- ("gapextend", 0, "gap extension penalty (default=1 for protein)", gap_extend, -1)
+ ("gapopen", 0, "gap open penalty", gap_open, -1)
+ ("gapextend", 0, "gap extension penalty", gap_extend, -1)
("matrix", 0, "score matrix for protein alignment (default=BLOSUM62)", matrix, string("blosum62"))
("custom-matrix", 0, "file containing custom scoring matrix", matrix_file)
("lambda", 0, "lambda parameter for custom matrix", lambda)
("K", 0, "K parameter for custom matrix", K)
("comp-based-stats", 0, "enable composition based statistics (0/1=default)", comp_based_stats, 1u)
- ("seg", 0, "enable SEG masking of queries (yes/no)", seg)
+ ("masking", 0, "enable masking of low complexity regions (0/1=default)", masking, 1)
+ //("seg", 0, "enable SEG masking of queries (yes/no)", seg)
("query-gencode", 0, "genetic code to use to translate query (see user manual)", query_gencode, 1u)
- ("salltitles", 0, "print full subject titles in output files", salltitles)
- ("no-self-hits", 0, "suppress reporting of identical self hits", no_self_hits);
+ ("salltitles", 0, "include full subject titles in DAA file", salltitles)
+ ("no-self-hits", 0, "suppress reporting of identical self hits", no_self_hits)
+ ("taxonmap", 0, "protein accession to taxid mapping file", prot_accession2taxid);
Options_group advanced("Advanced options");
advanced.add()
+ ("algo", 0, "Seed search algorithm (0=double-indexed/1=query-indexed)", algo, -1)
("bin", 0, "number of query bins for seed search", query_bins, 16u)
("min-orf", 'l', "ignore translated sequences without an open reading frame of at least this length", run_len)
("freq-sd", 0, "number of standard deviations for ignoring frequent seeds", freq_sd, 0.0)
@@ -150,13 +157,11 @@ Config::Config(int argc, const char **argv)
("shapes", 's', "number of seed shapes (0 = all available)", shapes)
("shape-mask", 0, "seed shapes", shape_mask)
("index-mode", 0, "index mode (0=4x12, 1=16x9)", index_mode)
- ("fetch-size", 0, "trace point fetch size", fetch_size, 4096u)
- ("rank-factor", 0, "include subjects within this range of max-target-seqs", rank_factor, 2.0)
- ("rank-ratio", 0, "include subjects within this ratio of last hit", rank_ratio, 0.6)
+ ("rank-ratio", 0, "include subjects within this ratio of last hit (stage 1)", rank_ratio, -1.0)
+ ("rank-ratio2", 0, "include subjects within this ratio of last hit (stage 2)", rank_ratio2, -1.0)
("max-hsps", 0, "maximum number of HSPs per subject sequence to save for each query", max_hsps, 1u)
("dbsize", 0, "effective database size (in letters)", db_size)
- ("no-auto-append", 0, "disable auto appending of DAA and DMND file extensions", no_auto_append)
- ("target-fetch-size", 0, "number of target sequences to fetch for seed extension", target_fetch_size, 4u);
+ ("no-auto-append", 0, "disable auto appending of DAA and DMND file extensions", no_auto_append);
Options_group view_options("View options");
view_options.add()
@@ -174,16 +179,13 @@ Config::Config(int argc, const char **argv)
("slow-search", 0, "", slow_search)
("ht", 0, "", ht_mode)
("old-freq", 0, "", old_freq)
- ("qp", 0, "", query_parallel)
("match1", 0, "", match_file1)
("match2", 0, "", match_file2)
("max-hits", 'C', "maximum number of hits to consider for one seed", hit_cap)
("seed-freq", 0, "maximum seed frequency", max_seed_freq, -15.0)
- ("space-penalty", 0, "", space_penalty, 4.9)
- ("min-diag-score", 0, "", min_diag_score, 10.5)
+ ("space-penalty", 0, "", space_penalty, 0.5)
("reverse", 0, "", reverse)
("neighborhood-score", 0, "", neighborhood_score)
- ("algo", 0, "", algo, 0u)
("seed-weight", 'w', "", seed_weight, 7u)
("very-sensitive", 0, "", mode_very_sensitive)
("idl", 0, "", id_left)
@@ -196,24 +198,39 @@ Config::Config(int argc, const char **argv)
("rho", 0, "", rho, 0.99)
("p_best", 0, "", p_best, 0.05)
("d_exp", 0, "", d_exp, 1.0)
- ("d_new", 0, "", d_new, 1.0);
+ ("d_new", 0, "", d_new, 1.0)
+ ("score-estimate-factor", 0, "", score_estimate_factor, 0.0)
+ ("diag-min-estimate", 0, "", diag_min_estimate, 17)
+ ("qfilt", 0, "", qfilt)
+ ("sfilt", 0, "", sfilt)
+ ("path-cutoff", 0, "", path_cutoff, 0.92)
+ ("sw", 0, "", use_smith_waterman)
+ ("superblock", 0, "", superblock, 128)
+ ("max-cells", 0, "", max_cells, 10000000u)
+ ("lb", 0, "", load_balancing, (unsigned)Config::query_parallel)
+ ("ext", 0, "", ext, (int)Config::greedy)
+ ("br", 0, "", benchmark_ranking)
+ ("log-query", 0, "", log_query)
+ ("log-subject", 0, "", log_subject)
+ ("palign", 0, "", threads_align)
+ ("score-ratio", 0, "", score_ratio, 0.9)
+ ("fetch-size", 0, "trace point fetch size", fetch_size, 4096u)
+ ("target-fetch-size", 0, "number of target sequences to fetch for seed extension", target_fetch_size, 4u)
+ ("small-query", 0, "", small_query)
+ ("hashed-seeds", 0, "", hashed_seeds);
parser.add(general).add(makedb).add(aligner).add(advanced).add(view_options).add(getseq_options).add(hidden_options);
parser.store(argc, argv, command);
switch (command) {
case Config::makedb:
- if (input_ref_file == "")
- throw std::runtime_error("Missing parameter: input file (--in)");
if (database == "")
throw std::runtime_error("Missing parameter: database file (--db/-d)");
if (chunk_size != 0.0)
throw std::runtime_error("Invalid option: --block-size/-b. Block size is set for the alignment commands.");
break;
case Config::blastp:
- case Config::blastx:
- if (query_file == "")
- throw std::runtime_error("Missing parameter: query file (--query/-q)");
+ case Config::blastx:
if (database == "")
throw std::runtime_error("Missing parameter: database file (--db/-d)");
if (daa_file.length() > 0) {
@@ -247,7 +264,7 @@ Config::Config(int argc, const char **argv)
else if (verbose)
verbosity = 2;
else if (((command == Config::view || command == blastx || command == blastp) && output_file == "")
- || command == Config::version || command == getseq)
+ || command == Config::version || command == getseq || command == fastq2fasta)
verbosity = 0;
else
verbosity = 1;
@@ -280,6 +297,7 @@ Config::Config(int argc, const char **argv)
}
message_stream << Const::program_name << " v" << Const::version_string << "." << (unsigned)Const::build_version << " | by Benjamin Buchfink <buchfink at gmail.com>" << endl;
+ message_stream << "Licensed under the GNU AGPL <https://www.gnu.org/licenses/agpl.txt>" << endl;
message_stream << "Check http://github.com/bbuchfink/diamond for updates." << endl << endl;
#ifndef NDEBUG
verbose_stream << "Assertions enabled." << endl;
@@ -296,52 +314,45 @@ Config::Config(int argc, const char **argv)
;
}
- if (command == Config::blastp || command == Config::blastx || command == Config::benchmark || command == Config::model_sim || command == Config::opt) {
- if (tmpdir == "")
- tmpdir = extract_dir(output_file);
- if (gap_open == -1)
- gap_open = 11;
- if (gap_extend == -1)
- gap_extend = 1;
+ switch (command) {
+ case Config::blastp:
+ case Config::blastx:
+ case Config::benchmark:
+ case Config::model_sim:
+ case Config::opt:
+ case Config::mask:
+ case Config::makedb:
if (matrix_file == "")
score_matrix = Score_matrix(to_upper_case(matrix), gap_open, gap_extend, reward, penalty);
else {
if (lambda == 0 || K == 0)
throw std::runtime_error("Custom scoring matrices require setting the --lambda and --K options.");
+ if (gap_open == -1 || gap_extend == -1)
+ throw std::runtime_error("Custom scoring matrices require setting the --gapopen and --gapextend options.");
score_matrix = Score_matrix(matrix_file, lambda, K, gap_open, gap_extend);
}
message_stream << "Scoring parameters: " << score_matrix << endl;
- raw_ungapped_xdrop = score_matrix.rawscore(ungapped_xdrop);
- min_diag_raw_score = score_matrix.rawscore(min_diag_score);
- raw_space_penalty = score_matrix.rawscore(space_penalty, double());
- log_stream << "Min_diag_score=" << min_diag_raw_score << " space_penalty=" << raw_space_penalty << endl;
- init_cbs();
-
- if (seg == "" && command == blastx)
- seg = "yes";
- verbose_stream << "SEG masking = " << (seg == "yes") << endl;
- have_ssse3 = check_SSSE3();
- if (have_ssse3)
- verbose_stream << "SSSE3 enabled." << endl;
- verbose_stream << "Reduction: " << Reduction::reduction << endl;
+ if (masking == 1)
+ Masking::instance = auto_ptr<Masking>(new Masking(score_matrix));
+ }
- if (mode_more_sensitive) {
- set_option(index_mode, 1u);
- set_option(freq_sd, 200.0);
- }
- else if (mode_sensitive) {
- set_option(index_mode, 1u);
- set_option(freq_sd, 10.0);
- }
- else {
- set_option(index_mode, 0u);
- set_option(freq_sd, 50.0);
- }
+ if (command == Config::blastp || command == Config::blastx || command == Config::benchmark || command == Config::model_sim || command == Config::opt
+ || command == Config::mask) {
+ if (tmpdir == "")
+ tmpdir = extract_dir(output_file);
+
+ init_cbs();
+ raw_ungapped_xdrop = score_matrix.rawscore(ungapped_xdrop);
- verbose_stream << "Seed frequency SD: " << freq_sd << endl;
- ::shapes = shape_config(index_mode, shapes, shape_mask);
- verbose_stream << "Shape configuration: " << ::shapes << endl;
- seed_anchor = std::min(::shapes[0].length_ - 1, 8u);
+#ifdef __SSSE3__
+ verbose_stream << "SSSE3 enabled." << endl;
+#endif
+#ifdef __SSE4_1__
+ verbose_stream << "SSE4.1 enabled." << endl;
+#endif
+#ifdef __POPCNT__
+ verbose_stream << "POPCNT enabled." << endl;
+#endif
message_stream << "#Target sequences to report alignments for: ";
if (max_alignments == 0) {
diff --git a/src/basic/config.h b/src/basic/config.h
index 8398234..0b1035b 100644
--- a/src/basic/config.h
+++ b/src/basic/config.h
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef CONFIG_H_
@@ -69,9 +69,7 @@ struct Config
int gap_open;
int gap_extend;
string matrix;
- string seg;
bool debug_log, verbose, quiet;
- bool have_ssse3;
bool salltitles;
int reward;
int penalty;
@@ -94,12 +92,11 @@ struct Config
bool extend_all;
bool slow_search;
vector<string> seq_no;
- double rank_factor;
double rank_ratio;
+ double rank_ratio2;
bool ht_mode;
bool old_freq;
double freq_sd;
- bool query_parallel;
unsigned target_fetch_size;
bool mode_more_sensitive;
string matrix_file;
@@ -108,9 +105,7 @@ struct Config
unsigned seed_anchor;
unsigned query_gencode;
string unaligned;
- double space_penalty, raw_space_penalty;
- double min_diag_score;
- int min_diag_raw_score;
+ double space_penalty;
bool new_prefilter;
bool reverse;
unsigned comp_based_stats;
@@ -128,15 +123,39 @@ struct Config
double rho;
double p_best;
double d_exp, d_new;
+ double score_estimate_factor;
+ int diag_min_estimate;
+ string qfilt, sfilt;
+ double path_cutoff;
+ bool use_smith_waterman;
+ string prot_accession2taxid;
+ int superblock;
+ unsigned max_cells;
+ int masking;
+ bool benchmark_ranking;
+ bool log_query;
+ bool log_subject;
+ unsigned threads_align;
+ double score_ratio;
+ bool small_query;
+ bool hashed_seeds;
enum {
makedb = 0, blastp = 1, blastx = 2, view = 3, help = 4, version = 5, getseq = 6, benchmark = 7, random_seqs = 8, compare = 9, sort = 10, roc = 11, db_stat = 12, model_sim = 13,
- match_file_stat = 14, model_seqs = 15, opt = 16
+ match_file_stat = 14, model_seqs = 15, opt = 16, mask = 17, fastq2fasta=18
};
unsigned command;
- enum { double_indexed = 0, subject_indexed = 1 };
- unsigned algo;
+ enum { double_indexed = 0, query_indexed = 1, subject_indexed = 2 };
+ int algo;
+
+ enum { query_parallel = 0, target_parallel = 1 };
+ unsigned load_balancing;
+
+ enum {
+ swipe = 0, greedy = 1, floating_xdrop = 4, more_greedy = 2, most_greedy=3,
+ };
+ int ext;
Config() {}
Config(int argc, const char **argv);
diff --git a/src/basic/const.h b/src/basic/const.h
index f915e31..0dd7c7a 100644
--- a/src/basic/const.h
+++ b/src/basic/const.h
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef CONST_H_
@@ -23,7 +23,7 @@ struct Const
{
enum {
- build_version = 97,
+ build_version = 109,
daa_version = 0,
seedp_bits = 10,
seedp = 1<<seedp_bits,
diff --git a/src/basic/hssp.cpp b/src/basic/hssp.cpp
index 4030b14..d0134d9 100644
--- a/src/basic/hssp.cpp
+++ b/src/basic/hssp.cpp
@@ -1,27 +1,27 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#include "../align/align.h"
bool Hsp_data::pass_through(const Diagonal_segment &d) const
{
- if (intersect(d.query_range(), query_range).length() != (size_t)d.len
- || intersect(d.subject_range(), subject_range).length() != (size_t)d.len)
+ if (intersect(d.query_range(), query_range).length() != d.len
+ || intersect(d.subject_range(), subject_range).length() != d.len)
return false;
Iterator it = begin();
@@ -39,6 +39,17 @@ bool Hsp_data::pass_through(const Diagonal_segment &d) const
return true;
}
+std::pair<int, int> Hsp_data::diagonal_bounds() const
+{
+	// Lowest and highest diagonal (query_pos - subject_pos) seen while iterating; stays (INT_MAX, INT_MIN) if the iterator yields nothing.
+	std::pair<int, int> bounds(std::numeric_limits<int>::max(), std::numeric_limits<int>::min());
+	for (Iterator pos = begin(); pos.good(); ++pos) {
+		const int diag = (int)pos.query_pos - (int)pos.subject_pos;
+		bounds = std::make_pair(std::min(bounds.first, diag), std::max(bounds.second, diag));
+	}
+	return bounds;
+}
+
bool Hsp_data::is_weakly_enveloped(const Hsp_data &j) const
{
static const double overlap_factor = 0.9;
@@ -92,6 +103,7 @@ Hsp_context& Hsp_context::parse()
for (; i.good(); ++i) {
++hsp_.length;
+ assert(i.query_pos < query.length());
if (i.query_pos >= query.length())
throw std::runtime_error("Query sequence index out of bounds.");
switch (i.op()) {
diff --git a/src/basic/masking.cpp b/src/basic/masking.cpp
new file mode 100644
index 0000000..b34641c
--- /dev/null
+++ b/src/basic/masking.cpp
@@ -0,0 +1,94 @@
+/****
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+****/
+
+#include "masking.h"
+
+auto_ptr<Masking> Masking::instance;
+const uint8_t Masking::bit_mask = 128;
+
+Masking::Masking(const Score_matrix &score_matrix)
+{
+ const double lambda = score_matrix.lambda(); // 0.324032
+ for (unsigned i = 0; i < size; ++i) {
+ mask_table_x_[i] = value_traits.mask_char;
+ mask_table_bit_[i] = (uint8_t)i | bit_mask;
+ for (unsigned j = 0; j < size; ++j)
+ if (i < value_traits.alphabet_size && j < value_traits.alphabet_size)
+ likelihoodRatioMatrix_[i][j] = exp(lambda * score_matrix(i, j));
+ }
+ std::copy(likelihoodRatioMatrix_, likelihoodRatioMatrix_ + size, probMatrixPointers_);
+ int firstGapCost = score_matrix.gap_extend() + score_matrix.gap_open();
+ firstGapProb_ = exp(-lambda * firstGapCost);
+ otherGapProb_ = exp(-lambda * score_matrix.gap_extend());
+ firstGapProb_ /= (1 - otherGapProb_);
+}
+
+void Masking::operator()(Letter *seq, size_t len) const
+{
+ tantan::maskSequences((tantan::uchar*)seq, (tantan::uchar*)(seq + len), 50,
+ (tantan::const_double_ptr*)probMatrixPointers_,
+ 0.005, 0.05,
+ 0.9,
+ 0, 0,
+ 0.5, (const tantan::uchar*)mask_table_x_);
+}
+
+void Masking::mask_bit(Letter *seq, size_t len) const
+{
+ tantan::maskSequences((tantan::uchar*)seq, (tantan::uchar*)(seq + len), 50,
+ (tantan::const_double_ptr*)probMatrixPointers_,
+ 0.005, 0.05,
+ 0.9,
+ 0, 0,
+ 0.5, (const tantan::uchar*)mask_table_bit_);
+}
+
+void Masking::bit_to_hard_mask(Letter *seq, size_t len, size_t &n) const
+{
+ for (size_t i = 0; i < len; ++i)
+ if (seq[i] & bit_mask) {
+ seq[i] = value_traits.mask_char;
+ ++n;
+ }
+}
+
+void Masking::remove_bit_mask(Letter *seq, size_t len) const
+{
+ for (size_t i = 0; i < len; ++i)
+ if (seq[i] & bit_mask)
+ seq[i] &= ~bit_mask;
+}
+
+void mask_worker(Atomic<size_t> *next, Sequence_set *seqs, const Masking *masking, bool hard_mask)
+{
+ size_t i;
+ while ((i = (*next)++) < seqs->get_length())
+ if (hard_mask)
+ masking->operator()(seqs->ptr(i), seqs->length(i));
+ else
+ masking->mask_bit(seqs->ptr(i), seqs->length(i));
+}
+
+void mask_seqs(Sequence_set &seqs, const Masking &masking, bool hard_mask)
+{
+ Thread_pool threads;
+ Atomic<size_t> next(0);
+ for (size_t i = 0; i < config.threads_; ++i)
+ threads.push_back(launch_thread(mask_worker, &next, &seqs, &masking, hard_mask));
+ threads.join_all();
+}
\ No newline at end of file
diff --git a/src/basic/masking.h b/src/basic/masking.h
new file mode 100644
index 0000000..7a700c0
--- /dev/null
+++ b/src/basic/masking.h
@@ -0,0 +1,50 @@
+/****
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+****/
+
+#include <vector>
+#include <memory>
+#include <math.h>
+#include "value.h"
+#include "score_matrix.h"
+#include "../basic/sequence.h"
+#include "../lib/tantan/tantan.hh"
+#include "../data/sequence_set.h"
+
+using std::vector;
+using std::auto_ptr;
+
+struct Masking
+{
+ Masking(const Score_matrix &score_matrix);
+ void operator()(Letter *seq, size_t len) const;
+ void mask_bit(Letter *seq, size_t len) const;
+ void bit_to_hard_mask(Letter *seq, size_t len, size_t &n) const;
+ void remove_bit_mask(Letter *seq, size_t len) const;
+ static const Masking& get()
+ {
+ return *instance;
+ }
+ static auto_ptr<Masking> instance;
+ static const uint8_t bit_mask;
+private:
+ enum { size = 64 };
+ double likelihoodRatioMatrix_[size][size], *probMatrixPointers_[size], firstGapProb_, otherGapProb_;
+ char mask_table_x_[size], mask_table_bit_[size];
+};
+
+void mask_seqs(Sequence_set &seqs, const Masking &masking, bool hard_mask = true);
\ No newline at end of file
diff --git a/src/basic/match.h b/src/basic/match.h
index e10f0a5..555ecb6 100644
--- a/src/basic/match.h
+++ b/src/basic/match.h
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2014-2016, University of Tuebingen, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef MATCH_H_
@@ -81,6 +81,15 @@ struct Diagonal_segment
{
return i - j;
}
+ Diagonal_segment intersect(const Diagonal_segment &x) const
+ {
+ if (diag() != x.diag())
+ return Diagonal_segment();
+ else {
+ const interval q = ::intersect(query_range(), x.query_range());
+ return Diagonal_segment(q.begin_, ::intersect(subject_range(), x.subject_range()).begin_, q.length(), 0);
+ }
+ }
bool is_enveloped(const Diagonal_segment &x) const
{
return score <= x.score
@@ -99,18 +108,32 @@ struct Diagonal_segment
{
return i + len <= rhs.i && j + len <= rhs.j;
}
+ bool operator==(const Diagonal_segment &rhs) const
+ {
+ return i == rhs.i && j == rhs.j && len == rhs.len;
+ }
static bool cmp_subject(const Diagonal_segment &x, const Diagonal_segment &y)
{
- return x.j < y.j;
+ return x.j < y.j || (x.j == y.j && x.i < y.i);
}
static bool cmp_subject_end(const Diagonal_segment &x, const Diagonal_segment &y)
{
return x.subject_end() < y.subject_end();
}
+ static bool cmp_heuristic(const Diagonal_segment &x, const Diagonal_segment &y)
+ {
+ return (x.subject_end() < y.subject_end() && x.j < y.j)
+ || (x.j - y.j < y.subject_end() - x.subject_end());
+ }
friend int abs_shift(const Diagonal_segment &x, const Diagonal_segment &y)
{
return abs(x.diag() - y.diag());
}
+ friend std::ostream& operator<<(std::ostream &s, const Diagonal_segment &d)
+ {
+ s << "i=" << d.i << " j=" << d.j << " l=" << d.len << " score=" << d.score;
+ return s;
+ }
int i, j, len, score;
};
@@ -118,7 +141,7 @@ struct Intermediate_record;
struct Hsp_data
{
- Hsp_data():
+ Hsp_data() :
score(0),
frame(0),
length(0),
@@ -126,9 +149,10 @@ struct Hsp_data
mismatches(0),
positives(0),
gap_openings(0),
- gaps(0)
+ gaps(0),
+ sw_score(0)
{}
- Hsp_data(int score):
+ Hsp_data(int score) :
score(unsigned(score)),
frame(0),
length(0),
@@ -136,7 +160,8 @@ struct Hsp_data
mismatches(0),
positives(0),
gap_openings(0),
- gaps(0)
+ gaps(0),
+ sw_score(0)
{}
Hsp_data(const Intermediate_record &r, unsigned query_source_len);
struct Iterator
@@ -232,10 +257,16 @@ struct Hsp_data
{
return (double)subject_range.length() * 100 / subject_len;
}
+ static bool cmp_query_pos(const Hsp_data &x, const Hsp_data &y)
+ {
+ return x.query_range.begin_ < y.query_range.begin_;
+ }
bool pass_through(const Diagonal_segment &d) const;
bool is_weakly_enveloped(const Hsp_data &j) const;
+ std::pair<int, int> diagonal_bounds() const;
void merge(const Hsp_data &right, const Hsp_data &left, unsigned query_anchor, unsigned subject_anchor);
- unsigned score, frame, length, identities, mismatches, positives, gap_openings, gaps;
+ unsigned score, frame, length, identities, mismatches, positives, gap_openings, gaps, sw_score;
+ float time;
interval query_source_range, query_range, subject_range;
Packed_transcript transcript;
};
@@ -329,6 +360,14 @@ struct Hsp_context
{
return score_matrix.bitscore(score());
}
+ double sw_score() const
+ {
+ return score_matrix.bitscore(hsp_.sw_score);
+ }
+ double time() const
+ {
+ return hsp_.time;
+ }
unsigned frame() const
{ return hsp_.frame; }
unsigned length() const
diff --git a/src/basic/packed_loc.h b/src/basic/packed_loc.h
index 09294f4..6a6a4b3 100644
--- a/src/basic/packed_loc.h
+++ b/src/basic/packed_loc.h
@@ -1,21 +1,19 @@
/****
-Copyright (c) 2014, University of Tuebingen
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-****
-Author: Benjamin Buchfink
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef PACKED_LOC_H_
diff --git a/src/basic/packed_sequence.h b/src/basic/packed_sequence.h
index 572cc5f..6989dea 100644
--- a/src/basic/packed_sequence.h
+++ b/src/basic/packed_sequence.h
@@ -1,20 +1,19 @@
/****
-Copyright (c) 2015, University of Tuebingen
-Author: Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef PACKED_SEQUENCE_H_
diff --git a/src/basic/packed_transcript.h b/src/basic/packed_transcript.h
index 7c4856e..7cbb685 100644
--- a/src/basic/packed_transcript.h
+++ b/src/basic/packed_transcript.h
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2015-2016, University of Tuebingen, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef PACKED_TRANSCRIPT_H_
@@ -101,16 +101,6 @@ struct Packed_transcript
Combined_operation op_;
};
- void read(Buffered_file &f)
- {
- data_.clear();
- uint8_t code;
- do {
- f.read(code);
- data_.push_back(code);
- } while (code != Packed_operation::terminator());
- }
-
void read(Binary_buffer::Iterator &it)
{
data_.clear();
@@ -147,7 +137,16 @@ struct Packed_transcript
void push_back(Edit_operation op, unsigned count)
{
- data_.push_back(Packed_operation(op, count));
+ while (count > 0) {
+ const unsigned n = std::min(count, 63u);
+ data_.push_back(Packed_operation(op, n));
+ count -= n;
+ }
+ }
+
+ void reverse()
+ {
+ std::reverse(data_.begin(), data_.end());
}
void push_terminator()
diff --git a/src/basic/reduction.h b/src/basic/reduction.h
index 7bf6420..8b4e623 100644
--- a/src/basic/reduction.h
+++ b/src/basic/reduction.h
@@ -1,21 +1,19 @@
/****
-Copyright (c) 2014, University of Tuebingen
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-****
-Author: Benjamin Buchfink
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef REDUCTION_H_
@@ -41,6 +39,7 @@ struct Reduction
map_[(long)value_traits.mask_char] = value_traits.mask_char;
const vector<string> tokens(tokenize(definition_string, " "));
size_ = (unsigned)tokens.size();
+ bit_size_ = (uint64_t)ceil(log(size_) / log(2));
for (unsigned i = 0; i<size_; ++i)
for (unsigned j = 0; j<tokens[i].length(); ++j) {
const char ch = tokens[i][j];
@@ -55,6 +54,11 @@ struct Reduction
return size_;
}
+ uint64_t bit_size() const
+ {
+ return bit_size_;
+ }
+
unsigned operator()(Letter a) const
{
return map_[(long)a];
@@ -96,6 +100,7 @@ private:
char map8_[256] __attribute__((aligned(16)));
#endif
unsigned size_;
+ uint64_t bit_size_;
};
@@ -103,4 +108,4 @@ private:
#include "../../../extra/reduction.h"
#endif
-#endif /* REDUCTION_H_ */
+#endif /* REDUCTION_H_ */
\ No newline at end of file
diff --git a/src/basic/score_matrix.cpp b/src/basic/score_matrix.cpp
index 1a925e3..34cf6e9 100644
--- a/src/basic/score_matrix.cpp
+++ b/src/basic/score_matrix.cpp
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#include <string>
@@ -165,6 +165,7 @@ struct Matrix_info
const array_of_8 *constants;
const char *scores;
const unsigned count;
+ const int default_gap_open, default_gap_extend;
static const Matrix_info& get(const string &name)
{
@@ -188,20 +189,20 @@ struct Matrix_info
};
const Matrix_info Matrix_info::matrices[] = {
- { "BLOSUM45", blosum45_values, (const char*)NCBISM_Blosum45.scores, BLOSUM45_VALUES_MAX },
- { "BLOSUM50", blosum50_values, (const char*)NCBISM_Blosum50.scores, BLOSUM50_VALUES_MAX },
- { "BLOSUM62", blosum62_values, (const char*)NCBISM_Blosum62.scores, BLOSUM62_VALUES_MAX },
- { "BLOSUM80", blosum80_values, (const char*)NCBISM_Blosum80.scores, BLOSUM80_VALUES_MAX },
- { "BLOSUM90", blosum90_values, (const char*)NCBISM_Blosum90.scores, BLOSUM90_VALUES_MAX },
- { "PAM70", pam70_values, (const char*)NCBISM_Pam70.scores, PAM70_VALUES_MAX },
- { "PAM250", pam250_values, (const char*)NCBISM_Pam250.scores, PAM250_VALUES_MAX },
- { "PAM30", pam30_values, (const char*)NCBISM_Pam30.scores, PAM30_VALUES_MAX }
+ { "BLOSUM45", blosum45_values, (const char*)NCBISM_Blosum45.scores, BLOSUM45_VALUES_MAX, 14, 2 },
+ { "BLOSUM50", blosum50_values, (const char*)NCBISM_Blosum50.scores, BLOSUM50_VALUES_MAX, 13, 2 },
+ { "BLOSUM62", blosum62_values, (const char*)NCBISM_Blosum62.scores, BLOSUM62_VALUES_MAX, 11, 1 },
+ { "BLOSUM80", blosum80_values, (const char*)NCBISM_Blosum80.scores, BLOSUM80_VALUES_MAX, 10, 1 },
+ { "BLOSUM90", blosum90_values, (const char*)NCBISM_Blosum90.scores, BLOSUM90_VALUES_MAX, 10, 1 },
+ { "PAM70", pam70_values, (const char*)NCBISM_Pam70.scores, PAM70_VALUES_MAX, 10, 1 },
+ { "PAM250", pam250_values, (const char*)NCBISM_Pam250.scores, PAM250_VALUES_MAX, 14, 2 },
+ { "PAM30", pam30_values, (const char*)NCBISM_Pam30.scores, PAM30_VALUES_MAX, 9, 1 }
};
Score_matrix::Score_matrix(const string & matrix, int gap_open, int gap_extend, int reward, int penalty):
- gap_open_ (gap_open),
- gap_extend_ (gap_extend),
- constants_ (Matrix_info::get(matrix).get_constants(gap_open, gap_extend)),
+ gap_open_ (gap_open == -1 ? Matrix_info::get(matrix).default_gap_open : gap_open),
+ gap_extend_ (gap_extend == -1 ? Matrix_info::get(matrix).default_gap_extend : gap_extend),
+ constants_ (Matrix_info::get(matrix).get_constants(gap_open_, gap_extend_)),
name_(matrix),
matrix8_(Matrix_info::get(matrix).scores),
bias_((char)(-low_score())),
@@ -218,6 +219,24 @@ char Score_matrix::low_score() const
return low;
}
+double Score_matrix::avg_id_score() const
+{
+ double s = 0;
+ for (int i = 0; i < 20; ++i)
+ s += this->operator()(i, i);
+ return s / 20;
+}
+
+std::ostream& operator<<(std::ostream& s, const Score_matrix &m)
+{
+ s << "(Matrix=" << m.name_
+ << " Lambda=" << m.lambda()
+ << " K=" << m.k()
+ << " Penalties=" << m.gap_open_
+ << '/' << m.gap_extend_ << ')';
+ return s;
+}
+
const char* custom_scores(const string &matrix_file)
{
static char scores[25 * 25];
diff --git a/src/basic/score_matrix.h b/src/basic/score_matrix.h
index fd599c0..06ec2bd 100644
--- a/src/basic/score_matrix.h
+++ b/src/basic/score_matrix.h
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2016, University of Tuebingen, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef SCORE_MATRIX_H_
@@ -39,15 +39,7 @@ struct Score_matrix
Score_matrix(const string &matrix, int gap_open, int gap_extend, int reward, int penalty);
Score_matrix(const string &matrix_file, double lambda, double K, int gap_open, int gap_extend);
- friend std::ostream& operator<<(std::ostream& s, const Score_matrix &m)
- {
- s << "(Matrix=" << m.name_
- << " Lambda=" << m.lambda()
- << " K=" << m.k()
- << " Penalties=" << m.gap_open_
- << '/' << m.gap_extend_ << ')';
- return s;
- }
+ friend std::ostream& operator<<(std::ostream& s, const Score_matrix &m);
const int8_t* matrix8() const
{ return matrix8_.data; }
@@ -101,6 +93,18 @@ struct Score_matrix
char low_score() const;
+ int gap_open() const
+ {
+ return gap_open_;
+ }
+
+ int gap_extend() const
+ {
+ return gap_extend_;
+ }
+
+ double avg_id_score() const;
+
private:
template<typename _t>
diff --git a/src/basic/seed.h b/src/basic/seed.h
index 0d7d8e1..fbf7478 100644
--- a/src/basic/seed.h
+++ b/src/basic/seed.h
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2014-2016, University of Tuebingen, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef SEED_H_
diff --git a/src/basic/seed_iterator.h b/src/basic/seed_iterator.h
index 169adef..6ef030b 100644
--- a/src/basic/seed_iterator.h
+++ b/src/basic/seed_iterator.h
@@ -1,23 +1,27 @@
/****
-Copyright (c) 2014-2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
+#ifndef SEED_ITERATOR_H_
+#define SEED_ITERATOR_H_
+
#include "shape.h"
#include "sequence.h"
+#include "../util/hash_function.h"
struct Seed_iterator
{
@@ -35,4 +39,71 @@ struct Seed_iterator
}
private:
const char *ptr_, *end_;
-};
\ No newline at end of file
+};
+
+template<uint64_t _b> // _b: left shift applied per letter (presumably the bit width of one reduced letter -- confirm against Reduction)
+struct Hashed_seed_iterator // Walks a sequence one letter at a time, keeps a rolling packed value and yields a murmur hash of it.
+{
+ Hashed_seed_iterator(const sequence &seq, const shape &sh):
+ ptr_(seq.data()),
+ end_(ptr_ + seq.length()),
+ last_(0)
+ {
+ for (uint64_t i = 0; i < sh.length_ - 1; ++i) // pre-load the first sh.length_ - 1 letters; NOTE(review): not bounded by end_ and mask_char is not checked here
+ last_ = (last_ << _b) | Reduction::reduction(*(ptr_++));
+ }
+ bool good() const // true while unread letters remain (ptr_ < end_)
+ {
+ return ptr_ < end_;
+ }
+ bool get(uint64_t &seed, uint64_t mask) // consumes one letter; on success stores murmur_hash()(last_ & mask) in seed and returns true
+ {
+ last_ <<= _b;
+ const char l = *(ptr_++);
+ if (l == value_traits.mask_char)
+ return false; // masked letter: seed is left untouched; last_ was already shifted, so this slot keeps zero bits
+ last_ |= Reduction::reduction(l);
+ seed = murmur_hash()(last_ & mask);
+ return true;
+ }
+private:
+ const char *ptr_, *end_; // next letter to read / end of the sequence (data() + length())
+ uint64_t last_; // rolling value: reduced letters packed _b bits apart, newest letter in the low bits
+};
+
+template<uint64_t _l, uint64_t _b> // _l: seed length in letters (returned by length()); _b: left shift applied per letter
+struct Contiguous_seed_iterator // Walks a sequence and yields the packed value of the most recent _l letters (assuming each reduced letter fits in _b bits).
+{
+ Contiguous_seed_iterator(const sequence &seq) :
+ ptr_(seq.data()),
+ end_(ptr_ + seq.length()),
+ last_(0)
+ {
+ for (uint64_t i = 0; i < _l - 1; ++i) // pre-load the first _l - 1 letters; NOTE(review): not bounded by end_ and mask_char is not checked here
+ last_ = (last_ << _b) | Reduction::reduction(*(ptr_++));
+ }
+ bool good() const // true while unread letters remain (ptr_ < end_)
+ {
+ return ptr_ < end_;
+ }
+ bool get(uint64_t &seed) // consumes one letter; on success stores the packed seed in seed and returns true
+ {
+ last_ <<= _b;
+ last_ &= (uint64_t(1) << (_b*_l)) - 1; // keep the low _b*_l bits; the shifted 1 must be 64-bit, a plain int 1 overflows once _b*_l >= 31 (still requires _b*_l < 64)
+ const char l = *(ptr_++);
+ if (l == value_traits.mask_char)
+ return false; // masked letter: seed is left untouched; last_ was already shifted, so this slot keeps zero bits
+ last_ |= Reduction::reduction(l);
+ seed = last_;
+ return true;
+ }
+ static uint64_t length() // seed length in letters, i.e. the template argument _l
+ {
+ return _l;
+ }
+private:
+ const char *ptr_, *end_; // next letter to read / end of the sequence (data() + length())
+ uint64_t last_; // rolling value: reduced letters packed _b bits apart, newest letter in the low bits
+};
+
+#endif
\ No newline at end of file
diff --git a/src/basic/sequence.h b/src/basic/sequence.h
index 479aeab..349a9de 100644
--- a/src/basic/sequence.h
+++ b/src/basic/sequence.h
@@ -1,21 +1,19 @@
/****
-Copyright (c) 2014, University of Tuebingen
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-****
-Author: Benjamin Buchfink
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef SEQUENCE_H_
@@ -68,6 +66,10 @@ struct sequence
{
return data_;
}
+ const Letter* end() const // returns data_ + len_ (presumably one past the last letter -- confirm that len_ counts letters)
+ {
+ return data_ + len_;
+ }
const Letter* clipped_data() const
{
return data_ + clipping_offset_;
@@ -122,6 +124,21 @@ struct sequence
buf << value_traits.alphabet[(long)s.data_[i]];
return buf;
}
+ static sequence get_window(const Letter *s, int window) // builds a sequence around s, scanning at most `window` steps in each direction
+ {
+ const Letter *p = s;
+ int n = 0;
+ while (*p != '\xff' && n < window) { // walk backwards until a '\xff' byte (presumably the sequence delimiter -- confirm) or `window` steps
+ --p;
+ ++n;
+ }
+ n = 0; // restart the step counter for the forward scan
+ while (*s != '\xff' && n < window) { // walk forwards under the same two stop conditions
+ ++s;
+ ++n;
+ }
+ return sequence(p + 1, s - p - 1); // the letters strictly between p and s; NOTE(review): assumes memory on both sides of s is readable up to a '\xff' or `window` letters
+ }
/*friend std::ostream& operator<<(std::ostream &os, const sequence &s)
{
std::cout << "co = " << s.clipping_offset_ << std::endl;
diff --git a/src/basic/shape.h b/src/basic/shape.h
index 266b930..97e3212 100644
--- a/src/basic/shape.h
+++ b/src/basic/shape.h
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2014-2016, University of Tuebingen, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef SHAPE_H_
@@ -100,20 +100,24 @@ struct shape
weight_ (0),
mask_ (0),
rev_mask_ (0),
- id_ (id)
+ id_ (id),
+ long_mask_(0)
{
assert(id < Const::max_shapes);
assert(strlen(code) <= 32);
memset(positions_, 0, sizeof(uint32_t)*Const::max_seed_weight);
+ const uint64_t b = Reduction::reduction.bit_size();
unsigned i (0);
for(;i<strlen(code);++i) {
rev_mask_ <<= 1;
+ long_mask_ <<= b;
if(code[i] == '1') {
assert(weight_ < Const::max_seed_weight);
positions_[weight_] = i;
++weight_;
mask_ |= 1 << i;
rev_mask_ |= 1;
+ long_mask_ |= (1 << b) - 1;
}
}
length_ = i;
@@ -126,7 +130,7 @@ struct shape
#ifdef FREQUENCY_MASKING
double f = 0;
#endif
- for(unsigned i=0;i<weight_;++i) {
+ for (unsigned i = 0; i < weight_; ++i) {
Letter l = seq[positions_[i]];
if (l == value_traits.mask_char || l == '\xff')
return false;
@@ -143,6 +147,21 @@ struct shape
return true;
}
+ inline bool set_seed_shifted(Packed_seed &s, const Letter *seq) const // packs the reduced letters at this shape's positions into s; returns false on a masked or '\xff' letter
+ {
+ s = 0;
+ const uint64_t b = Reduction::reduction.bit_size(); // shift applied per packed letter
+ for (unsigned i = 0; i < weight_; ++i) {
+ Letter l = seq[positions_[i]]; // only the weight_ positions recorded in positions_ are read
+ if (l == value_traits.mask_char || l == '\xff')
+ return false; // s is left holding the letters packed so far
+ unsigned r = Reduction::reduction(l);
+ s <<= b; // make room in the low bits, then append the reduced letter
+ s |= uint64_t(r);
+ }
+ return true;
+ }
+
inline bool set_seed_reduced(Packed_seed &s, const Letter *seq) const
{
s = 0;
@@ -195,7 +214,18 @@ struct shape
return true;
}
+ bool contiguous() const // true when length_ equals weight_, i.e. the constructor counted a '1' at every position of the shape code
+ {
+ return length_ == weight_;
+ }
+
+ uint64_t long_mask() const // returns long_mask_, which the constructor builds by shifting bit_size() bits per code position and setting them for each '1'
+ {
+ return long_mask_;
+ }
+
uint32_t length_, weight_, positions_[Const::max_seed_weight], d_, mask_, rev_mask_, id_;
+ uint64_t long_mask_;
};
diff --git a/src/basic/shape_config.h b/src/basic/shape_config.h
index 4e89d72..4918f8e 100644
--- a/src/basic/shape_config.h
+++ b/src/basic/shape_config.h
@@ -1,21 +1,19 @@
/****
-Copyright (c) 2014, University of Tuebingen
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-****
-Author: Benjamin Buchfink
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef SHAPE_CONFIG_H_
@@ -23,7 +21,7 @@ Author: Benjamin Buchfink
#include "shape.h"
-extern const char* shape_codes[6][Const::max_shapes];
+extern const char* shape_codes[12][Const::max_shapes];
class shape_config
{
@@ -54,7 +52,7 @@ public:
unsigned count() const
{ return n_; }
- const shape& operator[](unsigned i) const
+ const shape& operator[](size_t i) const
{ return shapes_[i]; }
unsigned mode() const
diff --git a/src/basic/statistics.h b/src/basic/statistics.h
index 7d6ad92..ea193ae 100644
--- a/src/basic/statistics.h
+++ b/src/basic/statistics.h
@@ -1,21 +1,19 @@
/****
-Copyright (c) 2014, University of Tuebingen
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-****
-Author: Benjamin Buchfink
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef STATISTICS_H_
@@ -32,9 +30,11 @@ typedef uint64_t stat_type;
struct Statistics
{
- enum value { SEED_HITS, TENTATIVE_MATCHES0, TENTATIVE_MATCHES1, TENTATIVE_MATCHES2, TENTATIVE_MATCHES3, TENTATIVE_MATCHES4, TENTATIVE_MATCHESX, MATCHES, ALIGNED, GAPPED, DUPLICATES,
+ enum value {
+ SEED_HITS, TENTATIVE_MATCHES0, TENTATIVE_MATCHES1, TENTATIVE_MATCHES2, TENTATIVE_MATCHES3, TENTATIVE_MATCHES4, TENTATIVE_MATCHESX, MATCHES, ALIGNED, GAPPED, DUPLICATES,
GAPPED_HITS, QUERY_SEEDS, QUERY_SEEDS_HIT, REF_SEEDS, REF_SEEDS_HIT, QUERY_SIZE, REF_SIZE, OUT_HITS, OUT_MATCHES, COLLISION_LOOKUPS, QCOV, BIAS_ERRORS, SCORE_TOTAL, ALIGNED_QLEN, PAIRWISE, HIGH_SIM,
- TEMP_SPACE, SECONDARY_HITS, ERASED_HITS, COUNT };
+ TEMP_SPACE, SECONDARY_HITS, ERASED_HITS, SQUARED_ERROR, CELLS, OUTRANKED_HITS, TARGET_HITS0, TARGET_HITS1, TARGET_HITS2, TIME_GREEDY_EXT, COUNT
+ };
Statistics()
{ memset(data_, 0, sizeof(data_)); }
@@ -61,26 +61,33 @@ struct Statistics
void print() const
{
- log_stream << "Used ref size = " << data_[REF_SIZE] << endl;
- log_stream << "Traceback errors = " << data_[BIAS_ERRORS] << endl;
- verbose_stream << "Hits (filter stage 0) = " << data_[SEED_HITS] << endl;
- verbose_stream << "Hits (filter stage 1) = " << data_[TENTATIVE_MATCHES1] << " (" << data_[TENTATIVE_MATCHES1]*100.0/ data_[SEED_HITS] << " %)" << endl;
- verbose_stream << "Hits (filter stage x) = " << data_[TENTATIVE_MATCHESX] << " (" << data_[TENTATIVE_MATCHESX] * 100.0 / data_[TENTATIVE_MATCHES1] << " %)" << endl;
- verbose_stream << "Hits (filter stage 2) = " << data_[TENTATIVE_MATCHES2] << " (" << data_[TENTATIVE_MATCHES2] * 100.0 / data_[TENTATIVE_MATCHESX] << " %)" << endl;
- verbose_stream << "Hits (filter stage 3) = " << data_[TENTATIVE_MATCHES3] << " (" << data_[TENTATIVE_MATCHES3] * 100.0 / data_[TENTATIVE_MATCHES2] << " %)" << endl;
- verbose_stream << "Hits (filter stage 4) = " << data_[TENTATIVE_MATCHES4] << " (" << data_[TENTATIVE_MATCHES4] * 100.0 / data_[TENTATIVE_MATCHES3] << " %)" << endl;
- log_stream << "Gapped hits = " << data_[GAPPED_HITS] << endl;
- log_stream << "Overlap hits = " << data_[DUPLICATES] << endl;
- log_stream << "Secondary hits = " << data_[SECONDARY_HITS] << endl;
- log_stream << "Erased hits = " << data_[ERASED_HITS] << endl;
- log_stream << "High similarity hits = " << data_[HIGH_SIM] << endl;
- log_stream << "Net hits = " << data_[OUT_HITS] << endl;
- log_stream << "Matches = " << data_[OUT_MATCHES] << endl;
- log_stream << "Total score = " << data_[SCORE_TOTAL] << endl;
- log_stream << "Aligned query len = " << data_[ALIGNED_QLEN] << endl;
- log_stream << "Gapped matches = " << data_[GAPPED] << endl;
+ //log_stream << "Used ref size = " << data_[REF_SIZE] << endl;
+ //log_stream << "Traceback errors = " << data_[BIAS_ERRORS] << endl;
+ log_stream << "Hits (filter stage 0) = " << data_[SEED_HITS] << endl;
+ log_stream << "Hits (filter stage 1) = " << data_[TENTATIVE_MATCHES1] << " (" << data_[TENTATIVE_MATCHES1]*100.0/ data_[SEED_HITS] << " %)" << endl;
+ log_stream << "Hits (filter stage 2) = " << data_[TENTATIVE_MATCHES2] << " (" << data_[TENTATIVE_MATCHES2] * 100.0 / data_[TENTATIVE_MATCHES1] << " %)" << endl;
+ log_stream << "Hits (filter stage x) = " << data_[TENTATIVE_MATCHESX] << " (" << data_[TENTATIVE_MATCHESX] * 100.0 / data_[TENTATIVE_MATCHES2] << " %)" << endl;
+ log_stream << "Hits (filter stage 3) = " << data_[TENTATIVE_MATCHES3] << " (" << data_[TENTATIVE_MATCHES3] * 100.0 / data_[TENTATIVE_MATCHESX] << " %)" << endl;
+ log_stream << "Hits (filter stage 4) = " << data_[TENTATIVE_MATCHES4] << " (" << data_[TENTATIVE_MATCHES4] * 100.0 / data_[TENTATIVE_MATCHES3] << " %)" << endl;
+ log_stream << "Target hits (stage 0) = " << data_[TARGET_HITS0] << endl;
+ log_stream << "Target hits (stage 1) = " << data_[TARGET_HITS1] << endl;
+ log_stream << "Target hits (stage 2) = " << data_[TARGET_HITS2] << endl;
+ log_stream << "Time (greedy extension) = " << data_[TIME_GREEDY_EXT]/1e9 << "s" << endl;
+ //log_stream << "Gapped hits = " << data_[GAPPED_HITS] << endl;
+ //log_stream << "Overlap hits = " << data_[DUPLICATES] << endl;
+ //log_stream << "Secondary hits = " << data_[SECONDARY_HITS] << endl;
+ //log_stream << "Erased hits = " << data_[ERASED_HITS] << endl;
+ //log_stream << "High similarity hits = " << data_[HIGH_SIM] << endl;
+ //log_stream << "Net hits = " << data_[OUT_HITS] << endl;
+ //log_stream << "Matches = " << data_[OUT_MATCHES] << endl;
+ //log_stream << "Total score = " << data_[SCORE_TOTAL] << endl;
+ //log_stream << "Aligned query len = " << data_[ALIGNED_QLEN] << endl;
+ //log_stream << "Gapped matches = " << data_[GAPPED] << endl;
+ log_stream << "MSE = " << (double)data_[SQUARED_ERROR] / (double)data_[OUT_HITS] << endl;
+ //log_stream << "Cells = " << data_[CELLS] << endl;
verbose_stream << "Temporary disk space used: " << (double)data_[TEMP_SPACE] / (1 << 30) << " GB" << endl;
- message_stream << "Reported " << data_[PAIRWISE] << " pairwise alignments, " << data_[MATCHES] << " HSSPs." << endl;
+ log_stream << "Outranked hits = " << data_[OUTRANKED_HITS] << " (" << data_[OUTRANKED_HITS]*100.0/ data_[PAIRWISE] << "%)" << endl;
+ message_stream << "Reported " << data_[PAIRWISE] << " pairwise alignments, " << data_[MATCHES] << " HSPs." << endl;
message_stream << data_[ALIGNED] << " queries aligned." << endl;
}
diff --git a/src/basic/translate.h b/src/basic/translate.h
index 83b82a7..6620e60 100644
--- a/src/basic/translate.h
+++ b/src/basic/translate.h
@@ -1,21 +1,19 @@
/****
-Copyright (c) 2014, University of Tuebingen
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-****
-Author: Benjamin Buchfink
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef TRANSLATE_H_
diff --git a/src/basic/value.h b/src/basic/value.h
index d814ac3..5f7a256 100644
--- a/src/basic/value.h
+++ b/src/basic/value.h
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2014-2016, University of Tuebingen, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef VALUE_H_
diff --git a/src/data/count_approximate.cpp b/src/data/count_approximate.cpp
deleted file mode 100644
index dcff5dc..0000000
--- a/src/data/count_approximate.cpp
+++ /dev/null
@@ -1,110 +0,0 @@
-/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
-
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-****/
-
-#include <set>
-#include "index.h"
-#include "../basic/shape_config.h"
-
-template<unsigned PartitionBits> struct Flajolet_Martin_counter
-{
- static const unsigned N = 1 << PartitionBits;
- static const uint64_t MASK = N - 1;
- static const uint64_t HIGH = MASK << (64 - PartitionBits);
- static const double PHI;
- uint64_t buckets[N];
-
- Flajolet_Martin_counter()
- {
- clear();
- }
-
- void clear()
- {
- for (unsigned i = 0; i<N; ++i)
- buckets[i] = 0;
- }
-
- void add(uint64_t hash)
- {
- buckets[hash & MASK] |= 1LL << ctz(HIGH | (hash >> PartitionBits));
- }
-
- double get()
- {
- int n = 0;
- for (unsigned i = 0; i<N; ++i)
- n += ctz(~buckets[i]);
- return double(N) / PHI * pow(2, double(n) / N);
- }
-
-};
-
-template<unsigned PartitionBits> const double Flajolet_Martin_counter<PartitionBits>::PHI = 0.77351;
-
-struct Exact_counter
-{
- Exact_counter():
- seeds(shape_to-shape_from)
- {}
- void operator()(Hashed_seed seed, size_t pos, unsigned shape_id)
- {
- seeds[shape_id - shape_from][seed.partition()].insert(seed);
- }
- void finish()
- {}
- vector<Array<std::set<uint64_t>, Hashed_seed::p> > seeds;
-};
-
-struct Approximate_counter
-{
- Approximate_counter() :
- data(shape_to - shape_from)
- {}
- void operator()(Hashed_seed seed, size_t pos, unsigned shape_id)
- {
- data[shape_id - shape_from].add(seed);
- }
- void finish()
- {}
- enum { counter_pbits = 8 };
- vector<Flajolet_Martin_counter<counter_pbits> > data;
-};
-
-vector<Array<unsigned, Hashed_seed::p> > count_exact(const Sequence_set &seqs)
-{
- vector<Exact_counter> counters(config.threads_);
- seqs.enum_seeds(counters);
- vector<Array<unsigned, Hashed_seed::p> > out(shape_to - shape_from);
- memset(out.data(), 0, (shape_to - shape_from)*Hashed_seed::p*sizeof(unsigned));
- for (unsigned s = 0; s < shape_to - shape_from; ++s)
- for (unsigned p = 0; p < Hashed_seed::p; ++p)
- for (unsigned t = 0; t < config.threads_; ++t)
- out[s][p] += (unsigned)counters[t].seeds[s][p].size();
- return out;
-}
-
-vector<size_t> count_approximate(const Sequence_set &seqs)
-{
- vector<Approximate_counter> counters(config.threads_);
- seqs.enum_seeds(counters);
- vector<size_t> out(shape_to - shape_from);
- for (unsigned s = 0; s < shape_to - shape_from; ++s)
- for (unsigned t = 0; t < config.threads_; ++t)
- out[s] += (size_t)counters[t].data[s].get();
- return out;
-}
\ No newline at end of file
diff --git a/src/data/frequent_seeds.cpp b/src/data/frequent_seeds.cpp
index 31c6831..dbc2eea 100644
--- a/src/data/frequent_seeds.cpp
+++ b/src/data/frequent_seeds.cpp
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#include <numeric>
@@ -74,7 +74,7 @@ struct Frequent_seeds::Build_context
}
const size_t ht_size = std::max((size_t)(buf.size() * hash_table_factor), buf.size() + 1);
- PHash_set hash_set(ht_size);
+ PHash_set<void, murmur_hash> hash_set(ht_size);
for (vector<uint32_t>::const_iterator i = buf.begin(); i != buf.end(); ++i)
hash_set.insert(*i);
diff --git a/src/data/frequent_seeds.h b/src/data/frequent_seeds.h
index f9de6d4..4af435d 100644
--- a/src/data/frequent_seeds.h
+++ b/src/data/frequent_seeds.h
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef FREQUENT_SEEDS_H_
@@ -31,7 +31,8 @@ struct Frequent_seeds
bool get(const Letter *pos, unsigned sid) const
{
Packed_seed seed;
- if (!shapes[sid].set_seed(seed, pos))
+ const bool t = config.algo == Config::double_indexed ? shapes[sid].set_seed(seed, pos) : shapes[sid].set_seed_shifted(seed, pos); // double_indexed builds the seed with set_seed(); every other algo uses set_seed_shifted()
+ if (!t) // seed could not be built: answer true without consulting the hash tables
return true;
return tables_[sid][seed_partition(seed)].contains(seed_partition_offset(seed));
}
@@ -44,7 +45,7 @@ private:
static void compute_sd(Atomic<unsigned> *seedp, const sorted_list *ref_idx, const sorted_list *query_idx, vector<Sd> *ref_out, vector<Sd> *query_out);
- PHash_set tables_[Const::max_shapes][Const::seedp];
+ PHash_set<void,murmur_hash> tables_[Const::max_shapes][Const::seedp];
};
diff --git a/src/data/index.cpp b/src/data/index.cpp
deleted file mode 100644
index 22aecb0..0000000
--- a/src/data/index.cpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
-
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-****/
-
-#include <numeric>
-#include "index.h"
-#include "queries.h"
-#include "../util/log_stream.h"
-
-const double Seed_index::hash_table_factor = 1.3;
-Seed_index seed_index[Const::max_shapes];
-
-Seed_index::Seed_index(const Partitioned_histogram &hst, const Sequence_set &seqs, unsigned sid):
- list_buffer(sorted_list::alloc_buffer(hst)),
- list(list_buffer.get(), seqs, shapes[sid], hst.get(sid), seedp_range::all(), hst.partition())
-{
- task_timer timer("Counting seeds", 3);
- Thread_pool threads;
- vector<size_t> counts(Const::seedp);
- Atomic<unsigned> seedp(0);
- for (unsigned i = 0; i < config.threads_; ++i)
- threads.push_back(launch_thread(count_seeds, &seedp, &counts, &list));
- threads.join_all();
-
- timer.go("Allocating hash tables");
- for (unsigned p = 0; p < Hashed_seed::p; ++p)
- assign_ptr(tables[p], new PHash_table<Entry>(counts[p], hash_table_factor));
-
- timer.go("Filling hash tables");
- seedp = 0;
- for (unsigned i = 0; i < config.threads_; ++i)
- threads.push_back(launch_thread(fill_tables, &seedp, this));
- threads.join_all();
-}
-
-void Seed_index::count_seeds(Atomic<unsigned> *seedp, vector<size_t> *counts, sorted_list *list)
-{
- unsigned p;
- while ((p = (*seedp)++) < Const::seedp) {
- sorted_list::const_iterator i = list->get_partition_cbegin(p);
- size_t n = 0;
- while (!i.at_end()) {
- ++n;
- ++i;
- }
- (*counts)[p] = n;
- }
-}
-
-void Seed_index::fill_tables(Atomic<unsigned> *seedp, Seed_index *idx)
-{
- unsigned p;
- while ((p = (*seedp)++) < Const::seedp) {
- sorted_list::const_iterator i = idx->list.get_partition_cbegin(p);
- while (!i.at_end()) {
- PHash_table<Entry>::entry *e = idx->tables[p].insert(murmur_hash()(i.key()));
- e->value.n = (unsigned)idx->list.iterator_offset(i, p);
- ++i;
- }
- }
-}
-
-void build_index(const Sequence_set &seqs)
-{
- task_timer timer("Building histograms", 3);
- const pair<size_t, size_t> len_bounds = seqs.len_bounds(shapes[0].length_);
- const Partitioned_histogram hst(seqs, (unsigned)len_bounds.second);
- timer.finish();
-
- for (unsigned i = 0; i < shapes.count();++i)
- assign_ptr(seed_index[i], new Seed_index(hst, seqs, i));
-}
\ No newline at end of file
diff --git a/src/data/index.h b/src/data/index.h
index 9d174ac..dfb7036 100644
--- a/src/data/index.h
+++ b/src/data/index.h
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef INDEX_H_
diff --git a/src/data/load_seqs.h b/src/data/load_seqs.h
index 124342c..a203779 100644
--- a/src/data/load_seqs.h
+++ b/src/data/load_seqs.h
@@ -1,21 +1,19 @@
/****
-Copyright (c) 2014, University of Tuebingen
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-****
-Author: Benjamin Buchfink
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef LOAD_SEQS_H_
@@ -26,14 +24,14 @@ Author: Benjamin Buchfink
#include "../basic/translate.h"
#include "../util/seq_file_format.h"
-inline size_t push_seq(Sequence_set &ss, Sequence_set& source_seqs, const vector<Letter> &seq)
+inline size_t push_seq(Sequence_set &ss, Sequence_set** source_seqs, const vector<Letter> &seq)
{
if (config.command == Config::blastp || config.command == Config::makedb || config.command == Config::random_seqs) {
ss.push_back(seq);
return seq.size();
}
else {
- source_seqs.push_back(seq);
+ (*source_seqs)->push_back(seq);
if (seq.size() < 2) {
for (unsigned j = 0; j<6; ++j)
ss.fill(0, value_traits.mask_char);
@@ -54,33 +52,40 @@ inline size_t push_seq(Sequence_set &ss, Sequence_set& source_seqs, const vector
}
inline size_t load_seqs(Input_stream &file,
- const Sequence_file_format &format,
- Sequence_set** seqs,
- String_set<0>*& ids,
- Sequence_set*& source_seqs,
- size_t max_letters)
+ const Sequence_file_format &format, // provides get_seq(), which yields one (id, sequence) record per call
+ Sequence_set** seqs, // out: *seqs receives a newly allocated Sequence_set
+ String_set<0>*& ids, // out: receives a newly allocated String_set<0>
+ Sequence_set** source_seqs, // out, may be null: when non-null, *source_seqs receives a newly allocated Sequence_set
+ size_t max_letters, // reading stops once the summed push_seq() results reach this value
+ const string &filter) // when non-empty, only records whose id contains this substring are kept
{
- *seqs = new Sequence_set ();
- ids = new String_set<0> ();
- source_seqs = new Sequence_set ();
+ *seqs = new Sequence_set();
+ ids = new String_set<0>();
+ if(source_seqs) // the source-sequence set is optional; a null pointer skips it here and below
+ *source_seqs = new Sequence_set();
size_t letters = 0, n = 0;
vector<Letter> seq;
vector<char> id;
-
+ string id2; // scratch string that receives a copy of the id for the substring search
+
while (letters < max_letters && format.get_seq(id, seq, file)) {
- ids->push_back(id);
- letters += push_seq(**seqs, *source_seqs, seq);
- ++n;
- if ((*seqs)->get_length() > (size_t)std::numeric_limits<int>::max())
- throw std::runtime_error("Number of sequences in file exceeds supported maximum.");
+ if (seq.size() > 0 && (filter.empty() || id2.assign(id.data(), id.data() + id.size()).find(filter, 0) != string::npos)) { // skip empty sequences and ids that do not contain the filter
+ ids->push_back(id);
+ letters += push_seq(**seqs, source_seqs, seq); // NOTE(review): push_seq() dereferences source_seqs unless the command is blastp/makedb/random_seqs - confirm null is only passed for those
+ ++n; // counts kept records only
+ if ((*seqs)->get_length() >(size_t)std::numeric_limits<int>::max())
+ throw std::runtime_error("Number of sequences in file exceeds supported maximum.");
+ }
}
ids->finish_reserve();
(*seqs)->finish_reserve();
- source_seqs->finish_reserve();
- if(n == 0) {
+ if(source_seqs)
+ (*source_seqs)->finish_reserve();
+ if (n == 0) { // nothing kept: the sets are freed here, but the caller's pointers are not reset
delete *seqs;
delete ids;
- delete source_seqs;
+ if(source_seqs)
+ delete *source_seqs;
}
return n;
}
diff --git a/src/data/queries.cpp b/src/data/queries.cpp
index 2ebc115..1f4de01 100644
--- a/src/data/queries.cpp
+++ b/src/data/queries.cpp
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#include "queries.h"
@@ -24,6 +24,8 @@ Sequence_set* query_seqs::data_ = 0;
String_set<0>* query_ids::data_ = 0;
Partitioned_histogram query_hst;
vector<bool> query_aligned;
+Seed_set *query_seeds = 0;
+Hashed_seed_set *query_seeds_hashed = 0;
void write_unaligned(Output_stream *file)
{
diff --git a/src/data/queries.h b/src/data/queries.h
index a59841a..0be1d33 100644
--- a/src/data/queries.h
+++ b/src/data/queries.h
@@ -1,21 +1,19 @@
/****
-Copyright (c) 2014, University of Tuebingen
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-****
-Author: Benjamin Buchfink
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef QUERIES_H_
@@ -26,6 +24,7 @@ Author: Benjamin Buchfink
#include "sorted_list.h"
#include "../basic/statistics.h"
#include "sequence_set.h"
+#include "seed_set.h"
extern Partitioned_histogram query_hst;
extern unsigned current_query_chunk;
@@ -60,4 +59,7 @@ inline unsigned get_source_query_len(unsigned query_id)
return align_mode.query_translated ? (unsigned)query_seqs::get().reverse_translated_len(query_id*align_mode.query_contexts) : (unsigned)query_seqs::get().length(query_id);
}
+extern Seed_set *query_seeds;
+extern Hashed_seed_set *query_seeds_hashed;
+
#endif /* QUERIES_H_ */
diff --git a/src/data/reference.cpp b/src/data/reference.cpp
index 921b6a2..c86caef 100644
--- a/src/data/reference.cpp
+++ b/src/data/reference.cpp
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#include <limits>
@@ -26,6 +26,7 @@ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR P
#include "load_seqs.h"
#include "../util/seq_file_format.h"
#include "../util/log_stream.h"
+#include "../basic/masking.h"
String_set<0>* ref_ids::data_ = 0;
Ref_map ref_map;
@@ -50,71 +51,79 @@ struct Pos_record
uint32_t seq_len;
};
+void push_seq(const sequence &seq, const sequence &id, uint64_t &offset, vector<Pos_record> &pos_array, Output_stream &out, size_t &letters, size_t &n_seqs) // Writes one sequence record to out and advances the running offset/letter/sequence counters.
+{
+ pos_array.push_back(Pos_record(offset, seq.length())); // record where this entry starts and how long its sequence is
+ out.write("\xff", 1); // 0xff byte written before the sequence data
+ out.write(seq.data(), seq.length());
+ out.write("\xff", 1); // 0xff byte written after the sequence data
+ out.write(id.data(), id.length() + 1); // one byte beyond the id text is written too (presumably its NUL terminator - TODO confirm)
+ letters += seq.length();
+ ++n_seqs;
+ offset += seq.length() + id.length() + 3; // 3 = the two 0xff bytes plus the extra byte written after the id
+}
+
void make_db()
{
message_stream << "Database file: " << config.input_ref_file << endl;
Timer total;
total.start();
+ if (config.input_ref_file == "") // an empty --in value is not treated as an error here; only a notice is printed
+ std::cerr << "Input file parameter (--in) is missing. Input will be read from stdin." << endl;
task_timer timer("Opening the database file", true);
auto_ptr<Input_stream> db_file (Compressed_istream::auto_detect(config.input_ref_file));
Output_stream out(config.database);
- out.typed_write(&ref_header, 1);
+ out.write(&ref_header, 1); // header is written first and written again at position 0 once the totals are known
- size_t letters = 0, n = 0;
+ size_t letters = 0, n = 0, n_seqs = 0; // n = sequences in the current batch, n_seqs = running total over all batches
uint64_t offset = sizeof(ref_header);
-
- vector<Letter> seq;
- vector<char> id;
+ Sequence_set *seqs; // assigned by load_seqs() for every batch
+ String_set<0> *ids; // assigned by load_seqs() for every batch
+ const FASTA_format format;
vector<Pos_record> pos_array;
- FASTA_format format;
try {
-
- while (format.get_seq(id, seq, *db_file)) {
- if (seq.size() == 0)
- throw std::runtime_error("File format error: sequence of length 0 at line " + to_string(db_file->line_count));
- if (n % 100000llu == 0llu) {
- std::stringstream ss;
- ss << "Loading sequence data (" << n << " sequences processed)";
- timer.go(ss.str().c_str());
+ while ((timer.go("Loading sequences"), n = load_seqs(*db_file, format, &seqs, ids, 0, (size_t)(1e9), string())) > 0) { // load batches (max_letters 1e9, null source_seqs, empty filter) until load_seqs() returns 0
+ if (config.masking == 1) {
+ timer.go("Masking sequences");
+ mask_seqs(*seqs, Masking::get(), false);
+ }
+ timer.go("Writing sequences");
+ for (size_t i = 0; i < n; ++i) {
+ sequence seq = (*seqs)[i];
+ if (seq.length() == 0) // NOTE(review): load_seqs() in this patch already drops records with seq.size() == 0, so this may be unreachable - confirm
+ throw std::runtime_error("File format error: sequence of length 0 at line " + to_string(db_file->line_count));
+ push_seq(seq, (*ids)[i], offset, pos_array, out, letters, n_seqs);
}
- pos_array.push_back(Pos_record(offset, seq.size()));
- out.write("\xff", 1);
- out.write(seq, false);
- out.write("\xff", 1);
- out.write(id, false);
- out.write("\0", 1);
- letters += seq.size();
- ++n;
- offset += seq.size() + id.size() + 3;
+ delete seqs; // release this batch before the next load; NOTE(review): not reached when the throw above fires
+ delete ids;
}
-
}
- catch (std::exception &e) {
+ catch (std::exception&) { // the exception object itself is not needed; it is rethrown as-is below
out.close();
out.remove();
- throw e;
+ throw; // rethrow the in-flight exception after the partial output file was closed and removed
}
-
+
timer.go("Writing trailer");
ref_header.pos_array_offset = offset;
pos_array.push_back(Pos_record(offset, 0));
- out.write(pos_array, false);
+ out.write(pos_array);
timer.go("Closing the input file");
db_file->close();
timer.go("Closing the database file");
ref_header.letters = letters;
- ref_header.sequences = n;
+ ref_header.sequences = n_seqs; // total over all batches; n only holds the last load_seqs() result
out.seekp(0);
- out.typed_write(&ref_header, 1);
+ out.write(&ref_header, 1); // overwrite the header at position 0 now that the totals are filled in
out.close();
timer.finish();
- message_stream << "Processed " << n << " sequences, " << letters << " letters." << endl;
+ message_stream << "Processed " << n_seqs << " sequences, " << letters << " letters." << endl;
message_stream << "Total time = " << total.getElapsedTimeInSec() << "s" << endl;
}
@@ -154,11 +163,21 @@ bool Database_file::load_seqs()
ref_seqs::data_->finish_reserve();
ref_ids::data_->finish_reserve();
seek(start_offset);
+ size_t masked = 0;
for (size_t n = 0; n < seqs; ++n) {
read(ref_seqs::data_->ptr(n) - 1, ref_seqs::data_->length(n) + 2);
read(ref_ids::data_->ptr(n), ref_ids::data_->length(n) + 1);
+ if (config.masking == 1)
+ Masking::get().bit_to_hard_mask(ref_seqs::data_->ptr(n), ref_seqs::data_->length(n), masked);
+ else
+ Masking::get().remove_bit_mask(ref_seqs::data_->ptr(n), ref_seqs::data_->length(n));
+ if (!config.sfilt.empty() && strstr(ref_ids::get()[n].c_str(), config.sfilt.c_str()) == 0)
+ memset(ref_seqs::data_->ptr(n), value_traits.mask_char, ref_seqs::data_->length(n));
}
+ timer.finish();
+ ref_seqs::get().print_stats();
+ log_stream << "Masked letters = " << masked << endl;
blocked_processing = seqs < ref_header.sequences;
return true;
diff --git a/src/data/reference.h b/src/data/reference.h
index 138cc15..7eb8bb5 100644
--- a/src/data/reference.h
+++ b/src/data/reference.h
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2014-2016, University of Tuebingen, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef REFERENCE_H_
@@ -39,7 +39,7 @@ struct invalid_database_version_exception : public std::exception
{
virtual const char* what() const throw()
{
- return "Incompatible database version";
+ return "Database was built with a different version of diamond and is incompatible."; // human-readable reason handed to whoever catches this exception
}
};
@@ -58,7 +58,7 @@ struct Reference_header
#ifdef EXTRA
Sequence_type sequence_type;
#endif
- enum { current_db_version = 0 };
+ enum { current_db_version = 1 };
};
extern Reference_header ref_header;
diff --git a/src/data/seed_histogram.cpp b/src/data/seed_histogram.cpp
index 22c6b97..f06391a 100644
--- a/src/data/seed_histogram.cpp
+++ b/src/data/seed_histogram.cpp
@@ -1,21 +1,34 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#include "seed_histogram.h"
-seedp_range current_range;
\ No newline at end of file
+seedp_range current_range;
+
+Partitioned_histogram::Partitioned_histogram()
+{ }
+
+size_t Partitioned_histogram::max_chunk_size() const
+{
+ size_t max = 0;
+ ::partition<unsigned> p(Const::seedp, config.lowmem);
+ for (unsigned shape = 0; shape < shapes.count(); ++shape)
+ for (unsigned chunk = 0; chunk < p.parts; ++chunk)
+ max = std::max(max, hst_size(data_[shape], seedp_range(p.getMin(chunk), p.getMax(chunk))));
+ return max;
+}
\ No newline at end of file
diff --git a/src/data/seed_histogram.h b/src/data/seed_histogram.h
index 0b47acd..692239f 100644
--- a/src/data/seed_histogram.h
+++ b/src/data/seed_histogram.h
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2014-16, University of Tuebingen, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef SEED_HISTOGRAM_H_
@@ -25,6 +25,7 @@ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR P
#include "../basic/shape_config.h"
#include "../util/thread.h"
#include "../basic/seed_iterator.h"
+#include "seed_set.h"
using std::vector;
@@ -83,33 +84,31 @@ inline size_t hst_size(const shape_histogram &hst, const seedp_range &range)
struct Partitioned_histogram
{
- Partitioned_histogram()
- { }
-
- Partitioned_histogram(const Sequence_set &seqs, unsigned longest):
+ Partitioned_histogram();
+
+ template<typename _filter>
+ Partitioned_histogram(const Sequence_set &seqs, bool serial, const _filter *filter) :
data_(shapes.count()),
- p_(seqs.partition(config.threads_*4))
+ p_(seqs.partition(config.threads_))
{
for (unsigned s = 0; s < shapes.count(); ++s) {
data_[s].resize(p_.size() - 1);
memset(data_[s].data(), 0, (p_.size() - 1)*sizeof(unsigned)*Const::seedp);
}
- Build_context context (seqs, *this, longest);
- launch_scheduled_thread_pool(context, (unsigned)(p_.size() - 1), (unsigned)(p_.size() - 1));
+ Ptr_vector<Callback> cb;
+ for (size_t i = 0; i < p_.size() - 1; ++i)
+ cb.push_back(new Callback(i, data_));
+ if (serial)
+ for (unsigned s = 0; s < shapes.count(); ++s)
+ seqs.enum_seeds(cb, p_, s, s + 1, filter);
+ else
+ seqs.enum_seeds(cb, p_, 0, shapes.count(), filter);
}
const shape_histogram& get(unsigned sid) const
{ return data_[sid]; }
- size_t max_chunk_size() const
- {
- size_t max = 0;
- ::partition<unsigned> p(Const::seedp, config.lowmem);
- for (unsigned shape = 0; shape < shapes.count(); ++shape)
- for (unsigned chunk = 0; chunk < p.parts; ++chunk)
- max = std::max(max, hst_size(data_[shape], seedp_range(p.getMin(chunk), p.getMax(chunk))));
- return max;
- }
+ size_t max_chunk_size() const;
const vector<size_t>& partition() const
{
@@ -118,47 +117,23 @@ struct Partitioned_histogram
private:
- struct Build_context
+ struct Callback
{
- Build_context(const Sequence_set &seqs, Partitioned_histogram &hst, unsigned longest):
- seqs (seqs),
- hst (hst),
- longest(longest)
- { }
- void operator()(unsigned thread_id, unsigned seqp) const
+ Callback(size_t seqp, vector<shape_histogram> &data)
{
- vector<char> buf(longest);
- hst.build_seq_partition(seqs, seqp, hst.p_[seqp], hst.p_[seqp+1], buf);
+ for (unsigned s = 0; s < shapes.count(); ++s)
+ ptr.push_back(data[s][seqp].begin());
}
- const Sequence_set &seqs;
- Partitioned_histogram &hst;
- unsigned longest;
- };
-
- void build_seq_partition(const Sequence_set &seqs,
- const unsigned seqp,
- const size_t begin,
- const size_t end,
- vector<char> &buf)
- {
- for (size_t i = begin; i < end; ++i) {
-
- assert(i < seqs.get_length());
- if (seqs[i].length() == 0)
- continue;
- Reduction::reduce_seq(seqs[i], buf);
-
- for (unsigned s = 0; s < shapes.count(); ++s) {
- Seed_iterator it(buf, shapes[s]);
- unsigned *ptr = data_[s][seqp].begin();
- uint64_t seed;
- while (it.good())
- if (it.get(seed, shapes[s]))
- ++ptr[seed_partition(seed)];
- }
-
+ bool operator()(uint64_t seed, uint64_t pos, size_t shape)
+ {
+ ++ptr[shape][seed_partition(seed)];
+ return true;
}
- }
+ void finish() const
+ {
+ }
+ vector<unsigned*> ptr;
+ };
vector<shape_histogram> data_;
vector<size_t> p_;
diff --git a/src/data/seed_set.cpp b/src/data/seed_set.cpp
new file mode 100644
index 0000000..7167ba9
--- /dev/null
+++ b/src/data/seed_set.cpp
@@ -0,0 +1,82 @@
+/****
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+****/
+
+#include "seed_set.h"
+#include "../util/ptr_vector.h"
+
+No_filter no_filter;
+
+struct Seed_set_callback
+{
+ Seed_set_callback(vector<bool> &data, size_t max_coverage):
+ coverage(0),
+ max_coverage(max_coverage),
+ data(&data)
+ {}
+ bool operator()(uint64_t seed, uint64_t pos, uint64_t shape)
+ {
+ if ((*data)[seed] == false) {
+ (*data)[seed] = true;
+ ++coverage;
+ if (coverage > max_coverage)
+ return false;
+ }
+ return true;
+ }
+ void finish()
+ {}
+ size_t coverage, max_coverage;
+ vector<bool> *data;
+};
+
+Seed_set::Seed_set(const Sequence_set &seqs, double max_coverage):
+ data_((size_t)pow(1llu<<Reduction::reduction.bit_size(), shapes[0].length_))
+{
+ if (!shapes[0].contiguous())
+ throw std::runtime_error("Contiguous seed required.");
+ Ptr_vector<Seed_set_callback> v;
+ v.push_back(new Seed_set_callback(data_, size_t(max_coverage*pow(Reduction::reduction.size(), shapes[0].length_))));
+ seqs.enum_seeds(v, seqs.partition(1), 0, 1, &no_filter);
+ coverage_ = (double)v.back()->coverage / pow(Reduction::reduction.size(), shapes[0].length_);
+}
+
+struct Hashed_seed_set_callback
+{
+ Hashed_seed_set_callback(Ptr_vector<PHash_set<Modulo2, No_hash> > &dst):
+ dst(dst)
+ {}
+ bool operator()(uint64_t seed, uint64_t pos, uint64_t shape)
+ {
+ dst[shape].insert(seed);
+ return true;
+ }
+ void finish()
+ {}
+ Ptr_vector<PHash_set<Modulo2, No_hash> > &dst;
+};
+
+Hashed_seed_set::Hashed_seed_set(const Sequence_set &seqs)
+{
+ for (size_t i = 0; i < shapes.count(); ++i)
+ data_.push_back(new PHash_set<Modulo2, No_hash>(next_power_of_2(seqs.letters()*1.25)));
+ Ptr_vector<Hashed_seed_set_callback> v;
+ v.push_back(new Hashed_seed_set_callback(data_));
+ seqs.enum_seeds(v, seqs.partition(1), 0, shapes.count(), &no_filter);
+ for (size_t i = 0; i < shapes.count(); ++i)
+ log_stream << "Shape=" << i << " Hash_table_size=" << data_[i].size() << " load=" << data_[i].load() << endl;
+}
\ No newline at end of file
diff --git a/src/data/seed_set.h b/src/data/seed_set.h
new file mode 100644
index 0000000..90cc40d
--- /dev/null
+++ b/src/data/seed_set.h
@@ -0,0 +1,52 @@
+/****
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+****/
+
+#ifndef SEED_SET_H_
+#define SEED_SET_H_
+
+#include "sequence_set.h"
+#include "../util/hash_table.h"
+
+struct Seed_set
+{
+ Seed_set(const Sequence_set &seqs, double max_coverage);
+ bool contains(uint64_t key, uint64_t shape) const
+ {
+ return data_[key];
+ }
+ double coverage() const
+ {
+ return coverage_;
+ }
+private:
+ vector<bool> data_;
+ double coverage_;
+};
+
+struct Hashed_seed_set
+{
+ Hashed_seed_set(const Sequence_set &seqs);
+ bool contains(uint64_t key, uint64_t shape) const
+ {
+ return data_[shape].contains(key);
+ }
+private:
+ Ptr_vector<PHash_set<Modulo2, No_hash> > data_;
+};
+
+#endif
\ No newline at end of file
diff --git a/src/data/sequence_set.h b/src/data/sequence_set.h
index 5f99f4c..7576a6d 100644
--- a/src/data/sequence_set.h
+++ b/src/data/sequence_set.h
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2014-2016, University of Tuebingen, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef SEQUENCE_SET_H_
@@ -27,6 +27,8 @@ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR P
#include "string_set.h"
#include "../util/thread.h"
#include "../basic/shape_config.h"
+#include "../basic/seed_iterator.h"
+#include "../util/ptr_vector.h"
using std::cout;
using std::endl;
@@ -37,11 +39,7 @@ struct Sequence_set : public String_set<'\xff', 1>
Sequence_set()
{ }
-
- Sequence_set(Input_stream &file) :
- String_set<'\xff',1>(file)
- { }
-
+
void print_stats() const
{
verbose_stream << "Sequences = " << this->get_length() << ", letters = " << this->letters() << ", average length = " << this->avg_len() << endl;
@@ -58,6 +56,14 @@ struct Sequence_set : public String_set<'\xff', 1>
return pair<size_t, size_t>(min, max);
}
+ size_t max_len(size_t begin, size_t end) const
+ {
+ size_t max = 0;
+ for (size_t i = begin; i < end; ++i)
+ max = std::max(this->length(i), max);
+ return max;
+ }
+
sequence window_infix(size_t offset, unsigned &left) const
{
const Letter* begin(this->data(offset));
@@ -123,122 +129,149 @@ struct Sequence_set : public String_set<'\xff', 1>
return this->letters() / this->get_length();
}
- template <typename _f>
- void enum_seeds(vector<_f> &f) const
+ template <typename _f, typename _filter>
+ void enum_seeds(Ptr_vector<_f> &f, const vector<size_t> &p, size_t shape_begin, size_t shape_end, const _filter *filter) const
{
- const vector<size_t> p = this->partition((unsigned)f.size());
Thread_pool threads;
for (unsigned i = 0; i < f.size(); ++i)
- threads.push_back(launch_thread(enum_seeds_worker<_f>, &f[i], this, (unsigned)p[i], (unsigned)p[i + 1]));
+ threads.push_back(launch_thread(enum_seeds_worker<_f, _filter>, &f[i], this, (unsigned)p[i], (unsigned)p[i + 1], std::make_pair(shape_begin, shape_end), filter));
threads.join_all();
}
- template<typename _f, typename _entry>
- void enum_seeds_partitioned() const
- {
- vector<Enum_partitioned_callback<_f, _entry> > v;
- v.reserve(config.threads_);
- ::partition<unsigned> p(Hashed_seed::p, config.threads_);
- for (unsigned i = 0; i < config.threads_; ++i) {
- Enum_partitioned_callback<_f, _entry> * tmp = new Enum_partitioned_callback<_f, _entry>(p.getMin(i), p.getMax(i));
- v.push_back(*tmp);
- delete tmp;
- }
- enum_seeds(v);
- /*for (unsigned i = 0; i < config.threads_; ++i)
- v[i].flush_queues();*/
- }
-
virtual ~Sequence_set()
{ }
private:
- template<typename _f>
- static void enum_seeds_worker(_f *f, const Sequence_set *seqs, unsigned begin, unsigned end)
+ template<typename _f, typename _filter>
+ void enum_seeds(_f *f, unsigned begin, unsigned end, pair<size_t, size_t> shape_range, const _filter *filter) const
{
+ vector<char> buf(max_len(begin, end));
uint64_t key;
for (unsigned i = begin; i < end; ++i) {
- const sequence seq = (*seqs)[i];
- for (unsigned shape_id = shape_from; shape_id < shape_to; ++shape_id) {
+ const sequence seq = (*this)[i];
+ Reduction::reduce_seq(seq, buf);
+ for (size_t shape_id = shape_range.first; shape_id < shape_range.second; ++shape_id) {
const shape& sh = shapes[shape_id];
if (seq.length() < sh.length_) continue;
- for (unsigned j = 0; j < seq.length() - sh.length_ + 1; ++j) {
- if (sh.set_seed(key, &seq[j]))
- (*f)(Hashed_seed(key), seqs->position(i, j), shape_id);
+ Seed_iterator it(buf, sh);
+ size_t j = 0;
+ while (it.good()) {
+ if (it.get(key, sh))
+ if (filter->contains(key, shape_id))
+ (*f)(key, position(i, j), shape_id);
+ ++j;
}
}
}
f->finish();
}
- template<typename _f, typename _entry>
- struct Enum_partitioned_callback
+ template<typename _f, uint64_t _b, typename _filter>
+ void enum_seeds_hashed(_f *f, unsigned begin, unsigned end, pair<size_t, size_t> shape_range, const _filter *filter) const
{
- Enum_partitioned_callback(unsigned p_begin, unsigned p_end) :
- p_begin(p_begin),
- p_end(p_end)
- {
- memset(counts, 0, sizeof(counts));
- }
- void operator()(Hashed_seed seed, size_t pos, unsigned shape_id)
- {
- const unsigned p = seed.partition();
- buffers[shape_id][p][counts[shape_id][p]++] = _entry(seed, pos);
- if (counts[shape_id][p] == buffer_size)
- flush_buffer(shape_id, p);
- if ((pos & flush_mask) == 0 && shape_id == shape_from)
- flush_queues();
- }
- void flush_buffer(unsigned shape_id, unsigned p)
- {
- mtx[shape_id][p].lock();
- vector<_entry> &q = queues[shape_id][p];
- unsigned &count = counts[shape_id][p];
- const size_t s = q.size();
- q.resize(s + sizeof(_entry)*count);
- memcpy(q.data() + s, buffers[shape_id][p].begin(), sizeof(_entry)*count);
- mtx[shape_id][p].unlock();
- count = 0;
+ uint64_t key;
+ for (unsigned i = begin; i < end; ++i) {
+ const sequence seq = (*this)[i];
+ for (size_t shape_id = shape_range.first; shape_id < shape_range.second; ++shape_id) {
+ const shape& sh = shapes[shape_id];
+ if (seq.length() < sh.length_) continue;
+ const uint64_t shape_mask = sh.long_mask();
+ Hashed_seed_iterator<_b> it(seq, sh);
+ size_t j = 0;
+ while (it.good()) {
+ if (it.get(key, shape_mask))
+ if (filter->contains(key, shape_id))
+ (*f)(key, position(i, j), shape_id);
+ ++j;
+ }
+ }
}
- void finish()
- {
- for (unsigned shape_id = shape_from; shape_id < shape_to; ++shape_id)
- for (unsigned p = 0; p < Hashed_seed::p; ++p)
- flush_buffer(shape_id, p);
+ f->finish();
+ }
+
+ template<typename _f, typename _it, typename _filter>
+ void enum_seeds_contiguous(_f *f, unsigned begin, unsigned end, const _filter *filter) const
+ {
+ uint64_t key;
+ for (unsigned i = begin; i < end; ++i) {
+ const sequence seq = (*this)[i];
+ if (seq.length() < _it::length()) continue;
+ _it it(seq);
+ size_t j = 0;
+ while (it.good()) {
+ if (it.get(key))
+ if (filter->contains(key, 0))
+ if ((*f)(key, position(i, j), 0) == false)
+ return;
+ ++j;
+ }
}
- void flush_queues()
- {
- for (unsigned shape_id = shape_from; shape_id < shape_to; ++shape_id)
- for (unsigned p = p_begin; p < p_end; ++p) {
- mtx[shape_id][p].lock();
- vector<_entry> &q = queues[shape_id][p];
- const size_t size = q.size();
- if (size == 0) {
- mtx[shape_id][p].unlock();
- continue;
- }
- out_buf.resize(size);
- memcpy(out_buf.data(), q.data(), size * sizeof(_entry));
- q.clear();
- mtx[shape_id][p].unlock();
- for (typename vector<_entry>::const_iterator i = out_buf.begin(); i != out_buf.end(); ++i)
- _f()(shape_id, *i);
+ f->finish();
+ }
+
+ template<typename _f, typename _filter>
+ static void enum_seeds_worker(_f *f, const Sequence_set *seqs, unsigned begin, unsigned end, pair<size_t,size_t> shape_range, const _filter *filter)
+ {
+ static const char *errmsg = "Unsupported contiguous seed.";
+ if (shape_range.second - shape_range.first == 1 && shapes[shape_range.first].contiguous()) {
+ const uint64_t b = Reduction::reduction.bit_size(), l = shapes[shape_range.first].length_;
+ switch (l) {
+ case 7:
+ switch (b) {
+ case 4:
+ seqs->enum_seeds_contiguous<_f, Contiguous_seed_iterator<7, 4>,_filter>(f, begin, end, filter);
+ break;
+ default:
+ throw std::runtime_error(errmsg);
+ }
+ break;
+ case 6:
+ switch (b) {
+ case 4:
+ seqs->enum_seeds_contiguous<_f, Contiguous_seed_iterator<6, 4>, _filter>(f, begin, end, filter);
+ break;
+ default:
+ throw std::runtime_error(errmsg);
}
+ break;
+ case 5:
+ switch (b) {
+ case 4:
+ seqs->enum_seeds_contiguous<_f, Contiguous_seed_iterator<5, 4>, _filter>(f, begin, end, filter);
+ break;
+ default:
+ throw std::runtime_error(errmsg);
+ }
+ break;
+ default:
+ throw std::runtime_error(errmsg);
+ }
+ }
+ else if (config.hashed_seeds) {
+ const uint64_t b = Reduction::reduction.bit_size();
+ switch (b) {
+ case 4:
+ seqs->enum_seeds_hashed<_f, 4, _filter>(f, begin, end, shape_range, filter);
+ break;
+ default:
+ throw std::runtime_error("Unsupported reduction.");
+ }
}
- enum { buffer_size = 16, flush_mask = 1023 };
- Array<_entry, buffer_size> buffers[Const::max_shapes][Hashed_seed::p];
- unsigned counts[Const::max_shapes][Hashed_seed::p], p_begin, p_end;
- vector<_entry> out_buf;
- static tthread::mutex mtx[Const::max_shapes][Hashed_seed::p];
- static vector<_entry> queues[Const::max_shapes][Hashed_seed::p];
- };
+ else
+ seqs->enum_seeds<_f,_filter>(f, begin, end, shape_range, filter);
+ }
};
-template<typename _f, typename _entry>
-tthread::mutex Sequence_set::Enum_partitioned_callback<_f, _entry>::mtx[Const::max_shapes][Hashed_seed::p];
-template<typename _f, typename _entry>
-vector<_entry> Sequence_set::Enum_partitioned_callback<_f, _entry>::queues[Const::max_shapes][Hashed_seed::p];
+struct No_filter
+{
+ bool contains(uint64_t seed, uint64_t shape) const
+ {
+ return true;
+ }
+};
+
+extern No_filter no_filter;
#endif /* SEQUENCE_SET_H_ */
\ No newline at end of file
diff --git a/src/data/sorted_list.cpp b/src/data/sorted_list.cpp
new file mode 100644
index 0000000..0eec23b
--- /dev/null
+++ b/src/data/sorted_list.cpp
@@ -0,0 +1,76 @@
+/****
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+****/
+
+#include "sorted_list.h"
+#include "../util/ptr_vector.h"
+#include "queries.h"
+
+char* sorted_list::alloc_buffer(const Partitioned_histogram &hst)
+{
+ return new char[sizeof(entry) * hst.max_chunk_size()];
+}
+
+sorted_list::sorted_list()
+{}
+
+sorted_list::const_iterator sorted_list::get_partition_cbegin(unsigned p) const
+{
+ return const_iterator(cptr_begin(p), cptr_end(p));
+}
+
+sorted_list::iterator sorted_list::get_partition_begin(unsigned p) const
+{
+ return iterator(ptr_begin(p), ptr_end(p));
+}
+
+sorted_list::Random_access_iterator sorted_list::random_access(unsigned p, size_t offset) const
+{
+ return Random_access_iterator(cptr_begin(p) + offset, cptr_end(p));
+}
+
+sorted_list::entry* sorted_list::ptr_begin(unsigned i) const
+{
+ return &data_[limits_[i]];
+}
+
+sorted_list::entry* sorted_list::ptr_end(unsigned i) const
+{
+ return &data_[limits_[i + 1]];
+}
+
+const sorted_list::entry* sorted_list::cptr_begin(unsigned i) const
+{
+ return &data_[limits_[i]];
+}
+
+const sorted_list::entry* sorted_list::cptr_end(unsigned i) const
+{
+ return &data_[limits_[i + 1]];
+}
+
+sorted_list::Ptr_set sorted_list::build_iterators(const shape_histogram &hst) const
+{
+ Ptr_set iterators(hst.size());
+ for (unsigned i = 0; i < Const::seedp; ++i)
+ iterators[0][i] = ptr_begin(i);
+
+ for (unsigned i = 1; i < hst.size(); ++i)
+ for (unsigned j = 0; j < Const::seedp; ++j)
+ iterators[i][j] = iterators[i - 1][j] + hst[i - 1][j];
+ return iterators;
+}
\ No newline at end of file
diff --git a/src/data/sorted_list.h b/src/data/sorted_list.h
index 665acce..8c48cee 100644
--- a/src/data/sorted_list.h
+++ b/src/data/sorted_list.h
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2014-2016, University of Tuebingen, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef SORTED_LIST_H_
@@ -47,24 +47,25 @@ struct sorted_list
_pos value;
} PACKED_ATTRIBUTE ;
- static char* alloc_buffer(const Partitioned_histogram &hst)
- { return new char[sizeof(entry) * hst.max_chunk_size()]; }
-
- sorted_list()
- {}
-
- sorted_list(char *buffer, const Sequence_set &seqs, const shape &sh, const shape_histogram &hst, const seedp_range &range, const vector<size_t> seq_partition):
- limits_ (hst, range),
- data_ (reinterpret_cast<entry*>(buffer))
+ static char* alloc_buffer(const Partitioned_histogram &hst);
+ sorted_list();
+
+ template<typename _filter>
+ sorted_list(char *buffer, const Sequence_set &seqs, size_t sh, const shape_histogram &hst, const seedp_range &range, const vector<size_t> seq_partition, const _filter *filter) :
+ limits_(hst, range),
+ data_(reinterpret_cast<entry*>(buffer))
{
- task_timer timer ("Building seed list", 3);
- Build_context build_context (seqs, sh, range, build_iterators(hst), seq_partition);
- launch_scheduled_thread_pool(build_context, unsigned(seq_partition.size() - 1), config.threads_);
+ task_timer timer("Building seed list", 3);
+ Ptr_set iterators(build_iterators(hst));
+ Ptr_vector<Build_callback> cb;
+ for (size_t i = 0; i < seq_partition.size() - 1; ++i)
+ cb.push_back(new Build_callback(range, iterators[i].begin()));
+ seqs.enum_seeds(cb, seq_partition, sh, sh + 1, filter);
timer.go("Sorting seed list");
- Sort_context sort_context (*this);
+ Sort_context sort_context(*this);
launch_scheduled_thread_pool(sort_context, Const::seedp, config.threads_);
}
-
+
template<typename _t>
struct Iterator_base
{
@@ -98,11 +99,8 @@ struct sorted_list
typedef Iterator_base<entry> iterator;
typedef Iterator_base<const entry> const_iterator;
- const_iterator get_partition_cbegin(unsigned p) const
- { return const_iterator (cptr_begin(p), cptr_end(p)); }
-
- iterator get_partition_begin(unsigned p) const
- { return iterator (ptr_begin(p), ptr_end(p)); }
+ const_iterator get_partition_cbegin(unsigned p) const;
+ iterator get_partition_begin(unsigned p) const;
struct Random_access_iterator
{
@@ -136,13 +134,12 @@ struct sorted_list
return i.i - cptr_begin(p);
}
- Random_access_iterator random_access(unsigned p, size_t offset) const
- {
- return Random_access_iterator(cptr_begin(p) + offset, cptr_end(p));
- }
+ Random_access_iterator random_access(unsigned p, size_t offset) const;
private:
+ friend struct Build_callback;
+
typedef vector<Array<entry*, Const::seedp> > Ptr_set;
struct buffered_iterator
@@ -180,69 +177,12 @@ private:
uint8_t n[Const::seedp];
};
- entry* ptr_begin(unsigned i) const
- { return &data_[limits_[i]]; }
-
- entry* ptr_end(unsigned i) const
- { return &data_[limits_[i+1]]; }
-
- const entry* cptr_begin(unsigned i) const
- { return &data_[limits_[i]]; }
+ entry* ptr_begin(unsigned i) const;
+ entry* ptr_end(unsigned i) const;
+ const entry* cptr_begin(unsigned i) const;
+ const entry* cptr_end(unsigned i) const;
- const entry* cptr_end(unsigned i) const
- { return &data_[limits_[i+1]]; }
-
- struct Build_context
- {
- Build_context(const Sequence_set &seqs, const shape &sh, const seedp_range &range, const Ptr_set &iterators, const vector<size_t> &seq_partition):
- seqs (seqs),
- sh (sh),
- range (range),
- iterators (iterators),
- seq_partition (seq_partition)
- { }
- void operator()(unsigned thread_id, unsigned seqp) const
- {
- build_seqp(seqs,
- seq_partition[seqp],
- seq_partition[seqp + 1],
- iterators[seqp].begin(),
- sh,
- range);
- }
- const Sequence_set &seqs;
- const shape &sh;
- const seedp_range &range;
- const Ptr_set iterators;
- const vector<size_t> &seq_partition;
- };
-
- static void build_seqp(const Sequence_set &seqs, size_t begin, size_t end, entry* const* ptr, const shape &sh, const seedp_range &range)
- {
- uint64_t key;
- auto_ptr<buffered_iterator> it (new buffered_iterator(ptr));
- for(size_t i=begin;i<end;++i) {
- const sequence seq = seqs[i];
- if(seq.length()<sh.length_) continue;
- for (unsigned j = 0; j < seq.length() - sh.length_ + 1; ++j) {
- if (sh.set_seed(key, &seq[j]))
- it->push(key, seqs.position(i, j), range);
- }
- }
- it->flush();
- }
-
- Ptr_set build_iterators(const shape_histogram &hst) const
- {
- Ptr_set iterators (hst.size());
- for (unsigned i = 0; i < Const::seedp; ++i)
- iterators[0][i] = ptr_begin(i);
-
- for (unsigned i = 1; i < hst.size(); ++i)
- for (unsigned j = 0; j < Const::seedp; ++j)
- iterators[i][j] = iterators[i - 1][j] + hst[i - 1][j];
- return iterators;
- }
+ Ptr_set build_iterators(const shape_histogram &hst) const;
struct Sort_context
{
@@ -271,6 +211,25 @@ private:
}
};
+ struct Build_callback
+ {
+ Build_callback(const seedp_range &range, sorted_list::entry* const* ptr) :
+ range(range),
+ it(new sorted_list::buffered_iterator(ptr))
+ { }
+ bool operator()(uint64_t seed, uint64_t pos, size_t shape)
+ {
+ it->push(seed, pos, range);
+ return true;
+ }
+ void finish()
+ {
+ it->flush();
+ }
+ seedp_range range;
+ auto_ptr<sorted_list::buffered_iterator> it;
+ };
+
Limits limits_;
entry *data_;
diff --git a/src/data/string_set.h b/src/data/string_set.h
index 5d01930..032c56d 100644
--- a/src/data/string_set.h
+++ b/src/data/string_set.h
@@ -1,21 +1,19 @@
/****
-Copyright (c) 2014, University of Tuebingen
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-****
-Author: Benjamin Buchfink
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef STRING_SET_H_
@@ -91,18 +89,6 @@ struct String_set
file.write(data_);
}
- String_set(Input_stream &file)
- {
- file.read(limits_);
- file.read(data_);
- }
-
- static void skip(Input_stream &file)
- {
- file.skip_vector<size_t>();
- file.skip_vector<_t>();
- }
-
size_t raw_len() const
{ return limits_.back(); }
diff --git a/src/data/taxonomy.cpp b/src/data/taxonomy.cpp
new file mode 100644
index 0000000..838f29f
--- /dev/null
+++ b/src/data/taxonomy.cpp
@@ -0,0 +1,66 @@
+/****
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+****/
+
+#include <stdio.h>
+#include "taxonomy.h"
+#include "../util/compressed_stream.h"
+#include "../basic/config.h"
+#include "../util/merge_sort.h"
+
+Taxonomy taxonomy;
+
+// Reduces a sequence title to its bare accession, in place, and returns `t`.
+//   "UniRef<N>_<acc>"        -> "<acc>"
+//   "gi|<n>|<db>|<acc>|..."  -> "<acc>"
+//   "<db>|<acc>|..."         -> "<acc>"
+// Titles without a "UniRef" prefix and without any '|' are left unchanged.
+string& get_accession(string &t)
+{
+    if (t.compare(0, 6, "UniRef") == 0) {
+        // Cut after the first '_' rather than after a fixed 9 characters, so that
+        // "UniRef100_" (10 characters) is stripped as cleanly as "UniRef50_"/"UniRef90_".
+        // Without any '_', find() yields npos, npos + 1 wraps to 0 and nothing is erased.
+        t.erase(0, t.find('_') + 1);
+        return t;
+    }
+
+    size_t i = t.find_first_of('|', 0);
+    if (i == string::npos)
+        return t;
+
+    if (t.compare(0, 3, "gi|") == 0) {
+        // Drop the leading "gi|<number>|" pair before looking at the database tag.
+        t.erase(0, t.find_first_of('|', i + 1) + 1);
+        i = t.find_first_of('|', 0);
+    }
+    // Drop the database tag (everything up to and including the first '|') ...
+    t.erase(0, i + 1);
+    // ... and anything that follows the accession itself.
+    i = t.find_first_of('|', 0);
+    if (i != string::npos)
+        t.erase(i);
+    return t;
+}
+
+// Reads the accession-to-taxid mapping file named by config.prot_accession2taxid
+// into accession2taxid_ and sorts the result.
+// Each data line is parsed as: one skipped token, an accession of at most
+// 15 characters, an unsigned taxon id, one skipped unsigned number.
+// Throws std::runtime_error on a malformed line or an over-long accession.
+void Taxonomy::load()
+{
+    Compressed_istream in(config.prot_accession2taxid);
+    in.getline(); // the first line is read and discarded (presumably a header row)
+
+    char accession[max_accesion_len + 2];
+    unsigned tax_id;
+    for (;;) {
+        if (in.eof())
+            break;
+        in.getline();
+        if (in.line.empty())
+            break; // an empty line ends the table
+
+        const int parsed = sscanf(in.line.c_str(), "%*s%15s%u%*u", accession, &tax_id);
+        if (parsed != 2)
+            throw std::runtime_error("Invalid taxonomy mapping file format.");
+        if (strlen(accession) > max_accesion_len)
+            throw std::runtime_error("Accession exceeds supported length.");
+
+        accession2taxid_.push_back(std::make_pair(Accession(accession), tax_id));
+    }
+    // get() performs a binary search, so the table has to be ordered.
+    merge_sort(accession2taxid_.begin(), accession2taxid_.end(), config.threads_);
+}
\ No newline at end of file
diff --git a/src/data/taxonomy.h b/src/data/taxonomy.h
new file mode 100644
index 0000000..1ce6dc5
--- /dev/null
+++ b/src/data/taxonomy.h
@@ -0,0 +1,90 @@
+/****
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+****/
+
+#include <vector>
+#include <algorithm>
+#include <string>
+#include "../basic/const.h"
+#include "../util/util.h"
+
+using std::pair;
+using std::string;
+
+string& get_accession(string &t);
+
+// Lookup table from protein accession to taxon id.
+// load() fills and sorts accession2taxid_; get() searches it.
+struct Taxonomy
+{
+ // Size of the fixed-width accession buffer below.
+ enum { max_accesion_len = 14 };
+ // Accession stored in a fixed-width char array. The array is NOT guaranteed to be
+ // NUL-terminated: an accession of exactly max_accesion_len characters fills it
+ // completely, which is why every access below is bounded by max_accesion_len.
+ struct Accession
+ {
+ // Copies at most max_accesion_len characters; strncpy zero-pads shorter input.
+ Accession(const char *s)
+ {
+ strncpy(this->s, s, max_accesion_len);
+ }
+ // Builds the accession from a sequence title via get_title() and get_accession().
+ // Throws std::runtime_error if the result is longer than max_accesion_len.
+ Accession(const string &s)
+ {
+ string t(get_title(s));
+ get_accession(t);
+ if (t.length() > max_accesion_len) {
+ //this->s[0] = 0;
+ throw std::runtime_error("Accession exceeds maximum length.");
+ }
+ else
+ strncpy(this->s, t.c_str(), max_accesion_len);
+ }
+ // Bounded lexicographic order on the raw buffer.
+ bool operator<(const Accession &y) const
+ {
+ return strncmp(s, y.s, max_accesion_len) < 0;
+ }
+ // Equality test that tolerates a missing version suffix on y: when y contains
+ // no '.', only the part of this accession before its own first '.' is compared.
+ bool match(const Accession &y) const
+ {
+ const void *p2 = memchr(y.s, '.', max_accesion_len);
+ size_t n = max_accesion_len;
+ if (p2 == 0) {
+ const void *p1 = memchr(s, '.', max_accesion_len);
+ if (p1)
+ n = (const char*)p1 - s;
+ }
+ return strncmp(s, y.s, n) == 0;
+ }
+ // Prints the buffer up to the first NUL or max_accesion_len characters.
+ friend std::ostream& operator<<(std::ostream &str, const Accession &x)
+ {
+ for (int i = 0; i < max_accesion_len && x.s[i] != 0; ++i)
+ str << x.s[i];
+ return str;
+ }
+ char s[max_accesion_len];
+ };
+
+ void load();
+
+ // Returns the taxon id recorded for `accession`, or 0 if the first entry not
+ // ordered before it does not match. Uses lower_bound, so accession2taxid_
+ // must be sorted.
+ unsigned get(const Accession &accession) const
+ {
+ std::vector<std::pair<Accession, unsigned> >::const_iterator i = std::lower_bound(accession2taxid_.begin(), accession2taxid_.end(), std::make_pair(accession, 0u));
+ if (i < accession2taxid_.end() && i->first.match(accession))
+ return i->second;
+ else
+ return 0;
+ }
+
+private:
+
+ std::vector<std::pair<Accession, unsigned> > accession2taxid_;
+};
+
+extern Taxonomy taxonomy;
\ No newline at end of file
diff --git a/src/dp/banded_sw.cpp b/src/dp/banded_sw.cpp
new file mode 100644
index 0000000..f4761a1
--- /dev/null
+++ b/src/dp/banded_sw.cpp
@@ -0,0 +1,316 @@
+/****
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+****/
+
+#include "dp.h"
+
+using std::pair;
+
+// Read-only view of a banded score matrix stored column by column in a flat vector.
+// Cell (i, j) (query row i, subject column j) lives at
+//   data_[(j + 1)*band_ + (i - (i0_ + j))],
+// i.e. each column holds band_ entries, column j starts at query row i0_ + j, and
+// the first band_ entries of the vector belong to the column "j = -1".
+struct Banded_traceback_matrix
+{
+ // Keeps a reference to `data`; the vector must outlive this object.
+ Banded_traceback_matrix(const vector<int> &data, int band, int i0) :
+ data_(data),
+ band_(band),
+ i0_(i0)
+ { }
+ // Score of cell (i, j). No bounds or band check is performed.
+ int operator()(int i, int j) const
+ {
+ return data_[(j + 1)*band_ + (i - (i0_ + j))];
+ }
+ // True if (i, j) is non-negative and row i lies inside the band of column j.
+ bool in_band(int i, int j) const
+ {
+ return i >= 0 && j >= 0 && i >= i0_ + j && i < i0_ + j + band_;
+ }
+ // Debug helper: prints the matrix with row/column indices, 0 for out-of-band cells.
+ void print(int qlen, int slen) const
+ {
+ printf("\n ");
+ for (int j = 0; j <= slen; ++j)
+ printf("%4i", j - 1);
+ printf("\n");
+ for (int i = 0; i <= qlen; ++i) {
+ printf("%4i", i - 1);
+ for (int j = 0; j <= slen; ++j)
+ printf("%4i", in_band(i - 1, j - 1) ? this->operator()(i - 1, j - 1) : 0);
+ printf("\n");
+ }
+ }
+ // Walks upwards within one column (decreasing query index).
+ struct Column_iterator
+ {
+ Column_iterator(const int *ptr, const int *end):
+ ptr_(ptr),
+ end_(end)
+ {}
+ // True while the current position has not moved before `end`.
+ bool good()
+ {
+ return ptr_ >= end_;
+ }
+ void operator--()
+ {
+ --ptr_;
+ }
+ int operator*() const
+ {
+ return *ptr_;
+ }
+ const int *ptr_, *end_;
+ };
+ // Iterator starting at (i, j); its lower limit is the first in-band cell of
+ // column j whose query index is >= 0.
+ Column_iterator column(int i, int j) const
+ {
+ const int i0 = i0_ + j;
+ return Column_iterator(&data_[(j + 1)*band_ + (i - i0)], &data_[(j + 1)*band_ + std::max(i0, 0) - i0]);
+ }
+ // Walks leftwards within one row (decreasing subject index). Moving from
+ // (i, j) to (i, j - 1) changes the flat index by -(band - 1), hence the stride.
+ struct Row_iterator
+ {
+ Row_iterator(const int *ptr, const int *end, int band) :
+ ptr_(ptr),
+ end_(end),
+ band_(band-1)
+ {}
+ bool good()
+ {
+ return ptr_ >= end_;
+ }
+ void operator--()
+ {
+ ptr_ -= band_;
+ }
+ int operator*() const
+ {
+ //cout << "*h=" << *ptr_ << endl;
+ //printf("ptr=%llx end=%llx\n", ptr_, end_);
+ return *ptr_;
+ }
+ const int *ptr_, *end_, band_;
+ };
+ // Iterator starting at (i, j). The limit is the later of the computed row start
+ // and &data_[band_], the start of column 0's storage.
+ // NOTE(review): the row-start expression looks like the leftmost in-band column
+ // of row i - confirm against the band geometry.
+ Row_iterator row(int i, int j) const
+ {
+ const int i0 = i0_ + j;
+ //cout << "j_end=" << i - i0_ - band_ << endl;
+ const int* p = &data_[(j + 1)*band_ + (i - i0)];
+ return Row_iterator(p, std::max(p - (j - (i - i0_ - band_) - 1)*(band_ - 1), &data_[band_]), band_);
+ }
+private:
+ const vector<int> &data_;
+ const int band_, i0_;
+};
+
+// Determines which gap explains the score at cell (i, j).
+// Walks upwards in column j (from row i - 1) and leftwards in row i (from column
+// j - 1) in lock-step, growing the gap penalty by one extension per step.
+// Returns 0 if a cell in the column plus the penalty reproduces the score,
+// 1 if a cell in the row does, and -1 if neither direction explains it.
+// On return `l` holds the gap length reached (1-based); the column is tested
+// before the row at equal length.
+int have_gap(const Banded_traceback_matrix &dp,
+    int i,
+    int j,
+    int &l)
+{
+    const int target = dp(i, j);
+    l = 1;
+    const int extend_cost = score_matrix.gap_extend();
+    int penalty = score_matrix.gap_open() + extend_cost;
+    Banded_traceback_matrix::Column_iterator up(dp.column(i - 1, j));
+    Banded_traceback_matrix::Row_iterator left(dp.row(i, j - 1));
+
+    for (;;) {
+        const bool up_ok = up.good();
+        const bool left_ok = left.good();
+        if (!up_ok && !left_ok)
+            break;
+        if (up_ok && target == *up - penalty)
+            return 0;
+        if (left_ok && target == *left - penalty)
+            return 1;
+        // An exhausted iterator is left untouched so that it stays exhausted.
+        if (left_ok)
+            --left;
+        if (up_ok)
+            --up;
+        ++l;
+        penalty += extend_cost;
+    }
+    return -1;
+}
+
+// Reconstructs the alignment that ends at cell (i, j) of the banded score matrix
+// `scores` (band width `band`, first band row `i0` at column 0) and records it in `l`:
+// query/subject ranges, edit transcript and the counters that are incremented below
+// (identities, positives, mismatches, gaps, gap_openings, length).
+// The counters are only incremented, never reset, here.
+// Throws std::runtime_error("Traceback error.") if a cell is explained neither by
+// a diagonal step nor by a gap.
+void traceback(const sequence &query,
+ const sequence &subject,
+ const vector<int> &scores,
+ int band,
+ int i0,
+ int i,
+ int j,
+ Hsp_data &l)
+{
+ Banded_traceback_matrix dp(scores, band, i0);
+ //dp.print(i + 1, j + 1);
+ // Ranges are half-open: the end is one past the last aligned position.
+ l.query_range.end_ = i + 1;
+ l.subject_range.end_ = j + 1;
+ l.transcript.clear();
+
+ int gap_len, score;
+
+ // Walk backwards until a cell with score <= 0 is reached (start of the local alignment).
+ while ((score = dp(i, j)) > 0) {
+ const int match_score = score_matrix(query[i], subject[j]);
+ //printf("i=%i j=%i score=%i subject=%c query=%c\n", i, j, dp(i, j), value_traits.alphabet[subject[j]], value_traits.alphabet[query[i]]);
+
+ // Diagonal step: the score is the diagonal predecessor plus the substitution score.
+ if (score == match_score + dp(i - 1, j - 1)) {
+ if (query[i] == subject[j]) {
+ l.transcript.push_back(op_match);
+ ++l.identities;
+ ++l.positives;
+ }
+ else {
+ l.transcript.push_back(op_substitution, subject[j]);
+ ++l.mismatches;
+ if (match_score > 0)
+ ++l.positives;
+ }
+ --i;
+ --j;
+ ++l.length;
+ }
+ else {
+ // Otherwise the cell must come from a gap; have_gap reports direction and length.
+ const int g = have_gap(dp, i, j, gap_len);
+ if (g == -1)
+ throw std::runtime_error("Traceback error.");
+ ++l.gap_openings;
+ l.length += gap_len;
+ l.gaps += gap_len;
+ if (g == 0) {
+ // Gap found in the column: query letters without subject counterpart.
+ i -= gap_len;
+ l.transcript.push_back(op_insertion, (unsigned)gap_len);
+ }
+ else {
+ // Gap found in the row: one deletion op per skipped subject letter.
+ for (; gap_len > 0; gap_len--)
+ l.transcript.push_back(op_deletion, subject[j--]);
+ }
+ }
+ }
+
+ l.query_range.begin_ = i + 1;
+ l.subject_range.begin_ = j + 1;
+ // Operations were collected end-to-start; put them into forward order.
+ l.transcript.reverse();
+ l.transcript.push_terminator();
+}
+
+// Storage for the banded dynamic-programming pass: `band` score cells per column
+// for cols + 1 columns (column index j + 1 holds subject column j), plus one
+// horizontal-gap buffer of band + 1 entries that is reused for every column.
+// Both vectors start out zero-filled.
+struct Banded_dp_matrix
+{
+
+ // Cursor over one column. score_.first points into the previous column's storage
+ // and score_.second into the current column's; both advance together.
+ struct Column_iterator
+ {
+ inline Column_iterator(const pair<int*, int*> &score, int *hgap) :
+ score_(score),
+ hgap_(hgap + 1)
+ {
+ }
+ // Cell being computed in the current column.
+ inline int& score()
+ {
+ return *score_.second;
+ }
+ // Same band offset in the previous column's storage (used as the diagonal predecessor).
+ inline int diag() const
+ {
+ return *score_.first;
+ }
+ // Horizontal-gap score read for this cell: one slot above the one written.
+ inline int hgap_in() const
+ {
+ return *hgap_;
+ }
+ // Horizontal-gap slot written for this cell.
+ inline int& hgap_out()
+ {
+ return *(hgap_ - 1);
+ }
+ inline void operator++()
+ {
+ ++score_.first;
+ ++score_.second;
+ ++hgap_;
+ }
+ private:
+ pair<int*, int*> score_;
+ int *hgap_;
+ };
+
+ // Cursor for subject column j, starting `offset` cells into the band.
+ inline Column_iterator column(int j, int offset)
+ {
+ return Column_iterator(std::make_pair(&score_[j*band_ + offset], &score_[(j + 1)*band_ + offset]), hgap_.data() + offset);
+ }
+
+ // Flat score storage, in the layout Banded_traceback_matrix expects.
+ const vector<int>& scores() const
+ {
+ return score_;
+ }
+
+ inline Banded_dp_matrix(int band, int cols) :
+ band_(band),
+ score_(band*(cols + 1)),
+ hgap_(band + 1)
+ {}
+
+private:
+
+ const int band_;
+ vector<int> score_, hgap_;
+
+};
+
+// Marks `out` as "no alignment": score 0 and a transcript that holds only the terminator.
+static void banded_sw_empty_result(Hsp_data &out)
+{
+    out.score = 0;
+    out.transcript.clear();
+    out.transcript.push_terminator();
+}
+
+// Banded Smith-Waterman with affine gap costs.
+// Fills the cells with d_begin <= i - j < d_end (i = query index, j = subject index)
+// for the subject columns [j_begin, j_end), then traces back from the best-scoring cell.
+//   d_begin, d_end  diagonal band, clamped to [-(slen - 1), qlen]
+//   j_begin, j_end  subject columns to process
+//   out             receives the score and, if the score is positive, the alignment
+// If the clamped band is empty or no cell scores above 0, `out` gets score 0 and an
+// empty transcript and no traceback is attempted. The original code passed
+// uninitialised coordinates to traceback() in that case.
+void banded_sw(const sequence &query, const sequence &subject, int d_begin, int d_end, int j_begin, int j_end, Hsp_data &out)
+{
+    assert(d_end > d_begin);
+    const int slen = (int)subject.length(),
+        qlen = (int)query.length();
+    d_begin = std::max(d_begin, -(slen - 1));
+    d_end = std::min(d_end, qlen);
+    const int band = d_end - d_begin;
+    if (band <= 0) {
+        // Clamping can empty the band; a negative size must never reach the matrix.
+        banded_sw_empty_result(out);
+        return;
+    }
+    const int gap_open = score_matrix.gap_open() + score_matrix.gap_extend(),
+        gap_extend = score_matrix.gap_extend();
+    int i0 = d_begin + j_begin, score = 0, max_i = -1, max_j = -1;
+    Banded_dp_matrix mtx(band, slen);
+    // i0 is the first query row of the band in column j; it moves down by one per column.
+    for (int j = j_begin; j < j_end; ++j, ++i0) {
+        const int i1 = std::min(i0 + band, qlen);
+        int i = std::max(i0, 0), vgap = 0;
+        Banded_dp_matrix::Column_iterator it = mtx.column(j, i - i0);
+        for (; i < i1; ++i, ++it) {
+            const int match_score = score_matrix(query[i], subject[j]);
+            int hgap = it.hgap_in();
+            // Best of diagonal step, horizontal gap, vertical gap and restart at 0.
+            int s = it.diag() + match_score;
+            if (s < hgap)
+                s = hgap;
+            if (s < vgap)
+                s = vgap;
+            if (s < 0)
+                s = 0;
+            // Each gap score is either extended or re-opened from this cell.
+            const int open = s - gap_open;
+            vgap -= gap_extend;
+            if (vgap < open)
+                vgap = open;
+            hgap -= gap_extend;
+            if (hgap < open)
+                hgap = open;
+            it.hgap_out() = hgap;
+            it.score() = s;
+            if (s > score) {
+                score = s;
+                max_i = i;
+                max_j = j;
+            }
+        }
+    }
+    if (score <= 0) {
+        // No positive cell was found, so max_i/max_j were never set.
+        banded_sw_empty_result(out);
+        return;
+    }
+    out.score = score;
+    traceback(query, subject, mtx.scores(), band, d_begin + j_begin, max_i, max_j, out);
+}
\ No newline at end of file
diff --git a/src/dp/comp_based_stats.cpp b/src/dp/comp_based_stats.cpp
index a65c36b..5817816 100644
--- a/src/dp/comp_based_stats.cpp
+++ b/src/dp/comp_based_stats.cpp
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#include "dp.h"
@@ -113,4 +113,13 @@ int Bias_correction::operator()(const Hsp_data &hsp) const
}
}
return (int)s;
+}
+
+// Adds up the per-position corrections over the query positions
+// [d.i, d.query_end()) and returns the sum truncated to int.
+int Bias_correction::operator()(const Diagonal_segment &d) const
+{
+    const int last = d.query_end();
+    float total = 0;
+    int pos = d.i;
+    while (pos < last) {
+        total += (*this)[pos];
+        ++pos;
+    }
+    return (int)total;
}
\ No newline at end of file
diff --git a/src/dp/diag_scores.cpp b/src/dp/diag_scores.cpp
new file mode 100644
index 0000000..97ccc34
--- /dev/null
+++ b/src/dp/diag_scores.cpp
@@ -0,0 +1,315 @@
+/****
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+****/
+
+#include "dp.h"
+
+int Diag_scores::min_diag_score = 19;
+int Diag_scores::min_low_score = 13;
+
+// Scores one diagonal of length `len` with a simple x-drop scheme and appends every
+// ungapped segment whose peak score reaches `cutoff` to `diags`.
+//   query, subject  first letters of the diagonal
+//   query_bc        currently unused (the bias term is disabled)
+//   qbegin, jbegin  offsets added to the segment start for the reported coordinates
+// A segment is closed when the running score drops to <= 0 or falls more than
+// 10 below its peak; scoring then restarts right after the current position.
+void score_diagonal2(const Letter *query, const Bias_correction &query_bc, const Letter *subject, int len, int qbegin, int jbegin, vector<Diagonal_node> &diags, int cutoff)
+{
+    const int xdrop = 10;
+    int seg_begin = 0, seg_end = 0;
+    int running = 0, peak = 0;
+    for (int pos = 0; pos < len; ++pos) {
+        running += score_matrix(query[pos], subject[pos]);
+        const bool dropped = running <= 0 || peak - running > xdrop;
+        if (dropped) {
+            if (peak >= cutoff)
+                diags.push_back(Diagonal_node(qbegin + seg_begin, jbegin + seg_begin, seg_end - seg_begin, peak));
+            running = 0;
+            peak = 0;
+            seg_begin = pos + 1;
+        }
+        else if (running > peak) {
+            peak = running;
+            seg_end = pos + 1;
+        }
+    }
+    // Segment that was still open when the diagonal ended.
+    if (peak >= cutoff)
+        diags.push_back(Diagonal_node(qbegin + seg_begin, jbegin + seg_begin, seg_end - seg_begin, peak));
+}
+
+// Score-profile variant of the column scan (only has a body when __SSE2__ is defined).
+// For subject columns starting at max(-(i - j + 15), j) it accumulates byte-wide running
+// scores for a group of diagonals (presumably 16, one per byte lane - confirm against
+// score_vector) whose first query row at column j is i, and stores:
+//   buf        the running score vector of every column (stride buf.diags() bytes)
+//   local_max  the maximum of every 16-column block
+//   sv_max     the overall maximum per lane
+// `log` and `block_len` are not used; the block size is hard-coded to 16.
+void scan_cols(const Long_score_profile &qp, sequence s, int i, int j, int j_end, vector<uint8_t> &sv_max, bool log, Band &buf, Band &local_max, int block_len)
+{
+#ifdef __SSE2__
+ typedef score_vector<uint8_t> Sv;
+ const Sv vbias(score_matrix.bias());
+ const int qlen = (int)qp.length(),
+ diags = buf.diags();
+
+ // j2: current subject column, i3: matching query row of the first diagonal,
+ // j2_end: last column bounded by the query length and by j_end.
+ int j2 = std::max(-(i - j + 15), j),
+ i3 = j2 + i - j,
+ j2_end = std::min(qlen - (i - j), j_end);
+ uint8_t *local_max_ptr = local_max.data() + (j2 - j) / 16 * diags,
+ *buf_ptr = buf.data() + (j2 - j)*diags;
+ Sv v, max, global_max;
+ for (; j2 < j2_end; ++j2, ++i3) {
+ assert(j2 >= 0);
+ const uint8_t *q = qp.get(s[j2], i3);
+ // Profile scores are added and the bias subtracted again afterwards
+ // (presumably the profile stores biased, saturating values - confirm).
+ v = v + score_vector<uint8_t>(q);
+ v -= vbias;
+ max.max(v);
+ assert(buf.check(buf_ptr + 16));
+ v.store(buf_ptr);
+ buf_ptr += diags;
+ // End of a 16-column block: publish the block maximum and start a new one.
+ if (((j2 - j) & 15) == 15) {
+ global_max.max(max);
+ assert(local_max.check(local_max_ptr + 16));
+ max.store(local_max_ptr);
+ local_max_ptr += diags;
+ max = score_vector<uint8_t>();
+ }
+ }
+ // Trailing partial block.
+ if (((j2 - j) & 15) != 0) {
+ global_max.max(max);
+ assert(local_max.check(local_max_ptr));
+ max.store(local_max_ptr);
+ }
+ global_max.store(&sv_max[0]);
+#endif
+}
+
+// Sequence variant of the column scan. Produces the same three outputs as the
+// profile variant: running scores in `buf`, per-16-column maxima in `local_max`
+// and overall maxima in `sv_max`, for the diagonals whose query row at column j
+// is i .. i + 15.
+// With __SSE2__ the query letters are loaded 16 at a time and scored against
+// s[j2]; otherwise each of the 16 diagonals is scanned separately with int
+// arithmetic and the results are clamped to 255 when stored.
+// `log` and `block_len` are not used; the block size is hard-coded to 16.
+void scan_cols(const sequence &q, sequence s, int i, int j, int j_end, vector<uint8_t> &sv_max, bool log, Band &buf, Band &local_max, int block_len)
+{
+#ifdef __SSE2__
+ typedef score_vector<uint8_t> Sv;
+ const Sv vbias(score_matrix.bias());
+ const int qlen = (int)q.length(),
+ diags = buf.diags();
+
+ // j2: current subject column, i3: matching query row of the first diagonal.
+ int j2 = std::max(-(i - j + 15), j),
+ i3 = j2 + i - j,
+ j2_end = std::min(qlen - (i - j), j_end);
+ uint8_t *local_max_ptr = local_max.data() + (j2 - j) / 16 * diags,
+ *buf_ptr = buf.data() + (j2 - j)*diags;
+ Sv v, max, global_max;
+ //__m128i f = _mm_loadu_si128((__m128i*)&q[0]);
+ for (; j2 < j2_end; ++j2, ++i3) {
+ assert(j2 >= 0);
+ // Unaligned 16-byte load of query letters starting at row i3.
+ // NOTE(review): near the query ends this reads outside [0, qlen) - presumably
+ // the sequence storage is padded; confirm.
+ const Sv scores(s[j2], _mm_loadu_si128((__m128i*)&q[i3]));
+ //const Sv scores(s[j2], f);
+ v = v + scores;
+ v -= vbias;
+ max.max(v);
+ assert(buf.check(buf_ptr + 16));
+ v.store(buf_ptr);
+ buf_ptr += diags;
+ // End of a 16-column block: publish the block maximum and start a new one.
+ if (((j2 - j) & 15) == 15) {
+ global_max.max(max);
+ assert(local_max.check(local_max_ptr + 16));
+ max.store(local_max_ptr);
+ local_max_ptr += diags;
+ max = score_vector<uint8_t>();
+ }
+ }
+ // Trailing partial block.
+ if (((j2 - j) & 15) != 0) {
+ global_max.max(max);
+ assert(local_max.check(local_max_ptr));
+ max.store(local_max_ptr);
+ }
+ global_max.store(&sv_max[0]);
+#else
+ const int qlen = (int)q.length(),
+ diags = buf.diags();
+
+ // Scalar fallback: one pass per diagonal; lane (i0 - i) selects the byte
+ // written within each column of buf / local_max.
+ for (int i0 = i; i0 < i + 16; ++i0) {
+ int j2 = std::max(-(i0 - j), j),
+ i3 = j2 + i0 - j,
+ j2_end = std::min(qlen - (i0 - j), j_end);
+ uint8_t *local_max_ptr = local_max.data() + (j2 - j) / 16 * diags + (i0 - i),
+ *buf_ptr = buf.data() + (j2 - j)*diags + (i0 - i);
+ int v = 0, max = 0, global_max = 0;
+ for (; j2 < j2_end; ++j2, ++i3) {
+ assert(j2 >= 0);
+ // Running score, floored at 0.
+ v = std::max(v + score_matrix(q[i3], s[j2]), 0);
+ max = std::max(max, v);
+ assert(buf.check(buf_ptr + 1));
+ *buf_ptr = (uint8_t)std::min(v, 255);
+ buf_ptr += diags;
+ if (((j2 - j) & 15) == 15) {
+ global_max = std::max(global_max, max);
+ assert(local_max.check(local_max_ptr + 1));
+ *local_max_ptr = (uint8_t)std::min(max,255);
+ local_max_ptr += diags;
+ max = 0;
+ }
+ }
+ if (((j2 - j) & 15) != 0) {
+ global_max = std::max(global_max, max);
+ assert(local_max.check(local_max_ptr));
+ *local_max_ptr = (uint8_t)std::min(max, 255);
+ }
+ sv_max[i0 - i] = (uint8_t)std::min(global_max, 255);
+ }
+#endif
+}
+
+
+// Scans d[end - 1] down to d[begin] for the position where a segment should start.
+// Returns the first index (coming from the right) whose value is 0. If there is
+// none, returns the rightmost index holding the smallest value, provided
+// begin > d0; otherwise returns d0 - 1. For an empty range with begin > d0 the
+// result is `end`.
+int get_low_idx(Band::Iterator &d, int begin, int end, int d0)
+{
+    uint8_t lowest = 255;
+    int lowest_pos = end;
+    int k = end - 1;
+    while (k >= begin) {
+        if (d[k] == 0)
+            return k;
+        if (d[k] < lowest) {
+            lowest = d[k];
+            lowest_pos = k;
+        }
+        --k;
+    }
+    if (begin > d0)
+        return lowest_pos;
+    return d0 - 1;
+}
+
+// Returns the index of the largest value in d[begin, end); on ties the leftmost
+// index wins. Requires 0 <= begin < end.
+int get_max_idx(Band::Iterator &d, int begin, int end)
+{
+    assert(begin >= 0 && begin < end);
+    int best_pos = begin;
+    int best = d[begin];
+    int k = begin + 1;
+    while (k < end) {
+        if (d[k] > best) {
+            best_pos = k;
+            best = d[k];
+        }
+        ++k;
+    }
+    return best_pos;
+}
+
+// Extracts ungapped segments from the running scores `d` of one diagonal, working
+// from `end` back towards `begin` (offsets along the diagonal).
+// In each round the maximum of the last (partial) block before `end` is paired with
+// the low point found by get_low_idx between `last` and that maximum; the score
+// difference is the segment score. Segments reaching `cutoff`, or positive ones
+// whose peak equals `best_score`, are appended to `diags` with coordinates offset
+// by (i, j).
+// Returns the start offset of the last segment appended, or INT_MAX if none was.
+// `log` and `query_bc` are unused (the bias term is commented out).
+int get_diag(int i, int j, Band::Iterator &d, int begin, int last, int end, int d0, vector<Diagonal_node> &diags, int block_len, bool log, int cutoff, int best_score, const Bias_correction &query_bc)
+{
+ assert(end >= begin && begin >= 0);
+ int z = std::numeric_limits<int>::max();
+ while (end > begin) {
+ /*const int p1 = get_score_idx(d, std::max(end - block_len, begin), end, max_score),
+ p0 = get_low_idx(d, last, std::min(std::min(begin + block_len, end), p1));*/
+ // p1: peak within the block that ends at `end`; p0: low point to its left
+ // (may be d0 - 1, meaning "before the start of the diagonal").
+ const int mod = end%block_len,
+ p1 = get_max_idx(d, std::max(begin, end - (mod == 0 ? block_len : mod)), end),
+ p0 = get_low_idx(d, last, p1, d0);
+ assert(p1 >= p0);
+ assert(p1 < end);
+ // A low point before d0 contributes a baseline of 0.
+ const int score = d[p1] - (p0 >= d0 ? d[p0] : 0);
+ if (score >= cutoff || (d[p1] == best_score && score > 0)) {
+ //if (score >= cutoff) {
+ assert(i + p0 + 1 >= 0);
+ assert(j + p0 + 1 >= 0);
+ Diagonal_segment diag(Diagonal_segment(i + p0 + 1, j + p0 + 1, p1 - p0, score));
+ //if (diag.score + query_bc(diag) >= cutoff) {
+ diag.score = diag.score; // +query_bc(diag);
+ diags.push_back(diag);
+ assert(p0 + 1 >= 0);
+ z = p0 + 1;
+ /*if (log)
+ cout << diags.back() << endl;*/
+ //}
+ }
+ // Continue to the left of the low point.
+ end = p0;
+ }
+ assert(z >= last);
+ return z;
+}
+
+// Collects ungapped segments for the diagonal stored in lane `o` of the score buffers.
+// The per-block maxima (local_max) are scanned between j_begin and j_end; every run of
+// blocks whose maximum is >= cutoff and strictly rising is handed to ::get_diag,
+// which extracts the segments from the per-column scores (score_buf).
+// (i, j) are the query/subject coordinates of buffer offset 0 on this diagonal.
+void Diag_scores::get_diag(int i, int j, int o, int j_begin, int j_end, vector<Diagonal_node> &diags, int cutoff, bool log, const Bias_correction &query_bc)
+{
+ Band::Iterator d(local_max.diag(o)), d2(score_buf.diag(o));
+ // j0/j1 come from dj0()/dj1() (presumably the valid subject range of this
+ // diagonal - confirm); b0/b1 are the corresponding block indices.
+ const int diag = i - j,
+ j0 = dj0(diag),
+ j1 = dj1(diag),
+ b0 = (j0 - j) / block_len,
+ b1 = (j1 - j + block_len - 1) / block_len;
+ // begin/best delimit the current run of qualifying blocks (-1 = no open run);
+ // last is the block after the previously reported run.
+ int p = std::max((j_begin - j) / block_len, b0), p_begin = p, p_end = std::min((j_end - j + block_len - 1) / block_len, b1), best = -1, best_score = -1, begin = -1, max_score, last = p;
+ // Lowers `last` to b0 if it starts above it.
+ while (last > b0)
+ --last;
+ for (; p < p_end; ++p) {
+ if ((max_score = d[p]) >= cutoff
+ && (p == 0 || max_score > d[p - 1])) {
+ if (begin == -1)
+ begin = p;
+ best = p;
+ best_score = max_score;
+ }
+ else if (begin != -1) {
+ // The run ended: extract its segments.
+ const int z = ::get_diag(i, j, d2, std::max(begin*block_len, j0 - j), std::max(last*block_len, j0 - j), std::min((best + 1)*block_len, j1 - j), j0 - j, diags, block_len, log, cutoff, best_score, query_bc);
+ if (z < std::numeric_limits<int>::max()) {
+ assert(diags.back().len > 0);
+ assert(diags.back().j >= 0 && diags.back().subject_end() <= slen);
+ assert(diags.back().i >= 0 && diags.back().query_end() <= qlen);
+ last = best + 1;
+ }
+ begin = -1;
+ best = -1;
+ }
+ }
+
+ // A run that is still open at the end of the scanned window.
+ if (begin != -1) {
+ if (best == p_end - 1) {
+ // The run touches the window end: follow it past the window while the
+ // block maxima keep qualifying, up to b1.
+ for (; best<b1 && d[best] >= cutoff && (best == 0 || d[best] > d[best - 1]); ++best);
+ best -= 1;
+ best_score = d[best];
+ }
+ const int z = ::get_diag(i, j, d2, std::max(begin*block_len, j0 - j), std::max(last*block_len, j0 - j), std::min((best + 1)*block_len, j1 - j), j0-j, diags, block_len, log, cutoff, best_score, query_bc);
+ if (z < std::numeric_limits<int>::max()) {
+ assert(diags.back().len > 0);
+ assert(diags.back().j >= 0 && diags.back().subject_end() <= slen);
+ assert(diags.back().i >= 0 && diags.back().query_end() <= qlen);
+ }
+ }
+}
+
+// Scans all diagonals in [d_begin, d_end) of query vs. subject and appends the
+// ungapped segments found to `diags`.
+// The band is processed in groups of 16 diagonals: scan_cols fills score_buf,
+// local_max and sv_max for a group, then every diagonal whose maximum reaches
+// min_diag_score is turned into segments.
+// Stores qlen, slen, fast, d_begin, d_end, i_begin and j_begin in the object.
+void Diag_scores::scan_diags(int d_begin, int d_end, sequence query, sequence subject, const Long_score_profile &qp, const Bias_correction &query_bc, bool log, vector<Diagonal_node> &diags, bool fast)
+{
+ assert(d_end > d_begin);
+ qlen = (int)query.length();
+ slen = (int)subject.length();
+ const int band = d_end - d_begin;
+ this->fast = fast;
+ this->d_begin = d_begin;
+ this->d_end = d_end;
+ // Query row / subject column of buffer offset 0 for the first diagonal.
+ i_begin = std::max(0, d_end - 1) - band + 1;
+ j_begin = i_begin - d_begin;
+ const int j1 = std::min(qlen - d_begin, slen);
+ sv_max.clear();
+ sv_max.resize(16);
+ assert(j1 > j_begin);
+ score_buf.init(16, j1 - j_begin);
+ local_max.init(16, (j1 - j_begin + block_len - 1) / block_len);
+
+ for (int i = i_begin; i < i_begin + band; i += 16) {
+
+ memset(sv_max.data(), 0, sv_max.size());
+
+#ifdef __SSE2__
+ scan_cols(qp, subject, i, j_begin, j1, sv_max, log, score_buf, local_max, block_len);
+#else
+ scan_cols(query, subject, i, j_begin, j1, sv_max, log, score_buf, local_max, block_len);
+#endif
+
+ for (int o = 0; o < 16; ++o)
+ if (sv_max[o] >= Diag_scores::min_diag_score) {
+ // A maximum this close to 255 is re-scored with int arithmetic by
+ // score_diagonal2 (presumably because the byte score may have saturated).
+ if (sv_max[o] >= 255 - score_matrix.bias()) {
+ // s <= 0 shifts the start so that the query index is not negative.
+ const int s = std::min(i + o, 0), i0 = i + o - s, j0 = j_begin - s;
+ score_diagonal2(&query[i0], query_bc, &subject[j0], std::min((int)query.length() - i0, (int)subject.length() - j0), i0, j0, diags, fast ? min_diag_score : min_low_score);
+ }
+ else
+ get_diag(i + o, j_begin, o, j_begin, j1, diags, min_diag_score, log, query_bc);
+ }
+
+ }
+}
\ No newline at end of file
diff --git a/src/dp/dp.h b/src/dp/dp.h
index 94454b5..dbdec70 100644
--- a/src/dp/dp.h
+++ b/src/dp/dp.h
@@ -1,25 +1,27 @@
/****
-Copyright (c) 2016, University of Tuebingen, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef DP_H_
#define DP_H_
#include <utility>
+#include <map>
+#include <list>
#include "../basic/match.h"
#include "../align/align.h"
#include "score_profile.h"
@@ -40,6 +42,107 @@ struct Bias_correction : public vector<float>
score += (*this)[query_anchor + i*mult];
}
int operator()(const Hsp_data &hsp) const;
+ int operator()(const Diagonal_segment &d) const;
+};
+
+// One seed hit between the query (in a given frame) and a subject, together with
+// the ungapped diagonal segment associated with it.
+struct Seed_hit
+{
+ // Leaves the unsigned members uninitialised.
+ Seed_hit()
+ {}
+ // prefix_score starts out as the score of the ungapped segment.
+ Seed_hit(unsigned frame, unsigned subject, unsigned subject_pos, unsigned query_pos, const Diagonal_segment &ungapped) :
+ frame_(frame),
+ subject_(subject),
+ subject_pos_(subject_pos),
+ query_pos_(query_pos),
+ ungapped(ungapped),
+ prefix_score(ungapped.score)
+ { }
+ // Diagonal of the hit: query position minus subject position.
+ int diagonal() const
+ {
+ return (int)query_pos_ - (int)subject_pos_;
+ }
+ // Orders hits by DESCENDING ungapped score (higher score sorts first).
+ bool operator<(const Seed_hit &rhs) const
+ {
+ return ungapped.score > rhs.ungapped.score;
+ }
+ // Order defined by Diagonal_segment::cmp_subject_end on the ungapped segments.
+ static bool compare_pos(const Seed_hit &x, const Seed_hit &y)
+ {
+ return Diagonal_segment::cmp_subject_end(x.ungapped, y.ungapped);
+ }
+ // Orders by frame, then diagonal, then subject start of the ungapped segment.
+ static bool compare_diag(const Seed_hit &x, const Seed_hit &y)
+ {
+ return x.frame_ < y.frame_ || (x.frame_ == y.frame_ && (x.diagonal() < y.diagonal() || (x.diagonal() == y.diagonal() && x.ungapped.j < y.ungapped.j)));
+ }
+ // Key extractor returning the frame of a hit.
+ struct Frame
+ {
+ unsigned operator()(const Seed_hit &x) const
+ {
+ return x.frame_;
+ }
+ };
+
+ unsigned frame_, subject_, subject_pos_, query_pos_;
+ Diagonal_segment ungapped;
+ unsigned prefix_score;
+};
+
+// Summary of an HSP: diagonal span, score, frame and the query/subject ranges it covers.
+struct Hsp_traits
+{
+ // Starts with an empty diagonal span (d_min = INT_MAX, d_max = INT_MIN) and score 0;
+ // the two ranges are default-constructed.
+ Hsp_traits(unsigned frame) :
+ d_min(std::numeric_limits<int>::max()),
+ d_max(std::numeric_limits<int>::min()),
+ score(0),
+ frame((int)frame)
+ {}
+ // Score of `d` scaled by (1 - overlap), where overlap is the larger of its
+ // subject and query overlap factors with this HSP.
+ int partial_score(const Diagonal_segment &d) const
+ {
+ const double overlap = std::max(d.subject_range().overlap_factor(subject_range), d.query_range().overlap_factor(query_range));
+ return int((1 - overlap)*d.score);
+ }
+ // Same as above for another HSP.
+ int partial_score(const Hsp_traits &x) const
+ {
+ const double overlap = std::max(x.subject_range.overlap_factor(subject_range), x.query_range.overlap_factor(query_range));
+ return int((1 - overlap)*x.score);
+ }
+ // True if BOTH the query ranges and the subject ranges do not intersect.
+ bool disjoint(const Diagonal_segment &d) const
+ {
+ return intersect(query_range, d.query_range()).length() == 0 && intersect(subject_range, d.subject_range()).length() == 0;
+ }
+ bool disjoint(const Hsp_traits &x) const
+ {
+ return intersect(query_range, x.query_range).length() == 0 && intersect(subject_range, x.subject_range).length() == 0;
+ }
+ // Weaker test: true if EITHER the query ranges or the subject ranges do not intersect.
+ bool rel_disjoint(const Diagonal_segment &d) const
+ {
+ return intersect(query_range, d.query_range()).length() == 0 || intersect(subject_range, d.subject_range()).length() == 0;
+ }
+ bool rel_disjoint(const Hsp_traits &x) const
+ {
+ return intersect(query_range, x.query_range).length() == 0 || intersect(subject_range, x.subject_range).length() == 0;
+ }
+ // True if the other start lies in the same direction (or at the same position)
+ // in both query and subject relative to this HSP's start.
+ bool collinear(const Hsp_traits &x) const
+ {
+ const int di = x.query_range.begin_ - query_range.begin_, dj = x.subject_range.begin_ - subject_range.begin_;
+ return (di >= 0 && dj >= 0) || (di <= 0 && dj <= 0);
+ }
+ bool collinear(const Diagonal_segment &d) const
+ {
+ const int di = d.i - query_range.begin_, dj = d.j - subject_range.begin_;
+ return (di >= 0 && dj >= 0) || (di <= 0 && dj <= 0);
+ }
+ // Orders by frame, then by smallest diagonal.
+ static bool cmp_diag(const Hsp_traits &x, const Hsp_traits &y)
+ {
+ return x.frame < y.frame || (x.frame == y.frame && x.d_min < y.d_min);
+ }
+ // Key extractor returning the frame (stored as int, returned as unsigned).
+ struct Frame
+ {
+ unsigned operator()(const Hsp_traits &x) const
+ {
+ return x.frame;
+ }
+ };
+ int d_min, d_max, score, frame;
+ interval query_range, subject_range;
};
template<typename _score>
@@ -49,12 +152,15 @@ int smith_waterman(const sequence &query, const sequence &subject, unsigned band
int xdrop_ungapped(const Letter *query, const Letter *subject, unsigned seed_len, unsigned &delta, unsigned &len);
int xdrop_ungapped(const Letter *query, const Letter *subject, unsigned &delta, unsigned &len);
int xdrop_ungapped_right(const Letter *query, const Letter *subject, int &len);
+Diagonal_segment xdrop_ungapped(const sequence &query, const Bias_correction &query_bc, const sequence &subject, int qa, int sa);
+Diagonal_segment xdrop_ungapped(const sequence &query, const sequence &subject, int qa, int sa);
-void greedy_align(sequence query, sequence subject, const vector<Diagonal_segment> &sh, bool log);
-void greedy_align(sequence query, sequence subject, const Diagonal_segment &sh, bool log);
-void greedy_align(sequence query, const Long_score_profile &qp, sequence subject, const Diagonal_segment &sh, bool log);
-void greedy_align2(sequence query, const Long_score_profile &qp, sequence subject, const vector<Diagonal_segment> &sh, bool log, Hsp_data &out);
-void greedy_align(sequence query, const Long_score_profile &qp, sequence subject, int qa, int sa, bool log);
+struct Local {};
+struct Global {};
+
+int greedy_align(sequence query, const Long_score_profile &qp, const Bias_correction &query_bc, sequence subject, vector<Seed_hit>::const_iterator begin, vector<Seed_hit>::const_iterator end, bool log, std::list<Hsp_data> &hsps, std::list<Hsp_traits> &ts, unsigned frame);
+int greedy_align(sequence query, const Long_score_profile &qp, const Bias_correction &query_bc, sequence subject, bool log, std::list<Hsp_data> &hsps, std::list<Hsp_traits>::const_iterator t_begin, std::list<Hsp_traits>::const_iterator t_end, std::list<Hsp_traits> &ts, int cutoff, unsigned frame);
+int estimate_score(const Long_score_profile &qp, sequence s, int d, int d1, bool log);
template<typename _t>
struct Fixed_score_buffer
@@ -69,6 +175,12 @@ struct Fixed_score_buffer
for (size_t i = 0; i<col_size; ++i)
data_[i] = init;
}
+
+ std::pair<int, int> find(_t s) const // Locates the first stored element equal to s and returns its position as (index within column, column number).
+ {
+ const int i = int(std::find(data_.begin(), data_.end(), s) - data_.begin()); // flat index of the first match; equals data_.size() when s is absent, which yields a column number one past the last
+ return std::pair<int, int>(int(i%col_size_), int(i / col_size_)); // split the flat index by the column size
+ }
inline std::pair<_t*, _t*> get()
{
@@ -92,54 +204,270 @@ struct Fixed_score_buffer
return data_[j*col_size_ + i];
}
+ friend std::ostream& operator<<(std::ostream &s, const Fixed_score_buffer &buf) // Writes the buffer as a tab-separated table: a header line of column numbers, then one line per row index i.
+ {
+ s << '\t'; // empty corner cell above the row labels
+ for (int j = 0; j < int(buf.data_.size() / buf.col_size_); ++j) // number of columns = stored elements / column size
+ s << j << '\t';
+ s << endl;
+ for (int i = 0; i < int(buf.col_size_); ++i) {
+ s << i << '\t'; // row label
+ for (int j = 0; j < int(buf.data_.size() / buf.col_size_); ++j)
+ s << buf(i, j) << '\t';
+ s << endl;
+ }
+ return s;
+ }
+
private:
vector<_t> data_;
size_t col_size_;
};
+template<typename _score, typename _mode>
+const Fixed_score_buffer<_score>& needleman_wunsch(sequence query, sequence subject, int &max_score, const _mode&, const _score&);
+
struct Diagonal_node : public Diagonal_segment // A diagonal segment extended with an edge-list position and running path scores.
{
+ enum { estimate, finished };
+ Diagonal_node() : // Empty node: link_idx -1 and all path scores 0.
+ Diagonal_segment(),
+ link_idx(-1),
+ prefix_score(0),
+ path_max(0),
+ path_min(0)
+ {}
+ Diagonal_node(int query_pos, int subject_pos, int len, int score, int link_idx=-1) : // prefix_score, path_max and path_min all start at the segment's own score
+ Diagonal_segment(query_pos, subject_pos, len, score),
+ link_idx(link_idx),
+ prefix_score(score),
+ path_max(score),
+ path_min(score)
+ {}
+ Diagonal_node(const Diagonal_segment &d) : // Wraps an existing segment: link_idx -1, path scores start at d.score.
+ Diagonal_segment(d),
+ link_idx(-1),
+ prefix_score(d.score),
+ path_max(d.score),
+ path_min(d.score)
+ {}
+ void deactivate() // Sets link_idx to 0, in contrast to the -1 used by reset() and the constructors.
+ {
+ link_idx = 0;
+ }
+ void reset() // Restores the freshly constructed state: link_idx -1, all path scores equal to the segment score.
+ {
+ link_idx = -1;
+ prefix_score = score;
+ path_max = score;
+ path_min = score;
+ }
+ bool is_maximum() const // True when the prefix score equals the path maximum.
+ {
+ return path_max == prefix_score;
+ }
+ int rel_score() const // prefix_score if it equals path_max, otherwise prefix_score minus path_min.
+ {
+ return prefix_score == path_max ? prefix_score : prefix_score - path_min;
+ }
+ static bool cmp_prefix_score(const Diagonal_node *x, const Diagonal_node *y) // Comparator on node pointers: descending prefix_score.
+ {
+ return x->prefix_score > y->prefix_score;
+ }
+ static bool cmp_rel_score(const Diagonal_node *x, const Diagonal_node *y) // Comparator on node pointers: descending rel_score().
+ {
+ return x->rel_score() > y->rel_score();
+ }
+ int link_idx, prefix_score, path_max, path_min;
+};
+
+struct Diag_graph
+{
+
+ enum { end = 0xffffffffffffffffllu };
struct Edge // Edge record stored in Diag_graph::edges; node_in / node_out are node indices.
{
Edge() :
prefix_score(0),
- node(),
- exact(true)
- {}
- Edge(int prefix_score, int j, unsigned node, bool exact) :
+ node_in() // NOTE(review): only prefix_score and node_in are initialized here; the remaining members are left indeterminate
+ {
+ }
+ Edge(int prefix_score, int path_max, int j, unsigned node_in, unsigned node_out, int path_min, int prefix_score_begin) : // Stores every argument in the member of the same name.
prefix_score(prefix_score),
+ path_max(path_max),
j(j),
- node(node),
- exact(exact)
- {}
- operator int() const
+ path_min(path_min),
+ prefix_score_begin(prefix_score_begin),
+ node_in(node_in),
+ node_out(node_out)
+ {
+ }
+ /*operator int() const
{
return prefix_score;
}
- int prefix_score, j;
- unsigned node;
- bool exact;
+ bool operator<(const Edge &x) const
+ {
+ return prefix_score > x.prefix_score;
+ }*/
+ int prefix_score, path_max, j, path_min, prefix_score_begin;
+ unsigned node_in, node_out;
};
- Diagonal_node() :
- Diagonal_segment(),
- diff(std::numeric_limits<int>::min())
- {}
- Diagonal_node(int query_pos, int subject_pos, int len, int score) :
- Diagonal_segment(query_pos, subject_pos, len, score),
- diff(std::numeric_limits<int>::min())
- {}
- Diagonal_node(const Diagonal_segment &d) :
- Diagonal_segment(d),
- diff(std::numeric_limits<int>::min())
- {}
- enum { n_path = 2 };
- Top_list<Edge, n_path> edges;
- int diff;
+ void init() // Empties the graph: removes all nodes and all edges.
+ {
+ nodes.clear();
+ edges.clear();
+ }
+
+ void init(unsigned node) // Points the node's link_idx at the current end of the edge list; throws std::runtime_error if the edge count no longer fits in an int.
+ {
+ if (edges.size() >= (size_t)std::numeric_limits<int>::max()) // link_idx is an int, so the edge count must stay representable
+ throw std::runtime_error("Too many edges.");
+ nodes[node].link_idx = (int)edges.size();
+ }
+
+ void load(vector<Seed_hit>::const_iterator begin, vector<Seed_hit>::const_iterator end);
+ void sort();
+ void clear_edges();
+
+ vector<Edge>::iterator add_edge(const Edge &edge) // Inserts edge at the link_idx position of node edge.node_in, updates that node's best path scores, and returns an iterator to the inserted edge.
+ {
+ for (vector<Diagonal_node>::iterator j = nodes.begin() + edge.node_in + 1; j < nodes.end(); ++j) // shift link_idx of the following nodes by one, stopping at the first node whose link_idx is still -1
+ if (j->link_idx == -1)
+ break;
+ else
+ ++j->link_idx;
+ assert(nodes[edge.node_in].link_idx >= 0 && nodes[edge.node_in].link_idx <= (int)edges.size());
+ Diagonal_node &d = nodes[edge.node_in];
+ if (edge.prefix_score > d.prefix_score) { // the node keeps the scores of its best edge so far
+ d.prefix_score = edge.prefix_score;
+ d.path_max = edge.path_max;
+ d.path_min = edge.path_min;
+ }
+ return edges.insert(edges.begin() + d.link_idx++, edge); // insert at the old link_idx, then advance it past the new edge
+ }
+
+ vector<Edge>::const_iterator get_edge(size_t node, int j) const // Best edge into `node` with edge.j <= j whose prefix_score beats the node's own score; edges.end() if there is none.
+ {
+ const Diagonal_node &d = nodes[node];
+ if (d.score == 0)
+ return d.link_idx > 0 ? edges.begin() + (d.link_idx - 1) : edges.end(); // zero-score node: take the edge just before link_idx without scanning; end() when no such slot exists
+ if (edges.empty())
+ return edges.end();
+ int max_score = d.score;
+ vector<Edge>::const_iterator max_edge = edges.end();
+ for (int k = d.link_idx - 1; k >= 0 && edges[k].node_in == node; --k) // scan by index so that no iterator before edges.begin() is ever formed
+ if (edges[k].j <= j && edges[k].prefix_score > max_score) {
+ max_edge = edges.begin() + k;
+ max_score = edges[k].prefix_score;
+ }
+ return max_edge;
+ }
+
+ int prefix_score(size_t node, int j, int &path_max, int &path_min) const // Prefix score of node using the edge selected by get_edge(node, j); also reports path_max and path_min through the reference parameters.
+ {
+ const vector<Edge>::const_iterator i = get_edge(node, j);
+ path_max = i == edges.end() ? nodes[node].score : std::max(nodes[node].score, i->path_max); // no edge selected: fall back to the node's own score
+ path_min = i == edges.end() ? nodes[node].score : i->path_min;
+ return i == edges.end() ? nodes[node].score : std::max(nodes[node].score, i->prefix_score); // never less than the node's own score
+ }
+
+ Diagonal_node& operator[](size_t k) // Mutable access to node k (unchecked index into nodes).
+ {
+ return nodes[k];
+ }
+
+ const Diagonal_node& operator[](size_t k) const // Read-only access to node k (unchecked index into nodes).
+ {
+ return nodes[k];
+ }
+
+ void print(sequence query, sequence subject) const;
+ size_t top_node() const;
+
+ vector<Diagonal_node> nodes;
+ vector<Edge> edges;
+};
+
+int needleman_wunsch(sequence query, sequence subject, int qbegin, int qend, int sbegin, int send, unsigned node, unsigned edge, Diag_graph &diags, bool log);
+
+struct Band // Byte matrix of diags_ * cols_ elements held in one contiguous vector.
+{
+ void init(int diags, int cols) // Resizes to diags*cols zero-initialized bytes, discarding any previous contents.
+ {
+ diags_ = diags;
+ cols_ = cols;
+ data_.clear(); // clear first so that resize value-initializes every element
+ data_.resize((size_t)diags*cols);
+ }
+ struct Iterator { // Strided accessor: consecutive indices are diags_ bytes apart.
+ Iterator(uint8_t *p, int diags) :
+ diags_(diags),
+ p_(p)
+ {}
+ uint8_t& operator[](int i)
+ {
+ return p_[i*diags_];
+ }
+ private:
+ const int diags_;
+ uint8_t *p_;
+ };
+ Iterator diag(int o) // Accessor starting at flat offset o of the buffer; indexing it steps by diags_.
+ {
+ return Iterator(&data_[o], diags_);
+ }
+ int cols() const
+ {
+ return cols_;
+ }
+ int diags() const
+ {
+ return diags_;
+ }
+ uint8_t* data() // Raw pointer to the first byte of the buffer.
+ {
+ return data_.data();
+ }
+ bool check(uint8_t *ptr) const // True if ptr lies within the buffer; the one-past-the-end address is accepted as well (<=).
+ {
+ return ptr >= data_.data() && ptr <= data_.data() + data_.size();
+ }
+private:
+ int diags_, cols_;
+ vector<uint8_t> data_;
};
-int needleman_wunsch(sequence query, sequence subject, int qbegin, int qend, int sbegin, int send, unsigned node, unsigned edge, vector<Diagonal_node> &diags, bool log);
+struct Diag_scores { // Buffers and coordinates used by get_diag / scan_diags, which are only declared here.
+ enum {
+ block_len = 16
+ };
+ int dj0(int d) const // max(-d, 0); presumably the first subject position on diagonal d -- confirm against scan_diags
+ {
+ return std::max(-d, 0);
+ }
+ int dj1(int d) const // min(qlen - d, slen); presumably the end subject position on diagonal d -- confirm against scan_diags
+ {
+ return std::min(qlen - d, slen);
+ }
+ void get_diag(int i, int j, int o, int j_begin, int j_end, vector<Diagonal_node> &diags, int cutoff, bool log, const Bias_correction &query_bc);
+ void scan_diags(int d_begin, int d_end, sequence query, sequence subject, const Long_score_profile &qp, const Bias_correction &query_bc, bool log, vector<Diagonal_node> &diags, bool fast);
+ Band score_buf, local_max;
+ vector<uint8_t> sv_max;
+ vector<bool> active;
+ int i_begin, j_begin, d_begin, d_end, qlen, slen;
+ bool fast;
+ static int min_diag_score, min_low_score; // static members: need a definition in a source file
+};
+
+void smith_waterman(sequence q, sequence s, Hsp_data &out);
+void smith_waterman(sequence q, sequence s, const Diag_graph &diags);
+int score_range(sequence query, sequence subject, int i, int j, int j_end);
+
+void swipe(const sequence &query, vector<sequence>::const_iterator subject_begin, vector<sequence>::const_iterator subject_end, vector<int>::iterator out);
+void banded_sw(const sequence &query, const sequence &subject, int d_begin, int d_end, int j_begin, int j_end, Hsp_data &out);
#endif /* FLOATING_SW_H_ */
diff --git a/src/dp/dp_matrix.h b/src/dp/dp_matrix.h
index 6f1f760..b661f21 100644
--- a/src/dp/dp_matrix.h
+++ b/src/dp/dp_matrix.h
@@ -1,21 +1,19 @@
/****
-Copyright (c) 2014, University of Tuebingen
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-****
-Author: Benjamin Buchfink
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef DP_MATRIX_H_
diff --git a/src/dp/floating_sw.cpp b/src/dp/floating_sw.cpp
index 00f5a1a..8c3bd3d 100644
--- a/src/dp/floating_sw.cpp
+++ b/src/dp/floating_sw.cpp
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#include <limits>
@@ -66,7 +66,7 @@ Hsp_data get_traceback(const Letter *query,
}
template<typename _dir, typename _score, typename _traceback, typename _score_correction>
-Hsp_data floating_sw_dir(const Letter *query, const Letter* subject, int band, _score xdrop, _score gap_open, _score gap_extend, uint64_t &cell_updates, const _score_correction &score_correction, int query_anchor)
+Hsp_data floating_sw_dir(const Letter *query, const Letter* subject, int band, _score xdrop, _score gap_open, _score gap_extend, uint64_t &cell_updates, const _score_correction &score_correction, int query_anchor, int min_j)
{
using std::max;
@@ -74,8 +74,7 @@ Hsp_data floating_sw_dir(const Letter *query, const Letter* subject, int band, _
int j = 0, i_max = -1, j_best = -1, i_best = -1;
Scalar_dp_matrix<_score, _traceback> mtx(band);
const Letter *x = query, *y = subject;
-
- while (*y != '\xff' && max_score - column_max < xdrop) {
+ while (*y != '\xff' && (max_score - column_max < xdrop || j < min_j)) {
typename Scalar_dp_matrix<_score, _traceback>::Column_iterator it = mtx.column(j, i_max);
if (get_dir(x, it.row(), _dir()) == '\xff')
break;
@@ -118,13 +117,13 @@ Hsp_data floating_sw_dir(const Letter *query, const Letter* subject, int band, _
}
template<typename _score, typename _traceback, typename _score_correction>
-void floating_sw(const Letter *query, const Letter *subject, Hsp_data &segment, int band, _score xdrop, _score gap_open, _score gap_extend, uint64_t &cell_updates, unsigned query_anchor, unsigned subject_anchor, const _score_correction &score_correction, const _traceback&, const _score&)
+void floating_sw(const Letter *query, const Letter *subject, Hsp_data &segment, int band, _score xdrop, _score gap_open, _score gap_extend, uint64_t &cell_updates, unsigned query_anchor, unsigned subject_anchor, int min_j, const _score_correction &score_correction, const _traceback&, const _score&) // Extends right and left from the anchor with floating_sw_dir and merges both results into segment; min_j is new in this revision.
{
- segment.merge(floating_sw_dir<Right, _score, _traceback, _score_correction>(query + 1, subject + 1, band, xdrop, gap_open, gap_extend, cell_updates, score_correction, query_anchor + 1),
- floating_sw_dir<Left, _score, _traceback, _score_correction>(query, subject, band, xdrop, gap_open, gap_extend, cell_updates, score_correction, query_anchor), query_anchor, subject_anchor);
+ segment.merge(floating_sw_dir<Right, _score, _traceback, _score_correction>(query + 1, subject + 1, band, xdrop, gap_open, gap_extend, cell_updates, score_correction, query_anchor + 1, min_j-subject_anchor), // right extension starts one past the anchor; min_j is passed relative to subject_anchor
+ floating_sw_dir<Left, _score, _traceback, _score_correction>(query, subject, band, xdrop, gap_open, gap_extend, cell_updates, score_correction, query_anchor,0), query_anchor, subject_anchor); // left extension gets a min_j of 0
}
-template void floating_sw<int, Traceback, No_score_correction>(const Letter *query, const Letter *subject, Hsp_data &segment, int band, int xdrop, int gap_open, int gap_extend, uint64_t &cell_updates, unsigned query_anchor, unsigned subject_anchor, const No_score_correction&, const Traceback&, const int&);
-template void floating_sw<int, Score_only, No_score_correction>(const Letter *query, const Letter *subject, Hsp_data &segment, int band, int xdrop, int gap_open, int gap_extend, uint64_t &cell_updates, unsigned query_anchor, unsigned subject_anchor, const No_score_correction&, const Score_only&, const int&);
-template void floating_sw<float, Traceback, Bias_correction>(const Letter *query, const Letter *subject, Hsp_data &segment, int band, float xdrop, float gap_open, float gap_extend, uint64_t &cell_updates, unsigned query_anchor, unsigned subject_anchor, const Bias_correction&, const Traceback&, const float&);
-template void floating_sw<float, Score_only, Bias_correction>(const Letter *query, const Letter *subject, Hsp_data &segment, int band, float xdrop, float gap_open, float gap_extend, uint64_t &cell_updates, unsigned query_anchor, unsigned subject_anchor, const Bias_correction&, const Score_only&, const float&);
\ No newline at end of file
+template void floating_sw<int, Traceback, No_score_correction>(const Letter *query, const Letter *subject, Hsp_data &segment, int band, int xdrop, int gap_open, int gap_extend, uint64_t &cell_updates, unsigned query_anchor, unsigned subject_anchor, int min_j, const No_score_correction&, const Traceback&, const int&);
+template void floating_sw<int, Score_only, No_score_correction>(const Letter *query, const Letter *subject, Hsp_data &segment, int band, int xdrop, int gap_open, int gap_extend, uint64_t &cell_updates, unsigned query_anchor, unsigned subject_anchor, int min_j, const No_score_correction&, const Score_only&, const int&);
+template void floating_sw<float, Traceback, Bias_correction>(const Letter *query, const Letter *subject, Hsp_data &segment, int band, float xdrop, float gap_open, float gap_extend, uint64_t &cell_updates, unsigned query_anchor, unsigned subject_anchor, int min_j, const Bias_correction&, const Traceback&, const float&);
+template void floating_sw<float, Score_only, Bias_correction>(const Letter *query, const Letter *subject, Hsp_data &segment, int band, float xdrop, float gap_open, float gap_extend, uint64_t &cell_updates, unsigned query_anchor, unsigned subject_anchor, int min_j, const Bias_correction&, const Score_only&, const float&);
\ No newline at end of file
diff --git a/src/dp/floating_sw.h b/src/dp/floating_sw.h
index 566edbf..b6ec120 100644
--- a/src/dp/floating_sw.h
+++ b/src/dp/floating_sw.h
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2016, University of Tuebingen, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef FLOATING_SW_H_
@@ -36,6 +36,7 @@ void floating_sw(const Letter *query,
uint64_t &cell_updates,
unsigned query_anchor,
unsigned subject_anchor,
+ int min_j,
const _score_correction &score_correction,
const _traceback& = Score_only(),
const _score& = int());
diff --git a/src/dp/greedy_align.cpp b/src/dp/greedy_align.cpp
index 5e2de2d..8f7acea 100644
--- a/src/dp/greedy_align.cpp
+++ b/src/dp/greedy_align.cpp
@@ -1,29 +1,108 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
// #define _ITERATOR_DEBUG_LEVEL 0
+#include <map>
+#include <list>
+#include <set>
#include "../basic/sequence.h"
#include "../basic/match.h"
#include "../basic/score_matrix.h"
#include "../search/sse_dist.h"
#include "../align/extend_ungapped.h"
#include "../dp/score_profile.h"
+#include "../output/output_format.h"
+
+using std::map;
+using std::list;
+using std::set;
+
+bool disjoint(list<Hsp_traits>::const_iterator begin, list<Hsp_traits>::const_iterator end, const Hsp_traits &t, int cutoff) // True if, for every HSP in [begin, end), t keeps a partial_score of at least cutoff and is collinear with it.
+{
+ for (; begin != end; ++begin)
+ if (begin->partial_score(t) < cutoff || !begin->collinear(t)) // a single conflicting HSP is enough to reject t
+ //if (!begin->disjoint(t) || !begin->collinear(t))
+ //if (!begin->rel_disjoint(t))
+ return false;
+ return true;
+}
+
+bool disjoint(list<Hsp_traits>::const_iterator begin, list<Hsp_traits>::const_iterator end, const Diagonal_segment &d, int cutoff) // True if, for every HSP in [begin, end), d keeps a partial_score of at least cutoff and is collinear with it.
+{
+ for (; begin != end; ++begin)
+ if (begin->partial_score(d) < cutoff || !begin->collinear(d)) // a single conflicting HSP is enough to reject d
+ //if (!begin->disjoint(d) || !begin->collinear(d))
+ //if (!begin->rel_disjoint(d))
+ return false;
+ return true;
+}
+
+void Diag_graph::clear_edges() // Removes all edges and calls deactivate() on every node; the nodes themselves are kept.
+{
+ edges.clear();
+ for (vector<Diagonal_node>::iterator i = nodes.begin(); i < nodes.end(); ++i)
+ i->deactivate();
+}
+
+void Diag_graph::load(vector<Seed_hit>::const_iterator begin, vector<Seed_hit>::const_iterator end) // Appends the ungapped segments of the hits in [begin, end) as nodes, skipping hits that start at or before the furthest subject end already loaded on the same diagonal.
+{
+ int d = std::numeric_limits<int>::min(), max_j_end = d; // d: diagonal of the preceding hit; max_j_end: largest subject_end() loaded on that diagonal
+ for (vector<Seed_hit>::const_iterator i = begin; i < end; ++i) { // presumably the hits arrive grouped by diagonal -- confirm with the caller
+ const int d2 = i->diagonal();
+ if (d2 != d) { // diagonal changed: always take the hit and restart the tracking
+ d = d2;
+ nodes.push_back(i->ungapped);
+ max_j_end = nodes.back().subject_end();
+ }
+ else if (max_j_end < i->ungapped.j) { // same diagonal: take the hit only if it starts beyond max_j_end
+ nodes.push_back(i->ungapped);
+ max_j_end = std::max(max_j_end, nodes.back().subject_end());
+ }
+ }
+}
+
+void Diag_graph::print(sequence query, sequence subject) const // Debug dump to cout: one summary line per node followed by its query and subject stretches.
+{
+ for (int k = 0; k < (int)nodes.size(); ++k) {
+ const Diagonal_segment &d = nodes[k]; // only the Diagonal_segment part of the node is printed
+ cout << "Diag n=" << k << " i=" << d.i << " j=" << d.j << " d=" << d.diag() << " score=" << d.score << " len=" << d.len << endl;
+ cout << sequence(query, d.i, d.query_last()) << endl;
+ cout << sequence(subject, d.j, d.subject_last()) << endl;
+ }
+}
+
+size_t Diag_graph::top_node() const // Index of the node with the highest prefix_score, or the `end` enumerator if no node has a prefix_score above 0.
+{
+ int top_score = 0, score;
+ size_t top_node = end;
+ for (size_t k = 0; k < nodes.size(); ++k)
+ if ((score = nodes[k].prefix_score) > top_score) { // strict >: on ties the lowest index wins
+ top_node = k;
+ top_score = score;
+ }
+ return top_node;
+}
+
+void Diag_graph::sort() // Sorts the nodes with the Diagonal_segment::cmp_subject comparator.
+{
+ std::sort(nodes.begin(), nodes.end(), Diagonal_segment::cmp_subject);
+}
struct Link
{
@@ -51,44 +130,6 @@ struct Link
}
};
-Diagonal_segment score_diagonal(const Letter *query, const Letter *subject, int qbegin, int jbegin)
-{
- int i = 0, j = 0, max_score = 0, score = 0, begin = 0, end = 0;
- while (query[i] != '\xff' && subject[i] != '\xff') {
- score += score_matrix(query[i], subject[i]);
- if (score <= 0) {
- score = 0;
- j = i + 1;
- }
- if (score > max_score) {
- max_score = score;
- begin = j;
- end = i + 1;
- }
- ++i;
- }
- return Diagonal_segment(qbegin + begin, jbegin + begin, end - begin, max_score);
-}
-
-Diagonal_segment score_diagonal(const Letter *query, const Letter *subject, int n, int qbegin, int jbegin)
-{
- int i = 0, j = 0, max_score = 0, score = 0, begin = 0, end = 0;
- while (i < n) {
- score += score_matrix(query[i], subject[i]);
- if (score <= 0) {
- score = 0;
- j = i + 1;
- }
- if (score > max_score) {
- max_score = score;
- begin = j;
- end = i + 1;
- }
- ++i;
- }
- return Diagonal_segment(qbegin + begin, jbegin + begin, end - begin, max_score);
-}
-
int score_range(sequence query, sequence subject, int i, int j, int j_end)
{
int score = 0;
@@ -100,10 +141,10 @@ int score_range(sequence query, sequence subject, int i, int j, int j_end)
return score;
}
-int get_hgap_link(const Diagonal_segment &d1, const Diagonal_segment &d2, sequence query, sequence subject, Link &l)
+int get_hgap_link(const Diagonal_segment &d1, const Diagonal_segment &d2, sequence query, sequence subject, Link &l, int padding)
{
const int d = d1.diag() - d2.diag(),
- j2_end = std::min(std::max((int)d2.j, d1.subject_last() + d + 1), d2.subject_last());
+ j2_end = std::min(std::max((int)d2.j, d1.subject_last() + d + 1 + padding), d2.subject_last());
int j1;
bool space;
if (d1.subject_last() < d2.j - d - 1) {
@@ -111,12 +152,13 @@ int get_hgap_link(const Diagonal_segment &d1, const Diagonal_segment &d2, sequen
space = true;
}
else {
- j1 = std::max(d2.j - d - 1, d1.j);
+ j1 = std::max(d2.j - d - 1 - padding, d1.j);
space = false;
}
int j2 = j1 + d + 1,
i1 = d1.i + (j1 - d1.j),
i2 = i1 + 1;
+ //cout << "j2=" << j2 << " d2.subject_last=" << d2.subject_last() << endl;
if (j2 > d2.subject_last()) {
l.reset();
return std::numeric_limits<int>::min();
@@ -126,6 +168,7 @@ int get_hgap_link(const Diagonal_segment &d1, const Diagonal_segment &d2, sequen
score2 = score_range(query, subject, i2, j2, d2.j) + d2.score - score_range(query, subject, d2.i, d2.j, j2);
int max_score = std::numeric_limits<int>::min();
while (true) {
+ //cout << "i1=" << i1 << " j1=" << j1 << " i2=" << i2 << " j2=" << j2 << " score1=" << score1 << " score2=" << score2 << " total=" << score1 + score2 << endl;
if (score1 + score2 > max_score) {
max_score = score1 + score2;
l.query_pos1 = i1;
@@ -145,445 +188,440 @@ int get_hgap_link(const Diagonal_segment &d1, const Diagonal_segment &d2, sequen
if (space)
l.score1 = d1.score + l.score1;
else
- l.score1 = d1.score - score_range(query, subject, d1.diag() + j1_end, j1_end, d1.subject_end()) - score1 + l.score1;
+ l.score1 = d1.score - score_range(query, subject, d1.diag() + j1_end, j1_end, d1.subject_end()) + score_range(query, subject, d1.query_end(), d1.subject_end(), j1_end) - score1 + l.score1;
return max_score;
}
-int get_vgap_link(const Diagonal_segment &d1, const Diagonal_segment &d2, sequence query, sequence subject, Link &l)
+int get_vgap_link(const Diagonal_segment &d1, const Diagonal_segment &d2, sequence query, sequence subject, Link &l, int padding)
{
- int s = get_hgap_link(d1.transpose(), d2.transpose(), subject, query, l);
+ int s = get_hgap_link(d1.transpose(), d2.transpose(), subject, query, l, padding);
l.transpose();
return s;
}
-int get_link(const Diagonal_segment &d1, const Diagonal_segment &d2, sequence query, sequence subject, Link &l)
+int get_link(const Diagonal_segment &d1, const Diagonal_segment &d2, sequence query, sequence subject, Link &l, int padding)
{
if (d1.diag() < d2.diag())
- return get_vgap_link(d1, d2, query, subject, l);
+ return get_vgap_link(d1, d2, query, subject, l, padding);
else
- return get_hgap_link(d1, d2, query, subject, l);
-}
-
-void set_global_max(score_vector<uint8_t> *max, score_vector<uint8_t> *global_max, uint8_t *&local_max)
-{
-#ifdef __SSE2__
- global_max[0].max(max[0]);
- max[0].store(local_max);
- max[0] = score_vector<uint8_t>();
- local_max += 16;
- global_max[1].max(max[1]);
- max[1].store(local_max);
- max[1] = score_vector<uint8_t>();
- local_max += 16;
- global_max[2].max(max[2]);
- max[2].store(local_max);
- max[2] = score_vector<uint8_t>();
- local_max += 16;
- global_max[3].max(max[3]);
- max[3].store(local_max);
- max[3] = score_vector<uint8_t>();
- local_max += 16;
-#endif
-}
-
-void scan_cols(const Long_score_profile &qp, sequence s, int i, int j, int j_end, uint8_t *sv_max, bool log, uint8_t *buf, uint8_t *local_max, int block_len)
-{
-#ifdef __SSE2__
- typedef score_vector<uint8_t> Sv;
- const Sv vbias(score_matrix.bias());
- Sv v[4], max[4], global_max[4];
- int n = 0;
- for (; j < j_end; ++j) {
- const uint8_t *q = qp.get(s[j], i);
- v[0] = v[0] + score_vector<uint8_t>(q);
- v[0] -= vbias;
- max[0].max(v[0]);
- _mm_storeu_si128((__m128i*)buf, v[0].data_);
- q += 16;
- buf += 16;
- v[1] = v[1] + score_vector<uint8_t>(q);
- v[1] -= vbias;
- max[1].max(v[1]);
- _mm_storeu_si128((__m128i*)buf, v[1].data_);
- q += 16;
- buf += 16;
- v[2] = v[2] + score_vector<uint8_t>(q);
- v[2] -= vbias;
- max[2].max(v[2]);
- _mm_storeu_si128((__m128i*)buf, v[2].data_);
- q += 16;
- buf += 16;
- v[3] = v[3] + score_vector<uint8_t>(q);
- v[3] -= vbias;
- max[3].max(v[3]);
- _mm_storeu_si128((__m128i*)buf, v[3].data_);
- buf += 16;
- //cout << 's' << v[0] << v[1] << v[2] << v[3] << endl;
- if ((n & 15) == 15) {
- //cout << 'l' << max[0] << max[1] << max[2] << max[3] << endl;
- set_global_max(max, global_max, local_max);
- }
- ++i;
- ++n;
- }
- if(n % block_len != 0)
- set_global_max(max, global_max, local_max);
- global_max[0].store(sv_max);
- global_max[1].store(sv_max + 16);
- global_max[2].store(sv_max + 32);
- global_max[3].store(sv_max + 48);
- //cout << 'g' << global_max[0] << global_max[1] << global_max[2] << global_max[3] << endl;
-#endif
+ return get_hgap_link(d1, d2, query, subject, l, padding);
}
struct Greedy_aligner2
{
- enum { band = 64, block_len = 16 };
-
- struct Node_ref
- {
- Node_ref():
- node(0),
- score(0)
- {}
- Node_ref(unsigned node, int score) :
- node(node),
- score(score)
- {}
- unsigned node;
- int score;
- bool operator<(const Node_ref &rhs) const
- {
- return score > rhs.score;
- }
- };
-
- int correct_left(unsigned node, int j)
- {
- const Diagonal_segment &d = diags[node];
- const int shift = j - d.j;
- return shift < 0 ? score_range(query, subject, d.i + shift, j, d.j) : -score_range(query, subject, d.i, d.j, j);
- }
-
- int correct_right(unsigned node, int j)
- {
- const Diagonal_segment &d = diags[node];
- const int end = d.subject_end(), shift = j - end;
- return shift < 0 ? -score_range(query, subject, d.query_end() + shift, j, d.subject_end()) : score_range(query, subject, d.query_end(), d.subject_end(), j);
- }
+ enum { link_padding = 10, reverse_link_min_overhang = 10 };
- void get_diag(int i, int j, int o, int begin, int end, int max_score)
+ int get_approximate_link(int d_idx, int e_idx, double space_penalty, int max_i)
{
- const uint8_t *p = score_buf.data() + o + begin*band,
- *p_end = p + block_len*band,
- *b0 = p - band,
- *b = b0;
- for (; p < p_end; p += band) {
- if (*p == 0)
- b = p;
- if (*p == max_score)
- break;
+ Diagonal_node &d = diags[d_idx];
+ Diagonal_node &e = diags[e_idx];
+ const int shift = d.diag() - e.diag();
+ int gap_score = shift != 0 ? -score_matrix.gap_open() - abs(shift)*score_matrix.gap_extend() : 0;
+ const int space = shift > 0 ? d.j - e.subject_last() : d.i - e.query_last();
+ int prefix_score = 0, link_score = 0, link_j, diff1 = 0, path_max, path_min, prefix_score_begin;
+ if (space <= 0) {
+ vector<Diag_graph::Edge>::const_iterator edge = diags.get_edge(d_idx, d.j);
+ if (edge != diags.edges.end() && edge->prefix_score > e.prefix_score + gap_score + d.score)
+ return 0;
+ /*if (d.prefix_score > e.prefix_score + gap_score + d.score)
+ return 0;*/
+ Link link;
+ if (get_link(e, d, query, subject, link, link_padding) > 0) {
+ diff1 = e.score - link.score1;
+ const int prefix_e = diags.prefix_score(e_idx, link.subject_pos1, path_max, path_min);
+ prefix_score = prefix_e - diff1 + gap_score + link.score2;
+ vector<Diag_graph::Edge>::const_iterator edge = diags.get_edge(d_idx, link.subject_pos2);
+ if (edge != diags.edges.end() && edge->prefix_score > prefix_score)
+ return 0;
+ prefix_score_begin = prefix_score - link.score2;
+ path_min = std::min(path_min, prefix_score - link.score2);
+ if (prefix_e == path_max) {
+ path_max -= diff1;
+ }
+ link_score = link.score1 + link.score2 + gap_score;
+ link_j = link.subject_pos2;
+ /*if (log)
+ cout << "Link score1=" << link.score1 << " score2=" << link.score2 << " j1=" << link.subject_pos1 << " j2=" << link.subject_pos2 << endl;*/
+ }
}
- if (b == b0) {
- b0 = score_buf.data() + o;
- for (; b >= b0; b -= band)
- if (*b == 0)
- break;
+ else {
+ prefix_score = e.prefix_score + gap_score - int(space_penalty*std::max(space - 1, 0)) + d.score;
+ vector<Diag_graph::Edge>::const_iterator edge = diags.get_edge(d_idx, d.j);
+ if (edge != diags.edges.end() && edge->prefix_score > prefix_score)
+ return 0;
+ prefix_score_begin = prefix_score - d.score;
+ path_max = e.path_max;
+ path_min = e.path_min;
+ path_min = std::min(path_min, prefix_score - d.score);
+ link_score = e.score + d.score + gap_score;
+ link_j = d.j;
}
-
- p = score_buf.data() + o + end*band;
- p_end = p + block_len*band;
- for (; p < p_end; p += band) {
- if (*p == max_score)
- break;
+
+ if (prefix_score > d.score) {
+ path_max = std::max(path_max, prefix_score);
+ diags.add_edge(Diag_graph::Edge(prefix_score, path_max, link_j, d_idx, e_idx, prefix_score == path_max ? prefix_score : path_min, prefix_score_begin));
+ if (log)
+ cout << "Link n=" << e_idx << " d=" << e.diag() << " i_end=" << e.query_end() << " max_i=" << max_i << " shift=" << shift << " space=" << space << " prefix_score=" << prefix_score << " link_score=" << link_score << " path_min="<<path_min<<endl;
}
-
- const int b2 = int(b - (score_buf.data() + o)) / band + 1;
- diags.push_back(Diagonal_segment(i + b2, j + b2, int(p - b) / band, max_score));
+ return prefix_score;
}
- void get_diag(int i, int j, int o)
+ template<typename _it>
+ void forward_pass(_it begin, _it end, bool init, double space_penalty)
{
- const uint8_t *p = local_max.data() + o, *p_end = local_max.data() + local_max.size();
- int n = 0, begin = -1, best = -1, best_score = -1, second_best_score = -1, second_best = -1, second_best_begin = -1;
- for (; p < p_end; p += band) {
- if (*p >= config.min_diag_raw_score) {
- if (begin == -1) {
- begin = n;
- best = n;
- best_score = *p;
- }
- else if (*p > best_score) {
- best = n;
- best_score = *p;
- }
- else if (*p > *(p - band)) {
- if (second_best_begin == -1) {
- second_best_begin = n;
- second_best = n;
- second_best_score = *p;
+ window.clear();
+
+ for (_it it = begin; it != end; ++it) {
+
+ unsigned node = (unsigned)(*it);
+ if(init) diags.init(node);
+ Diagonal_node& d = diags[node];
+ const int dd = d.diag();
+ if (log) cout << "Node " << node << " Score=" << d.score << endl;
+ map<int, unsigned>::iterator i = window.find(dd), j;
+ if (i == window.end())
+ i = window.insert(std::make_pair(dd, node)).first;
+
+ j = i;
+ int max_j = 0;
+ if (i == window.begin())
+ goto weiter;
+ do {
+ --j;
+ Diagonal_node &e = diags[j->second];
+ const int de = j->first, shift = dd - de;
+ //if (d.j - e.subject_end() > max_dist) {
+ if (e.prefix_score - int(space_penalty*(std::max(d.j - e.subject_end(), 0))) <= 0) {
+ map<int, unsigned>::iterator k = j;
+ if (j == window.begin()) {
+ window.erase(j);
+ break;
}
else {
- if (*p > second_best_score) {
- second_best_score = *p;
- second_best = n;
- }
+ ++k;
+ window.erase(j);
+ j = k;
+ continue;
}
}
- }
- else if (begin >= 0) {
- get_diag(i, j, o, begin * 16, best * 16, best_score);
- if (second_best_begin >= 0) {
- second_best_begin -= 1;
- const Diagonal_segment second = score_diagonal(&query[i + second_best_begin * 16],
- &subject[j + second_best_begin * 16],
- (second_best - second_best_begin + 1) * 16,
- i + second_best_begin * 16,
- j + second_best_begin * 16);
- if (second.score >= config.min_diag_raw_score)
- diags.push_back(second);
- second_best_begin = -1;
+ if (e.subject_end() < max_j)
+ continue;
+ get_approximate_link(node, j->second, space_penalty, max_j);
+ max_j = std::max(max_j, std::min(d.j, e.subject_end()));
+ if (e.subject_end() - (d.subject_end() - std::min(e.diag() - d.diag(), 0)) >= reverse_link_min_overhang) {
+ if (log)
+ cout << "Computing reverse link node=" << j->second << endl;
+ get_approximate_link(j->second, node, space_penalty, max_j);
}
- begin = -1;
- best = -1;
- best_score = -1;
- }
- ++n;
- }
- if (begin >= 0) {
- get_diag(i, j, o, begin * 16, best * 16, best_score);
- if (second_best_begin >= 0) {
- second_best_begin -= 1;
- const Diagonal_segment second = score_diagonal(&query[i + second_best_begin * 16],
- &subject[j + second_best_begin * 16],
- (second_best - second_best_begin + 1) * 16,
- i + second_best_begin * 16,
- j + second_best_begin * 16);
- if (second.score >= config.min_diag_raw_score)
- diags.push_back(second);
+ } while (j != window.begin());
+
+ weiter:
+ j = i;
+ if (j->second == node)
+ ++j;
+ int max_i = 0;
+ while (j != window.end()) {
+ Diagonal_node &e = diags[j->second];
+ const int de = j->first, shift = dd - de;
+ //if (d.j - e.subject_end() > max_dist && j != i) {
+ if (e.prefix_score - int(space_penalty*(std::max(d.j - e.subject_end(),0))) <= 0 && j != i) {
+ map<int, unsigned>::iterator k = j;
+ ++k;
+ window.erase(j);
+ j = k;
+ continue;
+ }
+ if (e.query_end() < max_i) {
+ ++j;
+ continue;
+ }
+ //if (get_approximate_link(node, j->second, space_penalty, max_i) > e.prefix_score)
+ get_approximate_link(node, j->second, space_penalty, max_i);
+ if (e.i < d.i)
+ max_i = std::max(max_i, std::min(e.query_end(), d.i));
+ if (e.subject_end() - (d.subject_end() - std::min(e.diag() - d.diag(), 0)) >= reverse_link_min_overhang) {
+ if (log)
+ cout << "Computing reverse link node=" << j->second << endl;
+ get_approximate_link(j->second, node, space_penalty, max_i);
+ }
+ ++j;
}
+ i->second = node;
+
+ if (log)
+ cout << "Prefix_score=" << d.prefix_score << " path_max=" << d.path_max << " path_min=" << d.path_min << endl << endl;
}
}
-
- void scan_diags(const Diagonal_segment &diag)
+
+ bool backtrace(size_t node, int j_end, Hsp_data *out, Hsp_traits &t, int score_max, int score_min, int max_shift, unsigned &next) const
{
- const int d = diag.diag() - band / 2,
- d1 = d + band - 1,
- i = std::max(0, d1) - band + 1,
- j = i - d,
- j1 = std::min((int)query.length() - d, (int)subject.length());
- uint8_t sv_max[band];
- memset(sv_max, 0, band);
- const size_t cells = band * (j1 - j);
- score_buf.resize(cells);
- local_max.resize((j1 - j + block_len - 1) / block_len * band);
- scan_cols(qp, subject, i, j, j1, sv_max, log, score_buf.data(), local_max.data(), block_len);
- for (int o = 0; o < band; ++o)
- if (sv_max[o] >= config.min_diag_raw_score) {
- //get_diag(i + o, j, sv_max[o], o);
- get_diag(i + o, j, o);
+ const Diagonal_node &d = diags[node];
+ vector<Diag_graph::Edge>::const_iterator f = diags.get_edge(node, j_end);
+ bool at_end = f >= diags.edges.end();
+ const int prefix_score = at_end ? d.score : f->prefix_score;
+ if (prefix_score > score_max)
+ return false;
+
+ int j;
+ score_min = std::min(score_min, at_end ? 0 : f->prefix_score_begin);
+
+ //if (f != diags.edges.end() && (!stop_at_min || f->path_min == diags[f->node_out].path_min)) {
+ if (!at_end) {
+ const Diagonal_node &e = diags[f->node_out];
+ const int shift = d.diag() - e.diag();
+ j = f->j;
+
+ if (abs(shift) <= max_shift) {
+ const bool bt = backtrace(f->node_out, shift > 0 ? j : j + shift, out, t, score_max, score_min, max_shift, next);
+ if (!bt) {
+ if (f->prefix_score_begin > score_min)
+ return false;
+ else
+ at_end = true;
+ }
}
- }
+ else {
+ next = f->node_out;
+ at_end = true;
+ }
+ }
- int follow_path(unsigned level, unsigned node, int score, int subject_pos)
- {
- static const int max_dist = 32, min_nw_space = 7;
- static const float space_penalty = -0.5;
- Diagonal_node& d = diags[node];
- if (log) cout << "Node " << node << " Score=" << d.edges[0].prefix_score << endl;
- if (d.diff != std::numeric_limits<int>::min()) {
- if (log)
- cout << "Visited node final_score=" << d.edges[0].prefix_score << endl;
- return d.diff;
+ if (at_end) {
+ if (out) {
+ out->query_range.begin_ = d.i;
+ out->subject_range.begin_ = d.j;
+ out->score = score_max - score_min;
+ }
+ t.query_range.begin_ = d.i;
+ t.subject_range.begin_ = d.j;
+ t.score = score_max - score_min;
+ j = d.j;
}
- int max_score = d.edges[0].prefix_score, diff = 0, max_edge = 0;
- for (unsigned k = 0; k < Diagonal_node::n_path; ++k) {
- Diagonal_node::Edge &f = d.edges[k];
- if (f.prefix_score == 0)
- break;
- if (f.node == node)
- continue;
- unsigned next = f.node;
- const Diagonal_node &e = diags[next];
- if (!f.exact) {
- const int shift = d.diag() - e.diag();
- int gap_score = -config.gap_open - abs(shift)*config.gap_extend;
- const int space = shift > 0 ? d.j - e.subject_last() : d.i - e.query_last();
- f.prefix_score += int(config.raw_space_penalty*std::max(space - 1, 0));
- if (space >= min_nw_space && abs(shift) > 1) {
- if (log) {
- const sequence q1 = sequence(query, e.query_last() + 1, d.i - 1),
- s1 = sequence(subject, e.subject_last() + 1, d.j - 1);
- cout << q1 << endl << s1 << endl;
+ else {
+ const Diagonal_node &e = diags[f->node_out];
+ const int shift = d.diag() - e.diag();
+ if (out) {
+ if (shift > 0) {
+ out->transcript.push_back(op_insertion, (unsigned)shift);
+ out->length += shift;
+ }
+ else if (shift < 0) {
+ for (int j2 = j + shift; j2 < j; ++j2) {
+ out->transcript.push_back(op_deletion, subject[j2]);
+ ++out->length;
}
- gap_score = needleman_wunsch(query, subject, e.query_last() + 1, d.i, e.subject_last() + 1, d.j, node, k, diags, log);
- f.prefix_score -= -config.gap_open - abs(shift)*config.gap_extend;
- f.prefix_score += gap_score;
}
- else {
- Link l;
- int link_score = get_link(e, d, query, subject, l);
- f.prefix_score += (l.score1 - e.score) + (l.score2 - d.score);
- f.j = l.subject_pos2;
+ }
+ }
+
+ const int dd = d.diag();
+ t.d_max = std::max(t.d_max, dd);
+ t.d_min = std::min(t.d_min, dd);
+
+ if (out) {
+ const int d2 = d.diag();
+ if (log) cout << "Backtrace node=" << node << " i=" << d2 + j << "-" << d2 + j_end << " j=" << j << "-" << j_end << endl;
+ for (; j < j_end; ++j) {
+ const Letter s = subject[j], q = query[d2 + j];
+ if (s == q) {
+ out->transcript.push_back(op_match);
+ ++out->identities;
}
+ else
+ out->transcript.push_back(op_substitution, s);
+ ++out->length;
}
- if (log)
- cout << "Node=" << node << " Link n=" << next << " prefix_score=" << f.prefix_score << endl;
- f.prefix_score += follow_path(level + 1, next, 0, 0);
- if (diff < f.prefix_score - max_score) {
- diff = f.prefix_score - max_score;
- max_edge = k;
+ }
+ return true;
+ }
+
+ void backtrace(size_t top_node, Hsp_data *out, Hsp_traits &t, int max_shift, unsigned &next, int max_j) const
+ {
+ Hsp_traits traits(frame);
+ if (top_node != Diag_graph::end) {
+ const Diagonal_node &d = diags[top_node];
+ if (out) {
+ out->transcript.clear();
+ out->query_range.end_ = d.query_end();
+ out->subject_range.end_ = d.subject_end();
}
+ traits.subject_range.end_ = d.subject_end();
+ traits.query_range.end_ = d.query_end();
+ int score_min = d.prefix_score;
+ backtrace(top_node, std::min(d.subject_end(), max_j), out, traits, d.prefix_score, score_min, max_shift, next);
}
- if (max_edge != 0) {
- memcpy(&d.edges[0], &d.edges[max_edge], sizeof(Diagonal_node::Edge));
+ else {
+ traits.score = 0;
+ if (out)
+ out->score = 0;
}
- d.diff = diff;
- top_node = std::min(top_node, Node_ref(node, d.edges[0].prefix_score));
- if(log) cout << "Node " << node << " diff=" << diff << " final_score=" << d.edges[0].prefix_score << endl;
- return diff;
+ if (out)
+ out->transcript.push_terminator();
+ t = traits;
}
- unsigned follow_path_approximate()
+ int backtrace(size_t top_node, list<Hsp_data> &hsps, list<Hsp_traits> &ts, list<Hsp_traits>::iterator &t_begin, int cutoff, int max_shift) const
+ {
+ unsigned next;
+ int max_score = 0, max_j = (int)subject.length();
+ do {
+ Hsp_data *hsp = log ? new Hsp_data : 0;
+ Hsp_traits t(frame);
+ next = std::numeric_limits<unsigned>::max();
+ backtrace(top_node, hsp, t, max_shift, next, max_j);
+ if (t.score > 0)
+ max_j = t.subject_range.begin_;
+ if (t.score >= cutoff && disjoint(t_begin, ts.end(), t, cutoff)) {
+ if (t_begin == ts.end()) {
+ ts.push_back(t);
+ t_begin = ts.end();
+ t_begin--;
+ } else
+ ts.push_back(t);
+ if (hsp)
+ hsps.push_back(*hsp);
+ max_score = std::max(max_score, t.score);
+ }
+ delete hsp;
+ top_node = next;
+ } while (next != std::numeric_limits<unsigned>::max());
+ return max_score;
+ }
+
+ int backtrace(list<Hsp_data> &hsps, list<Hsp_traits> &ts, int cutoff, int max_shift) const
{
- static const int max_dist = 32;
- static const float space_penalty = -0.5;
+ vector<Diagonal_node*> top_nodes;
+ for (size_t i = 0; i < diags.nodes.size(); ++i) {
+ Diagonal_node &d = diags.nodes[i];
+ //cout << "node=" << i << " prefix_score=" << d.prefix_score << " path_max=" << d.path_max << " rel_score=" << d.rel_score() << " cutoff=" << cutoff << endl;
+ //if (d.prefix_score >= cutoff && (d.prefix_score == d.path_max || d.prefix_score - d.path_min >= cutoff))
+ if(d.rel_score() >= cutoff)
+ top_nodes.push_back(&d);
+ }
+ std::sort(top_nodes.begin(), top_nodes.end(), Diagonal_node::cmp_rel_score);
int max_score = 0;
- unsigned max_node;
- for (unsigned node = 0; node < diags.size(); ++node) {
- Diagonal_node& d = diags[node];
- if (log) cout << "Node " << node << " Score=" << d.score << endl;
- d.edges.add(Diagonal_node::Edge(d.score, d.j, node, true));
- for (int k = node - 1; k >= 0; --k) {
- const Diagonal_node &e = diags[k];
- if (d.j - e.subject_last() < max_dist) {
- if (abs(d.i - e.query_last()) >= max_dist)
- continue;
- const int shift = d.diag() - e.diag();
- int gap_score = -config.gap_open - abs(shift)*config.gap_extend;
- const int space = shift > 0 ? d.j - e.subject_last() : d.i - e.query_last();
- int prefix_score = 0, link_score = 0, link_j;
- bool exact;
- if (space <= 0) {
- Link link;
- if (get_link(e, d, query, subject, link) > 0) {
- prefix_score = e.edges[0].prefix_score - (e.score - link.score1) + gap_score + link.score2;
- link_score = link.score1 + link.score2 + gap_score;
- exact = true;
- link_j = link.subject_pos2;
- }
- }
- else {
- prefix_score = e.edges[0].prefix_score + gap_score - int(config.raw_space_penalty*std::max(space - 1, 0)) + d.score;
- link_score = e.score + d.score + gap_score;
- exact = false;
- link_j = d.j;
- }
+ list<Hsp_traits>::iterator t_begin = ts.end();
- if (log)
- cout << "Link n=" << k << " shift=" << shift << " space=" << space << " prefix_score=" << prefix_score << " link_score=" << link_score << endl;
- if (prefix_score > 0)
- d.edges.add(Diagonal_node::Edge(prefix_score, link_j, k, exact));
- }
- else
- break;
- }
- if (log) {
- cout << "Final score=" << d.edges[0].prefix_score << endl << endl;
- }
- top_nodes.push_back(Node_ref(node, d.edges[0].prefix_score));
- if (d.edges[0].prefix_score > max_score) {
- max_score = d.edges[0].prefix_score;
- max_node = node;
+ for (vector<Diagonal_node*>::const_iterator i = top_nodes.begin(); i < top_nodes.end(); ++i) {
+ const size_t node = *i - diags.nodes.data();
+ if (log)
+ cout << "Backtrace candidate node=" << node << endl;
+ if (disjoint(t_begin, ts.end(), **i, cutoff)) {
+ if (log)
+ cout << "Backtrace node=" << node << " prefix_score=" << (*i)->prefix_score << " rel_score=" << (*i)->rel_score() << endl;
+ max_score = std::max(max_score, backtrace(node, hsps, ts, t_begin, cutoff, max_shift));
+ if (log)
+ cout << endl;
}
}
- return max_node;
+ return max_score;
}
- void backtrace(unsigned node, int j_end, Hsp_data &out)
+ int run(list<Hsp_data> &hsps, list<Hsp_traits> &ts, double space_penalty, int cutoff, int max_shift)
{
- const Diagonal_node &d = diags[node];
- const Diagonal_node::Edge &f = d.edges[0];
- const Diagonal_node &e = diags[f.node];
- const int shift = d.diag() - e.diag();
- int j = f.j;
- if (f.node != node) {
- backtrace(f.node, shift > 0 ? j : j + shift, out);
- if (shift > 0) {
- out.transcript.push_back(op_insertion, (unsigned)shift);
- }
- else {
- for (int j2 = j + shift; j2 < j; ++j2)
- out.transcript.push_back(op_deletion, subject[j2]);
- }
+ diags.sort();
+ if (log) {
+ diags.print(query, subject);
+ cout << endl << endl;
}
- else {
- out.query_range.begin_ = d.i;
- out.subject_range.begin_ = d.j;
+
+ forward_pass(Index_iterator(0llu), Index_iterator(diags.nodes.size()), true, space_penalty);
+ int max_score = backtrace(hsps, ts, cutoff, max_shift);
+
+ if (log) {
+ hsps.sort(Hsp_data::cmp_query_pos);
+ for (list<Hsp_data>::iterator i = hsps.begin(); i != hsps.end(); ++i)
+ print_hsp(*i, query);
+ cout << endl << "Smith-Waterman:" << endl;
+ smith_waterman(query, subject, diags);
+ cout << endl << endl;
}
- const int d2 = d.diag();
- for (; j < j_end; ++j) {
- const Letter s = subject[j], q = query[d2 + j];
- if (s == q)
- out.transcript.push_back(op_match);
+ return max_score;
+ }
+
+ int run(list<Hsp_data> &hsps, list<Hsp_traits>::const_iterator t_begin, list<Hsp_traits>::const_iterator t_end, list<Hsp_traits> &ts, int band, int cutoff)
+ {
+ if (t_end == t_begin)
+ return 0;
+ if(log)
+ cout << "***** Scan run n_hsp=" << 0 << " cutoff=" << cutoff << endl;
+ diags.init();
+ list<Hsp_traits>::const_iterator i = t_begin;
+ const int ql = (int)query.length();
+ int d_begin = std::max(i->d_min - band, -((int)subject.length() - 1)),
+ d_end = d_begin + make_multiple(std::min(i->d_max + band, ql) - d_begin, 16);
+ ++i;
+ for (; i != t_end; ++i) {
+ if (i->d_min - band >= d_end) {
+ if (log)
+ cout << "Scan " << d_begin << '\t' << d_end << '\t' << d_end - d_begin << endl;
+ diag_scores.scan_diags(d_begin, d_end, query, subject, qp, query_bc, log, diags.nodes, true);
+ d_begin = i->d_min - band;
+ d_end = d_begin + make_multiple(std::min(i->d_max + band, ql) - d_begin, 16);
+ }
else
- out.transcript.push_back(op_substitution, s);
+ d_end = std::max(d_end, d_begin + make_multiple(std::min(i->d_max + band, ql) - d_begin, 16));
}
+ if (log)
+ cout << "Scan " << d_begin << '\t' << d_end << '\t' << d_end - d_begin << endl;
+ diag_scores.scan_diags(d_begin, d_end, query, subject, qp, query_bc, log, diags.nodes, true);
+ if (log)
+ cout << endl;
+
+ return run(hsps, ts, config.space_penalty, cutoff, 999);
}
- Greedy_aligner2(const sequence &query, const Long_score_profile &qp, const sequence &subject, const vector<Diagonal_segment> &sh, bool log, Hsp_data &out) :
+ int run(list<Hsp_data> &hsps, list<Hsp_traits> &ts, vector<Seed_hit>::const_iterator begin, vector<Seed_hit>::const_iterator end, int band)
+ {
+ if (log)
+ cout << "***** Seed hit run " << begin->diagonal() << '\t' << (end - 1)->diagonal() << '\t' << (end - 1)->diagonal() - begin->diagonal() << endl;
+ diags.init();
+ diags.load(begin, end);
+ return run(hsps, ts, 0.1, 19, band);
+ }
+
+ Greedy_aligner2(const sequence &query, const Long_score_profile &qp, const Bias_correction &query_bc, const sequence &subject, bool log, unsigned frame) :
query(query),
subject(subject),
qp(qp),
+ query_bc(query_bc),
log(log),
- score_buf(TLS::get(score_buf_ptr)),
- local_max(TLS::get(local_max_ptr)),
+ frame(frame),
+ diag_scores(TLS::get(diag_scores_ptr)),
diags(TLS::get(diags_ptr)),
- top_nodes(TLS::get(top_nodes_ptr))
+ window(TLS::get(window_ptr))
{
- config.min_diag_raw_score = 15;
- diags.clear();
- local_max.clear();
- top_nodes.clear();
- scan_diags(sh[0]);
- std::sort(diags.begin(), diags.end(), Diagonal_segment::cmp_subject_end);
- if (log)
- for (int k = 0; k < (int)diags.size(); ++k) {
- const Diagonal_segment &d = diags[k];
- cout << "Diag n=" << k << " i=" << d.i << " j=" << d.j << " score=" << d.score << " len=" << d.len << endl;
- cout << sequence(query, d.i, d.query_last()) << endl;
- cout << sequence(subject, d.j, d.subject_last()) << endl;
- }
- if(log) cout << endl;
- unsigned max_node = follow_path_approximate();
- follow_path(0, max_node, 0, 0);
- out.transcript.clear();
- out.length = 1;
- backtrace(top_node.node, diags[top_node.node].subject_end(), out);
- out.transcript.push_terminator();
}
- static TLS_PTR vector<uint8_t> *score_buf_ptr;
- static TLS_PTR vector<uint8_t> *local_max_ptr;
- static TLS_PTR vector<Diagonal_node> *diags_ptr;
- static TLS_PTR vector<Node_ref> *top_nodes_ptr;
+ static TLS_PTR Diag_scores *diag_scores_ptr;
+ static TLS_PTR Diag_graph *diags_ptr;
+ static TLS_PTR map<int, unsigned> *window_ptr;
const sequence query, subject;
const Long_score_profile &qp;
+ const Bias_correction &query_bc;
const bool log;
- vector<uint8_t> &score_buf, &local_max;
- vector<Diagonal_node> &diags;
- vector<Node_ref> &top_nodes;
- Node_ref top_node;
+ const unsigned frame;
+ Diag_scores &diag_scores;
+ Diag_graph &diags;
+ map<int, unsigned> &window;
};
-TLS_PTR vector<uint8_t> *Greedy_aligner2::score_buf_ptr;
-TLS_PTR vector<uint8_t> *Greedy_aligner2::local_max_ptr;
-TLS_PTR vector<Diagonal_node> *Greedy_aligner2::diags_ptr;
-TLS_PTR vector<Greedy_aligner2::Node_ref> *Greedy_aligner2::top_nodes_ptr;
+TLS_PTR Diag_scores *Greedy_aligner2::diag_scores_ptr;
+TLS_PTR Diag_graph *Greedy_aligner2::diags_ptr;
+TLS_PTR map<int, unsigned> *Greedy_aligner2::window_ptr;
+
+int greedy_align(sequence query, const Long_score_profile &qp, const Bias_correction &query_bc, sequence subject, vector<Seed_hit>::const_iterator begin, vector<Seed_hit>::const_iterator end, bool log, list<Hsp_data> &hsps, list<Hsp_traits> &ts, unsigned frame)
+{
+ const int band = config.padding == 0 ? std::min(64, int(query.length()*0.5)) : config.padding;
+ Greedy_aligner2 ga(query, qp, query_bc, subject, log, frame);
+ return ga.run(hsps, ts, begin, end, band);
+}
-void greedy_align2(sequence query, const Long_score_profile &qp, sequence subject, const vector<Diagonal_segment> &sh, bool log, Hsp_data &out)
+int greedy_align(sequence query, const Long_score_profile &qp, const Bias_correction &query_bc, sequence subject, bool log, list<Hsp_data> &hsps, list<Hsp_traits>::const_iterator t_begin, list<Hsp_traits>::const_iterator t_end, list<Hsp_traits> &ts, int cutoff, unsigned frame)
{
- Greedy_aligner2(query, qp, subject, sh, log, out);
+ const int band = config.padding == 0 ? std::min(64, int(query.length()*0.5)) : config.padding;
+ Greedy_aligner2 ga(query, qp, query_bc, subject, log, frame);
+ return ga.run(hsps, t_begin, t_end, ts, band, cutoff);
}
\ No newline at end of file
diff --git a/src/dp/growing_buffer.h b/src/dp/growing_buffer.h
index f405dcb..9460ffc 100644
--- a/src/dp/growing_buffer.h
+++ b/src/dp/growing_buffer.h
@@ -1,20 +1,19 @@
/****
-Copyright (c) 2014, University of Tuebingen
-Author: Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef GROWING_BUFFER_H_
diff --git a/src/dp/needleman_wunsch.cpp b/src/dp/needleman_wunsch.cpp
index 1fd6e0f..79a0a17 100644
--- a/src/dp/needleman_wunsch.cpp
+++ b/src/dp/needleman_wunsch.cpp
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#include <vector>
@@ -22,11 +22,35 @@ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR P
#include "growing_buffer.h"
#include "../util/util.h"
#include "traceback.h"
+#include "../output/output_format.h"
using std::vector;
using std::pair;
-template<typename _score>
+template<typename _score, typename _mode> // Generic case: the score is passed through unchanged.
+_score saturate(_score x)
+{
+ return x;
+}
+
+template<> // Specialization for int scores in Local mode.
+int saturate<int, Local>(int x)
+{
+ return std::max(x, 0); // negative scores are raised to 0
+}
+
+template<typename _score,typename _mode> // Generic case: a no-op, max_score is not modified.
+void set_max_score(_score s, _score &max_score)
+{
+}
+
+template<> // Specialization for int scores in Local mode: keeps a running maximum.
+void set_max_score<int,Local>(int s, int &max_score)
+{
+ max_score = std::max(max_score, s); // max_score only ever grows
+}
+
+template<typename _score, typename _mode>
struct Dp_matrix
{
@@ -39,7 +63,7 @@ struct Dp_matrix
end_(score_.second + query_len + 1),
i_(0)
{
- *score_.first = col == 0 ? 0 : -config.gap_open - col*config.gap_extend;
+ *score_.first = saturate<_score, _mode>(col == 0 ? 0 : -score_matrix.gap_open() - col*score_matrix.gap_extend());
++score_.second;
}
@@ -96,11 +120,11 @@ struct Dp_matrix
{
score_.init(query_len + 1, subject_len + 1, 0);
hgap_.clear();
- hgap_.insert(hgap_.end(), query_len, std::numeric_limits<int>::min() + 1);
+ hgap_.insert(hgap_.end(), query_len, std::numeric_limits<int>::min() + score_matrix.gap_extend());
int *score = score_.last();
- int g = -config.gap_open - config.gap_extend;
+ int g = -score_matrix.gap_open() - score_matrix.gap_extend();
for (int i = 1; i <= query_len; ++i)
- score[i] = g--;
+ score[i] = saturate<_score, _mode>(g--);
}
const Fixed_score_buffer<_score>& score_buffer() const
@@ -118,42 +142,49 @@ private:
};
-template<typename _score> TLS_PTR Fixed_score_buffer<_score>* Dp_matrix<_score>::score_ptr;
-template<typename _score> TLS_PTR vector<_score>* Dp_matrix<_score>::hgap_ptr;
+template<typename _score, typename _mode> TLS_PTR Fixed_score_buffer<_score>* Dp_matrix<_score,_mode>::score_ptr;
+template<typename _score, typename _mode> TLS_PTR vector<_score>* Dp_matrix<_score,_mode>::hgap_ptr;
-template<typename _score>
-const Fixed_score_buffer<_score>& needleman_wunsch(sequence query, sequence subject, const _score& = int())
+template<typename _score, typename _mode> // Fills the DP matrix for query vs. subject; _mode selects the saturate()/set_max_score() behavior.
+const Fixed_score_buffer<_score>& needleman_wunsch(sequence query, sequence subject, int &max_score, const _mode&, const _score&) // max_score is an output, written once at the end
{
using std::max;
- const int gap_open = config.gap_open + config.gap_extend, gap_extend = config.gap_extend;
+ const int gap_open = score_matrix.gap_open() + score_matrix.gap_extend(), gap_extend = score_matrix.gap_extend(); // local gap_open = open + one extension
+ int m = 0; // running maximum; stays 0 unless set_max_score is specialized for (_score, _mode)
- Dp_matrix<_score> mtx((unsigned)query.length(), (unsigned)subject.length());
+ Dp_matrix<_score, _mode> mtx((unsigned)query.length(), (unsigned)subject.length());
for (int j = 0; j < (int)subject.length(); ++j) {
- typename Dp_matrix<_score>::Column_iterator it = mtx.column(j);
- _score vgap = std::numeric_limits<int>::min() + 1;
+ typename Dp_matrix<_score,_mode>::Column_iterator it = mtx.column(j);
+ _score vgap = std::numeric_limits<int>::min() + gap_extend; // headroom of gap_extend so the first "vgap - gap_extend" below stays >= min()
for (; it.valid(); ++it) {
const _score match_score = score_matrix(subject[j], query[it.row()]);
- const _score s = max(max(it.diag() + match_score, vgap), it.hgap());
+ const _score s = saturate<_score, _mode>(max(max(it.diag() + match_score, vgap), it.hgap())); // best of diagonal, vertical gap, horizontal gap, then mode-dependent saturation
const _score open = s - gap_open;
vgap = max(vgap - gap_extend, open);
it.hgap() = max(it.hgap() - gap_extend, open);
it.score() = s;
+ set_max_score<_score, _mode>(s, m); // no-op in the generic case
}
}
+ max_score = m; // report the tracked maximum to the caller
return mtx.score_buffer();
}
-int needleman_wunsch(sequence query, sequence subject, int qbegin, int qend, int sbegin, int send, unsigned node, unsigned edge, vector<Diagonal_node> &diags, bool log)
+int needleman_wunsch(sequence query, sequence subject, int qbegin, int qend, int sbegin, int send, unsigned node, unsigned edge, Diag_graph &diags, bool log)
{
const sequence q = query.subseq(qbegin, qend), s = subject.subseq(sbegin, send);
- const Fixed_score_buffer<int> &dp = needleman_wunsch(q, s, int());
+ int max_score;
+ const Fixed_score_buffer<int> &dp = needleman_wunsch(q, s, max_score, Global(), int());
Diagonal_node *d = &diags[node];
- unsigned start_node = d->edges[edge].node;
- Diagonal_node::Edge *f = &d->edges[edge];
+ unsigned start_node = diags.edges[edge].node_out;
+ vector<Diag_graph::Edge>::iterator f = diags.edges.begin() + edge;
- const int gap_open = config.gap_open, gap_extend = config.gap_extend;
+ /*if (log)
+ cout << dp << endl;*/
+
+ const int gap_open = score_matrix.gap_open(), gap_extend = score_matrix.gap_extend();
int l, i = qend - qbegin, j = send - sbegin;
const int score = dp(i, j);
@@ -161,8 +192,7 @@ int needleman_wunsch(sequence query, sequence subject, int qbegin, int qend, int
if (l > 0) {
i -= l;
j -= l;
- f->exact = true;
- f->j = j;
+ f->j = sbegin + j;
}
while (i > 0 && j > 0) {
@@ -170,11 +200,9 @@ int needleman_wunsch(sequence query, sequence subject, int qbegin, int qend, int
i -= l;
j -= l;
if (i != 0 || j != 0) {
- f->node = (unsigned)diags.size();
- diags.push_back(Diagonal_node(qbegin + i, sbegin + j, l, 0));
- f = &diags.back().edges[0];
- f->exact = true;
- f->j = sbegin + j;
+ f->node_out = (unsigned)diags.nodes.size();
+ diags.nodes.push_back(Diagonal_node(qbegin + i, sbegin + j, l, 0, (int)diags.edges.size()));
+ //f = diags.add_edge(Diag_graph::Edge(0, sbegin + j, f->node_out, 0, true, Diagonal_node::finished, 0, 0));
}
}
else if (have_hgap(dp, i, j, gap_open, gap_extend, l)) {
@@ -187,6 +215,111 @@ int needleman_wunsch(sequence query, sequence subject, int qbegin, int qend, int
throw std::runtime_error("Traceback error.");
}
- f->node = start_node;
+ f->node_out = start_node;
return score;
+}
+
+void smith_waterman(sequence q, sequence s, Hsp_data &out) // Local alignment of q against s: fills the DP matrix, then traces back from the best-scoring cell into out.
+{
+ int max_score;
+ const Fixed_score_buffer<int> &dp = needleman_wunsch(q, s, max_score, Local(), int()); // Local mode: cell scores are clamped at 0 and the maximum is tracked
+ pair<int, int> max_pos = dp.find(max_score); // presumably the (query, subject) position of a cell holding max_score - confirm against Fixed_score_buffer::find
+
+ const int gap_open = score_matrix.gap_open(), gap_extend = score_matrix.gap_extend();
+ int l, i = max_pos.first, j = max_pos.second, score; // i indexes q, j indexes s (both 1-based into dp)
+ out.score = dp(i, j);
+ out.query_range.end_ = i;
+ out.subject_range.end_ = j;
+
+ while ((score = dp(i, j)) > 0) { // walk backwards until a cell with score <= 0 is reached
+ const int match_score = score_matrix(q[i - 1], s[j - 1]);
+ if (score == match_score + dp(i - 1, j - 1)) { // cell is explained by the diagonal predecessor
+ if (q[i - 1] == s[j - 1]) {
+ out.transcript.push_back(op_match);
+ }
+ else {
+ out.transcript.push_back(op_substitution, s[j - 1]); // substitution records the subject letter
+ }
+ --i;
+ --j;
+ ++out.length;
+ }
+ else if (have_hgap(dp, i, j, gap_open, gap_extend, l)) { // l is set by have_hgap
+ for (; l > 0; l--) {
+ out.transcript.push_back(op_deletion, s[--j]); // one op per skipped subject letter
+ ++out.length;
+ }
+ }
+ else if (have_vgap(dp, i, j, gap_open, gap_extend, l)) { // l is set by have_vgap
+ out.transcript.push_back(op_insertion, (unsigned)l); // a single op carrying the gap length
+ out.length += l;
+ i -= l;
+ }
+ else
+ throw std::runtime_error("Traceback error."); // none of the three predecessors explains this cell
+ }
+
+ out.query_range.begin_ = i;
+ out.subject_range.begin_ = j;
+ out.transcript.reverse(); // ops were collected from alignment end to start
+ out.transcript.push_terminator();
+}
+
+void print_diag(int i0, int j0, int l, int score, const Diag_graph &diags, const sequence &query, const sequence &subject) // Debug output to cout: one line per scored node of diags that intersects the segment (i0, j0, l), or one fallback line if none does.
+{
+ Diagonal_segment ds(i0, j0, l, 0); // the segment being reported
+ unsigned n = 0; // number of intersecting nodes printed so far
+ int path_max,path_min; // written by prefix_score(); not read here
+ for (vector<Diagonal_node>::const_iterator d = diags.nodes.begin(); d != diags.nodes.end(); ++d) {
+ if (d->intersect(ds).len > 0) {
+ if (d->score == 0) // nodes with score 0 are skipped
+ continue;
+ const int diff = score_range(query, subject, d->query_end(), d->subject_end(), j0 + l);
+ if (n > 0) // second and later hits are wrapped in parentheses
+ cout << "(";
+ cout << "Diag n=" << d - diags.nodes.begin() << " i=" << i0 << " j=" << j0 << " len=" << l // n is the node index; i, j, len are those of the segment
+ << " prefix_score=" << score + score_range(query, subject, i0 + l, j0 + l, d->subject_end()) - std::min(diff, 0)
+ << " prefix_score2=" << diags.prefix_score((unsigned)(d - diags.nodes.begin()), j0 + l, path_max,path_min);
+ if (n > 0)
+ cout << ")";
+ cout << endl;
+ ++n;
+ }
+ }
+ if(n == 0) // no node matched: print the segment with n=x
+ cout << "Diag n=x i=" << i0 << " j=" << j0 << " len=" << l << " prefix_score=" << score << endl;
+}
+
+void smith_waterman(sequence q, sequence s, const Diag_graph &diags) // Debug helper: computes the local alignment of q and s, prints each ungapped run via print_diag(), then the HSP via print_hsp().
+{
+ Hsp_data hsp;
+ smith_waterman(q, s, hsp);
+ Hsp_data::Iterator i = hsp.begin();
+ int i0 = -1, j0 = -1, l = 0, score = 0; // i0/j0 < 0 means no ungapped run is currently open
+ for (; i.good(); ++i) {
+ switch (i.op()) {
+ case op_match:
+ case op_substitution:
+ if (i0 < 0) { // open a new ungapped run at the current position
+ i0 = i.query_pos;
+ j0 = i.subject_pos;
+ l = 0;
+ }
+ score += score_matrix(q[i.query_pos], s[i.subject_pos]); // running alignment score
+ ++l;
+ break;
+ case op_deletion:
+ case op_insertion:
+ if (i0 >= 0) { // first gap op after a run: report the run, then charge open + extend
+ print_diag(i0, j0, l, score, diags, q, s);
+ score -= score_matrix.gap_open() + score_matrix.gap_extend();
+ i0 = -1;
+ j0 = -1;
+ }
+ else // further gap ops only pay the extension
+ score -= score_matrix.gap_extend();
+ }
+ }
+ print_diag(i0, j0, l, score, diags, q, s); // reports the trailing run; NOTE(review): also reached with i0 == -1 when no run is open - confirm intended
+ print_hsp(hsp, q);
}
\ No newline at end of file
diff --git a/src/dp/padded_banded_sw.cpp b/src/dp/padded_banded_sw.cpp
index 3e01ebc..9645caa 100644
--- a/src/dp/padded_banded_sw.cpp
+++ b/src/dp/padded_banded_sw.cpp
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#include "dp.h"
diff --git a/src/dp/scalar_dp_matrix.h b/src/dp/scalar_dp_matrix.h
index 788176d..92d82ab 100644
--- a/src/dp/scalar_dp_matrix.h
+++ b/src/dp/scalar_dp_matrix.h
@@ -1,20 +1,19 @@
/****
-Copyright (c) 2014, University of Tuebingen
-Author: Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef SCALAR_DP_MATRIX_H_
diff --git a/src/dp/scalar_traceback.h b/src/dp/scalar_traceback.h
index 5fc940f..423006c 100644
--- a/src/dp/scalar_traceback.h
+++ b/src/dp/scalar_traceback.h
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2014-2016, University of Tuebingen, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef SCALAR_TRACEBACK_H_
@@ -115,7 +115,6 @@ local_match traceback(const Letter *query,
return local_match (0);
Scalar_traceback_matrix<_score> dp (scores, band);
//dp.print(i, j);
-
local_match l;
l.query_range.begin_ = 0;
l.query_range.end_ = j + 1;
diff --git a/src/dp/score_profile.h b/src/dp/score_profile.h
index 1ea8794..818f312 100644
--- a/src/dp/score_profile.h
+++ b/src/dp/score_profile.h
@@ -1,21 +1,19 @@
/****
-Copyright (c) 2014, University of Tuebingen
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-****
-Author: Benjamin Buchfink
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef SCORE_PROFILE_H_
@@ -134,6 +132,8 @@ struct score_profile
struct Long_score_profile
{
+ Long_score_profile()
+ {}
Long_score_profile(sequence seq)
{
for (unsigned l = 0; l < 25; ++l) {
@@ -154,7 +154,7 @@ struct Long_score_profile
return &data[(int)l][i + padding];
}
vector<uint8_t> data[25];
- enum { padding = 256 };
+ enum { padding = 32 };
};
#endif /* SCORE_PROFILE_H_ */
diff --git a/src/dp/score_vector.h b/src/dp/score_vector.h
index f8c8d0c..ab028eb 100644
--- a/src/dp/score_vector.h
+++ b/src/dp/score_vector.h
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2014-2016, University of Tuebingen, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef SCORE_VECTOR_H_
@@ -63,14 +63,11 @@ struct score_vector<uint8_t>
explicit score_vector(unsigned a, const __m128i &seq)
{
- if(config.have_ssse3) {
#ifdef __SSSE3__
- set_ssse3(a, seq);
+ set_ssse3(a, seq); // now selected purely at compile time; the runtime config.have_ssse3 test is removed
#else
- set_generic(a, seq);
+ set_generic(a, seq); // used whenever __SSSE3__ is not defined
#endif
- } else
- set_generic(a, seq);
}
void set_ssse3(unsigned a, const __m128i &seq)
@@ -89,7 +86,7 @@ struct score_vector<uint8_t>
data_ = _mm_or_si128(s1, s2);
#endif
}
-
+
void set_generic(unsigned a, const __m128i &seq)
{
const uint8_t* row (&score_matrix.matrix8u()[a<<5]);
diff --git a/src/dp/smith_waterman.cpp b/src/dp/smith_waterman.cpp
index af19b99..6682090 100644
--- a/src/dp/smith_waterman.cpp
+++ b/src/dp/smith_waterman.cpp
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2016, University of Tuebingen, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#include <vector>
diff --git a/src/dp/smith_waterman.h b/src/dp/smith_waterman.h
index 157c7b1..604231c 100644
--- a/src/dp/smith_waterman.h
+++ b/src/dp/smith_waterman.h
@@ -1,21 +1,19 @@
/****
-Copyright (c) 2014, University of Tuebingen
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-****
-Author: Benjamin Buchfink
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef SSE_SW_H_
diff --git a/src/dp/swipe.cpp b/src/dp/swipe.cpp
new file mode 100644
index 0000000..d09b2ee
--- /dev/null
+++ b/src/dp/swipe.cpp
@@ -0,0 +1,249 @@
+/****
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+****/
+
+#include <vector>
+#include "dp.h"
+#include "score_vector.h"
+
+// #define SW_ENABLE_DEBUG
+
+using std::vector;
+using std::pair;
+
+#ifdef __SSE2__
+
+template<typename _score> // Holds one score vector per alphabet letter for the current vector of target letters.
+struct Swipe_profile
+{
+ inline void set(const __m128i &seq) // rebuilds data_[j] from (j, seq) for every alphabet letter j
+ {
+ assert(sizeof(data_) / sizeof(score_vector<_score>) >= value_traits.alphabet_size); // data_ must have a slot for every letter
+ for (unsigned j = 0; j < value_traits.alphabet_size; ++j)
+ data_[j] = score_vector<_score>(j, seq);
+ }
+ inline const score_vector<_score>& get(Letter i) const // score vector for letter i; i is not range-checked
+ {
+ return data_[(int)i];
+ }
+ score_vector<_score> data_[25]; // fixed capacity of 25 entries
+};
+
+template<typename _score> // DP state used by swipe(): `rows` horizontal-gap vectors and `rows + 1` score vectors.
+struct Swipe_matrix
+{
+ typedef score_vector<_score> sv;
+ struct Column_iterator // steps through the hgap and score buffers in lockstep
+ {
+ Column_iterator(sv* hgap_front, sv* score_front) :
+ hgap_ptr_(hgap_front),
+ score_ptr_(score_front)
+ { }
+ inline void operator++()
+ {
+ ++hgap_ptr_; ++score_ptr_;
+ }
+ inline sv hgap() const
+ {
+ return *hgap_ptr_;
+ }
+ inline sv diag() const // reads the same slot that set_score() writes; swipe() reads it before overwriting
+ {
+ return *score_ptr_;
+ }
+ inline void set_hgap(const sv& x)
+ {
+ *hgap_ptr_ = x;
+ }
+ inline void set_score(const sv& x)
+ {
+ *score_ptr_ = x;
+ }
+ sv *hgap_ptr_, *score_ptr_;
+ };
+ Swipe_matrix(int rows): // binds to the buffers returned by TLS::get and resets them for `rows` query rows
+ hgap_(TLS::get(hgap_ptr)),
+ score_(TLS::get(score_ptr))
+ {
+ hgap_.clear();
+ hgap_.resize(rows);
+ score_.clear();
+ score_.resize(rows + 1); // one extra slot beyond the last row
+ memset(hgap_.data(), 0, rows * sizeof(sv)); // all bytes zeroed
+ memset(score_.data(), 0, (rows + 1) * sizeof(sv));
+ }
+ inline Column_iterator begin() // iterator at the first hgap/score slot; takes &hgap_[0], so rows is expected to be > 0
+ {
+ return Column_iterator(&hgap_[0], &score_[0]);
+ }
+ void set_zero(int c) // calls set(c, 0) on every entry - presumably zeroes channel c only; confirm against score_vector::set
+ {
+ const int l = (int)hgap_.size();
+ for (int i = 0; i < l; ++i) {
+ hgap_[i].set(c, 0);
+ score_[i].set(c, 0);
+ }
+ score_[l].set(c, 0); // the extra score slot
+ }
+private:
+ vector<sv> &hgap_, &score_;
+ static TLS_PTR vector<sv> *hgap_ptr, *score_ptr; // handles passed to TLS::get in the constructor
+};
+
+template<typename _score> TLS_PTR vector<score_vector<_score> >* Swipe_matrix<_score>::hgap_ptr; // out-of-class definition of the static member
+template<typename _score> TLS_PTR vector<score_vector<_score> >* Swipe_matrix<_score>::score_ptr; // out-of-class definition of the static member
+
+template<typename _score> // One affine-gap DP cell update on a whole score vector; returns the new cell score.
+inline score_vector<_score> cell_update(const score_vector<_score> &diagonal_cell,
+ const score_vector<_score> &scores,
+ const score_vector<_score> &gap_extension,
+ const score_vector<_score> &gap_open,
+ score_vector<_score> &horizontal_gap, // in/out: updated for the next column
+ score_vector<_score> &vertical_gap, // in/out: updated for the next row
+ score_vector<_score> &best, // in/out: running maximum of all cell scores
+ const score_vector<_score> &vbias)
+{
+ score_vector<_score> current_cell = diagonal_cell + scores;
+ current_cell -= vbias; // NOTE(review): presumably removes a bias included in `scores` - confirm against score_matrix.bias()
+ current_cell.max(vertical_gap).max(horizontal_gap); // best of diagonal, vertical gap, horizontal gap
+ best.max(current_cell);
+ vertical_gap -= gap_extension; // extend the existing gaps ...
+ horizontal_gap -= gap_extension;
+ const score_vector<_score> open = current_cell - gap_open; // ... or open a new one from this cell
+ vertical_gap.max(open);
+ horizontal_gap.max(open);
+ return current_cell;
+}
+
+template<int _n> // Feeds up to _n target sequences in parallel, one per channel, and refills a channel when its target ends.
+struct Target_iterator
+{
+ Target_iterator(vector<sequence>::const_iterator subject_begin, vector<sequence>::const_iterator subject_end):
+ next(0),
+ n_targets(int(subject_end-subject_begin)),
+ subject_begin(subject_begin)
+ {
+ for (; next < std::min(_n, n_targets); ++next) { // the first min(_n, n_targets) targets occupy channels 0, 1, ...
+ pos[next] = 0;
+ target[next] = next;
+ active.push_back(next);
+ }
+ }
+ char operator[](int i) const // current letter of the target assigned to channel i
+ {
+ return subject_begin[target[i]][pos[i]];
+ }
+ __m128i get() const // packs the current letter of every active channel into one vector
+ {
+ char s[16] = {}; // zero-filled so that channels not in `active` never expose indeterminate bytes to the load below
+ for (int i = 0; i < active.size(); ++i) {
+ const int j = active[i];
+ s[j] = (*this)[j];
+ }
+ return _mm_loadu_si128((const __m128i*)s);
+ }
+ bool init_target(int i, int j) // gives channel j the next unassigned target (true), or erases entry i from `active` when none is left (false)
+ {
+ if (next < n_targets) {
+ pos[j] = 0;
+ target[j] = next++;
+ return true;
+ }
+ active.erase(i);
+ return false;
+ }
+ bool inc(int i) // advances channel i by one letter; returns false once the end of its target is reached
+ {
+ ++pos[i];
+ if (pos[i] >= (int)subject_begin[target[i]].length())
+ return false;
+ return true;
+ }
+ int pos[_n], target[_n], next, n_targets; // per-channel letter position and target index; next = index of the first target not yet assigned
+ Static_vector<int, _n> active; // channels that currently hold an unfinished target
+ const vector<sequence>::const_iterator subject_begin;
+};
+
+template<typename _score> // Scores query against every subject in [subject_begin, subject_end), several targets at once (one per channel); writes each target's best score to out[target index].
+void swipe(const sequence &query, vector<sequence>::const_iterator subject_begin, vector<sequence>::const_iterator subject_end, vector<int>::iterator out)
+{
+#ifdef SW_ENABLE_DEBUG
+ static int v[1024][1024]; // debug-only score dump of channel 0; indices are not range-checked
+#endif
+
+ typedef score_vector<_score> sv;
+
+ const int qlen = (int)query.length();
+ Swipe_matrix<_score> dp(qlen);
+
+ const sv open_penalty(static_cast<char>(score_matrix.gap_open() + score_matrix.gap_extend())), // open + one extension
+ extend_penalty(static_cast<char>(score_matrix.gap_extend())),
+ vbias(score_matrix.bias());
+ sv best; // running maximum per channel, updated inside cell_update()
+ Swipe_profile<_score> profile;
+ Target_iterator<score_traits<_score>::channels> targets(subject_begin, subject_end);
+
+ while (targets.active.size() > 0) { // each iteration processes one target letter per active channel
+ typename Swipe_matrix<_score>::Column_iterator it(dp.begin());
+ sv vgap, hgap, last; // default-constructed for every column
+ profile.set(targets.get()); // score vectors for the current letter of each channel
+ for (int i = 0; i < qlen; ++i) {
+ hgap = it.hgap();
+ const sv next = cell_update<_score>(it.diag(), profile.get(query[i]), extend_penalty, open_penalty, hgap, vgap, best, vbias);
+ it.set_hgap(hgap);
+ it.set_score(last); // the slot just read via diag() now receives the previous row's new score
+ last = next;
+#ifdef SW_ENABLE_DEBUG
+ v[targets.pos[0]][i] = next[0];
+#endif
+ ++it;
+ }
+ it.set_score(last); // the last row's score goes into the extra (qlen + 1)-th slot
+
+ for (int i = 0; i < targets.active.size();) { // i is advanced only when entry i is kept
+ int j = targets.active[i];
+ if (!targets.inc(j)) { // the target in channel j is finished
+ out[targets.target[j]] = best[j];
+ if (targets.init_target(i, j)) { // channel j got a new target: reset its DP state and best score
+ dp.set_zero(j);
+ best.set(j, 0);
+ }
+ else // init_target() erased entry i from active, so i is not incremented
+ continue;
+ }
+ ++i;
+ }
+ }
+
+#ifdef SW_ENABLE_DEBUG
+ for (unsigned j = 0; j < qlen; ++j) { // prints v with one output row per query position
+ for (unsigned i = 0; i < subject_begin[0].length(); ++i)
+ printf("%4i", v[i][j]);
+ printf("\n");
+ }
+ printf("\n");
+#endif
+}
+
+#endif
+
+void swipe(const sequence &query, vector<sequence>::const_iterator subject_begin, vector<sequence>::const_iterator subject_end, vector<int>::iterator out) // Non-template entry point: forwards to the uint8_t implementation.
+{
+#ifdef __SSE2__
+ swipe<uint8_t>(query, subject_begin, subject_end, out);
+#endif // NOTE(review): without __SSE2__ the body is empty and out is left unwritten
+}
diff --git a/src/dp/traceback.h b/src/dp/traceback.h
index 3ec6621..cd4ec9c 100644
--- a/src/dp/traceback.h
+++ b/src/dp/traceback.h
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef TRACEBACK_H_
@@ -74,8 +74,8 @@ int have_diag(const _matrix &dp,
const int match_score = score_matrix(query[i - 1], subject[j - 1]);
if (dp(i, j) == match_score + dp(i - 1, j - 1)) {
- if (log)
- printf("i=%i j=%i score=%i subject=%c query=%c\n", i, j, dp(i, j), value_traits.alphabet[(int)subject[j - 1]], value_traits.alphabet[(int)query[i - 1]]);
+ /*if (log)
+ printf("i=%i j=%i score=%i subject=%c query=%c\n", i, j, dp(i, j), value_traits.alphabet[(int)subject[j - 1]], value_traits.alphabet[(int)query[i - 1]]);*/
++l;
--i;
--j;
diff --git a/src/dp/ungapped_align.cpp b/src/dp/ungapped_align.cpp
index 8c83c1f..4b83e14 100644
--- a/src/dp/ungapped_align.cpp
+++ b/src/dp/ungapped_align.cpp
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#include "dp.h"
@@ -105,6 +105,91 @@ int xdrop_ungapped(const Letter *query, const Letter *subject, unsigned &delta,
return score;
}
+Diagonal_segment xdrop_ungapped(const sequence &query, const Bias_correction &query_bc, const sequence &subject, int qa, int sa) // Ungapped x-drop extension through anchor (qa, sa) in both directions, adding query_bc[q] to every column score.
+{
+ const float xdrop = (float)config.raw_ungapped_xdrop;
+ float score = 0, st = 0; // score = best total so far, st = running total
+ int n = 1, delta = 0, len = 0;
+
+ int q = qa - 1, s = sa - 1; // leftward pass starts one letter before the anchor
+ Letter ql, sl;
+ while (score - st < xdrop // stop once the running total is xdrop or more below the best
+ && (ql = query[q]) != '\xff' // ... or a '\xff' letter is read; NOTE(review): presumably a sequence delimiter, also relied on when q or s reaches -1 - confirm
+ && (sl = subject[s]) != '\xff')
+ {
+ st += score_matrix(ql, sl) + query_bc[q];
+ if (st > score) {
+ score = st;
+ delta = n; // letters left of the anchor covered by the best extension
+ }
+ --q;
+ --s;
+ ++n;
+ }
+
+ q = qa;
+ s = sa;
+ st = score; // rightward pass continues from the best leftward total
+ n = 1;
+ while (score - st < xdrop
+ && (ql = query[q]) != '\xff'
+ && (sl = subject[s]) != '\xff')
+ {
+ st += score_matrix(ql, sl) + query_bc[q];
+ if (st > score) {
+ score = st;
+ len = n; // letters from the anchor rightwards covered by the best extension
+ }
+ ++q;
+ ++s;
+ ++n;
+ }
+ return Diagonal_segment(qa - delta, sa - delta, len + delta, (int)score); // float total is truncated to int; arguments presumably (query start, subject start, length, score) - confirm
+}
+
+Diagonal_segment xdrop_ungapped(const sequence &query, const sequence &subject, int qa, int sa) // Ungapped x-drop extension through anchor (qa, sa) in both directions, using integer score_matrix scores only.
+{
+ const int xdrop = config.raw_ungapped_xdrop;
+ int score = 0, st = 0; // score = best total so far, st = running total
+ int n = 1, delta = 0, len = 0;
+
+ int q = qa - 1, s = sa - 1; // leftward pass starts one letter before the anchor
+ Letter ql, sl;
+ while (score - st < xdrop // stop once the running total is xdrop or more below the best
+ && (ql = query[q]) != '\xff' // ... or a '\xff' letter is read; NOTE(review): presumably a sequence delimiter, also relied on when q or s reaches -1 - confirm
+ && (sl = subject[s]) != '\xff')
+ {
+ st += score_matrix(ql, sl);
+ if (st > score) {
+ score = st;
+ delta = n; // letters left of the anchor covered by the best extension
+ }
+ --q;
+ --s;
+ ++n;
+ }
+
+ q = qa;
+ s = sa;
+ st = score; // rightward pass continues from the best leftward total
+ n = 1;
+ while (score - st < xdrop
+ && (ql = query[q]) != '\xff'
+ && (sl = subject[s]) != '\xff')
+ {
+ st += score_matrix(ql, sl);
+ if (st > score) {
+ score = st;
+ len = n; // letters from the anchor rightwards covered by the best extension
+ }
+ ++q;
+ ++s;
+ ++n;
+ }
+ return Diagonal_segment(qa - delta, sa - delta, len + delta, score); // arguments presumably (query start, subject start, length, score) - confirm against Diagonal_segment
+}
+
+
int xdrop_ungapped_right(const Letter *query, const Letter *subject, int &len)
{
int score(0), st(0), n = 1;
diff --git a/src/extra/blast_record.h b/src/extra/blast_record.h
index 4cdf2a6..c585379 100644
--- a/src/extra/blast_record.h
+++ b/src/extra/blast_record.h
@@ -1,3 +1,21 @@
+/****
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+****/
+
#ifndef BLAST_RECORD_H_
#define BLAST_RECORD_H_
diff --git a/src/extra/compare.h b/src/extra/compare.h
index 1cf016c..2aecd00 100644
--- a/src/extra/compare.h
+++ b/src/extra/compare.h
@@ -1,3 +1,21 @@
+/****
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+****/
+
#ifndef COMPARE_H_
#define COMPARE_H_
diff --git a/src/extra/extra.h b/src/extra/extra.h
index 69f4428..1894fc5 100644
--- a/src/extra/extra.h
+++ b/src/extra/extra.h
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2017, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#include <vector>
diff --git a/src/extra/match_file.h b/src/extra/match_file.h
index c1ec693..cd2ef69 100644
--- a/src/extra/match_file.h
+++ b/src/extra/match_file.h
@@ -1,6 +1,25 @@
+/****
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+****/
+
#ifndef MATCH_FILE_H_
#define MATCH_FILE_H_
+#include <stdio.h>
#include <vector>
#include "blast_record.h"
#include "../util/util.h"
diff --git a/src/extra/model_sim.cpp b/src/extra/model_sim.cpp
index 0751d71..9ec4489 100644
--- a/src/extra/model_sim.cpp
+++ b/src/extra/model_sim.cpp
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#include <time.h>
diff --git a/src/extra/opt.cpp b/src/extra/opt.cpp
index 2946631..9a46959 100644
--- a/src/extra/opt.cpp
+++ b/src/extra/opt.cpp
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2017, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#include <string.h>
diff --git a/src/extra/roc.cpp b/src/extra/roc.cpp
index 0fe2a75..ac36086 100644
--- a/src/extra/roc.cpp
+++ b/src/extra/roc.cpp
@@ -1,21 +1,22 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
+#include <stdio.h>
#include <map>
#include <set>
#include "../util/binary_file.h"
diff --git a/src/lib/tantan/tantan.cc b/src/lib/tantan/tantan.cc
new file mode 100644
index 0000000..0e56d59
--- /dev/null
+++ b/src/lib/tantan/tantan.cc
@@ -0,0 +1,462 @@
+// Copyright 2010 Martin C. Frith
+
+#include "tantan.hh"
+
+#include <algorithm> // fill, max
+#include <cassert>
+#include <cmath> // pow, abs
+#include <iostream> // cerr
+#include <numeric> // accumulate
+#include <vector>
+
+#define BEG(v) ((v).empty() ? 0 : &(v).front())
+#define END(v) ((v).empty() ? 0 : &(v).back() + 1)
+
+namespace tantan {
+
+void multiplyAll(std::vector<double> &v, double factor) {
+  // Scale every element of v in place by factor.
+  for (std::vector<double>::size_type k = 0; k != v.size(); ++k) v[k] *= factor;
+}
+
+double firstRepeatOffsetProb(double probMult, int maxRepeatOffset) {
+  // First term of a geometric series with ratio probMult whose first maxRepeatOffset terms sum to 1.
+  const bool ratioIsNotOne = (probMult < 1) || (probMult > 1);  // false for exactly 1 and for NaN
+  if (!ratioIsNotOne) return 1.0 / maxRepeatOffset;  // all terms equal
+  return (1 - probMult) / (1 - std::pow(probMult, maxRepeatOffset));
+}
+
+void checkForwardAndBackwardTotals(double fTot, double bTot) {
+  // Warn on stderr when the two totals differ by more than one part in 1e6
+  // of the larger magnitude (??? the original author queries whether 1e6 is suitable ???).
+  const double largest = std::max(std::abs(fTot), std::abs(bTot));
+  const double gap = std::abs(fTot - bTot);
+  if (!(gap > largest / 1e6)) return;
+  std::cerr << "tantan: warning: possible numeric inaccuracy\n"
+            << "tantan: forward algorithm total: " << fTot << "\n"
+            << "tantan: backward algorithm total: " << bTot << "\n";
+}
+
+struct Tantan { // State for the forward and backward algorithms over one sequence; entry points are calcRepeatProbs and countTransitions.
+ enum { scaleStepSize = 16 }; // probabilities are rescaled once every 16 letters (see rescaleForward / rescaleBackward)
+
+ const uchar *seqBeg; // start of the sequence
+ const uchar *seqEnd; // end of the sequence
+ const uchar *seqPtr; // current position in the sequence
+
+ int maxRepeatOffset; // number of foreground states; state k looks at the letter k+1 positions back
+
+ const const_double_ptr *likelihoodRatioMatrix; // indexed [current letter][earlier letter] in calcEmissionProbs
+
+ double b2b; // transition probability from background to background
+ double f2b; // transition probability from foreground to background
+ double g2g; // transition probability from gap/indel to gap/indel
+ //double f2g; // transition probability from foreground to gap/indel
+ //double g2f; // transition probability from gap/indel to foreground
+ double oneGapProb; // f2g * g2f
+ double endGapProb; // f2g * 1
+ double f2f0; // foreground to foreground, if there are 0 indel transitions
+ double f2f1; // foreground to foreground, if there is 1 indel transition
+ double f2f2; // foreground to foreground, if there are 2 indel transitions
+ double b2fDecay; // = repeatOffsetProbDecay
+ double b2fGrowth; // = 1 / repeatOffsetProbDecay
+ double b2fFirst; // background state to first foreground state
+ double b2fLast; // background state to last foreground state
+
+ double backgroundProb; // current (scaled) value for the background state
+ std::vector<double> foregroundProbs; // maxRepeatOffset entries, one per repeat offset
+ std::vector<double> insertionProbs; // maxRepeatOffset - 1 entries; only the *WithGaps transitions update them
+
+ std::vector<double> scaleFactors; // factors chosen by the forward pass and replayed by the backward pass
+
+ Tantan(const uchar *seqBeg, // checks the parameters with assert, then precomputes the transition probabilities
+ const uchar *seqEnd,
+ int maxRepeatOffset,
+ const const_double_ptr *likelihoodRatioMatrix,
+ double repeatProb,
+ double repeatEndProb,
+ double repeatOffsetProbDecay,
+ double firstGapProb,
+ double otherGapProb) {
+ assert(maxRepeatOffset > 0); // guarantees foregroundProbs is non-empty
+ assert(repeatProb >= 0 && repeatProb < 1);
+ // (if repeatProb==1, then any sequence is impossible)
+ assert(repeatEndProb >= 0 && repeatEndProb <= 1);
+ assert(repeatOffsetProbDecay > 0 && repeatOffsetProbDecay <= 1);
+ assert(otherGapProb >= 0 && otherGapProb <= 1);
+ assert(firstGapProb >= 0);
+ assert(repeatEndProb + firstGapProb * 2 <= 1); // keeps f2f2 (computed below) non-negative
+
+ this->seqBeg = seqBeg;
+ this->seqEnd = seqEnd;
+ this->seqPtr = seqBeg;
+ this->maxRepeatOffset = maxRepeatOffset;
+ this->likelihoodRatioMatrix = likelihoodRatioMatrix;
+
+ b2b = 1 - repeatProb;
+ f2b = repeatEndProb;
+ g2g = otherGapProb;
+ //f2g = firstGapProb;
+ //g2f = 1 - otherGapProb;
+ oneGapProb = firstGapProb * (1 - otherGapProb);
+ endGapProb = firstGapProb * 1;
+ f2f0 = 1 - repeatEndProb;
+ f2f1 = 1 - repeatEndProb - firstGapProb;
+ f2f2 = 1 - repeatEndProb - firstGapProb * 2;
+
+ b2fDecay = repeatOffsetProbDecay;
+ b2fGrowth = 1 / repeatOffsetProbDecay;
+
+ b2fFirst = repeatProb * firstRepeatOffsetProb(b2fDecay, maxRepeatOffset);
+ b2fLast = repeatProb * firstRepeatOffsetProb(b2fGrowth, maxRepeatOffset);
+
+ foregroundProbs.resize(maxRepeatOffset);
+ insertionProbs.resize(maxRepeatOffset - 1); // empty when maxRepeatOffset == 1
+
+ scaleFactors.resize((seqEnd - seqBeg) / scaleStepSize); // one slot per complete block of scaleStepSize letters
+ }
+
+ void initializeForwardAlgorithm() { // all weight starts on the background state
+ backgroundProb = 1.0;
+ std::fill(foregroundProbs.begin(), foregroundProbs.end(), 0.0);
+ std::fill(insertionProbs.begin(), insertionProbs.end(), 0.0);
+ }
+
+ double forwardTotal() { // backgroundProb * b2b plus f2b times the sum of all foreground values
+ double fromForeground = std::accumulate(foregroundProbs.begin(),
+ foregroundProbs.end(), 0.0);
+ fromForeground *= f2b;
+ double total = backgroundProb * b2b + fromForeground;
+ assert(total > 0);
+ return total;
+ }
+
+ void initializeBackwardAlgorithm() { // starts from the same end weights forwardTotal uses: b2b for background, f2b for foreground
+ backgroundProb = b2b;
+ std::fill(foregroundProbs.begin(), foregroundProbs.end(), f2b);
+ std::fill(insertionProbs.begin(), insertionProbs.end(), 0.0);
+ }
+
+ double backwardTotal() { // after the backward pass the total is simply backgroundProb
+ assert(backgroundProb > 0);
+ return backgroundProb;
+ }
+
+ void calcForwardTransitionProbsWithGaps() { // forward transition step including the insertion states; walks the offsets from last to first
+ double fromBackground = backgroundProb * b2fLast;
+ double *foregroundPtr = &foregroundProbs.back();
+ double f = *foregroundPtr;
+ double fromForeground = f; // accumulates the old foreground values for the f2b term at the end
+
+ if (insertionProbs.empty()) { // maxRepeatOffset == 1: there are no gap states
+ *foregroundPtr = fromBackground + f * f2f0;
+ } else {
+ double *insertionPtr = &insertionProbs.back();
+ double i = *insertionPtr;
+ *foregroundPtr = fromBackground + f * f2f1 + i * endGapProb;
+ double d = f; // running gap term carried from later offsets to earlier ones, scaled by g2g at each step
+ --foregroundPtr;
+ fromBackground *= b2fGrowth;
+
+ while (foregroundPtr > &foregroundProbs.front()) { // interior offsets: gap transitions possible on both sides (f2f2)
+ f = *foregroundPtr;
+ fromForeground += f;
+ i = *(insertionPtr - 1);
+ *foregroundPtr = fromBackground + f * f2f2 + (i + d) * oneGapProb;
+ *insertionPtr = f + i * g2g;
+ d = f + d * g2g;
+ --foregroundPtr;
+ --insertionPtr;
+ fromBackground *= b2fGrowth;
+ }
+
+ f = *foregroundPtr; // first offset: handled separately with f2f1 / endGapProb
+ fromForeground += f;
+ *foregroundPtr = fromBackground + f * f2f1 + d * endGapProb;
+ *insertionPtr = f;
+ }
+
+ fromForeground *= f2b;
+ backgroundProb = backgroundProb * b2b + fromForeground;
+ }
+
+ void calcBackwardTransitionProbsWithGaps() { // backward counterpart of the gapped step; walks the offsets from first to last
+ double toBackground = f2b * backgroundProb;
+ double *foregroundPtr = &foregroundProbs.front();
+ double f = *foregroundPtr;
+ double toForeground = f; // accumulates old foreground values, scaled by b2fGrowth per step, for the background update
+
+ if (insertionProbs.empty()) { // maxRepeatOffset == 1: there are no gap states
+ *foregroundPtr = toBackground + f2f0 * f;
+ } else {
+ double *insertionPtr = &insertionProbs.front();
+ double i = *insertionPtr;
+ *foregroundPtr = toBackground + f2f1 * f + i;
+ double d = endGapProb * f; // running gap term carried from earlier offsets to later ones
+ ++foregroundPtr;
+ toForeground *= b2fGrowth;
+
+ while (foregroundPtr < &foregroundProbs.back()) { // interior offsets
+ f = *foregroundPtr;
+ toForeground += f;
+ i = *(insertionPtr + 1);
+ *foregroundPtr = toBackground + f2f2 * f + (i + d);
+ double oneGapProb_f = oneGapProb * f;
+ *insertionPtr = oneGapProb_f + g2g * i;
+ d = oneGapProb_f + g2g * d;
+ ++foregroundPtr;
+ ++insertionPtr;
+ toForeground *= b2fGrowth;
+ }
+
+ f = *foregroundPtr; // last offset: handled separately with f2f1 / endGapProb
+ toForeground += f;
+ *foregroundPtr = toBackground + f2f1 * f + d;
+ *insertionPtr = endGapProb * f;
+ }
+
+ toForeground *= b2fLast;
+ backgroundProb = b2b * backgroundProb + toForeground;
+ }
+
+ void calcForwardTransitionProbs() { // forward transition step; defers to the gapped version when firstGapProb > 0
+ if (endGapProb > 0) return calcForwardTransitionProbsWithGaps();
+
+ double fromBackground = backgroundProb * b2fLast;
+ double fromForeground = 0;
+ double *foregroundPtr = END(foregroundProbs);
+ double *foregroundBeg = BEG(foregroundProbs);
+
+ while (foregroundPtr > foregroundBeg) { // last offset first; the background contribution is multiplied by b2fGrowth per step
+ --foregroundPtr;
+ double f = *foregroundPtr;
+ fromForeground += f;
+ *foregroundPtr = fromBackground + f * f2f0;
+ fromBackground *= b2fGrowth;
+ }
+
+ fromForeground *= f2b;
+ backgroundProb = backgroundProb * b2b + fromForeground;
+ }
+
+ void calcBackwardTransitionProbs() { // backward transition step; defers to the gapped version when firstGapProb > 0
+ if (endGapProb > 0) return calcBackwardTransitionProbsWithGaps();
+
+ double toBackground = f2b * backgroundProb;
+ double toForeground = 0;
+ double *foregroundPtr = BEG(foregroundProbs);
+ double *foregroundEnd = END(foregroundProbs);
+
+ while (foregroundPtr < foregroundEnd) { // first offset first; the running sum is multiplied by b2fGrowth before each addition
+ toForeground *= b2fGrowth;
+ double f = *foregroundPtr;
+ toForeground += f;
+ *foregroundPtr = toBackground + f2f0 * f;
+ ++foregroundPtr;
+ }
+
+ toForeground *= b2fLast;
+ backgroundProb = b2b * backgroundProb + toForeground;
+ }
+
+ void addEndCounts(double forwardProb, // adds forwardProb * b2b / totalProb to transitionCounts[0]
+ double totalProb,
+ double *transitionCounts) {
+ double toEnd = forwardProb * b2b / totalProb;
+ transitionCounts[0] += toEnd;
+ }
+
+ void addTransitionCounts(double forwardProb, // slot 0 gets the background-to-background term; slots 1..maxRepeatOffset get one background-to-foreground term each
+ double totalProb,
+ double *transitionCounts) {
+ double toBg = forwardProb * b2b / totalProb;
+ double toFg = forwardProb * b2fFirst / totalProb;
+
+ transitionCounts[0] += backgroundProb * toBg;
+
+ for (double *i = BEG(foregroundProbs); i < END(foregroundProbs); ++i) {
+ ++transitionCounts; // so the caller's array needs maxRepeatOffset + 1 entries
+ *transitionCounts += *i * toFg;
+ toFg *= b2fDecay;
+ }
+ }
+
+ void calcEmissionProbs() { // multiplies each foreground value by the likelihood ratio of the current letter against the letter at that offset
+ const double *lrRow = likelihoodRatioMatrix[*seqPtr];
+
+ bool isNearSeqBeg = (seqPtr - seqBeg < maxRepeatOffset); // fewer than maxRepeatOffset letters precede seqPtr
+ const uchar *seqStop = isNearSeqBeg ? seqBeg : seqPtr - maxRepeatOffset;
+
+ double *foregroundPtr = BEG(foregroundProbs);
+ const uchar *offsetPtr = seqPtr;
+
+ while (offsetPtr > seqStop) { // state k pairs with the letter k+1 positions back
+ --offsetPtr;
+ *foregroundPtr *= lrRow[*offsetPtr];
+ ++foregroundPtr;
+ }
+
+ while (foregroundPtr < END(foregroundProbs)) { // offsets that would reach before seqBeg are multiplied by 0
+ *foregroundPtr *= 0;
+ ++foregroundPtr;
+ }
+ }
+
+ void rescale(double scale) { // multiplies every stored value by scale
+ backgroundProb *= scale;
+ multiplyAll(foregroundProbs, scale);
+ multiplyAll(insertionProbs, scale);
+ }
+
+ void rescaleForward() { // at the last letter of each 16-letter block: scale by 1 / backgroundProb and remember the factor
+ if ((seqPtr - seqBeg) % scaleStepSize == scaleStepSize - 1) {
+ assert(backgroundProb > 0);
+ double scale = 1 / backgroundProb;
+ scaleFactors[(seqPtr - seqBeg) / scaleStepSize] = scale;
+ rescale(scale);
+ }
+ }
+
+ void rescaleBackward() { // at the same positions: reapply the factor stored by rescaleForward
+ if ((seqPtr - seqBeg) % scaleStepSize == scaleStepSize - 1) {
+ double scale = scaleFactors[(seqPtr - seqBeg) / scaleStepSize];
+ rescale(scale);
+ }
+ }
+
+ void calcRepeatProbs(float *letterProbs) { // writes one value per letter: 1 - (forward background * backward background / z)
+ initializeForwardAlgorithm();
+
+ while (seqPtr < seqEnd) { // forward pass: store the background value after each letter
+ calcForwardTransitionProbs();
+ calcEmissionProbs();
+ rescaleForward();
+ *letterProbs = static_cast<float>(backgroundProb);
+ ++letterProbs;
+ ++seqPtr;
+ }
+
+ double z = forwardTotal();
+
+ initializeBackwardAlgorithm();
+
+ while (seqPtr > seqBeg) { // backward pass: overwrite each stored value with the final result
+ --seqPtr;
+ --letterProbs;
+ double nonRepeatProb = *letterProbs * backgroundProb / z;
+ // Convert nonRepeatProb to a float, so that it is more likely
+ // to be exactly 1 when it should be, e.g. for the 1st letter of
+ // a sequence:
+ *letterProbs = 1 - static_cast<float>(nonRepeatProb);
+ rescaleBackward();
+ calcEmissionProbs();
+ calcBackwardTransitionProbs();
+ }
+
+ double z2 = backwardTotal();
+ checkForwardAndBackwardTotals(z, z2); // prints a warning on stderr if the two totals disagree
+ }
+
+ void countTransitions(double *transitionCounts) { // adds to transitionCounts[0 .. maxRepeatOffset]; existing values are kept
+ std::vector<float> p(seqEnd - seqBeg);
+ float *letterProbs = BEG(p);
+
+ initializeForwardAlgorithm();
+
+ while (seqPtr < seqEnd) { // forward pass: store the background value from before each letter is processed
+ *letterProbs = static_cast<float>(backgroundProb);
+ calcForwardTransitionProbs();
+ calcEmissionProbs();
+ rescaleForward();
+ ++letterProbs;
+ ++seqPtr;
+ }
+
+ double z = forwardTotal();
+
+ addEndCounts(backgroundProb, z, transitionCounts);
+
+ initializeBackwardAlgorithm();
+
+ while (seqPtr > seqBeg) { // backward pass: combine stored forward values with the backward values
+ --seqPtr;
+ --letterProbs;
+ rescaleBackward();
+ calcEmissionProbs();
+ addTransitionCounts(*letterProbs, z, transitionCounts);
+ calcBackwardTransitionProbs();
+ }
+
+ double z2 = backwardTotal();
+ checkForwardAndBackwardTotals(z, z2); // prints a warning on stderr if the two totals disagree
+ }
+};
+
+void maskSequences(uchar *seqBeg, // computes one probability per letter, then masks the letters in place
+ uchar *seqEnd,
+ int maxRepeatOffset,
+ const const_double_ptr *likelihoodRatioMatrix,
+ double repeatProb,
+ double repeatEndProb,
+ double repeatOffsetProbDecay,
+ double firstGapProb,
+ double otherGapProb,
+ double minMaskProb,
+ const uchar *maskTable) {
+ std::vector<float> p(seqEnd - seqBeg); // scratch buffer: one float per letter
+ float *probabilities = BEG(p); // BEG yields 0 for an empty sequence
+
+ getProbabilities(seqBeg, seqEnd, maxRepeatOffset,
+ likelihoodRatioMatrix, repeatProb, repeatEndProb,
+ repeatOffsetProbDecay, firstGapProb, otherGapProb,
+ probabilities);
+
+ maskProbableLetters(seqBeg, seqEnd, probabilities, minMaskProb, maskTable); // letters with probability >= minMaskProb become maskTable[letter]
+}
+
+void getProbabilities(const uchar *seqBeg, // fills probabilities[0 .. seqEnd - seqBeg) via Tantan::calcRepeatProbs
+ const uchar *seqEnd,
+ int maxRepeatOffset,
+ const const_double_ptr *likelihoodRatioMatrix,
+ double repeatProb,
+ double repeatEndProb,
+ double repeatOffsetProbDecay,
+ double firstGapProb,
+ double otherGapProb,
+ float *probabilities) {
+ Tantan tantan(seqBeg, seqEnd, maxRepeatOffset, likelihoodRatioMatrix, // the constructor asserts the parameter ranges
+ repeatProb, repeatEndProb, repeatOffsetProbDecay,
+ firstGapProb, otherGapProb);
+ tantan.calcRepeatProbs(probabilities);
+}
+
+void maskProbableLetters(uchar *seqBeg,
+                         uchar *seqEnd,
+                         const float *probabilities,
+                         double minMaskProb,
+                         const uchar *maskTable) {
+  // Walk the letters and their probabilities in lockstep; a letter is remapped
+  // through maskTable once its probability reaches minMaskProb.
+  for (const float *prob = probabilities; seqBeg < seqEnd; ++seqBeg, ++prob) {
+    const bool shouldMask = (*prob >= minMaskProb);
+    if (shouldMask) *seqBeg = maskTable[*seqBeg];
+  }
+}
+
+void countTransitions(const uchar *seqBeg, // adds to transitionCounts via Tantan::countTransitions, which writes slots 0 .. maxRepeatOffset
+ const uchar *seqEnd,
+ int maxRepeatOffset,
+ const const_double_ptr *likelihoodRatioMatrix,
+ double repeatProb,
+ double repeatEndProb,
+ double repeatOffsetProbDecay,
+ double firstGapProb,
+ double otherGapProb,
+ double *transitionCounts) {
+ Tantan tantan(seqBeg, seqEnd, maxRepeatOffset, likelihoodRatioMatrix, // the constructor asserts the parameter ranges
+ repeatProb, repeatEndProb, repeatOffsetProbDecay,
+ firstGapProb, otherGapProb);
+ tantan.countTransitions(transitionCounts); // counts are accumulated with +=, so the caller must initialise the array
+}
+
+}
diff --git a/src/lib/tantan/tantan.hh b/src/lib/tantan/tantan.hh
new file mode 100644
index 0000000..53d4648
--- /dev/null
+++ b/src/lib/tantan/tantan.hh
@@ -0,0 +1,120 @@
+// Copyright 2010 Martin C. Frith
+
+// These are routines for masking simple regions (low-complexity and
+// short-period tandem repeats) in biological sequences. To
+// understand them in detail, see the published article (in
+// preparation).
+
+// Typically, you would just use the maskSequences routine. The other
+// routines are more specialized. The inputs to maskSequences are as
+// follows.
+
+// seqBeg: pointer to the start of the sequence.
+// seqEnd: pointer to one-past-the-end of the sequence.
+// maxRepeatOffset: the maximum tandem-repeat period-size to consider.
+
+// likelihoodRatioMatrix: a matrix of the form Qxy / (Px * Py), where
+// Qxy is the probability of seeing letters x and y in equivalent
+// positions of a tandem repeat, and Px and Py are the background
+// probabilities of the letters. This matrix is related to a scoring
+// matrix (e.g. blosum62) by:
+// likelihoodRatioMatrix[x][y] = exp(lambda * scoringMatrix[x][y]).
+
+// The uchars in the sequence will be used as indexes into the matrix
+// (e.g. likelihoodRatioMatrix[x][y]). So typically the uchars are
+// small integers (e.g. 0, 1, 2, 3). The matrix needs to have entries
+// for any uchars that can occur.
+
+// repeatProb: the probability of a repetitive segment starting per position.
+// repeatEndProb: the probability of a repetitive segment ending per position.
+
+// repeatOffsetProbDecay: the probability of a period-(i) repeat
+// divided by the probability of a period-(i-1) repeat.
+
+// firstGapProb: the probability of initiating an insertion or
+// deletion in a repetitive region.
+
+// otherGapProb: the probability of extending an insertion or deletion
+// by one more letter.
+
+// minMaskProb: mask letters whose posterior probability of being
+// repetitive is >= this.
+
+// maskTable: how to do the masking. Letter x will be changed to
+// maskTable[x]. So maskTable needs to have entries for any uchar
+// that can occur.
+
+// Typical usage:
+// tantan::maskSequences(seqBeg, seqEnd, 100, likelihoodRatioMatrix,
+// 0.005, 0.05, 0.9, 0, 0, 0.5, maskTable)
+
+#ifndef TANTAN_HH
+#define TANTAN_HH
+
+namespace tantan {
+
+typedef unsigned char uchar;  // a sequence letter; also used as an index into likelihoodRatioMatrix and maskTable
+typedef const double *const_double_ptr;  // one matrix row: likelihoodRatioMatrix[x] has this type
+
+void maskSequences(uchar *seqBeg,  // masks in place: getProbabilities followed by maskProbableLetters
+ uchar *seqEnd,  // one past the last letter
+ int maxRepeatOffset,  // maximum tandem-repeat period-size to consider
+ const const_double_ptr *likelihoodRatioMatrix,  // Qxy / (Px * Py), indexed [x][y]
+ double repeatProb,
+ double repeatEndProb,
+ double repeatOffsetProbDecay,
+ double firstGapProb,
+ double otherGapProb,
+ double minMaskProb,  // letters with posterior probability >= this are masked
+ const uchar *maskTable);  // letter x becomes maskTable[x]
+
+// The following routine gets the posterior probability that each
+// letter is repetitive. It stores the results in "probabilities",
+// which must point to enough pre-allocated space to fit the results.
+
+void getProbabilities(const uchar *seqBeg,
+ const uchar *seqEnd,
+ int maxRepeatOffset,
+ const const_double_ptr *likelihoodRatioMatrix,
+ double repeatProb,
+ double repeatEndProb,
+ double repeatOffsetProbDecay,
+ double firstGapProb,
+ double otherGapProb,
+ float *probabilities);  // out: maskSequences allocates (seqEnd - seqBeg) floats for this
+
+// The following routine masks each letter whose corresponding entry
+// in "probabilities" is >= minMaskProb.
+
+void maskProbableLetters(uchar *seqBeg,
+ uchar *seqEnd,
+ const float *probabilities,  // one entry per letter, in sequence order
+ double minMaskProb,
+ const uchar *maskTable);
+
+// The following routine counts the expected number of transitions
+// from the background (non-repeat) state to other states. It adds
+// the results to "transitionCounts", which must point to
+// pre-initialized space for (maxRepeatOffset+1) items. The
+// background->background transition count is stored in
+// transitionCounts[0]. The background->(period-i repeat) transition
+// count is stored in transitionCounts[i].
+
+// (In this routine, the HMM begin and end states are counted as
+// background states. Thus, begin->X is added to background->X, and
+// X->end is added to X->background.)
+
+void countTransitions(const uchar *seqBeg,
+ const uchar *seqEnd,
+ int maxRepeatOffset,
+ const const_double_ptr *likelihoodRatioMatrix,
+ double repeatProb,
+ double repeatEndProb,
+ double repeatOffsetProbDecay,
+ double firstGapProb,
+ double otherGapProb,
+ double *transitionCounts);  // in/out: results are added, not assigned
+
+}  // namespace tantan
+
+#endif
diff --git a/src/output/blast_pairwise_format.cpp b/src/output/blast_pairwise_format.cpp
index fe95bfb..d4c4b20 100644
--- a/src/output/blast_pairwise_format.cpp
+++ b/src/output/blast_pairwise_format.cpp
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#include "output_format.h"
@@ -21,8 +21,9 @@ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR P
void Pairwise_format::print_match(const Hsp_context& r, Text_buffer &out) const
{
static const unsigned width = 60;
- out << '>' << r.subject_name << '\n';
- out << "Length=" << r.subject_len << "\n\n";
+ out << '>';
+ Output_format::print_title(out, r.subject_name, true, true, " ");
+ out << "\nLength=" << r.subject_len << "\n\n";
out << " Score = " << r.bit_score() << " bits (" << r.score() << "), Expect = ";
out.print_e(r.evalue());
out << '\n';
@@ -31,11 +32,12 @@ void Pairwise_format::print_match(const Hsp_context& r, Text_buffer &out) const
if (align_mode.query_translated)
out << " Frame = " << r.blast_query_frame() << '\n';
out << '\n';
+ const unsigned digits = (unsigned)std::max(ceil(log10(r.subject_range().end_)), ceil(log10(r.query_range().end_)));
Hsp_context::Iterator qi = r.begin(), mi = r.begin(), si = r.begin();
while (qi.good()) {
out << "Query ";
- out.print(qi.query_pos+1, 0);
+ out.print(qi.query_pos+1, digits);
out << " ";
for (unsigned i = 0; i < width && qi.good(); ++i, ++qi)
out << qi.query_char();
@@ -43,13 +45,14 @@ void Pairwise_format::print_match(const Hsp_context& r, Text_buffer &out) const
out.print(qi.query_pos, 0);
out << '\n';
- out << " ";
+ for (unsigned i = 0; i < digits + 9; ++i)
+ out << ' ';
for (unsigned i = 0; i < width && mi.good(); ++i, ++mi)
out << mi.midline_char();
out << '\n';
out << "Sbjct ";
- out.print(si.subject_pos+1, 0);
+ out.print(si.subject_pos+1, digits);
out << " ";
for (unsigned i = 0; i < width && si.good(); ++i, ++si)
out << si.subject_char();
diff --git a/src/output/blast_tab_format.cpp b/src/output/blast_tab_format.cpp
index 32c82c7..a85147a 100644
--- a/src/output/blast_tab_format.cpp
+++ b/src/output/blast_tab_format.cpp
@@ -1,23 +1,25 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
+#include <set>
#include "../basic/match.h"
#include "output_format.h"
+#include "../data/taxonomy.h"
const char* Blast_tab_format::field_str[] = {
"qseqid", // 0 means Query Seq - id
@@ -65,7 +67,9 @@ const char* Blast_tab_format::field_str[] = {
"qcovs", // 42 means Query Coverage Per Subject
"qcovhsp", // 43 means Query Coverage Per HSP
"qcovus", // 44 means Query Coverage Per Unique Subject(blastn only)
- "qtitle" // 45 means Query title
+ "qtitle", // 45 means Query title
+ "swdiff", // 46 means sw_score() minus bit_score() of the HSP (see print_match, case 46)
+ "time" // 47 means the value returned by Hsp_context::time() (see print_match, case 47)
};
Blast_tab_format::Blast_tab_format() :
@@ -81,12 +85,26 @@ Blast_tab_format::Blast_tab_format() :
int j = get_idx(field_str, sizeof(field_str) / sizeof(field_str[0]), i->c_str());
if(j == -1)
throw std::runtime_error(string("Invalid output field: ") + *i);
+ if (j == 34 && config.prot_accession2taxid.empty())
+ throw std::runtime_error("staxids output field requires setting the --taxonmap parameter.");
fields.push_back(j);
- if (j == 6 || j == 39 || j == 40)
+ if (j == 6 || j == 39 || j == 40 || j == 34)
config.salltitles = true;
}
}
+void print_staxids(Text_buffer &out, const char *id) // writes the distinct taxon ids of a subject title, ascending, joined by ';'
+{
+	const vector<string> t(tokenize(id, "\1")); // a title may hold several accessions separated by '\1'
+	std::set<unsigned> taxons; // the set collapses duplicate ids and iterates in ascending order
+	for (vector<string>::const_iterator i = t.begin(); i != t.end(); ++i)
+		taxons.insert(taxonomy.get(Taxonomy::Accession(*i)));
+	for (std::set<unsigned>::const_iterator i = taxons.begin(); i != taxons.end(); ++i) { // writes nothing for an empty set; begin() is never dereferenced blindly
+		if (i != taxons.begin()) out << ';'; // separator goes between ids only
+		out << *i;
+	}
+}
+
void Blast_tab_format::print_match(const Hsp_context& r, Text_buffer &out) const
{
for (vector<unsigned>::const_iterator i = fields.begin(); i != fields.end(); ++i) {
@@ -98,10 +116,10 @@ void Blast_tab_format::print_match(const Hsp_context& r, Text_buffer &out) const
out << r.source_query.length();
break;
case 5:
- this->print_salltitles(out, r.subject_name, false, false);
+ print_title(out, r.subject_name, false, false, "<>");
break;
case 6:
- this->print_salltitles(out, r.subject_name, false, true);
+ print_title(out, r.subject_name, false, true, "<>");
break;
case 12:
out << r.subject_len;
@@ -202,11 +220,14 @@ void Blast_tab_format::print_match(const Hsp_context& r, Text_buffer &out) const
out << n_matches;
}
break;
+ case 34:
+ print_staxids(out, r.subject_name);
+ break;
case 39:
- this->print_salltitles(out, r.subject_name, true, false);
+ print_title(out, r.subject_name, true, false, "<>");
break;
case 40:
- this->print_salltitles(out, r.subject_name, true, true);
+ print_title(out, r.subject_name, true, true, "<>");
break;
case 43:
out << (double)r.query_source_range().length()*100.0 / r.source_query.length();
@@ -214,6 +235,12 @@ void Blast_tab_format::print_match(const Hsp_context& r, Text_buffer &out) const
case 45:
out << r.query_name;
break;
+ case 46:
+ out << r.sw_score() - r.bit_score();
+ break;
+ case 47:
+ out << r.time();
+ break;
default:
throw std::runtime_error("Invalid output field");
}
@@ -263,6 +290,7 @@ void Blast_tab_format::print_query_intro(size_t query_num, const char *query_nam
out << "-1";
break;
case 31:
+ case 34:
out << '0';
break;
case 45:
diff --git a/src/output/daa_file.h b/src/output/daa_file.h
index 41c683e..e8c5720 100644
--- a/src/output/daa_file.h
+++ b/src/output/daa_file.h
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2014-2016, University of Tuebingen, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef DAA_FILE_H_
diff --git a/src/output/daa_record.cpp b/src/output/daa_record.cpp
index 566d8ee..ba2cf61 100644
--- a/src/output/daa_record.cpp
+++ b/src/output/daa_record.cpp
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#include "daa_record.h"
@@ -53,9 +53,10 @@ Binary_buffer::Iterator& operator>>(Binary_buffer::Iterator &it, DAA_query_recor
uint8_t flag;
it >> flag;
it.read_packed(flag & 3, r.score);
- uint32_t query_begin;
+ uint32_t query_begin, subject_begin;
it.read_packed((flag >> 2) & 3, query_begin);
- it.read_packed((flag >> 4) & 3, r.subject_range.begin_);
+ it.read_packed((flag >> 4) & 3, subject_begin);
+ r.subject_range.begin_ = (int)subject_begin;
r.transcript.read(it);
r.subject_name = r.parent_.file_.ref_name(r.subject_id);
r.subject_len = r.parent_.file_.ref_len(r.subject_id);
diff --git a/src/output/daa_record.h b/src/output/daa_record.h
index c7f6cd2..c39a1a4 100644
--- a/src/output/daa_record.h
+++ b/src/output/daa_record.h
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2014-2016, University of Tuebingen, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef DAA_RECORD_H_
diff --git a/src/output/daa_write.h b/src/output/daa_write.h
index b8eb691..7406400 100644
--- a/src/output/daa_write.h
+++ b/src/output/daa_write.h
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2014-2016, University of Tuebingen, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef DAA_WRITE_H_
@@ -31,9 +31,9 @@ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR P
inline void init_daa(Output_stream &f)
{
DAA_header1 h1;
- f.typed_write(&h1, 1);
+ f.write(&h1, 1); // one default-constructed DAA_header1 at the start of the stream
DAA_header2 h2_;
- f.typed_write(&h2_, 1);
+ f.write(&h2_, 1); // placeholder DAA_header2; finish_daa seeks back to sizeof(DAA_header1) and rewrites it
}
inline size_t write_daa_query_record(Text_buffer &buf, const char *query_name, const sequence &query)
@@ -78,8 +78,8 @@ inline void finish_daa(Output_stream &f)
{
DAA_header2 h2_(ref_header.sequences,
config.db_size,
- config.gap_open,
- config.gap_extend,
+ score_matrix.gap_open(),
+ score_matrix.gap_extend(),
config.reward,
config.penalty,
score_matrix.k(),
@@ -93,7 +93,7 @@ inline void finish_daa(Output_stream &f)
h2_.block_type[2] = DAA_header2::ref_lengths;
uint32_t size = 0;
- f.typed_write(&size, 1);
+ f.write(&size, 1);
h2_.block_size[0] = f.tell() - sizeof(DAA_header1) - sizeof(DAA_header2);
h2_.db_seqs_used = ref_map.next_;
h2_.query_records = statistics.get(Statistics::ALIGNED);
@@ -105,11 +105,11 @@ inline void finish_daa(Output_stream &f)
}
h2_.block_size[1] = s;
- f.write(ref_map.len_, false);
+ f.write(ref_map.len_);
h2_.block_size[2] = ref_map.len_.size() * sizeof(uint32_t);
f.seekp(sizeof(DAA_header1));
- f.typed_write(&h2_, 1);
+ f.write(&h2_, 1);
}
#endif /* DAA_WRITE_H_ */
diff --git a/src/output/join_blocks.cpp b/src/output/join_blocks.cpp
index 31a4387..0b2e63e 100644
--- a/src/output/join_blocks.cpp
+++ b/src/output/join_blocks.cpp
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#include "output.h"
diff --git a/src/output/output.h b/src/output/output.h
index da1eb83..03fe1fe 100644
--- a/src/output/output.h
+++ b/src/output/output.h
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2014-2016, University of Tuebingen, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef OUTPUT_H_
@@ -59,16 +59,6 @@ inline uint8_t get_segment_flag(const Hsp_context &match)
struct Intermediate_record
{
- void read(Buffered_file &f)
- {
- f.read(query_id);
- f.read(subject_id);
- f.read(flag);
- f.read_packed(flag & 3, score);
- f.read_packed((flag >> 2) & 3, query_begin);
- f.read_packed((flag >> 4) & 3, subject_begin);
- transcript.read(f);
- }
void read(Binary_buffer::Iterator &f)
{
f.read(subject_id);
@@ -100,7 +90,7 @@ struct Intermediate_record
static void finish_file(Output_stream &f)
{
unsigned x = finished;
- f.typed_write(&x, 1);
+ f.write(&x, 1);
}
enum { finished = 0xffffffffu };
uint32_t query_id, subject_id, score, query_begin, subject_begin;
diff --git a/src/output/output_file.h b/src/output/output_file.h
deleted file mode 100644
index 8f7b575..0000000
--- a/src/output/output_file.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/****
-Copyright (c) 2014, University of Tuebingen
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
-
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-****
-Author: Benjamin Buchfink
-****/
-
-#ifndef OUTPUT_FILE_H_
-#define OUTPUT_FILE_H_
-
-#include <string>
-#include "output.h"
-
-using std::string;
-
-struct Block_output : public Buffered_file
-{
-
- struct Iterator
- {
- unsigned block_;
- bool same_subject_;
- Intermediate_record info_;
- bool operator<(const Iterator &rhs) const
- { return info_.query_id > rhs.info_.query_id ||
- (info_.query_id == rhs.info_.query_id && (rhs.same_subject_ ||
- (!rhs.same_subject_ && info_.score < rhs.info_.score))); }
- };
-
- bool next(Iterator &it, unsigned subject, unsigned query)
- {
- if(this->eof())
- return false;
- it.info_.read(*this);
- it.block_ = block_;
- it.same_subject_ = it.info_.subject_id == subject && it.info_.query_id == query;
- return true;
- }
-
- Block_output(unsigned ref_block, const Temp_file &tmp_file):
- Buffered_file (tmp_file),
- block_ (ref_block)
- { }
-
-private:
-
- const unsigned block_;
-
-};
-
-#endif /* OUTPUT_FILE_H_ */
diff --git a/src/output/output_format.cpp b/src/output/output_format.cpp
index 6a9143a..74786f6 100644
--- a/src/output/output_format.cpp
+++ b/src/output/output_format.cpp
@@ -1,22 +1,23 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#include <iostream>
+#include "../data/taxonomy.h"
#include "output_format.h"
#include "../data/reference.h"
@@ -24,6 +25,44 @@ using std::endl;
auto_ptr<Output_format> output_format;
+void Output_format::print_title(Text_buffer &buf, const char *id, bool full_titles, bool all_titles, const char *separator)
+{
+ if (!all_titles) {
+ buf.write_until(id, full_titles ? "\1" : Const::id_delimiters);
+ return;
+ }
+ if (strchr(id, '\1') == 0) {
+ buf.write_until(id, full_titles ? "\1" : Const::id_delimiters);
+ return;
+ }
+ //size_t n = 0;
+ const vector<string> t(tokenize(id, "\1"));
+ vector<string>::const_iterator i = t.begin();
+ for (; i<t.end() - 1; ++i) {
+ if (full_titles)
+ buf << *i << separator;
+ else {
+ buf.write_until(i->c_str(), Const::id_delimiters);
+ buf << ";";
+ }
+ //n += i->length() + 2;
+ }
+ if (full_titles)
+ buf << *i;
+ else
+ buf.write_until(i->c_str(), Const::id_delimiters);
+ //n += i->length();
+ //return n;
+}
+
+void print_hsp(Hsp_data &hsp, sequence query)
+{
+ Text_buffer buf;
+ Pairwise_format().print_match(Hsp_context(hsp, 0, query, query, "", 0, 0, "", 0, 0, 0), buf);
+ buf << '\0';
+ cout << buf.get_begin();
+}
+
Output_format* get_output_format()
{
const vector<string> &f = config.output_format;
@@ -43,6 +82,8 @@ Output_format* get_output_format()
return new DAA_format;
else if (f[0] == "0")
return new Pairwise_format;
+ else if (f[0] == "null")
+ return new Null_format;
else
throw std::runtime_error("Invalid output format. Allowed values: 5,6,100,101");
}
@@ -53,14 +94,13 @@ void XML_format::print_match(const Hsp_context &r, Text_buffer &out) const
if(r.hsp_num == 0) {
if (r.hit_num > 0)
out << " </Hit_hsps>" << '\n' << "</Hit>" << '\n';
+ string id, def;
+ get_title_def(r.subject_name, id, def);
out << "<Hit>" << '\n'
<< " <Hit_num>" << r.hit_num + 1 << "</Hit_num>" << '\n'
- << " <Hit_id>gnl|BL_ORD_ID|" << r.orig_subject_id + 1 << "</Hit_id>" << '\n'
- << " <Hit_def>";
- const bool lt = (config.salltitles || (config.command == Config::view)) ? true : false;
- this->print_salltitles(out, r.subject_name, lt, lt);
- out << "</Hit_def> " << '\n'
- << " <Hit_accession>" << r.orig_subject_id + 1 << "</Hit_accession>" << '\n'
+ << " <Hit_id>" << id << "</Hit_id>" << '\n'
+ << " <Hit_def>" << def << "</Hit_def>" << '\n';
+ out << " <Hit_accession>" << get_accession(id) << "</Hit_accession>" << '\n'
<< " <Hit_len>" << r.subject_len << "</Hit_len>" << '\n'
<< " <Hit_hsps>" << '\n';
}
@@ -114,8 +154,10 @@ void XML_format::print_header(Output_stream &f, int mode, const char *matrix, in
<< " <BlastOutput_reference>Benjamin Buchfink, Xie Chao, and Daniel Huson (2015), "Fast and sensitive protein alignment using DIAMOND", Nature Methods 12:59-60.</BlastOutput_reference>" << endl
<< " <BlastOutput_db></BlastOutput_db>" << endl
<< " <BlastOutput_query-ID>Query_1</BlastOutput_query-ID>" << endl
- << " <BlastOutput_query-def>" << first_query_name << "</BlastOutput_query-def>" << endl
- << " <BlastOutput_query-len>" << first_query_len << "</BlastOutput_query-len>" << endl
+ << " <BlastOutput_query-def>";
+ const string fqn = string(first_query_name);
+ ss << fqn.substr(0, fqn.find('\1'));
+ ss << "</BlastOutput_query-def>" << endl << " <BlastOutput_query-len>" << first_query_len << "</BlastOutput_query-len>" << endl
<< " <BlastOutput_param>" << endl
<< " <Parameters>" << endl
<< " <Parameters_matrix>" << matrix << "</Parameters_matrix>" << endl
@@ -132,9 +174,11 @@ void XML_format::print_header(Output_stream &f, int mode, const char *matrix, in
void XML_format::print_query_intro(size_t query_num, const char *query_name, unsigned query_len, Text_buffer &out, bool unaligned) const
{
out << "<Iteration>" << '\n'
- << " <Iteration_iter-num>" << query_num+1 << "</Iteration_iter-num>" << '\n'
- << " <Iteration_query-ID>Query_" << query_num+1 << "</Iteration_query-ID>" << '\n'
- << " <Iteration_query-def>" << query_name << "</Iteration_query-def>" << '\n'
+ << " <Iteration_iter-num>" << query_num + 1 << "</Iteration_iter-num>" << '\n'
+ << " <Iteration_query-ID>Query_" << query_num + 1 << "</Iteration_query-ID>" << '\n'
+ << " <Iteration_query-def>";
+ print_title(out, query_name, true, false, "");
+ out << "</Iteration_query-def>" << '\n'
<< " <Iteration_query-len>" << query_len << "</Iteration_query-len>" << '\n'
<< "<Iteration_hits>" << '\n';
}
diff --git a/src/output/output_format.h b/src/output/output_format.h
index 3047d65..4ed34a5 100644
--- a/src/output/output_format.h
+++ b/src/output/output_format.h
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2014-2016, University of Tuebingen, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef OUTPUT_FORMAT_H_
@@ -44,45 +44,24 @@ struct Output_format
{ }
virtual ~Output_format()
{ }
- static void print_salltitles(Text_buffer &buf, const char *id, bool full_titles, bool all_titles)
- {
- if (!all_titles) {
- buf.write_until(id, full_titles ? "\1" : Const::id_delimiters);
- return;
- }
- if (strchr(id, '\1') == 0) {
- buf.write_until(id, full_titles ? "\1" : Const::id_delimiters);
- return;
- }
- //size_t n = 0;
- const vector<string> t (tokenize(id, "\1"));
- vector<string>::const_iterator i=t.begin();
- for(;i<t.end()-1;++i) {
- if (full_titles)
- buf << *i << "<>";
- else {
- buf.write_until(i->c_str(), Const::id_delimiters);
- buf << ";";
- }
- //n += i->length() + 2;
- }
- if(full_titles)
- buf << *i;
- else
- buf.write_until(i->c_str(), Const::id_delimiters);
- //n += i->length();
- //return n;
- }
+ static void print_title(Text_buffer &buf, const char *id, bool full_titles, bool all_titles, const char *separator);
operator unsigned() const
{
return code;
}
unsigned code;
- enum { daa, blast_tab, blast_xml, sam, blast_pairwise };
+ enum { daa, blast_tab, blast_xml, sam, blast_pairwise, null };
};
extern auto_ptr<Output_format> output_format;
+struct Null_format : public Output_format
+{
+ Null_format() :
+ Output_format(null)
+ {}
+};
+
struct DAA_format : public Output_format
{
DAA_format():
@@ -117,7 +96,9 @@ struct XML_format : public Output_format
{
XML_format():
Output_format(blast_xml)
- {}
+ {
+ config.salltitles = true;
+ }
virtual void print_match(const Hsp_context &r, Text_buffer &out) const;
virtual void print_header(Output_stream &f, int mode, const char *matrix, int gap_open, int gap_extend, double evalue, const char *first_query_name, unsigned first_query_len) const;
virtual void print_query_intro(size_t query_num, const char *query_name, unsigned query_len, Text_buffer &out, bool unaligned) const;
@@ -131,7 +112,9 @@ struct Pairwise_format : public Output_format
{
Pairwise_format() :
Output_format(blast_pairwise)
- {}
+ {
+ config.salltitles = true;
+ }
virtual void print_match(const Hsp_context &r, Text_buffer &out) const;
virtual void print_header(Output_stream &f, int mode, const char *matrix, int gap_open, int gap_extend, double evalue, const char *first_query_name, unsigned first_query_len) const;
virtual void print_query_intro(size_t query_num, const char *query_name, unsigned query_len, Text_buffer &out, bool unaligned) const;
@@ -142,5 +125,6 @@ struct Pairwise_format : public Output_format
};
Output_format* get_output_format();
+void print_hsp(Hsp_data &hsp, sequence query);
#endif /* OUTPUT_FORMAT_H_ */
diff --git a/src/output/sam_format.cpp b/src/output/sam_format.cpp
index 61c29c3..932e406 100644
--- a/src/output/sam_format.cpp
+++ b/src/output/sam_format.cpp
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#include "output_format.h"
@@ -88,7 +88,7 @@ void Sam_format::print_match(const Hsp_context& r, Text_buffer &out) const
out << '\t' << '0' << '\t';
const bool lt = (config.salltitles || (config.command == Config::view)) ? true : false;
- this->print_salltitles(out, r.subject_name, lt, lt);
+ print_title(out, r.subject_name, lt, lt, "<>");
out << '\t'
<< r.subject_range().begin_ + 1 << '\t'
diff --git a/src/output/view.h b/src/output/view.h
index 7e6da26..21a2ba6 100644
--- a/src/output/view.h
+++ b/src/output/view.h
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2014-2016, University of Tuebingen, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef VIEW_H_
@@ -133,16 +133,23 @@ void view()
Binary_buffer buf;
size_t query_num;
- daa.read_query_buffer(buf, query_num);
- DAA_query_record r(daa, buf, query_num);
- Text_buffer out;
- view_query(r, out, *output_format);
-
- output_format->print_header(*writer.f_, daa.mode(), daa.score_matrix(), daa.gap_open_penalty(), daa.gap_extension_penalty(), daa.evalue(), r.query_name.c_str(), (unsigned)r.query_len());
- writer(out);
+ if (daa.read_query_buffer(buf, query_num)) {
+ DAA_query_record r(daa, buf, query_num);
+ Text_buffer out;
+ view_query(r, out, *output_format);
+
+ output_format->print_header(*writer.f_, daa.mode(), daa.score_matrix(), daa.gap_open_penalty(), daa.gap_extension_penalty(), daa.evalue(), r.query_name.c_str(), (unsigned)r.query_len());
+ writer(out);
+
+ View_context context(daa, writer, *output_format);
+ launch_thread_pool(context, config.threads_);
+ }
+ else {
+ Text_buffer out;
+ output_format->print_header(*writer.f_, daa.mode(), daa.score_matrix(), daa.gap_open_penalty(), daa.gap_extension_penalty(), daa.evalue(), "", 0);
+ writer(out);
+ }
- View_context context(daa, writer, *output_format);
- launch_thread_pool(context, config.threads_);
output_format->print_footer(*writer.f_);
}
diff --git a/src/run/benchmark.cpp b/src/run/benchmark.cpp
index 7c057f9..c2616d1 100644
--- a/src/run/benchmark.cpp
+++ b/src/run/benchmark.cpp
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#include "../basic/sequence.h"
@@ -27,6 +27,8 @@ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR P
#include "../dp/score_profile.h"
#include "../output/output_format.h"
+using std::list;
+
void benchmark_cmp()
{
#ifdef __SSE2__
@@ -87,19 +89,19 @@ int xdrop_window(const Letter *query, const Letter *subject)
&& *s != '\xff')
{
st += score_matrix(*q, *s);
- //score = std::max(score, st);
+ score = std::max(score, st);
++q;
++s;
++n;
st += score_matrix(*q, *s);
- //score = std::max(score, st);
+ score = std::max(score, st);
++q;
++s;
++n;
st += score_matrix(*q, *s);
- //score = std::max(score, st);
+ score = std::max(score, st);
++q;
++s;
++n;
@@ -107,6 +109,48 @@ int xdrop_window(const Letter *query, const Letter *subject)
return st;
}
+int xdrop_window2(const Letter *query, const Letter *subject)
+{
+ static const int window = 40;
+ int score(0), st(0), n = 0, i = 0;
+
+ const Letter *q(query), *s(subject);
+
+ st = score;
+ while (n < window
+ && *q != '\xff'
+ && *s != '\xff')
+ {
+ st += score_matrix(*q, *s);
+ if (st > score) {
+ score = st;
+ i = n;
+ }
+ ++q;
+ ++s;
+ ++n;
+
+ st += score_matrix(*q, *s);
+ if (st > score) {
+ score = st;
+ i = n;
+ }
+ ++q;
+ ++s;
+ ++n;
+
+ st += score_matrix(*q, *s);
+ if (st > score) {
+ score = st;
+ i = n;
+ }
+ ++q;
+ ++s;
+ ++n;
+ }
+ return st*i;
+}
+
void benchmark_ungapped(const Sequence_set &ss, unsigned qa, unsigned sa)
{
static const size_t n = 100000000llu;
@@ -126,7 +170,7 @@ void benchmark_ungapped(const Sequence_set &ss, unsigned qa, unsigned sa)
for (size_t i = 0; i < n; ++i) {
- //score += xdrop_window(q, s);
+ score += xdrop_window2(q, s);
//score += binary_ungapped(mask);
}
@@ -138,38 +182,83 @@ void benchmark_ungapped(const Sequence_set &ss, unsigned qa, unsigned sa)
void benchmark_greedy(const Sequence_set &ss, unsigned qa, unsigned sa)
{
- static const unsigned n = 100000;
- vector<Diagonal_segment> d;
- d.push_back(ungapped_extension(sa, qa, ss[0], ss[1]));
+ static const unsigned n = 1000;
+ vector<Seed_hit> d2;
+ d2.push_back(Seed_hit(0, 0, sa, qa, xdrop_ungapped(ss[0],ss[1],qa,sa)));
Long_score_profile qp(ss[0]);
- //greedy_align(ss[0], qp, ss[1], d[0], true);
- //greedy_align(ss[0], qp, ss[1], qa, sa, true);
- Hsp_data hsp;
- greedy_align2(ss[0], qp, ss[1], d, true, hsp);
- Text_buffer buf;
- Pairwise_format().print_match(Hsp_context(hsp, 0, ss[0], ss[0], "", 0, 0, "", 0, 0, 0), buf);
- buf << '\0';
- cout << buf.get_begin();
+ list<Hsp_data> hsp;
+ list<Hsp_traits> traits, traits2;
+ Bias_correction query_bc(ss[0]);
+ const int d = (int)qa - (int)sa, band = 7;
+ greedy_align(ss[0], qp, query_bc, ss[1], d2.begin(), d2.end(), true, hsp, traits, 0);
+ hsp.clear();
+ greedy_align(ss[0], qp, query_bc, ss[1], true, hsp, traits.begin(), traits.end(), traits2, 50, 0);
+ return;
+
+ Timer t;
+ t.start();
+
+ for (unsigned i = 0; i < n; ++i) {
+ traits.clear();
+ greedy_align(ss[0], qp, query_bc, ss[1], d2.begin(), d2.end(), true, hsp, traits, 0);
+ greedy_align(ss[0], qp, query_bc, ss[1], true, hsp, traits.begin(), traits.end(), traits2, 50, 0);
+ }
+ t.stop();
+
+ cout << " usec=" << t.getElapsedTimeInSec() / (double)n * 1000000.0 << endl;
+ cout << "t=" << t.getElapsedTimeInMicroSec() << endl;
+}
+void benchmark_banded(const Sequence_set &ss, unsigned qa, unsigned sa)
+{
+ static const unsigned n = 10000;
+ Hsp_data hsp;
+ const int d = (int)qa - (int)sa, band = 30, slen = (int)ss[1].length();
+ banded_sw(ss[0], ss[1], d - band, d + band, 0, slen, hsp);
+ cout << "Score = " << hsp.score << endl;
+ //print_hsp(hsp, ss[0]);
Timer t;
t.start();
for (unsigned i = 0; i < n; ++i) {
- //greedy_align(ss[0], qp, ss[1], d[0], false);
- //greedy_align(ss[0], qp, ss[1], qa, sa, false);
- greedy_align2(ss[0], qp, ss[1], d, false, hsp);
+ banded_sw(ss[0], ss[1], d - band, d + band, 0,slen,hsp);
+ hsp.score = 0;
}
t.stop();
cout << " usec=" << t.getElapsedTimeInSec() / (double)n * 1000000.0 << endl;
- cout << "t=" << t.getElapsedTimeInMicroSec() << endl;
+}
+
+void benchmark_swipe(const Sequence_set &ss)
+{
+ static const unsigned n = 1000;
+ vector<sequence> seqs;
+ vector<int> score(64);
+ for (int i = 0; i < 64; ++i)
+ seqs.push_back(ss[1]);
+ swipe(ss[0], seqs.begin(), seqs.end(), score.begin());
+ cout << "Score = " << score[0] << endl;
+ cout << needleman_wunsch(ss[0], ss[1], score[0], Local(), int()) << endl;
+ return;
+ Timer t;
+ t.start();
+
+ for (unsigned i = 0; i < n; ++i) {
+
+ swipe(ss[0], seqs.begin(), seqs.end(), score.begin());
+
+ }
+ t.stop();
+
+ cout << "usec=" << t.getElapsedTimeInSec() / (double)n * 1000000.0 << endl;
+ cout << "gcups=" << 64*ss[0].length()*ss[1].length()*n / t.getElapsedTime() / 1e9 << endl;
}
void benchmark_floating(const Sequence_set &ss, unsigned qa, unsigned sa)
{
- static const unsigned n = 10000;
+ static const unsigned n = 100000;
uint64_t cell_updates = 0;
local_match hsp(0, 0, &ss[1][sa]);
@@ -182,13 +271,14 @@ void benchmark_floating(const Sequence_set &ss, unsigned qa, unsigned sa)
floating_sw(&ss[0][qa],
hsp.subject_,
hsp,
- 32,
+ 5,
score_matrix.rawscore(config.gapped_xdrop),
- config.gap_open + config.gap_extend,
- config.gap_extend,
+ score_matrix.gap_open() + score_matrix.gap_extend(),
+ score_matrix.gap_extend(),
cell_updates,
hsp.query_anchor_,
hsp.subject_anchor,
+ 0,
No_score_correction(),
Score_only());
@@ -197,6 +287,7 @@ void benchmark_floating(const Sequence_set &ss, unsigned qa, unsigned sa)
cout << hsp.score << ' ' << cell_updates << endl;
cout << "gcups=" << (double)cell_updates / 1e9 / t.getElapsedTimeInSec() << " n/sec=" << (double)n / t.getElapsedTimeInSec() << endl;
+ cout << " usec=" << t.getElapsedTimeInSec() / (double)n * 1000000.0 << endl;
}
}
@@ -205,7 +296,7 @@ void benchmark_sw()
Sequence_set ss;
vector<Letter> s1, s2;
unsigned qa = 0, sa = 0;
- goto aln1;
+ goto aln1;
/*
> d2va1a_ c.73.1.0 (A:) automated matches {Ureaplasma parvum [TaxId:
@@ -223,8 +314,6 @@ Query 80 TVIGHL 85
T+I L
Sbjct 76 TIINGL 81 */
-aln1:
-
s1 = sequence::from_string("SLFEQLGGQAAVQAVTAQFYANIQADATVATFFNGIDMPNQTNKTAAFLCAALGGPNAWTGRNLKEVHANMGVSNAQFTTVIGHLRSALTGAGVAAALVEQTVAVAETVRGDVVTV");
s2 = sequence::from_string("RKQRIVIKISGACLKQNDSSIIDFIKINDLAEQIEKISKKYIVSIVLGGGNIWRGSIAKELDMDRNLADNMGMMATIINGLALENALNHLNVNTIVLSAIKCDKLVHESSANNIKKAIEKEQVMIFVAGTGFPYFTTDSCAAIRAAETESSIILMGKNGVDGVYDSDPKINPNAQFYEHITFNMALTQNLKVMDATALALCQENNINLLVFNIDKPNAIVDVLEKKNKYTIVSK");
qa = 23;
@@ -324,7 +413,6 @@ KCLEHLFFFKLIGDTPIDTFLMEMLEAPHQIT");
*/
-
s1 = sequence::from_string("tspmtpditgkpfvaadasndyikrevmipmrdgvklhtvivlpkgaknapivltrtpyd\
asgrterlasphmkdllsagddvfveggyirvfqdvrgkygsegdyvmtrplrgplnpse\
vdhatdawdtidwlvknvsesngkvgmigssyegftvvmaltnphpalkvavpespmidg\
@@ -332,7 +420,7 @@ wmgddwfnygafrqvnfdyftgqlskrgkgagiarqghddysnflqagsagdfakaagle\
qlpwwhkltehaaydafwqeqaldkvmartplkvptmwlqglwdqedmwgaihsyaamep\
rdkrntlnylvmgpwrhsqvnydgsalgalnfegdtarqfrhdvlrpffdqylvdgapka\
dtppvfiyntgenhwdrlkaw");
-
+
s2 = sequence::from_string("MVDGNYSVASNVMVPMRDGVRLAVDLYRPDADGPVPVLLVRNPYDKFDVFAWSTQSTNWL\
EFVRDGYAVVIQDTRGLFASEGEFVPHVDDEADAEDTLSWILEQAWCDGNVGMFGVSYLG\
VTQWQAAVSGVGGLKAIAPSMASADLYRAPWYGPGGALSVEALLGWSALIGTGLITSRSD\
@@ -346,6 +434,190 @@ VIAREQLEEMCTAVNRIHRGPEHPSHIVLPIIKR");
qa = 19;
sa = 4;
+ goto ende;
+
+ /*
+
+ Query= 488:2:1:298:839
+
+ Length=114
+
+ >sp|Q820R1|RS3_NITEU 30S ribosomal protein S3 OS=Nitrosomonas europaea (strain ATCC 19718 / NBRC 14298) GN=rpsC PE=3 SV=1
+Length=215
+
+ Score = 46.6 bits (105!), Expect = 6.8e-05
+ Identities = 22/34 (64%), Positives = 27/34 (79%), Gaps = 5/34 (14%)
+ Frame = 2
+
+Query 1 PLHTLRADIDYGT--ARALYPGAGIIGVQVWIYK 32
+ PLHTLRA++DYGT AR Y GIIGV+VW++K
+Sbjct 174 PLHTLRAEVDYGTSEARTTY---GIIGVKVWVFK 204
+ */
+
+aln1:
+ s1 = sequence::from_string("PLHTLRADIDYGTARALYPGAGIIGVQVWIYK");
+ s2 = sequence::from_string("MGQKINPTGFRLSVLKNWSSRWYTNTKKFSDFLNEDISVRQYLQKKLAHASVGSIIIERPSKNAKITIHTSRPGVVIGKKGEDIEILRRNVEKLMNVPVHINIEEIRKPEIDAQLIAASITQQLEKRIMFRRAMKRAIQNAMRLGAQGIKIMSSGRLNGIEIARTEWYREGRVPLHTLRAEVDYGTSEARTTYGIIGVKVWVFKGEQLGIKERQN");
+
+ qa = 0;
+ sa = 173;
+ goto ende;
+
+ /*
+
+ Score = 1694.5 bits (4387), Expect = 0.0e+00
+ Identities = 0/2122 (0%), Positives = 0/2122 (0%), Gaps = 0/2122 (0%)
+
+ Query 4975 PATRPERVPLSFAQQRLWFLHRMQGPSPTYNVPVVLRLDGELHRDALVAAVRDVVVRHES 5034
+ P T+ + L+ AQ +WF ++ +P YN + ++G ++ A+R V+ ES
+ Sbjct 2 PDTKDLQYSLTGAQTGIWFAQQLDPDNPIYNTAEYIEINGPVNIALFEEALRHVIKEAES 61
+
+ Query 5035 LRTVFPDVEGTPYQHVLAEFEPAVSFVD-TTDLDADLTELARHAFDLATELPI------R 5087
+ L F + P+Q + + + +D +++ D + T L DLA + +
+ Sbjct 62 LHVRFGENMDGPWQMINPSPDVQLHVIDVSSEPDPEKTALNWMKADLAKPVDLGYAPLFN 121
+
+ Query 5088 VTVLSTSPTEHALLLLTHHIASDGWSTEPLSRDFAHAYAARTRGEQPE---FTPLPVQYA 5144
+ + P HHIA DG+ +++ A Y A +G+ + F L
+ Sbjct 122 EALFIAGPDRFFWYQRIHHIAIDGFGFSLIAQRVASTYTALIKGQTAKSRSFGSLQAILE 181
+
+ Query 5145 DYTLWQQDLLGSEQDPTSLLSRQVEFWRTALADLPELLQLPTDRPRPAVASYEGGALDFE 5204
+ + T D GSEQ + +FW AD PE++ L PR + + A
+ Sbjct 182 EDT----DYRGSEQ-----YEKDRQFWLDRFADAPEVVSLADRAPRTSNSFLRHTAY--- 229
+
+ Query 5205 FTPELHRGVTELAERTGTTVFMVMQAALSTLFTKLGAGTDIPLGTPIAGRTDEALEELVG 5264
+ P + E A + VM A + ++ D+ LG P+ GR A +
+ Sbjct 230 LPPSDVNALKEAARYFSGSWHEVMIAVSAVYVHRMTGSEDVVLGLPMMGRIGSASLNVPA 289
+
+ Query 5265 FFVNTLVLRTDTSGDPGFGQVLERVREANLAAYAHQDVPFERLVEVLNPTRSLAHHPLF- 5323
+ +N L LR S F ++++++ + H E L L +H LF
+ Sbjct 290 MVMNLLPLRLTVSSSMSFSELIQQISREIRSIRRHHKYRHEELRRDLKLIGE--NHRLFG 347
+
+ Query 5324 -QVMMTLHNSSADGPGLEGVDTGVA---TVDLQFTLQESFDANGSPAGLGGDVEYATDLF 5379
+ Q+ + + D G+ G ++ DL + + D +GL DV+ +++
+ Sbjct 348 PQINLMPFDYGLDFAGVRGTTHNLSAGPVDDLSINVYDRTDG----SGLRIDVDANPEVY 403
+
+ Query 5380 GPDSVRLLLTRLETLLAAVVADPRRPISRIDLLTAQERTQVLRTWNDTAREVPALTVPQL 5439
+ ++L R+ LL A I +++LL +E+ +V+ WN+TA+ +++ +
+ Sbjct 404 SESDIKLHQQRILQLLQTASAGEDMLIGQMELLLPEEKEKVISKWNETAKSEKLVSLQDM 463
+
+ Query 5440 FQAHAQGSPEATALVFGAEQVSYVELNVRANQLAHHLIAQGVGPERIVAVALPRSVDLVV 5499
+ F+ A +PE AL+ QV+Y +LN AN+LA LI +G+GPE+ VA+ALPRS ++V
+ Sbjct 464 FEKQAVLTPERIALMCDDIQVNYRKLNEEANRLARLLIEKGIGPEQFVALALPRSPEMVA 523
+
+ Query 5500 ALLAVLKTGAAYLPIDPGYPAERIGYILADAQPVCLLSTGNGPRT-----EVPTVLLDSA 5554
+ ++L VLKTGAAYLP+DP +PA+RI Y+L DA+P C+++T + VP ++LD A
+ Sbjct 524 SMLGVLKTGAAYLPLDPEFPADRISYMLEDAKPSCIITTEEIAASLPDDLAVPELVLDQA 583
+
+ Query 5555 AKQAELAELSGADPESTVDLRNPAYTIYTSGSTGRPKGVVVTVGDLANFLAAMTDRIGLT 5614
+ Q + S + + +V L +PAY IYTSGSTGRPKGVVVT L+NFL +M + L
+ Sbjct 584 VTQEIIKRYSPENQDVSVSLDHPAYIIYTSGSTGRPKGVVVTQKSLSNFLLSMQEAFSLG 643
+
+ Query 5615 PEDRLLAVTTVAFDIAGLEIHLPLVSGARVVLAAESQVRDPAALTALARTSGATVMQATP 5674
+ EDRLLAVTTVAFDI+ LE++LPL+SGA++V+A + +R+P AL + +MQATP
+ Sbjct 644 EEDRLLAVTTVAFDISALELYLPLISGAQIVIAKKETIREPQALAQMIENFDINIMQATP 703
+
+ Query 5675 SLWQAALAEDAESLRGMRMLVGGEALPAALAGTMAEVGAEVVNLYGPTETTIWSASAAIS 5734
+ +LW A + + E LRG+R+LVGGEALP+ L + ++ V NLYGPTETTIWSA+A +
+ Sbjct 704 TLWHALVTSEPEKLRGLRVLVGGEALPSGLLQELQDLHCSVTNLYGPTETTIWSAAAFLE 763
+
+ Query 5735 DG--SVPPIGRPIANTAVYVLDSALAPVPVGVTGELYIAGDGLARGYANRPGLTAERFSA 5792
+ +G VPPIG+PI NT VYVLD+ L PVP GV GELYIAG GLARGY +RP LTAERF A
+ Sbjct 764 EGLKGVPPIGKPIWNTQVYVLDNGLQPVPPGVVGELYIAGTGLARGYFHRPDLTAERFVA 823
+
+ Query 5793 DPFGEPGSRMYRTGDLARWRADGQLEYLARVDDQVKLRGFRIELGEIESVLTAHPEVTRA 5852
+ DP+G PG+RMYRTGD ARWRADG L+Y+ R D Q+K+RGFRIELGEI++VL HP + +A
+ Sbjct 824 DPYGPPGTRMYRTGDQARWRADGSLDYIGRADHQIKIRGFRIELGEIDAVLANHPHIEQA 883
+
+ Query 5853 AVLVREE-----RLVAYVVGSA--DVSGLRALAQSRLPEYMVPSAYVSLDTLPLTPNGKL 5905
+ AV+VRE+ RL AYVV A D + LR + LP+YMVPSA+V +D LPLTPNGKL
+ Sbjct 884 AVVVREDQPGDKRLAAYVVADAAIDTAELRRYMGASLPDYMVPSAFVEMDELPLTPNGKL 943
+
+ Query 5906 DRKALPAPDFGAESGGRAARTPAEQVLCGLFADVLGAERVGIEDNFFELGGHSLLATKLI 5965
+ DRKALPAPDF RA RTP E++LC LFA+VLG RVGI+D+FFELGGHSLLA +L+
+ Sbjct 944 DRKALPAPDFSTSVSDRAPRTPQEEILCDLFAEVLGLARVGIDDSFFELGGHSLLAARLM 1003
+
+ Query 5966 SRIRVALGVEVPIQALFEAPTVAGLAERLTGAATGRRALEPMARPQRVPLSYAQRRLWFL 6025
+ SRIR +G E+ I LF+ PTVAGLA L A + AL+ RP+++PLS+AQRRLWFL
+ Sbjct 1004 SRIREVMGAELGIAKLFDEPTVAGLAAHLDLAQSACPALQRAERPEKIPLSFAQRRLWFL 1063
+
+ Query 6026 NRLEGPSATYNLPLVLRLSGSVDASALAAALRDVVGRHESLRTVFPDSSADPHQIILSPA 6085
+ + LEGPS TYN+P+ +RLSG +D L AAL D+V RHESLRT+FP+S +Q IL
+ Sbjct 1064 HCLEGPSPTYNIPVAVRLSGELDQGLLKAALYDLVCRHESLRTIFPESQGTSYQHILDAD 1123
+
+ Query 6086 EAQPRFDTVELSAAELDGAIAEAAQYRFDLAAELPIRAWLFTVSPTEHAVVLLMHHIASD 6145
+ A P E++ EL +AEA +Y FDLAAE RA LF + P E+ ++LL+HHI D
+ Sbjct 1124 RACPELHVTEIAEKELSDRLAEAVRYSFDLAAEPAFRAELFVIGPDEYVLLLLVHHIVGD 1183
+
+ Query 6146 GWSMTPLSQDLAHAYTARCRGEEPVFAPLPVQYADYTLWQHEVLGDEQDADSVLAQQVAH 6205
+ GWS+TPL++DL AY ARC G P +APL VQYADY LWQ E+LG+E D +S++A Q+A
+ Sbjct 1184 GWSLTPLTRDLGTAYAARCHGRSPEWAPLAVQYADYALWQQELLGNEDDPNSLIAGQLAF 1243
+
+ Query 6206 WQRTLAGAPELLELPTDRPRPAVASYQGGILEFELDYEVHKGLSTLAKRSGTTLFMVVQS 6265
+ W+ TL P+ LELPTD RPA S+ G + F ++ E HK L LA+ + +LFMV+QS
+ Sbjct 1244 WKETLKNLPDQLELPTDYSRPAEPSHDGDTIHFRIEPEFHKRLQELARANRVSLFMVLQS 1303
+
+ Query 6266 ALATLFTKLGAGTDIPLGTAIAGRTDEAVEDLIGFFVNTLVLRTDTSGDPSFTELLGRVR 6325
+ LA L T+LGAGTDIP+G+ IAGR D+A+ DL+G F+NTLVLRTDTSGDPSF ELL RVR
+ Sbjct 1304 GLAALLTRLGAGTDIPIGSPIAGRNDDALGDLVGLFINTLVLRTDTSGDPSFRELLDRVR 1363
+
+ Query 6326 QTDLAAFAHQEVPFERLVEVLNPARSLAHHPLFQVLLTVQNTEQAQLRLPGAEVEFDGAG 6385
+ + +LAA+ +Q++PFERLVEVLNPARS A HPLFQ++L QNT A+L LP E
+ Sbjct 1364 EVNLAAYDNQDLPFERLVEVLNPARSRATHPLFQIMLAFQNTPDAELHLPDMESSLRINS 1423
+
+ Query 6386 TGVAKFDLAFSLEE------SGEGLEGLVEYAADLFDRDTVQRLVDRLITLLRGVITDPE 6439
+ G AKFDL + E + G+EGL+EY+ DLF R+T Q L DRL+ LL +DP+
+ Sbjct 1424 VGSAKFDLTLEISEDRLADGTPNGMEGLLEYSTDLFKRETAQALADRLMRLLEAAESDPD 1483
+
+ Query 6440 LPISQLDVLSDTERGQLLGEWNETAASTEDGVLAELFAARVRRDPQAPALAFEGTTLTYG 6499
+ I LD+L+ E ++ +W + L E F + P A A+ +E L+Y
+ Sbjct 1484 EQIGNLDILAPEEHSSMVTDWQSVSEKIPHACLPEQFEKQAALRPDAIAVVYENQELSYA 1543
+
+ Query 6500 ELDARANRLAHKLVELGAGPERFVAVAVPRSVEMVVALLAVAKSGAAYVPVDPSYPADRV 6559
+ EL+ RANRLA ++ G GPE+FVA+A+PRS+EM V LLAV K+GAAY+P+DP YPADR+
+ Sbjct 1544 ELNERANRLARMMISEGVGPEQFVALALPRSLEMAVGLLAVLKAGAAYLPLDPDYPADRI 1603
+
+ Query 6560 AYMLSDAAPSLVLTTSGTG---LAGLRLDELELDGPDTAPAVTGLGLGSP---------- 6606
+ A+ML DA P+ ++T + + ++ LD P+ A + G+P
+ Sbjct 1604 AFMLKDAQPAFIMTNTKAANHIPPVENVPKIVLDDPELAEKLNTYPAGNPKNKDRTQPLS 1663
+
+ Query 6607 ----AYVIYTSGSTGRPKGVVVTHSGLASLVAAQVGAFGVGPGSRVLQFASLSFDAASWE 6662
+ AYVIYTSGSTG PKGV++ H + L AA F G F S +FD + WE
+ Sbjct 1664 PLNTAYVIYTSGSTGVPKGVMIPHQNVTRLFAATEHWFRFSSGDIWTMFHSYAFDFSVWE 1723
+
+ Query 6663 VCMGLLSGACLVVAPADRVLPGEPLAELVAEHAVTHVTLPPTAL-----AALPANGLPEG 6717
+ + LL G LV+ P E L+ + VT + P+A A L +
+ Sbjct 1724 IWGPLLHGGRLVIVPHHVSRSPEAFLRLLVKEGVTVLNQTPSAFYQFMQAEREQPDLGQA 1783
+
+ Query 6718 MTL---VVAGEATQPSTVEQW-----SAGRTMINAYGPTETTVCATMSGPLSGAVVA--- 6766
+ ++L + GEA + S +E W +IN YG TETTV + L ++ A
+ Sbjct 1784 LSLRYVIFGGEALELSRLEDWYNRHPENRPQLINMYGITETTVHVSYI-ELDRSMAALRA 1842
+
+ Query 6767 --PIGRPITNSRVYVLDAGLRPVPPGTTGELYVAGASLARGYHNRPGLTAERFVASPFG- 6823
+ IG I + VYVLD L+PVPPG GELYV+GA LARGY RPGLT+ERF+A PFG
+ Sbjct 1843 NSLIGCGIPDLGVYVLDERLQPVPPGVAGELYVSGAGLARGYLGRPGLTSERFIADPFGP 1902
+
+ Query 6824 VGERLYRTGDLAKWRVDGQLEYVGRADHQVKVRGFRIELGEIESVLAAHPAIAEVAAVVR 6883
+ G R+YRTGD+A+ R DG L+YVGRADHQVK+RGFRIELGEIE+ L HP + + A +VR
+ Sbjct 1903 PGTRMYRTGDVARLRADGSLDYVGRADHQVKIRGFRIELGEIEAALVQHPQLEDAAVIVR 1962
+
+ Query 6884 EDQPGDRRIVAYLVAAGQAPGSAELRSVVGAALPEYMVPSAFVVLPAIPLLPNGKVDRKA 6943
+ EDQPGD+R+ AY++ + + +AELR LP+YMVP+AFV + +PL PNGK+DRKA
+ Sbjct 1963 EDQPGDKRLAAYVIPSEETFDTAELRRYAAERLPDYMVPAAFVTMKELPLTPNGKLDRKA 2022
+
+ Query 6944 LPAPDFTAVSTGRAPRTSREELLCGLYAEVLGLPEVGIDANFFELGGDSILSLQVVSRAR 7003
+ LPAPDF A TGR PRT +EE+LC L+ EVL LP VGID FF+LGG S+L++Q++SR R
+ Sbjct 2023 LPAPDFAAAVTGRGPRTPQEEILCDLFMEVLHLPRVGIDDRFFDLGGHSLLAVQLMSRIR 2082
+
+ Query 7004 NA-GIVISARDVFRYGTPAALA 7024
+ A G+ +S ++F T A LA
+ Sbjct 2083 EALGVELSIGNLFEAPTVAGLA 2104
+
+ */
+
+ s1 = sequence::from_string("MDGSTMRVDEGTASLAALVQARAERAPGRRALVFEDTALTYRELTEQAHRLARFLLARGVAPGQRVALALPRSVSMIVAMLAVAEVGAAYVPVDPDYPADRIAFMVADSAPALLLTDSTVAGALPELAAPRLLLDDPEIAAAVASQEDTGVGIEVSPASPAYVIYTSGSTGRPKGVVVPHAGVVNHMLWQAEAWDVDEQDVVLARTAFSFDAAGSEIWLPLLAGATICLAPSTVTRDPEALVAYAARHGVTVAQFVPSLLAVTAEAIARAENLALRLVFVAGEVLPPTLAEQVVSEWGVRLAHLYGPTEASVDVTGYEARPGCGNAPLPIGRPVWNTSAYVLDEALRPVELGVTGELYVAGVQLAHGYLNRPGLSAERFVADPFGEPGTRMYRTGDLARWNANDELDFLGRADDQVKVRGFRIELGEIEAALAQCEGVRRVAVLVREDQPGDKRIVAYVVSDV [...]
+ s2 = sequence::from_string("MPDTKDLQYSLTGAQTGIWFAQQLDPDNPIYNTAEYIEINGPVNIALFEEALRHVIKEAESLHVRFGENMDGPWQMINPSPDVQLHVIDVSSEPDPEKTALNWMKADLAKPVDLGYAPLFNEALFIAGPDRFFWYQRIHHIAIDGFGFSLIAQRVASTYTALIKGQTAKSRSFGSLQAILEEDTDYRGSEQYEKDRQFWLDRFADAPEVVSLADRAPRTSNSFLRHTAYLPPSDVNALKEAARYFSGSWHEVMIAVSAVYVHRMTGSEDVVLGLPMMGRIGSASLNVPAMVMNLLPLRLTVSSSMSFSELIQQISREIRSIRRHHKYRHEELRRDLKLIGENHRLFGPQINLMPFDYGLDFAGVRGTTHNLSAGPVDDLSINVYDRTDGSGLRIDVDANPEVYSESDIKLHQQRILQLLQTASAGEDMLIGQMELLLPEEKEKVISKWNETAKSEKLVSLQDM [...]
+ qa = 4975;
+ sa = 2;
+ goto ende;
ende:
ss.push_back(s1);
@@ -355,6 +627,8 @@ VIAREQLEEMCTAVNRIHRGPEHPSHIVLPIIKR");
//benchmark_floating(ss, qa, sa);
//benchmark_greedy(ss, qa, sa);
//benchmark_cmp();
- benchmark_ungapped(ss, qa, sa);
+ //benchmark_ungapped(ss, qa, sa);
+ //benchmark_swipe(ss);
+ //benchmark_banded(ss, qa, sa);
}
\ No newline at end of file
diff --git a/src/run/double_indexed.cpp b/src/run/double_indexed.cpp
index b01cd8a..a6fa614 100644
--- a/src/run/double_indexed.cpp
+++ b/src/run/double_indexed.cpp
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#include <iostream>
@@ -29,6 +29,8 @@ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR P
#include "../output/output_format.h"
#include "../data/frequent_seeds.h"
#include "../output/daa_write.h"
+#include "../data/taxonomy.h"
+#include "../basic/masking.h"
using std::endl;
using std::cout;
@@ -57,7 +59,6 @@ struct Search_context
};
void process_shape(unsigned sid,
- Timer &timer_mapping,
unsigned query_chunk,
char *query_buffer,
char *ref_buffer)
@@ -72,34 +73,52 @@ void process_shape(unsigned sid,
current_range = range;
task_timer timer("Building reference index", true);
- sorted_list ref_idx(ref_buffer,
- *ref_seqs::data_,
- shapes[sid],
- ref_hst.get(sid),
- range,
- ref_hst.partition());
+ sorted_list *ref_idx;
+ if (config.algo == Config::query_indexed)
+ ref_idx = new sorted_list(ref_buffer,
+ *ref_seqs::data_,
+ sid,
+ ref_hst.get(sid),
+ range,
+ ref_hst.partition(),
+ query_seeds);
+ else if (query_seeds_hashed != 0)
+ ref_idx = new sorted_list(ref_buffer,
+ *ref_seqs::data_,
+ sid,
+ ref_hst.get(sid),
+ range,
+ ref_hst.partition(),
+ query_seeds_hashed);
+ else
+ ref_idx = new sorted_list(ref_buffer,
+ *ref_seqs::data_,
+ sid,
+ ref_hst.get(sid),
+ range,
+ ref_hst.partition(),
+ &no_filter);
timer.go("Building query index");
- timer_mapping.resume();
sorted_list query_idx(query_buffer,
*query_seqs::data_,
- shapes[sid],
+ sid,
query_hst.get(sid),
range,
- query_hst.partition());
+ query_hst.partition(),
+ &no_filter);
timer.go("Building seed filter");
- frequent_seeds.build(sid, range, ref_idx, query_idx);
+ frequent_seeds.build(sid, range, *ref_idx, query_idx);
timer.go("Searching alignments");
- Search_context context(sid, ref_idx, query_idx);
+ Search_context context(sid, *ref_idx, query_idx);
launch_scheduled_thread_pool(context, Const::seedp, config.threads_);
+ delete ref_idx;
}
- timer_mapping.stop();
}
void run_ref_chunk(Database_file &db_file,
- Timer &timer_mapping,
Timer &total_timer,
unsigned query_chunk,
pair<size_t, size_t> query_len_bounds,
@@ -108,8 +127,12 @@ void run_ref_chunk(Database_file &db_file,
vector<Temp_file> &tmp_file)
{
task_timer timer("Building reference histograms");
- const pair<size_t, size_t> len_bounds = ref_seqs::data_->len_bounds(shapes[0].length_);
- ref_hst = Partitioned_histogram(*ref_seqs::data_, (unsigned)len_bounds.second);
+ if(config.algo==Config::query_indexed)
+ ref_hst = Partitioned_histogram(*ref_seqs::data_, false, query_seeds);
+ else if(query_seeds_hashed != 0)
+ ref_hst = Partitioned_histogram(*ref_seqs::data_, true, query_seeds_hashed);
+ else
+ ref_hst = Partitioned_histogram(*ref_seqs::data_, false, &no_filter);
ref_map.init(safe_cast<unsigned>(ref_seqs::get().get_length()));
@@ -117,20 +140,17 @@ void run_ref_chunk(Database_file &db_file,
char *ref_buffer = sorted_list::alloc_buffer(ref_hst);
timer.go("Initializing temporary storage");
- timer_mapping.resume();
Trace_pt_buffer::instance = new Trace_pt_buffer(query_seqs::data_->get_length() / align_mode.query_contexts,
config.tmpdir,
config.query_bins);
timer.finish();
- timer_mapping.stop();
-
+
for (unsigned i = 0; i < shapes.count(); ++i)
- process_shape(i, timer_mapping, query_chunk, query_buffer, ref_buffer);
+ process_shape(i, query_chunk, query_buffer, ref_buffer);
timer.go("Deallocating buffers");
delete[] ref_buffer;
- timer_mapping.resume();
Output_stream* out;
if (blocked_processing) {
timer.go("Opening temporary output file");
@@ -148,7 +168,6 @@ void run_ref_chunk(Database_file &db_file,
Intermediate_record::finish_file(*out);
delete out;
}
- timer_mapping.stop();
timer.go("Deallocating reference");
delete ref_seqs::data_;
@@ -157,27 +176,64 @@ void run_ref_chunk(Database_file &db_file,
}
void run_query_chunk(Database_file &db_file,
- Timer &timer_mapping,
Timer &total_timer,
unsigned query_chunk,
- pair<size_t, size_t> query_len_bounds,
Output_stream &master_out,
Output_stream *unaligned_file)
{
- task_timer timer("Allocating buffers", true);
+ static const double max_coverage = 0.15;
+
+ task_timer timer("Building query seed set");
+ if (query_chunk == 0)
+ setup_search_cont();
+ if (config.algo == -1) {
+ query_seeds = new Seed_set(query_seqs::get(), max_coverage);
+ timer.finish();
+ log_stream << "Seed space coverage = " << query_seeds->coverage() << endl;
+ if (query_seeds->coverage() >= max_coverage) {
+ config.algo = Config::double_indexed;
+ delete query_seeds;
+ query_seeds = 0;
+ }
+ else
+ config.algo = Config::query_indexed;
+ }
+ else if (config.algo == Config::query_indexed) {
+ query_seeds = new Seed_set(query_seqs::get(), 2);
+ timer.finish();
+ log_stream << "Seed space coverage = " << query_seeds->coverage() << endl;
+ }
+ else
+ timer.finish();
+ if (query_chunk == 0)
+ setup_search();
+ if (config.algo == Config::double_indexed && config.small_query) {
+ timer.go("Building query seed hash set");
+ query_seeds_hashed = new Hashed_seed_set(query_seqs::get());
+ }
+
+ timer.go("Building query histograms");
+ const pair<size_t, size_t> query_len_bounds = query_seqs::data_->len_bounds(shapes[0].length_);
+ setup_search_params(query_len_bounds, 0);
+ query_hst = Partitioned_histogram(*query_seqs::data_, false, &no_filter);
+ timer.finish();
+ //const bool long_addressing_query = query_seqs::data_->raw_len() > (size_t)std::numeric_limits<uint32_t>::max();
+
+ timer.go("Allocating buffers");
char *query_buffer = sorted_list::alloc_buffer(query_hst);
vector<Temp_file> tmp_file;
query_aligned.clear();
query_aligned.insert(query_aligned.end(), query_ids::get().get_length(), false);
- timer.finish();
-
db_file.rewind();
+ timer.finish();
+
for (current_ref_block = 0; db_file.load_seqs(); ++current_ref_block)
- run_ref_chunk(db_file, timer_mapping, total_timer, query_chunk, query_len_bounds, query_buffer, master_out, tmp_file);
+ run_ref_chunk(db_file, total_timer, query_chunk, query_len_bounds, query_buffer, master_out, tmp_file);
timer.go("Deallocating buffers");
- timer_mapping.resume();
delete[] query_buffer;
+ delete query_seeds;
+ query_seeds = 0;
if (blocked_processing) {
timer.go("Joining output blocks");
@@ -193,13 +249,13 @@ void run_query_chunk(Database_file &db_file,
delete query_seqs::data_;
delete query_ids::data_;
delete query_source_seqs::data_;
- timer_mapping.stop();
}
-void master_thread(Database_file &db_file, Timer &timer_mapping, Timer &total_timer)
+void master_thread(Database_file &db_file, Timer &total_timer)
{
+ if(config.query_file.empty())
+ std::cerr << "Query file parameter (--query/-q) is missing. Input will be read from stdin." << endl;
task_timer timer("Opening the input file", true);
- timer_mapping.start();
auto_ptr<Input_stream> query_file(Compressed_istream::auto_detect(config.query_file));
const Sequence_file_format *format_n(guess_format(*query_file));
@@ -214,44 +270,34 @@ void master_thread(Database_file &db_file, Timer &timer_mapping, Timer &total_ti
auto_ptr<Output_stream> unaligned_file;
if (!config.unaligned.empty())
unaligned_file = auto_ptr<Output_stream>(new Output_stream(config.unaligned));
- timer_mapping.stop();
timer.finish();
for (;; ++current_query_chunk) {
task_timer timer("Loading query sequences", true);
- timer_mapping.resume();
size_t n_query_seqs;
- n_query_seqs = load_seqs(*query_file, *format_n, &query_seqs::data_, query_ids::data_, query_source_seqs::data_, (size_t)(config.chunk_size * 1e9));
+ n_query_seqs = load_seqs(*query_file, *format_n, &query_seqs::data_, query_ids::data_, &query_source_seqs::data_, (size_t)(config.chunk_size * 1e9), config.qfilt);
if (n_query_seqs == 0)
break;
timer.finish();
query_seqs::data_->print_stats();
if (current_query_chunk == 0 && *output_format != Output_format::daa)
- output_format->print_header(*master_out, align_mode.mode, config.matrix.c_str(), config.gap_open, config.gap_extend, config.max_evalue, query_ids::get()[0].c_str(),
+ output_format->print_header(*master_out, align_mode.mode, config.matrix.c_str(), score_matrix.gap_open(), score_matrix.gap_extend(), config.max_evalue, query_ids::get()[0].c_str(),
unsigned(align_mode.query_translated ? query_source_seqs::get()[0].length() : query_seqs::get()[0].length()));
- if (align_mode.sequence_type == amino_acid && config.seg == "yes") {
- timer.go("Running complexity filter");
- Complexity_filter::get().run(*query_seqs::data_);
- }
-
- timer.go("Building query histograms");
- const pair<size_t, size_t> query_len_bounds = query_seqs::data_->len_bounds(shapes[0].length_);
- setup_search_params(query_len_bounds, 0);
- query_hst = Partitioned_histogram(*query_seqs::data_, (unsigned)query_len_bounds.second);
- timer_mapping.stop();
- timer.finish();
- //const bool long_addressing_query = query_seqs::data_->raw_len() > (size_t)std::numeric_limits<uint32_t>::max();
+ if (config.masking == 1) {
+ timer.go("Masking queries");
+ mask_seqs(*query_seqs::data_, Masking::get());
+ timer.finish();
+ }
- run_query_chunk(db_file, timer_mapping, total_timer, current_query_chunk, query_len_bounds, *master_out, unaligned_file.get());
+ run_query_chunk(db_file, total_timer, current_query_chunk, *master_out, unaligned_file.get());
}
timer.go("Closing the input file");
query_file->close();
timer.go("Closing the output file");
- timer_mapping.resume();
if (*output_format == Output_format::daa)
finish_daa(*master_out);
else
@@ -259,20 +305,18 @@ void master_thread(Database_file &db_file, Timer &timer_mapping, Timer &total_ti
master_out->close();
if (unaligned_file.get())
unaligned_file->close();
- timer_mapping.stop();
-
+
timer.go("Closing the database file");
db_file.close();
timer.finish();
message_stream << "Total time = " << total_timer.getElapsedTimeInSec() << "s" << endl;
- verbose_stream << "Mapping time = " << timer_mapping.getElapsedTimeInSec() << "s" << endl;
statistics.print();
}
void master_thread_di()
{
- Timer timer2, timer_mapping;
+ Timer timer2;
timer2.start();
align_mode = Align_mode(Align_mode::from_command(config.command));
@@ -300,5 +344,11 @@ void master_thread_di()
set_max_open_files(config.query_bins * config.threads_ + unsigned(ref_header.letters / (size_t)(config.chunk_size * 1e9)) + 16);
- master_thread(db_file, timer_mapping, timer2);
+ if (!config.prot_accession2taxid.empty()) {
+ timer.go("Loading taxonomy");
+ taxonomy.load();
+ timer.finish();
+ }
+
+ master_thread(db_file, timer2);
}
\ No newline at end of file
diff --git a/src/run/main.cpp b/src/run/main.cpp
index af4be63..79b1fe5 100644
--- a/src/run/main.cpp
+++ b/src/run/main.cpp
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#include <iostream>
@@ -26,16 +26,18 @@ using std::cout;
using std::cerr;
using std::endl;
-void run_mapper();
void master_thread_di();
void model_seqs();
void opt();
+void run_masker();
+void fastq2fasta();
int main(int ac, const char* av[])
{
try {
+ check_simd();
config = Config(ac, av);
switch (config.command) {
@@ -49,10 +51,7 @@ int main(int ac, const char* av[])
break;
case Config::blastp:
case Config::blastx:
- if (config.algo == Config::subject_indexed)
- run_mapper();
- else
- master_thread_di();
+ master_thread_di();
break;
case Config::view:
view();
@@ -90,15 +89,21 @@ int main(int ac, const char* av[])
case Config::opt:
opt();
break;
+ case Config::mask:
+ run_masker();
+ break;
+ case Config::fastq2fasta:
+ fastq2fasta();
+ break;
default:
return 1;
}
}
- catch(std::bad_alloc &e) {
+ catch(const std::bad_alloc &e) {
cerr << "Failed to allocate sufficient memory. Please refer to the manual for instructions on memory usage." << endl;
log_stream << "Error: " << e.what() << endl;
return 1;
- } catch(std::exception& e) {
+ } catch(const std::exception& e) {
cerr << "Error: " << e.what() << endl;
log_stream << "Error: " << e.what() << endl;
return 1;
diff --git a/src/run/mapper.cpp b/src/run/mapper.cpp
deleted file mode 100644
index 8d8e395..0000000
--- a/src/run/mapper.cpp
+++ /dev/null
@@ -1,112 +0,0 @@
-/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
-
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-****/
-
-#include <numeric>
-#include "../data/reference.h"
-#include "../output/output_format.h"
-#include "../util/seq_file_format.h"
-#include "../data/queries.h"
-#include "../output/daa_write.h"
-#include "../data/load_seqs.h"
-#include "../data/index.h"
-
-void search_query_worker(Atomic<unsigned> *next);
-
-void run_query_chunk(Output_stream &master_out)
-{
- task_timer timer("Computing alignments");
- Thread_pool threads;
- Atomic<unsigned> query (0);
- for (unsigned i = 0; i < config.threads_; ++i)
- threads.push_back(launch_thread(search_query_worker, &query));
- threads.join_all();
-}
-
-void run_ref_chunk(Input_stream &query_file, const Sequence_file_format &input_format, Output_stream &master_out)
-{
- task_timer timer("Building database index");
- shape_from = 0;
- shape_to = 1;
- build_index(ref_seqs::get());
- timer.finish();
-
- query_file.rewind();
-
- for (current_query_chunk = 0;; ++current_query_chunk) {
- task_timer timer("Loading query sequences", true);
- size_t n_query_seqs;
- n_query_seqs = load_seqs(query_file, input_format, &query_seqs::data_, query_ids::data_, query_source_seqs::data_, (size_t)(config.chunk_size * 1e9));
- if (n_query_seqs == 0)
- break;
- timer.finish();
- query_seqs::data_->print_stats();
- run_query_chunk(master_out);
- }
-
- timer.go("Deallocating memory");
- for (unsigned i = 0; i < shapes.count(); ++i)
- assign_ptr(seed_index[i], new Seed_index());
-}
-
-void run_mapper(Database_file &db_file, Timer &total_timer)
-{
- task_timer timer("Opening the input file", true);
- auto_ptr<Input_stream> query_file(Compressed_istream::auto_detect(config.query_file));
- const Sequence_file_format *format_n(guess_format(*query_file));
-
- timer.go("Opening the output file");
- auto_ptr<Output_stream> master_out(config.compression == 1
- ? new Compressed_ostream(config.output_file)
- : new Output_stream(config.output_file));
- timer.finish();
-
- for (current_ref_block = 0; db_file.load_seqs(); ++current_ref_block)
- run_ref_chunk(*query_file, *format_n, *master_out);
-
- timer.go("Closing the output file");
- master_out->close();
-
- timer.go("Closing the database file");
- db_file.close();
-
- timer.finish();
- message_stream << "Total wall clock time: " << total_timer.getElapsedTimeInSec() << "s" << endl;
- statistics.print();
-}
-
-void run_mapper()
-{
- Timer timer2;
- timer2.start();
-
- Reduction::reduction = Reduction("A KR EDNQ C G H ILVM FYW P ST");
-
- align_mode = Align_mode(Align_mode::from_command(config.command));
- output_format = auto_ptr<Output_format>(get_output_format());
-
- message_stream << "Temporary directory: " << Temp_file::get_temp_dir() << endl;
-
- task_timer timer("Opening the database", 1);
- Database_file db_file;
- timer.finish();
- message_stream << "Reference: " << config.database << " (" << ref_header.sequences << " sequences, " << ref_header.letters << " letters)" << endl;
- verbose_stream << "Block size: " << (size_t)(config.chunk_size * 1e9) << endl;
- Config::set_option(config.db_size, (uint64_t)ref_header.letters);
-
- run_mapper(db_file, timer2);
-}
diff --git a/src/run/tools.cpp b/src/run/tools.cpp
index b5e4e9e..5e76d01 100644
--- a/src/run/tools.cpp
+++ b/src/run/tools.cpp
@@ -1,21 +1,22 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
+#include <stdio.h>
#include <set>
#include "tools.h"
#include "../basic/config.h"
@@ -25,6 +26,7 @@ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR P
#include "../data/load_seqs.h"
#include "../data/reference.h"
#include "../extra/match_file.h"
+#include "../basic/masking.h"
void get_seq()
{
@@ -104,4 +106,40 @@ void match_file_stat()
blast_match match;
while (file.get(match, blast_format()));
file.get_subst();
+}
+
+/**
+ * Prints every FASTA record of config.query_file to standard output with
+ * masked positions shown in lower case.
+ *
+ * For each record the header line ('>' followed by the id) is written first.
+ * The masker is then run on a copy of the sequence, and each original letter
+ * is emitted via value_traits.alphabet, lower-cased wherever the masked copy
+ * holds value_traits.mask_char.
+ */
+void run_masker()
+{
+	Input_stream in(config.query_file);
+	const FASTA_format fasta;
+	vector<char> id;
+	vector<Letter> seq, masked;
+	while (fasta.get_seq(id, seq, in)) {
+		cout << '>' << string(id.data(), id.size()) << endl;
+		// Mask a copy so the unmasked letters remain available for output.
+		masked = seq;
+		Masking::get()(masked.data(), masked.size());
+		string line;
+		line.reserve(seq.size());
+		for (size_t pos = 0; pos != seq.size(); ++pos) {
+			const char letter = value_traits.alphabet[(long)seq[pos]];
+			line += masked[pos] == value_traits.mask_char ? (char)tolower(letter) : letter;
+		}
+		cout << line << endl;
+	}
+}
+
+/**
+ * Converts FASTQ records read from config.query_file into FASTA records on
+ * standard output.
+ *
+ * The input is opened through Compressed_istream::auto_detect, and both
+ * input_value_traits and value_traits are set to nucleotide_traits before
+ * reading. At most config.seq_no[0] records (parsed with atoi) are written.
+ * When no count was supplied, all records are converted.
+ */
+void fastq2fasta()
+{
+	auto_ptr<Input_stream> f(Compressed_istream::auto_detect(config.query_file));
+	vector<Letter> seq;
+	vector<char> id;
+	const FASTQ_format format;
+	input_value_traits = value_traits = nucleotide_traits;
+	// Indexing seq_no[0] when the option is absent is undefined behaviour, so
+	// fall back to "no limit" in that case. The explicit cast keeps the
+	// previous conversion of the atoi result unchanged.
+	// NOTE(review): assumes config.seq_no is a standard container of strings - confirm against config.h.
+	const size_t max = config.seq_no.empty() ? ~size_t(0) : (size_t)atoi(config.seq_no[0].c_str());
+	size_t n = 0;
+	while (n < max && format.get_seq(id, seq, *f)) {
+		cout << '>' << string(id.data(), id.size()) << endl;
+		cout << sequence(seq.data(), seq.size()) << endl;
+		++n;
+	}
}
\ No newline at end of file
diff --git a/src/run/tools.h b/src/run/tools.h
index b714b61..2e6e32b 100644
--- a/src/run/tools.h
+++ b/src/run/tools.h
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef TOOLS_H_
diff --git a/src/search/align_range.h b/src/search/align_range.h
index efd76b9..f843175 100644
--- a/src/search/align_range.h
+++ b/src/search/align_range.h
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2014-2016, University of Tuebingen, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef ALIGN_RANGE_H_
@@ -26,6 +26,8 @@ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR P
#include "../dp/dp.h"
void setup_search_params(pair<size_t, size_t> query_len_bounds, size_t chunk_db_letters);
+void setup_search();
+void setup_search_cont();
struct Stage1_hit
{
diff --git a/src/search/collision.cpp b/src/search/collision.cpp
index edc3d2e..b792bab 100644
--- a/src/search/collision.cpp
+++ b/src/search/collision.cpp
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#include "collision.h"
@@ -38,14 +38,20 @@ inline bool match_shape_mask(const uint64_t mask, const uint64_t shape_mask)
inline bool is_lower_chunk(const Letter *subject, unsigned sid)
{
Packed_seed seed;
- shapes[sid].set_seed(seed, subject);
+ if (config.algo == Config::double_indexed)
+ shapes[sid].set_seed(seed, subject);
+ else
+ shapes[sid].set_seed_shifted(seed, subject);
return current_range.lower(seed_partition(seed));
}
inline bool is_lower_or_equal_chunk(const Letter *subject, unsigned sid)
{
Packed_seed seed;
- shapes[sid].set_seed(seed, subject);
+ if (config.algo == Config::double_indexed)
+ shapes[sid].set_seed(seed, subject);
+ else
+ shapes[sid].set_seed_shifted(seed, subject);
return current_range.lower_or_equal(seed_partition(seed));
}
diff --git a/src/search/collision.h b/src/search/collision.h
index 78614c2..982c6cb 100644
--- a/src/search/collision.h
+++ b/src/search/collision.h
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2014-2016, University of Tuebingen, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef COLLISION_H_
diff --git a/src/search/hit_filter.h b/src/search/hit_filter.h
index 9a51f05..53eb441 100644
--- a/src/search/hit_filter.h
+++ b/src/search/hit_filter.h
@@ -1,21 +1,19 @@
/****
-Copyright (c) 2014, University of Tuebingen
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-****
-Author: Benjamin Buchfink
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef HIT_FILTER_H_
@@ -65,8 +63,8 @@ struct hit_filter
subjects_,
config.hit_band,
left,
- config.gap_open + config.gap_extend,
- config.gap_extend,
+ score_matrix.gap_open() + score_matrix.gap_extend(),
+ score_matrix.gap_extend(),
config.min_hit_raw_score,
*this,
uint8_t(),
diff --git a/src/search/search.cpp b/src/search/search.cpp
index 5506ab0..93da2a8 100644
--- a/src/search/search.cpp
+++ b/src/search/search.cpp
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#include "align_range.h"
diff --git a/src/search/search_query.cpp b/src/search/search_query.cpp
deleted file mode 100644
index 872ca2d..0000000
--- a/src/search/search_query.cpp
+++ /dev/null
@@ -1,51 +0,0 @@
-/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
-
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-****/
-
-#include "../util/thread.h"
-#include "../data/queries.h"
-#include "../data/index.h"
-
-void search_query(unsigned query_id, Statistics &stat, vector<Seed> &neighbor_seeds)
-{
- const sequence query_seq = query_seqs::get()[query_id];
- for (unsigned sid = 0; sid < shapes.count(); ++sid) {
- const shape &sh = shapes[sid];
- if (query_seq.length() < sh.length_)
- return;
- for (unsigned i = 0; i <= query_seq.length() - sh.length_; ++i) {
- uint64_t seed;
- if (sh.set_seed(seed, &query_seq[i])) {
- sorted_list::Random_access_iterator k = seed_index[sid][seed];
- while (k.good()) {
- stat.inc(Statistics::SEED_HITS);
- ++k;
- }
- }
- }
- }
-}
-
-void search_query_worker(Atomic<unsigned> *next)
-{
- unsigned query_id;
- Statistics stat;
- vector<Seed> neighbor_seeds;
- while ((query_id = (*next)++) < query_seqs::get().get_length())
- search_query(query_id, stat, neighbor_seeds);
- statistics += stat;
-}
\ No newline at end of file
diff --git a/src/search/setup.cpp b/src/search/setup.cpp
index 9381940..e4e24c9 100644
--- a/src/search/setup.cpp
+++ b/src/search/setup.cpp
@@ -1,25 +1,80 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#include "align_range.h"
#include "../data/reference.h"
#include "../basic/config.h"
+/**
+ * Installs the global shape configuration with a fixed index mode.
+ *
+ * Index mode 11 is used when either sensitivity flag
+ * (config.mode_more_sensitive or config.mode_sensitive) is set. Otherwise
+ * index mode 10 is used and Reduction::reduction is replaced by the 16-group
+ * alphabet given below. ::shapes is then rebuilt with a shape count of 1 and
+ * an empty shape-mask list.
+ */
+void setup_search_cont()
+{
+	const bool any_sensitive = config.mode_more_sensitive || config.mode_sensitive;
+	unsigned index_mode = 11;
+	if (!any_sensitive) {
+		index_mode = 10;
+		Reduction::reduction = Reduction("KR EQ D N C G H F Y IV LM W P S T A");
+	}
+	::shapes = shape_config(index_mode, 1, vector<string>());
+}
+
+/**
+ * Applies the per-sensitivity search settings and reports them.
+ *
+ * Double-indexed algorithm: passes an index mode (9 for either sensitive
+ * mode, 8 otherwise) and a seed frequency SD (200 / 10 / 50) to
+ * Config::set_option, installs the 11-group reduction and rebuilds ::shapes
+ * from config.index_mode, config.shapes and config.shape_mask.
+ *
+ * Any other algorithm: passes only a seed frequency SD (200 / 20 / 50) to
+ * Config::set_option and sets config.lowmem to 1. The reduction and ::shapes
+ * are left as they are.
+ *
+ * Afterwards the algorithm name goes to message_stream; the reduction, seed
+ * frequency SD and shape configuration go to verbose_stream. Finally
+ * config.seed_anchor becomes min(length of the first shape - 1, 8).
+ */
+void setup_search()
+{
+	const bool double_indexed = config.algo == Config::double_indexed;
+	const bool more_sensitive = config.mode_more_sensitive;
+	const bool sensitive = !more_sensitive && config.mode_sensitive;
+
+	if (double_indexed) {
+		const unsigned index_mode_arg = (more_sensitive || sensitive) ? 9u : 8u;
+		const double freq_sd_arg = more_sensitive ? 200.0 : (sensitive ? 10.0 : 50.0);
+		// presumably set_option leaves user-supplied values untouched - confirm in config.h
+		Config::set_option(config.index_mode, index_mode_arg);
+		Config::set_option(config.freq_sd, freq_sd_arg);
+		Reduction::reduction = Reduction("A KR EDNQ C G H ILVM FYW P ST");
+		::shapes = shape_config(config.index_mode, config.shapes, config.shape_mask);
+	}
+	else {
+		const double freq_sd_arg = more_sensitive ? 200.0 : (sensitive ? 20.0 : 50.0);
+		Config::set_option(config.freq_sd, freq_sd_arg);
+		config.lowmem = 1;
+	}
+
+	message_stream << "Algorithm: " << (double_indexed ? "Double-indexed" : "Query-indexed") << endl;
+	verbose_stream << "Reduction: " << Reduction::reduction << endl;
+
+	verbose_stream << "Seed frequency SD: " << config.freq_sd << endl;
+	verbose_stream << "Shape configuration: " << ::shapes << endl;
+	config.seed_anchor = std::min(::shapes[0].length_ - 1, 8u);
+}
+
void setup_search_params(pair<size_t, size_t> query_len_bounds, size_t chunk_db_letters)
{
const double b = config.min_bit_score == 0 ? score_matrix.bitscore(config.max_evalue, ref_header.letters, (unsigned)query_len_bounds.first) : config.min_bit_score;
diff --git a/src/search/sse_dist.h b/src/search/sse_dist.h
index 8e351f5..f4a03fe 100644
--- a/src/search/sse_dist.h
+++ b/src/search/sse_dist.h
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2013-2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef SSE_DIST_H_
@@ -52,14 +52,11 @@ inline __m128i reduce_seq_generic(const __m128i &seq)
inline __m128i reduce_seq(const __m128i &seq)
{
- if(config.have_ssse3) {
#ifdef __SSSE3__
- return reduce_seq_ssse3(seq);
+ return reduce_seq_ssse3(seq);
#else
- return reduce_seq_generic(seq);
+ return reduce_seq_generic(seq);
#endif
- } else
- return reduce_seq_generic(seq);
}
#endif
diff --git a/src/search/stage2.cpp b/src/search/stage2.cpp
index 338742d..a0b4ef5 100644
--- a/src/search/stage2.cpp
+++ b/src/search/stage2.cpp
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#include "align_range.h"
@@ -28,35 +28,6 @@ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR P
TLS_PTR vector<sequence>* hit_filter::subjects_ptr;
#endif
-unsigned count_id(const Letter *query, const Letter *subject)
-{
- static const Reduction reduction("KR E D Q N C G H LM FY VI W P S T A"); // murphy.10
- unsigned n = 0;
- for (int i = -1; i >= -((int)config.id_left) && query[i] != '\xff' && subject[i] != '\xff'; --i)
- if (reduction(query[i]) == reduction(subject[i]))
- ++n;
- for (unsigned i = 0; i < config.id_right && query[i] != '\xff' && subject[i] != '\xff'; ++i)
- if (reduction(query[i]) == reduction(subject[i]))
- ++n;
- return n;
-}
-
-int extend_binary(const Letter *query, const Letter *subject)
-{
- int s = 0, t = 0;
- for (int i = -1; i >= -60 && query[i] != '\xff' && subject[i] != '\xff'; --i) {
- t += query[i] == subject[i] ? config.bmatch : -config.bmismatch;
- s = std::max(s, t);
- }
- int sr = 0;
- t = 0;
- for (unsigned i = 0; i < 60 && query[i] != '\xff' && subject[i] != '\xff'; ++i) {
- t += query[i] == subject[i] ? config.bmatch : -config.bmismatch;
- sr = std::max(sr, t);
- }
- return s + sr;
-}
-
#ifdef __SSE2__
void search_query_offset(Loc q,
@@ -80,19 +51,22 @@ void search_query_offset(Loc q,
/*if (extend_binary(query, subject) < config.bcutoff)
continue;*/
- stats.inc(Statistics::TENTATIVE_MATCHESX);
-
unsigned delta, len;
int score;
if ((score = stage2_ungapped(query, subject, sid, delta, len)) < config.min_ungapped_raw_score)
continue;
stats.inc(Statistics::TENTATIVE_MATCHES2);
+ /*if (filterl(query, subject) < 20)
+ continue;*/
+
+ stats.inc(Statistics::TENTATIVE_MATCHESX);
if (!is_primary_hit(query - delta, subject - delta, delta, sid, len))
continue;
stats.inc(Statistics::TENTATIVE_MATCHES3);
+ //cout << ref_ids::get()[ref_seqs::get().local_position(s_pos).first].c_str() << endl;
hf.push(s_pos, score);
}
@@ -134,7 +108,7 @@ void search_query_offset(Loc q,
const sequence s = ref_seqs::data_->fixed_window_infix(s_pos + config.seed_anchor);
unsigned left;
sequence query(query_seqs::data_->window_infix(q + config.seed_anchor, left));
- score = smith_waterman(query, s, config.hit_band, left, config.gap_open + config.gap_extend, config.gap_extend);
+ score = smith_waterman(query, s, config.hit_band, left, score_matrix.gap_open() + score_matrix.gap_extend(), score_matrix.gap_extend());
}
if (score >= config.min_hit_raw_score) {
if (q_num_ == std::numeric_limits<unsigned>::max()) {
@@ -142,7 +116,6 @@ void search_query_offset(Loc q,
q_num_ = (unsigned)l.first;
seed_offset_ = (unsigned)l.second;
}
- assert(subject < ref_seqs::get().raw_len());
out.push(hit(q_num_, s_pos, seed_offset_));
stats.inc(Statistics::TENTATIVE_MATCHES4);
}
diff --git a/src/search/trace_pt_buffer.h b/src/search/trace_pt_buffer.h
index a73b527..0da21ef 100644
--- a/src/search/trace_pt_buffer.h
+++ b/src/search/trace_pt_buffer.h
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2014-2016, University of Tuebingen, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef TRACE_PT_BUFFER_H_
@@ -22,8 +22,6 @@ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR P
#include "../util/async_buffer.h"
#include "../basic/match.h"
-using std::auto_ptr;
-
#pragma pack(1)
struct hit
diff --git a/src/util/async_buffer.h b/src/util/async_buffer.h
index 38a97e4..3fb4a0c 100644
--- a/src/util/async_buffer.h
+++ b/src/util/async_buffer.h
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2013-2017, Benjamin Buchfink, University of Tuebingen
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef ASYNC_BUFFER_H_
@@ -23,6 +23,7 @@ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR P
#include <exception>
#include "../basic/config.h"
#include "temp_file.h"
+#include "log_stream.h"
using std::vector;
using std::string;
@@ -36,7 +37,9 @@ struct Async_buffer
Async_buffer(size_t input_count, const string &tmpdir, unsigned bins) :
bins_(bins),
- bin_size_((input_count + bins_ - 1) / bins_)
+ bin_size_((input_count + bins_ - 1) / bins_),
+ input_count_(input_count),
+ bins_processed_(0)
{
log_stream << "Async_buffer() " << input_count << ',' << bin_size_ << endl;
for (unsigned j = 0; j < config.threads_; ++j)
@@ -46,6 +49,16 @@ struct Async_buffer
}
}
+ size_t begin(size_t bin) const
+ {
+ return bin*bin_size_;
+ }
+
+ size_t end(size_t bin) const
+ {
+ return std::min((bin + 1)*bin_size_, input_count_);
+ }
+
struct Iterator
{
Iterator(Async_buffer &parent, unsigned thread_num) :
@@ -66,7 +79,7 @@ struct Async_buffer
}
void flush(unsigned bin)
{
- out_[bin]->typed_write(buffer_[bin].data(), buffer_[bin].size());
+ out_[bin]->write(buffer_[bin].data(), buffer_[bin].size());
parent_.add_size(thread_num_, bin, buffer_[bin].size());
buffer_[bin].clear();
}
@@ -83,18 +96,40 @@ struct Async_buffer
const unsigned thread_num_;
};
- size_t load(vector<_t> &data, unsigned bin) const
+ size_t load(vector<_t> &data, size_t max_size, std::pair<size_t,size_t> &input_range)
{
static size_t total_size;
- if (bin == 0)
+ if (bins_processed_ == 0)
total_size = 0;
- size_t size = 0;
- for (unsigned i = 0; i < config.threads_; ++i)
- size += size_[i*bins_ + bin];
+ if (bins_processed_ == bins_) {
+ input_range = std::make_pair(0, 0);
+ return total_size*sizeof(_t);
+ }
+ size_t size = bin_size(bins_processed_), end = bins_processed_ + 1, current_size;
+ while (end < bins_ && (size + (current_size = bin_size(end)))*sizeof(_t) < max_size) {
+ size += current_size;
+ ++end;
+ }
log_stream << "Async_buffer.load() " << size << "(" << (double)size*sizeof(_t) / (1 << 30) << " GB)" << endl;
total_size += size;
data.resize(size);
_t* ptr = data.data();
+ input_range.first = begin(bins_processed_);
+ for (; bins_processed_ < end; ++bins_processed_)
+ load_bin(ptr, bins_processed_);
+ input_range.second = this->end(bins_processed_ - 1);
+ return total_size*sizeof(_t);
+ }
+
+ unsigned bins() const
+ {
+ return bins_;
+ }
+
+private:
+
+ void load_bin(_t*& ptr, size_t bin)
+ {
for (unsigned i = 0; i < config.threads_; ++i) {
Input_stream f(tmp_file_[i*bins_ + bin]);
const size_t s = size_[i*bins_ + bin];
@@ -104,16 +139,16 @@ struct Async_buffer
if (n != s)
throw std::runtime_error("Error reading temporary file: " + f.file_name);
}
- return total_size*sizeof(_t);
}
- unsigned bins() const
+ size_t bin_size(size_t bin) const
{
- return bins_;
+ size_t size = 0;
+ for (unsigned i = 0; i < config.threads_; ++i)
+ size += size_[i*bins_ + bin];
+ return size;
}
-private:
-
Temp_file* get_out(unsigned threadid, unsigned bin)
{
return &tmp_file_[threadid*bins_ + bin];
@@ -125,7 +160,8 @@ private:
}
const unsigned bins_;
- const size_t bin_size_;
+ const size_t bin_size_, input_count_;
+ size_t bins_processed_;
vector<size_t> size_;
vector<Temp_file> tmp_file_;
diff --git a/src/util/binary_buffer.h b/src/util/binary_buffer.h
index 3d4c8d7..80d8021 100644
--- a/src/util/binary_buffer.h
+++ b/src/util/binary_buffer.h
@@ -1,20 +1,19 @@
/****
-Copyright (c) 2014, University of Tuebingen
-Author: Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef BINARY_BUFFER_H_
diff --git a/src/util/binary_file.cpp b/src/util/binary_file.cpp
new file mode 100644
index 0000000..46b49ca
--- /dev/null
+++ b/src/util/binary_file.cpp
@@ -0,0 +1,355 @@
+/****
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+****/
+
+#include <sstream>
+#include <stdio.h>
+#ifdef _MSC_VER
+#define NOMINMAX
+#include <Windows.h>
+#else
+#include <fcntl.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#endif
+
+#include "../basic/config.h"
+#include "binary_file.h"
+#include "temp_file.h"
+#include "util.h"
+
+using std::vector;
+using std::endl;
+using std::string;
+using std::runtime_error;
+
+#ifdef __linux__
+#define POSIX_OPEN(x,y,z) open64(x,y,z)
+#define POSIX_OPEN2(x,y) open64(x,y)
+#else
+#define POSIX_OPEN(x,y,z) open(x,y,z)
+#define POSIX_OPEN2(x,y) open(x,y)
+#endif
+
+Output_stream::Output_stream()
+{ }
+
+#ifndef _MSC_VER
+Output_stream::~Output_stream()
+{}
+#endif
+
+Output_stream::Output_stream(const string &file_name) :
+ file_name_(file_name)
+{
+#ifdef _MSC_VER
+ f_ = file_name.length() == 0 ? stdout : fopen(file_name.c_str(), "wb");
+#else
+ int fd_ = file_name.length() == 0 ? 1 : POSIX_OPEN(file_name.c_str(), O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH);
+ if (fd_ < 0) {
+ perror(0);
+ throw File_open_exception(file_name_);
+ }
+ f_ = fdopen(fd_, "wb");
+#endif
+ if (f_ == 0) {
+ perror(0);
+ throw File_open_exception(file_name_);
+ }
+}
+
+void Output_stream::remove()
+{
+ if (::remove(file_name_.c_str()) != 0)
+ std::cerr << "Warning: Failed to delete file " << file_name_ << std::endl;
+}
+
+void Output_stream::close()
+{
+ if (f_ && f_ != stdout) {
+ if (fclose(f_) != 0) {
+ perror(0);
+ throw std::runtime_error(string("Error closing file ") + file_name_);
+ }
+ f_ = 0;
+ }
+}
+
+void Output_stream::write_raw(const char *ptr, size_t count)
+{
+ size_t n;
+ if ((n = fwrite((const void*)ptr, 1, count, f_)) != count) {
+ perror(0);
+ throw File_write_exception(file_name_);
+ }
+}
+
+void Output_stream::write_c_str(const string &s)
+{
+ write(s.c_str(), s.length() + 1);
+}
+
+void Output_stream::seekp(size_t p)
+{
+#ifdef _MSC_VER
+ if (_fseeki64(f_, (int64_t)p, SEEK_SET) != 0) {
+ perror(0);
+ throw std::runtime_error("Error calling fseek.");
+ }
+#else
+ if (fseek(f_, p, SEEK_SET) != 0) {
+ perror(0);
+ throw std::runtime_error("Error calling fseek.");
+ }
+#endif
+}
+
+size_t Output_stream::tell()
+{
+#ifdef _MSC_VER
+ int64_t x;
+ if ((x = _ftelli64(f_)) == (int64_t)-1)
+ throw std::runtime_error("Error executing ftell on stream " + file_name_);
+ return (size_t)x;
+#else
+ const long n = ftell(f_);
+ if (n < 0) {
+ perror(0);
+ throw std::runtime_error("Error calling ftell.");
+ }
+ return n;
+#endif
+}
+
+Input_stream::Input_stream(const string &file_name) :
+ file_name(file_name),
+ line_count(0),
+ line_buf_used_(0),
+ line_buf_end_(0),
+ putback_line_(false),
+ eof_(false)
+{
+#ifdef _MSC_VER
+ f_ = file_name.empty() ? stdin : fopen(file_name.c_str(), "rb");
+#else
+ int fd_ = file_name.empty() ? 0 : POSIX_OPEN2(file_name.c_str(), O_RDONLY);
+ if (fd_ < 0) {
+ perror(0);
+ throw std::runtime_error(string("Error opening file ") + file_name);
+ }
+ f_ = fdopen(fd_, "rb");
+#endif
+ if (f_ == 0) {
+ perror(0);
+ throw File_open_exception(file_name);
+ }
+}
+
+void Input_stream::rewind()
+{
+ ::rewind(f_);
+ line_count = 0;
+ line_buf_used_ = 0;
+ line_buf_end_ = 0;
+ putback_line_ = false;
+ eof_ = false;
+ line.clear();
+}
+
+Input_stream::Input_stream(const Output_stream &tmp_file) :
+ file_name(tmp_file.file_name_),
+ f_(tmp_file.f_)
+{
+ ::rewind(f_);
+}
+
+void Input_stream::seek(size_t pos)
+{
+#ifdef _MSC_VER
+ if (_fseeki64(f_, (int64_t)pos, SEEK_SET) != 0) {
+ perror(0);
+ throw std::runtime_error("Error executing seek on file " + file_name);
+ }
+#else
+ if (fseek(f_, pos, SEEK_SET) < 0) {
+ perror(0);
+ throw std::runtime_error("Error calling fseek.");
+ }
+#endif
+}
+
+void Input_stream::seek_forward(size_t n)
+{
+#ifdef _MSC_VER
+ if (_fseeki64(f_, (int64_t)n, SEEK_CUR) != 0) {
+ perror(0);
+ throw std::runtime_error("Error executing seek on file " + file_name);
+ }
+#else
+ if (fseek(f_, n, SEEK_CUR) < 0) {
+ perror(0);
+ throw std::runtime_error("Error calling fseek.");
+ }
+#endif
+}
+
+bool Input_stream::eof() const
+{
+ return eof_;
+}
+
+size_t Input_stream::read_bytes(char *ptr, size_t count)
+{
+ size_t n;
+ if ((n = fread(ptr, 1, count, f_)) != count) {
+ if (feof(f_) != 0)
+ return n;
+ else {
+ perror(0);
+ throw File_read_exception(file_name);
+ }
+ }
+ return n;
+}
+
+void Input_stream::read_c_str(string &s)
+{
+ char c;
+ s.clear();
+ while (true) {
+ if (read(&c, 1) != 1)
+ throw std::runtime_error("Unexpected end of file.");
+ if (c == 0)
+ break;
+ s += (char)c;
+ }
+}
+
+void Input_stream::close_and_delete()
+{
+ close();
+#ifdef _MSC_VER
+ if (remove(file_name.c_str()) != 0)
+ std::cerr << "Warning: Failed to delete temporary file " << file_name << std::endl;
+#endif
+}
+
+void Input_stream::close()
+{
+ if (f_) {
+ if (fclose(f_) != 0) {
+ perror(0);
+ throw std::runtime_error(string("Error closing file ") + file_name);
+ }
+ f_ = 0;
+ }
+}
+
+void Input_stream::getline()
+{
+ if (putback_line_) {
+ putback_line_ = false;
+ ++line_count;
+ return;
+ }
+ line.clear();
+ while (true) {
+ const char *p = (const char*)memchr(&line_buf_[line_buf_used_], '\n', line_buf_end_ - line_buf_used_);
+ if (p == 0) {
+ line.append(&line_buf_[line_buf_used_], line_buf_end_ - line_buf_used_);
+ line_buf_end_ = read_bytes(line_buf_, line_buf_size);
+ line_buf_used_ = 0;
+ if (line_buf_end_ == 0) {
+ eof_ = true;
+ ++line_count;
+ return;
+ }
+ }
+ else {
+ const size_t n = (p - line_buf_) - line_buf_used_;
+ line.append(&line_buf_[line_buf_used_], n);
+ line_buf_used_ += n + 1;
+ const size_t s = line.length() - 1;
+ if (!line.empty() && line[s] == '\r')
+ line.resize(s);
+ ++line_count;
+ return;
+ }
+ }
+}
+
+void Input_stream::putback_line()
+{
+ putback_line_ = true;
+ --line_count;
+}
+
+unsigned Temp_file::n = 0;
+uint64_t Temp_file::hash_key;
+
+Temp_file::Temp_file()
+{
+ if (n == 0) {
+#ifdef _MSC_VER
+ LARGE_INTEGER count;
+ QueryPerformanceCounter(&count);
+ hash_key = (uint64_t)(count.HighPart + count.LowPart + count.QuadPart + GetCurrentProcessId());
+#else
+ timeval count;
+ gettimeofday(&count, NULL);
+ hash_key = count.tv_sec + count.tv_usec + getpid();
+#endif
+ }
+ std::stringstream ss;
+ ss.setf(std::ios::hex, std::ios::basefield);
+ if (config.tmpdir != "")
+ ss << config.tmpdir << dir_separator;
+ ss << "diamond-" << hash_key << "-" << n++ << ".tmp";
+ ss >> this->file_name_;
+#ifdef _MSC_VER
+ this->f_ = fopen(this->file_name_.c_str(), "w+b");
+ if (this->f_ == 0) {
+ perror(0);
+ throw std::runtime_error("Error opening temporary file: " + this->file_name_);
+ }
+#else
+ int fd = POSIX_OPEN(this->file_name_.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH);
+ if (fd < 0) {
+ perror(0);
+ throw std::runtime_error(string("Error opening temporary file ") + this->file_name_);
+ }
+ if (unlink(this->file_name_.c_str()) < 0) {
+ perror(0);
+ throw std::runtime_error("Error calling unlink.");
+ }
+ this->f_ = fdopen(fd, "w+b");
+ if (this->f_ == 0) {
+ perror(0);
+ throw std::runtime_error("Error opening temporary file: " + this->file_name_);
+ }
+#endif
+}
+
+string Temp_file::get_temp_dir()
+{
+ Temp_file t;
+ Input_stream f(t);
+ f.close_and_delete();
+ return extract_dir(f.file_name);
+}
\ No newline at end of file
diff --git a/src/util/binary_file.h b/src/util/binary_file.h
index 2c3563b..18b4935 100644
--- a/src/util/binary_file.h
+++ b/src/util/binary_file.h
@@ -1,42 +1,29 @@
/****
-Copyright (c) 2014, University of Tuebingen
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-****
-Author: Benjamin Buchfink
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef BINARY_FILE_H_
#define BINARY_FILE_H_
-#include <memory>
#include <vector>
#include <string>
#include <stdexcept>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <stdint.h>
-#include <stddef.h>
-#include <assert.h>
-#include "log_stream.h"
-#include "system.h"
-using std::auto_ptr;
using std::vector;
-using std::endl;
using std::string;
using std::runtime_error;
@@ -50,7 +37,7 @@ struct File_read_exception : public std::runtime_error
struct File_write_exception : public std::runtime_error
{
File_write_exception(const string &file_name) :
- runtime_error(string("Error writing file ") + file_name + ". Disk full?")
+ runtime_error(string("Error writing file ") + file_name)
{ }
};
@@ -63,67 +50,27 @@ struct File_open_exception : public std::runtime_error
struct Output_stream
{
-
- Output_stream()
- { }
+ Output_stream();
#ifndef _MSC_VER
- virtual ~Output_stream()
- {}
+ virtual ~Output_stream();
#endif
- Output_stream(const string &file_name) :
- file_name_(file_name),
- f_(file_name.length() == 0 ? stdout : fopen(file_name.c_str(), "wb"))
- {
- if (f_ == 0) throw File_open_exception(file_name_);
- }
- void remove()
- {
- if (::remove(file_name_.c_str()) != 0)
- std::cerr << "Warning: Failed to delete file " << file_name_ << std::endl;
- }
- virtual void close()
- {
- if (f_ && f_ != stdout) {
- fclose(f_);
- f_ = 0;
- }
- }
- virtual void write(const char *ptr, size_t count)
- {
- size_t n;
- if ((n = fwrite((const void*)ptr, 1, count, f_)) != count)
- throw File_write_exception(file_name_);
- }
+ Output_stream(const string &file_name);
+ void remove();
+ virtual void close();
+ virtual void write_raw(const char *ptr, size_t count);
template<typename _t>
- void typed_write(const _t *ptr, size_t count)
+ void write(const _t *ptr, size_t count)
{
- size_t n;
- if ((n = fwrite((const void*)ptr, sizeof(_t), count, f_)) != count)
- throw File_write_exception(file_name_);
+ write_raw((const char*)ptr, sizeof(_t)*count);
}
template<class _t>
- void write(const vector<_t> &v, bool write_size = true)
- {
- size_t size = v.size();
- if (write_size)
- typed_write(&size, 1);
- typed_write(v.data(), size);
- }
- void write_c_str(const string &s)
- {
- write(s.c_str(), s.length() + 1);
- }
- void seekp(size_t p)
+ void write(const vector<_t> &v)
{
- if (FSEEK(f_, (int64_t)p, SEEK_SET) != 0) throw File_write_exception(file_name_);
- }
- size_t tell()
- {
- int64_t x;
- if ((x = FTELL(f_)) == (int64_t)-1)
- throw std::runtime_error("Error executing ftell on stream " + file_name_);
- return (size_t)x;
+ write(v.data(), v.size());
}
+ void write_c_str(const string &s);
+ void seekp(size_t p);
+ size_t tell();
protected:
string file_name_;
FILE *f_;
@@ -132,171 +79,29 @@ protected:
struct Input_stream
{
-
- Input_stream(const string &file_name):
- file_name (file_name),
- line_count(0),
- f_(fopen(file_name.c_str(), "rb")),
- line_buf_used_(0),
- line_buf_end_(0),
- putback_line_(false),
- eof_(false)
- {
- if (f_ == 0)
- throw File_open_exception(file_name);
- }
-
- void rewind()
- {
- ::rewind(f_);
- line_count = 0;
- line_buf_used_ = 0;
- line_buf_end_ = 0;
- putback_line_ = false;
- eof_ = false;
- line.clear();
- }
-
- Input_stream(const Output_stream &tmp_file):
- file_name (tmp_file.file_name_),
- f_(tmp_file.f_)
- {
- ::rewind(f_);
- }
-
- void seek(size_t pos)
- {
- if (FSEEK(f_, (int64_t)pos, SEEK_SET) != 0)
- throw std::runtime_error("Error executing seek on file " + file_name);
- }
-
- void seek_forward(size_t n)
- {
- if (FSEEK(f_, (int64_t)n, SEEK_CUR) != 0)
- throw std::runtime_error("Error executing seek on file " + file_name);
- }
-
- bool eof() const
- {
- return eof_;
- }
-
- virtual size_t read_bytes(char *ptr, size_t count)
- {
- size_t n;
- if ((n = fread(ptr, 1, count, f_)) != count) {
- if (feof(f_) != 0)
- return n;
- else
- throw File_read_exception(file_name);
- }
- return n;
- }
-
+ Input_stream(const string &file_name);
+ void rewind();
+ Input_stream(const Output_stream &tmp_file);
+ void seek(size_t pos);
+ void seek_forward(size_t n);
+ bool eof() const;
+ virtual size_t read_bytes(char *ptr, size_t count);
template<class _t>
size_t read(_t *ptr, size_t count)
{
return read_bytes((char*)ptr, count*sizeof(_t)) / sizeof(_t);
}
-
- template<class _t>
- void read(vector<_t> &v)
- {
- size_t size;
- if (read(&size, 1) != 1)
- throw File_read_exception(file_name);
- v.clear();
- v.resize(size);
- if (read(v.data(), size) != size)
- throw File_read_exception(file_name);
- }
-
- template<class _t>
- void skip_vector()
- {
- size_t size;
- if (read(&size, 1) != 1)
- throw File_read_exception(file_name);
- seek_forward(size * sizeof(_t));
- }
-
- void read_c_str(string &s)
- {
- int c;
- s.clear();
- while ((c = fgetc(f_)) != 0) {
- if (c == EOF)
- throw File_read_exception(file_name);
- s += (char)c;
- }
- }
-
- void close_and_delete()
- {
- close();
-#ifdef _MSC_VER
- if (remove(file_name.c_str()) != 0)
- std::cerr << "Warning: Failed to delete temporary file " << file_name << std::endl;
-#endif
- }
-
- virtual void close()
- {
- if (f_) {
- fclose(f_);
- f_ = 0;
- }
- }
-
- void putback(char c)
- {
- if (ungetc(c, f_) != (int)c)
- throw File_read_exception(file_name);
- }
-
- void getline()
- {
- if (putback_line_) {
- putback_line_ = false;
- ++line_count;
- return;
- }
- line.clear();
- while (true) {
- const char *p = (const char*)memchr(&line_buf_[line_buf_used_], '\n', line_buf_end_ - line_buf_used_);
- if (p == 0) {
- line.append(&line_buf_[line_buf_used_], line_buf_end_ - line_buf_used_);
- line_buf_end_ = read_bytes(line_buf_, line_buf_size);
- line_buf_used_ = 0;
- if (line_buf_end_ == 0) {
- eof_ = true;
- ++line_count;
- return;
- }
- }
- else {
- const size_t n = (p - line_buf_) - line_buf_used_;
- line.append(&line_buf_[line_buf_used_], n);
- line_buf_used_ += n + 1;
- const size_t s = line.length() - 1;
- if (!line.empty() && line[s] == '\r')
- line.resize(s);
- ++line_count;
- return;
- }
- }
- }
-
- void putback_line()
- {
- putback_line_ = true;
- --line_count;
- }
+ void read_c_str(string &s);
+ void close_and_delete();
+ virtual void close();
+ void putback(char c);
+ void getline();
+ void putback_line();
string file_name;
string line;
size_t line_count;
-
+
protected:
enum { line_buf_size = 256 };
@@ -308,145 +113,4 @@ protected:
};
-struct Buffered_file : public Input_stream
-{
-
- Buffered_file(const string& file_name):
- Input_stream (file_name)
- { init(); }
-
- Buffered_file(const Output_stream &tmp_file):
- Input_stream (tmp_file)
- { init(); }
-
- bool eof()
- {
- if(ptr_ < end_)
- return false;
- else if(end_ < &data_[buffer_size])
- return true;
- else {
- fetch();
- return eof();
- }
- }
-
- template<typename _t>
- void read(_t &dst)
- {
- const char *const p = ptr_ + sizeof(_t);
- if(p > end_) {
- if(end_ < &data_[buffer_size])
- throw File_read_exception(file_name);
- fetch();
- return read(dst);
- }
- dst = *reinterpret_cast<_t*>(ptr_);
- ptr_ += sizeof(_t);
- }
-
- template<typename _t>
- void read(_t* dst, size_t n)
- {
- for(size_t i=0;i<n;++i)
- read(*(dst++));
- }
-
- void read_c_str(string &dst)
- {
- dst.clear();
- char c;
- while(read(c), c != '\0')
- dst.push_back(c);
- }
-
- void read_packed(uint8_t length, uint32_t &dst)
- {
- switch(length) {
- case 0: uint8_t x; read(x); dst = x; break;
- case 1: uint16_t y; read(y); dst = y; break;
- case 2: read(dst);
- }
- }
-
- const char* ptr() const
- { return ptr_; }
-
- void seek(size_t pos)
- {
- //this->clear();
- Input_stream::seek(pos);
- init();
- }
-
-private:
-
- void fetch()
- {
- ptrdiff_t d = end_ - ptr_;
- assert(d >= 0);
- memcpy(&data_[0], ptr_, d);
- ptr_ = &data_[0];
- end_ = read_block(ptr_+d, buffer_size - d);
- }
-
- void init()
- {
- ptr_ = &data_[0];
- end_ = read_block(ptr_, buffer_size);
- }
-
- char* read_block(char* ptr, size_t size)
- { return ptr + Input_stream::read(ptr, size); }
-
- enum { buffer_size = 8192 };
-
- char data_[buffer_size];
- char *ptr_, *end_;
-
-};
-
-struct Buffered_ostream
-{
-
- Buffered_ostream(Output_stream &s):
- stream_ (s),
- ptr_ (&data_[0]),
- end_ (&data_[buffer_size])
- { }
-
- template<typename _t>
- void write(const _t &x)
- {
- const char *const p = ptr_ + sizeof(_t);
- if(p > end_) {
- flush();
- return write(x);
- }
- *reinterpret_cast<_t*>(ptr_) = x;
- ptr_ += sizeof(_t);
- }
-
- void close()
- {
- flush();
- ((Output_stream*)this)->close();
- }
-
-private:
-
- void flush()
- {
- stream_.write(data_, ptr_ - data_);
- ptr_ = data_;
- }
-
- enum { buffer_size = 4096 };
-
- Output_stream &stream_;
- char data_[buffer_size];
- char *ptr_, * const end_;
-
-};
-
#endif /* BINARY_FILE_H_ */
diff --git a/src/util/command_line_parser.cpp b/src/util/command_line_parser.cpp
index 64c38dd..963729d 100644
--- a/src/util/command_line_parser.cpp
+++ b/src/util/command_line_parser.cpp
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#include <iostream>
diff --git a/src/util/command_line_parser.h b/src/util/command_line_parser.h
index cfad844..6a6e350 100644
--- a/src/util/command_line_parser.h
+++ b/src/util/command_line_parser.h
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef COMMAND_LINE_PARSER_H_
diff --git a/src/util/complexity_filter.h b/src/util/complexity_filter.h
index 7bc4379..88b5eda 100644
--- a/src/util/complexity_filter.h
+++ b/src/util/complexity_filter.h
@@ -1,21 +1,19 @@
/****
-Copyright (c) 2014, University of Tuebingen
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-****
-Author: Benjamin Buchfink
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef COMPLEXITY_FILTER_H_
diff --git a/src/util/compressed_stream.cpp b/src/util/compressed_stream.cpp
index 163b0e6..47fe6ec 100644
--- a/src/util/compressed_stream.cpp
+++ b/src/util/compressed_stream.cpp
@@ -1,31 +1,29 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#include <stdexcept>
-#include "compressed_stream.h"
-
-#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
-# include <fcntl.h>
-# include <io.h>
-# define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY)
-#else
-# define SET_BINARY_MODE(file)
+#include <stdio.h>
+#ifndef _MSC_VER
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
#endif
+#include "compressed_stream.h"
Compressed_istream::Compressed_istream(const string &file_name):
Input_stream(file_name),
@@ -73,13 +71,9 @@ size_t Compressed_istream::read_bytes(char *ptr, size_t count)
int ret = inflate(&strm, Z_NO_FLUSH);
if (ret == Z_STREAM_END) {
- if(strm.avail_in == 0 && feof(this->f_))
- eos_ = true;
- else {
- int ret = inflateInit2(&strm, 15 + 32);
- if (ret != Z_OK)
- throw std::runtime_error("Error initializing compressed stream (inflateInit): " + file_name);
- }
+ int ret = inflateInit2(&strm, 15 + 32);
+ if (ret != Z_OK)
+ throw std::runtime_error("Error initializing compressed stream (inflateInit): " + file_name);
}
else if (ret != Z_OK)
throw std::runtime_error("Inflate error.");
@@ -105,6 +99,17 @@ bool is_gzip_stream(const unsigned char *b)
Input_stream *Compressed_istream::auto_detect(const string &file_name)
{
+ if (file_name.empty())
+ return new Input_stream(file_name);
+#ifndef _MSC_VER
+ struct stat buf;
+ if (stat(file_name.c_str(), &buf) < 0) {
+ perror(0);
+ throw std::runtime_error(string("Error calling stat on file ") + file_name);
+ }
+ if (!S_ISREG(buf.st_mode))
+ return new Input_stream(file_name);
+#endif
unsigned char b[2];
Input_stream f(file_name);
size_t n = f.read(b, 2);
@@ -140,7 +145,7 @@ void Compressed_ostream::deflate_loop(const char * ptr, size_t count, int flush)
} while (strm.avail_out == 0);
}
-void Compressed_ostream::write(const char * ptr, size_t count)
+void Compressed_ostream::write_raw(const char * ptr, size_t count)
{
deflate_loop(ptr, count, Z_NO_FLUSH);
}
diff --git a/src/util/compressed_stream.h b/src/util/compressed_stream.h
index 5dd0e82..52d8c6e 100644
--- a/src/util/compressed_stream.h
+++ b/src/util/compressed_stream.h
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef COMPRESSED_STREAM_H_
@@ -56,7 +56,7 @@ struct Compressed_ostream : public Output_stream
virtual ~Compressed_ostream()
{}
#endif
- virtual void write(const char *ptr, size_t count);
+ virtual void write_raw(const char *ptr, size_t count);
virtual void close();
private:
void deflate_loop(const char *ptr, size_t count, int code);
diff --git a/src/util/direction.h b/src/util/direction.h
index 6df128c..a43b038 100644
--- a/src/util/direction.h
+++ b/src/util/direction.h
@@ -1,21 +1,19 @@
/****
-Copyright (c) 2014, University of Tuebingen
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-****
-Author: Benjamin Buchfink
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef DIRECTION_H_
diff --git a/src/util/double_buffer.h b/src/util/double_buffer.h
index 4090a8d..e7cd733 100644
--- a/src/util/double_buffer.h
+++ b/src/util/double_buffer.h
@@ -1,21 +1,19 @@
/****
-Copyright (c) 2014, University of Tuebingen
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-****
-Author: Benjamin Buchfink
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef DOUBLE_BUFFER_H_
diff --git a/src/util/hash_function.h b/src/util/hash_function.h
index e6a743a..5d4a2c8 100644
--- a/src/util/hash_function.h
+++ b/src/util/hash_function.h
@@ -1,21 +1,19 @@
/****
-Copyright (c) 2014, University of Tuebingen
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-****
-Author: Benjamin Buchfink
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef HASH_FUNCTION_H_
diff --git a/src/util/hash_table.h b/src/util/hash_table.h
index 7200678..1b4bd47 100644
--- a/src/util/hash_table.h
+++ b/src/util/hash_table.h
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2014-2016, University of Tuebingen, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef HASH_TABLE_H_
@@ -208,7 +208,29 @@ private:
};
+struct Modulo2 {}; // Tag type: selects the bit-mask specialization of modulo<> defined below.
+
+template<typename _t> // _t is only a dispatch tag; the body never uses it.
+inline uint64_t modulo(uint64_t x, uint64_t y) // Generic form: remainder of x divided by y.
+{
+ return x % y; // NOTE(review): y == 0 is not guarded here; callers must pass a non-zero y.
+}
+
+template<>
+inline uint64_t modulo<Modulo2>(uint64_t x, uint64_t y) // Modulo2 form: masks x with (y - 1) instead of dividing.
+{
+ return x & (y - 1); // Equals x % y only when y is a power of two; presumably callers guarantee that - confirm.
+}
+struct No_hash // Identity hash functor: default-constructible and callable as No_hash()(x).
+{
+ uint64_t operator()(uint64_t x) const // Returns the key itself; no mixing is applied.
+ {
+ return x;
+ }
+};
+
+template<typename _mod, typename _hash>
struct PHash_set
{
@@ -229,15 +251,16 @@ public:
bool contains(uint64_t key) const
{
- fp *entry = get_entry(key);
- return *entry != 0;
+ fp *p;
+ return get_entry(key, p);
}
void insert(uint64_t key)
{
- fp *entry = get_entry(key);
+ fp *entry;
+ get_entry(key, entry);
if (*entry == (fp)0)
- *entry = finger_print(murmur_hash()(key));
+ *entry = finger_print(_hash()(key));
}
size_t size() const
@@ -245,19 +268,33 @@ public:
return size_;
}
+ double load() const // Fraction of the size_ table slots whose stored value is non-zero.
+ {
+ size_t n = 0; // number of non-zero slots seen so far
+ for (size_t i = 0; i < size_; ++i) // linear scan over every slot
+ if (*(table.get() + i) != 0) // insert() only writes into slots equal to (fp)0, so non-zero means occupied
+ ++n;
+ return (double)n / size_; // NOTE(review): yields NaN when size_ == 0; confirm size_ is always positive.
+ }
+
private:
static fp finger_print(uint64_t hash)
{
- return std::max((fp)(hash & ((1llu<<(sizeof(fp)*8))-1llu)), (fp)1);
+ const fp x = (fp)(hash & ((1llu << (sizeof(fp) * 8)) - 1llu));
+ return std::max(x, (fp)1);
}
- fp* get_entry(uint64_t key) const
+ bool get_entry(uint64_t key, fp*& p) const
{
- const uint64_t hash = murmur_hash()(key), f = finger_print(hash);
- fp *p = table.get() + ((hash >> sizeof(fp)*8) % size_);
+ const uint64_t hash = _hash()(key), f = finger_print(hash);
+ p = table.get() + modulo<_mod>(hash >> sizeof(fp) * 8, size_);
bool wrapped = false;
- while (*p != f && *p != (fp)0) {
+ while (true) {
+ if (*p == f)
+ return true;
+ if (*p == (fp)0)
+ return false;
++p;
if (p == table.get() + size_) {
if (wrapped)
@@ -266,7 +303,6 @@ private:
wrapped = true;
}
}
- return p;
}
auto_ptr<fp> table;
diff --git a/src/util/high_res_timer.h b/src/util/high_res_timer.h
new file mode 100644
index 0000000..3c0e980
--- /dev/null
+++ b/src/util/high_res_timer.h
@@ -0,0 +1,61 @@
+/****
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+****/
+
+#include <stdint.h>
+#ifdef _MSC_VER
+#else
+#include <time.h>
+#endif
+
+struct High_res_timer { // Timer stub: only the _MSC_VER build reads a counter (__rdtsc); otherwise every query returns 0.
+
+ High_res_timer():
+#ifdef _MSC_VER
+ time_(__rdtsc()) // NOTE(review): no header is included for __rdtsc in the _MSC_VER branch above; confirm <intrin.h> arrives via another include.
+#else
+ time_(0) // no counter is read when _MSC_VER is not defined
+#endif
+ {
+ }
+
+ uint64_t get() const // Difference between the current __rdtsc() value and the one stored at construction; 0 without _MSC_VER.
+ {
+#ifdef _MSC_VER
+ return __rdtsc() - time_;
+#else
+ return 0;
+#endif
+ }
+
+ uint64_t nanoseconds() { // Placeholder: both preprocessor branches return 0.
+#ifdef _MSC_VER
+ return 0;
+#else
+ return 0;
+#endif
+ }
+
+ double microseconds() // nanoseconds() scaled by 1/1000; currently always 0.0 because nanoseconds() returns 0.
+ {
+ return nanoseconds() / 1000.0;
+ }
+
+private:
+ unsigned long long time_; // value captured by the constructor (__rdtsc() under _MSC_VER, otherwise 0)
+
+};
\ No newline at end of file
diff --git a/src/util/log_stream.h b/src/util/log_stream.h
index 22525dc..9aacba1 100644
--- a/src/util/log_stream.h
+++ b/src/util/log_stream.h
@@ -1,21 +1,19 @@
/****
-Copyright (c) 2014, University of Tuebingen
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-****
-Author: Benjamin Buchfink
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef LOG_STREAM_H_
@@ -52,9 +50,11 @@ struct Message_stream
if(to_cout_)
((*_Pfn)(std::cout));
if (to_file_) {
+ mtx.lock();
std::ofstream f("diamond.log", std::ios_base::out | std::ios_base::app);
((*_Pfn)(f));
f.close();
+ mtx.unlock();
}
return *this;
}
diff --git a/src/util/map.h b/src/util/map.h
index 48ccba4..0c0c632 100644
--- a/src/util/map.h
+++ b/src/util/map.h
@@ -1,21 +1,19 @@
/****
-Copyright (c) 2014, University of Tuebingen
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-****
-Author: Benjamin Buchfink
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef MAP_H_
@@ -35,7 +33,7 @@ struct Map
void operator++()
{ begin_ = end_; end_ = get_end(); }
bool valid() const
- { return begin_ < parent_end_; }
+ { return begin_ != parent_end_; }
_it& begin()
{ return begin_; }
_it& end()
@@ -44,7 +42,7 @@ struct Map
_it get_end() const
{
_it i = begin_;
- while(i < parent_end_ && _key()(*i) == _key()(*begin_)) ++i;
+ while(i != parent_end_ && _key()(*i) == _key()(*begin_)) ++i;
return i;
}
_it begin_, parent_end_, end_;
diff --git a/src/util/merge_sort.h b/src/util/merge_sort.h
index 3868175..a4a66e6 100644
--- a/src/util/merge_sort.h
+++ b/src/util/merge_sort.h
@@ -1,21 +1,19 @@
/****
-Copyright (c) 2014, University of Tuebingen
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-****
-Author: Benjamin Buchfink
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef MERGE_SORT_H_
diff --git a/src/util/ptr_vector.h b/src/util/ptr_vector.h
index 1e207e7..e6d43df 100644
--- a/src/util/ptr_vector.h
+++ b/src/util/ptr_vector.h
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef PTR_VECTOR_H_
diff --git a/src/util/radix_sort.h b/src/util/radix_sort.h
index ea96806..b3eeafd 100644
--- a/src/util/radix_sort.h
+++ b/src/util/radix_sort.h
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2015-2016, University of Tuebingen, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef RADIX_SORT_H_
diff --git a/src/util/seq_file_format.cpp b/src/util/seq_file_format.cpp
index 5cd2037..4e45e28 100644
--- a/src/util/seq_file_format.cpp
+++ b/src/util/seq_file_format.cpp
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#include "seq_file_format.h"
diff --git a/src/util/seq_file_format.h b/src/util/seq_file_format.h
index 5b60eda..5ee3cb7 100644
--- a/src/util/seq_file_format.h
+++ b/src/util/seq_file_format.h
@@ -1,21 +1,19 @@
/****
-Copyright (c) 2014, University of Tuebingen
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-****
-Author: Benjamin Buchfink
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef SEQ_FILE_FORMAT_H_
diff --git a/src/util/simd.cpp b/src/util/simd.cpp
new file mode 100644
index 0000000..fa437ed
--- /dev/null
+++ b/src/util/simd.cpp
@@ -0,0 +1,60 @@
+/****
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+****/
+
+#include "simd.h"
+
+#ifdef _WIN32
+#define cpuid(info,x) __cpuidex(info,x,0)
+#else
+inline void cpuid(int CPUInfo[4], int InfoType) {
+ __asm__ __volatile__(
+ "cpuid":
+ "=a" (CPUInfo[0]),
+ "=b" (CPUInfo[1]),
+ "=c" (CPUInfo[2]),
+ "=d" (CPUInfo[3]) :
+ "a" (InfoType), "c" (0)
+ );
+}
+#endif
+
+void check_simd()
+{
+#ifdef __SSE2__
+ int info[4];
+ cpuid(info, 0);
+ int nids = info[0];
+ if (nids >= 1) {
+ cpuid(info, 1);
+ }
+ else
+ throw std::runtime_error("Incompatible CPU type. Please try to compile the software from source.");
+#endif
+#ifdef __SSSE3__
+ if ((info[2] & (1 << 9)) == 0)
+ throw std::runtime_error("CPU does not support SSSE3. Please compile the software from source.");
+#endif
+#ifdef __POPCNT__
+ if ((info[2] & (1 << 23)) == 0)
+ throw std::runtime_error("CPU does not support POPCNT. Please compile the software from source.");
+#endif
+#ifdef __SSE4_1__
+ if ((info[2] & (1 << 19)) == 0)
+ throw std::runtime_error("CPU does not support SSE4.1. Please compile the software from source.");
+#endif
+}
\ No newline at end of file
diff --git a/src/util/simd.h b/src/util/simd.h
index 5992f5c..1fc890e 100644
--- a/src/util/simd.h
+++ b/src/util/simd.h
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef SIMD_H_
@@ -22,41 +22,22 @@ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR P
#include "system.h"
#ifdef _MSC_VER
-#define __MMX__
-#define __SSE__
#define __SSE2__
#define __SSSE3__
#define __SSE4_1__
+#define __POPCNT__
#endif
#ifdef __SSSE3__
#include <tmmintrin.h>
#endif
-#ifdef __SSE4_1__
-#include <smmintrin.h>
-#endif
-#ifdef __MMX__
-#include <mmintrin.h>
-#endif
-#ifdef __SSE__
-#include <xmmintrin.h>
-#endif
#ifdef __SSE2__
#include <emmintrin.h>
#endif
-
-inline bool check_SSSE3()
-{
-#ifdef __SSSE3__
- int info[4];
- cpuid(info, 0);
- int nids = info[0];
- if (nids >= 1) {
- cpuid(info, 1);
- return (info[2] & (1 << 9)) != 0;
- }
+#ifdef __SSE4_1__
+#include <smmintrin.h>
#endif
- return false;
-}
+
+void check_simd();
#endif
\ No newline at end of file
diff --git a/src/util/system.h b/src/util/system.h
index 6d507ad..674e3c2 100644
--- a/src/util/system.h
+++ b/src/util/system.h
@@ -1,21 +1,19 @@
/****
-Copyright (c) 2014, University of Tuebingen
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-****
-Author: Benjamin Buchfink
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef SYSTEM_H_
@@ -23,32 +21,13 @@ Author: Benjamin Buchfink
#include <stdexcept>
-#ifdef _WIN32
-#define cpuid(info,x) __cpuidex(info,x,0)
-#else
-inline void cpuid(int CPUInfo[4],int InfoType) {
- __asm__ __volatile__ (
- "cpuid":
- "=a" (CPUInfo[0]),
- "=b" (CPUInfo[1]),
- "=c" (CPUInfo[2]),
- "=d" (CPUInfo[3]) :
- "a" (InfoType), "c" (0)
- );
-}
-#endif
-
#ifdef _MSC_VER
#define PACKED_ATTRIBUTE
-#define FTELL(x) _ftelli64(x)
-#define FSEEK(x,y,z) _fseeki64(x,y,z)
#else
#define PACKED_ATTRIBUTE __attribute__((packed))
-#define FTELL(x) ftell(x)
-#define FSEEK(x,y,z) fseek(x,y,z)
#endif
diff --git a/src/util/system_c.h b/src/util/system_c.h
index 1c8023d..01ed7cd 100644
--- a/src/util/system_c.h
+++ b/src/util/system_c.h
@@ -1,3 +1,21 @@
+/****
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+****/
+
#ifndef SYSTEM_C_H_
#define SYSTEM_C_H_
diff --git a/src/util/task_queue.h b/src/util/task_queue.h
index dad5bc1..96346e3 100644
--- a/src/util/task_queue.h
+++ b/src/util/task_queue.h
@@ -1,21 +1,19 @@
/****
-Copyright (c) 2014, University of Tuebingen
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-****
-Author: Benjamin Buchfink
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef TASK_QUEUE_H_
diff --git a/src/util/temp_file.h b/src/util/temp_file.h
index 3094a11..068338c 100644
--- a/src/util/temp_file.h
+++ b/src/util/temp_file.h
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2014-16, University of Tuebingen, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef TEMP_FILE_H_
diff --git a/src/util/text_buffer.h b/src/util/text_buffer.h
index a263fbf..df25e23 100644
--- a/src/util/text_buffer.h
+++ b/src/util/text_buffer.h
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2014-2016, University of Tuebingen, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef TEXT_BUFFER_H_
@@ -31,7 +31,8 @@ struct Text_buffer
Text_buffer():
data_ (0),
- ptr_ (data_)
+ ptr_ (data_),
+ alloc_size_(0)
{ }
~Text_buffer()
@@ -41,8 +42,9 @@ struct Text_buffer
void reserve(size_t n)
{
- const size_t s = ptr_ - data_, new_size = s + n + block_size - ((s + n) & (block_size - 1));
- data_ = (char*)realloc(data_, new_size);
+ const size_t s = ptr_ - data_;
+ alloc_size_ = s + n + block_size - ((s + n) & (block_size - 1));
+ data_ = (char*)realloc(data_, alloc_size_);
ptr_ = data_ + s;
if (data_ == 0) throw std::runtime_error("Failed to allocate memory.");
}
@@ -110,6 +112,11 @@ struct Text_buffer
size_t size() const
{ return ptr_ - data_; }
+ size_t alloc_size() const
+ {
+ return alloc_size_;
+ }
+
Text_buffer& operator<<(const string &s)
{
const size_t l = s.length();
@@ -134,8 +141,8 @@ struct Text_buffer
*(ptr_++) = c;
return *this;
}
-
- Text_buffer& operator<<(uint32_t x)
+
+ Text_buffer& operator<<(unsigned int x)
{
//write(x);
reserve(16);
@@ -151,10 +158,17 @@ struct Text_buffer
return *this;
}
- Text_buffer& operator<<(uint64_t x)
+ Text_buffer& operator<<(unsigned long x)
+ {
+ reserve(32);
+ ptr_ += sprintf(ptr_, "%lu", x);
+ return *this;
+ }
+
+ Text_buffer& operator<<(unsigned long long x)
{
reserve(32);
- ptr_ += sprintf(ptr_, "%llu", (unsigned long long)x);
+ ptr_ += sprintf(ptr_, "%llu", x);
return *this;
}
@@ -182,7 +196,12 @@ struct Text_buffer
Text_buffer& print(unsigned i, unsigned width)
{
reserve(16);
- ptr_ += sprintf(ptr_, "%4u", i);
+ char buf[16];
+ const int n = sprintf(buf, "%u", i),
+ padding = std::max((int)width - n, 0);
+ memset(ptr_, ' ', padding);
+ memcpy(ptr_ + padding, buf, n);
+ ptr_ += padding + n;
return *this;
}
@@ -208,8 +227,9 @@ struct Text_buffer
}
protected:
- enum { block_size = 65536 };
+ enum { block_size = 4096 };
char *data_, *ptr_;
+ size_t alloc_size_;
};
diff --git a/src/util/thread.h b/src/util/thread.h
index 443589f..f37970c 100644
--- a/src/util/thread.h
+++ b/src/util/thread.h
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2016, University of Tuebingen, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef THREAD_H_
@@ -83,18 +83,14 @@ void launch_thread_pool(_context &context, unsigned threads)
vector<tthread::thread*> t;
vector<Thread_p<_context> > p;
p.reserve(threads);
- unsigned n = 0;
for(unsigned i=0;i<threads;++i) {
p.push_back(Thread_p<_context> (i, context));
t.push_back(new tthread::thread(pool_worker<_context>, (void*)&p.back()));
- n += t.back()->get_id() == tthread::thread::id () ? 0 : 1;
}
- for(vector<tthread::thread*>::iterator i=t.begin();i!=t.end();++i) {
+ for (vector<tthread::thread*>::iterator i = t.begin(); i != t.end(); ++i) {
(*i)->join();
delete *i;
}
- if(n != threads)
- throw std::runtime_error("Failed to create thread.");
}
template<typename _context>
@@ -123,6 +119,31 @@ void launch_scheduled_thread_pool(_context &context, unsigned count, unsigned th
launch_thread_pool(c, threads);
}
+template<typename _f>
+struct Thread_p0
+{
+ Thread_p0(_f f) :
+ f(f)
+ { }
+ _f f;
+};
+
+template<typename _f>
+void thread_worker(void *p)
+{
+ Thread_p0<_f> *q = (Thread_p0<_f>*)p;
+ q->f();
+ delete q;
+ TLS::clear();
+}
+
+template<typename _f>
+thread* launch_thread(_f f)
+{
+ return new thread(thread_worker<_f>, new Thread_p0<_f>(f));
+}
+
+
template<typename _f, typename _t1>
struct Thread_p1
{
@@ -271,6 +292,41 @@ thread* launch_thread(_f f, _t1 p1, _t2 p2, _t3 p3, _t4 p4, _t5 p5)
return new thread(thread_worker<_f, _t1, _t2, _t3, _t4, _t5>, new Thread_p5<_f, _t1, _t2, _t3, _t4, _t5>(f, p1, p2, p3, p4, p5));
}
+template<typename _f, typename _t1, typename _t2, typename _t3, typename _t4, typename _t5, typename _t6> // Argument bundle for a thread that calls f with six arguments.
+struct Thread_p6
+{
+ Thread_p6(_f f, _t1 p1, _t2 p2, _t3 p3, _t4 p4, _t5 p5, _t6 p6) : // Stores the callable and its six arguments in the members below.
+ f(f),
+ p1(p1),
+ p2(p2),
+ p3(p3),
+ p4(p4),
+ p5(p5),
+ p6(p6)
+ { }
+ _f f; // Callable invoked by the six-argument thread_worker as f(p1, ..., p6).
+ _t1 p1; // p1..p6: stored arguments, handed to f in this order.
+ _t2 p2;
+ _t3 p3;
+ _t4 p4;
+ _t5 p5;
+ _t6 p6;
+};
+
+template<typename _f, typename _t1, typename _t2, typename _t3, typename _t4, typename _t5, typename _t6> // Thread entry point for a callable with six arguments.
+void thread_worker(void *p) // p is reinterpreted as a Thread_p6 bundle; the six-argument launch_thread passes one allocated with new.
+{
+ Thread_p6<_f, _t1, _t2, _t3, _t4, _t5,_t6> *q = (Thread_p6<_f, _t1, _t2, _t3, _t4, _t5,_t6>*)p;
+ q->f(q->p1, q->p2, q->p3, q->p4, q->p5,q->p6);
+ delete q; // The worker frees the bundle itself once the callable has returned.
+ TLS::clear(); // NOTE(review): presumably releases this thread's thread-local storage; TLS is defined outside this hunk -- confirm.
+}
+
+template<typename _f, typename _t1, typename _t2, typename _t3, typename _t4, typename _t5,typename _t6> // Starts a new thread that calls f(p1, ..., p6).
+thread* launch_thread(_f f, _t1 p1, _t2 p2, _t3 p3, _t4 p4, _t5 p5, _t6 p6) // Returns a thread object allocated with new; nothing in this function deletes it.
+{
+ return new thread(thread_worker<_f, _t1, _t2, _t3, _t4, _t5, _t6>, new Thread_p6<_f, _t1, _t2, _t3, _t4, _t5, _t6>(f, p1, p2, p3, p4, p5, p6)); // The Thread_p6 bundle is deleted by the matching thread_worker.
+}
struct Thread_pool : public vector<thread*>
{
diff --git a/src/util/tinythread.cpp b/src/util/tinythread.cpp
index 813a3bc..c8c1428 100644
--- a/src/util/tinythread.cpp
+++ b/src/util/tinythread.cpp
@@ -23,11 +23,14 @@ freely, subject to the following restrictions:
#include <exception>
#include <iostream>
+#include <stdexcept>
#include "tinythread.h"
#include "log_stream.h"
#if defined(_TTHREAD_POSIX_)
#include <unistd.h>
+#include <pthread.h>
+#include <errno.h>
#include <map>
#elif defined(_TTHREAD_WIN32_)
#include <process.h>
@@ -36,183 +39,191 @@ freely, subject to the following restrictions:
namespace tthread {
-//------------------------------------------------------------------------------
-// condition_variable
-//------------------------------------------------------------------------------
-// NOTE 1: The Win32 implementation of the condition_variable class is based on
-// the corresponding implementation in GLFW, which in turn is based on a
-// description by Douglas C. Schmidt and Irfan Pyarali:
-// http://www.cs.wustl.edu/~schmidt/win32-cv-1.html
-//
-// NOTE 2: Windows Vista actually has native support for condition variables
-// (InitializeConditionVariable, WakeConditionVariable, etc), but we want to
-// be portable with pre-Vista Windows versions, so TinyThread++ does not use
-// Vista condition variables.
-//------------------------------------------------------------------------------
+ //------------------------------------------------------------------------------
+ // condition_variable
+ //------------------------------------------------------------------------------
+ // NOTE 1: The Win32 implementation of the condition_variable class is based on
+ // the corresponding implementation in GLFW, which in turn is based on a
+ // description by Douglas C. Schmidt and Irfan Pyarali:
+ // http://www.cs.wustl.edu/~schmidt/win32-cv-1.html
+ //
+ // NOTE 2: Windows Vista actually has native support for condition variables
+ // (InitializeConditionVariable, WakeConditionVariable, etc), but we want to
+ // be portable with pre-Vista Windows versions, so TinyThread++ does not use
+ // Vista condition variables.
+ //------------------------------------------------------------------------------
#if defined(_TTHREAD_WIN32_)
- #define _CONDITION_EVENT_ONE 0
- #define _CONDITION_EVENT_ALL 1
+#define _CONDITION_EVENT_ONE 0
+#define _CONDITION_EVENT_ALL 1
#endif
#if defined(_TTHREAD_WIN32_)
-condition_variable::condition_variable() : mWaitersCount(0)
-{
- mEvents[_CONDITION_EVENT_ONE] = CreateEvent(NULL, FALSE, FALSE, NULL);
- mEvents[_CONDITION_EVENT_ALL] = CreateEvent(NULL, TRUE, FALSE, NULL);
- InitializeCriticalSection(&mWaitersCountLock);
-}
+ condition_variable::condition_variable() : mWaitersCount(0)
+ {
+ mEvents[_CONDITION_EVENT_ONE] = CreateEvent(NULL, FALSE, FALSE, NULL);
+ mEvents[_CONDITION_EVENT_ALL] = CreateEvent(NULL, TRUE, FALSE, NULL);
+ InitializeCriticalSection(&mWaitersCountLock);
+ }
#endif
#if defined(_TTHREAD_WIN32_)
-condition_variable::~condition_variable()
-{
- CloseHandle(mEvents[_CONDITION_EVENT_ONE]);
- CloseHandle(mEvents[_CONDITION_EVENT_ALL]);
- DeleteCriticalSection(&mWaitersCountLock);
-}
+ condition_variable::~condition_variable()
+ {
+ CloseHandle(mEvents[_CONDITION_EVENT_ONE]);
+ CloseHandle(mEvents[_CONDITION_EVENT_ALL]);
+ DeleteCriticalSection(&mWaitersCountLock);
+ }
#endif
#if defined(_TTHREAD_WIN32_)
-void condition_variable::_wait()
-{
- // Wait for either event to become signaled due to notify_one() or
- // notify_all() being called
- int result = WaitForMultipleObjects(2, mEvents, FALSE, INFINITE);
-
- // Check if we are the last waiter
- EnterCriticalSection(&mWaitersCountLock);
- -- mWaitersCount;
- bool lastWaiter = (result == (WAIT_OBJECT_0 + _CONDITION_EVENT_ALL)) &&
- (mWaitersCount == 0);
- LeaveCriticalSection(&mWaitersCountLock);
-
- // If we are the last waiter to be notified to stop waiting, reset the event
- if(lastWaiter)
- ResetEvent(mEvents[_CONDITION_EVENT_ALL]);
-}
+ void condition_variable::_wait()
+ {
+ // Wait for either event to become signaled due to notify_one() or
+ // notify_all() being called
+ int result = WaitForMultipleObjects(2, mEvents, FALSE, INFINITE);
+
+ // Check if we are the last waiter
+ EnterCriticalSection(&mWaitersCountLock);
+ --mWaitersCount;
+ bool lastWaiter = (result == (WAIT_OBJECT_0 + _CONDITION_EVENT_ALL)) &&
+ (mWaitersCount == 0);
+ LeaveCriticalSection(&mWaitersCountLock);
+
+ // If we are the last waiter to be notified to stop waiting, reset the event
+ if (lastWaiter)
+ ResetEvent(mEvents[_CONDITION_EVENT_ALL]);
+ }
#endif
#if defined(_TTHREAD_WIN32_)
-void condition_variable::notify_one()
-{
- // Are there any waiters?
- EnterCriticalSection(&mWaitersCountLock);
- bool haveWaiters = (mWaitersCount > 0);
- LeaveCriticalSection(&mWaitersCountLock);
-
- // If we have any waiting threads, send them a signal
- if(haveWaiters)
- SetEvent(mEvents[_CONDITION_EVENT_ONE]);
-}
+ void condition_variable::notify_one()
+ {
+ // Are there any waiters?
+ EnterCriticalSection(&mWaitersCountLock);
+ bool haveWaiters = (mWaitersCount > 0);
+ LeaveCriticalSection(&mWaitersCountLock);
+
+ // If we have any waiting threads, send them a signal
+ if (haveWaiters)
+ SetEvent(mEvents[_CONDITION_EVENT_ONE]);
+ }
#endif
#if defined(_TTHREAD_WIN32_)
-void condition_variable::notify_all()
-{
- // Are there any waiters?
- EnterCriticalSection(&mWaitersCountLock);
- bool haveWaiters = (mWaitersCount > 0);
- LeaveCriticalSection(&mWaitersCountLock);
-
- // If we have any waiting threads, send them a signal
- if(haveWaiters)
- SetEvent(mEvents[_CONDITION_EVENT_ALL]);
-}
+ void condition_variable::notify_all()
+ {
+ // Are there any waiters?
+ EnterCriticalSection(&mWaitersCountLock);
+ bool haveWaiters = (mWaitersCount > 0);
+ LeaveCriticalSection(&mWaitersCountLock);
+
+ // If we have any waiting threads, send them a signal
+ if (haveWaiters)
+ SetEvent(mEvents[_CONDITION_EVENT_ALL]);
+ }
#endif
-//------------------------------------------------------------------------------
-// POSIX pthread_t to unique thread::id mapping logic.
-// Note: Here we use a global thread safe std::map to convert instances of
-// pthread_t to small thread identifier numbers (unique within one process).
-// This method should be portable across different POSIX implementations.
-//------------------------------------------------------------------------------
+ //------------------------------------------------------------------------------
+ // POSIX pthread_t to unique thread::id mapping logic.
+ // Note: Here we use a global thread safe std::map to convert instances of
+ // pthread_t to small thread identifier numbers (unique within one process).
+ // This method should be portable across different POSIX implementations.
+ //------------------------------------------------------------------------------
#if defined(_TTHREAD_POSIX_)
-static thread::id _pthread_t_to_ID(const pthread_t &aHandle)
-{
- static mutex idMapLock;
- static std::map<pthread_t, unsigned long int> idMap;
- static unsigned long int idCount(1);
-
- lock_guard<mutex> guard(idMapLock);
- if(idMap.find(aHandle) == idMap.end())
- idMap[aHandle] = idCount ++;
- return thread::id(idMap[aHandle]);
-}
+ static thread::id _pthread_t_to_ID(const pthread_t &aHandle)
+ {
+ static mutex idMapLock;
+ static std::map<pthread_t, unsigned long int> idMap;
+ static unsigned long int idCount(1);
+
+ lock_guard<mutex> guard(idMapLock);
+ if (idMap.find(aHandle) == idMap.end())
+ idMap[aHandle] = idCount++;
+ return thread::id(idMap[aHandle]);
+ }
#endif // _TTHREAD_POSIX_
-//------------------------------------------------------------------------------
-// thread
-//------------------------------------------------------------------------------
+ //------------------------------------------------------------------------------
+ // thread
+ //------------------------------------------------------------------------------
-/// Information to pass to the new thread (what to run).
-struct _thread_start_info {
- void (*mFunction)(void *); ///< Pointer to the function to be executed.
- void * mArg; ///< Function argument for the thread function.
- thread * mThread; ///< Pointer to the thread object.
-};
+ /// Information to pass to the new thread (what to run).
+ struct _thread_start_info {
+ void(*mFunction)(void *); ///< Pointer to the function to be executed.
+ void * mArg; ///< Function argument for the thread function.
+ thread * mThread; ///< Pointer to the thread object.
+ };
-// Thread wrapper function.
+ // Thread wrapper function.
#if defined(_TTHREAD_WIN32_)
-unsigned WINAPI thread::wrapper_function(void * aArg)
+ unsigned WINAPI thread::wrapper_function(void * aArg)
#elif defined(_TTHREAD_POSIX_)
-void * thread::wrapper_function(void * aArg)
+ void * thread::wrapper_function(void * aArg)
#endif
-{
- // Get thread startup information
- _thread_start_info * ti = (_thread_start_info *) aArg;
-
- try
- {
- // Call the actual client thread function
- ti->mFunction(ti->mArg);
- }
- catch (std::exception &e) {
- std::cerr << "Error: " << e.what() << std::endl;
- log_stream << "Error: " << e.what() << std::endl;
- std::terminate();
- }
- catch(...)
- {
- // Uncaught exceptions will terminate the application (default behavior
- // according to C++11)
- std::terminate();
- }
-
- // The thread is no longer executing
- lock_guard<mutex> guard(ti->mThread->mDataMutex);
- //ti->mThread->mNotAThread = true;
-
- // The thread is responsible for freeing the startup information
- delete ti;
-
- return 0;
-}
-
-thread::thread(void (*aFunction)(void *), void * aArg)
-{
- // Serialize access to this thread structure
- lock_guard<mutex> guard(mDataMutex);
-
- // Fill out the thread startup information (passed to the thread wrapper,
- // which will eventually free it)
- _thread_start_info * ti = new _thread_start_info;
- ti->mFunction = aFunction;
- ti->mArg = aArg;
- ti->mThread = this;
-
- // The thread is now alive
- mNotAThread = false;
-
- // Create the thread
+ {
+ // Get thread startup information
+ _thread_start_info * ti = (_thread_start_info *)aArg;
+
+ try
+ {
+ // Call the actual client thread function
+ ti->mFunction(ti->mArg);
+ }
+ catch (std::exception &e) {
+ std::cerr << "Error: " << e.what() << std::endl;
+ log_stream << "Error: " << e.what() << std::endl;
+ std::terminate();
+ }
+ catch (...)
+ {
+ // Uncaught exceptions will terminate the application (default behavior
+ // according to C++11)
+ std::terminate();
+ }
+
+ // The thread is no longer executing
+ lock_guard<mutex> guard(ti->mThread->mDataMutex);
+ //ti->mThread->mNotAThread = true;
+
+ // The thread is responsible for freeing the startup information
+ delete ti;
+
+ return 0;
+ }
+
+ thread::thread(void(*aFunction)(void *), void * aArg)
+ {
+ // Serialize access to this thread structure
+ lock_guard<mutex> guard(mDataMutex);
+
+ // Fill out the thread startup information (passed to the thread wrapper,
+ // which will eventually free it)
+ _thread_start_info * ti = new _thread_start_info;
+ ti->mFunction = aFunction;
+ ti->mArg = aArg;
+ ti->mThread = this;
+
+ // The thread is now alive
+ mNotAThread = false;
+
+ // Create the thread
#if defined(_TTHREAD_WIN32_)
- mHandle = (HANDLE) _beginthreadex(0, 0, wrapper_function, (void *) ti, 0, &mWin32ThreadID);
+ mHandle = (HANDLE)_beginthreadex(0, 0, wrapper_function, (void *)ti, 0, &mWin32ThreadID);
#elif defined(_TTHREAD_POSIX_)
- if(pthread_create(&mHandle, NULL, wrapper_function, (void *) ti) != 0)
- mHandle = 0;
+ int error;
+ if ((error = pthread_create(&mHandle, NULL, wrapper_function, (void *)ti)) != 0) {
+ switch (error) {
+ case EAGAIN: throw std::runtime_error("pthread_create error: Insufficient resources to create another thread, or a system-imposed limit on the number of threads was encountered.");
+ case EINVAL: throw std::runtime_error("pthread_create error: Invalid settings in attr.");
+ case EPERM: throw std::runtime_error("pthread_create error: No permission to set the scheduling policy and parameters specified in attr.");
+ default: throw std::runtime_error("pthread_create error: Unknown error.");
+ }
+ mHandle = 0;
+ }
#endif
// Did we fail to create the thread?
diff --git a/src/util/util.cpp b/src/util/util.cpp
index 1c4463c..82b2b0e 100644
--- a/src/util/util.cpp
+++ b/src/util/util.cpp
@@ -1,33 +1,25 @@
/****
-Copyright (c) 2016, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
-#include <stdio.h>
-#include <exception>
-#include <sstream>
-#ifndef _MSC_VER
-#include <unistd.h>
-#endif
#include "../basic/config.h"
#include "log_stream.h"
#include "complexity_filter.h"
#include "util.h"
-#include "temp_file.h"
-#include "binary_file.h"
Message_stream message_stream;
Message_stream verbose_stream (false);
@@ -48,44 +40,6 @@ string extract_dir(const string & s)
return s.find_last_of(dir_separator) == string::npos ? "" : s.substr(0, s.find_last_of(dir_separator));
}
-unsigned Temp_file::n = 0;
-uint64_t Temp_file::hash_key;
-
-Temp_file::Temp_file()
-{
- if (n == 0) {
-#ifdef WIN32
- LARGE_INTEGER count;
- QueryPerformanceCounter(&count);
- hash_key = (uint64_t)(count.HighPart + count.LowPart + count.QuadPart + GetCurrentProcessId());
-#else
- timeval count;
- gettimeofday(&count, NULL);
- hash_key = count.tv_sec + count.tv_usec + getpid();
-#endif
- }
- std::stringstream ss;
- ss.setf(std::ios::hex, std::ios::basefield);
- if (config.tmpdir != "")
- ss << config.tmpdir << dir_separator;
- ss << "diamond-" << hash_key << "-" << n++ << ".tmp";
- ss >> this->file_name_;
- this->f_ = fopen(this->file_name_.c_str(), "w+b");
- if (this->f_ == 0)
- throw std::runtime_error("Error opening temporary file: " + this->file_name_);
-#ifndef _MSC_VER
- unlink(this->file_name_.c_str());
-#endif
-}
-
-string Temp_file::get_temp_dir()
-{
- Temp_file t;
- Input_stream f(t);
- f.close_and_delete();
- return extract_dir(f.file_name);
-}
-
Sd::Sd(const vector<Sd> &groups):
A(0),
Q(0),
diff --git a/src/util/util.h b/src/util/util.h
index 69982f6..b0c9479 100644
--- a/src/util/util.h
+++ b/src/util/util.h
@@ -1,19 +1,19 @@
/****
-Copyright (c) 2014-2016, University of Tuebingen, Benjamin Buchfink
-All rights reserved.
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#ifndef UTIL_H_
@@ -67,11 +67,11 @@ struct interval
begin_ (0),
end_ (0)
{ }
- interval(unsigned begin, unsigned end):
+ interval(int begin, int end):
begin_ (begin),
end_ (end)
{ }
- unsigned length() const
+ int length() const
{ return end_ > begin_ ? end_ - begin_ : 0; }
unsigned overlap(const interval &rhs) const
{ return intersect(*this, rhs).length(); }
@@ -79,17 +79,32 @@ struct interval
{
return (double)overlap(rhs) / (double)length();
}
- bool includes(unsigned p) const
+ bool includes(int p) const
{ return p >= begin_ && p < end_; }
- friend inline interval intersect(const interval &lhs, const interval &rhs)
- { return interval (std::max(lhs.begin_, rhs.begin_), std::min(lhs.end_, rhs.end_)); }
friend std::ostream& operator<<(std::ostream &os, const interval &x)
{ os << "[" << x.begin_ << ";" << x.end_ << "]"; return os; }
bool operator<(const interval &rhs) const
{
return begin_ < rhs.begin_;
}
- unsigned begin_, end_;
+ friend interval intersect(const interval &lhs, const interval &rhs);
+ int begin_, end_;
+};
+
+inline interval intersect(const interval &lhs, const interval &rhs) // Overlap of two intervals: the larger begin_ paired with the smaller end_.
+{
+ return interval(std::max(lhs.begin_, rhs.begin_), std::min(lhs.end_, rhs.end_)); // If the result has end_ <= begin_ (no overlap), interval::length() reports it as 0.
+}
+
+struct Interval_set : public vector<interval> // A vector of intervals with an added overlap query.
+{
+ int max_intersect(const interval &x) const // Returns the largest overlap length between x and any stored interval.
+ {
+ int max = 0; // Result when the set is empty or nothing overlaps x.
+ for (const_iterator i = begin(); i < end(); ++i)
+ max = std::max(max, intersect(x, *i).length());
+ return max;
+ }
};
#ifdef __SSE2__
@@ -235,6 +250,21 @@ inline size_t find_first_of(const char *s, const char *delimiters)
return t-s;
}
+inline string get_title(const string &s) // Returns the part of s before the first character found in Const::id_delimiters.
+{
+ return s.substr(0, find_first_of(s.c_str(), Const::id_delimiters)); // NOTE(review): presumably find_first_of yields the string length when no delimiter occurs, so all of s is returned -- confirm.
+}
+
+inline void get_title_def(const string &s, string &title, string &def) // Splits s at the first character from Const::id_delimiters; title and def are outputs.
+{
+ const size_t i = find_first_of(s.c_str(), Const::id_delimiters);
+ title = s.substr(0, i); // Everything before position i.
+ if (i >= s.length())
+ def.clear(); // Position i is at or past the end of s: nothing follows the title.
+ else
+ def = s.substr(i + 1); // Remainder after position i; the character at i itself is dropped.
+}
+
inline size_t print_str(char* buf, const char *s, size_t n)
{
memcpy(buf, s, n);
@@ -284,6 +314,7 @@ inline bool check_dir(const string &path)
#endif
}
+extern const char dir_separator;
string extract_dir(const string &s);
inline std::ostream& indent(std::ostream &str, unsigned n)
@@ -326,6 +357,37 @@ struct Array
_t data_[n];
};
+template<typename _t, int _n> // Array of _n elements of type _t plus a count of how many are in use.
+struct Static_vector
+{
+ Static_vector(): // Starts empty (size() == 0).
+ n(0)
+ {}
+ _t& operator[](int i) // Unchecked access: i is not validated against size() or _n.
+ {
+ return data[i];
+ }
+ const _t& operator[](int i) const // Const overload of the unchecked access.
+ {
+ return data[i];
+ }
+ int size() const // Number of elements stored so far.
+ {
+ return n;
+ }
+ void push_back(const _t &x) // Appends a copy of x; there is no check that size() < _n, so the caller must not overfill.
+ {
+ data[n++] = x;
+ }
+ void erase(int i) // Removes element i by shifting the following elements down one slot.
+ {
+ memmove(&data[i], &data[i + 1], (--n - i)*sizeof(_t)); // Raw byte move: no assignment operator or destructor runs, so _t should be trivially copyable.
+ }
+private:
+ _t data[_n]; // Storage for all _n elements, present from construction.
+ int n; // Count of elements in use.
+};
+
inline int ctz(uint64_t x)
{
#ifdef _MSC_VER
@@ -464,14 +526,14 @@ inline string print_char(char c)
template<typename _t, int n>
struct Top_list
{
- void add(const _t &x)
+ _t& add(const _t &x)
{
for (int i = 0; i < n; ++i)
if ((int)x >(int)data_[i]) {
if (i < n - 1)
memmove(&data_[i + 1], &data_[i], sizeof(data_)/n*(n - 1 - i));
data_[i] = x;
- return;
+ return data_[i];
}
}
const _t& operator[](unsigned i) const
@@ -482,6 +544,10 @@ struct Top_list
{
return data_[i];
}
+ void sort() // Sorts all n slots into ascending order using operator< of _t.
+ {
+ std::sort(&data_[0], &data_[n]); // NOTE(review): add() inserts by descending (int) value, so this reverses that arrangement -- confirm callers expect it.
+ }
private:
_t data_[n];
};
@@ -547,4 +613,40 @@ inline _t safe_cast(size_t x)
return (_t)x;
}
+struct Index_iterator // Minimal iterator-like wrapper around a size_t counter.
+{
+ Index_iterator(size_t i) : // Not declared explicit, so a size_t converts implicitly.
+ i(i)
+ {}
+ size_t operator*() const // Dereferencing yields the current index by value.
+ {
+ return i;
+ }
+ bool operator!=(const Index_iterator &rhs) const // Only != is provided; there is no operator==.
+ {
+ return i != rhs.i;
+ }
+ Index_iterator& operator++() // Pre-increment only: advances to the next index.
+ {
+ ++i;
+ return *this;
+ }
+ size_t i; // Current index.
+};
+
+inline double megabytes(size_t x) // Divides x by 2^20 (presumably a byte count converted to mebibytes).
+{
+ return (double)x / (1 << 20);
+}
+
+inline int make_multiple(int x, int m) // Rounds a non-negative x up to the nearest multiple of m; m must be non-zero because x % m is evaluated.
+{
+ return x % m == 0 ? x : x + m - x%m; // NOTE(review): for negative x the remainder is negative, so the result overshoots (x = -3, m = 4 gives 4, not 0) -- confirm callers pass x >= 0.
+}
+
+inline uint64_t next_power_of_2(double x) // Smallest power of two that is >= x; returns 1 for x <= 1, zero, negative or NaN input.
+{ // Precondition: x is finite and at most 2^63, otherwise the shift below exceeds the width of uint64_t.
+ int e = 0; return !(x > 1.0) ? 1llu : (frexp(x, &e) == 0.5 ? 1llu << (e - 1) : 1llu << e); // frexp gives x = m * 2^e with m in [0.5, 1); m == 0.5 means x is already an exact power of two. Exact, unlike ceil(log(x) / log(2)).
+}
+
#endif /* UTIL_H_ */
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/diamond-aligner.git
More information about the debian-med-commit
mailing list