[med-svn] [libmems] 01/02: Imported Upstream version 1.6.0+4725
Andreas Tille
tille at debian.org
Fri Apr 17 20:36:46 UTC 2015
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository libmems.
commit d6a03948c5ba705cb1acc8c5fad540b5d87f78e5
Author: Andreas Tille <tille at debian.org>
Date: Fri Apr 17 22:35:37 2015 +0200
Imported Upstream version 1.6.0+4725
---
AUTHORS | 1 +
COPYING | 340 +++
ChangeLog | 0
INSTALL | 167 ++
Makefile.am | 14 +
NEWS | 0
README | 1 +
TODO | 1 +
autogen.sh | 5 +
configure.ac | 137 ++
doxygen.am | 157 ++
libMems-1.6.pc.in | 12 +
libMems/AbstractGappedAlignment.h | 109 +
libMems/AbstractMatch.h | 392 +++
libMems/Aligner.cpp | 2289 ++++++++++++++++++
libMems/Aligner.h | 307 +++
libMems/Backbone.cpp | 1203 ++++++++++
libMems/Backbone.h | 240 ++
libMems/ClustalInterface.cpp | 576 +++++
libMems/ClustalInterface.h | 101 +
libMems/CompactGappedAlignment.h | 819 +++++++
libMems/DNAFileSML.cpp | 68 +
libMems/DNAFileSML.h | 66 +
libMems/DNAMemorySML.cpp | 48 +
libMems/DNAMemorySML.h | 55 +
libMems/DenseAbstractMatch.h | 169 ++
libMems/DistanceMatrix.h | 327 +++
libMems/FileSML.cpp | 679 ++++++
libMems/FileSML.h | 135 ++
libMems/Files.h | 213 ++
libMems/GappedAligner.h | 73 +
libMems/GappedAlignment.cpp | 77 +
libMems/GappedAlignment.h | 283 +++
libMems/GreedyBreakpointElimination.cpp | 994 ++++++++
libMems/GreedyBreakpointElimination.h | 873 +++++++
libMems/HomologyHMM/algebras.cc | 52 +
libMems/HomologyHMM/algebras.h | 558 +++++
libMems/HomologyHMM/dptables.h | 387 +++
libMems/HomologyHMM/homology.cc | 786 ++++++
libMems/HomologyHMM/homology.h | 188 ++
libMems/HomologyHMM/homology.xml | 217 ++
libMems/HomologyHMM/homologymain.cc | 65 +
libMems/HomologyHMM/parameters.h | 162 ++
libMems/HybridAbstractMatch.h | 315 +++
libMems/Interval.cpp | 25 +
libMems/Interval.h | 958 ++++++++
libMems/IntervalList.cpp | 25 +
libMems/IntervalList.h | 842 +++++++
libMems/Islands.cpp | 320 +++
libMems/Islands.h | 417 ++++
libMems/LCB.h | 70 +
libMems/Makefile.am | 85 +
libMems/MaskedMemHash.cpp | 65 +
libMems/MaskedMemHash.h | 44 +
libMems/Match.h | 33 +
libMems/MatchFinder.cpp | 444 ++++
libMems/MatchFinder.h | 380 +++
libMems/MatchHashEntry.cpp | 203 ++
libMems/MatchHashEntry.h | 147 ++
libMems/MatchList.cpp | 26 +
libMems/MatchList.h | 668 ++++++
libMems/MatchProjectionAdapter.h | 142 ++
libMems/Matrix.h | 174 ++
libMems/MemHash.cpp | 330 +++
libMems/MemHash.h | 208 ++
libMems/Memory.h | 60 +
libMems/MemorySML.cpp | 96 +
libMems/MemorySML.h | 58 +
libMems/MuscleInterface.cpp | 1192 ++++++++++
libMems/MuscleInterface.h | 148 ++
libMems/NumericMatrix.h | 164 ++
libMems/PairwiseMatchAdapter.h | 117 +
libMems/PairwiseMatchFinder.cpp | 73 +
libMems/PairwiseMatchFinder.h | 38 +
libMems/ParallelMemHash.cpp | 133 ++
libMems/ParallelMemHash.h | 75 +
libMems/PhyloTree.cpp | 9 +
libMems/PhyloTree.h | 378 +++
libMems/ProgressiveAligner.cpp | 3945 +++++++++++++++++++++++++++++++
libMems/ProgressiveAligner.h | 637 +++++
libMems/RepeatHash.cpp | 64 +
libMems/RepeatHash.h | 54 +
libMems/RepeatMatch.cpp | 51 +
libMems/RepeatMatch.h | 51 +
libMems/RepeatMatchList.cpp | 300 +++
libMems/RepeatMatchList.h | 66 +
libMems/Scoring.h | 335 +++
libMems/SeedMasks.h | 404 ++++
libMems/SeedOccurrenceList.h | 100 +
libMems/SlotAllocator.cpp | 5 +
libMems/SlotAllocator.h | 170 ++
libMems/SortedMerList.cpp | 826 +++++++
libMems/SortedMerList.h | 323 +++
libMems/SparseAbstractMatch.h | 250 ++
libMems/SubstitutionMatrix.h | 111 +
libMems/SuperInterval.cpp | 124 +
libMems/SuperInterval.h | 81 +
libMems/TreeUtilities.h | 138 ++
libMems/UngappedLocalAlignment.h | 227 ++
libMems/configuration.h | 37 +
libMems/dmSML/Makefile.am | 22 +
libMems/dmSML/aPOSIXaio.c | 124 +
libMems/dmSML/aPOSIXaio.h | 18 +
libMems/dmSML/alibc.c | 47 +
libMems/dmSML/alibc.h | 15 +
libMems/dmSML/alinuxaio.c | 283 +++
libMems/dmSML/alinuxaio.h | 19 +
libMems/dmSML/asyncio.c | 358 +++
libMems/dmSML/asyncio.h | 166 ++
libMems/dmSML/awin32aio.c | 160 ++
libMems/dmSML/awin32aio.h | 18 +
libMems/dmSML/buffer.c | 407 ++++
libMems/dmSML/buffer.h | 203 ++
libMems/dmSML/dmsort.c | 1942 +++++++++++++++
libMems/dmSML/dmsort.h | 197 ++
libMems/dmSML/sml.c | 55 +
libMems/dmSML/sml.h | 79 +
libMems/dmSML/sorting.c | 323 +++
libMems/dmSML/sorting.h | 81 +
libMems/dmSML/timing.c | 164 ++
libMems/dmSML/timing.h | 24 +
libMems/dmSML/util.c | 132 ++
libMems/dmSML/util.h | 28 +
libMems/gnAlignedSequences.cpp | 1570 ++++++++++++
libMems/gnAlignedSequences.h | 401 ++++
libMems/gnRAWSequence.h | 202 ++
libMems/twister.c | 224 ++
libMems/twister.h | 18 +
m4/ac_cxx_namespaces.m4 | 25 +
m4/ax_openmp.m4 | 104 +
m4/ax_prog_doxygen.m4 | 535 +++++
m4/boost.m4 | 1343 +++++++++++
m4/pkg.m4 | 156 ++
projects/libMems.doxygen | 212 ++
projects/libMems.kdevprj | 281 +++
projects/libMems.sln | 20 +
projects/libMems.vcproj | 1033 ++++++++
137 files changed, 41096 insertions(+)
diff --git a/AUTHORS b/AUTHORS
new file mode 100644
index 0000000..7eb5af2
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1 @@
+Aaron Darling <darling(at)cs.wisc.edu>
diff --git a/COPYING b/COPYING
new file mode 100644
index 0000000..d60c31a
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,340 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
+ 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.) You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+ To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+ We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+ Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ GNU GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+ 1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+ 2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in
+ whole or in part contains or is derived from the Program or any
+ part thereof, to be licensed as a whole at no charge to all third
+ parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you provide
+ a warranty) and that users may redistribute the program under
+ these conditions, and telling the user how to view a copy of this
+ License. (Exception: if the Program itself is interactive but
+ does not normally print such an announcement, your work based on
+ the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of Sections
+ 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+ 5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+ 6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+ 7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation. If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+ 10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+ NO WARRANTY
+
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+ Gnomovision version 69, Copyright (C) year name of author
+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary. Here is a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+ `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+ <signature of Ty Coon>, 1 April 1989
+ Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs. If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library. If this is what you want to do, use the GNU Library General
+Public License instead of this License.
diff --git a/ChangeLog b/ChangeLog
new file mode 100644
index 0000000..e69de29
diff --git a/INSTALL b/INSTALL
new file mode 100644
index 0000000..02a4a07
--- /dev/null
+++ b/INSTALL
@@ -0,0 +1,167 @@
+Basic Installation
+==================
+
+ These are generic installation instructions.
+
+ The `configure' shell script attempts to guess correct values for
+various system-dependent variables used during compilation. It uses
+those values to create a `Makefile' in each directory of the package.
+It may also create one or more `.h' files containing system-dependent
+definitions. Finally, it creates a shell script `config.status' that
+you can run in the future to recreate the current configuration, a file
+`config.cache' that saves the results of its tests to speed up
+reconfiguring, and a file `config.log' containing compiler output
+(useful mainly for debugging `configure').
+
+ If you need to do unusual things to compile the package, please try
+to figure out how `configure' could check whether to do them, and mail
+diffs or instructions to the address given in the `README' so they can
+be considered for the next release. If at some point `config.cache'
+contains results you don't want to keep, you may remove or edit it.
+
+ The file `configure.in' is used to create `configure' by a program
+called `autoconf'. You only need `configure.in' if you want to change
+it or regenerate `configure' using a newer version of `autoconf'.
+
+The simplest way to compile this package is:
+
+ 1. `cd' to the directory containing the package's source code and type
+ `./configure' to configure the package for your system. If you're
+ using `csh' on an old version of System V, you might need to type
+ `sh ./configure' instead to prevent `csh' from trying to execute
+ `configure' itself.
+
+ Running `configure' takes a while. While running, it prints some
+ messages telling which features it is checking for.
+
+ 2. Type `make' to compile the package.
+
+ 3. Type `make install' to install the programs and any data files and
+ documentation.
+
+ 4. You can remove the program binaries and object files from the
+ source code directory by typing `make clean'.
+
+Compilers and Options
+=====================
+
+ Some systems require unusual options for compilation or linking that
+the `configure' script does not know about. You can give `configure'
+initial values for variables by setting them in the environment. Using
+a Bourne-compatible shell, you can do that on the command line like
+this:
+ CC=c89 CFLAGS=-O2 LIBS=-lposix ./configure
+
+Or on systems that have the `env' program, you can do it like this:
+ env CPPFLAGS=-I/usr/local/include LDFLAGS=-s ./configure
+
+Compiling For Multiple Architectures
+====================================
+
+ You can compile the package for more than one kind of computer at the
+same time, by placing the object files for each architecture in their
+own directory. To do this, you must use a version of `make' that
+supports the `VPATH' variable, such as GNU `make'. `cd' to the
+directory where you want the object files and executables to go and run
+the `configure' script. `configure' automatically checks for the
+source code in the directory that `configure' is in and in `..'.
+
+ If you have to use a `make' that does not support the `VPATH'
+variable, you have to compile the package for one architecture at a time
+in the source code directory. After you have installed the package for
+one architecture, use `make distclean' before reconfiguring for another
+architecture.
+
+Installation Names
+==================
+
+ By default, `make install' will install the package's files in
+`/usr/local/bin', `/usr/local/man', etc. You can specify an
+installation prefix other than `/usr/local' by giving `configure' the
+option `--prefix=PATH'.
+
+ You can specify separate installation prefixes for
+architecture-specific files and architecture-independent files. If you
+give `configure' the option `--exec-prefix=PATH', the package will use
+PATH as the prefix for installing programs and libraries.
+Documentation and other data files will still use the regular prefix.
+
+ If the package supports it, you can cause programs to be installed
+with an extra prefix or suffix on their names by giving `configure' the
+option `--program-prefix=PREFIX' or `--program-suffix=SUFFIX'.
+
+Optional Features
+=================
+
+ Some packages pay attention to `--enable-FEATURE' options to
+`configure', where FEATURE indicates an optional part of the package.
+They may also pay attention to `--with-PACKAGE' options, where PACKAGE
+is something like `gnu-as' or `x' (for the X Window System). The
+`README' should mention any `--enable-' and `--with-' options that the
+package recognizes.
+
+ For packages that use the X Window System, `configure' can usually
+find the X include and library files automatically, but if it doesn't,
+you can use the `configure' options `--x-includes=DIR' and
+`--x-libraries=DIR' to specify their locations.
+
+Specifying the System Type
+==========================
+
+ There may be some features `configure' can not figure out
+automatically, but needs to determine by the type of host the package
+will run on. Usually `configure' can figure that out, but if it prints
+a message saying it can not guess the host type, give it the
+`--host=TYPE' option. TYPE can either be a short name for the system
+type, such as `sun4', or a canonical name with three fields:
+ CPU-COMPANY-SYSTEM
+
+See the file `config.sub' for the possible values of each field. If
+`config.sub' isn't included in this package, then this package doesn't
+need to know the host type.
+
+ If you are building compiler tools for cross-compiling, you can also
+use the `--target=TYPE' option to select the type of system they will
+produce code for and the `--build=TYPE' option to select the type of
+system on which you are compiling the package.
+
+Sharing Defaults
+================
+
+ If you want to set default values for `configure' scripts to share,
+you can create a site shell script called `config.site' that gives
+default values for variables like `CC', `cache_file', and `prefix'.
+`configure' looks for `PREFIX/share/config.site' if it exists, then
+`PREFIX/etc/config.site' if it exists. Or, you can set the
+`CONFIG_SITE' environment variable to the location of the site script.
+A warning: not all `configure' scripts look for a site script.
+
+Operation Controls
+==================
+
+ `configure' recognizes the following options to control how it
+operates.
+
+`--cache-file=FILE'
+ Use and save the results of the tests in FILE instead of
+ `./config.cache'. Set FILE to `/dev/null' to disable caching, for
+ debugging `configure'.
+
+`--help'
+ Print a summary of the options to `configure', and exit.
+
+`--quiet'
+`--silent'
+`-q'
+ Do not print messages saying which checks are being made.
+
+`--srcdir=DIR'
+ Look for the package's source code in directory DIR. Usually
+ `configure' can determine that directory automatically.
+
+`--version'
+ Print the version of Autoconf used to generate the `configure'
+ script, and exit.
+
+`configure' also accepts some other, not widely useful, options.
+
diff --git a/Makefile.am b/Makefile.am
new file mode 100644
index 0000000..19b47bc
--- /dev/null
+++ b/Makefile.am
@@ -0,0 +1,14 @@
+ACLOCAL_AMFLAGS = -I m4
+include doxygen.am
+MOSTLYCLEANFILES = $(DX_CLEANFILES)
+
+EXTRA_DIST = \
+projects/libMems.doxygen \
+projects/libMems.sln \
+projects/libMems.vcproj
+
+pkgconfigdir = $(libdir)/pkgconfig
+pkgconfig_DATA = libMems-@GENERIC_API_VERSION@.pc
+
+SUBDIRS = libMems
+
diff --git a/NEWS b/NEWS
new file mode 100644
index 0000000..e69de29
diff --git a/README b/README
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/README
@@ -0,0 +1 @@
+
diff --git a/TODO b/TODO
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/TODO
@@ -0,0 +1 @@
+
diff --git a/autogen.sh b/autogen.sh
new file mode 100755
index 0000000..fc76ff6
--- /dev/null
+++ b/autogen.sh
@@ -0,0 +1,5 @@
+#!/bin/sh
+mkdir -p config
+autoreconf --force --install -I config -I m4
+echo "Now run ./configure --with-boost=</path/to/boost> --prefix=$HOME && make install"
+echo "Add --disable-shared to the configure line if building on Mac OS X"
diff --git a/configure.ac b/configure.ac
new file mode 100644
index 0000000..39159dc
--- /dev/null
+++ b/configure.ac
@@ -0,0 +1,137 @@
+dnl Process this file with autoconf to produce a configure script.
+AC_PREREQ([2.59])
+AC_INIT(libMems/Match.h)
+AC_CONFIG_AUX_DIR(config)
+AC_CONFIG_MACRO_DIR([m4])
+
+dnl -----------------------------------------------
+dnl Package name and version number (user defined)
+dnl -----------------------------------------------
+
+GENERIC_LIBRARY_NAME=libMems
+
+#release versioning
+GENERIC_MAJOR_VERSION=1
+GENERIC_MINOR_VERSION=6
+GENERIC_MICRO_VERSION=0
+
+#API version (often = GENERIC_MAJOR_VERSION.GENERIC_MINOR_VERSION)
+GENERIC_API_VERSION=1.6
+AC_SUBST(GENERIC_API_VERSION)
+
+#shared library versioning
+GENERIC_LIBRARY_VERSION=1:0:0
+# | | |
+# +------+ | +---+
+# | | |
+# current:revision:age
+# | | |
+# | | +- increment if interfaces have been added
+# | | set to zero if interfaces have been removed
+# or changed
+# | +- increment if source code has changed
+# | set to zero if current is incremented
+# +- increment if interfaces have been added, removed or changed
+
+
+
+dnl --------------------------------
+dnl Package name and version number
+dnl --------------------------------
+
+AC_SUBST(GENERIC_LIBRARY_VERSION)
+
+PACKAGE=$GENERIC_LIBRARY_NAME
+AC_SUBST(GENERIC_LIBRARY_NAME)
+
+GENERIC_VERSION=$GENERIC_MAJOR_VERSION.$GENERIC_MINOR_VERSION.$GENERIC_MICRO_VERSION
+GENERIC_RELEASE=$GENERIC_MAJOR_VERSION.$GENERIC_MINOR_VERSION
+AC_SUBST(GENERIC_RELEASE)
+AC_SUBST(GENERIC_VERSION)
+
+VERSION=$GENERIC_VERSION
+
+AM_INIT_AUTOMAKE($PACKAGE, $VERSION, no-define)
+
+dnl Override default O2
+CFLAGS=${CFLAGS-""}
+CXXFLAGS=${CXXFLAGS-""}
+
+AC_PREFIX_DEFAULT(/usr/local)
+
+dnl Checks for programs.
+AC_PROG_CXX
+AC_PROG_INSTALL
+AC_PROG_LN_S
+AM_PROG_LIBTOOL
+AC_SYS_LARGEFILE
+
+dnl Checks for header files.
+AC_HEADER_STDC
+
+dnl Check what compiler we're using.
+dnl The variables are quoted so that values containing spaces (e.g. "ccache g++")
+dnl do not break `test', and the POSIX `=' operator is used because `==' is a
+dnl bash extension that fails under strict /bin/sh implementations.
+AM_CONDITIONAL(ICC, test "x$CXX" = xicc )
+EXTRA_CXX_FLAGS=""
+if test "x$CC" = "xgcc"; then
+	EXTRA_CXX_FLAGS="-Wno-deprecated"
+fi
+AC_SUBST(EXTRA_CXX_FLAGS)
+
+dnl Allow debugging compilation
+AC_ARG_ENABLE(debug,
+[ --enable-debug Turn on debugging],
+[case "${enableval}" in
+ yes) debug=true ;;
+ no) debug=false ;;
+ *) AC_MSG_ERROR(bad value ${enableval} for --enable-debug) ;;
+esac],[debug=false])
+AM_CONDITIONAL(DEBUG, test x$debug = xtrue)
+
+dnl Get location of Boost
+BOOST_REQUIRE(1.34.0)
+AC_CXX_NAMESPACES
+BOOST_FILESYSTEM
+BOOST_PROGRAM_OPTIONS
+BOOST_IOSTREAMS
+
+dnl Get location of libGenome Headers
+PKG_CHECK_MODULES(DEPS, libGenome-1.3 >= 1.3.1 libMUSCLE-3.7 >= 1.0.0)
+AC_SUBST(DEPS_CFLAGS)
+
+dnl Check for OpenMP
+#AX_OPENMP()
+AC_SUBST(OPENMP_CFLAGS)
+dnl OpenMP checker only defines for C when compiling both C and C++
+OPENMP_CXXFLAGS=$OPENMP_CFLAGS
+AC_SUBST(OPENMP_CXXFLAGS)
+
+dnl ensure portability for OS X with these checks
+AC_CHECK_HEADERS(sys/types.h)
+AC_CHECK_HEADERS(sys/aio.h)
+AC_CHECK_HEADERS(aio.h)
+AC_CHECK_HEADERS(features.h)
+
+dnl certain parts of the library need async io and threads
+AC_CHECK_LIB(pthread, pthread_mutex_unlock)
+AC_CHECK_LIB(rt, aio_write)
+AC_SUBST(DEPS_LIBS)
+
+dnl Make doxygen docs
+DX_INIT_DOXYGEN( "libMems", "projects/libMems.doxygen", "doc" )
+
+AM_CONFIG_HEADER(config.h)
+
+dnl Checks for typedefs, structures, and compiler characteristics.
+AC_C_CONST
+AC_C_INLINE
+dnl AC_C_BIGENDIAN
+AC_HEADER_TIME
+
+dnl Checks for library functions.
+AC_PROG_GCC_TRADITIONAL
+
+dnl SAVE_LIBRARY_VERSION
+AC_SUBST(LIBTOOL_VERSION_INFO)
+
+AC_OUTPUT(Makefile libMems/Makefile libMems-1.6.pc )
+#doc/html/Makefile doc/man/Makefile doc/man/man3/Makefile)
diff --git a/doxygen.am b/doxygen.am
new file mode 100644
index 0000000..5046add
--- /dev/null
+++ b/doxygen.am
@@ -0,0 +1,157 @@
+## --------------------------------- ##
+## Format-independent Doxygen rules. ##
+## --------------------------------- ##
+
+if DX_COND_doc
+
+## ------------------------------- ##
+## Rules specific for HTML output. ##
+## ------------------------------- ##
+
+if DX_COND_html
+
+DX_CLEAN_HTML = @DX_DOCDIR@/html
+
+endif DX_COND_html
+
+## ------------------------------ ##
+## Rules specific for CHM output. ##
+## ------------------------------ ##
+
+if DX_COND_chm
+
+DX_CLEAN_CHM = @DX_DOCDIR@/chm
+
+if DX_COND_chi
+
+DX_CLEAN_CHI = @DX_DOCDIR@/@PACKAGE@.chi
+
+endif DX_COND_chi
+
+endif DX_COND_chm
+
+## ------------------------------ ##
+## Rules specific for MAN output. ##
+## ------------------------------ ##
+
+if DX_COND_man
+
+DX_CLEAN_MAN = @DX_DOCDIR@/man
+
+endif DX_COND_man
+
+## ------------------------------ ##
+## Rules specific for RTF output. ##
+## ------------------------------ ##
+
+if DX_COND_rtf
+
+DX_CLEAN_RTF = @DX_DOCDIR@/rtf
+
+endif DX_COND_rtf
+
+## ------------------------------ ##
+## Rules specific for XML output. ##
+## ------------------------------ ##
+
+if DX_COND_xml
+
+DX_CLEAN_XML = @DX_DOCDIR@/xml
+
+endif DX_COND_xml
+
+## ----------------------------- ##
+## Rules specific for PS output. ##
+## ----------------------------- ##
+
+if DX_COND_ps
+
+DX_CLEAN_PS = @DX_DOCDIR@/@PACKAGE@.ps
+
+DX_PS_GOAL = doxygen-ps
+
+doxygen-ps: @DX_DOCDIR@/@PACKAGE@.ps
+
+## Builds the PostScript manual from the doxygen-generated LaTeX tree:
+## latex -> makeindex -> latex, then re-runs latex (at most 5 more times) while
+## the log still asks for a rerun, and finally converts the DVI with dvips.
+## The index step uses $(DX_MAKEINDEX), the same variable as the PDF rule.
+@DX_DOCDIR@/@PACKAGE@.ps: @DX_DOCDIR@/@PACKAGE@.tag
+	cd @DX_DOCDIR@/latex; \
+	rm -f *.aux *.toc *.idx *.ind *.ilg *.log *.out; \
+	$(DX_LATEX) refman.tex; \
+	$(DX_MAKEINDEX) refman.idx; \
+	$(DX_LATEX) refman.tex; \
+	countdown=5; \
+	while $(DX_EGREP) 'Rerun (LaTeX|to get cross-references right)' \
+	                  refman.log > /dev/null 2>&1 \
+	   && test $$countdown -gt 0; do \
+	    $(DX_LATEX) refman.tex; \
+	    countdown=`expr $$countdown - 1`; \
+	done; \
+	$(DX_DVIPS) -o ../@PACKAGE@.ps refman.dvi
+
+endif DX_COND_ps
+
+## ------------------------------ ##
+## Rules specific for PDF output. ##
+## ------------------------------ ##
+
+if DX_COND_pdf
+
+DX_CLEAN_PDF = @DX_DOCDIR@/@PACKAGE@.pdf
+
+DX_PDF_GOAL = doxygen-pdf
+
+doxygen-pdf: @DX_DOCDIR@/@PACKAGE@.pdf
+
+## Builds the PDF manual from the doxygen-generated LaTeX tree:
+## pdflatex -> makeindex -> pdflatex, then re-runs pdflatex (at most 5 more
+## times) while the log still asks for a rerun, and moves the result next to
+## the other generated documentation.
+@DX_DOCDIR@/@PACKAGE@.pdf: @DX_DOCDIR@/@PACKAGE@.tag
+	cd @DX_DOCDIR@/latex; \
+	rm -f *.aux *.toc *.idx *.ind *.ilg *.log *.out; \
+	$(DX_PDFLATEX) refman.tex; \
+	$(DX_MAKEINDEX) refman.idx; \
+	$(DX_PDFLATEX) refman.tex; \
+	countdown=5; \
+	while $(DX_EGREP) 'Rerun (LaTeX|to get cross-references right)' \
+	                  refman.log > /dev/null 2>&1 \
+	   && test $$countdown -gt 0; do \
+	    $(DX_PDFLATEX) refman.tex; \
+	    countdown=`expr $$countdown - 1`; \
+	done; \
+	mv refman.pdf ../@PACKAGE@.pdf
+
+endif DX_COND_pdf
+
+## ------------------------------------------------- ##
+## Rules specific for LaTeX (shared for PS and PDF). ##
+## ------------------------------------------------- ##
+
+if DX_COND_latex
+
+DX_CLEAN_LATEX = @DX_DOCDIR@/latex
+
+endif DX_COND_latex
+
+.PHONY: doxygen-run doxygen-doc $(DX_PS_GOAL) $(DX_PDF_GOAL)
+
+.INTERMEDIATE: doxygen-run $(DX_PS_GOAL) $(DX_PDF_GOAL)
+
+doxygen-run: @DX_DOCDIR@/@PACKAGE@.tag
+
+doxygen-doc: doxygen-run $(DX_PS_GOAL) $(DX_PDF_GOAL)
+
+@DX_DOCDIR@/@PACKAGE@.tag: $(DX_CONFIG) $(pkginclude_HEADERS)
+ rm -rf @DX_DOCDIR@
+ $(DX_ENV) $(DX_DOXYGEN) $(srcdir)/$(DX_CONFIG)
+
+DX_CLEANFILES = \
+ @DX_DOCDIR@/@PACKAGE@.tag \
+ -r \
+ $(DX_CLEAN_HTML) \
+ $(DX_CLEAN_CHM) \
+ $(DX_CLEAN_CHI) \
+ $(DX_CLEAN_MAN) \
+ $(DX_CLEAN_RTF) \
+ $(DX_CLEAN_XML) \
+ $(DX_CLEAN_PS) \
+ $(DX_CLEAN_PDF) \
+ $(DX_CLEAN_LATEX)
+
+endif DX_COND_doc
+
diff --git a/libMems-1.6.pc.in b/libMems-1.6.pc.in
new file mode 100644
index 0000000..b7186e0
--- /dev/null
+++ b/libMems-1.6.pc.in
@@ -0,0 +1,12 @@
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: libMems
+Description: c++ library supporting DNA sequence and genome alignment
+Version: @VERSION@
+Requires: libGenome-1.3 libMUSCLE-3.7
+Libs: -L${libdir} @OPENMP_CXXFLAGS@ -lMems-@GENERIC_API_VERSION@ @BOOST_SYSTEM_LDFLAGS@ @BOOST_SYSTEM_LIBS@ @BOOST_FILESYSTEM_LIBS@ @BOOST_PROGRAM_OPTIONS_LIBS@ @BOOST_IOSTREAMS_LIBS@ @LIBS@
+Cflags: -I${includedir}/@GENERIC_LIBRARY_NAME@-@GENERIC_API_VERSION@ @BOOST_CPPFLAGS@ @OPENMP_CXXFLAGS@ @EXTRA_CXX_FLAGS@
+
diff --git a/libMems/AbstractGappedAlignment.h b/libMems/AbstractGappedAlignment.h
new file mode 100644
index 0000000..91b42bf
--- /dev/null
+++ b/libMems/AbstractGappedAlignment.h
@@ -0,0 +1,109 @@
+/*******************************************************************************
+ * $Id: AbstractGappedAlignment.h,v 1.12 2004/04/19 23:10:50 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifndef __AbstractGappedAlignment_h__
+#define __AbstractGappedAlignment_h__
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/AbstractMatch.h"
+#include "libGenome/gnFilter.h"
+
+namespace mems {
+
+/**
+ * Mix-in layer that adds per-sequence lengths and a total alignment length
+ * (in columns) on top of an AbstractMatch implementation.
+ * SetAlignment() remains pure virtual, so this class is still abstract.
+ * @tparam AbstractMatchImpl the match implementation to derive from; it must
+ *         offer a default constructor, a constructor taking a sequence count,
+ *         and a swap() accepting a pointer to this class
+ */
+template<class AbstractMatchImpl>
+class AbstractGappedAlignment : public AbstractMatchImpl
+{
+public:
+ /** Creates an alignment with no per-sequence lengths and an alignment length of 0 */
+ AbstractGappedAlignment();
+ /**
+ * Creates an alignment over seq_count sequences
+ * @param seq_count number of sequences; every per-sequence length starts at 0
+ * @param align_length total number of alignment columns
+ */
+ AbstractGappedAlignment( uint seq_count, gnSeqI align_length );
+
+ /**
+ * Sets the alignment
+ * @param seq_align should be in row/column format, e.g. one string per sequence (row)
+ */
+ virtual void SetAlignment( const std::vector< std::string >& seq_align ) = 0;
+
+ // Inherited methods from AbstractMatch:
+ /**
+ * Returns the length of sequence seqI within this alignment, or the total
+ * alignment length in columns when seqI is UINT_MAX (the default).
+ * No bounds checking is done on seqI.
+ */
+ gnSeqI Length( uint seqI = UINT_MAX ) const;
+ /** Sets the length of sequence seqI to len. No bounds checking is done on seqI. */
+ virtual void SetLength( gnSeqI len, uint seqI ) { length[ seqI ] = len; }
+
+ /** Returns the total length of this alignment in columns */
+ gnSeqI AlignmentLength() const {return align_length;}
+ /** Sets the total length of this alignment in columns */
+ void SetAlignmentLength(gnSeqI len){ align_length = len; }
+
+protected:
+ // for use by derived classes in order to swap contents
+ void swap( AbstractGappedAlignment* other );
+private:
+ std::vector< gnSeqI > length; // length[i]: length of the i'th sequence in this alignment
+ gnSeqI align_length; // number of alignment columns
+};
+
+
+/** Default constructor: no per-sequence lengths, alignment length of zero. */
+template<class AbstractMatchImpl>
+AbstractGappedAlignment<AbstractMatchImpl>::AbstractGappedAlignment()
+	: AbstractMatchImpl(), align_length( 0 )
+{
+}
+
+/**
+ * Creates an alignment over seq_count sequences, each with length 0.
+ * @param seq_count number of sequences, forwarded to the base implementation
+ * @param align_length total number of alignment columns
+ */
+template<class AbstractMatchImpl>
+AbstractGappedAlignment<AbstractMatchImpl>::AbstractGappedAlignment( uint seq_count, gnSeqI align_length )
+	: AbstractMatchImpl( seq_count ),
+	  length( seq_count, 0 ),
+	  align_length( align_length )
+{
+}
+
+/**
+ * Exchanges the contents of this alignment with those of other: the
+ * per-sequence lengths, the alignment length, and then the base-class state.
+ */
+template<class AbstractMatchImpl>
+void AbstractGappedAlignment<AbstractMatchImpl>::swap( AbstractGappedAlignment* other )
+{
+	length.swap( other->length );
+	std::swap( align_length, other->align_length );
+	AbstractMatchImpl::swap( other );
+}
+
+/**
+ * Returns the length of sequence seqI, or the alignment length in columns
+ * when seqI is UINT_MAX. No bounds checking is done on seqI.
+ */
+template<class AbstractMatchImpl>
+gnSeqI AbstractGappedAlignment<AbstractMatchImpl>::Length( uint seqI ) const
+{
+	return ( seqI == UINT_MAX ) ? align_length : length[ seqI ];
+}
+
+//template<class AbstractGappedAlignmentImpl>
+void GetAlignment( const AbstractMatch& ga, const std::vector< genome::gnSequence* >& seq_table, std::vector<std::string>& alignment );
+
+/**
+ * Renders a match as text, one row per sequence.
+ * Every row starts out as all '-' characters; columns whose bit is set in the
+ * match's alignment matrix are then filled, left to right, with the bases read
+ * from the corresponding entry of seq_table. Rows whose left end is NO_MATCH
+ * stay all gaps. Rows in reverse orientation are passed through the DNA
+ * complement filter's ReverseFilter() before being placed.
+ * @param ga the match to render
+ * @param seq_table source sequences, indexed like the rows of ga
+ * @param alignment output; overwritten with one string per row
+ */
+//template<class AbstractGappedAlignmentImpl>
+inline
+void GetAlignment( const AbstractMatch& ga, const std::vector< genome::gnSequence* >& seq_table, std::vector<std::string>& alignment )
+{
+	std::vector< bitset_t > gap_mask;
+	ga.GetAlignment( gap_mask );
+	alignment = std::vector<std::string>( gap_mask.size() );
+	const genome::gnFilter* rc_filter = genome::gnFilter::DNAComplementFilter();
+	for( std::size_t rowI = 0; rowI < alignment.size(); ++rowI )
+	{
+		std::string& row = alignment[rowI];
+		row = std::string( gap_mask[0].size(), '-' );
+		if( ga.LeftEnd(rowI) == NO_MATCH )
+			continue;	// sequence absent from this match: leave the row as gaps
+
+		std::string bases;
+		seq_table[rowI]->ToString( bases, ga.Length(rowI), ga.LeftEnd(rowI) );
+		if( ga.Orientation(rowI) == AbstractMatch::reverse )
+			rc_filter->ReverseFilter(bases);
+
+		// consume one base for every non-gap column
+		std::size_t baseI = 0;
+		for( std::size_t colI = 0; colI < row.size(); ++colI )
+		{
+			if( !gap_mask[rowI][colI] )
+				continue;
+			row[colI] = bases[baseI++];
+		}
+	}
+}
+
+}
+
+#endif // __AbstractGappedAlignment_h__
+
diff --git a/libMems/AbstractMatch.h b/libMems/AbstractMatch.h
new file mode 100644
index 0000000..79d94d4
--- /dev/null
+++ b/libMems/AbstractMatch.h
@@ -0,0 +1,392 @@
+/*******************************************************************************
+ * $Id: AbstractMatch.h,v 1.8 2004/02/27 23:08:55 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifndef __AbstractMatch_h__
+#define __AbstractMatch_h__
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libGenome/gnClone.h"
+#include <vector>
+#include <algorithm>
+#include <boost/type_traits/remove_pointer.hpp>
+#include <boost/type_traits/add_pointer.hpp>
+#include <boost/dynamic_bitset.hpp>
+#include <libMems/SlotAllocator.h>
+#include <libMems/configuration.h>
+
+namespace mems {
+
+static const gnSeqI NO_MATCH = 0;
+
+
+#ifdef WIN32
+/** define this to force all matches to use boost allocators instead of new/delete */
+//#define _USE_BOOST_MATCH_ALLOCATOR
+//typedef boost::dynamic_bitset<unsigned, boost::pool_allocator<unsigned> > bitset_t;
+
+// slot allocator turns out to have the fastest new/free implementation for single object allocations
+#define _USE_SLOT_ALLOCATOR
+#else
+#define _USE_SLOT_ALLOCATOR
+#endif
+typedef boost::dynamic_bitset<> bitset_t;
+
+#ifdef _USE_SLOT_ALLOCATOR
+#include "libMems/SlotAllocator.h"
+#elif defined(_USE_BOOST_MATCH_ALLOCATOR)
+#include <boost/pool/pool_alloc.hpp>
+#endif
+
+/**
+ * Allocates storage for a T with the allocator selected at compile time
+ * (slot allocator, boost fast_pool_allocator, or plain new) and
+ * copy-constructs t into it.
+ * @return pointer to the new copy; release it with m_free()
+ */
+template< typename T >
+T* m_allocateAndCopy( const T& t )
+{
+#ifdef _USE_SLOT_ALLOCATOR
+	// raw storage from the per-type slot allocator, then placement-new the copy
+	T* slot = SlotAllocator<T>::GetSlotAllocator().Allocate();
+	return new(slot) T(t);
+#elif defined(_USE_BOOST_MATCH_ALLOCATOR)
+	boost::fast_pool_allocator< T > pool;
+	T* slot = boost::fast_pool_allocator< T >::allocate();
+	pool.construct(slot, t);
+	return slot;
+#else
+	return new T(t);
+#endif
+}
+
+/**
+ * Releases an object obtained from m_allocateAndCopy(), using the allocator
+ * selected at compile time.
+ */
+template< typename T >
+void m_free( T* t )
+{
+#ifdef _USE_SLOT_ALLOCATOR
+	SlotAllocator<T>::GetSlotAllocator().Free(t);
+#elif defined(_USE_BOOST_MATCH_ALLOCATOR)
+	boost::fast_pool_allocator< T > pool;
+	pool.destroy(t);
+	boost::fast_pool_allocator< T >::deallocate(t);
+#else
+	delete t;
+#endif
+}
+
+/**
+ * AbstractMatch is a pure virtual base class that defines an interface for
+ * both gapped and ungapped alignments among several sequences or several regions
+ * of the same sequence.
+ * A left-end coordinate equal to NO_MATCH (0) is used for sequences in which
+ * the match is not defined.
+ */
+class AbstractMatch : public genome::gnClone {
+public:
+
+ enum orientation {
+ forward, /**< the alignment is on the forward strand */
+ reverse, /**< alignment on the reverse strand */
+ undefined /**< there is no alignment on either strand */
+ };
+
+ /** creates a copy of this using a boost::pool::fast_pool_allocator */
+ virtual AbstractMatch* Copy() const = 0;
+
+ /** frees storage used by this object in a boost::pool::fast_pool_allocator */
+ virtual void Free() = 0;
+
+ /** Returns the length of this match in sequence seqI */
+ virtual gnSeqI Length( uint seqI ) const = 0;
+
+ /** Sets the length of this match in sequence seqI to len */
+ virtual void SetLength( gnSeqI len, uint seqI ) = 0;
+
+ /** Deprecated: use LeftEnd() and Orientation() instead.
+ * Returns the (signed) start coordinate of this match in sequence startI.
+ * NOTE(review): callers treat negative values as reverse-strand matches; confirm in the implementations. */
+ virtual int64 Start(uint startI) const = 0;
+
+ /** Deprecated: use SetLeftEnd() and SetOrientation instead
+ * Sets the start of this match in sequence seqI to start */
+ virtual void SetStart(uint seqI, int64 start) = 0;
+
+ /** Deprecated: use LeftEnd() instead
+ * Returns the start coordinate of this match in sequence seqI */
+ int64 operator[](uint seqI) const{return Start(seqI);} // this is a synonym for Start()
+
+ /** Deprecated: use RightEnd() instead
+ * Returns the last coordinate of this match in sequence seqI:
+ * Start(seqI) + Length(seqI) - 1 when Start(seqI) is positive, otherwise Start(seqI) itself */
+ virtual int64 End(uint seqI) const;
+
+ /** Returns the left end coordinate of this match at the seqI'th matching position/sequence */
+ virtual gnSeqI LeftEnd(uint seqI) const = 0;
+
+ /** Returns the right-end coordinate of this match at the seqI'th matching position/sequence
+ (equal to LeftEnd(seqI) + Length(seqI) - 1) */
+ virtual gnSeqI RightEnd(uint seqI) const{ return LeftEnd(seqI) + Length( seqI ) - 1; };
+
+ /** Returns the orientation of this match at the seqI'th matching position/sequence,
+ * either AbstractMatch::forward or AbstractMatch::reverse
+ */
+ virtual orientation Orientation(uint seqI) const = 0;
+
+ /** sets the left end coordinate of this match in the seqI'th matching position/sequence */
+ virtual void SetLeftEnd(uint seqI, gnSeqI start) = 0;
+
+ /** sets the relative orientation of this match in the seqI'th matching position/sequence */
+ virtual void SetOrientation(uint seqI, orientation o) = 0;
+
+ /** Shift the left-end coordinates in forward oriented positions by a given amount */
+ virtual void MoveStart(int64 move_amount) = 0;
+ /** Shift the left-end coordinates in reverse oriented positions by a given amount */
+ virtual void MoveEnd(int64 move_amount) = 0;
+
+ /** Returns the multiplicity of the match. e.g. the number of sequences this match occurs in */
+ virtual uint Multiplicity() const = 0;
+
+ /** Returns the number of sequences in the alignment which contains this match */
+ virtual uint SeqCount() const = 0;
+
+ /** Returns the index of the first sequence this match occurs in */
+ virtual uint FirstStart() const = 0;
+
+ /** Returns the total length of this alignment in columns */
+ virtual gnSeqI AlignmentLength() const = 0;
+
+ /** Inverts the coordinates of this match */
+ virtual void Invert() = 0;
+
+ //warning: none of the following do bounds checking.
+ /**
+ * Deprecated: Use CropLeft and CropRight instead
+ * Removes the first <code>crop_amount</code> base pairs from the beginning of the match.
+ */
+ virtual void CropStart(gnSeqI crop_amount) = 0;
+ /**
+ * Deprecated: Use CropLeft and CropRight instead
+ * Removes the last <code>crop_amount</code> base pairs from the end of the match.
+ */
+ virtual void CropEnd(gnSeqI crop_amount) = 0;
+
+ /**
+ * Crop this match from the left
+ * Removes the first <code>crop_amount</code> positions from the left side of the match.
+ * NOTE(review): presumably crop_amount is measured in sequence seqI; confirm in the implementations.
+ */
+ virtual void CropLeft(gnSeqI crop_amount, uint seqI) = 0;
+ /**
+ * Crop this match from the right
+ * Removes the last <code>crop_amount</code> positions from the right side of the match.
+ * NOTE(review): presumably crop_amount is measured in sequence seqI; confirm in the implementations.
+ */
+ virtual void CropRight(gnSeqI crop_amount, uint seqI) = 0;
+
+// virtual AbstractMatch* Split( gnSeqI before_column ) = 0;
+
+ /**
+ * Gets a copy of the alignment as an array of dynamic_bitsets
+ */
+ virtual void GetAlignment( std::vector< bitset_t >& align_matrix ) const = 0;
+
+ /** Given an alignment column index, this function returns the corresponding sequence coordinates
+ * and whether each sequence is aligned in that column
+ * If a given sequence is not represented in the requested column, the position returned
+ * in pos should be that of the first nucleotide to the left of the requested column. If no
+ * nucleotides exist to the left of the requested column, then a NO_MATCH is returned in pos
+ * for that sequence.
+ */
+ virtual void GetColumn( gnSeqI col, std::vector<gnSeqI>& pos, std::vector<bool>& column ) const = 0;
+
+// gnSeqI SeqPosToColumn( uint seq, int64 pos) const = 0;
+ /** returns true if the given row,column of the alignment has a gap character */
+ virtual bool IsGap( uint seq, gnSeqI col ) const = 0;
+ /** Returns the id of the i-th defined sequence in this match */
+ virtual uint UsedSeq( uint seqI ) const = 0;
+};
+
+/**
+ * Returns the last coordinate of this match in sequence endI: the start plus
+ * the length minus one when the start is positive, otherwise the start itself.
+ */
+inline
+int64 AbstractMatch::End(uint endI) const
+{
+	if( Start(endI) <= 0 )
+		return Start(endI);
+	return Start(endI) + Length(endI) - 1;
+}
+
+
+/**
+ * Ordering functor for match objects (passed by reference).
+ * Starting at sequence m_seq, it compares the start coordinates of the two
+ * matches sequence by sequence (left end for forward matches, right end
+ * otherwise), skipping sequences where either coordinate is NO_MATCH or where
+ * both are equal; the first differing sequence decides the order.
+ */
+template< typename MatchType >
+class AbstractMatchStartComparator {
+public:
+	/** @param seq index of the first sequence to compare on */
+	AbstractMatchStartComparator( unsigned seq = 0 ) : m_seq( seq ) {}
+	AbstractMatchStartComparator( const AbstractMatchStartComparator& msc ) : m_seq( msc.m_seq ) {}
+	AbstractMatchStartComparator<MatchType>& operator=( const AbstractMatchStartComparator<MatchType>& msc )
+	{
+		m_seq = msc.m_seq;
+		// flowing off the end of a value-returning function is undefined behavior
+		return *this;
+	}
+	// TODO?? make this do a wraparound comparison if all is equal?
+	/** @return true if a orders before b */
+	boolean operator()(const MatchType& a, const MatchType& b) const{
+		// NOTE(review): both operands use a.FirstStart(), so start_diff is always 0
+		// and FirstStart() never influences the order. The second operand was
+		// probably meant to be b.FirstStart(); left unchanged because existing
+		// callers may depend on the current ordering.
+		int start_diff = std::max( a.FirstStart(), m_seq ) - std::max( a.FirstStart(), m_seq );
+		if(start_diff == 0){
+			// only compare sequences that exist in both matches
+			uint m_count = a.SeqCount();
+			m_count = m_count <= b.SeqCount() ? m_count : b.SeqCount();
+			for(uint seqI = m_seq; seqI < m_count; seqI++){
+				gnSeqI a_start = a.Orientation(seqI) == AbstractMatch::forward ? a.LeftEnd( seqI ) : a.RightEnd( seqI );
+				gnSeqI b_start = b.Orientation(seqI) == AbstractMatch::forward ? b.LeftEnd( seqI ) : b.RightEnd( seqI );
+				if(a_start == NO_MATCH || b_start == NO_MATCH)
+					continue;
+				else if(a_start == b_start)
+					continue;
+				else
+					return a_start < b_start;
+			}
+		}
+		return start_diff < 0;
+	}
+private:
+	unsigned m_seq;	// first sequence index used for comparison
+};
+
+/**
+ * Ordering functor for match objects (passed by reference) that compares
+ * left-end coordinates in a single sequence, m_seq.
+ */
+template< typename MatchType >
+class AbstractMatchSingleStartComparator {
+public:
+	/** @param seq index of the sequence to compare on */
+	AbstractMatchSingleStartComparator( unsigned seq = 0 ) : m_seq( seq ) {}
+	AbstractMatchSingleStartComparator( const AbstractMatchSingleStartComparator& msc ) : m_seq( msc.m_seq ) {}
+	AbstractMatchSingleStartComparator<MatchType>& operator=( const AbstractMatchSingleStartComparator<MatchType>& msc )
+	{
+		m_seq = msc.m_seq;
+		// flowing off the end of a value-returning function is undefined behavior
+		return *this;
+	}
+	/**
+	 * Compare on only one sequence. Undefined matches are less than defined matches
+	 */
+	boolean operator()(const MatchType& a, const MatchType& b) const{
+		int64 a_start = a.LeftEnd( m_seq ), b_start = b.LeftEnd( m_seq );
+		if( a_start == NO_MATCH || b_start == NO_MATCH ){
+			// a orders first only when a is undefined and b is defined
+			return b_start != NO_MATCH;
+		}
+
+		return a_start < b_start;
+	}
+private:
+	unsigned m_seq;	// sequence index used for comparison
+};
+
+
+
+/**
+ * Ordering functor for pointers to match objects.
+ * Starting at sequence m_seq, it compares the start coordinates of the two
+ * matches sequence by sequence (left end for forward matches, right end
+ * otherwise), skipping sequences where either coordinate is NO_MATCH or where
+ * both are equal; the first differing sequence decides the order.
+ */
+template< typename MatchType >
+class MatchStartComparator {
+public:
+	/** @param seq index of the first sequence to compare on */
+	MatchStartComparator( unsigned seq = 0 ) : m_seq( seq ) {}
+	MatchStartComparator( const MatchStartComparator& msc ) : m_seq( msc.m_seq ) {}
+	MatchStartComparator<MatchType>& operator=( const MatchStartComparator<MatchType>& msc )
+	{
+		m_seq = msc.m_seq;
+		// flowing off the end of a value-returning function is undefined behavior
+		return *this;
+	}
+	// TODO?? make this do a wraparound comparison if all is equal?
+	/** @return true if *a orders before *b */
+	boolean operator()(const MatchType* a, const MatchType* b) const{
+		// NOTE(review): both operands use a->FirstStart(), so start_diff is always 0
+		// and FirstStart() never influences the order. The second operand was
+		// probably meant to be b->FirstStart(); left unchanged because existing
+		// callers may depend on the current ordering.
+		int start_diff = std::max( a->FirstStart(), m_seq ) - std::max( a->FirstStart(), m_seq );
+		if(start_diff == 0){
+			// only compare sequences that exist in both matches
+			uint m_count = a->SeqCount();
+			m_count = m_count <= b->SeqCount() ? m_count : b->SeqCount();
+			for(uint seqI = m_seq; seqI < m_count; seqI++){
+				gnSeqI a_start = a->Orientation(seqI) == AbstractMatch::forward ? a->LeftEnd( seqI ) : a->RightEnd( seqI );
+				gnSeqI b_start = b->Orientation(seqI) == AbstractMatch::forward ? b->LeftEnd( seqI ) : b->RightEnd( seqI );
+				if(a_start == NO_MATCH || b_start == NO_MATCH)
+					continue;
+				else if(a_start == b_start)
+					continue;
+				else
+					return a_start < b_start;
+			}
+		}
+		return start_diff < 0;
+	}
+private:
+	unsigned m_seq;	// first sequence index used for comparison
+};
+
+/**
+ * Ordering functor for pointers to match objects that compares left-end
+ * coordinates in a single sequence, m_seq.
+ */
+template< typename MatchType >
+class SingleStartComparator {
+public:
+	/** @param seq index of the sequence to compare on */
+	SingleStartComparator( unsigned seq = 0 ) : m_seq( seq ) {}
+	SingleStartComparator( const SingleStartComparator& msc ) : m_seq( msc.m_seq ) {}
+	SingleStartComparator<MatchType>& operator=( const SingleStartComparator<MatchType>& msc )
+	{
+		m_seq = msc.m_seq;
+		// flowing off the end of a value-returning function is undefined behavior
+		return *this;
+	}
+	/**
+	 * Compare on only one sequence. Undefined matches are less than defined matches
+	 */
+	boolean operator()(const MatchType* a, const MatchType* b) const{
+		int64 a_start = a->LeftEnd( m_seq ), b_start = b->LeftEnd( m_seq );
+		if( a_start == NO_MATCH || b_start == NO_MATCH ){
+			// a orders first only when a is undefined and b is defined
+			return b_start != NO_MATCH;
+		}
+
+		return a_start < b_start;
+	}
+private:
+	unsigned m_seq;	// sequence index used for comparison
+};
+
+
+/**
+ * Single-sequence start comparator usable with either match objects or
+ * pointers to match objects: MatchType may be a pointer or a value type, and
+ * both call forms compare left-end coordinates in sequence m_seq.
+ */
+template< typename MatchType >
+class SSC {
+public:
+	/** @param seq index of the sequence to compare on */
+	SSC( unsigned seq = 0 ) : m_seq( seq ) {}
+	SSC( const SSC<MatchType>& msc ) : m_seq( msc.m_seq ) {}
+	SSC<MatchType>& operator=( const SSC<MatchType>& msc )
+	{
+		m_seq = msc.m_seq;
+		// flowing off the end of a value-returning function is undefined behavior
+		return *this;
+	}
+	/** Pointer form: dereferences both arguments and defers to the reference form */
+	boolean operator()( const typename boost::add_pointer<MatchType>::type& a,
+			const typename boost::add_pointer<MatchType>::type& b) const
+	{
+		return operator()(*a,*b);
+	}
+	/**
+	 * Compare on only one sequence. Undefined matches are less than defined matches
+	 */
+	boolean operator()(const typename boost::remove_pointer<MatchType>::type& a,
+			const typename boost::remove_pointer<MatchType>::type& b) const{
+		int64 a_start = a.LeftEnd( m_seq ), b_start = b.LeftEnd( m_seq );
+		if( a_start == NO_MATCH || b_start == NO_MATCH ){
+			// a orders first only when a is undefined and b is defined
+			return b_start != NO_MATCH;
+		}
+
+		return a_start < b_start;
+	}
+private:
+	unsigned m_seq;	// sequence index used for comparison
+};
+
+}
+
+#endif // __AbstractMatch_h__
diff --git a/libMems/Aligner.cpp b/libMems/Aligner.cpp
new file mode 100644
index 0000000..2fa6ee2
--- /dev/null
+++ b/libMems/Aligner.cpp
@@ -0,0 +1,2289 @@
+/*******************************************************************************
+ * $Id: Aligner.cpp,v 1.47 2004/04/19 23:10:30 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#include "libMems/Aligner.h"
+#include "libMems/Islands.h"
+#include "libMems/DNAFileSML.h"
+#include "libMems/MuscleInterface.h" // it's the default gapped aligner
+#include "libGenome/gnRAWSource.h"
+#include "libMems/DistanceMatrix.h"
+#include "libMems/Files.h"
+
+#include <map>
+#include <fstream> // for debugging
+#include <sstream>
+#include <stack>
+#include <algorithm>
+#include <limits>
+
+using namespace std;
+using namespace genome;
+namespace mems {
+
+
+boolean validateLCB( MatchList& lcb );
+void validateRangeIntersections( vector< MatchList >& lcb_list );
+bool debug_shite = false;
+
+/**
+ * Test code to ensure that an individual LCB is truly collinear.
+ * For every sequence, walks the matches in list order and checks that the
+ * Start() coordinates of the matches defined in that sequence never decrease.
+ * @return true if no sequence shows a decrease (an empty list is collinear)
+ */
+boolean validateLCB( MatchList& lcb ){
+	if( lcb.size() == 0 )
+		return true;
+
+	const uint seq_count = (*lcb.begin())->SeqCount();
+	boolean collinear = true;
+	for( uint seqI = 0; seqI < seq_count; seqI++ ){
+		int64 prev_coord = 0;	// 0 means no defined match has been seen yet in this sequence
+		for( vector< Match* >::iterator cur = lcb.begin(); cur != lcb.end(); cur++ ){
+			const int64 coord = (*cur)->Start( seqI );
+			if( coord == NO_MATCH )
+				continue;
+			if( prev_coord != 0 && coord < prev_coord )
+				collinear = false;
+			prev_coord = coord;
+		}
+	}
+	return collinear;
+}
+
+/**
+ * Delete overlapping regions in favor of the larger match.
+ * This code isn't perfect, it can delete too many base pairs in some cases
+ *
+ * For each sequence in turn the list is sorted on that sequence's left end and
+ * neighbouring matches are compared. Where two matches overlap, the overlapping
+ * bases are removed from the "smaller" one (lower multiplicity, or equal
+ * multiplicity and not longer). A copy of the removed part, with its start in
+ * the current sequence set to 0, is kept as a new match if it still has
+ * multiplicity > 1 and non-zero length. The list is modified in place.
+ * @param ml the match list to process; lists with fewer than two matches are left untouched
+ */
+void EliminateOverlaps( MatchList& ml ){
+ if( ml.size() < 2 )
+ return;
+ vector< Match* > result_matches;
+ uint seq_count = ml[0]->SeqCount();
+ for( uint seqI = 0; seqI < seq_count; seqI++ ){
+ // order by left end in seqI; matches undefined in seqI sort first
+ SingleStartComparator<AbstractMatch> msc( seqI );
+ sort( ml.begin(), ml.end(), msc );
+ int64 matchI = 0;
+ int64 nextI = 0;
+ int64 deleted_count = 0; // number of list slots set to NULL during this pass
+ vector< Match* > new_matches; // remnants created during this pass, appended afterwards
+
+ // scan forward to first defined match
+ for(; matchI != ml.size(); matchI++ )
+ if( ml[ matchI ]->Start( seqI ) != NO_MATCH )
+ break;
+
+ for(; matchI < ml.size(); matchI++ ){
+ // slots of matches deleted earlier in this pass are NULL
+ if( ml[ matchI ] == NULL )
+ continue;
+
+ for( nextI = matchI + 1; nextI < ml.size(); nextI++ ){
+ if( ml[ nextI ] == NULL )
+ continue;
+
+ boolean deleted_matchI = false;
+ // check for overlaps
+ int64 startI = ml[ matchI ]->Start( seqI );
+ int64 lenI = ml[ matchI ]->Length();
+ int64 startJ = ml[ nextI ]->Start( seqI );
+// int64 diff = absolut( startJ ) - absolut( startI ) - lenI;
+ // gap between the end of matchI and the start of nextI; negative means overlap
+ int64 diff = absolut( startJ ) - absolut( startI ) - lenI;
+
+ if( diff < 0 ){
+ // from here on diff is the (positive) number of overlapping bases
+ diff = -diff;
+ Match* new_match;
+ // delete bases from the smaller match
+// if( ml[ nextI ]->Length() * ml[ nextI ]->Multiplicity() >=
+// lenI * ml[ matchI ]->Multiplicity() ){
+ if( ( ml[ nextI ]->Multiplicity() > ml[ matchI ]->Multiplicity() ) ||
+ ( ml[ nextI ]->Multiplicity() == ml[ matchI ]->Multiplicity() && ml[ nextI ]->Length() > ml[ matchI ]->Length() ) ){
+ // ml[ matchI ] is the smaller match
+ new_match = ml[matchI]->Copy();
+ // erase base pairs from new_match
+ if( diff >= lenI ){
+ // matchI is entirely covered by the overlap: drop it from the list
+// cerr << "Deleting " << **mem_iter << " at the hands of\n" << **next_iter << endl;
+ ml[ matchI ]->Free();
+ ml[ matchI ] = NULL;
+ // step back one slot; after the break below the outer loop's increment
+ // lands on the now-NULL slot, which is then skipped
+ matchI--;
+ deleted_matchI = true;
+ deleted_count++;
+ }else{
+ // trim the overlap off matchI and keep only the overlap in new_match;
+ // which end is cropped depends on the sign of the start coordinate
+ if( startI > 0 ){
+ ml[ matchI ]->CropEnd( diff );
+ new_match->CropStart( new_match->Length() - diff );
+ }else{
+ ml[ matchI ]->CropStart( diff );
+ new_match->CropEnd( new_match->Length() - diff );
+ }
+ }
+ }else{
+ // ml[ nextI ] is the smaller match
+ new_match = ml[nextI]->Copy();
+ // erase base pairs from new_match
+ if( diff >= ml[ nextI ]->Length() ){
+ // nextI is entirely covered by the overlap: drop it from the list
+// cerr << "Deleting " << **next_iter << " at the hands of\n" << **mem_iter << endl;
+ ml[ nextI ]->Free();
+ ml[ nextI ] = NULL;
+ deleted_count++;
+ }else{
+ // trim the overlap off nextI and keep only the overlap in new_match
+ if( startJ > 0 ){
+ ml[ nextI ]->CropStart( diff );
+ new_match->CropEnd( new_match->Length() - diff );
+ }else{
+ ml[ nextI ]->CropEnd( diff );
+ new_match->CropStart( new_match->Length() - diff );
+ }
+ }
+
+ }
+ // the remnant no longer takes part in seqI; keep it only if it still
+ // spans more than one sequence and has bases left
+ new_match->SetStart( seqI, 0 );
+ if( new_match->Multiplicity() > 1 && new_match->Length() > 0 )
+ new_matches.push_back( new_match );
+ else
+ {
+ new_match->Free();
+ new_match = NULL;
+ }
+ if( deleted_matchI )
+ break;
+ }else
+ break; // there are no more overlaps
+ }
+// if( nextI > 1 )
+// cerr << "There were " << nextI << " overlaps\n";
+// if( nextI > config_value_2 )
+// __asm(nop);
+ }
+
+ // compact the list: drop the NULL slots left by deleted matches
+ if( deleted_count > 0 ){
+ result_matches.reserve( ml.size() - deleted_count );
+ for( int64 copyI = 0; copyI < ml.size(); copyI++ ){
+ if( ml[ copyI ] != NULL )
+ result_matches.push_back( ml[ copyI ] );
+ }
+ ml.clear();
+ ml.insert( ml.end(), result_matches.begin(), result_matches.end() );
+ }
+ // remnants join the list and are considered when the next sequence is processed
+ ml.insert( ml.end(), new_matches.begin(), new_matches.end() );
+ result_matches.clear();
+ new_matches.clear();
+ }
+
+}
+
+
+// default for min_recursive_gap_length, used by the Aligner( uint ) constructor
+const gnSeqI default_min_r_gap_size = 200;
+/**
+ * Creates an aligner for seq_count sequences with default settings: debugging
+ * off, collinear_genomes off, the MUSCLE interface as gapped aligner,
+ * permutation_weight and cur_min_coverage set to -1, and max_extension_iters
+ * set to 4.
+ * NOTE(review): LCB_minimum_density and LCB_minimum_range are not initialized
+ * here although the copy constructor and operator= copy them; confirm they are
+ * always set before being read.
+ */
+Aligner::Aligner( uint seq_count ) :
+debug(false),
+seq_count(seq_count),
+min_recursive_gap_length(default_min_r_gap_size),
+collinear_genomes(false),
+gal(&(MuscleInterface::getMuscleInterface())),
+permutation_weight(-1),
+cur_min_coverage(-1),
+max_extension_iters(4)
+{}
+
+/**
+ * Copy constructor: copies every setting of al, including the gapped aligner
+ * pointer (the aligner object itself is shared, not cloned).
+ * NOTE(review): gap_mh is deliberately not copied here (its initializer is
+ * commented out), whereas operator= does assign it; confirm which is intended.
+ */
+Aligner::Aligner( const Aligner& al ) :
+//gap_mh( al.gap_mh ),
+nway_mh( al.nway_mh ),
+seq_count( al.seq_count ),
+debug( al.debug),
+LCB_minimum_density( al.LCB_minimum_density),
+LCB_minimum_range( al.LCB_minimum_range ),
+cur_min_coverage( al.cur_min_coverage),
+min_recursive_gap_length( al.min_recursive_gap_length ),
+collinear_genomes( al.collinear_genomes ),
+gal( al.gal ),
+permutation_weight( al.permutation_weight ),
+permutation_filename( al.permutation_filename ),
+max_extension_iters( al.max_extension_iters )
+{}
+
+/**
+ * Assigns every setting of al to this aligner, member by member. The gapped
+ * aligner is shared through its pointer rather than cloned.
+ * @return a reference to this aligner
+ */
+Aligner& Aligner::operator=( const Aligner& al )
+{
+	// match-finding state
+	this->gap_mh = al.gap_mh;
+	this->nway_mh = al.nway_mh;
+	this->seq_count = al.seq_count;
+	this->debug = al.debug;
+
+	// LCB filtering thresholds
+	this->LCB_minimum_density = al.LCB_minimum_density;
+	this->LCB_minimum_range = al.LCB_minimum_range;
+
+	// recursion and coverage settings
+	this->cur_min_coverage = al.cur_min_coverage;
+	this->min_recursive_gap_length = al.min_recursive_gap_length;
+	this->collinear_genomes = al.collinear_genomes;
+
+	// gapped aligner (pointer copy only)
+	this->gal = al.gal;
+
+	// permutation output
+	this->permutation_weight = al.permutation_weight;
+	this->permutation_filename = al.permutation_filename;
+
+	this->max_extension_iters = al.max_extension_iters;
+
+	return *this;
+}
+
+/** Stores min_r_gap as the new min_recursive_gap_length. */
+void Aligner::SetMinRecursionGapLength( gnSeqI min_r_gap )
+{
+	this->min_recursive_gap_length = min_r_gap;
+}
+
+/**
+ * Selects the gapped aligner to use. Only the address of gal is stored, so
+ * the referenced object must outlive this Aligner.
+ */
+void Aligner::SetGappedAligner( GappedAligner& gal )
+{
+	GappedAligner* const chosen = &gal;
+	this->gal = chosen;
+}
+
+/** Forwards len to SetMaxAlignmentLength() of the current gapped aligner. */
+void Aligner::SetMaxGappedAlignmentLength( gnSeqI len )
+{
+	this->gal->SetMaxAlignmentLength( len );
+}
+
+
+/**
+ * Extends start_label across a run of consecutive labels that are all
+ * contained in no_match_labels. Nothing is returned; the result is written
+ * back to start_label.
+ *
+ * forward:  start_label becomes the last label of the run start_label + 1,
+ *           start_label + 2, ... found in the set (unchanged when
+ *           start_label + 1 is not in the set).
+ * backward: start_label becomes the smallest label L such that every label in
+ *           [L, start_label - 1] is in the set. This includes L == 0 when the
+ *           run reaches the first label; previously that case fell out of the
+ *           loop and left start_label unchanged.
+ *
+ * @param no_match_labels set of labels to scan through
+ * @param start_label in: label to start from; out: end of the run
+ * @param forward true to scan towards larger labels, false towards smaller ones
+ */
+void scanLabels( set< uint >& no_match_labels, uint& start_label, boolean forward ){
+	if( forward ){
+		uint labelI = start_label + 1;
+		while( no_match_labels.find( labelI ) != no_match_labels.end() )
+			++labelI;
+		start_label = labelI - 1;
+	}else{
+		uint labelI = start_label;
+		while( labelI > 0 && no_match_labels.find( labelI - 1 ) != no_match_labels.end() )
+			--labelI;
+		start_label = labelI;
+	}
+}
+
+/**
+ * Tests whether m1 can precede m2 collinearly.  For every sequence in which
+ * both matches are defined, the two start coordinates must have the same sign
+ * and m1's start must not exceed m2's start.  Sequences in which either match
+ * is undefined (NO_MATCH) are ignored.
+ *
+ * @return true when no sequence violates the condition, false otherwise
+ */
+boolean checkCollinearity( Match* m1, Match* m2 ){
+    for( uint seqI = 0; seqI < m1->SeqCount(); seqI++ ){
+        const int64 first_start = m1->Start( seqI );
+        const int64 second_start = m2->Start( seqI );
+
+        // undefined in one of the matches: nothing to compare
+        if( first_start == NO_MATCH || second_start == NO_MATCH )
+            continue;
+
+        const bool both_forward = first_start > 0 && second_start > 0;
+        const bool both_reverse = first_start < 0 && second_start < 0;
+        if( !( both_forward || both_reverse ) )
+            return false;   // opposite orientations
+        if( first_start > second_start )
+            return false;   // wrong order
+    }
+    return true;
+}
+
+void scanFit( list< LabeledMem >& pair_list, list< LabeledMem >::iterator& list_iter, Match* new_match, uint sort_seq ){
+ /* Picks, among a window of pair_list entries beginning at list_iter, the entry that new_match
+  * fits best after, judged by the sequences that follow sort_seq.  On return list_iter refers
+  * to the first entry with the highest positive score; it is left unchanged when no entry
+  * scores above zero.  pair_list itself is not modified here.
+  */
+ list< LabeledMem >::iterator cur_iter = list_iter;
+ list< LabeledMem >::iterator last_iter = list_iter;
+// int64 initial_start = absolut( list_iter->mem->Start( sort_seq ) );
+ int64 initial_start = absolut( list_iter->mem->Start( sort_seq ) );
+
+ uint match_count = 0; // number of entries in the window [list_iter, last_iter)
+ for(; last_iter != pair_list.end(); ++last_iter ){
+ if( last_iter->mem->Start( sort_seq ) == NO_MATCH ){ // entries undefined in sort_seq stay in the window
+ ++match_count;
+ continue;
+ }
+// if( absolut( last_iter->mem->Start( sort_seq ) ) < initial_start ||
+// absolut( last_iter->mem->Start( sort_seq ) ) > new_match->Start( sort_seq ) )
+ if( absolut( last_iter->mem->Start( sort_seq ) ) < initial_start ||
+ absolut( last_iter->mem->Start( sort_seq ) ) > new_match->Start( sort_seq ) ) // NOTE(review): compared against the signed start of new_match -- confirm
+ break; // window ends at the first entry outside [initial_start, new_match start]
+ ++match_count;
+ }
+ vector< vector< int > > score_vector; // score_vector[ seq - sort_seq - 1 ][ window position ]
+ score_vector.reserve( new_match->SeqCount() - sort_seq - 1 );
+ for( uint seqI = sort_seq + 1; seqI < new_match->SeqCount(); ++seqI ){
+ vector< int > sv;
+ score_vector.push_back( sv );
+ score_vector[ score_vector.size() - 1 ].reserve( match_count );
+ }
+ uint matchI = 0;
+ for(; cur_iter != last_iter; ++cur_iter ){
+
+ for( uint seqI = sort_seq + 1; seqI < new_match->SeqCount(); ++seqI ){
+ int64 p_start = cur_iter->mem->Start( seqI );
+ int64 m_start = new_match->Start( seqI );
+ p_start = p_start < 0 ? -p_start : p_start; // compare absolute coordinates
+ m_start = m_start < 0 ? -m_start : m_start;
+ if( m_start == NO_MATCH ){
+ score_vector[ seqI - sort_seq - 1 ].push_back( 0 ); // new_match undefined here: neutral
+ }else if( p_start == NO_MATCH ){
+ score_vector[ seqI - sort_seq - 1 ].push_back( 0 ); // window entry undefined here: neutral
+ }else if( p_start < m_start ){
+ score_vector[ seqI - sort_seq - 1 ].push_back( 1 ); // entry lies before new_match
+ }else
+ score_vector[ seqI - sort_seq - 1 ].push_back( -1 ); // entry lies at or after new_match
+ }
+ }
+ vector< int > scores; // one accumulated score per window entry
+ scores.reserve( match_count );
+ for( matchI = match_count; matchI > 0; matchI-- )
+ scores.push_back( 0 );
+ for( uint seqI = 0; seqI < new_match->SeqCount() - sort_seq - 1; ++seqI ){
+ boolean redefined = false;
+ for( matchI = match_count; matchI > 0; matchI-- ){ // walk the window from its end towards its start
+ if( !redefined ){
+ if( score_vector[ seqI ][ matchI - 1 ] >= 0 ){
+ if( score_vector[ seqI ][ matchI - 1 ] == 1 )
+ redefined = true; // after a 1, stop crediting entries until a -1 is seen
+ ++scores[ matchI - 1 ];
+ }
+ }else{
+ if( score_vector[ seqI ][ matchI - 1 ] == -1 )
+ redefined = false;
+ }
+ }
+ }
+ // find the first highest scoring match
+ cur_iter = list_iter;
+ int max_score = 0;
+ for( matchI = 0; matchI < match_count; ++matchI ){
+ if( scores[ matchI ] > max_score ){ // strict comparison keeps the earliest maximum
+ max_score = scores[ matchI ];
+ list_iter = cur_iter;
+ }
+ ++cur_iter;
+ }
+}
+
+/**
+ * Aaron's subset LCB algorithm.
+ *
+ * Reorders mlist and fills breakpoints with the labels (indices into the
+ * reordered mlist) at which a block ends.  Matches may be undefined
+ * (NO_MATCH) in some sequences.  The function works in two phases:
+ *  1. matches defined in sequence 0 are ordered on sequence 0, then matches
+ *     undefined there are spliced into that order one sequence at a time;
+ *  2. every match receives a label equal to its position, and for each
+ *     further sequence the labels are scanned in that sequence's order to
+ *     find the places where consecutive matches are not adjacent in label order.
+ * Note that matches in mlist may be inverted (Invert()) as a side effect.
+ */
+void AaronsLCB( MatchList& mlist, set<uint>& breakpoints ){
+ breakpoints.clear(); // make sure this is empty
+ if( mlist.size() == 0 )
+ return;
+ // can only look for breakpoints if there is more than one match!!
+ if( mlist.size() == 1 ){
+ breakpoints.insert( 0 ); // the single match forms one block ending at label 0
+ return;
+ }
+ uint seq_count = mlist[0]->SeqCount();
+
+ SingleStartComparator<AbstractMatch> msc( 0 );
+ sort( mlist.begin(), mlist.end(), msc );
+ vector<Match*>::iterator mem_iter = mlist.begin();
+ list<LabeledMem> pair_list; // the growing global ordering of matches
+
+ map<uint, Match*> debug_label_map; // only filled when debug_labels is enabled below
+ boolean debugging = false;
+
+
+ list< PlacementMatch > placement_list; // every match, paired with its position in pair_list (end() = not placed yet)
+
+ for(; mem_iter != mlist.end(); ++mem_iter ){
+ if( (*mem_iter)->Start( 0 ) != NO_MATCH ){
+ // add this one to the list.
+ LabeledMem lm;
+ lm.mem = *mem_iter;
+ lm.label = 0;
+ pair_list.push_back( lm );
+ }else{
+ PlacementMatch pm;
+ pm.mem = *mem_iter;
+ pm.iter = pair_list.end(); // marks the match as not yet placed
+ placement_list.push_back( pm );
+ }
+ }
+ LabeledMemComparator lmc( 0 );
+ pair_list.sort( lmc );
+ list< LabeledMem >::iterator pair_iter = pair_list.begin();
+ for(; pair_iter != pair_list.end(); ++pair_iter ){ // register the already-placed matches too
+ PlacementMatch pm;
+ pm.mem = pair_iter->mem;
+ pm.iter = pair_iter;
+ placement_list.push_back( pm );
+ }
+
+ // place all the subset matches from each sequence in the correct place in pair_list.
+ for( uint seqI = 1; seqI < seq_count; ++seqI ){
+ PlacementMatchComparator pmc( seqI );
+ placement_list.sort( pmc );
+ list< PlacementMatch >::iterator placement_prev;
+ list< PlacementMatch >::iterator placement_iter = placement_list.begin();
+ if( placement_iter->iter == pair_list.end() &&
+ placement_iter->mem->Start( seqI ) != NO_MATCH ){ // first element has no predecessor: put it at the front
+ LabeledMem lm;
+ lm.mem = placement_iter->mem;
+ lm.label = 0;
+ pair_list.insert( pair_list.begin(), lm );
+ placement_iter->iter = pair_list.begin();
+ }
+
+ for( ++placement_iter; placement_iter != placement_list.end(); ++placement_iter ){
+ placement_prev = placement_iter;
+ placement_prev--;
+
+ if( placement_iter->iter != pair_list.end() ) // already placed
+ continue;
+
+ if( placement_iter->mem->Start( seqI ) == NO_MATCH ) // cannot be placed using this sequence
+ continue;
+
+ list< LabeledMem >::iterator insert_iter = placement_prev->iter;
+ if( insert_iter == pair_list.end() || placement_prev->mem->Start( seqI ) == NO_MATCH )
+ insert_iter = pair_list.begin(); // predecessor gives no position: insert at the front
+ else{
+ if( insert_iter->mem->Start( seqI ) < 0 ){
+ // invert if necessary and insert before
+ if( placement_iter->mem->Start( seqI ) > 0 )
+ placement_iter->mem->Invert();
+ if( !checkCollinearity( placement_iter->mem, insert_iter->mem ) ){
+ placement_iter->mem->Invert(); // not collinear: flip again and fall back to scanFit
+ scanFit( pair_list, insert_iter, placement_iter->mem, seqI );
+ ++insert_iter; // insert after the entry chosen by scanFit
+ }
+ }else{
+ // insert in the earliest place this match fits with surrounding matches
+ scanFit( pair_list, insert_iter, placement_iter->mem, seqI );
+ ++insert_iter;
+ }
+ }
+
+ LabeledMem lm;
+ lm.mem = placement_iter->mem;
+ lm.label = 0;
+ pair_list.insert( insert_iter, lm ); // inserts in front of insert_iter
+ placement_iter->iter = insert_iter;
+ placement_iter->iter--; // step back onto the element just inserted
+ }
+ }
+ boolean debug_labels = false;
+ ofstream debug_label_file;
+ if( debug_labels )
+ debug_label_file.open( "label_debug.txt" );
+ // number the LabeledMems in the pair_list
+ uint cur_label = 0;
+ mlist.clear(); // mlist is rebuilt below in pair_list order
+ vector< LabeledMem > pair_vec;
+ pair_vec.reserve( pair_list.size() );
+ mlist.reserve( pair_list.size() );
+ for( pair_iter = pair_list.begin(); pair_iter != pair_list.end(); ++pair_iter ){
+ pair_iter->label = cur_label++; // label == index of the match in the rebuilt mlist
+ mlist.push_back( pair_iter->mem );
+ pair_vec.push_back( *pair_iter );
+ if( debug_labels ){
+ debug_label_map.insert( map<uint, Match*>::value_type( pair_iter->label, pair_iter->mem ) );
+ debug_label_file << pair_iter->label << '\t' << (*pair_iter->mem) << endl;
+ }
+ }
+ if( debug_labels )
+ debug_label_file.close();
+
+ breakpoints.clear();
+ pair_list.clear(); // no longer needed; pair_vec holds the labelled copies
+ vector< LabeledMem >::iterator pair_vec_iter;
+ for( uint seqI = 1; seqI < seq_count; seqI++ ){
+ // sort the list on the current genome
+ LabeledMemComparator lmc( seqI );
+ sort( pair_vec.begin(), pair_vec.end(), lmc );
+ set< uint > no_match_labels; // labels of matches found to be undefined in seqI
+
+ // debugging code
+/* stringstream debug_filename;
+ debug_filename << "label_sort_" << seqI << ".txt";
+ ofstream debug_file( debug_filename.str().c_str() );
+ for( uint pairI = 0; pairI < pair_vec.size(); pairI++ ){
+ debug_file << pair_vec[ pairI ].label << *pair_vec[ pairI ].mem << endl;
+ }
+ debug_file.close();
+*/ // end debugging code
+
+ pair_vec_iter = pair_vec.begin();
+ uint block_start = pair_vec_iter->label; // label at which the current block begins
+ uint break_label = 0;
+ for( ++pair_vec_iter; pair_vec_iter != pair_vec.end(); ++pair_vec_iter ){ // examine each adjacent pair (pair_prev, pair_vec_iter)
+ vector<LabeledMem>::iterator pair_prev = pair_vec_iter;
+ pair_prev--;
+ break_label = 0;
+ uint scan_label = 0;
+ if( pair_prev->mem->Start( seqI ) == NO_MATCH ){
+ no_match_labels.insert( set< uint >::value_type( pair_prev->label ) );
+ // get the correct block start
+ if( pair_vec_iter->mem->Start( seqI ) < 0 ){
+ block_start = pair_vec_iter->label;
+ scanLabels( no_match_labels, block_start, true );
+ }else if( pair_vec_iter->mem->Start( seqI ) > 0 ){
+ block_start = pair_vec_iter->label;
+ scanLabels( no_match_labels, block_start, false );
+ }
+
+ continue; // an undefined predecessor never produces a breakpoint
+ }
+
+ if( pair_prev->mem->Start( seqI ) < 0 ){
+ // this block would break at its start
+ break_label = block_start;
+ }else{
+ // this block would break at its end
+ break_label = pair_prev->label;
+ scanLabels( no_match_labels, break_label, true );
+ }
+ if( pair_vec_iter->mem->Start( seqI ) < 0 ){
+ // scan forward to the beginning of new block
+ scan_label = pair_vec_iter->label;
+ scanLabels( no_match_labels, scan_label, true );
+ }else{
+ // scan back to the beginning of new block
+ scan_label = pair_vec_iter->label;
+ scanLabels( no_match_labels, scan_label, false );
+ }
+
+ if( pair_vec_iter->mem->Start( seqI ) < 0 &&
+ pair_prev->mem->Start( seqI ) < 0 ){
+ if( scan_label + 1 == pair_prev->label ) // both reverse and labels adjacent (descending): same block
+ continue;
+ if( debugging ){
+ map< uint, Match* >::const_iterator debug_iter = debug_label_map.find( pair_vec_iter->label );
+ while( debug_iter->first <= pair_prev->label ){
+ cout << debug_iter->first << '\t' << *(debug_iter->second) << endl;
+ ++debug_iter;
+ }
+ }
+ }else
+ if( pair_vec_iter->mem->Start( seqI ) > 0 &&
+ pair_prev->mem->Start( seqI ) > 0 ){
+
+ if( scan_label - 1 == pair_prev->label ) // both forward and labels adjacent (ascending): same block
+ continue;
+ if( debugging ){
+ map< uint, Match* >::const_iterator debug_iter = debug_label_map.find( pair_prev->label );
+ while( debug_iter->first <= pair_vec_iter->label ){
+ cout << debug_iter->first << '\t' << *(debug_iter->second) << endl;
+ ++debug_iter;
+ }
+ }
+ }
+ // check if the missing matches are in the set of non-matches
+
+ // since it didn't meet any of the above
+ // criteria it's a breakpoint. insert the label of the end of the current block
+ // note that if it's a reverse complement block, the end label is really the start label
+ breakpoints.insert( break_label );
+ block_start = scan_label;
+ }
+
+ // insert the correct block ending
+ if( pair_vec_iter != pair_vec.begin() ){
+ pair_vec_iter--; // back onto the last element in seqI order
+
+ if( pair_vec_iter->mem->Start( seqI ) < 0 ){
+ break_label = block_start;
+ }else{
+ break_label = pair_vec_iter->label;
+ scanLabels( no_match_labels, break_label, true );
+ }
+ breakpoints.insert( break_label );
+ }
+ }
+}
+
+/**
+ * Stores the output settings for permutation matrices: the file name to use
+ * and the associated weight.  Both values are simply copied into the
+ * corresponding members of this Aligner.
+ */
+void Aligner::SetPermutationOutput( std::string& permutation_filename, int64 permutation_weight )
+{
+    Aligner& self = *this;   // the parameters shadow the members of the same name
+    self.permutation_filename = permutation_filename;
+    self.permutation_weight = permutation_weight;
+}
+
+
+/**
+ * Computes the coverage of an LCB: the sum over all matches of
+ * Length() * Multiplicity().  When lcb carries sequence data (seq_table is
+ * non-empty), one unit is subtracted for every 'n'/'N' character inside a
+ * match component whose start coordinate is positive.
+ * NOTE(review): reverse-oriented components are scanned but never discounted
+ * -- confirm this asymmetry is intended.
+ *
+ * @param lcb       matches making up the LCB
+ * @param coverage  out: the computed coverage
+ */
+void GetLCBCoverage( MatchList& lcb, uint64& coverage ){
+    coverage = 0;
+    const bool have_sequences = lcb.seq_table.size() > 0;
+
+    for( vector< Match* >::iterator match_iter = lcb.begin(); match_iter != lcb.end(); ++match_iter ){
+        Match* const cur_match = *match_iter;
+        coverage += cur_match->Length() * cur_match->Multiplicity();
+
+        // without sequence information there is nothing to discount
+        if( !have_sequences )
+            continue;
+
+        for( uint seqI = 0; seqI < cur_match->SeqCount(); ++seqI ){
+            const gnSeqI lend = absolut( cur_match->Start( seqI ) );
+            const gnSeqI length = cur_match->Length();
+            if( lend == 0 )
+                continue;   // match is undefined in this sequence
+
+            const string match_seq = lcb.seq_table[ seqI ]->ToString( length, lend );
+            const bool is_forward = cur_match->Start( seqI ) > 0;
+            for( size_t s = 0; s < match_seq.size(); ++s ){
+                const bool is_n = match_seq[s] == 'n' || match_seq[s] == 'N';
+                if( is_n && is_forward )
+                    coverage--;
+            }
+        }
+    }
+}
+
+
+/**
+ * Convenience overload: reduces every LCB to an Interval built from its first
+ * match and, when it has more than one match, its last match, then delegates
+ * to the IntervalList overload.
+ * Assumes every MatchList in lcb_list is non-empty (front() is called
+ * unconditionally).
+ */
+void computeLCBAdjacencies_v2( vector<MatchList>& lcb_list, vector< int64 >& weights, vector< LCB >& adjacencies ){
+    IntervalList iv_list;
+    for( uint lcbI = 0; lcbI < lcb_list.size(); ++lcbI ){
+        MatchList& cur_lcb = lcb_list[ lcbI ];
+
+        // the outermost matches delimit the LCB
+        vector<AbstractMatch*> endpoints;
+        endpoints.push_back( cur_lcb.front() );
+        if( cur_lcb.size() > 1 )
+            endpoints.push_back( cur_lcb.back() );
+
+        Interval iv( endpoints.begin(), endpoints.end() );
+        iv_list.push_back( iv );
+    }
+    computeLCBAdjacencies_v2( iv_list, weights, adjacencies );
+}
+
+// Marker stored in left_adjacency / right_adjacency when an LCB has no neighbour on that side.
+const uint NO_ADJACENCY = (std::numeric_limits<uint>::max)();
+
+/**
+ * Builds one LCB record per interval and links each record to its left and
+ * right neighbour in every sequence.  left_adjacency always refers to the
+ * neighbour with the smaller coordinate, regardless of the LCB's orientation.
+ *
+ * End coordinates are stored signed: negative for intervals that are not in
+ * forward orientation in the sequence.  right_end is one past RightEnd().
+ * On return adjacencies is ordered by lcb_id (the index into iv_list).
+ *
+ * @param iv_list      the LCB intervals
+ * @param weights      weights[i] is copied into the record for iv_list[i]
+ * @param adjacencies  out: one record per interval
+ */
+void computeLCBAdjacencies_v2( IntervalList& iv_list, vector< int64 >& weights, vector< LCB >& adjacencies ){
+    adjacencies.clear();
+    if( iv_list.size() == 0 )
+        return; // no LCBs, hence no adjacencies
+
+    uint seq_count = iv_list[0].SeqCount();
+    uint seqI;
+    uint lcbI;
+
+    // pass 1: fill in identity, weight and signed end coordinates
+    adjacencies.resize( iv_list.size() );
+    for( lcbI = 0; lcbI < iv_list.size(); ++lcbI ){
+        LCB& lcb = adjacencies[ lcbI ];
+        lcb.lcb_id = lcbI;
+        lcb.weight = weights[ lcbI ];
+        lcb.to_be_deleted = false;
+
+        lcb.left_end.resize( seq_count );
+        lcb.right_end.resize( seq_count );
+        lcb.left_adjacency.resize( seq_count );
+        lcb.right_adjacency.resize( seq_count );
+
+        for( seqI = 0; seqI < seq_count; seqI++ ){
+            // support "ragged edges": an interval may be undefined in a sequence
+            int64 leftI = iv_list[ lcbI ].LeftEnd( seqI );
+            int64 rightI = NO_MATCH;
+            if( leftI != NO_MATCH ){
+                const bool is_forward = iv_list[ lcbI ].Orientation( seqI ) == AbstractMatch::forward;
+                rightI = iv_list[ lcbI ].RightEnd( seqI ) + 1;
+                if( !is_forward ){
+                    leftI = -leftI;
+                    rightI = -rightI;
+                }
+            }
+
+            lcb.left_end[ seqI ] = leftI;
+            lcb.right_end[ seqI ] = rightI;
+            lcb.left_adjacency[ seqI ] = NO_ADJACENCY;
+            lcb.right_adjacency[ seqI ] = NO_ADJACENCY;
+        }
+    }
+
+    // pass 2: for each sequence, order the LCBs with LCBLeftComparator and link neighbours
+    for( seqI = 0; seqI < seq_count; seqI++ ){
+        LCBLeftComparator llc( seqI );
+        sort( adjacencies.begin(), adjacencies.end(), llc );
+
+        const uint lastI = iv_list.size() - 1;
+        for( lcbI = 0; lcbI <= lastI; lcbI++ ){
+            if( lcbI > 0 )
+                adjacencies[ lcbI ].left_adjacency[ seqI ] = adjacencies[ lcbI - 1 ].lcb_id;
+            else
+                adjacencies[ lcbI ].left_adjacency[ seqI ] = NO_ADJACENCY;
+
+            if( lcbI < lastI )
+                adjacencies[ lcbI ].right_adjacency[ seqI ] = adjacencies[ lcbI + 1 ].lcb_id;
+            else
+                adjacencies[ lcbI ].right_adjacency[ seqI ] = NO_ADJACENCY;
+        }
+    }
+
+    // restore lcb_id order so that adjacencies[ id ] is the record for LCB id
+    LCBIDComparator lic;
+    sort( adjacencies.begin(), adjacencies.end(), lic );
+}
+
+
+/**
+ * Follows left_adjacency links in sequence seqI, starting at left_recurseI,
+ * until an LCB with weight >= min_weight is reached or the index becomes -1.
+ * The resulting index is written back through left_recurseI.
+ */
+void scanLeft( int& left_recurseI, vector< LCB >& adjacencies, int min_weight, int seqI ){
+    for( ;; ){
+        if( left_recurseI == -1 )
+            break;  // ran off the left end
+        if( !( adjacencies[ left_recurseI ].weight < min_weight ) )
+            break;  // heavy enough: stop here
+        left_recurseI = adjacencies[ left_recurseI ].left_adjacency[ seqI ];
+    }
+}
+/**
+ * Follows right_adjacency links in sequence seqI, starting at right_recurseI,
+ * until an LCB with weight >= min_weight is reached or the index becomes -1.
+ * The resulting index is written back through right_recurseI.
+ */
+void scanRight( int& right_recurseI, vector< LCB >& adjacencies, int min_weight, int seqI ){
+    for( ;; ){
+        if( right_recurseI == -1 )
+            break;  // ran off the right end
+        if( !( adjacencies[ right_recurseI ].weight < min_weight ) )
+            break;  // heavy enough: stop here
+        right_recurseI = adjacencies[ right_recurseI ].right_adjacency[ seqI ];
+    }
+}
+
+
+
+/** iv_regions -- lists of intervening regions between LCBs in each sequence
+ * start positions organized as iv_regions[ seqI ][ lcbI * 2 ]
+ * end positions organized as iv_regions[ seqI ][ lcbI * 2 + 1 ]
+ *
+ * For every sequence the LCBs are visited from left to right by following
+ * right_adjacency, and the absolute coordinates of each gap between two
+ * neighbouring LCBs are appended to iv_regions[ seqI ].  When entire_genome
+ * is true the region before the first LCB (starting at coordinate 1) and the
+ * region after the last LCB (ending at the sequence length) are included as
+ * well; otherwise only the gaps between LCBs are recorded.
+ * iv_regions is cleared first and stays empty when there is nothing to report.
+ */
+void CreateGapSearchList( vector< LCB >& adjacencies, const vector< gnSequence* >& seq_table, vector< vector< int64 > >& iv_regions, boolean entire_genome )
+{
+ iv_regions.clear();
+ if( adjacencies.size() == 0 )
+ return; // there aren't any intervening LCB regions!
+ if( adjacencies.size() == 1 && !entire_genome )
+ return; // there aren't any intervening LCB regions in the local area
+ boolean debug_lcb_extension = false; /**< enables debugging output */
+ const uint seq_count = seq_table.size();
+
+ uint seqI = 0;
+ int lcbI = 0;
+ iv_regions = vector< vector< int64 > >( seq_count );
+
+ // extract a gnSequence containing only the intervening regions
+ for( seqI = 0; seqI < seq_count; seqI++ ){
+
+ // find the first LCB in this sequence
+ for( lcbI = 0; lcbI < adjacencies.size(); lcbI++ ){
+ if( adjacencies[ lcbI ].left_adjacency[ seqI ] == -1 ) // no left neighbour; presumably the NO_ADJACENCY marker compares equal to -1 here -- confirm
+ break;
+ }
+ // start concatenating the intervening regions
+ // scan right
+ int right_recurseI = lcbI; // LCB on the right side of the current gap
+ lcbI = -1; // LCB on the left side of the current gap; -1 = start of the sequence
+ if( !entire_genome && right_recurseI != -1 ){ // skip the region in front of the first LCB
+ lcbI = right_recurseI;
+ right_recurseI = adjacencies[ lcbI ].right_adjacency[ seqI ];
+ }
+ gnSeqI seq_len = 0; // total length of the regions recorded for this sequence
+ while( (lcbI != -1 || right_recurseI != -1 ) && right_recurseI < (int)adjacencies.size() ){
+ int64 l_end = lcbI == -1 ? 1 : adjacencies[ lcbI ].right_end[ seqI ];
+ int64 r_end = right_recurseI == -1 ? seq_table[ seqI ]->length() : adjacencies[ right_recurseI ].left_end[ seqI ];
+
+ // break out if outside the last LCB and not searching the entire genome
+ if( !entire_genome && right_recurseI == -1 )
+ break;
+
+ l_end = absolut( l_end ); // end coordinates are stored signed; use absolute positions
+ r_end = absolut( r_end );
+
+ if( l_end > r_end && !( r_end + 1 == l_end && right_recurseI == -1 ) ){
+ std::cerr << "Overlapping LCBs. lcbI " << lcbI << " right_recurseI " << right_recurseI << endl;
+ std::cerr << "lend: " << l_end << " rend: " << r_end << endl;
+ l_end = r_end; // clamp to an empty region and carry on
+
+ }
+
+ lcbI = right_recurseI; // advance to the next gap
+ if( right_recurseI != -1 )
+ right_recurseI = adjacencies[ right_recurseI ].right_adjacency[ seqI ];
+ if( r_end + 1 == l_end && right_recurseI == -1 )
+ continue; // we're at the right end and there's nothing to add
+ seq_len += r_end - l_end;
+ iv_regions[ seqI ].push_back( l_end );
+ iv_regions[ seqI ].push_back( r_end );
+ }
+ if( debug_lcb_extension )
+ std::cerr << "seqI " << seqI << " seq_len: " << seq_len << endl;
+ }
+
+}
+
+/**
+ * Searches the intervening regions between LCBs for additional matches that
+ * are present in all sequences and appends them to new_matches.
+ *
+ * For every sequence the regions listed in iv_regions are concatenated into a
+ * temporary sequence, sorted mer lists are built for these (in memory, or on
+ * disk through DNAFileSML when the in-memory construction throws), exact
+ * matches are found with nway_mh, and the match coordinates are transposed
+ * back into the coordinates of the original sequences.
+ *
+ * @param new_matches  supplies seq_table (the original sequences); receives the matches found
+ * @param iv_regions   iv_regions[ seqI ] holds (start, end) coordinate pairs, flattened
+ * @param nway_mh      match finder; it is cleared before use
+ */
+void SearchLCBGaps( MatchList& new_matches, const std::vector< std::vector< int64 > >& iv_regions, MaskedMemHash& nway_mh ) {
+    if( iv_regions.size() == 0 )
+        return; // there aren't any intervening LCB regions!
+    size_t sI = 0;
+    for( ; sI < iv_regions.size(); sI++ )
+        if( iv_regions[sI].size() > 0 )
+            break;
+    if( sI == iv_regions.size() )
+        return; // there aren't any intervening LCB regions!
+
+    boolean debug_lcb_extension = false; /**< enables debugging output */
+
+    const uint seq_count = new_matches.seq_table.size();
+    uint seqI = 0;
+    MatchList gap_list;
+    gap_list.seq_table = vector< gnSequence* >( seq_count ); /**< intervening regions of sequences */
+    gap_list.sml_table = vector< SortedMerList* >( seq_count );
+
+    // extract a gnSequence containing only the intervening regions
+    for( seqI = 0; seqI < seq_count; seqI++ ){
+        gap_list.seq_table[ seqI ] = new gnSequence();
+        gap_list.sml_table[ seqI ] = new DNAMemorySML();
+        gnSeqI seq_len = 0;
+        for( size_t ivI = 0; ivI < iv_regions[seqI].size(); ivI += 2 )
+        {
+            int64 l_end = iv_regions[seqI][ivI];
+            int64 r_end = iv_regions[seqI][ivI+1];
+            try{
+                if( debug_lcb_extension )
+                    cerr << "Adding " << seqI << "\t" << l_end << "\t" << r_end << "\t(" << r_end - l_end << " bp)" << endl;
+                gap_list.seq_table[ seqI ]->append( new_matches.seq_table[ seqI ]->ToString(r_end - l_end, l_end ) );
+            }catch(...){
+                // NOTE(review): a failed extraction is silently ignored here, yet
+                // seq_len below still counts the region -- confirm this is intended.
+                cout << "";
+            }
+            seq_len += r_end - l_end;
+        }
+        if( debug_lcb_extension )
+            cerr << "seqI " << seqI << " seq_len: " << seq_len << endl;
+    }
+    //
+    // search for MUMs in the intervening sequence regions
+    //
+
+    // calculate potential mer sizes for searches, based on the average region length
+    gnSeqI total_iv_length = 0;
+    for( seqI = 0; seqI < seq_count; seqI++ )
+        total_iv_length += gap_list.seq_table[ seqI ]->length();
+    total_iv_length /= seq_count;
+
+    uint search_mer_size = getDefaultSeedWeight( total_iv_length );
+    if( search_mer_size < MIN_DNA_SEED_WEIGHT ){
+        // The seed size is too small to be significant.  Release the temporary
+        // sequences and mer lists before leaving; they were leaked previously.
+        for( seqI = 0; seqI < seq_count; seqI++ ){
+            gap_list.sml_table[ seqI ]->Clear();
+            delete gap_list.sml_table[ seqI ];
+            delete gap_list.seq_table[ seqI ];
+        }
+        return;
+    }
+    uint64 default_seed = getSeed( search_mer_size );
+
+    // Create sorted mer lists for the intervening gap region
+    vector< boost::filesystem::path > delete_files;
+    boolean create_succeeded = true;
+    for( seqI = 0; seqI < seq_count; seqI++ ){
+        gap_list.sml_table[ seqI ]->Clear();
+        try{
+            if( debug_lcb_extension )
+                cerr << "Creating memory SML for seqI " << seqI << endl;
+            gap_list.sml_table[ seqI ]->Create( *(gap_list.seq_table[ seqI ]), default_seed );
+        }catch(...){
+            create_succeeded = false;
+            break;
+        }
+    }
+    if( !create_succeeded ){
+        // free memory consumed by any SMLs
+        for( seqI = 0; seqI < seq_count; seqI++ ){
+            gap_list.sml_table[ seqI ]->Clear();
+            delete gap_list.sml_table[ seqI ];
+        }
+
+        for( seqI = 0; seqI < seq_count; seqI++ ){
+            cerr << "Creating dmSML for seqI " << seqI << endl;
+            // presumably we ran out of memory and couldn't use a MemorySML.
+            // try using a FileSML with external sort
+            string concat_file = CreateTempFileName("seqconcat");
+
+            concat_file += ".raw"; // the .raw extension tells libGenome it's a raw file
+            gnRAWSource::Write( *(gap_list.seq_table[ seqI ]), concat_file.c_str() );
+            delete_files.push_back( concat_file );
+            delete gap_list.seq_table[ seqI ]; // make sure memory gets freed!
+            cerr << "Wrote raw sequence for seqI " << seqI << endl;
+            gap_list.seq_table[ seqI ] = new gnSequence();
+            gap_list.seq_table[ seqI ]->LoadSource( concat_file.c_str() );
+            cerr << "Loaded sequence " << seqI << gap_list.seq_table[ seqI ]->length() << "b.p.\n";
+            string sml_file = CreateTempFileName("dmsml");
+            DNAFileSML* sml = new DNAFileSML( sml_file.c_str() );
+            gap_list.sml_table[ seqI ] = sml;
+            sml->dmCreate( *(gap_list.seq_table[ seqI ]), default_seed );
+            delete_files.push_back( sml_file );
+            delete_files.push_back( sml_file + ".coords" );
+        }
+    }
+
+    // Find all exact matches in the gap region
+    nway_mh.Clear();
+    nway_mh.FindMatches( gap_list );
+    gap_list.MultiplicityFilter( seq_count );
+
+    if( debug_lcb_extension ){
+        ofstream debug_extension_out( "new_extension_matches.txt" );
+        WriteList( gap_list, debug_extension_out );
+        debug_extension_out.close();
+    }
+
+    //
+    // If an N mask was used, transpose MUMs back into the previous
+    // sequence coordinates.  This reads the used coordinates from the
+    // file-based SMLs, so it has to happen before the SMLs are deleted
+    // (they used to be deleted first, leaving a use-after-free here).
+    //
+    if( !create_succeeded ){
+        for( seqI = 0; seqI < seq_count; seqI++ )
+            transposeMatches( gap_list, seqI, ((FileSML*)gap_list.sml_table[ seqI ])->getUsedCoordinates() );
+    }
+
+    // free memory used by SMLs!
+    for( seqI = 0; seqI < seq_count; seqI++ ){
+        gap_list.sml_table[ seqI ]->Clear();
+        delete gap_list.sml_table[ seqI ];
+    }
+
+    //
+    // Transpose MUMs back into their original sequence coordinates
+    //
+    for( seqI = 0; seqI < seq_count; seqI++ )
+        transposeMatches( gap_list, seqI, iv_regions[ seqI ] );
+
+    EliminateOverlaps( gap_list );
+    gap_list.MultiplicityFilter( seq_count );
+    // filter out matches that are too short
+    gap_list.LengthFilter( MIN_ANCHOR_LENGTH );
+
+    // free memory used by sequences!
+    for( seqI = 0; seqI < seq_count; seqI++ )
+        delete gap_list.seq_table[ seqI ];
+
+    // remove the temporary files created for the file-based fallback
+    for( size_t delI = 0; delI < delete_files.size(); delI++ )
+        boost::filesystem::remove( delete_files[delI] );
+
+    new_matches.insert( new_matches.end(), gap_list.begin(), gap_list.end() );
+}
+
+
+
+/**
+ * Ordering predicate for AbstractMatch pointers.
+ *
+ * Matches are ordered first by max( FirstStart(), seq ), where seq is the
+ * sequence index given at construction.  Ties are broken by comparing the
+ * absolute start coordinates in sequence seq and the sequences after it,
+ * skipping sequences in which either match is undefined or both starts are equal.
+ */
+class MatchLeftEndComparator {
+public:
+    /** @param seq  index of the first sequence whose start coordinates are compared */
+    MatchLeftEndComparator( unsigned seq = 0 ) : m_seq( seq ) {}
+
+    // Takes a const reference so that the comparator can be copied from const
+    // objects and temporaries, as standard algorithms such as std::sort require.
+    MatchLeftEndComparator( const MatchLeftEndComparator& msc ) : m_seq( msc.m_seq ) {}
+
+    // TODO?? make this do a wraparound comparison if all is equal?
+    /** @return true when a orders before b */
+    boolean operator()(const AbstractMatch* a, const AbstractMatch* b) const{
+        int32 start_diff = max( a->FirstStart(), m_seq ) - max( b->FirstStart(), m_seq );
+        if(start_diff == 0){
+            // only compare sequences that both matches can have
+            uint32 m_count = a->SeqCount();
+            m_count = m_count <= b->SeqCount() ? m_count : b->SeqCount();
+            for(uint32 seqI = m_seq; seqI < m_count; seqI++){
+                int64 a_start = absolut( a->Start( seqI ) ), b_start = absolut( b->Start( seqI ) );
+                int64 diff = a_start - b_start;
+                if(a_start == (int64)NO_MATCH || b_start == (int64)NO_MATCH)
+                    continue;   // undefined in one match: cannot decide here
+                else if(diff == 0)
+                    continue;   // identical starts: look at the next sequence
+                else
+                    return diff < 0;
+            }
+        }
+        return start_diff < 0;
+    }
+private:
+    unsigned m_seq;     // first sequence index considered by operator()
+};
+
+/**
+ * Transposes the coordinates of matches in mlist to correspond to the original
+ * set of source sequence regions described by seq_regions, splitting matches if
+ * necessary.
+ *
+ * seq_regions is read as a flat list of (start, end) pairs.  The match
+ * coordinates in sequence seqI are taken to refer to the concatenation of
+ * those regions.  mlist is re-sorted with MatchLeftEndComparator( seqI ), the
+ * start of every match defined in seqI is rewritten in place, and any pieces
+ * created by splitting are appended to the end of mlist.
+ * Nothing is done when seq_regions holds fewer than two values.
+ */
+void transposeMatches( MatchList& mlist, uint seqI, const vector< int64 >& seq_regions ){
+ if( seq_regions.size() < 2 )
+ return; // no work to be done here...
+
+ uint matchI = 0;
+ MatchLeftEndComparator msc( seqI );
+ sort( mlist.begin(), mlist.end(), msc );
+ uint regionI = 0; // index of the start value of the current region in seq_regions
+ gnSeqI region_sum = seq_regions[ 1 ] - seq_regions[ 0 ]; // total length of all regions up to and including the current one
+ gnSeqI region_start_sum = 0; // total length of all regions before the current one
+ MatchList new_matches; // pieces split off from matches that span several regions
+
+ for( ; matchI < mlist.size(); matchI++ ){
+ // find the translated start coordinate for this match
+ int64 trans_start = mlist[ matchI ]->Start( seqI );
+ int64 iv_orig_start = trans_start; // start in concatenated coordinates
+ if( trans_start == 0 ) // match is undefined in this sequence
+ continue;
+ while( region_sum < absolut( trans_start ) && regionI + 2 < seq_regions.size() ){ // advance to the region containing the start; never rewinds, so this relies on the sort above
+ regionI += 2;
+ region_start_sum = region_sum;
+ region_sum += seq_regions[ regionI + 1 ] - seq_regions[ regionI ];
+ }
+
+ if( trans_start < 0 ) // the sign (orientation) is preserved by the translation
+ trans_start = -seq_regions[ regionI ] - ( -trans_start - region_start_sum ) + 1;
+ else if( trans_start > 0 )
+ trans_start = seq_regions[ regionI ] + ( trans_start - region_start_sum ) - 1;
+
+ int64 trans_end = mlist[ matchI ]->Start( seqI ); // last position of the match, still in concatenated coordinates
+ trans_end += trans_end > 0 ? mlist[ matchI ]->Length() - 1: -(int64)(mlist[ matchI ]->Length()) + 1;
+
+ mlist[ matchI ]->SetStart( seqI, trans_start );
+
+ // this bad boy may need to be split
+ gnSeqI end_region_sum = region_sum;
+ gnSeqI end_prev_sum = region_start_sum; // updated below but not read anywhere in this function
+ uint end_regionI = regionI;
+ Match* cur_match = mlist[ matchI ];
+ while( end_region_sum < absolut( trans_end ) && end_regionI + 2 < seq_regions.size() ){ // the match runs past the end of its region
+ end_regionI += 2;
+
+ Match* left_match = new Match( *cur_match ); // copy that will hold the part lying in the next region
+ // clip off the part going to the other match
+ if( left_match->Start( seqI ) < 0 ){
+ cur_match->CropStart( absolut( iv_orig_start ) + left_match->Length() - end_region_sum - 1);
+ left_match->CropEnd( cur_match->Length() );
+ }else{
+ cur_match->CropEnd( absolut( iv_orig_start ) + left_match->Length() - end_region_sum - 1);
+ left_match->CropStart( cur_match->Length() );
+ }
+
+ iv_orig_start += iv_orig_start > 0 ? cur_match->Length(): -(int64)cur_match->Length(); // start of the remaining piece in concatenated coordinates
+
+ if( trans_start < 0 )
+ trans_start = -seq_regions[ end_regionI ] - ( -iv_orig_start - end_region_sum ) + 1;
+ else if( trans_start > 0 )
+ trans_start = seq_regions[ end_regionI ] + ( iv_orig_start - end_region_sum ) - 1;
+
+ left_match->SetStart( seqI, trans_start );
+
+ cur_match = left_match; // the new piece may itself need further splitting
+ new_matches.push_back( left_match );
+
+ end_prev_sum = end_region_sum;
+ end_region_sum += seq_regions[ end_regionI + 1 ] - seq_regions[ end_regionI ];
+
+ }
+// if( end_region_sum == absolut( trans_end ) )
+// cerr << "Beware of a possible bug in transposeMatches()\n";
+ }
+
+ // voila... coordinates are translated
+ mlist.insert( mlist.end(), new_matches.begin(), new_matches.end() );
+}
+
+/**
+ * Splits meml into LCBs at the given breakpoints.
+ *
+ * Each breakpoint is the index of the last match of a block; a block runs
+ * from the match after the previous breakpoint (or from index 0) up to and
+ * including its own breakpoint.  For every block a MatchList is appended to
+ * lcb_list and its coverage, as computed by GetLCBCoverage(), is appended to
+ * weights.  When breakpoints is empty the function returns without touching
+ * lcb_list or weights.
+ */
+void ComputeLCBs( MatchList& meml, set<uint>& breakpoints, vector<MatchList>& lcb_list, vector<int64>& weights ){
+    // there must be at least one end of a block defined
+    if( breakpoints.size() < 1 )
+        return;
+
+    lcb_list.clear();
+    weights.clear();
+
+    // start from a copy of meml, then replace its matches block by block
+    MatchList lcb = meml;
+    uint block_first = 0;   // index of the first match in the current block
+    for( set<uint>::iterator break_iter = breakpoints.begin(); break_iter != breakpoints.end(); ++break_iter ){
+        const uint block_last = *break_iter;
+
+        lcb.clear();
+        lcb.insert( lcb.begin(), meml.begin() + block_first, meml.begin() + block_last + 1 );
+        block_first = block_last + 1;
+
+        // the coverage of the block serves as its weight
+        uint64 coverage;
+        GetLCBCoverage( lcb, coverage );
+        weights.push_back( coverage );
+
+        lcb_list.push_back( lcb );
+    }
+}
+
+void Aligner::Recursion( MatchList& r_list, Match* r_begin, Match* r_end, boolean nway_only ){
+ try{
+ gnSeqI gap_size = 0;
+ uint seqI = 0;
+// gnSeqI min_gap_size = 0;
+ boolean create_ok = true;
+ // create gnSequences for each intervening region
+ // create a MatchList for the intervening region
+ MatchList gap_list;
+
+ gap_list.seq_table.reserve( seq_count );
+ gap_list.sml_table.reserve( seq_count );
+ vector< int64 > starts;
+ uint below_cutoff_count = 0;
+//
+// Get the sequence in the intervening gaps between these two matches
+//
+ for( seqI = 0; seqI < seq_count; seqI++ ){
+ int64 gap_end = 0;
+ int64 gap_start = 0;
+ getInterveningCoordinates( r_list.seq_table, r_begin, r_end, seqI, gap_start, gap_end );
+ if( (r_end && r_end->Start( seqI ) == NO_MATCH) ||
+ (r_begin && r_begin->Start( seqI ) == NO_MATCH )){
+ below_cutoff_count++;
+ cerr << "It's screwed up\n";
+ gap_list.seq_table.push_back( new gnSequence() );
+ gap_list.sml_table.push_back( new DNAMemorySML() );
+ continue;
+ }
+ if( gap_end < 0 && gap_start > 0 ){
+ create_ok = false;
+ cerr << "It's screwed up 2\n";
+ break; // bail out on directional inconsistency
+ }else if( gap_end < 0 && gap_start > 0 ){
+ cerr << "It's screwed up 3\n";
+ create_ok = false;
+ break; // bail out on directional inconsistency
+ }
+ int64 diff = gap_end - gap_start;
+ diff = 0 < diff ? diff : 0;
+ gap_size = diff < gap_size ? gap_size : diff;
+
+ if( gap_start == 0 )
+ cerr << "scheiss\n";
+
+ if( debug )
+ cout << r_list.seq_table[ seqI ]->length() << endl;
+
+ if( diff < min_recursive_gap_length )
+ below_cutoff_count++;
+ starts.push_back( gap_start );
+ gnSequence* new_seq = new gnSequence( r_list.seq_table[ seqI ]->subseq( gap_start, diff ) );
+ gap_list.seq_table.push_back( new_seq );
+ gap_list.sml_table.push_back( new DNAMemorySML() );
+ }
+
+ // only perform recursive anchoring if the gapped regions are long enough
+ // otherwise just let ClustalW do the work
+ if( below_cutoff_count + 1 < seq_count ){
+ if( nway_only )
+ nway_mh.Clear();
+ else
+ gap_mh.get().Clear();
+
+ multimap< uint, uint > mer_sizes;
+ // calculate potential mer sizes for searches
+ for( seqI = 0; seqI < seq_count; seqI++ ){
+ uint search_mer_size = getDefaultSeedWeight( gap_list.seq_table[ seqI ]->length() );
+ mer_sizes.insert( multimap< uint, uint >::value_type( search_mer_size, seqI ) );
+ }
+ multimap< uint, uint >::iterator mer_iter = mer_sizes.end();
+ mer_iter--;
+ vector< uint > search_seqs;
+ while( mer_iter != mer_sizes.end() ){
+ uint prev_mer = mer_iter->first;
+ uint new_seqs = 0;
+ while( true ){
+ if( mer_iter->first < MIN_DNA_SEED_WEIGHT )
+ break;
+ if( mer_iter->first == prev_mer || search_seqs.size() < 2 ){
+ search_seqs.push_back( mer_iter->second );
+ new_seqs++;
+ if( mer_iter == mer_sizes.begin() ){
+ mer_iter = mer_sizes.end(); // signify that the scan is complete
+ break;
+ }
+ prev_mer = mer_iter->first;
+ mer_iter--;
+ }else
+ break;
+ }
+
+ if( search_seqs.size() < 2 )
+ break;
+ // look for MUMs
+
+ //
+ // Create sorted mer lists for the intervening gap region
+ //
+
+ uint64 default_seed = getSeed( prev_mer );
+ if( prev_mer < MIN_DNA_SEED_WEIGHT )
+ break;
+ for( uint seqI = 0; seqI < gap_list.seq_table.size(); seqI++ ){
+ gap_list.sml_table[ seqI ]->Clear();
+ gap_list.sml_table[ seqI ]->Create( *(gap_list.seq_table[ seqI ]), default_seed );
+ }
+ //
+ // Find all exact matches in the gap region
+ //
+ MatchList cur_mems = gap_list;
+ cur_mems.clear();
+ if( nway_only ){
+ // no sense in searching for matches in subsets!!
+ if( search_seqs.size() < seq_count )
+ continue;
+ nway_mh.ClearSequences();
+ nway_mh.FindMatches( cur_mems );
+ }else{
+ gap_mh.get().ClearSequences();
+ gap_mh.get().FindMatches( cur_mems );
+ }
+ for( size_t mI = 0; mI < cur_mems.size(); ++mI )
+ cur_mems[mI]->Free();
+ cur_mems.clear();
+ }
+ if( nway_only )
+ nway_mh.GetMatchList( gap_list );
+ else
+ gap_mh.get().GetMatchList( gap_list );
+
+
+ // delete overlaps/inclusions
+ EliminateOverlaps( gap_list );
+ // mult. filter after EliminateOverlaps because e.o. may generate some subset matches
+ if( nway_only )
+ gap_list.MultiplicityFilter( seq_count );
+
+ // for anchor accuracy, throw out any anchors that are shorter than the minimum
+ // anchor length after EliminateOverlaps()
+ gap_list.LengthFilter( MIN_ANCHOR_LENGTH );
+
+ // if( min_gap_size < search_mer_size )
+ // create_ok = false;
+ if( gap_list.size() > 0 && create_ok ){
+
+ /* if( debug ){
+ cout << "Starting mem: " << *r_begin << endl;
+ cout << "Next mem: " << *r_end << endl;
+ list<Match*>::iterator gappy_iter = gap_list.begin();
+ while( gappy_iter != gap_list.end() ){
+ cout << **gappy_iter;
+ cout << endl;
+ gappy_iter++;
+ }
+ }
+ */
+
+ // move all the matches that were found
+ vector< Match* >::iterator mum_iter = gap_list.begin();
+ for( ; mum_iter != gap_list.end(); ){
+ boolean add_ok = true;
+ for( uint seqI = 0; seqI < (*mum_iter)->SeqCount(); ++seqI ){
+ int64 gap_start;
+ if( (*mum_iter)->Start( seqI ) == NO_MATCH )
+ continue;
+ else if( (*mum_iter)->Start( seqI ) < 0 ){
+ gap_start = r_begin != NULL ? -r_begin->End( seqI ) : 0;
+ if( gap_start > 0 )
+ // gap_start = -r_end->Start( seqI ) + r_end->Length() - 1;
+ gap_start = r_end != NULL ? r_end->Start( seqI ) - r_end->Length() + 1 : 0;
+ else if( r_begin )
+ add_ok = false;
+ (*mum_iter)->SetStart( seqI, (*mum_iter)->Start( seqI ) + gap_start );
+ }else{
+ // insert them all before mem_iter
+ gap_start = r_begin != NULL ? r_begin->End( seqI ) : 0;
+ if( gap_start < 0 ){
+ gap_start = r_end != NULL ? r_end->Start( seqI ) - r_end->Length() + 1 : 0;
+ add_ok = false;
+ }
+ (*mum_iter)->SetStart( seqI, (*mum_iter)->Start( seqI ) + gap_start );
+ }
+ }
+ if( add_ok )
+ r_list.push_back( *mum_iter );
+ else{
+ (*mum_iter)->Free();
+ (*mum_iter) = NULL;
+ }
+ ++mum_iter;
+ }
+ // for( ; mum_iter != gap_list.end(); )
+ // match_allocator.Free( *mum_iter );
+ }
+ }
+ // delete sequences and smls
+ for( uint seqI = 0; seqI < gap_list.seq_table.size(); ++seqI )
+ delete gap_list.seq_table[ seqI ];
+ for( uint seqI = 0; seqI < gap_list.sml_table.size(); ++seqI )
+ delete gap_list.sml_table[ seqI ];
+
+ gap_list.seq_table.clear();
+ gap_list.sml_table.clear();
+
+ }catch( gnException& gne ){
+ cerr << gne << endl;
+ }catch( exception& e ){
+ cerr << e.what() << endl;
+ }catch(...){
+ cerr << "When I say 'ohhh' you say 'shit'!\n";
+ }
+}
+
+ // Compute the gapped alignments between consecutive anchors in an LCB.
+ // Every pair of neighbouring matches in mlist is handed to gal->Align(); the anchors
+ // and the alignments that succeeded are then interleaved (anchor, alignment, anchor, ...)
+ // and stored in iv through SetMatches().
+ // When collinear_genomes is set, the regions left of the first anchor and right of the
+ // last anchor are aligned as well (a NULL match is passed to Align() for the open end).
+ // apt.cur_leftend is advanced by the number of inter-anchor regions, and
+ // apt.prev_progress is updated each time progress is printed.
+ // NOTE(review): alignments whose Align() call fails are never released in this
+ // function -- confirm who owns them before adding a Free()/delete here.
+ void AlignLCBInParallel( bool collinear_genomes, mems::GappedAligner* gal, MatchList& mlist, Interval& iv, AlnProgressTracker& apt )
+ {
+ // check whether this function can do anything useful...
+ // An empty list has no anchors at all; it must be rejected even in collinear mode,
+ // otherwise the unsigned mlist.size()-1 below wraps around and mlist[0] / mlist.back()
+ // are read from an empty container.
+ if( mlist.size() == 0 || ( !collinear_genomes && mlist.size() < 2 ) ){
+ iv.SetMatches( mlist );
+ return;
+ }
+ // number of regions that lie between two adjacent anchors
+ const size_t gap_count = mlist.size() - 1;
+ // slots [0, gap_count) receive the inter-anchor alignments, the final slot
+ // (index mlist.size()) receives the alignment right of the last anchor
+ vector<GappedAlignment*> gapped_alns(mlist.size()+1, NULL);
+ vector<int> success(gapped_alns.size(), 0);
+ gnSeqI progress_base = apt.cur_leftend;
+ //#pragma omp parallel for
+ for( size_t mI = 0; mI < gap_count; mI++ )
+ {
+ // align the region between mI and mI+1
+ GappedAlignment ga(mlist.seq_table.size(),0);
+ gapped_alns[mI] = ga.Copy();
+
+ bool align_success = gal->Align( *(gapped_alns[mI]), mlist[mI], mlist[mI+1], mlist.seq_table );
+ if(align_success)
+ success[mI] = 1;
+ if(mI % 50 == 0 && mI > 0)
+ {
+ // update and print progress
+ int done = 0;
+ for( size_t i = 0; i < gapped_alns.size(); i++ )
+ if(gapped_alns[i] != NULL)
+ done++;
+ //#pragma omp critical
+ {
+ double cur_progress = ((double)(progress_base+done) / (double)apt.total_len)*100.0;
+ printProgress((uint)apt.prev_progress, (uint)cur_progress, cout);
+ apt.prev_progress = cur_progress;
+ }
+ }
+ }
+ apt.cur_leftend += gap_count;
+
+ // merge the alignments and anchors back together
+ // worst case: every anchor, every inter-anchor alignment and both end alignments
+ vector<AbstractMatch*> merged(mlist.size()*2 + 1);
+ size_t mlistI = 0;
+ size_t gappedI = 0;
+ bool turn = true; // true: emit an anchor next, false: emit (or skip) an alignment
+ size_t mJ = 0;
+
+ // check if genomes are collinear and get the start and end alignments if necessary
+ if(collinear_genomes)
+ {
+ GappedAlignment ga_tmp(mlist.seq_table.size(),0);
+ GappedAlignment* ga = ga_tmp.Copy();
+ bool align_success = gal->Align( *ga, NULL, mlist[0], mlist.seq_table );
+ if(align_success)
+ merged[mJ++] = ga;
+ gapped_alns[mlist.size()] = ga_tmp.Copy();
+ align_success = gal->Align( *(gapped_alns[mlist.size()]), mlist.back(), NULL, mlist.seq_table );
+ if(align_success)
+ success[mlist.size()] = 1;
+ }
+ for( ; mJ < merged.size() && mlistI < mlist.size(); )
+ {
+ if(turn)
+ merged[mJ++] = mlist[mlistI++];
+ else if(success[gappedI])
+ merged[mJ++] = gapped_alns[gappedI++];
+ else
+ gappedI++; // this region failed to align, leave it out
+ turn = !turn;
+ }
+ // add the last alignment
+ if( success[mlist.size()]==1 )
+ merged[mJ++] = gapped_alns.back();
+ merged.resize(mJ);
+
+ iv.SetMatches(merged);
+ }
+
+ // Compute the gapped alignments between anchors in an LCB.
+ // Walks over consecutive pairs of n-way matches (Multiplicity() == seq_count) in mlist,
+ // asks gal->Align() for a gapped alignment of the region between each pair, and stores
+ // the anchors plus every successful alignment in iv via SetMatches().
+ // With collinear_genomes set, the regions before the first and after the last n-way
+ // match are aligned too (NULL is passed to Align() for the open end).
+ // mlist is emptied while working and refilled with the original matches at the end.
+ // Exceptions are caught and reported on cerr; in that case iv is not updated.
+ void Aligner::AlignLCB( MatchList& mlist, Interval& iv ){
+ // check whether this function can do anything useful...
+ if( !collinear_genomes && mlist.size() < 2 ){
+ iv.SetMatches( mlist );
+ return;
+ }
+
+ vector< AbstractMatch* > iv_matches;
+ boolean debug_recurse = false;
+ int print_interval = 50;
+ try{
+ list< Match* > match_list;
+ match_list.insert( match_list.end(), mlist.begin(), mlist.end() );
+ mlist.clear();
+ // copy of the (now empty) list: keeps the sequence table without any matches
+ MatchList r_list = mlist;
+
+ list< Match* >::iterator recurse_iter = match_list.begin();
+ list< Match* >::iterator recurse_prev = match_list.begin();
+ // scan ahead to the first n-way matches
+ while( recurse_prev != match_list.end() && (*recurse_prev)->Multiplicity() != seq_count )
+ ++recurse_prev;
+
+ recurse_iter = recurse_prev;
+ if( !collinear_genomes ){
+ if( recurse_iter != match_list.end() )
+ ++recurse_iter;
+ while( recurse_iter != match_list.end() && (*recurse_iter)->Multiplicity() != seq_count )
+ ++recurse_iter;
+ }else
+ cout << "Assuming collinear genomes...\n";
+
+ uint memI = 0;
+ while( true ){
+ // recurse_prev can be end() here (no n-way match found), so only print the
+ // progress line when there is a match to dereference
+ if( ( ( memI >= print_interval && memI % print_interval == 0 ) || debug ) &&
+ recurse_prev != match_list.end() )
+ cout << "Number: " << memI << " match " << **recurse_prev << endl;
+ ++memI;
+ if( debug_recurse ){
+ cout << "Recursing on " << endl;
+ if( recurse_prev != match_list.end() )
+ cout << **recurse_prev << " and " << endl;
+ if( recurse_iter != match_list.end() )
+ cout << **recurse_iter << endl;
+ }
+
+ // recurse on a pair of matches!
+ // this function should locate all matches between the two iterators
+ // and add them to r_list
+ r_list.clear();
+ GappedAlignment* cr = NULL;
+ boolean align_success = false;
+
+ // NULL on either side means "open end" (only happens in collinear mode or
+ // when no n-way match exists)
+ Match* r_lend = NULL;
+ Match* r_rend = NULL;
+ if( recurse_iter != recurse_prev )
+ r_lend = *recurse_prev;
+ if( recurse_iter != match_list.end() )
+ r_rend = *recurse_iter;
+
+ // attempt a clustalW alignment
+ cr = new GappedAlignment();
+ align_success = gal->Align( *cr, r_lend, r_rend, r_list.seq_table );
+
+ // add the gapped alignment to the Interval
+ if( r_lend != NULL )
+ iv_matches.push_back( r_lend );
+ if( align_success )
+ iv_matches.push_back( cr );
+ else
+ delete cr; // nobody else holds the failed alignment, don't leak it
+
+ // scan ahead to the next pair of n-way matches
+ recurse_prev = recurse_iter;
+ if( recurse_iter != match_list.end() )
+ ++recurse_iter;
+ while( recurse_iter != match_list.end() && (*recurse_iter)->Multiplicity() != seq_count )
+ ++recurse_iter;
+
+ if( ( recurse_iter == match_list.end() && !collinear_genomes ) ||
+ ( recurse_prev == match_list.end() && collinear_genomes ) )
+ break;
+ }
+ // get the last little bit at the end of the LCB.
+ list< Match* >::iterator iter = recurse_prev;
+ for( ; iter != recurse_iter; ++iter )
+ iv_matches.push_back(*iter);
+
+ mlist.insert( mlist.end(), match_list.begin(), match_list.end() );
+ iv.SetMatches(iv_matches);
+
+ }catch( gnException& gne ){
+ cerr << gne << endl;
+ }catch(exception& e){
+ cerr << e.what();
+ }catch(...){
+ cerr << "matrix exception?\n";
+ }
+ }
+
+ // just search each intervening region once for matches, no gapped alignment...
+ // For every pair of neighbouring matches in mlist the region between them is searched
+ // with Recursion(); newly found n-way matches are inserted into the list, all others
+ // are freed. With leftmost / rightmost set, the region before the first / after the
+ // last match is searched too (NULL is passed for the open end).
+ // Each searched pair is copied and appended to new_cache; a pair that is already
+ // present in the member search_cache is not searched again.
+ // Finally mlist is multiplicity-filtered and overlaps are eliminated.
+ void Aligner::SearchWithinLCB( MatchList& mlist, std::vector< search_cache_t >& new_cache, bool leftmost, bool rightmost){
+ // check whether this function can do anything useful...
+ if( !(leftmost || rightmost) && mlist.size() < 2 )
+ return;
+
+ boolean debug_recurse = false;
+ int print_interval = 50;
+
+ try{
+ list< Match* > match_list;
+ match_list.insert( match_list.end(), mlist.begin(), mlist.end() );
+ mlist.clear();
+ // copy of the (now empty) list: keeps the sequence table without any matches
+ MatchList r_list = mlist;
+
+ list< Match* >::iterator recurse_iter = match_list.begin();
+ list< Match* >::iterator recurse_prev = match_list.begin();
+ if( !leftmost && recurse_iter != match_list.end() )
+ ++recurse_iter;
+
+ uint memI = 0;
+ uint matchI = 0;
+ while( recurse_prev != match_list.end() ){
+ if( memI >= print_interval && memI % print_interval == 0 || debug)
+ cout << "Number: " << memI << " match " << **recurse_prev << endl;
+ ++memI;
+ if( debug_recurse ){
+ cout << "Recursing on " << endl;
+ if( recurse_prev != match_list.end() )
+ cout << **recurse_prev << " and " << endl;
+ if( recurse_iter != match_list.end() )
+ cout << **recurse_iter << endl;
+ }
+
+
+ // recurse on a pair of matches!
+ // this function should locate all matches between the two iterators
+ // and add them to r_list
+ r_list.clear();
+ Match* r_left = NULL;
+ Match* r_right = NULL;
+ if( recurse_iter == match_list.begin() && leftmost ){
+ r_left = NULL;
+ r_right = *recurse_iter;
+ }else if( recurse_iter == match_list.end() && rightmost ){
+ r_left = *recurse_prev;
+ r_right = NULL;
+ }else{
+ r_left = *recurse_prev;
+ r_right = *recurse_iter;
+ }
+ // check the cache to see whether this search has already been done!
+
+ search_cache_t cacheval = make_pair( r_left, r_right );
+ if( cacheval.first != NULL )
+ cacheval.first = cacheval.first->Copy();
+ if( cacheval.second != NULL )
+ cacheval.second = cacheval.second->Copy();
+ // lower_bound yields the first entry that is not less than cacheval, so an
+ // equivalent entry (if any) is exactly the one returned. upper_bound would
+ // always return a strictly greater entry and the cache could never hit.
+ std::vector< search_cache_t >::iterator cache_entry = std::lower_bound( search_cache.begin(), search_cache.end(), cacheval, cache_comparator );
+ if( cache_entry == search_cache.end() ||
+ (cache_comparator( cacheval, *cache_entry ) || cache_comparator( *cache_entry, cacheval )) )
+ {
+ // search this region
+ Recursion( r_list, r_left, r_right, true );
+ }
+ new_cache.push_back( cacheval );
+
+ if( debug_recurse ){
+ vector< Match* >::iterator r_iter = r_list.begin();
+ cout << "Found matches " << endl;
+ for(; r_iter != r_list.end(); ++r_iter )
+ cout << **r_iter << endl;
+ }
+
+ // insert any n-way matches into the match list
+ for( matchI = 0; matchI < r_list.size(); ++matchI ){
+ if( r_list[ matchI ]->Multiplicity() == seq_count ){
+ match_list.insert( recurse_iter, r_list[ matchI ] );
+ }else
+ {
+ r_list[matchI]->Free();
+ r_list[matchI] = NULL;
+ }
+ }
+
+ // move ahead to the next pair of n-way matches
+ recurse_prev = recurse_iter;
+ if( recurse_iter != match_list.end() )
+ ++recurse_iter;
+
+ // break early if we aren't assuming genome collinearity
+ if( !rightmost && recurse_iter == match_list.end() )
+ break;
+
+ }
+
+ mlist.insert( mlist.begin(), match_list.begin(), match_list.end() );
+
+ }catch( gnException& gne ){
+ cerr << gne << endl;
+ }catch(exception& e){
+ cerr << e.what();
+ }catch(...){
+ cerr << "matrix exception?\n";
+ }
+
+ // Multiplicity Filter...
+ mlist.MultiplicityFilter( seq_count );
+ EliminateOverlaps( mlist );
+ // E.O. can create some matches of lower multiplicity
+ mlist.MultiplicityFilter( seq_count );
+ }
+
+ // Debugging aid: re-derives the LCBs from copies of the given adjacencies, LCB list and
+ // weights, and reports on cerr whenever lcb_count, the number of filtered LCBs and the
+ // number of breakpoints found by AaronsLCB() disagree. The arguments are not modified.
+ void Aligner::consistencyCheck( uint lcb_count, vector< LCB >& adjacencies, vector< MatchList >& lcb_list, vector< int64 >& weights ){
+ // work on private copies so the caller's data stays untouched
+ vector< LCB > adj_copy( adjacencies );
+ vector< MatchList > lcb_copy( lcb_list );
+ vector< int64 > weight_copy( weights );
+ filterMatches( adj_copy, lcb_copy, weight_copy );
+
+ // flatten the filtered LCBs into one list and count its breakpoints
+ MatchList all_matches;
+ for( vector< MatchList >::iterator lcb_iter = lcb_copy.begin(); lcb_iter != lcb_copy.end(); ++lcb_iter )
+ all_matches.insert( all_matches.end(), lcb_iter->begin(), lcb_iter->end() );
+ set< uint > breakpoints;
+ AaronsLCB( all_matches, breakpoints );
+
+ // do the correct number of LCBs exist?
+ if( lcb_count != lcb_copy.size() )
+ cerr << "lcb_count: " << lcb_count << "\ttmp_lcbs.size(): " << lcb_copy.size() << endl;
+ if( lcb_count != breakpoints.size() )
+ cerr << "lcb_count: " << lcb_count << "\tbreakpoints.size(): " << breakpoints.size() << endl;
+ if( lcb_copy.size() != breakpoints.size() )
+ cerr << "tmp_lcbs.size(): " << lcb_copy.size() << "\tbreakpoints.size(): " << breakpoints.size() << endl;
+ }
+
+
+/**
+ * Version 2 of this algorithm:
+ * each time two LCBs coalesce, repeatedly search their intervening region until
+ * either a single LCB exists or all LCBs meet the current minimum_weight.
+ * @returns The weight of the minimum weight LCB that remains
+ */
+int64 greedyBreakpointElimination( gnSeqI minimum_weight, vector< LCB >& adjacencies, vector< int64 >& weights, ostream* status_out ){
+ // repeatedly remove the low weight LCBs until the minimum weight criteria is satisfied
+ uint lcbI = 0;
+ vector< uint > low_weight;
+ bool have_weight = false;
+ gnSeqI min_weight = 0;
+ gnSeqI prev_min_weight = 0;
+ uint min_lcb = 0;
+ uint lcb_count = adjacencies.size();
+ boolean debug_bp_elimination = false;
+ uint current_lcbI = 0; /**< tracks how many of the LCBs are above the min weight */
+
+ if( adjacencies.size() == 0 )
+ return 0; // nothing can be done
+ uint seq_count = adjacencies[0].left_end.size();
+
+ while( min_weight < minimum_weight ){
+ if( lcb_count == 1 )
+ break; // if only a single LCB remains, don't remove it
+
+ while(true){
+ have_weight = false;
+ min_weight = 0;
+ current_lcbI = 0; // always scan the entire set
+
+ // start with current_lcbI since everything up to it has already been scanned
+ for( lcbI = current_lcbI; lcbI < weights.size(); lcbI++ ){
+ if( adjacencies[ lcbI ].lcb_id != lcbI ){
+ // this lcb has been removed or merged with another lcb
+ continue;
+ }
+ if( weights[ lcbI ] < min_weight || !have_weight ){
+ min_weight = weights[ lcbI ];
+ min_lcb = lcbI;
+ have_weight = true;
+ if( min_weight == prev_min_weight && current_lcbI > 0 )
+ break; // we've already found a minimum
+ // weight LCB, stop here to save some searching
+ }
+ }
+ lcbI = min_lcb;
+ have_weight = false;
+ // if the min weight changed then scan the entire set from the beginning
+ if( prev_min_weight != min_weight ){
+ if( status_out != NULL )
+ *status_out << "There are " << lcb_count << " LCBs with minimum weight " << min_weight << endl;
+
+ current_lcbI = 0;
+ prev_min_weight = min_weight;
+ continue;
+ }
+
+ // save time by skipping LCBs that have already been scanned
+ current_lcbI = min_lcb;
+ break;
+ }
+
+// consistencyCheck( lcb_count, adjacencies, lcb_list, weights );
+ if( min_weight >= minimum_weight )
+ break;
+
+ // actually remove the LCBs now
+ // (only remove a single LCB for now -- it's easier to calculate adjacencies)
+
+ // remove this LCB
+ adjacencies[ lcbI ].lcb_id = -2;
+
+ // update adjacencies
+ uint seqI;
+ uint left_adj;
+ uint right_adj;
+ for( seqI = 0; seqI < seq_count; seqI++ ){
+ left_adj = adjacencies[ lcbI ].left_adjacency[ seqI ];
+ right_adj = adjacencies[ lcbI ].right_adjacency[ seqI ];
+ if( debug_bp_elimination ){
+ if( left_adj == -2 || right_adj == -2 ){
+ cerr << "improper linking\n";
+ }
+ // for debugging, check for consistency:
+ if( left_adj != -1 && adjacencies[ left_adj ].right_adjacency[ seqI ] != lcbI )
+ cerr << "Mutiny on the bounty!\n";
+ // for debugging, check for consistency
+ if( right_adj == adjacencies.size() )
+ cerr << "Horrible Error -399a\n";
+ if( right_adj != -1 && adjacencies[ right_adj ].left_adjacency[ seqI ] != lcbI )
+ cerr << "Mutiny on the bounty!\n";
+ }
+ if( left_adj != -1 )
+ adjacencies[ left_adj ].right_adjacency[ seqI ] = right_adj;
+ if( right_adj != -1 && right_adj != adjacencies.size() )
+ adjacencies[ right_adj ].left_adjacency[ seqI ] = left_adj;
+
+ }
+ // just deleted an lcb, drop the lcb count
+ lcb_count--;
+
+ // check for collapse
+ for( seqI = 0; seqI < seq_count; seqI++ ){
+ left_adj = adjacencies[ lcbI ].left_adjacency[ seqI ];
+ right_adj = adjacencies[ lcbI ].right_adjacency[ seqI ];
+ if( left_adj == -1 || right_adj == -1 )
+ continue; // can't collapse with a non-existant LCB!
+
+ if( debug_bp_elimination ){
+ if( right_adj == adjacencies.size() )
+ cerr << "Horrible Error -399a\n";
+ // check whether this LCB has already been merged
+ if( left_adj != adjacencies[ left_adj ].lcb_id ||
+ right_adj != adjacencies[ right_adj ].lcb_id ){
+ // because adjacency pointers are always updated to point to the
+ // representative entry of an LCB, the lcb_id and the array index
+ // should always be identical
+ cerr << "improper linking\n";
+ continue;
+ }
+ if( left_adj == -2 || right_adj == -2 ){
+ cerr << "improper linking\n";
+ }
+ }
+
+ // check whether the two LCBs are adjacent in each sequence
+ boolean orientation = adjacencies[ left_adj ].left_end[ seqI ] > 0 ? true : false;
+ uint seqJ;
+ for( seqJ = 0; seqJ < seq_count; seqJ++ ){
+ boolean j_orientation = adjacencies[ left_adj ].left_end[ seqJ ] > 0;
+ if( j_orientation == orientation &&
+ adjacencies[ left_adj ].right_adjacency[ seqJ ] != right_adj )
+ break;
+ if( j_orientation != orientation &&
+ adjacencies[ left_adj ].left_adjacency[ seqJ ] != right_adj )
+ break;
+ // check that they are both in the same orientation
+ if( adjacencies[ right_adj ].left_end[ seqJ ] > 0 != j_orientation )
+ break;
+ }
+
+ if( seqJ != seq_count )
+ continue;
+
+
+ // these two can be collapsed
+ // do it. do it now.
+ adjacencies[ right_adj ].lcb_id = left_adj;
+ if( adjacencies[ right_adj ].lcb_id == -1 ||
+ adjacencies[ right_adj ].lcb_id == -2 )
+ cerr << "Trouble in the eleventh circle\n";
+ weights[ left_adj ] += weights[ right_adj ];
+ // unlink right_adj from the adjacency list and
+ // update left and right ends of left_adj
+ for( seqJ = 0; seqJ < seq_count; seqJ++ ){
+ boolean j_orientation = adjacencies[ left_adj ].left_end[ seqJ ] > 0;
+ uint rr_adj = adjacencies[ right_adj ].right_adjacency[ seqJ ];
+ uint rl_adj = adjacencies[ right_adj ].left_adjacency[ seqJ ];
+ if( j_orientation == orientation ){
+ adjacencies[ left_adj ].right_end[ seqJ ] = adjacencies[ right_adj ].right_end[ seqJ ];
+ adjacencies[ left_adj ].right_adjacency[ seqJ ] = rr_adj;
+ if( rr_adj == adjacencies.size() )
+ cerr << "Horrible Error -399a\n";
+ if( rr_adj != -1 )
+ adjacencies[ rr_adj ].left_adjacency[ seqJ ] = left_adj;
+ }else{
+ adjacencies[ left_adj ].left_end[ seqJ ] = adjacencies[ right_adj ].left_end[ seqJ ];
+ adjacencies[ left_adj ].left_adjacency[ seqJ ] = rl_adj;
+ if( rl_adj == adjacencies.size() )
+ cerr << "Horrible Error -399a\n";
+ if( rl_adj != -1 )
+ adjacencies[ rl_adj ].right_adjacency[ seqJ ] = left_adj;
+ }
+ // update lcbI's adjacency links to point nowhere
+ if( adjacencies[ lcbI ].left_adjacency[ seqJ ] == right_adj )
+ adjacencies[ lcbI ].left_adjacency[ seqJ ] = left_adj;
+ if( adjacencies[ lcbI ].right_adjacency[ seqJ ] == right_adj )
+ adjacencies[ lcbI ].right_adjacency[ seqJ ] = left_adj;
+
+
+ }
+ // just collapsed an lcb, decrement
+ lcb_count--;
+ }
+ }
+ return min_weight;
+}
+
+ // Ordering functor for whole LCBs: two MatchLists are compared by handing their
+ // first matches to a SingleStartComparator constructed with argument 0.
+ // Both lists must be non-empty, since front() is taken without a check.
+ class LCBLeftEndComp
+ {
+ public:
+ LCBLeftEndComp()
+ : ssc( 0 )
+ {}
+
+ // true when the first match of a sorts before the first match of b
+ bool operator()( const MatchList& a, const MatchList& b )
+ {
+ return ssc( a.front(), b.front() );
+ }
+
+ protected:
+ SingleStartComparator<AbstractMatch> ssc;
+ };
+
+/**
+ * Takes a set of filtered LCB adjacencies and an unfiltered set of matches as input
+ * returns a filtered set of matches that reflects the LCBs found
+ */
+void filterMatches( vector< LCB >& adjacencies, vector< MatchList >& lcb_list, vector< int64 >& weights ){
+ if( lcb_list.size() < 1 )
+ return;
+ MatchList lcb_tmp = lcb_list[ 0 ];
+ lcb_tmp.clear();
+ vector< MatchList > filtered_lcbs = vector< MatchList >( lcb_list.size(), lcb_tmp );
+ uint lcbI;
+ for( lcbI = 0; lcbI < adjacencies.size(); lcbI++ ){
+ if( adjacencies[ lcbI ].lcb_id == lcbI ){
+ filtered_lcbs[ lcbI ].insert( filtered_lcbs[ lcbI ].end(), lcb_list[ lcbI ].begin(), lcb_list[ lcbI ].end() );
+ continue;
+ }
+ if( adjacencies[ lcbI ].lcb_id == -1 ){
+ cerr << "weird";
+ continue; // this one was removed
+ }
+ if( adjacencies[ lcbI ].lcb_id == -2 )
+ continue; // this one was removed
+
+ // this one points elsewhere
+ // search and update the union/find structure for the target
+ stack< uint > visited_lcbs;
+ visited_lcbs.push( lcbI );
+ uint cur_lcb = adjacencies[ lcbI ].lcb_id;
+ while( adjacencies[ cur_lcb ].lcb_id != cur_lcb ){
+ visited_lcbs.push( cur_lcb );
+ cur_lcb = adjacencies[ cur_lcb ].lcb_id;
+ if( cur_lcb == -1 || cur_lcb == -2 ){
+// cerr << "improper hoodidge\n";
+ break; // this one points to an LCB that got deleted
+ }
+ }
+ while( visited_lcbs.size() > 0 ){
+ adjacencies[ visited_lcbs.top() ].lcb_id = cur_lcb;
+ visited_lcbs.pop();
+ }
+ // add this LCB's matches to the target LCB.
+ if( cur_lcb != -1 && cur_lcb != -2 )
+ filtered_lcbs[ cur_lcb ].insert( filtered_lcbs[ cur_lcb ].end(), lcb_list[ lcbI ].begin(), lcb_list[ lcbI ].end() );
+ }
+
+
+ lcb_list.clear();
+ vector< int64 > new_weights;
+ for( lcbI = 0; lcbI < filtered_lcbs.size(); lcbI++ ){
+ if( filtered_lcbs[ lcbI ].size() > 0 ){
+ lcb_list.push_back( filtered_lcbs[ lcbI ] );
+ uint64 wt = 0;
+ GetLCBCoverage( filtered_lcbs[lcbI], wt );
+ new_weights.push_back( wt );
+// if( new_weights[ new_weights.size() - 1 ] != weights[ lcbI ] ){
+// cerr << "Error: Have you lost weight Susan? difference: " << new_weights[ new_weights.size() - 1 ] - weights[ lcbI ] << "\n";
+// }
+ }
+ }
+
+ // sort the matches inside consolidated LCBs
+ MatchStartComparator<AbstractMatch> msc( 0 );
+ for( lcbI = 0; lcbI < lcb_list.size(); lcbI++ ){
+ sort( lcb_list[ lcbI ].begin(), lcb_list[ lcbI ].end(), msc );
+ }
+
+ // sort the LCBs themselves
+ LCBLeftEndComp llec;
+ std::sort( lcb_list.begin(), lcb_list.end(), llec );
+
+ // calculate the LCB adjacencies
+ weights = new_weights;
+ computeLCBAdjacencies_v2( lcb_list, weights, adjacencies );
+
+}
+
+ // Writes the LCB order of every genome to out_filename, one line per sequence.
+ // For each sequence the first LCB without a left neighbour (left_adjacency == -1) is
+ // located and the right_adjacency links are followed from there; lcb ids are written
+ // tab-separated, prefixed with "-" when the LCB's left_end in that sequence is negative.
+ // Prints a message on cerr and returns if the file cannot be opened.
+ void Aligner::WritePermutation( vector< LCB >& adjacencies, std::string out_filename )
+ {
+ ofstream permutation_out( out_filename.c_str() );
+ if( !permutation_out.is_open() )
+ {
+ cerr << "Error opening " << out_filename << endl;
+ return;
+ }
+ for( int seqI = 0; seqI < seq_count; seqI++ )
+ {
+ // find the left-most LCB in this genome
+ int left_lcb = 0;
+ while( left_lcb < adjacencies.size() )
+ {
+ uint left_adj = adjacencies[left_lcb].left_adjacency[seqI];
+ if( left_adj == -1 )
+ break;
+ left_lcb++;
+ }
+ // write out lcb id's in order
+ uint lcbI = left_lcb;
+ while( lcbI < adjacencies.size() )
+ {
+ // separator before every entry except the first one
+ if( lcbI != left_lcb )
+ permutation_out << '\t';
+ // a negative left end is written as a leading minus sign
+ if( adjacencies[lcbI].left_end[seqI] < 0 )
+ permutation_out << "-";
+ permutation_out << adjacencies[lcbI].lcb_id;
+ // step to the right neighbour; an out-of-range index ends the walk
+ lcbI = adjacencies[lcbI].right_adjacency[seqI];
+ }
+ permutation_out << endl;
+ }
+ }
+
+ // Writes the left and right end coordinates of every interval in perm_iv_list to
+ // out_filename as a tab-separated table: a "#"-prefixed header line naming two columns
+ // per sequence, then one line per interval. Coordinates of an interval that is in
+ // reverse orientation in a sequence are prefixed with '-'.
+ // Prints a message on cerr and returns if the file cannot be opened.
+ void WritePermutationCoordinates( IntervalList& perm_iv_list, std::string out_filename )
+ {
+ ofstream perm_out( out_filename.c_str() );
+ if( !perm_out.is_open() )
+ {
+ cerr << "Error opening \"" << out_filename << "\"\n";
+ return;
+ }
+
+ const size_t n_seqs = perm_iv_list.seq_table.size();
+
+ // header line
+ perm_out << "#";
+ for( size_t colI = 0; colI < n_seqs; ++colI )
+ {
+ if( colI > 0 )
+ perm_out << '\t';
+ perm_out << "seq" << colI << "_leftend\tseq" << colI << "_rightend";
+ }
+ perm_out << endl;
+
+ // one row per interval
+ for( size_t rowI = 0; rowI < perm_iv_list.size(); ++rowI )
+ {
+ for( size_t colI = 0; colI < n_seqs; ++colI )
+ {
+ if( colI > 0 )
+ perm_out << '\t';
+ const bool is_reverse = perm_iv_list[rowI].Orientation(colI) == AbstractMatch::reverse;
+ if( is_reverse )
+ perm_out << '-';
+ perm_out << perm_iv_list[rowI].LeftEnd(colI) << '\t';
+ if( is_reverse )
+ perm_out << '-';
+ perm_out << perm_iv_list[rowI].RightEnd(colI);
+ }
+ perm_out << endl;
+ }
+ }
+
+ // Iteratively refines the set of matches in mlist into LCBs (steps 4-7 of the
+ // alignment procedure): the matches are partitioned into LCBs, optionally extended by
+ // searching the gaps between LCBs (only when entire_genome and extend_lcbs are set),
+ // searched internally once recursion has been enabled, and then low-weight LCBs are
+ // removed by greedy breakpoint elimination. The cycle repeats until the total LCB
+ // weight no longer changes.
+ // mlist is rewritten to hold the surviving matches, LCB_list receives one MatchList
+ // per LCB, and progress messages go to status_out when it is non-NULL.
+ // When permutation_weight != -1 a permutation file and a ".lcbs" coordinate file are
+ // written for each weight threshold that is tried.
+ // Returns immediately, leaving LCB_list untouched, when mlist is empty.
+ void Aligner::RecursiveAnchorSearch( MatchList& mlist, gnSeqI minimum_weight, vector< MatchList >& LCB_list, boolean entire_genome, ostream* status_out ){
+
+ //
+ // Step 4) Identify regions of collinearity (LCBs) among the remaining n-way multi-MUMs
+ //
+ uint lcbI;
+ set<uint> breakpoints;
+ vector< int64 > weights;
+ vector< LCB > adjacencies;
+ MatchList new_matches;
+ new_matches.seq_table = mlist.seq_table;
+ new_matches.seq_filename = mlist.seq_filename;
+
+ if( mlist.size() == 0 )
+ return;
+
+ AaronsLCB( mlist, breakpoints );
+ if( status_out )
+ *status_out << "The " << mlist.size() << " matches constitute " << breakpoints.size() << " breakpoints\n";
+ // organize the LCBs into different MatchList instances (inside of LCB_list)
+ ComputeLCBs( mlist, breakpoints, LCB_list, weights );
+ uint weightI;
+ for( weightI = 0; weightI < weights.size(); weightI++ )
+ if( weights[weightI] < cur_min_coverage || cur_min_coverage == -1 )
+ cur_min_coverage = weights[weightI];
+
+ computeLCBAdjacencies_v2( LCB_list, weights, adjacencies );
+
+ int cur_extension_round = 0;
+ int64 total_weight = 0;
+ int64 prev_total_weight = 0;
+ weightI = 0;
+ vector< vector< int64 > > prev_iv_regions;
+ do {
+
+ // for( ; weightI < weights.size(); weightI++ )
+ // total_weight += weights[ weightI ];
+
+ int64 extension_weight = total_weight;
+ int64 prev_extension_weight = total_weight;
+
+ // only search outside existing LCBs on the whole-genome scale to save time
+ // (total_weight is still 0 in the first pass, so extension starts with the second)
+ if( entire_genome && extend_lcbs && total_weight != 0 &&
+ cur_extension_round < this->max_extension_iters )
+ {
+ cur_extension_round++;
+ if( status_out )
+ *status_out << "Performing LCB extension\n";
+ vector< vector< int64 > > cur_iv_regions;
+ CreateGapSearchList( adjacencies, new_matches.seq_table, cur_iv_regions, entire_genome );
+ // only do the search if there's something new to search
+ if( prev_iv_regions != cur_iv_regions )
+ {
+ int local_round = 0;
+ do {
+ local_round++;
+ // search the gaps between the LCBs to extend the ends of LCBs
+ new_matches.clear();
+ vector< vector< int64 > > new_iv_regions;
+ CreateGapSearchList( adjacencies, new_matches.seq_table, new_iv_regions, entire_genome );
+ SearchLCBGaps( new_matches, new_iv_regions, nway_mh );
+ mlist.insert( mlist.end(), new_matches.begin(), new_matches.end() );
+
+ AaronsLCB( mlist, breakpoints );
+ ComputeLCBs( mlist, breakpoints, LCB_list, weights );
+ cur_min_coverage = *(std::min_element(weights.begin(), weights.end()));
+ computeLCBAdjacencies_v2( LCB_list, weights, adjacencies );
+
+ // calculate the new total LCB weight
+ prev_extension_weight = extension_weight;
+ extension_weight = 0;
+ for( weightI = 0; weightI < weights.size(); weightI++ )
+ extension_weight += weights[ weightI ];
+ if( status_out )
+ *status_out << "Previous weight: " << prev_extension_weight << " new weight: " << extension_weight << endl;
+ if( prev_extension_weight > extension_weight ){
+ cerr << "Error! Previous weight: " << prev_extension_weight << " new weight: " << extension_weight << endl;
+ }
+ }while( extension_weight > prev_extension_weight && local_round < this->max_extension_iters);
+ }
+ swap( prev_iv_regions, cur_iv_regions );
+ }
+
+ // now search within LCBs
+ if( currently_recursing && total_weight != 0 ){
+ vector< search_cache_t > new_cache;
+ for( lcbI = 0; lcbI < LCB_list.size(); lcbI++ ){
+ // if( status_out )
+ // *status_out << "Searching in LCB: " << lcbI << endl;
+ int prev_size = LCB_list[ lcbI ].size();
+ // an LCB is leftmost / rightmost when it has no neighbour on that side
+ // in any sequence
+ bool leftmost = true;
+ for( int i = 0; leftmost && i < adjacencies[lcbI].left_adjacency.size(); i++ )
+ if(adjacencies[lcbI].left_adjacency[i] != NO_ADJACENCY)
+ leftmost = false;
+ bool rightmost = true;
+ for( int i = 0; rightmost && i < adjacencies[lcbI].right_adjacency.size(); i++ )
+ if(adjacencies[lcbI].right_adjacency[i] != NO_ADJACENCY)
+ rightmost = false;
+ SearchWithinLCB( LCB_list[ lcbI ], new_cache, leftmost, rightmost );
+ // if( status_out )
+ // *status_out << "Gained " << LCB_list[ lcbI ].size() - prev_size << " matches\n";
+
+ }
+
+ // delete the previous search cache
+ // (after the swap new_cache holds the old entries, search_cache the fresh ones)
+ swap( search_cache, new_cache );
+ for( size_t mI = 0; mI < new_cache.size(); mI++ )
+ {
+ if( new_cache[mI].first != NULL )
+ new_cache[mI].first->Free();
+ if( new_cache[mI].second != NULL )
+ new_cache[mI].second->Free();
+ }
+ new_cache.clear();
+ std::sort( search_cache.begin(), search_cache.end(), cache_comparator );
+ }
+
+ mlist.clear();
+ for( lcbI = 0; lcbI < LCB_list.size(); lcbI++ ){
+ mlist.insert( mlist.end(), LCB_list[ lcbI ].begin(), LCB_list[ lcbI ].end() );
+ }
+
+ if( currently_recursing && total_weight != 0 ){
+ // remove low weight LCBs, while searching coalesced regions
+ AaronsLCB( mlist, breakpoints );
+ ComputeLCBs( mlist, breakpoints, LCB_list, weights );
+ computeLCBAdjacencies_v2( LCB_list, weights, adjacencies );
+ cur_min_coverage = *(std::min_element(weights.begin(), weights.end()));
+ }
+
+
+ // write alist for debugging
+ // ofstream debug_match_list( "debug_match_list.txt" );
+ // mlist.WriteList( debug_match_list );
+ // debug_match_list.close();
+
+ //
+ // Step 6) Use greedy breakpoint elimination to remove low-weight LCBs
+ //
+ int64 cur_perm_weight = permutation_weight != -1 ? permutation_weight : minimum_weight;
+ do{
+ vector<double> m_weights(weights.size());
+ for( size_t wI = 0; wI < weights.size(); wI++ )
+ m_weights[wI] = (double)weights[wI];
+ SimpleBreakpointScorer sbs(adjacencies, cur_perm_weight, this->collinear_genomes);
+ if( status_out )
+ (*status_out) << "Performing greedy breakpoint elimination (this may take some time)\n";
+
+ greedyBreakpointElimination_v4(adjacencies, m_weights, sbs, NULL, false);
+ // cur_min_coverage = greedyBreakpointElimination( cur_perm_weight, adjacencies, weights, status_out );
+ // MatchList deleted_matches;
+ filterMatches( adjacencies, LCB_list, weights );
+ cur_min_coverage = *(std::min_element(weights.begin(), weights.end()));
+
+ mlist.clear();
+ for( lcbI = 0; lcbI < LCB_list.size(); lcbI++ ){
+ mlist.insert( mlist.end(), LCB_list[ lcbI ].begin(), LCB_list[ lcbI ].end() );
+ }
+ if( status_out )
+ *status_out << "Greedy breakpoint elimination leaves " << mlist.size() << " matches constituting " << LCB_list.size() << " LCBs covering at least " << cur_min_coverage << "b.p.\n";
+
+ if( permutation_weight != -1 ){
+ // construct a filename
+ stringstream cur_perm_filename;
+ cur_perm_filename << permutation_filename << "." << cur_perm_weight / seq_count;
+ // output the permutation
+ WritePermutation( adjacencies, cur_perm_filename.str() );
+
+ // also write out condensed interval data for the permutation
+ cur_perm_filename << ".lcbs";
+ IntervalList perm_iv_list;
+ perm_iv_list.seq_filename = mlist.seq_filename;
+ perm_iv_list.seq_table = mlist.seq_table;
+ for( int permI = 0; permI < LCB_list.size(); permI++ ){
+ // an interval spanning only the first and last match of the LCB
+ vector< AbstractMatch* > perm_vector;
+ perm_vector.push_back( LCB_list[permI].front() );
+ if( LCB_list[permI].size() > 1 )
+ perm_vector.push_back( LCB_list[permI].back() );
+ Interval perm_iv(perm_vector.begin(), perm_vector.end());
+ perm_iv_list.push_back(perm_iv);
+ }
+ WritePermutationCoordinates( perm_iv_list, cur_perm_filename.str() );
+
+ // get the current min weight
+ vector< int64 >::iterator min_w = std::min_element( weights.begin(), weights.end() );
+ // increment the current weight
+ cur_perm_weight = *min_w + seq_count;
+ }
+ }while( cur_perm_weight < minimum_weight );
+ // only enable recursive anchor search once we achieve
+ // the desired weight threshold once -- for speed's sake
+ if( recursive && entire_genome ){
+ currently_recursing = true;
+ }
+
+ // calculate the new total LCB weight
+ prev_total_weight = total_weight;
+ total_weight = 0;
+ for( weightI = 0; weightI < weights.size(); weightI++ )
+ total_weight += weights[ weightI ];
+ if( status_out )
+ *status_out << "Previous weight: " << prev_total_weight << " new weight: " << total_weight << endl;
+ // the weight can shrink--this isn't an error condition
+ // if( prev_total_weight > total_weight ){
+ // cerr << "Error! Previous weight: " << prev_total_weight << " new weight: " << total_weight << endl;
+ // write out the lcb lists
+ // }
+
+ //
+ // Step 7) Repeat 4, 5 and 6 until the total weight stabilizes
+ //
+ }while( total_weight != prev_total_weight );
+
+ // delete the search cache
+ for( size_t mI = 0; mI < search_cache.size(); mI++ )
+ {
+ if( search_cache[mI].first != NULL )
+ search_cache[mI].first->Free();
+ if( search_cache[mI].second != NULL )
+ search_cache[mI].second->Free();
+ }
+ // drop the freed entries so that a later call neither looks them up nor frees
+ // them a second time
+ search_cache.clear();
+ }
+
+/**
+ * Note: this algorithm differs from the one reported in the Mauve paper
+ * The modifications should make the Mauve method more sensitive
+ * Given an initial set of multi-MUMs, the alignment is an 8 step process:
+ * 1) Eliminate overlaps among the multi-MUMs
+ * 2) Compute a phylogenetic guide tree using the multi-MUMs
+ * 3) Remove subset multi-MUMs
+ * 4) Identify regions of collinearity (LCBs) among the remaining n-way multi-MUMs
+ * 5) Perform recursive anchor search within and outside LCBs
+ * 5a) search outside until weight stabilizes
+ * 5b) search within LCBs
+ * 6) Use greedy breakpoint elimination to remove low-weight LCBs
+ * 6a) whenever two LCBs coalesce, search the intervening region for multi-MUMs
+ * 7) Repeat 4, 5 and 6 until the total weight stabilizes
+ * 8) Perform gapped alignment on each LCB
+ * When limited area DP and POA are integrated, step 8 will become step 5c
+ *
+ */
+
+/**
+ * Runs the anchored alignment pipeline on mlist and stores the resulting
+ * LCBs in interval_list.
+ *
+ * @param mlist                The input matches; filtered and extended in place
+ * @param interval_list        Receives one Interval per LCB (cleared first), plus
+ *                             unaligned intervals when gapped_alignment is set
+ * @param LCB_minimum_density  Stored in the aligner; not otherwise used in this function
+ * @param LCB_minimum_range    Minimum LCB weight handed to RecursiveAnchorSearch;
+ *                             -1 means the genomes are assumed to be collinear
+ * @param recursive            Enables the recursive anchor search / guide tree
+ * @param extend_lcbs          Stored in the aligner for LCB extension
+ * @param gapped_alignment     When true (together with recursive) each LCB is
+ *                             aligned with AlignLCBInParallel instead of just
+ *                             receiving its matches
+ * @param tree_filename        Guide tree output file; a temporary name is created
+ *                             when empty and a tree is needed
+ */
+void Aligner::align( MatchList& mlist, IntervalList& interval_list, double LCB_minimum_density, double LCB_minimum_range, boolean recursive, boolean extend_lcbs, boolean gapped_alignment, string tree_filename ){
+    seq_count = mlist.seq_table.size();
+    this->LCB_minimum_density = LCB_minimum_density;
+    this->LCB_minimum_range = LCB_minimum_range;
+    this->recursive = recursive;
+    this->currently_recursing = false;
+    this->extend_lcbs = extend_lcbs;
+    this->gapped_alignment = gapped_alignment;
+
+    // use LCB_minimum_range == -1 to indicate that all genomes are
+    // expected to be collinear
+    this->collinear_genomes = LCB_minimum_range == -1;
+    if( collinear_genomes )
+        cout << "\nAssuming collinear genomes...\n";
+
+    // set the nway_mh mask: the low seq_count bits are set.
+    // Shifting a 64-bit value by 64 or more bits is undefined behaviour,
+    // so saturate to an all-ones mask in that case instead of shifting.
+    uint64 nway_mask = ~uint64(0);
+    if( seq_count < 64 ){
+        nway_mask = 1;
+        nway_mask <<= seq_count;
+        nway_mask--;
+    }
+    nway_mh.SetMask( nway_mask );
+
+    cout << "Starting with " << mlist.size() << " MUMs\n";
+
+//
+// Step 1) Eliminate overlaps among the multi-MUMs
+//
+    // Remove linked inclusions
+    EliminateOverlaps( mlist );
+    cout << "Eliminating overlaps yields " << mlist.size() << " MUMs\n";
+
+//
+// Step 2) Compute a phylogenetic guide tree using the multi-MUMs
+//
+
+    // no code path in this function loads a tree, so this stays false
+    bool guide_tree_loaded = false;
+    MuscleInterface& mi = MuscleInterface::getMuscleInterface();
+
+    if( !guide_tree_loaded && (recursive || tree_filename != "") ){
+        // Make a phylogenetic tree for ClustalW
+        interval_list.seq_table = mlist.seq_table;
+        interval_list.seq_filename = mlist.seq_filename;
+        // use the identity matrix method and convert to a distance matrix
+        NumericMatrix< double > distance;
+        DistanceMatrix( mlist, distance );
+        if( tree_filename == "" )
+            tree_filename = CreateTempFileName("guide_tree");
+        mi.CreateTree( distance, tree_filename );
+    }
+
+//
+// Step 3) Remove subset multi-MUMs
+//
+    // Multiplicity Filter...
+    mlist.MultiplicityFilter( seq_count );
+    cout << "Multiplicity filter gives " << mlist.size() << " MUMs\n";
+
+    // nothing left to anchor an alignment on; interval_list is left as-is
+    if( mlist.size() == 0 )
+        return;
+
+//
+// Steps 4 through 7 are contained in RecursiveAnchorSearch
+//
+    vector< MatchList > LCB_list;
+    RecursiveAnchorSearch( mlist, (gnSeqI)LCB_minimum_range, LCB_list, true, &cout );
+
+
+//
+// Step 8) Perform gapped alignment on each LCB using the anchors
+//
+    if( gapped_alignment && recursive )
+        cout << "\nMaking final gapped alignment...\n";
+    interval_list.clear();
+    AlnProgressTracker apt;
+    apt.cur_leftend = 0;
+    apt.prev_progress = 0;
+    apt.total_len = 0;
+    // NOTE(review): size()-1 wraps around for an empty LCB; presumably
+    // RecursiveAnchorSearch never returns empty LCBs -- confirm.
+    for( uint lcbI = 0; lcbI < LCB_list.size(); lcbI++ )
+        apt.total_len += LCB_list[lcbI].size()-1;
+    for( uint lcbI = 0; lcbI < LCB_list.size(); lcbI++ ){
+        Interval new_iv;
+        interval_list.push_back( new_iv );
+        Interval& iv = interval_list.back();
+        if( !gapped_alignment || !recursive ){
+            // anchors only: the interval simply takes the LCB's matches
+            iv.SetMatches( LCB_list[lcbI] );
+        }else{
+//          AlignLCB( LCB_list[ lcbI ], iv );
+            AlignLCBInParallel( collinear_genomes || (LCB_list.size()==1), gal, LCB_list[ lcbI ], iv, apt );
+        }
+    }
+
+    // finally add any unaligned regions to the interval list
+    if( gapped_alignment )
+        addUnalignedIntervals( interval_list );
+}
+
+} // namespace mems
+
diff --git a/libMems/Aligner.h b/libMems/Aligner.h
new file mode 100644
index 0000000..a4e2377
--- /dev/null
+++ b/libMems/Aligner.h
@@ -0,0 +1,307 @@
+/*******************************************************************************
+ * $Id: Aligner.h,v 1.23 2004/04/19 23:10:13 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifndef _Aligner_h_
+#define _Aligner_h_
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libGenome/gnSequence.h"
+#include "libMems/DNAMemorySML.h"
+#include "libMems/GappedAligner.h"
+#include "libMems/MatchList.h"
+#include "libMems/Interval.h"
+#include "libMems/IntervalList.h"
+#include "libMems/MemHash.h"
+#include "libMems/MaskedMemHash.h"
+#include <map>
+#include "libMems/NumericMatrix.h"
+#include "libMems/GreedyBreakpointElimination.h"
+#include <list>
+#include "libMems/LCB.h"
+#include "libMUSCLE/threadstorage.h"
+
+namespace mems {
+
+/**
+ * A mem labeled with a number.
+ * Used by LCB construction algorithm
+ */
+class LabeledMem{
+public:
+ Match* mem; // the match being labeled (plain pointer; this class defines no destructor)
+ uint label; // the number attached to the match
+};
+
+/**
+ * Compares Matches labeled with a number.
+ * Used by LCB construction algorithm
+ */
+class LabeledMemComparator {
+public:
+    /** @param seq  Index of the sequence whose start coordinates are compared */
+    LabeledMemComparator( uint seq ) : m_seq( seq ) {}
+    /**
+     * Copy constructor.  Takes a const reference so that the comparator can
+     * be copied from temporaries and const objects, which is what the
+     * standard algorithms do when a comparator is passed by value.
+     */
+    LabeledMemComparator( const LabeledMemComparator& lmc ) : m_seq( lmc.m_seq ) {}
+    /**
+     * Orders two labeled matches by the magnitude of their start coordinate
+     * in sequence m_seq.
+     * @return true when a sorts before b.  A match with start == NO_MATCH
+     *         sorts before every match that has a defined start; two
+     *         NO_MATCH entries are equivalent.
+     */
+    boolean operator()(const LabeledMem& a, const LabeledMem& b) const{
+        int64 a_start = a.mem->Start( m_seq ), b_start = b.mem->Start( m_seq );
+        if( a_start == NO_MATCH || b_start == NO_MATCH ){
+            // only "a undefined, b defined" counts as a < b
+            if( b_start != NO_MATCH )
+                return true;
+            return false;
+        }
+        // negative starts are compared by absolute value
+        if(a_start < 0)
+            a_start = -a_start;
+//          a_start = -a_start + a.mem->Length();
+        if(b_start < 0)
+            b_start = -b_start;
+//          b_start = -b_start + b.mem->Length();
+        int64 diff = a_start - b_start;
+        return diff < 0;
+    }
+protected:
+    uint m_seq;     /**< index of the sequence used for comparison */
+private:
+    // declared private so that a comparator cannot be built without a sequence index
+    LabeledMemComparator();
+};
+
+/**
+ * A match with an associated list iterator.
+ * Used by LCB construction algorithm
+ */
+class PlacementMatch{
+public:
+ Match* mem; // the match itself (plain pointer; this class defines no destructor)
+ std::list< LabeledMem >::iterator iter; // the list iterator associated with the match
+};
+
+/**
+ * Compares Matches.
+ * Used by LCB construction algorithm
+ */
+class PlacementMatchComparator {
+public:
+    /** @param seq  Index of the sequence whose start coordinates are compared */
+    PlacementMatchComparator( uint seq ) : m_seq( seq ) {}
+    /**
+     * Copy constructor.  Takes a const reference so that the comparator can
+     * be copied from temporaries and const objects, which is what the
+     * standard algorithms do when a comparator is passed by value.
+     */
+    PlacementMatchComparator( const PlacementMatchComparator& lmc ) : m_seq( lmc.m_seq ) {}
+    /**
+     * Orders two placement matches by the magnitude of their start
+     * coordinate in sequence m_seq.
+     * @return true when a sorts before b.  A match with start == NO_MATCH
+     *         sorts before every match that has a defined start; two
+     *         NO_MATCH entries are equivalent.
+     */
+    boolean operator()(const PlacementMatch& a, const PlacementMatch& b) const{
+        int64 a_start = a.mem->Start( m_seq ), b_start = b.mem->Start( m_seq );
+        if( a_start == NO_MATCH || b_start == NO_MATCH ){
+            // only "a undefined, b defined" counts as a < b
+            if( b_start != NO_MATCH )
+                return true;
+            return false;
+        }
+        // negative starts are compared by absolute value
+        if(a_start < 0)
+            a_start = -a_start;
+//          a_start = -a_start + a.mem->Length();
+        if(b_start < 0)
+            b_start = -b_start;
+//          b_start = -b_start + b.mem->Length();
+
+        int64 diff = a_start - b_start;
+        return diff < 0;
+    }
+protected:
+    uint m_seq;     /**< index of the sequence used for comparison */
+private:
+    // declared private so that a comparator cannot be built without a sequence index
+    PlacementMatchComparator();
+};
+
+
+/** a cache type to remember which intervals have already been searched */
+typedef std::pair< mems::Match*, mems::Match* > search_cache_t;
+
+
+/**
+ * Used to find locally colinear blocks (LCBs) and do recursive
+ * alignments on the blocks
+ * To create an alignment one need only use the align method.
+ * LCB lists are typically stored using the IntervalList class. They can be
+ * read and written in interval format using that class. For input and output
+ * of gapped alignments in other formats, see the gnAlignedSequences class.
+ * Other methods in this class are available for experimentation.
+ */
+class Aligner {
+public:
+ /**
+ * Constructs an aligner for the specified number of sequences.
+ * @param seq_count The number of sequences that will be aligned with this Aligner
+ */
+ Aligner( uint seq_count );
+ Aligner( const Aligner& al );
+ Aligner& operator=( const Aligner& al );
+
+ /**
+ * Performs an alignment. Takes a MatchList as input and outputs a list of LCBs as an IntervalList.
+ * Several of the options can be used to filter out unlikely LCBs. If the recursive option is
+ * specified, the regions between matches in each LCB are searched for further homology and a full
+ * gapped alignment is produced.
+ * @param mlist The MatchList to use as input for the alignment process
+ * @param interval_list The IntervalList that is created by the alignment process
+ * @param LCB_minimum_density The minimum density that an LCB may have to be considered a valid block
+ * This should be a number between 0 and 1.
+ * @param LCB_minimum_range A misnomer: really it's the minimum number of matching base pairs an LCB
+ * must contain to be considered an LCB. Coverage is defined as
+ * (length of match) * (# of matching sequences)
+ * A value of -1 indicates that all genomes are expected to be collinear.
+ * @param recursive Option for performing a recursive alignment. If this is set to
+ * true, all regions which have gaps will be searched for exact matches.
+ * @param extend_lcbs If true, attempt to extend the boundaries of LCBs by searching for
+ * additional matches between LCBs
+ * @param gapped_alignment If true, complete a gapped alignment. The gapped aligner is only
+ * run on each LCB when recursive is also true.
+ * @param tree_filename The name of the output file to write the phylogenetic guide tree into. If
+ * an empty string is specified then a temporary file is created.
+ * @throws AlignerError may be thrown if an error occurs
+ */
+ void align( MatchList& mlist, IntervalList& interval_list, double LCB_minimum_density, double LCB_minimum_range, boolean recursive, boolean extend_lcbs, boolean gapped_alignment, std::string tree_filename = "" );
+
+ // NOTE(review): the three methods below are undocumented and their definitions are not in this header.
+ void Recursion( MatchList& r_list, Match* r_begin, Match* r_end, boolean nway_only = false );
+ void GetBestLCB( MatchList& r_list, MatchList& best_lcb );
+ void DoSomethingCool( MatchList& mlist, Interval& iv );
+
+ /**
+ * Set the minimum size of intervening region between two anchor matches that will
+ * be considered for recursive anchor determination. When the gaps between two anchors
+ * are less than this cutoff value the region is handed off to the dynamic programming aligner
+ * e.g. ClustalW
+ */
+ void SetMinRecursionGapLength( gnSeqI min_r_gap );
+
+ /** Sets the maximum number of attempts at LCB extension */
+ void SetMaxExtensionIterations( uint ext_iters ){ this->max_extension_iters = ext_iters; }
+
+ void SearchWithinLCB( MatchList& mlist, std::vector< search_cache_t >& new_cache, bool leftmost = false, bool rightmost = false );
+ /**
+ * Called by align() for steps 4 through 7 of the alignment procedure.
+ * @param mlist The matches to work on
+ * @param minimum_weight The minimum LCB weight (align() passes LCB_minimum_range)
+ * @param LCB_list Receives the resulting LCBs
+ * @param entire_genome align() passes true
+ * @param status_out Optional stream for status messages; NULL disables them
+ */
+ void RecursiveAnchorSearch( MatchList& mlist, gnSeqI minimum_weight, std::vector< MatchList >& LCB_list, boolean entire_genome, std::ostream* status_out = NULL );
+
+ void AlignLCB( MatchList& mlist, Interval& iv );
+ void SetGappedAligner( GappedAligner& gal );
+ /** forwards the request to whatever gapped aligner is being used */
+ void SetMaxGappedAlignmentLength( gnSeqI len );
+
+ /** Set output parameters for permutation matrices */
+ void SetPermutationOutput( std::string& permutation_filename, int64 permutation_weight );
+ void WritePermutation( std::vector< LCB >& adjacencies, std::string out_filename );
+
+ /** Sets the flag that enables recursive anchor search/gapped alignment */
+ void SetRecursive( bool value ){ this->recursive = value; }
+protected:
+ TLS<MemHash> gap_mh; /**< Used during recursive alignment */
+ MaskedMemHash nway_mh; /**< Used during recursive alignment to find nway matches only */
+ uint32 seq_count; /**< The number of sequences this aligner is working with */
+ boolean debug; /**< Flag for debugging output */
+
+ double LCB_minimum_density; /**< Copy of the value most recently passed to align() */
+ double LCB_minimum_range; /**< Copy of the value most recently passed to align() */
+
+ uint max_extension_iters; /**< maximum number of attempts at LCB extension */
+
+ int64 cur_min_coverage; /**< Tracks the minimum weight of the least weight LCB */
+
+ gnSeqI min_recursive_gap_length; /**< Minimum size of gap regions that will be recursed on */
+
+ void consistencyCheck( uint lcb_count, std::vector< LCB >& adjacencies, std::vector< MatchList >& lcb_list, std::vector< int64 >& weights );
+
+ boolean recursive; /**< Set to true if a recursive anchor search/gapped alignment should be performed */
+ boolean extend_lcbs; /**< Set to true if LCB extension should be attempted */
+ boolean gapped_alignment; /**< Set to true to complete a gapped alignment */
+ boolean currently_recursing; /**< True when the recursive search has begun */
+ boolean collinear_genomes; /**< Set to true if all genomes are assumed to be collinear */
+
+ GappedAligner* gal; /**< The gapped aligner in use; see SetGappedAligner() */
+
+ std::string permutation_filename; /**< Base name for permutation output files; see SetPermutationOutput() */
+ int64 permutation_weight; /**< -1 disables permutation output in RecursiveAnchorSearch */
+
+ std::vector< search_cache_t > search_cache; /**< a list of recursive searches that have already been done */
+};
+
+/**
+ * Thrown if some error occurs during alignment
+ */
+CREATE_EXCEPTION( AlignerError );
+
+void transposeMatches( MatchList& mlist, uint seqI, const std::vector< int64 >& seq_regions );
+
+/**
+ * Deletes overlapping regions in a set of matches. Always removes matching base pairs from the
+ * match covering fewer bases. Coverage is defined as (length of match) * (# of matching sequences)
+ */
+void EliminateOverlaps( MatchList& ml );
+
+/**
+ * Function to determine the breakpoints in a set of matches.
+ * Sorts the matches in mlist and returns the indices of breakpoints.
+ * This function attempts (sometimes unsuccessfully) to determine subset LCBs. If a set of
+ * matches containing subset LCBs has been passed to it, the resulting breakpoint set may
+ * be incorrect. You have been warned.
+ * @param mlist A list of matches to search for LCBs.
+ * @param breakpoints The indices of matches in the sorted match list that are at LCB boundaries
+ */
+void AaronsLCB( MatchList& mlist, std::set<uint>& breakpoints );
+
+
+void ComputeLCBs( MatchList& meml, std::set<uint>& breakpoints, std::vector<MatchList>& lcb_list, std::vector<int64>& weights );
+void computeLCBAdjacencies_v2( std::vector<MatchList>& lcb_list, std::vector< int64 >& weights, std::vector< LCB >& adjacencies );
+void computeLCBAdjacencies_v2( IntervalList& iv_list, std::vector< int64 >& weights, std::vector< LCB >& adjacencies );
+void scanLeft( int& left_recurseI, std::vector< LCB >& adjacencies, int min_weight, int seqI );
+void scanRight( int& right_recurseI, std::vector< LCB >& adjacencies, int min_weight, int seqI );
+void GetLCBCoverage( MatchList& lcb, uint64& coverage );
+
+int64 greedyBreakpointElimination( gnSeqI minimum_weight, std::vector< LCB >& adjacencies, std::vector< int64 >& weights, std::ostream* status_out = NULL );
+void filterMatches( std::vector< LCB >& adjacencies, std::vector< MatchList >& lcb_list, std::vector< int64 >& weights );
+
+void CreateGapSearchList( std::vector< LCB >& adjacencies, const std::vector< genome::gnSequence* >& seq_table, std::vector< std::vector< int64 > >& iv_regions, boolean entire_genome );
+void SearchLCBGaps( MatchList& new_matches, const std::vector< std::vector< int64 > >& iv_regions, MaskedMemHash& nway_mh );
+
+static const uint MIN_ANCHOR_LENGTH = 9;
+
+
+/** used for search cache lookups */
+/**
+ * Orders search cache entries lexicographically: first by the first match of
+ * the pair, then by the second.  A NULL match sorts before any non-NULL match
+ * and two NULL matches are equivalent.
+ *
+ * The previous implementation returned true only when BOTH members compared
+ * less, which is not a strict weak ordering (equivalence was not transitive)
+ * and therefore not a valid comparator for std::sort or binary searches.
+ */
+class SearchCacheComparator
+{
+public:
+    SearchCacheComparator() : msc(0){};
+    /** @return true when a sorts strictly before b */
+    bool operator()( const search_cache_t& a, const search_cache_t& b ) const
+    {
+        const int first_order = compareEndpoints( a.first, b.first );
+        if( first_order != 0 )
+            return first_order < 0;
+        // first members are equivalent: the second members decide
+        return compareEndpoints( a.second, b.second ) < 0;
+    }
+protected:
+    /**
+     * Three-way comparison of two possibly-NULL matches.
+     * @return negative when x sorts before y, positive when y sorts before x,
+     *         zero when neither sorts before the other
+     */
+    int compareEndpoints( mems::Match* x, mems::Match* y ) const
+    {
+        if( x == NULL || y == NULL )
+        {
+            if( x == NULL && y == NULL )
+                return 0;
+            return x == NULL ? -1 : 1;
+        }
+        if( msc( x, y ) )
+            return -1;
+        if( msc( y, x ) )
+            return 1;
+        return 0;
+    }
+    mems::MatchStartComparator<mems::Match> msc;
+};
+
+static SearchCacheComparator cache_comparator;
+
+
+}
+
+#endif // _Aligner_h_
diff --git a/libMems/Backbone.cpp b/libMems/Backbone.cpp
new file mode 100644
index 0000000..afa0ed8
--- /dev/null
+++ b/libMems/Backbone.cpp
@@ -0,0 +1,1203 @@
+/*******************************************************************************
+ * $Id: Backbone.cpp,v 1.12 2004/04/19 23:11:19 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * Please see the file called COPYING for licensing, copying, and modification
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/ProgressiveAligner.h"
+#include "libMems/Backbone.h"
+#include "libMems/Islands.h"
+#include "libMems/CompactGappedAlignment.h"
+
+#include <boost/graph/graph_traits.hpp>
+#include <boost/graph/adjacency_list.hpp>
+#include <boost/graph/topological_sort.hpp>
+#include <boost/graph/johnson_all_pairs_shortest.hpp>
+#include <boost/graph/depth_first_search.hpp>
+#include <boost/graph/undirected_dfs.hpp>
+
+using namespace std;
+using namespace genome;
+namespace mems {
+
+
+/**
+ * Collects the sorted list of breakpoint coordinates that the matches in
+ * mvect induce on sequence seq.  Every match defined in seq contributes its
+ * left end and the position just past its right end.
+ * @param mvect    Container of pointers to matches
+ * @param seq      Index of the sequence of interest
+ * @param bp_list  Output; any previous content is discarded
+ */
+template< typename MatchVector >
+void getBpList( MatchVector& mvect, uint seq, vector< gnSeqI >& bp_list )
+{
+    bp_list.clear();
+    size_t matchI = 0;
+    while( matchI < mvect.size() )
+    {
+        // matches that are undefined in seq contribute no breakpoints
+        if( mvect[matchI]->LeftEnd(seq) != NO_MATCH )
+        {
+            bp_list.push_back( mvect[matchI]->LeftEnd(seq) );
+            bp_list.push_back( mvect[matchI]->RightEnd(seq)+1 );
+        }
+        matchI++;
+    }
+    std::sort( bp_list.begin(), bp_list.end() );
+}
+
+/**
+ * Builds an index map between two containers that hold the same elements in
+ * different orders: after the call, map[i] is the position in mv_to of the
+ * element rank-matched to mv_from[i] (both containers are ranked by element
+ * value, so identical element sets pair each element with itself).
+ * NOTE(review): mv_to is indexed with positions taken from mv_from, so it
+ * presumably must be at least as long as mv_from -- confirm at the call sites.
+ * @param mv_from  Source ordering
+ * @param mv_to    Target ordering
+ * @param map      Output; resized to mv_from.size()
+ */
+template< typename MatchVector >
+void createMap( const MatchVector& mv_from, const MatchVector& mv_to, std::vector< size_t >& map )
+{
+    typedef typename MatchVector::value_type MatchPtr;
+    typedef std::pair< MatchPtr, size_t > IndexedMatch;
+
+    // pair every element with its original position...
+    std::vector< IndexedMatch > from_ranked;
+    from_ranked.reserve( mv_from.size() );
+    for( size_t fromI = 0; fromI < mv_from.size(); ++fromI )
+        from_ranked.push_back( IndexedMatch( mv_from[fromI], fromI ) );
+
+    std::vector< IndexedMatch > to_ranked;
+    to_ranked.reserve( mv_to.size() );
+    for( size_t toI = 0; toI < mv_to.size(); ++toI )
+        to_ranked.push_back( IndexedMatch( mv_to[toI], toI ) );
+
+    // ...then rank both lists by element value
+    std::sort( from_ranked.begin(), from_ranked.end() );
+    std::sort( to_ranked.begin(), to_ranked.end() );
+
+    // equal ranks identify corresponding elements
+    map.resize( from_ranked.size() );
+    for( size_t rankI = 0; rankI < from_ranked.size(); ++rankI )
+    {
+        const size_t from_pos = from_ranked[rankI].second;
+        map[from_pos] = to_ranked[rankI].second;
+    }
+}
+
+typedef pair< size_t, Interval* > iv_tracker_t;
+// Comparator for iv_tracker_t entries (index, Interval*) that orders them by
+// the Interval only; the index part of the pair is ignored.
+class IvTrackerComp
+{
+public:
+ // seq is forwarded to the underlying SingleStartComparator
+ IvTrackerComp( uint seq ) : ssc( seq ) {}
+ // returns whatever ssc returns for the two Interval pointers
+ bool operator()( const iv_tracker_t& a, const iv_tracker_t& b )
+ {
+ return ssc(a.second, b.second);
+ }
+private:
+ SingleStartComparator<Interval> ssc; // does the actual comparison
+};
+
+const int LEFT_NEIGHBOR = -1;
+const int RIGHT_NEIGHBOR = 1;
+typedef vector< size_t > neighbor_t;
+
+/**
+ * Selects one of the two per-sequence neighbor lists of an entry.
+ * @param entry      The pair of neighbor lists
+ * @param direction  RIGHT_NEIGHBOR selects entry.first; any other value
+ *                   (e.g. LEFT_NEIGHBOR) selects entry.second
+ * @return a reference to the selected list
+ */
+neighbor_t& getNeighbor( pair< neighbor_t, neighbor_t >& entry, int direction )
+{
+    return direction == RIGHT_NEIGHBOR ? entry.first : entry.second;
+}
+
+
+// Merges intervals of iv_list that are neighbors of each other in every
+// sequence in which they are defined (with a consistent relative orientation),
+// then replaces iv_list with the merged intervals and calls
+// addUnalignedRegions() on the result.
+// Intervals with Multiplicity() < 2 are dropped before merging.
+void collapseCollinear( IntervalList& iv_list )
+{
+ if( iv_list.size() == 0 )
+ return; // nothing to see here, move along...
+ const size_t seq_count = iv_list.seq_table.size();
+ // iv_ptrs: pointers to the intervals that take part in merging
+ std::vector< Interval* > iv_ptrs(iv_list.size());
+ size_t lilI = 0;
+ for( size_t i = 0; i < iv_list.size(); ++i )
+ {
+ // ignore unaligned regions
+ if( iv_list[i].Multiplicity() < 2 )
+ continue;
+ iv_ptrs[lilI++] = &iv_list[i];
+ }
+ iv_ptrs.resize(lilI);
+ // neighbor_list[i] holds, per sequence, the index (into iv_ptrs) of the
+ // interval preceding (.first) and following (.second) interval i in that
+ // sequence's sort order; NEIGHBOR_UNKNOWN marks "none"
+ const size_t NEIGHBOR_UNKNOWN = (std::numeric_limits<size_t>::max)();
+ neighbor_t lefties_tmp( seq_count, NEIGHBOR_UNKNOWN );
+ pair< neighbor_t, neighbor_t > neighbor_pair( lefties_tmp, lefties_tmp );
+ vector< pair< neighbor_t, neighbor_t > > neighbor_list( iv_ptrs.size(), neighbor_pair );
+ // iv_tracker remembers each interval's index in iv_ptrs while it is re-sorted per sequence
+ vector< iv_tracker_t > iv_tracker( iv_ptrs.size() );
+ for( size_t i = 0; i < iv_ptrs.size(); ++i )
+ {
+ iv_tracker[i] = make_pair( i, iv_ptrs[i] );
+ }
+ for( size_t seqI = 0; seqI < seq_count; ++seqI )
+ {
+ IvTrackerComp ivc( seqI );
+ sort( iv_tracker.begin(), iv_tracker.end(), ivc );
+ size_t prev_i = NEIGHBOR_UNKNOWN;
+ size_t cur_i = NEIGHBOR_UNKNOWN;
+ // walk the intervals defined in seqI in sorted order; the neighbors of
+ // cur_i are recorded once its successor is known
+ for( size_t i = 0; i < iv_tracker.size(); ++i )
+ {
+ if( iv_tracker[i].second->LeftEnd(seqI) == NO_MATCH )
+ continue;
+ if( cur_i != NEIGHBOR_UNKNOWN )
+ {
+ neighbor_list[cur_i].first[seqI] = prev_i;
+ neighbor_list[cur_i].second[seqI] = iv_tracker[i].first;
+ }
+ prev_i = cur_i;
+ cur_i = iv_tracker[i].first;
+ }
+ // get the last one
+ if( cur_i != NEIGHBOR_UNKNOWN )
+ {
+ neighbor_list[cur_i].first[seqI] = prev_i;
+ neighbor_list[cur_i].second[seqI] = NEIGHBOR_UNKNOWN;
+ }
+ }
+
+ // now look for neighbor pair entries which can be merged
+ for( int d = -1; d < 2; d+= 2 ) // iterate over both directions
+ {
+ size_t unknown_count = 0;
+ for( size_t nI = 0; nI < neighbor_list.size(); ++nI )
+ {
+ // nayb: the single candidate neighbor shared by all sequences of nI
+ // parity: whether nI and nayb have the same orientation in the first sequence examined
+ // ct: number of defined sequences for which a neighbor was found
+ size_t nayb = NEIGHBOR_UNKNOWN;
+ size_t seqI = 0;
+ bool parity = false;
+ size_t ct = 0;
+ for( ; seqI < seq_count; ++seqI )
+ {
+ if( iv_ptrs[nI]->Orientation(seqI) == AbstractMatch::undefined )
+ continue;
+ int orient = iv_ptrs[nI]->Orientation(seqI) == AbstractMatch::forward ? 1 : -1;
+
+ if( nayb == NEIGHBOR_UNKNOWN )
+ {
+ nayb = getNeighbor( neighbor_list[nI], d * orient * -1 )[seqI];
+ if( nayb != NEIGHBOR_UNKNOWN )
+ parity = iv_ptrs[nI]->Orientation(seqI) == iv_ptrs[nayb]->Orientation(seqI);
+ }
+ // leaving the loop early (seqI < seq_count) means this sequence disagrees
+ else if( nayb != getNeighbor( neighbor_list[nI], d * orient * -1 )[seqI] )
+ break;
+ else if( parity != (iv_ptrs[nI]->Orientation(seqI) == iv_ptrs[nayb]->Orientation(seqI)) )
+ break;
+ if( nayb != NEIGHBOR_UNKNOWN )
+ ct++;
+ }
+ if( seqI < seq_count || ct < iv_ptrs[nI]->Multiplicity() )
+ continue; // not collinear
+ if( nayb == NEIGHBOR_UNKNOWN )
+ continue;
+
+ // merge nI and nayb
+ // coordinates and orientation are read before StealMatches empties the intervals
+ uint fs = iv_ptrs[nI]->FirstStart();
+ gnSeqI nI_lend_fs = iv_ptrs[nI]->LeftEnd(fs);
+ gnSeqI nayb_lend_fs = iv_ptrs[nayb]->LeftEnd(fs);
+ AbstractMatch::orientation o = iv_ptrs[nI]->Orientation(fs);
+ vector< AbstractMatch* > nI_matches;
+ iv_ptrs[nI]->StealMatches( nI_matches );
+ vector< AbstractMatch* > nayb_matches;
+ iv_ptrs[nayb]->StealMatches( nayb_matches );
+ if( !parity )
+ {
+ // opposite orientations: reverse and invert nI's matches so they agree with nayb
+ std::reverse( nI_matches.begin(), nI_matches.end() );
+ for( size_t i = 0; i < nI_matches.size(); ++i )
+ nI_matches[i]->Invert();
+ o = o == AbstractMatch::forward ? AbstractMatch::reverse : AbstractMatch::forward;
+ }
+ // append or prepend nI's matches depending on which side of nayb they lie in sequence fs
+ if( (o == AbstractMatch::forward && nI_lend_fs > nayb_lend_fs) ||
+ (o == AbstractMatch::reverse && nI_lend_fs < nayb_lend_fs))
+ nayb_matches.insert( nayb_matches.end(), nI_matches.begin(), nI_matches.end() );
+ else
+ nayb_matches.insert( nayb_matches.begin(), nI_matches.begin(), nI_matches.end() );
+
+ iv_ptrs[nayb]->SetMatches( nayb_matches );
+
+ // update all pointers to point to nayb
+ seqI = 0;
+ for( ; seqI < seq_count; ++seqI )
+ {
+ if( getNeighbor( neighbor_list[nI], -1 )[seqI] == NEIGHBOR_UNKNOWN &&
+ getNeighbor( neighbor_list[nI], 1 )[seqI] == NEIGHBOR_UNKNOWN )
+ continue;
+ int orient = iv_ptrs[nayb]->Orientation(seqI) == AbstractMatch::forward ? 1 : -1;
+ // other_nayb: nI's neighbor on the side facing away from nayb
+ size_t other_nayb = getNeighbor( neighbor_list[nI], d * orient * (parity ? 1 : -1) )[seqI];
+ if( other_nayb != NEIGHBOR_UNKNOWN )
+ {
+ if( getNeighbor( neighbor_list[other_nayb], 1 )[seqI] == nI )
+ getNeighbor( neighbor_list[other_nayb], 1 )[seqI] = nayb;
+ else if( getNeighbor( neighbor_list[other_nayb], -1 )[seqI] == nI )
+ getNeighbor( neighbor_list[other_nayb], -1 )[seqI] = nayb;
+ else
+ {
+ cerr << "serious programmer error\n";
+ genome::breakHere();
+ }
+ }
+ if( getNeighbor( neighbor_list[nayb], 1 )[seqI] == nI )
+ getNeighbor( neighbor_list[nayb], 1 )[seqI] = other_nayb;
+ else if( getNeighbor( neighbor_list[nayb], -1 )[seqI] == nI )
+ getNeighbor( neighbor_list[nayb], -1 )[seqI] = other_nayb;
+ else
+ {
+ cerr << "inexcusable programmer error\n";
+ genome::breakHere();
+ }
+ // nI no longer takes part in the neighbor graph
+ neighbor_list[nI].first[seqI] = NEIGHBOR_UNKNOWN;
+ neighbor_list[nI].second[seqI] = NEIGHBOR_UNKNOWN;
+ }
+ }
+ }
+
+ // rebuild the interval list; intervals whose matches were merged away are empty and get dropped
+ IntervalList new_list;
+ new_list.seq_filename = iv_list.seq_filename;
+ new_list.seq_table = iv_list.seq_table;
+ new_list.resize( iv_ptrs.size() );
+ size_t newI = 0;
+ for( size_t ivI = 0; ivI < iv_ptrs.size(); ++ivI )
+ {
+ vector< AbstractMatch* > matches;
+ iv_ptrs[ivI]->StealMatches( matches );
+ if( matches.size() > 0 )
+ new_list[newI++].SetMatches( matches );
+ }
+ new_list.resize(newI);
+ swap( iv_list, new_list );
+ addUnalignedRegions(iv_list);
+}
+
+
+/**
+ * Debugging aid: reports on stderr every alignment column of every interval
+ * that consists of gap characters ('-') only.
+ * @param iv_list  The intervals to check; their alignments are obtained with
+ *                 mems::GetAlignment
+ */
+void checkForAllGapColumns( IntervalList& iv_list )
+{
+    // debug: sanity check whether there are all gap columns
+    for( size_t ivI = 0; ivI < iv_list.size(); ivI++ )
+    {
+        vector< string > aln;
+        mems::GetAlignment( iv_list[ivI], iv_list.seq_table, aln );
+        // an alignment without rows has no columns to inspect; reading
+        // aln[0] here would be out of bounds
+        if( aln.empty() )
+            continue;
+        for( size_t colI = 0; colI < aln[0].size(); ++colI )
+        {
+            // find the first row that holds a non-gap character in this column
+            size_t rowI = 0;
+            for( ; rowI < aln.size(); ++rowI )
+                if( aln[rowI][colI] != '-' )
+                    break;
+            if( rowI == aln.size() )
+            {
+                cerr << "ERROR! IV " << ivI << " COLUMN " << colI << " IS ALL GAPS!\n";
+            }
+        }
+    }
+}
+
+
+
+/**
+ * Copies the column ranges of every HSS in hss_array into hss_cols.
+ * Only sequence pairs (i, j) with i < j are filled in.
+ * @param hss_array  Input, indexed [seq][seq][interval]; each cell is a list
+ *                   of HSS records with left_col / right_col members
+ * @param hss_cols   Output, resized to the same extents; each cell receives
+ *                   one (left_col, right_col) pair per HSS
+ */
+void translateToPairwiseGenomeHSS( const hss_array_t& hss_array, pairwise_genome_hss_t& hss_cols )
+{
+    uint genome_total = hss_array.shape()[0];
+    uint interval_total = hss_array.shape()[2];
+    hss_cols.resize( boost::extents[genome_total][genome_total][interval_total] );
+
+    // make pairwise projections of intervals and find LCBs...
+    for( size_t gI = 0; gI < genome_total; ++gI )
+        for( size_t gJ = gI+1; gJ < genome_total; ++gJ )
+            for( size_t lcbI = 0; lcbI < interval_total; ++lcbI )
+            {
+                const hss_list_t& source = hss_array[gI][gJ][lcbI];
+                vector< pair< size_t, size_t > >& target = hss_cols[gI][gJ][lcbI];
+                target.resize( source.size() );
+                size_t k = 0;
+                while( k < source.size() )
+                {
+                    target[k].first = source[k].left_col;
+                    target[k].second = source[k].right_col;
+                    k++;
+                }
+            }
+}
+
+
+/**
+ * Computes the fraction of characters, over all sequences in seq_table, that
+ * SortedMerList::BasicDNATable() maps to code 1 or 2 (presumably C and G --
+ * confirm against the table definition).
+ * @param seq_table  The sequences to scan
+ * @return (count[1]+count[2]) / (count[0]+count[1]+count[2]+count[3]);
+ *         the division is 0/0 when the sequences contain no characters
+ */
+double computeGC( std::vector< gnSequence* >& seq_table )
+{
+    const uint8* tab = SortedMerList::BasicDNATable();
+    // NOTE(review): assumes the table maps every byte value into 0..3 -- confirm
+    size_t counts[4] = { 0, 0, 0, 0 };
+    for( size_t seqI = 0; seqI < seq_table.size(); seqI++ )
+    {
+        std::string seq;
+        seq_table[seqI]->ToString( seq );
+        for( size_t cI = 0; cI < seq.size(); cI++ )
+        {
+            // plain char may be signed: go through unsigned char so that
+            // bytes >= 0x80 cannot produce a negative table index
+            const unsigned char base = static_cast< unsigned char >( seq[cI] );
+            counts[ tab[ base ] ]++;
+        }
+    }
+    return double(counts[1]+counts[2]) / double(counts[1]+counts[2] + counts[0]+counts[3]);
+}
+
+
+/**
+ * For every pair of sequences (seqI < seqJ), projects iv_list onto the pair,
+ * runs the HSS detector on the projected alignments and records, per original
+ * interval, the alignment column ranges covered by each detected HSS.
+ *
+ * @param iv_list       The multi-sequence intervals to project
+ * @param iv_ptrs       The same intervals as compact alignments; re-sorted by
+ *                      this function for each seqI
+ * @param iv_orig_ptrs  The intervals in their original order, used to map a
+ *                      sorted position back to the original interval index
+ * @param hss_cols      Output, indexed [seqI][seqJ][original interval]; column
+ *                      ranges are appended to each cell
+ * @param detector      The HSS detector applied to each pairwise projection
+ *
+ * Changes relative to the previous implementation:
+ *  - the initial skip over undefined intervals tests sequence seqI (the
+ *    sequence iv_ptrs was just sorted on) instead of sequence 0
+ *  - the "huh 2?" diagnostic path no longer increments hssI in addition to
+ *    the loop increment, which silently dropped the following HSS
+ *  - when the intervals are exhausted the loop stops instead of indexing
+ *    iv_ptrs out of bounds after breakHere() returns
+ */
+void makeAllPairwiseGenomeHSS( IntervalList& iv_list, vector< CompactGappedAlignment<>* >& iv_ptrs, vector< CompactGappedAlignment<>* >& iv_orig_ptrs, pairwise_genome_hss_t& hss_cols, const HssDetector* detector )
+{
+    uint seq_count = iv_list.seq_table.size();
+    // make pairwise projections of intervals and find LCBs...
+    for( size_t seqI = 0; seqI < seq_count; ++seqI )
+    {
+        for( size_t seqJ = seqI+1; seqJ < seq_count; ++seqJ )
+        {
+            vector< uint > projection;
+            projection.push_back( seqI );
+            projection.push_back( seqJ );
+            vector< vector< MatchProjectionAdapter* > > LCB_list;
+            vector< LCB > projected_adjs;
+            projectIntervalList( iv_list, projection, LCB_list, projected_adjs );
+            // make intervals
+            IntervalList pair_ivs;
+            pair_ivs.seq_table.push_back( iv_list.seq_table[seqI] );
+            pair_ivs.seq_table.push_back( iv_list.seq_table[seqJ] );
+            pair_ivs.resize( LCB_list.size() );
+            for( size_t lcbI = 0; lcbI < LCB_list.size(); ++lcbI )
+                pair_ivs[lcbI].SetMatches( LCB_list[lcbI] );
+            LCB_list.clear();
+
+            // storage comes from Copy() and is released with Free() below;
+            // the object is then constructed in place from the interval
+            vector< CompactGappedAlignment<>* > pair_cgas( pair_ivs.size() );
+            for( size_t lcbI = 0; lcbI < pair_ivs.size(); ++lcbI )
+            {
+                CompactGappedAlignment<> tmp_cga;
+                pair_cgas[lcbI] = tmp_cga.Copy();
+                new (pair_cgas[lcbI])CompactGappedAlignment<>( pair_ivs[lcbI] );
+            }
+
+            // break up these alignments on contig and chromosome boundaries
+            for(int ssI=0; ssI<2; ssI++){
+                // contig_bounds holds cumulative contig lengths
+                vector<gnSeqI> contig_bounds;
+                for( size_t cI=0; cI < pair_ivs.seq_table[ssI]->contigListSize(); cI++ ){
+                    contig_bounds.push_back(pair_ivs.seq_table[ssI]->contigLength(cI));
+                    if( cI > 0 )
+                        contig_bounds[cI] += contig_bounds[cI-1];
+                }
+                GenericMatchSeqManipulator< CompactGappedAlignment<> > gmsm(ssI);
+                applyBreakpoints(contig_bounds, pair_cgas, gmsm);
+            }
+
+            vector< CompactGappedAlignment<>* > hss_list;
+            // now find islands
+            hss_array_t hss_array;
+            (*detector)( pair_cgas, pair_ivs.seq_table, hss_array );
+            HssArrayToCga(pair_cgas, pair_ivs.seq_table, hss_array, hss_list);
+
+            for( size_t cgaI = 0; cgaI < pair_cgas.size(); ++cgaI )
+                pair_cgas[cgaI]->Free();
+            pair_cgas.clear();
+
+            // now split up on iv boundaries
+            // (hss_list is pairwise: index 0 is seqI, index 1 is seqJ)
+            vector< gnSeqI > bp_list;
+            getBpList( iv_ptrs, seqI, bp_list );
+            GenericMatchSeqManipulator< CompactGappedAlignment<> > gmsm(0);
+            SingleStartComparator< CompactGappedAlignment<> > ssc(0);
+            std::sort(hss_list.begin(), hss_list.end(), ssc );
+            applyBreakpoints( bp_list, hss_list, gmsm );
+            // and again on seqJ
+            getBpList( iv_ptrs, seqJ, bp_list );
+            GenericMatchSeqManipulator< CompactGappedAlignment<> > gmsm1(1);
+            SingleStartComparator< CompactGappedAlignment<> > ssc1(1);
+            std::sort(hss_list.begin(), hss_list.end(), ssc1 );
+            applyBreakpoints( bp_list, hss_list, gmsm1 );
+
+            // now transform into interval-specific columns
+            std::sort(hss_list.begin(), hss_list.end(), ssc );
+
+            SingleStartComparator< CompactGappedAlignment<> > ivcomp(seqI);
+            std::sort( iv_ptrs.begin(), iv_ptrs.end(), ivcomp );
+            // iv_map translates a position in the sorted iv_ptrs to the original interval index
+            vector< size_t > iv_map;
+            createMap( iv_ptrs, iv_orig_ptrs, iv_map );
+            size_t ivI = 0;
+            // iv_ptrs holds multi-sequence alignments sorted on seqI, so the
+            // undefined ones must be detected with seqI rather than index 0
+            while( ivI < iv_ptrs.size() && iv_ptrs[ivI]->LeftEnd(seqI) == NO_MATCH )
+                ++ivI;
+            for( size_t hssI = 0; hssI < hss_list.size(); ++hssI )
+            {
+                if( hss_list[hssI]->LeftEnd(0) == NO_MATCH || hss_list[hssI]->Length(0) == 0 )
+                    continue;
+                if( ivI == iv_ptrs.size() )
+                {
+                    cerr << "huh?\n";
+                    cerr << hss_list[hssI]->LeftEnd(0) << endl;
+                    cerr << hss_list[hssI]->RightEnd(0) << endl;
+                    cerr << iv_ptrs.back()->LeftEnd(seqI) << endl;
+                    cerr << iv_ptrs.back()->RightEnd(seqI) << endl;
+                }
+                // advance to the first interval that could contain this hss in seqI
+                while( ivI < iv_ptrs.size() &&
+                    (iv_ptrs[ivI]->LeftEnd(seqI) == NO_MATCH ||
+                    hss_list[hssI]->LeftEnd(0) > iv_ptrs[ivI]->RightEnd(seqI) ) )
+                    ++ivI;
+                if( ivI == iv_ptrs.size() )
+                {
+                    cerr << "hssI fit!!\n";
+                    genome::breakHere();
+                    // no interval left for this or any later hss; continuing
+                    // would read iv_ptrs[ivI] out of bounds
+                    break;
+                }
+                // check for containment in seqJ
+                if( iv_ptrs[ivI]->LeftEnd(seqJ) == NO_MATCH ||
+                    iv_ptrs[ivI]->RightEnd(seqJ) < hss_list[hssI]->LeftEnd(1) ||
+                    hss_list[hssI]->RightEnd(1) < iv_ptrs[ivI]->LeftEnd(seqJ) )
+                    continue;   // this hss falls to an invalid range in seqJ
+
+                if( hss_list[hssI]->RightEnd(0) < iv_ptrs[ivI]->LeftEnd(seqI) )
+                {
+                    cerr << "huh 2?\n";
+                    cerr << hss_list[hssI]->LeftEnd(0) << endl;
+                    cerr << hss_list[hssI]->RightEnd(0) << endl;
+                    cerr << iv_ptrs[ivI]->LeftEnd(seqI) << endl;
+                    cerr << iv_ptrs[ivI]->RightEnd(seqI) << endl;
+                    // skip only this hss; the loop header advances hssI
+                    continue;
+                }
+
+                vector< pair< size_t, size_t > >& cur_hss_cols = hss_cols[seqI][seqJ][iv_map[ivI]];
+
+                gnSeqI left_col = iv_ptrs[ivI]->SeqPosToColumn( seqI, hss_list[hssI]->LeftEnd(0) );
+                gnSeqI right_col = iv_ptrs[ivI]->SeqPosToColumn( seqI, hss_list[hssI]->RightEnd(0) );
+                if(left_col > right_col && iv_ptrs[ivI]->Orientation(seqI) == AbstractMatch::reverse )
+                {
+                    swap(left_col, right_col);  // must have been a revcomp seq
+                }
+                else if(left_col > right_col)
+                {
+                    cerr << "bad cols\n";
+                    cerr << hss_list[hssI]->LeftEnd(0) << endl;
+                    cerr << hss_list[hssI]->RightEnd(0) << endl;
+                    cerr << iv_ptrs[ivI]->LeftEnd(seqI) << endl;
+                    cerr << iv_ptrs[ivI]->RightEnd(seqI) << endl;
+                    genome::breakHere();
+                }
+
+                // sanity check for implausibly large column values
+                if( left_col > 2000000000 || right_col > 2000000000 )
+                {
+                    cerr << "huh 2?\n";
+                    cerr << hss_list[hssI]->LeftEnd(0) << endl;
+                    cerr << hss_list[hssI]->RightEnd(0) << endl;
+                    cerr << iv_ptrs[ivI]->LeftEnd(seqI) << endl;
+                    cerr << iv_ptrs[ivI]->RightEnd(seqI) << endl;
+                    genome::breakHere();
+                }
+                cur_hss_cols.push_back( make_pair( left_col, right_col ) );
+            }
+            for( size_t hssI = 0; hssI < hss_list.size(); ++hssI )
+                hss_list[hssI]->Free();
+        }
+    }
+}
+
+/**
+ * Merges the pairwise homology predictions stored in hss_cols into n-way segments,
+ * producing one list of ULAs per interval.  All ULA coordinates written here are
+ * alignment column indices offset by one (column c is stored as c+1).
+ * @param iv_orig_ptrs	One alignment per interval.  Only GetColumn() is called on it, in the
+ *			final pass that drops components without sequence in a segment.
+ * @param hss_cols	Indexed by seqI, seqJ, ivI; each cell lists (left col, right col) pairs
+ * @param ula_list	(output) Resized to the number of intervals in hss_cols and filled with
+ *			ULAs allocated via Copy().  Segments that end up with fewer than two
+ *			components are freed and removed before returning.
+ */
+void mergePairwiseHomologyPredictions( vector< CompactGappedAlignment<>* >& iv_orig_ptrs, pairwise_genome_hss_t& hss_cols, vector< vector< ULA* > >& ula_list )
+{
+	uint seq_count = hss_cols.shape()[0];
+	uint iv_count = hss_cols.shape()[2];
+	//
+	// FINALLY! ready to merge. how to do it?
+	// make an empty list of UngappedLocalAlignments
+	// start with the first seq and create a ULA for every col
+	// range. Then continue to the second seq, and when
+	// a col range overlaps a pre-existing ULA, create a new ULA
+	// for the intersected region and a smaller ULA for the non-intersected region
+	ula_list.resize( iv_count );
+	for( size_t ivI = 0; ivI < iv_count; ++ivI )
+	{
+		vector< ULA* >& iv_ulas = ula_list[ivI];
+		for( size_t seqI = 0; seqI < seq_count; ++seqI )
+		{
+			for( size_t seqJ = seqI+1; seqJ < seq_count; ++seqJ )
+			{
+				// turn every predicted column range for this pair into a two-component ULA
+				vector< pair< size_t, size_t > >& cur_hss_cols = hss_cols[seqI][seqJ][ivI];
+				vector< ULA* > cur_ulas( cur_hss_cols.size() );
+				ULA tmp_ula(seq_count);
+				for( size_t hssI = 0; hssI < cur_hss_cols.size(); ++hssI )
+				{
+					cur_ulas[hssI] = tmp_ula.Copy();
+					cur_ulas[hssI]->SetStart(seqI, cur_hss_cols[hssI].first+1);
+					cur_ulas[hssI]->SetStart(seqJ, cur_hss_cols[hssI].first+1);
+					cur_ulas[hssI]->SetLength( cur_hss_cols[hssI].second - cur_hss_cols[hssI].first + 1 );
+				}
+
+				// break both lists at each other's boundaries, first along seqI...
+				vector< gnSeqI > iv_bp_list;
+				vector< gnSeqI > cur_bp_list;
+				SingleStartComparator<ULA> ulacompI(seqI);
+				std::sort( iv_ulas.begin(), iv_ulas.end(), ulacompI );
+				std::sort( cur_ulas.begin(), cur_ulas.end(), ulacompI );
+				getBpList( iv_ulas, seqI, iv_bp_list );
+				getBpList( cur_ulas, seqI, cur_bp_list );
+				GenericMatchSeqManipulator< ULA > gmsm(seqI);
+				applyBreakpoints( iv_bp_list, cur_ulas, gmsm );
+				applyBreakpoints( cur_bp_list, iv_ulas, gmsm );
+
+				// ...then along seqJ
+				SingleStartComparator<ULA> ulacompJ(seqJ);
+				std::sort( iv_ulas.begin(), iv_ulas.end(), ulacompJ );
+				std::sort( cur_ulas.begin(), cur_ulas.end(), ulacompJ );
+				getBpList( iv_ulas, seqJ, iv_bp_list );
+				getBpList( cur_ulas, seqJ, cur_bp_list );
+				GenericMatchSeqManipulator< ULA > gmsmJ(seqJ);
+				applyBreakpoints( iv_bp_list, cur_ulas, gmsmJ );
+				applyBreakpoints( cur_bp_list, iv_ulas, gmsmJ );
+
+				// do seqI a second time to propagate any breakpoints introduced by seqJ
+				std::sort( iv_ulas.begin(), iv_ulas.end(), ulacompI );
+				std::sort( cur_ulas.begin(), cur_ulas.end(), ulacompI );
+				getBpList( iv_ulas, seqI, iv_bp_list );
+				getBpList( cur_ulas, seqI, cur_bp_list );
+				applyBreakpoints( iv_bp_list, cur_ulas, gmsm );
+				applyBreakpoints( cur_bp_list, iv_ulas, gmsm );
+
+				std::sort( iv_ulas.begin(), iv_ulas.end(), ulacompI );
+				std::sort( cur_ulas.begin(), cur_ulas.end(), ulacompI );
+				// now that cur_ulas and iv_ulas are all broken up according to each other's boundaries
+				// we can simply scan along and add
+				size_t iv_ulas_size = iv_ulas.size();
+				size_t ivuI = 0;
+				size_t curuI = 0;
+				vector< ULA* > added_to( cur_ulas.size(), NULL );	// this tracks which of iv_ulas a cur_ula was added to
+				vector< ULA* > to_delete;
+				while( ivuI < iv_ulas_size && curuI < cur_ulas.size() )
+				{
+					if( iv_ulas[ivuI]->LeftEnd(seqI) == cur_ulas[curuI]->LeftEnd(seqI) )
+					{
+						if( added_to[curuI] == iv_ulas[ivuI] )
+						{
+							// do nothing
+						}else if( added_to[curuI] == NULL )
+						{
+							// first hit: extend the existing segment with the seqJ component
+							iv_ulas[ivuI]->SetLeftEnd(seqJ, cur_ulas[curuI]->LeftEnd(seqJ));
+							added_to[curuI] = iv_ulas[ivuI];
+						}else{
+							// the cur_ula already joined another segment: fold that segment's
+							// components into this one and schedule it for deletion
+							ULA* merge = added_to[curuI];
+							for( size_t seqK = 0; seqK < seq_count; ++seqK )
+							{
+								if( merge->Start(seqK) == NO_MATCH )
+									continue;
+								iv_ulas[ivuI]->SetStart( seqK, merge->Start(seqK) );
+							}
+							to_delete.push_back( merge );
+						}
+						ivuI++;
+					}else if( iv_ulas[ivuI]->LeftEnd(seqI) < cur_ulas[curuI]->LeftEnd(seqI) )
+					{
+						ivuI++;
+					}else
+						curuI++;
+				}
+
+				// delete to_delete...
+				std::sort( to_delete.begin(), to_delete.end() );
+				vector< ULA* >::iterator last = std::unique( to_delete.begin(), to_delete.end() );
+				to_delete.erase( last, to_delete.end() );
+				vector< ULA* > new_iv_ulas( iv_ulas.size() - to_delete.size() );
+				std::sort( iv_ulas.begin(), iv_ulas.end() );
+				std::set_difference( iv_ulas.begin(), iv_ulas.end(), to_delete.begin(), to_delete.end(), new_iv_ulas.begin() );
+				swap( iv_ulas, new_iv_ulas );
+				for( size_t delI = 0; delI < to_delete.size(); ++delI )
+					to_delete[delI]->Free();
+				// NOTE(review): added_to can still hold pointers that were just freed through
+				// to_delete (the merge branch above never re-points added_to[curuI]) -- confirm
+				// whether such entries are reachable in the seqJ scan below.
+
+				vector< ULA* > orig_ula_order = cur_ulas;
+				// now do something similar for seqJ
+				std::sort( iv_ulas.begin(), iv_ulas.end(), ulacompJ );
+				std::sort( cur_ulas.begin(), cur_ulas.end(), ulacompJ );
+
+				// added_to is indexed by the seqI ordering; added_map translates from the seqJ ordering
+				vector< size_t > added_map;
+				createMap( cur_ulas, orig_ula_order, added_map );
+
+				ivuI = 0;
+				curuI = 0;
+				to_delete.clear();
+				// iv_ulas may have shrunk when merged segments were removed above, so bound this
+				// scan by its current size instead of the size cached before the removal
+				while( ivuI < iv_ulas.size() && curuI < cur_ulas.size() )
+				{
+					if( iv_ulas[ivuI]->LeftEnd(seqJ) == cur_ulas[curuI]->LeftEnd(seqJ) )
+					{
+						if( added_to[added_map[curuI]] == iv_ulas[ivuI] )
+						{
+							// do nothing
+						}else if( added_to[added_map[curuI]] == NULL )
+						{
+							iv_ulas[ivuI]->SetLeftEnd(seqI, cur_ulas[curuI]->LeftEnd(seqI));
+							added_to[added_map[curuI]] = iv_ulas[ivuI];
+						}else{
+							ULA* merge = added_to[added_map[curuI]];
+							for( size_t seqK = 0; seqK < seq_count; ++seqK )
+							{
+								if( merge->Start(seqK) == NO_MATCH )
+									continue;
+								iv_ulas[ivuI]->SetStart( seqK, merge->Start(seqK) );
+							}
+							to_delete.push_back( merge );
+						}
+						ivuI++;
+					}else if( iv_ulas[ivuI]->LeftEnd(seqJ) < cur_ulas[curuI]->LeftEnd(seqJ) )
+					{
+						ivuI++;
+					}else
+					{
+						curuI++;
+					}
+				}
+
+				// anything with a null added_to entry needs to be added to iv_ulas
+				// everything else needs to get freed
+				std::sort( cur_ulas.begin(), cur_ulas.end(), ulacompI );
+				for( curuI = 0; curuI < cur_ulas.size(); ++curuI )
+				{
+					if( added_to[curuI] == NULL )
+						iv_ulas.push_back( cur_ulas[curuI] );
+					else
+						cur_ulas[curuI]->Free();
+				}
+				// delete to_delete...
+				std::sort( to_delete.begin(), to_delete.end() );
+				last = std::unique( to_delete.begin(), to_delete.end() );
+				to_delete.erase( last, to_delete.end() );
+				new_iv_ulas = vector< ULA* >( iv_ulas.size() - to_delete.size() );
+				std::sort( iv_ulas.begin(), iv_ulas.end() );
+				std::set_difference( iv_ulas.begin(), iv_ulas.end(), to_delete.begin(), to_delete.end(), new_iv_ulas.begin() );
+				swap( iv_ulas, new_iv_ulas );
+				for( size_t delI = 0; delI < to_delete.size(); ++delI )
+					to_delete[delI]->Free();
+			}
+		}
+	}
+
+	// Eliminate segments that have no representation in a genome
+	for( size_t ivI = 0; ivI < ula_list.size(); ++ivI )
+	{
+		for( size_t mI = 0; mI < ula_list[ivI].size(); ++mI )
+		{
+			size_t seqI = ula_list[ivI][mI]->FirstStart();
+			std::vector<gnSeqI> l_pos;
+			std::vector<bool> l_column;
+			std::vector<gnSeqI> r_pos;
+			std::vector<bool> r_column;
+			// convert back from the stored (column + 1) coordinates to column indices
+			gnSeqI left_col = ula_list[ivI][mI]->LeftEnd(seqI)-1;
+			gnSeqI right_col = ula_list[ivI][mI]->RightEnd(seqI)-1;
+			iv_orig_ptrs[ivI]->GetColumn(left_col, l_pos, l_column);
+			iv_orig_ptrs[ivI]->GetColumn(right_col, r_pos, r_column);
+			for( ; seqI < ula_list[ivI][mI]->SeqCount(); ++seqI )
+			{
+				if( ula_list[ivI][mI]->LeftEnd(seqI) == NO_MATCH )
+					continue;
+				// same position reported at both ends and neither end column flagged:
+				// presumably the sequence is all gap across this segment -- TODO confirm GetColumn semantics
+				if( l_pos[seqI] == r_pos[seqI] && !l_column[seqI] && !r_column[seqI] )
+					ula_list[ivI][mI]->SetStart(seqI, NO_MATCH);	// no match in this col
+			}
+			if( ula_list[ivI][mI]->Multiplicity() < 2 )
+			{
+				ula_list[ivI][mI]->Free();
+				ula_list[ivI][mI] = NULL;
+			}
+		}
+		// clean out any NULL ptrs
+		std::vector< ULA* >::iterator last = std::remove( ula_list[ivI].begin(), ula_list[ivI].end(), (ULA*)NULL );
+		ula_list[ivI].erase( last, ula_list[ivI].end() );
+	}
+}
+
+/**
+ * Rebuilds every interval in iv_list from only the column ranges listed in ula_list,
+ * thereby unaligning everything that lies outside of those ranges.
+ * @param iv_list	(in/out) The intervals to rewrite.  Intervals without any segment are
+ *			removed, intervals whose segments fall into disjoint groups of sequences
+ *			are split, and collapseCollinear() is applied to the final list.
+ * @param iv_orig_ptrs	The original alignment for each interval.  Every non-NULL entry is
+ *			freed with Free() and set to NULL before this function returns.
+ * @param ula_list	For each interval, the segments (in column coordinates offset by one)
+ *			that should remain aligned.  The ULAs are read but not freed here.
+ */
+void unalignIslands( IntervalList& iv_list, vector< CompactGappedAlignment<>* >& iv_orig_ptrs, vector< vector< ULA* > >& ula_list )
+{
+	uint seq_count = iv_list.seq_table.size();
+	// unalign regions in the iv list that aren't contained in backbone
+	for( size_t ivI = 0; ivI < ula_list.size(); ++ivI )
+	{
+		// cut one CompactGappedAlignment out of the original alignment per segment
+		vector< AbstractMatch* > new_matches(ula_list[ivI].size());
+		for( size_t mI = 0; mI < ula_list[ivI].size(); ++mI )
+		{
+			size_t seqI = ula_list[ivI][mI]->FirstStart();
+			gnSeqI left_col = ula_list[ivI][mI]->LeftEnd(seqI)-1;
+			CompactGappedAlignment<> tmp_cga;
+			CompactGappedAlignment<>* new_cga = tmp_cga.Copy();
+			iv_orig_ptrs[ivI]->copyRange(*new_cga, left_col, ula_list[ivI][mI]->Length(seqI));
+			// sequences that are not part of the segment become undefined in the copy
+			for( seqI = 0; seqI < ula_list[ivI][mI]->SeqCount(); ++seqI )
+			{
+				if( ula_list[ivI][mI]->LeftEnd(seqI) == NO_MATCH )
+					new_cga->SetLeftEnd(seqI, NO_MATCH);
+			}
+			new_cga->CondenseGapColumns();
+			new_matches[mI] = new_cga;
+		}
+		if( new_matches.size() > 0 )
+		{
+
+			vector< vector< AbstractMatch* > > disjoint_subsets;
+			{
+				// split into multiple intervals if some sequences are completely unaligned
+				// use a union-find structure to quickly figure out how many subgroups there are
+				vector< uint > seq_map( seq_count );
+				for( size_t sI = 0; sI < seq_map.size(); ++sI )
+					seq_map[sI] = sI;
+				for( size_t mI = 0; mI < new_matches.size(); ++mI )
+				{
+					// find the representative of the first defined sequence...
+					uint sI = new_matches[mI]->FirstStart();
+					uint map_to = seq_map[sI];
+					while( map_to != seq_map[map_to] )
+						map_to = seq_map[map_to];
+					seq_map[sI] = map_to;
+					// ...and attach the groups of all other defined sequences to it
+					for( ++sI; sI < seq_count; ++sI )
+					{
+						if( new_matches[mI]->LeftEnd(sI) == NO_MATCH )
+							continue;
+						uint map_from = seq_map[sI];
+						while( map_from != seq_map[map_from] )
+							map_from = seq_map[map_from];
+						seq_map[map_from] = map_to;
+					}
+				}
+				// bucket the matches by the representative of their first defined sequence
+				vector< vector< AbstractMatch* > > mapped_lists( seq_count );
+				for( size_t mI = 0; mI < new_matches.size(); ++mI )
+				{
+					uint sI = new_matches[mI]->FirstStart();
+					uint map_to = seq_map[sI];
+					while( map_to != seq_map[map_to] )
+						map_to = seq_map[map_to];
+					mapped_lists[map_to].push_back( new_matches[mI] );
+				}
+				for( uint sI = 0; sI < seq_count; ++sI )
+				{
+					if( mapped_lists[sI].size() > 0 )
+						disjoint_subsets.push_back( mapped_lists[sI] );
+				}
+			}
+
+			for( size_t dI = 0; dI < disjoint_subsets.size(); ++dI )
+			{
+				vector< AbstractMatch* >& cur_d_matches = disjoint_subsets[dI];
+				vector< AbstractMatch* > orig_order = cur_d_matches;
+				// need to sort these. use boost's topological sort.
+				vector< size_t > id_map;
+				typedef boost::adjacency_list< boost::vecS, boost::vecS, boost::directedS, boost::property<boost::vertex_color_t, boost::default_color_type> > Graph;
+				typedef boost::graph_traits<Graph>::vertex_descriptor Vertex;
+				typedef std::pair< int, int > Pair;
+				vector< Pair > edges;
+				// for every sequence, chain the matches that define it in sequence order;
+				// vertex ids are positions in orig_order
+				for( size_t seqI = 0; seqI < seq_count; ++seqI )
+				{
+					SingleStartComparator<AbstractMatch> ssc(seqI);
+					std::sort( cur_d_matches.begin(), cur_d_matches.end(), ssc );
+					createMap( cur_d_matches, orig_order, id_map );
+					int prev = -1;
+					int first = -1;
+					bool reverse = false;
+					for( int mI = 0; mI < cur_d_matches.size(); ++mI )
+					{
+						if( cur_d_matches[mI]->LeftEnd(seqI) == NO_MATCH )
+							continue;
+						if( prev != -1 )
+						{
+							Pair edge( id_map[prev], id_map[mI] );
+							if( reverse )
+								swap( edge.first, edge.second );
+							edges.push_back(edge);
+						}else
+						{
+							// orientation of the whole chain is taken from its first match
+							reverse = cur_d_matches[mI]->Start(seqI) < 0;
+							first = mI;
+						}
+						prev = mI;
+					}
+					// terminate the chain at an extra vertex numbered one past the last match
+					if( prev != -1 && !reverse )
+						edges.push_back( Pair( id_map[prev], cur_d_matches.size() ) );
+					else if( prev != -1 && reverse )
+						edges.push_back( Pair( id_map[first], cur_d_matches.size() ) );
+				}
+				std::sort( edges.begin(), edges.end() );
+				vector< Pair >::iterator ee_iter = std::unique( edges.begin(), edges.end() );
+				edges.erase( ee_iter, edges.end() );
+				typedef boost::graph_traits<Graph>::vertices_size_type v_size_t;
+				// build the graph straight from the edge vector so that nothing is leaked
+				// if topological_sort() throws
+				Graph G( edges.begin(), edges.end(), v_size_t(edges.size()) );
+				typedef std::vector< Vertex > container;
+				container c;
+				topological_sort(G, std::back_inserter(c));
+				cur_d_matches.clear();
+				// the sort result is walked back to front; ids beyond the real matches are skipped
+				for ( container::reverse_iterator ii=c.rbegin(); ii!=c.rend(); ++ii)
+				{
+					if( *ii < orig_order.size() )
+						cur_d_matches.push_back( orig_order[ *ii ] );
+				}
+				// the first subset replaces the interval, further subsets become new intervals
+				if( dI == 0 )
+					iv_list[ivI].SetMatches(cur_d_matches);
+				else
+				{
+					Interval new_iv( cur_d_matches.begin(), cur_d_matches.end() );
+					iv_list.push_back(new_iv);
+				}
+			}
+		}
+		else
+		{
+			// nothing of this interval remains aligned: mark it for removal below
+			iv_orig_ptrs[ivI]->Free();
+			iv_orig_ptrs[ivI] = NULL;
+		}
+	}
+
+
+	// update iv_list to match the filtered iv_orig_ptrs
+	size_t givI = 0;
+	for( size_t iI = 0; iI < iv_orig_ptrs.size(); ++iI )
+	{
+		if( iv_orig_ptrs[iI] != NULL )
+		{
+			swap( iv_list[givI], iv_list[iI] );
+			iv_orig_ptrs[iI]->Free();	// done with the CompactGappedAlignments
+			iv_orig_ptrs[iI] = NULL;
+			givI++;
+		}
+	}
+	// pick up any intervals that were split in half
+	for( size_t iI = iv_orig_ptrs.size(); iI < iv_list.size(); ++iI )
+		swap( iv_list[givI++], iv_list[iI] );
+	iv_list.erase( iv_list.begin()+givI, iv_list.end() );
+
+	// collapse any intervals that are trivially collinear
+	collapseCollinear( iv_list );
+}
+
+/**
+ * Builds the backbone segment list for an IntervalList from the matches stored in each interval.
+ * Every match with at least two components yields one ULA whose coordinates are the
+ * alignment columns (offset by one) that the match occupies within its interval.
+ * Neighboring ULAs that cover the same components and are directly adjacent are fused.
+ * @param iv_list	The intervals to describe
+ * @param ula_list	(output) Resized to iv_list.size(); entry ivI receives the segments of interval ivI
+ */
+void createBackboneList( const IntervalList& iv_list, backbone_list_t& ula_list )
+{
+	ula_list.resize( iv_list.size() );
+	for( size_t ivI = 0; ivI < iv_list.size(); ++ivI )
+	{
+		if( iv_list[ivI].Multiplicity() < 2 )
+			continue;
+		vector< ULA* >& iv_bb = ula_list[ivI];
+		const vector< AbstractMatch* >& matches = iv_list[ivI].GetMatches();
+
+		// walk the matches left to right, keeping a running column offset
+		int64 col_begin = 0;
+		int64 col_end = 0;
+		for( size_t mI = 0; mI < matches.size(); ++mI )
+		{
+			AbstractMatch* cur_match = matches[mI];
+			col_begin = col_end;
+			col_end += cur_match->AlignmentLength();
+			if( cur_match->Multiplicity() < 2 )
+				continue;
+			ULA proto_ula( cur_match->SeqCount() );
+			ULA* seg = proto_ula.Copy();
+			for( size_t seqI = 0; seqI < cur_match->SeqCount(); ++seqI )
+			{
+				if( cur_match->LeftEnd(seqI) != NO_MATCH )
+					seg->SetLeftEnd( seqI, col_begin+1 );
+			}
+			seg->SetLength( col_end - col_begin );
+			iv_bb.push_back( seg );
+		}
+
+		// merge neighbors that cover identical match components
+		for( size_t ulaI = 1; ulaI < iv_bb.size(); ulaI++ )
+		{
+			ULA* prev_seg = iv_bb[ulaI-1];
+			ULA* cur_seg = iv_bb[ulaI];
+			bool mergeable = true;
+			for( size_t seqI = 0; mergeable && seqI < cur_seg->SeqCount(); ++seqI )
+			{
+				int64 s1 = prev_seg->Start(seqI);
+				int64 s2 = cur_seg->Start(seqI);
+				const bool prev_absent = ( s1 == mems::NO_MATCH );
+				const bool cur_absent = ( s2 == mems::NO_MATCH );
+				if( prev_absent && cur_absent )
+					continue;
+				if( prev_absent || cur_absent )
+				{
+					mergeable = false;	// component sets differ
+					continue;
+				}
+				int64 r1 = prev_seg->RightEnd(seqI);
+				if( r1 + 1 != s2 )
+					mergeable = false;	// must be adjacent to each other
+			}
+			if( mergeable )
+			{
+				// the previous segment is swallowed up by the current one
+				cur_seg->ExtendStart( prev_seg->Length() );
+				prev_seg->SetLength(0);
+			}
+		}
+
+		// keep the survivors, free the zero-length segments that were swallowed up
+		vector< ULA* > survivors;
+		for( size_t ulaI = 0; ulaI < iv_bb.size(); ulaI++ )
+		{
+			if( iv_bb[ulaI]->Length() > 0 )
+				survivors.push_back( iv_bb[ulaI] );
+			else
+				iv_bb[ulaI]->Free();
+		}
+		swap( ula_list[ivI], survivors );
+	}
+}
+
+/**
+ * Single-match variant: runs the homology HMM on one match, unaligns what is predicted
+ * to be non-homologous and reports the remaining segments.
+ * @param m			The match to analyze
+ * @param seq_table		The sequences the match refers to
+ * @param result		(output) A newly allocated CompactGappedAlignment built from the
+ *				first remaining interval, or an empty copy when nothing remains
+ * @param bb_list		(output) Replaced by the segment list computed by createBackboneList()
+ * @param hmm_params		Passed through to findHssHomologyHMM()
+ * @param left_homologous	Passed through to findHssHomologyHMM()
+ * @param right_homologous	Passed through to findHssHomologyHMM()
+ */
+void detectAndApplyBackbone( AbstractMatch* m, vector< gnSequence* >& seq_table, CompactGappedAlignment<>*& result, backbone_list_t& bb_list, const Params& hmm_params, boolean left_homologous, boolean right_homologous )
+{
+	const uint seq_count = seq_table.size();
+	vector< AbstractMatch* > single_match( 1, m );
+
+	// indexed by seqI, seqJ, ivI, hssI (left col, right col); there is exactly one interval here
+	pairwise_genome_hss_t hss_cols( boost::extents[seq_count][seq_count][1] );
+
+	// the match is wrapped in a CompactGappedAlignment for its SeqPosToColumn
+	CompactGappedAlignment<> tmp_cga;
+	vector< CompactGappedAlignment<>* > iv_ptrs( 1 );
+	iv_ptrs[0] = tmp_cga.Copy();
+	new (iv_ptrs[0])CompactGappedAlignment<>( *m );	// this will be freed when unalignIslands() gets called
+	vector< CompactGappedAlignment<>* > iv_orig_ptrs( iv_ptrs );
+
+	hss_array_t island_array;
+	hss_array_t hss_array;
+	findHssHomologyHMM( single_match, seq_table, island_array, hmm_params, left_homologous, right_homologous );
+	translateToPairwiseGenomeHSS( island_array, hss_cols );
+
+	// merge overlapping pairwise homology predictions into n-way predictions
+	backbone_list_t ula_list;
+	mergePairwiseHomologyPredictions( iv_orig_ptrs, hss_cols, ula_list );
+
+	// unalignIslands wants an IntervalList, so build one that holds a copy of the alignment
+	IntervalList iv_list;
+	iv_list.seq_table = seq_table;
+	iv_list.resize(1);
+	vector<AbstractMatch*> match_copy( 1, iv_orig_ptrs.front()->Copy() );
+	iv_list[0].SetMatches( match_copy );
+
+	// unalign regions found to be non-homologous
+	unalignIslands( iv_list, iv_orig_ptrs, ula_list );
+
+	// free all ULAs and reconstruct them from the new alignment column coordinates
+	for( backbone_list_t::iterator row = ula_list.begin(); row != ula_list.end(); ++row )
+	{
+		for( vector< ULA* >::iterator ula_iter = row->begin(); ula_iter != row->end(); ++ula_iter )
+			(*ula_iter)->Free();
+	}
+	ula_list.clear();
+	createBackboneList( iv_list, ula_list );
+
+	iv_orig_ptrs.clear();
+
+	bb_list.clear();
+	bb_list = ula_list;
+
+	result = tmp_cga.Copy();
+	if( iv_list.size() > 0 )
+		new (result)CompactGappedAlignment<>( iv_list[0] );
+}
+
+
+
+/**
+ * Applies previously detected backbone to an alignment.
+ * @param iv_list	(in/out) Rewritten by unalignIslands() and addUnalignedRegions()
+ * @param iv_orig_ptrs	The per-interval alignments handed to unalignIslands()
+ * @param bb_list	(in/out) On entry the segments to keep aligned; every ULA in it is freed
+ *			and the list is rebuilt by createBackboneList() from the new iv_list
+ */
+void applyBackbone( IntervalList& iv_list, vector< CompactGappedAlignment<>* >& iv_orig_ptrs, backbone_list_t& bb_list )
+{
+	// unalign regions found to be non-homologous
+	unalignIslands( iv_list, iv_orig_ptrs, bb_list );
+
+	// need to add in all the unaligned regions so the viewer doesn't throw a fit
+	addUnalignedRegions( iv_list );
+
+	// the old ULAs are stale now: free them all...
+	for( backbone_list_t::iterator row = bb_list.begin(); row != bb_list.end(); ++row )
+	{
+		for( vector< ULA* >::iterator ula_iter = row->begin(); ula_iter != row->end(); ++ula_iter )
+			(*ula_iter)->Free();
+	}
+	bb_list.clear();
+
+	// ...and reconstruct them from the new alignment column coordinates
+	createBackboneList( iv_list, bb_list );
+}
+
+/**
+ * Detects backbone in iv_list using the given detector.
+ * @param iv_list	(in/out) Collinear intervals are collapsed before detection
+ * @param bb_list	(output) Filled by mergePairwiseHomologyPredictions()
+ * @param detector	The segment detector handed to makeAllPairwiseGenomeHSS()
+ * @param iv_orig_ptrs	(output) One newly allocated CompactGappedAlignment per interval.
+ *			Nothing in this function frees them.
+ */
+void detectBackbone( IntervalList& iv_list, backbone_list_t& bb_list, const HssDetector* detector, vector< CompactGappedAlignment<>* >& iv_orig_ptrs )
+{
+	// collapse any intervals that are trivially collinear
+	collapseCollinear( iv_list );
+
+	const uint seq_count = iv_list.seq_table.size();
+
+	// indexed by seqI, seqJ, ivI, hssI (left col, right col)
+	pairwise_genome_hss_t hss_cols( boost::extents[seq_count][seq_count][iv_list.size()] );
+
+	// ugg. need CompactGappedAlignment for its SeqPosToColumn
+	vector< CompactGappedAlignment<>* > iv_ptrs( iv_list.size() );
+	for( size_t ivI = 0; ivI < iv_list.size(); ++ivI )
+	{
+		// allocate through Copy(), then construct the real alignment in place
+		CompactGappedAlignment<> tmp_cga;
+		CompactGappedAlignment<>* cga = tmp_cga.Copy();
+		iv_ptrs[ivI] = cga;
+		new (iv_ptrs[ivI])CompactGappedAlignment<>( iv_list[ivI] );
+	}
+
+	iv_orig_ptrs = iv_ptrs;
+	makeAllPairwiseGenomeHSS( iv_list, iv_ptrs, iv_orig_ptrs, hss_cols, detector );
+
+	// merge overlapping pairwise homology predictions into n-way predictions
+	mergePairwiseHomologyPredictions( iv_orig_ptrs, hss_cols, bb_list );
+}
+
+
+// add unique segments of some minimum length
+// FIXME: does not add begin and end segments!
+/**
+ * Appends an entry for every stretch between two consecutive backbone entries of a
+ * sequence that is longer than min_length.  The new entry is defined in that one
+ * sequence only; all of its other components are (0,0).
+ * bb_seq_list is left sorted on its last sequence, with the new entries at the end.
+ */
+void addUniqueSegments( std::vector< bb_seqentry_t >& bb_seq_list, size_t min_length )
+{
+	if( bb_seq_list.size() == 0 )
+		return;
+	const uint seq_count = bb_seq_list[0].size();
+	vector< bb_seqentry_t > gap_segs;
+	for( size_t sI = 0; sI < seq_count; sI++ )
+	{
+		// order the entries along sequence sI and look at each pair of neighbors
+		std::sort( bb_seq_list.begin(), bb_seq_list.end(), BbSeqEntrySorter(sI) );
+		for( size_t bbI = 1; bbI < bb_seq_list.size(); bbI++ )
+		{
+			int64 cur_left = bb_seq_list[bbI][sI].first;
+			if( cur_left == 0 )
+				continue;	// entry is undefined in this sequence
+			int64 next_begin = genome::absolut(cur_left);
+			int64 prev_end = genome::absolut(bb_seq_list[bbI-1][sI].second);
+			int64 diff = next_begin - prev_end;
+			if( genome::absolut(diff) <= min_length )
+				continue;
+			bb_seqentry_t newb( seq_count, make_pair( 0,0 ) );
+			newb[sI].first = prev_end + 1;
+			newb[sI].second = next_begin - 1;
+			gap_segs.push_back( newb );
+		}
+	}
+	bb_seq_list.insert( bb_seq_list.end(), gap_segs.begin(), gap_segs.end() );
+}
+
+
+/**
+ * Fuses backbone entries that are direct neighbors in every sequence they define.
+ * For each sequence in turn the list is sorted on that sequence; an entry is fused
+ * with its predecessor when both define exactly the same sequences and the
+ * coordinates differ by one in each of them.  Fused predecessors are removed.
+ */
+void mergeAdjacentSegments( std::vector< bb_seqentry_t >& bb_seq_list )
+{
+	if( bb_seq_list.size() == 0 )
+		return;
+	const uint seq_count = bb_seq_list[0].size();
+	for( size_t sI = 0; sI < seq_count; sI++ )
+	{
+		std::sort( bb_seq_list.begin(), bb_seq_list.end(), BbSeqEntrySorter(sI) );
+		// marks the entries that have been absorbed by their successor
+		bitset_t merged;
+		merged.resize( bb_seq_list.size() );
+		for( size_t bbI = 1; bbI < bb_seq_list.size(); bbI++ )
+		{
+			bb_seqentry_t& cur_bb = bb_seq_list[bbI];
+			const bb_seqentry_t& prev_bb = bb_seq_list[bbI-1];
+			if( cur_bb[sI].first == 0 )
+				continue;
+
+			// check every sequence for a compatible, directly adjacent pair
+			bool can_merge = true;
+			for( size_t j = 0; can_merge && j < seq_count; j++ )
+			{
+				const bool cur_undef = ( cur_bb[j].first == 0 );
+				const bool prev_undef = ( prev_bb[j].first == 0 );
+				if( cur_undef != prev_undef )
+				{
+					can_merge = false;	// defined in only one of the two
+					continue;
+				}
+				if( cur_undef )
+					continue;
+				int64 diff = 0;
+				if( cur_bb[j].first > 0 )
+					diff = cur_bb[j].first - prev_bb[j].second;
+				else
+					diff = cur_bb[j].second - prev_bb[j].first;
+				if( diff != 1 )
+					can_merge = false;
+			}
+			if( !can_merge )
+				continue;
+
+			// they can be merged!  grow the current entry over its predecessor
+			merged.set(bbI-1);
+			for( size_t j = 0; j < seq_count; j++ )
+			{
+				if( cur_bb[j].first > 0 )
+					cur_bb[j].first = prev_bb[j].first;
+				else
+					cur_bb[j].second = prev_bb[j].second;
+			}
+		}
+		// remove merged entries by compacting the survivors to the front
+		size_t keep = 0;
+		for( size_t bbI = 0; bbI < bb_seq_list.size(); bbI++ )
+		{
+			if( merged.test( bbI ) )
+				continue;
+			swap( bb_seq_list[keep], bb_seq_list[bbI] );
+			++keep;
+		}
+		bb_seq_list.erase( bb_seq_list.begin() + keep, bb_seq_list.end() );
+	}
+}
+
+
+/**
+ * Convenience overload that detects backbone without handing the intermediate
+ * per-interval alignments back to the caller.
+ * @param iv_list	(in/out) Passed to the four-argument detectBackbone()
+ * @param bb_list	(output) The detected backbone segments
+ * @param detector	The segment detector to use
+ */
+void detectBackbone( IntervalList& iv_list, backbone_list_t& bb_list, const HssDetector* detector )
+{
+	vector< CompactGappedAlignment<>* > iv_orig_ptrs;
+	detectBackbone( iv_list, bb_list, detector, iv_orig_ptrs );
+	// The alignments in iv_orig_ptrs were allocated for this call only and nobody else
+	// sees them, so release them here the same way unalignIslands() does.
+	for( size_t ivI = 0; ivI < iv_orig_ptrs.size(); ++ivI )
+	{
+		if( iv_orig_ptrs[ivI] == NULL )
+			continue;
+		iv_orig_ptrs[ivI]->Free();
+		iv_orig_ptrs[ivI] = NULL;
+	}
+}
+
+/**
+ * Detects backbone in a genome alignment with the homology HMM and applies it.
+ * @param iv_list	(in/out) The alignment; rewritten by applyBackbone()
+ * @param bb_list	(output) The backbone segments in the coordinates of the rewritten alignment
+ * @param hmm_params	Parameters for the HomologyHmmDetector
+ */
+void detectAndApplyBackbone( IntervalList& iv_list, backbone_list_t& bb_list, const Params& hmm_params )
+{
+	// the detector lives on the stack so it is released even if detection throws
+	HomologyHmmDetector hmm_detector( hmm_params, true, true );
+	vector< CompactGappedAlignment<>* > iv_orig_ptrs;
+	detectBackbone( iv_list, bb_list, &hmm_detector, iv_orig_ptrs );
+	applyBackbone( iv_list, iv_orig_ptrs, bb_list );
+}
+
+
+/**
+ * Writes one tab-delimited line per backbone segment: the interval index, the left end
+ * and the length of the segment, followed by the index of every sequence that takes part in it.
+ */
+void writeBackboneColumns( ostream& bb_out, backbone_list_t& bb_list )
+{
+	for( size_t ivI = 0; ivI < bb_list.size(); ++ivI )
+	{
+		vector< ULA* >& iv_segs = bb_list[ivI];
+		for( size_t mI = 0; mI < iv_segs.size(); ++mI )
+		{
+			ULA* seg = iv_segs[mI];
+			// the left end is read from the first sequence that is defined in the segment
+			size_t seqI = seg->FirstStart();
+			bb_out << ivI << '\t' << seg->LeftEnd(seqI) << '\t' << seg->Length();
+			while( seqI < seg->SeqCount() )
+			{
+				if( seg->LeftEnd(seqI) != NO_MATCH )
+					bb_out << '\t' << seqI;
+				++seqI;
+			}
+			bb_out << endl;
+		}
+	}
+}
+
+/**
+ * Writes the backbone as tab-delimited sequence coordinates.  The first line is a header
+ * with a seq_N_leftend and seq_N_rightend column per sequence.  Each following line is
+ * one backbone segment; a sequence that is not part of the segment is written as "0\t0",
+ * and coordinates of a sequence in reverse orientation are written negated.
+ * Nothing is written when bb_list is empty.
+ */
+void writeBackboneSeqCoordinates( backbone_list_t& bb_list, IntervalList& iv_list, ostream& bb_out )
+{
+ if( bb_list.size() == 0 )
+ return;
+ // find seq_count
+ // (taken from the first segment found; stays 0 if no interval has any segment)
+ uint seq_count = 0;
+ for( size_t bbI = 0; bbI < bb_list.size(); ++bbI )
+ if( bb_list[bbI].size() > 0 )
+ {
+ seq_count = bb_list[bbI].front()->SeqCount();
+ break;
+ }
+
+ // different format -- use real sequence coordinates...
+ // print a header line first
+ for( size_t seqI = 0; seqI < seq_count; ++seqI )
+ {
+ if( seqI > 0 )
+ bb_out << '\t';
+ bb_out << "seq_" << seqI << "_leftend\t";
+ bb_out << "seq_" << seqI << "_rightend";
+ }
+ bb_out << endl;
+ for( size_t ivI = 0; ivI < bb_list.size(); ++ivI )
+ {
+ // there seems to be a bug in the backbone creation code that causes the CGA that gets
+ // stuffed into the interval to have the wrong coordinates internally, while the interval
+ // maintains the correct coordinates. work around it by converting the whole interval to a cga
+ CompactGappedAlignment<> iv_cga( iv_list[ivI] );
+ for( size_t mI = 0; mI < bb_list[ivI].size(); ++mI )
+ {
+ uint fs = bb_list[ivI][mI]->FirstStart();
+ // get the sequence positions out of the alignment
+ // (segment coordinates are column + 1, hence the -1 when asking for a column)
+ vector< gnSeqI > left_pos;
+ vector< bool > left_cols;
+ iv_cga.GetColumn( bb_list[ivI][mI]->LeftEnd(fs)-1, left_pos, left_cols );
+ vector< gnSeqI > right_pos;
+ vector< bool > right_cols;
+ iv_cga.GetColumn( bb_list[ivI][mI]->RightEnd(fs)-1, right_pos, right_cols );
+ for( size_t seqI = 0; seqI < bb_list[ivI][mI]->SeqCount(); ++seqI )
+ {
+ if( seqI > 0 )
+ bb_out << '\t';
+ if( bb_list[ivI][mI]->LeftEnd(seqI) == NO_MATCH )
+ {
+ bb_out << "0\t0";
+ continue;
+ }else{
+ int64 leftI = left_pos[seqI];
+ int64 rightI = right_pos[seqI];
+ // NOTE(review): when the boundary column is not flagged for this sequence the position
+ // is bumped by one, presumably because GetColumn reports the nearest position to the
+ // left of a gap -- confirm against CompactGappedAlignment::GetColumn
+ if( iv_cga.Orientation(seqI) == AbstractMatch::forward && leftI != 0 && !left_cols[seqI] )
+ leftI++;
+ if( iv_cga.Orientation(seqI) == AbstractMatch::reverse && rightI != 0 && !right_cols[seqI] )
+ rightI++;
+ if( iv_cga.Orientation(seqI) == AbstractMatch::reverse )
+ {
+ swap( leftI, rightI ); // must be reverse complement
+ }
+ // an empty range after the adjustments above is reported as undefined
+ if( rightI + 1 == leftI )
+ {
+ bb_out << "0\t0";
+ continue;
+ }
+ // diagnostic only: an inverted range is reported on stderr but still written out
+ if( leftI > rightI )
+ {
+ cerr << "oh crahpey!\n";
+ cerr << "leftI: " << leftI << endl;
+ cerr << "rightI: " << rightI << endl;
+ cerr << "seqI: " << seqI << endl;
+ cerr << "ivI: " << ivI << endl;
+ }
+ // clamp to the ends of the interval in this sequence
+ if( leftI == 0 )
+ leftI = iv_cga.LeftEnd(seqI);
+ if( rightI == iv_cga.RightEnd(seqI)+1 )
+ rightI--;
+ // reverse orientation is encoded by negative coordinates
+ if( iv_cga.Orientation(seqI) == AbstractMatch::reverse )
+ {
+ leftI *= -1;
+ rightI *= -1;
+ }
+ bb_out << leftI << '\t' << rightI;
+ }
+ }
+ bb_out << endl;
+ }
+ }
+}
+
+
+} // namespace mems
+
diff --git a/libMems/Backbone.h b/libMems/Backbone.h
new file mode 100644
index 0000000..878eb8e
--- /dev/null
+++ b/libMems/Backbone.h
@@ -0,0 +1,240 @@
+/*******************************************************************************
+ * $Id: Backbone.h,v 1.7 2004/03/01 02:40:08 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifndef __Backbone_h__
+#define __Backbone_h__
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libGenome/gnSequence.h"
+#include "libMems/SubstitutionMatrix.h"
+#include "libMems/IntervalList.h"
+#include "libMems/NumericMatrix.h"
+#include "libMems/MatchList.h"
+#include "libMems/GappedAlignment.h"
+#include "libMems/CompactGappedAlignment.h"
+#include "libMems/Aligner.h"
+#include "libMems/Islands.h"
+#include <boost/multi_array.hpp>
+
+#include <sstream>
+#include <vector>
+
+namespace mems {
+
+/** The ungapped match type used to describe a backbone segment */
+typedef mems::UngappedLocalAlignment< mems::HybridAbstractMatch<> > ULA;
+/** Backbone segments: the outer index is the interval, the inner index the segment within it */
+typedef std::vector< std::vector< ULA* > > backbone_list_t;
+// indexed by seqI, seqJ, ivI, hssI (left col, right col)
+typedef boost::multi_array< std::vector< std::pair< size_t, size_t > >, 3 > pairwise_genome_hss_t;
+
+// defined further down in this header
+class HssDetector;
+
+/** compute the GC content of a set of sequences */
+double computeGC( std::vector< genome::gnSequence* >& seq_table );
+
+/**
+ * collapse Intervals that are trivially collinear with each other
+ */
+void collapseCollinear( IntervalList& iv_list );
+
+/**
+ * sanity checks for alignment columns that contain only gaps
+ */
+void checkForAllGapColumns( IntervalList& iv_list );
+
+/**
+ * Applies pairwise transitive homology statistics to detect backbone in a single collinear alignment
+ * Unaligns any regions found to be non-homologous, returns coordinates of the homologous segments in bb_list
+ * @param m The input match in which homology detection will be applied
+ * @param seq_table A sequence table with one gnSequence pointer per match component
+ * @param result (output) A newly allocated CompactGappedAlignment that contains the resulting alignment of
+ * homologous sequence. It is the caller's responsibility to free the memory using AbstractMatch::Free()
+ * @param bb_list (output) A list of homologous segments among each component of the output match
+ * @param left_homologous Set to true if the detection code should assume that sequence beyond the left-most alignment
+ * column is homologous sequence
+ * @param right_homologous Set to true if the detection code should assume that sequence beyond the right-most alignment
+ * column is homologous sequence
+ */
+void detectAndApplyBackbone( AbstractMatch* m, std::vector< genome::gnSequence* >& seq_table, CompactGappedAlignment<>*& result, backbone_list_t& bb_list, const Params& hmm_params, boolean left_homologous = false, boolean right_homologous = false );
+
+/**
+ * Applies pairwise transitive homology statistics to detect backbone in a genome alignment
+ * Unaligns any regions found to be non-homologous, returns coordinates of the homologous segments in bb_list
+ */
+void detectAndApplyBackbone( IntervalList& iv_list, backbone_list_t& bb_list, const Params& hmm_params );
+
+/**
+ * Simply detects backbone using the particular algorithm implemented by HssDetector
+ */
+void detectBackbone( IntervalList& iv_list, backbone_list_t& bb_list, const HssDetector* detector );
+
+/**
+ * Writes a backbone column file. This file type gets used by the Mauve GUI.
+ */
+void writeBackboneColumns( std::ostream& bb_out, backbone_list_t& bb_list );
+
+/**
+ * Writes a backbone sequence coordinate file. This file type is easier to analyze with statistical packages.
+ */
+void writeBackboneSeqCoordinates( backbone_list_t& bb_list, IntervalList& iv_list, std::ostream& bb_out );
+
+/**
+ * Interface for the algorithms that detect segments within a list of alignments.
+ * Implementations read iv_list and seq_table and report their findings in hss_array.
+ */
+class HssDetector
+{
+public:
+	typedef std::vector< CompactGappedAlignment<>* > MatchListType;
+	/** Virtual so that a detector can be destroyed safely through an HssDetector pointer */
+	virtual ~HssDetector() {}
+	virtual void operator() (
+		const MatchListType& iv_list,
+		std::vector< genome::gnSequence* >& seq_table,
+		hss_array_t& hss_array ) const = 0;
+};
+
+/**
+ * HssDetector that forwards to findHssHomologyHMM().
+ * Only a reference to the HMM parameters is stored, so the Params object
+ * has to outlive the detector.
+ */
+class HomologyHmmDetector : public HssDetector
+{
+public:
+	HomologyHmmDetector( const Params& hmm_params, bool left_homologous, bool right_homologous )
+		: m_params( hmm_params ),
+		  m_left_homologous( left_homologous ),
+		  m_right_homologous( right_homologous )
+	{}
+
+	virtual void operator() (
+		const MatchListType& iv_list,
+		std::vector< genome::gnSequence* >& seq_table,
+		hss_array_t& hss_array ) const
+	{
+		findHssHomologyHMM( iv_list, seq_table, hss_array, m_params, m_left_homologous, m_right_homologous );
+	}
+
+private:
+	const Params& m_params;		// not owned
+	bool m_left_homologous;
+	bool m_right_homologous;
+};
+
+/**
+ * HssDetector that reports the columns which are free of big gaps: findBigGaps()
+ * locates the gaps and HssColsToIslandCols() turns them into the regions in between.
+ */
+class BigGapsDetector : public HssDetector
+{
+public:
+	BigGapsDetector( size_t big_gap_size )
+		: m_big_gap_size( big_gap_size )
+	{}
+
+	virtual void operator() (
+		const MatchListType& iv_list,
+		std::vector< genome::gnSequence* >& seq_table,
+		hss_array_t& hss_array ) const
+	{
+		// locate the big gaps first...
+		hss_array_t gap_array;
+		findBigGaps( iv_list, seq_table, gap_array, m_big_gap_size );
+		// ...we want the cols that represent regions without big gaps
+		HssColsToIslandCols( iv_list, seq_table, gap_array, hss_array );
+	}
+
+private:
+	size_t m_big_gap_size;
+};
+
+
+
+/**
+ * One backbone segment in sequence coordinates: a (left end, right end) pair per sequence.
+ * The code in this library treats a left end of 0 as "not present in this sequence".
+ */
+typedef std::vector< std::pair< int64, int64 > > bb_seqentry_t;
+/** A backbone segment in both coordinate systems together with the index of its interval */
+typedef struct bb_entry_s
+{
+ bb_seqentry_t bb_seq;
+ ULA bb_cols;
+ size_t iv;
+} bb_entry_t;
+
+/** Appends entries for the stretches longer than min_length between neighboring backbone entries */
+void addUniqueSegments( std::vector< bb_seqentry_t >& bb_seq_list, size_t min_length = 20 );
+/** Fuses backbone entries that are directly adjacent in all of their sequences */
+void mergeAdjacentSegments( std::vector< bb_seqentry_t >& bb_seq_list );
+
+/**
+ * Strict weak ordering for bb_seqentry_t that compares the absolute value of the
+ * left end in one chosen sequence.
+ */
+class BbSeqEntrySorter
+{
+public:
+	/** @param seqI	The sequence whose left end coordinate is compared */
+	BbSeqEntrySorter( size_t seqI ) : m_seq( seqI ) {}
+	// const, so the comparator also works where the standard library holds it as a const object
+	bool operator()( const bb_seqentry_t& a, const bb_seqentry_t& b ) const
+	{
+		return genome::absolut(a[m_seq].first) < genome::absolut(b[m_seq].first);
+	}
+	size_t m_seq;
+};
+
+/** Prints a backbone entry as tab-separated "(left, right)" pairs, one per sequence */
+inline
+void printBbSeq( std::ostream& os, const bb_seqentry_t& bbseq )
+{
+	const char* separator = "";
+	for( bb_seqentry_t::const_iterator cur = bbseq.begin(); cur != bbseq.end(); ++cur )
+	{
+		os << separator << "(" << cur->first << ", " << cur->second << ")";
+		separator = "\t";
+	}
+}
+
+/**
+ * Reads a backbone sequence coordinate file.  The first line is treated as a header
+ * and discarded.  Every following line is parsed as a series of left/right coordinate
+ * pairs and appended to backbone; a missing right coordinate is read as 0.
+ * Lines that do not yield a single coordinate pair (e.g. a trailing blank line) are skipped.
+ */
+inline
+void readBackboneSeqFile( std::istream& bbseq_input, std::vector< bb_seqentry_t >& backbone )
+{
+	std::string cur_line;
+	std::getline( bbseq_input, cur_line );	// read off the header line
+	while( std::getline( bbseq_input, cur_line ) )
+	{
+		bb_seqentry_t bb;
+		std::stringstream line_str( cur_line );
+		int64 lpos = 0;
+		while( line_str >> lpos )
+		{
+			int64 rpos = 0;
+			line_str >> rpos;
+			bb.push_back( std::make_pair( lpos, rpos ) );
+		}
+		// an entry without any component would be indexed out of bounds by code
+		// that assumes one pair per sequence, so do not store it
+		if( bb.empty() )
+			continue;
+		backbone.push_back(bb);
+	}
+}
+
+/**
+ * Writes a backbone sequence coordinate file: a header line naming a seqN_leftend and
+ * seqN_rightend column for every component of the first entry, then one tab-delimited
+ * line of left/right coordinates per backbone entry.  Writes nothing for an empty list.
+ */
+inline
+void writeBackboneSeqFile( std::ostream& bbseq_out, std::vector< bb_seqentry_t >& backbone )
+{
+	if(backbone.size()==0)
+		return;	// can't write if there's no backbone!
+	for( size_t seqI = 0; seqI < backbone[0].size(); seqI++ )
+	{
+		if( seqI > 0 )
+			bbseq_out << '\t';
+		// qualified with std:: so that this header does not rely on a using-directive
+		// in whatever file happens to include it
+		std::stringstream ss;
+		ss << "seq" << seqI;
+		bbseq_out << ss.str() << "_leftend\t" << ss.str() << "_rightend";
+	}
+	bbseq_out << std::endl;
+	for( size_t bbI = 0; bbI < backbone.size(); bbI++ )
+	{
+		for( size_t seqI = 0; seqI < backbone[bbI].size(); seqI++ )
+		{
+			if( seqI > 0 )
+				bbseq_out << '\t';
+			bbseq_out << backbone[bbI][seqI].first << '\t' << backbone[bbI][seqI].second;
+		}
+		bbseq_out << std::endl;
+	}
+}
+
+/**
+ * Reads a backbone column file.  Each line holds an interval index, a left column and
+ * a length, followed by the indices of the sequences that are part of the segment.
+ * One (interval index, ULA) pair is appended to bb_list per line.
+ * Lines that do not start with those three numbers are skipped.
+ */
+inline
+void readBackboneColsFile( std::istream& bbcol_input, std::vector< std::pair< size_t, ULA > >& bb_list )
+{
+	std::string cur_line;
+	while( std::getline( bbcol_input, cur_line ) )
+	{
+		ULA tmp_ula;
+		size_t ivI = 0;
+		size_t left_col = 0;
+		size_t len = 0;
+		std::stringstream ss( cur_line );
+		// without this check a blank or damaged line would store an entry built
+		// from values that were never read
+		if( !( ss >> ivI >> left_col >> len ) )
+			continue;
+		gnSeqI bbseq;
+		while( ss >> bbseq )
+		{
+			tmp_ula.SetStart( bbseq, left_col );
+		}
+		tmp_ula.SetLength( len );
+		bb_list.push_back( std::make_pair( ivI, tmp_ula ) );
+	}
+}
+
+void makeAllPairwiseGenomeHSS( IntervalList& iv_list, std::vector< CompactGappedAlignment<>* >& iv_ptrs, std::vector< CompactGappedAlignment<>* >& iv_orig_ptrs, pairwise_genome_hss_t& hss_cols, const HssDetector* detector );
+void mergePairwiseHomologyPredictions( std::vector< CompactGappedAlignment<>* >& iv_orig_ptrs, pairwise_genome_hss_t& hss_cols, std::vector< std::vector< ULA* > >& ula_list );
+
+
+}
+
+#endif // __Backbone_h__
+
diff --git a/libMems/ClustalInterface.cpp b/libMems/ClustalInterface.cpp
new file mode 100644
index 0000000..ad2b14a
--- /dev/null
+++ b/libMems/ClustalInterface.cpp
@@ -0,0 +1,576 @@
+/*******************************************************************************
+ * $Id: ClustalInterface.cpp,v 1.27 2004/03/01 02:40:08 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/ClustalInterface.h"
+#include <sstream>
+#include "libGenome/gnFilter.h"
+
+#include <fstream>
+
+extern "C" {
+#include "libClustalW/clustalw.h"
+
+extern sint max_names;
+extern Boolean usemenu, dnaflag, explicit_dnaflag;
+extern Boolean interactive;
+extern char *seqname;
+extern sint nseqs;
+extern sint *seqlen_array;
+extern char **names,**titles;
+extern char **seq_array;
+//extern Boolean profile1_empty, profile2_empty;
+extern sint max_aln_length;
+//extern char *gap_penalty_mask, *sec_struct_mask;
+//extern sint struct_penalties;
+extern float gap_open, gap_extend;
+extern float dna_gap_open, dna_gap_extend;
+//extern char *gap_penalty_mask1,*gap_penalty_mask2;
+//extern char *sec_struct_mask1,*sec_struct_mask2;
+//extern sint struct_penalties1,struct_penalties2;
+//extern char *ss_name1,*ss_name2;
+extern float pw_go_penalty, pw_ge_penalty;
+extern float dna_pw_go_penalty, dna_pw_ge_penalty;
+//extern sint wind_gap,ktup,window,signif;
+//extern sint dna_wind_gap, dna_ktup, dna_window, dna_signif;
+
+extern Boolean output_clustal, output_nbrf, output_phylip, output_gcg, output_gde, output_nexus;
+extern FILE *clustal_outfile, *gcg_outfile, *nbrf_outfile, *phylip_outfile, *nexus_outfile;
+//extern char clustal_outname[FILENAMELEN+1], gcg_outname[FILENAMELEN+1];
+extern char* amino_acid_codes;
+extern sint max_aa;
+
+//extern short blosum45mt[];
+//extern short def_aa_xref[];
+extern sint gap_pos1;
+extern double** tmat;
+
+extern Boolean use_endgaps;
+extern Boolean endgappenalties;
+
+extern sint output_order;
+extern Boolean no_weights;
+
+}
+
+using namespace std;
+using namespace genome;
+namespace mems {
+
+/**
+ * When performing progressive alignment, clustalW misaligns the first sequence for
+ * some reason. define MISALIGNMENT_WORKAROUND to enable a workaround for this bug.
+ * The workaround adds an additional copy of the first sequence to each alignment
+ * then removes the misaligned copy of the first sequence.
+ */
+#define MISALIGNMENT_WORKAROUND
+
+lint get_aln_score(void);
+
+
+ClustalInterface& ClustalInterface::getClustalInterface()
+{
+    // function-local static: one object, constructed on the first call and handed out to every caller
+    static ClustalInterface shared_instance;
+    return shared_instance;
+}
+
+ClustalInterface::ClustalInterface(){ // sets this object's defaults and the ClustalW globals it relies on
+ // some defaults can't hurt
+ max_alignment_length = 10000;
+ min_flank_size = 3;
+ clustal_score_cutoff = 0;
+
+ // shut off end gaps...
+ use_endgaps = FALSE;
+ // enable end gap penalties
+ endgappenalties = TRUE;
+ // force same input/output order
+ output_order = INPUT;
+ no_weights = FALSE; // TRUE;
+
+ // initialisation routines; presumably provided by libClustalW (their bodies are not visible here)
+ init_amenu();
+ init_interface();
+ init_matrix();
+
+ fill_chartab();
+ allocated_aln = false;
+ // NOTE(review): the settings below repeat the ones above; presumably the init_* calls can reset them -- confirm
+ // shut off end gaps...
+ use_endgaps = FALSE;
+ // enable end gap penalties
+ endgappenalties = TRUE;
+}
+
+ClustalInterface& ClustalInterface::operator=( const ClustalInterface& ci )
+{
+    GappedAligner::operator=( ci );     // base-class part is assigned first
+    this->min_flank_size       = ci.min_flank_size;
+    this->clustal_score_cutoff = ci.clustal_score_cutoff;
+    this->distance_matrix      = ci.distance_matrix;
+    this->allocated_aln        = ci.allocated_aln;
+    return *this;
+}
+
+void ClustalInterface::SetDistanceMatrix( NumericMatrix< double >& distance_matrix, string& tree_filename ){
+ SetDistanceMatrix( distance_matrix, tree_filename, false ); // forwards to the three-argument overload with reread_tree = false
+}
+
+// Loads distance_matrix into ClustalW's tmat and has ClustalW write a guide tree to tree_filename.
+// With reread_tree set, the matrix is also stored in this object and the tree file is read back in.
+void ClustalInterface::SetDistanceMatrix( NumericMatrix< double >& distance_matrix, string& tree_filename, boolean reread_tree ){
+ char* phylip_name;
+ uint seqI, seqJ;
+#ifdef MISALIGNMENT_WORKAROUND
+ if( reread_tree == false ){
+ // build an (n+1)x(n+1) matrix in which row/column 0 is an extra copy of input sequence 0
+ NumericMatrix< double > dist_plus_matrix( distance_matrix.cols() + 1, distance_matrix.cols() + 1 );
+ for( seqI = 0; seqI < dist_plus_matrix.cols(); seqI++ ){
+ for( seqJ = 0; seqJ < dist_plus_matrix.cols(); seqJ++ ){
+ double new_val = 0;
+ if( seqI == 0 ){
+ if( seqJ == 0 )
+ new_val = 0;
+ else
+ new_val = distance_matrix( seqI, seqJ - 1 );
+ }else{
+ if( seqJ == 0 )
+ new_val = distance_matrix( seqI - 1, seqJ );
+ else
+ new_val = distance_matrix( seqI - 1, seqJ - 1 );
+ }
+ dist_plus_matrix( seqI, seqJ ) = new_val;
+ }
+ }
+ SetDistanceMatrix( dist_plus_matrix, tree_filename, true );
+ // NOTE(review): execution continues below with the original n x n matrix -- confirm this is intended
+ }
+#else
+ reread_tree = true;
+#endif
+ if( reread_tree )
+ this->distance_matrix = distance_matrix;
+ free_aln( nseqs );
+ nseqs = distance_matrix.cols();
+ alloc_aln( nseqs );
+ allocated_aln = true;
+
+ for( seqI = 1; seqI <= distance_matrix.cols(); seqI++ ){
+ ostringstream ss;
+ ss << "seq" << seqI;
+ int namelen = MAXNAMES < ss.str().size() ? MAXNAMES : ss.str().size();
+ strncpy( names[ seqI ], ss.str().c_str(), namelen); /* " " name */
+ strncpy( titles[ seqI ], ss.str().c_str(), namelen); /* " " title */
+ alloc_seq( seqI, 1 );
+ // set max_names and max_aln_length
+ if( strlen( names[ seqI ] ) > max_names )
+ max_names = strlen( names[ seqI ] );
+ }
+ // copy phylo tree name
+ phylip_name = (char * ) ckalloc( tree_filename.length() + 1);
+ strcpy( phylip_name, tree_filename.c_str() );
+ // copy tmat entries
+ // NOTE(review): setGuideTree() and CallClustal() store 1 - distance_matrix in tmat; this stores the raw value -- confirm
+ for( seqI = 0; seqI < nseqs; seqI++ )
+ for( uint seqJ = 0; seqJ < nseqs; seqJ++ )
+ tmat[ seqI + 1][ seqJ + 1 ] = distance_matrix( seqI, seqJ );
+
+ FILE* tree;
+ if((tree = open_explicit_file( phylip_name ))==NULL){ phylip_name = (char*)ckfree( phylip_name ); return; } // free the name buffer on early exit
+ if (nseqs >= 2) {
+ guide_tree(tree,1,nseqs);
+ }
+// read the tree back in
+ if( reread_tree )
+ read_tree(phylip_name, (sint)0, nseqs); // return value is not checked here
+ phylip_name = (char*)ckfree( phylip_name );
+ allocated_aln = false;
+}
+
+// tries to read in a guide tree from a particular file,
+// throws an exception if it doesn't work out
+void ClustalInterface::setGuideTree( string& tree_filename, NumericMatrix< double >& dist_mat, uint seq_count ){
+#ifdef MISALIGNMENT_WORKAROUND
+ seq_count++; // the workaround aligns one extra copy of the first sequence (see Align)
+#endif
+ distance_matrix = dist_mat;
+ // check whether the file exists
+ ifstream guide_file( tree_filename.c_str() );
+ if( guide_file.is_open() )
+ guide_file.close(); // success
+ else
+ throw( "Unable to open guide tree file" ); // thrown object is a string literal (const char*)
+
+ char* phylip_name;
+ uint seqI;
+
+ free_aln( nseqs );
+ nseqs = seq_count;
+ alloc_aln( nseqs );
+ allocated_aln = true;
+
+ for( seqI = 1; seqI <= seq_count; seqI++ ){ // ClustalW arrays are indexed from 1 here
+ ostringstream ss;
+ ss << "seq" << seqI;
+ int namelen = MAXNAMES < ss.str().size() ? MAXNAMES : ss.str().size();
+ strncpy( names[ seqI ], ss.str().c_str(), namelen); /* " " name */
+ strncpy( titles[ seqI ], ss.str().c_str(), namelen); /* " " title */
+
+ alloc_seq( seqI, 1 );
+ // set max_names and max_aln_length
+ if( strlen( names[ seqI ] ) > max_names )
+ max_names = strlen( names[ seqI ] );
+ }
+ // NOTE(review): with the workaround nseqs is seq_count + 1; assumes dist_mat has at least nseqs rows/cols -- confirm
+ // copy tmat entries
+ for( seqI = 0; seqI < nseqs; seqI++ )
+ for( uint seqJ = 0; seqJ < nseqs; seqJ++ )
+ tmat[ seqI + 1][ seqJ + 1 ] = 1 - distance_matrix( seqI, seqJ );
+
+ // copy phylo tree name
+ phylip_name = (char * ) ckalloc( tree_filename.length() + 1);
+ strcpy( phylip_name, tree_filename.c_str() );
+ int success = read_tree(phylip_name, (sint)0, nseqs);
+ phylip_name = (char*)ckfree( phylip_name );
+ allocated_aln = false;
+ if( !success ) // a zero result from read_tree is treated as failure
+ throw "Error loading guide tree\n";
+}
+
+// Aligns the region between r_begin and r_end in each sequence with ClustalW and stores the result in cr; returns false when fewer than two sequences are alignable, ClustalW fails, or a std::exception is caught.
+boolean ClustalInterface::Align( GappedAlignment& cr, Match* r_begin, Match* r_end, vector< gnSequence* >& seq_table ){
+ gnSeqI gap_size = 0;
+ boolean create_ok = true;
+ uint seq_count = seq_table.size();
+ uint seqI;
+ uint align_seqs = 0;
+//
+// get the size of the largest intervening gap
+// also do some sanity checking while we're at it.
+//
+try{
+ for( seqI = 0; seqI < seq_count; seqI++ ){
+ int64 gap_start = 0;
+ int64 gap_end = 0;
+ create_ok = getInterveningCoordinates( seq_table, r_begin, r_end, seqI, gap_start, gap_end );
+ // skip this sequence if it's undefined
+ if( gap_start == NO_MATCH || gap_end == NO_MATCH )
+ continue;
+ if( !create_ok )
+ break;
+
+ int64 diff = gap_end - gap_start;
+ if( diff <= 0 ){
+ continue; // can't align nothing
+ }
+ if( diff > max_alignment_length ){
+ cout << "gap from " << gap_start << " to " << gap_end << " is too big for ClustalW\n";
+ continue; // can't align if it's too big
+ }
+ gap_size = diff < gap_size ? gap_size : diff;
+ align_seqs++;
+ }
+
+ if( align_seqs <= 1 )
+ create_ok = false;
+//
+// Get the sequence in the intervening gaps between these two matches
+// Include a flank of matching sequence on either side
+//
+ vector< string > seq_data;
+ vector< int64 > starts;
+ gnSeqI left_flank, right_flank;
+ const gnFilter* rc_filter = gnFilter::DNAComplementFilter();
+
+ if( create_ok ){
+// left_flank = min( r_begin->Length(), max( gap_size, min_flank_size ) );
+// right_flank = min( r_end->Length(), max( gap_size, min_flank_size ) );
+ left_flank = 0;
+ right_flank = 0;
+
+ for( seqI = 0; seqI < seq_count; seqI++ ){
+ // cheap hack to avoid mysterious clustalW misalignment
+#ifdef MISALIGNMENT_WORKAROUND
+ if( seqI == 1 )
+ seq_data.push_back( seq_data[ 0 ] );
+#endif
+ // skip this sequence if it's undefined
+ if( (r_end != NULL && r_end->Start( seqI ) == NO_MATCH ) ||
+ (r_begin != NULL && r_begin->Start( seqI ) == NO_MATCH) ){
+ starts.push_back( NO_MATCH );
+ seq_data.push_back( "" );
+ continue;
+ }
+
+ // determine the size of the gap
+ int64 gap_start = 0;
+ int64 gap_end = 0;
+ getInterveningCoordinates( seq_table, r_begin, r_end, seqI, gap_start, gap_end );
+
+ int64 diff = gap_end - gap_start;
+ if( diff <= 0 || diff > max_alignment_length ){
+ starts.push_back( NO_MATCH );
+ seq_data.push_back( "" );
+ continue;
+ }
+ // calculate flank size and extract sequence data
+ if( r_end == NULL || r_end->Start( seqI ) > 0 ){
+ starts.push_back( gap_start );
+ seq_data.push_back( seq_table[ seqI ]->ToString( left_flank + diff + right_flank, gap_start - left_flank ) );
+ }else{
+ // reverse complement the sequence data.
+ starts.push_back( -gap_start );
+ string cur_seq_data = seq_table[ seqI ]->ToString( left_flank + diff + right_flank, gap_start - right_flank );
+ rc_filter->ReverseFilter( cur_seq_data );
+ seq_data.push_back( cur_seq_data );
+ }
+ }
+ }
+
+ if( create_ok ){
+ if( !CallClustal( seq_data ) ){
+ cout << "Clustal was unable to align:\n";
+ if( r_begin != NULL ) cout << "Left match: " << *r_begin << endl; // either match may be NULL (see the checks above)
+ if( r_end != NULL ) cout << "Right match: " << *r_end << endl;
+ return false;
+ }
+
+ // ensure that the flanks were successfully aligned
+ boolean good_alignment = true;
+ // good_alignment is never cleared, so the failure branch below is currently unreachable
+ gnSeqI align_length=0;
+ for( seqI = 1; seqI <= seq_count; seqI++ )
+ if( align_length < ( seqlen_array[seqI] < 0 ? 0 : (gnSeqI)seqlen_array[seqI] ))
+ align_length = seqlen_array[seqI];
+
+ if( !good_alignment ){
+ // just align without the flanking regions for now??
+ return false;
+ }else{
+ // now extract the alignment from clustal's global variables
+ cr = GappedAlignment( seq_count, align_length );
+ vector< string > align_array;
+ int64 last_residue = -1; // tracks the right-most residue in the alignment
+ int64 first_residue = align_length + 2; // tracks the left-most residue in the alignment
+#ifdef MISALIGNMENT_WORKAROUND
+ for( seqI = 2; seqI <= seq_count + 1; seqI++ ){
+#else
+ for( seqI = 1; seqI <= seq_count; seqI++ ){
+#endif
+ string new_seq = string( seqlen_array[ seqI ] - left_flank - right_flank, '-' );
+ uint new_seq_charI = 0;
+ uint cur_seq_len = 0;
+ for( uint charJ = left_flank + 1; charJ <= seqlen_array[ seqI ] - right_flank; charJ++ ){
+ char val = seq_array[ seqI ][ charJ ];
+ if( val >= 0 && val <= max_aa ){
+ if( charJ > last_residue )
+ last_residue = charJ;
+ if( charJ < first_residue )
+ first_residue = charJ;
+ new_seq[ new_seq_charI ]= amino_acid_codes[ val ];
+ cur_seq_len++;
+ }
+ new_seq_charI++;
+ }
+ align_array.push_back( new_seq );
+// cerr << "new_seq.size() is: " << new_seq.size() << endl;
+#ifdef MISALIGNMENT_WORKAROUND
+ cr.SetStart( seqI - 2, starts[ seqI - 2 ] );
+ cr.SetLength( cur_seq_len, seqI - 2 );
+#else
+ cr.SetStart( seqI - 1, starts[ seqI - 1 ] );
+ cr.SetLength( cur_seq_len, seqI - 1 );
+#endif
+ }
+ int64 end_gap_count = align_array[ 0 ].size() - (last_residue - left_flank);
+ if( last_residue != -1 && end_gap_count > 0 ){
+ for( seqI = 0; seqI < align_array.size(); seqI++ ){
+ align_array[ seqI ] = align_array[ seqI ].substr( 0, align_array[ seqI ].size() - end_gap_count );
+ }
+ }
+ int64 start_gap_count = left_flank + 1 - first_residue;
+ if( first_residue != align_length && start_gap_count > 0 ){
+ for( seqI = 0; seqI < align_array.size(); seqI++ ){
+ align_array[ seqI ] = align_array[ seqI ].substr( start_gap_count, align_array[ seqI ].size() - start_gap_count );
+ }
+ }
+ cr.SetAlignment( align_array );
+ }
+ return true;
+ }
+}catch(exception& e){
+ cerr << "At: " << __FILE__ << ":" << __LINE__ << endl;
+ cerr << e.what();
+}
+ return false;
+}
+
+
+// Copies seq_table into ClustalW's global arrays and runs the multiple alignment; returns true only when ClustalW's result is > 0.
+boolean ClustalInterface::CallClustal( vector< string >& seq_table ){
+ char* phylip_name = NULL; // only allocated when a guide tree has to be inferred
+// if( allocated_aln )
+ free_aln( nseqs );
+ alloc_aln( seq_table.size() );
+ allocated_aln = true;
+
+ if( distance_matrix.cols() == seq_table.size() ){
+ // copy tmat entries
+ for( uint seqI = 0; seqI < nseqs; seqI++ ) // NOTE(review): nseqs still holds its previous value here; it is reassigned further down -- confirm
+ for( uint seqJ = 0; seqJ < nseqs; seqJ++ )
+ tmat[ seqI + 1][ seqJ + 1 ] = 1 - distance_matrix( seqI, seqJ );
+ }else{
+ // prepare to infer a phylo tree
+ phylip_name = (char * ) ckalloc( strlen( "tmp_tree.txt" ) + 1);
+ strcpy( phylip_name, "tmp_tree.txt" );
+ }
+
+ uint seqI;
+ max_aln_length = 0;
+ max_names = 0;
+ for( seqI = 1; seqI <= seq_table.size(); seqI++ ){
+ seqlen_array[ seqI ] = seq_table[ seqI - 1 ].length(); /* store the length */
+ ostringstream ss;
+ ss << "seq" << seqI;
+ int namelen = ss.str().size();
+ names[ seqI ] = (char * ) ckalloc( namelen + 1 );
+ titles[ seqI ] = (char * ) ckalloc( namelen + 1 );
+ strcpy( names[ seqI ], ss.str().c_str()); /* " " name */
+ strcpy( titles[ seqI ], ss.str().c_str()); /* " " title */
+
+ // set max_names and max_aln_length
+ if( (int)strlen( names[ seqI ] ) > max_names )
+ max_names = strlen( names[ seqI ] );
+ if( seqlen_array[ seqI ] > max_aln_length )
+ max_aln_length = seqlen_array[ seqI ];
+ }
+
+ for( seqI = 1; seqI <= seq_table.size(); seqI++ ){
+
+ alloc_seq( seqI, max_aln_length );
+ char* seq_char_array = new char[ seq_table[ seqI - 1 ].length() + 2];
+ uint copyI = 0;
+ string& dna_seq = seq_table[ seqI - 1 ];
+ for( ; copyI < dna_seq.length(); copyI++ )
+ seq_char_array[ copyI + 1 ] = toupper( dna_seq[ copyI ] );
+ seq_char_array[ 0 ] = '-'; // silly clustal ignores the first character.
+ seq_char_array[ copyI + 1 ] = 0;
+ n_encode( seq_char_array, seq_array[ seqI ], dna_seq.length() );
+ delete[] seq_char_array;
+ }
+ max_aln_length *= 2;
+
+/* struct_penalties1 = struct_penalties2 = NONE;
+ if (sec_struct_mask1 != NULL) sec_struct_mask1=( char* )ckfree(sec_struct_mask1);
+ if (sec_struct_mask2 != NULL) sec_struct_mask2=( char* )ckfree(sec_struct_mask2);
+ if (gap_penalty_mask1 != NULL) gap_penalty_mask1=( char* )ckfree(gap_penalty_mask1);
+ if (gap_penalty_mask2 != NULL) gap_penalty_mask2=( char* )ckfree(gap_penalty_mask2);
+ if (ss_name1 != NULL) ss_name1=( char* )ckfree(ss_name1);
+ if (ss_name2 != NULL) ss_name2=( char* )ckfree(ss_name2);
+*/
+ nseqs = seq_table.size();
+ gap_open = dna_gap_open;
+ gap_extend = dna_gap_extend;
+ pw_go_penalty = dna_pw_go_penalty;
+ pw_ge_penalty = dna_pw_ge_penalty;
+/* ktup = dna_ktup;
+ window = dna_window;
+ signif = dna_signif;
+ wind_gap = dna_wind_gap;
+*/ dnaflag = TRUE;
+ output_clustal = FALSE;
+
+ int retval = 0;
+ if( distance_matrix.cols() == seq_table.size() ){
+// char* dump_file = "clustalout.txt";
+// output_clustal = TRUE;
+// if((clustal_outfile = open_explicit_file( dump_file ))==NULL) return false;
+
+ retval = malign_nofiles( 0, false );
+// create_alignment_output( 1, nseqs );
+// fclose( clustal_outfile ); // this is done by the clustal output function
+
+ }else{
+ pairalign((sint)0,nseqs,(sint)0,nseqs);
+
+ FILE* tree;
+ if((tree = open_explicit_file( phylip_name ))==NULL){ phylip_name = (char*)ckfree( phylip_name ); return false; } // free the name buffer on early exit
+ if (nseqs >= 2) {
+ guide_tree(tree,1,nseqs);
+ }
+
+// char* dump_file = "clustalout.txt";
+// if((clustal_outfile = open_explicit_file( dump_file ))==NULL) return false;
+
+ retval = malign( 0, phylip_name );
+
+ phylip_name = (char*)ckfree( phylip_name );
+ // fclose( clustal_outfile ); // this is done by the clustal output function
+ }
+
+ if( retval <= 0 )
+ return false;
+ return true;
+
+}
+
+/*
+lint get_aln_score(void)
+{
+ static short *mat_xref, *matptr;
+ static sint maxres;
+ static sint s1,s2,c1,c2;
+ static sint ngaps;
+ static sint i,l1,l2;
+ static lint score;
+ static sint matrix[NUMRES][NUMRES];
+
+
+ matptr = blosum45mt;
+ mat_xref = def_aa_xref;
+ maxres = get_matrix(matptr, mat_xref, matrix, TRUE, 100);
+ if (maxres == 0)
+ {
+ fprintf(stdout,"Error: matrix blosum30 not found\n");
+ return -1;
+ }
+
+ score=0;
+ for (s1=1;s1<=nseqs;s1++)
+ {
+ for (s2=1;s2<s1;s2++)
+ {
+
+ l1 = seqlen_array[s1];
+ l2 = seqlen_array[s2];
+ for (i=1;i<l1 && i<l2;i++)
+ {
+ c1 = seq_array[s1][i];
+ c2 = seq_array[s2][i];
+ if ((c1>=0) && (c1<=max_aa) && (c2>=0) && (c2<=max_aa))
+ score += matrix[c1][c2];
+ }
+
+ ngaps = count_gaps(s1, s2, l1);
+
+ score -= (int)(100 * gap_open * ngaps);
+
+ }
+ }
+
+ score /= 100;
+
+ return score;
+}
+*/
+
+}
diff --git a/libMems/ClustalInterface.h b/libMems/ClustalInterface.h
new file mode 100644
index 0000000..b0a6c1e
--- /dev/null
+++ b/libMems/ClustalInterface.h
@@ -0,0 +1,101 @@
+/*******************************************************************************
+ * $Id: ClustalInterface.h,v 1.12 2004/04/19 23:10:50 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifndef _ClustalInterface_h_
+#define _ClustalInterface_h_
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/NumericMatrix.h"
+#include "libGenome/gnFilter.h"
+#include "libGenome/gnSequence.h"
+#include "libMems/GappedAlignment.h"
+#include "libMems/GappedAligner.h"
+
+// attempt to auto-link the ClustalW library on windows
+#if defined(WIN64)&&defined(NDEBUG)&&!defined(FASTDEBUG)&&defined(_OPENMP)
+#pragma comment(lib, "ClustalW64omp.lib")
+#endif
+#if defined(WIN64)&&defined(FASTDEBUG)&&defined(_OPENMP)
+#pragma comment(lib, "ClustalW64fdomp.lib")
+#endif
+#if defined(WIN32)&&!defined(WIN64)&&defined(NDEBUG)&&!defined(FASTDEBUG)&&defined(_OPENMP)
+#pragma comment(lib, "ClustalWomp.lib")
+#endif
+#if defined(WIN32)&&!defined(WIN64)&&defined(FASTDEBUG)&&defined(_OPENMP)
+#pragma comment(lib, "ClustalWfdomp.lib")
+#endif
+#if defined(WIN64)&&defined(NDEBUG)&&!defined(FASTDEBUG)&&!defined(_OPENMP)
+#pragma comment(lib, "ClustalW64.lib")
+#endif
+#if defined(WIN64)&&defined(FASTDEBUG)&&!defined(_OPENMP)
+#pragma comment(lib, "ClustalW64fd.lib")
+#endif
+#if defined(WIN32)&&!defined(WIN64)&&defined(NDEBUG)&&!defined(FASTDEBUG)&&!defined(_OPENMP)
+#pragma comment(lib, "ClustalW.lib")
+#endif
+#if defined(WIN32)&&!defined(WIN64)&&defined(FASTDEBUG)&&!defined(_OPENMP)
+#pragma comment(lib, "ClustalWfd.lib")
+#endif
+
+
+namespace mems {
+
+class ClustalInterface : public GappedAligner {
+public:
+ /**
+ * Returns a reference to a usable ClustalInterface
+ */
+ static ClustalInterface& getClustalInterface();
+ /**
+ * Attempts to perform a multiple alignment using ClustalW between
+ * <code>r_begin</code> and <code>r_end</code>
+ */
+ boolean Align( GappedAlignment& cr, Match* r_begin, Match* r_end, std::vector< genome::gnSequence* >& seq_table );
+ /**
+ * Set the distance matrix to use when computing alignments, writes the guide tree to the location
+ * specified in <code>tree_filename</code>
+ * @param distance_matrix An NxN distance matrix for the sequences
+ * @param tree_filename The output file name for the guide tree
+ */
+ void SetDistanceMatrix( NumericMatrix< double >& distance_matrix, std::string& tree_filename );
+ /**
+ * Set the minimum flank size used to anchor alignments on the sequences
+ */
+ void SetMinFlankSize( gnSeqI min_flank ){ min_flank_size = min_flank; }
+
+ /**
+ * Try using the guide tree in the file given by tree_filename. Throws an
+ * exception if the tree file couldn't be loaded
+ * @param tree_filename The path to the guide tree file
+ * @param dist_mat The distance matrix relating sequences
+ * @param seq_count The number of genomes in the guide tree file
+ */
+ void setGuideTree( std::string& tree_filename, NumericMatrix< double >& dist_mat, uint seq_count );
+
+ /** returns true if a guide tree has been loaded already */
+ boolean guideTreeLoaded() const { return distance_matrix.cols() > 0; }; // implemented as: the stored distance matrix has at least one column
+ /** Overload taking an explicit reread_tree flag; the two-argument overload is declared above */
+ void SetDistanceMatrix( NumericMatrix< double >& distance_matrix, std::string& tree_filename, boolean reread_tree );
+protected:
+ boolean CallClustal( std::vector< std::string >& seq_table );
+ NumericMatrix< double > distance_matrix; // matrix most recently handed to SetDistanceMatrix/setGuideTree
+ gnSeqI min_flank_size;
+ int clustal_score_cutoff;
+ bool allocated_aln;
+private: // construction, copying and assignment are private; callers obtain the object via getClustalInterface()
+ ClustalInterface( const ClustalInterface& ci ){ *this = ci; }
+ ClustalInterface& operator=( const ClustalInterface& ci );
+ ClustalInterface();
+};
+
+}
+
+#endif // _ClustalInterface_h_
diff --git a/libMems/CompactGappedAlignment.h b/libMems/CompactGappedAlignment.h
new file mode 100644
index 0000000..942d4aa
--- /dev/null
+++ b/libMems/CompactGappedAlignment.h
@@ -0,0 +1,819 @@
+/*******************************************************************************
+ * $Id: CompactGappedAlignment.h,v 1.12 2004/04/19 23:10:50 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifndef __CompactGappedAlignment_h__
+#define __CompactGappedAlignment_h__
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libGenome/gnDebug.h"
+#include "libGenome/gnFilter.h"
+#include "libGenome/gnSequence.h"
+#include "libMems/SparseAbstractMatch.h"
+#include "libMems/HybridAbstractMatch.h"
+#include "libMems/AbstractGappedAlignment.h"
+#include "libMems/UngappedLocalAlignment.h"
+
+#include <algorithm>
+
+#ifdef WIN32
+#include "windows.h"
+#endif
+
+namespace mems {
+
+/**
+ * The CompactGappedAlignment stores a gapped alignment as a bit-vector
+ * Rather than using one byte per aligned position, this class uses one bit, making
+ * it particularly space efficient
+ */
+template< class BaseType = AbstractGappedAlignment< HybridAbstractMatch<> > >
+class CompactGappedAlignment : public BaseType
+{
+public:
+ CompactGappedAlignment() : BaseType(){};
+ CompactGappedAlignment( uint seq_count, gnSeqI align_length );
+ CompactGappedAlignment( std::vector< bitset_t >& aln_mat, gnSeqI alignment_length );
+
+ template< class MatchType >
+ CompactGappedAlignment( MatchType& m ) : // converting constructor: copies alignment, starts and lengths out of m
+ BaseType( m.SeqCount(), m.AlignmentLength() ),
+ bcount( std::vector< std::vector< size_t > >( m.SeqCount() ) )
+ {
+ m.GetAlignment(align_matrix);
+
+ for( uint seqI = 0; seqI < this->SeqCount(); seqI++ )
+ {
+ this->SetStart(seqI, m.Start(seqI));
+ if( m.Start(seqI) != NO_MATCH )
+ this->SetLength(m.Length(seqI), seqI);
+ else
+ this->SetLength(0, seqI); // sequences whose start is NO_MATCH get length 0
+ }
+
+ this->create_bitcount();
+
+ if( !this->validate() )
+ std::cerr << "kahnstruct error\n";
+ }
+
+ CompactGappedAlignment* Clone() const { return new CompactGappedAlignment( *this ); } // heap copy made with new
+ CompactGappedAlignment* Copy() const;
+ virtual void Free();
+
+ void SetAlignment( const std::vector< std::string >& seq_align );
+
+ void SetAlignment( std::vector< bitset_t >& seq_align );
+
+ // Inherited methods from AbstractMatch:
+ virtual void Invert();
+ virtual void CropStart(gnSeqI crop_amount);
+ virtual void CropEnd(gnSeqI crop_amount);
+
+ virtual void CropLeft(gnSeqI crop_amount, uint seqI);
+ virtual void CropRight(gnSeqI crop_amount, uint seqI);
+
+ void GetAlignment( std::vector< bitset_t >& align_matrix ) const;
+
+ /** allows a peek at the data inside this alignment. don't change it or the CompactGappedAlignment will become corrupt */
+ const std::vector< bitset_t >& GetAlignment() const{ return align_matrix; }
+
+// friend void GetAlignment( const CompactGappedAlignment& ga, const std::vector< genome::gnSequence* >& seq_table, std::vector<std::string>& alignment );
+
+ void GetColumn( gnSeqI col, std::vector<gnSeqI>& pos, std::vector<bool>& column ) const;
+
+ /** returns true if the given row,column of the alignment has a gap character */
+ virtual bool IsGap( uint seq, gnSeqI col ) const;
+ /** translate a cga to a new coordinate system */
+ void translate( CompactGappedAlignment& cga, uint cga_seq, uint my_seq, bool add_bits = true );
+
+ bool validate() const;
+ bool validate_bitcount() const;
+
+ void copyRange( CompactGappedAlignment& dest, gnSeqI left_column, gnSeqI length );
+ gnSeqI SeqPosToColumn( uint seq, int64 pos);
+
+ /** Eliminates any columns that contain only gap characters */
+ void CondenseGapColumns();
+
+ void swap( CompactGappedAlignment& other ){ swap(&other); } // forwards to the protected pointer overload below
+
+protected:
+ // for use by derived classes in order to swap contents
+ void swap( CompactGappedAlignment* other ){
+ std::swap( align_matrix, other->align_matrix );
+ std::swap( bcount, other->bcount );
+ BaseType::swap( other );
+ }
+
+ std::vector< bitset_t > align_matrix; /**< aligned positions have true values, gaps are false */
+ std::vector< std::vector< size_t > > bcount; // per row: cumulative set-bit counts at INDEX_INTERVAL-column steps, built by create_bitcount()
+
+ void create_bitcount();
+ gnSeqI SeqPosToColumn( gnSeqI pos, const bitset_t& bvec, const std::vector< size_t >& index ) const;
+
+};
+
+static bool debug_cga = false; // while false, validate() and validate_bitcount() return true without checking anything
+
+template< class BaseType >
+CompactGappedAlignment<BaseType>* CompactGappedAlignment<BaseType>::Copy() const
+{
+ return m_allocateAndCopy( *this ); // delegates to m_allocateAndCopy, which is defined outside this file section
+}
+
+template< class BaseType >
+void CompactGappedAlignment<BaseType>::Free()
+{
+ m_free(this); // delegates to m_free; presumably the counterpart of Copy()'s m_allocateAndCopy -- confirm
+}
+
+template< class BaseType >
+bool CompactGappedAlignment<BaseType>::validate() const
+{
+ if( !debug_cga ) // all checks are skipped unless the debug flag is set
+ return true;
+ bool good = true;
+ for( uint seqI = 0; seqI < this->SeqCount(); seqI++ )
+ {
+ if( this->AlignmentLength() != align_matrix[seqI].size() ) // row width must equal the alignment length
+ {
+ good = false;
+ std::cerr << "vanishing pig trick\n";
+ genome::breakHere();
+ }
+ gnSeqI count = align_matrix[seqI].count(); // number of set (non-gap) bits in this row
+ if( count > 0 && this->LeftEnd(seqI) == 0 ) // row has set bits but LeftEnd is 0
+ {
+ good = false;
+ std::cerr << "boner_McHoserknob\n";
+ genome::breakHere();
+ }
+ if( (count == 0 || this->Length(seqI) == 0) && this->LeftEnd(seqI) != 0 ) // empty row but LeftEnd is non-zero
+ {
+ good = false;
+ std::cerr << "Length(" << seqI << "): " << this->Length(seqI) << std::endl;
+ std::cerr << "LeftEnd(seqI): " << this->LeftEnd(seqI) << std::endl;
+ std::cerr << "spumante explosion\n";
+ genome::breakHere();
+ }
+ if( count != this->Length(seqI) ) // set-bit count must equal the recorded length
+ {
+ std::cerr << "seqI: " << seqI << " count: " << count << " Length(seqI): " << this->Length(seqI) << std::endl;
+ std::cerr << "LeftEnd(seqI): " << this->LeftEnd(seqI) << std::endl;
+ std::cerr << "lendo mismatcho\n";
+ genome::breakHere();
+ return false; // unlike the checks above, this one returns immediately
+ }
+// std::vector< std::vector< size_t > > tmp_bcount = bcount;
+// create_bitcount();
+// if( !tmp_bcount == bcount )
+// {
+// good = false;
+// std::cerr << "bcount mismatch!!!\n";
+// }
+// bcount = tmp_bcount;
+
+ }
+ if( good ) // check for all gap cols
+ {
+/* allow gap cols...
+ for( size_t colI = 0; colI < this->AlignmentLength(); ++colI )
+ {
+ bool aa = false;
+ for( uint seqI = 0; seqI < this->SeqCount(); seqI++ )
+ aa = aa || align_matrix[seqI].test(colI);
+ if( aa == false )
+ {
+ std::cerr << "gap col at " << colI << std::endl;
+ genome::breakHere();
+ }
+ }
+ */
+ }
+ return validate_bitcount() && good;
+}
+
+
+template< class BaseType >
+CompactGappedAlignment<BaseType>::CompactGappedAlignment( std::vector< bitset_t >& aln_mat, gnSeqI alignment_length ) :
+BaseType( aln_mat.size(), alignment_length ),
+align_matrix( aln_mat ), // the rows are copied; aln_mat itself is left untouched
+bcount( std::vector< std::vector< size_t > >( aln_mat.size() ) )
+{
+ this->create_bitcount();
+ this->validate_bitcount(); // result is ignored
+}
+
+template< class BaseType >
+CompactGappedAlignment<BaseType>::CompactGappedAlignment( uint seq_count, gnSeqI align_length ) :
+BaseType( seq_count, align_length )
+{} // align_matrix and bcount are left default-constructed (empty) by this constructor
+
+
+// Rebuilds the bit matrix from text rows: every character other than '-' marks an aligned (set) column.
+// The alignment length is taken from the first row. An empty seq_align only resets the length to 0.
+template< class BaseType >
+void CompactGappedAlignment<BaseType>::SetAlignment( const std::vector< std::string >& seq_align ){
+    if( seq_align.empty() )
+    {
+        this->SetAlignmentLength(0);
+        return;
+    }
+    const size_t column_count = seq_align[0].size();
+    this->SetAlignmentLength(column_count);
+    align_matrix = std::vector< bitset_t >( seq_align.size(), bitset_t( column_count, false ) );
+    bcount = std::vector< std::vector<size_t> >( seq_align.size() );
+    for( size_t seqI = 0; seqI < seq_align.size(); seqI++ )
+    {
+        const std::string& row = seq_align[seqI];
+        for( size_t charI = 0; charI < row.size(); charI++ )
+            if( row[charI] != '-' )
+                align_matrix[seqI].set(charI);
+    }
+    this->create_bitcount();
+}
+
+template< class BaseType >
+void CompactGappedAlignment<BaseType>::SetAlignment( std::vector< bitset_t >& seq_align )
+{
+    // take over the caller's rows by swapping, then leave the caller's vector empty
+    align_matrix.swap( seq_align );
+    seq_align.clear();
+    const size_t new_length = align_matrix.empty() ? 0 : align_matrix[0].size();
+    this->SetAlignmentLength( new_length );
+    // one (initially empty) checkpoint list per row; create_bitcount() fills them in
+    bcount = std::vector< std::vector<size_t> >( align_matrix.size() );
+    this->create_bitcount();
+    this->validate_bitcount();
+}
+
+template< class BaseType >
+void CompactGappedAlignment<BaseType>::GetAlignment( std::vector< bitset_t >& align_matrix ) const
+{
+ align_matrix = this->align_matrix; // the parameter shadows the member: copies the member's rows into the caller's vector
+}
+
+template< class BaseType >
+bool CompactGappedAlignment<BaseType>::IsGap( uint seq, gnSeqI col ) const
+{
+ return !align_matrix[seq][col]; // a cleared bit is a gap; no bounds check on seq or col is done here
+}
+
+static const unsigned INDEX_INTERVAL = 512; // number of alignment columns between consecutive bcount checkpoints
+
+// Rebuilds bcount: for each row, bcount[row][k] is the number of set bits in the first k*INDEX_INTERVAL columns.
+// Only complete INDEX_INTERVAL-sized blocks get an entry; a trailing partial block is not recorded.
+template< class BaseType >
+void CompactGappedAlignment<BaseType>::create_bitcount()
+{
+    for( uint seqI = 0; seqI < this->SeqCount(); seqI++ )
+    {
+        bitset_t& bits = align_matrix[seqI];
+        std::vector< size_t >& checkpoints = bcount[seqI];
+        checkpoints.clear();
+        checkpoints.push_back(0);   // entry 0: nothing counted yet
+        size_t running_total = 0;
+        for( size_t block_end = INDEX_INTERVAL; block_end <= bits.size(); block_end += INDEX_INTERVAL )
+        {
+            for( size_t col = block_end - INDEX_INTERVAL; col < block_end; ++col )
+                running_total += bits.test(col);
+            checkpoints.push_back( running_total );
+        }
+    }
+}
+
+template< class BaseType >
+bool CompactGappedAlignment<BaseType>::validate_bitcount() const
+{
+ if( !debug_cga ) // checks are skipped unless the debug flag is set
+ return true;
+ bool valid = true; // innocent until proven guilty
+ for( uint seqI = 0; seqI < this->SeqCount(); seqI++ )
+ {
+ gnSeqI count = align_matrix[seqI].count(); // set bits in this row
+ size_t bc_len = align_matrix[seqI].size() / INDEX_INTERVAL; // number of complete INDEX_INTERVAL blocks
+ if( count < INDEX_INTERVAL && bcount[seqI].size() == 0 )
+ continue; // a-ok here
+ if( bc_len + 1 != bcount[seqI].size() && (bcount[seqI].back() % INDEX_INTERVAL != 0) ) // both conditions must hold to report
+ {
+ std::cerr << "bitcount problem, bc_len + 1: " << bc_len + 1 << " and bcount[seqI].size(): " << bcount[seqI].size() << std::endl;
+ std::cerr << "count: " << count << " and bcount[seqI].back(): " << bcount[seqI].back() << std::endl;
+ valid = false;
+ }
+ if( count - bcount[seqI].back() > INDEX_INTERVAL ) // bits past the last checkpoint must fit in one interval
+ {
+ std::cerr << "bitcount problem, count: " << count << " and bcount[seqI].back(): " << bcount[seqI].back() << std::endl;
+ valid = false;
+ }
+ }
+ return valid;
+}
+
+template< class BaseType >
+gnSeqI CompactGappedAlignment<BaseType>::SeqPosToColumn( uint seq, int64 pos )
+{
+ if( this->Orientation(seq) == AbstractMatch::forward )
+ pos = genome::absolut(pos) - this->LeftEnd(seq) + 1; // 1-based offset counted from the left end
+ else
+ pos = this->RightEnd(seq)-genome::absolut(pos) + 1; // is this right?
+ return SeqPosToColumn(pos, align_matrix[seq], bcount[seq]); // map the offset to a column with the indexed overload
+}
+
+template< class BaseType >
+gnSeqI CompactGappedAlignment<BaseType>::SeqPosToColumn( gnSeqI pos, const bitset_t& bvec, const std::vector< size_t >& index ) const
+{ // Returns the column of the pos-th set bit of bvec (pos counted from 1), using index to skip whole INDEX_INTERVAL blocks.
+ std::vector<size_t>::const_iterator iter = std::lower_bound(index.begin(), index.end(), pos); // first index entry >= pos
+ --iter; // last entry < pos; NOTE(review): this steps before begin() when pos <= index.front() (e.g. pos == 0) -- callers must pass pos >= 1
+ size_t cur_pos = *iter; // number of set bits that precede the selected block
+ size_t col = iter - index.begin();
+ col *= INDEX_INTERVAL; // first column of the selected block
+ if( col == 0 )
+ col = bvec.find_first();
+ else
+ col = bvec.find_next(col-1); // find_next searches after its argument, hence col-1 to include column col itself
+ for( ++cur_pos; cur_pos < pos; ++cur_pos )
+ col = bvec.find_next(col); // advance one set bit at a time until the pos-th one is reached
+ return col; // NOTE(review): presumably the bitset's "not found" value when bvec has fewer than pos set bits -- confirm against bitset_t
+}
+
+template< class BaseType >
+void CompactGappedAlignment<BaseType>::translate( CompactGappedAlignment& cga, uint cga_seq, uint my_seq, bool add_bits ) // const
+{
+ // Re-expresses cga in the column coordinates of this alignment.  On entry the left/right end of
+ // cga's row cga_seq are 1-based character offsets into row my_seq of this alignment.  cga is
+ // modified in place: its matrix, bit-count index, alignment length, and the left end and length
+ // of cga_seq.  When add_bits is true, columns where my_seq is gapped between two consecutive
+ // characters are set in row cga_seq and added to its length; otherwise they are left unset.
+
+ if( cga.Length(cga_seq) > this->Length(my_seq) )
+ {
+ std::cerr << "Oh scheisskopf. What are you trying to do to me??\n";
+ std::cerr << "cga.Length(" << cga_seq << "): " << cga.Length(cga_seq) << std::endl;
+ std::cerr << "Length(" << my_seq << "): " << this->Length(my_seq) << std::endl;
+ genome::breakHere();
+ }
+
+ gnSeqI prev_lend = cga.LeftEnd(cga_seq);
+ gnSeqI prev_len = cga.Length(cga_seq);
+
+ // what assumptions should be made about cga?
+ // does it already have the correct left-end relative to this?
+ // no, it needs to have a left-end relative to the first aligned char in this
+ // (prev_lend and prev_len are kept only for the diagnostic output at the end)
+
+ // determine left_bit
+ size_t left_bit = this->SeqPosToColumn(cga.LeftEnd(cga_seq), align_matrix[my_seq], bcount[my_seq]);
+ // determine right_bit
+ size_t right_bit = this->SeqPosToColumn(cga.RightEnd(cga_seq), align_matrix[my_seq], bcount[my_seq]);
+ if( right_bit > 4000000000u ) // implausibly large column: treated as "cga does not fit inside my_seq"
+ {
+ std::cerr << "cga doesn't fit\n";
+ std::cerr << "cga.RightEnd(cga_seq) " << cga.RightEnd(cga_seq) << std::endl;
+ std::cerr << "RightEnd(my_seq): " << this->RightEnd(my_seq) << std::endl;
+ std::cerr << "cga.LeftEnd(cga_seq) " << cga.LeftEnd(cga_seq) << std::endl;
+ std::cerr << "LeftEnd(my_seq): " << this->LeftEnd(my_seq) << std::endl;
+ std::cerr << "cga.AlignmentLength(): " << cga.AlignmentLength() << std::endl;
+ std::cerr << "AlignmentLength(): " << this->AlignmentLength() << std::endl;
+ genome::breakHere();
+ }
+ right_bit++; // make the right boundary exclusive
+ if( right_bit == 0 ) // the increment wrapped around: fall back to the end of this alignment
+ right_bit = this->AlignmentLength();
+
+ cga.SetLeftEnd(cga_seq,left_bit+1);
+
+ // add on length of unaligned left and right sides
+ size_t cga_left = cga.align_matrix[cga_seq].find_first();
+
+ size_t somesize = (right_bit - left_bit) - cga.Length(cga_seq) + cga.AlignmentLength(); // column count of the translated matrix
+
+ size_t cga_bit = cga_left; // read cursor in cga's current matrix
+ size_t my_bit = left_bit; // cursor over the set bits of row my_seq in this alignment
+ size_t xlat_bit = cga_left; // write cursor in the translated matrix
+ size_t added_bits = 0;
+ // copy in everything up to cga_left
+ std::vector< bitset_t > xrated( cga.SeqCount(), bitset_t( somesize, false ) );
+ for( size_t seqI = 0; seqI < xrated.size(); ++seqI )
+ for( size_t asdf = cga.align_matrix[seqI].find_first(); asdf < cga_left; asdf = cga.align_matrix[seqI].find_next(asdf) )
+ xrated[seqI].set(asdf);
+
+ while(xlat_bit < somesize)
+ {
+ // assume that align_matrix[my_seq][my_bit] is set
+ if( !align_matrix[my_seq].test(my_bit) )
+ {
+ std::cerr << "ohhhhhhzheiss!\n";
+ genome::breakHere();
+ }
+ // copy the column in cga
+ for( size_t seqI = 0; seqI < xrated.size(); ++seqI )
+ xrated[seqI].set( xlat_bit, cga.align_matrix[seqI].test(cga_bit) );
+
+ ++cga_bit;
+ ++xlat_bit;
+
+ if( xlat_bit >= somesize )
+ break;
+
+ // TODO: should this condition be replaced by cropping xlat_bit + diff - 1 down to < somesize?
+ if( cga.align_matrix[cga_seq].test(cga_bit) )
+ {
+ size_t next_bit = align_matrix[my_seq].find_next(my_bit);
+ if( next_bit > 4000000000u )
+ genome::breakHere();
+ size_t diff = next_bit - my_bit; // 1 when the next character of my_seq is in the adjacent column
+ if( diff > 1 && add_bits )
+ {
+ if( xlat_bit + diff - 1 >= somesize )
+ {
+ std::cerr << "ERRRORRR porker!!\n";
+ genome::breakHere();
+ }
+ for( size_t i = xlat_bit; i < xlat_bit + diff - 1; ++i )
+ xrated[cga_seq].set(i);
+ added_bits += diff-1;
+ }
+ my_bit = next_bit;
+ xlat_bit += diff - 1; // skip the gap columns of my_seq whether or not they were filled in
+ }
+ }
+
+ cga.align_matrix = xrated;
+ cga.create_bitcount();
+ cga.SetLength(cga.Length(cga_seq)+added_bits,cga_seq);
+ cga.SetAlignmentLength(somesize);
+ if( !cga.validate() )
+ {
+ std::cerr << "prev_lend: " << prev_lend << std::endl;
+ std::cerr << "prev_len: " << prev_len << std::endl;
+ std::cerr << "translate error\n";
+ genome::breakHere();
+ }
+}
+
+
+template< class BaseType >
+void CompactGappedAlignment<BaseType>::Invert(){
+    // Mirror the column order of every defined row, rebuild the bit-count index,
+    // then let the base class invert its own state.
+    const size_t aln_len = this->AlignmentLength();
+    for(uint row = 0; row < this->SeqCount(); ++row)
+    {
+        if( this->LeftEnd(row) != NO_MATCH )
+        {
+            bitset_t mirrored(aln_len);
+            for( size_t src = 0, dst = aln_len; src < align_matrix[row].size(); ++src )
+                mirrored.set( --dst, align_matrix[row].test(src) );
+            align_matrix[row].swap(mirrored);
+        }
+    }
+    this->create_bitcount();
+    BaseType::Invert();
+    if( !this->validate() )
+        std::cerr << "invert error\n";
+}
+
+template< class BaseType >
+void CompactGappedAlignment<BaseType>::CropStart(gnSeqI crop_amount){ // removes the first crop_amount columns and adjusts each row's start and length
+ if( crop_amount > this->AlignmentLength() )
+ Throw_gnEx( genome::SeqIndexOutOfBounds() );
+ if( crop_amount == 0 )
+ return;
+
+ gnSeqI pre_alignlen = this->AlignmentLength(); // kept only for the diagnostic output below
+ gnSeqI pre_lend0 = this->LeftEnd(0); // kept only for the diagnostic output below
+
+ std::vector<gnSeqI> pos;
+ std::vector<bool> column;
+ GetColumn( crop_amount-1, pos, column ); // sequence positions at the last column being removed
+
+ for( uint i=0; i < this->SeqCount(); i++ ){
+ if( this->LeftEnd(i) == NO_MATCH )
+ { // undefined row: only its bitset is shortened, no coordinates to fix
+ align_matrix[i].resize(this->AlignmentLength()-crop_amount);
+ align_matrix[i] = align_matrix[i]; // force reallocation on "optimized" windows builds
+ continue;
+ }
+
+ align_matrix[i] >>= crop_amount; // drops the first crop_amount columns; the author queried the shift direction -- presumably column 0 is the lowest bit of boost::dynamic_bitset, so discarding low columns is a right shift
+ align_matrix[i].resize(this->AlignmentLength()-crop_amount);
+ align_matrix[i] = align_matrix[i]; // force reallocation on "optimized" windows builds
+ size_t char_count = this->Orientation(i) == AbstractMatch::forward ? pos[i] - this->LeftEnd(i) + 1 : this->RightEnd(i) - pos[i] + 1; // characters of row i inside the removed columns
+
+ if( pos[i] > 0 && char_count > 0 )
+ {
+ this->SetLength(this->Length(i)-char_count, i);
+ if( this->Length(i) == 0 )
+ this->SetStart(i, NO_MATCH);
+ if( this->Orientation(i) == AbstractMatch::forward ) // NOTE(review): relies on Orientation() no longer reporting forward once the start is NO_MATCH -- confirm in BaseType
+ this->SetStart(i, this->Start(i) + char_count);
+ }else if( pos[i] == 0 && this->Orientation(i) == AbstractMatch::reverse )
+ {
+ // this sequence was completely obliterated by the crop
+ this->SetLength(0, i);
+ this->SetStart(i, NO_MATCH);
+ }
+ }
+
+ this->SetAlignmentLength( this->AlignmentLength() - crop_amount );
+ this->create_bitcount(); // the index must be rebuilt because every column moved
+ if( !this->validate() )
+ {
+ std::cerr << "pre_lend0: " << pre_lend0 << std::endl;
+ std::cerr << "pre_alignlen: " << pre_alignlen << std::endl;
+ std::cerr << "CropStart error\n";
+ }
+}
+
+template< class BaseType >
+void CompactGappedAlignment<BaseType>::CropEnd(gnSeqI crop_amount){ // removes the last crop_amount columns and adjusts each row's start and length
+ if( crop_amount > this->AlignmentLength() )
+ Throw_gnEx( genome::SeqIndexOutOfBounds() );
+ if( crop_amount == 0 )
+ return;
+
+ std::vector<gnSeqI> pos;
+ std::vector<bool> column;
+ this->GetColumn( this->AlignmentLength()-crop_amount, pos, column ); // sequence positions at the first column being removed
+
+ for( uint i=0; i < this->SeqCount(); i++ ){
+ align_matrix[i].resize( this->AlignmentLength() - crop_amount );
+ align_matrix[i] = align_matrix[i]; // force reallocation on "optimized" windows builds
+ if( this->LeftEnd(i) == NO_MATCH )
+ continue; // undefined row: no coordinates to fix
+ AbstractMatch::orientation orient = this->Orientation(i);
+ if( pos[i] > 0 )
+ {
+ gnSeqI char_count = pos[i] - (orient == AbstractMatch::forward ? (column[i] ? 1 : 0 ) : (column[i] ? 0 : 1 ) ); // step off the boundary column's own character where it holds one
+ char_count = orient == AbstractMatch::forward ? char_count - this->LeftEnd(i) + 1 : this->RightEnd(i) - char_count; // now the number of characters that survive the crop
+ if( char_count == 0 && align_matrix[i].count() > 0) // diagnostic only: coordinates say empty but bits remain
+ {
+ std::cerr << "orienatation: " << (orient == AbstractMatch::forward ? "forward\n" : (orient == AbstractMatch::reverse ? "reverse\n" : "undef\n"));
+ std::cerr << "lend: " << this->LeftEnd(i) << std::endl;
+ std::cerr << "length: " << this->Length(i) << std::endl;
+ std::cerr << "count: " << align_matrix[i].count() << std::endl;
+ }
+ gnSeqI deleted = this->Length(i) - char_count;
+ this->SetLength(char_count, i);
+ if( this->Length(i) == 0 )
+ this->SetStart(i, 0);
+ if( this->Start(i) < 0 ) // negative start = reverse strand: its start is moved by the number of removed characters
+ this->SetStart(i, this->Start(i)-deleted);
+ }else if( orient == AbstractMatch::forward ){ // no position reported at the boundary for a forward row: nothing of it survives
+ this->SetLength(0, i);
+ this->SetStart(i, 0);
+ }
+ }
+ this->SetAlignmentLength( this->AlignmentLength() - crop_amount );
+ this->create_bitcount(); // the index must be rebuilt because the bitsets shrank
+ if( !this->validate() )
+ std::cerr << "CropEnd error\n";
+}
+
+template< class BaseType >
+void CompactGappedAlignment<BaseType>::CropLeft(gnSeqI crop_amount, uint seqI)
+{
+    // Removes the first crop_amount characters of row seqI, counted in that sequence's own
+    // orientation, by cropping whole alignment columns from the matching end of the alignment.
+    if( crop_amount == 0 )
+        return;
+
+    const gnSeqI expected_len = this->Length(seqI) - crop_amount;
+    if( this->Orientation(seqI) != AbstractMatch::forward )
+    {
+        // not forward: keep only the columns to the left of the column that holds
+        // set bit number expected_len + 1 of this row
+        const size_t keep_cols = this->SeqPosToColumn(expected_len + 1, align_matrix[seqI], bcount[seqI]);
+        if( keep_cols > 4000000000u )   // lookup did not yield a usable column
+            std::cerr << this->LeftEnd(seqI) << std::endl << this->LeftEnd(0) << std::endl << "bogus cropper cga\n";
+        this->CropEnd(this->AlignmentLength() - keep_cols);
+    }
+    else
+    {
+        // forward: drop every column up to and including the one that holds
+        // set bit number crop_amount of this row
+        const size_t drop_cols = this->SeqPosToColumn(crop_amount, align_matrix[seqI], bcount[seqI]) + 1;
+        this->CropStart(drop_cols);
+    }
+    if( this->Length(seqI) != expected_len )
+        std::cerr << this->LeftEnd(seqI) << std::endl << this->LeftEnd(0) << std::endl << "bad cropperLeftie\n";
+
+    if( !this->validate() )
+        std::cerr << "CropLeft error\n";
+}
+
+template< class BaseType >
+void CompactGappedAlignment<BaseType>::CropRight(gnSeqI crop_amount, uint seqI)
+{
+    // Removes the last crop_amount characters of row seqI, counted in that sequence's own
+    // orientation, by cropping whole alignment columns from the matching end of the alignment.
+    if( crop_amount == 0 )
+        return;
+
+    gnSeqI pre_len = this->Length(seqI);
+    if( this->Orientation(seqI) == AbstractMatch::forward )
+    {
+        // count "crop_amount" characters into seqI and crop there
+        size_t right_col = this->SeqPosToColumn(this->Length(seqI) - crop_amount + 1, align_matrix[seqI], bcount[seqI]);
+        this->CropEnd( this->AlignmentLength()-right_col ); // keeps only the columns left of right_col
+    }else
+    {
+        size_t right_col = this->SeqPosToColumn(crop_amount, align_matrix[seqI], bcount[seqI]) + 1; // columns to drop from the start
+        if( right_col > 4000000000u ) // NOTE(review): a "not found" lookup result may wrap to 0 after the +1 and escape this check -- confirm
+        {
+            std::cerr << this->LeftEnd(seqI) << std::endl;
+            std::cerr << this->LeftEnd(0) << std::endl;
+            std::cerr << "bogus cropper cga\n";
+        }
+        this->CropStart( right_col );
+    }
+    if( this->Length(seqI) != pre_len - crop_amount ) // sanity check: exactly crop_amount characters must be gone
+    {
+        std::cerr << this->LeftEnd(seqI) << std::endl;
+        std::cerr << this->LeftEnd(0) << std::endl;
+        std::cerr << "bad cropperight\n";
+    }
+    if( !this->validate() )
+        std::cerr << "CropRight error\n";
+}
+
+template< class BaseType >
+void CompactGappedAlignment<BaseType>::GetColumn( gnSeqI col, std::vector<gnSeqI>& pos, std::vector<bool>& column ) const
+{ // For alignment column col, reports per row whether it holds a character (column) and a sequence position (pos, NO_MATCH if none).
+ pos = std::vector<gnSeqI>(this->SeqCount(), NO_MATCH);
+ column = std::vector<bool>(this->SeqCount(), false);
+ for( uint seqI = 0; seqI < this->SeqCount(); seqI++ )
+ {
+ if( align_matrix[seqI][col] )
+ column[seqI] = true;
+
+ gnSeqI count = 0; // set bits of this row in columns [0, col]
+ if( this->LeftEnd(seqI) != NO_MATCH )
+ {
+ size_t col_index = col / INDEX_INTERVAL; // index block that contains col
+ for( size_t i = col_index * INDEX_INTERVAL; i <= col; i++ )
+ count += align_matrix[seqI].test(i); // bits inside the block, up to and including col
+ count += bcount[seqI][col_index]; // plus the precomputed total of all earlier blocks
+ }
+
+ if( count > 0 && this->Orientation(seqI) == AbstractMatch::forward )
+ pos[seqI] = this->LeftEnd(seqI) + count - 1; // forward: position of the count-th character from the left end
+ else if( this->Orientation(seqI) == AbstractMatch::reverse && !(count == this->Length(seqI) && !column[seqI]) ) // skipped when every character lies left of col and col itself is a gap
+ pos[seqI] = this->RightEnd(seqI) - count + 1; // reverse: positions count down from the right end
+ }
+}
+
+template< class BaseType >
+void CompactGappedAlignment<BaseType>::copyRange( CompactGappedAlignment& dest, gnSeqI left_column, gnSeqI length )
+{
+ if( left_column + length > this->AlignmentLength() )
+ Throw_gnEx( genome::SeqIndexOutOfBounds() );
+ // Copies columns [left_column, left_column + length) of this alignment into dest, replacing dest's previous contents.
+ // length == 0 is not short-circuited: the early return that used to be here is disabled.
+
+ // first copy the coordinates
+ dest = CompactGappedAlignment(this->SeqCount(), length);
+ for( uint i=0; i < this->SeqCount(); i++ ){
+ dest.SetStart(i, this->Start(i));
+ if( this->Orientation(i) != AbstractMatch::undefined )
+ dest.SetLength(this->Length(i), i);
+ }
+ // then trim the coordinates appropriately
+
+ gnSeqI pre_alignlen = this->AlignmentLength(); // kept only for the diagnostic output below
+ gnSeqI pre_lend0 = this->LeftEnd(0); // kept only for the diagnostic output below
+
+ std::vector< bitset_t > dest_mat(this->SeqCount(), bitset_t(length));
+ std::vector<gnSeqI> pos;
+ std::vector<bool> column;
+ std::vector<gnSeqI> left_cc(this->SeqCount(), 0); // per row: characters that lie left of the copied range
+ if( left_column > 0 )
+ {
+ this->GetColumn( left_column-1, pos, column );
+ for( uint i=0; i < this->SeqCount(); i++ ){
+ if( this->LeftEnd(i) == NO_MATCH )
+ continue;
+
+ size_t char_count = this->Orientation(i) == AbstractMatch::forward ? pos[i] - this->LeftEnd(i) + 1 : this->RightEnd(i) - pos[i] + 1;
+ if( pos[i] > 0 && char_count > 0 )
+ {
+ left_cc[i] = char_count;
+ if( dest.Orientation(i) == AbstractMatch::forward )
+ dest.SetStart(i, dest.Start(i) + char_count);
+ }else if( pos[i] == 0 && dest.Orientation(i) == AbstractMatch::reverse )
+ {
+ // this sequence was completely obliterated by the crop
+ dest.SetStart(i, NO_MATCH);
+ }
+ }
+ }
+
+// now trim up the right side...
+ gnSeqI right_trim = this->AlignmentLength() - left_column - length;
+
+ if( right_trim > 0 )
+ {
+ this->GetColumn( this->AlignmentLength()-right_trim, pos, column );
+
+ for( uint i=0; i < this->SeqCount(); i++ ){
+ if( this->LeftEnd(i) == NO_MATCH )
+ continue;
+ AbstractMatch::orientation orient = this->Orientation(i);
+ if( pos[i] > 0 )
+ {
+ gnSeqI char_count = pos[i] - (orient == AbstractMatch::forward ? (column[i] ? 1 : 0 ) : (column[i] ? 0 : 1 ) );
+ char_count = orient == AbstractMatch::forward ? char_count - this->LeftEnd(i) + 1 : this->RightEnd(i) - char_count;
+ char_count -= left_cc[i];
+ gnSeqI deleted = this->Length(i) - char_count;
+ if( dest.Start(i) < 0 )
+ dest.SetStart(i, dest.Start(i)-deleted+left_cc[i]); // fixme: is this off-by-one?
+ }else if( orient == AbstractMatch::forward ){
+ dest.SetStart(i, NO_MATCH);
+ }
+ }
+ }
+
+ for( size_t i = 0; i < dest_mat.size(); ++i )
+ {
+ size_t count = 0; // characters of row i inside the copied range
+ for( size_t j = 0; j < length; ++j )
+ {
+ if(align_matrix[i].test(j+left_column))
+ {
+ dest_mat[i].set(j, true);
+ ++count;
+ }
+ }
+ dest.SetLength(count, i);
+ if( count == 0 )
+ dest.SetStart(i, NO_MATCH);
+ }
+ dest.SetAlignment(dest_mat);
+
+ dest.create_bitcount();
+ if( !dest.validate() )
+ {
+ std::cerr << "pre_lend0: " << pre_lend0 << std::endl;
+ std::cerr << "pre_alignlen: " << pre_alignlen << std::endl;
+ std::cerr << "copyRange error\n";
+ }
+
+}
+
+template< class BaseType >
+void CompactGappedAlignment<BaseType>::CondenseGapColumns()
+{ // Removes, in place, every column in which no defined row holds a character.
+ const size_t len = this->AlignmentLength();
+ size_t d = 0; // destination index
+ for( size_t i = 0; i < len; ++i )
+ {
+ size_t seqI = 0;
+ // check whether this is a gap col
+ for( ; seqI < align_matrix.size(); ++seqI )
+ if( this->LeftEnd(seqI) != 0 && align_matrix[seqI].test(i) ) // bits of rows whose left end is 0 are ignored
+ break;
+
+ // copy if not a gap col (and i != d )
+ if( seqI < align_matrix.size() )
+ {
+ if( i != d )
+ {
+ for( seqI = 0; seqI < align_matrix.size(); ++seqI )
+ align_matrix[seqI].set( d, align_matrix[seqI].test(i) );
+ }
+ d++;
+ }
+ // an all-gap column is simply skipped: it is not copied and d does not advance
+ // (a leftover debugging no-op that wrote "" to std::cout used to sit in an else branch here)
+ }
+ this->SetAlignmentLength(d);
+ for( size_t seqI = 0; seqI < align_matrix.size(); ++seqI )
+ {
+ align_matrix[seqI].resize(d);
+ align_matrix[seqI] = align_matrix[seqI]; // force reallocation on "optimized" windows builds
+ }
+ this->create_bitcount();
+}
+
+
+}
+
+namespace std {
+template<> inline // specialization for CompactGappedAlignment with its default template argument only
+void swap( mems::CompactGappedAlignment<>& a, mems::CompactGappedAlignment<>& b )
+{
+ a.swap(b); // forwards to the member swap instead of the generic copy-based std::swap
+}
+}
+
+
+#endif // __CompactGappedAlignment_h__
+
diff --git a/libMems/DNAFileSML.cpp b/libMems/DNAFileSML.cpp
new file mode 100644
index 0000000..de91365
--- /dev/null
+++ b/libMems/DNAFileSML.cpp
@@ -0,0 +1,68 @@
+/*******************************************************************************
+ * $Id: DNAFileSML.cpp,v 1.4 2004/03/01 02:40:08 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * Please see the file called COPYING for licensing, copying, and modification
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libGenome/gnSequence.h"
+#include "libGenome/gnFilter.h"
+#include "libMems/DNAFileSML.h"
+
+using namespace std;
+using namespace genome;
+namespace mems {
+
+DNAFileSML::DNAFileSML() : FileSML(){ // default-constructs the FileSML base, then tags the header
+ FileSML::header.version = FormatVersion(); // record this class's format version in the header
+}
+
+DNAFileSML::DNAFileSML(const string& fname, const uint8* table, const uint32 alpha_bits){ // only fills in fields; no file is opened or read in this body
+ header.alphabet_bits = alpha_bits;
+ memcpy(header.translation_table, table, UINT8_MAX); // copies UINT8_MAX bytes; NOTE(review): confirm this matches the declared size of header.translation_table and of the table passed in
+ filename = fname;
+ header.version = FormatVersion(); // record this class's format version in the header
+}
+
+DNAFileSML& DNAFileSML::operator=(const DNAFileSML& msa ){ // assignment is handled entirely by the FileSML base
+ FileSML::operator=(msa);
+ return *this;
+}
+
+DNAFileSML* DNAFileSML::Clone() const{ // returns a new heap-allocated copy; the caller receives a raw pointer to it
+ DNAFileSML *bdsa = new DNAFileSML();
+ (*bdsa) = *this; // copy via operator= rather than a copy constructor
+ return bdsa;
+}
+
+uint64 DNAFileSML::GetNeededMemory(gnSeqI len){
+    // Estimated number of bytes needed for a sequence of len characters, as the sum of three parts:
+    const uint64 packed_bytes = (len * FileSML::header.alphabet_bits) / 8;  // len characters at alphabet_bits bits each
+    const uint64 copy_bytes = len * 2;                                      // forward and reverse copies of the sequence
+    const uint64 mer_bytes = sizeof(bmer) * len;                            // one bmer per character
+    return packed_bytes + copy_bytes + mer_bytes;
+}
+
+uint32 DNAFileSML::CalculateMaxMerSize() const{
+ return 62 / header.alphabet_bits; // how many alphabet_bits-wide characters fit into 62 bits
+}
+
+uint64 DNAFileSML::GetMer(gnSeqI position) const{
+ return GetDnaMer( position ); // pure forwarder to GetDnaMer, which is defined outside this file
+}
+
+uint64 DNAFileSML::GetSeedMer( gnSeqI offset ) const{
+ return GetDnaSeedMer( offset ); // pure forwarder to GetDnaSeedMer, which is defined outside this file
+}
+
+void DNAFileSML::FillSML(const gnSequence& seq, vector<bmer>& sml_array)
+{
+ FillDnaSML(seq, sml_array); // pure forwarder to FillDnaSML, which is defined outside this file
+}
+
+} // namespace mems
diff --git a/libMems/DNAFileSML.h b/libMems/DNAFileSML.h
new file mode 100644
index 0000000..f08514d
--- /dev/null
+++ b/libMems/DNAFileSML.h
@@ -0,0 +1,66 @@
+/*******************************************************************************
+ * $Id: DNAFileSML.h,v 1.6 2004/03/01 02:40:08 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifndef _DNAFileSML_h_
+#define _DNAFileSML_h_
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/FileSML.h"
+
+namespace mems {
+
+/**
+ * A FileSML for DNA sequences. The seed pattern for DNA SMLs must be palindromic.
+ */
+class DNAFileSML : public FileSML
+{
+public:
+ DNAFileSML(); // sets only the header's format version
+
+ /**
+ * Construct a DNAFileSML bound to a file name.
+ * As implemented in DNAFileSML.cpp this constructor only records the file
+ * name, translation table, alphabet bit size and format version in the
+ * object; its body does not open or read the named file.
+ * @param fname The name of the file backing this sorted mer list.
+ * @param table The array used to translate characters into binary code
+ * @param alpha_bits The number of bits each character consumes in binary
+ */
+ DNAFileSML(const std::string& fname, const uint8* table = SortedMerList::BasicDNATable(), const uint32 alpha_bits = DNA_ALPHA_BITS);
+ DNAFileSML(const SortedMerList& sa); // NOTE(review): no definition appears in DNAFileSML.cpp -- confirm it is defined elsewhere or unused
+ DNAFileSML& operator=(const DNAFileSML& msa );
+
+ DNAFileSML* Clone() const; // returns a new heap-allocated copy
+
+ virtual uint64 GetMer(gnSeqI position) const;
+
+ virtual uint32 FormatVersion(); // on-disk format version written into the header by the constructors
+
+ virtual uint64 GetSeedMer( gnSeqI offset ) const;
+
+protected:
+ virtual void FillSML(const genome::gnSequence& seq, std::vector<bmer>& sml_array);
+ virtual uint32 CalculateMaxMerSize() const; // 62 / alphabet_bits
+ virtual uint64 GetNeededMemory(gnSeqI len); // byte estimate for a sequence of len characters
+};
+
+// Format history of DNAFileSML:
+//   3 - original DNAFileSML format
+//   4 - inexact seeds introduced;  5 - header struct fixed for 64-bit seed size
+inline
+uint32 DNAFileSML::FormatVersion(){
+    static uint32 current_format = 5;	// the version this code reads and writes
+    return current_format;
+}
+
+}
+
+#endif //_DNAFileSML_h_
diff --git a/libMems/DNAMemorySML.cpp b/libMems/DNAMemorySML.cpp
new file mode 100644
index 0000000..9ef9285
--- /dev/null
+++ b/libMems/DNAMemorySML.cpp
@@ -0,0 +1,48 @@
+/*******************************************************************************
+ * $Id: DNAMemorySML.cpp,v 1.3 2004/03/01 02:40:08 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * Please see the file called COPYING for licensing, copying, and modification
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libGenome/gnSequence.h"
+#include "libMems/DNAMemorySML.h"
+
+using namespace std;
+using namespace genome;
+namespace mems {
+
+DNAMemorySML::DNAMemorySML(const uint8* table, const uint32 alpha_bits) :
+MemorySML( table, alpha_bits ) // all construction work is delegated to the MemorySML base
+{}
+
+DNAMemorySML& DNAMemorySML::operator=(const DNAMemorySML& msa ){ // assignment is handled entirely by the MemorySML base
+ MemorySML::operator=(msa);
+ return *this;
+}
+
+DNAMemorySML* DNAMemorySML::Clone() const{ // returns a new heap-allocated copy; the caller receives a raw pointer to it
+ DNAMemorySML *bdsa = new DNAMemorySML(); // built with the default table and alphabet size, then overwritten below
+ (*bdsa) = *this; // copy via operator= rather than the declared copy constructor
+ return bdsa;
+}
+
+uint64 DNAMemorySML::GetMer(gnSeqI position) const{
+ return GetDnaMer( position ); // pure forwarder to GetDnaMer, which is defined outside this file
+}
+
+uint64 DNAMemorySML::GetSeedMer( gnSeqI offset ) const{
+ return GetDnaSeedMer( offset ); // pure forwarder to GetDnaSeedMer, which is defined outside this file
+}
+
+void DNAMemorySML::FillSML(const gnSequence& seq, vector<bmer>& sml_array)
+{
+ FillDnaSML(seq, sml_array); // pure forwarder to FillDnaSML, which is defined outside this file
+}
+
+} // namespace mems
diff --git a/libMems/DNAMemorySML.h b/libMems/DNAMemorySML.h
new file mode 100644
index 0000000..e2558bb
--- /dev/null
+++ b/libMems/DNAMemorySML.h
@@ -0,0 +1,55 @@
+/*******************************************************************************
+ * $Id: DNAMemorySML.h,v 1.3 2004/03/01 02:40:08 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifndef _DNAMemorySML_h_
+#define _DNAMemorySML_h_
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libGenome/gnSequence.h"
+#include "libMems/MemorySML.h"
+
+namespace mems {
+
+/** The DNAMemorySML is an implementation of sorted mer lists which creates and
+ * stores the sorted mer list entirely in memory. A DNAMemorySML consumes
+ * roughly 32 + alpha_bits bits of memory per character in the sequences.
+ * For unambiguous DNA sequences 4.25 bytes per base are required.
+ * The seed pattern for DNA SMLs must be palindromic.
+ */
+class DNAMemorySML : public MemorySML
+{
+public:
+ /**
+ * Create an empty DNAMemorySML
+ * Creates an empty DNAMemorySML with the supplied translation
+ * table and alphabet bit size. Defaults to DNA settings.
+ * @param table The array used to translate characters into binary code
+ * @param alpha_bits The number of bits each character consumes in binary
+ */
+ DNAMemorySML(const uint8* table = SortedMerList::BasicDNATable(), const uint32 alpha_bits = DNA_ALPHA_BITS);
+ DNAMemorySML(const DNAMemorySML& msa); // NOTE(review): no definition appears in DNAMemorySML.cpp -- confirm it is defined elsewhere
+ DNAMemorySML(const SortedMerList& sa); // NOTE(review): no definition appears in DNAMemorySML.cpp -- confirm it is defined elsewhere
+ DNAMemorySML& operator=(const DNAMemorySML& msa );
+ DNAMemorySML* Clone() const; // returns a new heap-allocated copy
+
+
+ virtual uint64 GetMer(gnSeqI offset) const;
+ virtual uint64 GetSeedMer( gnSeqI offset ) const;
+
+protected:
+
+ virtual void FillSML(const genome::gnSequence& seq, std::vector<bmer>& sml_array);
+
+};
+
+}
+
+#endif //_DNAMemorySML_h_
diff --git a/libMems/DenseAbstractMatch.h b/libMems/DenseAbstractMatch.h
new file mode 100644
index 0000000..cddeeab
--- /dev/null
+++ b/libMems/DenseAbstractMatch.h
@@ -0,0 +1,169 @@
+/*******************************************************************************
+ * $Id: DenseAbstractMatch.h,v 1.8 2004/02/27 23:08:55 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifndef __DenseAbstractMatch_h__
+#define __DenseAbstractMatch_h__
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libGenome/gnClone.h"
+#include "libMems/AbstractMatch.h"
+#include <limits>
+
+namespace mems {
+
+/**
+ * The DenseAbstractMatch implements the AbstractMatch interface in a way
+ * that is most efficient when Multiplicity and SeqCount are identical or
+ * nearly so. It stores all data inline in fixed-size arrays of MAX_SEQS
+ * entries, affording it storage in a contiguous chunk of memory.
+ */
+template< unsigned int MAX_SEQS >
+class DenseAbstractMatch : public AbstractMatch
+{
+public:
+ DenseAbstractMatch();
+ /**
+ * Creates a new DenseAbstractMatch.
+ * @param seq_count The total number of sequences in the alignment
+ */
+ DenseAbstractMatch(const uint seq_count );
+ // use the compiler generated copy constructor, assignment operator, and destructor
+
+ virtual AbstractMatch* Clone() const = 0;
+
+ // see AbstractMatch base class documentation for these functions
+
+ int64 Start(uint seqI) const{
+ int64 s = leftend[seqI];
+ return orient[seqI]? -s : s; // reverse-strand starts are reported as negative values
+ }
+ void SetStart(uint seqI, int64 startI)
+ {
+ SetLeftEnd( seqI, genome::absolut(startI) );
+ orient[seqI] = startI < 0; // the sign of startI carries the orientation
+ }
+ uint Multiplicity() const{return m_multiplicity;}
+ uint SeqCount() const{return m_seq_count;}
+ virtual uint FirstStart() const;
+ virtual void Invert();
+
+ virtual gnSeqI LeftEnd(uint seqI) const{ return leftend[seqI]; }
+ virtual orientation Orientation(uint seqI) const;
+ virtual void SetLeftEnd(uint seqI, gnSeqI position)
+ {
+ if( position == NO_MATCH && leftend[seqI] != NO_MATCH )
+ --m_multiplicity; // a defined sequence becomes undefined
+ else if( position != NO_MATCH && leftend[seqI] == NO_MATCH )
+ ++m_multiplicity; // an undefined sequence becomes defined
+ leftend[seqI]=position;
+ }
+ virtual void SetOrientation(uint seqI, orientation o){ orient[seqI]= (o == reverse); }
+
+ virtual boolean operator==( const DenseAbstractMatch& dam ) const;
+
+ void MoveStart(int64 move_amount); // shifts the left ends of forward-oriented sequences
+
+ void MoveEnd(int64 move_amount); // shifts the left ends of reverse-oriented sequences
+
+ virtual uint UsedSeq( uint seqI ) const {
+ return seqI; // identity mapping
+ }
+
+protected:
+
+ uint m_seq_count; // number of slots in use
+ gnSeqI leftend[ MAX_SEQS ]; // left-end coordinate per sequence; NO_MATCH when undefined
+ bool orient[ MAX_SEQS ]; // true = reverse, false = forward
+ uint m_multiplicity; // count of entries whose left end is not NO_MATCH, maintained by SetLeftEnd
+};
+
+template< unsigned int MAX_SEQS >
+DenseAbstractMatch<MAX_SEQS>::DenseAbstractMatch()
+    : m_seq_count(0), m_multiplicity(0)
+{
+    // an empty match: no sequences, every left end and orientation flag zeroed
+    memset( orient, 0, sizeof( orient ) );
+    memset( leftend, 0, sizeof( leftend ) );
+}
+
+template< unsigned int MAX_SEQS >
+DenseAbstractMatch<MAX_SEQS>::DenseAbstractMatch(const uint seq_count )
+    : m_seq_count(seq_count), m_multiplicity(0)
+{
+    // seq_count slots in use, none defined yet: every left end and orientation flag zeroed
+    memset( orient, 0, sizeof( orient ) );
+    memset( leftend, 0, sizeof( leftend ) );
+}
+
+template< unsigned int MAX_SEQS >
+boolean DenseAbstractMatch<MAX_SEQS>::operator==( const DenseAbstractMatch<MAX_SEQS>& dam ) const
+{ // Equal when, over this object's m_seq_count slots, left ends match and defined sequences share orientation.
+ for( uint seqI = 0; seqI < m_seq_count; ++seqI )
+ {
+ if( leftend[seqI] != dam.leftend[seqI] ||
+ (leftend[seqI] != 0 && orient[seqI] != dam.orient[seqI])) // compare against dam's flag; orientation of undefined slots is ignored
+ return false;
+ }
+ return true;
+}
+
+template< unsigned int MAX_SEQS >
+AbstractMatch::orientation DenseAbstractMatch<MAX_SEQS>::Orientation(uint seqI) const
+{ // Returns forward or reverse for a defined, in-range sequence and undefined otherwise.
+ if( seqI < m_seq_count && leftend[seqI] != NO_MATCH ) // range check first so leftend is never read for an out-of-range seqI
+ return orient[seqI] ? reverse : forward;
+ return undefined;
+}
+
+template< unsigned int MAX_SEQS >
+void DenseAbstractMatch<MAX_SEQS>::Invert()
+{   // flips the strand flag of all MAX_SEQS slots, including any beyond m_seq_count
+    for( bool* flag = orient; flag != orient + MAX_SEQS; ++flag )
+        *flag = !*flag;
+}
+
+template< unsigned int MAX_SEQS >
+uint DenseAbstractMatch<MAX_SEQS>::FirstStart() const
+{   // index of the first sequence with a defined left end, or the largest uint value if there is none
+    uint seqI = 0;
+    while( seqI < SeqCount() && leftend[seqI] == NO_MATCH )
+        ++seqI;
+    return seqI < SeqCount() ? seqI : (std::numeric_limits<uint>::max)();
+}
+
+template< unsigned int MAX_SEQS >
+void DenseAbstractMatch<MAX_SEQS>::MoveStart(int64 move_amount)
+{   // shifts the left end of every defined, forward-oriented sequence by move_amount
+    for( uint seqI = 0; seqI < m_seq_count; ++seqI )
+        if( !orient[seqI] && leftend[seqI] != NO_MATCH )
+            leftend[seqI] += move_amount;
+}
+
+template< unsigned int MAX_SEQS >
+void DenseAbstractMatch<MAX_SEQS>::MoveEnd(int64 move_amount)
+{   // shifts the left end of every defined, reverse-oriented sequence by move_amount
+    for( uint seqI = 0; seqI < m_seq_count; ++seqI )
+        if( orient[seqI] && leftend[seqI] != NO_MATCH )
+            leftend[seqI] += move_amount;
+}
+
+
+typedef DenseAbstractMatch<2> DenseAbstractMatch2;
+typedef DenseAbstractMatch<4> DenseAbstractMatch4;
+typedef DenseAbstractMatch<8> DenseAbstractMatch8;
+typedef DenseAbstractMatch<16> DenseAbstractMatch16;
+typedef DenseAbstractMatch<32> DenseAbstractMatch32;
+typedef DenseAbstractMatch<64> DenseAbstractMatch64;
+typedef DenseAbstractMatch<128> DenseAbstractMatch128;
+
+}
+
+#endif // __DenseAbstractMatch_h__
diff --git a/libMems/DistanceMatrix.h b/libMems/DistanceMatrix.h
new file mode 100644
index 0000000..dfefc23
--- /dev/null
+++ b/libMems/DistanceMatrix.h
@@ -0,0 +1,327 @@
+/*******************************************************************************
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifndef __DistanceMatrix_h__
+#define __DistanceMatrix_h__
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libGenome/gnSequence.h"
+#include "libMems/SubstitutionMatrix.h"
+#include "libMems/IntervalList.h"
+#include "libMems/MatchList.h"
+#include "libMems/GappedAlignment.h"
+#include "libMems/CompactGappedAlignment.h"
+
+
+namespace mems {
+
+
+// Forward declarations.  Each of these functions is defined further down in
+// this header.
+
+// Replaces every entry x of the matrix with 1 - x.
+void TransformDistanceIdentity( NumericMatrix<double>& identity );
+
+// Builds an identity matrix from a MatchList and converts it with
+// TransformDistanceIdentity().
+void DistanceMatrix( const MatchList& mlist, NumericMatrix<double>& identity );
+
+
+// Accumulates identical-column counts over a collection of match pointers and
+// divides by the shorter sequence length of each pair.
+template< class AbstractMatchVectorType >
+void IdentityMatrix( const AbstractMatchVectorType& matches, const std::vector< genome::gnSequence* >& seq_table, NumericMatrix<double>& identity );
+// Computes a symmetric identity matrix for a single match.
+template<class AbstractMatchType>
+void MatchIdentityMatrix( const AbstractMatchType& amt, const std::vector< genome::gnSequence* >& seq_table, NumericMatrix<double>& identity);
+
+// Builds a distance matrix from (membership bitmask, amount) pairs.
+void DistanceMatrix( uint seq_count, const std::vector< std::pair< uint64, uint64 > >& detail_list, NumericMatrix<double>& distance );
+
+void IdentityMatrix( const IntervalList& iv_list, NumericMatrix<double>& identity );
+
+/**
+ * Computes the identity matrix for an IntervalList by collecting a pointer to
+ * each interval and delegating to the generic IdentityMatrix() template,
+ * using the list's own seq_table.
+ * @param iv_list   the intervals to evaluate
+ * @param identity  output matrix, filled in by the generic overload
+ */
+inline
+void IdentityMatrix( const IntervalList& iv_list, NumericMatrix<double>& identity )
+{
+    const size_t interval_count = iv_list.size();
+    std::vector< const AbstractMatch* > match_ptrs;
+    for( size_t i = 0; i != interval_count; ++i )
+        match_ptrs.push_back( &iv_list[i] );
+    IdentityMatrix( match_ptrs, iv_list.seq_table, identity );
+}
+
+/**
+ * Accumulates pairwise identity counts over all matches and normalises each
+ * entry by the length of the shorter of the two sequences involved.
+ * When matches is empty the function returns without touching identity.
+ * @param matches    indexable collection of pointers to matches
+ * @param seq_table  sequences; its size fixes the matrix dimensions
+ * @param identity   output; replaced by a seq_count x seq_count matrix
+ */
+template< class AbstractMatchVectorType >
+void IdentityMatrix( const AbstractMatchVectorType& matches, const std::vector< genome::gnSequence* >& seq_table, NumericMatrix<double>& identity ){
+    if( matches.size() == 0 )
+        return;
+
+    const uint seq_count = seq_table.size();
+
+    // numerator: identical-column counts summed over every match
+    identity = NumericMatrix<double>( seq_count, seq_count );
+    identity.init( 0 );
+    for( uint matchI = 0; matchI < matches.size(); matchI++ )
+        AddToMatchIdentityMatrix( *matches[ matchI ], seq_table, identity );
+
+    // denominator: the shorter sequence length for each (row, col) pair
+    NumericMatrix<double> possible( seq_count, seq_count );
+    possible.init( 0 );
+    for( uint row = 0; row < seq_count; row++ ){
+        for( uint col = 0; col < seq_count; col++ ){
+            gnSeqI shorter_len = seq_table[col]->length();
+            if( seq_table[row]->length() < seq_table[col]->length() )
+                shorter_len = seq_table[row]->length();
+            possible( row, col ) += shorter_len;
+        }
+    }
+
+    identity /= possible;
+}
+
+
+/**
+ * Like IdentityMatrix(), but normalises each pair by the number of alignment
+ * columns in which both sequences carry a non-gap character, rather than by
+ * sequence length.  Only entries with seqI < seqJ receive a denominator.
+ * When matches is empty the function returns without touching identity.
+ * @param matches    indexable collection of pointers to matches
+ * @param seq_table  sequences; its size fixes the matrix dimensions
+ * @param identity   output; replaced by a seq_count x seq_count matrix
+ */
+template< class AbstractMatchVectorType >
+void BackboneIdentityMatrix( const AbstractMatchVectorType& matches, const std::vector< genome::gnSequence* >& seq_table, NumericMatrix<double>& identity ){
+    if( matches.size() == 0 )
+        return;
+
+    const size_t seq_count = seq_table.size();
+
+    // numerator: identical-column counts summed over every match
+    identity = NumericMatrix<double>( seq_count, seq_count );
+    identity.init( 0 );
+    for( uint matchI = 0; matchI < matches.size(); matchI++ )
+        AddToMatchIdentityMatrix( *matches[ matchI ], seq_table, identity );
+
+    // denominator: columns where neither member of the pair is a gap
+    NumericMatrix<double> possible( seq_count, seq_count );
+    possible.init( 0 );
+    for( size_t matchI = 0; matchI < matches.size(); ++matchI ){
+        std::vector< std::string > columns;
+        GetAlignment( *(matches[matchI]), seq_table, columns );
+        for( gnSeqI col = 0; col < matches[matchI]->AlignmentLength(); col++ ){
+            for( size_t first = 0; first < seq_count; first++ ){
+                for( size_t second = first + 1; second < seq_count; second++ ){
+                    if( columns[ first ][ col ] == '-' )
+                        continue;
+                    if( columns[ second ][ col ] == '-' )
+                        continue;
+                    possible( first, second ) += 1;
+                }
+            }
+        }
+    }
+
+    identity /= possible;
+}
+
+
+/**
+ * Computes a symmetric identity matrix for a single match.
+ * Counts, for every pair of sequences, the alignment columns holding the same
+ * non-gap character (case-insensitively), divides by the shorter of the two
+ * match lengths, caps the result at 1, sets the diagonal to 1 and mirrors the
+ * upper triangle into the lower one.
+ * When the match has no sequences the function returns without touching
+ * identity.
+ * @param amt        the match to evaluate
+ * @param seq_table  sequences, forwarded to GetAlignment()
+ * @param identity   output; replaced by a SeqCount() x SeqCount() matrix
+ */
+template<class AbstractMatchType>
+void MatchIdentityMatrix( const AbstractMatchType& amt, const std::vector< genome::gnSequence* >& seq_table, NumericMatrix<double>& identity)
+{
+    const uint seq_count = amt.SeqCount();
+    if( seq_count == 0 )
+        return;
+    identity = NumericMatrix<double>( seq_count, seq_count );
+    identity.init( 0 );
+
+    // tally identical, non-gap columns into the upper triangle
+    std::vector< std::string > columns;
+    GetAlignment( amt, seq_table, columns );
+    for( gnSeqI col = 0; col < amt.AlignmentLength(); col++ ){
+        for( uint first = 0; first < seq_count; first++ ){
+            for( uint second = first + 1; second < seq_count; second++ ){
+                const char a = columns[ first ][ col ];
+                const char b = columns[ second ][ col ];
+                if( a == '-' )
+                    continue;
+                if( toupper( a ) == toupper( b ) )
+                    identity( first, second ) += 1;
+            }
+        }
+    }
+
+    // normalise; columns are visited from high to low within each row
+    for( uint row = 0; row < seq_count; row++ ){
+        for( uint next_col = seq_count; next_col > 0; next_col-- ){
+            const uint col = next_col - 1;
+            if( row == col ){
+                // set the diagonal to identical
+                identity( row, col ) = 1;
+                continue;
+            }
+            if( row > col ){
+                // lower triangle: copy the already-normalised mirror entry
+                identity( row, col ) = identity( col, row );
+                continue;
+            }
+            // upper triangle: divide by the shorter length and cap at 1
+            gnSeqI shorter_len = amt.Length( row ) < amt.Length( col ) ? amt.Length( row ) : amt.Length( col );
+            identity( row, col ) /= (double)shorter_len;
+            if( identity( row, col ) > 1 )
+                identity( row, col ) = 1;
+        }
+    }
+}
+
+
+
+/**
+ * Adds one match's identical-column counts to an existing matrix.
+ * For every pair seqI < seqJ, identity(seqI, seqJ) is incremented once per
+ * alignment column in which both sequences hold the same non-gap character
+ * (compared case-insensitively).  Nothing is normalised and the matrix is not
+ * resized.  A match with no sequences leaves the matrix untouched.
+ * @param amt        the match to evaluate
+ * @param seq_table  sequences, forwarded to GetAlignment()
+ * @param identity   matrix to accumulate into
+ */
+template<class AbstractMatchType>
+void AddToMatchIdentityMatrix( const AbstractMatchType& amt, const std::vector< genome::gnSequence* >& seq_table, NumericMatrix<double>& identity)
+{
+    const uint seq_count = amt.SeqCount();
+    if( seq_count == 0 )
+        return;
+
+    std::vector< std::string > columns;
+    GetAlignment( amt, seq_table, columns );
+    for( gnSeqI col = 0; col < amt.AlignmentLength(); col++ ){
+        for( uint first = 0; first < seq_count; first++ ){
+            for( uint second = first + 1; second < seq_count; second++ ){
+                const char a = columns[ first ][ col ];
+                const char b = columns[ second ][ col ];
+                if( a == '-' )
+                    continue;
+                if( toupper( a ) == toupper( b ) )
+                    identity( first, second ) += 1;
+            }
+        }
+    }
+}
+
+/*
+// template specialization for (exact) matches
+inline
+void AddToMatchIdentityMatrix( const Match& m, const std::vector< genome::gnSequence* >& seq_table, NumericMatrix<double>& identity)
+{
+ if( m.SeqCount() == 0 )
+ return;
+ for( uint seqI = 0; seqI < m.SeqCount(); seqI++ )
+ if( m.LeftEnd(seqI) != NO_MATCH )
+ for( uint seqJ = seqI + 1; seqJ < m.SeqCount(); seqJ++ )
+ if( m.LeftEnd(seqJ) != NO_MATCH )
+ identity(seqI,seqJ) += m.Length();
+}
+*/
+
+/**
+ * Computes a pairwise distance matrix from the alignment columns shared by
+ * each pair of sequences across a list of matches.
+ * For every pair seqI < seqJ two bitsets, sized to the two sequence lengths,
+ * record which sequence positions lie in columns where both sequences have a
+ * set bit in the match's alignment table.  The value stored for the pair is
+ * the mean of the two covered fractions; the diagonal is set to 1 and the
+ * whole matrix is finally passed through TransformDistanceIdentity().
+ * @param iv_list    indexable collection of pointers to matches; GetAlignment,
+ *                   LeftEnd, RightEnd and Orientation are called on each one
+ * @param seq_table  sequences; only their count and length() are used here
+ * @param distance   output; replaced by a seq_count x seq_count matrix
+ */
+template< typename MatchVector >
+void SingleCopyDistanceMatrix( MatchVector& iv_list, std::vector< genome::gnSequence* >& seq_table, NumericMatrix<double>& distance )
+{
+ uint seq_count = seq_table.size();
+ distance = NumericMatrix<double>( seq_count, seq_count );
+ distance.init( 0 );
+ // these two function-scope variables are shadowed by every loop below
+ uint seqI;
+ uint seqJ;
+ // pair_comp[seqI][seqJ] holds the coverage bitsets for the pair:
+ // .first is indexed by seqI positions, .second by seqJ positions.
+ // Only entries with seqI < seqJ are ever sized or used.
+ std::vector< std::pair< bitset_t, bitset_t > > tmp_comp( seq_count );
+ std::vector< std::vector< std::pair< bitset_t, bitset_t > > > pair_comp( seq_count, tmp_comp );
+ for( uint seqI = 0; seqI < seq_count; ++seqI )
+ {
+ for( uint seqJ = seqI+1; seqJ < seq_count; ++seqJ )
+ {
+ pair_comp[seqI][seqJ].first.resize( seq_table[seqI]->length(), false );
+ pair_comp[seqI][seqJ].second.resize( seq_table[seqJ]->length(), false );
+ }
+ }
+ // NOTE(review): only GetAlignment() sits inside the critical section; the
+ // pair_comp bit writes further down are made from the parallel loop body
+ // without synchronisation, and different matches can target the same bitset.
+ // Confirm whether bitset_t tolerates concurrent set() calls.
+ // The loop index is a signed int while iv_list.size() is presumably unsigned.
+#pragma omp parallel for
+ for( int ivI = 0; ivI < iv_list.size(); ++ivI )
+ {
+ std::vector< bitset_t > aln_table;
+#pragma omp critical
+{
+ iv_list[ivI]->GetAlignment(aln_table);
+}
+ for( uint seqI = 0; seqI < seq_count; ++seqI )
+ {
+ for( uint seqJ = seqI+1; seqJ < seq_count; ++seqJ )
+ {
+ // start at the left end for forward matches and at the right end for
+ // reverse matches, then walk in the matching direction
+ gnSeqI seqI_pos = iv_list[ivI]->LeftEnd(seqI);
+ gnSeqI seqJ_pos = iv_list[ivI]->LeftEnd(seqJ);
+ AbstractMatch::orientation o_i = iv_list[ivI]->Orientation(seqI);
+ AbstractMatch::orientation o_j = iv_list[ivI]->Orientation(seqJ);
+ if( o_i == AbstractMatch::reverse )
+ seqI_pos = iv_list[ivI]->RightEnd(seqI);
+ if( o_j == AbstractMatch::reverse )
+ seqJ_pos = iv_list[ivI]->RightEnd(seqJ);
+ // skip pairs where either sequence is absent from this match
+ if( seqI_pos == NO_MATCH || seqJ_pos == NO_MATCH )
+ continue;
+ for( size_t colI = 0; colI < aln_table[seqI].size(); ++colI )
+ {
+ // both sequences present in this column: mark the current positions
+ // (positions are 1-based here, bitset indices are 0-based)
+ if( aln_table[seqI].test(colI) && aln_table[seqJ].test(colI) )
+ {
+ pair_comp[seqI][seqJ].first.set(seqI_pos-1,true);
+ pair_comp[seqI][seqJ].second.set(seqJ_pos-1,true);
+ }
+ // advance each position only when its sequence has a set bit in this
+ // column; each else below pairs with the nearest (orientation) if
+ if( aln_table[seqI].test(colI) )
+ if( o_i == AbstractMatch::forward )
+ seqI_pos++;
+ else
+ seqI_pos--;
+ if( aln_table[seqJ].test(colI) )
+ if( o_j == AbstractMatch::forward )
+ seqJ_pos++;
+ else
+ seqJ_pos--;
+ }
+ }
+ }
+ }
+ for( uint seqI = 0; seqI < seq_count; ++seqI )
+ {
+ distance(seqI,seqI) = 1;
+ for( uint seqJ = seqI+1; seqJ < seq_count; ++seqJ )
+ {
+ // fraction of each sequence covered by shared columns; a zero-length
+ // sequence would make the denominator zero
+ double pI = ((double)pair_comp[seqI][seqJ].first.count())/((double)pair_comp[seqI][seqJ].first.size());
+ double pJ = ((double)pair_comp[seqI][seqJ].second.count())/((double)pair_comp[seqI][seqJ].second.size());
+ distance(seqI,seqJ) = (pI + pJ) / 2.0;
+ distance(seqJ,seqI) = (pI + pJ) / 2.0;
+ }
+ }
+ // convert the similarity values to distances (1 - x)
+ TransformDistanceIdentity(distance);
+}
+
+/**
+ * Computes a distance matrix for a MatchList: builds the identity matrix with
+ * IdentityMatrix() using the list's own seq_table, then replaces every entry
+ * x with 1 - x via TransformDistanceIdentity().
+ * @param mlist     the matches to evaluate
+ * @param distance  output matrix
+ */
+inline
+void DistanceMatrix( const MatchList& mlist, NumericMatrix<double>& distance ){
+ IdentityMatrix(mlist, mlist.seq_table, distance );
+ TransformDistanceIdentity( distance );
+}
+
+/**
+ * Replaces every entry x of the matrix with 1 - x, in place.
+ * NOTE(review): the first index runs up to cols() and the second up to
+ * rows().  This is harmless for the square matrices built in this header;
+ * confirm the index order of NumericMatrix::operator() before using it on a
+ * non-square matrix.
+ * @param identity  matrix to transform
+ */
+inline
+void TransformDistanceIdentity( NumericMatrix<double>& identity ){
+    int i = 0;
+    while( i < identity.cols() ){
+        int j = 0;
+        while( j < identity.rows() ){
+            identity( i, j ) = 1 - identity( i, j );
+            j++;
+        }
+        i++;
+    }
+}
+
+/**
+ * Builds a distance matrix from (membership bitmask, amount) pairs.
+ * Sequence s corresponds to bit (seq_count - s - 1) of each pair's first
+ * member.  Entry (i, j) first accumulates the second member of every pair
+ * whose mask contains both i and j; off-diagonal entries are then turned into
+ * 1 - shared / mean(diagonal i, diagonal j), with NaN results replaced by
+ * 1.0, and finally the diagonal is zeroed.
+ * @param seq_count    number of sequences; fixes the matrix dimensions
+ * @param detail_list  (bitmask, amount) pairs
+ * @param distance     output; replaced by a seq_count x seq_count matrix
+ */
+inline
+void DistanceMatrix( uint seq_count, const std::vector< std::pair< uint64, uint64 > >& detail_list, NumericMatrix<double>& distance ){
+    distance = NumericMatrix<double>( seq_count, seq_count );
+    distance.init( 0 );
+
+    // accumulate the amount shared by each ordered pair (diagonal included)
+    for( uint row = 0; row < seq_count; row++ ){
+        uint64 row_mask = 1;
+        row_mask <<= seq_count - row - 1;
+        for( uint col = 0; col < seq_count; col++ ){
+            uint64 col_mask = 1;
+            col_mask <<= seq_count - col - 1;
+            for( uint pairI = 0; pairI < detail_list.size(); pairI++ ){
+                const uint64 members = detail_list[ pairI ].first;
+                if( (members & row_mask) == 0 || (members & col_mask) == 0 )
+                    continue;
+                distance( row, col ) += detail_list[ pairI ].second;
+            }
+        }
+    }
+
+    // normalise the off-diagonal entries; the diagonal still holds the
+    // per-sequence totals while this loop runs
+    for( uint row = 0; row < seq_count; row++ ){
+        for( uint col = 0; col < seq_count; col++ ){
+            if( row == col )
+                continue;
+            const double avg_length = ( distance( row, row ) + distance( col, col ) ) / 2;
+            double dist = 1.0 - ( distance( row, col ) / avg_length );
+            // a value that differs from itself is NaN (e.g. 0/0)
+            if( dist != dist )
+                dist = 1.0;
+            distance( row, col ) = dist;
+        }
+    }
+
+    // set the diagonal identical to itself
+    for( uint diag = 0; diag < seq_count; diag++ )
+        distance( diag, diag ) = 0;
+}
+
+
+} // namespace mems
+
+
+#endif // __DistanceMatrix_h__
+
diff --git a/libMems/FileSML.cpp b/libMems/FileSML.cpp
new file mode 100644
index 0000000..ff9ce01
--- /dev/null
+++ b/libMems/FileSML.cpp
@@ -0,0 +1,679 @@
+/*******************************************************************************
+ * $Id: FileSML.cpp,v 1.22 2004/04/26 21:13:58 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * Please see the file called COPYING for licensing, copying, and modification
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+
+#include "libMems/FileSML.h"
+// for CreateTempFileName():
+#include "libMems/Aligner.h"
+#include "libGenome/gnFilter.h"
+#include "libGenome/gnRAWSource.h"
+#include <algorithm>
+#include <cmath>
+#include <sstream>
+#include "boost/filesystem/operations.hpp"
+
+using namespace std;
+using namespace genome;
+namespace mems {
+
+/**
+ * Copies another FileSML's in-memory state and opens that object's file
+ * read-only on this object's stream.
+ * Failure to open the file is reported through DebugMsg only; the assignment
+ * still completes and the stream is left closed with its error state cleared.
+ * NOTE(review): the memory map (sardata) is not reopened here -- confirm
+ * whether callers rely on it after assignment.
+ * @param sa  the object to copy from
+ * @return *this
+ */
+FileSML& FileSML::operator=(const FileSML& sa)
+{
+    // nothing to do (and nothing to reopen) on self-assignment
+    if( this == &sa )
+        return *this;
+
+    SortedMerList::operator=( sa );
+    filename = sa.filename;
+    sarray_start_offset = sa.sarray_start_offset;
+    seq_coords = sa.seq_coords;
+
+    // open() fails on a stream that is already open, which would leave this
+    // object attached to its previous file; release it first.
+    if( sarfile.is_open() )
+        sarfile.close();
+    sarfile.clear();
+
+    sarfile.open(filename.c_str(), ios::binary | ios::in );
+    if(!sarfile.is_open()){
+        DebugMsg("FileSML::=: Unable to open suffix array file.\n");
+        sarfile.clear();
+    }
+    return *this;
+}
+
+/**
+ * Resets this object: clears the base-class state, forgets the file name,
+ * closes the file stream, zeroes the suffix-array offset and discards the
+ * sequence coordinates.
+ */
+void FileSML::Clear() {
+    SortedMerList::Clear();
+    seq_coords.clear();
+    sarray_start_offset = 0;
+    sarfile.close();
+    filename.clear();
+}
+
+/**
+ * Loads an existing sorted mer list file.
+ * Reads and validates the header, reads the packed sequence into a newly
+ * allocated buffer, records where the position array starts, memory-maps the
+ * file through sardata and, if a "<fname>.coords" file exists, reads its
+ * integers into seq_coords.
+ * @param fname  path of the sorted mer list file
+ * @throws FileNotOpened   when the file cannot be opened
+ * @throws FileUnreadable  when the header or sequence is short, the format
+ *                         version differs, or the file ends before the
+ *                         position array does
+ */
+void FileSML::LoadFile(const string& fname){
+ filename = fname;
+ sarfile.open(fname.c_str(), ios::binary | ios::in );
+ if(!sarfile.is_open()){
+ sarfile.clear();
+ Throw_gnExMsg( FileNotOpened(), "Unable to open file.\n");
+ }
+ // read the header into a temporary header struct just
+ // in case it's bogus
+ SMLHeader tmp_header;
+ sarfile.read((char*)&tmp_header, sizeof(struct SMLHeader));
+ if(sarfile.gcount() < (int)sizeof(struct SMLHeader)){
+ sarfile.clear();
+ Throw_gnExMsg( FileUnreadable(), "Unable to read file.");
+ }
+ if(tmp_header.version != FormatVersion()){
+ Throw_gnExMsg( FileUnreadable(), "Unsupported file format.");
+ }
+ header = tmp_header;
+
+ // derive seed_mask from the seed weight, then leave mer_mask set for the
+ // full seed length
+ SetMerMaskSize( header.seed_weight );
+ seed_mask = mer_mask;
+ SetMerMaskSize( header.seed_length );
+
+ //header is ok. read the sequence.
+ gnSeqI seq_len = header.length;
+ if(header.circular)
+ seq_len += header.seed_length - 1;
+ // number of 32-bit words needed for seq_len characters, rounded up
+ binary_seq_len = ((uint64)seq_len * (uint64)header.alphabet_bits) / 32;
+ if(((uint64)seq_len * (uint64)header.alphabet_bits) % 32 != 0)
+ binary_seq_len++;
+ binary_seq_len+=2; //fix for access violations.
+
+ if(sequence != NULL)
+ delete[] sequence;
+ sequence = new uint32[binary_seq_len];
+
+ sarfile.read((char*)sequence, binary_seq_len*sizeof(uint32));
+ if(sarfile.gcount() < (int64)(binary_seq_len*sizeof(uint32))){
+ sarfile.clear();
+ Throw_gnExMsg( FileUnreadable(), "Error reading sequence data.");
+ }
+
+ // the position array starts right after the sequence; seek past it to
+ // verify that the file is long enough
+ // NOTE(review): this assumes sizeof(gnSeqI) bytes per entry, while Create()
+ // writes sizeof(smlSeqI_t) and Merge() writes sizeof(uint32) per entry --
+ // confirm these sizes agree.
+ sarray_start_offset = sarfile.tellg();
+ sarfile.seekg(sarray_start_offset + sizeof(gnSeqI) * header.length);
+ if(!sarfile.good()){
+ sarfile.clear();
+ Throw_gnExMsg( FileUnreadable(), "Premature end of file.");
+ }
+ filename = fname;
+
+ // create a memory-map to the data of interest
+ sardata.open(fname);
+
+ // check whether there is a .coords mask file to read
+ string coordfile = filename + ".coords";
+ ifstream coord_in( coordfile.c_str() );
+ if( coord_in.is_open() ){
+ seq_coords.clear();
+ int64 cur_coord;
+ while( coord_in >> cur_coord ){
+ seq_coords.push_back( cur_coord );
+ }
+ }
+}
+
+/**
+ * Reopens the file named by filename for binary reading and writing.
+ * If opening fails, the stream's error state is cleared, the file is
+ * reopened read-only when it had been open on entry, and an exception is
+ * thrown.
+ * @param truncate  when true the file is opened with ios::trunc
+ * @throws FileNotOpened  when the file cannot be opened for writing
+ */
+void FileSML::OpenForWriting( boolean truncate ){
+    const boolean was_open = sarfile.is_open();
+    if( was_open )
+        sarfile.close();
+
+    ios::openmode mode = ios::binary | ios::in | ios::out;
+    if( truncate )
+        mode |= ios::trunc;
+    sarfile.open( filename.c_str(), mode );
+
+    if( sarfile.is_open() && sarfile.good() )
+        return;
+
+    // failure: restore read-only access if we had it, then report
+    sarfile.clear();
+    if( was_open )
+        sarfile.open(filename.c_str(), ios::binary | ios::in );
+    Throw_gnExMsg(FileNotOpened(), "Unable to open file for writing.");
+}
+
+/**
+ * Writes the in-memory header to the file, then reopens the file read-only.
+ * The read-only reopen is attempted even when the write fails; if both steps
+ * fail, the message of the later failure is the one reported.
+ * @return true (every failure is reported by exception instead)
+ * @throws IOStreamFailed  when the file is not open on entry, the write
+ *                         fails, or the read-only reopen fails
+ */
+boolean FileSML::WriteHeader(){
+    if(!sarfile.is_open()){
+        Throw_gnExMsg(IOStreamFailed(), "File is not valid.");
+    }
+
+    // NULL means no error has been recorded so far
+    const char* errormsg = NULL;
+
+    // Open sarfile for writing and write new header.
+    OpenForWriting( false );
+    sarfile.write((char*)&header, sizeof(struct SMLHeader));
+    if(!sarfile.good())
+        errormsg = "Error writing header to disk.";
+
+    // reopen the sorted mer list file read-only
+    sarfile.close();
+    sarfile.open(filename.c_str(), ios::binary | ios::in );
+    if(!sarfile.is_open())
+        errormsg = "Error opening sorted mer list file.";
+
+    if( errormsg != NULL )
+        Throw_gnExMsg(IOStreamFailed(), errormsg);
+    return true;
+}
+
+/**
+ * Runs the base-class unique-mer count and, when header.unique_mers changed
+ * as a result, writes the header back to disk.
+ * @return header.unique_mers after the base-class call
+ */
+gnSeqI FileSML::UniqueMerCount(){
+    const gnSeqI count_before = header.unique_mers;
+    SortedMerList::UniqueMerCount();
+    const bool header_changed = ( count_before != header.unique_mers );
+    if( header_changed )
+        WriteHeader();
+    return header.unique_mers;
+}
+
+/**
+ * Changes the description in memory and on disk.
+ * Descriptions longer than DESCRIPTION_SIZE-1 characters are truncated.
+ * @param d  the new description
+ * @throws IOStreamFailed  propagated from WriteHeader()
+ */
+void FileSML::SetDescription(const string& d){
+    strncpy(header.description, d.c_str(), DESCRIPTION_SIZE-1);
+    // strncpy() writes no terminator when the source fills the whole bound,
+    // and never touches the final byte here, so terminate explicitly.
+    header.description[DESCRIPTION_SIZE-1] = '\0';
+    WriteHeader();
+}
+
+// Changes the id in memory and on disk: stores d in the header and rewrites
+// the header through WriteHeader().
+void FileSML::SetID(const sarID_t d){
+ header.id = d;
+ WriteHeader();
+}
+
+
+extern "C" {
+#include "libMems/dmSML/dmsort.h"
+}
+
+char** FileSML::tmp_paths = NULL;
+
+/**
+ * Appends a directory to the NULL-terminated list of scratch paths.
+ * A trailing path separator is added when the path is non-empty and lacks
+ * one.  An empty path is stored unchanged.
+ * @param path  the directory to register
+ */
+void FileSML::registerTempPath( const string& path ) {
+    string tmp_path = path;
+    // add trailing path separator if necessary
+#ifdef WIN32
+    const char separator = '\\';
+#else
+    const char separator = '/';
+#endif
+    // the empty() test keeps size() - 1 from wrapping around on an empty path
+    if( !tmp_path.empty() && tmp_path[ tmp_path.size() - 1 ] != separator )
+        tmp_path += separator;
+
+    // count the existing entries; a NULL list counts as empty
+    int path_count = 0;
+    while( tmp_paths != NULL && tmp_paths[ path_count ] != NULL )
+        path_count++;
+
+    // copy the new path first so nothing is leaked if the array allocation
+    // below throws
+    char* new_entry = new char[ tmp_path.size() + 1 ];
+    strncpy( new_entry, tmp_path.c_str(), tmp_path.size() + 1 );
+
+    // Room for the old entries, the new entry AND the NULL terminator.  The
+    // previous code allocated only path_count+1 slots and then wrote the
+    // terminator one element past the end of the array.
+    char** new_paths;
+    try{
+        new_paths = new char*[ path_count + 2 ];
+    }catch(...){
+        delete[] new_entry;
+        throw;
+    }
+    for( int pathI = 0; pathI < path_count; pathI++ )
+        new_paths[ pathI ] = tmp_paths[ pathI ];
+    new_paths[ path_count ] = new_entry;
+    new_paths[ path_count + 1 ] = NULL;
+
+    // install the new array and free the old one; the strings themselves are
+    // now owned by the new array, and delete[] on NULL is a no-op
+    char** old_paths = tmp_paths;
+    tmp_paths = new_paths;
+    delete[] old_paths;
+}
+
+/**
+ * Returns one of the registered scratch paths.
+ * @param pathI  zero-based index into the registered paths
+ * @return the path, or NULL when pathI is negative, not below
+ *         getTempPathCount(), or no path has been registered
+ */
+const char* FileSML::getTempPath( int pathI ){
+    // getTempPathCount() returns 0 when tmp_paths is NULL, so this check also
+    // covers the case where nothing was ever registered
+    if( pathI < 0 || pathI >= getTempPathCount() )
+        return NULL;
+    return tmp_paths[ pathI ];
+}
+
+/**
+ * Counts the registered scratch paths.
+ * @return the number of entries before the NULL terminator, or 0 when no
+ *         list has been allocated
+ */
+int FileSML::getTempPathCount(){
+    if( tmp_paths == NULL )
+        return 0;
+    int path_count = 0;
+    for( char** cursor = tmp_paths; *cursor != NULL; ++cursor )
+        ++path_count;
+    return path_count;
+}
+
+
+/**
+ * Copies in_seq to out_seq while dropping runs of 'N'/'n' longer than
+ * mask_n_length, and records the 1-based, inclusive coordinates of each kept
+ * region as consecutive (start, end) pairs in seq_coords.
+ * The input is scanned in chunks of up to 1 MiB characters.
+ * NOTE(review): a run of N is only cut when a non-N character follows it, so
+ * a run at the very end of the sequence appears to be kept in the final
+ * region -- confirm that this is intended.
+ * @param in_seq         the sequence to scan
+ * @param out_seq        receives the kept regions (appended)
+ * @param seq_coords     receives the kept-region coordinates (appended)
+ * @param mask_n_length  N runs strictly longer than this are dropped
+ */
+void maskNNNNN( const gnSequence& in_seq, gnSequence& out_seq, vector< int64 >& seq_coords, int mask_n_length ) {
+
+ gnSeqI seqI = 1; // 1-based position of the current chunk's first character
+ gnSeqI read_length = 1024*1024;
+ string cur_seq;
+ gnSeqI n_count = 0; // length of the N run currently being scanned
+ gnSeqI n_stretch_start = 0; // position of the first N of the current run
+ gnSeqI n_stretch_end = 1; // position where the current kept region starts
+
+ while( seqI <= in_seq.length() ){
+ // shorten the last chunk so that it stops at the end of the sequence
+ read_length = seqI + read_length < in_seq.length() ? read_length : in_seq.length() - seqI + 1;
+ in_seq.ToString( cur_seq, read_length, seqI );
+
+ uint charI = 0;
+ for( ; charI < cur_seq.size(); charI++ ){
+ if( cur_seq[ charI ] == 'N' || cur_seq[ charI ] == 'n' ){
+ if( n_count == 0 ){
+ n_stretch_start = seqI + charI;
+ }
+ n_count++;
+ }else{
+ // first non-N after a run: cut here if the run was long enough
+ if( n_count > mask_n_length ){
+ // skip empty regions (the N run began where the kept region would start)
+ if( n_stretch_start - n_stretch_end != 0 ){
+ // Add the sequence region to the output sequence
+ out_seq += in_seq.subseq( n_stretch_end, n_stretch_start - n_stretch_end );
+ // add the masked coordinates
+ seq_coords.push_back( n_stretch_end );
+ seq_coords.push_back( n_stretch_start - 1 );
+ }
+ // update n_stretch_end to the first non N character
+ n_stretch_end = seqI + charI;
+ }
+ n_count = 0;
+ }
+ }
+ seqI += read_length;
+ }
+ // flush the final region, which runs to the end of the input
+ out_seq += in_seq.subseq( n_stretch_end, seqI - n_stretch_end );
+
+ // add the masked coordinates
+ seq_coords.push_back( n_stretch_end );
+ seq_coords.push_back( seqI - 1 );
+}
+
+ // use dmSML to construct the SML
+ // then read it in using LoadFile()
+ //
+ // Steps: strip N runs from the sequence, write the result as a raw
+ // sequence to a temp file, write the kept-region coordinates to
+ // "<filename>.coords", run dmSML() on the raw file, delete the raw file
+ // and load the resulting sorted mer list.
+ // @param seq   the sequence to build a sorted mer list for
+ // @param seed  forwarded unchanged to dmSML()
+void FileSML::dmCreate(const gnSequence& seq, const uint64 seed){
+ // Filter NNNNNs
+ gnSequence masked_seq;
+ seq_coords.clear();
+ // a mask length of 0 drops every N run of one or more characters that is
+ // followed by a non-N character
+ maskNNNNN( seq, masked_seq, seq_coords, 0 );
+
+ // write a raw sequence to a tmp file stored in the first scratch path
+ string rawfile = CreateTempFileName("dm_rawseq");
+ gnRAWSource::Write( masked_seq, rawfile.c_str() );
+
+ // write a sequence coordinate file
+ if( seq_coords.size() > 0 ){
+ string coordfile = filename + ".coords";
+ ofstream coord_out( coordfile.c_str() );
+ if( !coord_out.is_open() ){
+ cerr << "Could not open " << coordfile << endl;
+ // note: this throws an empty C string literal, not an exception object
+ throw "";
+ }
+
+ // maskNNNNN() pushes coordinates in (start, end) pairs, one pair per line
+ for( int coordI = 0; coordI < seq_coords.size(); coordI+=2 ){
+ coord_out << seq_coords[ coordI ] << '\t' << seq_coords[ coordI + 1 ] << endl;
+ }
+ coord_out.close();
+ }
+
+
+ // run dmSML
+ const char* const* scratch_paths = (const char* const*)tmp_paths;
+ sarfile.close();
+ int rval = dmSML( rawfile.c_str(), filename.c_str(), scratch_paths, seed );
+ // a non-zero return value is only logged; LoadFile() below still runs
+ if( rval != 0 )
+ cerr << "Crap. It's broke, return value " << rval << endl;
+
+ boost::filesystem::remove( rawfile );
+ // load the sorted mer list
+ LoadFile( filename );
+}
+
+/**
+ * Builds the sorted mer list for seq in memory and writes it to the file
+ * named by filename (header, packed sequence, sorted positions), then
+ * reopens the file read-only and memory-maps it.
+ * If anything throws while the in-memory list is being built, the work is
+ * handed to dmCreate(), which builds the file externally and loads it.
+ * @param seq   the sequence to index
+ * @param seed  the seed pattern; a seed whose weight differs from its length
+ *              is treated as a spaced seed
+ * @throws FileNotOpened   when the file cannot be opened
+ * @throws IOStreamFailed  when writing the header or the list fails
+ */
+void FileSML::Create(const gnSequence& seq, const uint64 seed){
+
+    vector<bmer> sml_array;
+    bool is_spaced_seed = getSeedWeight(seed) != getSeedLength(seed);
+    OpenForWriting( true );
+
+    try{
+        SortedMerList::Create( seq, seed );
+
+        if( is_spaced_seed )
+            FillDnaSeedSML(seq, sml_array);
+        else
+            FillSML(seq, sml_array);
+
+    }catch(...){
+        // if there was a memory allocation error then
+        // try using dmSML to do an external sort
+        sarfile.clear();
+        sarfile.close();
+        sarfile.clear();
+        if( sequence != NULL ){
+            delete[] sequence;
+            // LoadFile(), reached through dmCreate(), frees any non-NULL
+            // sequence; without this reset the buffer was freed twice
+            sequence = NULL;
+        }
+        binary_seq_len = 0;
+
+        dmCreate( seq, seed );
+        // dmCreate() has written the file and loaded it read-only, so this
+        // object is complete.  Falling through would write the header to a
+        // read-only stream and throw.
+        return;
+    }
+
+//  RadixSort(s_array);
+    sort(sml_array.begin(), sml_array.end(), &bmer_lessthan);
+
+    /* now write out the file header */
+    sarfile.write((char*)&header, sizeof(struct SMLHeader));
+
+    if(!sarfile.good()){
+        sarfile.clear();
+        Throw_gnExMsg( IOStreamFailed(), "Error writing sorted mer list header to disk.\n");
+    }
+
+    /* write out the actual sequence */
+    sarfile.write((char*)sequence, binary_seq_len*sizeof(uint32));
+    sarray_start_offset = sarfile.tellg();
+
+    /* write out the sorted mer list */
+    for(gnSeqI suffixI=0; suffixI < sml_array.size(); suffixI++)
+        sarfile.write((char*)&(sml_array[suffixI].position), sizeof(smlSeqI_t));
+
+    sarfile.flush();
+    if(!sarfile.good()){
+        sarfile.clear();
+        Throw_gnExMsg( IOStreamFailed(), "Error writing sorted mer list to disk.\n");
+    }
+    // reopen the sorted mer list file read-only
+    sarfile.close();
+    sarfile.open(filename.c_str(), ios::binary | ios::in );
+    if(!sarfile.is_open())
+        Throw_gnExMsg( FileNotOpened(), "FileSML::Create: Error opening sorted mer list file.\n");
+
+    sardata.open(filename);
+}
+
+/**
+ * Returns the index-th entry of the sorted mer list: the position stored at
+ * base()[index] together with the seed mer found at that position.
+ * No range check is made on index.
+ */
+bmer FileSML::operator[](gnSeqI index)
+{
+    bmer entry;
+    entry.position = base()[index];
+    entry.mer = GetSeedMer(entry.position);
+    return entry;
+}
+
+
+/**
+ * Reads a run of entries from the sorted mer list.
+ * @param readVector  receives the entries; resized to the number read, or
+ *                    cleared when offset is at or past the end of the list
+ * @param size        maximum number of entries to read
+ * @param offset      index of the first entry to read
+ * @return false when the file is not open or offset is at or past the end
+ *         of the list, true otherwise
+ */
+boolean FileSML::Read(vector<bmer>& readVector, gnSeqI size, const gnSeqI offset)
+{
+    if(!sarfile.is_open()){
+        DebugMsg("FileSML::Read: Error sar file not open.\n");
+        return false;
+    }
+
+    gnSeqI total_len = SMLLength();
+    if(offset >= total_len){
+        readVector.clear();
+        return false;
+    }
+    // offset < total_len holds here, so the subtraction cannot wrap; the
+    // previous form, offset + size, could overflow for a very large size
+    const gnSeqI available = total_len - offset;
+    gnSeqI readlen = size < available ? size : available;
+
+    readVector.resize( readlen );
+
+    //copy data to the vector
+    for(gnSeqI j=0; j < readlen; j++){
+        bmer tmp_mer;
+        tmp_mer.position = base()[offset+j];
+        if( tmp_mer.position > header.length ){
+            // Format the number through a stream.  The previous code added
+            // the position to a string literal, which is pointer arithmetic
+            // on the literal rather than concatenation.
+            ostringstream msg;
+            msg << "Corrupted SML, position " << tmp_mer.position << " is out of range\n";
+            const string errmsg = msg.str();
+            ErrorMsg( errmsg );
+            cerr << errmsg;
+        }else
+            tmp_mer.mer = GetSeedMer(tmp_mer.position);
+        readVector[ j ] = tmp_mer;
+    }
+    return true;
+}
+
+/**
+ * Recursively builds a sorted mer list by splitting the sequence in two,
+ * building each half into a temporary file and merging the two results into
+ * this object with Merge().  With split_levels == 0 it calls Create()
+ * directly.
+ * NOTE(review): mersize is passed to Create() in the position of its seed
+ * parameter -- confirm callers supply a value Create() can interpret.
+ * NOTE(review): the cloned objects are not released if BigCreate() or Merge()
+ * throws.
+ * @param seq           the sequence to index
+ * @param split_levels  number of remaining halving steps
+ * @param mersize       forwarded to the recursive calls and to Create()
+ */
+void FileSML::BigCreate(const gnSequence& seq, const uint32 split_levels, const uint32 mersize){
+// unsigned long freemem = wxGetFreeMemory(); //get the amount of free memory.
+// unsigned long neededmem = GetNeededMemory(seq.length());
+// if(neededmem >= freemem && neededmem > MEMORY_MINIMUM){ // divide and conquer
+ if(split_levels > 0){ // split_levels defines the number of times to divide and conquer
+ // round the midpoint down so that midpoint * alphabet_bits is a multiple
+ // of 32, i.e. the first half ends on a 32-bit word boundary
+ uint64 midpoint = seq.length() / 2;
+ midpoint = (midpoint * header.alphabet_bits) / 32;
+ midpoint = (midpoint / header.alphabet_bits) * 32;
+ gnSequence seqA = seq.subseq(1, midpoint);
+ gnSequence seqB = seq.subseq(1 + midpoint, seq.length() - midpoint);
+ seqA.setCircular(false);
+ seqB.setCircular(false);
+ cout << "Splitting " << seq.length() << " to " << seqA.length() << " and " << seqB.length() << "\n";
+
+ //create the first sar
+ string temp_sarfile = CreateTempFileName("bdsa_split");
+ FileSML* temp_sar = this->Clone();
+ temp_sar->filename = temp_sarfile.c_str();
+ temp_sar->BigCreate(seqA, split_levels - 1, mersize);
+
+ //create the second sar
+ string temp_sarfile2 = CreateTempFileName("bdsa_split");
+ FileSML* temp_sar2 = this->Clone();
+ temp_sar2->filename = temp_sarfile2.c_str();
+ temp_sar2->BigCreate(seqB, split_levels - 1, mersize);
+
+ //merge them to this file
+ cout << "Merging " << seqA.length() << " and " << seqB.length() << "\n";
+ Merge(*temp_sar, *temp_sar2);
+ //free up RAM
+ delete temp_sar;
+ delete temp_sar2;
+ //erase the temp files.
+ boost::filesystem::remove(temp_sarfile);
+ boost::filesystem::remove(temp_sarfile2);
+ }else{
+ Create(seq, mersize);
+ }
+}
+
+/**
+ * Sorts s_array in place by the 64-bit mer value using bucket passes over
+ * 11-bit digits, from the least significant digit to the most significant.
+ * Two arrays of 2^11 buckets are used and swapped after every pass.
+ * @param s_array  the entries to sort; replaced by the sorted sequence
+ */
+void FileSML::RadixSort(vector<bmer>& s_array){
+ vector<bmer> *source_buckets;
+ vector<bmer> *tmp_buckets;
+ vector<bmer> *buckets;
+ uint32 radix_size = 11;
+ // build a 64-bit all-ones value, then keep only the low radix_size bits
+ uint64 radix_mask = 0xFFFFFFFF;
+ radix_mask <<= 32;
+ radix_mask |= 0xFFFFFFFF;
+ radix_mask >>= 64 - radix_size;
+
+ uint32 bucket_count = (uint32) pow((double)2, (double)radix_size);
+ uint32 cur_shift_bits = 0;
+ buckets = new vector<bmer>[bucket_count];
+ source_buckets = new vector<bmer>[bucket_count];
+ uint64 cur_bucket;
+ // first pass: distribute by the lowest digit, straight from s_array
+ for(uint32 merI = 0; merI < s_array.size(); merI++){
+ cur_bucket = s_array[merI].mer & radix_mask;
+ source_buckets[cur_bucket].push_back(s_array[merI]);
+ }
+ s_array.clear();
+ cur_shift_bits += radix_size;
+ radix_mask <<= radix_size;
+ // remaining passes: redistribute by each higher digit, reading the buckets
+ // in order so that the ordering from earlier passes is preserved
+ while(cur_shift_bits < 64){
+ for(uint32 bucketI = 0; bucketI < bucket_count; bucketI++){
+ for(uint32 merI = 0; merI < source_buckets[bucketI].size(); merI++){
+ cur_bucket = source_buckets[bucketI][merI].mer & radix_mask;
+ cur_bucket >>= cur_shift_bits;
+ buckets[cur_bucket].push_back(source_buckets[bucketI][merI]);
+ }
+ source_buckets[bucketI].clear();
+ }
+ cur_shift_bits += radix_size;
+ radix_mask <<= radix_size;
+ // swap the roles of the two bucket arrays for the next pass
+ tmp_buckets = source_buckets;
+ source_buckets = buckets;
+ buckets = tmp_buckets;
+ }
+ // concatenate the buckets of the final pass back into s_array
+ s_array.clear();
+ for(uint32 bucketI = 0; bucketI < bucket_count; bucketI++){
+ for(uint32 merI = 0; merI < source_buckets[bucketI].size(); merI++){
+ s_array.push_back(source_buckets[bucketI][merI]);
+ }
+ source_buckets[bucketI].clear();
+ }
+ delete[] source_buckets;
+ delete[] buckets;
+}
+
+//Merges the supplied sorted mer lists into this one, overwriting the existing sml.
+//KNOWN BUG: The first sorted mer list must have (length * alphabet_bits) / word_bits == 0
+//for Merge to work properly.
+//
+// Outline: combine the two headers, write the new header, concatenate the two
+// packed sequences (shifting the second one when the first does not end on a
+// 32-bit word boundary), write the sequence, then merge the two position lists
+// together with the mers that span the junction between the two sequences and
+// write the merged positions as 32-bit values. The file is finally reopened
+// read-only.
+// Throws SMLMergeError when the two lists are incompatible, IOStreamFailed on
+// write errors and FileNotOpened when the final reopen fails.
+void FileSML::Merge(SortedMerList& sa, SortedMerList& sa2){
+STACK_TRACE_START
+ SMLHeader sa_head = sa.GetHeader();
+ SMLHeader sa_head2 = sa2.GetHeader();
+
+ //basic copying
+ header = sa_head;
+ //take the smaller mer_size
+ if(sa_head.seed_length < sa_head2.seed_length){
+ header.seed_length = sa_head.seed_length;
+ mer_mask = sa.GetMerMask();
+ }else{
+ header.seed_length = sa_head2.seed_length;
+ mer_mask = sa2.GetMerMask();
+ }
+ header.unique_mers = NO_UNIQUE_COUNT;
+ header.length += sa_head2.length;
+
+ //allocate some memory
+ const uint32 SEQ_BUFFER_SIZE = 200000;
+ Array<uint32> seq_buf ( SEQ_BUFFER_SIZE + header.seed_length );
+
+ //do some sanity checks on the sars we're merging.
+ // NOTE(review): this check runs after header and mer_mask have already been
+ // overwritten above, so a failed check leaves this object modified.
+ if(sa_head.alphabet_bits != sa_head2.alphabet_bits ||
+ sa_head.version != sa_head2.version ||
+ memcmp(sa_head.translation_table, sa_head2.translation_table, UINT8_MAX)){
+ Throw_gnExMsg(SMLMergeError(), "Incompatible sorted mer lists.");
+ }
+
+ OpenForWriting( true );
+
+ //write the header
+ sarfile.write((char*)&header, sizeof(struct SMLHeader));
+ if(!sarfile.good()){
+ sarfile.clear();
+ sarfile.close();
+ sarfile.open(filename.c_str(), ios::binary | ios::in );
+ Throw_gnExMsg(IOStreamFailed(), "Error writing sorted mer list header to disk.");
+ }
+
+ //copy sequence data into memory.
+ // NOTE(review): this local has the same name as the binary_seq_len that
+ // LoadFile() and Create() assign without declaring; that one is presumably a
+ // member and is not updated here.
+ uint32 binary_seq_len = (header.length * header.alphabet_bits) / 32;
+ if((header.length * header.alphabet_bits) % 32 > 0)
+ binary_seq_len++;
+
+ //The +1 is to avoid access violations when copying in the
+ //binary sequence before shifting.
+ if( sequence != NULL )
+ delete[] sequence;
+ sequence = new uint32[binary_seq_len+1];
+ sa.GetBSequence(sequence, sa_head.length, 0);
+
+ // number of whole words used by the first sequence, and the bits left over
+ uint32 bseq_len1 = (sa_head.length * sa_head.alphabet_bits) / 32;
+ uint32 bseq_remainder = (sa_head.length * sa_head.alphabet_bits) % 32;
+ if(bseq_remainder > 0){
+ // the second sequence is copied in starting at the partially used word
+ // and then shifted into place word by word
+ sa2.GetBSequence(&(sequence[bseq_len1]), sa_head2.length, 0);
+ //mask off the end of the first sequence
+ uint32 end_mask = 0xFFFFFFFF;
+ end_mask <<= bseq_remainder;
+ sequence[bseq_len1] &= end_mask;
+
+ //shift the second sequence over.
+ for(uint32 i=bseq_len1; i < binary_seq_len; i++){
+ uint32 tmp = sequence[i+1];
+ tmp >>= 32 - bseq_remainder;
+ sequence[i] |= tmp;
+ sequence[i+1] <<= bseq_remainder;
+ }
+ }else
+ sa2.GetBSequence(&(sequence[bseq_len1]), sa_head2.length, 0);
+
+ //write the sequence
+ sarfile.write((char*)sequence, binary_seq_len * sizeof(uint32));
+ sarray_start_offset = sarfile.tellg();
+
+ //get new mers in the middle
+ // these are the mers that start in the last seed_length-1 positions of the
+ // first sequence
+ // NOTE(review): the start index is computed in unsigned arithmetic and would
+ // wrap if sa_head.length + 1 < header.seed_length.
+ vector<bmer> middle_mers;
+ bmer mid_mer;
+ for(uint32 midI = sa_head.length - header.seed_length + 1; midI < sa_head.length; midI++){
+ mid_mer.position = midI;
+ mid_mer.mer = GetMer(midI);
+ middle_mers.push_back(mid_mer);
+ }
+ sort(middle_mers.begin(), middle_mers.end(), &bmer_lessthan);
+ //put a special mer at the end which will never go into the sorted mer list
+ //since every possible mer is less than it.
+ mid_mer.mer = 0xFFFFFFFF;
+ mid_mer.mer <<= 32;
+ mid_mer.mer |= 0xFFFFFFFF;
+ mid_mer.position = GNSEQI_END;
+ middle_mers.push_back(mid_mer);
+ //merge and write the sorted mer lists
+ vector<bmer> array1, array2;
+ uint32 SAR_BUFFER_SIZE = SEQ_BUFFER_SIZE/2; //actual size is this number * 13 bytes
+ // k and l are the read offsets into sa and sa2; m and n index the current
+ // buffers array1 and array2; midI indexes middle_mers; bufferI counts the
+ // positions waiting in seq_buf
+ uint32 k=0, l=0, midI=0;
+ uint32 m = 0, n = 0;
+ gnSeqI bufferI=0;
+ // Both buffers start empty, so the first pass through the loop body only
+ // performs the two Read() calls at the bottom.
+ // NOTE(review): the loop ends as soon as either buffer comes back empty.
+ // After that only what is left in the other buffer is consolidated below;
+ // entries of the longer list that were never read do not appear to be
+ // written. Confirm against callers before relying on Merge().
+ do{
+ //mergesort them
+ while(m < array1.size() && n < array2.size()){
+ if(array1[m].mer <= array2[n].mer){
+ if(array1[m].mer <= middle_mers[midI].mer){
+ seq_buf.data[bufferI] = array1[m].position;
+ m++;
+ bufferI++;
+ }else{
+ seq_buf.data[bufferI] = middle_mers[midI].position;
+ midI++;
+ bufferI++;
+ }
+ }else if(array2[n].mer <= middle_mers[midI].mer){
+ // positions from the second list are offset by the first list's length
+ seq_buf.data[bufferI] = array2[n].position + sa_head.length;
+ n++;
+ bufferI++;
+ }else{
+ seq_buf.data[bufferI] = middle_mers[midI].position;
+ midI++;
+ bufferI++;
+ }
+ // flush the output buffer whenever it is full
+ if(bufferI == SEQ_BUFFER_SIZE){
+ sarfile.write((char*)seq_buf.data, bufferI * sizeof(uint32));
+ bufferI = 0;
+ }
+ }
+ // refill whichever input buffer has been used up
+ if(m == array1.size()){
+ sa.Read(array1, SAR_BUFFER_SIZE, k);
+ k += array1.size();
+ m = 0;
+ }
+ if(n == array2.size()){
+ sa2.Read(array2, SAR_BUFFER_SIZE, l);
+ l += array2.size();
+ n = 0;
+ }
+ }while(array1.size() != 0 && array2.size() != 0);
+ if(bufferI > 0)
+ sarfile.write((char*)seq_buf.data, (bufferI)*sizeof(uint32));
+ //consolidate the remaining mers to a known vector
+ vector<bmer> remaining_mers;
+ for(;m < array1.size(); m++)
+ remaining_mers.push_back(array1[m]);
+ for(;n < array2.size(); n++){
+ remaining_mers.push_back(array2[n]);
+ remaining_mers[remaining_mers.size()-1].position += sa_head.length;
+ }
+ // the last element of middle_mers is the sentinel and is never copied
+ for(;midI < middle_mers.size() - 1; midI++)
+ remaining_mers.push_back(middle_mers[midI]);
+ //merge them with the remaining middle_mers
+ sort(remaining_mers.begin(), remaining_mers.end(), &bmer_lessthan);
+ uint32 remI = 0;
+ for(;remI < remaining_mers.size(); remI++)
+ seq_buf.data[remI] = remaining_mers[remI].position;
+ if(remI > 0)
+ sarfile.write((char*)seq_buf.data, (remI)*sizeof(uint32));
+
+ if(!sarfile.good()){
+ sarfile.clear();
+ sarfile.close();
+ sarfile.open(filename.c_str(), ios::binary | ios::in );
+ Throw_gnExMsg(IOStreamFailed(), "Error writing position array.");
+ }
+ // reopen the sorted mer list file read-only
+ sarfile.close();
+ sarfile.open(filename.c_str(), ios::binary | ios::in );
+ if(!sarfile.is_open()){
+ sarfile.clear();
+ Throw_gnExMsg(FileNotOpened(), "Error opening sorted mer list file.");
+ }
+STACK_TRACE_END
+}
+
+}
diff --git a/libMems/FileSML.h b/libMems/FileSML.h
new file mode 100644
index 0000000..e262db0
--- /dev/null
+++ b/libMems/FileSML.h
@@ -0,0 +1,135 @@
+/*******************************************************************************
+ * $Id: FileSML.h,v 1.11 2004/03/01 02:40:08 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifndef _FileSML_h_
+#define _FileSML_h_
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#pragma warning(push)
+#pragma warning(disable : 4996)
+#pragma warning(pop)
+
+#include "libGenome/gnSequence.h"
+#include "libMems/SortedMerList.h"
+#include <boost/iostreams/device/mapped_file.hpp>
+#include <fstream>
+#include <vector>
+#include <string>
+
+namespace mems {
+
+//sequence database size will be
+//base_count / 4 + base_count * 12 bytes
+
+#define DEFAULT_MEMORY_MINIMUM 20971520 //~20 Megabytes
+
+/**
+ * Sorted mer list whose data lives in a file on disk (see filename, sarfile
+ * and sardata below).  The class is abstract: Clone() and GetNeededMemory()
+ * are pure virtual and must be supplied by a concrete subclass.
+ */
+class FileSML : public SortedMerList
+{
+public:
+ FileSML() : SortedMerList() {
+// file_mutex = new wxMutex();
+ };
+ FileSML& operator=(const FileSML& sa);
+ virtual FileSML* Clone() const = 0;
+
+ virtual void Clear();
+
+ /**
+ * Loads an existing sorted mer list from a file on disk.
+ * @param fname The name of the file to load
+ * @throws FileNotOpened thrown if the file could not be opened
+ * @throws FileUnreadable thrown if the file was corrupt or not a sorted mer list
+ */
+ virtual void LoadFile(const std::string& fname);
+ /**
+ * Creates large sorted mer lists which do not fit entirely in memory.
+ * BigCreate uses an external mergesort to create large sorted mer lists.
+ * It will divide the data a number of times specified by the split_levels
+ * parameter. Each split is written to temp files on disk and merged.
+ * @param seq The sequence to create an SML for.
+ * @param split_levels The number of times to divide the sequence in half.
+ * @param mersize The size of the mers to sort on.
+ * @see FileSML::Create
+ */
+ virtual void BigCreate(const genome::gnSequence& seq, const uint32 split_levels, const uint32 mersize = DNA_MER_SIZE);
+ virtual void Create(const genome::gnSequence& seq, const uint64 seed );
+ // NOTE(review): presumably fills readVector with up to `size` mers starting
+ // at `offset`; confirm against the definition in FileSML.cpp.
+ virtual boolean Read(std::vector<bmer>& readVector, gnSeqI size, gnSeqI offset = 0);
+ // Merges two sorted mer lists into this one.  The visible end of the
+ // implementation writes the merged position array through sarfile, throws
+ // IOStreamFailed if the write failed, then reopens the file read-only and
+ // throws FileNotOpened if that fails.
+ virtual void Merge(SortedMerList& sa, SortedMerList& sa2);
+
+ virtual bmer operator[]( gnSeqI index );
+
+ virtual gnSeqI UniqueMerCount();
+ virtual void SetDescription(const std::string& d);
+ virtual void SetID(const sarID_t d);
+
+ // Defined inline after the class; returns 100.
+ virtual uint32 FormatVersion();
+ // Defined inline after the class; returns DEFAULT_MEMORY_MINIMUM.
+ static uint64 MemoryMinimum();
+ virtual void RadixSort(std::vector<bmer>& s_array);
+
+ void dmCreate(const genome::gnSequence& seq, const uint64 seed);
+ // Accessors for the scratch directories held in tmp_paths (see below).
+ static void registerTempPath( const std::string& tmp_path );
+
+ static const char* getTempPath( int pathI );
+
+ static int getTempPathCount();
+
+ /** @return a read-only reference to seq_coords */
+ const std::vector< int64 >& getUsedCoordinates() const { return seq_coords; };
+
+protected:
+ /**
+ * Reopens the sarfile fstream in read/write mode
+ * @throws FileNotOpened thrown if the file could not be opened for writing
+ */
+ virtual void OpenForWriting( boolean truncate = false );
+ /**
+ * Writes the SML header to disk
+ * @throws FileNotOpened thrown if the file could not be opened for writing
+ * @throws IOStreamFailed thrown if an error occurred writing the data
+ */
+ virtual boolean WriteHeader();
+ /**
+ * Calculates and returns the amount of memory needed to create a sorted
+ * mer list for a sequence of the specified length.
+ * @param len The length of the sequence
+ * @return The amount of memory needed in bytes.
+ */
+ virtual uint64 GetNeededMemory(gnSeqI len) = 0;
+
+ std::string filename; /**< path used when (re)opening sarfile */
+ std::fstream sarfile; /**< stream onto the sorted mer list file */
+ uint64 sarray_start_offset; /**< byte offset added to sardata.data() by base() */
+
+ boost::iostreams::mapped_file_source sardata;
+ // Pointer into the mapped file, sarray_start_offset bytes past its start,
+ // reinterpreted as an array of smlSeqI_t.
+ smlSeqI_t* base(){ return (smlSeqI_t*)(sardata.data()+sarray_start_offset); }
+
+ static char** tmp_paths; /**< paths to scratch disk space that can be used for an external sort */
+ std::vector< int64 > seq_coords; /**< If Ns are masked, contains coordinates of regions without Ns */
+};
+
+// versions 2 and 5 were previous
+// jump to 100 to avoid confusion with DNAFileSML
+inline
+uint32 FileSML::FormatVersion(){
+	// Version number this class reports for its file format.
+	static uint32 reported_version = 100;
+	return reported_version;
+}
+
+inline
+uint64 FileSML::MemoryMinimum(){
+	// Held as a 32-bit value and widened to uint64 on return.
+	static uint32 floor_bytes = DEFAULT_MEMORY_MINIMUM;
+	return floor_bytes;
+}
+
+void maskNNNNN( const genome::gnSequence& in_seq, genome::gnSequence& out_seq, std::vector< int64 >& seq_coords, int mask_n_length );
+
+}
+
+#endif //_FileSML_h_
diff --git a/libMems/Files.h b/libMems/Files.h
new file mode 100644
index 0000000..9a944e7
--- /dev/null
+++ b/libMems/Files.h
@@ -0,0 +1,213 @@
+/*******************************************************************************
+ * $Id: Files.h,v 1.23 2004/04/19 23:10:13 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifndef __libMems_Files_h__
+#define __libMems_Files_h__
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+// for CreateTempFilename
+#ifdef WIN32
+#include "windows.h"
+#else
+#include "unistd.h"
+#endif
+
+#include "boost/filesystem/operations.hpp"
+#include "boost/filesystem/exception.hpp"
+#include "boost/algorithm/string.hpp"
+#include <climits>
+#include <cstdlib>
+#include <cstring>
+#include <exception>
+#include <iomanip>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+
+/**
+ * Register a file name to be deleted before the process exits
+ * When passed an empty string, it does not add to the list of files to delete
+ * @param fname The name of a file to delete, empty strings are ignored
+ * @return A vector of file names registered for deletion
+ */
+std::vector< std::string >& registerFileToDelete( std::string fname = "" );
+
+inline
+std::vector< std::string >& registerFileToDelete( std::string fname ) {
+	// The registry is heap-allocated and intentionally never freed, so its
+	// destructor cannot run before an atexit() handler needs the list.
+	static std::vector< std::string >* registry = new std::vector< std::string >();
+#pragma omp critical
+{
+	// an empty name means "just give me the list"
+	if( !fname.empty() )
+		registry->push_back( fname );
+}
+	return *registry;
+}
+
+void deleteRegisteredFiles();
+inline
+void deleteRegisteredFiles() {
+	// Delete every file registered through registerFileToDelete() and then
+	// empty the registry.  A failure to delete one file is reported on
+	// std::cerr and does not stop the remaining files from being removed;
+	// no exception leaves this function, so it is safe to run at exit.
+	std::vector< std::string >& del_files = registerFileToDelete();
+	for( std::vector< std::string >::size_type fileI = 0; fileI < del_files.size(); ++fileI )
+	{
+		try{
+			boost::filesystem::remove( del_files[ fileI ] );
+		}catch( const std::exception& e ){
+			std::cerr << "Unable to delete temporary file \"" << del_files[ fileI ] << "\": " << e.what() << std::endl;
+		}
+	}
+	del_files.clear();	// clear the deleted files from the list
+}
+
+
+/**
+ * Create a temporary file
+ */
+std::string CreateTempFileName(const std::string& prefix);
+
+
+/* shamelessly ripped from wxWidgets and boostified*/
+inline
+std::string CreateTempFileName(const std::string& prefix)
+{
+	// Builds a temporary file name from `prefix`: the directory part of the
+	// prefix (or a default temp directory when it has none) plus its leaf name
+	// and a unique suffix.  Returns an empty string when the platform call
+	// that generates the name fails.
+	std::string dir, name, ret_path;
+#ifdef WIN32
+	char buf[MAX_PATH + 1];
+#else
+	char buf[PATH_MAX + 1];
+#endif
+	boost::filesystem::path path( prefix );
+	dir = path.branch_path().string();
+#ifdef WIN32
+	name = path.leaf();
+#else
+	name = path.leaf().string();
+#endif
+	if( name == "/" )
+	{
+		dir += name;
+		name.clear();
+	}
+#if defined(WIN32)
+
+	if ( dir.size() == 0 )
+	{
+		// start from an empty string so a failed GetTempPath() leaves dir empty
+		buf[0] = '\0';
+		if ( !::GetTempPath(MAX_PATH, buf) )
+			std::cerr << "GetTempPath\n";
+
+		dir = buf;
+		if ( dir.size()==0 )
+			dir = ".";	// GetTempFileName() fails if we pass it an empty string
+	}
+	else	// we have a dir to create the file in
+	{
+		// ensure we use only the back slashes as GetTempFileName(), unlike all
+		// the other APIs, is picky and doesn't accept the forward ones
+		boost::algorithm::replace_all( dir, "/", "\\" );
+	}
+
+	buf[0] = '\0';
+	if ( !::GetTempFileName(dir.c_str(), name.c_str(), 0, buf) )
+	{
+		// leave ret_path empty instead of returning a stale buffer
+		std::cerr << "GetTempFileName\n";
+	}
+	else
+	{
+		ret_path = buf;
+	}
+
+#else // !Windows
+	if ( dir.empty() )
+	{
+		// take the first of TMP, TMPDIR, TEMP that is set and non-empty
+		static const char* const env_names[] = { "TMP", "TMPDIR", "TEMP" };
+		for ( size_t envI = 0; envI < 3 && dir.empty(); envI++ )
+		{
+			const char* env_val = getenv( env_names[envI] );
+			if ( env_val != NULL )
+				dir = env_val;
+		}
+
+		if ( dir.empty() )
+		{
+			// default
+			#ifdef __DOS__
+			dir = ".";
+			#else
+			dir = "/tmp";
+			#endif
+		}
+	}
+
+	path = dir;
+	path /= name;
+
+	// we need to copy the path to the buffer in which mkstemp() can modify it
+	std::string path_str = path.string();
+	path_str += "XXXXXX";	// scratch space for mkstemp()
+	// never write past buf: a template that doesn't fit can't be used
+	const bool template_fits = path_str.size() < sizeof(buf);
+	if ( template_fits )
+		strncpy( buf, path_str.c_str(), path_str.size()+1 );
+	else
+		buf[0] = '\0';
+
+#if defined(HAVE_MKSTEMP)
+	int fdTemp = template_fits ? mkstemp( buf ) : -1;
+	if ( fdTemp != -1 )
+	{
+		// mkstemp() succeeded and created the file; keep only its name
+		ret_path = buf;
+		close(fdTemp);
+	}
+#else // !HAVE_MKSTEMP
+
+#ifdef HAVE_MKTEMP
+	// mktemp() returns the template pointer, not a file descriptor, and
+	// empties the template on failure
+	if ( template_fits && mktemp( buf ) != NULL && buf[0] != '\0' )
+		ret_path = buf;
+
+#else // !HAVE_MKTEMP (includes __DOS__)
+	// generate the unique file name ourselves
+	unsigned my_pid = 0;
+	#ifndef __DOS__
+	my_pid = getpid();
+	#endif
+
+	std::string oss_str;
+	static const size_t numTries = 1000;
+	for ( size_t n = 0; n < numTries; n++ )
+	{
+		std::ostringstream oss;
+		oss << path.string() << my_pid << "." << std::setfill('0') << std::setw(3) << n;
+		// 3 decimal digits is enough for numTries == 1000
+		boost::filesystem::path pathTry( oss.str() );
+		oss_str = oss.str();
+		if ( !boost::filesystem::exists(pathTry) )
+			break;
+
+	}
+
+	// if every candidate exists the last one tried is returned
+	ret_path = oss_str;
+#endif // HAVE_MKTEMP/!HAVE_MKTEMP
+
+#endif // HAVE_MKSTEMP/!HAVE_MKSTEMP
+
+#endif // Windows/!Windows
+
+	return ret_path;
+}
+
+
+#endif // __libMems_Files_h__
+
diff --git a/libMems/GappedAligner.h b/libMems/GappedAligner.h
new file mode 100644
index 0000000..42d94bd
--- /dev/null
+++ b/libMems/GappedAligner.h
@@ -0,0 +1,73 @@
+/*******************************************************************************
+ * $Id: GappedAligner.h,v 1.12 2004/04/19 23:10:50 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifndef _GappedAligner_h_
+#define _GappedAligner_h_
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libGenome/gnSequence.h"
+#include "libMems/GappedAlignment.h"
+#include "libMems/Match.h"
+
+namespace mems {
+
+/**
+ * Abstract interface for gapped aligners.  Implementations provide Align();
+ * this base class only stores the maximum alignment length setting.
+ */
+class GappedAligner {
+public:
+	GappedAligner() : max_alignment_length( 10000 ) {}	// default to something
+	// the class is used polymorphically, so destruction through a base
+	// pointer must reach the derived destructor
+	virtual ~GappedAligner() {}
+	GappedAligner& operator=( const GappedAligner& ga )
+	{
+		max_alignment_length = ga.max_alignment_length;
+		return *this;
+	}
+	/**
+	 * Set the maximum allowed length for a gapped alignment.  Sequences above this length
+	 * threshold will be ignored.
+	 * @param len	The maximum length
+	 */
+	void SetMaxAlignmentLength( gnSeqI len ){ max_alignment_length = len; }
+	virtual boolean Align( GappedAlignment& cr, Match* r_begin, Match* r_end, std::vector< genome::gnSequence* >& seq_table ) = 0;
+protected:
+	gnSeqI max_alignment_length;	/**< value stored by SetMaxAlignmentLength(), 10000 by default */
+};
+
+
+
+
+
+boolean getInterveningCoordinates( std::vector< genome::gnSequence* >& seq_table, Match* r_begin, Match* r_end, uint seqI, int64& gap_lend, int64& gap_rend );
+
+inline
+boolean getInterveningCoordinates( std::vector< genome::gnSequence* >& seq_table, Match* r_begin, Match* r_end, uint seqI, int64& gap_lend, int64& gap_rend ){
+	// Computes the coordinates of the region of sequence seqI lying between
+	// r_begin and r_end.  Either match may be NULL, in which case the region
+	// extends to the corresponding end of the sequence.  Always returns true.
+	const bool have_begin = r_begin != NULL;
+	const bool have_end = r_end != NULL;
+
+	// skip this sequence if it's undefined in either flanking match
+	const bool end_undefined = have_end && r_end->Start( seqI ) == NO_MATCH;
+	if( end_undefined || ( have_begin && r_begin->Start( seqI ) == NO_MATCH ) ){
+		gap_lend = 0;
+		gap_rend = 0;
+		return true;
+	}
+
+	// first attempt: treat the matches as forward oriented
+	gap_rend = have_end ? r_end->Start( seqI ) : seq_table[ seqI ]->length() + 1;
+	gap_lend = have_begin ? r_begin->End( seqI ) + 1 : 1;
+
+	// a negative coordinate means reverse orientation: swap the roles of the
+	// two matches and negate their start coordinates
+	const bool reversed = gap_rend < 0 || gap_lend < 0;
+	if( reversed ){
+		gap_rend = have_begin ? -r_begin->Start( seqI ) : seq_table[ seqI ]->length() + 1;
+		gap_lend = have_end ? -r_end->Start( seqI ) + r_end->Length() : 1;
+	}
+
+	// both ends should be positive by now; anything else is reported
+	if( !( gap_rend > 0 && gap_lend > 0 ) ){
+		genome::ErrorMsg( "Error constructing intervening coordinates" );
+	}
+	return true;
+}
+
+}
+
+#endif // _GappedAligner_h_
diff --git a/libMems/GappedAlignment.cpp b/libMems/GappedAlignment.cpp
new file mode 100644
index 0000000..043bc41
--- /dev/null
+++ b/libMems/GappedAlignment.cpp
@@ -0,0 +1,77 @@
+/*******************************************************************************
+ * $Id: GappedAlignment.cpp,v 1.27 2004/03/01 02:40:08 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * Please see the file called COPYING for licensing, copying, and modification
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/GappedAlignment.h"
+#include <sstream>
+#include "libGenome/gnFilter.h"
+
+#include <fstream>
+
+using namespace std;
+using namespace genome;
+namespace mems {
+
+// Default construction: default-constructed base and an empty align_matrix.
+GappedAlignment::GappedAlignment() :
+AbstractGappedAlignment< SparseAbstractMatch<> >()
+{}
+
+// Forwards seq_count and align_length to the base class and creates one
+// (initially empty) alignment row per sequence.
+GappedAlignment::GappedAlignment( uint seq_count, gnSeqI align_length ) :
+AbstractGappedAlignment< SparseAbstractMatch<> >( seq_count, align_length ),
+align_matrix( seq_count )
+{}
+
+void GappedAlignment::SetAlignment( const vector< string >& seq_align ){
+	// Copy the rows; the alignment length becomes the length of the first
+	// row, or 0 when no rows were supplied.
+	align_matrix = seq_align;
+	gnSeqI column_count = 0;
+	if( !seq_align.empty() )
+		column_count = seq_align[0].size();
+	SetAlignmentLength( column_count );
+}
+
+std::ostream& operator<<( std::ostream& os, const GappedAlignment& ga ); //write to source.
+std::ostream& operator<<( std::ostream& os, const GappedAlignment& ga ){
+	// Layout: a header line with the sequence count, one line holding the
+	// alignment length followed by a tab-separated start coordinate per
+	// sequence, then one alignment row per line.
+	const uint row_count = ga.SeqCount();
+	os << "GappedAlignmentSeqs: " << row_count << endl;
+
+	os << ga.AlignmentLength();
+	uint rowI = 0;
+	while( rowI < row_count ){
+		os << '\t' << ga.Start( rowI );
+		++rowI;
+	}
+	os << endl;
+
+	for( rowI = 0; rowI < row_count; ++rowI )
+		os << ga.align_matrix[ rowI ] << endl;
+	return os;
+}
+
+std::istream& operator>>( std::istream& is, GappedAlignment& ga ); // read from source
+std::istream& operator>>( std::istream& is, GappedAlignment& ga ){
+	// Reads the layout written by operator<<: a tag and sequence count, the
+	// alignment length followed by one start coordinate per sequence, then
+	// one alignment row per sequence.  If the header cannot be read, ga is
+	// left untouched and the failed stream is returned.
+	uint seq_count = 0;
+	string nuffin;
+	is >> nuffin;	// header tag
+	is >> seq_count;
+	if( !is )
+		return is;
+	ga = GappedAlignment( seq_count, 0 );
+	is >> nuffin;	// alignment length; recomputed from the rows below
+	for( uint seqI = 0; seqI < seq_count; seqI++ ){
+		int64 startI = 0;
+		is >> startI;
+		ga.SetStart( seqI, startI );
+	}
+	// the constructor already created seq_count rows, so fill them in place
+	// rather than appending after them
+	for( uint seqI = 0; seqI < seq_count; seqI++ ){
+		string seq;
+		is >> seq;
+		ga.align_matrix[ seqI ] = seq;
+	}
+	if( ga.align_matrix.size() > 0 )
+		ga.SetAlignmentLength( ga.align_matrix[ 0 ].length() );
+	return is;
+}
+
+}
diff --git a/libMems/GappedAlignment.h b/libMems/GappedAlignment.h
new file mode 100644
index 0000000..74285d5
--- /dev/null
+++ b/libMems/GappedAlignment.h
@@ -0,0 +1,283 @@
+/*******************************************************************************
+ * $Id: GappedAlignment.h,v 1.12 2004/04/19 23:10:50 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifndef __GappedAlignment_h__
+#define __GappedAlignment_h__
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libGenome/gnFilter.h"
+#include "libGenome/gnSequence.h"
+#include "libMems/SparseAbstractMatch.h"
+#include "libMems/AbstractGappedAlignment.h"
+#include "libMems/Memory.h"
+#include <iostream>
+
+namespace mems {
+
+/**
+ * A gapped alignment that stores one text row per sequence in align_matrix.
+ * Within a row the character '-' denotes a gap (see IsGap()).
+ */
+class GappedAlignment : public AbstractGappedAlignment< SparseAbstractMatch<> >
+{
+public:
+ GappedAlignment();
+ GappedAlignment( uint seq_count, gnSeqI align_length );
+
+ /** Copy-constructs a duplicate with operator new */
+ GappedAlignment* Clone() const { return new GappedAlignment( *this ); }
+ /** Duplicates this object through m_allocateAndCopy(); pair with Free() */
+ GappedAlignment* Copy() const;
+ /** Hands this object to m_free() */
+ virtual void Free();
+
+ /** Replaces the alignment rows and sets the alignment length from the first row */
+ void SetAlignment( const std::vector< std::string >& seq_align );
+
+ /**
+ * Writes this GappedAlignment to the specified output stream (e.g. cout).
+ */
+ friend std::ostream& operator<<(std::ostream& os, const GappedAlignment& ga); //write to source.
+
+ /**
+ * Reads a GappedAlignment from the specified input stream (e.g. cin).
+ */
+ friend std::istream& operator>>(std::istream& is, GappedAlignment& ga); //read from source
+
+ // Inherited methods from AbstractMatch:
+ virtual void Invert();
+ virtual void CropStart(gnSeqI crop_amount);
+ virtual void CropEnd(gnSeqI crop_amount);
+
+ virtual void CropLeft(gnSeqI crop_amount, uint seqI);
+ virtual void CropRight(gnSeqI crop_amount, uint seqI);
+
+ /** Fills align_matrix with one bitset per sequence; a set bit marks a non-gap column */
+ void GetAlignment( std::vector< bitset_t >& align_matrix ) const;
+
+ /** Returns the stored text rows of ga; seq_table is not used by the implementation */
+ friend const std::vector<std::string>& GetAlignment( const GappedAlignment& ga, const std::vector< genome::gnSequence* >& seq_table );
+
+ /**
+ * For alignment column col, reports per sequence whether the column holds a
+ * non-gap character (column) and a sequence coordinate (pos), which stays
+ * NO_MATCH when no non-gap character occurs in columns 0..col.
+ */
+ void GetColumn( gnSeqI col, std::vector<gnSeqI>& pos, std::vector<bool>& column ) const;
+
+ /**
+ * Splits the alignment before the specified column. The left-side remains in "this" GappedAlignment,
+ * and the right side is returned as a new GappedAlignment
+ */
+ virtual AbstractMatch* Split( gnSeqI before_column );
+
+ /** @return true when row seq has '-' at column col */
+ virtual bool IsGap( uint seq, gnSeqI col ) const;
+
+ void swap( GappedAlignment& other ){ swap(&other); }
+
+protected:
+ // for use by derived classes in order to swap contents
+ void swap( GappedAlignment* other ){
+ std::swap( align_matrix, other->align_matrix );
+ AbstractGappedAlignment< SparseAbstractMatch<> >::swap( other );
+ }
+
+ std::vector< std::string > align_matrix; /**< one text row per sequence, '-' marks a gap */
+
+ // Coordinate-only halves of CropStart()/CropEnd(): they update starts,
+ // lengths and the alignment length but leave align_matrix untouched.
+ void CropStartCoords(gnSeqI crop_amount);
+ void CropEndCoords(gnSeqI crop_amount);
+};
+
+
+// Duplicate this alignment through m_allocateAndCopy().
+inline
+GappedAlignment* GappedAlignment::Copy() const
+{
+	GappedAlignment* duplicate = m_allocateAndCopy( *this );
+	return duplicate;
+}
+
+// Hand this alignment to m_free().
+inline
+void GappedAlignment::Free()
+{
+	m_free( this );
+}
+
+inline
+void GappedAlignment::Invert(){
+	// Run every row through the DNA complement filter's ReverseFilter(),
+	// then let the base class invert its own state.
+	const genome::gnFilter* complement = genome::gnFilter::DNAComplementFilter();
+	uint rowI = 0;
+	while( rowI < SeqCount() ){
+		complement->ReverseFilter( align_matrix[ rowI ] );
+		++rowI;
+	}
+	AbstractGappedAlignment< SparseAbstractMatch<> >::Invert();
+}
+
+inline
+void GappedAlignment::CropStartCoords(gnSeqI crop_amount){
+	// Coordinate bookkeeping for dropping the first crop_amount columns;
+	// the text rows themselves are not modified here.
+	if( crop_amount > AlignmentLength() )
+		Throw_gnEx( genome::SeqIndexOutOfBounds() );
+
+	for( uint seqI = 0; seqI < SeqCount(); ++seqI ){
+		// number of non-gap characters among the cropped columns
+		const std::string& row = align_matrix[seqI];
+		gnSeqI residues_cropped = 0;
+		for( gnSeqI colI = 0; colI < crop_amount; ++colI ){
+			if( row[colI] != '-' )
+				++residues_cropped;
+		}
+
+		// only positive starts are advanced
+		if( Start(seqI) > 0 )
+			SetStart( seqI, Start(seqI) + residues_cropped );
+		SetLength( Length(seqI) - residues_cropped, seqI );
+		// nothing left of this sequence: mark it undefined
+		if( Length(seqI) == 0 )
+			SetLeftEnd( seqI, NO_MATCH );
+	}
+	SetAlignmentLength( AlignmentLength() - crop_amount );
+}
+
+inline
+void GappedAlignment::CropStart(gnSeqI crop_amount){
+	// update the coordinates first, then drop the leading columns of each row
+	CropStartCoords( crop_amount );
+	const uint row_count = SeqCount();
+	for( uint rowI = 0; rowI != row_count; ++rowI ){
+		std::string& row = align_matrix[ rowI ];
+		row = row.substr( crop_amount );
+	}
+}
+
+inline
+void GappedAlignment::CropEndCoords(gnSeqI crop_amount){
+	// Coordinate bookkeeping for dropping the last crop_amount columns;
+	// the text rows themselves are not modified here.
+	if( crop_amount > AlignmentLength() )
+		Throw_gnEx( genome::SeqIndexOutOfBounds() );
+	SetAlignmentLength( AlignmentLength() - crop_amount );
+
+	for( uint seqI = 0; seqI < SeqCount(); ++seqI ){
+		// number of non-gap characters among the trailing crop_amount columns
+		const std::string& row = align_matrix[seqI];
+		const std::string::size_type row_len = row.length();
+		gnSeqI residues_cropped = 0;
+		for( gnSeqI colI = row_len - crop_amount; colI < row_len; ++colI ){
+			if( row[colI] != '-' )
+				++residues_cropped;
+		}
+
+		// only negative starts are moved
+		if( Start(seqI) < 0 )
+			SetStart( seqI, Start(seqI) - residues_cropped );
+		SetLength( Length(seqI) - residues_cropped, seqI );
+		// nothing left of this sequence: mark it undefined
+		if( Length(seqI) == 0 )
+			SetLeftEnd( seqI, NO_MATCH );
+	}
+}
+
+inline
+void GappedAlignment::CropEnd(gnSeqI crop_amount){
+	CropEndCoords( crop_amount );
+	// Build truncated copies of the rows and swap them in.  Shrinking the
+	// rows in place with resize()/reserve() was found not to free memory in
+	// Windows release builds.
+	const uint row_count = SeqCount();
+	std::vector< std::string > trimmed( row_count );
+	for( uint rowI = 0; rowI < row_count; ++rowI )
+		trimmed[ rowI ] = align_matrix[ rowI ].substr( 0, AlignmentLength() );
+	align_matrix.swap( trimmed );
+}
+
+inline
+void GappedAlignment::CropLeft(gnSeqI crop_amount, uint seqI)
+{
+	// Walk along row seqI until crop_amount non-gap characters have been
+	// passed, then crop all rows at that column.  Forward rows are walked
+	// from the first column, other rows from the last.
+	if( Orientation(seqI) == AbstractMatch::forward )
+	{
+		const std::string& row = align_matrix[seqI];
+		size_t crop_col = 0;
+		while( crop_amount > 0 && crop_col < row.size() ){
+			if( row[crop_col] != '-' )
+				--crop_amount;
+			++crop_col;
+		}
+		CropStart( crop_col );
+		return;
+	}
+
+	const std::string& row = align_matrix[seqI];
+	size_t keep_cols = row.size();
+	while( crop_amount > 0 && keep_cols > 0 ){
+		if( row[keep_cols-1] != '-' )
+			--crop_amount;
+		--keep_cols;
+	}
+	CropEnd( AlignmentLength() - keep_cols );
+}
+
+inline
+void GappedAlignment::CropRight(gnSeqI crop_amount, uint seqI)
+{
+	// Implemented as a left crop on the inverted alignment, followed by a
+	// second inversion to restore the orientation.
+	// TODO: remove the dependency on Invert() since it will be slow
+	Invert();
+	CropLeft( crop_amount, seqI );
+	Invert();
+}
+
+inline
+void GappedAlignment::GetAlignment( std::vector< bitset_t >& align_matrix ) const
+{
+	// The parameter shadows the member of the same name, so the member is
+	// reached through `rows`.  Output: one bitset per row, AlignmentLength()
+	// bits each, with a bit set for every non-gap column.  Rows whose left
+	// end is NO_MATCH stay all-zero.
+	const std::vector< std::string >& rows = this->align_matrix;
+	const bitset_t blank_row( this->AlignmentLength(), false );
+	align_matrix.assign( rows.size(), blank_row );
+
+	for( size_t rowI = 0; rowI < rows.size(); ++rowI )
+	{
+		if( LeftEnd(rowI) != NO_MATCH )
+		{
+			const std::string& row = rows[rowI];
+			for( std::string::size_type colI = 0; colI < row.size(); ++colI )
+			{
+				if( row[colI] != '-' )
+					align_matrix[rowI].set(colI);
+			}
+		}
+	}
+}
+
+inline
+AbstractMatch* GappedAlignment::Split( gnSeqI before_column )
+{
+	// Splits the alignment before before_column: columns to the left stay in
+	// this object, the remaining columns are returned as a new alignment
+	// obtained through Copy().
+	// Throws genome::SeqIndexOutOfBounds when before_column is past the end.
+	// The check is made before anything is allocated or swapped, so a bad
+	// column neither leaks the new alignment nor disturbs this object's rows.
+	if( before_column > AlignmentLength() )
+		Throw_gnEx( genome::SeqIndexOutOfBounds() );
+
+	GappedAlignment ga_tmp(SeqCount(), AlignmentLength());
+	GappedAlignment* ga = ga_tmp.Copy();
+
+	for( size_t seqI = 0; seqI < SeqCount(); seqI++ )
+	{
+		ga->SetStart( seqI, Start(seqI) );
+		ga->SetLength( Length(seqI), seqI );
+	}
+	// lend our rows to ga so it can count the characters being cropped,
+	// then take them back
+	std::swap(ga->align_matrix, align_matrix);
+	ga->CropStartCoords(before_column);
+	std::swap(ga->align_matrix, align_matrix);
+
+	// right-hand side: the columns from before_column onward
+	ga->align_matrix.resize(SeqCount());
+	for( size_t seqI = 0; seqI < SeqCount(); seqI++ )
+		ga->align_matrix[seqI] = align_matrix[seqI].substr( before_column );
+	ga->SetAlignmentLength( AlignmentLength()-before_column );
+	// left-hand side: drop those same columns from this object
+	CropEnd(AlignmentLength()-before_column);
+
+	return ga;
+}
+
+const std::vector<std::string>& GetAlignment( const GappedAlignment& ga, const std::vector< genome::gnSequence* >& seq_table );
+// Returns the text rows stored in ga.  seq_table is accepted but not used.
+inline
+const std::vector<std::string>& GetAlignment( const GappedAlignment& ga, const std::vector< genome::gnSequence* >& seq_table )
+{
+	const std::vector<std::string>& rows = ga.align_matrix;
+	return rows;
+}
+
+inline
+void GappedAlignment::GetColumn( gnSeqI col, std::vector<gnSeqI>& pos, std::vector<bool>& column ) const
+{
+	// For each sequence: column[] says whether alignment column col holds a
+	// non-gap character, and pos[] gets a sequence coordinate derived from
+	// the number of non-gap characters in columns 0..col.  pos[] stays
+	// NO_MATCH when that number is zero.
+	const uint row_count = SeqCount();
+	pos = std::vector<gnSeqI>( row_count, NO_MATCH );
+	column = std::vector<bool>( row_count, false );
+
+	for( uint rowI = 0; rowI < row_count; ++rowI )
+	{
+		const std::string& row = align_matrix[rowI];
+		column[rowI] = ( row[col] != '-' );
+
+		gnSeqI residues = 0;
+		for( size_t colI = 0; colI <= col; ++colI )
+		{
+			if( row[colI] != '-' )
+				++residues;
+		}
+		if( residues == 0 )
+			continue;
+
+		// count up from the left end or down from the right end
+		if( Orientation(rowI) == forward )
+			pos[rowI] = LeftEnd(rowI) + residues - 1;
+		else if( Orientation(rowI) == reverse )
+			pos[rowI] = RightEnd(rowI) - residues + 1;
+	}
+}
+
+inline
+bool GappedAlignment::IsGap( uint seq, gnSeqI col ) const
+{
+	// a gap is stored as '-' in the row's text
+	const char symbol = align_matrix[seq][col];
+	return symbol == '-';
+}
+
+}
+
+
+namespace std {
+// Specialization that routes std::swap on GappedAlignment objects to the
+// class's own swap() member.
+template<> inline
+void swap( mems::GappedAlignment& lhs, mems::GappedAlignment& rhs )
+{
+	lhs.swap( rhs );
+}
+}
+
+
+#endif // __GappedAlignment_h__
+
diff --git a/libMems/GreedyBreakpointElimination.cpp b/libMems/GreedyBreakpointElimination.cpp
new file mode 100644
index 0000000..3e30e59
--- /dev/null
+++ b/libMems/GreedyBreakpointElimination.cpp
@@ -0,0 +1,994 @@
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+
+#include "libMems/GreedyBreakpointElimination.h"
+#include "libMems/ProgressiveAligner.h"
+#include "libMems/Aligner.h"
+#include "libMems/Islands.h"
+#include "libMems/DNAFileSML.h"
+#include "libMems/MuscleInterface.h" // it's the default gapped aligner
+#include "libGenome/gnRAWSource.h"
+#include "libMems/gnAlignedSequences.h"
+#include "libMems/CompactGappedAlignment.h"
+#include "libMems/MatchProjectionAdapter.h"
+#include "libMems/PairwiseMatchFinder.h"
+#include "libMems/TreeUtilities.h"
+#include "libMems/PairwiseMatchAdapter.h"
+
+#include <boost/dynamic_bitset.hpp>
+#include <boost/tuple/tuple.hpp>
+
+#include <map>
+#include <fstream> // for debugging
+#include <sstream>
+#include <stack>
+#include <algorithm>
+#include <limits>
+#include <iomanip>
+
+using namespace std;
+using namespace genome;
+
+namespace mems {
+// working in mems
+
+bool penalize_repeats = false;
+
+void printProgress( uint prev_prog, uint cur_prog, ostream& os )
+{
+	// Writes "<cur_prog>%.." to os whenever the value has changed, starting
+	// a new line first each time the tens digit changes.
+	if( prev_prog == cur_prog )
+		return;
+
+	const bool new_decade = ( cur_prog / 10 ) != ( prev_prog / 10 );
+	if( new_decade )
+		os << endl;
+	os << cur_prog << "%..";
+	os.flush();
+}
+
+
+
+
+/**
+ * Builds the pairwise LCBs for the node pair (nI, nJ).
+ * Each tracking match defined in both nI and nJ is projected onto that pair;
+ * the projections are grouped into LCBs, each LCB is scored by summing
+ * tm_score_array over its matches, and the results are stored in t_lcbs.
+ * @param nI, nJ  indices passed to node_match->LeftEnd() and to PairwiseMatchAdapter
+ * @param dI, dJ  second and third indices used with tm_score_array and tm_lcb_id_array
+ * @param tracking_matches  the matches to project
+ * @param t_lcbs  output, resized to one entry per LCB adjacency
+ * @param tm_score_array  per-match scores, indexed [match_id][dI][dJ]
+ * @param tm_lcb_id_array  output, receives the LCB index of every match placed in an LCB
+ */
+void getPairwiseLCBs(
+ uint nI,
+ uint nJ,
+ uint dI,
+ uint dJ,
+ vector< TrackingMatch* >& tracking_matches,
+ vector< TrackingLCB<TrackingMatch*> >& t_lcbs,
+ boost::multi_array< double, 3 >& tm_score_array,
+ boost::multi_array< size_t, 3 >& tm_lcb_id_array )
+{
+ // make a set of projection matches
+ vector< AbstractMatch* > pair_matches;
+ for( size_t mI = 0; mI < tracking_matches.size(); ++mI )
+ {
+ // matches undefined in either node are left out
+ if( tracking_matches[mI]->node_match->LeftEnd(nI) == NO_MATCH ||
+ tracking_matches[mI]->node_match->LeftEnd(nJ) == NO_MATCH )
+ continue;
+ PairwiseMatchAdapter pma(tracking_matches[mI]->node_match, nI, nJ );
+ // remember which tracking match this projection came from
+ pma.tm = tracking_matches[mI];
+ // normalize so the first component is never in reverse orientation
+ if( pma.Orientation(0) == AbstractMatch::reverse )
+ pma.Invert();
+ // store a copy made with Copy(); it is released with Free() at the end
+ pair_matches.push_back(pma.Copy());
+ }
+ // find LCBs...
+ vector< gnSeqI > breakpoints;
+ IdentifyBreakpoints( pair_matches, breakpoints );
+
+ vector< vector< AbstractMatch* > > LCB_list;
+ ComputeLCBs_v2( pair_matches, breakpoints, LCB_list );
+
+ //
+ // now compute scores on them
+ //
+ vector< double > lcb_scores(LCB_list.size());
+ for( size_t lcbI = 0; lcbI < LCB_list.size(); ++lcbI )
+ {
+ double lcb_score = 0;
+ for( size_t mI = 0; mI < LCB_list[lcbI].size(); ++mI )
+ {
+ // the cast assumes LCB_list only holds the adapters created above
+ PairwiseMatchAdapter* pma = (PairwiseMatchAdapter*)LCB_list[lcbI][mI];
+ lcb_score += tm_score_array[pma->tm->match_id][dI][dJ];
+ }
+ lcb_scores[lcbI] = lcb_score;
+ }
+
+ // and build the pairwise adjacency list
+ vector< LCB > adjacencies;
+ computeLCBAdjacencies_v3( LCB_list, lcb_scores, adjacencies );
+
+ t_lcbs.resize(adjacencies.size());
+ for( size_t lcbI = 0; lcbI < adjacencies.size(); ++lcbI )
+ {
+ t_lcbs[lcbI] = adjacencies[lcbI];
+ // NOTE(review): indexing LCB_list by the adjacency index assumes both lists have the same order and size -- confirm
+ t_lcbs[lcbI].matches.resize(LCB_list[lcbI].size());
+ for( size_t mI = 0; mI < LCB_list[lcbI].size(); ++mI )
+ t_lcbs[lcbI].matches[mI] = ((PairwiseMatchAdapter*)LCB_list[lcbI][mI])->tm;
+ // sort them by ptr
+ sort( t_lcbs[lcbI].matches.begin(), t_lcbs[lcbI].matches.end() );
+
+ // set the match LCB ids appropriately
+ for( size_t mI = 0; mI < t_lcbs[lcbI].matches.size(); ++mI )
+ tm_lcb_id_array[t_lcbs[lcbI].matches[mI]->match_id][dI][dJ] = lcbI;
+ }
+
+ // free the memory used by pairwise matches
+ for( size_t mI = 0; mI < pair_matches.size(); ++mI )
+ pair_matches[mI]->Free();
+}
+
+/**
+ * Sizes tm_lcb_id_array to [tracking_matches.size()][n1_count][n2_count]
+ * and sets every element to LCB_UNASSIGNED.
+ */
+void initTrackingMatchLCBTracking(
+	const std::vector< TrackingMatch >& tracking_matches,
+	size_t n1_count,
+	size_t n2_count,
+	boost::multi_array< size_t, 3 >& tm_lcb_id_array )
+{
+	tm_lcb_id_array.resize( boost::extents[tracking_matches.size()][n1_count][n2_count] );
+
+	// every element gets the same value, so fill the array's storage in one pass
+	const size_t unassigned = LCB_UNASSIGNED;
+	std::fill( tm_lcb_id_array.data(),
+		tm_lcb_id_array.data() + tm_lcb_id_array.num_elements(),
+		unassigned );
+}
+
+
+/** removes an LCB from an LCB list and coalesces surrounding LCBs. Returns the number of LCBs removed
+ * After LCBs are removed, the adjacency list should be processed with filterLCBs()
+ * @param lcbI Index into adjacencies of the LCB to remove
+ * @param seq_count Number of sequences; bounds every per-sequence loop below
+ * @param adjacencies The LCB adjacency list, modified in place
+ * @param scores Per-LCB scores; a coalesced LCB's score is added to the LCB that absorbs it
+ * @param id_remaps This is populated with a list of LCB ids that were deleted or coalesced and now have a new LCB id
+ * for each coalesced LCB, an entry of the form <old id, new id> is added, deleted LCBs have
+ * entries of the form <deleted, -1>. Entries appear in the order operations were performed
+ * and the function undoLcbRemoval() can undo these operations in reverse order
+ * @param impact_list Replaced with a sorted, duplicate-free list of LCBs adjacent to the removed LCB or to its neighbors
+ * @return The removed LCB plus the number of coalescing operations performed
+ */
+template< class LcbVector >
+uint RemoveLCBandCoalesce( size_t lcbI, uint seq_count, LcbVector& adjacencies, std::vector< double >& scores, std::vector< std::pair< uint, uint > >& id_remaps, std::vector< uint >& impact_list )
+{
+ uint removed_count = 0;
+ // room for the worst case: per sequence, 2 direct neighbors plus up to 4 second-order neighbors per sequence
+ vector< uint > imp_tmp(seq_count * (2 + seq_count * 4), LCB_UNASSIGNED);
+ swap(impact_list, imp_tmp);
+ size_t impactI = 0;
+ id_remaps.clear();
+
+ // after this the entry's lcb_id no longer equals its own index
+ adjacencies[ lcbI ].lcb_id = -2;
+
+ // update adjacencies
+ // NOTE: the adjacency values are unsigned, so the comparisons with -1 below
+ // test for the all-ones value
+ uint seqI;
+ uint left_adj;
+ uint right_adj;
+ for( seqI = 0; seqI < seq_count; seqI++ ){
+ left_adj = adjacencies[ lcbI ].left_adjacency[ seqI ];
+ right_adj = adjacencies[ lcbI ].right_adjacency[ seqI ];
+ if( left_adj != -1 )
+ adjacencies[ left_adj ].right_adjacency[ seqI ] = right_adj;
+ if( right_adj != -1 && right_adj != adjacencies.size() )
+ adjacencies[ right_adj ].left_adjacency[ seqI ] = left_adj;
+ }
+
+ // populate the impact list -- LCBs whose removal scores may change due to this one's removal
+ // NOTE(review): unlike the loop above, right_adj is not compared with adjacencies.size() before being used as an index here -- confirm that value cannot occur
+ for( seqI = 0; seqI < seq_count; seqI++ ){
+ left_adj = adjacencies[ lcbI ].left_adjacency[ seqI ];
+ right_adj = adjacencies[ lcbI ].right_adjacency[ seqI ];
+ impact_list[impactI++] = left_adj;
+ impact_list[impactI++] = right_adj;
+ for( uint seqJ = 0; seqJ < seq_count; seqJ++ ){
+ if( left_adj != -1 )
+ {
+ impact_list[impactI++] = adjacencies[ left_adj ].left_adjacency[ seqJ ];
+ impact_list[impactI++] = adjacencies[ left_adj ].right_adjacency[ seqJ ];
+ }
+ if( right_adj != -1 )
+ {
+ impact_list[impactI++] = adjacencies[ right_adj ].left_adjacency[ seqJ ];
+ impact_list[impactI++] = adjacencies[ right_adj ].right_adjacency[ seqJ ];
+ }
+ }
+ }
+
+ // just deleted an lcb...
+ id_remaps.push_back( make_pair( lcbI, -1 ) );
+ removed_count++;
+
+ // check for collapse
+ for( seqI = 0; seqI < seq_count; seqI++ ){
+ left_adj = adjacencies[ lcbI ].left_adjacency[ seqI ];
+ right_adj = adjacencies[ lcbI ].right_adjacency[ seqI ];
+ // find the real slim shady
+ // i.e. skip past entries whose lcb_id no longer equals their own index
+ while( left_adj != -1 && adjacencies[ left_adj ].lcb_id != left_adj )
+ left_adj = adjacencies[ left_adj ].left_adjacency[ seqI ];
+ while( right_adj != -1 && adjacencies[ right_adj ].lcb_id != right_adj )
+ right_adj = adjacencies[ right_adj ].right_adjacency[ seqI ];
+ if( left_adj == -1 || right_adj == -1 )
+ continue; // can't collapse with a non-existant LCB!
+ // NOTE: the else below pairs with the inner if( seqI > 0 )
+ if( adjacencies[ left_adj ].lcb_id != left_adj ||
+ adjacencies[ right_adj ].lcb_id != right_adj )
+ if( seqI > 0 )
+ continue; // already coalesced
+ else
+ cerr << "trouble on down street\n";
+
+ // check whether the two LCBs are adjacent in each sequence
+ boolean orientation = adjacencies[ left_adj ].left_end[ seqI ] > 0 ? true : false;
+ uint seqJ;
+ for( seqJ = 0; seqJ < seq_count; seqJ++ ){
+ boolean j_orientation = adjacencies[ left_adj ].left_end[ seqJ ] > 0;
+ if( j_orientation == orientation &&
+ adjacencies[ left_adj ].right_adjacency[ seqJ ] != right_adj )
+ break;
+ if( j_orientation != orientation &&
+ adjacencies[ left_adj ].left_adjacency[ seqJ ] != right_adj )
+ break;
+ // check that they are both in the same orientation
+ if( adjacencies[ right_adj ].left_end[ seqJ ] > 0 != j_orientation )
+ break;
+ }
+
+ // seqJ == seq_count only when the loop above ran to completion without a break
+ if( seqJ != seq_count ||
+ adjacencies[ left_adj ].to_be_deleted ||
+ adjacencies[ right_adj ].to_be_deleted )
+ continue; // if these two aren't collinear, or one or both will get deleted, then don't coalesce
+
+
+ // these two can be coalesced
+ // do it. do it now.
+ id_remaps.push_back( make_pair( adjacencies[ right_adj ].lcb_id, left_adj ) );
+ adjacencies[ right_adj ].lcb_id = left_adj;
+ scores[ left_adj ] += scores[ right_adj ];
+ adjacencies[ left_adj ].weight += adjacencies[ right_adj ].weight;
+
+ // unlink right_adj from the adjacency list and
+ // update left and right ends of left_adj
+ for( seqJ = 0; seqJ < seq_count; seqJ++ ){
+ boolean j_orientation = adjacencies[ left_adj ].left_end[ seqJ ] > 0;
+ uint rr_adj = adjacencies[ right_adj ].right_adjacency[ seqJ ];
+ uint rl_adj = adjacencies[ right_adj ].left_adjacency[ seqJ ];
+ if( j_orientation == orientation ){
+ adjacencies[ left_adj ].right_end[ seqJ ] = adjacencies[ right_adj ].right_end[ seqJ ];
+ adjacencies[ left_adj ].right_adjacency[ seqJ ] = rr_adj;
+ if( rr_adj != -1 )
+ adjacencies[ rr_adj ].left_adjacency[ seqJ ] = left_adj;
+ }else{
+ adjacencies[ left_adj ].left_end[ seqJ ] = adjacencies[ right_adj ].left_end[ seqJ ];
+ adjacencies[ left_adj ].left_adjacency[ seqJ ] = rl_adj;
+ if( rl_adj != -1 )
+ adjacencies[ rl_adj ].right_adjacency[ seqJ ] = left_adj;
+ }
+ }
+ // just coalesced two LCBs...
+ removed_count++;
+ }
+ // uniquify the impact list and get rid of empty entries
+ // everything from the first LCB_UNASSIGNED entry onward is dropped, along
+ // with the leftovers std::unique() leaves behind imp_end
+ std::sort( impact_list.begin(), impact_list.end() );
+ vector< uint >::iterator imp_end = std::unique( impact_list.begin(), impact_list.end() );
+ vector< uint >::iterator imp_preend = std::lower_bound( impact_list.begin(), imp_end, LCB_UNASSIGNED );
+ impact_list.erase( imp_preend, impact_list.end() );
+
+ return removed_count;
+}
+
+
+ /**
+  * Reverts the deletions and coalescings recorded in id_remaps, newest entry first.
+  *
+  * Each entry is (lcb, target): a target of (uint)-1 marks a deleted LCB, any other
+  * value marks an LCB that was coalesced into `target`.
+  *
+  * @param seq_count  number of sequences whose adjacency links are re-wired
+  * @param adjs       the LCB adjacency records, indexed by LCB id
+  * @param id_remaps  the log of removals to undo (read only, not cleared here)
+  *
+  * Only adjacency links, lcb_id, to_be_deleted and weight are restored; the
+  * left_end / right_end coordinates of a formerly coalesced pair are not.
+  */
+ template< class LcbVector >
+ void undoLcbRemoval( uint seq_count, LcbVector& adjs, std::vector< std::pair< uint, uint > >& id_remaps )
+ {
+     typedef std::vector< std::pair< uint, uint > >::reverse_iterator RemapRevIter;
+     const uint NO_LCB = (uint)-1;
+
+     for( RemapRevIter remap = id_remaps.rbegin(); remap != id_remaps.rend(); ++remap )
+     {
+         const uint restored = remap->first;
+         const uint target = remap->second;
+
+         if( target != NO_LCB )
+         {
+             // `restored` had been merged into `target`: split them apart again
+             adjs[restored].lcb_id = restored;
+             adjs[target].weight -= adjs[restored].weight;
+             // TODO: fix right end and left end coordinates
+             for( uint s = 0; s != seq_count; ++s )
+             {
+                 const uint left_nbr = adjs[restored].left_adjacency[s];
+                 const uint right_nbr = adjs[restored].right_adjacency[s];
+                 if( left_nbr == target )
+                 {
+                     // restored sits to the right of target in this sequence
+                     adjs[target].right_adjacency[s] = restored;
+                     if( right_nbr != NO_LCB && right_nbr != adjs.size() )
+                         adjs[right_nbr].left_adjacency[s] = restored;
+                 }
+                 else if( right_nbr == target )
+                 {
+                     // restored sits to the left of target in this sequence
+                     adjs[target].left_adjacency[s] = restored;
+                     if( left_nbr != NO_LCB && left_nbr != adjs.size() )
+                         adjs[left_nbr].right_adjacency[s] = restored;
+                 }
+             }
+             continue;
+         }
+
+         // `restored` had been deleted: point its neighbours back at it
+         for( uint s = 0; s != seq_count; ++s )
+         {
+             const uint left_nbr = adjs[restored].left_adjacency[s];
+             const uint right_nbr = adjs[restored].right_adjacency[s];
+             if( left_nbr != NO_LCB )
+                 adjs[left_nbr].right_adjacency[s] = restored;
+             if( right_nbr != NO_LCB && right_nbr != adjs.size() )
+                 adjs[right_nbr].left_adjacency[s] = restored;
+         }
+         adjs[restored].lcb_id = restored;        // the LCB owns its id again
+         adjs[restored].to_be_deleted = false;    // and is no longer scheduled for deletion
+     }
+ }
+
+ /**
+  * Builds a scorer over the given pairwise LCB adjacency matrix.
+  *
+  * Sorts the tracking matches, sizes every per-pair bookkeeping array to the
+  * shape of the adjacency matrix, and seeds the per-pair LCB counts and score
+  * totals from the adjacency records.  The [seqI_begin,seqI_end) x
+  * [seqJ_begin,seqJ_end) window limits which pairs later take part in scoring.
+  */
+ EvenFasterSumOfPairsBreakpointScorer::EvenFasterSumOfPairsBreakpointScorer(
+     double breakpoint_penalty,
+     double minimum_breakpoint_penalty,
+     boost::multi_array<double,2> bp_weight_matrix,
+     boost::multi_array<double,2> conservation_weight_matrix,
+     vector< TrackingMatch* > tracking_match,
+     PairwiseLCBMatrix& pairwise_adjacency_matrix,
+     vector<node_id_t>& n1_descendants,
+     vector<node_id_t>& n2_descendants,
+     boost::multi_array< double, 3 >& tm_score_array,
+     boost::multi_array< size_t, 3 >& tm_lcb_id_array,
+     size_t seqI_begin,
+     size_t seqI_end,
+     size_t seqJ_begin,
+     size_t seqJ_end
+     ) :
+     bp_penalty( breakpoint_penalty ),
+     min_breakpoint_penalty( minimum_breakpoint_penalty ),
+     bp_weights( bp_weight_matrix ),
+     conservation_weights( conservation_weight_matrix ),
+     tracking_matches( tracking_match ),
+     pairwise_adjacencies( pairwise_adjacency_matrix ),
+     n1_des(n1_descendants),
+     n2_des(n2_descendants),
+     tm_score_array(tm_score_array),
+     tm_lcb_id_array(tm_lcb_id_array),
+     seqI_count(pairwise_adjacencies.shape()[0]),
+     seqJ_count(pairwise_adjacencies.shape()[1]),
+     seqI_first(seqI_begin),
+     seqI_last(seqI_end),
+     seqJ_first(seqJ_begin),
+     seqJ_last(seqJ_end),
+     first_time(true)
+ {
+     // getResults() and validate() use sorted-range algorithms on this vector
+     std::sort(tracking_matches.begin(), tracking_matches.end());
+
+     // every per-pair table has the shape of the adjacency matrix
+     const size_t rows = pairwise_adjacencies.shape()[0];
+     const size_t cols = pairwise_adjacencies.shape()[1];
+     pairwise_lcb_count.resize( boost::extents[rows][cols] );
+     pairwise_lcb_score.resize( boost::extents[rows][cols] );
+     all_id_remaps.resize( boost::extents[rows][cols] );
+     full_impact_list.resize( boost::extents[rows][cols] );
+
+     my_del_lcbs.resize(100);    // buffer for use during lcb removal score computation
+
+     // three scratch layers, selected through using_lsd
+     for( size_t layer = 0; layer < 3; ++layer )
+     {
+         internal_lcb_score_diff[layer].resize( boost::extents[rows][cols] );
+         internal_lcb_removed_count[layer].resize( boost::extents[rows][cols] );
+     }
+     // all-zero templates used to reset a scratch layer with a single copy
+     lsd_zeros.resize( internal_lcb_score_diff[0].num_elements(), 0 );
+     lrc_zeros.resize( internal_lcb_removed_count[0].num_elements(), 0 );
+     using_lsd = -1;
+
+     // seed counts and score totals; remember the largest per-pair LCB list
+     size_t largest_pair = 0;
+     for( size_t i = 0; i < seqI_count; ++i )
+     {
+         for( size_t j = 0; j < seqJ_count; ++j )
+         {
+             const size_t lcb_total = pairwise_adjacencies[i][j].size();
+             pairwise_lcb_count[i][j] = lcb_total;
+             if( largest_pair < lcb_total )
+                 largest_pair = lcb_total;
+             pairwise_lcb_score[i][j] = 0;
+             for( size_t lcbI = 0; lcbI < lcb_total; ++lcbI )
+                 pairwise_lcb_score[i][j] += pairwise_adjacencies[i][j][lcbI].weight;
+         }
+     }
+     bogus_scores.resize(largest_pair+10);
+ }
+
+
+/**
+ * Returns the number of possible moves a search algorithm may make from the current
+ * location in LCB search space. In this case it's simply the total number of pairwise LCBs
+ */
+size_t EvenFasterSumOfPairsBreakpointScorer::getMoveCount()
+{
+ size_t move_count = 0;
+ for( size_t i = seqI_first; i < seqI_last; ++i )
+ for( size_t j = seqJ_first; j < seqJ_last; ++j )
+ move_count += pairwise_adjacencies[i][j].size();
+ return move_count;
+}
+
+ /**
+  * Returns the score of the current state: the sum of all pairwise LCB scores
+  * minus the sum of all pairwise breakpoint penalties, over the pairs in the
+  * scorer's window.  The very first scaled penalty computed is echoed to cout.
+  */
+ double EvenFasterSumOfPairsBreakpointScorer::score()
+ {
+     double total = 0;
+     for( size_t seqI = seqI_first; seqI < seqI_last; ++seqI )
+     {
+         for( size_t seqJ = seqJ_first; seqJ < seqJ_last; ++seqJ )
+         {
+             const double pw_lcb_score = pairwise_lcb_score[seqI][seqJ];
+
+             // reward: the LCB score mass of this pair
+             total += pairwise_lcb_score[seqI][seqJ];
+
+             // penalty: scaled by (1-conservation)^4 * (1-bp_weight)^2, floored at the minimum
+             double cweights = 1 - conservation_weights[seqI][seqJ];
+             double bweights = 1 - bp_weights[seqI][seqJ];
+             double penalty = max( bp_penalty * cweights * cweights * cweights * cweights * bweights * bweights, min_breakpoint_penalty );
+             if(first_time)
+             {
+                 cout << "Scoring with scaled breakpoint penalty: " << penalty << endl;
+             }
+             first_time = false;
+
+             // subtract 1 from number of LCBs so that a single circular LCB doesn't get penalized
+             // NOTE(review): if the count is an unsigned type and reaches 0 this wraps around -- confirm
+             total -= ( penalty * (pairwise_lcb_count[seqI][seqJ]-1));
+
+             // written as a negated range test so that NaN is also caught
+             const bool plausible = total > -1e200 && total < 1e200;
+             if( !plausible )
+             {
+                 genome::breakHere();
+                 cerr << "bp_weights[seqI][seqJ] " << bp_weights[seqI][seqJ] << endl;
+                 cerr << "conservation_weights[seqI][seqJ] " << conservation_weights[seqI][seqJ] << endl;
+                 cerr << "pairwise_lcb_count[seqI][seqJ] " << pairwise_lcb_count[seqI][seqJ] << endl;
+                 cerr << "pairwise_lcb_score[seqI][seqJ] " << pw_lcb_score << endl;
+                 cerr << "Invalid score!!\n";
+             }
+         }
+     }
+     return total;
+ }
+
+ /**
+  * Scores a move: returns the total score the state would have after removing
+  * the LCB named by the_move.second.  The removal is simulated on a fresh
+  * scratch layer and rolled back before returning.
+  */
+ double EvenFasterSumOfPairsBreakpointScorer::operator()( pair< double, size_t >& the_move )
+ {
+     vector< pair< double, size_t > > unused_moves;
+     size_t unused_move_count;
+
+     // claim the next scratch layer and zero it
+     ++using_lsd;
+     boost::multi_array< double, 2 >& score_diff = internal_lcb_score_diff[using_lsd];
+     boost::multi_array< size_t, 2 >& removed = internal_lcb_removed_count[using_lsd];
+     std::copy(lsd_zeros.begin(),lsd_zeros.end(),score_diff.data());
+     std::copy(lrc_zeros.begin(),lrc_zeros.end(),removed.data());
+
+     // simulate the removal, read the resulting score, then roll the totals back
+     remove( the_move, false, score_diff, removed, false, unused_moves, unused_move_count );
+     applyScoreDifference( score_diff, removed );
+     const double simulated = score();
+     undoScoreDifference( score_diff, removed );
+
+     --using_lsd;
+     return simulated;
+ }
+
+ /**
+  * Reports whether the_move is still valid, i.e. whether a simulated
+  * (non-committing) removal of its LCB succeeds and reproduces the score
+  * change stored in the_move.first.
+  */
+ bool EvenFasterSumOfPairsBreakpointScorer::isValid( pair< double, size_t >& the_move )
+ {
+     // claim the next scratch layer and zero it
+     ++using_lsd;
+     boost::multi_array< double, 2 >& score_diff = internal_lcb_score_diff[using_lsd];
+     boost::multi_array< size_t, 2 >& removed = internal_lcb_removed_count[using_lsd];
+     std::copy(lsd_zeros.begin(),lsd_zeros.end(),score_diff.data());
+     std::copy(lrc_zeros.begin(),lrc_zeros.end(),removed.data());
+
+     vector< pair< double, size_t > > unused_moves;
+     size_t unused_move_count;
+     const bool still_valid = remove( the_move, false, score_diff, removed, false, unused_moves, unused_move_count );
+
+     --using_lsd;
+     return still_valid;
+ }
+
+ /**
+  * Commits the removal of the LCB named by the_move and collects rescored
+  * follow-up moves.
+  *
+  * @param the_move        (expected score change, move index)
+  * @param new_move_list   receives the rescored moves, written by index
+  * @param new_move_count  receives the number of entries written
+  * @return false if the move was invalid; the running totals are only updated on success
+  */
+ bool EvenFasterSumOfPairsBreakpointScorer::remove( pair< double, size_t >& the_move, vector< pair< double, size_t > >& new_move_list, size_t& new_move_count )
+ {
+     // claim the next scratch layer and zero it
+     ++using_lsd;
+     boost::multi_array< double, 2 >& score_diff = internal_lcb_score_diff[using_lsd];
+     boost::multi_array< size_t, 2 >& removed = internal_lcb_removed_count[using_lsd];
+     std::copy(lsd_zeros.begin(),lsd_zeros.end(),score_diff.data());
+     std::copy(lrc_zeros.begin(),lrc_zeros.end(),removed.data());
+
+     const bool committed = remove( the_move, true, score_diff, removed, true, new_move_list, new_move_count );
+     if( committed )
+     {
+         // the worker rolls its totals back before returning; make them permanent here
+         applyScoreDifference( score_diff, removed );
+     }
+
+     --using_lsd;
+     return committed;
+ }
+
+ /**
+  * Subtracts the per-pair removed-LCB counts and score differences from the
+  * running per-pair totals.  Both arrays are walked in flat storage order.
+  * A score difference or resulting total outside (-1e200, 1e200), NaN
+  * included, triggers genome::breakHere() and an error message but is
+  * otherwise not corrected.
+  */
+ void EvenFasterSumOfPairsBreakpointScorer::applyScoreDifference( boost::multi_array< double, 2 >& lcb_score_diff, boost::multi_array< size_t, 2 >& lcb_removed_count )
+ {
+     const double* score_delta = lcb_score_diff.data();
+     const size_t* count_delta = lcb_removed_count.data();
+     const size_t pair_total = pairwise_lcb_count.num_elements();
+
+     for( size_t k = 0; k != pair_total; ++k )
+     {
+         const bool delta_ok = score_delta[k] > -1e200 && score_delta[k] < 1e200;
+         if( !delta_ok )
+         {
+             genome::breakHere();
+             cerr << "Invalid score!!\n";
+         }
+
+         pairwise_lcb_count.data()[k] -= count_delta[k];
+         pairwise_lcb_score.data()[k] -= score_delta[k];
+
+         const bool total_ok = pairwise_lcb_score.data()[k] > -1e200 && pairwise_lcb_score.data()[k] < 1e200;
+         if( !total_ok )
+         {
+             genome::breakHere();
+             cerr << "Invalid score!!\n";
+         }
+     }
+ }
+
+ /**
+  * Inverse of applyScoreDifference(): adds the per-pair removed-LCB counts and
+  * score differences back onto the running per-pair totals.  Performs the same
+  * (-1e200, 1e200) sanity checks on the difference and on the resulting total.
+  */
+ void EvenFasterSumOfPairsBreakpointScorer::undoScoreDifference( boost::multi_array< double, 2 >& lcb_score_diff, boost::multi_array< size_t, 2 >& lcb_removed_count )
+ {
+     const double* score_delta = lcb_score_diff.data();
+     const size_t* count_delta = lcb_removed_count.data();
+     const size_t pair_total = pairwise_lcb_count.num_elements();
+
+     for( size_t k = 0; k != pair_total; ++k )
+     {
+         const bool delta_ok = score_delta[k] > -1e200 && score_delta[k] < 1e200;
+         if( !delta_ok )
+         {
+             genome::breakHere();
+             cerr << "Invalid score!!\n";
+         }
+
+         pairwise_lcb_count.data()[k] += count_delta[k];
+         pairwise_lcb_score.data()[k] += score_delta[k];
+
+         const bool total_ok = pairwise_lcb_score.data()[k] > -1e200 && pairwise_lcb_score.data()[k] < 1e200;
+         if( !total_ok )
+         {
+             genome::breakHere();
+             cerr << "Invalid score!!\n";
+         }
+     }
+ }
+
+ /**
+  * Returns the size estimate for a new-move buffer: 20 entries for every
+  * (seqI, seqJ) cell of the adjacency matrix.
+  */
+ size_t EvenFasterSumOfPairsBreakpointScorer::getMaxNewMoveCount()
+ {
+     const size_t moves_per_row = 20 * seqI_count;
+     return moves_per_row * seqJ_count;
+ }
+
+ /**
+  * Removes (or simulates removing) the pairwise LCB named by the_move.second.
+  *
+  * The move index counts LCB records pair by pair over the scorer's window.  All
+  * matches of the chosen LCB are withdrawn from every sequence pair; LCBs that
+  * lose all their matches are deleted and their neighbours coalesced.
+  *
+  * @param the_move           (expected score change, move index)
+  * @param really_remove      true to commit; false to simulate and undo the adjacency changes
+  * @param lcb_score_diff     receives, per pair, the LCB score removed
+  * @param lcb_removed_count  receives, per pair, the number of LCBs removed
+  * @param score_new_moves    true to rescore every impacted LCB (requires really_remove)
+  * @param new_move_list      receives the rescored moves, written by index; grown if too small
+  * @param new_move_count     receives the number of entries written to new_move_list
+  * @return false if the move index is out of range, the LCB is no longer live, or the
+  *         observed score change differs from the_move.first by more than 1e-5
+  *
+  * The running per-pair totals are left unchanged on return; committing callers
+  * re-apply lcb_score_diff / lcb_removed_count themselves.
+  */
+ bool EvenFasterSumOfPairsBreakpointScorer::remove( pair< double, size_t >& the_move, bool really_remove, boost::multi_array< double, 2 >& lcb_score_diff, boost::multi_array< size_t, 2 >& lcb_removed_count, bool score_new_moves, vector< pair< double, size_t > >& new_move_list, size_t& new_move_count )
+ {
+     if( score_new_moves && !really_remove )
+     {
+         cerr << "Error: Incompatible options in the breakpoint scorer!!!\n";
+         throw "oh shit!";
+     }
+     new_move_count = 0;
+     // figure out which lcb we're being asked to delete
+     size_t moveI = the_move.second;
+     size_t move_count = 0;
+     size_t move_base = 0;
+     size_t seqI = 0;
+     size_t seqJ = 0;
+     for( seqI = seqI_first; seqI < seqI_last; ++seqI )
+     {
+         for( seqJ = seqJ_first; seqJ < seqJ_last; ++seqJ )
+         {
+             all_id_remaps[seqI][seqJ].clear();
+             full_impact_list[seqI][seqJ].clear();
+         }
+     }
+
+     for( seqI = seqI_first; seqI < seqI_last; ++seqI )
+     {
+         for( seqJ = seqJ_first; seqJ < seqJ_last; ++seqJ )
+         {
+             move_count += pairwise_adjacencies[seqI][seqJ].size();
+             if( move_count > moveI )
+                 break;
+             move_base = move_count;
+         }
+         if( move_count > moveI )
+             break;
+     }
+     // a move index past the last LCB leaves seqI/seqJ at their end values;
+     // reject it instead of indexing outside the adjacency matrix
+     if( move_count <= moveI )
+         return false;
+
+     // score deletion of the LCB at (moveI - move_base) from the pairwise alignment of seqI and seqJ
+     size_t del_lcb = moveI - move_base;
+     if( pairwise_adjacencies[seqI][seqJ][del_lcb].lcb_id != del_lcb )
+     {
+         if( really_remove && pairwise_adjacencies[seqI][seqJ][del_lcb].lcb_id == LCB_UNASSIGNED )
+             cerr << "bad movement, dirty dancing\n";
+         return false;    // this is an invalid move -- already deleted or coalesced with another
+     }
+
+     vector< TrackingMatch* > matches(pairwise_adjacencies[seqI][seqJ][del_lcb].matches);
+     double cur_score = score();
+
+     if( really_remove )
+     {
+         deleted_tracking_matches.insert( deleted_tracking_matches.end(), matches.begin(), matches.end() );
+     }
+
+     for( size_t i = seqI_first; i < seqI_last; ++i )
+     {
+         for( size_t j = seqJ_first; j < seqJ_last; ++j )
+         {
+             lcb_score_diff[i][j] = 0;
+             vector< TrackingLCB< TrackingMatch* > >& adjs = pairwise_adjacencies[i][j];
+             // create a list of LCBs affected by deletion of this match
+             // check whether any of them will have all of their matches removed
+             if( lcb_ids.size() < matches.size() )
+                 lcb_ids.resize( matches.size() + 100 );
+             for( size_t mI = 0; mI < matches.size(); ++mI )
+                 lcb_ids[mI] = tm_lcb_id_array[matches[mI]->match_id][i][j];
+             size_t lcb_id_count = matches.size();
+             std::sort(lcb_ids.begin(), lcb_ids.begin()+lcb_id_count);
+             vector< size_t >::iterator last = std::unique(lcb_ids.begin(), lcb_ids.begin()+lcb_id_count);
+             lcb_id_count = last - lcb_ids.begin();
+             // delete the last one if its unassigned
+             // (guarded: with no matches at all there is no last element to inspect)
+             if( lcb_id_count > 0 && lcb_ids[lcb_id_count-1] == LCB_UNASSIGNED )
+                 lcb_id_count--;
+
+             vector< pair< size_t, vector< TrackingMatch* > > > aff_lcbs(lcb_id_count);
+             for( size_t lI = 0; lI < lcb_id_count; ++lI )
+                 aff_lcbs[lI].first = lcb_ids[lI];
+
+             // organize the deleted matches
+             for( size_t mI = 0; mI < matches.size(); ++mI )
+             {
+                 size_t id = tm_lcb_id_array[matches[mI]->match_id][i][j];
+                 if( id == LCB_UNASSIGNED )
+                     continue;
+                 vector< pair< size_t, vector< TrackingMatch* > > >::iterator iter = std::lower_bound( aff_lcbs.begin(), aff_lcbs.end(), make_pair(id,vector< TrackingMatch* >() ) );
+                 iter->second.push_back( matches[mI] );
+             }
+
+             // actually delete the matches and keep a list of LCBs that get completely deleted
+             size_t my_del_count = 0;
+             for( size_t lI = 0; lI < aff_lcbs.size(); ++lI )
+             {
+                 vector< TrackingMatch* >& cur_matches = adjs[lcb_ids[lI]].matches;
+                 size_t diff = cur_matches.size() - aff_lcbs[lI].second.size();
+                 if( diff == 0 )
+                 {
+                     // every match of this LCB goes away: schedule the LCB itself for deletion
+                     if( my_del_count + 1 >= my_del_lcbs.size() )
+                         my_del_lcbs.resize(2*my_del_lcbs.size());
+                     my_del_lcbs[my_del_count++] = lcb_ids[lI];
+                     adjs[lcb_ids[lI]].to_be_deleted = true;
+                     lcb_score_diff[i][j] += adjs[lcb_ids[lI]].weight;
+                     if( really_remove )
+                     {
+                         adjs[lcb_ids[lI]].weight = 0;
+                         cur_matches.clear();
+                     }
+                     continue;
+                 }
+
+                 // update the LCB score
+                 double del_score_sum = 0;
+                 for( size_t mI = 0; mI < aff_lcbs[lI].second.size(); ++mI )
+                     del_score_sum += tm_score_array[aff_lcbs[lI].second[mI]->match_id][i][j];
+                 lcb_score_diff[i][j] += del_score_sum;
+                 full_impact_list[i][j].push_back( aff_lcbs[lI].first );
+
+                 if( really_remove )
+                 {
+                     adjs[lcb_ids[lI]].weight -= del_score_sum;
+
+                     // remove the deleted matches
+                     vector< TrackingMatch* > dest( diff );
+                     std::set_difference( cur_matches.begin(), cur_matches.end(),
+                         aff_lcbs[lI].second.begin(), aff_lcbs[lI].second.end(), dest.begin() );
+                     swap( dest, cur_matches );
+                 }
+             }
+
+             lcb_removed_count[i][j] = 0;
+
+             // now remove each LCB that needs to be deleted
+             std::vector< std::pair< uint, uint > >& fid_remaps = all_id_remaps[i][j];
+             std::vector< uint >& fimp_list = full_impact_list[i][j];
+             for( size_t delI = 0; delI < my_del_count; ++delI )
+             {
+                 if( adjs[my_del_lcbs[delI]].lcb_id != my_del_lcbs[delI] )
+                     continue;    // skip this one if it's already been deleted
+
+                 std::vector< std::pair< uint, uint > > id_remaps;
+                 std::vector< uint > impact_list;
+                 uint removed_count = RemoveLCBandCoalesce( my_del_lcbs[delI], 2, adjs, bogus_scores, id_remaps, impact_list );
+                 fid_remaps.insert( fid_remaps.end(), id_remaps.begin(), id_remaps.end() );
+                 fimp_list.insert( fimp_list.end(), impact_list.begin(), impact_list.end() );
+
+                 lcb_removed_count[i][j] += removed_count;
+                 // only do this part if we're really deleting
+                 if( really_remove )
+                 {
+                     // move all matches to the new LCB
+                     for( size_t rI = 0; rI < id_remaps.size(); ++rI )
+                     {
+                         if( id_remaps[rI].second == -1 )
+                             continue;    // deletion
+                         vector< TrackingMatch* >& src_matches = adjs[id_remaps[rI].first].matches;
+                         vector< TrackingMatch* >& dest_matches = adjs[id_remaps[rI].second].matches;
+                         for( size_t mI = 0; mI < src_matches.size(); ++mI )
+                             tm_lcb_id_array[src_matches[mI]->match_id][i][j] = id_remaps[rI].second;
+                         dest_matches.insert( dest_matches.end(), src_matches.begin(), src_matches.end() );
+                         std::sort( dest_matches.begin(), dest_matches.end() );
+                         src_matches.clear();
+                     }
+                 }
+             }
+         }
+     }
+
+     // will be undone later
+     applyScoreDifference( lcb_score_diff, lcb_removed_count );
+     double new_score = score();
+
+     if( score_new_moves )
+     {
+         // Scoring a candidate below re-enters remove(), which clears and refills
+         // every full_impact_list entry.  Freeze the sorted, de-duplicated lists
+         // first so the loop walks what this removal produced, not what the
+         // nested simulations leave behind.
+         std::vector< std::vector< uint > > impact_snapshot;
+         for( size_t i = seqI_first; i < seqI_last; ++i )
+         {
+             for( size_t j = seqJ_first; j < seqJ_last; ++j )
+             {
+                 std::vector< uint >& fimp_list = full_impact_list[i][j];
+                 sort( fimp_list.begin(), fimp_list.end() );
+                 vector< uint >::iterator iter = std::unique( fimp_list.begin(), fimp_list.end() );
+                 fimp_list.erase( iter, fimp_list.end() );
+                 impact_snapshot.push_back( fimp_list );
+             }
+         }
+
+         size_t mbase = 0;
+         size_t pairI = 0;
+         for( size_t i = seqI_first; i < seqI_last; ++i )
+         {
+             for( size_t j = seqJ_first; j < seqJ_last; ++j )
+             {
+                 vector< TrackingLCB< TrackingMatch* > >& adjs = pairwise_adjacencies[i][j];
+                 const std::vector< uint >& fimp_list = impact_snapshot[pairI++];
+                 for( size_t fI = 0; fI < fimp_list.size(); fI++ )
+                 {
+                     // the list is written by index; grow it instead of writing past its end
+                     if( new_move_count >= new_move_list.size() )
+                         new_move_list.resize( 2 * new_move_list.size() + 1 );
+
+                     if( adjs[fimp_list[fI]].lcb_id != fimp_list[fI] )
+                     {
+                         new_move_list[new_move_count++] = make_pair( -(std::numeric_limits<double>::max)(), mbase + fimp_list[fI] );
+                         continue;    // this one got trashed
+                     }
+                     // score removal of this block
+                     pair< double, size_t > p( 0, mbase + fimp_list[fI] );
+                     double scorediff = (*this)(p) - new_score;
+                     p.first = scorediff;
+                     new_move_list[new_move_count++] = p;
+                 }
+                 mbase += adjs.size();
+             }
+         }
+     }
+
+
+     // if we're not really removing, undo all the removals
+     if( !really_remove )
+         for( size_t i = seqI_first; i < seqI_last; ++i )
+             for( size_t j = seqJ_first; j < seqJ_last; ++j )
+                 undoLcbRemoval( 2, pairwise_adjacencies[i][j], all_id_remaps[i][j] );
+
+     undoScoreDifference( lcb_score_diff, lcb_removed_count );
+
+     // if the change in score doesn't match then this is an invalid move!!
+     // allow for some numerical instability
+     bool valid = true;
+     if( new_score - cur_score < the_move.first - 0.00001 ||
+         new_score - cur_score > the_move.first + 0.00001 )
+         valid = false;
+
+     return valid;
+ }
+
+ /**
+  * Returns the tracking matches that survive all committed removals, i.e.
+  * tracking_matches minus deleted_tracking_matches, in sorted (pointer) order.
+  *
+  * Sorts deleted_tracking_matches in place as a side effect; tracking_matches
+  * is sorted once by the constructor.
+  */
+ vector< TrackingMatch* > EvenFasterSumOfPairsBreakpointScorer::getResults()
+ {
+     std::sort(deleted_tracking_matches.begin(), deleted_tracking_matches.end());
+
+     // Size for the worst case and trim to what set_difference actually wrote.
+     // Sizing to (tracking - deleted) underflows or is overrun whenever the
+     // deleted list holds a duplicate or a pointer that is not a tracking match.
+     vector< TrackingMatch* > result_matches( tracking_matches.size() );
+     vector< TrackingMatch* >::iterator result_end = std::set_difference(
+         tracking_matches.begin(), tracking_matches.end(),
+         deleted_tracking_matches.begin(), deleted_tracking_matches.end(),
+         result_matches.begin() );
+     result_matches.erase( result_end, result_matches.end() );
+     return result_matches;
+ }
+
+ /**
+  * Consistency check of the scorer's internal state.  Verifies that
+  *  - every surviving tracking match is listed in the LCB its id array entry names,
+  *  - every match listed in an LCB is a surviving tracking match,
+  *  - the LCBs recomputed from the surviving matches agree with pair [0][0].
+  * Diagnostics go to cerr/cout and genome::breakHere() is called at each failure.
+  *
+  * @return true when no inconsistency was found
+  */
+ bool EvenFasterSumOfPairsBreakpointScorer::validate()
+ {
+     vector< TrackingMatch* > trams = getResults();    // need to apply any deletions...
+     bool success = true;    // be optimistic!
+     // make sure all the tracking matches point to the right LCBs
+     for( size_t tmI = 0; tmI < trams.size(); tmI++ )
+     {
+         TrackingMatch* tm = trams[tmI];
+         for( size_t i = 0; i < tm_lcb_id_array.shape()[1]; ++i )
+             for( size_t j = 0; j < tm_lcb_id_array.shape()[2]; ++j )
+             {
+                 // skip this match if it's not defined
+                 if( tm->node_match->LeftEnd(n1_des[i]) == NO_MATCH ||
+                     tm->node_match->LeftEnd(n2_des[j]) == NO_MATCH ||
+                     tm_lcb_id_array[tm->match_id][i][j] == LCB_UNASSIGNED)
+                     continue;
+                 // find the tracking match in this LCB
+                 size_t id = tm_lcb_id_array[tm->match_id][i][j];
+                 vector< TrackingMatch* >& matches = pairwise_adjacencies[i][j][id].matches;
+                 vector< TrackingMatch* >::iterator iter = std::lower_bound( matches.begin(), matches.end(), tm );
+                 if( iter == matches.end() || *iter != tm )
+                 {
+                     cerr << "Missing match!!\n";
+                     cerr << "lcb_id: " << id << endl;
+                     cerr << "match: " << tm << endl;
+                     genome::breakHere();
+                     success = false;
+                 }
+             }
+     }
+     // make sure all the LCBs point to valid tracking matches
+     for( size_t i = 0; i < pairwise_adjacencies.shape()[0]; ++i )
+         for( size_t j = 0; j < pairwise_adjacencies.shape()[1]; ++j )
+         {
+             vector< TrackingLCB< TrackingMatch* > >& adjs = pairwise_adjacencies[i][j];
+             for( size_t lcbI = 0; lcbI < adjs.size(); lcbI++ )
+             {
+                 for( size_t mI = 0; mI < adjs[lcbI].matches.size(); ++mI )
+                 {
+                     vector< TrackingMatch* >::iterator iter = std::lower_bound( trams.begin(), trams.end(), adjs[lcbI].matches[mI] );
+                     // lower_bound returns end() when the match sorts after every survivor
+                     if( iter == trams.end() || *iter != adjs[lcbI].matches[mI] )
+                     {
+                         cerr << "Missing match: in adjacencies but not tracking_matches!!\n";
+                         cerr << "lcb_id: " << tm_lcb_id_array[adjs[lcbI].matches[mI]->match_id][i][j] << endl;
+                         genome::breakHere();
+                         success = false;
+                     }
+                 }
+             }
+         }
+
+     // make sure that the number of breakpoints matches up with what tracking_matches suggests
+     // convert back to an LCB list
+     vector< AbstractMatch* > new_matches(trams.size());
+     for( size_t mI = 0; mI < trams.size(); ++mI )
+         new_matches[mI] = trams[mI]->original_match;
+
+     vector< gnSeqI > breakpoints;
+     IdentifyBreakpoints( new_matches, breakpoints );
+     vector< vector< AbstractMatch* > > LCB_list;
+     // NOTE(review): this second call looks redundant; kept because IdentifyBreakpoints
+     // is defined elsewhere -- confirm it is idempotent before dropping it
+     IdentifyBreakpoints( new_matches, breakpoints );
+     ComputeLCBs_v2( new_matches, breakpoints, LCB_list );
+     cout << "breakpoints.size(): " << breakpoints.size() << "\tpairwise_lcb_count[0][0]: " << pairwise_lcb_count[0][0] << endl;
+     if( breakpoints.size() != pairwise_lcb_count[0][0] )
+         success = false;
+     size_t adjI = 0;
+     vector< TrackingLCB< TrackingMatch* > >& adjs = pairwise_adjacencies[0][0];
+     for( size_t lcbI = 0; lcbI < LCB_list.size(); ++lcbI )
+     {
+         // make sure each LCB exists...
+         // advance to the next live adjacency record without running off the end
+         while( adjI < adjs.size() && adjI != adjs[adjI].lcb_id )
+             adjI++;
+         if( adjI >= adjs.size() )
+         {
+             cerr << "Missing LCB: fewer live adjacency records than recomputed LCBs!!\n";
+             genome::breakHere();
+             success = false;
+             break;
+         }
+
+         // compare matches...
+         vector< AbstractMatch* > ms(adjs[adjI].matches.size()+LCB_list[lcbI].size(), (AbstractMatch*)NULL);
+         std::sort( LCB_list[lcbI].begin(), LCB_list[lcbI].end() );
+         vector< AbstractMatch* > asdf(adjs[adjI].matches.size());
+         for( size_t mI = 0; mI < adjs[adjI].matches.size(); ++mI )
+             asdf[mI] = adjs[adjI].matches[mI]->original_match;
+         std::sort( asdf.begin(), asdf.end() );
+         std::set_symmetric_difference( LCB_list[lcbI].begin(), LCB_list[lcbI].end(), asdf.begin(), asdf.end(), ms.begin() );
+         // this should throw a fit if the sets aren't equal.
+         // (ms is empty when both match sets are empty, so test that first)
+         if( !ms.empty() && ms[0] != NULL )
+         {
+             success = false;    // a mismatch is a validation failure, not just a diagnostic
+             cerr << "In adjacencies:\n";
+             for( size_t asdfI = 0; asdfI < asdf.size(); asdfI++ )
+             {
+                 printMatch(asdf[asdfI], cerr);
+                 cerr << endl;
+             }
+             cerr << "\nIn LCB_list:\n";
+             for( size_t mI = 0; mI < LCB_list[lcbI].size(); mI++ )
+             {
+                 printMatch(LCB_list[lcbI][mI], cerr);
+                 cerr << endl;
+             }
+             cerr << "\nAll matches ssc1\n";
+             SingleStartComparator<AbstractMatch> ssc1(1);
+             std::sort(new_matches.begin(), new_matches.end(), ssc1);
+             for( size_t mI = 0; mI < new_matches.size(); mI++ )
+             {
+                 printMatch(new_matches[mI], cerr);
+                 cerr << endl;
+             }
+
+             cerr << "\nAll matches ssc0\n";
+             SingleStartComparator<AbstractMatch> ssc0(0);
+             std::sort(new_matches.begin(), new_matches.end(), ssc0);
+             for( size_t mI = 0; mI < new_matches.size(); mI++ )
+             {
+                 printMatch(new_matches[mI], cerr);
+                 cerr << endl;
+             }
+             genome::breakHere();
+         }
+         adjI++;
+     }
+
+     return success;
+ }
+
+
+
+ /**
+  * Builds a scorer over the given LCB adjacency list.  Starts with one
+  * breakpoint per adjacency record and a total weight equal to the sum of
+  * all LCB weights.
+  */
+ SimpleBreakpointScorer::SimpleBreakpointScorer( std::vector< LCB >& adjacencies, double breakpoint_penalty, bool collinear ) :
+     adjs( adjacencies ),
+     bp_penalty( breakpoint_penalty ),
+     collinear( collinear )
+ {
+     const size_t lcb_total = adjs.size();
+     scores = std::vector< double >(lcb_total, 0);    // one zero score slot per LCB
+     bp_count = lcb_total;
+     total_weight = 0;
+     for( size_t k = 0; k != lcb_total; ++k )
+     {
+         total_weight += adjs[k].weight;
+     }
+ }
+
+ /** Returns the number of candidate moves: one per adjacency record. */
+ size_t SimpleBreakpointScorer::getMoveCount()
+ {
+     const size_t candidate_total = adjs.size();
+     return candidate_total;
+ }
+
+ /** Returns the current score: total LCB weight minus bp_penalty for every breakpoint. */
+ double SimpleBreakpointScorer::score()
+ {
+     const double penalty_total = (double)bp_count * bp_penalty;
+     return total_weight - penalty_total;
+ }
+
+ /**
+  * A move is valid when the LCB still owns its id (it has been neither deleted
+  * nor coalesced away) and rescoring it reproduces move_score exactly.
+  */
+ bool SimpleBreakpointScorer::isValid( size_t lcbI, double move_score )
+ {
+     const bool still_live = adjs[lcbI].lcb_id == lcbI;
+     // && short-circuits, so a dead LCB is never rescored
+     return still_live && (*this)(lcbI) == move_score;
+ }
+
+ /**
+  * Returns the relative change in score if lcbI were to be removed.
+  *
+  * The removal is performed on the adjacency list and immediately undone, only
+  * to learn how many breakpoints it would eliminate.  In collinear mode a
+  * score-lowering removal that still leaves breakpoints is reported as the
+  * positive value 1/(-diff), so that removal continues until one block is left.
+  */
+ double SimpleBreakpointScorer::operator()( size_t lcbI )
+ {
+     const double score_before = score();
+
+     // trial removal, rolled back right away
+     std::vector< std::pair< uint, uint > > trial_remaps;
+     std::vector< uint > trial_impact;
+     uint bp_removed = RemoveLCBandCoalesce( lcbI, adjs[0].left_adjacency.size(), adjs, scores, trial_remaps, trial_impact );
+     undoLcbRemoval( adjs[0].left_adjacency.size(), adjs, trial_remaps );
+
+     const double penalty_after = (double)(bp_count - bp_removed) * bp_penalty;
+     const double score_after = total_weight - adjs[lcbI].weight - penalty_after;
+     const double score_diff = score_after - score_before;
+
+     if( collinear && bp_count - bp_removed > 0 && score_diff < 0 )
+     {
+         // ensure that we continue removing blocks until only one is left
+         return 1/(-score_diff);
+     }
+     return score_diff;
+ }
+
+ /**
+  * Call to indicate that the given LCB has been removed.  Commits the removal,
+  * updates the running weight and breakpoint totals, and appends a freshly
+  * scored move to new_moves for every impacted LCB that is still live.
+  */
+ void SimpleBreakpointScorer::remove( uint lcbI, vector< pair< double, size_t > >& new_moves )
+ {
+     std::vector< std::pair< uint, uint > > remaps;
+     std::vector< uint > impacted;
+     uint bp_removed = RemoveLCBandCoalesce( lcbI, adjs[0].left_adjacency.size(), adjs, scores, remaps, impacted );
+     total_weight -= adjs[lcbI].weight;
+     bp_count -= bp_removed;
+
+     for( std::vector< uint >::const_iterator it = impacted.begin(); it != impacted.end(); ++it )
+     {
+         const uint neighbour = *it;
+         if( adjs[neighbour].lcb_id == neighbour )
+         {
+             // still a live LCB: rescore it as a candidate move
+             double scorediff = (*this)(neighbour);
+             new_moves.push_back(make_pair(scorediff, neighbour));
+         }
+     }
+ }
+
+
+ /**
+  * Builds a scorer over the given LCB adjacency list.  The starting total is
+  * the sum over all LCBs of (weight - minimum_weight).
+  */
+ GreedyRemovalScorer::GreedyRemovalScorer( std::vector< LCB >& adjacencies, double minimum_weight ) :
+ adjs( adjacencies ),
+ min_weight( minimum_weight )
+ {
+     const size_t lcb_total = adjs.size();
+     scores = std::vector< double >(lcb_total, 0);    // one zero score slot per LCB
+     total_weight = 0;
+     for( size_t k = 0; k != lcb_total; ++k )
+     {
+         total_weight += adjs[k].weight - min_weight;
+     }
+ }
+
+ /** Returns the number of candidate moves: one per adjacency record. */
+ size_t GreedyRemovalScorer::getMoveCount()
+ {
+     const size_t candidate_total = adjs.size();
+     return candidate_total;
+ }
+
+ /** Returns the current score, which is simply the running weight total. */
+ double GreedyRemovalScorer::score()
+ {
+     const double current = total_weight;
+     return current;
+ }
+
+ /**
+  * A move is valid when the LCB still owns its id (it has been neither deleted
+  * nor coalesced away) and rescoring it reproduces move_score exactly.
+  */
+ bool GreedyRemovalScorer::isValid( size_t lcbI, double move_score )
+ {
+     const bool still_live = adjs[lcbI].lcb_id == lcbI;
+     // && short-circuits, so a dead LCB is never rescored
+     return still_live && (*this)(lcbI) == move_score;
+ }
+
+ /**
+  * Returns the relative change in score if lcbI were to be removed: the
+  * negated amount by which the LCB's weight exceeds the minimum weight.
+  */
+ double GreedyRemovalScorer::operator()( size_t lcbI )
+ {
+     const double excess = adjs[lcbI].weight-min_weight;
+     return -excess;
+ }
+
+ /**
+  * Call to indicate that the given LCB has been removed.  Commits the removal,
+  * updates the running total, and appends a freshly scored move to new_moves
+  * for every impacted LCB that is still live.
+  */
+ void GreedyRemovalScorer::remove( uint lcbI, vector< pair< double, size_t > >& new_moves )
+ {
+     std::vector< std::pair< uint, uint > > remaps;
+     std::vector< uint > impacted;
+     // the number of breakpoints removed is not needed by this scorer
+     RemoveLCBandCoalesce( lcbI, adjs[0].left_adjacency.size(), adjs, scores, remaps, impacted );
+     total_weight -= (adjs[lcbI].weight-min_weight);
+
+     for( std::vector< uint >::const_iterator it = impacted.begin(); it != impacted.end(); ++it )
+     {
+         const uint neighbour = *it;
+         if( adjs[neighbour].lcb_id == neighbour )
+         {
+             // still a live LCB: rescore it as a candidate move
+             double scorediff = (*this)(neighbour);
+             new_moves.push_back(make_pair(scorediff, neighbour));
+         }
+     }
+ }
+
+
+
+
+} // namespace mems
+
diff --git a/libMems/GreedyBreakpointElimination.h b/libMems/GreedyBreakpointElimination.h
new file mode 100644
index 0000000..254a880
--- /dev/null
+++ b/libMems/GreedyBreakpointElimination.h
@@ -0,0 +1,873 @@
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#ifndef __GreedyBreakpointElimination_h__
+#define __GreedyBreakpointElimination_h__
+
+#include <libMems/AbstractMatch.h>
+#include <iostream>
+#include <boost/multi_array.hpp>
+#include <libMems/PhyloTree.h>
+#include <libMems/SubstitutionMatrix.h>
+#include <libMems/SeedOccurrenceList.h>
+#include <libMems/IntervalList.h>
+#include <libMems/LCB.h>
+#include <stack>
+
+namespace mems {
+
+extern bool penalize_repeats;
+
+/**
+ * A wrapper that maps a match among extant sequences to a match among ancestral and extant seqs.
+ * The class declares no constructors; all three members are public and are filled in by the code
+ * that creates the wrapper.  The TrackingMatch typedef below instantiates it with
+ * mems::AbstractMatch* as MatchType.
+ */
+template <class MatchType>
+class LcbTrackingMatch
+{
+public:
+ MatchType original_match; // presumably the match among the extant sequences -- TODO confirm against the code that fills it in
+ MatchType node_match; // presumably the corresponding match among ancestral and extant seqs -- TODO confirm
+ size_t match_id; // used to index into global arrays of lcb_id and score
+};
+typedef LcbTrackingMatch< mems::AbstractMatch* > TrackingMatch;
+
+/**
+ * This class is used to track relationships between LCBs during the LCB determination process.
+ * It is a pairwise (two sequence) record: boundaries and adjacencies are stored in fixed
+ * arrays of length 2, together with the LCB's weight, its matches and an id.
+ */
+template <class MatchType>
+class TrackingLCB
+{
+public:
+    /**
+     * Creates an empty LCB with every member in a defined state: zero coordinates and weight,
+     * no neighbours ((uint)-1), id -1 and not flagged for deletion.  Without this the scalar
+     * members would be indeterminate and copying a default-constructed object would read them.
+     */
+    TrackingLCB() : weight( 0 ), lcb_id( -1 ), to_be_deleted( false )
+    {
+        for( int seqI = 0; seqI < 2; ++seqI )
+        {
+            left_end[seqI] = 0;
+            right_end[seqI] = 0;
+            left_adjacency[seqI] = (uint)-1;
+            right_adjacency[seqI] = (uint)-1;
+        }
+    }
+    /** Member-wise copy, performed through the implicitly generated copy assignment */
+    TrackingLCB( const TrackingLCB& l ){ *this = l; }
+    /** Constructs a TrackingLCB from a pairwise LCB */
+    TrackingLCB( const mems::LCB& l ){ *this = l; }
+    /**
+     * Copies boundaries, adjacencies, id and weight from a pairwise LCB.
+     * Elements 0 and 1 of each of l's coordinate and adjacency containers are read, so l must
+     * describe at least two sequences.  The matches member is left untouched and
+     * to_be_deleted is reset to false.
+     */
+    TrackingLCB& operator=( const mems::LCB& l )
+    {
+        left_end[0] = l.left_end[0];
+        left_end[1] = l.left_end[1];
+        right_end[0] = l.right_end[0];
+        right_end[1] = l.right_end[1];
+        left_adjacency[0] = l.left_adjacency[0];
+        left_adjacency[1] = l.left_adjacency[1];
+        right_adjacency[0] = l.right_adjacency[0];
+        right_adjacency[1] = l.right_adjacency[1];
+        lcb_id = l.lcb_id;
+        weight = l.weight;
+        to_be_deleted = false;
+        return *this;
+    }
+    int64 left_end[2];	/**< The left end position of the LCB in each sequence */
+    int64 right_end[2];	/**< The right end position of the LCB in each sequence */
+    uint left_adjacency[2];	/**< 'Pointers' (actually IDs) to the LCBs on the left in each sequence */
+    uint right_adjacency[2];	/**< 'Pointers' (actually IDs) to the LCBs on the right in each sequence */
+    double weight;		/**< The weight (or coverage) of this LCB */
+    std::vector< MatchType > matches;	/**< The matches that make up this LCB */
+    int lcb_id;			/**< A numerical ID that can be assigned to this LCB */
+    bool to_be_deleted;	/**< Deletion flag; cleared whenever an LCB is assigned to this object */
+};
+
+/** indicates an LCB identifier hasn't been assigned or is unknown */
+const uint LCB_UNASSIGNED = (std::numeric_limits<uint>::max)();
+
+typedef boost::multi_array< std::vector< TrackingLCB< TrackingMatch* > >, 2 > PairwiseLCBMatrix;
+
+
+/**
+ * computes an anchoring score for the matches contained inside an LCB
+ */
+template< class MatchVector >
+double GetPairwiseAnchorScore(
+ MatchVector& lcb, std::vector< genome::gnSequence* >& seq_table,
+ const mems::PairwiseScoringScheme& subst_scoring, mems::SeedOccurrenceList& sol_1,
+ mems::SeedOccurrenceList& sol_2, bool penalize_gaps = false );
+
+/**
+ * Ordering on (score, move index) pairs that looks at the score only.
+ * It is a "less than" on the score, so the std heap algorithms build a max-heap with it:
+ * the pair with the largest score sits at the front and is popped first.
+ */
+class MoveScoreHeapComparator
+{
+public:
+    bool operator()( const std::pair< double, size_t >& a, const std::pair< double, size_t >& b ) const
+    {
+        // true when a's score is strictly smaller than b's; the move index is ignored
+        return b.first > a.first;
+    }
+};
+
+/**
+ * Computes all pairwise LCBs from a set of tracking matches
+ */
+void getPairwiseLCBs(
+ uint nI,
+ uint nJ,
+ uint dI,
+ uint dJ,
+ std::vector< TrackingMatch* >& tracking_matches,
+ std::vector< TrackingLCB<TrackingMatch*> >& t_lcbs,
+ boost::multi_array< double, 3 >& tm_score_array,
+ boost::multi_array< size_t, 3 >& tm_lcb_id_array );
+
+/** creates an appropriately sized matrix for mapping individual TrackingMatches to their containing LCBs */
+void initTrackingMatchLCBTracking(
+ const std::vector< mems::TrackingMatch >& tracking_matches,
+ size_t n1_count,
+ size_t n2_count,
+ boost::multi_array< size_t, 3 >& tm_lcb_id_array );
+
+
+/** removes an LCB from an LCB list and coalesces surrounding LCBs. Returns the number of LCBs removed
+ * After LCBs are removed, the adjacency list should be processed with filterLCBs()
+ * @param id_remaps This is populated with a list of LCB ids that were deleted or coalesced and now have a new LCB id
+ * for each coalesced LCB, an entry of the form <old id, new id> is added, deleted LCBs have
+ * entries of the form <deleted, -1>. Entries appear in the order operations were performed
+ * and the function undoLcbRemoval() can undo these operations in reverse order
+ */
+template< class LcbVector >
+uint RemoveLCBandCoalesce( size_t lcbI, uint seq_count,
+ LcbVector& adjacencies,
+ std::vector< double >& scores,
+ std::vector< std::pair< uint, uint > >& id_remaps,
+ std::vector< uint >& impact_list );
+
+
+void printMatch( mems::AbstractMatch* m, std::ostream& os );
+
+/**
+ * Writes one "(start,right end)" pair per sequence of the match to os.
+ * Pairs are separated by a single tab; nothing is written after the last pair.
+ */
+inline
+void printMatch( mems::AbstractMatch* m, std::ostream& os )
+{
+    for( size_t seqI = 0; seqI < m->SeqCount(); ++seqI )
+    {
+        // separator goes in front of every pair except the first
+        if( seqI != 0 )
+            os << '\t';
+        os << "(" << m->Start(seqI);
+        os << "," << m->RightEnd(seqI);
+        os << ")";
+    }
+}
+
+void printProgress( uint prev_prog, uint cur_prog, std::ostream& os );
+
+
+/**
+ * Orders (match, label) pairs by their match component only.
+ * The comparison itself is delegated to an SSC comparator built for sequence seqI.
+ */
+template< typename PairType >
+class LabelSort
+{
+public:
+    LabelSort( uint seqI ) : ssc( seqI ) {}
+    bool operator()( const PairType& lhs, const PairType& rhs )
+    {
+        const bool lhs_precedes = ssc( lhs.first, rhs.first );
+        return lhs_precedes;
+    }
+private:
+    LabelSort();	// private and given no body here: a sequence index must always be supplied
+    mems::SSC<mems::AbstractMatch> ssc;
+};
+/**
+ * Finds the positions in a list of multi-sequence matches at which collinear blocks end.
+ *
+ * mlist is sorted in place with an SSC comparator for sequence 0, and every match is labelled
+ * with its index in that order.  For each further sequence the labelled matches are re-sorted
+ * for that sequence and scanned pairwise: two neighbours belong to the same block only when
+ * they have the same orientation relative to sequence 0 and their labels are consecutive
+ * (ascending when that relative orientation is forward, descending when it is reversed).
+ * Otherwise the label of whichever match is last in its block, counted in sequence 0 order,
+ * is recorded.
+ *
+ * @param mlist			the matches; reordered by this function.  Elements are dereferenced with ->
+ * @param breakpoints	receives the sorted, duplicate-free labels (indices into the reordered
+ *						mlist) of the last match of every block.  Left untouched when mlist is empty.
+ */
+template<class MatchVector>
+void IdentifyBreakpoints( MatchVector& mlist, std::vector<gnSeqI>& breakpoints )
+{
+ if( mlist.size() == 0 )
+ return; // no matches: breakpoints keeps whatever the caller passed in
+ breakpoints = std::vector<gnSeqI>(1, mlist.size()-1); // the final match always ends a block
+
+ mems::SSC<mems::AbstractMatch> ssc(0);
+ std::sort( mlist.begin(), mlist.end(), ssc ); // establish the sequence 0 order that the labels refer to
+ typedef typename MatchVector::value_type value_type;
+ typedef std::pair< value_type, size_t > LabelPairType;
+ std::vector< LabelPairType > label_list;
+ typename MatchVector::iterator cur = mlist.begin();
+ typename MatchVector::iterator end = mlist.end();
+ size_t i = 0;
+ for( ;cur != end; ++cur )
+ {
+ label_list.push_back( std::make_pair( *cur, i ) ); // label = position in sequence 0 order
+ ++i;
+ }
+
+ uint seq_count = mlist[0]->SeqCount();
+ // check for breakpoints in each sequence other than sequence 0, which defines the labels
+ for( uint seqI = 1; seqI < seq_count; seqI++ )
+ {
+ LabelSort< LabelPairType > ls(seqI);
+ std::sort( label_list.begin(), label_list.end(), ls );
+
+ typename std::vector< LabelPairType >::const_iterator prev = label_list.begin(); // always one element behind iter
+ typename std::vector< std::pair< typename MatchVector::value_type, size_t > >::const_iterator iter = label_list.begin();
+ typename std::vector< std::pair< typename MatchVector::value_type, size_t > >::const_iterator lab_end = label_list.end();
+
+ bool prev_orient = (*prev).first->Orientation(seqI) == (*prev).first->Orientation(0); // true = same orientation as in sequence 0
+ if( !prev_orient ) // if we start in a different orientation than the ref seq there's a bp here
+ breakpoints.push_back(prev->second);
+
+ for( ++iter; iter != lab_end; ++iter )
+ {
+ bool cur_orient = (*iter).first->Orientation(seqI) == (*iter).first->Orientation(0);
+ if( prev_orient == cur_orient &&
+ ( ( prev_orient && (*prev).second + 1 == (*iter).second) ||
+ ( !prev_orient && (*prev).second - 1 == (*iter).second) // labels are unsigned: a label of 0 wraps around here and cannot match
+ )
+ )
+ {
+ prev_orient = cur_orient;
+ ++prev;
+ continue; // no breakpoint here
+ }
+
+ // always add the last match in a new block (scanning from left to right in seq 0)
+ if( prev_orient )
+ breakpoints.push_back( prev->second );
+ if( !cur_orient )
+ breakpoints.push_back( iter->second );
+
+ prev_orient = cur_orient;
+ ++prev;
+ }
+ if( prev_orient )
+ breakpoints.push_back( prev->second ); // close the block that is still open at the end of the scan
+ }
+ std::sort( breakpoints.begin(), breakpoints.end() );
+ std::vector<gnSeqI>::iterator uni = std::unique( breakpoints.begin(), breakpoints.end() ); // the same label may have been recorded for several sequences
+ breakpoints.erase( uni, breakpoints.end() );
+}
+
+
+/**
+ * Splits a match list into consecutive blocks.
+ * Each entry of breakpoints is the index of the last match of a block; a block covers the
+ * matches from just after the previous breakpoint (or from index 0) up to and including its
+ * own breakpoint.  lcb_list is replaced by the resulting blocks, unless breakpoints is empty,
+ * in which case the function returns without touching lcb_list.
+ */
+template< class MatchVector >
+void ComputeLCBs_v2( const MatchVector& meml, const std::vector<gnSeqI>& breakpoints, std::vector< MatchVector >& lcb_list )
+{
+    // there must be at least one end of a block defined
+    if( breakpoints.empty() )
+        return;
+
+    lcb_list.clear();
+
+    MatchVector empty_block;
+    uint block_first = 0;	// index of the first match of the block being built
+    const size_t bp_total = breakpoints.size();
+    for( size_t bpI = 0; bpI != bp_total; ++bpI )
+    {
+        const gnSeqI block_last = breakpoints[bpI];
+        // start from an empty block and append the matches block_first..block_last
+        lcb_list.push_back( empty_block );
+        MatchVector& block = lcb_list.back();
+        block.insert( block.end(), meml.begin() + block_first, meml.begin() + block_last + 1 );
+        block_first = block_last + 1;
+    }
+}
+
+/**
+ * Rebuilds the LCB adjacency list for a set of LCBs, each given as a vector of matches.
+ *
+ * One mems::LCB is created per entry of lcb_list.  Its boundaries come from FindBoundaries();
+ * both ends are stored negated for a sequence in which the LCB's orientation flag is false,
+ * and a sequence whose left end is mems::NO_MATCH keeps the boundary value 0.  The LCB's id is
+ * its index in lcb_list and its weight is weights[index].  Then, for every sequence, the LCBs
+ * are sorted with LCBLeftComparator and each one records the ids of its left and right
+ * neighbours in that order, (uint)-1 marking "no neighbour" at either end.  The list is
+ * finally sorted with LCBIDComparator.
+ *
+ * @param lcb_list		the LCBs; the first match of the first LCB supplies the sequence count, so that LCB must not be empty
+ * @param weights		one weight per entry of lcb_list
+ * @param adjacencies	cleared and then filled; stays empty when lcb_list is empty
+ */
+template <class MatchVector>
+void computeLCBAdjacencies_v3( const std::vector< MatchVector >& lcb_list, std::vector< double >& weights, std::vector< mems::LCB >& adjacencies )
+{
+ adjacencies.clear(); // start with no LCB adjacencies
+ if( lcb_list.size() == 0 )
+ return; // there aren't any LCBs so there aren't any adjacencies!
+
+ uint seq_count = lcb_list.front().front()->SeqCount();
+ uint seqI;
+ uint lcbI;
+ for( lcbI = 0; lcbI < lcb_list.size(); ++lcbI ){
+ mems::LCB lcb;
+ std::vector<gnSeqI> left_end;
+ std::vector<gnSeqI> length;
+ std::vector<bool> orientation;
+ FindBoundaries( lcb_list[lcbI], left_end, length, orientation );
+
+ lcb.left_adjacency = std::vector<uint>( left_end.size(), -1 ); // -1 converts to the largest uint, the "no neighbour" marker
+ lcb.right_adjacency = std::vector<uint>( left_end.size(), -1 );
+ lcb.left_end = std::vector<int64>( left_end.size(), 0 );
+ lcb.right_end = std::vector<int64>( left_end.size(), 0 );
+
+ for( seqI = 0; seqI < seq_count; seqI++ ){
+ // support "ragged edges" on the ends of LCBs
+ if( left_end[seqI] == mems::NO_MATCH )
+ continue;
+ lcb.left_end[seqI] = left_end[seqI];
+ lcb.right_end[seqI] = left_end[seqI] + length[seqI];
+ if( !orientation[seqI] )
+ {
+ lcb.left_end[seqI] = -lcb.left_end[seqI]; // a false orientation flag is encoded as negative coordinates
+ lcb.right_end[seqI] = -lcb.right_end[seqI];
+ }
+ }
+ lcb.lcb_id = adjacencies.size(); // id == index in lcb_list
+ lcb.weight = weights[ lcbI ];
+ adjacencies.push_back( lcb );
+ }
+
+ for( seqI = 0; seqI < seq_count; seqI++ ){
+ mems::LCBLeftComparator llc( seqI );
+ std::sort( adjacencies.begin(), adjacencies.end(), llc );
+ for( lcbI = 1; lcbI + 1 < lcb_list.size(); lcbI++ ){ // interior LCBs only; the two ends are handled below
+ adjacencies[ lcbI ].left_adjacency[ seqI ] = adjacencies[ lcbI - 1 ].lcb_id;
+ adjacencies[ lcbI ].right_adjacency[ seqI ] = adjacencies[ lcbI + 1 ].lcb_id;
+ }
+ if( lcbI == lcb_list.size() )
+ lcbI--; // need to decrement when there is only a single LCB
+
+ // set first and last lcb adjacencies to -1; lcbI is now the index of the last LCB
+ adjacencies[ 0 ].left_adjacency[ seqI ] = (uint)-1;
+ adjacencies[ lcbI ].right_adjacency[ seqI ] = (uint)-1;
+ if( lcbI > 0 ){
+ adjacencies[ 0 ].right_adjacency[ seqI ] = adjacencies[ 1 ].lcb_id;
+ adjacencies[ lcbI ].left_adjacency[ seqI ] = adjacencies[ lcbI - 1 ].lcb_id;
+ }
+ }
+ mems::LCBIDComparator lic;
+ std::sort( adjacencies.begin(), adjacencies.end(), lic ); // restore the order produced by LCBIDComparator after the per-sequence sorts
+
+}
+
+/**
+ * Redesign to be more intuitive.  left_adjacency is always left, regardless of LCB orientation
+ * Convenience overload for an IntervalList: every interval is treated as an LCB of its own
+ * (a one-element vector holding a pointer to it) and the work is delegated to the
+ * vector-of-LCBs overload.
+ */
+inline
+void computeLCBAdjacencies_v3( mems::IntervalList& iv_list, std::vector< double >& weights, std::vector< mems::LCB >& adjacencies ){
+    typedef std::vector< mems::Interval* > IntervalPtrVector;
+    std::vector< IntervalPtrVector > singleton_lcbs;
+    for( size_t ivI = 0; ivI < iv_list.size(); ++ivI )
+    {
+        IntervalPtrVector one_interval;
+        one_interval.push_back( &iv_list[ivI] );
+        singleton_lcbs.push_back( one_interval );
+    }
+    computeLCBAdjacencies_v3( singleton_lcbs, weights, adjacencies );
+}
+
+/**
+ * Takes a set of filtered LCB adjacencies and an unfiltered set of matches as input
+ * returns a filtered set of matches that reflects the LCBs found
+ *
+ * The lcb_id field of each adjacency entry is read as a forwarding pointer: an entry whose
+ * lcb_id equals its own index is kept, any other value names the entry it was merged into, and
+ * the values -1 and -2 mark removed entries.  Forwarding chains are followed to their end and
+ * every entry visited on the way is re-pointed directly at the final target.
+ *
+ * @param adjacencies		in: adjacency list after LCB removal; out: recomputed for the surviving LCBs
+ * @param lcb_list			in: matches of every original LCB; out: only the non-empty consolidated LCBs,
+ *							each sorted with a MatchStartComparator for sequence 0
+ * @param weights			in: weight per original LCB; out: weight per surviving LCB (taken from the surviving entry's original index)
+ * @param deleted_matches	receives the matches of entries marked -2 and of entries whose chain ends in a removed entry
+ */
+template< class MatchVector >
+void filterMatches_v2( std::vector< mems::LCB >& adjacencies, std::vector< MatchVector >& lcb_list, std::vector< double >& weights, MatchVector& deleted_matches ){
+ if( lcb_list.size() < 1 )
+ return;
+ MatchVector lcb_tmp = lcb_list[ 0 ];
+ lcb_tmp.clear(); // an empty vector obtained by copying and clearing the first LCB
+ std::vector< MatchVector > filtered_lcbs( lcb_list.size(), lcb_tmp );
+ uint lcbI;
+ for( lcbI = 0; lcbI < adjacencies.size(); lcbI++ ){
+ if( adjacencies[ lcbI ].lcb_id == lcbI ){ // entry still represents itself: keep its matches
+ filtered_lcbs[ lcbI ].insert( filtered_lcbs[ lcbI ].end(), lcb_list[ lcbI ].begin(), lcb_list[ lcbI ].end() );
+ continue;
+ }
+ if( adjacencies[ lcbI ].lcb_id == -1 ){
+ std::cerr << "weird";
+ continue; // this one was removed; note its matches are not added to deleted_matches
+ }
+ if( adjacencies[ lcbI ].lcb_id == -2 )
+ {
+ deleted_matches.insert( deleted_matches.end(), lcb_list[lcbI].begin(), lcb_list[lcbI].end() );
+ continue; // this one was removed
+ }
+
+ // this one points elsewhere
+ // search and update the union/find structure for the target
+ std::stack< uint > visited_lcbs;
+ visited_lcbs.push( lcbI );
+ uint cur_lcb = adjacencies[ lcbI ].lcb_id;
+ while( adjacencies[ cur_lcb ].lcb_id != cur_lcb ){
+ visited_lcbs.push( cur_lcb );
+ cur_lcb = adjacencies[ cur_lcb ].lcb_id;
+ if( cur_lcb == -1 || cur_lcb == -2 ){
+// std::cerr << "improper hoodidge\n";
+ break; // this one points to an LCB that got deleted
+ }
+ }
+ while( visited_lcbs.size() > 0 ){ // path compression: point every visited entry straight at the final target
+ adjacencies[ visited_lcbs.top() ].lcb_id = cur_lcb;
+ visited_lcbs.pop();
+ }
+ // add this LCB's matches to the target LCB.
+ if( cur_lcb != -1 && cur_lcb != -2 )
+ filtered_lcbs[ cur_lcb ].insert( filtered_lcbs[ cur_lcb ].end(), lcb_list[ lcbI ].begin(), lcb_list[ lcbI ].end() );
+ else
+ deleted_matches.insert( deleted_matches.end(), lcb_list[lcbI].begin(), lcb_list[lcbI].end() );
+ }
+
+
+ lcb_list.clear();
+ std::vector< double > new_weights;
+ for( lcbI = 0; lcbI < filtered_lcbs.size(); lcbI++ ){
+ if( filtered_lcbs[ lcbI ].size() > 0 ){ // only LCBs that still hold matches survive
+ lcb_list.push_back( filtered_lcbs[ lcbI ] );
+ new_weights.push_back( weights[lcbI] );
+ }
+ }
+
+ // sort the matches inside consolidated LCBs
+ mems::MatchStartComparator<mems::AbstractMatch> msc( 0 );
+ for( lcbI = 0; lcbI < lcb_list.size(); lcbI++ ){
+ std::sort( lcb_list[ lcbI ].begin(), lcb_list[ lcbI ].end(), msc );
+ }
+
+ // calculate the LCB adjacencies
+ weights = new_weights;
+ computeLCBAdjacencies_v3( lcb_list, weights, adjacencies );
+
+}
+
+// predeclared to avoid need to include Islands.h
+const score_t INV_SCORE = (std::numeric_limits<score_t>::max)();
+void computeMatchScores( const std::string& seq1, const std::string& seq2, const PairwiseScoringScheme& scoring, std::vector<score_t>& scores );
+void computeGapScores( const std::string& seq1, const std::string& seq2, const PairwiseScoringScheme& scoring, std::vector<score_t>& scores );
+
+/**
+ * computes an anchoring score for the matches contained inside an LCB
+ *
+ * For every match the pairwise alignment of its first two sequences is extracted and scored
+ * column by column with computeMatchScores() and, when penalize_gaps is set, computeGapScores().
+ * Each positive score in a column without gaps is then adjusted by the product of the two seed
+ * occurrence frequencies at that position (a product of 0 is treated as 1).  The LCB score is
+ * the sum of all column scores that are not INV_SCORE, accumulated over all matches.
+ *
+ * @param lcb			the matches of the LCB; elements are dereferenced with ->
+ * @param seq_table		sequences handed to mems::GetAlignment()
+ * @param subst_scoring	scoring scheme handed to the two scoring functions
+ * @param sol_1, sol_2	seed occurrence lists queried for the first and second sequence
+ * @param penalize_gaps	whether computeGapScores() is applied as well
+ * @return the accumulated score; a match whose score is not strictly between -1e9 and 1e9 is
+ *         reported on std::cerr ("scoring error") and genome::breakHere() is called, but the
+ *         value is still added
+ */
+template< class MatchVector >
+double GetPairwiseAnchorScore( MatchVector& lcb,
+ std::vector< genome::gnSequence* >& seq_table,
+ const mems::PairwiseScoringScheme& subst_scoring,
+ mems::SeedOccurrenceList& sol_1,
+ mems::SeedOccurrenceList& sol_2,
+ bool penalize_gaps )
+{
+ double lcb_score = 0;
+ typename MatchVector::iterator match_iter = lcb.begin();
+ for( ; match_iter != lcb.end(); ++match_iter )
+ {
+ typedef typename MatchVector::value_type MatchPtrType;
+ MatchPtrType m = *match_iter;
+ std::vector< score_t > scores(m->AlignmentLength(), 0); // one score per alignment column
+ std::vector< std::string > et;
+ mems::GetAlignment(*m, seq_table, et); // et[0] and et[1] are indexed by column below, '-' marking a gap
+
+ // get substitution/gap score
+ mems::computeMatchScores( et[0], et[1], subst_scoring, scores );
+ if( penalize_gaps )
+ mems::computeGapScores( et[0], et[1], subst_scoring, scores );
+
+ // scale match scores by uniqueness
+ size_t merI = 0; // non-gap characters of et[0] seen so far
+ size_t merJ = 0; // non-gap characters of et[1] seen so far
+ double uni_count = 0;
+ double uni_score = 0;
+ const size_t m_aln_length = m->AlignmentLength();
+ const int64 m_leftend_0 = m->LeftEnd(0);
+ const int64 m_leftend_1 = m->LeftEnd(1);
+ for( size_t colI = 0; colI < m_aln_length; ++colI )
+ {
+ if(et[0][colI] != '-' && et[1][colI] != '-' )
+ {
+ mems::SeedOccurrenceList::frequency_type uni1 = sol_1.getFrequency(m_leftend_0 + merI - 1); // the -1 presumably converts a 1-based LeftEnd to a 0-based index -- TODO confirm
+ mems::SeedOccurrenceList::frequency_type uni2 = sol_2.getFrequency(m_leftend_1 + merJ - 1);
+ mems::SeedOccurrenceList::frequency_type uniprod = uni1*uni2;
+ uniprod = uniprod == 0 ? 1 : uniprod; // avoid dividing by zero below
+ // scale by the uniqueness product, which approximates the number of ways to match up non-unique k-mers
+ // in the worst case of a very repetitive match, the score becomes the negative of the match score
+ if( scores[colI] > 0 ) // NOTE(review): INV_SCORE is the largest score_t, so it also passes this test -- confirm no gap-free column can carry it
+ {
+ if(penalize_repeats)
+ scores[colI] = (score_t)((double)scores[colI] * (2.0 / uniprod)) - scores[colI];
+ else
+ scores[colI] = (score_t)((mems::SeedOccurrenceList::frequency_type)scores[colI] / uniprod);
+ }
+ }
+ if(et[0][colI] != '-')
+ merI++;
+ if(et[1][colI] != '-')
+ merJ++;
+ }
+
+
+ double m_score = 0;
+ for( size_t i = 0; i < scores.size(); ++i )
+ if( scores[i] != INV_SCORE ) // columns still marked INV_SCORE do not contribute
+ m_score += scores[i];
+
+ if( !( m_score > -1000000000 && m_score < 1000000000 ) )
+ {
+ std::cerr << "scoring error\n";
+ genome::breakHere();
+ }
+ lcb_score += m_score;
+ }
+
+
+ return lcb_score;
+}
+
+
+/**
+ * Scorer used to drive greedySearch(): it offers the move interface that function calls
+ * (getMoveCount(), score(), operator(), isValid(), remove(), getMaxNewMoveCount(), validate())
+ * where a move is a (score, index) pair.  Only the declaration lives here; the member
+ * functions are defined elsewhere.
+ *
+ * The two weight matrices and the tracking match vector are taken by value by the constructor
+ * and the class stores its own members of the same types.  tm_score_array and tm_lcb_id_array
+ * are held as references, so the arrays passed to the constructor must outlive the scorer.
+ */
+class EvenFasterSumOfPairsBreakpointScorer
+{
+public:
+ EvenFasterSumOfPairsBreakpointScorer(
+ double breakpoint_penalty,
+ double minimum_breakpoint_penalty,
+ boost::multi_array<double,2> bp_weight_matrix,
+ boost::multi_array<double,2> conservation_weight_matrix,
+ std::vector< TrackingMatch* > tracking_match,
+ mems::PairwiseLCBMatrix& pairwise_adjacency_matrix,
+ std::vector<node_id_t>& n1_descendants,
+ std::vector<node_id_t>& n2_descendants,
+ boost::multi_array< double, 3 >& tm_score_array,
+ boost::multi_array< size_t, 3 >& tm_lcb_id_array,
+ size_t seqI_begin,
+ size_t seqI_end,
+ size_t seqJ_begin,
+ size_t seqJ_end
+ );
+
+ /**
+ * Returns the number of possible moves a search algorithm may make from the current
+ * location in LCB search space. In this case it's simply the total number of pairwise LCBs
+ */
+ size_t getMoveCount();
+
+ /** returns the score of the current state */
+ double score();
+
+ /** scores a move */
+ double operator()( std::pair< double, size_t >& the_move );
+
+ /** checks whether a particular move is a valid move */
+ bool isValid( std::pair< double, size_t >& the_move );
+
+ /** applies a move; this is the overload greedySearch() calls.  new_move_count receives the number of entries written to new_move_list */
+ bool remove( std::pair< double, size_t >& the_move, std::vector< std::pair< double, size_t > >& new_move_list, size_t& new_move_count );
+
+ /** applies a score difference */
+ void applyScoreDifference( boost::multi_array< double, 2 >& lcb_score_diff, boost::multi_array< size_t, 2 >& lcb_removed_count );
+
+ /** undoes a score difference, if it wasn't accepted for example */
+ void undoScoreDifference( boost::multi_array< double, 2 >& lcb_score_diff, boost::multi_array< size_t, 2 >& lcb_removed_count );
+
+ /** returns the maximum number of new moves generated by any LCB removal */
+ size_t getMaxNewMoveCount();
+
+ /** call to indicate that the given LCB has been removed
+ * @param really_remove set to false if the move should merely be checked for validity
+ * returns false if the move was invalid
+ */
+ bool remove( std::pair< double, size_t >& the_move, bool really_remove,
+ boost::multi_array< double, 2 >& lcb_score_diff, boost::multi_array< size_t, 2 >& lcb_removed_count,
+ bool score_new_moves, std::vector< std::pair< double, size_t > >& new_move_list, size_t& new_move_count );
+
+ /** returns the final set of TrackingMatch values which remain after applying greedy breakpoint elimination */
+ std::vector< mems::TrackingMatch* > getResults();
+
+ /** sanity checks all internal data structures */
+ bool validate();
+
+protected:
+ double bp_penalty;
+ boost::multi_array<double,2> bp_weights;
+ boost::multi_array<double,2> conservation_weights;
+ std::vector< mems::TrackingMatch* > tracking_matches;
+ mems::PairwiseLCBMatrix pairwise_adjacencies; // stored by value although the constructor parameter is a reference
+ std::vector<node_id_t> n1_des;
+ std::vector<node_id_t> n2_des;
+
+ boost::multi_array< size_t, 2 > pairwise_lcb_count;
+ boost::multi_array< double, 2 > pairwise_lcb_score;
+
+ std::vector< TrackingMatch* > deleted_tracking_matches;
+
+ double min_breakpoint_penalty;
+
+private:
+ // avoid continuous size lookup
+ const size_t seqI_count;
+ const size_t seqJ_count;
+
+ // variables used during score computation
+ boost::multi_array< std::vector< std::pair< uint, uint > >, 2 > all_id_remaps;
+ boost::multi_array< std::vector< uint >, 2 > full_impact_list;
+ boost::multi_array< double, 2 > internal_lcb_score_diff[3];
+ boost::multi_array< size_t, 2 > internal_lcb_removed_count[3];
+ int using_lsd; // presumably selects which of the three internal_lcb_* buffers is in use -- TODO confirm in the implementation
+ std::vector< double > lsd_zeros;
+ std::vector< size_t > lrc_zeros;
+ std::vector< double > bogus_scores;
+ std::vector< size_t > my_del_lcbs;
+ std::vector< size_t > lcb_ids;
+
+ boost::multi_array< double, 3 >& tm_score_array; // reference member: refers to the caller's array
+ boost::multi_array< size_t, 3 >& tm_lcb_id_array; // reference member: refers to the caller's array
+
+ // limit to a range of sequences
+ const size_t seqI_first;
+ const size_t seqJ_first;
+ const size_t seqI_last;
+ const size_t seqJ_last;
+
+ // for debugging
+ bool first_time;
+};
+
+
+template< class BreakpointScorerType >
+int64 greedyBreakpointElimination_v4( std::vector< mems::LCB >& adjacencies, std::vector< double >& scores, BreakpointScorerType& bp_scorer, std::ostream* status_out, size_t g1_tag = 0, size_t g2_tag = 0 );
+
+template< class SearchScorer >
+double greedySearch( SearchScorer& spbs );
+
+
+/**
+ * A breakpoint scorer that applies a fixed penalty for each breakpoint that exists in a set of
+ * two or more sequences
+ *
+ * It offers the interface that greedyBreakpointElimination_v4() expects from its scorer
+ * argument: getMoveCount(), score(), operator()(lcbI), isValid(lcbI, move_score) and
+ * remove(lcbI, new_moves).  The adjacency list is stored by value, so the scorer works on its
+ * own copy of whatever is passed to the constructor.
+ */
+class SimpleBreakpointScorer
+{
+public:
+ SimpleBreakpointScorer( std::vector< LCB >& adjacencies, double breakpoint_penalty, bool collinear );
+
+ size_t getMoveCount(); // number of moves greedyBreakpointElimination_v4() will score up front
+
+ double score(); // score of the current state
+
+ bool isValid( size_t lcbI, double move_score ); // whether a move queued with move_score may still be applied
+
+ /** return the relative change in score if lcbI were to be removed */
+ double operator()( size_t lcbI );
+
+ /** call to indicate that the given LCB has been removed */
+ void remove( uint lcbI, std::vector< std::pair< double, size_t > >& new_moves );
+
+private:
+ std::vector< mems::LCB > adjs; // the scorer's own copy of the adjacency list
+ double bp_penalty;
+ std::vector< double > scores;
+ double total_weight;
+ size_t bp_count;
+ bool collinear;
+};
+
+/**
+ * A scorer for greedyBreakpointElimination_v4() that judges every LCB by how far its weight
+ * lies above or below a minimum weight.  The adjacency list is stored by value, so the scorer
+ * works on its own copy of whatever is passed to the constructor.
+ */
+class GreedyRemovalScorer
+{
+public:
+ GreedyRemovalScorer( std::vector< LCB >& adjacencies, double minimum_weight );
+
+ size_t getMoveCount(); // one move per entry of the adjacency list
+
+ double score(); // the running total of (weight - minimum_weight)
+
+ bool isValid( size_t lcbI, double move_score ); // true when lcbI still carries its own id and re-scores to exactly move_score
+
+ /** return the relative change in score if lcbI were to be removed */
+ double operator()( size_t lcbI );
+
+ /** call to indicate that the given LCB has been removed */
+ void remove( uint lcbI, std::vector< std::pair< double, size_t > >& new_moves );
+
+private:
+ std::vector< mems::LCB > adjs; // the scorer's own copy of the adjacency list
+ double min_weight; // the minimum_weight passed to the constructor
+ std::vector< double > scores; // one slot per LCB, handed to RemoveLCBandCoalesce() by remove()
+ double total_weight; // value returned by score()
+};
+
+
+
+
+/**
+ * Greedy breakpoint elimination driven by a pluggable scorer.
+ *
+ * Every move offered by bp_scorer is placed on a max-heap keyed by its score.  The best move
+ * is popped repeatedly; when the scorer still considers it valid, the LCB is removed from the
+ * scorer's state and from `adjacencies` (through RemoveLCBandCoalesce), and the moves the
+ * scorer re-scores as a consequence are pushed onto the heap.  Unless LCB_WEIGHT_LOSS_PLOT is
+ * defined, the search stops as soon as the best remaining move has a negative score.  In both
+ * configurations it also stops instead of removing an LCB whose weight equals the total
+ * weight that is left.
+ *
+ * @param adjacencies	LCB adjacency list, modified in place
+ * @param scores		per-LCB weights, indexed by LCB id
+ * @param bp_scorer		must provide getMoveCount(), score(), operator()(size_t),
+ *						isValid(size_t, double) and remove(size_t, new_moves)
+ * @param status_out	optional stream for progress output, may be NULL
+ * @param g1_tag		label echoed in the LCB_WEIGHT_LOSS_PLOT output only
+ * @param g2_tag		label echoed in the LCB_WEIGHT_LOSS_PLOT output only
+ * @return always 0
+ */
+template< class BreakpointScorerType >
+int64 greedyBreakpointElimination_v4( std::vector< mems::LCB >& adjacencies,
+    std::vector< double >& scores, BreakpointScorerType& bp_scorer, std::ostream* status_out,
+    size_t g1_tag, size_t g2_tag )
+{
+    // repeatedly remove the low weight LCBs until the minimum weight criteria is satisfied
+    uint lcb_count = adjacencies.size();
+    double total_initial_lcb_weight = 0;
+    for( size_t wI = 0; wI < scores.size(); wI++ )
+        total_initial_lcb_weight += scores[wI];
+    double total_current_lcb_weight = total_initial_lcb_weight;
+
+    if( adjacencies.size() == 0 )
+        return 0;	// nothing can be done
+
+    double prev_score = bp_scorer.score();
+    uint report_frequency = 10;
+    uint moves_made = 0;
+
+    // the buffer is twice as large as the initial heap so that new moves can be pushed
+    // without an immediate reallocation; only the first heap_end slots belong to the heap
+    size_t move_count = bp_scorer.getMoveCount();
+    std::vector< std::pair< double, size_t > > move_heap( move_count * 2 );
+    size_t heap_end = move_count;
+    for( size_t moveI = 0; moveI < move_count; ++moveI )
+    {
+        move_heap[moveI].first = bp_scorer(moveI);
+        move_heap[moveI].second = moveI;
+    }
+
+#ifdef LCB_WEIGHT_LOSS_PLOT
+    std::vector< double >::iterator min_iter = std::min_element(scores.begin(), scores.end());
+    double mins = *min_iter;
+    if( status_out != NULL )
+    {
+        (*status_out) << g1_tag << '\t' << g2_tag << '\t' << lcb_count << '\t' << 1 - (total_current_lcb_weight / total_initial_lcb_weight) << '\t' << mins << std::endl;
+    }
+#endif
+
+    // make a heap of moves ordered by score
+    // repeatedly:
+    // 1) pop the highest scoring move off the heap
+    // 2) attempt to apply the move
+    // 3) add any new moves to the heap
+    // 4) stop when the highest scoring move no longer increases the score
+    MoveScoreHeapComparator mshc;
+    // heapify the populated slots only: including the spare half of the buffer would move
+    // real moves out of [0, heap_end) and pull zero-score placeholder entries into it
+    std::make_heap( move_heap.begin(), move_heap.begin() + heap_end, mshc );
+    while( heap_end > 0 )
+    {
+        std::pop_heap( move_heap.begin(), move_heap.begin()+heap_end, mshc );
+        heap_end--;
+        std::pair< double, size_t > best_move = move_heap[ heap_end ];
+#ifdef LCB_WEIGHT_LOSS_PLOT
+        if( total_current_lcb_weight == scores[best_move.second] )
+            break;	// don't remove the last LCB
+#else
+        if( (best_move.first < 0 ) ||
+            total_current_lcb_weight == scores[best_move.second] )
+            break;	// can't improve score
+#endif
+
+        std::vector< std::pair< double, size_t > > new_moves;
+        // a move may have been invalidated by an earlier removal; stale entries are skipped
+        bool success = bp_scorer.isValid(best_move.second, best_move.first);
+        if( !success )
+            continue;
+        bp_scorer.remove(best_move.second, new_moves);
+
+
+        for( size_t newI = 0; newI < new_moves.size(); newI++ )
+        {
+            if( heap_end < move_heap.size() )
+            {
+                // a spare slot is available right behind the heap
+                heap_end++;
+                move_heap[heap_end-1] = new_moves[newI];
+                std::push_heap( move_heap.begin(), move_heap.begin()+heap_end, mshc );
+            }else{
+                // just push the rest on all at once
+                size_t prev_size = move_heap.size();
+                move_heap.insert( move_heap.end(), new_moves.begin()+newI, new_moves.end() );
+                for( size_t newdI = 0; newdI < new_moves.size()-newI; newdI++ )
+                    std::push_heap( move_heap.begin(), move_heap.begin()+prev_size+newdI+1, mshc );
+                heap_end = move_heap.size();
+                break;
+            }
+        }
+
+        // mirror the removal in the caller's adjacency list
+        total_current_lcb_weight -= scores[best_move.second];
+        std::vector< std::pair< uint, uint > > id_remaps;
+        std::vector< uint > impact_list;
+        lcb_count -= RemoveLCBandCoalesce( best_move.second, adjacencies[0].left_end.size(), adjacencies, scores, id_remaps, impact_list );
+#ifdef LCB_WEIGHT_LOSS_PLOT
+        mins = scores[best_move.second];
+        if( status_out != NULL )
+        {
+            (*status_out) << g1_tag << '\t' << g2_tag << '\t' << lcb_count << '\t' << 1 - (total_current_lcb_weight / total_initial_lcb_weight) << '\t' << mins << std::endl;
+        }
+#endif
+        double cur_score = bp_scorer.score();
+        prev_score = cur_score;
+        moves_made++;
+#ifndef LCB_WEIGHT_LOSS_PLOT
+        if( status_out != NULL && moves_made % report_frequency == 0 )
+            (*status_out) << "move: " << moves_made << " alignment score " << cur_score << std::endl;
+#endif
+    }
+
+    return 0;
+}
+
+extern bool debug_aligner;
+
+/**
+ * finds the best anchoring, returns the anchoring score
+ *
+ * All moves offered by the scorer are scored relative to the current score and placed on a
+ * max-heap.  The best move is popped repeatedly and applied when it is neither stale (its
+ * score no longer equals the latest score recorded for that move) nor rejected by the scorer.
+ * The moves the scorer re-scores after a removal are pushed onto the heap.  The search ends
+ * when the heap is empty or the best move has a negative score.  Progress is written to
+ * std::cout through printProgress().
+ *
+ * @param spbs	the scorer; must provide score(), validate(), getMoveCount(), getMaxNewMoveCount(),
+ *				operator()(move), isValid(move) and remove(move, new_moves, new_move_count)
+ * @return spbs.score() after the last move
+ */
+template< class SearchScorer >
+double greedySearch( SearchScorer& spbs )
+{
+    double prev_score = spbs.score();
+    uint report_frequency = 10;
+    uint moves_made = 0;
+    if( debug_aligner )
+        spbs.validate();
+    size_t move_count = spbs.getMoveCount();
+    // latest known score of every move, used to recognise stale heap entries
+    std::vector< double > current_moves( spbs.getMoveCount() );
+    // use double the size for the move heap to avoid an almost instant reallocation
+    // when a new move gets pushed onto the heap
+    size_t heap_end = spbs.getMoveCount();
+    std::vector< std::pair< double, size_t > > move_heap( spbs.getMoveCount() * 2 );
+    std::vector< std::pair< double, size_t > > new_moves( spbs.getMaxNewMoveCount() + 10 );
+    for( size_t moveI = 0; moveI < move_count; ++moveI )
+    {
+        std::pair< double, size_t > p( 0, moveI );
+        double scorediff = spbs(p) - prev_score;
+        p.first = scorediff;
+        move_heap[moveI] = p;
+        current_moves[moveI] = p.first;
+    }
+
+    if( debug_aligner )
+        spbs.validate();
+    // make a heap of moves ordered by score
+    // repeatedly:
+    // 1) pop the highest scoring move off the heap
+    // 2) attempt to apply the move
+    // 3) add any new moves to the heap
+    // 4) stop when the highest scoring move no longer increases the score
+    MoveScoreHeapComparator mshc;
+    std::make_heap( move_heap.begin(), move_heap.begin() + heap_end, mshc );
+    double successful = 0;
+    double invalids = 0;
+    int progress = 0;
+    int prev_progress = -1;
+    while( heap_end > 0 )
+    {
+        std::pop_heap( move_heap.begin(), move_heap.begin()+heap_end, mshc );
+        std::pair< double, size_t > best_move = move_heap[--heap_end];
+        if( best_move.first < 0 )
+            break;	// can't improve score
+
+        // the move was re-scored (or already applied) after this entry was queued
+        if( best_move.first != current_moves[best_move.second] )
+            continue;
+
+        if( !spbs.isValid(best_move) )
+        {
+            invalids++;
+            continue;
+        }
+
+        size_t new_move_count = 0;
+        bool success = spbs.remove(best_move, new_moves, new_move_count);
+        if( !success )
+        {
+            std::cerr << "numerical instability?  need to investigate this...\n";
+//			genome::breakHere();
+            invalids++;
+            continue;
+        }
+
+        successful++;
+        if( debug_aligner )
+            spbs.validate();
+
+        // an applied move can never match a queued entry again
+        current_moves[ best_move.second ] = -(std::numeric_limits<double>::max)();
+        for( size_t newI = 0; newI < new_move_count; newI++ )
+            current_moves[ new_moves[newI].second ] = new_moves[newI].first;
+
+        for( size_t newI = 0; newI < new_move_count; newI++ )
+        {
+            if( heap_end < move_heap.size() )
+            {
+                // a spare slot is available right behind the heap
+                heap_end++;
+                move_heap[heap_end-1] = new_moves[newI];
+                std::push_heap( move_heap.begin(), move_heap.begin()+heap_end, mshc );
+            }else{
+                // just push the rest on all at once.  The buffer must grow by at least the
+                // number of outstanding moves (growing by 1.6x when that is larger), and only
+                // the slots that were actually pushed may be counted as part of the heap
+                const size_t remaining = new_move_count - newI;
+                move_heap.resize( (std::max)((size_t)(heap_end * 1.6), heap_end + remaining) );
+                std::copy( new_moves.begin() + newI, new_moves.begin() + new_move_count, move_heap.begin()+heap_end );
+                for( size_t newdI = 0; newdI < remaining; newdI++ )
+                    std::push_heap( move_heap.begin(), move_heap.begin()+heap_end+newdI+1, mshc );
+                heap_end += remaining;
+                break;
+            }
+        }
+
+        moves_made++;
+        prev_progress = progress;
+        progress = (100 * moves_made) / move_count;
+        printProgress( prev_progress, progress, std::cout );
+//		if( moves_made % report_frequency == 0 )
+//			cout << "move: " << moves_made << " alignment score " << cur_score << " success ratio " << successful / invalids << endl;
+    }
+
+    return spbs.score();
+}
+/**
+ * Plain record of three values for progress reporting.  Nothing in this header reads or
+ * writes it; the meanings noted below are inferred from the member names only.
+ */
+struct AlnProgressTracker
+{
+ gnSeqI total_len; // presumably the total length to be processed -- TODO confirm against the code that uses it
+ gnSeqI cur_leftend; // presumably the left end reached so far -- TODO confirm
+ double prev_progress; // presumably the progress value reported last -- TODO confirm
+};
+
+
+} // namespace mems
+
+#endif // __GreedyBreakpointElimination_h__
+
diff --git a/libMems/HomologyHMM/algebras.cc b/libMems/HomologyHMM/algebras.cc
new file mode 100644
index 0000000..4996a8e
--- /dev/null
+++ b/libMems/HomologyHMM/algebras.cc
@@ -0,0 +1,52 @@
+/*
+ * This file is part of HMMoC 1.0, a hidden Markov model compiler.
+ * Copyright (C) 2006 by Gerton Lunter, Oxford University.
+ *
+ * HMMoC is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * HMMOC is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with HMMoC; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+\*/
+//
+// algebras.cc - extended real types
+//
+// Gerton Lunter, 27/8/04
+//
+//
+
+
+#include "algebras.h"
+
+
+BFMantissa *BFloat::aConversionLookup; // Actual location of the static members of BFloat class; the table it points to is allocated and filled by the _BFloatInitialize constructor in this file
+double *BFloat::aDoubleConversionLookup; // Second static lookup-table pointer of BFloat, set up by the same constructor
+
+
+_BFloatInitialize _dummyInitializer; // Global object whose constructor runs during static initialization of this file; it initializes aConversionLookup and aDoubleConversionLookup
+
+
+// Allocates the two BFloat conversion tables and fills them in:
+//   aConversionLookup[k]       holds 1.0 multiplied k times by cBFloatRangeInv
+//   aDoubleConversionLookup[k] holds exp( (k - cBFloatDoubleConvTableSize/2) * logcBFloatRange )
+_BFloatInitialize::_BFloatInitialize() {
+
+    BFloat::aConversionLookup = new BFMantissa[cBFloatConvTableSize];
+    BFloat::aDoubleConversionLookup = new double[cBFloatDoubleConvTableSize];
+
+    // successive powers of the inverse range, built by repeated multiplication
+    BFMantissa running_power = 1.0;
+    int slot = 0;
+    while (slot < cBFloatConvTableSize) {
+        BFloat::aConversionLookup[ slot ] = running_power;
+        running_power *= cBFloatRangeInv;
+        ++slot;
+    }
+
+    // exponentials centred on the middle of the table
+    slot = 0;
+    while (slot < cBFloatDoubleConvTableSize) {
+        BFloat::aDoubleConversionLookup[ slot ] = exp( (slot-cBFloatDoubleConvTableSize/2) * logcBFloatRange );
+        ++slot;
+    }
+
+}
diff --git a/libMems/HomologyHMM/algebras.h b/libMems/HomologyHMM/algebras.h
new file mode 100644
index 0000000..15c109d
--- /dev/null
+++ b/libMems/HomologyHMM/algebras.h
@@ -0,0 +1,558 @@
+/*
+ * This file is part of HMMoC 1.0, a hidden Markov model compiler.
+ * Copyright (C) 2006 by Gerton Lunter, Oxford University.
+ *
+ * HMMoC is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * HMMOC is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with HMMoC; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+\*/
+//
+// algebras.h - extended real types
+//
+// Gerton Lunter, 27/8/04
+//
+
+
+
+#ifndef _algebras_h_
+#define _algebras_h_
+
+
+#include <cstdlib>
+#include <cmath>
+#include <iostream>
+
+
+using namespace std;
+
+
+// typedefs
+
+typedef float BFMantissa;
+const BFMantissa cBFloatRange = 20282409603651670423947251286016.0; // 2.03e+31; 2^104
+const BFMantissa cBFloatRangeInv = 1.0/cBFloatRange;
+// Aaron E. Darling 6/7/7: need to typecast to avoid compiler warnings about imprecise FP representations
+const BFMantissa cBFloatRangeSqrt = (BFMantissa)1.0e+18; // Value between square root of the exponent, and the exponent
+const BFMantissa cBFloatRangeInvSqrt = (BFMantissa)1.0e-18; // Square of this should still be representable, with full mantissa!
+const BFMantissa logcBFloatRange = log(cBFloatRange);
+const int cBFloatDigits = 7; // Number of significant digits for printing (7 for floats, 16 for doubles?)
+const int cBFloatInfinity = 1000000000; // Tiniest number representable is cBFloatRangeInv ^ BFloatInfinity
+const int cBFloatConvTableSize = 100; // This includes many zero entries, it makes additions a bit faster
+const int cBFloatDoubleConvTableSize = 50; // Table size for bfloat -> double conversion; cBFloatRange^(-size/2) is double 0
+//#define BFLOAT_CHECK_UOFLOW // Don't bother with under- and overflow checking.
+
+
+//
+// BFloats: more buoyant floats.
+//
+// struct{ float + int } is 8 bytes; nice size makes noticable speed difference
+//
+class BFloat {
+ public:
+  static BFMantissa* aConversionLookup;       // powers of cBFloatRangeInv, read by the sum/compare helpers
+  static double* aDoubleConversionLookup;     // per-exponent scale factors, read by Value()
+  BFMantissa f;                               // mantissa
+  int e;                                      // exponent, counted in powers of cBFloatRange
+  BFloat(BFMantissa iF, int iE) : f(iF), e(iE) {}
+  BFloat() {}                                 // leaves f and e uninitialised
+  ~BFloat() {}
+  // Converts to double: 0.0 at or below -size/2, a table lookup inside the table, exp() above it.
+  inline double Value() const {
+    const int iHalf = cBFloatDoubleConvTableSize/2;
+    if (e <= -iHalf)
+      return 0.0;
+    if (e < iHalf)
+      return (double)f * aDoubleConversionLookup[ e + iHalf ];
+    return (double)f * exp((double)e * logcBFloatRange);
+  }
+  // Sets the value to zero (mantissa 0, exponent -cBFloatInfinity).
+  void clear() { f=0; e=-cBFloatInfinity; }
+};
+
+
+//
+// dummy class to initialise BFloat lookup table
+//
+
+class _BFloatInitialize { // constructing one instance sets up BFloat's static lookup tables
+public:
+ _BFloatInitialize(); // defined in algebras.cc: allocates and fills both tables
+};
+
+
+
+
+//
+// Finally, all implementations of BFloat calculations
+//
+
+
+// Normalization of BFloat result of a single operation
+#ifdef BFLOAT_CHECK_UOFLOW
+static inline void BFloatNormalise(BFloat& a)
+  // Range-checked variant, compiled only when BFLOAT_CHECK_UOFLOW is defined.
+{
+  // Step 1: pull the mantissa back towards 1 by at most one power of cBFloatRange.
+  if (a.f > cBFloatRangeSqrt) {
+    a.f *= cBFloatRangeInv;
+    a.e++;
+  } else if (a.f == 0.0) {
+    a.e = -cBFloatInfinity;         // an exact zero gets the most negative exponent
+  } else if (a.f < cBFloatRangeInvSqrt) {
+    a.f *= cBFloatRange;
+    a.e--;
+  }
+  // Step 2: clamp the exponent to +/-cBFloatInfinity and report the event on cerr.
+  if (a.e > cBFloatInfinity) {
+    cerr << "BFloat: Overflow" << endl;
+    a.e = cBFloatInfinity;
+  } else if (a.e < -cBFloatInfinity) {
+    cerr << "BFloat: Underflow" << endl;
+    a.e = -cBFloatInfinity;
+    a.f = 0.0;
+  }
+}
+#else
+static inline void BFloatNormDown(BFloat& a) { // shrink the mantissa by one power of cBFloatRange
+ a.f *= cBFloatRangeInv;
+ a.e++; // one step up in the exponent compensates for the scaled-down mantissa
+}
+static inline void BFloatNormUp(BFloat& a) { // grow the mantissa by one power of cBFloatRange
+ if (a.f == 0.0) {
+ a.e = -cBFloatInfinity; // zero cannot be rescaled: give it the same exponent clear() uses
+ } else {
+ a.f *= cBFloatRange;
+ a.e--; // one step down in the exponent compensates for the scaled-up mantissa
+ }
+}
+static inline void BFloatNormalise(BFloat& a)
+ // Unchecked variant: applies at most one rescaling step and reports nothing.
+{
+ if (a.f > cBFloatRangeSqrt) {
+ BFloatNormDown(a);
+ } else if (a.f < cBFloatRangeInvSqrt) {
+ BFloatNormUp(a);
+ }
+};
+#endif
+
+static inline void DoubleNormalise(double& f, int& e)
+{
+  // Normalises a (mantissa, exponent) pair held in a double and an int, in place.
+  // Non-positive input collapses to zero; a strictly negative value is reported on cerr first.
+  if (f <= 0.0) {
+    if (f != 0.0) cerr << "BFloat: Negative number: " << f << endl;
+    f = 0.0;
+    e=-cBFloatInfinity;
+    return;
+  }
+  // Unlike BFloatNormalise this loops, so any number of rescaling steps may be applied.
+  for (; f > cBFloatRangeSqrt; e++)
+    f *= cBFloatRangeInv;
+
+  for (; f < cBFloatRangeInvSqrt; e--)
+    f *= cBFloatRange;
+}
+
+// Natural logarithm of a BFloat, evaluated in double (int * float and log(float) would both round to float)
+static inline double bfloat_doublelog( const BFloat& a ) { return (double)a.e*log((double)cBFloatRange)+log((double)a.f); }
+
+// BFloat exp of a double, using double arithmetic throughout; exponents beyond +/-cBFloatInfinity saturate
+static inline BFloat bfloat_doubleexp( double iA ) {
+  const double dLogRange = log((double)cBFloatRange), dE = floor( iA / dLogRange );
+  if (dE <= -cBFloatInfinity) return BFloat( 0.0, -cBFloatInfinity );  // too small: zero, avoids an out-of-range int cast
+  if (dE >= cBFloatInfinity) return BFloat( 1.0, cBFloatInfinity );    // too large: pin at the top exponent
+  BFloat iX( exp(iA - dE * dLogRange), (int)dE );
+  BFloatNormalise( iX );
+  return iX;
+}
+
+// Returns the value as a double by delegating to BFloat::Value(), which decides what happens when it does not fit.
+static inline double bfloat2double( const BFloat bfloat) { return bfloat.Value(); }
+
+// Naive double-to-BFloat conversion by repeated rescaling - can be slow if the mantissa is a float
+static inline BFloat double2bfloat( double prob) {
+  if (prob <= 0.0) {
+    if (prob != 0.0)
+      cerr << "BFloat: Negative number: " << prob << endl;
+    return BFloat (0.0, -cBFloatInfinity );
+  } else {
+    BFloat a( 0.0, 0 );   // plain local: the 'register' specifier no longer exists in C++17
+    while (prob > cBFloatRangeSqrt) {
+      prob *= cBFloatRangeInv;
+      a.e++;
+    }
+    // prob cannot be zero here (prob <= 0 was handled above), so no zero check is needed.
+    // NOTE(review): an infinite prob stays infinite under the multiplication above, so the
+    // first loop would not terminate for it - confirm callers never pass +inf.
+    while ((prob < cBFloatRangeInvSqrt)) {
+      prob *= cBFloatRange;
+      a.e--;
+    }
+    a.f = prob;
+    return a;
+  }
+}
+
+static inline BFloat bfloat_pr_product (const BFloat& a, const BFloat& b)
+{ // Product: multiply the mantissas, add the exponents, renormalise once.
+  BFloat sf(a.f*b.f,a.e+b.e);   // was declared 'register', a specifier removed in C++17
+  BFloatNormalise(sf);
+  return sf;
+}
+
+static inline BFloat bfloat_pr_double_product (const BFloat& a, double b)
+{ // Product with a plain double: the intermediate stays a double and is normalised as such.
+  double mantisse = a.f*b;      // was declared 'register', a specifier removed in C++17
+  int exponent = a.e;
+  DoubleNormalise(mantisse, exponent);
+  return BFloat(mantisse, exponent);
+}
+
+static inline void bfloat_pr_product_accum( BFloat& a, const BFloat& b) { // in-place a *= b
+ a.f *= b.f; a.e += b.e; // mantissas multiply, exponents add
+ BFloatNormalise( a );
+}
+
+static inline void bfloat_pr_double_product_accum (BFloat& a, double b)
+{ // In-place a *= b for a plain double b; DoubleNormalise updates a.e directly.
+  double mantisse = a.f*b;      // was declared 'register', a specifier removed in C++17
+  DoubleNormalise(mantisse, a.e);
+  a.f = mantisse;
+}
+
+static inline BFloat bfloat_pr_quotient( const BFloat& a, const BFloat& b)
+{ // Quotient: divide the mantissas, subtract the exponents, renormalise once.
+  BFloat sf(a.f/b.f, a.e-b.e);  // was declared 'register', a specifier removed in C++17
+  BFloatNormalise(sf);
+  return sf;
+}
+
+static inline void bfloat_pr_quotient_accum( BFloat& a, const BFloat& b)
+{ // in-place a /= b
+ a.f /= b.f; // mantissas divide; b.f is not checked for zero here
+ a.e -= b.e; // exponents subtract
+ BFloatNormalise( a );
+}
+
+static inline BFloat bfloat_pr_sum(const BFloat& a, const BFloat& b)
+{
+  // Sum of two BFloats. The operand with the larger exponent keeps its scale; the other is
+  // scaled by the tabulated power of cBFloatRangeInv, or ignored when beyond the table.
+  if (a.e > b.e) {
+    if (a.e >= b.e + cBFloatConvTableSize)
+      return a;
+    return BFloat( a.f + b.f * BFloat::aConversionLookup[ a.e - b.e ], a.e );
+  }
+  // Here b.e >= a.e, so b supplies the scale of the result.
+  if (a.e <= b.e - cBFloatConvTableSize)
+    return b;
+  return BFloat( b.f + a.f * BFloat::aConversionLookup[ b.e - a.e ], b.e );
+}
+
+static inline void bfloat_pr_sum_accum( BFloat& a, const BFloat& b)
+{
+  if (a.e >= b.e) {   // in-place a += b; here a already carries the result's exponent
+    if (a.e < b.e + cBFloatConvTableSize)
+      a.f += b.f * BFloat::aConversionLookup[ a.e - b.e ];
+    return;
+  }
+  if (a.e <= b.e - cBFloatConvTableSize) {
+    a = b;            // a lies beyond the table relative to b: the sum is just b
+    return;
+  }
+  a.f = b.f + a.f * BFloat::aConversionLookup[ b.e - a.e ];
+  a.e = b.e;
+}
+
+static inline bool bfloat_less( const BFloat& a, const BFloat& b)
+{
+  // Strict a < b. Operands are compared on the scale of the larger exponent; once the
+  // exponents differ by cBFloatConvTableSize or more, the larger exponent decides alone.
+  if (a.e > b.e) {
+    // a has the larger exponent: a < b needs the gap to be inside the table first.
+    return (a.e < b.e + cBFloatConvTableSize)
+        && (a.f < b.f * BFloat::aConversionLookup[ a.e - b.e ]);
+  }
+  // b has the larger (or an equal) exponent.
+  return (a.e <= b.e - cBFloatConvTableSize)
+      || (a.f * BFloat::aConversionLookup[ b.e - a.e ] < b.f);
+}
+
+static inline bool bfloat_equal( const BFloat& a, const BFloat& b)
+{
+  // Exact equality after bringing both operands to the scale of the larger exponent.
+  // Exponents that differ by cBFloatConvTableSize or more never compare equal.
+  if (a.e > b.e) {
+    // a has the larger exponent: scale b down to it.
+    return (a.e < b.e + cBFloatConvTableSize)
+        && (a.f == b.f * BFloat::aConversionLookup[ a.e - b.e ]);
+  }
+  // b has the larger (or an equal) exponent: scale a down to it.
+  return (a.e > b.e - cBFloatConvTableSize)
+      && (a.f * BFloat::aConversionLookup[ b.e - a.e ] == b.f);
+}
+
+static inline bool bfloat_lessequal( const BFloat& a, const BFloat& b)
+{
+  // a <= b. Operands are compared on the scale of the larger exponent; once the
+  // exponents differ by cBFloatConvTableSize or more, the larger exponent decides alone.
+  if (a.e > b.e) {
+    // a has the larger exponent: a <= b needs the gap to be inside the table first.
+    return (a.e < b.e + cBFloatConvTableSize)
+        && (a.f <= b.f * BFloat::aConversionLookup[ a.e - b.e ]);
+  }
+  // b has the larger (or an equal) exponent.
+  return (a.e <= b.e - cBFloatConvTableSize)
+      || (a.f * BFloat::aConversionLookup[ b.e - a.e ] <= b.f);
+}
+
+static inline ostream& bfloat_print( ostream& out, const BFloat& x )
+{
+  // Prints x as "<mantissa>e<exponent>" in base 10 with cBFloatDigits significant digits.
+  static const double log10 = log(10.0);
+  static const double maxmantisse = 10.0 * (1.0 - 0.55 * exp(-cBFloatDigits * log10));
+  out.precision( cBFloatDigits );
+  if (x.e == cBFloatInfinity) {
+    out << 1.0 << "e+Inf";
+  } else if (x.e == -cBFloatInfinity) {   // 'else' added: +Inf used to fall through and print a second value
+    out << 1.0 << "e-Inf";
+  } else {
+    // Decimal logarithm of the value, split into an integer exponent and a mantissa in [1,10).
+    double iM = (log(x.f) + log(cBFloatRange)*(double)x.e) / log10;
+    long iExp = long(floor(iM));
+    iM = exp((iM - iExp) * log10);
+    if (iM > maxmantisse) {       // mantissa would print as 10 at this precision: carry into the exponent
+      iExp += 1;
+      iM = 1.0;
+    }
+    out << iM << ( iExp<0 ? "e" : "e+" ) << iExp;
+  }
+  // The stream is left at precision 6, not at whatever precision the caller had set before.
+  out.precision( 6 );
+  return out;
+}
+
+
+
+//
+// Wrapper to allow BFloats to be used by Algebra template
+//
+struct BFloatMethods // static forwarding layer: one entry per operation the Algebra template calls
+{
+ typedef BFloat Value;
+ static inline double to_prob (BFloat iX) { return bfloat2double(iX); }
+ static inline BFloat from_prob (double iP) { return double2bfloat(iP); }
+ static inline BFloat pmul( BFloat iX, BFloat iY) { return bfloat_pr_product(iX,iY); }
+ static inline BFloat pmuldouble( BFloat iX, double iY) { return bfloat_pr_double_product(iX,iY); }
+ static inline BFloat pdiv( BFloat iX, BFloat iY) { return bfloat_pr_quotient(iX,iY); }
+ static inline BFloat psum( BFloat iX, BFloat iY) { return bfloat_pr_sum(iX,iY); }
+ static inline BFloat pdiff( BFloat iX, BFloat iY) { cerr << "Bfloat pdiff: Not implemented." << endl; return BFloat(0,0); } // stub: only logs, returns mantissa 0 / exponent 0
+ static inline BFloat doubleexp( double iX) { return bfloat_doubleexp(iX); }
+ static inline double doublelog( BFloat iX) { return bfloat_doublelog(iX); }
+ static inline void pmulacc( BFloat& iX, BFloat iY) { bfloat_pr_product_accum(iX,iY); }
+ static inline void pmulaccdouble( BFloat& iX, double iY) { bfloat_pr_double_product_accum(iX,iY); }
+ static inline void pdivacc( BFloat& iX, BFloat iY) { bfloat_pr_quotient_accum(iX,iY); }
+ static inline void psumacc( BFloat& iX, BFloat iY) { bfloat_pr_sum_accum(iX,iY); }
+ static inline void pdiffacc( BFloat& iX, BFloat iY) { cerr << "Bfloat pdiffacc: Not implemented." << endl; } // stub: only logs, iX is left unchanged
+ static inline bool less( BFloat iX, BFloat iY) { return bfloat_less(iX,iY); }
+ static inline bool equal( BFloat iX, BFloat iY) { return bfloat_equal(iX,iY); }
+ static inline bool lessequal( BFloat iX, BFloat iY) { return bfloat_lessequal(iX,iY); }
+ static inline ostream& print( ostream& iOut, BFloat iX ) { return bfloat_print( iOut, iX ); }
+};
+
+
+
+
+//
+// Simple log-space numbers - don't use, except possibly for Viterbi
+//
+class Logspace { // thin wrapper around one double holding a value in log space
+ double x;
+ public:
+ Logspace( double x ) : x(x) {}
+ Logspace() {} // leaves x uninitialised
+ operator double&(){ return x; } // exposes the stored double by reference, so arithmetic acts on it directly
+ void clear() {x=-1.0e+300;} // very negative log value used as the cleared state
+};
+
+inline Logspace logspace_addsmall( Logspace iX, Logspace iY ) { // log(exp(iX)+exp(iY)); logspace_add passes the larger value as iX
+ if (iX - iY > 36.7) return iX; // presumably the gap beyond which exp(iY-iX) no longer changes 1.0 in a double - TODO confirm
+ return iX + log(1.0+exp(iY-iX));
+}
+
+inline Logspace logspace_add( Logspace iX, Logspace iY ) { // orders the operands, then defers to logspace_addsmall
+ if (iX>iY) return logspace_addsmall(iX,iY); else return logspace_addsmall(iY,iX);
+}
+
+struct LogspaceMethods // static operation table for Algebra: products become sums of logs
+{
+ typedef Logspace Value;
+ static inline double to_prob (Value iX) { return exp(iX); }
+ static inline Value from_prob (double iP) { return Value(log(iP)); }
+ static inline Value pmul( Value iX, Value iY) { return iX+iY; }
+ static inline Value pmuldouble( Value iX, double iY) { return iX+log(iY); }
+ static inline Value pdiv( Value iX, Value iY) { return iX-iY; }
+ static inline Value psum( Value iX, Value iY) { return logspace_add(iX,iY); }
+ static inline Value pdiff( Value iX, Value iY) { cerr << "Logspace pdiff: Not implemented." << endl; return 0.0; } // stub: only logs, returns log value 0.0
+ static inline Value doubleexp( double iX) { return iX; }
+ static inline double doublelog( Value iX) { return iX; }
+ static inline void pmulacc( Value& iX, Value iY) { iX+=iY; }
+ static inline void pmulaccdouble( Value& iX, double iY) { iX+=log(iY); }
+ static inline void pdivacc( Value& iX, Value iY) { iX -= iY; }
+ static inline void psumacc( Value& iX, Value iY) { iX = logspace_add(iX,iY); }
+ static inline void pdiffacc( Value& iX, Value iY) { cerr << "Logspace pdiffacc: Not implemented." << endl; } // stub: only logs, iX is left unchanged
+ static inline bool less( Value iX, Value iY) { return iX<iY; }
+ static inline bool equal( Value iX, Value iY) { return iX==iY; }
+ static inline bool lessequal( Value iX, Value iY) { return iX<=iY; }
+ static inline ostream& print( ostream& iOut, Value iX ) { return bfloat_print( iOut, bfloat_doubleexp(iX) ); } // printed via the BFloat formatter
+};
+
+
+
+//
+// Algebra - Wrapper for overloading all arithmetic operators, to use different 'algebras'.
+//
+// Gerton Lunter, 19/3/03
+// Based on logprob.h by by Ian Holmes.
+//
+
+
+template <class AlgebraMethods>
+class Algebra {
+public:
+  // typedef: the underlying number representation supplied by AlgebraMethods
+  typedef typename AlgebraMethods::Value Value;
+
+  // value
+  Value val;
+
+public:
+  // constructors
+  Algebra() { }                                  // no initialisation, for speed
+  Algebra (double px) : val(from_prob(px)) { }
+  Algebra (const Algebra& lx) : val(lx.val) { }
+  Algebra (const BFloat v) : val(v) { }
+
+  // fast initialization
+  void clear() { val.clear(); }
+
+  // assignment operators
+  inline Algebra& operator= (const Algebra& lx) { val = lx.val; return *this; }
+  inline Algebra& operator= (double px) { val = from_prob(px); return *this; }
+
+  // arithmetic operators; all combinations of Algebra and double are covered
+  inline friend Algebra operator+ (const Algebra& lx, const Algebra& ly) { return from_log (psum (lx.val, ly.val)); }
+  inline friend Algebra operator+ (const Algebra& lx, double py) { return from_log (psum (lx.val, from_prob(py))); }
+  inline friend Algebra operator+ (double px, const Algebra& ly) { return from_log (psum (from_prob(px), ly.val)); }
+  inline Algebra& operator+= (const Algebra& lx) { psumacc (val, lx.val); return *this; }
+  inline Algebra& operator+= (double px) { psumacc (val, from_prob(px)); return *this; }
+
+  inline friend Algebra operator- (const Algebra& lx, const Algebra& ly) { return from_log (pdiff (lx.val, ly.val)); }
+  inline friend Algebra operator- (const Algebra& lx, double py) { return from_log (pdiff (lx.val, from_prob(py))); }
+  inline friend Algebra operator- (double px, const Algebra& ly) { return from_log (pdiff (from_prob(px), ly.val)); }
+  inline Algebra& operator-= (const Algebra& lx) { pdiffacc (val, lx.val); return *this; }
+  inline Algebra& operator-= (double px) { pdiffacc (val, from_prob(px)); return *this; }
+
+  inline friend Algebra operator* (const Algebra& lx, const Algebra& ly) { return from_log (pmul (lx.val, ly.val)); }
+  inline friend Algebra operator* (const Algebra& lx, double py) { return from_log (pmuldouble (lx.val, py)); }
+  inline friend Algebra operator* (double px, const Algebra& ly) { return from_log (pmuldouble (ly.val, px)); }
+  inline Algebra& operator*= (const Algebra& lx) { pmulacc (val, lx.val); return *this; }
+  inline Algebra& operator*= (double px) { pmulaccdouble (val, px); return *this; }
+
+  inline friend Algebra operator/ (const Algebra& lx, const Algebra& ly) { return from_log (pdiv (lx.val, ly.val)); }
+  inline friend Algebra operator/ (const Algebra& lx, double py) { return from_log (pdiv (lx.val, from_prob(py))); }
+  inline friend Algebra operator/ (double px, const Algebra& ly) { return from_log (pdiv (from_prob(px), ly.val)); }
+  inline Algebra& operator/= (const Algebra& lx) { pdivacc (val, lx.val); return *this; }
+  inline Algebra& operator/= (double px) { pdivacc (val, from_prob(px)); return *this; }
+
+  // miscellaneous operators (exp unwraps px.val and rewraps the result; the bare forms could not be converted)
+  inline friend double log( const Algebra& lx ) { return doublelog( lx.val ); }
+  inline friend Algebra exp( const Algebra& px ) { return from_log( doubleexp( to_prob(px.val) ) ); }
+
+  // increment & decremement
+  Algebra& operator++() { *this += 1.; return *this; }
+  Algebra operator++(int) { Algebra tmp (*this); ++(*this); return tmp; }
+
+  Algebra& operator--() { *this -= 1.; return *this; }
+  Algebra operator--(int) { Algebra tmp (*this); --(*this); return tmp; }
+
+  // relational operators
+  inline friend int operator== (const Algebra& lx, const Algebra& ly) { return equal(lx.val, ly.val); }
+  inline friend int operator== (const Algebra& lx, const double py) { return equal(lx.val, from_prob(py)); }
+  inline friend int operator== (const double px, const Algebra& ly) { return equal(from_prob(px), ly.val); }
+
+  inline friend int operator!= (const Algebra& lx, const Algebra& ly) { return !equal(lx.val, ly.val); }
+  inline friend int operator!= (const Algebra& lx, const double py) { return !equal(lx.val, from_prob(py)); }
+  inline friend int operator!= (const double px, const Algebra& ly) { return !equal(from_prob(px), ly.val); }
+
+  inline friend int operator< (const Algebra& lx, const Algebra& ly) { return less(lx.val, ly.val); }
+  inline friend int operator< (const Algebra& lx, const double py) { return less(lx.val, from_prob(py)); }
+  inline friend int operator< (const double px, const Algebra& ly) { return less(from_prob(px), ly.val); }
+
+  inline friend int operator> (const Algebra& lx, const Algebra& ly) { return less(ly.val, lx.val); }
+  inline friend int operator> (const Algebra& lx, const double py) { return less(from_prob(py), lx.val); }
+  inline friend int operator> (const double px, const Algebra& ly) { return less(ly.val, from_prob(px)); }
+
+  inline friend int operator<= (const Algebra& lx, const Algebra& ly) { return lessequal(lx.val, ly.val); }
+  inline friend int operator<= (const Algebra& lx, const double py) { return lessequal( lx.val, from_prob(py) ); }
+  inline friend int operator<= (const double px, const Algebra& ly) { return lessequal( from_prob(px), ly.val); }
+
+  inline friend int operator>= (const Algebra& lx, const Algebra& ly) { return lessequal( ly.val, lx.val); }
+  inline friend int operator>= (const Algebra& lx, const double py) { return lessequal( from_prob(py), lx.val ); }
+  inline friend int operator>= (const double px, const Algebra& ly) { return lessequal( ly.val, from_prob(px) ); }
+
+  // stream operators; extraction reads a plain probability and leaves lx untouched if the read fails
+  inline friend ostream& operator<< (ostream& out, const Algebra& lx) { return AlgebraMethods::print(out, lx.val); }
+  inline friend istream& operator>> (istream& in, Algebra& lx) { double px; if (in >> px) lx.val = from_prob(px); return in; }
+
+  // cast operators
+  inline double prob() const { return to_prob (val); }
+  inline operator double() const { return to_prob (val); }
+
+private:
+  // private AlgebraMethods method wrappers
+  static inline double to_prob (Value X) { return AlgebraMethods::to_prob (X); }
+  static inline Value from_prob (double P) { return AlgebraMethods::from_prob (P); }
+  static inline Value pmul (Value X, Value Y) { return AlgebraMethods::pmul (X, Y); }
+  static inline Value pmuldouble (Value X, double Y) { return AlgebraMethods::pmuldouble (X, Y); }
+  static inline Value pdiv (Value X, Value Y) { return AlgebraMethods::pdiv( X, Y); }
+  static inline Value psum (Value X, Value Y) { return AlgebraMethods::psum (X, Y); }
+  static inline Value pdiff (Value X, Value Y) { return AlgebraMethods::pdiff (X, Y); }
+  static inline Value doubleexp (double X) { return AlgebraMethods::doubleexp( X ); }
+  static inline double doublelog (Value X) { return AlgebraMethods::doublelog( X ); }
+  static inline void pmulacc (Value& X, Value Y) { AlgebraMethods::pmulacc (X, Y); }
+  static inline void pmulaccdouble (Value& X, double Y) { AlgebraMethods::pmulaccdouble (X, Y); }
+  static inline void pdivacc( Value& X, Value Y) { AlgebraMethods::pdivacc( X, Y); }
+  static inline void psumacc (Value& X, Value Y) { AlgebraMethods::psumacc (X, Y); }
+  static inline void pdiffacc (Value& X, Value Y) { AlgebraMethods::pdiffacc (X, Y); }
+  static inline bool less (Value X, Value Y ) { return AlgebraMethods::less( X, Y ); }
+  static inline bool equal (Value X, Value Y ) { return AlgebraMethods::equal( X, Y ); }
+  static inline bool lessequal( Value X, Value Y ) { return AlgebraMethods::lessequal( X, Y ); }
+
+public:
+  // static constructor from a value already in the underlying representation
+  static inline Algebra from_log (Value X) { Algebra lx; lx.val = X; return lx; }
+};
+
+
+
+//
+// and bfloats are the things that we'll use:
+//
+
+#define bfloat Algebra<BFloatMethods>
+
+#define logspace Algebra<LogspaceMethods>
+
+#endif
diff --git a/libMems/HomologyHMM/dptables.h b/libMems/HomologyHMM/dptables.h
new file mode 100644
index 0000000..90c9c9a
--- /dev/null
+++ b/libMems/HomologyHMM/dptables.h
@@ -0,0 +1,387 @@
+/*
+ * This file is part of HMMoC 1.0, a hidden Markov model compiler.
+ * Copyright (C) 2006 by Gerton Lunter, Oxford University.
+ *
+ * HMMoC is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * HMMOC is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with HMMoC; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+\*/
+/*
+ *
+ * DPTable.h -- ordinary and sparse dynamic programming tables
+ *
+ * Gerton Lunter, 1 Oct 2006
+ *
+ * Modified for GCC 4.0.2 and Microsoft Visual Studio 2005 by Aaron Darling, 2007
+ *
+ */
+
+#ifndef __dptable_h_
+#define __dptable_h_
+
+
+#include <map>
+#include <cassert>
+
+
+#ifdef __GNUC__
+ #define HAVE_HASH_MAP
+ #if __GNUC__ < 3
+ #include <hash_map.h>
+ namespace Sgi { using ::hash_map; }; // inherit globals
+ #else
+ #include <ext/hash_map>
+ #if __GNUC_MINOR__ + __GNUC__ == 3
+ namespace Sgi = std; // GCC 3.0
+ #else
+ namespace Sgi = ::__gnu_cxx; // GCC 3.1 and later
+ #endif
+ #endif
+#else // ...there are other compilers, right?
+#ifdef _MSC_VER
+// visual studio 2005 has no hash map. older versions did.
+#else
+// default for all other compilers
+#define HAVE_HASH_MAP
+namespace Sgi = std;
+#endif
+#endif
+
+
+using std::map;
+#ifdef HAVE_HASH_MAP
+using Sgi::hash_map;
+#endif
+
+// Define aliases for two maps: red-black trees, and hashes
+// (GNU C++ does not define a hash function for long long int, so we have to define our own)
+
+template<class key> // the key parameter is not used by the body; every key is taken as long long
+struct _hash {
+ size_t operator()(long long x) const { return x; } // identity hash: the integer key is its own hash value
+};
+
+// typedefs can't be templated
+
+template<class key, class value> // two-parameter alias for std::map, selectable through _mymap
+class treemap : public map<key,value> {};
+
+#ifdef HAVE_HASH_MAP
+template<class key, class value> // two-parameter alias for hash_map with _hash as the hash functor
+class hashmap : public hash_map<key,value,_hash<key> > {};
+#endif
+
+// select one of the maps (the hash map is faster and appears to use less memory)
+
+#ifdef HAVE_HASH_MAP
+#define _mymap hashmap
+#else
+#define _mymap treemap
+#endif
+
+
+
+// States are stored in a self-initializing array
+
+template<class Real,int size> class States;
+
+template<int size>
+class States<double,size> {
+private:
+  double data[size];                                                    // one plain double per state
+public:
+  enum { length = size };                                               // element count as a compile-time constant
+  States() { for (double* p = data; p != data + size; ++p) *p = 0; }    // zero-fill on construction
+  operator double* () { return data; }                                  // use the object as a raw array
+  operator const double* () const { return data; }
+};
+
+template<class Real, int size>
+class States {
+private:
+  Real data[size];                                                      // one Real per state
+public:
+  enum { length = size };                                               // element count as a compile-time constant
+  States() { for (Real* p = data; p != data + size; ++p) p->clear(); }  // each element resets itself via clear()
+  operator Real* () { return data; }                                    // use the object as a raw array
+  operator const Real* () const { return data; }
+};
+
+
+
+// Define index types to serve as keys to the DP table position
+
+template<int dim> class _index {}; // primary template is empty: only dimensions 1 to 4 provide a key type t
+template<> class _index<1> { public: typedef unsigned int t; };
+template<> class _index<2> { public: typedef unsigned long t; }; // NOTE(review): unsigned long is 32 bits on some platforms - confirm it is wide enough there
+template<> class _index<3> { public: typedef unsigned long long t; };
+template<> class _index<4> { public: typedef unsigned long long t; };
+
+
+//
+// Base classes for a dynamic programming table
+//
+// DP tables provide the following methods:
+//
+// const States& read(...) : read access to state array
+// States& write(...) : write access to state array
+// void written() : signal that write access is finished
+// void allocate(...) : inform table about its dimensions; must be called before read/write access
+// void clear() : empties the table; keeps its dimensions
+// void clear(int) : empties one column of a (folded) DP table
+// void absolve() : ensures that table does not delete its data; another table with reference to the data,
+// which is created by the default copy constructor, is now responsible. Not allowed for
+// folded tables
+//
+
+template<class States>
+class _DPT { // base class, keeps track of the responsibility for the data
+ private:
+ void clear(int i) {} // placeholder, to allow dummy definition for non-folded tables
+ protected:
+ bool isInCharge; // true if this class' destructor destroys the data
+ public:
+ typedef States states_type; // lets users of a table name its state-array type
+ _DPT() : isInCharge(true) {} // a freshly constructed table owns its data
+ void absolve() { isInCharge=false; } // take away the responsibility of destroying the data
+ void written() {} // signal that we're done writing -- used by extensions
+};
+
+template<template<typename,int> class DPTable, class States, int dim> // Wrapper for memory-efficient Fw/Bw/Baum-Welch
+class _FoldedTable : public _DPT<States> {
+ protected:
+ DPTable<States,dim-1>* aTables[2]; // two tables of one dimension less; derived classes select one by index parity
+ public:
+ _FoldedTable() { aTables[0] = new DPTable<States,dim-1>(); aTables[1] = new DPTable<States,dim-1>(); }
+ ~_FoldedTable() { assert(_DPT<States>::isInCharge); delete aTables[0]; delete aTables[1]; } // do not allow data to be retained
+ void clear(int i) { aTables[i%2]->clear(); } // empties the slice that index i maps to; i is assumed non-negative - TODO confirm
+};
+
+template<class States, int dim> // dense table; real implementations are the per-dimension specializations
+class DPTable {};
+
+template<class States, int dim> // map-backed table; real implementations are the per-dimension specializations
+class SparseDPTable {};
+
+template<template<typename,int> class DPTable, class States, int dim> // two-slice table; see the specializations
+class FoldedTable {};
+
+
+
+// Explicit partial specializations for up to 4 spatial dimensions
+
+
+template<template<typename,int> class DPTable, class States>
+class FoldedTable<DPTable, States, 0> : public _FoldedTable<DPTable, States, 1> { // zero dimensions: nothing to fold, only aTables[0] is used
+ public:
+ void allocate() { this->aTables[0]->allocate(); };
+ const States& read() const { return this->aTables[0]->read(); }
+ States& write() { return this->aTables[0]->write(); }
+ void written() { this->aTables[0]->written(); }
+};
+
+
+template<template<typename,int> class DPTable, class States>
+class FoldedTable<DPTable, States, 1> : public _FoldedTable<DPTable, States, 1> {
+  int z;                                  // slice (0 or 1) selected by the most recent write()
+ public: FoldedTable() : z(0) {}          // written() before any write() now targets slice 0 instead of an indeterminate index
+  void allocate(int a) { this->aTables[0]->allocate(); this->aTables[1]->allocate(); };   // the folded extent a is not stored
+  const States& read(int a) const { return this->aTables[a%2]->read(); }                  // slice chosen by parity of a
+  States& write(int a) { return this->aTables[z=a%2]->write(); }                          // remembers the slice for written()
+  void written() { this->aTables[z]->written(); }
+};
+
+
+template<template<typename,int> class DPTable, class States>
+class FoldedTable<DPTable, States, 2> : public _FoldedTable<DPTable, States, 2> {
+  int z;                                  // slice (0 or 1) selected by the most recent write()
+ public: FoldedTable() : z(0) {}          // written() before any write() now targets slice 0 instead of an indeterminate index
+  void allocate(int a, int b) { this->aTables[0]->allocate(a); this->aTables[1]->allocate(a); };   // the folded extent b is not stored
+  const States& read(int a, int b) const { return this->aTables[b%2]->read(a); }                   // slice chosen by parity of b
+  States& write(int a, int b) { return this->aTables[z=b%2]->write(a); }                           // remembers the slice for written()
+  void written() { this->aTables[z]->written(); }
+};
+
+
+template<template<typename,int> class DPTable, class States>
+class FoldedTable<DPTable, States, 3> : public _FoldedTable<DPTable, States, 3> {
+  int z;                                  // slice (0 or 1) selected by the most recent write()
+ public: FoldedTable() : z(0) {}          // written() before any write() now targets slice 0 instead of an indeterminate index
+  void allocate(int a, int b, int c) { this->aTables[0]->allocate(a,b); this->aTables[1]->allocate(a,b); };   // the folded extent c is not stored
+  const States& read(int a, int b, int c) const { return this->aTables[c%2]->read(a,b); }                     // slice chosen by parity of c
+  States& write(int a, int b, int c) { return this->aTables[z=c%2]->write(a,b); }                             // remembers the slice for written()
+  void written() { this->aTables[z]->written(); }
+};
+
+
+template<template<typename,int> class DPTable, class States>
+class FoldedTable<DPTable, States, 4> : public _FoldedTable<DPTable, States, 4> {
+  int z;                                  // slice (0 or 1) selected by the most recent write()
+ public: FoldedTable() : z(0) {}          // written() before any write() now targets slice 0 instead of an indeterminate index
+  void allocate(int a, int b, int c, int d) { this->aTables[0]->allocate(a,b,c); this->aTables[1]->allocate(a,b,c); };   // the folded extent d is not stored
+  const States& read(int a, int b, int c, int d) const { return this->aTables[d%2]->read(a,b,c); }                       // slice chosen by parity of d
+  States& write(int a, int b, int c, int d) { return this->aTables[z=d%2]->write(a,b,c); }                               // remembers the slice for written()
+  void written() { this->aTables[z]->written(); }
+};
+
+
+template<class States>
+class DPTable<States,0> : public _DPT<States> { // zero-dimensional dense table: a single States object
+private:
+ States *pTable; // null until allocate() is called
+public:
+ DPTable() { pTable = 0; };
+ ~DPTable() { if (pTable && _DPT<States>::isInCharge) delete pTable; }; // frees only while this table still owns the data
+ void allocate() { pTable = new States(); }; // overwrites pTable without freeing a previous allocation
+ void clear() { delete pTable; allocate(); }; // replaces the cell with a freshly constructed one
+ const States& read() const { return *pTable; }
+ States& write() { return *pTable; }
+};
+
+
+template<class States>
+class DPTable<States,1> : public _DPT<States> { // one-dimensional dense table: an array of States
+private:
+ States *pTable; // null until allocate() is called
+ int maxa; // extent given to allocate(); not set before the first allocate()
+public:
+ DPTable() { pTable = 0; }
+ ~DPTable() { if (pTable && _DPT<States>::isInCharge ) { delete[] pTable; } } // frees only while this table still owns the data
+ void allocate(int a) { maxa = a; pTable = new States[a]; } // overwrites pTable without freeing a previous allocation
+ void clear() { delete[] pTable; allocate(maxa); }; // re-creates the array, so every cell is default-constructed again
+ const States& read(int a) const { return pTable[a]; } // no bounds check
+ States& write(int a) { return pTable[a]; } // no bounds check
+};
+
+
+template<class States>
+class DPTable<States,2> : public _DPT<States> {   // two-dimensional dense table, laid out with index a running fastest
+  States *pTable;                                 // null until allocate() is called
+  int maxa, maxb;                                 // extents given to allocate()
+public:
+  DPTable() { pTable = 0; }
+  ~DPTable() { if (pTable && _DPT<States>::isInCharge ) { delete[] pTable; } }
+  // Cell counts and offsets are computed in size_t: the product a*b overflows an int for large tables.
+  void allocate(int a, int b) { maxa = a; maxb = b; pTable = new States[size_t(a)*size_t(b)]; }
+  void clear() { delete[] pTable; allocate(maxa,maxb); };
+  const States& read(int a, int b) const { return pTable[size_t(a)+size_t(maxa)*size_t(b)]; }
+  States& write(int a, int b) { return pTable[size_t(a)+size_t(maxa)*size_t(b)]; }
+};
+
+
+template<class States>
+class DPTable<States,3> : public _DPT<States> {   // three-dimensional dense table, laid out with index a running fastest
+  States *pTable;                                 // null until allocate() is called
+  int maxa, maxb, maxc;                           // extents given to allocate()
+public:
+  DPTable() { pTable = 0; }
+  ~DPTable() { if (pTable && _DPT<States>::isInCharge ) { delete[] pTable; } }
+  // Cell counts and offsets are computed in size_t: the product a*b*c overflows an int for large tables.
+  void allocate(int a, int b, int c) { maxa = a; maxb = b; maxc = c; pTable = new States[size_t(a)*size_t(b)*size_t(c)]; }
+  void clear() { delete[] pTable; allocate(maxa,maxb,maxc); };
+  const States& read(int a, int b, int c) const { return pTable[size_t(a)+size_t(maxa)*(size_t(b)+size_t(maxb)*size_t(c))]; }
+  States& write(int a, int b, int c) { return pTable[size_t(a)+size_t(maxa)*(size_t(b)+size_t(maxb)*size_t(c))]; }
+};
+
+
+template<class States> // with zero dimensions there is a single cell, so the dense table is reused unchanged
+class SparseDPTable<States,0> : public DPTable<States,0> {};
+
+
+template<class States>
+class SparseDPTable<States,1> : public _DPT<States> {
+private:
+  typedef _index<1>::t idx;
+  _mymap<idx,States> &table;        // heap-allocated map, held by reference
+  const States zero;                // default-constructed cell returned for absent positions
+public:
+  SparseDPTable() : table(*new _mymap<_index<1>::t,States>), zero() {};
+  ~SparseDPTable() { if (_DPT<States>::isInCharge) delete &table; };
+  void allocate(int a) {};
+  void clear() { table.clear(); }
+  States& write(int a) { return table[idx(a)]; }
+  const States& read(int a) const {
+    // Looks the position up without inserting it (the unused iterator of an unrelated map type is gone).
+    typename _mymap<idx,States>::const_iterator iter = table.find(idx(a));
+    if (iter == table.end()) return zero;
+    return iter->second;
+  }
+};
+
+
+template<class States>
+class SparseDPTable<States,2> : public _DPT<States> {
+private:
+  typedef _index<2>::t idx;
+  _mymap<idx,States> &table;        // heap-allocated map, held by reference
+  idx maxa;                         // first extent, used to flatten (a,b) into one key
+  const States zero;                // default-constructed cell returned for absent positions
+public:
+  SparseDPTable() : table(*new _mymap<idx,States>), zero() {};
+  ~SparseDPTable() { if (_DPT<States>::isInCharge) delete &table; };
+  void allocate(int a, int b) { maxa = a; };
+  void clear() { table.clear(); }
+  States& write(int a, int b) { return table[idx(a)+maxa*idx(b)]; }
+  const States& read(int a, int b) const {   // key built with idx(), exactly as in write()
+    typename _mymap<idx,States>::const_iterator iter = table.find(idx(a)+maxa*idx(b));
+    if (iter == table.end()) return zero;
+    return iter->second;
+  }
+};
+
+
+template<class States>
+class SparseDPTable<States,3> : public _DPT<States> {
+  // Map-backed three-dimensional table: only positions that were written occupy memory.
+private:
+  typedef _index<3>::t idx;
+  _mymap<idx,States> &table;        // heap-allocated map, held by reference
+  idx maxa, maxb;                   // first two extents, used to flatten (a,b,c) into one key
+  const States zero;                // default-constructed cell returned for absent positions
+public:
+  SparseDPTable() : table(*new _mymap<idx,States>), zero() {};
+  ~SparseDPTable() { if (_DPT<States>::isInCharge) delete &table; };
+  void allocate(int a, int b, int c) { maxa = a; maxb = b; };
+  void clear() { table.clear(); }
+  States& write(int a, int b, int c) { return table[idx(a)+maxa*(idx(b)+maxb*idx(c))]; }
+  const States& read(int a, int b, int c) const {   // key built with idx(), exactly as in write()
+    typename _mymap<idx,States>::const_iterator iter = table.find(idx(a)+maxa*(idx(b)+maxb*idx(c)));
+    if (iter == table.end()) return zero;
+    return iter->second;
+  }
+};
+
+
+template<class States>
+class SparseDPTable<States,4> : public _DPT<States> { // map-backed four-dimensional table
+private:
+ typedef _index<4>::t idx;
+ _mymap<idx,States> &table; // heap-allocated map, held by reference
+ idx maxa, maxb, maxc; // first three extents, used to flatten (a,b,c,d) into one key
+ const States zero; // default-constructed cell returned for absent positions
+public:
+ SparseDPTable() : table(*new _mymap<idx,States>), zero() {};
+ ~SparseDPTable() { if (_DPT<States>::isInCharge) delete &table; }; // frees the map only while this table still owns it
+ void allocate(int a, int b, int c, int d) { maxa=a; maxb=b; maxc=c; } // only records extents; the last extent d is not needed for the key
+ void clear() { table.clear(); }
+ States& write(int a, int b, int c, int d) { return table[unsigned(a)+maxa*(unsigned(b)+maxb*(unsigned(c)+maxc*unsigned(d)))]; } // inserts a default cell if the key is absent
+ const States& read(int a, int b, int c, int d) const { // same key formula as write(), but never inserts
+ typename _mymap<idx,States>::const_iterator iter = table.find(unsigned(a)+maxa*(unsigned(b)+maxb*(unsigned(c)+maxc*unsigned(d))));
+ if (iter == table.end()) return zero;
+ return iter->second;
+ }
+};
+
+
+#endif
+
diff --git a/libMems/HomologyHMM/homology.cc b/libMems/HomologyHMM/homology.cc
new file mode 100644
index 0000000..55d10be
--- /dev/null
+++ b/libMems/HomologyHMM/homology.cc
@@ -0,0 +1,786 @@
+/* Code generated by HMMoC version VERSION, Copyright (C) 2006 Gerton Lunter */
+/* Generated from file homology.xml (author: Aaron Darling) on Mon Jul 16 11:09:12 EST 2007 */
+
+/*
+This file is a work based on HMMoC VERSION, a hidden Markov model compiler.
+Copyright (C) 2006 by Gerton Lunter, Oxford University.
+
+HMMoC and works based on it are free software; you can redistribute
+it and/or modify it under the terms of the GNU General Public
+License as published by the Free Software Foundation; either
+version 2 of the License, or (at your option) any later version.
+
+HMMOC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with HMMoC; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+
+#include "homology.h"
+
+
+// Declarations of the identifier tables and their element counts.  The
+// definitions appear further down in this file; declaring them `extern`
+// here lets the member functions below refer to them before that point.
+const extern string _HomologystateId[];
+const extern string _HomologyemissionId[];
+const extern string _HomologytransitionId[];
+const extern string _HomologytransF[];
+const extern string _HomologytransT[];
+const extern string _HomologytransP[];
+const extern string _HomologytransE[];
+const extern string _HomologyoutputId[];
+const extern string _Homologyempty;
+const extern int _HomologystateNum;
+const extern int _HomologyemitNum;
+const extern int _HomologytransNum;
+const extern int _HomologyoutputNum;
+
+// Builds a table set for a sequence of iLen symbols.  The new object owns
+// its DP arrays (isInCharge == true) and its identifier pointers refer to
+// the file-level _Homology* tables.  The per-position block is allocated
+// with iLen entries; the start and end blocks take no size argument.
+HomologyDPTable::HomologyDPTable(int iLen)
+  : isInCharge(true),
+    stateId(_HomologystateId),
+    emissionId(_HomologyemissionId),
+    transitionId(_HomologytransitionId),
+    transitionFrom(_HomologytransF),
+    transitionTo(_HomologytransT),
+    transitionProb(_HomologytransP),
+    transitionEmit(_HomologytransE),
+    outputId(_HomologyoutputId),
+    iLen(iLen)
+{
+  StateMemoryblock2.allocate(iLen);
+  StateMemoryblock1.allocate();
+  StateMemoryblock3.allocate();
+}
+
+
+// When this object is not in charge of the DP arrays, every block is told
+// to absolve() before the member destructors run, so that the data does
+// not get deleted.  An object that is in charge needs no extra work here.
+HomologyDPTable::~HomologyDPTable() {
+  if (isInCharge)
+    return;
+  StateMemoryblock2.absolve();
+  StateMemoryblock1.absolve();
+  StateMemoryblock3.absolve();
+}
+
+// Name of transition `id`, or the shared empty string when id is outside [0, _HomologytransNum).
+const string& HomologyDPTable::getTransitionId(int id) {
+  if (id < 0 || id >= _HomologytransNum)
+    return _Homologyempty;
+  return _HomologytransitionId[id];
+}
+// Name of emission `id`, or the shared empty string when id is outside [0, _HomologyemitNum).
+const string& HomologyDPTable::getEmissionId(int id) {
+  if (id < 0 || id >= _HomologyemitNum)
+    return _Homologyempty;
+  return _HomologyemissionId[id];
+}
+// Name of state `id`, or the shared empty string when id is outside [0, _HomologystateNum).
+const string& HomologyDPTable::getStateId(int id) {
+  if (id < 0 || id >= _HomologystateNum)
+    return _Homologyempty;
+  return _HomologystateId[id];
+}
+// Name of output `id`, or the shared empty string when id is outside [0, _HomologyoutputNum).
+const string& HomologyDPTable::getOutputId(int id) {
+  if (id < 0 || id >= _HomologyoutputNum)
+    return _Homologyempty;
+  return _HomologyoutputId[id];
+}
+// Translates a string identifier (state, emission, transition or output
+// name) into its integer index within the corresponding _Homology* table.
+//
+// The lookup map is built lazily on first use.  All four identifier kinds
+// share one map, so if the same string occurred in more than one table the
+// index from the table inserted last would win.
+//
+// Returns -1 when the identifier is unknown (a warning is printed to cout).
+// The special identifier "_cleanup_" releases the lookup map and also
+// returns -1.  After a cleanup the static state is reset, so a later call
+// rebuilds the map instead of dereferencing the deleted pointer.
+int HomologyDPTable::getId(const string& sId)
+{
+  static bool bInit = false;
+  static map<string,int>* pmId = 0;
+  if (!bInit) {
+    pmId = new map<string,int>();
+    for (int i=0;i<_HomologystateNum;i++) {
+      (*pmId)[_HomologystateId[i]] = i;        // add state identifiers
+    }
+    for (int i=0; i<_HomologyemitNum; i++) {
+      (*pmId)[_HomologyemissionId[i]] = i;     // add emission identifiers
+    }
+    for (int i=0; i<_HomologytransNum; i++) {
+      (*pmId)[_HomologytransitionId[i]] = i;   // add transition identifiers
+    }
+    for (int i=0; i<_HomologyoutputNum; i++) {
+      (*pmId)[_HomologyoutputId[i]] = i;       // finally, add output identifiers
+    }
+    bInit = true;
+  }
+  map<string,int>::iterator iter = pmId->find(sId);
+  if (iter == pmId->end()) {
+    if (sId == "_cleanup_") {
+      delete pmId;
+      // Forget the freed map so the next call re-initialises it.
+      pmId = 0;
+      bInit = false;
+    } else {
+      cout << "HomologyDPTable::getId: WARNING: identifier '" << sId << "' not found." << endl;
+    }
+    return -1;
+  }
+  return iter->second;
+}
+
+
+// Convenience overload: resolves the state name through getId() and
+// forwards to the integer-state overload for the same position.
+bfloat HomologyDPTable::getProb(const string sState ,int iPos0) const
+{
+  const int iState = getId(sState);
+  return getProb(iState, iPos0);
+}
+
+
+// Returns the DP value stored for state iState at sequence position iPos0.
+//
+// blockTable maps a state index to the memory block holding it (0 = start
+// block, 1 = per-position block, 2 = end block) and stateTable gives the
+// slot of that state inside its block.  Each block only covers certain
+// positions: the start block position 0, the per-position block positions
+// 1..iLen, and the end block position iLen.  Anything else yields 0.0.
+//
+// A state index outside the tables (for example the -1 that getId()
+// returns for an unknown name) also yields 0.0 instead of indexing the
+// lookup tables out of bounds.
+bfloat HomologyDPTable::getProb(int iState ,int iPos0) const
+{
+  const bfloat *CurStateMemoryblock1Secondary;
+  const bfloat *CurStateMemoryblock2Secondary;
+  const bfloat *CurStateMemoryblock3Secondary;
+  static const int blockTable[] = {0, 1, 1, 2};
+  static const int stateTable[] = {0, 0, 1, 0};
+  static const int tableSize = int(sizeof(blockTable)/sizeof(blockTable[0]));
+  if (iState < 0 || iState >= tableSize) {
+    return 0.0;
+  }
+  switch (blockTable[iState]) {
+  default:
+    break;
+  case 0:
+    if ((iPos0+0>=0)&&(iPos0+0<=0)) {
+      CurStateMemoryblock1Secondary = this->StateMemoryblock1.read();
+      return CurStateMemoryblock1Secondary[stateTable[iState]];
+    }
+    break;
+  case 1:
+    if ((iPos0+0>=1)&&(iPos0+0<=iLen+0)) {
+      CurStateMemoryblock2Secondary = this->StateMemoryblock2.read((iPos0-(0))-(1));
+      return CurStateMemoryblock2Secondary[stateTable[iState]];
+    }
+    break;
+  case 2:
+    if ((iPos0+0>=iLen+0)&&(iPos0+0<=iLen+0)) {
+      CurStateMemoryblock3Secondary = this->StateMemoryblock3.read();
+      return CurStateMemoryblock3Secondary[stateTable[iState]];
+    }
+    break;
+  } // switch
+  // Position not covered by the state's block.
+  return 0.0;
+} // DPTable...::getProb(int,...)
+
+// Looks strId up in the shared identifier map and returns the index of its
+// Baum-Welch counter.  Unknown identifiers print a warning to cout and
+// return -1.
+int HomologyBaumWelch::transitionIndex(string strId) const {
+  const map<const string,int>::const_iterator hit = mId.find(strId);
+  if (hit != mId.end())
+    return hit->second;
+  cout << "HomologyBaumWelch::transitionIndex: WARNING: identifier '" << strId << "' not found." << endl;
+  return -1;
+}
+
+
+// Looks strId up in the shared identifier map and returns the index of its
+// Baum-Welch counter.  Unknown identifiers print a warning to cout and
+// return -1.
+int HomologyBaumWelch::emissionIndex(string strId) const {
+  const map<const string,int>::const_iterator hit = mId.find(strId);
+  if (hit != mId.end())
+    return hit->second;
+  cout << "HomologyBaumWelch::emissionIndex: WARNING: identifier '" << strId << "' not found." << endl;
+  return -1;
+}
+
+
+// Zeroes every Baum-Welch counter of this object.
+//
+// On the very first call (tracked by a function-local static flag shared by
+// all instances) it also fills the static lookup tables: the identifier
+// arrays, the id -> counter-index arrays and the name -> counter-index map.
+void HomologyBaumWelch::resetCounts() {
+  static bool bInited = false;
+  if (!bInited) {
+    // Transitions: all eight share one counter array.
+    static const int transIds[] = {0, 1, 2, 3, 4, 5, 6, 7};
+    for (int k=0; k<8; ++k) {
+      transitionIdentifier0[k] = transIds[k];
+      atransitionIdx[transIds[k]] = k;
+      mId[_HomologytransitionId[transIds[k]]] = k;
+    }
+    // Emission with id 1 goes into counter array 0.
+    static const int emit0Ids[] = {1};
+    for (int k=0; k<1; ++k) {
+      emissionIdentifier0[k] = emit0Ids[k];
+      aemissionIdx[emit0Ids[k]] = k;
+      mId[_HomologyemissionId[emit0Ids[k]]] = k;
+    }
+    // Emissions with ids 0 and 2 go into counter array 1.
+    static const int emit1Ids[] = {0, 2};
+    for (int k=0; k<2; ++k) {
+      emissionIdentifier1[k] = emit1Ids[k];
+      aemissionIdx[emit1Ids[k]] = k;
+      mId[_HomologyemissionId[emit1Ids[k]]] = k;
+    }
+    bInited = true;
+  }
+  for (int k=0; k<8; ++k) {
+    transitionBaumWelchCount0[k] = 0.0;
+  }
+  for (int k=0; k<1; ++k) {
+    emissionBaumWelchCount0[k] = 0.0;
+  }
+  for (int k=0; k<2; ++k) {
+    for (int sym=0; sym<8; ++sym) {
+      emissionBaumWelchCount1[sym][k] = 0.0;
+    }
+  }
+}
+
+
+// Out-of-class definitions of the static identifier arrays; their bounds
+// come from the declarations inside class HomologyBaumWelch.
+int HomologyBaumWelch::transitionIdentifier0[];
+int HomologyBaumWelch::emissionIdentifier0[];
+int HomologyBaumWelch::emissionIdentifier1[];
+
+// Multiplies every transition and emission counter of this object by `scale`.
+void HomologyBaumWelch::scaleCounts(bfloat scale) {
+  for (int k=0; k<8; ++k) {
+    transitionBaumWelchCount0[k] *= scale;
+  }
+  for (int k=0; k<1; ++k) {
+    emissionBaumWelchCount0[k] *= scale;
+  }
+  for (int k=0; k<2; ++k) {
+    for (int sym=0; sym<8; ++sym) {
+      emissionBaumWelchCount1[sym][k] *= scale;
+    }
+  }
+}
+
+
+// Out-of-class definitions of the static lookup structures filled by
+// resetCounts(): the name -> counter-index map and the two
+// id -> counter-index arrays (bounds come from the class declaration).
+map<const string,int> HomologyBaumWelch::mId;
+int HomologyBaumWelch::atransitionIdx[];
+int HomologyBaumWelch::aemissionIdx[];
+
+// Returns i when i compares greater than j, otherwise j.
+bfloat hmmocMax(bfloat i, bfloat j) {
+  if (i>j)
+    return i;
+  return j;
+}
+// Overwrites i with j when i compares less than j; otherwise leaves i untouched.
+void hmmocMaxInPlace(bfloat& i, bfloat j) {
+  if (i<j) {
+    i = j;
+  }
+}
+// Prints one line per edge of the path, in the form
+//   <fromState>--{<e0>,<e1>,...};<prob>--><toState>
+// where the braces hold the comma-separated emission vector of that edge.
+ostream& operator<<(ostream& os, const Path& p)
+{
+  for (unsigned int edge=0; edge<p.size(); ++edge) {
+    os << p.fromState(edge) << "--{";
+    const vector<int>& emitted = p.emission(edge);
+    for (unsigned int k=0; k<emitted.size(); ++k) {
+      if (k != 0) {
+        os << ",";
+      }
+      os << emitted[k];
+    }
+    os << "};" << p.prob(edge) << "-->" << p.toState(edge) << endl;
+  }
+  return os;
+}
+
+// Appends one edge to the path: transition id `tr`, its probability `p`,
+// a copy of the emission vector `e`, and the from/to state indices `f`/`t`.
+// The five parallel vectors therefore always have the same length.
+void SimplePath::addEdge(int tr, double p, vector<int>& e, int f, int t) {
+ transitions.push_back(tr);
+ probs.push_back(p);
+ emissions.push_back(e);
+ froms.push_back(f);
+ tos.push_back(t);
+}
+
+// Reverses the order of the edges by reversing each of the five parallel
+// vectors.  The from/to values of an edge are not swapped with each other.
+void SimplePath::reverse()
+{
+ std::reverse(transitions.begin(),transitions.end());
+ std::reverse(probs.begin(),probs.end());
+ std::reverse(emissions.begin(),emissions.end());
+ std::reverse(froms.begin(),froms.end());
+ std::reverse(tos.begin(),tos.end());
+}
+
+// Probability recorded for edge i; the index is not range-checked.
+double SimplePath::prob(int i) const {
+ return probs[i];
+}
+
+// Returns i+1 while that is still a valid edge index, otherwise -1.
+// NOTE(review): the comments on Path::nextFrom/nextTo in homology.h say the
+// opposite for simple paths (nextFrom always -1, nextTo always i+1), so
+// either those comments or these two implementations look swapped; confirm
+// against the callers before relying on either.
+int SimplePath::nextFrom(int i) const {
+  const int following = i+1;
+  return (following < (int)transitions.size()) ? following : -1;
+}
+
+// Always returns -1, whatever the index.
+// NOTE(review): the comment on Path::nextTo in homology.h says simple paths
+// return i+1 here - confirm which of the two is intended.
+int SimplePath::nextTo(int i) const {
+ return -1;
+}
+
+// Emission vector recorded for edge i; the index is not range-checked.
+const vector<int>& SimplePath::emission(int i) const {
+ return emissions[i];
+}
+
+// Source state recorded for edge i; the index is not range-checked.
+int SimplePath::fromState(int i) const {
+ return froms[i];
+}
+
+// Destination state recorded for edge i; the index is not range-checked.
+int SimplePath::toState(int i) const {
+ return tos[i];
+}
+
+// Identifier tables for the Homology HMM.  An identifier's integer id is
+// its position in the corresponding array.
+const string _HomologystateId[] = {"start","homologous","unrelated","end"};
+const string _HomologyemissionId[] = {"emitHomologous","empty","emitUnrelated"};
+const string _HomologytransitionId[] = {"id$13","id$14","id$15","id$16","id$17","id$18","id$19","id$20"};
+// The next four arrays are parallel to _HomologytransitionId: for transition
+// k they name its source state, destination state, probability and emission.
+const string _HomologytransF[] = {"start","start","homologous","homologous","unrelated","unrelated","homologous","unrelated"};
+const string _HomologytransT[] = {"homologous","unrelated","homologous","unrelated","unrelated","homologous","end","end"};
+const string _HomologytransP[] = {"startHomologous","startUnrelated","stayHomologous","goUnrelated","stayUnrelated","goHomologous","goStopFromHomologous","goStopFromUnrelated"};
+const string _HomologytransE[] = {"emitHomologous","emitUnrelated","emitHomologous","emitUnrelated","emitUnrelated","emitHomologous","empty","empty"};
+const string _HomologyoutputId[] = {"sequence"};
+// Returned by the get*Id() accessors for out-of-range ids.
+const string _Homologyempty = "";
+// Element counts of the state, emission, transition and output tables above.
+const int _HomologystateNum = 4;
+const int _HomologyemitNum = 3;
+const int _HomologytransNum = 8;
+const int _HomologyoutputNum = 1;
+
+
+
+
+/*
+ * Forward sweep over the Homology HMM.
+ *
+ * Parameters:
+ *   ppOutTable  receives a heap-allocated copy of the filled DP table
+ *   iPar        transition and emission parameters
+ *   aSeq        sequence of site-pattern symbols; each symbol is turned into
+ *               an index with (symbol - '1') and used on the 8-entry emission
+ *               arrays without a range check
+ *   iLen        number of symbols in aSeq
+ *
+ * Returns the value left in the end-state cell after the sweep.
+ *
+ * Layout used below: block 1 holds the start state, block 2 holds per
+ * position [0] = homologous and [1] = unrelated, block 3 holds the end
+ * state.  iTransition[k] follows the order of the _HomologytransP table.
+ *
+ * The `register` storage class of the generated code was dropped: it is
+ * not allowed from C++17 on and had no effect on these arrays.
+ */
+bfloat Forward(HomologyDPTable** ppOutTable,Params iPar,char *aSeq,int iLen) {
+ double iTransition[8];
+ bfloat *CurStateMemoryblock2To;
+ const bfloat *CurStateMemoryblock1From;
+ const bfloat *CurStateMemoryblock2From;
+ bfloat *CurStateMemoryblock3To;
+ const bfloat *CurStateMemoryblock3From;
+ int iPrevSlowCoord;
+ int iSymbol[1];
+ if (false && iSymbol[0] == iSymbol[0]) {} // avoid 'unused variable' warnings
+ double iEmission[2];
+ /* temporary storage for ordinary reals */
+ double iTempResult[1];
+ /* temporary storage for extended-exponent reals */
+ bfloat iTempProb[1];
+ HomologyDPTable dp(iLen);
+ // Transition probabilities; the "stay" probabilities are the complement
+ // of the two ways of leaving the state.
+ iTransition[0] = iPar.iStartHomologous;
+
+ iTransition[1] = 1.0 - iPar.iStartHomologous;
+
+ iTransition[2] = 1.0 - iPar.iGoUnrelated - iPar.iGoStopFromHomologous;
+
+ iTransition[3] = iPar.iGoUnrelated;
+
+ iTransition[4] = 1.0 - iPar.iGoHomologous - iPar.iGoStopFromUnrelated;
+
+ iTransition[5] = iPar.iGoHomologous;
+
+ iTransition[6] = iPar.iGoStopFromHomologous;
+
+ iTransition[7] = iPar.iGoStopFromUnrelated;
+ // The start state has probability 1 before any symbol is read.
+ dp.StateMemoryblock1.write()[0] = 1.0;
+ dp.StateMemoryblock1.written();
+ iPrevSlowCoord = -1;
+ for (int iPos0=0; iPos0<iLen+1; ++iPos0) {
+ if ((iPos0+0<=0)) {
+ }
+ if ((iPos0+0>=1)) {
+ // Position iPos0 corresponds to having consumed symbol aSeq[iPos0-1].
+ if ((iPos0+-1>=0)) {
+ iSymbol[0] = aSeq[iPos0+-1];
+ }
+ else {
+ iSymbol[0] = '1' /* dummy value */;
+
+ }
+ CurStateMemoryblock2To = dp.StateMemoryblock2.write((iPos0-(0))-(1));
+ // In this function iEmission[0] is the unrelated emission and
+ // iEmission[1] the homologous one.
+ iTempResult[0] = iPar.aEmitUnrelated[ iSymbol[0] - '1' ];
+ iEmission[0] = iTempResult[0];
+ iTempResult[0] = iPar.aEmitHomologous[ iSymbol[0] - '1' ];
+ iEmission[1] = iTempResult[0];
+ if ((iPos0+-1<=0)) {
+ // First symbol: only the transitions out of the start state apply.
+ CurStateMemoryblock1From = dp.StateMemoryblock1.read();
+ CurStateMemoryblock2To[1] = ((iTransition[1])*(iEmission[0]))*CurStateMemoryblock1From[0];
+ CurStateMemoryblock2To[0] = ((iTransition[0])*(iEmission[1]))*CurStateMemoryblock1From[0];
+ }
+ if ((iPos0+-1>=1)) {
+ // Later symbols: accumulate from both states of the previous position.
+ // NOTE(review): these are pure `+=` updates, so the cell handed out by
+ // write() is presumably zero-initialised - confirm in dptables.h.
+ CurStateMemoryblock2From = dp.StateMemoryblock2.read((iPos0-(1))-(1));
+ CurStateMemoryblock2To[1] += ((iTransition[4])*(iEmission[0]))*CurStateMemoryblock2From[1];
+ CurStateMemoryblock2To[1] += ((iTransition[3])*(iEmission[0]))*CurStateMemoryblock2From[0];
+ CurStateMemoryblock2To[0] += ((iTransition[5])*(iEmission[1]))*CurStateMemoryblock2From[1];
+ CurStateMemoryblock2To[0] += ((iTransition[2])*(iEmission[1]))*CurStateMemoryblock2From[0];
+ }
+ dp.StateMemoryblock2.written();
+ }
+ if ((iPos0+0>=iLen+0)) {
+ // Last position: the silent transitions into the end state.
+ // NOTE(review): when iLen is 0 the inner branch is skipped and the end
+ // cell is never assigned here.
+ CurStateMemoryblock3To = dp.StateMemoryblock3.write();
+ iEmission[0] = 1.0;
+ if ((iPos0+0>=1)) {
+ CurStateMemoryblock2From = dp.StateMemoryblock2.read((iPos0-(0))-(1));
+ CurStateMemoryblock3To[0] = ((iTransition[7])*(iEmission[0]))*CurStateMemoryblock2From[1];
+ CurStateMemoryblock3To[0] += ((iTransition[6])*(iEmission[0]))*CurStateMemoryblock2From[0];
+ }
+ dp.StateMemoryblock3.written();
+ }
+ iPrevSlowCoord = iPos0;
+ }
+ iPrevSlowCoord = -1;
+ {
+ int iPos0=iLen+0;
+ if (iPos0==iPos0) {} // avoid 'unused variable' warnings
+ CurStateMemoryblock3From = dp.StateMemoryblock3.read();
+ iTempProb[0] = CurStateMemoryblock3From[0];
+ }
+ // Hand a copy of the table to the caller; the local object gives up
+ // ownership so that its destructor leaves the arrays alone.
+ *ppOutTable = new HomologyDPTable(dp);
+ // make sure tables don't get deleted
+ dp.isInCharge = false;
+ return iTempProb[0];
+};
+
+
+
+
+
+/*
+ * Backward sweep over the Homology HMM, accumulating Baum-Welch counts.
+ *
+ * Parameters:
+ *   bw          counter object that receives the expected counts
+ *   pInTable    table filled by a previous forward sweep; read only through
+ *               a local copy that never takes ownership
+ *   ppOutTable  receives a heap-allocated copy of the backward table
+ *   iPar        transition and emission parameters
+ *   aSeq        sequence of site-pattern symbols; (symbol - '1') indexes
+ *               the 8-entry emission arrays without a range check
+ *   iLen        number of symbols in aSeq
+ *
+ * Returns the value left in the start-state cell after the sweep.
+ *
+ * The counters in bw are multiplied by the forward total before the sweep
+ * and by its reciprocal afterwards, so contributions added during the sweep
+ * end up divided by that total while earlier contents keep their scale.
+ *
+ * The `register` storage class of the generated code was dropped: it is
+ * not allowed from C++17 on and had no effect on these arrays.
+ */
+bfloat Backward(HomologyBaumWelch& bw,HomologyDPTable* pInTable,HomologyDPTable** ppOutTable,Params iPar,char *aSeq,int iLen) {
+ const bfloat *CurStateMemoryblock3Secondary;
+ double iTransition[8];
+ bfloat *CurStateMemoryblock2To;
+ const bfloat *CurStateMemoryblock2Secondary;
+ const bfloat *CurStateMemoryblock2From;
+ unsigned char alphaSymbolsitepatterns[8] = {'1', '2', '3', '4', '5', '6', '7', '8'};
+ unsigned char alphaIndexsitepatterns[256];
+ const bfloat *CurStateMemoryblock3From;
+ bfloat *CurStateMemoryblock1To;
+ const bfloat *CurStateMemoryblock1Secondary;
+ const bfloat *CurStateMemoryblock1From;
+ int iPrevSlowCoord;
+ int iSymbol[1];
+ if (false && iSymbol[0] == iSymbol[0]) {} // avoid 'unused variable' warnings
+ double iEmission[2];
+ /* temporary storage for ordinary reals */
+ double iTempResult[1];
+ /* temporary storage for extended-exponent reals */
+ // [0] = return value, [1] = scratch for the current term, [2] = forward total
+ bfloat iTempProb[3];
+ HomologyDPTable dp(iLen);
+ HomologyDPTable dp2(*pInTable);
+ // make sure tables don't get deleted
+ dp2.isInCharge = false;
+ iTransition[0] = iPar.iStartHomologous;
+
+ iTransition[1] = 1.0 - iPar.iStartHomologous;
+
+ iTransition[2] = 1.0 - iPar.iGoUnrelated - iPar.iGoStopFromHomologous;
+
+ iTransition[3] = iPar.iGoUnrelated;
+
+ iTransition[4] = 1.0 - iPar.iGoHomologous - iPar.iGoStopFromUnrelated;
+
+ iTransition[5] = iPar.iGoHomologous;
+
+ iTransition[6] = iPar.iGoStopFromHomologous;
+
+ iTransition[7] = iPar.iGoStopFromUnrelated;
+ // Build the symbol -> alphabet-index table; unknown bytes map to index 0.
+ // NOTE(review): the table is indexed with iSymbol[0], which comes from a
+ // plain char; a negative value would index out of range - confirm that
+ // aSeq only ever holds '1'..'8'.
+ for (int i=0; i<256; i++) {
+ alphaIndexsitepatterns[i]=0;
+ }
+ for (int i=0; i<8; i++) {
+ alphaIndexsitepatterns[alphaSymbolsitepatterns[i]]=i;
+ }
+ // The end state has backward value 1.
+ dp.StateMemoryblock3.write()[0] = 1.0;
+ dp.StateMemoryblock3.written();
+ iPrevSlowCoord = -1;
+ {
+ int iPos0=iLen+0;
+ if (iPos0==iPos0) {} // avoid 'unused variable' warnings
+ // Forward total from the input table; pre-scale the existing counts by it.
+ CurStateMemoryblock3Secondary = dp2.StateMemoryblock3.read();
+ iTempProb[2] = CurStateMemoryblock3Secondary[0];
+ bw.scaleCounts(iTempProb[2]);
+ }
+ iPrevSlowCoord = -1;
+ for (int iPos0=(iLen+1)-1; iPos0>=0; --iPos0) {
+ if ((iPos0+0>=iLen+0)) {
+ }
+ if ((iPos0+0>=1)) {
+ // The symbol emitted when leaving position iPos0 is aSeq[iPos0].
+ if ((iPos0+0<=iLen+-1)) {
+ iSymbol[0] = aSeq[iPos0+0];
+ }
+ else {
+ iSymbol[0] = '1' /* dummy value */;
+
+ }
+ CurStateMemoryblock2To = dp.StateMemoryblock2.write((iPos0-(0))-(1));
+ CurStateMemoryblock2Secondary = dp2.StateMemoryblock2.read((iPos0-(0))-(1));
+ iTempResult[0] = iPar.aEmitUnrelated[ iSymbol[0] - '1' ];
+ iEmission[0] = iTempResult[0];
+ iTempResult[0] = iPar.aEmitHomologous[ iSymbol[0] - '1' ];
+ iEmission[1] = iTempResult[0];
+ if ((iPos0+1<=iLen+0)) {
+ // Each term is first added to the backward cell, then multiplied by
+ // the forward value of the source state and added to the counters.
+ CurStateMemoryblock2From = dp.StateMemoryblock2.read((iPos0-(-1))-(1));
+ CurStateMemoryblock2To[0] = iTempProb[1] = ((iTransition[2])*(iEmission[1]))*CurStateMemoryblock2From[0];
+ iTempProb[1] *= CurStateMemoryblock2Secondary[0];
+ bw.transitionBaumWelchCount0[2] += iTempProb[1];
+ bw.emissionBaumWelchCount1[alphaIndexsitepatterns[iSymbol[0]]][0] += iTempProb[1];
+ CurStateMemoryblock2To[0] += iTempProb[1] = ((iTransition[3])*(iEmission[0]))*CurStateMemoryblock2From[1];
+ iTempProb[1] *= CurStateMemoryblock2Secondary[0];
+ bw.transitionBaumWelchCount0[3] += iTempProb[1];
+ bw.emissionBaumWelchCount1[alphaIndexsitepatterns[iSymbol[0]]][1] += iTempProb[1];
+ CurStateMemoryblock2To[1] = iTempProb[1] = ((iTransition[4])*(iEmission[0]))*CurStateMemoryblock2From[1];
+ iTempProb[1] *= CurStateMemoryblock2Secondary[1];
+ bw.transitionBaumWelchCount0[4] += iTempProb[1];
+ bw.emissionBaumWelchCount1[alphaIndexsitepatterns[iSymbol[0]]][1] += iTempProb[1];
+ CurStateMemoryblock2To[1] += iTempProb[1] = ((iTransition[5])*(iEmission[1]))*CurStateMemoryblock2From[0];
+ iTempProb[1] *= CurStateMemoryblock2Secondary[1];
+ bw.transitionBaumWelchCount0[5] += iTempProb[1];
+ bw.emissionBaumWelchCount1[alphaIndexsitepatterns[iSymbol[0]]][0] += iTempProb[1];
+ }
+ iEmission[0] = 1.0;
+ if ((iPos0+0>=iLen+0)) {
+ // Last position: silent transitions into the end state.
+ // NOTE(review): at this position the branch above is skipped, so these
+ // `+=` updates presumably rely on write() returning a zeroed cell -
+ // confirm in dptables.h.
+ CurStateMemoryblock3From = dp.StateMemoryblock3.read();
+ CurStateMemoryblock2To[0] += iTempProb[1] = ((iTransition[6])*(iEmission[0]))*CurStateMemoryblock3From[0];
+ iTempProb[1] *= CurStateMemoryblock2Secondary[0];
+ bw.transitionBaumWelchCount0[6] += iTempProb[1];
+ bw.emissionBaumWelchCount0[0] += iTempProb[1];
+ CurStateMemoryblock2To[1] += iTempProb[1] = ((iTransition[7])*(iEmission[0]))*CurStateMemoryblock3From[0];
+ iTempProb[1] *= CurStateMemoryblock2Secondary[1];
+ bw.transitionBaumWelchCount0[7] += iTempProb[1];
+ bw.emissionBaumWelchCount0[0] += iTempProb[1];
+ }
+ dp.StateMemoryblock2.written();
+ }
+ if ((iPos0+0<=0)) {
+ // Position 0: transitions out of the start state.
+ if ((iPos0+0<=iLen+-1)) {
+ iSymbol[0] = aSeq[iPos0+0];
+ }
+ else {
+ iSymbol[0] = '1' /* dummy value */;
+
+ }
+ CurStateMemoryblock1To = dp.StateMemoryblock1.write();
+ CurStateMemoryblock1Secondary = dp2.StateMemoryblock1.read();
+ iTempResult[0] = iPar.aEmitUnrelated[ iSymbol[0] - '1' ];
+ iEmission[0] = iTempResult[0];
+ iTempResult[0] = iPar.aEmitHomologous[ iSymbol[0] - '1' ];
+ iEmission[1] = iTempResult[0];
+ if ((iPos0+1<=iLen+0)) {
+ CurStateMemoryblock2From = dp.StateMemoryblock2.read((iPos0-(-1))-(1));
+ CurStateMemoryblock1To[0] = iTempProb[1] = ((iTransition[0])*(iEmission[1]))*CurStateMemoryblock2From[0];
+ iTempProb[1] *= CurStateMemoryblock1Secondary[0];
+ bw.transitionBaumWelchCount0[0] += iTempProb[1];
+ bw.emissionBaumWelchCount1[alphaIndexsitepatterns[iSymbol[0]]][0] += iTempProb[1];
+ CurStateMemoryblock1To[0] += iTempProb[1] = ((iTransition[1])*(iEmission[0]))*CurStateMemoryblock2From[1];
+ iTempProb[1] *= CurStateMemoryblock1Secondary[0];
+ bw.transitionBaumWelchCount0[1] += iTempProb[1];
+ bw.emissionBaumWelchCount1[alphaIndexsitepatterns[iSymbol[0]]][1] += iTempProb[1];
+ }
+ dp.StateMemoryblock1.written();
+ }
+ iPrevSlowCoord = iPos0;
+ }
+ // Undo the pre-scaling; this divides by the forward total without any
+ // check for a zero total.
+ bw.scaleCounts(1.0 / iTempProb[2]);
+ iPrevSlowCoord = -1;
+ {
+ int iPos0=0;
+ if (iPos0==iPos0) {} // avoid 'unused variable' warnings
+ CurStateMemoryblock1From = dp.StateMemoryblock1.read();
+ iTempProb[0] = CurStateMemoryblock1From[0];
+ }
+ // Hand a copy of the table to the caller; the local object gives up
+ // ownership so that its destructor leaves the arrays alone.
+ *ppOutTable = new HomologyDPTable(dp);
+ // make sure tables don't get deleted
+ dp.isInCharge = false;
+ return iTempProb[0];
+};
+
+
+
+
+
+/*
+ * Viterbi recursion for the Homology HMM, run from the end of the sequence
+ * towards the start: every cell holds the best (maximum) product over the
+ * transitions leaving that state, instead of the sum used by Backward.
+ *
+ * Parameters:
+ *   ppOutTable  receives a heap-allocated copy of the filled DP table
+ *   iPar        transition and emission parameters
+ *   aSeq        sequence of site-pattern symbols; (symbol - '1') indexes
+ *               the 8-entry emission arrays without a range check
+ *   iLen        number of symbols in aSeq
+ *
+ * Returns the value left in the start-state cell after the sweep.
+ *
+ * Note that here iEmission[0] is the homologous emission and iEmission[1]
+ * the unrelated one, the reverse of the assignment used in Forward and
+ * Backward.
+ *
+ * The `register` storage class of the generated code was dropped: it is
+ * not allowed from C++17 on and had no effect on these arrays.
+ */
+bfloat Viterbi_recurse(HomologyDPTable** ppOutTable,Params iPar,char *aSeq,int iLen) {
+ double iTransition[8];
+ bfloat *CurStateMemoryblock2To;
+ const bfloat *CurStateMemoryblock2From;
+ const bfloat *CurStateMemoryblock3From;
+ bfloat *CurStateMemoryblock1To;
+ const bfloat *CurStateMemoryblock1From;
+ int iPrevSlowCoord;
+ int iSymbol[1];
+ if (false && iSymbol[0] == iSymbol[0]) {} // avoid 'unused variable' warnings
+ double iEmission[2];
+ /* temporary storage for ordinary reals */
+ double iTempResult[1];
+ /* temporary storage for extended-exponent reals */
+ bfloat iTempProb[1];
+ HomologyDPTable dp(iLen);
+ iTransition[0] = iPar.iStartHomologous;
+
+ iTransition[1] = 1.0 - iPar.iStartHomologous;
+
+ iTransition[2] = 1.0 - iPar.iGoUnrelated - iPar.iGoStopFromHomologous;
+
+ iTransition[3] = iPar.iGoUnrelated;
+
+ iTransition[4] = 1.0 - iPar.iGoHomologous - iPar.iGoStopFromUnrelated;
+
+ iTransition[5] = iPar.iGoHomologous;
+
+ iTransition[6] = iPar.iGoStopFromHomologous;
+
+ iTransition[7] = iPar.iGoStopFromUnrelated;
+ // The end state has value 1.
+ dp.StateMemoryblock3.write()[0] = 1.0;
+ dp.StateMemoryblock3.written();
+ iPrevSlowCoord = -1;
+ for (int iPos0=(iLen+1)-1; iPos0>=0; --iPos0) {
+ if ((iPos0+0>=iLen+0)) {
+ }
+ if ((iPos0+0>=1)) {
+ // The symbol emitted when leaving position iPos0 is aSeq[iPos0].
+ if ((iPos0+0<=iLen+-1)) {
+ iSymbol[0] = aSeq[iPos0+0];
+ }
+ else {
+ iSymbol[0] = '1' /* dummy value */;
+
+ }
+ CurStateMemoryblock2To = dp.StateMemoryblock2.write((iPos0-(0))-(1));
+ iTempResult[0] = iPar.aEmitHomologous[ iSymbol[0] - '1' ];
+ iEmission[0] = iTempResult[0];
+ iTempResult[0] = iPar.aEmitUnrelated[ iSymbol[0] - '1' ];
+ iEmission[1] = iTempResult[0];
+ if ((iPos0+1<=iLen+0)) {
+ // Best continuation into the next position, per source state.
+ CurStateMemoryblock2From = dp.StateMemoryblock2.read((iPos0-(-1))-(1));
+ CurStateMemoryblock2To[0] = ((iTransition[2])*(iEmission[0]))*CurStateMemoryblock2From[0];
+ hmmocMaxInPlace( CurStateMemoryblock2To[0], ((iTransition[3])*(iEmission[1]))*CurStateMemoryblock2From[1] );
+ CurStateMemoryblock2To[1] = ((iTransition[4])*(iEmission[1]))*CurStateMemoryblock2From[1];
+ hmmocMaxInPlace( CurStateMemoryblock2To[1], ((iTransition[5])*(iEmission[0]))*CurStateMemoryblock2From[0] );
+ }
+ iEmission[0] = 1.0;
+ if ((iPos0+0>=iLen+0)) {
+ // Last position: silent transitions into the end state.
+ // NOTE(review): at this position the branch above is skipped, so the
+ // maximum is taken against whatever write() handed out - presumably a
+ // zeroed cell; confirm in dptables.h.
+ CurStateMemoryblock3From = dp.StateMemoryblock3.read();
+ hmmocMaxInPlace( CurStateMemoryblock2To[0], ((iTransition[6])*(iEmission[0]))*CurStateMemoryblock3From[0] );
+ hmmocMaxInPlace( CurStateMemoryblock2To[1], ((iTransition[7])*(iEmission[0]))*CurStateMemoryblock3From[0] );
+ }
+ dp.StateMemoryblock2.written();
+ }
+ if ((iPos0+0<=0)) {
+ // Position 0: transitions out of the start state.
+ if ((iPos0+0<=iLen+-1)) {
+ iSymbol[0] = aSeq[iPos0+0];
+ }
+ else {
+ iSymbol[0] = '1' /* dummy value */;
+
+ }
+ CurStateMemoryblock1To = dp.StateMemoryblock1.write();
+ iTempResult[0] = iPar.aEmitHomologous[ iSymbol[0] - '1' ];
+ iEmission[0] = iTempResult[0];
+ iTempResult[0] = iPar.aEmitUnrelated[ iSymbol[0] - '1' ];
+ iEmission[1] = iTempResult[0];
+ if ((iPos0+1<=iLen+0)) {
+ CurStateMemoryblock2From = dp.StateMemoryblock2.read((iPos0-(-1))-(1));
+ CurStateMemoryblock1To[0] = ((iTransition[0])*(iEmission[0]))*CurStateMemoryblock2From[0];
+ hmmocMaxInPlace( CurStateMemoryblock1To[0], ((iTransition[1])*(iEmission[1]))*CurStateMemoryblock2From[1] );
+ }
+ dp.StateMemoryblock1.written();
+ }
+ iPrevSlowCoord = iPos0;
+ }
+ iPrevSlowCoord = -1;
+ {
+ int iPos0=0;
+ if (iPos0==iPos0) {} // avoid 'unused variable' warnings
+ CurStateMemoryblock1From = dp.StateMemoryblock1.read();
+ iTempProb[0] = CurStateMemoryblock1From[0];
+ }
+ // Hand a copy of the table to the caller; the local object gives up
+ // ownership so that its destructor leaves the arrays alone.
+ *ppOutTable = new HomologyDPTable(dp);
+ // make sure tables don't get deleted
+ dp.isInCharge = false;
+ return iTempProb[0];
+};
+
+
+
+
+
+/*
+ * Traces the best path through a table filled by Viterbi_recurse.
+ *
+ * Starting in the start state at position 0, each step lists the
+ * transitions leaving the current state, scores each one as
+ * transition * emission * (table value of the destination), follows the
+ * highest-scoring one and appends it to the path.  The loop ends when the
+ * end state (index 3) is reached.
+ *
+ * Parameters:
+ *   pInTable  table produced by Viterbi_recurse; accessed through a local
+ *             copy that never takes ownership
+ *   iPar      transition and emission parameters
+ *   aSeq      sequence of site-pattern symbols; (symbol - '1') indexes the
+ *             8-entry emission arrays without a range check
+ *   iLen      number of symbols in aSeq
+ *
+ * Returns a reference to a SimplePath allocated with `new`; nothing in this
+ * function deletes it, so the caller presumably has to.
+ *
+ * Scratch layout:
+ *   iTempIntVec[0]    current state, then index of the chosen candidate
+ *   iTempIntVec[1]    next free candidate slot (starts at 2)
+ *   iTempIntVec[2..]  transition id of each candidate
+ *   iTempVector[0]    best score so far
+ *   iTempVector[i]    score of candidate i
+ *   iTempVector[i+3]  transition * emission of candidate i (stored in the path)
+ *
+ * The `register` storage class of the generated code was dropped: it is
+ * not allowed from C++17 on and had no effect on this array.
+ */
+Path& Viterbi_trace(HomologyDPTable* pInTable,Params iPar,char *aSeq,int iLen) {
+ double iTransition[8];
+ const bfloat *CurStateMemoryblock1To;
+ const bfloat *CurStateMemoryblock2To;
+ const bfloat *CurStateMemoryblock3To;
+ int iPrevSlowCoord;
+ SimplePath* pPath = new SimplePath();
+ vector<int> emit;
+ int iSymbol[1];
+ if (false && iSymbol[0] == iSymbol[0]) {} // avoid 'unused variable' warnings
+ double iEmission[2];
+ /* temporary vector storage */
+ bfloat iTempVector[9];
+ /* temporary int vector storage */
+ int iTempIntVec[6];
+ /* temporary storage for ordinary reals */
+ double iTempResult[1];
+ iTransition[0] = iPar.iStartHomologous;
+
+ iTransition[1] = 1.0 - iPar.iStartHomologous;
+
+ iTransition[2] = 1.0 - iPar.iGoUnrelated - iPar.iGoStopFromHomologous;
+
+ iTransition[3] = iPar.iGoUnrelated;
+
+ iTransition[4] = 1.0 - iPar.iGoHomologous - iPar.iGoStopFromUnrelated;
+
+ iTransition[5] = iPar.iGoHomologous;
+
+ iTransition[6] = iPar.iGoStopFromHomologous;
+
+ iTransition[7] = iPar.iGoStopFromUnrelated;
+ // Per transition id: destination state, source state, and the number of
+ // sequence positions the transition advances (0 for the two transitions
+ // into the end state).
+ static const int stateTable[] = {1, 2, 1, 2, 2, 1, 3, 3};
+ static const int stateFromTable[] = {0, 0, 1, 1, 2, 2, 1, 2};
+ static const int iPos0Table[] = {1, 1, 1, 1, 1, 1, 0, 0};
+ HomologyDPTable dp(*pInTable);
+ // make sure tables don't get deleted
+ dp.isInCharge = false;
+ // NOTE(review): this writes through the copied table object; whether the
+ // storage is shared with *pInTable depends on DPTable's copy semantics -
+ // confirm in dptables.h.
+ dp.StateMemoryblock1.write()[0] = 1.0;
+ dp.StateMemoryblock1.written();
+ iPrevSlowCoord = -1;
+ {
+ int iPos0=0;
+ if (iPos0==iPos0) {} // avoid 'unused variable' warnings
+ iTempIntVec[0] = 0;
+ while (iTempIntVec[0] != 3) {
+ iTempIntVec[1] = 2;
+ if ((iPos0+0<=iLen+-1)) {
+ iSymbol[0] = aSeq[iPos0+0];
+ }
+ else {
+ iSymbol[0] = '1' /* dummy value */;
+
+ }
+ CurStateMemoryblock1To = dp.StateMemoryblock1.read();
+ // NOTE(review): on the first iteration iPos0 is 0, so this asks for
+ // index -1.  The pointer is replaced before it is dereferenced, but
+ // confirm that DPTable::read tolerates the index.
+ CurStateMemoryblock2To = dp.StateMemoryblock2.read((iPos0-(0))-(1));
+ if ((iPos0+1<=iLen+0)) {
+ // Candidates that consume the symbol at iPos0.
+ iTempResult[0] = iPar.aEmitHomologous[ iSymbol[0] - '1' ];
+ iEmission[0] = iTempResult[0];
+ iTempResult[0] = iPar.aEmitUnrelated[ iSymbol[0] - '1' ];
+ iEmission[1] = iTempResult[0];
+ CurStateMemoryblock2To = dp.StateMemoryblock2.read((iPos0-(-1))-(1));
+ switch (iTempIntVec[0]) {
+ default:
+ break;
+ case 0:
+ iTempVector[iTempIntVec[1]] = iTransition[0]*iEmission[0]*CurStateMemoryblock2To[0];
+ iTempVector[iTempIntVec[1]+3] = iTransition[0]*iEmission[0];
+ iTempIntVec[iTempIntVec[1]++] = 0;
+ iTempVector[iTempIntVec[1]] = iTransition[1]*iEmission[1]*CurStateMemoryblock2To[1];
+ iTempVector[iTempIntVec[1]+3] = iTransition[1]*iEmission[1];
+ iTempIntVec[iTempIntVec[1]++] = 1;
+ break;
+ case 1:
+ iTempVector[iTempIntVec[1]] = iTransition[2]*iEmission[0]*CurStateMemoryblock2To[0];
+ iTempVector[iTempIntVec[1]+3] = iTransition[2]*iEmission[0];
+ iTempIntVec[iTempIntVec[1]++] = 2;
+ iTempVector[iTempIntVec[1]] = iTransition[3]*iEmission[1]*CurStateMemoryblock2To[1];
+ iTempVector[iTempIntVec[1]+3] = iTransition[3]*iEmission[1];
+ iTempIntVec[iTempIntVec[1]++] = 3;
+ break;
+ case 2:
+ iTempVector[iTempIntVec[1]] = iTransition[5]*iEmission[0]*CurStateMemoryblock2To[0];
+ iTempVector[iTempIntVec[1]+3] = iTransition[5]*iEmission[0];
+ iTempIntVec[iTempIntVec[1]++] = 5;
+ iTempVector[iTempIntVec[1]] = iTransition[4]*iEmission[1]*CurStateMemoryblock2To[1];
+ iTempVector[iTempIntVec[1]+3] = iTransition[4]*iEmission[1];
+ iTempIntVec[iTempIntVec[1]++] = 4;
+ break;
+ }
+ }
+ CurStateMemoryblock3To = dp.StateMemoryblock3.read();
+ if ((iPos0+0>=iLen+0)) {
+ // Candidates that move into the end state without consuming a symbol.
+ iEmission[0] = 1.0;
+ CurStateMemoryblock3To = dp.StateMemoryblock3.read();
+ switch (iTempIntVec[0]) {
+ default:
+ break;
+ case 1:
+ iTempVector[iTempIntVec[1]] = iTransition[6]*iEmission[0]*CurStateMemoryblock3To[0];
+ iTempVector[iTempIntVec[1]+3] = iTransition[6]*iEmission[0];
+ iTempIntVec[iTempIntVec[1]++] = 6;
+ break;
+ case 2:
+ iTempVector[iTempIntVec[1]] = iTransition[7]*iEmission[0]*CurStateMemoryblock3To[0];
+ iTempVector[iTempIntVec[1]+3] = iTransition[7]*iEmission[0];
+ iTempIntVec[iTempIntVec[1]++] = 7;
+ break;
+ }
+ }
+ // Pick the candidate with the strictly largest score.
+ // NOTE(review): if no candidate scores above 0.0, iTempIntVec[0] still
+ // holds the state number and the lookups below use it as a candidate
+ // slot - confirm that the inputs can never produce an all-zero step.
+ iTempVector[0] = 0.0;
+ for (int i=2; i<iTempIntVec[1]; i++) {
+ if (iTempVector[i]>iTempVector[0]) {
+ iTempVector[0]=iTempVector[i];
+ iTempIntVec[0] = i;
+ }
+ }
+ // Record the chosen edge, advance the position and move to its
+ // destination state.
+ emit.resize(1);
+ emit[0] = iPos0Table[iTempIntVec[iTempIntVec[0]]];
+ pPath->addEdge(iTempIntVec[iTempIntVec[0]],iTempVector[iTempIntVec[0]+3],emit,stateFromTable[iTempIntVec[iTempIntVec[0]]],stateTable[iTempIntVec[iTempIntVec[0]]]);
+ iPos0 += iPos0Table[iTempIntVec[iTempIntVec[0]]];
+ iTempIntVec[0] = stateTable[iTempIntVec[iTempIntVec[0]]];
+ }
+ }
+ return *pPath;
+};
+
+
+
+/* --- end of HMMoC-generated file --- */
diff --git a/libMems/HomologyHMM/homology.h b/libMems/HomologyHMM/homology.h
new file mode 100644
index 0000000..888c403
--- /dev/null
+++ b/libMems/HomologyHMM/homology.h
@@ -0,0 +1,188 @@
+/* Code generated by HMMoC version VERSION, Copyright (C) 2006 Gerton Lunter */
+/* Generated from file homology.xml (author: Aaron Darling) on Mon Jul 16 11:09:12 EST 2007 */
+
+/*
+This file is a work based on HMMoC VERSION, a hidden Markov model compiler.
+Copyright (C) 2006 by Gerton Lunter, Oxford University.
+
+HMMoC and works based on it are free software; you can redistribute
+it and/or modify it under the terms of the GNU General Public
+License as published by the Free Software Foundation; either
+version 2 of the License, or (at your option) any later version.
+
+HMMOC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with HMMoC; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+#ifndef _homology_h_
+#define _homology_h_
+
+
+#include "dptables.h"
+#include "algebras.h"
+
+#include <vector>
+#include <iostream>
+#include <string>
+#include <algorithm>
+
+using std::vector;
+using std::cout;
+
+#include <map>
+
+using std::map;
+
+struct Params;
+
+//void run(std::string& sequence, std::string& prediction, double goHomologous = 0.004, double goUnrelated = 0.004, std::vector<double>* emitHomologous = NULL, std::vector<double>* emitUnrelated = NULL);
+
+void run(std::string& sequence, std::string& prediction, const Params& params );
+
+// Here go the state memory clique typedefs:
+typedef States<bfloat,2> Statesblock2;
+typedef States<bfloat,1> Statesblock1;
+typedef States<bfloat,1> Statesblock3;
+
+// Bundle of the three DP memory blocks used by the Homology HMM algorithms,
+// together with pointers to the identifier tables and the sequence length.
+class HomologyDPTable {
+ public:
+ // If true, this class' destructor will delete the DP arrays
+ bool isInCharge;
+ // Pointers to arrays containing ids of states and transitions
+ const std::string* const stateId;
+ const std::string* const emissionId;
+ const std::string* const transitionId;
+ const std::string* const transitionFrom;
+ const std::string* const transitionTo;
+ const std::string* const transitionProb;
+ const std::string* const transitionEmit;
+ const std::string* const outputId;
+ // The actual DP tables, and total sequence lengths (which determine size of DP arrays) follow:
+ int iLen;
+ // Block 2 is indexed by one coordinate; blocks 1 and 3 have none.
+ DPTable<Statesblock2,1> StateMemoryblock2;
+ DPTable<Statesblock1,0> StateMemoryblock1;
+ DPTable<Statesblock3,0> StateMemoryblock3;
+ // Member functions:
+ public:
+ // Default copy constructor is used; user has to set isInCharge appropriately afterwards!
+ HomologyDPTable(int iLen);
+ ~HomologyDPTable();
+ // returns probability from DP table, given position and int or std::string state identifier
+ bfloat getProb(int iState ,int ) const;
+ bfloat getProb(const std::string sState ,int ) const;
+ // converts std::string identifier (for state, transition or emission) into integer id
+ static int getId(const std::string& sState);
+ // Id -> name lookups; each returns an empty string for an out-of-range id.
+ static const std::string& getTransitionId(int id);
+ static const std::string& getEmissionId(int id);
+ static const std::string& getStateId(int id);
+ static const std::string& getOutputId(int id);
+ // Releases the lookup map that getId() builds, via the reserved name "_cleanup_".
+ static void _cleanup() { getId("_cleanup_"); }
+};
+
+// give a name to the real type used for this HMM
+typedef bfloat HomologyReal;
+// define type for a 'short' real -- usually double, but can be logspace for efficiency
+typedef double HomologyShortReal;
+
+
+
+// Accumulator for Baum-Welch expected counts of the Homology HMM.  The
+// counters are per object; the identifier and index tables are static and
+// shared by all objects.
+class HomologyBaumWelch {
+ public:
+  // Default copy constructor is used.
+  // Void constructor:
+  HomologyBaumWelch() { resetCounts(); }
+  // Not calling resetCounts() across calls allows to aggregate results over multiple datasets
+  void resetCounts();
+  // Multiplies every counter by `scale`.
+  void scaleCounts(bfloat scale);
+  // Translate an identifier (string or integer) to the index into their corresponding Baum-Welch counter array (below)
+  // Which array is used for any particular emission/transition depends on its order signature - see documentation for details
+  // The integer overloads return -1 for an id outside the index table,
+  // matching what the string overloads return for an unknown name.
+  int transitionIndex(int intId) const {
+    return (intId >= 0 && intId < static_cast<int>(sizeof(atransitionIdx)/sizeof(atransitionIdx[0]))) ? atransitionIdx[intId] : -1;
+  }
+  int transitionIndex(std::string strId) const;
+  int emissionIndex(int intId) const {
+    return (intId >= 0 && intId < static_cast<int>(sizeof(aemissionIdx)/sizeof(aemissionIdx[0]))) ? aemissionIdx[intId] : -1;
+  }
+  int emissionIndex(std::string strId) const;
+  // Now follow, in triplets (one for each order signature):
+  //  Transition or emission counters;
+  //  Array of identifiers; and
+  //  Dimension of array (number of counters).
+  bfloat transitionBaumWelchCount0[8];
+  static int transitionIdentifier0[8];
+  static const int transitionDimension0 = 8;
+  bfloat emissionBaumWelchCount0[1];
+  static int emissionIdentifier0[1];
+  static const int emissionDimension0 = 1;
+  // First index: site-pattern symbol (8 values); second index: counter.
+  bfloat emissionBaumWelchCount1[8][2];
+  static int emissionIdentifier1[2];
+  static const int emissionDimension1 = 2;
+ private:
+  // id -> counter index, and name -> counter index; filled by resetCounts().
+  static int atransitionIdx[8];
+  static int aemissionIdx[3];
+  static map<const std::string,int> mId;
+};
+
+
+
+// Abstract interface for a path through the HMM: a sequence of transitions,
+// each with a probability, an emission vector and from/to states.
+class Path {
+ //protected:
+ public:
+ vector<int> transitions;
+ public:
+ unsigned int size() const { // Number of transitions in path
+ return transitions.size();
+ }
+ int transition(int i) const { // i-th transition
+ return transitions[i];
+ }
+ virtual double prob(int) const = 0; // i-th transition*emission probability:
+ virtual const vector<int>& emission(int) const = 0; // i-th emission vector
+ virtual int fromState(int) const = 0; // State at from-end of i-th transition
+ virtual int toState(int) const = 0; // State at to-end of i-th transition
+ // NOTE(review): the two comments below do not match SimplePath in
+ // homology.cc, where nextFrom returns i+1 (or -1 at the end) and nextTo
+ // always returns -1 - confirm which side is intended.
+ virtual int nextFrom(int) const = 0; // index of next sibling, -1 if no more (always -1 for simple paths)
+ virtual int nextTo(int) const = 0; // index of child (always i+1 for simple paths), or -1 if no more
+ virtual ~Path() {}
+};
+
+// Streams a textual dump of the path, one line per edge.  The stream type
+// is spelled with its namespace so the declaration does not depend on a
+// using-declaration leaking in from another header.
+std::ostream& operator<<(std::ostream& os, const Path& p);
+
+// Linear path stored as parallel vectors: entry i of probs, emissions,
+// froms and tos describes the same edge as transitions[i] in the base class.
+class SimplePath: public Path {
+ public:
+ vector<double> probs;
+ vector<vector<int> > emissions;
+ vector<int> froms;
+ vector<int> tos;
+ public:
+ // Appends one edge to the end of the path.
+ void addEdge(int transition, double prob, vector<int>& emission, int from, int to);
+ double prob(int index) const;
+ int nextFrom(int index) const;
+ int nextTo(int index) const;
+ const vector<int>& emission(int index) const;
+ int fromState(int index) const;
+ int toState(int index) const;
+ // Reverses the order of the stored edges.
+ void reverse();
+};
+// Parameters of the Homology HMM as consumed by Forward, Backward,
+// Viterbi_recurse and Viterbi_trace.
+struct Params {
+ double iStartHomologous; // start -> homologous; start -> unrelated uses 1 - this value
+ double iGoHomologous; // unrelated -> homologous
+ double iGoUnrelated; // homologous -> unrelated
+ double iGoStopFromUnrelated; // unrelated -> end
+ double iGoStopFromHomologous; // homologous -> end
+ // Emission values per site pattern, indexed by (symbol - '1').
+ double aEmitHomologous[8];
+ double aEmitUnrelated[8];
+};
+
+// Forward sweep; stores a newly allocated table in *ppOutTable and returns
+// the value of the end-state cell.
+bfloat Forward(HomologyDPTable** ppOutTable,Params iPar,char *aSeq,int iLen);
+
+// Backward sweep using the forward table pInTable; adds Baum-Welch counts to
+// bw, stores a newly allocated table in *ppOutTable and returns the value of
+// the start-state cell.
+bfloat Backward(HomologyBaumWelch& bw,HomologyDPTable* pInTable,HomologyDPTable** ppOutTable,Params iPar,char *aSeq,int iLen);
+
+// Viterbi recursion; stores a newly allocated table in *ppOutTable and
+// returns the value of the start-state cell.
+bfloat Viterbi_recurse(HomologyDPTable** ppOutTable,Params iPar,char *aSeq,int iLen);
+
+// Traces the best path through a table filled by Viterbi_recurse; the
+// returned reference refers to an object allocated with new.
+Path& Viterbi_trace(HomologyDPTable* pInTable,Params iPar,char *aSeq,int iLen);
+
+#endif // _homology_h_
+
+/* --- end of HMMoC-generated file --- */
diff --git a/libMems/HomologyHMM/homology.xml b/libMems/HomologyHMM/homology.xml
new file mode 100644
index 0000000..2ee7022
--- /dev/null
+++ b/libMems/HomologyHMM/homology.xml
@@ -0,0 +1,217 @@
+<?xml version="1.0"?>
+<!--
+ Derived from casino.xml by Aaron Darling
+ Copyright (C) 2007 Aaron Darling
+
+ This file is part of HMMoC 0.5, a hidden Markov model compiler.
+ Copyright (C) 2006 by Gerton Lunter, Oxford University.
+
+ HMMoC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ HMMOC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with HMMoC; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+-->
+
+
+
+<hml debug="true">
+
+
+
+<author>Aaron Darling</author>
+
+
+<!-- We code a site pattern as a number (order follows the emission arrays in parameters.h)
+ 1 A:A, T:T
+ 2 C:C, G:G
+ 3 A:C, C:A, G:T, T:G
+ 4 A:G, G:A, C:T, T:C
+ 5 A:T, T:A
+ 6 C:G, G:C
+ 7 N:gap open/close
+ 8 N:gap extend
+-->
+<alphabet id="sitepatterns">
+ 12345678
+</alphabet>
+
+
+
+<output id="sequence">
+ <alphabet idref="sitepatterns"/>
+ <identifier type="length" value="iLen"/>
+ <identifier type="sequence" value="aSeq"/>
+ <code type="parameter" value="char *aSeq"/>
+ <code type="parameter" value="int iLen"/>
+</output>
+
+
+<hmm id="Homology">
+
+ <description> An HMM to distinguish homologous from unrelated sites in an alignment </description>
+
+ <outputs id="homologyoutputs">
+ <output idref="sequence"/>
+ </outputs>
+
+
+ <clique id="block1">
+ <state id="start"/>
+ </clique>
+
+ <clique id="block2">
+ <state id="homologous"/>
+ <state id="unrelated"/>
+ </clique>
+
+ <clique id="block3">
+ <state id="end"/>
+ </clique>
+
+
+ <graph>
+ <clique idref="block1"/>
+ <clique idref="block2"/>
+ <clique idref="block3"/>
+ </graph>
+
+
+ <transitions>
+ <transition from="start" to="homologous" probability="startHomologous" emission="emitHomologous"/>
+ <transition from="start" to="unrelated" probability="startUnrelated" emission="emitUnrelated"/>
+ <transition from="homologous" to="homologous" probability="stayHomologous" emission="emitHomologous"/>
+ <transition from="homologous" to="unrelated" probability="goUnrelated" emission="emitUnrelated"/>
+ <transition from="unrelated" to="unrelated" probability="stayUnrelated" emission="emitUnrelated"/>
+ <transition from="unrelated" to="homologous" probability="goHomologous" emission="emitHomologous"/>
+ <transition from="homologous" to="end" probability="goStopFromHomologous" emission="empty"/>
+ <transition from="unrelated" to="end" probability="goStopFromUnrelated" emission="empty"/>
+ </transitions>
+
+
+ <code id="paramsClassDef" where="classdefinitions">
+ <![CDATA[
+ struct Params {
+ double iStartHomologous;
+ double iGoHomologous;
+ double iGoUnrelated;
+ double iGoStopFromUnrelated;
+ double iGoStopFromHomologous;
+ double aEmitHomologous[8];
+ double aEmitUnrelated[8];
+ };
+ ]]>
+ </code>
+
+
+ <emission id="empty">
+ <probability>
+ <code type="expression"> 1.0 </code>
+ </probability>
+ </emission>
+
+
+ <emission id="emitHomologous">
+ <output idref="sequence"/>
+ <probability>
+ <code type="statement">
+ <identifier output="sequence" value="iEmission"/>
+ <identifier type="result" value="iProb"/>
+ <![CDATA[
+
+ iProb = iPar.aEmitHomologous[ iEmission - '1' ];
+
+ ]]>
+ </code>
+ </probability>
+ </emission>
+
+
+ <emission id="emitUnrelated">
+ <output idref="sequence"/>
+ <probability>
+ <code type="statement">
+ <identifier output="sequence" value="iEmission"/>
+ <identifier type="result" value="iProb"/>
+ <!-- Here goes the code computing the probability -->
+ <![CDATA[
+
+ iProb = iPar.aEmitUnrelated[ iEmission - '1' ];
+
+ ]]>
+ </code>
+ </probability>
+ </emission>
+
+
+ <probability id="one"><code> 1.0 </code></probability>
+
+
+ <probability id="goUnrelated">
+ <code>
+ <!-- Tell HMMoC that this code requires an input parameter, which itself needs a definition to make sense -->
+ <code type="parameter" init="paramsClassDef" value="Params iPar"/>
+ <!-- The actual code for this probability follows (no need to quote this) -->
+
+ iPar.iGoUnrelated
+
+ </code>
+ </probability>
+
+ <probability id="startHomologous"><code> iPar.iStartHomologous </code></probability>
+ <probability id="startUnrelated"><code> 1.0 - iPar.iStartHomologous </code></probability>
+ <probability id="goHomologous"><code> iPar.iGoHomologous </code></probability>
+ <probability id="goStopFromHomologous"><code> iPar.iGoStopFromHomologous </code></probability>
+ <probability id="goStopFromUnrelated"><code> iPar.iGoStopFromUnrelated </code></probability>
+ <probability id="stayHomologous"><code> 1.0 - iPar.iGoUnrelated - iPar.iGoStopFromHomologous </code></probability>
+ <probability id="stayUnrelated"><code> 1.0 - iPar.iGoHomologous - iPar.iGoStopFromUnrelated </code></probability>
+
+</hmm>
+
+
+
+
+
+
+
+
+<!-- Code generation -->
+
+
+<forward outputTable="yes" name="Forward" id="fw">
+ <!-- Specify HMM to make code for -->
+ <hmm idref="Homology"/>
+</forward>
+
+<backward outputTable="yes" baumWelch="yes" name="Backward" id="bw">
+ <!-- Specify HMM to make code for -->
+ <hmm idref="Homology"/>
+</backward>
+
+<viterbi name="Viterbi" id="vit">
+ <hmm idref="Homology"/>
+</viterbi>
+
+
+
+<codeGeneration realtype="bfloat" file="homology.cc" header="homology.h" language="C++">
+
+ <forward idref="fw"/>
+ <backward idref="bw"/>
+ <viterbi idref="vit"/>
+
+</codeGeneration>
+
+
+
+</hml>
+
+
diff --git a/libMems/HomologyHMM/homologymain.cc b/libMems/HomologyHMM/homologymain.cc
new file mode 100644
index 0000000..ef1a5cf
--- /dev/null
+++ b/libMems/HomologyHMM/homologymain.cc
@@ -0,0 +1,65 @@
+/*
+ * This file is part of HMMoC 0.5, a hidden Markov model compiler.
+ * Copyright (C) 2006 by Gerton Lunter, Oxford University.
+ *
+ * HMMoC is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * HMMOC is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with HMMoC; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+\*/
+#include <cstdlib>
+#include <cstring>
+#include <vector>
+#include "homology.h"
+// Labels every column of an encoded alignment: 'H' when the posterior probability
+// of the "homologous" state reaches the threshold below, 'N' otherwise.
+// sequence holds one site-pattern symbol per column; prediction is resized to match.
+void run(std::string& sequence, std::string& prediction, const Params& params )
+{
+	// Posterior of the "homologous" state required to call a column 'H'
+	const double kMinHomologousPosterior = 0.9;
+
+	// Local copy of the model parameters
+	Params iPar = params;
+
+	// Forward/Backward want a mutable char buffer, so copy the symbols into a
+	// vector that releases itself even if one of the calls below throws.  The
+	// trailing terminator only keeps &aSequence[0] valid for an empty input.
+	int iPathLength = sequence.length();
+	std::vector<char> aSequence( sequence.begin(), sequence.end() );
+	aSequence.push_back( '\0' );
+
+	// Fill the forward and backward DP tables; bw is required by Backward but not read here
+	HomologyDPTable *pFWDP = NULL;
+	HomologyDPTable *pBWDP = NULL;
+	HomologyBaumWelch bw;
+
+	bfloat iFWProb = Forward(&pFWDP, iPar, &aSequence[0], iPathLength );
+	Backward(bw, pFWDP, &pBWDP, iPar, &aSequence[0], iPathLength );
+
+	prediction.resize(iPathLength);
+	for (int i=0; i<iPathLength; i++) {
+		// column i is looked up at DP position i+1
+		double iPosterior = pFWDP->getProb("homologous",i+1)*pBWDP->getProb("homologous",i+1)/iFWProb;
+		if (iPosterior >= kMinHomologousPosterior) {
+			prediction[i] = 'H';
+		} else {
+			prediction[i] = 'N';
+		}
+	}
+
+	// Release the DP tables handed back through pFWDP and pBWDP
+	delete pFWDP;
+	delete pBWDP;
+}
+
+
diff --git a/libMems/HomologyHMM/parameters.h b/libMems/HomologyHMM/parameters.h
new file mode 100644
index 0000000..304afc9
--- /dev/null
+++ b/libMems/HomologyHMM/parameters.h
@@ -0,0 +1,162 @@
+#ifndef __HomologyHMM_parameters_h__
+#define __HomologyHMM_parameters_h__
+
+#include "homology.h"
+
+Params getHoxdParams();
+Params getAdaptedHoxdMatrixParameters( double gc_content );
+void adaptToPercentIdentity( Params& params, double pct_identity );
+
+inline
+Params getHoxdParams()
+{
+	// Default HOXD-derived parameter set, filled in on the first call and returned by value.
+	static Params params;	// static storage: nothing is left on the heap
+	static bool initialized = false;
+	if( !initialized )
+	{
+		params.iStartHomologous = 0.5;
+		params.iGoHomologous = 0.00001;
+		params.iGoUnrelated = 0.0000001;
+		params.iGoStopFromUnrelated = 0.00000001;
+		params.iGoStopFromHomologous = 0.00000001;
+
+		// original values from Chiaromonte et al supplied by Webb Miller
+		params.aEmitHomologous[0] = 0.1723*2;	//a:a, t:t
+		params.aEmitHomologous[1] = 0.1462*2;	//c:c, g:g
+		params.aEmitHomologous[2] = 0.0180*4;	//a:c, c:a, g:t, t:g
+		params.aEmitHomologous[3] = 0.0426*4;	//a:g, g:a, c:t, t:c
+		params.aEmitHomologous[4] = 0.0186*2;	//a:t, t:a
+		params.aEmitHomologous[5] = 0.0142*2;	//g:c, c:g
+		params.aEmitHomologous[6] = 0.004461;	// gap open (from an e. coli y pestis alignment)
+		// gap extend takes the remaining probability mass // 0.050733
+		params.aEmitHomologous[7] = 1 - (params.aEmitHomologous[0] + params.aEmitHomologous[1] + params.aEmitHomologous[2] +
+			params.aEmitHomologous[3] + params.aEmitHomologous[4] + params.aEmitHomologous[5] + params.aEmitHomologous[6]);
+
+		params.aEmitUnrelated[0] = 0.12818742714404662781015820149872;	// a:a, t:t
+		params.aEmitUnrelated[1] = 0.10493347210657785179017485428807;	// c:c, g:g
+		params.aEmitUnrelated[2] = 0.11597910074937552039966694421313;	// a:c, c:a, g:t, t:g
+		params.aEmitUnrelated[3] = params.aEmitUnrelated[2];	// a:g, g:a, c:t, t:c
+		params.aEmitUnrelated[4] = params.aEmitUnrelated[0];	// a:t, t:a
+		params.aEmitUnrelated[5] = params.aEmitUnrelated[1];	// g:c, c:g
+		params.aEmitUnrelated[6] = 0.0483;	// gap open (derived by aligning a 48%GC sequence with
+							// its reverse--not complement--to derive expected gap frequencies in
+							// unrelated sequence)
+		// gap extend takes the remaining probability mass // 0.2535
+		params.aEmitUnrelated[7] = 1 - (params.aEmitUnrelated[0] + params.aEmitUnrelated[1] + params.aEmitUnrelated[2] +
+			params.aEmitUnrelated[3] + params.aEmitUnrelated[4] + params.aEmitUnrelated[5] + params.aEmitUnrelated[6]);
+		initialized = true;
+	}
+	return params;
+}
+
+
+/**
+ * Adapts an emission matrix to an arbitrary nucleotide composition
+ * Emission slots: 0 a:a,t:t  1 c:c,g:g  2 a:c,c:a,g:t,t:g  3 a:g,g:a,c:t,t:c  4 a:t,t:a  5 g:c,c:g  6 gap open  7 gap extend
+ * @param gc_content The fraction of the genome which is G/C (the value is not range-checked here)
+ * @return A Params with both emission tables and all transition probabilities filled in
+ */
+inline
+Params getAdaptedHoxdMatrixParameters( double gc_content )
+{
+	Params params;
+	double at_content = 1-gc_content;
+	double norm_factor = 0.0;
+
+	double gO_unrelated = 0.0483;
+	double gE_unrelated = 0.2535;
+
+	double gO_homologous = 0.004461;
+	double gE_homologous = 0.050733;
+
+	// Unrelated state emission probabilities
+	// use AT/GC background frequency instead of mononucleotide frequency since that is how it is described in the manuscript
+	params.aEmitUnrelated[0] = (at_content/2)*(at_content/2)+(at_content/2)*(at_content/2); // a:a, t:t
+	params.aEmitUnrelated[1] = (gc_content/2)*(gc_content/2)+(gc_content/2)*(gc_content/2); // c:c, g:g
+	params.aEmitUnrelated[2] = (at_content/2)*(gc_content/2)+(gc_content/2)*(at_content/2); //a:c, c:a, g:t, t:g
+	params.aEmitUnrelated[3] = params.aEmitUnrelated[2]; //a:g, g:a, c:t, t:c
+	params.aEmitUnrelated[4] = params.aEmitUnrelated[0]; //a:t, t:a
+	params.aEmitUnrelated[5] = params.aEmitUnrelated[1]; //g:c, c:g
+
+	// scale factor that makes the six substitution slots sum to 1 - (gap open + gap extend)
+	norm_factor = (1-(gO_unrelated+gE_unrelated))/(params.aEmitUnrelated[0] + params.aEmitUnrelated[1] +params.aEmitUnrelated[2] + params.aEmitUnrelated[3]
+		+ params.aEmitUnrelated[4] + params.aEmitUnrelated[5] );
+
+	//NORMALIZE the values
+	params.aEmitUnrelated[0] = params.aEmitUnrelated[0]*norm_factor;
+	params.aEmitUnrelated[1] = params.aEmitUnrelated[1]*norm_factor;
+	params.aEmitUnrelated[2] = params.aEmitUnrelated[2]*norm_factor;
+	params.aEmitUnrelated[3] = params.aEmitUnrelated[3]*norm_factor;
+	params.aEmitUnrelated[4] = params.aEmitUnrelated[4]*norm_factor;
+	params.aEmitUnrelated[5] = params.aEmitUnrelated[5]*norm_factor;
+	params.aEmitUnrelated[6] = gO_unrelated;// gap open
+	params.aEmitUnrelated[7] = 1 - (params.aEmitUnrelated[0] + params.aEmitUnrelated[1] + params.aEmitUnrelated[2] + params.aEmitUnrelated[3]
+		+ params.aEmitUnrelated[4] + params.aEmitUnrelated[5] + params.aEmitUnrelated[6]); // gap extend: the remainder, gE_unrelated up to rounding
+
+	//USE PRE-NORMALIZED VALUES (from the HOXD matrix)!!
+	double H_AA = 0.1723*2; //a:a, t:t
+	double H_CC = 0.1462*2; //c:c, g:g
+	double H_AC = 0.0180*4; //a:c, c:a, g:t, t:g
+	double H_AG = 0.0426*4; //a:g, g:a, c:t, t:c
+	double H_AT = 0.0186*2; //a:t, t:a
+	double H_CG = 0.0142*2; //g:c, c:g
+
+	// Homologous state emission probabilities
+	params.aEmitHomologous[0] = (at_content/0.525)*H_AA; // a:a, t:t  (0.525/0.475: presumably the AT/GC content behind the HOXD values - TODO confirm)
+	params.aEmitHomologous[1] = (gc_content/0.475)*H_CC; // c:c, g:g
+	params.aEmitHomologous[2] = H_AC; //a:c, c:a, g:t, t:g
+	params.aEmitHomologous[3] = H_AG; //a:g, g:a, c:t, t:c
+	params.aEmitHomologous[4] = (at_content/0.525)*H_AT; //a:t, t:a
+	params.aEmitHomologous[5] = (gc_content/0.475)*H_CG; //g:c, c:g
+
+	// scale factor that makes the six substitution slots sum to 1 - (gap open + gap extend)
+	norm_factor = (1-(gO_homologous+gE_homologous))/(params.aEmitHomologous[0] + params.aEmitHomologous[1] + params.aEmitHomologous[2] + params.aEmitHomologous[3]
+		+ params.aEmitHomologous[4] + params.aEmitHomologous[5]);
+
+	//NORMALIZE the values
+	params.aEmitHomologous[0] = params.aEmitHomologous[0]*norm_factor;
+	params.aEmitHomologous[1] = params.aEmitHomologous[1]*norm_factor;
+	params.aEmitHomologous[2] = params.aEmitHomologous[2]*norm_factor;
+	params.aEmitHomologous[3] = params.aEmitHomologous[3]*norm_factor;
+	params.aEmitHomologous[4] = params.aEmitHomologous[4]*norm_factor;
+	params.aEmitHomologous[5] = params.aEmitHomologous[5]*norm_factor;
+	params.aEmitHomologous[6] = gO_homologous;// gap open
+	params.aEmitHomologous[7] = 1 - (params.aEmitHomologous[0] + params.aEmitHomologous[1] + params.aEmitHomologous[2] + params.aEmitHomologous[3]
+		+ params.aEmitHomologous[4] + params.aEmitHomologous[5] + params.aEmitHomologous[6]); // gap extend: the remainder, gE_homologous up to rounding
+
+	// set state transition probabilities
+	params.iStartHomologous = 0.5;
+	params.iGoHomologous = 0.00001;
+	params.iGoUnrelated = 0.0000001;
+
+	params.iGoStopFromHomologous = 0.0000001;
+	params.iGoStopFromUnrelated = 0.0000001;
+
+	return params;
+}
+
+inline
+void adaptToPercentIdentity( Params& params, double pct_identity )
+{
+	// Rescales the homologous substitution emissions in place so that the two identity
+	// slots (0 and 1) sum to pct_identity of the non-gap mass; gap slots 6 and 7 are untouched.
+	if( pct_identity <= 0 || pct_identity > 1 )
+		throw "Bad pct identity";	// error condition
+	double* emit = params.aEmitHomologous;
+	// requested identity, expressed as a share of the non-gap probability mass
+	double target_id = pct_identity * (1.0 - emit[6] - emit[7]);
+	// identity currently implied by the table: a:a,t:t plus c:c,g:g
+	double current_id = emit[0] + emit[1];
+	double surplus = current_id - target_id;
+	// hand the surplus to the four mismatch slots in proportion to their current size ...
+	double mismatch_sum = emit[2] + emit[3] + emit[4] + emit[5];
+	for( int slot = 2; slot <= 5; ++slot )
+		emit[slot] += surplus * emit[slot] / mismatch_sum;
+	// ... and remove it from the two identity slots the same way
+	for( int slot = 0; slot <= 1; ++slot )
+		emit[slot] -= surplus * emit[slot] / current_id;
+}
+
+#endif // __HomologyHMM_parameters_h__
+
diff --git a/libMems/HybridAbstractMatch.h b/libMems/HybridAbstractMatch.h
new file mode 100644
index 0000000..d76f11a
--- /dev/null
+++ b/libMems/HybridAbstractMatch.h
@@ -0,0 +1,315 @@
+/*******************************************************************************
+ * $Id: HybridAbstractMatch.h,v 1.8 2004/02/27 23:08:55 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifndef __HybridAbstractMatch_h__
+#define __HybridAbstractMatch_h__
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libGenome/gnClone.h"
+#include "libGenome/gnDefs.h"
+#include "libMems/AbstractMatch.h"
+#include <vector>
+#include <limits>
+#include <cstring>
+namespace mems {
+
+/**
+ * The HybridAbstractMatch implements the AbstractMatch interface in a way
+ * that allows matches with a large SeqCount and low Multiplicity to be stored efficiently:
+ * up to FIXED_SEQ_COUNT entries live in inline arrays, any further ones in vectors.
+ */
+template< unsigned FIXED_SEQ_COUNT=2, class int64Alloc=std::allocator<int64>, class uintAlloc=std::allocator<uint> >
+class HybridAbstractMatch : public AbstractMatch {
+public:
+	HybridAbstractMatch() : m_seq_count(0)
+	{
+		memset(fixed_seq_ids, 0xFF, sizeof(fixed_seq_ids));	// all bits set: every slot reads as NO_SEQ
+		memset(fixed_starts, 0, sizeof(fixed_starts));
+	}
+	/**
+	 * Creates a new HybridAbstractMatch.
+	 * @param seq_count The total number of sequences in the alignment
+	 */
+	HybridAbstractMatch(const uint seq_count )
+		: m_seq_count(seq_count)
+	{
+		memset(fixed_seq_ids, 0xFF, sizeof(fixed_seq_ids));	// all bits set: every slot reads as NO_SEQ
+		memset(fixed_starts, 0, sizeof(fixed_starts));
+	}
+
+	// use compiler-generated copy constructor, assignment operator, and destructor
+
+	// see AbstractMatch base class documentation for these functions
+
+	int64 Start(uint seqI) const;
+	void SetStart(uint seqI, int64 startI);
+	uint Multiplicity() const
+	{
+		uint mult = 0;
+		for( size_t fI = 0; fI < FIXED_SEQ_COUNT; ++fI )
+			mult += fixed_seq_ids[fI] != NO_SEQ ? 1 : 0;	// count occupied fixed slots
+		return mult + (uint)seq_ids.size();
+	}
+	uint SeqCount() const{return m_seq_count;}
+	uint FirstStart() const;
+	virtual void Invert();
+
+	gnSeqI LeftEnd(uint seqI) const;
+	orientation Orientation(uint seqI) const;
+	void SetLeftEnd(uint seqI, gnSeqI position);
+	void SetOrientation(uint seqI, orientation o);
+
+	// these functions manipulate the start coordinates quickly
+	virtual void MoveStart(int64 move_amount);
+	virtual void MoveEnd(int64 move_amount);
+
+	virtual boolean operator==( const HybridAbstractMatch& ham ) const;
+	// storage position -> sequence id; positions use the numbering produced by SeqToIndex
+	virtual uint UsedSeq( uint seqI ) const {
+		if(seqI < FIXED_SEQ_COUNT) return fixed_seq_ids[seqI];	// may be NO_SEQ for an empty slot
+		return seq_ids[seqI - FIXED_SEQ_COUNT];	// overflow entries are numbered after the fixed slots
+	}
+
+protected:
+	uint m_seq_count;
+
+	static const uint NO_SEQ = UINT_MAX;	// marks an empty fixed slot / "sequence not present"
+
+	// storage for a fixed number of seqs
+	uint fixed_seq_ids[FIXED_SEQ_COUNT];
+	int64 fixed_starts[FIXED_SEQ_COUNT];
+
+	// storage for any number of seqs
+	std::vector<uint, uintAlloc > seq_ids;
+	std::vector<int64, int64Alloc > starts;
+
+	uint SeqToIndex( uint seqI ) const;
+
+	// for use by derived classes in order to swap contents
+	void swap( HybridAbstractMatch* other );
+};
+
+
+template< unsigned FIXED_SEQ_COUNT, class gnSeqIAlloc, class uintAlloc >
+void HybridAbstractMatch< FIXED_SEQ_COUNT, gnSeqIAlloc, uintAlloc >::swap( HybridAbstractMatch* other )
+{
+	// Exchanges the complete state of this match with *other.
+	std::swap( m_seq_count, other->m_seq_count );
+
+	// The fixed-slot arrays are exchanged element by element; the index is unsigned
+	// to match FIXED_SEQ_COUNT and avoid a signed/unsigned comparison.
+	for( unsigned i = 0; i < FIXED_SEQ_COUNT; ++i )
+	{
+		std::swap( fixed_seq_ids[i], other->fixed_seq_ids[i] );
+		std::swap( fixed_starts[i], other->fixed_starts[i] );
+	}
+
+	// The overflow vectors are exchanged wholesale
+	std::swap( seq_ids, other->seq_ids );
+	std::swap( starts, other->starts );
+}
+
+template< unsigned FIXED_SEQ_COUNT, class gnSeqIAlloc, class uintAlloc >
+uint HybridAbstractMatch< FIXED_SEQ_COUNT, gnSeqIAlloc, uintAlloc >::FirstStart() const
+{
+	// Smallest sequence id stored in this match; remains NO_SEQ when nothing is stored.
+	uint lowest = NO_SEQ;
+	for( std::size_t fI = 0; fI < FIXED_SEQ_COUNT; ++fI )
+		if( fixed_seq_ids[fI] < lowest ) lowest = fixed_seq_ids[fI];
+	for( std::size_t vI = 0; vI < seq_ids.size(); ++vI )
+		if( seq_ids[vI] < lowest ) lowest = seq_ids[vI];
+	return lowest;
+}
+
+template< unsigned FIXED_SEQ_COUNT, class gnSeqIAlloc, class uintAlloc >
+uint HybridAbstractMatch< FIXED_SEQ_COUNT, gnSeqIAlloc, uintAlloc >::SeqToIndex( uint seqI ) const
+{
+	// Maps a sequence id to its storage position: [0, FIXED_SEQ_COUNT) for a fixed
+	// slot, FIXED_SEQ_COUNT + k for entry k of seq_ids, NO_SEQ when the id is absent.
+	for( uint fixedI = 0; fixedI < FIXED_SEQ_COUNT; ++fixedI )
+	{
+		if( fixed_seq_ids[fixedI] == seqI )
+			return fixedI;
+	}
+
+	for( uint extraI = 0; extraI < seq_ids.size(); ++extraI )
+		if( seq_ids[extraI] == seqI )
+			return extraI + FIXED_SEQ_COUNT;
+	return NO_SEQ;
+}
+
+
+template< unsigned FIXED_SEQ_COUNT, class gnSeqIAlloc, class uintAlloc >
+int64 HybridAbstractMatch< FIXED_SEQ_COUNT, gnSeqIAlloc, uintAlloc >::Start(uint seqI) const
+{
+	// Signed start coordinate stored for seqI, or NO_MATCH when seqI is not part of this match.
+	const uint slot = SeqToIndex( seqI );
+	if( slot == NO_SEQ )
+		return NO_MATCH;
+	const bool in_fixed = slot < FIXED_SEQ_COUNT;
+	return in_fixed ? fixed_starts[slot] : starts[slot-FIXED_SEQ_COUNT];
+}
+
+
+template< unsigned FIXED_SEQ_COUNT, class gnSeqIAlloc, class uintAlloc >
+void HybridAbstractMatch< FIXED_SEQ_COUNT, gnSeqIAlloc, uintAlloc >::SetStart(uint seqI, int64 startI)
+{
+ uint posI = SeqToIndex( seqI ); // storage position of seqI, NO_SEQ if not stored yet
+ if( startI == NO_MATCH && posI == NO_SEQ )
+ return; // clearing a sequence that is not stored: nothing to do
+ if( posI == NO_SEQ )
+ { // new sequence: prefer the first empty fixed slot
+ for( size_t i = 0; i < FIXED_SEQ_COUNT; ++i )
+ if( fixed_seq_ids[i] == NO_SEQ )
+ {
+ posI = i;
+ break;
+ }
+ }
+ if( posI < FIXED_SEQ_COUNT )
+ { // fixed slot: NO_MATCH frees the slot, anything else (re)claims it for seqI
+ if( startI == NO_MATCH )
+ fixed_seq_ids[posI] = NO_SEQ;
+ else
+ fixed_seq_ids[posI] = seqI;
+ fixed_starts[posI] = startI;
+ }
+ else
+ { // overflow vectors
+ posI -= FIXED_SEQ_COUNT; // if posI is still NO_SEQ (no free fixed slot) this stays huge, so the push_back branch below runs
+ if( startI == NO_MATCH )
+ { // only reached for a sequence that is stored in the vectors: drop its entry
+ seq_ids.erase( seq_ids.begin() + posI );
+ starts.erase( starts.begin() + posI );
+ return;
+ }
+ if( posI >= seq_ids.size() )
+ { // not stored yet: append a new entry
+ seq_ids.push_back(seqI);
+ starts.push_back(startI);
+ }else{
+ starts[posI] = startI; // already stored: overwrite its start
+ }
+ }
+}
+
+
+template< unsigned FIXED_SEQ_COUNT, class gnSeqIAlloc, class uintAlloc >
+void HybridAbstractMatch< FIXED_SEQ_COUNT, gnSeqIAlloc, uintAlloc >::Invert()
+{
+	// Flips the sign of every stored start coordinate:
+	// the fixed slots first, then the overflow vector.
+	for( size_t fI = 0; fI != FIXED_SEQ_COUNT; ++fI ) fixed_starts[fI] = -fixed_starts[fI];
+	for( size_t vI = 0; vI != starts.size(); ++vI ) starts[vI] = -starts[vI];
+}
+
+
+
+template< unsigned FIXED_SEQ_COUNT, class gnSeqIAlloc, class uintAlloc >
+gnSeqI HybridAbstractMatch< FIXED_SEQ_COUNT, gnSeqIAlloc, uintAlloc >::LeftEnd(uint seqI) const
+{
+	// Magnitude of the start stored for seqI, or NO_MATCH when seqI is not part of this match.
+	const uint slot = SeqToIndex( seqI );
+	if( slot == NO_SEQ )
+		return NO_MATCH;
+	const int64 signed_start = slot < FIXED_SEQ_COUNT ? fixed_starts[slot] : starts[slot-FIXED_SEQ_COUNT];
+	return genome::absolut(signed_start);
+}
+
+
+template< unsigned FIXED_SEQ_COUNT, class gnSeqIAlloc, class uintAlloc >
+AbstractMatch::orientation HybridAbstractMatch< FIXED_SEQ_COUNT, gnSeqIAlloc, uintAlloc >::Orientation(uint seqI) const
+{
+	// Strand of seqI, read from the sign of its stored start; undefined when seqI is absent.
+	const uint slot = SeqToIndex( seqI );
+	if( slot == NO_SEQ )
+		return undefined;
+	const int64 signed_start = slot < FIXED_SEQ_COUNT ? fixed_starts[slot] : starts[slot-FIXED_SEQ_COUNT];
+	return signed_start < 0 ? reverse : forward;
+}
+
+
+template< unsigned FIXED_SEQ_COUNT, class gnSeqIAlloc, class uintAlloc >
+void HybridAbstractMatch< FIXED_SEQ_COUNT, gnSeqIAlloc, uintAlloc >::SetLeftEnd(uint seqI, gnSeqI position)
+{
+	// Remember the current strand (if any), store the new coordinate, then restore the strand.
+	orientation previous = undefined;
+	if( SeqToIndex( seqI ) != NO_SEQ && position != NO_MATCH ) previous = Orientation( seqI );
+	SetStart(seqI,position);
+	if( previous != undefined ) SetOrientation(seqI, previous);
+}
+
+template< unsigned FIXED_SEQ_COUNT, class gnSeqIAlloc, class uintAlloc >
+void HybridAbstractMatch< FIXED_SEQ_COUNT, gnSeqIAlloc, uintAlloc >::SetOrientation(uint seqI, orientation o)
+{
+	// Sets the strand of seqI by re-signing its stored start.  Passing "undefined"
+	// clears seqI via SetStart; a seqI that is not stored raises a string literal.
+	if( o == undefined )
+	{
+		SetStart(seqI, NO_MATCH);
+		return;
+	}
+	const uint slot = SeqToIndex( seqI );
+	if( slot == NO_SEQ )
+		throw "ArrayIndexOutOfBounds!\n";
+	const int sign = (o == reverse) ? -1 : 1;
+	if( slot < FIXED_SEQ_COUNT )
+		fixed_starts[slot] = genome::absolut(fixed_starts[slot]) * sign;
+	else
+		starts[slot-FIXED_SEQ_COUNT] = genome::absolut(starts[slot-FIXED_SEQ_COUNT]) * sign;
+}
+
+template< unsigned FIXED_SEQ_COUNT, class gnSeqIAlloc, class uintAlloc >
+void HybridAbstractMatch< FIXED_SEQ_COUNT, gnSeqIAlloc, uintAlloc >::MoveStart(int64 move_amount)
+{
+	// Adds move_amount to every strictly positive stored start;
+	// zero and negative entries are left as they are.
+	for( size_t fI=0; fI != FIXED_SEQ_COUNT; ++fI )
+		fixed_starts[fI] += fixed_starts[fI] > 0 ? move_amount : 0;
+	for( size_t vI=0; vI != starts.size(); ++vI )
+		starts[vI] += starts[vI] > 0 ? move_amount : 0;
+}
+
+template< unsigned FIXED_SEQ_COUNT, class gnSeqIAlloc, class uintAlloc >
+void HybridAbstractMatch< FIXED_SEQ_COUNT, gnSeqIAlloc, uintAlloc >::MoveEnd(int64 move_amount)
+{
+	// Subtracts move_amount from every strictly negative stored start;
+	// zero and positive entries are left as they are.
+	for( size_t fI=0; fI != FIXED_SEQ_COUNT; ++fI )
+		fixed_starts[fI] -= fixed_starts[fI] < 0 ? move_amount : 0;
+	for( size_t vI=0; vI != starts.size(); ++vI )
+		starts[vI] -= starts[vI] < 0 ? move_amount : 0;
+}
+
+template< unsigned FIXED_SEQ_COUNT, class gnSeqIAlloc, class uintAlloc >
+boolean HybridAbstractMatch< FIXED_SEQ_COUNT, gnSeqIAlloc, uintAlloc >::operator==( const HybridAbstractMatch< FIXED_SEQ_COUNT, gnSeqIAlloc, uintAlloc >& sam ) const
+{
+	// Equal when every sequence stored here has the same start in sam
+	// and both matches have the same multiplicity.
+	for( size_t fI = 0; fI < FIXED_SEQ_COUNT; ++fI )
+	{
+		const uint seq = fixed_seq_ids[fI];
+		if( seq != NO_SEQ && Start(seq) != sam.Start(seq) )
+			return false;
+	}
+	for( size_t vI = 0; vI < seq_ids.size(); ++vI )
+	{
+		const uint seq = seq_ids[vI];
+		if( seq != NO_SEQ && Start(seq) != sam.Start(seq) )
+			return false;
+	}
+	return Multiplicity() == sam.Multiplicity();
+}
+
+
+}
+
+#endif // __HybridAbstractMatch_h__
diff --git a/libMems/Interval.cpp b/libMems/Interval.cpp
new file mode 100644
index 0000000..b7060cb
--- /dev/null
+++ b/libMems/Interval.cpp
@@ -0,0 +1,25 @@
+/*******************************************************************************
+ * $Id: Interval.cpp,v 1.12 2004/03/01 02:40:08 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * Please see the file called COPYING for licensing, copying, and modification
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/Interval.h"
+#include "libMems/gnAlignedSequences.h"
+#include "libMems/GappedAlignment.h"
+#include "libMems/Match.h"
+#include <list>
+#include <iterator>
+
+using namespace std;
+using namespace genome;
+namespace mems {
+
+
+}
diff --git a/libMems/Interval.h b/libMems/Interval.h
new file mode 100644
index 0000000..724ca5f
--- /dev/null
+++ b/libMems/Interval.h
@@ -0,0 +1,958 @@
+/*******************************************************************************
+ * $Id: GenericInterval.h,v 1.4 2004/03/01 02:40:08 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifndef __Interval_h__
+#define __Interval_h__
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libGenome/gnClone.h"
+#include "libGenome/gnDebug.h"
+#include "libMems/SparseAbstractMatch.h"
+#include "libMems/gnAlignedSequences.h"
+#include "libMems/AbstractGappedAlignment.h"
+#include "libMems/Match.h"
+#include "libMems/GappedAlignment.h"
+#include <iostream>
+#include <vector>
+#include "libMems/twister.h"
+
+//#include "boost/pool/object_pool.hpp"
+
+namespace mems {
+
+// Adapter overloads that give AddGapMatches() one call syntax for inserting into a
+// list through either a forward or a reverse iterator.
+//
+// Reverse-iterator overload: inserts val at riter.base() and then advances riter.
+template< typename ListType, typename RanIt, typename Ty >
+void insert( ListType& the_list, std::reverse_iterator<RanIt>& riter, Ty& val )
+{
+ the_list.insert( riter.base(), val );
+ ++riter; // need to shift riter
+}
+// Forward-iterator overload: inserts val immediately before iter; iter is not modified.
+template< typename ListType, typename Ty >
+void insert( ListType& the_list, const typename ListType::iterator& iter, Ty& val )
+{
+ the_list.insert( iter, val );
+}
+
+
+/**
+ * An ordered collection of AbstractMatch objects that is itself exposed through the
+ * GappedBaseImpl interface.  Per-sequence coordinates and the alignment length of the
+ * interval are derived from the stored matches (CalculateOffset(),
+ * CalculateAlignmentLength()).
+ * The interval owns the match pointers it stores: the destructor and operator=
+ * call Free() on them, and matches added through the constructor or add() are
+ * duplicated with Copy() first.
+ */
+template< class GappedBaseImpl = AbstractGappedAlignment< SparseAbstractMatch<> > >
+class GenericInterval : public GappedBaseImpl
+{
+public:
+ GenericInterval(){};
+
+// GenericInterval( uint seq_count, gnSeqI aln_length) : GappedBaseImpl( seq_count, aln_length ){};
+
+ /**
+ * construct from a MatchList or a vector of pointers to AbstractMatches.
+ * Every match in [it_begin, it_end) is duplicated with Copy(); the caller keeps
+ * ownership of the originals.  *it_begin is dereferenced in the initializer list,
+ * before the range is examined, so the range must not be empty.
+ */
+ template<typename BidIt>
+ GenericInterval( BidIt it_begin, const BidIt& it_end ) : GappedBaseImpl( (*it_begin)->SeqCount(), 0 )
+ {
+ // NOTE(review): pos is never read after construction
+ std::vector<gnSeqI> pos((*it_begin)->SeqCount(), NO_MATCH);
+ for( ; it_begin != it_end; ++it_begin )
+ this->matches.push_back( (*it_begin)->Copy() );
+ CalculateOffset();
+ addUnalignedRegions();
+ CalculateAlignmentLength();
+ ValidateMatches();
+ }
+
+ /** Copy constructor: duplicates every match of iv (implemented through operator=) */
+ GenericInterval( const GenericInterval& iv );
+ /** Calls Free() on every stored match */
+ ~GenericInterval();
+ /** Frees the current matches and replaces them with copies of iv's matches */
+ GenericInterval& operator=( const GenericInterval& iv );
+
+ /** Returns a duplicate allocated with operator new */
+ GenericInterval* Clone() const;
+ /** Returns a duplicate obtained from m_allocateAndCopy() */
+ GenericInterval* Copy() const;
+ /** Releases this object through m_free() */
+ virtual void Free();
+
+ /**
+ * Set the matches in this interval *without* making a copy. The GenericInterval takes ownership of matches.
+ * matches[0] is read unconditionally, so the supplied container must not be empty.
+ * On return the supplied container has been cleared.
+ */
+ template< class MatchVector >
+ void SetMatches( MatchVector& matches )
+ {
+ // Set the SeqCount and other bits
+ // (re-initializes *this from a one-element range holding a dummy Match)
+ Match m( matches[0]->SeqCount() );
+ std::vector<AbstractMatch*> tmp(1, &m);
+ *this = GenericInterval( tmp.begin(), tmp.end() );
+
+ // then delete the allocated dummy match
+ for( std::size_t mI = 0; mI < this->matches.size(); mI++ )
+ this->matches[mI]->Free();
+
+ // now set the matches and update the interval data
+ // (the copy below overwrites the freed pointer left in slot 0)
+ this->matches.resize(matches.size());
+ std::copy(matches.begin(), matches.end(), this->matches.begin());
+// this->matches.insert( this->matches.end(), matches.begin(), matches.end() );
+ CalculateOffset();
+ addUnalignedRegions();
+ CalculateAlignmentLength();
+ ValidateMatches();
+
+ // finally, clear the user supplied matches to indicate that we own the memory
+ matches.clear();
+ }
+
+ /**
+ * Same as SetMatches(), except that addUnalignedRegions() is not called, so no
+ * gap matches are added between the supplied matches.  Like SetMatches(), this
+ * stores the supplied pointers directly (no copy) and clears the caller's container.
+ */
+ template< class MatchVector >
+ void SetMatchesTemp( MatchVector& matches )
+ {
+ // Set the SeqCount and other bits
+ Match m( matches[0]->SeqCount() );
+ std::vector<AbstractMatch*> tmp(1, &m);
+ *this = GenericInterval( tmp.begin(), tmp.end() );
+
+ // then delete the allocated dummy match
+ for( std::size_t mI = 0; mI < this->matches.size(); mI++ )
+ this->matches[mI]->Free();
+
+ // now set the matches and update the interval data
+ this->matches.resize(matches.size());
+ std::copy(matches.begin(), matches.end(), this->matches.begin());
+ CalculateOffset();
+ CalculateAlignmentLength();
+ ValidateMatches();
+
+ // finally, clear the user supplied matches to indicate that we own the memory
+ matches.clear();
+ }
+ /**
+ * Writes this GenericInterval to the specified output stream (e.g. cout).
+ */
+ template<typename BaseImpl> friend std::ostream& operator<<(std::ostream& os, const GenericInterval<BaseImpl>& iv); //write to source.
+
+ /**
+ * Reads a GenericInterval from the specified input stream (e.g. cin).
+ * NOTE(review): the interval is taken by const reference although this is a read
+ * operation; no definition is visible in this header -- confirm the intended signature.
+ */
+ template<typename BaseImpl> friend std::istream& operator>>(std::istream& is, const GenericInterval<BaseImpl>& iv); //read from source
+
+ // Inherited methods from AbstractMatch:
+ void Invert();
+ void CropStart(gnSeqI crop_amount);
+ void CropEnd(gnSeqI crop_amount);
+ void MoveStart(int64 move_amount);
+ void MoveEnd(int64 move_amount);
+
+ /** Recomputes left end, length and orientation of every sequence from the matches */
+ virtual void CalculateOffset();
+
+ /** Appends a copy of am; the interval's coordinates are not recalculated here */
+ void add( AbstractMatch* am ){ matches.push_back( am->Copy() ); }
+
+ /**
+ * Get a gnAlignedSequences object
+ * TODO: get rid of this
+ */
+ virtual void GetAlignedSequences( gnAlignedSequences& gnas, const std::vector< genome::gnSequence* >& seq_table ) const;
+
+ /** Fills align_matrix with one bitset per sequence, one bit per alignment column */
+ void GetAlignment( std::vector< bitset_t >& align_matrix ) const;
+
+ /** Remove amount positions of sequence seqI from the left / right end of the interval */
+ void CropLeft( gnSeqI amount, uint seqI );
+ void CropRight( gnSeqI amount, uint seqI );
+
+ /** Replaces the match list with a single GappedAlignment built from seq_align */
+ void SetAlignment( const std::vector< std::string >& seq_align );
+
+ // TODO: get rid of code that uses this hack...
+ const std::vector<AbstractMatch*>& GetMatches() const{ return matches; }
+ /** Hands the stored match pointers to the caller and leaves this interval empty */
+ void StealMatches( std::vector<AbstractMatch*>& matches );
+
+ /** marbles the gaps so that no sequence has more than "size" contiguous gaps */
+ void Marble( gnSeqI size );
+
+ void GetColumn( gnSeqI col, std::vector<gnSeqI>& pos, std::vector<bool>& column ) const;
+
+ bool IsGap( uint seq, gnSeqI col ) const;
+
+ /** self test code */
+ void ValidateMatches() const;
+
+ /** Exchanges the contents of this interval with other */
+ void swap( GenericInterval& other ){ swap(&other); }
+
+protected:
+ // for use by derived classes in order to swap contents
+ void swap( GenericInterval* other ){
+ std::swap( matches, other->matches );
+ GappedBaseImpl::swap( other );
+ }
+ // the matches making up this interval, in alignment order; owned by the interval
+ std::vector< AbstractMatch* > matches;
+private:
+ void addUnalignedRegions();
+ void FindMatchPos( uint seqI, gnSeqI pos, size_t& matchI, gnSeqI& match_pos );
+ void GetColumnAndMatch( gnSeqI col, std::vector<gnSeqI>& pos, std::vector<bool>& column, size_t& matchI, gnSeqI& match_col ) const;
+ void CalculateAlignmentLength();
+};
+
+typedef GenericInterval<> Interval;
+
+
+/**
+ * Returns a duplicate of this interval produced by m_allocateAndCopy().
+ * Presumably the result must be released with Free() rather than delete -- TODO confirm
+ * against the definition of m_allocateAndCopy().
+ */
+template<class GappedBaseImpl>
+GenericInterval<GappedBaseImpl>* GenericInterval<GappedBaseImpl>::Copy() const
+{
+ return m_allocateAndCopy( *this );
+}
+/** Releases this interval by handing it to m_free(); the object must not be used afterwards. */
+template<class GappedBaseImpl>
+void GenericInterval<GappedBaseImpl>::Free()
+{
+ m_free(this);
+}
+
+/** Destructor: calls Free() on every match pointer this interval still holds. */
+template<class GappedBaseImpl>
+GenericInterval<GappedBaseImpl>::~GenericInterval()
+{
+	std::vector< AbstractMatch* >::iterator cur = matches.begin();
+	for( ; cur != matches.end(); ++cur )
+		(*cur)->Free();
+}
+
+/**
+ * Transfers the stored match pointers to the caller without copying them.
+ * @param matches  overwritten with this interval's match pointers
+ * Afterwards the interval holds no matches: every sequence gets left end NO_MATCH
+ * and length 0, and the alignment length is 0.
+ */
+template<class GappedBaseImpl>
+void GenericInterval<GappedBaseImpl>::StealMatches( std::vector<AbstractMatch*>& matches ){
+	// hand over the pointers, then forget them locally so the destructor will not Free() them
+	matches = this->matches;
+	this->matches.clear();
+
+	// with no matches left, no sequence is defined in this interval any more
+	uint seq = 0;
+	while( seq < this->SeqCount() )
+	{
+		this->SetLeftEnd( seq, NO_MATCH );
+		this->SetLength( 0, seq );
+		++seq;
+	}
+	this->SetAlignmentLength(0);
+}
+
+/**
+ * Copy constructor.  The base class and the match list are default-constructed and
+ * all copying is delegated to operator=, which duplicates each match of iv with Copy().
+ */
+template<class GappedBaseImpl>
+GenericInterval<GappedBaseImpl>::GenericInterval( const GenericInterval<GappedBaseImpl>& iv )
+{
+ *this = iv;
+}
+
+/**
+ * Copy assignment.  Assigns the base-class part, frees the matches currently held
+ * and replaces them with copies (Copy()) of the matches held by iv.
+ * @param iv  interval to copy from
+ * @return    *this
+ */
+template<class GappedBaseImpl>
+GenericInterval<GappedBaseImpl>& GenericInterval<GappedBaseImpl>::operator=( const GenericInterval& iv )
+{
+	// Self-assignment must be a no-op: without this guard the loop below would
+	// free and clear our own matches and then "copy" from the emptied list.
+	if( this == &iv )
+		return *this;
+
+	GappedBaseImpl::operator=( iv );
+	// release what we own before taking copies of the source's matches
+	for( std::size_t mI = 0; mI < matches.size(); mI++ )
+		matches[mI]->Free();
+	matches.clear();
+	for( std::size_t mI = 0; mI < iv.matches.size(); mI++ )
+		matches.push_back( iv.matches[mI]->Copy() );
+	return *this;
+}
+
+/**
+ * Returns a copy of this interval allocated with operator new (contrast with Copy(),
+ * which goes through m_allocateAndCopy()).
+ */
+template<class GappedBaseImpl>
+GenericInterval<GappedBaseImpl>* GenericInterval<GappedBaseImpl>::Clone() const
+{
+ return new GenericInterval( *this );
+}
+
+
+static bool debug_interval = false;
+
+/**
+ * Self test, active only when debug_interval is true.
+ * For every sequence it walks the matches in sequence order (front to back for
+ * forward orientation, back to front for reverse) and checks that each match
+ * defined in that sequence starts exactly one past the right end of the previous
+ * one, beginning at the interval's own left end.  Problems are reported on
+ * std::cerr and genome::breakHere() is called; nothing is thrown or repaired.
+ */
+template<class GappedBaseImpl>
+void GenericInterval<GappedBaseImpl>::ValidateMatches() const
+{
+	// disabled, or nothing to check (an interval without matches is accepted silently)
+	if( !debug_interval || matches.size() == 0 )
+		return;
+
+	for( uint seqI = 0; seqI < matches[0]->SeqCount(); ++seqI )
+	{
+		// left end the next match defined in seqI is expected to have
+		gnSeqI expected_lend = this->LeftEnd(seqI);
+
+		if( this->Orientation(seqI) == AbstractMatch::forward )
+		{
+			for( size_t mI = 0; mI < matches.size(); ++mI )
+			{
+				if( matches[mI]->LeftEnd(seqI) == NO_MATCH )
+					continue;
+				if( expected_lend != matches[mI]->LeftEnd(seqI) )
+				{
+					std::cerr << "iv broken\n";
+					std::cerr << "seqI: " << seqI << "\t prev_rend: " << expected_lend << std::endl;
+					std::cerr << "mI: " << mI << "\tlend: " << matches[mI]->LeftEnd(seqI) << std::endl;
+					genome::breakHere();
+				}
+				expected_lend = matches[mI]->RightEnd(seqI) + 1;
+			}
+		}
+		else if( this->Orientation(seqI) == AbstractMatch::reverse )
+		{
+			// walk from the back; rI is one past the index being examined
+			for( size_t rI = matches.size(); rI > 0; rI-- )
+			{
+				if( matches[rI-1]->LeftEnd(seqI) == NO_MATCH )
+					continue;
+				if( expected_lend != matches[rI-1]->LeftEnd(seqI) )
+				{
+					std::cerr << "iv broken 2\n";
+					genome::breakHere();
+				}
+				expected_lend = matches[rI-1]->RightEnd(seqI) + 1;
+			}
+		}
+
+		// a sequence with a defined orientation must cover at least one position
+		if( this->Orientation(seqI) != AbstractMatch::undefined && this->Length(seqI) == 0 )
+		{
+			genome::breakHere();
+			std::cerr << "ERROR: confused interval\n";
+		}
+	}
+}
+
+/**
+ * Locates alignment column col within the concatenated matches of this interval.
+ * @param col        column to look up, counted from the start of the interval
+ * @param pos        (out) one entry per sequence; for sequences the containing match reports
+ *                   a position for, that position, otherwise a running position derived
+ *                   from the interval's ends (NO_MATCH-based for undefined sequences)
+ * @param column     (out) reset to all false, then filled in by GetColumn() of the
+ *                   match that contains col
+ * @param matchI     (out) index of the match that contains col
+ * @param match_col  (out) offset of col inside that match
+ *
+ * matchI and match_col are assigned only when a containing match is found.  If the
+ * loop runs off the end of the match list (e.g. col equals the summed alignment
+ * length) both keep whatever value the caller passed in.
+ */
+template<class GappedBaseImpl>
+void GenericInterval<GappedBaseImpl>::GetColumnAndMatch( gnSeqI col, std::vector<gnSeqI>& pos, std::vector<bool>& column, size_t& matchI, gnSeqI& match_col ) const
+{
+ // bail when the appropriate match is found
+ gnSeqI col_pos = 0;
+ size_t mI = 0;
+ pos.clear();
+ // seed pos with the coordinate each sequence has at the start of the interval:
+ // the left end for forward sequences, one past the right end otherwise
+ for( uint seqI = 0; seqI < this->SeqCount(); ++seqI )
+ {
+ if( this->LeftEnd(seqI) == NO_MATCH )
+ pos.push_back(NO_MATCH);
+ else if( this->Orientation(seqI) == AbstractMatch::forward )
+ pos.push_back(this->LeftEnd(seqI));
+ else
+ pos.push_back(this->RightEnd(seqI)+1);
+ }
+
+ column = std::vector<bool>(this->SeqCount(), false);
+
+ for( ; mI < matches.size(); ++mI )
+ {
+ uint seqI = 0;
+
+ // diff = number of columns of this match that lie before col (the whole match if col is beyond it)
+ gnSeqI diff = matches[mI]->AlignmentLength();
+ diff = col_pos + diff <= col ? diff : col - col_pos;
+
+ for( seqI = 0; seqI < this->SeqCount(); ++seqI )
+ if( this->Orientation(seqI) == AbstractMatch::forward )
+ pos[seqI] += diff;
+ else if( this->Orientation(seqI) == AbstractMatch::reverse )
+ pos[seqI] -= diff;
+
+ col_pos += diff;
+
+ // col falls strictly inside this match: ask the match itself for the column
+ if( col_pos >= col && diff < matches[mI]->AlignmentLength() )
+ {
+ std::vector<gnSeqI> m_pos;
+ matches[mI]->GetColumn( diff, m_pos, column );
+ for( uint seqI = 0; seqI < this->SeqCount(); ++seqI )
+ if( m_pos[seqI] != NO_MATCH )
+ pos[seqI] = m_pos[seqI];
+ matchI = mI;
+ match_col = diff;
+ break;
+ }
+ }
+}
+
+/**
+ * Fills the stretches of sequence seqI that are not covered by any match in
+ * [first,last) with single-sequence "gap" Match objects, so that the matches
+ * defined in seqI abut one another from left_end to right_end.
+ *
+ * @param the_list    list the new gap matches are inserted into
+ * @param first,last  range to scan; forward or reverse iterators over the_list
+ * @param seqI        sequence being processed
+ * @param left_end    first coordinate of the enclosing interval in seqI (inclusive)
+ * @param right_end   last coordinate of the enclosing interval in seqI (inclusive)
+ * @param seq_orient  orientation assigned to every gap match created here
+ * @param seq_count   sequence count each new Match is constructed with
+ *
+ * Gap matches found during the scan are inserted only after the scan has finished,
+ * each at the position of the match that follows the gap.
+ */
+template<typename ListType, typename Iter>
+void AddGapMatches( ListType& the_list, const Iter& first, const Iter& last,
+	uint seqI, int64 left_end, int64 right_end,
+	AbstractMatch::orientation seq_orient, uint seq_count )
+{
+	Iter iter = first;
+	int64 pos = left_end-1;	// last coordinate of seqI accounted for so far
+	//MatchList& tmp_list;
+	std::vector< std::pair<Match*,Iter> > insert_pos;
+	for( ; iter != last; ++iter )
+	{
+		if( (*iter)->LeftEnd(seqI) != NO_MATCH )
+		{
+			// number of uncovered positions between pos and the start of this match
+			gnSeqI len = (*iter)->LeftEnd(seqI)-pos-1;
+
+			//tjt: there are perfectly valid chains that blow up when this is enabled
+			//i.e:
+			// <----c1----><----d1---->
+			// <--a1---><---b1--->
+			// pos would get set to b1->RightEnd() since diff between a1 & b1 == 0
+			// but then c1->LeftEnd < pos, so genome::breakHere() gets called
+			// this is because SetMatches() gets called before finalize(), but should it??
+
+			// a huge value here means the subtraction above wrapped around
+			if( len > 4000000000u )
+			{
+				std::cerr << "triplebogus interval data\n";
+				std::cerr << "(*iter)->LeftEnd(" << seqI << "): " << (*iter)->LeftEnd(seqI) << std::endl;
+				std::cerr << "pos: " << pos << std::endl;
+				genome::breakHere();
+			}
+
+			if( len > 0 )
+			{
+				Match tmp(seq_count);
+				Match* new_m = tmp.Copy();
+				new_m->SetLeftEnd(seqI, pos + 1);
+				new_m->SetOrientation(seqI, seq_orient);
+				new_m->SetLength(len);
+				pos = (*iter)->RightEnd(seqI);
+				//insert(the_list, iter, new_m); // this may move iter
+				//tmp_list.push_back(new_m);
+				insert_pos.push_back(make_pair(new_m,iter));
+			}
+			else
+				pos = (*iter)->RightEnd(seqI);
+		}
+	}
+	for ( uint i = 0; i < insert_pos.size(); i++)
+	{
+		insert(the_list, insert_pos.at(i).second, insert_pos.at(i).first);
+	}
+	// Cover whatever remains between the last match and the interval's right end.
+	// Both ends are inclusive, so the uncovered stretch pos+1 .. right_end holds
+	// right_end-pos positions (the previous right_end-pos-1 was one short), and the
+	// gap gets the same orientation as the gaps created inside the loop.
+	// A right_end that lies before pos describes no gap at all and is skipped.
+	if( right_end > pos )
+	{
+		Match tmp(seq_count);
+		Match* new_m = tmp.Copy();
+		new_m->SetLeftEnd(seqI, pos+1);
+		new_m->SetOrientation(seqI, seq_orient);
+		new_m->SetLength(right_end-pos);
+		insert(the_list, iter, new_m);
+	}
+}
+
+// The best steaks are well marbled
+/**
+ * Breaks long single-sequence matches into pieces of at most `size` columns and then
+ * randomly interleaves the single-sequence pieces of the two sequences between the
+ * matches that contain every sequence ("anchors").
+ *
+ * Only intervals with at most two sequences are supported: a string literal
+ * (const char*) is thrown when SeqCount() > 2.  Nothing is done when Multiplicity() < 2.
+ * The interleaving order is drawn from RandTwisterDouble(), so results depend on the
+ * state of that generator.
+ */
+template<class GappedBaseImpl>
+void GenericInterval<GappedBaseImpl>::Marble( gnSeqI size )
+{
+ if( this->SeqCount() > 2 )
+ throw "I can't handle that many at once\n";
+ if( this->Multiplicity() < 2 )
+ return; // can't marble unless there are at least two seqs
+
+ // first break up all the pieces
+ std::list<AbstractMatch*> mlist;
+ mlist.insert( mlist.end(), matches.begin(), matches.end() );
+ std::list<AbstractMatch*>::iterator m_iter = mlist.begin();
+ for(; m_iter != mlist.end(); ++m_iter )
+ {
+ // only single-sequence matches longer than size are split
+ if( (*m_iter)->Multiplicity() != 1 || (*m_iter)->AlignmentLength() <= size )
+ continue;
+ // which seq are we working with?
+ // NOTE(review): seqI is computed here but not used afterwards in this loop
+ uint seqI = 0;
+ for( ; seqI < (*m_iter)->SeqCount(); seqI++ )
+ if( (*m_iter)->LeftEnd(seqI) != NO_MATCH )
+ break;
+ // peel the first `size` columns off into a new match placed in front of the
+ // remainder; m_iter then points at the new piece, so the loop increment returns
+ // to the remainder and splits it again if it is still too long
+ AbstractMatch* left_iv = (*m_iter)->Copy();
+ left_iv->CropEnd( left_iv->AlignmentLength() - size );
+ (*m_iter)->CropStart( size );
+ m_iter = mlist.insert( m_iter, left_iv );
+ }
+ matches.clear();
+ matches.insert( matches.end(), mlist.begin(), mlist.end() );
+ this->ValidateMatches();
+
+ // now interleave the gaps
+ // seq_iter[s] walks the matches that are defined in sequence s;
+ // anchor walks the matches whose multiplicity equals SeqCount()
+ std::vector< std::vector<AbstractMatch*>::iterator > seq_iter( this->SeqCount(), matches.begin() );
+ std::vector< AbstractMatch* > interleaved(matches.size());
+ std::vector<AbstractMatch*>::iterator anchor = matches.begin();
+ for( uint seqI = 0; seqI < this->SeqCount(); seqI++ )
+ {
+ if( this->LeftEnd(seqI) == NO_MATCH )
+ continue;
+ for( ; seq_iter[seqI] != matches.end() && (*seq_iter[seqI])->LeftEnd(seqI) == NO_MATCH; ++seq_iter[seqI] );
+ }
+ for( ; anchor != matches.end() && (*anchor)->Multiplicity() < this->SeqCount(); ++anchor );
+ size_t cur = 0; // next free slot in interleaved
+ while(true)
+ {
+ // increment anchor if an iter has caught up to it...
+ // (the inner for breaks out after handling an anchor, which leaves
+ // seqI < SeqCount() and makes the do-while run again)
+ uint seqI = 0;
+ do{
+ for( seqI = 0; seqI < this->SeqCount(); seqI++ )
+ {
+ if( seq_iter[seqI] == anchor && anchor != matches.end() )
+ {
+ for( uint seqJ = 0; seqJ < this->SeqCount(); seqJ++ )
+ {
+ // add anything in seq_iter[seqJ]
+ while( seq_iter[seqJ] != anchor )
+ {
+ interleaved[cur++] = *(seq_iter[seqJ]);
+ for( ++seq_iter[seqJ]; seq_iter[seqJ] != matches.end() && (*seq_iter[seqJ])->LeftEnd(seqJ) == NO_MATCH; ++seq_iter[seqJ] );
+ }
+ // don't end on an anchor
+ for( ++seq_iter[seqJ]; seq_iter[seqJ] != matches.end() && (*seq_iter[seqJ])->LeftEnd(seqJ) == NO_MATCH; ++seq_iter[seqJ] );
+ }
+ // increment anchor
+ interleaved[cur++] = *anchor;
+ for( ++anchor; anchor != matches.end() && (*anchor)->Multiplicity() < this->SeqCount(); ++anchor );
+
+ break;
+ }
+ }
+ }while( seqI < this->SeqCount() );
+
+ // distance from each sequence's cursor to the next anchor (or to the end);
+ // indices 0 and 1 are used directly here
+ size_t diff1 = anchor - seq_iter[0];
+ size_t diff2 = anchor - seq_iter[1];
+ if( diff1 == 0 && diff2 == 0 )
+ break;
+ // sample from a binomial with p(success) = diff1 / diff1+diff2
+ // NOTE(review): the test below compares samp against a fixed .5 rather than
+ // diff1/(diff1+diff2) -- confirm which is intended
+// double samp = ((double)rand())/((double)RAND_MAX);
+ double samp = RandTwisterDouble();
+ // add one of the intervals and move on to the next...
+ if( diff2 == 0 || (samp < .5 && diff1 > 0) )
+ {
+ interleaved[cur++] = *(seq_iter[0]);
+ for( ++seq_iter[0]; seq_iter[0] != matches.end() && (*seq_iter[0])->LeftEnd(0) == NO_MATCH; ++seq_iter[0] );
+ }else{
+ interleaved[cur++] = *(seq_iter[1]);
+ for( ++seq_iter[1]; seq_iter[1] != matches.end() && (*seq_iter[1])->LeftEnd(1) == NO_MATCH; ++seq_iter[1] );
+ }
+ }
+ matches = interleaved;
+ this->ValidateMatches();
+}
+
+
+/**
+ * Removes the first crop_amount alignment columns from this interval.
+ * Matches that lie entirely inside the cropped region are freed and removed; the
+ * match containing the cut is cropped in place.  Coordinates are recalculated
+ * afterwards.
+ * @param crop_amount  number of columns to remove; 0 is a no-op
+ * @throws genome::SeqIndexOutOfBounds (via Throw_gnEx) if crop_amount exceeds AlignmentLength()
+ */
+template<class GappedBaseImpl>
+void GenericInterval<GappedBaseImpl>::CropStart(gnSeqI crop_amount)
+{
+	if( crop_amount > this->AlignmentLength() )
+		Throw_gnEx( genome::SeqIndexOutOfBounds() );
+	if( crop_amount == 0 )
+		return;
+
+	// Cropping the entire interval: no match contains column crop_amount, so
+	// GetColumnAndMatch() would leave matchI/match_col unassigned.  Drop every
+	// match instead, which is what CropEnd() does for a full crop.
+	if( crop_amount == this->AlignmentLength() )
+	{
+		for( size_t mI = 0; mI < matches.size(); ++mI )
+			matches[mI]->Free();
+		matches.clear();
+		this->CalculateOffset();
+		this->ValidateMatches();
+		return;
+	}
+
+	std::vector<bool> col;
+	std::vector<gnSeqI> pos;
+	size_t matchI = 0;
+	gnSeqI match_col = 0;
+	this->GetColumnAndMatch( crop_amount, pos, col, matchI, match_col );
+
+	// delete everything before matchI
+	for( size_t mI = 0; mI < matchI; ++mI )
+		matches[mI]->Free();
+	matches.erase(matches.begin(), matches.begin()+matchI);
+
+	// crop from within matchI (now at the front of the list)
+	matches[0]->CropStart(match_col);
+
+	this->CalculateOffset();
+	this->ValidateMatches();
+}
+
+/**
+ * Removes the last crop_amount alignment columns from this interval.
+ * Matches lying entirely inside the cropped region are freed and removed; a match
+ * that straddles the cut is cropped in place.  Coordinates are recalculated afterwards.
+ * @param crop_amount  number of columns to remove; 0 is a no-op
+ * @throws genome::SeqIndexOutOfBounds (via Throw_gnEx) if crop_amount exceeds AlignmentLength()
+ */
+template<class GappedBaseImpl>
+void GenericInterval<GappedBaseImpl>::CropEnd(gnSeqI crop_amount)
+{
+	if( crop_amount > this->AlignmentLength() )
+		Throw_gnEx( genome::SeqIndexOutOfBounds() );
+	if( crop_amount == 0 )
+		return;
+
+	// find the match holding the first column that is going to be discarded
+	std::vector<gnSeqI> unused_pos;
+	std::vector<bool> unused_col;
+	size_t matchI = 0;
+	gnSeqI match_col;
+	this->GetColumnAndMatch( this->AlignmentLength()-crop_amount, unused_pos, unused_col, matchI, match_col );
+
+	// a cut inside matchI keeps the front part of that match;
+	// a cut on its very first column discards the match as a whole
+	const bool cut_inside_match = match_col != 0;
+	const size_t first_dropped = cut_inside_match ? matchI + 1 : matchI;
+
+	for( size_t mI = first_dropped; mI < matches.size(); ++mI )
+		matches[mI]->Free();
+	matches.erase( matches.begin() + first_dropped, matches.end() );
+
+	// trim the straddling match down to its first match_col columns
+	if( matches.size() > 0 && cut_inside_match )
+	{
+		AbstractMatch* last_kept = matches.back();
+		last_kept->CropEnd( last_kept->AlignmentLength() - match_col );
+	}
+
+	this->CalculateOffset();
+	this->ValidateMatches();
+}
+
+/**
+ * Builds a bit matrix for the whole interval by concatenating the matrices of the
+ * individual matches.
+ * @param align_matrix  (out) replaced by SeqCount() bitsets of AlignmentLength() bits;
+ *                      a bit is set where the corresponding match's own matrix has it set
+ */
+template<class GappedBaseImpl>
+void GenericInterval<GappedBaseImpl>::GetAlignment( std::vector< bitset_t >& align_matrix ) const
+{
+	// start with every bit cleared
+	align_matrix = std::vector< bitset_t >( this->SeqCount(), bitset_t(this->AlignmentLength(),false) );
+
+	gnSeqI col_offset = 0;	// column at which the current match begins
+	for( uint matchI = 0; matchI < matches.size(); ++matchI )
+	{
+		AbstractMatch* cur_match = matches[matchI];
+		std::vector< bitset_t > match_rows;
+		cur_match->GetAlignment( match_rows );
+
+		for( uint seqI = 0; seqI < this->SeqCount(); ++seqI )
+		{
+			const bool seq_present = cur_match->LeftEnd(seqI) != NO_MATCH && cur_match->Length(seqI) != 0;
+			if( !seq_present )
+				continue;
+
+			// copy exactly Length(seqI) set bits, shifted to this match's column offset
+			const gnSeqI len = cur_match->Length(seqI);
+			bitset_t::size_type bit = match_rows[seqI].find_first();
+			for( size_t copied = 0; copied < len; ++copied )
+			{
+				align_matrix[seqI].set( col_offset + bit );
+				bit = match_rows[seqI].find_next(bit);
+			}
+		}
+		col_offset += cur_match->AlignmentLength();
+	}
+}
+
+/**
+ * Removes `amount` positions of sequence seqI from the left (low-coordinate) end of
+ * the interval, discarding the matches that fall entirely inside the removed part.
+ * @param amount  number of positions of seqI to remove; 0 is a no-op
+ * @param seqI    sequence the amount is measured in; no-op if undefined in this interval
+ * @throws genome::SeqIndexOutOfBounds (via Throw_gnEx) if amount exceeds Length(seqI)
+ *
+ * NOTE(review): when amount == Length(seqI), FindMatchPos() appears to consume every
+ * match and hand back an index that is not a valid element of matches -- confirm that
+ * callers never request a full crop.
+ */
+template<class GappedBaseImpl>
+void GenericInterval<GappedBaseImpl>::CropLeft( gnSeqI amount, uint seqI )
+{
+ if( amount > this->Length(seqI) )
+ Throw_gnEx( genome::SeqIndexOutOfBounds() );
+ if( this->LeftEnd(seqI) == NO_MATCH || amount == 0 )
+ return;
+
+ // for debugging
+ gnSeqI pre_len = this->Length(seqI);
+ gnSeqI pre_lend = this->LeftEnd(seqI);
+
+ // find the match containing the cut and crop inside it; a match whose orientation
+ // differs from the interval's is cropped from its other end
+ gnSeqI match_pos;
+ size_t mI;
+ this->FindMatchPos(seqI, amount, mI, match_pos);
+ if( matches[mI]->Orientation(seqI) == this->Orientation(seqI) )
+ matches[mI]->CropLeft(match_pos, seqI);
+ else
+ matches[mI]->CropRight(match_pos, seqI);
+
+ if( matches[mI]->Length(seqI) == 0 )
+ std::cerr << "Big fat zero 1\n";
+
+ // get rid of everything to the left of mI
+ // (in sequence coordinates: the matches before mI for a forward interval,
+ // the matches after mI for a reverse one)
+ if( this->Orientation(seqI) == AbstractMatch::forward )
+ {
+ for( size_t m = 0; m < mI; m++ )
+ matches[m]->Free();
+ matches.erase(matches.begin(), matches.begin()+mI);
+ }else{
+ for( size_t m = mI+1; m < matches.size(); m++ )
+ matches[m]->Free();
+ matches.erase(matches.begin()+mI+1, matches.end());
+ }
+
+ this->CalculateOffset();
+ this->ValidateMatches();
+
+ // sanity check: the length in seqI must have shrunk by exactly amount
+ if( this->Length(seqI) != pre_len - amount )
+ {
+ std::cerr << "Error intercroplef\n";
+ std::cerr << "pre len: " << pre_len << std::endl;
+ std::cerr << "pre lend: " << pre_lend << std::endl;
+ std::cerr << "amount: " << amount << std::endl;
+ std::cerr << "LeftEnd(seqI) " << this->LeftEnd(seqI) << std::endl;
+ std::cerr << "Length(seqI) " << this->Length(seqI) << std::endl;
+ std::cerr << "AlignmentLength() " << this->AlignmentLength() << std::endl;
+ genome::breakHere();
+ }
+}
+
+/**
+ * Removes `amount` positions of sequence seqI from the right (high-coordinate) end of
+ * the interval, discarding the matches that fall entirely inside the removed part.
+ * @param amount  number of positions of seqI to remove; 0 is a no-op
+ * @param seqI    sequence the amount is measured in; no-op if undefined in this interval
+ * @throws genome::SeqIndexOutOfBounds (via Throw_gnEx) if amount exceeds Length(seqI)
+ */
+template<class GappedBaseImpl>
+void GenericInterval<GappedBaseImpl>::CropRight( gnSeqI amount, uint seqI )
+{
+ if( amount > this->Length(seqI) )
+ Throw_gnEx( genome::SeqIndexOutOfBounds() );
+
+ if( this->LeftEnd(seqI) == NO_MATCH || amount == 0 )
+ return;
+
+ // for debugging
+ gnSeqI pre_len = this->Length(seqI);
+ gnSeqI pre_lend = this->LeftEnd(seqI);
+
+ // the cut is located by the number of positions that are kept on the left
+ gnSeqI left_amount = this->Length(seqI) - amount;
+ gnSeqI match_pos;
+ size_t mI;
+ this->FindMatchPos(seqI, left_amount, mI, match_pos);
+ if( matches[mI]->Orientation(seqI) == this->Orientation(seqI) )
+ matches[mI]->CropRight(matches[mI]->Length(seqI)-match_pos, seqI);
+ else
+ matches[mI]->CropLeft(matches[mI]->Length(seqI)-match_pos, seqI);
+
+ // mI is unsigned: for a forward interval with mI == 0 the -1 wraps around, and the
+ // mI+1 used below wraps back to 0, so every match is removed
+ if( matches[mI]->Length(seqI) == 0 )
+ mI += this->Orientation(seqI) == AbstractMatch::forward ? -1 : 1; // delete this match too
+
+ // get rid of everything beyond mI in sequence coordinates
+ // (the matches after mI for a forward interval, the matches before mI for a reverse one)
+ if( this->Orientation(seqI) == AbstractMatch::forward )
+ {
+ for( size_t m = mI+1; m < matches.size(); m++ )
+ matches[m]->Free();
+ matches.erase(matches.begin()+(mI+1), matches.end());
+ }else{
+ for( size_t m = 0; m < mI; m++ )
+ matches[m]->Free();
+ matches.erase(matches.begin(), matches.begin()+mI);
+ }
+
+ this->CalculateOffset();
+ this->ValidateMatches();
+
+ // sanity check: the length in seqI must have shrunk by exactly amount
+ if( this->Length(seqI) != pre_len - amount )
+ {
+ std::cerr << "Error intercropright\n";
+ std::cerr << "pre len: " << pre_len << std::endl;
+ std::cerr << "pre lend: " << pre_lend << std::endl;
+ std::cerr << "amount: " << amount << std::endl;
+ std::cerr << "LeftEnd(seqI) " << this->LeftEnd(seqI) << std::endl;
+ std::cerr << "Length(seqI) " << this->Length(seqI) << std::endl;
+ std::cerr << "AlignmentLength() " << this->AlignmentLength() << std::endl;
+ genome::breakHere();
+ }
+}
+
+/** Applies MoveStart(move_amount) to the base-class part and then to every stored match. */
+template<class GappedBaseImpl>
+void GenericInterval<GappedBaseImpl>::MoveStart(int64 move_amount)
+{
+	GappedBaseImpl::MoveStart(move_amount);
+	std::vector< AbstractMatch* >::iterator cur = matches.begin();
+	for( ; cur != matches.end(); ++cur )
+		(*cur)->MoveStart(move_amount);
+}
+
+/** Applies MoveEnd(move_amount) to the base-class part and then to every stored match. */
+template<class GappedBaseImpl>
+void GenericInterval<GappedBaseImpl>::MoveEnd(int64 move_amount)
+{
+	GappedBaseImpl::MoveEnd(move_amount);
+	std::vector< AbstractMatch* >::iterator cur = matches.begin();
+	for( ; cur != matches.end(); ++cur )
+		(*cur)->MoveEnd(move_amount);
+}
+
+
+/**
+ * Computes, for every sequence, the left end, length and orientation spanned by an
+ * ordered list of matches.
+ * @param matches       ordered matches; must not be empty (matches.front() is read)
+ * @param left_ends     (out) resized to the sequence count; NO_MATCH where no match defines the sequence
+ * @param lengths       (out) resized to the sequence count; 0 where undefined
+ * @param orientations  (out) resized to the sequence count; true where a forward match
+ *                      supplied the left end, false otherwise
+ *
+ * Four passes are made.  Forward sequences take their left end from the first match
+ * that defines them and their right end from the last; reverse sequences take their
+ * left end from the last match and their right end from the first.  Each pass stops
+ * early once no sequence is left unresolved.
+ */
+template< class MatchVector >
+void FindBoundaries( const MatchVector& matches, std::vector<gnSeqI>& left_ends, std::vector<gnSeqI>& lengths, std::vector<bool>& orientations )
+{
+ uint seqI;
+ boolean zero_exists = false; // true while some sequence is still unresolved in the current pass
+ uint seq_count = matches.front()->SeqCount();
+ left_ends = std::vector<gnSeqI>( seq_count, NO_MATCH );
+ lengths = std::vector<gnSeqI>( seq_count, 0 );
+ orientations = std::vector<bool>( seq_count, false );
+
+ // find leftend in each forward sequence
+ uint matchI = 0;
+ for(; matchI != matches.size(); ++matchI )
+ {
+ zero_exists = false;
+ for( seqI = 0; seqI < seq_count; ++seqI )
+ {
+ if( left_ends[seqI] == NO_MATCH && matches[matchI]->Orientation(seqI) == AbstractMatch::forward )
+ {
+ left_ends[seqI] = matches[ matchI ]->LeftEnd(seqI);
+ orientations[seqI] = true;
+ }
+ else if( left_ends[seqI] == NO_MATCH )
+ zero_exists = true;
+ }
+ if( !zero_exists )
+ break;
+ }
+
+ // find end in each forward sequence
+ // (scanning from the back; length = one past the last match's right end minus the left end)
+ for( matchI = matches.size(); matchI > 0; matchI-- )
+ {
+ zero_exists = false;
+ for( seqI = 0; seqI < seq_count; ++seqI )
+ {
+ if( lengths[seqI] == 0 &&
+ matches[ matchI - 1 ]->Orientation(seqI) == AbstractMatch::forward )
+ {
+ lengths[seqI] = matches[matchI - 1]->LeftEnd(seqI) + matches[matchI - 1]->Length(seqI) - left_ends[seqI];
+ }
+ if( left_ends[seqI] != NO_MATCH && lengths[seqI] == 0 )
+ zero_exists = true;
+ }
+ if( !zero_exists )
+ break;
+ }
+
+ // find start in each reverse sequence
+ // (scanning from the back; orientations[seqI] keeps its initial value false here)
+ for( matchI = matches.size(); matchI > 0; matchI-- )
+ {
+ zero_exists = false;
+ for( seqI = 0; seqI < seq_count; ++seqI )
+ {
+ if( left_ends[seqI] == NO_MATCH && matches[ matchI - 1 ]->Orientation(seqI) == AbstractMatch::reverse )
+ left_ends[seqI] = matches[matchI - 1]->LeftEnd(seqI);
+ if( left_ends[seqI] == NO_MATCH )
+ zero_exists = true;
+ }
+ if( !zero_exists )
+ break;
+ }
+
+ // find end in each reverse sequence
+ for( matchI = 0; matchI != matches.size(); ++matchI )
+ {
+ zero_exists = false;
+ for( seqI = 0; seqI < seq_count; ++seqI )
+ {
+ if( lengths[seqI] == 0 &&
+ matches[matchI]->Orientation(seqI) == AbstractMatch::reverse )
+ {
+ lengths[seqI] = matches[matchI]->Length(seqI)+(matches[matchI]->LeftEnd(seqI) - left_ends[seqI]);
+ }
+ if( lengths[seqI] == 0 )
+ zero_exists = true;
+ }
+ if( !zero_exists )
+ break;
+ }
+}
+
+
+/**
+ * Inserts gap matches (see AddGapMatches()) for every sequence defined in this
+ * interval, so that the stretches between consecutive matches are represented too.
+ * Forward sequences are scanned front to back, all others back to front.
+ */
+template<class GappedBaseImpl>
+void GenericInterval<GappedBaseImpl>::addUnalignedRegions()
+{
+	// the insertions are done on a list copy of the matches
+	std::list<AbstractMatch*> with_gaps( matches.begin(), matches.end() );
+
+	for( uint seqI = 0; seqI < this->SeqCount(); ++seqI )
+	{
+		if( this->LeftEnd(seqI) != NO_MATCH )
+		{
+			if( this->Orientation(seqI) == AbstractMatch::forward )
+				AddGapMatches( with_gaps, with_gaps.begin(), with_gaps.end(), seqI, this->LeftEnd(seqI), this->RightEnd(seqI), this->Orientation(seqI), this->SeqCount() );
+			else
+				AddGapMatches( with_gaps, with_gaps.rbegin(), with_gaps.rend(), seqI, this->LeftEnd(seqI), this->RightEnd(seqI), this->Orientation(seqI), this->SeqCount() );
+		}
+	}
+
+	// replace the stored matches with the gap-filled sequence
+	matches.assign( with_gaps.begin(), with_gaps.end() );
+}
+
+/** Inverts the base-class part and every stored match, then reverses the order of the matches. */
+template<class GappedBaseImpl>
+void GenericInterval<GappedBaseImpl>::Invert(){
+	GappedBaseImpl::Invert();
+
+	std::vector< AbstractMatch* >::iterator cur = matches.begin();
+	for( ; cur != matches.end(); ++cur )
+		(*cur)->Invert();
+
+	std::reverse( matches.begin(), matches.end() );
+}
+
+/**
+ * Public wrapper around GetColumnAndMatch() that discards the match index and the
+ * in-match column and only reports pos and column for alignment column col.
+ */
+template<class GappedBaseImpl>
+void GenericInterval<GappedBaseImpl>::GetColumn( gnSeqI col, std::vector<gnSeqI>& pos, std::vector<bool>& column ) const
+{
+ // scratch outputs; their values are not used
+ size_t matchI;
+ gnSeqI match_col;
+ this->GetColumnAndMatch( col, pos, column, matchI, match_col );
+}
+
+/**
+ * Translates an offset within sequence seqI of the interval into a match index and
+ * an offset inside that match.
+ * @param seqI       sequence the offset is measured in
+ * @param pos        number of positions of seqI to skip, counted in sequence order
+ * @param matchI     (out) index into matches of the match that contains the position
+ * @param match_pos  (out) what is left of pos after subtracting the seqI lengths of
+ *                   all matches skipped
+ *
+ * Matches are visited front to back for a forward interval and back to front for a
+ * reverse one; matches not defined in seqI are skipped.
+ * NOTE(review): if pos is not smaller than the total length, the loop ends without a
+ * break and matchI does not identify a valid element (matches.size() for forward; for
+ * reverse the final decrement of 0 wraps around) -- callers must not rely on it then.
+ */
+template<class GappedBaseImpl>
+void GenericInterval<GappedBaseImpl>::FindMatchPos( uint seqI, gnSeqI pos, size_t& matchI, gnSeqI& match_pos )
+{
+ match_pos = pos;
+ int diff_amt = 0;
+ int incr = 1;
+ matchI = 0;
+ size_t end_mI = matches.size();
+ // for reverse orientation matchI runs from size() down to 1 and acts as a
+ // one-past index, hence the -1 applied on every access
+ if( this->Orientation(seqI) == AbstractMatch::reverse )
+ {
+ diff_amt = -1;
+ incr = -1;
+ matchI = matches.size();
+ end_mI = 0;
+ }
+
+ for( ; matchI != end_mI; matchI+=incr )
+ {
+ if( matches[matchI+diff_amt]->LeftEnd(seqI) == NO_MATCH )
+ continue;
+ if( matches[matchI+diff_amt]->Length(seqI) <= match_pos )
+ match_pos -= matches[matchI+diff_amt]->Length(seqI);
+ else
+ break;
+ }
+
+ // convert the one-past index back into a real index
+ if( this->Orientation(seqI) == AbstractMatch::reverse )
+ matchI--;
+}
+
+/**
+ * Recomputes the interval's left end, length and orientation in every sequence from
+ * the current match list (via FindBoundaries()), then refreshes the alignment length.
+ * Sequences that no match defines any more are reset to left end NO_MATCH, length 0.
+ * With an empty match list every sequence ends up undefined.
+ */
+template<class GappedBaseImpl>
+void GenericInterval<GappedBaseImpl>::CalculateOffset(){
+	const uint seq_count = this->SeqCount();
+	std::vector<gnSeqI> left_end( seq_count, NO_MATCH );
+	std::vector<gnSeqI> length( seq_count, 0 );
+	// sized up front so the indexing below stays in bounds even when there are no
+	// matches and FindBoundaries() is therefore never called
+	std::vector<bool> orientation( seq_count, false );
+	if( this->matches.size() > 0 )
+		FindBoundaries( this->matches, left_end, length, orientation );
+	for( uint seqI = 0; seqI < this->SeqCount(); seqI++ )
+	{
+		// compare against NO_MATCH, the sentinel left_end was initialized with,
+		// instead of a literal 0
+		if( left_end[seqI] != NO_MATCH )
+		{
+			this->SetLeftEnd(seqI, left_end[seqI]);
+			this->SetLength(length[seqI], seqI);
+			if( orientation[seqI] )
+				this->SetOrientation(seqI, AbstractMatch::forward);
+			else
+				this->SetOrientation(seqI, AbstractMatch::reverse);
+		}else if( this->LeftEnd(seqI) != NO_MATCH )
+		{
+			// the sequence used to be defined but no match covers it any more
+			this->SetLength(0, seqI);
+			this->SetLeftEnd(seqI, NO_MATCH);
+		}
+
+	}
+
+	this->CalculateAlignmentLength();
+}
+
+/**
+ * Replaces the interval's matches with a single GappedAlignment built from seq_align.
+ * @param seq_align  one aligned string per sequence; seq_align[0] is read
+ *                   unconditionally, so the vector must not be empty
+ * The new alignment takes its per-sequence start and length from the interval's
+ * current values.  Matches held before the call are released with Free().
+ */
+template<class GappedBaseImpl>
+void GenericInterval<GappedBaseImpl>::SetAlignment( const std::vector< std::string >& seq_align )
+{
+	GappedAlignment* ga = new GappedAlignment(seq_align.size(), seq_align[0].size());
+	// the interval owns its matches; release them before dropping the pointers,
+	// otherwise they are leaked
+	for( std::size_t mI = 0; mI < matches.size(); mI++ )
+		matches[mI]->Free();
+	matches.clear();
+	matches.push_back(ga);
+	ga->SetAlignment(seq_align);
+	for( uint seqI = 0; seqI < this->SeqCount(); ++seqI )
+	{
+		ga->SetStart(seqI, this->Start(seqI));
+		ga->SetLength(this->Length(seqI), seqI);
+	}
+}
+
+
+/**
+ * Writes this GenericInterval to the specified output stream (e.g. cout).
+ * Each stored match is written on its own line, using the stream operator of
+ * GappedAlignment or Match according to its dynamic type; matches of any other
+ * type produce an empty line.  Exceptions raised while writing are caught and
+ * reported on std::cerr.
+ */
+template<class GappedBaseImpl>
+std::ostream& operator<<(std::ostream& os, const GenericInterval<GappedBaseImpl>& cr){
+	try{
+		for( uint matchI = 0; matchI < cr.matches.size(); ++matchI ){
+			const AbstractMatch* cur = cr.matches[ matchI ];
+			if( const GappedAlignment* as_gapped = dynamic_cast< const GappedAlignment* >( cur ) )
+				os << *as_gapped;
+			if( const Match* as_match = dynamic_cast< const Match* >( cur ) )
+				os << *as_match;
+			os << std::endl;
+		}
+	}catch(...){
+		std::cerr << "Exceptional handler\n";
+	}
+	return os;
+}
+
+/** Sets the interval's alignment length to the sum of the alignment lengths of its matches. */
+template<class GappedBaseImpl>
+void GenericInterval<GappedBaseImpl>::CalculateAlignmentLength()
+{
+	gnSeqI total_cols = 0;
+	std::vector< AbstractMatch* >::iterator cur = matches.begin();
+	for( ; cur != matches.end(); ++cur )
+		total_cols += (*cur)->AlignmentLength();
+	this->SetAlignmentLength(total_cols);
+}
+
+/**
+ * Fills a gnAlignedSequences object from this interval.
+ * @param gnas       (out) names and positions are rebuilt with one entry per sequence
+ *                   in seq_table; sequences is filled by mems::GetAlignment()
+ * @param seq_table  the sequences this interval refers to, indexed by sequence number
+ * The name of a sequence is the name of its first contig, or empty if it has none.
+ */
+template<class GappedBaseImpl>
+void GenericInterval<GappedBaseImpl>::GetAlignedSequences( gnAlignedSequences& gnas, const std::vector< genome::gnSequence* >& seq_table ) const
+{
+	gnas.names.clear();
+	// positions is rebuilt in step with names; without this a reused gnas would keep
+	// the entries of an earlier call in front of the new ones
+	gnas.positions.clear();
+	for( uint seqI = 0; seqI < seq_table.size(); ++seqI ){
+		std::string name;
+		if( seq_table[ seqI ]->contigListSize() > 0 )
+			name = seq_table[ seqI ]->contigName( 0 );
+		gnas.names.push_back( name );
+		gnas.positions.push_back(this->Start(seqI));
+	}
+	mems::GetAlignment( *this, seq_table, gnas.sequences );
+}
+
+/**
+ * Returns the entry for sequence seq of the column vector that GetColumn() produces
+ * for alignment column col.
+ * NOTE(review): the value is returned as-is, not negated.  Whether `true` in that
+ * vector means "gap" or "character present" depends on the GetColumn() contract of
+ * the stored matches, which is not visible here -- confirm the polarity.
+ */
+template<class GappedBaseImpl>
+bool GenericInterval<GappedBaseImpl>::IsGap( uint seq, gnSeqI col ) const
+{
+ std::vector<gnSeqI> pos;
+ std::vector<bool> column;
+ GetColumn(col, pos, column);
+ return column[seq];
+}
+
+}
+
+namespace std {
+// Specialization of std::swap for mems::Interval: forwards to the member swap(),
+// which exchanges the match vectors and the base-class parts.
+template<> inline
+void swap( mems::Interval& a, mems::Interval& b )
+{
+ a.swap(b);
+}
+}
+
+#endif // __Interval_h__
diff --git a/libMems/IntervalList.cpp b/libMems/IntervalList.cpp
new file mode 100644
index 0000000..ef612ca
--- /dev/null
+++ b/libMems/IntervalList.cpp
@@ -0,0 +1,25 @@
+/*******************************************************************************
+ * $Id: IntervalList.cpp,v 1.12 2004/03/01 02:40:08 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * Please see the file called COPYING for licensing, copying, and modification
+ * terms.
+ * **************
+ ******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/IntervalList.h"
+#include "libMems/DNAFileSML.h"
+#include "libMems/MemHash.h"
+#include "libMems/GappedAlignment.h"
+
+#include <map>
+#include <sstream>
+
+using namespace std;
+using namespace genome;
+namespace mems {
+
+}
diff --git a/libMems/IntervalList.h b/libMems/IntervalList.h
new file mode 100644
index 0000000..9d9321a
--- /dev/null
+++ b/libMems/IntervalList.h
@@ -0,0 +1,842 @@
+/*******************************************************************************
+ * $Id: GenericIntervalList.h,v 1.6 2004/03/01 02:40:08 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifndef _IntervalList_h_
+#define _IntervalList_h_
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <iostream>
+#include <list>
+#include <sstream>
+
+#include "libMems/SortedMerList.h"
+#include "libGenome/gnSequence.h"
+#include "libMems/Interval.h"
+#include "libMems/MemHash.h"
+#include "libMems/CompactGappedAlignment.h"
+#include "libGenome/gnSourceFactory.h"
+#include "libGenome/gnFASSource.h"
+#include "libGenome/gnSEQSource.h"
+#include "libGenome/gnGBKSource.h"
+#include "libGenome/gnRAWSource.h"
+
+namespace mems {
+
+/**
+ * This class represents a set of Intervals, each of which is a collinear aligned region.
+ * There are functions to read and write a GenericIntervalList.
+ * The list stores raw gnSequence pointers in seq_table: copying a list copies
+ * the pointers only, and Clear() deletes the sequences they point to.
+ * @see Interval
+ */
+template< class MatchType = Interval >
+class GenericIntervalList : public std::vector< MatchType > {
+public:
+ GenericIntervalList(){};
+ GenericIntervalList( const GenericIntervalList& ml );
+ GenericIntervalList& operator=( const GenericIntervalList& ml );
+
+ /**
+ * Deletes the objects associated
+ * with this GenericIntervalList.
+ */
+ void Clear();
+
+ /**
+ * Reads a GenericIntervalList from an input stream.
+ * Sequence file names are read into the seq_filename
+ * vector, but the actual files are not
+ * opened. The calling function should load them after
+ * using this method.
+ * @param match_stream The input stream to read from
+ */
+ void ReadList( std::istream& match_stream );
+
+ /**
+ * Writes a GenericIntervalList to the designated output stream
+ * @param match_stream The output stream to write to
+ */
+ void WriteList( std::ostream& match_stream ) const;
+
+ /**
+ * Writes a gapped alignment of sequences to the output stream
+ */
+ void WriteAlignedSequences(std::ostream& match_file) const;
+
+ /**
+ * Writes a gapped alignment of sequences in a standard format
+ */
+ void WriteStandardAlignment( std::ostream& out_file ) const;
+
+ /**
+ * Writes a gapped alignment of sequences in xml format
+ */
+ void WriteXMLAlignment( std::ostream& out_file ) const;
+
+ /**
+ * Reads in a set of intervals that are in xmfa (eXtended MultiFastA) format
+ */
+ void ReadStandardAlignment( std::istream& in_stream );
+
+ /**
+ * Reads in a set of intervals that are in xmfa (eXtended MultiFastA) format
+ * and stores them in CompactGappedAlignments<>
+ */
+ void ReadStandardAlignmentCompact( std::istream& in_stream );
+
+ std::vector<std::string> seq_filename; /**< The names of files associated with the sequences used by this alignment */
+ std::vector<genome::gnSequence*> seq_table; /**< The actual sequences used in this alignment */
+
+ std::string backbone_filename; /**< The name of an associated backbone file, or empty if none exists */
+protected:
+
+};
+
+
+typedef GenericIntervalList<> IntervalList;
+
+/**
+ * Copy constructor: default-constructs the list, then delegates all copying
+ * to operator=.
+ */
+template< class MatchType >
+GenericIntervalList<MatchType>::GenericIntervalList( const GenericIntervalList<MatchType>& ml )
+{
+ *this = ml;
+}
+
+/**
+ * Assigns the intervals and all metadata of ml to this list.
+ *
+ * The copy of seq_table is shallow: both lists refer to the same gnSequence
+ * objects afterwards.
+ *
+ * @param ml The list to copy from
+ * @return A reference to this list
+ */
+template< class MatchType >
+GenericIntervalList<MatchType>& GenericIntervalList<MatchType>::operator=( const GenericIntervalList<MatchType>& ml )
+{
+    std::vector< MatchType >::operator=( ml );
+    seq_filename = ml.seq_filename;
+    seq_table = ml.seq_table;
+    // the backbone file name is part of the list's state as well; without
+    // this a copied list silently dropped its "#BackboneFile" entry
+    backbone_filename = ml.backbone_filename;
+    return *this;
+}
+
+/**
+ * Deletes every sequence referenced by seq_table and empties the list:
+ * the intervals, the sequence table, the sequence file names and the
+ * backbone file name are all reset.
+ */
+template< class MatchType >
+void GenericIntervalList<MatchType>::Clear()
+{
+    for( uint seqI = 0; seqI < seq_table.size(); seqI++ )
+        delete seq_table[ seqI ];   // deleting a NULL pointer is a no-op
+    // drop the now-dangling pointers so that a second Clear() (or any later
+    // use of seq_table) cannot touch freed sequences
+    seq_table.clear();
+    seq_filename.clear();
+    backbone_filename.clear();
+    this->clear();
+}
+
+/**
+ * Reads a "FormatVersion 4" interval list from match_file and appends the
+ * parsed intervals to this list.
+ *
+ * The header consists of the format version, a sequence count (at least 2),
+ * one file-name line and one length line per sequence, and an interval count.
+ * Only the file names are kept (appended to seq_filename); the sequences
+ * themselves are not loaded and the stored lengths are ignored.
+ *
+ * In the body every line containing "Interval" starts a new interval.  The
+ * lines that follow describe its matches: either the keyword
+ * "GappedAlignment" followed by a "<length> <start>..." line and one
+ * alignment row per sequence, or a plain "<length> <start>..." line for an
+ * ungapped Match.
+ *
+ * @param match_file The input stream to read from
+ * @throws InvalidFileFormat (via Throw_gnEx) when the header is malformed or
+ *         when the declared interval count differs from this->size() after
+ *         reading
+ */
+template< class MatchType >
+void GenericIntervalList<MatchType>::ReadList(std::istream& match_file)
+{
+    std::string tag;
+    gnSeqI len = 0;
+    int64 start = 0;
+    unsigned int seq_count = 0;
+    uint seqI;
+
+    match_file >> tag;  // format version tag
+    if( tag != "FormatVersion" ){
+        Throw_gnEx(InvalidFileFormat());
+    }
+    match_file >> tag;  // format version
+    if( tag != "4" ){
+        Throw_gnEx(InvalidFileFormat());
+    }
+    match_file >> tag;  // sequence count tag
+    if( tag != "SequenceCount" ){
+        Throw_gnEx(InvalidFileFormat());
+    }
+    match_file >> seq_count;    // sequence count
+    if(seq_count < 2){
+        Throw_gnEx(InvalidFileFormat());
+    }
+
+    std::vector< std::string > alignment;
+    // read the sequence file names and lengths
+    for( seqI = 0; seqI < seq_count; seqI++ ){
+        match_file >> tag;  // name tag
+        std::getline( match_file, tag );
+        // drop the tab that separates the tag from the file name; an empty
+        // remainder is kept as an empty name instead of letting substr() throw
+        if( !tag.empty() )
+            tag.erase( 0, 1 );
+        seq_filename.push_back(tag);
+        match_file >> tag;  // length tag
+        match_file >> tag;  // length (not used)
+
+        alignment.push_back( "" );  // one alignment row per sequence
+    }
+    uint interval_count = 0;
+    match_file >> tag;  // interval count tag
+    match_file >> interval_count;   // interval count
+
+    // read the matches
+    std::string cur_line;
+    std::vector< AbstractMatch* > iv_matches;
+    bool parsing = false;   // becomes true once the first "Interval" line was seen
+
+    while( std::getline( match_file, cur_line ) ){
+        if( cur_line.find( "Interval" ) != std::string::npos ){
+            // a new interval starts: store the matches collected so far
+            // NOTE(review): the match objects are not deleted here (the
+            // original code had the delete commented out) -- confirm whether
+            // Interval copies or adopts them.
+            if( iv_matches.size() > 0 )
+            {
+                this->push_back( Interval(iv_matches.begin(), iv_matches.end()) );
+                iv_matches.clear();
+            }
+            parsing = true;
+            continue;
+        }
+        if( !parsing )
+            continue;
+        if( cur_line.length() == 0 )
+            continue;
+
+        if( cur_line == "GappedAlignment" ){
+            std::getline( match_file, cur_line );
+
+            std::stringstream line_stream( cur_line );
+            line_stream >> len;
+            GappedAlignment* cr = new GappedAlignment( seq_count, len );
+
+            for( seqI = 0; seqI < seq_count; seqI++ ){
+                line_stream >> start;
+                cr->SetStart( seqI, start );
+                std::getline( match_file, alignment[ seqI ] );
+                // the sequence length is the number of non-gap characters
+                int64 seq_len = 0;
+                for( uint charI = 0; charI < alignment[ seqI ].length(); charI++ )
+                    if( alignment[ seqI ][ charI ] != '-' )
+                        seq_len++;
+                cr->SetLength( seq_len, seqI );
+            }
+            cr->SetAlignment( alignment );
+            iv_matches.push_back( cr );
+        }
+        else{
+            Match mmhe( seq_count );
+            Match* mhe = mmhe.Copy();
+            std::stringstream line_stream( cur_line );
+
+            line_stream >> len;
+            mhe->SetLength(len);
+
+            for( seqI = 0; seqI < seq_count; seqI++){
+                line_stream >> start;
+                mhe->SetStart(seqI, start);
+            }
+
+            iv_matches.push_back( mhe );
+        }
+    }
+    // store the matches of the last interval
+    if( iv_matches.size() > 0 )
+        this->push_back( Interval(iv_matches.begin(), iv_matches.end()) );
+    if( interval_count != this->size() ){
+        Throw_gnEx(InvalidFileFormat());
+    }
+
+}
+
+/**
+ * Writes this list to match_file in the "FormatVersion 4" text format.
+ *
+ * The header lists one file name ("null" when seq_filename has no entry) and
+ * one length ("0" when the seq_table entry is NULL) per entry of seq_table.
+ * Each interval is then written as an "Interval <n>" line followed by its
+ * matches: Match objects via their stream operator, GappedAlignment objects
+ * as a "GappedAlignment" block with one alignment row per sequence.  Matches
+ * of any other type are skipped.
+ *
+ * @param match_file The output stream to write to
+ */
+template< class MatchType >
+void GenericIntervalList<MatchType>::WriteList(std::ostream& match_file) const
+{
+    const unsigned int seq_count = seq_table.size();
+    uint seqI;
+
+    match_file << "FormatVersion" << '\t' << 4 << "\n";
+    match_file << "SequenceCount" << '\t' << seq_count << "\n";
+    for(seqI = 0; seqI < seq_count; seqI++){
+        match_file << "Sequence" << seqI << "File" << '\t';
+        if( seq_filename.size() > seqI )
+            match_file << seq_filename[seqI];
+        else
+            match_file << "null";
+        match_file << "\n";
+        match_file << "Sequence" << seqI << "Length" << '\t';
+        // seqI is always a valid index here, so the meaningful question is
+        // whether a sequence object is actually present
+        if( seq_table[seqI] != NULL )
+            match_file << seq_table[seqI]->length();
+        else
+            match_file << "0";
+        match_file << "\n";
+    }
+
+    match_file << "IntervalCount" << '\t' << this->size() << std::endl;
+
+    for( uint ivI = 0; ivI < this->size(); ivI++ ){
+        match_file << "Interval " << ivI << std::endl;
+        const std::vector<AbstractMatch*>& matches = (*this)[ ivI ].GetMatches();
+        for( uint matchI = 0; matchI < matches.size(); matchI++ ){
+            const AbstractMatch* m = matches[ matchI ];
+            const Match* match = dynamic_cast< const Match* >( m );
+            if( match != NULL ){
+                match_file << *match << std::endl;
+                continue;
+            }
+            const GappedAlignment* cr = dynamic_cast< const GappedAlignment* >( m );
+            if( cr == NULL )
+                continue;   // neither a Match nor a GappedAlignment: not written
+
+            match_file << "GappedAlignment\n";
+            match_file << cr->Length();
+            for( seqI = 0; seqI < seq_count; seqI++ )
+                match_file << '\t' << cr->Start( seqI );
+            match_file << std::endl;
+
+            const std::vector< std::string >& align_matrix = GetAlignment( *cr, seq_table );
+            for( seqI = 0; seqI < seq_count; seqI++ )
+                match_file << align_matrix[ seqI ] << std::endl;
+        }
+        match_file << std::endl;
+    }
+}
+
+/**
+ * Writes the intervals as a simple <procrastAlignment> XML document.
+ * Stub for now; later use an XML library to write/read alignments in XML.
+ *
+ * Nothing is written for an empty list.  Intervals with an alignment length
+ * of 0 are skipped.  The "sequence" attribute carries the first entry of
+ * seq_filename, or "null" when no file name is known.  When seq_table holds
+ * exactly one sequence but an interval has a different number of components,
+ * that one sequence is used for every component.
+ *
+ * @param out_file The output stream to write to
+ */
+template< class MatchType >
+void GenericIntervalList<MatchType>::WriteXMLAlignment( std::ostream& out_file ) const
+{
+    if( this->size() == 0 )
+        return;
+
+    // seq_filename may be empty; fall back to the same placeholder the other
+    // writers use instead of indexing past the end of the vector
+    const std::string source_name = seq_filename.empty() ? std::string( "null" ) : seq_filename[ 0 ];
+
+    out_file << "<procrastAlignment sequence=\"" << source_name << "\">" << std::endl;
+    for( uint ivI = 0; ivI < this->size(); ivI++ ){
+        if( (*this)[ ivI ].AlignmentLength() == 0 ){
+            continue;
+        }
+        out_file << "\t<localAlignment id = \"" << ivI+1 << "\" length = \"" << (*this)[ ivI ].AlignmentLength() << "\" multiplicity = \"" << (*this)[ ivI ].Multiplicity() << "\" spscore=\"" << (*this)[ ivI ].spscore << "\">" << std::endl;
+
+        std::vector<std::string> alignment;
+        if( seq_table.size() == 1 && seq_table.size() != (*this)[ ivI ].SeqCount() )
+        {
+            GetAlignment( (*this)[ ivI ], std::vector<genome::gnSequence*>((*this)[ ivI ].SeqCount(), seq_table[0]), alignment );
+        }else
+            GetAlignment( (*this)[ ivI ], seq_table, alignment );
+        for( uint seqI = 0; seqI < (*this)[ ivI ].SeqCount(); seqI++ ){
+            const int64 startI = (*this)[ ivI ].Start( seqI );
+            // if this genome doesn't have any sequence in this
+            // interval then skip it...
+            if( startI == 0 && ivI > 0 )    // kludge: write all seqs into the first interval so java parser can read it
+                continue;
+
+            out_file << "\t\t<component id=\"" << seqI+1 << "\" seqid=\"1\" leftend=\"" << (*this)[ ivI ].LeftEnd( seqI ) << "\" length=\"" << (*this)[ ivI ].Length( seqI ) << "\" orientation=\"" << (*this)[ ivI ].Orientation( seqI) << "\">" << alignment[ seqI ].data();
+            out_file << "\t\t</component> " << std::endl;
+        }
+        out_file << "\t</localAlignment>" << std::endl;
+    }
+    out_file << "</procrastAlignment>" << std::endl;
+}
+
+/**
+ * Writes the intervals in XMFA (eXtended MultiFastA) format.
+ *
+ * Nothing is written for an empty list.  The header names the format version,
+ * then for every entry of seq_filename the file name, an entry number (only
+ * when all file names are identical) and the detected source format, and
+ * finally the backbone file if one is set.  Every interval with a non-zero
+ * alignment length follows as one "> seq:left-right strand name" record per
+ * sequence, with the aligned text wrapped at 80 columns, terminated by "=".
+ *
+ * @param out_file The output stream to write to
+ */
+template< class MatchType >
+void GenericIntervalList<MatchType>::WriteStandardAlignment( std::ostream& out_file ) const
+{
+    if( this->size() == 0 )
+        return;
+
+    uint seqI = 0;
+
+    // write out the format version
+    out_file << "#FormatVersion Mauve1" << std::endl;
+
+    // write source sequence filenames and formats
+    // single_input: all sequences come from the same file
+    bool single_input = true;
+    for( seqI = 1; seqI < seq_filename.size(); seqI++ ){
+        if( seq_filename[ 0 ] != seq_filename[ seqI ] ){
+            single_input = false;
+            break;
+        }
+    }
+    for( seqI = 0; seqI < seq_filename.size(); seqI++ ){
+        out_file << "#Sequence" << seqI + 1 << "File\t" << seq_filename[ seqI ] << std::endl;
+        if( single_input )
+            out_file << "#Sequence" << seqI + 1 << "Entry\t" << seqI + 1 << std::endl;
+
+        // let the source factory pick the source class matching the file name,
+        // then report the format that class stands for
+        genome::gnSourceFactory* sf = genome::gnSourceFactory::GetSourceFactory();
+        genome::gnBaseSource* gnbs = sf->MatchSourceClass( seq_filename[ seqI ] );
+        if( dynamic_cast< genome::gnFASSource* >(gnbs) != NULL )
+            out_file << "#Sequence" << seqI + 1 << "Format\tFastA" << std::endl;
+        else if( dynamic_cast< genome::gnRAWSource* >(gnbs) != NULL )
+            out_file << "#Sequence" << seqI + 1 << "Format\traw" << std::endl;
+        else if( dynamic_cast< genome::gnSEQSource* >(gnbs) != NULL ){
+            out_file << "#Sequence" << seqI + 1 << "Format\tDNAstar" << std::endl;
+            out_file << "#Annotation" << seqI + 1 << "File\t" << seq_filename[ seqI ] << std::endl;
+            out_file << "#Annotation" << seqI + 1 << "Format\tDNAstar" << std::endl;
+        }else if( dynamic_cast< genome::gnGBKSource* >(gnbs) != NULL ){
+            out_file << "#Sequence" << seqI + 1 << "Format\tGenBank" << std::endl;
+            out_file << "#Annotation" << seqI + 1 << "File\t" << seq_filename[ seqI ] << std::endl;
+            out_file << "#Annotation" << seqI + 1 << "Format\tGenBank" << std::endl;
+        }
+    }
+
+    if( this->backbone_filename != "" )
+        out_file << "#BackboneFile\t" << this->backbone_filename << std::endl;
+
+    for( uint ivI = 0; ivI < this->size(); ivI++ ){
+        if( (*this)[ ivI ].AlignmentLength() == 0 ){
+            continue;
+        }
+        std::vector<std::string> alignment;
+        if( seq_table.size() == 1 && seq_table.size() != (*this)[ ivI ].SeqCount() )
+        {
+            // a single input sequence serves every component of the interval
+            GetAlignment( (*this)[ ivI ], std::vector<genome::gnSequence*>((*this)[ ivI ].SeqCount(), seq_table[0]), alignment );
+        }else
+            GetAlignment( (*this)[ ivI ], seq_table, alignment );
+        for( seqI = 0; seqI < (*this)[ ivI ].SeqCount(); seqI++ ){
+            const int64 startI = (*this)[ ivI ].Start( seqI );
+            const gnSeqI length = (*this)[ ivI ].Length( seqI );
+            // if this genome doesn't have any sequence in this
+            // interval then skip it...
+            if( startI == 0 && ivI > 0 )    // kludge: write all seqs into the first interval so java parser can read it
+                continue;
+            out_file << "> " << seqI + 1 << ":";
+            if( startI > 0 ){
+                out_file << genome::absolut( startI ) << "-" << genome::absolut( startI ) + length - 1 << " + ";
+            }else if(startI == 0){
+                out_file << 0 << "-" << 0 << " + ";
+            }else{
+                out_file << genome::absolut( startI ) << "-" << genome::absolut( startI ) + length - 1 << " - ";
+            }
+            // write the sequence filename as the seq name; the interval may
+            // have more components than there are file names, so only index
+            // seq_filename when the entry exists
+            const size_t name_index = single_input ? 0 : seqI;
+            if( name_index < seq_filename.size() )
+                out_file << seq_filename[ name_index ];
+            out_file << std::endl;
+            // wrap the aligned text at 80 characters per line
+            const std::string& row = alignment[ seqI ];
+            for( gnSeqI cur_pos = 0; cur_pos < row.length(); cur_pos += 80 ){
+                gnSeqI cur_len = cur_pos + 80 < row.length() ? 80 : row.length() - cur_pos;
+                out_file.write( row.data() + cur_pos, cur_len );
+                out_file << std::endl;
+            }
+        }
+        out_file << "=" << std::endl;   // std::endl already flushes the stream
+    }
+
+}
+
+/**
+ * Reads intervals in XMFA (eXtended MultiFastA) format from in_stream and
+ * appends one Interval per aligned block to this list.
+ *
+ * Leading '#' lines are treated as metadata; "#Sequence<n>File" entries are
+ * appended to seq_filename.  Each record starts with a defline of the form
+ * "> <seq>:<start>-<stop> <strand> <name>" followed by the aligned text; a
+ * line starting with '=' ends an aligned block.  For reverse-strand records
+ * the number of non-gap characters is checked against the defline.
+ *
+ * If not even the first line can be read the list is Clear()ed.  If the input
+ * ends inside the metadata header no intervals are added.
+ *
+ * @param in_stream The input stream to read from
+ * @throws InvalidFileFormat (via Throw_gnEx) when a defline carries no valid
+ *         sequence number or a reverse-strand record has the wrong length
+ */
+template< class MatchType >
+void GenericIntervalList<MatchType>::ReadStandardAlignment( std::istream& in_stream )
+{
+    uint seq_count = 0;
+    gnSeqI max_len = 0;
+    std::string cur_line;
+    if( !std::getline( in_stream, cur_line ) )
+    {
+        Clear();    // if we can't read from the file then just return an empty interval list
+        return;
+    }
+    uint seqI = 0;
+    std::vector< gnSeqI > lengths;
+    std::vector< GappedAlignment* > ga_list;
+    GappedAlignment cr;
+    std::vector< std::string > aln_mat;
+    uint line_count = 1;
+    bool out_of_input = false;
+    while( true ){
+
+        while( !cur_line.empty() && cur_line[0] == '#' ){
+            // hit a comment or metadata.  try to parse the next line if it's a filename
+            if( !std::getline( in_stream, cur_line ) ){
+                // a failed read may leave cur_line untouched, which would
+                // make this loop spin forever on the same '#' line
+                cur_line.clear();
+                out_of_input = true;
+                break;
+            }
+            line_count++;
+            std::stringstream ss( cur_line );
+            std::string token;
+            std::getline( ss, token, '\t' );
+            // substr(1, ...) throws on an empty token, so test for that first
+            if( token.empty() || token.substr(1, 8) != "Sequence" || token.find( "File" ) == std::string::npos )
+                continue;
+            std::getline( ss, token );
+            seq_filename.push_back( token );
+        }
+        if( out_of_input )
+            break;  // header only, no alignment data
+
+        // read and parse the def. line
+        std::stringstream line_str( cur_line );
+        std::getline( line_str, cur_line, '>' );
+        std::getline( line_str, cur_line, ':' );
+        // take off leading whitespace
+        std::stringstream parse_str( cur_line );
+
+        seqI = 0;
+        parse_str >> seqI;  // the (1-based) sequence number
+        if( seqI == 0 ){
+            // not a defline: everything below indexes with seqI - 1
+            std::cerr << "Error in XMFA file format\n";
+            std::cerr << "Before line " << line_count << std::endl;
+            Throw_gnEx(InvalidFileFormat());
+        }
+
+        int64 start = 0, stop = 0;
+        std::getline( line_str, cur_line, '-' );
+        parse_str.clear();
+        parse_str.str( cur_line );
+        parse_str >> start;
+        line_str >> stop;
+        std::string strand;
+        line_str >> strand;
+        // whatever is left on the defline is the sequence name; it is not used
+
+        // read and parse the sequence
+        if( aln_mat.size() < seqI )
+            aln_mat.resize( seqI );
+
+        gnSeqI chars = 0;   // number of non-gap characters read for this record
+        while( std::getline( in_stream, cur_line ) ){
+            line_count++;
+            if( !cur_line.empty() && ( cur_line[ 0 ] == '>' || cur_line[ 0 ] == '=' ) )
+                break;
+            for( uint charI = 0; charI < cur_line.length(); charI++ )
+                if( cur_line[ charI ] != '-' )
+                    chars++;
+            aln_mat[ seqI - 1 ] += cur_line;
+        }
+        if( lengths.size() < seqI )
+            lengths.resize( seqI, 0 );
+
+        lengths[ seqI - 1 ] = chars;
+
+        // temporary workaround for file format inconsistency: reverse-strand
+        // coordinates may be written in either order
+        if( strand == "+" )
+            cr.SetStart( seqI - 1, start );
+        else{
+            const int64 left_end = start < stop ? start : stop;
+            const int64 expected = ( start < stop ? stop - start : start - stop ) + 1;
+            if( chars == 0 )
+                cr.SetStart( seqI - 1, 0 );
+            else
+                cr.SetStart( seqI - 1, -left_end );
+            if( chars != expected && !(chars == 0 && stop - start == 1) ){
+                std::cerr << "Error in XMFA file format\n";
+                std::cerr << "Before line " << line_count << std::endl;
+                std::cerr << "Expecting " << expected << " characters based on defline\n";
+                std::cerr << "Actually read " << chars << " characters of sequence\n";
+                Throw_gnEx(InvalidFileFormat());
+            }
+        }
+
+        // max_len is the longest alignment row of the current block; compare
+        // row lengths with it, not the residue count
+        if( aln_mat[ seqI - 1 ].length() > max_len )
+            max_len = aln_mat[ seqI - 1 ].length();
+
+        if( cur_line.size() == 0 )
+            break;
+        // did we finish an aligned region?
+        if( cur_line[ 0 ] != '>' ){
+            GappedAlignment *new_cr = new GappedAlignment( aln_mat.size(), max_len );
+            for( uint seqJ = 0; seqJ < seqI; seqJ++ ){
+                new_cr->SetStart( seqJ, cr.Start( seqJ ) );
+                new_cr->SetLength( lengths[ seqJ ], seqJ );
+                cr.SetStart( seqJ, NO_MATCH );
+            }
+            // pad short rows with gaps so that all rows have the same length
+            for( uint seqJ = 0; seqJ < seqI; seqJ++ )
+                aln_mat[seqJ].resize( max_len, '-' );
+
+            new_cr->SetAlignment(aln_mat);
+            lengths.clear();
+            if( seq_count < seqI )
+                seq_count = seqI;
+
+            ga_list.push_back( new_cr );
+
+            max_len = 0;    // reset length for the next interval
+            aln_mat.clear();    // reset cr for next interval
+
+            // bail out on EOF or corruption
+            if( cur_line[ 0 ] != '=' )
+                break;
+            // otherwise read up to the next def. line
+            while( std::getline( in_stream, cur_line ) ){
+                line_count++;
+                if( !cur_line.empty() && cur_line[ 0 ] == '>' )
+                    break;
+            }
+            if( cur_line.empty() || cur_line[ 0 ] != '>' )
+                break;
+        }
+    }
+
+    // now process all GappedAlignments into Intervals: every block is widened
+    // to seq_count rows, missing sequences become all-gap rows with start 0
+    for( uint ivI = 0; ivI < ga_list.size(); ivI++ ){
+        GappedAlignment* cr = ga_list[ ivI ];
+        GappedAlignment* new_cr = new GappedAlignment( seq_count, cr->AlignmentLength() );
+
+        const std::vector< std::string >& align_matrix = GetAlignment( *cr, seq_table );
+        std::vector< std::string > new_aln_mat(seq_count);
+        for( seqI = 0; seqI < align_matrix.size(); seqI++ ){
+            new_cr->SetLength( cr->Length( seqI ), seqI );
+            new_cr->SetStart( seqI, cr->Start(seqI) );
+            new_aln_mat[ seqI ] = align_matrix[ seqI ];
+            if( new_aln_mat[ seqI ].length() == 0 )
+                new_aln_mat[ seqI ] = std::string( new_cr->AlignmentLength(), '-' );
+        }
+        for( ; seqI < seq_count; seqI++ ){
+            new_cr->SetLength( 0, seqI );
+            new_cr->SetStart( seqI, 0 );
+            new_aln_mat[ seqI ] = std::string( new_cr->AlignmentLength(), '-' );
+        }
+        new_cr->SetAlignment(new_aln_mat);
+        delete cr;
+        cr = new_cr;
+        ga_list[ ivI ] = new_cr;
+
+        // NOTE(review): new_cr is never deleted here -- confirm whether the
+        // Interval constructor copies or adopts the match.
+        std::vector<AbstractMatch*> asdf(1, cr);
+        Interval iv( asdf.begin(), asdf.end() );
+        this->push_back( iv );
+    }
+}
+
+/**
+ * Reads intervals in XMFA (eXtended MultiFastA) format from in_stream and
+ * appends one Interval per aligned block to this list, storing each block as
+ * a CompactGappedAlignment<>.
+ *
+ * Leading '#' lines are treated as metadata; "#Sequence<n>File" entries are
+ * appended to seq_filename.  Each record starts with a defline of the form
+ * "> <seq>:<start>-<stop> <strand> <name>" followed by the aligned text; a
+ * line starting with '=' ends an aligned block.  For reverse-strand records
+ * the number of non-gap characters is checked against the defline.
+ *
+ * If the first line cannot be read, or the input ends inside the metadata
+ * header, the function returns without adding any interval.
+ *
+ * @param in_stream The input stream to read from
+ * @throws InvalidFileFormat (via Throw_gnEx) when a defline carries no valid
+ *         sequence number or a reverse-strand record has the wrong length
+ */
+template< class MatchType >
+void GenericIntervalList<MatchType>::ReadStandardAlignmentCompact( std::istream& in_stream )
+{
+    gnSeqI max_len = 0;
+    std::string cur_line;
+    // nothing to parse: without this check an empty stream ends up indexing
+    // the row tables with seqI - 1 while seqI is still 0
+    if( !std::getline( in_stream, cur_line ) )
+        return;
+    uint seqI = 0;
+    std::vector< gnSeqI > lengths;
+    std::vector< GappedAlignment* > ga_list;
+    GappedAlignment cr;
+    std::vector< std::string > aln_mat;
+    uint line_count = 1;
+    bool out_of_input = false;
+    while( true ){
+
+        while( !cur_line.empty() && cur_line[0] == '#' ){
+            // hit a comment or metadata.  try to parse the next line if it's a filename
+            if( !std::getline( in_stream, cur_line ) ){
+                // a failed read may leave cur_line untouched, which would
+                // make this loop spin forever on the same '#' line
+                cur_line.clear();
+                out_of_input = true;
+                break;
+            }
+            line_count++;
+            std::stringstream ss( cur_line );
+            std::string token;
+            std::getline( ss, token, '\t' );
+            // substr(1, ...) throws on an empty token, so test for that first
+            if( token.empty() || token.substr(1, 8) != "Sequence" || token.find( "File" ) == std::string::npos )
+                continue;
+            std::getline( ss, token );
+            seq_filename.push_back( token );
+        }
+        if( out_of_input )
+            break;  // header only, no alignment data
+
+        // read and parse the def. line
+        std::stringstream line_str( cur_line );
+        std::getline( line_str, cur_line, '>' );
+        std::getline( line_str, cur_line, ':' );
+        // take off leading whitespace
+        std::stringstream parse_str( cur_line );
+
+        seqI = 0;
+        parse_str >> seqI;  // the (1-based) sequence number
+        if( seqI == 0 ){
+            // not a defline: everything below indexes with seqI - 1
+            std::cerr << "Error in XMFA file format\n";
+            std::cerr << "Before line " << line_count << std::endl;
+            Throw_gnEx(InvalidFileFormat());
+        }
+
+        int64 start = 0, stop = 0;
+        std::getline( line_str, cur_line, '-' );
+        parse_str.clear();
+        parse_str.str( cur_line );
+        parse_str >> start;
+        line_str >> stop;
+        std::string strand;
+        line_str >> strand;
+        // whatever is left on the defline is the sequence name; it is not used
+
+        // read and parse the sequence
+        if( aln_mat.size() < seqI )
+            aln_mat.resize( seqI );
+
+        gnSeqI chars = 0;   // number of non-gap characters read for this record
+        while( std::getline( in_stream, cur_line ) ){
+            line_count++;
+            if( !cur_line.empty() && ( cur_line[ 0 ] == '>' || cur_line[ 0 ] == '=' ) )
+                break;
+            for( uint charI = 0; charI < cur_line.length(); charI++ )
+                if( cur_line[ charI ] != '-' )
+                    chars++;
+            aln_mat[ seqI - 1 ] += cur_line;
+        }
+        if( lengths.size() < seqI )
+            lengths.resize( seqI, 0 );
+
+        lengths[ seqI - 1 ] = chars;
+
+        // temporary workaround for file format inconsistency: reverse-strand
+        // coordinates may be written in either order
+        if( strand == "+" )
+            cr.SetStart( seqI - 1, start );
+        else{
+            const int64 left_end = start < stop ? start : stop;
+            const int64 expected = ( start < stop ? stop - start : start - stop ) + 1;
+            if( chars == 0 )
+                cr.SetStart( seqI - 1, 0 );
+            else
+                cr.SetStart( seqI - 1, -left_end );
+            if( chars != expected && !(chars == 0 && stop - start == 1) ){
+                std::cerr << "Error in XMFA file format\n";
+                std::cerr << "Before line " << line_count << std::endl;
+                std::cerr << "Expecting " << expected << " characters based on defline\n";
+                std::cerr << "Actually read " << chars << " characters of sequence\n";
+                Throw_gnEx(InvalidFileFormat());
+            }
+        }
+
+        // max_len is the longest alignment row of the current block; compare
+        // row lengths with it, not the residue count
+        if( aln_mat[ seqI - 1 ].length() > max_len )
+            max_len = aln_mat[ seqI - 1 ].length();
+
+        if( cur_line.size() == 0 )
+            break;
+        // did we finish an aligned region?
+        if( cur_line[ 0 ] != '>' ){
+            GappedAlignment *new_cr = new GappedAlignment( aln_mat.size(), max_len );
+            for( uint seqJ = 0; seqJ < seqI; seqJ++ ){
+                new_cr->SetStart( seqJ, cr.Start( seqJ ) );
+                new_cr->SetLength( lengths[ seqJ ], seqJ );
+                cr.SetStart( seqJ, NO_MATCH );
+            }
+            // pad short rows with gaps so that all rows have the same length
+            for( uint seqJ = 0; seqJ < seqI; seqJ++ )
+                aln_mat[seqJ].resize( max_len, '-' );
+
+            new_cr->SetAlignment(aln_mat);
+            lengths.clear();
+
+            ga_list.push_back( new_cr );
+
+            max_len = 0;    // reset length for the next interval
+            aln_mat.clear();    // reset cr for next interval
+
+            // bail out on EOF or corruption
+            if( cur_line[ 0 ] != '=' )
+                break;
+            // otherwise read up to the next def. line
+            while( std::getline( in_stream, cur_line ) ){
+                line_count++;
+                if( !cur_line.empty() && cur_line[ 0 ] == '>' )
+                    break;
+            }
+            if( cur_line.empty() || cur_line[ 0 ] != '>' )
+                break;
+        }
+    }
+
+    // now convert every GappedAlignment into a CompactGappedAlignment<> and
+    // wrap it in an Interval of its own
+    for( uint ivI = 0; ivI < ga_list.size(); ivI++ )
+    {
+        GappedAlignment* cr = ga_list[ ivI ];
+        uint compact_seq_count = cr->SeqCount();
+        CompactGappedAlignment<>* new_cr = new CompactGappedAlignment<>(compact_seq_count, cr->AlignmentLength() );
+        const std::vector< std::string > align_matrix = GetAlignment( *cr, seq_table );
+
+        std::vector< std::string > new_aln_mat(compact_seq_count);
+        for( seqI = 0; seqI < compact_seq_count; seqI++ ){
+            new_cr->SetLength( cr->Length( seqI ), seqI );
+            new_cr->SetStart( seqI, cr->Start(seqI) );
+            new_aln_mat[ seqI ] = align_matrix[ seqI ];
+            // an empty row stands for "all gaps"
+            if( new_aln_mat[ seqI ].length() == 0 )
+                new_aln_mat[ seqI ] = std::string( new_cr->AlignmentLength(), '-' );
+        }
+
+        new_cr->SetAlignment( new_aln_mat );
+        delete cr;
+
+        Interval iv;
+        this->push_back( iv );
+        std::vector< AbstractMatch* > matches(1, new_cr);
+        this->back().SetMatches( matches );
+    }
+}
+
+
+/**
+ * Writes the intervals as "mauveAligner data", FormatVersion 5.
+ *
+ * The header lists one file name ("null" when seq_filename has no entry) and
+ * one length ("0" when the seq_table entry is NULL) per entry of seq_table,
+ * followed by the number of alignments.  Each interval is written as a line
+ * holding its alignment length and the start coordinate in every sequence,
+ * then one aligned row per sequence and an empty line.
+ *
+ * @param match_file The output stream to write to
+ */
+template< class MatchType >
+void GenericIntervalList<MatchType>::WriteAlignedSequences(std::ostream& match_file) const
+{
+    const unsigned int seq_count = seq_table.size();
+    uint seqI;
+
+    match_file << "mauveAligner data\n";
+    match_file << "FormatVersion" << '\t' << 5 << "\n";
+    match_file << "SequenceCount" << '\t' << seq_count << "\n";
+    for(seqI = 0; seqI < seq_count; seqI++){
+        match_file << "Sequence" << seqI << "File" << '\t';
+        if( seq_filename.size() > seqI )
+            match_file << seq_filename[seqI];
+        else
+            match_file << "null";
+        match_file << "\n";
+        match_file << "Sequence" << seqI << "Length" << '\t';
+        // seqI is always a valid index here, so the meaningful question is
+        // whether a sequence object is actually present
+        if( seq_table[seqI] != NULL )
+            match_file << seq_table[seqI]->length();
+        else
+            match_file << "0";
+        match_file << "\n";
+    }
+
+    match_file << "AlignmentCount" << '\t' << this->size() << std::endl;
+
+    for( uint ivI = 0; ivI < this->size(); ivI++ ){
+
+        match_file << (*this)[ ivI ].AlignmentLength();
+        for( seqI = 0; seqI < seq_count; seqI++ )
+            match_file << '\t' << (*this)[ ivI ].Start( seqI );
+        match_file << std::endl;
+
+        std::vector<std::string> alignment;
+        GetAlignment( (*this)[ ivI ], this->seq_table, alignment );
+        for( seqI = 0; seqI < seq_count; seqI++ )
+            match_file << alignment[ seqI ] << std::endl;
+        match_file << std::endl;
+    }
+
+}
+
+
+}
+
+#endif //_IntervalList_h_
diff --git a/libMems/Islands.cpp b/libMems/Islands.cpp
new file mode 100644
index 0000000..7cfde9c
--- /dev/null
+++ b/libMems/Islands.cpp
@@ -0,0 +1,320 @@
+/*******************************************************************************
+ * $Id: Islands.cpp,v 1.12 2004/04/19 23:11:19 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * Please see the file called COPYING for licensing, copying, and modification
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/Islands.h"
+#include "libMems/Aligner.h"
+#include "libMems/GappedAlignment.h"
+
+using namespace std;
+using namespace genome;
+namespace mems {
+
+/**
+ * Identifies gaps in the alignment between pairs of sequences that are longer
+ * than island_size base pairs and prints them to island_out.
+ *
+ * The search itself is done by the overload that fills a vector< Island >.
+ * Each island becomes one tab-separated line:
+ * seqI, leftI, rightI, seqJ, leftJ, rightJ.
+ */
+void simpleFindIslands( IntervalList& iv_list, uint island_size, ostream& island_out ){
+	vector< Island > found;
+	simpleFindIslands( iv_list, island_size, found );
+
+	for( vector< Island >::const_iterator isle = found.begin(); isle != found.end(); ++isle )
+	{
+		island_out << isle->seqI << '\t' << isle->leftI << '\t' << isle->rightI << '\t';
+		island_out << isle->seqJ << '\t' << isle->leftJ << '\t' << isle->rightJ << endl;
+	}
+}
+
+
+/**
+ * Collects islands for every pair of sequences in every interval of iv_list.
+ *
+ * Walking the alignment columns of a pair, an "anchor" is a column in which
+ * both rows carry the same character (case-insensitively) and the second row
+ * is not a gap.  Whenever more than island_size residues of either sequence
+ * have been consumed since the previous anchor, an Island spanning that
+ * stretch is appended to island_list.  Coordinates are offsets from the
+ * interval start, counted downwards when the start is negative.
+ *
+ * @param iv_list	intervals to scan
+ * @param island_size	residue count that must be exceeded between anchors
+ * @param island_list	output; islands are appended
+ */
+void simpleFindIslands( IntervalList& iv_list, uint island_size, vector< Island >& island_list ){
+	for( uint ivI = 0; ivI < iv_list.size(); ivI++ ){
+		Interval& cur_iv = iv_list[ ivI ];
+		gnAlignedSequences aln;
+		cur_iv.GetAlignedSequences( aln, iv_list.seq_table );
+		const uint n_seqs = iv_list.seq_table.size();
+
+		for( uint sA = 0; sA < n_seqs; sA++ ){
+			for( uint sB = sA + 1; sB < n_seqs; sB++ ){
+				gnSeqI seenA = 0;	// residues of sA consumed so far
+				gnSeqI seenB = 0;	// residues of sB consumed so far
+				gnSeqI anchorA = 0;	// value of seenA at the last anchor
+				gnSeqI anchorB = 0;	// value of seenB at the last anchor
+
+				for( uint col = 0; col < aln.alignedSeqsSize(); col++ ){
+					const char chA = aln.sequences[ sA ][ col ];
+					const char chB = aln.sequences[ sB ][ col ];
+					if( chA != '-' )
+						seenA++;
+					if( chB != '-' )
+						seenB++;
+
+					// only anchor columns can close an island
+					if( chB == '-' || toupper( chA ) != toupper( chB ) )
+						continue;
+
+					// check for an island that was big enough
+					if( seenA - anchorA > island_size ||
+						seenB - anchorB > island_size ){
+						const int64 startA = cur_iv.Start( sA );
+						const int64 startB = cur_iv.Start( sB );
+						Island isle;
+						isle.seqI = sA;
+						isle.seqJ = sB;
+						isle.leftI = startA < 0 ? startA - anchorA : startA + anchorA;
+						isle.leftJ = startB < 0 ? startB - anchorB : startB + anchorB;
+						isle.rightI = startA < 0 ? startA - seenA : startA + seenA;
+						isle.rightJ = startB < 0 ? startB - seenB : startB + seenB;
+						island_list.push_back( isle );
+					}
+
+					anchorA = seenA;
+					anchorB = seenB;
+				}
+			}
+		}
+	}
+}
+
+
+/**
+ * Identifies stretches of alignment, present in all sequences, that do not
+ * contain a gap larger than a particular size. Such regions are considered
+ * the backbone of the alignment.
+ *
+ * Every interval of iv_list is expanded into a column matrix and scanned
+ * left to right. Both '-' and 'N'/'n' count as gap characters.
+ *
+ * @param iv_list          intervals to scan
+ * @param backbone_size    a stretch is kept only when ends - starts is at
+ *                         least this large in every sequence
+ * @param max_gap_size     a run of more than this many gap characters in any
+ *                         one sequence ends the current stretch
+ * @param backbone_regions output; one GappedAlignment is appended for each
+ *                         accepted stretch
+ */
+void simpleFindBackbone( IntervalList& iv_list, uint backbone_size, uint max_gap_size, vector< GappedAlignment >& backbone_regions ){
+ if( iv_list.size() == 0 )
+ return;
+ for( uint iv_listI = 0; iv_listI < iv_list.size(); iv_listI++ ){
+ Interval& iv = iv_list[ iv_listI ];
+ gnAlignedSequences gnas;
+ uint seqI;
+ uint seq_count = iv_list.seq_table.size();
+ vector< int64 > positions( seq_count ); // running coordinate of each sequence at the current column
+ vector< int64 > starts( seq_count ); // coordinate of each sequence where the current stretch began
+ vector< int64 > ends( seq_count ); // coordinate of each sequence after the last gap-free column
+ vector< uint > gap_size( seq_count, 0 ); // length of the gap run currently open in each sequence
+ uint seqJ;
+ gnSeqI bb_start_col = 0; // first alignment column of the current stretch
+ gnSeqI bb_end_col = 0; // last gap-free alignment column seen so far
+ GappedAlignment cur_backbone( seq_count, 0 ); // empty template copied into backbone_regions
+
+ // initialize positions and starts; a negative start is moved by Length + 1
+ for( seqI = 0; seqI < seq_count; seqI++ ){
+ positions[ seqI ] = iv_list[ iv_listI ].Start( seqI );
+ if( positions[ seqI ] < 0 )
+ positions[ seqI ] -= iv_list[ iv_listI ].Length( seqI ) + 1;
+ }
+ starts = positions;
+ ends = positions;
+
+ iv.GetAlignedSequences( gnas, iv_list.seq_table );
+ bool backbone = true; // assume we are starting out with a complete alignment column
+ uint columnI = 0;
+ vector< int64 > prev_positions; // positions as they were before the current column was consumed
+ for( ; columnI < gnas.alignedSeqsSize(); columnI++ ){
+ bool no_gaps = true;
+ prev_positions = positions;
+ for( seqI = 0; seqI < seq_count; seqI++ ){
+ char cur_char = gnas.sequences[ seqI ][ columnI ];
+ if( cur_char != '-' && toupper(cur_char) != 'N' ){
+ if( gap_size[ seqI ] > max_gap_size && backbone ){
+ // this sequence is leaving a gap run that was too long:
+ // end a stretch of backbone here, but keep it only
+ // if the backbone meets size requirements in each sequence.
+ for( seqJ = 0; seqJ < seq_count; seqJ++ ){
+ if( ends[ seqJ ] - starts[ seqJ ] < backbone_size ){
+ break;
+ }
+ }
+ if( seqJ == seq_count ) {
+ // it's a legitimate stretch of backbone
+ backbone_regions.push_back( cur_backbone );
+ uint bbI = backbone_regions.size() - 1;
+ vector< string > aln_mat( seq_count );
+ for( seqJ = 0; seqJ < seq_count; seqJ++ ){
+ if( starts[ seqJ ] < 0 )
+ backbone_regions[ bbI ].SetStart( seqJ, ends[ seqJ ] + 1);
+ else
+ backbone_regions[ bbI ].SetStart( seqJ, starts[ seqJ ] );
+ backbone_regions[ bbI ].SetLength( ends[ seqJ ] - starts[ seqJ ], seqJ );
+ aln_mat[ seqJ ] = gnas.sequences[ seqJ ].substr( bb_start_col, bb_end_col - bb_start_col + 1);
+ }
+ backbone_regions[ bbI ].SetAlignment(aln_mat);
+ /* NOTE(review): the acceptance test and the emission above are
+ * repeated verbatim after the column loop. */
+ }
+ // we either just finished backbone or a short area that didn't
+ // qualify as backbone
+ // look for a new backbone region
+ backbone = false;
+ }
+ positions[ seqI ]++;
+ gap_size[ seqI ] = 0;
+ }else{
+ gap_size[ seqI ]++;
+ no_gaps = false;
+ }
+ }
+ if( no_gaps ){ // every sequence has a residue in this column
+ bb_end_col = columnI;
+ ends = positions;
+ if( !backbone ){ // first gap-free column after a break: a new stretch starts here
+ starts = prev_positions;
+ bb_start_col = columnI;
+ backbone = true;
+ }
+ }
+ }
+
+ // check for backbone one last time: the stretch still open at the end of the interval
+ for( seqJ = 0; seqJ < seq_count; seqJ++ ){
+ if( ends[ seqJ ] - starts[ seqJ ] < backbone_size ){
+ break;
+ }
+ }
+ if( seqJ == seq_count ) {
+ // it's a legitimate stretch of backbone
+ backbone_regions.push_back( cur_backbone );
+ uint bbI = backbone_regions.size() - 1;
+ vector< string > aln_mat( seq_count );
+ for( seqJ = 0; seqJ < seq_count; seqJ++ ){
+ if( starts[ seqJ ] < 0 )
+ backbone_regions[ bbI ].SetStart( seqJ, ends[ seqJ ] + 1);
+ else
+ backbone_regions[ bbI ].SetStart( seqJ, starts[ seqJ ] );
+ backbone_regions[ bbI ].SetLength( ends[ seqJ ] - starts[ seqJ ], seqJ );
+ aln_mat[ seqJ ] = gnas.sequences[ seqJ ].substr( bb_start_col, bb_end_col - bb_start_col + 1);
+ }
+ backbone_regions[ bbI ].SetAlignment( aln_mat );
+ }
+ }
+}
+
+
+/**
+ * Writes a list of backbone regions, one line per region.
+ *
+ * For each sequence the line holds two tab-separated values: the region's
+ * start coordinate and its far end, computed as start - length when the
+ * start is negative and start + length otherwise.  The columns of successive
+ * sequences are tab-separated as well.
+ */
+void outputBackbone( const vector< GappedAlignment >& backbone_regions, ostream& backbone_out ){
+	for( uint regionI = 0; regionI < backbone_regions.size(); regionI++ ){
+		const GappedAlignment& region = backbone_regions[ regionI ];
+		for( uint sI = 0; sI < region.SeqCount(); sI++ ){
+			if( sI != 0 )
+				backbone_out << '\t';
+
+			const int64 len = (int64)region.Length( sI );
+			int64 far_end = region.Start( sI );
+			far_end = region.Start( sI ) < 0 ? far_end - len : far_end + len;
+
+			backbone_out << region.Start( sI ) << '\t' << far_end;
+		}
+		backbone_out << endl;
+	}
+}
+
+
+// Reports the bounds of the region of sequence seqJ that lies between two LCBs.
+// left_start is the absolute right end of LCB leftI and right_start is the
+// absolute left end of LCB rightI.  An index of -1 means there is no LCB on
+// that side: left_start then becomes 1 and right_start seq_lengths[seqJ] + 1.
+void getGapBounds( vector<gnSeqI>& seq_lengths, vector< LCB >& adjacencies, uint seqJ, int leftI, int rightI, int64& left_start, int64& right_start ){
+	if( leftI == -1 )
+		left_start = 1;
+	else
+		left_start = absolut( adjacencies[ leftI ].right_end[ seqJ ] );
+
+	if( rightI == -1 )
+		right_start = seq_lengths[seqJ] + 1;
+	else
+		right_start = absolut( adjacencies[ rightI ].left_end[ seqJ ] );
+}
+
+/**
+ * Finds the regions of each sequence that are not covered by any LCB and
+ * appends each one to iv_list as its own Interval.
+ *
+ * Both optional arguments are taken by value and filled in locally when
+ * they arrive empty.
+ *
+ * @param iv_list     interval list to extend; its existing intervals are the LCBs
+ * @param seq_set     indices of the sequences to process; empty means all
+ * @param seq_lengths length of each sequence; when empty the lengths are
+ *                    read from iv_list.seq_table
+ */
+void addUnalignedIntervals( IntervalList& iv_list, set< uint > seq_set, vector<gnSeqI> seq_lengths ){
+ vector< LCB > adjacencies;
+ vector< int64 > weights;
+ uint lcbI;
+ uint seqI;
+ if( seq_lengths.size() == 0 )
+ for( seqI = 0; seqI < iv_list.seq_table.size(); seqI++ )
+ seq_lengths.push_back(iv_list.seq_table[seqI]->length());
+
+ uint seq_count = seq_lengths.size();
+
+
+ if( seq_set.size() == 0 )
+ {
+ // if an empty seq set was passed then assume all seqs
+ // should be processed
+ for( seqI = 0; seqI < seq_count; seqI++ )
+ seq_set.insert( seqI );
+ }
+
+ weights = vector< int64 >( iv_list.size(), 0 ); // all-zero weights, one per interval
+ computeLCBAdjacencies_v2( iv_list, weights, adjacencies );
+
+ vector< int > rightmost; // per sequence: index of the LCB that has no right neighbour, -1 until one is seen
+ for( seqI = 0; seqI < seq_count; seqI++ ){
+ rightmost.push_back( -1 );
+ }
+ for( lcbI = 0; lcbI <= adjacencies.size(); lcbI++ ){ // one pass beyond the last LCB handles the tail of each sequence
+ set< uint >::iterator seq_set_iterator = seq_set.begin();
+ for( ; seq_set_iterator != seq_set.end(); seq_set_iterator++ ){
+ seqI = *seq_set_iterator;
+ // scan left
+ int leftI;
+ if( lcbI < adjacencies.size() ){
+// the left neighbour is read from left_adjacency (stored as uint, narrowed to int here)
+ leftI = adjacencies[ lcbI ].left_adjacency[ seqI ];
+ }else
+ leftI = rightmost[ seqI ];
+
+ int rightI = lcbI < adjacencies.size() ? lcbI : -1;
+// remember which LCB is the rightmost one in this sequence
+ if( lcbI < adjacencies.size() )
+ if( adjacencies[ lcbI ].right_adjacency[ seqI ] == -1 )
+ rightmost[ seqI ] = lcbI;
+
+ int64 left_start, right_start;
+ getGapBounds( seq_lengths, adjacencies, seqI, leftI, rightI, left_start, right_start );
+ int64 gap_len = absolut( right_start ) - absolut( left_start );
+ if( gap_len > 0 ){
+ Match mm( seq_count );
+ Match* m = mm.Copy();
+ for( uint seqJ = 0; seqJ < seq_count; seqJ++ ){
+ m->SetStart( seqJ, 0 ); // start 0 in every sequence ...
+ }
+ m->SetStart( seqI, left_start ); // ... except the one that owns the unaligned region
+ m->SetLength( gap_len );
+ vector<AbstractMatch*> tmp(1, m);
+ iv_list.push_back( Interval(tmp.begin(), tmp.end()) );
+ m->Free(); // NOTE(review): presumably Interval keeps its own copy of the match - confirm
+ }
+ }
+ }
+}
+
+
+/**
+ * Reports the regions that lie outside every LCB and are at least
+ * island_size long.
+ *
+ * The unaligned regions are produced by addUnalignedIntervals() on a copy of
+ * iv_list, so the caller's list is not modified.  Each island is written to
+ * island_out as "LCB island:" followed by the sequence index and the left
+ * and right end coordinates, tab-separated.
+ *
+ * @param iv_list	the LCBs
+ * @param island_size	minimum length of a reported region
+ * @param island_out	destination stream
+ */
+void findIslandsBetweenLCBs( IntervalList& iv_list, uint island_size, ostream& island_out ){
+	IntervalList iv_list_tmp = iv_list;
+	addUnalignedIntervals( iv_list_tmp );
+	const uint seq_count = iv_list.seq_table.size();
+
+	// addUnalignedIntervals() appends, so the unaligned intervals are the
+	// ones beyond the original size.  The index is a size_t so that it is
+	// neither truncated nor compared signed-against-unsigned.
+	for( size_t ivI = iv_list.size(); ivI < iv_list_tmp.size(); ivI++ ){
+		for( uint seqI = 0; seqI < seq_count; seqI++ ){
+			// an unaligned interval is defined in a single sequence; the
+			// others carry no coordinates and must never be reported, not
+			// even when island_size is 0
+			if( iv_list_tmp[ ivI ].Start( seqI ) == NO_MATCH )
+				continue;
+			if( iv_list_tmp[ ivI ].Length( seqI ) < island_size )
+				continue;
+
+			// this is an island, write the LCB island out
+			gnSeqI left_end = absolut( iv_list_tmp[ ivI ].Start( seqI ) );
+			gnSeqI right_end = left_end + iv_list_tmp[ ivI ].Length( seqI ) - 1;
+			island_out << "LCB island:\t" << seqI << '\t' << left_end << '\t' << right_end << endl;
+		}
+	}
+}
+
+}
diff --git a/libMems/Islands.h b/libMems/Islands.h
new file mode 100644
index 0000000..66dc18f
--- /dev/null
+++ b/libMems/Islands.h
@@ -0,0 +1,417 @@
+/*******************************************************************************
+ * $Id: Islands.h,v 1.7 2004/03/01 02:40:08 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifndef __Islands_h__
+#define __Islands_h__
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libGenome/gnSequence.h"
+#include "libMems/SubstitutionMatrix.h"
+#include "libMems/IntervalList.h"
+#include "libMems/NumericMatrix.h"
+#include "libMems/MatchList.h"
+#include "libMems/GappedAlignment.h"
+#include "libMems/CompactGappedAlignment.h"
+#include "libMems/Aligner.h"
+#include <boost/multi_array.hpp>
+#include "libMems/HomologyHMM/homology.h"
+#include "libMems/Scoring.h"
+
+namespace mems {
+
+/**
+ * A class to represent an island in an alignment. Islands are generally
+ * large insertions of a region of sequence relative to
+ * another sequence.
+ *
+ * This is a plain record without a constructor, so the members are
+ * uninitialized until assigned. simpleFindIslands() fills it with the
+ * interval start plus (or, for a negative start, minus) a residue count.
+ */
+class Island{
+public:
+ uint seqI; // index of the first sequence of the pair
+ uint seqJ; // index of the second sequence of the pair
+ int64 leftI; // left coordinate of the island in sequence seqI
+ int64 leftJ; // left coordinate of the island in sequence seqJ
+ int64 rightI; // right coordinate of the island in sequence seqI
+ int64 rightJ; // right coordinate of the island in sequence seqJ
+};
+
+/**
+ * Identifies gaps in the alignment between pairs of sequences that are longer than
+ * some number of base pairs in length. Prints islands to an output stream
+ */
+void simpleFindIslands( IntervalList& iv_list, uint island_size, std::ostream& island_out );
+void findIslandsBetweenLCBs( IntervalList& iv_list, uint island_size, std::ostream& island_out );
+void simpleFindIslands( IntervalList& iv_list, uint island_size, std::vector< Island >& island_list );
+/**
+ * A range of alignment columns belonging to one pair of sequences.
+ * Plain record without a constructor; members are uninitialized until
+ * assigned. It is also used, under the name IslandCols, for island segments.
+ */
+class HssCols{
+public:
+ uint seqI; // index of the first sequence of the pair
+ uint seqJ; // index of the second sequence of the pair
+ size_t left_col; // first alignment column of the range
+ size_t right_col; // last alignment column of the range (consumers take right_col - left_col + 1 as the length)
+};
+
+typedef std::vector< HssCols > hss_list_t;
+typedef boost::multi_array< hss_list_t, 3 > hss_array_t;
+
+typedef HssCols IslandCols; // use the same structure for island segs
+
+template<typename MatchVector>
+void hssColsToIslandCols( const MatchVector& iv_list, std::vector< genome::gnSequence* >& seq_table, std::vector< HssCols >& hss_list, std::vector< IslandCols >& island_col_list );
+
+/**
+ * Find regions in each sequence that do not belong to any LCB, add them to their own
+ * Interval (LCB) in the IntervalList.
+ */
+void addUnalignedIntervals( IntervalList& iv_list, std::set< uint > seq_set = std::set< uint >(), std::vector<gnSeqI> seq_lengths = std::vector<gnSeqI>() );
+
+/**
+ * Identifies stretches of alignment, present in all sequences, that do not
+ * contain a gap larger than a particular size. Such regions are considered
+ * the backbone of the alignment.
+ */
+void simpleFindBackbone( IntervalList& iv_list, uint backbone_size, uint max_gap_size, std::vector< GappedAlignment >& backbone_regions );
+
+/**
+ * writes out a list of backbone regions
+ */
+void outputBackbone( const std::vector< GappedAlignment >& backbone_regions, std::ostream& backbone_out );
+
+void getGapBounds( std::vector<gnSeqI>& seq_lengths, std::vector< LCB >& adjacencies, uint seqJ, int leftI, int rightI, int64& left_start, int64& right_start );
+
+
+// Lookup table from 7-bit ASCII characters to the row/column indices of the
+// pairwise emission table: a/A = 0, c/C = 1, g/G = 2, t/T = 3, '-' = 4.
+// Every other character maps to 0.  Filled in by getCharmap().
+static char charmap[128];
+/**
+ * Returns the character lookup table, filling it in first.
+ *
+ * The table has internal linkage, so every translation unit that includes
+ * this header owns a separate copy, whereas a static local flag inside this
+ * inline function would be a single object shared by the whole program.
+ * Guarding the fill with such a flag lets an inlined copy of the function
+ * return its own, never filled, table once any other translation unit has
+ * set the flag.  The table is therefore filled unconditionally: it has
+ * static storage duration and so starts out zeroed, and the stores below
+ * are idempotent, which makes repeated calls harmless.
+ *
+ * @return pointer to the 128-entry table
+ */
+inline
+char* getCharmap()
+{
+	charmap['a'] = 0;
+	charmap['A'] = 0;
+	charmap['c'] = 1;
+	charmap['C'] = 1;
+	charmap['g'] = 2;
+	charmap['G'] = 2;
+	charmap['t'] = 3;
+	charmap['T'] = 3;
+	charmap['-'] = 4;
+	return charmap;
+}
+// A mapping from pairwise alignment columns to HomologyHMM emission codes.
+/* Row/column indices are as given by the charmap above (ACGT- == 01234).
+ * The table is symmetric. Identical bases give '1' (A/A, T/T) or '2'
+ * (C/C, G/G), mismatches give '3' to '6', a base against a gap gives '7',
+ * and a gap against a gap gives '\0', which findHssHomologyHMM() filters
+ * out of the state string.
+ */
+static char colmap[5][5] = {
+// A    C    G    T    -
+ {'1','3','4','5','7'}, // A
+ {'3','2','6','4','7'}, // C
+ {'4','6','2','3','7'}, // G
+ {'5','4','3','1','7'}, // T
+ {'7','7','7','7','\0'}, // -
+};
+
+
+/**
+ * Predicts the homologous segments (HSS) of the pairwise alignment formed by
+ * rows seqI and seqJ of aln_table and appends them to hss_list.
+ *
+ * Each column is turned into an emission code through charmap and colmap.
+ * Columns that are a gap in both rows are dropped, and col_reference
+ * remembers which original column each remaining state came from.  A gap
+ * column ('7') whose neighbours on both sides are gap columns too is recoded
+ * as '8'; at either end of the state string one gap neighbour is enough.
+ * The state string is handed to run(), and every maximal run of 'H' in the
+ * returned prediction becomes one HssCols expressed in original alignment
+ * columns, both ends inclusive.
+ *
+ * @param aln_table	alignment rows; row 0 supplies the column count
+ * @param hss_list	output; segments are appended
+ * @param seqI		row index of the first sequence
+ * @param seqJ		row index of the second sequence
+ * @param hmm_params	parameters passed through to run()
+ * @param left_homologous	see right_homologous
+ * @param right_homologous	when set while left_homologous is not, the
+ *			state string is reversed before run() and the
+ *			prediction is reversed back afterwards
+ */
+inline
+void findHssHomologyHMM( std::vector< std::string >& aln_table, hss_list_t& hss_list, uint seqI, uint seqJ, const Params& hmm_params,
+	boolean left_homologous, boolean right_homologous )
+{
+	static char* charmap = getCharmap();
+
+	// encode the alignment as column states
+	std::string column_states(aln_table[0].size(),'q');
+	std::vector< size_t > col_reference(column_states.size(), (std::numeric_limits<size_t>::max)() );
+	size_t refI = 0;
+	for( size_t colI = 0; colI < column_states.size(); colI++ )
+	{
+		// The table has 128 entries.  Index it through unsigned char and
+		// treat anything beyond 7-bit ASCII like every other unmapped
+		// character (code 0), so that a negative or large char value can
+		// never read outside the table.
+		const unsigned char chI = (unsigned char)aln_table[seqI][colI];
+		const unsigned char chJ = (unsigned char)aln_table[seqJ][colI];
+		const int a = chI < 128 ? charmap[chI] : 0;
+		const int b = chJ < 128 ? charmap[chJ] : 0;
+		column_states[colI] = colmap[a][b];
+		if(column_states[colI] != 0 )
+			col_reference[refI++] = colI;
+	}
+	// filter out the gap/gap cols
+	std::string::iterator sitr = std::remove(column_states.begin(), column_states.end(), 0);
+	column_states.resize(sitr - column_states.begin());
+
+	// nothing but gap/gap columns (or no columns at all): no segment can exist
+	if( column_states.empty() )
+		return;
+
+	// recode the interior of gap runs
+	for( size_t colI = 2; colI < column_states.size(); colI++ )
+	{
+		if( column_states[colI] == '7' &&
+			column_states[colI-1] == '7' &&
+			(column_states[colI-2] == '7' || column_states[colI-2] == '8') )
+			column_states[colI-1] = '8';
+	}
+	if( column_states.size() > 1 && column_states[0] == '7' && (column_states[1] == '7' || column_states[1] == '8'))
+		column_states[0] = '8';
+	if( column_states.size() > 1 && column_states[column_states.size()-1] == '7' && (column_states[column_states.size()-2] == '7'|| column_states[column_states.size()-2] == '8') )
+		column_states[column_states.size()-1] = '8';
+
+	// now feed it to the Homology prediction HMM
+	std::string prediction;
+	const bool reversed = right_homologous && !left_homologous;
+	if( reversed )
+		std::reverse(column_states.begin(), column_states.end());
+
+	run(column_states, prediction, hmm_params);
+
+	if( reversed )
+		std::reverse(prediction.begin(), prediction.end());
+
+	// an empty prediction holds no segment; do not read prediction[0]
+	if( prediction.empty() )
+		return;
+
+	size_t prev_h = 0;	// state index at which the current 'H' run began
+	size_t i = 1;
+	for( ; i < prediction.size(); i++ )
+	{
+		if( prediction[i] == 'H' && prediction[i-1] == 'N' )
+		{
+			prev_h = i;
+		}
+		if( prediction[i] == 'N' && prediction[i-1] == 'H' )
+		{
+			HssCols hc;
+			hc.seqI = seqI;
+			hc.seqJ = seqJ;
+			hc.left_col = col_reference[prev_h];
+			hc.right_col = col_reference[i-1];
+			hss_list.push_back(hc);
+			prev_h = i;
+		}
+	}
+	// get the last one
+	if( prediction[i-1] == 'H' )
+	{
+		HssCols hc;
+		hc.seqI = seqI;
+		hc.seqJ = seqJ;
+		hc.left_col = col_reference[prev_h];
+		hc.right_col = col_reference[i-1];
+		hss_list.push_back(hc);
+	}
+}
+
+
+/**
+ * Runs the pairwise HSS prediction for every pair of sequences in every
+ * alignment of iv_list.
+ *
+ * hss_array is resized to [seq_count][seq_count][iv_list.size()]; the list
+ * stored at [seqI][seqJ][ivI], seqI < seqJ, is cleared and refilled with the
+ * segments of that pair in alignment ivI.  Nothing happens, not even the
+ * resize, when iv_list is empty.
+ *
+ * @param iv_list	container of pointer-like alignment handles
+ * @param seq_table	the sequences the alignments refer to
+ * @param hss_array	output
+ * @param hmm_params	passed through to the pairwise overload
+ * @param left_homologous	passed through to the pairwise overload
+ * @param right_homologous	passed through to the pairwise overload
+ */
+template< typename MatchVector >
+void findHssHomologyHMM( const MatchVector& iv_list, std::vector< genome::gnSequence* >& seq_table, hss_array_t& hss_array, const Params& hmm_params, boolean left_homologous, boolean right_homologous )
+{
+	if( iv_list.size() == 0 )
+		return;
+
+	const uint n_seqs = seq_table.size();
+	hss_array.resize( boost::extents[n_seqs][n_seqs][iv_list.size()] );
+
+	for( uint ivI = 0; ivI < iv_list.size(); ivI++ )
+	{
+		std::vector< std::string > rows;
+		GetAlignment( *iv_list[ ivI ], seq_table, rows );
+
+		for( uint sA = 0; sA < n_seqs; sA++ )
+		{
+			for( uint sB = sA + 1; sB < n_seqs; sB++ )
+			{
+				hss_list_t& pair_hss = hss_array[sA][sB][ivI];
+				pair_hss.clear();
+				findHssHomologyHMM( rows, pair_hss, sA, sB, hmm_params, left_homologous, right_homologous );
+			}
+		}
+	}
+}
+
+
+/**
+ * Converts per-pair HSS lists into per-pair island lists.
+ *
+ * island_col_array is resized to [seq_count][seq_count][iv_list.size()] and,
+ * for every alignment and every pair seqI < seqJ, ComplementHss() appends the
+ * columns not covered by hss_array[seqI][seqJ][ivI] to the matching island
+ * list.
+ */
+template< typename MatchVector >
+void HssColsToIslandCols( const MatchVector& iv_list, std::vector< genome::gnSequence* >& seq_table, hss_array_t& hss_array, hss_array_t& island_col_array )
+{
+	const uint n_seqs = seq_table.size();
+	island_col_array.resize( boost::extents[n_seqs][n_seqs][iv_list.size()] );
+
+	for( uint ivI = 0; ivI < iv_list.size(); ivI++ )
+	{
+		for( uint sA = 0; sA < n_seqs; sA++ )
+		{
+			for( uint sB = sA + 1; sB < n_seqs; sB++ )
+			{
+				hss_list_t& pair_hss = hss_array[sA][sB][ivI];
+				hss_list_t& pair_islands = island_col_array[sA][sB][ivI];
+				// ComplementHss() is declared further down in this header.
+				// The first argument is left as an expression on iv_list so
+				// the call stays type-dependent and is resolved when the
+				// template is instantiated.
+				ComplementHss( iv_list[ivI]->AlignmentLength(), pair_hss, pair_islands, sA, sB );
+			}
+		}
+	}
+}
+/**
+ * Computes the complement of a list of HSS column ranges: the column ranges
+ * of an alignment that no HSS covers.
+ *
+ * Column ranges are inclusive at both ends, matching the HSS ranges and the
+ * final island, which ends at alignment_length - 1.
+ *
+ * @param alignment_length	number of columns in the alignment
+ * @param hss_list		HSS ranges, expected in left-to-right order
+ * @param island_col_list	output; the uncovered ranges are appended
+ * @param seqI			stored in each appended range
+ * @param seqJ			stored in each appended range
+ */
+inline
+void ComplementHss( const size_t alignment_length, hss_list_t& hss_list, hss_list_t& island_col_list, uint seqI=0, uint seqJ=0 )
+{
+	size_t left_col = 0;	// first column not yet accounted for
+	for( size_t hssI = 0; hssI < hss_list.size(); ++hssI )
+	{
+		const HssCols& hss = hss_list[hssI];
+		// Nothing lies between left_col and an HSS that starts at or
+		// before it; this also covers an HSS starting at column 0.
+		if( left_col < hss.left_col )
+		{
+			// ending an island
+			IslandCols isle;
+			isle.seqI = seqI;
+			isle.seqJ = seqJ;
+			isle.left_col = left_col;
+			// The island stops on the column before the HSS.  Using the
+			// first HSS column here would make the two ranges share it.
+			// hss.left_col > left_col >= 0, so this cannot underflow.
+			isle.right_col = hss.left_col - 1;
+			island_col_list.push_back(isle);
+		}
+		left_col = hss.right_col + 1;
+	}
+
+	if( left_col < alignment_length )
+	{
+		// add the last island
+		IslandCols isle;
+		isle.seqI = seqI;
+		isle.seqJ = seqJ;
+		isle.left_col = left_col;
+		isle.right_col = alignment_length-1;
+		island_col_list.push_back(isle);
+	}
+}
+/**
+ * Extracts every column range listed in hss_array from its alignment and
+ * appends the result to cga_list as a newly allocated CompactGappedAlignment.
+ *
+ * @param iv_list   container of pointer-like alignment handles
+ * @param seq_table the sequences the alignments refer to; only its size is used here
+ * @param hss_array column ranges, indexed [seqI][seqJ][alignment] with seqI < seqJ
+ * @param cga_list  output; the appended objects come from Copy() and are
+ *                  presumably to be released with Free() by the caller
+ */
+template< typename MatchVector >
+void HssArrayToCga( const MatchVector& iv_list, std::vector< genome::gnSequence* >& seq_table, hss_array_t& hss_array, std::vector< CompactGappedAlignment<>* >& cga_list )
+{
+ typedef typename MatchVector::value_type MatchType;
+ uint seq_count = seq_table.size();
+ for( uint iv_listI = 0; iv_listI < iv_list.size(); iv_listI++ ){
+ const MatchType& iv = iv_list[ iv_listI ];
+ /* Use the alignment directly when it already is a CompactGappedAlignment;
+ * otherwise build a temporary compact copy of it. */
+ CompactGappedAlignment<>* iv_cga = dynamic_cast< CompactGappedAlignment<>* >(iv);
+ bool allocated = false; // true when iv_cga is a temporary that must be freed below
+ if( iv_cga == NULL )
+ {
+ CompactGappedAlignment<> tmp_cga;
+ iv_cga = tmp_cga.Copy(); // obtain storage for a new object
+ new (iv_cga) CompactGappedAlignment<>(*iv); // construct the converted alignment in that storage
+ allocated = true;
+ }
+ for( uint seqI = 0; seqI < seq_count; seqI++ ){
+ for( uint seqJ = seqI + 1; seqJ < seq_count; seqJ++ ){
+ hss_list_t& isle_list = hss_array[seqI][seqJ][iv_listI];
+ for( size_t curI = 0; curI < isle_list.size(); ++curI )
+ {
+ // extract a cga covering columns left_col..right_col inclusive
+ CompactGappedAlignment<> tmp_cga;
+ cga_list.push_back( tmp_cga.Copy() );
+ iv_cga->copyRange( *(cga_list.back()), isle_list[curI].left_col, isle_list[curI].right_col - isle_list[curI].left_col + 1 );
+ if( cga_list.back()->LeftEnd(0) == NO_MATCH ) // NOTE(review): only sequence 0 is tested - confirm that is intended
+ {
+ // this one must have been covering an invalid region (gaps aligned to gaps)
+ cga_list.back()->Free();
+ cga_list.erase( cga_list.end()-1 );
+ }
+ }
+ }
+ }
+ if( allocated )
+ iv_cga->Free();
+ }
+}
+
+/**
+ * Appends to iv_list one Interval for each new match that AddGapMatches()
+ * creates, processing the sequences of iv_list.seq_table one at a time.
+ *
+ * @param iv_list interval list to extend; must provide seq_table, size(),
+ *                resize() and operator[]
+ */
+template< class IntervalListType >
+void addUnalignedRegions( IntervalListType& iv_list)
+{
+ std::vector< AbstractMatch* > new_ivs; // matches created by AddGapMatches, for all sequences
+ std::vector< AbstractMatch* > iv_ptrs(iv_list.size()); // pointers to the existing intervals
+ for( size_t i = 0; i < iv_list.size(); ++i )
+ iv_ptrs[i] = &iv_list[i];
+ for( size_t seqI = 0; seqI < iv_list.seq_table.size(); ++seqI )
+ {
+ SingleStartComparator< AbstractMatch > ssc( seqI );
+ std::sort( iv_ptrs.begin(), iv_ptrs.end(), ssc );
+ size_t ivI = 0;
+ for( ; ivI < iv_ptrs.size(); ++ivI ) // skip the leading intervals that are undefined in seqI
+ if( iv_ptrs[ivI]->LeftEnd(seqI) != NO_MATCH )
+ break;
+ std::list< AbstractMatch* > iv_ptr_list;
+ iv_ptr_list.insert( iv_ptr_list.end(), iv_ptrs.begin()+ivI, iv_ptrs.end() );
+ AddGapMatches( iv_ptr_list, iv_ptr_list.begin(), iv_ptr_list.end(), seqI, 1, iv_list.seq_table[seqI]->length()+1, AbstractMatch::forward, iv_list.seq_table.size() );
+ std::list< AbstractMatch* >::iterator iter = iv_ptr_list.begin();
+ while( ivI != iv_ptrs.size() && iter != iv_ptr_list.end() ) // walk both sequences; list entries not in iv_ptrs are new
+ {
+ if( iv_ptrs[ivI] == *iter )
+ ivI++;
+ else
+ new_ivs.push_back( *iter );
+ ++iter;
+ }
+ while( iter != iv_ptr_list.end() ) // whatever follows the last existing interval is new as well
+ {
+ new_ivs.push_back( *iter );
+ ++iter;
+ }
+ }
+ // now add all the new intervals to iv_list
+ size_t prev_size = iv_list.size();
+ iv_list.resize( iv_list.size() + new_ivs.size() );
+ for( size_t newI = 0; newI < new_ivs.size(); ++newI )
+ {
+ Interval iv( new_ivs.begin() + newI, new_ivs.begin() + newI + 1 );
+ iv_list[prev_size + newI] = iv;
+ new_ivs[newI]->Free(); // NOTE(review): presumably Interval keeps its own copy of the match - confirm
+ }
+}
+
+
+/**
+ * Finds, for every pair of sequences in every alignment, the runs of gapped
+ * columns that are at least big_gap_size columns long.
+ *
+ * A column counts towards a run when exactly one of the two rows has a gap.
+ * Columns where both rows have a gap neither extend nor end a run.  A column
+ * where both rows have a residue ends the run.  A run that reaches the last
+ * column of the alignment is reported as well.
+ *
+ * hss_array is resized to [seq_count][seq_count][iv_list.size()]; the list
+ * at [seqI][seqJ][ivI], seqI < seqJ, is cleared and then receives the runs
+ * as inclusive column ranges.  Pairs in which either sequence is absent from
+ * the alignment end up with an empty list.
+ *
+ * @param iv_list	container of pointer-like alignment handles
+ * @param seq_table	the sequences the alignments refer to
+ * @param hss_array	output
+ * @param big_gap_size	minimum number of counted columns in a reported run
+ */
+template< typename MatchVector >
+void findBigGaps( const MatchVector& iv_list, std::vector< genome::gnSequence* >& seq_table, hss_array_t& hss_array, size_t big_gap_size )
+{
+	typedef typename MatchVector::value_type MatchType;
+	if( iv_list.size() == 0 )
+		return;
+	uint seq_count = seq_table.size();
+	hss_array.resize( boost::extents[seq_count][seq_count][iv_list.size()] );
+	for( uint iv_listI = 0; iv_listI < iv_list.size(); iv_listI++ ){
+		const MatchType& iv = iv_list[ iv_listI ];
+		std::vector< std::string > aln_table;
+		GetAlignment( *iv, seq_table, aln_table );
+
+		for( uint seqI = 0; seqI < seq_count; seqI++ ){
+			for( uint seqJ = seqI + 1; seqJ < seq_count; seqJ++ )
+			{
+				// clear first, so a pair that is skipped below cannot keep
+				// entries from an earlier use of hss_array
+				hss_list_t& hss_list = hss_array[seqI][seqJ][iv_listI];
+				hss_list.clear();
+
+				if( iv->LeftEnd(seqI) == NO_MATCH || iv->LeftEnd(seqJ) == NO_MATCH )
+					continue;
+
+				const size_t col_count = aln_table[seqI].size();
+				size_t gap_count = 0;	// counted columns in the open run
+				size_t gap_lend = 0;	// first column of the open run
+				for( size_t cI = 0; cI < col_count; cI++ )
+				{
+					const bool gapI = aln_table[seqI][cI] == '-';
+					const bool gapJ = aln_table[seqJ][cI] == '-';
+					if( gapI || gapJ )
+					{
+						if( gapI != gapJ )
+						{
+							if( gap_count == 0 )
+								gap_lend = cI;
+							gap_count++;
+						}
+						continue;
+					}
+
+					// both rows have a residue: the open run, if any, ends
+					// on the previous column.  gap_count > 0 implies cI > 0,
+					// so cI-1 cannot underflow even when big_gap_size is 0.
+					if( gap_count > 0 && gap_count >= big_gap_size )
+					{
+						HssCols hc;
+						hc.seqI = seqI;
+						hc.seqJ = seqJ;
+						hc.left_col = gap_lend;
+						hc.right_col = cI-1;
+						hss_list.push_back( hc );
+					}
+					gap_count = 0;
+				}
+
+				// a run still open here extends to the end of the alignment
+				if( gap_count > 0 && gap_count >= big_gap_size )
+				{
+					HssCols hc;
+					hc.seqI = seqI;
+					hc.seqJ = seqJ;
+					hc.left_col = gap_lend;
+					hc.right_col = col_count-1;
+					hss_list.push_back( hc );
+				}
+			}
+		}
+	}
+}
+
+
+}
+
+#endif // __Islands_h__
diff --git a/libMems/LCB.h b/libMems/LCB.h
new file mode 100644
index 0000000..65609b6
--- /dev/null
+++ b/libMems/LCB.h
@@ -0,0 +1,70 @@
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#ifndef __LCB_h__
+#define __LCB_h__
+
+#include <vector>
+#include <libGenome/gnDefs.h>
+
+namespace mems {
+
+/**
+ * This class is used to track relationships between LCBs during the LCB determination process.
+ * All per-sequence vectors are indexed by sequence number; the constructor
+ * leaves them empty.
+ */
+class LCB{
+public:
+ LCB() : lcb_id(0), weight(0), to_be_deleted(false) {};
+ std::vector< int64 > left_end; /**< The left end position of the LCB in each sequence */
+ std::vector< int64 > right_end; /**< The right end position of the LCB in each sequence */
+ std::vector< uint > left_adjacency; /**< 'Pointers' (actually IDs) to the LCBs on the left in each sequence. NOTE(review): the element type is unsigned although callers narrow it to int - confirm the sentinel convention */
+ std::vector< uint > right_adjacency; /**< 'Pointers' (actually IDs) to the LCBs on the right in each sequence. NOTE(review): callers compare this against -1, which for an unsigned element is (uint)-1 */
+ int lcb_id; /**< A numerical ID that can be assigned to this LCB */
+ double weight; /**< The weight (or coverage) of this LCB */
+ bool to_be_deleted; /**< set to true if this LCB is about to be deleted, but the deletion hasn't yet been processed */
+};
+
+/**
+ * Orders LCBs by the absolute value of their left end in one sequence.
+ * Used by the LCB construction algorithm.
+ *
+ * An LCB whose left end in that sequence is NO_MATCH sorts before every LCB
+ * that has one; two such LCBs compare as equivalent.
+ */
+class LCBLeftComparator {
+public:
+	LCBLeftComparator( uint seq ) : m_seq(seq){}
+	bool operator()(const LCB& a, const LCB& b) const{
+		const int64 lhs = a.left_end[ m_seq ];
+		const int64 rhs = b.left_end[ m_seq ];
+
+		// at least one side is undefined: a precedes b only when b is defined
+		if( lhs == NO_MATCH || rhs == NO_MATCH )
+			return rhs != NO_MATCH;
+
+		const int64 lhs_abs = lhs < 0 ? -lhs : lhs;
+		const int64 rhs_abs = rhs < 0 ? -rhs : rhs;
+		return lhs_abs < rhs_abs;
+	}
+protected:
+	uint m_seq;	// index of the sequence whose coordinates are compared
+private:
+	LCBLeftComparator();	// not available: a sequence index is required
+};
+
+// Orders LCBs by ascending lcb_id.
+class LCBIDComparator {
+public:
+	bool operator()(const LCB& a, const LCB& b) const
+	{
+		return b.lcb_id > a.lcb_id;
+	}
+};
+
+
+} // namespace mems
+
+
+#endif // __LCB_h__
+
diff --git a/libMems/Makefile.am b/libMems/Makefile.am
new file mode 100644
index 0000000..936b26a
--- /dev/null
+++ b/libMems/Makefile.am
@@ -0,0 +1,85 @@
+
+if DEBUG
+D_CXXFLAGS = -Wall -g -D__GNDEBUG__
+endif
+OPTIMIZATION = -O2 -funroll-loops -fomit-frame-pointer -ftree-vectorize
+AM_CFLAGS = $(OPTIMIZATION) @DEPS_CFLAGS@ -DUSE_POSIX_AIO @OPENMP_CFLAGS@
+AM_CXXFLAGS = $(OPTIMIZATION) @DEPS_CFLAGS@ @BOOST_CPPFLAGS@ $(D_CXXFLAGS) @EXTRA_CXX_FLAGS@ @OPENMP_CXXFLAGS@
+AM_LDFLAGS = $(OPTIMIZATION)
+
+LIBMEMS_H = \
+RepeatHash.h MatchHashEntry.h \
+DNAFileSML.h MemorySML.h MatchProjectionAdapter.h \
+DNAMemorySML.h MatchFinder.h SortedMerList.h IntervalList.h \
+FileSML.h gnAlignedSequences.h Interval.h \
+MemHash.h AbstractMatch.h SlotAllocator.h \
+Aligner.h Match.h MatchList.h Matrix.h NumericMatrix.h \
+Islands.h MaskedMemHash.h SeedMasks.h GappedAlignment.h \
+MuscleInterface.h GappedAligner.h PhyloTree.h SparseAbstractMatch.h \
+DenseAbstractMatch.h RepeatMatch.h UngappedLocalAlignment.h \
+AbstractGappedAlignment.h CompactGappedAlignment.h HybridAbstractMatch.h \
+twister.h SubstitutionMatrix.h RepeatMatchList.h \
+Backbone.h ProgressiveAligner.h PairwiseMatchAdapter.h PairwiseMatchFinder.h \
+SeedOccurrenceList.h TreeUtilities.h SuperInterval.h GreedyBreakpointElimination.h \
+LCB.h DistanceMatrix.h Scoring.h configuration.h Memory.h Files.h gnRAWSequence.h
+
+HOMOLOGYHMM_H = HomologyHMM/homology.h HomologyHMM/dptables.h HomologyHMM/algebras.h HomologyHMM/parameters.h
+
+DMSML_H = \
+dmSML/asyncio.h dmSML/alinuxaio.h dmSML/aPOSIXaio.h \
+dmSML/alibc.h dmSML/awin32aio.h dmSML/buffer.h \
+dmSML/util.h dmSML/sorting.h dmSML/dmsort.h \
+dmSML/timing.h dmSML/sml.h
+
+LIBMEMS_SRC = \
+RepeatHash.cpp \
+DNAFileSML.cpp MatchFinder.cpp \
+DNAMemorySML.cpp MemorySML.cpp SortedMerList.cpp \
+FileSML.cpp MemHash.cpp MatchHashEntry.cpp \
+Interval.cpp IntervalList.cpp twister.c \
+gnAlignedSequences.cpp \
+MatchList.cpp Aligner.cpp \
+Islands.cpp MaskedMemHash.cpp GappedAlignment.cpp \
+MuscleInterface.cpp PhyloTree.cpp \
+RepeatMatchList.cpp RepeatMatch.cpp \
+Backbone.cpp PairwiseMatchFinder.cpp ProgressiveAligner.cpp \
+SuperInterval.cpp GreedyBreakpointElimination.cpp
+
+HOMOLOGYHMM_SRC = \
+HomologyHMM/algebras.cc HomologyHMM/homology.cc HomologyHMM/homologymain.cc
+
+DMSML_SRC = \
+dmSML/asyncio.c dmSML/alinuxaio.c dmSML/aPOSIXaio.c \
+dmSML/alibc.c dmSML/awin32aio.c dmSML/buffer.c \
+dmSML/util.c dmSML/sorting.c dmSML/dmsort.c \
+dmSML/timing.c dmSML/sml.c
+
+
+libmems_includedir=$(includedir)/$(GENERIC_LIBRARY_NAME)-$(GENERIC_API_VERSION)/$(GENERIC_LIBRARY_NAME)
+libmems_include_HEADERS = $(LIBMEMS_H)
+
+# build libraries with gcc (no suffix)
+lib_LTLIBRARIES = libMems-1.6.la
+libMems_1_6_la_SOURCES = $(LIBMEMS_SRC) $(HOMOLOGYHMM_SRC) $(DMSML_SRC)
+
+libMems_1_6_la_LDFLAGS= -version-info $(GENERIC_LIBRARY_VERSION)
+
+homologyhmm_includedir=$(includedir)/$(GENERIC_LIBRARY_NAME)-$(GENERIC_API_VERSION)/$(GENERIC_LIBRARY_NAME)/HomologyHMM
+homologyhmm_include_HEADERS = $(HOMOLOGYHMM_H)
+
+dmsml_includedir=$(includedir)/$(GENERIC_LIBRARY_NAME)-$(GENERIC_API_VERSION)/$(GENERIC_LIBRARY_NAME)/dmSML
+dmsml_include_HEADERS = $(DMSML_H)
+
+EXTRA_DIST = \
+HomologyHMM/homology.xml
+
+#EXTRA_PROGRAMS = TestSML TestSMLstatic
+
+
+#TestSMLstatic_SOURCES = TestSML.cpp
+#TestSMLstatic_INCLUDES = -I$(top_srcdir)/include/ `wx-config --cxxflags`
+#TestSMLstatic_LDFLAGS = -static $(top_builddir)/libMems/libMems.a `wx-config --static --libs`
+
+
+
+
diff --git a/libMems/MaskedMemHash.cpp b/libMems/MaskedMemHash.cpp
new file mode 100644
index 0000000..8741728
--- /dev/null
+++ b/libMems/MaskedMemHash.cpp
@@ -0,0 +1,65 @@
+/*******************************************************************************
+ * $Id: MaskedMemHash.cpp,v 1.3 2004/03/01 02:40:08 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * Please see the file called COPYING for licensing, copying, and modification
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/MaskedMemHash.h"
+#include <list>
+
+using namespace std;
+using namespace genome;
+namespace mems {
+
+// Creates a hash with no sequence mask (0), i.e. nothing is filtered.
+MaskedMemHash::MaskedMemHash()
+	: seq_mask( 0 )
+{
+}
+
+
+// Copy constructor: the MemHash part is copied by the base class copy
+// constructor, the sequence mask is copied here.
+MaskedMemHash::MaskedMemHash(const MaskedMemHash& mh)
+	: MemHash( mh ),
+	  seq_mask( mh.seq_mask )
+{
+}
+
+/**
+ * Assigns the complete state of mh to this object.
+ *
+ * The MemHash part is assigned through the base class operator.  Copying
+ * only seq_mask would leave the target with its old hash contents and the
+ * new mask.  Self-assignment is a no-op.
+ *
+ * @param mh	the object to copy from
+ * @return	*this
+ */
+MaskedMemHash& MaskedMemHash::operator=( const MaskedMemHash& mh ){
+	if( this != &mh ){
+		MemHash::operator=( mh );
+		seq_mask = mh.seq_mask;
+	}
+	return *this;
+}
+
+// Returns a heap-allocated copy of this object, made with the copy
+// constructor.  The caller owns the returned pointer.
+MaskedMemHash* MaskedMemHash::Clone() const{
+	MaskedMemHash* duplicate = new MaskedMemHash( *this );
+	return duplicate;
+}
+
+/**
+ * Builds a hash entry from a list of seed hits and stores it if it passes
+ * the sequence mask.
+ *
+ * match_list is sorted in place with idmer_id_lessthan.  The entry gets the
+ * seed length of sorted mer list 0 and, for every hit, the start
+ * position + 1 in the sequence given by the hit's id.  A presence bit
+ * pattern is then formed with one bit per sequence, sequence 0 in the most
+ * significant position.  The entry is added when no mask is set
+ * (seq_mask == 0) or when the pattern equals seq_mask exactly.
+ *
+ * @param match_list	seed hits; reordered by this call
+ * @return		always true
+ */
+boolean MaskedMemHash::HashMatch(list<idmer>& match_list){
+	match_list.sort(&idmer_id_lessthan);
+
+	// initialize the hash entry
+	MatchHashEntry mhe = MatchHashEntry(seq_count, GetSar(0)->SeedLength());
+	mhe.SetLength(GetSar(0)->SeedLength());
+
+	// Fill in the new Match and set direction parity if needed.
+	for( list<idmer>::iterator hit = match_list.begin(); hit != match_list.end(); hit++ )
+		mhe.SetStart(hit->id, hit->position + 1);
+	SetDirection(mhe);
+	mhe.CalculateOffset();
+
+	// compute "MatchNumber": one presence bit per sequence
+	uint64 presence_bits = 0;
+	for( uint seqI = 0; seqI < mhe.SeqCount(); seqI++ )
+	{
+		presence_bits <<= 1;
+		if( mhe.Start(seqI) != NO_MATCH )
+			presence_bits |= 1;
+	}
+
+	const bool unfiltered = seq_mask == 0;
+	if( unfiltered || presence_bits == seq_mask )
+		AddHashEntry(mhe);
+
+	return true;
+}
+
+} // namespace mems
diff --git a/libMems/MaskedMemHash.h b/libMems/MaskedMemHash.h
new file mode 100644
index 0000000..29c4460
--- /dev/null
+++ b/libMems/MaskedMemHash.h
@@ -0,0 +1,44 @@
+/*******************************************************************************
+ * $Id: MaskedMemHash.h,v 1.3 2004/03/01 02:40:08 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifndef _MaskedMemHash_h_
+#define _MaskedMemHash_h_
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/MemHash.h"
+
+namespace mems {
+
+/**
+ * A MemHash that only keeps matches whose set of participating sequences
+ * equals a given sequence mask, e.g. 0b11111 for 5-way matches.
+ * Nothing is filtered unless a mask is set using SetMask(); the filter can be
+ * cleared again by calling SetMask(0).
+ */
+class MaskedMemHash : public MemHash{
+public:
+ /** Creates a hash with the mask cleared (0). */
+ MaskedMemHash();
+ ~MaskedMemHash(){};
+ MaskedMemHash(const MaskedMemHash& mh);
+ MaskedMemHash& operator=( const MaskedMemHash& mh );
+ /** @return a newly allocated copy of this object */
+ virtual MaskedMemHash* Clone() const;
+ /** Sets the sequence mask a match must equal to be kept; 0 disables filtering. */
+ virtual void SetMask( uint64 seq_mask ){ this->seq_mask = seq_mask; }
+protected:
+ /**
+ * Can't find subsets when there is only one permitted sequence mask!
+ * Intentionally a no-op: subset_matches is left unchanged.
+ */
+ virtual void FindSubsets(const Match& mhe, std::vector<Match>& subset_matches){};
+ /** Builds a match from match_list and stores it only if it satisfies seq_mask. */
+ virtual boolean HashMatch(std::list<idmer>& match_list);
+ /** Required presence pattern, one bit per sequence; 0 means "no filtering". */
+ uint64 seq_mask;
+};
+
+}
+
+#endif //_MaskedMemHash_h_
diff --git a/libMems/Match.h b/libMems/Match.h
new file mode 100644
index 0000000..b7f9200
--- /dev/null
+++ b/libMems/Match.h
@@ -0,0 +1,33 @@
+/*******************************************************************************
+ * $Id: GenericMatch.h,v 1.10 2004/03/01 02:40:08 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifndef _Match_h_
+#define _Match_h_
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libGenome/gnClone.h"
+#include <iostream>
+#include <set>
+#include "libMems/UngappedLocalAlignment.h"
+#include "libMems/SparseAbstractMatch.h"
+#include "libMems/DenseAbstractMatch.h"
+#include "libMems/HybridAbstractMatch.h"
+
+namespace mems {
+
+/** The default match type: an ungapped local alignment stored in a HybridAbstractMatch. */
+typedef UngappedLocalAlignment< HybridAbstractMatch<> > Match;
+
+// First sequence index examined by MatchHashEntry::start_lessthan_ptr when it
+// breaks ties between two entries.
+// NOTE(review): because this is declared `static` in a header, every translation
+// unit that includes Match.h gets its own independent copy of the variable;
+// setting it in one .cpp file does not affect comparisons compiled in another.
+static uint seq_compare_start;
+
+
+}
+
+#endif // _Match_h_
diff --git a/libMems/MatchFinder.cpp b/libMems/MatchFinder.cpp
new file mode 100644
index 0000000..3c9c0fa
--- /dev/null
+++ b/libMems/MatchFinder.cpp
@@ -0,0 +1,444 @@
+/*******************************************************************************
+ * $Id: MatchFinder.cpp,v 1.39 2004/03/01 02:40:08 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * Please see the file called COPYING for licensing, copying, and modification
+ * terms.
+ * **************
+ ******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/MatchFinder.h"
+
+using namespace std;
+using namespace genome;
+namespace mems {
+
+/**
+ * Creates an empty match finder: no sequences, default mer size, no progress
+ * or offset logging.  Every scalar member is given a defined value so that a
+ * freshly constructed (or copied) object never carries indeterminate data.
+ */
+MatchFinder::MatchFinder(){
+	mer_size = DNA_MER_SIZE;
+	seq_count = 0;
+	ambiguity_tolerance = 0;
+	// subset-match bookkeeping; alphabet_bits is set for real by AddSequence()
+	alpha_map_size = 0;
+	alphabet_bits = 0;
+	// -1 means "no computation in progress"
+	m_progress = -1;
+	log_stream = NULL;
+	mers_processed = 0;
+	total_mers = 0;
+	offset_stream = NULL;
+}
+
+// The body is empty: the SortedMerList and gnSequence objects referenced from
+// sar_table and seq_table are stored as raw pointers and are NOT deleted here.
+MatchFinder::~MatchFinder(){
+}
+
+/**
+ * Copy constructor.  The pointer tables and log streams are copied shallowly
+ * (both objects refer to the same SortedMerList / gnSequence / ostream
+ * instances).  All remaining members, including the subset-match state and
+ * the progress counters, are copied as well so the new object does not start
+ * out with indeterminate values.
+ */
+MatchFinder::MatchFinder(const MatchFinder& mf){
+	mer_size = mf.mer_size;
+	seq_count = mf.seq_count;
+	ambiguity_tolerance = mf.ambiguity_tolerance;
+
+	// subset-match state; alphabet_bits is read by FindSubsets()
+	alpha_map = mf.alpha_map;
+	alpha_map_size = mf.alpha_map_size;
+	alphabet_bits = mf.alphabet_bits;
+
+	m_progress = mf.m_progress;
+	mers_processed = mf.mers_processed;
+	total_mers = mf.total_mers;
+
+	sar_table = mf.sar_table;
+	seq_table = mf.seq_table;
+	log_stream = mf.log_stream;
+	offset_stream = mf.offset_stream;
+}
+
+/**
+ * Returns the object to its freshly constructed state: forgets all sequences,
+ * resets the mer size and counters, and detaches the log streams.  The
+ * objects referenced from the tables are not deleted.
+ */
+void MatchFinder::Clear(){
+	mer_size = DNA_MER_SIZE;
+	seq_count = 0;
+	ambiguity_tolerance = 0;
+	// drop state derived from the sequences that are being forgotten
+	alpha_map.clear();
+	alpha_map_size = 0;
+	alphabet_bits = 0;
+	m_progress = -1;
+	mers_processed = 0;
+	total_mers = 0;
+	sar_table.clear();
+	seq_table.clear();
+	log_stream = NULL;
+	offset_stream = NULL;
+}
+
+/**
+ * Selects the stream that receives progress output during SearchRange().
+ * Passing NULL switches progress logging off.
+ */
+void MatchFinder::LogProgress( ostream* os ){
+	this->log_stream = os;
+}
+
+/**
+ * Registers a sorted mer list, and optionally its sequence, for match searching.
+ *
+ * @param sar  sorted mer list for the new sequence; must not be NULL
+ * @param seq  the corresponding sequence, or NULL if it is not available
+ * @return true once the sequence has been added
+ * @throws NullPointer if sar is NULL
+ */
+boolean MatchFinder::AddSequence( SortedMerList* sar, gnSequence* seq ){
+	if(sar == NULL){
+		Throw_gnExMsg( NullPointer(), "Null SortedMerList pointer" );
+	}
+	// seq is allowed to be NULL (it is the default argument and is tested
+	// below), so there is deliberately no NULL check for it.  The previous
+	// code repeated the sar test here with a "Null gnSequence pointer"
+	// message; that branch could never be reached with a valid sar.
+
+	//check for consistency between sequence length and sorted mer list lengths
+/*	if(seq != NULL && seq->length() != sar->Length()){
+		cerr << "MatchFinder::AddSequence: Error mismatched sml and sequence length.\n";
+		cerr << "Seq length: " << seq->length() << "\tSML length: " << sar->Length() << endl;
+		DebugMsg("MatchFinder::AddSequence: Error mismatched sml and sequence length.");
+		return false;
+	}
+*/
+	//passed checks, add it to the data structures
+	sar_table.push_back(sar);
+	++seq_count;
+	// NOTE: only non-NULL sequences are stored, so seq_table indices line up
+	// with sar_table indices only when every call supplies a sequence.
+	if(seq != NULL){
+		seq_table.push_back(seq);
+	}
+
+	// remember the alphabet width of the most recently added mer list
+	SMLHeader header = sar->GetHeader();
+	alphabet_bits = header.alphabet_bits;
+
+	return true;
+
+}
+
+/**
+ * Locates, in every sorted mer list, the first entry whose masked mer equals
+ * the masked mer found at index startI of list sarI.
+ *
+ * @param sarI         index of the sorted mer list that supplies the mer
+ * @param startI       index into that list of the mer to look up
+ * @param breakpoints  cleared, then filled with one index per sequence
+ *                     (in sequence order).  For lists in which FindMer()
+ *                     reports no match, the position FindMer() produced is
+ *                     stored unchanged.
+ */
+void MatchFinder::GetBreakpoint( uint32 sarI, gnSeqI startI, vector<gnSeqI>& breakpoints ) const{
+	breakpoints.clear();
+
+	//put the mer to break on in break_mer
+	bmer break_mer = (*GetSar(sarI))[startI];
+	uint64 mer_mask = GetSar(sarI)->GetSeedMask();
+	const uint64 masked_break_mer = break_mer.mer & mer_mask;
+
+	//search backwards for the first index of this mer in its own list
+	while( startI > 0 && ((*GetSar(sarI))[startI - 1].mer & mer_mask) == masked_break_mer )
+		--startI;
+
+	//find the mer's location in the other sorted mer lists
+	for(uint32 i=0; i < seq_count; ++i){
+		if(i == sarI){
+			breakpoints.push_back(startI);
+		}else{
+			gnSeqI cur_start = 0;
+			if(GetSar(i)->FindMer(break_mer.mer, cur_start)){
+				//we found a match, see how far backwards we can go.
+				// Both sides of the comparison are masked with a bitwise AND, and
+				// the entry at the moving index is re-read on every step.  The
+				// bounds test comes first so index -1 is never dereferenced.
+				int64 cur_matchI = cur_start;
+				while( cur_matchI >= 0 &&
+					((*GetSar(i))[(gnSeqI)cur_matchI].mer & mer_mask) == masked_break_mer )
+					cur_matchI--;
+				cur_start = cur_matchI+1;
+			}
+			breakpoints.push_back(cur_start);
+		}
+	}
+}
+
+/**
+ * Searches all registered sorted mer lists from their beginning: builds a
+ * start offset of 0 for every list and delegates to the offset-taking overload.
+ */
+void MatchFinder::FindMatchSeeds(){
+	vector<gnSeqI> zero_offsets;
+
+	uint32 remaining = sar_table.size();
+	while( remaining > 0 ){
+		zero_offsets.push_back(0);
+		--remaining;
+	}
+	FindMatchSeeds( zero_offsets );
+}
+
+/**
+ * Searches all registered sorted mer lists for matching seeds, starting at
+ * the given index in each list.
+ *
+ * SearchRange() is called repeatedly; whenever it returns false it has
+ * advanced the start points past a run of repetitive mers.  The new start
+ * points are then written (tab separated, one line per restart) to the
+ * offset log, if one is set, and the search resumes from there.
+ *
+ * @param start_offsets  one starting index per registered sorted mer list
+ * @throws InvalidData if start_offsets does not have one entry per list
+ */
+void MatchFinder::FindMatchSeeds( const vector<gnSeqI>& start_offsets ){
+	// Validate before indexing: the loop below reads start_points[i] for every
+	// registered list, which would run off the end of a short vector.
+	// SearchRange() raises the same exception, but only after that loop.
+	if( start_offsets.size() != sar_table.size() ){
+		Throw_gnExMsg(InvalidData(), "Inconsistent search range specification.");
+	}
+
+	vector<gnSeqI> start_points = start_offsets;
+	vector<gnSeqI> search_len;
+	// keep track of the number of mers processed and the total for progress reporting
+	mers_processed = 0;
+	total_mers = 0;
+	m_progress = -1;
+	for(uint32 i=0; i < sar_table.size(); ++i){
+		// every list is searched to its end, so its full length counts towards the total
+		search_len.push_back(GNSEQI_END);
+		total_mers += sar_table[i]->Length();
+		mers_processed += start_points[ i ];
+	}
+	while( !SearchRange(start_points, search_len) ){
+		// the search stopped early; start_points now holds the resume positions
+		mers_processed = 0;
+		for( uint32 seqI = 0; seqI < sar_table.size(); ++seqI ){
+			if( offset_stream != NULL ){
+				if( seqI > 0 )
+					*offset_stream << '\t';
+				*offset_stream << start_points[ seqI ];
+			}
+			mers_processed += start_points[ seqI ];
+		}
+		if( offset_stream != NULL ){
+			*offset_stream << endl;
+			offset_stream->flush();
+		}
+	}
+}
+
+#define MER_REPEAT_LIMIT 1000 // The maximum number of matching mers before they are completely
+ // ignored.
+
+// Debug switch: when set to true, the next call to SearchRange() prints the
+// first buffered mer of sorted mer lists 0, 1 and 2 to cerr and resets the flag.
+// NOTE(review): that debug code indexes lists 0..2 unconditionally, so it
+// presumably requires at least three non-empty lists -- confirm before enabling.
+boolean print_sp = false;
+//
+// Merges the sorted mer lists, collecting runs of equal (masked) mers and
+// passing every run that contains more than one mer to EnumerateMatches().
+//
+// start_points: index in each sorted mer list at which to begin reading.  When
+//   the function returns false this vector has been updated with the
+//   positions from which the search should be resumed.
+// search_len: at most search_len[i] mers of list i will be checked.
+//
+// Returns true when the search is complete (or there are no sequences), false
+// when a run of more than MER_REPEAT_LIMIT equal mers was found and skipped.
+// Throws InvalidData when the lists were built with different seed patterns or
+// when start_points / search_len do not have one entry per list.
+// Side effect: mer_size is set to the seed weight of the first list.
+//
+boolean MatchFinder::SearchRange(vector<gnSeqI>& start_points, vector<gnSeqI>& search_len){
+ //picked a semi-arbitrary number for buffer size.
+ uint32 MER_BUFFER_SIZE = 10000;
+ vector<uint32> mer_index; // stores the indexes of the current mers in mer_vector
+ vector<uint32> mer_baseindex; // stores the index in the SortedMerList of each of the first mers in mer_vector
+ IdmerList cur_mers; // stores the current mers.
+ IdmerList cur_match; // stores the current matching mers.
+ list<uint32> sar_hitlist; // list of sars to replace
+ uint32 read_size;
+
+ //make sure there is at least one sequence
+ if(sar_table.size() < 1)
+ return true;
+
+ //check for consistency in seed patterns.
+ uint64 mer_mask = sar_table[0]->GetSeedMask();
+ uint64 seed = sar_table[0]->Seed();
+ mer_size = sar_table[0]->SeedWeight();
+ for(uint32 maskI = 0; maskI < sar_table.size(); ++maskI){
+ if(seed != sar_table[maskI]->Seed()){
+ Throw_gnExMsg(InvalidData(), "Different seed patterns.");
+ }
+ }
+
+ //check that start_points and search_len have one entry per sorted mer list.
+ if((start_points.size() != sar_table.size()) || (search_len.size() != sar_table.size())){
+ Throw_gnExMsg(InvalidData(), "Inconsistent search range specification.");
+ }
+
+ //allocate buffer space
+ // stores arrays of bmers for each sml.
+
+ vector< vector< bmer > > mer_vector;
+ for( uint vecI = 0; vecI < sar_table.size(); ++vecI ){
+ vector< bmer > vec;
+ mer_vector.push_back( vec );
+ }
+
+ //initialize the data structures
+ idmer newmer;
+ for(uint32 n = 0; n < sar_table.size(); ++n){
+ // read the first buffer-full (or fewer, if search_len is smaller) from list n
+ read_size = MER_BUFFER_SIZE < search_len[n] ? MER_BUFFER_SIZE : search_len[n];
+ mer_vector[n].reserve(read_size);
+ sar_table[n]->Read(mer_vector[n], read_size, start_points[n]);
+ mer_index.push_back(0);
+ mer_baseindex.push_back(0);
+ if( mer_vector[n].size() > 0 ){
+ newmer.position = mer_vector[n][0].position;
+ newmer.mer = mer_vector[n][0].mer & mer_mask;
+ newmer.id = n;
+ cur_mers.push_back(newmer); //cur_mers gets the first mer from each sorted mer list
+ }
+ }
+
+ // optional one-shot debug output, see print_sp above
+ if( print_sp ){
+ cerr << "First mers are: " << mer_vector[0][0].mer << endl;
+ cerr << "First mers are: " << mer_vector[1][0].mer << endl;
+ cerr << "First mers are: " << mer_vector[2][0].mer << endl;
+ print_sp = false;
+ }
+ // Main merge loop.  cur_mers holds at most one pending mer per sorted mer
+ // list, ordered by mer value; cur_match accumulates the mers equal to the
+ // smallest pending value.
+
+ //loop while there is data to hash.
+ cur_mers.sort(&idmer_lessthan);
+ while(cur_mers.size() > 0){
+ IdmerList::iterator mer_iter = cur_mers.begin();
+ sarID_t cur_id = mer_iter->id;
+ //first check for matches across genomes.
+ if(cur_match.size() > 0){
+ if(mer_iter->mer > cur_match.begin()->mer){
+ //we are done with this matching. hash it.
+ if(cur_match.size() > 1)
+ EnumerateMatches(cur_match);
+ cur_match.clear();
+ }else if(mer_iter->mer < cur_match.begin()->mer){
+ //error checking stuff: the smallest pending mer should never be
+ //less than the mer currently being collected.
+ ErrorMsg("Horrible error occurred!!\n");
+ }
+ }
+
+ if( cur_match.size() > MER_REPEAT_LIMIT ){
+ // scan past the repetitive mers
+ // create the lexicographically next mer
+ uint64 next_mer = cur_match.begin()->mer;
+ next_mer += ~mer_mask + 1;
+// cerr << "Searching to: " << next_mer << endl;
+ gnSeqI next_pos = 0;
+ uint seqI = 0;
+ // find the first list that still has entries at or after next_mer
+ for( ; seqI < sar_table.size(); ++seqI ){
+ if( !sar_table[ seqI ]->FindMer( next_mer, next_pos ))
+ ++next_pos;
+ if( next_pos < sar_table[ seqI ]->SMLLength() )
+ break;
+ }
+ vector< gnSeqI > old_starts = start_points;
+ if( seqI < sar_table.size() )
+ GetBreakpoint( seqI, next_pos, start_points );
+ for( int spI = 0; spI < start_points.size(); ++spI ){
+ // don't allow it to move backwards!
+ start_points[ spI ] = start_points[ spI ] < mer_index[ spI ] + mer_baseindex[ spI ] + old_starts[ spI ] ? old_starts[ spI ] + mer_index[ spI ] + mer_baseindex[ spI ] : start_points[ spI ];
+ // lists before seqI have nothing at or after next_mer: mark them as finished
+ if( spI < seqI )
+ start_points[ spI ] = sar_table[ spI ]->SMLLength();
+ }
+ // tell the caller to restart from the updated start_points
+ return false;
+ }
+ //check for matches within the same genome
+ gnSeqI merI = mer_index[cur_id];
+ boolean buffer_exhausted = merI < mer_vector[cur_id].size() ? false : true;
+ while(!buffer_exhausted && (mer_iter->mer == (mer_vector[cur_id][merI].mer & mer_mask))){
+ newmer.position = mer_vector[cur_id][merI].position;
+ newmer.mer = mer_vector[cur_id][merI].mer & mer_mask;
+ newmer.id = cur_id;
+ cur_match.push_back(newmer);
+ ++merI;
+ ++mer_index[cur_id];
+ //check if we've exhausted our buffer
+ if(merI == mer_vector[cur_id].size())
+ buffer_exhausted = true;
+ }
+
+ if(buffer_exhausted)
+ {
+ //if we've exhausted our buffer then refill it
+ mer_baseindex[cur_id] += mer_vector[cur_id].size();
+
+ // update the mers processed
+ mers_processed += mer_vector[cur_id].size();
+ float64 m_oldprogress = m_progress;
+ m_progress = ((float64)mers_processed / (float64)total_mers) * PROGRESS_GRANULARITY;
+ if( log_stream != NULL ){
+ // print a percentage whenever the integer progress value changes
+ if((int)m_oldprogress != (int)m_progress){
+ (*log_stream) << (int)((m_progress / PROGRESS_GRANULARITY) * 100) << "%..";
+ log_stream->flush();
+ }
+ // start a new output line every ten progress units
+ if(((int)m_oldprogress / 10) != ((int)m_progress / 10))
+ (*log_stream) << std::endl;
+ }
+ // read the next buffer-full, clipped to what is left of search_len
+ uint32 read_size = MER_BUFFER_SIZE;
+ if(MER_BUFFER_SIZE + mer_baseindex[cur_id] > search_len[cur_id])
+ read_size = search_len[cur_id] - mer_baseindex[cur_id];
+
+ sar_table[cur_id]->Read(mer_vector[cur_id], read_size, start_points[cur_id] + mer_baseindex[cur_id]);
+ mer_index[cur_id] = 0;
+ if(mer_vector[cur_id].size() == 0){
+ //remove mer_iter so that this sar is forgotten
+ cur_mers.erase(mer_iter);
+ }
+ }else{
+ //if we haven't exhausted our buffer then we must have
+ //run out of matching mers.
+ //remove mer_iter and put in a new idmer with the same id
+ cur_mers.erase(mer_iter);
+ newmer.position = mer_vector[cur_id][merI].position;
+ newmer.mer = mer_vector[cur_id][merI].mer & mer_mask;
+ newmer.id = cur_id;
+ // insert in front of the first pending mer that is not smaller,
+ // which keeps cur_mers ordered by mer value
+ mer_iter = cur_mers.begin();
+ while(mer_iter != cur_mers.end() && mer_iter->mer < newmer.mer )
+ ++mer_iter;
+ cur_mers.insert(mer_iter, newmer);
+ }
+
+ }
+ //very last match in the dataset wasn't getting hashed.
+ if(cur_match.size() > 1)
+ EnumerateMatches(cur_match);
+
+ return true;
+}
+
+/**
+ * Calls HashMatch() for every combination that takes exactly one mer from
+ * each sequence represented in match_list.
+ *
+ * A list of exactly two mers is handed to HashMatch() directly.  Otherwise
+ * the list is sorted by sequence id (a side effect on the caller's list) and
+ * the combinations are enumerated like an odometer, with the last sequence
+ * as the fastest-changing "digit".
+ *
+ * @param match_list  mers that share the same seed value
+ * @return the result of HashMatch() for a two-element list, otherwise true
+ */
+boolean MatchFinder::EnumerateMatches( IdmerList& match_list ){
+	// nothing to enumerate; also keeps the iterator arithmetic below valid
+	if( match_list.empty() )
+		return true;
+
+	if(match_list.size() == 2){
+		//this is the smallest possible match. simply hash it.
+		return HashMatch(match_list);
+	}
+
+	match_list.sort(&idmer_id_lessthan);
+
+	// group_begin[g] is the first element of the g'th run of equal ids.
+	vector<IdmerList::iterator> group_begin;
+	group_begin.push_back( match_list.begin() );
+	IdmerList::iterator prev = match_list.begin();
+	IdmerList::iterator cur = match_list.begin();
+	for( ++cur; cur != match_list.end(); ++prev, ++cur ){
+		if( prev->id != cur->id )
+			group_begin.push_back( cur );
+	}
+
+	// group_pos[g] is the element of run g used in the current combination.
+	vector<IdmerList::iterator> group_pos = group_begin;
+	// sentinel: with it, group_begin[g+1] is the end of run g for every g
+	group_begin.push_back( match_list.end() );
+
+	while(true){
+		// hash the current combination
+		IdmerList cur_match;
+		for(uint32 k = 0; k < group_pos.size(); ++k){
+			cur_match.push_back(*group_pos[k]);
+		}
+		HashMatch(cur_match);
+
+		//increment the iterators (like an odometer)
+		uint32 m = group_pos.size() - 1;
+		while(true){
+			++group_pos[m];
+			if(group_pos[m] != group_begin[m+1])
+				break;
+			// run m wrapped around; if it was the first run we are done
+			if(m == 0)
+				return true;
+			group_pos[m] = group_begin[m];
+			m--;
+		}
+	}
+}
+/*
+boolean MatchFinder::MatchAmbiguities(MatchHashEntry& mhe, uint32 match_size){
+ if(ambiguity_tolerance == 0)
+ return false;
+ //check that all mers at the new position match
+ //which sequences are used in this match?
+ uint32* cur_seqs = new uint32[mhe.SeqCount()];
+ uint32 used_seqs = 0;
+ for(uint32 seqI = 0; seqI < mhe.SeqCount(); ++seqI){
+ if(mhe[seqI] != NO_MATCH){
+ cur_seqs[used_seqs] = seqI;
+ ++used_seqs;
+ }
+ }
+ string cur_mer, mer_i;
+ gnSequence mer_seq;
+ int64 mer_to_get = mhe[cur_seqs[0]];
+ if(mer_to_get < 0){
+ mer_to_get *= -1;
+ mer_to_get += mhe.Length() - mer_size;
+ }
+ cur_mer = seq_table[cur_seqs[0]]->subseq(mer_to_get, match_size).ToString();
+
+ for(uint32 i=1; i < used_seqs; ++i){
+ mer_to_get = mhe[cur_seqs[i]];
+ if(mer_to_get < 0){
+ //Convert the cur_seqs[i] entry since negative implies reverse complement
+ mer_to_get *= -1;
+ mer_to_get += mhe.Length() - mer_size;
+ }
+ mer_seq = seq_table[cur_seqs[i]]->subseq(mer_to_get, match_size);
+ if(mer_seq.compare(cur_mer) != 0){
+ delete[] cur_seqs;
+ return false;
+ }
+ mer_i = mer_seq.ToString();
+ uint32 ambiguity_count = 0;
+ for(uint32 baseI = 0; baseI < match_size; ++baseI)
+ if(cur_mer[baseI] != mer_i[baseI])
+ ++ambiguity_count;
+ if(ambiguity_count > ambiguity_tolerance){
+ delete[] cur_seqs;
+ return false;
+ }
+ }
+ delete[] cur_seqs;
+ return true;
+}
+*/
+
+} // namespace mems
diff --git a/libMems/MatchFinder.h b/libMems/MatchFinder.h
new file mode 100644
index 0000000..23e15c9
--- /dev/null
+++ b/libMems/MatchFinder.h
@@ -0,0 +1,380 @@
+/*******************************************************************************
+ * $Id: MatchFinder.h,v 1.23 2004/03/01 02:40:08 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifndef _MatchFinder_h_
+#define _MatchFinder_h_
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/SortedMerList.h"
+#include "libMems/Match.h"
+#include "libMems/MatchList.h"
+#include <list>
+#include <iostream>
+#include <boost/pool/pool_alloc.hpp>
+
+namespace mems {
+
+/** One mer taken from a sorted mer list, tagged with the list it came from. */
+struct idmer{
+ gnSeqI position; //starting position of this mer in the genome
+ uint64 mer; //the actual sequence (SearchRange stores it with the seed mask applied)
+ sarID_t id; //the sequence identifier (index of the sorted mer list).
+};
+
+// typedef std::list<idmer, boost::fast_pool_allocator<idmer> > IdmerList;
+// using boost::fast_pool_allocator<idmer> results in a significant speedup
+// over std::allocator. testing on a Salmonella vs. Y. pestis comparison shows
+// a 30% speedup
+// (the pool-allocator variant above is disabled; the standard allocator is in use)
+typedef std::list<idmer> IdmerList;
+
+// Scale of the progress value: m_progress runs from 0 to PROGRESS_GRANULARITY.
+const unsigned int PROGRESS_GRANULARITY = 100;
+
+/**
+ * This abstract class implements a general framework for finding
+ * exactly matching mers. Derived classes supply HashMatch(), which is
+ * pure virtual here. It is extended by the MemHash and MemScorer
+ * classes.
+ * @see MemHash
+ * @see MemScorer
+ */
+class MatchFinder : public genome::gnClone{
+public:
+ MatchFinder();
+ ~MatchFinder();
+ MatchFinder(const MatchFinder& mf);
+ /** Forgets all sequences and resets the search settings to their defaults. */
+ virtual void Clear();
+ /**
+ * Adds a sequence to use when searching for exact matches.
+ * @param sar A pointer to the sorted mer list for the new sequence
+ * @param seq A pointer to the genome::gnSequence corresponding to the new sequence.
+ */
+ virtual boolean AddSequence( SortedMerList* sar, genome::gnSequence* seq = NULL );
+ /**
+ * Given the index of a sequence and an index into the sorted mer list, this function
+ * will search the other sorted mer lists for the same mer. This function returns the
+ * position of the mer in each sequence in the breakpoints vector.
+ */
+ virtual void GetBreakpoint( uint32 sarI, gnSeqI startI, std::vector<gnSeqI>& breakpoints ) const;
+ /** @return the number of sequences that have been added */
+ virtual uint32 Multiplicity(void){return seq_count;};
+ /** NOT IMPLEMENTED: Sets the number of ambiguities allowed in a mer match*/
+ virtual void SetAmbiguityTolerance(uint32 ambiguity_tol){ambiguity_tolerance = ambiguity_tol;}
+ /** @return the number of ambiguities allowed in a mer match */
+ virtual uint32 AmbiguityTolerance(){return ambiguity_tolerance;}
+ /** @return The progress of the current operation. Ranges from 0 to 100. -1 indicates no computation is being performed */
+ virtual float GetProgress() const {return m_progress;}
+
+ /** Finds all the matches between the sequences */
+ virtual void FindMatchSeeds();
+ /** Finds all the matches between the sequences, starting at a particular offset */
+ virtual void FindMatchSeeds( const std::vector<gnSeqI>& start_offsets );
+
+ /**
+ * Logs progress to the designated ostream. Set to null to skip progress logging.
+ */
+ virtual void LogProgress( std::ostream* os );
+ /** Sets the stream that receives the restart offsets written by FindMatchSeeds(); NULL disables it. */
+ void SetOffsetLog( std::ostream* offset_stream ){ this->offset_stream = offset_stream; }
+protected:
+ /**
+ * Searches for mer matches in a designated range of the sequence's sorted mer lists
+ * @throws InvalidData thrown if the start_points are bad or if the sorted mer lists were built with different seed patterns
+ * @return true if completed searching, false if repetitive mers were encountered and SearchRange must be called again.
+ */
+ virtual boolean SearchRange(std::vector<gnSeqI>& start_points, std::vector<gnSeqI>& search_len);
+ /** Called whenever a mer match is found */
+ virtual boolean HashMatch(IdmerList& match_list) = 0;
+ /** Calls HashMatch for each combination of one mer per sequence in match_list */
+ virtual boolean EnumerateMatches(IdmerList& match_list);
+
+ /** Appends to subset_matches the subsets of mhe found by FindSubsets (defined below) */
+ template< class MatchType >
+ void FindSubsets(const MatchType& mhe, std::vector<MatchType>& subset_matches);
+
+ /** Extends the match mhe in both directions, by at most max_backward and max_forward */
+ template< class UngappedMatchType >
+ void ExtendMatch(UngappedMatchType& mhe, std::vector<UngappedMatchType>& subset_matches, gnSeqI max_backward = GNSEQI_END, gnSeqI max_forward = GNSEQI_END);
+
+ /** @return the sorted mer list at index sarI */
+ virtual SortedMerList* GetSar(uint32 sarI) const;
+ std::vector<SortedMerList*> sar_table; /**< sorted mer lists, one per added sequence (not owned) */
+ std::vector<genome::gnSequence*> seq_table; /**< sequences passed to AddSequence; NULL entries are not stored */
+
+ uint32 mer_size;
+ uint32 seq_count; /**< number of calls to AddSequence since construction or Clear() */
+ uint32 ambiguity_tolerance;
+
+ // for subset matches
+ std::vector< std::vector< uint32 > > alpha_map;
+ uint alpha_map_size;
+ uint alphabet_bits; /**< alphabet_bits from the header of the most recently added sorted mer list */
+
+ float m_progress;
+ std::ostream* log_stream;
+
+ uint64 mers_processed; /**< The number of mers processed thus far */
+ uint64 total_mers; /**< The total number of mers to search */
+ std::ostream* offset_stream; /**< log for the current offset in each SML */
+};
+
+/**
+ * InvalidData exceptions are thrown when the input to an algorithm is invalid
+ */
+CREATE_EXCEPTION( InvalidData );
+
+/** @return the sorted mer list stored at index sarI; the index is not range checked */
+inline
+SortedMerList* MatchFinder::GetSar(uint32 sarI) const{
+	SortedMerList* const sml = sar_table[sarI];
+	return sml;
+}
+
+/**
+ * Ordering predicate for idmers: compares by mer value.
+ * Takes its arguments by const reference so it can be used with const
+ * objects and with any standard sorting routine.
+ */
+inline
+bool idmer_lessthan(const idmer& a_v, const idmer& m_v){
+	return a_v.mer < m_v.mer;
+}
+
+/**
+ * Ordering predicate for idmers: compares by sequence identifier.
+ * Takes its arguments by const reference so it can be used with const
+ * objects and with any standard sorting routine.
+ */
+inline
+bool idmer_id_lessthan(const idmer& a_v, const idmer& m_v){
+	return a_v.id < m_v.id;
+}
+
+
+
+// Takes a fully extended match and appends its subset matches on the lower
+// side to subset_matches.
+//
+// Every sequence that takes part in mhe is placed in a bucket of alpha_map
+// chosen from the mer read next to its match start.  Each bucket that ends up
+// with two or more sequences yields one subset match: a copy of mhe in which
+// only the bucket's sequences keep their start, all others are NO_MATCH.
+template< class MatchType >
+void MatchFinder::FindSubsets(const MatchType& mhe, std::vector<MatchType>& subset_matches){
+
+	const SMLHeader sml_header = GetSar( 0 )->GetHeader();
+	const uint high_shift = 64 - sml_header.alphabet_bits;
+	const uint low_shift = sml_header.alphabet_bits * ( GetSar(0)->SeedLength() - 1 );
+
+	// make sure alpha_map has one empty bucket per alphabet symbol
+	alpha_map_size = 1;
+	alpha_map_size <<= alphabet_bits;
+	if( alpha_map.size() == alpha_map_size ){
+		for( uint bucketI = 0; bucketI < alpha_map_size; ++bucketI )
+			alpha_map[ bucketI ].clear();
+	}else{
+		alpha_map.clear();
+		alpha_map.reserve( alpha_map_size );
+		std::vector< uint32 > empty_bucket;
+		empty_bucket.reserve( seq_count );
+		for( uint bucketI = 0; bucketI < alpha_map_size; ++bucketI )
+			alpha_map.push_back( empty_bucket );
+	}
+
+	// sort the participating sequences into buckets
+	for( uint seqI = 0; seqI < seq_count; ++seqI ){
+		int64 mer_to_get = mhe[ seqI ];
+		if( mer_to_get == NO_MATCH )
+			continue;
+		if( mer_to_get < 0 ){
+			// negative start: convert to the position used for the lookup
+			mer_to_get *= -1;
+			mer_to_get += mhe.Length() - GetSar(0)->SeedLength();
+		}
+
+		uint64 cur_mer = GetSar( seqI )->GetMer( mer_to_get - 1 );
+
+		// the low bit of the mer, interpreted according to the sign of the start
+		const boolean low_bit = cur_mer & 0x1;
+		const boolean parity = ( mhe[ seqI ] < 0 ) ? low_bit : !low_bit;
+
+		// reduce the mer to a bucket index of alphabet_bits bits
+		if( !parity ){
+			cur_mer <<= low_shift;
+			cur_mer = ~cur_mer;
+		}
+		cur_mer >>= high_shift;
+
+		alpha_map[ cur_mer ].push_back( seqI );
+	}
+
+	// emit one subset match for every bucket shared by at least two sequences
+	for( uint bucketI = 0; bucketI < alpha_map_size; ++bucketI ){
+		std::vector< uint32 >& members = alpha_map[ bucketI ];
+		if( members.size() >= 2 ){
+			MatchType cur_subset = mhe;
+			cur_subset.SetLength( mhe.Length() );
+			// init everything to NO_MATCH, then restore the members' starts
+			for( uint sqI = 0; sqI < mhe.SeqCount(); ++sqI )
+				cur_subset.SetStart( sqI, NO_MATCH );
+			for( uint subI = 0; subI < members.size(); ++subI )
+				cur_subset.SetStart( members[ subI ], mhe[ members[ subI ] ] );
+			subset_matches.push_back( cur_subset );
+		}
+		members.clear();
+	}
+}
+
+// Extends a seed match as far as possible in both directions.
+//
+// mhe             the match to extend; its length and start coordinates are
+//                 updated in place
+// subset_matches  any matches in this vector are normalised at the end so that
+//                 their first sequence has a positive start, and their offset
+//                 is recalculated (subset discovery itself is disabled below)
+// max_backward    upper bound on the backward extension
+// max_forward     upper bound on the forward extension
+//
+// The loop runs four passes: backward and forward in seed-length jumps, then
+// backward and forward one position at a time to look for overlapping seeds.
+// If an overlapping seed matched, the whole procedure starts over.
+//
+// BUGS:
+// matches which span the end-start of a circular sequence will be hashed a second time
+template< class UngappedMatchType >
+void MatchFinder::ExtendMatch(UngappedMatchType& mhe, std::vector<UngappedMatchType>& subset_matches, gnSeqI max_backward, gnSeqI max_forward){
+	uint64 cur_mer;
+	uint64 mer_mask = GetSar(0)->GetSeedMask();
+
+	//which sequences are used in this match?
+	// held in a vector so the storage is released on every exit path,
+	// including when one of the calls below throws
+	std::vector<uint32> cur_seqs(mhe.SeqCount());
+	uint32 used_seqs = 0;
+	for(uint32 seqI = 0; seqI < mhe.SeqCount(); ++seqI){
+		if(mhe[seqI] != NO_MATCH){
+			cur_seqs[used_seqs] = seqI;
+			++used_seqs;
+		}
+	}
+	//First extend backwards then extend forwards.  The following loop does them both.
+	int jump_size = GetSar(0)->SeedLength();
+	uint extend_limit = 0;	/**< Tracks the distance to the most distant overlapping matching seed */
+	uint extend_attempts = 0;	/**< Counts the total number of overlapping seeds checked */
+	boolean extend_again = false;	/**< Set to true if any overlapping seeds matched, the search will be restarted from that point */
+	for(uint32 directionI = 0; directionI < 4; ++directionI){
+		//how far can we go?
+		//first calculate the maximum amount of traversal
+		//then do fewer comparisons.
+		int64 maxlen = GNSEQI_END;
+		if(directionI == 0)
+			maxlen = max_backward;
+		else if(directionI == 1)
+			maxlen = max_forward;
+		else
+			maxlen = GetSar(0)->SeedLength();
+		for(uint32 maxI = 0; maxI < used_seqs; ++maxI)
+			if(GetSar(cur_seqs[maxI])->IsCircular()){
+				if(GetSar(cur_seqs[maxI])->Length() < maxlen)
+					maxlen = GetSar(cur_seqs[maxI])->Length();
+			}else if(mhe[cur_seqs[maxI]] < 0){
+				int64 rc_len = GetSar(cur_seqs[maxI])->Length() - mhe.Length() + mhe[cur_seqs[maxI]] + 1;
+				if( rc_len < maxlen)
+					maxlen = rc_len;
+			}else if(mhe[cur_seqs[maxI]] - 1 < maxlen)
+				maxlen = mhe[cur_seqs[maxI]] - 1;
+		uint32 j=0;
+		uint32 i = used_seqs;	// set to used_seqs in case maxlen is already less than jump size.
+
+		extend_limit = 0;
+		extend_attempts = 0;
+
+		while(maxlen - jump_size >= 0){
+			// tentatively grow the match by one jump
+			mhe.SetLength(mhe.Length() + jump_size);
+			maxlen -= jump_size;
+			for(j=0; j < used_seqs; ++j){
+				if(mhe[cur_seqs[j]] > 0){
+					mhe.SetStart(cur_seqs[j], mhe[cur_seqs[j]] - jump_size);
+					// wrapped past the start of the sequence
+					if(mhe[cur_seqs[j]] <= 0)
+						mhe.SetStart(cur_seqs[j], mhe[cur_seqs[j]] + GetSar(cur_seqs[j])->Length());
+				}
+			}
+			//check that all mers at the new position match
+			int64 mer_to_get = mhe[cur_seqs[0]];
+			if(mer_to_get < 0){
+				mer_to_get *= -1;
+				mer_to_get += mhe.Length() - GetSar(0)->SeedLength();
+			}
+			cur_mer = GetSar(cur_seqs[0])->GetSeedMer(mer_to_get - 1);
+			boolean parity;
+			if( mhe[cur_seqs[0]] < 0 )
+				parity = cur_mer & 0x1;
+			else
+				parity = !(cur_mer & 0x1);
+			cur_mer &= mer_mask;
+
+			for(i=1; i < used_seqs; ++i){
+				mer_to_get = mhe[cur_seqs[i]];
+				if(mer_to_get < 0){
+					//Convert the cur_seqs[i] entry since negative implies reverse complement
+					mer_to_get *= -1;
+					mer_to_get += mhe.Length() - GetSar(0)->SeedLength();
+				}
+				uint64 comp_mer = GetSar(cur_seqs[i])->GetSeedMer(mer_to_get - 1);
+				boolean comp_parity;
+				if( mhe[cur_seqs[i]] < 0 )
+					comp_parity = comp_mer & 0x1;
+				else
+					comp_parity = !(comp_mer & 0x1);
+				comp_mer &= mer_mask;
+
+				if(cur_mer != comp_mer || parity != comp_parity ){
+					// mismatch: in the coarse passes this ends the extension
+					if( directionI < 2 )
+						maxlen = 0;
+					break;
+				}
+			}
+			extend_attempts += jump_size;
+			if( i == used_seqs )
+				extend_limit = extend_attempts;
+			if( directionI > 1 && extend_attempts == GetSar(0)->SeedLength() )
+				break;
+		}
+		//this stuff cleans up if there was a mismatch
+		if(i < used_seqs){
+			mhe.SetLength(mhe.Length() - jump_size);
+			for(;j > 0; j--){
+				if(mhe[cur_seqs[j - 1]] >= 0)
+					mhe.SetStart(cur_seqs[j - 1], mhe[cur_seqs[j - 1]] + jump_size);
+			}
+		}
+		// check whether any of the overlapping seeds matched.
+		// if so, set the match to that length and set the flag to start the search again
+		if( directionI > 1 && extend_attempts > 0 ){
+			if( extend_limit > 0 )
+				extend_again = true;
+			// minus jump_size because the cleanup above already moved the length back a little
+			int unmatched_diff = extend_attempts - extend_limit;
+			if( i < used_seqs )
+				unmatched_diff -= jump_size;
+			if( (unmatched_diff > mhe.Length()) && unmatched_diff >= 0 )
+				std::cerr << "oh sheethockey mushrooms\n";
+			mhe.SetLength(mhe.Length() - unmatched_diff);
+			for(j=0; j < used_seqs; ++j){
+				if(mhe[cur_seqs[j]] > 0){
+					mhe.SetStart(cur_seqs[j], mhe[cur_seqs[j]] + unmatched_diff);
+					if(mhe[cur_seqs[j]] > GetSar(cur_seqs[j])->Length() )
+						mhe.SetStart(cur_seqs[j], mhe[cur_seqs[j]] - GetSar(cur_seqs[j])->Length() );
+				}
+			}
+		}
+		//Invert the sequence directions so that we extend in the other direction
+		//next time through the loop.  The second time we do this we are setting
+		//sequence directions back to normal.
+		mhe.Invert();
+
+		//if we've already been through twice then decrease the jump size
+		if(directionI >= 1)
+			jump_size = 1;
+		if( directionI == 3 && extend_again ){
+			directionI = -1;	// will become 0 on next iteration
+			jump_size = GetSar(0)->SeedLength();
+			extend_again = false;
+		}
+	}
+	// after the match has been fully extended, search for subset matches
+	// this code only works when using SOLID seeds-- so it's been disabled
+/*	if( used_seqs > 2 ){
+		FindSubsets( mhe, subset_matches );
+		mhe.Invert();
+		FindSubsets( mhe, subset_matches );
+		mhe.Invert();
+	}
+*/
+	// set the subsets so their reference sequence is always positive
+	for(uint32 subsetI = 0; subsetI < subset_matches.size(); ++subsetI){
+		if( subset_matches[subsetI][subset_matches[subsetI].FirstStart()] < 0 )
+			subset_matches[subsetI].Invert();
+		subset_matches[subsetI].CalculateOffset();
+	}
+}
+
+
+
+}
+
+#endif //_MatchFinder_h_
diff --git a/libMems/MatchHashEntry.cpp b/libMems/MatchHashEntry.cpp
new file mode 100644
index 0000000..0a323a1
--- /dev/null
+++ b/libMems/MatchHashEntry.cpp
@@ -0,0 +1,203 @@
+/*******************************************************************************
+ * $Id: MatchHashEntry.cpp,v 1.9 2004/03/01 02:40:08 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * Please see the file called COPYING for licensing, copying, and modification
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/MatchHashEntry.h"
+#include "libGenome/gnException.h"
+#include "libGenome/gnDebug.h"
+
+using namespace std;
+using namespace genome;
+namespace mems {
+
+/** Strict ordering of two matches by ascending generalized offset. */
+boolean MatchHashEntry::offset_lessthan(const MatchHashEntry& a, const MatchHashEntry& b){
+    return b.m_offset > a.m_offset;
+}
+
+/**
+ * Orders two matches by their FirstStart() sequence index and, when those
+ * are equal, by start coordinate in the sequences numbered seq_compare_start
+ * and above that both matches cover.  Negative starts are converted with
+ * -start + Length() - m_mersize before being compared, and sequences where
+ * either match has NO_MATCH are skipped.
+ * @return true if a sorts before b
+ */
+boolean MatchHashEntry::start_lessthan_ptr(const MatchHashEntry* a, const MatchHashEntry* b){
+    const int32 first_diff = a->FirstStart() - b->FirstStart();
+    if(first_diff != 0)
+        return first_diff < 0;
+
+    // only sequences present in both matches can be compared
+    uint32 shared_count = b->SeqCount();
+    if(a->SeqCount() <= shared_count)
+        shared_count = a->SeqCount();
+
+    for(uint32 seqI = seq_compare_start; seqI < shared_count; seqI++){
+        int64 lhs = a->Start(seqI);
+        int64 rhs = b->Start(seqI);
+        if(lhs < 0)
+            lhs = -lhs + a->Length() - a->m_mersize;
+        if(rhs < 0)
+            rhs = -rhs + b->Length() - b->m_mersize;
+        // a sequence missing from either match cannot break the tie
+        if(lhs == NO_MATCH || rhs == NO_MATCH)
+            continue;
+        if(lhs != rhs)
+            return lhs - rhs < 0;
+    }
+    // tied in every compared sequence
+    return false;
+}
+
+/**
+ * Like start_lessthan_ptr(), but compares every shared sequence starting at
+ * index 0 and does not skip sequences whose start is NO_MATCH.
+ * @return true if a sorts before b
+ */
+boolean MatchHashEntry::strict_start_lessthan_ptr(const MatchHashEntry* a, const MatchHashEntry* b){
+    const int first_diff = a->FirstStart() - b->FirstStart();
+    if(first_diff != 0)
+        return first_diff < 0;
+
+    uint shared_count = b->SeqCount();
+    if(a->SeqCount() <= shared_count)
+        shared_count = a->SeqCount();
+
+    for(uint seqI = 0; seqI < shared_count; seqI++){
+        int64 lhs = a->Start(seqI);
+        int64 rhs = b->Start(seqI);
+        if(lhs < 0)
+            lhs = -lhs + a->Length() - a->m_mersize;
+        if(rhs < 0)
+            rhs = -rhs + b->Length() - b->m_mersize;
+        if(lhs != rhs)
+            return lhs - rhs < 0;
+    }
+    // identical starts in every shared sequence
+    return false;
+}
+
+
+/**
+ * Three-way comparison of the start coordinates of two matches over the
+ * sequences both of them cover.  Sequences in which either match has
+ * NO_MATCH are ignored; negative starts are converted with
+ * -start + Length() - m_mersize first.
+ * @return the first non-zero difference (a minus b), or 0 if there is none
+ */
+int64 MatchHashEntry::start_compare(const MatchHashEntry& a, const MatchHashEntry& b){
+    uint shared_count = b.SeqCount();
+    if(a.SeqCount() <= shared_count)
+        shared_count = a.SeqCount();
+
+    for(uint seqI = 0; seqI < shared_count; seqI++){
+        int64 lhs = a.Start(seqI);
+        int64 rhs = b.Start(seqI);
+        if(lhs < 0)
+            lhs = -lhs + a.Length() - a.m_mersize;
+        if(rhs < 0)
+            rhs = -rhs + b.Length() - b.m_mersize;
+        if(lhs == NO_MATCH || rhs == NO_MATCH)
+            continue;
+        if(lhs != rhs)
+            return lhs - rhs;
+    }
+    return 0;
+}
+
+/**
+ * Compares the end of a to the start of b: a copy of a is passed through
+ * CropStart( Length() - 1 ) and the result is handed to start_compare().
+ */
+int64 MatchHashEntry::end_to_start_compare(const MatchHashEntry& a, const MatchHashEntry& b){
+    MatchHashEntry a_tail( a );
+    a_tail.CropStart( a_tail.Length() - 1 );
+    return MatchHashEntry::start_compare( a_tail, b );
+}
+
+
+/**
+ * Creates an empty, non-extended entry with mer size 0.
+ * m_offset is set to 0 so that copying or comparing a freshly constructed
+ * entry never reads an indeterminate value.
+ */
+MatchHashEntry::MatchHashEntry() :
+Match(),
+m_extended( false ),
+m_mersize( 0 ),
+m_offset( 0 )
+{
+}
+
+
+/**
+ * Creates an entry covering seq_count sequences.
+ * @param seq_count The number of sequences handed to the Match base class
+ * @param mersize   The mer size recorded for this entry
+ * @param m_type    extended marks the entry as already extended; any other
+ *                  value leaves it unextended
+ * The generalized offset starts at 0 until CalculateOffset() or SetOffset()
+ * is called.
+ */
+MatchHashEntry::MatchHashEntry(uint32 seq_count, const gnSeqI mersize, MemType m_type) :
+    Match( seq_count ),
+    m_extended( m_type == extended ),
+    m_mersize( mersize ),
+    m_offset( 0 )
+{
+}
+
+
+/** Returns a copy of this entry allocated with operator new. */
+MatchHashEntry* MatchHashEntry::Clone() const{
+    MatchHashEntry* duplicate = new MatchHashEntry(*this);
+    return duplicate;
+}
+
+/**
+ * Copies the Match base state and all MatchHashEntry bookkeeping
+ * (extended flag, mer size and generalized offset) from mhe.
+ * The copy constructor is implemented in terms of this operator, so every
+ * member must be carried over here.
+ */
+MatchHashEntry& MatchHashEntry::operator=(const MatchHashEntry& mhe)
+{
+    Match::operator=( mhe );
+    m_extended = mhe.m_extended;
+    // copy the mer size; resetting it to 0 would make every copy differ from
+    // its source in MerSize(), operator== and the start comparators
+    m_mersize = mhe.m_mersize;
+    m_offset = mhe.m_offset;
+
+    return *this;
+}
+
+/**
+ * Two entries are equal when their sequence count, mer size, extended flag
+ * and generalized offset agree and the Match base class also compares equal.
+ */
+boolean MatchHashEntry::operator==(const MatchHashEntry& mhe) const
+{
+    const bool same_bookkeeping =
+        m_seq_count == mhe.m_seq_count &&
+        m_mersize == mhe.m_mersize &&
+        m_extended == mhe.m_extended &&
+        m_offset == mhe.m_offset;
+    if( !same_bookkeeping )
+        return false;
+    return Match::operator ==(mhe);
+}
+
+/**
+ * Recomputes the generalized offset: the sum, over every sequence after the
+ * FirstStart() sequence that is not NO_MATCH, of its start minus the
+ * reference start.  For negative starts Length( seqI ) is subtracted too.
+ * Does nothing when the match has no sequences.
+ */
+void MatchHashEntry::CalculateOffset()
+{
+    if( SeqCount() == 0 )
+        return;
+
+    m_offset = 0;
+
+    const uint ref_seq = FirstStart();
+    const int64 ref_start = Start(ref_seq);
+
+    for(uint seqI = ref_seq + 1; seqI < SeqCount(); seqI++){
+        if(Start(seqI) == NO_MATCH)
+            continue;
+        int64 relative = Start(seqI) - ref_start;
+        if( Start(seqI) < 0 )
+            relative -= (int64)Length( seqI );
+        m_offset += relative;
+    }
+}
+
+// Checks if mhe is _perfectly_ contained in this match:
+// all offsets in all sequences must be aligned to each other.
+// Returns false when the sequence counts or generalized offsets differ, when
+// this match has NO_MATCH in mhe's FirstStart() sequence, or when mhe does
+// not fit inside this match's length at the observed start difference.
+boolean MatchHashEntry::Contains(const MatchHashEntry& mhe) const{
+ uint i;
+ int64 diff_i;
+ int64 diff;
+ uint seq_count = mhe.SeqCount();
+ //check for a consistent number of genomes and
+ //identical generalized offsets
+ if(SeqCount() != seq_count || m_offset != mhe.m_offset)
+ return false;
+
+ // the reference sequence is mhe's FirstStart(); diff is how far mhe's
+ // start lies from this match's start in that sequence
+ i = mhe.FirstStart();
+ diff = mhe.Start(i) - Start(i);
+ // diff is only meaningful if this match is defined in that sequence
+ if(Start(i) == NO_MATCH)
+ return false;
+
+ //check for containment properties
+ if(diff < 0 || Length() < mhe.Length() + diff)
+ return false;
+
+ //everything is ok so far, check for alignment
+ // start difference accepted for sequences where mhe's start is negative
+ int64 diff_rc = (int64)mhe.Length() - (int64)Length() + diff;
+ for(i++; i < seq_count; i++){
+ //check for consistent alignment between all genomes
+ //in the case of revcomp, diff_i must equal diff_rc
+ diff_i = mhe.Start(i) - Start(i);
+
+ //it's ok if neither matches in a sequence
+ if(mhe.Start(i) == NO_MATCH && Start(i) == NO_MATCH)
+ continue;
+ else if(mhe.Start(i) < 0 && diff_rc == diff_i)
+ continue;
+ else if(diff != diff_i )
+ return false;
+ }
+ //it was contained.
+ return true;
+}
+
+
+} // namespace mems
diff --git a/libMems/MatchHashEntry.h b/libMems/MatchHashEntry.h
new file mode 100644
index 0000000..42dd0df
--- /dev/null
+++ b/libMems/MatchHashEntry.h
@@ -0,0 +1,147 @@
+/*******************************************************************************
+ * $Id: Match.h,v 1.10 2004/03/01 02:40:08 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifndef __MatchHashEntry_h__
+#define __MatchHashEntry_h__
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libGenome/gnClone.h"
+#include <iostream>
+#include <set>
+#include "libMems/Match.h"
+
+namespace mems {
+
+/**
+ * MatchHashEntry extends Match, which stores the location of an
+ * <b>equal size</b> (inexact or exactly) matching region between several
+ * sequences, with three pieces of bookkeeping: whether the match has been
+ * extended, the mer size it was found with, and its generalized offset.
+ * It also provides the comparison functions declared below for ordering
+ * matches and testing containment.
+ */
+
+class MatchHashEntry : public Match
+{
+public:
+ /** Distinguishes a seed match from one that has already been extended. */
+ enum MemType
+ {
+ seed,
+ extended
+ };
+
+public:
+ /** Creates an empty, non-extended entry with mer size 0. */
+ MatchHashEntry();
+ /**
+ * Creates a new MatchHashEntry.
+ * @param seq_count The total number of sequences in the alignment
+ * @param mersize The size of the mers used in the sorted mer lists.
+ * @param m_type The type of mem to create, can either be a seed or already extended.
+ * @see MemType
+ */
+ MatchHashEntry( const uint seq_count, const gnSeqI mersize, const MemType m_type = seed );
+ /** @return a copy of this entry created with operator new */
+ MatchHashEntry* Clone() const;
+ /** @return a copy of this entry obtained from m_allocateAndCopy(); release it with Free() */
+ MatchHashEntry* Copy() const;
+ /** Releases this entry through m_free() */
+ virtual void Free();
+ /** Copy constructor, implemented by assigning from mhe */
+ MatchHashEntry( const MatchHashEntry& mhe ){ *this = mhe; }
+ /** Assignment operator */
+ MatchHashEntry& operator=(const MatchHashEntry& mhe);
+
+ /** comparison operator, compares two matches to see if they are the same */
+ boolean operator==(const MatchHashEntry& mhe) const;
+
+
+ /** @return true if this match has already been extended */
+ boolean Extended() const{return m_extended;}
+ /** Sets this match to be extended if the value passed in "extended" is true */
+ void SetExtended(boolean extended){m_extended = extended;}
+ /** @return the mer size of the sorted mer lists used to find this match */
+ uint MerSize() const{return m_mersize;}
+
+ /**
+ * Calculates the generalized offset and other bookkeeping information
+ * for this mem. This should <b>always</b> be called after changing the start
+ * positions of the mem.
+ */
+ virtual void CalculateOffset();
+
+ /** Returns the generalized offset of this match */
+ int64 Offset() const{return m_offset;};
+
+ /** Sets the generalized offset of this match to "offset" */
+ void SetOffset(int64 offset){m_offset = offset;};
+
+ /** @return true if the generalized offset of a is less than that of b */
+ static boolean offset_lessthan(const MatchHashEntry& a, const MatchHashEntry& b);
+ /** orders by FirstStart(), then by start coordinates, skipping NO_MATCH sequences */
+ static boolean start_lessthan_ptr(const MatchHashEntry* a, const MatchHashEntry* b);
+ /** reference-taking wrapper around start_lessthan_ptr() */
+ static bool start_lessthan(const MatchHashEntry& a, const MatchHashEntry& b);
+ /** like start_lessthan_ptr(), but compares every shared sequence from index 0 without skipping NO_MATCH */
+ static boolean strict_start_lessthan_ptr(const MatchHashEntry* a, const MatchHashEntry* b);
+ /** compare the end of a to the start of b
+ */
+ static int64 end_to_start_compare(const MatchHashEntry& a, const MatchHashEntry& b);
+ /** three-way comparison of start coordinates; returns 0 when no compared sequence differs */
+ static int64 start_compare(const MatchHashEntry& a, const MatchHashEntry& b);
+
+ /**
+ * Will return true if this match contains mhe
+ * Containment implies that a match has a length >= the contained
+ * match, it has coordinates in every genome the contained match has,
+ * the difference in start positions in each genome is the same.
+ * @param mhe The match to check for containment.
+ * @return True if this match contains mhe.
+ */
+ boolean Contains(const MatchHashEntry& mhe) const;
+
+private:
+
+ boolean m_extended; /**< true once the match has been extended */
+ gnSeqI m_mersize; /**< mer size of the sorted mer lists used to find this match */
+ int64 m_offset; /**< generalized offset, see CalculateOffset() */
+};
+
+/**
+ * Returns a copy of this entry obtained from m_allocateAndCopy().
+ * NOTE(review): presumably the copy must be released with Free() rather
+ * than delete -- confirm against the allocator's definition.
+ */
+inline
+MatchHashEntry* MatchHashEntry::Copy() const
+{
+ return m_allocateAndCopy(*this);
+}
+/** Hands this object to m_free(); the object must not be used afterwards. */
+inline
+void MatchHashEntry::Free()
+{
+ m_free(this);
+}
+
+/** Reference-taking convenience wrapper around start_lessthan_ptr(). */
+inline
+bool MatchHashEntry::start_lessthan(const MatchHashEntry& a, const MatchHashEntry& b){
+ return start_lessthan_ptr(&a, &b);
+}
+
+/**
+ * Comparison functor over MatchHashEntry pointers.
+ * Entries with a larger FirstStart() index sort first.  Among entries with
+ * the same FirstStart(), an entry that lacks a sequence the other one has
+ * sorts first; entries where one Contains() the other compare as equivalent;
+ * everything else falls back to strict_start_lessthan_ptr().
+ */
+class MheCompare {
+public:
+    bool operator()(const MatchHashEntry* a, const MatchHashEntry* b) const{
+        if( a->FirstStart() > b->FirstStart() )
+            return true;
+        if( !(a->FirstStart() == b->FirstStart()) )
+            return false;
+
+        // check that the matches hit the same genomes
+        for( size_t i = a->FirstStart(); i < a->SeqCount(); i++ )
+        {
+            const bool a_missing = a->LeftEnd(i) == NO_MATCH;
+            const bool b_missing = b->LeftEnd(i) == NO_MATCH;
+            if( a_missing && !b_missing )
+                return true;
+            if( !a_missing && b_missing )
+                return false;
+        }
+
+        // same genomes: matches that contain one another are equivalent
+        if( a->Contains(*b) || b->Contains(*a) )
+            return false;
+        return MatchHashEntry::strict_start_lessthan_ptr(a, b);
+    }
+};
+
+}
+
+#endif // __MatchHashEntry_h__
diff --git a/libMems/MatchList.cpp b/libMems/MatchList.cpp
new file mode 100644
index 0000000..25648e6
--- /dev/null
+++ b/libMems/MatchList.cpp
@@ -0,0 +1,26 @@
+/*******************************************************************************
+ * $Id: MatchList.cpp,v 1.22 2004/03/01 02:40:08 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * Please see the file called COPYING for licensing, copying, and modification
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/MatchList.h"
+#include "libMems/DNAFileSML.h"
+#include "libMems/DNAMemorySML.h"
+#include "libMems/MemHash.h"
+#include <map>
+#include <sstream>
+#include <ctime>
+
+using namespace std;
+using namespace genome;
+namespace mems {
+
+
+} // namespace mems
diff --git a/libMems/MatchList.h b/libMems/MatchList.h
new file mode 100644
index 0000000..fc25a01
--- /dev/null
+++ b/libMems/MatchList.h
@@ -0,0 +1,668 @@
+/*******************************************************************************
+ * $Id: MatchList.h,v 1.10 2004/03/01 02:40:08 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifndef _MatchList_h_
+#define _MatchList_h_
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <iostream>
+#include <list>
+#include "libMems/SortedMerList.h"
+#include "libMems/DNAFileSML.h"
+#include "libMems/DNAMemorySML.h"
+#include "libGenome/gnSequence.h"
+#include "libMems/Match.h"
+#include "libMems/gnRAWSequence.h"
+#include "libGenome/gnRAWSource.h"
+#include "libMems/Files.h"
+#include <sstream>
+#include <map>
+#include <ctime>
+
+namespace mems {
+
+/**
+ * A vector of match pointers together with the sequences, sorted mer lists
+ * and file names they refer to.  Copying a GenericMatchList copies the
+ * pointers only; the pointed-to matches, sequences and SMLs are shared.
+ */
+template< typename MatchPtrType >
+class GenericMatchList : public std::vector< MatchPtrType >
+{
+public:
+ GenericMatchList(){};
+ GenericMatchList( const GenericMatchList& ml );
+ GenericMatchList& operator=( const GenericMatchList& ml );
+
+
+ /**
+ * Attempts to load SMLs designated by the
+ * elements of the sml_filename vector. This
+ * method will create the sorted mer lists if they do not exist.
+ * The DNAFileSML objects are created on the heap
+ * and are not deallocated when this class is destroyed. They should
+ * be manually destroyed when no longer in use.
+ * @param mer_size The seed weight to use; 0 selects GetDefaultMerSize( seq_table )
+ * @param log_stream An output stream to log messages to. If NULL no logging is done
+ * @param seed_rank The rank of the seed to use, 0-2 are ranked spaced seeds,
+ * other options include CODING_SEED and SOLID_SEED
+ * @param solid Requests the seed returned by getSolidSeed( mer_size )
+ * @param force_recreate If true, existing SMLs are rebuilt even when they load successfully
+ */
+ void LoadSMLs( uint mer_size, std::ostream* log_stream, int seed_rank = 0, bool solid = false, bool force_recreate = false );
+
+ /**
+ * Constructs an in-memory SML (DNAMemorySML) for each sequence already
+ * present in seq_table and appends it to sml_table.
+ * The SortedMerList objects are created on the heap
+ * and are not deallocated when this class is destroyed. They should
+ * be manually destroyed when no longer in use.
+ *
+ * @param mer_size The seed size to use when constructing the sorted mer lists;
+ * 0 selects GetDefaultMerSize( seq_table )
+ * @param log_stream An output stream to log messages to. If NULL no logging is done
+ * @param seed_rank The rank of the seed to use, passed to getSeed()
+ */
+ void CreateMemorySMLs( uint mer_size, std::ostream* log_stream, int seed_rank = 0 );
+
+ /**
+ * Calculates a default search mer size for the given set of sequences
+ * from their average length
+ * @param seq_table The vector of sequences to calculate a default mer size for
+ */
+ static uint GetDefaultMerSize( const std::vector< genome::gnSequence* >& seq_table );
+
+ /**
+ * Deletes the genome::gnSequence, SortedMerList, and Match objects associated
+ * with this GenericMatchList, and empties the file name vectors.
+ */
+ void Clear();
+
+ /**
+ * Keeps only the matches whose multiplicity is exactly "mult"; every other
+ * match is released with Free() and removed.
+ * NOTE(review): an earlier version of this comment said matches with a
+ * multiplicity <b>lower</b> than the threshold are removed, but the
+ * implementation tests for equality -- confirm which is intended.
+ * @param mult The multiplicity to keep
+ */
+ void MultiplicityFilter( unsigned mult );
+
+ /**
+ * Removes all matches that are shorter than the specified length
+ * @param length The minimum length
+ */
+ void LengthFilter( gnSeqI length );
+
+ /**
+ * Removes matches that do not match in exactly the sequences specified in filter_spec
+ * @param filter_spec The specification of the exact filter, true designates that the
+ * match must exist in that sequence. filter_spec must contain
+ * one boolean entry for every sequence.
+ */
+// void ExactFilter( valarray< bool >& filter_spec );
+ /**
+ * Removes matches that do not intersect with the sequences specified in filter_spec
+ * @param filter_spec The specification of the intersection filter, true designates
+ * match must exist in that sequence. filter_spec must contain
+ * one boolean entry for every sequence.
+ */
+// void IntersectFilter( valarray< bool >& filter_spec );
+
+
+ std::vector<std::string> sml_filename; /**< The file names of the sorted mer list for each sequence, may be empty or null */
+ std::vector<std::string> seq_filename; /**< The file names of the sequence data, may be empty or null */
+ std::vector<SortedMerList*> sml_table; /**< The sorted mer list associated with each sequence, may be empty or null */
+ std::vector<genome::gnSequence*> seq_table; /**< The actual sequences associated with the matches stored in this list. Should not be empty or null. */
+
+protected:
+
+};
+
+typedef GenericMatchList< Match* > MatchList;
+
+CREATE_EXCEPTION( InvalidArgument );
+
+/**
+ * Thrown when a file being read is invalid
+ */
+CREATE_EXCEPTION(InvalidFileFormat)
+
+
+/**
+ * Reads a GenericMatchList from an input stream
+ * Sequence and SML file names are read into the seq_filename
+ * and sml_filename vectors, but the actual files are not
+ * opened. The calling function should load them after
+ * using this method.
+ * @param match_stream The input stream to read from
+ */
+void ReadList( MatchList& mlist, std::istream& match_stream );
+
+/**
+ * Writes a GenericMatchList to the designated output stream
+ * @param match_stream The output stream to write to
+ */
+void WriteList( const MatchList& mlist, std::ostream& match_stream );
+
+typedef void* MatchID_t;
+
+/** Copy constructor; delegates to operator= so both copy paths stay in sync. */
+template< typename MatchPtrType >
+GenericMatchList< MatchPtrType >::GenericMatchList( const GenericMatchList< MatchPtrType >& ml ){
+    this->operator=( ml );
+}
+
+/**
+ * Copies the match pointers, file names and table pointers of ml.
+ * Only the pointers are copied; the objects they refer to are shared.
+ */
+template< typename MatchPtrType >
+GenericMatchList< MatchPtrType >& GenericMatchList< MatchPtrType >::operator=( const GenericMatchList< MatchPtrType >& ml ){
+    // the match pointers themselves
+    std::vector< MatchPtrType >::operator=( ml );
+
+    // sequence data
+    seq_filename = ml.seq_filename;
+    seq_table = ml.seq_table;
+
+    // sorted mer list data
+    sml_filename = ml.sml_filename;
+    sml_table = ml.sml_table;
+
+    return *this;
+}
+
+/**
+ * Attempts to load the sequences designated by the
+ * elements of the seq_filename vector.
+ * The genome::gnSequence objects are created on the heap
+ * and are not deallocated when this class is destroyed. They should
+ * be manually destroyed when no longer in use.
+ *
+ * Loading stops at the first file that fails: an error is printed to
+ * std::cerr and the function returns, leaving the sequences loaded so far in
+ * mlist.seq_table.
+ *
+ * @param mlist The match list whose seq_filename entries are loaded into seq_table
+ * @param log_stream An output stream to log messages to. If NULL no logging is done
+ */
+template< typename MatchListType >
+void LoadSequences( MatchListType& mlist, std::ostream* log_stream ){
+
+ if( mlist.seq_filename.size() == 0 )
+ return;
+
+ for( uint seqI = 0; seqI < mlist.seq_filename.size(); seqI++ ){
+ genome::gnSequence* file_sequence = new genome::gnSequence();
+ // Load the sequence and tell the user if it loaded successfully
+ try{
+ file_sequence->LoadSource( mlist.seq_filename[ seqI ] );
+ }catch( genome::gnException& gne ){
+ // the sequence was never stored anywhere, so release it before bailing out
+ delete file_sequence;
+ if( gne.GetCode() == genome::FileNotOpened() )
+ std::cerr << "Error loading " << mlist.seq_filename[ seqI ] << std::endl;
+ else
+ std::cerr << gne;
+ return;
+ }catch( std::exception& e ){
+ delete file_sequence;
+ std::cerr << "Unhandled exception loading " << mlist.seq_filename[ seqI ] << std::endl;
+ std::cerr << "At: " << __FILE__ << ":" << __LINE__ << std::endl;
+ std::cerr << e.what();
+ return;
+ }catch( ... ){
+ delete file_sequence;
+ std::cerr << "Unknown exception when loading " << mlist.seq_filename[ seqI ] << std::endl;
+ return;
+ }
+
+ // ownership of the sequence passes to the match list
+ mlist.seq_table.push_back( file_sequence );
+ if( log_stream != NULL ){
+ (*log_stream) << "Sequence loaded successfully.\n";
+ (*log_stream) << mlist.seq_filename[ seqI ] << " " << file_sequence->length() << " base pairs.\n";
+ }
+ }
+
+}
+
+/**
+ * Loads every file named in mlist.seq_filename, writes each one out as a
+ * temporary RAW sequence file and stores a genome::gnRAWSequence backed by
+ * that file in mlist.seq_table.  The temporary files are registered with
+ * registerFileToDelete().
+ * The genome::gnRAWSequence objects are created on the heap and are not
+ * deallocated when the match list is destroyed; destroy them manually when
+ * they are no longer in use.
+ * Loading stops at the first file that fails, after reporting the error on
+ * std::cerr.
+ * @param mlist      The match list whose seq_filename entries are loaded
+ * @param log_stream An output stream to log messages to. If NULL no logging is done
+ */
+template< typename MatchListType >
+void LoadAndCreateRawSequences( MatchListType& mlist, std::ostream* log_stream ){
+
+    for( uint fileI = 0; fileI < mlist.seq_filename.size(); fileI++ ){
+        // parse the source file into a temporary in-memory sequence
+        genome::gnSequence* parsed_seq = new genome::gnSequence();
+        try{
+            parsed_seq->LoadSource( mlist.seq_filename[ fileI ] );
+        }catch( genome::gnException& gne ){
+            delete parsed_seq;
+            if( gne.GetCode() == genome::FileNotOpened() )
+                std::cerr << "Error loading " << mlist.seq_filename[ fileI ] << std::endl;
+            else
+                std::cerr << gne;
+            return;
+        }catch( std::exception& e ){
+            delete parsed_seq;
+            std::cerr << "Unhandled exception loading " << mlist.seq_filename[ fileI ] << std::endl;
+            std::cerr << "At: " << __FILE__ << ":" << __LINE__ << std::endl;
+            std::cerr << e.what();
+            return;
+        }catch( ... ){
+            delete parsed_seq;
+            std::cerr << "Unknown exception when loading " << mlist.seq_filename[ fileI ] << std::endl;
+            return;
+        }
+
+        // dump it to a temporary raw file and drop the parsed copy
+        std::string raw_path = CreateTempFileName("rawseq");
+        genome::gnRAWSource::Write( *parsed_seq, raw_path );
+        delete parsed_seq;
+        registerFileToDelete( raw_path );
+
+        if( log_stream != NULL )
+            (*log_stream) << "Storing raw sequence at " << raw_path << std::endl;
+
+        // the raw-file-backed sequence is what the match list keeps
+        genome::gnRAWSequence* stored_seq = new genome::gnRAWSequence( raw_path );
+        mlist.seq_table.push_back( stored_seq );
+        if( log_stream != NULL ){
+            (*log_stream) << "Sequence loaded successfully.\n";
+            (*log_stream) << mlist.seq_filename[ fileI ] << " " << stored_seq->length() << " base pairs.\n";
+        }
+    }
+}
+
+
+/**
+ * Loads the sorted mer list (SML) file named in sml_filename for every
+ * sequence in seq_table.  An SML is (re)created when its file cannot be
+ * loaded, when it was built with a different seed pattern, or when
+ * force_create is set.  If anything has to be created, all SMLs loaded so
+ * far are released first to free memory and reloaded afterwards.
+ *
+ * @param mer_size     seed weight; 0 selects GetDefaultMerSize( seq_table )
+ * @param log_stream   stream for progress messages, or NULL for no logging
+ * @param seed_rank    rank passed to getSeed()
+ * @param solid        if true, the seed from getSolidSeed( mer_size ) is used
+ * @param force_create if true, SMLs are rebuilt even when they load successfully
+ */
+template< typename MatchPtrType >
+void GenericMatchList< MatchPtrType >::LoadSMLs( uint mer_size, std::ostream* log_stream, int seed_rank, bool solid, bool force_create ){
+
+    // if the mer_size parameter is 0 then calculate a default mer size for these sequences
+    if( mer_size == 0 ){
+        mer_size = GetDefaultMerSize( seq_table );
+        if( log_stream != NULL ){
+            (*log_stream) << "Using weight " << mer_size << " mers for initial seeds\n";
+        }
+    }
+
+    // choose the seed pattern.  The solid seed must be assigned to the
+    // existing variable: declaring a second default_seed inside the if would
+    // go out of scope immediately and leave the spaced seed in use.
+    uint64 default_seed = getSeed( mer_size, seed_rank );
+    if( solid )
+        default_seed = getSolidSeed( mer_size );
+
+    // load SMLs, remembering which ones need to be (re)created
+    std::vector< uint > create_list;
+    uint seqI = 0;
+    for( seqI = 0; seqI < seq_table.size(); seqI++ ){
+        // define a DNAFileSML to store a sorted mer list
+        DNAFileSML* file_sml = new DNAFileSML();
+        sml_table.push_back( file_sml );
+
+        boolean success = true;
+        try{
+            file_sml->LoadFile( sml_filename[ seqI ] );
+        }catch( genome::gnException& gne ){
+            success = false;
+            create_list.push_back( seqI );
+        }
+        boolean recreate = false;
+        if(success && force_create){
+            if( log_stream != NULL )
+                (*log_stream) << "SML exists, but forcefully recreating.  A new sorted mer list will be created.\n";
+            recreate = true;
+            create_list.push_back( seqI );
+        }
+        else if(success && (file_sml->Seed() != default_seed )){
+            if( log_stream != NULL )
+                (*log_stream) << "Default seed mismatch.  A new sorted mer list will be created.\n";
+            recreate = true;
+            create_list.push_back( seqI );
+        }
+
+        if( success && !recreate && log_stream != NULL && !force_create )
+            (*log_stream) << "Sorted mer list loaded successfully\n";
+    }
+
+    // free up memory before creating any SMLs
+    if( create_list.size() > 0 )
+        for( seqI = 0; seqI < sml_table.size(); seqI++ ){
+            sml_table[ seqI ]->Clear();
+            delete sml_table[ seqI ];
+            sml_table[ seqI ] = NULL;
+        }
+
+    // create any SMLs that need to be created
+    for( uint createI = 0; createI < create_list.size(); createI++ ){
+        if( log_stream != NULL )
+            (*log_stream) << "Creating sorted mer list\n";
+        try{
+
+            time_t start_time = time(NULL);
+            sml_table[ create_list[ createI ] ] = new DNAFileSML( sml_filename[ create_list[ createI ] ] );
+            sml_table[ create_list[ createI ] ]->Create( *seq_table[ create_list[ createI ] ], default_seed );
+            time_t end_time = time(NULL);
+            if( log_stream != NULL )
+                (*log_stream) << "Create time was: " << end_time - start_time << " seconds.\n";
+
+        }catch(...){
+            std::cerr << "Error creating sorted mer list\n";
+            throw;
+        }
+    }
+
+    // reload the other SMLs now that creation has completed.  Iterate over
+    // the same range the load loop above used (one SML per seq_table entry)
+    // so no freed slot is left NULL when seq_filename has a different size.
+    if( create_list.size() > 0 ){
+        for( seqI = 0; seqI < seq_table.size(); seqI++ ){
+            if( sml_table[ seqI ] != NULL )
+                continue;
+            sml_table[ seqI ] = new DNAFileSML( sml_filename[ seqI ] );
+            try{
+                ((DNAFileSML*)sml_table[ seqI ])->LoadFile( sml_filename[ seqI ] );
+            }catch( genome::gnException& gne ){
+                std::cerr << "Error loading sorted mer list\n";
+                throw;
+            }
+        }
+    }
+}
+
+/**
+ * Calculates a default seed weight from the average length of the given
+ * sequences.
+ * @param seq_table The sequences to calculate a default mer size for
+ * @return getDefaultSeedWeight() of the average sequence length; an empty
+ *         table is treated as having average length 0
+ */
+template< typename MatchPtrType >
+uint GenericMatchList< MatchPtrType >::GetDefaultMerSize( const std::vector< genome::gnSequence* >& seq_table ){
+    gnSeqI total_len = 0;
+    for( uint seqI = 0; seqI < seq_table.size(); seqI++ )
+        total_len += seq_table[ seqI ]->length();
+
+    // an empty table has no average; total_len is 0 then, so pass it through
+    // unchanged instead of dividing by zero
+    const size_t seq_count = seq_table.size();
+    return getDefaultSeedWeight( seq_count == 0 ? total_len : total_len / seq_count );
+}
+
+
+/**
+ * Loads sequences to align from a Multi-FastA file.
+ * Each contig of the file becomes one genome::gnSequence appended to
+ * mlist.seq_table, and mlist.seq_filename is replaced by one copy of
+ * mfa_filename per contig.
+ * The genome::gnSequence objects are created on the heap
+ * and are not deallocated when the match list is destroyed. They should
+ * be manually destroyed when no longer in use.
+ * If the file cannot be loaded an error is printed to std::cerr and mlist
+ * is left unmodified.
+ *
+ * @param mlist The match list that receives the sequences
+ * @param mfa_filename The name of the Multi-FastA file to read in. Each
+ * sequence entry will be treated as a separate sequence to
+ * be aligned.
+ * @param log_stream An output stream to log messages to. If NULL no logging is done
+ */
+template< typename MatchListType >
+void LoadMFASequences( MatchListType& mlist, const std::string& mfa_filename, std::ostream* log_stream ) {
+    genome::gnSequence file_sequence;
+    // Load the sequence and tell the user if it loaded successfully
+    try{
+        file_sequence.LoadSource( mfa_filename );
+    }catch( genome::gnException& gne ){
+        if( gne.GetCode() == genome::FileNotOpened() )
+            std::cerr << "Error loading " << mfa_filename << std::endl;
+        else
+            std::cerr << gne;
+        return;
+    }catch( std::exception& e ){
+        std::cerr << "Unhandled exception loading " << mfa_filename << std::endl;
+        std::cerr << "At: " << __FILE__ << ":" << __LINE__ << std::endl;
+        std::cerr << e.what();
+        return;
+    }catch( ... ){
+        std::cerr << "Unknown exception when loading " << mfa_filename << std::endl;
+        return;
+    }
+
+    // every contig is recorded under the name of the file it came from
+    mlist.seq_filename.clear();
+    for( uint contigI = 0; contigI < file_sequence.contigListSize(); contigI++ ){
+        genome::gnSequence* contig_seq = new genome::gnSequence( file_sequence.contig( contigI ) );
+        mlist.seq_filename.push_back( mfa_filename );
+//      mlist.seq_filename.push_back( file_sequence.contigName( contigI ) );
+        if( log_stream != NULL ){
+            (*log_stream) << "Sequence loaded successfully.\n";
+            (*log_stream) << mlist.seq_filename[ contigI ] << " " << contig_seq->length() << " base pairs.\n";
+        }
+        mlist.seq_table.push_back( contig_seq );
+    }
+}
+
+/**
+ * Builds an in-memory sorted mer list (DNAMemorySML) for every sequence in
+ * seq_table and appends it to sml_table.
+ * @param mer_size   seed weight; 0 selects GetDefaultMerSize( seq_table )
+ * @param log_stream stream for progress messages, or NULL for no logging
+ * @param seed_rank  rank passed to getSeed()
+ * If creating an SML throws, that SML is deleted and the exception is
+ * rethrown; SMLs created for earlier sequences stay in sml_table.
+ */
+template< typename MatchPtrType >
+void GenericMatchList< MatchPtrType >::CreateMemorySMLs( uint mer_size, std::ostream* log_stream, int seed_rank )
+{
+    // if the mer_size parameter is 0 then calculate a default mer size for these sequences
+    if( mer_size == 0 ){
+        mer_size = GetDefaultMerSize( seq_table );
+        if( log_stream != NULL ){
+            (*log_stream) << "Using " << mer_size << "-mers for initial seeds\n";
+        }
+    }
+
+    uint64 default_seed = getSeed( mer_size, seed_rank );
+
+    // define a DNAMemorySML to store a sorted mer list
+    for( uint contigI = 0; contigI < seq_table.size(); contigI++ )
+    {
+        DNAMemorySML* contig_sml = new DNAMemorySML();
+        if( log_stream != NULL )
+            (*log_stream) << "Creating sorted mer list\n";
+        time_t start_time = time(NULL);
+        try{
+            contig_sml->Create( *seq_table[contigI], default_seed );
+        }catch(...){
+            // not yet owned by sml_table, so nobody else can release it
+            delete contig_sml;
+            throw;
+        }
+        time_t end_time = time(NULL);
+        if( log_stream != NULL )
+            (*log_stream) << "Create time was: " << end_time - start_time << " seconds.\n";
+
+        sml_table.push_back( contig_sml );
+    }
+}
+
+/**
+ * Deletes every sequence and sorted mer list, releases every match with
+ * Free(), and empties all vectors including the file name lists.
+ */
+template< typename MatchPtrType >
+void GenericMatchList< MatchPtrType >::Clear() {
+    // deleting a NULL pointer is a no-op, so no explicit NULL test is needed
+    for( size_t seqI = 0; seqI < seq_table.size(); ++seqI )
+        delete seq_table[ seqI ];
+    for( size_t smlI = 0; smlI < sml_table.size(); ++smlI )
+        delete sml_table[ smlI ];
+
+    for( size_t matchI = 0; matchI < this->size(); ++matchI ){
+        (*this)[ matchI ]->Free();
+        (*this)[ matchI ] = NULL;
+    }
+
+    seq_table.clear();
+    sml_table.clear();
+    this->clear();
+    seq_filename.clear();
+    sml_filename.clear();
+}
+
+/**
+ * Use this to update linkage pointers after copying an entire set of Matches.
+ * Every pointer in the Subsets() and Supersets() of each match in match_list
+ * is replaced by the value old_to_new_map associates with it.
+ * A pointer with no entry in the map is left out of the remapped set
+ * instead of dereferencing the map's end() iterator.
+ * @param old_to_new_map Maps addresses of the original matches to their copies
+ * @param match_list The copied matches whose links are rewritten in place
+ */
+template< class FromType, class ToType, class MatchListType >
+void RemapSubsetMatchAddresses( std::map<FromType, ToType>& old_to_new_map, MatchListType& match_list );
+
+
+template< class FromType, class ToType, class MatchListType >
+void RemapSubsetMatchAddresses( std::map<FromType, ToType>& old_to_new_map, MatchListType& match_list )
+{
+    // now remap the subset and superset links
+    typename MatchListType::iterator match_iter = match_list.begin();
+    typename std::map<FromType, ToType>::iterator map_iter;
+    for(; match_iter != match_list.end(); ++match_iter ){
+        // remap all subsets
+        std::set< Match* >& subsets = (*match_iter)->Subsets();
+        std::set< Match* > new_subsets;
+        std::set< Match* >::iterator sub_iter = subsets.begin();
+        for(; sub_iter != subsets.end(); ++sub_iter ){
+            map_iter = old_to_new_map.find( (FromType)*sub_iter );
+            if( map_iter == old_to_new_map.end() )
+                continue;   // no copy of this match exists, drop the link
+            new_subsets.insert( map_iter->second );
+        }
+        subsets = new_subsets;
+
+        // remap all supersets
+        std::set< Match* >& supersets = (*match_iter)->Supersets();
+        std::set< Match* > new_supersets;
+        std::set< Match* >::iterator super_iter = supersets.begin();
+        for(; super_iter != supersets.end(); ++super_iter ){
+            map_iter = old_to_new_map.find( (FromType)*super_iter );
+            if( map_iter == old_to_new_map.end() )
+                continue;   // no copy of this match exists, drop the link
+            new_supersets.insert( map_iter->second );
+        }
+        supersets = new_supersets;
+    }
+}
+
+/**
+ * Reads a match list in "FormatVersion 3" text format, as produced by
+ * WriteList().
+ * Sequence file names are appended to mlist.seq_filename but the files are
+ * not opened; one Match is appended to mlist per match line.
+ * Throws InvalidFileFormat (via Throw_gnEx) when the header tags are wrong,
+ * when fewer than two sequences are declared, when a sequence name field is
+ * empty, or when the number of matches read differs from the declared
+ * MatchCount.  Throws a C string when a match line declares subset or
+ * superset links, which this reader does not support.
+ * @param mlist      The match list to fill
+ * @param match_file The input stream to read from
+ */
+inline
+void ReadList(MatchList& mlist, std::istream& match_file)
+{
+    std::string tag;
+    gnSeqI len;
+    int64 start;
+    unsigned int seq_count;
+
+    match_file >> tag;  //format version tag
+    if( tag != "FormatVersion" ){
+        Throw_gnEx(InvalidFileFormat());
+    }
+    match_file >> tag;  //format version
+    if( tag != "3" ){
+        Throw_gnEx(InvalidFileFormat());
+    }
+    match_file >> tag;  //sequence count tag
+    if( tag != "SequenceCount" ){
+        Throw_gnEx(InvalidFileFormat());
+    }
+    match_file >> seq_count;    //sequence count
+    if(seq_count < 2){
+        Throw_gnEx(InvalidFileFormat());
+    }
+
+    // read the sequence file names and lengths
+    for( unsigned int seqI = 0; seqI < seq_count; seqI++ ){
+        match_file >> tag;  // name tag
+        std::getline( match_file, tag );
+        // the rest of the line is "<tab><file name>"; an empty remainder
+        // means the name field is missing, and substr( 1 ) would throw
+        // std::out_of_range on it
+        if( tag.empty() ){
+            Throw_gnEx(InvalidFileFormat());
+        }
+        // skip the tab character
+        tag = tag.substr( 1 );
+        mlist.seq_filename.push_back(tag);
+        match_file >> tag;  // length tag
+        gnSeqI seq_len;
+        match_file >> seq_len;  // length
+        if( seqI < mlist.seq_table.size() )
+            if( mlist.seq_table[ seqI ]->length() != seq_len ){
+                std::cerr << "Warning: Genome sizes in the match list differ.\n";
+                std::cerr << "seq_table[ " << seqI << " ]->length() " << mlist.seq_table[ seqI ]->length() << " seq_len: " << seq_len << std::endl;
+            }
+    }
+
+    // read the number of matches
+    unsigned int match_count;
+    match_file >> tag;  // match count tag
+    match_file >> match_count;  // match count
+
+    // read the matches
+    std::string cur_line;
+    std::getline( match_file, cur_line );   // consume the rest of the MatchCount line
+    while( getline( match_file, cur_line ) ){
+        Match mhe( seq_count );
+        std::stringstream line_stream( cur_line );
+
+        line_stream >> len;
+        mhe.SetLength(len);
+
+        for(uint32 seqI = 0; seqI < seq_count; seqI++){
+            line_stream >> start;
+            mhe.SetStart(seqI, start);
+        }
+
+        // the match id written by WriteList() is parsed only to advance the stream
+        MatchID_t match_id;
+        line_stream >> match_id;
+
+        // initialised so that a truncated line never leaves the counts
+        // indeterminate after a failed extraction
+        uint sub_count = 0;
+        line_stream >> sub_count;
+        if(sub_count > 0)
+            throw "Unable to read file, invalid format, cannot read subset data\n";
+
+        // test the superset count that was just read, not the subset count
+        uint sup_count = 0;
+        line_stream >> sup_count;
+        if(sup_count > 0)
+            throw "Unable to read file, invalid format, cannot read superset data\n";
+
+        Match* new_match = mhe.Copy();
+        mlist.push_back( new_match );
+    }
+    if( match_count != mlist.size() ){
+        Throw_gnEx(InvalidFileFormat());
+    }
+}
+
+/**
+ * Writes a match list in "FormatVersion 3" text format: a header giving the
+ * sequence count, one file name and one length line per sequence, the match
+ * count, and then one line per match with its address and zero subset and
+ * superset counts.  Nothing at all is written for an empty list.
+ * @param mlist      The match list to write
+ * @param match_file The output stream to write to
+ */
+inline
+void WriteList( const MatchList& mlist, std::ostream& match_file)
+{
+    if( mlist.size() == 0 )
+        return;
+
+    // the first match determines how many sequences the header describes
+    const unsigned int seq_count = mlist.front()->SeqCount();
+
+    match_file << "FormatVersion" << '\t' << 3 << "\n";
+    match_file << "SequenceCount" << '\t' << seq_count << "\n";
+    for(unsigned int seqI = 0; seqI < seq_count; seqI++){
+        const bool have_name = mlist.seq_filename.size() > seqI;
+        const bool have_seq = mlist.seq_table.size() > seqI;
+
+        match_file << "Sequence" << seqI << "File" << '\t';
+        if( have_name )
+            match_file << mlist.seq_filename[seqI];
+        else
+            match_file << "null";
+        match_file << "\n";
+
+        match_file << "Sequence" << seqI << "Length" << '\t';
+        if( have_seq )
+            match_file << mlist.seq_table[seqI]->length();
+        else
+            match_file << "0";
+        match_file << "\n";
+    }
+
+    match_file << "MatchCount" << '\t' << mlist.size() << std::endl;
+
+    // one line per match: the match, its address, subset count, superset count
+    for( std::vector<Match*>::const_iterator cur = mlist.begin(); cur != mlist.end(); cur++ ){
+        match_file << **cur << '\t';
+        match_file << (MatchID_t)(*cur) << '\t';
+        match_file << 0;
+        match_file << '\t' << 0;
+        match_file << std::endl;
+    }
+}
+
+/**
+ * Keeps only the matches whose Multiplicity() equals mult, preserving their
+ * order; every other match is released with Free() and dropped.
+ * @param mult The multiplicity a match must have to be kept
+ */
+template< typename MatchPtrType >
+void GenericMatchList< MatchPtrType >::MultiplicityFilter( unsigned mult ){
+
+    size_t kept = 0;
+    for( uint memI = 0; memI < this->size(); memI++ ){
+        if( (*this)[ memI ]->Multiplicity() != mult ){
+            (*this)[ memI ]->Free();
+            (*this)[ memI ] = NULL;
+            continue;
+        }
+        // compact survivors towards the front of the vector
+        (*this)[ kept ] = (*this)[ memI ];
+        ++kept;
+    }
+    this->resize( kept );
+}
+
+/**
+ * Removes every match shorter than length, preserving the order of the
+ * remaining matches; removed matches are released with Free().
+ * @param length The minimum length a match must have to be kept
+ */
+template< typename MatchPtrType >
+void GenericMatchList< MatchPtrType >::LengthFilter( gnSeqI length ){
+
+    size_t kept = 0;
+    for( size_t memI = 0; memI < this->size(); memI++ ){
+        if( !( (*this)[ memI ]->Length() >= length ) ){
+            (*this)[ memI ]->Free();
+            (*this)[ memI ] = NULL;
+            continue;
+        }
+        // compact survivors towards the front of the vector
+        (*this)[ kept ] = (*this)[ memI ];
+        ++kept;
+    }
+    this->resize( kept );
+}
+
+}
+
+#endif //_MatchList_h_
diff --git a/libMems/MatchProjectionAdapter.h b/libMems/MatchProjectionAdapter.h
new file mode 100644
index 0000000..91d2193
--- /dev/null
+++ b/libMems/MatchProjectionAdapter.h
@@ -0,0 +1,142 @@
+/*******************************************************************************
+ * $Id: MatchProjectionAdapter.h,v 1.8 2004/02/27 23:08:55 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifndef __MatchProjectionAdapter_h__
+#define __MatchProjectionAdapter_h__
+
+#include "libMems/AbstractMatch.h"
+#include <vector>
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+namespace mems {
+
+/**
+ * MatchProjectionAdapter is a wrapper around an AbstractMatch that effectively projects a multi-match to a
+ * subset match.  The adapter holds its own copy of the wrapped match (obtained with Copy() and released
+ * with Free()) together with a projection vector "seq": component i of the adapter corresponds to
+ * component seq[i] of the wrapped match.  Per-sequence calls are forwarded with the index translated
+ * through "seq"; whole-match calls (MoveStart, Invert, CropStart, ...) are forwarded unchanged.
+ * Use of non-const functions results in undefined state.
+ */
+class MatchProjectionAdapter : public mems::AbstractMatch
+{
+public:
+    /** Creates an adapter that wraps no match; most member functions must not be called on it. */
+    MatchProjectionAdapter() : m(NULL){};
+
+    /**
+     * Wraps a private copy of match.
+     * @param match       the match to project (copied, the caller keeps ownership of the original)
+     * @param projection  projection[i] is the component of match exposed as component i
+     */
+    MatchProjectionAdapter( mems::AbstractMatch* match, const std::vector< size_t >& projection ) :
+        seq(projection)
+    {
+        m = match->Copy();
+    }
+
+    /** Copy constructor: duplicates the wrapped match (if any) and the projection. */
+    MatchProjectionAdapter( const MatchProjectionAdapter& mpa ) :
+        seq( mpa.seq )
+    {
+        if( mpa.m != NULL )
+            m = mpa.m->Copy();
+        else
+            m = NULL;
+    }
+
+    /** Releases the wrapped copy. */
+    ~MatchProjectionAdapter()
+    {
+        if( m != NULL )
+            m->Free();
+    }
+
+    MatchProjectionAdapter* Clone() const { return new MatchProjectionAdapter( *this ); }
+
+    inline
+    MatchProjectionAdapter* Copy() const
+    {
+        return m_allocateAndCopy( *this );
+    }
+
+    void Free()
+    {
+        m_free(this);
+    }
+
+    /**
+     * Assignment: replaces the wrapped match with a copy of mpa's match.
+     * Safe for self-assignment and for a source adapter that wraps no match.
+     */
+    MatchProjectionAdapter& operator=( const MatchProjectionAdapter& mpa )
+    {
+        // self-assignment would otherwise Free() the match and then copy from it
+        if( this == &mpa )
+            return *this;
+
+        // copy first, so that the old match is only released once the new one exists;
+        // a default-constructed source has m == NULL and must not be dereferenced
+        mems::AbstractMatch* replacement = NULL;
+        if( mpa.m != NULL )
+            replacement = mpa.m->Copy();
+
+        if( m != NULL )
+            m->Free();
+        m = replacement;
+        seq = mpa.seq;
+        return *this;
+    }
+
+    //
+    // forward all function calls to match
+    //
+    gnSeqI Length( uint seqI ) const { return m->Length(seq[seqI]); }
+    void SetLength( gnSeqI len, uint seqI ) { m->SetLength(len, seq[seqI]); }
+    int64 Start(uint startI) const { return m->Start(seq[startI]); }
+    void SetStart(uint seqI, int64 start) { m->SetStart(seq[seqI],start); }
+    gnSeqI LeftEnd(uint seqI) const { return m->LeftEnd(seq[seqI]); }
+    orientation Orientation(uint seqI) const { return m->Orientation(seq[seqI]); }
+    void SetLeftEnd(uint seqI, gnSeqI start) { m->SetLeftEnd(seq[seqI],start); }
+    void SetOrientation(uint seqI, orientation o) { m->SetOrientation(seq[seqI],o); }
+    void MoveStart(int64 move_amount) { m->MoveStart(move_amount); }
+    void MoveEnd(int64 move_amount) { m->MoveEnd(move_amount); }
+
+    /** @return the number of projected components that are defined (LeftEnd != NO_MATCH) */
+    uint Multiplicity() const
+    {
+        uint mult = 0;
+        for( size_t projI = 0; projI < seq.size(); projI++ )
+            // translate through the projection: projI indexes the adapter, not the wrapped match
+            if( m->LeftEnd(seq[projI]) != mems::NO_MATCH )
+                ++mult;
+        return mult;
+    }
+    uint SeqCount() const { return seq.size(); }
+    uint FirstStart() const { return 0; }
+    gnSeqI AlignmentLength() const { return m->AlignmentLength(); }
+    void Invert() { m->Invert(); }
+    void CropStart(gnSeqI crop_amount) { m->CropStart(crop_amount); }
+    void CropEnd(gnSeqI crop_amount) { m->CropEnd(crop_amount); }
+    void CropLeft(gnSeqI crop_amount, uint seqI) { m->CropLeft(crop_amount, seq[seqI]); }
+    void CropRight(gnSeqI crop_amount, uint seqI) { m->CropRight(crop_amount, seq[seqI]); }
+
+    /** Fills align_matrix with the alignment rows of the projected components, in projection order. */
+    void GetAlignment( std::vector< mems::bitset_t >& align_matrix ) const
+    {
+        std::vector< mems::bitset_t > aln_mat;
+        m->GetAlignment(aln_mat);
+        align_matrix.clear();
+        for( size_t seqI = 0; seqI < seq.size(); ++seqI )
+            align_matrix.push_back(aln_mat[seq[seqI]]);
+    }
+
+    /**
+     * Retrieves alignment column col restricted to the projected components.
+     * Both output vectors are overwritten and end up with seq.size() entries.
+     */
+    void GetColumn( gnSeqI col, std::vector<gnSeqI>& pos, std::vector<bool>& column ) const
+    {
+        std::vector<gnSeqI> m_pos;
+        std::vector<bool> m_column;
+        m->GetColumn(col,m_pos,m_column);
+        pos.clear();
+        // column must be reset as well, otherwise repeated calls keep appending to it
+        column.clear();
+        for( size_t seqI = 0; seqI < seq.size(); ++seqI )
+        {
+            pos.push_back(m_pos[seq[seqI]]);
+            column.push_back(m_column[seq[seqI]]);
+        }
+    }
+    bool IsGap( uint seqI, gnSeqI col ) const { return m->IsGap( seq[seqI],col ); }
+
+    /**
+     * @return the adapter index of the (seqI+1)-th projected component whose Start() is non-zero,
+     *         or the maximum uint value when there are not that many
+     */
+    uint UsedSeq( uint seqI ) const
+    {
+        uint c = 0;
+        for( uint i = 0; i < seq.size(); i++ )
+        {
+            if(m->Start(seq[i]) != 0)
+                c++;
+            if(c>seqI)
+                return i;
+        }
+        return (std::numeric_limits<uint>::max)();
+    };
+
+    mems::AbstractMatch* m;        // privately held copy of the wrapped match, NULL if none
+    std::vector< size_t > seq;     // seq[i] = component of *m exposed as component i
+};
+
+}
+
+#endif // __MatchProjectionAdapter_h__
diff --git a/libMems/Matrix.h b/libMems/Matrix.h
new file mode 100644
index 0000000..568d521
--- /dev/null
+++ b/libMems/Matrix.h
@@ -0,0 +1,174 @@
+/*******************************************************************************
+ * $Id: Matrix.h,v 1.6 2004/02/27 23:08:55 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifndef __Matrix_h__
+#define __Matrix_h__
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libGenome/gnSetup.h"
+#include <iostream>
+#include <sstream>
+#include <vector>
+#include <stdexcept>
+
+/**
+ * Matrix is a dense two-dimensional array of T stored in a single
+ * heap buffer in row-major order (element (i,j) lives at data_[i*ncols_ + j]).
+ */
+template<class T>
+class Matrix
+{
+public:
+ Matrix();
+ Matrix(unsigned nrows, unsigned ncols);
+ // Documented to throw a BadSize object if either size is zero.
+ // NOTE(review): the constructor defined further down never throws BadSize.
+ class BadSize : public std::range_error{
+ public:
+ BadSize() : std::range_error( "Bad matrix size" ){}
+ };
+
+ // Based on the Law Of The Big Three:
+ ~Matrix();
+ Matrix(const Matrix<T>& m);
+ Matrix<T>& operator= (const Matrix<T>& m);
+ // Access methods to get the (i,j) element:
+ T& operator() (unsigned i, unsigned j);
+ const T& operator() (unsigned i, unsigned j) const;
+ // These throw a BoundsViolation object if i or j is too big
+ class BoundsViolation : public std::range_error{
+ public:
+ BoundsViolation() : std::range_error( "Index out of bounds" ){}
+ };
+ // Support for initializing each matrix element to a value
+ void init( const T& init_val );
+
+ // Writes the matrix as text: one row per line, cells separated by tabs
+ void print( std::ostream& os ) const;
+ // Replaces the matrix with tab-separated text read from the stream
+ void read( std::istream& is );
+
+ unsigned rows() const;
+ unsigned cols() const;
+protected:
+ T* data_;                 // row-major element buffer, NULL for a default-constructed matrix
+ unsigned nrows_, ncols_;  // dimensions of the buffer
+};
+
+/** Default constructor: an empty 0 x 0 matrix that owns no buffer. */
+template<class T>
+inline Matrix<T>::Matrix()
+    : data_ (NULL),
+      nrows_ (0),
+      ncols_ (0)
+{
+}
+
+/** @return the number of rows */
+template<class T>
+inline unsigned Matrix<T>::rows() const
+{
+ return nrows_;
+}
+
+/** @return the number of columns */
+template<class T>
+inline unsigned Matrix<T>::cols() const
+{
+ return ncols_;
+}
+
+/**
+ * Mutable access to element (row, col).
+ * @throws BoundsViolation if row or col lies outside the matrix
+ */
+template<class T>
+inline T& Matrix<T>::operator() (unsigned row, unsigned col)
+{
+    const bool in_range = row < nrows_ && col < ncols_;
+    if( !in_range )
+        throw BoundsViolation();
+    // row-major layout
+    return data_[row*ncols_ + col];
+}
+
+/**
+ * Read-only access to element (row, col).
+ * Behaves like the non-const overload: no diagnostic text is written to
+ * standard output, an out-of-range index is reported only by the exception.
+ * @throws BoundsViolation if row or col lies outside the matrix
+ */
+template<class T>
+inline const T& Matrix<T>::operator() (unsigned row, unsigned col) const
+{
+    if (row >= nrows_ || col >= ncols_)
+        throw BoundsViolation();
+    return data_[row*ncols_ + col];
+}
+
+/**
+ * Allocates an nrows x ncols matrix of default-initialized elements.
+ * NOTE(review): no size validation happens here; nrows * ncols is computed
+ * in unsigned arithmetic and a zero dimension simply allocates new T[0].
+ */
+template<class T>
+inline Matrix<T>::Matrix(unsigned nrows, unsigned ncols)
+ : data_ (new T[nrows * ncols]),
+ nrows_ (nrows),
+ ncols_ (ncols)
+{
+}
+/**
+ * Copy constructor.
+ * The members are put into the empty state before delegating to operator=,
+ * because operator= releases the buffer data_ currently points to and must
+ * never see an uninitialized pointer.
+ */
+template<class T>
+inline Matrix<T>::Matrix(const Matrix<T>& m)
+    : data_ (NULL),
+      nrows_ (0),
+      ncols_ (0)
+{
+    *this = m;
+}
+
+/**
+ * Assignment: makes this matrix a deep copy of m.
+ * Elements are copied one by one with T's own assignment operator, so the
+ * template is also safe for element types that are not trivially copyable.
+ * The new buffer is allocated before the old one is released; a failed
+ * allocation therefore leaves this matrix untouched.
+ * @return *this
+ */
+template<class T>
+inline Matrix<T>& Matrix<T>::operator=( const Matrix<T>& m )
+{
+    // self-assignment would otherwise delete the buffer and then read from it
+    if( this == &m )
+        return *this;
+
+    const unsigned cell_count = m.nrows_ * m.ncols_;
+    T* fresh = new T[cell_count];
+    // cell_count is 0 for a default-constructed source, so a NULL m.data_ is never read
+    for( unsigned cellI = 0; cellI < cell_count; cellI++ )
+        fresh[cellI] = m.data_[cellI];
+
+    if( data_ != NULL )
+        delete[] data_;
+    data_ = fresh;
+    nrows_ = m.nrows_;
+    ncols_ = m.ncols_;
+    return *this;
+}
+
+/** Destructor: releases the element buffer, if there is one. */
+template<class T>
+inline Matrix<T>::~Matrix()
+{
+    // delete[] on a null pointer is a no-op, so no explicit test is needed
+    delete[] data_;
+}
+
+/**
+ * Sets every element of the matrix to init_val.
+ * @param init_val  the value assigned to each cell
+ */
+template<class T>
+inline void Matrix<T>::init( const T& init_val )
+{
+    // the buffer is contiguous, so one pass over all cells is enough
+    const unsigned cell_count = nrows_ * ncols_;
+    for( unsigned cellI = 0; cellI < cell_count; ++cellI )
+        data_[ cellI ] = init_val;
+}
+
+/**
+ * Writes the matrix to os, one row per line with tab-separated cells.
+ * Each row is terminated with std::endl.
+ */
+template<class T>
+inline void Matrix<T>::print( std::ostream& os ) const{
+    for( unsigned rowI = 0; rowI < nrows_; ++rowI ){
+        const unsigned row_base = rowI * ncols_;    // offset of the row's first cell
+        for( unsigned colI = 0; colI < ncols_; ++colI ){
+            // separator goes between cells, never before the first one
+            if( colI != 0 )
+                os << '\t';
+            os << data_[ row_base + colI ];
+        }
+        os << std::endl;
+    }
+}
+
+/**
+ * Replaces the contents of the matrix with tab-separated text read from is.
+ * Every line of the stream becomes one row; the number of columns is taken
+ * from the first line.  Each cell is parsed with operator>> for T.  Rows that
+ * have fewer cells than the first line are padded with T(), surplus cells
+ * are ignored.  An empty stream yields an empty 0 x 0 matrix.
+ * Any buffer previously owned by the matrix is released.
+ */
+template<class T>
+inline void Matrix<T>::read( std::istream& is ){
+    std::vector< std::string > lines;
+    std::string cur_line;
+    while( std::getline( is, cur_line ) )
+        lines.push_back( cur_line );
+
+    // drop the old contents first so the previous buffer is not leaked
+    delete[] data_;
+    data_ = NULL;
+    nrows_ = 0;
+    ncols_ = 0;
+    if( lines.empty() )
+        return;     // nothing to parse, and lines[0] must not be touched
+
+    // count ncols
+    unsigned col_count = 0;
+    {
+        std::stringstream first_row( lines[0] );
+        while( std::getline( first_row, cur_line, '\t' ) )
+            col_count++;
+    }
+
+    data_ = new T[lines.size() * col_count];
+    nrows_ = lines.size();
+    ncols_ = col_count;
+
+    for( size_t lineI = 0; lineI < lines.size(); lineI++ ){
+        std::stringstream row_stream( lines[ lineI ] );
+        // parse every cell of the row, not just the first one
+        for( unsigned colI = 0; colI < ncols_; colI++ ){
+            T& cell = data_[ lineI * ncols_ + colI ];
+            cell = T();
+            if( std::getline( row_stream, cur_line, '\t' ) ){
+                std::stringstream type_stream( cur_line );
+                type_stream >> cell;
+            }
+        }
+    }
+}
+
+#endif // __Matrix_h__
diff --git a/libMems/MemHash.cpp b/libMems/MemHash.cpp
new file mode 100644
index 0000000..9c35f10
--- /dev/null
+++ b/libMems/MemHash.cpp
@@ -0,0 +1,330 @@
+/*******************************************************************************
+ * $Id: MemHash.cpp,v 1.32 2004/03/01 02:40:08 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * Please see the file called COPYING for licensing, copying, and modification
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/MemHash.h"
+#include "libGenome/gnFilter.h"
+#include <list>
+#include <map>
+#include <sstream>
+
+using namespace std;
+using namespace genome;
+namespace mems {
+
+// Default constructor: an empty hash of DEFAULT_MEM_TABLE_SIZE buckets with
+// default tolerances, zeroed statistics and match logging switched off.
+MemHash::MemHash()
+    : MatchFinder(),
+      allocator( SlotAllocator<MatchHashEntry>::GetSlotAllocator() )
+{
+    // statistics and tolerances
+    seq_count = 0;
+    m_mem_count = 0;
+    m_collision_count = 0;
+    m_repeat_tolerance = DEFAULT_REPEAT_TOLERANCE;
+    m_enumeration_tolerance = DEFAULT_ENUMERATION_TOLERANCE;
+
+    // the hash table itself: one (empty) bucket and one zero counter per slot
+    table_size = DEFAULT_MEM_TABLE_SIZE;
+    mem_table.resize( table_size );
+    mem_table_count.assign( table_size, 0 );
+
+    match_log = NULL;
+}
+
+//make sure this calls the destructor on each element
+// NOTE(review): the destructor body is currently empty -- the call that would
+// hand the entries tracked in "allocated" back to the allocator is commented
+// out, so entries are only released when Clear() is called explicitly.
+MemHash::~MemHash(){
+// allocator.Free(allocated);
+}
+
+// Copy constructor: copy-constructs the MatchFinder base, binds the allocator
+// reference to the slot allocator returned by GetSlotAllocator(), then
+// delegates the remaining members to operator=.
+MemHash::MemHash(const MemHash& mh) : MatchFinder(mh), allocator( SlotAllocator<MatchHashEntry>::GetSlotAllocator() )
+{
+ *this = mh;
+}
+
+// Assignment: copies the table geometry, statistics, tolerances, the bucket
+// contents and the match log pointer from mh.
+// The buckets hold MatchHashEntry pointers, so both objects refer to the same
+// entries afterwards; the "allocated" tracking list is not copied.
+MemHash& MemHash::operator=( const MemHash& mh ){
+    if( this == &mh )
+        return *this;
+
+    table_size = mh.table_size;
+    mer_size = mh.mer_size;
+    seq_count = mh.seq_count;
+    m_mem_count = mh.m_mem_count;
+    m_collision_count = mh.m_collision_count;
+    m_repeat_tolerance = mh.m_repeat_tolerance;
+    m_enumeration_tolerance = mh.m_enumeration_tolerance;
+
+    // replace (rather than append to) the per-bucket data: appending left the
+    // counters of an already populated object out of step with its buckets
+    mem_table = mh.mem_table;
+    mem_table.resize(table_size);
+    mem_table_count = mh.mem_table_count;
+    mem_table_count.resize(table_size, 0);
+
+    match_log = mh.match_log;
+    return *this;
+}
+
+// Returns a heap-allocated copy of this object made with the copy constructor;
+// the caller is responsible for deleting it.
+MemHash* MemHash::Clone() const{
+ return new MemHash(*this);
+}
+
+// Resets only the MatchFinder base state; the hash table, its counters and
+// the tolerances are left untouched (compare with Clear()).
+void MemHash::ClearSequences()
+{
+ MatchFinder::Clear();
+}
+
+// Returns the object to its freshly constructed state while keeping the
+// current table size: clears the MatchFinder base, zeroes the statistics,
+// restores the default tolerances, empties every bucket, disables match
+// logging and hands all tracked entries back to the allocator.
+void MemHash::Clear()
+{
+ MatchFinder::Clear();
+ m_mem_count = 0;
+ m_collision_count = 0;
+ m_repeat_tolerance = DEFAULT_REPEAT_TOLERANCE;
+ m_enumeration_tolerance = DEFAULT_ENUMERATION_TOLERANCE;
+ //clear the hash table
+ for(uint32 listI = 0; listI < table_size; ++listI){
+ mem_table[listI].clear();
+ mem_table_count[ listI ] = 0;
+ }
+ match_log = NULL;
+
+ // NOTE(review): "allocated" is passed to Free() but not cleared here --
+ // confirm that SlotAllocator::Free empties the vector it is given.
+ allocator.Free(allocated);
+ // WARNING! WARNING! WARNING! this will destroy ALL objects since the allocator has static lifetime!!
+// allocator.Purge();
+}
+
+// Re-creates the hash table with new_table_size empty buckets and zeroed
+// per-bucket counters.  Entries held in the old table are dropped from the
+// buckets; the match statistics are not reset here.
+void MemHash::SetTableSize(uint32 new_table_size){
+    table_size = new_table_size;
+    // assign() discards the old contents and leaves table_size fresh elements
+    mem_table.assign( table_size, std::vector<MatchHashEntry*>() );
+    mem_table_count.assign( table_size, 0 );
+}
+
+// Runs MatchFinder::FindMatchSeeds() on the sequences already loaded.
+// Always returns true; the result of the seed search is not inspected.
+boolean MemHash::CreateMatches(){
+ MatchFinder::FindMatchSeeds();
+ return true;
+}
+
+// Finds matches over the full length of every sequence in ml: equivalent to
+// FindMatchesFromPosition() with a start point of 0 for each sequence.
+void MemHash::FindMatches( MatchList& ml ) {
+    const vector<gnSeqI> start_points( ml.seq_table.size(), (gnSeqI)0 );
+    FindMatchesFromPosition( ml, start_points );
+}
+
+// Registers every sequence / sorted mer list pair of ml with this finder,
+// searches for match seeds beginning at the given start points and finally
+// stores the matches found in ml.
+// If a sequence cannot be added an error message naming its file is emitted
+// and the function returns without searching; sequences added before the
+// failure stay registered.
+void MemHash::FindMatchesFromPosition( MatchList& ml, const vector<gnSeqI>& start_points ){
+ for( uint32 seqI = 0; seqI < ml.seq_table.size(); ++seqI ){
+ if( !AddSequence( ml.sml_table[ seqI ], ml.seq_table[ seqI ] ) ){
+ ErrorMsg( "Error adding " + ml.seq_filename[seqI] + "\n");
+ return;
+ }
+ }
+ MatchFinder::FindMatchSeeds( start_points );
+
+ GetMatchList( ml );
+}
+
+// Returns a new MatchList holding copies of all matches in the hash table,
+// with its sequence and sorted-mer-list tables set to those of this finder.
+MatchList MemHash::GetMatchList() const{
+ MatchList ml;
+ GetMatchList( ml );
+ ml.seq_table = seq_table;
+ ml.sml_table = sar_table;
+
+ return ml;
+}
+
+// an attempt to do this without sorting, which appears to be very slow...
+//
+// Filters a list of identical mers before turning it into match seeds.
+// A per-sequence tally is kept while walking match_list:
+//  - a mer is kept for hashing while its sequence has contributed fewer than
+//    m_enumeration_tolerance mers so far;
+//  - as soon as a sequence has already contributed more than
+//    m_repeat_tolerance mers the whole list is abandoned (returns true
+//    without creating any match).
+// With at least two mers kept, the list goes to HashMatch() when the
+// enumeration tolerance is 1, otherwise to MatchFinder::EnumerateMatches().
+// NOTE(review): iter->id indexes enum_tally directly, so it is assumed to be
+// smaller than seq_count.
+boolean MemHash::EnumerateMatches( IdmerList& match_list )
+{
+ vector< uint > enum_tally(seq_count, 0);
+ IdmerList::iterator iter = match_list.begin();
+ IdmerList hash_list;
+ for(; iter != match_list.end(); ++iter)
+ {
+ if( enum_tally[iter->id] < m_enumeration_tolerance )
+ {
+ hash_list.push_back(*iter);
+ }
+ if(enum_tally[iter->id] > m_repeat_tolerance)
+ return true;
+ ++enum_tally[iter->id];
+ }
+
+ if(hash_list.size() > 1){
+ if(m_enumeration_tolerance == 1)
+ return HashMatch(hash_list);
+ else
+ return MatchFinder::EnumerateMatches( hash_list );
+ }
+ return true;
+}
+
+//why have separate hash tables? dunno. no reason. what was i thinking
+// at that coffeehouse in portland when i wrote this crappy code?
+// MemHashEntries use GENETICIST coordinates. They start at 1, not 0.
+//
+// Builds a seed-length MatchHashEntry from a list of identical mers (one
+// start per sequence id, shifted to 1-based coordinates), fixes up the
+// relative orientation, computes its offset and adds it to the hash table.
+// An entry that ends up covering fewer than two sequences is only reported
+// on standard output and not added.  Always returns true.
+boolean MemHash::HashMatch(IdmerList& match_list){
+ //check that there is at least one forward component
+// match_list.sort(&idmer_id_lessthan);
+ // initialize the hash entry
+ MatchHashEntry mhe = MatchHashEntry(seq_count, GetSar(0)->SeedLength());
+ mhe.SetLength( GetSar(0)->SeedLength() );
+
+ //Fill in the new Match and set direction parity if needed.
+ IdmerList::iterator iter = match_list.begin();
+ for(; iter != match_list.end(); ++iter)
+ mhe.SetStart(iter->id, iter->position + 1);
+ SetDirection(mhe);
+ mhe.CalculateOffset();
+ if(mhe.Multiplicity() < 2){
+ cout << "red flag " << mhe << "\n";
+ cout << "match_list.size(): " << match_list.size() << endl;
+ }else
+ AddHashEntry(mhe);
+
+ return true;
+}
+
+// Sets the relative orientation of the components of mhe.
+// The first defined component acts as the reference and keeps its sign.  For
+// every later defined component the low bit of its seed mer is compared with
+// the reference; when the comparison below holds the component's start is
+// negated.
+// NOTE(review): presumably the low bit of GetDnaSeedMer() flags the strand
+// the seed was read from -- confirm against SortedMerList.
+void MemHash::SetDirection(MatchHashEntry& mhe){
+ //get the reference direction
+ boolean ref_forward = false;
+ uint32 seqI=0;
+ for(; seqI < mhe.SeqCount(); ++seqI)
+ if(mhe[seqI] != NO_MATCH){
+ // mhe stores 1-based coordinates, hence the - 1
+ ref_forward = !(GetSar(seqI)->GetDnaSeedMer(mhe[seqI] - 1) & 0x1);
+ break;
+ }
+ //set directional parity for the rest
+ for(++seqI; seqI < mhe.SeqCount(); ++seqI)
+ if(mhe[seqI] != NO_MATCH)
+ if(ref_forward == (GetSar(seqI)->GetDnaSeedMer(mhe[seqI] - 1) & 0x1))
+ mhe.SetStart(seqI, -mhe[seqI]);
+}
+
+// Tries to add a new mem to the mem hash table
+// If the mem already exists in the table, a pointer to it
+// is returned. Otherwise mhe is added and a pointer to
+// it is returned.
+//
+// The bucket is chosen from the entry's offset and every bucket is kept
+// sorted with mhecomp, so lookups and insertions use a binary search.  A new
+// entry is extended first (unless it is already marked as extended); subset
+// matches produced by the extension are added recursively.
+MatchHashEntry* MemHash::AddHashEntry(MatchHashEntry& mhe){
+    //first compute which hash table bucket this is going into
+    int64 offset = mhe.Offset();
+
+    // the double modulo keeps the bucket index non-negative for negative offsets
+    uint32 bucketI = ((offset % table_size) + table_size) % table_size;
+    vector<MatchHashEntry*>::iterator insert_he;
+    insert_he = std::lower_bound(mem_table[bucketI].begin(), mem_table[bucketI].end(), &mhe, mhecomp);
+//  insert_he = mem_table[bucketI].find(&mhe);
+    // equivalent under mhecomp (neither orders before the other) means already present
+    if( insert_he != mem_table[bucketI].end() && (!mhecomp(*insert_he, &mhe) && !mhecomp(&mhe, *insert_he)) ){
+        ++m_collision_count;
+        return *insert_he;
+    }
+
+    //if we made it this far there were no collisions
+    //extend the mem into the surrounding region.
+    vector<MatchHashEntry> subset_matches;
+    if( !mhe.Extended() )
+        ExtendMatch(mhe, subset_matches);
+
+    // placement-construct the stored copy in memory owned by the slot allocator
+    MatchHashEntry* new_mhe = allocator.Allocate();
+    new_mhe = new(new_mhe) MatchHashEntry(mhe);
+//  *new_mhe = mhe;
+    allocated.push_back(new_mhe);
+
+    // can't insert until after the extend!!
+    insert_he = std::lower_bound(mem_table[bucketI].begin(), mem_table[bucketI].end(), new_mhe, mhecomp);
+    mem_table[bucketI].insert(insert_he, new_mhe);
+
+    // log it.
+    if( match_log != NULL ){
+        (*match_log) << *new_mhe << endl;
+        match_log->flush();
+    }
+
+    // link up the subset matches
+    // (the entry pointers returned by the recursive calls are not needed here)
+    for(uint32 subsetI = 0; subsetI < subset_matches.size(); ++subsetI){
+        AddHashEntry( subset_matches[ subsetI ] );
+    }
+
+    ++mem_table_count[bucketI];
+    ++m_mem_count;
+    return new_mhe;
+}
+
+// Writes one line per hash bucket to os: bucket index, number of matches
+// counted for that bucket and the summed Length() of the matches stored in
+// it, separated by tabs.
+void MemHash::PrintDistribution(ostream& os) const{
+    for( uint32 bucketI = 0; bucketI < mem_table_count.size(); ++bucketI ){
+        const vector<MatchHashEntry*>& bucket = mem_table[bucketI];
+
+        // total number of bases covered by the matches of this bucket
+        gnSeqI base_count = 0;
+        vector<MatchHashEntry*>::const_iterator entry = bucket.begin();
+        while( entry != bucket.end() ){
+            base_count += (*entry)->Length();
+            ++entry;
+        }
+
+        os << bucketI << '\t' << mem_table_count[bucketI] << '\t' << base_count << '\n';
+    }
+}
+
+// Reads matches from mem_file and adds them to the hash table.
+// Every record consists of a length followed by one start coordinate per
+// sequence, all whitespace separated.  The number of sequences is deduced
+// from the first line: it is the number of values that follow the length.
+// Reading stops at the first record that is incomplete or unparsable.  If
+// the stream is empty or its first line holds no parsable record nothing is
+// added.
+void MemHash::LoadFile(istream& mem_file){
+    string tag;
+    gnSeqI len;
+    int64 start;
+    MatchHashEntry mhe;
+    seq_count = 0;
+    if( !getline( mem_file, tag ) )
+        return;     // empty stream: there is no first record to size the matches from
+
+    // first pass over the first line: count the start coordinates
+    stringstream first_mum( tag );
+    if( !(first_mum >> len) )
+        return;     // the first line does not begin with a match length
+    while( first_mum >> start ){
+        seq_count++;
+    }
+    if( seq_count == 0 )
+        return;     // a length without any coordinates is not a match
+
+    // second pass: rewind and actually store the values.  str() moves the
+    // read position back to the start of the line, so the length column has
+    // to be consumed again before the coordinates are read.
+    mhe = MatchHashEntry(seq_count, mer_size, MatchHashEntry::seed);
+    first_mum.clear();
+    first_mum.str( tag );
+    first_mum >> len;
+    for(uint32 seqI = 0; seqI < seq_count; seqI++){
+        first_mum >> start;
+        mhe.SetStart(seqI, start);
+    }
+    mhe.SetLength( len );
+    mhe.CalculateOffset();
+    AddHashEntry(mhe);
+
+    // remaining records.  Testing the extraction itself (instead of good())
+    // keeps a final record that is not followed by a newline.
+    while( mem_file >> len ){
+        mhe.SetLength(len);
+        bool complete = true;
+        for(uint32 seqI = 0; seqI < seq_count; seqI++){
+            if( !(mem_file >> start) ){
+                complete = false;
+                break;
+            }
+            mhe.SetStart(seqI, start);
+        }
+        //break if the stream ended in the middle of a record
+        if( !complete )
+            break;
+        mhe.CalculateOffset();
+        AddHashEntry(mhe);
+    }
+}
+
+// Writes a tab-separated header (format version, sequence count, the source
+// name and length of every sequence, match count) followed by all matches of
+// the hash table, one per line, bucket by bucket.
+// Sequences without a source name are written as "null".
+void MemHash::WriteFile(ostream& mem_file) const{
+    mem_file << "FormatVersion" << '\t' << 1 << "\n";
+    mem_file << "SequenceCount" << '\t' << sar_table.size() << "\n";
+
+    unsigned int seqI = 0;
+    while( seqI < seq_count ){
+        // name of the sequence source, with a placeholder for unnamed ones
+        mem_file << "Sequence" << seqI << "File";
+        gnGenomeSpec* spec = seq_table[seqI]->GetSpec();
+        string sourcename = spec->GetName();
+        if( sourcename == "" )
+            sourcename = "null";
+        mem_file << '\t' << sourcename << "\n";
+
+        mem_file << "Sequence" << seqI << "Length";
+        mem_file << '\t' << seq_table[seqI]->length() << "\n";
+        seqI++;
+    }
+
+    mem_file << "MatchCount" << '\t' << m_mem_count << endl;
+
+    // dump the content of every bucket in table order
+    for( uint32 bucketI = 0; bucketI < table_size; bucketI++ ){
+        const vector<MatchHashEntry*>& bucket = mem_table[bucketI];
+        for( vector<MatchHashEntry*>::const_iterator entry = bucket.begin(); entry != bucket.end(); entry++ )
+            mem_file << **entry << "\n";
+    }
+}
+
+
+} // namespace mems
diff --git a/libMems/MemHash.h b/libMems/MemHash.h
new file mode 100644
index 0000000..8ef145c
--- /dev/null
+++ b/libMems/MemHash.h
@@ -0,0 +1,208 @@
+/*******************************************************************************
+ * $Id: MemHash.h,v 1.23 2004/03/01 02:40:08 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifndef _MemHash_h_
+#define _MemHash_h_
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <set>
+#include <map>
+#include <iostream>
+
+#include "libMems/MatchFinder.h"
+#include "libMems/Match.h"
+#include "libGenome/gnException.h"
+#include "libMems/MatchList.h"
+#include "libMems/MatchHashEntry.h"
+#include "libMems/SlotAllocator.h"
+#include "boost/pool/object_pool.hpp"
+
+namespace mems {
+
+static const uint32 DEFAULT_MEM_TABLE_SIZE = 40000;
+static const uint32 DEFAULT_REPEAT_TOLERANCE = 0;
+static const uint32 DEFAULT_ENUMERATION_TOLERANCE = 1;
+
+/**
+ * MemHash implements an algorithm for finding exact matches of a certain minimal
+ * length in several sequences.
+ */
+class MemHash : public MatchFinder{
+
+
+
+public:
+ MemHash();
+ ~MemHash();
+ MemHash(const MemHash& mh);
+ MemHash& operator=( const MemHash& mh );
+ virtual MemHash* Clone() const;
+ virtual void Clear();
+ virtual void ClearSequences();
+
+ /**
+ * Finds all maximal exact matches in the sequences contained by "match_list"
+ * The resulting list of matches is stored within "match_list"
+ */
+ virtual void FindMatches( MatchList& match_list );
+ virtual void FindMatchesFromPosition( MatchList& match_list, const std::vector<gnSeqI>& start_points );
+
+ /**
+ * Generates exact matches for the sequences loaded into this MemHash
+ */
+ virtual boolean CreateMatches();
+
+ /**
+ * Returns the size of the hash table being used.
+ * @return the size of the hash table being used.
+ */
+ virtual uint32 TableSize() const {return table_size;};
+ /**
+ * Sets the size of the hash table to new_table_size.
+ * @param new_table_size The new hash table size
+ */
+ virtual void SetTableSize(uint32 new_table_size);
+ /**
+ * Creates a new MatchList instance which contains all the matches currently
+ * stored in the hash table (for example after calling CreateMatches()).
+ */
+ virtual MatchList GetMatchList() const;
+ /**
+ * Places pointers to the mems that have been found into the vector mem_list
+ * @param mem_list an empty vector.
+ */
+ //virtual void GetMatchList( std::vector<Match*>& mem_list ) const;
+
+ /**
+ * Use this to convert MatchHashEntry mem list to a generic match list type
+ * converts the mem_list into the type specified by MatchListType
+ */
+ template< class MatchListType >
+ void GetMatchList( MatchListType& mem_list ) const;
+
+ /**
+ * Returns the number of mems found
+ * NOTE(review): the counter is stored as uint64 but returned as uint32.
+ * @return The number of mems found
+ */
+ virtual uint32 MemCount(){return m_mem_count;}
+ /**
+ * Returns the number of mers thrown out because they were contained in an existing mem
+ * NOTE(review): the counter is stored as uint64 but returned as uint32.
+ * @return The number of mers thrown out because they were contained in an existing mem
+ */
+ virtual uint32 MemCollisionCount(){return m_collision_count;}
+ /**
+ * Copies the per-bucket match counters into table_count.
+ */
+ virtual void MemTableCount(std::vector<uint32>& table_count){table_count = mem_table_count;}
+ /**
+ * Prints the number of matches in each hash table bucket to the ostream os.
+ * @param os The stream to print to.
+ */
+ virtual void PrintDistribution(std::ostream& os) const;
+
+ /**
+ * Reads in a list of mems from an input stream
+ * @throws A InvalidFileFormat exception if the file format is unknown or the file is corrupt
+ */
+ virtual void LoadFile(std::istream& mem_file);
+ /**
+ * Writes the matches stored in this MemHash out to the ostream @param mem_file.
+ */
+ virtual void WriteFile(std::ostream& mem_file) const;
+
+ /**
+ * Sets the permitted repetitivity of match seeds.
+ * Set @param repeat_tolerance to 0 to generate MUMs, any higher setting will generate MEMs
+ * Many possible combinations of repetitive seed matches may be ignored, depending on the
+ * setting of the repeat enumeration tolerance.
+ * @see SetEnumerationTolerance
+ * @param repeat_tolerance the permitted repetitivity of match seeds
+ */
+ virtual void SetRepeatTolerance(uint32 repeat_tolerance){m_repeat_tolerance = repeat_tolerance;}
+ /**
+ * @return the permitted repetitivity of match seeds.
+ * @see SetRepeatTolerance
+ */
+ virtual uint32 GetRepeatTolerance() const{return m_repeat_tolerance;}
+ /**
+ * Sets the match seed repeat enumeration tolerance.
+ * When matching mers are found across sequences which also occur several times in any particular
+ * sequence there are several possible match seeds which could be generated.
+ * The enumeration tolerance controls how many of these possibilities are actually used as match
+ * seeds and extended into full matches. The selection of actual seeds from the realm of possibilities
+ * is essentially arbitrary, though not explicitly randomized.
+ */
+ virtual void SetEnumerationTolerance(uint32 enumeration_tolerance){m_enumeration_tolerance = enumeration_tolerance;}
+ /**
+ * @return the match seed repeat enumeration tolerance.
+ * @see SetEnumerationTolerance
+ */
+ virtual uint32 GetEnumerationTolerance() const{return m_enumeration_tolerance;}
+
+ /**
+ * Setting this to a non-null value causes matches to be logged as they are found
+ */
+ void SetMatchLog( std::ostream* match_log ){ this->match_log = match_log; }
+
+
+
+ //end void GetMatchList( std::vector<MatchListType*>& mem_list );
+
+protected:
+ virtual boolean EnumerateMatches( IdmerList& match_list );
+ virtual boolean HashMatch(IdmerList& match_list);
+ virtual void SetDirection(MatchHashEntry& mhe);
+ virtual MatchHashEntry* AddHashEntry(MatchHashEntry& mhe);
+ // returns listI*(listI+1)/2, the listI-th triangular number
+ virtual uint32 quadratic_li(uint32 listI){return (listI*(listI+1))/2;}
+
+ uint32 table_size; // number of buckets in mem_table
+ std::vector< std::vector<MatchHashEntry*> > mem_table; // the hash table: one vector of entries per bucket
+ uint32 m_repeat_tolerance;
+ uint32 m_enumeration_tolerance;
+ uint64 m_mem_count; // number of entries added to the table
+ uint64 m_collision_count; // number of entries rejected because an equivalent one existed
+ std::vector<uint32> mem_table_count; // number of entries counted per bucket
+
+ std::ostream* match_log; // when non-null, every new entry is written here
+ SlotAllocator<MatchHashEntry>& allocator;
+ std::vector<MatchHashEntry*> allocated; // used to track what needs to get explicitly destroyed later...
+// boost::object_pool<MatchHashEntry> allocator;
+ MheCompare mhecomp; // ordering used to keep each bucket sorted
+};
+
+
+/**
+ * Use this to convert MatchHashEntry mem list to a generic match list type
+ * converts the mem_list into the type specified by MatchListType
+ *
+ * mem_list is cleared first.  For every entry in the hash table a new match
+ * object is obtained with Copy() from a default-constructed prototype and the
+ * entry is then assigned to it, so mem_list receives objects that are
+ * independent of the table; the caller is presumably responsible for
+ * releasing them.
+ * MatchListType::value_type must be a pointer type whose pointee is
+ * default-constructible, offers Copy() and is assignable from MatchHashEntry.
+ */
+template< class MatchListType >
+void MemHash::GetMatchList( MatchListType& mem_list ) const {
+
+ mem_list.clear();
+ typedef typename MatchListType::value_type MatchType;
+
+ //Boost to the rescue! use remove_pointer to get at MatchListType's type
+ typedef typename boost::remove_pointer<MatchType>::type SinPtrMatchType;
+ // prototype object, only used as the source of Copy()
+ SinPtrMatchType mm;
+
+ for(uint32 i=0; i < table_size; ++i)
+ {
+ std::vector<MatchHashEntry*>::const_iterator iter = mem_table[i].begin();
+ for(; iter != mem_table[i].end(); iter++ )
+ {
+ MatchType m = mm.Copy();
+ *m = **iter;
+ mem_list.push_back( m );
+ }
+ }
+
+}
+
+
+}
+
+#endif //_MemHash_h_
diff --git a/libMems/Memory.h b/libMems/Memory.h
new file mode 100644
index 0000000..ec259ea
--- /dev/null
+++ b/libMems/Memory.h
@@ -0,0 +1,60 @@
+#ifndef __libMems_Memory_h__
+#define __libMems_Memory_h__
+
+
+void printMemUsage();
+static bool debugging_memory = false;
+#include <iostream>
+
+#ifdef WIN32
+#include <windows.h>
+#include <PSAPI.h>
+/**
+ * Prints the working set size and the pagefile usage (both in megabytes) of
+ * the first running process whose main module name starts with
+ * "progressiveMauve".  Prints nothing if the process list cannot be
+ * obtained, no such process is found or its memory counters cannot be read.
+ */
+inline
+void printMemUsage()
+{
+//  if(!debugging_memory)
+//      return;
+
+    DWORD proclist[500];
+    DWORD cbNeeded = 0;
+    if( !EnumProcesses( proclist, sizeof(proclist), &cbNeeded ) )
+        return;     // without a process list there is nothing to search
+    int p_count = cbNeeded / sizeof(DWORD);
+
+    HANDLE phand = NULL;
+    bool found = false;
+    HMODULE hMod;
+    char szFileName[MAX_PATH];
+    for( int p = 0; p < p_count && !found; p++ )
+    {
+        phand = OpenProcess( PROCESS_QUERY_INFORMATION | PROCESS_VM_READ, 0, proclist[p] );
+        if( phand == NULL )
+            continue;   // process could not be opened, e.g. insufficient rights
+        DWORD dwSize2;
+        if (EnumProcessModules(phand, &hMod, sizeof(hMod), &dwSize2))
+        {
+
+            // Get the module name
+            if ( !GetModuleBaseName(phand, hMod, szFileName, sizeof(szFileName)) )
+                szFileName[0] = 0;
+            if( strncmp( szFileName, "progressiveMauve", 16 ) == 0 )
+                found = true;   // found the right module
+        }
+        // only the handle of the matching process is kept open past the loop
+        if( !found )
+        {
+            CloseHandle(phand);
+            phand = NULL;
+        }
+    }
+    if( !found )
+        return;     // never query a handle that has already been closed
+
+    PROCESS_MEMORY_COUNTERS mem_info;
+
+    if( GetProcessMemoryInfo( phand, &mem_info, sizeof(mem_info) ) )
+    {
+        std::cout << "Working set size: " << mem_info.WorkingSetSize / (1024 * 1024) << " Mb\n";
+//      cout << "Paged pool usage: " << mem_info.QuotaPagedPoolUsage << endl;
+//      cout << "Non-Paged pool usage: " << mem_info.QuotaNonPagedPoolUsage << endl;
+        std::cout << "Pagefile usage: " << mem_info.PagefileUsage / (1024 * 1024) << " Mb\n";
+        std::cout.flush();
+    }
+    // release the handle that was kept open for the query
+    CloseHandle( phand );
+}
+#else
+// Non-Windows builds: memory usage reporting is not implemented, this is a no-op.
+inline
+void printMemUsage()
+{};
+#endif
+
+#endif //__libMems_Memory_h__
+
diff --git a/libMems/MemorySML.cpp b/libMems/MemorySML.cpp
new file mode 100644
index 0000000..e42c3f8
--- /dev/null
+++ b/libMems/MemorySML.cpp
@@ -0,0 +1,96 @@
+/*******************************************************************************
+ * $Id: MemorySML.cpp,v 1.8 2004/03/01 02:40:08 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * Please see the file called COPYING for licensing, copying, and modification
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/MemorySML.h"
+#include <algorithm>
+
+using namespace std;
+using namespace genome;
+namespace mems {
+
+// Creates an empty list: stores the alphabet width and a copy of the
+// character translation table in the header and sets the header version to 0.
+// NOTE(review): exactly UINT8_MAX bytes of "table" are copied; whether that
+// covers the complete translation table depends on the declared size of
+// header.translation_table, which is not visible here -- confirm.
+MemorySML::MemorySML(const uint8* table, const uint32 alpha_bits ){
+ header.alphabet_bits = alpha_bits;
+ memcpy(header.translation_table, table, UINT8_MAX);
+ header.version = 0;
+}
+
+// Copy constructor, implemented in terms of operator=.
+MemorySML::MemorySML(const MemorySML& msa){
+ *this = msa;
+}
+
+// Assignment: copies the SortedMerList base state and the sorted position array.
+MemorySML& MemorySML::operator=(const MemorySML& msa ) {
+ SortedMerList::operator=( msa );
+ positions = msa.positions;
+ return *this;
+}
+
+// Returns a heap-allocated copy of this list; the caller is responsible for deleting it.
+MemorySML* MemorySML::Clone() const{
+ return new MemorySML(*this);
+}
+
+// Clears the SortedMerList base state and discards all stored positions.
+void MemorySML::Clear(){
+ SortedMerList::Clear();
+ positions.clear();
+}
+
+// Builds the sorted mer list for seq using the given seed pattern.
+// All mers of the sequence are generated (through the spaced-seed filler when
+// seed length and seed weight differ, through the plain filler otherwise),
+// sorted with bmer_lessthan, and their positions are appended to "positions"
+// in sorted order; the mers themselves are not kept.
+void MemorySML::Create(const gnSequence& seq, const uint64 seed ){
+    SortedMerList::Create(seq, seed);
+
+    vector<bmer> sorted_mers;
+    if( header.seed_length == header.seed_weight )
+        FillSML( seq, sorted_mers );            // contiguous seed
+    else
+        FillDnaSeedSML( seq, sorted_mers );     // spaced seed
+    sort( sorted_mers.begin(), sorted_mers.end(), &bmer_lessthan );
+
+    // keep only the positions, in sorted-mer order
+    positions.reserve( sorted_mers.size() );
+    for( vector<bmer>::const_iterator mer_iter = sorted_mers.begin(); mer_iter != sorted_mers.end(); ++mer_iter )
+        positions.push_back( mer_iter->position );
+}
+
+// Copies up to "size" consecutive mers of the sorted list, beginning at index
+// "offset", into readVector (which is emptied first).
+// Returns true only when all requested mers were available.  When the range
+// runs past the end of the list the available mers are still delivered and
+// false is returned; an offset beyond the end delivers nothing.
+boolean MemorySML::Read(vector<bmer>& readVector, gnSeqI size, gnSeqI offset )
+{
+    readVector.clear();
+    if( offset > positions.size() )
+        return false;
+
+    // clamp the end of the requested range to the end of the list
+    boolean success = !( offset + size > positions.size() );
+    gnSeqI last_mer = offset + size;
+    if( !success )
+        last_mer = positions.size();
+
+    bmer cur_mer;
+    gnSeqI merI = offset;
+    while( merI < last_mer ){
+        // the mer is not stored, it is recomputed from its position
+        cur_mer.position = positions[merI];
+        cur_mer.mer = GetSeedMer( cur_mer.position );
+        readVector.push_back( cur_mer );
+        merI++;
+    }
+    return success;
+}
+
+// Not implemented: the body is empty, both arguments are ignored.
+void MemorySML::Merge(SortedMerList& sa, SortedMerList& sa2){
+
+}
+
+// Returns the mer at the given index of the sorted list; the mer value is
+// recomputed from the stored position.  index is not range checked.
+bmer MemorySML::operator[](gnSeqI index)
+{
+ bmer cur_mer;
+ cur_mer.position = positions[index];
+ cur_mer.mer = GetSeedMer( cur_mer.position );
+ return cur_mer;
+}
+
+} // namespace mems
diff --git a/libMems/MemorySML.h b/libMems/MemorySML.h
new file mode 100644
index 0000000..d89229e
--- /dev/null
+++ b/libMems/MemorySML.h
@@ -0,0 +1,58 @@
+/*******************************************************************************
+ * $Id: MemorySML.h,v 1.7 2004/03/01 02:40:08 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifndef _MemorySML_h_
+#define _MemorySML_h_
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libGenome/gnSequence.h"
+#include "libMems/SortedMerList.h"
+
+namespace mems {
+
+/** The MemorySML is an implementation of sorted mer lists which creates and
+ * stores the sorted mer list entirely in memory. A MemorySML consumes
+ * roughly 32 + alpha_bits bits of memory per character in the sequences.
+ * For unambiguous DNA sequences 4.25 bytes per base are required.
+ */
+class MemorySML : public SortedMerList
+{
+public:
+ /**
+ * Create an empty MemorySML
+ * Creates an empty MemorySML with the supplied translation
+ * table and alphabet bit size. Defaults to DNA settings
+ * @param table The array used to translate characters into binary code
+ * @param alpha_bits The number of bits each character consumes in binary
+ */
+ MemorySML(const uint8* table = SortedMerList::BasicDNATable(), const uint32 alpha_bits = DNA_ALPHA_BITS);
+ MemorySML(const MemorySML& msa);
+ MemorySML& operator=(const MemorySML& msa );
+ MemorySML* Clone() const;
+
+ /** Discards the base class state and all stored positions. */
+ virtual void Clear();
+
+ /** Builds the sorted mer list for seq using the given seed pattern. */
+ virtual void Create(const genome::gnSequence& seq, const uint64 seed);
+ /**
+ * Copies up to size mers, beginning at index offset of the sorted list, into readVector.
+ * @return true only if all requested mers were available
+ */
+ virtual boolean Read(std::vector<bmer>& readVector, gnSeqI size, gnSeqI offset = 0);
+ /** Declared for interface compatibility; the implementation in MemorySML.cpp is empty. */
+ virtual void Merge(SortedMerList& sa, SortedMerList& sa2);
+
+ /** @return the mer at the given index of the sorted list (no range check) */
+ virtual bmer operator[](gnSeqI index);
+
+protected:
+
+// virtual void FillSML(const gnSeqI seq_len, vector<gnSeqI>& sml_array);
+ // sequence positions of all mers, ordered by the sort order of their mers
+ std::vector<smlSeqI_t> positions;
+
+};
+
+}
+
+#endif //_MemorySML_h_
diff --git a/libMems/MuscleInterface.cpp b/libMems/MuscleInterface.cpp
new file mode 100644
index 0000000..25deb99
--- /dev/null
+++ b/libMems/MuscleInterface.cpp
@@ -0,0 +1,1192 @@
+/*******************************************************************************
+ * $Id: MuscleInterface.cpp,v 1.27 2004/03/01 02:40:08 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/MuscleInterface.h"
+
+#include "libGenome/gnFilter.h"
+#include "libGenome/gnFASSource.h"
+#include "libGenome/gnStringTools.h"
+#include "libMUSCLE/muscle.h"
+#include "libMUSCLE/params.h"
+#include "libMUSCLE/msa.h"
+#include "libMUSCLE/seq.h"
+#include "libMUSCLE/seqvect.h"
+#include "libMUSCLE/tree.h"
+#include "libMUSCLE/clust.h"
+#include "libMUSCLE/profile.h"
+#include "libMUSCLE/distfunc.h"
+#include "libMUSCLE/clustsetdf.h"
+#include "libMUSCLE/textfile.h"
+#include "libMUSCLE/types.h"
+#include "boost/algorithm/string/erase.hpp"
+#include "boost/algorithm/string/case_conv.hpp"
+
+#include <sstream>
+#include <fstream>
+
+using namespace std;
+using namespace genome;
+
+// this gets defined in muscle.cpp, but not declared in any headers
+namespace muscle {
+extern void MUSCLE(SeqVect &v, MSA &msaOut);
+extern void RefineW(const MSA &msaIn, MSA &msaOut);
+}
+
+using namespace muscle;
+
+namespace mems {
+
+bool debug_muscle = false;
+
+bool pipeExec( char** cmd_argv, const string& command, const string& input, string& output, string& error );
+
+char** parseCommand( const string& cmd );
+/**
+ * Splits a command line into an argv-style array.
+ * Text enclosed in double quotes becomes a single token (quotes removed);
+ * text outside quotes is split on whitespace.  Empty quoted strings are dropped.
+ * @param cmd The command line to tokenize
+ * @return A NULL-terminated array allocated with new[]; every element is
+ *         itself allocated with new[].  The caller owns the array.
+ */
+char** parseCommand( const string& cmd ){
+    // tokenize on "
+    stringstream quote_stream( cmd );
+    string segment;
+    // toggled before first use, so the first segment is treated as unquoted
+    bool in_quote = true;
+    vector< string > cmd_tokens;
+    while( getline( quote_stream, segment, '"' ) ){
+        // never start out in a quote
+        in_quote = !in_quote;
+        if( segment.empty() )
+            continue;
+        if( in_quote ){
+            cmd_tokens.push_back( segment );
+        }else{
+            stringstream word_stream( segment );
+            string word;
+            while( word_stream >> word )
+                cmd_tokens.push_back( word );
+        }
+    }
+
+    const size_t token_total = cmd_tokens.size();
+    char** cmd_array = new char*[ token_total + 1 ];
+    for( size_t tokI = 0; tokI < token_total; tokI++ ){
+        const string& token = cmd_tokens[ tokI ];
+        cmd_array[ tokI ] = new char[ token.length() + 1 ];
+        strcpy( cmd_array[ tokI ], token.c_str() );
+    }
+    cmd_array[ token_total ] = NULL;	// execvp() requires a NULL terminator
+    return cmd_array;
+}
+
+#if !defined(WIN32)
+#include <unistd.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+// unix pipelined execution code
+// Closes fd if it is open and marks it closed, so no descriptor is ever closed twice.
+static void closePipeEnd( int& fd ){
+    if( fd >= 0 ){
+        close( fd );
+        fd = -1;
+    }
+}
+
+// Closes both ends of a pipe created with pipe().
+static void closePipePair( int fds[2] ){
+    closePipeEnd( fds[0] );
+    closePipeEnd( fds[1] );
+}
+
+/**
+ * Runs cmd_argv as a child process, writes `input` to its stdin and appends
+ * everything the child writes to stdout onto `output`.
+ * @param cmd_argv NULL-terminated argument vector passed to execvp()
+ * @param command  Unused by the unix implementation (the windows one needs it)
+ * @param input    Data fed to the child's stdin
+ * @param output   Receives the child's stdout
+ * @param error    Unused here: the child's stderr is not redirected
+ * @return true when the child was started and its output was read to EOF.
+ *         The child's exit status is not inspected; callers check `output`.
+ * The whole input is written before any output is read, so a child that
+ * produces a lot of output before consuming its input can still block.
+ */
+bool pipeExec( char** cmd_argv, const string& command, const string& input, string& output, string& error ){
+    // -1 means "not open"; cleanup only touches descriptors that were opened
+    int stdin_pipe[2] = { -1, -1 };
+    int stdout_pipe[2] = { -1, -1 };
+    const char* fail = NULL;
+    char buf[1024];
+
+    pid_t sid = setsid();
+    if( sid < 0 )
+        sid = getpgrp();
+
+    if( sid < 0 )
+        fail = "sid";
+    else if( pipe(stdin_pipe) < 0 )
+        fail = "stdin";
+    else if( pipe(stdout_pipe) < 0 )
+        fail = "stdout";
+
+    if( fail != NULL ){
+        fprintf(stderr, "Ouch, the world just collapsed (%s).\n", fail);
+        perror("muscle:");
+        closePipePair( stdin_pipe );
+        closePipePair( stdout_pipe );
+        return false;
+    }
+
+    // make sure all pipe ends are in blocking mode
+    fcntl(stdin_pipe[0], F_SETFL, fcntl(stdin_pipe[0], F_GETFL) & ~O_NONBLOCK);
+    fcntl(stdin_pipe[1], F_SETFL, fcntl(stdin_pipe[1], F_GETFL) & ~O_NONBLOCK);
+    fcntl(stdout_pipe[0], F_SETFL, fcntl(stdout_pipe[0], F_GETFL) & ~O_NONBLOCK);
+    fcntl(stdout_pipe[1], F_SETFL, fcntl(stdout_pipe[1], F_GETFL) & ~O_NONBLOCK);
+
+    const pid_t pid1 = fork();
+    if( pid1 < 0 ){
+        closePipePair( stdin_pipe );
+        closePipePair( stdout_pipe );
+        return false;
+    }
+    if( pid1 == 0 ){
+        // child: wire the pipes to stdin/stdout and become the command
+        dup2(stdin_pipe[0], 0);
+        dup2(stdout_pipe[1], 1);
+        close( stdin_pipe[0] );
+        close( stdin_pipe[1] );
+        close( stdout_pipe[0] );
+        close( stdout_pipe[1] );
+        execvp(cmd_argv[0], cmd_argv);
+        _exit(errno);	// only reached when execvp() failed
+    }
+    setpgid(pid1, sid);
+
+    // send the input, retrying on short writes and signal interruptions
+    const char* cursor = input.data();
+    size_t remaining = input.size();
+    while( remaining > 0 ){
+        const ssize_t written = write( stdin_pipe[1], cursor, remaining );
+        if( written < 0 ){
+            if( errno == EINTR )
+                continue;
+            perror( "write: " );
+            break;
+        }
+        cursor += written;
+        remaining -= written;
+    }
+    // the read end is kept open until after the write so that a child which
+    // died early cannot raise SIGPIPE in this process
+    if( close( stdin_pipe[1] ) )
+        perror( "close stdin_w: " );
+    stdin_pipe[1] = -1;
+    if( close( stdin_pipe[0] ) )
+        perror( "close stdin_r: " );
+    stdin_pipe[0] = -1;
+
+    // without this the read loop below would never see EOF
+    closePipeEnd( stdout_pipe[1] );
+
+    // read the alignment
+    bool read_ok = true;
+    while( true ){
+        const ssize_t bread = read( stdout_pipe[0], buf, sizeof(buf) );
+        if( bread == 0 )
+            break;	// reached EOF
+        if( bread < 0 ){
+            if( errno == EINTR )
+                continue;
+            perror("muscle read: " );
+            read_ok = false;
+            break;
+        }
+        output.append( buf, bread );
+    }
+
+    // reap exactly the child started above, not an unrelated one
+    while( waitpid( pid1, NULL, 0 ) < 0 && errno == EINTR )
+        ;
+
+    closePipeEnd( stdout_pipe[0] );
+    return read_ok;
+}
+
+
+#else
+
+//windows piping code
+#include <windows.h>
+#define bzero(a) memset(a,0,sizeof(a)) //easier -- shortcut
+
+// Reports whether GetVersionEx() identifies the platform as Windows NT.
+bool IsWinNT()
+{
+    OSVERSIONINFO version_info;
+    version_info.dwOSVersionInfoSize = sizeof(version_info);
+    GetVersionEx(&version_info);
+    const bool is_nt = ( version_info.dwPlatformId == VER_PLATFORM_WIN32_NT );
+    return is_nt;
+}
+
+/**
+ * Prints "<str>: <system message for GetLastError()>" to stdout.
+ * @param str Label identifying the API call that failed
+ */
+void ErrorMessage(char *str)
+{
+    // read the error code first, before any other API call can overwrite it
+    const DWORD err_code = GetLastError();
+    LPSTR msg = NULL;
+    // the ANSI variant is used explicitly because the text is printed with %s
+    const DWORD msg_len = FormatMessageA(
+        FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
+        NULL,
+        err_code,
+        MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), // Default language
+        (LPSTR) &msg,
+        0,
+        NULL
+    );
+    if( msg_len == 0 || msg == NULL ){
+        // no message text available: fall back to the numeric code
+        printf("%s: error code %lu\n", str, (unsigned long)err_code);
+        return;
+    }
+    printf("%s: %s\n",str,msg);
+    LocalFree(msg);
+}
+
+// Closes h if it refers to an open handle and marks it invalid.
+static void closeHandleIfValid( HANDLE& h ){
+    if( h != INVALID_HANDLE_VALUE ){
+        CloseHandle( h );
+        h = INVALID_HANDLE_VALUE;
+    }
+}
+
+/**
+ * Runs `command` as a hidden child process, writes `input` to its stdin and
+ * appends its stdout to `output` and its stderr to `error`.
+ * @param cmd_argv Unused by the windows implementation
+ * @param command  Full command line handed to CreateProcess
+ * @param input    Data fed to the child's stdin
+ * @param output   Receives the child's stdout
+ * @param error    Receives the child's stderr
+ * @return true when the child was started and ran to completion
+ */
+bool pipeExec( char** cmd_argv, const string& command, const string& input, string& output, string& error ){
+
+    char buf[1024];                 //i/o buffer
+
+    STARTUPINFO si;
+    SECURITY_ATTRIBUTES sa;
+    SECURITY_DESCRIPTOR sd;         //security information for pipes
+    PROCESS_INFORMATION pi;
+    // Every handle starts out invalid so that the cleanup at 'finito' only
+    // closes what was really opened.  All locals are declared here, ahead of
+    // the first goto, so no jump skips an initialization.
+    HANDLE newstdin_w = INVALID_HANDLE_VALUE, newstdout_w = INVALID_HANDLE_VALUE, newstderr_w = INVALID_HANDLE_VALUE;
+    HANDLE newstdin_r = INVALID_HANDLE_VALUE, newstdout_r = INVALID_HANDLE_VALUE, newstderr_r = INVALID_HANDLE_VALUE;
+    HANDLE read_stdout = INVALID_HANDLE_VALUE, read_stderr = INVALID_HANDLE_VALUE, write_stdin = INVALID_HANDLE_VALUE;  //pipe handles
+    bool success = false;
+    BOOL fSuccess = FALSE;
+    char* cmd = NULL;
+    unsigned long exit = 0;         //process exit code
+    unsigned long bread = 0;        //bytes read
+    unsigned long avail = 0;        //bytes available
+    pi.hProcess = INVALID_HANDLE_VALUE;
+    pi.hThread = INVALID_HANDLE_VALUE;
+
+    if (IsWinNT())        //initialize security descriptor (Windows NT)
+    {
+        InitializeSecurityDescriptor(&sd,SECURITY_DESCRIPTOR_REVISION);
+        SetSecurityDescriptorDacl(&sd, true, NULL, false);
+        sa.lpSecurityDescriptor = &sd;
+    }
+    else sa.lpSecurityDescriptor = NULL;
+
+    sa.nLength = sizeof(SECURITY_ATTRIBUTES);
+    sa.bInheritHandle = true;         //allow inheritable handles
+
+    if (!CreatePipe(&newstdin_r,&newstdin_w,&sa,0))   //create stdin pipe
+    {
+        ErrorMessage("CreatePipe");
+        goto finito;
+    }
+    if (!CreatePipe(&newstdout_r,&newstdout_w,&sa,0))  //create stdout pipe
+    {
+        ErrorMessage("CreatePipe");
+        goto finito;
+    }
+    if (!CreatePipe(&newstderr_r,&newstderr_w,&sa,0))  //create stderr pipe
+    {
+        ErrorMessage("CreatePipe");
+        goto finito;
+    }
+    // Duplicate the write handle to the pipe so it is not inherited.
+    fSuccess = DuplicateHandle(GetCurrentProcess(), newstdin_w,
+        GetCurrentProcess(), &write_stdin, 0,
+        FALSE,                  // not inherited
+        DUPLICATE_SAME_ACCESS);
+    if (! fSuccess){
+        write_stdin = INVALID_HANDLE_VALUE;
+        ErrorMessage("DuplicateHandle failed");
+        goto finito;
+    }
+    closeHandleIfValid( newstdin_w );
+
+    // Duplicate the read handle to the pipe so it is not inherited.
+    fSuccess = DuplicateHandle(GetCurrentProcess(), newstdout_r,
+        GetCurrentProcess(), &read_stdout, 0,
+        FALSE,                  // not inherited
+        DUPLICATE_SAME_ACCESS);
+    if (! fSuccess){
+        read_stdout = INVALID_HANDLE_VALUE;
+        ErrorMessage("DuplicateHandle failed");
+        goto finito;
+    }
+    closeHandleIfValid( newstdout_r );
+
+    // Duplicate the read handle to the pipe so it is not inherited.
+    fSuccess = DuplicateHandle(GetCurrentProcess(), newstderr_r,
+        GetCurrentProcess(), &read_stderr, 0,
+        FALSE,                  // not inherited
+        DUPLICATE_SAME_ACCESS);
+    if (! fSuccess){
+        read_stderr = INVALID_HANDLE_VALUE;
+        ErrorMessage("DuplicateHandle failed");
+        goto finito;
+    }
+    closeHandleIfValid( newstderr_r );
+
+    GetStartupInfo(&si);      //set startupinfo for the spawned process
+    /*
+    The dwFlags member tells CreateProcess how to make the process.
+    STARTF_USESTDHANDLES validates the hStd* members. STARTF_USESHOWWINDOW
+    validates the wShowWindow member.
+    */
+    si.dwFlags = STARTF_USESTDHANDLES|STARTF_USESHOWWINDOW;
+    si.wShowWindow = SW_HIDE;
+    si.hStdOutput = newstdout_w;
+    si.hStdError = newstderr_w;     //set the new handles for the child process
+    si.hStdInput = newstdin_r;
+
+    //spawn the child process
+    // CreateProcess may modify the command line, so hand it a writable copy
+    cmd = new char[ command.length() + 1 ];
+    strcpy( cmd, command.c_str() );
+    fSuccess = CreateProcess(NULL,cmd,NULL,NULL,TRUE,0,
+                        NULL,NULL,&si,&pi);
+    delete[] cmd;   // allocated with new[], so delete[] is required
+    cmd = NULL;
+    if (! fSuccess)
+    {
+        pi.hProcess = INVALID_HANDLE_VALUE;
+        pi.hThread = INVALID_HANDLE_VALUE;
+        ErrorMessage("CreateProcess");
+        goto finito;
+    }
+
+    WriteFile(write_stdin, input.c_str(), input.size(), &bread, NULL); //send data to stdin
+    closeHandleIfValid( write_stdin );
+
+    // Poll until the child process exits, draining its output as it arrives.
+    while( true ){
+        GetExitCodeProcess( pi.hProcess, &exit );
+        if( exit != STILL_ACTIVE )
+            WaitForSingleObject( pi.hProcess, INFINITE );
+
+        // read anything that came to stdout
+        if( !PeekNamedPipe(read_stdout,NULL,0,NULL,&avail,NULL) )
+            avail = 0;
+        if( avail == 0 )
+            Sleep(5);   // didn't get anything, so take a break to avoid hogging the CPU...
+        while( avail > 0 ){
+            const unsigned long read_size = 1023 < avail ? 1023 : avail;
+            if( !ReadFile(read_stdout,buf,read_size,&bread,NULL) || bread == 0 )
+                break;  // a failed read must not spin forever
+            avail -= bread;
+            output.append( buf, bread );
+        }
+
+        // read anything that came to stderr
+        if( !PeekNamedPipe(read_stderr,NULL,0,NULL,&avail,NULL) )
+            avail = 0;
+        while( avail > 0 ){
+            const unsigned long read_size = 1023 < avail ? 1023 : avail;
+            if( !ReadFile(read_stderr,buf,read_size,&bread,NULL) || bread == 0 )
+                break;
+            avail -= bread;
+            error.append( buf, bread );
+        }
+
+        if( exit != STILL_ACTIVE )
+            break;
+    }
+    // Wait until child process exits.
+    WaitForSingleObject( pi.hProcess, INFINITE );
+    success = true;
+
+    //clean up and exit
+finito:
+    closeHandleIfValid( pi.hThread );
+    closeHandleIfValid( pi.hProcess );
+    closeHandleIfValid( newstdin_r );
+    closeHandleIfValid( newstdin_w );
+    closeHandleIfValid( newstdout_r );
+    closeHandleIfValid( newstdout_w );
+    closeHandleIfValid( newstderr_r );
+    closeHandleIfValid( newstderr_w );
+    closeHandleIfValid( read_stdout );
+    closeHandleIfValid( read_stderr );
+    closeHandleIfValid( write_stdin );
+    return success;
+}
+
+#endif
+
+
+
+// Returns the single MuscleInterface instance, a function-local static.
+MuscleInterface& MuscleInterface::getMuscleInterface()
+{
+    static MuscleInterface shared_instance;
+    return shared_instance;
+}
+
+// Sets the default executable name, default arguments and maximum alignment
+// length, and builds the argv array from the default command line.
+MuscleInterface::MuscleInterface() : GappedAligner() {
+    max_alignment_length = 12500;
+    muscle_path = "muscle_aed";
+    muscle_arguments = "-stable -quiet -seqtype DNA";
+    const string default_command = muscle_path + " " + muscle_arguments;
+    muscle_cmdline = parseCommand( default_command );
+}
+
+// Derives the muscle executable path from the directory part of argv0:
+// the result is "<directory of argv0>muscle_aed", wrapped in double quotes.
+void MuscleInterface::ParseMusclePath( const char* argv0 ){
+    string exe_dir( argv0 );
+    // a leading quote means the path is quoted: drop first and last character
+    if( exe_dir[0] == '"' )
+        exe_dir = exe_dir.substr( 1, exe_dir.size() - 2 );
+    standardizePathString( exe_dir );
+
+    // keep everything up to and including the last '/', or nothing at all
+    const string::size_type last_slash = exe_dir.rfind('/');
+    if( last_slash == string::npos )
+        exe_dir.clear();
+    else
+        exe_dir.erase( last_slash + 1 );
+
+    SetMusclePath( '"' + exe_dir + "muscle_aed\"");
+}
+
+// Stores a new executable path and rebuilds the argv array from it and the
+// current muscle arguments.
+void MuscleInterface::SetMusclePath( const string& path ){
+    muscle_path = path;
+    ClearCommandLine();
+    const string full_command = muscle_path + " " + muscle_arguments;
+    muscle_cmdline = parseCommand( full_command );
+}
+
+// Stores additional arguments for muscle.  In the visible part of this file the
+// stored value is only read by SetMuscleArguments(), which appends it to the
+// arguments it is given; the command line is not rebuilt here.
+void MuscleInterface::SetExtraMuscleArguments( const string& args )
+{
+ extra_muscle_arguments = args;
+}
+
+// Replaces the muscle arguments with `args` followed by the extra arguments
+// and rebuilds the argv array for the resulting command line.
+void MuscleInterface::SetMuscleArguments( const string& args )
+{
+    const string combined_args = args + " " + extra_muscle_arguments;
+    ClearCommandLine();
+    muscle_arguments = combined_args;
+    muscle_cmdline = parseCommand( muscle_path + " " + combined_args );
+}
+
+// Assigns only the GappedAligner base part of ci.  None of the members set by
+// the MuscleInterface constructor in this file (muscle_path, muscle_arguments,
+// muscle_cmdline) are copied here.
+// NOTE(review): possibly intentional, since muscle_cmdline is a raw array this
+// object manages -- confirm before relying on assignment to copy settings.
+MuscleInterface& MuscleInterface::operator=( const MuscleInterface& ci ){
+ GappedAligner::operator =( ci );
+ return *this;
+}
+
+//tjt: not the best way of doing this, should have just one Align function that takes an AbstractMatch*,
+ // not both Match* & AbstractMatch* in separate, nearly identical functions..
+ // Such a change would involve changes to GappedAligner, and would require some additional care taken
+ // with SeqCount & Multiplicity, as well as seq_table[ seqI ]->length()/seq_table[ 0 ]->length(i),
+ // for now, leave like this. hopefully sooner than later, make pretty!
+/**
+ * Aligns the sequence regions lying between two matches with MUSCLE.
+ * @param cr        Receives the resulting gapped alignment
+ * @param r_begin   Match on the left of the region, or NULL
+ * @param r_end     Match on the right of the region, or NULL
+ * @param seq_table One sequence per genome; indexed by sequence number
+ * @return true when an alignment was stored in cr; false when fewer than two
+ *         sequences had a usable region, MUSCLE failed, or an exception occurred
+ */
+boolean MuscleInterface::Align( GappedAlignment& cr, Match* r_begin, Match* r_end, vector< gnSequence* >& seq_table ){
+    const uint seq_count = seq_table.size();
+try{
+
+//
+//	Get the sequence in the intervening gaps between these two matches
+//
+    vector< string > seq_data;	// one entry per sequence that takes part
+    vector< int64 > starts;	// one entry per sequence, NO_MATCH when skipped
+    vector< uint > seqs;	// sequence numbers of the entries in seq_data
+    const gnFilter* rc_filter = gnFilter::DNAComplementFilter();
+
+    for( uint seqI = 0; seqI < seq_count; seqI++ ){
+
+        // skip this sequence if it's undefined
+        if( (r_end != NULL && r_end->Start( seqI ) == NO_MATCH ) ||
+            (r_begin != NULL && r_begin->Start( seqI ) == NO_MATCH) ){
+            starts.push_back( NO_MATCH );
+            continue;
+        }
+
+        // determine the size of the gap
+        int64 gap_start = 0;
+        int64 gap_end = 0;
+        getInterveningCoordinates( seq_table, r_begin, r_end, seqI, gap_start, gap_end );
+
+        const int64 diff = gap_end - gap_start;
+        if( diff <= 0 || diff > max_alignment_length ){
+            starts.push_back( NO_MATCH );
+            continue;	// skip this sequence if it's either too big or too small
+        }
+        seqs.push_back( seqI );
+// the gnSequence pointers are shared across threads and have a common ifstream
+        // extract sequence data
+        if( r_end == NULL || r_end->Start( seqI ) > 0 ){
+            starts.push_back( gap_start );
+            seq_data.push_back( seq_table[ seqI ]->ToString( diff , gap_start ) );
+        }else{
+            // reverse complement the sequence data.
+            starts.push_back( -gap_start );
+            string cur_seq_data = seq_table[ seqI ]->ToString( diff , gap_start );
+            rc_filter->ReverseFilter( cur_seq_data );
+            seq_data.push_back( cur_seq_data );
+        }
+    }
+
+    // an alignment needs at least two sequences
+    if( seqs.size() <= 1 )
+        return false;
+
+    vector< string > aln_matrix;
+    if( !CallMuscleFast( aln_matrix, seq_data, 0, 0 ) ){
+        cout << "Muscle was unable to align:\n";
+        if( r_begin )
+            cout << "Left match: " << *r_begin << endl;
+        if( r_end )
+            cout << "Right match: " << *r_end << endl;
+        return false;
+    }
+
+    const gnSeqI aln_length = aln_matrix.size() == 0 ? 0 : aln_matrix[0].length();
+    cr = GappedAlignment( seq_count, aln_length );
+    vector< string > aln_mat = vector< string >( seq_count );
+
+    // copy the aligned rows back to their original sequence numbers
+    for( uint alnI = 0; alnI < seqs.size(); alnI++ ){
+        cr.SetLength( seq_data[ alnI ].size(), seqs[ alnI ] );
+        aln_mat[ seqs[ alnI ] ] = aln_matrix[ alnI ];
+    }
+    // set sequence starts; sequences that did not take part become all-gap rows
+    for( uint seqI = 0; seqI < seq_count; seqI++ ){
+        cr.SetStart( seqI, starts[ seqI ] );
+        if( aln_mat[ seqI ].length() != aln_length )
+            aln_mat[ seqI ] = string( aln_length, '-' );
+    }
+
+    cr.SetAlignment( aln_mat );
+
+    return true;
+}catch(exception& e){
+    cerr << "At: " << __FILE__ << ":" << __LINE__ << endl;
+    cerr << e.what();
+}
+    return false;
+}
+
+static int failure_count = 0;
+
+/**
+ * Aligns the regions lying between the components of two matches with MUSCLE.
+ * All match components refer to coordinates in seq_table[ 0 ] (tjt: all
+ * sequences are concatenated together into 1 seq_table entry).
+ * @param cr        Receives the resulting gapped alignment
+ * @param r_begin   Match on the left of the regions, or NULL
+ * @param r_end     Match on the right of the regions, or NULL
+ * @param seq_table Sequence table; only entry 0 is read
+ * @return true when an alignment was stored in cr; false when no region could
+ *         be aligned, both matches are NULL, MUSCLE failed, or an exception occurred
+ */
+boolean MuscleInterface::Align( GappedAlignment& cr, AbstractMatch* r_begin, AbstractMatch* r_end, vector< gnSequence* >& seq_table){
+    // The code below treats either match as optional, so the component count
+    // must not be taken from r_begin unconditionally.
+    AbstractMatch* count_source = r_begin != NULL ? r_begin : r_end;
+    if( count_source == NULL || seq_table.empty() )
+        return false;
+
+    //tjt: set the seq_count to a match m's multiplicity
+    //     even though all components n of match m could be
+    //     less than the k sequences
+    //     if n == k, then perhaps there is 1 match component per sequence
+    //     if k = 1, n == repeat match multiplicity, where n >= 2
+    //
+    const uint seq_count = count_source->Multiplicity();
+try{
+
+//
+//	Get the sequence in the intervening gaps between these two matches
+//
+    vector< string > seq_data;	// one entry per component that takes part
+    vector< int64 > starts;	// one entry per component, NO_MATCH when skipped
+    vector< uint > seqs;	// component numbers of the entries in seq_data
+    const gnFilter* rc_filter = gnFilter::DNAComplementFilter();
+
+    for( uint seqI = 0; seqI < seq_count; seqI++ ){
+
+        // skip this sequence if it's undefined
+        if( (r_end != NULL && r_end->Start( seqI ) == NO_MATCH ) ||
+            (r_begin != NULL && r_begin->Start( seqI ) == NO_MATCH) ){
+            starts.push_back( NO_MATCH );
+            continue;
+        }
+
+        // determine the size of the gap
+        // NOTE(review): the original read seq_table[ seqI ] here while every
+        // other access in this function reads seq_table[ 0 ]; with a single
+        // concatenated entry that index runs past the table for seqI > 0.
+        int64 gap_end = r_end != NULL ? r_end->Start( seqI ) : seq_table[ 0 ]->length() + 1;
+        int64 gap_start = r_begin != NULL ? r_begin->End( seqI ) + 1 : 1;
+        if( gap_end < 0 || gap_start < 0 ){
+            // reverse strand: swap the roles of the two matches
+            gap_end = r_begin != NULL ? -r_begin->Start( seqI ) : seq_table[ 0 ]->length() + 1;
+            gap_start = r_end != NULL ? -r_end->Start( seqI ) + r_end->Length( seqI ) : 1;
+        }
+        if( gap_end <= 0 || gap_start <= 0 ){
+            // if either is still < 0 then there's a problem...
+            genome::ErrorMsg( "Error constructing intervening coordinates" );
+        }
+
+        const int64 diff = gap_end - gap_start;
+        if( diff <= 0 || diff > max_alignment_length ){
+            starts.push_back( NO_MATCH );
+            continue;	// skip this sequence if it's either too big or too small
+        }
+
+        seqs.push_back( seqI );
+
+// the gnSequence pointers are shared across threads and have a common ifstream
+        // extract sequence data
+        if( r_end == NULL || r_end->Start( seqI ) > 0 ){
+            starts.push_back( gap_start );
+            seq_data.push_back( seq_table[ 0 ]->ToString( diff , gap_start ) );
+        }else{
+            // reverse complement the sequence data.
+            starts.push_back( -gap_start );
+            string cur_seq_data = seq_table[ 0 ]->ToString( diff , gap_start );
+            rc_filter->ReverseFilter( cur_seq_data );
+            seq_data.push_back( cur_seq_data );
+        }
+    }
+
+    //no seqs able to be aligned..
+    if( seqs.size() == 0 )
+        return false;
+
+    vector< string > aln_matrix;
+    if( !CallMuscleFast( aln_matrix, seq_data, 0, 0 ) ){
+        cout << "Muscle was unable to align:\n";
+        return false;
+    }
+
+    const gnSeqI aln_length = aln_matrix.size() == 0 ? 0 : aln_matrix[0].length();
+    cr = GappedAlignment( seq_count, aln_length );
+    vector< string > aln_mat = vector< string >( seq_count );
+
+    // copy the aligned rows back to their original component numbers
+    for( uint alnI = 0; alnI < seqs.size(); alnI++ ){
+        cr.SetLength( seq_data[ alnI ].size(), seqs[ alnI ] );
+        aln_mat[ seqs[ alnI ] ] = aln_matrix[ alnI ];
+    }
+    // set sequence starts; components that did not take part become all-gap rows
+    for( uint seqI = 0; seqI < seq_count; seqI++ ){
+        cr.SetStart( seqI, starts[ seqI ] );
+        if( aln_mat[ seqI ].length() != aln_length )
+            aln_mat[ seqI ] = string( aln_length, '-' );
+    }
+
+    cr.SetAlignment( aln_mat );
+
+    return true;
+}catch(exception& e){
+    cerr << "At: " << __FILE__ << ":" << __LINE__ << endl;
+    cerr << e.what();
+}
+    return false;
+}
+
+/**
+ * Aligns seq_table by piping it, FASTA formatted, through the external muscle
+ * executable and parsing the FASTA it prints.
+ * @param aln_matrix Receives one aligned row per FASTA record in the output
+ * @param seq_table  The unaligned sequences
+ * @return true on success.  On failure the input is saved to
+ *         muscle_failure_<n>.txt and false is returned.
+ */
+boolean MuscleInterface::CallMuscle( vector< string >& aln_matrix, const vector< string >& seq_table )
+{
+    gnSequence seq;
+
+    try{
+        ostringstream input_seq_stream;
+        for( uint seqI = 0; seqI < seq_table.size(); seqI++ ){
+            seq += seq_table[ seqI ];
+            seq.setContigName( seqI, "seq" );
+        }
+        gnFASSource::Write( seq, input_seq_stream, false, true );
+        // now open a pipe to Muscle
+        string muscle_cmd = muscle_path + " " + muscle_arguments;
+        string output;
+        string error;
+        boolean success = pipeExec( muscle_cmdline, muscle_cmd, input_seq_stream.str(), output, error );
+        if( !success || output.size() == 0 )
+        {
+            throw "b0rk3d";	// caught by catch(...) below
+        }
+
+        istringstream output_aln_stream( output );
+        string cur_line;
+
+        // parse the fasta output
+        while( getline( output_aln_stream, cur_line ) ){
+            if( cur_line.empty() )
+                continue;	// blank line: nothing to index
+            if( cur_line[0] == '>' ){
+                aln_matrix.push_back( "" );
+                continue;
+            }
+            if( aln_matrix.empty() )
+                continue;	// sequence data ahead of the first header has no row to go to
+            // drop a trailing carriage return left by CRLF line ends
+            size_t len = cur_line.size();
+            if( cur_line[ len - 1 ] == '\r' )
+                len--;
+            aln_matrix.back() += cur_line.substr( 0, len );
+        }
+
+        return true;
+    }catch( gnException& gne ){
+    }catch( exception& e ){
+    }catch(...){
+    }
+    cerr << "muscle failed!  saving failed input data to muscle_failure_" << failure_count << ".txt\n";
+    cerr << "Please contact the Mauve developers about this problem\n";
+    stringstream debug_fname;
+    debug_fname << "muscle_failure_" << failure_count++ << ".txt";
+    ofstream debug_file( debug_fname.str().c_str() );
+    gnFASSource::Write(seq, debug_file, false);
+    debug_file.close();
+    return false;
+}
+
+// version 2 of this code: attempt to call muscle without performing costly disk I/O!!
+/**
+ * Aligns seq_table with the linked-in MUSCLE library, with no child process
+ * and no disk I/O.
+ * @param aln_matrix Replaced by the aligned rows, in input order
+ * @param seq_table  The unaligned sequences
+ * @param gap_open   Gap open score; 0 leaves MUSCLE's current value untouched
+ * @param gap_extend Gap extend score; 0 leaves MUSCLE's current value untouched
+ * @return always true
+ */
+boolean MuscleInterface::CallMuscleFast( vector< string >& aln_matrix, const vector< string >& seq_table, int gap_open, int gap_extend )
+{
+    // configure MUSCLE through its global parameters
+    if( gap_open != 0 )
+        g_scoreGapOpen.get() = gap_open;
+    if( gap_extend != 0 )
+        g_scoreGapExtend.get() = gap_extend;
+    g_SeqType.get() = SEQTYPE_DNA;	// we're operating on DNA
+    g_uMaxIters.get() = 1;	// and we don't want to refine the alignment...yet
+    g_bStable.get() = true;	// we want output seqs in the same order as input
+    g_bQuiet.get() = true;	// and don't print anything to the console
+    g_SeqWeight1.get() = SEQWEIGHT_ClustalW;	// not sure what weighting scheme works best for DNA
+
+    SetMaxIters(g_uMaxIters.get());
+    SetSeqWeightMethod(g_SeqWeight1.get());
+
+    // wrap every input string in a MUSCLE Seq whose id is its input position
+    SeqVect input_seqs;
+    const char* seqname = "seq00000";
+    for( size_t inI = 0; inI < seq_table.size(); inI++ )
+    {
+        const string& residues = seq_table[inI];
+        Seq muscle_seq;
+        muscle_seq.SetId(inI);
+        muscle_seq.SetName(seqname);
+        muscle_seq.resize(residues.size());
+        std::copy(residues.begin(), residues.end(), muscle_seq.begin());
+        input_seqs.AppendSeq(muscle_seq);
+    }
+
+    MSA msaTmp;
+    MUSCLE(input_seqs,msaTmp);
+
+    // now extract the alignment, row by row, addressed by sequence id
+    aln_matrix.clear();
+    aln_matrix.resize(msaTmp.GetSeqCount());
+    for( size_t idI = 0; idI < msaTmp.GetSeqCount(); idI++ )
+    {
+        const unsigned row = msaTmp.GetSeqIndex(idI);
+        const char* row_chars = msaTmp.GetSeqBuffer(row);
+        aln_matrix[idI].assign(row_chars, msaTmp.GetColCount());
+    }
+    return true;	// how can it possibly fail? :)
+}
+
+/**
+ * Refines an existing alignment by running the external muscle executable in
+ * refinement mode on the rows of ga that are defined.
+ * @param ga         The alignment to refine; updated in place on success
+ * @param windowsize 0 selects -refine; otherwise -refinew with this window size
+ * @return true when ga was updated, false when muscle failed or returned
+ *         fewer rows than were sent to it
+ */
+bool MuscleInterface::Refine( GappedAlignment& ga, size_t windowsize )
+{
+    const vector< string >& seq_table = GetAlignment( ga, vector< gnSequence* >() );
+    // only rows for defined sequences are sent to muscle
+    vector< string > aln_table;
+    for( uint seqI = 0; seqI < ga.SeqCount(); seqI++ )
+    {
+        if( ga.LeftEnd(seqI) != NO_MATCH )
+            aln_table.push_back( seq_table[seqI] );
+    }
+
+    if( windowsize == 0 )
+        SetMuscleArguments( " -quiet -refine -seqtype DNA " );
+    else
+    {
+        stringstream sstr;
+        sstr << " -quiet -seqtype DNA -refinew -refinewindow " << windowsize << " ";
+        SetMuscleArguments( sstr.str() );
+    }
+
+    vector< string > aln_matrix;
+    if( !CallMuscle( aln_matrix, aln_table ) )
+        return false;
+    // every row that was sent must come back, otherwise the indexing below
+    // would run past the end of aln_matrix
+    if( aln_matrix.empty() || aln_matrix.size() < aln_table.size() )
+        return false;
+
+    // rebuild the full table: refined rows for defined sequences, all-gap rows otherwise
+    const size_t aln_width = aln_matrix[0].size();
+    aln_table.clear();
+    uint alnI = 0;
+    for( uint seqI = 0; seqI < ga.SeqCount(); seqI++ )
+    {
+        if( ga.LeftEnd(seqI) != NO_MATCH )
+            aln_table.push_back( aln_matrix[alnI++] );
+        else
+            aln_table.push_back( string( aln_width, '-' ) );
+    }
+    ga.SetAlignment( aln_table );
+    return true;
+}
+
+/**
+ * Fills a MUSCLE MSA from a table of aligned rows.
+ * Row i is named "seq<i>" and gets the id i + id_base.
+ * @param msa       The MSA to size and fill
+ * @param seq_table Aligned rows; the column count is taken from the first row
+ * @param id_base   Offset added to every sequence id
+ */
+void msaFromSeqTable(MSA& msa, const vector< string >& seq_table, unsigned id_base = 0)
+{
+    // an empty table has no first row to take the column count from
+    const size_t col_count = seq_table.empty() ? 0 : seq_table[0].size();
+    msa.SetSize(seq_table.size(), col_count);
+    for( uint seqI = 0; seqI < seq_table.size(); seqI++ )
+    {
+        stringstream name_stream;
+        name_stream << "seq" << seqI;
+        msa.SetSeqName(seqI, name_stream.str().c_str());
+        msa.SetSeqId(seqI,seqI+id_base);
+        const string& row = seq_table[seqI];
+        for(size_t colI = 0; colI < row.size(); colI++)
+            msa.SetChar(seqI, colI, row[colI]);
+    }
+}
+
+
+// Refines the alignment in ga with the linked-in MUSCLE library and stores the
+// result back into ga.  windowsize == 0 runs RefineVert/RefineHoriz on the
+// whole alignment; any other value runs RefineW with that refine window.
+// Always returns true.
+bool MuscleInterface::RefineFast( GappedAlignment& ga, size_t windowsize )
+{
+ const vector< string >& seq_table = GetAlignment( ga, vector< gnSequence* >() );
+ // NOTE(review): aln_table is filled with the defined rows but never read
+ // afterwards; the MSA below is built from the complete seq_table -- confirm
+ // whether undefined rows are meant to take part in the refinement.
+ vector< string > aln_table;
+ for( uint seqI = 0; seqI < ga.SeqCount(); seqI++ )
+ {
+ if( ga.LeftEnd(seqI) != NO_MATCH )
+ {
+ aln_table.push_back( seq_table[seqI] );
+ }
+ }
+
+ // configure MUSCLE through its global parameters
+ g_SeqType.get() = SEQTYPE_DNA; // we're operating on DNA
+ g_uMaxIters.get() = 1; // and we don't want to refine the alignment...yet
+ g_bStable.get() = true; // we want output seqs in the same order as input
+ g_bQuiet.get() = true; // and don't print anything to the console
+ g_SeqWeight1.get() = SEQWEIGHT_ClustalW; // not sure what weighting scheme works best for DNA
+
+ g_uRefineWindow.get() = windowsize;
+ g_uWindowTo.get() = 0;
+
+ SetMaxIters(g_uMaxIters.get());
+ SetSeqWeightMethod(g_SeqWeight1.get());
+
+ MSA::SetIdCount(seq_table.size());
+
+ // create an MSA
+ MSA msa;
+ msaFromSeqTable(msa, seq_table);
+
+ SetAlpha(ALPHA_DNA);
+ msa.FixAlpha();
+ SetPPScore(PPSCORE_SPN);
+ SetMuscleInputMSA(msa);
+
+ // build a guide tree from the input alignment
+ Tree GuideTree;
+ TreeFromMSA(msa, GuideTree, g_Cluster2.get(), g_Distance2.get(), g_Root2.get());
+ SetMuscleTree(GuideTree);
+
+ MSA msaOut;
+ // points at whichever MSA holds the refined result
+ MSA* finalMsa;
+
+ if(windowsize == 0)
+ {
+ // whole-alignment refinement works on msa in place
+ if (g_bAnchors.get())
+ RefineVert(msa, GuideTree, g_uMaxIters.get());
+ else
+ RefineHoriz(msa, GuideTree, g_uMaxIters.get(), false, false);
+ finalMsa = &msa;
+ }else{
+ // windowed refinement writes to a separate output MSA
+ RefineW(msa, msaOut);
+ finalMsa = &msaOut;
+ }
+
+
+ ValidateMuscleIds(*finalMsa);
+ ValidateMuscleIds(GuideTree);
+
+ // now extract the alignment
+ vector< string > aln_matrix;
+ aln_matrix.resize(finalMsa->GetSeqCount());
+ for( size_t seqI = 0; seqI < finalMsa->GetSeqCount(); seqI++ )
+ {
+ // seqI is used as a sequence id; indie is the row holding that id
+ unsigned indie = finalMsa->GetSeqIndex(seqI);
+ const char* buf = finalMsa->GetSeqBuffer(indie);
+ string curseq(buf, finalMsa->GetColCount());
+ swap(aln_matrix[seqI],curseq);
+ }
+
+ ga.SetAlignment( aln_matrix );
+ return true;
+}
+
+
+/**
+ * Removes every column that consists solely of '-' characters, in place.
+ * All rows are expected to be at least as long as the first row.
+ * @param aln The alignment rows to compact; an empty alignment is left untouched
+ */
+void stripGapColumns( std::vector< std::string >& aln )
+{
+    if( aln.empty() )
+        return;	// no first row to take the column count from
+
+    size_t kept_cols = 0;	// next column position to write to
+    const size_t col_count = aln[0].size();
+    for( size_t colI = 0; colI < col_count; colI++ )
+    {
+        // find the first row that has a residue in this column
+        size_t rowI = 0;
+        while( rowI < aln.size() && aln[rowI][colI] == '-' )
+            rowI++;
+        if( rowI == aln.size() )
+            continue;	// gaps in every row: drop the column
+
+        for( rowI = 0; rowI < aln.size(); rowI++ )
+            aln[rowI][kept_cols] = aln[rowI][colI];
+        kept_cols++;
+    }
+    for( size_t rowI = 0; rowI < aln.size(); rowI++ )
+        aln[rowI].resize(kept_cols);
+}
+
+// Deletes every '-' character from str, in place.
+void stripGaps( std::string& str )
+{
+    // std::remove compacts the kept characters to the front; erase drops the tail
+    str.erase( std::remove(str.begin(), str.end(), '-'), str.end() );
+}
+
+/**
+ * Aligns two existing alignments to each other (profile-profile) by piping
+ * them through the external muscle executable.
+ * @param ga1      First profile
+ * @param ga2      Second profile
+ * @param aln      Receives the combined alignment, left ends and lengths
+ * @param anchored When true, -AnchoredPP is passed and gap columns are kept
+ * @return true on success; false when muscle failed, produced no output, or
+ *         an exception occurred
+ */
+bool MuscleInterface::ProfileAlign( const GappedAlignment& ga1, const GappedAlignment& ga2, GappedAlignment& aln, bool anchored )
+{
+    try{
+        const vector< string >& aln1 = GetAlignment( ga1, vector< gnSequence* >() );
+        const vector< string >& aln2 = GetAlignment( ga2, vector< gnSequence* >() );
+        // order[k] is the sequence number of the k-th record sent to muscle
+        vector< uint > order;
+        ostringstream input_seq_stream;
+        gnSequence seq;
+        vector< string > aln11( ga1.Multiplicity() );
+        vector< string > aln22( ga2.Multiplicity() );
+        size_t curI = 0;
+        for( uint seqI = 0; seqI < aln1.size(); seqI++ )
+        {
+            if( ga1.LeftEnd(seqI) != NO_MATCH )
+            {
+                aln11[curI++] = aln1[seqI];
+                order.push_back(seqI);
+            }
+        }
+        curI = 0;
+        for( uint seqI = 0; seqI < aln2.size(); seqI++ )
+        {
+            if( ga2.LeftEnd(seqI) != NO_MATCH )
+            {
+                aln22[curI++] = aln2[seqI];
+                order.push_back(seqI);
+            }
+        }
+// strip the gap columns only if we're doing unanchored PP alignment
+        if( !anchored )
+        {
+            // stripGapColumns reads the first row, so skip empty profiles
+            if( !aln11.empty() )
+                stripGapColumns(aln11);
+            if( !aln22.empty() )
+                stripGapColumns(aln22);
+        }
+        for( uint seqI = 0; seqI < aln11.size(); seqI++ )
+        {
+            seq += aln11[ seqI ];
+            seq.setContigName( seq.contigListLength()-1, "seq" );
+        }
+
+        gnFASSource::Write( seq, input_seq_stream, false, true );
+        input_seq_stream << "=\n";	// profile separator
+
+        gnSequence seq2;
+        for( uint seqI = 0; seqI < aln22.size(); seqI++ )
+        {
+            seq2 += aln22[ seqI ];
+            seq2.setContigName( seq2.contigListLength()-1, "seq" );
+        }
+
+        gnFASSource::Write( seq2, input_seq_stream, false, true );
+        input_seq_stream << "=\n";
+
+        if( debug_muscle )
+        {
+            // for debugging: write the anchored profiles to a file
+            stringstream debug_fname;
+            debug_fname << "muscle_debug_" << failure_count++ << ".txt";
+            ofstream debug_file( debug_fname.str().c_str() );
+            debug_file << input_seq_stream.str();
+            debug_file.close();
+        }
+
+        // now open a pipe to Muscle
+        string musc_args = "-quiet -seqtype DNA -profile -ProfileOnStdIn ";
+        if( anchored )
+            musc_args += "-AnchoredPP ";
+        SetMuscleArguments( musc_args );
+        string output;
+        string error;
+        string muscle_cmd = muscle_path + " " + muscle_arguments;
+        if( debug_muscle )
+        {
+            cerr << "Running " << muscle_cmd << endl;
+        }
+        boolean success = pipeExec( muscle_cmdline, muscle_cmd, input_seq_stream.str(), output, error );
+        if( !success || output.size() == 0 )
+        {
+            if( output.size() == 0 )
+                cerr << "\nmuscle nothing\n";
+            else
+                cerr << "\nunsuccessful muscle\n";
+            return false;
+        }
+
+        istringstream output_aln_stream( output );
+        string cur_line;
+
+        // parse the fasta output
+        vector< string > aln_matrix( ga1.SeqCount() );
+        int ordI = -1;	// index of the record currently being read
+        while( getline( output_aln_stream, cur_line ) ){
+            if( cur_line.empty() )
+                continue;	// blank line: nothing to index
+            if( cur_line[0] == '>' ){
+                ordI++;
+                continue;
+            }
+            // ignore data that belongs to no record that was sent
+            if( ordI < 0 || (size_t)ordI >= order.size() )
+                continue;
+            // drop a trailing carriage return left by CRLF line ends
+            size_t len = cur_line.size();
+            if( cur_line[ len - 1 ] == '\r' )
+                len--;
+            aln_matrix[ order[ordI] ] += cur_line.substr( 0, len );
+        }
+        // rows that received nothing become all-gap rows of the alignment width
+        const size_t aln_width = order.empty() ? 0 : aln_matrix[order[0]].size();
+        for( size_t i = 0; i < aln_matrix.size(); i++ )
+        {
+            if( aln_matrix[i].size() == 0 )
+                aln_matrix[i].resize( aln_width, '-' );
+        }
+
+        aln.SetAlignment( aln_matrix );
+        for( uint seqI = 0; seqI < ga1.SeqCount(); seqI++ )
+            if( ga1.LeftEnd(seqI) != NO_MATCH )
+            {
+                aln.SetLeftEnd(seqI, ga1.LeftEnd(seqI));
+                aln.SetLength(ga1.Length(seqI), seqI);
+            }
+        for( uint seqI = 0; seqI < ga2.SeqCount(); seqI++ )
+            if( ga2.LeftEnd(seqI) != NO_MATCH )
+            {
+                aln.SetLeftEnd(seqI, ga2.LeftEnd(seqI));
+                aln.SetLength(ga2.Length(seqI), seqI);
+            }
+        return true;
+    }catch( gnException& gne ){
+    }catch( exception& e ){
+    }catch(...){
+    }
+    return false;
+}
+
+
+// Aligns two existing alignments to each other (profile-profile) with the
+// linked-in MUSCLE library and stores the combined alignment, left ends and
+// lengths in aln.  anchored selects AnchoredProfileProfile over ProfileProfile
+// and keeps the gap columns of the inputs.  Returns false only when an
+// exception is caught.
+bool MuscleInterface::ProfileAlignFast( const GappedAlignment& ga1, const GappedAlignment& ga2, GappedAlignment& aln, bool anchored )
+{
+ try{
+ const vector< string >& aln1 = GetAlignment( ga1, vector< gnSequence* >() );
+ const vector< string >& aln2 = GetAlignment( ga2, vector< gnSequence* >() );
+ // order[k] is the sequence number of the k-th row handed to MUSCLE:
+ // first the defined rows of ga1, then the defined rows of ga2
+ vector< uint > order;
+ vector< string > aln11( ga1.Multiplicity() );
+ vector< string > aln22( ga2.Multiplicity() );
+ size_t curI = 0;
+ for( uint seqI = 0; seqI < aln1.size(); seqI++ )
+ {
+ if( ga1.LeftEnd(seqI) != NO_MATCH )
+ {
+ aln11[curI++] = aln1[seqI];
+ order.push_back(seqI);
+ }
+ }
+ curI = 0;
+ for( uint seqI = 0; seqI < aln2.size(); seqI++ )
+ {
+ if( ga2.LeftEnd(seqI) != NO_MATCH )
+ {
+ aln22[curI++] = aln2[seqI];
+ order.push_back(seqI);
+ }
+ }
+// strip the gap columns only if we're doing unanchored PP alignment
+ if( !anchored )
+ {
+ stripGapColumns(aln11);
+ stripGapColumns(aln22);
+ }
+
+ // configure MUSCLE through its global parameters
+ g_SeqType.get() = SEQTYPE_DNA; // we're operating on DNA
+ g_uMaxIters.get() = 1; // and we don't want to refine the alignment...yet
+ g_bStable.get() = true; // we want output seqs in the same order as input
+ g_bQuiet.get() = true; // and don't print anything to the console
+ g_SeqWeight1.get() = SEQWEIGHT_ClustalW; // not sure what weighting scheme works best for DNA
+
+ SetMaxIters(g_uMaxIters.get());
+ SetSeqWeightMethod(g_SeqWeight1.get());
+
+ MSA::SetIdCount(order.size());
+
+ MSA msa1;
+ MSA msa2;
+ MSA msaOut;
+ msaFromSeqTable(msa1, aln11);
+ // ids of the second profile continue after those of the first
+ msaFromSeqTable(msa2, aln22, msa1.GetSeqCount());
+
+ SetAlpha(ALPHA_DNA);
+ msa1.FixAlpha();
+ msa2.FixAlpha();
+ SetPPScore(PPSCORE_SPN);
+
+ if(anchored)
+ {
+ AnchoredProfileProfile(msa1, msa2, msaOut);
+ }else{
+ ProfileProfile(msa1, msa2, msaOut);
+ }
+
+ // get the output
+ vector< string > aln_matrix( aln1.size() );
+ for( size_t seqI = 0; seqI < msaOut.GetSeqCount(); seqI++ )
+ {
+ // NOTE(review): seqI is passed as a sequence id and indie is the row
+ // it maps to, yet the destination is chosen with order[indie] rather
+ // than order[seqI].  The two agree only when rows come out in id
+ // order -- confirm against MSA::GetSeqIndex.
+ unsigned indie = msaOut.GetSeqIndex(seqI);
+ const char* buf = msaOut.GetSeqBuffer(indie);
+ string curseq(buf, msaOut.GetColCount());
+ swap(aln_matrix[order[indie]],curseq);
+
+ // debugging, check that sequences came out in the same order they went in!
+/* string inseq = aln1[order[indie]];
+ string outseq = aln_matrix[order[indie]];
+ stripGaps(inseq);
+ stripGaps(outseq);
+ if(inseq != outseq)
+ {
+ unsigned indie = msaOut.GetSeqIndex(seqI);
+ cerr << "bad indie " << indie << endl;
+ genome::breakHere();
+ }
+*/
+ }
+ // fill empty seqs with gaps
+ for( size_t seqI = 0; seqI < aln_matrix.size(); seqI++ )
+ if(aln_matrix[seqI].size() == 0)
+ aln_matrix[seqI].resize(msaOut.GetColCount(), '-');
+
+ aln.SetAlignment( aln_matrix );
+ // carry over left ends and lengths; where both inputs define a sequence
+ // the values from ga2 are written last
+ for( uint seqI = 0; seqI < ga1.SeqCount(); seqI++ )
+ if( ga1.LeftEnd(seqI) != NO_MATCH )
+ {
+ aln.SetLeftEnd(seqI, ga1.LeftEnd(seqI));
+ aln.SetLength(ga1.Length(seqI), seqI);
+ }
+ for( uint seqI = 0; seqI < ga2.SeqCount(); seqI++ )
+ if( ga2.LeftEnd(seqI) != NO_MATCH )
+ {
+ aln.SetLeftEnd(seqI, ga2.LeftEnd(seqI));
+ aln.SetLength(ga2.Length(seqI), seqI);
+ }
+ return true;
+
+ }catch( gnException& gne ){
+ }catch( exception& e ){
+ }catch(...){
+ }
+ return false;
+}
+
+
+/**
+ * Builds a neighbor-joining tree from a distance matrix with MUSCLE's
+ * clustering code and writes it to a file.  Taxa are named seq1, seq2, ...
+ * @param distances     Pairwise distances; rows() is used for both dimensions
+ * @param tree_filename Path of the file that receives the tree
+ */
+void MuscleInterface::CreateTree( const NumericMatrix<double>& distances, const std::string& tree_filename )
+{
+    g_bQuiet.get() = true;	// don't print anything to the console!
+
+    // copy the matrix into MUSCLE's distance function
+    DistFunc dist_func;
+    dist_func.SetCount( distances.rows() );
+    for( size_t rowI = 0; rowI < distances.rows(); rowI++ )
+    {
+        for( size_t colI = 0; colI < distances.rows(); colI++ )
+            dist_func.SetDist( rowI, colI, distances(rowI,colI) );
+    }
+
+    // label the taxa with 1-based names and 0-based ids
+    for( size_t taxI = 0; taxI < distances.rows(); taxI++ )
+    {
+        stringstream name_stream;
+        name_stream << "seq" << taxI + 1;
+        dist_func.SetName( taxI, name_stream.str().c_str() );
+        dist_func.SetId( taxI, taxI );
+    }
+
+    // cluster, convert to a tree and write it out
+    ClustSetDF clust_set( dist_func );
+    Clust clustering;
+    clustering.Create( clust_set, CLUSTER_NeighborJoining );
+    Tree nj_tree;
+    nj_tree.FromClust( clustering );
+    TextFile tree_file( tree_filename.c_str(), true );
+    nj_tree.ToFile( tree_file );
+}
+
+
+}
diff --git a/libMems/MuscleInterface.h b/libMems/MuscleInterface.h
new file mode 100644
index 0000000..9e9d502
--- /dev/null
+++ b/libMems/MuscleInterface.h
@@ -0,0 +1,148 @@
+/*******************************************************************************
+ * $Id: MuscleInterface.h,v 1.12 2004/04/19 23:10:50 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifndef _MuscleInterface_h_
+#define _MuscleInterface_h_
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/NumericMatrix.h"
+#include "libGenome/gnFilter.h"
+#include "libGenome/gnSequence.h"
+#include "libMems/GappedAlignment.h"
+#include "libMems/GappedAligner.h"
+
+// attempt to auto-link the MUSCLE library on windows
+#if defined(WIN64)&&defined(NDEBUG)&&!defined(FASTDEBUG)&&defined(_OPENMP)
+#pragma comment(lib, "MUSCLE64omp.lib")
+#endif
+#if defined(WIN64)&&defined(FASTDEBUG)&&defined(_OPENMP)
+#pragma comment(lib, "MUSCLE64fdomp.lib")
+#endif
+#if defined(WIN32)&&!defined(WIN64)&&defined(NDEBUG)&&!defined(FASTDEBUG)&&defined(_OPENMP)
+#pragma comment(lib, "MUSCLEomp.lib")
+#endif
+#if defined(WIN32)&&!defined(WIN64)&&defined(FASTDEBUG)&&defined(_OPENMP)
+#pragma comment(lib, "MUSCLEfdomp.lib")
+#endif
+#if defined(WIN64)&&defined(NDEBUG)&&!defined(FASTDEBUG)&&!defined(_OPENMP)
+#pragma comment(lib, "MUSCLE64.lib")
+#endif
+#if defined(WIN64)&&defined(FASTDEBUG)&&!defined(_OPENMP)
+#pragma comment(lib, "MUSCLE64fd.lib")
+#endif
+#if defined(WIN32)&&!defined(WIN64)&&defined(NDEBUG)&&!defined(FASTDEBUG)&&!defined(_OPENMP)
+#pragma comment(lib, "MUSCLE.lib")
+#endif
+#if defined(WIN32)&&!defined(WIN64)&&defined(FASTDEBUG)&&!defined(_OPENMP)
+#pragma comment(lib, "MUSCLEfd.lib")
+#endif
+
+namespace mems {
+
+extern bool debug_muscle;
+
+//template< typename MatchType=AbstractMatch >
+/**
+ * GappedAligner implementation backed by MUSCLE.
+ * Instances are obtained through getMuscleInterface(); the constructors and
+ * the assignment operator are private.
+ */
+class MuscleInterface : public GappedAligner {
+public:
+	/** Releases the argv-style command line owned by this object. */
+	~MuscleInterface()
+	{
+		ClearCommandLine();
+	}
+	/**
+	 * Returns a reference to a usable MuscleInterface
+	 */
+	static MuscleInterface& getMuscleInterface();
+
+	/**
+	 * Parse the execution path from argv[0] and set the muscle
+	 * path accordingly
+	 */
+	void ParseMusclePath( const char* argv0 );
+
+	/**
+	 * Set the path to the muscle executable
+	 * Defaults to "muscle"
+	 */
+	void SetMusclePath( const std::string& path );
+
+	/**
+	 * Set the arguments to use when executing muscle
+	 */
+	void SetExtraMuscleArguments( const std::string& extra_args );
+	/**
+	 * Get the arguments to use when executing muscle
+	 */
+	std::string GetExtraMuscleArguments(){ return this->extra_muscle_arguments; }
+
+	/**
+	 * Attempts to perform a multiple alignment using Muscle between
+	 * <code>r_begin</code> and <code>r_end</code>
+	 */
+
+	//tjt: not the best way of doing this, should have just one Align function that takes an AbstractMatch*,
+	//     not both Match* & AbstractMatch* in separate, nearly identical functions..
+	//     Such a change would involve changes to GappedAligner, and would require some additional care taken
+	//     with SeqCount & Multiplicity, as well as seq_table[ seqI ]->length()/seq_table[ 0 ]->length(i),
+	//     for now, leave like this. hopefully sooner than later, make pretty!
+	boolean Align( GappedAlignment& cr, Match* r_begin, Match* r_end, std::vector< genome::gnSequence* >& seq_table);
+
+	boolean Align( GappedAlignment& cr, AbstractMatch* r_begin, AbstractMatch* r_end, std::vector< genome::gnSequence* >& seq_table);
+
+	bool Refine( GappedAlignment& ga, size_t windowsize = 0 );
+
+	/**
+	 * Given two gapped alignments in ga1 and ga2, align them and store the result in aln.  ga1 and
+	 * ga2 must have equal sequence count and contain disjoint sets of sequences, e.g. for any given
+	 * seqI, if ga1.LeftEnd(seqI) != NO_MATCH, then ga2.LeftEnd(seqI) == NO_MATCH
+	 */
+	bool ProfileAlign( const GappedAlignment& ga1, const GappedAlignment& ga2, GappedAlignment& aln, bool anchored = true );
+	boolean CallMuscle( std::vector< std::string >& aln_matrix, const std::vector< std::string >& seq_table );
+	boolean CallMuscleFast( std::vector< std::string >& aln_matrix, const std::vector< std::string >& seq_table, int gap_open = 0, int gap_extend = 0);
+	bool RefineFast( GappedAlignment& ga, size_t windowsize = 0 );
+	bool ProfileAlignFast( const GappedAlignment& ga1, const GappedAlignment& ga2, GappedAlignment& aln, bool anchored = true );
+
+	/** Builds a tree from a distance matrix and writes it to tree_filename. */
+	void CreateTree( const NumericMatrix<double>& distances, const std::string& tree_filename );
+
+protected:
+	std::string muscle_path;
+	std::string muscle_arguments;
+	std::string extra_muscle_arguments;
+	char** muscle_cmdline;	/**< NULL-terminated array of heap-allocated strings, or NULL */
+
+	void SetMuscleArguments( const std::string& extra_args );
+
+	/**
+	 * Frees every string in muscle_cmdline and the array itself, then resets the
+	 * pointer to NULL so that a repeated call (for example the destructor running
+	 * after an explicit clear) cannot free the same memory twice.
+	 */
+	void ClearCommandLine()
+	{
+		if( muscle_cmdline != NULL )
+		{
+			size_t cmdI = 0;
+			while(muscle_cmdline[cmdI] != NULL)
+			{
+				delete[] muscle_cmdline[cmdI];
+				cmdI++;
+			}
+			delete[] muscle_cmdline;
+			muscle_cmdline = NULL;
+		}
+	}
+
+private:
+	// muscle_cmdline is set to NULL before delegating to operator= so that any
+	// cleanup performed there (or later by the destructor) never sees an
+	// indeterminate pointer.
+	MuscleInterface( const MuscleInterface& ci ) : muscle_cmdline( NULL ) { *this = ci; }
+	MuscleInterface& operator=( const MuscleInterface& ci );
+	MuscleInterface();
+};
+
+
+void stripGapColumns( std::vector< std::string >& aln );
+
+
+}
+
+#endif // _MuscleInterface_h_
diff --git a/libMems/NumericMatrix.h b/libMems/NumericMatrix.h
new file mode 100644
index 0000000..6917dab
--- /dev/null
+++ b/libMems/NumericMatrix.h
@@ -0,0 +1,164 @@
+/*******************************************************************************
+ * $Id: NumericMatrix.h,v 1.4 2004/03/01 02:40:08 darling Exp $
+ * This file is copyright 2002-2004 Aaron Darling. All rights reserved.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifndef _NumericMatrix_h_
+#define _NumericMatrix_h_
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/Matrix.h"
+
+/**
+ * Matrix<T> extended with arithmetic operators.
+ * The element-wise compound assignments (+=, -=, /=) and the scalar *= and /=
+ * are defined later in this header. The matrix *= only validates dimensions,
+ * and the binary operators (+, -, *, /) are declared but have empty definitions.
+ */
+template<class T> // T is the matrix element type
+class NumericMatrix : public Matrix<T>
+{
+public:
+ NumericMatrix(){};
+ NumericMatrix(unsigned nrows, unsigned ncols);
+ 
+ // Based on the Law Of The Big Three:
+ ~NumericMatrix();
+ NumericMatrix(const NumericMatrix<T>& m);
+ NumericMatrix<T>& operator= (const NumericMatrix<T>& m);
+
+ // define some arithmetic operators 
+ // += and -= operate entry by entry and throw Matrix<T>::BadSize on a shape mismatch
+ NumericMatrix<T>& operator+= (const NumericMatrix<T>& m);
+ NumericMatrix<T>& operator-= (const NumericMatrix<T>& m);
+ // not implemented: the matrix-matrix *= below checks dimensions but leaves *this unchanged
+ NumericMatrix<T>& operator*= (const NumericMatrix<T>& m);
+ // scalar *= multiplies every entry by m
+ NumericMatrix<T>& operator*= (const T& m);
+ // matrix /= divides entry by entry; scalar /= divides every entry by m
+ NumericMatrix<T>& operator/= (const NumericMatrix<T>& m);
+ NumericMatrix<T>& operator/= (const T& m);
+
+ // the following 5 are not implemented
+ // (their definitions have empty bodies and return nothing, so they must not be called)
+ NumericMatrix<T>& operator+ (const NumericMatrix<T>& m ) const;
+ const NumericMatrix<T>& operator- (const NumericMatrix<T>& m ) const;
+ const NumericMatrix<T>& operator* (const NumericMatrix<T>& m ) const;
+ const NumericMatrix<T>& operator* (const T& n) const;
+ const NumericMatrix<T>& operator/ (const T& n) const;
+
+};
+
+/** Forwards both dimensions to the Matrix<T> constructor; no additional setup is done here. */
+template<class T>
+inline NumericMatrix<T>::NumericMatrix(unsigned nrows, unsigned ncols)
+ : Matrix<T>( nrows, ncols )
+{
+}
+
+/** Copy constructor: default-constructs the base, then delegates to operator=. */
+template<class T>
+inline NumericMatrix<T>::NumericMatrix(const NumericMatrix<T>& m)
+{
+	this->operator=( m );
+}
+
+/** Assignment: NumericMatrix adds no state of its own, so the copy is done by Matrix<T>::operator=. */
+template<class T>
+inline NumericMatrix<T>& NumericMatrix<T>::operator= (const NumericMatrix<T>& m)
+{
+	this->Matrix<T>::operator=( m );
+	return *this;
+}
+
+/** Destructor: nothing to release at this level. */
+template<class T>
+inline NumericMatrix<T>::~NumericMatrix()
+{
+}
+
+/**
+ * Adds m to this matrix entry by entry.
+ * @throws Matrix<T>::BadSize if the two matrices differ in rows or columns
+ */
+template<class T>
+inline
+NumericMatrix<T>& NumericMatrix<T>::operator+= (const NumericMatrix<T>& m){
+	const bool same_shape = this->nrows_ == m.nrows_ && this->ncols_ == m.ncols_;
+	if( !same_shape )
+		throw typename Matrix<T>::BadSize();
+
+	// data_ is walked as one flat run of nrows_ * ncols_ entries
+	for( unsigned entryI = 0; entryI < this->nrows_ * this->ncols_; ++entryI )
+		this->data_[ entryI ] += m.data_[ entryI ];
+	return *this;
+}
+
+/**
+ * Subtracts m from this matrix entry by entry.
+ * @throws Matrix<T>::BadSize if the two matrices differ in rows or columns
+ */
+template<class T>
+inline
+NumericMatrix<T>& NumericMatrix<T>::operator-= (const NumericMatrix<T>& m){
+	const bool same_shape = this->nrows_ == m.nrows_ && this->ncols_ == m.ncols_;
+	if( !same_shape )
+		throw typename Matrix<T>::BadSize();
+
+	// data_ is walked as one flat run of nrows_ * ncols_ entries
+	for( unsigned entryI = 0; entryI < this->nrows_ * this->ncols_; ++entryI )
+		this->data_[ entryI ] -= m.data_[ entryI ];
+	return *this;
+}
+
+/**
+ * Matrix-matrix multiply placeholder: validates that the inner dimensions
+ * agree and then returns *this unchanged (no product is computed).
+ * @throws Matrix<T>::BadSize if this->ncols_ differs from m.nrows_
+ */
+template<class T>
+inline
+NumericMatrix<T>& NumericMatrix<T>::operator*= (const NumericMatrix<T>& m){
+	const bool inner_dims_agree = this->ncols_ == m.nrows_;
+	if( !inner_dims_agree )
+		throw typename Matrix<T>::BadSize();
+	// the actual multiplication has not been written
+	return *this;
+}
+
+/** Multiplies every entry of this matrix by the scalar m. */
+template<class T>
+inline
+NumericMatrix<T>& NumericMatrix<T>::operator*= (const T& m){
+	// data_ is walked as one flat run of nrows_ * ncols_ entries
+	for( unsigned entryI = 0; entryI < this->nrows_ * this->ncols_; ++entryI )
+		this->data_[ entryI ] *= m;
+	return *this;
+}
+
+/** Divides every entry of this matrix by the scalar m (no check for a zero divisor). */
+template<class T>
+inline
+NumericMatrix<T>& NumericMatrix<T>::operator/= (const T& m){
+	// data_ is walked as one flat run of nrows_ * ncols_ entries
+	for( unsigned entryI = 0; entryI < this->nrows_ * this->ncols_; ++entryI )
+		this->data_[ entryI ] /= m;
+	return *this;
+}
+
+/**
+ * Divides this matrix by m entry by entry (no check for zero divisors).
+ * @throws Matrix<T>::BadSize if the two matrices differ in rows or columns
+ */
+template<class T>
+inline
+NumericMatrix<T>& NumericMatrix<T>::operator/= ( const NumericMatrix<T>& m ){
+	const bool same_shape = this->nrows_ == m.nrows_ && this->ncols_ == m.ncols_;
+	if( !same_shape )
+		throw typename Matrix<T>::BadSize();
+
+	// data_ is walked as one flat run of nrows_ * ncols_ entries
+	for( unsigned entryI = 0; entryI < this->nrows_ * this->ncols_; ++entryI )
+		this->data_[ entryI ] /= m.data_[ entryI ];
+	return *this;
+}
+
+// The five binary operators below are unimplemented placeholders. Each has an
+// empty body yet a reference return type, so control flows off the end of a
+// non-void function: instantiating and calling any of them is undefined
+// behaviour. Use the compound assignment operators on a copy instead.
+// NOTE(review): a value-returning signature would be needed to implement
+// these properly, which is an interface change.
+
+// unimplemented: matrix + matrix
+template<class T>
+inline
+NumericMatrix<T>& NumericMatrix<T>::operator+ (const NumericMatrix<T>& m) const {
+
+}
+// unimplemented: matrix - matrix
+template<class T>
+inline
+const NumericMatrix<T>& NumericMatrix<T>::operator- (const NumericMatrix<T>& m) const {
+
+}
+// unimplemented: matrix * matrix
+template<class T>
+inline
+const NumericMatrix<T>& NumericMatrix<T>::operator* (const NumericMatrix<T>& m) const {
+
+}
+// unimplemented: matrix * scalar
+template<class T>
+inline
+const NumericMatrix<T>& NumericMatrix<T>::operator* (const T& n) const {
+
+}
+// unimplemented: matrix / scalar
+template<class T>
+inline
+const NumericMatrix<T>& NumericMatrix<T>::operator/ (const T& n) const {
+
+}
+
+
+#endif // _NumericMatrix_h_
diff --git a/libMems/PairwiseMatchAdapter.h b/libMems/PairwiseMatchAdapter.h
new file mode 100644
index 0000000..fd83646
--- /dev/null
+++ b/libMems/PairwiseMatchAdapter.h
@@ -0,0 +1,117 @@
+#ifndef __PairwiseMatchAdapter_h__
+#define __PairwiseMatchAdapter_h__
+
+#include "libMems/AbstractMatch.h"
+#include "libMems/ProgressiveAligner.h"
+#include <vector>
+
+namespace mems {
+
+/**
+ * PairwiseMatchAdapter is a wrapper around an AbstractMatch that effectively projects a multi-match to a
+ * pairwise match. The adapter class forwards most function calls to the original match
+ * class, to which it stores a pointer. Use of non-const functions results in undefined state.
+ */
+class PairwiseMatchAdapter : public mems::AbstractMatch
+{
+public:
+	/** Creates an adapter that wraps no match; every member gets a defined value. */
+	PairwiseMatchAdapter() : m(NULL), tm(NULL), inverted(false)
+	{
+		seq[0] = 0;
+		seq[1] = 0;
+	}
+	/**
+	 * Wraps match and exposes only two of its sequences.
+	 * @param match	the wrapped match; the adapter stores the pointer, it does not copy the match
+	 * @param seq1	index in match of the sequence presented as sequence 0
+	 * @param seq2	index in match of the sequence presented as sequence 1
+	 */
+	PairwiseMatchAdapter( AbstractMatch* match, uint seq1, uint seq2 ) :
+	  m(match), tm(NULL), inverted(false)
+	{
+		seq[0] = seq1;
+		seq[1] = seq2;
+	}
+
+	PairwiseMatchAdapter* Clone() const { return new PairwiseMatchAdapter( *this ); }
+
+	PairwiseMatchAdapter* Copy() const
+	{
+		return m_allocateAndCopy( *this );
+	}
+
+	void Free()
+	{
+		m_free(this);
+	}
+
+	//
+	// forward all function calls to match, translating the pairwise
+	// sequence index (0 or 1) through seq[]
+	//
+	gnSeqI Length( uint seqI ) const { return m->Length(seq[seqI]); }
+	void SetLength( gnSeqI len, uint seqI ) { m->SetLength(len, seq[seqI]); }
+	/** Start coordinate of the wrapped match, negated while the adapter is inverted. */
+	int64 Start(uint startI) const {
+		if(inverted)
+			return -m->Start(seq[startI]);
+		return m->Start(seq[startI]);
+	}
+	void SetStart(uint seqI, int64 start) { m->SetStart(seq[seqI],start); }
+	gnSeqI LeftEnd(uint seqI) const { return m->LeftEnd(seq[seqI]); }
+	/** Orientation of the wrapped match, flipped while inverted unless it is undefined. */
+	orientation Orientation(uint seqI) const {
+		orientation o = m->Orientation(seq[seqI]);
+		if(inverted && o != AbstractMatch::undefined )
+			o = o == AbstractMatch::forward ? AbstractMatch::reverse : AbstractMatch::forward;
+		return o;
+	}
+	void SetLeftEnd(uint seqI, gnSeqI start) { m->SetLeftEnd(seq[seqI],start); }
+	void SetOrientation(uint seqI, orientation o) { m->SetOrientation(seq[seqI],o); }
+	void MoveStart(int64 move_amount) { m->MoveStart(move_amount); }
+	void MoveEnd(int64 move_amount) { m->MoveEnd(move_amount); }
+	uint Multiplicity() const { return 2; }
+	uint SeqCount() const { return 2; }
+	uint FirstStart() const { return 0; }
+	gnSeqI AlignmentLength() const { return m->AlignmentLength(); }
+	/** Toggles the adapter's own inversion flag; the wrapped match is not touched. */
+	void Invert() { inverted = !inverted; }
+	void CropStart(gnSeqI crop_amount) { m->CropStart(crop_amount); }
+	void CropEnd(gnSeqI crop_amount) { m->CropEnd(crop_amount); }
+	void CropLeft(gnSeqI crop_amount, uint seqI) { m->CropLeft(crop_amount, seq[seqI]); }
+	void CropRight(gnSeqI crop_amount, uint seqI) { m->CropRight(crop_amount, seq[seqI]); }
+	/**
+	 * Replaces align_matrix with the two rows of the wrapped match's alignment
+	 * selected by seq[]. While inverted, the wrapped match is temporarily
+	 * inverted for the query and restored afterwards.
+	 */
+	void GetAlignment( std::vector< mems::bitset_t >& align_matrix ) const
+	{
+		if( inverted )
+			m->Invert();
+		std::vector< mems::bitset_t > aln_mat;
+		m->GetAlignment(aln_mat);
+		align_matrix.clear();
+		align_matrix.push_back(aln_mat[seq[0]]);
+		align_matrix.push_back(aln_mat[seq[1]]);
+		if( inverted )
+			m->Invert();
+	}
+	/**
+	 * Replaces pos and column with the two entries of alignment column col
+	 * selected by seq[]. Both output vectors are cleared first, so each always
+	 * holds exactly two elements on return, even when the caller reuses them.
+	 */
+	void GetColumn( gnSeqI col, std::vector<gnSeqI>& pos, std::vector<bool>& column ) const
+	{
+		if( inverted )
+			m->Invert();
+		std::vector<gnSeqI> m_pos;
+		std::vector<bool> m_column;
+		m->GetColumn(col,m_pos,m_column);
+		pos.clear();
+		pos.push_back(m_pos[seq[0]]);
+		pos.push_back(m_pos[seq[1]]);
+		column.clear();
+		column.push_back(m_column[seq[0]]);
+		column.push_back(m_column[seq[1]]);
+		if( inverted )
+			m->Invert();
+	}
+
+	bool IsGap( uint seqI, gnSeqI col ) const { return m->IsGap( seq[seqI],col ); }
+	/**
+	 * Returns the pairwise index (0 or 1) of the first sequence whose start is
+	 * defined, or the largest uint value when neither is.
+	 * NOTE(review): the seqI argument is not consulted - confirm this matches
+	 * the AbstractMatch contract.
+	 */
+	uint UsedSeq( uint seqI ) const
+	{
+		if(m->Start(seq[0]) != NO_MATCH)
+			return 0;
+		if(m->Start(seq[1]) != NO_MATCH)
+			return 1;
+		return (std::numeric_limits<uint>::max)();
+	}
+
+	AbstractMatch* m;	/**< the wrapped match; stored by pointer, not copied */
+	TrackingMatch* tm;	/**< not used by this class itself; NULL after construction */
+	uint seq[2];		/**< indices in m of the two projected sequences */
+	bool inverted;		/**< true while the pairwise view is reverse-complemented relative to m */
+};
+
+}
+
+#endif // __PairwiseMatchAdapter_h__
+
diff --git a/libMems/PairwiseMatchFinder.cpp b/libMems/PairwiseMatchFinder.cpp
new file mode 100644
index 0000000..1e16f84
--- /dev/null
+++ b/libMems/PairwiseMatchFinder.cpp
@@ -0,0 +1,73 @@
+/*******************************************************************************
+ * $Id: PairwiseMatchFinder.cpp,v 1.13 2004/03/01 02:40:08 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * Please see the file called COPYING for licensing, copying, and modification
+ * details.
+ * **************
+ ******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/PairwiseMatchFinder.h"
+#include <list>
+
+using namespace std;
+using namespace genome;
+
+namespace mems {
+
+/** Default constructor: relies on the MemHash default constructor; no extra state is set up. */
+PairwiseMatchFinder::PairwiseMatchFinder(){
+}
+
+/** Destructor: nothing to release at this level. */
+PairwiseMatchFinder::~PairwiseMatchFinder(){
+}
+
+/** Copy constructor: all copying is delegated to the MemHash copy constructor. */
+PairwiseMatchFinder::PairwiseMatchFinder(const PairwiseMatchFinder& mh) : MemHash(mh){
+
+}
+
+/** Returns a heap-allocated copy of this object; the caller owns the result. */
+PairwiseMatchFinder* PairwiseMatchFinder::Clone() const{
+	PairwiseMatchFinder* duplicate = new PairwiseMatchFinder( *this );
+	return duplicate;
+}
+
+
+/**
+ * Enumerates every pairwise match among the seeds in match_list.
+ * The list is sorted by id; seeds whose id occurs exactly once are kept, and
+ * every unordered pair of those seeds is passed to HashMatch().
+ * @param match_list	candidate seeds; sorted in place by this call
+ * @return true only if every HashMatch() call succeeded. After the first
+ *         failure no further pairs are hashed.
+ */
+boolean PairwiseMatchFinder::EnumerateMatches( IdmerList& match_list ){
+
+	match_list.sort(&idmer_id_lessthan);
+
+	// walk the sorted list as runs of equal ids; a run of length one is a unique seed
+	IdmerList unique_list;
+	uint run_length = 1;
+	for( IdmerList::iterator cur = match_list.begin(); cur != match_list.end(); ++cur )
+	{
+		IdmerList::iterator following = cur;
+		++following;
+		const bool run_continues = following != match_list.end() && cur->id == following->id;
+		if( run_continues )
+		{
+			run_length++;
+			continue;
+		}
+		if( run_length == 1 )
+			unique_list.push_back( *cur );
+		run_length = 1;
+	}
+
+	// hash each unordered pair of unique seeds
+	boolean success = true;
+	for( IdmerList::iterator first = unique_list.begin(); first != unique_list.end(); ++first )
+	{
+		IdmerList::iterator second = first;
+		for( ++second; second != unique_list.end(); ++second )
+		{
+			IdmerList hash_list;
+			hash_list.push_back( *first );
+			hash_list.push_back( *second );
+			// && short-circuits, so HashMatch is skipped once a failure was seen
+			success = success && HashMatch(hash_list);
+		}
+	}
+	return success;
+}
+
+} // namespace mems
diff --git a/libMems/PairwiseMatchFinder.h b/libMems/PairwiseMatchFinder.h
new file mode 100644
index 0000000..beac209
--- /dev/null
+++ b/libMems/PairwiseMatchFinder.h
@@ -0,0 +1,38 @@
+/*******************************************************************************
+ * $Id: PairwiseMatchFinder.h,v 1.8 2004/03/01 02:40:08 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifndef _PairwiseMatchFinder_h_
+#define _PairwiseMatchFinder_h_
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/MemHash.h"
+
+namespace mems {
+
+/**
+ * Finds all pairwise matches with unique seeds among a group of sequences
+ */
+class PairwiseMatchFinder : public mems::MemHash
+{
+public:
+ PairwiseMatchFinder();
+ ~PairwiseMatchFinder();
+
+ /** Copies mh by delegating to the MemHash copy constructor. */
+ PairwiseMatchFinder(const PairwiseMatchFinder& mh);
+ /** Returns a heap-allocated copy of this object. */
+ virtual PairwiseMatchFinder* Clone() const;
+protected:
+
+ /**
+ * Sorts match_list by id, keeps the seeds whose id occurs exactly once and
+ * hashes every pair of them.
+ * @return true if every pair was hashed successfully
+ */
+ virtual boolean EnumerateMatches( mems::IdmerList& match_list );
+};
+
+}
+
+#endif //_PairwiseMatchFinder_h_
diff --git a/libMems/ParallelMemHash.cpp b/libMems/ParallelMemHash.cpp
new file mode 100644
index 0000000..38b67d5
--- /dev/null
+++ b/libMems/ParallelMemHash.cpp
@@ -0,0 +1,133 @@
+/*******************************************************************************
+ * $Id: ParallelMemHash.cpp,v 1.32 2004/03/01 02:40:08 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * Please see the file called COPYING for licensing, copying, and modification
+ * details.
+ * **************
+ ******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/ParallelMemHash.h"
+#include <vector>
+
+#ifdef _OPENMP
+
+
+using namespace std;
+using namespace genome;
+namespace mems {
+
+/** Default constructor: default-constructs the MemHash base; thread_mem_table is default-constructed too. */
+ ParallelMemHash::ParallelMemHash() : MemHash()
+
+{
+}
+
+/** Copy constructor: copies the MemHash base, then lets operator= copy the members added by this class. */
+ParallelMemHash::ParallelMemHash(const ParallelMemHash& mh) : MemHash(mh)
+{
+	this->operator=( mh );
+}
+
+/**
+ * Assignment: copies only thread_mem_table.
+ * NOTE(review): MemHash::operator= is not invoked here, so base-class state is
+ * left as it was - confirm that plain assignment is never relied on to copy it.
+ */
+ParallelMemHash& ParallelMemHash::operator=( const ParallelMemHash& mh ){
+	this->thread_mem_table = mh.thread_mem_table;
+	return *this;
+}
+
+/** Returns a heap-allocated copy of this object; the caller owns the result. */
+ParallelMemHash* ParallelMemHash::Clone() const{
+	ParallelMemHash* duplicate = new ParallelMemHash( *this );
+	return duplicate;
+}
+
+/**
+ * Finds matches among the sequences of ml, searching chunks of the longest
+ * sorted mer list (SML) in parallel, and stores the result in ml.
+ *
+ * Each sequence is first registered with AddSequence(); if one fails, an
+ * error message is emitted and the function returns without searching.
+ * The longest SML is then cut every CHUNK_SIZE mers, GetBreakpoint() supplies
+ * the corresponding start positions in the other SMLs, and each chunk is
+ * searched with SearchRange() followed by MergeTable().
+ *
+ * @param ml	supplies seq_table / sml_table / seq_filename and receives the matches
+ */
+void ParallelMemHash::FindMatches( MatchList& ml )
+{
+	for( uint32 seqI = 0; seqI < ml.seq_table.size(); ++seqI ){
+		if( !AddSequence( ml.sml_table[ seqI ], ml.seq_table[ seqI ] ) ){
+			ErrorMsg( "Error adding " + ml.seq_filename[seqI] + "\n");
+			return;
+		}
+	}
+
+	const size_t CHUNK_SIZE = 200000;
+	// break up the SMLs into nice small chunks
+	vector< vector< gnSeqI > > chunk_starts;
+
+	// set the progress counter data
+	mers_processed = 0;
+	total_mers = 0;
+	m_progress = -1;
+	for( size_t i = 0; i < ml.sml_table.size(); i++ )
+		total_mers += ml.sml_table[i]->Length();
+
+	// break up on the longest SML
+	int max_length_sml = -1;
+	size_t maxlen = 0;
+	for( size_t i = 0; i < ml.sml_table.size(); i++ )
+	{
+		if( ml.sml_table[i]->Length() > maxlen )
+		{
+			maxlen = ml.sml_table[i]->Length();
+			max_length_sml = i;
+		}
+	}
+
+	// No SML holds any mers (or there are no SMLs at all): there is nothing to
+	// chunk or search, and -1 must never be used as an index below.
+	if( max_length_sml < 0 )
+	{
+		GetMatchList( ml );
+		return;
+	}
+
+	chunk_starts.push_back( vector< gnSeqI >( seq_count, 0 ) );
+
+	while( chunk_starts.back()[max_length_sml] + CHUNK_SIZE < ml.sml_table[max_length_sml]->Length() )
+	{
+		vector< gnSeqI > tmp( seq_count, 0 );
+		GetBreakpoint(max_length_sml, chunk_starts.back()[max_length_sml] + CHUNK_SIZE, tmp);
+		chunk_starts.push_back(tmp);
+	}
+
+
+	// now that it's all chunky, search in parallel
+	// the loop index stays a signed int because the OpenMP loop form expects it;
+	// the bound is converted once so the comparison is not signed/unsigned
+	const int chunk_count = (int)chunk_starts.size();
+#pragma omp parallel for schedule(dynamic)
+	for( int i = 0; i < chunk_count; i++ )
+	{
+		// make sure this thread's private table has as many buckets as the shared one
+		if(thread_mem_table.get().size() != mem_table.size())
+			thread_mem_table.get().resize( mem_table.size() );
+
+		// a chunk ends where the next one starts; the last chunk runs to the end
+		vector< gnSeqI > chunk_lens(seq_count);
+		if( i + 1 < chunk_count )
+		{
+			for( size_t j = 0; j < seq_count; j++ )
+				chunk_lens[j] = chunk_starts[i+1][j] - chunk_starts[i][j];
+		}else
+			chunk_lens = vector< gnSeqI >( seq_count, GNSEQI_END );
+		SearchRange( chunk_starts[i], chunk_lens );
+		MergeTable();
+	}
+	GetMatchList( ml );
+}
+
+/**
+ * Folds the calling thread's private hash table into the shared mem_table and
+ * then overwrites the private table with a copy of mem_table. The whole
+ * operation runs inside an OpenMP critical section. The merged entries are not
+ * freed here.
+ */
+void ParallelMemHash::MergeTable()
+{
+#pragma omp critical
+	{
+		vector< vector< MatchHashEntry* > >& local_table = thread_mem_table.get();
+		const size_t bucket_count = local_table.size();
+		for( size_t bucketI = 0; bucketI < bucket_count; ++bucketI )
+		{
+			vector< MatchHashEntry* >& entries = local_table[bucketI];
+			for( size_t entryI = 0; entryI < entries.size(); ++entryI )
+				MemHash::AddHashEntry( *entries[entryI], mem_table );
+		}
+		local_table = mem_table;
+	}
+}
+
+
+
+/**
+ * Adds mhe through the regular MemHash procedure, but into the calling
+ * thread's private table instead of the shared mem_table.
+ * @return whatever MemHash::AddHashEntry returns for the inserted entry
+ */
+MatchHashEntry* ParallelMemHash::AddHashEntry(MatchHashEntry& mhe){
+	vector< vector< MatchHashEntry* > >& local_table = thread_mem_table.get();
+	return MemHash::AddHashEntry( mhe, local_table );
+}
+
+
+} // namespace mems
+
+#endif // _OPENMP
diff --git a/libMems/ParallelMemHash.h b/libMems/ParallelMemHash.h
new file mode 100644
index 0000000..8537201
--- /dev/null
+++ b/libMems/ParallelMemHash.h
@@ -0,0 +1,75 @@
+/*******************************************************************************
+ * $Id: ParallelMemHash.h,v 1.23 2004/03/01 02:40:08 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifndef _ParallelMemHash_h_
+#define _ParallelMemHash_h_
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#ifdef _OPENMP
+
+#include "libMUSCLE/threadstorage.h"
+#include <omp.h>
+#include "libMems/MemHash.h"
+
+namespace mems {
+
+
+/**
+ * ParallelMemHash implements an algorithm for finding exact matches of a certain minimal
+ * length in several sequences.
+ */
+class ParallelMemHash : public MemHash {
+public:
+ ParallelMemHash();
+ ParallelMemHash(const ParallelMemHash& mh);
+ /** Copies thread_mem_table from mh. */
+ ParallelMemHash& operator=( const ParallelMemHash& mh );
+ /** Returns a heap-allocated copy of this object. */
+ virtual ParallelMemHash* Clone() const;
+
+ /**
+ * Finds (in parallel) all matches in the sequences contained by "match_list"
+ * The resulting list of matches is stored within "match_list"
+ */
+ virtual void FindMatches( MatchList& match_list );
+
+
+protected:
+ /** Adds mhe to the calling thread's private table rather than the shared one. */
+ virtual MatchHashEntry* AddHashEntry(MatchHashEntry& mhe);
+ /** Merges the calling thread's private table into the shared table. */
+ virtual void MergeTable();
+
+ /** Per-thread hash table: a vector of buckets, each a vector of entry pointers. */
+ TLS< std::vector< std::vector<MatchHashEntry*> > > thread_mem_table;
+};
+
+
+}
+
+#else // _OPENMP
+
+namespace mems {
+
+
+/**
+ * When built without OpenMP, the ParallelMemHash is just a stub wrapper around MemHash
+ */
+class ParallelMemHash : public MemHash {
+public:
+ ParallelMemHash() : MemHash();
+ ParallelMemHash(const ParallelMemHash& mh) : MemHash(mh);
+ ParallelMemHash& operator=( const ParallelMemHash& mh ) : MemHash::operator=(mh){ return *this; }
+ virtual ParallelMemHash* Clone() const{ return new ParallelMemHash(*this); }
+};
+
+
+}
+
+
+#endif // _OPENMP
+
+#endif //_ParallelMemHash_h_
diff --git a/libMems/PhyloTree.cpp b/libMems/PhyloTree.cpp
new file mode 100644
index 0000000..a790146
--- /dev/null
+++ b/libMems/PhyloTree.cpp
@@ -0,0 +1,9 @@
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "PhyloTree.h"
+using namespace std;
+
+typedef unsigned uint;
+
diff --git a/libMems/PhyloTree.h b/libMems/PhyloTree.h
new file mode 100644
index 0000000..7267f9c
--- /dev/null
+++ b/libMems/PhyloTree.h
@@ -0,0 +1,378 @@
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#ifndef __PhyloTree_h__
+#define __PhyloTree_h__
+
+#include <vector>
+#include <string>
+#include <iostream>
+#include <sstream>
+#include <stack>
+
+//typedef unsigned int node_id_t;
+typedef size_t node_id_t;
+/**
+ * A single node of a PhyloTree. Parents and children are referred to by their
+ * index (node_id_t) in the owning tree's node vector.
+ */
+class TreeNode 
+{
+public:
+ /** Creates a node with a zero distance, an empty name and no parents or children. */
+ TreeNode() : distance(0) {};
+ std::string name; /**< node name */
+ double distance; /**< distance to parent */
+ std::vector< node_id_t > parents; /**< if parents.size() == 0 this is a root node */
+ std::vector< node_id_t > children; /**< if children.size() == 0 this is a leaf node */
+};
+
+/**
+ * A tree stored as a flat vector of nodes of type T plus the index of the root.
+ * The member functions defined in this header read T::name, T::distance,
+ * T::parents and T::children, so T must provide those members (as TreeNode does).
+ */
+template< class T >
+class PhyloTree 
+{
+public:
+ PhyloTree();
+ PhyloTree( const PhyloTree<T>& pt );
+ PhyloTree<T>& operator=( const PhyloTree<T>& pt );
+ double weight; /**< Overall tree weight */
+ node_id_t root; /**< root of the tree */
+ std::vector< T > nodes; /**< nodes of the tree */
+ /** Removes all nodes and resets weight and root to 0 */
+ void clear();
+ /**
+ * Reads a tree in Newick format. WARNING: only reads rooted trees correctly
+ */
+ void readTree( std::istream& tree_file );
+ /**
+ * Writes a tree in Newick format
+ */
+ void writeTree( std::ostream& os ) const;
+ /**
+ * Determines the height of the tree along the path from the root to the left-most leaf node
+ */
+ double getHeight() const;
+ /**
+ * Determines the height of the tree along the path from nodeI to its left-most descendant leaf node
+ */
+ double getHeight( node_id_t nodeI ) const;
+
+ // vector-like access to the node list; indices are not range-checked
+ // NOTE(review): the index and size parameters are `unsigned` while node_id_t is size_t
+ T& operator[]( const unsigned i ){ return nodes[i]; }
+ const T& operator[]( const unsigned i ) const{ return nodes[i]; }
+ size_t size() const{ return nodes.size(); }
+ void push_back( T& t ){ nodes.push_back(t); }
+ T& back() { return nodes.back(); }
+ const T& back() const{ return nodes.back(); }
+ void resize( const unsigned s ){ nodes.resize(s); }
+
+
+ /** Exchanges weight, root and nodes with other */
+ void swap( PhyloTree<T>& other )
+ {
+ std::swap( weight, other.weight );
+ std::swap( root, other.root );
+ nodes.swap( other.nodes );
+ }
+protected:
+};
+
+
+/** Creates an empty tree: no nodes, zero weight, root index 0. */
+template< class T >
+PhyloTree<T>::PhyloTree() :
+weight( 0 ),
+root( 0 )
+{}
+
+/** Copy constructor: duplicates the weight, the root index and the node vector of pt. */
+template< class T >
+PhyloTree<T>::PhyloTree( const PhyloTree<T>& pt ) :
+weight( pt.weight ),
+root( pt.root ),
+nodes( pt.nodes )
+{}
+
+/** Assignment: replaces the nodes, weight and root of this tree with those of pt. */
+template< class T >
+PhyloTree<T>& PhyloTree<T>::operator=( const PhyloTree<T>& pt )
+{
+	this->nodes = pt.nodes;
+	this->weight = pt.weight;
+	this->root = pt.root;
+	return *this;
+}
+
+/** Returns the tree to its freshly constructed state: zero weight, root index 0, no nodes. */
+template< class T >
+void PhyloTree<T>::clear()
+{
+	this->weight = 0;
+	this->root = 0;
+	this->nodes.clear();
+}
+
+
+/**
+ * readTree version 2.0: read in a phylogenetic tree in the Newick file format.
+ *
+ * The tree is cleared first. Lines are read and concatenated until the
+ * parentheses balance; if the stream ends before that, the function returns
+ * with the tree left empty. An optional "[weight]" that appears before the
+ * first '(' is parsed into the tree weight. The text up to the first ';' is
+ * then scanned one character at a time, reacting to '(', ')', ',' and ':'.
+ *
+ * NOTE(review): ')', ',' and ':' call node_stack.top() without checking that
+ * the stack is non-empty, so input that does not start with '(' or that has
+ * surplus delimiters is not handled safely - confirm callers pass
+ * well-formed trees only.
+ */
+template< class T >
+void PhyloTree<T>::readTree( std::istream& tree_file )
+{
+ std::string line;
+ clear();
+ if( !std::getline( tree_file, line ) )
+ return;
+ // look for either a ; or a matched number of parenthesis, if
+ // not found then read another line
+ while(true){
+ int paren_count = 0;
+ for( size_t charI = 0; charI < line.size(); charI++ )
+ {
+ if( line[charI] == '(' )
+ paren_count++;
+ if( line[charI] == ')' )
+ paren_count--;
+ }
+ if( paren_count == 0 )
+ break;
+ // unbalanced so far: append the next line (without a separator) and recount
+ if( paren_count != 0 ){
+ std::string another_line;
+ if( !std::getline( tree_file, another_line ) )
+ return;
+ line += another_line;
+ }
+ }
+
+ std::stringstream line_str( line );
+
+ // look for a weight
+ std::string::size_type open_bracket_pos = line.find( "[" );
+ std::string::size_type bracket_pos = line.find( "]" );
+ if( open_bracket_pos != std::string::npos && bracket_pos != std::string::npos && 
+ open_bracket_pos < bracket_pos && bracket_pos < line.find( "(" ) ){
+ // read in a weight
+ // (the first getline discards everything up to '[', the second captures the bracket contents)
+ getline( line_str, line, '[' );
+ getline( line_str, line, ']' );
+ std::stringstream weight_str( line );
+ weight_str >> weight;
+ }
+
+ // ready to begin parsing the tree data.
+ std::string tree_line;
+ std::getline( line_str, tree_line, ';' );
+ // read_state: 0 = nothing parsed yet, 1 = the next name/length belongs to a
+ // node that still has to be created, 2 = the node on top of the stack is
+ // complete and is popped at the next ')' or ','
+ size_t read_state = 0; /**< read_state of 0 indicates nothing has been parsed yet */
+ size_t section_start = 0; // index of the first character of the token being collected
+ std::stack< node_id_t > node_stack; // path of open nodes from the root down to the current one
+ std::stringstream blen_str;
+ T new_node; // template for new nodes; only its parents list is changed before each insertion
+ new_node.distance = 0; // default the distance to 0
+ bool already_read_name = false; // never read in this function
+ bool blen_found = false; // true between a ':' and the ')' or ',' that ends the branch length
+ for( size_t charI = 0; charI < tree_line.size(); charI++ ){
+ switch( tree_line[ charI ] ){
+ // if this is an open parens then simply create a new
+ // parent node and push it on the parent stack
+ case '(':
+ if( node_stack.size() > 0 ){
+ new_node.parents.clear();
+ new_node.parents.push_back( node_stack.top() );
+ (*this)[ node_stack.top() ].children.push_back( (node_id_t)(*this).size() );
+ }
+ node_stack.push( (node_id_t)(*this).size() );
+ nodes.push_back( new_node );
+ read_state = 1;
+ section_start = charI + 1;
+ break;
+ // a close parens ends the current child (reading its branch length or
+ // name) and leaves the enclosing parent on top of the stack
+ case ')':
+ if( blen_found )
+ {
+ // read off a branch length
+ blen_str.clear();
+ blen_str.str( tree_line.substr( section_start, charI - section_start ) );
+ blen_str >> (*this)[ node_stack.top() ].distance;
+ }else{
+ // read off a name, if possible
+ if( read_state == 1 ){
+ new_node.parents.clear();
+ new_node.parents.push_back( node_stack.top() );
+ (*this)[ node_stack.top() ].children.push_back( (node_id_t)(*this).size() );
+ node_stack.push( (node_id_t)(*this).size() );
+ nodes.push_back( new_node );
+ read_state = 2; // pop this node after reading its branch length
+ }
+ (*this)[ node_stack.top() ].name = tree_line.substr( section_start, charI - section_start );
+ }
+ if( read_state == 2 )
+ node_stack.pop();
+ section_start = charI + 1;
+ blen_found = false;
+
+ // pop off the top of the node stack
+ read_state = 2;
+ break;
+ // a comma ends the current child in the same way and announces a sibling
+ case ',':
+ if( blen_found ){
+ // read off a branch length
+ blen_str.clear();
+ blen_str.str( tree_line.substr( section_start, charI - section_start ) );
+ blen_str >> (*this)[ node_stack.top() ].distance;
+ }else{
+ // read off a name, if possible
+ if( read_state == 1 ){
+ new_node.parents.clear();
+ new_node.parents.push_back( node_stack.top() );
+ (*this)[ node_stack.top() ].children.push_back( (node_id_t)(*this).size() );
+ node_stack.push( (node_id_t)(*this).size() );
+ nodes.push_back( new_node );
+ read_state = 2; // pop this node after reading its name
+ }
+ (*this)[ node_stack.top() ].name = tree_line.substr( section_start, charI - section_start );
+ }
+ if( read_state == 2 )
+ node_stack.pop();
+ section_start = charI + 1;
+ read_state = 1; // indicates that we'll be creating a new node when we hit :
+ blen_found = false;
+ break;
+ // a colon ends a name; the text that follows is a branch length
+ case ':':
+ // read off a name, if possible
+ if( read_state == 1 ){
+ new_node.parents.clear();
+ new_node.parents.push_back( node_stack.top() );
+ (*this)[ node_stack.top() ].children.push_back( (node_id_t)(*this).size() );
+ node_stack.push( (node_id_t)(*this).size() );
+ nodes.push_back( new_node );
+ read_state = 2; // pop this node after reading its branch length
+ }
+ (*this)[ node_stack.top() ].name = tree_line.substr( section_start, charI - section_start );
+ section_start = charI + 1;
+ blen_found = true;
+ break;
+ default:
+ break;
+ }
+ }
+
+}
+
+
+/**
+ * Writes the tree to os as a single line in Newick format, terminated by ';'
+ * and a newline. A non-zero tree weight is written first as "[weight]".
+ * Branch lengths are written only if at least one node has a non-zero
+ * distance, and never for the root.
+ *
+ * The traversal is iterative. node_stack holds the path from the root to the
+ * node being written, and child_stack holds, for each internal node on that
+ * path, the index of the next child to visit.
+ *
+ * NOTE(review): the root is accessed without checking that the tree has any
+ * nodes, and the opening "(" is written even when the root has no children -
+ * confirm callers only pass trees whose root is an internal node.
+ */
+template< class T >
+void PhyloTree<T>::writeTree( std::ostream& os ) const{
+ std::stack< node_id_t > node_stack;
+ std::stack< size_t > child_stack;
+ node_stack.push( root );
+ child_stack.push( 0 );
+ // branch lengths are emitted only when some node carries a non-zero distance
+ bool write_branch_lengths = false;
+ for( size_t nodeI = 0; nodeI < this->size(); nodeI++ )
+ {
+ if( (*this)[nodeI].distance != 0 )
+ {
+ write_branch_lengths = true;
+ break;
+ }
+ }
+
+ if( (*this).weight != 0 )
+ os << "[" << weight << "]";
+ // opening paren of the root; its closing paren is written by the loop below
+ os << "(";
+
+ while( node_stack.size() > 0 ) {
+ if( (*this)[ node_stack.top() ].children.size() != 0 ){
+ // this is a parent node
+ // if we have scanned all its children then pop it
+ if( child_stack.top() == (*this)[ node_stack.top() ].children.size() ){
+ os << ")";
+ // node_stack.size() > 1 means this is not the root
+ if( node_stack.size() > 1 && write_branch_lengths )
+ os << ":" << (*this)[ node_stack.top() ].distance;
+ node_stack.pop();
+ child_stack.pop();
+ continue;
+ }
+ // try to recurse to its children
+ // if the child is a parent as well spit out a paren
+ node_id_t child = (*this)[ node_stack.top() ].children[ child_stack.top() ];
+ node_stack.push( child );
+ child_stack.top()++;
+ // print a comma to separate multiple children
+ if( child_stack.top() > 1 )
+ os << ",";
+ // only internal nodes get a child_stack entry
+ if( (*this)[ child ].children.size() > 0 ){
+ child_stack.push( 0 );
+ os << "(";
+ }
+ continue;
+ }
+ 
+ // this is a leaf node
+ os << (*this)[ node_stack.top() ].name;
+ if( write_branch_lengths )
+ os << ":" << (*this)[ node_stack.top() ].distance;
+ 
+ // pop the child
+ node_stack.pop();
+ }
+ os << ";" << std::endl;
+}
+
+
+/** Height of the whole tree: the sum of distances from the root down its left-most path. */
+template< class T >
+double PhyloTree<T>::getHeight() const
+{
+	const double tree_height = this->getHeight( this->root );
+	return tree_height;
+}
+
+/**
+ * Height below (and including) nodeI: its own distance plus, recursively, the
+ * height of its first child. A node without children contributes only its own
+ * distance.
+ */
+template< class T >
+double PhyloTree<T>::getHeight( node_id_t nodeI ) const
+{
+	const T& node = (*this)[ nodeI ];
+	const bool is_leaf = node.children.size() == 0;
+	if( is_leaf )
+		return node.distance;
+
+	const double first_child_height = getHeight( node.children[ 0 ] );
+	return node.distance + first_child_height;
+}
+
+
+/**
+ * Determine which nodes are descendants of a given node.
+ * Performs an iterative depth-first search and fills `descendants` with the
+ * starting node followed by every node reachable through `children` links.
+ * Nodes may have any number of children; for strictly binary trees the output
+ * order is the same as before (children pushed in index order, so the
+ * highest-indexed child is visited first).
+ *
+ * @param alignment_tree	tree whose nodes expose a `children` container, indexable by node id
+ * @param node		id of the node to start from
+ * @param descendants	output; cleared first
+ */
+template< class TreeType >
+void getDescendants( TreeType& alignment_tree, node_id_t node, std::vector< node_id_t >& descendants )
+{
+	// do a depth first search
+	std::stack< node_id_t > node_stack;
+	node_stack.push( node );
+	descendants.clear();
+	while( node_stack.size() > 0 )
+	{
+		node_id_t cur_node = node_stack.top();
+		node_stack.pop();
+		// visit every child, not just the first two, and never index past the end
+		const size_t child_count = alignment_tree[cur_node].children.size();
+		for( size_t childI = 0; childI < child_count; ++childI )
+			node_stack.push( alignment_tree[cur_node].children[childI] );
+		descendants.push_back(cur_node);
+	}
+}
+
+
+/**
+ * Determine which nodes are leaf nodes below a given node.
+ * Performs an iterative depth-first search and fills `leaves` with every
+ * childless node reachable from `node` (including `node` itself if it has no
+ * children). Nodes may have any number of children; for strictly binary trees
+ * the output order is the same as before.
+ *
+ * @param tree	tree whose nodes expose a `children` container, indexable by node id
+ * @param node	id of the node to start from
+ * @param leaves	output; cleared first
+ */
+template< class TreeType >
+void getLeaves( TreeType& tree, node_id_t node, std::vector< node_id_t >& leaves )
+{
+	// do a depth first search
+	std::stack< node_id_t > node_stack;
+	node_stack.push( node );
+	leaves.clear();
+	while( node_stack.size() > 0 )
+	{
+		node_id_t cur_node = node_stack.top();
+		node_stack.pop();
+		const size_t child_count = tree[cur_node].children.size();
+		if( child_count == 0 )
+		{
+			leaves.push_back(cur_node);
+			continue;
+		}
+		// visit every child, not just the first two, and never index past the end
+		for( size_t childI = 0; childI < child_count; ++childI )
+			node_stack.push( tree[cur_node].children[childI] );
+	}
+}
+
+// Route std::swap on PhyloTree objects to the member swap, which exchanges the
+// node vectors instead of copying them.
+// NOTE(review): the first declaration adds a new function template overload to
+// namespace std rather than specializing an existing one, which the C++
+// standard does not permit for user code; a non-member swap next to PhyloTree
+// (found by argument-dependent lookup) would be the conforming alternative.
+namespace std {
+
+// generic overload for any node type
+template< class T > inline
+void swap( PhyloTree<T>& a, PhyloTree<T>& b )
+{
+ a.swap(b);
+}
+
+// explicit specialization for the TreeNode instantiation
+template<> inline void swap( PhyloTree<TreeNode>& a, PhyloTree<TreeNode>& b){ a.swap(b); }
+}
+
+#endif // __PhyloTree_h__
diff --git a/libMems/ProgressiveAligner.cpp b/libMems/ProgressiveAligner.cpp
new file mode 100644
index 0000000..50381c5
--- /dev/null
+++ b/libMems/ProgressiveAligner.cpp
@@ -0,0 +1,3945 @@
+/*******************************************************************************
+ * $Id: progressiveAligner.cpp,v 1.47 2004/04/19 23:10:30 darling Exp $
+ * BEWARE!!
+ * This code was created in the likeness of the flying spaghetti monster
+ *
+ * dedicated to Loren...
+ ******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+
+#include "libMems/ProgressiveAligner.h"
+#include "libMems/GreedyBreakpointElimination.h"
+#include "libMems/Aligner.h"
+#include "libMems/Islands.h"
+#include "libMems/DNAFileSML.h"
+#include "libMems/MuscleInterface.h" // it's the default gapped aligner
+#include "libMems/gnAlignedSequences.h"
+#include "libMems/CompactGappedAlignment.h"
+#include "libMems/MatchProjectionAdapter.h"
+#include "libMems/PairwiseMatchFinder.h"
+#include "libMems/TreeUtilities.h"
+#include "libMems/PairwiseMatchAdapter.h"
+#include "libMems/DistanceMatrix.h"
+
+#include <boost/dynamic_bitset.hpp>
+#include <boost/tuple/tuple.hpp>
+#include <boost/graph/graph_traits.hpp>
+#include <boost/graph/adjacency_list.hpp>
+#include <boost/graph/johnson_all_pairs_shortest.hpp>
+#include <boost/graph/undirected_dfs.hpp>
+
+#include <map>
+#include <fstream> // for debugging
+#include <sstream>
+#include <stack>
+#include <algorithm>
+#include <limits>
+#include <iomanip>
+
+#include "stdlib.h"
+
+using namespace std;
+using namespace genome;
+
+namespace mems {
+
+
+bool progress_msgs = false; // when true, functions in this file print progress messages to cout
+
+bool debug_me = false;
+static int dbg_count = 0;
+
+
+double min_window_size = 200;
+double max_window_size = 20000; // don't feed MUSCLE anything bigger than this; overwritten by the ProgressiveAligner constructor and by SetMaxGappedAlignmentLength()
+double min_density = .5;
+double max_density = .9;
+size_t max_gap_length = 5000;
+size_t lcb_hangover = 300;
+
+
+void mergeUnalignedIntervals( uint seqI, vector< Interval* >& iv_list, vector< Interval* >& new_list );
+
+/**
+ * Debugging check that the matches of a single LCB are listed in collinear order.
+ * Matches that are undefined (NO_MATCH) in a sequence are skipped for that sequence.
+ * @return true if, in every sequence, no match starts before the preceding defined match
+ */
+boolean my_validateLCB( MatchList& lcb ){
+    if( lcb.size() == 0 ) return true;
+    const uint seq_count = (*lcb.begin())->SeqCount();
+    boolean in_order = true;
+    for( uint seqI = 0; seqI < seq_count; seqI++ ){
+        // a value of 0 means no defined match has been seen yet in this sequence
+        int64 last_start = 0;
+        vector< Match* >::iterator match_iter = lcb.begin();
+        while( match_iter != lcb.end() ){
+            const int64 cur_start = (*match_iter)->Start( seqI );
+            ++match_iter;
+            if( cur_start == NO_MATCH )
+                continue;
+            if( last_start != 0 && cur_start < last_start )
+                in_order = false;
+            last_start = cur_start;
+        }
+    }
+    return in_order;
+}
+
+template< class BoostMatType >
+void print2d_matrix( BoostMatType& mat, std::ostream& os )
+{
+    // one output line per row of mat; cells within a row are separated by tabs
+    for( size_t rowI = 0; rowI != mat.shape()[0]; ++rowI )
+    {
+        for( size_t colI = 0; colI != mat.shape()[1]; ++colI )
+        {
+            if( colI != 0 ) os << "\t";
+            os << mat[rowI][colI];
+        }
+        os << endl;
+    }
+}
+
+double getDefaultBreakpointPenalty( std::vector< gnSequence* >& sequences )
+{
+    uint default_mer_size = MatchList::GetDefaultMerSize( sequences );    // result is not used below; the call is retained unchanged
+    double total_len = 0;
+    for( std::vector< gnSequence* >::iterator seq_iter = sequences.begin(); seq_iter != sequences.end(); ++seq_iter )
+        total_len += (double)(*seq_iter)->length();
+    const double mean_len = total_len / (double)sequences.size();
+    const double log2_mean_len = log( mean_len ) / log( 2.0 );
+    return log2_mean_len * 7000;    // penalty = 7000 * log2( mean sequence length ); seems to work reasonably well?
+}
+
+
+double getDefaultBpDistEstimateMinScore( std::vector< gnSequence* >& sequences )
+{
+    // the default minimum score is three times the default breakpoint penalty;
+    // the factor was chosen empirically
+    const double default_bp_penalty = getDefaultBreakpointPenalty(sequences);
+    return 3.0 * default_bp_penalty;
+}
+
+
+
+/*
+ * A progressive alignment algorithm for genomes with rearrangements.
+ * Start simple, add complexity later.
+ * TODO: rewrite the algorithm outline
+ */
+
+ProgressiveAligner::ProgressiveAligner( uint seq_count ) :
+Aligner( seq_count ),
+breakpoint_penalty( -1 ), // NOTE(review): -1 looks like a "not yet set" sentinel -- confirm
+min_breakpoint_penalty( 4000 ),
+debug(false),
+refine(true),
+scoring_scheme(ExtantSumOfPairsScoring),
+use_weight_scaling(true),
+conservation_dist_scale(1),
+bp_dist_scale(.9),
+max_gapped_alignment_length(20000),
+bp_dist_estimate_score(-1), // NOTE(review): -1 looks like a "not yet set" sentinel -- confirm
+use_seed_families(false), // pairwiseAnchorSearch() uses 3 seed patterns when true, 1 otherwise
+using_cache_db(true) // recurseOnPairs() records searched regions in the new cache only when true
+{
+ gapped_alignment = true;
+ max_window_size = max_gapped_alignment_length; // also overwrites the file-level max_window_size
+}
+
+void ProgressiveAligner::SetMaxGappedAlignmentLength( size_t len )
+{
+    // store the new limit in the member, then mirror the stored value into the file-level window limit
+    max_window_size = max_gapped_alignment_length = len;
+}
+
+/** collect the nodes that carry a sequence and are reachable from a node through aligned child edges */
+void ProgressiveAligner::getAlignedChildren( node_id_t node, vector< node_id_t >& descendants )
+{
+    descendants.clear();
+    vector< bool > seen( alignment_tree.size(), false );
+    stack< node_id_t > pending;    // depth first search frontier
+    pending.push( node );
+    while( !pending.empty() )
+    {
+        const node_id_t cur_node = pending.top();
+        pending.pop();
+        if(progress_msgs) cout << "Evaluating aligned nodes linked to node " << cur_node << endl;
+        seen[cur_node] = true;
+        AlignmentTreeNode& cur = alignment_tree[cur_node];
+        for( uint childI = 0; childI < cur.children.size(); childI++ )
+        {
+            const node_id_t child_id = cur.children[childI];
+            if( cur.children_aligned[childI] && !seen[child_id] )    // follow aligned edges only, never revisit
+                pending.push( child_id );
+        }
+        if( cur.sequence != NULL )
+            descendants.push_back( cur_node );
+    }
+}
+
+
+/** find the chain of tree nodes linking first_n to last_n; path[0] is first_n and path.back() is last_n (path is left empty when the two are not connected) */
+void ProgressiveAligner::getPath( node_id_t first_n, node_id_t last_n, vector< node_id_t >& path )
+{
+    // depth first search from last_n over child and parent links; the stack holds the chain walked so far
+    stack< node_id_t > node_stack;
+    node_stack.push( last_n );
+    vector< bool > visited( alignment_tree.size(), false );
+    while( !node_stack.empty() && node_stack.top() != first_n )    // stop rather than call top() on an exhausted stack
+    {
+        node_id_t cur_node = node_stack.top();
+        size_t pre_size = node_stack.size();
+        visited[cur_node] = true;
+        for( uint childI = 0; childI < alignment_tree[cur_node].children.size(); childI++ )
+        {
+            node_id_t child_id = alignment_tree[cur_node].children[childI];
+            if(!visited[child_id])
+            {
+                node_stack.push( child_id );
+                break;
+            }
+        }
+        if( pre_size != node_stack.size() )
+            continue;    // stepped down to an unvisited child
+        for( uint parentI = 0; parentI < alignment_tree[cur_node].parents.size(); parentI++ )
+        {
+            node_id_t parent_id = alignment_tree[cur_node].parents[parentI];
+            if(!visited[parent_id])
+            {
+                node_stack.push( parent_id );
+                break;
+            }
+        }
+        if( pre_size != node_stack.size() )
+            continue;    // stepped up to an unvisited parent
+        node_stack.pop();    // didn't make any progress: dead end, backtrack
+    }
+    path = vector< node_id_t >( node_stack.size() );
+    for( size_t pI = 0; pI < path.size(); pI++ )
+    {
+        path[pI] = node_stack.top();
+        node_stack.pop();
+    }
+}
+
+
+
+
+
+
+template<class MatchType>
+void ProgressiveAligner::propagateDescendantBreakpoints( node_id_t node1, uint seqI, std::vector<MatchType*>& iv_list )
+{
+    SSC<MatchType> seq_order(seqI);
+    sort( iv_list.begin(), iv_list.end(), seq_order );
+    // the left end of every superinterval recorded at node1 becomes a breakpoint for iv_list
+    vector< SuperInterval >& node_ordering = alignment_tree[ node1 ].ordering;
+    vector<gnSeqI> breakpoints;
+    for( vector< SuperInterval >::iterator siv = node_ordering.begin(); siv != node_ordering.end(); ++siv )
+        breakpoints.push_back( siv->LeftEnd() );
+    GenericMatchSeqManipulator<MatchType> manipulator( seqI );
+    applyBreakpoints( breakpoints, iv_list, manipulator );
+}
+
+// T should be a pointer type
+template<class T, class Manipulator>
+void applyAncestralBreakpoints( const vector< SuperInterval >& siv_list, vector<T>& ord, uint seqI, Manipulator& m )
+{
+    // gather both boundaries of every superinterval whose reference interval is defined in seqI
+    vector<gnSeqI> boundaries;
+    boundaries.reserve( siv_list.size()*2 );
+    for( vector< SuperInterval >::const_iterator siv = siv_list.begin(); siv != siv_list.end(); ++siv )
+    {
+        if( siv->reference_iv.Start(seqI) == NO_MATCH )
+            continue;
+        const gnSeqI left_end = siv->reference_iv.LeftEnd(seqI);
+        boundaries.push_back( left_end );
+        boundaries.push_back( left_end + siv->reference_iv.Length(seqI) );
+    }
+    // hand the boundaries to applyBreakpoints in ascending order
+    sort( boundaries.begin(), boundaries.end() );
+    applyBreakpoints( boundaries, ord, m );
+}
+
+
+// assuming breakpoints have been propagated in both directions
+// there should now be a 1-to-1 correspondence between superintervals
+// in the ancestor and descendants.
+void ProgressiveAligner::linkSuperIntervals( node_id_t node1, uint seqI, node_id_t ancestor )
+{
+ // TODO: speed this up by implementing O(N) instead of O(N^2)
+ vector<SuperInterval>& a_ord = alignment_tree[ancestor].ordering;
+ vector<SuperInterval>& c_ord = alignment_tree[node1].ordering;
+ // initialize all linkages to nothing
+ for( size_t aI = 0; aI < a_ord.size(); aI++ )
+ if( seqI == 0 ) // seqI selects which child link of the ancestor is reset and rebuilt: 0 -> c1_siv, otherwise c2_siv
+ a_ord[aI].c1_siv = (std::numeric_limits<size_t>::max)();
+ else
+ a_ord[aI].c2_siv = (std::numeric_limits<size_t>::max)();
+ for( size_t cI = 0; cI < c_ord.size(); cI++ )
+ c_ord[cI].parent_siv = (std::numeric_limits<size_t>::max)();
+
+ for( size_t aI = 0; aI < a_ord.size(); aI++ )
+ {
+ if( a_ord[aI].reference_iv.LeftEnd(seqI) == NO_MATCH )
+ continue; // this ancestral superinterval has nothing in child seqI, leave it unlinked
+ size_t cI = 0;
+ for( ; cI < c_ord.size(); cI++ ) // linear scan for the child superinterval with the same left end
+ {
+ if( absolut(a_ord[aI].reference_iv.Start(seqI)) != c_ord[cI].LeftEnd() )
+ continue;
+ if( a_ord[aI].reference_iv.Length(seqI) != c_ord[cI].Length() )
+ {
+ // same left end but different length: report it, then link the pair anyway
+ breakHere();
+ cerr << "mapping length mismatch\n";
+ cerr << "ancestor: " << ancestor << "\t node1: " << node1 << endl;
+ cerr << "a_ord[" << aI << "].reference_iv.Length(" << seqI << "): " << a_ord[aI].reference_iv.Length(seqI) << endl;
+ cerr << "a_ord[" << aI << "].reference_iv.LeftEnd(" << seqI << "): " << a_ord[aI].reference_iv.LeftEnd(seqI) << endl;
+ cerr << "c_ord[" << cI << "].Length(): " << c_ord[cI].Length() << endl;
+ cerr << "c_ord[" << cI << "].LeftEnd(): " << c_ord[cI].LeftEnd() << endl;
+ cerr << "";
+ cerr << "";
+ }
+ // link these
+ if( seqI == 0 )
+ a_ord[aI].c1_siv = cI;
+ else
+ a_ord[aI].c2_siv = cI;
+ c_ord[cI].parent_siv = aI;
+ break;
+ }
+ if( cI == c_ord.size() ) // scan finished without finding a child superinterval at that left end
+ {
+ breakHere();
+ cerr << "error no mapping\n";
+ }
+ }
+}
+
+
+void ProgressiveAligner::translateGappedCoordinates( vector<AbstractMatch*>& ml, uint seqI, node_id_t extant, node_id_t ancestor )
+{
+ // Re-expresses component seqI of every match in ml, one tree node at a time, along the path from extant to ancestor. First determine the path that must be traversed
+ vector< node_id_t > trans_path;
+ getPath( extant, ancestor, trans_path );
+
+ // set seqI to forward orientation
+ for( size_t mI = 0; mI < ml.size(); mI++ )
+ if( ml[mI]->Orientation(seqI) == AbstractMatch::reverse )
+ ml[mI]->Invert();
+
+ // for each node on the path, construct a complete coordinate translation
+ for( size_t nI = 1; nI < trans_path.size(); nI++ )
+ {
+ // first sort matches on start pos and make them all forward oriented
+ // then split them on superinterval boundaries and assign each to a superinterval
+ // then convert each match's coordinates to be superinterval-local
+ // then apply the coordinate translation with transposeCoordinates
+ // then shift each match's coordinates to the global ancestral coordinate space
+ SSC<AbstractMatch> ssc(seqI);
+ sort(ml.begin(), ml.end(), ssc);
+
+ // split on superinterval boundaries
+ vector< SuperInterval >& siv_list = alignment_tree[trans_path[nI]].ordering;
+ vector< vector< AbstractMatch* > > siv_matches = vector< vector< AbstractMatch* > >(siv_list.size());
+ size_t cur_child = 0; // index (0 or 1) of the previous path node among the current node's children
+ if( alignment_tree[trans_path[nI]].children[0] == trans_path[nI-1] )
+ cur_child = 0;
+ else if( alignment_tree[trans_path[nI]].children[1] == trans_path[nI-1] )
+ cur_child = 1;
+ else
+ {
+ breakHere();
+ cerr << "forest fire\n";
+ }
+
+ AbstractMatchSeqManipulator amsm( seqI );
+ applyAncestralBreakpoints(siv_list, ml, cur_child, amsm );
+
+ // sort matches again because new ones were added at the end
+ sort(ml.begin(), ml.end(), ssc);
+
+ // assign each match to a siv, and convert coords to siv-local
+ for( size_t mI = 0; mI < ml.size(); mI++ )
+ {
+ if( ml[mI]->LeftEnd(seqI) == 0 )
+ {
+ breakHere();
+ cerr << "fefefe";
+ }
+ size_t sivI = 0;
+ for( ; sivI < siv_list.size(); sivI++ ) // find the superinterval whose cur_child range contains the match's left end
+ {
+ if( siv_list[sivI].reference_iv.LeftEnd(cur_child) == NO_MATCH )
+ continue;
+ if( ml[mI]->LeftEnd(seqI) >= siv_list[sivI].reference_iv.LeftEnd(cur_child) &&
+ ml[mI]->LeftEnd(seqI) < siv_list[sivI].reference_iv.LeftEnd(cur_child) + siv_list[sivI].reference_iv.Length(cur_child) )
+ break;
+ }
+ if( sivI == siv_list.size() )
+ {
+ // no superinterval contains this match: dump diagnostics (each label now matches the value printed)
+ cerr << "nI is: "<< nI << endl;
+ cerr << "trans_path: ";
+ for( size_t ttI = 0; ttI < trans_path.size(); ttI++ )
+ cerr << " " << trans_path[ttI];
+ cerr << endl;
+ cerr << "problem seq: " << seqI << std::endl;
+ cerr << "ml[" << mI << "]->Start(0) == " << ml[mI]->Start(0) << endl;
+ cerr << "ml[" << mI << "]->Length(0) == " << ml[mI]->Length(0) << endl;
+ cerr << "ml[" << mI << "]->Start(1) == " << ml[mI]->Start(1) << endl;
+ cerr << "ml[" << mI << "]->Length(1) == " << ml[mI]->Length(1) << endl;
+ cerr << "ml.size(): " << ml.size() << endl;
+ for( sivI = 0; sivI < siv_list.size(); sivI++ )
+ {
+ cerr << "siv_list[" << sivI << "] left end 0: " << siv_list[sivI].reference_iv.LeftEnd(0) << endl;
+ if( siv_list[sivI].reference_iv.LeftEnd(0) != 0 )
+ cerr << "siv_list[" << sivI << "] right end 0: " << siv_list[sivI].reference_iv.LeftEnd(0) + siv_list[sivI].reference_iv.Length(0) << endl;
+ cerr << "siv_list[" << sivI << "] left end 1: " << siv_list[sivI].reference_iv.LeftEnd(1) << endl;
+ if( siv_list[sivI].reference_iv.LeftEnd(1) != 0 )
+ cerr << "siv_list[" << sivI << "] right end 1: " << siv_list[sivI].reference_iv.LeftEnd(1) + siv_list[sivI].reference_iv.Length(1) << endl;
+ }
+ breakHere(); // NOTE(review): sivI == siv_list.size() here, so the siv_list[sivI] accesses below are out of range unless breakHere() stops execution -- confirm
+ }
+ if( ml[mI]->LeftEnd(seqI) + ml[mI]->Length(seqI) >
+ siv_list[sivI].reference_iv.LeftEnd(cur_child) + siv_list[sivI].reference_iv.Length(cur_child) )
+ {
+ cerr << "doesn't fit\n";
+ cerr << "ml[" << mI << "]->LeftEnd(" << seqI << "): " << ml[mI]->LeftEnd(seqI) << endl;
+ cerr << "ml[" << mI << "]->RightEnd(" << seqI << "): " << ml[mI]->RightEnd(seqI) << endl;
+ cerr << "siv_list[" << sivI << "] left end 0: " << siv_list[sivI].reference_iv.LeftEnd(0) << endl;
+ if( siv_list[sivI].reference_iv.LeftEnd(0) != 0 )
+ cerr << "siv_list[" << sivI << "] right end 0: " << siv_list[sivI].reference_iv.LeftEnd(0) + siv_list[sivI].reference_iv.Length(0) << endl;
+ cerr << "siv_list[" << sivI << "] left end 1: " << siv_list[sivI].reference_iv.LeftEnd(1) << endl;
+ if( siv_list[sivI].reference_iv.LeftEnd(1) != 0 )
+ cerr << "siv_list[" << sivI << "] right end 1: " << siv_list[sivI].reference_iv.LeftEnd(1) + siv_list[sivI].reference_iv.Length(1) << endl;
+ cerr << "ml.size(): " << ml.size() << endl;
+ cerr << "siv_list.size(): " << siv_list.size() << endl;
+ cerr << "trans_path:";
+ for( size_t tI = 0; tI < trans_path.size(); tI++ )
+ cerr << " " << trans_path[tI];
+ cerr << endl;
+ cerr << "trans_path[" << nI << "]: " << trans_path[nI] << endl;
+ breakHere();
+ }
+
+ ml[mI]->SetLeftEnd( seqI, ml[mI]->LeftEnd(seqI) - siv_list[sivI].reference_iv.LeftEnd(cur_child) + 1 ); // 1-based offset inside the superinterval
+ // if this interval matches the reverse strand then we should effectively invert all matches
+ if( siv_list[sivI].reference_iv.Start(cur_child) < 0 )
+ {
+ int64 new_lend = siv_list[sivI].reference_iv.Length(cur_child) - ml[mI]->LeftEnd(seqI);
+ new_lend -= ml[mI]->Length( seqI ) - 2;
+ new_lend *= ml[mI]->Orientation(seqI) == AbstractMatch::forward ? 1 : -1;
+ ml[mI]->Invert();
+ ml[mI]->SetStart( seqI, new_lend );
+ }
+ siv_matches[sivI].push_back( ml[mI] );
+ }
+
+ // apply the coordinate translation
+ ml.clear();
+ for( size_t sivI = 0; sivI < siv_matches.size(); sivI++ )
+ {
+ if( siv_matches[sivI].size() == 0 )
+ continue;
+
+ // get a CompactGappedAlignment<> for this interval
+ CompactGappedAlignment<>* siv_cga = dynamic_cast<CompactGappedAlignment<>*>(siv_list[sivI].reference_iv.GetMatches()[0]);
+ if( siv_list[sivI].reference_iv.GetMatches().size() > 1 )
+ siv_cga = NULL;
+ bool alloc_new_siv = false;
+ CompactGappedAlignment<> tmp_cga;
+ if( siv_cga == NULL ) // the interval is not a single CompactGappedAlignment: build a temporary one from the whole interval
+ {
+ alloc_new_siv = true;
+ siv_cga = tmp_cga.Copy();
+ CompactGappedAlignment<> dorkas(siv_list[sivI].reference_iv);
+ *siv_cga = dorkas;
+ }
+
+ // now translate each match...
+ for( size_t mI = 0; mI < siv_matches[sivI].size(); mI++ )
+ {
+ CompactGappedAlignment<>* match_cga = dynamic_cast<CompactGappedAlignment<>*>(siv_matches[sivI][mI]);
+ bool alloc_new = false;
+ if( match_cga == NULL )
+ {
+ match_cga = tmp_cga.Copy();
+ *match_cga = CompactGappedAlignment<>(*(siv_matches[sivI][mI]));
+ alloc_new = true;
+ }
+ siv_cga->translate( *match_cga, seqI, cur_child );
+
+ if( alloc_new ) // the converted copy replaces the original match, which is released
+ {
+ siv_matches[sivI][mI]->Free();
+ siv_matches[sivI][mI] = match_cga;
+ }
+ }
+
+ // shift coordinates back to global space
+ for( size_t mI = 0; mI < siv_matches[sivI].size(); mI++ )
+ {
+ int64 cur_start = siv_matches[sivI][mI]->Start(seqI);
+ if( cur_start > 0 )
+ siv_matches[sivI][mI]->SetStart( seqI, cur_start + siv_list[sivI].LeftEnd() - 1 );
+ else
+ siv_matches[sivI][mI]->SetStart( seqI, cur_start - siv_list[sivI].LeftEnd() + 1);
+ if( (siv_matches[sivI][mI]->LeftEnd(seqI) + siv_matches[sivI][mI]->Length(seqI) > siv_list.back().LeftEnd() + siv_list.back().Length() )
+ )
+ {
+ // is there something wrong with the translation table?
+ cerr << "siv left is: " << siv_list[sivI].LeftEnd() << endl;
+ cerr << "siv right is: " << siv_list[sivI].LeftEnd() + siv_list[sivI].Length() << endl;
+ cerr << "match right is: " << siv_matches[sivI][mI]->LeftEnd(seqI) + siv_matches[sivI][mI]->Length(seqI) << endl;
+ cerr << "superseq right is: " << siv_list.back().LeftEnd() + siv_list.back().Length() << endl;
+ cerr << "";
+ breakHere();
+ }
+ if( debug_aligner && siv_matches[sivI][mI]->Start(seqI) == 0 )
+ {
+ breakHere();
+ }
+ }
+ if(alloc_new_siv)
+ siv_cga->Free();
+ ml.insert( ml.end(), siv_matches[sivI].begin(), siv_matches[sivI].end() );
+ }
+ }
+ // restore forward orientation seqI
+ for( size_t mI = 0; mI < ml.size(); mI++ )
+ if( ml[mI]->Orientation(seqI) == AbstractMatch::reverse )
+ ml[mI]->Invert();
+}
+
+/** sort predicate: orders SuperInterval pointers by the objects they point to */
+class SuperIntervalPtrComp
+{
+public:
+    /** @return true when *lhs compares less than *rhs using SuperInterval's operator< */
+    bool operator()( const SuperInterval* lhs, const SuperInterval* rhs )
+    { return *lhs < *rhs; }
+};
+
+void ProgressiveAligner::recursiveApplyAncestralBreakpoints( node_id_t ancestor )
+{
+ stack<node_id_t> node_stack; // explicit stack replaces recursion; walks every node at or below ancestor
+ node_stack.push(ancestor);
+ while( node_stack.size() > 0 )
+ {
+ // pop the current node, apply ancestral breakpoints, recurse on children
+ node_id_t cur_node = node_stack.top();
+ node_stack.pop();
+ SuperIntervalManipulator sim;
+ if( progress_msgs ) cout << "cur node: " << cur_node << endl;
+ for( size_t childI = 0; childI < alignment_tree[cur_node].children.size(); childI++ )
+ {
+ AlignmentTreeNode& atn = alignment_tree[alignment_tree[cur_node].children[childI]];
+ if( progress_msgs ) cout << "childI " << childI << " aab\n";
+ applyAncestralBreakpoints( alignment_tree[cur_node].ordering, atn.ordering, childI, sim ); // split the child's superintervals at the parent's boundaries for this child
+ if( progress_msgs ) cout << "sort childI " << childI << "\n";
+ vector<SuperInterval*> siv_ptr_list(atn.ordering.size()); // sort pointers first, then copy the superintervals once in sorted order
+ for( size_t sivI = 0; sivI < atn.ordering.size(); ++sivI )
+ siv_ptr_list[sivI] = &(atn.ordering[sivI]);
+ SuperIntervalPtrComp sipc;
+ sort( siv_ptr_list.begin(), siv_ptr_list.end(), sipc );
+ vector< SuperInterval > siv_list;
+ for( size_t sivI = 0; sivI < siv_ptr_list.size(); ++sivI )
+ siv_list.push_back(*siv_ptr_list[sivI]);
+ swap(siv_list, atn.ordering); // install the sorted copy as the child's ordering
+ node_stack.push( alignment_tree[cur_node].children[childI] );
+ }
+ if( debug_aligner && alignment_tree[cur_node].children.size() > 0 )
+ validateSuperIntervals(alignment_tree[cur_node].children[0], alignment_tree[cur_node].children[1], cur_node); // reads children[1], i.e. expects two children here
+ if( progress_msgs ) cout << "linking node " << cur_node << "'s" << alignment_tree[cur_node].ordering.size() << " superintervals\n";
+ for( size_t childI = 0; childI < alignment_tree[cur_node].children.size(); childI++ )
+ linkSuperIntervals( alignment_tree[cur_node].children[childI], childI, cur_node ); // rebuild parent/child superinterval links after the splits
+ }
+}
+
+
+boolean getInterveningCoordinates( const AbstractMatch* iv, uint oseqI, Match* r_begin, Match* r_end, uint seqI, int64& gap_lend, int64& gap_rend ){ // computes the gap between r_begin and r_end in seqI; a NULL match means the gap runs to the corresponding end of iv in oseqI. Always returns true
+ // skip this sequence if it's undefined
+ if( (r_end != NULL && r_end->Start( seqI ) == NO_MATCH) ||
+ (r_begin != NULL && r_begin->Start( seqI ) == NO_MATCH) ){
+ gap_lend = 0;
+ gap_rend = 0;
+ return true;
+ }
+
+ // determine the size of the gap
+ gap_rend = r_end != NULL ? r_end->Start( seqI ) : iv->RightEnd( oseqI ) + 1;
+ gap_lend = r_begin != NULL ? r_begin->End( seqI ) + 1 : iv->LeftEnd( oseqI );
+ if( gap_rend < 0 || gap_lend < 0 ){ // a negative value indicates a reverse-oriented match: recompute with the roles of r_begin and r_end exchanged
+ gap_rend = r_begin != NULL ? -r_begin->Start( seqI ) : iv->RightEnd( oseqI ) + 1;
+ gap_lend = r_end != NULL ? -r_end->Start( seqI ) + r_end->Length() : 1; // NOTE(review): falls back to 1 rather than iv->LeftEnd( oseqI ) as in the forward case -- confirm
+ }
+ if( gap_rend <= 0 || gap_lend <= 0 ){
+ // if either is still < 0 then there's a problem...
+ genome::ErrorMsg( "Error constructing intervening coordinates" );
+ }
+ return true;
+}
+
+
+void ProgressiveAligner::pairwiseAnchorSearch( MatchList& r_list, Match* r_begin, Match* r_end, const AbstractMatch* iv, uint oseqI, uint oseqJ )
+{
+ uint seqI = 0;
+ MatchList gap_list;
+ vector< int64 > starts; // filled below but never read in this function
+//
+// Get the sequence in the intervening gaps between these two matches
+//
+ for( seqI = 0; seqI < 2; seqI++ )
+ {
+ int64 gap_end = 0;
+ int64 gap_start = 0;
+ getInterveningCoordinates( iv, (seqI == 0 ? oseqI : oseqJ), r_begin, r_end, seqI, gap_start, gap_end);
+ int64 diff = gap_end - gap_start;
+ diff = diff > 0 ? diff - 1 : 0;
+
+ starts.push_back( gap_start );
+ gnSequence* new_seq = NULL;
+ if(diff > 0 && gap_start + diff - 1 <= r_list.seq_table[ seqI ]->length())
+ new_seq = new gnSequence( r_list.seq_table[ seqI ]->ToString( diff, gap_start ) );
+ else
+ new_seq = new gnSequence(); // empty or out-of-range gap: search an empty sequence
+ gap_list.seq_table.push_back( new_seq );
+ gap_list.sml_table.push_back( new DNAMemorySML() );
+ }
+
+ gnSeqI avg_len = (gap_list.seq_table[0]->length() + gap_list.seq_table[1]->length())/2;
+ uint search_seed_size = getDefaultSeedWeight( avg_len );
+ gap_mh.get().Clear();
+
+ uint seed_count = use_seed_families ? 3 : 1;
+ for( size_t seedI = 0; seedI < seed_count; seedI++ )
+ {
+ //
+ // Create sorted mer lists for the intervening gap region
+ //
+ uint64 default_seed = getSeed( search_seed_size, seedI );
+ if( search_seed_size < MIN_DNA_SEED_WEIGHT )
+ {
+ // seed weight too small to search: release the temporary sequences and SMLs and give up without adding matches
+ for( uint seqI = 0; seqI < gap_list.seq_table.size(); seqI++ )
+ delete gap_list.seq_table[ seqI ];
+ for( uint seqI = 0; seqI < gap_list.sml_table.size(); seqI++ )
+ delete gap_list.sml_table[ seqI ];
+ return;
+ }
+ for( uint seqI = 0; seqI < gap_list.seq_table.size(); seqI++ ){
+ gap_list.sml_table[ seqI ]->Clear();
+ gap_list.sml_table[ seqI ]->Create( *(gap_list.seq_table[ seqI ]), default_seed );
+ }
+
+ //
+ // Find all matches in the gap region
+ //
+ gap_mh.get().ClearSequences();
+ if(seed_count>1)
+ {
+ // with several seeds, search a scratch copy and free its matches; the combined result is fetched from gap_mh after the loop
+ MatchList cur_list = gap_list;
+ gap_mh.get().FindMatches( cur_list );
+ for( size_t mI = 0; mI < cur_list.size(); mI++ )
+ cur_list[mI]->Free();
+ }else
+ gap_mh.get().FindMatches( gap_list );
+ }
+ if(seed_count>1)
+ gap_mh.get().GetMatchList(gap_list);
+
+ EliminateOverlaps_v2( gap_list );
+
+ // for anchor accuracy, throw out any anchors that are shorter than the minimum
+ // anchor length after EliminateOverlaps()
+ gap_list.LengthFilter( MIN_ANCHOR_LENGTH + 3 );
+
+ for( size_t gI = 0; gI < gap_list.size(); gI++ )
+ {
+ for( seqI = 0; seqI < 2; seqI++ )
+ {
+ int64 gap_rend = 0;
+ int64 gap_lend = 0;
+ getInterveningCoordinates( iv, (seqI == 0 ? oseqI : oseqJ), r_begin, r_end, seqI, gap_lend, gap_rend);
+ gap_list[gI]->SetLeftEnd(seqI, gap_list[gI]->LeftEnd(seqI) + gap_lend - 1); // shift from gap-local to full-sequence coordinates
+ }
+ }
+ r_list.insert(r_list.end(), gap_list.begin(), gap_list.end());
+
+ // delete sequences and smls
+ for( uint seqI = 0; seqI < gap_list.seq_table.size(); seqI++ )
+ delete gap_list.seq_table[ seqI ];
+ for( uint seqI = 0; seqI < gap_list.sml_table.size(); seqI++ )
+ delete gap_list.sml_table[ seqI ];
+}
+
+template<class GappedAlignmentType>
+void ProgressiveAligner::recurseOnPairs( const vector<node_id_t>& node1_seqs, const vector<node_id_t>& node2_seqs, const GappedAlignmentType& iv, Matrix<MatchList>& matches, Matrix< std::vector< search_cache_t > >& search_cache_db, Matrix< std::vector< search_cache_t > >& new_cache_db, boost::multi_array< vector< vector< int64 > >, 2 >& iv_regions )
+{
+ matches = Matrix<MatchList>(node1_seqs.size(),node2_seqs.size()); // one result list per (node1 sequence, node2 sequence) pair
+
+ std::vector< bitset_t > aln_matrix;
+ iv.GetAlignment(aln_matrix);
+ Match tmp(2);
+ const size_t sizer = node1_seqs.size() * node2_seqs.size();
+ std::vector< std::pair<size_t,size_t> > node_pairs(sizer); // flattened list of all pairs so a single loop can be parallelized
+ int nni = 0;
+ for( size_t n1 = 0; n1 < node1_seqs.size(); n1++ )
+ for( size_t n2 = 0; n2 < node2_seqs.size(); n2++ )
+ node_pairs[nni++] = make_pair(n1,n2);
+
+#pragma omp parallel for
+ for(int ni = 0; ni < node_pairs.size(); ni++)
+ {
+ size_t n1 = node_pairs[ni].first;
+ size_t n2 = node_pairs[ni].second;
+ vector<node_id_t>::const_iterator n1_iter = node1_seqs.begin() + n1;
+ vector<node_id_t>::const_iterator n2_iter = node2_seqs.begin() + n2;
+
+ uint seqI = node_sequence_map[*n1_iter];
+ uint seqJ = node_sequence_map[*n2_iter];
+ MatchList& mlist = matches(n1, n2);
+ std::vector< search_cache_t >& cache = search_cache_db(n1, n2);
+ std::vector< search_cache_t >& new_cache = new_cache_db(n1, n2);
+ mlist.seq_table.push_back( alignment_tree[*n1_iter].sequence );
+ mlist.seq_table.push_back( alignment_tree[*n2_iter].sequence );
+
+ if( iv.LeftEnd(seqI) == NO_MATCH )
+ {
+ if( iv.LeftEnd(seqJ) != NO_MATCH )
+ {
+ iv_regions[n1][n2][1].push_back(iv.LeftEnd(seqJ));
+ iv_regions[n1][n2][1].push_back(iv.RightEnd(seqJ));
+ }
+ continue; // no sense searching one isn't defined!
+ }
+ if(iv.LeftEnd(seqJ) == NO_MATCH )
+ {
+ if( iv.LeftEnd(seqI) != NO_MATCH )
+ {
+ iv_regions[n1][n2][0].push_back(iv.LeftEnd(seqI));
+ iv_regions[n1][n2][0].push_back(iv.RightEnd(seqI));
+ }
+ continue; // no sense searching one isn't defined!
+ }
+
+ gnSeqI charI = 0;
+ gnSeqI charJ = 0;
+ const size_t iv_aln_length = iv.AlignmentLength();
+
+// first determine the outer aligned boundaries of the LCB and record them for
+// later use
+ pair< int64, int64 > pair_1l(0,0);
+ pair< int64, int64 > pair_1r(0,0);
+ pair< int64, int64 > pair_2l(0,0);
+ pair< int64, int64 > pair_2r(0,0);
+ for( uint colI = 0; colI <= iv_aln_length; colI++ ) // scan from the left for the first column where both sequences have a character
+ {
+ if( colI == iv_aln_length || (aln_matrix[seqI].test(colI) && aln_matrix[seqJ].test(colI)) )
+ {
+ if( colI == 0 )
+ break; // nothing to see here, move along...
+ if( iv.Orientation(seqI) == AbstractMatch::forward )
+ pair_1l = make_pair( iv.LeftEnd(seqI), iv.LeftEnd(seqI)+charI );
+ else
+ pair_1r = make_pair( iv.RightEnd(seqI)-charI+1, iv.RightEnd(seqI)+1 );
+ if( iv.Orientation(seqJ) == AbstractMatch::forward )
+ pair_2l = make_pair( iv.LeftEnd(seqJ), iv.LeftEnd(seqJ)+charJ );
+ else
+ pair_2r = make_pair( iv.RightEnd(seqJ)-charJ+1, iv.RightEnd(seqJ)+1 );
+ break;
+ }
+ if( colI < iv_aln_length && aln_matrix[seqI].test(colI) )
+ ++charI;
+ if( colI < iv_aln_length && aln_matrix[seqJ].test(colI) )
+ ++charJ;
+ }
+
+ charI = 0;
+ charJ = 0;
+ for( uint colI = iv_aln_length; colI > 0 ; colI-- ) // same scan from the right end of the alignment
+ {
+ if( (aln_matrix[seqI].test(colI-1) && aln_matrix[seqJ].test(colI-1)) )
+ {
+ if( colI == iv_aln_length )
+ break; // nothing to see here, move along...
+ if( iv.Orientation(seqI) == AbstractMatch::forward )
+ pair_1r = make_pair( iv.RightEnd(seqI)-charI+1, iv.RightEnd(seqI)+1 );
+ else
+ pair_1l = make_pair( iv.LeftEnd(seqI), iv.LeftEnd(seqI)+charI );
+ if( iv.Orientation(seqJ) == AbstractMatch::forward )
+ pair_2r = make_pair( iv.RightEnd(seqJ)-charJ+1, iv.RightEnd(seqJ)+1 );
+ else
+ pair_2l = make_pair( iv.LeftEnd(seqJ), iv.LeftEnd(seqJ)+charJ );
+ break;
+ }
+ if( aln_matrix[seqI].test(colI-1) )
+ ++charI;
+ if( aln_matrix[seqJ].test(colI-1) )
+ ++charJ;
+ }
+ if( pair_1l.first < pair_1l.second ) // record only non-empty boundary regions
+ {
+ iv_regions[n1][n2][0].push_back(pair_1l.first);
+ iv_regions[n1][n2][0].push_back(pair_1l.second);
+ }
+ if( pair_1r.first < pair_1r.second )
+ {
+ if( pair_1l.first < pair_1l.second && pair_1r.first == pair_1l.second )
+ {
+ // just merge them into a single interval
+ iv_regions[n1][n2][0].back() = pair_1r.second;
+ }else{
+ iv_regions[n1][n2][0].push_back(pair_1r.first);
+ iv_regions[n1][n2][0].push_back(pair_1r.second);
+ if( pair_1r.first <= pair_1l.second && pair_1r.second >= pair_1l.first )
+ {
+ cout << "Ohno. Overlap in outside LCB search intervals\n";
+ cout << "Left: " << pair_1l.first << '\t' << pair_1l.second << " right: " << pair_1r.first << '\t' << pair_1r.second << endl;
+ cout << "0 iv.Start(" << seqI << "): " << iv.Start(seqI) << '\t' << "iv.RightEnd(" << seqI << "): " << iv.RightEnd(seqI) << endl;
+ if( pair_1l.first == 0 )
+ genome::breakHere();
+ }
+ }
+ }
+
+ if( pair_2l.first < pair_2l.second )
+ {
+ iv_regions[n1][n2][1].push_back(pair_2l.first);
+ iv_regions[n1][n2][1].push_back(pair_2l.second);
+ }
+ if( pair_2r.first < pair_2r.second )
+ {
+ if( pair_2l.first < pair_2l.second && pair_2r.first == pair_2l.second )
+ {
+ // just merge them into a single interval
+ iv_regions[n1][n2][1].back() = pair_2r.second;
+ }else{
+ iv_regions[n1][n2][1].push_back(pair_2r.first);
+ iv_regions[n1][n2][1].push_back(pair_2r.second);
+ if( pair_2r.first <= pair_2l.second && pair_2r.second >= pair_2l.first )
+ {
+ cout << "Ohno. Overlap in outside LCB search intervals\n";
+ cout << "Left: " << pair_2l.first << '\t' << pair_2l.second << " right: " << pair_2r.first << '\t' << pair_2r.second << endl;
+ cout << "1 iv.Start(" << seqJ << "): " << iv.Start(seqJ) << '\t' << "iv.RightEnd(" << seqJ << "): " << iv.RightEnd(seqJ) << endl;
+ cout << "charI " << charI << "\tcharJ" << charJ << endl;
+ if( pair_2l.first == 0 )
+ genome::breakHere();
+ }
+ }
+ }
+
+ charI = 0;
+ charJ = 0;
+ gnSeqI prev_charI = 0;
+ gnSeqI prev_charJ = 0;
+ bool in_gap = false;
+
+ for( uint colI = 0; colI <= iv_aln_length; colI++ ) // walk the columns; each run without a jointly aligned column is a candidate region for anchor search
+ {
+ if( colI == iv_aln_length ||
+ (aln_matrix[seqI].test(colI) && aln_matrix[seqJ].test(colI)) )
+ {
+ if( in_gap &&
+ charI - prev_charI > min_recursive_gap_length &&
+ charJ - prev_charJ > min_recursive_gap_length )
+ {
+
+ Match* l_match = NULL; // zero-length marker match at the left edge of the region
+ l_match = tmp.Copy();
+ if(iv.Orientation(seqI) == AbstractMatch::forward)
+ l_match->SetLeftEnd(0, iv.LeftEnd(seqI)+prev_charI);
+ else
+ {
+ l_match->SetLeftEnd(0, iv.RightEnd(seqI)-prev_charI);
+ l_match->SetOrientation(0, AbstractMatch::reverse );
+ }
+ if(iv.Orientation(seqJ) == AbstractMatch::forward)
+ l_match->SetLeftEnd(1, iv.LeftEnd(seqJ)+prev_charJ);
+ else
+ {
+ l_match->SetLeftEnd(1, iv.RightEnd(seqJ)-prev_charJ);
+ l_match->SetOrientation(1, AbstractMatch::reverse );
+ }
+ l_match->SetLength(0);
+ Match* r_match = NULL; // zero-length marker match at the right edge; stays NULL when the condition below fails
+ if( charJ != iv.RightEnd(seqJ) && charI != iv.RightEnd(seqI) ) // NOTE(review): compares a character count with an absolute coordinate (RightEnd) -- confirm this is intended
+ {
+ r_match = tmp.Copy();
+ if(iv.Orientation(seqI) == AbstractMatch::forward)
+ r_match->SetLeftEnd(0, iv.LeftEnd(seqI)+charI);
+ else
+ {
+ r_match->SetLeftEnd(0, iv.RightEnd(seqI)-charI);
+ r_match->SetOrientation(0, AbstractMatch::reverse );
+ }
+ if(iv.Orientation(seqJ) == AbstractMatch::forward)
+ r_match->SetLeftEnd(1, iv.LeftEnd(seqJ)+charJ);
+ else
+ {
+ r_match->SetLeftEnd(1, iv.RightEnd(seqJ)-charJ);
+ r_match->SetOrientation(1, AbstractMatch::reverse );
+ }
+ r_match->SetLength(0);
+ }
+
+ if( iv.Orientation(seqI) == AbstractMatch::reverse )
+ {
+ swap(l_match,r_match);
+ if( l_match != NULL ) l_match->Invert();
+ if( r_match != NULL ) r_match->Invert();
+ }
+ // check whether the current cache already has the searched region
+ search_cache_t cacheval = make_pair( l_match, r_match );
+ std::vector< search_cache_t >::iterator cache_entry = std::upper_bound( cache.begin(), cache.end(), cacheval, mems::cache_comparator ); // NOTE(review): upper_bound returns the first entry ordered after cacheval, so the test below looks true for every non-end entry and the region is always searched -- was lower_bound intended? confirm
+ if( cache_entry == cache.end() ||
+ (mems::cache_comparator( cacheval, *cache_entry ) || mems::cache_comparator( *cache_entry, cacheval )) )
+ {
+ // search this region
+ pairwiseAnchorSearch(mlist, l_match, r_match, &iv, seqI, seqJ);
+ }
+ if(using_cache_db)
+ new_cache.push_back( cacheval ); // NOTE(review): when using_cache_db is false nothing in this function releases l_match/r_match -- confirm ownership
+ }
+ prev_charI = charI;
+ prev_charJ = charJ;
+ in_gap = false;
+ }
+ else
+ in_gap = true;
+ if( colI < iv.AlignmentLength() )
+ {
+ if( aln_matrix[seqI].test(colI) )
+ ++charI;
+ if( aln_matrix[seqJ].test(colI) )
+ ++charJ;
+ }
+ }
+ }
+}
+
+void ProgressiveAligner::getAncestralMatches( const vector< node_id_t > node1_seqs, const vector< node_id_t > node2_seqs, node_id_t node1, node_id_t node2, node_id_t ancestor, std::vector< AbstractMatch* >& ancestral_matches )
+{
+    // to save memory, always make node1_seqs the bigger vector (currently disabled)
+//    if( node1_seqs.size() < node2_seqs.size() )
+//        swap( node1_seqs, node2_seqs );
+
+    // Build 2-way copies of the original matches for every (node1 genome, node2 genome) pair,
+    // translate them up to node2 and then node1, eliminating overlaps along the way.
+    for( uint n1I = 0; n1I < node1_seqs.size(); n1I++ )
+    {
+        const uint genome1 = this->node_sequence_map[node1_seqs[n1I]];
+        vector< AbstractMatch* > n1_matches;
+
+        for( uint n2I = 0; n2I < node2_seqs.size(); n2I++ )
+        {
+            const uint genome2 = this->node_sequence_map[node2_seqs[n2I]];
+            vector< AbstractMatch* > pair_matches;
+            for( size_t mI = 0; mI < original_ml.size(); mI++ )
+            {
+                // only matches defined in both genomes of the pair are copied
+                if( original_ml[mI]->LeftEnd(genome1) == NO_MATCH || original_ml[mI]->LeftEnd(genome2) == NO_MATCH )
+                    continue;
+                Match prototype( 2 );
+                Match* pair_match = prototype.Copy();
+                pair_match->SetStart( 0, original_ml[mI]->Start(genome1));
+                pair_match->SetStart( 1, original_ml[mI]->Start(genome2));
+                pair_match->SetLength(original_ml[mI]->Length());
+                // assign reference orientation to seq 0
+                if( pair_match->Start(0) < 0 )
+                    pair_match->Invert();
+                pair_matches.push_back( pair_match );
+            }
+            // component 1 (the node2 genome) is translated first
+            translateGappedCoordinates( pair_matches, 1, node2_seqs[n2I], node2 );
+            n1_matches.insert( n1_matches.end(), pair_matches.begin(), pair_matches.end() );
+        }
+        EliminateOverlaps_v2( n1_matches );
+        translateGappedCoordinates( n1_matches, 0, node1_seqs[n1I], node1 );
+        ancestral_matches.insert( ancestral_matches.end(), n1_matches.begin(), n1_matches.end() );
+    }
+    EliminateOverlaps_v2( ancestral_matches );
+}
+
+
+ /**
+  * Fills a matrix of pairwise match lists, one cell per (node1 sequence, node2 sequence) pair.
+  * Each cell's seq_table / seq_filename receive two entries copied from original_ml
+  * (node1-side sequence first), and each match of original_ml that is defined in both
+  * sequences of a pair is added to the cell as a 2-way Match, inverted when necessary so
+  * that component 0 does not have a negative start.
+  * @param node1_seqs  node ids of the sequences indexing the matrix rows
+  * @param node2_seqs  node ids of the sequences indexing the matrix columns
+  * @param pairwise_matches  overwritten with a fresh node1_seqs.size() x node2_seqs.size() matrix
+  */
+ void ProgressiveAligner::getPairwiseMatches( const vector< node_id_t >& node1_seqs, const vector< node_id_t >& node2_seqs, Matrix<MatchList>& pairwise_matches )
+ {
+     pairwise_matches = Matrix< MatchList >( node1_seqs.size(), node2_seqs.size() );
+
+     // set up the two-entry sequence tables of every cell
+     for( uint rowI = 0; rowI < node1_seqs.size(); ++rowI )
+     {
+         const uint src1 = this->node_sequence_map[node1_seqs[rowI]];
+         for( uint colJ = 0; colJ < node2_seqs.size(); ++colJ )
+         {
+             const uint src2 = this->node_sequence_map[node2_seqs[colJ]];
+             MatchList& cell = pairwise_matches(rowI, colJ);
+             cell.seq_table.push_back(original_ml.seq_table[src1]);
+             cell.seq_table.push_back(original_ml.seq_table[src2]);
+             cell.seq_filename.push_back(original_ml.seq_filename[src1]);
+             cell.seq_filename.push_back(original_ml.seq_filename[src2]);
+         }
+     }
+
+     // distribute the projections of every original match over the cells
+     Match pair_template( 2 );   // prototype for the 2-way matches created below
+     for( size_t mI = 0; mI < original_ml.size(); ++mI )
+     {
+         for( uint rowI = 0; rowI < node1_seqs.size(); ++rowI )
+         {
+             const uint src1 = this->node_sequence_map[node1_seqs[rowI]];
+             if( original_ml[mI]->LeftEnd(src1) == NO_MATCH )
+                 continue;   // match is undefined in the row sequence
+
+             for( uint colJ = 0; colJ < node2_seqs.size(); ++colJ )
+             {
+                 const uint src2 = this->node_sequence_map[node2_seqs[colJ]];
+                 if( original_ml[mI]->LeftEnd(src2) == NO_MATCH )
+                     continue;   // match is undefined in the column sequence
+
+                 Match* projected = pair_template.Copy();
+                 projected->SetStart( 0, original_ml[mI]->Start(src1));
+                 projected->SetStart( 1, original_ml[mI]->Start(src2));
+                 projected->SetLength(original_ml[mI]->Length());
+                 if( projected->Start(0) < 0 )
+                     projected->Invert();    // assign reference orientation to seq 0
+                 pairwise_matches(rowI,colJ).push_back( projected );
+             }
+         }
+     }
+ }
+
+
+ /**
+  * Classifies how densely the sequences of a gapped alignment fill its columns.
+  * density = (sum of Length(seq) over sequences with LeftEnd != NO_MATCH)
+  *           / (AlignmentLength() * Multiplicity()), so a density of 1 is ideal.
+  * The density is compared against max_density and against a threshold that depends
+  * linearly on the alignment length: the threshold line passes through
+  * (min_window_size, max_density) and (max_window_size, min_density), i.e. the shorter
+  * the alignment, the closer to max_density it has to be.
+  * @param gal_iter  the alignment to classify
+  * @return 2 when density > max_density (so dense that the caller may postpone work to
+  *         iterative refinement), 1 when density > the length-dependent threshold, 0 otherwise
+  */
+ int IsDenseEnough( GappedAlignment* gal_iter )
+ {
+     // total number of characters contributed by the sequences defined in this alignment
+     double total_len = 0;
+     for( uint seqI = 0; seqI < gal_iter->SeqCount(); seqI++ )
+     {
+         if( gal_iter->LeftEnd(seqI) == NO_MATCH )
+             continue;
+         total_len += gal_iter->Length(seqI);
+     }
+     double density = total_len / (gal_iter->AlignmentLength() * (double)gal_iter->Multiplicity());
+
+     // determine the density threshold for the given alignment length
+     double threshold = ((max_density - min_density)/(min_window_size - max_window_size)) * ( (double)gal_iter->AlignmentLength() - max_window_size ) + min_density;
+     if( density > max_density ) // don't bother aligning this, it's so dense we'll wait until iterative refinement.
+         return 2;
+     if( density > threshold )
+         return 1;
+     return 0;
+ }
+
+ /**
+  * Splits an alignment into two alignments over complementary groups of sequences.
+  * Both outputs start as full copies of ga; afterwards the sequences listed in seqs2 are
+  * marked as NO_MATCH in ga1 and the sequences listed in seqs1 are marked as NO_MATCH in ga2.
+  * @param ga     the alignment to split
+  * @param ga1    receives the copy that keeps the sequences not listed in seqs2
+  * @param ga2    receives the copy that keeps the sequences not listed in seqs1
+  * @param seqs1  sequence indices to disable in ga2
+  * @param seqs2  sequence indices to disable in ga1
+  */
+ void splitGappedAlignment( const GappedAlignment& ga, GappedAlignment& ga1, GappedAlignment& ga2, std::vector<size_t>& seqs1, std::vector<size_t>& seqs2 )
+ {
+     // (a GetAlignment() call whose result was never read used to precede the copies; it was dropped)
+     ga1 = ga;
+     ga2 = ga;
+     for( size_t seqI = 0; seqI < seqs1.size(); seqI++ )
+         ga2.SetLeftEnd(seqs1[seqI], NO_MATCH);
+     for( size_t seqI = 0; seqI < seqs2.size(); seqI++ )
+         ga1.SetLeftEnd(seqs2[seqI], NO_MATCH);
+ }
+
+ /**
+  * Cuts an alignment into consecutive pieces so that long stretches in which group1 is not
+  * aligned to group2 end up in pieces of their own.
+  * A column counts as aligned when at least one sequence of group1 and at least one sequence
+  * of group2 have a non-gap character in it.  Whenever more than max_gap_length consecutive
+  * unaligned columns are seen, the alignment is split lcb_hangover columns after the last
+  * aligned column and lcb_hangover columns before the next aligned column.
+  * @param gal       the alignment to scan (a copy is split, gal itself is only read)
+  * @param gal_list  cleared, then receives the pieces in column order
+  * @param gap_iv    cleared, then receives one flag per piece: true for a large-gap piece
+  * @param group1    row indices of the first group of sequences
+  * @param group2    row indices of the second group of sequences
+  */
+ void removeLargeGapsPP( GappedAlignment& gal, list< GappedAlignment* >& gal_list, vector<bool>& gap_iv, const vector< size_t >& group1, const vector< size_t >& group2 )
+ {
+     const size_t no_column = (std::numeric_limits<size_t>::max)();  // "no aligned column seen yet"
+
+     gap_iv.clear();
+     gal_list.clear();
+     const vector< string >& aln_matrix = GetAlignment(gal, vector<gnSequence*>(gal.SeqCount(),NULL));
+
+     size_t unaligned_run = 0;               // length of the current run of unaligned columns
+     size_t prev_aligned_col = no_column;    // most recent aligned column (in gal's columns)
+     size_t consumed_cols = 0;               // columns of gal already handed over to gal_list
+     GappedAlignment* remainder = gal.Copy(); // the part of the alignment that is still unsplit
+
+     for( size_t colI = 0; colI < gal.AlignmentLength(); colI++ )
+     {
+         // does any member of each group have a character in this column?
+         bool group1_present = false;
+         for( size_t g = 0; g < group1.size() && !group1_present; ++g )
+             group1_present = aln_matrix[group1[g]][colI] != '-';
+         bool group2_present = false;
+         for( size_t g = 0; g < group2.size() && !group2_present; ++g )
+             group2_present = aln_matrix[group2[g]][colI] != '-';
+
+         if( !(group1_present && group2_present) )
+         {
+             ++unaligned_run;
+             continue;
+         }
+
+         // an aligned column: close a preceding large gap, if there was one
+         if( unaligned_run > max_gap_length )
+         {
+             if( prev_aligned_col != no_column )
+             {
+                 // hand over the aligned part in front of the gap
+                 const gnSeqI left_cut = prev_aligned_col + lcb_hangover - consumed_cols;
+                 gal_list.push_back( remainder );
+                 gap_iv.push_back(false);
+                 remainder = (GappedAlignment*)remainder->Split(left_cut);   // continue with the right side
+                 consumed_cols += left_cut;
+             }
+             // hand over the gapped middle section
+             const gnSeqI right_cut = colI - lcb_hangover - consumed_cols;
+             gal_list.push_back( remainder );
+             gap_iv.push_back(true);
+             remainder = (GappedAlignment*)remainder->Split(right_cut);      // continue with the right side
+             consumed_cols += right_cut;
+         }
+         prev_aligned_col = colI;
+         unaligned_run = 0;
+     }
+
+     // whatever is left becomes the final piece; it is a gap piece when the scan ended in a large gap
+     const bool trailing_gap = unaligned_run > max_gap_length;
+     if( trailing_gap && prev_aligned_col != no_column )
+     {
+         const gnSeqI left_cut = prev_aligned_col + lcb_hangover - consumed_cols;
+         gal_list.push_back( remainder );
+         gap_iv.push_back(false);
+         remainder = (GappedAlignment*)remainder->Split(left_cut);   // continue with the right side
+     }
+     gap_iv.push_back( trailing_gap );
+     gal_list.push_back( remainder );
+ }
+
+ /**
+  * Refines a gapped alignment piecewise with MUSCLE and stores the result back in gal.
+  * Steps:
+  *  1. removeLargeGapsPP() cuts gal into pieces and flags the large-gap pieces, using the
+  *     sequences below the two children of ancestor as the two groups.
+  *  2. Pieces are halved repeatedly while they are longer than a limit that depends on
+  *     their density class (see IsDenseEnough): max_window_size/3, max_window_size or
+  *     max_window_size*3 for class 0, 1 and 2.
+  *  3. Every piece that is not flagged as a large gap is handed to MUSCLE: profile-profile
+  *     alignment of the two groups when profile_aln is set, otherwise RefineFast().
+  *  4. The pieces are concatenated into a new alignment matrix for gal and then freed.
+  * @param gal          the alignment to refine, updated in place
+  * @param ancestor     alignment tree node whose two children define the sequence groups
+  * @param profile_aln  true to run profile-profile alignment instead of refinement
+  * @param apt          progress tracker; cur_leftend and prev_progress are advanced
+  */
+ void ProgressiveAligner::refineAlignment( GappedAlignment& gal, node_id_t ancestor, bool profile_aln, AlnProgressTracker& apt )
+ {
+     list< GappedAlignment* > gal_list;   // the pieces, in column order
+     vector<bool> gap_iv;                 // parallel to gal_list: true marks a large-gap piece
+
+     // determine the alignment rows that belong to each child of the ancestor
+     std::vector<node_id_t> nodes1;
+     std::vector<node_id_t> nodes2;
+     getAlignedChildren( alignment_tree[ancestor].children[0], nodes1 );
+     getAlignedChildren( alignment_tree[ancestor].children[1], nodes2 );
+     std::vector<size_t> seqs1( nodes1.size() );
+     std::vector<size_t> seqs2( nodes2.size() );
+     for( size_t nI = 0; nI < nodes1.size(); nI++ )
+         seqs1[nI] = node_sequence_map[nodes1[nI]];
+     for( size_t nI = 0; nI < nodes2.size(); nI++ )
+         seqs2[nI] = node_sequence_map[nodes2[nI]];
+
+     // large gaps are removed regardless of profile_aln
+     removeLargeGapsPP( gal, gal_list, gap_iv, seqs1, seqs2 );
+
+     // halve pieces until each one is short enough for its density class;
+     // when anchors are dense, smaller windows improve speed
+     list< GappedAlignment* >::iterator gal_iter = gal_list.begin();
+     vector<bool>::iterator gap_iter = gap_iv.begin();
+     while(gal_iter != gal_list.end())
+     {
+         int density = IsDenseEnough( *gal_iter );
+         if( (density == 0 && (*gal_iter)->AlignmentLength() > max_window_size / 3) ||
+             (density == 1 && (*gal_iter)->AlignmentLength() > max_window_size ) ||
+             (density == 2 && (*gal_iter)->AlignmentLength() > max_window_size * 3 ) )
+         {
+             // split in half: a copy is inserted right behind the current piece, then the
+             // current piece loses its end and the copy loses its start
+             gnSeqI split_point = (*gal_iter)->AlignmentLength() / 2;
+             list< GappedAlignment* >::iterator ins_iter = gal_iter;
+             ++ins_iter;
+             ins_iter = gal_list.insert(ins_iter, (*gal_iter)->Copy());
+
+             // duplicate the gap flag; inserting into the vector invalidates gap_iter,
+             // so it is re-derived from its offset
+             const size_t gap_off = gap_iter - gap_iv.begin();
+             const bool is_gap_piece = *gap_iter;
+             gap_iv.insert( gap_iv.begin() + gap_off + 1, is_gap_piece );
+             gap_iter = gap_iv.begin() + gap_off;
+
+             (*gal_iter)->CropEnd( split_point );
+             (*ins_iter)->CropStart( (*ins_iter)->AlignmentLength() - split_point );
+             continue;   // re-examine the (now shorter) current piece
+         }
+
+         ++gal_iter;
+         ++gap_iter;
+     }
+
+     MuscleInterface& mi = MuscleInterface::getMuscleInterface();
+     // now that the alignment is all split up use muscle to refine it
+     gnSeqI new_len = 0;
+
+ // this section can not be parallelized b/c it makes calls to muscle
+#pragma omp critical
+{
+     // walk both containers in lock step; they always hold the same number of entries
+     list< GappedAlignment* >::iterator piece_iter = gal_list.begin();
+     vector<bool>::iterator piece_gap_iter = gap_iv.begin();
+     for( ; piece_iter != gal_list.end(); ++piece_iter, ++piece_gap_iter )
+     {
+         apt.cur_leftend += (*piece_iter)->AlignmentLength();
+         if( profile_aln && !(*piece_gap_iter) )
+         {
+             // profile-profile alignment of the two sequence groups
+             GappedAlignment ga1;
+             GappedAlignment ga2;
+             splitGappedAlignment( **piece_iter, ga1, ga2, seqs1, seqs2 );
+             if( ga1.Multiplicity() > 0 && ga2.Multiplicity() > 0 )
+             {
+                 mi.ProfileAlignFast( ga1, ga2, **piece_iter, true );
+             }
+         }else if(!(*piece_gap_iter))
+         {
+             // plain refinement; the second RefineFast argument depends on the density class
+             int density = IsDenseEnough( *piece_iter );
+             if( density == 0 )
+                 mi.RefineFast( **piece_iter );
+             else if( density == 1 )
+                 mi.RefineFast( **piece_iter, 500 );
+             else
+                 mi.RefineFast( **piece_iter, 200 );
+         }
+
+         new_len += (*piece_iter)->AlignmentLength();
+         // print a progress message
+         double cur_progress = ((double)apt.cur_leftend / (double)apt.total_len)*100.0;
+         printProgress((uint)apt.prev_progress, (uint)cur_progress, cout);
+         apt.prev_progress = cur_progress;
+     }
+}
+
+     // put humpty dumpty back together
+     vector< string > aln_matrix( gal.SeqCount(), string( new_len, '-' ) );
+     vector< string::size_type > pos( gal.SeqCount(), 0 );   // write position per row
+     for( gal_iter = gal_list.begin(); gal_iter != gal_list.end(); ++gal_iter )
+     {
+         const vector< string >& tmp_mat = GetAlignment(**gal_iter, vector<gnSequence*>( gal.SeqCount() ) );
+         for( uint seqI = 0; seqI < tmp_mat.size(); seqI++ )
+         {
+             if( gal.LeftEnd(seqI) == 0 )
+                 continue;   // rows with a left end of 0 stay all-gap
+             aln_matrix[seqI].replace(pos[seqI], tmp_mat[seqI].size(), tmp_mat[seqI]);
+             pos[seqI] += tmp_mat[seqI].size();
+         }
+         (*gal_iter)->Free();
+     }
+     gal.SetAlignment(aln_matrix);
+ }
+
+ /**
+  * Runs gapped alignment refinement over every super-interval of an ancestral node.
+  * Super-intervals whose reference interval has multiplicity 1 are skipped; for the others
+  * the alignment is extracted, refined when more than one sequence takes part in it, and
+  * converted back into super-intervals.  Afterwards the left ends below the ancestor are
+  * fixed and, when debug_aligner is set, the super-intervals are validated.
+  * Progress and status messages are written to cout.
+  * @param ancestor     the alignment tree node to process
+  * @param profile_aln  forwarded to refineAlignment()
+  */
+ void ProgressiveAligner::doGappedAlignment( node_id_t ancestor, bool profile_aln )
+ {
+     // set up progress tracking over the summed length of all super-intervals
+     gnSeqI length_sum = 0;
+     for( size_t sivI = 0; sivI < alignment_tree[ancestor].ordering.size(); sivI++ )
+         length_sum += alignment_tree[ancestor].ordering[sivI].Length();
+
+     AlnProgressTracker apt;
+     apt.total_len = length_sum;
+     apt.prev_progress = 0;
+
+     printProgress(-1, 0, cout);
+     apt.cur_leftend = 1;
+
+     for( size_t sivI = 0; sivI < alignment_tree[ancestor].ordering.size(); sivI++ )
+     {
+         if( alignment_tree[ancestor].ordering[sivI].reference_iv.Multiplicity() == 1 )
+         {
+             // don't bother re-refining intervals that didn't get aligned here
+             apt.cur_leftend += alignment_tree[ancestor].ordering[sivI].reference_iv.AlignmentLength();
+             continue;
+         }
+
+         GappedAlignment gal;
+         extractAlignment(ancestor, sivI, gal);
+
+         if( gal.Multiplicity() <= 1 )
+             apt.cur_leftend += gal.AlignmentLength();   // no point in refining intervals that are unaligned anyways
+         else
+             refineAlignment( gal, ancestor, profile_aln, apt );
+
+         ConstructSuperIntervalFromMSA(ancestor, sivI, gal);
+
+         // print a progress message
+         const double progress_now = ((double)apt.cur_leftend / (double)apt.total_len)*100.0;
+         printProgress((uint)apt.prev_progress, (uint)progress_now, cout);
+         apt.prev_progress = progress_now;
+     }
+
+     printMemUsage();
+     cout << "Fix left ends\n";
+     FixLeftEnds(ancestor);
+     printMemUsage();
+
+     if( debug_aligner )
+     {
+         const node_id_t first_child = alignment_tree[ancestor].children[0];
+         const node_id_t second_child = alignment_tree[ancestor].children[1];
+         validateSuperIntervals(first_child, second_child, ancestor);
+     }
+     cout << "\ndone.\n";
+ }
+
+ /**
+  * Fixes all SuperInterval left-end coordinates for the nodes at and below ancestor.
+  * Nodes are processed children-first.  At every node without a sequence the super-intervals
+  * are laid out back to back starting at coordinate 1, with their length set to the
+  * alignment length of their reference interval.  The reference interval is then
+  * re-anchored on the child super-intervals it points to (c1_siv / c2_siv):
+  *  - when it consists of exactly one CompactGappedAlignment, left end and length of the
+  *    match and of the interval are copied from the child super-interval;
+  *  - otherwise the interval and all of its matches are shifted by the difference between
+  *    the child's left end and the interval's current left end.
+  * @param ancestor  root of the subtree to fix
+  */
+ void ProgressiveAligner::FixLeftEnds( node_id_t ancestor )
+ {
+     const size_t no_siv = (std::numeric_limits<size_t>::max)();    // "no child super-interval"
+
+     stack< node_id_t > pending;
+     pending.push( ancestor );
+     vector<bool> expanded( alignment_tree.size(), false );
+     while( !pending.empty() )
+     {
+         const node_id_t cur_node = pending.top();
+         if( !expanded[cur_node] )
+         {
+             // first encounter: schedule the children, this node is handled when it resurfaces
+             for( size_t childI = 0; childI < alignment_tree[cur_node].children.size(); childI++ )
+                 pending.push( alignment_tree[cur_node].children[childI] );
+             expanded[cur_node] = true;
+             continue;
+         }
+         pending.pop();
+         if( alignment_tree[cur_node].sequence != NULL )
+             continue;   // don't do anything on leaf nodes
+
+         vector< SuperInterval >& siv_list = alignment_tree[cur_node].ordering;
+         gnSeqI next_left_end = 1;
+         for( size_t sivI = 0; sivI < siv_list.size(); sivI++ )
+         {
+             SuperInterval& siv = siv_list[sivI];
+             Interval& ref_iv = siv.reference_iv;
+
+             // lay the super-intervals out back to back
+             siv.SetLeftEnd(next_left_end);
+             siv.SetLength(ref_iv.AlignmentLength());
+             next_left_end += ref_iv.AlignmentLength();
+
+             CompactGappedAlignment<>* m_cga = dynamic_cast<CompactGappedAlignment<>*>(ref_iv.GetMatches()[0]);
+             // refined intervals consist of exactly one CompactGappedAlignment
+             const bool refined = m_cga != NULL && ref_iv.GetMatches().size() <= 1;
+
+             for( uint childI = 0; childI <= 1; childI++ )
+             {
+                 const size_t child_sivI = childI == 0 ? siv.c1_siv : siv.c2_siv;
+                 if( child_sivI == no_siv )
+                     continue;
+                 const SuperInterval& c_siv = alignment_tree[ alignment_tree[cur_node].children[childI] ].ordering[ child_sivI ];
+
+                 if( refined )
+                 {
+                     // copy the child's coordinates into the match and the interval
+                     m_cga->SetLeftEnd(childI, c_siv.LeftEnd());
+                     ref_iv.SetLeftEnd(childI, c_siv.LeftEnd());
+                     m_cga->SetLength(c_siv.Length(), childI);
+                     ref_iv.SetLength(c_siv.Length(), childI);
+                     ref_iv.SetOrientation(childI, m_cga->Orientation(childI));
+                 }
+                 else
+                 {
+                     // this one wasn't refined, just move it appropriately
+                     int64 diff = c_siv.LeftEnd() - ref_iv.LeftEnd(childI);
+                     ref_iv.SetLeftEnd(childI, c_siv.LeftEnd());
+                     const vector< AbstractMatch* >& matches = ref_iv.GetMatches();
+                     for( size_t mI = 0; mI < matches.size(); mI++ )
+                     {
+                         if( matches[mI]->LeftEnd(childI) == NO_MATCH )
+                             continue;
+                         matches[mI]->SetLeftEnd(childI, matches[mI]->LeftEnd(childI) + diff);
+                     }
+                 }
+             }
+
+             if( debug_cga && m_cga && !m_cga->validate() )
+                 cerr << "oh junkedy\n";
+         }
+     }
+ }
+
+/**
+ * propagates an inversion of an ancestral SuperInterval to SuperIntervals in descendant nodes
+ */
+void propagateInvert( PhyloTree< AlignmentTreeNode >& alignment_tree, node_id_t ancestor, size_t ans_siv )
+{
+ stack< pair< node_id_t, size_t > > node_siv_stack;
+ node_siv_stack.push( make_pair(ancestor, ans_siv) );
+ while( node_siv_stack.size() > 0 )
+ {
+ pair< node_id_t, size_t > cur = node_siv_stack.top();
+ node_siv_stack.pop();
+ node_id_t cur_node = cur.first;
+ if( alignment_tree[cur_node].ordering[cur.second].c1_siv != (std::numeric_limits<size_t>::max)() )
+ node_siv_stack.push( make_pair( alignment_tree[cur_node].children[0], alignment_tree[cur_node].ordering[cur.second].c1_siv ) );
+ if( alignment_tree[cur_node].ordering[cur.second].c2_siv != (std::numeric_limits<size_t>::max)() )
+ node_siv_stack.push( make_pair( alignment_tree[cur_node].children[1], alignment_tree[cur_node].ordering[cur.second].c2_siv ) );
+ if( cur_node == ancestor )
+ continue; // don't do anything at the ancestor
+ if( alignment_tree[cur_node].sequence != NULL )
+ continue; // don't do anything on leaf nodes
+
+ // reverse the homology structure at this node
+ Interval& ref_iv = alignment_tree[cur_node].ordering[cur.second].reference_iv;
+ vector< AbstractMatch* > matches;
+ ref_iv.StealMatches( matches );
+ AbstractMatch::orientation o0 = matches[0]->Orientation(0);
+ AbstractMatch::orientation o1 = matches[0]->Orientation(1);
+ matches[0]->Invert();
+ if( o0 != AbstractMatch::undefined )
+ matches[0]->SetOrientation(0,o0);
+ if( o1 != AbstractMatch::undefined )
+ matches[0]->SetOrientation(1,o1);
+ ref_iv.SetMatches( matches );
+ if( o0 != AbstractMatch::undefined )
+ {
+ ref_iv.SetOrientation(0,o0);
+ ref_iv.SetLeftEnd(0,0);
+ }
+ if( o1 != AbstractMatch::undefined )
+ {
+ ref_iv.SetOrientation(1,o1);
+ ref_iv.SetLeftEnd(1,0);
+ }
+ }
+}
+
+
+ /**
+  * Rebuilds the reference intervals of a super-interval and of the super-intervals linked
+  * below it from a multiple sequence alignment.
+  * The linked super-intervals are visited children-first.  For every visited node without a
+  * sequence, each alignment column is reduced to two bits: does any sequence below child 0
+  * have a character there, and does any sequence below child 1.  Columns that are empty on
+  * both sides are dropped.  The result becomes a two-row CompactGappedAlignment which
+  * replaces the matches of the node's reference interval.  When the parent's reference
+  * interval holds this node in reverse orientation, the new alignment is inverted and the
+  * inversion is propagated to the descendants.
+  * @param ancestor  node owning the super-interval
+  * @param ans_siv   index of the super-interval at ancestor
+  * @param gal       the alignment to derive the intervals from
+  */
+ void ProgressiveAligner::ConstructSuperIntervalFromMSA( node_id_t ancestor, size_t ans_siv, GappedAlignment& gal )
+ {
+     const size_t no_siv = (std::numeric_limits<size_t>::max)();    // "no linked super-interval"
+     const vector< string >& aln_matrix = GetAlignment( gal, vector< gnSequence* >() );
+
+     stack< pair< node_id_t, size_t > > todo;
+     todo.push( make_pair(ancestor, ans_siv) );
+     vector<bool> expanded( alignment_tree.size(), false );
+     while( !todo.empty() )
+     {
+         const pair< node_id_t, size_t > cur = todo.top();
+         const node_id_t cur_node = cur.first;
+         if( !expanded[cur_node] )
+         {
+             // first encounter: schedule the linked child super-intervals, handle this node later
+             if( alignment_tree[cur_node].ordering[cur.second].c1_siv != no_siv )
+                 todo.push( make_pair( alignment_tree[cur_node].children[0], alignment_tree[cur_node].ordering[cur.second].c1_siv ) );
+             if( alignment_tree[cur_node].ordering[cur.second].c2_siv != no_siv )
+                 todo.push( make_pair( alignment_tree[cur_node].children[1], alignment_tree[cur_node].ordering[cur.second].c2_siv ) );
+             expanded[cur_node] = true;
+             continue;
+         }
+         todo.pop();
+         if( alignment_tree[cur_node].sequence != NULL )
+             continue;   // don't do anything on leaf nodes
+
+         // the node ids of the extant sequences below each child
+         vector< node_id_t > node1_seqs;
+         vector< node_id_t > node2_seqs;
+         getAlignedChildren( alignment_tree[cur_node].children[0], node1_seqs );
+         getAlignedChildren( alignment_tree[cur_node].children[1], node2_seqs );
+
+         // collapse the alignment to one presence bit per child and column
+         vector< bitset_t > m_aln(2, bitset_t( aln_matrix[0].size(), false ) );
+         gnSeqI child0_len = 0;
+         gnSeqI child1_len = 0;
+         gnSeqI kept_cols = 0;
+         for( size_t colI = 0; colI < aln_matrix[0].size(); colI++ )
+         {
+             bool child0_present = false;
+             for( uint sI = 0; sI < node1_seqs.size() && !child0_present; ++sI )
+                 child0_present = aln_matrix[node_sequence_map[node1_seqs[sI]]][colI] != '-';
+             bool child1_present = false;
+             for( uint sJ = 0; sJ < node2_seqs.size() && !child1_present; ++sJ )
+                 child1_present = aln_matrix[node_sequence_map[node2_seqs[sJ]]][colI] != '-';
+
+             if( !child0_present && !child1_present )
+                 continue;   // nothing in this column
+             if( child0_present )
+             {
+                 child0_len++;
+                 m_aln[0].set(kept_cols);
+             }
+             if( child1_present )
+             {
+                 child1_len++;
+                 m_aln[1].set(kept_cols);
+             }
+             kept_cols++;
+         }
+         m_aln[0].resize(kept_cols);
+         m_aln[1].resize(kept_cols);
+
+         SuperInterval& target = alignment_tree[cur_node].ordering[cur.second];
+
+         CompactGappedAlignment<> cga_template(m_aln.size(), kept_cols);
+         CompactGappedAlignment<>* cga = cga_template.Copy();
+         // at this point we have no idea where the left end should really be
+         cga->SetLeftEnd(0, child0_len > 0 ? 1 : 0);
+         cga->SetLeftEnd(1, child1_len > 0 ? 1 : 0);
+         for( uint compI = 0; compI < 2; ++compI )
+         {
+             if( cga->LeftEnd(compI) != NO_MATCH )
+                 cga->SetOrientation(compI, target.reference_iv.Orientation(compI));
+         }
+         cga->SetLength(child0_len,0);
+         cga->SetLength(child1_len,1);
+         cga->SetAlignment(m_aln);   // do this afterwards so that it can create the bitcount
+
+         // the alignment may need to be reversed if the aligned parent is reverse
+         bool reverse_me = false;
+         const size_t p_siv = target.parent_siv;
+         if( p_siv != no_siv )
+         {
+             const size_t p_node = alignment_tree[cur_node].parents[0];
+             const int p_child = alignment_tree[p_node].children[0] == cur_node ? 0 : 1;
+             reverse_me = alignment_tree[p_node].ordering[p_siv].reference_iv.Orientation(p_child) == AbstractMatch::reverse;
+         }
+         if( reverse_me )
+         {
+             cga->Invert();
+             // keep the orientations of the interval that is being replaced
+             for( uint compI = 0; compI < 2; ++compI )
+             {
+                 if( cga->LeftEnd(compI) != NO_MATCH )
+                     cga->SetOrientation(compI, target.reference_iv.Orientation(compI));
+             }
+             propagateInvert( alignment_tree, cur_node, cur.second );
+         }
+
+         // replace the reference interval by one that holds just the new alignment
+         target.reference_iv = Interval();
+         vector< AbstractMatch* > am_list(1, cga);
+         target.reference_iv.SetMatches( am_list );
+         // set these to zero so they don't interfere with coordinate translation
+         target.reference_iv.SetLeftEnd(0, 0);
+         target.reference_iv.SetLeftEnd(1, 0);
+     }
+ }
+
+ /**
+  * Bundle used while sorting: a CompactGappedAlignment, a pointer to a vector of bitsets
+  * and an AbstractMatch that travel together.
+  */
+ typedef boost::tuple<CompactGappedAlignment<>*, vector< bitset_t >*, AbstractMatch* > _sort_tracker_type;
+
+ /**
+  * Adapts a comparator for CompactGappedAlignment pointers so that it can order
+  * _sort_tracker_type tuples by their first element.
+  * The wrapped comparator is held by reference and must outlive this object.
+  */
+ template< class CompType >
+ class CgaBsComp
+ {
+ public:
+     CgaBsComp( CompType& wrapped ) : comp(wrapped) {}
+
+     /** @return the wrapped comparator's verdict on the first tuple elements of a and b */
+     bool operator()( const _sort_tracker_type& a, const _sort_tracker_type& b )
+     {
+         return comp( a.get<0>(), b.get<0>() );
+     }
+
+ protected:
+     CompType& comp;     /**< the comparator that does the actual work */
+ };
+
+ /**
+  * Multiplicity filter: keeps the matches whose Multiplicity() equals mult, in their
+  * original order, and calls Free() on all others before removing them from the vector.
+  * @param matches  vector of match pointers, filtered in place
+  * @param mult     the multiplicity a match must have to be kept (default 2)
+  */
+ template< typename MatchVector >
+ void multFilter( MatchVector& matches, uint mult = 2 )
+ {
+     const size_t total = matches.size();
+     size_t kept = 0;    // number of survivors, also the next free slot at the front
+     for( size_t scanI = 0; scanI < total; ++scanI )
+     {
+         if( matches[scanI]->Multiplicity() != mult )
+         {
+             matches[scanI]->Free();
+             continue;
+         }
+         // compact the survivors towards the front
+         matches[kept] = matches[scanI];
+         ++kept;
+     }
+     // drop the stale tail left behind by the compaction
+     matches.erase(matches.begin()+kept, matches.end());
+ }
+
+ /**
+  * Requires at least some number of aligned pairs in each anchor.
+  * For every match the lengths of all components with LeftEnd != NO_MATCH are summed up;
+  * the match is kept only when that sum exceeds AlignmentLength() by more than length.
+  * Matches that fail the test are released with Free() and removed from the vector; the
+  * order of the survivors is preserved.
+  * @param matches  vector of match pointers, filtered in place
+  * @param length   required surplus of summed component lengths over the alignment length
+  */
+ template< typename MatchVector >
+ void alignedNtCountFilter( MatchVector& matches, uint length )
+ {
+     size_t cur = 0;     // next free slot for a surviving match
+     for( size_t mI = 0; mI < matches.size(); ++mI )
+     {
+         size_t len_sum = 0;
+         for( size_t seqI = 0; seqI < matches[mI]->SeqCount(); seqI++ )
+             if(matches[mI]->LeftEnd(seqI) != NO_MATCH)
+                 len_sum += matches[mI]->Length(seqI);
+
+         // len_sum is unsigned: without the first test, len_sum - length would wrap around
+         // for len_sum < length and let matches through that are far too short
+         if( len_sum >= length && len_sum - length > matches[mI]->AlignmentLength() )
+             matches[cur++] = matches[mI];
+         else
+             matches[mI]->Free();
+     }
+     matches.erase(matches.begin()+cur, matches.end());
+ }
+
+
+ bool debugging_cltm = false;
+ /**
+  * Converts matches among the two children of an ancestral node into tracking matches that
+  * carry coordinates and alignment bitsets for the nodes below the ancestor.
+  * Steps:
+  *  1. The ancestral matches are split at the descendants' breakpoints
+  *     (propagateDescendantBreakpoints).
+  *  2. For each match a CompactGappedAlignment with one row per tree node is created and
+  *     the rows of the two children are initialised from the match.
+  *  3. Walking down from both children, every match is located in the super-interval of the
+  *     current node that contains it, the matching range of that super-interval is
+  *     extracted, and coordinates plus alignment bitsets of the node's two children are
+  *     derived from it.
+  *  4. The bitsets are installed in the CompactGappedAlignments and tracking_matches is
+  *     filled.  Each result is checked for having a match in at least one node that maps
+  *     to a sequence; failures are reported on cout / cerr.
+  * Inconsistencies detected on the way are reported on cerr followed by breakHere().
+  * @param ancestral_node     the node whose children the matches relate
+  * @param ancestral_matches  2-way matches among the children; modified by step 1
+  * @param tracking_matches   resized to the number of matches and filled in
+  */
+ void ProgressiveAligner::constructLcbTrackingMatches(
+     node_id_t ancestral_node,
+     vector< AbstractMatch* >& ancestral_matches,
+     vector< LcbTrackingMatch< AbstractMatch* > >& tracking_matches
+     )
+ {
+     node_id_t child_0 = alignment_tree[ancestral_node].children[0];
+     node_id_t child_1 = alignment_tree[ancestral_node].children[1];
+     // split up matches at descendant's breakpoints
+     propagateDescendantBreakpoints( child_0, 0, ancestral_matches );
+     propagateDescendantBreakpoints( child_1, 1, ancestral_matches );
+
+     // store alignment bitvectors for each match: bs[match][node]
+     vector< bitset_t > bs_tmp(alignment_tree.size());
+     vector< vector< bitset_t > > bs(ancestral_matches.size(), bs_tmp);
+     // (alignment under construction, its bitsets in bs, the originating match)
+     vector< _sort_tracker_type > cga_list;
+     // initialize alignment bitvectors
+     for( size_t mI = 0; mI < ancestral_matches.size(); mI++ )
+     {
+         vector< bitset_t > aln( alignment_tree.size(), bitset_t(ancestral_matches[mI]->AlignmentLength() ) );
+         swap( bs[mI], aln );
+         ancestral_matches[mI]->GetAlignment(aln);
+         // rows 0 and 1 of the match's alignment belong to child_0 and child_1
+         swap( bs[mI][child_0], aln[0] );
+         swap( bs[mI][child_1], aln[1] );
+         CompactGappedAlignment<> c(alignment_tree.size(),0);
+         c.SetLeftEnd(child_0, ancestral_matches[mI]->LeftEnd(0));
+         c.SetOrientation(child_0, ancestral_matches[mI]->Orientation(0));
+         c.SetLength(ancestral_matches[mI]->Length(0), child_0);
+         c.SetLeftEnd(child_1, ancestral_matches[mI]->LeftEnd(1));
+         c.SetOrientation(child_1, ancestral_matches[mI]->Orientation(1));
+         c.SetLength(ancestral_matches[mI]->Length(1), child_1);
+         // bs is not resized after this point, so &bs[mI] stays valid
+         cga_list.push_back(make_tuple(c.Copy(), &bs[mI], ancestral_matches[mI]));
+     }
+
+     // walk down the tree below both children
+     stack<node_id_t> node_stack;
+     node_stack.push(child_0);
+     node_stack.push(child_1);
+     while(node_stack.size() > 0)
+     {
+         node_id_t cur_node = node_stack.top();
+         node_stack.pop();
+         if( alignment_tree[cur_node].children.size() == 0 )
+             continue;   // nothing below this node
+         node_stack.push(alignment_tree[cur_node].children[0]);
+         node_stack.push(alignment_tree[cur_node].children[1]);
+
+         // do processing for cur_node...
+         // 1. determine which interval in the current node each match falls into
+         // 2. determine the offset of this match in that interval
+         // 3. translate with that interval
+
+         vector< SuperInterval >& siv_list = alignment_tree[cur_node].ordering;
+         // order the matches by their start in cur_node so that matches and
+         // super-intervals can be walked in parallel
+         SingleStartComparator< CompactGappedAlignment<> > ssc(cur_node);
+         CgaBsComp< SingleStartComparator< CompactGappedAlignment<> > > comp( ssc );
+         sort(cga_list.begin(), cga_list.end(), comp);
+         size_t mI = 0;
+         size_t sivI = 0;
+         while( mI < cga_list.size() && sivI < siv_list.size() )
+         {
+             CompactGappedAlignment<>* cur_match = cga_list[mI].get<0>();
+             if( cur_match->Start(cur_node) == 0 )
+             {
+                 mI++;
+                 continue;   // this one doesn't match in this lineage!!
+             }
+             if( cur_match->LeftEnd(cur_node) >= siv_list[sivI].LeftEnd() + siv_list[sivI].Length() )
+             {
+                 // the match starts behind the current super-interval
+                 sivI++;
+                 continue;
+             }
+
+             if( cur_match->LeftEnd(cur_node) + cur_match->Length(cur_node) >
+                 siv_list[sivI].LeftEnd() + siv_list[sivI].Length() )
+             {
+                 // the match extends beyond the super-interval it starts in
+                 cerr << "doesn't fit\n";
+                 cerr << "cga_list[" << mI << "]->LeftEnd(" << cur_node << "): " << cur_match->LeftEnd(cur_node) << endl;
+                 cerr << "cga_list[" << mI << "]->RightEnd(" << cur_node << "): " << cur_match->RightEnd(cur_node) << endl;
+                 breakHere();
+             }
+
+             // extract the region of the siv matched by the current match
+             CompactGappedAlignment<>* siv_cga = dynamic_cast<CompactGappedAlignment<>*>(siv_list[sivI].reference_iv.GetMatches()[0]);
+             if( siv_list[sivI].reference_iv.GetMatches().size() > 1 )
+                 siv_cga = NULL;
+             if( siv_cga == NULL )
+             {
+                 // the interval is not a single CompactGappedAlignment yet: convert it
+                 // and store the converted form in the interval
+                 CompactGappedAlignment<> tmp_cga;
+                 siv_cga = tmp_cga.Copy();
+                 *siv_cga = CompactGappedAlignment<>(siv_list[sivI].reference_iv);
+                 vector<AbstractMatch*> tmp_matches(1,siv_cga);
+                 siv_list[sivI].reference_iv.SetMatches(tmp_matches);
+             }
+             CompactGappedAlignment<> new_cga;
+             siv_cga->copyRange(new_cga, cur_match->LeftEnd(cur_node) - siv_list[sivI].LeftEnd(), cur_match->Length(cur_node));
+             if( cur_match->Orientation(cur_node) == AbstractMatch::reverse )
+                 new_cga.Invert();
+             if( new_cga.Multiplicity() == 0 )
+             {
+                 cerr << "impossible! there's no match!\n";
+                 genome::breakHere();
+             }
+             // set the leftend in cga_list
+             for( uint cur_child = 0; cur_child < 2; cur_child++ )
+             {
+                 node_id_t sweet_child = alignment_tree[cur_node].children[cur_child];
+                 cur_match->SetLeftEnd(sweet_child, new_cga.LeftEnd(cur_child));
+                 if( new_cga.LeftEnd(cur_child) != NO_MATCH )
+                 {
+                     cur_match->SetOrientation(sweet_child, new_cga.Orientation(cur_child));
+                     cur_match->SetLength(new_cga.Length(cur_child), sweet_child);
+                 }
+             }
+
+             // prepare a cga for translation: a single row holding cur_node's bitset
+             CompactGappedAlignment<> c(1,(*cga_list[mI].get<1>())[cur_node].size());
+             c.SetLeftEnd(0,1);
+             c.SetLength((*cga_list[mI].get<1>())[cur_node].count(),0);
+             vector<bitset_t> bivouac(1, (*cga_list[mI].get<1>())[cur_node]);
+             c.SetAlignment(bivouac);
+
+             // now translate each child
+             for( uint cur_child = 0; cur_child < 2; cur_child++ )
+             {
+                 if( new_cga.Orientation(cur_child) == AbstractMatch::undefined )
+                     continue;
+                 CompactGappedAlignment<> cga_tmp = new_cga;
+                 cga_tmp.SetStart(cur_child, 1);
+                 c.translate(cga_tmp, cur_child, 0, false);
+                 // adjust for end-gaps: pad to the full width and shift by the leading gap of cur_node's row
+                 bitset_t bs = (cga_tmp.GetAlignment())[cur_child];
+                 bs.resize(c.GetAlignment()[0].size(), false);
+                 bs <<= c.GetAlignment()[0].find_first();
+                 node_id_t sweet_child = alignment_tree[cur_node].children[cur_child];
+                 swap( (*cga_list[mI].get<1>())[sweet_child], bs );
+                 // sanity check: bitsets that are in use must have the same width as the new one
+                 for( size_t testI = 0; testI < cga_tmp.SeqCount(); ++testI )
+                 {
+                     if( ((*cga_list[mI].get<1>())[testI].size() != 0 && (*cga_list[mI].get<1>())[testI].size() != (*cga_list[mI].get<1>())[sweet_child].size() ) )
+                     {
+                         cerr << "bj0rk3l\n";
+                         genome::breakHere();
+                     }
+                 }
+             }
+
+             debugging_cltm = false;
+             mI++;   // advance to the next match
+         }
+     }
+     tracking_matches.resize( cga_list.size() );
+     // finally, construct CompactGappedAlignments out of the bitsets
+     for( size_t bsI = 0; bsI < cga_list.size(); ++bsI )
+     {
+         cga_list[bsI].get<0>()->SetAlignment(*cga_list[bsI].get<1>());
+         cga_list[bsI].get<0>()->validate();
+         TrackingMatch& ltm = tracking_matches[bsI];
+         ltm.node_match = cga_list[bsI].get<0>();
+         ltm.original_match = cga_list[bsI].get<2>();
+         ltm.match_id = bsI;
+
+         // every tracking match has to be present in at least one node that maps to a sequence
+         bool found_extant = false;
+         for( size_t i = 0; i < alignment_tree.size()-1; ++i )
+         {
+             // Compare against the uint sentinel, as pairwiseScoreTrackingMatches() does.
+             // The previous code widened the entry to size_t and compared it with the
+             // size_t maximum, which can never be equal when the map stores uint values
+             // and size_t is wider than uint, so unmapped nodes were not skipped.
+             const uint im = node_sequence_map[i];
+             if( im == (std::numeric_limits<uint>::max)() )
+                 continue;
+             if( ltm.node_match->LeftEnd(i) != NO_MATCH )
+                 found_extant = true;
+         }
+         if( !found_extant )
+         {
+             cout << "orig aln len: " << ltm.original_match->AlignmentLength() << endl;
+             cout << "orig lend 0: " << ltm.original_match->Start(0) << endl;
+             cout << "orig lend 1: " << ltm.original_match->Start(1) << endl;
+             cout << "orig length 0: " << ltm.original_match->Length(0) << endl;
+             cout << "orig length 1: " << ltm.original_match->Length(1) << endl;
+
+             cerr << "this is an ungrounded match!!!\n";
+             genome::breakHere();
+         }
+     }
+ }
+
+ /**
+  * Counts the nodes at and below ancestor whose refined flag is not set.
+  * @param alignment_tree  the tree to inspect
+  * @param ancestor        root of the subtree to count in (included in the count)
+  * @return the number of nodes in the subtree with refined == false
+  */
+ size_t countUnrefined( PhyloTree< AlignmentTreeNode >& alignment_tree, node_id_t ancestor )
+ {
+     size_t pending_refinement = 0;
+     stack< node_id_t > todo;
+     todo.push(ancestor);
+     while( !todo.empty() )
+     {
+         const node_id_t cur_node = todo.top();
+         todo.pop();
+         // schedule all children (the loop body never runs for childless nodes)
+         for( size_t childI = 0; childI < alignment_tree[cur_node].children.size(); ++childI )
+             todo.push( alignment_tree[cur_node].children[childI] );
+         if( alignment_tree[cur_node].refined )
+             continue;
+         ++pending_refinement;
+     }
+     return pending_refinement;
+ }
+
+ /**
+  * Sets the refined flag on every node below ancestor; the flag of ancestor itself is
+  * cleared at the end.
+  * @param alignment_tree  the tree to update
+  * @param ancestor        root of the subtree; ends up with refined == false
+  */
+ void markAsRefined( PhyloTree< AlignmentTreeNode >& alignment_tree, node_id_t ancestor )
+ {
+     stack< node_id_t > node_stack;
+     node_stack.push(ancestor);
+     while( node_stack.size() > 0 )
+     {
+         node_id_t cur_node = node_stack.top();
+         node_stack.pop();
+         for( size_t childI = 0; childI < alignment_tree[cur_node].children.size(); ++childI )
+             node_stack.push( alignment_tree[cur_node].children[childI] );
+         alignment_tree[cur_node].refined = true;
+     }
+     // the traversal also flagged the ancestor; undo that
+     alignment_tree[ancestor].refined = false;
+ }
+
+
+
+ /**
+  * Scores every tracking match for every pair of descendants that map to a sequence and
+  * in which the match is present, then derives the scores of the remaining (ancestral)
+  * pairs via computeAvgAncestralMatchScores().
+  * @param tracking_matches   the matches to score
+  * @param node1_descendants  nodes indexing the second dimension of the score array
+  * @param node2_descendants  nodes indexing the third dimension of the score array
+  * @param tm_score_array     resized to [matches][node1_descendants][node2_descendants];
+  *                           entries of pairs skipped here are not assigned by this loop
+  */
+ void ProgressiveAligner::pairwiseScoreTrackingMatches(
+     std::vector< TrackingMatch >& tracking_matches,
+     std::vector<node_id_t>& node1_descendants,
+     std::vector<node_id_t>& node2_descendants,
+     boost::multi_array< double, 3 >& tm_score_array)
+ {
+     // node_sequence_map value of nodes that do not map to a sequence
+     const uint not_extant = (std::numeric_limits<uint>::max)();
+
+     tm_score_array.resize( boost::extents[tracking_matches.size()][node1_descendants.size()][node2_descendants.size()] );
+     for( size_t mI = 0; mI < tracking_matches.size(); ++mI )
+     {
+         AbstractMatch* node_match = tracking_matches[mI].node_match;
+         for( size_t nI = 0; nI < node1_descendants.size(); ++nI )
+         {
+             const node_id_t cur_n1 = node1_descendants[nI];
+             if( node_sequence_map[cur_n1] == not_extant ||
+                 node_match->LeftEnd(cur_n1) == NO_MATCH )
+                 continue;   // not extant or the match is absent from this node
+
+             for( size_t nJ = 0; nJ < node2_descendants.size(); ++nJ )
+             {
+                 const node_id_t cur_n2 = node2_descendants[nJ];
+                 if( node_sequence_map[cur_n2] == not_extant ||
+                     node_match->LeftEnd(cur_n2) == NO_MATCH )
+                     continue;   // not extant or no match between this pair
+
+                 const size_t nsmI = node_sequence_map[cur_n1];
+                 const size_t nsmJ = node_sequence_map[cur_n2];
+
+                 // view the multi-node match as a pairwise match among the two nodes
+                 PairwiseMatchAdapter pma( node_match, cur_n1, cur_n2 );
+                 vector< AbstractMatch* > lcb_vect( 1, &pma );
+                 vector< gnSequence* > ex_seqs(2);
+                 ex_seqs[0] = alignment_tree[ cur_n1 ].sequence;
+                 ex_seqs[1] = alignment_tree[ cur_n2 ].sequence;
+
+                 tm_score_array[mI][nI][nJ] = GetPairwiseAnchorScore(lcb_vect, ex_seqs, subst_scoring, sol_list[nsmI], sol_list[nsmJ]);
+             }
+         }
+     }
+     computeAvgAncestralMatchScores(tracking_matches, node1_descendants, node2_descendants, tm_score_array);
+ }
+
+/**
+ * Builds consensus match scores for node pairs that were not scored directly.
+ *
+ * For every (node1 descendant, node2 descendant) pair, the nodes returned by
+ * getAlignedChildren() for each side are looked up in the descendant lists.
+ * Unless both sides yield exactly one node (such a pair is skipped), the score
+ * of each match for the pair becomes the mean of the scores stored for all
+ * child pairs in which the match is defined (LeftEnd != NO_MATCH) on both
+ * sides.  Entries with no contributing child pair are left untouched.
+ *
+ * @param tracking_matches   matches whose scores are averaged
+ * @param node1_descendants  node ids indexing the second dimension of tm_score_array
+ * @param node2_descendants  node ids indexing the third dimension of tm_score_array
+ * @param tm_score_array     read and updated in place
+ */
+void ProgressiveAligner::computeAvgAncestralMatchScores(
+    std::vector< TrackingMatch >& tracking_matches,
+    std::vector<node_id_t>& node1_descendants,
+    std::vector<node_id_t>& node2_descendants,
+    boost::multi_array< double, 3 >& tm_score_array)
+{
+    for( uint rowI = 0; rowI < node1_descendants.size(); rowI++ )
+    {
+        for( uint colI = 0; colI < node2_descendants.size(); colI++ )
+        {
+            node_id_t row_node = node1_descendants[rowI];
+            node_id_t col_node = node2_descendants[colI];
+
+            vector<node_id_t> leaves1;
+            vector<node_id_t> leaves2;
+            getAlignedChildren(row_node, leaves1);
+            getAlignedChildren(col_node, leaves2);
+            if( leaves1.size() == 1 && leaves2.size() == 1 )
+                continue;   // single node on both sides: pair was already scored
+
+            // position of each aligned child within the descendant lists
+            vector< node_id_t > leaf_rows(leaves1.size());
+            for( size_t k = 0; k < leaves1.size(); ++k )
+                leaf_rows[k] = std::find( node1_descendants.begin(), node1_descendants.end(), leaves1[k] ) - node1_descendants.begin();
+            vector< node_id_t > leaf_cols(leaves2.size());
+            for( size_t k = 0; k < leaves2.size(); ++k )
+                leaf_cols[k] = std::find( node2_descendants.begin(), node2_descendants.end(), leaves2[k] ) - node2_descendants.begin();
+
+            // average every match over the child pairs that contain it
+            for( size_t matchI = 0; matchI < tracking_matches.size(); ++matchI )
+            {
+                uint pair_count = 0;
+                double total = 0;
+                for( size_t a = 0; a < leaves1.size(); ++a )
+                {
+                    if( tracking_matches[matchI].node_match->LeftEnd(leaves1[a]) == NO_MATCH )
+                        continue;
+                    for( size_t b = 0; b < leaves2.size(); ++b )
+                    {
+                        if( tracking_matches[matchI].node_match->LeftEnd(leaves2[b]) != NO_MATCH )
+                        {
+                            ++pair_count;
+                            total += tm_score_array[matchI][leaf_rows[a]][leaf_cols[b]];
+                        }
+                    }
+                }
+                if( pair_count > 0 )
+                    tm_score_array[matchI][rowI][colI] = total / (double)pair_count;
+            }
+        }
+    }
+}
+
+
+/**
+ * Fills the breakpoint-distance and conservation-distance matrices for all
+ * (node1 descendant, node2 descendant) pairs.
+ *
+ * Pairs in which both nodes have a node_sequence_map entry different from the
+ * uint maximum are copied from this->bp_distance / this->conservation_distance.
+ * Every other pair, except those where getAlignedChildren() yields exactly one
+ * node on both sides, accumulates the values of all of its aligned-child
+ * pairs and is divided by the number of such pairs.
+ *
+ * @param bp_dist_mat        resized to [node1 descendants][node2 descendants] and filled
+ * @param cons_dist_mat      resized to [node1 descendants][node2 descendants] and filled
+ * @param node1_descendants  node ids indexing the rows
+ * @param node2_descendants  node ids indexing the columns
+ */
+void ProgressiveAligner::computeInternalNodeDistances(
+    boost::multi_array<double, 2>& bp_dist_mat,
+    boost::multi_array<double, 2>& cons_dist_mat,
+    std::vector<node_id_t>& node1_descendants,
+    std::vector<node_id_t>& node2_descendants)
+{
+    bp_dist_mat.resize(boost::extents[node1_descendants.size()][node2_descendants.size()]);
+    cons_dist_mat.resize(boost::extents[node1_descendants.size()][node2_descendants.size()]);
+
+    // pass 1: copy the known distances for pairs that both map to a sequence
+    for( size_t rowI = 0; rowI < node1_descendants.size(); ++rowI )
+    {
+        if( node_sequence_map[node1_descendants[rowI]] == (std::numeric_limits<uint>::max)() )
+            continue;
+        for( size_t colI = 0; colI < node2_descendants.size(); ++colI )
+        {
+            if( node_sequence_map[node2_descendants[colI]] == (std::numeric_limits<uint>::max)() )
+                continue;
+            size_t seq_a = node_sequence_map[node1_descendants[rowI]];
+            size_t seq_b = node_sequence_map[node2_descendants[colI]];
+            bp_dist_mat[rowI][colI] = this->bp_distance[seq_a][seq_b];
+            cons_dist_mat[rowI][colI] = this->conservation_distance[seq_a][seq_b];
+        }
+    }
+
+    // pass 2: consensus (ancestral) distances as the mean over aligned children
+    for( uint rowI = 0; rowI < node1_descendants.size(); rowI++ )
+    {
+        for( uint colI = 0; colI < node2_descendants.size(); colI++ )
+        {
+            node_id_t row_node = node1_descendants[rowI];
+            node_id_t col_node = node2_descendants[colI];
+
+            vector<node_id_t> leaves1;
+            vector<node_id_t> leaves2;
+            getAlignedChildren(row_node, leaves1);
+            getAlignedChildren(col_node, leaves2);
+            if( leaves1.size() == 1 && leaves2.size() == 1 )
+                continue;   // single node on both sides: distance already present
+
+            // position of each aligned child within the descendant lists
+            vector< node_id_t > leaf_rows(leaves1.size());
+            for( size_t k = 0; k < leaves1.size(); ++k )
+                leaf_rows[k] = std::find( node1_descendants.begin(), node1_descendants.end(), leaves1[k] ) - node1_descendants.begin();
+            vector< node_id_t > leaf_cols(leaves2.size());
+            for( size_t k = 0; k < leaves2.size(); ++k )
+                leaf_cols[k] = std::find( node2_descendants.begin(), node2_descendants.end(), leaves2[k] ) - node2_descendants.begin();
+
+            // accumulate in place, then normalise by the number of child pairs
+            for( size_t a = 0; a < leaves1.size(); ++a )
+                for( size_t b = 0; b < leaves2.size(); ++b )
+                {
+                    bp_dist_mat[rowI][colI] += bp_dist_mat[leaf_rows[a]][leaf_cols[b]];
+                    cons_dist_mat[rowI][colI] += cons_dist_mat[leaf_rows[a]][leaf_cols[b]];
+                }
+            const double child_pairs = (double)(leaves1.size() * leaves2.size());
+            bp_dist_mat[rowI][colI] /= child_pairs;
+            cons_dist_mat[rowI][colI] /= child_pairs;
+        }
+    }
+
+}
+
+/**
+ * Computes the fraction of identical characters between two rows of an
+ * alignment, ignoring case and skipping every column in which either row
+ * holds a gap ('-').
+ *
+ * @param gal   alignment to inspect
+ * @param seqI  index of the first row
+ * @param seqJ  index of the second row
+ * @return identical columns / compared columns.  When no column is compared
+ *         the result is 0.0/0.0 (NaN on IEEE-754 platforms), which compares
+ *         false against any threshold.
+ */
+double computeID( GappedAlignment& gal, size_t seqI, size_t seqJ )
+{
+    const vector< string >& aln_mat = GetAlignment( gal, vector< gnSequence* >(gal.SeqCount(), NULL ));
+    double id = 0;
+    double possible = 0;
+    const size_t aln_length = gal.AlignmentLength();
+    for( size_t colI = 0; colI < aln_length; colI++ )
+    {
+        const char chI = aln_mat[seqI][colI];
+        const char chJ = aln_mat[seqJ][colI];
+        if( chI == '-' || chJ == '-' )
+            continue;
+        // toupper() is only defined for EOF and values representable as
+        // unsigned char, so a plain (possibly negative) char must be converted first
+        if( toupper( static_cast<unsigned char>(chI) ) == toupper( static_cast<unsigned char>(chJ) ) )
+            id++;
+        possible++;
+    }
+    return id / possible;
+}
+
+
+//
+// different option -- just pick a representative from leaf(A) and leaf(B) to translate
+//
+/**
+ * For every match in original_ml, builds one two-way representative match
+ * from the first sequence below node1 and the first sequence below node2 in
+ * which the match is defined, translates the representatives into the
+ * coordinates of node1 / node2 and appends them to ancestral_matches.
+ * Overlaps are finally removed with EliminateOverlaps_v2().
+ *
+ * @param node1_seqs         node ids of the sequences below node1
+ * @param node2_seqs         node ids of the sequences below node2
+ * @param node1              target node for component 0 of each representative
+ * @param node2              target node for component 1 of each representative
+ * @param ancestor           not used by this function
+ * @param ancestral_matches  receives the translated representatives
+ */
+void ProgressiveAligner::getRepresentativeAncestralMatches( const vector< node_id_t > node1_seqs, const vector< node_id_t > node2_seqs, node_id_t node1, node_id_t node2, node_id_t ancestor, std::vector< AbstractMatch* >& ancestral_matches )
+{
+    // representatives are bucketed by the (node1 sequence, node2 sequence) pair they came from
+    boost::multi_array< vector< AbstractMatch* >, 2 > seq_matches( boost::extents[node1_seqs.size()][node2_seqs.size()] );
+    for( size_t mI = 0; mI < original_ml.size(); mI++ )
+    {
+        for( uint seqI = 0; seqI < node1_seqs.size(); seqI++ )
+        {
+            uint ii = this->node_sequence_map[node1_seqs[seqI]];
+            if( original_ml[mI]->LeftEnd(ii) == NO_MATCH )
+                continue;
+
+            for( uint seqJ = 0; seqJ < node2_seqs.size(); seqJ++ )
+            {
+                uint jj = this->node_sequence_map[node2_seqs[seqJ]];
+                if( original_ml[mI]->LeftEnd(jj) == NO_MATCH )
+                    continue;
+                Match mm( 2 );
+                Match* new_m = mm.Copy();
+                new_m->SetStart( 0, original_ml[mI]->Start(ii));
+                new_m->SetStart( 1, original_ml[mI]->Start(jj));
+                new_m->SetLength(original_ml[mI]->Length());
+                if( new_m->Start(0) < 0 )
+                    new_m->Invert();    // assign reference orientation to seq 0
+                seq_matches[seqI][seqJ].push_back( new_m );
+                break;  // one representative per match is enough
+            }
+            // the search over node2_seqs does not depend on seqI, so stop
+            // after the first node1 sequence that contains the match
+            break;
+        }
+    }
+    for( uint seqI = 0; seqI < node1_seqs.size(); seqI++ )
+        for( uint seqJ = 0; seqJ < node2_seqs.size(); seqJ++ )
+        {
+            translateGappedCoordinates( seq_matches[seqI][seqJ], 0, node1_seqs[seqI], node1 );
+            translateGappedCoordinates( seq_matches[seqI][seqJ], 1, node2_seqs[seqJ], node2 );
+            ancestral_matches.insert( ancestral_matches.end(), seq_matches[seqI][seqJ].begin(), seq_matches[seqI][seqJ].end() );
+        }
+
+    EliminateOverlaps_v2( ancestral_matches, true );
+}
+
+/**
+ * qsort()-style three-way comparison of two search_cache_t entries, derived
+ * from the boolean ordering mems::cache_comparator.
+ * @param e1  pointer to the first search_cache_t
+ * @param e2  pointer to the second search_cache_t
+ * @return -1 if *e1 orders before *e2, 1 if *e2 orders before *e1, 0 otherwise
+ */
+int cachecomp( const void* e1, const void* e2 )
+{
+    search_cache_t& lhs = *(search_cache_t*)e1;
+    search_cache_t& rhs = *(search_cache_t*)e2;
+    const bool lhs_first = mems::cache_comparator(lhs, rhs);
+    const bool rhs_first = mems::cache_comparator(rhs, lhs);
+    if( lhs_first )
+        return -1;
+    if( rhs_first )
+        return 1;
+    return 0;   // neither orders before the other
+}
+
+/**
+ * Aligns the profile at node1 to the profile at node2 and stores the result
+ * in alignment_tree[ancestor].ordering.
+ *
+ * The function iterates: anchors are translated to ancestral coordinates,
+ * filtered (overlap elimination, breakpoint elimination), turned into an
+ * ancestral ordering, and -- when `recursive` is set -- new anchors are
+ * searched for the next round.  The loop stops when
+ *  - collinear_genomes: the ancestral alignment length did not decrease, or
+ *  - otherwise: the anchoring score did not improve by more than 0.5%.
+ * The tree accepted in the last successful round is then restored, the
+ * ancestral breakpoints are propagated, and gapped alignment / refinement is
+ * run depending on the gapped_alignment and refine members.
+ *
+ * @param node1     first child profile
+ * @param node2     second child profile
+ * @param ancestor  node receiving the merged ordering
+ */
+void ProgressiveAligner::alignProfileToProfile( node_id_t node1, node_id_t node2, node_id_t ancestor )
+{
+ // 1) find all pairwise matches
+ // 2) convert to pairwise matches among the ancestral sequences
+ // - delete inconsistently aligned regions?
+ // 3) perform greedy b.p. elimination on the pairwise matches
+ // 4) extend LCBs
+ // 5) if total alignment weight hasn't changed, go to (8)
+ // 6) search for additional matches between each match among extant sequences
+ // 7) go back to 2
+ // 8) perform a MUSCLE/Clustal alignment of each intervening region
+
+ vector< node_id_t > node1_seqs; /**< the node id's of extant sequences below node 1 */
+ vector< node_id_t > node2_seqs; /**< the node id's of extant sequences below node 2 */
+ getAlignedChildren( node1, node1_seqs );
+ getAlignedChildren( node2, node2_seqs );
+
+ uint seqI, seqJ;
+ gnSeqI prev_ancestral_seq_len = (std::numeric_limits<gnSeqI>::max)();
+
+ printMemUsage();
+ cout << "get ancestral matches\n";
+
+ Matrix<MatchList> pairwise_matches( node1_seqs.size(), node2_seqs.size() );
+// getPairwiseMatches( node1_seqs, node2_seqs, pairwise_matches );
+ vector< AbstractMatch* > anc_pairwise_matches;
+ getRepresentativeAncestralMatches( node1_seqs, node2_seqs, node1, node2, ancestor, anc_pairwise_matches );
+ printMemUsage();
+
+ PhyloTree< AlignmentTreeNode > aln_tree_backup;
+
+ /** A cache of regions that were searched in the previous round of recursion */
+ Matrix< std::vector< search_cache_t > > search_cache_db(node1_seqs.size(), node2_seqs.size());
+ double prev_anchoring_score = -(std::numeric_limits<double>::max)();
+ double cur_anchoring_score = -(std::numeric_limits<double>::max)();
+
+ while(true)
+ {
+ vector<AbstractMatch*> ancestral_matches;
+ if( anc_pairwise_matches.size() > 0 )
+ {
+ // first round: use the representative matches computed above
+ ancestral_matches.insert( ancestral_matches.begin(), anc_pairwise_matches.begin(), anc_pairwise_matches.end() );
+ anc_pairwise_matches.clear();
+ }
+ else
+ {
+ // part 2, construct pairwise matches to the ancestral sequence
+ // A) for each pairwise match, translate its
+ // coordinates to the ancestral genome
+ // -- try to use translateCoordinates
+ // -- build a translation table for translateCoordinates
+
+ for( seqI = 0; seqI < node1_seqs.size(); seqI++ )
+ {
+ for( seqJ = 0; seqJ < node2_seqs.size(); seqJ++ )
+ {
+ cout << node_sequence_map[node1_seqs[seqI]] << "," << node_sequence_map[node2_seqs[seqJ]] << " has " << pairwise_matches(seqI,seqJ).size() << " pairwise matches\n";
+ cout.flush();
+
+ vector< AbstractMatch* > am_list( pairwise_matches(seqI, seqJ).begin(), pairwise_matches(seqI, seqJ).end() );
+ pairwise_matches(seqI, seqJ).clear();
+ translateGappedCoordinates( am_list, 1, node2_seqs[seqJ], node2 );
+ translateGappedCoordinates( am_list, 0, node1_seqs[seqI], node1 );
+ ancestral_matches.insert( ancestral_matches.end(), am_list.begin(), am_list.end() );
+ }
+ }
+ }
+ // include any matches from a previous iteration of this loop
+ for( size_t aI = 0; aI < alignment_tree[ancestor].ordering.size(); aI++ )
+ {
+ Interval& ref_iv = alignment_tree[ancestor].ordering[aI].reference_iv;
+ if( ref_iv.Multiplicity() == 2 )
+ for( size_t mI = 0; mI < ref_iv.GetMatches().size(); mI++ )
+ if( ref_iv.GetMatches()[mI]->Multiplicity() > 1 )
+ ancestral_matches.push_back( ref_iv.GetMatches()[mI]->Copy() );
+ }
+
+ // set seq 0 to forward ref. orientation
+ for( size_t mI = 0; mI < ancestral_matches.size(); ++mI )
+ if( ancestral_matches[mI]->Start(0) < 0 )
+ ancestral_matches[mI]->Invert();
+
+ // eliminate overlaps as they correspond to inconsistently or
+ // multiply aligned regions
+ EliminateOverlaps_v2( ancestral_matches );
+
+ multFilter( ancestral_matches );
+
+ vector< vector< AbstractMatch* > > LCB_list;
+ vector< LCB > adjacencies;
+ vector< gnSeqI > breakpoints;
+
+ if( !collinear_genomes )
+ {
+ cout << "Performing Sum-of-pairs Greedy Breakpoint Elimination\n";
+ cout.flush();
+ // project the pairwise matches at this node to all-possible pairs matches at descendant nodes
+ // keep a mapping of ancestral to extant matches so that when an ancestral match gets removed
+ // the match among extant nodes also gets removed
+ // how should candidate matches to remove be generated?
+ // one possibility is to remove entire ancestral LCBs... this may be problematic since ancestral
+ // LCBs don't correspond to the pairwise LCBs thus an ancestral LCB could be removed with no useful
+ // change in alignment score
+ //
+ //
+ // translate the matches into LcbTrackingMatches
+ printMemUsage();
+ cout << "construct LCB tracking matches\n";
+ vector< TrackingMatch > tracking_matches;
+ boost::multi_array< size_t, 3 > tm_lcb_id_array;
+ boost::multi_array< double, 3 > tm_score_array;
+ constructLcbTrackingMatches( ancestor, ancestral_matches, tracking_matches );
+
+ cout << "There are " << tracking_matches.size() << " tracking matches\n";
+ size_t used_components = 0;
+ for( size_t tmI = 0; tmI < tracking_matches.size(); ++tmI )
+ {
+ for( uint ssI = 0; ssI < tracking_matches[tmI].node_match->SeqCount(); ++ssI )
+ if( tracking_matches[tmI].node_match->LeftEnd(ssI) != NO_MATCH )
+ used_components++;
+ }
+ size_t total_components = tracking_matches.size() == 0 ? 0 : tracking_matches.size() * tracking_matches[0].node_match->SeqCount();
+ cout << "There are " << used_components << " / " << total_components << " components used\n";
+
+ vector<node_id_t> node1_descendants;
+ vector<node_id_t> node2_descendants;
+ if( scoring_scheme == ExtantSumOfPairsScoring )
+ {
+ node1_descendants = node1_seqs;
+ node2_descendants = node2_seqs;
+ }else{
+ getDescendants(alignment_tree, node1, node1_descendants);
+ getDescendants(alignment_tree, node2, node2_descendants);
+ }
+
+ //
+ // score the matches
+ //
+ printMemUsage();
+ cout << "init tracking match LCB tracking\n";
+ initTrackingMatchLCBTracking( tracking_matches, node1_descendants.size(), node2_descendants.size(), tm_lcb_id_array );
+ printMemUsage();
+ cout << "pairwise score tracking matches\n";
+ pairwiseScoreTrackingMatches( tracking_matches, node1_descendants, node2_descendants, tm_score_array );
+ printMemUsage();
+
+ // compute bp distances for the current node.
+ // ancestral nodes take the average distance of extant nodes
+ boost::multi_array<double, 2> bp_dist_mat;
+ boost::multi_array<double, 2> cons_dist_mat;
+ computeInternalNodeDistances( bp_dist_mat, cons_dist_mat, node1_descendants, node2_descendants);
+
+ vector< TrackingMatch* > t_matches(tracking_matches.size());
+ for( size_t mI = 0; mI < tracking_matches.size(); ++mI )
+ t_matches[mI] = &tracking_matches[mI];
+
+ // now sort these out into pairwise LCBs
+ cout << "get pairwise LCBs\n";
+ size_t pair_lcb_count = 0;
+ PairwiseLCBMatrix pairwise_adj_mat(boost::extents[node1_descendants.size()][node2_descendants.size()]);
+ for( uint nodeI = 0; nodeI < node1_descendants.size(); nodeI++ )
+ for( uint nodeJ = 0; nodeJ < node2_descendants.size(); nodeJ++ )
+ {
+ getPairwiseLCBs( node1_descendants[nodeI], node2_descendants[nodeJ], nodeI, nodeJ, t_matches, pairwise_adj_mat[nodeI][nodeJ], tm_score_array, tm_lcb_id_array );
+ pair_lcb_count += pairwise_adj_mat[nodeI][nodeJ].size();
+ }
+ cout << "there are " << pair_lcb_count << " pairwise LCBs\n";
+ printMemUsage();
+
+ // order by pointer value; required by the set_difference() below
+ sort( t_matches.begin(), t_matches.end() );
+
+ // other possibility, choose pairwise LCBs to remove. a score improvement is always guaranteed
+ // compute LCBs among descendant nodes
+ // this is a good idea. it factors out ancestral breakpoint decisions entirely
+ // need a data structure to track all pairwise LCBs that contain a given match
+ // template <class MatchType>
+ // class LcbTrackingMatch <MatchType>
+ // {
+ // public:
+ // MatchType node_match;
+ // boost::multi_array< size_t, 2 > lcb_id;
+ // }
+ // all pairwise LCBs would be evaluated for removal and the one that provides the greatest
+ // overall score improvement gets removed.
+ // upon removal, matches associated with that LCB would get removed, and any LCBs in other
+ // genomes would get removed if they no longer had any matches
+ // to pull this off, the LCB struct needs to store the set of matches directly
+ //
+ // but what about small cycles that appear only in 3 or more-way comparisons? are these
+ // important? umm, yeah, but only if you believe in evolution.
+ //
+ // so here's the dilly-oh: score against the ancestral ordering(s) *and* all pairwise orderings
+ // for an ancestor. ancestor contributes the sum of all descendants to the score and breakpoints
+ // are penalized as the sum of /participating/ descendants. a descendant is participating
+ // if it has some matching region defined within the LCB and if removal of that matching region
+ // eliminates a breakpoint in the pairwise comparison
+ cout << "scaling bp penalty by conservation weight:\n";
+ print2d_matrix(cons_dist_mat, cout);
+ cout << "\n\nscaling bp penalty by bp weight: \n";
+ print2d_matrix(bp_dist_mat, cout);
+ cout << "\nGreedy BPE\n";
+ vector< TrackingMatch* > final;
+ if(scoring_scheme == AncestralScoring)
+ {
+ // restrict scoring to the single (node1, node2) cell of the descendant matrix
+ vector<node_id_t>::iterator d1_iter = std::find( node1_descendants.begin(), node1_descendants.end(), node1 );
+ vector<node_id_t>::iterator d2_iter = std::find( node2_descendants.begin(), node2_descendants.end(), node2 );
+ size_t d1_index = d1_iter - node1_descendants.begin();
+ size_t d2_index = d2_iter - node2_descendants.begin();
+ EvenFasterSumOfPairsBreakpointScorer spbs( breakpoint_penalty, min_breakpoint_penalty, bp_dist_mat, cons_dist_mat,
+ t_matches, pairwise_adj_mat, node1_descendants, node2_descendants,
+ tm_score_array, tm_lcb_id_array, d1_index, d1_index+1, d2_index, d2_index+1 );
+ cur_anchoring_score = greedySearch( spbs );
+ final = spbs.getResults();
+ }else{
+ EvenFasterSumOfPairsBreakpointScorer spbs( breakpoint_penalty, min_breakpoint_penalty, bp_dist_mat, cons_dist_mat,
+ t_matches, pairwise_adj_mat, node1_descendants, node2_descendants,
+ tm_score_array, tm_lcb_id_array, 0, node1_descendants.size(), 0, node2_descendants.size() );
+ cur_anchoring_score = greedySearch( spbs );
+ final = spbs.getResults();
+ }
+ cout << "done\n";
+
+ // free memory used by pairwise projections
+ for( size_t mI = 0; mI < tracking_matches.size(); ++mI )
+ tracking_matches[mI].node_match->Free();
+
+ ancestral_matches.clear();
+
+ // free memory from deleted matches here
+ // (deleted_t_matches is NULL-padded; the first NULL marks the end of the difference)
+ std::sort(final.begin(), final.end());
+ vector< TrackingMatch* > deleted_t_matches( t_matches.size(), NULL );
+ std::set_difference( t_matches.begin(), t_matches.end(), final.begin(), final.end(), deleted_t_matches.begin() );
+ for( size_t delI = 0; delI < deleted_t_matches.size(); ++delI )
+ {
+ if( deleted_t_matches[delI] == NULL )
+ break;
+ deleted_t_matches[delI]->original_match->Free();
+ }
+
+ // convert back to an LCB list
+ vector< AbstractMatch* > new_matches(final.size());
+ for( size_t mI = 0; mI < final.size(); ++mI )
+ new_matches[mI] = final[mI]->original_match;
+
+ IdentifyBreakpoints( new_matches, breakpoints );
+ ComputeLCBs_v2( new_matches, breakpoints, LCB_list );
+
+ } // end if !collinear
+ else
+ { // if we are assuming all genomes are collinear, then we don't need the
+ // sophisticated pairwise breakpoint scoring and can get by with simple breakpoint
+ // penalties
+ IdentifyBreakpoints( ancestral_matches, breakpoints );
+ ComputeLCBs_v2( ancestral_matches, breakpoints, LCB_list );
+
+ vector< double > lcb_scores( LCB_list.size() );
+ double score_sum = 100; // anything > 0 would work. this will be the breakpoint penalty
+ for( size_t lcbI = 0; lcbI < LCB_list.size(); ++lcbI )
+ {
+ lcb_scores[lcbI] = SimpleGetLCBCoverage( LCB_list[lcbI] );
+ score_sum += lcb_scores[lcbI];
+ }
+
+ computeLCBAdjacencies_v3( LCB_list, lcb_scores, adjacencies );
+
+ // want to eliminate all breakpoints
+ SimpleBreakpointScorer wbs( adjacencies, score_sum, true );
+ cur_min_coverage = greedyBreakpointElimination_v4( adjacencies, lcb_scores, wbs, NULL, false );
+ vector<AbstractMatch*> deleted_matches;
+ filterMatches_v2( adjacencies, LCB_list, lcb_scores, deleted_matches );
+ for( size_t delI = 0; delI < deleted_matches.size(); ++delI )
+ deleted_matches[delI]->Free();
+ }
+ printMemUsage();
+
+ ancestral_matches.clear();
+
+ cout << "Arrived at " << LCB_list.size() << " intervals\n";
+ // create an ancestral ordering
+ vector< Interval* > pairwise_intervals;
+ Interval tmp_iv;
+ for( size_t lcbI = 0; lcbI < LCB_list.size(); lcbI++ )
+ {
+ pairwise_intervals.push_back( tmp_iv.Copy() );
+ pairwise_intervals.back()->SetMatches( LCB_list[lcbI] );
+ }
+ LCB_list.clear();
+
+ // total profile lengths of the two children
+ vector<gnSeqI> seq_lengths = vector<gnSeqI>(2,0);
+ for( size_t aI = 0; aI < alignment_tree[node1].ordering.size(); ++aI )
+ seq_lengths[0] += alignment_tree[node1].ordering[aI].Length();
+ for( size_t aI = 0; aI < alignment_tree[node2].ordering.size(); ++aI )
+ seq_lengths[1] += alignment_tree[node2].ordering[aI].Length();
+
+ cout << "Adding unaligned intervals\n";
+ addUnalignedIntervals_v2(pairwise_intervals, set<uint>(), seq_lengths);
+
+ cout << "addUnalignedIntervals yields " << pairwise_intervals.size() << " intervals\n";
+
+ bool borked = false;
+ if(debug_aligner)
+ borked = validatePairwiseIntervals(node1, node2, pairwise_intervals);
+
+ // merge unaligned intervals
+ cout << "Merging unaligned intervals\n";
+ cout.flush();
+ vector<Interval*> new_list1;
+ vector<Interval*> merged_intervals;
+ mergeUnalignedIntervals( 1, pairwise_intervals, new_list1 );
+ mergeUnalignedIntervals( 0, new_list1, merged_intervals );
+ cout << "Marbling gaps\n";
+ cout.flush();
+ for( size_t ivI = 0; ivI < merged_intervals.size(); ivI++ )
+ merged_intervals[ivI]->Marble(50);
+
+ cout << "Propagating descendant breakpoints\n";
+
+ // split up intervals at descendant's breakpoints
+ propagateDescendantBreakpoints( node1, 0, merged_intervals );
+ propagateDescendantBreakpoints( node2, 1, merged_intervals );
+
+ cout << "descendant 0(" << node1 << ") has " << alignment_tree[node1].ordering.size() << " intervals\n";
+ cout << "descendant 1(" << node2 << ") has " << alignment_tree[node2].ordering.size() << " intervals\n";
+ cout << "propagateDescendantBreakpoints yields " << merged_intervals.size() << " intervals\n";
+
+ if(debug_aligner)
+ borked = validatePairwiseIntervals(node1, node2, merged_intervals);
+ cout << "Creating ancestral ordering\n";
+ alignment_tree[ancestor].ordering.clear();
+ createAncestralOrdering( merged_intervals, alignment_tree[ancestor].ordering );
+ for( size_t ivI = 0; ivI < merged_intervals.size(); ivI++ )
+ merged_intervals[ivI]->Free();
+ merged_intervals.clear(); // free up some memory
+
+ if(debug_aligner)
+ validateSuperIntervals( node1, node2, ancestor );
+
+ // if we're not making any progress then bail out...
+ gnSeqI cur_ancestral_seq_len = 0;
+ for( size_t aI = 0; aI < alignment_tree[ancestor].ordering.size(); aI++ )
+ cur_ancestral_seq_len += alignment_tree[ancestor].ordering[aI].Length();
+
+ if( !collinear_genomes )
+ cout << "Previous anchoring score: " << prev_anchoring_score << ", new anchor score: " << cur_anchoring_score << endl;
+ else
+ cout << "Prev alignment len: " << prev_ancestral_seq_len << ", new alignment length: " << cur_ancestral_seq_len << endl;
+ // if cur_seq_len has decreased then we're improving
+ // if not, then we're done finding matches
+ if( collinear_genomes && cur_ancestral_seq_len >= prev_ancestral_seq_len )
+ break;
+
+ // stop unless we've increased the anchoring score by at least 0.5%
+ // the 0.5% is important for large alignments where many slow iterations might otherwise occur
+ // that only increase the anchoring score by a tiny amount
+ if( !collinear_genomes && cur_anchoring_score <= prev_anchoring_score + (genome::absolut(prev_anchoring_score)/200.0) )
+ break;
+ prev_anchoring_score = cur_anchoring_score;
+ prev_ancestral_seq_len = cur_ancestral_seq_len;
+
+ // accept the new alignment tree...
+ cout << "Backing up alignment tree...\n";
+ cout.flush();
+ aln_tree_backup = alignment_tree;
+
+ cout << "propagating ancestral breakpoints\n";
+ cout.flush();
+ recursiveApplyAncestralBreakpoints(ancestor);
+
+
+ if( debug_me )
+ {
+ // NOTE(review): dbg_count is not declared inside this block, unlike the
+ // equivalent debug block at the end of this function which declares a
+ // local static -- confirm it resolves to a counter declared elsewhere.
+ for( size_t aI = 0; aI < alignment_tree[ancestor].ordering.size(); aI++ )
+ {
+ GappedAlignment gal;
+ extractAlignment(ancestor, aI, gal);
+
+ // flag the interval if any pair of defined rows is less than 50% identical
+ bool check = false;
+ for( size_t ii = 0; ii < gal.SeqCount(); ++ii )
+ {
+ if( gal.LeftEnd(ii) == 0 )
+ continue;
+ for( size_t jj = 0; jj < gal.SeqCount(); ++jj )
+ {
+ if( gal.LeftEnd(jj) == 0 )
+ continue;
+ check = check || computeID( gal, ii, jj ) < .5;
+ }
+ }
+ if( check )
+ cerr << "check iv " << aI << " dbg_count " << dbg_count << endl;
+ else
+ continue;
+
+ const vector< string >& aln_mat = GetAlignment(gal, this->original_ml.seq_table);
+ gnSequence seq;
+ for( size_t seqI = 0; seqI < gal.SeqCount(); ++seqI )
+ if( gal.LeftEnd(seqI) != NO_MATCH )
+ seq += aln_mat[seqI];
+
+ stringstream dbg_fname;
+ dbg_fname << "prof_dbg_iv_" << aI << ".dbg." << dbg_count++ << ".fas";
+ ofstream debug_file( dbg_fname.str().c_str() );
+ gnFASSource::Write( seq, debug_file, false );
+ debug_file.close();
+ }
+ }
+
+ if(debug_aligner)
+ validateSuperIntervals( node1, node2, ancestor );
+
+ if(recursive)
+ {
+ // search for additional alignment anchors
+ cout << "recursive anchor search\n";
+ cout.flush();
+ Matrix<MatchList> matches;
+ Matrix< std::vector< search_cache_t > > new_cache_db(node1_seqs.size(), node2_seqs.size());
+ // initialize storage for intervening regions
+ boost::multi_array< std::vector< std::vector< int64 > >, 2 > iv_regions( boost::extents[node1_seqs.size()][node2_seqs.size()] );
+ for( seqI = 0; seqI < node1_seqs.size(); seqI++ )
+ for( seqJ = 0; seqJ < node2_seqs.size(); seqJ++ )
+ iv_regions[seqI][seqJ].resize(2);
+ vector< gnSequence* > bseqs( node1_seqs.size() + node2_seqs.size() );
+ for( size_t aI = 0; aI < alignment_tree[ancestor].ordering.size(); aI++ )
+ {
+ CompactGappedAlignment<> cga;
+ extractAlignment(ancestor, aI, cga);
+ recurseOnPairs(node1_seqs, node2_seqs, cga, matches, search_cache_db, new_cache_db, iv_regions);
+
+ // add any new matches to the pairwise_matches matrix
+ for( seqI = 0; seqI < node1_seqs.size(); seqI++ )
+ for( seqJ = 0; seqJ < node2_seqs.size(); seqJ++ )
+ pairwise_matches(seqI, seqJ).insert( pairwise_matches(seqI, seqJ).end(), matches(seqI, seqJ).begin(), matches(seqI, seqJ).end() );
+
+ }
+
+ // add seqs: node1 sequences first, node2 sequences after them
+ for( seqI = 0; seqI < node1_seqs.size(); seqI++ )
+ bseqs[seqI] = alignment_tree[ node1_seqs[seqI] ].sequence;
+ for( seqJ = 0; seqJ < node2_seqs.size(); seqJ++ )
+ bseqs[node1_seqs.size() + seqJ] = alignment_tree[ node2_seqs[seqJ] ].sequence;
+
+ MaskedMemHash nway_mh;
+ // now search intervening regions
+ for( seqI = 0; seqI < node1_seqs.size(); seqI++ )
+ for( seqJ = 0; seqJ < node2_seqs.size(); seqJ++ )
+ {
+ std::sort( iv_regions[seqI][seqJ][0].begin(), iv_regions[seqI][seqJ][0].end() );
+ std::sort( iv_regions[seqI][seqJ][1].begin(), iv_regions[seqI][seqJ][1].end() );
+ MatchList new_matches;
+ new_matches.seq_table.resize(2);
+ new_matches.seq_table[0] = bseqs[seqI];
+ new_matches.seq_table[1] = bseqs[node1_seqs.size() + seqJ];
+ SearchLCBGaps( new_matches, iv_regions[seqI][seqJ], nway_mh );
+ cout << seqI << "," << seqJ << " have " << new_matches.size() << " new matches outside LCBs\n";
+ pairwise_matches(seqI, seqJ).insert( pairwise_matches(seqI, seqJ).end(), new_matches.begin(), new_matches.end() );
+ }
+
+ if(using_cache_db)
+ {
+
+ for( seqI = 0; seqI < node1_seqs.size(); seqI++ )
+ {
+ for( seqJ = 0; seqJ < node2_seqs.size(); seqJ++ )
+ {
+ // release the previous round's cache entries before replacing them
+ for( size_t mI = 0; mI < search_cache_db(seqI,seqJ).size(); mI++ )
+ {
+ if( search_cache_db(seqI,seqJ)[mI].first != NULL )
+ search_cache_db(seqI,seqJ)[mI].first->Free();
+ if( search_cache_db(seqI,seqJ)[mI].second != NULL )
+ search_cache_db(seqI,seqJ)[mI].second->Free();
+ }
+ search_cache_db(seqI,seqJ).clear();
+ if(new_cache_db(seqI, seqJ).size() > 0)
+ {
+ // try sorting using C's qsort -- maybe there's something wrong with std::sort?
+ search_cache_t* sc_array = new search_cache_t[new_cache_db(seqI,seqJ).size()];
+ for( size_t i = 0; i < new_cache_db(seqI,seqJ).size(); i++ )
+ sc_array[i] = new_cache_db(seqI,seqJ)[i];
+ // the element size must be that of a whole cache entry (which carries
+ // both .first and .second), not of a single match pointer; otherwise
+ // qsort moves and compares mis-sized elements
+ qsort(sc_array, new_cache_db(seqI,seqJ).size(), sizeof(search_cache_t), cachecomp);
+
+ search_cache_db(seqI, seqJ).resize(new_cache_db(seqI,seqJ).size());
+ for( size_t i = 0; i < new_cache_db(seqI,seqJ).size(); i++ )
+ search_cache_db(seqI, seqJ)[i] = sc_array[i];
+ delete[] sc_array;
+
+ new_cache_db(seqI, seqJ).clear();
+ }
+ if( pairwise_matches(seqI,seqJ).size() > 0 )
+ cout << seqI << "," << seqJ << " has an additional " << pairwise_matches(seqI,seqJ).size() << " matches\n";
+ }
+ }
+
+ }
+ } // if recursive
+
+ // restore backed up tree since we only want the final set of ancestral
+ // breakpoints applied to the descendants
+ cout << "Restoring backed up alignment tree...\n";
+ cout.flush();
+ swap( alignment_tree, aln_tree_backup );
+
+ } // end while(true)
+
+ if( using_cache_db )
+ {
+ // delete the search cache
+ for( seqI = 0; seqI < node1_seqs.size(); seqI++ )
+ for( seqJ = 0; seqJ < node2_seqs.size(); seqJ++ )
+ for( size_t mI = 0; mI < search_cache_db(seqI,seqJ).size(); mI++ )
+ {
+ if( search_cache_db(seqI,seqJ)[mI].first != NULL )
+ search_cache_db(seqI,seqJ)[mI].first->Free();
+ if( search_cache_db(seqI,seqJ)[mI].second != NULL )
+ search_cache_db(seqI,seqJ)[mI].second->Free();
+ }
+ }
+
+ printMemUsage();
+
+ // aln_tree_backup has the highest scoring alignment_tree
+ swap( alignment_tree, aln_tree_backup );
+ cout << "propagating ancestral breakpoints\n";
+ recursiveApplyAncestralBreakpoints(ancestor);
+
+ printMemUsage();
+
+ // step 8) construct a muscle alignment in each intervening region
+ if( gapped_alignment )
+ {
+ cout << "performing a gapped alignment\n";
+ doGappedAlignment(ancestor, true);
+ }else
+ cout << "skipping gapped alignment\n";
+ if( refine )
+ {
+ // refine only once more than 5 unrefined nodes have accumulated below a non-root ancestor
+ size_t unrefined = countUnrefined( alignment_tree, ancestor );
+ if( unrefined > 5 && ancestor != alignment_tree.root )
+ {
+ cout << "performing iterative refinement\n";
+ doGappedAlignment(ancestor, false);
+ markAsRefined( alignment_tree, ancestor );
+ }
+ }
+ printMemUsage();
+
+
+ if( debug_me )
+ {
+ for( size_t aI = 0; aI < alignment_tree[ancestor].ordering.size(); aI++ )
+ {
+
+ static int dbg_count = 0;
+ GappedAlignment gal;
+ extractAlignment(ancestor, aI, gal);
+
+ // flag the interval if any pair of defined rows is less than 50% identical
+ bool check = false;
+ for( size_t ii = 0; ii < gal.SeqCount(); ++ii )
+ {
+ if( gal.LeftEnd(ii) == 0 )
+ continue;
+ for( size_t jj = 0; jj < gal.SeqCount(); ++jj )
+ {
+ if( gal.LeftEnd(jj) == 0 )
+ continue;
+ check = check || computeID( gal, ii, jj ) < .5;
+ }
+ }
+ if( check )
+ cerr << "check iv " << aI << " dbg_count " << dbg_count << endl;
+ else
+ continue;
+
+ const vector< string >& aln_mat = GetAlignment(gal, this->original_ml.seq_table);
+ gnSequence seq;
+ for( size_t seqI = 0; seqI < gal.SeqCount(); ++seqI )
+ if( gal.LeftEnd(seqI) != NO_MATCH )
+ seq += aln_mat[seqI];
+
+ stringstream dbg_fname;
+ dbg_fname << "prof_dbg_iv_" << aI << ".dbg." << dbg_count++ << ".fas";
+ ofstream debug_file( dbg_fname.str().c_str() );
+ gnFASSource::Write( seq, debug_file, false );
+ debug_file.close();
+ }
+ }
+
+
+}
+
+
+/**
+ * Wraps a group of matches in a newly allocated Interval and appends it to
+ * new_list.  When the group's orientation in seqI is reverse, every member
+ * that is defined in seqI but not already reverse-oriented there is inverted
+ * first.
+ * @param seqI      sequence whose orientation governs the group
+ * @param orient    orientation of the group in seqI
+ * @param new_ivs   matches handed to the new Interval via SetMatches()
+ * @param new_list  receives the new Interval
+ */
+void addGuy( uint seqI, AbstractMatch::orientation orient,
+    std::vector< AbstractMatch* >& new_ivs,
+    vector<Interval*>& new_list )
+{
+    Interval prototype;
+    if( orient == AbstractMatch::reverse )
+    {
+        // bring every member defined in seqI to the group's orientation
+        for( size_t memberI = 0; memberI < new_ivs.size(); memberI++ )
+        {
+            if( new_ivs[memberI]->LeftEnd(seqI) == NO_MATCH )
+                continue;
+            if( new_ivs[memberI]->Orientation(seqI) != orient )
+                new_ivs[memberI]->Invert();
+        }
+    }
+    Interval* merged = prototype.Copy();
+    merged->SetMatches( new_ivs );
+    new_list.push_back( merged );
+}
+
+ // Walks iv_list in the order given by the SSC<Interval> comparator for seqI
+ // and regroups its pieces into merged Intervals that are appended to new_list
+ // (each group is emitted through addGuy()).
+ //
+ //  - Intervals with no match in seqI are moved to new_list unchanged.
+ //  - An interval with Multiplicity() == 2 fixes the orientation of the group
+ //    that is currently being accumulated in new_ivs.
+ //  - Any other interval is either absorbed whole (when nothing has been
+ //    accumulated yet) or split at half its alignment length so that its left
+ //    part closes the current group and the remainder is revisited.
+ //
+ // On return iv_list is empty: everything still referenced by it has been
+ // released with Free().
+ // NOTE(review): the grouping logic presumably relies on Interval::SetMatches()
+ // (called inside addGuy) leaving new_ivs empty -- TODO confirm.
+ void mergeUnalignedIntervals( uint seqI, vector< Interval* >& iv_list, vector< Interval* >& new_list )
+ {
+ SSC<Interval> ivlcJ(seqI);
+ sort( iv_list.begin(), iv_list.end(), ivlcJ );
+
+ Interval tmp_iv; // not referenced again in this function
+ AbstractMatch::orientation orient = AbstractMatch::undefined;
+ vector< AbstractMatch* > new_ivs;
+ vector< Interval* > to_delete;
+ for( size_t ordI = 0; ordI < iv_list.size(); ordI++ )
+ {
+ // intervals undefined in seqI are passed through; the slot is nulled so
+ // the cleanup loop at the bottom does not free them
+ if( iv_list[ordI]->LeftEnd(seqI) == NO_MATCH )
+ {
+ new_list.push_back(iv_list[ordI]);
+ iv_list[ordI] = NULL;
+ continue;
+ }
+
+ // first multiplicity-2 interval of a group: it determines the orientation
+ if( orient == AbstractMatch::undefined && iv_list[ordI]->Multiplicity() == 2 )
+ {
+ orient = iv_list[ordI]->Orientation(seqI);
+ vector< AbstractMatch* > matches;
+ iv_list[ordI]->StealMatches( matches );
+ // forward groups grow at the back, all others at the front
+ if( orient == AbstractMatch::forward )
+ new_ivs.insert( new_ivs.end(), matches.begin(), matches.end() );
+ else
+ new_ivs.insert( new_ivs.begin(), matches.begin(), matches.end() );
+
+ // if it's the last one then add
+ if( ordI + 1 == iv_list.size() )
+ addGuy( seqI, orient, new_ivs, new_list );
+ continue;
+ }
+ // a second multiplicity-2 interval while a group is open: close the open
+ // group and start a new one with this interval
+ if( orient != AbstractMatch::undefined && iv_list[ordI]->Multiplicity() == 2 )
+ {
+ // add this guy...
+ // set the orientation for any unaligned intervals
+ addGuy( seqI, orient, new_ivs, new_list );
+
+ // prepare a new one
+ vector< AbstractMatch* > matches;
+ orient = iv_list[ordI]->Orientation(seqI);
+ iv_list[ordI]->StealMatches( matches );
+ new_ivs.insert( new_ivs.end(), matches.begin(), matches.end() );
+ // if it's the last one then add
+ if( ordI + 1 == iv_list.size() )
+ addGuy( seqI, orient, new_ivs, new_list );
+ continue;
+ }
+ // nothing accumulated yet: take this interval whole as the start of a group
+ if( new_ivs.size() == 0 )
+ {
+ vector< AbstractMatch* > matches;
+ iv_list[ordI]->StealMatches( matches );
+ new_ivs.insert( new_ivs.end(), matches.begin(), matches.end() );
+ continue;
+ }
+ // split this one in half (if its not the last one and there's something to split)...
+ Interval* left_iv = iv_list[ordI]->Copy();
+ to_delete.push_back( left_iv ); // make sure this gets deleted later
+ bool cropped = (ordI + 1 < iv_list.size() && iv_list[ordI]->Length(seqI) > 1);
+ if( cropped )
+ {
+ // lendo is half the alignment length; the copy is cropped at its end and
+ // the original at its start by complementary amounts
+ gnSeqI lendo = left_iv->AlignmentLength() / 2;
+ left_iv->CropEnd( left_iv->AlignmentLength() - lendo );
+ iv_list[ordI]->CropStart( lendo );
+ }
+ vector< AbstractMatch* > matches;
+ left_iv->StealMatches( matches );
+ if( orient == AbstractMatch::forward )
+ new_ivs.insert( new_ivs.end(), matches.begin(), matches.end() );
+ else
+ new_ivs.insert( new_ivs.begin(), matches.begin(), matches.end() );
+
+ addGuy( seqI, orient, new_ivs, new_list );
+ // prepare for the next
+ orient = AbstractMatch::undefined;
+ if(cropped)
+ ordI--; // if we split a match, make sure we get the rest of this match on the next run through the loop
+ }
+
+ // whatever is still accumulated is emitted as a forward group
+ if( new_ivs.size() > 0 )
+ {
+ // uh-oh. there must not have been anything aligned
+ addGuy( seqI, AbstractMatch::forward, new_ivs, new_list );
+ }
+
+ // free up any left_ivs that were allocated
+ for( size_t delI = 0; delI < to_delete.size(); delI++ )
+ to_delete[delI]->Free();
+
+ // free up ivs left in iv_list
+ for( size_t ivI = 0; ivI < iv_list.size(); ivI++ )
+ if( iv_list[ivI] != NULL )
+ iv_list[ivI]->Free();
+ iv_list.clear();
+ }
+
+
+/**
+ *
+ */
+void ProgressiveAligner::createAncestralOrdering( vector<Interval*>& interval_list, vector< SuperInterval >& ancestral_sequence )
+{
+ // construct an ancestral SuperSequence
+ int64 left_end = 1;
+ ancestral_sequence.resize( interval_list.size() );
+ for( uint ivI = 0; ivI < interval_list.size(); ++ivI ){
+ if(debug_aligner)
+ interval_list[ivI]->ValidateMatches();
+ vector<AbstractMatch*> matches;
+ interval_list[ivI]->StealMatches(matches);
+ ancestral_sequence[ivI].reference_iv.SetMatches(matches);
+ ancestral_sequence[ivI].SetLeftEnd(left_end);
+ ancestral_sequence[ivI].SetLength(ancestral_sequence[ivI].reference_iv.AlignmentLength());
+ if(debug_aligner)
+ ancestral_sequence[ivI].ValidateSelf();
+ left_end += ancestral_sequence[ivI].Length();
+ }
+}
+
+ // Flags the edge between subject_node and neighbor as aligned, from the
+ // point of view of subject_node: every slot of its parents / children lists
+ // that holds neighbor gets the matching parents_aligned / children_aligned
+ // entry set to true.
+ void markAligned( PhyloTree< AlignmentTreeNode >& alignment_tree, node_id_t subject_node, node_id_t neighbor )
+ {
+     AlignmentTreeNode& subject = alignment_tree[subject_node];
+
+     for( uint pI = 0; pI < subject.parents.size(); pI++ )
+     {
+         if( subject.parents[pI] != neighbor )
+             continue;
+         subject.parents_aligned[pI] = true;
+     }
+
+     for( uint cI = 0; cI < subject.children.size(); cI++ )
+     {
+         if( subject.children[cI] != neighbor )
+             continue;
+         subject.children_aligned[cI] = true;
+     }
+ }
+
+
+ // Consistency check of the orderings stored at an ancestor and its two
+ // children.  Returns true when something is wrong ("borked"), false
+ // otherwise.  The following conditions are checked:
+ //  - no child SuperInterval has length zero
+ //  - the child lengths referenced by the ancestor's reference intervals add
+ //    up to the total length of each child's ordering
+ //  - each child's SuperIntervals are contiguous (every left end equals the
+ //    previous left end plus the previous length)
+ //  - the ancestor's total length equals the summed alignment length of its
+ //    reference intervals and is not shorter than either child
+ // ValidateSelf() is invoked on every SuperInterval that is visited.
+ bool
+ ProgressiveAligner::validateSuperIntervals(node_id_t node1, node_id_t node2, node_id_t ancestor)
+ {
+     bool borked = false;
+
+     // tally what the ancestor claims about itself and about its children
+     vector< SuperInterval >& siv_list = alignment_tree[ancestor].ordering;
+     gnSeqI n1_len = 0;
+     gnSeqI n2_len = 0;
+     gnSeqI my_len = 0;
+     gnSeqI my_iv_len = 0;
+     for( size_t aI = 0; aI < siv_list.size(); aI++ )
+     {
+         SuperInterval& anc = siv_list[aI];
+         if( anc.reference_iv.Start(0) != 0 )
+             n1_len += anc.reference_iv.Length(0);
+         if( anc.reference_iv.Start(1) != 0 )
+             n2_len += anc.reference_iv.Length(1);
+         my_len += anc.Length();
+         my_iv_len += anc.reference_iv.AlignmentLength();
+         anc.ValidateSelf();
+     }
+
+     // total up the first child
+     gnSeqI real_n1len = 0;
+     vector< SuperInterval >& siv1_list = alignment_tree[node1].ordering;
+     for( size_t cI = 0; cI < siv1_list.size(); cI++ )
+     {
+         SuperInterval& c1 = siv1_list[cI];
+         if( c1.Length() == 0 )
+             borked = true;
+         real_n1len += c1.Length();
+         c1.ValidateSelf();
+     }
+
+     // total up the second child
+     gnSeqI real_n2len = 0;
+     vector< SuperInterval >& siv2_list = alignment_tree[node2].ordering;
+     for( size_t cI = 0; cI < siv2_list.size(); cI++ )
+     {
+         SuperInterval& c2 = siv2_list[cI];
+         if( c2.Length() == 0 )
+             borked = true;
+         real_n2len += c2.Length();
+         c2.ValidateSelf();
+     }
+
+     // each child interval must pick up where the previous one left off
+     for( size_t cI = 0; cI + 1 < siv1_list.size(); cI++ )
+     {
+         if( siv1_list[cI+1].LeftEnd() != siv1_list[cI].LeftEnd() + siv1_list[cI].Length() )
+             borked = true;
+     }
+     for( size_t cI = 0; cI + 1 < siv2_list.size(); cI++ )
+     {
+         if( siv2_list[cI+1].LeftEnd() != siv2_list[cI].LeftEnd() + siv2_list[cI].Length() )
+             borked = true;
+     }
+
+     // length bookkeeping between the ancestor and its children
+     const bool child_mismatch = real_n1len != n1_len || real_n2len != n2_len;
+     const bool self_mismatch = my_len != my_iv_len;
+     const bool too_short = my_len < real_n1len || my_len < real_n2len;
+     if( child_mismatch || self_mismatch || too_short )
+         borked = true;
+
+     if( !borked )
+         return false;
+
+     breakHere();
+     cerr << "child1 has " << siv1_list.size() << " ivs totalling " << real_n1len << " nt\n";
+     cerr << "child2 has " << siv2_list.size() << " ivs totalling " << real_n2len << " nt\n";
+     cerr << "parent has " << siv_list.size() << " ivs, n1_len: " << n1_len << " n2_len: " << n2_len << endl;
+     return true;
+ }
+
+ // Consistency check of a list of pairwise intervals (pair_iv) against the
+ // orderings stored at node1 and node2.  Returns true when a length problem
+ // was found ("borked"), false otherwise.
+ //
+ // borked is set when a child SuperInterval has length zero, or when the
+ // per-sequence lengths summed over pair_iv differ from the total length of
+ // the corresponding child ordering.
+ // NOTE(review): an alignment-matrix size mismatch ("broked") and overlapping
+ // intervals are only reported; neither sets borked -- confirm that is intended.
+ // Side effect: when borked and the sequence-1 totals disagree, pair_iv itself
+ // is re-sorted by the SSC<Interval> comparator for sequence 1.
+ bool ProgressiveAligner::validatePairwiseIntervals(node_id_t node1, node_id_t node2, std::vector<Interval*>& pair_iv)
+ {
+ // sum up the per-sequence lengths covered by the pairwise intervals
+ bool borked = false;
+ gnSeqI n1_len = 0;
+ gnSeqI n2_len = 0;
+ for( size_t sivI = 0; sivI < pair_iv.size(); sivI++ )
+ {
+ if( pair_iv[sivI]->Start(0) != 0 )
+ n1_len += pair_iv[sivI]->Length(0);
+ if( pair_iv[sivI]->Start(1) != 0 )
+ n2_len += pair_iv[sivI]->Length(1);
+
+ // the first row of the alignment matrix must be as wide as the reported alignment length
+ vector< bitset_t > aln_mat;
+ pair_iv[sivI]->GetAlignment(aln_mat);
+ if( aln_mat[0].size() != pair_iv[sivI]->AlignmentLength() )
+ {
+ cerr << "broked\n";
+ }
+ pair_iv[sivI]->ValidateMatches();
+ }
+ gnSeqI real_n1len = 0;
+ gnSeqI real_n2len = 0;
+
+ // total length of the first child's ordering
+ vector< SuperInterval >& siv1_list = alignment_tree[node1].ordering;
+ for( size_t sivI = 0; sivI < siv1_list.size(); sivI++ )
+ {
+ if( siv1_list[sivI].Length() == 0 )
+ borked = true;
+ real_n1len += siv1_list[sivI].Length();
+ }
+
+ // total length of the second child's ordering
+ vector< SuperInterval >& siv2_list = alignment_tree[node2].ordering;
+ for( size_t sivI = 0; sivI < siv2_list.size(); sivI++ )
+ {
+ if( siv2_list[sivI].Length() == 0 )
+ borked = true;
+ real_n2len += siv2_list[sivI].Length();
+ }
+
+ if( real_n1len != n1_len || real_n2len != n2_len )
+ borked = true;
+
+ // check for overlapping intervals
+ // (works on a copy of the pointer list, so pair_iv keeps its order here)
+ vector< Interval* > tmp_iv_list = pair_iv;
+ for( uint seqI = 0; seqI < 2; seqI++ )
+ {
+ SSC<Interval> ssc(seqI);
+ sort( tmp_iv_list.begin(), tmp_iv_list.end(), ssc );
+ for( size_t ivI = 1; ivI < tmp_iv_list.size(); ivI++ )
+ {
+ // pairs where either interval is undefined in seqI cannot overlap
+ if( tmp_iv_list[ivI-1]->LeftEnd(seqI) == NO_MATCH || tmp_iv_list[ivI]->LeftEnd(seqI) == NO_MATCH )
+ continue;
+ if( tmp_iv_list[ivI-1]->RightEnd(seqI) >= tmp_iv_list[ivI]->LeftEnd(seqI) )
+ {
+ cerr << "overlap:\n";
+ cerr << "tmp_iv_list[ivI-1].RightEnd(seqI): " << tmp_iv_list[ivI-1]->RightEnd(seqI) << endl;
+ cerr << "tmp_iv_list[ivI].LeftEnd(seqI): " << tmp_iv_list[ivI]->LeftEnd(seqI) << endl;
+ breakHere();
+ }
+ }
+ }
+
+ // diagnostics: print totals, then details for sequence 1 depending on the
+ // direction of the mismatch
+ if( borked )
+ {
+ cerr << "child1 has " << siv1_list.size() << " ivs totalling " << real_n1len << " nt\n";
+ cerr << "child2 has " << siv2_list.size() << " ivs totalling " << real_n2len << " nt\n";
+ cerr << "parent has " << pair_iv.size() << " ivs, n1_len: " << n1_len << " n2_len: " << n2_len << endl;
+ if( n2_len < real_n2len )
+ {
+ // intervals cover too little of sequence 1: list them with the
+ // difference to the previously listed interval
+ SSC<Interval> sortie(1);
+ sort( pair_iv.begin(), pair_iv.end(), sortie );
+ size_t prev_iv = 9999999; // sentinel meaning "no previous interval yet"
+ for( size_t ivI = 0; ivI < pair_iv.size(); ++ivI)
+ {
+ if( pair_iv[ivI]->LeftEnd(1) == NO_MATCH )
+ continue;
+
+ if( prev_iv != 9999999 )
+ cerr << "diff: " << pair_iv[ivI]->LeftEnd(1) - pair_iv[prev_iv]->RightEnd(1) << endl;
+ cerr << "Interval " << ivI << " LeftEnd(1): " << pair_iv[ivI]->LeftEnd(1) << " RightEnd(1): " << pair_iv[ivI]->RightEnd(1) << std::endl;
+ prev_iv = ivI;
+ }
+ }else if( n2_len > real_n2len )
+ {
+ // intervals cover too much: list those whose left end is not below the
+ // real length of sequence 1
+ SSC<Interval> sortie(1);
+ sort( pair_iv.begin(), pair_iv.end(), sortie );
+ for( size_t ivI = 0; ivI < pair_iv.size(); ++ivI)
+ {
+ if( pair_iv[ivI]->LeftEnd(1) < real_n2len )
+ continue;
+ cerr << "Interval " << ivI << " LeftEnd(1): " << pair_iv[ivI]->LeftEnd(1) << " RightEnd(1): " << pair_iv[ivI]->RightEnd(1) << std::endl;
+ }
+ }
+ breakHere();
+ }
+ return borked;
+ }
+
+ // Aligns node1 to node2 through their common ancestor by delegating to
+ // alignProfileToProfile(), then marks every edge among the three nodes as
+ // aligned.  A complaint is printed (after breakHere()) when ancestor is not
+ // the first parent of both nodes; the alignment is attempted regardless.
+ void ProgressiveAligner::alignNodes( node_id_t node1, node_id_t node2, node_id_t ancestor )
+ {
+     cout << "Aligning node " << node1 << " to " << node2 << " via " << ancestor << "!\n";
+
+     // both nodes are expected to hang directly below ancestor
+     const bool both_below_ancestor =
+         alignment_tree[node1].parents[0] == ancestor &&
+         alignment_tree[node2].parents[0] == ancestor;
+     if( !both_below_ancestor )
+     {
+         breakHere();
+         cerr << "rotten\n";
+     }
+
+     alignProfileToProfile(node1, node2, ancestor);
+
+     // mark edges as aligned: (subject, neighbor) pairs, in order
+     const node_id_t aligned_edges[6][2] = {
+         { node1, node2 },
+         { node2, node1 },
+         { node1, ancestor },
+         { node2, ancestor },
+         { ancestor, node1 },
+         { ancestor, node2 }
+     };
+     for( size_t edgeI = 0; edgeI < 6; ++edgeI )
+         markAligned( alignment_tree, aligned_edges[edgeI][0], aligned_edges[edgeI][1] );
+ }
+
+/**
+ * finds the midpoint of a phylogenetic tree, returns the ids of the surrounding nodes in n1 and n2
+ */
+void findMidpoint( PhyloTree< AlignmentTreeNode >& alignment_tree, node_id_t& n1, node_id_t& n2 )
+{
+ // use boost's all pairs shortest path to find the longest path on the tree
+ // Then actually traverse the path to determine which edge
+ // is halfway.
+ double scaling_factor = 100000;
+ using namespace boost;
+ typedef adjacency_list<vecS, vecS, undirectedS, no_property,
+ property< edge_weight_t, int, property< edge_color_t, default_color_type > > > Graph;
+ const int V = alignment_tree.size();
+ const std::size_t E = alignment_tree.size()-1;
+ typedef std::pair < int, int >Edge;
+ Edge* edge_array = new Edge[ alignment_tree.size() - 1 ];
+ int* weights = new int[ alignment_tree.size() - 1 ];
+ bitset_t child_found( alignment_tree.size(), false );
+ size_t eI = 0;
+ for( size_t vI = 0; vI < V; ++vI )
+ {
+ if( alignment_tree[vI].parents.size() != 0 )
+ {
+ edge_array[eI] = Edge( vI, alignment_tree[vI].parents[0] );
+ // for some reason boost insists on using an int for weights. need to figure that out
+ weights[eI] = (int)(scaling_factor * genome::absolut(alignment_tree[vI].distance)) + 1;
+ eI++;
+ }
+ }
+
+#if defined(BOOST_MSVC) && BOOST_MSVC <= 1300
+ // VC++ can't handle the iterator constructor
+ Graph g(V);
+ for (std::size_t j = 0; j < E; ++j)
+ add_edge(edge_array[j].first, edge_array[j].second, g);
+#else
+ Graph g(edge_array, edge_array + E, V);
+#endif
+
+ property_map < Graph, edge_weight_t >::type w = get(edge_weight, g);
+ int *wp = weights;
+
+ graph_traits < Graph >::edge_iterator e, e_end;
+ for (boost::tie(e, e_end) = edges(g); e != e_end; ++e)
+ w[*e] = *wp++;
+
+ boost::multi_array<int,2> D( boost::extents[V][V] );
+ bool success = johnson_all_pairs_shortest_paths(g, D);
+ if( !success )
+ {
+ cerr << "failed, is this really a tree?\n";
+ return;
+ }
+
+ // find the most distant pair of nodes
+ int max_dist = (std::numeric_limits<int>::min)();
+ for (int i = 0; i < V; ++i) {
+ for (int j = 0; j < V; ++j) {
+ if( D[i][j] > max_dist )
+ {
+ max_dist = D[i][j];
+ n1 = i;
+ n2 = j;
+ }
+ }
+ }
+
+ typedef graph_traits<Graph>::vertex_descriptor vertex_t;
+ std::vector < vertex_t > pred(num_vertices(g));
+ std::vector < int > dist(num_vertices(g));
+ pred[n1] = n1;
+
+ undirected_dfs(g,
+ root_vertex( vertex( n1, g ) ).
+ visitor( make_dfs_visitor( make_pair(
+ record_predecessors(&pred[0], on_tree_edge()),
+ record_distances(&dist[0], on_tree_edge())
+ ))).
+ edge_color_map(get(edge_color, g))
+ );
+
+ int cur_node = n2;
+ int prev_node = n2;
+ max_dist /= 2;
+ while( cur_node != n1 && max_dist > 0 )
+ {
+ if( alignment_tree[cur_node].parents.size() > 0 &&
+ alignment_tree[cur_node].parents[0] == pred[cur_node] )
+ {
+ max_dist -= (int)(scaling_factor * alignment_tree[cur_node].distance) + 1;
+ prev_node = cur_node;
+ cur_node = pred[cur_node];
+ }else
+ {
+ prev_node = cur_node;
+ cur_node = pred[cur_node];
+ max_dist -= (int)(scaling_factor * alignment_tree[cur_node].distance) + 1;
+ }
+ }
+ n1 = cur_node;
+ n2 = prev_node;
+
+ delete[] edge_array;
+ delete[] weights;
+}
+
+ // Finds the largest branch length (distance) in the tree and sets the
+ // branch length of every child of the root to twice that value.  While
+ // scanning, the first two children of each node are swapped where needed so
+ // that children[0] <= children[1].
+ void extendRootBranches( PhyloTree< AlignmentTreeNode >& alignment_tree )
+ {
+     // find the max branch length and set the root branch lengths to twice that
+     // swap children while we're at it
+     node_id_t ancestor = alignment_tree.root;
+     double max_blen = -(std::numeric_limits<double>::max)();
+     for( size_t nI = 0; nI < alignment_tree.size(); ++nI )
+     {
+         if( alignment_tree[nI].distance > max_blen )
+             max_blen = alignment_tree[nI].distance;
+         // children[1] may only be read when there are at least two children
+         if( alignment_tree[nI].children.size() > 1 &&
+             alignment_tree[nI].children[0] > alignment_tree[nI].children[1] )
+         {
+             std::swap( alignment_tree[nI].children[0], alignment_tree[nI].children[1] );
+         }
+     }
+     for( size_t cI = 0; cI < alignment_tree[ancestor].children.size(); ++cI )
+         alignment_tree[alignment_tree[ancestor].children[cI]].distance = 2.0 * max_blen;
+ }
+
+ // Picks the next pair of nodes to align, preferring the smallest branch
+ // distance.  Results are returned in node1, node2 and ancestor; all three
+ // are reset to 0 first and stay 0 when no candidate is found.
+ //
+ // Every node that is not an extant sequence and still has an unaligned edge
+ // is considered as a potential ancestor:
+ //  - if it already has an ordering, each alignable neighbor is a candidate
+ //    and the result is (node itself, neighbor) with the node as ancestor;
+ //  - otherwise each pair of alignable neighbors is a candidate, scored by
+ //    the sum of their distances.
+ // A neighbor is alignable when the connecting edge is not yet marked aligned
+ // and the neighbor has either a non-empty ordering or a sequence.
+ // Ties keep the candidate found first, since only a strictly smaller
+ // distance replaces the current best.
+ void chooseNextAlignmentPair( PhyloTree< AlignmentTreeNode >& alignment_tree, node_id_t& node1, node_id_t& node2, node_id_t& ancestor )
+ {
+
+ // find the nearest alignable neighbor
+ node1 = 0;
+ node2 = 0;
+ ancestor = 0;
+ double nearest_distance = (numeric_limits<double>::max)();
+ for( node_id_t nodeI = 0; nodeI < alignment_tree.size(); nodeI++ )
+ {
+ AlignmentTreeNode& cur_node = alignment_tree[ nodeI ];
+
+ // skip this node if it's already been completely aligned
+ // or is an extant sequence
+ boolean completely_aligned = true;
+ for( uint alignedI = 0; alignedI < cur_node.children_aligned.size(); alignedI++ )
+ completely_aligned = completely_aligned && cur_node.children_aligned[alignedI];
+ for( uint alignedI = 0; alignedI < cur_node.parents_aligned.size(); alignedI++ )
+ completely_aligned = completely_aligned && cur_node.parents_aligned[alignedI];
+ if( cur_node.sequence != NULL || completely_aligned )
+ continue;
+
+
+ // parallel arrays describing every neighbor (parents first, then children)
+ vector< node_id_t > neighbor_id;
+ vector< boolean > alignable;
+ vector< double > distance;
+
+ for( uint parentI = 0; parentI < cur_node.parents.size(); parentI++ )
+ {
+ neighbor_id.push_back( cur_node.parents[parentI] );
+ vector< node_id_t >::iterator cur_neighbor = neighbor_id.end() - 1;
+ if( *cur_neighbor == alignment_tree.root )
+ {
+ // need special handling for the root since the alignment
+ // tree is supposed to be unrooted
+ // add all of root's children except this one
+ // NOTE(review): this branch is empty; the handling described above is not implemented here
+ }
+ // the distance to a parent is the branch length stored on this node
+ distance.push_back( cur_node.distance );
+ alignable.push_back( !cur_node.parents_aligned[parentI] && (alignment_tree[*cur_neighbor].ordering.size() != 0 || alignment_tree[*cur_neighbor].sequence != NULL) );
+ }
+
+ for( uint childI = 0; childI < cur_node.children.size(); childI++ )
+ {
+ neighbor_id.push_back( cur_node.children[childI] );
+ vector< node_id_t >::iterator cur_neighbor = neighbor_id.end() - 1;
+ // the distance to a child is the branch length stored on the child
+ distance.push_back( alignment_tree[*cur_neighbor].distance );
+ alignable.push_back( !cur_node.children_aligned[childI] && (alignment_tree[*cur_neighbor].ordering.size() != 0 || alignment_tree[*cur_neighbor].sequence != NULL) );
+ }
+
+ if( cur_node.ordering.size() != 0 )
+ {
+ // this one already has at least two sequences aligned, if another
+ // is alignable then check its distance
+ for( int i = 0; i < neighbor_id.size(); i++ ){
+ if( !alignable[i] )
+ continue;
+ if( distance[i] < nearest_distance )
+ {
+ nearest_distance = distance[i];
+ node1 = nodeI;
+ node2 = neighbor_id[i];
+ ancestor = nodeI;
+ }
+ }
+ }else{
+ // find the nearest alignable pair
+ for( int i = 0; i < neighbor_id.size(); i++ )
+ {
+ if( !alignable[i] )
+ continue;
+ for( int j = i+1; j < neighbor_id.size(); j++ )
+ {
+ if( !alignable[j] )
+ continue;
+ if( distance[i] + distance[j] < nearest_distance )
+ {
+ nearest_distance = distance[i] + distance[j];
+ node1 = neighbor_id[i];
+ node2 = neighbor_id[j];
+ ancestor = nodeI;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ /**
+  * Use a list of precomputed matches instead of computing them.
+  * The contents of pair_ml are assigned to original_ml, after which pair_ml
+  * is cleared.
+  * @param pair_ml  the precomputed matches; empty on return
+  */
+ void ProgressiveAligner::setPairwiseMatches( MatchList& pair_ml )
+ {
+ original_ml = pair_ml;
+ pair_ml.clear(); // ProgressiveAligner owns the matches now...
+ }
+
+
+ // Inserts a new node on the edge between node1 and node2, re-roots the tree
+ // on it through rerootTree() and returns alignment_tree.root afterwards.
+ //
+ // The new node is appended to the tree.  Whichever of node1 / node2 lists
+ // the other among its children becomes the new node's parent, the other one
+ // its child; the direct link between node1 and node2 is then removed.  After
+ // re-rooting, the aligned flags of the new root's children and of the old
+ // root's children and parents are reset to false, and the new root's
+ // sequence is set to NULL.
+ node_id_t createAlignmentTreeRoot( PhyloTree< AlignmentTreeNode >& alignment_tree, node_id_t node1, node_id_t node2 )
+ {
+     // append the node that will be spliced in between node1 and node2
+     AlignmentTreeNode fresh_node;
+     alignment_tree.push_back( fresh_node );
+     const node_id_t new_id = alignment_tree.size()-1;
+     AlignmentTreeNode& old_root = alignment_tree[alignment_tree.root];
+     AlignmentTreeNode& new_root = alignment_tree.back();
+
+     // work out which of the two nodes currently sits above the other
+     const bool node2_below_node1 =
+         find( alignment_tree[node1].children.begin(), alignment_tree[node1].children.end(), node2 ) !=
+         alignment_tree[node1].children.end();
+     const node_id_t upper = node2_below_node1 ? node1 : node2;
+     const node_id_t lower = node2_below_node1 ? node2 : node1;
+
+     // link the new node inline: upper -> new node -> lower
+     new_root.parents.push_back( upper );
+     new_root.children.push_back( lower );
+     alignment_tree[upper].children.push_back( new_id );
+     alignment_tree[lower].parents.push_back( new_id );
+
+     // completely unlink node1 and node2 from each other
+     findAndErase( alignment_tree[node1].children, node2 );
+     findAndErase( alignment_tree[node2].children, node1 );
+     findAndErase( alignment_tree[node1].parents, node2 );
+     findAndErase( alignment_tree[node2].parents, node1 );
+
+     // re-root the tree on the new node
+     rerootTree( alignment_tree, new_id );
+
+     // nothing around the old or the new root counts as aligned any more
+     new_root.children_aligned = vector< boolean >( new_root.children.size(), false );
+     old_root.children_aligned = vector< boolean >( old_root.children.size(), false );
+     old_root.parents_aligned = vector< boolean >( old_root.parents.size(), false );
+     new_root.sequence = NULL;
+
+     return alignment_tree.root;
+ }
+
+ // Extracts the alignment for SuperInterval number super_iv of node ancestor
+ // as a GappedAlignment.  The work is done by the CompactGappedAlignment
+ // overload; its result is converted by copying the start of every sequence,
+ // the length of every sequence whose orientation is defined, and the
+ // textual alignment obtained from GetAlignment().
+ void ProgressiveAligner::extractAlignment( node_id_t ancestor, size_t super_iv, GappedAlignment& gal )
+ {
+     CompactGappedAlignment<> compact;
+     extractAlignment( ancestor, super_iv, compact );
+
+     vector< string > columns;
+     GetAlignment( compact, this->original_ml.seq_table, columns );
+
+     gal = GappedAlignment(compact.SeqCount(), 0);
+     for( size_t seqI = 0; seqI < compact.SeqCount(); ++seqI )
+     {
+         gal.SetStart( seqI, compact.Start(seqI) );
+         // sequences without a defined orientation get no length
+         if( compact.Orientation(seqI) == AbstractMatch::undefined )
+             continue;
+         gal.SetLength( compact.Length(seqI), seqI );
+     }
+     gal.SetAlignment( columns );
+ }
+
+ // Extracts the alignment for SuperInterval number super_iv of node ancestor
+ // into cga.
+ //
+ // Phase 1 descends from (ancestor, super_iv) through the c1_siv / c2_siv
+ // links and collects the (leaf node, SuperInterval index) pairs below it.
+ // Phase 2 builds, for every such leaf, a single match spanning the leaf
+ // interval, maps it into ancestor coordinates with
+ // translateGappedCoordinates(), and stores the resulting left end,
+ // orientation, length and alignment row in cga under the leaf's sequence
+ // index.  Rows of sequences that were not reached are sized to the
+ // ancestor interval's length.
+ // Inconsistencies are reported on cerr / through breakHere(); processing
+ // continues afterwards.
+ void ProgressiveAligner::extractAlignment( node_id_t ancestor, size_t super_iv, CompactGappedAlignment<>& cga )
+ {
+ // determine the leaf node intervals below this super_iv
+ vector< pair< node_id_t, size_t > > node_siv_list;
+ stack< pair<node_id_t,size_t> > node_stack;
+ node_stack.push(make_pair(ancestor,super_iv));
+ while( node_stack.size() > 0 )
+ {
+ pair<node_id_t,size_t> cur = node_stack.top();
+ node_id_t cur_node = cur.first;
+ node_stack.pop();
+ // a node without children is a leaf: remember it
+ if( alignment_tree[cur_node].children.size() == 0 )
+ node_siv_list.push_back( cur );
+ for( size_t childI = 0; childI < alignment_tree[cur_node].children.size(); childI++ )
+ {
+ // this child does not take part in the current interval
+ if( alignment_tree[cur_node].ordering[cur.second].reference_iv.LeftEnd(childI) == NO_MATCH )
+ continue;
+ // child 0 is linked through c1_siv, any other child through c2_siv
+ size_t child_siv = childI == 0 ? alignment_tree[cur_node].ordering[cur.second].c1_siv :
+ alignment_tree[cur_node].ordering[cur.second].c2_siv;
+ node_stack.push(make_pair(alignment_tree[cur_node].children[childI], child_siv) );
+ node_id_t n = alignment_tree[cur_node].children[childI];
+ // the parent's view of the child's length must agree with the child itself
+ if( alignment_tree[cur_node].ordering[cur.second].reference_iv.Length(childI) != alignment_tree[n].ordering[child_siv].Length() )
+ {
+ breakHere();
+ cerr << "alignment_tree[cur_node].ordering[cur.second].reference_iv.Length(childI): " << alignment_tree[cur_node].ordering[cur.second].reference_iv.Length(childI) << endl;
+ cerr << "rotten in the state of denmark...\n";
+ }
+ }
+ }
+
+ // armed with the list of pairs, extract each one...
+
+ // for each interval at the root write out the alignment
+ SuperInterval& a_iv = alignment_tree[ancestor].ordering[super_iv];
+ cga = CompactGappedAlignment<>(seq_count, a_iv.Length());
+ vector< bitset_t > aln_mats( seq_count );
+
+ // use translateCoordinates to map out each sequence's original coordinates
+ // to the alignment coordinates
+ for( size_t pairI = 0; pairI < node_siv_list.size(); pairI++ )
+ {
+ node_id_t nodeI = node_siv_list[pairI].first;
+ size_t seq_siv = node_siv_list[pairI].second;
+
+ // translate seq_siv into ancestor alignment coordinates?
+ // we can abuse translateCoordinates and the Match data structure :
+ // - add a single "match" covering the entire sequence
+ // - translate it up to alignment root coordinates
+ uint seqI = node_sequence_map[nodeI];
+ Match mm(2);
+ mm.SetStart(0, alignment_tree[nodeI].ordering[seq_siv].LeftEnd());
+ mm.SetStart(1, alignment_tree[nodeI].ordering[seq_siv].LeftEnd());
+ mm.SetLength( alignment_tree[nodeI].ordering[seq_siv].Length() );
+
+ vector< AbstractMatch* > aml( 1, mm.Copy() );
+ translateGappedCoordinates( aml, 0, nodeI, ancestor );
+
+ // exactly one translated match is expected; only aml[0] is used below
+ if( aml.size() > 1 )
+ {
+ cerr << "huh?";
+ genome::breakHere();
+ SingleStartComparator<AbstractMatch> ssc( 0 );
+ sort( aml.begin(), aml.end(), ssc ); // huh?
+ }
+ // NOTE(review): aml[0] is dereferenced without checking that aml is non-empty
+ CompactGappedAlignment<>* trans_cga = dynamic_cast<CompactGappedAlignment<>*>(aml[0]);
+ if( trans_cga == NULL )
+ {
+ // the translated match is of another type: convert it into a new
+ // CompactGappedAlignment
+ // NOTE(review): in this branch aml[0] itself is never passed to Free() in
+ // this function -- possible leak, confirm ownership
+ CompactGappedAlignment<> tmp_cga;
+ trans_cga = tmp_cga.Copy();
+ *trans_cga = CompactGappedAlignment<>(*aml[0]);
+ }
+
+ // the translated match must not extend past the end of the ancestor interval
+ if( trans_cga->LeftEnd(0) + trans_cga->Length(0) > a_iv.LeftEnd() + a_iv.Length() )
+ {
+ cerr << "trans_cga->Start(0): " << trans_cga->Start(0) << " trans_cga->Length(0): " << trans_cga->Length(0) << endl;
+ cerr << "a_iv.LeftEnd(): " << a_iv.LeftEnd() << " a_iv.Length(): " << a_iv.Length() << endl;
+ breakHere();
+ }
+ // the leaf is forward when both rows of the translated match share an
+ // orientation, reverse otherwise
+ bool parity = trans_cga->Orientation(0) == trans_cga->Orientation(1);
+ cga.SetLeftEnd(seqI, trans_cga->LeftEnd(1));
+ AbstractMatch::orientation o = parity ? AbstractMatch::forward : AbstractMatch::reverse;
+ cga.SetOrientation(seqI, o);
+ const vector< bitset_t >& tmp = trans_cga->GetAlignment();
+ aln_mats[seqI] = tmp[1];
+
+ // offset of the translated match within the ancestor interval
+ size_t offset = trans_cga->LeftEnd(0) - a_iv.LeftEnd();
+ if( aln_mats[seqI].size() < a_iv.Length() )
+ {
+ // need to resize and shift appropriately
+ aln_mats[seqI].resize( a_iv.Length() );
+ aln_mats[seqI] <<= offset; // this is backwards in boost::dynamic_bitset for some reason...
+ }
+ // ...nor may it start before the ancestor interval
+ if( trans_cga->LeftEnd(0) < a_iv.LeftEnd() )
+ {
+ cerr << "trans_cga->LeftEnd(0): " << trans_cga->LeftEnd(0) << endl;
+ cerr << "a_iv.LeftEnd(): " << a_iv.LeftEnd() << endl;
+ breakHere();
+ }
+
+ // validate match lengths
+ if( trans_cga->Length(1) != alignment_tree[nodeI].ordering[seq_siv].Length() )
+ {
+ cerr << "b0rked\n";
+ breakHere();
+ }
+ // set the length and alignment appropriately
+ cga.SetLength(trans_cga->Length(1), seqI);
+
+ // free storage used by trans_cga
+ trans_cga->Free();
+ }
+ // sequences that were not reached still need a row of the right width
+ for( uint seqI = 0; seqI < aln_mats.size(); seqI++ )
+ if( aln_mats[seqI].size() == 0 )
+ aln_mats[seqI].resize( a_iv.Length() );
+ cga.SetAlignment(aln_mats);
+ }
+
+ // Returns a default upper bound on the number of breakpoints expected for
+ // the given sequences: 15 per megabase of average sequence length,
+ // truncated to an unsigned integer.
+ // An empty seq_table yields 0 (there is no average length to base an
+ // estimate on, and dividing by zero would produce a value that cannot be
+ // converted to unsigned).
+ unsigned getDefaultBreakpointMax( const std::vector< genome::gnSequence* >& seq_table )
+ {
+     if( seq_table.empty() )
+         return 0;
+
+     double avg_len = 0;
+     for( size_t seqI = 0; seqI < seq_table.size(); ++seqI )
+         avg_len += seq_table[seqI]->length();
+     avg_len /= (double)(seq_table.size());
+     // heavily rearranged, recently diverged genomes like yersinia have up to 15 rearrangements per megabase of sequence
+     avg_len /= 1000000.0; // convert to number of megabases
+     avg_len *= 15.0; // "lots" of rearrangement
+     return (unsigned)avg_len;
+ }
+
+ // Computes a pairwise breakpoint distance matrix.
+ //
+ // bp_distmat is resized to seq_count x seq_count and initialised to 1.  For
+ // every pair of sequences the pairwise matches are fetched, overlaps are
+ // eliminated, LCBs are computed and scored, and greedy breakpoint
+ // elimination is run with a penalty scaled by the fourth power of
+ // (1 - conservation_distance) for that pair, bounded below by
+ // min_breakpoint_penalty.  The number of LCBs remaining after filtering is
+ // stored symmetrically in the matrix.  Finally all off-diagonal entries are
+ // divided by the larger of the observed maximum and
+ // getDefaultBreakpointMax(), and every entry (diagonal included) is
+ // multiplied by bp_dist_scale.
+ // NOTE(review): the loop over pairs carries "#pragma omp parallel for" while
+ // its body writes cur_min_coverage, which is not declared inside the loop,
+ // and prints to cout -- confirm this is safe when OpenMP is enabled.
+ void ProgressiveAligner::CreatePairwiseBPDistance( boost::multi_array<double, 2>& bp_distmat )
+ {
+ // local seq_count, taken from the sequence table of original_ml
+ uint seq_count = original_ml.seq_table.size();
+ bp_distmat.resize(boost::extents[seq_count][seq_count]);
+ for( size_t i = 0; i < seq_count; ++i )
+ for( size_t j = 0; j < seq_count; ++j )
+ bp_distmat[i][j] = 1;
+
+ #ifdef LCB_WEIGHT_LOSS_PLOT
+ stringstream pair_bp_ofname;
+ pair_bp_ofname << "pair_bp_log.txt";
+ ofstream pair_bp_out( pair_bp_ofname.str().c_str() );
+ #endif
+
+ // enumerate all unordered pairs (seqI < seqJ) up front so the work can be
+ // driven by a single flat loop
+ vector< pair<uint, uint> > seq_pairs( (seq_count * (seq_count-1))/2 );
+ int ii = 0;
+ for( uint seqI = 0; seqI < seq_count; seqI++ )
+ for( uint seqJ = seqI + 1; seqJ < seq_count; seqJ++ )
+ seq_pairs[ii++] = make_pair(seqI,seqJ);
+
+ #pragma omp parallel for
+ for(int i = 0; i < seq_pairs.size(); i++)
+ {
+ uint seqI = seq_pairs[i].first;
+ uint seqJ = seq_pairs[i].second;
+ // look up the tree nodes that carry these two sequences
+ vector<uint>::iterator n1 = find( node_sequence_map.begin(), node_sequence_map.end(), seqI );
+ vector<uint>::iterator n2 = find( node_sequence_map.begin(), node_sequence_map.end(), seqJ );
+ vector<node_id_t> n1_seqs( 1, n1-node_sequence_map.begin() );
+ vector<node_id_t> n2_seqs( 1, n2-node_sequence_map.begin() );
+ Matrix<MatchList> mml;
+ getPairwiseMatches(n1_seqs, n2_seqs, mml);
+ MatchList& ml = mml(0,0);
+
+ // eliminate overlaps as they correspond to inconsistently or
+ // multiply aligned regions
+ EliminateOverlaps_v2( ml, true );
+ ml.MultiplicityFilter(2);
+
+ // do greedy b.p. elimination on the matches
+ vector< MatchList > LCB_list;
+ vector< LCB > adjacencies;
+ vector< gnSeqI > breakpoints;
+ IdentifyBreakpoints( ml, breakpoints );
+ ComputeLCBs_v2( ml, breakpoints, LCB_list );
+ vector< double > lcb_scores( LCB_list.size() );
+ cout << "Pair " << seq_pairs[i].first << ", " << seq_pairs[i].second << " has " << LCB_list.size() << " initial LCBs\n";
+ for( size_t lcbI = 0; lcbI < LCB_list.size(); ++lcbI )
+ lcb_scores[lcbI] = GetPairwiseAnchorScore( LCB_list[lcbI], ml.seq_table, this->subst_scoring, sol_list[seqI], sol_list[seqJ] );
+
+ computeLCBAdjacencies_v3( LCB_list, lcb_scores, adjacencies );
+
+ // want to discard all low-weight LCBs
+ // to arrive at a set of reliable LCBs
+ double cons_id = 1 - this->conservation_distance[seqI][seqJ];
+ // penalty = bp_dist_estimate_score * cons_id^4, but never below min_breakpoint_penalty
+ double scaled_score = max( bp_dist_estimate_score * cons_id * cons_id * cons_id * cons_id, min_breakpoint_penalty);
+ cout << "Using scaled bp penalty: " << scaled_score << endl;
+ GreedyRemovalScorer wbs( adjacencies, scaled_score );
+ #ifdef LCB_WEIGHT_LOSS_PLOT
+ cur_min_coverage = greedyBreakpointElimination_v4( adjacencies, lcb_scores, wbs, &pair_bp_out, seqI, seqJ );
+ pair_bp_out.flush();
+ #else
+ cur_min_coverage = greedyBreakpointElimination_v4( adjacencies, lcb_scores, wbs, NULL );
+ #endif
+ MatchList deleted_matches;
+ filterMatches_v2( adjacencies, LCB_list, lcb_scores, deleted_matches );
+ cout << "Pair (" << seqI << "," << seqJ << ") has " << LCB_list.size() << " well-supported breakpoints\n";
+
+ // now set the distance entry
+ bp_distmat[seqI][seqJ] = LCB_list.size();
+ bp_distmat[seqJ][seqI] = LCB_list.size();
+
+ // free the matches
+ for( size_t dI = 0; dI < ml.size(); dI++ )
+ ml[dI]->Free();
+ }
+ // normalize to [0,1]
+ double bp_max = 0;
+ for( uint i = 0; i < bp_distmat.shape()[0]; ++i )
+ for( uint j = 0; j < bp_distmat.shape()[1]; ++j )
+ {
+ if( bp_distmat[i][j] > bp_max )
+ bp_max = bp_distmat[i][j];
+ }
+
+ // never normalize by less than the default maximum for these sequences
+ double default_max = getDefaultBreakpointMax(original_ml.seq_table);
+ bp_max = bp_max > default_max ? bp_max : default_max;
+
+ for( uint i = 0; i < bp_distmat.shape()[0]; ++i )
+ for( uint j = 0; j < bp_distmat.shape()[1]; ++j )
+ {
+ // diagonal entries skip the division but are still scaled below
+ if( i != j )
+ bp_distmat[i][j] /= bp_max;
+ bp_distmat[i][j] *= bp_dist_scale;
+ }
+ }
+
+ // Prepares alignment_tree for a progressive alignment of the sequences in
+ // mlist.seq_table.
+ //
+ // Every node starts out unaligned, unrefined and without a sequence.  Then,
+ // for each sequence index seqI, the first node named "seq<seqI+1>" is looked
+ // up, given the sequence and a single SuperInterval that starts at 1 and
+ // spans the whole sequence, and recorded in node_sequence_map.
+ //
+ // node_sequence_map is rebuilt with one entry per tree node; nodes without a
+ // sequence hold (uint)-1.
+ // Throws a C string when no node carries the expected name.
+ template< typename MatchListType >
+ void makeAlignmentTree( PhyloTree< AlignmentTreeNode >& alignment_tree, MatchListType& mlist, vector< uint >& node_sequence_map )
+ {
+     // initialize all nodes to unaligned
+     for( node_id_t nodeI = 0; nodeI < alignment_tree.size(); nodeI++ )
+     {
+         AlignmentTreeNode& node = alignment_tree[nodeI];
+         node.children_aligned = vector< boolean >( node.children.size(), false );
+         node.parents_aligned = vector< boolean >( node.parents.size(), false );
+         node.sequence = NULL;
+         node.refined = false;
+     }
+
+     // set the sequence appropriately for extant sequences
+     node_sequence_map = vector< uint >( alignment_tree.size(), -1 );
+     for( uint seqI = 0; seqI < mlist.seq_table.size(); seqI++ )
+     {
+         // node names are 1-based: seq1, seq2, ...
+         stringstream seq_name;
+         seq_name << "seq" << seqI + 1;
+
+         // locate the first node carrying that name
+         node_id_t nodeI = 0;
+         while( nodeI < alignment_tree.size() && !( seq_name.str() == alignment_tree[nodeI].name ) )
+             nodeI++;
+         if( nodeI == alignment_tree.size() )
+             throw "Phylogenetic tree names unrecognized. Should follow seqN naming format\n";
+
+         AlignmentTreeNode& leaf = alignment_tree[nodeI];
+         leaf.sequence = mlist.seq_table[seqI];
+
+         // a one-sequence match covering the entire sequence
+         Match mm(1);
+         Match* m = mm.Copy();
+         m->SetStart(0,1);
+         m->SetLength(leaf.sequence->length(),0);
+         vector<AbstractMatch*> tmp(1,m);
+         Interval iv( tmp.begin(), tmp.end() );
+         m->Free();
+
+         // wrap it in the leaf's only SuperInterval
+         SuperInterval si( iv );
+         si.SetLeftEnd(1);
+         si.SetLength(leaf.sequence->length());
+         leaf.ordering.push_back( si );
+         node_sequence_map[nodeI] = seqI;
+     }
+ }
+
+ // Fills distmat for the intervals in iv_list in two steps: IdentityMatrix()
+ // is run on iv_list first, then TransformDistanceIdentity() is applied to
+ // the result in place.
+ void DistanceMatrix( IntervalList& iv_list, NumericMatrix<double>& distmat )
+ {
+ IdentityMatrix( iv_list, distmat );
+ TransformDistanceIdentity(distmat);
+ }
+
+/*
+void makeSuperIntervals( IntervalList& iv_list, PhyloTree< TreeNode >& alignment_tree, vector< uint >& node_sequence_map )
+{
+ std::stack< node_id_t > node_stack;
+ node_stack.push( alignment_tree.root );
+ bitset_t visited( alignment_tree.size(), false );
+ while( node_stack.size() > 0 )
+ {
+ node_id_t cur_node = node_stack.top();
+ // visit post-order
+ for( size_t cI = 0; cI < alignment_tree[cur_node].children.size(); ++cI )
+ {
+ if( !visited[alignment_tree[cur_node].children[cI]] )
+ node_stack.push(alignment_tree[cur_node].children[cI]);
+ }
+ if( node_stack.top() != cur_node )
+ continue;
+ node_stack.pop();
+ if( alignment_tree[cur_node].children.size() == 0 )
+ continue; // only process internal nodes
+
+ // process this node
+ // construct pairwise LCBs
+
+ uint seqI = node_sequence_map[alignment_tree[cur_node].children[0]];
+ uint seqJ = node_sequence_map[alignment_tree[cur_node].children[0]];
+ vector< uint > projection( 2 );
+ projection[0] = seqI;
+ projection[1] = seqJ;
+
+ vector< vector< MatchProjectionAdapter* > > LCB_list;
+ vector< LCB > projected_adjs;
+ projectIntervalList( iv_list, projection, LCB_list, projected_adjs );
+
+ // create a superinterval for each adj
+// alignment_tree[cur_node].ordering.resize(adjs.size());
+// for( size_t adjI = 0; adjI < adjs.size(); ++adjI )
+// {
+// SuperInterval& siv = alignment_tree[cur_node].ordering[adjI];
+// Match mleft(2);
+// mleft.SetStart(0,adjI);
+// mleft.SetStart(1,adjI);
+// mleft.SetLength(1);
+// siv.SetLeftEnd( adjI );
+// siv.SetLength(1);
+// }
+
+ }
+}
+*/
+
+void ProgressiveAligner::alignPP(IntervalList& prof1, IntervalList& prof2, IntervalList& interval_list ) // profile-to-profile entry point: joins guide trees built for prof1 and prof2 under one new root, then calls getAlignment()
+{
+ if( debug_aligner )
+ {
+ debug_interval = true;
+ debug_cga = true;
+ }
+
+ seq_count = prof1.seq_table.size() + prof2.seq_table.size();
+
+ if( this->breakpoint_penalty == -1 ) // -1 is treated as "not set": substitute the computed default
+ this->breakpoint_penalty = getDefaultBreakpointPenalty( original_ml.seq_table );
+
+ if( this->bp_dist_estimate_score == -1 ) // same "-1 means not set" convention
+ this->bp_dist_estimate_score = getDefaultBpDistEstimateMinScore( original_ml.seq_table );
+ cout << "using default bp penalty: " << breakpoint_penalty << endl; // NOTE(review): not guarded by the if above, so this prints even for caller-supplied values
+ cout << "using default bp estimate min score: " << bp_dist_estimate_score << endl;
+
+ if( this->collinear_genomes )
+ this->breakpoint_penalty = -1; // collinear mode overrides whatever penalty was chosen above
+
+ if( collinear_genomes )
+ cout << "\nAssuming collinear genomes...\n";
+
+ EliminateOverlaps_v2( original_ml ); // modifies the member match list itself, before it is copied below
+ // work on a local copy of the stored pairwise matches
+ MatchList mlist;
+ mlist.clear();
+ mlist = original_ml;
+ cout << "Starting with " << mlist.size() << " multi-matches\n";
+
+//
+// Step 1) Compute guide trees for each profile and join them
+//
+ NumericMatrix< double > distance1;
+ DistanceMatrix( prof1, distance1 );
+ NumericMatrix< double > distance2;
+ DistanceMatrix( prof2, distance2 );
+
+ // Build one guide tree per profile from its distance matrix;
+ // each tree is written to its own temp file that is registered for deletion
+ MuscleInterface& ci = MuscleInterface::getMuscleInterface();
+ string guide_tree_fname1 = CreateTempFileName("guide_tree");
+ registerFileToDelete( guide_tree_fname1 );
+ ci.CreateTree( distance1, guide_tree_fname1 );
+ string guide_tree_fname2 = CreateTempFileName("guide_tree");
+ registerFileToDelete( guide_tree_fname2 );
+ ci.CreateTree( distance2, guide_tree_fname2 );
+
+ // read the two trees back from the temp files
+ ifstream tree_file1( guide_tree_fname1.c_str() );
+ if( !tree_file1.is_open() )
+ throw "Error opening guide tree file";
+ PhyloTree< AlignmentTreeNode > tree1;
+ tree1.readTree( tree_file1 );
+ tree_file1.close();
+ ifstream tree_file2( guide_tree_fname2.c_str() );
+ if( !tree_file2.is_open() )
+ throw "Error opening guide tree file";
+ PhyloTree< AlignmentTreeNode > tree2;
+ tree2.readTree( tree_file2 );
+ tree_file2.close();
+
+
+ // compute pairwise distances over the combined match list
+ NumericMatrix< double > distance;
+ DistanceMatrix( mlist, distance );
+ conservation_distance.resize(boost::extents[seq_count][seq_count]);
+ for( uint seqI = 0; seqI < seq_count; ++seqI ) // mirror the seqJ > seqI entries of distance into both halves
+ for( uint seqJ = 0; seqJ < seq_count; ++seqJ )
+ if( seqJ > seqI )
+ conservation_distance[seqI][seqJ] = distance(seqI,seqJ);
+ else
+ conservation_distance[seqI][seqJ] = distance(seqJ,seqI);
+
+
+ if( !collinear_genomes )
+ {
+ cout << "Calculating pairwise breakpoint distances\n";
+ CreatePairwiseBPDistance(bp_distance);
+ cout << "bp distance matrix:\n";
+ print2d_matrix(bp_distance, cout);
+ cout << endl;
+ }
+
+ // rescale the conservation distance (the scale is fixed at 1 here)
+ double conservation_range = 1;
+ double bp_range = 1; // NOTE(review): never read in this function
+ for( uint seqI = 0; seqI < seq_count; ++seqI ) // NOTE(review): rewrites every cell from distance(seqI,seqJ), replacing the mirrored values stored above
+ for( uint seqJ = 0; seqJ < seq_count; ++seqJ )
+ conservation_distance[seqI][seqJ] = distance(seqI,seqJ) / conservation_range;
+
+ if( !(collinear_genomes && seq_count > 20 ) ) // the matrix dump is skipped only for collinear runs with more than 20 sequences
+ {
+ cout << "genome content distance matrix:\n";
+ print2d_matrix(conservation_distance, cout);
+ cout << endl;
+ }
+
+//
+// construct the alignment tree by joining trees from each profile
+//
+ vector< uint > nsmap1;
+ vector< uint > nsmap2;
+ makeAlignmentTree( tree1, prof1, nsmap1 );
+// prepareAlignmentTree(tree1);
+ makeAlignmentTree( tree2, prof2, nsmap2 );
+// prepareAlignmentTree(tree2);
+
+ alignment_tree.resize( tree1.size() + tree2.size() + 1 ); // the extra last slot becomes the new root joining both trees
+ // every node starts without a sequence index: -1 converts to the largest uint, tested for below
+ node_sequence_map = vector< uint >( alignment_tree.size(), -1 );
+
+ // copy both trees into alignment_tree; every copied node is marked aligned and refined
+ for( node_id_t nodeI = 0; nodeI < alignment_tree.size()-1; nodeI++ ) // stops before the new root in the last slot
+ {
+ if( nodeI < tree1.size() )
+ {
+ alignment_tree[nodeI].sequence = tree1[nodeI].sequence;
+ alignment_tree[nodeI].children = tree1[nodeI].children;
+ alignment_tree[nodeI].parents = tree1[nodeI].parents;
+ alignment_tree[nodeI].ordering = tree1[nodeI].ordering;
+ alignment_tree[nodeI].distance = tree1[nodeI].distance;
+ alignment_tree[nodeI].name = tree1[nodeI].name;
+ node_sequence_map[nodeI] = nsmap1[nodeI];
+ }else{
+ alignment_tree[nodeI].sequence = tree2[nodeI-tree1.size()].sequence;
+ alignment_tree[nodeI].children = tree2[nodeI-tree1.size()].children;
+ alignment_tree[nodeI].parents = tree2[nodeI-tree1.size()].parents;
+ alignment_tree[nodeI].ordering = tree2[nodeI-tree1.size()].ordering;
+ alignment_tree[nodeI].distance = tree2[nodeI-tree1.size()].distance;
+ alignment_tree[nodeI].name = tree2[nodeI-tree1.size()].name;
+ for( size_t cI = 0; cI < alignment_tree[nodeI].children.size(); cI++ ) // tree2 node ids shift by tree1.size() in the joined tree
+ alignment_tree[nodeI].children[cI] += tree1.size();
+ for( size_t pI = 0; pI < alignment_tree[nodeI].parents.size(); pI++ )
+ alignment_tree[nodeI].parents[pI] += tree1.size();
+ node_sequence_map[nodeI] = nsmap2[nodeI-tree1.size()];
+ if( node_sequence_map[nodeI] != (std::numeric_limits<uint>::max)() ) // leave the "no sequence" marker untouched
+ node_sequence_map[nodeI] += prof1.seq_table.size(); // prof2 sequence indices follow those of prof1
+ }
+
+ alignment_tree[nodeI].children_aligned = vector< boolean >( alignment_tree[nodeI].children.size(), true );
+ alignment_tree[nodeI].parents_aligned = vector< boolean >( alignment_tree[nodeI].parents.size(), true );
+ alignment_tree[nodeI].refined = true;
+ }
+
+ alignment_tree.back().children.push_back( tree1.size()-1 ); // NOTE(review): assumes each tree keeps its root in its last slot -- confirm against readTree/makeAlignmentTree
+ alignment_tree.back().children.push_back( alignment_tree.size()-2 ); // NOTE(review): the two former roots' parents lists and alignment_tree.root are not updated in this function
+ alignment_tree.back().distance = 100;
+ alignment_tree.back().children_aligned = vector< boolean >( alignment_tree.back().children.size(), true );
+ alignment_tree.back().parents_aligned = vector< boolean >( alignment_tree.back().parents.size(), true );
+ alignment_tree.back().refined = false; // only the new root is flagged as not yet refined
+
+
+ getAlignment( interval_list );
+
+}
+
+void ProgressiveAligner::getAlignment( IntervalList& interval_list )
+{
+    cout << "Aligning...\n";
+    // Align whichever pair chooseNextAlignmentPair() proposes.  The loop ends
+    // when it proposes nothing more (both node ids come back equal) or when
+    // the pair that was just aligned joins at the root of the tree.
+    node_id_t left_node;
+    node_id_t right_node;
+    node_id_t join_node;
+    do
+    {
+        chooseNextAlignmentPair( alignment_tree, left_node, right_node, join_node );
+        if( left_node == right_node )
+            break;
+
+        alignNodes( left_node, right_node, join_node );
+    }
+    while( join_node != alignment_tree.root );
+
+    if( refine )
+    {
+        // optional closing round of iterative refinement, started at the root
+        cout << "Performing final pass iterative refinement\n";
+        doGappedAlignment(alignment_tree.root, false);
+    }
+
+    // the result is read from the root node's ordering: report its size first
+    vector< SuperInterval >& root_ivs = alignment_tree[alignment_tree.root].ordering;
+    cout << "root alignment has " << root_ivs.size() << " superintervals\n";
+    gnSeqI total_len = 0;
+    for( vector< SuperInterval >::iterator siv = root_ivs.begin(); siv != root_ivs.end(); ++siv )
+    {
+        total_len += siv->Length();
+    }
+    cout << "root alignment length: " << total_len << endl;
+
+    // Extract one GappedAlignment per root superinterval and append it to the
+    // caller's list, wrapped in an Interval built from a one-element range.
+    for( size_t sivI = 0; sivI < root_ivs.size(); ++sivI )
+    {
+        GappedAlignment ga(seq_count, root_ivs[sivI].Length());
+        extractAlignment(alignment_tree.root, sivI, ga);
+        vector<AbstractMatch*> single(1, &ga);
+        interval_list.push_back( Interval(single.begin(), single.end()) );
+    }
+
+}
+
+/**
+ * Aligns the sequences of seq_table: builds or loads a guide tree, prepares distance matrices, then fills interval_list via getAlignment().
+ */
+
+void ProgressiveAligner::align( vector< gnSequence* >& seq_table, IntervalList& interval_list ){
+ if( debug_aligner )
+ {
+ debug_interval = true;
+ debug_cga = true;
+ }
+
+ seq_count = seq_table.size();
+ this->currently_recursing = false;
+ interval_list.seq_table = seq_table;
+
+ // local working copy of the pairwise matches (filled from original_ml below)
+ MatchList mlist;
+ mlist.seq_table = seq_table; // NOTE(review): mlist is reassigned from original_ml further down; presumably that replaces this seq_table too -- confirm
+
+ if( this->breakpoint_penalty == -1 ) // -1 is treated as "not set": substitute the computed default
+ this->breakpoint_penalty = getDefaultBreakpointPenalty( seq_table );
+ if( this->bp_dist_estimate_score == -1 ) // NOTE(review): reads original_ml.seq_table while the line above reads seq_table -- confirm both always hold the same sequences
+ this->bp_dist_estimate_score = getDefaultBpDistEstimateMinScore( original_ml.seq_table );
+ cout << "using default bp penalty: " << breakpoint_penalty << endl; // NOTE(review): not guarded by the ifs above, so this prints even for caller-supplied values
+ cout << "using default bp estimate min score: " << bp_dist_estimate_score << endl;
+
+ if( this->collinear_genomes )
+ this->breakpoint_penalty = -1; // collinear mode overrides whatever penalty was chosen above
+
+ if( collinear_genomes )
+ cout << "\nAssuming collinear genomes...\n";
+
+ mlist.clear();
+ mlist = original_ml;
+ cout << "Starting with " << mlist.size() << " multi-matches\n";
+ cout << "Computing genome content distance matrix...\n";
+
+//
+// Step 2) Compute a phylogenetic guide tree using the pairwise matches
+//
+ NumericMatrix< double > distance;
+ SingleCopyDistanceMatrix( mlist, mlist.seq_table, distance );
+ cout << "\n\nGenome conservation distance matrix: " << endl;
+ distance.print(cout);
+ cout << endl;
+
+ bool input_tree_specified = input_guide_tree_fname != "";
+ bool output_tree_specified = output_guide_tree_fname != "";
+ if( !input_tree_specified )
+ {
+ // No tree supplied: build one from the distance matrix and read it back below
+ if( !output_tree_specified )
+ output_guide_tree_fname = CreateTempFileName("guide_tree");
+ input_guide_tree_fname = output_guide_tree_fname; // the file written here is the one loaded below
+ cout << "Writing guide tree to " << output_guide_tree_fname << endl;
+ MuscleInterface& mi = MuscleInterface::getMuscleInterface();
+ mi.CreateTree( distance, output_guide_tree_fname );
+
+ // ci.SetDistanceMatrix( distance, output_guide_tree_fname );
+ }
+
+ conservation_distance.resize(boost::extents[seq_count][seq_count]);
+ for( uint seqI = 0; seqI < seq_count; ++seqI ) // mirror the seqJ > seqI entries of distance; each branch stores both [I][J] and [J][I]
+ for( uint seqJ = 0; seqJ < seq_count; ++seqJ )
+ if( seqJ > seqI )
+ {
+ conservation_distance[seqI][seqJ] = distance(seqI,seqJ);
+ conservation_distance[seqJ][seqI] = distance(seqI,seqJ);
+ }
+ else
+ {
+ conservation_distance[seqI][seqJ] = distance(seqJ,seqI);
+ conservation_distance[seqJ][seqI] = distance(seqJ,seqI);
+ }
+
+ cout << "reading tree...\n";
+ // load the guide tree
+ ifstream tree_file( input_guide_tree_fname.c_str() );
+ if( !tree_file.is_open() )
+ throw "Error opening guide tree file";
+ alignment_tree.readTree( tree_file );
+ tree_file.close();
+
+ cout << "initializing alignment tree...\n";
+ node_id_t node1;
+ node_id_t node2;
+ findMidpoint( alignment_tree, node1, node2 ); // midpoint rooting: locate the branch, then move the root onto it
+ moveRootToBranch( alignment_tree, node1, node2 );
+
+ makeAlignmentTree( alignment_tree, mlist, node_sequence_map );
+ // older rooting approach, kept for reference:
+// findMidpoint( alignment_tree, node1, node2 );
+// node_id_t ancestor = 0;
+// if( seq_count > 2 ) // if only two sequences then the tree already has a root
+// ancestor = createAlignmentTreeRoot( alignment_tree, node1, node2 );
+
+ // write out the rooted guide tree, but don't clobber the user's input tree
+ if( !input_tree_specified || output_tree_specified )
+ {
+ ofstream out_tree_file( output_guide_tree_fname.c_str() );
+ if( !out_tree_file.is_open() )
+ throw "Error opening guide tree file for write";
+ alignment_tree.writeTree( out_tree_file );
+ out_tree_file.close();
+ }
+
+ // ensure the root is the last to get aligned and swap children to canonical order
+ extendRootBranches(alignment_tree);
+
+
+ if( !collinear_genomes )
+ {
+ // need sol lists for scoring
+ vector<SeedOccurrenceList> blah(seq_count); // fresh lists, one per sequence, swapped into the member
+ swap( blah, sol_list );
+// sol_list = ;
+ // temporarily create a weight 11 SML
+/* MatchList w11_mlist;
+ w11_mlist.seq_filename = original_ml.seq_filename;
+ w11_mlist.seq_table = original_ml.seq_table;
+ cout << "Creating weight 11 SMLs for repeat detection\n";
+ w11_mlist.CreateMemorySMLs( 11, NULL );
+*/
+ cout << "Constructing seed occurrence lists for repeat detection\n";
+#pragma omp parallel for
+ for( int seqI = 0; seqI < seq_count; seqI++ ) // each iteration writes only its own sol_list[seqI]; the index is a signed int
+ {
+ sol_list[seqI].construct(*(mlist.sml_table[seqI]));
+// delete w11_mlist.sml_table[seqI];
+ }
+// w11_mlist.sml_table.clear();
+ }
+ if( !collinear_genomes && use_weight_scaling )
+ {
+ cout << "Calculating pairwise breakpoint distances\n";
+ CreatePairwiseBPDistance(bp_distance);
+ }
+
+ // rescale the conservation distance, or zero both matrices when scaling is off
+ if( use_weight_scaling )
+ { // NOTE(review): with collinear_genomes and use_weight_scaling both set, bp_distance is neither filled nor resized in this function
+ for( uint seqI = 0; seqI < seq_count; ++seqI ) // rewrites every cell from distance(seqI,seqJ), replacing the mirrored values stored earlier
+ for( uint seqJ = 0; seqJ < seq_count; ++seqJ )
+ conservation_distance[seqI][seqJ] = distance(seqI,seqJ) * conservation_dist_scale;
+ }else{
+ bp_distance.resize(boost::extents[seq_count][seq_count]);
+ for( uint seqI = 0; seqI < seq_count; ++seqI )
+ for( uint seqJ = 0; seqJ < seq_count; ++seqJ )
+ {
+ conservation_distance[seqI][seqJ] = 0;
+ bp_distance[seqI][seqJ] = 0;
+ }
+ }
+
+ if( !collinear_genomes )
+ {
+ cout << "genome content distance matrix:\n";
+ print2d_matrix(conservation_distance, cout);
+ cout << endl;
+ cout << "bp distance matrix:\n";
+ print2d_matrix(bp_distance, cout);
+ cout << endl;
+ }
+
+ getAlignment( interval_list );
+}
+
+
+// broken and unused function graveyard
+
+}
diff --git a/libMems/ProgressiveAligner.h b/libMems/ProgressiveAligner.h
new file mode 100644
index 0000000..84b3756
--- /dev/null
+++ b/libMems/ProgressiveAligner.h
@@ -0,0 +1,637 @@
+/*******************************************************************************
+ * $Id: ProgressiveAligner.h,v 1.23 2004/04/19 23:10:13 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifndef _ProgressiveAligner_h_
+#define _ProgressiveAligner_h_
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/SuperInterval.h"
+#include "libMems/Aligner.h"
+#include "libMems/PhyloTree.h"
+#include "libMems/GreedyBreakpointElimination.h"
+#include "libMems/CompactGappedAlignment.h"
+#include "libMems/Islands.h"
+#include <boost/type_traits/remove_pointer.hpp>
+#include <boost/multi_array.hpp>
+#include "libMems/SeedOccurrenceList.h"
+#include "libMems/SubstitutionMatrix.h"
+#include "libMems/MatchProjectionAdapter.h"
+
+namespace mems
+{
+
+/** controls whether copious debugging tests and output gets written to screen */
+extern bool debug_aligner;
+
+
+/** A class that stores alignment-related information as a node in a phylogenetic tree */
+class AlignmentTreeNode : public TreeNode
+{
+public:
+    AlignmentTreeNode() : TreeNode(), sequence(NULL), refined(false) {} /**< sequence starts out NULL so that nodes which never get one assigned hold the documented value instead of an indeterminate pointer */
+    std::vector< SuperInterval > ordering;	/**< A total ordering on alignments of sequence contained by leafs below this node */
+    std::vector< boolean > parents_aligned;	/**< have parents been aligned? one flag per entry of the node's parents list */
+    std::vector< boolean > children_aligned;	/**< have children been aligned? one flag per entry of the node's children list */
+    genome::gnSequence* sequence;	/**< The sequence associated with this node, NULL for ancestral nodes */
+    bool refined;	/**< true if iterative refinement has been applied to the alignment at this node */
+};
+
+
+double getDefaultBreakpointPenalty( std::vector< genome::gnSequence* >& sequences );
+
+/**
+ * Computes multiple genome alignments using a progressive alignment algorithm
+ */
+class ProgressiveAligner : public mems::Aligner
+{
+public:
+ /**
+ * Constructs an aligner for the specified number of sequences.
+ * @param seq_count The number of sequences that will be aligned with this Aligner
+ */
+ ProgressiveAligner( uint seq_count );
+ ProgressiveAligner( const ProgressiveAligner& al );
+ ProgressiveAligner& operator=( const ProgressiveAligner& al );
+ ~ProgressiveAligner();
+
+ /** sets the breakpoint penalty; align() and alignPP() replace a value of -1 with a computed default */
+ void setBreakpointPenalty( double bp_penalty ){ breakpoint_penalty = bp_penalty; }
+ /** sets the minimum breakpoint penalty after scaling */
+ void setMinimumBreakpointPenalty( double min_bp_penalty ){ min_breakpoint_penalty = min_bp_penalty; }
+ /** assume all genomes are collinear when set to true */
+ void setCollinear( boolean collinear ){ this->collinear_genomes = collinear; }
+ /** use a list of precomputed matches instead of computing them */
+ void setPairwiseMatches( mems::MatchList& pair_ml );
+ /** use a precomputed guide tree stored in the given file */
+ void setInputGuideTreeFileName( std::string& fname ){ this->input_guide_tree_fname = fname; }
+ /** write the guide tree to the given file */
+ void setOutputGuideTreeFileName( std::string& fname ){ this->output_guide_tree_fname = fname; }
+ /** set the max length (in columns) of alignments passed to MUSCLE */
+ void SetMaxGappedAlignmentLength( size_t len );
+ /** set whether a cache database should be used to speed up recursive anchor search */
+ void SetUseCacheDb( bool cbd ){ this->using_cache_db = cbd; }
+
+ /** Set whether iterative refinement using MUSCLE should be performed (true/false) */
+ void setRefinement( bool refine ){ this->refine = refine; }
+ /** Set the gapped_alignment flag (true/false); presumably toggles the gapped alignment step -- confirm in Aligner */
+ void setGappedAlignment( bool do_gapped_alignment ){ this->gapped_alignment = do_gapped_alignment; }
+
+ void setPairwiseScoringScheme( const mems::PairwiseScoringScheme& pss ){ this->subst_scoring = pss; } // stores a copy of pss in subst_scoring
+
+ enum LcbScoringScheme
+ {
+ AncestralScoring,
+ AncestralSumOfPairsScoring,
+ ExtantSumOfPairsScoring
+ };
+
+ /** set the LCB scoring scheme */
+ void setLcbScoringScheme( LcbScoringScheme scheme ){ scoring_scheme = scheme; }
+ LcbScoringScheme getLcbScoringScheme(void){ return scoring_scheme; }
+
+ void setUseSeedFamilies( bool use_seed_families ){ this->use_seed_families = use_seed_families; }
+ bool getUseSeedFamilies(void){ return this->use_seed_families; }
+
+ void setUseLcbWeightScaling( bool use_weight_scaling ){ this->use_weight_scaling = use_weight_scaling; } // align() zeroes both distance matrices when this is false
+ bool getUseLcbWeightScaling(void){ return this->use_weight_scaling; }
+
+ void setBreakpointDistanceScale( double bp_dist_scale ){ this->bp_dist_scale = bp_dist_scale; }
+ double getBreakpointDistanceScale(void){ return this->bp_dist_scale; }
+
+ void setConservationDistanceScale( double conservation_dist_scale ){ this->conservation_dist_scale = conservation_dist_scale; } // multiplier applied to the conservation distances in align()
+ double getConservationDistanceScale(void){ return this->conservation_dist_scale; }
+
+ void setBpDistEstimateMinScore( double min_score ){ this->bp_dist_estimate_score = min_score; } // align() and alignPP() replace a value of -1 with a computed default
+ double getBpDistEstimateMinScore(void){ return this->bp_dist_estimate_score; }
+
+ /** determine which extant sequences have been aligned at a given node */
+ void getAlignedChildren( node_id_t node, std::vector< node_id_t >& descendants );
+
+ /** chooses an ordering for aligned intervals at an ancestor node */
+ void createAncestralOrdering( std::vector< mems::Interval* >& interval_list, std::vector< SuperInterval >& ancestral_sequence );
+
+ /** constructs an alignment of node1 and node2 at their ancestor */
+ void alignProfileToProfile( node_id_t node1, node_id_t node2, node_id_t ancestor );
+
+ /** align the sequences at the designated pair of alignment tree nodes */
+ void alignNodes( node_id_t node1, node_id_t node2, node_id_t ancestor );
+
+
+ /** Given a set of sequences, construct and output an alignment as an IntervalList */
+ void align( std::vector< genome::gnSequence* >& seq_table, mems::IntervalList& interval_list );
+
+ void getPath( node_id_t first_n, node_id_t last_n, std::vector< node_id_t >& path );
+ template<class MatchType>
+ void propagateDescendantBreakpoints( node_id_t node1, uint seqI, std::vector< MatchType* >& iv_list );
+ void linkSuperIntervals( node_id_t node1, uint seqI, node_id_t ancestor );
+ void recursiveApplyAncestralBreakpoints( node_id_t ancestor );
+ void extractAlignment( node_id_t ancestor, size_t super_iv, mems::GappedAlignment& gal ); // getAlignment() calls this once per root superinterval
+ void extractAlignment( node_id_t ancestor, size_t super_iv, mems::CompactGappedAlignment<>& cga );
+
+ void getPairwiseMatches( const std::vector< node_id_t >& node1_seqs, const std::vector< node_id_t >& node2_seqs, Matrix<mems::MatchList>& pairwise_matches );
+ void getAncestralMatches( const std::vector< node_id_t > node1_seqs, const std::vector< node_id_t > node2_seqs, node_id_t node1, node_id_t node2, node_id_t ancestor, std::vector< mems::AbstractMatch* >& ancestral_matches );
+ void getRepresentativeAncestralMatches( const std::vector< node_id_t > node1_seqs, const std::vector< node_id_t > node2_seqs, node_id_t node1, node_id_t node2, node_id_t ancestor, std::vector< mems::AbstractMatch* >& ancestral_matches );
+
+ // functions for recursive anchor search
+
+ template<class GappedAlignmentType>
+ void recurseOnPairs( const std::vector<node_id_t>& node1_seqs,
+ const std::vector<node_id_t>& node2_seqs, const GappedAlignmentType& iv,
+ Matrix<mems::MatchList>& matches, Matrix< std::vector< mems::search_cache_t > >& search_cache_db,
+ Matrix< std::vector< mems::search_cache_t > >& new_cache_db,
+ boost::multi_array< std::vector< std::vector< int64 > >, 2 >& iv_regions);
+ void pairwiseAnchorSearch( mems::MatchList& r_list, mems::Match* r_begin, mems::Match* r_end, const mems::AbstractMatch* iv, uint oseqI, uint oseqJ );
+
+ void translateGappedCoordinates( std::vector<mems::AbstractMatch*>& ml, uint seqI, node_id_t extant, node_id_t ancestor );
+
+ void doGappedAlignment( node_id_t ancestor, bool profile_aln ); // getAlignment() calls this on the root for the final refinement pass
+ void refineAlignment( mems::GappedAlignment& gal, node_id_t ancestor, bool profile_aln, AlnProgressTracker& apt );
+ void FixLeftEnds( node_id_t ancestor );
+ void ConstructSuperIntervalFromMSA( node_id_t ancestor, size_t ans_siv, mems::GappedAlignment& gal );
+
+ // determines LCBs among each pair of genomes using a somewhat stringent homology
+ // criterion. fills the distance matrix with the number of breakpoints between each pair
+ void CreatePairwiseBPDistance( boost::multi_array<double, 2>& bp_distmat );
+
+ void constructLcbTrackingMatches( node_id_t ancestral_node, std::vector< mems::AbstractMatch* >& ancestral_matches, std::vector< mems::LcbTrackingMatch< mems::AbstractMatch* > >& tracking_matches );
+
+ void pairwiseScoreTrackingMatches(
+ std::vector< mems::TrackingMatch >& tracking_matches,
+ std::vector<node_id_t>& node1_descendants,
+ std::vector<node_id_t>& node2_descendants,
+ boost::multi_array< double, 3 >& tm_score_array
+ );
+
+ void computeAvgAncestralMatchScores(
+ std::vector< TrackingMatch >& tracking_matches,
+ std::vector<node_id_t>& node1_descendants,
+ std::vector<node_id_t>& node2_descendants,
+ boost::multi_array< double, 3 >& tm_score_array
+ );
+
+ void computeInternalNodeDistances(
+ boost::multi_array<double, 2>& bp_dist_mat,
+ boost::multi_array<double, 2>& cons_dist_mat,
+ std::vector<node_id_t>& node1_descendants,
+ std::vector<node_id_t>& node2_descendants);
+
+ bool validateSuperIntervals(node_id_t node1, node_id_t node2, node_id_t ancestor);
+ bool validatePairwiseIntervals(node_id_t node1, node_id_t node2, std::vector<mems::Interval*>& pair_iv);
+
+
+ void alignPP(mems::IntervalList& prof1, mems::IntervalList& prof2, mems::IntervalList& interval_list ); // profile-to-profile variant of align(): takes two existing alignments as input
+
+protected:
+ void getAlignment( mems::IntervalList& interval_list ); // shared final stage of align() and alignPP()
+
+ mems::MatchList original_ml; /**< The list of matches calculated among all sequences. Also contains the full sequences and sorted mer lists */
+ PhyloTree< AlignmentTreeNode > alignment_tree; /**< the guide tree whose nodes carry the alignment state */
+ std::vector< uint > node_sequence_map; /**< indexed by tree node id: that node's sequence index, or the largest uint value when it has none */
+ double breakpoint_penalty; /**< -1 is used as "not set" by align() and alignPP() */
+ double min_breakpoint_penalty;
+ std::string input_guide_tree_fname; /**< empty string means no input guide tree was specified */
+ std::string output_guide_tree_fname; /**< empty string means no output file was specified */
+ boolean debug;
+ boolean refine; /**< when true, getAlignment() runs a final refinement pass at the root */
+ bool using_cache_db;
+
+ std::vector< SeedOccurrenceList > sol_list; /**< one list per sequence; rebuilt by align() unless genomes are collinear */
+ boost::multi_array<double, 2> bp_distance; /**< pairwise breakpoint distances. dims will be [seq_count][seq_count] */
+ boost::multi_array<double, 2> conservation_distance; /**< pairwise genome conservation distances. dims will be [seq_count][seq_count] */
+
+ LcbScoringScheme scoring_scheme;
+ bool use_weight_scaling;
+ bool use_seed_families;
+
+ double bp_dist_scale;
+ double conservation_dist_scale;
+
+ double bp_dist_estimate_score; /**< the minimum LCB score to use when estimating BP distance. should be conservative (high) */
+
+ size_t max_gapped_alignment_length;
+
+ mems::PairwiseScoringScheme subst_scoring;
+};
+
+extern bool debug_aligner;
+
+ /** Select the next pair of nodes to align
+ * The chosen pair will either be unaligned extant sequences or unaligned
+ * ancestral sequences whose descendants have all been aligned. The chosen pair has
+ * the shortest path on the tree
+ * When no sequences remain to be aligned, returns node1 == node2
+ */
+void chooseNextAlignmentPair( PhyloTree< AlignmentTreeNode >& alignment_tree, node_id_t& node1, node_id_t& node2, node_id_t& ancestor );
+
+void markAligned( PhyloTree< AlignmentTreeNode >& alignment_tree, node_id_t subject_node, node_id_t neighbor );
+
+node_id_t createAlignmentTreeRoot( PhyloTree< AlignmentTreeNode >& alignment_tree, node_id_t node1, node_id_t node2 );
+
+// homogenizes an alignment tree and ordering to prepare for alignment
+void prepareAlignmentTree( PhyloTree< AlignmentTreeNode >& alignment_tree );
+
+inline
+ProgressiveAligner::~ProgressiveAligner()
+{
+    // hand every match stored in original_ml back through its own Free() method
+    for( size_t matchI = 0; matchI != original_ml.size(); ++matchI ) original_ml[matchI]->Free();
+}
+
+template<class T>
+class AbsolutComparator
+{
+public:
+    boolean operator()(const T& lhs, const T& rhs) const {	// "less than" on genome::absolut() of each operand
+        const boolean lhs_is_smaller = (genome::absolut(lhs) < genome::absolut(rhs));
+        return lhs_is_smaller;
+    }
+};
+
+
+
+template <class MatchVector>
+void processNewMatch( uint seqI, MatchVector& new_matches, typename MatchVector::value_type& new_match )
+{
+    new_match->SetStart( seqI, 0 );	// the fragment always has its start in seqI reset to 0 before it is judged
+    if( new_match->Multiplicity() <= 1 || !(new_match->Length(seqI) > 0) )
+    {
+        new_match->Free();	// rejected: release it and null the caller's pointer
+        new_match = NULL;
+        return;
+    }
+    new_matches.push_back( new_match );	// accepted: the pointer now also lives in new_matches
+}
+inline
+bool checkConsistent(const AbstractMatch* a, const AbstractMatch* b)
+{
+    // true when b->Start - a->Start is one and the same value in at least two sequences where both LeftEnds are nonzero
+    const int64 unset = (std::numeric_limits<int64>::max)();
+    int64 offset = unset;
+    int64 shared_seqs = 0;
+    bool same_offset = true;
+    for( size_t seqI = 0; seqI < a->SeqCount(); seqI++ )
+    {
+        if(b->LeftEnd(seqI) == 0 || a->LeftEnd(seqI) == 0)
+            continue;	// at least one of the two has a zero LeftEnd here
+        ++shared_seqs;
+        const int64 cur_offset = b->Start(seqI) - a->Start(seqI);
+        if(offset == unset)
+            offset = cur_offset;	// first shared sequence fixes the reference offset
+        same_offset = same_offset && (offset == cur_offset);
+    }
+    return same_offset && shared_seqs > 1;
+}
+
+/**
+ * Delete overlapping regions in favor of the larger match.
+ * This code isn't perfect, it can delete too many base pairs in some cases
+ * @param ml The vector of matches
+ * @param seq_ids The indexes of sequences in which overlaps should be eliminated
+ * @param eliminate_both Delete both of the overlapping matches, instead of leaving one remaining
+ */
+template <class MatchVector>
+void EliminateOverlaps_v2( MatchVector& ml, const std::vector< uint >& seq_ids, bool eliminate_both = false ){
+ if( ml.size() < 2 )
+ return;
+ uint seq_count = ml[0]->SeqCount(); // NOTE(review): not read anywhere else in this function
+ for( uint sidI = 0; sidI < seq_ids.size(); sidI++ ){ // one full elimination pass per requested sequence
+ uint seqI = seq_ids[ sidI ];
+ mems::SingleStartComparator<mems::AbstractMatch> msc( seqI );
+ std::sort( ml.begin(), ml.end(), msc ); // presumably orders the matches by their start in seqI -- confirm in SingleStartComparator
+ int64 matchI = 0;
+ int64 nextI = 0;
+ int64 deleted_count = 0;
+ MatchVector new_matches; // fragments cut off during this pass; appended to ml at the end of the pass
+
+ // scan forward to first defined match
+ for(; matchI != ml.size(); matchI++ )
+ if( ml[ matchI ]->Start( seqI ) != mems::NO_MATCH )
+ break;
+
+ for(; matchI < ml.size(); matchI++ ){
+ if( ml[ matchI ] == NULL ) // slot was emptied earlier in this pass
+ continue;
+
+ for( nextI = matchI + 1; nextI < ml.size(); nextI++ ){
+ if( ml[ nextI ] == NULL )
+ continue;
+
+ boolean deleted_matchI = false;
+ // check for overlaps
+ int64 startI = ml[ matchI ]->Start( seqI );
+ int64 lenI = ml[ matchI ]->Length( seqI );
+ int64 startJ = ml[ nextI ]->Start( seqI );
+ int64 diff = genome::absolut( startJ ) - genome::absolut( startI ) - lenI; // gap between the end of matchI and the start of nextI; negative means they overlap
+
+ if( diff >= 0 )
+ break; // there are no more overlaps
+
+ diff = -diff; // from here on diff is the positive size of the overlap
+ typename MatchVector::value_type new_match;
+ bool mem_iter_smaller = ( ml[ nextI ]->Multiplicity() > ml[ matchI ]->Multiplicity() ) || // true when nextI wins: higher multiplicity, or equal multiplicity and longer in seqI
+ ( ml[ nextI ]->Multiplicity() == ml[ matchI ]->Multiplicity() && ml[ nextI ]->Length(seqI) > ml[ matchI ]->Length(seqI) );
+
+ bool consistent_overlap = checkConsistent( ml[ matchI ], ml[ nextI ] );
+
+ // delete bases from the smaller match (from both when eliminate_both is set and the overlap is inconsistent)
+ if( (!consistent_overlap && eliminate_both) || mem_iter_smaller )
+ {
+ // ml[matchI] is the one that loses bases here
+ new_match = ml[matchI]->Copy();
+ // erase base pairs from new_match
+ if( diff >= lenI ){ // the overlap covers all of matchI in seqI
+// cerr << "Deleting " << **mem_iter << " at the hands of\n" << **next_iter << endl;
+ ml[ matchI ]->Free();
+ ml[ matchI ] = NULL;
+ matchI--; // offsets the outer loop's matchI++ after the break below, so the scan resumes at this now-NULL slot
+ deleted_matchI = true;
+ deleted_count++;
+ }else{
+ ml[ matchI ]->CropRight( diff, seqI ); // presumably trims diff columns off the right end -- confirm in AbstractMatch
+ new_match->CropLeft( new_match->Length(seqI) - diff, seqI ); // the copy keeps what was trimmed off
+ }
+ processNewMatch( seqI, new_matches, new_match ); // resets the copy's start in seqI, then keeps or frees it
+ }
+ if( (!consistent_overlap && eliminate_both) || !mem_iter_smaller )
+ {
+ // ml[nextI] is the one that loses bases here
+ new_match = ml[nextI]->Copy();
+ // erase base pairs from new_match
+ if( diff >= ml[ nextI ]->Length(seqI) ){ // the overlap covers all of nextI in seqI
+// cerr << "Deleting " << **next_iter << " at the hands of\n" << **mem_iter << endl;
+ ml[ nextI ]->Free();
+ ml[ nextI ] = NULL;
+ deleted_count++;
+ }else{
+ ml[ nextI ]->CropLeft( diff, seqI );
+ new_match->CropRight( new_match->Length(seqI) - diff, seqI );
+ }
+ processNewMatch( seqI, new_matches, new_match );
+ }
+ if( deleted_matchI ) // matchI no longer exists, so stop comparing it against later matches
+ break;
+ }
+ }
+
+ if( deleted_count > 0 ){ // compact ml: slide the surviving pointers forward over the NULL slots
+ size_t cur = 0;
+ for( size_t mI = 0; mI < ml.size(); ++mI )
+ if( ml[mI] != NULL )
+ ml[cur++] = ml[mI];
+ ml.erase( ml.begin() + cur, ml.end() );
+ }
+ ml.insert( ml.end(), new_matches.begin(), new_matches.end() ); // the cut-off fragments rejoin ml before the next sequence's pass
+ new_matches.clear();
+ }
+}
+
+template <class MatchVector>
+void EliminateOverlaps_v2( MatchVector& ml, bool eliminate_both = false )
+{
+    if( ml.size() < 2 )
+        return;	// fewer than two matches: nothing can overlap
+    const uint n_seqs = ml[0]->SeqCount();	// sequence count is taken from the first match
+    std::vector< uint > all_seqs( n_seqs );
+    for( uint seqI = 0; seqI < n_seqs; ++seqI )
+        all_seqs[seqI] = seqI;	// identity list 0 .. n_seqs-1, i.e. every sequence
+    EliminateOverlaps_v2( ml, all_seqs, eliminate_both );
+}
+
+template< class MatchVector >
+uint64 SimpleGetLCBCoverage( MatchVector& lcb ){	// returns the sum of a per-match score over all matches in lcb
+    typename MatchVector::iterator match_iter = lcb.begin();
+    uint64 coverage = 0;
+    for( ; match_iter != lcb.end(); ++match_iter ){
+        double maxlen = 0;	// despite the name: the SUM of Length over the defined sequences
+        double minlen = 0;	// despite the name: the LARGEST single-sequence Length
+        for( uint seqI = 0; seqI < (*match_iter)->SeqCount(); seqI++ )
+        {
+            if( (*match_iter)->LeftEnd(seqI) != mems::NO_MATCH )
+            {
+                maxlen += (double)(*match_iter)->Length(seqI);
+                if( (*match_iter)->Length(seqI) > minlen )
+                    minlen = (double)(*match_iter)->Length(seqI);
+            }
+        }
+        // sum == largest when at most one sequence contributes; dividing by that zero would give NaN/inf,
+        // and converting NaN/inf to uint64 is undefined, so such a match scores with a ratio of 0 instead
+        const double range = maxlen - minlen;
+        const double ratio = range > 0 ? ((*match_iter)->AlignmentLength() - minlen) / range : 0.0;
+        coverage += (uint64)(exp( ratio ) * maxlen);
+    }
+    return coverage;
+}
+
+template< class MatchVectorType >
+void addUnalignedIntervals_v2( MatchVectorType& iv_list, std::set< uint > seq_set, std::vector<gnSeqI> seq_lengths )
+{
+    uint seq_count = seq_lengths.size();
+
+    // An empty seq_set is shorthand for "process every sequence".
+    if( seq_set.empty() )
+        for( uint allI = 0; allI < seq_count; allI++ )
+            seq_set.insert( allI );
+
+    // Wrap every interval in a one-element group of its own and let
+    // computeLCBAdjacencies_v3 work out the adjacencies between those groups.
+    typedef typename MatchVectorType::value_type IvPtr;
+    std::vector< std::vector< IvPtr > > singleton_groups;
+    for( size_t ivI = 0; ivI < iv_list.size(); ++ivI )
+        singleton_groups.push_back( std::vector< IvPtr >( 1, iv_list[ivI] ) );
+    std::vector< double > scores( iv_list.size(), 0 );
+    std::vector< mems::LCB > adjacencies;
+    computeLCBAdjacencies_v3( singleton_groups, scores, adjacencies );
+
+    // rightmost[s] is the index of the LCB whose right_adjacency in sequence s is -1; it stays -1 until one is seen
+    std::vector< int > rightmost( seq_count, -1 );
+    const size_t lcb_count = adjacencies.size();
+
+    // One pass per LCB handles the gap on that LCB's left side.  The extra
+    // final pass (lcbI == lcb_count) handles the gap to the right of the LCB recorded in rightmost.
+    for( uint lcbI = 0; lcbI <= lcb_count; lcbI++ ){
+        const bool final_pass = !(lcbI < lcb_count);
+        for( std::set< uint >::iterator seq_it = seq_set.begin(); seq_it != seq_set.end(); ++seq_it ){
+            uint seqI = *seq_it;
+
+            // the two LCB indices that bound the gap; -1 on the right means no LCB there
+            int leftI;
+            if( final_pass )
+                leftI = rightmost[ seqI ];
+            else
+                leftI = adjacencies[ lcbI ].left_adjacency[ seqI ];
+            int rightI = final_pass ? -1 : (int)lcbI;
+
+            // remember this LCB when nothing lies to its right in seqI
+            if( !final_pass && adjacencies[ lcbI ].right_adjacency[ seqI ] == -1 )
+                rightmost[ seqI ] = lcbI;
+
+            int64 gap_left, gap_right;
+            mems::getGapBounds( seq_lengths, adjacencies, seqI, leftI, rightI, gap_left, gap_right );
+            const int64 gap_len = genome::absolut( gap_right ) - genome::absolut( gap_left );
+            if( !(gap_len > 0) )
+                continue;	// nothing unaligned between these two neighbours
+
+            // describe the gap as a Match whose start is 0 in every sequence but seqI,
+            // then append a copy of an Interval holding just that match
+            mems::Match proto( seq_count );
+            mems::Match* gap_match = proto.Copy();
+            for( uint seqJ = 0; seqJ < seq_count; seqJ++ )
+                gap_match->SetStart( seqJ, 0 );
+            gap_match->SetStart( seqI, gap_left );
+            gap_match->SetLength( gap_len );
+            mems::Interval iv;
+            std::vector< mems::AbstractMatch* > one_match(1, gap_match);
+            iv.SetMatches( one_match );
+            iv_list.push_back( iv.Copy() );
+        }
+    }
+
+}
+
+/**
+ * Projects the intervals of iv_list onto the sequences named in projection
+ * and computes LCBs and their adjacencies for the projected intervals.
+ * Intervals whose LeftEnd is NO_MATCH in any projected sequence are skipped.
+ * Every retained interval is wrapped in a heap-allocated
+ * MatchProjectionAdapter, inverted when its orientation in the first
+ * projected sequence is reverse.
+ * @param iv_list         source intervals
+ * @param projection      indices of the sequences to project onto
+ * @param LCB_list        filled by ComputeLCBs_v2
+ * @param projected_adjs  filled by computeLCBAdjacencies_v3
+ */
+inline
+void projectIntervalList( mems::IntervalList& iv_list, std::vector< uint >& projection, std::vector< std::vector< mems::MatchProjectionAdapter* > >& LCB_list, std::vector< mems::LCB >& projected_adjs )
+{
+    // the adapter is constructed from size_t indices
+    std::vector< size_t > proj( projection.begin(), projection.end() );
+
+    // construct pairwise Interval projections
+    std::vector< mems::MatchProjectionAdapter* > mpa_list;
+    for( size_t corI = 0; corI < iv_list.size(); corI++ )
+    {
+        bool defined_everywhere = true;
+        for( size_t projI = 0; defined_everywhere && projI < projection.size(); ++projI )
+            defined_everywhere = iv_list[corI].LeftEnd(projection[projI]) != mems::NO_MATCH;
+        if( !defined_everywhere )
+            continue;
+
+        mems::MatchProjectionAdapter mpa_tmp( &iv_list[corI], proj );
+        mems::MatchProjectionAdapter* adapter = mpa_tmp.Copy();
+        mpa_list.push_back( adapter );
+        if( adapter->Orientation(0) == mems::AbstractMatch::reverse )
+            adapter->Invert();
+    }
+
+    std::vector< gnSeqI > breakpoints;
+    IdentifyBreakpoints( mpa_list, breakpoints );
+    ComputeLCBs_v2( mpa_list, breakpoints, LCB_list );
+
+    std::vector< double > lcb_scores( LCB_list.size(), 0 );
+    computeLCBAdjacencies_v3( LCB_list, lcb_scores, projected_adjs );
+}
+
+
+/**
+ * Manipulator over pointers to MatchType that pins every operation to one
+ * sequence, chosen at construction.  Offers the LeftEnd / Length / CropLeft /
+ * CropRight / AddCopy operations that applyBreakpoints() calls on its
+ * manipulator argument.
+ */
+template< class MatchType = mems::AbstractMatch >
+class GenericMatchSeqManipulator
+{
+public:
+    /** @param seq  index of the sequence all operations refer to */
+    GenericMatchSeqManipulator( uint seq ) : m_seq(seq) {}
+
+    gnSeqI LeftEnd(MatchType*& m) const
+    {
+        return m->LeftEnd(m_seq);
+    }
+
+    gnSeqI Length(MatchType*& m) const
+    {
+        return m->Length(m_seq);
+    }
+
+    void CropLeft(MatchType*& m, gnSeqI amount ) const
+    {
+        m->CropLeft(amount, m_seq);
+    }
+
+    void CropRight(MatchType*& m, gnSeqI amount ) const
+    {
+        m->CropRight(amount, m_seq);
+    }
+
+    /** Pushes the result of m->Copy() onto the end of c. */
+    template< typename ContainerType >
+    void AddCopy(ContainerType& c, MatchType*& m) const
+    {
+        c.push_back( m->Copy() );
+    }
+
+private:
+    uint m_seq;    // sequence index used by every accessor above
+};
+
+typedef GenericMatchSeqManipulator<> AbstractMatchSeqManipulator;
+
+/**
+ * Manipulator over SuperInterval values.  Offers the same LeftEnd / Length /
+ * CropLeft / CropRight / AddCopy operations as GenericMatchSeqManipulator,
+ * forwarding each to the SuperInterval itself.
+ */
+class SuperIntervalManipulator
+{
+public:
+    gnSeqI LeftEnd(const SuperInterval& siv) const
+    {
+        return siv.LeftEnd();
+    }
+
+    gnSeqI Length(const SuperInterval& siv) const
+    {
+        return siv.Length();
+    }
+
+    void CropLeft( SuperInterval& siv, gnSeqI amount ) const
+    {
+        siv.CropLeft( amount );
+    }
+
+    void CropRight( SuperInterval& siv, gnSeqI amount ) const
+    {
+        siv.CropRight( amount );
+    }
+
+    /** Pushes a copy of siv onto the end of c. */
+    template< typename ContainerType >
+    void AddCopy(ContainerType& c, const SuperInterval& siv) const
+    {
+        c.push_back( siv );
+    }
+};
+
+
+// Splits the elements of iv_list at the coordinates listed in bp_list.
+// iv_list holds whatever element type the manipulator understands (match
+// pointers for GenericMatchSeqManipulator, SuperInterval values for
+// SuperIntervalManipulator); all access goes through manip.
+// Whenever a breakpoint falls strictly inside an element, a copy of the
+// element is appended to the end of iv_list and cropped to the part left of
+// the breakpoint, while the original element is cropped in place to the part
+// right of it (presumably CropLeft/CropRight remove that many positions from
+// the respective end -- confirm against the element classes).
+// precondition: both bp_list and intervals *must* be sorted
+// Only the elements present on entry are scanned; appended copies are not
+// revisited.
+template< class T, class Maniplator >
+void applyBreakpoints( std::vector< gnSeqI >& bp_list, std::vector<T>& iv_list, Maniplator& manip )
+{
+
+ // number of elements on entry; copies appended below sit past this index
+ size_t iv_count = iv_list.size();
+ size_t bpI = 0;
+ size_t ivI = 0;
+ while( ivI < iv_count && bpI < bp_list.size() )
+ {
+ if( manip.LeftEnd(iv_list[ivI]) == NO_MATCH )
+ {
+ ++ivI;
+ continue; // undefined in seqI, so no breakpoint here
+ }
+ // the element ends at or before the breakpoint: move to the next element
+ // -(ivI)----
+ // -------|--
+ if( manip.LeftEnd(iv_list[ivI]) + manip.Length(iv_list[ivI]) <= bp_list[bpI] )
+ {
+ ++ivI;
+ continue;
+ }
+ // the breakpoint lies at or before the element's left end: move to the next breakpoint
+ // -----(ivI)-
+ // --|--------
+ if( bp_list[bpI] <= manip.LeftEnd(iv_list[ivI]) )
+ {
+ ++bpI;
+ continue;
+ }
+
+ // the breakpoint lies strictly inside the element: split it.
+ // crop_amt is the length of the piece left of the breakpoint
+ gnSeqI crop_amt = bp_list[bpI] - manip.LeftEnd(iv_list[ivI]);
+ manip.AddCopy( iv_list, iv_list[ivI] );
+ // taken after the push_back above so the reference is not invalidated by it
+ T& left_iv = iv_list.back();
+
+ manip.CropLeft( iv_list[ivI], crop_amt );
+ manip.CropRight( left_iv, manip.Length(left_iv)-crop_amt );
+ // restore ordering
+ // the cropped element's left end moved right, so bubble it past any
+ // following original elements that now start before it
+ size_t nextI = ivI + 1;
+ while( nextI < iv_count && manip.LeftEnd( iv_list[nextI-1] ) > manip.LeftEnd( iv_list[nextI] ) )
+ {
+ std::swap( iv_list[nextI-1], iv_list[nextI] );
+ nextI++;
+ }
+
+// assume that crop works correctly and that it's okay to pass matches with NO_MATCH
+// the checks below only print a diagnostic and call genome::breakHere();
+// they do not change control flow
+/**/
+ if( manip.Length( iv_list[ivI] ) == 0 )
+ {
+ std::cerr << "Big fat generic zero 1\n";
+ genome::breakHere();
+ }
+ if( manip.Length( left_iv ) == 0 )
+ {
+ std::cerr << "Big fat generic zero 2\n";
+ genome::breakHere();
+ }
+ // NOTE(review): a left end of 0 is treated as an error here, presumably
+ // because NO_MATCH is 0 -- confirm
+ if( manip.LeftEnd( iv_list[ivI] ) == 0 )
+ {
+ std::cerr << "uh oh\n";
+ genome::breakHere();
+ }
+ if( manip.LeftEnd( left_iv ) == 0 )
+ {
+ std::cerr << "uh oh 2\n";
+ genome::breakHere();
+ }
+/**/
+ }
+}
+
+
+}
+
+//namespace std {
+// void swap( PhyloTree<mems::AlignmentTreeNode>& a, PhyloTree<mems::AlignmentTreeNode>& b);
+//}
+
+#endif // _ProgressiveAligner_h_
diff --git a/libMems/RepeatHash.cpp b/libMems/RepeatHash.cpp
new file mode 100755
index 0000000..dd4c02d
--- /dev/null
+++ b/libMems/RepeatHash.cpp
@@ -0,0 +1,64 @@
+/*******************************************************************************
+ * $Id: RepeatHash.cpp,v 1.13 2004/03/01 02:40:08 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * Please see the file called COPYING for licensing, copying, and modification
+ * details.
+ * **************
+ ******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/RepeatHash.h"
+#include <list>
+
+using namespace std;
+using namespace genome;
+namespace mems {
+
+
+/** Returns a newly allocated RepeatHash copy-constructed from this object. */
+RepeatHash* RepeatHash::Clone() const{
+    RepeatHash* duplicate = new RepeatHash( *this );
+    return duplicate;
+}
+
+/**
+ * Runs the seed search when exactly one sequence is loaded.
+ * @return true after calling MatchFinder::FindMatchSeeds(); false, without
+ *         searching, when seq_count is anything other than 1
+ */
+boolean RepeatHash::CreateMatches(){
+    if( seq_count != 1 )
+        return false;
+
+    MatchFinder::FindMatchSeeds();
+    return true;
+}
+
+/** Hashes match_list as a single repeat; returns whatever HashMatch() returns. */
+boolean RepeatHash::EnumerateMatches( IdmerList& match_list ){
+    const boolean hashed = HashMatch( match_list );
+    return hashed;
+}
+
+//why have separate hash tables?
+// RepeatHashEntries use GENETICIST coordinates. They start at 1, not 0.
+/**
+ * Turns the positions in match_list into one MatchHashEntry with one
+ * component per list element and adds it to the hash.
+ * match_list is sorted by position in place.  Entries whose Multiplicity()
+ * is below 2 are not added; they are reported on stdout instead.
+ * @return always true
+ */
+boolean RepeatHash::HashMatch(IdmerList& match_list){
+    match_list.sort( &idmer_position_lessthan );
+
+    // one component per occurrence, each as long as the seed
+    MatchHashEntry mhe( match_list.size(), GetSar(0)->SeedLength() );
+    mhe.SetLength( GetSar(0)->SeedLength() );
+
+    // positions in the list are 0-based, stored starts are 1-based
+    uint32 repeatI = 0;
+    for( IdmerList::iterator cur = match_list.begin(); cur != match_list.end(); ++cur, ++repeatI )
+        mhe.SetStart( repeatI, cur->position + 1 );
+
+    SetDirection( mhe );
+    mhe.CalculateOffset();
+
+    if( mhe.Multiplicity() >= 2 ){
+        AddHashEntry(mhe);
+    }else{
+        cout << "red flag " << mhe << "\n";
+    }
+    return true;
+}
+
+} // namespace mems
diff --git a/libMems/RepeatHash.h b/libMems/RepeatHash.h
new file mode 100755
index 0000000..09888c4
--- /dev/null
+++ b/libMems/RepeatHash.h
@@ -0,0 +1,54 @@
+/*******************************************************************************
+ * $Id: RepeatHash.h,v 1.8 2004/03/01 02:40:08 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifndef _RepeatHash_h_
+#define _RepeatHash_h_
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/MemHash.h"
+
+namespace mems {
+
+/**
+ * Finds repeats within a single sequence.
+ * This class extends the functionality of MemHash to search for repetitive
+ * matches within a single sequence.
+ */
+class RepeatHash : public MemHash{
+public:
+ /** Returns a new RepeatHash copy-constructed from this object. */
+ virtual RepeatHash* Clone() const;
+ /**
+  * Calls MatchFinder::FindMatchSeeds() and returns true when seq_count is 1;
+  * returns false without searching otherwise.
+  */
+ virtual boolean CreateMatches();
+protected:
+
+ /** Forwards match_list to HashMatch() and returns its result. */
+ virtual boolean EnumerateMatches( IdmerList& match_list );
+ /**
+  * Sorts match_list by position and adds one hash entry holding every
+  * position (stored 1-based) when the entry's multiplicity is at least 2.
+  */
+ virtual boolean HashMatch(IdmerList& match_list);
+ /** Returns sar_table[0] regardless of sarI. */
+ virtual SortedMerList* GetSar(uint32 sarI) const;
+};
+
+
+// Returns the first sorted mer list whatever index is requested: sarI is
+// ignored, so every caller sees sar_table[0].
+// Presumably intentional because repeats are searched within a single
+// sequence (see the class comment) -- confirm.
+inline
+SortedMerList* RepeatHash::GetSar(uint32 sarI) const{
+ return sar_table[0];
+}
+
+// Orders idmers by their mer value.
+// NOTE(review): despite the name, this returns true when a_v.mer is LESS
+// than m_v.mer -- confirm which ordering callers expect before renaming or
+// flipping the comparison.
+inline
+bool idmer_greaterthan(idmer& a_v, idmer& m_v)
+{
+    return a_v.mer < m_v.mer;
+}
+
+// Orders idmers by ascending position; true when a_v.position is smaller
+// than m_v.position.
+inline
+bool idmer_position_lessthan(idmer& a_v, idmer& m_v)
+{
+    return a_v.position < m_v.position;
+}
+
+}
+
+#endif //_RepeatHash_h_
diff --git a/libMems/RepeatMatch.cpp b/libMems/RepeatMatch.cpp
new file mode 100644
index 0000000..2cea740
--- /dev/null
+++ b/libMems/RepeatMatch.cpp
@@ -0,0 +1,51 @@
+/*******************************************************************************
+ * $Id: Match.cpp,v 1.9 2004/03/01 02:40:08 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * Please see the file called COPYING for licensing, copying, and modification
+ * details.
+ * **************
+ ******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/RepeatMatch.h"
+#include "libGenome/gnException.h"
+#include "libGenome/gnDebug.h"
+
+namespace mems {
+
+/** Default constructor; only default-constructs the MatchHashEntry base. */
+RepeatMatch::RepeatMatch()
+    : MatchHashEntry()
+{}
+
+/** Destructor; has no work of its own to do. */
+RepeatMatch::~RepeatMatch()
+{}
+
+/**
+ * Stores seq_id at index match_id of m_seq_id so that SeqId( match_id )
+ * returns it afterwards.
+ * The original body was a commented-out vector::insert call that did not
+ * compile, which left m_seq_id permanently empty and made every SeqId()
+ * call throw std::out_of_range.
+ * @param match_id  index to store under; m_seq_id grows as needed, and any
+ *                  newly created slots below match_id are set to 0
+ * @param seq_id    value to store
+ */
+void RepeatMatch::FromSeq( uint32 match_id, uint32 seq_id )
+{
+    // widen before adding 1 so the largest uint32 index cannot wrap to 0
+    const std::vector<uint32>::size_type needed =
+        static_cast< std::vector<uint32>::size_type >( match_id ) + 1;
+    if( m_seq_id.size() < needed )
+        m_seq_id.resize( needed, 0 );
+    m_seq_id[ match_id ] = seq_id;
+}
+
+/**
+ * Returns the entry of m_seq_id stored at index match_id.
+ * Uses the bounds-checked at(), so an index outside m_seq_id raises
+ * std::out_of_range.
+ */
+uint32 RepeatMatch::SeqId( uint32 match_id )
+{
+    const uint32 stored = m_seq_id.at( match_id );
+    return stored;
+}
+
+/**
+ * Writes mhe to os as its length followed by the start coordinate of every
+ * sequence, each start preceded by a tab.  No newline is written.
+ */
+std::ostream& operator<<(std::ostream& os, const RepeatMatch& mhe){
+    os << mhe.Length();
+    uint32 seqI = 0;
+    while( seqI < mhe.SeqCount() )
+    {
+        os << '\t' << mhe.Start(seqI);
+        ++seqI;
+    }
+    return os;
+}
+
+} // namespace mems
diff --git a/libMems/RepeatMatch.h b/libMems/RepeatMatch.h
new file mode 100644
index 0000000..ecbd9e2
--- /dev/null
+++ b/libMems/RepeatMatch.h
@@ -0,0 +1,51 @@
+/*******************************************************************************
+ * $Id: Match.h,v 1.10 2004/03/01 02:40:08 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifndef _RepeatMatch_h_
+#define _RepeatMatch_h_
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libGenome/gnClone.h"
+#include <iostream>
+#include <vector>
+#include <set>
+#include "libMems/MatchHashEntry.h"
+
+namespace mems {
+
+/**
+ * The Match class stores the location of an <b>equal size</b> (inexact or exactly)
+ * matching region
+ * between several sequences.  There are numerous functions in this
+ * class which can be used to compare and manipulate this match.
+ *
+ * RepeatMatch extends MatchHashEntry with m_seq_id, a vector of sequence
+ * identifiers indexed by match id and read back through SeqId().
+ */
+class RepeatMatch : public MatchHashEntry {
+
+public:
+    RepeatMatch();
+    // The next two constructors are defined inline because RepeatMatch.cpp
+    // provides no out-of-line definition for them; without a definition any
+    // use fails at link time.
+    // NOTE(review): assumes MatchHashEntry offers a constructor taking the
+    // same three arguments -- confirm against MatchHashEntry.h.
+    RepeatMatch( const uint32 seq_count, const gnSeqI mersize, const MemType m_type = seed )
+        : MatchHashEntry( seq_count, mersize, m_type ) {}
+    /** Copies the base-class state together with the stored sequence ids. */
+    RepeatMatch(const RepeatMatch& mhe)
+        : MatchHashEntry( mhe ), m_seq_id( mhe.m_seq_id ) {}
+    ~RepeatMatch();
+    /** Associates seq_id with match_id. */
+    void FromSeq( uint32 match_id, uint32 seq_id );
+    /** Returns m_seq_id.at( match_id ); throws std::out_of_range for an unknown index. */
+    uint32 SeqId( uint32 match_id );
+protected:
+    std::vector<uint32> m_seq_id;    // sequence id per match id
+
+private:
+
+
+};
+std::ostream& operator<<(std::ostream& os, const RepeatMatch& mhe); //write to source.
+
+} // namespace mems
+
+#endif // _RepeatMatch_h_
+
diff --git a/libMems/RepeatMatchList.cpp b/libMems/RepeatMatchList.cpp
new file mode 100644
index 0000000..b1384d0
--- /dev/null
+++ b/libMems/RepeatMatchList.cpp
@@ -0,0 +1,300 @@
+/*******************************************************************************
+ * $Id: MatchList.cpp,v 1.22 2004/03/01 02:40:08 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * Please see the file called COPYING for licensing, copying, and modification
+ * details.
+ * **************
+ ******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/RepeatMatchList.h"
+#include "libMems/DNAFileSML.h"
+#include "libMems/DNAMemorySML.h"
+#include "libMems/MemHash.h"
+#include <map>
+#include <sstream>
+#include <ctime>
+
+using namespace std;
+using namespace genome;
+namespace mems {
+
+typedef void* MatchID_t;
+
+
+/** Default constructor; only default-constructs the MatchList base. */
+RepeatMatchList::RepeatMatchList()
+    : MatchList()
+{}
+
+
+/**
+ * Loads every file named in seq_filename into a new gnSequence and appends
+ * it to seq_table.
+ * Loading stops at the first failure: the error is reported on stderr, the
+ * failed sequence object is deleted and the function returns, leaving the
+ * sequences loaded so far in seq_table.
+ * @param log_stream  when not NULL, receives one success message per sequence
+ */
+void RepeatMatchList::LoadSequences( ostream* log_stream ){
+
+    if( seq_filename.empty() )
+        return;
+
+    gnSeqI total_len = 0;    // running total of loaded bases; not read again here
+    for( uint fileI = 0; fileI < seq_filename.size(); fileI++ ){
+        gnSequence* loaded_seq = new gnSequence();
+        // Load the sequence and tell the user if it loaded successfully
+        try{
+            loaded_seq->LoadSource( seq_filename[ fileI ] );
+        }
+        catch( gnException& gne ){
+            delete loaded_seq;
+            if( gne.GetCode() == FileNotOpened() )
+                cerr << "Error loading " << seq_filename[ fileI ] << endl;
+            else
+                cerr << gne;
+            return;
+        }
+        catch( exception& e ){
+            delete loaded_seq;
+            cerr << "Unhandled exception loading " << seq_filename[ fileI ] << endl;
+            cerr << "At: " << __FILE__ << ":" << __LINE__ << endl;
+            cerr << e.what();
+            return;
+        }
+        catch( ... ){
+            delete loaded_seq;
+            cerr << "Unknown exception when loading " << seq_filename[ fileI ] << endl;
+            return;
+        }
+
+        total_len += loaded_seq->length();
+        seq_table.push_back( loaded_seq );
+
+        if( log_stream == NULL )
+            continue;
+        (*log_stream) << "Sequence loaded successfully.\n";
+        (*log_stream) << seq_filename[ fileI ] << " " << loaded_seq->length() << " base pairs.\n";
+    }
+
+}
+
+/**
+ * Loads one sorted mer list (SML) per entry of seq_table from the files named
+ * in sml_filename, creating any SML that cannot be loaded or whose seed
+ * differs from the default seed for mer_size.
+ * Works in four phases: try to load every SML; if any needs creating, free
+ * all of them; create the missing ones; reload the rest.
+ * Exceptions raised while creating or reloading are reported on stderr and
+ * rethrown.
+ * @param mer_size    seed weight; 0 selects GetDefaultMerSize( seq_table )
+ * @param log_stream  when not NULL, receives progress messages
+ * NOTE(review): sml_filename is indexed with seq_table indices and sml_table
+ * with sml_filename indices, so the code presumably relies on all three
+ * having one entry per sequence and on sml_table being empty on entry --
+ * confirm with callers.
+ */
+void RepeatMatchList::LoadSMLs( uint mer_size, ostream* log_stream ){
+
+ // if the mer_size parameter is 0 then calculate a default mer size for these sequences
+ if( mer_size == 0 ){
+ mer_size = GetDefaultMerSize( seq_table );
+ if( log_stream != NULL ){
+ (*log_stream) << "Using weight " << mer_size << " mers for initial seeds\n";
+ }
+ }
+
+ // load and creates SMLs as necessary
+ //punt: tjt
+ //uint64 default_seed = getSeed( mer_size );
+ uint64 default_seed = getSolidSeed( mer_size );
+ // indices of the sequences whose SML has to be (re)created
+ vector< uint > create_list;
+ uint seqI = 0;
+ for( seqI = 0; seqI < seq_table.size(); seqI++ ){
+ // define a DNAFileSML to store a sorted mer list
+ DNAFileSML* file_sml = new DNAFileSML();
+ sml_table.push_back( file_sml );
+
+ boolean success = true;
+ try{
+ file_sml->LoadFile( sml_filename[ seqI ] );
+ }catch( gnException& gne ){
+ // any gnException while loading schedules this SML for creation
+ success = false;
+ create_list.push_back( seqI );
+ }
+ boolean recreate = false;
+ // a loaded SML built with a different seed is scheduled for re-creation
+ if(success && (file_sml->Seed() != default_seed )){
+ if( log_stream != NULL )
+ (*log_stream) << "Default seed mismatch. A new sorted mer list will be created.\n";
+ recreate = true;
+ create_list.push_back( seqI );
+ }
+
+ if( success && !recreate && log_stream != NULL )
+ (*log_stream) << "Sorted mer list loaded successfully\n";
+ }
+
+ // free up memory before creating any SMLs
+ // (every SML is released, including those that loaded fine; they are
+ // reloaded in the last phase)
+ if( create_list.size() > 0 )
+ for( seqI = 0; seqI < sml_table.size(); seqI++ ){
+ sml_table[ seqI ]->Clear();
+ delete sml_table[ seqI ];
+ sml_table[ seqI ] = NULL;
+ }
+
+ // create any SMLs that need to be created
+ for( uint createI = 0; createI < create_list.size(); createI++ ){
+ if( log_stream != NULL )
+ (*log_stream) << "Creating sorted mer list\n";
+ try{
+
+ time_t start_time = time(NULL);
+ sml_table[ create_list[ createI ] ] = new DNAFileSML( sml_filename[ create_list[ createI ] ] );
+ sml_table[ create_list[ createI ] ]->Create( *seq_table[ create_list[ createI ] ], default_seed );
+ time_t end_time = time(NULL);
+ if( log_stream != NULL )
+ (*log_stream) << "Create time was: " << end_time - start_time << " seconds.\n";
+
+ }catch(...){
+ cerr << "Error creating sorted mer list\n";
+ throw;
+ }
+ }
+
+ // reload the other SMLs now that creation has completed
+ if( create_list.size() > 0 ){
+ for( seqI = 0; seqI < sml_filename.size(); seqI++ ){
+ // entries created above are already in place
+ if( sml_table[ seqI ] != NULL )
+ continue;
+ sml_table[ seqI ] = new DNAFileSML( sml_filename[ seqI ] );
+ try{
+ ((DNAFileSML*)sml_table[ seqI ])->LoadFile( sml_filename[ seqI ] );
+ }catch( gnException& gne ){
+ cerr << "Error loading sorted mer list\n";
+ throw;
+ }
+ }
+ }
+}
+/**
+ * Reads a match list in the "FormatVersion 3" text layout from match_file.
+ *
+ * The header consists of the FormatVersion and SequenceCount entries, one
+ * file-name line and one length entry per sequence, and a MatchCount entry.
+ * Every following line describes one match: its length, one start per
+ * sequence, a match id, a subset count and a superset count.  Sequence file
+ * names are appended to seq_filename and every match is appended to this
+ * list.
+ *
+ * Every scalar read from the stream is initialised first, so a failed
+ * extraction yields a defined value instead of an indeterminate one.
+ *
+ * NOTE(review): WriteList() emits a Multiplicity column between the match and
+ * its address, and no such column is consumed here -- confirm that the two
+ * layouts really agree.
+ *
+ * @param match_file  stream positioned at the start of the list
+ * @throws gnException (InvalidFileFormat) on a bad header, a sequence count
+ *         below 2, an empty file-name line, or a MatchCount that differs from
+ *         the number of matches held afterwards
+ * @throws const char* when a match line declares subset or superset entries
+ */
+void RepeatMatchList::ReadList(istream& match_file){
+    string tag;
+    unsigned int seq_count = 0;
+
+    match_file >> tag;    //format version tag
+    if( tag != "FormatVersion" ){
+        Throw_gnEx(InvalidFileFormat());
+    }
+    match_file >> tag;    //format version
+    if( tag != "3" ){
+        Throw_gnEx(InvalidFileFormat());
+    }
+    match_file >> tag;    //sequence count tag
+    if( tag != "SequenceCount" ){
+        Throw_gnEx(InvalidFileFormat());
+    }
+    match_file >> seq_count;    //sequence count
+    if(seq_count < 2){
+        Throw_gnEx(InvalidFileFormat());
+    }
+
+    // read the sequence file names and lengths
+    for( unsigned int seqI = 0; seqI < seq_count; seqI++ ){
+        match_file >> tag;    // name tag
+        getline( match_file, tag );
+        // the rest of the line starts with the separating tab; a line with
+        // nothing on it is malformed (substr(1) would throw std::out_of_range)
+        if( tag.empty() ){
+            Throw_gnEx(InvalidFileFormat());
+        }
+        // skip the tab character
+        tag = tag.substr( 1 );
+        seq_filename.push_back(tag);
+        match_file >> tag;    // length tag
+        gnSeqI seq_len = 0;
+        match_file >> seq_len;    // length
+        if( seqI < seq_table.size() )
+            if( seq_table[ seqI ]->length() != seq_len ){
+                cerr << "Warning: Genome sizes in the match list differ.\n";
+                cerr << "seq_table[ " << seqI << " ]->length() " << seq_table[ seqI ]->length() << " seq_len: " << seq_len << endl;
+            }
+    }
+
+    // read the number of matches
+    unsigned int match_count = 0;
+    match_file >> tag;    // match count tag
+    match_file >> match_count;    // match count
+
+    // read the matches
+    string cur_line;
+    getline( match_file, cur_line );    // consume the rest of the MatchCount line
+    while( getline( match_file, cur_line ) ){
+        MatchHashEntry mhe( seq_count, 0 );
+        stringstream line_stream( cur_line );
+
+        gnSeqI len = 0;
+        line_stream >> len;
+        mhe.SetLength(len);
+
+        for(uint32 seqI = 0; seqI < seq_count; seqI++){
+            int64 start = 0;
+            line_stream >> start;
+            mhe.SetStart(seqI, start);
+        }
+
+        mhe.CalculateOffset();
+
+        // the id is not used, but it has to be consumed to reach the counts
+        MatchID_t match_id = 0;
+        line_stream >> match_id;
+
+        uint sub_count = 0;
+        line_stream >> sub_count;
+        if(sub_count > 0)
+            throw "Unable to read file, invalid format, cannot read subset information\n";
+
+        uint sup_count = 0;
+        line_stream >> sup_count;
+        if(sup_count > 0)
+            throw "Unable to read file, invalid format, cannot read superset information\n";
+
+        Match* new_match = mhe.Copy();
+        push_back( new_match );
+    }
+    if( match_count != size() ){
+        Throw_gnEx(InvalidFileFormat());
+    }
+}
+
+/**
+ * Writes this list to match_file in the "FormatVersion 3" text layout.
+ * Nothing is written for an empty list.  The header gives the sequence count
+ * (taken from the first match), then a file name ("null" when unknown) and a
+ * length ("0" when unknown) per sequence, then the match count.  Each match
+ * follows on a line of its own: the match, its multiplicity, its address,
+ * and the constant subset and superset counts 0 and 0.
+ * NOTE(review): ReadList() does not appear to consume the multiplicity
+ * column written here -- confirm the two layouts agree.
+ * @param match_file  stream to write to
+ */
+void RepeatMatchList::WriteList(ostream& match_file) const{
+    if( size() == 0 )
+        return;
+    const unsigned int seq_count = (*begin())->SeqCount();
+
+    match_file << "FormatVersion" << '\t' << 3 << "\n";
+    match_file << "SequenceCount" << '\t' << seq_count << "\n";
+    for(unsigned int seqI = 0; seqI < seq_count; seqI++){
+        const bool have_name = seq_filename.size() > seqI;
+        const bool have_seq = seq_table.size() > seqI;
+
+        match_file << "Sequence" << seqI << "File" << '\t';
+        if( have_name )
+            match_file << seq_filename[seqI];
+        else
+            match_file << "null";
+        match_file << "\n";
+
+        match_file << "Sequence" << seqI << "Length" << '\t';
+        if( have_seq )
+            match_file << seq_table[seqI]->length();
+        else
+            match_file << "0";
+        match_file << "\n";
+    }
+
+    match_file << "MatchCount" << '\t' << size() << endl;
+
+    //get all the mems out of the hash table and write them out
+    for( vector<Match*>::const_iterator cur = begin(); cur != end(); cur++ ){
+        Match* m = *cur;
+        // match, multiplicity and address, each followed by a tab
+        match_file << *m << '\t';
+        match_file << m->Multiplicity() << '\t';
+        match_file << (MatchID_t)m << '\t';
+        // subset and superset id counts are always zero
+        match_file << 0;
+        match_file << '\t' << 0;
+        match_file << endl;
+    }
+}
+
+} // namespace mems
diff --git a/libMems/RepeatMatchList.h b/libMems/RepeatMatchList.h
new file mode 100644
index 0000000..fbb2cf8
--- /dev/null
+++ b/libMems/RepeatMatchList.h
@@ -0,0 +1,66 @@
+/*******************************************************************************
+ * $Id: MatchList.h,v 1.10 2004/03/01 02:40:08 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifndef _RepeatMatchList_h_
+#define _RepeatMatchList_h_
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <iostream>
+#include <list>
+#include "libMems/SortedMerList.h"
+#include "libGenome/gnSequence.h"
+#include "libMems/Match.h"
+#include "libMems/MatchList.h"
+//#include <valarray>
+
+namespace mems {
+
+// punt: Need to subclass AbstractMatchList, which can be a MatchList or RepeatMatchList
+
+// A MatchList specialisation providing its own LoadSequences, LoadSMLs,
+// ReadList and WriteList.
+class RepeatMatchList : public MatchList {
+public:
+ RepeatMatchList();
+ // NOTE(review): no definition of this copy constructor appears in
+ // RepeatMatchList.cpp -- confirm it is defined elsewhere or is meant to be
+ // unusable.
+ RepeatMatchList( const RepeatMatchList& ml );
+
+ /**
+  * Loads every file named in seq_filename into seq_table, stopping at the
+  * first file that fails to load.
+  * @param log_stream  receives progress messages unless NULL
+  */
+ void LoadSequences( std::ostream* log_stream );
+ /**
+  * Loads, and creates where necessary, one sorted mer list per sequence.
+  * @param mer_size    seed weight; 0 selects a default for the loaded sequences
+  * @param log_stream  receives progress messages unless NULL
+  */
+ void LoadSMLs( uint mer_size, std::ostream* log_stream );
+
+ /**
+ * Reads a MatchList from an input stream
+ * Sequence and SML file names are read into the seq_filename
+ * and sml_filename vectors, but the actual files are not
+ * opened. The calling function should load them after
+ * using this method.
+ * @param match_stream The input stream to read from
+ */
+ void ReadList( std::istream& match_stream );
+
+ /**
+ * Writes a MatchList to the designated output stream
+ * @param match_stream The output stream to write to
+ */
+ void WriteList( std::ostream& match_stream ) const;
+
+ // The members below are commented out here; the member functions use
+ // seq_filename, sml_filename, sml_table and seq_table, presumably inherited
+ // from MatchList -- confirm.
+ //vector<string> sml_filename; /**< The file names of the sorted mer list for each sequence, may be empty or null */
+ //vector<string> seq_filename; /**< The file names of the sequence data, may be empty or null */
+ //vector<SortedMerList*> sml_table; /**< The sorted mer list associated with each sequence, may be empty or null */
+ //vector<genome::gnSequence*> seq_table; /**< The actual sequences associated with the matches stored in this list. Should not be empty or null. */
+
+
+protected:
+
+};
+
+} // namespace mems
+
+#endif
+
+
diff --git a/libMems/Scoring.h b/libMems/Scoring.h
new file mode 100644
index 0000000..712f6ce
--- /dev/null
+++ b/libMems/Scoring.h
@@ -0,0 +1,335 @@
+/*******************************************************************************
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifndef __Scoring_h__
+#define __Scoring_h__
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/SubstitutionMatrix.h"
+#include <iostream>
+#include <limits>
+#include <string>
+#include <vector>
+
+namespace mems {
+
+static const score_t INVALID_SCORE = (std::numeric_limits<score_t>::max)();
+
+//tjtaed: function to compute the SP column score, and cumulative SP score from an alignment
+void computeSPScore( const std::vector<std::string>& alignment, const PairwiseScoringScheme& pss, std::vector<score_t>& scores, score_t& score );
+//tjt: function to compute the consensus column score, consensus sequence, and cumulative consensus score from an alignment
+void computeConsensusScore( const std::vector<std::string>& alignment, const PairwiseScoringScheme& pss, std::vector<score_t>& scores, std::string& consensus, score_t& score );
+void computeMatchScores( const std::string& seq1, const std::string& seq2, const PairwiseScoringScheme& scoring, std::vector<score_t>& scores );
+void computeGapScores( const std::string& seq1, const std::string& seq2, const PairwiseScoringScheme& scoring, std::vector<score_t>& scores );
+
+
+//tjt: function to compute the consensus column score, consensus sequence, and cumulative consensus score from an alignment
+/**
+ * Scores every alignment column against each of the four candidate bases
+ * A, G, C and T and keeps the best one.
+ * For a candidate base, a column's score is the sum over all alignment rows
+ * of the row's match score plus its gap score against a row made entirely of
+ * that base.  Ties keep the earliest candidate in the order A, G, C, T.
+ *
+ * Fixes over the previous version:
+ *  - the per-row buffer is reset before the gap pass, so match scores are no
+ *    longer counted a second time (computeGapScores does not clear its output);
+ *  - the cumulative score is summed from zero rather than from INVALID_SCORE,
+ *    which is the largest score_t value and overflowed on the first addition.
+ *
+ * @param alignment  rows of the alignment; must hold at least one row
+ *                   (alignment.at(0) throws std::out_of_range otherwise)
+ * @param pss        scoring scheme handed to the match and gap scorers
+ * @param scores     receives the best score of each column
+ * @param consensus  receives the best base of each column
+ * @param score      receives the sum of the column scores, or INVALID_SCORE
+ *                   when no column has a valid score
+ */
+inline
+void computeConsensusScore( const std::vector<std::string>& alignment, const PairwiseScoringScheme& pss,
+                std::vector<score_t>& scores, std::string& consensus, score_t& score )
+{
+    // candidate bases; the index is the code stored in columnbp below
+    static const char bases[] = { 'A', 'G', 'C', 'T' };
+    const size_t base_count = sizeof( bases ) / sizeof( bases[0] );
+
+    consensus.clear();
+    const size_t col_count = alignment.at(0).size();
+    scores.assign( col_count, INVALID_SCORE );
+    score = INVALID_SCORE;
+
+    // allscores[i][k]: score of column k against candidate base i
+    std::vector< std::vector< score_t > > allscores;
+    for( size_t i = 0; i < base_count; i++ )
+    {
+        const std::string candidate( col_count, bases[i] );
+        std::vector< score_t > consensus_scores( col_count, 0 );
+
+        for( size_t j = 0; j < alignment.size(); j++ )
+        {
+            const std::string& row = alignment.at(j);
+            std::vector< score_t > tscores( col_count, 0 );
+
+            computeMatchScores( row, candidate, pss, tscores );
+            for( size_t k = 0; k < row.size(); k++ )
+                if( tscores.at(k) != INVALID_SCORE )
+                    consensus_scores.at(k) += tscores.at(k);
+
+            // start the gap pass from a clean buffer so the match scores
+            // written above are not added again
+            std::fill( tscores.begin(), tscores.end(), INVALID_SCORE );
+            computeGapScores( row, candidate, pss, tscores );
+            for( size_t k = 0; k < row.size(); k++ )
+                if( tscores.at(k) != INVALID_SCORE )
+                    consensus_scores.at(k) += tscores.at(k);
+        }
+        allscores.push_back( consensus_scores );
+    }
+
+    // pick the best candidate for each column
+    std::vector< int > columnbp( col_count, (std::numeric_limits<int>::min)() );
+    for( size_t i = 0; i < base_count; i++ )
+    {
+        for( size_t k = 0; k < col_count; k++ )
+        {
+            const score_t candidate_score = allscores.at(i).at(k);
+            if( candidate_score == INVALID_SCORE )
+                continue;
+            if( i == 0 || candidate_score > scores.at(k) )
+            {
+                scores.at(k) = candidate_score;
+                columnbp.at(k) = static_cast<int>( i );
+            }
+        }
+    }
+
+    // accumulate the column maxima and spell out the consensus
+    score_t total = 0;
+    bool any_valid = false;
+    for( size_t k = 0; k < col_count; k++ )
+    {
+        if( scores.at(k) != INVALID_SCORE )
+        {
+            total += scores.at(k);
+            any_valid = true;
+        }
+        if( columnbp.at(k) >= 0 )
+            consensus.append( 1, bases[ columnbp.at(k) ] );
+    }
+    if( any_valid )
+        score = total;
+}
+
+/**
+ * Computes the substitution score of every column of a pairwise alignment.
+ * @param seq1     first aligned row; its length fixes the size of scores
+ * @param seq2     second aligned row
+ * @param scoring  scheme whose matrix supplies the substitution scores
+ * @param scores   resized to seq1.size(); each entry receives the matrix
+ *                 score of the column, or INVALID_SCORE when either row has
+ *                 a gap ('-') there or when seq2 has no such column
+ */
+inline
+void computeMatchScores( const std::string& seq1, const std::string& seq2,
+                const PairwiseScoringScheme& scoring, std::vector<score_t>& scores )
+{
+    scores.resize( seq1.size() );
+    std::fill(scores.begin(), scores.end(), INVALID_SCORE);
+    const uint8* table = SortedMerList::BasicDNATable();
+
+    // never read past the end of the shorter row
+    const size_t col_count = seq1.size() < seq2.size() ? seq1.size() : seq2.size();
+    for( size_t col = 0; col < col_count; ++col )
+    {
+        const char c1 = seq1[col];
+        const char c2 = seq2[col];
+        if( c1 == '-' || c2 == '-' )
+            continue;
+        // plain char may be signed; index through unsigned char so bytes
+        // above 127 cannot become negative offsets.
+        // Assumes the table has an entry for every unsigned char value -- TODO confirm.
+        const unsigned uLetter1 = table[ static_cast<unsigned char>( c1 ) ];
+        const unsigned uLetter2 = table[ static_cast<unsigned char>( c2 ) ];
+
+        scores[col] = scoring.matrix[uLetter1][uLetter2];
+    }
+}
+
+/**
+ * Computes per-column gap penalties for a pairwise alignment.
+ * A run of gap columns costs gap_open for its first column (term_gap_score
+ * when the run starts at the first scored column) plus gap_extend for every
+ * further column.  The total of a run is spread evenly over the columns of
+ * the run that are not gaps in both rows; the division remainder is added to
+ * the leftmost column of the run.
+ *
+ * scores is resized to seq1.size() but NOT cleared: entries of columns that
+ * belong to no gap run keep whatever value they held on entry.
+ * seq2 is indexed with the same column indices as seq1, so it must be at
+ * least as long as seq1.
+ *
+ * @param seq1     first aligned row
+ * @param seq2     second aligned row
+ * @param scoring  supplies gap_open and gap_extend
+ * @param scores   receives the penalties as described above
+ */
+inline
+void computeGapScores( const std::string& seq1, const std::string& seq2, const PairwiseScoringScheme& scoring,
+                std::vector<score_t>& scores )
+{
+    scores.resize(seq1.size());
+    // Nothing to score.  Returning here also keeps uColCount - 1 below from
+    // wrapping around, which used to send the main loop out of bounds.
+    if( seq1.empty() )
+        return;
+
+    bool bGapping1 = false;
+    bool bGapping2 = false;
+    score_t gap_open_score = scoring.gap_open;
+    score_t gap_extend_score = scoring.gap_extend;
+    score_t term_gap_score = gap_open_score;
+
+    unsigned uColCount = seq1.size();
+
+    // first column that is not a gap in both rows
+    unsigned uColStart = 0;
+    for (unsigned uColIndex = 0; uColIndex < seq1.size(); ++uColIndex)
+    {
+        if (seq1[uColIndex] != '-' || seq2[uColIndex] != '-')
+        {
+            uColStart = uColIndex;
+            break;
+        }
+    }
+
+    // last column that is not a gap in both rows
+    unsigned uColEnd = uColCount - 1;
+    for (int iColIndex = (int) uColCount - 1; iColIndex >= 0; --iColIndex)
+    {
+        if (seq1[iColIndex] != '-' || seq2[iColIndex] != '-')
+        {
+            uColEnd = (unsigned) iColIndex;
+            break;
+        }
+    }
+
+    unsigned gap_left_col = 0;    // leftmost column of the gap run in progress
+    score_t cur_gap_score = 0;    // penalty accumulated for that run
+    for (unsigned uColIndex = uColStart; uColIndex <= uColEnd; ++uColIndex)
+    {
+        bool bGap1 = seq1[uColIndex] == '-';
+        bool bGap2 = seq2[uColIndex] == '-';
+
+        if (bGap1 && bGap2)
+            continue;
+
+        if (bGap1)
+        {
+            if (!bGapping1)
+            {
+                gap_left_col = uColIndex;
+                if (uColIndex == uColStart)
+                {
+                    cur_gap_score += term_gap_score;
+                }else{
+                    cur_gap_score += gap_open_score;
+                }
+                bGapping1 = true;
+            }
+            else
+            {
+                cur_gap_score += gap_extend_score;
+            }
+            continue;
+        }
+
+        else if (bGap2)
+        {
+            if (!bGapping2)
+            {
+                gap_left_col = uColIndex;
+                if (uColIndex == uColStart)
+                {
+                    cur_gap_score += term_gap_score;
+                }else{
+                    cur_gap_score += gap_open_score;
+                }
+                bGapping2 = true;
+            }
+            else
+            {
+                cur_gap_score += gap_extend_score;
+            }
+            continue;
+        }
+
+        // neither row has a gap here: close the run that just ended, if any
+        if( (bGapping1 || bGapping2) )
+        {
+            score_t valid_cols = 0;
+            for( unsigned uGapIndex = gap_left_col; uGapIndex < uColIndex; ++uGapIndex )
+                if( seq1[uGapIndex] != '-' || seq2[uGapIndex] != '-' )
+                    valid_cols++;
+            // spread the total gap penalty evenly across all columns
+            score_t per_site_penalty = cur_gap_score / valid_cols;
+            score_t extra = cur_gap_score - (per_site_penalty * valid_cols);
+            for( unsigned uGapIndex = gap_left_col; uGapIndex < uColIndex; ++uGapIndex )
+            {
+                if( seq1[uGapIndex] == '-' && seq2[uGapIndex] == '-' )
+                    continue;
+                // diagnostic only: the column was expected to be unscored
+                if( scores[uGapIndex] != INVALID_SCORE )
+                {
+                    genome::breakHere();
+                    std::cerr << "asdgohasdoghasodgh\n";
+                }
+                scores[uGapIndex] = per_site_penalty;
+            }
+            if( scores[gap_left_col] == INVALID_SCORE )
+            {
+                std::cerr << "crap!\n";
+                genome::breakHere();
+            }
+            scores[gap_left_col] += extra;
+            gap_left_col = (std::numeric_limits<unsigned>::max)();
+            cur_gap_score = 0;
+        }
+        bGapping1 = false;
+        bGapping2 = false;
+    }
+
+    // a run that reaches the last scored column is a terminal gap
+    if (bGapping1 || bGapping2)
+    {
+        // swap the opening penalty for the terminal one (currently equal)
+        cur_gap_score -= gap_open_score;
+        cur_gap_score += term_gap_score;
+
+        score_t valid_cols = 0;
+        for( unsigned uGapIndex = gap_left_col; uGapIndex < uColCount; ++uGapIndex )
+            if( seq1[uGapIndex] != '-' || seq2[uGapIndex] != '-' )
+                valid_cols++;
+        // spread the total gap penalty evenly across all columns
+        score_t per_site_penalty = cur_gap_score / valid_cols;
+        score_t extra = cur_gap_score - (per_site_penalty * valid_cols);
+        for( unsigned uGapIndex = gap_left_col; uGapIndex < uColCount; ++uGapIndex )
+        {
+            if( seq1[uGapIndex] == '-' && seq2[uGapIndex] == '-' )
+                continue;
+            scores[uGapIndex] = per_site_penalty;
+        }
+        if( valid_cols > 0 )
+        {
+            if( scores[gap_left_col] == INVALID_SCORE )
+            {
+                std::cerr << "crap!\n";
+                genome::breakHere();
+            }
+            scores[gap_left_col] += extra;
+        }
+    }
+}
+
+/**
+ * Computes the sum-of-pairs score of an alignment.
+ * Every unordered pair of rows is scored column by column as match score
+ * plus gap score, and the pair scores are added up per column.
+ * @param alignment  rows of the alignment; an empty alignment yields empty
+ *                   scores and a score of 0 instead of indexing row 0
+ * @param pss        scoring scheme handed to the match and gap scorers
+ * @param scores     receives one summed score per column of the first row
+ * @param score      receives the sum of all column scores
+ */
+inline
+void computeSPScore( const std::vector<std::string>& alignment, const PairwiseScoringScheme& pss,
+                std::vector<score_t>& scores, score_t& score )
+{
+    scores.clear();
+    score = 0;
+    if( alignment.empty() )
+        return;
+
+    const size_t col_count = alignment[0].size();
+    std::vector< score_t > cur_m_scores( col_count, INVALID_SCORE );
+    std::vector< score_t > cur_g_scores( col_count, INVALID_SCORE );
+    scores.resize( col_count, 0 );
+    double w = 1;    // weight, to be determined later...
+    for( size_t i = 0; i < alignment.size(); ++i )
+    {
+        for( size_t j = i+1; j < alignment.size(); ++j )
+        {
+            // both scorers leave untouched entries alone, so reset them for each pair
+            std::fill( cur_m_scores.begin(), cur_m_scores.end(), INVALID_SCORE );
+            std::fill( cur_g_scores.begin(), cur_g_scores.end(), INVALID_SCORE );
+            computeMatchScores( alignment.at(i), alignment.at(j), pss, cur_m_scores );
+            computeGapScores( alignment.at(i), alignment.at(j), pss, cur_g_scores );
+            for( size_t k = 0; k < cur_m_scores.size(); ++k )
+            {
+                score_t s = 0;
+                if( cur_m_scores[k] != INVALID_SCORE )
+                    s += cur_m_scores[k];
+                if( cur_g_scores[k] != INVALID_SCORE )
+                    s += cur_g_scores[k];
+                scores[k] += (score_t)(w * (double)s);
+            }
+        }
+    }
+    for( size_t k = 0; k < scores.size(); ++k )
+        score += scores[k];
+}
+
+
+} // namespace mems
+
+
+#endif // __Scoring_h__
+
diff --git a/libMems/SeedMasks.h b/libMems/SeedMasks.h
new file mode 100644
index 0000000..64d65e3
--- /dev/null
+++ b/libMems/SeedMasks.h
@@ -0,0 +1,404 @@
+/*******************************************************************************
+ * $Id: SortedMerList.h,v 1.13 2004/02/27 23:08:55 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifndef _SeedMasks_h_
+#define _SeedMasks_h_
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#ifdef __cplusplus
+#include <cmath>
+#else
+#include <math.h>
+#endif
+#include "libGenome/gnDefs.h"
+
+/* Seed patterns taken from: AE Darling, T Treangen, L Zhang, C Kuiken, X Messeguer, NT Perna
+ * "Procrastination leads to efficient match filtration for local multiple alignment"
+ * Lecture Notes in Bioinformatics 4175:126-137 Springer-Verlag 2006
+ */
+
+/**
+ * returns the array of default seed mask patterns
+ * Each seed is represented by a pair of 32 bit integers
+ */
+#ifdef __cplusplus
+static
+#endif
+uint32** seedMasks();
+
+/**
+ * the first three seed masks in each of the following are
+ * 'good' seeds according to Louxin Zhang
+ */
+#ifdef __cplusplus
+inline static
+#endif
+uint32** seedMasks(){
+	/*
+	 * Layout: seed_masks[w] is the row for seed weight w (0..31).  Each row
+	 * holds six (high word, low word) pairs, ordered by seed rank; a pair
+	 * whose low word is 0 means "no seed of this rank".
+	 * Every pattern in row w must contain exactly w set bits.  Four literals
+	 * violated that and disagreed with their own binary comment; they are
+	 * corrected below and marked "fixed".
+	 */
+	static uint32 seed_masks_3[] =
+	{
+		0,0xb,	//0b1011
+		0,0, 0,0, 0,0, 0,0, 0,0, };
+	static uint32 seed_masks_4[] =
+	{
+		0,0x2b,	//0b101011, (fixed: was 0x3b, a weight 5 pattern)
+		0,0, 0,0, 0,0, 0,0, 0,0, };
+	static uint32 seed_masks_5[] =
+	{
+		0,0x6b,	//0b1101011,
+		0,0x139,	//0b100111001,
+		0,0x193,	//0b110010011,
+		0,0x6b,	//0b1101011,
+		0,0, 0,0, };
+	static uint32 seed_masks_6[] =
+	{
+		0,0x58D,	//0b10110001101,
+		0,0x653,	//0b11001010011,
+		0,0x1AB,	//0b110101011,
+		0,0xdb,	//0b11011011,
+		0,0, 0,0, };
+	static uint32 seed_masks_7[] =
+	{
+		0,0x1953,	//0b1100101010011
+		0,0x588d,	//0b101100010001101
+		0,0x688b,	//0b110100010001011
+		0,0x17d,	//0b101111101,
+		0,0x164d,	//0b1011001001101,
+		0,0, };
+	static uint32 seed_masks_8[] =
+	{
+		0,0x3927,	//0b11100100100111,
+		0,0x1CA7,	//0b1110010100111,
+		0,0x6553,	//0b110010101010011,
+		0,0xb6d,	//0b101101101101,
+		0,0, 0,0, };
+	static uint32 seed_masks_9[] =
+	{
+		0,0x7497,	//0b111010010010111,
+		0,0x1c927,	//0b11100100100100111,
+		0,0x72a7,	//0b111001010100111,
+		0,0x6fb,	//0b11011111011,
+		0,0x16ed,	//0b1011011101101,
+		0,0, };
+	static uint32 seed_masks_10[] =
+	{
+		0,0x1d297,	// 0,0b11101001010010111,
+		0,0x3A497,	// 0,0b111010010010010111,
+		0,0xE997,	// 0,0b1110100110010111,
+		0,0x6D5B,	// 0,0b110110101011011,
+		0,0, 0,0, };
+	static uint32 seed_masks_11[] =
+	{
+		0,0x1e54f,	//0b11110010101001111, (fixed: was 0x7954f, the weight 12 seed)
+		0,0x75257,	//0b1110101001001010111,
+		0,0x1c9527,	//0b111001001010100100111,
+		0,0x5bed,	//0b101101111101101, // third b.p. coding pattern
+		0,0x5b26d,	//0b1011011001001101101,
+		0,0, };
+	static uint32 seed_masks_12[] =
+	{
+		0,0x7954f,	// 0,0b1111001010101001111,
+		0,0x3D32F,	// 0,0b111101001100101111,
+		0,0x768B7,	// 0,0b1110110100010110111,
+		0,0x5B56D,	// 0,0b1011011010101101101,
+		0,0, 0,0, };
+	static uint32 seed_masks_13[] =
+	{
+		0,0x792a4f,	//0b11110010010101001001111,
+		0,0x1d64d7,	//0b111010110010011010111,
+		0,0x1d3597,	//0b111010011010110010111,
+		0,0x1b7db,	//0b11011011111011011, // third b.p. coding pattern
+		0,0x75ad7,	//0b1110101101011010111,
+		0,0, };
+	static uint32 seed_masks_14[] =
+	{
+		0,0x1e6acf,	// 0,0b111100110101011001111,
+		0,0xF59AF,	// 0,0b11110101100110101111,
+		0,0x3D4CAF,	// 0,0b1111010100110010101111,
+		0,0x35AD6B,	// 0,0b1101011010110101101011,
+		0,0, 0,0, };
+	static uint32 seed_masks_15[] =
+	{
+		0,0x7ac9af,	//0b11110101100100110101111,
+		0,0x7b2a6f,	//0b11110110010101001101111,
+		0,0x79aacf,	//0b11110011010101011001111,
+		0,0x16df6d,	//0b101101101111101101101, // third b.p. coding pattern
+		0,0x6b5d6b,	//0b11010110101110101101011,
+		0,0, };
+	static uint32 seed_masks_16[] =
+	{
+		0,0xf599af,	// 0,0b111101011001100110101111,
+		0,0xEE5A77,	// 0,0b111011100101101001110111,
+		0,0x7CD59F,	// 0,0b11111001101010110011111,
+		0,0xEB5AD7,	// 0,0b111010110101101011010111,
+		0,0, 0,0, };
+	static uint32 seed_masks_17[] =
+	{
+		0,0x6dbedb,	//0b11011011011111011011011, // third b.p. coding pattern
+		0,0, 0,0, 0,0, 0,0, 0,0, };
+	static uint32 seed_masks_18[] =
+	{
+		0,0x3E6B59F,	// 0,0b11111001101011010110011111,
+		0,0x3EB335F,	// 0,0b11111010110011001101011111,
+		0,0x7B3566F,	// 0,0b111101100110101011001101111,
+		0,0, 0,0, 0,0, };
+
+	static uint32 seed_masks_19[] =
+	{
+		0,0x7b974ef,	//0b111101110010111010011101111
+		0,0x7d6735f,	//0b111110101100111001101011111
+		0,0x1edd76f,	//0b1111011011101011101101111 (fixed: was 0x1edd74f, weight 18)
+		0,0, 0,0, 0,0, };
+	static uint32 seed_masks_20[] =
+	{
+		0,0x1F59B35F,	//0b11111010110011011001101011111,
+		0,0x3EDCEDF,	//0b11111011011100111011011111,
+		0,0xFAE675F,	//0b1111101011100110011101011111,
+		0,0, 0,0, 0,0, };
+	static uint32 seed_masks_21[] =
+	{
+		0,0x7ddaddf,	//0b111110111011010110111011111,
+		0,0x1f9aeb3f,	//0b11111100110101110101100111111, (fixed: was 0xaeb3f, truncated to weight 14)
+		0,0x7eb76bf,	//0b111111010110111011010111111,
+		0,0, 0,0, 0,0, };
+	// default to solid seeds for weight 22+
+	static uint32 seed_masks_22[] =
+	{
+		0,0x003fffff,
+		0,0, 0,0, 0,0, 0,0, 0,0, };
+	static uint32 seed_masks_23[] =
+	{
+		0,0x007fffff,
+		0,0, 0,0, 0,0, 0,0, 0,0, };
+	static uint32 seed_masks_24[] =
+	{
+		0,0x00ffffff,
+		0,0, 0,0, 0,0, 0,0, 0,0, };
+	static uint32 seed_masks_25[] =
+	{
+		0,0x01ffffff,
+		0,0, 0,0, 0,0, 0,0, 0,0, };
+	static uint32 seed_masks_26[] =
+	{
+		0,0x03ffffff,
+		0,0, 0,0, 0,0, 0,0, 0,0, };
+	static uint32 seed_masks_27[] =
+	{
+		0,0x07ffffff,
+		0,0, 0,0, 0,0, 0,0, 0,0, };
+	static uint32 seed_masks_28[] =
+	{
+		0,0x0fffffff,
+		0,0, 0,0, 0,0, 0,0, 0,0, };
+	static uint32 seed_masks_29[] =
+	{
+		0,0x1fffffff,
+		0,0, 0,0, 0,0, 0,0, 0,0, };
+	static uint32 seed_masks_30[] =
+	{
+		0,0x3fffffff,
+		0,0, 0,0, 0,0, 0,0, 0,0, };
+	static uint32 seed_masks_31[] =
+	{
+		0,0x7fffffff,
+		0,0, 0,0, 0,0, 0,0, 0,0, };
+
+	// weights 0..2 have no spaced seeds at all
+	static uint32 no_seeds[] =
+	{
+		0,0,
+		0,0,
+		0,0,
+		0,0,
+		0,0,
+		0,0,
+	};
+
+	// indexed directly by seed weight
+	static uint32* seed_masks[] =
+	{
+		no_seeds,
+		no_seeds,
+		no_seeds,
+		seed_masks_3,
+		seed_masks_4,
+		seed_masks_5,
+		seed_masks_6,
+		seed_masks_7,
+		seed_masks_8,
+		seed_masks_9,
+		seed_masks_10,
+		seed_masks_11,
+		seed_masks_12,
+		seed_masks_13,
+		seed_masks_14,
+		seed_masks_15,
+		seed_masks_16,
+		seed_masks_17,
+		seed_masks_18,
+		seed_masks_19,
+		seed_masks_20,
+		seed_masks_21,
+		seed_masks_22,
+		seed_masks_23,
+		seed_masks_24,
+		seed_masks_25,
+		seed_masks_26,
+		seed_masks_27,
+		seed_masks_28,
+		seed_masks_29,
+		seed_masks_30,
+		seed_masks_31,
+	};
+
+	return seed_masks;
+}
+
+static const int CODING_SEED = 3;
+static const int SOLID_SEED = INT_MAX;
+
+/**
+ * Returns a solid seed of a given weight.
+ */
+#ifdef __cplusplus
+static
+#endif
+int64 getSolidSeed( int weight );
+
+#ifdef __cplusplus
+inline static
+#endif
+int64 getSolidSeed( int weight ){
+	/*
+	 * A solid seed of weight w is w contiguous set bits starting at bit 0.
+	 * Non-positive weights yield the empty seed and weights of 64 or more
+	 * yield all 64 bits set; both would otherwise shift out of range.
+	 * The shift is done on an unsigned value so that weight 63 does not
+	 * overflow a signed integer.
+	 */
+	if( weight <= 0 )
+		return 0;
+	if( weight >= 64 )
+		return ~(int64)0;
+	return (int64)((((unsigned long long)1) << weight) - 1);
+}
+
+
+
+/**
+ * returns a seed of a given weight. Setting seed_rank > 0 will select a seed
+ * of a lower sensitivity rank according to Choi et. al. 2004
+ */
+#ifdef __cplusplus
+static int64 getSeed( int weight, int seed_rank = 0 );
+#else
+int64 getSeed( int weight, int seed_rank );
+#endif
+
+#ifdef __cplusplus
+inline static
+#endif
+int64 getSeed( int weight, int seed_rank ){
+	/*
+	 * Looks up the spaced seed of the requested weight and rank in the
+	 * seedMasks() table.  Falls back to a solid seed when seed_rank is
+	 * SOLID_SEED, when the rank is outside the six table slots, or when the
+	 * table has no pattern for that slot.  Weights above 31 yield a solid
+	 * seed of weight 32.  A negative weight yields the empty seed (0)
+	 * instead of indexing in front of the table.
+	 */
+	uint32** masks;
+	uint32 high;	/* unsigned so that widening to 64 bits never sign-extends */
+	uint32 low;
+	int64 seed = 0;
+	if( weight < 0 )
+		return 0;
+	if( seed_rank == SOLID_SEED )
+		return getSolidSeed( weight );
+
+	masks = seedMasks();
+	if(weight > 31)
+		return getSolidSeed(32);
+	/* each table row only holds ranks 0..5 */
+	if( seed_rank < 0 || seed_rank > 5 )
+		return getSolidSeed(weight);
+	if( masks[weight][seed_rank*2+1] == 0 )
+		return getSolidSeed(weight);
+	high = masks[ weight ][ seed_rank*2 ];
+	low = masks[ weight ][ seed_rank*2 + 1 ];
+
+	seed |= high;
+	seed <<= 32;
+	seed |= low;
+	return seed;
+}
+
+
+/**
+ * calculates the length of a seed pattern
+ */
+#ifdef __cplusplus
+static
+#endif
+int getSeedLength( int64 seed );
+
+#ifdef __cplusplus
+inline static
+#endif
+int getSeedLength( int64 seed ){
+	/*
+	 * The length of a seed is the span from its lowest set bit to its
+	 * highest set bit, inclusive; a seed with no set bits has length 0.
+	 */
+	int lowest = -1;
+	int highest = -1;
+	int pos;
+	for( pos = 0; pos < 64; ++pos ){
+		if( ((seed >> pos) & 1) != 1 )
+			continue;
+		if( lowest == -1 )
+			lowest = pos;
+		highest = pos;
+	}
+	if( highest == -1 )
+		return 0;
+	return highest - lowest + 1;
+}
+
+/**
+ * calculates the weight of a seed pattern
+ */
+#ifdef __cplusplus
+static
+#endif
+int getSeedWeight( int64 seed );
+
+#ifdef __cplusplus
+inline static
+#endif
+int getSeedWeight( int64 seed ){
+	/* The weight of a seed is the number of set bits among its 64 bits. */
+	int ones = 0;
+	int pos;
+	for( pos = 0; pos < 64; ++pos ){
+		if( ((seed >> pos) & 1) == 1 )
+			ones += 1;
+	}
+	return ones;
+}
+
+const uint MIN_DNA_SEED_WEIGHT = 5;
+const uint MAX_DNA_SEED_WEIGHT = 31;
+
+/**
+ * Calculate the default seed weight based on sequence length
+ */
+#ifdef __cplusplus
+static
+#endif
+uint getDefaultSeedWeight( gnSeqI avg_sequence_length );
+
+#ifdef __cplusplus
+inline static
+#endif
+uint getDefaultSeedWeight( gnSeqI avg_sequence_length ){
+	/*
+	 * Default weight is ceil( log2(length) / 1.5 ), bumped to the next odd
+	 * number, set to 0 when below MIN_DNA_SEED_WEIGHT and capped at
+	 * MAX_DNA_SEED_WEIGHT.  An empty sequence gets weight 0.
+	 */
+	uint mer_size;
+	/* handle length 0 first: log(0) is -infinity and must not be cast to uint */
+	if( avg_sequence_length == 0 )
+		return 0;
+	mer_size = (uint)ceil((log( (double)avg_sequence_length ) / log( 2.0 ))/1.5);
+	/* don't allow even weights-- they can be palindromic */
+	if( !(mer_size & 0x1 ) )
+		++mer_size;
+	mer_size = mer_size < MIN_DNA_SEED_WEIGHT ? 0 : mer_size;
+
+	/* 31 is the maximum DNA seed weight */
+	mer_size = mer_size > MAX_DNA_SEED_WEIGHT ? MAX_DNA_SEED_WEIGHT : mer_size;
+	return mer_size;
+}
+
+
+#endif // _SeedMasks_h_
diff --git a/libMems/SeedOccurrenceList.h b/libMems/SeedOccurrenceList.h
new file mode 100644
index 0000000..e2cbcb0
--- /dev/null
+++ b/libMems/SeedOccurrenceList.h
@@ -0,0 +1,100 @@
+#ifndef __SeedOccurrenceList_h__
+#define __SeedOccurrenceList_h__
+
+#include <vector>
+#include "libMems/SortedMerList.h"
+#include <boost/iostreams/device/mapped_file.hpp>
+#include <boost/filesystem.hpp>
+#include <fstream>
+#include "libMems/Files.h"
+
+namespace mems
+{
+
+/**
+ * Stores, for every sequence position, how often the seed pattern found
+ * there occurs in a sorted mer list, smoothed over all seeds that cover
+ * the position.
+ */
+class SeedOccurrenceList
+{
+public:
+	typedef float32 frequency_type;
+
+	SeedOccurrenceList(){}
+
+	/**
+	 * Fills the list from a sorted mer list.  Runs of consecutive entries of
+	 * sml that agree under the seed mask are counted, and the run length is
+	 * written to the sequence position of every member of the run.  The
+	 * counts are then smoothed and any remaining zero is replaced by 1.
+	 */
+	template< typename SMLType >
+	void construct( SMLType& sml )
+	{
+		const size_t total_len = sml.Length();
+		count.resize(total_len);
+		size_t seed_start = 0;
+		size_t cur_seed_count = 1;
+		uint64 mer_mask = sml.GetSeedMask();
+		size_t seedI = 1;
+		bmer prevmer;
+		bmer merI;
+		const size_t sml_length = sml.SMLLength();
+		if( sml_length > 0 )
+			merI = sml[0];
+		for( seedI = 1; seedI < sml_length; seedI++ )
+		{
+			prevmer = merI;
+			merI = sml[seedI];
+			if( (merI.mer & mer_mask) == (prevmer.mer & mer_mask) )
+			{
+				++cur_seed_count;
+				continue;
+			}
+			// a run just ended: set seed frequencies for all of its members
+			for( size_t i = seed_start; i < seedI; ++i )
+				count[sml[i].position] = (frequency_type)cur_seed_count;
+			seed_start = seedI;
+			cur_seed_count = 1;
+		}
+		// set seed frequencies for the last few
+		for( size_t i = seed_start; i < seedI && i < sml_length; ++i )
+			count[sml[i].position] = (frequency_type)cur_seed_count;
+		// hack: fudge the last few values on the end of the sequence, necessary when sequence isn't circular
+		for( ; seedI < total_len; ++seedI )
+			count[seedI]=1;
+
+		smoothFrequencies( sml );
+
+		// wipe out any stray zeros
+		for( size_t i = 0; i < total_len; ++i )
+			if( count[i]== 0 )
+				count[i] = 1;
+	}
+
+
+	/** Returns the stored frequency for a position; the index is not range checked. */
+	frequency_type getFrequency( gnSeqI position )
+	{
+		return count[position];
+	}
+
+protected:
+	/**
+	 * converts position freqs to the average freq of all k-mers containing that position
+	 */
+	template< typename SMLType >
+	void smoothFrequencies( const SMLType& sml )
+	{
+		const size_t seed_length = sml.SeedLength();
+		const size_t seq_length = sml.Length();
+		// nothing to smooth, and count[0] / the division below would be invalid
+		if( count.empty() || seq_length == 0 || seed_length == 0 )
+			return;
+		// hack: for beginning (seed_length) positions assume that previous
+		// containing seeds were unique
+		double sum = seed_length - 1 + count[0];
+		// ring buffer holding the raw counts of the current window
+		std::vector<frequency_type> buf(seed_length, 1);
+		buf[0] = count[0];
+		for( size_t i = 1; i < seq_length; i++ )
+		{
+			count[i-1] = sum / seed_length;
+			sum += count[i];
+			size_t bufI = i % seed_length;
+			sum -= buf[bufI];
+			buf[bufI] = count[i];
+		}
+		// the loop writes position i-1, so the final position still holds its
+		// raw count; give it the average of the last window as well
+		count[seq_length-1] = sum / seed_length;
+	}
+
+	std::vector<frequency_type> count;
+};
+
+}
+
+#endif // __SeedOccurrenceList_h__
+
diff --git a/libMems/SlotAllocator.cpp b/libMems/SlotAllocator.cpp
new file mode 100644
index 0000000..8df67a8
--- /dev/null
+++ b/libMems/SlotAllocator.cpp
@@ -0,0 +1,5 @@
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+#include "SlotAllocator.h"
+
diff --git a/libMems/SlotAllocator.h b/libMems/SlotAllocator.h
new file mode 100644
index 0000000..59bd687
--- /dev/null
+++ b/libMems/SlotAllocator.h
@@ -0,0 +1,170 @@
+/*******************************************************************************
+ * $Id: SlotAllocator.h,v 1.6 2004/02/27 23:08:55 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifndef _SlotAllocator_h_
+#define _SlotAllocator_h_
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <vector>
+#include <list>
+#include <stdexcept>
+#include <iostream>
+#include "libMUSCLE/threadstorage.h"
+
+
+namespace mems {
+
+
+/** When more space is needed to store a datatype, the memory pool will grow by this factor */
+const double POOL_GROWTH_RATE = 1.6;
+
+/**
+ * This class allocates memory according to the slot allocation scheme for
+ * fixed size objects. Each time all slots are full it allocates a new block
+ * POOL_GROWTH_RATE times the size of the previous allocation. If it is unable
+ * to allocate a block of that size, it does a binary 'search' for the largest
+ * amount of memory it can allocate.
+ * Slots handed back through Free() are recycled by later calls to Allocate();
+ * the underlying blocks are only returned to the system by Purge().
+ */
+template< class T >
+class SlotAllocator {
+public:
+	/** Returns the shared allocator instance for type T. */
+	static SlotAllocator<T>& GetSlotAllocator();
+	/** Hands out storage for one T. */
+	T* Allocate();
+	/** Destroys *t and makes its slot available again. */
+	void Free( T* t );
+	/** Destroys every element of chunk, recycles the slots and empties chunk. */
+	void Free( std::vector<T*>& chunk );
+
+	~SlotAllocator()
+	{
+		Purge();
+	}
+
+	/**
+	 * Releases every block obtained so far and resets the bookkeeping.
+	 * Pointers handed out earlier must not be used afterwards.
+	 */
+	void Purge()
+	{
+		// (an OpenMP critical section used to guard this body; it is disabled)
+		std::vector<T*>& blocks = this->data.get();
+		unsigned& unused_tail = this->tail_free.get();
+		unsigned& block_elems = this->n_elems.get();
+		std::vector< T* >& recycled = this->free_list.get();
+		for( typename std::vector<T*>::iterator blk = blocks.begin(); blk != blocks.end(); ++blk )
+			free( *blk );
+		blocks.clear();
+		recycled.clear();
+		unused_tail = 0;
+		block_elems = 0;
+	}
+
+protected:
+	TLS< std::vector<T*> > data;		/**< every block allocated so far */
+	TLS< unsigned > tail_free;		/**< unused slots left at the end of the newest block */
+	TLS< unsigned > n_elems;		/**< number of T in the most recently allocated block */
+
+	TLS< std::vector< T* > > free_list;	/**< slots returned through Free() */
+
+private:
+	// construction and copying are private; use GetSlotAllocator()
+	SlotAllocator() : tail_free(0), n_elems(0) {}
+	SlotAllocator& operator=( SlotAllocator& sa )
+	{
+		n_elems = sa.n_elems;
+		data = sa.data;
+		tail_free = sa.tail_free;
+		return *this;
+	}
+	SlotAllocator( SlotAllocator& sa )
+	{
+		*this = sa;
+	}
+
+};
+
+/**
+ * Returns the allocator for type T.  The instance is created on first use
+ * and is never deleted.
+ */
+template< class T >
+inline
+SlotAllocator< T >& SlotAllocator< T >::GetSlotAllocator()
+{
+	static SlotAllocator< T >* const instance = new SlotAllocator< T >();
+	return *instance;
+}
+
+
+/**
+ * Returns uninitialized storage for one T.
+ * Recycled slots are used first, then unused slots at the end of the newest
+ * block.  When neither is available a new block of POOL_GROWTH_RATE times
+ * the previous block size is allocated; if that fails the request is halved
+ * until it succeeds.
+ * @throws std::out_of_range if not even a single slot can be allocated
+ */
+template< class T >
+inline
+T* SlotAllocator< T >::Allocate(){
+	std::vector<T*>& data = this->data.get();
+	unsigned& tail_free = this->tail_free.get();
+	unsigned& n_elems = this->n_elems.get();
+	std::vector< T* >& free_list = this->free_list.get();
+
+	if( !free_list.empty() ){
+		T* t_ptr = free_list.back();
+		free_list.pop_back();
+		return t_ptr;
+	}
+	if( tail_free > 0 ){
+		// slots of the newest block are handed out front to back
+		unsigned T_index = n_elems - tail_free--;
+		return &(data.back()[ T_index ]);
+	}
+
+	// Last resort:
+	// increase the size of the data array
+	unsigned new_size = (unsigned)(((double)n_elems * POOL_GROWTH_RATE)+0.5);
+	if( new_size == 0 )
+		new_size = 1;
+	T* new_data = NULL;
+	// malloc reports failure by returning NULL, it never throws, so the
+	// fallback to smaller blocks has to test the pointer
+	while( new_size > 0 ){
+		new_data = (T*)malloc(sizeof(T)*new_size);
+		if( new_data != NULL )
+			break;
+		new_size = new_size / 2;
+	}
+	if( new_data == NULL ){
+		throw std::out_of_range( "SlotAllocator::Allocate(): Unable to allocate more memory" );
+	}
+	data.push_back( new_data );
+	tail_free = new_size - 1;	// slot 0 is returned right away
+	n_elems = new_size;
+	return &data.back()[0];
+}
+
+/**
+ * Runs the destructor of *t and puts its slot on the free list so that a
+ * later Allocate() can hand it out again.  Double frees are not detected.
+ */
+template< class T >
+inline
+void SlotAllocator< T >::Free( T* t )
+{
+	t->~T();
+	this->free_list.get().push_back( t );
+}
+
+/**
+ * Bulk variant of Free(): destroys every object referenced by chunk, appends
+ * all of the slots to the free list and leaves chunk empty.  Double frees
+ * are not detected.
+ */
+template< class T >
+inline
+void SlotAllocator< T >::Free( std::vector<T*>& chunk )
+{
+	for( typename std::vector<T*>::iterator it = chunk.begin(); it != chunk.end(); ++it )
+		(*it)->~T();
+
+	std::vector< T* >& recycled = this->free_list.get();
+	recycled.insert( recycled.end(), chunk.begin(), chunk.end() );
+
+	chunk.clear();
+}
+
+}
+
+#endif // _SlotAllocator_h_
diff --git a/libMems/SortedMerList.cpp b/libMems/SortedMerList.cpp
new file mode 100644
index 0000000..dcfe808
--- /dev/null
+++ b/libMems/SortedMerList.cpp
@@ -0,0 +1,826 @@
+/*******************************************************************************
+ * $Id: SortedMerList.cpp,v 1.23 2004/03/01 02:40:08 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * Please see the file called COPYING for licensing, copying, and modification
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/SortedMerList.h"
+
+using namespace std;
+using namespace genome;
+namespace mems {
+
+/** Returns the shared DNA translation table, built on first use. */
+const uint8* SortedMerList::BasicDNATable()
+{
+	static const uint8* const dna_table = SortedMerList::CreateBasicDNATable();
+	return dna_table;
+}
+
+/** Returns the shared protein translation table, built on first use. */
+const uint8* SortedMerList::ProteinTable()
+{
+	static const uint8* const protein_table = SortedMerList::CreateProteinTable();
+	return protein_table;
+}
+
+/**
+ * Allocates a table of UINT8_MAX entries mapping characters to 2-bit DNA
+ * codes.  Every character not listed below maps to 0.
+ */
+const uint8* SortedMerList::CreateBasicDNATable(){
+	static const char code_one[] = "cCbByY";
+	static const char code_two[] = "gGsSkK";
+	static const char code_three[] = "tT";
+
+	uint8* table = new uint8[UINT8_MAX];
+	memset(table, 0, UINT8_MAX);
+	for( const char* ch = code_one; *ch != 0; ++ch )
+		table[(int)*ch] = 1;
+	for( const char* ch = code_two; *ch != 0; ++ch )
+		table[(int)*ch] = 2;
+	for( const char* ch = code_three; *ch != 0; ++ch )
+		table[(int)*ch] = 3;
+	return table;
+}
+
+/**
+ * Allocates a table of UINT8_MAX entries mapping amino acid letters to the
+ * codes 0..19.  Upper and lower case letters share a code; every other
+ * character maps to 0.
+ */
+const uint8* SortedMerList::CreateProteinTable(){
+	// the position of a letter in these strings is its code
+	static const char upper_case[] = "ARNDCQEGHILKMFPSTWYV";
+	static const char lower_case[] = "arndcqeghilkmfpstwyv";
+
+	uint8* table = new uint8[UINT8_MAX];
+	memset(table, 0, UINT8_MAX);
+	for( uint8 code = 0; upper_case[code] != 0; ++code ){
+		table[(int)upper_case[code]] = code;
+		table[(int)lower_case[code]] = code;
+	}
+	return table;
+}
+
+/** Creates an empty list configured with the BasicDNA defaults. */
+SortedMerList::SortedMerList()
+{
+	// no sequence data yet
+	sequence = NULL;
+	binary_seq_len = 0;
+
+	// masks are computed later
+	mer_mask = 0;
+	seed_mask = 0;
+	mask_size = DNA_MER_SIZE;
+
+	// header defaults for BasicDNA
+	header.id = 0;
+	header.length = 0;
+	header.circular = false;
+	header.alphabet_bits = 2;
+	header.seed_length = DNA_MER_SIZE;
+	header.unique_mers = NO_UNIQUE_COUNT;
+	header.description[0] = 0;
+	memcpy(header.translation_table, BasicDNATable(), UINT8_MAX);
+}
+
+/** Copy constructor; delegates to the assignment operator. */
+SortedMerList::SortedMerList( const SortedMerList& sa )
+{
+	// the assignment operator inspects sequence, so it must be defined first
+	sequence = NULL;
+	operator=( sa );
+}
+
+/**
+ * Copies the header, the masks and a deep copy of the binary sequence.
+ * Self-assignment is a no-op.  The new buffer is allocated before the old
+ * one is released, so a failed allocation leaves this object unchanged.
+ */
+SortedMerList& SortedMerList::operator=(const SortedMerList& sa)
+{
+	// without this check the buffer would be deleted and then copied from
+	if( this == &sa )
+		return *this;
+
+	// copy binary sequence data
+	uint32* new_sequence = NULL;
+	if( sa.sequence != NULL ){
+		new_sequence = new uint32[sa.binary_seq_len];
+		memcpy(new_sequence, sa.sequence, sizeof(uint32) * sa.binary_seq_len);
+	}
+
+	header = sa.header;
+	mer_mask = sa.mer_mask;
+	seed_mask = sa.seed_mask;
+	mask_size = sa.mask_size;
+	binary_seq_len = sa.binary_seq_len;
+
+	// release the old buffer even when the source has none (was leaked before)
+	delete[] sequence;
+	sequence = new_sequence;
+
+	return *this;
+}
+
+/** Releases the binary sequence buffer, if any. */
+SortedMerList::~SortedMerList()
+{
+	// delete[] on a null pointer does nothing
+	delete[] sequence;
+}
+
+/** Drops the sequence data and resets the list to the BasicDNA defaults. */
+void SortedMerList::Clear()
+{
+	// release sequence data
+	if( sequence != NULL ){
+		delete[] sequence;
+		sequence = NULL;
+	}
+	binary_seq_len = 0;
+
+	// masks are recomputed later
+	mer_mask = 0;
+	seed_mask = 0;
+	mask_size = DNA_MER_SIZE;
+
+	// header defaults for BasicDNA
+	header.id = 0;
+	header.length = 0;
+	header.circular = false;
+	header.alphabet_bits = 2;
+	header.seed_length = DNA_MER_SIZE;
+	header.unique_mers = NO_UNIQUE_COUNT;
+	header.description[0] = 0;
+	memcpy(header.translation_table, BasicDNATable(), UINT8_MAX);
+}
+
+/**
+ * Returns how many characters of header.alphabet_bits bits each fit into
+ * the mer field of a bmer.
+ */
+uint32 SortedMerList::CalculateMaxMerSize() const
+{
+	bmer probe;
+	const size_t mer_bits = sizeof(probe.mer) * 8;
+	return mer_bits / header.alphabet_bits;
+}
+
+/**
+ * Binary-searches the list for query_mer.
+ * @param query_mer	the mer to look for
+ * @param result	receives the index returned by bsearch(); only written
+ *			when the list is long enough to be searched
+ * @return true if the entry at result has exactly the queried mer
+ */
+boolean SortedMerList::FindMer(const uint64 query_mer, gnSeqI& result)
+{
+	const gnSeqI seq_len = Length();
+	if( seq_len == 0 )
+		return false;
+	if( seq_len < header.seed_length && !header.circular )
+		return false;
+
+	bmer probe;
+	probe.mer = query_mer;
+	// index of the last searchable entry
+	gnSeqI upper = seq_len;
+	upper -= header.circular ? 1 : header.seed_length;
+	result = bsearch(probe, 0, upper );
+	return (*this)[result].mer == probe.mer;
+}
+
+/**
+ * Translates (a prefix of) query_seq to its binary form and looks it up
+ * with FindMer().  At most as many characters as fit into 64 bits are used.
+ */
+boolean SortedMerList::Find(const string& query_seq, gnSeqI& result)
+{
+	struct bmer probe;
+	probe.mer = 0;
+
+	// clip the query so that it fits into one 64-bit mer
+	gnSeqI usable;
+	if( query_seq.length() * header.alphabet_bits < 64 )
+		usable = query_seq.length();
+	else
+		usable = 64 / header.alphabet_bits;
+
+	translate((uint8*)&probe.mer, query_seq.c_str(), usable);
+	return FindMer( probe.mer, result );
+}
+
+/**
+ * Collects the indices of all list entries whose mer equals the translated
+ * query.  One match is located by binary search and the run of equal mers
+ * around it is scanned in both directions.  Neither scan leaves the range
+ * [0, last searchable entry], and an empty or too-short list yields nothing.
+ *
+ * NOTE(review): result is taken by value, so the indices appended here are
+ * not visible to the caller.  Fixing that means changing the declaration in
+ * SortedMerList.h to take a reference, which cannot be done from this file.
+ */
+void SortedMerList::FindAll(const string& query_seq, vector<gnSeqI> result) {
+	struct bmer merle;
+	merle.mer = 0;
+
+	//check the length to make sure it is small enough
+	gnSeqI len = query_seq.length() * header.alphabet_bits < 64 ?
+		query_seq.length() : 64 / header.alphabet_bits;
+
+	translate((uint8*)&merle.mer, query_seq.c_str(), len);
+
+	// same preconditions as FindMer(): nothing to search in a list that is
+	// empty or shorter than one seed (would underflow last_pos below)
+	gnSeqI last_pos = Length();
+	if( last_pos == 0 || (last_pos < header.seed_length && !header.circular) )
+		return;
+	last_pos -= header.circular ? 1 : header.seed_length;
+
+	//find the first match then start filling forward.
+	const gnSeqI matchI = bsearch(merle, 0, last_pos);
+
+	//first seek backwards; test the index before reading the entry
+	int64 first_matchI = matchI;
+	while(first_matchI >= 0 && (*this)[first_matchI].mer == merle.mer)
+		first_matchI--;
+	first_matchI++;
+
+	//now seek forwards, never past the last searchable entry
+	int64 end_matchI = (int64)matchI + 1;
+	while(end_matchI <= (int64)last_pos && (*this)[end_matchI].mer == merle.mer)
+		end_matchI++;
+
+	//fill the result array
+	for(int64 cur_matchI = first_matchI; cur_matchI < end_matchI; cur_matchI++)
+		result.push_back(cur_matchI);
+}
+
+/** Returns a copy of the description stored in the header. */
+string SortedMerList::Description() const
+{
+	return string( header.description );
+}
+
+/**
+ * Stores d in the header, truncated to DESCRIPTION_SIZE-1 characters.
+ */
+void SortedMerList::SetDescription(const string& d){
+	strncpy(header.description, d.c_str(), DESCRIPTION_SIZE-1);
+	// strncpy does not terminate the destination when the source is too long.
+	// NOTE(review): assumes header.description holds DESCRIPTION_SIZE bytes,
+	// as the DESCRIPTION_SIZE-1 bound above suggests -- confirm in the header.
+	header.description[DESCRIPTION_SIZE-1] = 0;
+}
+
+/** Returns the seed length recorded in the header. */
+uint SortedMerList::SeedLength() const
+{
+	return header.seed_length;
+}
+/**
+ * Returns the weight of the seed that this SML was sorted on.
+ */
+// reads the weight field of the header
+uint SortedMerList::SeedWeight() const
+{
+	return header.seed_weight;
+}
+/**
+ * Returns the seed pattern that this SML was sorted on.
+ */
+// reads the seed pattern field of the header
+uint64 SortedMerList::Seed() const
+{
+	return header.seed;
+}
+
+/** Returns the circular flag recorded in the header. */
+boolean SortedMerList::IsCircular() const
+{
+	return header.circular;
+}
+
+/** Returns the current mer mask (see SetMerMaskSize()). */
+uint64 SortedMerList::GetMerMask() const
+{
+	return mer_mask;
+}
+
+/** Returns the current seed mask. */
+uint64 SortedMerList::GetSeedMask() const
+{
+	return seed_mask;
+}
+
+/** Returns the mask size stored by SetMerMaskSize(). */
+uint32 SortedMerList::GetMerMaskSize() const
+{
+	return mask_size;
+}
+
+/**
+ * Sets the mask size (clamped to the seed length) and rebuilds mer_mask so
+ * that the top alphabet_bits * mer_size bits are set.
+ * A request for zero bits yields an empty mask and a request for 64 bits or
+ * more yields a full mask; both previously shifted by an out-of-range count.
+ *
+ * NOTE(review): as before, the mask is built from the requested mer_size,
+ * not from the clamped mask_size -- confirm which one is intended.
+ */
+void SortedMerList::SetMerMaskSize(uint32 mer_size){
+	if(mer_size > header.seed_length)
+		mask_size = header.seed_length;
+	else
+		mask_size = mer_size;
+
+	// calculate the mer mask
+	const uint64 mask_bits = (uint64)header.alphabet_bits * (uint64)mer_size;
+	const uint64 all_ones = ~(uint64)0;
+	if( mask_bits == 0 )
+		mer_mask = 0;
+	else if( mask_bits >= 64 )
+		mer_mask = all_ones;
+	else
+		mer_mask = all_ones << (64 - mask_bits);
+}
+
+/** Returns the sequence length recorded in the header. */
+gnSeqI SortedMerList::Length() const
+{
+	return header.length;
+}
+
+/**
+ * Returns the number of seeds in the list: 0 when the sequence is shorter
+ * than one seed, one per position for circular sequences, and
+ * length - seed_length + 1 otherwise.
+ */
+gnSeqI SortedMerList::SMLLength() const
+{
+	// make sure there was at least one seed
+	if( header.length < header.seed_length )
+		return 0;
+	if( header.circular )
+		return header.length;
+	return header.length - header.seed_length + 1;
+}
+
+/** Returns the id recorded in the header. */
+sarID_t SortedMerList::GetID() const
+{
+	return header.id;
+}
+/** Stores d as the id in the header. */
+void SortedMerList::SetID(const sarID_t d)
+{
+	header.id = d;
+}
+
+#define OPT_HEADER_ALPHABET_BITS DNA_ALPHA_BITS
+
+/**
+ * Replaces the stored binary sequence with the packed form of seq_buf.
+ * The buffer is sized for seq_len characters plus two padding words, and the
+ * whole buffer is zeroed before packing so that the padding really is zero.
+ * @param seq_buf	the character sequence to pack
+ * @param seq_len	number of characters in seq_buf
+ */
+void SortedMerList::SetSequence(gnSeqC* seq_buf, gnSeqI seq_len){
+	binary_seq_len = (seq_len * header.alphabet_bits) / 32;
+	if((seq_len * header.alphabet_bits) % 32 != 0)
+		binary_seq_len++;
+
+	binary_seq_len+=2;	// zero-pad the end for extra working room
+
+	// drop the old buffer first and forget it, so a failing new[] cannot
+	// leave a dangling pointer behind
+	delete[] sequence;
+	sequence = NULL;
+	sequence = new uint32[binary_seq_len];
+	// new[] leaves the words uninitialized and translate32 only clears the
+	// words it writes; GetMer reads up to two words past the data
+	memset(sequence, 0, sizeof(uint32) * binary_seq_len);
+	translate32(sequence, seq_buf, seq_len);
+}
+
+// this should return a mer containing all characters covered by the
+// spaced seed
+/**
+ * Reads 64 bits of the packed sequence beginning at the given character
+ * position and returns them masked with mer_mask.  No bounds checking is
+ * done; up to three consecutive words of the buffer are read.
+ */
+uint64 SortedMerList::GetMer(gnSeqI position) const
+{
+	const gnSeqI first_word = (position * (gnSeqI)OPT_HEADER_ALPHABET_BITS) / (gnSeqI)32;
+	const gnSeqI bit_in_word = (position * (gnSeqI)OPT_HEADER_ALPHABET_BITS) % (gnSeqI)32;
+
+	// two whole words form the unaligned 64-bit window
+	uint64 window = sequence[first_word];
+	window <<= 32;
+	window |= sequence[first_word + 1];
+
+	// when the position is not word aligned, shift the window left and pull
+	// the missing low bits in from the third word
+	if( bit_in_word > 0 ){
+		uint32 spill = sequence[first_word + 2];
+		spill >>= 32 - bit_in_word;
+		window <<= bit_in_word;
+		window |= spill;
+	}
+	return window & mer_mask;
+}
+
+//potential buffer overflows here. make dest extra big.
+// Copies len characters of the packed sequence, starting at character
+// offset, into dest and shifts them so that the first requested character
+// starts at the top bit of dest[0].
+// Throws IndexOutOfBounds (via Throw_gnEx) when offset is not inside the sequence.
+// The read is clipped to the end of the sequence.
+// NOTE(review): dest needs room for at least word_read_len words (computed
+// below), and ShiftWords may touch one word more -- callers should oversize it.
+void SortedMerList::GetBSequence(uint32* dest, const gnSeqI len, const gnSeqI offset){
+ //first determine the byte offset of the sequence within the file.
+ if(offset >= header.length){
+ Throw_gnEx( IndexOutOfBounds() );
+ }
+ // word index and bit position inside that word of the first character
+ uint64 startpos = (offset * OPT_HEADER_ALPHABET_BITS) / 32;
+ int begin_remainder = (offset * OPT_HEADER_ALPHABET_BITS) % 32;
+ // clip the request to what is left of the sequence
+ uint64 readlen = offset + len < header.length ? len : header.length - offset;
+
+ // whole words needed, plus partial words at either end
+ gnSeqI word_read_len = (readlen * OPT_HEADER_ALPHABET_BITS) / 32;
+ int end_remainder = (readlen * OPT_HEADER_ALPHABET_BITS) % 32;
+ if(begin_remainder + (readlen * OPT_HEADER_ALPHABET_BITS) > 32
+ && end_remainder > 0)
+ word_read_len++;
+ if(begin_remainder > 0)
+ word_read_len++;
+
+ //now do the actual read
+ memcpy((char*)dest, (char*)sequence + (startpos * 4), word_read_len * 4);
+
+ //now shift if needed
+ // a negative bit count makes ShiftWords shift left
+ ShiftWords(dest, word_read_len, -begin_remainder);
+
+ //now mask if needed
+ // clears the bits that follow the last requested character
+ if(end_remainder > begin_remainder){
+ uint32 mask = 0xFFFFFFFF;
+ mask <<= 32 - (end_remainder - begin_remainder);
+ dest[word_read_len-1] &= mask;
+ }else if(end_remainder < begin_remainder){
+ uint32 mask = 0xFFFFFFFF;
+ mask <<= (begin_remainder - end_remainder);
+ // NOTE(review): indexes word_read_len-2; confirm word_read_len >= 2 on this path
+ dest[word_read_len-2] &= mask;
+ }
+}
+
+/**
+ * Binary search for query_mer among the entries start..end (inclusive).
+ * @return the index of an entry with an equal mer, or, when there is none,
+ *         the index at which the search stopped
+ */
+gnSeqI SortedMerList::bsearch(const struct bmer& query_mer, const gnSeqI start, const gnSeqI end) {
+	gnSeqI low = start;
+	gnSeqI high = end;
+	for(;;){
+		const gnSeqI middle = (low + high) / 2;
+		const struct bmer midmer = (*this)[middle];
+		if(midmer.mer == query_mer.mer)
+			return middle;
+		if((midmer.mer < query_mer.mer) && (middle < high)){
+			low = middle + 1;	// continue in the upper half
+			continue;
+		}
+		if((midmer.mer > query_mer.mer) && (low < middle)){
+			high = middle - 1;	// continue in the lower half
+			continue;
+		}
+		//if we get here then the mer was not found.
+		//return where it would be if it existed.
+		return middle;
+	}
+}
+
+//translate the character sequence to binary form based on the
+//translation table.
+/**
+ * Packs len characters of src into dest, most significant bits first, using
+ * the header's translation table and OPT_HEADER_ALPHABET_BITS bits per
+ * character.
+ * dest must hold ceil(len * bits / 8) bytes.  Like translate32(), this never
+ * writes past that: the next byte is only started when another character
+ * follows.  Previously a trailing zero byte was written after an exactly
+ * filled last byte, and a character straddling two bytes lost its low bits.
+ */
+void SortedMerList::translate(uint8* dest, const gnSeqC* src, const gnSeqI len) const{
+	uint8 start_bit = 0;
+	gnSeqI cur_byte = 0;
+	const uint32 alpha_bits = OPT_HEADER_ALPHABET_BITS;
+	dest[cur_byte] = 0;
+	for(uint32 i=0; i < len; i++){
+		uint8 tmp = header.translation_table[src[i]];
+		if(start_bit + alpha_bits <= 8){
+			// the character fits into the current byte
+			tmp <<= 8 - start_bit - alpha_bits;
+			dest[cur_byte] |= tmp;
+			start_bit += alpha_bits;
+			// open the next byte only if there is something left to store
+			if(start_bit >= 8 && i + 1 < len){
+				start_bit %= 8;
+				cur_byte++;
+				dest[cur_byte] = 0;
+			}
+		}else{
+			// the character straddles two bytes: high part here, low part next
+			uint8 over_bits = (start_bit + alpha_bits) % 8;
+			uint8 tmp2 = tmp;
+			tmp2 <<= 8 - over_bits;
+			tmp >>= over_bits;
+			dest[cur_byte] |= tmp;
+			cur_byte++;
+			dest[cur_byte] = tmp2;
+			start_bit = over_bits;
+		}
+	}
+}
+
+// Packs len characters of src into 32-bit words of dest, most significant
+// bits first, OPT_HEADER_ALPHABET_BITS bits per character, using the
+// header's translation table.  Does nothing for len == 0.
+// Throws a C string ("Gap in genome sequence\n") when src contains '-'.
+// Only the words that receive characters are written; any further words of
+// dest keep whatever they held before.
+void SortedMerList::translate32(uint32* dest, const gnSeqC* src, const gnSeqI len) const{
+ if( len == 0 )
+ return;
+ uint8 start_bit = 0;
+ gnSeqI cur_word = 0;
+ const uint32 alpha_bits = OPT_HEADER_ALPHABET_BITS;
+ dest[cur_word] = 0;
+ for(uint32 i=0; i < len; i++){
+ if(src[i]=='-'){
+ cerr << "ERROR! gap character encountered at genome sequence position " << i << std::endl;
+ cerr << "Input sequences must be unaligned and ungapped!\n";
+ throw "Gap in genome sequence\n";
+ }
+ uint32 tmp = header.translation_table[src[i]];
+ if(start_bit + alpha_bits <= 32){
+ // the character fits into the current word
+ tmp <<= 32 - start_bit - alpha_bits;
+ dest[cur_word] |= tmp;
+ start_bit += alpha_bits;
+ // start a new word only when another character follows
+ if(start_bit >= 32 && i < len - 1){
+ start_bit %= 32;
+ cur_word++;
+ dest[cur_word] = 0;
+ }
+ }else{
+ // the character straddles two words: high part here, low part in the next
+ uint8 over_bits = (start_bit + alpha_bits) % 32;
+ uint32 tmp2 = tmp;
+ tmp2 <<= 32 - over_bits;
+ tmp >>= over_bits;
+ dest[cur_word] |= tmp;
+ cur_word++;
+ dest[cur_word] = 0;
+ dest[cur_word] |= tmp2;
+ start_bit = over_bits;
+ }
+ }
+}
+/** Returns a copy of the header. */
+SMLHeader SortedMerList::GetHeader() const
+{
+	return header;
+}
+
+/**
+ * Returns the number of distinct mers (compared under mer_mask) in the list.
+ * The value is cached in header.unique_mers.  The list is read in buffers of
+ * MER_BUFFER_SIZE entries and a count is added for every change between
+ * neighbouring entries; progress and a final newline are printed to cout.
+ */
+gnSeqI SortedMerList::UniqueMerCount(){
+	if(header.unique_mers != NO_UNIQUE_COUNT)
+		return header.unique_mers;
+
+	uint32 MER_BUFFER_SIZE = 16384;	//not quite arbitrary (2^14)
+	gnSeqI cur_pos = 0;
+	vector<bmer> mer_vector;
+	bmer prev_mer;
+	gnSeqI m_unique = 0;
+	gnSeqI report_interval = MER_BUFFER_SIZE * 212;
+	while(cur_pos < header.length){
+		if(!Read(mer_vector, MER_BUFFER_SIZE, cur_pos)){
+			break;
+		}
+		uint32 mer_count = mer_vector.size();
+		if(mer_count == 0)
+			break;
+		// compare across the buffer boundary under the same mask that is
+		// used inside a buffer; it was compared unmasked before
+		if(cur_pos > 0 && (prev_mer.mer & mer_mask) != (mer_vector[0].mer & mer_mask))
+			m_unique++;
+
+		//count them up.
+		for(uint32 j = 1; j < mer_count; j++){
+			if((mer_vector[j-1].mer & mer_mask) != (mer_vector[j].mer & mer_mask) )
+				m_unique++;
+		}
+		// remember the last entry for the next boundary comparison
+		prev_mer = mer_vector[mer_count-1];
+		cur_pos += mer_count;
+		if( cur_pos % report_interval == 0 ){
+			cout << m_unique << "/" << cur_pos << endl;
+		}
+	}
+	cout << endl;
+	// the changes counted above are one fewer than the number of runs
+	m_unique++;
+	header.unique_mers = m_unique;
+	return header.unique_mers;
+}
+
+//will not handle more than 8GB sequence on 32-bit systems
+/**
+ * Shifts an array of words as one long bit string.
+ * @param data		the words; data[0] holds the most significant bits
+ * @param length	number of words in data
+ * @param bits		positive: shift right by that many bits; negative: shift
+ *			left; shifts of zero or of a whole word and more are ignored
+ * Only data[0..length-1] is accessed and zero bits are shifted in at the
+ * open end.  The left shift used to read and write data[length].
+ */
+void SortedMerList::ShiftWords(unsigned int* data, uint32 length, int32 bits)
+{
+	if( length == 0 )
+		return;	// nothing to shift; data[length - 1] would be out of range
+	int32 word_bits = 8 * sizeof(unsigned int);
+	if(bits > 0 && bits < word_bits){
+		//shift everything right starting at the end
+		data[length - 1] >>= bits;
+		for(int i=length-2; i >= 0; i--){
+			// carry the low bits of word i into the top of word i+1
+			uint32 tmp = data[i];
+			tmp <<= word_bits - bits;
+			data[i+1] |= tmp;
+			data[i] >>= bits;
+		}
+	}else if(bits < 0 && bits > (-1)*word_bits){
+		bits *= -1;
+		//shift everything left
+		data[0] <<= bits;
+		for(uint32 i=0; i + 1 < length; i++){
+			// carry the high bits of word i+1 into the bottom of word i
+			uint32 tmp = data[i+1];
+			tmp >>= word_bits - bits;
+			data[i] |= tmp;
+			data[i+1] <<= bits;
+		}
+	}
+}
+
+/**
+ * Appends one bmer per seed position of seq_buf to sml_array, in sequence
+ * order.  Each mer holds header.seed_length characters packed into the top
+ * bits of the 64-bit mer field.
+ * Sequences shorter than one seed produce no entries, as in FillDnaSML().
+ *
+ * NOTE(review): for circular sequences the loop reads
+ * seq_buf[seq_len .. seq_len + seed_length - 2]; this presumably relies on
+ * the caller having appended the wrap-around characters -- confirm.
+ */
+void SortedMerList::FillSML(gnSeqC* seq_buf, gnSeqI seq_len, boolean circular, vector<bmer>& sml_array){
+	const uint32 alpha_bits = OPT_HEADER_ALPHABET_BITS;
+	const uint32 mer_size = header.seed_length;
+	// can't have an sml if there isn't enough sequence; without this check
+	// sar_len underflows and the first mer is read past the buffer
+	if( seq_len < mer_size )
+		return;
+	gnSeqI sar_len = seq_len;
+	if(!circular)
+		sar_len -= header.seed_length - 1;
+	sml_array.reserve(sar_len);
+
+	bmer cur_suffix;
+	cur_suffix.mer = 0;
+	cur_suffix.position = 0;
+
+	/* now fill in the suffix array with the forward sequence*/
+	for(gnSeqI i=0; i < mer_size; i++){
+		cur_suffix.mer <<= alpha_bits;
+		cur_suffix.mer |= header.translation_table[seq_buf[i]];
+	}
+	// unused low bits below the mer
+	uint8 dead_bits = 64 - (mer_size * alpha_bits);
+	cur_suffix.mer <<= dead_bits;
+
+	sml_array.push_back(cur_suffix);
+
+	//fill sml_array with mers; the first one was added above
+	for(gnSeqI seqI = 1; seqI < sar_len; seqI++){
+		// slide the window: drop the oldest character, append the next one
+		cur_suffix.position++;
+		cur_suffix.mer <<= alpha_bits;
+		uint64 new_mer = header.translation_table[seq_buf[seqI+(mer_size-1)]];
+		new_mer <<= dead_bits;
+		cur_suffix.mer |= new_mer;
+		sml_array.push_back(cur_suffix);
+	}
+}
+
+/**
+ * Copies the characters of seq into a temporary buffer and fills sml_array with
+ * the unsorted mers of that buffer.
+ * NOTE(review): the buffer holds exactly seq.length() characters, with no extra
+ * wrap-around characters for circular sequences -- confirm the buffer-based
+ * FillSML does not read past it.
+ */
+void SortedMerList::FillSML(const gnSequence& seq, vector<bmer>& sml_array){
+    const boolean wraps_around = seq.isCircular();
+    gnSeqI char_count = seq.length();
+    // Array releases the buffer when it goes out of scope
+    Array<gnSeqC> chars( char_count );
+    seq.ToArray(chars.data, char_count);
+    FillSML(chars.data, char_count, wraps_around, sml_array);
+}
+
+/**
+ * Replaces the contents of pos_array with the positions 0, 1, ..., seq_len - 1.
+ */
+void SortedMerList::FillSML(gnSeqI seq_len, vector<gnSeqI>& pos_array){
+    pos_array.clear();
+    pos_array.reserve( seq_len );
+    gnSeqI next_pos = 0;
+    while( next_pos < seq_len ){
+        pos_array.push_back( next_pos );
+        ++next_pos;
+    }
+}
+
+/**
+ * Returns the smaller of the forward mer at offset and its reverse complement.
+ */
+uint64 SortedMerList::GetDnaMer(gnSeqI offset) const
+{
+    // forward orientation, always read with the base class implementation
+    const uint64 forward_mer = SortedMerList::GetMer( offset );
+    const uint64 reverse_mer = RevCompMer( forward_mer, header.seed_length );
+
+    if( reverse_mer < forward_mer )
+        return reverse_mer;
+    return forward_mer;
+}
+
+#define OPT_ALPHA_MASQ 0x00000003
+
+/**
+ * Computes the reverse complement of a left-aligned binary encoded mer.
+ * The complement is taken by inverting every bit; the character order is then
+ * reversed one character at a time and the result is left-aligned again.
+ * The lowest bit of the result is always set.
+ * NOTE(review): the final shift is only valid while
+ * OPT_HEADER_ALPHABET_BITS * (mer_length + 1) <= 64 -- confirm callers respect this.
+ * @param mer_a      The mer to reverse complement
+ * @param mer_length The number of characters in mer_a
+ */
+uint64 SortedMerList::RevCompMer( uint64 mer_a, int mer_length ) const
+{
+    uint64 remaining = ~mer_a;  // complemented characters still to be moved
+    uint64 rev_comp = 0;        // accumulates the characters in reverse order
+    for( uint32 bitI = 0; bitI < 64; bitI += OPT_HEADER_ALPHABET_BITS ){
+        // append the lowest remaining character, then make room for the next one
+        rev_comp = (rev_comp | (remaining & OPT_ALPHA_MASQ)) << OPT_HEADER_ALPHABET_BITS;
+        remaining >>= OPT_HEADER_ALPHABET_BITS;
+    }
+    // push the mer's characters up to the high-order end of the word
+    rev_comp <<= 64 - (OPT_HEADER_ALPHABET_BITS * (mer_length+1));
+    return rev_comp | 1;
+}
+
+
+/**
+ * Fills sml_array with one bmer per position: the smaller of the forward mer and
+ * its reverse complement.  Reverse complement mers always have their lowest bit set.
+ * The characters are read from the binary encoded member 'sequence'; seq is only
+ * consulted for its length and circularity.
+ * Nothing is added when the sequence is shorter than the seed length.
+ */
+void SortedMerList::FillDnaSML(const gnSequence& seq, vector<bmer>& sml_array){
+ /* now fill in the suffix array with the forward sequence*/
+ const uint32 alpha_bits = OPT_HEADER_ALPHABET_BITS;
+ const uint32 mer_size = header.seed_length;
+ gnSeqI sar_len = seq.length();
+ if( sar_len < header.seed_length )
+ return; // can't have an sml if there ain't enough sequence
+ if( !seq.isCircular() )
+ sar_len -= ( header.seed_length - 1);
+ sml_array.reserve(sar_len);
+
+ // dead_bits: unused low-order bits below a left-aligned mer
+ // create_mask: all ones except for the dead bits
+ uint32 dead_bits = 64 - (mer_size * alpha_bits);
+ uint64 create_mask = UINT32_MAX;
+ create_mask <<= 32;
+ create_mask |= UINT32_MAX;
+ create_mask <<= dead_bits;
+
+ // the first forward mer is the first two words of the encoded sequence with the dead bits cleared
+ // NOTE(review): sequence[1] is read unconditionally -- confirm the encoded sequence always has at least two words
+ bmer cur_suffix, rcur_suffix;
+ cur_suffix.mer = sequence[0];
+ cur_suffix.mer <<= 32;
+ cur_suffix.mer |= sequence[1];
+ cur_suffix.mer &= create_mask;
+ cur_suffix.position = 0;
+ rcur_suffix.mer = 0;
+ rcur_suffix.position = 0;
+
+ //build the reverse complement of cur_suffix.mer in rcur_suffix.mer:
+ //invert every bit, then reverse the order of the characters
+ uint64 mer_b = 0;
+ mer_b = ~cur_suffix.mer;
+// uint32 masq = 0xffffffff;
+// masq >>= 32 - alpha_bits;
+ for(uint32 i = 0; i < 64; i += alpha_bits){
+// rcur_suffix.mer |= mer_b & masq;
+ rcur_suffix.mer |= mer_b & OPT_ALPHA_MASQ;
+ mer_b >>= alpha_bits;
+ rcur_suffix.mer <<= alpha_bits;
+ }
+ // left-align the reversed mer and set its lowest bit
+ rcur_suffix.mer <<= dead_bits - alpha_bits;
+ rcur_suffix.mer |= 1;
+
+ //add the first mer, whichever orientation is smaller
+ if(cur_suffix.mer < rcur_suffix.mer)
+ sml_array.push_back(cur_suffix);
+ else
+ sml_array.push_back(rcur_suffix);
+
+ //fill sml_array with mers
+ // NOTE(review): the loop below runs endI - mer_size - 1 times.  That is sar_len - 1
+ // for linear sequences but sar_len + mer_size - 1 for circular ones, which is more
+ // than was reserved -- confirm this is intended.
+ gnSeqI endI = sar_len + mer_size;
+ if(seq.isCircular())
+ endI += mer_size;
+
+ // rdead_bits: distance from the newest forward character up to the top character of the word
+ // seqI / cur_bit: word index and bit offset of the first character after the first mer
+ // cur_seq: the 32-bit window of the encoded sequence currently being consumed
+ uint32 rdead_bits = 64 - alpha_bits - dead_bits;
+ uint64 tmp_rseq = 0;
+ uint32 seqI = (mer_size * alpha_bits) / 32;
+ int32 cur_bit = 32 - alpha_bits - ((mer_size * alpha_bits) % 32);
+ uint32 cur_seq = sequence[seqI];
+ uint64 tmp_seq;
+// uint32 alpha_mask = 0xFFFFFFFF;
+// alpha_mask >>= 32 - alpha_bits;
+ // mask selecting the newest (lowest used) character of a forward mer
+ uint64 revalpha_mask = OPT_ALPHA_MASQ;
+ revalpha_mask <<= dead_bits;
+
+ //which is slower? a memory operation or a conditional?
+ //probably a memory operation.
+ for(gnSeqI cur_pos = mer_size + 1; cur_pos < endI; cur_pos++){//already added the
+ //first one
+ //increment positions
+ cur_suffix.position++;
+ rcur_suffix.position++;
+ //extract the next character and move it to the lowest used position of a mer
+ tmp_seq = cur_seq;
+ tmp_seq >>= cur_bit;
+ tmp_seq &= OPT_ALPHA_MASQ;
+ tmp_seq <<= dead_bits;
+
+ //add it to the forward mer, dropping the oldest character off the top
+ cur_suffix.mer <<= alpha_bits;
+ cur_suffix.mer |= tmp_seq;
+
+ //do the reverse complement mer: the complemented character enters at the top
+ //and the oldest character falls off the bottom
+ tmp_seq = ~tmp_seq;
+ tmp_seq &= revalpha_mask;
+ tmp_rseq = tmp_seq;
+ tmp_rseq <<= rdead_bits;
+ rcur_suffix.mer >>= alpha_bits;
+ rcur_suffix.mer |= tmp_rseq;
+ rcur_suffix.mer &= create_mask;
+ rcur_suffix.mer |= 1;
+ if(cur_suffix.mer < rcur_suffix.mer)
+ sml_array.push_back(cur_suffix);
+ else
+ sml_array.push_back(rcur_suffix);
+
+ //advance to the next character; when the window is used up, pull in bits from the next word
+ // NOTE(review): sequence[seqI] is read here without a visible bounds check -- confirm against binary_seq_len
+ cur_bit -= alpha_bits;
+ if(cur_bit < 0){
+ cur_bit += alpha_bits;
+ cur_seq <<= 16; //trade bitwise ops for conditional
+ cur_seq <<= 16 - (cur_bit);
+ seqI++;
+ tmp_seq = sequence[seqI];
+ tmp_seq >>= cur_bit;
+ cur_seq |= tmp_seq;
+ cur_bit += 32 - alpha_bits;
+ }
+ }
+}
+
+
+/**
+ * Applies the seed pattern in header.seed to the mer at the given offset.
+ * The pattern is read from its bit (seed_length - 1) down to bit 0; for every set
+ * bit the corresponding character of the mer is appended to the result.  The
+ * selected characters are returned left-aligned in a 64-bit word.
+ * @param offset The position of the mer to apply the seed to
+ * @return The seed mer, or 0 when the seed has no length or no weight
+ */
+uint64 SortedMerList::GetSeedMer( gnSeqI offset ) const
+{
+    // an empty seed selects nothing; bail out before shifting by a full word below
+    if( header.seed_length == 0 || header.seed_weight == 0 )
+        return 0;
+
+    // shift that moves the highest-order character of a word down to the low bits
+    const int top_char_shift = 64 - OPT_HEADER_ALPHABET_BITS;
+    // number of characters that fit in one 64-bit mer
+    const uint32 mer_transition = 64 / OPT_HEADER_ALPHABET_BITS;
+
+    // mask covering the highest-order character of a word
+    uint64 alpha_mask = 1;
+    alpha_mask <<= OPT_HEADER_ALPHABET_BITS;
+    alpha_mask--;
+    alpha_mask <<= top_char_shift;
+
+    // selects the pattern bit that belongs to the current character
+    uint64 char_mask = 1;
+    char_mask <<= header.seed_length - 1;
+
+    uint64 cur_mer = SortedMerList::GetMer( offset );
+    uint64 cur_alpha_mask = alpha_mask;
+    int rshift_amt = top_char_shift;
+    uint64 seed_mer = 0;
+    for( uint32 patternI = 0; patternI < header.seed_length; patternI++ ){
+        if( patternI == mer_transition ){
+            // the first word is used up.  The second mer is only fetched here, so
+            // seeds that fit in one word never read past the mer at offset.
+            // NOTE(review): offset + 1 is kept from the original code; a word that
+            // continues the pattern would be expected to start mer_transition
+            // characters later -- confirm before relying on seeds this long.
+            cur_mer = SortedMerList::GetMer( offset + 1 );
+            cur_alpha_mask = alpha_mask;
+            rshift_amt = top_char_shift;
+        }
+        if( (header.seed & char_mask) != 0 ){
+            uint64 char_tmp = cur_mer & cur_alpha_mask;
+            char_tmp >>= rshift_amt;
+            seed_mer <<= OPT_HEADER_ALPHABET_BITS;
+            seed_mer |= char_tmp;
+        }
+        cur_alpha_mask >>= OPT_HEADER_ALPHABET_BITS;
+        char_mask >>= 1;
+        rshift_amt -= OPT_HEADER_ALPHABET_BITS;
+    }
+
+    // left-align the seed_weight selected characters
+    seed_mer <<= 64 - (OPT_HEADER_ALPHABET_BITS * header.seed_weight);
+    return seed_mer;
+}
+
+/**
+ * Returns the smaller of the seed mer at offset and its reverse complement,
+ * where the reverse complement is taken over header.seed_weight characters.
+ */
+uint64 SortedMerList::GetDnaSeedMer( gnSeqI offset ) const
+{
+    const uint64 forward_seed = SortedMerList::GetSeedMer( offset );
+    const uint64 reverse_seed = RevCompMer( forward_seed, header.seed_weight );
+    if( forward_seed < reverse_seed )
+        return forward_seed;
+    return reverse_seed;
+}
+
+/**
+ * Resizes sml_array to SMLLength() entries and stores in entry i the DNA seed mer
+ * at position i together with that position.  sml_array is left untouched when
+ * SMLLength() is 0.  The seq argument is not used.
+ */
+void SortedMerList::FillDnaSeedSML(const gnSequence& seq, vector<bmer>& sml_array){
+    const gnSeqI seed_count = SMLLength();
+    if( seed_count == 0 )
+        return; // can't have an sml if there ain't enough sequence
+    sml_array.resize(seed_count);
+
+    gnSeqI seedI = 0;
+    for( vector<bmer>::iterator entry = sml_array.begin(); entry != sml_array.end(); ++entry, ++seedI ){
+        entry->mer = GetDnaSeedMer( seedI );
+        entry->position = seedI;
+    }
+}
+
+
+/**
+ * Prepares this sorted mer list for the given sequence and seed pattern: validates
+ * the seed, records length, circularity and seed properties in the header, sets
+ * the mer and seed masks and stores the sequence with SetSequence().
+ * @param seq  The sequence to create an SML for
+ * @param seed The seed pattern
+ * @throws SMLCreateError if the alphabet or the seed is unusable
+ */
+void SortedMerList::Create(const gnSequence& seq, const uint64 seed){
+
+    if(CalculateMaxMerSize() == 0)
+        Throw_gnExMsg( SMLCreateError(), "Alphabet size is too large" );
+
+    const int seed_length = getSeedLength( seed );
+    const int seed_weight = getSeedWeight( seed );
+
+    if(seed_length > CalculateMaxMerSize())
+        Throw_gnExMsg( SMLCreateError(), "Mer size is too large" );
+
+    if(seed_length == 0)
+        Throw_gnExMsg( SMLCreateError(), "Can't have 0 seed length" );
+
+    // determine sequence length and circularity
+    gnSeqI seq_len = seq.length();
+    const bool is_circular = seq.isCircular() ? true : false;
+    header.circular = is_circular;
+
+    // Read the sequence into a buffer owned by an Array so it is always released.
+    // Circular sequences get room for the characters that wrap around the origin.
+    // NOTE(review): seed_length extra characters are allocated but only
+    // seed_length - 1 are copied, so the last one is not written here -- confirm
+    // SetSequence() tolerates that.
+    gnSeqI buf_len = seq_len;
+    if( is_circular )
+        buf_len += seed_length;
+    Array<gnSeqC> seq_buf( buf_len );
+    seq.ToArray(seq_buf.data, seq_len);
+    if( is_circular )
+        seq.ToArray(seq_buf.data + seq_len, seed_length-1);
+
+    // set header information
+    header.length = seq_len;
+    header.seed_length = seed_length;
+    header.seed_weight = seed_weight;
+    header.seed = seed;
+
+    // the seed mask covers seed_weight characters, the mer mask seed_length
+    SetMerMaskSize( seed_weight );
+    seed_mask = mer_mask;
+    SetMerMaskSize( seed_length );
+
+    SetSequence( seq_buf.data, buf_len );
+}
+
+} // namespace mems
diff --git a/libMems/SortedMerList.h b/libMems/SortedMerList.h
new file mode 100644
index 0000000..3caa91c
--- /dev/null
+++ b/libMems/SortedMerList.h
@@ -0,0 +1,323 @@
+/*******************************************************************************
+ * $Id: SortedMerList.h,v 1.13 2004/02/27 23:08:55 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifndef _SortedMerList_h_
+#define _SortedMerList_h_
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libGenome/gnDefs.h"
+#include "libGenome/gnClone.h"
+#include "libGenome/gnDebug.h"
+#include "libGenome/gnSequence.h"
+#include "libGenome/gnException.h"
+#include "stdlib.h"
+#include <string>
+#include <vector>
+#include "libMems/SeedMasks.h"
+
+namespace mems {
+
+#define DNA_ALPHA_BITS 2 /**< number of bits to represent each nucleotide of DNA */
+#define DNA_MER_SIZE 31 /**< largest possible number of characters in each dna mer ALWAYS ODD */
+
+#define PROTEIN_ALPHA_BITS 5 /**< number of bits to represent each amino acid */
+#define PROTEIN_MER_SIZE 12 /**< default number of characters in each protein mer */
+
+#define DESCRIPTION_SIZE 2048 /**< Number of bytes for the freeform text description of an SML */
+
+#define NO_UNIQUE_COUNT UINT32_MAX
+
+typedef int16 sarID_t;
+
+typedef uint32 smlSeqI_t;
+
+/**
+ * A binary encoded mer together with the sequence position it starts at.
+ * The members add up to 4 + 8 = 12 bytes; the original note here says the struct
+ * occupies 16 ("blame C alignment rules") -- presumably padding after 'position'.
+ */
+struct bmer{
+ smlSeqI_t position; /**< starting position of this mer in the sequence */
+ uint64 mer; /**< the actual binary encoded mer */
+};
+
+/**
+ * General information about a sorted mer list.
+ * NOTE(review): the field order and sizes presumably define an on-disk format --
+ * confirm before changing any member.
+ */
+struct SMLHeader{
+ uint32 version; /**< Format version - 4 bytes */
+ uint32 alphabet_bits; /**< Bits per character in the alphabet - 4 bytes */
+// uint32 mer_size; /**< Size of mers used for sorting the list - 4 bytes */
+ uint64 seed; /**< The pattern used in each seed */
+ uint32 seed_length; /**< The length of the seed mask */
+ uint32 seed_weight; /**< The weight of the seed mask */
+ uint64 length; /**< length of the sequence before circularity - 8 bytes */
+ uint32 unique_mers; /**< Number of unique mers in the sequence 4 bytes */
+ uint32 word_size; /**< Word size on the machine the sequence was translated */
+ boolean little_endian; /**< Is the byte order little endian? 0==no, !0==yes */
+ sarID_t id; /**< Obsolete ID value.  sarID_t is an int16, so this is 2 bytes rather than the 1 byte originally noted here */
+ boolean circular; /**< Circularity of sequence - 1 byte */
+ uint8 translation_table[UINT8_MAX]; /**< Translation table for ascii characters to binary values.  NOTE(review): holds UINT8_MAX entries; if UINT8_MAX is 255 that is one short of the 256 bytes originally noted here, so character code 255 has no entry -- confirm */
+ char description[DESCRIPTION_SIZE]; /**< Freeform text description of sequence data -- 2048 bytes */
+};
+
+
+/**
+ * A base class which defines an interface common to all sorted mer lists.
+ * It holds the header, the mer and seed masks and the binary encoded sequence;
+ * reading, merging and indexed access are left to derived classes.
+ */
+class SortedMerList : public genome::gnClone
+{
+public:
+ SortedMerList();
+ SortedMerList( const SortedMerList& sa );
+ SortedMerList& operator=(const SortedMerList& sa);
+ // NOTE(review): not declared virtual here -- confirm genome::gnClone provides a virtual destructor
+ ~SortedMerList();
+
+ /**
+ * Set data structures to default values
+ */
+ virtual void Clear();
+
+ /**
+ * Creates a new sorted mer list.
+ * The implementation in this base class validates the seed, fills in the header
+ * and stores the sequence; enumerating and sorting the mers is presumably done
+ * by derived classes.
+ * @param seq The sequence to create an SML for.
+ * @param seed The seed pattern to sort on.
+ * @throws SMLCreateError if the alphabet or the seed is unusable.
+ */
+ virtual void Create(const genome::gnSequence& seq, const uint64 seed);
+ /**
+ * Read a range of mers in the sorted mer list.
+ * This function reads a section of data from the sorted mer list starting at 'offset'
+ * and continuing for 'size' mers. The mers are placed into readVector. Anything
+ * already in readVector is cleared. Returns false if there was a problem completing the
+ * read. If the end of the list is reached, all mers which could be read will be placed
+ * into readVector and false will be returned
+ * @param readVector the vector to read bmers into.
+ * @param size The number of bmers to read.
+ * @param offset The mer index in the sorted mer list to start reading from.
+ * @return false if a problem was encountered while reading.
+ */
+ virtual boolean Read(std::vector<bmer>& readVector, gnSeqI size, gnSeqI offset) = 0;
+ /**
+ * Merges two SortedMerLists.
+ */
+ virtual void Merge(SortedMerList& sa, SortedMerList& sa2) = 0;
+
+ /**
+ * Get the mer at the specified index in the sorted mer list.
+ * @param index The index of the mer to return.
+ * @return The specified mer.
+ */
+ virtual bmer operator[](gnSeqI index) = 0;
+ /**
+ * Get the binary encoded mer at the specified position.
+ * NOTE(review): callers in SortedMerList.cpp pass sequence offsets here, so
+ * 'position' is presumably a sequence position rather than an index into the
+ * sorted list -- confirm.
+ * @param position The position of the mer to return.
+ * @return The specified mer.
+ */
+ virtual uint64 GetMer(gnSeqI position) const;
+ /**
+ * Searches the SML for a subsequence which matches the query string.
+ * Returns true if one is found, false otherwise.
+ * If no matching mer is found, 'result' contains the index that the query
+ * sequence would be in if it existed in the SML.
+ */
+ virtual boolean Find(const std::string& query_seq, gnSeqI& result);
+ /**
+ * Searches the SML for a mer which matches the query mer.
+ * Returns true if one is found, false otherwise.
+ * If no matching mer is found, 'result' contains the index that the query
+ * mer would be in if it existed in the SML.
+ */
+ virtual boolean FindMer(const uint64 query_mer, gnSeqI& result);
+ /**
+ * Searches the SML for mers which match the query mer.
+ * Puts the indices of all matching mers into the 'result' vector
+ * NOTE(review): 'result' is passed by value, so the caller's vector is not
+ * modified -- confirm whether a reference was intended.
+ */
+ virtual void FindAll(const std::string& query_seq, std::vector<gnSeqI> result);
+ /**
+ * Returns the number of unique mers in the sequence
+ */
+ virtual gnSeqI UniqueMerCount();
+
+ /**
+ * Returns a freeform text description of the SML.
+ */
+ virtual std::string Description() const;
+ /**
+ * Sets the freeform text description of the SML.
+ */
+ virtual void SetDescription(const std::string& d);
+ /**
+ * Returns the length of the seed pattern that this SML was sorted on.
+ */
+ virtual uint SeedLength() const;
+ /**
+ * Returns the weight of the seed that this SML was sorted on.
+ */
+ virtual uint SeedWeight() const;
+ /**
+ * Returns the seed pattern that this SML was sorted on.
+ */
+ virtual uint64 Seed() const;
+ /**
+ * Returns the length of the mer mask.
+ * Some types of sorted mer list support a configurable mer mask size, allowing
+ * the same sorted mer list to behave as though it were sorted on a shorter mer size.
+ * DNA sorted mer lists do not support this feature.
+ */
+ virtual uint32 GetMerMaskSize() const;
+ /**
+ * Sets the length of the mer mask.
+ * Some types of sorted mer list support a configurable mer mask size, allowing
+ * the same sorted mer list to behave as though it were sorted on a shorter mer size.
+ * DNA sorted mer lists do not support this feature.
+ */
+ virtual void SetMerMaskSize(uint32 mer_size);
+ /**
+ * Returns the length of the sequence encoded in this sorted mer list.
+ */
+ gnSeqI Length() const;
+ /**
+ * Returns the length of the sorted mer list itself. This value will be less
+ * than the sequence length if the sequence isn't circular
+ */
+ gnSeqI SMLLength() const;
+ /**
+ * Returns the obsolete ID value stored in the header.  Ignore this.
+ */
+ virtual sarID_t GetID() const;
+ /**
+ * Sets the obsolete ID value stored in the header.  Ignore this.
+ */
+ virtual void SetID(const sarID_t d);
+ /**
+ * Returns true if this SML is circular. False otherwise.
+ */
+ virtual boolean IsCircular() const;
+ /**
+ * Returns a mask which can be bitwise AND'ed to a mer in order to
+ * get only the relevant bits of sequence data without direction bits.
+ */
+ virtual uint64 GetMerMask() const;
+ /**
+ * Returns a mask which can be bitwise AND'ed to a seed mer in order to
+ * get only the relevant bits of sequence data without direction bits.
+ */
+ virtual uint64 GetSeedMask() const;
+ /**
+ * Returns a copy of the header information for this SML.
+ */
+ virtual SMLHeader GetHeader() const;
+ /**
+ * Returns a translation table for DNA sequence which disambiguates each nucleotide.
+ */
+ static const uint8* BasicDNATable();
+ /**
+ * Returns a translation table for Protein sequence.
+ */
+ static const uint8* ProteinTable();
+ /**
+ * Places a copy of the binary encoded sequence data into dest.
+ * @param dest The buffer to copy the encoded sequence into
+ * @param len The length in sequence characters to copy
+ * @param offset The sequence offset to start copying from
+ * @throws IndexOutOfBounds if offset or len are invalid
+ */
+ virtual void GetBSequence(uint32* dest, const gnSeqI len, const gnSeqI offset);
+
+ /**
+ * Returns the reverse complement of a mer
+ * @param mer_a The left-aligned binary encoded mer
+ * @param mer_length The number of characters in mer_a
+ */
+ virtual uint64 RevCompMer( uint64 mer_a, int mer_length ) const;
+ /**
+ * Applies the seed mask to the sequence at the given offset and returns the resulting
+ * seed.
+ */
+ virtual uint64 GetSeedMer( gnSeqI offset ) const;
+ /**
+ * Returns the lesser of the forward and reverse complement seeds at the given offset.
+ * Note: The seed pattern should be palindromic, otherwise the returned rev. complement
+ * match will be under a different pattern.
+ */
+ virtual uint64 GetDnaSeedMer( gnSeqI offset ) const;
+
+ /**
+ * Fills sml_array with one bmer for each of the SMLLength() positions, holding
+ * the value of GetDnaSeedMer() at that position and the position itself.
+ * @param seq Not used by the implementation in this base class
+ * @param sml_array Receives the unsorted seed mers
+ */
+ virtual void FillDnaSeedSML(const genome::gnSequence& seq, std::vector<bmer>& sml_array);
+
+protected:
+ struct SMLHeader header; /**< stores general information about this sorted mer list */
+ uint64 mer_mask; /**< a mask for the used bits in a mer */
+ uint64 seed_mask; /**< a mask covering only the number of characters a seed covers */
+ uint32 mask_size; /**< the number of characters covered by the mask */
+ uint32 *sequence; /**< Stores the sequence data */
+ gnSeqI binary_seq_len; /**< Stores the length in 32 bit words of the sequence */
+
+ /** Set the sequence data to the seq_len characters in seq_buf */
+ virtual void SetSequence(gnSeqC* seq_buf, gnSeqI seq_len);
+ /** Fill in the vector of bmers with the initial unsorted bmers for the sequence in seq_buf */
+ virtual void FillSML(gnSeqC* seq_buf, gnSeqI seq_len, boolean circular, std::vector<bmer>& sml_array);
+ /** Fill in the vector of bmers with the initial unsorted bmers for the sequence in seq */
+ virtual void FillSML(const genome::gnSequence& seq, std::vector<bmer>& sml_array);
+ /** Fill in the vector of bmers with the lesser of the forward and reverse complement mer at each position */
+ virtual void FillDnaSML(const genome::gnSequence& seq, std::vector<bmer>& sml_array);
+ /** Fill in the vector of positions with the initial unsorted positions 0 .. seq_len - 1 */
+ virtual void FillSML(gnSeqI seq_len, std::vector<gnSeqI>& sml_array);
+ /** Returns the lesser of the forward and reverse complement mers at the given offset */
+ virtual uint64 GetDnaMer(gnSeqI offset) const;
+
+ virtual gnSeqI bsearch(const struct bmer& query_mer, const gnSeqI start, const gnSeqI end);
+ virtual void translate(uint8* dest, const gnSeqC* src, const gnSeqI len) const;
+ virtual void translate32(uint32* dest, const gnSeqC* src, const gnSeqI len) const;
+ /**
+ * Shifts an entire array of words left or right by a few bits
+ * @param data A pointer to the array of words
+ * @param length The number of words in the array
+ * @param bits The number of bits to shift by. A positive number shifts right and a negative number shifts left.
+ */
+ virtual void ShiftWords(uint32* data, uint32 length, int32 bits);
+ virtual uint32 CalculateMaxMerSize() const;
+
+ static const uint8* CreateBasicDNATable();
+ static const uint8* CreateProteinTable();
+};
+
+/**
+ * Thrown when there is an error creating a sorted mer list.
+ */
+CREATE_EXCEPTION(SMLCreateError);
+
+/**
+ * Thrown when there is an error merging two sorted mer lists.
+ */
+CREATE_EXCEPTION(SMLMergeError);
+
+/**
+ * Comparison functor which orders sequence positions by the mer that
+ * SortedMerList::GetMer() returns for them.
+ */
+class MerCompare {
+public:
+    MerCompare( SortedMerList* sa ) : sar( sa ) {}
+    /** Returns true when the mer at position a is smaller than the mer at position b */
+    boolean operator()(const gnSeqI a, const gnSeqI b) const{
+        const uint64 mer_of_a = sar->GetMer(a);
+        const uint64 mer_of_b = sar->GetMer(b);
+        return mer_of_a < mer_of_b;
+    }
+protected:
+    SortedMerList* sar;  /**< the list the mers are looked up in; not owned */
+};
+
+bool bmer_lessthan(const bmer& a_v, const bmer& m_v);
+bool bmer_id_lessthan(const bmer& a_v, const bmer& m_v);
+
+int bmer_compare(const void* a_v, const void* m_v);
+bool bmer_id_lessthan(const bmer& a_v, const bmer& m_v);
+
+/** Less than function for STL sort functions: orders bmers by their encoded mer value */
+inline
+bool bmer_lessthan(const bmer& a_v, const bmer& m_v){
+    return m_v.mer > a_v.mer;
+}
+
+/**
+ * Three-way comparison of two bmers by their encoded mer value, with the
+ * signature expected by qsort()-style functions.
+ * The mers are compared as unsigned 64-bit values, the same order that
+ * bmer_lessthan() uses.  The result is computed by comparison rather than by
+ * subtraction: a 64-bit difference truncated to int can have the wrong sign, or
+ * be 0 for mers that differ only in their high-order bits.
+ * @param a_v Pointer to the first bmer
+ * @param m_v Pointer to the second bmer
+ * @return -1, 0 or 1 when the first mer is less than, equal to or greater than the second
+ */
+inline
+int bmer_compare(const void* a_v, const void* m_v){
+    const uint64 a_mer = static_cast<const bmer*>(a_v)->mer;
+    const uint64 m_mer = static_cast<const bmer*>(m_v)->mer;
+    if( a_mer < m_mer )
+        return -1;
+    if( a_mer > m_mer )
+        return 1;
+    return 0;
+}
+
+}
+
+#endif //_SortedMerList_h_
diff --git a/libMems/SparseAbstractMatch.h b/libMems/SparseAbstractMatch.h
new file mode 100644
index 0000000..e42844c
--- /dev/null
+++ b/libMems/SparseAbstractMatch.h
@@ -0,0 +1,250 @@
+/*******************************************************************************
+ * $Id: SparseAbstractMatch.h,v 1.8 2004/02/27 23:08:55 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifndef __SparseAbstractMatch_h__
+#define __SparseAbstractMatch_h__
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libGenome/gnClone.h"
+#include "libGenome/gnDefs.h"
+#include "libMems/AbstractMatch.h"
+#include <vector>
+#include <limits>
+
+namespace mems {
+
+//template< class gnSeqIAlloc=boost::pool_allocator<gnSeqI>, class uintAlloc=boost::pool_allocator<uint> >
+/**
+ * The SparseAbstractMatch implements the AbstractMatch interface in a way
+ * that allows matches with a large SeqCount and low Multiplicity to be stored efficiently.
+ * Only the sequences that take part in the match are stored: seq_ids, leftend and
+ * orient are parallel containers with one slot per participating sequence.
+ */
+template< class gnSeqIAlloc=std::allocator<gnSeqI>, class uintAlloc=std::allocator<uint> >
+class SparseAbstractMatch : public AbstractMatch {
+public:
+ SparseAbstractMatch() : m_seq_count(0) {}
+ /**
+ * Creates a new SparseAbstractMatch.
+ * @param seq_count The total number of sequences in the alignment
+ */
+ SparseAbstractMatch(const uint seq_count );
+
+ // use compiler-generated copy constructor, assignment operator, and destructor
+
+ // see AbstractMatch base class documentation for these functions
+
+ int64 Start(uint seqI) const;
+ void SetStart(uint seqI, int64 startI);
+ /** The number of sequences stored in this match */
+ uint Multiplicity() const{return (uint)seq_ids.size();}
+ /** The total number of sequences given at construction */
+ uint SeqCount() const{return m_seq_count;}
+ uint FirstStart() const;
+ virtual void Invert();
+
+ gnSeqI LeftEnd(uint seqI) const;
+ orientation Orientation(uint seqI) const;
+ void SetLeftEnd(uint seqI, gnSeqI position);
+ void SetOrientation(uint seqI, orientation o);
+
+ // these functions manipulate the start coordinates quickly
+ virtual void MoveStart(int64 move_amount);
+ virtual void MoveEnd(int64 move_amount);
+
+ virtual boolean operator==( const SparseAbstractMatch& sam ) const;
+
+ virtual uint UsedSeq( uint seqI ) const;
+protected:
+
+ std::vector<uint, uintAlloc > seq_ids; // sequence id held in each slot
+ uint m_seq_count; // total number of sequences in the alignment
+ std::vector<gnSeqI, gnSeqIAlloc > leftend; // left end coordinate held in each slot
+ bitset_t orient; // bitset_t has its own allocator; a set bit marks a slot in reverse orientation
+ // returns the slot holding seqI, or seq_ids.size() when seqI is not stored
+ uint SeqToIndex( uint seqI ) const;
+
+ // for use by derived classes in order to swap contents
+ void swap( SparseAbstractMatch* other );
+};
+
+
+/** Creates an empty match over an alignment of seq_count sequences */
+template< class gnSeqIAlloc, class uintAlloc >
+SparseAbstractMatch< gnSeqIAlloc, uintAlloc >::SparseAbstractMatch(const uint seq_count )
+    : m_seq_count(seq_count)
+{
+}
+
+/** Exchanges every data member of this match with the corresponding member of other */
+template< class gnSeqIAlloc, class uintAlloc >
+void SparseAbstractMatch< gnSeqIAlloc, uintAlloc >::swap( SparseAbstractMatch* other )
+{
+    // the per-slot containers
+    std::swap(seq_ids, other->seq_ids);
+    std::swap(leftend, other->leftend);
+    std::swap(orient, other->orient);
+    // the alignment-wide sequence count
+    std::swap(m_seq_count, other->m_seq_count);
+}
+
+/**
+ * Returns the smallest sequence id stored in this match, or the largest uint
+ * value when the match is empty.
+ */
+template< class gnSeqIAlloc, class uintAlloc >
+uint SparseAbstractMatch< gnSeqIAlloc, uintAlloc >::FirstStart() const
+{
+    uint smallest_id = (std::numeric_limits<uint>::max)();
+    typename std::vector<uint, uintAlloc >::const_iterator id_iter = seq_ids.begin();
+    for( ; id_iter != seq_ids.end(); ++id_iter ){
+        if( *id_iter < smallest_id )
+            smallest_id = *id_iter;
+    }
+    return smallest_id;
+}
+
+/**
+ * Returns the slot in which sequence seqI is stored, or seq_ids.size() when
+ * the sequence is not part of this match.
+ */
+template< class gnSeqIAlloc, class uintAlloc >
+uint SparseAbstractMatch< gnSeqIAlloc, uintAlloc >::SeqToIndex( uint seqI ) const
+{
+    for( uint slotI = 0; slotI < seq_ids.size(); ++slotI ){
+        if( seq_ids[slotI] == seqI )
+            return slotI;
+    }
+    // not found: report one past the last slot
+    return (uint)seq_ids.size();
+}
+
+
+/**
+ * Returns the signed start of the match in sequence seqI: the stored left end,
+ * negated when the sequence is in reverse orientation.  Returns NO_MATCH when
+ * the sequence is not part of this match.
+ */
+template< class gnSeqIAlloc, class uintAlloc >
+int64 SparseAbstractMatch< gnSeqIAlloc, uintAlloc >::Start(uint seqI) const
+{
+    const uint slotI = SeqToIndex( seqI );
+    if( slotI < seq_ids.size() ){
+        const int64 magnitude = leftend[slotI];
+        if( orient.test(slotI) )
+            return -magnitude;
+        return magnitude;
+    }
+    return NO_MATCH;
+}
+
+
+/**
+ * Sets the signed start of the match in sequence seqI.  The absolute value is
+ * stored as the left end and a negative value marks reverse orientation.
+ * Passing NO_MATCH removes the sequence from the match.
+ */
+template< class gnSeqIAlloc, class uintAlloc >
+void SparseAbstractMatch< gnSeqIAlloc, uintAlloc >::SetStart(uint seqI, int64 startI)
+{
+    const uint slotI = SeqToIndex( seqI );
+    const bool is_stored = slotI < seq_ids.size();
+
+    if( startI == NO_MATCH )
+    {
+        if( !is_stored )
+            return;    // nothing to remove
+        seq_ids.erase( seq_ids.begin() + slotI );
+        leftend.erase( leftend.begin() + slotI );
+        // close the gap in the orientation bits by moving the later bits down one slot
+        for( size_t bitI = slotI; bitI + 1 < orient.size(); ++bitI )
+            orient.set( bitI, orient.test( bitI + 1 ) );
+        orient.resize( orient.size()-1 );
+        return;
+    }
+
+    if( is_stored )
+    {
+        // overwrite the existing slot
+        leftend[slotI] = genome::absolut(startI);
+        orient.set(slotI, startI < 0);
+    }else{
+        // append a new slot for this sequence
+        seq_ids.push_back(seqI);
+        leftend.push_back(genome::absolut(startI));
+        orient.resize( orient.size() + 1, (startI < 0) );
+    }
+}
+
+
+/** Reverses the orientation of every sequence stored in this match by flipping all orientation bits */
+template< class gnSeqIAlloc, class uintAlloc >
+void SparseAbstractMatch< gnSeqIAlloc, uintAlloc >::Invert()
+{
+ orient.flip();
+}
+
+
+
+/** Returns the left end stored for sequence seqI, or 0 when the sequence is not part of this match */
+template< class gnSeqIAlloc, class uintAlloc >
+gnSeqI SparseAbstractMatch< gnSeqIAlloc, uintAlloc >::LeftEnd(uint seqI) const
+{
+    const uint slotI = SeqToIndex( seqI );
+    if( slotI < leftend.size() )
+        return leftend[slotI];
+    return 0;
+}
+
+
+/**
+ * Returns the orientation stored for sequence seqI, or undefined when the
+ * sequence is not part of this match or its left end is NO_MATCH.
+ */
+template< class gnSeqIAlloc, class uintAlloc >
+AbstractMatch::orientation SparseAbstractMatch< gnSeqIAlloc, uintAlloc >::Orientation(uint seqI) const
+{
+    const uint slotI = SeqToIndex( seqI );
+    if( slotI >= leftend.size() || leftend[slotI] == NO_MATCH )
+        return undefined;
+    return orient.test(slotI) ? reverse : forward;
+}
+
+
+/**
+ * Sets the left end of the match in sequence seqI.  A sequence that is not yet
+ * part of the match is added in forward orientation.  Passing NO_MATCH removes
+ * the sequence from the match.
+ */
+template< class gnSeqIAlloc, class uintAlloc >
+void SparseAbstractMatch< gnSeqIAlloc, uintAlloc >::SetLeftEnd(uint seqI, gnSeqI position)
+{
+    const uint slotI = SeqToIndex( seqI );
+    const bool removing = (position == NO_MATCH);
+
+    if( removing && slotI >= seq_ids.size() )
+        return;    // nothing to remove
+
+    if( slotI >= leftend.size() )
+    {
+        // append a new slot for this sequence
+        seq_ids.push_back(seqI);
+        leftend.push_back(position);
+        orient.resize( orient.size() + 1 ); // defaults to false
+        leftend[slotI]=position;
+        return;
+    }
+
+    if( removing )
+    {
+        seq_ids.erase( seq_ids.begin() + slotI );
+        leftend.erase( leftend.begin() + slotI );
+        // close the gap in the orientation bits by moving the later bits down one slot
+        for( size_t bitI = slotI; bitI + 1 < orient.size(); ++bitI )
+            orient.set( bitI, orient.test( bitI + 1 ) );
+        orient.resize( orient.size()-1 );
+        return;
+    }
+
+    // overwrite the existing slot
+    leftend[slotI]=position;
+}
+
+/**
+ * Sets the orientation of sequence seqI, which must already be part of this match.
+ * @throws a C string when the sequence has no slot in this match
+ */
+template< class gnSeqIAlloc, class uintAlloc >
+void SparseAbstractMatch< gnSeqIAlloc, uintAlloc >::SetOrientation(uint seqI, orientation o)
+{
+    const uint slotI = SeqToIndex( seqI );
+    if( slotI < orient.size() ){
+        orient.set(slotI, o == reverse);
+        return;
+    }
+    // the sequence is not stored in this match
+    throw "ArrayIndexOutOfBounds!\n";
+}
+
+/**
+ * Adds move_amount to the left end of every stored sequence that is in forward
+ * orientation and whose left end is not NO_MATCH.
+ */
+template< class gnSeqIAlloc, class uintAlloc >
+void SparseAbstractMatch< gnSeqIAlloc, uintAlloc >::MoveStart(int64 move_amount)
+{
+    for( uint slotI = 0; slotI < leftend.size(); ++slotI ){
+        if( orient.test(slotI) )
+            continue;    // reverse oriented slots are handled by MoveEnd
+        if( leftend[slotI] == NO_MATCH )
+            continue;
+        leftend[slotI] += move_amount;
+    }
+}
+
+/**
+ * Adds move_amount to the left end of every stored sequence that is in reverse
+ * orientation and whose left end is not NO_MATCH.
+ */
+template< class gnSeqIAlloc, class uintAlloc >
+void SparseAbstractMatch< gnSeqIAlloc, uintAlloc >::MoveEnd(int64 move_amount)
+{
+    for( uint slotI = 0; slotI < leftend.size(); ++slotI ){
+        if( !orient.test(slotI) )
+            continue;    // forward oriented slots are handled by MoveStart
+        if( leftend[slotI] == NO_MATCH )
+            continue;
+        leftend[slotI] += move_amount;
+    }
+}
+
+/**
+ * Returns true when both matches store the same sequences with the same left
+ * ends and, for non-zero left ends, the same orientations.
+ * Slots are paired up by sequence id rather than by slot number, because the
+ * slot a sequence occupies depends on the order in which sequences were added.
+ * Matches that store a different number of sequences are never equal; this also
+ * keeps the comparison from reading past the end of the shorter match.
+ */
+template< class gnSeqIAlloc, class uintAlloc >
+boolean SparseAbstractMatch< gnSeqIAlloc, uintAlloc >::operator==( const SparseAbstractMatch< gnSeqIAlloc, uintAlloc >& sam ) const
+{
+    if( seq_ids.size() != sam.seq_ids.size() )
+        return false;
+
+    for( uint slotI = 0; slotI < seq_ids.size(); ++slotI ){
+        // find the slot that holds the same sequence in the other match
+        const uint other_slotI = sam.SeqToIndex( seq_ids[slotI] );
+        if( other_slotI >= sam.seq_ids.size() )
+            return false;    // the other match does not contain this sequence
+        if( leftend[slotI] != sam.leftend[other_slotI] )
+            return false;
+        // orientation is only meaningful when there is a left end
+        if( leftend[slotI] != 0 && orient.test(slotI) != sam.orient.test(other_slotI) )
+            return false;
+    }
+    return true;
+}
+
+/**
+ * Returns the slot number of the (seqI+1)-th slot with a non-zero left end, or
+ * the largest uint value when there are not that many.
+ * NOTE(review): the value returned is a slot number, not the sequence id stored
+ * in seq_ids for that slot -- confirm this is what callers expect.
+ */
+template< class gnSeqIAlloc, class uintAlloc >
+uint SparseAbstractMatch< gnSeqIAlloc, uintAlloc >::UsedSeq( uint seqI ) const
+{
+    uint used_seen = 0;
+    for( uint slotI = 0; slotI < leftend.size(); slotI++ )
+    {
+        if( leftend[slotI] == 0 )
+            continue;
+        ++used_seen;
+        if( used_seen > seqI )
+            return slotI;
+    }
+    return (std::numeric_limits<uint>::max)();
+}
+
+}
+
+#endif // __SparseAbstractMatch_h__
diff --git a/libMems/SubstitutionMatrix.h b/libMems/SubstitutionMatrix.h
new file mode 100644
index 0000000..07c5cd2
--- /dev/null
+++ b/libMems/SubstitutionMatrix.h
@@ -0,0 +1,111 @@
+/*******************************************************************************
+ * $Id: SubstitutionMatrix.h,v 1.7 2004/03/01 02:40:08 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifndef __SubstitutionMatrix_h__
+#define __SubstitutionMatrix_h__
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libGenome/gnSequence.h"
+#include <iostream>
+#include <sstream>
+
+namespace mems {
+
+typedef int score_t;
+static const score_t hoxd_matrix[4][4] =
+{
+ {91, -114, -31, -123}, // A
+
+ {-114, 100, -125, -31}, // C
+
+ {-31, -125, 100, -114}, // G
+
+ {-123, -31, -114, 91}, // T
+};
+
+static const score_t default_gap_open = -400;
+static const score_t default_gap_extend = -30;
+
+/**
+ * A nucleotide substitution matrix together with affine gap penalties.
+ */
+class PairwiseScoringScheme
+{
+public:
+    score_t matrix[4][4];  /**< 4x4 nucleotide substitution matrix */
+    score_t gap_open;      /**< gap open penalty */
+    score_t gap_extend;    /**< gap extend penalty */
+
+    /** Creates a scoring scheme from a copy of the given matrix and gap penalties */
+    PairwiseScoringScheme( const score_t matrix[4][4], score_t gap_open, score_t gap_extend ) :
+        gap_open( gap_open ),
+        gap_extend( gap_extend )
+    {
+        setMatrix(matrix);
+    }
+
+    /** Creates the default scheme: the HOXD matrix with the default gap penalties */
+    PairwiseScoringScheme() :
+        gap_open( default_gap_open ),
+        gap_extend( default_gap_extend )
+    {
+        setMatrix( hoxd_matrix );
+    }
+
+    /** Copies the matrix and gap penalties of pss */
+    PairwiseScoringScheme& operator=( const PairwiseScoringScheme& pss )
+    {
+        setMatrix(pss.matrix);
+        gap_open = pss.gap_open;
+        gap_extend = pss.gap_extend;
+        return *this;
+    }
+
+    /** Overwrites the substitution matrix with a copy of the given 4x4 matrix */
+    void setMatrix( const score_t matrix[4][4] )
+    {
+        for( int rowI = 0; rowI < 4; ++rowI ){
+            const score_t* src_row = matrix[rowI];
+            score_t* dest_row = this->matrix[rowI];
+            for( int colI = 0; colI < 4; ++colI )
+                dest_row[colI] = src_row[colI];
+        }
+    }
+};
+
+/**
+ * Returns a reference to a scoring scheme holding the HOXD matrix and the
+ * default gap penalties.  The object is created on first use.
+ * NOTE(review): the function is static in a header, so each source file that
+ * includes it presumably gets its own object -- confirm nothing relies on
+ * changes made through the reference being visible elsewhere.
+ */
+static PairwiseScoringScheme& getDefaultScoringScheme()
+{
+    // the default constructor selects the HOXD matrix and default gap penalties
+    static PairwiseScoringScheme default_scheme;
+    return default_scheme;
+}
+
+void readSubstitutionMatrix( std::istream& is, score_t matrix[4][4] );
+
+/**
+ * Reads a 4x4 nucleotide substitution matrix from a stream.
+ * The expected layout is a header line, a line with the column labels
+ * "A C G T N", and four rows that each hold a row label, the four scores for
+ * A, C, G and T, and a score for N which is ignored.
+ * @param is     The stream to read from
+ * @param matrix Receives the scores.  It is only modified when the whole matrix
+ *               was read successfully.
+ * @throws a C string when the column labels are wrong or a score can't be read
+ */
+inline
+void readSubstitutionMatrix( std::istream& is, score_t matrix[4][4] )
+{
+    static const char* const format_error = "Invalid substitution matrix format\n";
+    static const char* const column_labels[] = { "A", "C", "G", "T", "N" };
+
+    std::string tmp;
+    std::getline( is, tmp );    // first line contains header info
+    std::getline( is, tmp );    // second line contains sub mat column labels
+    std::stringstream ss( tmp );
+    std::string letter;
+    bool format_ok = true;
+    for( int labelI = 0; labelI < 5; labelI++ )
+    {
+        if( !(ss >> letter) || letter != column_labels[labelI] )
+            format_ok = false;
+    }
+
+    // read into a scratch matrix so a malformed file can't leave the caller's
+    // matrix partially filled with garbage
+    score_t parsed[4][4];
+    for( int i = 0; format_ok && i < 4; i++ )
+    {
+        is >> letter;    // the first character on each line should be a letter
+        for( int j = 0; j < 4; j++ )
+            is >> parsed[i][j];
+        // a missing or non-numeric score leaves the stream in a failed state
+        if( is.fail() )
+            format_ok = false;
+        else
+            is >> letter;    // this should be the N sub score (which gets ignored)
+    }
+
+    if( !format_ok )
+    {
+        std::cerr << format_error;
+        throw format_error;
+    }
+
+    for( int i = 0; i < 4; i++ )
+        for( int j = 0; j < 4; j++ )
+            matrix[i][j] = parsed[i][j];
+}
+
+}
+
+#endif // __SubstitutionMatrix_h__
diff --git a/libMems/SuperInterval.cpp b/libMems/SuperInterval.cpp
new file mode 100644
index 0000000..d2d5577
--- /dev/null
+++ b/libMems/SuperInterval.cpp
@@ -0,0 +1,124 @@
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+
+#include "libMems/SuperInterval.h"
+
+using namespace std;
+using namespace genome;
+
+namespace mems {
+// working in mems
+
+bool debug_aligner = false;
+
+/**
+ * Creates an empty SuperInterval: zero length, left end 0, and all three
+ * link indices set to the largest size_t value.
+ */
+SuperInterval::SuperInterval() :
+    // listed in member declaration order, which is the order initialization really runs in
+    c1_siv((std::numeric_limits<size_t>::max)()),
+    c2_siv((std::numeric_limits<size_t>::max)()),
+    parent_siv((std::numeric_limits<size_t>::max)()),
+    left_end(0),
+    length(0)
+{}
+
+/**
+ * Creates a SuperInterval holding a copy of reference_iv.
+ * Length and left end start at 0 (they are not derived from the interval)
+ * and the three link indices are set to the largest size_t value.
+ */
+SuperInterval::SuperInterval( const Interval& reference_iv ) :
+    // listed in member declaration order, which is the order initialization really runs in
+    reference_iv(reference_iv),
+    c1_siv((std::numeric_limits<size_t>::max)()),
+    c2_siv((std::numeric_limits<size_t>::max)()),
+    parent_siv((std::numeric_limits<size_t>::max)()),
+    left_end(0),
+    length(0)
+{
+}
+
+/** Copy constructor: member-wise copy of siv. */
+SuperInterval::SuperInterval(const SuperInterval& siv) :
+    // listed in member declaration order, which is the order initialization really runs in
+    reference_iv( siv.reference_iv ),
+    c1_siv(siv.c1_siv),
+    c2_siv(siv.c2_siv),
+    parent_siv(siv.parent_siv),
+    left_end(siv.left_end),
+    length( siv.length )
+{
+}
+/** Member-wise assignment from siv; returns *this. */
+SuperInterval& SuperInterval::operator=(const SuperInterval& siv)
+{
+    // alignment payload
+    reference_iv = siv.reference_iv;
+    // links to neighbouring SuperIntervals
+    c1_siv = siv.c1_siv;
+    c2_siv = siv.c2_siv;
+    parent_siv = siv.parent_siv;
+    // coordinates
+    left_end = siv.left_end;
+    length = siv.length;
+    return *this;
+}
+
+
+
+/**
+ * Stores len as the length of this SuperInterval.
+ * Only the length member changes; reference_iv is left untouched.
+ */
+void SuperInterval::SetLength( gnSeqI len )
+{
+    this->length = len;
+}
+
+/**
+ * Crops `amount` from the start of reference_iv, then advances left_end and
+ * shrinks length by the same amount. Runs ValidateSelf() when debug_aligner is set.
+ */
+void SuperInterval::CropLeft( gnSeqI amount )
+{
+    // crop the alignment first so the coordinates only change once that call has returned
+    reference_iv.CropStart( amount );
+
+    left_end += amount;
+    length -= amount;
+
+    if( debug_aligner )
+    {
+        ValidateSelf();
+    }
+}
+
+/**
+ * Crops `amount` from the end of reference_iv and shrinks length by the
+ * same amount; left_end is unchanged. Runs ValidateSelf() when debug_aligner is set.
+ */
+void SuperInterval::CropRight( gnSeqI amount )
+{
+    // crop the alignment first so length only changes once that call has returned
+    reference_iv.CropEnd( amount );
+    length -= amount;
+
+    if( debug_aligner )
+    {
+        ValidateSelf();
+    }
+}
+
+void SuperInterval::ValidateSelf() const // debug consistency check of reference_iv against Length(); reports on cerr and calls breakHere(), never throws itself
+{
+ vector< bitset_t > aln_mat;
+ reference_iv.GetAlignment(aln_mat);
+ if( aln_mat[0].size() != reference_iv.AlignmentLength() ) // NOTE(review): indexes row 0 without checking that aln_mat is non-empty
+ {
+ breakHere();
+ cerr << "trouble! aln_mat[0].size() is: " << aln_mat[0].size() << " while reference_iv.AlignmentLength() is: " << reference_iv.AlignmentLength() << endl;
+ cerr << "mult: " << reference_iv.Multiplicity() << endl;
+ cerr << "matches.size(): " << reference_iv.GetMatches().size() << endl;
+ }
+ for( size_t i = 0; i < aln_mat.size(); i++ ) // per row: the number of set bits must equal reference_iv.Length(i)
+ {
+ gnSeqI lenny = 0; // count of set bits in row i
+ for( size_t j = 0; j < aln_mat[i].size(); j++ )
+ if( aln_mat[i][j] )
+ lenny++;
+ if( lenny != reference_iv.Length(i) )
+ {
+ cerr << "krudunkle, ref_iv.Length(" << i << "): " << reference_iv.Length(i) << "\n";
+ cerr << "should be: " << lenny << endl;
+ breakHere();
+ }
+ }
+ if( reference_iv.LeftEnd(0) != NO_MATCH && reference_iv.Length(0) == 0 ) // sequence 0 has a position but zero length
+ {
+ cerr << "brozooka\n";
+ breakHere();
+ }
+ if( reference_iv.LeftEnd(1) != NO_MATCH && reference_iv.Length(1) == 0 ) // same check for sequence 1; NOTE(review): assumes at least two sequences - confirm
+ {
+ cerr << "brokazooka\n";
+ breakHere();
+ }
+// the stored length must agree with the alignment length of reference_iv
+ if( Length() != reference_iv.AlignmentLength() )
+ {
+ breakHere(); // here, as in the first check, the hook is called before the message is printed
+ cerr << "crapola\n";
+ }
+}
+
+} // namespace mems
diff --git a/libMems/SuperInterval.h b/libMems/SuperInterval.h
new file mode 100644
index 0000000..fefcc07
--- /dev/null
+++ b/libMems/SuperInterval.h
@@ -0,0 +1,81 @@
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#ifndef __SuperInterval_h__
+#define __SuperInterval_h__
+
+#include "libMems/Interval.h"
+
+namespace mems {
+
+/**
+ * A class that stores an alignment and coordinate mapping between collinear segments of an ancestral genome and two
+ * descendant genomes.
+ */
+class SuperInterval
+{
+public:
+
+    /** Creates an empty SuperInterval (see SuperInterval.cpp). */
+    SuperInterval();
+    /**
+     * Creates a new SuperInterval holding a copy of reference_iv.
+     */
+    SuperInterval( const mems::Interval& reference_iv );
+    SuperInterval(const SuperInterval& siv);
+    SuperInterval& operator=(const SuperInterval& siv);
+    /**
+     * Virtual because the class already exposes virtual accessors: deleting a
+     * derived object through a SuperInterval pointer is only defined when the
+     * destructor is virtual.
+     */
+    virtual ~SuperInterval(){}
+
+    /** Returns the length */
+    virtual gnSeqI Length() const { return length; }
+
+    /** Sets the length to @param len */
+    virtual void SetLength( gnSeqI len );
+
+    /** Returns the left end coordinate */
+    virtual int64 LeftEnd() const { return left_end; }
+
+    /** Sets the left end coordinate; the parameter shadows the member, hence this-> */
+    virtual void SetLeftEnd( const int64& left_end ) { this->left_end = left_end; }
+
+    /** the alignment carried by this SuperInterval */
+    mems::Interval reference_iv;
+
+    /** the index of the SuperInterval this is aligned to in c1 */
+    size_t c1_siv;
+    /** the index of the SuperInterval this is aligned to in c2 */
+    size_t c2_siv;
+    /** the index of the SuperInterval this is aligned to in the parent */
+    size_t parent_siv;
+
+    /** Crops `amount` from the start of reference_iv and adjusts left_end and length */
+    void CropLeft( gnSeqI amount );
+    /** Crops `amount` from the end of reference_iv and adjusts length */
+    void CropRight( gnSeqI amount );
+
+    /** Orders SuperIntervals by left end only */
+    bool operator<( const SuperInterval& si ) const{ return left_end < si.left_end; }
+
+    /** Debug consistency check; see SuperInterval.cpp */
+    void ValidateSelf() const;
+
+    /** Exchanges every member with other */
+    void swap( SuperInterval& other )
+    {
+        reference_iv.swap(other.reference_iv);
+        std::swap(c1_siv, other.c1_siv);
+        std::swap(c2_siv, other.c2_siv);
+        std::swap(parent_siv, other.parent_siv);
+        std::swap(left_end, other.left_end);
+        std::swap(length, other.length);
+    }
+
+protected:
+    int64 left_end;
+    int64 length;    // stored as int64 although Length()/SetLength() traffic in gnSeqI
+};
+
+
+} // namespace mems
+
+namespace std {
+/** Specialization that routes std::swap on SuperInterval through its member swap(). */
+template<>
+inline void swap( mems::SuperInterval& lhs, mems::SuperInterval& rhs )
+{
+    lhs.swap( rhs );
+}
+} // namespace std
+
+#endif //__SuperInterval_h__
diff --git a/libMems/TreeUtilities.h b/libMems/TreeUtilities.h
new file mode 100644
index 0000000..26148ca
--- /dev/null
+++ b/libMems/TreeUtilities.h
@@ -0,0 +1,138 @@
+#ifndef __TreeUtilities_h__
+#define __TreeUtilities_h__
+
+#include <stack>
+
+namespace mems {
+
+/**
+ * Removes every element that compares equal to `item` from `container`,
+ * keeping the remaining elements in their original order.
+ *
+ * The survivors are collected in a fresh container first and only then
+ * exchanged with `container`. That ordering matters: `item` may be a
+ * reference to an element of `container` itself (rerootTree calls it that
+ * way), so `container` must not be modified while `item` is still being read.
+ *
+ * @param container  sequence supporting begin()/end(), push_back() and swap()
+ * @param item       value to remove; compared with operator!=
+ */
+template<class T, class S>
+void findAndErase( T& container, S& item )
+{
+    T kept;
+    for( typename T::iterator t_iter = container.begin(); t_iter != container.end(); ++t_iter )
+        if( *t_iter != item )
+            kept.push_back( *t_iter );
+    // swap instead of assignment: no second copy of the surviving elements
+    container.swap( kept );
+}
+
+/**
+ * Depth first search to check whether a subtree contains a given node
+ */
+/**
+ * Tests whether query_nodeI lies in the subtree rooted at subtree_nodeI
+ * (the subtree root itself counts). Iterative traversal over the
+ * `children` lists using an explicit stack.
+ */
+template<class Tree>
+bool containsNode( Tree& t, node_id_t subtree_nodeI, node_id_t query_nodeI )
+{
+    std::stack< node_id_t > pending;
+    pending.push( subtree_nodeI );
+    while( !pending.empty() )
+    {
+        const node_id_t visiting = pending.top();
+        pending.pop();
+        if( visiting == query_nodeI )
+            return true;
+        // a leaf has an empty children list, so this loop simply does not run for it
+        for( size_t childI = 0; childI < t[visiting].children.size(); childI++ )
+            pending.push( t[visiting].children[childI] );
+    }
+    return false;
+}
+
+
+/** Re-roots the tree t on the internal node new_root by flipping parent/child links in place.
+ * Throws a C string if new_root is a leaf; does nothing if new_root already is the root. */
+template<class Tree>
+void rerootTree( Tree& t, node_id_t new_root )
+{
+ // new root must be an internal node
+ if( t[new_root].children.size() == 0 )
+ throw "Can't root on a leaf node";
+ if( new_root == t.root )
+ return; // already rooted here, nothing to do
+
+ // change the old root node to an internal node
+ uint childI = 0;
+ for( ; childI < t[t.root].children.size(); childI++ ){
+ if( containsNode( t, t[t.root].children[childI], new_root ) )
+ {
+ t[t.root].parents.push_back( t[t.root].children[childI] ); // the child leading towards new_root becomes the old root's parent
+ findAndErase( t[t.root].children, t[t.root].children[childI] ); // note: the item argument is an element of the container being edited
+ break;
+ }
+ }
+ // shake the tree out on the new root node
+ t.root = new_root;
+ t[t.root].children.insert( t[t.root].children.end(), t[t.root].parents.begin(), t[t.root].parents.end() ); // former parents of the new root become its children
+ t[t.root].parents.clear();
+
+ std::stack<node_id_t> node_stack;
+ node_stack.push(t.root);
+ while( node_stack.size() > 0 )
+ {
+ // delete the current node from each child's children and parents lists,
+ // fold the child's remaining parents into its children,
+ // make the current node the child's only parent,
+ // then process each child in turn
+ node_id_t cur_node = node_stack.top();
+ node_stack.pop();
+ for( uint childI = 0; childI < t[cur_node].children.size(); childI++ ) // this childI shadows the outer one declared above
+ {
+ findAndErase( t[t[cur_node].children[childI]].children, cur_node );
+ findAndErase( t[t[cur_node].children[childI]].parents, cur_node );
+ t[t[cur_node].children[childI]].children.insert( t[t[cur_node].children[childI]].children.end(), t[t[cur_node].children[childI]].parents.begin(), t[t[cur_node].children[childI]].parents.end() );
+ t[t[cur_node].children[childI]].parents.clear();
+ t[t[cur_node].children[childI]].parents.push_back(cur_node);
+ node_stack.push(t[cur_node].children[childI]);
+ }
+ }
+}
+
+/**
+ * takes a rooted tree and moves the root onto the branch joining left_node and right_node; returns without changes if either node is the root or the two are not adjacent
+ */
+template<class Tree>
+void moveRootToBranch( Tree& t, node_id_t left_node, node_id_t right_node )
+{
+ // this function has no effect if left_node or right_node are already the root
+ if( left_node == t.root || right_node == t.root )
+ return;
+ // left_node and right_node must be adjacent
+ if( (t[left_node].parents.size() == 0 || t[right_node].parents.size() == 0 ) ||
+ (t[left_node].parents[0] != right_node && t[right_node].parents[0] != left_node ) )
+ return;
+
+ if( t[left_node].children.size() == 0 )
+ swap( left_node, right_node ); // left node was a leaf so root on right node; NOTE(review): unqualified swap, relies on a suitable swap being visible at this point - confirm
+
+ // save the root
+ node_id_t old_root = t.root;
+ // reroot the tree on left_node, then move the old root on the branch leading to right_node
+ rerootTree( t, left_node );
+ // remove old_root: its children are handed to its parent rp and inherit its branch length
+ node_id_t rp = t[old_root].parents[0];
+ findAndErase( t[rp].children, old_root );
+ for( size_t cI = 0; cI < t[old_root].children.size(); cI++ )
+ {
+ t[t[old_root].children[cI]].parents[0] = rp;
+ t[t[old_root].children[cI]].distance += t[old_root].distance;
+ t[rp].children.push_back( t[old_root].children[cI] );
+ }
+ t[old_root].children.clear();
+
+ // link old_root in between left_node and right_node
+ findAndErase( t[left_node].children, right_node );
+ t[left_node].children.push_back( old_root );
+ t[old_root].parents[0] = left_node;
+ t[right_node].parents[0] = old_root;
+ t[old_root].children.push_back( right_node );
+ t[old_root].distance = t[right_node].distance / 2.0; // the branch length is split evenly between the two halves
+ t[right_node].distance /= 2.0;
+
+ // finally reroot on old_root
+ rerootTree( t, old_root );
+}
+
+
+} // namespace mems
+
+#endif // __TreeUtilities_h__
diff --git a/libMems/UngappedLocalAlignment.h b/libMems/UngappedLocalAlignment.h
new file mode 100644
index 0000000..3bea0d2
--- /dev/null
+++ b/libMems/UngappedLocalAlignment.h
@@ -0,0 +1,227 @@
+/*******************************************************************************
+ * $Id: UngappedLocalAlignment.h,v 1.10 2004/03/01 02:40:08 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifndef __UngappedLocalAlignment_h__
+#define __UngappedLocalAlignment_h__
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libGenome/gnClone.h"
+#include "libGenome/gnException.h"
+#include "libMems/AbstractMatch.h"
+
+namespace mems {
+
+/**
+ * The UngappedLocalAlignment class stores the location of an <b>equal size</b> (inexact or exactly)
+ * matching region between several sequences. This class can use one of several storage schemes
+ * such as DenseAbstractMatch or SparseAbstractMatch
+ */
+template< class AbstractMatchImpl >
+class UngappedLocalAlignment : public AbstractMatchImpl
+{
+
+public:
+    UngappedLocalAlignment();
+    /**
+     * Creates a new UngappedLocalAlignment.
+     * @param seq_count The total number of sequences in the alignment
+     */
+    UngappedLocalAlignment( const uint seq_count );
+
+    // use trivial copy constructor, destructor, and operator =
+
+    /** Returns a heap copy made with `new`. */
+    UngappedLocalAlignment* Clone() const;
+    /** Returns a copy obtained from m_allocateAndCopy(); the counterpart of Free(). */
+    UngappedLocalAlignment* Copy() const;
+    /** Hands this object to m_free(). */
+    virtual void Free();
+
+    /** comparison operator, compares two UngappedLocalAlignments to see if they are the same */
+    boolean operator==(const UngappedLocalAlignment& mhe) const;
+
+    /**
+     * Returns the match length. With no argument (seqI left at its default,
+     * the largest uint) this is the common length; for a specific sequence it
+     * is 0 when that sequence has no position (LeftEnd == NO_MATCH) and the
+     * common length otherwise.
+     */
+    gnSeqI Length( uint seqI = (std::numeric_limits<uint>::max)() ) const
+    {
+        if( seqI == (std::numeric_limits<uint>::max)() )
+            return m_length;
+        if( this->LeftEnd(seqI) == NO_MATCH )
+            return 0;
+        return m_length;
+    }
+    /** Sets the common length; seqI is accepted but not used. */
+    void SetLength(gnSeqI len, uint seqI = 0){m_length = len;}
+    /** The alignment has no gap columns, so its length equals the match length. */
+    gnSeqI AlignmentLength() const{return m_length;}
+
+    //warning: none of the following do bounds checking.
+    virtual void Move( int64 distance );
+    virtual void CropStart(gnSeqI crop_amount);
+    virtual void CropEnd(gnSeqI crop_amount);
+    virtual void ExtendStart(gnSeqI extend_amount);
+    virtual void ExtendEnd(gnSeqI extend_amount);
+
+    virtual void CropLeft(gnSeqI crop_amount, uint seqI);
+    virtual void CropRight(gnSeqI crop_amount, uint seqI);
+
+    /** Fills align_matrix with one row per sequence; a row is all ones when the sequence is present and all zeros otherwise. */
+    void GetAlignment( std::vector< bitset_t >& align_matrix ) const;
+
+    /** Reports, per sequence, the position found at alignment column col and whether the sequence takes part in it. */
+    void GetColumn( gnSeqI col, std::vector<gnSeqI>& pos, std::vector<bool>& column ) const;
+
+    /**
+     * Writes the location of this UngappedLocalAlignment to the specified output stream (e.g. cout).
+     */
+    template<typename AMImpl> friend std::ostream& operator<<(std::ostream& os, const UngappedLocalAlignment<AMImpl>& ula); //write to source.
+
+    /**
+     * Returns true when sequence seqI has no residue at column col: either the
+     * sequence is absent (LeftEnd == NO_MATCH) or col lies past the end of the
+     * alignment. The previous body returned the exact opposite, i.e. true for
+     * a present sequence at an in-range column, which contradicts both the
+     * function name and the bits GetAlignment() sets for present sequences.
+     */
+    bool IsGap( uint seqI, gnSeqI col ) const {
+        return (this->LeftEnd(seqI) == NO_MATCH || col >= m_length);
+    }
+
+protected:
+
+    gnSeqI m_length;    // common length of the match in every participating sequence
+};
+
+
+/** Default constructor: default-constructs the base and starts with a length of 0. */
+template< class AbstractMatchImpl >
+UngappedLocalAlignment< AbstractMatchImpl >::UngappedLocalAlignment() :
+    AbstractMatchImpl(),
+    m_length( 0 )    // previously left uninitialized, so Length() read an indeterminate value until SetLength()
+{
+}
+
+
+/**
+ * Creates an alignment over seq_count sequences with a length of 0.
+ * @param seq_count  forwarded to the AbstractMatchImpl constructor
+ */
+template< class AbstractMatchImpl >
+UngappedLocalAlignment< AbstractMatchImpl >::UngappedLocalAlignment(uint seq_count)
+    : AbstractMatchImpl( seq_count ),
+    m_length( 0 )    // previously left uninitialized, so Length() read an indeterminate value until SetLength()
+{
+}
+
+
+/** Returns a copy of this object allocated with `new`. */
+template< class AbstractMatchImpl >
+UngappedLocalAlignment< AbstractMatchImpl >*
+UngappedLocalAlignment< AbstractMatchImpl >::Clone() const
+{
+    UngappedLocalAlignment* duplicate = new UngappedLocalAlignment( *this );
+    return duplicate;
+}
+
+/** Returns a copy of this object obtained from m_allocateAndCopy(). */
+template< class AbstractMatchImpl >
+UngappedLocalAlignment<AbstractMatchImpl>*
+UngappedLocalAlignment<AbstractMatchImpl>::Copy() const
+{
+    UngappedLocalAlignment<AbstractMatchImpl>* duplicate = m_allocateAndCopy( *this );
+    return duplicate;
+}
+/** Passes this object to m_free(); the object must not be used afterwards if m_free() releases it. */
+template< class AbstractMatchImpl >
+void
+UngappedLocalAlignment<AbstractMatchImpl>::Free()
+{
+    m_free( this );
+}
+
+/** Two alignments are equal when their lengths match and the base class comparison agrees. */
+template< class AbstractMatchImpl >
+boolean UngappedLocalAlignment<AbstractMatchImpl>::operator==(const UngappedLocalAlignment& ula) const
+{
+    // the base comparison is only consulted when the lengths already agree
+    if( m_length == ula.m_length )
+        return AbstractMatchImpl::operator==(ula);
+    return false;
+}
+
+/** Adds distance to the start of every sequence whose start is not NO_MATCH. */
+template< class AbstractMatchImpl >
+void UngappedLocalAlignment<AbstractMatchImpl>::Move( int64 distance )
+{
+    for( uint32 seqI = 0; seqI < AbstractMatchImpl::SeqCount(); seqI++ ){
+        const int64 cur_start = AbstractMatchImpl::Start(seqI);
+        if( cur_start == NO_MATCH )
+            continue;    // sequence has no position, leave it alone
+        AbstractMatchImpl::SetStart( seqI, cur_start + distance );
+    }
+}
+
+/**
+ * Shortens the match by crop_amount at its start.
+ * Raises genome::SeqIndexOutOfBounds through Throw_gnEx when crop_amount exceeds the current length.
+ */
+template< class AbstractMatchImpl >
+void UngappedLocalAlignment<AbstractMatchImpl>::CropStart(gnSeqI crop_amount)
+{
+    if( m_length < crop_amount )
+        Throw_gnEx( genome::SeqIndexOutOfBounds() );
+    m_length = m_length - crop_amount;
+    AbstractMatchImpl::MoveStart( crop_amount );
+}
+
+/**
+ * Shortens the match by crop_amount at its end.
+ * Raises genome::SeqIndexOutOfBounds through Throw_gnEx when crop_amount exceeds the current length.
+ */
+template< class AbstractMatchImpl >
+void UngappedLocalAlignment<AbstractMatchImpl>::CropEnd(gnSeqI crop_amount)
+{
+    if( m_length < crop_amount )
+        Throw_gnEx( genome::SeqIndexOutOfBounds() );
+    m_length = m_length - crop_amount;
+    AbstractMatchImpl::MoveEnd( crop_amount );
+}
+
+/** Lengthens the match by extend_amount at its start: the length grows and MoveStart() receives the negated amount. */
+template< class AbstractMatchImpl >
+void UngappedLocalAlignment<AbstractMatchImpl>::ExtendStart(gnSeqI extend_amount)
+{
+    // go through a signed copy so the negation is well defined
+    const int64 signed_amount = extend_amount;
+    m_length += extend_amount;
+    AbstractMatchImpl::MoveStart( -signed_amount );
+}
+
+/** Lengthens the match by extend_amount at its end: the length grows and MoveEnd() receives the negated amount. */
+template< class AbstractMatchImpl >
+void UngappedLocalAlignment<AbstractMatchImpl>::ExtendEnd(gnSeqI extend_amount)
+{
+    // go through a signed copy so the negation is well defined
+    const int64 signed_amount = extend_amount;
+    m_length += extend_amount;
+    AbstractMatchImpl::MoveEnd( -signed_amount );
+}
+
+/**
+ * Crops at the left end as seen from sequence seqI: that is the start of
+ * the match when seqI is in forward orientation, and its end otherwise.
+ */
+template< class AbstractMatchImpl >
+void UngappedLocalAlignment<AbstractMatchImpl>::CropLeft(gnSeqI crop_amount, uint seqI)
+{
+    if( AbstractMatchImpl::Orientation(seqI) != AbstractMatch::forward )
+    {
+        CropEnd( crop_amount );
+        return;
+    }
+    CropStart( crop_amount );
+}
+
+/**
+ * Crops at the right end as seen from sequence seqI: that is the end of
+ * the match when seqI is in forward orientation, and its start otherwise.
+ */
+template< class AbstractMatchImpl >
+void UngappedLocalAlignment<AbstractMatchImpl>::CropRight(gnSeqI crop_amount, uint seqI)
+{
+    if( AbstractMatchImpl::Orientation(seqI) != AbstractMatch::forward )
+    {
+        CropStart( crop_amount );
+        return;
+    }
+    CropEnd( crop_amount );
+}
+
+/**
+ * Replaces align_matrix with SeqCount() rows of AlignmentLength() bits.
+ * Rows of sequences that have a position are all ones, the others all zeros.
+ */
+template< class AbstractMatchImpl >
+void UngappedLocalAlignment< AbstractMatchImpl >::GetAlignment( std::vector< bitset_t >& align_matrix ) const
+{
+    const bitset_t empty_row( this->AlignmentLength(), false );
+    align_matrix = std::vector< bitset_t >( this->SeqCount(), empty_row );
+    for( uint seqI = 0; seqI < this->SeqCount(); seqI++ )
+    {
+        if( this->LeftEnd(seqI) == NO_MATCH )
+            continue;    // absent sequence keeps its all-zero row
+        align_matrix[seqI].flip();    // zeros become ones across the whole row
+    }
+}
+
+template< typename AbstractMatchImpl >
+std::ostream& operator<<(std::ostream& os, const UngappedLocalAlignment< AbstractMatchImpl >& ula);
+
+/** Writes the length followed by one tab-separated start value per sequence. */
+template< typename AbstractMatchImpl >
+std::ostream& operator<<(std::ostream& os, const UngappedLocalAlignment< AbstractMatchImpl >& ula)
+{
+    os << ula.m_length;
+    uint seqI = 0;
+    while( seqI < ula.SeqCount() )
+    {
+        os << '\t' << ula.Start(seqI);
+        ++seqI;
+    }
+    return os;
+}
+
+
+/**
+ * Describes alignment column col.
+ * @param col     column index inside the alignment
+ * @param pos     receives, per sequence, LeftEnd + col (forward) or RightEnd - col (reverse); NO_MATCH otherwise
+ * @param column  receives, per sequence, true for forward/reverse sequences and false for all others
+ */
+template< class AbstractMatchImpl >
+void UngappedLocalAlignment< AbstractMatchImpl >::GetColumn( gnSeqI col, std::vector<gnSeqI>& pos, std::vector<bool>& column ) const
+{
+    pos = std::vector<gnSeqI>(this->SeqCount(), NO_MATCH);
+    column = std::vector<bool>(this->SeqCount(), true);
+    for( uint seqI = 0; seqI < this->SeqCount(); seqI++ )
+    {
+        if( this->Orientation(seqI) == AbstractMatch::reverse )
+        {
+            pos[seqI] = this->RightEnd(seqI) - col;
+        }
+        else if( this->Orientation(seqI) == AbstractMatch::forward )
+        {
+            pos[seqI] = this->LeftEnd(seqI) + col;
+        }
+        else
+        {
+            column[seqI] = false;    // neither forward nor reverse: not part of this column
+        }
+    }
+}
+
+}
+
+#endif // __UngappedLocalAlignment_h__
diff --git a/libMems/configuration.h b/libMems/configuration.h
new file mode 100644
index 0000000..15928b7
--- /dev/null
+++ b/libMems/configuration.h
@@ -0,0 +1,37 @@
+#ifndef __libMems_configuration_h__
+#define __libMems_configuration_h__
+// On Windows builds this header names the mems import library through #pragma comment(lib, ...); on other platforms it is empty.
+#if defined(WIN32)||defined(WIN64)
+// Library naming as used below: "mems" + "64" for WIN64 + "fd" for FASTDEBUG + "omp" for _OPENMP.
+// set the mems library name to include based on the configuration...
+
+#if defined(WIN64)&&defined(NDEBUG)&&!defined(FASTDEBUG)&&defined(_OPENMP)
+#pragma comment(lib, "mems64omp.lib")
+#endif
+#if defined(WIN64)&&defined(FASTDEBUG)&&defined(_OPENMP)
+#pragma comment(lib, "mems64fdomp.lib")
+#endif
+#if defined(WIN32)&&!defined(WIN64)&&defined(NDEBUG)&&!defined(FASTDEBUG)&&defined(_OPENMP)
+#pragma comment(lib, "memsomp.lib")
+#endif
+#if defined(WIN32)&&!defined(WIN64)&&defined(FASTDEBUG)&&defined(_OPENMP)
+#pragma comment(lib, "memsfdomp.lib")
+#endif
+#if defined(WIN64)&&defined(NDEBUG)&&!defined(FASTDEBUG)&&!defined(_OPENMP)
+#pragma comment(lib, "mems64.lib")
+#endif
+#if defined(WIN64)&&defined(FASTDEBUG)&&!defined(_OPENMP)
+#pragma comment(lib, "mems64fd.lib")
+#endif
+#if defined(WIN32)&&!defined(WIN64)&&defined(NDEBUG)&&!defined(FASTDEBUG)&&!defined(_OPENMP)
+#pragma comment(lib, "mems.lib")
+#endif
+#if defined(WIN32)&&!defined(WIN64)&&defined(FASTDEBUG)&&!defined(_OPENMP)
+#pragma comment(lib, "memsfd.lib")
+#endif
+// Note: when neither NDEBUG nor FASTDEBUG is defined, none of the conditions above holds and no library is named.
+
+#endif
+
+#endif // __libMems_configuration_h__
+
diff --git a/libMems/dmSML/Makefile.am b/libMems/dmSML/Makefile.am
new file mode 100644
index 0000000..7df4a8c
--- /dev/null
+++ b/libMems/dmSML/Makefile.am
@@ -0,0 +1,22 @@
+# USE_POSIX_AIO is the macro that enables the body of aPOSIXaio.c.
+AM_CFLAGS = -DUSE_POSIX_AIO
+
+DMSML_H = \
+asyncio.h alinuxaio.h aPOSIXaio.h \
+alibc.h awin32aio.h buffer.h \
+util.h sorting.h dmsort.h \
+timing.h sml.h
+
+DMSML_SRC = \
+asyncio.c alinuxaio.c aPOSIXaio.c \
+alibc.c awin32aio.c buffer.c \
+util.c sorting.c dmsort.c \
+timing.c sml.c
+
+library_includedir=$(includedir)/$(GENERIC_LIBRARY_NAME)-$(GENERIC_API_VERSION)/$(GENERIC_LIBRARY_NAME)/dmSML
+
+library_include_HEADERS = $(DMSML_H)
+
+noinst_LTLIBRARIES = libdmSML.la
+libdmSML_la_SOURCES = $(DMSML_SRC)
+
+# AM_CPPFLAGS is the current name of the deprecated INCLUDES variable.
+AM_CPPFLAGS = -I$(top_srcdir) $(DEPS_CFLAGS)
diff --git a/libMems/dmSML/aPOSIXaio.c b/libMems/dmSML/aPOSIXaio.c
new file mode 100644
index 0000000..9d5851c
--- /dev/null
+++ b/libMems/dmSML/aPOSIXaio.c
@@ -0,0 +1,124 @@
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/dmSML/aPOSIXaio.h"
+#ifdef USE_POSIX_AIO
+
+#include "libMems/dmSML/asyncio.h"
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+
+/*
+ * Opens `path` and stores the descriptor in file->file_descriptor.
+ * A_READ opens read-only; any other mode opens read/write, creating the
+ * file if needed and truncating it. O_LARGEFILE is added when the platform
+ * defines it. On failure the error is reported with perror(path).
+ * Returns 1 on success, 0 on failure.
+ */
+int OpenPAIO( aFILE * file, const char *path, int mode ){
+    int flags = 0;
+#ifdef O_LARGEFILE
+    flags |= O_LARGEFILE;
+#endif
+    if( mode == A_READ )
+        flags |= O_RDONLY;
+    else
+        flags |= O_RDWR | O_CREAT | O_TRUNC;
+
+    file->file_descriptor = open( path, flags, S_IREAD | S_IWRITE | S_IRGRP | S_IWGRP );
+    if( file->file_descriptor < 0 )
+        perror( path );
+    return file->file_descriptor >= 0;
+}
+
+/* Closes file->file_descriptor. Returns 1 when close() succeeds, 0 otherwise. */
+int ClosePAIO( aFILE * file ){
+    int close_status = close( file->file_descriptor );
+    return close_status == 0;
+}
+
+/*
+ * Allocates rec->aio_cb and fills it in for a request on `file`.
+ * When rec->pos is not CURRENT_POS, the position is first split into
+ * file->filep_high / file->filep_low; the request offset is then rebuilt
+ * from those two fields. The request covers rec->size * rec->count bytes
+ * of rec->buf. The caller sets aio_lio_opcode afterwards.
+ * Returns 1 on success, 0 when the allocation fails.
+ */
+int FillAIOStruct( aFILE * file, aIORec * rec ){
+    rec->aio_cb = (aiocb_t*) malloc( sizeof(aiocb_t));
+    if(rec->aio_cb == 0)
+        return 0;
+
+    /* Zero the whole control block, as the Linux counterpart of this function
+     * does; previously only aio_sigevent was cleared, leaving every field not
+     * assigned below with indeterminate contents. */
+    memset( rec->aio_cb, 0, sizeof(aiocb_t) );
+
+    if( rec->pos != CURRENT_POS ){
+        offset_t tmppos = rec->pos;
+        tmppos >>= 32;
+        file->filep_high = tmppos;
+        // clear high bits. Is this really necessary?
+        tmppos = rec->pos;
+        tmppos <<= 32;
+        tmppos >>= 32;
+        file->filep_low = tmppos;
+    }
+
+    rec->aio_cb->aio_fildes = file->file_descriptor;
+    rec->aio_cb->aio_offset = file->filep_high;
+    rec->aio_cb->aio_offset <<= 32;
+    rec->aio_cb->aio_offset |= file->filep_low;
+    rec->aio_cb->aio_buf = rec->buf;
+    rec->aio_cb->aio_nbytes = rec->size * rec->count;
+    rec->aio_cb->aio_reqprio = 0;
+    return 1;
+}
+
+/*
+ * Prepares rec with FillAIOStruct() and submits it with aio_write().
+ * Returns 1 when the request was accepted, 0 when the control block could
+ * not be prepared or aio_write() reported an error (which is printed).
+ */
+int WritePAIO( aFILE * file, aIORec * rec ){
+    int submit_status;
+
+    if( !FillAIOStruct( file, rec ) )
+        return 0;
+
+    // request the io
+    rec->aio_cb->aio_lio_opcode = LIO_WRITE;
+    submit_status = aio_write( rec->aio_cb );
+    if( submit_status == -1 ){
+        perror("write");
+        printf( "aiocb->aio_reqprio = %d\n", rec->aio_cb->aio_reqprio );
+    }
+    return submit_status == 0;
+}
+
+/*
+ * Prepares rec with FillAIOStruct() and submits it with aio_read().
+ * Returns 1 when the request was accepted, 0 when the control block could
+ * not be prepared or aio_read() reported an error (which is printed).
+ */
+int ReadPAIO( aFILE * file, aIORec * rec ){
+    int req_error;
+    // fill the request data structure
+    if( FillAIOStruct( file, rec ) ){
+        // request the io
+        rec->aio_cb->aio_lio_opcode = LIO_READ;
+        req_error = aio_read(rec->aio_cb);
+        if(req_error == -1){
+            // was perror("write"): a copy/paste slip that mislabelled read failures
+            perror("read");
+            printf( "aiocb->aio_reqprio = %d\n", rec->aio_cb->aio_reqprio );
+        }
+        return req_error == 0;
+    }
+    return 0;
+}
+
+// PRECONDITION: file->queuetail is not null
+// Polls, without waiting, the request stored at file->queuetail.
+// Returns 1 when aio_suspend() returns 0 for it, and 0 in every other case
+// (errors from aio_suspend are not reported).
+int QueryLastCompletePAIO( aFILE * file ){
+    struct aiocb *pending[] = { file->queuetail->aio_cb };
+    struct timespec no_wait;
+
+    // a zero timeout turns aio_suspend into a poll
+    no_wait.tv_sec = 0;
+    no_wait.tv_nsec = 0;
+
+    if( aio_suspend( pending, 1, &no_wait ) == 0 )
+        return 1;
+    return 0;
+}
+
+#endif /* USE_POSIX_AIO */
diff --git a/libMems/dmSML/aPOSIXaio.h b/libMems/dmSML/aPOSIXaio.h
new file mode 100644
index 0000000..410eb8b
--- /dev/null
+++ b/libMems/dmSML/aPOSIXaio.h
@@ -0,0 +1,18 @@
+#ifndef _aPOSIXaio_h_
+#define _aPOSIXaio_h_
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/dmSML/asyncio.h"
+
+int OpenPAIO( aFILE * file, const char *path, int mode );
+int ClosePAIO( aFILE * file );
+
+int WritePAIO( aFILE * file, aIORec * rec );
+int ReadPAIO( aFILE * file, aIORec * rec );
+
+int QueryLastCompletePAIO( aFILE * file );
+
+#endif /* _aPOSIXaio_h_ */
diff --git a/libMems/dmSML/alibc.c b/libMems/dmSML/alibc.c
new file mode 100644
index 0000000..b14bf7a
--- /dev/null
+++ b/libMems/dmSML/alibc.c
@@ -0,0 +1,47 @@
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/dmSML/asyncio.h"
+#include "libMems/dmSML/alibc.h"
+
+#if defined USE_LIBC
+
+/*
+ * Opens `path` with fopen(), in "rb" mode for A_READ and "wb" mode otherwise,
+ * and stores the stream in file->libchandle (NULL on failure).
+ * Returns 1 on success, 0 on failure.
+ */
+int OpenLibC( aFILE * file, const char *path, int mode ) {
+    const char *fopen_mode = ( mode == A_READ ) ? "rb" : "wb";
+    file->libchandle = fopen( path, fopen_mode );
+    return file->libchandle != NULL;
+}
+
+
+/*
+ * Closes file->libchandle.
+ * Returns 1 when fclose() succeeds and 0 when it fails. The result used to
+ * be ignored, which hid errors from flushing buffered writes at close time.
+ */
+int CloseLibC( aFILE * file ) {
+    return( fclose( file->libchandle ) == 0 );
+}
+
+
+/*
+ * Writes rec->count items of rec->size bytes from rec->buf to file->libchandle.
+ * Returns 1 when every item was written and 0 on a short write; the fwrite()
+ * result used to be ignored, so a failed write still reported success.
+ */
+int WriteLibC( aFILE * file, aIORec * rec ) {
+    size_t written;
+    // fwrite returns 0 for an empty request, which is not a failure
+    if( rec->size == 0 || rec->count == 0 ) {
+        return( 1 );
+    }
+    written = fwrite( rec->buf, rec->size, rec->count, file->libchandle );
+    return( written == (size_t) rec->count );
+}
+
+/*
+ * Reads up to rec->count items of rec->size bytes from file->libchandle
+ * into rec->buf.
+ * Returns 0 when the stream reports an error afterwards, 1 otherwise.
+ * A short read at end of file still counts as success, as before.
+ */
+int ReadLibC( aFILE * file, aIORec * rec ) {
+    fread( rec->buf, rec->size, rec->count, file->libchandle );
+    return( ferror( file->libchandle ) == 0 );
+}
+
+
+int OperationCompleteLibC( aFILE * file ) { // always reports completion; `file` is not used
+ // libc operations are atomic
+ return( 1 );
+}
+
+int FileBusyLibC( aFILE * file ) { // always returns 1; `file` is not used
+ // libc operations are atomic
+ return( 1 ); // NOTE(review): if 1 means "busy" this contradicts the comment above - confirm the intended value with callers
+}
+
+#endif /* USE_LIBC */
diff --git a/libMems/dmSML/alibc.h b/libMems/dmSML/alibc.h
new file mode 100644
index 0000000..e9e626e
--- /dev/null
+++ b/libMems/dmSML/alibc.h
@@ -0,0 +1,15 @@
+#ifndef _alibc_h_
+#define _alibc_h_
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+int OpenLibC( aFILE * file, const char *path, int mode );
+int CloseLibC( aFILE * file );
+int WriteLibC( aFILE * file, aIORec * rec );
+int ReadLibC( aFILE * file, aIORec * rec );
+
+/* Line ending test modification... */
+
+#endif /* _alibc_h_ */
diff --git a/libMems/dmSML/alinuxaio.c b/libMems/dmSML/alinuxaio.c
new file mode 100644
index 0000000..15aee68
--- /dev/null
+++ b/libMems/dmSML/alinuxaio.c
@@ -0,0 +1,283 @@
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/dmSML/alinuxaio.h"
+#ifdef USE_LINUX_AIO
+
+#include <libaio.h>
+
+#include "libMems/dmSML/asyncio.h"
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+/*
+#define __NR_io_setup 245
+#define __NR_io_destroy 246
+#define __NR_io_getevents 247
+#define __NR_io_submit 248
+#define __NR_io_cancel 249
+*/
+
+io_context_t ctx_id = NULL;
+
+#ifndef __u64
+typedef unsigned long long __u64;
+#endif
+
+__u64 current_id = 0;
+
+unsigned event_max = 10000;
+
+// error = sys_io_destroy( ctx_id );
+
+
+typedef struct completion_id_s { // one recorded completion; node of a circular doubly linked list
+ __u64 data; // filled from io_event.data in QueryLastCompleteLinux
+ struct completion_id_s* next; // following node (the head's successor is the second item)
+ struct completion_id_s* last; // preceding node (the head's predecessor is the tail)
+} completion_id_t;
+
+typedef struct completion_id_list_s { // list header
+ int nitems; // number of nodes currently linked
+ completion_id_t * head; // first node, NULL when the list is empty
+} completion_id_list_t;
+
+// buffer list manipulations
+// returns argument
+completion_id_list_t * InitListComp( completion_id_list_t * list );
+void PushHeadComp( completion_id_list_t * list, completion_id_t * item );
+void PushTailComp( completion_id_list_t * list, completion_id_t * item );
+completion_id_t * PopHeadComp( completion_id_list_t * list );
+completion_id_t * PopTailComp( completion_id_list_t * list );
+// returns second argument
+completion_id_t * RemoveItemComp( completion_id_list_t * list, completion_id_t * item );
+
+
+// buffer list manipulations
+// returns argument
+/* Resets `list` to the empty state (no head, zero items) and returns it. */
+completion_id_list_t * InitListComp( completion_id_list_t * list ) {
+    list->nitems = 0;
+    list->head = NULL;
+    return list;
+}
+
+
+/*
+ * Inserts `item` in front of the current head of the circular list and
+ * makes it the new head. On an empty list the item is linked to itself.
+ */
+void PushHeadComp( completion_id_list_t * list, completion_id_t * item ) {
+    completion_id_t *old_head = list->head;
+    completion_id_t *old_tail;
+
+    if( old_head == NULL ) {
+        // empty list: a single node is its own neighbour in both directions
+        item->next = item;
+        item->last = item;
+        list->head = item;
+        list->nitems = 1;
+        return;
+    }
+
+    // splice the item between the tail and the old head
+    old_tail = old_head->last;
+    item->last = old_tail;
+    item->next = old_head;
+    old_tail->next = item;
+    old_head->last = item;
+    list->head = item;
+    list->nitems++;
+}
+
+/*
+ * Appends `item` at the tail of the circular list.
+ *
+ * PushHeadComp() links the item between the old tail and the old head and
+ * makes it the head. Advancing the head by one then restores the old head,
+ * which leaves the new item at head->last, i.e. at the tail.
+ *
+ * The previous code stepped the head backwards (head->last) instead. With
+ * two or more items already queued that made the old tail the head and put
+ * the new item second, so the list order was scrambled.
+ */
+void PushTailComp( completion_id_list_t * list, completion_id_t * item ) {
+    // get the item in there
+    PushHeadComp( list, item );
+    // step forward so the previous head (or the item itself, if the list
+    // was empty) is the head again
+    list->head = list->head->next;
+}
+
+/*
+ * Unlinks and returns the head of the list, or NULL when the list is empty.
+ * The returned node has both links cleared; the following node becomes the
+ * head, or the head becomes NULL when the last node was removed.
+ */
+completion_id_t * PopHeadComp( completion_id_list_t * list ) {
+    completion_id_t *popped = list->head;
+    if( popped == NULL ) {
+        return NULL;
+    }
+    // bridge the neighbours over the node being removed
+    popped->next->last = popped->last;
+    popped->last->next = popped->next;
+    list->head = popped->next;
+    popped->next = NULL;
+    popped->last = NULL;
+    list->nitems--;
+    if( list->nitems == 0 ) {
+        list->head = NULL;
+    }
+    return popped;
+}
+
+/*
+ * Unlinks and returns the tail of the list, or NULL when the list is empty.
+ * Implemented by stepping the head back onto the tail and popping the head.
+ */
+completion_id_t * PopTailComp( completion_id_list_t * list ) {
+    if( list->head != NULL ) {
+        list->head = list->head->last;
+        return PopHeadComp( list );
+    }
+    return NULL;
+}
+
+// returns second argument
+completion_id_t * RemoveItemComp( completion_id_list_t * list, completion_id_t * item ) {
+ // FIXME: handle NULL cases in a reasonable way?
+ if( item == list->head ) {
+ return( PopHeadComp( list ) );
+ }
+ item->next->last = item->last;
+ item->last->next = item->next;
+ item->next = item->last = NULL;
+ list->nitems--;
+ if( list->nitems == 0 ) {
+ list->head = NULL;
+ }
+ return( item );
+}
+
+
+completion_id_list_t *completion_list = NULL;
+
+/*
+ * Opens `path` for Linux AIO and stores the descriptor in
+ * file->file_descriptor. The first call also creates the shared AIO context
+ * (ctx_id) and the shared completion list.
+ * A_READ opens read-only; any other mode opens read/write, creating the
+ * file if needed and truncating it.
+ * Returns 1 on success; 0 if the file cannot be opened or the completion
+ * list cannot be allocated.
+ */
+int OpenLinux( aFILE * file, const char *path, int mode ){
+    long error;
+    if( ctx_id == 0 ){
+        error = io_queue_init( event_max, &ctx_id );
+        if( error != 0 ){
+            // decode the return code the same way WriteLinux/ReadLinux do
+            // for io_submit, instead of printing whatever errno holds
+            printf( "io_setup: io_queue_init res=%ld [%s]\n", error, strerror( (int) -error ) );
+        }
+    }
+    if( completion_list == NULL ){
+        completion_list = (completion_id_list_t*)malloc( sizeof( completion_id_list_t ) );
+        if( completion_list == NULL ){
+            // InitListComp would dereference the NULL pointer
+            perror( "malloc" );
+            return 0;
+        }
+        completion_list = InitListComp( completion_list );
+    }
+
+    if(mode == A_READ){
+        file->file_descriptor = open(path, O_LARGEFILE | O_RDONLY, S_IREAD | S_IWRITE | S_IRGRP | S_IWGRP );
+    }else{
+        file->file_descriptor = open(path, O_RDWR | O_CREAT | O_TRUNC | O_LARGEFILE, S_IREAD | S_IWRITE | S_IRGRP | S_IWGRP);
+    }
+    if(file->file_descriptor < 0){
+        perror(path);
+    }
+    return file->file_descriptor >= 0;
+}
+
+/* Closes file->file_descriptor. Returns 1 when close() succeeds, 0 otherwise. */
+int CloseLinux( aFILE * file ){
+    int close_status = close( file->file_descriptor );
+    return close_status == 0;
+}
+
+/*
+ * Releases the shared state created by OpenLinux(): every completion record
+ * still queued, the completion list itself and the AIO context.
+ * Safe to call when nothing was set up, and safe to call twice.
+ *
+ * The previous version freed only the list header, leaking the records
+ * malloc'ed in QueryLastCompleteLinux(), and overwrote ctx_id without
+ * releasing the context obtained from io_queue_init().
+ */
+void CleanupLinux(){
+    if( completion_list != NULL ){
+        completion_id_t *leftover;
+        while( (leftover = PopHeadComp( completion_list )) != NULL )
+            free( leftover );
+        free( completion_list );
+        completion_list = NULL;
+    }
+    if( ctx_id != NULL ){
+        io_queue_release( ctx_id );
+        ctx_id = NULL;
+    }
+}
+
+/*
+ * Allocates a zero-filled iocb in rec->aio_cb and fills in descriptor,
+ * offset, buffer and byte count for a request on `file`.
+ * When rec->pos is not CURRENT_POS, the position is first split into
+ * file->filep_high / file->filep_low; the request offset is then rebuilt
+ * from those two fields. The caller sets aio_lio_opcode afterwards.
+ * Returns 1 on success, 0 when the allocation fails.
+ */
+int FillAIOStruct( aFILE * file, aIORec * rec ){
+    // calloc hands back zeroed memory, same as malloc followed by memset(0)
+    rec->aio_cb = (iocb_t*) calloc( 1, sizeof(iocb_t) );
+    if( rec->aio_cb == 0 )
+        return 0;
+
+    if( rec->pos != CURRENT_POS ){
+        offset_t tmppos;
+
+        // upper 32 bits
+        tmppos = rec->pos;
+        tmppos >>= 32;
+        file->filep_high = tmppos;
+
+        // lower 32 bits, with the upper half shifted out first
+        tmppos = rec->pos;
+        tmppos <<= 32;
+        tmppos >>= 32;
+        file->filep_low = tmppos;
+    }
+
+    rec->aio_cb->aio_fildes = file->file_descriptor;
+
+    rec->aio_cb->u.c.offset = file->filep_high;
+    rec->aio_cb->u.c.offset <<= 32;
+    rec->aio_cb->u.c.offset |= file->filep_low;
+
+    rec->aio_cb->u.c.buf = rec->buf;
+    rec->aio_cb->u.c.nbytes = rec->size * rec->count;
+
+    return 1;
+}
+
+/*
+ * Prepares rec with FillAIOStruct() and submits it as a write with io_submit().
+ * Returns 1 when exactly one request was accepted; otherwise prints the
+ * failure and the request fields and returns 0. Also returns 0 when the
+ * control block could not be prepared.
+ */
+int WriteLinux( aFILE * file, aIORec * rec ){
+    int req_error;
+    // The unused request_array local was removed: it was initialised from
+    // rec->aio_cb before FillAIOStruct() had assigned that pointer.
+    if( FillAIOStruct( file, rec ) ){
+        // request the io
+        rec->aio_cb->aio_lio_opcode = IO_CMD_PWRITE;
+        req_error = io_submit( ctx_id, 1, &rec->aio_cb );
+        if(req_error != 1){
+            printf("write_submit: io_submit res=%d [%s]\n", req_error, strerror(-req_error));
+            printf( "aiocb->aio_fildes = %d\n", rec->aio_cb->aio_fildes );
+            // casts make each argument match its conversion specifier
+            printf( "aiocb->u.c.offset = %lld\n", (long long) rec->aio_cb->u.c.offset );
+            printf( "aiocb->u.c.buf = %p\n", (void*) rec->aio_cb->u.c.buf );
+            printf( "aiocb->u.c.nbytes = %llu\n", (unsigned long long) rec->aio_cb->u.c.nbytes );
+            printf( "aiocb->aio_reqprio = %d\n", rec->aio_cb->aio_reqprio );
+        }
+        return req_error == 1;
+    }
+    return 0;
+}
+
+/*
+ * Prepares rec with FillAIOStruct() and submits it as a read with io_submit().
+ * Returns 1 when exactly one request was accepted; otherwise prints the
+ * failure and returns 0. Also returns 0 when the control block could not be
+ * prepared.
+ */
+int ReadLinux( aFILE * file, aIORec * rec ){
+    int req_error;
+    // The unused request_array local was removed: it was initialised from
+    // rec->aio_cb before FillAIOStruct() had assigned that pointer.
+    // fill the request data structure
+    if( FillAIOStruct( file, rec ) ){
+        // request the io
+        rec->aio_cb->aio_lio_opcode = IO_CMD_PREAD;
+        req_error = io_submit( ctx_id, 1, &rec->aio_cb );
+        if(req_error != 1){
+            printf("read_submit: io_submit res=%d [%s]\n", req_error, strerror(-req_error));
+            printf( "aiocb->aio_reqprio = %d\n", rec->aio_cb->aio_reqprio );
+        }
+        return req_error == 1;
+    }
+    return 0;
+}
+
+
+// PRECONDITION: file->queuetail is not null
+// simply queries whether the first request submitted to the file has
+// completed yet.
+int QueryLastCompleteLinux( aFILE * file ){ // NOTE(review): `file` is never read in this body
+ int rval;
+ int compI;
+ completion_id_t *comp;
+ struct io_event ioe; // only filled in when io_getevents returns an event
+ struct timespec zero_wait;
+
+ zero_wait.tv_sec = 0;
+ zero_wait.tv_nsec = 10000000; // despite the name this is a 10 ms timeout, not zero
+
+ rval = io_getevents( ctx_id, 0, 1, &ioe, &zero_wait ); // harvest at most one completion event
+ if( rval == 1 ){
+ completion_id_t *completion = (completion_id_t*)malloc( sizeof(completion_id_t) ); // NOTE(review): result not checked for NULL
+ completion->data = ioe.data;
+ PushTailComp( completion_list, completion );
+ }
+ comp = completion_list->head;
+ for( compI = 0; compI < completion_list->nitems; compI++ ){ // NOTE(review): comp is never advanced, so only the head is ever compared
+ if( comp->data == ioe.data ) // NOTE(review): ioe is read here even when rval != 1, i.e. when it was never filled in
+ break;
+ }
+ if( compI != completion_list->nitems ){
+ RemoveItemComp( completion_list, comp ); // NOTE(review): the unlinked record is not freed here
+ return 1; // success
+ }
+ return 0; // hasn't completed yet
+}
+
+#endif
diff --git a/libMems/dmSML/alinuxaio.h b/libMems/dmSML/alinuxaio.h
new file mode 100644
index 0000000..9474c61
--- /dev/null
+++ b/libMems/dmSML/alinuxaio.h
@@ -0,0 +1,19 @@
+#ifndef _alinuxaio_h_
+#define _alinuxaio_h_
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/dmSML/asyncio.h"
+
+
+// Open / close a file through the Linux AIO backend.
+// aOpen() treats a zero return from OpenLinux as failure; aClose() passes
+// the CloseLinux result straight back to its caller.
+int OpenLinux( aFILE * file, const char *path, int mode );
+int CloseLinux( aFILE * file );
+
+// Submit the asynchronous request described by `rec`.
+// Both return 1 when the request was accepted and 0 otherwise.
+int WriteLinux( aFILE * file, aIORec * rec );
+int ReadLinux( aFILE * file, aIORec * rec );
+
+// Returns 1 when a completion was found, 0 when nothing has completed yet.
+int QueryLastCompleteLinux( aFILE * file );
+
+#endif /* _alinuxaio_h_ */
diff --git a/libMems/dmSML/asyncio.c b/libMems/dmSML/asyncio.c
new file mode 100644
index 0000000..218631d
--- /dev/null
+++ b/libMems/dmSML/asyncio.c
@@ -0,0 +1,358 @@
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/dmSML/asyncio.h"
+
+#include "libMems/dmSML/alibc.h"
+#include "libMems/dmSML/awin32aio.h"
+#include "libMems/dmSML/aPOSIXaio.h"
+
+#include "libMems/dmSML/util.h"
+
+#include "libMems/dmSML/buffer.h"
+#include <string.h>
+
+#if defined(USE_POSIX_AIO)||defined(USE_LINUX_AIO)
+#include <unistd.h>
+#include <sys/stat.h>
+#endif
+
+// Source of operation ids. EnqueueOperation() pre-increments it, so the
+// first id handed out is 1 and ids are shared by all files.
+static int OperationNumber = 0;
+
+// Forward declarations for the helpers defined further down in this file.
+int QueueEmpty( aFILE * file );
+void RemoveOperation( aFILE * file );
+void FreeQueue( aFILE * file );
+int ExecuteWrite( aFILE * file, aIORec * rec );
+int ExecuteRead( aFILE * file, aIORec * rec );
+int EnqueueOperation( char * buffer, offset_t size, offset_t count, aFILE * file, offset_t pos );
+void ExecuteOperation( aFILE * file );
+int QueryOpComplete( aFILE * file );
+int aAct( void * buffer, offset_t size, offset_t count, aFILE * file, offset_t pos );
+
+// Returns 1 when `file` has no queued requests, i.e. both ends of the
+// queue are NULL, and 0 otherwise.
+int QueueEmpty( aFILE * file ) {
+    if( file->queuehead != NULL ) {
+        return 0;
+    }
+    return file->queuetail == NULL;
+}
+
+// Unlinks the oldest queued request (the one at queuetail) from `file` and
+// frees it together with the backend control block it owns.
+// Does nothing when the queue is empty.
+void RemoveOperation( aFILE * file ) {
+    aIORec * victim;
+    if( QueueEmpty( file ) ) {
+        return;
+    }
+    victim = file->queuetail;
+    if( victim != file->queuehead ) {
+        // more than one entry: the successor becomes the new tail
+        file->queuetail = victim->next;
+        file->queuetail->last = NULL;
+    } else {
+        // that was the only entry
+        file->queuetail = NULL;
+        file->queuehead = NULL;
+    }
+    // the control block hangs off the record as a separate allocation
+#if defined USE_WIN32
+    free( victim->w32overlapped );
+#elif defined(USE_POSIX_AIO)||defined(USE_LINUX_AIO)
+    free( victim->aio_cb );
+#endif
+    free( victim );
+}
+
+
+// Discards every request still queued on `file`.
+void FreeQueue( aFILE * file ) {
+    for( ;; ) {
+        if( QueueEmpty( file ) ) {
+            break;
+        }
+        RemoveOperation( file );
+    }
+}
+
+
+
+// Opens `path` in the given mode (A_READ or A_WRITE) using the I/O backend
+// selected at compile time.
+// Returns a newly allocated, zero-initialised aFILE on success, or NULL when
+// the handle cannot be allocated or the backend fails to open the file.
+aFILE * aOpen( const char * path, int mode ) {
+    int err = 0;
+    aFILE *ret = malloc( sizeof( *ret ) );
+
+    // allocation can fail; never hand NULL to memset
+    if( ret == NULL ) {
+        return( NULL );
+    }
+    memset( ret, 0, sizeof( *ret ) );
+    ret->mode = mode;
+    ret->busy = 0;
+    // the backend open routines return non-zero on success
+#if defined USE_LINUX_AIO
+    err = !OpenLinux( ret, path, mode );
+#elif defined USE_POSIX_AIO
+    err = !OpenPAIO( ret, path, mode );
+#elif defined USE_LIBC
+    err = !OpenLibC( ret, path, mode );
+#elif defined USE_WIN32
+    err = !OpenWIN32( ret, path, mode );
+#endif
+    if( err ) {
+        free( ret );
+        ret = NULL;
+    }
+    return( ret );
+}
+
+
+// Closes `file`: waits until it is no longer busy, closes it through the
+// compiled-in backend, drops any requests still queued and frees the handle.
+// Returns whatever the backend close routine returned (0 when no backend
+// is compiled in).
+int aClose( aFILE * file ) {
+    int status = 0;
+
+    // never tear the file down underneath an operation in flight
+    aWaitNotBusy( file );
+
+#if defined USE_LINUX_AIO
+    status = CloseLinux( file );
+#elif defined USE_POSIX_AIO
+    status = ClosePAIO( file );
+#elif defined USE_LIBC
+    status = CloseLibC( file );
+#elif defined USE_WIN32
+    status = CloseWIN32( file );
+#endif
+
+    FreeQueue( file );
+    free( file );
+    return status;
+}
+
+
+
+// Hands `rec` to the write routine of the compiled-in backend and flags the
+// file busy when the request was accepted.
+// Returns 0 on success, non-zero when the backend rejected the request.
+int ExecuteWrite( aFILE * file, aIORec * rec ) {
+    int failed = 0;
+#if defined USE_LINUX_AIO
+    failed = !WriteLinux( file, rec );
+#elif defined USE_POSIX_AIO
+    failed = !WritePAIO( file, rec );
+#elif defined USE_LIBC
+    failed = !WriteLibC( file, rec );
+#elif defined USE_WIN32
+    failed = !WriteWIN32( file, rec );
+#endif
+    if( !failed ) {
+        file->busy = 1;
+    }
+    return failed;
+}
+
+
+// Hands `rec` to the read routine of the compiled-in backend and flags the
+// file busy when the request was accepted.
+// Returns 0 on success, non-zero when the backend rejected the request.
+int ExecuteRead( aFILE * file, aIORec * rec ) {
+    int failed = 0;
+#if defined USE_LINUX_AIO
+    failed = !ReadLinux( file, rec );
+#elif defined USE_POSIX_AIO
+    failed = !ReadPAIO( file, rec );
+#elif defined USE_LIBC
+    failed = !ReadLibC( file, rec );
+#elif defined USE_WIN32
+    failed = !ReadWIN32( file, rec );
+#endif
+    if( !failed ) {
+        file->busy = 1;
+    }
+    return failed;
+}
+
+
+
+// Appends a new request to the head end of file's operation queue.
+// Requests are consumed from queuetail, so the queue is first-in first-out.
+//   buffer      data to transfer
+//   size, count the request covers size * count bytes
+//   file        file whose queue receives the request
+//   pos         file offset, or CURRENT_POS
+// Returns the operation number assigned to the request, or 0 when the
+// queue record cannot be allocated (the queue is left untouched).
+int EnqueueOperation( char * buffer, offset_t size, offset_t count, aFILE * file, offset_t pos ) {
+    aIORec *rec = malloc( sizeof( *rec ) );
+    if( rec == NULL ) {
+        // 0 is the documented failure code of aRead/aWrite
+        return( 0 );
+    }
+    memset( rec, 0, sizeof( *rec ) );
+    rec->buf = buffer;
+    rec->size = size;
+    rec->count = count;
+    rec->pos = pos;
+    rec->operation = ++OperationNumber;
+    rec->next = NULL;
+
+    if( QueueEmpty( file ) ) {
+        rec->last = NULL;
+        file->queuehead = file->queuetail = rec;
+    } else {
+        // link behind the current head, then become the head
+        rec->last = file->queuehead;
+        file->queuehead->next = rec;
+        file->queuehead = rec;
+    }
+    return( rec->operation );
+}
+
+
+
+// Starts the oldest queued request on `file`, provided the file is idle and
+// something is queued. When the backend accepts the request, the file
+// pointer is advanced by the request's byte length (size * count).
+void ExecuteOperation( aFILE * file ) {
+    aIORec *oldest;
+    int failed;
+
+    // nothing to do while busy or when no request is pending
+    if( QueueEmpty( file ) || file->busy ) {
+        return;
+    }
+    oldest = file->queuetail;
+    if( file->mode == A_WRITE ) {
+        failed = ExecuteWrite( file, oldest );
+    } else {
+        failed = ExecuteRead( file, oldest );
+    }
+    if( failed ) {
+        return;
+    }
+    // advance file pointer
+    AddTo64( file->queuetail->size * file->queuetail->count, &(file->filep_high), &(file->filep_low) );
+}
+
+
+// Reports whether the oldest queued request on `file` has completed.
+// An empty queue counts as complete, and the libc backend always answers 1.
+// Returns 1 when complete, 0 otherwise.
+int QueryOpComplete( aFILE * file ) {
+    if( file->queuetail == NULL ) {
+        return 1;
+    }
+#if defined USE_LINUX_AIO
+    return QueryLastCompleteLinux( file );
+#elif defined USE_POSIX_AIO
+    return QueryLastCompletePAIO( file );
+#elif defined USE_LIBC
+    return 1;
+#elif defined USE_WIN32
+    return QueryLastCompleteWIN32( file );
+#else
+    return 1;
+#endif
+}
+
+
+
+
+// For files open for writing: pushes buffered data out through the flush
+// primitive of the compiled-in backend (fsync, fflush or FlushFileBuffers).
+// Failures are only reported on the console; nothing is returned.
+void aFlush( aFILE *file ) {
+#if defined(USE_POSIX_AIO)||defined(USE_LINUX_AIO)
+    const int rc = fsync( file->file_descriptor );
+    if( rc != 0 ) {
+        perror("fsync");
+    }
+#elif defined USE_LIBC
+    const int rc = fflush( file->libchandle );
+    if( rc != 0 ) {
+        printf( "error flushing stdio libc file\n" );
+    }
+#elif defined USE_WIN32
+    const BOOL flushed = FlushFileBuffers( file->w32handle );
+    if( !flushed ) {
+        printf( "error flushing win32 file\n" );
+    }
+#endif
+}
+
+// get the size in bytes of a particular file
+// Returns 0 (after printing a diagnostic) when the file cannot be examined.
+unsigned long long aStatFileSize( const char * path ) {
+#if defined(USE_POSIX_AIO)||defined(USE_LINUX_AIO)
+    struct stat stat_data;
+    if( stat( path , &stat_data) ){
+        perror(path);
+        return 0;
+    }
+    return stat_data.st_size;
+#elif defined USE_LIBC
+#error "libc aStatSize not implemented"
+#elif defined USE_WIN32
+    WIN32_FILE_ATTRIBUTE_DATA file_data;
+    unsigned long long f_size;
+    // file_data is only filled in on success, so the result must be checked.
+    // path is a narrow string, hence the explicit ANSI entry point.
+    if( !GetFileAttributesExA( path, GetFileExInfoStandard, (void*)&file_data ) ) {
+        printf( "Error getting attributes of %s, code %lu\n", path, (unsigned long)GetLastError() );
+        return 0;
+    }
+    // combine the two 32-bit halves into one 64-bit size
+    f_size = file_data.nFileSizeHigh;
+    f_size <<= 32;
+    f_size += file_data.nFileSizeLow;
+    return f_size;
+#endif
+}
+
+
+// get the size in records of a particular file
+// used when skipping the binning phase
+// The byte size reported by aStatFileSize() is divided by sizeof(record_t);
+// a file that cannot be examined therefore yields 0.
+unsigned long aStatSize( const char * path ) {
+#if defined(USE_POSIX_AIO)||defined(USE_LINUX_AIO)
+    return aStatFileSize( path ) / sizeof(record_t);
+#elif defined USE_LIBC
+#error "libc aStatSize not implemented"
+#elif defined USE_WIN32
+    return aStatFileSize( path ) / sizeof(record_t);
+#endif
+}
+
+
+// Non-blocking progress step for `file`: retires the running request once
+// the backend reports it complete, then starts the next queued request.
+void aUpdateOperations( aFILE * file ) {
+    // always poll, even when idle: the backend query is called unconditionally
+    const int finished = QueryOpComplete( file );
+
+    if( finished && file->busy && !QueueEmpty( file ) ) {
+        RemoveOperation( file );
+        file->busy = 0;
+    }
+    // if the queue is still not empty, start the next one up.
+    if( QueueEmpty( file ) ) {
+        return;
+    }
+    ExecuteOperation( file );
+}
+
+
+
+
+// Common body of aRead/aWrite: queues the request, then tries to start
+// whatever can be started right away.
+// Returns the operation number produced by EnqueueOperation().
+int aAct( void * buffer, offset_t size, offset_t count, aFILE * file, offset_t pos ) {
+    const int ticket = EnqueueOperation( buffer, size, count, file, pos );
+    ExecuteOperation( file );
+    return ticket;
+}
+
+
+// Queues a write of size * count bytes from `buffer` at `pos`.
+// Returns 0 for a failure, or an operation code that can be checked for
+// completion with aOperationComplete.
+int aWrite( void * buffer, offset_t size, offset_t count, aFILE * file, offset_t pos ) {
+    const int ticket = aAct( buffer, size, count, file, pos );
+    return ticket;
+}
+// Queues a read of size * count bytes into `buffer` at `pos`.
+// Returns 0 for a failure, or an operation code that can be checked for
+// completion with aOperationComplete.
+int aRead( void * buffer, offset_t size, offset_t count, aFILE * file, offset_t pos ) {
+    const int ticket = aAct( buffer, size, count, file, pos );
+    return ticket;
+}
+
+
+// Returns 1 if `operation` has completed on `file`, 0 otherwise.
+// A request stays in the queue until it has finished, so an operation that
+// is still found in the queue is pending; one that is absent is done.
+int aOperationComplete( aFILE * file, int operation ) {
+    aIORec *cursor = file->queuetail;
+    while( cursor != NULL ) {
+        if( cursor->operation == operation ) {
+            return 0;
+        }
+        cursor = cursor->next;
+    }
+    return 1;
+}
+
+
+// Reports the file's busy flag: non-zero while an I/O request is in
+// flight, 0 otherwise.
+int aFileBusy( aFILE * file ) {
+    const int in_flight = file->busy;
+    return in_flight;
+}
+
+
+// Blocks until `operation` has completed on `file`, driving the file's
+// queue forward in the meantime.
+void aWaitComplete( aFILE * file, int operation ) {
+    for( ;; ) {
+        if( aOperationComplete( file, operation ) ) {
+            break;
+        }
+        aUpdateOperations( file );
+    }
+}
+
+
+// Blocks until `file` is no longer busy, driving the file's queue forward
+// in the meantime.
+void aWaitNotBusy( aFILE * file ) {
+    for( ;; ) {
+        if( !file->busy ) {
+            break;
+        }
+        aUpdateOperations( file );
+    }
+}
+
+
diff --git a/libMems/dmSML/asyncio.h b/libMems/dmSML/asyncio.h
new file mode 100644
index 0000000..03caa7d
--- /dev/null
+++ b/libMems/dmSML/asyncio.h
@@ -0,0 +1,166 @@
+#ifndef _asyncio_h_
+#define _asyncio_h_
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+// Backend selection. Exactly one of USE_WIN32, USE_POSIX_AIO, USE_LINUX_AIO
+// or USE_LIBC is expected to be active. Only USE_WIN32 is defined in this
+// header; the others presumably come from the build system or config.h
+// (TODO confirm).
+//#define USE_LINUX_AIO
+//#define USE_LIBC_AIO // don't use kaio
+
+#ifdef WIN32
+# define WIN32_LEAN_AND_MEAN
+# include <windows.h>
+# define USE_WIN32
+#else
+// request 64-bit file offsets unless the build already asked for them
+# ifndef _LARGEFILE64_SOURCE
+# define _FILE_OFFSET_BITS 64
+# define _LARGEFILE_SOURCE
+# define _LARGEFILE64_SOURCE
+# endif
+// use kaio by default
+// NOTE(review): nothing in this header defines USE_LINUX_AIO, so the
+// "default" above must be set elsewhere -- confirm.
+# if defined(USE_LIBC_AIO) || defined(USE_POSIX_AIO)
+# ifdef HAVE_SYS_TYPES_H
+# include <sys/types.h>
+# endif
+# if defined HAVE_SYS_AIO_H
+# include <sys/aio.h>
+# elif HAVE_AIO_H
+# include <aio.h>
+# endif
+# ifdef HAVE_FEATURES_H
+# include <features.h>
+# endif
+// control block type used by the POSIX backend
+typedef struct aiocb aiocb_t;
+# endif
+# ifdef USE_LINUX_AIO
+// NOTE(review): these repeat the feature macros defined above; if the build
+// defined them with a value, this is a conflicting redefinition.
+# define _FILE_OFFSET_BITS 64
+# define _LARGEFILE_SOURCE
+# define _LARGEFILE64_SOURCE
+# include <libaio.h>
+// control block type used by the Linux (libaio) backend
+typedef struct iocb iocb_t;
+# endif
+# ifdef HAVE_FEATURES_H
+# include <features.h>
+# endif
+#endif
+
+#include <stdlib.h>
+#include <stdio.h>
+
+
+// Sentinel position meaning "continue at the file's current pointer".
+// It is stored in and compared against offset_t fields, where -1 converts
+// to the largest offset_t value.
+#define CURRENT_POS -1
+// unsigned 64-bit type used for sizes, counts and file positions
+typedef unsigned long long offset_t;
+
+// One queued asynchronous request on an aFILE.
+// Records form a doubly linked queue: EnqueueOperation() adds at
+// aFILE.queuehead and requests are executed and removed at aFILE.queuetail.
+typedef struct _aIORec {
+#if defined USE_POSIX_AIO
+// posix aio uses the aiocb_t type to describe aio requests
+ aiocb_t *aio_cb;
+#elif defined USE_LINUX_AIO
+ // libaio control block describing the request
+ iocb_t* aio_cb;
+#elif defined USE_LIBC
+#elif defined USE_WIN32
+ // win32-specific data.
+ // this is a pointer because windows needs it to
+ // be in a fixed spot. But we have to resize the
+ // data structure that contains these, so we need
+ // to allocate them separately.
+ // unfortunately, this means we need to do linear
+ // search to figure out what thing in the queue some
+ // completion corresponds to. Fortunately, this
+ // rarely needs to be done. I think this is The
+ // Right Thing, given the tools and our goals.
+ // NULL until the request has actually been submitted.
+ OVERLAPPED * w32overlapped;
+#endif
+ // must do linear search to find specific operations,
+ // but no big deal.
+ int operation; // id handed out by EnqueueOperation()
+ char * buf; // caller's data buffer
+ offset_t size; // item size in bytes
+ offset_t count; // item count; a request covers size * count bytes
+ offset_t pos; // file offset, or CURRENT_POS
+ struct _aIORec * next; // next newer request (towards queuehead)
+ struct _aIORec * last; // next older request (towards queuetail)
+
+} aIORec;
+
+
+// Handle for a file opened with aOpen().
+// users don't need to concern themselves with this.
+typedef struct _aFILE {
+ // backend-specific handle of the underlying file
+#if defined(USE_POSIX_AIO)||defined(USE_LINUX_AIO)
+ int file_descriptor;
+#elif defined USE_LIBC
+ FILE * libchandle;
+#elif defined USE_WIN32
+ HANDLE w32handle;
+#endif
+ // A_READ or A_WRITE, as passed to aOpen()
+ int mode;
+ // file seek pointer, kept as two 32-bit halves of a 64-bit offset
+ unsigned int filep_high;
+ unsigned int filep_low;
+ // is a read/write operation in progress?
+ int busy;
+ // operation serial number (to ensure serial operation).
+ int op;
+ // are we to be closed?
+ int toclose;
+ // queue of io operations: new requests are linked in at queuehead,
+ // the request at queuetail is the next one to execute
+ aIORec *queuehead, *queuetail;
+} aFILE;
+
+
+// File modes accepted by aOpen(); a file is either read or written.
+enum {
+ A_READ, // requests on the file are executed as reads
+ A_WRITE // requests on the file are executed as writes
+};
+
+
+// Opens `path` in mode A_READ or A_WRITE.
+// Returns NULL on failure.
+aFILE * aOpen( const char * path, int mode );
+// close will block until the file is no longer busy,
+// then releases the handle.
+int aClose( aFILE * file );
+
+// these allow you to queue reads and writes of size * count bytes
+// at file position `pos` (or CURRENT_POS).
+// these return 0 for a failure, or an operation
+// code that can be checked for completion with
+// aOperationComplete
+int aWrite( void * buffer, offset_t size, offset_t count, aFILE * file, offset_t pos );
+int aRead( void * buffer, offset_t size, offset_t count, aFILE * file, offset_t pos );
+
+// returns 1 if the operation was completed, 0 otherwise.
+int aOperationComplete( aFILE * file, int operation );
+
+// returns 1 if the file is doing IO, 0 otherwise.
+int aFileBusy( aFILE * file );
+
+// blocks and waits for the specified operation to
+// complete.
+void aWaitComplete( aFILE * file, int operation );
+
+// blocks and waits for the file to not be busy
+// and for *all* IO operations to complete.
+void aWaitNotBusy( aFILE * file );
+
+// polls the aio file to see if anything's completed, and
+// starts the next queued up jobs if they are. does not
+// block.
+void aUpdateOperations( aFILE * file );
+
+
+// for files open for writing, ensures that all data is
+// safely on disk (flushes buffer cache).
+void aFlush( aFILE *file );
+
+// get the size in records of a particular file
+// (byte size divided by sizeof(record_t));
+// used when skipping the binning phase
+unsigned long aStatSize( const char * path );
+
+// get the size in bytes of a particular file
+unsigned long long aStatFileSize( const char * path );
+
+#endif /* _asyncio_h_ */
diff --git a/libMems/dmSML/awin32aio.c b/libMems/dmSML/awin32aio.c
new file mode 100644
index 0000000..93b4d39
--- /dev/null
+++ b/libMems/dmSML/awin32aio.c
@@ -0,0 +1,160 @@
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/dmSML/awin32aio.h"
+#include "libMems/dmSML/util.h"
+#ifdef USE_WIN32
+
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+
+// Completion callback required by ReadFileEx/WriteFileEx. Completion is
+// detected by polling, so the callback only leaves a trace on the console.
+static VOID CALLBACK DummyCompletionRoutine( DWORD err, DWORD nbytes, LPOVERLAPPED lpo ) {
+    // the arguments are deliberately ignored
+    (void)err;
+    (void)nbytes;
+    (void)lpo;
+    printf( "completion routine!\n" );
+}
+
+
+// Opens `path` for overlapped (asynchronous) I/O and stores the handle in
+// file->w32handle. A_READ opens an existing file for reading; any other
+// mode creates (or truncates) the file for writing.
+// Returns 1 on success, 0 on failure (the Win32 error code is printed).
+int OpenWIN32( aFILE * file, const char *path, int mode ) {
+    HANDLE result;
+    DWORD last_error;
+    DWORD access = mode == A_READ ? GENERIC_READ : GENERIC_WRITE;
+    DWORD disposition = mode == A_READ ? OPEN_EXISTING : CREATE_ALWAYS;
+    // path is a narrow string, so call the ANSI entry point explicitly
+    // instead of relying on the UNICODE-dependent CreateFile macro.
+    result = CreateFileA(
+        path,
+        access,
+        FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE,
+        NULL,
+        disposition,
+        FILE_FLAG_OVERLAPPED,
+        NULL );
+    if( result == INVALID_HANDLE_VALUE ) {
+        last_error = GetLastError();
+        // DWORD is an unsigned long; print it as one
+        printf( "Error opening %s, code %lu\n", path, (unsigned long)last_error );
+        return( 0 );
+    }
+    file->w32handle = result;
+    return( 1 );
+}
+
+
+// Closes the Win32 handle of `file`.
+// Returns the CloseHandle() result unchanged.
+int CloseWIN32( aFILE * file ) {
+    const BOOL closed = CloseHandle( file->w32handle );
+    return closed;
+}
+
+
+// Submits the overlapped write described by `rec` on `file`.
+// When rec->pos is not CURRENT_POS the file pointer is first moved there;
+// size * count bytes from rec->buf are then written at the file pointer.
+// Returns 1 when the request was accepted, 0 otherwise. On failure no
+// OVERLAPPED block stays attached to `rec`, so a later retry of the same
+// record does not leak the previous one.
+int WriteWIN32( aFILE * file, aIORec * rec ) {
+    DWORD err;
+    if( file->mode != A_WRITE ) {
+        return( 0 );
+    }
+
+    rec->w32overlapped = malloc( sizeof( *(rec->w32overlapped) ) );
+    if( rec->w32overlapped == NULL ) {
+        printf( "WriteWIN32: out of memory\n" );
+        return( 0 );
+    }
+    memset( rec->w32overlapped, 0, sizeof( *(rec->w32overlapped) ) );
+
+    if( rec->pos != CURRENT_POS ){
+        // split the 64-bit position into its two 32-bit halves
+        file->filep_high = (unsigned int)( rec->pos >> 32 );
+        file->filep_low = (unsigned int)( rec->pos & 0xFFFFFFFFu );
+    }
+
+    rec->w32overlapped->OffsetHigh = file->filep_high;
+    rec->w32overlapped->Offset = file->filep_low;
+
+    if( WriteFileEx(
+        file->w32handle,
+        rec->buf,
+        (DWORD)( rec->size * rec->count ),
+        rec->w32overlapped,
+        DummyCompletionRoutine ) == 0 ) {
+        err = GetLastError();
+        printf( "error with WriteFileEx: %lu\n", (unsigned long)err );
+        // the request was never queued, so the control block is ours to free
+        free( rec->w32overlapped );
+        rec->w32overlapped = NULL;
+        return( 0 );
+    }
+    return( 1 );
+}
+
+
+// Submits the overlapped read described by `rec` on `file`.
+// When rec->pos is not CURRENT_POS the file pointer is first moved there;
+// size * count bytes are then read into rec->buf from the file pointer.
+// Returns 1 when the request was accepted (an end-of-file report counts as
+// accepted), 0 otherwise. On failure no OVERLAPPED block stays attached to
+// `rec`, so a later retry of the same record does not leak the previous one.
+int ReadWIN32( aFILE * file, aIORec * rec ) {
+    DWORD err;
+    DWORD nbytes;
+    if( file->mode != A_READ ) {
+        return( 0 );
+    }
+    rec->w32overlapped = malloc( sizeof( *(rec->w32overlapped) ) );
+    if( rec->w32overlapped == NULL ) {
+        printf( "ReadWIN32: out of memory\n" );
+        return( 0 );
+    }
+    memset( rec->w32overlapped, 0, sizeof( *(rec->w32overlapped) ) );
+
+    if( rec->pos != CURRENT_POS ){
+        // split the 64-bit position into its two 32-bit halves
+        file->filep_high = (unsigned int)( rec->pos >> 32 );
+        file->filep_low = (unsigned int)( rec->pos & 0xFFFFFFFFu );
+    }
+
+    rec->w32overlapped->OffsetHigh = file->filep_high;
+    rec->w32overlapped->Offset = file->filep_low;
+    nbytes = (DWORD)( rec->size * rec->count );
+    if( ReadFileEx(
+        file->w32handle,
+        rec->buf,
+        nbytes,
+        rec->w32overlapped,
+        DummyCompletionRoutine ) == 0 ) {
+        err = GetLastError();
+        switch( err ) {
+            case ERROR_HANDLE_EOF:
+                printf( "readfileex says EOF -- we'll pretend it worked\n" );
+                return( 1 );
+            default:
+                // handles and pointers are printed with %p, DWORDs with %lu
+                printf( "error with ReadFileEx -- Last Error: %lu\n", (unsigned long)err );
+                printf( "called: ReadFileEx( %p, %p, %lu, %p )\n",
+                    (void *)file->w32handle,
+                    (void *)rec->buf,
+                    (unsigned long)nbytes,
+                    (void *)rec->w32overlapped );
+                // the request was never queued, so the control block is ours to free
+                free( rec->w32overlapped );
+                rec->w32overlapped = NULL;
+                return( 0 );
+        }
+    }
+    return( 1 );
+}
+
+
+// Polls, without blocking, whether the oldest request on `file` is done.
+// A request that has not been submitted yet (no OVERLAPPED block attached)
+// counts as not complete.
+// Returns 1 when the zero-timeout wait on the file handle did not time out,
+// 0 otherwise.
+int QueryLastCompleteWIN32( aFILE * file ) {
+    DWORD wait_status;
+    if( file->queuetail == NULL || file->queuetail->w32overlapped == NULL ) {
+        return( 0 );
+    }
+    // a wait of 0 msec is a plain poll
+    wait_status = WaitForSingleObject( file->w32handle, 0 );
+    return( wait_status != WAIT_TIMEOUT );
+}
+
+
+
+#endif /* USE_WIN32 */
diff --git a/libMems/dmSML/awin32aio.h b/libMems/dmSML/awin32aio.h
new file mode 100644
index 0000000..87aa604
--- /dev/null
+++ b/libMems/dmSML/awin32aio.h
@@ -0,0 +1,18 @@
+#ifndef _awin32_h_
+#define _awin32_h_
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/dmSML/asyncio.h"
+
+// Open / close a file through the Win32 overlapped-I/O backend.
+// OpenWIN32 returns 1 on success and 0 on failure; CloseWIN32 returns the
+// CloseHandle() result.
+int OpenWIN32( aFILE * file, const char *path, int mode );
+int CloseWIN32( aFILE * file );
+
+// Submit the overlapped request described by `rec`.
+// Both return 1 when the request was accepted and 0 otherwise.
+int WriteWIN32( aFILE * file, aIORec * rec );
+int ReadWIN32( aFILE * file, aIORec * rec );
+
+// Non-blocking poll: 1 when the oldest request has completed, 0 otherwise.
+int QueryLastCompleteWIN32( aFILE * file );
+
+#endif /* _awin32_h_ */
diff --git a/libMems/dmSML/buffer.c b/libMems/dmSML/buffer.c
new file mode 100644
index 0000000..d5d7219
--- /dev/null
+++ b/libMems/dmSML/buffer.c
@@ -0,0 +1,407 @@
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <time.h>
+#include <stddef.h>
+#include "libMems/dmSML/buffer.h"
+#include <string.h>
+
+// portably fills an int with reasonably random bits.
+// one assumption is that RAND_MAX is bigger than 256, so that every rand()
+// call contributes at least one fresh byte.
+// The generator is seeded with a fixed value on first use, so the sequence
+// is the same on every run. The result is never negative.
+static int BigRandom() {
+    static char firsttime = 1;
+    // accumulate in an unsigned type: shifting or negating a signed int
+    // can overflow, which is undefined behaviour.
+    unsigned int bits = 0;
+    size_t i;
+    if( firsttime ) {
+        firsttime = 0;
+        srand( 0 );
+        //srand( time( NULL ) );
+    }
+
+    for( i = 0; i < sizeof( bits ); i++ ) {
+        // make room for one byte per round; shifting by sizeof(int) only
+        // moved 4 bits and left the top bits empty when RAND_MAX is small.
+        bits <<= 8;
+        bits ^= (unsigned int)rand();
+    }
+    // clear the sign bit so the value converts to a non-negative int
+    return( (int)( bits & ( ~0u >> 1 ) ) );
+}
+
+
+// Working Set support.
+// Builds a working set of roughly `goalsize` bytes made of buffers that
+// each hold a random number of records between minrecs and maxrecs.
+// Everything lives in ONE allocation (ws->bufs): the buffer_t headers come
+// first, the record storage follows, so freeing ws->bufs frees it all.
+// If the allocation fails, the goal size is halved and the layout recomputed.
+// Returns the resulting size of the entire structure in bytes, or 0 when
+// nothing useful can be built (goal too small, maxrecs < minrecs, or the
+// bookkeeping list cannot be allocated).
+// NOTE(review): the size is computed as offset_t but returned as int, so
+// very large working sets are truncated in the return value.
+int MakeWorkingSet( working_set_t * ws, offset_t goalsize, offset_t minrecs, offset_t maxrecs ) {
+    const offset_t overhead = sizeof( ws->bufs[0] );
+    const offset_t minsize = overhead + minrecs * sizeof( record_t );
+    const offset_t maxsize = overhead + maxrecs * sizeof( record_t );
+
+    // outer loop: retry with smaller working set sizes if large ones fail
+    for( ;; ) {
+        offset_t cursize = 0;   // bytes pledged so far
+        offset_t nbufs = 0;     // number of buffers pledged so far
+        offset_t maxbufs = 256; // capacity of buflist (grows if necessary)
+        offset_t *buflist;      // record count of each pledged buffer
+        record_t *recordptr;
+        offset_t i;
+
+        // if we can't possibly do anything useful
+        if( goalsize < minsize || maxrecs < minrecs ) {
+            return( 0 );
+        }
+        buflist = malloc( sizeof( *buflist ) * maxbufs );
+        if( !buflist ) {
+            return( 0 );
+        }
+
+        // just start pledging buffers until another one might not fit
+        while( goalsize - cursize >= maxsize ) {
+            offset_t randrecs = BigRandom() % (maxrecs - minrecs + 1) + minrecs;
+            if( nbufs == maxbufs ) {
+                // grow the list; keep the old block if realloc fails
+                offset_t *grown = realloc( buflist, sizeof( *buflist ) * maxbufs * 2 );
+                if( !grown ) {
+                    free( buflist );
+                    return( 0 );
+                }
+                buflist = grown;
+                maxbufs *= 2;
+            }
+            buflist[nbufs++] = randrecs;
+            // update the number of bytes we've currently decided to allocate.
+            cursize += overhead + randrecs * sizeof( record_t );
+        }
+
+        // allocate one big chunk of memory for headers and records
+        printf( "allocating %llu bytes for working set (%llu bufs)\n", cursize, nbufs );
+        ws->bufs = malloc( cursize );
+        if( !ws->bufs ){
+            // release this round's list before retrying with half the goal
+            free( buflist );
+            goalsize /= 2;
+            continue;
+        }
+
+        ws->size = cursize;
+        ws->nbufs = nbufs;
+        // clear it out
+        memset( ws->bufs, 0, cursize );
+
+        // the records start right behind the last buffer header
+        recordptr = (record_t *)( (char *)ws->bufs + ws->nbufs * sizeof( ws->bufs[0] ) );
+        for( i = 0; i < nbufs; i++ ) {
+            ws->bufs[i].totalrecs = buflist[i];
+            ws->bufs[i].recs = recordptr;
+            recordptr += ws->bufs[i].totalrecs;
+        }
+
+        free( buflist );
+        return( (int)cursize );
+    }
+}
+
+
+
+
+
+// Working Set support.
+// Reorganize the working set with a different distribution of buffers.
+// The existing allocation (ws->bufs, ws->size bytes) is reused: it is
+// re-partitioned into buffers of a random number of records between minrecs
+// and maxrecs, plus one final buffer made from whatever space is left over.
+// The affected memory is zeroed, so previous buffer contents are lost.
+// The working set is left untouched when maxrecs < minrecs or when the
+// temporary bookkeeping list cannot be allocated.
+void ReorganizeWorkingSet( working_set_t * ws, offset_t minrecs, offset_t maxrecs ) {
+    offset_t goalsize = ws->size;
+    offset_t cursize = 0;   // bytes assigned so far
+    offset_t overhead = sizeof( ws->bufs[0] );
+    offset_t minsize = overhead + minrecs * sizeof( record_t );
+    offset_t maxsize = overhead + maxrecs * sizeof( record_t );
+    offset_t nbufs = 0;     // number of buffers assigned so far
+    offset_t maxbufs = 256; // capacity of buflist (grows if necessary)
+    offset_t *buflist;      // record count of each buffer
+    offset_t leftovers;
+    record_t *recordptr;
+    offset_t i;
+
+    // if we can't possibly do anything useful
+    if( maxrecs < minrecs ) {
+        return;
+    }
+    buflist = malloc( sizeof( *buflist ) * maxbufs );
+    if( !buflist ) {
+        return;
+    }
+
+    if( goalsize < minsize ) {
+        minsize = goalsize;
+        // guard the subtraction: offset_t is unsigned
+        minrecs = minsize > overhead ? (minsize - overhead) / sizeof( record_t ) : 0;
+    }
+
+    // just start assigning buffers until another one might not fit
+    while( goalsize - cursize >= maxsize ) {
+        offset_t randrecs = BigRandom() % (maxrecs - minrecs + 1) + minrecs;
+        // always keep one spare slot, so the leftover buffer below fits too
+        if( nbufs + 1 == maxbufs ) {
+            offset_t *grown = realloc( buflist, sizeof( *buflist ) * maxbufs * 2 );
+            if( !grown ) {
+                free( buflist );
+                return;
+            }
+            buflist = grown;
+            maxbufs *= 2;
+        }
+        buflist[nbufs++] = randrecs;
+        // update the number of bytes we've currently decided to allocate.
+        cursize += overhead + randrecs * sizeof( record_t );
+    }
+
+    // clean up the last bit: turn the remaining space into one more buffer
+    if( goalsize - cursize > overhead ) {
+        leftovers = (goalsize - cursize - overhead) / sizeof( record_t );
+        if( leftovers ) {
+            buflist[nbufs++] = leftovers;
+            cursize += overhead + leftovers * sizeof( record_t );
+        }
+    }
+
+    ws->nbufs = nbufs;
+    // clear it out
+    memset( ws->bufs, 0, cursize );
+    // the records start right behind the last buffer header
+    recordptr = (record_t *)( (char *)ws->bufs + ws->nbufs * sizeof( ws->bufs[0] ) );
+    for( i = 0; i < nbufs; i++ ) {
+        ws->bufs[i].totalrecs = buflist[i];
+        ws->bufs[i].recs = recordptr;
+        recordptr += ws->bufs[i].totalrecs;
+    }
+
+    free( buflist );
+}
+
+
+
+
+
+
+
+
+
+// Walks every buffer of the working set and moves those whose I/O has
+// finished into OP_FINISHED. Only buffers carrying a real operation number
+// (a value greater than OP_NONE) are queried.
+void UpdateWSIOFinishedState( working_set_t * ws ) {
+    buffer_t *cur = ws->bufs;
+    while( cur - ws->bufs < ws->nbufs ) {
+        if( cur->operation > OP_NONE && aOperationComplete( cur->file, cur->operation ) ) {
+            cur->operation = OP_FINISHED;
+        }
+        cur++;
+    }
+}
+
+
+
+// buffer list manipulations
+// Resets `list` to the empty state and returns it.
+buffer_list_t * InitList( buffer_list_t * list ) {
+    list->nitems = 0;
+    list->head = NULL;
+    return list;
+}
+
+
+// Inserts `item` at the front of the circular, doubly linked `list`.
+void PushHead( buffer_list_t * list, buffer_t * item ) {
+    buffer_t *old_head = list->head;
+    buffer_t *tail;
+
+    if( old_head == NULL ) {
+        // first element: it is its own neighbour in both directions
+        item->next = item;
+        item->last = item;
+        list->head = item;
+        list->nitems = 1;
+        return;
+    }
+    // splice the item in between the current tail and the current head
+    tail = old_head->last;
+    item->next = old_head;
+    item->last = tail;
+    tail->next = item;
+    old_head->last = item;
+    list->head = item;
+    list->nitems++;
+}
+
+// Appends `item` at the back of `list`.
+// The list is circular, so this is a PushHead followed by stepping the
+// head back to the element in front of the new item.
+void PushTail( buffer_list_t * list, buffer_t * item ) {
+    PushHead( list, item );
+    list->head = item->last;
+}
+
+// Detaches and returns the first element of `list`, or NULL when the list
+// is empty. The returned element has both of its links cleared.
+buffer_t * PopHead( buffer_list_t * list ) {
+    buffer_t *old_head = list->head;
+    buffer_t *after;
+    buffer_t *before;
+
+    if( old_head == NULL ) {
+        return NULL;
+    }
+    after = old_head->next;
+    before = old_head->last;
+    // close the ring around the element being removed
+    after->last = before;
+    before->next = after;
+    list->head = after;
+    old_head->next = NULL;
+    old_head->last = NULL;
+    list->nitems--;
+    if( list->nitems == 0 ) {
+        list->head = NULL;
+    }
+    return old_head;
+}
+
+// Detaches and returns the last element of `list`, or NULL when the list
+// is empty. Implemented by rotating the head back one element and popping it.
+buffer_t * PopTail( buffer_list_t * list ) {
+    buffer_t *first = list->head;
+    if( first == NULL ) {
+        return NULL;
+    }
+    list->head = first->last;
+    return PopHead( list );
+}
+
+// Unlinks `item` from `list` and returns it (the second argument).
+// `item` is expected to be a member of `list`; NULL is not handled.
+buffer_t * RemoveItem( buffer_list_t * list, buffer_t * item ) {
+    buffer_t *after;
+    buffer_t *before;
+
+    if( list->head == item ) {
+        return PopHead( list );
+    }
+    after = item->next;
+    before = item->last;
+    // close the ring around the element being removed
+    after->last = before;
+    before->next = after;
+    item->next = NULL;
+    item->last = NULL;
+    list->nitems--;
+    if( list->nitems == 0 ) {
+        list->head = NULL;
+    }
+    return item;
+}
+
+
+
+
+
+
+// Adapter giving CompareKeys() the comparator signature that qsort() expects.
+int CompareKeys_qsort_wrapper( const void *r1, const void *r2 ) {
+    const record_t *lhs = (const record_t *)r1;
+    const record_t *rhs = (const record_t *)r2;
+    return CompareKeys( lhs, rhs );
+}
+
+
+
+// Compares the keys of two records using the COMPARE_KEYS macro and
+// returns its result.
+int CompareKeys( const record_t *r1, const record_t *r2 ) {
+    const int order = COMPARE_KEYS( *r1, *r2 );
+    return order;
+}
+
+
+
+
+
+
+
+// Starts the next pending buffer operation on `dev` once the device is free.
+// This *must* enforce a serialized order for reading and writing, lest
+// we write sorted data out in the wrong order: among the pending buffers of
+// the chosen file, the one with the lowest fileop number is issued first.
+void UpdateDeviceIOExecuteState( working_set_t * ws, iodevice_t * dev ) {
+    buffer_t *cand;
+    buffer_t *chosen = NULL;
+
+    // still working on an unfinished job: leave the device alone
+    if( dev->buf && dev->state != DEV_FREE && dev->buf->operation != OP_FINISHED ) {
+        return;
+    }
+    dev->state = DEV_FREE;
+    dev->buf = NULL;
+
+    // the first pending buffer on this device picks the file ("more fair");
+    // later buffers on the same file win only with a lower fileop number
+    for( cand = ws->bufs; cand - ws->bufs < ws->nbufs; cand++ ) {
+        if( cand->operation != OP_PENDING || cand->device != dev ) {
+            continue;
+        }
+        if( chosen == NULL ) {
+            chosen = cand;
+            continue;
+        }
+        if( cand->file == chosen->file && cand->fileop < chosen->fileop ) {
+            chosen = cand;
+        }
+    }
+
+    if( chosen == NULL ) {
+        return;
+    }
+    dev->buf = chosen;
+    // the direction follows the mode the file was opened with
+    if( chosen->file->mode == A_READ ) {
+        chosen->operation = aRead( chosen->recs, 1, chosen->io_size, chosen->file, chosen->io_pos );
+    } else {
+        chosen->operation = aWrite( chosen->recs, 1, chosen->io_size, chosen->file, chosen->io_pos );
+    }
+    dev->state = DEV_BUSY;
+}
+
+// Schedules a transfer of num_recs records for `buffer` on device `dev`.
+// Nothing is issued here: the buffer is tagged with its size, device and
+// per-file sequence number and put into OP_PENDING. A buffer that is not
+// in OP_NONE keeps its state and a warning is printed.
+void ReadBuffer( buffer_t * buffer, offset_t num_recs, iodevice_t * dev ) {
+    buffer->numrecs = num_recs;
+    buffer->io_size = num_recs * sizeof( record_t );
+    buffer->io_pos = CURRENT_POS;
+    buffer->device = dev;
+    // per-file serial number used to keep operations in order
+    buffer->fileop = buffer->file->op++;
+    if( buffer->operation == OP_NONE ) {
+        buffer->operation = OP_PENDING;
+        return;
+    }
+    printf( "weird!\n" );
+}
+
+// Schedules a transfer of num_recs records for `buffer` on device `dev`.
+// Scheduling is identical for reads and writes; the direction is decided
+// by the mode of the buffer's file when the operation is executed.
+void WriteBuffer( buffer_t * buffer, offset_t num_recs, iodevice_t * dev ) {
+    ReadBuffer( buffer, num_recs, dev );
+}
+
diff --git a/libMems/dmSML/buffer.h b/libMems/dmSML/buffer.h
new file mode 100644
index 0000000..69f8370
--- /dev/null
+++ b/libMems/dmSML/buffer.h
@@ -0,0 +1,203 @@
+#ifndef _buffer_h_
+#define _buffer_h_
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/dmSML/asyncio.h"
+
+// forward decl for the benefit of iodevice_t
+// (can't be avoided)
+typedef struct buffer_s buffer_t;
+
+// Values stored in iodevice_t.state.
+enum {
+ DEV_FREE, // no operation is currently recorded as running on the device
+ DEV_BUSY, // an operation has been started on the device (see iodevice_t.buf)
+};
+
+/*
+================
+iodevice_t
+An IO device represents a physical disk. This is used to make
+sure that we're not doing more than one operation on any disk
+at a time. The reason is if we are, the OS threads that do the
+asynchronous IO will contend for the disk and we'll start seeking.
+Seeking is bad.
+================
+*/
+typedef struct iodevice_s {
+ int op; // an operation number used to enforce serial operation on each device.
+ // NOTE(review): the scheduling code visible in buffer.c orders requests by the
+ // per-file counter (file->op), not by this field -- confirm whether it is still used.
+ int state; // either DEV_FREE or DEV_BUSY as above.
+ buffer_t *buf; // if we're DEV_BUSY, the buffer we're operating on.
+} iodevice_t;
+
+
+/*
+================
+buffer_state_t
+Buffers are used for IO, and the IO is asynchronous. Buffers
+have a bit of state to indicate what their current IO status is.
+Ordinarily, buffers are in an OP_NONE state, to indicate no
+operation is being performed. When an operation is initiated,
+the buffer transitions to OP_PENDING and changes to a valid operation
+code when the operation actually starts. When the operation completes,
+the buffer is transitioned to the OP_FINISHED state, so that the
+program may determine when buffers have completed their operations.
+Then the app should transition the state back to OP_NONE.
+================
+*/
+// Special values of buffer_t.operation.  Any other value is the operation
+// code of an I/O request that has actually been started.
+enum {
+ OP_PENDING = -2, // request scheduled but not yet started
+ OP_FINISHED = -1, // request completed; the app should reset the buffer to OP_NONE
+ OP_NONE = 0, // no operation associated with the buffer
+};
+
+
+
+// CompareKeyPtrs( a, b ): lexicographic comparison of the 10-byte keys of two
+// records given by POINTER (members are reached with ->).  The result is the
+// int difference a->key[i] - b->key[i] at the first index i in 0..9 where the
+// bytes differ, or 0 if all ten bytes match; so it is <0, 0 or >0 in the usual
+// comparator sense.  Both arguments are expanded many times, so they must be
+// free of side effects.
+#define CompareKeyPtrs( a, b ) \
+((int)(a)->key[0]-(int)(b)->key[0] ? (int)(a)->key[0]-(int)(b)->key[0] : \
+(int)(a)->key[1]-(int)(b)->key[1] ? (int)(a)->key[1]-(int)(b)->key[1] : \
+(int)(a)->key[2]-(int)(b)->key[2] ? (int)(a)->key[2]-(int)(b)->key[2] : \
+(int)(a)->key[3]-(int)(b)->key[3] ? (int)(a)->key[3]-(int)(b)->key[3] : \
+(int)(a)->key[4]-(int)(b)->key[4] ? (int)(a)->key[4]-(int)(b)->key[4] : \
+(int)(a)->key[5]-(int)(b)->key[5] ? (int)(a)->key[5]-(int)(b)->key[5] : \
+(int)(a)->key[6]-(int)(b)->key[6] ? (int)(a)->key[6]-(int)(b)->key[6] : \
+(int)(a)->key[7]-(int)(b)->key[7] ? (int)(a)->key[7]-(int)(b)->key[7] : \
+(int)(a)->key[8]-(int)(b)->key[8] ? (int)(a)->key[8]-(int)(b)->key[8] : \
+(int)(a)->key[9]-(int)(b)->key[9] ? (int)(a)->key[9]-(int)(b)->key[9] : 0)
+
+// COMPARE_KEYS( a, b ): same lexicographic comparison as CompareKeyPtrs, but
+// the arguments are the records themselves (members are reached with .), not
+// pointers.  Yields the difference of the first pair of differing key bytes
+// among key[0]..key[9], or 0 when the keys are equal.  Arguments are expanded
+// many times, so they must be free of side effects.
+#define COMPARE_KEYS( a, b ) \
+((a).key[0]!=(b).key[0] ? (a).key[0]-(b).key[0] : \
+(a).key[1]!=(b).key[1] ? (a).key[1]-(b).key[1] : \
+(a).key[2]!=(b).key[2] ? (a).key[2]-(b).key[2] : \
+(a).key[3]!=(b).key[3] ? (a).key[3]-(b).key[3] : \
+(a).key[4]!=(b).key[4] ? (a).key[4]-(b).key[4] : \
+(a).key[5]!=(b).key[5] ? (a).key[5]-(b).key[5] : \
+(a).key[6]!=(b).key[6] ? (a).key[6]-(b).key[6] : \
+(a).key[7]!=(b).key[7] ? (a).key[7]-(b).key[7] : \
+(a).key[8]!=(b).key[8] ? (a).key[8]-(b).key[8] : \
+(a).key[9]!=(b).key[9] ? (a).key[9]-(b).key[9] : 0)
+
+
+
+
+// this is the record as in the files to be sorted
+// Fixed-size record: twelve unsigned chars in total.
+typedef struct record_s {
+ unsigned char key[10]; // sort key; compared byte by byte, key[0] first (see COMPARE_KEYS)
+ unsigned char num[1]; // NOTE(review): meaning not visible from this header -- confirm
+ unsigned char payload[1]; // NOTE(review): meaning not visible from this header -- confirm
+} record_t;
+
+
+
+
+
+int CompareKeys_qsort_wrapper( const void *r1, const void *r2 );
+int CompareKeys( const record_t *r1, const record_t *r2 );
+
+
+
+
+
+/*
+================
+buffer_t
+This is the unit of information most commonly dealt with.
+We read into these, and write these out, and use these for
+binning. A single Working Set of these buffers should be
+used for the duration of the program, and they should be
+managed with the buffer lists below.
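+The original note here said sizeof( buffer_t ) == 32 "so the overhead isn't bad";
+fields have been added since (see the struct below), so the real size is larger.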
+================
+*/
+struct buffer_s {
+
+ aFILE *file; // the file this buffer is attached to for IO ops
+ iodevice_t *device; // which IO device this is on (for scheduling IO)
+ int operation; // OP_NONE, OP_PENDING, OP_FINISHED, or the op # returned by aRead/aWrite
+ // while the request is in flight (dmsort.c also parks BIN_SPECIAL here).
+ offset_t numrecs; // the number of valid records in this buffer.
+ offset_t totalrecs; // number of real records in recs
+ int fileop; // sequence number taken from file->op when the request is scheduled;
+ // among pending requests on one file the lowest fileop is started first.
+ record_t *recs; // actual record storage
+ struct buffer_s *next; // for chaining lists together.
+ struct buffer_s *last; // presumably the previous buffer in the list -- confirm in the list code
+ offset_t io_size; // amount of data for i/o in bytes, need not be equal to numrecs
+ long long input_pos; // the sequence offset that this data was read from, only valid during binning phase
+ offset_t io_pos; // the file offset for I/O, set to CURRENT_POS to use the current file seek pointer
+};
+
+
+/*
+================
+buffer_list_t
+Buffer lists are used to manage pools, like the free list,
+the reading list, the to process list, and a list for each bin.
+We use circular lists because they're simpler.
+================
+*/
+typedef struct buffer_list_s {
+ int nitems; // number of buffers currently on the list
+ buffer_t * head; // first buffer of the list; callers test it for NULL before walking
+} buffer_list_t;
+
+
+/*
+================
+working_set_t
+Working sets are collections of buffers. They are useful so that
+you can use a fixed amount of memory to deal with things. The
+problem then becomes internal working set management.
+================
+*/
+typedef struct working_set_s {
+ offset_t size; // actual size of working set in bytes
+ int nbufs; // number of entries in bufs
+ buffer_t *bufs; // array of nbufs buffers; scanned linearly by the IO update routines
+} working_set_t;
+
+
+
+
+// Working Set support.
+// returns resulting size of the entire structure.
+// goalsize is the desired size of the working set in bytes, minrecs and maxrecs
+// are the minimum and maximum desired number of records in buffers. The buffers will
+// be allocated with random sizes in this range until the desired goalsize is reached.
+// this will return 0 in the case of a malloc error or if the goalsize is too small to
+// have any buffers allocated for it.
+int MakeWorkingSet( working_set_t * ws, offset_t goalsize, offset_t minrecs, offset_t maxrecs );
+
+// Working Set support.
+// Reorganize the working set with a different distribution of buffers.
+void ReorganizeWorkingSet( working_set_t * ws, offset_t minrecs, offset_t maxrecs );
+
+
+// this updates all the IO on the working set buffers, querying those that
+// are not in OP_FINISHED or OP_NONE and putting those that finish into OP_FINISHED
+void UpdateWSIOFinishedState( working_set_t * ws );
+
+
+// this updates the IO on a particular device. this routine should probably be
+// called right after UpdateWSIOFinishedState (above), and in addition it should
+// be called once for every device in the system.
+void UpdateDeviceIOExecuteState( working_set_t * ws, iodevice_t * dev );
+
+
+// buffer list manipulations
+// returns argument
+buffer_list_t * InitList( buffer_list_t * list );
+void PushHead( buffer_list_t * list, buffer_t * item );
+void PushTail( buffer_list_t * list, buffer_t * item );
+buffer_t * PopHead( buffer_list_t * list );
+buffer_t * PopTail( buffer_list_t * list );
+// returns second argument
+buffer_t * RemoveItem( buffer_list_t * list, buffer_t * item );
+
+// read and write to/from disk.
+void ReadBuffer( buffer_t * buffer, offset_t num_recs, iodevice_t * dev );
+void WriteBuffer( buffer_t * buffer, offset_t num_recs, iodevice_t * dev );
+
+
+#endif /* _buffer_h_ */
+
diff --git a/libMems/dmSML/dmsort.c b/libMems/dmSML/dmsort.c
new file mode 100644
index 0000000..4c99215
--- /dev/null
+++ b/libMems/dmSML/dmsort.c
@@ -0,0 +1,1942 @@
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include "libMems/dmSML/util.h"
+#include "libMems/dmSML/timing.h"
+#include "libMems/dmSML/asyncio.h"
+#include "libMems/dmSML/buffer.h"
+#include "libMems/dmSML/sorting.h"
+#include "libMems/dmSML/sml.h"
+#include "libMems/dmSML/dmsort.h"
+
+// define this if you're using the ASCII sortgen data.
+// don't define if you're using random data (dmsortgen)
+//#define ASCII_KEYBYTES
+
+// define this if using dmSML with sequences that have large
+// stretches of NNNNN... such as an unfinished eukaryote
+//#define NNNNN_KEYBYTES
+
+// define this if you want to measure the overlapping
+// of your sorting with I/O in the sorting phase --
+// this makes the sort routine do nothing.
+//#define NO_SORT_PERF_TEST
+
+// define the following if you don't want to write
+// data during the sort phase in order to get timings
+//#define NO_WRITE_PERF_TEST
+
+// define this to skip the binning phase in order to
+// perform measurements on the sort phase. The bin
+// files to use during sorting must already exist (duh!)
+// #define NO_BINNING_PERF_TEST
+
+// define this to test the performance of binning and
+// restructuring without bin writing
+//#define NO_BIN_WRITE_PERF_TEST
+
+// define this to test the performance without restructuring
+// each SML bin
+//#define NO_RESTRUCTURE_PERF_TEST
+
+/*
+
+#define NELEMS(x) \
+ ( sizeof((x)) / sizeof((x)[0]) )
+
+
+#define MIN(x,y) ((x)<(y)?(x):(y))
+#define MINRECS (1311)
+#define MAXRECS (1311)
+
+
+// this is somewhat less appealing than a config file,
+// but speed is critical and parsing a config file at
+// startup is just inconvenient. Besides, specifying
+// what we care about is easy enough this way.
+typedef struct device_s {
+ const char *devname;
+ const char *path;
+ iodevice_t dev;
+} device_t;
+*/
+
+device_t *Devices;
+int NumDevices;
+
+
+
+// ugly hack
+//#define BIN_SPECIAL (-10000)
+
+int NSortBufs;
+sort_buf_t *SortBufs;
+
+
+// how the working set is allocated originally.
+offset_t BufferSizeMin;
+offset_t BufferSizeMax;
+
+
+
+/*
+// what we use to represent a bin.
+typedef struct bin_s {
+ aFILE *file; // File we write/read on.
+ int dev; // This is an index into the Devices table.
+ offset_t nrecs; // Number of records written to bin.
+ buffer_list_t bufs; // Our list of buffers that holds our data.
+} bin_t;
+
+*/
+
+// number specified by a cmdline param at runtime.
+bin_t *Bins;
+int NumBins;
+int NumBinDevs; // number of binning devices
+
+/*
+typedef struct seqbuf_s {
+ aFILE *file; // Output file
+ int dev; // device table index for output file
+ offset_t bufpos; // position in current buffer
+ uint64 seq_pos; // position in sequence that is next to translate
+ buffer_list_t bufs; // list of buffers for data
+} seqbuf_t;
+*/
+
+seqbuf_t Seqbuf;
+
+aFILE *Data; // the data to sort
+int DataDev; // the device the data file is on.
+
+
+const char *OutFileName = "unset"; // the output file name.
+aFILE *Output; // the output file (sorted data goes here)
+int OutputDev; // the device the output goes on.
+
+
+
+int BinToRead, BinToWrite, BinToSort;
+
+
+
+working_set_t WS; // the Working Set we use to do our sorting.
+
+offset_t NumRecs; // the total number of blocks to process
+offset_t RecsProcessed; // number of blocks processed (put in bins to write out)
+offset_t RecsRead; // number of records fully read in.
+offset_t RecsUnread; // number of blocks on disk (not yet had 'read' called)
+offset_t RecsCommitted; // number of records committed to be written.
+offset_t RecsWritten; // number of records actually written on disk.
+
+
+// timers
+double RunningTime;
+dmtimer_t *RunningTimer;
+double BinningTime;
+dmtimer_t *BinningTimer;
+double SortingTime;
+dmtimer_t *SortingTimer;
+
+double QSortTime;
+dmtimer_t *QSortTimer;
+
+double ReadIdleTime;
+dmtimer_t *ReadIdleTimer;
+double SortIdleTime;
+dmtimer_t *SortIdleTimer;
+double WriteIdleTime;
+dmtimer_t *WriteIdleTimer;
+
+// buffer lists
+buffer_list_t Free; // the free list
+buffer_list_t ToProcess; // list read and to be processed
+buffer_list_t Reading; // the list that's waiting on stuff to read.
+buffer_list_t Restructure; // buffers that need post-read and pre-binning processing
+
+
+// Take a buffer off the Free list and reset its bookkeeping for a new use.
+// Returns NULL (after printing an error) when the Free list is empty.
+// The record storage (recs) and capacity (totalrecs) are left untouched.
+static buffer_t * AllocateFree( void ) {
+    buffer_t * fresh;
+
+    if( !Free.nitems ) {
+        printf( "error: called AllocateFree but free list is empty\n" );
+        return( NULL );
+    }
+
+    fresh = PopHead( &Free );
+
+    // scrub whatever the previous user left behind
+    fresh->operation = OP_NONE;
+    fresh->numrecs = 0;
+    fresh->file = NULL;
+    fresh->device = NULL;
+    fresh->next = NULL;
+    fresh->last = NULL;
+    return( fresh );
+}
+
+
+static unsigned int divisor = 0;
+
+// Map a key to a bin index using its first three bytes.
+// The three bytes are read as a base-256 number (0 .. 256^3 - 1) and divided
+// by `divisor`, the ceiling of 256^3 / NumBins, which is computed (and
+// printed) on the first call and cached in the file-level static.
+// Assumes keys are evenly distributed.
+static int ComputeBinNumber( const unsigned char key[10] ) {
+    unsigned int prefix;
+
+    if( divisor == 0 ) {
+        // 16777216 == 256^3: three key bytes is all we use for binning
+        const unsigned int nbins = (unsigned)NumBins;
+        divisor = (unsigned)16777216 / nbins;
+        if( (unsigned)16777216 % nbins ) {
+            // round up so the largest prefix still lands in the last bin
+            divisor += 1;
+        }
+        printf( "Divisor is: %u\n", divisor );
+    }
+
+    // big-endian value of key[0..2]
+    prefix = ( (unsigned int)key[0] << 16 )
+           | ( (unsigned int)key[1] << 8 )
+           |   (unsigned int)key[2];
+
+    return( prefix / divisor );
+}
+
+// just like ComputeBinNumber except we reserve one bin for zero keys.
+// Variant of ComputeBinNumber that keeps bin 0 for keys whose first three
+// bytes are all zero; every other key is spread over the remaining
+// NumBins - 1 bins (indices 1 and up).  The divisor is the ceiling of
+// 256^3 / (NumBins - 1), computed and printed on the first call.
+static int ComputeNNNNNBinNumber( const unsigned char key[10] ) {
+    unsigned int prefix;
+
+    if( divisor == 0 ) {
+        const unsigned int usable_bins = (unsigned)NumBins - 1;
+        divisor = (unsigned)16777216 / usable_bins;
+        if( (unsigned)16777216 % usable_bins ) {
+            // need the ceiling, not the floor
+            divisor += 1;
+        }
+        printf( "Divisor is: %u\n", divisor );
+    }
+
+    // big-endian value of key[0..2]
+    prefix = ( (unsigned int)key[0] << 16 )
+           | ( (unsigned int)key[1] << 8 )
+           |   (unsigned int)key[2];
+
+    if( prefix == 0 ) {
+        return 0;
+    }
+    return ( prefix / divisor ) + 1;
+}
+
+
+
+// Bin index for keys made of printable ASCII characters.
+// The first four key characters, each offset by ' ', are read as a base-95
+// number and divided by `divisor`, the ceiling of 95^4 / NumBins, which is
+// computed on the first call and cached in the file-level static.
+static int ComputeAsciiBinNumber( const unsigned char key[10] ) {
+    const unsigned char *p;
+    unsigned int prefix = 0;
+
+    if( divisor == 0 ) {
+        // 81450625 == 95^4: one more than the largest four-character prefix
+        divisor = 81450625 / NumBins;
+        if( 81450625 % NumBins ) {
+            // need the ceiling, not the floor
+            divisor += 1;
+        }
+    }
+
+    // accumulate the four leading characters, most significant first
+    for( p = key; p < key + 4; p++ ) {
+        prefix = prefix * 95 + ( *p - ' ' );
+    }
+
+    return( prefix / divisor );
+}
+
+
+
+static offset_t consumed_recs = 0;
+static buffer_t *toprocess = NULL;
+
+// Distribute records from the ToProcess list into their bins.
+//
+// Works through the buffer held in the file-level `toprocess` (fetching the
+// next one from ToProcess when there is none), copying each record into the
+// head buffer of the bin chosen by the Compute*BinNumber routine selected at
+// compile time.  A bin buffer that becomes full is scheduled for writing.
+// A fully consumed input buffer is returned to the Free list.
+//
+// Returns when ToProcess is empty, or when a bin needs a fresh buffer and the
+// Free list is empty.  In the latter case `toprocess` and `consumed_recs`
+// keep their values, so the next call resumes with the same record.
+static void DoBinning( void ) {
+    while( 1 ) {
+        int bin = -1;
+        // if we don't already have a buffer to process, see if we
+        // can get one.
+        if( toprocess == NULL ) {
+            if( ToProcess.nitems ) {
+                toprocess = PopHead( &(ToProcess) );
+                consumed_recs = 0;
+            } else {
+                // we can't get anything to process
+                return;
+            }
+        }
+        // try to process all the records in the toprocess buffer.
+        for( ; consumed_recs < toprocess->numrecs; consumed_recs++, RecsProcessed++ ) {
+
+            buffer_t *headbuf;
+            record_t *rec = &(toprocess->recs[consumed_recs]);
+
+            // find what bin this next record belongs in.
+#ifdef ASCII_KEYBYTES
+            bin = ComputeAsciiBinNumber( rec->key );
+#else
+#ifdef NNNNN_KEYBYTES
+            bin = ComputeNNNNNBinNumber( rec->key );
+#else
+            bin = ComputeBinNumber( rec->key );
+#endif
+#endif
+            // NOTE(review): an out-of-range bin is only reported; the code
+            // below still indexes Bins[] with it.
+            if( (bin >= NumBins) || (bin < 0) ) {
+                printf( "error: invalid bin from ComputeBinNumber: %d\n", bin );
+            }
+
+            // now, let's see what the situation is with that bin and its
+            // buffers. In particular, do we have a spot to put this record?
+            headbuf = Bins[bin].bufs.head;
+            // if we have a buffer, and the buffer is full or executing or
+            // if there's no buffer at all, let's try to get one
+            if( !headbuf ||
+                headbuf->numrecs == headbuf->totalrecs ||
+                headbuf->operation != OP_NONE ) {
+                // first see if this is our 'special' buffer and if we can use it.
+                // headbuf may be NULL here (bin with no buffer yet), so it must
+                // be checked before it is dereferenced.
+                if( headbuf && headbuf->operation == BIN_SPECIAL ) {
+                    // the bin's only buffer has finished writing: reclaim it
+                    headbuf->numrecs = 0;
+                    headbuf->operation = OP_NONE;
+                } else {
+                    if( Free.nitems ) {
+                        PushHead( &(Bins[bin].bufs), AllocateFree() );
+                        headbuf = Bins[bin].bufs.head;
+                    } else {
+                        // no free buffers to use for bin -- binning BLOCKS!
+                        return;
+                    }
+                }
+            }
+            // now headbuf must exist, and it must be non-full so we can
+            // add our item.
+            headbuf->recs[headbuf->numrecs++] = *rec;
+            Bins[bin].nrecs++;
+            // if we made it full, write the thing
+            if( headbuf->numrecs >= headbuf->totalrecs ) {
+                headbuf->file = Bins[bin].file;
+                headbuf->device = &(Devices[Bins[bin].dev].dev);
+                RecsCommitted += headbuf->numrecs;
+#ifdef NO_BIN_WRITE_PERF_TEST
+                // just put it in the finished state
+                headbuf->operation = OP_FINISHED;
+#else
+                WriteBuffer( headbuf, headbuf->numrecs, headbuf->device );
+#endif
+                headbuf = NULL;
+            }
+
+        }
+
+        // if we hit the end of this buffer,
+        // put it back on the free list, and start the loop over
+        if( consumed_recs >= toprocess->numrecs ) {
+            PushTail( &Free, toprocess );
+            toprocess = NULL;
+        }
+
+    }
+}
+
+
+
+
+
+// Flush pass for the end of binning: empties every bin's buffer list and
+// schedules a write for each buffer that is idle (OP_NONE) and holds at least
+// one record.  The records scheduled here are added to RecsCommitted.
+// Buffers in any other state are simply removed from the bin's list.
+void FinishBinning() {
+    offset_t flushed = 0;
+    int binI;
+
+    for( binI = 0; binI < NumBins; binI++ ) {
+        buffer_list_t *pending = &(Bins[binI].bufs);
+
+        while( pending->nitems ) {
+            buffer_t *cur = PopHead( pending );
+
+            // only idle, non-empty buffers still have data to be written
+            if( cur->operation != OP_NONE || !cur->numrecs ) {
+                continue;
+            }
+
+            flushed += cur->numrecs;
+            cur->file = Bins[binI].file;
+            cur->device = &(Devices[Bins[binI].dev].dev);
+#ifdef NO_BIN_WRITE_PERF_TEST
+            // just put it in the finished state
+            cur->operation = OP_FINISHED;
+#else
+            WriteBuffer( cur, cur->numrecs, cur->device );
+#endif
+        }
+    }
+
+    RecsCommitted += flushed;
+}
+
+
+
+// Number of input bytes to read for buffer b: one byte per record the buffer
+// will produce (its capacity or what is left unread, whichever is smaller)
+// plus the mask_length - 1 trailing bytes that the last seed needs.
+// (Traditional dmsort would instead read MIN(totalrecs, RecsUnread) * sizeof( record_t ).)
+offset_t CalculateDataReadSize( buffer_t* b ){
+    offset_t read_size;
+
+    read_size = MIN( b->totalrecs + mask_length - 1, RecsUnread + mask_length - 1 );
+    return read_size;
+}
+
+// Start at most one new read of the input file.
+// Does nothing unless there is unread input and a buffer on the Free list.
+// Otherwise a free buffer is scheduled to read the next chunk of the input
+// (at explicit file offset NumRecs - RecsUnread) and is appended to the
+// Reading list; RecsUnread is reduced by the number of records requested.
+static void DoReading( void ) {
+    buffer_t * rdbuf;
+    offset_t nrecs;
+
+    if( !RecsUnread || !Free.nitems ) {
+        return;
+    }
+
+    rdbuf = AllocateFree();
+    rdbuf->file = Data;
+
+    // ask for a full buffer, or whatever is left of the input
+    nrecs = MIN( rdbuf->totalrecs, RecsUnread );
+    ReadBuffer( rdbuf, nrecs, &(Devices[DataDev].dev) );
+
+    // ReadBuffer filled in defaults; override position and size, since
+    // consecutive reads overlap by mask_length - 1 characters
+    rdbuf->input_pos = NumRecs - RecsUnread;
+    rdbuf->io_pos = rdbuf->input_pos;
+    rdbuf->io_size = CalculateDataReadSize( rdbuf );
+
+    RecsUnread -= nrecs;
+
+    PushTail( &Reading, rdbuf );
+}
+
+
+
+
+
+// Scan every bin's buffer list for buffers whose write has completed
+// (OP_FINISHED) and account for them in RecsWritten.  A finished buffer is
+// moved to the Free list, unless it is the only buffer left on its bin, in
+// which case it stays on the bin and is tagged BIN_SPECIAL.
+static void HandleBinWriteCompletions( void ) {
+ int i;
+ buffer_t *b, *tmpnext;
+ //printf( "handle bin write completions\n" );
+ for( i = 0; i < NumBins; i++ ) {
+ b = Bins[i].bufs.head;
+ do {
+ // bin has no buffers at all
+ if( !b ) {
+ break;
+ }
+ // remember the successor now: b may be unlinked from the list below
+ tmpnext = b->next;
+ if( b->operation == OP_FINISHED ) {
+ RecsWritten += b->numrecs;
+ if( Bins[i].bufs.nitems > 1 ) {
+ b->operation = OP_NONE;
+ PushHead( &Free, RemoveItem( &(Bins[i].bufs), b ) );
+ } else {
+ // last buffer of the bin: keep it and mark it for later reuse
+ b->operation = BIN_SPECIAL;
+ }
+ }
+ b = tmpnext;
+ // stop after coming back around to the head, or once one buffer is left
+ } while( b != Bins[i].bufs.head && Bins[i].bufs.nitems > 1 );
+ }
+}
+
+// Scan Seqbuf's buffer list for buffers whose write has completed
+// (OP_FINISHED) and move them to the Free list.  When only one buffer is
+// left on the list it is not moved and stays in the OP_FINISHED state.
+static void HandleSeqbufWriteCompletions( void ) {
+ buffer_t *b, *tmpnext;
+ //printf( "handle seqbuf write completions\n" );
+ b = Seqbuf.bufs.head;
+ do {
+ // list is empty
+ if( !b ) {
+ break;
+ }
+ // remember the successor now: b may be unlinked from the list below
+ tmpnext = b->next;
+ if( b->operation == OP_FINISHED ) {
+ if( Seqbuf.bufs.nitems > 1 ) {
+ b->operation = OP_NONE;
+ PushHead( &Free, RemoveItem( &(Seqbuf.bufs), b ) );
+ }
+ }
+ b = tmpnext;
+ // stop after coming back around to the head, or once one buffer is left
+ } while( b != Seqbuf.bufs.head && Seqbuf.bufs.nitems > 1 );
+}
+
+#define ALPHA_BITS 2
+
+// Pack `len` characters of `src` into 32-bit words at `dest`, ALPHA_BITS bits
+// per character, using DNA_TABLE to map each character to its code.
+// Characters are packed most-significant-first within each word; a final,
+// partially filled word is shifted left so its unused bits are the low ones.
+// Does nothing when len is 0.
+//
+// dest must have room for every word that is touched; each word is
+// overwritten, not OR-ed into.
+static void Translate32(uint32* dest, const char* src, const unsigned len){
+    uint8 start_bit = 0;    // bits already used in the current word
+    unsigned cur_word = 0;  // index of the word being filled
+    uint32 word_mer = 0;    // value of the word being filled
+    uint32 i = 0;
+    if( len == 0 )
+        return;
+    for(i=0; i < len; i++){
+        if(start_bit + ALPHA_BITS <= 32){
+            word_mer <<= ALPHA_BITS;
+            // index the table with an unsigned value: plain char may be
+            // signed, and a byte >= 0x80 would otherwise index before the
+            // start of the table.
+            word_mer |= DNA_TABLE[(unsigned char)src[i]];
+            dest[cur_word] = word_mer;
+            start_bit += ALPHA_BITS;
+            // word full and more input to come: move on to the next word
+            if(start_bit >= 32 && i < len - 1){
+                word_mer = 0;
+                start_bit %= 32;
+                cur_word++;
+            }
+        }else{
+            // only reachable if a character could straddle two words, which
+            // cannot happen while ALPHA_BITS divides 32.
+            printf("Error, this should never happen with DNA sequence\n" );
+        }
+    }
+    // left-justify the last word
+    if( start_bit != 0 ){
+        dest[cur_word] <<= 32 - start_bit;
+    }
+}
+
+
+// Post-read processing of raw sequence buffers on the Restructure list.
+//
+// Buffers are handled strictly in sequence order: only a buffer whose
+// input_pos equals Seqbuf.seq_pos is processed.  For such a buffer b:
+//   1. its characters are packed at 2 bits per base into the head buffer of
+//      Seqbuf, which is scheduled for writing when it is full or when b is
+//      the last read of the input;
+//   2. the raw characters in b are replaced, in place and back to front, by
+//      one record per sequence position: the key is the seed-masked mer or
+//      its reverse complement, whichever compares lower, and the position is
+//      stored through the sml_t view of the same memory;
+//   3. b is moved to the ToProcess list.
+//
+// Returns early, leaving b on the Restructure list, when Seqbuf needs a new
+// buffer and the Free list is empty.
+void RestructureReadSMLBins( void ) {
+    char little_endian = 1;
+    mask_t bit;
+    mask_t mer, rc_mer;
+    record_t forward, reverse;
+    record_t begin[6]; // the first six records could potentially overwrite the sequence
+    int i;
+    offset_t seqI, extras, weight;
+    char* sequence;
+    sml_t *sml;
+
+    buffer_t *b, *tmpnext;
+
+    // variables for translation to 2-bit per base
+    buffer_t *headbuf;
+    int seq_bit;
+    int seq_word;
+    int word_remainder;
+    offset_t translate_length;
+    int config_value = 4554307;
+//  int seq_offset;
+    // go through and see if any have completed.
+    b = Restructure.head;
+    do {
+        if( !b ) {
+            break;
+        }
+        // is this the buffer we need to translate next?
+        if( b->input_pos != Seqbuf.seq_pos ){
+            b = b->next;
+            continue;
+        }
+
+        tmpnext = b->next;
+        sequence = (char *)b->recs;
+        sml = (sml_t*)b->recs;
+
+        // translate the sequence that was just read and write it out
+        headbuf = Seqbuf.bufs.head;
+        // if we have a buffer, and the buffer is full or executing or
+        // if there's no buffer at all, let's try to get one
+        if( !headbuf ||
+            headbuf->operation != OP_NONE ) {
+            // first see if the head buffer has finished writing and can be
+            // reused.  headbuf may be NULL here (no buffer yet), so it must
+            // be checked before it is dereferenced.
+            if( headbuf && headbuf->operation == OP_FINISHED ) {
+                headbuf->numrecs = 0;
+                headbuf->operation = OP_NONE;
+                Seqbuf.bufpos = 0;
+            } else {
+                if( Free.nitems ) {
+                    PushHead( &(Seqbuf.bufs), AllocateFree() );
+                    headbuf = Seqbuf.bufs.head;
+                    Seqbuf.bufpos = 0;
+                } else {
+                    // no free buffers to use for Seqbuf -- restructuring BLOCKS!
+                    return;
+                }
+            }
+        }
+
+        // bufpos counts bases; find the first whole 32-bit word still free
+        seq_bit = Seqbuf.bufpos * 2;
+        seq_word = seq_bit / 32;
+        word_remainder = seq_bit % 32;
+        if( word_remainder != 0 ){
+            seq_word++;
+        }
+
+//      int end_bit = 2 * (Seqbuf->bufpos + b->io_size - mask_length + 1);
+//      int end_remainder = end_bit % 32;
+        translate_length = b->io_size - mask_length + 1 - (word_remainder / 2);
+        if( b->io_size + b->input_pos >= NumRecs ){
+            // this is the last I/O, translate the whole thing
+            translate_length += mask_length - 1;
+        }
+//      translate_length -= end_remainder / 2;
+
+        // The number of bytes in headbuf->recs must ALWAYS be divisible by 4 when using
+        // Translate32, otherwise corruption will result
+#ifndef NO_RESTRUCTURE_PERF_TEST
+        Translate32( (uint32*)(headbuf->recs) + seq_word, ((char*)b->recs) + (word_remainder / 2), translate_length );
+#endif
+
+        // need to fill in beginning
+        if( word_remainder != 0 ){
+            int begin_mer = 0;
+            for( seqI = 0; seqI < word_remainder / 2; seqI++ ){
+                begin_mer <<= 2;
+                // unsigned index: plain char may be signed
+                begin_mer |= DNA_TABLE[ (unsigned char)sequence[ seqI ] ];
+            }
+//          ((uint32*)headbuf->recs)[ seq_word - 1 ] <<= 32 - word_remainder;
+            ((uint32*)headbuf->recs)[ seq_word - 1 ] |= begin_mer;
+        }
+
+        Seqbuf.bufpos += translate_length + (word_remainder / 2);
+        Seqbuf.seq_pos += translate_length + (word_remainder / 2);
+
+        // if we made it full, write the thing
+        // each buf will consume headbuf->totalrecs / 4 bytes.
+        // there are headbuf->totalrecs * sizeof( record_t ) bytes available in the Seqbuf.
+        // thus we can fit 4 * sizeof( record_t ) bufs in each Seqbuf
+        if( Seqbuf.bufpos == headbuf->totalrecs * sizeof( record_t ) * 4 ||
+            b->io_size + b->input_pos >= NumRecs ) {
+            headbuf->file = Seqbuf.file;
+            headbuf->device = &(Devices[Seqbuf.dev].dev);
+            WriteBuffer( headbuf, headbuf->totalrecs, headbuf->device );
+            // override the size set by WriteBuffer: four bases per byte
+            headbuf->io_size = Seqbuf.bufpos / 4;
+            if( b->io_size + b->input_pos >= NumRecs ){
+                // last write: round up to a multiple of 4 bytes and append 8 zero bytes
+                offset_t offI = 0;
+                offI = headbuf->io_size % 4;
+                if( offI != 0 )
+                    headbuf->io_size += 4 - offI;
+                for( offI = 0; offI < 8; offI++ )
+                    ((char*)headbuf->recs)[ headbuf->io_size + offI ] = 0;
+                headbuf->io_size += 8;
+            }
+            headbuf = NULL;
+        }else if( Seqbuf.bufpos > headbuf->totalrecs * sizeof( record_t ) * 4 ){
+            printf( "Error. Over filled Seqbuf\n" );
+        }
+
+
+        // translate the sequence according to the current sequence mask
+#ifndef NO_RESTRUCTURE_PERF_TEST
+        // walk backwards so a record never overwrites characters still to be read
+        for( seqI = b->io_size - mask_length + 1; seqI > 0; seqI-- ){
+            bit = 1;
+            bit <<= mask_length - 1;
+            mer = 0;
+            weight = 0;
+            for( i = 0; i < mask_length; i++ ){
+                if( bit & seed_mask ){
+                    mer <<= 2;
+                    // unsigned index: plain char may be signed
+                    mer |= DNA_TABLE[ (unsigned char)sequence[ seqI + i - 1 ] ];
+                }
+                bit >>= 1;
+            }
+            // copy the mer from the 64-bit integer based on the endian-ness of the system
+            // copy mer to forward key
+            mer <<= 64 - (2 * mask_weight);
+//          if( seqI + b->input_pos == config_value )
+//              __asm( nop );
+            if( little_endian ){
+                for( i = 0; i < MASK_T_BYTES; i++ )
+                    forward.key[i] = ((char*)(&mer))[ sizeof( mer ) - i - 1 ];
+
+            }else{
+                for( i = 0; i < MASK_T_BYTES; i++ )
+                    forward.key[i] = ((char*)(&mer))[ i ];
+            }
+
+            // reverse complement the mer
+            mer = ~mer;
+            // start from a defined value instead of reading rc_mer uninitialised
+            rc_mer = 0;
+            for( i = 0; i < 64; i += 2 ){
+                rc_mer <<= 2;
+                rc_mer |= mer & 3;
+                mer >>= 2;
+            }
+            rc_mer <<= 64 - (2 * mask_weight);
+            // copy mer to reverse key
+            if( little_endian ){
+                for( i = 0; i < MASK_T_BYTES; i++ )
+                    reverse.key[i] = ((char*)(&rc_mer))[ sizeof( mer ) - i - 1 ];
+            }else{
+                for( i = 0; i < MASK_T_BYTES; i++ )
+                    reverse.key[i] = (((char*)(&rc_mer))[i]);
+            }
+            // put the lesser key in forward
+            if( COMPARE_KEYS( forward, reverse ) > 0)
+                forward = reverse;
+
+            // watch out for the last 6 records
+            if( seqI <= 6 ){
+                begin[ seqI - 1] = forward;
+            }else{
+                b->recs[ seqI - 1 ] = forward;
+                // set the position
+                sml[ seqI - 1 ].pos = b->input_pos + seqI - 1;
+            }
+        }
+
+        extras = b->io_size - mask_length + 1 < 6 ? b->io_size - mask_length + 1 : 6;
+
+        // fill in the first six records (seqI is 0 after the loop above)
+        for(; seqI < extras; seqI++ ){
+            b->recs[ seqI ] = begin[ seqI ];
+            // set the position
+            sml[ seqI ].pos = b->input_pos + seqI;
+        }
+#else
+        if(1){ // define a new scope so the variables can be local
+            // simulate random data in each bin
+            int i;
+            unsigned int keyval = 0;
+            unsigned int tmpval = 0;
+            if( divisor == 0 ) {
+                divisor = (unsigned)16777216 / (unsigned)NumBins;
+                // need ceiling of this
+                divisor += (unsigned)16777216 % (unsigned)NumBins ? 1 : 0;
+                printf( "Divisor is: %u\n", divisor );
+            }
+            for( seqI = 0; seqI < b->numrecs; seqI++ ){
+                tmpval = keyval;
+                for( i = 3; i > 0; i-- ) {
+                    b->recs[ seqI ].key[ i - 1 ] = (tmpval & 0xFF);
+                    b->recs[ seqI ].key[ i - 1 ] = 0;
+                    tmpval >>= 8;
+                }
+                keyval += divisor;
+            }
+        }
+#endif
+
+        // b has been restructured, add it to the ToProcess list
+        PushTail( &ToProcess, RemoveItem( &Restructure, b ) );
+
+
+        b = tmpnext;
+    } while( b != Restructure.head && Restructure.nitems );
+}
+
+static void HandleReadingCompletions( void ) {
+ buffer_t *b, *tmpnext;
+ // just go through and see if any have completed.
+ b = Reading.head;
+ do {
+ if( !b ) {
+ break;
+ }
+ tmpnext = b->next;
+ if( b->operation == OP_FINISHED ) {
+ // migrate this to the toprocess list
+ b->operation = OP_NONE;
+ PushTail( &Restructure, RemoveItem( &Reading, b ) );
+ // bookkeeping
+ RecsRead += b->numrecs;
+ }
+ b = tmpnext;
+ } while( b != Reading.head && Reading.nitems );
+}
+
+
+
+// Print the one-line command synopsis to stdout; pname is the program name.
+void print_usage( const char* pname ){
+    printf( "Usage: %s <-m Working set size in MB> <-b buffer size> <-i input file> "
+            "<-o output file> [-n number of records] <bin directory> <num bins> ... "
+            "[bin directory] [num bins]\n",
+            pname );
+}
+
+int InitdmSML( long working_mb, long buffer_size, const char* input_filename, const char* output_filename, const char* const* scratch_paths, uint64 seed ) {
+ int i, j;
+ offset_t desired_ws_size, actual_ws_size;
+ SMLHeader_t header;
+ struct {
+ const char * bin_dev;
+ int devnum;
+ int nbins;
+ } bins[8];
+
+ char *bin_name;
+ int scratchI = 0;
+
+ // initialize the timing stuff
+ InitTime();
+
+ // start the running timer now.
+ RunningTime = 0;
+ RunningTimer = StartTimer();
+
+ if( working_mb != 0 ){
+
+ desired_ws_size = working_mb;
+ desired_ws_size *= 1024 * 1024; // convert to bytes
+
+ }else{
+ // set desired working set size to half of physical memory...
+#ifdef WIN32
+ {
+/* MEMORYSTATUSEX ms;
+ memset( &ms, 0, sizeof( MEMORYSTATUSEX ) );
+ GlobalMemoryStatusEx( &ms );
+ desired_ws_size = ms.ullTotalPhys / 2;
+*/
+ MEMORYSTATUS ms;
+ memset( &ms, 0, sizeof( MEMORYSTATUS ) );
+ GlobalMemoryStatus( &ms );
+ desired_ws_size = ms.dwTotalPhys / 2;
+ }
+#else
+
+ {
+ // get it from /proc/meminfo
+ FILE *fp = fopen("/proc/meminfo", "r");
+ if ( fp )
+ {
+ long memTotal;
+
+ char buf[1024];
+ if ( fgets(buf, sizeof(buf), fp) )
+ {
+ sscanf(buf, "MemTotal: %ld kB", &memTotal);
+ fprintf( stderr, buf );
+ }
+ fclose(fp);
+ // allocate about 6/10 of physical memory
+ // leave the rest for buffer cache
+ desired_ws_size = memTotal * 512;
+ }
+ }
+
+#endif
+ // never allocate more than 2GB
+ if( desired_ws_size / 1024 > 2048 * 1024 ){
+ desired_ws_size = 1024 * 1024;
+ desired_ws_size *= 2048;
+ }
+// desired_ws_size /= sizeof( record_t ); // get working set size in records
+ }
+
+ if( buffer_size == 0 ){
+ buffer_size = 1;
+ while( desired_ws_size / (buffer_size*sizeof(record_t)) > 2048 ){
+ buffer_size *= 2;
+ }
+ }
+
+ BufferSizeMin = BufferSizeMax = buffer_size;
+ OutFileName = output_filename;
+
+ // find out how many scratch paths were given before the null terminator
+ for( ; ; scratchI++ ){
+ if( !scratch_paths || scratch_paths[ scratchI ] == NULL )
+ break;
+ }
+
+
+
+ NumBinDevs = scratchI;
+ NumDevices = 2 + NumBinDevs;
+ Devices = (device_t*)malloc( NumDevices * sizeof(device_t) );
+ DataDev = 0;
+ OutputDev = 1;
+ Devices[DataDev].devname = "Input device";
+ Devices[DataDev].path = input_filename;
+ Devices[DataDev].dev.buf = NULL;
+ Devices[OutputDev].devname = "Output device";
+ Devices[OutputDev].path = OutFileName;
+ Devices[OutputDev].dev.buf = NULL;
+
+
+ if( NumBinDevs == 0 ) {
+ return TOO_FEW_BINS;
+ } else if( NumBinDevs > 8 ) {
+ return TOO_MANY_BINS;
+ }
+
+ NumRecs = aStatFileSize( input_filename );
+
+ // calculate number of bins using nrecs and ws_size
+ NumBins = desired_ws_size / (200 * NumBinDevs);
+ NumBins = NumRecs / NumBins;
+ NumBins = NumBins < 5 * NumBinDevs ? 5 * NumBinDevs : NumBins; // don't allow fewer than 5 bins per dev
+ // round for equal number of bins per dev
+ if( NumBins % NumBinDevs != 0 )
+ NumBins = ( (NumBins / NumBinDevs) + 1 ) * NumBinDevs;
+ printf( "Creating %d bin files\n", NumBins );
+ for( i = 2; i < NumDevices; i++ ){
+ bin_name = (char*)malloc( 10 );
+ strcpy( bin_name, "bin dev__" );
+ bin_name[8] = 0x40 + i - 2;
+ Devices[i].devname = bin_name;
+ Devices[i].path = scratch_paths[ i - 2 ];
+ Devices[i].dev.buf = NULL;
+ bins[i - 2].bin_dev = bin_name;
+ bins[i - 2].nbins = NumBins / NumBinDevs; // allocate even an portion of bins per device
+ bins[i - 2].devnum = i;
+ }
+
+ // get buffer size.
+ if( BufferSizeMin == 0 ) {
+ BufferSizeMin = MINRECS;
+ BufferSizeMax = MAXRECS;
+ }
+
+
+ // open the input file
+ Data = aOpen( input_filename, A_READ );
+ if( Data == NULL ) {
+ printf( "couldn't open data file\n" );
+ return INPUT_NOT_OPENED;
+ }
+
+ // get working set size
+ if( desired_ws_size == 0 ) {
+ printf( "invalid working set size (%llu) -- must be at least 0\n", desired_ws_size );
+ return INVALID_WS_SIZE;
+ }
+
+ // init translation table
+ DNA_TABLE = CreateBasicDNATable();
+
+ // open the output file
+ Output = aOpen( OutFileName, A_WRITE );
+ if( !Output ) {
+ printf( "couldn't open output file!\n" );
+ return OUTPUT_NOT_OPENED;
+ }
+
+ header = InitSML( Output, NumRecs, seed );
+ seed_mask = header.seed;
+ mask_length = header.seed_length;
+ mask_weight = header.seed_weight;
+
+ if( NumRecs <= mask_length - 1 ){
+ printf( "Sequence must be at least %d characters in length\n", mask_length );
+ return SEQUENCE_TOO_SHORT;
+ }
+
+ NumRecs -= mask_length - 1;
+ printf( "NumRecs is: %llu \n", NumRecs );
+ // get the number of records we should process
+ RecsProcessed = 0;
+ RecsUnread = NumRecs;
+ if( NumRecs <= 0 ) {
+ return INVALID_NUMRECS;
+ printf( "invalid NumRecs: %llu\n", NumRecs );
+ }
+
+
+
+ // go ahead and create the working set.
+ actual_ws_size = MakeWorkingSet( &WS, desired_ws_size, BufferSizeMin, BufferSizeMax );
+ printf( "desired working set: %llu, actual working set: %llu\n",
+ desired_ws_size, actual_ws_size );
+
+ // initialize the Free list -- just put all the buffers on it.
+ for( i = 0; i < WS.nbufs; i++ ) {
+ PushHead( &Free, &(WS.bufs[i]) );
+ }
+
+ printf( "working set size : %llu\n", actual_ws_size );
+ printf( "total buffers : %d\n", WS.nbufs );
+ // FIXME: can any touching of the memory here help us?
+ // toprocess and reading list empty to start
+ ToProcess.nitems = Reading.nitems = 0;
+ ToProcess.head = Reading.head = NULL;
+ Restructure.nitems = 0;
+ Restructure.head = NULL;
+
+ // allocate Seqbuf
+ Seqbuf.file = Output;
+ Seqbuf.dev = OutputDev;
+ Seqbuf.bufpos = 0;
+ Seqbuf.seq_pos = 0;
+ if( Free.nitems ) {
+ PushHead( &(Seqbuf.bufs), AllocateFree() );
+ } else {
+ printf( "error: could not give a buffer to Seqbuf\n" );
+ return NO_FREE_BUFFERS;
+ }
+
+ // allocate the bins.
+ Bins = malloc( sizeof( *Bins ) * NumBins );
+ memset( Bins, 0, sizeof( *Bins ) * NumBins );
+
+ // allocate the bins in a round-robin fashion, so when we read
+ // things back for sorting, we're not swamping one device at a time --
+ // instead, things are spread out.
+ printf( "opening %d bins\n", NumBins );
+ j = -1;
+ for( i = 0; i < NumBins; i++ ) {
+ // find a bin on the next device.
+ while( 1 ) {
+ j = (j+1) % NumBinDevs;
+ if( bins[j].nbins ) {
+ // make this bin on that device, and
+ // round-robin switch to the next device.
+ const char *fname = Fmt("%sout%05d.binned",Devices[bins[j].devnum].path,i);
+ Bins[i].dev = bins[j].devnum;
+ Bins[i].fname = malloc( strlen( fname ) + 1 );
+ strcpy( Bins[i].fname, fname );
+
+#ifndef NO_BINNING_PERF_TEST
+ Bins[i].file = aOpen( fname, A_WRITE );
+ //printf( "opened '%s' on device '%s'\n", fname, Devices[bins[j].devnum].devname );
+ if( Bins[i].file == NULL ) {
+ printf( "couldn't open output bin file '%s'\n", fname );
+ return BIN_NOT_OPENED;
+ }
+#else
+ Bins[i].nrecs = aStatSize( fname );
+ if( Bins[i].nrecs == 0 ){
+ // just make sure the file exists
+ Bins[i].file = aOpen( fname, A_WRITE );
+ aClose( Bins[i].file );
+ Bins[i].file = NULL;
+ }
+#endif // NO_BINNING_PERF_TEST
+ bins[j].nbins--;
+ break;
+ }
+ }
+ }
+
+ // now we allocate one buffer for each bin
+ // and each bin will hold onto at least one buffer
+ // so that we can guarantee no locking cases
+ for( i = 0; i < NumBins; i++ ) {
+ if( Free.nitems ) {
+ PushHead( &(Bins[i].bufs), AllocateFree() );
+ } else {
+ printf( "error: could not give one buffer to each bin\n" );
+ return NO_FREE_BUFFERS;
+ }
+ }
+
+ // all went well
+ return 0;
+}
+
+
+
+// Write the column legend for the rows produced by DisplayStatus() to stdout.
+void DisplayStatusHeader( void ) {
+    static const char legend[] =
+        "time recs_read recs_processed recs_committed recs_written binning_rate free reading toprocess bins restructure\n";
+
+    fputs( legend, stdout );
+}
+
+
+// Print one row of progress counters to stdout: running time, the four record
+// counters, the binning rate (RecsProcessed / RunningTime) and the sizes of the
+// buffer lists.  The values are emitted in the same order as the legend printed
+// by DisplayStatusHeader().
+void DisplayStatus( void ) {
+
+ // The "bins" column is derived rather than stored: it is what is left of the
+ // working set once the Free, Reading, ToProcess and Restructure lists are
+ // subtracted from the total buffer count.
+ printf( "%f %llu %llu %llu %llu %f %d %d %d %d %d\n",
+ RunningTime, RecsRead, RecsProcessed, RecsCommitted, RecsWritten,
+ RecsProcessed/RunningTime, Free.nitems, Reading.nitems, ToProcess.nitems,
+ WS.nbufs - Free.nitems - Reading.nitems - ToProcess.nitems - Restructure.nitems, Restructure.nitems );
+
+ // Disabled verbose report (per-bin buffer counts and per-device state).
+ /*
+ int i;
+ printf( "-----------------------------------------------------------\n" );
+ printf( "Records Processed : %d/%d\n", RecsProcessed, NumRecs );
+ printf( "Records Committed : %d\n", RecsCommitted );
+ printf( "Records Written : %d\n", RecsWritten );
+ printf( "Records Read : %d\n", RecsRead );
+ printf( "Running Time : %f seconds\n", RunningTime );
+ printf( "Binning Rate : %f records/sec (%f bytes/sec)\n",
+ RecsProcessed / RunningTime, RecsProcessed * sizeof(record_t) / RunningTime );
+ printf( "Freelist entries : %d\n", Free.nitems );
+ printf( "Reading entries : %d\n", Reading.nitems );
+ printf( "ToProcess entries : %d\n", ToProcess.nitems );
+ printf( "Bin entries:\n" );
+ for( i = 0; i < NumBins; i++ ) {
+ printf( " %4d : %4d\n", i, Bins[i].bufs.nitems );
+ }
+ printf( "Device status:\n" );
+ for( i = 0; i < NumDevices; i++ ) {
+ printf( " %d : '%16s' : '%16s' : %s\n", i, Devices[i].devname,
+ Devices[i].path, Devices[i].dev.state == DEV_FREE ? "FREE" : "BUSY" );
+ }
+ */
+}
+
+
+// One step of the binning-phase I/O pump: refresh the async operations on the
+// input file, every bin file and the output file, let the working set update
+// the state of finished operations, then let each device start new operations.
+void UpdateIOState( void ) {
+    int bin;
+    int dev;
+
+    // refresh async operations: input, then all bins, then output
+    aUpdateOperations( Data );
+    for( bin = 0; bin < NumBins; bin++ )
+        aUpdateOperations( Bins[bin].file );
+    aUpdateOperations( Output );
+
+    // working set adjusts operation states
+    UpdateWSIOFinishedState( &WS );
+
+    // devices may now start new operations
+    for( dev = 0; dev < NumDevices; dev++ )
+        UpdateDeviceIOExecuteState( &WS, &(Devices[dev].dev) );
+}
+
+
+// Spin on UpdateIOState() until no working-set buffer that has both a device
+// and a file attached still reports an operation that is OP_PENDING or greater
+// than OP_NONE, then print how long the wait took.
+void EnsureAllOperationsComplete( void ) {
+    dmtimer_t *stopwatch = StartTimer();
+    int busy;
+    int b;
+
+    do {
+        UpdateIOState();
+
+        // scan for the first buffer that is still in flight
+        busy = 0;
+        for( b = 0; b < WS.nbufs && !busy; b++ ) {
+            if( !WS.bufs[b].device || !WS.bufs[b].file )
+                continue;
+            if( WS.bufs[b].operation == OP_PENDING || WS.bufs[b].operation > OP_NONE )
+                busy = 1;
+        }
+    } while( busy );
+
+    printf( "Ensure All Operations Complete: %d msec\n", ReadTimer( stopwatch ) );
+    StopTimer( stopwatch );
+}
+
+
+
+
+
+// RunningTime value at which the last status row was printed by BinningPhase().
+static double lasttime = 0;
+
+// First pass of the sort: loop until RecsProcessed reaches NumRecs, on every
+// iteration pumping the async I/O state, handling read/write completions and
+// scheduling more reading and binning.  Afterwards drain all outstanding I/O
+// and close the input file and every bin file.
+void BinningPhase( void ) {
+
+ int i;
+ // for progress output
+ int iter;
+ // NOTE(review): timeaccum is only ever assigned 0 below and never read.
+ int timeaccum;
+
+ // the main loop.
+ printf( "----------------- Starting -----------------\n" );
+ printf( "working set buffers : %d\n", WS.nbufs );
+ printf( "number of bins : %d\n", NumBins );
+ timeaccum = 0;
+ iter = 0;
+ DisplayStatusHeader();
+ while( RecsProcessed < NumRecs ) {
+
+ // print status every few seconds or so.
+ // not until timing gets fixed
+ //if( RunningTime - lasttime >= 5.0f ) {
+ if( (RunningTime - lasttime) >= 2.0f ) {
+ DisplayStatus();
+ lasttime = RunningTime;
+ }
+
+ // keep the async io running
+ // first update the operations on all our files.
+ UpdateIOState();
+
+ // Handle read and write completions
+ // (transition reads to ToProcess, writes to Free)
+ HandleReadingCompletions();
+ HandleSeqbufWriteCompletions();
+ RestructureReadSMLBins();
+ HandleBinWriteCompletions();
+
+ // do reading and binning
+ DoReading();
+ DoBinning();
+
+ // finish up the loop.
+ iter++;
+
+ // RunningTime is kept in seconds; ReadTimer's result is divided by 1000.
+ RunningTime = (double)ReadTimer( RunningTimer ) / 1000.0;
+
+ }
+
+ printf( "total iters: %d\n", iter );
+ // now, we *must* take care to make sure all writes have completed
+ // We can't simply call aClose on a file. It's true that that will
+ // wait until all the currently scheduled operations on that file
+ // complete, but with the device method, we only allow one operation
+ // on any device at a time. Thus, we must ask the device managers
+ // to complete their own IO.
+ // FIXME: this could potentially be moved into the buffer stuff for
+ // a DeviceClose type of call, but then if there is lots of stuff
+ // pending, unless DeviceClose could know about more than one device
+ // at a time, we would get effectively synchronous IO here, so we
+ // have the ugly hack for now.
+ FinishBinning();
+ EnsureAllOperationsComplete();
+
+ // close the input file.
+ aClose( Data );
+ Data = NULL;
+ // Finally, close all the bin files
+ for( i = 0; i < NumBins; i++ ) {
+ aClose( Bins[i].file );
+ Bins[i].file = NULL;
+ }
+ printf( "Finally, RecsCommitted: %llu\n", RecsCommitted );
+
+ DisplayStatus();
+
+}
+
+
+
+
+
+// Schedule at most one bin read per call: the first sort buffer found in
+// WAIT_READ is pointed at bin number BinToRead, moved to BUSY_READ and handed
+// to ReadBuffer(); BinToRead is then advanced.  Does nothing once BinToRead
+// has reached NumBins.
+void SortReading( void ) {
+
+ int i;
+
+ // if anything is in WAIT_READ, and we have crap to read yet,
+ // start reading it in.
+
+ for( i = 0; i < NSortBufs; i++ ) {
+ // quick out if we're done reading.
+ if( BinToRead >= NumBins ) {
+ return;
+ }
+ if( SortBufs[i].state == WAIT_READ ) {
+ // schedule a read here.
+ // the file name is rebuilt from the device path and bin number
+ const char *fname = Fmt("%sout%05d.binned",Devices[Bins[BinToRead].dev].path,BinToRead);
+ aFILE *in = aOpen( fname, A_READ );
+ // NOTE(review): both checks below only print a message; execution
+ // continues and the read is scheduled anyway (with a NULL file, or with
+ // more records than the buffer reports it can hold) -- confirm that
+ // ReadBuffer tolerates this.
+ if( !in ) {
+ printf( "couldn't open '%s' to read!\n", fname );
+ }
+ if( Bins[BinToRead].nrecs > SortBufs[i].buf->totalrecs ) {
+ printf( "buffer not big enough to hold bin!\n" );
+ }
+ SortBufs[i].bin = BinToRead;
+ SortBufs[i].dev = &(Devices[Bins[BinToRead].dev].dev);
+ SortBufs[i].state = BUSY_READ;
+ // NOTE(review): no aClose of this handle is visible in this part of the
+ // file; SortWriting later overwrites buf->file with Output -- confirm
+ // the handle is released elsewhere.
+ SortBufs[i].buf->file = in;
+ ReadBuffer( SortBufs[i].buf, Bins[BinToRead].nrecs, SortBufs[i].dev );
+ printf( "scheduled read of bin %d\n", BinToRead );
+ BinToRead++;
+ // only one read is scheduled per call
+ return;
+ }
+ }
+
+}
+
+
+
+#ifdef USE_QSORT_ONLY
+
+// Compare two records by key using the COMPARE_KEYS macro and return its
+// result; callers in this file only test the sign of the value.
+int comp_keys( record_t a, record_t b ){
+    // sml_t views of the arguments, referenced only by the disabled trap below
+    sml_t *mer_a = (sml_t*)&a;
+    sml_t *mer_b = (sml_t*)&b;
+/*  debugging trap for one specific pair of positions:
+    if( ( mer_a->pos == 4554307 &&
+          mer_b->pos == 4407600 ) ||
+        ( mer_a->pos == 4407600 &&
+          mer_b->pos == 4554307 ) )
+        __asm( nop );
+*/
+    return COMPARE_KEYS( a, b );
+}
+
+// Sort the inclusive range a[lo..hi] directly when it spans 2, 3 or 4 elements
+// (hi-lo of 1, 2 or 3).  Any other span is left untouched.
+void QBrute( record_t a[], int lo, int hi ) {
+    record_t tmp;
+    int smallest;
+    int largest;
+
+    switch( hi - lo ) {
+    case 1:
+        // two elements: swap if out of order
+        if( comp_keys( a[hi], a[lo] ) < 0 ) {
+            tmp = a[lo];
+            a[lo] = a[hi];
+            a[hi] = tmp;
+        }
+        break;
+
+    case 2:
+        // three elements: move the minimum to the front, then sort the last two
+        smallest = comp_keys( a[lo], a[lo+1] ) < 0 ? lo : lo+1;
+        smallest = comp_keys( a[smallest], a[lo+2] ) < 0 ? smallest : lo+2;
+        if( smallest != lo ) {
+            tmp = a[lo];
+            a[lo] = a[smallest];
+            a[smallest] = tmp;
+        }
+        QBrute( a, lo+1, hi );
+        break;
+
+    case 3:
+        // four elements: minimum to the front, maximum to the back,
+        // then sort the middle pair
+        smallest = comp_keys( a[lo], a[lo+1] ) < 0 ? lo : lo+1;
+        smallest = comp_keys( a[smallest], a[lo+2] ) < 0 ? smallest : lo+2;
+        smallest = comp_keys( a[smallest], a[lo+3] ) < 0 ? smallest : lo+3;
+        if( smallest != lo ) {
+            tmp = a[lo];
+            a[lo] = a[smallest];
+            a[smallest] = tmp;
+        }
+        largest = comp_keys( a[hi], a[hi-1] ) > 0 ? hi : hi-1;
+        largest = comp_keys( a[largest], a[hi-2] ) > 0 ? largest : hi-2;
+        if( largest != hi ) {
+            tmp = a[hi];
+            a[hi] = a[largest];
+            a[largest] = tmp;
+        }
+        QBrute( a, lo+1, hi-1 );
+        break;
+
+    default:
+        break;
+    }
+}
+
+
+
+// Recursive quicksort of the inclusive range a[lo0..hi0], ordering records with
+// comp_keys().  Ranges of four elements or fewer are delegated to QBrute().
+// The middle element is used as the pivot and is parked in a[hi0] while the
+// rest of the range is partitioned.
+void QSort( record_t a[], int lo0, int hi0 ) {
+
+ int lo = lo0;
+ int hi = hi0;
+
+ record_t pivot;
+
+ if ((hi-lo) <= 3) {
+ QBrute(a, lo, hi);
+ return;
+ }
+
+ // Pick a pivot and move it out of the way
+ pivot = a[(lo + hi) / 2];
+ a[(lo + hi) / 2] = a[hi];
+ a[hi] = pivot;
+
+ // lo and hi walk towards each other; the loop ends when they meet.
+ while( lo < hi ) {
+
+ // Search forward from a[lo] until an element is found that
+ // is greater than the pivot or lo >= hi
+ //while( a[lo] <= pivot && lo < hi ) {
+ while( (comp_keys( a[lo], pivot ) <= 0) && lo < hi ) {
+ lo++;
+ }
+
+ //
+ // Search backward from a[hi] until element is found that
+ // is less than the pivot, or hi <= lo
+ //
+ //while (pivot <= a[hi] && lo < hi ) {
+ while( (comp_keys( pivot, a[hi] ) <= 0) && lo < hi ) {
+ hi--;
+ }
+
+ //
+ // Swap elements a[lo] and a[hi]
+ //
+ if( lo < hi ) {
+ record_t T = a[lo];
+ a[lo] = a[hi];
+ a[hi] = T;
+ }
+ }
+
+ //
+ // Put the median in the "center" of the list
+ //
+ // (the element at the meeting point moves to the end, the pivot takes its place)
+ a[hi0] = a[hi];
+ a[hi] = pivot;
+
+ //
+ // Recursive calls, elements a[lo0] to a[lo-1] are less than or
+ // equal to pivot, elements a[hi+1] to a[hi0] are greater than
+ // pivot.
+ //
+ // NOTE(review): both halves are handled by recursion, so stack depth is
+ // presumably linear in the range size for unfavourable inputs -- confirm
+ // bin sizes make this acceptable.
+ QSort( a, lo0, lo-1 );
+ QSort( a, hi+1, hi0 );
+}
+
+
+
+
+
+// Sort the first nelems records of a by running QSort over indices 0..nelems-1.
+void RecSort( record_t a[], int nelems ) {
+    const int last = nelems-1;
+
+    QSort( a, 0, last );
+}
+
+
+// Sort the numrecs records held in buf in one go.  Always returns 1, which the
+// caller treats as "finished".
+int SortBuffer( buffer_t * buf ) {
+    const int finished = 1;
+
+    RecSort( buf->recs, buf->numrecs );
+    return finished;
+}
+
+
+// Quicksort-only variant: among the sort buffers in state SORTING pick the one
+// holding the lowest bin number, sort it with SortBuffer() and move it to
+// WRITE_RESTRUCTURE.  Time spent here is accumulated into QSortTime.
+void SortSorting( void ) {
+    int slot;
+    int pick = -1;      // index of the chosen sort buffer, -1 while none found
+
+    QSortTimer = StartTimer();
+
+    for( slot = 0; slot < NSortBufs; slot++ ) {
+        if( SortBufs[slot].state != SORTING )
+            continue;
+        if( pick < 0 || SortBufs[slot].bin < SortBufs[pick].bin )
+            pick = slot;
+    }
+
+    if( pick >= 0 ) {
+        printf( "sorting bin %d\n", SortBufs[pick].bin );
+        if( SortBuffer( SortBufs[pick].buf ) ) {
+            SortBufs[pick].state = WRITE_RESTRUCTURE;
+//          SortBufs[pick].state = WAIT_WRITE;
+        }
+    }
+
+    QSortTime += ReadTimer( QSortTimer ) / 1000.0;
+    StopTimer( QSortTimer );
+}
+
+#elif defined NO_SORT_PERF_TEST
+
+
+
+// Perf-test variant that performs no sorting: every sort buffer in state
+// SORTING is moved straight to WAIT_WRITE.  The (trivial) time spent is still
+// accumulated into QSortTime.
+void SortSorting( void ) {
+    int slot = 0;
+
+    QSortTimer = StartTimer();
+
+    while( slot < NSortBufs ) {
+        if( SortBufs[slot].state == SORTING )
+            SortBufs[slot].state = WAIT_WRITE;
+        slot++;
+    }
+
+    QSortTime += ReadTimer( QSortTimer ) / 1000.0;
+    StopTimer( QSortTimer );
+}
+
+
+
+#else
+
+sort_buf_t* CurrentSortBuf;
+buffer_t* SortScratchBuffer;
+
+// Radix-sort variant.  If no sort is in progress, look for the buffer that
+// holds bin BinToSort and is in state SORTING, and set up a radix sort on it.
+// Then call RadixSort() on the current buffer; once that buffer reaches
+// WRITE_RESTRUCTURE, release it and move on to the next bin.  Time spent here
+// is accumulated into QSortTime.
+void SortSorting( void ) {
+    int slot;
+
+    QSortTimer = StartTimer();
+
+    // nothing being sorted: try to pick up the next bin in order
+    if( !CurrentSortBuf ) {
+        for( slot = 0; slot < NSortBufs; slot++ ) {
+            if( SortBufs[slot].state != SORTING || SortBufs[slot].bin != BinToSort )
+                continue;
+            CurrentSortBuf = &SortBufs[slot];
+            InitRadixSort( CurrentSortBuf, SortScratchBuffer );
+            printf( "scheduling sort of bin %d\n", BinToSort );
+            break;
+        }
+    }
+
+    // advance the sort in progress, if any
+    if( CurrentSortBuf && CurrentSortBuf->state != WRITE_RESTRUCTURE ) {
+        RadixSort( CurrentSortBuf );
+
+        // done with this bin: hand it on and look for the next one
+        if( CurrentSortBuf->state == WRITE_RESTRUCTURE ) {
+            CurrentSortBuf = NULL;
+            BinToSort++;
+        }
+    }
+
+    QSortTime += ReadTimer( QSortTimer ) / 1000.0;
+    StopTimer( QSortTimer );
+}
+
+#endif
+
+// For every sort buffer in state WRITE_RESTRUCTURE, overwrite the start of its
+// record area with just the pos field of each record, then mark the buffer
+// WAIT_WRITE.
+void RestructureSMLBinsForWrite( void ) {
+ int i;
+ offset_t j;
+ position_t* positions;
+ sml_t *sml;
+
+ for( i = 0; i < NSortBufs; i++ ) {
+ // if this one is ready to be restructured...
+ if( SortBufs[i].state == WRITE_RESTRUCTURE ) {
+ printf( "restructuring bin %d\n", SortBufs[i].bin );
+ // Both pointers alias the same memory: the copy below is done in place.
+ // NOTE(review): this is only safe walking forwards if position_t is no
+ // larger than sml_t, so a write never overtakes an unread record -- confirm.
+ positions = (position_t*)SortBufs[i].buf->recs;
+ sml = (sml_t*)SortBufs[i].buf->recs;
+ for( j = 0; j < Bins[SortBufs[i].bin].nrecs; j++ ){
+ positions[ j ] = sml[ j ].pos;
+ }
+
+ // set its state for writing
+ SortBufs[i].state = WAIT_WRITE;
+ }
+ }
+}
+
+// use this version if no pre-write modifications are required
+/*
+void RestructureSMLBinsForWrite( void ) {
+ int i;
+
+ for( i = 0; i < NSortBufs; i++ ) {
+ // if this one is ready to be restructured...
+ if( SortBufs[i].state == WRITE_RESTRUCTURE ) {
+ // set its state for writing
+ SortBufs[i].state = WAIT_WRITE;
+ }
+ }
+}
+*/
+
+// Byte count to write for sort buffer sortI: the record count of the bin it
+// holds multiplied by sizeof(position_t), converted to int.
+int CalculateSortWriteSize( int sortI ){
+    int bytes = Bins[SortBufs[sortI].bin].nrecs * sizeof( position_t );
+
+    return bytes;
+}
+
+// Schedule output writes strictly in bin order: a sort buffer is written only
+// when it is in WAIT_WRITE and holds bin number BinToWrite.  With
+// NO_WRITE_PERF_TEST defined the write is skipped and the buffer goes straight
+// back to WAIT_READ.
+void SortWriting( void ) {
+
+ int i;
+
+ for( i = 0; i < NSortBufs; i++ ) {
+ // if this one is ready to write, and it's the bin we're looking for...
+ if( SortBufs[i].state == WAIT_WRITE && SortBufs[i].bin == BinToWrite ) {
+#ifdef NO_WRITE_PERF_TEST
+ // skip writing by setting the state to wait_read
+ SortBufs[i].state = WAIT_READ;
+#else
+ printf( "scheduling write of bin %d\n", BinToWrite );
+ // write it out.
+ SortBufs[i].dev = &(Devices[OutputDev].dev);
+ SortBufs[i].state = BUSY_WRITE;
+ SortBufs[i].buf->file = Output;
+ WriteBuffer( SortBufs[i].buf, Bins[SortBufs[i].bin].nrecs, &(Devices[OutputDev].dev) );
+ // io_size is overridden after WriteBuffer so that only the position
+ // values are written.
+ // NOTE(review): presumably WriteBuffer sets io_size itself and the
+ // actual transfer starts later -- confirm the ordering is intentional.
+ SortBufs[i].buf->io_size = CalculateSortWriteSize( i );
+#endif // NO_WRITE_PERF_TEST
+ // the loop keeps going, so a later slot holding the next bin can be
+ // scheduled in the same call
+ BinToWrite++;
+ }
+ }
+
+}
+
+
+
+
+
+// Advance sort buffers whose I/O has finished: a completed read moves the
+// buffer to SORTING, a completed write returns it to WAIT_READ.  The buffer's
+// operation is reset to OP_NONE in both cases.
+void SortHandleCompletions( void ) {
+    int slot;
+
+    for( slot = 0; slot < NSortBufs; slot++ ) {
+        int was_reading = SortBufs[slot].state == BUSY_READ;
+
+        // only buffers with I/O in flight are of interest
+        if( !was_reading && SortBufs[slot].state != BUSY_WRITE )
+            continue;
+        if( SortBufs[slot].buf->operation != OP_FINISHED )
+            continue;
+
+        //printf( "operation finished on buf %d\n", slot );
+        SortBufs[slot].buf->operation = OP_NONE;
+        if( was_reading )
+            SortBufs[slot].state = SORTING;
+        else
+            SortBufs[slot].state = WAIT_READ;
+#ifdef NNNNN_KEYBYTES
+        // bin 0 doesn't need to be sorted
+        if( SortBufs[slot].bin == 0 && SortBufs[slot].state == SORTING )
+            SortBufs[slot].state = WAIT_WRITE;
+#endif
+    }
+}
+
+
+
+
+
+// One step of the sorting-phase I/O pump: refresh async operations on the
+// output file and on every sort buffer that has a file attached, let the
+// working set update finished operations, then let each device start new ones.
+void SortUpdateIOState() {
+    int k;
+
+    aUpdateOperations( Output );
+
+    for( k = 0; k < NSortBufs; k++ ) {
+        if( !SortBufs[k].buf->file )
+            continue;
+        aUpdateOperations( SortBufs[k].buf->file );
+    }
+
+    // working set adjusts operation states
+    UpdateWSIOFinishedState( &WS );
+
+    // devices may now start new operations
+    for( k = 0; k < NumDevices; k++ )
+        UpdateDeviceIOExecuteState( &WS, &(Devices[k].dev) );
+}
+
+
+
+
+
+
+// Spin on SortUpdateIOState() until no working-set buffer that has both a
+// device and a file attached still reports an operation that is OP_PENDING or
+// greater than OP_NONE; then flush the output file and print the wait time.
+void SortingEnsureAllOperationsComplete() {
+    dmtimer_t *stopwatch = StartTimer();
+    int busy;
+    int b;
+
+    do {
+        SortUpdateIOState();
+
+        // scan for the first buffer that is still in flight
+        busy = 0;
+        for( b = 0; b < WS.nbufs && !busy; b++ ) {
+            if( !WS.bufs[b].device || !WS.bufs[b].file )
+                continue;
+            if( WS.bufs[b].operation == OP_PENDING || WS.bufs[b].operation > OP_NONE )
+                busy = 1;
+        }
+    } while( busy );
+
+    // flush the output file to disk.
+    aFlush( Output );
+
+    printf( "Sort Ensure All Operations Complete: %d msec\n", ReadTimer( stopwatch ) );
+    StopTimer( stopwatch );
+}
+
+
+
+
+
+
+
+// Second pass of the sort.  Re-partitions the working set into buffers large
+// enough to hold the biggest bin, then runs the read -> sort -> restructure ->
+// write pipeline until every bin has been written to the output file, and
+// finally drains and flushes all outstanding I/O.
+void SortingPhase( void ) {
+
+ // now reorganize the working set, and start up the sort procedure.
+
+ // we need to have the ability to read from N bin files at a time, where
+ // N is the number of bin devices.
+
+ // We read entire bin files at a time into each slot. We wait for the
+ // first one to finish, and then we sort it. We can start sorting the
+ // others too, as they finish. When the first sort is done, we write it
+ // out to the sorted output file, similarly we write everything out in
+ // order. When one is confirmed finished writing, we can start reading
+ // the next bin file from that device in.
+
+ int i;
+ offset_t recs_per_buffer;
+ // biggest_bin is recorded during the scans below but never read afterwards.
+ offset_t biggest_bin = 0;
+ offset_t biggest_nrecs = 0;
+
+ // provisional value; recomputed from the working set size further down
+ NSortBufs = NumBinDevs;
+
+ for( i = 0; i < NumBins; i++ ) {
+ if( Bins[i].nrecs > biggest_nrecs ) {
+ biggest_nrecs = Bins[i].nrecs;
+ biggest_bin = i;
+ }
+ }
+
+ //recs_per_buffer = (WS.size / sizeof( record_t )) / NSortBufs;
+
+ recs_per_buffer = biggest_nrecs;
+
+ // NOTE(review): a working set that is too small only produces this message;
+ // NSortBufs then becomes 0 below (and -1 after the scratch buffer is taken)
+ // -- confirm callers guarantee this cannot happen.
+ if( (WS.size / sizeof( record_t )) < (unsigned)recs_per_buffer ) {
+ printf( "working set holds %llu recs, but we need %llu\n",
+ (WS.size / sizeof( record_t )), recs_per_buffer );
+ }
+
+ // NOTE(review): divides by recs_per_buffer, which is 0 if every bin is empty.
+ NSortBufs = (WS.size / sizeof( record_t )) / recs_per_buffer;
+
+ printf( "NSortBufs = %d\n", NSortBufs );
+
+ // this goes from 0 to NumBins-1 as we read stuff.
+ BinToRead = 0;
+ BinToWrite = 0;
+ BinToSort = 0;
+
+ printf( "reorganizing working set: %llu recs per buffer, %d sort bufs\n", recs_per_buffer, NSortBufs );
+ ReorganizeWorkingSet( &WS, recs_per_buffer, recs_per_buffer );
+
+#if !defined USE_QSORT_ONLY && !defined NO_SORT_PERF_TEST
+ // steal the last buffer for scratch space
+ NSortBufs--;
+ SortScratchBuffer = &(WS.bufs[NSortBufs]);
+ SortScratchBuffer->operation = SORTING_SCRATCH;
+#endif
+
+ // nbufs should be same as NumBinDevs
+ printf( "reorganized working set has %d buffers of %llu bytes\n", WS.nbufs, recs_per_buffer * sizeof(record_t) );
+ // one zero-initialised state record per sort buffer
+ SortBufs = malloc( sizeof( *SortBufs ) * NSortBufs );
+ memset( SortBufs, 0, sizeof( *SortBufs ) * NSortBufs );
+
+ // put everything in WAIT_READ;
+
+ for( i = 0; i < NSortBufs; i++ ) {
+
+ SortBufs[i].state = WAIT_READ;
+ SortBufs[i].buf = &(WS.bufs[i]);
+ SortBufs[i].dev = NULL;
+
+ }
+#ifdef NNNNN_KEYBYTES
+ // process the first bin then restructure the working set again
+
+ while( BinToWrite < 1 ) {
+ SortReading();
+ SortSorting();
+ RestructureSMLBinsForWrite();
+ SortWriting();
+ SortUpdateIOState();
+ SortHandleCompletions();
+ }
+ SortingEnsureAllOperationsComplete();
+
+ // NOTE(review): biggest_nrecs is not reset before this rescan of bins 1..N,
+ // so it still holds the maximum from the first scan (which included bin 0)
+ // and the loop cannot lower it -- confirm whether a reset was intended.
+ for( i = 1; i < NumBins; i++ ) {
+ if( Bins[i].nrecs > biggest_nrecs ) {
+ biggest_nrecs = Bins[i].nrecs;
+ biggest_bin = i;
+ }
+ }
+ recs_per_buffer = biggest_nrecs;
+ if( (WS.size / sizeof( record_t )) < (unsigned)recs_per_buffer ) {
+ printf( "working set holds %llu recs, but we need %llu\n",
+ (WS.size / sizeof( record_t )), recs_per_buffer );
+ }
+ NSortBufs = (WS.size / sizeof( record_t )) / recs_per_buffer;
+ printf( "NSortBufs = %d\n", NSortBufs );
+ // this goes from 0 to NumBins-1 as we read stuff.
+ BinToRead = 1;
+ BinToWrite = 1;
+ BinToSort = 1;
+
+ printf( "reorganizing working set: %llu recs per buffer, %d sort bufs\n", recs_per_buffer, NSortBufs );
+ ReorganizeWorkingSet( &WS, recs_per_buffer, recs_per_buffer );
+
+#if !defined USE_QSORT_ONLY && !defined NO_SORT_PERF_TEST
+ // steal the last buffer for scratch space
+ NSortBufs--;
+ SortScratchBuffer = &(WS.bufs[NSortBufs]);
+ SortScratchBuffer->operation = SORTING_SCRATCH;
+#endif
+
+ // nbufs should be same as NumBinDevs
+ printf( "reorganized working set has %d buffers of %llu bytes\n", WS.nbufs, recs_per_buffer * sizeof(record_t) );
+ // NOTE(review): the SortBufs array allocated above is overwritten here
+ // without being freed.
+ SortBufs = malloc( sizeof( *SortBufs ) * NSortBufs );
+ memset( SortBufs, 0, sizeof( *SortBufs ) * NSortBufs );
+
+ // put everything in WAIT_READ;
+ for( i = 0; i < NSortBufs; i++ ) {
+ SortBufs[i].state = WAIT_READ;
+ SortBufs[i].buf = &(WS.bufs[i]);
+ SortBufs[i].dev = NULL;
+ }
+#endif
+
+
+ // main pipeline: runs until the last bin has been scheduled for writing
+ while( BinToWrite < NumBins ) {
+
+ // ReadFiles -- schedule reading operations if we can (are any buffers
+ // in WAIT_READ?)
+ //printf( "sortreading\n" );
+ SortReading();
+
+ // SortData -- sort everything in SORTING -- if it finishes, transition
+ // to WAIT_WRITE.
+ //printf( "sortsorting\n" );
+ SortSorting();
+
+ // Perform any necessary post-sort processing on the data to prepare it for
+ // writing out to the sorted file
+ RestructureSMLBinsForWrite();
+ // WriteFiles -- schedule writing operations for everything in WAIT_WRITE, if
+ // it is the next file we need to write (make sure to schedule in order).
+ //printf( "sortwriting\n" );
+ SortWriting();
+
+ // update io state
+ //printf( "sortupdateiostate\n" );
+ SortUpdateIOState();
+
+
+ // HandleCompletions -- if something finishes,
+ // if it was reading, transition to SORTING
+ // if it was writing, transition to WAIT_READ.
+ //printf( "sorthandlecompletions\n" );
+ SortHandleCompletions();
+
+
+ }
+
+ // the last writes were only scheduled above; wait for them and flush
+ SortingEnsureAllOperationsComplete();
+
+ printf( "QSort took %f seconds\n", QSortTime );
+
+}
+
+
+
+
+
+
+
+// Run both passes of the sort -- binning, then per-bin sorting -- and print a
+// timing summary to stdout.  Always returns 0.
+//
+// The summary reports each phase as a percentage of the total running time and
+// throughput in MB/sec, where the data volume is NumRecs records of
+// sizeof(record_t) bytes each.  (The previous code scaled the percentages by
+// sizeof(record_t) and divided NumRecs by 10485.76, both of which are only
+// correct when a record is exactly 100 bytes.)
+int dmsort() {
+    double total_mb;    // megabytes of record data handled
+
+    // Do the first pass binning stuff
+    BinningTimer = StartTimer();
+#ifndef NO_BINNING_PERF_TEST
+    BinningPhase();
+    BinningTime = ReadTimer( BinningTimer ) / 1000.0;
+#endif // NO_BINNING_PERF_TEST
+    StopTimer( BinningTimer );
+
+    // Do the second pass sort
+    SortingTimer = StartTimer();
+    SortingPhase();
+    SortingTime = ReadTimer( SortingTimer ) / 1000.0;
+    StopTimer( SortingTimer );
+
+    // timer readings are divided by 1000 to get seconds
+    RunningTime = ReadTimer( RunningTimer ) / 1000.0;
+    StopTimer( RunningTimer );
+
+    total_mb = ( (double)NumRecs * sizeof(record_t) ) / ( 1024.0 * 1024.0 );
+
+    printf( "total time : %f sec\n", RunningTime );
+    printf( "binning time : %f sec (%f%%)\n", BinningTime, BinningTime/RunningTime * 100.0 );
+    printf( "sorting time : %f sec (%f%%)\n", SortingTime, SortingTime/RunningTime * 100.0 );
+
+    // With NO_BINNING_PERF_TEST defined BinningTime is never set here, so the
+    // bin rate is a floating-point division by whatever value it already holds.
+    printf( "total rate : %f MB/sec\n", total_mb/RunningTime );
+    printf( "total bin rate : %f MB/sec\n", total_mb/BinningTime );
+    printf( "total sort rate : %f MB/sec\n", total_mb/SortingTime );
+
+    return 0;
+}
+
+
+// Build a sorted mer list for input_file and write it to output_file, using the
+// NULL-terminated list scratch_paths for the temporary bin files.
+//
+// Calls InitdmSML() and, if that succeeds, dmsort(); afterwards removes the bin
+// files, releases the memory and file handles owned by this module and resets
+// the module's global state so the function can be called again.
+//
+// Returns the non-zero InitdmSML() error code if initialisation fails (in which
+// case no cleanup is performed), otherwise the result of dmsort().
+int dmSML( const char* input_file, const char* output_file, const char* const* scratch_paths, uint64 seed ) {
+    int rval = 0;
+    int i = 0;
+
+    // working set size and buffer size of 0 let InitdmSML choose both
+    rval = InitdmSML( 0, 0, input_file, output_file, scratch_paths, seed );
+    if( rval != 0 )
+        return rval;
+    rval = dmsort();
+
+    // Hey slob! cleanup after yourself!
+    for( i = 0; i < NumBins; i++ ){
+        removeFile( Bins[ i ].fname, FALSE );
+        free( Bins[ i ].fname );
+    }
+    if( Bins )
+        free( Bins );
+    Bins = NULL;
+    NumBins = 0;
+
+    if( Devices ){
+        // Entries 0 and 1 (input/output) name string literals; every entry
+        // from index 2 on is a bin device whose name InitdmSML malloc'ed.
+        for( i = 2; i < NumDevices; i++ )
+            free( (void*)Devices[i].devname );
+        free( Devices );
+    }
+    NumDevices = 0;
+    Devices = NULL;
+
+    if( SortBufs )
+        free( SortBufs );
+    SortBufs = NULL;
+
+    NSortBufs = 0;
+
+    BufferSizeMin = 0;
+    BufferSizeMax = 0;
+
+    memset( &Seqbuf, 0, sizeof( seqbuf_t ) );
+
+    DataDev = 0;
+
+    OutFileName = "unset";
+
+    // close the sorted file
+    aClose( Output );
+    Output = NULL;
+    OutputDev = 0;
+
+    BinToRead = 0;
+    BinToWrite = 0;
+    BinToSort = 0;
+
+    free( WS.bufs );
+    memset( &WS, 0, sizeof( working_set_t ) );
+
+    NumRecs = 0;
+    RecsProcessed = 0;
+    RecsRead = 0;
+    RecsUnread = 0;
+    RecsCommitted = 0;
+    RecsWritten = 0;
+
+    // timers
+    RunningTime = 0;
+    RunningTimer = NULL;
+    BinningTime = 0;
+    BinningTimer = NULL;
+    SortingTime = 0;
+    SortingTimer = NULL;
+
+    QSortTime = 0;
+    QSortTimer = NULL;
+
+    ReadIdleTime = 0;
+    ReadIdleTimer = NULL;
+    SortIdleTime = 0;
+    SortIdleTimer = NULL;
+    WriteIdleTime = 0;
+    WriteIdleTimer = NULL;
+
+    memset( &Free, 0, sizeof( buffer_list_t ) );
+    memset( &ToProcess, 0, sizeof( buffer_list_t ) );
+    memset( &Reading, 0, sizeof( buffer_list_t ) );
+    memset( &Restructure, 0, sizeof( buffer_list_t ) );
+
+    // static variables
+    divisor = 0;
+    consumed_recs = 0;
+    toprocess = NULL;
+    lasttime = 0;
+
+    // from asyncio.c
+//  OperationNumber = 0;
+
+    return rval;
+}
+
diff --git a/libMems/dmSML/dmsort.h b/libMems/dmSML/dmsort.h
new file mode 100644
index 0000000..2f3cf09
--- /dev/null
+++ b/libMems/dmSML/dmsort.h
@@ -0,0 +1,197 @@
+#ifndef __DMSORT_H__
+#define __DMSORT_H__
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include "libMems/dmSML/util.h"
+#include "libMems/dmSML/timing.h"
+#include "libMems/dmSML/asyncio.h"
+#include "libMems/dmSML/buffer.h"
+#include "libMems/dmSML/sorting.h"
+#include "libMems/dmSML/sml.h"
+
+// define this if you're using the ASCII sortgen data.
+// don't define if you're using random data (dmsortgen)
+//#define ASCII_KEYBYTES
+
+// define this if using dmSML with sequences that have large
+// stretches of NNNNN... such as an unfinished eukaryote
+//#define NNNNN_KEYBYTES
+
+// define this if you want to measure the overlapping
+// of your sorting with I/O in the sorting phase --
+// this makes the sort routine do nothing.
+//#define NO_SORT_PERF_TEST
+
+// define the following if you don't want to write
+// data during the sort phase in order to get timings
+//#define NO_WRITE_PERF_TEST
+
+// define this to skip the binning phase in order to
+// perform measurements on the sort phase. The bin
+// files to use during sorting must already exist (duh!)
+//#define NO_BINNING_PERF_TEST
+
+// define this to test the performance of binning and
+// restructuring without bin writing
+//#define NO_BIN_WRITE_PERF_TEST
+
+// define this to test the performance without restructuring
+// each SML bin
+//#define NO_RESTRUCTURE_PERF_TEST
+
+#ifndef NELEMS
+#define NELEMS(x) \
+ ( sizeof((x)) / sizeof((x)[0]) )
+#endif
+
+#define MIN(x,y) ((x)<(y)?(x):(y))
+#define MINRECS (1311)
+#define MAXRECS (1311)
+
+
+// this is somewhat less appealing than a config file,
+// but speed is critical and parsing a config file at
+// startup is just inconvenient. Besides, specifying
+// what we care about is easy enough this way.
+// One entry of the Devices table: the input file, the output file, or one
+// scratch location used for bin files.
+typedef struct device_s {
+ const char *devname; // label for the device, e.g. "Input device"
+ const char *path; // file name (input/output) or scratch path (bin devices)
+ iodevice_t dev; // I/O state handed to the buffer layer for this device
+} device_t;
+
+
+// ugly hack
+#define BIN_SPECIAL (-10000)
+
+
+
+// what we use to represent a bin.
+// One bin of the binning pass: a temporary file on a scratch device plus the
+// buffers currently assigned to it.
+typedef struct bin_s {
+ aFILE *file; // File we write/read on.
+ int dev; // This is an index into the Devices table.
+ offset_t nrecs; // Number of records written to bin.
+ buffer_list_t bufs; // Our list of buffers that holds our data.
+ char* fname; /**< The file name of this bin */
+} bin_t;
+
+// State of the sequence data that is written to the output file.
+typedef struct seqbuf_s {
+ aFILE *file; // Output file
+ int dev; // device table index for output file
+ offset_t bufpos; // position in current buffer
+ uint64 seq_pos; // position in sequence that is next to translate
+ buffer_list_t bufs; // list of buffers for data
+} seqbuf_t;
+
+// Status codes returned by InitdmSML() (and passed through by dmSML()).
+enum dm_errors {
+ SUCCESS, // 0: initialisation succeeded
+ TOO_FEW_BINS, // no scratch paths were supplied
+ TOO_MANY_BINS, // more than 8 scratch paths were supplied
+ INPUT_NOT_OPENED, // the input file could not be opened
+ INVALID_WS_SIZE, // the working set size is 0
+ SEQUENCE_TOO_SHORT, // the input is shorter than the seed length
+ OUTPUT_NOT_OPENED, // the output file could not be opened
+ INVALID_NUMRECS, // the number of records to process is not positive
+ NO_FREE_BUFFERS, // the free list ran out while handing out initial buffers
+ BIN_NOT_OPENED, // a bin file could not be opened for writing
+};
+
+
+void print_usage( const char* pname );
+
+
+static buffer_t * AllocateFree( void );
+
+static int ComputeBinNumber( const unsigned char key[10] );
+
+// just like ComputeBinNumber except we reserve one bin for zero keys.
+static int ComputeNNNNNBinNumber( const unsigned char key[10] );
+
+static int ComputeAsciiBinNumber( const unsigned char key[10] );
+
+static void DoBinning( void );
+
+void FinishBinning();
+
+offset_t CalculateDataReadSize( buffer_t* b );
+
+static void DoReading( void );
+
+static void HandleBinWriteCompletions( void );
+
+static void HandleSeqbufWriteCompletions( void );
+
+#define ALPHA_BITS 2
+
+static void Translate32(uint32* dest, const char* src, const unsigned len);
+
+void RestructureReadSMLBins( void );
+
+static void HandleReadingCompletions( void );
+
+int InitdmSML( long working_mb, long buffer_size, const char* input_filename, const char* output_filename, const char* const* scratch_paths, uint64 seed );
+
+void DisplayStatusHeader( void );
+
+void DisplayStatus( void );
+
+void UpdateIOState( void );
+
+void EnsureAllOperationsComplete( void );
+
+void BinningPhase( void );
+
+void SortReading( void );
+
+#ifdef USE_QSORT_ONLY
+
+int comp_keys( record_t a, record_t b );
+
+void QBrute( record_t a[], int lo, int hi );
+
+void QSort( record_t a[], int lo0, int hi0 );
+
+void RecSort( record_t a[], int nelems );
+
+int SortBuffer( buffer_t * buf );
+
+void SortSorting( void );
+
+#elif defined NO_SORT_PERF_TEST
+
+void SortSorting( void );
+
+#else
+
+// State of the radix-sort variant of SortSorting().  These are declarations
+// only: the objects themselves are defined in dmsort.c.  Defining them here as
+// well would give every translation unit that includes this header its own
+// definition, which fails to link when the compiler does not merge common
+// symbols.
+extern sort_buf_t* CurrentSortBuf;      // buffer currently being sorted, NULL if none
+extern buffer_t* SortScratchBuffer;     // working-set buffer reserved as sort scratch space
+
+void SortSorting( void );
+
+#endif
+
+void RestructureSMLBinsForWrite( void );
+
+int CalculateSortWriteSize( int sortI );
+
+void SortWriting( void );
+
+void SortHandleCompletions( void );
+
+void SortUpdateIOState();
+
+void SortingEnsureAllOperationsComplete();
+
+void SortingPhase( void );
+
+int dmsort( void );
+
+int dmSML( const char* input_file, const char* output_file, const char* const* scratch_paths, uint64 seed );
+
+
+#endif // __DMSORT_H__
diff --git a/libMems/dmSML/sml.c b/libMems/dmSML/sml.c
new file mode 100644
index 0000000..310d879
--- /dev/null
+++ b/libMems/dmSML/sml.c
@@ -0,0 +1,55 @@
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/dmSML/sml.h"
+#include "libMems/SeedMasks.h"
+
+
+SMLHeader_t InitSML( aFILE* file, uint64 file_size, uint64 seed ){
+    // Fills in a version-5 SML header for `seed` and `file_size`, writes it to `file`, and returns it.
+    SMLHeader_t header;
+    int retcode;
+    uint8* dna_table = CreateBasicDNATable();    // heap-allocated by the helper; freed below
+    // zero first so padding, id, circular and all of description[] reach the disk as 0, not stack garbage
+    memset( &header, 0, sizeof( header ) );
+    header.version = 5;
+    header.alphabet_bits = 2;
+    header.seed = seed;
+    header.seed_length = getSeedLength( seed );
+    header.seed_weight = getSeedWeight( seed );
+    header.length = file_size;
+    header.unique_mers = -1;
+    header.word_size = 32;
+    header.little_endian = 1;
+    memcpy(header.translation_table, dna_table, UINT8_MAX);
+    free( dna_table );
+    retcode = aWrite( (void*)&header, sizeof( header ), 1, file, 0 );
+    if( retcode == 0 )
+        printf( "Error writing to SML\n" );
+    aWaitComplete( file, retcode );
+    return header;
+}
+
+/*
+// use this version of RestructureReadSMLBins when no restructuring is necessary
+void RestructureReadSMLBins( void ) {
+ buffer_t *b, *tmpnext;
+ // go through and see if any have completed.
+ b = Restructure.head;
+ do {
+ if( !b ) {
+ break;
+ }
+
+ tmpnext = b->next;
+
+ // b has been restructured, add it to the ToProcess list
+ PushTail( &ToProcess, RemoveItem( &Restructure, b ) );
+ // bookkeeping
+ RecsRead += b->numrecs;
+
+ b = tmpnext;
+ } while( b != Restructure.head && Restructure.nitems );
+}
+*/
diff --git a/libMems/dmSML/sml.h b/libMems/dmSML/sml.h
new file mode 100644
index 0000000..0af1cf9
--- /dev/null
+++ b/libMems/dmSML/sml.h
@@ -0,0 +1,79 @@
+#ifndef _sml_h_
+#define _sml_h_
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/dmSML/asyncio.h"
+#include <string.h>
+#include "libGenome/gnDefs.h"
+
+#ifndef UINT8_MAX
+
+#define UINT8_MAX 256
+typedef unsigned char uint8;
+typedef unsigned uint32;
+typedef unsigned long long uint64;
+
+#endif
+
+static uint8* CreateBasicDNATable(){
+ uint8* bdt = (uint8*)malloc( sizeof(uint8) * UINT8_MAX );
+ memset(bdt, 0, UINT8_MAX);
+ bdt['c'] = 1;
+ bdt['C'] = 1;
+ bdt['b'] = 1;
+ bdt['B'] = 1;
+ bdt['y'] = 1;
+ bdt['Y'] = 1;
+ bdt['g'] = 2;
+ bdt['G'] = 2;
+ bdt['s'] = 2;
+ bdt['S'] = 2;
+ bdt['k'] = 2;
+ bdt['K'] = 2;
+ bdt['t'] = 3;
+ bdt['T'] = 3;
+ return bdt;
+}
+
+static uint8* DNA_TABLE;
+typedef unsigned position_t;
+typedef unsigned long long mask_t;
+#define MASK_T_BYTES 8
+static mask_t seed_mask = 0x7FFFFFFF;
+static int mask_length = 31;
+static int mask_weight = 31;
+
+#define DESCRIPTION_SIZE 2048 /**< Number of bytes for the freeform text description of an SML */
+
+
+typedef signed short sarID_t;
+
+typedef struct SMLHeader_s{
+ uint32 version; /**< Format version - 4 bytes */
+ uint32 alphabet_bits; /**< Bits per character in the alphabet - 4 bytes */
+ uint64 seed; /**< The pattern used in each seed */
+ uint32 seed_length; /**< The length of the seed mask */
+ uint32 seed_weight; /**< The weight of the seed mask */
+ uint64 length; /**< length of the sequence before circularity - 8 bytes */
+ uint32 unique_mers; /**< Number of unique mers in the sequence 4 bytes */
+ uint32 word_size; /**< Word size on the machine the sequence was translated */
+ boolean little_endian; /**< Is the byte order little endian? 0==no, !0==yes */
+ signed short id; /**< Obsolete ID value - 1 byte, eaten by alignment? */
+ boolean circular; /**< Circularity of sequence - 1 byte */
+ uint8 translation_table[UINT8_MAX]; /**< Translation table for ascii characters to binary values -- 256 bytes */
+ char description[DESCRIPTION_SIZE]; /**< Freeform text description of sequence data -- 2048 bytes */
+} SMLHeader_t;
+
+
+typedef struct sml_s {
+ char key[8];
+ position_t pos;
+} sml_t;
+
+SMLHeader_t InitSML( aFILE* file, uint64 file_size, uint64 seed );
+
+
+#endif /* _sml_h_ */
diff --git a/libMems/dmSML/sorting.c b/libMems/dmSML/sorting.c
new file mode 100644
index 0000000..172e310
--- /dev/null
+++ b/libMems/dmSML/sorting.c
@@ -0,0 +1,323 @@
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/dmSML/sorting.h"
+#include "math.h"
+#include <string.h>
+
+#ifndef USE_QSORT_ONLY
+
+
+// Other helper functions in this file:
+void RadixHistogram( sort_buf_t* sortbuf );
+void RadixCopy( sort_buf_t* sortbuf );
+void QSortPointers( sort_buf_t* sortbuf );
+void QBrute( record_t* a[], int lo, int hi );
+void QSort( record_t* a[], int lo0, int hi0 );
+void CopySortedData ( sort_buf_t* sortbuf );
+
+
+void InitRadixSort( sort_buf_t* sortbuf, buffer_t* scratch_buffer )
+{
+ // Prepares the caller-supplied sortbuf (sortbuf->buf must already be set) for RadixSort().
+ unsigned int bin_divisor;
+ unsigned int i, keyval = 0;
+ // allocate the histogram memory: 2^RADIX_BITS counters plus a same-sized array of copy offsets
+ sortbuf->histogram_size = 1;
+ sortbuf->histogram_size <<= RADIX_BITS;
+ sortbuf->histogram = (unsigned*) malloc( sortbuf->histogram_size * sizeof(unsigned) );
+ sortbuf->cur_ptr_offsets = (unsigned*) malloc( sortbuf->histogram_size * sizeof(unsigned) );
+ // NOTE(review): neither malloc result is checked before it is used
+ // init histogram to 0's
+ memset( sortbuf->histogram, 0, sortbuf->histogram_size * sizeof(unsigned) );
+
+ // calculate the base number and divisor
+ // 16777216 == 2^24: the number of distinct values of the 3-byte key prefix read below
+ bin_divisor = (unsigned)16777216 / (unsigned)NumBins;
+ // need ceiling of this
+ bin_divisor += (unsigned)16777216 % (unsigned)NumBins ? 1 : 0;
+ // keyval = first three key bytes of record 0, most significant byte first
+ for( i = 0; i < 3; i++ ) {
+ keyval <<= 8;
+ keyval += sortbuf->buf->recs[0].key[i];
+ }
+ sortbuf->base_number = (keyval / bin_divisor) * bin_divisor;
+ // divisor = ceil( bin_divisor / histogram_size ); the radix passes divide (prefix - base_number) by it
+ sortbuf->divisor = (unsigned)bin_divisor / (unsigned)sortbuf->histogram_size;
+ sortbuf->divisor += (unsigned)bin_divisor % (unsigned)sortbuf->histogram_size ? 1 : 0;
+
+ // init some values
+ sortbuf->cur_position = 0;
+ sortbuf->sort_state = CalculateHistogram;
+ sortbuf->radix_tmp = scratch_buffer;
+
+ // allocate ptr buffer memory
+ sortbuf->rec_ptrs = (record_t**) malloc( sortbuf->buf->numrecs * sizeof(record_t*) );
+
+}
+
+void RadixSort( sort_buf_t* sortbuf )
+{
+ switch(sortbuf->sort_state){
+ case CalculateHistogram:
+ RadixHistogram( sortbuf );
+ break;
+ case CopyPointers:
+ RadixCopy( sortbuf );
+ break;
+ case QsortPointers:
+ QSortPointers( sortbuf );
+ break;
+ case CopyData:
+ CopySortedData( sortbuf );
+ break;
+ default:
+ printf("Error in sort_state\n");
+ }
+}
+
+void RadixHistogram( sort_buf_t* sortbuf ){
+ unsigned data_bucket;
+ unsigned maxI;
+ unsigned histI;
+ unsigned cur_offset;
+ unsigned tmp;
+ record_t* cur_rec;
+
+ maxI = sortbuf->cur_position + HISTOGRAM_CHUNK_SIZE;
+ maxI = maxI < (unsigned)sortbuf->buf->numrecs ? maxI : (unsigned)sortbuf->buf->numrecs;
+
+ // do a complete pass over the data set, summing the number of entries
+ // in each bucket
+ for(; sortbuf->cur_position < maxI; sortbuf->cur_position++){
+ cur_rec = &(sortbuf->buf->recs[ sortbuf->cur_position ]);
+ data_bucket = cur_rec->key[0];
+ data_bucket <<= 8;
+ data_bucket += cur_rec->key[1];
+ data_bucket <<= 8;
+ data_bucket += cur_rec->key[2];
+
+ data_bucket -= sortbuf->base_number;
+ data_bucket /= sortbuf->divisor;
+ sortbuf->histogram[data_bucket]++;
+ }
+
+ // check if we've completed this stage
+ if( sortbuf->cur_position == (unsigned)sortbuf->buf->numrecs ){
+
+ // do a pass over the histogram converting the counts to offsets
+ cur_offset = 0;
+ for( histI = 0; histI < sortbuf->histogram_size; histI++){
+ tmp = sortbuf->histogram[ histI ];
+ sortbuf->histogram[ histI ] = cur_offset;
+ cur_offset += tmp;
+ }
+
+ // copy pointers is the next stage
+ sortbuf->sort_state = CopyPointers;
+ sortbuf->cur_position = 0;
+ }
+}
+
+void RadixCopy( sort_buf_t* sortbuf ){
+
+ unsigned data_bucket;
+
+ unsigned maxI;
+ record_t* cur_rec;
+
+ maxI = sortbuf->cur_position + PTR_COPY_CHUNK_SIZE;
+ maxI = maxI < (unsigned)sortbuf->buf->numrecs ? maxI : (unsigned)sortbuf->buf->numrecs;
+
+ // if its the first time through then initialize cur_ptr_offsets
+ if(sortbuf->cur_position == 0 )
+ memcpy(sortbuf->cur_ptr_offsets, sortbuf->histogram, sortbuf->histogram_size * sizeof(unsigned) );
+
+ // do a complete pass over the data set, setting an entry in the pointer
+ // array for the correct bucket
+ for(; sortbuf->cur_position < maxI; sortbuf->cur_position++){
+ cur_rec = &(sortbuf->buf->recs[ sortbuf->cur_position ]);
+ data_bucket = cur_rec->key[0];
+ data_bucket <<= 8;
+ data_bucket += cur_rec->key[1];
+ data_bucket <<= 8;
+ data_bucket += cur_rec->key[2];
+
+ data_bucket -= sortbuf->base_number;
+ data_bucket /= sortbuf->divisor;
+
+ sortbuf->rec_ptrs[ sortbuf->cur_ptr_offsets[ data_bucket ] ] = cur_rec;
+ sortbuf->cur_ptr_offsets[ data_bucket ]++;
+ }
+
+ // check if we've completed this stage
+ if( sortbuf->cur_position == (unsigned)sortbuf->buf->numrecs ){
+ sortbuf->sort_state = QsortPointers;
+ sortbuf->cur_position = 0;
+ }
+
+}
+
+void QSortPointers( sort_buf_t* sortbuf )
+{
+    // Quicksorts up to SORT_BINS_SIZE histogram buckets per call, resuming at cur_position.
+    unsigned binI = sortbuf->cur_position;
+    unsigned maxI = binI + SORT_BINS_SIZE;
+    // the last bucket has no histogram[binI + 1] entry, so the loop always stops short of it
+    maxI = maxI < sortbuf->histogram_size ? maxI : sortbuf->histogram_size - 1;
+    for(; binI < maxI; binI++){
+        if( sortbuf->histogram[binI + 1] - sortbuf->histogram[binI] > 1 )
+            QSort( sortbuf->rec_ptrs, sortbuf->histogram[binI], sortbuf->histogram[binI + 1] - 1 );
+    }
+    sortbuf->cur_position = binI;
+    if( binI == sortbuf->histogram_size - 1 ){
+        // the last bucket holds numrecs - histogram[binI] records; sort it when that is 2 or more
+        if( (unsigned)sortbuf->buf->numrecs - sortbuf->histogram[binI] > 1 )
+            QSort( sortbuf->rec_ptrs, sortbuf->histogram[binI], sortbuf->buf->numrecs - 1 );
+        sortbuf->sort_state = CopyData;
+        sortbuf->cur_position = 0;
+    }
+}
+
+
+void CopySortedData ( sort_buf_t* sortbuf ){
+ unsigned recordI = sortbuf->cur_position;
+ unsigned maxI = recordI + COPY_CHUNK_SIZE;
+ record_t* tmp;
+
+ // set the processing limit for this time through.
+ maxI = maxI < (unsigned)sortbuf->buf->numrecs ? maxI : (unsigned)sortbuf->buf->numrecs;
+
+ for(; recordI < maxI; recordI++ )
+ sortbuf->radix_tmp->recs[recordI] = *(sortbuf->rec_ptrs[recordI]);
+
+ sortbuf->cur_position = recordI;
+
+ // check if we're all done with sorting
+ if(recordI == (unsigned)sortbuf->buf->numrecs){
+ // swap the pointers
+ tmp = sortbuf->radix_tmp->recs;
+ sortbuf->radix_tmp->recs = sortbuf->buf->recs;
+ sortbuf->buf->recs = tmp;
+
+ // set our state to completion
+ sortbuf->state = WRITE_RESTRUCTURE;
+
+ // release memory
+ free( sortbuf->rec_ptrs );
+ free( sortbuf->histogram );
+ free( sortbuf->cur_ptr_offsets );
+ }
+
+
+}
+
+
+// QBrute sorts ranges of up to 4 elements (hi - lo <= 3) by direct comparison
+void QBrute( record_t* a[], int lo, int hi ) {
+ if ((hi-lo) == 1) {
+ if( CompareKeyPtrs( a[hi], a[lo] ) < 0 ) {
+ record_t* T = a[lo];
+ a[lo] = a[hi];
+ a[hi] = T;
+ }
+ }else
+ if ((hi-lo) == 2) {
+ int pmin = CompareKeyPtrs( a[lo], a[lo+1] ) < 0 ? lo : lo+1;
+ pmin = CompareKeyPtrs( a[pmin], a[lo+2] ) < 0 ? pmin : lo+2;
+ if (pmin != lo) {
+ record_t* T = a[lo];
+ a[lo] = a[pmin];
+ a[pmin] = T;
+ }
+ QBrute(a, lo+1, hi);
+ }else
+ if ((hi-lo) == 3) {
+ int pmin, pmax;
+ pmin = CompareKeyPtrs( a[lo], a[lo+1] ) < 0 ? lo : lo+1;
+ pmin = CompareKeyPtrs( a[pmin], a[lo+2] ) < 0 ? pmin : lo+2;
+ pmin = CompareKeyPtrs( a[pmin], a[lo+3] ) < 0 ? pmin : lo+3;
+ if (pmin != lo) {
+ record_t* T = a[lo];
+ a[lo] = a[pmin];
+ a[pmin] = T;
+ }
+ pmax = CompareKeyPtrs( a[hi], a[hi-1] ) > 0 ? hi : hi-1;
+ pmax = CompareKeyPtrs( a[pmax], a[hi-2] ) > 0 ? pmax : hi-2;
+ if (pmax != hi) {
+ record_t* T = a[hi];
+ a[hi] = a[pmax];
+ a[pmax] = T;
+ }
+ QBrute(a, lo+1, hi-1);
+ }
+}
+
+
+
+void QSort( record_t* a[], int lo0, int hi0 ) {
+
+ int lo = lo0;
+ int hi = hi0;
+
+ record_t* pivot;
+
+ if ((hi-lo) <= 3) {
+ QBrute(a, lo, hi);
+ return;
+ }
+
+ /*
+ * Pick a pivot and move it out of the way
+ */
+ pivot = a[(lo + hi) / 2];
+ a[(lo + hi) / 2] = a[hi];
+ a[hi] = pivot;
+
+ while( lo < hi ) {
+ /*
+ * Search forward from a[lo] until an element is found that
+ * is greater than the pivot or lo >= hi
+ */
+ //while( a[lo] <= pivot && lo < hi ) {
+ while( (CompareKeyPtrs( a[lo], pivot ) <= 0) && lo < hi ) {
+ lo++;
+ }
+
+ /*
+ * Search backward from a[hi] until element is found that
+ * is less than the pivot, or hi <= lo
+ */
+ //while (pivot <= a[hi] && lo < hi ) {
+ while( (CompareKeyPtrs( pivot, a[hi] ) <= 0) && lo < hi ) {
+ hi--;
+ }
+
+ /*
+ * Swap elements a[lo] and a[hi]
+ */
+ if( lo < hi ) {
+ record_t* T = a[lo];
+ a[lo] = a[hi];
+ a[hi] = T;
+ }
+ }
+
+ /*
+ * Put the median in the "center" of the list
+ */
+ a[hi0] = a[hi];
+ a[hi] = pivot;
+
+ /*
+ * Recursive calls, elements a[lo0] to a[lo-1] are less than or
+ * equal to pivot, elements a[hi+1] to a[hi0] are greater than
+ * pivot.
+ */
+ QSort( a, lo0, lo-1 );
+ QSort( a, hi+1, hi0 );
+}
+
+
+#endif /* !USE_QSORT_ONLY */
diff --git a/libMems/dmSML/sorting.h b/libMems/dmSML/sorting.h
new file mode 100644
index 0000000..fe527ec
--- /dev/null
+++ b/libMems/dmSML/sorting.h
@@ -0,0 +1,81 @@
+#ifndef _sorting_h_
+#define _sorting_h_
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/dmSML/buffer.h"
+
+
+// define this if you want to use the qsort only version
+// of dmsort.
+#define USE_QSORT_ONLY
+
+
+
+
+// START configurable values
+
+// the number of bits in each radix.
+#define RADIX_BITS 12
+
+// the number of bins to qsort during each call to RadixSort during the qsort phase
+#define SORT_BINS_SIZE 1000
+// the number of records to copy into sorted order during each call to CopySortedData
+#define COPY_CHUNK_SIZE 50000
+#define HISTOGRAM_CHUNK_SIZE 50000
+#define PTR_COPY_CHUNK_SIZE 50000
+
+// END configurable values
+
+// sorting states -- this is for the second phase, after binning
+#define WAIT_WRITE (-100)
+#define WAIT_READ (-200)
+#define SORTING (-300)
+#define BUSY_READ (-400)
+#define BUSY_WRITE (-500)
+#define SORTING_SCRATCH (-600)
+#define WRITE_RESTRUCTURE (-700)
+
+enum{
+ CalculateHistogram = 0, // At this stage we compute a histogram on the current radix
+ CopyPointers = 1, // This stage copies the pointers into (more) sorted order
+ QsortPointers = 2, // This stage qsorts the pointers
+ CopyData = 3 // This stage copies the data into totally sorted order
+};
+
+typedef struct sort_buf_s {
+ int state; // WAIT_READ, WAIT_WRITE, SORTING, BUSY
+ int bin; // what bin this buffer holds right now.
+ iodevice_t *dev;
+ buffer_t *buf; // the buffer where records live
+ buffer_t *radix_tmp; // temp space for the radix sort copy
+ record_t **rec_ptrs; // array of pointers to records
+
+ unsigned base_number;
+ unsigned divisor;
+ unsigned histogram_size;
+ unsigned *histogram; // the histogram of bins
+ unsigned *cur_ptr_offsets; // the locations to copy data in each histogram bucket
+ unsigned cur_position; // the current record or bin position in the current stage.
+ int sort_state; // current state of the sort algorithm
+} sort_buf_t;
+
+// Need NumBins so that we can compute the amount already sorted
+extern int NumBins;
+
+//typedef unsigned long long uint64;
+
+/* Prepares the caller-supplied sortbuf for an incremental radix sort of
+ * sortbuf->buf; scratch_buffer becomes the temporary copy target (radix_tmp).
+ */
+void InitRadixSort( sort_buf_t* sortbuf, buffer_t* scratch_buffer );
+
+/* Checks the current state of the radix sort and performs a fixed
+ * amount of sorting computation before returning.
+ * Call repeatedly until sortbuf->state becomes WRITE_RESTRUCTURE.
+ */
+void RadixSort( sort_buf_t* sortbuffer );
+
+#endif /* _sorting_h_ */
diff --git a/libMems/dmSML/timing.c b/libMems/dmSML/timing.c
new file mode 100644
index 0000000..d651d9a
--- /dev/null
+++ b/libMems/dmSML/timing.c
@@ -0,0 +1,164 @@
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#ifndef WIN32
+#include <sys/time.h>
+#include <unistd.h>
+//#include <malloc.h>
+#endif
+
+#include <stdlib.h>
+#include <stdio.h>
+
+
+#include "libMems/dmSML/util.h"
+#include "libMems/dmSML/timing.h"
+
+
+struct dmtimer_s {
+#ifdef WIN32
+ unsigned int last;
+#else
+ struct timeval tv;
+#endif
+};
+
+
+
+typedef int Int;
+typedef unsigned int UInt;
+typedef double Float64;
+
+#ifdef WIN32
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <mmsystem.h>
+// keep this many significant bits from the PerformanceCounter values.
+#define NUM_FREQ_BITS (14)
+static Int ShiftAmt;
+static Int TicksPerSecond;
+static Int LastReadValue;
+static Int BaseTime;
+#endif /* WIN32 */
+
+
+dmtimer_t * StartTimer() {
+#ifdef WIN32
+ dmtimer_t * t = malloc( sizeof( *t ) );
+ t->last = timeGetTime();
+ return( t );
+#else
+ dmtimer_t * t = malloc( sizeof( *t ) );
+ gettimeofday( &(t->tv), NULL );
+ return( t );
+#endif /* WIN32 */
+}
+
+
+
+unsigned int ReadTimer( dmtimer_t * t ) {
+    // Returns the number of milliseconds elapsed since t was started.
+#ifdef WIN32
+    /*
+    Int ticks;
+    LARGE_INTEGER pcnow;
+    Float64 seconds;
+    QueryPerformanceCounter( &pcnow );
+    Shift64( ShiftAmt, (int*)&pcnow.HighPart, (int*)&pcnow.LowPart );
+    ticks = pcnow.LowPart;
+    LastReadValue = ticks;
+    if( ticks < BaseTime ) {
+        // handle wraparound.
+        ticks += ((1 << NUM_FREQ_BITS)) - BaseTime;
+    } else {
+        ticks -= BaseTime;
+    }
+    seconds = (Float64)ticks / (Float64)TicksPerSecond;
+    return( (int)(seconds * 10000 + 0.5) );
+    */
+    unsigned int cur = timeGetTime();
+    return( cur - t->last );
+#else
+    struct timeval current;
+    unsigned int begintime, endtime;
+    gettimeofday( &current, NULL );    // no timezone needed, same as StartTimer()
+    begintime = 1000 * t->tv.tv_sec + (t->tv.tv_usec/1000);
+    endtime = 1000 * current.tv_sec + (current.tv_usec/1000);
+    return( endtime - begintime );
+#endif
+}
+
+
+
+void StopTimer( dmtimer_t * t ) {
+ free( t );
+}
+
+
+
+#ifdef WIN32
+static void InitTimeWIN32() {
+
+ timeBeginPeriod( 1 );
+
+ /*
+ LARGE_INTEGER pcfreq;
+ UInt pchi, pclow, hihibit, lowhibit, highbit;
+ UInt i;
+ ShiftAmt = 0;
+ QueryPerformanceFrequency( &pcfreq );
+ pchi = pcfreq.HighPart;
+ pclow = pcfreq.LowPart;
+ // we want to look at the most significant 14 bits of the counter,
+ // so we get about 1/10000th second accuracy
+ // (between 8192ths - 16383ths second accuracy to be exact).
+ // find the highest bit set in the high part.
+ for( i = sizeof( pchi ) * 8; i ; i-- ) {
+ if( pchi & 0x80000000 ) {
+ break;
+ }
+ pchi = pchi << 1;
+ }
+ hihibit = i;
+ // find the highest bit set in the low part.
+ for( i = sizeof( pclow ) * 8; i ; i-- ) {
+ if( pclow & 0x80000000 ) {
+ break;
+ }
+ pclow = pclow << 1;
+ }
+ lowhibit = i;
+ if( hihibit ) {
+ highbit = hihibit + 32;
+ } else {
+ highbit = lowhibit;
+ }
+ pchi = pcfreq.HighPart;
+ pclow = pcfreq.LowPart;
+ if( highbit <= NUM_FREQ_BITS ) {
+ ShiftAmt = 0;
+ } else {
+ ShiftAmt = highbit - NUM_FREQ_BITS;
+ }
+ Shift64( ShiftAmt, (int*)&pchi, (int*)&pclow );
+ // now we have the most significant 14 bits of frequency.
+ TicksPerSecond = pclow;
+ // now actually read the counter, compute the ticks and store it away
+ // so we have a base for the first call.
+ QueryPerformanceCounter( &pcfreq );
+ // this demonstrates the procedure for converting a LARGE_INTEGER
+ // to ticks.
+ Shift64( ShiftAmt, (int*)&pcfreq.HighPart, (int*)&pcfreq.LowPart );
+ LastReadValue = pcfreq.LowPart;
+ BaseTime = LastReadValue;
+ */
+}
+#endif /* WIN32 */
+
+
+void InitTime() {
+#ifdef WIN32
+ InitTimeWIN32();
+#endif
+}
diff --git a/libMems/dmSML/timing.h b/libMems/dmSML/timing.h
new file mode 100644
index 0000000..28d921e
--- /dev/null
+++ b/libMems/dmSML/timing.h
@@ -0,0 +1,24 @@
+#ifndef _timing_h_
+#define _timing_h_
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+// an opaque timer type.
+typedef struct dmtimer_s dmtimer_t;
+
+// starts the timer
+dmtimer_t * StartTimer();
+
+// reads the timer (msec)
+unsigned int ReadTimer( dmtimer_t * t );
+
+// stops the timer.
+void StopTimer( dmtimer_t * t );
+
+// initialize the timing code.
+void InitTime();
+
+
+#endif /* _timing_h_ */
diff --git a/libMems/dmSML/util.c b/libMems/dmSML/util.c
new file mode 100644
index 0000000..14f285e
--- /dev/null
+++ b/libMems/dmSML/util.c
@@ -0,0 +1,132 @@
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+#include <errno.h>
+#include <stdio.h>
+#include "libMems/dmSML/util.h"
+
+#define FMT_BUFFER_SIZE (32)
+#define FMT_MAX_STRING (1024)
+
+static char FmtBuffer[FMT_BUFFER_SIZE][FMT_MAX_STRING];
+static int FmtIdx;
+
+const char * Fmt( const char * fmt, ... ) {
+ const char * ret;
+ va_list args;
+ va_start( args, fmt );
+ ret = VFmt( fmt, args );
+ va_end( args );
+ return( ret );
+}
+
+
+const char * VFmt( const char * fmt, va_list args ) {
+ if( ++FmtIdx >= FMT_BUFFER_SIZE ) {
+ FmtIdx = 0;
+ }
+ // silly windows....
+#ifdef WIN32
+ _vsnprintf( FmtBuffer[FmtIdx], sizeof( FmtBuffer[FmtIdx] ), fmt, args );
+#else
+ vsnprintf( FmtBuffer[FmtIdx], sizeof( FmtBuffer[FmtIdx] ), fmt, args );
+#endif
+ FmtBuffer[FmtIdx][FMT_MAX_STRING-1] = '\0';
+ return( FmtBuffer[FmtIdx] );
+}
+
+
+/// Shifts a 64-bit value (held as two 32 bit parts) either right or left, in place.
+/// amt negative -> left, positive -> right; |amt| must be smaller than the width of int.
+void Shift64( int amt, int * hi, int * lo ) {
+    const unsigned bits = (unsigned)(sizeof( *hi ) * 8);
+    unsigned uhi, ulo;
+    if( amt == 0 )
+        return;
+    uhi = (unsigned)*hi; ulo = (unsigned)*lo;    // unsigned copies: bits crossing between the parts must not be sign-extended
+    if( amt > 0 ) {
+        *lo = (int)((ulo >> amt) | (uhi << (bits - amt)));
+        *hi >>= amt;    // the high part keeps its original signed shift
+    } else {
+        amt = -amt;
+        *hi = (int)((uhi << amt) | (ulo >> (bits - amt)));
+        *lo = (int)(ulo << amt);
+    }
+}
+
+
+
+
+void AddTo64( unsigned int amt, unsigned int *hi, unsigned int *lo ) {
+
+ int i;
+ // holds each byte value.
+ int in[8], out[8], tmp[8];
+ int carry;
+
+ for( i = 0; i < 8; i++ ) {
+ in[i] = out[i] = tmp[i] = 0;
+ }
+
+ in[0] = amt & 0xFF;
+ in[1] = (amt >> 8) & 0xFF;
+ in[2] = (amt >> 16) & 0xFF;
+ in[3] = (amt >> 24) & 0xFF;
+
+ tmp[0] = *lo & 0xFF;
+ tmp[1] = (*lo >> 8) & 0xFF;
+ tmp[2] = (*lo >> 16) & 0xFF;
+ tmp[3] = (*lo >> 24) & 0xFF;
+ tmp[4] = *hi & 0xFF;
+ tmp[5] = (*hi >> 8) & 0xFF;
+ tmp[6] = (*hi >> 16) & 0xFF;
+ tmp[7] = (*hi >> 24) & 0xFF;
+
+
+ /*
+ out[0] = (tmp[0] + in[0]);
+ carry = out[0] >> 8;
+ out[0] &= 0xFF;
+
+ out[1] = (tmp[1] + in[1] + carry);
+ carry = out[1] >> 8;
+ out[1] &= 0xFF;
+
+ out[2] = (tmp[2] + in[2] + carry);
+ carry = out[2] >> 8;
+ out[2] &= 0xFF;
+
+ out[3] = (tmp[3] + in[3] + carry);
+ carry = out[3] >> 8;
+ out[3] &= 0xFF;
+ */
+
+ carry = 0;
+ for( i = 0; i < 8; i++ ) {
+ out[i] = in[i] + tmp[i] + carry;
+ carry = out[i] >> 8;
+ out[i] &= 0xFF;
+ }
+
+ // convert back to 2-int form.
+ *lo = out[0] + (out[1] << 8) + (out[2] << 16) + (out[3] << 24);
+ *hi = out[4] + (out[5] << 8) + (out[6] << 16) + (out[7] << 24);
+
+}
+
+/** Deletes filename. Non-Windows: returns 0 on success or if the file did not exist (as rm -f did), -1 otherwise. */
+int removeFile( const char* filename, int verbose )
+{
+#ifdef WIN32
+    return remove( filename );
+//	return !DeleteFile( filename );
+#else
+    // remove() takes the name verbatim: no shell parses it and no fixed-size Fmt() buffer can truncate it
+    if( remove( filename ) != 0 )
+        return errno == ENOENT ? 0 : -1;
+    if( verbose )
+        printf( "removed '%s'\n", filename );
+    return 0;
+#endif
+}
+
diff --git a/libMems/dmSML/util.h b/libMems/dmSML/util.h
new file mode 100644
index 0000000..5d2acc8
--- /dev/null
+++ b/libMems/dmSML/util.h
@@ -0,0 +1,28 @@
+#ifndef _util_h_
+#define _util_h_
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdarg.h>
+
+// these just let you get a temporary string -- don't hang onto it for very
+// long though -- it will get overwritten at some point. This is useful for
+// passing as parms and such.
+const char * Fmt( const char * fmt, ... );
+const char * VFmt( const char * fmt, va_list args );
+
+
+/// shifts a 64-bit value (in two 32 bit parts) either right or left.
+/// amt negative -> left, positive -> right
+void Shift64( int amt, int * hi, int * lo );
+
+
+void AddTo64( unsigned int amt, unsigned int *hi, unsigned int *lo );
+
+/** cross-platform file deletion */
+int removeFile( const char* filename, int verbose );
+
+
+#endif /* _util_h_ */
diff --git a/libMems/gnAlignedSequences.cpp b/libMems/gnAlignedSequences.cpp
new file mode 100644
index 0000000..b00cd28
--- /dev/null
+++ b/libMems/gnAlignedSequences.cpp
@@ -0,0 +1,1570 @@
+/*******************************************************************************
+ * $Id: gnAlignedSequences.cpp,v 1.11 2004/03/01 02:40:08 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * Please see the file called COPYING for licensing, copying, and modification
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/gnAlignedSequences.h"
+#include <sstream>
+
+using namespace std;
+using namespace genome;
+namespace mems {
+
+gnAlignedSequences::gnAlignedSequences()
+{
+ alignedSequenceFileName = "";
+
+
+}
+
+
+gnAlignedSequences::gnAlignedSequences(const gnAlignedSequences &toCopy)
+{
+ alignedSequenceFileName = toCopy.alignedSequenceFileName;
+ consensus = toCopy.consensus;
+
+ names = toCopy.names;
+ sequences = toCopy.sequences;
+ positions = toCopy.positions;
+}
+
+
+void gnAlignedSequences::constructFromClustalW(string alignedFileName)
+{
+ alignedSequenceFileName = alignedFileName;
+
+ readClustalWAlignment();
+ buildConsensus();
+
+ indexPositions.resize(consensus.size());
+ for (int i=0; i<consensus.size(); i++)
+ indexPositions[i] = i+1;
+}
+
+
+void gnAlignedSequences::constructFromPhylip(string alignedFileName)
+{
+ alignedSequenceFileName = alignedFileName;
+
+ readPhylipAlignment();
+ buildConsensus();
+
+ indexPositions.resize(consensus.size());
+ for (int i=0; i<consensus.size(); i++)
+ indexPositions[i] = i+1;
+}
+
+
+void gnAlignedSequences::constructFromMSF(string alignedFileName)
+{
+ alignedSequenceFileName = alignedFileName;
+
+ readMSFAlignment();
+ buildConsensus();
+
+ indexPositions.resize(consensus.size());
+ for (int i=0; i<consensus.size(); i++)
+ indexPositions[i] = i+1;
+}
+
+
+void gnAlignedSequences::constructFromRelaxedNexus( istream& align_stream ){
+ readRelaxedNexusAlignment( align_stream );
+// buildConsensus();
+
+// indexPositions.resize(consensus.size());
+// for (int i=0; i<consensus.size(); i++)
+// indexPositions[i] = i+1;
+}
+
+void gnAlignedSequences::constructFromNexus(string alignedFileName)
+{
+ alignedSequenceFileName = alignedFileName;
+
+ readNexusAlignment();
+ buildConsensus();
+
+ indexPositions.resize(consensus.size());
+ for (int i=0; i<consensus.size(); i++)
+ indexPositions[i] = i+1;
+}
+
+
+void gnAlignedSequences::constructFromMega(string alignedFileName)
+{
+ alignedSequenceFileName = alignedFileName;
+
+ readMegaAlignment();
+ buildConsensus();
+
+ indexPositions.resize(consensus.size());
+ for (int i=0; i<consensus.size(); i++)
+ indexPositions[i] = i+1;
+}
+
+const vector< string >& gnAlignedSequences::getSupportedFormats()
+{
+ static vector< string > formats;
+ if( formats.size() == 0 ){
+ formats.push_back( "phylip" );
+ formats.push_back( "clustal" );
+ formats.push_back( "msf" );
+ formats.push_back( "nexus" );
+ formats.push_back( "mega" );
+ formats.push_back( "codon" );
+ }
+ return formats;
+}
+
+boolean gnAlignedSequences::isSupportedFormat( const string& format_name )
+{
+ const vector< string >& formats = getSupportedFormats();
+ for( int formatI = 0; formatI < formats.size(); formatI++ ){
+ if( formats[ formatI ] == format_name )
+ return true;
+ }
+ return false;
+}
+void gnAlignedSequences::output( const string& format_name, ostream& os ) const
+{
+ bool rval = false;
+
+ if( format_name == "phylip" )
+ rval = outputPhylip( os );
+
+ if( format_name == "clustal" )
+ rval = outputClustalW( os );
+
+ if( format_name == "msf" )
+ rval = outputMSF( os );
+
+ if( format_name == "nexus" )
+ rval = outputNexus( os );
+
+ if( format_name == "mega" )
+ rval = outputMega( os );
+
+ if( format_name == "codon" )
+ rval = outputCodon( os );
+
+ if( !rval )
+ throw "Error writing alignment\n";
+
+}
+
+bool gnAlignedSequences::outputPhylip(ostream& os) const
+{
+    // Writes a summary line, then each sequence: its name padded to 11 columns, then 10-base groups.
+    os << "Sequences in Alignment: " << sequences.size()
+        << " Bases in Each Aligned Sequence: " << ( sequences.empty() ? 0 : sequences[0].length() ) << endl;
+
+    int offset = 10;
+    uint seqI;
+    for( seqI = 0; seqI < sequences.size(); seqI++ )
+    {
+        int position = 0;
+        const string& seq = sequences[ seqI ];
+        string seqName = names[ seqI ].substr( 0, offset );
+        seqName.append( offset - seqName.length() + 1, ' ' );
+
+        os << seqName;
+
+        for ( position=0; position + offset < seq.size(); position += offset){
+            if ( position % 50 == 0)
+                os << endl;
+            os.write( seq.data() + position, offset );
+            os << ' ';
+        }
+
+        if ( position % 50 == 0)
+            os << endl;
+
+        os.write( seq.data() + position, seq.size() - position );
+        os << endl;
+    }
+
+    return true;
+}
+
+uint64 countGaps( string& seq );
+uint64 countGaps( string& seq ){
+ uint gap_count = 0;
+ for( uint charI = 0; charI < seq.length(); charI++ )
+ if( seq[ charI ] == '-' )
+ gap_count++;
+ return gap_count;
+}
+
+bool gnAlignedSequences::outputClustalW(ostream& os) const
+{
+    boolean output_positions = true;
+    // Writes 60-column blocks; each row ends with that sequence's running count of non-gap characters.
+    os << "Clustal W multiple sequence alignment" << endl;
+
+    vector< int64 > seq_pos( sequences.size(), 0 );
+    if( positions.size() == sequences.size() )
+        seq_pos = positions;
+    vector< string > seq_names;
+    int pos;
+    uint seqI = 0;
+    int longestNameSize = 0;
+    for( ; seqI < sequences.size(); seqI++ )
+    {
+        seq_names.push_back( names[ seqI ].substr( 0, 30 ) );
+        if ( seq_names[ seq_names.size() - 1 ].length() > longestNameSize)
+            longestNameSize=seq_names[ seq_names.size() - 1 ].length();
+    }
+    // add space padding to the names
+    for( seqI = 0; seqI < seq_names.size(); seqI++ )
+        seq_names[ seqI ] += string( (longestNameSize - seq_names[ seqI ].length()) + 6, ' ' );
+    for (pos=0; pos+60 < alignedSeqsSize(); pos+=60)
+    {
+        os << endl
+            << endl;
+        for( seqI = 0; seqI < sequences.size(); seqI++ )
+        {
+            os << seq_names[ seqI ];
+            const string& seq = sequences[ seqI ];
+            string cur_seq = seq.substr( pos, 60 );
+            os << cur_seq;
+            if( output_positions ){
+                seq_pos[ seqI ] += cur_seq.length() - countGaps( cur_seq );
+                os << " " << seq_pos[ seqI ];
+            }
+            os << endl;
+        }
+    }
+
+    if (pos<alignedSeqsSize())
+    {
+        os << endl
+            << endl;
+        // final block: usually shorter than 60 columns, so count the characters actually written
+        for( seqI = 0; seqI < sequences.size(); seqI++ )
+        {
+            os << seq_names[ seqI ];
+            const string& seq = sequences[ seqI ];
+            string cur_seq = seq.substr( pos, 60 );
+            os << cur_seq;
+            if( output_positions ){
+                seq_pos[ seqI ] += cur_seq.length() - countGaps( cur_seq );
+                os << " " << seq_pos[ seqI ];
+            }
+            os << endl;
+        }
+    }
+    return true;
+}
+
+
+bool gnAlignedSequences::outputMSF(ostream& os) const
+{
+    os << "//" << endl;
+    // Writes 60-column blocks with '.' for gaps, reading names/sequences like the other output*() writers.
+    if( sequences.size() == 0 )
+        return true;
+    uint seqI;
+    int longestSeqNameLength = 0;
+    for( seqI = 0; seqI < sequences.size(); seqI++ )
+        if( names[ seqI ].length() > longestSeqNameLength )
+            longestSeqNameLength = names[ seqI ].length();
+
+    int pos = 0;
+    for ( ; pos+60<sequences[0].size(); pos+=60)
+    {
+        // output spaces until sequence ordinates
+        for (int i=0; i<longestSeqNameLength+2; i++)
+            os << " ";
+
+        os << pos+1;
+        for (int i=0; i<54; i++) // output appropriate number of spaces on ordinate line
+            os << " ";
+        os << pos+60 << endl;
+
+        for( seqI = 0; seqI < sequences.size(); seqI++ )
+        {
+            int spaces = longestSeqNameLength-names[ seqI ].length();
+            for (int i=0; i<spaces; i++)
+                os << " ";
+
+            os << names[ seqI ] << " ";
+
+            string seq = sequences[ seqI ].substr(pos, 60);
+            for (int i=0; i<seq.length(); i++)
+            {
+                if (seq[i]=='-')
+                    os << ".";
+                else
+                    os << seq[i];
+            }
+            os << endl;
+        }
+
+        os << endl;
+    }
+
+    if (pos<sequences[0].size())
+    {
+        // output spaces until sequence ordinates
+        for (int i=0; i<longestSeqNameLength+2; i++)
+            os << " ";
+
+        os << pos+1;
+        for (int i=0; i<sequences[0].size()-pos; i++) // output appropriate number of spaces on ordinate line
+            os << " ";
+        os << sequences[0].size() << endl;
+
+        for( seqI = 0; seqI < sequences.size(); seqI++ )
+        {
+            int spaces = longestSeqNameLength-names[ seqI ].length();
+            for (int i=0; i<spaces; i++)
+                os << " ";
+
+            os << names[ seqI ] << " ";
+
+            string seq = sequences[ seqI ].substr(pos, sequences[0].size()-pos );
+            for (int i=0; i<seq.length(); i++)
+            {
+                if (seq[i]=='-')
+                    os << ".";
+                else
+                    os << seq[i];
+            }
+            os << endl;
+        }
+
+        os << endl;
+    }
+
+    return true;
+}
+
+
+
+bool gnAlignedSequences::outputNexus(ostream& os) const
+{
+    os << "begin data;" << endl
+        << " dimensions ntax=" << sequences.size();
+    if( sequences.size() == 0 )
+        return true;
+    os << " nchar="
+        << sequences[0].length() << ";" << endl
+        << " ;" << endl
+        << " matrix" << endl;
+
+    // Writes the matrix in 60-column blocks, each preceded by a bracketed line of ordinates.
+    int i;
+    int seqI;
+    int longestSeqNameLength = 0;
+    for( seqI = 0; seqI < sequences.size(); seqI++ ){
+        if( names[ seqI ].length() > longestSeqNameLength )
+            longestSeqNameLength = names[ seqI ].length();
+    }
+
+    int pos = 1;    // 1-based ordinate of the block's first column; its string index is pos - 1
+    for ( ; pos+59 < sequences[0].size(); pos+=60)
+    {
+        os << "[";
+        // output spaces until sequence ordinates
+        for (i = 0; i < longestSeqNameLength+2; i++)
+            os << " ";
+
+        os << pos;
+        for (i = 0; i < 54; i++) // output appropriate number of spaces on ordinate line
+            os << " ";
+        os << pos+59 << "]" << endl;
+
+        for( seqI = 0; seqI < sequences.size(); seqI++ )
+        {
+            os << names[ seqI ];
+
+            int spaces = longestSeqNameLength - names[ seqI ].length();
+            for (i = 0; i < spaces + 2; i++)
+                os << " ";
+
+            string seq = sequences[ seqI ].substr( pos - 1, 60 );
+            os << seq << endl;
+        }
+
+        os << endl;
+    }
+
+    // write out the last little bit
+    if (pos - 1 < sequences[0].size())
+    {
+        // the final block runs from ordinate pos to the end of the alignment
+        os << "[";
+        // output spaces until sequence ordinates
+        for (i = 0; i < longestSeqNameLength + 2; i++)
+            os << " ";
+
+        os << pos;
+        for (i=0; i < sequences[0].size() - pos + 1; i++) // output appropriate number of spaces on ordinate line
+            os << " ";
+        os << sequences[0].size() << "]" << endl;
+
+        // every sequence contributes exactly one row to the final block
+        for( seqI = 0; seqI < sequences.size(); seqI++ )
+        {
+            os << names[ seqI ];
+
+            int spaces = longestSeqNameLength - names[ seqI ].length();
+            for (i=0; i<spaces+2; i++)
+                os << " ";
+
+            string seq = sequences[seqI].substr( pos - 1 );
+            os << seq << endl;
+        }
+
+        os << endl;
+    }
+
+    return true;
+}
+/*
+bool gnAlignedSequences::outputNexus(ostream& os) const
+{
+ os << "begin data;" << endl
+ << " dimensions ntax=" << alignedSequences.size() << " nchar="
+ << alignedSequences.begin()->second->size() << ";" << endl
+ << " ;" << endl
+ << " matrix" << endl;
+
+ list <pair <string*, string*> >::const_iterator sequenceItr = alignedSequences.begin();
+ int i;
+ int longestSeqNameLength = 0;
+ for ( ; sequenceItr != alignedSequences.end(); sequenceItr++)
+ {
+ if ( sequenceItr->first->length() > longestSeqNameLength )
+ longestSeqNameLength = sequenceItr->first->length();
+ }
+
+ int pos = 1;
+ for ( ; pos+59 < alignedSequences.begin()->second->size(); pos+=60)
+ {
+ os << "[";
+ // output spaces until sequence ordinates
+ for (i = 0; i < longestSeqNameLength+2; i++)
+ os << " ";
+
+ os << pos;
+ for (i = 0; i < 54; i++) // output appropriate number of spaces on ordinate line
+ os << " ";
+ os << pos+59 << "]" << endl;
+
+ for (sequenceItr=alignedSequences.begin(); sequenceItr != alignedSequences.end(); sequenceItr++)
+ {
+ os << (*(*sequenceItr).first);
+
+ int spaces = longestSeqNameLength - sequenceItr->first->length();
+ for (i = 0; i < spaces + 2; i++)
+ os << " ";
+
+ string seq = sequenceItr->second->substr( pos, 60 );
+ for (i = 0; i < 60; i++)
+ os << seq[i];
+ os << endl;
+ }
+
+ os << endl;
+ }
+
+ if (pos - 1 < alignedSequences.begin()->second->size())
+ {
+ // output spaces until sequence ordinates
+ os << "[";
+ // output spaces until sequence ordinates
+ for (i = 0; i < longestSeqNameLength + 2; i++)
+ os << " ";
+
+ os << pos;
+ for (i=0; i < alignedSequences.begin()->second->size() - pos + 1; i++) // output appropriate number of spaces on ordinate line
+ os << " ";
+ os << pos+59 << "]" << endl;
+
+ for (sequenceItr = alignedSequences.begin(); sequenceItr != alignedSequences.end(); sequenceItr++)
+ {
+ os << *(sequenceItr->first);
+
+ int spaces = longestSeqNameLength-(*(*sequenceItr).first).length();
+ for (i=0; i<spaces+2; i++)
+ os << " ";
+
+ string seq = (*(*sequenceItr).second).substr( pos, (*(*alignedSequences.begin()).second).size()-pos+1 );
+ for (i=0; i<seq.length(); i++)
+ os << seq[i];
+ os << endl;
+ }
+
+ os << endl;
+ }
+
+ return false;
+}
+*/
+/**
+ * Writes the alignment held in `alignedSequences` to os in MEGA format,
+ * MEGA_ALIGN_COLUMNS columns per row group.
+ *
+ * Every sequence line is "#" + name, padding to the longest name plus five
+ * spaces, then the residues of the current group.  The length of the first
+ * sequence determines how many columns are written.  An empty alignment
+ * produces only the two header lines.
+ *
+ * @param os  stream to write to
+ * @return always true
+ */
+bool gnAlignedSequences::outputMega(ostream& os) const
+{
+	os << "#MEGA" << endl
+	   << "TITLE:" << endl;
+
+	// nothing to dereference below when there are no sequences
+	if( alignedSequences.empty() )
+		return true;
+
+	list <pair <string*, string*> >::const_iterator sequenceItr = alignedSequences.begin();
+	size_t longestSeqNameLength = 0;
+
+	for ( ; sequenceItr!=alignedSequences.end(); sequenceItr++){
+		if (sequenceItr->first->length() > longestSeqNameLength)
+			longestSeqNameLength = sequenceItr->first->length();
+	}
+
+	gnSeqI pos = 0;	// 0-based offset into each sequence, as substr() expects
+	gnSeqI remaining_len = alignedSequences.begin()->second->size();	//determine the amount to be written
+	// loop while there is more to write
+	while(remaining_len > 0){
+		os << endl;
+		gnSeqI write_chars = MEGA_ALIGN_COLUMNS < remaining_len ? MEGA_ALIGN_COLUMNS : remaining_len;
+
+		//write each sequence's line
+		for (sequenceItr = alignedSequences.begin(); sequenceItr != alignedSequences.end(); sequenceItr++){
+			os << "#" << *(sequenceItr->first);
+
+			const size_t padding = longestSeqNameLength - sequenceItr->first->length() + 5;
+			for (size_t i = 0; i < padding; i++)
+				os << " ";
+
+			// substr() clamps the count, so a short row prints what it has
+			// instead of reading past the end of the string
+			if( pos < sequenceItr->second->size() )
+				os << sequenceItr->second->substr( pos, write_chars );
+			os << endl;
+		}
+		os << endl;
+
+		pos += write_chars;
+		remaining_len -= write_chars;
+	}
+	return true;
+}
+
+
+/**
+ * Writes the alignment held in `alignedSequences` to os with the residues
+ * grouped in threes.
+ *
+ * The first line is a tab, the number of sequences, a tab and the length of
+ * the first sequence (0 when there are none).  Each sequence then follows as
+ * its name padded or truncated to exactly 10 characters and its residues in
+ * space-separated groups of three, with a line break before every 21st
+ * group.  The final group is written without a trailing space.
+ *
+ * @param os  stream to write to
+ * @return always false (callers in this file do not treat it as an error)
+ */
+bool gnAlignedSequences::outputCodon(ostream& os) const
+{
+	list <pair <string*, string*> >::const_iterator sequenceItr = alignedSequences.begin();
+
+	// begin() must not be dereferenced on an empty alignment
+	const size_t alignLength = alignedSequences.empty() ? 0 : sequenceItr->second->size();
+	os << '\t' << alignedSequences.size() << '\t' << alignLength << endl;
+
+	const string::size_type nameWidth = 10;
+	const string::size_type codonWidth = 3;
+	const int codonsPerLine = 20;
+
+	for ( ; sequenceItr != alignedSequences.end(); sequenceItr++)
+	{
+		const string& seq = *sequenceItr->second;
+
+		// pad with spaces or truncate so the name occupies exactly nameWidth
+		string seqName = *sequenceItr->first;
+		seqName.resize( nameWidth, ' ' );
+		os << seqName;
+
+		string::size_type position = 0;
+		int count = 0;
+		for ( ; position+codonWidth<seq.size(); position+=codonWidth)
+		{
+			if (count == codonsPerLine)
+			{
+				count = 0;
+				os << endl;
+			}
+			os << seq.substr( position, codonWidth ) << ' ';
+			count++;
+		}
+
+		// whatever is left (at most one codon) goes out without a separator
+		os << seq.substr( position ) << endl;
+	}
+
+	return false;
+}
+
+
+// Writes `data` to os in space-separated groups of ten characters, starting a
+// new line before every sixth group; the last group is written without a
+// trailing space and is followed by a newline.
+static void writeGroupsOfTen(ostream& os, const string& data)
+{
+	string::size_type position = 0;
+	int groupsOnLine = 0;
+	for ( ; position+10<data.size(); position+=10)
+	{
+		if (groupsOnLine == 5)
+		{
+			groupsOnLine = 0;
+			os << endl;
+		}
+		os << data.substr( position, 10 ) << ' ';
+		groupsOnLine++;
+	}
+	os << data.substr( position ) << endl;
+}
+
+
+/**
+ * Writes the alignment held in `alignedSequences`, followed by the current
+ * `consensus` string, to os.
+ *
+ * The first line is a tab, the number of sequences, a tab and the length of
+ * the first sequence (0 when there are none).  Each sequence is written as
+ * its name padded or truncated to exactly 10 characters followed by its
+ * residues in groups of ten; the consensus is written the same way after the
+ * label "Consensus:".
+ *
+ * @param os  stream to write to
+ * @return always false (callers in this file do not treat it as an error)
+ */
+bool gnAlignedSequences::outputWithConsensus(ostream& os)
+{
+	list <pair <string*, string*> >::iterator sequenceItr = alignedSequences.begin();
+
+	// begin() must not be dereferenced on an empty alignment
+	const size_t alignLength = alignedSequences.empty() ? 0 : sequenceItr->second->size();
+	os << '\t' << alignedSequences.size() << '\t' << alignLength << endl;
+
+	const string::size_type nameWidth = 10;
+	for ( ; sequenceItr != alignedSequences.end(); sequenceItr++)
+	{
+		// pad with spaces or truncate so the name occupies exactly nameWidth
+		string seqName = *sequenceItr->first;
+		seqName.resize( nameWidth, ' ' );
+
+		os << seqName;
+		writeGroupsOfTen( os, *sequenceItr->second );
+	}
+
+	os << "Consensus:";
+	writeGroupsOfTen( os, consensus );
+
+	return false;
+}
+
+
+/**
+ * Builds a new alignment by passing a fresh gnAlignedSequences to
+ * addAllSegments() with the given range and then rebuilding its consensus.
+ *
+ * @param start  first column, forwarded unchanged to addAllSegments()
+ * @param stop   last column, forwarded unchanged to addAllSegments()
+ * @return the newly built alignment, by value
+ */
+gnAlignedSequences gnAlignedSequences::getAlignedSegment(unsigned start, unsigned stop)
+{
+	gnAlignedSequences segment;
+	addAllSegments( segment, start, stop );
+	segment.buildConsensus();
+	return segment;
+}
+
+
+/**
+ * Collects every codonMultiple-th codon of the alignment into a new
+ * alignment, using addAllSegmentsReplaceGaps() for each three-column slice.
+ *
+ * The first slice starts at ((startCodon*3)-2)+(readingFrame-1) and slices
+ * are taken while the slice end lies before the end of the first sequence.
+ * An empty alignment or a non-positive codonMultiple (which would never
+ * advance) yields an empty result.
+ *
+ * @param readingFrame   reading frame used in the start computation
+ * @param startCodon     codon number used in the start computation
+ * @param codonMultiple  step between collected codons, in codons
+ * @return the collected codons with a rebuilt consensus
+ */
+gnAlignedSequences gnAlignedSequences::getCodons(int readingFrame, int startCodon, int codonMultiple)
+{
+	gnAlignedSequences toReturn;
+
+	if( alignedSequences.empty() || codonMultiple <= 0 )
+		return toReturn;
+
+	int startBase = ((startCodon*3)-2)+(readingFrame-1);
+
+	for (int index=startBase; (index+2)<(*(*alignedSequences.begin()).second).size(); index+=(codonMultiple*3))
+		addAllSegmentsReplaceGaps(toReturn, index, index+2);
+
+	// a consensus can only be built from at least one sequence
+	if( !toReturn.alignedSequences.empty() )
+		toReturn.buildConsensus();
+
+	return toReturn;
+}
+
+
+/**
+ * @return the length of the first entry of `sequences`, or 0 when
+ *         `sequences` is empty
+ */
+gnSeqI gnAlignedSequences::alignedSeqsSize() const
+{
+	if( sequences.size() == 0 )
+		return 0;
+	return sequences[ 0 ].size();
+}
+
+
+/**
+ * Erases the first entry of `alignedSequences` whose name equals seqName.
+ * Only the list entry is erased; the strings it points to are not deleted
+ * here.
+ *
+ * @param seqName  name to look for
+ * @return true if an entry was erased, false if no name matched
+ */
+bool gnAlignedSequences::removeAlignedSeq(string seqName)
+{
+	list <pair <string*, string*> >::iterator entry = alignedSequences.begin();
+
+	while( entry != alignedSequences.end() )
+	{
+		if( *entry->first == seqName )
+		{
+			alignedSequences.erase( entry );
+			return true;
+		}
+		++entry;
+	}
+
+	return false;
+}
+
+
+/**
+ * Erases the entry at the given 0-based position of `alignedSequences`.
+ * Only the list entry is erased; the strings it points to are not deleted
+ * here.
+ *
+ * @param index  0-based position of the entry to erase
+ * @return true if an entry was erased, false if index is past the end
+ */
+bool gnAlignedSequences::removeAlignedSeq(unsigned index)
+{
+	unsigned stepsLeft = index;
+	list <pair <string*, string*> >::iterator entry = alignedSequences.begin();
+
+	while( entry != alignedSequences.end() )
+	{
+		if( stepsLeft == 0 )
+		{
+			alignedSequences.erase( entry );
+			return true;
+		}
+		--stepsLeft;
+		++entry;
+	}
+
+	return false;
+}
+
+
+/**
+ * Appends every sequence of toConcat to the sequence of the same name in
+ * this alignment, then pads all sequences with '-' to the length of the
+ * longest one and rebuilds the consensus.
+ *
+ * Sequences of toConcat whose name does not occur here are ignored;
+ * sequences here without a counterpart in toConcat receive only padding.
+ *
+ * @param toConcat  alignment whose sequences are appended (taken by value)
+ */
+void gnAlignedSequences::concatenateAlignedSequences(gnAlignedSequences toConcat)
+{
+	list <pair <string*, string*> >::iterator toConcatItr = toConcat.alignedSequences.begin();
+	list <pair <string*, string*> >::iterator originalItr;
+
+	for ( ; toConcatItr != toConcat.alignedSequences.end(); toConcatItr++)
+	{
+		for (originalItr = alignedSequences.begin(); originalItr != alignedSequences.end(); originalItr++)
+		{
+			if ((*(*toConcatItr).first) == (*(*originalItr).first)) {
+				(*(*originalItr).second) += (*(*toConcatItr).second);
+				break;
+			}
+		}
+	}
+
+	// The padding width has to be measured after concatenation and over all
+	// rows; measuring it beforehand never pads anything.
+	string::size_type largestSeqSize = 0;
+	for (originalItr = alignedSequences.begin(); originalItr != alignedSequences.end(); originalItr++)
+	{
+		if ((*(*originalItr).second).size() > largestSeqSize)
+			largestSeqSize = (*(*originalItr).second).size();
+	}
+
+	for (originalItr = alignedSequences.begin(); originalItr != alignedSequences.end(); originalItr++)
+	{
+		string& row = *(*originalItr).second;
+		if (row.size() < largestSeqSize)
+			row.append( largestSeqSize - row.size(), '-' );
+	}
+
+	buildConsensus();
+}
+
+
+/**
+ * Finds the columns at which the sequences of this alignment disagree and
+ * hands them to variableSites.
+ *
+ * Columns are scanned from last to first.  The 1-based number of every
+ * mismatching column is pushed onto variableSites.indexPositions, and
+ * afterwards addAllSegments(variableSites, p, p) is called for each recorded
+ * position p in ascending column order, followed by
+ * variableSites.buildConsensus().  This object's own indexPositions is
+ * cleared.  Nothing else happens when this alignment holds no sequences.
+ *
+ * @param variableSites          alignment that receives the variable columns
+ * @param countGapsAsMismatches  when false, '-' never counts as a
+ *                               disagreement and the reference residue of a
+ *                               column is taken from the first ungapped row
+ */
+void gnAlignedSequences::extractVariableSites(gnAlignedSequences &variableSites, bool countGapsAsMismatches)
+{
+	indexPositions.resize(0);
+
+	if( alignedSequences.empty() )
+		return;
+
+	list <pair <string*, string*> >::iterator originalItr = alignedSequences.begin();
+	const int alignedSeqSize = (*((*originalItr).second)).size();
+
+	for (int position=alignedSeqSize; position > 0; position--)
+	{
+		// pick the row that supplies the reference residue for this column;
+		// the end() test must come before the dereference
+		originalItr = alignedSequences.begin();
+		if( !countGapsAsMismatches )
+		{
+			while( originalItr != alignedSequences.end() &&
+				(*((*originalItr).second))[position-1] == '-' )
+				originalItr++;
+		}
+
+		// NOTE(review): a column that is gapped in every row ends the whole
+		// scan rather than just being skipped; kept as in the original.
+		if (originalItr == alignedSequences.end()) break;
+
+		const char positionBase = (*((*originalItr).second))[position-1];
+
+		bool mismatch = false;
+		for ( ; originalItr != alignedSequences.end(); originalItr++)
+		{
+			const char base = (*((*originalItr).second))[position-1];
+			if( base == positionBase )
+				continue;
+			if( !countGapsAsMismatches && base == '-' )
+				continue;
+			mismatch = true;
+			break;
+		}
+
+		if( mismatch )
+			variableSites.indexPositions.push_back(position);
+	}
+
+	// positions were recorded back to front; replay them front to back
+	for (int i=variableSites.indexPositions.size()-1; i>=0; i--)
+		addAllSegments(variableSites, variableSites.indexPositions[i], variableSites.indexPositions[i]);
+
+	variableSites.buildConsensus();
+}
+
+
+/**
+ * Erases entries of `alignedSequences` whose residues equal those of another
+ * entry.  The first list entry is never a removal candidate.  Only list
+ * entries are erased; the strings they point to are not deleted here.
+ *
+ * @return true if at least one entry was erased
+ */
+bool gnAlignedSequences::collapseIdenticalSequences()
+{
+	typedef list <pair <string*, string*> >::iterator SeqItr;
+	bool collapsed = false;
+
+	for( SeqItr keeper = alignedSequences.begin(); keeper != alignedSequences.end(); ++keeper )
+	{
+		// candidates start at the second entry
+		SeqItr candidate = alignedSequences.begin();
+		++candidate;
+
+		while( candidate != alignedSequences.end() )
+		{
+			if( candidate != keeper && *keeper->second == *candidate->second )
+			{
+				// erase() hands back the entry following the erased one
+				candidate = alignedSequences.erase( candidate );
+				collapsed = true;
+			}
+			else
+				++candidate;
+		}
+	}
+
+	return collapsed;
+}
+
+
+/**
+ * Returns one alignment column: the character at `offset` of every entry of
+ * `alignedSequences`, in list order.  `offset` is not range-checked.
+ */
+vector <char> gnAlignedSequences::operator[]( const int offset ) //const
+{
+	vector <char> column;
+
+	list <pair <string*, string*> >::iterator row = alignedSequences.begin();
+	while( row != alignedSequences.end() )
+	{
+		column.push_back( (*row->second)[offset] );
+		++row;
+	}
+
+	return column;
+}
+
+
+/**
+ * Opens the file named by alignedSequenceFileName, discards its first three
+ * lines and passes the rest to constructClustalWAlignedSequenceList().
+ * Prints a message and terminates the process with exit(-1) when the file
+ * cannot be opened.
+ *
+ * @return whatever constructClustalWAlignedSequenceList() returned
+ */
+bool gnAlignedSequences::readClustalWAlignment()
+{
+	ifstream alignmentFile( alignedSequenceFileName.c_str(), ios::in | ios::binary );
+
+	if( !alignmentFile.is_open() )
+	{
+		cout << "Unable to open " << alignedSequenceFileName << ".\n"
+		     << "Exiting.\n";
+
+		exit(-1);
+	}
+
+	// the sequence data of an .aln file starts on line 4
+	string skipped;
+	for( int headerLine = 0; headerLine < 3; headerLine++ )
+		getline( alignmentFile, skipped );
+
+	const bool constructSuccess = constructClustalWAlignedSequenceList( alignmentFile );
+
+	alignmentFile.close();
+
+	return constructSuccess;
+}
+
+
+/**
+ * Opens the file named by alignedSequenceFileName, discards its first line
+ * and passes the rest to constructPhylipAlignedSequenceList().
+ * Prints a message and terminates the process with exit(-1) when the file
+ * cannot be opened.
+ *
+ * @return whatever constructPhylipAlignedSequenceList() returned
+ */
+bool gnAlignedSequences::readPhylipAlignment()
+{
+	ifstream alignmentFile( alignedSequenceFileName.c_str(), ios::in | ios::binary );
+
+	if( !alignmentFile.is_open() )
+	{
+		cout << "Unable to open " << alignedSequenceFileName << ".\n"
+		     << "Exiting.\n";
+
+		exit(-1);
+	}
+
+	// the first line carries the sequence count and length, not sequence data
+	string skipped;
+	getline( alignmentFile, skipped );
+
+	const bool constructSuccess = constructPhylipAlignedSequenceList( alignmentFile );
+
+	alignmentFile.close();
+
+	return constructSuccess;
+}
+
+
+/**
+ * Opens the file named by alignedSequenceFileName, skips everything up to
+ * and including the first line containing "//" and passes the rest to
+ * constructMSFAlignedSequenceList().
+ * Prints a message and terminates the process with exit(-1) when the file
+ * cannot be opened.
+ *
+ * @return false when the file contains no "//" line; otherwise whatever
+ *         constructMSFAlignedSequenceList() returned
+ */
+bool gnAlignedSequences::readMSFAlignment()
+{
+	ifstream alignmentFile;
+
+	alignmentFile.open(alignedSequenceFileName.c_str(), ios::in | ios::binary);
+
+	if (!(alignmentFile.is_open()))
+	{
+		cout << "Unable to open " << alignedSequenceFileName << ".\n"
+		     << "Exiting.\n";
+
+		exit(-1);
+	}
+
+	string line;
+	getline(alignmentFile, line);
+
+	// remove format's initial annotation; stop at end of file instead of
+	// spinning forever when the separator is missing
+	while (alignmentFile.good() && line.find("//") == string::npos)
+		getline(alignmentFile, line);
+
+	if (line.find("//") == string::npos)
+	{
+		alignmentFile.close();
+		return false;
+	}
+
+	bool constructSuccess = constructMSFAlignedSequenceList(alignmentFile);
+
+	alignmentFile.close();
+
+	return constructSuccess;
+}
+
+
+/**
+ * Reads a single aligned-sequences entry in relaxed NEXUS form from
+ * align_stream into `sequences` and `names`, replacing their contents.
+ *
+ * A leading "#NEXUS" line is accepted but not required.  An optional comment
+ * block (a line starting with '[' up to a line starting with ']') and blank
+ * lines are skipped.  The next line must start with the number of sequences,
+ * optionally followed by the alignment length; it is followed by that many
+ * whitespace-separated "name sequence" pairs.
+ *
+ * @param align_stream  stream positioned at the start of an entry
+ * @return true when every announced sequence was read; false when the stream
+ *         ended or the sequence count could not be parsed, in which case
+ *         `sequences` and `names` hold the pairs read so far
+ */
+bool gnAlignedSequences::readRelaxedNexusAlignment( istream& align_stream ){
+
+	string line;
+	getline( align_stream, line );
+	if( line == "#NEXUS" ){
+		getline( align_stream, line );
+	}
+	if( !line.empty() && line[0] == '[' ){
+		// skip the comment block; give up at end of stream instead of looping
+		getline( align_stream, line );
+		while( align_stream.good() && ( line.empty() || line[0] != ']' ) )
+			getline( align_stream, line );
+		getline( align_stream, line );	// possibly empty line
+		if( line.size() == 0 )
+			getline( align_stream, line );
+	}
+	while( align_stream.good() && line.length() == 0 )
+		getline( align_stream, line );
+	if( align_stream.fail() || line.length() == 0 )
+		return false;
+
+	// this is the alignment info line
+	stringstream align_info( line );
+	uint seq_count = 0;
+	gnSeqI align_len = 0;
+	align_info >> seq_count;
+	if( align_info.fail() )
+		return false;
+	align_info >> align_len;	// parsed for completeness; not needed below
+
+	// keep names and sequences parallel: both are replaced by this entry
+	sequences = vector< string >( seq_count );
+	names.clear();
+
+	// now read in each alignment line
+	for( uint seqI = 0; seqI < seq_count; seqI++ ){
+		string name;
+		if( !( align_stream >> name >> sequences[ seqI ] ) ){
+			sequences.resize( names.size() );
+			return false;
+		}
+		names.push_back( name );
+	}
+
+	// read off the trailing newline
+	getline( align_stream, line );
+	return true;
+}
+
+
+/**
+ * Opens the file named by alignedSequenceFileName, skips everything up to
+ * and including the first line containing "begin data;" and passes the rest
+ * to constructNexusAlignedSequenceList().
+ * Prints a message and terminates the process with exit(-1) when the file
+ * cannot be opened.
+ *
+ * @return false when the file contains no "begin data;" line; otherwise
+ *         whatever constructNexusAlignedSequenceList() returned
+ */
+bool gnAlignedSequences::readNexusAlignment()
+{
+	ifstream alignmentFile;
+
+	alignmentFile.open(alignedSequenceFileName.c_str(), ios::in | ios::binary);
+
+	if (!(alignmentFile.is_open()))
+	{
+		cout << "Unable to open " << alignedSequenceFileName << ".\n"
+		     << "Exiting.\n";
+
+		exit(-1);
+	}
+
+	string line;
+	getline(alignmentFile, line);
+
+	// remove format's initial annotation; stop at end of file instead of
+	// spinning forever when the marker is missing
+	while (alignmentFile.good() && line.find("begin data;") == string::npos)
+		getline(alignmentFile, line);
+
+	if (line.find("begin data;") == string::npos)
+	{
+		alignmentFile.close();
+		return false;
+	}
+
+	bool constructSuccess = constructNexusAlignedSequenceList(alignmentFile);
+
+	alignmentFile.close();
+
+	return constructSuccess;
+}
+
+
+/**
+ * Opens the file named by alignedSequenceFileName, discards its first three
+ * lines and passes the rest to constructMegaAlignedSequenceList().
+ * Prints a message and terminates the process with exit(-1) when the file
+ * cannot be opened.
+ *
+ * @return whatever constructMegaAlignedSequenceList() returned
+ */
+bool gnAlignedSequences::readMegaAlignment()
+{
+	ifstream alignmentFile( alignedSequenceFileName.c_str(), ios::in | ios::binary );
+
+	if( !alignmentFile.is_open() )
+	{
+		cout << "Unable to open " << alignedSequenceFileName << ".\n"
+		     << "Exiting.\n";
+
+		exit(-1);
+	}
+
+	// three lines precede the sequence data in a mega file
+	string skipped;
+	for( int headerLine = 0; headerLine < 3; headerLine++ )
+		getline( alignmentFile, skipped );
+
+	const bool constructSuccess = constructMegaAlignedSequenceList( alignmentFile );
+
+	alignmentFile.close();
+
+	return constructSuccess;
+}
+
+
+/**
+ * Parses the body of a ClustalW alignment from alignmentFile into
+ * `alignedSequences`.
+ *
+ * The input is read block by block.  Within a block every line that does not
+ * start with a space (and is not blank) is a sequence line: the text up to
+ * the first space is the sequence name, and the characters after it that
+ * pass gnFilter::fullDNASeqFilter() are the residues.  Residues are appended
+ * to the sequence of that name, which is created on first sight.  After each
+ * block two lines are read: one is discarded and the next becomes the first
+ * line of the following block.
+ *
+ * @param alignmentFile  stream positioned at the first sequence line
+ * @return always false
+ */
+bool gnAlignedSequences::constructClustalWAlignedSequenceList(ifstream& alignmentFile)
+{
+	const gnFilter* newFilter = gnFilter::fullDNASeqFilter();
+	string line;
+
+	// GET THE 1st LINE OF SEQUENCE
+	getline(alignmentFile, line);
+
+	while (alignmentFile.good())
+	{
+		// A failed read leaves `line` unchanged, so the stream state has to
+		// be part of the condition or the last line is processed forever.
+		// A line holding only '\r' is a blank line of a CRLF file.
+		while (!alignmentFile.fail() && !line.empty() &&
+			line[0] != ' ' && line[0] != '\0' && line[0] != '\r')
+		{
+			// the name ends at the first space, or at the end of the line
+			// when the line contains no space at all
+			const string sequenceName = line.substr( 0, line.find(' ') );
+
+			string sequenceBases;
+			for(string::size_type i=sequenceName.size(); i < line.length(); i++){
+				if ((*newFilter).IsValid(line[i]))
+					sequenceBases += line[i];
+			}
+
+			list <pair <string*, string*> >::iterator sequenceItr;
+			if (!(sequenceNameInList(sequenceName, sequenceItr)))
+			{
+				pair <string*, string*> sequence;
+				sequence.first = new string(sequenceName);
+				sequence.second = new string( sequenceBases );
+
+				alignedSequences.push_back(sequence);
+			}
+			else
+				(*(*sequenceItr).second).append(sequenceBases);
+
+			getline(alignmentFile, line);
+		}
+
+		getline(alignmentFile, line);
+		getline(alignmentFile, line);
+	}
+
+	return false;
+}
+
+
+/**
+ * Parses the body of a Phylip alignment from alignmentFile and appends the
+ * sequences found to `alignedSequences`.
+ *
+ * A line whose character at index 10 is not a space starts a new sequence:
+ * its first 10 characters are the name and the remaining characters that
+ * pass gnFilter::fullDNASeqFilter() are the residues.  Any other line is a
+ * continuation whose valid characters are appended to the most recently
+ * added sequence.  Always returns false.
+ *
+ * NOTE(review): line[10] is read without checking the line length, and the
+ * continuation branch reads no new line when the current one is empty --
+ * confirm how short and blank lines are meant to be handled.
+ */
+bool gnAlignedSequences::constructPhylipAlignedSequenceList(ifstream& alignmentFile)
+{
+ string line;
+
+ // GET THE 1st LINE OF SEQUENCE
+ getline(alignmentFile, line);
+
+ while (alignmentFile.good())
+ {
+ // first line of a sequence: name in columns 0-9, residues after it
+ if (line[10]!=' ')
+ {
+ string sequenceName = line.substr(0,10);
+ // NOTE(review): echoes every sequence name to stdout -- looks like leftover debug output
+ cout << sequenceName << endl;
+ const gnFilter* newFilter = gnFilter::fullDNASeqFilter();
+ string sequenceBases;
+ for(int i=10; i < line.length(); i++)
+ {
+ if ((*newFilter).IsValid(line[i]))
+ sequenceBases += line[i];
+ }
+
+ pair <string*, string*> sequence;
+ sequence.first = new string(sequenceName);
+
+ sequence.second = new string( sequenceBases );
+
+ alignedSequences.push_back(sequence);
+
+ getline(alignmentFile, line);
+ }
+
+ // NOT THE 1st LINE IN SEQUENCE (CONTAINS SEQ NAME)
+ else
+ {
+ // gather all consecutive continuation lines before appending
+ string sequenceBases;
+ while (line[10]==' ' && line[0]!='\0' && line.length()>0)
+ {
+ const gnFilter* newFilter = gnFilter::fullDNASeqFilter();
+ for(int i=0; i < line.length(); i++)
+ {
+ if ((*newFilter).IsValid(line[i]))
+ sequenceBases += line[i];
+ }
+
+ getline(alignmentFile, line);
+ }
+
+ // append to the last sequence in the list
+ // NOTE(review): assumes the list is non-empty here -- confirm a continuation line can never come first
+ list <pair <string*, string*> >::iterator sequenceItr = alignedSequences.end();
+ sequenceItr--;
+ (*(*sequenceItr).second) += sequenceBases;
+ }
+ }
+
+ return false;
+}
+
+
+/**
+ * Parses the body of an MSF alignment from alignmentFile into
+ * `alignedSequences`.
+ *
+ * After discarding one line, the stream is read block by block: a line for
+ * which coordinates() is true ends the current block, every other line is a
+ * sequence line.  On a sequence line leading spaces are skipped, the text up
+ * to the next space is the name, and of the remaining characters those
+ * passing gnFilter::fullDNASeqFilter() are kept while '.' and '~' are stored
+ * as '-'.  Residues are appended to the sequence of that name, which is
+ * created on first sight.  Always returns false.
+ *
+ * NOTE(review): the name scan stops only at a space and the inner loop does
+ * not test the stream state -- confirm behaviour on lines without a space
+ * and at end of file.
+ */
+bool gnAlignedSequences::constructMSFAlignedSequenceList(ifstream& alignmentFile)
+{
+ string line;
+
+ // clear coordinate line
+ getline(alignmentFile, line);
+
+ while (alignmentFile.good())//line[0] != '\0')
+ {
+ getline(alignmentFile, line); // 1st line of sequence
+ while (!coordinates(line))
+ {
+ string sequenceName;
+ int i;
+
+ // skip the indentation in front of the name
+ for (i=0; line[i] == ' '; i++) {}
+
+ for (; line[i] != ' '; i++)
+ sequenceName += line[i];
+
+ const gnFilter* newFilter = gnFilter::fullDNASeqFilter();
+ string sequenceBases;
+ for( ; i < line.length(); i++){
+ if ((*newFilter).IsValid(line[i]))
+ sequenceBases += line[i];
+ // '.' and '~' are stored as the gap character '-'
+ else if (line[i] == '.' || line[i]=='~')
+ sequenceBases += '-';
+ }
+
+ list <pair <string*, string*> >::iterator sequenceItr;
+ if (!(sequenceNameInList(sequenceName, sequenceItr)))
+ {
+ pair <string*, string*> sequence;
+ sequence.first = new string(sequenceName);
+
+ sequence.second = new string( sequenceBases );
+
+ alignedSequences.push_back(sequence);
+ }
+
+ else
+ (*(*sequenceItr).second).append(sequenceBases);
+
+ getline(alignmentFile, line);
+ }
+ }
+
+ return false;
+}
+
+
+/**
+ * Parses the matrix of a NEXUS data block from alignmentFile into
+ * `alignedSequences`.
+ *
+ * Lines are read until one contains "endblock;" or the stream stops being
+ * good.  A line not starting with '[', a space, '\n' or '\r' is a sequence
+ * line: the text up to the first space is the name and every later character
+ * other than '\r', '\n' and space is a residue.  Residues are appended to
+ * the sequence of that name, which is created on first sight.  Always
+ * returns false.
+ *
+ * NOTE(review): an empty line is not excluded by the inner condition and the
+ * name scan stops only at a space -- confirm behaviour on blank lines and on
+ * lines without a space.
+ */
+bool gnAlignedSequences::constructNexusAlignedSequenceList(ifstream& alignmentFile)
+{
+ string line;
+
+ // GET THE 1st LINE OF SEQUENCE
+ getline(alignmentFile, line);
+
+ // searching for "endblock;"
+ // (find() returns an unsigned value, so only the "> length" half of the test can ever be true)
+ while (alignmentFile.good() && (line.find("endblock;")<0 || line.find("endblock;")>line.length()))
+ {
+ while (line[0]!='[' && line[0]!=' ' && line[0]!='\n' && line[0]!='\r' && alignmentFile.good())
+ {
+ string sequenceName;
+ int i;
+ for (i=0; line[i] != ' '; i++)
+ sequenceName += line[i];
+
+ // newFilter is obtained but not consulted below; residues are filtered by whitespace only
+ const gnFilter* newFilter = gnFilter::fullDNASeqFilter();
+ string sequenceBases;
+ for(int i=sequenceName.size(); i < line.length(); i++){
+ if ( line[i] != '\r' && line[i] != '\n' && line[i] != ' ' )
+ sequenceBases += line[i];
+ }
+
+ list <pair <string*, string*> >::iterator sequenceItr;
+ if (!(sequenceNameInList(sequenceName, sequenceItr)))
+ {
+ pair <string*, string*> sequence;
+ sequence.first = new string(sequenceName);
+
+ sequence.second = new string( sequenceBases );
+
+ alignedSequences.push_back(sequence);
+ }
+
+ else
+ (*(*sequenceItr).second).append(sequenceBases);
+
+ getline(alignmentFile, line);
+ }
+
+ getline(alignmentFile, line);
+ }
+
+
+ return false;
+}
+
+
+/**
+ * Parses the body of a MEGA alignment from alignmentFile into
+ * `alignedSequences`.
+ *
+ * Lines starting with '#' are sequence lines: the text between '#' and the
+ * first space is the name.  After the first space, characters passing
+ * gnFilter::fullDNASeqFilter() are stored as residues and a '.' is replaced
+ * by the character at the same column of the first sequence in the list.
+ * Residues are appended to the sequence of that name, which is created on
+ * first sight.  Always returns false.
+ *
+ * NOTE(review): alignedSequencesItr is only assigned when this call adds a
+ * sequence, yet it is dereferenced whenever the list is non-empty -- confirm
+ * the list is always empty on entry.
+ */
+bool gnAlignedSequences::constructMegaAlignedSequenceList(ifstream& alignmentFile)
+{
+ string line;
+ string consensusSequenceBases;
+ list <pair <string*, string*> >::iterator alignedSequencesItr;
+
+ // GET THE 1st LINE OF SEQUENCE
+ getline(alignmentFile, line);
+
+ // length of the first sequence before the current block; offsets '.' look-ups into it
+ int previousLineLength = 0;
+
+ // reads until the stream is exhausted
+ while (alignmentFile.good())
+ {
+ while (line.length()>0 && line[0]=='#')
+ {
+ string sequenceName;
+ for (int i=1; line[i] != ' '; i++)
+ sequenceName += line[i];
+
+ const gnFilter* newFilter = gnFilter::fullDNASeqFilter();
+ string sequenceBases;
+ bool isInSeqName = true;
+ // snapshot of the first sequence, used to resolve '.' characters
+ if (alignedSequences.size()>0)// && consensusSequenceBases.size()>0)
+ consensusSequenceBases = (*(*alignedSequencesItr).second);
+
+ for(int i=sequenceName.size(); i < line.length(); i++)
+ {
+ // allow only valid characters to be placed - if '.' replace letter
+ // with consensus data
+ if ((*newFilter).IsValid(line[i]) && !isInSeqName)
+ sequenceBases += line[i];
+
+ else if (line[i] == ' ') isInSeqName=false;
+
+ else if (line[i]=='.' && alignedSequences.size()>0 && !isInSeqName) // a reference to the consensus
+ sequenceBases += consensusSequenceBases[sequenceBases.size()+previousLineLength];
+ }
+
+ list <pair <string*, string*> >::iterator sequenceItr;
+ if (!(sequenceNameInList(sequenceName, sequenceItr)))
+ {
+ pair <string*, string*> sequence;
+ sequence.first = new string(sequenceName);
+
+ sequence.second = new string( sequenceBases );
+
+ alignedSequences.push_back(sequence);
+ alignedSequencesItr = alignedSequences.begin();
+ }
+
+ else
+ (*(*sequenceItr).second).append(sequenceBases);
+
+ getline(alignmentFile, line);
+ }
+
+ if (alignedSequences.size() > 0)
+ previousLineLength = (*(*alignedSequences.begin()).second).size();
+
+ getline(alignmentFile, line);
+ }
+
+
+ return false;
+}
+
+
+/**
+ * Looks sequenceName up in `names`.
+ *
+ * @return the index of the first matching entry, or -1 if there is none
+ */
+int gnAlignedSequences::sequenceNameInList( string& sequenceName ){
+	uint nameI = 0;
+	while( nameI < names.size() && !( sequenceName == names[ nameI ] ) )
+		nameI++;
+
+	if( nameI < names.size() )
+		return nameI;
+	return -1;
+}
+
+/**
+ * Looks sequenceName up in `alignedSequences`.
+ *
+ * @param sequenceName  name to look for
+ * @param sequenceItr   receives the first matching entry, or
+ *                      alignedSequences.end() when nothing matches
+ * @return true if a matching entry was found
+ */
+bool gnAlignedSequences::sequenceNameInList(string sequenceName, list <pair <string*, string*> >::iterator &sequenceItr)
+{
+	sequenceItr = alignedSequences.begin();
+	while( sequenceItr != alignedSequences.end() )
+	{
+		if( sequenceName == *sequenceItr->first )
+			return true;
+		++sequenceItr;
+	}
+
+	return false;
+}
+
+
+/**
+ * Rebuilds `consensus` from `alignedSequences`.
+ *
+ * For every column of the first sequence the letters of all rows are
+ * counted case-insensitively and the most frequent one is appended to the
+ * consensus in upper case.  Ties go to the alphabetically first letter; a
+ * column with no countable letter yields 'A'.  '-' is never counted, and
+ * counting of a column stops at the first '.' found in a row other than the
+ * first (MEGA notation for "same as the first sequence").  Characters that
+ * are not letters are ignored.  An empty alignment gives an empty consensus.
+ *
+ * @return always false
+ */
+bool gnAlignedSequences::buildConsensus()
+{
+	consensus = "";
+
+	// begin() must not be dereferenced on an empty alignment
+	if( alignedSequences.empty() )
+		return false;
+
+	const int alphabetSize = 26;
+	const int alignLength = (*(*alignedSequences.begin()).second).size();
+
+	for (int index=0; index<alignLength; index++)
+	{
+		vector <int> baseCounts(alphabetSize, 0);
+		const vector <char> crossAlignmentBases = (*this)[index];
+
+		for (size_t i=0; i<crossAlignmentBases.size(); i++)
+		{
+			const char base = crossAlignmentBases[i];
+
+			// consensus already established if in MEGA '.' format - the 1st seq
+			if (i>0 && base=='.')
+				break;
+
+			if (base == '-')
+				continue;
+
+			// anything that does not map onto A-Z would index outside
+			// baseCounts, so it is skipped
+			const int baseIndex = determineBaseIndex(base);
+			if (baseIndex >= 0 && baseIndex < alphabetSize)
+				baseCounts[baseIndex]++;
+		}
+
+		// strictly alphabetic - count ties are broken lexicographically
+		int toAppendToConsensus = 0;
+		for (int i=1; i<alphabetSize; i++)
+		{
+			if (baseCounts[i] > baseCounts[toAppendToConsensus])
+				toAppendToConsensus = i;
+		}
+
+		consensus += (char)('A' + toAppendToConsensus);
+	}
+
+	return false;
+}
+
+
+/**
+ * Appends seqToAdd to `sequences` and seqName to `names`.
+ */
+void gnAlignedSequences::addSequence(string& seqToAdd, string& seqName)
+{
+	sequences.insert( sequences.end(), seqToAdd );
+	names.insert( names.end(), seqName );
+}
+
+
+/**
+ * Appends the string form of seqToAdd to `sequences` and seqName to `names`,
+ * after calling ErrorMsg() with a reminder that this function still needs
+ * fixing.
+ *
+ * NOTE(review): the commented-out code below is the earlier list-based
+ * implementation, which extended an existing sequence of the same name
+ * instead of always adding a new row -- confirm which behaviour is wanted.
+ */
+void gnAlignedSequences::addSequence(gnSequence& seqToAdd, string& seqName)
+{
+
+ ErrorMsg( "Fix gnAlignedSequences::addSequence()" );
+ sequences.push_back( seqToAdd.ToString() );
+ names.push_back( seqName );
+
+/* list <pair <string*, string*> >::iterator itr;
+ if (!sequenceNameInList(seqName, itr))
+ {
+ pair <string*, string*> toAdd;
+ toAdd.first = new string(seqName);
+ toAdd.second = new string( seqToAdd.ToString() );
+
+ alignedSequences.push_back(toAdd);
+ }
+
+ else
+ {
+ (*((*itr).second)) += seqToAdd.ToString();
+ }
+*/
+}
+
+
+/**
+ * Adds seqToAdd to `alignedSequences` under seqName, replacing each gap in
+ * it with the corresponding character of originalConsensus.
+ *
+ * The i-th character of the added segment, when it is '-', is replaced by
+ * originalConsensus[consensusStart + i - 1]; a gap whose consensus position
+ * lies outside originalConsensus is left as it is.  If a sequence called
+ * seqName already exists the segment is appended to it, otherwise a new
+ * sequence is created.
+ *
+ * @param seqToAdd           segment to add
+ * @param seqName            name of the sequence to create or extend
+ * @param consensusStart     consensus offset associated with the segment start
+ * @param originalConsensus  consensus used to fill gaps; may be empty
+ */
+void gnAlignedSequences::addSequence(gnSequence seqToAdd, string seqName, int consensusStart, string originalConsensus)
+{
+	string segment = seqToAdd.ToString();
+
+	// fill gaps from the consensus, touching only the segment being added
+	for (string::size_type i=0; i<segment.size(); i++)
+	{
+		if (segment[i] != '-')
+			continue;
+
+		const long consensusIndex = (long)consensusStart + (long)i - 1;
+		if (consensusIndex >= 0 && (string::size_type)consensusIndex < originalConsensus.size())
+			segment[i] = originalConsensus[consensusIndex];
+	}
+
+	list <pair <string*, string*> >::iterator itr;
+	if (!sequenceNameInList(seqName, itr))
+	{
+		pair <string*, string*> toAdd;
+		toAdd.first = new string(seqName);
+		toAdd.second = new string( segment );
+
+		alignedSequences.push_back(toAdd);
+	}
+	else
+	{
+		(*((*itr).second)) += segment;
+	}
+}
+
+
+/**
+ * Copies the columns start..stop of every entry of this object's
+ * `sequences` into `alignment`, under the matching entry of `names`.
+ *
+ * A stop of 0, or a stop equal to the last index of a sequence, is widened
+ * to the sequence length, so that the copy runs to the end of the sequence;
+ * the widened value stays in effect for the sequences that follow.
+ * string::substr() throws std::out_of_range when start lies beyond the end
+ * of a sequence.
+ *
+ * @param alignment  alignment that receives the segments
+ * @param start      offset of the first column to copy
+ * @param stop       offset of the last column to copy, or 0 for "to the end"
+ */
+void gnAlignedSequences::addAllSegments(gnAlignedSequences &alignment, unsigned start, unsigned stop)
+{
+	// The segments come from this object, not from the destination.  The
+	// count is fixed up front so that passing *this as the destination
+	// cannot make the loop chase its own additions.
+	const size_t seqCount = sequences.size();
+
+	for ( size_t seqI = 0; seqI < seqCount; seqI++ ){
+		if (stop == 0 || stop == sequences[ seqI ].size()-1)
+			stop = sequences[ seqI ].size();
+
+		// copies, so that growth of the destination vectors cannot
+		// invalidate what is being added
+		string seq = sequences[ seqI ].substr(start, stop-start+1);
+		string name = names[ seqI ];
+		alignment.addSequence( seq, name );
+	}
+}
+
+
+/**
+ * Passes the columns start..stop of every entry of `alignedSequences` to
+ * alignment.addSequence(), together with the entry's name, start and this
+ * object's `consensus`.
+ *
+ * A stop of 0, or a stop equal to the last index of a sequence, is widened
+ * to the sequence length; the widened value stays in effect for the
+ * sequences that follow.
+ */
+void gnAlignedSequences::addAllSegmentsReplaceGaps(gnAlignedSequences &alignment, unsigned start, unsigned stop)
+{
+	typedef list <pair <string*, string*> >::iterator SeqItr;
+
+	for( SeqItr row = alignedSequences.begin(); row != alignedSequences.end(); ++row )
+	{
+		string& bases = *row->second;
+
+		const bool runToEnd = (stop == 0) || (stop == bases.size()-1);
+		if( runToEnd )
+			stop = bases.size();
+
+		alignment.addSequence( bases.substr(start, stop-start+1), *row->first, start, consensus );
+	}
+}
+
+
+/**
+ * Erases the columns start..stop from every entry of `alignedSequences`,
+ * then prints start, stop and the erased width to stdout.
+ *
+ * A stop of 0 is replaced by the length of the first sequence visited and
+ * that value is used for all later sequences as well.
+ */
+void gnAlignedSequences::removeAllSegments(unsigned start, unsigned stop)
+{
+	list <pair <string*, string*> >::iterator row = alignedSequences.begin();
+	while( row != alignedSequences.end() )
+	{
+		string& bases = *row->second;
+		if( stop == 0 )
+			stop = bases.size();
+
+		bases.erase( start, stop-start+1 );
+		++row;
+	}
+
+	cout << start << " " << stop << ": " << stop-start+1 << endl;
+}
+
+
+/**
+ * Maps a letter onto its 0-based position in the alphabet.
+ *
+ * Character codes below 91 are measured from 'A' (65), all others from
+ * 'a' (97).  The argument is not checked to be a letter, so other characters
+ * give values outside 0..25.
+ */
+int gnAlignedSequences::determineBaseIndex(char base)
+{
+	const int alphabetStart = (base < 91) ? 65 : 97;	// upper case : lower case
+	return base - alphabetStart;
+}
+
+
+/**
+ * Tells whether line consists solely of spaces, carriage returns, line feeds
+ * and decimal digits.  An empty line qualifies.
+ */
+bool gnAlignedSequences::coordinates(string line)
+{
+	for( string::size_type i = 0; i < line.length(); ++i )
+	{
+		const char c = line[i];
+		const bool isBlank = (c == ' ' || c == '\r' || c == '\n');
+		const bool isDigit = (c >= 48 && c <= 57);
+
+		if( !isBlank && !isDigit )
+			return false;
+	}
+
+	return true;
+}
+
+}
diff --git a/libMems/gnAlignedSequences.h b/libMems/gnAlignedSequences.h
new file mode 100644
index 0000000..9a1f644
--- /dev/null
+++ b/libMems/gnAlignedSequences.h
@@ -0,0 +1,401 @@
+/*******************************************************************************
+ * $Id: gnAlignedSequences.h,v 1.5 2004/02/27 23:08:55 darling Exp $
+ * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
+ * This file is licensed under the GPL.
+ * Please see the file called COPYING for licensing details.
+ * **************
+ ******************************************************************************/
+
+/////////////////////////////////////////////////////////////////////////////
+// File: gnAlignedSequences.h
+// Purpose: Aligned Sequences class
+// Description: Provides an alignment interface for any number of alignable
+// sequences (the data of each of which is contained in a
+// genome::gnSequence object).
+// Currently only compatible with ClustalW alignment files.
+// Revisions:
+// Version: A
+// Created: August 3, 2000, 11:55am
+// Author: Brian Gettler
+// Last Edited: May 3, 2001, 4:25pm
+// Modified by:
+// Copyright: (c)
+// Licences:
+/////////////////////////////////////////////////////////////////////////////
+#ifndef __gnAlignedSequences_h__
+#define __gnAlignedSequences_h__
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libGenome/gnSequence.h"
+#include "libGenome/gnFilter.h"
+#include <list>
+#include <fstream>
+#include <vector>
+
+namespace mems {
+
+// the number of characters in each row of an alignment file
+const int MEGA_ALIGN_COLUMNS = 60;
+
+/**
+ * gnAlignedSequences allows for the manipulation of aligned sequence
+ * data.
+ * An alignment can be loaded from and written to several file formats
+ * (ClustalW, Phylip, MSF, Nexus, Mega), cut into segments or codons,
+ * concatenated with another alignment, and reduced to its variable sites.
+ */
+
+class gnAlignedSequences// : blClone
+{
+public:
+ /**
+ * Empty Constructor, creates a default gnAlignedSequences.
+ */
+ gnAlignedSequences();
+ /**
+ * Copy Constructor, creates a copy of toCopy.
+ */
+ gnAlignedSequences(const gnAlignedSequences &toCopy);
+
+
+ /**
+ * Returns a vector of supported format names
+ */
+ static const std::vector< std::string >& getSupportedFormats();
+
+ /**
+ * Checks whether a particular format name is supported
+ */
+ static boolean isSupportedFormat( const std::string& format_name );
+
+ /**
+ * Writes out this sequence alignment in the specified format,
+ * assuming the format is supported
+ */
+ void output( const std::string& format_name, std::ostream& os ) const;
+
+// sequence alignment loading
+ /**
+ * Loads the data held in file alignedFileName (in ClustalW format).
+ * @param alignedFileName name of a file containing an alignment.
+ */
+ void constructFromClustalW(std::string alignedFileName);
+ /**
+ * Loads the data held in file alignedFileName (in Phylip format).
+ * @param alignedFileName name of a file containing an alignment.
+ */
+ void constructFromPhylip(std::string alignedFileName);
+ /**
+ * Loads the data held in file alignedFileName (in MSF format).
+ * @param alignedFileName name of a file containing an alignment.
+ */
+ void constructFromMSF(std::string alignedFileName);
+ /**
+ * Loads the data held in file alignedFileName (in Nexus format).
+ * @param alignedFileName name of a file containing an alignment.
+ */
+ void constructFromNexus(std::string alignedFileName);
+ /**
+ * Loads the data held in file alignedFileName (in Mega format).
+ * @param alignedFileName name of a file containing an alignment.
+ */
+ void constructFromMega(std::string alignedFileName);
+
+ /**
+ * Reads a single sequence entry in relaxed NEXUS format. Assumes that
+ * the #NEXUS has already been read off.
+ * @param align_stream The stream to read data from
+ */
+ void constructFromRelaxedNexus( std::istream& align_stream );
+
+ /**
+ * Assigns a file name to the alignment data for purposes of output.
+ * @param name the name of the file.
+ */
+ void assignFileName(std::string name);
+
+// output
+ /**
+ * Writes alignment using the given output stream (in Phylip format).
+ * @param os the output stream.
+ * @return true if successful.
+ */
+ bool outputPhylip(std::ostream& os) const;
+ /**
+ * Writes alignment using the given output stream (in ClustalW format).
+ * @param os the output stream.
+ * @return true if successful.
+ */
+ bool outputClustalW(std::ostream& os) const;
+ /**
+ * Writes alignment using the given output stream (in MSF format).
+ * @param os the output stream.
+ * @return true if successful.
+ */
+ bool outputMSF(std::ostream& os) const;
+ /**
+ * Writes alignment using the given output stream (in Nexus format).
+ * @param os the output stream.
+ * @return true if successful.
+ */
+ bool outputNexus(std::ostream& os) const;
+ /**
+ * Writes alignment using the given output stream (in Mega format).
+ * @param os the output stream.
+ * @return true if successful.
+ */
+ bool outputMega(std::ostream& os) const;
+ /**
+ * Writes alignment in 3-base, codon segments using the given output
+ * stream (in Phylip format).
+ * @param os the output stream.
+ * @return true if successful.
+ */
+ bool outputCodon(std::ostream& os) const;
+ /**
+ * Writes alignment with consensus using the given output stream
+ * (in Phylip format).
+ * @param os the output stream.
+ * @return true if successful.
+ */
+ bool outputWithConsensus(std::ostream& os);
+
+// alignment manipulators that create new gnAlignedSequences
+ /**
+ * Create a new alignment that is comprised of all of the segments
+ * in the initial alignment from start to stop (inclusive)
+ * if stop == 0, the alignment ends at the end
+ * @param start the beginning point of the segment.
+ * @param stop the end point of the segment.
+ * @return the new gnAlignedSequences that is created
+ */
+ gnAlignedSequences getAlignedSegment(unsigned start, unsigned stop);
+ /**
+ * Extracts every codonMultiple-th codon in the reading
+ * frame readingFrame beginning with startCodon
+ * reading frames supported: 1, 2 & 3 (no reverse complementing)
+ * @param readingFrame the codon reading frame.
+ * @param startCodon the number codon in readingFrame with which to begin.
+ * @param codonMultiple the multiple with which codons in readingFrame
+ * are selected.
+ * @return the new gnAlignedSequences that is created
+ */
+ gnAlignedSequences getCodons(int readingFrame, int startCodon, int codonMultiple);
+
+ /**
+ * Returns the name of the file associated with this gnAlignedSequences.
+ * @return the alignment file name.
+ */
+ std::string getAlignedSequenceFileName();
+ /**
+ * Returns the size of each sequence in the alignment (all are identical).
+ * @return the size of the aligned sequences.
+ */
+ gnSeqI alignedSeqsSize() const;
+
+ /**
+ * Removes a single sequence from the alignment.
+ * @param seqName the name of the sequence to remove.
+ * @return true if successful (a sequence called seqName exists).
+ */
+ bool removeAlignedSeq(std::string seqName);
+ /**
+ * Removes a single sequence from the alignment.
+ * @param index the position in the alignment of the sequence to be removed.
+ * @return true if successful (a sequence at index exists).
+ */
+ bool removeAlignedSeq(unsigned index);
+
+ /**
+ * Concatenates 2 alignments.
+ * @param toConcat the sequence which is appended to *this.
+ */
+ void concatenateAlignedSequences(gnAlignedSequences toConcat);
+
+ /**
+ * Extracts the variable sites from *this.
+ * @param variableSites the alignment consisting of all variable sites.
+ * @param countGapsAsMismatches true if gaps are to be considered mismatches.
+ */
+ void extractVariableSites(gnAlignedSequences &variableSites, bool countGapsAsMismatches);
+
+ /**
+ * Collapses the alignment across all sequences.
+ * Sequences are compared in terms of base content -
+ * if the sequences of base pairs are equal, the sequences are identical
+ * @return true if there exist identical sequences that are collapsed.
+ */
+ bool collapseIdenticalSequences();
+ /**
+ * Accesses the alignment and returns the bases at that position in all
+ * sequences.
+ * @param offset the position in the alignment to access.
+ * @return a vector of characters at position offset.
+ */
+ std::vector <char> operator[]( const int offset ); //const;
+
+ /**
+ * Adds a sequence to the alignment.
+ * @param seqToAdd the sequence data.
+ * @param seqName the sequence's name.
+ */
+ void addSequence(std::string& seqToAdd, std::string& seqName);
+ /**
+ * Adds a sequence to the alignment.
+ * @param seqToAdd the sequence data, given as a genome::gnSequence.
+ * @param seqName the sequence's name.
+ */
+ void addSequence(genome::gnSequence& seqToAdd, std::string& seqName);
+
+ /**
+ * One (name, bases) pair per aligned sequence: the implementation passes
+ * .first as the sequence name and .second as the sequence data.
+ * NOTE(review): who owns the pointed-to strings is not visible in this header.
+ */
+ std::list <std::pair <std::string*, std::string*> > alignedSequences;
+ /** Sequence data; presumably sequences[i] belongs to names[i] -- verify against addSequence(). */
+ std::vector< std::string > sequences;
+ /** Sequence names, indexed in step with sequences. */
+ std::vector< std::string > names;
+ std::vector< int64 > positions; /**< If this is part of a larger alignment this vector stores start positions within that alignment */
+ /** NOTE(review): undocumented; its purpose cannot be told from the declaration alone. */
+ void seq( uint seqI );
+
+private:
+
+ /**
+ * Reads a relaxed NEXUS format alignment.
+ * @param align_stream the stream to read data from.
+ * @return true if successful.
+ */
+ bool readRelaxedNexusAlignment( std::istream& align_stream );
+ /**
+ * Aids constructFromClustalW.
+ * @return true if successful.
+ */
+ bool readClustalWAlignment();
+ /**
+ * Aids constructFromPhylip.
+ * @return true if successful.
+ */
+ bool readPhylipAlignment();
+ /**
+ * Aids constructFromMSF.
+ * @return true if successful.
+ */
+ bool readMSFAlignment();
+ /**
+ * Aids constructFromNexus.
+ * @return true if successful.
+ */
+ bool readNexusAlignment();
+ /**
+ * Aids constructFromMega.
+ * @return true if successful.
+ */
+ bool readMegaAlignment();
+
+ /**
+ * Aids readClustalWAlignment.
+ * @param alignmentFile the file that contains the alignment.
+ * @return true if successful.
+ */
+ bool constructClustalWAlignedSequenceList(std::ifstream& alignmentFile);
+ /**
+ * Aids readPhylipAlignment.
+ * @param alignmentFile the file that contains the alignment.
+ * @return true if successful.
+ */
+ bool constructPhylipAlignedSequenceList(std::ifstream& alignmentFile);
+ /**
+ * Aids readMSFAlignment.
+ * @param alignmentFile the file that contains the alignment.
+ * @return true if successful.
+ */
+ bool constructMSFAlignedSequenceList(std::ifstream& alignmentFile);
+ /**
+ * Aids readNexusAlignment.
+ * @param alignmentFile the file that contains the alignment.
+ * @return true if successful.
+ */
+ bool constructNexusAlignedSequenceList(std::ifstream& alignmentFile);
+ /**
+ * Aids readMegaAlignment.
+ * @param alignmentFile the file that contains the alignment.
+ * @return true if successful.
+ */
+ bool constructMegaAlignedSequenceList(std::ifstream& alignmentFile);
+
+ /**
+ * Determines whether a sequence of the given name is present in the list.
+ * @param sequenceName the name to be found.
+ * @param sequenceItr the list iterator to be employed.
+ * @return true if sequenceName is present.
+ */
+ bool sequenceNameInList(std::string sequenceName, std::list <std::pair <std::string*, std::string*> >::iterator &sequenceItr);
+
+ /**
+ * Determines whether a sequence of the given name is present in the list.
+ * @param sequenceName the name to be found.
+ * @return the index in the list or -1 if not present
+ */
+ int sequenceNameInList( std::string& sequenceName );
+
+ /**
+ * Reads all sequences in the alignment and creates a consensus.
+ * @return true if successful.
+ */
+ bool buildConsensus();
+
+ /**
+ * Adds a sequence to the alignment.
+ * @param seqToAdd the sequence data.
+ * @param seqName the sequence's name.
+ * @param consensusStart position in consensus to add sequence.
+ * @param originalConsensus the alignment's consensus.
+ */
+ void addSequence(genome::gnSequence seqToAdd, std::string seqName, int consensusStart, std::string originalConsensus);
+
+ /**
+ * Adds all segments in *this to the given alignment.
+ * @param alignment sequences to add.
+ * @param start segment start point.
+ * @param stop segment stop point.
+ */
+ void addAllSegments(gnAlignedSequences &alignment, unsigned start, unsigned stop);
+ /**
+ * Adds all segments in *this to the given alignment -
+ * replaces gaps with consensus data.
+ * @param alignment sequences to add.
+ * @param start segment start point.
+ * @param stop segment stop point.
+ */
+ void addAllSegmentsReplaceGaps(gnAlignedSequences &alignment, unsigned start, unsigned stop);
+ /**
+ * Removes all segments across all sequences in *this.
+ * @param start segment start point.
+ * @param stop segment stop point.
+ */
+ void removeAllSegments(unsigned start, unsigned stop);
+
+ /**
+ * Computes an index for a given base (0-25: a-z).
+ * @param base base to be converted.
+ * @return an index.
+ */
+ int determineBaseIndex(char base);
+
+ /**
+ * Searches given line for coordinates.
+ * @param line line to search.
+ * @return true if coordinates.
+ */
+ bool coordinates(std::string line);
+
+ /** File name set by assignFileName() and returned by getAlignedSequenceFileName(). */
+ std::string alignedSequenceFileName;
+// list <pair <string*, genome::gnSequence*> > alignedSequences;
+ /** Consensus of the alignment; see buildConsensus(). */
+ std::string consensus;
+ std::vector <int> indexPositions; // 1->n if a standard alignment, variable for variable sites
+}; // gnAlignedSequences
+
+
+/** Stores `name` as the file name associated with this alignment. */
+inline
+void gnAlignedSequences::assignFileName(std::string name)
+{
+	alignedSequenceFileName = name;
+}
+
+/** Returns a copy of the file name associated with this alignment. */
+inline
+std::string gnAlignedSequences::getAlignedSequenceFileName()
+{
+	return alignedSequenceFileName;
+}
+
+}
+
+#endif // __gnAlignedSequences_h__
diff --git a/libMems/gnRAWSequence.h b/libMems/gnRAWSequence.h
new file mode 100644
index 0000000..02e33da
--- /dev/null
+++ b/libMems/gnRAWSequence.h
@@ -0,0 +1,202 @@
+/////////////////////////////////////////////////////////////////////////////
+// File: gnRAWSequence.h
+// Purpose: Optimized Sequence class for RAW sequence files
+// Description: Provides a high level sequence interface to all types of
+// sequence data.
+// Changes:
+// Version: libGenome 0.5.1
+// Author: Aaron Darling
+// Modified by:
+// Copyright: (c) Aaron Darling
+// Licenses: See COPYING file for details
+/////////////////////////////////////////////////////////////////////////////
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#ifndef _gnRAWSequence_h_
+#define _gnRAWSequence_h_
+
+#include "libGenome/gnDefs.h"
+
+#include <string>
+#include <iostream>
+#include <list>
+#include "libGenome/gnSequence.h"
+
+namespace genome {
+
+
+/**
+ * gnRAWSequence is a bastardization of gnSequence that creates a lightweight wrapper
+ * around a memory-mapped file of raw sequence data
+ *
+ * The mapped file is presented as exactly one contig.  Base coordinates taken
+ * by ToString(), ToArray(), GetSeqC() and operator[] are 1-based: base
+ * `offset` is read from data.data()+offset-1.  Features, headers and the
+ * editing operations of gnSequence are stubbed out (they throw, return an
+ * empty result, or do nothing).
+ */
+class GNDLLEXPORT gnRAWSequence : public gnSequence
+{
+public:
+ /**
+ * Empty Constructor, creates an empty gnRAWSequence.
+ */
+ gnRAWSequence();
+ /**
+ * Creates a gnRAWSequence based on the file specified by filename
+ */
+ gnRAWSequence( const std::string& filename )
+ {
+ this->filename = filename;
+ data.open( filename );
+ }
+
+ gnRAWSequence* Clone() const {return new gnRAWSequence(*this);}
+
+ // --- contig interface: the whole file is contig 0 ---
+ gnSeqI contigListSize() const {return 1;}
+ gnSeqI contigListLength() const {return 1;}
+ uint32 contigIndexByBase( const gnSeqI baseI) const {
+ if(baseI >= data.size()) Throw_gnEx(SeqIndexOutOfBounds());
+ return 0;
+ }
+ gnRAWSequence contig( const uint32 contigI) const {
+ if(contigI>0) Throw_gnEx(FragmentIndexOutOfBounds());
+ return *this;
+ }
+ gnRAWSequence contigByBase( const gnSeqI baseI) const {
+ if(baseI >= data.size()) Throw_gnEx(SeqIndexOutOfBounds());
+ return *this;
+ }
+ virtual gnSeqI contigStart( const uint32 contigI) const {
+ if(contigI>0) Throw_gnEx(FragmentIndexOutOfBounds());
+ return 0;
+ }
+ virtual gnSeqI contigLength( const uint32 contigI) const {
+ if(contigI>0) Throw_gnEx(FragmentIndexOutOfBounds());
+ return data.size();
+ }
+ virtual uint32 contigIndexByName( std::string& contigName) const {
+ return 0;
+ }
+ virtual std::string contigName( const uint32 contigI) const {
+ if(contigI>0) Throw_gnEx(FragmentIndexOutOfBounds());
+ return "";
+ }
+ virtual gnSequence contigByName( std::string& contigName) const {
+ Throw_gnEx(FragmentIndexOutOfBounds());
+ }
+ // --- unsupported editing operations ---
+ // NOTE(review): a bare `throw;` executed while no exception is being handled
+ // calls std::terminate(); confirm that aborting is the intended behaviour.
+ virtual void merge(const gnSeqI startI, const gnSeqI endI){ throw; }
+ virtual void mergeContigs(const uint32 startC, const uint32 endC){ throw; }
+ virtual void splitContig(const gnSeqI splitI, const uint32 contigI=ALL_CONTIGS) { throw; }
+
+ virtual void setContigName( const uint32 contigI, const std::string& contig_name) { throw; }
+
+ // --- features and headers: a raw sequence has none ---
+ virtual uint32 getFeatureListLength() const {
+ return 0;
+ }
+ virtual gnBaseFeature* getFeature(const uint32 featureI) const{ Throw_gnEx(FeatureIndexOutOfBounds()); }
+ virtual void getContainedFeatures(const gnLocation& lt, std::vector<gnBaseFeature*>& feature_vector, std::vector<uint32>& index_vector) const {}
+ virtual void getIntersectingFeatures(const gnLocation& lt, std::vector<gnBaseFeature*>& feature_vector, std::vector<uint32>& index_vector) const {}
+ virtual uint32 addFeature(gnBaseFeature* feature) { throw; }
+ virtual void removeFeature(const uint32 featureI){ Throw_gnEx(FeatureIndexOutOfBounds()); }
+ virtual void getBrokenFeatures(const gnLocation& lt, std::vector<gnBaseFeature*>& feature_vector) const{};
+ virtual uint32 getHeaderListLength(const uint32 contigI) const{ return 0; }
+ virtual gnBaseHeader* getHeader(const uint32 contigI, const uint32 headerI) const{Throw_gnEx(HeaderIndexOutOfBounds());};
+ virtual void addHeader(const uint32 contigI, gnBaseHeader* header, const uint32 headerI){Throw_gnEx(FragmentIndexOutOfBounds());}
+ virtual void removeHeader(const uint32 contigI, const uint32 headerI){ Throw_gnEx(HeaderIndexOutOfBounds()); }
+ virtual void setReverseComplement( const boolean revComp, const uint32 contigI=ALL_CONTIGS){Throw_gnEx(FragmentIndexOutOfBounds());};
+ virtual boolean isReverseComplement( const uint32 contigI=ALL_CONTIGS ){return false;}
+ virtual boolean isCircular() const{ return false; }
+ virtual void setCircular( const boolean value ) {}
+
+ // --- coordinate translation: no-ops, the arguments are left untouched ---
+ virtual void globalToLocal(uint32& contigI, gnSeqI& baseI) const{};
+ virtual void localToGlobal(const uint32 contigI, gnSeqI& baseI) const {};
+ virtual void globalToSource(uint32& contigI, gnSeqI& baseI) const{};
+ virtual void localToSource(uint32& contigI, gnSeqI& baseI) const{};
+ /**
+ * Maps the file called sourcename and remembers its name.
+ * @return always true; a failure is only reported if data.open() throws.
+ */
+ virtual bool LoadSource(const std::string sourcename){
+ data.open( sourcename );
+ filename = sourcename;
+ return true;
+ }
+
+ /**
+ * Appends the bases in "seq" to this sequence.
+ */
+ gnRAWSequence& operator+=(const gnRAWSequence& seq);
+
+ /**
+ * Compares the bases in "seq" to this sequence.
+ * @param seq The sequence to compare this sequence to.
+ * @return Negative if this sequence is lesser, 0 if the two sequences are
+ * equal, and positive if this sequence is greater.
+ */
+/* virtual int compare(const gnRAWSequence& seq) const;
+ virtual int compare(const std::string& str) const;
+
+ virtual void append( const gnRAWSequence& seq);
+ virtual void insert( const gnSeqI offset, const gnSeqC *bases, const gnSeqI length);
+ virtual void insert( const gnSeqI offset, const gnRAWSequence& seq);
+ virtual void insert( const gnSeqI offset, const gnGenomeSpec& gnbs);
+ gnRAWSequence const operator+(const gnRAWSequence& seq) const;
+ virtual void erase( const gnSeqI offset=0, const gnSeqI length=GNSEQI_END );
+*/
+ /**
+ * Maps `length` bytes of the same file starting at byte offset-1.
+ * NOTE(review): the returned object's filename member stays empty, and
+ * Boost.Iostreams documents that a mapping offset must be a multiple of
+ * the platform's allocation granularity -- confirm callers respect that.
+ */
+ gnRAWSequence subseq(const gnSeqI offset, const gnSeqI length) const
+ {
+ gnRAWSequence gnrs;
+ gnrs.data.open(filename, length, offset - 1);
+ return gnrs;
+ }
+// friend std::istream& operator>>(std::istream& is, gnRAWSequence& gns); //read from source.
+ /**
+ * Writes the bases in this sequence to the specified output stream (e.g. cout).
+ */
+// friend std::ostream& operator<<(std::ostream& os, const gnRAWSequence& gns); //write to source.
+
+ virtual gnSeqI length() const { return data.size(); }
+ virtual gnSeqI size() const { return data.size(); }
+
+ /**
+ * Returns `length` bases starting at the 1-based position `offset`.
+ * With length == GNSEQI_END everything from `offset` to the end of the
+ * mapped data is returned.
+ */
+ virtual std::string ToString( const gnSeqI length=GNSEQI_END, const gnSeqI offset=1 ) const
+ {
+ // offset is 1-based, so size - offset + 1 bases remain from offset to the end
+ gnSeqI len = length == GNSEQI_END ? data.size() - offset + 1 : length;
+ std::string asdf(data.data()+offset-1, len);
+ return asdf;
+ }
+
+ /**
+ * Same as ToString(length, offset) but stores the bases in `str`.
+ * @return always true.
+ */
+ virtual boolean ToString( std::string& str, const gnSeqI length=GNSEQI_END, const gnSeqI offset=1 ) const
+ {
+ // offset is 1-based, so size - offset + 1 bases remain from offset to the end
+ gnSeqI len = length == GNSEQI_END ? data.size() - offset + 1 : length;
+ str.assign(data.data()+offset-1,len);
+ return true;
+ }
+ /**
+ * Copies `length` bases starting at the 1-based position `offset` into
+ * pSeqC (everything up to the end when length == GNSEQI_END).  The caller
+ * must provide a buffer that is large enough; no terminator is written.
+ * @return always true.
+ */
+ virtual boolean ToArray( gnSeqC* pSeqC, gnSeqI length, const gnSeqI offset=1 ) const
+ {
+ // offset is 1-based, so size - offset + 1 bases remain from offset to the end
+ gnSeqI len = length == GNSEQI_END ? data.size() - offset + 1 : length;
+ memcpy(pSeqC, data.data()+offset-1, len);
+ return true;
+ }
+ /** Returns the base at the 1-based position `offset`; no bounds check is made. */
+ virtual gnSeqC GetSeqC( const gnSeqI offset ) const
+ {
+ return *(data.data()+(offset-1));
+ }
+ /** Returns the base at the 1-based position `offset`; no bounds check is made. */
+ gnSeqC operator[]( const gnSeqI offset ) const
+ {
+ return *(data.data()+(offset-1));
+ }
+
+ /** Searching is not implemented: always returns GNSEQI_ERROR. */
+ virtual gnSeqI find(const gnRAWSequence& search, const gnSeqI offset=0) const {return GNSEQI_ERROR;}
+
+private:
+ boost::iostreams::mapped_file_source data; // read-only mapping of the sequence file
+ std::string filename; // name of the mapped file; set by the constructor and LoadSource()
+}; // class gnRAWSequence
+
+/*
+GNDLLEXPORT
+std::istream& operator>>(std::istream& is, gnRAWSequence& gns); //read from source.
+GNDLLEXPORT
+std::ostream& operator<<(std::ostream& os, const gnRAWSequence& gns); //write to source.
+*/
+
+
+
+} // end namespace genome
+
+#endif
+ // _gnRAWSequence_h_
diff --git a/libMems/twister.c b/libMems/twister.c
new file mode 100644
index 0000000..977cf39
--- /dev/null
+++ b/libMems/twister.c
@@ -0,0 +1,224 @@
+/*
+ A C-program for MT19937, with initialization improved 2002/1/26.
+ Coded by Takuji Nishimura and Makoto Matsumoto.
+
+ Before using, initialize the state by using init_genrand(seed)
+ or init_by_array(init_key, key_length).
+
+ Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura,
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ 1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ 3. The names of its contributors may not be used to endorse or promote
+ products derived from this software without specific prior written
+ permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+ Any feedback is very welcome.
+ http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html
+ email: m-mat @ math.sci.hiroshima-u.ac.jp (remove space)
+*/
+
+#include <stdio.h>
+#include <time.h>
+#include <limits.h>
+#include "twister.h"
+
+
+/* Period parameters */
+/* N is the number of words in the state array mt[]; M is the index offset */
+/* used by the recurrence in genrand_int32(). */
+#define N 624
+#define M 397
+#define MATRIX_A 0x9908b0dfUL /* constant vector a */
+#define UPPER_MASK 0x80000000UL /* most significant w-r bits */
+#define LOWER_MASK 0x7fffffffUL /* least significant r bits */
+
+/* Generator state, private to this file and shared by every function below. */
+static unsigned long mt[N]; /* the array for the state vector */
+static int mti=N+1; /* mti==N+1 means mt[N] is not initialized */
+
+/* initializes mt[N] with a seed */
+/* Only the low 32 bits of s are used.  On return mti == N, so the next */
+/* call to genrand_int32() regenerates the whole state block first. */
+void init_genrand(unsigned long s)
+{
+    int i;
+
+    mt[0] = s & 0xffffffffUL;
+    for (i = 1; i < N; ++i) {
+        const unsigned long prev = mt[i-1];
+
+        /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */
+        /* In the previous versions, MSBs of the seed affect   */
+        /* only MSBs of the array mt[].                        */
+        /* 2002/01/09 modified by Makoto Matsumoto             */
+        /* The final mask keeps each word to 32 bits on >32 bit machines. */
+        mt[i] = (1812433253UL * (prev ^ (prev >> 30)) + i) & 0xffffffffUL;
+    }
+    mti = N;
+}
+
+/* initialize by an array with array-length */
+/* init_key is the array for initializing keys */
+/* key_length is its length */
+/* slight change for C++, 2004/2/26 */
+/* The state is first seeded with a fixed constant, then mixed with the key */
+/* in two passes; the key is reused cyclically when it is shorter than N. */
+/* NOTE(review): init_key[0] is read even when key_length <= 0 -- callers */
+/* presumably always pass key_length >= 1; confirm. */
+void init_by_array(unsigned long init_key[], int key_length)
+{
+ int i, j, k;
+ init_genrand(19650218UL);
+ i=1; j=0;
+ /* first pass: max(N, key_length) steps so every key word and every state word is touched */
+ k = (N>key_length ? N : key_length);
+ for (; k; k--) {
+ mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1664525UL))
+ + init_key[j] + j; /* non linear */
+ mt[i] &= 0xffffffffUL; /* for WORDSIZE > 32 machines */
+ i++; j++;
+ /* wrap the state index, carrying the last word into slot 0 */
+ if (i>=N) { mt[0] = mt[N-1]; i=1; }
+ /* wrap the key index */
+ if (j>=key_length) j=0;
+ }
+ /* second pass: N-1 steps over the state only */
+ for (k=N-1; k; k--) {
+ mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1566083941UL))
+ - i; /* non linear */
+ mt[i] &= 0xffffffffUL; /* for WORDSIZE > 32 machines */
+ i++;
+ if (i>=N) { mt[0] = mt[N-1]; i=1; }
+ }
+
+ mt[0] = 0x80000000UL; /* MSB is 1; assuring non-zero initial array */
+}
+
+/* generates a random number on [0,0xffffffff]-interval */
+/* Reads and updates the file-level state mt[] / mti.  When all N words have */
+/* been consumed, the whole block is regenerated before the next word is */
+/* tempered and returned. */
+unsigned long genrand_int32(void)
+{
+ unsigned long y;
+ static unsigned long mag01[2]={0x0UL, MATRIX_A};
+ /* mag01[x] = x * MATRIX_A for x=0,1 */
+
+ if (mti >= N) { /* generate N words at one time */
+ int kk;
+
+ if (mti == N+1) /* if init_genrand() has not been called, */
+ init_genrand(5489UL); /* a default initial seed is used */
+
+ /* words 0 .. N-M-1: the partner word mt[kk+M] lies ahead in the array */
+ for (kk=0;kk<N-M;kk++) {
+ y = (mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK);
+ mt[kk] = mt[kk+M] ^ (y >> 1) ^ mag01[y & 0x1UL];
+ }
+ /* words N-M .. N-2: the partner index wraps around, M-N is negative */
+ for (;kk<N-1;kk++) {
+ y = (mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK);
+ mt[kk] = mt[kk+(M-N)] ^ (y >> 1) ^ mag01[y & 0x1UL];
+ }
+ /* last word: its successor is mt[0] */
+ y = (mt[N-1]&UPPER_MASK)|(mt[0]&LOWER_MASK);
+ mt[N-1] = mt[M-1] ^ (y >> 1) ^ mag01[y & 0x1UL];
+
+ mti = 0;
+ }
+
+ y = mt[mti++];
+
+ /* Tempering */
+ y ^= (y >> 11);
+ y ^= (y << 7) & 0x9d2c5680UL;
+ y ^= (y << 15) & 0xefc60000UL;
+ y ^= (y >> 18);
+
+ return y;
+}
+
+/* generates a random number on [0,0x7fffffff]-interval */
+long genrand_int31(void)
+{
+    /* discard the lowest bit of one 32-bit draw */
+    const unsigned long draw = genrand_int32();
+
+    return (long)(draw >> 1);
+}
+
+/* generates a random number on [0,1]-real-interval */
+double genrand_real1(void)
+{
+    const unsigned long draw = genrand_int32();
+
+    /* multiplied by the reciprocal of 2^32-1, so the largest draw maps to 1 */
+    return draw*(1.0/4294967295.0);
+}
+
+/* generates a random number on [0,1)-real-interval */
+double genrand_real2(void)
+{
+    const unsigned long draw = genrand_int32();
+
+    /* multiplied by the reciprocal of 2^32, so 1 is never reached */
+    return draw*(1.0/4294967296.0);
+}
+
+/* generates a random number on (0,1)-real-interval */
+double genrand_real3(void)
+{
+    const double draw = (double)genrand_int32();
+
+    /* shift by half a step, then scale by the reciprocal of 2^32: */
+    /* neither 0 nor 1 can be produced */
+    return (draw + 0.5)*(1.0/4294967296.0);
+}
+
+/* generates a random number on [0,1) with 53-bit resolution*/
+double genrand_res53(void)
+{
+    /* two consecutive draws: 27 bits from the first, 26 bits from the second */
+    const unsigned long hi = genrand_int32()>>5;
+    const unsigned long lo = genrand_int32()>>6;
+
+    /* hi*2^26 + lo, scaled by the reciprocal of 2^53 */
+    return (hi*67108864.0 + lo)*(1.0/9007199254740992.0);
+}
+/* These real versions are due to Isaku Wada, 2002/01/09 added */
+
+/* Public entry point declared in twister.h: seeds the generator by */
+/* forwarding seed to init_genrand(). */
+void SetTwisterSeed(unsigned long seed)
+{
+ init_genrand(seed);
+}
+
+/* Builds a seed from the current time() and clock() values. */
+/* Each value is hashed byte by byte; a counter that grows with every call */
+/* is added to the time hash so that repeated calls do not repeat a seed. */
+/* Based on code by Lawrence Kirby (fred at genesis.demon.co.uk) */
+unsigned long CreateTwisterSeed( )
+{
+    static unsigned long differ = 0;  /* guarantee time-based seeds will change */
+
+    const time_t  now   = time(NULL);
+    const clock_t ticks = clock();
+
+    const unsigned char *bytes;
+    unsigned long timeHash  = 0;
+    unsigned long clockHash = 0;
+    size_t k;
+
+    /* Hashing the raw bytes works whatever arithmetic type time_t and */
+    /* clock_t are (better than a plain cast if they are floating point). */
+    bytes = (const unsigned char *) &now;
+    for( k = 0; k < sizeof(now); ++k )
+        timeHash = timeHash * (UCHAR_MAX + 2U) + bytes[k];
+
+    bytes = (const unsigned char *) &ticks;
+    for( k = 0; k < sizeof(ticks); ++k )
+        clockHash = clockHash * (UCHAR_MAX + 2U) + bytes[k];
+
+    return ( timeHash + differ++ ) ^ clockHash;
+}
+
+/* Public entry point declared in twister.h: one draw from genrand_real1(), */
+/* i.e. a value on the closed interval [0,1]. */
+double RandTwisterDouble()
+{
+    const double draw = genrand_real1();
+
+    return draw;
+}
+
+/* Public entry point declared in twister.h: one draw from genrand_int32(), */
+/* converted from unsigned long to unsigned. */
+unsigned RandTwisterUnsigned()
+{
+    const unsigned long draw = genrand_int32();
+
+    return (unsigned)draw;
+}
+
+
diff --git a/libMems/twister.h b/libMems/twister.h
new file mode 100644
index 0000000..24c5e6b
--- /dev/null
+++ b/libMems/twister.h
@@ -0,0 +1,18 @@
+#ifndef __TWISTER_H__
+#define __TWISTER_H__
+
+/*
+ * C interface to the MT19937 Mersenne Twister generator implemented in
+ * twister.c.  The generator keeps its state in file-level variables, so all
+ * callers share one random stream.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Seeds the generator with the given value. */
+void SetTwisterSeed (unsigned long seed);
+/* Returns a seed derived from time(), clock() and a per-call counter. */
+unsigned long CreateTwisterSeed(void);
+/* Returns a random double on the closed interval [0,1]. */
+double RandTwisterDouble (void);
+/* Returns a random 32-bit value converted to unsigned. */
+unsigned RandTwisterUnsigned(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // __TWISTER_H__
+
diff --git a/m4/ac_cxx_namespaces.m4 b/m4/ac_cxx_namespaces.m4
new file mode 100644
index 0000000..2f18477
--- /dev/null
+++ b/m4/ac_cxx_namespaces.m4
@@ -0,0 +1,25 @@
+dnl @synopsis AC_CXX_NAMESPACES
+dnl
+dnl If the compiler can prevent names clashes using namespaces, define
+dnl HAVE_NAMESPACES.
+dnl
+dnl @category Cxx
+dnl @author Todd Veldhuizen
+dnl @author Luc Maisonobe <luc at spaceroots.org>
+dnl @version 2004-02-04
+dnl @license AllPermissive
+
+dnl Compiles a small C++ program that declares nested namespaces and pulls
+dnl the inner one in with a using-directive.  The result is cached in
+dnl ac_cv_cxx_namespaces (yes/no); HAVE_NAMESPACES is defined on success.
+dnl The current language is saved before the test and restored afterwards.
+AC_DEFUN([AC_CXX_NAMESPACES],
+[AC_CACHE_CHECK(whether the compiler implements namespaces,
+ac_cv_cxx_namespaces,
+[AC_LANG_SAVE
+ AC_LANG_CPLUSPLUS
+ AC_TRY_COMPILE([namespace Outer { namespace Inner { int i = 0; }}],
+ [using namespace Outer::Inner; return i;],
+ ac_cv_cxx_namespaces=yes, ac_cv_cxx_namespaces=no)
+ AC_LANG_RESTORE
+])
+if test "$ac_cv_cxx_namespaces" = yes; then
+ AC_DEFINE(HAVE_NAMESPACES,,[define if the compiler implements namespaces])
+fi
+])
diff --git a/m4/ax_openmp.m4 b/m4/ax_openmp.m4
new file mode 100644
index 0000000..91a2ee3
--- /dev/null
+++ b/m4/ax_openmp.m4
@@ -0,0 +1,104 @@
+##### http://autoconf-archive.cryp.to/ax_openmp.html
+#
+# SYNOPSIS
+#
+# AX_OPENMP([ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]])
+#
+# DESCRIPTION
+#
+# This macro tries to find out how to compile programs that use
+# OpenMP, a standard API and set of compiler directives for parallel
+# programming (see http://www.openmp.org/)
+#
+# On success, it sets the
+# OPENMP_CFLAGS/OPENMP_CXXFLAGS/OPENMP_F77FLAGS output variable to
+# the flag (e.g. -omp) used both to compile *and* link OpenMP
+# programs in the current language.
+#
+# NOTE: You are assumed to not only compile your program with these
+# flags, but also link it with them as well.
+#
+# If you want to compile everything with OpenMP, you should set:
+#
+# CFLAGS="$CFLAGS $OPENMP_CFLAGS"
+# #OR# CXXFLAGS="$CXXFLAGS $OPENMP_CXXFLAGS"
+# #OR# FFLAGS="$FFLAGS $OPENMP_FFLAGS"
+#
+# (depending on the selected language).
+#
+# The user can override the default choice by setting the
+# corresponding environment variable (e.g. OPENMP_CFLAGS).
+#
+# ACTION-IF-FOUND is a list of shell commands to run if an OpenMP
+# flag is found, and ACTION-IF-NOT-FOUND is a list of commands to run
+# if it is not found. If ACTION-IF-FOUND is not specified, the
+# default action will define HAVE_OPENMP.
+#
+# LAST MODIFICATION
+#
+# 2006-01-24
+#
+# COPYLEFT
+#
+# Copyright (c) 2006 Steven G. Johnson <stevenj at alum.mit.edu>
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation; either version 2 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+# 02111-1307, USA.
+#
+# As a special exception, the respective Autoconf Macro's copyright
+# owner gives unlimited permission to copy, distribute and modify the
+# configure scripts that are the output of Autoconf when processing
+# the Macro. You need not follow the terms of the GNU General Public
+# License when using or distributing such scripts, even though
+# portions of the text of the Macro appear in them. The GNU General
+# Public License (GPL) does govern all other use of the material that
+# constitutes the Autoconf Macro.
+#
+# This special exception to the GPL applies to versions of the
+# Autoconf Macro released by the Autoconf Macro Archive. When you
+# make and distribute a modified version of the Autoconf Macro, you
+# may extend this special exception to the GPL to apply to your
+# modified version as well.
+
+dnl AX_OPENMP probes, for the current language, which compiler flag makes
+dnl omp_set_num_threads link.  The winning flag (or "none" / "unknown") is
+dnl cached in ax_cv_<lang>_openmp.  On success OPENMP_<PREFIX>FLAGS is set
+dnl (unless no flag is needed) and $1 runs, defaulting to defining
+dnl HAVE_OPENMP; on failure $2 runs.  <PREFIX>FLAGS itself is restored to
+dnl its original value once the probe is over.
+AC_DEFUN([AX_OPENMP], [
+AC_PREREQ(2.59) dnl for _AC_LANG_PREFIX
+
+AC_CACHE_CHECK([for OpenMP flag of _AC_LANG compiler], ax_cv_[]_AC_LANG_ABBREV[]_openmp, [save[]_AC_LANG_PREFIX[]FLAGS=$[]_AC_LANG_PREFIX[]FLAGS
+ax_cv_[]_AC_LANG_ABBREV[]_openmp=unknown
+# Flags to try: -fopenmp (gcc), -openmp (icc), -mp (SGI & PGI),
+# -xopenmp (Sun), -omp (Tru64), -qsmp=omp (AIX), none
+ax_openmp_flags="-fopenmp -openmp -mp -xopenmp -omp -qsmp=omp none"
+# A flag supplied by the user through the environment is tried first.
+if test "x$OPENMP_[]_AC_LANG_PREFIX[]FLAGS" != x; then
+ ax_openmp_flags="$OPENMP_[]_AC_LANG_PREFIX[]FLAGS $ax_openmp_flags"
+fi
+for ax_openmp_flag in $ax_openmp_flags; do
+ # The none case must test with the saved flags unchanged, so it has to
+ # expand the same saved variable that was assigned above.
+ case $ax_openmp_flag in
+ none) []_AC_LANG_PREFIX[]FLAGS=$save[]_AC_LANG_PREFIX[]FLAGS ;;
+ *) []_AC_LANG_PREFIX[]FLAGS="$save[]_AC_LANG_PREFIX[]FLAGS $ax_openmp_flag" ;;
+ esac
+ AC_TRY_LINK_FUNC(omp_set_num_threads,
+ [ax_cv_[]_AC_LANG_ABBREV[]_openmp=$ax_openmp_flag; break])
+done
+[]_AC_LANG_PREFIX[]FLAGS=$save[]_AC_LANG_PREFIX[]FLAGS
+])
+if test "x$ax_cv_[]_AC_LANG_ABBREV[]_openmp" = "xunknown"; then
+ m4_default([$2],:)
+else
+ if test "x$ax_cv_[]_AC_LANG_ABBREV[]_openmp" != "xnone"; then
+ OPENMP_[]_AC_LANG_PREFIX[]FLAGS=$ax_cv_[]_AC_LANG_ABBREV[]_openmp
+ fi
+ m4_default([$1], [AC_DEFINE(HAVE_OPENMP,1,[Define if OpenMP is enabled])])
+fi
+])dnl AX_OPENMP
diff --git a/m4/ax_prog_doxygen.m4 b/m4/ax_prog_doxygen.m4
new file mode 100644
index 0000000..14c31cb
--- /dev/null
+++ b/m4/ax_prog_doxygen.m4
@@ -0,0 +1,535 @@
+##### http://autoconf-archive.cryp.to/ax_prog_doxygen.html
+#
+# SYNOPSIS
+#
+# DX_INIT_DOXYGEN(PROJECT-NAME, DOXYFILE-PATH, [OUTPUT-DIR])
+# DX_DOXYGEN_FEATURE(ON|OFF)
+# DX_DOT_FEATURE(ON|OFF)
+# DX_HTML_FEATURE(ON|OFF)
+# DX_CHM_FEATURE(ON|OFF)
+# DX_CHI_FEATURE(ON|OFF)
+# DX_MAN_FEATURE(ON|OFF)
+# DX_RTF_FEATURE(ON|OFF)
+# DX_XML_FEATURE(ON|OFF)
+# DX_PDF_FEATURE(ON|OFF)
+# DX_PS_FEATURE(ON|OFF)
+#
+# DESCRIPTION
+#
+# The DX_*_FEATURE macros control the default setting for the given
+# Doxygen feature. Supported features are 'DOXYGEN' itself, 'DOT' for
+# generating graphics, 'HTML' for plain HTML, 'CHM' for compressed
+# HTML help (for MS users), 'CHI' for generating a separate .chi file
+# by the .chm file, and 'MAN', 'RTF', 'XML', 'PDF' and 'PS' for the
+# appropriate output formats. The environment variable
+# DOXYGEN_PAPER_SIZE may be specified to override the default
+# 'a4wide' paper size.
+#
+# By default, HTML, PDF and PS documentation is generated as this
+# seems to be the most popular and portable combination. MAN pages
+# created by Doxygen are usually problematic, though by picking an
+# appropriate subset and doing some massaging they might be better
+# than nothing. CHM and RTF are specific for MS (note that you can't
+# generate both HTML and CHM at the same time). The XML is rather
+# useless unless you apply specialized post-processing to it.
+#
+# The macros mainly control the default state of the feature. The user
+# can override the default by specifying --enable or --disable. The
+# macros ensure that contradictory flags are not given (e.g.,
+# --enable-doxygen-html and --enable-doxygen-chm,
+# --enable-doxygen-anything with --disable-doxygen, etc.) Finally,
+# each feature will be automatically disabled (with a warning) if the
+# required programs are missing.
+#
+# Once all the feature defaults have been specified, call
+# DX_INIT_DOXYGEN with the following parameters: a one-word name for
+# the project for use as a filename base etc., an optional
+# configuration file name (the default is 'Doxyfile', the same as
+# Doxygen's default), and an optional output directory name (the
+# default is 'doxygen-doc').
+#
+# Automake Support
+#
+# The following is a template aminclude.am file for use with
+# Automake. Make targets and variables values are controlled by the
+# various DX_COND_* conditionals set by autoconf.
+#
+# The provided targets are:
+#
+# doxygen-doc: Generate all doxygen documentation.
+#
+# doxygen-run: Run doxygen, which will generate some of the
+# documentation (HTML, CHM, CHI, MAN, RTF, XML)
+# but will not do the post processing required
+# for the rest of it (PS, PDF, and some MAN).
+#
+# doxygen-man: Rename some doxygen generated man pages.
+#
+# doxygen-ps: Generate doxygen PostScript documentation.
+#
+# doxygen-pdf: Generate doxygen PDF documentation.
+#
+# Note that by default these are not integrated into the automake
+# targets. If doxygen is used to generate man pages, you can achieve
+# this integration by setting man3_MANS to the list of man pages
+# generated and then adding the dependency:
+#
+# $(man3_MANS): doxygen-doc
+#
+# This will cause make to run doxygen and generate all the
+# documentation.
+#
+# The following variable is intended for use in Makefile.am:
+#
+# DX_CLEANFILES = everything to clean.
+#
+# Then add this variable to MOSTLYCLEANFILES.
+#
+# ----- begin aminclude.am -------------------------------------
+#
+# ## --------------------------------- ##
+# ## Format-independent Doxygen rules. ##
+# ## --------------------------------- ##
+#
+# if DX_COND_doc
+#
+# ## ------------------------------- ##
+# ## Rules specific for HTML output. ##
+# ## ------------------------------- ##
+#
+# if DX_COND_html
+#
+# DX_CLEAN_HTML = @DX_DOCDIR@/html
+#
+# endif DX_COND_html
+#
+# ## ------------------------------ ##
+# ## Rules specific for CHM output. ##
+# ## ------------------------------ ##
+#
+# if DX_COND_chm
+#
+# DX_CLEAN_CHM = @DX_DOCDIR@/chm
+#
+# if DX_COND_chi
+#
+# DX_CLEAN_CHI = @DX_DOCDIR@/@PACKAGE@.chi
+#
+# endif DX_COND_chi
+#
+# endif DX_COND_chm
+#
+# ## ------------------------------ ##
+# ## Rules specific for MAN output. ##
+# ## ------------------------------ ##
+#
+# if DX_COND_man
+#
+# DX_CLEAN_MAN = @DX_DOCDIR@/man
+#
+# endif DX_COND_man
+#
+# ## ------------------------------ ##
+# ## Rules specific for RTF output. ##
+# ## ------------------------------ ##
+#
+# if DX_COND_rtf
+#
+# DX_CLEAN_RTF = @DX_DOCDIR@/rtf
+#
+# endif DX_COND_rtf
+#
+# ## ------------------------------ ##
+# ## Rules specific for XML output. ##
+# ## ------------------------------ ##
+#
+# if DX_COND_xml
+#
+# DX_CLEAN_XML = @DX_DOCDIR@/xml
+#
+# endif DX_COND_xml
+#
+# ## ----------------------------- ##
+# ## Rules specific for PS output. ##
+# ## ----------------------------- ##
+#
+# if DX_COND_ps
+#
+# DX_CLEAN_PS = @DX_DOCDIR@/@PACKAGE@.ps
+#
+# DX_PS_GOAL = doxygen-ps
+#
+# doxygen-ps: @DX_DOCDIR@/@PACKAGE@.ps
+#
+# @DX_DOCDIR@/@PACKAGE@.ps: @DX_DOCDIR@/@PACKAGE@.tag
+# cd @DX_DOCDIR@/latex; \
+# rm -f *.aux *.toc *.idx *.ind *.ilg *.log *.out; \
+# $(DX_LATEX) refman.tex; \
+# $(DX_MAKEINDEX) refman.idx; \
+# $(DX_LATEX) refman.tex; \
+# countdown=5; \
+# while $(DX_EGREP) 'Rerun (LaTeX|to get cross-references right)' \
+# refman.log > /dev/null 2>&1 \
+# && test $$countdown -gt 0; do \
+# $(DX_LATEX) refman.tex; \
+# countdown=`expr $$countdown - 1`; \
+# done; \
+# $(DX_DVIPS) -o ../@PACKAGE@.ps refman.dvi
+#
+# endif DX_COND_ps
+#
+# ## ------------------------------ ##
+# ## Rules specific for PDF output. ##
+# ## ------------------------------ ##
+#
+# if DX_COND_pdf
+#
+# DX_CLEAN_PDF = @DX_DOCDIR@/@PACKAGE@.pdf
+#
+# DX_PDF_GOAL = doxygen-pdf
+#
+# doxygen-pdf: @DX_DOCDIR@/@PACKAGE@.pdf
+#
+# @DX_DOCDIR@/@PACKAGE@.pdf: @DX_DOCDIR@/@PACKAGE@.tag
+# cd @DX_DOCDIR@/latex; \
+# rm -f *.aux *.toc *.idx *.ind *.ilg *.log *.out; \
+# $(DX_PDFLATEX) refman.tex; \
+# $(DX_MAKEINDEX) refman.idx; \
+# $(DX_PDFLATEX) refman.tex; \
+# countdown=5; \
+# while $(DX_EGREP) 'Rerun (LaTeX|to get cross-references right)' \
+# refman.log > /dev/null 2>&1 \
+# && test $$countdown -gt 0; do \
+# $(DX_PDFLATEX) refman.tex; \
+# countdown=`expr $$countdown - 1`; \
+# done; \
+# mv refman.pdf ../@PACKAGE@.pdf
+#
+# endif DX_COND_pdf
+#
+# ## ------------------------------------------------- ##
+# ## Rules specific for LaTeX (shared for PS and PDF). ##
+# ## ------------------------------------------------- ##
+#
+# if DX_COND_latex
+#
+# DX_CLEAN_LATEX = @DX_DOCDIR@/latex
+#
+# endif DX_COND_latex
+#
+# .PHONY: doxygen-run doxygen-doc $(DX_PS_GOAL) $(DX_PDF_GOAL)
+#
+# .INTERMEDIATE: doxygen-run $(DX_PS_GOAL) $(DX_PDF_GOAL)
+#
+# doxygen-run: @DX_DOCDIR@/@PACKAGE@.tag
+#
+# doxygen-doc: doxygen-run $(DX_PS_GOAL) $(DX_PDF_GOAL)
+#
+# @DX_DOCDIR@/@PACKAGE@.tag: $(DX_CONFIG) $(pkginclude_HEADERS)
+# rm -rf @DX_DOCDIR@
+# $(DX_ENV) $(DX_DOXYGEN) $(srcdir)/$(DX_CONFIG)
+#
+# DX_CLEANFILES = \
+# @DX_DOCDIR@/@PACKAGE@.tag \
+# -r \
+# $(DX_CLEAN_HTML) \
+# $(DX_CLEAN_CHM) \
+# $(DX_CLEAN_CHI) \
+# $(DX_CLEAN_MAN) \
+# $(DX_CLEAN_RTF) \
+# $(DX_CLEAN_XML) \
+# $(DX_CLEAN_PS) \
+# $(DX_CLEAN_PDF) \
+# $(DX_CLEAN_LATEX)
+#
+# endif DX_COND_doc
+#
+# ----- end aminclude.am ---------------------------------------
+#
+# LAST MODIFICATION
+#
+# 2007-08-04
+#
+# COPYLEFT
+#
+# Copyright (c) 2007 Oren Ben-Kiki <oren at ben-kiki.org>
+#
+# Copying and distribution of this file, with or without
+# modification, are permitted in any medium without royalty provided
+# the copyright notice and this notice are preserved.
+
+## ----------##
+## Defaults. ##
+## ----------##
+
+DX_ENV=""
+AC_DEFUN([DX_FEATURE_doc], ON)
+AC_DEFUN([DX_FEATURE_dot], ON)
+AC_DEFUN([DX_FEATURE_man], OFF)
+AC_DEFUN([DX_FEATURE_html], ON)
+AC_DEFUN([DX_FEATURE_chm], OFF)
+AC_DEFUN([DX_FEATURE_chi], OFF)
+AC_DEFUN([DX_FEATURE_rtf], OFF)
+AC_DEFUN([DX_FEATURE_xml], OFF)
+AC_DEFUN([DX_FEATURE_pdf], ON)
+AC_DEFUN([DX_FEATURE_ps], ON)
+
+## --------------- ##
+## Private macros. ##
+## --------------- ##
+
+# DX_ENV_APPEND(VARIABLE, VALUE)
+# ------------------------------
+# Append VARIABLE="VALUE" to DX_ENV for invoking doxygen.
+AC_DEFUN([DX_ENV_APPEND], [AC_SUBST([DX_ENV], ["$DX_ENV $1='$2'"])])
+
+# DX_DIRNAME_EXPR(PATH)
+# ---------------------
+# Expand into a shell expression that prints the directory part of PATH.
+AC_DEFUN([DX_DIRNAME_EXPR],
+ [[expr ".$1" : '\(\.\)[^/]*$' \| "x$1" : 'x\(.*\)/[^/]*$']])
+
+# DX_IF_FEATURE(FEATURE, IF-ON, IF-OFF)
+# -------------------------------------
+# Expands according to the M4 (static) status of the feature.
+AC_DEFUN([DX_IF_FEATURE], [ifelse(DX_FEATURE_$1, ON, [$2], [$3])])
+
+# DX_REQUIRE_PROG(VARIABLE, PROGRAM)
+# ----------------------------------
+# Require the specified program to be found for the DX_CURRENT_FEATURE to work.
+AC_DEFUN([DX_REQUIRE_PROG], [
+AC_PATH_TOOL([$1], [$2])
+if test "$DX_FLAG_[]DX_CURRENT_FEATURE$$1" = 1; then
+ AC_MSG_WARN([$2 not found - will not DX_CURRENT_DESCRIPTION])
+ AC_SUBST([DX_FLAG_]DX_CURRENT_FEATURE, 0)
+fi
+])
+
+# DX_TEST_FEATURE(FEATURE)
+# ------------------------
+# Expand to a shell expression testing whether the feature is active.
+AC_DEFUN([DX_TEST_FEATURE], [test "$DX_FLAG_$1" = 1])
+
+# DX_CHECK_DEPEND(REQUIRED_FEATURE, REQUIRED_STATE)
+# -------------------------------------------------
+# Verify that a required feature has the right state before trying to turn on
+# the DX_CURRENT_FEATURE; otherwise abort, naming both features in the error.
+AC_DEFUN([DX_CHECK_DEPEND], [
+test "$DX_FLAG_$1" = "$2" \
+|| AC_MSG_ERROR([doxygen-DX_CURRENT_FEATURE ifelse([$2], 1,
+ requires, contradicts) doxygen-$1])
+])
+
+# DX_CLEAR_DEPEND(REQUIRED_FEATURE, REQUIRED_STATE)
+# -------------------------------------------------
+# Turn off the DX_CURRENT_FEATURE unless REQUIRED_FEATURE is in REQUIRED_STATE.
+AC_DEFUN([DX_CLEAR_DEPEND], [
+test "$DX_FLAG_$1" = "$2" || AC_SUBST([DX_FLAG_]DX_CURRENT_FEATURE, 0)
+])
+
+# DX_ARG_ABLE(FEATURE, DESCRIPTION,
+# CHECK_DEPEND, CLEAR_DEPEND,
+# REQUIRE, DO-IF-ON, DO-IF-OFF)
+# --------------------------------------------
+# Parse the command-line option controlling a feature. CHECK_DEPEND is called
+# if the user explicitly turns the feature on (and invokes DX_CHECK_DEPEND),
+# otherwise CLEAR_DEPEND is called to turn off the default state if a required
+# feature is disabled (using DX_CLEAR_DEPEND). REQUIRE performs additional
+# requirement tests (DX_REQUIRE_PROG). Finally, an automake flag is set and
+# DO-IF-ON or DO-IF-OFF are called according to the final state of the feature.
+AC_DEFUN([DX_ARG_ABLE], [
+ AC_DEFUN([DX_CURRENT_FEATURE], [$1])
+ AC_DEFUN([DX_CURRENT_DESCRIPTION], [$2])
+ AC_ARG_ENABLE(doxygen-$1,
+ [AS_HELP_STRING(DX_IF_FEATURE([$1], [--disable-doxygen-$1],
+ [--enable-doxygen-$1]),
+ DX_IF_FEATURE([$1], [don't $2], [$2]))],
+ [
+case "$enableval" in
+#(
+y|Y|yes|Yes|YES)
+ AC_SUBST([DX_FLAG_$1], 1)
+ $3
+;; #(
+n|N|no|No|NO)
+ AC_SUBST([DX_FLAG_$1], 0)
+;; #(
+*)
+ AC_MSG_ERROR([invalid value '$enableval' given to doxygen-$1])
+;;
+esac
+], [
+AC_SUBST([DX_FLAG_$1], [DX_IF_FEATURE([$1], 1, 0)])
+$4
+])
+if DX_TEST_FEATURE([$1]); then
+ $5
+ :
+fi
+if DX_TEST_FEATURE([$1]); then
+ AM_CONDITIONAL(DX_COND_$1, :)
+ $6
+ :
+else
+ AM_CONDITIONAL(DX_COND_$1, false)
+ $7
+ :
+fi
+])
+
+## -------------- ##
+## Public macros. ##
+## -------------- ##
+
+# DX_XXX_FEATURE(DEFAULT_STATE): redefine DX_FEATURE_xxx as DEFAULT_STATE.
+# ------------------------------------------------------------------------
+AC_DEFUN([DX_DOXYGEN_FEATURE], [AC_DEFUN([DX_FEATURE_doc], [$1])])
+AC_DEFUN([DX_DOT_FEATURE], [AC_DEFUN([DX_FEATURE_dot], [$1])])
+AC_DEFUN([DX_MAN_FEATURE], [AC_DEFUN([DX_FEATURE_man], [$1])])
+AC_DEFUN([DX_HTML_FEATURE], [AC_DEFUN([DX_FEATURE_html], [$1])])
+AC_DEFUN([DX_CHM_FEATURE], [AC_DEFUN([DX_FEATURE_chm], [$1])])
+AC_DEFUN([DX_CHI_FEATURE], [AC_DEFUN([DX_FEATURE_chi], [$1])])
+AC_DEFUN([DX_RTF_FEATURE], [AC_DEFUN([DX_FEATURE_rtf], [$1])])
+AC_DEFUN([DX_XML_FEATURE], [AC_DEFUN([DX_FEATURE_xml], [$1])])
+AC_DEFUN([DX_PDF_FEATURE], [AC_DEFUN([DX_FEATURE_pdf], [$1])])
+AC_DEFUN([DX_PS_FEATURE], [AC_DEFUN([DX_FEATURE_ps], [$1])])
+
+# DX_INIT_DOXYGEN(PROJECT, [CONFIG-FILE], [OUTPUT-DOC-DIR])
+# ---------------------------------------------------------
+# PROJECT also serves as the base name for the documentation files.
+# The default CONFIG-FILE is "Doxyfile" and OUTPUT-DOC-DIR is "doxygen-doc".
+AC_DEFUN([DX_INIT_DOXYGEN], [
+
+# Files:
+AC_SUBST([DX_PROJECT], [$1])
+AC_SUBST([DX_CONFIG], [ifelse([$2], [], Doxyfile, [$2])])
+AC_SUBST([DX_DOCDIR], [ifelse([$3], [], doxygen-doc, [$3])])
+
+# Environment variables used inside doxygen.cfg:
+DX_ENV_APPEND(SRCDIR, $srcdir)
+DX_ENV_APPEND(PROJECT, $DX_PROJECT)
+DX_ENV_APPEND(DOCDIR, $DX_DOCDIR)
+DX_ENV_APPEND(VERSION, $PACKAGE_VERSION)
+
+# Doxygen itself:
+DX_ARG_ABLE(doc, [generate any doxygen documentation],
+ [],
+ [],
+ [DX_REQUIRE_PROG([DX_DOXYGEN], doxygen)
+ DX_REQUIRE_PROG([DX_PERL], perl)],
+ [DX_ENV_APPEND(PERL_PATH, $DX_PERL)])
+
+# Dot for graphics:
+DX_ARG_ABLE(dot, [generate graphics for doxygen documentation],
+ [DX_CHECK_DEPEND(doc, 1)],
+ [DX_CLEAR_DEPEND(doc, 1)],
+ [DX_REQUIRE_PROG([DX_DOT], dot)],
+ [DX_ENV_APPEND(HAVE_DOT, YES)
+ DX_ENV_APPEND(DOT_PATH, [`DX_DIRNAME_EXPR($DX_DOT)`])],
+ [DX_ENV_APPEND(HAVE_DOT, NO)])
+
+# Man pages generation:
+DX_ARG_ABLE(man, [generate doxygen manual pages],
+ [DX_CHECK_DEPEND(doc, 1)],
+ [DX_CLEAR_DEPEND(doc, 1)],
+ [],
+ [DX_ENV_APPEND(GENERATE_MAN, YES)],
+ [DX_ENV_APPEND(GENERATE_MAN, NO)])
+
+# RTF file generation:
+DX_ARG_ABLE(rtf, [generate doxygen RTF documentation],
+ [DX_CHECK_DEPEND(doc, 1)],
+ [DX_CLEAR_DEPEND(doc, 1)],
+ [],
+ [DX_ENV_APPEND(GENERATE_RTF, YES)],
+ [DX_ENV_APPEND(GENERATE_RTF, NO)])
+
+# XML file generation:
+DX_ARG_ABLE(xml, [generate doxygen XML documentation],
+ [DX_CHECK_DEPEND(doc, 1)],
+ [DX_CLEAR_DEPEND(doc, 1)],
+ [],
+ [DX_ENV_APPEND(GENERATE_XML, YES)],
+ [DX_ENV_APPEND(GENERATE_XML, NO)])
+
+# (Compressed) HTML help generation:
+DX_ARG_ABLE(chm, [generate doxygen compressed HTML help documentation],
+ [DX_CHECK_DEPEND(doc, 1)],
+ [DX_CLEAR_DEPEND(doc, 1)],
+ [DX_REQUIRE_PROG([DX_HHC], hhc)],
+ [DX_ENV_APPEND(HHC_PATH, $DX_HHC)
+ DX_ENV_APPEND(GENERATE_HTML, YES)
+ DX_ENV_APPEND(GENERATE_HTMLHELP, YES)],
+ [DX_ENV_APPEND(GENERATE_HTMLHELP, NO)])
+
+# Seperate CHI file generation.
+DX_ARG_ABLE(chi, [generate doxygen seperate compressed HTML help index file],
+ [DX_CHECK_DEPEND(chm, 1)],
+ [DX_CLEAR_DEPEND(chm, 1)],
+ [],
+ [DX_ENV_APPEND(GENERATE_CHI, YES)],
+ [DX_ENV_APPEND(GENERATE_CHI, NO)])
+
+# Plain HTML pages generation:
+DX_ARG_ABLE(html, [generate doxygen plain HTML documentation],
+ [DX_CHECK_DEPEND(doc, 1) DX_CHECK_DEPEND(chm, 0)],
+ [DX_CLEAR_DEPEND(doc, 1) DX_CLEAR_DEPEND(chm, 0)],
+ [],
+ [DX_ENV_APPEND(GENERATE_HTML, YES)],
+ [DX_TEST_FEATURE(chm) || DX_ENV_APPEND(GENERATE_HTML, NO)])
+
+# PostScript file generation:
+DX_ARG_ABLE(ps, [generate doxygen PostScript documentation],
+ [DX_CHECK_DEPEND(doc, 1)],
+ [DX_CLEAR_DEPEND(doc, 1)],
+ [DX_REQUIRE_PROG([DX_LATEX], latex)
+ DX_REQUIRE_PROG([DX_MAKEINDEX], makeindex)
+ DX_REQUIRE_PROG([DX_DVIPS], dvips)
+ DX_REQUIRE_PROG([DX_EGREP], egrep)])
+
+# PDF file generation:
+DX_ARG_ABLE(pdf, [generate doxygen PDF documentation],
+ [DX_CHECK_DEPEND(doc, 1)],
+ [DX_CLEAR_DEPEND(doc, 1)],
+ [DX_REQUIRE_PROG([DX_PDFLATEX], pdflatex)
+ DX_REQUIRE_PROG([DX_MAKEINDEX], makeindex)
+ DX_REQUIRE_PROG([DX_EGREP], egrep)])
+
+# LaTeX generation for PS and/or PDF:
+if DX_TEST_FEATURE(ps) || DX_TEST_FEATURE(pdf); then
+ AM_CONDITIONAL(DX_COND_latex, :)
+ DX_ENV_APPEND(GENERATE_LATEX, YES)
+else
+ AM_CONDITIONAL(DX_COND_latex, false)
+ DX_ENV_APPEND(GENERATE_LATEX, NO)
+fi
+
+# Paper size for PS and/or PDF:
+AC_ARG_VAR(DOXYGEN_PAPER_SIZE,
+ [a4wide (default), a4, letter, legal or executive])
+case "$DOXYGEN_PAPER_SIZE" in
+#(
+"")
+ AC_SUBST(DOXYGEN_PAPER_SIZE, "")
+;; #(
+a4wide|a4|letter|legal|executive)
+ DX_ENV_APPEND(PAPER_SIZE, $DOXYGEN_PAPER_SIZE)
+;; #(
+*)
+ AC_MSG_ERROR([unknown DOXYGEN_PAPER_SIZE='$DOXYGEN_PAPER_SIZE'])
+;;
+esac
+
+#For debugging:
+#echo DX_FLAG_doc=$DX_FLAG_doc
+#echo DX_FLAG_dot=$DX_FLAG_dot
+#echo DX_FLAG_man=$DX_FLAG_man
+#echo DX_FLAG_html=$DX_FLAG_html
+#echo DX_FLAG_chm=$DX_FLAG_chm
+#echo DX_FLAG_chi=$DX_FLAG_chi
+#echo DX_FLAG_rtf=$DX_FLAG_rtf
+#echo DX_FLAG_xml=$DX_FLAG_xml
+#echo DX_FLAG_pdf=$DX_FLAG_pdf
+#echo DX_FLAG_ps=$DX_FLAG_ps
+#echo DX_ENV=$DX_ENV
+])
diff --git a/m4/boost.m4 b/m4/boost.m4
new file mode 100644
index 0000000..11a623d
--- /dev/null
+++ b/m4/boost.m4
@@ -0,0 +1,1343 @@
+# boost.m4: Locate Boost headers and libraries for autoconf-based projects.
+# Copyright (C) 2007-2011, 2014 Benoit Sigoure <tsuna at lrde.epita.fr>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Additional permission under section 7 of the GNU General Public
+# License, version 3 ("GPLv3"):
+#
+# If you convey this file as part of a work that contains a
+# configuration script generated by Autoconf, you may do so under
+# terms of your choice.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+m4_define([_BOOST_SERIAL], [m4_translit([
+# serial 23
+], [#
+], [])])
+
+# Original sources can be found at http://github.com/tsuna/boost.m4
+# You can fetch the latest version of the script by doing:
+# wget http://github.com/tsuna/boost.m4/raw/master/build-aux/boost.m4
+
+# ------ #
+# README #
+# ------ #
+
+# This file provides several macros to use the various Boost libraries.
+# The first macro is BOOST_REQUIRE. It will simply check if it's possible to
+# find the Boost headers of a given (optional) minimum version and it will
+# define BOOST_CPPFLAGS accordingly. It will add an option --with-boost to
+# your configure so that users can specify non standard locations.
+# If the user's environment contains BOOST_ROOT and --with-boost was not
+# specified, --with-boost=$BOOST_ROOT is implicitly used.
+# For more README and documentation, go to http://github.com/tsuna/boost.m4
+# Note: THESE MACROS ASSUME THAT YOU USE LIBTOOL. If you don't, don't worry,
+# simply read the README, it will show you what to do step by step.
+
+m4_pattern_forbid([^_?(BOOST|Boost)_])
+
+
+# _BOOST_SED_CPP(SED-PROGRAM, PROGRAM,
+# [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND])
+# --------------------------------------------------------
+# Same as AC_EGREP_CPP, but leave the result in conftest.i.
+#
+# SED-PROGRAM is *not* overquoted, as in AC_EGREP_CPP. It is expanded
+# in double-quotes, so escape your double quotes.
+#
+# It could be useful to turn this into a macro which extracts the
+# value of any macro.
+m4_define([_BOOST_SED_CPP],
+[AC_LANG_PUSH([C++])dnl
+AC_LANG_PREPROC_REQUIRE()dnl
+AC_REQUIRE([AC_PROG_SED])dnl
+AC_LANG_CONFTEST([AC_LANG_SOURCE([[$2]])])
+AS_IF([dnl eval is necessary to expand ac_cpp.
+dnl Ultrix and Pyramid sh refuse to redirect output of eval, so use subshell.
+dnl Beware of Windows end-of-lines, for instance if we are running
+dnl some Windows programs under Wine. In that case, boost/version.hpp
+dnl is certainly using "\r\n", but the regular Unix shell will only
+dnl strip `\n' with backquotes, not the `\r'. This results in
+dnl boost_cv_lib_version='1_37\r' for instance, which breaks
+dnl everything else.
+dnl Cannot use 'dnl' after [$4] because a trailing dnl may break AC_CACHE_CHECK
+(eval "$ac_cpp conftest.$ac_ext") 2>&AS_MESSAGE_LOG_FD |
+ tr -d '\r' |
+ $SED -n -e "$1" >conftest.i 2>&1],
+ [$3],
+ [$4])
+rm -rf conftest*
+AC_LANG_POP([C++])dnl
+])# _BOOST_SED_CPP
+
+
+
+# BOOST_REQUIRE([VERSION], [ACTION-IF-NOT-FOUND])
+# -----------------------------------------------
+# Look for Boost. If version is given, it must either be a literal of the form
+# "X.Y.Z" where X, Y and Z are integers (the ".Z" part being optional) or a
+# variable "$var".
+# Defines the value BOOST_CPPFLAGS. This macro only checks for headers with
+# the required version, it does not check for any of the Boost libraries.
+# On success, defines HAVE_BOOST. On failure, calls the optional
+# ACTION-IF-NOT-FOUND action if one was supplied.
+# Otherwise aborts with an error message.
+AC_DEFUN([BOOST_REQUIRE],
+[AC_REQUIRE([AC_PROG_CXX])dnl
+AC_REQUIRE([AC_PROG_GREP])dnl
+echo "$as_me: this is boost.m4[]_BOOST_SERIAL" >&AS_MESSAGE_LOG_FD
+boost_save_IFS=$IFS
+boost_version_req=$1
+IFS=.
+set x $boost_version_req 0 0 0
+IFS=$boost_save_IFS
+shift
+boost_version_req=`expr "$[1]" '*' 100000 + "$[2]" '*' 100 + "$[3]"`
+boost_version_req_string=$[1].$[2].$[3]
+AC_ARG_WITH([boost],
+ [AS_HELP_STRING([--with-boost=DIR],
+ [prefix of Boost $1 @<:@guess@:>@])])dnl
+AC_ARG_VAR([BOOST_ROOT],[Location of Boost installation])dnl
+# If BOOST_ROOT is set and the user has not provided a value to
+# --with-boost, then treat BOOST_ROOT as if it the user supplied it.
+if test x"$BOOST_ROOT" != x; then
+ if test x"$with_boost" = x; then
+ AC_MSG_NOTICE([Detected BOOST_ROOT; continuing with --with-boost=$BOOST_ROOT])
+ with_boost=$BOOST_ROOT
+ else
+ AC_MSG_NOTICE([Detected BOOST_ROOT=$BOOST_ROOT, but overridden by --with-boost=$with_boost])
+ fi
+fi
+AC_SUBST([DISTCHECK_CONFIGURE_FLAGS],
+ ["$DISTCHECK_CONFIGURE_FLAGS '--with-boost=$with_boost'"])dnl
+boost_save_CPPFLAGS=$CPPFLAGS
+ AC_CACHE_CHECK([for Boost headers version >= $boost_version_req_string],
+ [boost_cv_inc_path],
+ [boost_cv_inc_path=no
+AC_LANG_PUSH([C++])dnl
+m4_pattern_allow([^BOOST_VERSION$])dnl
+ AC_LANG_CONFTEST([AC_LANG_PROGRAM([[#include <boost/version.hpp>
+#if !defined BOOST_VERSION
+# error BOOST_VERSION is not defined
+#elif BOOST_VERSION < $boost_version_req
+# error Boost headers version < $boost_version_req
+#endif
+]])])
+ # If the user provided a value to --with-boost, use it and only it.
+ case $with_boost in #(
+ ''|yes) set x '' /opt/local/include /usr/local/include /opt/include \
+ /usr/include C:/Boost/include;; #(
+ *) set x "$with_boost/include" "$with_boost";;
+ esac
+ shift
+ for boost_dir
+ do
+ # Without --layout=system, Boost (or at least some versions) installs
+ # itself in <prefix>/include/boost-<version>. This inner loop helps to
+ # find headers in such directories.
+ #
+ # Any ${boost_dir}/boost-x_xx directories are searched in reverse version
+ # order followed by ${boost_dir}. The final '.' is a sentinel for
+ # searching $boost_dir" itself. Entries are whitespace separated.
+ #
+ # I didn't indent this loop on purpose (to avoid over-indented code)
+ boost_layout_system_search_list=`cd "$boost_dir" 2>/dev/null \
+ && ls -1 | "${GREP}" '^boost-' | sort -rn -t- -k2 \
+ && echo .`
+ for boost_inc in $boost_layout_system_search_list
+ do
+ if test x"$boost_inc" != x.; then
+ boost_inc="$boost_dir/$boost_inc"
+ else
+ boost_inc="$boost_dir" # Uses sentinel in boost_layout_system_search_list
+ fi
+ if test x"$boost_inc" != x; then
+ # We are going to check whether the version of Boost installed
+ # in $boost_inc is usable by running a compilation that
+ # #includes it. But if we pass a -I/some/path in which Boost
+ # is not installed, the compiler will just skip this -I and
+ # use other locations (either from CPPFLAGS, or from its list
+ # of system include directories). As a result we would use
+ # header installed on the machine instead of the /some/path
+ # specified by the user. So in that precise case (trying
+ # $boost_inc), make sure the version.hpp exists.
+ #
+ # Use test -e as there can be symlinks.
+ test -e "$boost_inc/boost/version.hpp" || continue
+ CPPFLAGS="$CPPFLAGS -I$boost_inc"
+ fi
+ AC_COMPILE_IFELSE([], [boost_cv_inc_path=yes], [boost_cv_version=no])
+ if test x"$boost_cv_inc_path" = xyes; then
+ if test x"$boost_inc" != x; then
+ boost_cv_inc_path=$boost_inc
+ fi
+ break 2
+ fi
+ done
+ done
+AC_LANG_POP([C++])dnl
+ ])
+ case $boost_cv_inc_path in #(
+ no)
+ boost_errmsg="cannot find Boost headers version >= $boost_version_req_string"
+ m4_if([$2], [], [AC_MSG_ERROR([$boost_errmsg])],
+ [AC_MSG_NOTICE([$boost_errmsg])])
+ $2
+ ;;#(
+ yes)
+ BOOST_CPPFLAGS=
+ ;;#(
+ *)
+ AC_SUBST([BOOST_CPPFLAGS], ["-I$boost_cv_inc_path"])dnl
+ ;;
+ esac
+ if test x"$boost_cv_inc_path" != xno; then
+ AC_DEFINE([HAVE_BOOST], [1],
+ [Defined if the requested minimum BOOST version is satisfied])
+ AC_CACHE_CHECK([for Boost's header version],
+ [boost_cv_lib_version],
+ [m4_pattern_allow([^BOOST_LIB_VERSION$])dnl
+ _BOOST_SED_CPP([/^boost-lib-version = /{s///;s/\"//g;p;q;}],
+ [#include <boost/version.hpp>
+boost-lib-version = BOOST_LIB_VERSION],
+ [boost_cv_lib_version=`cat conftest.i`])])
+ # e.g. "134" for 1_34_1 or "135" for 1_35
+ boost_major_version=`echo "$boost_cv_lib_version" | sed 's/_//;s/_.*//'`
+ case $boost_major_version in #(
+ '' | *[[!0-9]]*)
+ AC_MSG_ERROR([invalid value: boost_major_version=$boost_major_version])
+ ;;
+ esac
+fi
+CPPFLAGS=$boost_save_CPPFLAGS
+])# BOOST_REQUIRE
+
+
+# BOOST_STATIC()
+# --------------
+# Add the "--enable-static-boost" configure argument. Static versions of the
+# libraries are looked up when it is given, unless it is negated (value "no").
+AC_DEFUN([BOOST_STATIC],
+ [AC_ARG_ENABLE([static-boost],
+ [AS_HELP_STRING([--enable-static-boost],
+ [Prefer the static boost libraries over the shared ones [no]])],
+ [AS_IF([test "x$enableval" = xno], [enable_static_boost=no], [enable_static_boost=yes])],
+ [enable_static_boost=no])])# BOOST_STATIC
+
+
+# BOOST_FIND_HEADER([HEADER-NAME], [ACTION-IF-NOT-FOUND], [ACTION-IF-FOUND])
+# --------------------------------------------------------------------------
+# Wrapper around AC_CHECK_HEADER for Boost headers. Useful to check for
+# some parts of the Boost library which are only made of headers and don't
+# require linking (such as Boost.Foreach).
+#
+# Default ACTION-IF-NOT-FOUND: Fail with a fatal error unless Boost couldn't be
+# found in the first place, in which case by default a notice is issued to the
+# user. Presumably if we haven't died already it's because it's OK to not have
+# Boost, which is why only a notice is issued instead of a hard error.
+#
+# Default ACTION-IF-FOUND: define the preprocessor symbol HAVE_<HEADER-NAME> in
+# case of success (where HEADER-NAME is written LIKE_THIS, e.g.,
+# HAVE_BOOST_FOREACH_HPP).
+AC_DEFUN([BOOST_FIND_HEADER],
+[AC_REQUIRE([BOOST_REQUIRE])dnl
+if test x"$boost_cv_inc_path" = xno; then
+ m4_default([$2], [AC_MSG_NOTICE([Boost not available, not searching for $1])])
+else
+AC_LANG_PUSH([C++])dnl
+boost_save_CPPFLAGS=$CPPFLAGS
+CPPFLAGS="$CPPFLAGS $BOOST_CPPFLAGS"
+AC_CHECK_HEADER([$1],
+ [m4_default([$3], [AC_DEFINE(AS_TR_CPP([HAVE_$1]), [1],
+ [Define to 1 if you have <$1>])])],
+ [m4_default([$2], [AC_MSG_ERROR([cannot find $1])])])
+CPPFLAGS=$boost_save_CPPFLAGS
+AC_LANG_POP([C++])dnl
+fi
+])# BOOST_FIND_HEADER
+
+
+# BOOST_FIND_LIBS([COMPONENT-NAME], [CANDIDATE-LIB-NAMES],
+# [PREFERRED-RT-OPT], [HEADER-NAME], [CXX-TEST],
+# [CXX-PROLOGUE])
+# --------------------------------------------------------------
+# Look for the Boost library COMPONENT-NAME (e.g., `thread', for
+# libboost_thread) under the possible CANDIDATE-LIB-NAMES (e.g.,
+# "thread_win32 thread"). Check that HEADER-NAME works and check that
+# libboost_LIB-NAME can link with the code CXX-TEST. The optional
+# argument CXX-PROLOGUE can be used to include some C++ code before
+# the `main' function.
+#
+# Invokes BOOST_FIND_HEADER([HEADER-NAME]) (see above).
+#
+# Boost libraries typically come compiled with several flavors (with different
+# runtime options) so PREFERRED-RT-OPT is the preferred suffix. A suffix is one
+# or more of the following letters: sgdpn (in that order). s = static
+# runtime, d = debug build, g = debug/diagnostic runtime, p = STLPort build,
+# n = (unsure) STLPort build without iostreams from STLPort (it looks like `n'
+# must always be used along with `p'). Additionally, PREFERRED-RT-OPT can
+# start with `mt-' to indicate that there is a preference for multi-thread
+# builds. Some sample values for PREFERRED-RT-OPT: (nothing), mt, d, mt-d, gdp
+# ... If you want to make sure you have a specific version of Boost
+# (eg, >= 1.33) you *must* invoke BOOST_REQUIRE before this macro.
+AC_DEFUN([BOOST_FIND_LIBS],
+[AC_REQUIRE([BOOST_REQUIRE])dnl
+AC_REQUIRE([_BOOST_FIND_COMPILER_TAG])dnl
+AC_REQUIRE([BOOST_STATIC])dnl
+AC_REQUIRE([_BOOST_GUESS_WHETHER_TO_USE_MT])dnl
+if test x"$boost_cv_inc_path" = xno; then
+ AC_MSG_NOTICE([Boost not available, not searching for the Boost $1 library])
+else
+dnl The else branch is huge and wasn't indented on purpose.
+AC_LANG_PUSH([C++])dnl
+AS_VAR_PUSHDEF([Boost_lib], [boost_cv_lib_$1])dnl
+AS_VAR_PUSHDEF([Boost_lib_LDFLAGS], [boost_cv_lib_$1_LDFLAGS])dnl
+AS_VAR_PUSHDEF([Boost_lib_LDPATH], [boost_cv_lib_$1_LDPATH])dnl
+AS_VAR_PUSHDEF([Boost_lib_LIBS], [boost_cv_lib_$1_LIBS])dnl
+BOOST_FIND_HEADER([$4])
+boost_save_CPPFLAGS=$CPPFLAGS
+CPPFLAGS="$CPPFLAGS $BOOST_CPPFLAGS"
+AC_CACHE_CHECK([for the Boost $1 library], [Boost_lib],
+ [_BOOST_FIND_LIBS($@)])
+case $Boost_lib in #(
+ (no) _AC_MSG_LOG_CONFTEST
+ AC_MSG_ERROR([cannot find the flags to link with Boost $1])
+ ;;
+esac
+AC_SUBST(AS_TR_CPP([BOOST_$1_LDFLAGS]), [$Boost_lib_LDFLAGS])dnl
+AC_SUBST(AS_TR_CPP([BOOST_$1_LDPATH]), [$Boost_lib_LDPATH])dnl
+AC_SUBST([BOOST_LDPATH], [$Boost_lib_LDPATH])dnl
+AC_SUBST(AS_TR_CPP([BOOST_$1_LIBS]), [$Boost_lib_LIBS])dnl
+CPPFLAGS=$boost_save_CPPFLAGS
+AS_VAR_POPDEF([Boost_lib])dnl
+AS_VAR_POPDEF([Boost_lib_LDFLAGS])dnl
+AS_VAR_POPDEF([Boost_lib_LDPATH])dnl
+AS_VAR_POPDEF([Boost_lib_LIBS])dnl
+AC_LANG_POP([C++])dnl
+fi
+])
+
+
+# BOOST_FIND_LIB([LIB-NAME],
+# [PREFERRED-RT-OPT], [HEADER-NAME], [CXX-TEST],
+# [CXX-PROLOGUE])
+# --------------------------------------------------------------
+# Backward compatibility wrapper for BOOST_FIND_LIBS.
+AC_DEFUN([BOOST_FIND_LIB],
+[BOOST_FIND_LIBS([$1], $@)])
+
+
+# _BOOST_FIND_LIBS([LIB-NAME], [CANDIDATE-LIB-NAMES],
+# [PREFERRED-RT-OPT], [HEADER-NAME], [CXX-TEST],
+# [CXX-PROLOGUE])
+# --------------------------------------------------------------
+# Real implementation of BOOST_FIND_LIBS: rely on these local macros:
+# Boost_lib, Boost_lib_LDFLAGS, Boost_lib_LDPATH, Boost_lib_LIBS
+#
+# The algorithm is as follows: first look for a given library name
+# according to the user's PREFERRED-RT-OPT. For each library name, we
+# prefer to use the ones that carry the tag (toolset name). Each
+# library is searched through the various standard paths where Boost is
+# usually installed. If we can't find the standard variants, we try
+# to enforce -mt (for instance on MacOSX, libboost_thread.dylib
+# doesn't exist but there's -obviously- libboost_thread-mt.dylib).
+AC_DEFUN([_BOOST_FIND_LIBS],
+[Boost_lib=no
+ case "$3" in #(
+ (mt | mt-) boost_mt=-mt; boost_rtopt=;; #(
+ (mt* | mt-*) boost_mt=-mt; boost_rtopt=`expr "X$3" : 'Xmt-*\(.*\)'`;; #(
+ (*) boost_mt=; boost_rtopt=$3;;
+ esac
+ if test $enable_static_boost = yes; then
+ boost_rtopt="s$boost_rtopt"
+ fi
+ # Find the proper debug variant depending on what we've been asked to find.
+ case $boost_rtopt in #(
+ (*d*) boost_rt_d=$boost_rtopt;; #(
+ (*[[sgpn]]*) # Insert the `d' at the right place (in between `sg' and `pn')
+ boost_rt_d=`echo "$boost_rtopt" | sed 's/\(s*g*\)\(p*n*\)/\1\2/'`;; #(
+ (*) boost_rt_d='-d';;
+ esac
+ # If the PREFERRED-RT-OPT are not empty, prepend a `-'.
+ test -n "$boost_rtopt" && boost_rtopt="-$boost_rtopt"
+ $boost_guess_use_mt && boost_mt=-mt
+ # Look for the abs path the static archive.
+ # $libext is computed by Libtool but let's make sure it's non empty.
+ test -z "$libext" &&
+ AC_MSG_ERROR([the libext variable is empty, did you invoke Libtool?])
+ boost_save_ac_objext=$ac_objext
+ # Generate the test file.
+ AC_LANG_CONFTEST([AC_LANG_PROGRAM([#include <$4>
+$6], [$5])])
+dnl Optimization hacks: compiling C++ is slow, especially with Boost. What
+dnl we're trying to do here is guess the right combination of link flags
+dnl (LIBS / LDFLAGS) to use a given library. This can take several
+dnl iterations before it succeeds and is thus *very* slow. So what we do
+dnl instead is that we compile the code first (and thus get an object file,
+dnl typically conftest.o). Then we try various combinations of link flags
+dnl until we succeed to link conftest.o in an executable. The problem is
+dnl that the various TRY_LINK / COMPILE_IFELSE macros of Autoconf always
+dnl remove all the temporary files including conftest.o. So the trick here
+dnl is to temporarily change the value of ac_objext so that conftest.o is
+dnl preserved accross tests. This is obviously fragile and I will burn in
+dnl hell for not respecting Autoconf's documented interfaces, but in the
+dnl mean time, it optimizes the macro by a factor of 5 to 30.
+dnl Another small optimization: the first argument of AC_COMPILE_IFELSE left
+dnl empty because the test file is generated only once above (before we
+dnl start the for loops).
+ AC_COMPILE_IFELSE([],
+ [ac_objext=do_not_rm_me_plz],
+ [AC_MSG_ERROR([cannot compile a test that uses Boost $1])])
+ ac_objext=$boost_save_ac_objext
+ boost_failed_libs=
+# Don't bother to ident the following nested for loops, only the 2
+# innermost ones matter.
+for boost_lib_ in $2; do
+for boost_tag_ in -$boost_cv_lib_tag ''; do
+for boost_ver_ in -$boost_cv_lib_version ''; do
+for boost_mt_ in $boost_mt -mt ''; do
+for boost_rtopt_ in $boost_rtopt '' -d; do
+ for boost_lib in \
+ boost_$boost_lib_$boost_tag_$boost_mt_$boost_rtopt_$boost_ver_ \
+ boost_$boost_lib_$boost_tag_$boost_rtopt_$boost_ver_ \
+ boost_$boost_lib_$boost_tag_$boost_mt_$boost_ver_ \
+ boost_$boost_lib_$boost_tag_$boost_ver_
+ do
+ # Avoid testing twice the same lib
+ case $boost_failed_libs in #(
+ (*@$boost_lib@*) continue;;
+ esac
+ # If with_boost is empty, we'll search in /lib first, which is not quite
+ # right so instead we'll try to a location based on where the headers are.
+ boost_tmp_lib=$with_boost
+ test x"$with_boost" = x && boost_tmp_lib=${boost_cv_inc_path%/include}
+ for boost_ldpath in "$boost_tmp_lib/lib" '' \
+ /opt/local/lib* /usr/local/lib* /opt/lib* /usr/lib* \
+ "$with_boost" C:/Boost/lib /lib*
+ do
+ # Don't waste time with directories that don't exist.
+ if test x"$boost_ldpath" != x && test ! -e "$boost_ldpath"; then
+ continue
+ fi
+ boost_save_LDFLAGS=$LDFLAGS
+ # Are we looking for a static library?
+ case $boost_ldpath:$boost_rtopt_ in #(
+ (*?*:*s*) # Yes (Non empty boost_ldpath + s in rt opt)
+ Boost_lib_LIBS="$boost_ldpath/lib$boost_lib.$libext"
+ test -e "$Boost_lib_LIBS" || continue;; #(
+ (*) # No: use -lboost_foo to find the shared library.
+ Boost_lib_LIBS="-l$boost_lib";;
+ esac
+ boost_save_LIBS=$LIBS
+ LIBS="$Boost_lib_LIBS $LIBS"
+ test x"$boost_ldpath" != x && LDFLAGS="$LDFLAGS -L$boost_ldpath"
+dnl First argument of AC_LINK_IFELSE left empty because the test file is
+dnl generated only once above (before we start the for loops).
+ _BOOST_AC_LINK_IFELSE([],
+ [Boost_lib=yes], [Boost_lib=no])
+ ac_objext=$boost_save_ac_objext
+ LDFLAGS=$boost_save_LDFLAGS
+ LIBS=$boost_save_LIBS
+ if test x"$Boost_lib" = xyes; then
+ # Check or used cached result of whether or not using -R or
+ # -rpath makes sense. Some implementations of ld, such as for
+ # Mac OSX, require -rpath but -R is the flag known to work on
+ # other systems. https://github.com/tsuna/boost.m4/issues/19
+ AC_CACHE_VAL([boost_cv_rpath_link_ldflag],
+ [case $boost_ldpath in
+ '') # Nothing to do.
+ boost_cv_rpath_link_ldflag=
+ boost_rpath_link_ldflag_found=yes;;
+ *)
+ for boost_cv_rpath_link_ldflag in -Wl,-R, -Wl,-rpath,; do
+ LDFLAGS="$boost_save_LDFLAGS -L$boost_ldpath $boost_cv_rpath_link_ldflag$boost_ldpath"
+ LIBS="$boost_save_LIBS $Boost_lib_LIBS"
+ _BOOST_AC_LINK_IFELSE([],
+ [boost_rpath_link_ldflag_found=yes
+ break],
+ [boost_rpath_link_ldflag_found=no])
+ done
+ ;;
+ esac
+ AS_IF([test "x$boost_rpath_link_ldflag_found" != "xyes"],
+ [AC_MSG_ERROR([Unable to determine whether to use -R or -rpath])])
+ LDFLAGS=$boost_save_LDFLAGS
+ LIBS=$boost_save_LIBS
+ ])
+ test x"$boost_ldpath" != x &&
+ Boost_lib_LDFLAGS="-L$boost_ldpath $boost_cv_rpath_link_ldflag$boost_ldpath"
+ Boost_lib_LDPATH="$boost_ldpath"
+ break 7
+ else
+ boost_failed_libs="$boost_failed_libs@$boost_lib@"
+ fi
+ done
+ done
+done
+done
+done
+done
+done # boost_lib_
+rm -f conftest.$ac_objext
+])
+
+
+
+# --------------------------------------- #
+# Checks for the various Boost libraries. #
+# --------------------------------------- #
+
+# List of boost libraries: http://www.boost.org/libs/libraries.htm
+# The page http://beta.boost.org/doc/libs is useful: it gives the first release
+# version of each library (among other things).
+
+# BOOST_DEFUN(LIBRARY, CODE)
+# --------------------------
+# Define BOOST_<LIBRARY-UPPERCASE> as a macro that runs CODE.
+# While CODE is being expanded, the helper macro BOOST_Library is defined to
+# LIBRARY (pushed before CODE, popped right after it).
+#
+# Use indir to avoid the warning on underquoted macro name given to AC_DEFUN.
+m4_define([BOOST_DEFUN],
+[m4_indir([AC_DEFUN],
+ m4_toupper([BOOST_$1]),
+[m4_pushdef([BOOST_Library], [$1])dnl
+$2
+m4_popdef([BOOST_Library])dnl
+])
+])
+
+# BOOST_ARRAY()
+# -------------
+# Look for Boost.Array by checking for the header boost/array.hpp through
+# BOOST_FIND_HEADER.
+BOOST_DEFUN([Array],
+[BOOST_FIND_HEADER([boost/array.hpp])])
+
+
+# BOOST_ASIO()
+# ------------
+# Look for Boost.Asio (new in Boost 1.35).
+# Requires BOOST_SYSTEM to be expanded first (AC_REQUIRE), then checks for the
+# header boost/asio.hpp through BOOST_FIND_HEADER.
+BOOST_DEFUN([Asio],
+[AC_REQUIRE([BOOST_SYSTEM])dnl
+BOOST_FIND_HEADER([boost/asio.hpp])])
+
+
+# BOOST_BIND()
+# ------------
+# Look for Boost.Bind by checking for the header boost/bind.hpp through
+# BOOST_FIND_HEADER.
+BOOST_DEFUN([Bind],
+[BOOST_FIND_HEADER([boost/bind.hpp])])
+
+
+# BOOST_CHRONO([PREFERRED-RT-OPT])
+# --------------------------------
+# Look for Boost.Chrono. For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIB above.
+# For Boost >= 1.35, Boost.System is looked up first and its LIBS / LDFLAGS
+# are added to the link line while probing for Boost.Chrono. When static
+# Boost libraries were requested, the Boost.System libraries are also
+# appended to BOOST_CHRONO_LIBS so that users of that variable link them too.
+# LIBS and LDFLAGS are restored to their previous values afterwards.
+BOOST_DEFUN([Chrono],
+[# Do we have to check for Boost.System? This link-time dependency was
+# added as of 1.35.0. If we have a version <1.35, we must not attempt to
+# find Boost.System as it didn't exist by then.
+if test $boost_major_version -ge 135; then
+ BOOST_SYSTEM([$1])
+fi # end of the Boost.System check.
+boost_chrono_save_LIBS=$LIBS
+boost_chrono_save_LDFLAGS=$LDFLAGS
+m4_pattern_allow([^BOOST_SYSTEM_(LIBS|LDFLAGS)$])dnl
+LIBS="$LIBS $BOOST_SYSTEM_LIBS"
+LDFLAGS="$LDFLAGS $BOOST_SYSTEM_LDFLAGS"
+BOOST_FIND_LIB([chrono], [$1],
+ [boost/chrono.hpp],
+ [boost::chrono::thread_clock d;])
+if test $enable_static_boost = yes && test $boost_major_version -ge 135; then
+ BOOST_CHRONO_LIBS="$BOOST_CHRONO_LIBS $BOOST_SYSTEM_LIBS"
+fi
+LIBS=$boost_chrono_save_LIBS
+LDFLAGS=$boost_chrono_save_LDFLAGS
+])# BOOST_CHRONO
+
+
+# BOOST_CONVERSION()
+# ------------------
+# Look for Boost.Conversion (cast / lexical_cast) by checking for the headers
+# boost/cast.hpp and boost/lexical_cast.hpp through BOOST_FIND_HEADER.
+BOOST_DEFUN([Conversion],
+[BOOST_FIND_HEADER([boost/cast.hpp])
+BOOST_FIND_HEADER([boost/lexical_cast.hpp])
+])# BOOST_CONVERSION
+
+
+# BOOST_CRC()
+# -----------
+# Look for Boost.CRC by checking for the header boost/crc.hpp through
+# BOOST_FIND_HEADER.
+BOOST_DEFUN([CRC],
+[BOOST_FIND_HEADER([boost/crc.hpp])
+])# BOOST_CRC
+
+
+# BOOST_DATE_TIME([PREFERRED-RT-OPT])
+# -----------------------------------
+# Look for Boost.Date_Time. For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIB above.
+# The link test includes boost/date_time/posix_time/posix_time.hpp and
+# declares a boost::posix_time::ptime object.
+BOOST_DEFUN([Date_Time],
+[BOOST_FIND_LIB([date_time], [$1],
+ [boost/date_time/posix_time/posix_time.hpp],
+ [boost::posix_time::ptime t;])
+])# BOOST_DATE_TIME
+
+
+# BOOST_FILESYSTEM([PREFERRED-RT-OPT])
+# ------------------------------------
+# Look for Boost.Filesystem. For the documentation of PREFERRED-RT-OPT, see
+# the documentation of BOOST_FIND_LIB above.
+# Do not check for boost/filesystem.hpp because this file was introduced in
+# 1.34.
+# For Boost >= 1.35, Boost.System is looked up first and its LIBS / LDFLAGS
+# are added to the link line while probing for Boost.Filesystem. When static
+# Boost libraries were requested, the Boost.System libraries are also
+# appended to BOOST_FILESYSTEM_LIBS. LIBS and LDFLAGS are restored to their
+# previous values afterwards.
+BOOST_DEFUN([Filesystem],
+[# Do we have to check for Boost.System? This link-time dependency was
+# added as of 1.35.0. If we have a version <1.35, we must not attempt to
+# find Boost.System as it didn't exist by then.
+if test $boost_major_version -ge 135; then
+ BOOST_SYSTEM([$1])
+fi # end of the Boost.System check.
+boost_filesystem_save_LIBS=$LIBS
+boost_filesystem_save_LDFLAGS=$LDFLAGS
+m4_pattern_allow([^BOOST_SYSTEM_(LIBS|LDFLAGS)$])dnl
+LIBS="$LIBS $BOOST_SYSTEM_LIBS"
+LDFLAGS="$LDFLAGS $BOOST_SYSTEM_LDFLAGS"
+BOOST_FIND_LIB([filesystem], [$1],
+ [boost/filesystem/path.hpp], [boost::filesystem::path p;])
+if test $enable_static_boost = yes && test $boost_major_version -ge 135; then
+ BOOST_FILESYSTEM_LIBS="$BOOST_FILESYSTEM_LIBS $BOOST_SYSTEM_LIBS"
+fi
+LIBS=$boost_filesystem_save_LIBS
+LDFLAGS=$boost_filesystem_save_LDFLAGS
+])# BOOST_FILESYSTEM
+
+
+# BOOST_FLYWEIGHT()
+# -----------------
+# Look for Boost.Flyweight by checking for the header boost/flyweight.hpp.
+# Requires _BOOST_PTHREAD_FLAG and substitutes BOOST_FLYWEIGHT_LIBS with the
+# pthread flag it computed (boost_cv_pthread_flag).
+BOOST_DEFUN([Flyweight],
+[dnl There's a hidden dependency on pthreads.
+AC_REQUIRE([_BOOST_PTHREAD_FLAG])dnl
+BOOST_FIND_HEADER([boost/flyweight.hpp])
+AC_SUBST([BOOST_FLYWEIGHT_LIBS], [$boost_cv_pthread_flag])
+])
+
+
+# BOOST_FOREACH()
+# ---------------
+# Look for Boost.Foreach by checking for the header boost/foreach.hpp through
+# BOOST_FIND_HEADER.
+BOOST_DEFUN([Foreach],
+[BOOST_FIND_HEADER([boost/foreach.hpp])])
+
+
+# BOOST_FORMAT()
+# --------------
+# Look for Boost.Format by checking for the header boost/format.hpp through
+# BOOST_FIND_HEADER.
+# Note: we can't check for boost/format/format_fwd.hpp because the header isn't
+# standalone. It can't be compiled because it triggers the following error:
+# boost/format/detail/config_macros.hpp:88: error: 'locale' in namespace 'std'
+# does not name a type
+BOOST_DEFUN([Format],
+[BOOST_FIND_HEADER([boost/format.hpp])])
+
+
+# BOOST_FUNCTION()
+# ----------------
+# Look for Boost.Function by checking for the header boost/function.hpp
+# through BOOST_FIND_HEADER.
+BOOST_DEFUN([Function],
+[BOOST_FIND_HEADER([boost/function.hpp])])
+
+
+# BOOST_GEOMETRY()
+# ----------------
+# Look for Boost.Geometry (new since 1.47.0) by checking for the header
+# boost/geometry.hpp through BOOST_FIND_HEADER.
+BOOST_DEFUN([Geometry],
+[BOOST_FIND_HEADER([boost/geometry.hpp])
+])# BOOST_GEOMETRY
+
+
+# BOOST_GRAPH([PREFERRED-RT-OPT])
+# -------------------------------
+# Look for Boost.Graph. For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIB above.
+# The link test includes boost/graph/adjacency_list.hpp and declares a
+# boost::adjacency_list<> object.
+BOOST_DEFUN([Graph],
+[BOOST_FIND_LIB([graph], [$1],
+ [boost/graph/adjacency_list.hpp], [boost::adjacency_list<> g;])
+])# BOOST_GRAPH
+
+
+# BOOST_IOSTREAMS([PREFERRED-RT-OPT])
+# -----------------------------------
+# Look for Boost.IOStreams. For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIB above.
+# The link test includes boost/iostreams/device/file_descriptor.hpp and calls
+# close() on a boost::iostreams::file_descriptor.
+BOOST_DEFUN([IOStreams],
+[BOOST_FIND_LIB([iostreams], [$1],
+ [boost/iostreams/device/file_descriptor.hpp],
+ [boost::iostreams::file_descriptor fd; fd.close();])
+])# BOOST_IOSTREAMS
+
+
+# BOOST_HASH()
+# ------------
+# Look for Boost.Functional/Hash by checking for the header
+# boost/functional/hash.hpp through BOOST_FIND_HEADER.
+BOOST_DEFUN([Hash],
+[BOOST_FIND_HEADER([boost/functional/hash.hpp])])
+
+
+# BOOST_LAMBDA()
+# --------------
+# Look for Boost.Lambda by checking for the header boost/lambda/lambda.hpp
+# through BOOST_FIND_HEADER.
+BOOST_DEFUN([Lambda],
+[BOOST_FIND_HEADER([boost/lambda/lambda.hpp])])
+
+
+# BOOST_LOG([PREFERRED-RT-OPT])
+# -----------------------------
+# Look for Boost.Log. For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIB above.
+# The link test includes boost/log/core/core.hpp and calls get_value() on a
+# boost::log::attribute.
+BOOST_DEFUN([Log],
+[BOOST_FIND_LIB([log], [$1],
+ [boost/log/core/core.hpp],
+ [boost::log::attribute a; a.get_value();])
+])# BOOST_LOG
+
+
+# BOOST_LOG_SETUP([PREFERRED-RT-OPT])
+# -----------------------------------
+# Look for the Boost.Log setup library (log_setup). For the documentation of
+# PREFERRED-RT-OPT, see the documentation of BOOST_FIND_LIB above.
+# Requires BOOST_LOG to be expanded first (AC_REQUIRE). The link test includes
+# boost/log/utility/setup/from_settings.hpp and calls empty() on a
+# boost::log::basic_settings<char>.
+BOOST_DEFUN([Log_Setup],
+[AC_REQUIRE([BOOST_LOG])dnl
+BOOST_FIND_LIB([log_setup], [$1],
+ [boost/log/utility/setup/from_settings.hpp],
+ [boost::log::basic_settings<char> bs; bs.empty();])
+])# BOOST_LOG_SETUP
+
+
+# BOOST_MATH()
+# ------------
+# Look for Boost.Math. Currently this only checks for the header
+# boost/math/special_functions.hpp; no library is searched for.
+# TODO: This library isn't header-only but it comes in multiple different
+# flavors that don't play well with BOOST_FIND_LIB (e.g, libboost_math_c99,
+# libboost_math_c99f, libboost_math_c99l, libboost_math_tr1,
+# libboost_math_tr1f, libboost_math_tr1l). This macro must be fixed to do the
+# right thing anyway.
+BOOST_DEFUN([Math],
+[BOOST_FIND_HEADER([boost/math/special_functions.hpp])])
+
+
+# BOOST_MPI([PREFERRED-RT-OPT])
+# -----------------------------
+# Look for Boost MPI. For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIB above. Uses MPICXX variable if it is
+# set, otherwise tries CXX.
+# While the check runs with MPICXX, CXXCPP is set to "$MPICXX -E"; CXX and
+# CXXCPP are restored to their previous values afterwards.
+#
+BOOST_DEFUN([MPI],
+[boost_save_CXX=${CXX}
+boost_save_CXXCPP=${CXXCPP}
+if test x"${MPICXX}" != x; then
+ CXX=${MPICXX}
+ CXXCPP="${MPICXX} -E"
+fi
+BOOST_FIND_LIB([mpi], [$1],
+ [boost/mpi.hpp],
+ [int argc = 0;
+ char **argv = 0;
+ boost::mpi::environment env(argc,argv);])
+CXX=${boost_save_CXX}
+CXXCPP=${boost_save_CXXCPP}
+])# BOOST_MPI
+
+
+# BOOST_MULTIARRAY()
+# ------------------
+# Look for Boost.MultiArray by checking for the header boost/multi_array.hpp
+# through BOOST_FIND_HEADER.
+BOOST_DEFUN([MultiArray],
+[BOOST_FIND_HEADER([boost/multi_array.hpp])])
+
+
+# BOOST_NUMERIC_UBLAS()
+# ---------------------
+# Look for Boost.NumericUblas (Basic Linear Algebra) by checking for the
+# header boost/numeric/ublas/vector.hpp through BOOST_FIND_HEADER.
+BOOST_DEFUN([Numeric_Ublas],
+[BOOST_FIND_HEADER([boost/numeric/ublas/vector.hpp])
+])# BOOST_NUMERIC_UBLAS
+
+
+# BOOST_NUMERIC_CONVERSION()
+# --------------------------
+# Look for Boost.NumericConversion (policy-based numeric conversion) by
+# checking for the header boost/numeric/conversion/converter.hpp through
+# BOOST_FIND_HEADER.
+BOOST_DEFUN([Numeric_Conversion],
+[BOOST_FIND_HEADER([boost/numeric/conversion/converter.hpp])
+])# BOOST_NUMERIC_CONVERSION
+
+
+# BOOST_OPTIONAL()
+# ----------------
+# Look for Boost.Optional by checking for the header boost/optional.hpp
+# through BOOST_FIND_HEADER.
+BOOST_DEFUN([Optional],
+[BOOST_FIND_HEADER([boost/optional.hpp])])
+
+
+# BOOST_PREPROCESSOR()
+# --------------------
+# Look for Boost.Preprocessor by checking for the header
+# boost/preprocessor/repeat.hpp through BOOST_FIND_HEADER.
+BOOST_DEFUN([Preprocessor],
+[BOOST_FIND_HEADER([boost/preprocessor/repeat.hpp])])
+
+
+# BOOST_UNORDERED()
+# -----------------
+# Look for Boost.Unordered by checking for the header boost/unordered_map.hpp
+# through BOOST_FIND_HEADER.
+BOOST_DEFUN([Unordered],
+[BOOST_FIND_HEADER([boost/unordered_map.hpp])])
+
+
+# BOOST_UUID()
+# ------------
+# Look for Boost.Uuid by checking for the header boost/uuid/uuid.hpp through
+# BOOST_FIND_HEADER.
+BOOST_DEFUN([Uuid],
+[BOOST_FIND_HEADER([boost/uuid/uuid.hpp])])
+
+
+# BOOST_PROGRAM_OPTIONS([PREFERRED-RT-OPT])
+# -----------------------------------------
+# Look for Boost.Program_options. For the documentation of PREFERRED-RT-OPT,
+# see the documentation of BOOST_FIND_LIB above.
+# The link test includes boost/program_options.hpp and constructs a
+# boost::program_options::options_description.
+BOOST_DEFUN([Program_Options],
+[BOOST_FIND_LIB([program_options], [$1],
+ [boost/program_options.hpp],
+ [boost::program_options::options_description d("test");])
+])# BOOST_PROGRAM_OPTIONS
+
+
+
+# _BOOST_PYTHON_CONFIG(VARIABLE, FLAG)
+# ------------------------------------
+# Save VARIABLE, and define it via `python-config --FLAG`.
+# Substitute BOOST_PYTHON_VARIABLE.
+# For example, _BOOST_PYTHON_CONFIG([LIBS], [libs]) substitutes
+# BOOST_PYTHON_LIBS with the output of `python-config --libs` (stderr is
+# discarded), saves the current LIBS in boost_python_save_LIBS and appends
+# $BOOST_PYTHON_LIBS to LIBS. The caller is expected to restore VARIABLE.
+m4_define([_BOOST_PYTHON_CONFIG],
+[AC_SUBST([BOOST_PYTHON_$1],
+ [`python-config --$2 2>/dev/null`])dnl
+boost_python_save_$1=$$1
+$1="$$1 $BOOST_PYTHON_$1"])
+
+
+# BOOST_PYTHON([PREFERRED-RT-OPT])
+# --------------------------------
+# Look for Boost.Python. For the documentation of PREFERRED-RT-OPT,
+# see the documentation of BOOST_FIND_LIB above.
+# CPPFLAGS, LDFLAGS and LIBS are temporarily extended with the output of
+# python-config (see _BOOST_PYTHON_CONFIG) and restored after the check.
+# Two candidate library names are tried: python and python3.
+BOOST_DEFUN([Python],
+[_BOOST_PYTHON_CONFIG([CPPFLAGS], [includes])
+_BOOST_PYTHON_CONFIG([LDFLAGS], [ldflags])
+_BOOST_PYTHON_CONFIG([LIBS], [libs])
+m4_pattern_allow([^BOOST_PYTHON_MODULE$])dnl
+BOOST_FIND_LIBS([python], [python python3], [$1],
+ [boost/python.hpp],
+ [], [BOOST_PYTHON_MODULE(empty) {}])
+CPPFLAGS=$boost_python_save_CPPFLAGS
+LDFLAGS=$boost_python_save_LDFLAGS
+LIBS=$boost_python_save_LIBS
+])# BOOST_PYTHON
+
+
+# BOOST_REF()
+# -----------
+# Look for Boost.Ref by checking for the header boost/ref.hpp through
+# BOOST_FIND_HEADER.
+BOOST_DEFUN([Ref],
+[BOOST_FIND_HEADER([boost/ref.hpp])])
+
+
+# BOOST_REGEX([PREFERRED-RT-OPT])
+# -------------------------------
+# Look for Boost.Regex. For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIB above.
+# The link test includes boost/regex.hpp, constructs a boost::regex and calls
+# boost::regex_match with it.
+BOOST_DEFUN([Regex],
+[BOOST_FIND_LIB([regex], [$1],
+ [boost/regex.hpp],
+ [boost::regex exp("*"); boost::regex_match("foo", exp);])
+])# BOOST_REGEX
+
+
+# BOOST_SERIALIZATION([PREFERRED-RT-OPT])
+# ---------------------------------------
+# Look for Boost.Serialization. For the documentation of PREFERRED-RT-OPT, see
+# the documentation of BOOST_FIND_LIB above.
+# The link test includes boost/archive/text_oarchive.hpp and constructs a
+# boost::archive::text_oarchive from a null std::ostream pointer; the test
+# body only needs to compile and link.
+BOOST_DEFUN([Serialization],
+[BOOST_FIND_LIB([serialization], [$1],
+ [boost/archive/text_oarchive.hpp],
+ [std::ostream* o = 0; // Cheap way to get an ostream...
+ boost::archive::text_oarchive t(*o);])
+])# BOOST_SERIALIZATION
+
+
+# BOOST_SIGNALS([PREFERRED-RT-OPT])
+# ---------------------------------
+# Look for Boost.Signals. For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIB above.
+# The link test includes boost/signal.hpp and declares a
+# boost::signal<void ()> object.
+BOOST_DEFUN([Signals],
+[BOOST_FIND_LIB([signals], [$1],
+ [boost/signal.hpp],
+ [boost::signal<void ()> s;])
+])# BOOST_SIGNALS
+
+
+# BOOST_SIGNALS2()
+# ----------------
+# Look for Boost.Signals2 (new since 1.39.0) by checking for the header
+# boost/signals2.hpp through BOOST_FIND_HEADER.
+BOOST_DEFUN([Signals2],
+[BOOST_FIND_HEADER([boost/signals2.hpp])
+])# BOOST_SIGNALS2
+
+
+# BOOST_SMART_PTR()
+# -----------------
+# Look for Boost.SmartPtr by checking for the headers boost/scoped_ptr.hpp
+# and boost/shared_ptr.hpp through BOOST_FIND_HEADER.
+BOOST_DEFUN([Smart_Ptr],
+[BOOST_FIND_HEADER([boost/scoped_ptr.hpp])
+BOOST_FIND_HEADER([boost/shared_ptr.hpp])
+])
+
+
+# BOOST_STATICASSERT()
+# --------------------
+# Look for Boost.StaticAssert by checking for the header
+# boost/static_assert.hpp through BOOST_FIND_HEADER.
+BOOST_DEFUN([StaticAssert],
+[BOOST_FIND_HEADER([boost/static_assert.hpp])])
+
+
+# BOOST_STRING_ALGO()
+# -------------------
+# Look for Boost.StringAlgo by checking for the header
+# boost/algorithm/string.hpp through BOOST_FIND_HEADER.
+BOOST_DEFUN([String_Algo],
+[BOOST_FIND_HEADER([boost/algorithm/string.hpp])
+])
+
+
+# BOOST_SYSTEM([PREFERRED-RT-OPT])
+# --------------------------------
+# Look for Boost.System. For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIB above. This library was introduced in Boost
+# 1.35.0.
+# The link test includes boost/system/error_code.hpp and calls clear() on a
+# boost::system::error_code.
+BOOST_DEFUN([System],
+[BOOST_FIND_LIB([system], [$1],
+ [boost/system/error_code.hpp],
+ [boost::system::error_code e; e.clear();])
+])# BOOST_SYSTEM
+
+
+# BOOST_TEST([PREFERRED-RT-OPT])
+# ------------------------------
+# Look for Boost.Test. For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIB above.
+# The library searched for is unit_test_framework. The link test includes
+# boost/test/unit_test.hpp, uses BOOST_CHECK in its body and defines
+# init_unit_test_suite in its prologue.
+BOOST_DEFUN([Test],
+[m4_pattern_allow([^BOOST_CHECK$])dnl
+BOOST_FIND_LIB([unit_test_framework], [$1],
+ [boost/test/unit_test.hpp], [BOOST_CHECK(2 == 2);],
+ [using boost::unit_test::test_suite;
+ test_suite* init_unit_test_suite(int argc, char ** argv)
+ { return NULL; }])
+])# BOOST_TEST
+
+
+# BOOST_THREAD([PREFERRED-RT-OPT])
+# --------------------------------
+# Look for Boost.Thread. For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIB above.
+# The pthread flag computed by _BOOST_PTHREAD_FLAG is used for the check and
+# is added to BOOST_THREAD_LIBS and BOOST_CPPFLAGS. For Boost >= 1.49 the
+# Boost.System libraries and linker flags are added as well. LIBS, LDFLAGS
+# and CPPFLAGS are restored to their previous values afterwards.
+BOOST_DEFUN([Thread],
+[dnl Having the pthread flag is required at least on GCC3 where
+dnl boost/thread.hpp would complain if we try to compile without
+dnl -pthread on GNU/Linux.
+AC_REQUIRE([_BOOST_PTHREAD_FLAG])dnl
+boost_thread_save_LIBS=$LIBS
+boost_thread_save_LDFLAGS=$LDFLAGS
+boost_thread_save_CPPFLAGS=$CPPFLAGS
+# Link-time dependency from thread to system was added as of 1.49.0.
+if test $boost_major_version -ge 149; then
+BOOST_SYSTEM([$1])
+fi # end of the Boost.System check.
+m4_pattern_allow([^BOOST_SYSTEM_(LIBS|LDFLAGS)$])dnl
+LIBS="$LIBS $BOOST_SYSTEM_LIBS $boost_cv_pthread_flag"
+LDFLAGS="$LDFLAGS $BOOST_SYSTEM_LDFLAGS"
+CPPFLAGS="$CPPFLAGS $boost_cv_pthread_flag"
+
+# When compiling for the Windows platform, the threads library is named
+# differently.
+dnl Always assign the suffix so that a stray value inherited from the
+dnl environment cannot leak into the candidate library name.
+case $host_os in
+ (*mingw*) boost_thread_lib_ext=_win32;;
+ (*) boost_thread_lib_ext=;;
+esac
+BOOST_FIND_LIBS([thread], [thread$boost_thread_lib_ext],
+ [$1],
+ [boost/thread.hpp], [boost::thread t; boost::mutex m;])
+
+BOOST_THREAD_LIBS="$BOOST_THREAD_LIBS $BOOST_SYSTEM_LIBS $boost_cv_pthread_flag"
+dnl Keep the linker flags found for Boost.Thread itself and add those of
+dnl Boost.System, mirroring what is done for the LIBS variable above.
+BOOST_THREAD_LDFLAGS="$BOOST_THREAD_LDFLAGS $BOOST_SYSTEM_LDFLAGS"
+BOOST_CPPFLAGS="$BOOST_CPPFLAGS $boost_cv_pthread_flag"
+LIBS=$boost_thread_save_LIBS
+LDFLAGS=$boost_thread_save_LDFLAGS
+CPPFLAGS=$boost_thread_save_CPPFLAGS
+])# BOOST_THREAD
+
+AU_ALIAS([BOOST_THREADS], [BOOST_THREAD])
+
+
+# BOOST_TOKENIZER()
+# -----------------
+# Look for Boost.Tokenizer by checking for the header boost/tokenizer.hpp
+# through BOOST_FIND_HEADER.
+BOOST_DEFUN([Tokenizer],
+[BOOST_FIND_HEADER([boost/tokenizer.hpp])])
+
+
+# BOOST_TRIBOOL()
+# ---------------
+# Look for Boost.Tribool by checking for the headers
+# boost/logic/tribool_fwd.hpp and boost/logic/tribool.hpp through
+# BOOST_FIND_HEADER.
+BOOST_DEFUN([Tribool],
+[BOOST_FIND_HEADER([boost/logic/tribool_fwd.hpp])
+BOOST_FIND_HEADER([boost/logic/tribool.hpp])
+])
+
+
+# BOOST_TUPLE()
+# -------------
+# Look for Boost.Tuple by checking for the header boost/tuple/tuple.hpp
+# through BOOST_FIND_HEADER.
+BOOST_DEFUN([Tuple],
+[BOOST_FIND_HEADER([boost/tuple/tuple.hpp])])
+
+
+# BOOST_TYPETRAITS()
+# ------------------
+# Look for Boost.TypeTraits by checking for the header boost/type_traits.hpp
+# through BOOST_FIND_HEADER.
+BOOST_DEFUN([TypeTraits],
+[BOOST_FIND_HEADER([boost/type_traits.hpp])])
+
+
+# BOOST_UTILITY()
+# ---------------
+# Look for Boost.Utility (noncopyable, result_of, base-from-member idiom,
+# etc.) by checking for the header boost/utility.hpp through
+# BOOST_FIND_HEADER.
+BOOST_DEFUN([Utility],
+[BOOST_FIND_HEADER([boost/utility.hpp])])
+
+
+# BOOST_VARIANT()
+# ---------------
+# Look for Boost.Variant by checking for the headers
+# boost/variant/variant_fwd.hpp and boost/variant.hpp through
+# BOOST_FIND_HEADER.
+BOOST_DEFUN([Variant],
+[BOOST_FIND_HEADER([boost/variant/variant_fwd.hpp])
+BOOST_FIND_HEADER([boost/variant.hpp])])
+
+
+# BOOST_POINTER_CONTAINER()
+# -------------------------
+# Look for Boost.PointerContainer by checking for its ptr_deque, ptr_list,
+# ptr_vector, ptr_array, ptr_set and ptr_map headers through
+# BOOST_FIND_HEADER.
+BOOST_DEFUN([Pointer_Container],
+[BOOST_FIND_HEADER([boost/ptr_container/ptr_deque.hpp])
+BOOST_FIND_HEADER([boost/ptr_container/ptr_list.hpp])
+BOOST_FIND_HEADER([boost/ptr_container/ptr_vector.hpp])
+BOOST_FIND_HEADER([boost/ptr_container/ptr_array.hpp])
+BOOST_FIND_HEADER([boost/ptr_container/ptr_set.hpp])
+BOOST_FIND_HEADER([boost/ptr_container/ptr_map.hpp])
+])# BOOST_POINTER_CONTAINER
+
+
+# BOOST_WAVE([PREFERRED-RT-OPT])
+# ------------------------------
+# NOTE: If you intend to use Wave/Spirit with thread support, make sure you
+# call BOOST_THREAD first.
+# Look for Boost.Wave. For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIB above.
+# Requires BOOST_FILESYSTEM and BOOST_DATE_TIME (AC_REQUIRE). The LIBS and
+# LDFLAGS of Boost.System, Boost.Filesystem, Boost.Date_Time and Boost.Thread
+# are added to the link line for the check only; LIBS and LDFLAGS are
+# restored to their previous values afterwards.
+BOOST_DEFUN([Wave],
+[AC_REQUIRE([BOOST_FILESYSTEM])dnl
+AC_REQUIRE([BOOST_DATE_TIME])dnl
+boost_wave_save_LIBS=$LIBS
+boost_wave_save_LDFLAGS=$LDFLAGS
+m4_pattern_allow([^BOOST_((FILE)?SYSTEM|DATE_TIME|THREAD)_(LIBS|LDFLAGS)$])dnl
+LIBS="$LIBS $BOOST_SYSTEM_LIBS $BOOST_FILESYSTEM_LIBS $BOOST_DATE_TIME_LIBS \
+$BOOST_THREAD_LIBS"
+LDFLAGS="$LDFLAGS $BOOST_SYSTEM_LDFLAGS $BOOST_FILESYSTEM_LDFLAGS \
+$BOOST_DATE_TIME_LDFLAGS $BOOST_THREAD_LDFLAGS"
+BOOST_FIND_LIB([wave], [$1],
+ [boost/wave.hpp],
+ [boost::wave::token_id id; get_token_name(id);])
+LIBS=$boost_wave_save_LIBS
+LDFLAGS=$boost_wave_save_LDFLAGS
+])# BOOST_WAVE
+
+
+# BOOST_XPRESSIVE()
+# -----------------
+# Look for Boost.Xpressive (new since 1.36.0) by checking for the header
+# boost/xpressive/xpressive.hpp through BOOST_FIND_HEADER.
+BOOST_DEFUN([Xpressive],
+[BOOST_FIND_HEADER([boost/xpressive/xpressive.hpp])])
+
+
+# ----------------- #
+# Internal helpers. #
+# ----------------- #
+
+
+# _BOOST_PTHREAD_FLAG()
+# ---------------------
+# Internal helper for BOOST_THREAD. Computes boost_cv_pthread_flag
+# which must be used in CPPFLAGS and LIBS.
+#
+# Yes, we *need* to put the -pthread thing in CPPFLAGS because with GCC3,
+# boost/thread.hpp will trigger a #error if -pthread isn't used:
+# boost/config/requires_threads.hpp:47:5: #error "Compiler threading support
+# is not turned on. Please set the correct command line options for
+# threading: -pthread (Linux), -pthreads (Solaris) or -mthreads (Mingw32)"
+#
+# A single test program using several pthread functions is generated once.
+# Each candidate flag, starting with the empty one, is appended to LIBS and
+# the program is linked. The first flag for which the link succeeds and for
+# which grep finds no match in conftest.err is stored in
+# boost_cv_pthread_flag (the result is cached); if none qualifies the
+# variable stays empty.
+#
+# Based on ACX_PTHREAD: http://autoconf-archive.cryp.to/acx_pthread.html
+AC_DEFUN([_BOOST_PTHREAD_FLAG],
+[AC_REQUIRE([AC_PROG_CXX])dnl
+AC_REQUIRE([AC_CANONICAL_HOST])dnl
+AC_LANG_PUSH([C++])dnl
+AC_CACHE_CHECK([for the flags needed to use pthreads], [boost_cv_pthread_flag],
+[ boost_cv_pthread_flag=
+ # The ordering *is* (sometimes) important. Some notes on the
+ # individual items follow:
+ # (none): in case threads are in libc; should be tried before -Kthread and
+ # other compiler flags to prevent continual compiler warnings
+ # -lpthreads: AIX (must check this before -lpthread)
+ # -Kthread: Sequent (threads in libc, but -Kthread needed for pthread.h)
+ # -kthread: FreeBSD kernel threads (preferred to -pthread since SMP-able)
+ # -llthread: LinuxThreads port on FreeBSD (also preferred to -pthread)
+ # -pthread: GNU Linux/GCC (kernel threads), BSD/GCC (userland threads)
+ # -pthreads: Solaris/GCC
+ # -mthreads: MinGW32/GCC, Lynx/GCC
+ # -mt: Sun Workshop C (may only link SunOS threads [-lthread], but it
+ # doesn't hurt to check since this sometimes defines pthreads too;
+ # also defines -D_REENTRANT)
+ # ... -mt is also the pthreads flag for HP/aCC
+ # -lpthread: GNU Linux, etc.
+ # --thread-safe: KAI C++
+ case $host_os in #(
+ *solaris*)
+ # On Solaris (at least, for some versions), libc contains stubbed
+ # (non-functional) versions of the pthreads routines, so link-based
+ # tests will erroneously succeed. (We need to link with -pthreads/-mt/
+ # -lpthread.) (The stubs are missing pthread_cleanup_push, or rather
+ # a function called by this macro, so we could check for that, but
+ # who knows whether they'll stub that too in a future libc.) So,
+ # we'll just look for -pthreads and -lpthread first:
+ boost_pthread_flags="-pthreads -lpthread -mt -pthread";; #(
+ *)
+ boost_pthread_flags="-lpthreads -Kthread -kthread -llthread -pthread \
+ -pthreads -mthreads -lpthread --thread-safe -mt";;
+ esac
+ # Generate the test file.
+ AC_LANG_CONFTEST([AC_LANG_PROGRAM([#include <pthread.h>],
+ [pthread_t th; pthread_join(th, 0);
+ pthread_attr_init(0); pthread_cleanup_push(0, 0);
+ pthread_create(0,0,0,0); pthread_cleanup_pop(0);])])
+ for boost_pthread_flag in '' $boost_pthread_flags; do
+ boost_pthread_ok=false
+dnl Re-use the test file already generated.
+ boost_pthreads__save_LIBS=$LIBS
+ LIBS="$LIBS $boost_pthread_flag"
+ AC_LINK_IFELSE([],
+ [if grep ".*$boost_pthread_flag" conftest.err; then
+ echo "This flag seems to have triggered warnings" >&AS_MESSAGE_LOG_FD
+ else
+ boost_pthread_ok=:; boost_cv_pthread_flag=$boost_pthread_flag
+ fi])
+ LIBS=$boost_pthreads__save_LIBS
+ $boost_pthread_ok && break
+ done
+])
+AC_LANG_POP([C++])dnl
+])# _BOOST_PTHREAD_FLAG
+
+
+# _BOOST_gcc_test(MAJOR, MINOR)
+# -----------------------------
+# Internal helper for _BOOST_FIND_COMPILER_TAG.
+# Expands to one double-quoted shell word of the form "CONDITION @ TAG":
+# CONDITION is a preprocessor expression that holds for GCC MAJOR.MINOR when
+# __ICC is not defined, and TAG is gcc followed by MAJOR and MINOR.
+m4_define([_BOOST_gcc_test],
+["defined __GNUC__ && __GNUC__ == $1 && __GNUC_MINOR__ == $2 && !defined __ICC @ gcc$1$2"])dnl
+
+# _BOOST_mingw_test(MAJOR, MINOR)
+# -------------------------------
+# Internal helper for _BOOST_FIND_COMPILER_TAG.
+# Same as _BOOST_gcc_test, but CONDITION additionally requires one of the
+# Windows platform defines (WIN32, WINNT, _WIN32, ...) and TAG is mgw followed
+# by MAJOR and MINOR.
+m4_define([_BOOST_mingw_test],
+["defined __GNUC__ && __GNUC__ == $1 && __GNUC_MINOR__ == $2 && !defined __ICC && \
+ (defined WIN32 || defined WINNT || defined _WIN32 || defined __WIN32 \
+ || defined __WIN32__ || defined __WINNT || defined __WINNT__) @ mgw$1$2"])dnl
+
+
+# _BOOST_FIND_COMPILER_TAG()
+# --------------------------
+# Internal. When Boost is installed without --layout=system, each library
+# filename will hold a suffix that encodes the compiler used during the
+# build. The Boost build system seems to call this a `tag'.
+#
+# The result is cached in boost_cv_lib_tag. Each candidate is a string of the
+# form "CONDITION @ TAG"; the two halves are split with expr, and the first
+# CONDITION that compiles without hitting its #error selects TAG. A gcc tag
+# is then extended with a second, version-less -gcc alternative (both
+# prefixed with x on darwin for Boost >= 1.36). If nothing matches, a warning
+# is emitted and the tag is left empty. No test is run when
+# boost_cv_inc_path is no; the tag then remains `unknown'.
+AC_DEFUN([_BOOST_FIND_COMPILER_TAG],
+[AC_REQUIRE([AC_PROG_CXX])dnl
+AC_REQUIRE([AC_CANONICAL_HOST])dnl
+AC_CACHE_CHECK([for the toolset name used by Boost for $CXX],
+ [boost_cv_lib_tag],
+[boost_cv_lib_tag=unknown
+if test x$boost_cv_inc_path != xno; then
+ AC_LANG_PUSH([C++])dnl
+ # The following tests are mostly inspired by boost/config/auto_link.hpp
+ # The list is sorted to most recent/common to oldest compiler (in order
+ # to increase the likelihood of finding the right compiler with the
+ # least number of compilation attempts).
+ # Beware that some tests are sensitive to the order (for instance, we must
+ # look for MinGW before looking for GCC3).
+ # I used one compilation test per compiler with a #error to recognize
+ # each compiler so that it works even when cross-compiling (let me know
+ # if you know a better approach).
+ # Known missing tags (known from Boost's tools/build/v2/tools/common.jam):
+ # como, edg, kcc, bck, mp, sw, tru, xlc
+ # I'm not sure about my test for `il' (be careful: Intel's ICC pre-defines
+ # the same defines as GCC's).
+ for i in \
+ _BOOST_mingw_test(4, 10) \
+ _BOOST_gcc_test(4, 10) \
+ _BOOST_mingw_test(4, 9) \
+ _BOOST_gcc_test(4, 9) \
+ _BOOST_mingw_test(4, 8) \
+ _BOOST_gcc_test(4, 8) \
+ _BOOST_mingw_test(4, 7) \
+ _BOOST_gcc_test(4, 7) \
+ _BOOST_mingw_test(4, 6) \
+ _BOOST_gcc_test(4, 6) \
+ _BOOST_mingw_test(4, 5) \
+ _BOOST_gcc_test(4, 5) \
+ _BOOST_mingw_test(4, 4) \
+ _BOOST_gcc_test(4, 4) \
+ _BOOST_mingw_test(4, 3) \
+ _BOOST_gcc_test(4, 3) \
+ _BOOST_mingw_test(4, 2) \
+ _BOOST_gcc_test(4, 2) \
+ _BOOST_mingw_test(4, 1) \
+ _BOOST_gcc_test(4, 1) \
+ _BOOST_mingw_test(4, 0) \
+ _BOOST_gcc_test(4, 0) \
+ "defined __GNUC__ && __GNUC__ == 3 && !defined __ICC \
+ && (defined WIN32 || defined WINNT || defined _WIN32 || defined __WIN32 \
+ || defined __WIN32__ || defined __WINNT || defined __WINNT__) @ mgw" \
+ _BOOST_gcc_test(3, 4) \
+ _BOOST_gcc_test(3, 3) \
+ "defined _MSC_VER && _MSC_VER >= 1500 @ vc90" \
+ "defined _MSC_VER && _MSC_VER == 1400 @ vc80" \
+ _BOOST_gcc_test(3, 2) \
+ "defined _MSC_VER && _MSC_VER == 1310 @ vc71" \
+ _BOOST_gcc_test(3, 1) \
+ _BOOST_gcc_test(3, 0) \
+ "defined __BORLANDC__ @ bcb" \
+ "defined __ICC && (defined __unix || defined __unix__) @ il" \
+ "defined __ICL @ iw" \
+ "defined _MSC_VER && _MSC_VER == 1300 @ vc7" \
+ _BOOST_gcc_test(2, 95) \
+ "defined __MWERKS__ && __MWERKS__ <= 0x32FF @ cw9" \
+ "defined _MSC_VER && _MSC_VER < 1300 && !defined UNDER_CE @ vc6" \
+ "defined _MSC_VER && _MSC_VER < 1300 && defined UNDER_CE @ evc4" \
+ "defined __MWERKS__ && __MWERKS__ <= 0x31FF @ cw8"
+ do
+ boost_tag_test=`expr "X$i" : 'X\([[^@]]*\) @ '`
+ boost_tag=`expr "X$i" : 'X[[^@]]* @ \(.*\)'`
+ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+#if $boost_tag_test
+/* OK */
+#else
+# error $boost_tag_test
+#endif
+]])], [boost_cv_lib_tag=$boost_tag; break], [])
+ done
+AC_LANG_POP([C++])dnl
+ case $boost_cv_lib_tag in #(
+ # Some newer (>= 1.35?) versions of Boost seem to only use "gcc" as opposed
+ # to "gcc41" for instance.
+ *-gcc | *'-gcc ') :;; #( Don't re-add -gcc: it's already in there.
+ gcc*)
+ boost_tag_x=
+ case $host_os in #(
+ darwin*)
+ if test $boost_major_version -ge 136; then
+ # The `x' added in r46793 of Boost.
+ boost_tag_x=x
+ fi;;
+ esac
+ # We can specify multiple tags in this variable because it's used by
+ # BOOST_FIND_LIB that does a `for tag in -$boost_cv_lib_tag' ...
+ boost_cv_lib_tag="$boost_tag_x$boost_cv_lib_tag -${boost_tag_x}gcc"
+ ;; #(
+ unknown)
+ AC_MSG_WARN([[could not figure out which toolset name to use for $CXX]])
+ boost_cv_lib_tag=
+ ;;
+ esac
+fi])dnl end of AC_CACHE_CHECK
+])# _BOOST_FIND_COMPILER_TAG
+
+
+# _BOOST_GUESS_WHETHER_TO_USE_MT()
+# --------------------------------
+# Compile a small test to try to guess whether we should favor MT (Multi
+# Thread) flavors of Boost. Sets boost_guess_use_mt accordingly.
+# boost_guess_use_mt is set to `:' when one of _REENTRANT, _MT or __MT__ is
+# defined by the C++ compiler, and to `false' otherwise, so that it can be
+# run directly as a shell command.
+AC_DEFUN([_BOOST_GUESS_WHETHER_TO_USE_MT],
+[# Check whether we had better use `mt' even though we weren't asked to.
+AC_LANG_PUSH([C++])dnl
+AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+#if defined _REENTRANT || defined _MT || defined __MT__
+/* use -mt */
+#else
+# error MT not needed
+#endif
+]])], [boost_guess_use_mt=:], [boost_guess_use_mt=false])
+AC_LANG_POP([C++])dnl
+])
+
+# _BOOST_AC_LINK_IFELSE(PROGRAM, [ACTION-IF-TRUE], [ACTION-IF-FALSE])
+# -------------------------------------------------------------------
+# Fork of _AC_LINK_IFELSE that preserves conftest.o across calls. Fragile,
+# will break when Autoconf changes its internals. Requires that you manually
+# rm -f conftest.$ac_objext in between two really different tests, otherwise
+# you will try to link a conftest.o left behind by a previous test.
+# Used to aggressively optimize BOOST_FIND_LIB (see the big comment in this
+# macro).
+#
+# After the actions, ac_objext is reset from boost_save_ac_objext, which the
+# caller must have set beforehand, and ac_ext is restored.
+#
+# Don't use "break" in the actions, as it would short-circuit some code
+# this macro runs after the actions.
+m4_define([_BOOST_AC_LINK_IFELSE],
+[m4_ifvaln([$1], [AC_LANG_CONFTEST([$1])])dnl
+rm -f conftest$ac_exeext
+boost_save_ac_ext=$ac_ext
+boost_use_source=:
+# If we already have a .o, re-use it. We change $ac_ext so that $ac_link
+# tries to link the existing object file instead of compiling from source.
+test -f conftest.$ac_objext && ac_ext=$ac_objext && boost_use_source=false &&
+ _AS_ECHO_LOG([re-using the existing conftest.$ac_objext])
+AS_IF([_AC_DO_STDERR($ac_link) && {
+ test -z "$ac_[]_AC_LANG_ABBREV[]_werror_flag" ||
+ test ! -s conftest.err
+ } && test -s conftest$ac_exeext && {
+ test "$cross_compiling" = yes ||
+ $as_executable_p conftest$ac_exeext
+dnl FIXME: use AS_TEST_X instead when 2.61 is widespread enough.
+ }],
+ [$2],
+ [if $boost_use_source; then
+ _AC_MSG_LOG_CONFTEST
+ fi
+ $3])
+ac_objext=$boost_save_ac_objext
+ac_ext=$boost_save_ac_ext
+dnl Delete also the IPA/IPO (Inter Procedural Analysis/Optimization)
+dnl information created by the PGI compiler (conftest_ipa8_conftest.oo),
+dnl as it would interfere with the next link command.
+rm -f core conftest.err conftest_ipa8_conftest.oo \
+ conftest$ac_exeext m4_ifval([$1], [conftest.$ac_ext])[]dnl
+])# _BOOST_AC_LINK_IFELSE
+
+# Local Variables:
+# mode: autoconf
+# End:
diff --git a/m4/pkg.m4 b/m4/pkg.m4
new file mode 100644
index 0000000..cbb46db
--- /dev/null
+++ b/m4/pkg.m4
@@ -0,0 +1,156 @@
+# pkg.m4 - Macros to locate and utilise pkg-config. -*- Autoconf -*-
+#
+# Copyright © 2004 Scott James Remnant <scott at netsplit.com>.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+# PKG_PROG_PKG_CONFIG([MIN-VERSION])
+# ----------------------------------
+# Locate the pkg-config utility and leave its path in PKG_CONFIG.
+# A PKG_CONFIG value already set in the environment is kept as is;
+# otherwise the tool is searched for with AC_PATH_TOOL.  The tool must
+# be at least MIN-VERSION (default 0.9.0); when that check fails,
+# PKG_CONFIG is reset to the empty string.
+AC_DEFUN([PKG_PROG_PKG_CONFIG],
+[m4_pattern_forbid([^_?PKG_[A-Z_]+$])
+m4_pattern_allow([^PKG_CONFIG(_PATH)?$])
+AC_ARG_VAR([PKG_CONFIG], [path to pkg-config utility])dnl
+if test "x$ac_cv_env_PKG_CONFIG_set" != "xset"; then
+ AC_PATH_TOOL([PKG_CONFIG], [pkg-config])
+fi
+if test -n "$PKG_CONFIG"; then
+ _pkg_min_version=m4_default([$1], [0.9.0])
+ AC_MSG_CHECKING([pkg-config is at least version $_pkg_min_version])
+ if $PKG_CONFIG --atleast-pkgconfig-version $_pkg_min_version; then
+ AC_MSG_RESULT([yes])
+ else
+ AC_MSG_RESULT([no])
+ PKG_CONFIG=""
+ fi
+
+fi[]dnl
+])# PKG_PROG_PKG_CONFIG
+
+# PKG_CHECK_EXISTS(MODULES, [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND])
+#
+# Check to see whether a particular set of modules exists. Similar
+# to PKG_CHECK_MODULES(), but does not set variables or print errors.
+# ACTION-IF-FOUND runs when PKG_CONFIG is non-empty and
+# `pkg-config --exists` succeeds for MODULES; otherwise
+# ACTION-IF-NOT-FOUND runs, if one was given.
+#
+# As with PKG_CHECK_MODULES, make sure that the first instance of
+# this macro or of PKG_CHECK_MODULES is actually called, or make sure
+# to call PKG_PROG_PKG_CONFIG manually.
+# --------------------------------------------------------------
+AC_DEFUN([PKG_CHECK_EXISTS],
+[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl
+if test -n "$PKG_CONFIG" && \
+ AC_RUN_LOG([$PKG_CONFIG --exists --print-errors "$1"]); then
+ m4_ifval([$2], [$2], [:])
+m4_ifvaln([$3], [else
+ $3])dnl
+fi])
+
+
+# _PKG_CONFIG([VARIABLE], [COMMAND], [MODULES])
+# ---------------------------------------------
+# Set pkg_cv_VARIABLE, either from a non-empty user-supplied $VARIABLE
+# or from the output of "$PKG_CONFIG --COMMAND MODULES".
+#
+# The user-supplied value is checked first, so that it is honoured even
+# when pkg-config is not available at all.
+#
+# Failure is reported through the shell variable pkg_failed:
+#   yes      MODULES were not found by pkg-config
+#   untried  no user-supplied value and no usable pkg-config
+m4_define([_PKG_CONFIG],
+[if test -n "$$1"; then
+    pkg_cv_[]$1="$$1"
+elif test -n "$PKG_CONFIG"; then
+    PKG_CHECK_EXISTS([$3],
+                     [pkg_cv_[]$1=`$PKG_CONFIG --[]$2 "$3" 2>/dev/null`],
+                     [pkg_failed=yes])
+else
+    pkg_failed=untried
+fi[]dnl
+])# _PKG_CONFIG
+
+# _PKG_SHORT_ERRORS_SUPPORTED
+# -----------------------------
+# Set _pkg_short_errors_supported to "yes" when $PKG_CONFIG reports
+# itself to be at least version 0.20, and to "no" otherwise.
+AC_DEFUN([_PKG_SHORT_ERRORS_SUPPORTED],
+[AC_REQUIRE([PKG_PROG_PKG_CONFIG])
+_pkg_short_errors_supported=no
+if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then
+    _pkg_short_errors_supported=yes
+fi[]dnl
+])# _PKG_SHORT_ERRORS_SUPPORTED
+
+
+# PKG_CHECK_MODULES(VARIABLE-PREFIX, MODULES, [ACTION-IF-FOUND],
+# [ACTION-IF-NOT-FOUND])
+#
+# Look up MODULES and set VARIABLE-PREFIX_CFLAGS and VARIABLE-PREFIX_LIBS
+# from the pkg-config --cflags and --libs output, unless the user
+# already supplied those variables.
+#
+# On success "yes" is reported and ACTION-IF-FOUND is run.  On failure
+# "no" is reported, so that the "checking for ..." line is always
+# terminated, and ACTION-IF-NOT-FOUND is run; when that argument is
+# empty, configure aborts with an explanatory message instead.
+#
+# Note that if there is a possibility the first call to
+# PKG_CHECK_MODULES might not happen, you should be sure to include an
+# explicit call to PKG_PROG_PKG_CONFIG in your configure.ac
+#
+#
+# --------------------------------------------------------------
+AC_DEFUN([PKG_CHECK_MODULES],
+[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl
+AC_ARG_VAR([$1][_CFLAGS], [C compiler flags for $1, overriding pkg-config])dnl
+AC_ARG_VAR([$1][_LIBS], [linker flags for $1, overriding pkg-config])dnl
+
+pkg_failed=no
+AC_MSG_CHECKING([for $1])
+
+_PKG_CONFIG([$1][_CFLAGS], [cflags], [$2])
+_PKG_CONFIG([$1][_LIBS], [libs], [$2])
+
+m4_define([_PKG_TEXT], [Alternatively, you may set the environment variables $1[]_CFLAGS
+and $1[]_LIBS to avoid the need to call pkg-config.
+See the pkg-config man page for more details.])
+
+if test $pkg_failed = yes; then
+    # Close the pending "checking for ..." line before anything else is printed.
+    AC_MSG_RESULT([no])
+    _PKG_SHORT_ERRORS_SUPPORTED
+    if test $_pkg_short_errors_supported = yes; then
+        $1[]_PKG_ERRORS=`$PKG_CONFIG --short-errors --errors-to-stdout --print-errors "$2"`
+    else
+        $1[]_PKG_ERRORS=`$PKG_CONFIG --errors-to-stdout --print-errors "$2"`
+    fi
+    # Put the nasty error message in config.log where it belongs
+    echo "$$1[]_PKG_ERRORS" >&AS_MESSAGE_LOG_FD
+
+    ifelse([$4], , [AC_MSG_ERROR(dnl
+[Package requirements ($2) were not met:
+
+$$1_PKG_ERRORS
+
+Consider adjusting the PKG_CONFIG_PATH environment variable if you
+installed software in a non-standard prefix.
+
+_PKG_TEXT
+])],
+        [$4])
+elif test $pkg_failed = untried; then
+    # Close the pending "checking for ..." line before anything else is printed.
+    AC_MSG_RESULT([no])
+    ifelse([$4], , [AC_MSG_FAILURE(dnl
+[The pkg-config script could not be found or is too old. Make sure it
+is in your PATH or set the PKG_CONFIG environment variable to the full
+path to pkg-config.
+
+_PKG_TEXT
+
+To get pkg-config, see <http://www.freedesktop.org/software/pkgconfig>.])],
+        [$4])
+else
+    $1[]_CFLAGS=$pkg_cv_[]$1[]_CFLAGS
+    $1[]_LIBS=$pkg_cv_[]$1[]_LIBS
+    AC_MSG_RESULT([yes])
+    ifelse([$3], , :, [$3])
+fi[]dnl
+])# PKG_CHECK_MODULES
diff --git a/projects/libMems.doxygen b/projects/libMems.doxygen
new file mode 100644
index 0000000..b13de44
--- /dev/null
+++ b/projects/libMems.doxygen
@@ -0,0 +1,212 @@
+# Doxyfile 1.3.7
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+PROJECT_NAME = $(PROJECT)-$(VERSION)
+PROJECT_NUMBER =
+OUTPUT_DIRECTORY = $(DOCDIR)
+CREATE_SUBDIRS = NO
+OUTPUT_LANGUAGE = English
+USE_WINDOWS_ENCODING = NO
+BRIEF_MEMBER_DESC = YES
+REPEAT_BRIEF = YES
+ABBREVIATE_BRIEF =
+ALWAYS_DETAILED_SEC = NO
+INLINE_INHERITED_MEMB = NO
+STRIP_FROM_INC_PATH = $(SRCDIR)
+FULL_PATH_NAMES = YES
+STRIP_FROM_PATH = $(SRCDIR)
+SHORT_NAMES = NO
+JAVADOC_AUTOBRIEF = YES
+MULTILINE_CPP_IS_BRIEF = NO
+DETAILS_AT_TOP = YES
+INHERIT_DOCS = YES
+DISTRIBUTE_GROUP_DOC = NO
+TAB_SIZE = 8
+ALIASES =
+OPTIMIZE_OUTPUT_FOR_C = YES
+OPTIMIZE_OUTPUT_JAVA = NO
+SUBGROUPING = YES
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+EXTRACT_ALL = YES
+EXTRACT_PRIVATE = YES
+EXTRACT_STATIC = YES
+EXTRACT_LOCAL_CLASSES = YES
+EXTRACT_LOCAL_METHODS = NO
+HIDE_UNDOC_MEMBERS = NO
+HIDE_UNDOC_CLASSES = NO
+HIDE_FRIEND_COMPOUNDS = NO
+HIDE_IN_BODY_DOCS = NO
+INTERNAL_DOCS = NO
+CASE_SENSE_NAMES = NO
+HIDE_SCOPE_NAMES = NO
+SHOW_INCLUDE_FILES = YES
+INLINE_INFO = YES
+SORT_MEMBER_DOCS = YES
+SORT_BRIEF_DOCS = NO
+SORT_BY_SCOPE_NAME = NO
+GENERATE_TODOLIST = YES
+GENERATE_TESTLIST = YES
+GENERATE_BUGLIST = YES
+GENERATE_DEPRECATEDLIST= YES
+ENABLED_SECTIONS =
+MAX_INITIALIZER_LINES = 30
+SHOW_USED_FILES = YES
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+QUIET = YES
+WARNINGS = YES
+WARN_IF_UNDOCUMENTED = YES
+WARN_IF_DOC_ERROR = YES
+WARN_FORMAT = "$file:$line: $text"
+WARN_LOGFILE =
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+INPUT = $(SRCDIR)
+FILE_PATTERNS = *.c *.h
+RECURSIVE = YES
+EXCLUDE =
+EXCLUDE_SYMLINKS = NO
+EXCLUDE_PATTERNS =
+EXAMPLE_PATH = $(SRCDIR)
+EXAMPLE_PATTERNS =
+EXAMPLE_RECURSIVE = NO
+IMAGE_PATH =
+INPUT_FILTER =
+FILTER_SOURCE_FILES = NO
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+SOURCE_BROWSER = NO
+INLINE_SOURCES = NO
+STRIP_CODE_COMMENTS = YES
+REFERENCED_BY_RELATION = YES
+REFERENCES_RELATION = YES
+VERBATIM_HEADERS = YES
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+ALPHABETICAL_INDEX = NO
+COLS_IN_ALPHA_INDEX = 5
+IGNORE_PREFIX =
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+GENERATE_HTML = $(GENERATE_HTML)
+HTML_OUTPUT = html
+HTML_FILE_EXTENSION = .html
+HTML_HEADER =
+HTML_FOOTER =
+HTML_STYLESHEET =
+HTML_ALIGN_MEMBERS = YES
+GENERATE_HTMLHELP = $(GENERATE_CHM)
+CHM_FILE = ../$(PROJECT).chm
+HHC_LOCATION = $(HHC_PATH)
+GENERATE_CHI = $(GENERATE_CHI)
+BINARY_TOC = NO
+TOC_EXPAND = NO
+DISABLE_INDEX = NO
+ENUM_VALUES_PER_LINE = 4
+GENERATE_TREEVIEW = YES
+TREEVIEW_WIDTH = 250
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+GENERATE_LATEX = $(GENERATE_LATEX)
+LATEX_OUTPUT = latex
+LATEX_CMD_NAME = latex
+MAKEINDEX_CMD_NAME = makeindex
+COMPACT_LATEX = NO
+PAPER_TYPE = $(PAPER_SIZE)
+EXTRA_PACKAGES =
+LATEX_HEADER =
+PDF_HYPERLINKS = NO
+USE_PDFLATEX = NO
+LATEX_BATCHMODE = YES
+LATEX_HIDE_INDICES = NO
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+GENERATE_RTF = $(GENERATE_RTF)
+RTF_OUTPUT = rtf
+COMPACT_RTF = NO
+RTF_HYPERLINKS = NO
+RTF_STYLESHEET_FILE =
+RTF_EXTENSIONS_FILE =
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+GENERATE_MAN = $(GENERATE_MAN)
+MAN_OUTPUT = man
+MAN_EXTENSION = .1
+MAN_LINKS = NO
+#---------------------------------------------------------------------------
+# configuration options related to the XML output
+#---------------------------------------------------------------------------
+GENERATE_XML = $(GENERATE_XML)
+XML_OUTPUT = xml
+XML_SCHEMA =
+XML_DTD =
+XML_PROGRAMLISTING = YES
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+GENERATE_AUTOGEN_DEF = NO
+#---------------------------------------------------------------------------
+# configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+GENERATE_PERLMOD = NO
+PERLMOD_LATEX = NO
+PERLMOD_PRETTY = YES
+PERLMOD_MAKEVAR_PREFIX =
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+ENABLE_PREPROCESSING = YES
+MACRO_EXPANSION = NO
+EXPAND_ONLY_PREDEF = NO
+SEARCH_INCLUDES = YES
+INCLUDE_PATH =
+INCLUDE_FILE_PATTERNS =
+PREDEFINED =
+EXPAND_AS_DEFINED =
+SKIP_FUNCTION_MACROS = YES
+#---------------------------------------------------------------------------
+# Configuration::additions related to external references
+#---------------------------------------------------------------------------
+TAGFILES =
+GENERATE_TAGFILE = $(DOCDIR)/$(PROJECT).tag
+ALLEXTERNALS = NO
+EXTERNAL_GROUPS = YES
+PERL_PATH = $(PERL_PATH)
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+CLASS_DIAGRAMS = YES
+HIDE_UNDOC_RELATIONS = YES
+HAVE_DOT = $(HAVE_DOT)
+CLASS_GRAPH = YES
+COLLABORATION_GRAPH = YES
+UML_LOOK = NO
+TEMPLATE_RELATIONS = NO
+INCLUDE_GRAPH = YES
+INCLUDED_BY_GRAPH = YES
+CALL_GRAPH = NO
+GRAPHICAL_HIERARCHY = YES
+DOT_IMAGE_FORMAT = png
+DOT_PATH = $(DOT_PATH)
+DOTFILE_DIRS =
+MAX_DOT_GRAPH_WIDTH = 1024
+MAX_DOT_GRAPH_HEIGHT = 1024
+MAX_DOT_GRAPH_DEPTH = 0
+GENERATE_LEGEND = YES
+DOT_CLEANUP = YES
+#---------------------------------------------------------------------------
+# Configuration::additions related to the search engine
+#---------------------------------------------------------------------------
+SEARCHENGINE = NO
diff --git a/projects/libMems.kdevprj b/projects/libMems.kdevprj
new file mode 100644
index 0000000..3af1b86
--- /dev/null
+++ b/projects/libMems.kdevprj
@@ -0,0 +1,281 @@
+[AUTHORS]
+dist=true
+install=false
+install_location=
+type=DATA
+
+[COPYING]
+dist=true
+install=false
+install_location=
+type=DATA
+
+[ChangeLog]
+dist=true
+install=false
+install_location=
+type=DATA
+
+[Config for BinMakefileAm]
+bin_program=mormems
+cxxflags=-O0 -g3 -Wall
+ldflags=\s
+
+[General]
+author=Aaron Darling
+email=darling at cs.wisc.edu
+kdevprj_version=1.3
+lfv_open_groups=
+makefiles=Makefile.am,mormems/Makefile.am,mormems/docs/Makefile.am,mormems/docs/en/Makefile.am,po/Makefile.am
+project_name=Mormems
+project_type=normal_empty
+sub_dir=mormems/
+version=0.1
+version_control=CVS
+workspace=1
+
+[INSTALL]
+dist=true
+install=false
+install_location=
+type=DATA
+
+[LFV Groups]
+GNU=AUTHORS,COPYING,ChangeLog,INSTALL,README,TODO,NEWS
+Headers=*.h,*.hh,*.hxx,*.hpp,*.H
+Others=*
+Sources=*.cpp,*.c,*.cc,*.C,*.cxx,*.ec,*.ecpp,*.lxx,*.l++,*.ll,*.l
+User Interface=*.kdevdlg,*.ui,*.rc
+groups=Headers,Sources,User Interface,GNU,Others
+
+[Makefile.am]
+files=mormems.kdevprj,AUTHORS,COPYING,ChangeLog,INSTALL,README,TODO,mormems.lsm
+sub_dirs=mormems
+type=normal
+
+[README]
+dist=true
+install=false
+install_location=
+type=DATA
+
+[TODO]
+dist=true
+install=false
+install_location=
+type=DATA
+
+[mormems.kdevprj]
+dist=true
+install=false
+install_location=
+type=DATA
+
+[mormems.lsm]
+dist=true
+install=false
+install_location=
+type=DATA
+
+[mormems/BigDiskSuffixArray.cpp]
+dist=true
+install=false
+install_location=
+type=SOURCE
+
+[mormems/BigDiskSuffixArray.h]
+dist=true
+install=false
+install_location=
+type=HEADER
+
+[mormems/BigDnaSar.cpp]
+dist=true
+install=false
+install_location=
+type=SOURCE
+
+[mormems/BigDnaSar.h]
+dist=true
+install=false
+install_location=
+type=HEADER
+
+[mormems/DiskSuffixArray.cpp]
+dist=true
+install=false
+install_location=
+type=SOURCE
+
+[mormems/DiskSuffixArray.h]
+dist=true
+install=false
+install_location=
+type=HEADER
+
+[mormems/HashingMatchFinder.cpp]
+dist=true
+install=false
+install_location=
+type=SOURCE
+
+[mormems/HashingMatchFinder.h]
+dist=true
+install=false
+install_location=
+type=HEADER
+
+[mormems/Makefile.am]
+files=mormems/BigDiskSuffixArray.cpp,mormems/BigDnaSar.cpp,mormems/DiskSuffixArray.cpp,mormems/HashingMatchFinder.cpp,mormems/MatchFinder.cpp,mormems/MemHash.cpp,mormems/MemHashEntry.cpp,mormems/MemScorer.cpp,mormems/MimHash.cpp,mormems/MimHashEntry.cpp,mormems/SmallDiskSuffixArray.cpp,mormems/SmallDnaSar.cpp,mormems/SuffixArray.cpp,mormems/genomeApp.cpp,mormems/BigDiskSuffixArray.h,mormems/BigDnaSar.h,mormems/DiskSuffixArray.h,mormems/HashingMatchFinder.h,mormems/MatchFinder.h,mormems/M [...]
+sub_dirs=
+type=prog_main
+
+[mormems/MatchFinder.cpp]
+dist=true
+install=false
+install_location=
+type=SOURCE
+
+[mormems/MatchFinder.h]
+dist=true
+install=false
+install_location=
+type=HEADER
+
+[mormems/MemHash.cpp]
+dist=true
+install=false
+install_location=
+type=SOURCE
+
+[mormems/MemHash.h]
+dist=true
+install=false
+install_location=
+type=HEADER
+
+[mormems/MemHashEntry.cpp]
+dist=true
+install=false
+install_location=
+type=SOURCE
+
+[mormems/MemHashEntry.h]
+dist=true
+install=false
+install_location=
+type=HEADER
+
+[mormems/MemScorer.cpp]
+dist=true
+install=false
+install_location=
+type=SOURCE
+
+[mormems/MemScorer.h]
+dist=true
+install=false
+install_location=
+type=HEADER
+
+[mormems/MemorySuffixArray.h]
+dist=true
+install=false
+install_location=
+type=HEADER
+
+[mormems/MimHash.cpp]
+dist=true
+install=false
+install_location=
+type=SOURCE
+
+[mormems/MimHash.h]
+dist=true
+install=false
+install_location=
+type=HEADER
+
+[mormems/MimHashEntry.cpp]
+dist=true
+install=false
+install_location=
+type=SOURCE
+
+[mormems/MimHashEntry.h]
+dist=true
+install=false
+install_location=
+type=HEADER
+
+[mormems/RepeatFinder.h]
+dist=true
+install=false
+install_location=
+type=HEADER
+
+[mormems/SmallDiskSuffixArray.cpp]
+dist=true
+install=false
+install_location=
+type=SOURCE
+
+[mormems/SmallDiskSuffixArray.h]
+dist=true
+install=false
+install_location=
+type=HEADER
+
+[mormems/SmallDnaSar.cpp]
+dist=true
+install=false
+install_location=
+type=SOURCE
+
+[mormems/SmallDnaSar.h]
+dist=true
+install=false
+install_location=
+type=HEADER
+
+[mormems/SuffixArray.cpp]
+dist=true
+install=false
+install_location=
+type=SOURCE
+
+[mormems/SuffixArray.h]
+dist=true
+install=false
+install_location=
+type=HEADER
+
+[mormems/docs/Makefile.am]
+sub_dirs=
+type=normal
+
+[mormems/docs/en/Makefile.am]
+sub_dirs=
+type=normal
+
+[mormems/genomeApp.cpp]
+dist=true
+install=false
+install_location=
+type=SOURCE
+
+[mormems/precomp.h]
+dist=true
+install=false
+install_location=
+type=HEADER
+
+[mormems/precomp_d.h]
+dist=true
+install=false
+install_location=
+type=HEADER
+
+[po/Makefile.am]
+sub_dirs=
+type=po
diff --git a/projects/libMems.sln b/projects/libMems.sln
new file mode 100644
index 0000000..e2fb0d2
--- /dev/null
+++ b/projects/libMems.sln
@@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 9.00
+# Visual C++ Express 2005
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libMems", "libMems.vcproj", "{20FE3C39-9B04-4D5F-8249-115D8812B93E}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|Win32 = Debug|Win32
+ Release|Win32 = Release|Win32
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {20FE3C39-9B04-4D5F-8249-115D8812B93E}.Debug|Win32.ActiveCfg = Debug|Win32
+ {20FE3C39-9B04-4D5F-8249-115D8812B93E}.Debug|Win32.Build.0 = Debug|Win32
+ {20FE3C39-9B04-4D5F-8249-115D8812B93E}.Release|Win32.ActiveCfg = Release|Win32
+ {20FE3C39-9B04-4D5F-8249-115D8812B93E}.Release|Win32.Build.0 = Release|Win32
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+EndGlobal
diff --git a/projects/libMems.vcproj b/projects/libMems.vcproj
new file mode 100644
index 0000000..946529e
--- /dev/null
+++ b/projects/libMems.vcproj
@@ -0,0 +1,1033 @@
+<?xml version="1.0" encoding="Windows-1252"?>
+<VisualStudioProject
+ ProjectType="Visual C++"
+ Version="8.00"
+ Name="libMems"
+ ProjectGUID="{20FE3C39-9B04-4D5F-8249-115D8812B93E}"
+ RootNamespace="libMems"
+ Keyword="Win32Proj"
+ >
+ <Platforms>
+ <Platform
+ Name="Win32"
+ />
+ <Platform
+ Name="x64"
+ />
+ </Platforms>
+ <ToolFiles>
+ </ToolFiles>
+ <Configurations>
+ <Configuration
+ Name="Release|Win32"
+ OutputDirectory="..\lib"
+ IntermediateDirectory="Release"
+ ConfigurationType="4"
+ >
+ <Tool
+ Name="VCPreBuildEventTool"
+ />
+ <Tool
+ Name="VCCustomBuildTool"
+ />
+ <Tool
+ Name="VCXMLDataGeneratorTool"
+ />
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"
+ />
+ <Tool
+ Name="VCMIDLTool"
+ />
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="3"
+ EnableIntrinsicFunctions="true"
+ FavorSizeOrSpeed="1"
+ WholeProgramOptimization="true"
+ AdditionalIncludeDirectories="../../muscle/libMUSCLE;../../muscle;../../libGenome;../;..\..\boost\boost_1_34_0\;"
+ PreprocessorDefinitions="WIN32;NDEBUG;_LIB;_SCL_SECURE_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE"
+ RuntimeLibrary="0"
+ EnableEnhancedInstructionSet="1"
+ UsePrecompiledHeader="0"
+ WarningLevel="2"
+ Detect64BitPortabilityProblems="true"
+ DebugInformationFormat="3"
+ />
+ <Tool
+ Name="VCManagedResourceCompilerTool"
+ />
+ <Tool
+ Name="VCResourceCompilerTool"
+ />
+ <Tool
+ Name="VCPreLinkEventTool"
+ />
+ <Tool
+ Name="VCLibrarianTool"
+ OutputFile="$(OutDir)/mems.lib"
+ />
+ <Tool
+ Name="VCALinkTool"
+ />
+ <Tool
+ Name="VCXDCMakeTool"
+ />
+ <Tool
+ Name="VCBscMakeTool"
+ />
+ <Tool
+ Name="VCFxCopTool"
+ />
+ <Tool
+ Name="VCPostBuildEventTool"
+ />
+ </Configuration>
+ <Configuration
+ Name="FastDebug|Win32"
+ OutputDirectory="..\lib"
+ IntermediateDirectory="$(ConfigurationName)"
+ ConfigurationType="4"
+ >
+ <Tool
+ Name="VCPreBuildEventTool"
+ />
+ <Tool
+ Name="VCCustomBuildTool"
+ />
+ <Tool
+ Name="VCXMLDataGeneratorTool"
+ />
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"
+ />
+ <Tool
+ Name="VCMIDLTool"
+ />
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="0"
+ EnableIntrinsicFunctions="false"
+ WholeProgramOptimization="false"
+ AdditionalIncludeDirectories="../../muscle/libMUSCLE;../../muscle;../../libGenome;../;..\..\boost\boost_1_34_0\;"
+ PreprocessorDefinitions="WIN32;NDEBUG;FASTDEBUG;_LIB;_SCL_SECURE_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE"
+ RuntimeLibrary="0"
+ EnableEnhancedInstructionSet="1"
+ OpenMP="false"
+ UsePrecompiledHeader="0"
+ WarningLevel="2"
+ Detect64BitPortabilityProblems="true"
+ DebugInformationFormat="3"
+ />
+ <Tool
+ Name="VCManagedResourceCompilerTool"
+ />
+ <Tool
+ Name="VCResourceCompilerTool"
+ />
+ <Tool
+ Name="VCPreLinkEventTool"
+ />
+ <Tool
+ Name="VCLibrarianTool"
+ OutputFile="$(OutDir)/memsfd.lib"
+ />
+ <Tool
+ Name="VCALinkTool"
+ />
+ <Tool
+ Name="VCXDCMakeTool"
+ />
+ <Tool
+ Name="VCBscMakeTool"
+ />
+ <Tool
+ Name="VCFxCopTool"
+ />
+ <Tool
+ Name="VCPostBuildEventTool"
+ />
+ </Configuration>
+ <Configuration
+ Name="Release OpenMP|Win32"
+ OutputDirectory="..\lib"
+ IntermediateDirectory="$(ConfigurationName)"
+ ConfigurationType="4"
+ >
+ <Tool
+ Name="VCPreBuildEventTool"
+ />
+ <Tool
+ Name="VCCustomBuildTool"
+ />
+ <Tool
+ Name="VCXMLDataGeneratorTool"
+ />
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"
+ />
+ <Tool
+ Name="VCMIDLTool"
+ />
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="3"
+ EnableIntrinsicFunctions="true"
+ FavorSizeOrSpeed="1"
+ WholeProgramOptimization="true"
+ AdditionalIncludeDirectories="../../muscle/libMUSCLE;../../muscle;../../libGenome;../;..\..\boost\boost_1_34_0\;"
+ PreprocessorDefinitions="WIN32;NDEBUG;_LIB;_SCL_SECURE_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE"
+ RuntimeLibrary="0"
+ EnableEnhancedInstructionSet="1"
+ OpenMP="true"
+ UsePrecompiledHeader="0"
+ WarningLevel="2"
+ Detect64BitPortabilityProblems="true"
+ DebugInformationFormat="3"
+ />
+ <Tool
+ Name="VCManagedResourceCompilerTool"
+ />
+ <Tool
+ Name="VCResourceCompilerTool"
+ />
+ <Tool
+ Name="VCPreLinkEventTool"
+ />
+ <Tool
+ Name="VCLibrarianTool"
+ OutputFile="$(OutDir)/memsomp.lib"
+ />
+ <Tool
+ Name="VCALinkTool"
+ />
+ <Tool
+ Name="VCXDCMakeTool"
+ />
+ <Tool
+ Name="VCBscMakeTool"
+ />
+ <Tool
+ Name="VCFxCopTool"
+ />
+ <Tool
+ Name="VCPostBuildEventTool"
+ />
+ </Configuration>
+ <Configuration
+ Name="FastDebug OpenMP|Win32"
+ OutputDirectory="..\lib"
+ IntermediateDirectory="$(ConfigurationName)"
+ ConfigurationType="4"
+ >
+ <Tool
+ Name="VCPreBuildEventTool"
+ />
+ <Tool
+ Name="VCCustomBuildTool"
+ />
+ <Tool
+ Name="VCXMLDataGeneratorTool"
+ />
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"
+ />
+ <Tool
+ Name="VCMIDLTool"
+ />
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="0"
+ EnableIntrinsicFunctions="false"
+ WholeProgramOptimization="false"
+ AdditionalIncludeDirectories="../../muscle/libMUSCLE;../../muscle;../../libGenome;../;..\..\boost\boost_1_34_0\;"
+ PreprocessorDefinitions="WIN32;NDEBUG;_LIB;_SCL_SECURE_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE"
+ RuntimeLibrary="0"
+ EnableEnhancedInstructionSet="1"
+ OpenMP="true"
+ UsePrecompiledHeader="0"
+ WarningLevel="2"
+ Detect64BitPortabilityProblems="true"
+ DebugInformationFormat="3"
+ />
+ <Tool
+ Name="VCManagedResourceCompilerTool"
+ />
+ <Tool
+ Name="VCResourceCompilerTool"
+ />
+ <Tool
+ Name="VCPreLinkEventTool"
+ />
+ <Tool
+ Name="VCLibrarianTool"
+ OutputFile="$(OutDir)/memsfdomp.lib"
+ />
+ <Tool
+ Name="VCALinkTool"
+ />
+ <Tool
+ Name="VCXDCMakeTool"
+ />
+ <Tool
+ Name="VCBscMakeTool"
+ />
+ <Tool
+ Name="VCFxCopTool"
+ />
+ <Tool
+ Name="VCPostBuildEventTool"
+ />
+ </Configuration>
+ <Configuration
+ Name="Release|x64"
+ OutputDirectory="..\lib"
+ IntermediateDirectory="$(PlatformName)\$(ConfigurationName)"
+ ConfigurationType="4"
+ >
+ <Tool
+ Name="VCPreBuildEventTool"
+ />
+ <Tool
+ Name="VCCustomBuildTool"
+ />
+ <Tool
+ Name="VCXMLDataGeneratorTool"
+ />
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"
+ />
+ <Tool
+ Name="VCMIDLTool"
+ TargetEnvironment="3"
+ />
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="3"
+ EnableIntrinsicFunctions="true"
+ FavorSizeOrSpeed="1"
+ WholeProgramOptimization="true"
+ AdditionalIncludeDirectories="../../muscle/libMUSCLE;../../muscle;../../libGenome;../;..\..\boost\boost_1_34_0\;"
+ PreprocessorDefinitions="WIN32;WIN64;NDEBUG;_LIB;_SCL_SECURE_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE"
+ RuntimeLibrary="0"
+ EnableEnhancedInstructionSet="0"
+ OpenMP="true"
+ UsePrecompiledHeader="0"
+ WarningLevel="2"
+ Detect64BitPortabilityProblems="true"
+ DebugInformationFormat="3"
+ />
+ <Tool
+ Name="VCManagedResourceCompilerTool"
+ />
+ <Tool
+ Name="VCResourceCompilerTool"
+ />
+ <Tool
+ Name="VCPreLinkEventTool"
+ />
+ <Tool
+ Name="VCLibrarianTool"
+ OutputFile="$(OutDir)/mems64omp.lib"
+ />
+ <Tool
+ Name="VCALinkTool"
+ />
+ <Tool
+ Name="VCXDCMakeTool"
+ />
+ <Tool
+ Name="VCBscMakeTool"
+ />
+ <Tool
+ Name="VCFxCopTool"
+ />
+ <Tool
+ Name="VCPostBuildEventTool"
+ />
+ </Configuration>
+ <Configuration
+ Name="FastDebug|x64"
+ OutputDirectory="..\lib"
+ IntermediateDirectory="$(PlatformName)\$(ConfigurationName)"
+ ConfigurationType="4"
+ >
+ <Tool
+ Name="VCPreBuildEventTool"
+ />
+ <Tool
+ Name="VCCustomBuildTool"
+ />
+ <Tool
+ Name="VCXMLDataGeneratorTool"
+ />
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"
+ />
+ <Tool
+ Name="VCMIDLTool"
+ TargetEnvironment="3"
+ />
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="0"
+ EnableIntrinsicFunctions="false"
+ WholeProgramOptimization="false"
+ AdditionalIncludeDirectories="../../muscle/libMUSCLE;../../muscle;../../libGenome;../;..\..\boost\boost_1_34_0\;"
+ PreprocessorDefinitions="WIN32;WIN64;NDEBUG;FASTDEBUG;_LIB;_SCL_SECURE_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE"
+ RuntimeLibrary="0"
+ EnableEnhancedInstructionSet="0"
+ OpenMP="false"
+ UsePrecompiledHeader="0"
+ WarningLevel="2"
+ Detect64BitPortabilityProblems="true"
+ DebugInformationFormat="3"
+ />
+ <Tool
+ Name="VCManagedResourceCompilerTool"
+ />
+ <Tool
+ Name="VCResourceCompilerTool"
+ />
+ <Tool
+ Name="VCPreLinkEventTool"
+ />
+ <Tool
+ Name="VCLibrarianTool"
+ OutputFile="$(OutDir)/mems64fd.lib"
+ />
+ <Tool
+ Name="VCALinkTool"
+ />
+ <Tool
+ Name="VCXDCMakeTool"
+ />
+ <Tool
+ Name="VCBscMakeTool"
+ />
+ <Tool
+ Name="VCFxCopTool"
+ />
+ <Tool
+ Name="VCPostBuildEventTool"
+ />
+ </Configuration>
+ <Configuration
+ Name="Release OpenMP|x64"
+ OutputDirectory="..\lib"
+ IntermediateDirectory="$(PlatformName)\$(ConfigurationName)"
+ ConfigurationType="4"
+ >
+ <Tool
+ Name="VCPreBuildEventTool"
+ />
+ <Tool
+ Name="VCCustomBuildTool"
+ />
+ <Tool
+ Name="VCXMLDataGeneratorTool"
+ />
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"
+ />
+ <Tool
+ Name="VCMIDLTool"
+ TargetEnvironment="3"
+ />
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="3"
+ EnableIntrinsicFunctions="true"
+ FavorSizeOrSpeed="1"
+ WholeProgramOptimization="true"
+ AdditionalIncludeDirectories="../../muscle/libMUSCLE;../../muscle;../../libGenome;../;..\..\boost\boost_1_34_0\;"
+ PreprocessorDefinitions="WIN32;WIN64;NDEBUG;_LIB;_SCL_SECURE_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE"
+ RuntimeLibrary="0"
+ EnableEnhancedInstructionSet="0"
+ OpenMP="true"
+ UsePrecompiledHeader="0"
+ WarningLevel="2"
+ Detect64BitPortabilityProblems="true"
+ DebugInformationFormat="3"
+ />
+ <Tool
+ Name="VCManagedResourceCompilerTool"
+ />
+ <Tool
+ Name="VCResourceCompilerTool"
+ />
+ <Tool
+ Name="VCPreLinkEventTool"
+ />
+ <Tool
+ Name="VCLibrarianTool"
+ OutputFile="$(OutDir)/mems64omp.lib"
+ />
+ <Tool
+ Name="VCALinkTool"
+ />
+ <Tool
+ Name="VCXDCMakeTool"
+ />
+ <Tool
+ Name="VCBscMakeTool"
+ />
+ <Tool
+ Name="VCFxCopTool"
+ />
+ <Tool
+ Name="VCPostBuildEventTool"
+ />
+ </Configuration>
+ <Configuration
+ Name="FastDebug OpenMP|x64"
+ OutputDirectory="..\lib"
+ IntermediateDirectory="$(PlatformName)\$(ConfigurationName)"
+ ConfigurationType="4"
+ >
+ <Tool
+ Name="VCPreBuildEventTool"
+ />
+ <Tool
+ Name="VCCustomBuildTool"
+ />
+ <Tool
+ Name="VCXMLDataGeneratorTool"
+ />
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"
+ />
+ <Tool
+ Name="VCMIDLTool"
+ TargetEnvironment="3"
+ />
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="0"
+ EnableIntrinsicFunctions="false"
+ WholeProgramOptimization="false"
+ AdditionalIncludeDirectories="../../muscle/libMUSCLE;../../muscle;../../libGenome;../;..\..\boost\boost_1_34_0\;"
+ PreprocessorDefinitions="WIN32;WIN64;NDEBUG;_LIB;_SCL_SECURE_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE"
+ RuntimeLibrary="0"
+ EnableEnhancedInstructionSet="0"
+ OpenMP="true"
+ UsePrecompiledHeader="0"
+ WarningLevel="2"
+ Detect64BitPortabilityProblems="true"
+ DebugInformationFormat="3"
+ />
+ <Tool
+ Name="VCManagedResourceCompilerTool"
+ />
+ <Tool
+ Name="VCResourceCompilerTool"
+ />
+ <Tool
+ Name="VCPreLinkEventTool"
+ />
+ <Tool
+ Name="VCLibrarianTool"
+ OutputFile="$(OutDir)/mems64fdomp.lib"
+ />
+ <Tool
+ Name="VCALinkTool"
+ />
+ <Tool
+ Name="VCXDCMakeTool"
+ />
+ <Tool
+ Name="VCBscMakeTool"
+ />
+ <Tool
+ Name="VCFxCopTool"
+ />
+ <Tool
+ Name="VCPostBuildEventTool"
+ />
+ </Configuration>
+ </Configurations>
+ <References>
+ </References>
+ <Files>
+ <Filter
+ Name="Header Files"
+ Filter="h;hpp;hxx;hm;inl;inc;xsd"
+ UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}"
+ >
+ <Filter
+ Name="dmSML"
+ >
+ <File
+ RelativePath="..\libmems\dmsml\alibc.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\dmsml\alinuxaio.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\dmsml\aPOSIXaio.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\dmsml\asyncio.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\dmsml\awin32aio.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\dmsml\buffer.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\dmsml\dmsort.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\dmsml\sml.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\dmsml\sorting.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\dmsml\timing.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\dmsml\util.h"
+ >
+ </File>
+ </Filter>
+ <Filter
+ Name="SortedMerList"
+ >
+ <File
+ RelativePath="..\libmems\DNAFileSML.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\DNAMemorySML.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\FileSML.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\MemorySML.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\SortedMerList.h"
+ >
+ </File>
+ </Filter>
+ <Filter
+ Name="Matches"
+ >
+ <File
+ RelativePath="..\libMems\AbstractGappedAlignment.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\AbstractMatch.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libMems\CompactGappedAlignment.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libMems\DenseAbstractMatch.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\GappedAlignment.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\gnAlignedSequences.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libMems\HybridAbstractMatch.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\Interval.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\IntervalList.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\Match.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libMems\MatchHashEntry.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\MatchList.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libMems\MatchProjectionAdapter.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libMems\RepeatMatch.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libMems\RepeatMatchList.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libMems\SparseAbstractMatch.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libMems\UngappedLocalAlignment.h"
+ >
+ </File>
+ </Filter>
+ <Filter
+ Name="MatchFinder"
+ >
+ <File
+ RelativePath="..\libmems\MaskedMemHash.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\MatchFinder.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\MemHash.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\RepeatHash.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\SeedMasks.h"
+ >
+ </File>
+ </Filter>
+ <Filter
+ Name="Aligner"
+ >
+ <File
+ RelativePath="..\libmems\Aligner.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libMems\GappedAligner.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\Islands.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libMems\LCB.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libMems\MuscleInterface.h"
+ >
+ </File>
+ </Filter>
+ <Filter
+ Name="Utility"
+ >
+ <File
+ RelativePath=".\libmems\configuration.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\Matrix.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libMems\Memory.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\NumericMatrix.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libMems\PhyloTree.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libMems\SlotAllocator.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libMems\twister.h"
+ >
+ </File>
+ </Filter>
+ <Filter
+ Name="ProgressiveAligner"
+ >
+ <File
+ RelativePath="..\libMems\Backbone.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libMems\GreedyBreakpointElimination.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libMems\PairwiseMatchAdapter.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libMems\PairwiseMatchFinder.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libMems\ProgressiveAligner.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libMems\SeedOccurrenceList.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libMems\SubstitutionMatrix.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libMems\SuperInterval.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libMems\TreeUtilities.h"
+ >
+ </File>
+ </Filter>
+ <Filter
+ Name="HomologyHMM"
+ >
+ <File
+ RelativePath="..\libMems\HomologyHMM\algebras.cc"
+ >
+ </File>
+ <File
+ RelativePath="..\libMems\HomologyHMM\algebras.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libMems\HomologyHMM\dptables.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libMems\HomologyHMM\homology.cc"
+ >
+ </File>
+ <File
+ RelativePath="..\libMems\HomologyHMM\homology.h"
+ >
+ </File>
+ <File
+ RelativePath="..\libMems\HomologyHMM\homologymain.cc"
+ >
+ </File>
+ <File
+ RelativePath="..\libMems\HomologyHMM\parameters.h"
+ >
+ </File>
+ </Filter>
+ </Filter>
+ <Filter
+ Name="Resource Files"
+ Filter="rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx"
+ UniqueIdentifier="{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}"
+ >
+ </Filter>
+ <Filter
+ Name="Source Files"
+ Filter="cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"
+ UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
+ >
+ <Filter
+ Name="dmSML"
+ >
+ <File
+ RelativePath="..\libmems\dmsml\alibc.c"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\dmsml\alinuxaio.c"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\dmsml\aPOSIXaio.c"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\dmsml\asyncio.c"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\dmsml\awin32aio.c"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\dmsml\buffer.c"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\dmsml\dmsort.c"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\dmsml\sml.c"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\dmsml\sorting.c"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\dmsml\timing.c"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\dmsml\util.c"
+ >
+ </File>
+ </Filter>
+ <Filter
+ Name="SortedMerList"
+ >
+ <File
+ RelativePath="..\libmems\DNAFileSML.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\DNAMemorySML.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\FileSML.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\MemorySML.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\SortedMerList.cpp"
+ >
+ </File>
+ </Filter>
+ <Filter
+ Name="Matches"
+ >
+ <File
+ RelativePath="..\libMems\GappedAlignment.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\gnAlignedSequences.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\libMems\MatchHashEntry.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\libMems\RepeatMatch.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\libMems\RepeatMatchList.cpp"
+ >
+ </File>
+ </Filter>
+ <Filter
+ Name="MatchFinder"
+ >
+ <File
+ RelativePath="..\libmems\MaskedMemHash.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\MatchFinder.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\MemHash.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\RepeatHash.cpp"
+ >
+ </File>
+ </Filter>
+ <Filter
+ Name="Aligner"
+ >
+ <File
+ RelativePath="..\libmems\Aligner.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\libmems\Islands.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\libMems\MuscleInterface.cpp"
+ >
+ </File>
+ </Filter>
+ <Filter
+ Name="Utility"
+ >
+ <File
+ RelativePath="..\libMems\twister.c"
+ >
+ </File>
+ </Filter>
+ <Filter
+ Name="ProgressiveAligner"
+ >
+ <File
+ RelativePath="..\libMems\Backbone.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\libMems\GreedyBreakpointElimination.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\libMems\PairwiseMatchFinder.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\libMems\ProgressiveAligner.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\libMems\SuperInterval.cpp"
+ >
+ </File>
+ </Filter>
+ </Filter>
+ </Files>
+ <Globals>
+ </Globals>
+</VisualStudioProject>
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/libmems.git
More information about the debian-med-commit
mailing list