[med-svn] [mauvealigner] 01/02: Imported Upstream version 1.2.0+4713
Andreas Tille
tille at debian.org
Sun Apr 19 20:14:23 UTC 2015
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository mauvealigner.
commit 541a254bb0b97e5cdfab13284177d260e91ae937
Author: Andreas Tille <tille at debian.org>
Date: Sun Apr 19 22:06:14 2015 +0200
Imported Upstream version 1.2.0+4713
---
AUTHORS | 1 +
COPYING | 340 +++++
ChangeLog | 0
Makefile.am | 26 +
NEWS | 0
README | 0
acinclude.m4 | 156 +++
autogen.sh | 5 +
configure.ac | 91 ++
include/getopt.h | 133 ++
src/AlignmentTree.cpp | 188 +++
src/AlignmentTree.h | 0
src/Makefile.am | 225 +++
src/MatchRecord.h | 369 +++++
src/RepeatHashCat.cpp | 12 +
src/RepeatHashCat.h | 21 +
src/SeedMatchEnumerator.h | 144 ++
src/UniqueMatchFinder.cpp | 60 +
src/UniqueMatchFinder.h | 34 +
src/addUnalignedIntervals.cpp | 33 +
src/alignmentProjector.cpp | 101 ++
src/backbone_global_to_local.cpp | 60 +
src/bbAnalyze.cpp | 1411 +++++++++++++++++++
src/bbBreakOnGenes.cpp | 358 +++++
src/bbFilter.cpp | 292 ++++
src/calculateBackboneCoverage.cpp | 138 ++
src/calculateBackboneCoverage2.cpp | 132 ++
src/calculateCoverage.cpp | 89 ++
src/checkForLGT.cpp | 253 ++++
src/coordinateTranslate.cpp | 51 +
src/countInPlaceInversions.cpp | 69 +
src/createBackboneMFA.cpp | 57 +
src/evd.cpp | 129 ++
src/extractBCITrees.cpp | 369 +++++
src/extractBackbone.cpp | 83 ++
src/extractBackbone2.cpp | 70 +
src/extractSubalignments.cpp | 96 ++
src/gappiness.cpp | 53 +
src/getAlignmentWindows.cpp | 137 ++
src/getOrthologList.cpp | 317 +++++
src/getopt.c | 1279 +++++++++++++++++
src/getopt.cpp | 772 ++++++++++
src/getopt.h | 185 +++
src/getopt1.c | 196 +++
src/joinAlignmentFiles.cpp | 108 ++
src/makeBadgerMatrix.cpp | 117 ++
src/makeMc4Matrix.cpp | 112 ++
src/mauveAligner.cpp | 919 ++++++++++++
src/mauveAligner.h | 10 +
src/mauveToXMFA.cpp | 35 +
src/mfa2xmfa.cpp | 117 ++
src/multiEVD.cpp | 217 +++
src/multiToRawSequence.cpp | 28 +
src/pairCompare.cpp | 85 ++
src/progressiveMauve.cpp | 768 ++++++++++
src/projectAndStrip.cpp | 144 ++
src/randomGeneSample.cpp | 165 +++
src/repeatoire.cpp | 2716 ++++++++++++++++++++++++++++++++++++
src/rootTrees.cpp | 128 ++
src/scoreALU.cpp | 729 ++++++++++
src/scoreAlignment.cpp | 467 +++++++
src/scoreProcrastAlignment.cpp | 458 ++++++
src/sortContigs.cpp | 181 +++
src/stripGapColumns.cpp | 74 +
src/stripSubsetLCBs.cpp | 183 +++
src/toEvoHighwayFormat.cpp | 148 ++
src/toGBKsequence.cpp | 38 +
src/toGrimmFormat.cpp | 84 ++
src/toMultiFastA.cpp | 54 +
src/toRawSequence.cpp | 27 +
src/transposeCoordinates.cpp | 71 +
src/unalign.cpp | 91 ++
src/uniqueMerCount.cpp | 41 +
src/uniquifyTrees.cpp | 250 ++++
src/xmfa2maf.cpp | 87 ++
75 files changed, 17187 insertions(+)
diff --git a/AUTHORS b/AUTHORS
new file mode 100644
index 0000000..1dcdaf3
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1 @@
+Aaron Darling
diff --git a/COPYING b/COPYING
new file mode 100644
index 0000000..d60c31a
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,340 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
+ 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.) You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+ To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+ We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+ Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ GNU GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+ 1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+ 2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in
+ whole or in part contains or is derived from the Program or any
+ part thereof, to be licensed as a whole at no charge to all third
+ parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you provide
+ a warranty) and that users may redistribute the program under
+ these conditions, and telling the user how to view a copy of this
+ License. (Exception: if the Program itself is interactive but
+ does not normally print such an announcement, your work based on
+ the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of Sections
+ 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+ 5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+ 6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+ 7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation. If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+ 10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+ NO WARRANTY
+
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+ Gnomovision version 69, Copyright (C) year name of author
+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary. Here is a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+ `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+ <signature of Ty Coon>, 1 April 1989
+ Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs. If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library. If this is what you want to do, use the GNU Library General
+Public License instead of this License.
diff --git a/ChangeLog b/ChangeLog
new file mode 100644
index 0000000..e69de29
diff --git a/Makefile.am b/Makefile.am
new file mode 100644
index 0000000..5c3907c
--- /dev/null
+++ b/Makefile.am
@@ -0,0 +1,26 @@
+EXTRA_DIST = \
+projects/everything.sln \
+projects/mauveAligner.sln \
+projects/calculateBackboneCoverage2.vcproj \
+projects/calculateBackboneCoverage.vcproj \
+projects/checkForLGT.vcproj \
+projects/extractBackbone.vcproj \
+projects/extractBCITrees.vcproj \
+projects/extractSubalignments.vcproj \
+projects/mauveAligner.vcproj \
+projects/repeatoire.vcproj \
+projects/progressiveMauve.vcproj \
+projects/rootTrees.vcproj \
+projects/scoreAlignment.vcproj \
+projects/scoreALU.vcproj \
+projects/sortContigs.vcproj \
+projects/toEvoHighwayFormat.vcproj \
+projects/toGrimmFormat.vcproj \
+projects/transposeCoordinates.vcproj \
+projects/unalign.vcproj \
+projects/uniqueMerCount.vcproj \
+projects/uniquifyTrees.vcproj \
+projects/mauveAligner.dox
+
+SUBDIRS = src
+
diff --git a/NEWS b/NEWS
new file mode 100644
index 0000000..e69de29
diff --git a/README b/README
new file mode 100644
index 0000000..e69de29
diff --git a/acinclude.m4 b/acinclude.m4
new file mode 100644
index 0000000..cbb46db
--- /dev/null
+++ b/acinclude.m4
@@ -0,0 +1,156 @@
+# pkg.m4 - Macros to locate and utilise pkg-config. -*- Autoconf -*-
+#
+# Copyright © 2004 Scott James Remnant <scott at netsplit.com>.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+# PKG_PROG_PKG_CONFIG([MIN-VERSION])
+# ----------------------------------
+AC_DEFUN([PKG_PROG_PKG_CONFIG],
+[m4_pattern_forbid([^_?PKG_[A-Z_]+$])
+m4_pattern_allow([^PKG_CONFIG(_PATH)?$])
+AC_ARG_VAR([PKG_CONFIG], [path to pkg-config utility])dnl
+if test "x$ac_cv_env_PKG_CONFIG_set" != "xset"; then
+ AC_PATH_TOOL([PKG_CONFIG], [pkg-config])
+fi
+if test -n "$PKG_CONFIG"; then
+ _pkg_min_version=m4_default([$1], [0.9.0])
+ AC_MSG_CHECKING([pkg-config is at least version $_pkg_min_version])
+ if $PKG_CONFIG --atleast-pkgconfig-version $_pkg_min_version; then
+ AC_MSG_RESULT([yes])
+ else
+ AC_MSG_RESULT([no])
+ PKG_CONFIG=""
+ fi
+
+fi[]dnl
+])# PKG_PROG_PKG_CONFIG
+
+# PKG_CHECK_EXISTS(MODULES, [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND])
+#
+# Check to see whether a particular set of modules exists. Similar
+# to PKG_CHECK_MODULES(), but does not set variables or print errors.
+#
+#
+# Similar to PKG_CHECK_MODULES, make sure that the first instance of
+# this or PKG_CHECK_MODULES is actually executed, or make sure to call
+# PKG_PROG_PKG_CONFIG manually
+# --------------------------------------------------------------
+AC_DEFUN([PKG_CHECK_EXISTS],
+[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl
+if test -n "$PKG_CONFIG" && \
+ AC_RUN_LOG([$PKG_CONFIG --exists --print-errors "$1"]); then
+ m4_ifval([$2], [$2], [:])
+m4_ifvaln([$3], [else
+ $3])dnl
+fi])
+
+
+# _PKG_CONFIG([VARIABLE], [COMMAND], [MODULES])
+# ---------------------------------------------
+m4_define([_PKG_CONFIG],
+[if test -n "$PKG_CONFIG"; then
+ if test -n "$$1"; then
+ pkg_cv_[]$1="$$1"
+ else
+ PKG_CHECK_EXISTS([$3],
+ [pkg_cv_[]$1=`$PKG_CONFIG --[]$2 "$3" 2>/dev/null`],
+ [pkg_failed=yes])
+ fi
+else
+ pkg_failed=untried
+fi[]dnl
+])# _PKG_CONFIG
+
+# _PKG_SHORT_ERRORS_SUPPORTED
+# -----------------------------
+AC_DEFUN([_PKG_SHORT_ERRORS_SUPPORTED],
+[AC_REQUIRE([PKG_PROG_PKG_CONFIG])
+if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then
+ _pkg_short_errors_supported=yes
+else
+ _pkg_short_errors_supported=no
+fi[]dnl
+])# _PKG_SHORT_ERRORS_SUPPORTED
+
+
+# PKG_CHECK_MODULES(VARIABLE-PREFIX, MODULES, [ACTION-IF-FOUND],
+# [ACTION-IF-NOT-FOUND])
+#
+#
+# Note that if there is a possibility the first call to
+# PKG_CHECK_MODULES might not happen, you should be sure to include an
+# explicit call to PKG_PROG_PKG_CONFIG in your configure.ac
+#
+#
+# --------------------------------------------------------------
+AC_DEFUN([PKG_CHECK_MODULES],
+[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl
+AC_ARG_VAR([$1][_CFLAGS], [C compiler flags for $1, overriding pkg-config])dnl
+AC_ARG_VAR([$1][_LIBS], [linker flags for $1, overriding pkg-config])dnl
+
+pkg_failed=no
+AC_MSG_CHECKING([for $1])
+
+_PKG_CONFIG([$1][_CFLAGS], [cflags], [$2])
+_PKG_CONFIG([$1][_LIBS], [libs], [$2])
+
+m4_define([_PKG_TEXT], [Alternatively, you may set the environment variables $1[]_CFLAGS
+and $1[]_LIBS to avoid the need to call pkg-config.
+See the pkg-config man page for more details.])
+
+if test $pkg_failed = yes; then
+ _PKG_SHORT_ERRORS_SUPPORTED
+ if test $_pkg_short_errors_supported = yes; then
+ $1[]_PKG_ERRORS=`$PKG_CONFIG --short-errors --errors-to-stdout --print-errors "$2"`
+ else
+ $1[]_PKG_ERRORS=`$PKG_CONFIG --errors-to-stdout --print-errors "$2"`
+ fi
+ # Put the nasty error message in config.log where it belongs
+ echo "$$1[]_PKG_ERRORS" >&AS_MESSAGE_LOG_FD
+
+ ifelse([$4], , [AC_MSG_ERROR(dnl
+[Package requirements ($2) were not met:
+
+$$1_PKG_ERRORS
+
+Consider adjusting the PKG_CONFIG_PATH environment variable if you
+installed software in a non-standard prefix.
+
+_PKG_TEXT
+])],
+ [$4])
+elif test $pkg_failed = untried; then
+ ifelse([$4], , [AC_MSG_FAILURE(dnl
+[The pkg-config script could not be found or is too old. Make sure it
+is in your PATH or set the PKG_CONFIG environment variable to the full
+path to pkg-config.
+
+_PKG_TEXT
+
+To get pkg-config, see <http://www.freedesktop.org/software/pkgconfig>.])],
+ [$4])
+else
+ $1[]_CFLAGS=$pkg_cv_[]$1[]_CFLAGS
+ $1[]_LIBS=$pkg_cv_[]$1[]_LIBS
+ AC_MSG_RESULT([yes])
+ ifelse([$3], , :, [$3])
+fi[]dnl
+])# PKG_CHECK_MODULES
diff --git a/autogen.sh b/autogen.sh
new file mode 100755
index 0000000..65f32e1
--- /dev/null
+++ b/autogen.sh
@@ -0,0 +1,5 @@
+#!/bin/sh
+mkdir -p config
+autoreconf --force --install -I config
+echo "Now run ./configure --prefix=$HOME ; make install"
+
diff --git a/configure.ac b/configure.ac
new file mode 100644
index 0000000..8f7a923
--- /dev/null
+++ b/configure.ac
@@ -0,0 +1,91 @@
+dnl Process this file with autoconf to produce a configure script.
+AC_PREREQ([2.59])
+AC_INIT(mauveAligner, 1.2.0)
+AC_CONFIG_SRCDIR([src/mauveAligner.cpp])
+AC_CONFIG_AUX_DIR(config)
+
+
+dnl Get the target and build system types and add appropriate options
+AC_CANONICAL_TARGET
+
+AM_INIT_AUTOMAKE([no-define])
+AM_CONFIG_HEADER([src/config.h])
+
+AC_PREFIX_DEFAULT(/usr/local)
+
+dnl Override default O2
+CFLAGS=${CFLAGS-""}
+CXXFLAGS=${CXXFLAGS-""}
+
+dnl Checks for programs.
+AC_PROG_CC
+AC_PROG_CXX
+AC_PROG_INSTALL
+AC_PROG_LN_S
+
+dnl Checks for header files.
+AC_HEADER_STDC
+
+dnl Check what compiler we're using
+AM_CONDITIONAL(ICC, test x$CXX = xicc )
+
+dnl Check for getopt_long
+AC_CHECK_FUNC(getopt_long,
+[GETOPT_LONG_SYSTEM=""],
+[GETOPT_LONG_SYSTEM='getopt.$(OBJEXT) getopt1.$(OBJEXT)']
+)
+AC_SUBST([GETOPT_LONG_SYSTEM])
+
+dnl Check for getopt
+AC_CHECK_FUNC(getopt,
+[HAVE_GETOPT="true"],
+AC_DEFINE(GETOPT_UNDEFINED,[],"Define this if the system does not provide getopt
+()")
+)
+
+PKG_CHECK_MODULES(DEPS, libMems-1.6 >= 1.0.0)
+AC_SUBST(DEPS_CFLAGS)
+AC_SUBST(DEPS_LIBS)
+
+dnl Mac OS X won't allow static compilation...
+STATIC_FLAG="-static -Wl,--whole-archive -lpthread -Wl,--no-whole-archive"
+if ( test "x$target_vendor" = "xapple") then
+ STATIC_FLAG=""
+fi
+AC_SUBST(STATIC_FLAG)
+
+dnl Allow debugging compilation
+AC_ARG_ENABLE(debug,
+[ --enable-debug Turn on debugging],
+[case "${enableval}" in
+ yes) debug=true ;;
+ no) debug=false ;;
+ *) AC_MSG_ERROR(bad value ${enableval} for --enable-debug) ;;
+esac],[debug=false])
+AM_CONDITIONAL(DEBUG, test x$debug = xtrue)
+
+dnl Checks for typedefs, structures, and compiler characteristics.
+AC_C_CONST
+AC_C_INLINE
+dnl AC_C_BIGENDIAN
+AC_HEADER_TIME
+
+AC_CHECK_FUNCS([memset])
+AC_CHECK_HEADERS([libintl.h])
+AC_CHECK_HEADERS([stdlib.h])
+AC_CHECK_HEADERS([string.h])
+AC_CHECK_HEADERS([strings.h])
+AC_CHECK_HEADERS([unistd.h])
+AC_CHECK_HEADERS([wchar.h])
+AC_FUNC_MALLOC
+AC_HEADER_STDBOOL
+AC_TYPE_SIZE_T
+
+dnl Checks for library functions.
+AC_PROG_GCC_TRADITIONAL
+
+dnl SAVE_LIBRARY_VERSION
+AC_SUBST(LIBTOOL_VERSION_INFO)
+
+
+AC_OUTPUT(Makefile src/Makefile )
diff --git a/include/getopt.h b/include/getopt.h
new file mode 100644
index 0000000..1330eea
--- /dev/null
+++ b/include/getopt.h
@@ -0,0 +1,133 @@
+/* Declarations for getopt.
+ Copyright (C) 1989, 90, 91, 92, 93, 94 Free Software Foundation, Inc.
+
+This file is part of the GNU C Library. Its master source is NOT part of
+the C library, however. The master source lives in /gd/gnu/lib.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB. If
+not, write to the Free Software Foundation, Inc., 675 Mass Ave,
+Cambridge, MA 02139, USA. */
+
+#ifndef _GETOPT_H
+#define _GETOPT_H 1
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* For communication from `getopt' to the caller.
+ When `getopt' finds an option that takes an argument,
+ the argument value is returned here.
+ Also, when `ordering' is RETURN_IN_ORDER,
+ each non-option ARGV-element is returned here. */
+
+extern char *optarg;
+
+/* Index in ARGV of the next element to be scanned.
+ This is used for communication to and from the caller
+ and for communication between successive calls to `getopt'.
+
+ On entry to `getopt', zero means this is the first call; initialize.
+
+ When `getopt' returns EOF, this is the index of the first of the
+ non-option elements that the caller should itself scan.
+
+ Otherwise, `optind' communicates from one call to the next
+ how much of ARGV has been scanned so far. */
+
+extern int optind;
+
+/* Callers store zero here to inhibit the error message `getopt' prints
+ for unrecognized options. */
+
+extern int opterr;
+
+/* Set to an option character which was unrecognized. */
+
+extern int optopt;
+
+/* Describe the long-named options requested by the application.
+ The LONG_OPTIONS argument to getopt_long or getopt_long_only is a vector
+ of `struct option' terminated by an element containing a name which is
+ zero.
+
+ The field `has_arg' is:
+ no_argument (or 0) if the option does not take an argument,
+ required_argument (or 1) if the option requires an argument,
+ optional_argument (or 2) if the option takes an optional argument.
+
+ If the field `flag' is not NULL, it points to a variable that is set
+ to the value given in the field `val' when the option is found, but
+ left unchanged if the option is not found.
+
+ To have a long-named option do something other than set an `int' to
+ a compiled-in constant, such as set a value from `optarg', set the
+ option's `flag' field to zero and its `val' field to a nonzero
+ value (the equivalent single-letter option character, if there is
+ one). For long options that have a zero `flag' field, `getopt'
+ returns the contents of the `val' field. */
+
+struct option
+{
+ /* Option name.  The element that terminates a LONG_OPTIONS vector has a
+ zero name.  Declared const only when __STDC__ is defined and nonzero. */
+#if defined (__STDC__) && __STDC__
+ const char *name;
+#else
+ char *name;
+#endif
+ /* has_arg can't be an enum because some compilers complain about
+ type mismatches in all the code that assumes it is an int. */
+ int has_arg;
+ /* If not NULL, points to a variable that is set to `val' when the
+ option is found. */
+ int *flag;
+ /* Value stored through `flag', or returned by `getopt' when `flag'
+ is zero. */
+ int val;
+};
+
+/* Names for the values of the `has_arg' field of `struct option'. */
+
+#define no_argument 0
+#define required_argument 1
+#define optional_argument 2
+
+#if ( defined (__STDC__) && __STDC__ ) || defined(__cplusplus) || defined(MSDOS)
+#ifdef __GNU_LIBRARY__
+/* Many other libraries have conflicting prototypes for getopt, with
+ differences in the consts, in stdlib.h. To avoid compilation
+ errors, only prototype getopt for the GNU C library. */
+extern int getopt (int argc, char *const *argv, const char *shortopts);
+#else /* not __GNU_LIBRARY__ */
+extern int getopt (int argc, char *const *argv, const char *optstring);
+#endif /* __GNU_LIBRARY__ */
+extern int getopt_long (int argc, char *const *argv, const char *shortopts,
+ const struct option *longopts, int *longind);
+extern int getopt_long_only (int argc, char *const *argv,
+ const char *shortopts,
+ const struct option *longopts, int *longind);
+
+/* Internal only. Users should not call this directly. */
+extern int _getopt_internal (int argc, char *const *argv,
+ const char *shortopts,
+ const struct option *longopts, int *longind,
+ int long_only);
+#else /* not __STDC__ */
+extern int getopt ();
+extern int getopt_long ();
+extern int getopt_long_only ();
+
+extern int _getopt_internal ();
+#endif /* __STDC__ */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _GETOPT_H */
diff --git a/src/AlignmentTree.cpp b/src/AlignmentTree.cpp
new file mode 100644
index 0000000..b55994d
--- /dev/null
+++ b/src/AlignmentTree.cpp
@@ -0,0 +1,188 @@
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "PhyloTree.h"
+#include <sstream>
+#include <stack>
+using namespace std;
+
+typedef unsigned uint;
+
+/** Construct an empty tree: no nodes, weight 0 and root index 0. */
+PhyloTree::PhyloTree() :
+vector< TreeNode >(),
+weight( 0 ),
+root( 0 )
+{}
+
+/** Copy constructor: copies the node vector, the weight and the root index of pt. */
+PhyloTree::PhyloTree( const PhyloTree& pt ) :
+vector< TreeNode >( pt ),
+weight( pt.weight ),
+root( pt.root )
+{}
+
+/** Assignment: replaces the nodes, weight and root index with those of pt. */
+PhyloTree& PhyloTree::operator=( const PhyloTree& pt )
+{
+	// assign the node storage through the vector base class first
+	vector< TreeNode >& nodes = *this;
+	const vector< TreeNode >& other_nodes = pt;
+	nodes = other_nodes;
+
+	// then the scalar bookkeeping
+	weight = pt.weight;
+	root = pt.root;
+	return *this;
+}
+
+/** Destructor: nothing to release beyond what the vector base class and members release themselves. */
+PhyloTree::~PhyloTree()
+{}
+
+/** Remove every node and reset the weight and the root index to zero. */
+void PhyloTree::clear()
+{
+	root = 0;
+	weight = 0;
+	vector< TreeNode >::clear();
+}
+
+
+/**
+ * readTree version 2.0: read in a phylogenetic tree in the Newick file format.
+ *
+ * Exactly one line is consumed from tree_file. Any previous content of the
+ * tree is discarded first; if no line can be read the tree is left empty.
+ * An optional "[weight]" prefix located before the first "(" is stored in
+ * the weight member. The remaining text up to the first ';' is scanned one
+ * character at a time and only the characters '(' ')' ',' and ':' drive the
+ * parser.
+ *
+ * NOTE(review): ')' ',' and ':' call node_stack.top() without checking that
+ * the stack is non-empty, so input that does not start with '(' looks unsafe.
+ * NOTE(review): names are only stored when a ':' is seen, so leaves written
+ * without a branch length do not appear to be created -- confirm intended.
+ *
+ * @param tree_file stream positioned at the line holding the tree
+ */
+void PhyloTree::readTree( istream& tree_file ){
+ string line;
+ clear();
+ if( !getline( tree_file, line ) )
+ return;
+
+ stringstream line_str( line );
+
+ // look for a weight
+ // a weight is accepted only when "[" precedes "]" and "]" precedes the first "("
+ string::size_type open_bracket_pos = line.find( "[" );
+ string::size_type bracket_pos = line.find( "]" );
+ if( open_bracket_pos != string::npos && bracket_pos != string::npos &&
+ open_bracket_pos < bracket_pos && bracket_pos < line.find( "(" ) ){
+ // read in a weight
+ // the first getline skips everything up to '[', the second extracts the text before ']'
+ getline( line_str, line, '[' );
+ getline( line_str, line, ']' );
+ stringstream weight_str( line );
+ weight_str >> weight;
+ }
+
+ // ready to begin parsing the tree data.
+ // line_str now sits just past the weight (if any); take everything up to ';'
+ string tree_line;
+ getline( line_str, tree_line, ';' );
+ // read_state values used below:
+ // 0 - nothing parsed yet
+ // 1 - the next ':' creates a new node
+ // 2 - the node on top of the stack is popped once its branch length is read
+ uint read_state = 0; /**< read_state of 0 indicates nothing has been parsed yet */
+ uint section_start = 0; // index of the first character of the token being collected
+ stack< node_id_t > node_stack; // path of node indices from the outermost '(' to the current node
+ stringstream blen_str; // scratch stream used to convert branch length text
+ TreeNode new_node;
+ new_node.distance = 0; // default the distance to 0
+ for( uint charI = 0; charI < tree_line.size(); charI++ ){
+ switch( tree_line[ charI ] ){
+ // if this is an open parens then simply create a new
+ // parent node and push it on the parent stack
+ case '(':
+ if( node_stack.size() > 0 ){
+ // link the new node to the enclosing node in both directions
+ new_node.parents.clear();
+ new_node.parents.push_back( node_stack.top() );
+ (*this)[ node_stack.top() ].children.push_back( (*this).size() );
+ }
+ node_stack.push( (*this).size() );
+ push_back( new_node );
+ read_state = 1;
+ section_start = charI + 1;
+ break;
+ case ')':
+ // read off a branch length
+ blen_str.clear();
+ blen_str.str( tree_line.substr( section_start, charI - section_start ) );
+ blen_str >> (*this)[ node_stack.top() ].distance;
+ if( read_state == 2 )
+ node_stack.pop();
+ section_start = charI + 1;
+ // pop off the top of the node stack after its branch length is read:
+ read_state = 2;
+ break;
+ case ',':
+ // read off a branch length
+ blen_str.clear();
+ blen_str.str( tree_line.substr( section_start, charI - section_start ) );
+ blen_str >> (*this)[ node_stack.top() ].distance;
+ if( read_state == 2 )
+ node_stack.pop();
+ section_start = charI + 1;
+ read_state = 1; // indicates that we'll be creating a new node when we hit :
+ break;
+ case ':':
+ // read off a name, if possible
+ if( read_state == 1 ){
+ // create the node as a child of the node currently on top of the stack
+ new_node.parents.clear();
+ new_node.parents.push_back( node_stack.top() );
+ (*this)[ node_stack.top() ].children.push_back( (*this).size() );
+ node_stack.push( (*this).size() );
+ push_back( new_node );
+ read_state = 2; // pop this node after reading its branch length
+ }
+ // the text collected since section_start becomes the name of the top node
+ (*this)[ node_stack.top() ].name = tree_line.substr( section_start, charI - section_start );
+ section_start = charI + 1;
+ break;
+ default:
+ break;
+ }
+ }
+
+}
+
+
+/**
+ * Write the tree to os as a single parenthesised line terminated by ";"
+ * and a newline. A "[weight]" prefix is emitted only when weight is non-zero.
+ * Nodes without children are written as "name:distance"; nodes with children
+ * are written as "(...)" followed by ":distance", except for the root, whose
+ * distance is not written. Names of nodes that have children are not written.
+ *
+ * The traversal is iterative and starts at the root member.
+ *
+ * NOTE(review): (*this)[ root ] is accessed without checking that the tree
+ * has any nodes.
+ * NOTE(review): when the root itself has no children the opening "(" that
+ * is written unconditionally is never closed.
+ *
+ * @param os destination stream
+ */
+void PhyloTree::writeTree( ostream& os ) const{
+ // node_stack: indices of the nodes on the path from the root to the current node
+ // child_stack: for each node with children on that path, how many children have been visited
+ stack< node_id_t > node_stack;
+ stack< uint > child_stack;
+ node_stack.push( root );
+ child_stack.push( 0 );
+
+ if( (*this).weight != 0 )
+ os << "[" << weight << "]";
+ os << "(";
+
+ while( node_stack.size() > 0 ) {
+ if( (*this)[ node_stack.top() ].children.size() != 0 ){
+ // this is a parent node
+ // if we have scanned all its children then pop it
+ if( child_stack.top() == (*this)[ node_stack.top() ].children.size() ){
+ os << ")";
+ // only the root is on the stack when its size is 1; the root gets no branch length
+ if( node_stack.size() > 1 )
+ os << ":" << (*this)[ node_stack.top() ].distance;
+ node_stack.pop();
+ child_stack.pop();
+ continue;
+ }
+ // try to recurse to its children
+ // if the child is a parent as well spit out a paren
+ node_id_t child = (*this)[ node_stack.top() ].children[ child_stack.top() ];
+ node_stack.push( child );
+ child_stack.top()++;
+ // print a comma to separate multiple children
+ if( child_stack.top() > 1 )
+ os << ",";
+ if( (*this)[ child ].children.size() > 0 ){
+ child_stack.push( 0 );
+ os << "(";
+ }
+ continue;
+ }
+
+ // this is a leaf node
+ os << (*this)[ node_stack.top() ].name << ":" << (*this)[ node_stack.top() ].distance;
+
+ // pop the child
+ // (no child_stack entry was pushed for a node without children)
+ node_stack.pop();
+ }
+ os << ";" << endl;
+}
+
+
+/** Height of the whole tree: getHeight( node_id_t ) evaluated at the root. */
+double PhyloTree::getHeight() const
+{
+	const double tree_height = getHeight( root );
+	return tree_height;
+}
+/**
+ * Sum of the distance values found by starting at nodeI and repeatedly
+ * descending into the first child until a node without children is reached.
+ * Only the first child of each node is followed.
+ */
+double PhyloTree::getHeight( node_id_t nodeI ) const
+{
+	const TreeNode& node = (*this)[ nodeI ];
+	double height = node.distance;
+	if( node.children.size() != 0 )
+		height += getHeight( node.children[ 0 ] );
+	return height;
+}
diff --git a/src/AlignmentTree.h b/src/AlignmentTree.h
new file mode 100644
index 0000000..e69de29
diff --git a/src/Makefile.am b/src/Makefile.am
new file mode 100644
index 0000000..7d9c94a
--- /dev/null
+++ b/src/Makefile.am
@@ -0,0 +1,225 @@
+
+# Flags shared by every program in this directory.
+# A --enable-debug build adds warnings, debug symbols and __GNDEBUG__;
+# a normal build only defines COMMAND_LINE.
+if DEBUG
+D_CXXFLAGS = -Wall -g -DCOMMAND_LINE -D__GNDEBUG__
+else
+P_CXXFLAGS = -DCOMMAND_LINE
+endif
+OPTIMIZATION = -O2 -funroll-loops -fomit-frame-pointer -ftree-vectorize
+AM_CXXFLAGS = $(OPTIMIZATION) $(D_CXXFLAGS) $(P_CXXFLAGS)
+AM_LDFLAGS = @STATIC_FLAG@
+LIBRARY_CL = $(DEPS_LIBS)
+#AM_LDADD = $(DEPS_LIBS)
+# AM_CPPFLAGS replaces the deprecated INCLUDES variable; both end up on the
+# preprocessor part of the compile command line.
+AM_CPPFLAGS = @DEPS_CFLAGS@
+
+bin_PROGRAMS = mauveAligner mauveStatic scoreAlignment \
+uniqueMerCount toRawSequence \
+mfa2xmfa addUnalignedIntervals \
+toMultiFastA getAlignmentWindows uniquifyTrees \
+toGrimmFormat mauveToXMFA \
+stripGapColumns progressiveMauve progressiveMauveStatic \
+extractBCITrees createBackboneMFA \
+repeatoire alignmentProjector stripSubsetLCBs \
+projectAndStrip makeBadgerMatrix randomGeneSample getOrthologList \
+bbFilter bbAnalyze backbone_global_to_local xmfa2maf coordinateTranslate
+
+EXTRA_PROGRAMS = bbBreakOnGenes mauveMpatrol mauveEfence toGBKsequence \
+multiToRawSequence unalign makeMc4Matrix multiEVD evd scoreALU \
+calculateBackboneCoverage2 sortContigs countInPlaceInversions gappiness \
+joinAlignmentFiles extractBackbone2 pairCompare \
+calculateCoverage calculateBackboneCoverage extractBackbone transposeCoordinates
+
+mauveAligner_SOURCES = mauveAligner.cpp mauveAligner.h
+mauveAligner_LDFLAGS = $(OPTIMIZATION)
+mauveAligner_LDADD = $(DEPS_LIBS)
+EXTRA_mauveAligner_SOURCES = getopt.c getopt.h getopt1.c
+mauveAligner_DEPENDENCIES = @GETOPT_LONG_SYSTEM@
+
+mauveMpatrol_SOURCES = mauveAligner.cpp
+mauveMpatrol_LDFLAGS = -lmpatrol -lbfd -liberty $(LIBRARY_CL) `wx-config --libs`
+EXTRA_mauveMpatrol_SOURCES = getopt.c getopt.h getopt1.c
+mauveMpatrol_DEPENDENCIES = @GETOPT_LONG_SYSTEM@
+
+
+
+mauveEfence_SOURCES = mauveAligner.cpp
+mauveEfence_LDADD = -lefence $(DEPS_LIBS)
+EXTRA_mauveEfence_SOURCES = getopt.c getopt.h getopt1.c
+mauveEfence_DEPENDENCIES = @GETOPT_LONG_SYSTEM@
+
+mauveStatic_SOURCES = mauveAligner.cpp
+mauveStatic_LDADD = $(LIBRARY_CL)
+EXTRA_mauveStatic_SOURCES = getopt.c getopt.h getopt1.c
+mauveStatic_DEPENDENCIES = @GETOPT_LONG_SYSTEM@
+
+#mauveAligner4_SOURCES = mauveAligner.cpp
+#mauveAligner4_LDADD = $(LIBRARY_CL)
+#mauveAligner4_CXXFLAGS = MAX_SEQ_COUNT=4
+#EXTRA_mauveAligner4_SOURCES = getopt.c getopt.h getopt1.c
+#mauveAligner4_DEPENDENCIES = @GETOPT_LONG_SYSTEM@
+
+
+calculateCoverage_SOURCES = calculateCoverage.cpp
+calculateCoverage_LDADD = $(LIBRARY_CL)
+EXTRA_calculateCoverage_SOURCES = getopt.c getopt.h getopt1.c
+calculateCoverage_DEPENDENCIES = @GETOPT_LONG_SYSTEM@
+
+scoreAlignment_SOURCES = scoreAlignment.cpp
+scoreAlignment_LDADD = $(LIBRARY_CL)
+EXTRA_scoreAlignment_SOURCES = getopt.c getopt.h getopt1.c
+scoreAlignment_DEPENDENCIES = @GETOPT_LONG_SYSTEM@
+
+extractBackbone_SOURCES = extractBackbone.cpp
+extractBackbone_LDADD = $(LIBRARY_CL)
+EXTRA_extractBackbone_SOURCES = getopt.c getopt.h getopt1.c
+extractBackbone_DEPENDENCIES = @GETOPT_LONG_SYSTEM@
+
+toRawSequence_SOURCES = toRawSequence.cpp
+toRawSequence_LDADD = $(LIBRARY_CL)
+EXTRA_toRawSequence_SOURCES = getopt.c getopt.h getopt1.c
+toRawSequence_DEPENDENCIES = @GETOPT_LONG_SYSTEM@
+
+transposeCoordinates_SOURCES = transposeCoordinates.cpp
+transposeCoordinates_LDADD = $(LIBRARY_CL)
+EXTRA_transposeCoordinates_SOURCES = getopt.c getopt.h getopt1.c
+transposeCoordinates_DEPENDENCIES = @GETOPT_LONG_SYSTEM@
+
+uniqueMerCount_SOURCES = uniqueMerCount.cpp
+uniqueMerCount_LDADD = $(LIBRARY_CL)
+EXTRA_uniqueMerCount_SOURCES = getopt.c getopt.h getopt1.c
+uniqueMerCount_DEPENDENCIES = @GETOPT_LONG_SYSTEM@
+
+calculateBackboneCoverage_SOURCES = calculateBackboneCoverage.cpp
+calculateBackboneCoverage_LDADD = $(LIBRARY_CL)
+EXTRA_calculateBackboneCoverage_SOURCES = getopt.c getopt.h getopt1.c
+calculateBackboneCoverage_DEPENDENCIES = @GETOPT_LONG_SYSTEM@
+
+mfa2xmfa_SOURCES = mfa2xmfa.cpp
+mfa2xmfa_LDADD = $(LIBRARY_CL)
+EXTRA_mfa2xmfa_SOURCES = getopt.c getopt.h getopt1.c
+mfa2xmfa_DEPENDENCIES = @GETOPT_LONG_SYSTEM@
+
+xmfa2maf_SOURCES = xmfa2maf.cpp
+xmfa2maf_LDADD = $(LIBRARY_CL)
+EXTRA_xmfa2maf_SOURCES = getopt.c getopt.h getopt1.c
+xmfa2maf_DEPENDENCIES = @GETOPT_LONG_SYSTEM@
+
+toGBKsequence_SOURCES = toGBKsequence.cpp
+toGBKsequence_LDADD = $(LIBRARY_CL)
+EXTRA_toGBKsequence_SOURCES = getopt.c getopt.h getopt1.c
+toGBKsequence_DEPENDENCIES = @GETOPT_LONG_SYSTEM@
+
+multiToRawSequence_SOURCES = multiToRawSequence.cpp
+multiToRawSequence_LDADD = $(LIBRARY_CL)
+EXTRA_multiToRawSequence_SOURCES = getopt.c getopt.h getopt1.c
+multiToRawSequence_DEPENDENCIES = @GETOPT_LONG_SYSTEM@
+
+unalign_SOURCES = unalign.cpp
+unalign_LDADD = $(LIBRARY_CL)
+EXTRA_unalign_SOURCES = getopt.c getopt.h getopt1.c
+unalign_DEPENDENCIES = @GETOPT_LONG_SYSTEM@
+
+addUnalignedIntervals_SOURCES = addUnalignedIntervals.cpp
+addUnalignedIntervals_LDADD = $(LIBRARY_CL)
+
+toMultiFastA_SOURCES = toMultiFastA.cpp
+toMultiFastA_LDADD = $(LIBRARY_CL)
+
+getAlignmentWindows_SOURCES = getAlignmentWindows.cpp
+getAlignmentWindows_LDADD = $(LIBRARY_CL)
+
+extractBackbone2_SOURCES = extractBackbone2.cpp
+extractBackbone2_LDADD = $(LIBRARY_CL)
+
+uniquifyTrees_SOURCES = uniquifyTrees.cpp
+uniquifyTrees_LDADD = $(LIBRARY_CL)
+
+
+countInPlaceInversions_SOURCES = countInPlaceInversions.cpp
+countInPlaceInversions_LDADD = $(LIBRARY_CL)
+
+toGrimmFormat_SOURCES = toGrimmFormat.cpp
+toGrimmFormat_LDADD = $(LIBRARY_CL)
+
+joinAlignmentFiles_SOURCES = joinAlignmentFiles.cpp
+joinAlignmentFiles_LDADD = $(LIBRARY_CL)
+
+mauveToXMFA_SOURCES = mauveToXMFA.cpp
+mauveToXMFA_LDADD = $(LIBRARY_CL)
+
+stripGapColumns_SOURCES = stripGapColumns.cpp
+stripGapColumns_LDADD = $(LIBRARY_CL)
+
+gappiness_SOURCES = gappiness.cpp
+gappiness_LDADD = $(LIBRARY_CL)
+
+
+progressiveMauve_SOURCES = progressiveMauve.cpp UniqueMatchFinder.h UniqueMatchFinder.cpp
+progressiveMauve_LDFLAGS =
+progressiveMauve_LDADD = $(LIBRARY_CL)
+
+progressiveMauveStatic_SOURCES = progressiveMauve.cpp UniqueMatchFinder.h UniqueMatchFinder.cpp
+progressiveMauveStatic_LDADD = $(LIBRARY_CL)
+
+sortContigs_SOURCES = sortContigs.cpp
+sortContigs_LDADD = $(LIBRARY_CL)
+
+extractBCITrees_SOURCES = extractBCITrees.cpp
+extractBCITrees_LDADD = $(LIBRARY_CL)
+
+calculateBackboneCoverage2_SOURCES = calculateBackboneCoverage2.cpp
+calculateBackboneCoverage2_LDADD = $(LIBRARY_CL)
+
+createBackboneMFA_SOURCES = createBackboneMFA.cpp
+createBackboneMFA_LDADD = $(LIBRARY_CL)
+
+pairCompare_SOURCES = pairCompare.cpp
+pairCompare_LDADD = $(LIBRARY_CL)
+
+repeatoire_SOURCES = repeatoire.cpp MatchRecord.h SeedMatchEnumerator.h
+repeatoire_LDADD = $(LIBRARY_CL)
+
+scoreALU_SOURCES = scoreALU.cpp
+scoreALU_LDADD = $(LIBRARY_CL)
+
+evd_SOURCES = evd.cpp
+evd_LDADD = $(LIBRARY_CL)
+
+alignmentProjector_SOURCES = alignmentProjector.cpp
+alignmentProjector_LDADD = $(LIBRARY_CL)
+
+stripSubsetLCBs_SOURCES = stripSubsetLCBs.cpp
+stripSubsetLCBs_LDADD = $(LIBRARY_CL)
+
+projectAndStrip_SOURCES = projectAndStrip.cpp
+projectAndStrip_LDADD = $(LIBRARY_CL)
+
+makeBadgerMatrix_SOURCES = makeBadgerMatrix.cpp
+makeBadgerMatrix_LDADD = $(LIBRARY_CL)
+
+multiEVD_SOURCES = multiEVD.cpp
+multiEVD_LDADD = $(LIBRARY_CL)
+
+randomGeneSample_SOURCES = randomGeneSample.cpp
+randomGeneSample_LDADD = $(LIBRARY_CL)
+
+getOrthologList_SOURCES = getOrthologList.cpp
+getOrthologList_LDADD = $(LIBRARY_CL)
+
+bbFilter_SOURCES = bbFilter.cpp
+bbFilter_LDADD = $(LIBRARY_CL)
+
+bbAnalyze_SOURCES = bbAnalyze.cpp
+bbAnalyze_LDADD = $(LIBRARY_CL)
+
+makeMc4Matrix_SOURCES = makeMc4Matrix.cpp
+makeMc4Matrix_LDADD = $(LIBRARY_CL)
+
+bbBreakOnGenes_SOURCES = bbBreakOnGenes.cpp
+bbBreakOnGenes_LDADD = $(LIBRARY_CL)
+
+backbone_global_to_local_SOURCES = backbone_global_to_local.cpp
+backbone_global_to_local_LDADD = $(LIBRARY_CL)
+
+coordinateTranslate_SOURCES = coordinateTranslate.cpp
+coordinateTranslate_LDADD = $(LIBRARY_CL)
+
diff --git a/src/MatchRecord.h b/src/MatchRecord.h
new file mode 100644
index 0000000..8774ead
--- /dev/null
+++ b/src/MatchRecord.h
@@ -0,0 +1,369 @@
+#ifndef __MatchRecord_h__
+#define __MatchRecord_h__
+
+#include "libMems/MuscleInterface.h"
+#include "libMems/AbstractMatch.h"
+#include "libMems/SparseAbstractMatch.h"
+#include "libMems/AbstractGappedAlignment.h"
+#include "libMems/Interval.h"
+#include "libMems/CompactGappedAlignment.h"
+#include "libMems/MatchProjectionAdapter.h"
+#include <iostream>
+#include <set>
+#include <vector>
+//#include <boost/variant.hpp>
+
+// forward declaration
+class MatchLink;
+class MatchRecord;
+class CompactMatchRecord;
+class GappedMatchRecord;
+class UngappedMatchRecord;
+class CompactUngappedMatchRecord;
+
+/** stores a link between a subset and a superset match */
+class MatchLink
+{
+public:
+	/** Creates an empty link: both match pointers are NULL. */
+	MatchLink() : superset(NULL), subset(NULL) {}
+	/**
+	 * Creates a link between two matches.
+	 * @param super     the superset match
+	 * @param sub       the subset match
+	 * @param comp_list copied into super_component_list
+	 * @param comp_map  copied into sub_to_super_map
+	 *
+	 * Both containers are taken by const reference: they are only copied from,
+	 * so callers may pass temporaries and no intermediate copy is made.
+	 */
+	MatchLink( MatchRecord* super, MatchRecord* sub, const boost::dynamic_bitset<>& comp_list, const std::vector< size_t >& comp_map ) :
+		superset( super ), subset( sub ), super_component_list( comp_list ), sub_to_super_map( comp_map ) {}
+	/** Resets the link to the empty state: NULL pointers and empty containers. */
+	void clear()
+	{
+		superset = NULL;
+		subset = NULL;
+		super_component_list.clear();
+		sub_to_super_map.clear();
+	}
+	MatchRecord* superset;	/**< The superset match connected by this link */
+	MatchRecord* subset;	/**< The subset match connected by this link */
+	boost::dynamic_bitset<> super_component_list;	/**< this gets sized to be equal to superset->Multiplicity() and tracks which components of the superset are linked */
+	std::vector< size_t > sub_to_super_map;	/**< mapping of subset components to superset components */
+};
+
+/**
+ * A sparse match extended with bookkeeping fields: links to subset and
+ * superset matches, the list of chained matches, and a few status flags.
+ * Both constructors call clear() so the pointer and flag members start in a
+ * defined state.
+ */
+class MatchRecord : public mems::SparseAbstractMatch<>
+{
+public:
+ MatchRecord() : mems::SparseAbstractMatch<>() { clear(); }
+ MatchRecord( uint seq_count ): mems::SparseAbstractMatch<>( seq_count ){ clear(); }
+ GappedMatchRecord* subsuming_match; /**< NULL after clear(); presumably the match that subsumes this one -- TODO confirm */
+ std::vector< size_t > subsumption_component_map; /**< presumably maps components of this match to those of subsuming_match -- TODO confirm */
+ std::vector< MatchLink > left_subset_links; /**< Links to nearby subset matches on the left side */
+ std::vector< MatchLink > right_subset_links; /**< Links to nearby subset matches on the right side */
+ MatchLink left_superset; /**< The left-side superset, if one exists */
+ MatchLink right_superset; /**< The right-side superset, if one exists */
+ std::vector< MatchLink > extra_left_subsets; /**< left-side subsets that were further away than the first linked subset on the left side */
+ std::vector< MatchLink > extra_right_subsets; /**< right-side subsets that were further away than the first linked subset on the right side */
+ std::vector< MatchRecord* > chained_matches; /**< matches chained to this one; consumed by GappedMatchRecord::finalize() */
+ std::vector< std::vector< size_t > > chained_component_maps; /**< maps components in this match to those in chained matches */
+ bool tandem; /**< set to true if components of the match are chainable to each other (tandem repeats)*/
+ bool extended; /**< set to false prior to extending this match */
+ bool is_novel_subset; /**< false after clear() */
+ bool dont_extend; /**< false after clear() */
+
+ /**
+ * Resets subsuming_match, both superset links and the four flags.
+ * The vector members are not emptied here.
+ */
+ void clear()
+ {
+ subsuming_match = NULL;
+ left_superset.clear();
+ right_superset.clear();
+ tandem = false;
+ extended = false;
+ dont_extend = false;
+ is_novel_subset = false;
+ }
+};
+
+/**
+ * A dense match that carries only one extra field, the pointer to its
+ * subsuming match, which starts out as NULL.
+ */
+class CompactMatchRecord : public mems::DenseAbstractMatch<1>
+{
+public:
+	CompactMatchRecord() : mems::DenseAbstractMatch<1>(), subsuming_match( NULL ) {}
+	CompactMatchRecord( uint seq_count ) : mems::DenseAbstractMatch<1>( seq_count ), subsuming_match( NULL ) {}
+
+	/** Resets subsuming_match to NULL. */
+	void clear() { subsuming_match = NULL; }
+
+	GappedMatchRecord* subsuming_match;
+};
+
+/**
+ * An ungapped alignment that also stores a match record
+ * (here the compact variant, CompactMatchRecord).
+ */
+class CompactUngappedMatchRecord : public mems::UngappedLocalAlignment< CompactMatchRecord >
+{
+public:
+
+ CompactUngappedMatchRecord(){};
+
+ /** always set seq_count, don't worry about align_length
+ * (align_length is accepted but not used by this constructor) */
+ CompactUngappedMatchRecord( uint seq_count, gnSeqI align_length ) : mems::UngappedLocalAlignment< CompactMatchRecord >( seq_count )
+ {
+ subsuming_match = NULL;
+ }
+
+ /** Returns a copy created with new. */
+ CompactUngappedMatchRecord* Clone() const { return new CompactUngappedMatchRecord( *this ); }
+ /** Returns a copy created with m_allocateAndCopy(). */
+ CompactUngappedMatchRecord* Copy() const;
+ /** Releases this object with m_free(). */
+ virtual void Free();
+};
+
+/** Returns a copy of this record obtained from m_allocateAndCopy(). */
+inline
+CompactUngappedMatchRecord* CompactUngappedMatchRecord::Copy() const
+{
+	const CompactUngappedMatchRecord& self = *this;
+	return m_allocateAndCopy( self );
+}
+/** Hands this record to m_free(). */
+inline
+void CompactUngappedMatchRecord::Free()
+{
+	CompactUngappedMatchRecord* self = this;
+	m_free( self );
+}
+
+
+/**
+ * An ungapped alignment that also stores a match record
+ * (here the full MatchRecord with its link and chain bookkeeping).
+ */
+class UngappedMatchRecord : public mems::UngappedLocalAlignment< MatchRecord >
+{
+public:
+
+ UngappedMatchRecord(){};
+
+ /** always set seq_count, don't worry about align_length
+ * (align_length is accepted but not used by this constructor) */
+ UngappedMatchRecord( uint seq_count, gnSeqI align_length ) : mems::UngappedLocalAlignment< MatchRecord >( seq_count )
+ {
+ subsuming_match = NULL;
+ }
+
+ /** Returns a copy created with new. */
+ UngappedMatchRecord* Clone() const { return new UngappedMatchRecord( *this ); }
+ /** Returns a copy created with m_allocateAndCopy(). */
+ UngappedMatchRecord* Copy() const;
+ /** Releases this object with m_free(). */
+ virtual void Free();
+
+ /** Writes the alignment length followed by the tab-separated start of each sequence. */
+ friend std::ostream& operator<<(std::ostream& os, const UngappedMatchRecord& mr); //write to source.
+};
+
+/** Returns a copy of this record obtained from m_allocateAndCopy(). */
+inline
+UngappedMatchRecord* UngappedMatchRecord::Copy() const
+{
+	const UngappedMatchRecord& self = *this;
+	return m_allocateAndCopy( self );
+}
+/** Hands this record to m_free(). */
+inline
+void UngappedMatchRecord::Free()
+{
+	UngappedMatchRecord* self = this;
+	m_free( self );
+}
+
+
+/**
+ * The gapped match record class. Abuses the Interval class to store a chain of other matches
+ */
+class GappedMatchRecord : public mems::GenericInterval< mems::AbstractGappedAlignment< MatchRecord > >
+{
+public:
+
+ /** Creates an empty record with spscore set to 0. */
+ GappedMatchRecord() :
+ mems::GenericInterval< mems::AbstractGappedAlignment< MatchRecord > >()
+ {
+ //tjt: initialize spscore to 0
+ spscore = 0;
+ }
+
+ /**
+ * Builds a record from a single ungapped match: the interval part is
+ * assigned from a one-element interval holding umr, and the MatchRecord
+ * bookkeeping fields are copied from umr.
+ */
+ GappedMatchRecord( UngappedMatchRecord& umr )
+ {
+ //tjt: initialize spscore to 0
+ spscore = 0;
+ // one-element list handed to the interval's iterator-range constructor
+ std::vector<UngappedMatchRecord*> asdf(1, &umr);
+ mems::GenericInterval< mems::AbstractGappedAlignment< MatchRecord > > iv( asdf.begin(), asdf.end() );
+ mems::GenericInterval< mems::AbstractGappedAlignment< MatchRecord > >::operator=( iv );
+ MatchRecord::operator=( umr );
+ }
+
+ /**
+ * Call to indicate that all matches have been placed in the chained_matches list and can be
+ * converted to a gapped alignment
+ */
+ void finalize(std::vector<genome::gnSequence *> seq_table);
+ //tjt: should this go somewhere else?
+ mems::score_t spscore; // set to 0 by both constructors
+// methods inherited from AbstractGappedAlignment
+public:
+ /** Returns a copy created with new. */
+ GappedMatchRecord* Clone() const { return new GappedMatchRecord( *this ); }
+ /** Returns a copy created with m_allocateAndCopy(). */
+ GappedMatchRecord* Copy() const;
+ /** Releases this object with m_free(). */
+ virtual void Free();
+
+ /** Writes the alignment length, the start of each sequence and a "lens:" line with each length. */
+ friend std::ostream& operator<<(std::ostream& os, const GappedMatchRecord& mr); //write to source.
+};
+
+/** Returns a copy of this record obtained from m_allocateAndCopy(). */
+inline
+GappedMatchRecord* GappedMatchRecord::Copy() const
+{
+	const GappedMatchRecord& self = *this;
+	return m_allocateAndCopy( self );
+}
+/** Hands this record to m_free(). */
+inline
+void GappedMatchRecord::Free()
+{
+	GappedMatchRecord* self = this;
+	m_free( self );
+}
+
+
+/** A match paired with a pointer to its component map, as sorted by finalize(). */
+typedef std::pair< MatchRecord*, std::vector< size_t >* > MatchSortEntry;
+/** orders on increasing multiplicity */
+class MatchSortEntryCompare
+{
+public:
+	/**
+	 * @return true when the match in a has a lower Multiplicity() than the match in b
+	 * The operator is const because it holds no state, which also lets the
+	 * comparator be used through const objects and references.
+	 */
+	bool operator()( const MatchSortEntry& a, const MatchSortEntry& b ) const
+	{
+		return a.first->Multiplicity() < b.first->Multiplicity();
+	}
+};
+
+/**
+ * Predicate that reports whether a pointer to T is NULL.
+ * The call operator is const because the predicate holds no state, so it can
+ * also be invoked through const objects and references.
+ */
+template< typename T >
+class IsNullPtr
+{
+public:
+	bool operator()( const T* a ) const { return a == NULL; }
+};
+
+/**
+ * Convert the matches collected in chained_matches into a single gapped alignment.
+ *
+ * Steps, in order:
+ *  1. Sort the chained matches by increasing multiplicity and wrap each one,
+ *     together with its component map, in a MatchProjectionAdapter.
+ *  2. Add them one at a time to a working chain. Any part of an earlier chain
+ *     entry that overlaps the newly added match is cropped away or dropped.
+ *  3. Sort the chain with MatchStartComparator(0) and ask MUSCLE to align each
+ *     pair of neighbouring entries, splicing every successful alignment into
+ *     the chain between them.
+ *  4. Hand the chain to SetMatches() and flatten the result into a single
+ *     CompactGappedAlignment, restoring the MatchRecord bookkeeping fields
+ *     afterwards.
+ *
+ * If no chain entries exist the record is reset to a default-constructed one.
+ * Exceptions raised while aligning are reported on std::cerr and swallowed.
+ *
+ * Declared inline because the definition lives in a header.
+ *
+ * @param seq_table sequences handed to MuscleInterface::Align
+ */
+inline
+void GappedMatchRecord::finalize( std::vector<genome::gnSequence *> seq_table)
+{
+	std::vector< mems::AbstractMatch* > iv_matches;
+	MatchSortEntryCompare msec;
+	std::vector< MatchSortEntry > mse_list( chained_matches.size() );
+
+	for( size_t cI = 0; cI < chained_matches.size(); ++cI )
+	{
+		mse_list[cI].first = chained_matches[cI];
+		mse_list[cI].second = &chained_component_maps[cI];
+	}
+	std::sort( mse_list.begin(), mse_list.end(), msec );
+	// add lowest multiplicity matches first, progressively add higher mult. matches
+	std::vector< mems::AbstractMatch* > chain;
+	for( size_t cI = 0; cI < mse_list.size(); ++cI )
+	{
+		mems::MatchProjectionAdapter mpaa( mse_list[cI].first, *(mse_list[cI].second) );
+		// clobber any region that overlaps with this mpaa
+		for( size_t seqI = 0; seqI < mpaa.SeqCount(); seqI++ )
+		{
+			// only the entries present before this pass are examined; pieces
+			// appended below are left-of-mpaa remainders and need no check
+			size_t csize = chain.size();
+			for( size_t mI = 0; mI < csize; mI++ )
+			{
+				mems::AbstractMatch* m = chain[mI];
+				if( m == NULL )
+					continue;
+				if (m->LeftEnd(seqI) == 0 && m->Length( seqI ) == 0)
+					continue;	//should we throw error here?
+				if( m->RightEnd(seqI) < mpaa.LeftEnd(seqI) )
+					continue;	// no overlap here!
+				if( m->LeftEnd(seqI) > mpaa.RightEnd(seqI) )
+					continue;	// no overlap, woohoo!
+				if( m->LeftEnd(seqI) < mpaa.LeftEnd(seqI) &&
+					m->RightEnd(seqI) >= mpaa.LeftEnd(seqI) )
+				{
+					// take the part of m to the left of mpaa and put it at the end of our chain
+					mems::AbstractMatch* m_left = m->Copy();
+					m_left->CropRight( m_left->RightEnd(seqI) - mpaa.LeftEnd(seqI) + 1, seqI );
+					m->CropLeft( m_left->Length(seqI), seqI );
+					chain.push_back(m_left);
+				}
+				// now m is guaranteed to have left-end >= mpaa
+				if( m->RightEnd(seqI) <= mpaa.RightEnd(seqI) )
+				{
+					// m is completely contained inside mpaa, so get rid of it
+					m->Free();
+					chain[mI] = NULL;
+					continue;
+				}
+
+				m->CropLeft( mpaa.RightEnd(seqI) - m->LeftEnd(seqI) + 1, seqI );
+			}
+		}
+		// get rid of any null entries in the chain
+		std::vector< mems::AbstractMatch* >::iterator end_iter = std::remove( chain.begin(), chain.end(), static_cast< mems::AbstractMatch* >( NULL ) );
+		chain.erase( end_iter, chain.end() );
+		chain.push_back( mpaa.Copy() );
+		if( chain.back()->Orientation(0) == mems::AbstractMatch::reverse )
+			chain.back()->Invert();
+	}
+
+	if( chain.size() == 0 )
+	{
+		*this = GappedMatchRecord();
+		return;
+	}
+	mems::MatchStartComparator< mems::AbstractMatch > asc(0);
+	std::sort( chain.begin(), chain.end(), asc );
+	// aed: At this point the matches in chain are in sorted order, so the region betweeen each of them is what should get fed to muscle
+	// will need to feed AbstractMatch instead of Match to MuscleInterface::Align though
+	// chainsize is the number of neighbouring pairs still to be examined; it
+	// grows by one every time an alignment is spliced into the chain
+	uint chainsize = chain.size()-1;
+
+	try{
+		for( uint i = 0; i < chainsize; i++ )
+		{
+			// attempt a muscle alignment
+			mems::GappedAlignment* cr = new mems::GappedAlignment();
+			mems::AbstractMatch* m1 = chain.at(i);
+			mems::AbstractMatch* m2 = chain.at(i+1);
+
+			boolean align_success = mems::MuscleInterface::getMuscleInterface().Align( *cr, m1 , m2, seq_table );
+			if( !align_success )
+			{
+				// nothing took ownership of the scratch alignment, so release
+				// it here instead of leaking it
+				delete cr;
+				continue;
+			}
+			//cerr << "muscle alignment success!!" << endl;
+			iv_matches.push_back( cr );
+			// aed: just insert the resulting GappedAlignment objects into chain
+			chain.insert(chain.begin()+(i+1), cr);
+			chainsize++;
+			// tjt: skip over newly inserted item
+			i++;
+		}
+
+	}catch( genome::gnException& gne ){
+		std::cerr << gne << std::endl;
+	}catch(std::exception& e){
+		std::cerr << e.what() << std::endl;
+		std::cerr << chain.size() << std::endl;
+	}catch(...){
+		std::cerr << "matrix exception?\n";
+	}
+
+	// keep a copy of the MatchRecord fields; they are restored at the end
+	MatchRecord* mr = this->Copy();
+	SetMatches( chain );
+	//tjt: now chain should be empty
+	// don't keep a potentially huge tree of GappedMatchRecords.  instead, flatten to a single cga
+	mems::CompactGappedAlignment<> tmpcga(*this);
+	chain.push_back(tmpcga.Copy());
+	SetMatches( chain );
+	//tjt: assign this to slot allocated & copied MatchRecord
+	MatchRecord::operator=(*mr);
+	mr->Free();
+}
+
+/**
+ * Writes the alignment length followed by the start coordinate of every
+ * sequence, each preceded by a tab.
+ * Declared inline because the definition lives in a header.
+ */
+inline
+std::ostream& operator<<(std::ostream& os, const UngappedMatchRecord& ula){ //write to stream.
+	os << ula.AlignmentLength();
+	for(uint i=0; i < ula.SeqCount(); i++)
+		os << '\t' << ula.Start(i);
+	return os;
+}
+
+/**
+ * Writes the alignment length and the tab-separated start coordinate of every
+ * sequence, then a second line beginning with "lens:" that lists the length
+ * of every sequence, each preceded by a tab.
+ * Declared inline because the definition lives in a header.
+ */
+inline
+std::ostream& operator<<(std::ostream& os, const GappedMatchRecord& ula){ //write to stream.
+	os << ula.AlignmentLength();
+	for(uint i=0; i < ula.SeqCount(); i++)
+		os << '\t' << ula.Start(i);
+	os << "\nlens:";
+	for(uint i=0; i < ula.SeqCount(); i++)
+		os << '\t' << ula.Length(i);
+
+	return os;
+}
+
+#endif // __MatchRecord_h__
diff --git a/src/RepeatHashCat.cpp b/src/RepeatHashCat.cpp
new file mode 100644
index 0000000..aef9197
--- /dev/null
+++ b/src/RepeatHashCat.cpp
@@ -0,0 +1,12 @@
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/RepeatHashCat.h"
+
+/**
+ * Returns the value of the concat_contig_start member.
+ * NOTE(review): the value passes through a signed int32 local before being
+ * returned as uint32 -- confirm the member's declared type.
+ * NOTE(review): STACK_TRACE_START / STACK_TRACE_END are macros defined
+ * elsewhere; their expansion is not visible here.
+ */
+uint32 gnSequence::concatContigStart( void ) const{
+ STACK_TRACE_START
+ int32 ccs = this->concat_contig_start;
+ return ccs;
+ STACK_TRACE_END
+}
diff --git a/src/RepeatHashCat.h b/src/RepeatHashCat.h
new file mode 100644
index 0000000..7ab00e4
--- /dev/null
+++ b/src/RepeatHashCat.h
@@ -0,0 +1,21 @@
+#ifndef _RepeatHashThread_h_
+#define _RepeatHashThread_h_
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/RepeatHash.h"
+
+class TheRealMemHash : public RepeatHash
+{
+public:
+ RepeatHashCat();
+ ~RepeatHashCat();
+ RepeatHashThread(const RepeatHashThread& mh);
+ virtual RepeatHashThread* Clone() const;
+protected:
+
+ //punt tjt: needed to add this to track where concatenated sequence starts
+ vector<uint32> concat_contig_start; // number of contigs in each sequence
+}
\ No newline at end of file
diff --git a/src/SeedMatchEnumerator.h b/src/SeedMatchEnumerator.h
new file mode 100644
index 0000000..08544c4
--- /dev/null
+++ b/src/SeedMatchEnumerator.h
@@ -0,0 +1,144 @@
+#ifndef __SeedMatchEnumerator_h__
+#define __SeedMatchEnumerator_h__
+
+#include "libMems/MatchFinder.h"
+#include "libMems/RepeatHash.h"
+#include "libMems/MemHash.h"
+#include "libMems/MatchList.h"
+#include "libMems/SortedMerList.h"
+#include "libMems/Match.h"
+
+/**
+ * Turns every seed match into a full match without extension.
+ */
+class SeedMatchEnumerator : public mems::MatchFinder
+{
+public:
+	/**
+	 * Initialises the multiplicity bounds and the orientation filter to the
+	 * default arguments of FindMatches(), so that HashMatch() does not read
+	 * indeterminate members when CreateMatches() is called before FindMatches().
+	 */
+	SeedMatchEnumerator() :
+		max_multiplicity( 1000 ),
+		min_multiplicity( 2 ),
+		only_direct( false )
+	{}
+
+	virtual SeedMatchEnumerator* Clone() const;
+
+	/**
+	 * Adds every sequence of match_list to this finder, runs CreateMatches()
+	 * and then replaces the contents of match_list with the matches
+	 * accumulated in mlist.  If a sequence cannot be added, an error is
+	 * reported through genome::ErrorMsg() and the function returns early.
+	 * @param match_list	supplies sml_table/seq_table and receives the matches
+	 * @param min_multi	matches with fewer components than this are discarded
+	 * @param max_multi	matches with more components than this are discarded
+	 * @param direct_repeats_only	when true, components whose orientation is not forward are dropped
+	 */
+	void FindMatches( mems::MatchList& match_list, size_t min_multi = 2, size_t max_multi = 1000, bool direct_repeats_only = false )
+	{
+		this->max_multiplicity = max_multi;
+		this->min_multiplicity = min_multi;
+		this->only_direct = direct_repeats_only;
+		for( size_t seqI = 0; seqI < match_list.seq_table.size(); ++seqI ){
+			if( !AddSequence( match_list.sml_table[ seqI ], match_list.seq_table[ seqI ] ) ){
+				genome::ErrorMsg( "Error adding " + match_list.seq_filename[seqI] + "\n");
+				return;
+			}
+		}
+		CreateMatches();
+		match_list.clear();
+		match_list.insert( match_list.end(), mlist.begin(), mlist.end() );
+	}
+
+	virtual boolean CreateMatches();
+protected:
+
+	virtual boolean EnumerateMatches( mems::IdmerList& match_list );
+	virtual boolean HashMatch(mems::IdmerList& match_list);
+	virtual mems::SortedMerList* GetSar(uint32 sarI) const;
+	// matches collected by HashMatch(); copied into the caller's list by FindMatches()
+	mems::MatchList mlist;
+	void SetDirection(mems::Match& mhe);
+private:
+	//used to store rmin, rmax values
+	size_t max_multiplicity;
+	size_t min_multiplicity;
+	// when true HashMatch() keeps only forward-oriented components
+	bool only_direct;
+};
+
+/**
+ * Returns a copy of this enumerator allocated with new.
+ * Marked inline because it is defined in a header, like GetSar().
+ */
+inline
+SeedMatchEnumerator* SeedMatchEnumerator::Clone() const{
+	return new SeedMatchEnumerator(*this);
+}
+
+/**
+ * Returns the first entry of sar_table; the requested index is ignored.
+ */
+inline
+mems::SortedMerList* SeedMatchEnumerator::GetSar(uint32 sarI) const{
+	mems::SortedMerList* const first_sar = sar_table[0];
+	return first_sar;
+}
+
+/**
+ * Runs MatchFinder::FindMatchSeeds() when exactly one sequence has been added.
+ * Marked inline because it is defined in a header, like GetSar().
+ * @return true if the seed search was run, false when seq_count is not 1
+ */
+inline
+boolean SeedMatchEnumerator::CreateMatches(){
+	if(seq_count != 1)
+		return false;
+	MatchFinder::FindMatchSeeds();
+	return true;
+}
+
+/**
+ * Forwards the whole list of seed hits to HashMatch().
+ * Marked inline because it is defined in a header, like GetSar().
+ */
+inline
+boolean SeedMatchEnumerator::EnumerateMatches( mems::IdmerList& match_list ){
+	return HashMatch(match_list);
+}
+
+/**
+ * Converts one group of seed hits into a single Match of seed length and
+ * appends a copy of it to mlist when it passes the configured filters.
+ *
+ * - a match with fewer than two components is reported on std::cerr and dropped;
+ * - a match outside [min_multiplicity, max_multiplicity] is silently dropped;
+ * - with only_direct set and at least one non-forward component present, only
+ *   the forward components are kept, provided more than one remains;
+ * - otherwise the match is stored unchanged.
+ *
+ * Marked inline because it is defined in a header, like GetSar().
+ * @param match_list	seed hits; sorted in place by position
+ * @return always true
+ */
+inline
+boolean SeedMatchEnumerator::HashMatch(mems::IdmerList& match_list){
+	match_list.sort(&mems::idmer_position_lessthan);
+	// initialize the hash entry: one component per seed hit
+	mems::Match mhe = mems::Match( match_list.size() );
+	mhe.SetLength( GetSar(0)->SeedLength() );
+
+	//Fill in the new Match (start = hit position + 1) and set direction parity if needed.
+	uint32 repeatI = 0;
+	for(mems::IdmerList::iterator iter = match_list.begin(); iter != match_list.end(); ++iter)
+		mhe.SetStart(repeatI++, iter->position + 1);
+
+	SetDirection( mhe );
+
+	// collect the forward components when only direct repeats are wanted
+	bool found_reverse = false;
+	std::vector< size_t > component_map;
+	if(this->only_direct)
+	{
+		for( uint seqI = 0; seqI < mhe.Multiplicity(); seqI++)
+		{
+			if (mhe.Orientation(seqI) == 0)
+				component_map.push_back(seqI);
+			else
+				found_reverse = true;
+		}
+	}
+
+	if(mhe.Multiplicity() < 2){
+		std::cerr << "red flag " << mhe << "\n";
+	}
+	//use rmin & rmax to discard irrelevant seed matches
+	else if(mhe.Multiplicity() > this->max_multiplicity || mhe.Multiplicity() < this->min_multiplicity )
+	{
+		;
+	}
+	else if(this->only_direct && found_reverse)
+	{
+		// The projection (and the Copy() it is built from) is only needed on
+		// this path, so it is created here rather than for every seed group.
+		// NOTE(review): whether MatchProjectionAdapter takes ownership of the
+		// pointer returned by Copy() is not visible in this file -- confirm.
+		mems::MatchProjectionAdapter mpaa(mhe.Copy(), component_map);
+		if ( mpaa.Multiplicity() > 1)
+		{
+			mems::Match new_mhe = mems::Match( mpaa.Multiplicity() );
+			new_mhe.SetLength( GetSar(0)->SeedLength() );
+			for(uint mult = 0; mult < mpaa.Multiplicity(); mult++)
+				new_mhe.SetStart(mult, mpaa.Start(mult));
+			mlist.push_back(new_mhe.Copy());
+		}
+	}
+	else{
+		mlist.push_back(mhe.Copy());
+	}
+	return true;
+}
+
+// evil, evil code duplication.
+
+/**
+ * Sets the orientation of each component relative to the first defined one.
+ * The low bit of the mer at a component's start is read through GetSar();
+ * the first component that is not NO_MATCH defines the reference direction.
+ * Every later defined component for which ref_forward equals that low bit
+ * has its start negated.
+ * Marked inline because it is defined in a header, like GetSar().
+ * @param mhe	match whose component starts are updated in place
+ */
+inline
+void SeedMatchEnumerator::SetDirection(mems::Match& mhe){
+	//get the reference direction
+	boolean ref_forward = false;
+	uint32 seqI=0;
+	for(; seqI < mhe.SeqCount(); ++seqI)
+		if(mhe[seqI] != mems::NO_MATCH){
+			ref_forward = !(GetSar(seqI)->GetMer(mhe[seqI] - 1) & 0x1);
+			break;
+		}
+	//set directional parity for the rest
+	for(++seqI; seqI < mhe.SeqCount(); ++seqI)
+		if(mhe[seqI] != mems::NO_MATCH)
+			if(ref_forward == (GetSar(seqI)->GetMer(mhe[seqI] - 1) & 0x1))
+				mhe.SetStart(seqI, -mhe[seqI]);
+}
+
+
+#endif // __SeedMatchEnumerator_h__
diff --git a/src/UniqueMatchFinder.cpp b/src/UniqueMatchFinder.cpp
new file mode 100644
index 0000000..b47554b
--- /dev/null
+++ b/src/UniqueMatchFinder.cpp
@@ -0,0 +1,60 @@
+/*******************************************************************************
+ * $Id: UniqueMatchFinder.cpp,v 1.13 2004/03/01 02:40:08 darling Exp $
+ * This file is copyright 2002-2004 Aaron Darling. All rights reserved.
+ * Please see the file called COPYING for licensing, copying, and modification
+ * rights. Redistribution of this file, in whole or in part is prohibited
+ * without express permission.
+ ******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "UniqueMatchFinder.h"
+#include <list>
+
+using namespace std;
+using namespace genome;
+using namespace mems;
+
+/** Default constructor; adds no state beyond the MemHash base. */
+UniqueMatchFinder::UniqueMatchFinder()
+{}
+
+/** Destructor; nothing to release at this level. */
+UniqueMatchFinder::~UniqueMatchFinder()
+{}
+
+/** Copy constructor; copies the MemHash base of the source object. */
+UniqueMatchFinder::UniqueMatchFinder(const UniqueMatchFinder& other)
+	: MemHash(other)
+{}
+
+/** Returns a copy of this finder allocated with new. */
+UniqueMatchFinder* UniqueMatchFinder::Clone() const{
+	UniqueMatchFinder* const duplicate = new UniqueMatchFinder(*this);
+	return duplicate;
+}
+
+
+/**
+ * Keeps only the seeds whose id occurs exactly once in match_list and, when
+ * at least two such seeds exist, hashes them together as one match.
+ * @param match_list	seed hits; sorted in place by id
+ * @return the result of HashMatch(), or true when fewer than two unique seeds were found
+ */
+boolean UniqueMatchFinder::EnumerateMatches( IdmerList& match_list ){
+
+	match_list.sort(&idmer_id_lessthan);
+
+	// after sorting, equal ids are adjacent: walk the list one run at a time
+	IdmerList singletons;
+	IdmerList::iterator run_begin = match_list.begin();
+	while( run_begin != match_list.end() ){
+		IdmerList::iterator run_end = run_begin;
+		uint run_length = 0;
+		while( run_end != match_list.end() && run_end->id == run_begin->id ){
+			++run_end;
+			++run_length;
+		}
+		// a run of length one is a seed that is unique by id
+		if( run_length == 1 )
+			singletons.push_back( *run_begin );
+		run_begin = run_end;
+	}
+
+	// hash all unique seeds
+	if( singletons.size() < 2 )
+		return true;
+	return HashMatch(singletons);
+}
diff --git a/src/UniqueMatchFinder.h b/src/UniqueMatchFinder.h
new file mode 100644
index 0000000..f9d08c0
--- /dev/null
+++ b/src/UniqueMatchFinder.h
@@ -0,0 +1,34 @@
+/*******************************************************************************
+ * $Id: UniqueMatchFinder.h,v 1.8 2004/03/01 02:40:08 darling Exp $
+ * This file is copyright 2002-2004 Aaron Darling. All rights reserved.
+ * Please see the file called COPYING for licensing, copying, and modification
+ * rights. Redistribution of this file, in whole or in part is prohibited
+ * without express permission.
+ ******************************************************************************/
+
+#ifndef _UniqueMatchFinder_h_
+#define _UniqueMatchFinder_h_
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/MemHash.h"
+
+/**
+ * Finds all pairwise matches with unique seeds among a group of sequences
+ */
+class UniqueMatchFinder : public mems::MemHash
+{
+public:
+	/** Default constructor. */
+	UniqueMatchFinder();
+	/** Copy constructor. */
+	UniqueMatchFinder(const UniqueMatchFinder& mh);
+	/** Destructor. */
+	~UniqueMatchFinder();
+
+	/** Returns a newly allocated copy of this object. */
+	virtual UniqueMatchFinder* Clone() const;
+
+protected:
+	/** Filters match_list down to seeds with a unique id before hashing them. */
+	virtual boolean EnumerateMatches( mems::IdmerList& match_list );
+};
+
+#endif //_UniqueMatchFinder_h_
diff --git a/src/addUnalignedIntervals.cpp b/src/addUnalignedIntervals.cpp
new file mode 100644
index 0000000..3c2619a
--- /dev/null
+++ b/src/addUnalignedIntervals.cpp
@@ -0,0 +1,33 @@
+#include "libMems/Interval.h"
+#include "libMems/Islands.h"
+
+using namespace std;
+using namespace genome;
+using namespace mems;
+
+/**
+ * Reads an interval (alignment) file, loads its sequences, adds the unaligned
+ * intervals and writes the result to a second file.
+ * @return 0 on success, -1 on a usage error or unreadable input, -2 if the
+ *         output file cannot be opened
+ */
+int main( int argc, char* argv[] )
+{
+	if( argc != 3 )
+	{
+		cerr << "Usage: <input interval file> <output interval file>\n";
+		return -1;
+	}
+	ifstream in_file( argv[1] );
+	if( !in_file.is_open() )
+	{
+		// report the actual file name rather than the literal text "argv[1]"
+		cerr << "Error opening \"" << argv[1] << "\"\n";
+		return -1;
+	}
+	IntervalList iv_list;
+	iv_list.ReadStandardAlignment( in_file );
+	LoadSequences(iv_list, NULL);
+	addUnalignedIntervals( iv_list );
+	ofstream out_file( argv[2] );
+	if( !out_file.is_open() )
+	{
+		cerr << "Error opening \"" << argv[2] << "\"\n";
+		return -2;
+	}
+	iv_list.WriteStandardAlignment( out_file );
+	return 0;
+}
diff --git a/src/alignmentProjector.cpp b/src/alignmentProjector.cpp
new file mode 100644
index 0000000..2e3df11
--- /dev/null
+++ b/src/alignmentProjector.cpp
@@ -0,0 +1,101 @@
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <string>
+#include <fstream>
+#include <vector>
+#include <algorithm>
+#include <iostream>
+#include "libGenome/gnFilter.h"
+#include "libMems/IntervalList.h"
+#include "libMems/MatchList.h"
+#include "libMems/GappedAlignment.h"
+#include "libMems/Matrix.h"
+#include "libMems/MatchProjectionAdapter.h"
+#include "libMems/Aligner.h"
+#include "libGenome/gnFASSource.h"
+#include "libMems/ProgressiveAligner.h"
+using namespace std;
+using namespace genome;
+using namespace mems;
+
+
+
+
+/**
+ * Projects an XMFA alignment onto a subset of its sequences.
+ *
+ * Arguments: <input xmfa> <output xmfa> <mfa seq input> <mfa seq output>
+ * followed by one or more 0-based sequence indices to keep.  The projected
+ * alignment is written to the output xmfa and the selected contigs of the
+ * MFA input are written, concatenated, to the MFA output.
+ *
+ * @return 0 on success; -1 on usage, I/O or index errors and on gnException;
+ *         -2, -3 and -4 for std::exception, C-string and unknown exceptions
+ */
+int main( int argc, char* argv[] )
+{
+	if( argc < 6 )
+	{
+		cerr << "Usage: alignmentProjector <input xmfa> <output xmfa> <mfa seq input> <mfa seq output> <list of seqs to include, starting at 0>\n";
+		return -1;
+	}
+	ifstream aln_in;
+	aln_in.open( argv[1] );
+	if( !aln_in.is_open() ){
+		cerr << "Error opening " << argv[1] << endl;
+		return -1;
+	}
+	ofstream aln_out;
+	aln_out.open( argv[2] );
+	if( !aln_out.is_open() ){
+		cerr << "Error writing to " << argv[2] << endl;
+		return -1;
+	}
+	string mfa_seqs = argv[3];
+	string mfa_output = argv[4];
+
+	try{
+		IntervalList input_ivs;
+		input_ivs.ReadStandardAlignment( aln_in );
+		aln_in.close();
+
+		MatchList ml;
+		ml.seq_filename = input_ivs.seq_filename;
+		LoadMFASequences( ml, mfa_seqs, NULL );
+		input_ivs.seq_table = ml.seq_table;
+
+		// create a projection list
+		vector< uint > projection;
+		IntervalList proj_ivs;
+		for( int i = 5; i < argc; ++i )
+		{
+			// reject indices that would read past the end of seq_table
+			const int requested = atoi( argv[i] );
+			if( requested < 0 || static_cast< size_t >( requested ) >= input_ivs.seq_table.size() )
+			{
+				cerr << "Sequence index " << argv[i] << " is out of range: the input has "
+					<< input_ivs.seq_table.size() << " sequences\n";
+				return -1;
+			}
+			projection.push_back( requested );
+			proj_ivs.seq_filename.push_back( mfa_seqs );
+			proj_ivs.seq_table.push_back( input_ivs.seq_table[projection.back()] );
+		}
+
+		vector< vector< MatchProjectionAdapter* > > LCB_list;
+		vector< LCB > projected_adjs;
+		projectIntervalList( input_ivs, projection, LCB_list, projected_adjs );
+
+		cout << "projection has " << LCB_list.size() << " LCBs\n";
+		proj_ivs.resize( LCB_list.size() );
+		for( size_t lcbI = 0; lcbI < LCB_list.size(); ++lcbI )
+			proj_ivs[lcbI].SetMatches( LCB_list[lcbI] );
+
+		proj_ivs.WriteStandardAlignment( aln_out );
+
+		gnSequence seq;
+		seq.LoadSource( mfa_seqs );
+		ofstream seq_out( mfa_output.c_str() );
+		if( !seq_out.is_open() ){
+			cerr << "Error writing to " << mfa_output << endl;
+			return -1;
+		}
+		gnSequence proj_seq;
+		for( size_t projI = 0; projI < projection.size(); ++projI )
+			proj_seq += seq.contig(projection[projI]);
+		gnFASSource::Write(proj_seq,seq_out,false,false);
+
+	}catch( gnException& gne ){
+		cerr << gne << endl;
+		return -1;
+	}catch( exception& e ){
+		cerr << e.what() << endl;
+		return -2;
+	}catch( char const* c ){
+		cerr << c << endl;
+		return -3;
+	}catch(...){
+		cerr << "Unhandled exception" << endl;
+		return -4;
+	}
+	return 0;
+}
+
diff --git a/src/backbone_global_to_local.cpp b/src/backbone_global_to_local.cpp
new file mode 100644
index 0000000..d099ae0
--- /dev/null
+++ b/src/backbone_global_to_local.cpp
@@ -0,0 +1,60 @@
+#include "libMems/IntervalList.h"
+#include "libMems/Backbone.h"
+
+using namespace mems;
+using namespace genome;
+using namespace std;
+
+/**
+ * Rewrites a backbone file so that every coordinate is printed as
+ * "<contig>:<start>" TAB "<contig>:<end>", using globalToLocal() of the
+ * sequences referenced by the XMFA alignment.
+ *
+ * Arguments: <xmfa file> <backbone file> <output file>
+ * @return 0 on success or after printing the usage text, -1 if any of the
+ *         three files cannot be opened
+ */
+int main (int ARGC,char ** ARGV) {
+
+	if(ARGC != 4){
+		cout <<"Usage:\nbackbone_global_to_local <xmfa file> <backbone file> <output file>\n";
+		return 0;
+	}
+	ifstream align_file;
+	align_file.open(ARGV[1]);
+	if(!align_file.is_open()){
+		cerr <<"Couldn't read xmfa file: "<<ARGV[1]<<"\n";
+		return -1;
+	}
+	IntervalList input_alignment;
+	input_alignment.ReadStandardAlignment(align_file);
+	LoadSequences(input_alignment,&cout);
+	ifstream backbone_file;
+	backbone_file.open(ARGV[2]);
+	if(!backbone_file.is_open()){
+		cerr <<"Couldn't read backbone file: "<<ARGV[2]<<"\n";
+		return -1;
+	}
+
+	ofstream new_backbone(ARGV[3]);
+	// check the output stream itself (the original re-tested align_file here)
+	if(!new_backbone.is_open()){
+		cerr <<"Couldn't write to output file: "<<ARGV[3]<<"\n";
+		return -1;
+	}
+
+	vector< bb_seqentry_t > backbone_struct;
+	readBackboneSeqFile(backbone_file, backbone_struct);
+
+	for(size_t i=0; i < backbone_struct.size(); i++){
+		for(size_t j=0; j < backbone_struct[i].size(); j++){
+			uint64 start = absolut(backbone_struct[i][j].first);
+			uint64 end = absolut(backbone_struct[i][j].second);
+			// a start of 0 is printed with contig 0 and is not translated
+			uint32 contig_num1 = 0;
+			uint32 contig_num2 = 0;
+			if(start != 0){
+				// NOTE(review): presumably rewrites both arguments in place
+				// (contig number and contig-local coordinate) -- confirm
+				input_alignment.seq_table[j]->globalToLocal(contig_num1,start);
+				input_alignment.seq_table[j]->globalToLocal(contig_num2,end);
+			}
+
+			// segments whose ends land in different contigs are written as-is
+			if(j>0){
+				new_backbone<<"\t";
+			}
+			new_backbone<<contig_num1 <<":"<<start<<"\t"<<contig_num2 <<":"<<end;
+		}
+		new_backbone <<"\n";
+	}
+	return 0;
+}
diff --git a/src/bbAnalyze.cpp b/src/bbAnalyze.cpp
new file mode 100644
index 0000000..c2a0a8b
--- /dev/null
+++ b/src/bbAnalyze.cpp
@@ -0,0 +1,1411 @@
+#include <fstream>
+#include <sstream>
+#include <iomanip>
+#include "libMems/Backbone.h"
+#include "libGenome/gnFeature.h"
+#include "libGenome/gnBaseQualifier.h"
+#include "libMems/IntervalList.h"
+#include "libMems/AbstractMatch.h"
+#include "libMems/MatchList.h"
+#include "libMems/PhyloTree.h"
+#include "libMems/ProgressiveAligner.h"
+#include <boost/algorithm/string/erase.hpp>
+#include <boost/tuple/tuple.hpp>
+
+using namespace std;
+using namespace genome;
+using namespace mems;
+
+// important constants that affect inference
+const uint SHORT_SEGMENT = 5; // when considering overlaps to genes, ignore overlaps less than this amount
+const uint DISCARD_SEGMENT = 20; // do not consider segments shorter than this amount
+const double ALTERNALOG_MIN_SIZE = 15.0; // default minimum backbone / variable segment size; also the slack added to feature boundaries in featureNearestNeighbors
+
+
+/**
+ * Strict-weak-ordering functor for pointers to backbone entries: orders them
+ * by the absolute value of the left coordinate in one chosen sequence.
+ */
+class BbSeqComp
+{
+public:
+	/** @param seq index of the sequence whose coordinates are compared */
+	BbSeqComp( uint seq ) : m_seq( seq ) {}
+	// const so the comparator also works through const references, like LocComp
+	bool operator()( const bb_seqentry_t* a, const bb_seqentry_t* b ) const
+	{
+		return genome::absolut( (*a)[m_seq].first ) < genome::absolut( (*b)[m_seq].first );
+	}
+private:
+	uint m_seq;
+};
+
+/**
+ * Builds an index map between two orderings of the same set of values.
+ * Both inputs are paired with their positions and sorted by value; the i-th
+ * smallest element of mv_from is then mapped to the position of the i-th
+ * smallest element of mv_to.
+ * @param mv_from	source ordering
+ * @param mv_to	target ordering; indexed up to mv_from.size() entries
+ * @param map	resized to mv_from.size(); map[i] receives the mv_to position paired with mv_from[i]
+ */
+template< typename PtrVector >
+void createMap( const PtrVector& mv_from, const PtrVector& mv_to, vector< size_t >& map )
+{
+	typedef typename PtrVector::value_type PtrType;
+	typedef pair< PtrType, size_t > IndexedPtr;
+
+	// tag every element with its original position
+	vector< IndexedPtr > from_sorted( mv_from.size() );
+	for( size_t fromI = 0; fromI != mv_from.size(); ++fromI )
+		from_sorted[fromI] = IndexedPtr( mv_from[fromI], fromI );
+
+	vector< IndexedPtr > to_sorted( mv_to.size() );
+	for( size_t toI = 0; toI != mv_to.size(); ++toI )
+		to_sorted[toI] = IndexedPtr( mv_to[toI], toI );
+
+	// sorting by (value, position) lines the two sequences up rank by rank
+	std::sort( from_sorted.begin(), from_sorted.end() );
+	std::sort( to_sorted.begin(), to_sorted.end() );
+
+	map.resize( from_sorted.size() );
+	for( size_t rank = 0; rank != from_sorted.size(); ++rank )
+	{
+		const size_t from_pos = from_sorted[rank].second;
+		map[from_pos] = to_sorted[rank].second;
+	}
+}
+
+/**
+ * Counts the features of anno_seq whose name is "CDS".
+ * Each feature returned by getFeature() is deleted after inspection.
+ * @param anno_seq	annotated sequence to scan
+ * @return number of CDS features
+ */
+size_t getCDScount( gnSequence* anno_seq )
+{
+	size_t cds_total = 0;
+	size_t featureI = 0;
+	while( featureI < anno_seq->getFeatureListLength() )
+	{
+		gnBaseFeature* cur_feat = anno_seq->getFeature( featureI );
+		const string cur_name = cur_feat->GetName();
+		if( cur_name == "CDS" )
+			++cds_total;
+		delete cur_feat;
+		++featureI;
+	}
+	return cds_total;
+}
+
+/**
+ * For every annotated feature, lists the backbone segments that overlap it in
+ * sequence seqI by more than SHORT_SEGMENT.
+ *
+ * Only features named "CDS", "tRNA", "rRNA" or "misc_rna" with a valid first
+ * location are examined; all other entries of intersecting stay empty.
+ * NOTE(review): "misc_rna" differs in case from the "misc_RNA" used by
+ * classifyIntergenic -- confirm which spelling the annotation uses.
+ *
+ * @param bb_list	backbone segments; not modified, although taken by non-const reference
+ * @param seqI	index of the annotated sequence within each backbone entry
+ * @param intersecting	resized to the feature count; entry f receives indices into bb_list
+ * @param anno_seq	annotated sequence providing the features
+ */
+void featureIntersect( vector< bb_seqentry_t >& bb_list, uint seqI, vector< vector< size_t > >& intersecting, gnSequence* anno_seq )
+{
+	// stores the bb segs that overlap each feature
+	intersecting.resize( anno_seq->getFeatureListLength() );
+
+	// nothing can intersect an empty backbone; also avoids front() on an empty vector
+	if( bb_list.empty() )
+		return;
+
+	uint seq_count = bb_list.front().size();
+
+	// sort pointers to the segments by their left end in seqI, remembering
+	// how sorted positions map back to positions in bb_list
+	vector< bb_seqentry_t* > bb_ptrs( bb_list.size() );
+	for( size_t i = 0; i < bb_list.size(); ++i )
+		bb_ptrs[i] = &bb_list[i];
+	vector< bb_seqentry_t* > orig_ptrs( bb_ptrs );
+	BbSeqComp bsc( seqI );
+	std::sort( bb_ptrs.begin(), bb_ptrs.end(), bsc );
+	vector< size_t > ptr_map;
+	createMap( bb_ptrs, orig_ptrs, ptr_map );
+
+	for( size_t featureI = 0; featureI < anno_seq->getFeatureListLength(); ++featureI )
+	{
+		gnBaseFeature* feat = anno_seq->getFeature( featureI );
+		string feat_name = feat->GetName();
+		if( feat_name != "CDS" &&
+			feat_name != "tRNA" &&
+			feat_name != "rRNA" &&
+			feat_name != "misc_rna" )
+		{
+			delete feat;	// the original leaked the feature on this path
+			continue;	// don't deal with other feature types (source, misc_RNA, etc)
+		}
+		gnLocation loc = feat->GetLocation(0);
+		if( loc.GetFirst() > loc.GetLast() || loc.GetFirst() == 0 || loc.GetLast() == 0 )
+		{
+			delete feat;	// the original leaked the feature on this path
+			continue;	// a problem parsing annotation?
+		}
+		// find where feature lands in our list
+		bb_seqentry_t tmp_bb( seq_count );
+		tmp_bb[seqI].first = loc.GetFirst();
+		tmp_bb[seqI].second = loc.GetFirst();
+		vector< bb_seqentry_t* >::iterator liter = std::lower_bound( bb_ptrs.begin(), bb_ptrs.end(), &tmp_bb, bsc );
+		tmp_bb[seqI].first = loc.GetLast();
+		tmp_bb[seqI].second = loc.GetLast();
+		vector< bb_seqentry_t* >::iterator uiter = std::lower_bound( bb_ptrs.begin(), bb_ptrs.end(), &tmp_bb, bsc );
+		// walk liter backwards over segments that start before the feature
+		// but whose right end still reaches into it
+		if( liter == bb_ptrs.end() &&
+			bb_ptrs.size() > 0 &&
+			genome::absolut( (*bb_ptrs.back())[seqI].second ) >= loc.GetFirst() )
+			liter--;
+		while( liter != bb_ptrs.end() &&
+			liter != bb_ptrs.begin() &&
+			genome::absolut( (**liter)[seqI].second ) >= loc.GetFirst() )
+			--liter;
+		// step forward again if the walk stopped on a segment that ends before the feature
+		if( liter != bb_ptrs.end() &&
+			genome::absolut( (**liter)[seqI].second ) < loc.GetFirst() )
+			++liter;
+		for( ; liter != uiter; ++liter )
+		{
+			// a first coordinate of 0 means the segment is absent from this sequence
+			if( (**liter)[seqI].first == 0 )
+				continue;
+			// only add the bbseg if the intersection is larger than SHORT_SEGMENT
+			gnLocation bb_loc;
+			if( (**liter)[seqI].first > 0 )
+				bb_loc = gnLocation((**liter)[seqI].first, (**liter)[seqI].second);
+			else
+				bb_loc = gnLocation(-(**liter)[seqI].first, -(**liter)[seqI].second);
+
+			gnLocation intersect = loc.GetIntersection( bb_loc, gnLocation::determinedRegions );
+			if( intersect.GetLast() - intersect.GetFirst() <= SHORT_SEGMENT )
+				continue;
+
+			intersecting[ featureI ].push_back( ptr_map[ liter - bb_ptrs.begin() ] );
+		}
+		delete feat;
+	}
+}
+
+/**
+ * Marks every feature that is overlapped by at least one selected segment.
+ * @param intersecting	per-feature lists of segment indices
+ * @param segs	bit i set means segment i is selected
+ * @param features_hit	resized to the feature count and cleared; bit f is set
+ *                    	when any segment listed for feature f is selected
+ */
+void getFeatureHits( const vector< vector< size_t > >& intersecting, const bitset_t& segs, bitset_t& features_hit )
+{
+	const size_t feature_total = intersecting.size();
+	features_hit.resize( feature_total );
+	features_hit.reset();
+	for( size_t featI = 0; featI != feature_total; ++featI )
+	{
+		const vector< size_t >& seg_ids = intersecting[featI];
+		for( vector< size_t >::const_iterator seg = seg_ids.begin(); seg != seg_ids.end(); ++seg )
+		{
+			if( segs.test( *seg ) )
+				features_hit.set( featI );
+		}
+	}
+}
+
+typedef map< string, map< string, double > > multifun_map_t;
+typedef map< string, map< string, string > > multifun_names_t;
+
+/**
+ * Tallies the "function" qualifiers of the selected CDS features by their
+ * first two levels and records a display name for each category.
+ *
+ * A qualifier value is expected to look like `<L1>.<L2>... <name1>;<name2>;`
+ * where L1 is a single character; values whose first dot-delimited field is
+ * longer than one character are not counted.  A CDS without any such
+ * qualifier is counted under the category "?"/"?".
+ *
+ * @param anno_seq	annotated sequence providing the features
+ * @param mf_count	incremented per (level 1, level 2) category
+ * @param mf_names	receives "<name1>;<name2>" with CR, LF and double spaces removed
+ * @param feature_mask	only features whose bit is set are examined
+ */
+void makeMultiFunCount( gnSequence* anno_seq, multifun_map_t& mf_count, multifun_names_t& mf_names, bitset_t& feature_mask )
+{
+	for( size_t featureI = 0; featureI < anno_seq->getFeatureListLength(); ++featureI )
+	{
+		if( !feature_mask.test( featureI ) )
+			continue;	// skip this feature if we're not supposed to include it
+		gnBaseFeature* feat = anno_seq->getFeature( featureI );
+		string feat_name = feat->GetName();
+		if( feat_name != "CDS" )
+		{
+			delete feat;
+			continue;
+		}
+		bool found_multifun = false;
+		for( size_t qualI = 0; qualI < feat->GetQualifierListLength(); ++qualI )
+		{
+			gnBaseQualifier* gnq = feat->GetQualifier(qualI);
+			string qual_name = gnq->GetName();
+			if( qual_name != "function" )
+			{
+				delete gnq;
+				continue;
+			}
+			string qual_value = gnq->GetValue();
+			// strip a leading quote; guard against an empty value
+			if( !qual_value.empty() && qual_value[0] == '"' )
+				qual_value = qual_value.substr(1);
+			stringstream qv_str( qual_value );
+			string mf_level1;
+			getline( qv_str, mf_level1, '.' );
+			if( mf_level1.size() > 1 )
+			{
+				// not a multifun tag
+				delete gnq;
+				continue;
+			}
+			// level 2 is the single character after the first '.'
+			string mf_level2;
+			mf_level2 += qv_str.get();
+			mf_count[mf_level1][mf_level2]++;
+
+			// get the name: skip to the first space, then take two ';'-terminated fields
+			string l1_name;
+			getline( qv_str, l1_name, ' ' );
+			getline( qv_str, l1_name, ';' );
+			string l2_name;
+			getline( qv_str, l2_name, ';' );
+			string cur_name = l1_name + ';' + l2_name;
+			// std::remove only shifts the kept characters forward; erase the
+			// leftover tail so CR/LF really disappear from the name
+			cur_name.erase( std::remove( cur_name.begin(), cur_name.end(), '\r' ), cur_name.end() );
+			cur_name.erase( std::remove( cur_name.begin(), cur_name.end(), '\n' ), cur_name.end() );
+			string space_str = "  ";
+			boost::algorithm::erase_all( cur_name, space_str );
+			mf_names[mf_level1][mf_level2] = cur_name;
+			delete gnq;
+
+			found_multifun = true;
+		}
+		// if we didn't find multifun, call it an "Unknown"
+		if( !found_multifun )
+		{
+			string q = "?";
+			mf_names[q][q] = "Unknown; No MultiFun Tag";
+			mf_count[q][q]++;
+		}
+
+		delete feat;
+	}
+}
+
+typedef boost::tuple< size_t, size_t, double, double, string > anal_row_t;
+/**
+ * Orders analysis rows by ascending value of their third field (the
+ * percentage computed in mfAnalyze).
+ */
+class AnalRowComp
+{
+public:
+	// const so the comparator also works through const references, like LocComp
+	bool operator()( const anal_row_t& a, const anal_row_t& b ) const
+	{
+		return a.get<2>() < b.get<2>();
+	}
+};
+
+double chi_square_threshold = 5; // mfAnalyze skips categories whose chi-square statistic is below this
+double min_expected_threshold = 5; // mfAnalyze skips categories whose expected count is below this
+/**
+ * Writes a table of the categories in subset_mf whose counts deviate from
+ * the expected frequency.
+ *
+ * For each (level 1, level 2) category the chi-square statistic
+ * (observed - expected)^2 / expected is computed, with expected =
+ * all_count * expect_freq.  Categories below chi_square_threshold or with an
+ * expected count below min_expected_threshold are skipped, as are categories
+ * that have no entry in all_mf or mf_names.  Rows are sorted by percentage
+ * and written with " & " between columns and "\\" + "\hline" after each row.
+ *
+ * @param anal_output	destination stream
+ * @param all_mf	category counts over all genes
+ * @param subset_mf	category counts over the subset under test
+ * @param mf_names	display name per category
+ * @param expect_freq	expected fraction of each category that falls in the subset
+ */
+void mfAnalyze( ofstream& anal_output, multifun_map_t& all_mf, multifun_map_t& subset_mf, multifun_names_t& mf_names, double expect_freq )
+{
+	vector< anal_row_t > rows;
+	multifun_map_t::iterator l1_iter = subset_mf.begin();
+	for( ; l1_iter != subset_mf.end(); ++l1_iter )
+	{
+		multifun_map_t::iterator all_l1_iter = all_mf.find(l1_iter->first);
+		multifun_names_t::iterator names_l1_iter = mf_names.find(l1_iter->first);
+		// a category missing from the reference maps cannot be evaluated
+		if( all_l1_iter == all_mf.end() || names_l1_iter == mf_names.end() )
+			continue;
+		map< string, double >::iterator l2_iter = l1_iter->second.begin();
+		for( ; l2_iter != l1_iter->second.end(); ++l2_iter )
+		{
+			map< string, double >::iterator all_l2_iter = all_l1_iter->second.find(l2_iter->first);
+			map< string, string >::iterator names_l2_iter = names_l1_iter->second.find(l2_iter->first);
+			if( all_l2_iter == all_l1_iter->second.end() ||
+				names_l2_iter == names_l1_iter->second.end() )
+				continue;
+
+			// percent in this category:
+			double pct = (l2_iter->second / all_l2_iter->second) * 100;
+			// category number:
+			string cat_num = l1_iter->first + "." + l2_iter->first;
+			// chi-square
+			double chi_square = (l2_iter->second - (all_l2_iter->second*expect_freq));
+			chi_square *= chi_square;
+			chi_square /= (all_l2_iter->second*expect_freq);
+			// total category gene count
+			// category name
+			if( chi_square < chi_square_threshold )
+				continue;	// not significantly different
+			if( (all_l2_iter->second*expect_freq) < min_expected_threshold )
+				continue;	// don't have enough elements to make reliable estimation
+			rows.push_back( boost::make_tuple( l2_iter->second, all_l2_iter->second, pct, chi_square, names_l2_iter->second ) );
+		}
+	}
+	AnalRowComp arc;
+	string col_delim = " & ";
+	string new_row = "\\\\\n\\hline\n";
+	std::sort( rows.begin(), rows.end(), arc );
+	anal_output << "NumGenes" << col_delim << "GenesInCat" << col_delim << "Percent" << col_delim;
+	anal_output << "Chi_square" << col_delim << "Mf_Level_2_name" << new_row;
+	for( size_t rI = 0; rI < rows.size(); ++rI )
+	{
+		// if we transition from under to over-represented, output an empty row
+		if( rI > 0 && rows[rI-1].get<2>() < expect_freq * 100 && rows[rI].get<2>() > expect_freq * 100 )
+			anal_output << new_row;
+		anal_output << rows[rI].get<0>() << col_delim;
+		anal_output << rows[rI].get<1>() << col_delim;
+		anal_output << setprecision(3) << rows[rI].get<2>() << col_delim;
+		anal_output << setprecision(3) << rows[rI].get<3>() << col_delim;
+		anal_output << rows[rI].get<4>() << new_row;
+	}
+}
+
+
+/**
+ * For every selected backbone segment, finds the nearest feature of an
+ * accepted type on its left and on its right in sequence seqI.
+ *
+ * A feature is a left candidate when the segment's left end lies beyond the
+ * feature's last coordinate minus ALTERNALOG_MIN_SIZE, and a right candidate
+ * when the segment's right end lies before the feature's first coordinate
+ * plus ALTERNALOG_MIN_SIZE.
+ *
+ * @param bb_list	backbone segments
+ * @param filter	only segments whose bit is set are examined
+ * @param seqI	index of the annotated sequence within each backbone entry
+ * @param neighbors	resized to bb_list.size(); (left, right) feature indices,
+ *                 	or the maximum size_t value where no neighbour was found
+ * @param anno_seq	annotated sequence providing the features
+ * @param feature_types	names of the feature types to consider
+ */
+void featureNearestNeighbors( const vector< bb_seqentry_t >& bb_list, const bitset_t& filter, uint seqI, vector< pair< size_t, size_t > >& neighbors, gnSequence* anno_seq, const vector< string >& feature_types )
+{
+	// stores the nearest features for each bb seg
+	neighbors.resize( bb_list.size() );
+
+	// (the original also read bb_list.front() into an unused local, which is
+	// undefined for an empty list; that read has been dropped)
+
+	// fetch every feature once up front; they are deleted at the end
+	vector< gnBaseFeature* > feats( anno_seq->getFeatureListLength() );
+	vector< gnLocation > locs( anno_seq->getFeatureListLength() );
+	vector< string > names( anno_seq->getFeatureListLength() );
+	for( size_t featureI = 0; featureI < anno_seq->getFeatureListLength(); ++featureI )
+	{
+		feats[featureI] = anno_seq->getFeature( featureI );
+		locs[featureI] = feats[featureI]->GetLocation(0);
+		names[featureI] = feats[featureI]->GetName();
+	}
+	for( size_t bbI = 0; bbI < bb_list.size(); ++bbI )
+	{
+		// find the nearest feature
+		size_t best_left = (std::numeric_limits<size_t>::max)();
+		size_t best_right = (std::numeric_limits<size_t>::max)();
+		size_t best_left_dist = (std::numeric_limits<size_t>::max)();
+		size_t best_right_dist = (std::numeric_limits<size_t>::max)();
+		if( !filter.test(bbI) )
+		{
+			neighbors[bbI].first = best_left;
+			neighbors[bbI].second = best_right;
+			continue;
+		}
+		for( size_t featI = 0; featI < feats.size(); ++featI )
+		{
+			size_t ntype = 0;
+			for( ; ntype < feature_types.size(); ++ntype )
+				if( names[featI] == feature_types[ntype] )
+					break;
+			if( ntype == feature_types.size() )
+				continue;	// apparently this type of feature isn't interesting...
+			if( locs[featI].GetFirst() > locs[featI].GetLast() || locs[featI].GetFirst() == 0 || locs[featI].GetLast() == 0 )
+				continue;	// a problem parsing annotation?
+			// NOTE(review): the distances below are computed as int64 but
+			// compared with and stored in size_t, so a small negative distance
+			// (a feature overlapping the segment within the slack) may be
+			// treated as a very large one -- confirm this is intended.
+			if( genome::absolut(bb_list[bbI][seqI].first) > locs[featI].GetLast() - ALTERNALOG_MIN_SIZE &&
+				(int64)genome::absolut(bb_list[bbI][seqI].first) - (int64)locs[featI].GetLast() < best_left_dist )
+			{
+				best_left_dist = (int64)genome::absolut(bb_list[bbI][seqI].first) - (int64)locs[featI].GetLast();
+				best_left = featI;
+			}
+			if( genome::absolut(bb_list[bbI][seqI].second) < locs[featI].GetFirst() + ALTERNALOG_MIN_SIZE &&
+				(int64)locs[featI].GetFirst() - (int64)genome::absolut(bb_list[bbI][seqI].second) < best_right_dist )
+			{
+				best_right_dist = (int64)locs[featI].GetFirst() - (int64)genome::absolut(bb_list[bbI][seqI].second);
+				best_right = featI;
+			}
+		}
+		neighbors[bbI].first = best_left;
+		neighbors[bbI].second = best_right;
+	}
+	// clean up
+	for( size_t featureI = 0; featureI < feats.size(); ++featureI )
+		delete feats[featureI];
+}
+
+/**
+ * Prints, one per line, the backbone entries whose bit is set in filter.
+ * @param os	destination stream; flushed after every entry
+ * @param bb_seq_list	entries to print via printBbSeq()
+ * @param filter	selects the entries to print
+ */
+void printFilteredBbSeqList( ostream& os, const vector< bb_seqentry_t >& bb_seq_list, const bitset_t& filter )
+{
+	for( size_t entryI = 0; entryI != bb_seq_list.size(); ++entryI )
+	{
+		if( !filter.test(entryI) )
+			continue;
+		printBbSeq( os, bb_seq_list[entryI] );
+		os << endl;
+	}
+}
+
+/**
+ * Classifies intergenic backbone segments by the features that flank them.
+ *
+ * First pass (CDS, rRNA, tRNA and misc_RNA neighbours): flags segments that
+ * have a tRNA or misc_RNA neighbour and flags those neighbouring features.
+ * Second pass (CDS neighbours only): sorts each segment into diverging,
+ * converging or inline according to whether the left / right CDS has
+ * location type LT_Complement.  Segments lacking a neighbour on either side
+ * are skipped in the respective pass.
+ *
+ * @param os	not used by this function
+ * @param bbseq_list	backbone segments
+ * @param intergenic	selects the segments to classify
+ * @param anno_seqI	index of the annotated sequence within each segment
+ * @param anno_seq	annotated sequence providing the features
+ * @param trna_neighbor, miscrna_neighbor	per-segment flags (resized to the segment count)
+ * @param converging_cds, diverging_cds, inline_cds	per-segment flags (resized to the segment count)
+ * @param variable_miscrna, variable_trna	per-feature flags (resized to the feature count)
+ */
+void classifyIntergenic( ostream& os, const vector< bb_seqentry_t >& bbseq_list, const bitset_t& intergenic,
+						uint anno_seqI, gnSequence* anno_seq, bitset_t& trna_neighbor, bitset_t& miscrna_neighbor,
+						bitset_t& converging_cds, bitset_t& diverging_cds, bitset_t& inline_cds,
+						bitset_t& variable_miscrna, bitset_t& variable_trna )
+{
+	// ---- pass 1: nearest neighbours among all interesting feature types ----
+	vector< string > all_types;
+	all_types.push_back( "CDS" );
+	all_types.push_back( "rRNA" );
+	all_types.push_back( "tRNA" );
+	all_types.push_back( "misc_RNA" );
+	trna_neighbor.resize( bbseq_list.size() );
+	miscrna_neighbor.resize( bbseq_list.size() );
+	variable_miscrna.resize(anno_seq->getFeatureListLength());
+	variable_trna.resize(anno_seq->getFeatureListLength());
+	vector< pair< size_t, size_t > > all_neighbors;
+	featureNearestNeighbors( bbseq_list, intergenic, anno_seqI, all_neighbors, anno_seq, all_types );
+	for( size_t bbI = 0; bbI < bbseq_list.size(); ++bbI )
+	{
+		if( !intergenic.test(bbI) )
+			continue;
+		const size_t left_id = all_neighbors[bbI].first;
+		const size_t right_id = all_neighbors[bbI].second;
+		// an index past the feature list means "no neighbour found"
+		if( left_id >= anno_seq->getFeatureListLength() ||
+			right_id >= anno_seq->getFeatureListLength() )
+			continue;
+		gnBaseFeature* lfeat = anno_seq->getFeature(left_id);
+		gnBaseFeature* rfeat = anno_seq->getFeature(right_id);
+		const string left_name = lfeat->GetName();
+		const string right_name = rfeat->GetName();
+
+		const bool left_trna = left_name == "tRNA";
+		const bool right_trna = right_name == "tRNA";
+		if( left_trna || right_trna )
+			trna_neighbor.set(bbI);
+		if( left_trna )
+			variable_trna.set(left_id);
+		if( right_trna )
+			variable_trna.set(right_id);
+
+		const bool left_misc = left_name == "misc_RNA";
+		const bool right_misc = right_name == "misc_RNA";
+		if( left_misc || right_misc )
+			miscrna_neighbor.set(bbI);
+		if( left_misc )
+			variable_miscrna.set(left_id);
+		if( right_misc )
+			variable_miscrna.set(right_id);
+
+		delete lfeat;
+		delete rfeat;
+	}
+
+	// ---- pass 2: orientation of the flanking CDS features ----
+	vector< string > cds_types;
+	cds_types.push_back( "CDS" );
+	vector< pair< size_t, size_t > > cds_neighbors;
+	featureNearestNeighbors( bbseq_list, intergenic, anno_seqI, cds_neighbors, anno_seq, cds_types );
+
+	converging_cds.resize( bbseq_list.size() );
+	diverging_cds.resize( bbseq_list.size() );
+	inline_cds.resize( bbseq_list.size() );
+	for( size_t bbI = 0; bbI < bbseq_list.size(); ++bbI )
+	{
+		if( !intergenic.test(bbI) )
+			continue;
+		if( cds_neighbors[bbI].first >= anno_seq->getFeatureListLength() ||
+			cds_neighbors[bbI].second >= anno_seq->getFeatureListLength() )
+			continue;
+		gnBaseFeature* lfeat = anno_seq->getFeature(cds_neighbors[bbI].first);
+		gnBaseFeature* rfeat = anno_seq->getFeature(cds_neighbors[bbI].second);
+		const bool left_complement = lfeat->GetLocationType() == gnLocation::LT_Complement;
+		const bool right_complement = rfeat->GetLocationType() == gnLocation::LT_Complement;
+		if( left_complement && !right_complement )
+			diverging_cds.set(bbI);
+		else if( !left_complement && right_complement )
+			converging_cds.set(bbI);
+		else
+			inline_cds.set(bbI);
+		delete lfeat;
+		delete rfeat;
+	}
+}
+
+/**
+ * Finds pairs of consecutive n-way backbone segments within the same
+ * interval that flank variable sequence.
+ *
+ * An n-way segment is one whose bb_cols multiplicity equals the sequence
+ * count and whose average length is at least min_bb_size.  Between two such
+ * consecutive segments the function counts (a) intervening segments of
+ * average length >= min_variable_size that introduce a sequence not seen in
+ * earlier intervening segments, and (b) sequences covered by no counted
+ * intervening segment whose gap between the two flanks is at least
+ * min_variable_size.  The pair is recorded when the count exceeds 1
+ * (alternalogs == true) or 0 (alternalogs == false).
+ *
+ * @param bb_list	backbone entries in alignment order
+ * @param avg_lens	average length of each entry, parallel to bb_list
+ * @param variable_segs	receives (left flank index, right flank index) pairs
+ * @param min_bb_size	minimum average length of a flanking segment
+ * @param min_variable_size	minimum size of a counted variable piece
+ * @param alternalogs	require more than one variable piece between the flanks
+ */
+void findVariableSegmentsWithFlankingBB( const vector< bb_entry_t >& bb_list, const vector< double >& avg_lens, vector< pair< size_t, size_t > >& variable_segs, size_t min_bb_size = ALTERNALOG_MIN_SIZE, size_t min_variable_size = ALTERNALOG_MIN_SIZE, bool alternalogs = false )
+{
+	const size_t NO_PREVIOUS = (std::numeric_limits<size_t>::max)();
+	// number of variable pieces a flank pair needs in order to be reported
+	const uint pieces_needed = alternalogs ? 2 : 1;
+	size_t left_flank = NO_PREVIOUS;
+	uint seq_count = bb_list.front().bb_seq.size();
+	for( size_t bbI = 0; bbI < bb_list.size(); ++bbI )
+	{
+		const bool is_nway = bb_list[bbI].bb_cols.Multiplicity() == seq_count;
+		if( !is_nway || avg_lens[bbI] < min_bb_size )
+			continue;	// don't count this as n-way backbone
+		if( left_flank == NO_PREVIOUS ||
+			(bb_list[left_flank].iv != bb_list[bbI].iv) )
+		{
+			// first flank, or the flanks lie in different intervals
+			left_flank = bbI;
+			continue;
+		}
+
+		const bb_seqentry_t& left_seq = bb_list[left_flank].bb_seq;
+		const bb_seqentry_t& right_seq = bb_list[bbI].bb_seq;
+
+		// count the substantial subset backbone between the two flanks
+		uint piece_count = 0;
+		bitset_t covered( seq_count );
+		for( size_t segI = left_flank + 1; segI < bbI; ++segI )
+		{
+			if( avg_lens[segI] < min_variable_size )
+				continue;
+			bool adds_sequence = false;
+			for( size_t seqI = 0; seqI < seq_count; ++seqI )
+			{
+				if( bb_list[segI].bb_seq[seqI].first == 0 )
+					continue;
+				if( !covered.test(seqI) )
+					adds_sequence = true;
+				covered.set(seqI);
+			}
+			if( adds_sequence )
+				piece_count++;
+		}
+
+		for( size_t seqI = 0; seqI < seq_count; ++seqI )
+		{
+			// debug: all four flank coordinates must share the sign of right.first
+			bool parity_mismatch = false;
+			if( right_seq[seqI].first < 0 )
+				parity_mismatch = right_seq[seqI].second > 0 ||
+					left_seq[seqI].first > 0 ||
+					left_seq[seqI].second > 0;
+			else if( right_seq[seqI].first > 0 )
+				parity_mismatch = right_seq[seqI].second < 0 ||
+					left_seq[seqI].first < 0 ||
+					left_seq[seqI].second < 0;
+			if( parity_mismatch )
+			{
+				cerr << "mismatch parity!!\n";
+				genome::breakHere();
+			}
+			// normal: sequences already covered by an intervening segment are done
+			if( covered.test(seqI) )
+				continue;
+			int64 gap = 0;
+			if( right_seq[seqI].first < 0 )
+				gap = genome::absolut( left_seq[seqI].first - right_seq[seqI].second );
+			else
+				gap = right_seq[seqI].first - left_seq[seqI].second;
+			if( gap >= min_variable_size )
+				piece_count++;
+		}
+
+		if( piece_count >= pieces_needed )
+			variable_segs.push_back( make_pair( left_flank, bbI ) );
+		left_flank = bbI;
+	}
+}
+
+/**
+ * Converts pairs of flanking backbone segments into the coordinates of the
+ * region between them.
+ * For each sequence, when the left flank's first coordinate is negative the
+ * region runs from the right flank's second coordinate to the left flank's
+ * first; otherwise from the left flank's second coordinate to the right
+ * flank's first.
+ * @param bb_list	backbone entries
+ * @param alternalogs	(left, right) index pairs into bb_list
+ * @param alternabb_list	resized to alternalogs.size(); receives one region per pair
+ */
+void makeVariableSegmentsCoordinateList( const vector< bb_entry_t >& bb_list, const vector< pair< size_t, size_t > >& alternalogs, vector< bb_seqentry_t >& alternabb_list )
+{
+	uint seq_count = bb_list.front().bb_seq.size();
+	alternabb_list.resize( alternalogs.size() );
+	for( size_t pairI = 0; pairI != alternalogs.size(); ++pairI )
+	{
+		const bb_seqentry_t& left = bb_list[ alternalogs[pairI].first ].bb_seq;
+		const bb_seqentry_t& right = bb_list[ alternalogs[pairI].second ].bb_seq;
+		// start from a copy of the left flank so the entry has the right size
+		bb_seqentry_t between = left;
+		for( size_t seqI = 0; seqI != seq_count; ++seqI )
+		{
+			const bool reverse = left[seqI].first < 0;
+			if( reverse )
+			{
+				between[seqI].first = right[seqI].second;
+				between[seqI].second = left[seqI].first;
+				continue;
+			}
+			between[seqI].first = left[seqI].second;
+			between[seqI].second = right[seqI].first;
+		}
+		alternabb_list[pairI] = between;
+	}
+}
+
+/** Orders gnLocation objects by ascending first coordinate. */
+class LocComp {
+public:
+	bool operator()( const gnLocation& a, const gnLocation& b ) const
+	{
+		const bool a_starts_first = a.GetFirst() < b.GetFirst();
+		return a_starts_first;
+	}
+};
+
+/**
+ * Collects, for every sequence, the gaps between consecutive CDS features.
+ * The first location of each CDS is gathered and sorted by start; whenever a
+ * CDS ends before the next one starts, the pair (end of the earlier CDS,
+ * start of the later CDS) is appended to the sequence's range list.
+ * Overlapping neighbours are stepped over, and a CDS completely containing
+ * its successor is reported on std::cerr.
+ * @param seq_table	annotated sequences
+ * @param ranges	resized to seq_table.size(); receives the gaps per sequence
+ */
+void identifyIntergenicRanges( vector< gnSequence* >& seq_table, vector< vector< pair< size_t, size_t > > >& ranges )
+{
+	ranges.resize(seq_table.size());
+	for( size_t seqI = 0; seqI < seq_table.size(); seqI++ )
+	{
+		vector< gnLocation > loc_list;
+		for( size_t featI = 0; featI < seq_table[seqI]->getFeatureListLength(); featI++ )
+		{
+			gnBaseFeature* feat = seq_table[seqI]->getFeature(featI);
+			string feat_name = feat->GetName();
+			// don't deal with other feature types (source, etc)
+			if( feat_name == "CDS" )
+				loc_list.push_back( feat->GetLocation(0) );
+			delete feat;	// the original leaked every non-CDS feature
+		}
+
+		LocComp lc;
+		std::sort( loc_list.begin(), loc_list.end(), lc );
+		// fI / lI always index a CDS and its successor in start order
+		size_t fI = 0;
+		size_t lI = 1;
+		while( fI < loc_list.size() && lI < loc_list.size() )
+		{
+			if( loc_list[fI].GetLast() < loc_list[lI].GetFirst() )
+				ranges[seqI].push_back( make_pair( loc_list[fI].GetLast(), loc_list[lI].GetFirst() ) );
+			fI++; lI++;
+			// skip over neighbours that overlap each other
+			while( fI < loc_list.size() && lI < loc_list.size() &&
+				loc_list[fI].GetLast() >= loc_list[lI].GetFirst() )
+			{
+				if( loc_list[fI].GetLast() >= loc_list[lI].GetLast() )
+				{
+					fI++; lI++;
+					cerr << "danger, complete containment in seq " << seqI << endl;
+				}
+				fI++; lI++;
+			}
+		}
+	}
+}
+
+//big_coli_sam_fixed_goh0001_gou000001.xmfa guide.tre big_coli_sam_fixed_goh0001_gou000001.xmfa.backbone big_coli_sam_fixed_goh0001_gou000001.xmfa.bbcols 5 bb.out
+
+ // Classifies every variable segment in alternabb_list, separately for each genome, by how
+ // it relates to that genome's annotated CDS, tRNA, rRNA and misc_RNA features.
+ //
+ // Returns immediately, leaving every output untouched, when alternabb_list is empty.
+ //
+ //   alternabb_list - per-genome coordinate intervals of the variable segments
+ //   annotated_seq  - not used by this function
+ //   seq_table      - annotated sequences, one per genome
+ // Outputs (each resized to one bitset per genome):
+ //   genic .. pseudogenized (all but the last three) - one bit per entry of alternabb_list
+ //   variable_miscrna, variable_trna - one bit per feature of the genome, set when that
+ //                                     misc_RNA / tRNA overlaps a variable segment
+ //   intergenic_segs - one bit per range reported by identifyIntergenicRanges(), set when
+ //                     a variable segment overlaps that range
+ void classifyCoordinateRanges(
+     const vector< bb_seqentry_t >& alternabb_list,
+     gnSequence* annotated_seq,
+     vector< gnSequence* >& seq_table,
+     vector< bitset_t >& genic,
+     vector< bitset_t >& genic_fudge,
+     vector< bitset_t >& overlaps_cds_upstream,
+     vector< bitset_t >& overlaps_cds_upstream_fudge,
+     vector< bitset_t >& overlaps_cds_downstream,
+     vector< bitset_t >& overlaps_cds_downstream_fudge,
+     vector< bitset_t >& intergenic,
+     vector< bitset_t >& spanner,
+     vector< bitset_t >& trna,
+     vector< bitset_t >& rrna,
+     vector< bitset_t >& miscrna,
+     vector< bitset_t >& pseudogenized,
+     vector< bitset_t >& variable_miscrna,
+     vector< bitset_t >& variable_trna,
+     vector< bitset_t >& intergenic_segs
+     )
+ {
+     if( alternabb_list.size() == 0 )
+         return;
+     uint seq_count = seq_table.size();
+     // count genic vs. intergenic alternalogs
+     // classify alternalogs as genic, intergenic, multigenic
+     // and pseudogenizing
+     bitset_t bbclass_tmp( alternabb_list.size() );
+     // all of these classifications should be mutually exclusive
+     genic.resize( seq_count, bbclass_tmp );
+     genic_fudge.resize( seq_count, bbclass_tmp );
+
+     // set to true if a variable segment ends in a CDS, but isn't contained by the CDS
+     overlaps_cds_upstream.resize( seq_count, bbclass_tmp );
+     overlaps_cds_upstream_fudge.resize( seq_count, bbclass_tmp );
+     overlaps_cds_downstream.resize( seq_count, bbclass_tmp );
+     overlaps_cds_downstream_fudge.resize( seq_count, bbclass_tmp );
+     intergenic.resize( seq_count, bbclass_tmp );
+     spanner.resize( seq_count, bbclass_tmp );
+     // these are true if trna or rrna are intersected
+     trna.resize( seq_count, bbclass_tmp );
+     rrna.resize( seq_count, bbclass_tmp );
+     miscrna.resize( seq_count, bbclass_tmp );
+     variable_miscrna.resize( seq_count );
+     variable_trna.resize( seq_count );
+     // an alternalog is pseudogenizing if it's genic in other sequences but not in the subject
+     pseudogenized.resize( seq_count, bbclass_tmp );
+
+     vector< vector< pair< size_t, size_t > > > ranges;
+     identifyIntergenicRanges( seq_table, ranges );
+     intergenic_segs.resize(seq_table.size());
+
+     // alterna_ptrs gets re-sorted for every genome; orig_ptrs keeps the input order so
+     // that ptr_map can translate a sorted position back to an alternabb_list index
+     vector< const bb_seqentry_t* > alterna_ptrs( alternabb_list.size() );
+     for( size_t i = 0; i < alternabb_list.size(); ++i )
+         alterna_ptrs[i] = &alternabb_list[i];
+     vector< const bb_seqentry_t* > orig_ptrs( alterna_ptrs );
+     for( size_t seqI = 0; seqI < seq_count; ++seqI )
+     {
+         BbSeqComp bsc( seqI );
+         std::sort( alterna_ptrs.begin(), alterna_ptrs.end(), bsc );
+         vector< size_t > ptr_map;
+         createMap( alterna_ptrs, orig_ptrs, ptr_map );
+
+         vector< vector< unsigned > > bb_features( alternabb_list.size() ); // stores feature IDs of overlapping features
+         variable_miscrna[seqI].resize(seq_table[seqI]->getFeatureListLength());
+         variable_trna[seqI].resize(seq_table[seqI]->getFeatureListLength());
+         for( size_t featureI = 0; featureI < seq_table[seqI]->getFeatureListLength(); ++featureI )
+         {
+             // feat is freed on every path out of this iteration
+             gnBaseFeature* feat = seq_table[seqI]->getFeature( featureI );
+             string feat_name = feat->GetName();
+             if( feat_name != "CDS" &&
+                 feat_name != "tRNA" &&
+                 feat_name != "rRNA" &&
+                 feat_name != "misc_RNA" )
+             {
+                 delete feat;
+                 continue; // don't deal with other feature types (source, etc)
+             }
+             if( feat->GetLocationListLength() > 1 )
+             {
+                 // any multi-part CDS features are likely to be pseudogene annotations
+                 // which we don't want to bias our results. There are only a couple true multi-part
+                 // CDS in enteric bacteria
+                 delete feat;
+                 continue;
+             }
+
+             gnLocation loc = feat->GetLocation(0);
+             if( loc.GetFirst() > loc.GetLast() || loc.GetFirst() == 0 || loc.GetLast() == 0 )
+             {
+                 delete feat;
+                 continue; // a problem parsing annotation?
+             }
+             // find where feature lands in our list
+             bb_seqentry_t tmp_bb( seq_count );
+             tmp_bb[seqI].first = loc.GetFirst();
+             tmp_bb[seqI].second = loc.GetFirst();
+             vector< const bb_seqentry_t* >::iterator liter = std::lower_bound( alterna_ptrs.begin(), alterna_ptrs.end(), &tmp_bb, bsc );
+             tmp_bb[seqI].first = loc.GetLast();
+             tmp_bb[seqI].second = loc.GetLast();
+             vector< const bb_seqentry_t* >::iterator uiter = std::lower_bound( alterna_ptrs.begin(), alterna_ptrs.end(), &tmp_bb, bsc );
+             // back up over preceding segments whose end still reaches the feature start
+             if( liter == alterna_ptrs.end() &&
+                 alterna_ptrs.size() > 0 &&
+                 genome::absolut( (*alterna_ptrs.back())[seqI].second ) >= loc.GetFirst() )
+                 liter--;
+             while( liter != alterna_ptrs.end() &&
+                 liter != alterna_ptrs.begin() &&
+                 genome::absolut( (**liter)[seqI].second ) >= loc.GetFirst() )
+                 --liter;
+             if( liter != alterna_ptrs.end() &&
+                 genome::absolut( (**liter)[seqI].second ) < loc.GetFirst() )
+                 ++liter;
+             // '<' rather than '!=': should the adjustment above ever leave liter past
+             // uiter, the loop is skipped instead of running off the end of the vector
+             for( ; liter < uiter; ++liter )
+             {
+                 bb_features[ liter - alterna_ptrs.begin() ].push_back( featureI );
+             }
+             delete feat;
+         }
+
+         // flag every intergenic range that is touched by some variable segment
+         intergenic_segs[seqI].resize(ranges[seqI].size());
+         for( size_t bbI = 0; bbI < alterna_ptrs.size(); ++bbI )
+         {
+             // NOTE(review): .first/.second can be negative (see the bb_loc construction
+             // below), in which case these conversions to size_t wrap around and the
+             // segment can never match a range -- confirm whether that is intended
+             size_t l = (*alterna_ptrs[bbI])[seqI].first;
+             size_t r = (*alterna_ptrs[bbI])[seqI].second;
+             for( size_t rI = 0; rI < ranges[seqI].size(); ++rI )
+             {
+                 if( (l < ranges[seqI][rI].first + 1 && ranges[seqI][rI].first + 1 <= r) || // left overlap and complete contains
+                     (l <= ranges[seqI][rI].second - 1 && ranges[seqI][rI].first + 1 <= r) ) // right overlap and inside
+                 {
+                     intergenic_segs[seqI].set(rI);
+                     break;
+                 }
+             }
+         }
+
+         for( size_t bbI = 0; bbI < alterna_ptrs.size(); ++bbI )
+         {
+             gnLocation bb_loc;
+             if( (*alterna_ptrs[bbI])[seqI].first > 0 )
+                 bb_loc = gnLocation((*alterna_ptrs[bbI])[seqI].first, (*alterna_ptrs[bbI])[seqI].second);
+             else
+                 bb_loc = gnLocation(-(*alterna_ptrs[bbI])[seqI].first, -(*alterna_ptrs[bbI])[seqI].second);
+             for( size_t featI = 0; featI < bb_features[bbI].size(); ++featI )
+             {
+                 gnBaseFeature* feat = seq_table[seqI]->getFeature( bb_features[bbI][featI] );
+                 gnLocation feat_loc = feat->GetLocation(0);
+                 gnLocation intersect = feat_loc.GetIntersection( bb_loc, gnLocation::determinedRegions );
+                 string name = feat->GetName();
+                 if( intersect.GetFirst() == bb_loc.GetFirst() &&
+                     intersect.GetLast() == bb_loc.GetLast() &&
+                     name == "CDS" )
+                 {
+                     if( intersect.GetLast() - intersect.GetFirst() > ALTERNALOG_MIN_SIZE ||
+                         intersect.GetFirst() - ALTERNALOG_MIN_SIZE > feat_loc.GetFirst() ||
+                         intersect.GetLast() + ALTERNALOG_MIN_SIZE < feat_loc.GetLast() )
+                     {
+                         // alternalog completely contained by CDS, at least ALTERNALOG_MIN_SIZE inside the CDS
+                         genic[seqI].set( ptr_map[bbI] );
+                     }else{
+                         genic_fudge[seqI].set( ptr_map[bbI] ); // small and close to the edge
+                     }
+                 }
+                 else if( (intersect.GetFirst() == bb_loc.GetFirst() ||
+                     intersect.GetLast() == bb_loc.GetLast()) &&
+                     name == "CDS" )
+                 {
+                     bool up = false;
+                     // overlaps a CDS by at least ALTERNALOG_MIN_SIZE nucleotides,
+                     // but does not contain the CDS, nor is it contained by the CDS
+                     if( intersect.GetFirst() == bb_loc.GetFirst() )
+                     {
+                         if( feat->GetLocationType() != gnLocation::LT_Standard )
+                             up = true;
+                     }else if( feat->GetLocationType() == gnLocation::LT_Standard )
+                         up = true;
+
+                     if( !up && intersect.GetLast() - intersect.GetFirst() > ALTERNALOG_MIN_SIZE )
+                         overlaps_cds_downstream[seqI].set( ptr_map[bbI] );
+                     else if( !up )
+                         overlaps_cds_downstream_fudge[seqI].set( ptr_map[bbI] );
+                     else if( up && intersect.GetLast() - intersect.GetFirst() > ALTERNALOG_MIN_SIZE )
+                         overlaps_cds_upstream[seqI].set( ptr_map[bbI] );
+                     else
+                         overlaps_cds_upstream_fudge[seqI].set( ptr_map[bbI] );
+
+                 }else if( intersect.GetLast() - intersect.GetFirst() > ALTERNALOG_MIN_SIZE &&
+                     name == "CDS" )
+                 {
+                     // spans CDS
+                     spanner[seqI].set( ptr_map[bbI] );
+                 }
+                 if( intersect.GetLast() - intersect.GetFirst() > ALTERNALOG_MIN_SIZE &&
+                     name == "rRNA" )
+                 {
+                     // overlaps a rRNA by at least ALTERNALOG_MIN_SIZE nucleotides
+                     rrna[seqI].set( ptr_map[bbI] );
+                 }
+                 if( intersect.GetLast() - intersect.GetFirst() > ALTERNALOG_MIN_SIZE &&
+                     name == "tRNA" )
+                 {
+                     // overlaps a tRNA by at least ALTERNALOG_MIN_SIZE nucleotides
+                     trna[seqI].set( ptr_map[bbI] );
+                     variable_trna[seqI].set(bb_features[bbI][featI]);
+                 }
+                 if( intersect.GetLast() - intersect.GetFirst() > ALTERNALOG_MIN_SIZE &&
+                     name == "misc_RNA" )
+                 {
+                     // overlaps a misc_RNA by at least ALTERNALOG_MIN_SIZE nucleotides
+                     miscrna[seqI].set( ptr_map[bbI] );
+                     variable_miscrna[seqI].set(bb_features[bbI][featI]);
+                 }
+                 delete feat;
+             }
+         }
+         // intergenic = everything not flagged genic, CDS-overlapping, rRNA or tRNA
+         intergenic[seqI] = genic[seqI] | overlaps_cds_upstream[seqI] | overlaps_cds_downstream[seqI] | rrna[seqI] | trna[seqI];
+         intergenic[seqI].flip();
+     }
+
+     // identify pseudogenizing segments as intergenic segments in one genome that
+     // are genic in other genomes
+     size_t seqI = 0;
+     for( seqI = 0; seqI < seq_count; ++seqI )
+     {
+         bitset_t pseudo = bbclass_tmp;
+         for( size_t seqJ = 0; seqJ < seq_count; ++seqJ )
+         {
+             if( seqJ == seqI )
+                 continue;
+             pseudo |= genic[seqJ] | overlaps_cds_upstream[seqJ] | overlaps_cds_downstream[seqJ];
+         }
+         bitset_t fudge = genic_fudge[seqI] | overlaps_cds_upstream_fudge[seqI] | overlaps_cds_downstream_fudge[seqI];
+         fudge.flip(); // if it's questionably within a gene then don't let it be a pseudogene. we want to be sure
+                       // about these
+         pseudogenized[seqI] = intergenic[seqI] & pseudo & fudge;
+     }
+ }
+
+ // Finds the variable segments between backbone segments, classifies them against the
+ // genome annotations and writes a textual report to os.
+ //   os                  - report destination
+ //   bb_list             - backbone entries
+ //   avg_lens            - per-backbone-entry length, forwarded to findVariableSegmentsWithFlankingBB
+ //   anno_seqI           - index into seq_table of the annotated reference genome
+ //   seq_table           - annotated sequences, one per genome
+ //   site_class_name     - label used for the segments in the report text
+ //   analyze_alternalogs - forwarded to findVariableSegmentsWithFlankingBB
+ // When no variable segment is found only the "There are 0 ..." line is written.
+ void analyzeVariableSegments( ostream& os, const vector< bb_entry_t >& bb_list, const vector< double >& avg_lens, uint anno_seqI, vector< gnSequence* >& seq_table, string site_class_name = "alternalog", bool analyze_alternalogs = true )
+ {
+     gnSequence* annotated_seq = seq_table[anno_seqI];
+     vector< pair< size_t, size_t > > alternalogs;
+     vector< bb_seqentry_t > alternabb_list;
+     findVariableSegmentsWithFlankingBB( bb_list, avg_lens, alternalogs, ALTERNALOG_MIN_SIZE, ALTERNALOG_MIN_SIZE, analyze_alternalogs );
+     makeVariableSegmentsCoordinateList( bb_list, alternalogs, alternabb_list );
+
+     os << "There are " << alternalogs.size() << " " << site_class_name << " sites\n";
+
+     // classifyCoordinateRanges() returns without sizing any of its outputs when there is
+     // nothing to classify, so every per-genome lookup below would read past the end of
+     // an empty vector. There is nothing further to report in that case.
+     if( alternabb_list.empty() )
+     {
+         os.flush();
+         return;
+     }
+
+     // count genic vs. intergenic alternalogs
+     // classify alternalogs as genic, intergenic, etc.
+     vector< bitset_t > alt_genic, alt_overlaps_cds_upstream, alt_overlaps_cds_downstream;
+     vector< bitset_t > alt_intergenic, alt_spanner, alt_trna, alt_rrna, alt_pseudogenized;
+     vector< bitset_t > alt_genic_fudge, alt_overlaps_cds_upstream_fudge, alt_overlaps_cds_downstream_fudge;
+     vector< bitset_t > alt_miscrna, v_miscrna, v_trna;
+     vector< bitset_t > intergenic_segs;
+
+     classifyCoordinateRanges(
+         alternabb_list, annotated_seq, seq_table, alt_genic, alt_genic_fudge, alt_overlaps_cds_upstream,
+         alt_overlaps_cds_upstream_fudge, alt_overlaps_cds_downstream, alt_overlaps_cds_downstream_fudge,
+         alt_intergenic, alt_spanner, alt_trna, alt_rrna, alt_miscrna, alt_pseudogenized, v_miscrna, v_trna,
+         intergenic_segs
+         );
+
+     // find alternalogs that are always inside annotated genes
+     bitset_t bbclass_tmp( alternabb_list.size() );
+     bitset_t alt_multi_allelic_genes( bbclass_tmp );
+     alt_multi_allelic_genes.flip();
+     // alternalogs that are always outside genes
+     bitset_t alt_multi_allelic_intergenic( bbclass_tmp );
+     bitset_t alt_multi_allelic_entirely_intergenic( bbclass_tmp );
+     alt_multi_allelic_intergenic.flip();
+     alt_multi_allelic_entirely_intergenic.flip();
+     // NOTE(review): the classification vectors are sized from seq_table.size(); this
+     // assumes bb_list entries carry the same number of genomes
+     uint seq_count = bb_list.front().bb_seq.size();
+     for( uint seqI = 0; seqI < seq_count; ++seqI )
+     {
+         // start from all-ones and intersect across genomes
+         alt_multi_allelic_genes &= alt_genic[seqI];
+         alt_multi_allelic_intergenic &= alt_intergenic[seqI];
+         bitset_t spanner_flip = alt_spanner[seqI];
+         spanner_flip.flip();
+         alt_multi_allelic_entirely_intergenic &= alt_intergenic[seqI] & spanner_flip;
+     }
+
+
+     os << " There are " << alt_multi_allelic_genes.count() << " apparently multi-allelic genes (" << site_class_name << ")\n";
+     os << " There are " << alt_multi_allelic_intergenic.count() << " apparently multi-allelic regions with intergenic endpoints (" << site_class_name << ")\n";
+     os << " Of those, " << alt_multi_allelic_entirely_intergenic.count() << " contain no annotated CDS (" << site_class_name << ")\n";
+     os << " The remaining segments span gene boundaries, but are not entirely contained in annotated genes\n";
+
+     bitset_t trna_neighbor;
+     bitset_t miscrna_neighbor;
+     bitset_t converging_cds;
+     bitset_t diverging_cds;
+     bitset_t inline_cds;
+     bitset_t vv_miscrna;
+     bitset_t vv_trna;
+     classifyIntergenic( os, alternabb_list, alt_multi_allelic_intergenic, anno_seqI,
+         annotated_seq, trna_neighbor, miscrna_neighbor, converging_cds, diverging_cds, inline_cds, vv_miscrna, vv_trna );
+
+
+     os << "There are " << trna_neighbor.count() << " intergenic segments with a tRNA nearest neighbor\n";
+     os << "There are " << miscrna_neighbor.count() << " intergenic segments with a miscRNA nearest neighbor\n";
+     os << "There are " << converging_cds.count() << " intergenic segments surrounded by converging CDS\n";
+     os << "There are " << diverging_cds.count() << " intergenic segments surrounded by diverging CDS\n";
+     os << "There are " << inline_cds.count() << " intergenic segments surrounded by inline CDS\n";
+     bitset_t miscrna_inter = v_miscrna[anno_seqI] | vv_miscrna;
+     os << "There are " << miscrna_inter.count() << " annotated misc_RNA associated with variable segments\n";
+     os << "There are " << intergenic_segs[anno_seqI].size() << " intergenic sites in the ref genome, of which " << intergenic_segs[anno_seqI].count() << " exhibit variability\n";
+     bitset_t trna_inter = v_trna[anno_seqI] | vv_trna;
+     os << "There are " << trna_inter.count() << " annotated tRNA associated with variable segments\n";
+
+     if( miscrna_neighbor.count() > 0 )
+     {
+         os << "coordinates of variable segs with misc_RNA neighboring:\n";
+         printFilteredBbSeqList( os, alternabb_list, miscrna_neighbor );
+     }
+     if( diverging_cds.count() > 0 )
+     {
+         os << "coordinates of variable segs with diverging_cds neighboring:\n";
+         printFilteredBbSeqList( os, alternabb_list, diverging_cds );
+     }
+     bitset_t total_miscrna = alt_miscrna[anno_seqI] | miscrna_neighbor;
+     os << "Total variable intergenic segs that neighbor or contain miscRNA: " << total_miscrna.count() << endl;
+
+     os << "coordinates of multi-allelic genes:\n";
+     printFilteredBbSeqList( os, alternabb_list, alt_multi_allelic_genes );
+
+     os << "coordinates of multi-allelic intergenic regions without CDS:\n";
+     printFilteredBbSeqList( os, alternabb_list, alt_multi_allelic_entirely_intergenic );
+
+     // per-genome breakdown
+     for( size_t seqI = 0; seqI < seq_count; ++seqI )
+     {
+         os << "genome " << seqI << " has " << alt_genic[seqI].count() << " " << site_class_name << " within CDS\n";
+         os << "genome " << seqI << " has " << alt_spanner[seqI].count() << " " << site_class_name << " that span CDS boundaries\n";
+         os << "genome " << seqI << " has " << alt_intergenic[seqI].count() << " " << site_class_name << " that lie entirely in intergenic regions\n";
+         os << "genome " << seqI << " has " << alt_rrna[seqI].count() << " " << site_class_name << " that contain rRNA\n";
+         os << "genome " << seqI << " has " << alt_trna[seqI].count() << " " << site_class_name << " that contain tRNA\n";
+         os << "genome " << seqI << " has " << alt_miscrna[seqI].count() << " " << site_class_name << " that contain misc_RNA\n";
+         os << "genome " << seqI << " has " << alt_pseudogenized[seqI].count() << " apparent recent pseudogenes in " << site_class_name << "\n";
+         os.flush();
+
+ /*
+         os << "coordinates of genic alternalogs:\n";
+         printFilteredBbSeqList( os, alternabb_list, genic[seqI] );
+ */
+
+         if( alt_trna[seqI].count() > 0 )
+         {
+             os << "coordinates of tRNA " << site_class_name << ":\n";
+             printFilteredBbSeqList( os, alternabb_list, alt_trna[seqI] );
+         }
+
+         if( alt_rrna[seqI].count() > 0 )
+         {
+             os << "coordinates of rRNA " << site_class_name << ":\n";
+             printFilteredBbSeqList( os, alternabb_list, alt_rrna[seqI] );
+         }
+
+         if( alt_miscrna[seqI].count() > 0 )
+         {
+             os << "coordinates of misc_RNA " << site_class_name << ":\n";
+             printFilteredBbSeqList( os, alternabb_list, alt_miscrna[seqI] );
+         }
+
+         os << "coordinates of possible pseudogenes:\n";
+         printFilteredBbSeqList( os, alternabb_list, alt_pseudogenized[seqI] );
+         os.flush();
+     }
+ }
+
+ const uint INTERNAL_NODE = (std::numeric_limits<uint>::max)(); // sentinel: the largest uint value (max is parenthesised, presumably to sidestep a max() macro)
+ const uint INTERVAL_UNKNOWN = (std::numeric_limits<uint>::max)(); // sentinel: the largest uint value; main() stores it in the iv field of the faux single-genome backbone entries it creates
+
+ // bbAnalyze entry point.
+ //
+ // Usage: bbAnalyze <xmfa file> <guide tree> <backbone seqpos file> <backbone col file>
+ //                  <annotated seq index> <output file>
+ //
+ // Reads an alignment, its guide tree and its backbone files, classifies every backbone
+ // segment at each node of the tree (conserved, unique, "hop one", "hop two"), and writes
+ // per-node summaries followed by the variable segment / alternalog reports to the
+ // output file.
+ //
+ // Exit codes:
+ //   0  success
+ //  -1  too few arguments
+ //  -2  cannot open the alignment            -3  cannot open the guide tree
+ //  -4  cannot open the backbone seqpos file -5  cannot open the backbone column file
+ //  -6  cannot open the output file
+ //  -7  backbone column file has fewer entries than the backbone seqpos file
+ //  -8  annotated sequence index out of range
+ //  -9  guide tree leaf names do not follow the seqN convention
+ int main( int argc, char* argv[] )
+ {
+ #if WIN32
+     SetPriorityClass(GetCurrentProcess(), BELOW_NORMAL_PRIORITY_CLASS);
+ #endif
+
+     if( argc < 7 )
+     {
+         cerr << "bbAnalyze <xmfa file> <guide tree> <backbone seqpos file> <backbone col file> <annotated seq index> <output file>\n";
+         cerr << "annotated seq index starts at 0.\n";
+         return -1;
+     }
+     string aln_fname( argv[1] );
+     string guide_tree_fname( argv[2] );
+     string bbseq_fname( argv[3] );
+     string bbcol_fname( argv[4] );
+     int gff_seq_index = atoi( argv[5] );
+     string output_fname( argv[6] );
+
+     ifstream aln_input( aln_fname.c_str() );
+     if( !aln_input.is_open() ){
+         cerr << "Error opening \"" << aln_fname << "\"" << endl;
+         return -2;
+     }
+     ifstream tree_input( guide_tree_fname.c_str() );
+     if( !tree_input.is_open() ){
+         cerr << "Error opening \"" << guide_tree_fname << "\"" << endl;
+         return -3;
+     }
+     ifstream bbseq_input( bbseq_fname.c_str() );
+     if( !bbseq_input.is_open() ){
+         cerr << "Error opening \"" << bbseq_fname << "\"" << endl;
+         return -4;
+     }
+     ifstream bbcol_input( bbcol_fname.c_str() );
+     if( !bbcol_input.is_open() ){
+         cerr << "Error opening \"" << bbcol_fname << "\"" << endl;
+         return -5;  // was -4, indistinguishable from the seqpos file failure
+     }
+     ofstream anal_output( output_fname.c_str() );
+     if( !anal_output.is_open() ){
+         cerr << "Error opening \"" << output_fname << "\" for writing" << endl;
+         return -6;
+     }
+
+     // read the guide tree
+     PhyloTree< TreeNode > tree;
+     tree.readTree( tree_input );
+
+     // read the backbone column file
+     vector< bb_seqentry_t > bb_seq_list;
+     vector< pair< size_t, ULA > > bb_col_list;
+     readBackboneSeqFile( bbseq_input, bb_seq_list );
+     readBackboneColsFile( bbcol_input, bb_col_list );
+
+     // the two backbone files are walked in lock step below
+     if( bb_col_list.size() < bb_seq_list.size() )
+     {
+         cerr << "Backbone column file \"" << bbcol_fname << "\" has fewer entries than backbone seqpos file \"" << bbseq_fname << "\"" << endl;
+         return -7;
+     }
+
+     // read the alignment
+     IntervalList iv_list;
+     iv_list.ReadStandardAlignment( aln_input );
+
+     LoadSequences(iv_list, &cout);
+
+
+
+     const size_t seq_count = iv_list.seq_table.size();
+
+     // the annotated sequence index is used to subscript seq_table throughout
+     if( gff_seq_index < 0 || static_cast< size_t >( gff_seq_index ) >= seq_count )
+     {
+         cerr << "Annotated seq index " << gff_seq_index << " is out of range; the alignment has " << seq_count << " sequences\n";
+         return -8;
+     }
+
+     vector< bb_entry_t > bb_list( bb_seq_list.size() );
+     for( size_t i = 0; i < bb_seq_list.size(); ++i )
+     {
+         bb_list[i].bb_seq = bb_seq_list[i];
+         bb_list[i].bb_cols = bb_col_list[i].second;
+         bb_list[i].iv = bb_col_list[i].first;
+         // awful hack: homogenize the parity inside intervals. this is a bug in progressiveMauve
+         for( size_t seqI = 0; seqI < seq_count; ++seqI )
+         {
+             AbstractMatch::orientation o = iv_list[bb_list[i].iv].Orientation(seqI);
+             if( o == AbstractMatch::undefined )
+                 continue;
+             if( bb_list[i].bb_cols.LeftEnd(seqI) != NO_MATCH )
+                 bb_list[i].bb_cols.SetOrientation( seqI, o );
+             if( (bb_list[i].bb_seq[seqI].first < 0 && o == AbstractMatch::forward) ||
+                 (bb_list[i].bb_seq[seqI].first > 0 && o == AbstractMatch::reverse) )
+                 bb_list[i].bb_seq[seqI].first *= -1;
+             if( (bb_list[i].bb_seq[seqI].second < 0 && o == AbstractMatch::forward) ||
+                 (bb_list[i].bb_seq[seqI].second > 0 && o == AbstractMatch::reverse) )
+                 bb_list[i].bb_seq[seqI].second *= -1;
+             if( genome::absolut( bb_list[i].bb_seq[seqI].first ) > genome::absolut( bb_list[i].bb_seq[seqI].second ) )
+                 swap( bb_list[i].bb_seq[seqI].first, bb_list[i].bb_seq[seqI].second );
+         }
+     }
+
+
+     // make faux single-genome bb segments for anything not contained in
+     // real backbone
+     for( uint seqI = 0; seqI < seq_count; seqI++ )
+     {
+         vector< AbstractMatch* > seq_beeb;
+         ULA single_ula(1);
+         for( size_t i = 0; i < bb_seq_list.size(); ++i )
+         {
+             if( bb_seq_list[i][seqI].first == 0 )
+                 continue;
+             single_ula.SetStart( 0, genome::absolut(bb_seq_list[i][seqI].first) );
+             single_ula.SetLength( genome::absolut(bb_seq_list[i][seqI].second - bb_seq_list[i][seqI].first) + 1 );
+             seq_beeb.push_back( single_ula.Copy() );
+         }
+         SingleStartComparator<AbstractMatch> ssc(0);
+         sort( seq_beeb.begin(), seq_beeb.end(), ssc );
+         // HACK!!
+         // trim single base pair overlaps in seq_beeb that arise due to an off-by-one bug in the backbone output...
+         EliminateOverlaps_v2( seq_beeb );
+         sort( seq_beeb.begin(), seq_beeb.end(), ssc );
+         list< AbstractMatch* > seq_beeb_list( seq_beeb.begin(), seq_beeb.end() );
+         AddGapMatches( seq_beeb_list, seq_beeb_list.begin(), seq_beeb_list.end(),
+             0, 1, iv_list.seq_table[seqI]->length()+1, AbstractMatch::forward, 1 );
+         // sort both pointer sets by address so that set_difference can isolate the
+         // matches that AddGapMatches added
+         sort( seq_beeb.begin(), seq_beeb.end() );
+         vector< AbstractMatch* > tmp_list( seq_beeb_list.begin(), seq_beeb_list.end() );
+         sort( tmp_list.begin(), tmp_list.end() );
+         vector< AbstractMatch* > new_beeb( seq_beeb_list.size() - seq_beeb.size() );
+         std::set_difference( tmp_list.begin(), tmp_list.end(),
+             seq_beeb.begin(), seq_beeb.end(), new_beeb.begin() );
+
+         // add each new_beeb to the backbone list
+         size_t bbI = bb_list.size();
+         bb_list.resize( bbI + new_beeb.size() );
+         for( size_t i = 0; i < new_beeb.size(); ++i )
+         {
+             bb_list[bbI].bb_seq.resize( seq_count );
+             bb_list[bbI].bb_seq[seqI] = make_pair( new_beeb[i]->LeftEnd(0), new_beeb[i]->RightEnd(0) );
+             bb_list[bbI].iv = INTERVAL_UNKNOWN;
+             ULA cols(seq_count);
+             cols.SetLeftEnd(seqI, 1);
+             cols.SetLength(new_beeb[i]->Length(0));
+             bb_list[bbI].bb_cols = cols;
+             bbI++;
+         }
+         for( size_t i = 0; i < tmp_list.size(); ++i )
+             tmp_list[i]->Free();
+     }
+
+     // create a map between tree nodes and sequences
+     vector< uint > node_sequence_map( tree.size(), -1 );
+     for( uint seqI = 0; seqI < seq_count; seqI++ )
+     {
+         stringstream seq_name;
+         seq_name << "seq" << seqI + 1;
+         node_id_t nodeI = 0;
+         for( ; nodeI < tree.size(); nodeI++ )
+         {
+             if( seq_name.str() == tree[nodeI].name )
+             {
+                 node_sequence_map[nodeI] = seqI;
+                 break;
+             }
+         }
+         if( nodeI == tree.size() )
+         {
+             // previously thrown as a bare string literal that nothing catches
+             cerr << "Phylogenetic tree names unrecognized. Should follow seqN naming format\n";
+             return -9;
+         }
+     }
+
+     // mark small backbone segments
+     bitset_t too_small( bb_list.size(), false );
+     for( size_t bbI = 0; bbI < bb_list.size(); ++bbI )
+         if( bb_list[bbI].bb_cols.Length() < DISCARD_SEGMENT )
+             too_small.set(bbI, true);
+     bitset_t not_small = too_small;
+     not_small.flip();
+
+     // average length of each segment over the genomes in which it is defined
+     vector< double > avg_lens( bb_list.size(), 0 );
+     for( size_t bbI = 0; bbI < bb_list.size(); ++bbI )
+     {
+         double ct = 0;
+         for( size_t seqI = 0; seqI < bb_list[bbI].bb_seq.size(); ++seqI )
+         {
+             if( bb_list[bbI].bb_seq[seqI].first != 0 )
+             {
+                 ct++;
+                 avg_lens[bbI] += genome::absolut( bb_list[bbI].bb_seq[seqI].second - bb_list[bbI].bb_seq[seqI].first ) + 1;
+             }
+         }
+         // a segment defined in no genome keeps length 0 instead of becoming 0/0
+         if( ct > 0 )
+             avg_lens[bbI] /= ct;
+     }
+
+
+     // got the backbone. now do something with it.
+     // at each node of the tree, count the total amount backbone contained in nodes
+     // below that tree, both inside genes and outside genes
+
+
+     vector< node_id_t > all_leaves;
+     getLeaves( tree, tree.root, all_leaves );
+     sort( all_leaves.begin(), all_leaves.end() );
+
+     bitset_t true_temper( bb_list.size() );
+     true_temper.reset();
+     true_temper.flip();
+     bitset_t false_temper( bb_list.size() );
+     false_temper.reset();
+
+     vector< bitset_t > unique( tree.size(), true_temper );
+     // partial contains bb segs that have representation among two or more genomes below a given node
+     vector< bitset_t > partial( tree.size(), true_temper );
+     // conserved have representation in all genomes below a node, and possibly others
+     vector< bitset_t > conserved( tree.size(), true_temper );
+     // child partial have representation in one or more genomes below a node
+     vector< bitset_t > c1_partial( tree.size(), false_temper );
+     vector< bitset_t > c2_partial( tree.size(), false_temper );
+     vector< bitset_t > c1_complete( tree.size(), false_temper );
+     vector< bitset_t > c2_complete( tree.size(), false_temper );
+
+     // calculate which segments have heterogenous occurrence at each node
+     vector< bitset_t > hop_one( tree.size(), false_temper );
+     vector< bitset_t > hop_two( tree.size(), false_temper );
+     vector< double > pan_genome_size( tree.size(), 0 );
+     // hop_two if(c1_partial && c2_partial) && !c1_complete && !c2_complete
+     // hop_one if !hop_two && (!c1_complete || !c2_complete) && (c1_partial && c2_partial) && !(hop_one at incomplete child)
+
+     // first pass: per-node presence/absence statistics
+     // (internal nodes are assumed to have exactly two children)
+     stack< node_id_t > node_stack;
+     node_stack.push( tree.root );
+     bitset_t visited( tree.size(), false );
+     while( node_stack.size() > 0 )
+     {
+         node_id_t nI = node_stack.top();
+         if( !visited[nI] && tree[nI].children.size() > 0 )
+         {
+             node_stack.push( tree[nI].children[0] );
+             node_stack.push( tree[nI].children[1] );
+             visited.set(nI,true);
+             continue; // visit post-order
+         }
+         node_stack.pop();
+
+         vector< node_id_t > leaves;
+         getLeaves( tree, nI, leaves );
+         sort( leaves.begin(), leaves.end() );
+
+         vector< node_id_t > not_leaves( all_leaves.size() - leaves.size() );
+         std::set_difference( all_leaves.begin(), all_leaves.end(),
+             leaves.begin(), leaves.end(),
+             not_leaves.begin() );
+
+
+         vector< node_id_t > c1_leaves;
+         vector< node_id_t > c2_leaves;
+         if( tree[nI].children.size() > 0 )
+         {
+             getLeaves( tree, tree[nI].children[0], c1_leaves );
+             getLeaves( tree, tree[nI].children[1], c2_leaves );
+         }
+
+         for( size_t bbI = 0; bbI < bb_list.size(); ++bbI )
+         {
+             // do all the leaves have this segment?
+             size_t lI = 0;
+             size_t ct = 0;
+             for( lI = 0; lI < leaves.size(); ++lI )
+             {
+                 if( bb_list[bbI].bb_seq[ node_sequence_map[ leaves[lI] ] ].first != 0 )
+                     ct++;
+             }
+             unique[nI].set(bbI, ct == leaves.size());
+
+             // was this conserved in more than one?
+             partial[nI].set(bbI, ct > 1);
+             conserved[nI].set(bbI, ct == leaves.size());
+
+             // if this one was represented at all then it's part of the pan-genome
+             if( ct > 0 )
+                 pan_genome_size[nI] += avg_lens[bbI];
+
+             // do only the leaves below this node have this segment?
+             for( lI = 0; lI < not_leaves.size(); ++lI )
+             {
+                 if( bb_list[bbI].bb_seq[ node_sequence_map[ not_leaves[lI] ] ].first != 0 )
+                     unique[nI].set(bbI, false);
+             }
+
+             // is the segment present in both children?
+             bool c1 = false;
+             bool c2 = false;
+             uint c1_ct = 0;
+             uint c2_ct = 0;
+             for( lI = 0; lI < c1_leaves.size(); ++lI )
+             {
+                 if( bb_list[bbI].bb_seq[ node_sequence_map[ c1_leaves[lI] ] ].first != 0 )
+                     c1_ct++;
+             }
+             for( lI = 0; lI < c2_leaves.size(); ++lI )
+             {
+                 if( bb_list[bbI].bb_seq[ node_sequence_map[ c2_leaves[lI] ] ].first != 0 )
+                     c2_ct++;
+             }
+             c1_partial[nI].set(bbI, c1_ct > 0);
+             c2_partial[nI].set(bbI, c2_ct > 0);
+             c1_complete[nI].set(bbI, c1_ct == c1_leaves.size());
+             c2_complete[nI].set(bbI, c2_ct == c2_leaves.size());
+         }
+     }
+
+     // second pass: derive hop_one / hop_two and the clade-wide unique sets
+     node_stack.push( tree.root );
+     visited = bitset_t( tree.size(), false );
+     vector< bitset_t > all_unique( tree.size(), false_temper );
+     while( node_stack.size() > 0 )
+     {
+         node_id_t nI = node_stack.top();
+         if( !visited[nI] && tree[nI].children.size() > 0 )
+         {
+             node_stack.push( tree[nI].children[0] );
+             node_stack.push( tree[nI].children[1] );
+             visited.set(nI,true);
+             continue; // visit post-order
+         }
+         node_stack.pop();
+
+         all_unique[nI] = unique[nI];
+
+         if( tree[nI].children.size() == 0 )
+             continue; // hop concept doesn't apply to leaf nodes
+         bitset_t not_c1_comp = c1_complete[nI];
+         not_c1_comp.flip();
+         bitset_t not_c2_comp = c2_complete[nI];
+         not_c2_comp.flip();
+         hop_two[nI] = c1_partial[nI] & c2_partial[nI] & not_c1_comp & not_c2_comp;
+         bitset_t not_hop_two_nI = hop_two[nI];
+         not_hop_two_nI.flip();
+         bitset_t not_child_hop = hop_one[ tree[nI].children[0] ] | hop_one[ tree[nI].children[1] ];
+         not_child_hop.flip();
+         hop_one[nI] = not_hop_two_nI & (not_c1_comp | not_c2_comp) & c1_partial[nI] & c2_partial[nI] & not_child_hop;
+
+         // don't count small segments in anything
+         hop_two[nI] &= not_small;
+         hop_one[nI] &= not_small;
+         unique[nI] &= not_small;
+         conserved[nI] &= not_small;
+         partial[nI] &= not_small;
+         all_unique[nI] = unique[nI] | all_unique[ tree[nI].children[0] ] | all_unique[ tree[nI].children[1] ];
+     }
+
+     // compute length statistics for various types of backbone
+     vector< double > conserved_len( tree.size(), 0 );
+     vector< double > unique_len( tree.size(), 0 );
+     vector< double > hop_one_len( tree.size(), 0 );
+     vector< double > hop_two_len( tree.size(), 0 );
+
+     for( size_t nI = 0; nI < tree.size(); nI++ )
+     {
+         // count up avg lengths
+         for( size_t bbI = 0; bbI < bb_list.size(); ++bbI )
+         {
+             if( conserved[nI].test(bbI) )
+                 conserved_len[nI] += avg_lens[bbI];
+             if( unique[nI].test(bbI) )
+                 unique_len[nI] += avg_lens[bbI];
+             if( hop_one[nI].test(bbI) )
+                 hop_one_len[nI] += avg_lens[bbI];
+             if( hop_two[nI].test(bbI) )
+                 hop_two_len[nI] += avg_lens[bbI];
+         }
+     }
+
+     // print a general summary of how clustered variable segments are...
+     // NOTE(review): this reads node 0 while the traversals start from tree.root --
+     // confirm that the root is always node 0
+     bitset_t uni_root = unique[0] & not_small;
+     anal_output << "There are " << uni_root.count() << " segments conserved among all genomes\n";
+     anal_output << "and " << not_small.count()-uni_root.count() << " variable segments fall in between these\n";
+
+
+     // prepare to analyze distribution of gene functions in backbone
+     vector< bb_seqentry_t > m_bbseq_list( bb_list.size() );
+     for( size_t bbI = 0; bbI < bb_list.size(); ++bbI )
+         m_bbseq_list[bbI] = bb_list[bbI].bb_seq;
+     multifun_map_t all_mf_count;
+     multifun_names_t all_mf_names;
+     size_t cds_count = getCDScount( iv_list.seq_table[gff_seq_index] );
+     bitset_t all_features( iv_list.seq_table[gff_seq_index]->getFeatureListLength() );
+     all_features.flip();
+     makeMultiFunCount( iv_list.seq_table[gff_seq_index], all_mf_count, all_mf_names, all_features );
+
+     // print summaries for each node
+     anal_output << "#\n";
+     anal_output << "# Alignment tree summary\n";
+     anal_output << "#\n";
+     for( size_t nI = 0; nI < tree.size(); nI++ )
+     {
+         anal_output << "Node " << nI << endl;
+         vector< node_id_t > leaves;
+         getLeaves( tree, nI, leaves );
+         anal_output << "Genomes at or below this node:\n";
+         for( size_t lI = 0; lI < leaves.size(); ++lI )
+             anal_output << '\t' << iv_list.seq_filename[ node_sequence_map[ leaves[ lI ] ] ] << endl;
+
+         anal_output << "\tNumber of unique segments at this node: " << unique[nI].count() << endl;
+         anal_output << "\tNumber of hop one (single deletion) segments at this node: " << hop_one[nI].count() << endl;
+         anal_output << "\tNumber of hop two (multiple deletion or lgt) segments at this node: " << hop_two[nI].count() << endl;
+
+         anal_output << "total avg. \"core-genome\" size at this node: " << conserved_len[nI] << endl;
+         anal_output << "total avg. unique length at this node: " << unique_len[nI] << endl;
+         anal_output << "total avg. hop one length at this node: " << hop_one_len[nI] << endl;
+         anal_output << "total avg. hop two length at this node: " << hop_two_len[nI] << endl;
+         anal_output << "total \"pan-genome\" size at this node: " << pan_genome_size[nI] << endl;
+
+         // if this node has the annotated genome below it then analyze the distribution of
+         // backbone content
+         vector< uint > leaf_seqids( leaves.size() );
+         for( size_t i = 0; i < leaves.size(); ++i )
+             leaf_seqids[i] = node_sequence_map[leaves[i]];
+         vector< uint >::iterator id_iter =std::find( leaf_seqids.begin(), leaf_seqids.end(), gff_seq_index );
+         if( id_iter != leaf_seqids.end() )
+         {
+             vector< vector< size_t > > intersecting;
+             featureIntersect( m_bbseq_list, gff_seq_index, intersecting, iv_list.seq_table[gff_seq_index] );
+             bitset_t features_hit;
+             getFeatureHits( intersecting, conserved[nI], features_hit );
+             multifun_map_t bb_mf_count;
+             multifun_names_t bb_mf_names;
+             double expect_freq = (double)features_hit.count() / (double)cds_count;
+             makeMultiFunCount( iv_list.seq_table[gff_seq_index], bb_mf_count, bb_mf_names, features_hit );
+             anal_output << "#\n#Conserved gene content distribution\n#\n";
+             anal_output << "Avg percent conserved " << setprecision(3) << expect_freq * 100 << endl;
+             mfAnalyze( anal_output, all_mf_count, bb_mf_count, all_mf_names, expect_freq );
+
+             // analyze hop_one distributions
+             intersecting.clear();
+             featureIntersect( m_bbseq_list, gff_seq_index, intersecting, iv_list.seq_table[gff_seq_index] );
+             features_hit.clear();
+             getFeatureHits( intersecting, hop_one[nI], features_hit );
+             bb_mf_count.clear();
+             bb_mf_names.clear();
+             expect_freq = (double)features_hit.count() / (double)cds_count;
+             makeMultiFunCount( iv_list.seq_table[gff_seq_index], bb_mf_count, bb_mf_names, features_hit );
+             anal_output << "#\n#Hop one gene content distribution\n#\n";
+             anal_output << "Avg percent in hop_one " << setprecision(3) << expect_freq * 100 << endl;
+             mfAnalyze( anal_output, all_mf_count, bb_mf_count, all_mf_names, expect_freq );
+
+
+             // analyze hop_two distributions
+             intersecting.clear();
+             featureIntersect( m_bbseq_list, gff_seq_index, intersecting, iv_list.seq_table[gff_seq_index] );
+             features_hit.clear();
+             getFeatureHits( intersecting, hop_two[nI], features_hit );
+             bb_mf_count.clear();
+             bb_mf_names.clear();
+             expect_freq = (double)features_hit.count() / (double)cds_count;
+             makeMultiFunCount( iv_list.seq_table[gff_seq_index], bb_mf_count, bb_mf_names, features_hit );
+             anal_output << "#\n#Hop two gene content distribution\n#\n";
+             anal_output << "Avg percent in hop_two " << setprecision(3) << expect_freq * 100 << endl;
+             mfAnalyze( anal_output, all_mf_count, bb_mf_count, all_mf_names, expect_freq );
+
+
+             // analyze distributions of segments unique to this clade
+             intersecting.clear();
+             featureIntersect( m_bbseq_list, gff_seq_index, intersecting, iv_list.seq_table[gff_seq_index] );
+             features_hit.clear();
+             getFeatureHits( intersecting, all_unique[nI], features_hit );
+             bb_mf_count.clear();
+             bb_mf_names.clear();
+             expect_freq = (double)features_hit.count() / (double)cds_count;
+             makeMultiFunCount( iv_list.seq_table[gff_seq_index], bb_mf_count, bb_mf_names, features_hit );
+             anal_output << "#\n#Unique to this clade gene content distribution\n#\n";
+             anal_output << "Avg percent in unique_to_clade " << setprecision(3) << expect_freq * 100 << endl;
+             mfAnalyze( anal_output, all_mf_count, bb_mf_count, all_mf_names, expect_freq );
+
+         }
+     }
+
+     // first analyze all variable segments
+     analyzeVariableSegments( anal_output, bb_list, avg_lens, gff_seq_index, iv_list.seq_table, "variable segments", false );
+
+     // then analyze "alternalogs": variable segments with at least two non-null alleles
+     analyzeVariableSegments( anal_output, bb_list, avg_lens, gff_seq_index, iv_list.seq_table, "alternalogs", true );
+     anal_output.flush();
+     return 0;
+ }
+
diff --git a/src/bbBreakOnGenes.cpp b/src/bbBreakOnGenes.cpp
new file mode 100644
index 0000000..04bf652
--- /dev/null
+++ b/src/bbBreakOnGenes.cpp
@@ -0,0 +1,358 @@
+#include "libMems/Backbone.h"
+#include "libMems/ProgressiveAligner.h"
+#include <sstream>
+using namespace mems;
+using namespace std;
+using namespace genome;
+
+
+// Fill bp_list with the sorted breakpoints (left end, and right end + 1) in sequence seq of every match defined there.
+template< typename MatchVector >
+void getBpList( MatchVector& mvect, uint seq, vector< gnSeqI >& bp_list )
+{
+	bp_list.clear();
+	for( size_t matchI = 0; matchI != mvect.size(); ++matchI ){
+		if( mvect[matchI]->LeftEnd(seq) != NO_MATCH ){	// matches undefined in seq contribute nothing
+			bp_list.push_back( mvect[matchI]->LeftEnd(seq) );
+			bp_list.push_back( mvect[matchI]->RightEnd(seq)+1 );
+		}
+	}
+	std::sort( bp_list.begin(), bp_list.end() );
+}
+
+// map[i] = index in mv_to of the element with the same sorted rank as mv_from[i]; mv_to must not be shorter than mv_from.
+template< typename MatchVector >
+void createMap( const MatchVector& mv_from, const MatchVector& mv_to, vector< size_t >& map )
+{
+	typedef pair< typename MatchVector::value_type, size_t > TaggedMatch;	// (element, original index)
+	vector< TaggedMatch > from_tagged(mv_from.size()), to_tagged(mv_to.size());
+	for( size_t fromI = 0; fromI != from_tagged.size(); ++fromI )
+		from_tagged[fromI] = TaggedMatch( mv_from[fromI], fromI );
+	for( size_t toI = 0; toI != to_tagged.size(); ++toI )
+		to_tagged[toI] = TaggedMatch( mv_to[toI], toI );
+	std::sort( from_tagged.begin(), from_tagged.end() );
+	std::sort( to_tagged.begin(), to_tagged.end() );
+	map.resize( from_tagged.size() );
+	for( size_t rankI = 0; rankI != from_tagged.size(); ++rankI )	// pair up elements of equal rank
+		map[from_tagged[rankI].second] = to_tagged[rankI].second;
+}
+
+
+// For each genome pair (seqI < seqJ): project iv_list onto the pair, run *detector to find HSSs, break them on the interval
+// and gene boundaries of both genomes, and append each HSS's (left, right) column range to hss_cols[seqI][seqJ][interval].
+void makeAllPairwiseGenomeHSSBreakOnGenes( IntervalList& iv_list, vector< CompactGappedAlignment<>* >& iv_ptrs, vector< CompactGappedAlignment<>* >& iv_orig_ptrs, pairwise_genome_hss_t& hss_cols, const HssDetector* detector, vector< vector< gnSeqI > >& gene_bounds )
+{
+	uint seq_count = iv_list.seq_table.size();
+	// make pairwise projections of intervals and find LCBs...
+	for( size_t seqI = 0; seqI < seq_count; ++seqI )
+	{
+		for( size_t seqJ = seqI+1; seqJ < seq_count; ++seqJ )
+		{
+			vector< uint > projection;
+			projection.push_back( seqI );
+			projection.push_back( seqJ );
+			vector< vector< MatchProjectionAdapter* > > LCB_list;
+			vector< LCB > projected_adjs;
+			projectIntervalList( iv_list, projection, LCB_list, projected_adjs );
+			// make intervals
+			IntervalList pair_ivs;
+			pair_ivs.seq_table.push_back( iv_list.seq_table[seqI] );
+			pair_ivs.seq_table.push_back( iv_list.seq_table[seqJ] );
+			pair_ivs.resize( LCB_list.size() );
+			for( size_t lcbI = 0; lcbI < LCB_list.size(); ++lcbI )
+				pair_ivs[lcbI].SetMatches( LCB_list[lcbI] );
+			LCB_list.clear();
+
+			vector< CompactGappedAlignment<>* > pair_cgas( pair_ivs.size() );
+			for( size_t lcbI = 0; lcbI < pair_ivs.size(); ++lcbI )
+			{
+				CompactGappedAlignment<> tmp_cga;
+				pair_cgas[lcbI] = tmp_cga.Copy();
+				new (pair_cgas[lcbI])CompactGappedAlignment<>( pair_ivs[lcbI] );
+			}
+
+			vector< CompactGappedAlignment<>* > hss_list;
+			// now find islands
+			hss_array_t hss_array;
+			(*detector)( pair_cgas, pair_ivs.seq_table, hss_array );
+			HssArrayToCga(pair_cgas, pair_ivs.seq_table, hss_array, hss_list);
+
+			for( size_t cgaI = 0; cgaI < pair_cgas.size(); ++cgaI )
+				pair_cgas[cgaI]->Free();
+			pair_cgas.clear();
+
+			// now split up on iv boundaries
+			vector< gnSeqI > bp_list;
+			getBpList( iv_ptrs, seqI, bp_list );
+			GenericMatchSeqManipulator< CompactGappedAlignment<> > gmsm(0);	// index 0 of a pairwise HSS is seqI
+			SingleStartComparator< CompactGappedAlignment<> > ssc(0);
+			std::sort(hss_list.begin(), hss_list.end(), ssc );
+			applyBreakpoints( bp_list, hss_list, gmsm );
+			// break on gene bounds in seqI
+			std::sort(hss_list.begin(), hss_list.end(), ssc );
+			applyBreakpoints( gene_bounds[seqI], hss_list, gmsm );
+			// and again on seqJ
+			getBpList( iv_ptrs, seqJ, bp_list );
+			GenericMatchSeqManipulator< CompactGappedAlignment<> > gmsm1(1);	// index 1 of a pairwise HSS is seqJ
+			SingleStartComparator< CompactGappedAlignment<> > ssc1(1);
+			std::sort(hss_list.begin(), hss_list.end(), ssc1 );
+			applyBreakpoints( bp_list, hss_list, gmsm1 );
+			// break on gene bounds in seqJ
+			std::sort(hss_list.begin(), hss_list.end(), ssc1 );
+			applyBreakpoints( gene_bounds[seqJ], hss_list, gmsm1 );
+
+			// now transform into interval-specific columns
+			std::sort(hss_list.begin(), hss_list.end(), ssc );
+
+			SingleStartComparator< CompactGappedAlignment<> > ivcomp(seqI);
+			std::sort( iv_ptrs.begin(), iv_ptrs.end(), ivcomp );
+			vector< size_t > iv_map;
+			createMap( iv_ptrs, iv_orig_ptrs, iv_map );	// iv_map[sorted position] -> position in iv_orig_ptrs
+			size_t ivI = 0;
+			while( ivI < iv_ptrs.size() && iv_ptrs[ivI]->LeftEnd(seqI) == NO_MATCH )	// iv_ptrs is n-way: test seqI, not 0
+				++ivI;
+			for( size_t hssI = 0; hssI < hss_list.size(); ++hssI )
+			{
+				if( hss_list[hssI]->LeftEnd(0) == NO_MATCH || hss_list[hssI]->Length(0) == 0 )
+					continue;
+				if( ivI == iv_ptrs.size() && !iv_ptrs.empty() )	// back() is only valid on a non-empty list
+				{
+					cerr << "huh?\n";
+					cerr << hss_list[hssI]->LeftEnd(0) << endl;
+					cerr << hss_list[hssI]->RightEnd(0) << endl;
+					cerr << iv_ptrs.back()->LeftEnd(seqI) << endl;
+					cerr << iv_ptrs.back()->RightEnd(seqI) << endl;
+				}
+				while( ivI < iv_ptrs.size() &&
+					(iv_ptrs[ivI]->LeftEnd(seqI) == NO_MATCH ||
+					hss_list[hssI]->LeftEnd(0) > iv_ptrs[ivI]->RightEnd(seqI) ) )
+					++ivI;
+				if( ivI == iv_ptrs.size() )
+				{
+					cerr << "hssI fit!!\n";
+					genome::breakHere();
+					break;	// no interval is left for this or any later HSS; never index iv_ptrs past its end
+				}
+				// check for containment in seqJ
+				if( iv_ptrs[ivI]->LeftEnd(seqJ) == NO_MATCH ||
+					iv_ptrs[ivI]->RightEnd(seqJ) < hss_list[hssI]->LeftEnd(1) ||
+					hss_list[hssI]->RightEnd(1) < iv_ptrs[ivI]->LeftEnd(seqJ) )
+					continue;	// this hss falls to an invalid range in seqJ
+				if( hss_list[hssI]->RightEnd(0) < iv_ptrs[ivI]->LeftEnd(seqI) )
+				{
+					cerr << "huh 2?\n";
+					cerr << hss_list[hssI]->LeftEnd(0) << endl;
+					cerr << hss_list[hssI]->RightEnd(0) << endl;
+					cerr << iv_ptrs[ivI]->LeftEnd(seqI) << endl;
+					cerr << iv_ptrs[ivI]->RightEnd(seqI) << endl;
+					// the loop header already advances hssI; an extra increment here would silently skip the next HSS
+					continue;
+				}
+
+				vector< pair< size_t, size_t > >& cur_hss_cols = hss_cols[seqI][seqJ][iv_map[ivI]];
+
+				gnSeqI left_col = iv_ptrs[ivI]->SeqPosToColumn( seqI, hss_list[hssI]->LeftEnd(0) );
+				gnSeqI right_col = iv_ptrs[ivI]->SeqPosToColumn( seqI, hss_list[hssI]->RightEnd(0) );
+				if(left_col > right_col && iv_ptrs[ivI]->Orientation(seqI) == AbstractMatch::reverse )
+				{
+					swap(left_col, right_col);	// must have been a revcomp seq
+				}
+				else if(left_col > right_col)
+				{
+					cerr << "bad cols\n";
+					cerr << hss_list[hssI]->LeftEnd(0) << endl;
+					cerr << hss_list[hssI]->RightEnd(0) << endl;
+					cerr << iv_ptrs[ivI]->LeftEnd(seqI) << endl;
+					cerr << iv_ptrs[ivI]->RightEnd(seqI) << endl;
+					genome::breakHere();
+				}
+
+				if( left_col > 2000000000 || right_col > 2000000000 )
+				{
+					cerr << "huh 2?\n";
+					cerr << hss_list[hssI]->LeftEnd(0) << endl;
+					cerr << hss_list[hssI]->RightEnd(0) << endl;
+					cerr << iv_ptrs[ivI]->LeftEnd(seqI) << endl;
+					cerr << iv_ptrs[ivI]->RightEnd(seqI) << endl;
+					genome::breakHere();
+				}
+				cur_hss_cols.push_back( make_pair( left_col, right_col ) );
+			}
+			for( size_t hssI = 0; hssI < hss_list.size(); ++hssI )
+				hss_list[hssI]->Free();
+		}
+	}
+}
+
+
+// Adapter that applies the coordinate and cropping operations of an Interval to one fixed sequence index.
+class IntervalSeqManipulator
+{
+	uint m_seq;	// sequence index every member function operates on
+public:
+	IntervalSeqManipulator( uint seq ) { m_seq = seq; }
+	gnSeqI LeftEnd(Interval& iv) const { return iv.LeftEnd(m_seq); }
+	gnSeqI Length(Interval& iv) const { return iv.Length(m_seq); }
+	void CropLeft(Interval& iv, gnSeqI amount ) const { iv.CropLeft(amount, m_seq); }
+	void CropRight(Interval& iv, gnSeqI amount ) const { iv.CropRight(amount, m_seq); }
+	template< typename ContainerType >
+	void AddCopy(ContainerType& dest, Interval& iv) const { dest.push_back( iv ); }	// appends a copy of iv
+};
+
+
+// Detect backbone in iv_list with *detector, breaking segments on gene_bounds; the result goes to bb_list, and
+// iv_orig_ptrs receives the CompactGappedAlignment copies of the intervals, in iv_list order.
+void detectBackboneBreakOnGenes( IntervalList& iv_list, backbone_list_t& bb_list, const HssDetector* detector, vector< CompactGappedAlignment<>* >& iv_orig_ptrs, vector< vector< gnSeqI > >& gene_bounds )
+{
+	const uint genome_count = iv_list.seq_table.size();
+	// indexed by seqI, seqJ, ivI, hssI (left col, right col)
+	pairwise_genome_hss_t hss_cols(boost::extents[genome_count][genome_count][iv_list.size()]);
+
+	// CompactGappedAlignment is needed for its SeqPosToColumn, so convert every interval
+	vector< CompactGappedAlignment<>* > cga_ptrs;
+	for( size_t ivI = 0; ivI != iv_list.size(); ++ivI )
+	{
+		CompactGappedAlignment<> prototype;
+		cga_ptrs.push_back( prototype.Copy() );
+		new (cga_ptrs.back())CompactGappedAlignment<>( iv_list[ivI] );	// re-construct in place from the interval
+	}
+	// cga_ptrs is the working copy handed to the callee; iv_orig_ptrs keeps the iv_list order
+	iv_orig_ptrs = cga_ptrs;
+	makeAllPairwiseGenomeHSSBreakOnGenes( iv_list, cga_ptrs, iv_orig_ptrs, hss_cols, detector, gene_bounds );
+	// merge overlapping pairwise homology predictions into n-way predictions
+	mergePairwiseHomologyPredictions( iv_orig_ptrs, hss_cols, bb_list );
+}
+
+// bbBreakOnGenes: recompute the backbone of an XMFA alignment with backbone segments broken on gene (CDS) boundaries.
+// Usage: bbBreakOnGenes <xmfa file> <min bb gap size> <bb output> [one .ptt file per genome, instead of annotations]
+int main( int argc, char* argv[] )
+{
+#if WIN32
+	SetPriorityClass(GetCurrentProcess(), BELOW_NORMAL_PRIORITY_CLASS);
+#endif
+	if( argc < 4 )
+	{
+		cerr << "bbBreakOnGenes <xmfa file> <min bb gap size> <bb output>\n";
+		return -1;
+	}
+	string xmfa_fname( argv[1] );
+	int min_bb_gap = atoi( argv[2] );
+	string output_fname( argv[3] );
+	ifstream xmfa_input( xmfa_fname.c_str() );
+	if( !xmfa_input.is_open() ){
+		cerr << "Error opening \"" << xmfa_fname << "\"" << endl;
+		return -4;
+	}
+	ofstream bb_output( output_fname.c_str() );
+	if( !bb_output.is_open() ){
+		cerr << "Error opening \"" << output_fname << "\" for writing" << endl;
+		return -6;
+	}
+	// read the alignment
+	IntervalList iv_list;
+	iv_list.ReadStandardAlignment( xmfa_input );
+	LoadSequences(iv_list, &cout);
+	vector< vector< gnSeqI > > gene_bounds( iv_list.seq_table.size() );
+	if( static_cast< size_t >( argc - 4 ) == iv_list.seq_filename.size() )	// argc >= 4 was checked above
+	{
+		cerr << "Reading gene coordinates from .ptt files\n";
+		// read ptt files instead
+		for( size_t aI = 0; aI < iv_list.seq_filename.size(); aI++ )
+		{
+			ifstream ptt_in( argv[aI+4] );
+			if( !ptt_in.is_open() ){
+				cerr << "Error opening \"" << argv[aI+4] << "\"" << endl;
+				return -4;
+			}
+			string bub;
+			getline( ptt_in, bub );	// the first three lines are skipped unparsed
+			getline( ptt_in, bub );
+			getline( ptt_in, bub );
+			while( getline( ptt_in, bub ) )
+			{
+				stringstream line_str(bub);	// parsed as "<left>..<right>" followed by the rest of the line
+				string buf;
+				getline( line_str, buf, '.' );
+				int64 lend = atoi(buf.c_str());
+				getline( line_str, buf, '.' );
+				getline( line_str, buf );
+				int64 rend = atoi(buf.c_str());
+				if( lend <= 0 || rend <= 0 )
+					continue;	// unparseable line: atoi gave 0, and lend-1 would wrap around in the unsigned gnSeqI
+				gene_bounds[aI].push_back( lend -1);
+				gene_bounds[aI].push_back( lend );
+				gene_bounds[aI].push_back( rend );
+				gene_bounds[aI].push_back( rend+1 );
+			}
+			std::sort( gene_bounds[aI].begin(), gene_bounds[aI].end() );	// keep sorted, as the annotation branch does
+		}
+	}else{
+		// get gene boundary coordinates, break bb segs on genes...
+		for( size_t genomeI = 0; genomeI < iv_list.seq_table.size(); genomeI++ )
+		{
+			for( size_t featureI = 0; featureI < iv_list.seq_table[genomeI]->getFeatureListLength(); ++featureI )
+			{
+				gnBaseFeature* feat = iv_list.seq_table[genomeI]->getFeature( featureI );
+				string feat_name = feat->GetName();
+				if( feat_name != "CDS" )
+					continue;	// don't deal with other feature types (source, misc_RNA, etc)
+				gnLocation loc = feat->GetLocation(0);
+				if( loc.GetFirst() > loc.GetLast() || loc.GetFirst() == 0 || loc.GetLast() == 0 )
+					continue;	// a problem parsing annotation?
+				gene_bounds[genomeI].push_back( loc.GetFirst() );
+				gene_bounds[genomeI].push_back( loc.GetLast() +1 );
+			}
+			std::sort( gene_bounds[genomeI].begin(), gene_bounds[genomeI].end() );
+			cerr << "Found " << gene_bounds[genomeI].size() / 2 << " genes for " << iv_list.seq_filename[genomeI] << endl;
+		}
+	}
+
+	// detect big gaps
+	backbone_list_t bb_list;
+	vector< CompactGappedAlignment<>* > iv_orig_ptrs;
+	BigGapsDetector bgd( min_bb_gap );
+	detectBackboneBreakOnGenes( iv_list, bb_list, &bgd, iv_orig_ptrs, gene_bounds );
+	writeBackboneSeqCoordinates( bb_list, iv_list, bb_output );
+	std::vector< bb_seqentry_t > bb_seq_list;
+	bb_output.close();
+	std::ifstream bbseq_input( output_fname.c_str() );	// re-read the file that was just written
+	readBackboneSeqFile( bbseq_input, bb_seq_list );
+	// testing: check whether any gene boundaries are violated
+	if( !gene_bounds.empty() ){	// gene_bounds[0] only exists when the alignment has at least one sequence
+		gene_bounds[0].push_back(31337);	// test the test:
+		gene_bounds[0].push_back(31333);	// insert some bogus gene bounds to make sure
+		gene_bounds[0].push_back(31341);	// they get found and reported
+		gene_bounds[0].push_back(31345);	// NOTE(review): these only affect the diagnostics printed below
+	}
+	for( uint seqI = 0; seqI < iv_list.seq_table.size(); seqI++ )
+	{
+		cerr << "Checking seq " << seqI << " for errors\n";
+		std::sort( gene_bounds[seqI].begin(), gene_bounds[seqI].end() );
+		BbSeqEntrySorter bs(seqI);
+		std::sort( bb_seq_list.begin(), bb_seq_list.end(), bs );
+		size_t gI = 0;
+		size_t bI = 0;
+		cerr << gene_bounds[seqI].size() << " gene boundaries and " << bb_seq_list.size() << " bb segs\n";
+		for( ; gI < gene_bounds[seqI].size() && bI < bb_seq_list.size(); gI++ )
+		{
+			cout << "checking " << bb_seq_list[bI][seqI].first << ", " <<bb_seq_list[bI][seqI].second << endl;
+			while( bI < bb_seq_list.size() && gene_bounds[seqI][gI] > abs(bb_seq_list[bI][seqI].second) )
+				bI++;
+			if( bI == bb_seq_list.size() )
+				break;
+			if(abs(bb_seq_list[bI][seqI].first) + 1 < gene_bounds[seqI][gI] && gene_bounds[seqI][gI] < abs(bb_seq_list[bI][seqI].second) - 1)
+			{
+				cerr << "segment " <<bb_seq_list[bI][seqI].first << ", " <<bb_seq_list[bI][seqI].second << " violates gene boundary " << gene_bounds[seqI][gI] << " in seq " << seqI << endl;
+			}else
+				cout << "segment " <<bb_seq_list[bI][seqI].first << ", " <<bb_seq_list[bI][seqI].second << " is okay for " << gene_bounds[seqI][gI] << " in seq " << seqI << endl;
+		}
+	}
+
+//	mergeAdjacentSegments( bb_seq_list );
+//	addUniqueSegments( bb_seq_list );
+	bbseq_input.close();
+	bb_output.open(output_fname.c_str());	// rewrite the output from the re-read segment list
+	writeBackboneSeqFile( bb_output, bb_seq_list );
+	return 0;
+}
+
diff --git a/src/bbFilter.cpp b/src/bbFilter.cpp
new file mode 100644
index 0000000..9d62587
--- /dev/null
+++ b/src/bbFilter.cpp
@@ -0,0 +1,292 @@
+#include "libMems/Backbone.h"
+using namespace mems;
+using namespace std;
+using namespace genome;
+
+typedef pair< bb_seqentry_t, size_t > labeled_bb_t;
+
+// Orders labeled backbone segments by the absolute value of their first coordinate in one sequence.
+class BbSorter
+{
+public:
+	size_t m_seq;	// index of the sequence whose coordinates are compared
+	BbSorter( size_t seqI ) : m_seq( seqI ) {}
+	bool operator()( const labeled_bb_t& a, const labeled_bb_t& b ){
+		return genome::absolut(b.first[m_seq].first) > genome::absolut(a.first[m_seq].first);
+	}
+};
+
+
+
+// Predicate: true when a backbone entry's mean length, over the sequences where it is present, is below 20.
+class ShorterThan {
+public:
+	bool operator()( const bb_seqentry_t& a )
+	{
+		size_t present_count = 0;	// sequences whose first coordinate is nonzero
+		size_t total_len = 0;
+		for( size_t seqI = 0; seqI != a.size(); ++seqI )
+		{
+			if( a[seqI].first == 0 )
+				continue;
+			total_len += genome::absolut(a[seqI].second - a[seqI].first) + 1;
+			++present_count;
+		}
+		return total_len == 0 || (total_len / present_count) < 20;	// the short-circuit avoids dividing by zero
+	}
+};
+
+// bbFilter: turn a backbone file into binary presence/absence features per genome, written in beast (XML) or genoplast
+// format to <output file>; the coordinates of the retained segments are written to <output file>.locs.
+int main( int argc, char* argv[] )
+{
+#if WIN32
+	SetPriorityClass(GetCurrentProcess(), BELOW_NORMAL_PRIORITY_CLASS);
+#endif
+	if( argc < 5 )	// four positional arguments are required: argv[4] (the format) is read below
+	{
+		cerr << "bbFilter <backbone file> <independent dist> <output file> <beast|gp> [<seq1> <seq2>...<seqN>]\n";
+		cerr << "seq index starts at 0.\n";
+		cerr << "\nExample:\n";
+		cerr << "bbFilter my_alignment.backbone 50 my_feats.bin gp\n";
+		cerr << "the above command extracts binary features from \"my_alignment.backbone\" which are separated by a minimum of 50nt sequence conserved among all taxa in the alignment. The output is written to my_feats.bin in genoplast format\n";
+		cerr << "\n\nExample 2:\nbbFilter aln.backbone 100 feats.xml beast 0 1 2 5 6\n";
+		cerr << "the above command extracts binary features from \"aln.backbone\" which are separated by a minimum of 100nt sequence conserved among genomes 0,1,2,5, and 6 from the alignment. The output is written to feats.xml in beast format\n";
+		return -1;
+	}
+	string bbseq_fname( argv[1] );
+	int indie_dist = atoi( argv[2] );
+	string output_fname( argv[3] );
+	string target_format( argv[4] );
+	bool allow_alternalogs = true;
+	bool check_independence = false;	// the independence filter below is currently disabled
+	ifstream bbseq_input( bbseq_fname.c_str() );
+	if( !bbseq_input.is_open() ){
+		cerr << "Error opening \"" << bbseq_fname << "\"" << endl;
+		return -4;
+	}
+	ofstream anal_output( output_fname.c_str() );
+	if( !anal_output.is_open() ){
+		cerr << "Error opening \"" << output_fname << "\" for writing" << endl;
+		return -6;
+	}
+	// read the backbone column file
+	vector< bb_seqentry_t > bb_seq_list;
+	readBackboneSeqFile( bbseq_input, bb_seq_list );
+
+	// read the list of seqs of interest
+	vector< int > seqs;
+	for( int i = 5; i < argc; i++ )
+		seqs.push_back(atoi(argv[i]));
+	for( size_t sI = 0; sI < seqs.size() && bb_seq_list.size() > 0; sI++ )	// every index is used to subscript a backbone row
+		if( seqs[sI] < 0 || static_cast< size_t >( seqs[sI] ) >= bb_seq_list[0].size() ){
+			cerr << "Error: sequence index " << argv[sI+5] << " is out of range\n";
+			return -1;
+		}
+
+	// assume all seqs are of interest
+	if( seqs.size() == 0 && bb_seq_list.size() > 0 )
+	{
+		for( int i = 0; i < bb_seq_list[0].size(); i++ )
+			seqs.push_back(i);
+	}
+	// add any genome-specific segments
+	addUniqueSegments( bb_seq_list );
+
+	// remove short segments
+	ShorterThan st;
+	vector< bb_seqentry_t >::iterator new_end = std::remove_if( bb_seq_list.begin(), bb_seq_list.end(), st );
+	cout << "Removing " << bb_seq_list.end() - new_end << " features shorter than 20 nt\n";
+	bb_seq_list.erase( new_end, bb_seq_list.end() );
+
+	// now assign tracking IDs to the backbone segments
+	vector< labeled_bb_t > bb_segs;
+	for( size_t i = 0; i < bb_seq_list.size(); i++ )
+	{
+		bb_segs.push_back( make_pair( bb_seq_list[i], i ) );
+	}
+
+	// create a sorted list for each genome and a map to the segment ID
+	vector< vector< labeled_bb_t > > sorted_segs( seqs.size(), bb_segs );
+	vector< vector< size_t > > seg_id_maps( seqs.size(), vector< size_t >( bb_segs.size() ) );
+	for( size_t seqI = 0; seqI < seqs.size(); seqI++ )
+	{
+		BbSorter bbs(seqs[seqI]);
+		std::sort( sorted_segs[seqI].begin(), sorted_segs[seqI].end(), bbs );
+		for( size_t bbI = 0; bbI < sorted_segs[seqI].size(); bbI++ )
+			seg_id_maps[ seqI ][ sorted_segs[seqI][bbI].second ] = bbI;
+	}
+
+	bitset_t good_bb( bb_seq_list.size() );
+	bitset_t nway( bb_seq_list.size() );
+	bitset_t nunya( bb_seq_list.size() );
+
+	// mark anything that has all of the seqs or none of seqs as not useful
+	for( size_t bbI = 0; bbI < bb_seq_list.size(); bbI++ )
+	{
+		bool all = true;
+		bool none = true;
+		for( size_t sI = 0; sI < seqs.size(); sI++ )
+		{
+			if( bb_seq_list[bbI][seqs[sI]].first == 0 )
+				all = false;
+			else
+				none = false;
+		}
+		if(all)
+			nway.set(bbI);
+		if(none)
+			nunya.set(bbI);
+	}
+	good_bb = nway | nunya;
+	good_bb.flip();
+
+	// now mark segs that are too close to each other to be considered independent
+	for( size_t sI = 0; check_independence && sI < seqs.size(); sI++ )
+	{
+		BbSorter bbs(seqs[sI]);
+		std::sort( bb_segs.begin(), bb_segs.end(), bbs );
+		for( size_t bbI = 1; bbI + 1 < bb_segs.size(); bbI++ )	// "size()-1" would wrap around on an empty list
+		{
+			if( nway[bb_segs[bbI].second] )
+				continue;
+			if( bb_segs[bbI].first[seqs[sI]].first == 0 )
+				continue;
+			// ensure that it has n-way on both sides and that they are at least "indie_dist" long
+			if( nway.test(bb_segs[bbI-1].second) &&
+				nway.test(bb_segs[bbI+1].second) &&
+				absolut(bb_segs[bbI-1].first[seqs[sI]].second - bb_segs[bbI-1].first[seqs[sI]].first) >= indie_dist &&
+				absolut(bb_segs[bbI+1].first[seqs[sI]].second - bb_segs[bbI+1].first[seqs[sI]].first) >= indie_dist )
+			{
+				if( !allow_alternalogs ){
+					// ensure that there is no other feature in the other genomes
+					for( size_t k = 0; k < seqs.size(); k++ )
+					{
+						if( k == sI )
+							continue;
+						size_t oid = seg_id_maps[k][ bb_segs[bbI-1].second ];
+						int parity = ((bb_segs[bbI-1].first[seqs[sI]].first > 0 && bb_segs[bbI-1].first[seqs[k]].first > 0) ||
+							(bb_segs[bbI-1].first[seqs[sI]].first < 0 && bb_segs[bbI-1].first[seqs[k]].first < 0)) ? 1 : -1;
+						size_t prev_in_sI = bb_segs[bbI-1].second;
+						size_t cur_in_sI = bb_segs[bbI].second;
+						size_t next_in_sI = bb_segs[bbI+1].second;
+						size_t prev_in_k = sorted_segs[k][oid].second;
+						size_t cur_in_k = sorted_segs[k][oid+parity].second;	// NOTE(review): oid+parity is not range-checked
+						size_t next_in_k = sorted_segs[k][oid+parity*2].second;
+						if( (cur_in_sI == cur_in_k && next_in_sI == next_in_k) ||
+							(next_in_sI == cur_in_k))
+							;	// it's good because no other segs intervene
+						else
+						{
+							good_bb.set( bb_segs[bbI].second, false );
+							break;	// it's an alternalog or overlapping, no sense in checking other seqs
+						}
+					}
+				}
+			}else
+				good_bb.set(bb_segs[bbI].second, false);
+		}
+	}
+
+	// create site patterns, then write out the good ones
+	bitset_t empty( bb_seq_list.size() );
+	vector< bitset_t > spa_seqs( seqs.size(), empty );
+	for( size_t bbI = 0; bbI < bb_seq_list.size(); bbI++ )
+		for( size_t seqI = 0; seqI < seqs.size(); seqI++ )
+			spa_seqs[seqI].set(bbI, bb_seq_list[bbI][seqs[seqI]].first != 0);
+
+	vector< string > binseqs( seqs.size(), string( good_bb.count(), '0' ) );
+	for( size_t seqI = 0; seqI < seqs.size(); seqI++ )
+	{
+		size_t goodI = 0;
+		for( size_t bbI = 0; bbI < good_bb.size(); bbI++ )
+			if(good_bb.test(bbI))
+			{
+				if(spa_seqs[seqI].test(bbI))
+					binseqs[seqI][goodI] = '1';
+				goodI++;
+			}
+	}
+	map< string, int > sitepattern_count;
+	// count how many segments of each site pattern
+	for( size_t bbI = 0; bbI < good_bb.size(); bbI++ )
+	{
+		if(!good_bb.test(bbI)) continue;
+		size_t length=0;
+		size_t sc=0;
+		string sitepat( seqs.size(), '0' );
+		for( size_t seqI = 0; seqI < seqs.size(); seqI++ )
+		{
+			sitepat[seqI] = spa_seqs[seqI][bbI] ? '1' : '0';
+			if(spa_seqs[seqI][bbI]){
+				length += genome::absolut(bb_seq_list[bbI][seqs[seqI]].second - bb_seq_list[bbI][seqs[seqI]].first);	// column seqs[seqI], matching spa_seqs
+				sc++;
+			}
+		}
+		length /= sc;	// sc > 0: segments present in none of the seqs were cleared from good_bb above
+		map< string, int >::iterator iter = sitepattern_count.find(sitepat);
+		if(iter == sitepattern_count.end())
+			sitepattern_count.insert( make_pair( sitepat, length ) );
+		else
+			iter->second+= length;
+	}
+
+	// write out the seqs!!
+	if( target_format == "beast" )
+	{
+		anal_output << "\t<taxa id=\"taxa\">\n";
+		for( size_t seqI = 0; seqI < seqs.size(); seqI++ )
+		{
+			anal_output << "\t\t<taxon id=\"seq" << seqI << "\"/>\n";
+		}
+		anal_output << "\t</taxa>\n";
+		anal_output << "\t<alignment id=\"alignment\" dataType=\"binary\">\n";
+
+		for( size_t seqI = 0; seqI < seqs.size(); seqI++ )
+		{
+			anal_output << "\t\t<sequence>\n";
+			anal_output << "\t\t\t<taxon idref=\"seq" << seqI << "\"/>\n";
+			anal_output << "\t\t\t" << binseqs[seqI] << endl;
+			anal_output << "\t\t</sequence>\n";
+//			anal_output << "> seq" << seqI << endl;
+//			for( size_t i = 0; i < binseqs[seqI].size(); i+=80 )
+//				anal_output << binseqs[seqI].substr(i, 80) << endl;
+		}
+		anal_output << "\t</alignment>\n";
+	}else{
+		// write out a header line with the number of times each site pattern is used.
+		map<string,int>::iterator f = sitepattern_count.begin();
+		for(; f!= sitepattern_count.end(); f++){
+			if(f!=sitepattern_count.begin()) anal_output << ' ';
+			anal_output << (f->second / 20);
+		}
+		anal_output << endl;
+		// write genoplast format
+		for( size_t seqI = 0; seqI < seqs.size(); seqI++ )
+		{
+			f = sitepattern_count.begin();
+			for(; f!= sitepattern_count.end(); f++){
+				if(f!=sitepattern_count.begin()) anal_output << ' ';
+				anal_output << f->first[seqI];
+			}
+			anal_output << endl;
+		}
+	}
+	anal_output.close();
+	string loc_fname = output_fname + ".locs";
+	ofstream location_output( loc_fname.c_str() );
+	for( size_t bbI = 0; bbI < good_bb.size(); bbI++ )
+	{
+		if( good_bb.test(bbI) )
+		{
+			for( size_t seqI = 0; seqI < seqs.size(); seqI++ )
+			{
+				if( seqI > 0 )
+					location_output << '\t';
+				location_output << bb_seq_list[bbI][seqs[seqI]].first << '\t' << bb_seq_list[bbI][seqs[seqI]].second;	// selected genome's column
+			}
+			location_output << std::endl;
+		}
+	}
+}
+
diff --git a/src/calculateBackboneCoverage.cpp b/src/calculateBackboneCoverage.cpp
new file mode 100644
index 0000000..8a57db4
--- /dev/null
+++ b/src/calculateBackboneCoverage.cpp
@@ -0,0 +1,138 @@
+/*******************************************************************************
+ * $Id: calculateBackboneCoverage.cpp,v 1.5 2004/02/28 00:01:31 darling Exp $
+ * This file is copyright 2002-2004 Aaron Darling. All rights reserved.
+ * Please see the file called COPYING for licensing, copying, and modification
+ * rights. Redistribution of this file, in whole or in part is prohibited
+ * without express permission.
+ ******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/IntervalList.h"
+#include "libMems/Islands.h"
+#include "libMems/DistanceMatrix.h"
+
+using namespace std;
+using namespace genome;
+using namespace mems;
+
+void print_usage( const char* pname ){	// writes the command-line synopsis for program pname to cerr
+	cerr << "Usage: " << pname;
+	cerr << " <source alignment> <min bb sequence length> <max bb gap size> <sequence 1>...<sequence N>\n";
+}
+#define NELEMS(a) ( sizeof( a ) / sizeof( *a ) )
+
+// calculateBackboneCoverage: load the sequences and an alignment, extract the backbone, then print per-genome backbone totals and the identity matrix.
+int main( int argc, const char* argv[] ){
+// debugging command line
+#if defined(__MWERKS__) && defined(__GNDEBUG__)
+	const char* m_argv[] = {
+		"calculateBackboneCoverage",
+		"9coli.dat",
+		"50",
+		"50",
+		"\\\\Ramesses\\Workspace\\aaron\\diarrhea\\ecolim52.fas",
+		"\\\\Ramesses\\Workspace\\aaron\\diarrhea\\EDL933.fas",
+		"\\\\Ramesses\\Workspace\\aaron\\diarrhea\\o157sakai.fas",
+		"\\\\Ramesses\\Workspace\\aaron\\diarrhea\\CFTv17.fas",
+		"\\\\Ramesses\\Workspace\\aaron\\diarrhea\\Sflex57_v3.gbk",
+		"\\\\Ramesses\\Workspace\\aaron\\diarrhea\\shigella_flexnerii_2a.fas",
+		"\\\\Ramesses\\Workspace\\aaron\\diarrhea\\typhimurium.fas",
+		"\\\\Ramesses\\Workspace\\aaron\\diarrhea\\styphi.fas",
+		"\\\\Ramesses\\Workspace\\aaron\\diarrhea\\STY2.fas",
+	};
+	int m_argc = NELEMS( m_argv );
+	argv = m_argv;
+	argc = m_argc;
+#endif
+
+try{
+	if( argc <= 0 ){
+		print_usage( "calculateBackboneCoverage" );	// argv[0] is unavailable, so name the program literally
+		return -1;
+	}
+	if( argc < 6 ){
+		print_usage( argv[0] );
+		return -1;
+	}
+
+	string alignment_fname = argv[1];
+	int64 min_bb_length = atol( argv[2] );
+	int64 max_gap_length = atol( argv[3] );
+	vector< string > sequence_fname;
+	vector< gnSequence* > source_seqs;
+	for( int argI = 4; argI < argc; argI++ ){	// same signedness as argc
+		sequence_fname.push_back( argv[ argI ] );
+		cout << "Loading " << sequence_fname[ argI - 4 ];
+		try{
+			source_seqs.push_back( new gnSequence() );
+			source_seqs[ argI - 4 ]->LoadSource( sequence_fname[ argI - 4 ] );
+		}catch( gnException& gne ){
+			cerr << gne << endl;
+			return -1;
+		}
+		cout << " " << source_seqs[ argI - 4 ]->length() << " bp\n";
+	}
+
+	ifstream alignment_in;
+	alignment_in.open( alignment_fname.c_str() );
+	if( !alignment_in.is_open() ){
+		cerr << "Error opening " << alignment_fname << endl;
+		return -1;
+	}
+
+	cout << "Loading alignment...\n";
+	IntervalList aligned_ivs;
+	aligned_ivs.ReadStandardAlignment( alignment_in );
+
+	// add the sequence data to the interval list
+	aligned_ivs.seq_table = source_seqs;
+	uint seq_count = source_seqs.size();	// at least 2, because argc >= 6 was checked above
+	cout << "Extracting backbone..." << endl;
+	vector< GappedAlignment > backbone_data;
+	simpleFindBackbone( aligned_ivs, min_bb_length, max_gap_length, backbone_data );
+
+	IntervalList backbone_ivs;
+	backbone_ivs.seq_table = aligned_ivs.seq_table;
+
+	cout << "There are " << backbone_data.size() << " backbone segments\n";
+	// count up the total length of backbone in each genome
+	cout << "Averaging backbone lengths..." << endl;
+	vector< gnSeqI > total_bb( seq_count, 0 );
+	NumericMatrix< double > overall_identity;
+	for( uint bbI = 0; bbI < backbone_data.size(); bbI++ ){
+		vector<AbstractMatch*> tmp(1, &backbone_data[ bbI ]);
+		backbone_ivs.push_back( Interval(tmp.begin(), tmp.end()) );
+		for( uint seqI = 0; seqI < seq_count; seqI++ ){
+			total_bb[ seqI ] += backbone_data[ bbI ].Length( seqI );
+		}
+	}
+
+	IdentityMatrix( backbone_ivs, overall_identity );
+	gnSeqI avg_bb = 0;
+	for( uint seqI = 0; seqI < aligned_ivs.seq_table.size(); seqI++ ){
+		cout << "seq " << seqI << " backbone: " << total_bb[ seqI ] << endl;
+		avg_bb += total_bb[ seqI ];
+	}
+	avg_bb /= aligned_ivs.seq_table.size();
+	cout << "Average: " << avg_bb << endl;
+
+	// output the identity matrix
+	cout << "Identity matrix: " << endl;
+	overall_identity.print( cout );
+	cout << endl;
+
+}catch( gnException& gne ){
+	cerr << gne << endl;
+	return -1;	// report the failure through the exit status instead of returning 0
+}catch( exception& e ){
+	cerr << e.what() << endl;
+	return -1;
+}catch(...){
+	cerr << "Unknown exception" << endl;	// previously swallowed silently
+	return -1;
+}
+	return 0;
+}
diff --git a/src/calculateBackboneCoverage2.cpp b/src/calculateBackboneCoverage2.cpp
new file mode 100644
index 0000000..5a898ec
--- /dev/null
+++ b/src/calculateBackboneCoverage2.cpp
@@ -0,0 +1,132 @@
+/*******************************************************************************
+ * $Id: calculateBackboneCoverage.cpp,v 1.5 2004/02/28 00:01:31 darling Exp $
+ * This file is copyright 2002-2004 Aaron Darling. All rights reserved.
+ * Please see the file called COPYING for licensing, copying, and modification
+ * rights. Redistribution of this file, in whole or in part is prohibited
+ * without express permission.
+ ******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/IntervalList.h"
+#include "libMems/Islands.h"
+#include "libMems/DistanceMatrix.h"
+
+using namespace std;
+using namespace genome;
+using namespace mems;
+
+void print_usage( const char* pname ){	// writes the command-line synopsis for program pname to cerr
+	cerr << "Usage: " << pname;
+	cerr << " <XMFA alignment> <min bb sequence length> <max bb gap size> \n";
+}
+
+// calculateBackboneCoverage2: print coverage and LCB length statistics of an XMFA alignment, then its backbone totals and identity matrix.
+int main( int argc, const char* argv[] ){
+try{
+	if( argc <= 0 ){
+		print_usage( "calculateBackboneCoverage2" );	// argv[0] is unavailable, so name the program literally
+		return -1;
+	}
+	if( argc < 4 ){
+		print_usage( argv[0] );
+		return -1;
+	}
+
+	string alignment_fname = argv[1];
+	int64 min_bb_length = atol( argv[2] );
+	int64 max_gap_length = atol( argv[3] );
+	vector< gnSequence* > source_seqs;
+	ifstream alignment_in;
+	alignment_in.open( alignment_fname.c_str() );
+	if( !alignment_in.is_open() ){
+		cerr << "Error opening " << alignment_fname << endl;
+		return -1;
+	}
+
+	cout << "Loading alignment...\n";
+	IntervalList aligned_ivs;
+	aligned_ivs.ReadStandardAlignment( alignment_in );
+	LoadSequences(aligned_ivs, &cout);
+	source_seqs = aligned_ivs.seq_table;
+	// calculate total lengths covered
+	uint seq_count = source_seqs.size();
+	if( seq_count == 0 ){	// the integer average computed further down would divide by zero
+		cerr << "Error: no sequences found in " << alignment_fname << endl;
+		return -1;
+	}
+	double avg_coverage = 0;
+	double total_lcb_len = 0;
+	for( uint seqI = 0; seqI < seq_count; ++seqI )
+	{
+		double cur_size = 0;
+		for( uint ivI = 0; ivI < aligned_ivs.size(); ++ivI )
+			cur_size += aligned_ivs[ivI].Length(seqI);
+		total_lcb_len += cur_size;
+		cout << "Genome " << seqI << " coverage is: " << cur_size << " / " << source_seqs[seqI]->length() << " = ";
+		cur_size /= (double)source_seqs[seqI]->length();
+		cout << cur_size << endl;
+		avg_coverage += cur_size;
+	}
+	avg_coverage /= (double)seq_count;
+	cout << "Average coverage = " << avg_coverage << endl;
+	double avg_lcb_len = total_lcb_len / (double)(seq_count * aligned_ivs.size());
+	double lcb_len_variance = 0;
+	for( uint seqI = 0; seqI < seq_count; ++seqI )
+	{
+		for( uint ivI = 0; ivI < aligned_ivs.size(); ++ivI )
+			lcb_len_variance += (aligned_ivs[ivI].Length(seqI) - avg_lcb_len) * (aligned_ivs[ivI].Length(seqI) - avg_lcb_len);
+	}
+	lcb_len_variance /= (double)((seq_count*aligned_ivs.size()) - 1.0);	// sample variance (n - 1 divisor)
+	cout << "Avg lcb len: " << avg_lcb_len << endl;
+	cout << "variance: " << lcb_len_variance << endl;
+	cout << "std dev: " << pow( lcb_len_variance, 0.5 ) << endl;
+
+	cout << "Extracting backbone..." << endl;
+	vector< GappedAlignment > backbone_data;
+	simpleFindBackbone( aligned_ivs, min_bb_length, max_gap_length, backbone_data );
+	IntervalList backbone_ivs;
+	backbone_ivs.seq_table = aligned_ivs.seq_table;
+	cout << "There are " << backbone_data.size() << " backbone segments\n";
+	// count up the total length of backbone in each genome
+	cout << "Averaging backbone lengths..." << endl;
+	vector< gnSeqI > total_bb( seq_count, 0 );
+	NumericMatrix< double > overall_identity;
+	for( uint bbI = 0; bbI < backbone_data.size(); bbI++ ){
+		vector<AbstractMatch*> tmp(1, &backbone_data[ bbI ]);
+		backbone_ivs.push_back( Interval(tmp.begin(), tmp.end()) );
+		for( uint seqI = 0; seqI < seq_count; seqI++ ){
+			total_bb[ seqI ] += backbone_data[ bbI ].Length( seqI );
+		}
+	}
+	vector< AbstractMatch* > bbivs;
+	for( uint bbI = 0; bbI < backbone_ivs.size(); bbI++ )
+		bbivs.push_back( &backbone_ivs[bbI] );
+	BackboneIdentityMatrix( bbivs, aligned_ivs.seq_table, overall_identity );
+	gnSeqI avg_bb = 0;
+	for( uint seqI = 0; seqI < aligned_ivs.seq_table.size(); seqI++ ){
+		cout << "seq " << seqI << " backbone: " << total_bb[ seqI ] << endl;
+		avg_bb += total_bb[ seqI ];
+	}
+	avg_bb /= aligned_ivs.seq_table.size();
+	cout << "Average: " << avg_bb << endl;
+
+	// output the identity matrix
+	cout << "Identity matrix: " << endl;
+	overall_identity.print( cout );
+	cout << endl;
+
+}catch( gnException& gne ){
+	cerr << gne << endl;
+	return -1;	// report the failure through the exit status instead of returning 0
+}catch( exception& e ){
+	cerr << e.what() << endl;
+	return -1;
+}catch(...){
+	cerr << "Unknown exception" << endl;	// previously swallowed silently
+	return -1;
+}
+	return 0;
+}
diff --git a/src/calculateCoverage.cpp b/src/calculateCoverage.cpp
new file mode 100644
index 0000000..907616c
--- /dev/null
+++ b/src/calculateCoverage.cpp
@@ -0,0 +1,89 @@
+/*******************************************************************************
+ * $Id: calculateCoverage.cpp,v 1.5 2004/02/28 00:01:31 darling Exp $
+ * This file is copyright 2002-2004 Aaron Darling. All rights reserved.
+ * Please see the file called COPYING for licensing, copying, and modification
+ * rights. Redistribution of this file, in whole or in part is prohibited
+ * without express permission.
+ ******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/IntervalList.h"
+#include "libMems/Islands.h"
+
+using namespace std;
+using namespace genome;
+using namespace mems;
+
+void print_usage( const char* pname ){ // Writes the one-line usage synopsis to stderr, using pname as the program name.
+ cerr << "Usage: " << pname << " <source mauve alignment> <sequence 1>...<sequence N>\n";
+}
+
+#define NELEMS(a) ( sizeof( a ) / sizeof( *a ) )
+
+int main( int argc, const char* argv[] ){
+    // Loads the alignment named by argv[1] and every sequence file after it, then prints each interval's length in each loaded sequence.
+try{
+    if( argc <= 0 ){
+        print_usage( "calculateCoverage" );
+        return -1;
+    }
+    if( argc < 2 ){
+        print_usage( argv[0] );
+        return -1;
+    }
+
+    //
+    // Load sequences
+    //
+    string alignment_fname = argv[1];
+    vector< string > sequence_fname;
+    vector< gnSequence* > source_seqs;    // never deleted in this function
+    for( int argI = 2; argI < argc; argI++ ){    // int index: no signed/unsigned comparison against argc
+        sequence_fname.push_back( argv[ argI ] );
+        cout << "Loading " << sequence_fname[ argI - 2 ];
+        try{
+            source_seqs.push_back( new gnSequence() );
+            source_seqs[ argI - 2 ]->LoadSource( sequence_fname[ argI - 2 ] );
+        }catch( gnException& gne ){
+            cerr << gne << endl;
+            return -1;
+        }
+        cout << " " << source_seqs[ argI - 2 ]->length() << " bp\n";
+    }
+
+    //
+    // Load IntervalList matches
+    //
+    ifstream alignment_in;
+    alignment_in.open( alignment_fname.c_str() );
+    if( !alignment_in.is_open() ){
+        cerr << "Error opening " << alignment_fname << endl;
+        return -1;
+    }
+
+    cout << "Loading alignment...\n";
+    IntervalList aligned_ivs;
+    aligned_ivs.ReadList( alignment_in );
+
+    for( uint ivI = 0; ivI < aligned_ivs.size(); ivI++ ){
+        cout << "Interval " << ivI;
+        Interval& iv = aligned_ivs[ ivI ];
+        for( uint seqI = 0; seqI < source_seqs.size(); seqI++ ){    // one column per sequence given on the command line
+            cout << '\t' << iv.Length( seqI );
+        }
+        cout << endl;
+    }
+
+    return 0;
+}catch( gnException& gne ){
+    cerr << gne << endl;
+}catch( exception& e ){
+    cerr << e.what() << endl;
+}catch(...){
+    cerr << "Unknown exception" << endl;
+}
+    return -1;    // reached only after one of the handlers above reported an exception
+}
diff --git a/src/checkForLGT.cpp b/src/checkForLGT.cpp
new file mode 100644
index 0000000..ab993cd
--- /dev/null
+++ b/src/checkForLGT.cpp
@@ -0,0 +1,253 @@
+#include "libMems/PhyloTree.h"
+#include <vector>
+#include <sstream>
+#include <algorithm>
+#include <utility>
+#include <fstream>
+#include <set>
+
+using namespace std;
+
+typedef unsigned int uint;
+
+/**
+ * Depth first search (explicit stack, following children lists only) to check whether the subtree rooted at subtree_nodeI contains query_nodeI; the subtree root itself counts.
+ */
+bool containsNode( PhyloTree< TreeNode >& t, node_id_t subtree_nodeI, node_id_t query_nodeI )
+{
+ stack< node_id_t > node_stack;
+ node_stack.push( subtree_nodeI );
+ while( node_stack.size() > 0 )
+ {
+ node_id_t cur_node = node_stack.top();
+ node_stack.pop();
+ if( cur_node == query_nodeI )
+ return true;
+ if( t[cur_node].children.size() > 0 ) // redundant with the for-loop condition just below
+ {
+ for( size_t childI = 0; childI < t[cur_node].children.size(); childI++ )
+ node_stack.push( t[cur_node].children[childI] );
+ }
+ }
+ return false; // stack exhausted without meeting query_nodeI
+}
+
+void setTaxonNames( PhyloTree< TreeNode >& t, char** taxon_names )    // Replaces every numeric node label N with taxon_names[N].
+{
+    for( node_id_t nI = 0; nI < t.size(); nI++ )
+    {
+        if( t[nI].name.size() == 0 )
+            continue;    // unnamed nodes are left untouched
+        stringstream ss( t[nI].name );
+        uint num = 0;
+        if( !(ss >> num) ) continue;    // label is not a number: keep it instead of indexing with a meaningless value
+        t[nI].name = taxon_names[num];    // the array length is not passed in, so the caller must guarantee num is a valid index
+    }
+}
+
+int main( int argc, char* argv[] ) // Reads trees from argv[1], prints a per-tree monophyly code for two hard-coded taxon groups, then writes the trees with full taxon names to argv[2].
+{
+ if( argc != 3 )
+ {
+ cerr << "Usage: checkForLGT <newick input file> <newick output file>\n";
+ return -1;
+ }
+ string input_filename = argv[1];
+ string tree_outfname = argv[2];
+ vector< string > group_1;
+ vector< string > group_2;
+ for( uint taxonI = 0; taxonI < 16; taxonI++ ) // group 1 = labels "0".."15"
+ {
+ stringstream ss;
+ ss << taxonI;
+ group_1.push_back( ss.str() );
+ }
+ for( uint taxonI = 16; taxonI < 21; taxonI++ ) // group 2 = labels "16".."20"
+ {
+ stringstream ss;
+ ss << taxonI;
+ group_2.push_back( ss.str() );
+ }
+ char* taxon_names[] = { // indexed by numeric taxon label; 21 entries matching the two groups above
+ "E. coli 53638",
+ "E. coli b171",
+ "E. coli b7a",
+ "E. coli e110019",
+ "E. coli e22",
+ "E. coli e24377a",
+ "E. coli f11",
+ "E. coli HS",
+ "S. boydii BS512",
+ "S. sonnei Ss046",
+ "S. flexneri 2457T",
+ "S. flexneri 301",
+ "E. coli CFT073",
+ "E. coli O157_H7 RIMD",
+ "E. coli O157_H7 EDL933",
+ "E. coli K-12 MG1655",
+ "S. enterica B67",
+ "S. enterica CT18",
+ "S. enterica LT2",
+ "S. enterica PA9150",
+ "S. enterica Ty2",
+ };
+
+ ifstream input_file( input_filename.c_str() );
+ if( !input_file.is_open() )
+ {
+ cerr << "Error opening \"" << input_filename << "\"\n";
+ return -1;
+ }
+
+ uint tree_count = 0; // incremented below but never read in this function
+ vector< PhyloTree< TreeNode > > tree_list;
+ while( true )
+ {
+ PhyloTree< TreeNode > new_t;
+ tree_list.push_back( new_t ); // push an empty tree first, then read into the stored copy
+ PhyloTree< TreeNode >& t = tree_list[tree_list.size() - 1];
+ t.readTree( input_file );
+ if( t.size() == 0 ) // an empty tree ends the reading loop
+ break;
+ tree_count++;
+ }
+ tree_list.erase( tree_list.end() - 1 ); // drop the trailing empty tree pushed by the final iteration
+
+ for( size_t treeI = 0; treeI < tree_list.size(); treeI++ )
+ {
+ PhyloTree< TreeNode >& t = tree_list[treeI];
+
+ if( t[t.root].children.size() != 2 ) // code 1: root does not have exactly two children, tree is not analysed
+ {
+ cout << treeI << "\t1\n";
+ continue;
+ }
+
+ vector< node_id_t > group1_id;
+ vector< node_id_t > group2_id;
+ node_id_t nI = 0;
+ size_t gI = 0;
+ for( gI = 0; gI < group_1.size(); gI++ ) // map every group-1 label to its node id by linear search on node names
+ {
+ nI = 0;
+ for( ; nI < t.size(); nI++ )
+ {
+ if( t[nI].name == group_1[gI] )
+ {
+ group1_id.push_back( nI );
+ break;
+ }
+ }
+ if( nI == t.size() ) // label missing from this tree: abort the whole run
+ {
+ cerr << "Couldn't find node " << group_1[gI] << " in tree " << treeI << endl;
+ return -1;
+ }
+ }
+ for( gI = 0; gI < group_2.size(); gI++ ) // same lookup for group 2
+ {
+ nI = 0;
+ for( ; nI < t.size(); nI++ )
+ {
+ if( t[nI].name == group_2[gI] )
+ {
+ group2_id.push_back( nI );
+ break;
+ }
+ }
+ if( nI == t.size() )
+ {
+ cerr << "Couldn't find node " << group_2[gI] << " in tree " << treeI << endl;
+ return -1;
+ }
+ }
+
+ // g1_subtree is assigned below but never read afterwards in this function
+ node_id_t g1_subtree;
+ if( containsNode( t, t[t.root].children[0], group1_id[0] ) )
+ g1_subtree = t[t.root].children[0];
+ else
+ g1_subtree = t[t.root].children[1];
+
+ bool g1_monophyletic = true;
+ bool g2_monophyletic = true;
+
+ node_id_t cur_parent = group1_id[0];
+ set<node_id_t> g1_remaining;
+ g1_remaining.insert( group1_id.begin(), group1_id.end() );
+ // find the least common ancestor of all g1 nodes
+ while(g1_remaining.size() > 0)
+ {
+ // go to parent
+ cur_parent = t[cur_parent].parents[0]; // NOTE(review): assumes cur_parent always has a parent; no guard if the root is reached with nodes still remaining
+ set<node_id_t>::iterator iter = g1_remaining.begin();
+ while( iter != g1_remaining.end() )
+ {
+ if( containsNode( t, cur_parent, *iter ) )
+ {
+ set<node_id_t>::iterator erase_iter = iter; // advance before erasing so iter stays valid
+ iter++;
+ g1_remaining.erase( erase_iter );
+ }else
+ iter++;
+ }
+ }
+ // check none of group 2 is below the group 1 LCA
+ for( gI = 0; gI < group2_id.size(); gI++ )
+ if( containsNode( t, cur_parent, group2_id[gI] ) )
+ break;
+ if( gI < group2_id.size() ) // loop broke early: a group-2 node sits under the group-1 LCA
+ g1_monophyletic = false;
+
+ // repeat the same procedure with the roles of the two groups swapped
+ cur_parent = group2_id[0];
+ set<node_id_t> g2_remaining;
+ g2_remaining.insert( group2_id.begin(), group2_id.end() );
+ // find the least common ancestor of all g2 nodes
+ while(g2_remaining.size() > 0)
+ {
+ // go to parent
+ cur_parent = t[cur_parent].parents[0];
+ set<node_id_t>::iterator iter = g2_remaining.begin();
+ while( iter != g2_remaining.end() )
+ {
+ if( containsNode( t, cur_parent, *iter ) )
+ {
+ set<node_id_t>::iterator erase_iter = iter;
+ iter++;
+ g2_remaining.erase( erase_iter );
+ }else
+ iter++;
+ }
+ }
+
+ // check none of group 1 is below the group 2 LCA
+ for( gI = 0; gI < group1_id.size(); gI++ )
+ if( containsNode( t, cur_parent, group1_id[gI] ) )
+ break;
+ if( gI < group1_id.size() )
+ g2_monophyletic = false;
+
+ if( !g1_monophyletic && !g2_monophyletic ) // output codes: 2 = neither group monophyletic, 3 = only group 1 fails, 4 = only group 2 fails, 0 = both monophyletic
+ cout << treeI << "\t2\n"; // found something interesting?
+ else if( !g1_monophyletic )
+ cout << treeI << "\t3\n";
+ else if( !g2_monophyletic )
+ cout << treeI << "\t4\n";
+ else
+ cout << treeI << "\t0\n"; // nothing to see here
+ }
+
+ ofstream tree_out( tree_outfname.c_str() );
+ if( !tree_out.is_open() )
+ {
+ cerr << "Error opening \"" << tree_outfname << "\"\n";
+ return -1;
+ }
+ for( size_t treeI = 0; treeI < tree_list.size(); treeI++ ) // relabel with full taxon names and write every tree that was read
+ {
+ setTaxonNames(tree_list[treeI], taxon_names);
+ tree_list[treeI].writeTree(tree_out);
+ }
+ return 0;
+}
\ No newline at end of file
diff --git a/src/coordinateTranslate.cpp b/src/coordinateTranslate.cpp
new file mode 100644
index 0000000..ad6a4fc
--- /dev/null
+++ b/src/coordinateTranslate.cpp
@@ -0,0 +1,51 @@
+// coordinateTranslate
+// (c) Aaron Darling 2011
+// Licensed under the GPL
+
+#include <libMems/IntervalList.h>
+#include <fstream>
+#include <libMems/CompactGappedAlignment.h>
+#include <libMems/MatchList.h>
+
+using namespace mems;
+using namespace std;
+using namespace genome;
+
+int main( int argc, char* argv[] ){    // For each "<block ID> <column>" record in argv[2], prints the per-genome positions of that alignment column in the XMFA file argv[1].
+    if(argc != 3){
+        cerr << "Usage: coordinateTranslate <XMFA alignment> <alignment coordinate file>\n";
+        cerr << "Alignment coordinate file should be structured into 2 tab-delimited columns: <block ID> <column>\n";
+        cerr << "Output will be the nearest aligned position for each genome in the block, with 0 entries for genomes undefined in the block\n";
+        return -1;
+    }
+    ifstream in_aln( argv[1] );
+    if(!in_aln.is_open() ){
+        cerr << "Error opening alignment file \"" << argv[1] << "\"\n";
+        return -2;
+    }
+    IntervalList iv_list;
+    iv_list.ReadStandardAlignment(in_aln);
+    LoadSequences( iv_list, NULL );
+
+    ifstream in_coords( argv[2] );
+    if(!in_coords.is_open() ){
+        cerr << "Error opening coordinate file \"" << argv[2] << "\"\n";
+        return -2;
+    }
+    int block_id = 0, block_col = 0;
+    while( in_coords >> block_id >> block_col ){    // stops at EOF or at a malformed/incomplete record, so block_col is never used unread
+        if( block_id < 0 || (size_t)block_id >= iv_list.size() ){    // the block ID comes from the input file and indexes iv_list directly
+            cerr << "Error: block ID " << block_id << " is out of range\n";
+            return -3;
+        }
+        std::vector<gnSeqI> pos;
+        std::vector<bool> column;
+        iv_list[block_id].GetColumn( block_col, pos, column );    // NOTE(review): block_col is passed through unvalidated - confirm GetColumn's precondition
+        for( size_t i = 0; i < pos.size(); i++ )
+            cout << (i > 0 ? "\t" : "") << (column[i] ? pos[i] : 0);    // tab-separated; 0 where the genome has no entry in this column
+        cout << "\n";
+    }
+    return 0;
+}
+
+
diff --git a/src/countInPlaceInversions.cpp b/src/countInPlaceInversions.cpp
new file mode 100644
index 0000000..025a0d1
--- /dev/null
+++ b/src/countInPlaceInversions.cpp
@@ -0,0 +1,69 @@
+#include "libMems/IntervalList.h"
+#include "libMems/Aligner.h"
+#include <fstream>
+#include <string>
+#include <vector>
+#include <utility>
+
+using namespace std;
+using namespace genome;
+using namespace mems;
+
+int main( int argc, char* argv[] )    // Lists LCBs whose neighbours agree in every sequence but whose orientation differs from the majority (in-place inversions).
+{
+    if( argc != 2 )
+    {
+        cerr << "Usage: countInPlaceInversions <Mauve Alignment>\n";
+        return -1;
+    }
+    ifstream aln_file( argv[1] );
+    if( !aln_file.is_open() )
+    {
+        cerr << "Error opening \"" << argv[1] << "\"\n";
+        return -1;
+    }
+
+    IntervalList iv_list;
+    iv_list.ReadList( aln_file );
+    vector< int64 > weights = vector< int64 >( iv_list.size(), 1 );    // every interval gets weight 1
+    vector< LCB > adjacencies;
+    computeLCBAdjacencies_v2( iv_list, weights, adjacencies );
+    uint seq_count = iv_list.seq_filename.size();
+    vector< pair< uint, uint > > inv_seqs;    // (adjacency index, sequence index) of each reported inversion
+
+    for( uint adjI = 0; adjI < adjacencies.size(); adjI++ )
+    {
+        // find in place inversions
+        uint seqI = 1;
+        for( ; seqI < seq_count; seqI++ )
+        {
+            if( adjacencies[adjI].left_adjacency[0] != adjacencies[adjI].left_adjacency[seqI] ||
+                adjacencies[adjI].right_adjacency[0] != adjacencies[adjI].right_adjacency[seqI] )
+                break;
+        }
+        if( seqI == seq_count )    // no break: every sequence has the same left and right neighbours as sequence 0
+        {
+            // in place inversion
+            // count forward
+            uint forward_count = 0;
+            for( seqI = 0; seqI < seq_count; seqI++ )
+            {
+                if( adjacencies[adjI].left_end[seqI] > 0 )
+                    forward_count++;
+            }
+            for( seqI = 0; seqI < seq_count; seqI++ )    // the minority orientation is the inverted one; a tie reports nothing
+            {
+                if( forward_count * 2 > seq_count && adjacencies[adjI].left_end[seqI] < 0 )    // majority forward: reverse copies are inverted
+                    inv_seqs.push_back( make_pair( adjI, seqI ) );
+                if( forward_count * 2 < seq_count && adjacencies[adjI].left_end[seqI] > 0 )    // majority reverse: forward copies are inverted
+                    inv_seqs.push_back( make_pair( adjI, seqI ) );
+            }
+        }
+    }
+    for( uint invI = 0; invI < inv_seqs.size(); invI++ )
+    {
+        cout << "In-place inversion in seq " << inv_seqs[invI].second;
+        cout << "\tlend: " << adjacencies[inv_seqs[invI].first].left_end[inv_seqs[invI].second];
+        cout << "\trend: " << adjacencies[inv_seqs[invI].first].right_end[inv_seqs[invI].second] << endl;
+    }
+}
diff --git a/src/createBackboneMFA.cpp b/src/createBackboneMFA.cpp
new file mode 100644
index 0000000..ce5533b
--- /dev/null
+++ b/src/createBackboneMFA.cpp
@@ -0,0 +1,57 @@
+#include "libMems/Interval.h"
+#include "libMems/Islands.h"
+#include "libGenome/gnFASSource.h"
+
+using namespace std;
+using namespace mems;
+using namespace genome;
+
+int main( int argc, char* argv[] ) // Concatenates the aligned sequences of every 30th interval of argv[1] per genome and writes the result as FastA to argv[2].
+{
+ IntervalList iv_list;
+ if( argc != 3 )
+ {
+ cerr << "Usage: <input interval file> <output MFA name>\n";
+ return -1;
+ }
+ ifstream in_file( argv[1] );
+ if( !in_file.is_open() )
+ {
+ cerr << "Error opening \"" << argv[1] << "\"\n";
+ return -1;
+ }
+ iv_list.ReadList( in_file );
+ LoadSequences(iv_list, NULL);
+ string base_name = argv[2]; // used verbatim as the output file name
+ cout << "Input alignment has " << iv_list.size() << " intervals\n";
+ vector< string > superaln = vector<string>( iv_list.seq_table.size() ); // one growing string per entry of seq_table
+ for( uint lcbI = 0; lcbI < iv_list.size(); lcbI++ )
+ {
+ // only use 1/30 LCBs
+ if( lcbI % 30 != 0 ) // keeps indices 0, 30, 60, ...; the factor 30 is hard-coded
+ continue;
+ gnAlignedSequences gnas;
+ iv_list[lcbI].GetAlignedSequences( gnas, iv_list.seq_table );
+ for( uint seqI = 0; seqI < gnas.sequences.size(); seqI++ ) // NOTE(review): assumes gnas.sequences.size() <= superaln.size() - confirm
+ superaln[seqI] += gnas.sequences[seqI];
+ }
+
+ ofstream out_file( base_name.c_str() );
+ if( !out_file.is_open() )
+ {
+ cerr << "Error opening \"" << base_name << "\"\n";
+ return -2;
+ }
+ gnSequence gns;
+ for( uint seqI = 0; seqI < superaln.size(); seqI++ )
+ {
+ stringstream seq_name;
+ seq_name << seqI; // the name given below is just the sequence index
+// seq_name << "(" << iv_list[lcbI].Start(seqI) << "-" << iv_list[lcbI].Start(seqI) + iv_list[lcbI].Length(seqI) << ")";
+ gns += superaln[seqI];
+ gns.setContigName( gns.contigListSize()-1, seq_name.str() ); // names the last contig; presumably the one just appended - verify against gnSequence
+ }
+ gnFASSource::Write( gns, out_file, false, false );
+ return 0;
+}
+
diff --git a/src/evd.cpp b/src/evd.cpp
new file mode 100644
index 0000000..932d1bc
--- /dev/null
+++ b/src/evd.cpp
@@ -0,0 +1,129 @@
+#include "libMems/Islands.h"
+#include "libMems/IntervalList.h"
+#include "libMems/MatchList.h"
+#include "libGenome/gnSequence.h"
+
+#include <sstream>
+
+using namespace std;
+using namespace genome;
+using namespace mems;
+
+template< typename MatchVector > // MatchVector: indexable container of pointer-like matches (elements are dereferenced with *iv)
+void getLocalRecordHeights( const MatchVector& iv_list, std::vector< genome::gnSequence* >& seq_table, vector< score_t >& lrh )
+{
+ typedef typename MatchVector::value_type MatchType;
+ if( iv_list.size() == 0 )
+ return;
+ uint seq_count = seq_table.size();
+ for( uint iv_listI = 0; iv_listI < iv_list.size(); iv_listI++ ){
+ const MatchType& iv = iv_list[ iv_listI ];
+ std::vector< std::string > aln_table;
+ GetAlignment( *iv, seq_table, aln_table );
+ // every unordered pair (seqI, seqJ) of rows of this alignment is scored independently
+ for( uint seqI = 0; seqI < seq_count; seqI++ ){
+ uint seqJ;
+ for( seqJ = seqI + 1; seqJ < seq_count; seqJ++ ){
+ // per-column scores from a default-constructed scoring scheme: match scores first, then gap scores into the same vector
+ std::vector< score_t > scores;
+ PairwiseScoringScheme pss;
+ computeMatchScores( aln_table[seqI], aln_table[seqJ], pss, scores );
+ computeGapScores( aln_table[seqI], aln_table[seqJ], pss, scores );
+
+ // Invert the scores since we're trying to detect rare bouts of non-homologous sequence
+ for( size_t sI = 0; sI < scores.size(); ++sI )
+ if( scores[sI] != INVALID_SCORE)
+ scores[sI] = -scores[sI];
+ // score_sum is the running height of the current excursion (0 = not in one); local_record_height is its peak so far
+ score_t score_sum = 0; // start in an hss
+ score_t local_record_height = 0;
+ for( size_t colI = 0; colI < scores.size(); ++colI )
+ {
+ if( scores[colI] == INVALID_SCORE )
+ continue;
+ // NOTE(review): an excursion that returns to exactly 0 takes the last branch, so its peak is not pushed and carries into the next excursion
+ if( score_sum > 0 && score_sum + scores[colI] < 0 )
+ {
+ // end of an excursion
+ score_sum = 0;
+ lrh.push_back( local_record_height );
+ local_record_height = 0;
+ }else if( score_sum == 0 && scores[colI] > 0 )
+ {
+ // start a new excursion
+ score_sum += scores[colI];
+ if( score_sum > local_record_height )
+ local_record_height = score_sum;
+ }else if( score_sum > 0 ){
+ score_sum += scores[colI];
+ if( score_sum > local_record_height )
+ local_record_height = score_sum;
+ }
+ } // NOTE(review): an excursion still open at the last column is never pushed to lrh
+ }
+ }
+ }
+}
+
+
+// read alignjob.<i>/evolved.dat and evolved_seqs.fas for each run, pool the excursion heights, write summary statistics about the EVD to stdout
+int main( int argc, char* argv[] )
+{
+    vector< score_t > lrh_all;    // excursion heights pooled over all runs
+    if( argc != 2 )
+    {
+        cerr << "Usage: evd <simulation run count>\n";
+        cerr << "This program must be run from a directory which contains alignjob directories\n";
+        return -1;
+    }
+    int run_count = atoi( argv[1] );
+    int simu_count = 0;    // runs whose alignment file could actually be opened
+    for( int runI = 0; runI < run_count; ++runI )
+    {
+        IntervalList iv_list;
+        stringstream aln_fname;
+        aln_fname << "alignjob." << runI << "/evolved.dat";
+        ifstream in_file( aln_fname.str().c_str() );
+        if( !in_file.is_open() )
+        {
+            cerr << "Error opening " << aln_fname.str() << endl;
+            continue;    // a missing run is reported and skipped, not fatal
+        }
+        simu_count++;
+        iv_list.ReadStandardAlignment(in_file);
+        stringstream seq_fname;
+        seq_fname << "alignjob." << runI << "/evolved_seqs.fas";
+        MatchList ml;
+        LoadMFASequences(ml, seq_fname.str(), &cout);
+        iv_list.seq_table = ml.seq_table;
+
+        vector< Interval* > iv_ptrs( iv_list.size() );
+        for( size_t ivI = 0; ivI < iv_list.size(); ++ivI )
+            iv_ptrs[ivI] = &iv_list[ivI];
+
+        vector< score_t > lrh;
+        getLocalRecordHeights( iv_ptrs, iv_list.seq_table, lrh );
+        lrh_all.insert( lrh_all.end(), lrh.begin(), lrh.end() );
+    }
+    cout << "Total number of simulations: " << simu_count << endl;
+    cout << "Total number of excursions: " << lrh_all.size() << endl;
+    if( lrh_all.empty() )
+        return 0;    // no excursions: size()-1 would wrap and the threshold lookups below would read an empty vector
+    std::sort( lrh_all.begin(), lrh_all.end() );
+    const size_t last = lrh_all.size() - 1;    // clamp target so every percentile index stays inside the vector
+    size_t index_95 = std::min( (size_t)(lrh_all.size() * .95), last );
+    size_t index_99 = std::min( (size_t)(lrh_all.size() * .99), last );
+    size_t index_999 = std::min( (size_t)(lrh_all.size() * .999), last );
+    size_t index_9999 = std::min( (size_t)(lrh_all.size() * .9999), last );
+    cout << "95% score threshold: " << lrh_all[index_95] << endl;
+    cout << "Number excursions above 95%: " << lrh_all.size() - index_95 << endl;
+    cout << "99% score threshold: " << lrh_all[index_99] << endl;
+    cout << "Number excursions above 99%: " << lrh_all.size() - index_99 << endl;
+    cout << "99.9% score threshold: " << lrh_all[index_999] << endl;
+    cout << "Number excursions above 99.9%: " << lrh_all.size() - index_999 << endl;
+    cout << "99.99% score threshold: " << lrh_all[index_9999] << endl;
+    cout << "Number excursions above 99.99%: " << lrh_all.size() - index_9999 << endl;
+    return 0;
+}
+
+
diff --git a/src/extractBCITrees.cpp b/src/extractBCITrees.cpp
new file mode 100644
index 0000000..7e9d6a5
--- /dev/null
+++ b/src/extractBCITrees.cpp
@@ -0,0 +1,369 @@
+#include "libMems/PhyloTree.h"
+#include <vector>
+#include <sstream>
+#include <algorithm>
+#include <utility>
+#include <fstream>
+#include <boost/random/uniform_real.hpp>
+#include <boost/random/lagged_fibonacci.hpp>
+
+using namespace std;
+
+typedef unsigned int uint;
+
+bool taxonNameLessThan( string name1, string name2 )    // Numeric "less than" on taxon labels: compares the leading integer of each name.
+{
+    stringstream n1_str( name1 );
+    stringstream n2_str( name2 );
+    int n1 = 0, n2 = 0;    // initialised so a label with no leading integer compares as 0 rather than as an indeterminate value
+    n1_str >> n1;
+    n2_str >> n2;
+    return n1 < n2;
+}
+
+template<class T, class S> // T: container with begin/end/push_back and assignment; S: type comparable to its elements with !=
+void findAndErase( T& container, S& item ) // Removes every element equal to item by rebuilding the container.
+{
+ T new_container;
+ for( typename T::iterator t_iter = container.begin(); t_iter != container.end(); t_iter++ )
+ if( *t_iter != item ) // container is not modified during the scan, so item may safely refer to one of its own elements
+ new_container.push_back( *t_iter );
+ container = new_container; // replace the contents only after the scan has finished
+};
+
+/**
+ * Depth first search (explicit stack, following children lists only) to check whether the subtree rooted at subtree_nodeI contains query_nodeI; the subtree root itself counts.
+ */
+bool containsNode( PhyloTree< TreeNode >& t, node_id_t subtree_nodeI, node_id_t query_nodeI )
+{
+ stack< node_id_t > node_stack;
+ node_stack.push( subtree_nodeI );
+ while( node_stack.size() > 0 )
+ {
+ node_id_t cur_node = node_stack.top();
+ node_stack.pop();
+ if( cur_node == query_nodeI )
+ return true;
+ if( t[cur_node].children.size() > 0 ) // redundant with the for-loop condition just below
+ {
+ for( size_t childI = 0; childI < t[cur_node].children.size(); childI++ )
+ node_stack.push( t[cur_node].children[childI] );
+ }
+ }
+ return false; // stack exhausted without meeting query_nodeI
+}
+
+
+/** Re-root the tree at the internal node new_root, rewriting parent/child lists top-down; throws a const char* message if new_root is a leaf.
+ */
+void rerootTree( PhyloTree< TreeNode >& t, node_id_t new_root )
+{
+ // new root must be an internal node
+ if( t[new_root].children.size() == 0 )
+ throw "Can't root on a leaf node";
+ if( new_root == t.root )
+ return; // already rooted at new_root, nothing to do
+
+ // change the old root node to an internal node
+ uint childI = 0;
+ for( ; childI < t[t.root].children.size(); childI++ ){
+ if( containsNode( t, t[t.root].children[childI], new_root ) )
+ {
+ t[t.root].parents.push_back( t[t.root].children[childI] ); // the child leading towards new_root becomes the old root's parent
+ findAndErase( t[t.root].children, t[t.root].children[childI] ); // item aliases an element of the container being filtered
+ break;
+ }
+ }
+ // shake the tree out on the new root node
+ t.root = new_root;
+ t[t.root].children.insert( t[t.root].children.end(), t[t.root].parents.begin(), t[t.root].parents.end() ); // NOTE(review): the new root's parents list is not cleared here - confirm that is intended
+
+ stack<node_id_t> node_stack;
+ node_stack.push(t.root);
+ while( node_stack.size() > 0 )
+ {
+ // for each child of the current node: remove the current node from
+ // the child's children and parents lists, turn the child's remaining
+ // parents into children, make the current node its only parent,
+ // then process that child in turn (pushed on the stack)
+ node_id_t cur_node = node_stack.top();
+ node_stack.pop();
+ for( uint childI = 0; childI < t[cur_node].children.size(); childI++ )
+ {
+ TreeNode& child_n = t[t[cur_node].children[childI]];
+ findAndErase( child_n.children, cur_node );
+ findAndErase( child_n.parents, cur_node );
+ child_n.children.insert( child_n.children.end(), child_n.parents.begin(), child_n.parents.end() );
+ child_n.parents.clear();
+ child_n.parents.push_back(cur_node);
+ node_stack.push(t[cur_node].children[childI]);
+ }
+ }
+}
+
+/**
+ * Find the leaf node whose taxon name is least under taxonNameLessThan
+ * in the subtree rooted at nodeI (depth-first, explicit stack)
+ */
+node_id_t getRepresentativeTaxon( PhyloTree< TreeNode >& t, node_id_t nodeI )
+{
+ stack< node_id_t > node_stack;
+ node_stack.push( nodeI );
+ string least_name = ""; // empty string doubles as the "no leaf seen yet" marker
+ node_id_t least_node = nodeI;
+ while( node_stack.size() > 0 )
+ {
+ node_id_t cur_node = node_stack.top();
+ node_stack.pop();
+ if( t[cur_node].children.size() > 0 ) // internal node: descend into its children
+ {
+ for( size_t childI = 0; childI < t[cur_node].children.size(); childI++ )
+ node_stack.push( t[cur_node].children[childI] );
+ }
+ else
+ {
+ if( least_name == "" ) // first leaf seen (or every leaf so far had an empty name)
+ {
+ least_name = t[cur_node].name;
+ least_node = cur_node;
+ }
+ if( taxonNameLessThan( t[cur_node].name, least_name ) )
+ {
+ least_name = t[cur_node].name;
+ least_node = cur_node;
+ }
+ }
+ }
+ return least_node;
+}
+
+class TaxonNamePairComparator // Strict-weak-order functor for (taxon name, index) pairs; only the name takes part in the comparison.
+{
+public:
+ bool operator()( const pair<string, size_t>& p1, const pair<string, node_id_t>& p2 ) // NOTE(review): the two parameter types differ (size_t vs node_id_t) - presumably unintended
+ {
+ return taxonNameLessThan( p1.first, p2.first ); // the .second members are ignored
+ }
+};
+
+void sortTaxa( PhyloTree< TreeNode >& t ) // Reorders the children of every internal node by the name of each child subtree's representative taxon.
+{
+ for( node_id_t nodeI = 0; nodeI < t.size(); nodeI++ )
+ {
+ if( t[nodeI].children.size() == 0 ) // leaves have nothing to sort
+ continue;
+ // get the "representative" of each subtree
+ vector< pair<string, node_id_t> > representatives = vector< pair<string, node_id_t> >( t[nodeI].children.size() );
+ for( size_t repI = 0; repI < representatives.size(); repI++ )
+ {
+ node_id_t rep_node = getRepresentativeTaxon( t, t[nodeI].children[ repI ] );
+ representatives[ repI ] = make_pair( t[rep_node].name, repI ); // .second is the child's position in the children list, not a node id
+ }
+ // sort children on their representative taxon names
+ TaxonNamePairComparator tnc;
+ sort( representatives.begin(), representatives.end(), tnc );
+ // repopulate the children array with the sorted order
+ vector< node_id_t > sorted_children;
+ for( size_t repI = 0; repI < representatives.size(); repI++ )
+ sorted_children.push_back( t[nodeI].children[representatives[repI].second] );
+ t[nodeI].children = sorted_children;
+ }
+}
+
+/**
+ * Assumes that taxa have numeric labels starting at 1 and simply
+ * subtracts 1 from each non-empty node label (empty names are skipped)
+ */
+void relabelTaxaToStartWithZero( PhyloTree< TreeNode >& t )
+{
+ for( node_id_t nodeI = 0; nodeI < t.size(); nodeI++ )
+ {
+ if( t[nodeI].name == "" )
+ continue;
+ stringstream name_str( t[nodeI].name );
+ uint number;
+ name_str >> number; // NOTE(review): the extraction result is not checked; a non-numeric label is not detected
+ number--; // unsigned: a label of 0 would wrap around instead of going negative
+ stringstream new_name_str;
+ new_name_str << number;
+ t[nodeI].name = new_name_str.str();
+ }
+}
+
+int main( int argc, char* argv[] )    // Pools trees from MrBayes .trprobs files up to a cumulative-posterior threshold, merges identical topologies, and writes at most <max output trees> of them.
+{
+    if( argc < 5 )
+    {
+        cerr << "Usage: extractBCITrees <random seed> <BCI threshold> <max output trees> <MrBayes .trprobs input file 1 .. N> <nexus output file>\n";
+        cerr << "This program reads all trees and their posterior from a set of MrBayes .trprobs files\n";
+        cerr << "and sums and normalizes posteriors for each topology.  All trees that meet a Bayes Credible\n";
+        cerr << "Interval threshold will be saved, up to some maximum number of trees.\n";
+        cerr << "<BCI Threshold>\tA number between 0 and 1 giving the BCI threshold.  0.9 is a good choice.\n";
+        cerr << "<max output trees>\tLimit the output to this many trees.\n";
+        cerr << "All trees in the input file must have the same number of taxa and the same taxon labels\n";
+        return -1;
+    }
+    boost::uint32_t prng_seed = atoi( argv[1] );
+    double bci_threshold = atof( argv[2] );
+    uint max_output_trees = atoi( argv[3] );
+    vector< string > trprobs_fnames;
+    for( int argI = 4; argI < argc - 1; argI++ )    // everything between the third argument and the last one is an input file
+        trprobs_fnames.push_back( argv[argI] );
+    if( trprobs_fnames.size() == 0 )
+    {
+        cerr << "At least one .trprobs file must be given\n";
+        return -1;
+    }
+    string output_filename = argv[argc-1];
+
+
+    ofstream output_file( output_filename.c_str() );
+    if( !output_file.is_open() )
+    {
+        cerr << "Error opening \"" << output_filename << "\"\n";
+        return -1;
+    }
+
+    size_t tree_sizes = 0;    // node count of the first tree read; all later trees must match
+    uint tree_count = 0;    // never incremented, so the error message below always reports "tree 1"
+    vector< pair< string, double > > tree_and_pp_list;    // (canonical tree text, weight)
+    for( size_t fileI = 0; fileI < trprobs_fnames.size(); fileI++ )
+    {
+        ifstream input_file( trprobs_fnames[fileI].c_str() );
+        if( !input_file.is_open() )
+        {
+            cerr << "Error opening \"" << trprobs_fnames[fileI] << "\"\n";
+            return -1;
+        }
+        // scan ahead to start of trees
+        string cur_line;
+        while( getline( input_file, cur_line ) )
+        {
+            stringstream line_str( cur_line );
+            string first_token;
+            line_str >> first_token;
+            if( first_token == "tree" )
+                break;
+        }
+        do
+        {
+            stringstream line_str( cur_line );
+            string token;
+            line_str >> token;
+            if( token != "tree" )
+                break;
+            for( int i = 0; i < 6; i++ )    // skip the six whitespace-separated tokens that precede the cumulative value
+                line_str >> token;
+
+            line_str >> token;
+            // read the cumulative posterior
+            stringstream cum_str( token );
+            string cum;
+            getline( cum_str, cum, ']' );    // strip the closing bracket
+            double cumulative = 0;
+            stringstream cc_str(cum);
+            cc_str >> cumulative;    // parse the stripped text; cum_str was already consumed by getline above
+            if( cumulative > bci_threshold )
+                break;    // the rest of this file lies beyond the credible interval
+
+            for( int i = 0; i < 3; i++ )    // after these three reads, token holds the weight
+                line_str >> token;
+
+            // read the weight
+            stringstream w_str( token );
+            string w;
+            getline( w_str, w, ']' );
+            double weight = 0;
+            stringstream ww_str(w);
+            ww_str >> weight;
+
+            // read the tree
+            line_str >> token;
+            stringstream tree_str( token );
+            PhyloTree< TreeNode > t;
+            t.readTree( tree_str );
+            if( t.size() == 0 )
+                break;
+            if( tree_sizes == 0 )
+                tree_sizes = t.size();
+            if( t.size() != tree_sizes )
+            {
+                cerr << "Error: tree " << tree_count + 1 << " has a different number of taxa\n";
+                return -2;
+            }
+            sortTaxa( t );    // canonical child order, so equal topologies produce equal strings
+            relabelTaxaToStartWithZero( t );
+            stringstream ss;
+            t.writeTree(ss);
+            tree_and_pp_list.push_back(make_pair(ss.str(),weight));
+            cout << "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b";
+            cout << "Read " << tree_and_pp_list.size() << " trees";
+        }while( getline( input_file, cur_line ) );
+
+    }
+
+    sort( tree_and_pp_list.begin(), tree_and_pp_list.end() );    // brings identical tree strings next to each other
+
+    long unique_count = 0;
+
+    // identify unique trees
+    vector< pair< string, double > > unique_tree_and_pp_list;
+    for( size_t treeI = 0; treeI < tree_and_pp_list.size(); treeI++ )
+    {
+        if( treeI > 0 && tree_and_pp_list[treeI].first == tree_and_pp_list[treeI - 1].first )
+        {
+            unique_tree_and_pp_list.back().second += tree_and_pp_list[treeI].second;    // duplicate topology: accumulate its weight
+            continue;
+        }
+        unique_tree_and_pp_list.push_back( tree_and_pp_list[treeI] );
+        unique_count++;
+    }
+
+    // if the number of unique trees is less than the max, just write them out
+    // otherwise we need to subsample
+    if( unique_tree_and_pp_list.size() < max_output_trees )
+    {
+        cout << endl;
+        cout << "Writing unique trees to \"" << output_filename << "\"\n";
+        for( size_t treeI = 0; treeI < unique_tree_and_pp_list.size(); treeI++ )
+            output_file << unique_tree_and_pp_list[treeI].first;
+        cerr << "There are " << unique_count << " unique trees\n";
+        return 0;
+    }
+
+    // total posterior weight still available for sampling
+    double sum = 0;
+    for( size_t treeI = 0; treeI < unique_tree_and_pp_list.size(); treeI++ )
+        sum += unique_tree_and_pp_list[treeI].second;
+    // sample trees without replacement, with probability proportional to weight
+    vector< string > subsample;
+    boost::lagged_fibonacci44497 rng;
+    rng.seed(prng_seed);
+    for( size_t treeI = 0; treeI < max_output_trees; treeI++ )
+    {
+        // get a random number
+        boost::uniform_real<> url( 0, sum );
+        double dart = url(rng);
+        double cursum = 0;
+        size_t i = 0;
+        for( ; i < unique_tree_and_pp_list.size(); i++ )
+        {
+            cursum += unique_tree_and_pp_list[i].second;
+            if( cursum > dart )
+                break;
+        }
+        if( i == unique_tree_and_pp_list.size() )
+            i--;    // rounding fallback: take the last tree
+        sum -= unique_tree_and_pp_list[i].second;    // keep the draw range equal to the weight that is still selectable
+        unique_tree_and_pp_list[i].second = 0;    // a zero weight cannot win "cursum > dart" again
+        subsample.push_back( unique_tree_and_pp_list[i].first );
+    }
+
+    cout << endl;
+    cout << "Writing unique trees to \"" << output_filename << "\"\n";
+    for( size_t treeI = 0; treeI < subsample.size(); treeI++ )
+        output_file << subsample[treeI];
+    cerr << "There are " << unique_count << " unique trees\n";
+    cerr << "The subsample contains " << subsample.size() << " trees\n";
+    return 0;
+}
diff --git a/src/extractBackbone.cpp b/src/extractBackbone.cpp
new file mode 100644
index 0000000..cc432ce
--- /dev/null
+++ b/src/extractBackbone.cpp
@@ -0,0 +1,83 @@
+/*******************************************************************************
+ * $Id: extractBackbone.cpp,v 1.2 2004/02/28 00:01:31 darling Exp $
+ * This file is copyright 2002-2004 Aaron Darling. All rights reserved.
+ * Please see the file called COPYING for licensing, copying, and modification
+ * rights. Redistribution of this file, in whole or in part is prohibited
+ * without express permission.
+ ******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/IntervalList.h"
+#include "libMems/Islands.h"
+
+using namespace std;
+using namespace genome;
+using namespace mems;
+
+void print_usage( const char* pname ){ // Writes the one-line usage synopsis to stderr, using pname as the program name.
+ cerr << "Usage: " << pname << " <source sequences> <source alignment> <min bb sequence length> <max bb gap size> <backbone output>\n";
+}
+
+int main( int argc, const char* argv[] ){ // Reads sequences and an alignment, extracts backbone segments with simpleFindBackbone, and writes them as a standard alignment.
+ if( argc <= 0 ){
+ print_usage( "extractBackbone" );
+ return -1;
+ }
+ if( argc != 6 ){
+ print_usage( argv[0] );
+ return -1;
+ }
+
+ string sequence_fname = argv[1];
+ string alignment_fname = argv[2];
+ int64 min_bb_length = atol( argv[3] ); // atol: non-numeric text silently yields 0
+ int64 max_gap_length = atol( argv[4] );
+ string output_fname = argv[5];
+
+ gnSequence source_seqs;
+ try{
+ source_seqs.LoadSource( sequence_fname );
+ }catch( gnException& gne ){
+ cerr << gne << endl;
+ return -1;
+ }
+
+ ifstream alignment_in;
+ alignment_in.open( alignment_fname.c_str() );
+ if( !alignment_in.is_open() ){
+ cerr << "Error opening " << alignment_fname << endl;
+ return -1;
+ }
+
+
+ IntervalList aligned_ivs;
+ aligned_ivs.ReadStandardAlignment( alignment_in );
+
+ // add the sequence data to the interval list
+ for( uint seqI = 0; seqI < source_seqs.contigListSize(); seqI++ ){ // one seq_table entry per contig of the source file
+ aligned_ivs.seq_table.push_back( new gnSequence( source_seqs.contig( seqI ) ) ); // allocated with new and not deleted in this function
+ }
+
+ vector< GappedAlignment > backbone_data;
+ simpleFindBackbone( aligned_ivs, min_bb_length, max_gap_length, backbone_data );
+ IntervalList backbone_ivs;
+ backbone_ivs.seq_table = aligned_ivs.seq_table; // both lists now hold the same gnSequence pointers
+ // construct a new IntervalList containing only backbone regions
+ for( uint bbI = 0; bbI < backbone_data.size(); bbI++ )
+ {
+ vector<AbstractMatch*> tmp(1, &backbone_data[ bbI ] ); // one-element range wrapping this backbone segment
+ backbone_ivs.push_back( Interval(tmp.begin(), tmp.end()) );
+ }
+
+ ofstream output( output_fname.c_str() );
+ if( !output.is_open() ){
+ cerr << "Error opening " << output_fname << endl;
+ return -1;
+ }
+ backbone_ivs.WriteStandardAlignment( output );
+
+ return 0;
+}
diff --git a/src/extractBackbone2.cpp b/src/extractBackbone2.cpp
new file mode 100644
index 0000000..4bfe004
--- /dev/null
+++ b/src/extractBackbone2.cpp
@@ -0,0 +1,70 @@
+/*******************************************************************************
+ * $Id: extractBackbone.cpp,v 1.2 2004/02/28 00:01:31 darling Exp $
+ * This file is copyright 2002-2004 Aaron Darling. All rights reserved.
+ * Please see the file called COPYING for licensing, copying, and modification
+ * rights. Redistribution of this file, in whole or in part is prohibited
+ * without express permission.
+ ******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/IntervalList.h"
+#include "libMems/Islands.h"
+
+using namespace std;
+using namespace genome;
+using namespace mems;
+
+/**
+ * Writes the one-line command synopsis for this tool to standard error.
+ * @param pname  program name shown directly after "Usage: "
+ */
+void print_usage( const char* pname ){
+    const char* const arg_synopsis =
+        " <mauve alignment> <min bb sequence length> <max bb gap size> <backbone output>\n";
+    cerr << "Usage: ";
+    cerr << pname;
+    cerr << arg_synopsis;
+}
+
+/**
+ * Entry point: reads an alignment with IntervalList::ReadList(), loads the
+ * sequences it refers to, computes backbone segments with
+ * simpleFindBackbone() and writes them with IntervalList::WriteList().
+ *
+ * Arguments: <mauve alignment> <min bb sequence length> <max bb gap size>
+ *            <backbone output>
+ * Returns 0 on success and -1 on a usage error or when the alignment or the
+ * output file cannot be opened.
+ */
+int main( int argc, const char* argv[] ){
+    // NOTE(review): the fallback name says "extractBackbone" although this
+    // file is extractBackbone2.cpp -- confirm whether that is intended
+    if( argc <= 0 ){
+        print_usage( "extractBackbone" );
+        return -1;
+    }
+    if( argc != 5 ){
+        print_usage( argv[0] );
+        return -1;
+    }
+
+    const string aln_path( argv[1] );
+    const int64 min_bb_length = atol( argv[2] );
+    const int64 max_gap_length = atol( argv[3] );
+    const string out_path( argv[4] );
+
+    ifstream aln_stream( aln_path.c_str() );
+    if( !aln_stream.is_open() ){
+        cerr << "Error opening " << aln_path << endl;
+        return -1;
+    }
+
+    IntervalList aligned_ivs;
+    aligned_ivs.ReadList( aln_stream );
+    LoadSequences( aligned_ivs, &cout );
+
+    vector< GappedAlignment > bb_segments;
+    simpleFindBackbone( aligned_ivs, min_bb_length, max_gap_length, bb_segments );
+
+    // the backbone list shares the sequence table and file names of its source
+    IntervalList backbone_ivs;
+    backbone_ivs.seq_table = aligned_ivs.seq_table;
+    backbone_ivs.seq_filename = aligned_ivs.seq_filename;
+
+    // wrap each backbone segment in its own single-match Interval
+    for( vector< GappedAlignment >::iterator seg = bb_segments.begin(); seg != bb_segments.end(); ++seg ){
+        vector< AbstractMatch* > single_match;
+        single_match.push_back( &*seg );
+        backbone_ivs.push_back( Interval( single_match.begin(), single_match.end() ) );
+    }
+
+    ofstream out_stream( out_path.c_str() );
+    if( !out_stream.is_open() ){
+        cerr << "Error opening " << out_path << endl;
+        return -1;
+    }
+    backbone_ivs.WriteList( out_stream );
+
+    return 0;
+}
diff --git a/src/extractSubalignments.cpp b/src/extractSubalignments.cpp
new file mode 100644
index 0000000..761cc7e
--- /dev/null
+++ b/src/extractSubalignments.cpp
@@ -0,0 +1,96 @@
+#include "libMems/IntervalList.h"
+#include "libGenome/gnFASSource.h"
+#include <sstream>
+
+using namespace std;
+using namespace genome;
+using namespace mems;
+
+/**
+ * Copies the part of an alignment that covers a stretch of one sequence.
+ *
+ * Searches iv_list for the first interval whose span in sequence seqI
+ * contains position lend, crops a copy of it so that it starts at lend and
+ * keeps at most `length` positions of seqI, and appends the copy to sub_list.
+ * When no interval contains lend, sub_list is left unchanged.
+ *
+ * @param iv_list   alignment to search
+ * @param sub_list  receives the cropped interval
+ * @param seqI      index of the sequence the coordinates refer to
+ * @param lend      left-end coordinate within sequence seqI
+ * @param length    maximum number of seqI positions to keep
+ */
+void extractSubAlignment( IntervalList& iv_list, IntervalList& sub_list, uint seqI, gnSeqI lend, gnSeqI length )
+{
+    // find the relevant interval
+    uint ivI = 0;
+    for( ; ivI < iv_list.size(); ivI++ )
+        if( iv_list[ivI].LeftEnd(seqI) <= lend &&
+            lend < iv_list[ivI].LeftEnd(seqI) + iv_list[ivI].Length(seqI) )
+            break;
+
+    // nothing covers lend: indexing iv_list[ivI] would run past the end
+    if( ivI == iv_list.size() )
+        return;
+
+    // we've now got the starting interval, crop appropriately
+    gnSeqI crop_left_amt = lend - iv_list[ivI].LeftEnd(seqI);
+    Interval iv(iv_list[ivI]);
+    iv.CropLeft(crop_left_amt, seqI);
+    gnSeqI crop_right_amt = length < iv.Length(seqI) ? iv.Length(seqI) - length : 0;
+    iv.CropRight(crop_right_amt, seqI);
+    iv.CalculateOffset();
+    sub_list.push_back(iv);
+}
+
+/**
+ * Entry point: for every line of the spec file, extracts the requested
+ * sub-alignment from the XMFA input and writes it as Multi-FastA to
+ * "<base name>.interval_<N>", where N is the zero-based index of the spec line.
+ *
+ * Arguments: <XMFA alignment input> <Multi-FastA base name> <sub-alignment spec file>
+ * Returns 0 on success and -1 on a usage error or when a file cannot be opened.
+ */
+int main( int argc, char* argv[] )
+{
+    if( argc != 4 )
+    {
+        cerr << "Usage: extractSubAlignment <XMFA alignment input> <Multi-FastA base name> <sub-alignment spec file>\n";
+        cerr << "where subalignment spec file is tab delimited text of the form:\n";
+        cerr << "<genome id>\t<left end>\t<length>\n";
+        // argv[1..3] are read below, so do not continue without them
+        return -1;
+    }
+
+    string alignment_infilename = argv[1];
+    string alignment_outfilename = argv[2];
+    string spec_filename = argv[3];
+
+    ifstream alignment_infile( alignment_infilename.c_str() );
+    if( !alignment_infile.is_open() )
+    {
+        cerr << "Error opening \"" << alignment_infilename << "\"\n";
+        return -1;
+    }
+
+    IntervalList iv_list, iv_sublist;
+    iv_list.ReadStandardAlignment( alignment_infile );
+
+    ifstream spec_infile( spec_filename.c_str() );
+    if( !spec_infile.is_open() )
+    {
+        cerr << "Error opening \"" << spec_filename << "\"\n";
+        return -1;
+    }
+    // every slot points at the same empty gnSequence object
+    vector< gnSequence* > seq_table( iv_list.seq_filename.size(), new gnSequence() );
+    size_t ivI = 0;
+    string cur_line;
+    while( getline( spec_infile, cur_line ) )
+    {
+        // parsing stops at the first line that does not hold three numbers
+        stringstream line_str( cur_line );
+        uint seqI;
+        int64 lend;
+        gnSeqI length;
+        if( !(line_str >> seqI) )
+            break;
+        if( !(line_str >> lend) )
+            break;
+        if( !(line_str >> length) )
+            break;
+        extractSubAlignment( iv_list, iv_sublist, seqI, lend, length );
+        if( iv_sublist.size() == 0 )
+        {
+            // keep the output numbering aligned with the spec lines
+            cerr << "Warning: no aligned interval contains position " << lend << " of sequence " << seqI << ", skipping\n";
+            ivI++;
+            continue;
+        }
+
+        gnAlignedSequences gnas;
+        iv_sublist[0].GetAlignedSequences( gnas, seq_table );
+        stringstream ss;
+        ss << alignment_outfilename << ".interval_" << ivI;
+        ofstream out_file( ss.str().c_str() );
+        if( !out_file.is_open() )
+        {
+            cerr << "Error opening \"" << ss.str() << "\"\n";
+            return -1;
+        }
+        // one contig per sequence, named "<index>(<start>:<start + length>)"
+        gnSequence mfa;
+        for( uint seqJ = 0; seqJ < seq_table.size(); seqJ++ )
+        {
+            mfa += gnas.sequences[seqJ];
+            stringstream cname;
+            cname << seqJ << "(" << iv_sublist[0].Start(seqJ) << ":" << iv_sublist[0].Start(seqJ) + iv_sublist[0].Length(seqJ) << ")";
+            mfa.setContigName( mfa.contigListLength() - 1, cname.str() );
+        }
+        gnFASSource::Write( mfa, out_file, false, false );
+        iv_sublist.clear();
+        ivI++;
+    }
+    return 0;
+}
\ No newline at end of file
diff --git a/src/gappiness.cpp b/src/gappiness.cpp
new file mode 100644
index 0000000..4fe2a35
--- /dev/null
+++ b/src/gappiness.cpp
@@ -0,0 +1,53 @@
+#include "libGenome/gnFASSource.h"
+
+using namespace std;
+using namespace genome;
+
+/**
+ * Entry point: prints simple statistics about a Multi-FastA alignment to
+ * standard output: the alignment length (length of the first entry), the
+ * ungapped length of every entry, their average, the ratio of alignment
+ * length to average ungapped length ("gappiness"), and the fraction of
+ * identical columns over all pairs of entries ("percent_id").
+ *
+ * Arguments: <MFA file>
+ * Returns 0 on success and -1 on a usage error or when the file holds no
+ * sequences.
+ */
+int main( int argc, char* argv[] )
+{
+    if( argc != 2 )
+    {
+        cerr << "Usage: gappiness <MFA file>\n";
+        // argv[1] is read below, so do not continue without it
+        return -1;
+    }
+    string aln_fname = argv[1];
+    gnSequence gns;
+    gns.LoadSource( aln_fname );
+    if( gns.contigListSize() == 0 )
+    {
+        // contig(0) and the average below need at least one entry
+        cerr << "Error: no sequences found in \"" << aln_fname << "\"\n";
+        return -1;
+    }
+    cout << "aln_length\t" << gns.contig(0).length() << endl;
+    gnSeqI total_len = 0;
+    for( uint seqI = 0; seqI < gns.contigListSize(); seqI++ )
+    {
+        // count every character that is not a gap
+        string cur_seq = gns.contig(seqI).ToString();
+        gnSeqI len = 0;
+        for( size_t charI = 0; charI < cur_seq.size(); charI++ )
+        {
+            if( cur_seq[charI] != '-' )
+                len++;
+        }
+        cout << "seq" << seqI << "_len\t" << len << endl;
+        total_len += len;
+    }
+    double avg_seq_len = (double)total_len / (double)gns.contigListSize();
+    cout << "avg_seq_len\t" << avg_seq_len << endl;
+    cout << "gappiness\t" << (double)(gns.contig(0).length()) / avg_seq_len << endl;
+
+    // compute average pairwise identity
+    gnSeqI total_id = 0;
+    gnSeqI total_possible = 0;
+    for( uint seqI = 0; seqI < gns.contigListSize(); seqI++ )
+    {
+        // the outer row does not change while seqJ advances
+        string cur_seqI = gns.contig(seqI).ToString();
+        for( uint seqJ = seqI + 1; seqJ < gns.contigListSize(); seqJ++ )
+        {
+            string cur_seqJ = gns.contig(seqJ).ToString();
+            // never read past the end of the shorter row
+            size_t col_count = cur_seqI.size() < cur_seqJ.size() ? cur_seqI.size() : cur_seqJ.size();
+            for( size_t colI = 0; colI < col_count; colI++ )
+            {
+                // columns with a gap in either row are not comparable
+                if( cur_seqI[colI] == '-' || cur_seqJ[colI] == '-' )
+                    continue;
+                total_possible++;
+                if( toupper(cur_seqI[colI]) == toupper(cur_seqJ[colI]) )
+                    total_id++;
+            }
+        }
+    }
+    cout << "percent_id\t" << (double)total_id / (double)total_possible << endl;
+    return 0;
+}
+
diff --git a/src/getAlignmentWindows.cpp b/src/getAlignmentWindows.cpp
new file mode 100644
index 0000000..6ce86cb
--- /dev/null
+++ b/src/getAlignmentWindows.cpp
@@ -0,0 +1,137 @@
+/*******************************************************************************
+ * $Id: getAlignmentWindows.cpp,v 1.2 2004/02/28 00:01:31 darling Exp $
+ * This file is copyright 2002-2004 Aaron Darling. All rights reserved.
+ * Please see the file called COPYING for licensing, copying, and modification
+ * rights. Redistribution of this file, in whole or in part is prohibited
+ * without express permission.
+ ******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/IntervalList.h"
+#include "libMems/Islands.h"
+#include "libGenome/gnFASSource.h"
+#include "libMems/GappedAlignment.h"
+#include "libMems/Interval.h"
+#include <boost/filesystem/operations.hpp>
+#include <boost/algorithm/string/erase.hpp>
+
+using namespace std;
+using namespace genome;
+using namespace mems;
+
+/**
+ * Writes the one-line command synopsis for this tool to standard error.
+ * @param pname  program name shown directly after "Usage: "
+ */
+void print_usage( const char* pname ){
+    const char* const arg_synopsis =
+        " <XMFA alignment> <window length> <window shift amount> <base output filename>\n";
+    cerr << "Usage: ";
+    cerr << pname;
+    cerr << arg_synopsis;
+}
+
+/**
+ * Entry point: cuts every aligned interval of an XMFA alignment into sliding
+ * windows and writes each window, plus the whole interval ("lcb.mfa"), as
+ * Multi-FastA files below <base output filename>/interval_<N>/.
+ *
+ * Arguments: <XMFA alignment> <window length> <window shift amount>
+ *            <base output filename>
+ * Returns 0 on success, -1 on a usage/input error, -2 when a window file and
+ * -3 when an lcb.mfa file cannot be opened.
+ */
+int main( int argc, const char* argv[] ){
+    if( argc <= 0 ){
+        print_usage( "getAlignmentWindows" );
+        return -1;
+    }
+    if( argc != 5 ){
+        print_usage( argv[0] );
+        return -1;
+    }
+
+    string alignment_fname = argv[1];
+    int64 window_length = atol( argv[2] );
+    int64 shift_length = atol( argv[3] );
+    string output_basename = argv[4];
+
+    // a zero shift never advances the window loop below, and a window needs
+    // a positive length to hold any column
+    if( window_length <= 0 || shift_length <= 0 ){
+        cerr << "Error: window length and window shift amount must be positive integers\n";
+        print_usage( argv[0] );
+        return -1;
+    }
+
+    ifstream alignment_in;
+    alignment_in.open( alignment_fname.c_str() );
+    if( !alignment_in.is_open() ){
+        cerr << "Error opening " << alignment_fname << endl;
+        return -1;
+    }
+
+    IntervalList aligned_ivs;
+    aligned_ivs.ReadStandardAlignment( alignment_in );
+    // aligned_ivs[0] is read right below
+    if( aligned_ivs.size() == 0 ){
+        cerr << "Error: no aligned intervals were read from " << alignment_fname << endl;
+        return -1;
+    }
+    cout << "Read " << aligned_ivs[0].SeqCount() << " sequences with " << aligned_ivs.size() << " aligned intervals from " << alignment_fname << endl;
+    cout.flush();
+    MatchList mlist;
+    mlist.seq_filename = aligned_ivs.seq_filename;
+    if( mlist.seq_filename.size() > 0 )
+        LoadSequences(mlist, &cout);
+    else if( aligned_ivs.size() == 1 )
+    {
+        // no source files are referenced: rebuild each sequence from the
+        // single interval by removing the gap characters from its row
+        mlist.seq_filename.resize( aligned_ivs[0].SeqCount() );
+        mlist.seq_table.resize( aligned_ivs[0].SeqCount() );
+        std::vector< mems::AbstractMatch* > matches;
+        aligned_ivs[0].StealMatches(matches);
+        std::vector< string > seqs = mems::GetAlignment( *((mems::GappedAlignment*)matches[0]), mlist.seq_table );
+        for( size_t seqI = 0; seqI < mlist.seq_table.size(); ++seqI )
+        {
+            boost::algorithm::erase_all( seqs[seqI], std::string("-") );
+            mlist.seq_table[seqI] = new gnSequence( seqs[seqI] );
+        }
+        // hand the matches back to the interval they were taken from
+        aligned_ivs[0].SetMatches( matches );
+    }else{
+        // reported, but processing continues with an empty sequence table
+        cerr << "Error, source sequence file references not given\n";
+    }
+    // for each interval, extract sliding windows and write them to Multi-FastA files
+    for( uint ivI = 0; ivI < aligned_ivs.size(); ivI++ )
+    {
+        vector< string > alignment;
+        GetAlignment( aligned_ivs[ivI], mlist.seq_table, alignment );
+        Interval& iv = aligned_ivs[ivI];
+        stringstream ivnum;
+        ivnum << ivI;
+        boost::filesystem::path base_path = output_basename;
+        boost::filesystem::create_directory( base_path );
+        boost::filesystem::path iv_path = output_basename;
+        iv_path /= "interval_" + ivnum.str();
+        boost::filesystem::create_directory( iv_path );
+        for( gnSeqI window_leftend = 0; window_leftend < iv.AlignmentLength(); window_leftend += shift_length )
+        {
+            // the last window is shortened to end with the interval
+            gnSeqI cur_window_size = window_leftend + window_length < iv.AlignmentLength() ? window_length : iv.AlignmentLength() - window_leftend;
+
+            stringstream window_filename;
+            window_filename << "window_" << window_leftend << "_to_" << window_leftend + cur_window_size - 1 << ".mfa";
+            boost::filesystem::path window_path = iv_path;
+            window_path /= window_filename.str();
+            ofstream out_file( window_path.string().c_str() );
+            if( !out_file.is_open() )
+            {
+                cerr << "Error opening \"" << window_filename.str() << "\"\n";
+                return -2;
+            }
+            // write a multi-FastA
+            gnSequence gns;
+            for( uint seqI = 0; seqI < iv.SeqCount(); seqI++ )
+            {
+                stringstream seq_name;
+                seq_name << seqI;
+                gns += alignment[seqI].substr(window_leftend, cur_window_size);
+                gns.setContigName( gns.contigListSize()-1, seq_name.str() );
+            }
+            gnFASSource::Write( gns, out_file, false, false );
+            // a shortened window reached the end of the interval
+            if( cur_window_size < window_length )
+                break;
+        }
+        // now write the whole interval as a single MFA
+        boost::filesystem::path lcb_path = iv_path;
+        lcb_path /= "lcb.mfa";
+        ofstream lcb_out( lcb_path.string().c_str() );
+        if( !lcb_out.is_open() )
+        {
+            cerr << "Error opening " << lcb_path.string() << endl;
+            return -3;
+        }
+        gnSequence fns;
+        for( uint seqI = 0; seqI < iv.SeqCount(); seqI++ )
+        {
+            stringstream seq_name;
+            seq_name << seqI;
+            fns += alignment[seqI];
+            fns.setContigName( fns.contigListSize()-1, seq_name.str() );
+        }
+        gnFASSource::Write( fns, lcb_out, false, false );
+
+    }
+    return 0;
+}
+
diff --git a/src/getOrthologList.cpp b/src/getOrthologList.cpp
new file mode 100644
index 0000000..de8d790
--- /dev/null
+++ b/src/getOrthologList.cpp
@@ -0,0 +1,317 @@
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <string>
+#include <fstream>
+#include <vector>
+#include <algorithm>
+#include <iostream>
+#include <sstream>
+#include "libGenome/gnFilter.h"
+#include "libMems/IntervalList.h"
+#include "libMems/MatchList.h"
+#include "libMems/GappedAlignment.h"
+#include "libMems/Matrix.h"
+#include "libMems/MatchProjectionAdapter.h"
+#include "libMems/Aligner.h"
+#include "libMems/Islands.h"
+#include "libGenome/gnFASSource.h"
+#include <boost/tuple/tuple.hpp>
+#include "libMems/ProgressiveAligner.h"
+#include "libMems/Backbone.h"
+#include "libGenome/gnFeature.h"
+#include "libGenome/gnFASSource.h"
+#include "libMems/DistanceMatrix.h"
+
+using namespace std;
+using namespace genome;
+using namespace mems;
+
+typedef boost::tuple< uint, gnSeqI, gnSeqI, vector< uint > > bbcol_t;
+
+/**
+ * Writes to `out` the value of every "db_xref" qualifier of feature `f` whose
+ * value begins with the four characters "GI: preceded by a double quote.
+ * Nothing is written when the feature has no such qualifier.
+ *
+ * @param out  stream that receives the qualifier values, without separators
+ * @param f    feature whose qualifier list is scanned
+ */
+void printGI( ostream& out, gnBaseFeature* f )
+{
+    const string gi_prefix = "\"GI:";
+    for( size_t idx = 0; idx < f->GetQualifierListLength(); ++idx )
+    {
+        // only cross-reference qualifiers can carry a GI number
+        if( !( f->GetQualifierName(idx) == "db_xref" ) )
+            continue;
+        const string value = f->GetQualifierValue(idx);
+        if( value.compare( 0, gi_prefix.size(), gi_prefix ) == 0 )
+            out << value;
+    }
+}
+
+/**
+ * Averages, over all features in ortho_cds, the ratio between the summed
+ * extent of the feature's intersections with the listed backbone segments
+ * and the extent of the feature's first location.
+ *
+ * The entry of a backbone row that is compared with feature i is the one at
+ * index i, and both of its coordinates are taken as absolute values.
+ *
+ * @param backbone   backbone rows, indexed by the values stored in nway_bb
+ * @param nway_bb    indices of the backbone rows to take into account
+ * @param ortho_cds  one feature per sequence; every entry is dereferenced
+ * @return the mean ratio (sum of the ratios divided by ortho_cds.size())
+ */
+double computeAvgCoverage( vector< bb_seqentry_t >& backbone, vector< size_t >& nway_bb, vector< gnBaseFeature* >& ortho_cds )
+{
+    const size_t cds_total = ortho_cds.size();
+    double coverage_total = 0;
+    for( size_t cdsI = 0; cdsI != cds_total; ++cdsI )
+    {
+        gnLocation cds_loc = ortho_cds[cdsI]->GetLocation(0);
+        double covered = 0;
+        for( vector< size_t >::const_iterator bb_it = nway_bb.begin(); bb_it != nway_bb.end(); ++bb_it )
+        {
+            gnLocation bb_loc;
+            bb_loc.SetStart( absolut( backbone[*bb_it][cdsI].first ) );
+            bb_loc.SetEnd( absolut( backbone[*bb_it][cdsI].second ) );
+            gnLocation shared = cds_loc.GetIntersection( bb_loc, gnLocation::determinedRegions );
+            covered += shared.GetEnd() - shared.GetStart();
+        }
+        coverage_total += covered / (double)( cds_loc.GetEnd() - cds_loc.GetStart() );
+    }
+    return coverage_total / ((double)cds_total);
+}
+
+
+/**
+ * Entry point: for every CDS feature of the reference genome that intersects
+ * backbone present in all genomes, extracts the matching alignment columns,
+ * writes them as FastA and looks for a CDS feature in each of the other
+ * genomes that overlaps the same columns.  When every genome has such a
+ * feature, one line is added to the ortholog table holding the GI of each
+ * feature, the average backbone coverage, the average pairwise identity and
+ * a "*" when the gene overlaps more than one aligned interval.
+ *
+ * Arguments: <input xmfa> <backbone seq file> <reference genome>
+ *            <CDS ortholog filename> <CDS alignment base name>
+ * Returns 0 on success, -1 on a usage error, an unreadable alignment or an
+ * out-of-range reference genome, -2 when the backbone file and -3 when the
+ * ortholog output file cannot be opened.
+ */
+int main( int argc, char* argv[] )
+{
+    if( argc < 6 )
+    {
+        cerr << "Usage: getOrthologList <input xmfa> <backbone seq file> <reference genome> <CDS ortholog filename> <CDS alignment base name>\n";
+        return -1;
+    }
+    ifstream aln_in;
+    aln_in.open( argv[1] );
+    if( !aln_in.is_open() ){
+        cerr << "Error opening " << argv[1] << endl;
+        return -1;
+    }
+    uint sgI = atoi( argv[3] );
+    string ortho_fname = argv[4];
+    string output_base = argv[5];
+
+    IntervalList input_ivs;
+    input_ivs.ReadStandardAlignment( aln_in );
+    aln_in.close();
+    LoadSequences( input_ivs, &cout );
+
+    size_t seq_count = input_ivs.seq_table.size();
+
+    // sgI indexes seq_table and every backbone row used below
+    if( sgI >= seq_count )
+    {
+        cerr << "Error: reference genome " << sgI << " is out of range, the alignment has " << seq_count << " sequences\n";
+        return -1;
+    }
+
+    vector< bb_seqentry_t > backbone;
+    ifstream bb_in;
+    bb_in.open( argv[2] );
+    if( !bb_in.is_open() ){
+        cerr << "Error opening \"" << argv[2] << "\"" << endl;
+        return -2;
+    }
+    readBackboneSeqFile( bb_in, backbone );
+    bb_in.close();
+
+    ofstream ortho_out( ortho_fname.c_str() );
+    if( !ortho_out.is_open() )
+    {
+        cerr << "Error opening \"" << ortho_fname << "\"\n";
+        return -3;
+    }
+
+    // collect the CDS features of the reference genome, releasing the rest
+    gnSequence* gen0 = input_ivs.seq_table[sgI];
+    vector< gnBaseFeature* > genes;
+    for( size_t featI = 0; featI < gen0->getFeatureListLength(); featI++ )
+    {
+        gnBaseFeature* feat = gen0->getFeature(featI);
+        if( feat->GetName() == "CDS" )
+            genes.push_back( feat );
+        else
+            delete feat;
+    }
+
+    cout << genes.size() << " of the " << gen0->getFeatureListLength() << " annotated features are CDS\n";
+
+    size_t ortho_count = 0;
+    size_t rr_count = 0;
+    size_t partial_rr_annotated = 0;
+
+    ortho_out << "OrthoID";
+    for( size_t seqI = 0; seqI < seq_count; seqI++ )
+        ortho_out << "\tGI_in_Genome_" << seqI;
+    ortho_out << "\tCoverage\tIdentity\tRearranged\n";
+
+    // for each CDS of the reference genome, extract the alignment and write it to a file
+    for( size_t geneI = 0; geneI < genes.size(); geneI++ )
+    {
+        // is this gene part of N-way backbone?
+        gnLocation loc = genes[geneI]->GetLocation(0);
+        int64 lend = loc.GetFirst();
+        int64 rend = loc.GetLast();
+        vector< size_t > intersecting_bb;
+        for( size_t bbI = 0; bbI < backbone.size(); bbI++ )
+        {
+            if( (absolut(backbone[bbI][sgI].first) <= lend && lend <= absolut(backbone[bbI][sgI].second)) ||
+                (absolut(backbone[bbI][sgI].first) <= rend && rend <= absolut(backbone[bbI][sgI].second)) ||
+                (lend <= absolut(backbone[bbI][sgI].first) && absolut(backbone[bbI][sgI].first) <= rend) )
+                intersecting_bb.push_back(bbI);
+        }
+        // keep only the segments that have non-zero coordinates in every genome
+        vector< size_t > nway_bb;
+        for( size_t bbI = 0; bbI < intersecting_bb.size(); bbI++ )
+        {
+            size_t seqI = 0;
+            for( ; seqI < input_ivs.seq_table.size(); ++seqI )
+            {
+                if( backbone[intersecting_bb[bbI]][seqI].first == 0 || backbone[intersecting_bb[bbI]][seqI].second == 0 )
+                    break;
+            }
+            if( seqI == input_ivs.seq_table.size() )
+                nway_bb.push_back(intersecting_bb[bbI]);
+        }
+
+        // skip to the next CDS if this one wasn't part of some n-way backbone
+        if( nway_bb.size() == 0 )
+            continue;
+
+        // use the alignment to find CDS that overlap in this region
+
+        // extract the alignment
+        size_t ivI = 0;
+        // identify the interval that has the biggest intersection
+        vector< pair< size_t, size_t > > iv_overlap;
+        for( ivI = 0; ivI < input_ivs.size(); ivI++ )
+        {
+            if( input_ivs[ivI].Start(sgI) != NO_MATCH )
+            {
+                size_t inter_size = 0;
+                for( size_t bbI = 0; bbI < nway_bb.size(); bbI++ )
+                {
+                    gnLocation loc1;
+                    loc1.SetStart( input_ivs[ivI].LeftEnd(sgI) );
+                    loc1.SetEnd( input_ivs[ivI].RightEnd(sgI) );
+                    gnLocation loc2;
+                    loc2.SetStart( absolut(backbone[nway_bb[bbI]][sgI].first) );
+                    loc2.SetEnd( absolut(backbone[nway_bb[bbI]][sgI].second) );
+                    gnLocation intloc = loc1.GetIntersection( loc2, gnLocation::determinedRegions );
+                    gnLocation intloc2 = intloc.GetIntersection( loc, gnLocation::determinedRegions );
+                    inter_size += intloc2.GetEnd() - intloc2.GetStart();
+                }
+                if( inter_size > 0 )
+                    iv_overlap.push_back( make_pair( inter_size, ivI ) );
+            }
+        }
+        bool partial_rr = false;
+        // ascending sort: the interval with the largest overlap ends up last
+        std::sort( iv_overlap.begin(), iv_overlap.end() );
+        if( iv_overlap.size() == 0 )
+        {
+            cerr << "Warning: unable to assign gene to an interval!\n" << "coordinates: " << lend << '\t' << rend << endl;
+            continue;
+        }else{
+            ivI = iv_overlap.back().second;
+            // a gene that overlaps several intervals counts as rearranged
+            if( iv_overlap.size() > 1 )
+            {
+                partial_rr = true;
+                rr_count++;
+            }
+        }
+        CompactGappedAlignment<> iv_cga(input_ivs[ivI]);
+        CompactGappedAlignment<> col_cga;
+        gnLocation loc1;
+        loc1.SetStart( input_ivs[ivI].LeftEnd(sgI) );
+        loc1.SetEnd( input_ivs[ivI].RightEnd(sgI) );
+        gnLocation intloc = loc1.GetIntersection( loc, gnLocation::determinedRegions );
+        gnSeqI lcol = iv_cga.SeqPosToColumn( sgI, intloc.GetStart() );
+        gnSeqI rcol = iv_cga.SeqPosToColumn( sgI, intloc.GetEnd() );
+        if( rcol < lcol )
+            swap( rcol, lcol ); // handle reverse complement
+        iv_cga.copyRange(col_cga, lcol, rcol-lcol + 1);
+        vector< string > aln;
+        GetAlignment( col_cga, input_ivs.seq_table, aln );
+        gnSequence gene_aln;
+        for( size_t i = 0; i < aln.size(); i++ )
+        {
+            gene_aln += aln[i];
+            stringstream ss;
+            ss << "seq" << i;
+            gene_aln.setContigName(i, ss.str());
+        }
+
+        // NOTE(review): the file name uses ortho_count, which only advances
+        // for genes with a CDS in every genome, so files of other genes get
+        // overwritten by the next gene -- confirm this is intended
+        stringstream of_name;
+        of_name << output_base << "_" << ortho_count << ".fas";
+        gnFASSource::Write( gene_aln, of_name.str() );
+
+        // find orthologous CDS features...
+        vector< gnBaseFeature* > ortho_cds( seq_count, NULL );
+        size_t ocds_count = 0;
+        for( size_t seqI = 0; seqI < input_ivs.seq_table.size(); seqI++ )
+        {
+            gnLocation seqloc;
+            seqloc.SetStart(col_cga.LeftEnd(seqI));
+            seqloc.SetEnd(col_cga.RightEnd(seqI));
+            vector< gnBaseFeature* > int_feats;
+            vector< uint32 > indie;
+            input_ivs.seq_table[seqI]->getIntersectingFeatures( seqloc, int_feats, indie );
+            // (largest backbone overlap, index into int_feats) per CDS candidate
+            vector< pair< gnSeqI, size_t > > overlap_frac;
+            for( size_t featI = 0; featI < int_feats.size(); featI++ )
+            {
+                if( int_feats[featI]->GetName() == "CDS" )
+                {
+                    gnLocation l = seqloc.GetIntersection( int_feats[featI]->GetLocation(0), gnLocation::determinedRegions );
+                    size_t max_bb = 0;
+                    for( size_t bbI = 0; bbI < nway_bb.size(); bbI++ )
+                    {
+                        gnLocation bbloc;
+                        bbloc.SetBounds( absolut(backbone[nway_bb[bbI]][seqI].first), absolut(backbone[nway_bb[bbI]][seqI].second) );
+                        gnLocation l2 = bbloc.GetIntersection( l, gnLocation::determinedRegions );
+                        if( l2.GetEnd() - l2.GetStart() > max_bb )
+                            max_bb = l2.GetEnd() - l2.GetStart();
+                    }
+                    overlap_frac.push_back( make_pair( max_bb, featI ) );
+                }else
+                    delete int_feats[featI];
+            }
+            std::sort( overlap_frac.begin(), overlap_frac.end() );
+            if( overlap_frac.size() > 0 )
+            {
+                ortho_cds[seqI] = int_feats[ overlap_frac.back().second ];
+                ocds_count++;
+                // the remaining CDS candidates are released here; only the
+                // chosen one is kept and freed at the end of the iteration
+                for( size_t candI = 0; candI + 1 < overlap_frac.size(); candI++ )
+                    delete int_feats[ overlap_frac[candI].second ];
+            }
+        }
+
+        if( ocds_count == seq_count )
+        {
+            ortho_out << ortho_count;
+            for( size_t i = 0; i < seq_count; i++ )
+            {
+                ortho_out << '\t';
+                printGI( ortho_out, ortho_cds[i] );
+            }
+
+            double cov = computeAvgCoverage( backbone, nway_bb, ortho_cds );
+            ortho_out << '\t' << cov;
+            NumericMatrix<double> identity;
+            vector< AbstractMatch* > amvec( 1, &col_cga );
+            BackboneIdentityMatrix( amvec, input_ivs.seq_table, identity );
+            // average the identity over all unordered pairs of genomes
+            double id = 0;
+            for( size_t i = 0; i < seq_count; i++ )
+                for( size_t j = i+1; j < seq_count; j++ )
+                    id += identity(i,j);
+            id /= (double)(seq_count * (seq_count-1)) / 2.0;
+
+            ortho_out << '\t' << id;
+            ortho_out << '\t';
+            if( partial_rr )
+            {
+                partial_rr_annotated++;
+                ortho_out << "*";
+            }
+            ortho_out << endl;
+            ortho_count++;
+        }
+        for( size_t oI = 0; oI < ortho_cds.size(); oI++ )
+            if( ortho_cds[oI] != NULL )
+                delete ortho_cds[oI];
+
+    }
+    cout << ortho_count << " out of " << genes.size() << " genes were at least partially conserved\n";
+    cout << rr_count << " CDS appear to be broken by rearrangement, of which " << partial_rr_annotated << " are still annotated as CDS in all genomes\n";
+    return 0;
+}
+
diff --git a/src/getopt.c b/src/getopt.c
new file mode 100644
index 0000000..3a85480
--- /dev/null
+++ b/src/getopt.c
@@ -0,0 +1,1279 @@
+/* Getopt for GNU.
+ NOTE: getopt is now part of the C library, so if you don't know what
+ "Keep this file name-space clean" means, talk to drepper at gnu.org
+ before changing it!
+ Copyright (C) 1987,88,89,90,91,92,93,94,95,96,98,99,2000,2001,2002
+ Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+/* This tells Alpha OSF/1 not to define a getopt prototype in <stdio.h>.
+ Ditto for AIX 3.2 and <stdlib.h>. */
+#ifndef _NO_PROTO
+# define _NO_PROTO
+#endif
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#if !defined __STDC__ || !__STDC__
+/* This is a separate conditional since some stdc systems
+ reject `defined (const)'. */
+# ifndef const
+# define const
+# endif
+#endif
+
+#include <stdio.h>
+
+/* Comment out all this code if we are using the GNU C Library, and are not
+ actually compiling the library itself. This code is part of the GNU C
+ Library, but also included in many other GNU distributions. Compiling
+ and linking in this code is a waste when using the GNU C library
+ (especially if it is a shared library). Rather than having every GNU
+ program understand `configure --with-gnu-libc' and omit the object files,
+ it is simpler to just do this in the source for each such file. */
+
+#define GETOPT_INTERFACE_VERSION 2
+#if !defined _LIBC && defined __GLIBC__ && __GLIBC__ >= 2
+# include <gnu-versions.h>
+# if _GNU_GETOPT_INTERFACE_VERSION == GETOPT_INTERFACE_VERSION
+# define ELIDE_CODE
+# endif
+#endif
+
+#ifndef ELIDE_CODE
+
+
+/* This needs to come after some library #include
+ to get __GNU_LIBRARY__ defined. */
+#ifdef __GNU_LIBRARY__
+/* Don't include stdlib.h for non-GNU C libraries because some of them
+ contain conflicting prototypes for getopt. */
+# include <stdlib.h>
+# include <unistd.h>
+#endif /* GNU C library. */
+
+#ifdef VMS
+# include <unixlib.h>
+# if HAVE_STRING_H - 0
+# include <string.h>
+# endif
+#endif
+
+#ifndef _
+/* This is for other GNU distributions with internationalized messages. */
+# if (HAVE_LIBINTL_H && ENABLE_NLS) || defined _LIBC
+# include <libintl.h>
+# ifndef _
+# define _(msgid) gettext (msgid)
+# endif
+# else
+# define _(msgid) (msgid)
+# endif
+# if defined _LIBC && defined USE_IN_LIBIO
+# include <wchar.h>
+# endif
+#endif
+
+#ifndef attribute_hidden
+# define attribute_hidden
+#endif
+
+/* This version of `getopt' appears to the caller like standard Unix `getopt'
+ but it behaves differently for the user, since it allows the user
+ to intersperse the options with the other arguments.
+
+ As `getopt' works, it permutes the elements of ARGV so that,
+ when it is done, all the options precede everything else. Thus
+ all application programs are extended to handle flexible argument order.
+
+ Setting the environment variable POSIXLY_CORRECT disables permutation.
+ Then the behavior is completely standard.
+
+ GNU application programs can use a third alternative mode in which
+ they can distinguish the relative order of options and other arguments. */
+
+#include "getopt.h"
+
+/* For communication from `getopt' to the caller.
+ When `getopt' finds an option that takes an argument,
+ the argument value is returned here.
+ Also, when `ordering' is RETURN_IN_ORDER,
+ each non-option ARGV-element is returned here. */
+
+char *optarg;
+
+/* Index in ARGV of the next element to be scanned.
+ This is used for communication to and from the caller
+ and for communication between successive calls to `getopt'.
+
+ On entry to `getopt', zero means this is the first call; initialize.
+
+ When `getopt' returns -1, this is the index of the first of the
+ non-option elements that the caller should itself scan.
+
+ Otherwise, `optind' communicates from one call to the next
+ how much of ARGV has been scanned so far. */
+
+/* 1003.2 says this must be 1 before any call. */
+int optind = 1;
+
+/* Formerly, initialization of getopt depended on optind==0, which
+ causes problems with re-calling getopt as programs generally don't
+ know that. */
+
+int __getopt_initialized attribute_hidden;
+
+/* The next char to be scanned in the option-element
+ in which the last option character we returned was found.
+ This allows us to pick up the scan where we left off.
+
+ If this is zero, or a null string, it means resume the scan
+ by advancing to the next ARGV-element. */
+
+static char *nextchar;
+
+/* Callers store zero here to inhibit the error message
+ for unrecognized options. */
+
+int opterr = 1;
+
+/* Set to an option character which was unrecognized.
+ This must be initialized on some systems to avoid linking in the
+ system's own getopt implementation. */
+
+int optopt = '?';
+
+/* Describe how to deal with options that follow non-option ARGV-elements.
+
+ If the caller did not specify anything,
+ the default is REQUIRE_ORDER if the environment variable
+ POSIXLY_CORRECT is defined, PERMUTE otherwise.
+
+ REQUIRE_ORDER means don't recognize them as options;
+ stop option processing when the first non-option is seen.
+ This is what Unix does.
+ This mode of operation is selected by either setting the environment
+ variable POSIXLY_CORRECT, or using `+' as the first character
+ of the list of option characters.
+
+ PERMUTE is the default. We permute the contents of ARGV as we scan,
+ so that eventually all the non-options are at the end. This allows options
+ to be given in any order, even with programs that were not written to
+ expect this.
+
+ RETURN_IN_ORDER is an option available to programs that were written
+ to expect options and other ARGV-elements in any order and that care about
+ the ordering of the two. We describe each non-option ARGV-element
+ as if it were the argument of an option with character code 1.
+ Using `-' as the first character of the list of option characters
+ selects this mode of operation.
+
+ The special argument `--' forces an end of option-scanning regardless
+ of the value of `ordering'. In the case of RETURN_IN_ORDER, only
+ `--' can cause `getopt' to return -1 with `optind' != ARGC. */
+
+static enum
+{
+ REQUIRE_ORDER, PERMUTE, RETURN_IN_ORDER
+} ordering;
+
+/* Value of POSIXLY_CORRECT environment variable. */
+static char *posixly_correct;
+
+#ifdef __GNU_LIBRARY__
+/* We want to avoid inclusion of string.h with non-GNU libraries
+ because there are many ways it can cause trouble.
+ On some systems, it contains special magic macros that don't work
+ in GCC. */
+# include <string.h>
+# define my_index strchr
+#else
+
+# if HAVE_STRING_H
+# include <string.h>
+# else
+# ifndef WIN32
+# include <strings.h>
+# endif
+# endif
+
+/* Avoid depending on library functions or files
+ whose names are inconsistent. */
+
+#ifndef getenv
+extern char *getenv ();
+#endif
+
+/* Return a pointer to the first occurrence of CHR in STR, or a null
+   pointer when CHR does not occur before the terminating null character.
+   The terminator itself is never matched.  */
+static char *
+my_index (str, chr)
+     const char *str;
+     int chr;
+{
+  const char *cursor;
+
+  for (cursor = str; *cursor != '\0'; cursor++)
+    if (*cursor == chr)
+      return (char *) cursor;
+
+  return 0;
+}
+
+/* If using GCC, we can safely declare strlen this way.
+ If not using GCC, it is ok not to declare it. */
+#ifdef __GNUC__
+/* Note that Motorola Delta 68k R3V7 comes with GCC but not stddef.h.
+ That was relevant to code that was here before. */
+# if (!defined __STDC__ || !__STDC__) && !defined strlen
+/* gcc with -traditional declares the built-in strlen to return int,
+ and has done so at least since version 2.4.5. -- rms. */
+extern int strlen (const char *);
+# endif /* not __STDC__ */
+#endif /* __GNUC__ */
+
+#endif /* not __GNU_LIBRARY__ */
+
+/* Handle permutation of arguments. */
+
+/* Describe the part of ARGV that contains non-options that have
+ been skipped. `first_nonopt' is the index in ARGV of the first of them;
+ `last_nonopt' is the index after the last of them. */
+
+static int first_nonopt;
+static int last_nonopt;
+
+#ifdef _LIBC
+/* Stored original parameters.
+ XXX This is no good solution. We should rather copy the args so
+ that we can compare them later. But we must not use malloc(3). */
+extern int __libc_argc;
+extern char **__libc_argv;
+
+/* Bash 2.0 gives us an environment variable containing flags
+ indicating ARGV elements that should not be considered arguments. */
+
+# ifdef USE_NONOPTION_FLAGS
+/* Defined in getopt_init.c */
+extern char *__getopt_nonoption_flags;
+
+static int nonoption_flags_max_len;
+static int nonoption_flags_len;
+# endif
+
+# ifdef USE_NONOPTION_FLAGS
+# define SWAP_FLAGS(ch1, ch2) \
+ if (nonoption_flags_len > 0) \
+ { \
+ char __tmp = __getopt_nonoption_flags[ch1]; \
+ __getopt_nonoption_flags[ch1] = __getopt_nonoption_flags[ch2]; \
+ __getopt_nonoption_flags[ch2] = __tmp; \
+ }
+# else
+# define SWAP_FLAGS(ch1, ch2)
+# endif
+#else /* !_LIBC */
+# define SWAP_FLAGS(ch1, ch2)
+#endif /* _LIBC */
+
+/* Exchange two adjacent subsequences of ARGV.
+ One subsequence is elements [first_nonopt,last_nonopt)
+ which contains all the non-options that have been skipped so far.
+ The other is elements [last_nonopt,optind), which contains all
+ the options processed since those non-options were skipped.
+
+ `first_nonopt' and `last_nonopt' are relocated so that they describe
+ the new indices of the non-options in ARGV after they are moved. */
+
+#if defined __STDC__ && __STDC__
+static void exchange (char **);
+#endif
+
+/* ARGV is the argument vector that is permuted in place.  The bounds of
+   the two segments are not passed in; they are read from the file-scope
+   variables first_nonopt, last_nonopt and optind, and first_nonopt and
+   last_nonopt are rewritten before returning.  */
+static void
+exchange (argv)
+ char **argv;
+{
+ /* [bottom,middle) holds the skipped non-options and [middle,top) the
+ options that were processed after them. */
+ int bottom = first_nonopt;
+ int middle = last_nonopt;
+ int top = optind;
+ char *tem;
+
+ /* Exchange the shorter segment with the far end of the longer segment.
+ That puts the shorter segment into the right place.
+ It leaves the longer segment in the right place overall,
+ but it consists of two parts that need to be swapped next. */
+
+#if defined _LIBC && defined USE_NONOPTION_FLAGS
+ /* First make sure the handling of the `__getopt_nonoption_flags'
+ string can work normally. Our top argument must be in the range
+ of the string. */
+ if (nonoption_flags_len > 0 && top >= nonoption_flags_max_len)
+ {
+ /* We must extend the array. The user plays games with us and
+ presents new arguments. */
+ char *new_str = malloc (top + 1);
+ /* If the allocation fails, zero both lengths; SWAP_FLAGS then does
+ nothing because it tests nonoption_flags_len > 0. */
+ if (new_str == NULL)
+ nonoption_flags_len = nonoption_flags_max_len = 0;
+ else
+ {
+ /* Copy the old flags and zero-fill the newly added tail. */
+ memset (__mempcpy (new_str, __getopt_nonoption_flags,
+ nonoption_flags_max_len),
+ '\0', top + 1 - nonoption_flags_max_len);
+ nonoption_flags_max_len = top + 1;
+ __getopt_nonoption_flags = new_str;
+ }
+ }
+#endif
+
+ /* Each pass shrinks the range still to be processed from one side, so
+ the loop ends as soon as either remaining segment is empty. */
+ while (top > middle && middle > bottom)
+ {
+ if (top - middle > middle - bottom)
+ {
+ /* Bottom segment is the short one. */
+ int len = middle - bottom;
+ register int i;
+
+ /* Swap it with the top part of the top segment. */
+ for (i = 0; i < len; i++)
+ {
+ tem = argv[bottom + i];
+ argv[bottom + i] = argv[top - (middle - bottom) + i];
+ argv[top - (middle - bottom) + i] = tem;
+ SWAP_FLAGS (bottom + i, top - (middle - bottom) + i);
+ }
+ /* Exclude the moved bottom segment from further swapping. */
+ top -= len;
+ }
+ else
+ {
+ /* Top segment is the short one. */
+ int len = top - middle;
+ register int i;
+
+ /* Swap it with the bottom part of the bottom segment. */
+ for (i = 0; i < len; i++)
+ {
+ tem = argv[bottom + i];
+ argv[bottom + i] = argv[middle + i];
+ argv[middle + i] = tem;
+ SWAP_FLAGS (bottom + i, middle + i);
+ }
+ /* Exclude the moved top segment from further swapping. */
+ bottom += len;
+ }
+ }
+
+ /* Update records for the slots the non-options now occupy. */
+
+ /* The non-options moved up by the number of options that followed them
+ (optind - last_nonopt) and now end where scanning stopped. */
+ first_nonopt += (optind - last_nonopt);
+ last_nonopt = optind;
+}
+
+/* Initialize the internal data when the first call is made. */
+
+#if defined __STDC__ && __STDC__
+static const char *_getopt_initialize (int, char *const *, const char *);
+#endif
+static const char *
+_getopt_initialize (argc, argv, optstring) /* first-call setup; returns OPTSTRING past any leading '-' or '+' */
+ int argc; /* referenced only in the _LIBC && USE_NONOPTION_FLAGS section below */
+ char *const *argv; /* referenced only in the _LIBC && USE_NONOPTION_FLAGS section below */
+ const char *optstring; /* option spec; its first character may select the ordering mode */
+{
+ /* Start processing options with ARGV-element 1 (since ARGV-element 0
+ is the program name); the sequence of previously skipped
+ non-option ARGV-elements is empty. */
+
+ first_nonopt = last_nonopt = optind; /* empty non-option range at the current scan position */
+
+ nextchar = NULL; /* no option element is partially scanned */
+
+ posixly_correct = getenv ("POSIXLY_CORRECT"); /* non-NULL selects REQUIRE_ORDER unless OPTSTRING overrides it */
+
+ /* Determine how to handle the ordering of options and nonoptions. */
+
+ if (optstring[0] == '-')
+ {
+ ordering = RETURN_IN_ORDER;
+ ++optstring; /* the mode character is not itself an option */
+ }
+ else if (optstring[0] == '+')
+ {
+ ordering = REQUIRE_ORDER;
+ ++optstring; /* the mode character is not itself an option */
+ }
+ else if (posixly_correct != NULL)
+ ordering = REQUIRE_ORDER;
+ else
+ ordering = PERMUTE;
+
+#if defined _LIBC && defined USE_NONOPTION_FLAGS
+ if (posixly_correct == NULL
+ && argc == __libc_argc && argv == __libc_argv) /* flags are honoured only for the process's own argument vector */
+ {
+ if (nonoption_flags_max_len == 0) /* 0 means the flag string has not been examined yet */
+ {
+ if (__getopt_nonoption_flags == NULL
+ || __getopt_nonoption_flags[0] == '\0')
+ nonoption_flags_max_len = -1; /* nothing usable; -1 (not 0) so this probe is not repeated */
+ else
+ {
+ const char *orig_str = __getopt_nonoption_flags;
+ int len = nonoption_flags_max_len = strlen (orig_str);
+ if (nonoption_flags_max_len < argc)
+ nonoption_flags_max_len = argc; /* make the copy cover every argv index */
+ __getopt_nonoption_flags =
+ (char *) malloc (nonoption_flags_max_len); /* private copy of the flag string */
+ if (__getopt_nonoption_flags == NULL)
+ nonoption_flags_max_len = -1; /* allocation failed: treat as no flags */
+ else
+ memset (__mempcpy (__getopt_nonoption_flags, orig_str, len), /* copy the original flags ... */
+ '\0', nonoption_flags_max_len - len); /* ... and zero-fill the remainder */
+ }
+ }
+ nonoption_flags_len = nonoption_flags_max_len;
+ }
+ else
+ nonoption_flags_len = 0; /* POSIX mode or a different argv: ignore the flags */
+#endif
+
+ return optstring;
+}
+
+/* Scan elements of ARGV (whose length is ARGC) for option characters
+ given in OPTSTRING.
+
+ If an element of ARGV starts with '-', and is not exactly "-" or "--",
+ then it is an option element. The characters of this element
+ (aside from the initial '-') are option characters. If `getopt'
+ is called repeatedly, it returns successively each of the option characters
+ from each of the option elements.
+
+ If `getopt' finds another option character, it returns that character,
+ updating `optind' and `nextchar' so that the next call to `getopt' can
+ resume the scan with the following option character or ARGV-element.
+
+ If there are no more option characters, `getopt' returns -1.
+ Then `optind' is the index in ARGV of the first ARGV-element
+ that is not an option. (The ARGV-elements have been permuted
+ so that those that are not options now come last.)
+
+ OPTSTRING is a string containing the legitimate option characters.
+ If an option character is seen that is not listed in OPTSTRING,
+ return '?' after printing an error message. If you set `opterr' to
+ zero, the error message is suppressed but we still return '?'.
+
+ If a char in OPTSTRING is followed by a colon, that means it wants an arg,
+ so the following text in the same ARGV-element, or the text of the following
+ ARGV-element, is returned in `optarg'. Two colons mean an option that
+ wants an optional arg; if there is text in the current ARGV-element,
+ it is returned in `optarg', otherwise `optarg' is set to zero.
+
+ If OPTSTRING starts with `-' or `+', it requests different methods of
+ handling the non-option ARGV-elements.
+ See the comments about RETURN_IN_ORDER and REQUIRE_ORDER, above.
+
+ Long-named options begin with `--' instead of `-'.
+ Their names may be abbreviated as long as the abbreviation is unique
+ or is an exact match for some defined option. If they have an
+ argument, it follows the option name in the same ARGV-element, separated
+ from the option name by a `=', or else in the next ARGV-element.
+ When `getopt' finds a long-named option, it returns 0 if that option's
+ `flag' field is nonzero, the value of the option's `val' field
+ if the `flag' field is zero.
+
+ The elements of ARGV aren't really const, because we permute them.
+ But we pretend they're const in the prototype to be compatible
+ with other systems.
+
+ LONGOPTS is a vector of `struct option' terminated by an
+ element containing a name which is zero.
+
+ LONGIND returns the index in LONGOPTS of the long-named option found.
+ It is only valid when a long-named option has been found by the most
+ recent call.
+
+ If LONG_ONLY is nonzero, '-' as well as '--' can introduce
+ long-named options. */
+
+int
+_getopt_internal (argc, argv, optstring, longopts, longind, long_only)
+ int argc; /* number of elements in ARGV */
+ char *const *argv; /* argument vector; permuted in place when ordering is PERMUTE */
+ const char *optstring; /* short-option spec, with an optional leading '-', '+' or ':' */
+ const struct option *longopts; /* long-option table, or NULL when called from plain getopt */
+ int *longind; /* if non-NULL, receives the index of the matched long option */
+ int long_only; /* nonzero: a single '-' may also introduce a long option */
+{
+ int print_errors = opterr;
+ if (optstring[0] == ':') /* a leading ':' silences the diagnostics */
+ print_errors = 0;
+
+ if (argc < 1)
+ return -1;
+
+ optarg = NULL;
+
+ if (optind == 0 || !__getopt_initialized)
+ {
+ if (optind == 0)
+ optind = 1; /* Don't scan ARGV[0], the program name. */
+ optstring = _getopt_initialize (argc, argv, optstring);
+ __getopt_initialized = 1;
+ }
+
+ /* Test whether ARGV[optind] points to a non-option argument.
+ Either it does not have option syntax, or there is an environment flag
+ from the shell indicating it is not an option. The latter information
+ is only used when this code is built as part of the GNU libc. */
+#if defined _LIBC && defined USE_NONOPTION_FLAGS
+# define NONOPTION_P (argv[optind][0] != '-' || argv[optind][1] == '\0' \
+ || (optind < nonoption_flags_len \
+ && __getopt_nonoption_flags[optind] == '1'))
+#else
+# define NONOPTION_P (argv[optind][0] != '-' || argv[optind][1] == '\0')
+#endif
+
+ if (nextchar == NULL || *nextchar == '\0')
+ {
+ /* Advance to the next ARGV-element. */
+
+ /* Give FIRST_NONOPT & LAST_NONOPT rational values if OPTIND has been
+ moved back by the user (who may also have changed the arguments). */
+ if (last_nonopt > optind)
+ last_nonopt = optind;
+ if (first_nonopt > optind)
+ first_nonopt = optind;
+
+ if (ordering == PERMUTE)
+ {
+ /* If we have just processed some options following some non-options,
+ exchange them so that the options come first. */
+
+ if (first_nonopt != last_nonopt && last_nonopt != optind)
+ exchange ((char **) argv);
+ else if (last_nonopt != optind)
+ first_nonopt = optind;
+
+ /* Skip any additional non-options
+ and extend the range of non-options previously skipped. */
+
+ while (optind < argc && NONOPTION_P)
+ optind++;
+ last_nonopt = optind;
+ }
+
+ /* The special ARGV-element `--' means premature end of options.
+ Skip it like a null option,
+ then exchange with previous non-options as if it were an option,
+ then skip everything else like a non-option. */
+
+ if (optind != argc && !strcmp (argv[optind], "--"))
+ {
+ optind++;
+
+ if (first_nonopt != last_nonopt && last_nonopt != optind)
+ exchange ((char **) argv);
+ else if (first_nonopt == last_nonopt)
+ first_nonopt = optind;
+ last_nonopt = argc;
+
+ optind = argc;
+ }
+
+ /* If we have done all the ARGV-elements, stop the scan
+ and back over any non-options that we skipped and permuted. */
+
+ if (optind == argc)
+ {
+ /* Set the next-arg-index to point at the non-options
+ that we previously skipped, so the caller will digest them. */
+ if (first_nonopt != last_nonopt)
+ optind = first_nonopt;
+ return -1;
+ }
+
+ /* If we have come to a non-option and did not permute it,
+ either stop the scan or describe it to the caller and pass it by. */
+
+ if (NONOPTION_P)
+ {
+ if (ordering == REQUIRE_ORDER)
+ return -1;
+ optarg = argv[optind++];
+ return 1;
+ }
+
+ /* We have found another option-ARGV-element.
+ Skip the initial punctuation. */
+
+ nextchar = (argv[optind] + 1
+ + (longopts != NULL && argv[optind][1] == '-'));
+ }
+
+ /* Decode the current option-ARGV-element. */
+
+ /* Check whether the ARGV-element is a long option.
+
+ If long_only and the ARGV-element has the form "-f", where f is
+ a valid short option, don't consider it an abbreviated form of
+ a long option that starts with f. Otherwise there would be no
+ way to give the -f short option.
+
+ On the other hand, if there's a long option "fubar" and
+ the ARGV-element is "-fu", do consider that an abbreviation of
+ the long option, just like "--fu", and not "-f" with arg "u".
+
+ This distinction seems to be the most useful approach. */
+
+ if (longopts != NULL
+ && (argv[optind][1] == '-'
+ || (long_only && (argv[optind][2] || !my_index (optstring, argv[optind][1])))))
+ {
+ char *nameend;
+ const struct option *p;
+ const struct option *pfound = NULL;
+ int exact = 0;
+ int ambig = 0;
+ int indfound = -1;
+ int option_index;
+
+ for (nameend = nextchar; *nameend && *nameend != '='; nameend++)
+ /* Do nothing. */ ;
+
+ /* Test all long options for either exact match
+ or abbreviated matches. */
+ for (p = longopts, option_index = 0; p->name; p++, option_index++)
+ if (!strncmp (p->name, nextchar, nameend - nextchar))
+ {
+ if ((unsigned int) (nameend - nextchar)
+ == (unsigned int) strlen (p->name))
+ {
+ /* Exact match found. */
+ pfound = p;
+ indfound = option_index;
+ exact = 1;
+ break;
+ }
+ else if (pfound == NULL)
+ {
+ /* First nonexact match found. */
+ pfound = p;
+ indfound = option_index;
+ }
+ else if (long_only
+ || pfound->has_arg != p->has_arg
+ || pfound->flag != p->flag
+ || pfound->val != p->val)
+ /* Second or later nonexact match found. */
+ ambig = 1;
+ }
+
+ if (ambig && !exact)
+ {
+ if (print_errors)
+ {
+#if defined _LIBC && defined USE_IN_LIBIO
+ char *buf;
+
+ if (__asprintf (&buf, _("%s: option `%s' is ambiguous\n"),
+ argv[0], argv[optind]) >= 0)
+ {
+
+ if (_IO_fwide (stderr, 0) > 0)
+ __fwprintf (stderr, L"%s", buf);
+ else
+ fputs (buf, stderr);
+
+ free (buf);
+ }
+#else
+ fprintf (stderr, _("%s: option `%s' is ambiguous\n"),
+ argv[0], argv[optind]);
+#endif
+ }
+ nextchar += strlen (nextchar);
+ optind++;
+ optopt = 0;
+ return '?';
+ }
+
+ if (pfound != NULL)
+ {
+ option_index = indfound;
+ optind++;
+ if (*nameend)
+ {
+ /* Don't test has_arg with >, because some C compilers don't
+ allow it to be used on enums. */
+ if (pfound->has_arg)
+ optarg = nameend + 1;
+ else
+ {
+ if (print_errors)
+ {
+#if defined _LIBC && defined USE_IN_LIBIO
+ char *buf;
+ int n;
+#endif
+
+ if (argv[optind - 1][1] == '-')
+ {
+ /* --option */
+#if defined _LIBC && defined USE_IN_LIBIO
+ n = __asprintf (&buf, _("\
+%s: option `--%s' doesn't allow an argument\n"),
+ argv[0], pfound->name);
+#else
+ fprintf (stderr, _("\
+%s: option `--%s' doesn't allow an argument\n"),
+ argv[0], pfound->name);
+#endif
+ }
+ else
+ {
+ /* +option or -option */
+#if defined _LIBC && defined USE_IN_LIBIO
+ n = __asprintf (&buf, _("\
+%s: option `%c%s' doesn't allow an argument\n"),
+ argv[0], argv[optind - 1][0],
+ pfound->name);
+#else
+ fprintf (stderr, _("\
+%s: option `%c%s' doesn't allow an argument\n"),
+ argv[0], argv[optind - 1][0], pfound->name);
+#endif
+ }
+
+#if defined _LIBC && defined USE_IN_LIBIO
+ if (n >= 0)
+ {
+ if (_IO_fwide (stderr, 0) > 0)
+ __fwprintf (stderr, L"%s", buf);
+ else
+ fputs (buf, stderr);
+
+ free (buf);
+ }
+#endif
+ }
+
+ nextchar += strlen (nextchar);
+
+ optopt = pfound->val;
+ return '?';
+ }
+ }
+ else if (pfound->has_arg == 1)
+ {
+ if (optind < argc)
+ optarg = argv[optind++];
+ else
+ {
+ if (print_errors)
+ {
+#if defined _LIBC && defined USE_IN_LIBIO
+ char *buf;
+
+ if (__asprintf (&buf, _("\
+%s: option `%s' requires an argument\n"),
+ argv[0], argv[optind - 1]) >= 0)
+ {
+ if (_IO_fwide (stderr, 0) > 0)
+ __fwprintf (stderr, L"%s", buf);
+ else
+ fputs (buf, stderr);
+
+ free (buf);
+ }
+#else
+ fprintf (stderr,
+ _("%s: option `%s' requires an argument\n"),
+ argv[0], argv[optind - 1]);
+#endif
+ }
+ nextchar += strlen (nextchar);
+ optopt = pfound->val;
+ return optstring[0] == ':' ? ':' : '?';
+ }
+ }
+ nextchar += strlen (nextchar);
+ if (longind != NULL)
+ *longind = option_index;
+ if (pfound->flag)
+ {
+ *(pfound->flag) = pfound->val;
+ return 0;
+ }
+ return pfound->val;
+ }
+
+ /* Can't find it as a long option. If this is not getopt_long_only,
+ or the option starts with '--' or is not a valid short
+ option, then it's an error.
+ Otherwise interpret it as a short option. */
+ if (!long_only || argv[optind][1] == '-'
+ || my_index (optstring, *nextchar) == NULL)
+ {
+ if (print_errors)
+ {
+#if defined _LIBC && defined USE_IN_LIBIO
+ char *buf;
+ int n;
+#endif
+
+ if (argv[optind][1] == '-')
+ {
+ /* --option */
+#if defined _LIBC && defined USE_IN_LIBIO
+ n = __asprintf (&buf, _("%s: unrecognized option `--%s'\n"),
+ argv[0], nextchar);
+#else
+ fprintf (stderr, _("%s: unrecognized option `--%s'\n"),
+ argv[0], nextchar);
+#endif
+ }
+ else
+ {
+ /* +option or -option */
+#if defined _LIBC && defined USE_IN_LIBIO
+ n = __asprintf (&buf, _("%s: unrecognized option `%c%s'\n"),
+ argv[0], argv[optind][0], nextchar);
+#else
+ fprintf (stderr, _("%s: unrecognized option `%c%s'\n"),
+ argv[0], argv[optind][0], nextchar);
+#endif
+ }
+
+#if defined _LIBC && defined USE_IN_LIBIO
+ if (n >= 0)
+ {
+ if (_IO_fwide (stderr, 0) > 0)
+ __fwprintf (stderr, L"%s", buf);
+ else
+ fputs (buf, stderr);
+
+ free (buf);
+ }
+#endif
+ }
+ nextchar = (char *) "";
+ optind++;
+ optopt = 0;
+ return '?';
+ }
+ }
+
+ /* Look at and handle the next short option-character. */
+
+ {
+ char c = *nextchar++;
+ char *temp = my_index (optstring, c);
+
+ /* Increment `optind' when we start to process its last character. */
+ if (*nextchar == '\0')
+ ++optind;
+
+ if (temp == NULL || c == ':')
+ {
+ if (print_errors)
+ {
+#if defined _LIBC && defined USE_IN_LIBIO
+ char *buf;
+ int n;
+#endif
+
+ if (posixly_correct)
+ {
+ /* 1003.2 specifies the format of this message. */
+#if defined _LIBC && defined USE_IN_LIBIO
+ n = __asprintf (&buf, _("%s: illegal option -- %c\n"),
+ argv[0], c);
+#else
+ fprintf (stderr, _("%s: illegal option -- %c\n"), argv[0], c);
+#endif
+ }
+ else
+ {
+#if defined _LIBC && defined USE_IN_LIBIO
+ n = __asprintf (&buf, _("%s: invalid option -- %c\n"),
+ argv[0], c);
+#else
+ fprintf (stderr, _("%s: invalid option -- %c\n"), argv[0], c);
+#endif
+ }
+
+#if defined _LIBC && defined USE_IN_LIBIO
+ if (n >= 0)
+ {
+ if (_IO_fwide (stderr, 0) > 0)
+ __fwprintf (stderr, L"%s", buf);
+ else
+ fputs (buf, stderr);
+
+ free (buf);
+ }
+#endif
+ }
+ optopt = c;
+ return '?';
+ }
+ /* Convenience. Treat POSIX -W foo same as long option --foo, but only when a long-option table exists (plain getopt passes NULL, which must not be scanned). */
+ if (temp[0] == 'W' && temp[1] == ';' && longopts != NULL)
+ {
+ char *nameend;
+ const struct option *p;
+ const struct option *pfound = NULL;
+ int exact = 0;
+ int ambig = 0;
+ int indfound = 0;
+ int option_index;
+
+ /* This is an option that requires an argument. */
+ if (*nextchar != '\0')
+ {
+ optarg = nextchar;
+ /* If we end this ARGV-element by taking the rest as an arg,
+ we must advance to the next element now. */
+ optind++;
+ }
+ else if (optind == argc)
+ {
+ if (print_errors)
+ {
+ /* 1003.2 specifies the format of this message. */
+#if defined _LIBC && defined USE_IN_LIBIO
+ char *buf;
+
+ if (__asprintf (&buf,
+ _("%s: option requires an argument -- %c\n"),
+ argv[0], c) >= 0)
+ {
+ if (_IO_fwide (stderr, 0) > 0)
+ __fwprintf (stderr, L"%s", buf);
+ else
+ fputs (buf, stderr);
+
+ free (buf);
+ }
+#else
+ fprintf (stderr, _("%s: option requires an argument -- %c\n"),
+ argv[0], c);
+#endif
+ }
+ optopt = c;
+ if (optstring[0] == ':')
+ c = ':';
+ else
+ c = '?';
+ return c;
+ }
+ else
+ /* We already incremented `optind' once;
+ increment it again when taking next ARGV-elt as argument. */
+ optarg = argv[optind++];
+
+ /* optarg is now the argument, see if it's in the
+ table of longopts. */
+
+ for (nextchar = nameend = optarg; *nameend && *nameend != '='; nameend++)
+ /* Do nothing. */ ;
+
+ /* Test all long options for either exact match
+ or abbreviated matches. */
+ for (p = longopts, option_index = 0; p->name; p++, option_index++)
+ if (!strncmp (p->name, nextchar, nameend - nextchar))
+ {
+ if ((unsigned int) (nameend - nextchar) == strlen (p->name))
+ {
+ /* Exact match found. */
+ pfound = p;
+ indfound = option_index;
+ exact = 1;
+ break;
+ }
+ else if (pfound == NULL)
+ {
+ /* First nonexact match found. */
+ pfound = p;
+ indfound = option_index;
+ }
+ else
+ /* Second or later nonexact match found. */
+ ambig = 1;
+ }
+ if (ambig && !exact)
+ {
+ if (print_errors)
+ {
+#if defined _LIBC && defined USE_IN_LIBIO
+ char *buf;
+
+ if (__asprintf (&buf, _("%s: option `-W %s' is ambiguous\n"),
+ argv[0], optarg) >= 0)
+ {
+ if (_IO_fwide (stderr, 0) > 0)
+ __fwprintf (stderr, L"%s", buf);
+ else
+ fputs (buf, stderr);
+
+ free (buf);
+ }
+#else
+ fprintf (stderr, _("%s: option `-W %s' is ambiguous\n"),
+ argv[0], optarg);
+#endif
+ }
+ nextchar += strlen (nextchar);
+ /* optind already points past the -W argument here; advancing it again would skip an ARGV-element (and could pass ARGC). */
+ return '?';
+ }
+ if (pfound != NULL)
+ {
+ option_index = indfound;
+ if (*nameend)
+ {
+ /* Don't test has_arg with >, because some C compilers don't
+ allow it to be used on enums. */
+ if (pfound->has_arg)
+ optarg = nameend + 1;
+ else
+ {
+ if (print_errors)
+ {
+#if defined _LIBC && defined USE_IN_LIBIO
+ char *buf;
+
+ if (__asprintf (&buf, _("\
+%s: option `-W %s' doesn't allow an argument\n"),
+ argv[0], pfound->name) >= 0)
+ {
+ if (_IO_fwide (stderr, 0) > 0)
+ __fwprintf (stderr, L"%s", buf);
+ else
+ fputs (buf, stderr);
+
+ free (buf);
+ }
+#else
+ fprintf (stderr, _("\
+%s: option `-W %s' doesn't allow an argument\n"),
+ argv[0], pfound->name);
+#endif
+ }
+
+ nextchar += strlen (nextchar);
+ return '?';
+ }
+ }
+ else if (pfound->has_arg == 1)
+ {
+ if (optind < argc)
+ optarg = argv[optind++];
+ else
+ {
+ if (print_errors)
+ {
+#if defined _LIBC && defined USE_IN_LIBIO
+ char *buf;
+
+ if (__asprintf (&buf, _("\
+%s: option `%s' requires an argument\n"),
+ argv[0], argv[optind - 1]) >= 0)
+ {
+ if (_IO_fwide (stderr, 0) > 0)
+ __fwprintf (stderr, L"%s", buf);
+ else
+ fputs (buf, stderr);
+
+ free (buf);
+ }
+#else
+ fprintf (stderr,
+ _("%s: option `%s' requires an argument\n"),
+ argv[0], argv[optind - 1]);
+#endif
+ }
+ nextchar += strlen (nextchar);
+ return optstring[0] == ':' ? ':' : '?';
+ }
+ }
+ nextchar += strlen (nextchar);
+ if (longind != NULL)
+ *longind = option_index;
+ if (pfound->flag)
+ {
+ *(pfound->flag) = pfound->val;
+ return 0;
+ }
+ return pfound->val;
+ }
+ nextchar = NULL;
+ return 'W'; /* Let the application handle it. */
+ }
+ if (temp[1] == ':')
+ {
+ if (temp[2] == ':')
+ {
+ /* This is an option that accepts an argument optionally. */
+ if (*nextchar != '\0')
+ {
+ optarg = nextchar;
+ optind++;
+ }
+ else
+ optarg = NULL;
+ nextchar = NULL;
+ }
+ else
+ {
+ /* This is an option that requires an argument. */
+ if (*nextchar != '\0')
+ {
+ optarg = nextchar;
+ /* If we end this ARGV-element by taking the rest as an arg,
+ we must advance to the next element now. */
+ optind++;
+ }
+ else if (optind == argc)
+ {
+ if (print_errors)
+ {
+ /* 1003.2 specifies the format of this message. */
+#if defined _LIBC && defined USE_IN_LIBIO
+ char *buf;
+
+ if (__asprintf (&buf, _("\
+%s: option requires an argument -- %c\n"),
+ argv[0], c) >= 0)
+ {
+ if (_IO_fwide (stderr, 0) > 0)
+ __fwprintf (stderr, L"%s", buf);
+ else
+ fputs (buf, stderr);
+
+ free (buf);
+ }
+#else
+ fprintf (stderr,
+ _("%s: option requires an argument -- %c\n"),
+ argv[0], c);
+#endif
+ }
+ optopt = c;
+ if (optstring[0] == ':')
+ c = ':';
+ else
+ c = '?';
+ }
+ else
+ /* We already incremented `optind' once;
+ increment it again when taking next ARGV-elt as argument. */
+ optarg = argv[optind++];
+ nextchar = NULL;
+ }
+ }
+ return c;
+ }
+}
+
+int
+getopt (argc, argv, optstring) /* short-option-only entry point: forwards to _getopt_internal */
+ int argc; /* number of elements in ARGV */
+ char *const *argv; /* argument vector to scan */
+ const char *optstring; /* short-option specification */
+{
+ return _getopt_internal (argc, argv, optstring,
+ (const struct option *) 0, /* no long-option table */
+ (int *) 0, /* no long-option index output */
+ 0); /* long_only disabled */
+}
+
+#endif /* Not ELIDE_CODE. */
+
+#ifdef TEST
+
+/* Compile with -DTEST to make an executable for use in testing
+ the above definition of `getopt'. */
+
+int
+main (argc, argv) /* TEST-only driver: prints each option that getopt reports */
+ int argc;
+ char **argv;
+{
+ int c; /* value returned by the latest getopt call */
+ int digit_optind = 0; /* argv index where a digit option was last seen; 0 = none yet */
+
+ while (1)
+ {
+ int this_option_optind = optind ? optind : 1; /* optind before this call, with 0 treated as 1 */
+
+ c = getopt (argc, argv, "abc:d:0123456789"); /* NOTE(review): "d:" is accepted but has no case below, so -d reaches `default' */
+ if (c == -1)
+ break;
+
+ switch (c)
+ {
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ if (digit_optind != 0 && digit_optind != this_option_optind) /* a digit was seen earlier at a different optind */
+ printf ("digits occur in two different argv-elements.\n");
+ digit_optind = this_option_optind;
+ printf ("option %c\n", c);
+ break;
+
+ case 'a':
+ printf ("option a\n");
+ break;
+
+ case 'b':
+ printf ("option b\n");
+ break;
+
+ case 'c':
+ printf ("option c with value `%s'\n", optarg);
+ break;
+
+ case '?': /* nothing more is printed here for a '?' return */
+ break;
+
+ default:
+ printf ("?? getopt returned character code 0%o ??\n", c);
+ }
+ }
+
+ if (optind < argc) /* elements from optind onward were not consumed as options */
+ {
+ printf ("non-option ARGV-elements: ");
+ while (optind < argc)
+ printf ("%s ", argv[optind++]);
+ printf ("\n");
+ }
+
+ exit (0);
+}
+
+#endif /* TEST */
diff --git a/src/getopt.cpp b/src/getopt.cpp
new file mode 100644
index 0000000..29a76bd
--- /dev/null
+++ b/src/getopt.cpp
@@ -0,0 +1,772 @@
+/* Getopt for GNU.
+ NOTE: getopt is now part of the C library, so if you don't know what
+ "Keep this file name-space clean" means, talk to roland at gnu.ai.mit.edu
+ before changing it!
+
+ Copyright (C) 1987, 88, 89, 90, 91, 92, 93, 94
+ Free Software Foundation, Inc.
+
+Changes by monty:
+- Added include of string.h when necessary.
+- Removed two warnings from gcc.
+
+This file is part of the GNU C Library. Its master source is NOT part of
+the C library, however. The master source lives in /gd/gnu/lib.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB. If
+not, write to the Free Software Foundation, Inc., 675 Mass Ave,
+Cambridge, MA 02139, USA. */
+
+#ifdef __cplusplus
+extern "C" {
+/* This tells Alpha OSF/1 not to define a getopt prototype in <stdio.h>.
+ Ditto for AIX 3.2 and <stdlib.h>. */
+#ifndef _NO_PROTO
+#define _NO_PROTO
+#endif
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#if (!defined (__STDC__) || !__STDC__) && !defined(MSDOS)
+/* This is a separate conditional since some stdc systems
+ reject `defined (const)'. */
+#ifndef const
+#define const
+#endif
+#endif
+
+#ifndef WIN32
+#include <global.h> /* Changes for mysys */
+#include <m_string.h>
+#else
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "getopt.h"
+
+int getopt_long (int argc, char *const *argv, const char *options,
+                 const struct option *long_options, int *opt_index)
+{
+  return _getopt_internal (argc, argv, options, long_options, opt_index, 0 /* long_only off: only `--' introduces long options */);
+}
+
+/* Like getopt_long, but '-' as well as '--' can indicate a long option.
+ If an option that starts with '-' (not '--') doesn't match a long option,
+ but does match a short option, it is parsed as a short option
+ instead. */
+
+int getopt_long_only (int argc, char *const *argv, const char *options,
+                      const struct option *long_options, int *opt_index)
+{
+  return _getopt_internal (argc, argv, options, long_options, opt_index, 1 /* long_only on: `-' may introduce long options too */);
+}
+
+#endif
+/* Comment out all this code if we are using the GNU C Library, and are not
+ actually compiling the library itself. This code is part of the GNU C
+ Library, but also included in many other GNU distributions. Compiling
+ and linking in this code is a waste when using the GNU C library
+ (especially if it is a shared library). Rather than having every GNU
+ program understand `configure --with-gnu-libc' and omit the object files,
+ it is simpler to just do this in the source for each such file. */
+
+#if defined (_LIBC) || !defined (__GNU_LIBRARY__)
+
+
+/* This needs to come after some library #include
+ to get __GNU_LIBRARY__ defined. */
+#ifdef __GNU_LIBRARY__
+/* Don't include stdlib.h for non-GNU C libraries because some of them
+ contain conflicting prototypes for getopt. */
+#include <stdlib.h>
+#endif /* GNU C library. */
+
+/* This version of `getopt' appears to the caller like standard Unix `getopt'
+ but it behaves differently for the user, since it allows the user
+ to intersperse the options with the other arguments.
+
+ As `getopt' works, it permutes the elements of ARGV so that,
+ when it is done, all the options precede everything else. Thus
+ all application programs are extended to handle flexible argument order.
+
+ Setting the environment variable POSIXLY_CORRECT disables permutation.
+ Then the behavior is completely standard.
+
+ GNU application programs can use a third alternative mode in which
+ they can distinguish the relative order of options and other arguments. */
+
+#include "getopt.h"
+
+/* For communication from `getopt' to the caller.
+ When `getopt' finds an option that takes an argument,
+ the argument value is returned here.
+ Also, when `ordering' is RETURN_IN_ORDER,
+ each non-option ARGV-element is returned here. */
+
+char *optarg = NULL;
+
+/* Index in ARGV of the next element to be scanned.
+ This is used for communication to and from the caller
+ and for communication between successive calls to `getopt'.
+
+ On entry to `getopt', zero means this is the first call; initialize.
+
+ When `getopt' returns EOF, this is the index of the first of the
+ non-option elements that the caller should itself scan.
+
+ Otherwise, `optind' communicates from one call to the next
+ how much of ARGV has been scanned so far. */
+
+/* XXX 1003.2 says this must be 1 before any call. */
+int optind = 1;
+
+/* The next char to be scanned in the option-element
+ in which the last option character we returned was found.
+ This allows us to pick up the scan where we left off.
+
+ If this is zero, or a null string, it means resume the scan
+ by advancing to the next ARGV-element. */
+
+static char *nextchar;
+
+/* Callers store zero here to inhibit the error message
+ for unrecognized options. */
+
+int opterr = 1;
+
+/* Set to an option character which was unrecognized.
+ This must be initialized on some systems to avoid linking in the
+ system's own getopt implementation. */
+
+int optopt = '?';
+
+/* Describe how to deal with options that follow non-option ARGV-elements.
+
+ If the caller did not specify anything,
+ the default is REQUIRE_ORDER if the environment variable
+ POSIXLY_CORRECT is defined, PERMUTE otherwise.
+
+ REQUIRE_ORDER means don't recognize them as options;
+ stop option processing when the first non-option is seen.
+ This is what Unix does.
+ This mode of operation is selected by either setting the environment
+ variable POSIXLY_CORRECT, or using `+' as the first character
+ of the list of option characters.
+
+ PERMUTE is the default. We permute the contents of ARGV as we scan,
+ so that eventually all the non-options are at the end. This allows options
+ to be given in any order, even with programs that were not written to
+ expect this.
+
+ RETURN_IN_ORDER is an option available to programs that were written
+ to expect options and other ARGV-elements in any order and that care about
+ the ordering of the two. We describe each non-option ARGV-element
+ as if it were the argument of an option with character code 1.
+ Using `-' as the first character of the list of option characters
+ selects this mode of operation.
+
+ The special argument `--' forces an end of option-scanning regardless
+ of the value of `ordering'. In the case of RETURN_IN_ORDER, only
+ `--' can cause `getopt' to return EOF with `optind' != ARGC. */
+
+static enum
+{
+ REQUIRE_ORDER, PERMUTE, RETURN_IN_ORDER
+} ordering;
+
+/* Value of POSIXLY_CORRECT environment variable. */
+static char *posixly_correct;
+
+#ifdef __GNU_LIBRARY__
+/* We want to avoid inclusion of string.h with non-GNU libraries
+ because there are many ways it can cause trouble.
+ On some systems, it contains special magic macros that don't work
+ in GCC. */
+#include <string.h>
+#define my_index strchr
+#else
+
+/* Avoid depending on library functions or files
+ whose names are inconsistent. */
+
+char *getenv (const char *);
+
+static char *
+my_index (const char *str, int chr)
+{
+  /* Return the first position in STR that holds CHR, or a null pointer if
+     the terminating NUL is reached first (the NUL itself never matches).  */
+  const char *scan;
+  for (scan = str; *scan != '\0'; ++scan)
+    if (*scan == chr)
+      return (char *) scan;
+  return 0;
+}
+
+/* If using GCC, we can safely declare strlen this way.
+ If not using GCC, it is ok not to declare it. */
+#ifdef __GNUC__
+/* Note that Motorola Delta 68k R3V7 comes with GCC but not stddef.h.
+ That was relevant to code that was here before. */
+#if !defined (__STDC__) || !__STDC__
+/* gcc with -traditional declares the built-in strlen to return int,
+ and has done so at least since version 2.4.5. -- rms. */
+extern int strlen (const char *);
+#endif /* not __STDC__ */
+#endif /* __GNUC__ */
+
+#endif /* not __GNU_LIBRARY__ */
+
+/* Handle permutation of arguments. */
+
+/* Describe the part of ARGV that contains non-options that have
+ been skipped. `first_nonopt' is the index in ARGV of the first of them;
+ `last_nonopt' is the index after the last of them. */
+
+static int first_nonopt;
+static int last_nonopt;
+
+/* Exchange two adjacent subsequences of ARGV.
+ One subsequence is elements [first_nonopt,last_nonopt)
+ which contains all the non-options that have been skipped so far.
+ The other is elements [last_nonopt,optind), which contains all
+ the options processed since those non-options were skipped.
+
+ `first_nonopt' and `last_nonopt' are relocated so that they describe
+ the new indices of the non-options in ARGV after they are moved. */
+
+static void
+exchange (char **argv)
+{
+  int lo = first_nonopt;   /* start of the skipped non-options */
+  int mid = last_nonopt;   /* non-options end here, processed options begin */
+  int hi = optind;         /* one past the processed options */
+
+  /* Rotate the non-options [lo,mid) behind the options [mid,hi) using
+     block swaps.  Each round swaps the shorter run with the far end of
+     the longer run: the shorter run lands in its final position and the
+     longer run is left as two pieces that the next round exchanges.  */
+
+  while (hi > mid && mid > lo)
+    {
+      const int n_opts = hi - mid;
+      const int n_nonopts = mid - lo;
+      const int nonopts_shorter = n_opts > n_nonopts;
+      int span;   /* number of elements swapped this round */
+      int peer;   /* index paired with LO for the swap */
+      int k;
+
+      if (nonopts_shorter)
+        {
+          /* Pair the non-options with the tail of the option run.  */
+          span = n_nonopts;
+          peer = hi - n_nonopts;
+        }
+      else
+        {
+          /* Pair the options with the head of the non-option run.  */
+          span = n_opts;
+          peer = mid;
+        }
+
+      for (k = 0; k < span; k++)
+        {
+          char *held = argv[lo + k];
+          argv[lo + k] = argv[peer + k];
+          argv[peer + k] = held;
+        }
+
+      /* Drop the run that just reached its final position.  */
+      if (nonopts_shorter)
+        hi -= span;
+      else
+        lo += span;
+    }
+
+  /* The non-options now sit directly below optind; update the records
+     for the slots they occupy.  */
+
+  first_nonopt += (optind - last_nonopt);
+  last_nonopt = optind;
+}
+
+/* Initialize the internal data when the first call is made. */
+
+static const char *
+_getopt_initialize (const char *optstring)
+{
+  /* Reset the scan.  ARGV[0] is the program name, so scanning starts at
+     ARGV[1]; nothing has been skipped yet, so the range of skipped
+     non-options [first_nonopt, last_nonopt) starts out empty.  */
+  optind = 1;
+  first_nonopt = 1;
+  last_nonopt = 1;
+  nextchar = NULL;
+  posixly_correct = getenv ("POSIXLY_CORRECT");
+
+  /* Pick the ordering policy.  A leading `-' or `+' in OPTSTRING selects
+     it explicitly; that flag character is not an option letter, so the
+     string handed back to the caller starts just after it.  Without a
+     flag, a set POSIXLY_CORRECT asks for strict ordering and anything
+     else gets the default of permuting ARGV.  */
+  switch (optstring[0])
+    {
+    case '-':
+      ordering = RETURN_IN_ORDER;
+      return optstring + 1;
+
+    case '+':
+      ordering = REQUIRE_ORDER;
+      return optstring + 1;
+
+    default:
+      ordering = (posixly_correct != NULL) ? REQUIRE_ORDER : PERMUTE;
+      return optstring;
+    }
+}
+
+/* Scan elements of ARGV (whose length is ARGC) for option characters
+ given in OPTSTRING.
+
+ If an element of ARGV starts with '-', and is not exactly "-" or "--",
+ then it is an option element. The characters of this element
+ (aside from the initial '-') are option characters. If `getopt'
+ is called repeatedly, it returns successively each of the option characters
+ from each of the option elements.
+
+ If `getopt' finds another option character, it returns that character,
+ updating `optind' and `nextchar' so that the next call to `getopt' can
+ resume the scan with the following option character or ARGV-element.
+
+ If there are no more option characters, `getopt' returns `EOF'.
+ Then `optind' is the index in ARGV of the first ARGV-element
+ that is not an option. (The ARGV-elements have been permuted
+ so that those that are not options now come last.)
+
+ OPTSTRING is a string containing the legitimate option characters.
+ If an option character is seen that is not listed in OPTSTRING,
+ return '?' after printing an error message. If you set `opterr' to
+ zero, the error message is suppressed but we still return '?'.
+
+ If a char in OPTSTRING is followed by a colon, that means it wants an arg,
+ so the following text in the same ARGV-element, or the text of the following
+ ARGV-element, is returned in `optarg'. Two colons mean an option that
+ wants an optional arg; if there is text in the current ARGV-element,
+ it is returned in `optarg', otherwise `optarg' is set to zero.
+
+ If OPTSTRING starts with `-' or `+', it requests different methods of
+ handling the non-option ARGV-elements.
+ See the comments about RETURN_IN_ORDER and REQUIRE_ORDER, above.
+
+ Long-named options begin with `--' instead of `-'.
+ Their names may be abbreviated as long as the abbreviation is unique
+ or is an exact match for some defined option. If they have an
+ argument, it follows the option name in the same ARGV-element, separated
+ from the option name by a `=', or else in the next ARGV-element.
+ When `getopt' finds a long-named option, it returns 0 if that option's
+ `flag' field is nonzero, the value of the option's `val' field
+ if the `flag' field is zero.
+
+ The elements of ARGV aren't really const, because we permute them.
+ But we pretend they're const in the prototype to be compatible
+ with other systems.
+
+ LONGOPTS is a vector of `struct option' terminated by an
+ element containing a name which is zero.
+
+ LONGIND returns the index in LONGOPTS of the long-named option found.
+ It is only valid when a long-named option has been found by the most
+ recent call.
+
+ If LONG_ONLY is nonzero, '-' as well as '--' can introduce
+ long-named options. */
+
+int
+_getopt_internal (int argc, char *const *argv, const char *optstring, const struct option *longopts, int *longind, int long_only)
+{
+  optarg = NULL;
+
+  /* A leading `-' or `+' in OPTSTRING is only an ordering flag.  The caller
+     passes it on every call, so step over it on later calls as well.  */
+  if (optind == 0)
+    optstring = _getopt_initialize (optstring);
+  else if (optstring[0] == '-' || optstring[0] == '+')
+    optstring++;
+
+  if (nextchar == NULL || *nextchar == '\0')
+    {
+      /* Advance to the next ARGV-element. */
+      if (ordering == PERMUTE)
+        {
+          /* If we have just processed some options following some non-options,
+             exchange them so that the options come first. */
+          if (first_nonopt != last_nonopt && last_nonopt != optind)
+            exchange ((char **) argv);
+          else if (last_nonopt != optind)
+            first_nonopt = optind;
+
+          /* Skip any additional non-options
+             and extend the range of non-options previously skipped. */
+          while (optind < argc
+                 && (argv[optind][0] != '-' || argv[optind][1] == '\0'))
+            optind++;
+          last_nonopt = optind;
+        }
+
+      /* The special ARGV-element `--' means premature end of options.
+         Skip it like a null option,
+         then exchange with previous non-options as if it were an option,
+         then skip everything else like a non-option. */
+      if (optind != argc && !strcmp (argv[optind], "--"))
+        {
+          optind++;
+
+          if (first_nonopt != last_nonopt && last_nonopt != optind)
+            exchange ((char **) argv);
+          else if (first_nonopt == last_nonopt)
+            first_nonopt = optind;
+          last_nonopt = argc;
+
+          optind = argc;
+        }
+
+      /* If we have done all the ARGV-elements, stop the scan
+         and back over any non-options that we skipped and permuted. */
+
+      if (optind == argc)
+        {
+          /* Set the next-arg-index to point at the non-options
+             that we previously skipped, so the caller will digest them. */
+          if (first_nonopt != last_nonopt)
+            optind = first_nonopt;
+          return EOF;
+        }
+
+      /* If we have come to a non-option and did not permute it,
+         either stop the scan or describe it to the caller and pass it by. */
+
+      if ((argv[optind][0] != '-' || argv[optind][1] == '\0'))
+        {
+          if (ordering == REQUIRE_ORDER)
+            return EOF;
+          optarg = argv[optind++];
+          return 1;
+        }
+
+      /* We have found another option-ARGV-element.
+         Skip the initial punctuation. */
+
+      nextchar = (argv[optind] + 1
+                  + (longopts != NULL && argv[optind][1] == '-'));
+    }
+
+  /* Decode the current option-ARGV-element. */
+
+  /* Check whether the ARGV-element is a long option.
+
+     If long_only and the ARGV-element has the form "-f", where f is
+     a valid short option, don't consider it an abbreviated form of
+     a long option that starts with f. Otherwise there would be no
+     way to give the -f short option.
+
+     On the other hand, if there's a long option "fubar" and
+     the ARGV-element is "-fu", do consider that an abbreviation of
+     the long option, just like "--fu", and not "-f" with arg "u".
+
+     This distinction seems to be the most useful approach. */
+
+  if (longopts != NULL
+      && (argv[optind][1] == '-'
+          || (long_only && (argv[optind][2] || !my_index (optstring, argv[optind][1])))))
+    {
+      char *nameend;
+      const struct option *p;
+      const struct option *pfound = NULL;
+      int exact = 0;
+      int ambig = 0;
+      int indfound=0; /* Keep gcc happy */
+      int option_index;
+
+      for (nameend = nextchar; *nameend && *nameend != '='; nameend++)
+        /* Do nothing. */ ;
+
+      /* Test all long options for either exact match
+         or abbreviated matches. */
+      for (p = longopts, option_index = 0; p->name; p++, option_index++)
+        if (!strncmp (p->name, nextchar, nameend - nextchar))
+          {
+            if ((size_t) (nameend - nextchar) == (size_t) strlen (p->name))
+              {
+                /* Exact match found. */
+                pfound = p;
+                indfound = option_index;
+                exact = 1;
+                break;
+              }
+            else if (pfound == NULL)
+              {
+                /* First nonexact match found. */
+                pfound = p;
+                indfound = option_index;
+              }
+            else
+              /* Second or later nonexact match found. */
+              ambig = 1;
+          }
+
+      if (ambig && !exact)
+        {
+          if (opterr)
+            fprintf (stderr, "%s: option `%s' is ambiguous\n",
+                     argv[0], argv[optind]);
+          nextchar += strlen (nextchar);
+          optind++;
+          return '?';
+        }
+
+      if (pfound != NULL)
+        {
+          option_index = indfound;
+          optind++;
+          if (*nameend)
+            {
+              /* Don't test has_arg with >, because some C compilers don't
+                 allow it to be used on enums. */
+              if (pfound->has_arg)
+                optarg = nameend + 1;
+              else
+                {
+                  if (opterr)
+                    {
+                      if (argv[optind - 1][1] == '-')
+                        /* --option */
+                        fprintf (stderr,
+                                 "%s: option `--%s' doesn't allow an argument\n",
+                                 argv[0], pfound->name);
+                      else
+                        /* +option or -option */
+                        fprintf (stderr,
+                                 "%s: option `%c%s' doesn't allow an argument\n",
+                                 argv[0], argv[optind - 1][0], pfound->name);
+                    }
+                  nextchar += strlen (nextchar);
+                  return '?';
+                }
+            }
+          else if (pfound->has_arg == 1)
+            {
+              if (optind < argc)
+                optarg = argv[optind++];
+              else
+                {
+                  if (opterr)
+                    fprintf (stderr, "%s: option `%s' requires an argument\n",
+                             argv[0], argv[optind - 1]);
+                  nextchar += strlen (nextchar);
+                  return optstring[0] == ':' ? ':' : '?';
+                }
+            }
+          nextchar += strlen (nextchar);
+          if (longind != NULL)
+            *longind = option_index;
+          if (pfound->flag)
+            {
+              *(pfound->flag) = pfound->val;
+              return 0;
+            }
+          return pfound->val;
+        }
+
+      /* Can't find it as a long option. If this is not getopt_long_only,
+         or the option starts with '--' or is not a valid short
+         option, then it's an error.
+         Otherwise interpret it as a short option. */
+      if (!long_only || argv[optind][1] == '-'
+          || my_index (optstring, *nextchar) == NULL)
+        {
+          if (opterr)
+            {
+              if (argv[optind][1] == '-')
+                /* --option */
+                fprintf (stderr, "%s: unrecognized option `--%s'\n",
+                         argv[0], nextchar);
+              else
+                /* +option or -option */
+                fprintf (stderr, "%s: unrecognized option `%c%s'\n",
+                         argv[0], argv[optind][0], nextchar);
+            }
+          nextchar = (char *) "";
+          optind++;
+          return '?';
+        }
+    }
+
+  /* Look at and handle the next short option-character. */
+
+  {
+    char c = *nextchar++;
+    char *temp = my_index (optstring, c);
+
+    /* Increment `optind' when we start to process its last character. */
+    if (*nextchar == '\0')
+      ++optind;
+
+    if (temp == NULL || c == ':')
+      {
+        if (opterr)
+          {
+            if (posixly_correct)
+              /* 1003.2 specifies the format of this message. */
+              fprintf (stderr, "%s: illegal option -- %c\n", argv[0], c);
+            else
+              fprintf (stderr, "%s: invalid option -- %c\n", argv[0], c);
+          }
+        optopt = c;
+        return '?';
+      }
+    if (temp[1] == ':')
+      {
+        if (temp[2] == ':')
+          {
+            /* This is an option that accepts an argument optionally. */
+            if (*nextchar != '\0')
+              {
+                optarg = nextchar;
+                optind++;
+              }
+            else
+              optarg = NULL;
+            nextchar = NULL;
+          }
+        else
+          {
+            /* This is an option that requires an argument. */
+            if (*nextchar != '\0')
+              {
+                optarg = nextchar;
+                /* If we end this ARGV-element by taking the rest as an arg,
+                   we must advance to the next element now. */
+                optind++;
+              }
+            else if (optind == argc)
+              {
+                if (opterr)
+                  {
+                    /* 1003.2 specifies the format of this message. */
+                    fprintf (stderr, "%s: option requires an argument -- %c\n",
+                             argv[0], c);
+                  }
+                optopt = c;
+                if (optstring[0] == ':')
+                  c = ':';
+                else
+                  c = '?';
+              }
+            else
+              /* We already incremented `optind' once;
+                 increment it again when taking next ARGV-elt as argument. */
+              optarg = argv[optind++];
+            nextchar = NULL;
+          }
+      }
+    return c;
+  }
+}
+
+int
+getopt (int argc, char *const *argv, const char *optstring)
+{
+  /* Short options only: no long-option table is consulted, no long-option
+     index is reported, and long_only mode is off.  */
+  const struct option *no_longopts = 0;
+  return _getopt_internal (argc, argv, optstring, no_longopts, (int *) 0, 0);
+}
+
+#endif /* _LIBC or not __GNU_LIBRARY__. */
+
+#ifdef TEST
+
+/* Compile with -DTEST to make an executable for use in testing
+ the above definition of `getopt'. */
+
+int
+main (int argc, char **argv)
+{
+  /* Run getopt() over this program's own command line, printing each option
+     it reports and then the remaining non-option arguments.  A prototype is
+     used because old-style parameter lists are not valid C++.  */
+  int c;
+  int digit_optind = 0;
+
+  while (1)
+    {
+      /* ARGV index the next option comes from; optind is still 0 before
+         the first call, when scanning is about to start at element 1.  */
+      int this_option_optind = optind ? optind : 1;
+
+      c = getopt (argc, argv, "abc:d:0123456789");
+      if (c == EOF)
+        break;
+
+      switch (c)
+        {
+        case '0': case '1': case '2': case '3': case '4':
+        case '5': case '6': case '7': case '8': case '9':
+          if (digit_optind != 0 && digit_optind != this_option_optind)
+            printf ("digits occur in two different argv-elements.\n");
+          digit_optind = this_option_optind;
+          printf ("option %c\n", c);
+          break;
+
+        case 'a':
+          printf ("option a\n");
+          break;
+
+        case 'b':
+          printf ("option b\n");
+          break;
+
+        case 'c':
+          printf ("option c with value `%s'\n", optarg);
+          break;
+
+        case 'd':
+          /* `d:' is in the option string, so it needs a handler as well.  */
+          printf ("option d with value `%s'\n", optarg);
+          break;
+
+        case '?':
+          break;
+
+        default:
+          printf ("?? getopt returned character code 0%o ??\n", c);
+        }
+    }
+
+  if (optind < argc)
+    {
+      printf ("non-option ARGV-elements: ");
+      while (optind < argc)
+        printf ("%s ", argv[optind++]);
+      printf ("\n");
+    }
+
+  return 0;
+}
+
+#endif /* TEST */
+}
+#endif
\ No newline at end of file
diff --git a/src/getopt.h b/src/getopt.h
new file mode 100644
index 0000000..36fcf74
--- /dev/null
+++ b/src/getopt.h
@@ -0,0 +1,185 @@
+/* Declarations for getopt.
+ Copyright (C) 1989-1994, 1996-1999, 2001 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#ifndef _GETOPT_H
+
+#ifndef __need_getopt
+# define _GETOPT_H 1
+#endif
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+/* If __GNU_LIBRARY__ is not already defined, either we are being used
+ standalone, or this is the first header included in the source file.
+ If we are being used with glibc, we need to include <features.h>, but
+ that does not exist if we are standalone. So: if __GNU_LIBRARY__ is
+ not defined, include <ctype.h>, which will pull in <features.h> for us
+ if it's from glibc. (Why ctype.h? It's guaranteed to exist and it
+ doesn't flood the namespace with stuff the way some other headers do.) */
+#if !defined __GNU_LIBRARY__
+# include <ctype.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* For communication from `getopt' to the caller.
+ When `getopt' finds an option that takes an argument,
+ the argument value is returned here.
+ Also, when `ordering' is RETURN_IN_ORDER,
+ each non-option ARGV-element is returned here. */
+
+extern char *optarg;
+
+/* Index in ARGV of the next element to be scanned.
+ This is used for communication to and from the caller
+ and for communication between successive calls to `getopt'.
+
+ On entry to `getopt', zero means this is the first call; initialize.
+
+ When `getopt' returns -1, this is the index of the first of the
+ non-option elements that the caller should itself scan.
+
+ Otherwise, `optind' communicates from one call to the next
+ how much of ARGV has been scanned so far. */
+
+extern int optind;
+
+/* Callers store zero here to inhibit the error message `getopt' prints
+ for unrecognized options. */
+
+extern int opterr;
+
+/* Set to an option character which was unrecognized. */
+
+extern int optopt;
+
+#ifndef __need_getopt
+/* Describe the long-named options requested by the application.
+ The LONG_OPTIONS argument to getopt_long or getopt_long_only is a vector
+ of `struct option' terminated by an element containing a name which is
+ zero.
+
+ The field `has_arg' is:
+ no_argument (or 0) if the option does not take an argument,
+ required_argument (or 1) if the option requires an argument,
+ optional_argument (or 2) if the option takes an optional argument.
+
+ If the field `flag' is not NULL, it points to a variable that is set
+ to the value given in the field `val' when the option is found, but
+ left unchanged if the option is not found.
+
+ To have a long-named option do something other than set an `int' to
+ a compiled-in constant, such as set a value from `optarg', set the
+ option's `flag' field to zero and its `val' field to a nonzero
+ value (the equivalent single-letter option character, if there is
+ one). For long options that have a zero `flag' field, `getopt'
+ returns the contents of the `val' field. */
+
+struct option
+{
+# if (defined __STDC__ && __STDC__) || defined __cplusplus
+ const char *name; /* long option name, without the leading dashes */
+# else
+ char *name;
+# endif
+ /* has_arg can't be an enum because some compilers complain about
+ type mismatches in all the code that assumes it is an int. */
+ int has_arg; /* no_argument, required_argument or optional_argument */
+ int *flag; /* if not NULL, `val' is stored here and getopt returns 0 */
+ int val; /* value to return, or to store through `flag' */
+};
+
+/* Names for the values of the `has_arg' field of `struct option'. */
+
+# define no_argument 0
+# define required_argument 1
+# define optional_argument 2
+#endif /* need getopt */
+
+
+/* Get definitions and prototypes for functions to process the
+ arguments in ARGV (ARGC of them, minus the program name) for
+ options given in OPTS.
+
+ Return the option character from OPTS just read. Return -1 when
+ there are no more options. For unrecognized options, or options
+ missing arguments, `optopt' is set to the option letter, and '?' is
+ returned.
+
+ The OPTS string is a list of characters which are recognized option
+ letters, optionally followed by colons, specifying that that letter
+ takes an argument, to be placed in `optarg'.
+
+ If a letter in OPTS is followed by two colons, its argument is
+ optional. This behavior is specific to the GNU `getopt'.
+
+ The argument `--' causes premature termination of argument
+ scanning, explicitly telling `getopt' that there are no more
+ options.
+
+ If OPTS begins with `-', then non-option arguments are treated as
+ arguments to the option '\1'. This behavior is specific to the GNU
+ `getopt'. */
+
+#if (defined __STDC__ && __STDC__) || defined __cplusplus
+# ifdef __GNU_LIBRARY__
+/* Many other libraries have conflicting prototypes for getopt, with
+ differences in the consts, in stdlib.h. To avoid compilation
+ errors, only prototype getopt for the GNU C library. */
+extern int getopt (int ___argc, char *const *___argv, const char *__shortopts);
+# elif GETOPT_UNDEFINED /* not __GNU_LIBRARY__ */
+extern int getopt ();
+# endif /* __GNU_LIBRARY__ */
+
+# ifndef __need_getopt
+extern int getopt_long (int ___argc, char *const *___argv,
+ const char *__shortopts,
+ const struct option *__longopts, int *__longind);
+extern int getopt_long_only (int ___argc, char *const *___argv,
+ const char *__shortopts,
+ const struct option *__longopts, int *__longind);
+
+/* Internal only. Users should not call this directly. */
+extern int _getopt_internal (int ___argc, char *const *___argv,
+ const char *__shortopts,
+ const struct option *__longopts, int *__longind,
+ int __long_only);
+# endif
+#else /* not __STDC__ */
+extern int getopt ();
+# ifndef __need_getopt
+extern int getopt_long ();
+extern int getopt_long_only ();
+
+extern int _getopt_internal ();
+# endif
+#endif /* __STDC__ */
+
+#ifdef __cplusplus
+}
+#endif
+
+/* Make sure we later can get all the definitions and declarations. */
+#undef __need_getopt
+
+#endif /* getopt.h */
diff --git a/src/getopt1.c b/src/getopt1.c
new file mode 100644
index 0000000..45c35ed
--- /dev/null
+++ b/src/getopt1.c
@@ -0,0 +1,196 @@
+/* getopt_long and getopt_long_only entry points for GNU getopt.
+ Copyright (C) 1987,88,89,90,91,92,93,94,96,97,98
+ Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#ifdef _LIBC
+# include <getopt.h>
+#else
+# include "getopt.h"
+#endif
+
+#if !defined __STDC__ || !__STDC__
+/* This is a separate conditional since some stdc systems
+ reject `defined (const)'. */
+#ifndef const
+#define const
+#endif
+#endif
+
+#include <stdio.h>
+
+/* Comment out all this code if we are using the GNU C Library, and are not
+ actually compiling the library itself. This code is part of the GNU C
+ Library, but also included in many other GNU distributions. Compiling
+ and linking in this code is a waste when using the GNU C library
+ (especially if it is a shared library). Rather than having every GNU
+ program understand `configure --with-gnu-libc' and omit the object files,
+ it is simpler to just do this in the source for each such file. */
+
+#define GETOPT_INTERFACE_VERSION 2
+#if !defined _LIBC && defined __GLIBC__ && __GLIBC__ >= 2
+#include <gnu-versions.h>
+#if _GNU_GETOPT_INTERFACE_VERSION == GETOPT_INTERFACE_VERSION
+#define ELIDE_CODE
+#endif
+#endif
+
+#ifndef ELIDE_CODE
+
+
+/* This needs to come after some library #include
+ to get __GNU_LIBRARY__ defined. */
+#ifdef __GNU_LIBRARY__
+#include <stdlib.h>
+#endif
+
+#ifndef NULL
+#define NULL 0
+#endif
+
+/* Parse like getopt, additionally recognising the long options listed in
+   LONG_OPTIONS when they are introduced by `--'.  Defined with a prototype:
+   old-style parameter lists are no longer accepted by current C standards,
+   and getopt.h already declares this function with one.  */
+int
+getopt_long (int argc, char *const *argv, const char *options,
+             const struct option *long_options, int *opt_index)
+{
+  return _getopt_internal (argc, argv, options, long_options, opt_index, 0);
+}
+
+/* Like getopt_long, but '-' as well as '--' can indicate a long option.
+ If an option that starts with '-' (not '--') doesn't match a long option,
+ but does match a short option, it is parsed as a short option
+ instead. */
+
+int
+getopt_long_only (int argc, char *const *argv, const char *options,
+                  const struct option *long_options, int *opt_index)
+{
+  /* Same entry point as getopt_long, except that the final argument turns
+     on long_only mode in _getopt_internal, so a single `-' may also
+     introduce a long option.  A prototype replaces the old-style parameter
+     list, which current C standards no longer accept.  */
+  return _getopt_internal (argc, argv, options, long_options, opt_index, 1);
+}
+
+# ifdef _LIBC
+libc_hidden_def (getopt_long)
+libc_hidden_def (getopt_long_only)
+# endif
+
+#endif /* Not ELIDE_CODE. */
+
+#ifdef TEST
+
+#include <stdio.h>
+
+int
+main (int argc, char **argv)
+{
+  /* Prototype-style definition and a plain return: this file includes
+     <stdlib.h> only on some configurations, so exit() may be undeclared.  */
+  int c;
+  int digit_optind = 0;
+
+  while (1)
+    {
+      int this_option_optind = optind ? optind : 1;
+      int option_index = 0;
+      static struct option long_options[] =
+      {
+        {"add", 1, 0, 0},
+        {"append", 0, 0, 0},
+        {"delete", 1, 0, 0},
+        {"verbose", 0, 0, 0},
+        {"create", 0, 0, 0},
+        {"file", 1, 0, 0},
+        {0, 0, 0, 0}
+      };
+
+      c = getopt_long (argc, argv, "abc:d:0123456789",
+                       long_options, &option_index);
+      if (c == -1)
+        break;
+
+      switch (c)
+        {
+        case 0:
+          printf ("option %s", long_options[option_index].name);
+          if (optarg)
+            printf (" with arg %s", optarg);
+          printf ("\n");
+          break;
+
+        case '0':
+        case '1':
+        case '2':
+        case '3':
+        case '4':
+        case '5':
+        case '6':
+        case '7':
+        case '8':
+        case '9':
+          if (digit_optind != 0 && digit_optind != this_option_optind)
+            printf ("digits occur in two different argv-elements.\n");
+          digit_optind = this_option_optind;
+          printf ("option %c\n", c);
+          break;
+
+        case 'a':
+          printf ("option a\n");
+          break;
+
+        case 'b':
+          printf ("option b\n");
+          break;
+
+        case 'c':
+          printf ("option c with value `%s'\n", optarg);
+          break;
+
+        case 'd':
+          printf ("option d with value `%s'\n", optarg);
+          break;
+
+        case '?':
+          break;
+
+        default:
+          printf ("?? getopt returned character code 0%o ??\n", c);
+        }
+    }
+
+  if (optind < argc)
+    {
+      printf ("non-option ARGV-elements: ");
+      while (optind < argc)
+        printf ("%s ", argv[optind++]);
+      printf ("\n");
+    }
+
+  return 0;
+}
+
+#endif /* TEST */
diff --git a/src/joinAlignmentFiles.cpp b/src/joinAlignmentFiles.cpp
new file mode 100644
index 0000000..025946d
--- /dev/null
+++ b/src/joinAlignmentFiles.cpp
@@ -0,0 +1,108 @@
+#include "libMems/IntervalList.h"
+#include <fstream>
+#include <vector>
+#include <sstream>
+#include "libMems/SlotAllocator.h"
+#include "libMems/Match.h"
+#include "libMems/GappedAlignment.h"
+
+using namespace std;
+using namespace genome;
+using namespace mems;
+
+int main( int argc, char* argv[] )
+{ // this tool is disabled: it echoes the parsed file count and exits with an error
+ if( argc != 4 )
+ {
+ cerr << "joinAlignments <mauve .mln base name> <number of files> <mauve output file>\n";
+ return -1;
+ }
+ string base_name = argv[1];
+ stringstream aln_count_str(argv[2]);
+ string out_fname = argv[3];
+ uint aln_count = 0; // stays 0 when argv[2] is not a number
+ aln_count_str >> aln_count;
+ cerr << "aln_count is: " << aln_count << endl;
+ cerr << "fix this trash code\n";
+ return -1; // fail cleanly; throwing here would end in std::terminate because nothing catches it
+/* Disabled implementation, kept for reference:
+try{
+ SlotAllocator< Match >& sa = SlotAllocator< Match >::GetSlotAllocator();
+ IntervalList all_iv_list;
+ for( uint alnI = 1; alnI <= aln_count; alnI++ )
+ {
+ IntervalList cur_iv_list;
+ try{
+ stringstream aln_fname;
+ aln_fname << base_name << alnI << ".mln";
+ ifstream cur_aln_file( aln_fname.str().c_str() );
+ if( !cur_aln_file.is_open() )
+ {
+ cerr << "Couldn't open: \"" << aln_fname.str() << "\"\n";
+ return -1;
+ }
+ cur_iv_list.ReadList( cur_aln_file );
+ // hack: trim out all gapped alignments
+ for( uint ivI = 0; ivI < cur_iv_list.size(); ivI++ )
+ {
+ Interval& cur_iv = cur_iv_list[ivI];
+ vector<AbstractMatch*> new_matches;
+ for( uint mI = 0; mI < cur_iv.matches.size(); mI++ )
+ {
+ GappedAlignment* ga = dynamic_cast<GappedAlignment*>(cur_iv.matches[mI]);
+ if( ga == NULL )
+ {
+ if( mI < 5 || mI > cur_iv.matches.size() - 5 )
+ new_matches.push_back( cur_iv.matches[mI] );
+ else
+ sa.Free(static_cast<Match*>(cur_iv.matches[mI]));
+ continue;
+ }
+ delete ga;
+ }
+ cur_iv.matches = new_matches;
+ cur_iv.CalculateOffset();
+ }
+ }catch(gnException& gne){
+ // try reading the .alignment file instead of the .mln
+ stringstream aln_fname;
+ aln_fname << base_name << alnI << ".alignment";
+ ifstream cur_aln_file( aln_fname.str().c_str() );
+ if( !cur_aln_file.is_open() )
+ {
+ cerr << "Couldn't open: \"" << aln_fname.str() << "\"\n";
+ return -1;
+ }
+ cur_iv_list.ReadStandardAlignment( cur_aln_file );
+ for( uint ivI = 0; ivI < cur_iv_list.size(); ivI++ )
+ {
+ cout << ((GappedAlignment*)cur_iv_list[ivI].matches[0])->Start(0) << endl;
+ }
+ }
+ if( alnI == 0 )
+ {
+ all_iv_list = cur_iv_list;
+ }else{
+ all_iv_list.insert( all_iv_list.end(), cur_iv_list.begin(), cur_iv_list.end() );
+ }
+ // progress update
+ if( (alnI*100)/aln_count != ((alnI*100)-1)/aln_count ){
+ cout << "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\bRead " << (alnI*100)/aln_count << "% of data";
+ cout.flush();
+ }
+ }
+ cout << endl << "Writing output\n";
+ ofstream out_file( out_fname.c_str() );
+ if( !out_file.is_open() )
+ {
+ cerr << "Error opening \"" << out_fname << "\"\n";
+ return -2;
+ }
+ all_iv_list.WriteList( out_file );
+}catch( gnException& gne )
+{
+ cerr << gne << endl;
+}
+*/
+ return 0;
+}
diff --git a/src/makeBadgerMatrix.cpp b/src/makeBadgerMatrix.cpp
new file mode 100644
index 0000000..66ed5fc
--- /dev/null
+++ b/src/makeBadgerMatrix.cpp
@@ -0,0 +1,117 @@
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <string>
+#include <fstream>
+#include <vector>
+#include <algorithm>
+#include <iostream>
+#include "libMems/IntervalList.h"
+#include "libMems/MatchList.h"
+#include "libMems/Aligner.h"
+
+using namespace std;
+using namespace genome;
+using namespace mems;
+
+class livComp {	// orders (Interval*, label) pairs by LeftEnd() in one sequence
+public:
+	livComp( uint seq ) : m_seq( seq ) {}
+	bool operator()( const pair< Interval*, uint >& a, const pair< Interval*, uint >& b )
+	{
+		return a.first->LeftEnd(m_seq) < b.first->LeftEnd(m_seq);
+	}
+protected:
+	uint m_seq;	// index of the sequence whose coordinates are compared
+};
+
+int main( int argc, char* argv[] )
+{
+	// reads an XMFA alignment, then writes a block coordinate table and a badger file
+	if( argc != 4 )
+	{
+		cerr << "Usage: makeBadgerMatrix <input xmfa> <output badger file> <LCB coordinate file>\n";
+		return -1;
+	}
+	ifstream aln_in( argv[1] );
+	if( !aln_in.is_open() ){
+		cerr << "Error opening " << argv[1] << endl;
+		return -1;
+	}
+	ofstream badger_out( argv[2] );
+	if( !badger_out.is_open() ){
+		cerr << "Error writing to " << argv[2] << endl;
+		return -1;
+	}
+	ofstream coord_out( argv[3] );
+	if( !coord_out.is_open() ){
+		cerr << "Error writing to " << argv[3] << endl;
+		return -2;
+	}
+
+	try{
+		IntervalList input_ivs;
+		input_ivs.ReadStandardAlignment( aln_in );
+		aln_in.close();
+		const size_t seq_count = input_ivs.seq_filename.size();
+		// pair every interval with its position in the input file
+		vector< pair< Interval*, uint > > labeled_ivs( input_ivs.size() );
+		for( size_t ivI = 0; ivI < input_ivs.size(); ivI++ )
+		{
+			labeled_ivs[ivI].first = &input_ivs[ivI];
+			labeled_ivs[ivI].second = ivI;
+		}
+		// coordinate table: one header row, then one row per listed block
+		for( uint seqI = 0; seqI < seq_count; ++seqI )
+		{
+			if(seqI > 0) coord_out << '\t';
+			coord_out << "seq" << seqI << "_leftend\tseq" << seqI << "_rightend";
+		}
+		coord_out << endl;
+		for( size_t ivI = 0; ivI < input_ivs.size(); ivI++ )
+		{
+			Interval* iv = labeled_ivs[ivI].first;
+			if( iv->Multiplicity() == 1 )
+				continue;	// blocks with a multiplicity of one are not listed
+			for( uint seqI = 0; seqI < seq_count; ++seqI )
+			{
+				if(seqI > 0) coord_out << '\t';
+				const string sign( iv->Start(seqI) < 0 ? "-" : "" );
+				coord_out << sign << iv->LeftEnd(seqI) << '\t' << sign << iv->RightEnd(seqI);
+			}
+			coord_out << endl;
+		}
+
+		// badger file: per sequence, its name followed by the signed block
+		// labels, taken in order of increasing LeftEnd() in that sequence
+		for( uint seqI = 0; seqI < seq_count; ++seqI )
+		{
+			badger_out << input_ivs.seq_filename[seqI];
+			livComp lc(seqI);
+			std::sort( labeled_ivs.begin(), labeled_ivs.end(), lc );
+			for( size_t ivI = 0; ivI < labeled_ivs.size(); ivI++ )
+			{
+				Interval* iv = labeled_ivs[ivI].first;
+				if( iv->LeftEnd(seqI) == NO_MATCH || iv->Multiplicity() == 1 )
+					continue;
+				const int fs = iv->FirstStart();
+				const bool same_dir = iv->Orientation(seqI) == iv->Orientation(fs);
+				badger_out << "," << (same_dir ? "" : "-") << labeled_ivs[ivI].second + 1;
+			}
+			badger_out << endl;
+		}
+	}catch( gnException& gne ){
+		cerr << gne << endl;
+		return -1;
+	}catch( exception& e ){
+		cerr << e.what() << endl;
+		return -2;
+	}catch( char const* c ){
+		cerr << c << endl;
+		return -3;
+	}catch(...){
+		cerr << "Unhandled exception" << endl;
+		return -4;
+	}
+}
+
diff --git a/src/makeMc4Matrix.cpp b/src/makeMc4Matrix.cpp
new file mode 100644
index 0000000..391cf2a
--- /dev/null
+++ b/src/makeMc4Matrix.cpp
@@ -0,0 +1,112 @@
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <string>
+#include <fstream>
+#include <vector>
+#include <algorithm>
+#include <iostream>
+#include "libMems/IntervalList.h"
+
+using namespace std;
+using namespace genome;
+using namespace mems;
+
+/**
+ * Comparison functor for (Interval*, label) pairs.  Orders two pairs by the
+ * value their intervals return from LeftEnd() for one fixed sequence.
+ */
+class livComp {
+public:
+	/** @param seq  index of the sequence whose LeftEnd() is used as the key */
+	livComp( uint seq ) : m_seq( seq ) {}
+	/**
+	 * @return true when a's left end in sequence m_seq is less than b's.
+	 * Declared const so the functor can also be invoked through a const object.
+	 */
+	bool operator()( const pair< Interval*, uint >& a, const pair< Interval*, uint >& b ) const
+	{
+		return a.first->LeftEnd(m_seq) < b.first->LeftEnd(m_seq);
+	}
+protected:
+	uint m_seq;	// sequence index used as the sort key
+};
+
+/**
+ * Reads an XMFA alignment from argv[1] and writes a block-order matrix to
+ * argv[2].
+ *
+ * Only intervals whose Multiplicity() equals the number of input sequences
+ * are used.  Blocks are labelled 1..n by their left-end order in sequence 0,
+ * and any block that is reverse-oriented in sequence 0 is inverted first.
+ * For every sequence two comma-separated lines are written:
+ *  - the sequence file name followed by the contents of `other` (2n+2
+ *    values; two per block, ordered by that block's orientation), and
+ *  - the word "standard" followed by the signed block labels.
+ *
+ * @return 0 on success, a negative value on a usage, I/O or parsing error
+ */
+int main( int argc, char* argv[] )
+{
+	if( argc < 3 )
+	{
+		// name this program; the message used to say makeBadgerMatrix
+		cerr << "Usage: makeMc4Matrix <input xmfa> <output badger file>\n";
+		return -1;
+	}
+	ifstream aln_in;
+	aln_in.open( argv[1] );
+	if( !aln_in.is_open() ){
+		cerr << "Error opening " << argv[1] << endl;
+		return -1;
+	}
+	ofstream badger_out;
+	badger_out.open( argv[2] );
+	if( !badger_out.is_open() ){
+		cerr << "Error writing to " << argv[2] << endl;
+		return -1;
+	}
+
+	try{
+		IntervalList input_ivs;
+		input_ivs.ReadStandardAlignment( aln_in );
+		aln_in.close();
+
+		// keep only the blocks that contain every sequence
+		vector< pair< Interval*, uint > > labeled_ivs;
+		for( size_t ivI = 0; ivI < input_ivs.size(); ivI++ )
+		{
+			if( input_ivs[ivI].Multiplicity() != input_ivs.seq_filename.size() )
+				continue;	// not an N-way block
+			labeled_ivs.push_back( make_pair( &input_ivs[ivI], ivI ) );
+		}
+		for( uint seqI = 0; seqI < input_ivs.seq_filename.size(); ++seqI )
+		{
+			badger_out << input_ivs.seq_filename[seqI];
+			// order the blocks by their position in the current sequence
+			livComp lc(seqI);
+			std::sort( labeled_ivs.begin(), labeled_ivs.end(), lc );
+			if( seqI == 0 )
+			{
+				// sequence 0 defines the labels and the reference orientation
+				for( size_t ivI = 0; ivI < labeled_ivs.size(); ivI++ )
+				{
+					labeled_ivs[ivI].second = ivI + 1;
+					if( labeled_ivs[ivI].first->Orientation(seqI) == AbstractMatch::reverse )
+						labeled_ivs[ivI].first->Invert();
+				}
+			}
+			// two entries per block; the first and last slots keep their
+			// value-initialized 0
+			vector< size_t > other( labeled_ivs.size() * 2 + 2 );
+			for( size_t ivI = 0; ivI < labeled_ivs.size(); ivI++ )
+			{
+				if(labeled_ivs[ivI].first->Orientation(seqI) == AbstractMatch::forward)
+				{
+					other[ivI*2+1] = absolut(labeled_ivs[ivI].second)*2 - 1;
+					other[ivI*2+2] = absolut(labeled_ivs[ivI].second)*2;
+				}else{
+					other[ivI*2+1] = absolut(labeled_ivs[ivI].second)*2;
+					other[ivI*2+2] = absolut(labeled_ivs[ivI].second)*2 - 1;
+				}
+			}
+			for( size_t ivI = 0; ivI < other.size(); ivI++ )
+			{
+				badger_out << "," << other[ivI];
+			}
+			badger_out << "\nstandard";
+			for( size_t ivI = 0; ivI < labeled_ivs.size(); ivI++ )
+			{
+				badger_out << "," << (labeled_ivs[ivI].first->Orientation(seqI) == AbstractMatch::reverse? "-" : "") << labeled_ivs[ivI].second;
+			}
+
+			badger_out << endl;
+		}
+
+	}catch( gnException& gne ){
+		cerr << gne << endl;
+		return -1;
+	}catch( exception& e ){
+		cerr << e.what() << endl;
+		return -2;
+	}catch( char const* c ){
+		cerr << c << endl;
+		return -3;
+	}catch(...){
+		cerr << "Unhandled exception" << endl;
+		return -4;
+	}
+	return 0;
+}
+
diff --git a/src/mauveAligner.cpp b/src/mauveAligner.cpp
new file mode 100644
index 0000000..5eea936
--- /dev/null
+++ b/src/mauveAligner.cpp
@@ -0,0 +1,919 @@
+/*******************************************************************************
+ * $Id: memsApp.cpp,v 1.49 2004/04/23 00:18:45 darling Exp $
+ * This file is copyright 2002-2004 Aaron Darling. All rights reserved.
+ * Please see the file called COPYING for licensing, copying, and modification
+ * rights. Redistribution of this file, in whole or in part is prohibited
+ * without express permission.
+ ******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "mauveAligner.h"
+#include "getopt.h"
+#include <sstream>
+#include <stdexcept>
+#include "libGenome/gnSequence.h"
+#include "libMems/Matrix.h"
+#include "libMems/NumericMatrix.h"
+#include "libMems/DNAFileSML.h"
+#include "libMems/MemHash.h"
+#include "libMems/MaskedMemHash.h"
+#include "libMems/Aligner.h"
+#include "libMems/MatchList.h"
+#include "libMems/RepeatHash.h"
+#include "libMems/Interval.h"
+#include "libMems/IntervalList.h"
+#include "libMems/gnAlignedSequences.h"
+#include "libMems/Islands.h"
+#include "libMems/MuscleInterface.h"
+#include "libMems/DistanceMatrix.h"
+
+#include "boost/filesystem/operations.hpp"
+
+using namespace std;
+using namespace genome;
+using namespace mems;
+
+/**
+ * Guard object for a MatchList: when the guard is destroyed it calls
+ * Clear() on the list it was constructed with.
+ */
+class MLDeleter {
+	MatchList& mlist;	// the list to clear on destruction (not owned)
+public:
+	MLDeleter( MatchList& ml )
+		: mlist( ml )
+	{
+	}
+	~MLDeleter()
+	{
+		mlist.Clear();
+	}
+};
+
+// Element count of the array `a`: total size divided by the size of its first element.
+// Only meaningful for a real array; given a pointer it divides the pointer's size instead.
+#define NELEMS(a) ( sizeof( a ) / sizeof( *a ) )
+
+/**
+ * Program entry point.  On Win32 builds the process priority is lowered
+ * first; the arguments are then handed unchanged to doAlignment() and its
+ * result becomes the exit status.
+ */
+int main( int argc, char* argv[] ){
+#if WIN32
+	// CPU-bound console applications multi-task poorly under Win32.
+	// Lowering the process priority lets GUI applications stay
+	// responsive while the alignment runs.
+	SetPriorityClass(GetCurrentProcess(), BELOW_NORMAL_PRIORITY_CLASS);
+#endif
+	const int exit_status = doAlignment(argc, argv);
+	return exit_status;
+}
+
+/**
+ * Runs the mauveAligner command line tool, using libMems to produce full
+ * scale multiple genomic alignments.
+ *
+ *  1. The command line is parsed with getopt_long to get the names of the
+ *     data files and the user specified options.
+ *  2. Each sequence and its corresponding sorted mer list (SML) are loaded.
+ *  3. With --repeats a repeat list is written and the function returns.
+ *  4. Matches are read from --match-input, or searched for with a
+ *     MaskedMemHash when neither matches nor LCBs are read from a file.
+ *  5. With --mums (and no --lcb-input) the match list is written and the
+ *     function returns.
+ *  6. Otherwise LCBs are computed by the Aligner or read from --lcb-input,
+ *     and the optional outputs that were requested are written (XMFA
+ *     alignment, per-LCB alignment files, identity matrix, backbone, islands).
+ *
+ * @param argc  argument count as received by main()
+ * @param argv  argument vector as received by main()
+ * @return 0 on success, a negative value when an error was reported
+ */
+int doAlignment( int argc, char* argv[] ){
+try{
+	if( argc <= 0 ){
+		print_usage( "mauveAligner" );
+		return -1;
+	}
+	if( argc == 1 ){
+		print_usage( argv[0] );
+		return -1;
+	}
+
+	// set the Muscle path
+	MuscleInterface& mi = MuscleInterface::getMuscleInterface();
+	mi.ParseMusclePath( argv[0] );
+
+	//
+	// definitions of the variables that can be set by the user on the command line:
+	//
+	vector<string> seq_files;
+	vector<string> sml_files;
+	vector<gnSequence*> seq_table;
+	vector<DNAFileSML*> sml_table;
+	uint seed_size = 0;	// Use default settings
+	int seed_rank = 0;
+	boolean recursive = true;
+	boolean lcb_extension = true;
+	boolean gapped_alignment = true;
+	boolean create_LCBs = true;
+	boolean calculate_coverage = false;
+	int64 LCB_size = -1;
+	string output_file = "";
+	boolean read_matches = false;
+	boolean read_lcbs = false;
+	boolean find_repeats = false;
+	boolean print_stats = false;
+	boolean eliminate_overlaps = false;
+	boolean nway_filter = false;
+	boolean collinear_genomes = false;
+	string match_input_file = "";
+	string lcb_stats_file = "";
+	string island_file = "";
+	string lcb_file = "";
+	string tree_filename = "";
+	string coverage_list_file = "";
+	boolean output_alignment = false;
+	string alignment_output_dir = "";
+	string alignment_output_format = "";
+	string alignment_output_file = "";
+	string match_log = "";
+	string offset_log = "";
+	string merge_log = "";
+	// island related
+	uint island_size = 0;
+	uint island_break_min = 0;
+	// backbone related
+	uint backbone_size = 0;
+	uint max_backbone_gap = 0;
+	int64 min_r_gap_length = -1;
+	string backbone_file = "";
+	boolean output_backbone = false;
+	// for parallelization of LCB alignment
+	vector< int > realign_lcbs;
+	string muscle_args = "";
+	string gapped_aligner;
+
+	string permutation_filename;
+	int64 permutation_weight = -1;
+
+	boolean lcb_match_input_format = false;
+	int opt_max_extension_iters = -1;
+
+	uint seqI;
+	boolean print_version = false;
+	int max_gapped_alignment_length = -1;
+
+	ostream* detail_list_out = NULL;	/**< output stream for detail list */
+
+	//
+	// parse command line with gnu getopt
+	//
+	int opt;
+	int config_opt;
+	int ac = argc;
+	char** av = argv;
+	// 'm' mer size
+	// 'r' recursive
+	const char* short_args= "";
+	enum opt_names{
+		opt_mums,
+		opt_no_recursion,
+		opt_no_lcb_extension,
+		opt_no_gapped_alignment,
+		opt_seed_size,
+		opt_seed_type,
+		opt_weight,
+		opt_output,
+		opt_eliminate_overlaps,
+		opt_n_way_filter,
+		opt_match_input,
+		opt_lcb_input,
+		opt_output_alignment,
+		opt_id_matrix,
+		opt_island_size,
+		opt_island_output,
+		opt_island_break_min,
+		opt_backbone_size,
+		opt_max_backbone_gap,
+		opt_backbone_output,
+		opt_coverage_output,
+		opt_repeats,
+		opt_gapped_aligner,
+		opt_max_gapped_aligner_length,
+		opt_min_recursive_gap_length,
+		opt_output_guide_tree,
+		opt_alignment_output_dir,
+		opt_alignment_output_format,
+		opt_match_log,
+		opt_offset_log,
+		opt_merge_match_log,
+		opt_version,
+		opt_scratch_path,
+		opt_realign_lcb,
+		opt_id_matrix_input,
+		opt_collinear,
+		opt_muscle_args,
+		opt_permutation_matrix_output,
+		opt_permutation_matrix_min_weight,
+		opt_lcb_match_input,
+		opt_max_extension_iterations,
+	};
+	// every long option stores its opt_names value in config_opt and makes
+	// getopt_long return 0
+	struct option long_opts[] = {
+		{"mums", no_argument, &config_opt, opt_mums},
+		{"no-recursion", no_argument, &config_opt, opt_no_recursion},
+		{"no-lcb-extension", no_argument, &config_opt, opt_no_lcb_extension},
+		{"no-gapped-alignment", no_argument, &config_opt, opt_no_gapped_alignment},
+		{"seed-size", required_argument, &config_opt, opt_seed_size},
+		{"seed-type", required_argument, &config_opt, opt_seed_type},
+		{"weight", required_argument, &config_opt, opt_weight},
+		{"output", required_argument, &config_opt, opt_output},
+		{"eliminate-overlaps", no_argument, &config_opt, opt_eliminate_overlaps},
+		{"n-way-filter", no_argument, &config_opt, opt_n_way_filter},
+		{"match-input", required_argument, &config_opt, opt_match_input},
+		{"lcb-input", required_argument, &config_opt, opt_lcb_input},
+		{"output-alignment", optional_argument, &config_opt, opt_output_alignment},
+		{"id-matrix", optional_argument, &config_opt, opt_id_matrix},
+		{"island-size", required_argument, &config_opt, opt_island_size},
+		{"island-output", required_argument, &config_opt, opt_island_output},
+		{"island-break-min", required_argument, &config_opt, opt_island_break_min},
+		{"backbone-size", required_argument, &config_opt, opt_backbone_size},
+		{"max-backbone-gap", required_argument, &config_opt, opt_max_backbone_gap},
+		{"backbone-output", optional_argument, &config_opt, opt_backbone_output},
+		{"coverage-output", optional_argument, &config_opt, opt_coverage_output},
+		{"repeats", no_argument, &config_opt, opt_repeats},
+		{"max-gapped-aligner-length", required_argument, &config_opt, opt_max_gapped_aligner_length},
+		{"min-recursive-gap-length", required_argument, &config_opt, opt_min_recursive_gap_length},
+		{"output-guide-tree", required_argument, &config_opt, opt_output_guide_tree},
+		{"alignment-output-dir", required_argument, &config_opt, opt_alignment_output_dir},
+		{"alignment-output-format", required_argument, &config_opt, opt_alignment_output_format},
+		{"match-log", required_argument, &config_opt, opt_match_log},
+		{"offset-log", required_argument, &config_opt, opt_offset_log},
+		{"merge-match-log", required_argument, &config_opt, opt_merge_match_log},
+		{"version", no_argument, &config_opt, opt_version},
+		{"scratch-path", required_argument, &config_opt, opt_scratch_path},
+		{"realign-lcb", required_argument, &config_opt, opt_realign_lcb},
+		{"id-matrix-input", required_argument, &config_opt, opt_id_matrix_input},
+		{"collinear", no_argument, &config_opt, opt_collinear},
+		{"muscle-args", required_argument, &config_opt, opt_muscle_args},
+		{"permutation-matrix-output", required_argument, &config_opt, opt_permutation_matrix_output},
+		{"permutation-matrix-min-weight", required_argument, &config_opt, opt_permutation_matrix_min_weight},
+		{"lcb-match-input", no_argument, &config_opt, opt_lcb_match_input},
+		{"max-extension-iterations", required_argument, &config_opt, opt_max_extension_iterations},
+
+		{0, 0, 0, 0}	// for correct termination of option list
+						// getopt_long can segfault without this
+	};
+
+	int indexptr;
+	while( (opt = getopt_long( ac, av, short_args, long_opts, &indexptr )) != EOF ){
+		switch( opt ){
+			case 0:
+				switch(config_opt){
+					case opt_mums:
+						create_LCBs = false;
+						break;
+					case opt_no_recursion:
+						recursive = false;
+						break;
+					case opt_no_lcb_extension:
+						lcb_extension = false;
+						break;
+					case opt_no_gapped_alignment:
+						gapped_alignment = false;
+						break;
+					case opt_seed_size:
+						seed_size = atoi( optarg );
+						break;
+					case opt_seed_type:
+						if( strcmp( "solid", optarg ) == 0 )
+							seed_rank = SOLID_SEED;
+						else if( strcmp( "coding", optarg ) == 0 )
+							seed_rank = CODING_SEED;
+						else if( strcmp( "spaced", optarg ) == 0 )
+							seed_rank = 0;
+						else if( strcmp( "spaced1", optarg ) == 0 )
+							seed_rank = 1;
+						else if( strcmp( "spaced2", optarg ) == 0 )
+							seed_rank = 2;
+						else
+							cerr << "Warning: --seed-type parameter not understood. Using default spaced seeds\n";
+						break;
+					case opt_weight:
+						LCB_size = atol( optarg );
+						break;
+					case opt_output:
+						output_file = optarg;
+						break;
+					case opt_eliminate_overlaps:
+						eliminate_overlaps = true;
+						break;
+					case opt_n_way_filter:
+						nway_filter = true;
+						break;
+					case opt_match_input:
+						read_matches = true;
+						match_input_file = optarg;
+						break;
+					case opt_lcb_input:
+						lcb_file = optarg;
+						read_lcbs = true;
+						break;
+					case opt_output_alignment:
+						output_alignment = true;
+						if( optarg != NULL )
+							alignment_output_file = optarg;
+						break;
+					case opt_id_matrix:
+						// NOTE(review): accepted but ignored, so print_stats and
+						// lcb_stats_file are never set -- confirm this is intended
+						break;
+					case opt_island_size:
+						island_size = atoi( optarg );
+						break;
+					case opt_island_output:
+						island_file = optarg;
+						break;
+					case opt_island_break_min:
+						island_break_min = atoi( optarg );
+						break;
+					case opt_backbone_size:
+						backbone_size = atoi( optarg );
+						break;
+					case opt_max_backbone_gap:
+						max_backbone_gap = atoi( optarg );
+						break;
+					case opt_backbone_output:
+						// optional_argument: optarg is NULL when no file name was
+						// given, and a std::string must not be assigned from NULL
+						if( optarg != NULL )
+							backbone_file = optarg;
+						output_backbone = true;
+						break;
+					case opt_coverage_output:
+						if( optarg != NULL )
+							coverage_list_file = optarg;
+						calculate_coverage = true;
+						break;
+					case opt_repeats:
+						find_repeats = true;
+						break;
+					case opt_gapped_aligner:
+						gapped_aligner = optarg;
+						break;
+					case opt_max_gapped_aligner_length:
+						max_gapped_alignment_length = atoi( optarg );
+						break;
+					case opt_min_recursive_gap_length:
+						min_r_gap_length = atol( optarg );
+						break;
+					case opt_output_guide_tree:
+						tree_filename = optarg;
+						break;
+					case opt_alignment_output_dir:
+						alignment_output_dir = optarg;
+						break;
+					case opt_alignment_output_format:
+						alignment_output_format = optarg;
+						break;
+					case opt_match_log:
+						match_log = optarg;
+						break;
+					case opt_offset_log:
+						offset_log = optarg;
+						break;
+					case opt_merge_match_log:
+						merge_log = optarg;
+						break;
+					case opt_version:
+						print_version = true;
+						break;
+					case opt_scratch_path:
+						FileSML::registerTempPath( optarg );
+						break;
+					case opt_realign_lcb:
+						realign_lcbs.push_back( atoi( optarg ) );
+						break;
+					case opt_id_matrix_input:
+						// accepted but ignored; must not fall through and
+						// switch on --collinear as a side effect
+						break;
+					case opt_collinear:
+						collinear_genomes = true;
+						break;
+					case opt_muscle_args:
+						muscle_args = optarg;
+						mi.SetExtraMuscleArguments( muscle_args );
+						break;
+					case opt_permutation_matrix_output:
+						permutation_filename = optarg;
+						break;
+					case opt_permutation_matrix_min_weight:
+						permutation_weight = atol(optarg);
+						break;
+					case opt_lcb_match_input:
+						lcb_match_input_format = true;
+						break;
+					case opt_max_extension_iterations:
+						opt_max_extension_iters = atoi(optarg);
+						break;
+					default:
+						print_usage( argv[0] );
+						return -1;
+				}
+				break;
+			default:
+				print_usage( argv[0] );
+				return -1;
+		}
+	}
+	// now read in the seq and sml file names from av
+	// (remaining arguments alternate: sequence file, SML file, ...)
+	boolean seq_name_arg = true;
+	for( int optI = optind; optI < argc; optI++ ){
+		if( seq_name_arg )
+			seq_files.push_back( av[ optI ] );
+		else
+			sml_files.push_back( av[ optI ] );
+		seq_name_arg = !seq_name_arg;
+	}
+
+	// print the version if the user requested it
+	if( print_version ){
+		cerr << "mauveAligner " << " build date " << __DATE__ << " at " << __TIME__ << endl;
+	}
+
+
+	//
+	// check validity of command line option combinations
+	//
+	if( ( island_size != 0 && island_file == "" ) || ( island_size == 0 && island_file != "" ) ){
+		cerr << "Error: Both --island-output and --island-size must be specified to generate islands\n";
+		return -1;
+	}
+
+	if( (alignment_output_dir == "" && alignment_output_format != "") ||
+		(alignment_output_dir != "" && alignment_output_format == "") ){
+		cerr << "Error: Both --alignment-output-dir and --alignment-output-format must be specified in order to generate alignment output in a custom format\n";
+		return -1;
+	}
+
+	if( alignment_output_format != "" ){
+		if( !gnAlignedSequences::isSupportedFormat( alignment_output_format ) ){
+			cerr << "Error: " << alignment_output_format << " is not a supported alignment format.\n";
+			return -1;
+		}
+	}
+
+	if( find_repeats ){
+		if( create_LCBs || read_matches || read_lcbs || calculate_coverage ||
+			island_file != "" || island_size != 0 || recursive || lcb_stats_file != "" ){
+			cerr << "A paramater has been specified that is incompatible with repeat list generation\n";
+			return -1;
+		}
+	}
+
+	//
+	// done parsing and checking command line options
+	// Start doing the work
+	//
+
+	MatchList match_list;
+	// clears match_list on every exit path from here on
+	MLDeleter deleter( match_list );
+
+	if( seq_files.size() == 1 && sml_files.size() == 0 ){
+		// a single file with no SML: load it as a multi-FastA
+		LoadMFASequences( match_list, seq_files[0], &cout);
+		if( find_repeats || ( !read_lcbs && !read_matches ) )
+			match_list.CreateMemorySMLs(seed_size, &cout, seed_rank);
+	}else if( seq_files.size() != sml_files.size() ){
+		cerr << "Error: Each sequence file must have a corresponding SML file specified.\n";
+		return -1;
+	}else{
+		match_list.seq_filename = seq_files;
+		match_list.sml_filename = sml_files;
+		LoadSequences( match_list, &cout );
+		if( find_repeats || !read_matches || ( !read_lcbs && !read_matches ) )
+			match_list.LoadSMLs( seed_size, &cout, seed_rank );
+	}
+
+	// matches / LCBs go to --output when given, otherwise to stdout
+	ostream* match_out;
+	if( output_file != "" ){
+		ofstream* match_out_file = new ofstream( output_file.c_str() );
+		if( !match_out_file->is_open() ){
+			cerr << "Error opening " << output_file << endl;
+			return -2;
+		}
+		match_out = match_out_file;
+	}else
+		match_out = &cout;
+
+	// search for repetitive regions
+	if( find_repeats ){
+		RepeatHash repeat_finder;
+		repeat_finder.LogProgress( &cout );
+		repeat_finder.FindMatches( match_list );
+		WriteList( match_list, *match_out );
+		match_out->flush();
+		return 0;
+	}
+
+	// read matches if the user requested it
+	if( read_matches ){
+		ifstream match_in( match_input_file.c_str() );
+		if( !match_in.is_open() ){
+			cerr << "Error opening " << match_input_file << endl;
+			return -2;
+		}
+		if( !lcb_match_input_format )
+		{
+			try{
+				ReadList( match_list, match_in );
+			}catch( gnException& gne ){
+				cerr << "Error reading " << match_input_file << "\nPossibly corrupt file or invalid file format\n";
+				return -2;
+			}
+		}else{
+			// the input holds matches clustered into LCBs: flatten them
+			// back into a plain match list
+			IntervalList m_iv_list;
+			m_iv_list.ReadList( match_in );
+			for( int ivI = 0; ivI < m_iv_list.size(); ivI++ ){
+				for( int mI = 0; mI < m_iv_list[ivI].GetMatches().size(); mI++ ){
+					Match* m = dynamic_cast< Match* >(m_iv_list[ivI].GetMatches()[mI]);
+					if( m != NULL && m->Multiplicity() > 1)
+						match_list.push_back(m->Copy());
+				}
+			}
+		}
+		if( seq_files.size() > 1 )
+			match_list.seq_filename = seq_files;
+		else if( match_list.seq_table.size() == 0 )
+			// fill seq_table with empty sequences
+			for( seqI = 0; seqI < match_list.seq_filename.size(); seqI++ )
+				match_list.seq_table.push_back( new gnSequence() );
+	}else if ( !read_lcbs ){
+		// get full subset matches
+		MaskedMemHash match_finder;
+
+		if( nway_filter ){
+			// only find the n-way matches
+			uint64 nway_mask = 1;
+			nway_mask <<= match_list.seq_table.size();
+			nway_mask--;
+			match_finder.SetMask( nway_mask );
+		}
+		match_finder.LogProgress( &cout );
+		fstream match_log_out;
+		if( match_log != "" ){
+			match_log_out.open( match_log.c_str(), ios::in | ios::out );
+			if( !match_log_out.is_open() ){
+				cerr << "Error opening " << match_log << endl;
+				return -1;
+			}
+			match_finder.SetMatchLog( &match_log_out );
+			// append to whatever's already in the file
+			match_log_out.seekg( 0, ios::end );
+		}
+		fstream offset_log_out;
+		vector< gnSeqI > offset_start;
+		for( seqI = 0; seqI < match_list.seq_table.size(); seqI++ )
+			offset_start.push_back( 0 );
+
+		if( offset_log != "" ){
+			offset_log_out.open( offset_log.c_str(), ios::in | ios::out );
+			if( !offset_log_out.is_open() ){
+				cerr << "Error opening " << offset_log << endl;
+				return -1;
+			}
+			match_finder.SetOffsetLog( &offset_log_out );
+			// the last line of the log holds the offsets to resume from
+			string last_line;
+			string cur_line;
+			while( getline( offset_log_out, cur_line ) ){
+				last_line = cur_line;
+			}
+			if( last_line != "" ){
+				stringstream cur_off_stream( last_line );
+				for( seqI = 0; seqI < match_list.seq_table.size(); seqI++ )
+					cur_off_stream >> offset_start[ seqI ];
+			}
+			// reading to EOF set the fail bits; reset them before the log is used again
+			offset_log_out.clear();
+		}
+		ifstream merge_log_in;
+		if( merge_log != "" ){
+			merge_log_in.open( merge_log.c_str() );
+			if( !merge_log_in.is_open() ){
+				cerr << "Error opening " << merge_log << endl;
+				return -1;
+			}
+
+			for( seqI = 0; seqI < match_list.seq_table.size(); seqI++ ){
+				if( !match_finder.AddSequence( match_list.sml_table[ seqI ], match_list.seq_table[ seqI ] ) ){
+					ErrorMsg( "Error adding " + match_list.seq_filename[seqI] + "\n");
+					return -1;
+				}
+			}
+			match_finder.LoadFile( merge_log_in );
+			match_finder.GetMatchList( match_list );
+		}else{
+			match_finder.FindMatchesFromPosition( match_list, offset_start );
+		}
+		match_log_out.close();
+		offset_log_out.close();
+		match_finder.Clear();
+	}
+
+
+	// write out a match list if the user doesn't want LCBs
+	if( !create_LCBs && !read_lcbs){
+		if( eliminate_overlaps ){
+			EliminateOverlaps( match_list );
+		}
+
+		if( nway_filter ){
+			match_list.MultiplicityFilter( match_list.seq_table.size() );
+		}
+
+		WriteList( match_list, *match_out );
+		match_out->flush();
+
+		// output a guide tree or a coverage list if necessary
+		// beware that selecting the nway filter above will cause the guide tree
+		// and coverage lists to be incorrect
+		vector< pair< uint64, uint64 > > coverage_list;
+		if( tree_filename != "" || calculate_coverage ){
+			// only count each base pair once!
+			if( !eliminate_overlaps )
+				EliminateOverlaps( match_list );
+		}
+
+		if( tree_filename != "" ){
+			// NOTE(review): coverage_list is still empty at this point -- confirm
+			// that DistanceMatrix is meant to be called with it
+			NumericMatrix< double > distance;
+			DistanceMatrix( match_list.seq_table.size(), coverage_list, distance );
+			MuscleInterface& mi = MuscleInterface::getMuscleInterface();
+			if( tree_filename == "" )
+				tree_filename = CreateTempFileName("guide_tree");
+			mi.CreateTree( distance, tree_filename );
+		}
+
+		return 0;
+	}
+
+	// check whether the input sequences were masked to eliminate excess NNNNNs
+	for( seqI = 0; seqI < match_list.sml_table.size(); seqI++ ){
+		FileSML* cur_sml = dynamic_cast< FileSML* >(match_list.sml_table[ seqI ]);
+		if( cur_sml != NULL ){
+			const vector< int64 >& seq_coords = cur_sml->getUsedCoordinates();
+			if( seq_coords.size() > 0 ){
+				transposeMatches( match_list, seqI, seq_coords );
+			}
+		}
+	}
+
+	// at this point any SortedMerLists used to identify the initial set of MUMs
+	// are no longer necessary. Free them
+	for( uint smlI = 0; smlI < match_list.sml_table.size(); smlI++ ){
+		match_list.sml_table[ smlI ]->Clear();
+		delete match_list.sml_table[ smlI ];
+	}
+	match_list.sml_table.clear();
+
+	// Align the sequences if necessary
+	if( LCB_size < 0 ){
+		// calculate a default LCB weight, 3 times the mer size times the seq. count
+		if( seed_size <= 0 )
+			seed_size = MatchList::GetDefaultMerSize( match_list.seq_table );
+		LCB_size = seed_size * 3 * match_list.seq_table.size();
+	}else{
+		// adjust the LCB weight for the number of sequences being aligned
+		LCB_size *= match_list.seq_table.size();
+	}
+
+	// check that LCB_size can be set appropriately
+	if( create_LCBs && LCB_size < 0) {
+		cerr << "A minimum LCB size greater than 0 must be specified in order to create LCBs.\n";
+		return -1;
+	}
+
+	// hack to communicate that the genomes are collinear
+	if( collinear_genomes )
+		LCB_size = -1;
+
+	Aligner aligner( match_list.seq_table.size() );
+
+	if( min_r_gap_length >= 0 ){
+		aligner.SetMinRecursionGapLength( min_r_gap_length );
+	}
+
+	aligner.SetGappedAligner( MuscleInterface::getMuscleInterface() );
+	if( max_gapped_alignment_length != -1 )
+		aligner.SetMaxGappedAlignmentLength( max_gapped_alignment_length );
+
+	if( permutation_weight != -1 && permutation_filename == "" )
+		cerr << "A permutation output file must be specified to generate signed permutations\n";
+	if( permutation_weight == -1 && permutation_filename != "" )
+		permutation_weight = LCB_size;
+	if( permutation_weight != -1 )
+	{
+		permutation_weight *= match_list.seq_table.size();
+		aligner.SetPermutationOutput( permutation_filename, permutation_weight );
+	}
+	if( opt_max_extension_iters != -1 )
+	{
+		aligner.SetMaxExtensionIterations(opt_max_extension_iters);
+	}
+
+	IntervalList interval_list;
+	interval_list.seq_table = match_list.seq_table;
+	interval_list.seq_filename = match_list.seq_filename;
+	if( lcb_file == "" ){
+
+		try{
+			aligner.align( match_list, interval_list, 0, LCB_size, recursive, lcb_extension, gapped_alignment, tree_filename );
+		}catch( gnException& gne ){
+			// reported only; whatever is in interval_list is still written below
+			cerr << gne << endl;
+		}
+		interval_list.WriteList( *match_out );
+		match_out->flush();
+
+	}else if( read_lcbs ){
+		ifstream lcb_input( lcb_file.c_str() );
+		if( !lcb_input.is_open() ){
+			cerr << "Error opening " << lcb_file << endl;
+			return -2;
+		}
+		try{
+
+			interval_list.seq_table = match_list.seq_table;
+			interval_list.seq_filename = match_list.seq_filename;
+			interval_list.ReadList( lcb_input );
+//			addUnalignedIntervals( interval_list );
+		}catch( gnException& gne ){
+			cerr << gne << endl;
+			cerr << "Error reading " << lcb_file << "\nPossibly corrupt file or invalid file format\n";
+			return -2;
+		}
+	}
+	if( realign_lcbs.size() > 0 ){
+		// set up a new IntervalList
+		IntervalList realigned_intervals;
+		realigned_intervals.seq_table = interval_list.seq_table;
+		realigned_intervals.seq_filename = interval_list.seq_filename;
+		for( size_t realignI = 0; realignI < realign_lcbs.size(); realignI++ ){
+			// use the LCB number given with --realign-lcb, not the loop counter
+			// NOTE(review): treated as a zero-based index into interval_list,
+			// matching the lcb_<n>.txt numbering used below -- confirm
+			const int lcb_index = realign_lcbs[ realignI ];
+			if( lcb_index < 0 || static_cast< size_t >( lcb_index ) >= interval_list.size() ){
+				cerr << "Error: --realign-lcb value " << lcb_index << " is out of range\n";
+				return -1;
+			}
+			// extract a match list from the interval list for this LCB
+			Interval& iv = interval_list[ lcb_index ];
+			// clear any matches from the current match_list
+			match_list.clear();
+			for( int matchI = 0; matchI < iv.GetMatches().size(); matchI++ ){
+				AbstractMatch* m = iv.GetMatches()[ matchI ];
+				Match* match = dynamic_cast< Match* >( m );
+				if( match != NULL && m->Multiplicity() > 1)
+					match_list.push_back( match->Copy() );
+			}
+			aligner.align( match_list, realigned_intervals, 0, LCB_size, recursive, false, gapped_alignment, tree_filename );
+		}
+
+		// once all intervals have been realigned reset the interval_list
+		interval_list = realigned_intervals;
+	}
+
+	if( output_alignment ){
+		if( !gapped_alignment )
+			addUnalignedIntervals( interval_list );
+		if( alignment_output_file == "" || alignment_output_file == "-" ){
+			interval_list.WriteStandardAlignment( cout );
+		}else{
+			ofstream align_out( alignment_output_file.c_str() );
+			if( !align_out.is_open() ){
+				cerr << "Error opening " << alignment_output_file << endl;
+				return -1;
+			}
+			interval_list.WriteStandardAlignment( align_out );
+			align_out.close();
+		}
+	}
+	uint lcbI;
+
+	// output alignments in another format if the user asked for it
+	if( alignment_output_dir != "" ){
+		boost::filesystem::path output_dir = alignment_output_dir;
+		boost::filesystem::create_directory( output_dir );
+
+		// one file per LCB, named lcb_<index>.txt
+		for( lcbI = 0; lcbI < interval_list.size(); lcbI++ ){
+			gnAlignedSequences gnas;
+			interval_list[ lcbI ].GetAlignedSequences( gnas, match_list.seq_table );
+			ostringstream oss;
+			oss << "lcb_" << lcbI << ".txt";
+			boost::filesystem::path outtie = output_dir / oss.str();
+			ofstream alignment_lcb_out( outtie.string().c_str(), ios::trunc );
+			if( !alignment_lcb_out.is_open() ){
+				cerr << "Error opening " << oss.str() << endl;
+				return -1;
+			}
+			gnas.output( alignment_output_format, alignment_lcb_out );
+		}
+	}
+
+	//
+	// output an identity matrix if requested
+	//
+	if( print_stats ){
+		ostream* stats_out;
+		if( lcb_stats_file == "" || lcb_stats_file == "-" ){
+			stats_out = &cout;
+		}else{
+			ofstream* stats_out_file = new ofstream( lcb_stats_file.c_str() );
+			if( !stats_out_file->is_open() ){
+				cerr << "Error opening " << lcb_stats_file << endl;
+				return -1;
+			}
+			stats_out = stats_out_file;
+		}
+		NumericMatrix< double > identity;
+		IdentityMatrix( interval_list, identity );
+		identity.print( *stats_out );
+		// free only the stream allocated above; never delete &cout
+		if( lcb_stats_file != "" && lcb_stats_file != "-" ){
+			delete stats_out;
+		}
+	}
+
+	//
+	// output backbone if it was requested
+	//
+	if( output_backbone ){
+		ostream* backbone_out;
+		if( backbone_file != "" ){
+			ofstream* backbone_out_file = new ofstream( backbone_file.c_str() );
+			if( !backbone_out_file->is_open() ){
+				cerr << "Error opening " << backbone_file << endl;
+				return -1;
+			}
+			backbone_out = backbone_out_file;
+		}else
+			backbone_out = &cout;
+
+		vector< GappedAlignment > backbone_data;
+		simpleFindBackbone( interval_list, backbone_size, max_backbone_gap, backbone_data );
+		outputBackbone( backbone_data, *backbone_out );
+		if( backbone_file != "" ){
+			delete backbone_out;
+		}
+	}
+
+	//
+	// output islands if they were requested
+	//
+	if( island_file != "" ){
+		ostream* island_out;
+		if( island_file == "-" )
+			island_out = &cout;
+		else{
+			ofstream* island_out_file = new ofstream( island_file.c_str() );
+			if( !island_out_file->is_open() ){
+				cerr << "Error opening " << island_file << endl;
+				return -1;
+			}
+			island_out = island_out_file;
+		}
+		simpleFindIslands( interval_list, island_size, *island_out );
+		findIslandsBetweenLCBs( interval_list, island_size, *island_out );
+
+		if( island_file != "-" ){
+			delete island_out;
+		}
+	}
+	match_list.clear();	// bad.  leaks memory.
+}catch( gnException& gne ) {
+	cerr << "Unhandled gnException: " << gne << endl;
+	return -10;
+}catch( exception& e ) {
+	cerr << "Unhandled exception: " << e.what() << endl;
+	return -11;
+}catch( const char* message ){
+	// const char* also matches thrown string literals, which char* does not
+	cerr << "Unhandled exception: " << message << endl;
+	return -12;
+}catch(...){
+	cerr << "Unknown exception occurred.\n";
+	return -13;
+}
+
+	return 0;
+}
+
+/**
+ * Writes the command line synopsis, the option summary and the list of
+ * supported alignment output formats to stderr.
+ *
+ * @param pname  program name to show in the synopsis line
+ */
+void print_usage( const char* pname ){
+	cerr << "Usage:" << endl;
+	cerr << pname << " [options] <seq1 filename> <sml1 filename> ... "
+		<< " <seqN filename> <smlN filename>" << endl;
+	cerr << "Options:" << endl;
+	cerr << "\t --output=<file> Output file name. Prints to screen by default" << endl;
+	cerr << "\t --mums Find MUMs only, do not attempt to determine locally collinear blocks (LCBs)\n";
+	cerr << "\t --no-recursion Don't perform recursive anchor identification (implies --no-gapped-alignment)" << endl;
+	cerr << "\t --no-lcb-extension If determining LCBs, don't attempt to extend the LCBs\n";
+	cerr << "\t --seed-size=<number> Initial seed match size, default is log_2( average seq. length )" << endl;
+	cerr << "\t --max-extension-iterations=<number> Limit LCB extensions to this number of attempts, default is 4\n";
+	// the option parser accepts --eliminate-overlaps; --eliminate-inclusions is rejected
+	cerr << "\t --eliminate-overlaps Eliminate linked inclusions in subset matches.\n";
+	cerr << "\t --weight=<number> Minimum LCB weight in base pairs per sequence" << endl;
+	cerr << "\t --match-input=<file> Use specified match file instead of searching for matches\n";
+	cerr << "\t --lcb-match-input Indicates that the match input file contains matches that have been clustered into LCBs\n";
+	cerr << "\t --lcb-input=<file> Use specified lcb file instead of constructing LCBs (skips LCB generation)\n";
+	cerr << "\t --scratch-path=<path> For large genomes, use a directory for storage of temporary data. Should be given two or more times with different paths.\n";
+	cerr << "\t --id-matrix=<file> Generate LCB stats and write them to the specified file\n";
+	cerr << "\t --island-size=<number> Find islands larger than the given number\n";
+	cerr << "\t --island-output=<file> Output islands to the given file (requires --island-size)\n";
+	cerr << "\t --backbone-size=<number> Find stretches of backbone longer than the given number of b.p.\n";
+	cerr << "\t --max-backbone-gap=<number> Allow backbone to be interrupted by gaps up to this length in b.p.\n";
+	cerr << "\t --backbone-output=<file> Output backbone to the given file\n";
+	cerr << "\t --coverage-output=<file> Output a coverage list to the specified file (- for stdout)\n";
+	cerr << "\t --repeats Generates a repeat map. Only one sequence can be specified\n";
+	cerr << "\t --output-guide-tree=<file> Write out a guide tree to the designated file\n";
+	cerr << "\t --collinear Assume that input sequences are collinear--they have no rearrangements\n";
+	cerr << "\nGapped alignment controls:\n";
+	cerr << "\t --no-gapped-alignment Don't perform a gapped alignment\n";
+	cerr << "\t --max-gapped-aligner-length=<number> Maximum number of base pairs to attempt aligning with the gapped aligner\n";
+	cerr << "\t --min-recursive-gap-length=<number> Minimum size of gaps that Mauve will perform recursive MUM anchoring on (Default is 200)\n";
+	cerr << "\nSigned permutation matrix options:\n";
+	cerr << "\t --permutation-matrix-output=<file> Write out the LCBs as a signed permutation matrix to the given file\n";
+	cerr << "\t --permutation-matrix-min-weight=<number> A permutation matrix will be written for every set of LCBs with weight between this value and the value of --weight\n";
+	cerr << "\nAlignment output options:\n";
+	cerr << "\t --alignment-output-dir=<directory> Outputs a set of alignment files (one per LCB) to a given directory\n";
+	cerr << "\t --alignment-output-format=<format> Selects the output format for --alignment-output-dir\n";
+	cerr << "\t --output-alignment=<file> Write out an XMFA format alignment to the designated file\n";
+	cerr << endl;
+
+	// list the format names, comma separated
+	const vector< string >& formats = gnAlignedSequences::getSupportedFormats();
+	cerr << "Supported alignment output formats are: ";
+	for( size_t formatI = 0; formatI < formats.size(); formatI++ ){
+		if( formatI > 0 )
+			cerr << ", ";
+		cerr << formats[ formatI ];
+	}
+	cerr << endl;
+	cerr << endl;
+}
+
diff --git a/src/mauveAligner.h b/src/mauveAligner.h
new file mode 100644
index 0000000..4fcf2da
--- /dev/null
+++ b/src/mauveAligner.h
@@ -0,0 +1,10 @@
+#ifndef _MAUVEALIGNER_H
+#define _MAUVEALIGNER_H 1
+
+/** Prints command-line usage help; `pname` is the program name shown in the help text. */
+void print_usage( const char* pname );
+
+/** Alignment entry point: takes the argc/argv pair given to main(); main() returns its result as the exit status. */
+int doAlignment( int argc, char* argv[] );
+
+#endif // _MAUVEALIGNER_H
diff --git a/src/mauveToXMFA.cpp b/src/mauveToXMFA.cpp
new file mode 100644
index 0000000..8f7cdfc
--- /dev/null
+++ b/src/mauveToXMFA.cpp
@@ -0,0 +1,35 @@
+#include "libMems/IntervalList.h"
+#include <fstream>
+#include <string>
+#include "libMems/MatchList.h"
+
+using namespace std;
+using namespace genome;
+using namespace mems;
+
+int main( int argc, char* argv[] )
+{   // Converts the Mauve alignment file argv[1] into the XMFA file argv[2].
+    if( argc != 3 ){
+        cerr << "Usage: mauveToXMFA <Mauve Alignment input> <XMFA output>\n";
+        return -1;
+    }
+    const char* const input_path = argv[1];
+    const char* const output_path = argv[2];
+    // both streams are opened before any parsing starts; each failure has its own exit code
+    ifstream alignment_in( input_path );
+    if( !alignment_in.is_open() ){
+        cerr << "Error opening \"" << input_path << "\"\n";
+        return -2;
+    }
+    ofstream xmfa_out( output_path );
+    if( !xmfa_out.is_open() ){
+        cerr << "Error opening \"" << output_path << "\"\n";
+        return -3;
+    }
+    // read the alignment, let LoadSequences() work on it (it is handed cout), then write XMFA
+    IntervalList alignment;
+    alignment.ReadList( alignment_in );
+    LoadSequences(alignment, &cout);
+    alignment.WriteStandardAlignment( xmfa_out );
+}
+
diff --git a/src/mfa2xmfa.cpp b/src/mfa2xmfa.cpp
new file mode 100644
index 0000000..60de675
--- /dev/null
+++ b/src/mfa2xmfa.cpp
@@ -0,0 +1,117 @@
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libGenome/gnFASSource.h"
+#include "libGenome/gnSequence.h"
+#include <algorithm>
+#include <fstream>
+
+using namespace std;
+using namespace genome;
+// Writes the one-line command synopsis to stderr; `pname` is printed as the program name.
+void print_usage( const char* pname ){
+    cerr << "Usage: " << pname << " <MFA alignment input> <XMFA alignment output> [Unaligned FastA output]\n";
+}
+
+int main( int argc, char* argv[] ) { // Converts the multi-FastA alignment argv[1] to XMFA argv[2]; optional argv[3] receives the gap-stripped sequences
+ if( argc < 3 ){
+ if( argc == 0 ) // no argv[0] available, fall back to a fixed program name
+ print_usage( "mfa2xmfa" );
+ else
+ print_usage( argv[0] );
+ return -1;
+ }
+// load the input; every contig of mfa_seq is treated as one row of the alignment below
+ gnSequence mfa_seq;
+ string mfa_name = argv[1];
+ try{
+ mfa_seq.LoadSource( mfa_name );
+ }catch( gnException& gne ){
+ cerr << gne << endl;
+ return -1;
+ }
+ ofstream xmfa_out( argv[2] );
+ if( !xmfa_out.is_open() ){
+ cerr << "Error opening " << argv[2] << endl;
+ return -1;
+ }
+
+ // if unaligned mfa output is desired, create it now
+ string mfa_output_name; // stays empty unless argv[3] was given; tested again when the per-sequence header is written
+ ofstream mfa_out;
+ if(argc >= 4)
+ {
+ mfa_output_name = argv[3];
+ mfa_out.open( mfa_output_name.c_str() );
+ if( !mfa_out.is_open() ){
+ cerr << "Error opening " << argv[3] << endl;
+ return -1;
+ }
+
+ gnSequence unaligned; // collects one gap-free contig per input contig
+ for( size_t seqI = 0; seqI < mfa_seq.contigListSize(); seqI++ )
+ {
+ string cur_seq = mfa_seq.contig(seqI).ToString();
+ string::iterator striter = std::remove( cur_seq.begin(), cur_seq.end(), '-' ); // remove/resize pair drops every '-' character
+ cur_seq.resize( striter - cur_seq.begin() );
+ unaligned += cur_seq;
+ unaligned.setContigName( seqI, mfa_seq.contigName(seqI) ); // keep the original contig name
+ }
+ gnFASSource::Write( unaligned, mfa_out, false, false ); // NOTE(review): meaning of the two boolean flags is not visible here
+
+ // create xmfa header if unaligned seq is to be written
+ xmfa_out << "#FormatVersion Mauve1\n";
+ }
+
+ unsigned int seq_count = mfa_seq.contigListSize();
+ // find the max length alignment entry and add gaps
+ // to the ends of shorter entries for consistency
+ gnSeqI max_length = 0;
+ for( uint seqI = 0; seqI < seq_count; seqI++ ){
+ if( mfa_seq.contigLength( seqI ) > max_length )
+ max_length = mfa_seq.contigLength( seqI );
+ }
+
+
+ // count the number of base pairs in each sequence
+ vector< gnSeqI > seq_lens( seq_count, 0 ); // seq_lens[i] = number of non-gap characters in row i
+ for( uint seqI = 0; seqI < seq_count; seqI++ ){
+ string cur_seq;
+ mfa_seq.contig( seqI ).ToString(cur_seq);
+ for( gnSeqI baseI = 0; baseI < cur_seq.length(); baseI++ ){
+ if( cur_seq[ baseI ] != '-' )
+ seq_lens[ seqI ]++;
+ }
+ // fill in xmfa header details if unaligned seq is to be written
+ if(mfa_output_name.size() > 0)
+ {
+ xmfa_out << "#Sequence" << seqI + 1 << "File\t" << mfa_output_name << endl; // header numbering is 1-based
+ xmfa_out << "#Sequence" << seqI + 1 << "Entry\t" << seqI + 1 << endl;
+ xmfa_out << "#Sequence" << seqI + 1 << "Format\tFastA\n";
+ }
+ }
+
+ // write xmfa body
+ for( uint seqI = 0; seqI < seq_count; seqI++ ){
+ string cur_seq;
+ mfa_seq.contig( seqI ).ToString(cur_seq);
+ // if the alignment entry is shorter than the max length
+ // then add gaps to the end for consistency
+ if( cur_seq.length() < max_length )
+ cur_seq += string( max_length - cur_seq.length(), '-' );
+// entry header: "> <1-based row>:1-<non-gap length> + <contig name>"; NOTE(review): an all-gap row yields the range "1-0"
+ xmfa_out << "> " << seqI + 1 << ":";
+ xmfa_out << 1 << "-" << seq_lens[ seqI ] << " + " << mfa_seq.contigName( seqI );
+ xmfa_out << endl;
+ gnSeqI cur_pos = 0;
+ for( ; cur_pos < cur_seq.length(); cur_pos += 80 ){ // emit the padded row in lines of at most 80 characters
+ gnSeqI cur_len = cur_pos + 80 < cur_seq.length() ? 80 : cur_seq.length() - cur_pos; // 80, or whatever is left on the last line
+ xmfa_out.write( cur_seq.data() + cur_pos, cur_len );
+ xmfa_out << endl;
+ }
+ }
+
+ xmfa_out << "=" << endl; // a single "=" line closes the one alignment block written above
+
+}
diff --git a/src/multiEVD.cpp b/src/multiEVD.cpp
new file mode 100644
index 0000000..8952683
--- /dev/null
+++ b/src/multiEVD.cpp
@@ -0,0 +1,217 @@
+/**
+ * multiEVD
+ * (c)left 2007 aaron darling
+ * A program to calculate the extreme value distribution of alignment drops in homologous sequence.
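+ * INPUT: the number of simulation runs; for each run N the alignment alignjob.N/evolved.dat and the sequences alignjob.N/evolved_seqs.fas are read from the working directory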
+ * OUTPUT: the 95%ile, 99%ile, etc of scores in the extreme value distribution
+ * THEORY:
+ * computes inverse substitution and gap scores, never allowing
+ * the inverse score to drop below 0. Each time the score rises above 0, an "excursion" begins, and when the score
+ * drops back to 0, the excursion has ended. The highest score achieved by the excursion is the "extreme value".
+ * Each extreme value is recorded, and the distribution of these extreme values is what gets output.
+ */
+#include "libMems/Islands.h"
+#include "libMems/IntervalList.h"
+#include "libMems/MatchList.h"
+#include "libMems/MuscleInterface.h"
+#include "libGenome/gnSequence.h"
+#include "libMems/MatchProjectionAdapter.h"
+#include "libMems/ProgressiveAligner.h"
+
+#include <sstream>
+
+#include <boost/multi_array.hpp>
+
+using namespace std;
+using namespace genome;
+using namespace mems;
+
+template< typename MatchVector >
+void getLocalRecordHeights( const MatchVector& iv_list, std::vector< genome::gnSequence* >& seq_table, vector< score_t >& lrh ) // appends to lrh the peak of every completed score excursion found in the intervals of iv_list
+{
+ typedef typename MatchVector::value_type MatchType; // elements are dereferenced below, so they must be pointer-like
+ if( iv_list.size() == 0 )
+ return;
+ uint seq_count = seq_table.size(); // NOTE(review): never read afterwards
+ for( uint iv_listI = 0; iv_listI < iv_list.size(); iv_listI++ ){
+ const MatchType& iv = iv_list[ iv_listI ];
+ std::vector< std::string > aln_table;
+ GetAlignment( *iv, seq_table, aln_table );
+
+ std::vector< score_t > scores; // filled by computeSPScore; indexed by column in the loops below
+ PairwiseScoringScheme pss; // default-constructed scoring scheme
+ score_t total_score; // output of computeSPScore, not used afterwards
+
+ stripGapColumns(aln_table);
+ computeSPScore( aln_table, pss, scores, total_score ); // presumably a sum-of-pairs score per column -- confirm against libMems
+
+ // Invert the scores since we're trying to detect rare bouts of non-homologous sequence
+ for( size_t sI = 0; sI < scores.size(); ++sI )
+ if( scores[sI] != INVALID_SCORE) // the INVALID_SCORE sentinel must not be negated
+ scores[sI] = -scores[sI];
+
+ score_t score_sum = 0; // start in an hss
+ score_t local_record_height = 0; // highest running sum seen since the last recorded excursion
+ for( size_t colI = 0; colI < scores.size(); ++colI )
+ {
+ if( scores[colI] == INVALID_SCORE )
+ continue;
+// three cases: the running sum would go negative / an excursion starts from zero / an excursion continues
+ if( score_sum > 0 && score_sum + scores[colI] < 0 )
+ {
+ // end of an excursion
+ score_sum = 0;
+ lrh.push_back( local_record_height );
+ local_record_height = 0;
+ }else if( score_sum == 0 && scores[colI] > 0 )
+ {
+ // start a new excursion
+ score_sum += scores[colI];
+ if( score_sum > local_record_height )
+ local_record_height = score_sum;
+ }else if( score_sum > 0 ){ // NOTE(review): a sum that lands exactly on 0 takes this branch, so nothing is pushed and local_record_height carries over
+ score_sum += scores[colI];
+ if( score_sum > local_record_height )
+ local_record_height = score_sum;
+ }
+ }
+ } // NOTE(review): an excursion still open at the end of an interval is dropped without being recorded
+}
+
+//bad: copied from progressiveAligner.cpp
+template< class BoostMatType >
+void print2d_matrix( BoostMatType& mat, std::ostream& os )
+{   // Dumps `mat` as text: one matrix row per output line, cells separated by tabs.
+    const size_t row_count = mat.shape()[0];
+    const size_t col_count = mat.shape()[1];
+    for( size_t rowI = 0; rowI != row_count; ++rowI ){
+        for( size_t colI = 0; colI != col_count; ++colI ){
+            if( colI != 0 )
+                os << "\t";  // separator goes before every cell except the first
+            os << mat[rowI][colI];
+        }
+        os << endl;  // ends the row and flushes the stream
+    }
+}
+
+
+// Reads alignjob.<N>/evolved.dat and alignjob.<N>/evolved_seqs.fas for each run N, pools excursion heights per sequence multiplicity and writes the percentile tables to stdout.
+int main( int argc, char* argv[] )
+{
+    if( argc != 2 )
+    {
+        cerr << "Usage: multiEVD <simulation run count>\n";
+        cerr << "This program must be run from a directory which contains alignjob directories\n";
+        return -1;
+    }
+    int run_count = atoi( argv[1] );
+    int simu_count = 0;  // number of runs whose alignment file could be opened
+    // lrh_all[m] pools the excursion heights measured while m sequences were left in the alignment
+    vector< vector< score_t > > lrh_all;
+    size_t seq_count = 0;  // taken from the first run that loads; later runs are assumed to have the same count
+    for( int runI = 0; runI < run_count; ++runI )
+    {
+        IntervalList iv_list;
+        stringstream aln_fname;
+        aln_fname << "alignjob." << runI << "/evolved.dat";
+        ifstream in_file( aln_fname.str().c_str() );
+        if( !in_file.is_open() )
+        {
+            cerr << "Error opening " << aln_fname.str() << endl;
+            continue;  // an unreadable run is skipped, not fatal
+        }
+        simu_count++;
+        iv_list.ReadStandardAlignment(in_file);
+        stringstream seq_fname;
+        seq_fname << "alignjob." << runI << "/evolved_seqs.fas";
+        MatchList ml;
+        LoadMFASequences(ml, seq_fname.str(), &cout);
+        iv_list.seq_table = ml.seq_table;
+        if( seq_count == 0 )
+        {
+            seq_count = iv_list.seq_table.size();
+            lrh_all.resize(seq_count+1);
+        }
+
+        vector< Interval* > iv_ptrs( iv_list.size() );
+        for( size_t ivI = 0; ivI < iv_list.size(); ++ivI )
+            iv_ptrs[ivI] = &iv_list[ivI];
+
+        vector< gnSequence* > seq_table = iv_list.seq_table;
+
+        vector< uint > proj_seqs(seq_count);
+        for( size_t sI = 0; sI < seq_count; ++sI )
+            proj_seqs[sI] = sI;
+
+        std::vector< std::vector< mems::MatchProjectionAdapter* > > LCB_list;
+        std::vector< mems::LCB > projected_adjs;
+        for( size_t mult = seq_count; mult > 1; mult-- )
+        {
+            vector< score_t > lrh;
+            getLocalRecordHeights( iv_ptrs, seq_table, lrh );
+            lrh_all[mult].insert( lrh_all[mult].end(), lrh.begin(), lrh.end() );
+            // randomly pick a sequence to discard
+            int disc = rand() % proj_seqs.size();
+            proj_seqs.erase(proj_seqs.begin()+disc);
+            seq_table.erase(seq_table.begin()+disc);
+            // project the original alignment down to the remaining sequences
+            projectIntervalList( iv_list, proj_seqs, LCB_list, projected_adjs );
+            // free storage used by the previous set of projections
+            if( mult != seq_count )
+            {
+                for( size_t ivI = 0; ivI < iv_ptrs.size(); ivI++ )
+                    iv_ptrs[ivI]->Free();
+            }
+            // update iv_ptrs to contain the new projections
+            iv_ptrs.resize(LCB_list.size());
+            for( size_t lcbI = 0; lcbI < LCB_list.size(); lcbI++ )
+            {
+                Interval iv;
+                iv_ptrs[lcbI] = iv.Copy();
+                iv_ptrs[lcbI]->SetMatches(LCB_list[lcbI]);
+            }
+        }
+    }
+
+    // Nothing was loaded: seq_count-1 below is a size_t expression and would wrap
+    // around, requesting an absurdly large table.
+    if( seq_count == 0 )
+    {
+        cerr << "Error: no sequences were loaded from " << simu_count << " readable alignment(s)\n";
+        return -1;
+    }
+    // Table rows: the 95, 99, 99.9 and 99.99 percentiles. Table columns: multiplicity 2 .. seq_count.
+    static const double quantiles[4] = { .95, .99, .999, .9999 };
+    boost::multi_array<score_t, 2> evd_table;
+    evd_table.resize( boost::extents[4][seq_count-1] );
+    boost::multi_array<size_t, 2> ss_table;
+    ss_table.resize( boost::extents[4][seq_count-1] );
+    for( size_t mult = 2; mult < seq_count + 1; mult++ )
+    {
+        vector< score_t >& heights = lrh_all[mult];
+        std::sort( heights.begin(), heights.end() );
+        for( size_t qI = 0; qI < 4; ++qI )
+        {
+            if( heights.empty() )
+            {
+                // no excursions at this multiplicity: size()-1 would wrap and heights[0] does not exist
+                evd_table[qI][mult-2] = 0;
+                ss_table[qI][mult-2] = 0;
+                continue;
+            }
+            // position of the quantile in the sorted list, clamped to the last element
+            size_t index = heights.size() * quantiles[qI];
+            index = (std::min)(index, heights.size()-1);
+            evd_table[qI][mult-2] = heights[index];  // score threshold at this quantile
+            ss_table[qI][mult-2] = heights.size() - index;  // number of excursions from that position upward
+        }
+    }
+    cout << "Matrix of score thresholds:\n";
+    print2d_matrix( evd_table, cout );
+    cout << "\n\nMatrix of sample sizes:\n";
+    print2d_matrix( ss_table, cout );
+    cout << endl;
+    return 0;
+}
+
+
diff --git a/src/multiToRawSequence.cpp b/src/multiToRawSequence.cpp
new file mode 100644
index 0000000..d910c6f
--- /dev/null
+++ b/src/multiToRawSequence.cpp
@@ -0,0 +1,28 @@
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libGenome/gnSequence.h"
+#include "libGenome/gnRAWSource.h"
+
+int main( int argc, char* argv[] ){  // writes each contig of argv[1] to "<contig name>.raw"; NOTE(review): argv[2] is required but never read
+    if( argc != 3 ){
+        cout << argv[0] << " <input sequence> <output file>\n";
+        return -1;  // stop here: carrying on would pass a missing argv[1] to LoadSource()
+    }
+    gnSequence seq;
+    try{
+        seq.LoadSource( argv[1] );
+        cout << argv[1] << " has " << seq.contigListLength() << " contigs\n";
+        for( int contigI = 0; contigI < seq.contigListLength(); contigI++ ){
+            gnSequence contig = seq.contig( contigI );
+            string contig_name = seq.contigName( contigI );
+            cout << "contig " << contig_name << " has " << contig.length() << "b.p.\n";
+            // the output name comes from the contig name, not from the command line
+            gnRAWSource::Write( contig, contig_name+".raw" );
+        }
+    }catch( gnException& gne ){
+        cerr << gne << endl;
+        return -1;
+    }
+    return 0;
+}
diff --git a/src/pairCompare.cpp b/src/pairCompare.cpp
new file mode 100644
index 0000000..e770bf1
--- /dev/null
+++ b/src/pairCompare.cpp
@@ -0,0 +1,85 @@
+#include "libMems/IntervalList.h"
+#include "libMems/Islands.h"
+#include "libMems/DistanceMatrix.h"
+#include <sstream>
+#include <fstream>
+using namespace std;
+using namespace genome;
+using namespace mems;
+
+int main( int argc, char* argv[] ) // prints one tab-separated statistics line per pairwise alignment all_pairs/pair_<I>.<J>.xmfa
+{
+ if( argc != 2 )
+ {
+ cerr << "Usage: pairCompare <sequence count>\n";
+ return -1;
+ }
+ int seq_count = atoi( argv[1] );
+ cout << "SeqI\tSeqJ\tNTidentity\tAvgBBpct\tLCB count\n";
+ for( size_t seqI = 10; seqI < seq_count; seqI++ ) // NOTE(review): starts at 10, not 0 -- looks like a leftover from resuming a run; confirm
+ {
+ for( size_t seqJ = 0; seqJ < seq_count; seqJ++ )
+ {
+ if( seqJ <= seqI ) // only pairs with seqJ > seqI are processed
+ continue;
+ cout << seqI << '\t' << seqJ << '\t';
+
+ size_t lcb_count = 0;
+
+ stringstream aln_in_fname;
+ aln_in_fname << "all_pairs/pair_" << seqI << "." << seqJ << ".xmfa";
+ ifstream alignment_in(aln_in_fname.str().c_str()); // NOTE(review): the stream is not checked before it is read
+ IntervalList aligned_ivs;
+ aligned_ivs.ReadStandardAlignment( alignment_in );
+
+
+ LoadSequences(aligned_ivs, NULL);
+
+ // add the sequence data to the interval list
+ uint seq_count = aligned_ivs.seq_table.size(); // shadows the outer seq_count: from here on it is the sequence count of this pair file
+ vector< GappedAlignment > backbone_data;
+ simpleFindBackbone( aligned_ivs, 50, 50, backbone_data ); // NOTE(review): meaning of the two 50s is not visible here
+
+ IntervalList backbone_ivs;
+ backbone_ivs.seq_table = aligned_ivs.seq_table;
+
+ // count up the total length of backbone in each genome
+ vector< gnSeqI > total_bb( seq_count, 0 );
+ NumericMatrix< double > overall_identity;
+ for( uint bbI = 0; bbI < backbone_data.size(); bbI++ ){
+ vector<AbstractMatch*> tmp_iv(1, &backbone_data[ bbI ]); // wrap each backbone segment in an Interval of its own
+ backbone_ivs.push_back( Interval( tmp_iv.begin(), tmp_iv.end() ) );
+ for( uint seqI = 0; seqI < seq_count; seqI++ ){ // this seqI shadows the outer loop variable
+ total_bb[ seqI ] += backbone_data[ bbI ].Length( seqI );
+ }
+ }
+
+ vector< AbstractMatch* > bbivs;
+ for( uint bbI = 0; bbI < backbone_ivs.size(); bbI++ )
+ bbivs.push_back( &backbone_ivs[bbI] );
+ BackboneIdentityMatrix( bbivs, aligned_ivs.seq_table, overall_identity );
+
+ gnSeqI avg_bb = 0;
+ double seq_len_average = 0;
+ for( uint seqI = 0; seqI < aligned_ivs.seq_table.size(); seqI++ ){ // again a local seqI, not the pair index
+ avg_bb += total_bb[ seqI ];
+ seq_len_average += aligned_ivs.seq_table[seqI]->length();
+ }
+ avg_bb /= aligned_ivs.seq_table.size(); // division in gnSeqI, before the ratio below is formed
+ seq_len_average /= (double)seq_count;
+
+// count the intervals whose Multiplicity() is greater than one
+ for( size_t lcbI = 0; lcbI < aligned_ivs.size(); lcbI++ )
+ if( aligned_ivs[lcbI].Multiplicity() > 1 )
+ lcb_count++;
+
+// finish the output line: identity of sequences 0 and 1, backbone fraction, LCB count
+ cout << overall_identity(0,1) << '\t';
+ cout << avg_bb / seq_len_average << '\t';
+ cout << lcb_count << endl;
+
+
+ }
+ }
+
+}
diff --git a/src/progressiveMauve.cpp b/src/progressiveMauve.cpp
new file mode 100644
index 0000000..abf23ba
--- /dev/null
+++ b/src/progressiveMauve.cpp
@@ -0,0 +1,768 @@
+/*******************************************************************************
+ * $Id: memsApp.cpp,v 1.49 2004/04/23 00:18:45 darling Exp $
+ * This file is copyright 2002-2004 Aaron Darling. All rights reserved.
+ * Please see the file called COPYING for licensing, copying, and modification
+ * rights. Redistribution of this file, in whole or in part is prohibited
+ * without express permission.
+ ******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "mauveAligner.h"
+#include "getopt.h"
+#include <sstream>
+#include <stdexcept>
+#include "libMems/Matrix.h"
+#include "libMems/NumericMatrix.h"
+#include "libGenome/gnSequence.h"
+#include "libMems/DNAFileSML.h"
+#include "libMems/MemHash.h"
+#include "libMems/MatchList.h"
+#include "libMems/Interval.h"
+#include "libMems/IntervalList.h"
+#include "libMems/gnAlignedSequences.h"
+#include "libMems/Islands.h"
+#include "libMems/MuscleInterface.h"
+#include "libMems/Backbone.h"
+//#include "libMems/twister.h"
+
+#include "libMems/ProgressiveAligner.h"
+#include "libMems/PairwiseMatchFinder.h"
+#include "libMems/HomologyHMM/parameters.h"
+#include "UniqueMatchFinder.h"
+
+#include <boost/filesystem.hpp>
+
+#include "libMems/Memory.h"
+
+using namespace std;
+using namespace genome;
+using namespace mems;
+
+// Scope guard: calls Clear() on the referenced MatchList when the guard object is destroyed.
+class MLDeleter {
+    MatchList& mlist;  // the list to clear; held by reference
+public:
+    MLDeleter( MatchList& ml ) : mlist( ml ) {}
+    ~MLDeleter(){ mlist.Clear(); }
+};
+
+class OptionList;
+
+class MauveOption : public option // a getopt `option` record extended with parse results and help text
+{
+public:
+ MauveOption( OptionList& ol, const char* name, int has_arg, const std::string& usage_info); // fills the base fields and appends this option to ol
+// parse state and documentation for this option
+ boolean set; // false after construction; doAlignment() sets it when the option appears on the command line
+ std::string arg_value; // receives optarg when the option was given with an argument
+ std::string usage_info; // help text shown by print_usage()
+};
+
+
+// Registry of command-line options (each MauveOption constructor appends itself). NOTE(review): opt_list is a raw owning pointer, so copying an OptionList would delete it twice.
+class OptionList : public vector< MauveOption* >
+{
+public:
+    OptionList() : config_opt(0), opt_list(NULL) {}
+    ~OptionList(){ delete[] opt_list; }  // delete[] on NULL is a no-op
+    /**
+     * Returns the registered options as an array of `option` ending in an all-zero entry, as passed to getopt_long().
+     * The array is built on the first call and cached; options registered afterwards are not added to it.
+     */
+    struct option* getOptions()
+    {
+        if( opt_list == NULL )
+        {
+            const size_t opt_count = this->size();
+            opt_list = new option[ opt_count + 1 ];
+            for( size_t optI = 0; optI < opt_count; optI++ )
+                opt_list[optI] = *(*this)[optI];  // copies only the `option` base part of each MauveOption
+            const struct option terminator = {0,0,0,0};
+            opt_list[opt_count] = terminator;
+        }
+        return opt_list;
+    }
+    int config_opt;  /**< every option's `flag` points here; read back as the index of the matched option */
+protected:
+    struct option* opt_list;  /**< lazily built array returned by getOptions(); owned by this object */
+};
+
+MauveOption::MauveOption( OptionList& ol, const char* name, int has_arg, const std::string& usage_info )
+    : set( false ), usage_info( usage_info )
+{
+    // `val` is this option's position in `ol`, so it must be taken before the push_back below
+    this->val = ol.size();
+    this->flag = &ol.config_opt;
+    this->has_arg = has_arg;
+    this->name = name;
+    ol.push_back(this);
+}
+
+void print_usage( const char* pname, OptionList& option_list )
+{   // Prints the invocation forms, one line per registered option, and two examples to stderr.
+    cerr << "progressiveMauve usage:\n\n"
+         << "When each genome resides in a separate file:" << endl
+         << pname << " [options] <seq1 filename> ... <seqN filename>" << endl << endl
+         << "When all genomes are in a single file:" << endl
+         << pname << " [options] <seq filename>" << endl << endl
+         << "Options:" << endl;
+    for( OptionList::const_iterator opt_iter = option_list.begin(); opt_iter != option_list.end(); ++opt_iter )
+    {
+        const MauveOption& cur_opt = **opt_iter;
+        const char* const separator = cur_opt.has_arg == no_argument ? " " : "=";  // flags get a space, valued options '='
+        cerr << "\t" << "--" << cur_opt.name << separator << cur_opt.usage_info << endl;
+    }
+    cerr << endl << endl
+         << "Examples:\n"
+         << pname << " --output=my_seqs.xmfa my_genome1.gbk my_genome2.gbk my_genome3.fasta\n"
+         << "\nIf genomes are in a single file and have no rearrangement:\n"
+         << pname << " --collinear --output=my_seqs.xmfa my_genomes.fasta\n";
+}
+
+void printMatchSizes()
+{   // Debug aid: reports sizeof() for one instance of each match/alignment type on stderr.
+    UngappedLocalAlignment< HybridAbstractMatch<> > hybrid_ula;
+    UngappedLocalAlignment< SparseAbstractMatch<> > sparse_ula;
+    CompactGappedAlignment<> compact_aln;
+    MatchHashEntry hash_entry;
+    bitset_t bits;
+    Match match;
+    cerr << "sizeof(UngappedLocalAlignment< HybridAbstractMatch<> >) " << sizeof(hybrid_ula) << endl
+         << "sizeof(UngappedLocalAlignment< SparseAbstractMatch<> >) " << sizeof(sparse_ula) << endl
+         << "sizeof(m) " << sizeof(match) << endl
+         << "sizeof(CompactGappedAlignment<>) " << sizeof(compact_aln) << endl
+         << "sizeof(boost::dynamic_bitset) " << sizeof(bits) << endl
+         << "sizeof(MatchHashEntry) " << sizeof(hash_entry) << endl;
+}
+
+#ifndef WIN32
+#include <signal.h>
+#endif
+
+/**
+ * Termination handler: reports the signal on stderr, calls deleteRegisteredFiles()
+ * and ends the process with the signal number as exit status.
+ */
+void terminateProgram( int sig )
+{
+    std::cerr << "Caught signal " << sig << std::endl << "Cleaning up and exiting!\n";
+    deleteRegisteredFiles();
+    std::cerr << "Temporary files deleted.\n";
+    exit(sig);
+}
+
+#ifdef WIN32
+BOOL WINAPI handler(DWORD dwCtrlType)
+{
+    // Console control handler: each of the five control events listed below is
+    // routed to terminateProgram(), the same routine the signal handlers use.
+    const bool ends_session =
+        dwCtrlType == CTRL_C_EVENT ||
+        dwCtrlType == CTRL_BREAK_EVENT ||
+        dwCtrlType == CTRL_CLOSE_EVENT ||
+        dwCtrlType == CTRL_LOGOFF_EVENT ||
+        dwCtrlType == CTRL_SHUTDOWN_EVENT;
+    if( ends_session )
+        terminateProgram(dwCtrlType);  // calls exit(), so control does not come back here
+    // any other control code is simply reported as handled
+    return true;
+}
+#endif
+
+int main( int argc, char* argv[] )
+{   // installs the platform-specific cleanup hooks, then hands argc/argv to doAlignment()
+#ifdef WIN32
+// Multi-tasking does not work well in CPU-bound
+// console apps running under Win32.
+// Reducing the process priority allows GUI apps
+// to run responsively in parallel. (thanks Bob Edgar!)
+    SetPriorityClass(GetCurrentProcess(), BELOW_NORMAL_PRIORITY_CLASS);
+// also register a handler to clean up during abnormal shutdown
+    SetConsoleCtrlHandler(handler, TRUE);
+#else
+// register a signal handler to catch errors and control-c and clean up...
+    signal( SIGINT, terminateProgram );
+    signal( SIGTERM, terminateProgram );
+    signal( SIGSEGV, terminateProgram );
+#endif
+    // delete temp files at program exit!
+    atexit( &deleteRegisteredFiles );
+    // the value returned by doAlignment() becomes the process exit status
+    return doAlignment(argc, argv);
+}
+
+void getPatternText( int64 seed_pattern, char pattern[65] )
+{
+    // Writes seed_pattern into `pattern` as a NUL-terminated string of '0'/'1' characters,
+    // most significant bit first and without leading zeros (a zero pattern gives "").
+    char bit_text[65];
+    bit_text[64] = '\0';
+    int first_one = 64;  // index of the leftmost '1'; stays 64 when no bit is set
+    for( int bitI = 63; bitI >= 0; --bitI, seed_pattern >>= 1 )
+    {
+        const bool bit_is_set = ( seed_pattern & 0x1 ) != 0;
+        bit_text[bitI] = bit_is_set ? '1' : '0';
+        if( bit_is_set )
+            first_one = bitI;
+    }
+    // copy from the leftmost '1' up to and including the terminator
+    memcpy( pattern, bit_text + first_one, 65 - first_one );
+}
+
+void getDefaultSmlFileNames( const vector< string >& seq_files, vector< string >& sml_files, int seed_weight, int seed_rank )
+{
+    // every default name is "<sequence file>.<seed pattern as 0/1 text>.sslist"
+    char pattern_text[65];
+    getPatternText( getSeed(seed_weight, seed_rank), pattern_text );
+    const string suffix = string(".") + pattern_text + ".sslist";
+    sml_files.resize( seq_files.size() );
+    for( size_t fileI = 0; fileI != seq_files.size(); ++fileI )
+        sml_files[fileI] = seq_files[fileI] + suffix;
+}
+
+void applyBackbone( IntervalList& iv_list, string& bbcols_fname, string& bb_fname, size_t island_gap_size, double hmm_identity, double pgh, double pgu ) // runs backbone detection on iv_list and writes the files bb_fname and bbcols_fname
+{
+ ofstream bb_out( bb_fname.c_str() ); // NOTE(review): none of the streams opened in this function is checked
+ backbone_list_t bb_list;
+ // adapt to the GC of the sequences
+ double gc_content = computeGC( iv_list.seq_table );
+ std::cout << "Organisms have " << std::setprecision(3) << gc_content*100 << "% GC\n"; // the precision setting stays in effect on cout afterwards
+// HMM parameters: start from the GC-adapted set, then override both transition values and the identity
+ Params hmm_params = getAdaptedHoxdMatrixParameters( gc_content );
+ hmm_params.iGoHomologous = pgh;
+ hmm_params.iGoUnrelated = pgu;
+ adaptToPercentIdentity( hmm_params, hmm_identity );
+// first pass (HMM parameters); its backbone list is discarded right after the call
+ detectAndApplyBackbone(iv_list, bb_list, hmm_params);
+ bb_list.clear();
+// second pass fills bb_list again, this time with a BigGapsDetector built from island_gap_size
+ BigGapsDetector bgd( island_gap_size );
+ detectBackbone( iv_list, bb_list, &bgd );
+// bb_fname is written, closed, read back, post-processed and then rewritten below
+ writeBackboneSeqCoordinates( bb_list, iv_list, bb_out );
+ std::vector< bb_seqentry_t > bb_seq_list;
+ bb_out.close(); // close before the same file is opened for reading
+ std::ifstream bbseq_input( bb_fname.c_str() );
+ readBackboneSeqFile( bbseq_input, bb_seq_list );
+
+ mergeAdjacentSegments( bb_seq_list );
+ addUniqueSegments( bb_seq_list );
+ bbseq_input.close();
+ bb_out.open(bb_fname.c_str()); // reopening for output replaces the first version of the file
+ writeBackboneSeqFile( bb_out, bb_seq_list );
+// the backbone columns go to a second file, whose name is recorded on the interval list
+ ofstream bbcols_out( bbcols_fname.c_str() );
+ writeBackboneColumns( bbcols_out, bb_list );
+ iv_list.backbone_filename = bbcols_fname;
+}
+
+/**
+ * progressive alignment. wheee.
+ */
+int doAlignment( int argc, char* argv[] ){
+//try{
+ OptionList mauve_options;
+ MauveOption opt_island_gap_size( mauve_options, "island-gap-size", required_argument, "<number> Alignment gaps above this size in nucleotides are considered to be islands [20]" );
+ MauveOption opt_profile( mauve_options, "profile", required_argument, "<file> (Not yet implemented) Read an existing sequence alignment in XMFA format and align it to other sequences or alignments" );
+ MauveOption opt_apply_backbone( mauve_options, "apply-backbone", required_argument, "<file> Read an existing sequence alignment in XMFA format and apply backbone statistics to it" );
+ MauveOption opt_disable_backbone( mauve_options, "disable-backbone", no_argument, "Disable backbone detection" );
+ MauveOption opt_mums( mauve_options, "mums", no_argument, "Find MUMs only, do not attempt to determine locally collinear blocks (LCBs)" );
+ MauveOption opt_seed_weight( mauve_options, "seed-weight", required_argument, "<number> Use the specified seed weight for calculating initial anchors" );
+ MauveOption opt_output( mauve_options, "output", required_argument, "<file> Output file name. Prints to screen by default" );
+ MauveOption opt_backbone_output( mauve_options, "backbone-output", required_argument, "<file> Backbone output file name (optional)." );
+ MauveOption opt_match_input( mauve_options, "match-input", required_argument, "<file> Use specified match file instead of searching for matches" );
+ MauveOption opt_input_id_matrix( mauve_options, "input-id-matrix", required_argument, "<file> An identity matrix describing similarity among all pairs of input sequences/alignments" );
+ MauveOption opt_max_gapped_aligner_length( mauve_options, "max-gapped-aligner-length", required_argument, "<number> Maximum number of base pairs to attempt aligning with the gapped aligner" );
+ MauveOption opt_input_guide_tree( mauve_options, "input-guide-tree", required_argument, "<file> A phylogenetic guide tree in NEWICK format that describes the order in which sequences will be aligned" );
+ MauveOption opt_output_guide_tree( mauve_options, "output-guide-tree", required_argument, "<file> Write out the guide tree used for alignment to a file" );
+ MauveOption opt_version( mauve_options, "version", no_argument, "Display software version information" );
+ MauveOption opt_debug( mauve_options, "debug", no_argument, "Run in debug mode (perform internal consistency checks--very slow)" );
+ MauveOption opt_scratch_path_1( mauve_options, "scratch-path-1", required_argument, "<path> Designate a path that can be used for temporary data storage. Two or more paths should be specified." );
+ MauveOption opt_scratch_path_2( mauve_options, "scratch-path-2", required_argument, "<path> Designate a path that can be used for temporary data storage. Two or more paths should be specified." );
+ MauveOption opt_collinear( mauve_options, "collinear", no_argument, "Assume that input sequences are collinear--they have no rearrangements" );
+ MauveOption opt_scoring_scheme( mauve_options, "scoring-scheme", required_argument, "<ancestral|sp_ancestral|sp> Selects the anchoring score function. Default is extant sum-of-pairs (sp)." );
+ MauveOption opt_no_weight_scaling( mauve_options, "no-weight-scaling", no_argument, "Don't scale LCB weights by conservation distance and breakpoint distance" );
+ MauveOption opt_max_breakpoint_distance_scale( mauve_options, "max-breakpoint-distance-scale", required_argument, "<number [0,1]> Set the maximum weight scaling by breakpoint distance. Defaults to 0.5" );
+ MauveOption opt_conservation_distance_scale( mauve_options, "conservation-distance-scale", required_argument, "<number [0,1]> Scale conservation distances by this amount. Defaults to 0.5" );
+ MauveOption opt_muscle_args( mauve_options, "muscle-args", required_argument, "<arguments in quotes> Additional command-line options for MUSCLE. Any quotes should be escaped with a backslash" );
+ MauveOption opt_skip_refinement( mauve_options, "skip-refinement", no_argument, "Do not perform iterative refinement" );
+ MauveOption opt_skip_gapped_alignment( mauve_options, "skip-gapped-alignment", no_argument, "Do not perform gapped alignment" );
+ MauveOption opt_bp_dist_estimate_min_score( mauve_options, "bp-dist-estimate-min-score", required_argument, "<number> Minimum LCB score for estimating pairwise breakpoint distance" );
+ MauveOption opt_mem_clean( mauve_options, "mem-clean", no_argument, "Set this to true when debugging memory allocations" );
+ MauveOption opt_gap_open( mauve_options, "gap-open", required_argument, "<number> Gap open penalty" );
+ MauveOption opt_penalize_repeats( mauve_options, "repeat-penalty", required_argument, "<negative|zero> Sets whether the repeat scores go negative or go to zero for highly repetitive sequences. Default is negative." );
+ MauveOption opt_gap_extend( mauve_options, "gap-extend", required_argument, "<number> Gap extend penalty" );
+ MauveOption opt_substitution_matrix( mauve_options, "substitution-matrix", required_argument, "<file> Nucleotide substitution matrix in NCBI format" );
+ MauveOption opt_weight( mauve_options, "weight", required_argument, "<number> Minimum pairwise LCB score" );
+ MauveOption opt_min_scaled_penalty( mauve_options, "min-scaled-penalty", required_argument, "<number> Minimum breakpoint penalty after scaling the penalty by expected divergence" );
+ MauveOption opt_go_homologous( mauve_options, "hmm-p-go-homologous", required_argument, "<number> Probability of transitioning from the unrelated to the homologous state [0.00001]" );
+ MauveOption opt_go_unrelated( mauve_options, "hmm-p-go-unrelated", required_argument, "<number> Probability of transitioning from the homologous to the unrelated state [0.000000001]" );
+ MauveOption opt_hmm_identity( mauve_options, "hmm-identity", required_argument, "<number> Expected level of sequence identity among pairs of sequences, ranging between 0 and 1 [0.7]" );
+ MauveOption opt_seed_family( mauve_options, "seed-family", no_argument, "Use a family of spaced seeds to improve sensitivity" );
+ MauveOption opt_solid_seeds( mauve_options, "solid-seeds", no_argument, "Use solid seeds. Do not permit substitutions in anchor matches." );
+ MauveOption opt_coding_seeds( mauve_options, "coding-seeds", no_argument, "Use coding pattern seeds. Useful to generate matches coding regions with 3rd codon position degeneracy." );
+ MauveOption opt_disable_cache( mauve_options, "disable-cache", no_argument, "Disable recursive anchor search cacheing to workaround a crash bug" );
+ MauveOption opt_recursive( mauve_options, "no-recursion", no_argument, "Disable recursive anchor search" );
+
+ if( argc <= 0 ){
+ print_usage( "mauveAligner", mauve_options );
+ return -1;
+ }
+ if( argc == 1 ){
+ print_usage( argv[0], mauve_options );
+ return -1;
+ }
+
+ // default values for homology HMM transitions
+ double pgh = 0.00001;
+ double pgu = 0.000000001;
+ double hmm_identity = 0.7; // percent identity modeled by the HMM homologous state
+ size_t island_gap_size = 20;
+
+ // set the Muscle path
+ MuscleInterface& mi = MuscleInterface::getMuscleInterface();
+ mi.ParseMusclePath( argv[0] );
+
+ // parse the options
+ //
+ // parse command line with gnu getopt
+ //
+ int opt;
+ int ac = argc;
+ char** av = argv;
+ int indexptr;
+ while( (opt = getopt_long( ac, av, "", mauve_options.getOptions(), &indexptr )) != EOF ){
+ if( opt == 0 )
+ {
+ mauve_options[mauve_options.config_opt]->set = true;
+ if( optarg != NULL )
+ mauve_options[mauve_options.config_opt]->arg_value = optarg;
+ }else{
+ print_usage( argv[0], mauve_options );
+ return -1;
+ }
+ }
+
+ if( opt_scratch_path_1.set )
+ FileSML::registerTempPath( opt_scratch_path_1.arg_value.c_str() );
+ if( opt_scratch_path_2.set )
+ FileSML::registerTempPath( opt_scratch_path_2.arg_value.c_str() );
+
+ // set the random number generator to a fixed seed for repeatability
+ // this should be changed if the algorithm ever depends on true pseudo-randomness
+ SetTwisterSeed(37);
+
+ if( opt_go_homologous.set )
+ pgh = strtod( opt_go_homologous.arg_value.c_str(), NULL );
+ if( opt_go_unrelated.set )
+ pgu = strtod( opt_go_unrelated.arg_value.c_str(), NULL );
+ if( opt_hmm_identity.set )
+ hmm_identity = strtod( opt_hmm_identity.arg_value.c_str(), NULL );
+ if( opt_island_gap_size.set )
+ island_gap_size = atoi( opt_island_gap_size.arg_value.c_str() );
+
+ // for debugging only:
+ if( opt_apply_backbone.set )
+ {
+ IntervalList iv_list;
+ ifstream in_file( opt_apply_backbone.arg_value.c_str() );
+ ofstream out_file( opt_output.arg_value.c_str() );
+ iv_list.ReadStandardAlignment(in_file);
+ MatchList ml;
+ ml.seq_filename = iv_list.seq_filename;
+ if( ml.seq_filename[0] != ml.seq_filename[1] )
+ LoadSequences(ml, &cout);
+ else
+ LoadMFASequences(ml, ml.seq_filename[0], &cout);
+ iv_list.seq_table = ml.seq_table;
+ string bb_fname = opt_output.arg_value + ".backbone";
+ string bbcols_fname = opt_output.arg_value + ".bbcols";
+ applyBackbone( iv_list, bbcols_fname, bb_fname, island_gap_size, hmm_identity, pgh, pgu );
+ iv_list.WriteStandardAlignment(out_file);
+ return 0;
+ }
+
+ //
+ // definitions of the variables that can be set by the user on the command line:
+ //
+ vector<string> seq_files;
+ vector<string> sml_files;
+ vector<gnSequence*> seq_table;
+ vector<DNAFileSML*> sml_table;
+ uint mer_size = 0; // Use default settings
+ boolean create_LCBs = true;
+ string output_file = "";
+ string tree_filename = "";
+
+ boolean lcb_match_input_format = false;
+
+ uint seqI;
+
+ ostream* detail_list_out = NULL; /**< output stream for detail list */
+
+ // now read in the seq file names from av
+ boolean seq_name_arg = true;
+ for( int optI = optind; optI < argc; optI++ )
+ seq_files.push_back( av[ optI ] );
+
+ // set sml_names
+ for( size_t seq_fileI = 0; seq_fileI < seq_files.size(); seq_fileI++ )
+ sml_files.push_back( seq_files[seq_fileI] + ".sslist" );
+
+ // print the version if the user requested it
+ if( opt_version.set ){
+ cerr << "progressiveMauve " << " build date " << __DATE__ << " at " << __TIME__ << endl;
+ }
+
+ if( seq_files.size() == 0 )
+ {
+ if( !opt_version.set )
+ print_usage( argv[0], mauve_options );
+ return 0;
+ }
+
+ //
+ // done parsing and checking command line options
+ // Start doing the work
+ //
+
+ MatchList pairwise_match_list;
+ if( opt_seed_weight.set )
+ {
+ mer_size = atoi( opt_seed_weight.arg_value.c_str() );
+ }
+
+ if( seq_files.size() == 1 ){
+ LoadMFASequences( pairwise_match_list, seq_files[0], &cout );
+ pairwise_match_list.CreateMemorySMLs( mer_size, &cout );
+ }else{
+ pairwise_match_list.seq_filename = seq_files;
+ pairwise_match_list.sml_filename = sml_files;
+ // testing: rewrite seq files in RAW format
+ LoadAndCreateRawSequences( pairwise_match_list, &cout );
+// LoadSequences( pairwise_match_list, &cout );
+ if(opt_solid_seeds.set)
+ pairwise_match_list.LoadSMLs( mer_size, &cout, SOLID_SEED, true );
+ else if(opt_coding_seeds.set)
+ pairwise_match_list.LoadSMLs( mer_size, &cout, CODING_SEED );
+ else
+ pairwise_match_list.LoadSMLs( mer_size, &cout, CODING_SEED );
+ }
+
+ ostream* match_out;
+ if( opt_output.set ){
+ ofstream* match_out_file = new ofstream( opt_output.arg_value.c_str() );
+ if( !match_out_file->is_open() ){
+ cerr << "Unable to open output file \"" << opt_output.arg_value << "\" for writing.\nCheck that you have permission to write files in this location and that the disk has free space.\n";
+ return -2;
+ }
+ match_out = match_out_file;
+ }else
+ match_out = &cout;
+
+ if(opt_mem_clean.set)
+ debugging_memory = true;
+
+ // read matches if the user requested it
+ if( opt_match_input.set ){
+ ifstream match_in( opt_match_input.arg_value.c_str() );
+ if( !match_in.is_open() ){
+ cerr << "Error opening " << opt_match_input.arg_value << endl;
+ return -2;
+ }
+ try{
+ ReadList( pairwise_match_list, match_in );
+ }catch( gnException& gne ){
+ cerr << gne << endl;
+ cerr << "Error reading " << opt_match_input.arg_value << "\nPossibly corrupt file or invalid file format\n";
+ return -2;
+ }
+
+ if( seq_files.size() > 1 )
+ pairwise_match_list.seq_filename = seq_files;
+ else if( pairwise_match_list.seq_table.size() == 0 )
+ // fill seq_table with empty sequences
+ for( seqI = 0; seqI < pairwise_match_list.seq_filename.size(); seqI++ )
+ pairwise_match_list.seq_table.push_back( new gnSequence() );
+ }else if( !opt_seed_family.set ){
+ if( pairwise_match_list.seq_table.size() > 4 )
+ {
+ UniqueMatchFinder umf;
+ umf.LogProgress( &cout );
+ umf.FindMatches( pairwise_match_list );
+ umf.Clear();
+ }else{
+ PairwiseMatchFinder pmf;
+ pmf.LogProgress( &cout );
+ pmf.FindMatches( pairwise_match_list );
+ pmf.Clear();
+ }
+ cout << "done.\n";
+ }else{
+ // use an entire seed family to do the search
+ if( mer_size == 0 )
+ {
+ size_t avg = 0;
+ for( int seqI = 0; seqI < pairwise_match_list.seq_table.size(); seqI++ )
+ avg += pairwise_match_list.seq_table[seqI]->length();
+ avg /= pairwise_match_list.seq_table.size();
+ mer_size = getDefaultSeedWeight( avg );
+ }
+ // search with the longest seeds first so that overlapping matches tend to get contained
+ vector< pair< int, int > > length_ranks(3);
+ length_ranks[0] = make_pair( getSeedLength( getSeed(mer_size, 0) ), 0 );
+ length_ranks[1] = make_pair( getSeedLength( getSeed(mer_size, 1) ), 1 );
+ length_ranks[2] = make_pair( getSeedLength( getSeed(mer_size, 2) ), 2 );
+ std::sort( length_ranks.begin(), length_ranks.end() );
+
+ UniqueMatchFinder umf;
+ for( int seedI = 2; seedI >= 0; seedI-- )
+ {
+ umf.LogProgress( &cout );
+ int64 seed_pattern = getSeed(mer_size, length_ranks[seedI].second );
+ char pattern[65];
+ getPatternText( seed_pattern, pattern );
+ cout << "\nSearching with seed pattern " << pattern << "\n";
+ MatchList cur_list;
+ cur_list.seq_filename = pairwise_match_list.seq_filename;
+ cur_list.seq_table = pairwise_match_list.seq_table;
+ if( seq_files.size() == 1 )
+ cur_list.CreateMemorySMLs( mer_size, &cout, length_ranks[seedI].second );
+ else
+ {
+ getDefaultSmlFileNames( cur_list.seq_filename, cur_list.sml_filename, mer_size, length_ranks[seedI].second );
+ cur_list.LoadSMLs(mer_size, &cout, length_ranks[seedI].second);
+ }
+ umf.FindMatches( cur_list );
+ umf.ClearSequences();
+ for( size_t smlI = 0; smlI < cur_list.sml_table.size(); smlI++ )
+ delete cur_list.sml_table[smlI]; // free memory
+ for( size_t curI = 0; curI < cur_list.size(); curI++ )
+ cur_list[curI]->Free(); // free more memory!
+ }
+ umf.GetMatchList(pairwise_match_list);
+ cout << "done\n";
+ umf.Clear();
+ }
+
+ if( opt_mums.set )
+ {
+ WriteList(pairwise_match_list, *match_out);
+ for( size_t seqI = 0; seqI < pairwise_match_list.seq_table.size(); seqI++ )
+ delete pairwise_match_list.seq_table[seqI]; // an auto_ptr or shared_ptr could be great for this
+ for( size_t seqI = 0; seqI < pairwise_match_list.sml_table.size(); seqI++ )
+ delete pairwise_match_list.sml_table[seqI];
+ return 0;
+ }
+
+ // check whether the input sequences were masked to eliminate excess NNNNNs
+ for( seqI = 0; seqI < pairwise_match_list.sml_table.size(); seqI++ ){
+ FileSML* cur_sml = dynamic_cast< FileSML* >(pairwise_match_list.sml_table[ seqI ]);
+ if( cur_sml != NULL ){
+ const vector< int64 >& seq_coords = cur_sml->getUsedCoordinates();
+ if( seq_coords.size() > 0 ){
+ transposeMatches( pairwise_match_list, seqI, seq_coords );
+ }
+ }
+ }
+
+ // free any match search memory
+ SlotAllocator<MatchHashEntry>& allocator = SlotAllocator<MatchHashEntry>::GetSlotAllocator();
+ allocator.Purge();
+
+ ProgressiveAligner aligner( pairwise_match_list.seq_table.size() );
+ if( opt_skip_gapped_alignment.set )
+ aligner.setGappedAlignment(false);
+ if( opt_skip_refinement.set )
+ aligner.setRefinement(false);
+ if( opt_debug.set )
+ debug_aligner = true;
+
+ // check that LCB_size can be set appropriately
+ if( opt_weight.set )
+ {
+ double lcb_weight = strtod( opt_weight.arg_value.c_str(), NULL );
+ if( lcb_weight < 0 )
+ {
+ cerr << "A minimum LCB size greater than 0 must be specified in order to create LCBs.\n";
+ return -1;
+ }else
+ aligner.setBreakpointPenalty( lcb_weight );
+ }
+
+ if( opt_collinear.set )
+ aligner.setCollinear(true);
+
+ if( opt_max_gapped_aligner_length.set )
+ {
+ int64 mgal = atol( opt_max_gapped_aligner_length.arg_value.c_str() );
+ aligner.SetMaxGappedAlignmentLength( mgal );
+ }
+
+ if( opt_seed_family.set )
+ aligner.setUseSeedFamilies(true);
+
+ penalize_repeats = true;
+ if(opt_penalize_repeats.set && opt_penalize_repeats.arg_value == "zero")
+ penalize_repeats = false;
+
+ if( opt_scoring_scheme.set )
+ {
+ if( opt_scoring_scheme.arg_value == "ancestral" )
+ aligner.setLcbScoringScheme(ProgressiveAligner::AncestralScoring);
+ else if( opt_scoring_scheme.arg_value == "ancestral_sp" )
+ aligner.setLcbScoringScheme(ProgressiveAligner::AncestralSumOfPairsScoring);
+ else if( opt_scoring_scheme.arg_value == "sp" )
+ aligner.setLcbScoringScheme(ProgressiveAligner::ExtantSumOfPairsScoring);
+ else
+ {
+ cerr << "Unrecognized scoring scheme: " << opt_scoring_scheme.arg_value << endl;
+ return -2;
+ }
+ }else // default to extant sp
+ aligner.setLcbScoringScheme(ProgressiveAligner::ExtantSumOfPairsScoring);
+ if( opt_no_weight_scaling.set )
+ aligner.setUseLcbWeightScaling(false);
+ if( opt_max_breakpoint_distance_scale.set )
+ {
+ double d = strtod( opt_max_breakpoint_distance_scale.arg_value.c_str(), NULL );
+ aligner.setBreakpointDistanceScale(d);
+ }
+ if( opt_conservation_distance_scale.set )
+ {
+ double d = strtod( opt_conservation_distance_scale.arg_value.c_str(), NULL );
+ aligner.setConservationDistanceScale(d);
+ }
+ if( opt_bp_dist_estimate_min_score.set )
+ {
+ double d = strtod( opt_bp_dist_estimate_min_score.arg_value.c_str(), NULL );
+ aligner.setBpDistEstimateMinScore(d);
+ }
+ if( opt_disable_cache.set )
+ {
+ aligner.SetUseCacheDb(false);
+ }
+
+ if( opt_min_scaled_penalty.set )
+ {
+ aligner.setMinimumBreakpointPenalty(strtod( opt_min_scaled_penalty.arg_value.c_str(), NULL ) );
+ }
+ if( pairwise_match_list.seq_table.size() != 0 )
+ {
+ aligner.setPairwiseMatches( pairwise_match_list );
+ }
+ if( opt_muscle_args.set )
+ {
+ MuscleInterface& mi = MuscleInterface::getMuscleInterface();
+ mi.SetExtraMuscleArguments(opt_muscle_args.arg_value);
+ }
+ if( opt_recursive.set )
+ aligner.SetRecursive(false);
+ else
+ aligner.SetRecursive(true);
+
+ PairwiseScoringScheme pss;
+ if( opt_gap_open.set )
+ {
+ pss.gap_open = atoi(opt_gap_open.arg_value.c_str());
+ }
+ if( opt_gap_extend.set )
+ {
+ pss.gap_extend = atoi(opt_gap_open.arg_value.c_str());
+ }
+ if( opt_substitution_matrix.set )
+ {
+ ifstream sub_in( opt_substitution_matrix.arg_value.c_str() );
+ if( !sub_in.is_open() )
+ {
+ cerr << "Error opening substitution matrix file: \"" << opt_substitution_matrix.arg_value << "\"\n";
+ return -1;
+ }
+ score_t matrix[4][4];
+ readSubstitutionMatrix( sub_in, matrix );
+ pss = PairwiseScoringScheme(matrix, pss.gap_open, pss.gap_extend);
+ }
+ aligner.setPairwiseScoringScheme(pss);
+
+ if( opt_input_guide_tree.set )
+ aligner.setInputGuideTreeFileName( opt_input_guide_tree.arg_value );
+ if( opt_output_guide_tree.set )
+ aligner.setOutputGuideTreeFileName( opt_output_guide_tree.arg_value );
+
+ // if we will be doing a profile-profile or profile-sequence alignment
+ // then read in the profile
+ IntervalList profile_1;
+ IntervalList profile_2;
+ if( opt_profile.set ){
+ cerr << "Profile-profile alignment not yet implemented\n";
+ return -3;
+ }
+
+ IntervalList interval_list;
+ interval_list.seq_table = pairwise_match_list.seq_table;
+ interval_list.seq_filename = pairwise_match_list.seq_filename;
+
+ if( opt_profile.set )
+ ; //aligner.alignPP(profile_1, profile_2, interval_list );
+ else
+ aligner.align( interval_list.seq_table, interval_list );
+
+ if( !opt_disable_backbone.set )
+ {
+
+ string bbcols_fname = opt_output.arg_value + ".bbcols";
+ string bb_seq_fname = opt_backbone_output.arg_value;
+ if( !opt_backbone_output.set )
+ bb_seq_fname = opt_output.arg_value + ".backbone";
+ applyBackbone( interval_list, bbcols_fname, bb_seq_fname, island_gap_size, hmm_identity, pgh, pgu );
+ }
+
+ interval_list.WriteStandardAlignment(*match_out);
+ match_out->flush();
+
+ for( size_t seqI = 0; seqI < pairwise_match_list.seq_table.size(); seqI++ )
+ delete pairwise_match_list.seq_table[seqI]; // an auto_ptr or shared_ptr could be great for this
+ for( size_t seqI = 0; seqI < pairwise_match_list.sml_table.size(); seqI++ )
+ delete pairwise_match_list.sml_table[seqI];
+
+// only explicitly free memory if absolutely necessary
+// since free() is very slow and the OS will reclaim it at program exit anyways
+ if(opt_mem_clean.set)
+ {
+ // free memory used by pairwise matches
+ for( size_t mI = 0; mI < pairwise_match_list.size(); mI++ )
+ pairwise_match_list[mI]->Free();
+
+ if( opt_output.set )
+ delete match_out;
+ }
+
+/*
+}catch( gnException& gne ) {
+ cerr << "Unhandled gnException: " << gne << endl;
+ throw gne;
+ return -10;
+}catch( exception& e ) {
+ cerr << "Unhandled exception: " << e.what() << endl;
+ throw e;
+ return -11;
+}catch( char* message ){
+ cerr << "Unhandled exception: " << message << endl;
+ throw message;
+ return -12;
+}catch( const char* message ){
+ cerr << "Unhandled exception: " << message << " (const)\n";
+ throw message;
+ return -14;
+}catch(...){
+ cerr << "Unknown exception occurred.\n";
+ throw;
+ return -13;
+}
+*/
+ return 0;
+}
+
diff --git a/src/projectAndStrip.cpp b/src/projectAndStrip.cpp
new file mode 100644
index 0000000..34d11e8
--- /dev/null
+++ b/src/projectAndStrip.cpp
@@ -0,0 +1,144 @@
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <string>
+#include <fstream>
+#include <vector>
+#include <algorithm>
+#include <iostream>
+#include <sstream>
+#include "libGenome/gnFilter.h"
+#include "libMems/IntervalList.h"
+#include "libMems/MatchList.h"
+#include "libMems/GappedAlignment.h"
+#include "libMems/Matrix.h"
+#include "libMems/MatchProjectionAdapter.h"
+#include "libMems/Aligner.h"
+#include "libMems/Islands.h"
+#include "libGenome/gnFASSource.h"
+#include <boost/tuple/tuple.hpp>
+#include "libMems/ProgressiveAligner.h"
+
+using namespace std;
+using namespace genome;
+using namespace mems;
+
+typedef boost::tuple< uint, gnSeqI, gnSeqI, vector< uint > > bbcol_t;
+
+ /**
+  * projectAndStrip entry point.
+  *
+  * Reads an XMFA alignment (argv[1]), keeps only the intervals in which every
+  * requested sequence (argv[3..]) is present, projects those intervals onto the
+  * requested sequences, recomputes LCBs over the projection and writes the
+  * result to argv[2].
+  *
+  * @return 0 on success, -1 on usage, I/O, out-of-range sequence id or
+  *         gnException, -2 on std::exception, -3 on a thrown C string,
+  *         -4 on any other exception.
+  */
+ int main( int argc, char* argv[] )
+ {
+     if( argc < 5 )
+     {
+         cerr << "Usage: projectAndStrip <input xmfa> <output xmfa> <seq1> <seq2>...<seqN>\n";
+         cerr << "\nNumeric sequence identifiers start at 0.\n";
+         return -1;
+     }
+     ifstream aln_in;
+     aln_in.open( argv[1] );
+     if( !aln_in.is_open() ){
+         cerr << "Error opening " << argv[1] << endl;
+         return -1;
+     }
+     ofstream aln_out;
+     aln_out.open( argv[2] );
+     if( !aln_out.is_open() ){
+         cerr << "Error writing to " << argv[2] << endl;
+         return -1;
+     }
+     // sequence identifiers requested on the command line, in the order given
+     vector<uint> seq_ids(argc-3);
+     for( int i = 3; i < argc; ++i )
+         seq_ids[i - 3] = atoi(argv[i]);
+
+     try{
+         IntervalList input_ivs;
+         input_ivs.ReadStandardAlignment( aln_in );
+         aln_in.close();
+
+         LoadSequences( input_ivs, NULL );
+
+         // Every identifier is used below as an index into the alignment's
+         // sequences, so reject anything outside [0, seq_table.size()).
+         // A negative argument wraps to a huge uint and is caught here too.
+         for( size_t i = 0; i < seq_ids.size(); i++ )
+         {
+             if( seq_ids[i] >= input_ivs.seq_table.size() )
+             {
+                 cerr << "Error: sequence identifier " << argv[i + 3] << " is out of range; the alignment contains " << input_ivs.seq_table.size() << " sequences\n";
+                 return -1;
+             }
+         }
+
+         vector< GappedAlignment* > gaga_list;
+
+         for( size_t ivI = 0; ivI < input_ivs.size(); ivI++ )
+         {
+             Interval& iv = input_ivs[ivI];
+             // find the first requested sequence that is absent from this interval
+             size_t j = 0;
+             for( ; j < seq_ids.size(); j++ )
+             {
+                 if( iv.LeftEnd( seq_ids[j] ) == NO_MATCH )
+                     break;
+             }
+             if( j == seq_ids.size() )
+             {
+                 // all requested sequences are present: project the interval onto them
+                 vector<string> aln_mat;
+                 GetAlignment( iv, input_ivs.seq_table, aln_mat );
+                 GappedAlignment ga(seq_ids.size(), 0);
+                 GappedAlignment* gaga = ga.Copy();
+                 vector<string> sub_mat( seq_ids.size() );
+                 for( size_t sI = 0; sI < seq_ids.size(); sI++ )
+                 {
+                     gaga->SetStart( sI, iv.Start(seq_ids[sI]) );
+                     gaga->SetLength( iv.Length(seq_ids[sI]), sI );
+                     swap( sub_mat[sI], aln_mat[seq_ids[sI]] );
+                 }
+                 gaga->SetAlignment(sub_mat);
+                 gaga_list.push_back( gaga );
+             }
+         }
+
+         // make the first projected sequence forward-oriented in every alignment
+         for( size_t gI = 0; gI < gaga_list.size(); gI++ )
+             if( gaga_list[gI]->Orientation(0) == AbstractMatch::reverse )
+                 gaga_list[gI]->Invert();
+
+         cout << "constructing LCBs\n";
+         vector< gnSeqI > bps;
+         IntervalList real_out_ivs;
+         IdentifyBreakpoints(gaga_list, bps);
+         vector< vector< GappedAlignment* > > coal_ivs;
+         ComputeLCBs_v2(gaga_list, bps, coal_ivs);
+         real_out_ivs.seq_filename.resize(seq_ids.size());
+         real_out_ivs.seq_table.resize(seq_ids.size());
+         for( size_t sI = 0; sI < seq_ids.size(); sI++ )
+         {
+             real_out_ivs.seq_filename[sI] = input_ivs.seq_filename[seq_ids[sI]];
+             real_out_ivs.seq_table[sI] = input_ivs.seq_table[seq_ids[sI]];
+         }
+         real_out_ivs.resize( coal_ivs.size() );
+         for( size_t cI = 0; cI < coal_ivs.size(); cI++ )
+             real_out_ivs[cI].SetMatches(coal_ivs[cI]);
+         cout << "real_out_ivs.size() " << real_out_ivs.size() << endl;
+
+         addUnalignedIntervals( real_out_ivs );
+         real_out_ivs.WriteStandardAlignment( aln_out );
+         aln_out.close();
+     }catch( gnException& gne ){
+         cerr << gne << endl;
+         return -1;
+     }catch( exception& e ){
+         cerr << e.what() << endl;
+         return -2;
+     }catch( char const* c ){
+         cerr << c << endl;
+         return -3;
+     }catch(...){
+         cerr << "Unhandled exception" << endl;
+         return -4;
+     }
+     return 0;
+ }
+
diff --git a/src/randomGeneSample.cpp b/src/randomGeneSample.cpp
new file mode 100644
index 0000000..22aa880
--- /dev/null
+++ b/src/randomGeneSample.cpp
@@ -0,0 +1,165 @@
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <string>
+#include <fstream>
+#include <vector>
+#include <algorithm>
+#include <iostream>
+#include <sstream>
+#include "libGenome/gnFilter.h"
+#include "libMems/IntervalList.h"
+#include "libMems/MatchList.h"
+#include "libMems/GappedAlignment.h"
+#include "libMems/Matrix.h"
+#include "libMems/MatchProjectionAdapter.h"
+#include "libMems/Aligner.h"
+#include "libMems/Islands.h"
+#include "libGenome/gnFASSource.h"
+#include <boost/tuple/tuple.hpp>
+#include "libMems/ProgressiveAligner.h"
+#include "libMems/Backbone.h"
+#include "libGenome/gnFeature.h"
+#include "libGenome/gnFASSource.h"
+
+using namespace std;
+using namespace genome;
+using namespace mems;
+
+typedef boost::tuple< uint, gnSeqI, gnSeqI, vector< uint > > bbcol_t;
+
+ /**
+  * randomGeneSample entry point.
+  *
+  * Draws up to <number of genes> CDS features at random from the sample genome,
+  * keeping only genes that lie inside a backbone segment present in every
+  * sequence, and writes the alignment columns covering each chosen gene to
+  * "<output base name>_<index>.fas".
+  *
+  * A gene that cannot be assigned to an alignment interval is reported and
+  * skipped; no file is written for that index.
+  *
+  * @return 0 on success, -1 on usage error, unreadable alignment or an
+  *         out-of-range sample genome, -2 if the backbone file cannot be
+  *         opened, -3 if the eligible genes run out before the requested
+  *         number has been sampled.
+  */
+ int main( int argc, char* argv[] )
+ {
+     if( argc < 6 )
+     {
+         cerr << "Usage: randomGeneSample <input xmfa> <backbone seq file> <sample genome> <number of genes> <output base name> [random seed]\n";
+         return -1;
+     }
+     ifstream aln_in;
+     aln_in.open( argv[1] );
+     if( !aln_in.is_open() ){
+         cerr << "Error opening " << argv[1] << endl;
+         return -1;
+     }
+     uint gene_count = atoi( argv[4] );
+     uint sgI = atoi( argv[3] );
+     string output_base = argv[5];
+
+     // an explicit seed makes the sample reproducible
+     if( argc == 7 )
+         srand(atoi(argv[6]));
+     else
+         srand(time(NULL));
+
+     IntervalList input_ivs;
+     input_ivs.ReadStandardAlignment( aln_in );
+     aln_in.close();
+     LoadSequences( input_ivs, &cout );
+
+     // sgI indexes seq_table and the backbone columns below
+     if( sgI >= input_ivs.seq_table.size() )
+     {
+         cerr << "Error: sample genome " << argv[3] << " is out of range; the alignment contains " << input_ivs.seq_table.size() << " sequences\n";
+         return -1;
+     }
+
+     vector< bb_seqentry_t > backbone;
+     ifstream bb_in;
+     bb_in.open( argv[2] );
+     if( !bb_in.is_open() ){
+         cerr << "Error opening \"" << argv[2] << "\"" << endl;
+         return -2;
+     }
+     readBackboneSeqFile( bb_in, backbone );
+     bb_in.close();
+
+     // collect the CDS features of the sample genome; other features are released
+     gnSequence* gen0 = input_ivs.seq_table[sgI];
+     vector< gnBaseFeature* > genes;
+     for( size_t featI = 0; featI < gen0->getFeatureListLength(); featI++ )
+     {
+         gnBaseFeature* feat = gen0->getFeature(featI);
+         if( feat->GetName() == "CDS" )
+             genes.push_back( feat );
+         else
+             delete feat;
+     }
+
+     cout << genes.size() << " of the " << gen0->getFeatureListLength() << " annotated features are CDS\n";
+
+     // number of genes that have neither been sampled nor rejected yet;
+     // sampled/rejected slots in `genes` are set to NULL
+     size_t candidates = genes.size();
+
+     // pick a gene at random from the sample genome, extract the alignment, and write it to a file
+     for( size_t geneI = 0; geneI < gene_count; geneI++ )
+     {
+         cerr << "picking gene\n";
+         int randy = -1;
+         bool found = false;
+         while( candidates > 0 )
+         {
+             randy = rand() % genes.size();
+             // has this gene already been used or rejected?
+             if( genes[randy] == NULL )
+                 continue;
+             // is this gene part of N-way backbone?
+             gnLocation loc = genes[randy]->GetLocation(0);
+             int64 lend = loc.GetFirst();
+             int64 rend = loc.GetLast();
+             size_t bbI = 0;
+             for( ; bbI < backbone.size(); bbI++ )
+             {
+                 if( genome::absolut(backbone[bbI][sgI].first) <= lend && rend <= genome::absolut(backbone[bbI][sgI].second) )
+                     break;
+             }
+             size_t seqI = 0;
+             for( ; bbI < backbone.size() && seqI < input_ivs.seq_table.size(); ++seqI )
+             {
+                 if( backbone[bbI][seqI].first == 0 || backbone[bbI][seqI].second == 0 )
+                     break;
+             }
+             if( seqI == input_ivs.seq_table.size() && bbI < backbone.size() )
+             {
+                 found = true;	// found a containing segment
+                 break;
+             }
+             // The test above depends only on the gene and the backbone, so this
+             // gene can never qualify: retire it so the search is guaranteed to end.
+             delete genes[randy];
+             genes[randy] = NULL;
+             --candidates;
+         }
+         if( !found )
+         {
+             cerr << "Error: no eligible genes remain; sampled " << geneI << " of " << gene_count << " requested genes\n";
+             return -3;
+         }
+         // print out the feature name
+         for( size_t qI = 0; qI < genes[randy]->GetQualifierListLength(); qI++ )
+         {
+             if( genes[randy]->GetQualifierName(qI) == "gene" )
+                 cout << "gene:\t" << genes[randy]->GetQualifierValue(qI) << endl;
+         }
+         // extract the alignment
+         gnLocation loc = genes[randy]->GetLocation(0);
+         int64 lend = loc.GetFirst();
+         int64 rend = loc.GetLast();
+         cerr << "lend: " << lend << "\trend: " << rend << endl;
+         size_t ivI = 0;
+         for( ivI = 0; ivI < input_ivs.size(); ivI++ )
+         {
+             if( input_ivs[ivI].Start(sgI) != NO_MATCH )
+             {
+                 gnSeqI iv_rend = genome::absolut(input_ivs[ivI].Start(sgI)) + input_ivs[ivI].Length(sgI);
+                 if( genome::absolut(input_ivs[ivI].Start(sgI)) < lend && rend < iv_rend )
+                     break;
+             }
+         }
+         if( ivI == input_ivs.size() )
+         {
+             cerr << "Error: unable to assign gene to an interval!\n" << "coordinates: " << lend << '\t' << rend << endl;
+             // input_ivs[ivI] would be out of range here: retire the gene and move on
+             delete genes[randy];
+             genes[randy] = NULL;
+             --candidates;
+             continue;
+         }
+         cerr << "making iv_cga\n";
+         CompactGappedAlignment<> iv_cga(input_ivs[ivI]);
+         CompactGappedAlignment<> col_cga;
+         cerr << "getting left and right cols\n";
+         gnSeqI lcol = iv_cga.SeqPosToColumn( sgI, lend );
+         gnSeqI rcol = iv_cga.SeqPosToColumn( sgI, rend );
+         cerr << "left col: " << lcol << "\tright_col: " << rcol << endl;
+         iv_cga.copyRange(col_cga, lcol, rcol-lcol + 1);
+         cerr << "getting alignment\n";
+         vector< string > aln;
+         GetAlignment( col_cga, input_ivs.seq_table, aln );
+         gnSequence gene_aln;
+         for( size_t i = 0; i < aln.size(); i++ )
+         {
+             gene_aln += aln[i];
+             stringstream ss;
+             ss << "seq" << i;
+             gene_aln.setContigName(i, ss.str());
+         }
+         cerr << "writing fasta\n";
+         stringstream of_name;
+         of_name << output_base << "_" << geneI << ".fas";
+         gnFASSource::Write( gene_aln, of_name.str() );
+
+         // done with this gene
+         delete genes[randy];
+         genes[randy] = NULL;
+         --candidates;
+     }
+
+     // release the features that were never sampled (deleting NULL is a no-op)
+     for( size_t gI = 0; gI < genes.size(); gI++ )
+         delete genes[gI];
+
+     return 0;
+ }
+
diff --git a/src/repeatoire.cpp b/src/repeatoire.cpp
new file mode 100644
index 0000000..45daf65
--- /dev/null
+++ b/src/repeatoire.cpp
@@ -0,0 +1,2716 @@
+#include "libGenome/gnSequence.h"
+#include "libMems/Interval.h"
+#include "libMems/CompactGappedAlignment.h"
+#include "libMems/Islands.h"
+#include "libMems/Aligner.h"
+#include "libMems/MuscleInterface.h"
+#include "libGenome/gnFASSource.h"
+#include "libMems/Backbone.h"
+#include "libMems/ProgressiveAligner.h"
+#include "libMems/HomologyHMM/parameters.h"
+
+#include <iomanip>
+#include <iostream>
+#include <algorithm>
+#include <cctype>
+
+#include "MatchRecord.h"
+#include "SeedMatchEnumerator.h"
+//#include "procrastUtilities.h"
+
+#include <boost/tuple/tuple.hpp>
+#include <boost/program_options/cmdline.hpp>
+#include <boost/program_options.hpp>
+namespace po = boost::program_options;
+using namespace std;
+using namespace genome;
+using namespace mems;
+
+ // File-scope flag; starts out disabled.
+ bool print_warnings = false;
+
+ // Status codes shared by the routines in this file.
+ enum rvalue
+ {
+     OK = 0,
+     FAILED = 1,
+     DONE = 2,
+     NOVEL = 3,
+     FIXME = 110
+ };
+ // Ten-entry lookup table of score drop-off values.
+ // NOTE(review): the meaning of the index is not visible here -- confirm against the users of this table.
+ int scoredropoff_matrix[10] = {
+     0, 0, 4756, 9144, 13471,
+     17981, 25302, 30945, 38361, 40754
+ };
+ // File-scope counter, initially zero.
+ int ccount = 0;
+ /**
+  * A Match Position Entry pairs a pointer to a match with the index of the
+  * match component that covers a given sequence coordinate.
+  */
+ typedef std::pair< MatchRecord*, size_t > MatchPositionEntry;
+ /**
+  * The Match Position Lookup Table holds one entry per sequence position, so
+  * it should be sized to match the length of the sequence.
+  */
+ typedef std::vector< MatchPositionEntry > MatchPositionLookupTable;
+
+ /**
+  * A single entry of the neighborhood list.
+  * Plain data holder: all members are public and none are initialized here.
+  */
+ class NeighborhoodListEntry
+ {
+ public:
+     /** the match this entry refers to */
+     MatchRecord* match;
+     /** true for identical (1) and false for opposite (-1) */
+     bool relative_orientation;
+     /** the x value in the paper (matching component of M_i) */
+     size_t Mi_component;
+     /** the d value from the paper */
+     size_t distance;
+     /** the y value in the paper (matching component of M_j) */
+     size_t Mj_component;
+ };
+
+ /**
+  * Ordering functor for std::sort over neighborhood list entries.
+  * Keys, in decreasing priority: match pointer, relative orientation
+  * (false sorts before true), Mi_component, distance.
+  * Mj_component does not take part in the ordering.
+  */
+ class NeighborhoodListComparator
+ {
+ public:
+     bool operator()( const NeighborhoodListEntry& a, const NeighborhoodListEntry& b )
+     {
+         if( a.match == b.match )
+         {
+             if( a.relative_orientation == b.relative_orientation )
+             {
+                 if( a.Mi_component == b.Mi_component )
+                     return a.distance < b.distance;
+                 return a.Mi_component < b.Mi_component;
+             }
+             // orientations differ: the entry with orientation == false goes first
+             return !a.relative_orientation;
+         }
+         return a.match < b.match;
+     }
+ };
+
+
+ /**
+  * Descending order by multiplicity; ties are broken by descending spscore.
+  * @return true when a should be placed before b
+  */
+ bool scorecmp( GappedMatchRecord* a, GappedMatchRecord* b )
+ {
+     const bool a_larger = a->Multiplicity() > b->Multiplicity();
+     const bool b_larger = a->Multiplicity() < b->Multiplicity();
+     if( a_larger || b_larger )
+         return a_larger;
+     // equal multiplicity: fall back to the sum-of-pairs score
+     return a->spscore > b->spscore;
+ }
+
+ /**
+  * Descending order by spscore; ties are broken by descending multiplicity.
+  * @return true when a should be placed before b
+  */
+ bool score_by_sp( GappedMatchRecord* a, GappedMatchRecord* b )
+ {
+     const bool a_larger = a->spscore > b->spscore;
+     const bool b_larger = a->spscore < b->spscore;
+     if( a_larger || b_larger )
+         return a_larger;
+     // neither score is strictly larger: fall back to the multiplicity
+     return a->Multiplicity() > b->Multiplicity();
+ }
+
+ /**
+  * Descending order by alignment length; ties are broken by descending spscore.
+  * @return true when a should be placed before b
+  */
+ bool score_by_length( GappedMatchRecord* a, GappedMatchRecord* b )
+ {
+     const bool a_longer = a->AlignmentLength() > b->AlignmentLength();
+     const bool b_longer = a->AlignmentLength() < b->AlignmentLength();
+     if( a_longer || b_longer )
+         return a_longer;
+     // equal length: fall back to the sum-of-pairs score
+     return a->spscore > b->spscore;
+ }
+ /**
+  * A NeighborhoodGroup bundles three things:
+  *   get<0>() - the MatchRecord pointer,
+  *   get<1>() - the component map to the match being extended (M_i),
+  *   get<2>() - a vector of distances to M_i.
+  */
+ typedef boost::tuple<
+     MatchRecord*,
+     std::vector< size_t >,
+     std::vector< size_t >
+ > NeighborhoodGroup;
+
+ /**
+  * Orders NeighborhoodGroups by their component maps (tuple element 1).
+  * Both maps are copied and sorted, then compared element by element.
+  * When one map is a prefix of the other, the longer one is only considered
+  * different if its first extra element is not the size_t max sentinel.
+  */
+ class NeighborhoodGroupComponentCompare
+ {
+ public:
+     /** strict-weak-ordering adapter around compare() */
+     bool operator()( const NeighborhoodGroup& a, const NeighborhoodGroup& b ) const
+     {
+         return compare(a,b) < 0;
+     }
+     /**
+      * Three-way comparison of the sorted component maps.
+      * @return a negative value if a orders before b, a positive value if
+      *         after, 0 if they are considered equal
+      */
+     int compare( const NeighborhoodGroup& a, const NeighborhoodGroup& b ) const
+     {
+         // compare component map vectors
+         // todo: make these buffers persistent to avoid reallocation!!
+         vector< size_t > ac(a.get<1>());
+         vector< size_t > bc(b.get<1>());
+         std::sort(ac.begin(), ac.end());
+         std::sort(bc.begin(), bc.end());
+         size_t i = 0;
+         for( ; i < ac.size() && i < bc.size(); ++i )
+         {
+             // Compare explicitly: returning ac[i] - bc[i] would narrow a size_t
+             // difference to int, which can flip the sign or yield a false 0.
+             if( ac[i] != bc[i] )
+                 return ac[i] < bc[i] ? -1 : 1;
+         }
+         if( i < ac.size() && ac[i] != (std::numeric_limits<size_t>::max)())
+             return 1;
+         else if( i < bc.size() && bc[i] != (std::numeric_limits<size_t>::max)())
+             return -1;
+
+         return 0;
+     }
+ };
+
+ /**
+  * Orders NeighborhoodGroups first by component map (delegated to
+  * NeighborhoodGroupComponentCompare) and, when those compare equal,
+  * lexicographically by their sorted distance vectors (tuple element 2).
+  */
+ class NeighborhoodGroupCompare
+ {
+ public:
+     bool operator()( const NeighborhoodGroup& a, const NeighborhoodGroup& b )
+     {
+         const int component_order = srcc.compare(a,b);
+         if( component_order < 0 )
+             return true;
+         if( component_order > 0 )
+             return false;
+
+         // component maps tie: decide on the sorted distance vectors
+         vector< size_t > a_dist(a.get<2>());
+         vector< size_t > b_dist(b.get<2>());
+         std::sort(a_dist.begin(), a_dist.end());
+         std::sort(b_dist.begin(), b_dist.end());
+         // first differing element decides; otherwise the shorter vector orders first
+         return std::lexicographical_compare( a_dist.begin(), a_dist.end(), b_dist.begin(), b_dist.end() );
+     }
+ protected:
+     NeighborhoodGroupComponentCompare srcc;
+ };
+
+
+ // Tests whether M_m can be chained onto M_i, i.e. rejects layouts such as:
+ // |---m1---> |----c1---> |---c2----> |----m2---->
+ //
+ // For every component x of M_i (paired with component_map[x] of M_m) the
+ // component of M_m must lie strictly to one side of M_i's component, with the
+ // same orientation. The match is chainable only if all components fall on the
+ // same side once orientation is taken into account.
+ // Returns true when chainable, false otherwise.
+ bool testChainableMatch( MatchRecord* M_i, MatchRecord* M_m, const vector< size_t >& component_map )
+ {
+     int n_left = 0;
+     int n_right = 0;
+     for( size_t x = 0; x < M_i->Multiplicity(); ++x )
+     {
+         const size_t z = component_map[x];
+
+         // if there is no match, we've got a problem
+         if( M_i->LeftEnd(x) == NO_MATCH || M_m->LeftEnd(z) == NO_MATCH )
+             genome::breakHere();
+
+         // components with differing orientation are never chained
+         if( M_m->Orientation(z) != M_i->Orientation(x) )
+             return false;
+
+         int64 lend_diff = M_m->LeftEnd(z) -M_i->LeftEnd(x);
+         int64 rend_diff = M_m->RightEnd(z) - M_i->RightEnd(x);
+         const bool m_is_left = rend_diff < 0 && lend_diff < 0;
+         const bool m_is_right = rend_diff > 0 && lend_diff > 0;
+         // neither strictly left nor strictly right: not chainable
+         if( !m_is_left && !m_is_right )
+             return false;
+
+         // a reverse-oriented component swaps the meaning of left and right
+         const bool is_forward = M_m->Orientation(z) == AbstractMatch::forward;
+         if( m_is_left == is_forward )
+             ++n_left;
+         else
+             ++n_right;
+     }
+
+     // components on both the left and the right mean the chain is not ok
+     return n_left == 0 || n_right == 0;
+ }
+
+ /**
+  * Grows every component of M_i so that it covers the corresponding component
+  * of M_m (component x of M_i pairs with component_map[x] of M_m).
+  * @return true if any component of M_i was modified
+  */
+ bool extendRange( MatchRecord* M_i, MatchRecord* M_m, const vector< size_t >& component_map )
+ {
+     bool changed = false;
+     size_t x = 0;
+     while( x < M_i->Multiplicity() )
+     {
+         const size_t z = component_map[x];
+         if( M_i->LeftEnd(x) == NO_MATCH || M_m->LeftEnd(z) == NO_MATCH )
+             genome::breakHere();
+
+         // amount by which M_m sticks out on the left of M_i
+         const int64 grow_left = M_i->LeftEnd(x) - M_m->LeftEnd(z);
+         if( grow_left > 0 )
+         {
+             if ( M_i->LeftEnd(x) - grow_left == 0)
+                 cerr << "extendRange debugme" << endl;
+             M_i->SetLeftEnd(x, M_i->LeftEnd(x) - grow_left);
+             M_i->SetLength(M_i->Length(x)+grow_left, x);
+             changed = true;
+         }
+
+         // amount by which M_m sticks out on the right of M_i
+         const int64 grow_right = M_m->RightEnd(z) - M_i->RightEnd(x);
+         if( grow_right > 0 )
+         {
+             M_i->SetLength( M_i->Length(x)+grow_right, x );
+             changed = true;
+         }
+         ++x;
+     }
+     return changed;
+ }
+
+ /**
+  * Shrinks every component of M_i so that it does not extend beyond the
+  * corresponding component of M_m (component x of M_i pairs with
+  * component_map[x] of M_m).
+  *
+  * The previous body was a copy of extendRange and still grew the range
+  * (left end moved further left, length increased) whenever M_i overhung M_m.
+  *
+  * NOTE(review): assumes the paired components overlap; a disjoint pair would
+  * shrink the length below zero -- confirm against callers.
+  *
+  * @return true if any component of M_i was modified
+  */
+ bool reduceRange( MatchRecord* M_i, MatchRecord* M_m, const vector< size_t >& component_map )
+ {
+     bool changed = false;
+     // trim range down to M_m
+     for( size_t x = 0; x < M_i->Multiplicity(); ++x )
+     {
+         size_t z = component_map[x];
+         if( M_i->LeftEnd(x) == NO_MATCH || M_m->LeftEnd(z) == NO_MATCH )
+             genome::breakHere();
+         // amount by which M_i overhangs M_m on the left
+         int64 lend_diff = M_m->LeftEnd(z) - M_i->LeftEnd(x);
+         if( lend_diff > 0 )
+         {
+             if ( M_i->LeftEnd(x) + lend_diff == 0)
+                 cerr << "reduceRange debugme" << endl;
+             // change the length first, then move the left end (same call order as extendRange uses)
+             const int64 new_length = M_i->Length(x) - lend_diff;
+             M_i->SetLeftEnd(x, M_i->LeftEnd(x) + lend_diff);
+             M_i->SetLength(new_length, x);
+             changed = true;
+         }
+         // amount by which M_i overhangs M_m on the right
+         int64 rend_diff = M_i->RightEnd(x) - M_m->RightEnd(z) ;
+         if( rend_diff > 0 )
+         {
+             M_i->SetLength( M_i->Length(x)-rend_diff, x );
+             changed = true;
+         }
+     }
+     return changed;
+ }
+
+ /**
+  * Composes two component maps through a shared intermediate match.
+  *
+  * @param srcmap           maps each source component to an intermediate component
+  * @param mid_multiplicity number of components of the intermediate match
+  * @param destmap          maps each destination component to an intermediate component
+  * @param newmap           output: newmap[i] is the destination component whose
+  *                         intermediate component equals srcmap[i], or the size_t
+  *                         max sentinel when there is none. Grown to srcmap.size()
+  *                         if it is shorter; extra trailing elements are untouched.
+  *
+  * Indices >= mid_multiplicity in either input map are treated as unmapped
+  * instead of being used to index out of bounds.
+  */
+ void remapComponents(const std::vector< size_t >& srcmap, size_t mid_multiplicity, const std::vector< size_t >& destmap, std::vector< size_t >& newmap )
+ {
+     const size_t unmapped = (std::numeric_limits<size_t>::max)();
+
+     // invert destmap: intermediate component -> destination component
+     std::vector< size_t > super_map( mid_multiplicity, unmapped );
+     for( size_t mapI = 0; mapI < destmap.size(); ++mapI )
+     {
+         if( destmap[mapI] < mid_multiplicity )
+             super_map[destmap[mapI]] = mapI;
+     }
+
+     // never write past the end of the caller's buffer
+     if( newmap.size() < srcmap.size() )
+         newmap.resize( srcmap.size(), unmapped );
+
+     for( size_t mapI = 0; mapI < srcmap.size(); ++mapI )
+         newmap[mapI] = srcmap[mapI] < mid_multiplicity ? super_map[srcmap[mapI]] : unmapped;
+ }
+
+ /**
+  * Classifies how M_j relates to M_i, component by component.
+  * Component y of M_j is paired with component ji_component_map[y] of M_i.
+  *
+  * With superset == false a component is "inside" when M_j's component does
+  * not extend beyond M_i's on either side; with superset == true the roles of
+  * M_i and M_j are exchanged.
+  *
+  * @param subsumed out: true iff every paired component is inside (true for an empty map)
+  * @param partial  out: true iff at least one paired component is inside
+  */
+ void classifyMatch( AbstractMatch* M_i, AbstractMatch* M_j, vector< size_t >& ji_component_map, bool& subsumed, bool& partial, bool superset = false )
+ {
+     subsumed = true;
+     partial = false;
+     for( size_t y = 0; y < ji_component_map.size(); ++y )
+     {
+         const size_t x = ji_component_map[y];
+         int64 lend_diff;
+         int64 rend_diff;
+         if( superset )
+         {
+             lend_diff = M_j->LeftEnd(y) - M_i->LeftEnd(x);
+             rend_diff = M_i->RightEnd(x)-M_j->RightEnd(y);
+         }
+         else
+         {
+             lend_diff = M_i->LeftEnd(x) - M_j->LeftEnd(y);
+             rend_diff = M_j->RightEnd(y) - M_i->RightEnd(x);
+         }
+
+         // a positive difference means an overhang on that side
+         const bool inside = lend_diff <= 0 && rend_diff <= 0;
+         if( inside )
+             partial = true;
+         else
+             subsumed = false;
+     }
+ }
+ // Superset counterpart of classifySubset: forwards the group's match and
+ // component map to classifyMatch with the superset flag switched on.
+ void classifySuperset( MatchRecord* M_i, NeighborhoodGroup& sr, bool& subsumed, bool& partial )
+ {
+     const bool as_superset = true;
+     classifyMatch( M_i, sr.get<0>(), sr.get<1>(), subsumed, partial, as_superset );
+ }
+
+// Classifies the group's match against M_i as a subset: forwards the group's
+// match and component map to classifyMatch with the superset flag cleared.
+void classifySubset( MatchRecord* M_i, NeighborhoodGroup& sr, bool& subsumed, bool& partial )
+{
+    const bool as_superset = false;
+    classifyMatch( M_i, sr.get<0>(), sr.get<1>(), subsumed, partial, as_superset );
+}
+
+/** Advances mr along subsuming_match pointers until it names a match whose subsuming_match is NULL. */
+void checkLink( MatchRecord*& mr )
+{
+    for( MatchRecord* next = mr->subsuming_match; next != NULL; next = mr->subsuming_match )
+        mr = next;
+}
+
+
+/**
+ * Brings a link up to date when its subset has been subsumed by another match.
+ * For every hop along subset->subsuming_match the component map is rewritten so that
+ * entry i becomes the old map's entry at subset->subsumption_component_map[i],
+ * and the link's subset pointer is moved to the subsuming match.
+ */
+void checkLink( MatchLink& mlink )
+{
+    while( mlink.subset->subsuming_match != NULL )
+    {
+        MatchRecord* stale = mlink.subset;
+        const size_t map_size = mlink.sub_to_super_map.size();
+        vector< size_t > remapped( map_size );
+        for( size_t compI = 0; compI < map_size; ++compI )
+            remapped[compI] = mlink.sub_to_super_map[ stale->subsumption_component_map[compI] ];
+        mlink.sub_to_super_map.swap( remapped );
+        mlink.subset = stale->subsuming_match;
+    }
+}
+
+/**
+ * Follows subsuming_match pointers from mr until reaching a match that has none,
+ * translating component through each subsumption_component_map along the way.
+ * Both mr and component are updated in place.
+ */
+void checkLinkAndComponent( MatchRecord*& mr, size_t& component )
+{
+    for( ; mr->subsuming_match != NULL; mr = mr->subsuming_match )
+        component = mr->subsumption_component_map[component];
+}
+
+/** returns one of the superset links from a match. direction is 1 for left, -1 for right */
+MatchLink& getSuperset( MatchRecord* mr, int direction )
+{
+    // any value other than 1 selects the right-hand link
+    return direction == 1 ? mr->left_superset : mr->right_superset;
+}
+
+/** returns the list of subset links on one side of a match. direction is 1 for left, -1 for right */
+vector<MatchLink>& getSubsets( MatchRecord* mr, int direction )
+{
+    // any value other than 1 selects the right-hand list
+    return direction == 1 ? mr->left_subset_links : mr->right_subset_links;
+}
+
+/** returns the list of extra subset links on one side of a match. direction is 1 for left, -1 for right */
+vector<MatchLink>& getExtraSubsets( MatchRecord* mr, int direction )
+{
+    // any value other than 1 selects the right-hand list
+    return direction == 1 ? mr->extra_left_subsets : mr->extra_right_subsets;
+}
+/**
+ * Inverse of unlinkSuperset: linkSuperset followed by unlinkSuperset should
+ * exactly offset each other.
+ * Builds a link from mr to supermatch and, when the link's superset is non-NULL,
+ * appends it to that superset's subset list. The side of the superset is
+ * -direction multiplied by the relative orientation of mr's component 0 and the
+ * superset component it maps to. Afterwards every link in mr's subset list and
+ * extra-subset list for `direction` is pointed back at mr, as is the matching
+ * superset link stored inside each of those subsets.
+ * NOTE(review): slink is a local copy and is not stored into
+ * getSuperset(mr, direction) here -- confirm that callers do not rely on that.
+ * @param mr the subset match being linked
+ * @param supermatch the superset match to link to
+ * @param comp_list components of the superset covered by mr
+ * @param comp_map maps components of mr onto components of the superset
+ * @param direction 1 for left, -1 for right
+ */
+void linkSuperset( MatchRecord* mr, MatchRecord* supermatch, boost::dynamic_bitset<>& comp_list, vector< size_t >& comp_map, int direction )
+{
+    // update superset links
+    MatchLink slink = MatchLink( supermatch, mr, comp_list, comp_map );
+    if( slink.superset != NULL )
+    {
+        slink.subset = mr;
+        int parity = mr->Orientation(0) == slink.superset->Orientation(slink.sub_to_super_map[0]) ? 1 : -1;
+        getSubsets(slink.superset,-direction*parity).push_back(slink);
+    }
+
+    // re-point the regular subset links at mr
+    vector< MatchLink >& subsets = getSubsets(mr,direction);
+    for( size_t subI = 0; subI < subsets.size(); ++subI )
+    {
+        subsets[subI].superset = mr;
+        int parity = mr->Orientation(subsets[subI].sub_to_super_map[0]) == subsets[subI].subset->Orientation(0) ? 1 : -1;
+        getSuperset(subsets[subI].subset, -direction*parity).superset = mr;
+    }
+
+    //punt: link extra subsets too!
+    vector< MatchLink >& extrasubsets = getExtraSubsets(mr,direction);
+    for( size_t subI = 0; subI < extrasubsets.size(); ++subI )
+    {
+        // fixed: this used to write subsets[subI], which left the extra link
+        // untouched and could index past the end of the regular subset list
+        extrasubsets[subI].superset = mr;
+        int parity = mr->Orientation(extrasubsets[subI].sub_to_super_map[0]) == extrasubsets[subI].subset->Orientation(0) ? 1 : -1;
+        getSuperset(extrasubsets[subI].subset, -direction*parity).superset = mr;
+    }
+}
+/**
+ * Detaches mr from its superset on one side.
+ * Every link whose subset is mr is erased from the superset's subset list and
+ * extra-subset list on the corresponding side, then mr's own superset link is
+ * cleared. Does nothing when mr has no superset in that direction.
+ * @param mr the subset match to detach
+ * @param direction 1 for left, -1 for right
+ */
+void unlinkSuperset( MatchRecord* mr, int direction )
+{
+    MatchLink& superlink = getSuperset( mr, direction );
+    MatchRecord* super = superlink.superset;
+    if( super == NULL )
+        return;
+
+    int parity = mr->Orientation(0) == super->Orientation(superlink.sub_to_super_map[0]) ? 1 : -1;
+    const int super_side = -direction*parity;
+
+    vector< MatchLink >& subs = getSubsets( super, super_side );
+    for( vector< MatchLink >::iterator linkI = subs.begin(); linkI != subs.end(); )
+    {
+        if( linkI->subset == mr )
+            linkI = subs.erase( linkI );
+        else
+            ++linkI;
+    }
+
+    //tjt: unlink extrasubsets!
+    vector< MatchLink >& extrasubs = getExtraSubsets( super, super_side );
+    for( vector< MatchLink >::iterator linkI = extrasubs.begin(); linkI != extrasubs.end(); )
+    {
+        if( linkI->subset == mr )
+            linkI = extrasubs.erase( linkI );
+        else
+            ++linkI;
+    }
+
+    superlink.clear();
+}
+
+/** Detaches mr from its supersets on both sides: direction 1 first, then direction -1. */
+void unlinkSupersets( MatchRecord* mr )
+{
+    for( int direction = 1; direction >= -1; direction -= 2 )
+        unlinkSuperset( mr, direction );
+}
+
+/**
+ * Debugging aid that checks a collection of match records for consistency.
+ * Each failed check prints a short message to cerr and calls genome::breakHere().
+ * Checks performed:
+ *  - no sequence component of any record has LeftEnd == NO_MATCH
+ *  - for each subset link of a record, the subset (after following
+ *    subsuming_match pointers) has a superset link pointing back at the record
+ *  - following subsuming_match pointers does not change the subset's multiplicity
+ *  - each link's super_component_list bit count equals its sub_to_super_map size
+ *  - for each superset link of a record, the superset lists the record as a subset
+ */
+template< class MatchRecordPtrType >
+void validate( vector< MatchRecordPtrType >& records )
+{
+    // make sure all matches have non-zero components
+    for( size_t recI = 0; recI < records.size(); ++recI )
+    {
+        bool missing = false;
+        for( size_t seqI = 0; !missing && seqI < records[recI]->SeqCount(); ++seqI )
+            missing = records[recI]->LeftEnd(seqI) == NO_MATCH;
+        if( missing )
+        {
+            cerr << "missing component\n";
+            genome::breakHere();
+        }
+    }
+
+    // make sure all links are consistent
+    for( size_t recI = 0; recI < records.size(); ++recI )
+    {
+        MatchRecord* mr = records[recI];
+        for( int direction = 1; direction >= -1; direction -= 2 )
+        {
+            vector< MatchLink >& sub_links = getSubsets(mr, direction);
+            for( size_t subI = 0; subI < sub_links.size(); subI++ )
+            {
+                MatchLink& link = sub_links[subI];
+
+                // follow any stale links
+                MatchRecord* sub = link.subset;
+                size_t sub_mult = sub->Multiplicity();
+                while( sub->subsuming_match != NULL )
+                    sub = sub->subsuming_match;
+
+                size_t parity_seq = link.sub_to_super_map[0];
+                int parity = mr->Orientation(parity_seq) == sub->Orientation(0) ? 1 : -1;
+                // the subset must point back to this superset in its own link
+                if( getSuperset(sub, -direction*parity).superset != mr )
+                {
+                    cerr << "ohno\n";
+                    genome::breakHere();
+                }
+                if( sub_mult != sub->Multiplicity() )
+                {
+                    cerr << "unequal mult\n";
+                    genome::breakHere();
+                }
+                if( link.super_component_list.count() != link.sub_to_super_map.size())
+                {
+                    cerr << "broke\n";
+                    genome::breakHere();
+                }
+            }
+
+            // make sure the supersets have this subset
+            MatchLink& up_link = getSuperset(mr,direction);
+            if( up_link.superset != NULL )
+            {
+                MatchRecord* sup = up_link.superset;
+                int parity = mr->Orientation(0) == sup->Orientation(up_link.sub_to_super_map[0]) ? 1 : -1;
+                vector< MatchLink >& sup_subs = getSubsets(sup,-direction*parity);
+                size_t subI = 0;
+                while( subI < sup_subs.size() && sup_subs[subI].subset != mr )
+                    subI++;
+                if( subI == sup_subs.size() )
+                {
+                    cerr << "oh crap!\n";
+                    genome::breakHere();
+                }
+                if( up_link.super_component_list.count() != up_link.sub_to_super_map.size())
+                {
+                    cerr << "broke 3\n";
+                    genome::breakHere();
+                }
+            }
+        }
+    }
+}
+
+/**
+ * Converts groups of neighborhood-list indices into NeighborhoodGroup tuples.
+ * For each group the tuple holds the match of the group's first entry, a map
+ * from that match's components to M_i components, and the per-component distance.
+ * Components not mentioned by any entry of the group keep the value
+ * numeric_limits<size_t>::max(). The resulting list is sorted with
+ * NeighborhoodGroupCompare.
+ * @param group_list (output) resized to one tuple per group
+ * @param group_members for each group, indices into neighborhood_list
+ * @param neighborhood_list the entries the indices refer to
+ */
+void createNeighborhoodGroupList( vector< NeighborhoodGroup >& group_list, vector< vector< size_t > >& group_members, vector< NeighborhoodListEntry >& neighborhood_list )
+{
+    const size_t unset = (std::numeric_limits<size_t>::max)();
+    group_list.resize( group_members.size() );
+    for( size_t gI = 0; gI < group_members.size(); gI++ )
+    {
+        vector< size_t >& members = group_members[gI];
+        MatchRecord* M_j = neighborhood_list[members[0]].match;
+
+        vector< size_t > component_map(M_j->Multiplicity(), unset);
+        vector< size_t > distances(M_j->Multiplicity(), unset);
+        for( size_t memberI = 0; memberI < members.size(); ++memberI )
+        {
+            const NeighborhoodListEntry& entry = neighborhood_list[members[memberI]];
+            component_map[entry.Mj_component] = entry.Mi_component;
+            distances[entry.Mj_component] = entry.distance;
+        }
+
+        // hand the freshly built vectors over to the tuple without copying
+        NeighborhoodGroup& group = group_list[gI];
+        group.get<0>() = M_j;
+        swap( group.get<1>(), component_map );
+        swap( group.get<2>(), distances );
+    }
+
+    static NeighborhoodGroupCompare src;
+    std::sort( group_list.begin(), group_list.end(), src );
+}
+
+/**
+ * Assigns the superset link from M_j to M_i. This function should be called when
+ * M_j has been chained as part of M_i and M_j has an outgoing superset link.
+ * M_j's link on side direction*parity is translated into M_i's components via
+ * M_j->subsumption_component_map, stored as M_i's superset link for `direction`,
+ * removed from M_j, and registered in the superset's subset list.
+ */
+void inheritSuperset( MatchRecord* M_i, MatchRecord* M_j, int direction, int parity )
+{
+    const int j_side = direction*parity;
+    MatchLink& j_link = getSuperset( M_j, j_side );
+
+    // remap superset components
+    vector< size_t > comp_map( M_i->Multiplicity() );
+    for( size_t ci = 0; ci < comp_map.size(); ci++ )
+        comp_map[ci] = j_link.sub_to_super_map[ M_j->subsumption_component_map[ci] ];
+
+    // rebuild the superset component list
+    boost::dynamic_bitset<> comp_list(j_link.superset->Multiplicity(), false);
+    for( size_t compI = 0; compI < comp_map.size(); ++compI )
+        comp_list.set(comp_map[compI]);
+
+    // install the link on M_i before M_j's copy is cleared
+    MatchLink& slink = getSuperset(M_i, direction);
+    slink = MatchLink( j_link.superset, M_i, comp_list, comp_map );
+    unlinkSuperset(M_j,j_side);
+
+    int slink_parity = M_i->Orientation(0) == slink.superset->Orientation(slink.sub_to_super_map[0]) ? 1 : -1;
+    getSubsets(slink.superset,-direction*slink_parity).push_back(slink);
+}
+
+/**
+ * Picks the list that belongs to the current direction of extension:
+ * left_list when direction is 1, right_list for any other value.
+ */
+vector< NeighborhoodGroup >& selectList( vector< NeighborhoodGroup >& left_list, vector< NeighborhoodGroup >& right_list, int direction )
+{
+    if( direction == 1 )
+        return left_list;
+    return right_list;
+}
+
+
+
+/**
+ * Performs a superset link extension on M_i
+ * M_i is extended to cover its superset M_j in the given direction, M_j is added to
+ * M_i's chained matches, and the subset links of M_j are examined to find matches that
+ * are chainable with, or subsets of, M_i.
+ * @param M_i The match under extension. It must have a superset link in `direction`.
+ * @param direction 1 for left, -1 for right
+ * @param last_linked (output) Set to 2 when chaining changed M_i's range or a superset was inherited
+ * @param left_deferred_subsets (output) Receives subsets to be processed later
+ * @param right_deferred_subsets (output) Receives subsets to be processed later
+ * @param chain If false, chainable matches found through M_j are not chained
+ */
+void supersetLinkExtension( GappedMatchRecord*& M_i, int direction, int& last_linked,
+ vector< NeighborhoodGroup >& left_deferred_subsets,
+ vector< NeighborhoodGroup >& right_deferred_subsets, bool chain )
+{
+ // update the left end and look for another superset to chain with
+ // then extend all the way to that match
+ MatchRecord* M_j = getSuperset(M_i, direction).superset;
+ MatchLink ij_link = getSuperset(M_i, direction); // make a copy for safekeeping
+ int ij_parity = M_i->Orientation(0) == M_j->Orientation(ij_link.sub_to_super_map[0]) ? 1 : -1;
+
+ //
+ // Link extension part 1:
+ // extend M_i to include M_j, add M_j to the chained matches
+
+
+ // NOTE: this `changed` is not read anywhere below; a second `changed` is declared in the chaining loop
+ bool changed = extendRange( M_i, M_j, ij_link.sub_to_super_map );
+ M_i->chained_matches.push_back(M_j);
+ M_i->chained_component_maps.push_back(ij_link.sub_to_super_map);
+
+
+
+ // Link extension part 2:
+ // figure out whether any subsets between M_j and M_i got subsumed
+ // pass 0 walks M_j's regular subset links, pass 1 walks its extra subset links
+ for( size_t subtypeI = 0; subtypeI < 2; subtypeI++ )
+ {
+ vector< MatchLink >* mjsubs;
+ if( subtypeI == 0 )
+ mjsubs = &getSubsets(M_j, -direction*ij_parity);
+ else
+ mjsubs = &getExtraSubsets(M_j, -direction*ij_parity);
+ vector< MatchLink >& mj_otherside_subsets = *mjsubs;
+
+ for( size_t leftI = 0; leftI < mj_otherside_subsets.size(); ++leftI )
+ {
+ // only regular subset links are refreshed; extra subset links are used as stored
+ if( subtypeI == 0 )
+ checkLink( mj_otherside_subsets[leftI] );
+ MatchLink& jk_link = mj_otherside_subsets[leftI];
+ // components of M_j shared by M_i and M_k
+ boost::dynamic_bitset<> intersect = ij_link.super_component_list & jk_link.super_component_list;
+ MatchRecord* M_k = jk_link.subset;
+ if( M_k == M_i )
+ continue; // been there, chained that.
+ size_t inter_size = intersect.count();
+ if( inter_size < 2 )
+ continue; // no match
+ // keep only matches that lie entirely inside the shared components and
+ // share fewer components than M_i has
+ if( inter_size >= M_i->Multiplicity() || M_k->Multiplicity() != inter_size )
+ continue;
+
+ // has this guy already been subsumed? if so then just skip him
+ if( M_k->subsuming_match != NULL )
+ {
+ if( subtypeI != 1 )
+ breakHere(); // this should only happen with extra subsets
+ mj_otherside_subsets.erase(mj_otherside_subsets.begin()+leftI, mj_otherside_subsets.begin()+leftI+1 );
+ // step back so the element that slid into this slot is visited next
+ leftI--;
+ continue;
+ }
+
+ // M_k is a subset relative to M_i
+ int jk_parity = M_k->Orientation(0) == M_j->Orientation(jk_link.sub_to_super_map[0]) ? 1 : -1;
+ // NOTE: ik_parity is computed but not used in this loop
+ int ik_parity = ij_parity * jk_parity;
+
+ vector< size_t > component_map( M_k->Multiplicity() );
+ remapComponents(jk_link.sub_to_super_map, M_j->Multiplicity(), ij_link.sub_to_super_map, component_map );
+
+ NeighborhoodGroup sr = boost::make_tuple( M_k, component_map, vector<size_t>( M_k->Multiplicity(), 0 ) );
+ // defer it until we're done extending
+ selectList( left_deferred_subsets, right_deferred_subsets, -direction ).push_back( sr );
+ }
+ }
+
+ //
+ // Link extension part 3:
+ // classify outgoing links that share components with M_i
+ unlinkSuperset(M_i,direction);
+ // each vector below holds indices into mj_subsets
+ vector< size_t > supersets;
+ vector< size_t > chainable;
+ vector< size_t > subsets;
+ // NOTE: novel_subsets is filled below but not read again in this function
+ vector< size_t > novel_subsets;
+ vector< MatchLink >& mj_subsets = getSubsets(M_j, direction*ij_parity);
+ for( size_t leftI = 0; leftI < mj_subsets.size(); ++leftI )
+ {
+ checkLink( mj_subsets[leftI] );
+ boost::dynamic_bitset<> intersect = ij_link.super_component_list & mj_subsets[leftI].super_component_list;
+ MatchRecord* M_k = mj_subsets[leftI].subset;
+ if( M_k == M_i )
+ continue; // been there, chained that.
+ size_t inter_size = intersect.count();
+ if( inter_size < 2 )
+ continue; // no match
+ // M_k is a superset relative to M_i
+ if( inter_size == M_i->Multiplicity() && M_k->Multiplicity() > inter_size )
+ supersets.push_back(leftI);
+ else if( inter_size == M_i->Multiplicity() && M_k->Multiplicity() == inter_size )
+ chainable.push_back(leftI);
+ else if( inter_size < M_i->Multiplicity() && M_k->Multiplicity() == inter_size )
+ subsets.push_back(leftI);
+ else
+ novel_subsets.push_back(leftI);
+ }
+
+
+ // the trailing "&& 1" has no effect on the condition
+ if( supersets.size() > 0 && 1)
+ {
+//#4018
+ cerr << "something is wrong, we should never have supersets during link extension!\n";
+ genome::breakHere();
+ }
+
+ if (chain)
+ {
+ for( size_t cI = 0; cI < chainable.size(); ++cI )
+ {
+ // more than one chainable match is reported as an error, but processing continues
+ if( chainable.size() > 1 && 1)
+ {
+ cerr << "bad news bruthah\n";
+ genome::breakHere();
+ }
+ // chain with this guy
+ MatchLink& jk_link = mj_subsets[chainable[cI]];
+ MatchRecord* M_k = jk_link.subset;
+ if( M_k->extended )
+ {
+ cerr << "extensor crap\n";
+ breakHere();
+ }
+ if( M_k == M_i )
+ {
+ cerr << "crap\n";
+ breakHere();
+ }
+
+ // update boundary coordinates
+ vector< size_t > component_map( M_i->Multiplicity() );
+ remapComponents(ij_link.sub_to_super_map, M_j->Multiplicity(), jk_link.sub_to_super_map, component_map );
+ bool changed = extendRange( M_i, M_k, component_map );
+ if( changed )
+ last_linked = 2;
+
+ // unlink from superset
+ // jk_link refers to an element of mj_subsets and is not used after the unlink below
+ int jk_parity = M_k->Orientation(0) == M_j->Orientation(jk_link.sub_to_super_map[0]) ? 1 : -1;
+ unlinkSuperset(M_k,-direction*ij_parity*jk_parity);
+ // set subsuming match ptrs
+ M_k->subsuming_match = M_i;
+ M_k->subsumption_component_map = component_map;
+ M_i->chained_matches.push_back( M_k );
+ M_i->chained_component_maps.push_back( component_map );
+
+ // compensate for the deletion in subsets
+ // NOTE(review): presumably the unlink above erased M_k's entry from mj_subsets,
+ // so stored indices past it shift down by one -- confirm
+ for( size_t subI = 0; subI < chainable.size(); subI++ )
+ if( chainable[subI] > chainable[cI] )
+ chainable[subI]--;
+ for( size_t subI = 0; subI < subsets.size(); subI++ )
+ if( subsets[subI] > chainable[cI] )
+ subsets[subI]--;
+
+ // inherit M_k's outward superset and stop chaining here
+ if( getSuperset( M_k, direction*ij_parity*jk_parity ).superset != NULL )
+ {
+ inheritSuperset( M_i, M_k, direction, ij_parity*jk_parity );
+ last_linked = 2;
+ break;
+ }
+ }
+ }
+ // process subsets
+ for( size_t sI = 0; sI < subsets.size(); ++sI )
+ {
+ // change M_k to point at M_i
+ MatchLink& jk_link = mj_subsets[subsets[sI]];
+ MatchRecord* M_k = jk_link.subset;
+ int jk_parity = M_k->Orientation(0) == M_j->Orientation(jk_link.sub_to_super_map[0]) ? 1 : -1;
+ int ik_parity = ij_parity * jk_parity;
+
+ vector< size_t > component_map( M_k->Multiplicity() );
+ remapComponents(jk_link.sub_to_super_map, M_j->Multiplicity(), ij_link.sub_to_super_map, component_map );
+ // rebuild the superset component list
+ // NOTE: comp_list is built here but not used afterwards in this function
+ boost::dynamic_bitset<> comp_list(M_i->Multiplicity(), false);
+ for( size_t compI = 0; compI < component_map.size(); ++compI )
+ if(component_map[compI] != (std::numeric_limits<size_t>::max)())
+ comp_list.set(component_map[compI]);
+ unlinkSuperset(M_k,-1*direction*ik_parity);
+
+ // add to the deferred subsets list
+ NeighborhoodGroup sr = boost::make_tuple( M_k, component_map, vector<size_t>( M_k->Multiplicity(), 0 ) );
+ vector< NeighborhoodGroup >& subset_list = selectList( left_deferred_subsets, right_deferred_subsets, direction );
+ subset_list.push_back( sr );
+
+ // compensate for the deletion in subsets
+ for( size_t subI = 0; subI < subsets.size(); subI++ )
+ if( subsets[subI] > subsets[sI] )
+ subsets[subI]--;
+ }
+}
+
+/**
+ * Scratch storage shared by all neighborhood list lookups.
+ * Keeping one long-lived set of buffers avoids allocating fresh vectors on
+ * every lookup; clear() empties them between uses.
+ */
+class NllBuffers
+{
+public:
+    std::vector< std::vector< size_t > > superset_groups;
+    std::vector< std::vector< size_t > > chainable_groups;
+    std::vector< std::vector< size_t > > subset_groups;
+    std::vector< std::vector< size_t > > novel_subset_groups;
+    vector< NeighborhoodListEntry > neighborhood_list;
+    vector< pair< size_t, size_t > > j_comp_sort_list;
+    vector<size_t> group_entries;
+
+    /** Reserves initial capacity in every buffer. */
+    NllBuffers()
+    {
+        const size_t group_capacity = 100;
+        const size_t entry_capacity = 1000;
+        superset_groups.reserve(group_capacity);
+        chainable_groups.reserve(group_capacity);
+        subset_groups.reserve(group_capacity);
+        novel_subset_groups.reserve(group_capacity);
+        neighborhood_list.reserve(10000);
+        j_comp_sort_list.reserve(entry_capacity);
+        group_entries.reserve(entry_capacity);
+    }
+
+    /** Empties every buffer. */
+    void clear()
+    {
+        superset_groups.clear();
+        chainable_groups.clear();
+        subset_groups.clear();
+        novel_subset_groups.clear();
+        neighborhood_list.clear();
+        j_comp_sort_list.clear();
+        group_entries.clear();
+    }
+};
+
+// the single buffer set used by neighborhoodListLookup
+NllBuffers nllbufs;
+
+/**
+ * Performs a neighborhood list lookup to find other matches nearby the match of interest
+ * The matches found are grouped by (match, relative orientation) and each group is
+ * classified as superset, chainable, subset or novel subset relative to M_i.
+ * Uses the global nllbufs as scratch storage, which is cleared on entry.
+ * @param M_i The primary match which is under extension
+ * @param match_pos_lookup_table Indexed by position; each entry is a (match, component) pair, with a NULL match where nothing starts
+ * @param superset_list (output) groups whose match has more components than M_i and covers all of M_i's components
+ * @param chainable_list (output) groups whose match has the same multiplicity as M_i and covers all of M_i's components
+ * @param subset_list (output) groups that cover all components of their match but fewer than M_i has
+ * @param novel_subset_list (output) all remaining groups
+ * @param direction 1 for left, -1 for right
+ * @param seed_size Subtracted from the right end to get the lookup start position when searching off the right end
+ * @param w The window size; positions at distance 1..w (or w + seed_size) are searched
+ * @param left_lookups Positions already searched leftward; updated by this call
+ * @param right_lookups Positions already searched rightward; updated by this call
+ * @param M_e (Optionally NULL) A gapped extension which will be added to M_i after its neighborhood has been searched
+ */
+void neighborhoodListLookup( GappedMatchRecord* M_i,
+ MatchPositionLookupTable& match_pos_lookup_table,
+ vector< NeighborhoodGroup >& superset_list,
+ vector< NeighborhoodGroup >& chainable_list,
+ vector< NeighborhoodGroup >& subset_list,
+ vector< NeighborhoodGroup >& novel_subset_list,
+ int direction,
+ uint seed_size,
+ uint w,
+ bitset_t& left_lookups,
+ bitset_t& right_lookups,
+ GappedMatchRecord* M_e
+ )
+{
+ // make sure storage is empty
+ nllbufs.clear();
+ //
+ // construct a neighborhood list and process the neighborhood groups
+ //
+ vector< NeighborhoodListEntry >& neighborhood_list = nllbufs.neighborhood_list;
+ for( size_t x = 0; x < M_i->Multiplicity(); ++x )
+ {
+ int o_x = M_i->Orientation(x) == AbstractMatch::forward ? 1 : -1;
+ // parity == 1 means the search walks toward lower coordinates in this component
+ int parity = o_x * direction;
+ int64 match_end = parity == 1 ? M_i->LeftEnd(x) : M_i->RightEnd(x) - seed_size + 1;
+
+ // record that this position has been searched in this direction.
+ // NOTE: the else below pairs with the inner if, so positions are only
+ // marked when match_end > 0
+ if( match_end > 0 )
+ if( (direction == 1 && left_lookups.test(match_end)) ||
+ (direction == -1 && right_lookups.test(match_end)) )
+ {
+ if(print_warnings)
+ cerr << "looking twice in the same place\n";
+// genome::breakHere();
+ }else{
+ if( direction == 1 )
+ left_lookups.set(match_end);
+ if( direction == -1 )
+ right_lookups.set(match_end);
+ }
+
+ // d is the distance from match_end currently being probed
+ int d = 1;
+ int w_end = parity == 1 ? w : w + seed_size;
+ // are we cleaning up a gapped extension? if so, adjust d and w_end so
+ // we don't search anything twice and also cover all of the extension area
+ if(M_e != NULL)
+ {
+ int64 me_match_end = parity == 1 ? M_e->LeftEnd(x) : M_e->RightEnd(x)-(M_e->Length(x)-1);
+ d = w+1; // need to start at the begining of the window to properly
+ // classify all matches subsumed by extension and all novel
+ // matches which may have been discovered
+ w_end = w + me_match_end - match_end; // search anything new included in M_e
+ }
+ for( ; d <= w_end; ++d )
+ {
+ if( match_end <= parity * d )
+ continue; // we're too close to the beginning
+ size_t mplt_index = match_end - parity * d;
+ if( mplt_index >= match_pos_lookup_table.size() )
+ continue; // we're too close to the end!
+
+ MatchRecord* M_j = match_pos_lookup_table[ mplt_index ].first;
+ size_t y = match_pos_lookup_table[ mplt_index ].second;
+ if( M_j == NULL )
+ continue; // no match at this position
+
+ // NOTE: nle.match and nle.Mj_component are assigned before the call to
+ // checkLinkAndComponent below, which updates only the locals M_j and y
+ NeighborhoodListEntry nle;
+ nle.match = M_j;
+ nle.Mi_component = x;
+ nle.Mj_component = y;
+ // update the link if this one was subsumed
+ checkLinkAndComponent( M_j, y );
+ int o_y = ((AbstractMatch*)M_j)->Orientation(y) == AbstractMatch::forward ? 1 : -1;
+ nle.relative_orientation = o_x * o_y == 1 ? true : false;
+ nle.distance = d;
+ neighborhood_list.push_back( nle );
+
+ if( M_j == M_i )
+ {
+ M_i->tandem = true;
+ break; // not so fast there cowboy! can't chain beyond ourself!
+ }
+ }
+ }
+
+ //
+ // now classify each group of the neighborhood list and act appropriately
+ // group types are superset, chainable, subset, novel subset
+ //
+ NeighborhoodListComparator nlc;
+ std::sort( neighborhood_list.begin(), neighborhood_list.end(), nlc );
+
+ //std::reverse(neighborhood_list.begin(), neighborhood_list.end());
+
+ std::vector< std::vector< size_t > >& superset_groups = nllbufs.superset_groups;
+ std::vector< std::vector< size_t > >& chainable_groups = nllbufs.chainable_groups;
+ std::vector< std::vector< size_t > >& subset_groups = nllbufs.subset_groups;
+ std::vector< std::vector< size_t > >& novel_subset_groups = nllbufs.novel_subset_groups;
+
+ // walk the sorted list one group at a time; a group is a maximal run of
+ // entries with the same match and the same relative orientation
+ size_t group_end = 0;
+ for( size_t prev = 0; prev < neighborhood_list.size(); prev = group_end )
+ {
+ group_end = prev + 1;
+ while( group_end < neighborhood_list.size() &&
+ neighborhood_list[prev].match == neighborhood_list[group_end].match &&
+ neighborhood_list[prev].relative_orientation == neighborhood_list[group_end].relative_orientation )
+ {
+ ++group_end;
+ }
+ // the group is everything in the range of prev to end-1
+ if( prev + 1 == group_end )
+ continue; // can't do anything with groups of size 1 -- there's no match
+
+ // do something about ties here...???
+ // this code selects the *furthest* away match (e.g. that with the largest d)
+ // because that's what got sorted in last in the comparator
+ // it eliminates both duplicate M_i and duplicate M_j components...
+ // FIXME: is this true? is it safe?
+ // first pass: keep the last entry of each run of equal Mi_component,
+ // stored as (Mj_component, index into neighborhood_list)
+ vector< pair< size_t, size_t > >& j_comp_sort_list = nllbufs.j_comp_sort_list;
+ j_comp_sort_list.resize(0);
+ for( size_t i = prev + 1; i < group_end; ++i )
+ {
+ //selects the *furthest* away match
+ if( neighborhood_list[i-1].Mi_component == neighborhood_list[i].Mi_component )
+ continue;
+ j_comp_sort_list.push_back(make_pair(neighborhood_list[i-1].Mj_component, i-1));
+ }
+ j_comp_sort_list.push_back(make_pair(neighborhood_list[group_end-1].Mj_component, group_end-1));
+ // second pass: sort by Mj_component and keep the last entry of each run
+ std::sort(j_comp_sort_list.begin(), j_comp_sort_list.end());
+ vector<size_t>& group_entries = nllbufs.group_entries;
+ group_entries.resize(0);
+ for( size_t i = 1; i < j_comp_sort_list.size(); ++i )
+ {
+ //selects the *furthest* away match
+ if( j_comp_sort_list[i-1].first == j_comp_sort_list[i].first )
+ continue;
+ group_entries.push_back(j_comp_sort_list[i-1].second);
+ }
+ group_entries.push_back(j_comp_sort_list.back().second);
+
+ // update the links in case something is subsumed
+ for( size_t gI = 0; gI < group_entries.size(); ++gI )
+ checkLinkAndComponent( neighborhood_list[group_entries[gI]].match, neighborhood_list[group_entries[gI]].Mj_component );
+
+ // finally, classify the match as one of superset, subset,
+ // chainable, novel subset
+ // NOTE(review): M_j is read from the entry at `prev`, which is only updated by
+ // the loop above if `prev` is one of group_entries -- confirm this is intended
+ MatchRecord* M_j = neighborhood_list[prev].match;
+
+ if( group_entries.size() == M_i->Multiplicity() &&
+ M_j->Multiplicity() > M_i->Multiplicity() )
+ {
+ // superset
+ superset_groups.push_back( group_entries );
+ }else
+ if( group_entries.size() == M_i->Multiplicity() &&
+ M_j->Multiplicity() == M_i->Multiplicity() )
+ {
+
+ // chainable
+ chainable_groups.push_back( group_entries );
+ }else
+ if( group_entries.size() < M_i->Multiplicity() &&
+ group_entries.size() == M_j->Multiplicity() )
+ {
+ // subset
+ subset_groups.push_back( group_entries );
+ }else
+ {
+ // novel subset
+ novel_subset_groups.push_back( group_entries );
+ }
+
+ } // end loop that splits the neighborhood into groups
+
+ // convert the index groups into sorted NeighborhoodGroup lists for the caller
+ createNeighborhoodGroupList( superset_list, superset_groups, neighborhood_list );
+ createNeighborhoodGroupList( chainable_list, chainable_groups, neighborhood_list );
+ createNeighborhoodGroupList( subset_list, subset_groups, neighborhood_list );
+ createNeighborhoodGroupList( novel_subset_list, novel_subset_groups, neighborhood_list );
+}
+
+/**
+ * Chains matches onto M_i or subsumes them as appropriate
+ * @param M_i The match under extension
+ * @param chainable_list Groups classified as chainable with M_i, processed in list order
+ * @param direction 1 for left, -1 for right
+ * @param last_linked (output) Set to 2 when chaining changed M_i's range or a superset was inherited
+ * @param find_novel_subsets When false, already-extended matches are skipped if M_i is a novel subset
+ * @param chain When false, matches are subsumed by M_i without being chained
+ */
+void processChainableMatches( GappedMatchRecord*& M_i, vector< NeighborhoodGroup >& chainable_list,
+ int direction, int& last_linked, bool find_novel_subsets, bool chain )
+{
+ // link the closest possible chainable first.
+ for( size_t gI = 0; gI < chainable_list.size(); gI++ )
+ {
+ MatchRecord* M_j = chainable_list[gI].get<0>();
+
+ // maps components of M_j to components of M_i
+ vector< size_t >& component_map = chainable_list[gI].get<1>();
+
+ if( M_j == M_i )
+ {
+ // this is an inverted overlapping repeat, skip it.
+ continue;
+ }
+ if( M_j->extended )
+ {
+ if ( !find_novel_subsets && (M_i->is_novel_subset ))
+ {
+ //novel subsets have been disabled!! this is why it wasn't swallowed up!
+ continue;
+ }
+ else
+ {
+ // NOTE: this branch is intentionally empty; processing falls through to the code below
+ // oh no! M_i should have been swallowed up already!
+ //tjt: claro, work has been wasted, but bypassing the breakHere() will allow
+ //the assumed-to-be subsumed M_i to be detected and updated accordingly
+ //but the question remains, why wasn't M_i previously subsumed?
+ //1) what if before gapped extension M_j was not in M_i's neighborhood?
+ // but after gapped extension, M_i is found in M_j's neighborhood and classified as chainable?
+ //cerr << "extensor crap 2\n";
+ //breakHere();
+ }
+ }
+
+ bool subsumed;
+ bool partial;
+ classifySubset( M_i, chainable_list[gI], subsumed, partial );
+
+ // invert the M_j -> M_i component map to get an M_i -> M_j map
+ vector< size_t >& yx_map = chainable_list[gI].get<1>();
+ vector< size_t > xy_map(yx_map.size());
+ for( size_t i = 0; i < yx_map.size(); ++i )
+ xy_map[ yx_map[i] ] = i;
+// for( vector< size_t >::iterator rec_iter = chainable_groups[gI].begin(); rec_iter != chainable_groups[gI].end(); ++rec_iter )
+// xy_map[ neighborhood_list[*rec_iter].Mi_component ] = neighborhood_list[*rec_iter].Mj_component;
+
+ // if M_j isn't extending the boundaries of every component of M_i then
+ // it may be inconsistent with already chained matches. just subsume it without
+ // chaining in that case.
+ if( !subsumed && !partial && chain)
+ {
+ bool ok = testChainableMatch(M_i, M_j, xy_map);
+ if (ok)
+ {
+ M_i->chained_matches.push_back( M_j );
+ M_i->chained_component_maps.push_back( component_map );
+ bool changed = extendRange(M_i, M_j, xy_map);
+ if( changed )
+ {
+ // update the left-end and right-end coords
+ last_linked = 2;
+ }
+ }
+ else
+ // a failed test ends processing; M_j and all later groups are left untouched
+ break;
+ }
+ // mark M_j as subsumed by M_i
+ M_j->subsuming_match = M_i;
+ M_j->subsumption_component_map = component_map;
+ int parity = M_i->Orientation(0) == M_j->Orientation(xy_map[0]) ? 1 : -1;
+ if( getSuperset( M_j, -direction*parity ).superset != NULL )
+ unlinkSuperset(M_j,-direction*parity); // won't be needing this anymore...
+
+ // if M_j has a superset then inherit it and stop chaining here
+ if( getSuperset( M_j, direction*parity ).superset != NULL )
+ {
+ inheritSuperset( M_i, M_j, direction, parity );
+ last_linked = 2; // we may do a link extension!
+ break;
+ }
+ }
+}
+//processes supersets
+// For each superset group, classifies M_i against the superset, optionally adjusts
+// M_i's range, and links M_i to the superset.
+// @param M_i The match under extension
+// @param superset_list Groups classified as supersets of M_i, processed in list order
+// @param direction 1 for left, -1 for right
+// @param last_linked (output) Set to 1 for every group that is not skipped
+// @param gapped_extension When true, an already-extended superset is not reported as an error
+void processSupersetMatches( GappedMatchRecord*& M_i, vector< NeighborhoodGroup >& superset_list,
+ int direction, int& last_linked, bool gapped_extension = false )
+{
+
+ // link the closest possible superset first.
+ for( size_t gI = 0; gI < superset_list.size(); gI++ )
+ {
+ MatchRecord* M_j = superset_list[gI].get<0>();
+
+ // NOTE(review): the component map comes from M_i's first chained component map,
+ // not from superset_list[gI] -- confirm this is intended
+ vector< size_t >& component_map = M_i->chained_component_maps.at(0);
+ // mark the components of M_j that M_i maps onto
+ boost::dynamic_bitset<> comp_list(M_j->Multiplicity(), false);
+ for( size_t compI = 0; compI < M_i->Multiplicity(); ++compI )
+ comp_list.set(component_map[compI]);
+ if( M_j == M_i )
+ {
+ // this is an inverted overlapping repeat, skip it.
+ continue;
+ }
+ //tjt: shouldn't the superset always be extended when we reach this point during gapped extension?
+ if( M_j->extended && !gapped_extension )
+ {
+ // oh no! M_i should have been swallowed up already!
+ cerr << "extensor crap 2\n";
+ breakHere();
+ }
+
+ bool subsumed;
+ bool partial;
+ //update classifysubset to ClassifySuperset
+ classifySuperset( M_i, superset_list[gI], subsumed, partial );
+
+ // NOTE: classifyMatch sets partial whenever any component pair passes, so
+ // subsumed && !partial can only hold when the group's component map is empty
+ if( subsumed && !partial )
+ {
+ // update the left-end and right-end coords
+ // NOTE: the result stored in `changed` is not used
+ bool changed = reduceRange(M_i, M_j, component_map);
+ }
+ // NOTE(review): this if has no braces, so it governs the linkSuperset call below
+ // (the comments and blank line in between do not end the statement). The link is
+ // therefore only made when partial is true -- confirm whether that is intended.
+ if( partial )
+ //some of the components of the superset matches are subsumed
+ //punt: what should I do differently here?
+
+ linkSuperset( M_i, M_j, comp_list, component_map, direction);
+ last_linked = 1;// stores the group type that was chained. 1 == superset, 2 == chainable, 0 == none
+
+ }
+}
+
+
+/**
+ * Performs a gapped extension on a match. The region either left or right of the match is processed by
+ * progressive alignment.
+ * @param M_i The match to extend
+ * @param seq_table gnSequences which correspond to each match component
+ * @param hmm_params The Homology HMM parameters to use
+ * @param w The max gap for chaining. Used to compute extension lengths.
+ * @param direction The direction of extension
+ * @param M_e (output) A MatchRecord containing just the extension, or NULL if extension failed
+ * @return FAILED, OK, or FIXME
+ */
+int ExtendMatch(GappedMatchRecord*& M_i, vector< gnSequence* >& seq_table, Params& hmm_params, unsigned w, int direction, vector<GappedMatchRecord*>& novel_matches, int gap_open, int gap_extend, int extension_window)
+{
+ ccount +=1;
+ static bool debug_extension = false;
+// punt on this for now..
+ bool novel_hss_regions_support = false;
+ bool danger_zone_active = true;
+ int multi = M_i->Multiplicity();
+ double e = 2.71828182845904523536;
+// I think this works a little better...
+
+ int extend_length = 80*pow(e,-0.01*multi);
+ //use user specified window if requested
+ if (extension_window >= 0 )
+ extend_length = extension_window;
+ vector<int> left_extend_vector(multi,0);
+ vector<int> right_extend_vector(multi,0);
+ int left_extend_length = extend_length;
+ int right_extend_length = extend_length;
+
+ if ( M_i->tandem )
+ {
+ if ( debug_extension)
+ cerr << "Sorry, no extension for tandem repeats.." << endl << endl;
+ return FIXME;
+ }
+
+// careful, if M_i->LeftEnd(j) < extend_length, ToString() will be disappointed...
+ for( gnSeqI j = 0; j < multi; j++)
+ {
+// now put check for curpos+extend_length<startpos of next match component..
+ if( M_i->Orientation(j) == AbstractMatch::reverse )
+ {
+// if leftend <= 0 set right extension to 0
+ if( M_i->LeftEnd(j) <= 0 || M_i->LeftEnd(j) > 4000000000u )
+ right_extend_vector[j] = 0;
+// if extend_length goes too far, set to maximum possible
+ else if ( M_i->LeftEnd(j) <= extend_length )
+ right_extend_vector[j] = M_i->LeftEnd(j)-1;
+// if we run into another match, don't extend into it
+ else if ( j > 0 && M_i->LeftEnd(j) - extend_length <= M_i->RightEnd(j-1) )
+ {
+ int parity = M_i->Orientation(j) == M_i->Orientation(j-1) ? 1 : 1;
+ right_extend_vector[j] = parity*(M_i->LeftEnd(j)-M_i->RightEnd(j-1)-1);
+ }
+// else everything ok to set to preset extend_length
+ else
+ right_extend_vector[j] = extend_length-1;
+
+ if(M_i->RightEnd(j) <= 0 || M_i->RightEnd(j) > 4000000000u)
+ left_extend_vector.push_back(0);
+ else if ( M_i->RightEnd(j) + extend_length > seq_table[0]->length() )
+ left_extend_vector[j] = seq_table[0]->length()-M_i->RightEnd(j);
+ else if ( j+1 < multi && M_i->RightEnd(j) + extend_length >= M_i->LeftEnd(j+1) )
+ {
+ int parity = M_i->Orientation(j) == M_i->Orientation(j+1) ? 1 : 1;
+ left_extend_vector[j] =parity*( M_i->LeftEnd(j+1)-M_i->RightEnd(j)-1);
+ }
+ else
+ left_extend_vector[j] = extend_length-1;
+ }
+ else
+ {
+ if( M_i->LeftEnd(j) <= 0 || M_i->LeftEnd(j) > 4000000000u )
+ left_extend_vector[j] = 0;
+ else if ( M_i->LeftEnd(j) <= extend_length )
+ left_extend_vector[j] = M_i->LeftEnd(j)-1;
+ else if ( j > 0 && M_i->LeftEnd(j) - extend_length <= M_i->RightEnd(j-1) )
+ {
+ int parity = M_i->Orientation(j) == M_i->Orientation(j-1) ? 1 : 1;
+ left_extend_vector[j] = parity*(M_i->LeftEnd(j)-M_i->RightEnd(j-1)-1);
+ }
+ else
+ left_extend_vector[j] = extend_length;
+
+ if(M_i->RightEnd(j) <= 0 || M_i->RightEnd(j) > 4000000000u)
+ right_extend_vector[j] = 0;
+ else if ( M_i->RightEnd(j) + extend_length > seq_table[0]->length() )
+ right_extend_vector[j] = seq_table[0]->length()-M_i->RightEnd(j)-1;
+ else if ( j+1 < multi && M_i->RightEnd(j) + extend_length >= M_i->LeftEnd(j+1) )
+ {
+ int parity = M_i->Orientation(j) == M_i->Orientation(j+1) ? 1 : 1;
+ right_extend_vector[j] = parity*(M_i->LeftEnd(j+1)-M_i->RightEnd(j)-1);
+ }
+ else
+ right_extend_vector[j] = extend_length;
+ }
+ }
+
+ left_extend_length = *(std::min_element( left_extend_vector.begin(), left_extend_vector.end() ));
+ right_extend_length = *(std::min_element( right_extend_vector.begin(), right_extend_vector.end() ));
+ left_extend_length = left_extend_length < 0 ? 0 : left_extend_length;
+ right_extend_length = right_extend_length < 0 ? 0 : right_extend_length;
+ extend_length = direction < 0 ? right_extend_length : left_extend_length;
+ const gnFilter* rc_filter = gnFilter::DNAComplementFilter();
+ std::vector<std::string> leftExtension(multi);
+ GappedAlignment leftside(multi,left_extend_length);
+ std::vector<std::string> rightExtension(multi);
+ GappedAlignment rightside(multi,right_extend_length);
+ vector< string > leftExtension_aln;
+ vector< string > rightExtension_aln;
+ if ( left_extend_length > 0 && direction == 1 )
+ {
+// extract sequence data
+ for( gnSeqI j = 0; j < multi; j++)
+ {
+ if( M_i->Orientation(j) == AbstractMatch::reverse )
+ {
+ seq_table[0]->ToString( leftExtension[j], left_extend_length, M_i->RightEnd(j)+1 );
+ leftside.SetLeftEnd(j,M_i->RightEnd(j)+1);
+ rc_filter->ReverseFilter(leftExtension[j]);
+ }else{
+ seq_table[0]->ToString( leftExtension[j], left_extend_length, M_i->LeftEnd(j) - left_extend_length );
+ leftside.SetLeftEnd(j,M_i->LeftEnd(j) - left_extend_length);
+ }
+ leftside.SetOrientation(j,M_i->Orientation(j));
+ leftside.SetLength(left_extend_length,j);
+ }
+ bool align_success = false;
+ //mems::MuscleInterface::getMuscleInterface().SetMuscleArguments("-stable -quiet -seqtype DNA");
+ align_success = mems::MuscleInterface::getMuscleInterface().CallMuscleFast( leftExtension_aln, leftExtension, gap_open, gap_extend );
+ if ( align_success ){
+ leftside.SetAlignment(leftExtension_aln);
+ leftside.SetAlignmentLength(leftExtension_aln.at(0).size());
+ }else{
+ cerr << "Extension failed: Muscle error" << endl;
+ return FAILED;
+ }
+ }
+ else if ( right_extend_length > 0 && direction == -1 )
+ {
+ for( gnSeqI j = 0; j < multi; j++)
+ {
+ if( M_i->Orientation(j) == AbstractMatch::reverse )
+ {
+ rightside.SetLeftEnd(j,M_i->LeftEnd(j) - right_extend_length-1);
+ seq_table[0]->ToString( rightExtension[j], right_extend_length, M_i->LeftEnd(j) - right_extend_length-1);
+ rc_filter->ReverseFilter(rightExtension[j]);
+ }else{
+ rightside.SetLeftEnd(j,M_i->RightEnd(j)+1 );
+ seq_table[0]->ToString( rightExtension[j], right_extend_length, M_i->RightEnd(j)+1 );
+ }
+ rightside.SetOrientation(j,M_i->Orientation(j));
+ rightside.SetLength(right_extend_length,j);
+ }
+ bool align_success = false;
+ align_success = mems::MuscleInterface::getMuscleInterface().CallMuscleFast( rightExtension_aln, rightExtension, gap_open, gap_extend );
+ if ( align_success ){
+ rightside.SetAlignment(rightExtension_aln);
+ rightside.SetAlignmentLength(rightExtension_aln.at(0).size());
+ }else{
+ cerr << "Extension failed: Muscle error" << endl;
+ return FAILED;
+ }
+ }else{
+ //what are you even doing here?!?
+ if(debug_extension)
+ {
+ cerr << "Extension failed: No room to extend" << endl;
+ }
+ return FAILED;
+ }
+
+// tjt: don't use original match, only regions to the left/right
+// since for now we won't modify M_i, even if the homology detection method suggests otherwise
+ vector< AbstractMatch* > mlist;
+ if( direction == 1 )
+ mlist.push_back(leftside.Copy());
+ if( direction == -1 )
+ mlist.push_back(rightside.Copy());
+
+// createIntervald
+ Interval iv;
+ iv.SetMatches(mlist);
+ CompactGappedAlignment<> tmp_cga;
+ CompactGappedAlignment<>* cga = tmp_cga.Copy();
+ new (cga)CompactGappedAlignment<>(iv);
+ vector< CompactGappedAlignment<>* > cga_list;
+ CompactGappedAlignment<>* result;
+// detectAndApplyBackbone
+ backbone_list_t bb_list;
+
+ detectAndApplyBackbone( cga, seq_table,result,bb_list, hmm_params, direction != 1, direction == 1 );
+ cga->Free();
+
+ bool boundaries_improved = false;
+ if( bb_list.size() == 0 || bb_list.at(0).size() == 0)
+ {
+// no backbone segment found
+ if(debug_extension)
+ cerr << "Extension failed: no backbones found during extension" << endl;
+ result->Free();
+ return FAILED;
+ }
+ AbstractMatch* extension_bb;
+// tjt: was > before, wasn't taking right backbone???
+// remember, direction == -1 rightward, == 1 leftward
+ bool isnovel = false;
+ for( size_t bbI = 0; bb_list.size() > 0 && bbI < bb_list[0].size(); bbI++ )
+ {
+ extension_bb = bb_list.at(0).at(bbI);
+ if (extension_bb == NULL)
+ continue;
+
+ cga_list.push_back( tmp_cga.Copy() );
+ result->copyRange( *(cga_list.back()), extension_bb->LeftEnd(0), extension_bb->AlignmentLength()-1 );
+ int cgalen = cga_list.back()->AlignmentLength();
+ int resultlen = result->AlignmentLength();
+
+ //why set this to > 5? what about seed weight? or to > 0? seems strange to allow novel matches of length 1...
+ if( cga_list.back()->Multiplicity() > 1 && cga_list.back()->Length() > 0 && cga_list.back()->AlignmentLength() > 0 )
+ {
+// successful extension!!
+// boundaries were improved, current match extended original match
+// create a GappedMatchRecord for the gapped extension
+ vector< AbstractMatch* > matches( 1, cga_list.back());
+// GappedMatchRecord* M_e = M_i->Copy();
+ UngappedMatchRecord tmp( cga_list.back()->Multiplicity(), cga_list.back()->AlignmentLength() );
+ MatchRecord* umr = tmp.Copy();
+ GappedMatchRecord* M_e = dynamic_cast<GappedMatchRecord*>(umr);
+
+ if( M_e == NULL )
+ {
+// create a new gapped match record for M_i
+ GappedMatchRecord gmr( *(UngappedMatchRecord*)umr );
+ M_e = gmr.Copy();
+ umr->subsuming_match = M_e;
+// umr->subsuming_match = M_i;
+ M_e->chained_matches.push_back( umr );
+ vector< size_t > component_map( M_e->SeqCount() );
+ for( size_t i = 0; i < component_map.size(); ++i )
+ component_map[i] = i;
+ M_e->chained_component_maps.push_back(component_map);
+ swap(umr->subsumption_component_map, component_map); // swap avoids reallocation
+// update superset and subset links
+ for( int dI = 1; dI > -2; dI -= 2 )
+ {
+ MatchLink& ij_link = getSuperset(M_e,dI);
+ if( ij_link.superset != NULL )
+ {
+ ij_link.subset = M_e;
+ unlinkSuperset(umr,dI);
+ int parity = M_e->Orientation(0) == ij_link.superset->Orientation(ij_link.sub_to_super_map[0]) ? 1 : -1;
+ getSubsets(ij_link.superset,-dI*parity).push_back(ij_link);
+ }
+ vector< MatchLink >& subsets = getSubsets(M_e,dI);
+ for( size_t subI = 0; subI < subsets.size(); ++subI )
+ {
+ subsets[subI].superset = M_e;
+ int parity = M_i->Orientation(subsets[subI].sub_to_super_map[0]) == subsets[subI].subset->Orientation(0) ? 1 : -1;
+ getSuperset(subsets[subI].subset, -dI*parity).superset = M_e;
+ }
+ getSubsets(umr,dI).clear(); // so that validate works...
+ }
+ // tjt: clobber M_e's GappedMatchRecord data and set boundaries
+ //tjt: we call the Temp version since we don't actually want to do anything with the regions between the matches
+ M_e->SetMatchesTemp(matches);//,cga_list.back()->Multiplicity() );
+ }
+ novel_matches.push_back(M_e->Copy());
+ }
+ }
+ result->Free();
+ if (novel_matches.size() > 0)
+ return OK;
+ else
+ return FAILED;
+}//tjt: match should be extended!
+
+/**
+ * A class to prioritize match records for extension based on their multiplicity.
+ *
+ * The records are kept as a binary max-heap (ordered by Multiplicity()) in the
+ * first end() slots of an internal vector.  Slots beyond that point hold stale
+ * pointers left behind by earlier pops and are reused by push().
+ */
+class ProcrastinationQueue
+{
+public:
+    /**
+     * Builds the queue from an existing list of match record pointers.
+     * @param match_record_list pointers convertible to MatchRecord*; the list is only read
+     */
+    template< typename MrPtrType >
+    ProcrastinationQueue( vector< MrPtrType >& match_record_list ) :
+    mhc(),
+    q( match_record_list.begin(), match_record_list.end() ),
+    q_end( 0 ),
+    q_size( 0 )
+    {
+        std::make_heap( q.begin(), q.end(), mhc );
+        q_end = q.size();
+        q_size = q.size();
+    }
+
+    /**
+     * Pops the highest-multiplicity element from the queue, maintaining heap order.
+     * @return the popped record, or NULL when the queue is empty
+     */
+    MatchRecord* pop()
+    {
+        // std::pop_heap requires a non-empty range, and decrementing q_end
+        // below zero would wrap around, so handle the empty queue up front
+        if( q_end == 0 )
+            return NULL;
+        std::pop_heap( q.begin(), q.begin()+q_end, mhc );
+        --q_end;
+        return q[q_end];
+    }
+
+    /** Adds an element to the queue and restores heap order */
+    void push( MatchRecord* M_n )
+    {
+        // reuse a slot vacated by an earlier pop when one is available
+        if( q_end < q.size() )
+            q[q_end] = M_n;
+        else
+            q.push_back(M_n);
+        ++q_size;
+        ++q_end;
+        std::push_heap( q.begin(), q.begin()+q_end, mhc );
+    }
+
+    /** gets the total number of elements that have been placed in the queue */
+    size_t size() const{ return q_size; }
+    /** returns the current number of elements in the queue */
+    size_t end() const{ return q_end; }
+
+
+    /** defines a multiplicity heap ordering */
+    class MultiplicityHeapCompare
+    {
+    public:
+        /** const so that the comparator can be invoked through the const member below */
+        bool operator()( const MatchRecord* a, const MatchRecord* b ) const
+        {
+            return a->Multiplicity() < b->Multiplicity();
+        }
+    };
+
+private:
+    const MultiplicityHeapCompare mhc;  /**< heap ordering passed to the std heap algorithms */
+    std::vector< MatchRecord* > q;      /**< heap storage; only [0, q_end) is live */
+    size_t q_end;                       /**< number of elements currently queued */
+    size_t q_size;                      /**< number of elements ever queued (never decremented) */
+};
+
+/**
+ * Creates novel subset matches where appropriate.
+ *
+ * For each NeighborhoodGroup in novel_subset_list a new GappedMatchRecord (M_n)
+ * is built from the components of the group's match (M_j, element 0 of the group)
+ * whose entry in the group's component map (element 1) is not size_t's max value.
+ * M_n is discarded when classifyMatch() reports it as subsumed or partial;
+ * otherwise it is finalized, flagged as a novel subset, linked to M_i and M_j as
+ * its supersets, appended to novel_subset_list and counted in novel_subset_count.
+ *
+ * Note that the push onto the procrastination queue is currently commented out,
+ * so procrastination_queue is not touched by this function.
+ *
+ * @param M_i The match currently being extended
+ * @param novel_subset_list Candidate groups to process; new matches are appended to it
+ * @param find_novel_subsets When false every group is skipped and nothing is created
+ * @param procrastination_queue Currently unused (see note above)
+ * @param seq_table Sequences handed to GappedMatchRecord::finalize()
+ * @param direction Extension direction, used to choose which superset/subset links are set
+ * @param w Currently unused
+ * @param last_linked Groups are only processed while this is 0
+ * @param novel_subset_count Incremented once for every novel subset created
+ */
+void processNovelSubsetMatches( GappedMatchRecord*& M_i, vector< NeighborhoodGroup >& novel_subset_list,
+ bool find_novel_subsets, ProcrastinationQueue& procrastination_queue,
+ vector< gnSequence* >& seq_table, int direction, uint w, int& last_linked,
+ size_t& novel_subset_count )
+{
+ // finally process novel subset
+ bool prev_linked = false; // we only want to link the closest of a group with the same components
+ int created_thisround = 0; // accumulated below but never read inside this function
+ static NeighborhoodGroupComponentCompare srcc;
+ for( size_t gI = 0; gI < novel_subset_list.size(); gI++ )
+ {
+ if( !find_novel_subsets )
+ continue; // only find novel subsets if we're supposed to
+ if( last_linked != 0 )
+ continue; // only generate subsets when last_linked == 0
+
+ // be sure to handle case where:
+ // --M_i--> --M_j-- <--M_i--
+ // that may cause an identical novel subset to get spawned but with
+ // M_i and M_j swapped as left and right supersets
+
+ bool same_components = false;
+ if( gI > 0 )
+ same_components = srcc.compare(novel_subset_list[gI], novel_subset_list[gI-1]) == 0;
+ prev_linked = same_components? prev_linked : false;
+
+ if( prev_linked )
+ continue; // don't link a subset with the same components...
+
+ // TODO: handle the tandem repeat case
+ if( M_i->tandem )
+ {
+ // step 1. count tandem repeat components
+ // step 2. create a new GappedMatchRecord with one component per tandem component
+ // add a GappedMatchRecord with the outer component boundaries
+ // do ordinary extension
+ // when finalize() gets called, something special needs to happen
+ continue;
+ }
+
+ MatchRecord* M_j = novel_subset_list[gI].get<0>();
+ // if M_j hasn't been extended then we don't do anything yet.
+ // we may find this novel subset again when M_j gets extended
+ if( M_j->extended == false)
+ continue;
+
+ size_t mult = 0; // multiplicity of novel subset
+ for( size_t i = 0; i < novel_subset_list[gI].get<1>().size(); ++i )
+ if( novel_subset_list[gI].get<1>()[i] != (std::numeric_limits<size_t>::max)() ) // max() marks a component with no counterpart
+ mult++;
+
+ if( mult < 2 )
+ continue; // can't do anything if there's no match!
+
+ UngappedMatchRecord tmper1(mult,0);
+ GappedMatchRecord tmper2(tmper1); // this is lame
+ GappedMatchRecord* M_n = tmper2.Copy();
+
+ size_t mnewi = 0; // next component index of M_n to fill in
+ vector< size_t > new_to_i_map(mult); // component of M_n -> component of M_i
+ vector< size_t > new_to_j_map(mult); // component of M_n -> component of M_j
+ boost::dynamic_bitset<> ni_list(M_i->Multiplicity()); // components of M_i that M_n covers
+ boost::dynamic_bitset<> nj_list(M_j->Multiplicity()); // components of M_j that M_n covers
+ for( size_t i = 0; i < novel_subset_list[gI].get<1>().size(); ++i )
+ {
+ if( novel_subset_list[gI].get<1>()[i] != (std::numeric_limits<size_t>::max)() )
+ {
+ new_to_i_map[mnewi] = novel_subset_list[gI].get<1>()[i];
+ new_to_j_map[mnewi] = i;
+ ni_list.set(new_to_i_map[mnewi]);
+ nj_list.set(i);
+ M_n->SetStart(mnewi, M_j->Start(i)); // sets left-end and orientation
+ M_n->SetLength(M_j->Length(i),mnewi);
+ mnewi++;
+ }
+ }
+ if( M_n->Orientation(0) == AbstractMatch::reverse )
+ M_n->Invert(); // keep component 0 of M_n out of the reverse orientation
+ // before we go any further, make sure that the relevant portion of M_i is not
+ // either completely or partially subsumed by the relevant portion of M_j!
+ MatchProjectionAdapter mpaa( M_i, new_to_i_map );
+ vector< size_t > mpaa_to_Mn_map( new_to_i_map.size() );
+ for( size_t i = 0; i < mpaa_to_Mn_map.size(); ++i )
+ mpaa_to_Mn_map[i] = i; // identity map
+ bool subsumed;
+ bool partial;
+ classifyMatch( M_n, &mpaa, mpaa_to_Mn_map, subsumed, partial );
+ if( subsumed )
+ {
+ M_n->Free();
+ continue; // there's nothing novel about this subset...
+ }
+ if( partial )
+ {
+ // FIXME: we should really spawn a novel subset on the non-subsumed components
+ M_n->Free();
+ continue;
+ }
+ created_thisround+= M_n->Multiplicity();
+
+ M_n->chained_matches.push_back(M_j); // M_j is chained only for the duration of finalize(); cleared again below
+ M_n->chained_component_maps.push_back(new_to_j_map);
+ //tjt: need to send finalize seq_table for muscle alignment
+ M_n->finalize(seq_table); // make this one a legitimate match...
+ M_n->chained_matches.clear();
+ M_n->chained_component_maps.clear();
+
+ M_n->is_novel_subset = true;//yep, this is a novel subset
+ // create links from M_n to M_i and M_j
+ int ni_parity = M_n->Orientation(0) == M_i->Orientation(new_to_i_map[0]) ? 1 : -1;
+ int nj_parity = M_n->Orientation(0) == M_j->Orientation(new_to_j_map[0]) ? 1 : -1;
+ MatchLink& ni_link = getSuperset(M_n,-direction*ni_parity);
+ ni_link = MatchLink(M_i,M_n,ni_list,new_to_i_map);
+ getSubsets(M_i,direction).push_back(ni_link);
+ //getExtraSubsets(M_i,direction).push_back(ni_link);
+ MatchLink& nj_link = getSuperset(M_n,direction*ni_parity); // opposite side of M_n from the M_i link
+ nj_link = MatchLink(M_j,M_n,nj_list,new_to_j_map);
+ getSubsets(M_j,-direction*ni_parity*nj_parity).push_back(nj_link);
+ //getExtraSubsets(M_j,-direction*ni_parity*nj_parity).push_back(nj_link);
+ // push M_n onto the heap
+ novel_subset_list.push_back(M_n); // NOTE(review): appends to the vector this loop iterates over, so the new entry is visited by a later iteration; also relies on a GappedMatchRecord* -> NeighborhoodGroup conversion - confirm intended
+ //procrastination_queue.push(M_n);
+ novel_subset_count++;
+ }
+
+
+}
+
+
+/**
+ * Writes a set of MatchRecords in eXtended Multi-FastA format
+ * @param seedml A matchlist containing the seq_table of interest
+ * @param extended_matches A set of matches to write out
+ * @param xmfa_file The filename to use for output.  Nothing is written when
+ *                  this is empty or "-"
+ */
+void writeXmfa( MatchList& seedml, std::vector< GappedMatchRecord* >& extended_matches, const std::string& xmfa_file )
+{
+    // An empty name or "-" produces no output.  This matches the previous
+    // behaviour, where the stdout branch for "-" could never be reached.
+    // Returning early also avoids copying every match for nothing.
+    if( xmfa_file.empty() || xmfa_file == "-" )
+        return;
+
+    ofstream xmfa_out(xmfa_file.c_str());
+    if( !xmfa_out.is_open() )
+    {
+        // writing to an unopened stream would silently drop the alignment
+        cerr << "Error opening XMFA output file: " << xmfa_file << endl;
+        return;
+    }
+
+    // the writer works on a list of match objects, so copy them out of the pointer vector
+    GenericIntervalList<GappedMatchRecord> gmr_list;
+    for( size_t gmrI = 0; gmrI < extended_matches.size(); ++gmrI )
+        gmr_list.push_back(*extended_matches[gmrI]);
+
+    // only the first sequence of the seed match list is attached to the output list
+    gmr_list.seq_filename.push_back( seedml.seq_filename[0] );
+    gmr_list.seq_table.push_back( seedml.seq_table[0] );
+
+    gmr_list.WriteStandardAlignment(xmfa_out);
+    xmfa_out.close();
+}
+
+/**
+ * Writes a set of MatchRecords in XML format
+ * @param seedml A matchlist containing the seq_table of interest
+ * @param extended_matches A set of matches to write out
+ * @param xml_file The filename to use for output.  Nothing is written when
+ *                 this is empty or "-"
+ */
+void writeXML( MatchList& seedml, std::vector< GappedMatchRecord* >& extended_matches, const std::string& xml_file )
+{
+    // An empty name or "-" produces no output.  This matches the previous
+    // behaviour, where the stdout branch for "-" could never be reached.
+    // Returning early also avoids copying every match for nothing.
+    if( xml_file.empty() || xml_file == "-" )
+        return;
+
+    ofstream xml_out(xml_file.c_str());
+    if( !xml_out.is_open() )
+    {
+        // writing to an unopened stream would silently drop the alignment
+        cerr << "Error opening XML output file: " << xml_file << endl;
+        return;
+    }
+
+    // the writer works on a list of match objects, so copy them out of the pointer vector
+    GenericIntervalList<GappedMatchRecord> gmr_list;
+    for( size_t gmrI = 0; gmrI < extended_matches.size(); ++gmrI )
+        gmr_list.push_back(*extended_matches[gmrI]);
+
+    // only the first sequence of the seed match list is attached to the output list
+    gmr_list.seq_filename.push_back( seedml.seq_filename[0] );
+    gmr_list.seq_table.push_back( seedml.seq_table[0] );
+
+    gmr_list.WriteXMLAlignment(xml_out);
+    xml_out.close();
+}
+
+/** Function object that upper-cases a single character, e.g. for use with std::transform */
+class ToUPPER
+{
+public:
+    /**
+     * @param a the character to convert
+     * @return the upper-case form of a as reported by toupper()
+     */
+    char operator()( char a ) const
+    {
+        // toupper() is only defined for EOF and values representable as unsigned
+        // char, so a (possibly negative) plain char must be converted first
+        return static_cast<char>( toupper( static_cast<unsigned char>(a) ) );
+    }
+};
+int main( int argc, char* argv[] )
+{
+// debug_interval = true;
+ // Declare the supported options.
+ bool debug_extension = false;
+
+ string sequence_file = "";
+ int extension_window = 0;
+ int w = 0;
+ int kmersize =0;
+ int gap_open = 0;
+ int gap_extend = 0;
+ uint seed_weight = 0;
+ uint min_repeat_length = 0;
+ score_t min_spscore = 0;
+ uint rmin = 0;
+ uint rmax = 0;
+ string outputfile = "";
+ string output2file = "";
+ string xmfa_file = "";
+ string xml_file = "";
+ string stat_file = "";
+ string seed_file = "";
+ bool only_direct = false;
+ bool load_sml = false;
+ bool small_repeats = false;
+ bool large_repeats = false;
+ bool allow_tandem = false;
+ bool allow_redundant = false;
+ bool find_novel_subsets = false;
+ bool use_novel_matches = true; //should procrast use novel matches found during gapped extension ?
+ bool solid_seed = false;
+ bool extend_chains = true;
+ bool chain = true;
+ bool two_hits = false;
+ bool unalign = true;
+ float percent_id = 0.0;
+ float pGoHomo = 0.004f;
+ float pGoUnrelated = 0.004f;
+ bool only_extended = false;
+
+ po::variables_map vm;
+ try {
+
+ po::options_description desc("Allowed options");
+ desc.add_options()
+ ("allow-redundant", po::value <bool>(&allow_redundant)->default_value(true), "allow redundant alignments?")
+ ("chain", po::value<bool>(&chain)->default_value(true), "chain seeds?")
+ ("extend", po::value<bool>(&extend_chains)->default_value(true), "perform gapped extension on chains?")
+ ("window", po::value<int>(&extension_window)->default_value(-1), "size of window to use during gapped extension")
+ ("gapopen",po::value <int>(&gap_open)->default_value(0), "gap open penalty")
+ ("gapextend",po::value <int>(&gap_extend)->default_value(0), "gap extension penalty")
+ ("h", po::value<float>(&pGoHomo)->default_value(0.008f), "Transition to Homologous")
+ ("help", "get help message")
+ ("highest", po::value<string>(&stat_file)->default_value("procrast.highest"), "file containing highest scoring alignment for each multiplicity ")
+ ("l", po::value <unsigned>(&min_repeat_length)->default_value(1), "minimum repeat length")
+ ("large-repeats", po::value <bool>(&large_repeats)->default_value(false), "optimize for large repeats")
+ ("load-sml", po::value <bool>(&load_sml)->default_value(false), "try to load existing SML file?")
+ ("onlydirect",po::value<bool>(&only_direct)->default_value(false), "only process seed matches on same strand?")
+ ("onlyextended",po::value<bool>(&only_extended)->default_value(false), "only output extended matches?")
+ ("output", po::value<string>(&outputfile)->default_value(""), "procrastAligner output ")
+ ("percentid", po::value<float>(&percent_id)->default_value(0.0), "min repeat family % id")
+ ("novel-subsets", po::value<bool>(&find_novel_subsets)->default_value(false), "find novel subset matches?")
+ ("novel-matches", po::value<bool>(&use_novel_matches)->default_value(true), "use novel matches found during gapped extension?")
+ ("rmax", po::value<unsigned>(&rmax)->default_value(500), "maximum repeat multiplicity (max copy number)")
+ ("rmin" , po::value<unsigned>(&rmin)->default_value(2), "minimum repeat multiplicity (min copy number)")
+ ("seeds", po::value<string>(&seed_file), "seed output file")
+ ("sequence", po::value<string>(&sequence_file), "FastA sequence file")
+ ("small-repeats", po::value <bool>(&small_repeats)->default_value(false), "optimize for small repeats")
+ ("score-out", po::value<string>(&output2file)->default_value(""), "output with corresponding score and alignment info ")
+ ("solid", po::value<bool>(&solid_seed)->default_value(0), "use solid/exact seeds?")
+ ("sp", po::value <score_t>(&min_spscore)->default_value(0), "minimum Sum-of-Pairs alignment score")
+ ("tandem", po::value <bool>(&allow_tandem)->default_value(true), "allow tandem repeats?")
+ ("two-hits", po::value<bool>(&two_hits)->default_value(false), "require two hits within w to trigger gapped extension?")
+ ("u", po::value<float>(&pGoUnrelated)->default_value(0.001f), "Transition to Unrelated")
+ ("unalign", po::value<bool>(&unalign)->default_value(true), "unalign non-homologous sequence?")
+ ("w", po::value<int>(&w)->default_value(0), "max gap width ")
+ ("xmfa", po::value<string>(&xmfa_file)->default_value(""), "XMFA format output")
+ ("xml", po::value<string>(&xml_file)->default_value(""), "XML format output")
+ ("z", po::value <unsigned>(&seed_weight)->default_value(0), "seed weight")
+
+ ;
+
+ if( argc < 2 )
+ {
+ cout << desc << "\n";
+ return 1;
+ }
+
+
+
+ po::store(po::parse_command_line(argc, argv, desc), vm);
+ po::notify(vm);
+
+ if (vm.count("help")) {
+ cout << desc << "\n";
+ return 1;
+ }
+
+ if (large_repeats && small_repeats)
+ {
+ cout << "which is it? small or large? can't optimize for both!\n";
+ return 1;
+ }
+ if (seed_weight < 3) {
+ cout << "Invalid seed weight, minimum size is 3!\n";
+ return 1;
+ }
+ if (vm.count("rmin")) {
+ cout << "setting minimum multiplicity to "
+ << rmin << ".\n";
+ } else {
+ cout << "Using default minimum multiplicity (2).\n";
+ }
+
+ if (vm.count("rmax")) {
+ cout << "setting maximimum multiplicity to "
+ << rmax << ".\n";
+ } else {
+ cout << "Using default maximum multiplicity (500).\n";
+ }
+
+ if (rmin > rmax)
+ {
+ cout << "rmin > rmax, setting rmax == rmin\n";
+ rmax = rmin;
+ }
+ if (rmin < 2)
+ {
+ cout << "rmin < 2, setting rmin == 2\n";
+ rmin = 2;
+ }
+ if (rmax < 2)
+ {
+ cout << "rmax < 2, setting rmax == 2\n";
+ rmax = 2;
+ }
+ if (percent_id > 1 )
+ percent_id = 1.0;
+ if (vm.count("z")) {
+ cout << "seed weight set to "
+ << seed_weight << ".\n";
+ } else {
+ cout << "Using default seed weight.\n";
+ }
+ }
+ catch(exception& e) {
+ cerr << "error: " << e.what() << "\n";
+ return 1;
+ }
+ catch(...) {
+ cerr << "Exception of unknown type!\n";
+ }
+
+
+
+ // Roadmap:
+ // 1. identify seed matches using a Sorted Mer List
+ // 2. create a "UngappedMatchRecord" for each seed match and put in the match record list
+ // 3. create a Match Position Lookup Table
+ // 4. create a multiplicity priority queue
+ // 5. create an (empty) Novel Subset Match Record list
+ // 6. extend all matches!
+ // 7. create a procrastination queue for novel subset matches
+ // 8. extend all novel subset matches!
+ // 9. create a final list of matches
+ // 10. score matches
+ // 11. report matches
+
+
+ //
+ // part 1, load sequence and find seed matches using SML and a repeat class...
+ //
+ MatchList seedml;
+ seedml.seq_filename = vector< string >( 1, sequence_file );
+ seedml.sml_filename = vector< string >( 1, seedml.seq_filename[0] + ".sslist");
+ //seedml.LoadSequences( &cout );
+ LoadSequences( seedml, &cout );
+ if( seed_weight == 0 )
+ seed_weight = (int)((double)getDefaultSeedWeight( seedml.seq_table[0]->length() ) * .9);
+
+ int seed_rank = 0;
+ if ( solid_seed )
+ {
+ seed_rank = INT_MAX;
+ std::cout << "Using solid seed" << std::endl;
+ }
+ seedml.LoadSMLs( seed_weight, &cout, seed_rank, solid_seed, !load_sml );
+ int64 seed = getSeed( seed_weight, seed_rank);
+ uint seed_size = getSeedLength( seed );
+
+ if (min_spscore < 0 )
+ min_spscore = 0;
+
+ if( w == 0 )
+ w = seed_weight * 3; // default value
+ else if( w < 0 )
+ {
+ w = 0;
+ chain = false;
+ }
+
+ cout << "Using seed weight: " << seed_weight << " and w: " << w << endl;
+ SeedMatchEnumerator sme;
+ sme.FindMatches( seedml, rmin, rmax, only_direct );
+
+ // need single nuc & kmer frequency
+ string sequence = seedml.seq_table.at(0)->ToString();
+ string uppercase = sequence;
+ ToUPPER tupperware;
+ std::transform(sequence.begin(),sequence.end(), uppercase.begin(), tupperware);
+ kmersize =1;
+ map<string,gnSeqI> polyfreq;
+ map<string,gnSeqI> monofreq;
+ map<string, gnSeqI>::iterator it;
+ for (gnSeqI i = 0; i <= uppercase.size()-kmersize; i++)
+ {
+ string kmer = uppercase.substr(i,kmersize);
+ string nucleotide = uppercase.substr(i,1);
+ if( nucleotide[0] != 'A' &&
+ nucleotide[0] != 'C' &&
+ nucleotide[0] != 'G' &&
+ nucleotide[0] != 'T' )
+ nucleotide[0] = 'A';
+ for( size_t kI = 0; kI < kmer.size(); kI++ )
+ if( kmer[kI] != 'A' &&
+ kmer[kI] != 'C' &&
+ kmer[kI] != 'G' &&
+ kmer[kI] != 'T' )
+ kmer[kI] = 'A';
+
+ polyfreq[kmer] +=1;
+ monofreq[nucleotide] +=1;
+ //insert( const string& val );
+ //it = find( const string& mer );
+ //it->second+=1;
+ }
+
+ Params hmm_params = getAdaptedHoxdMatrixParameters( double(monofreq["G"]+monofreq["C"])/double(sequence.size()) );
+
+ if (percent_id > 0 )
+ adaptToPercentIdentity(hmm_params, percent_id);
+ hmm_params.iGoHomologous = pGoHomo;
+ hmm_params.iGoUnrelated = pGoUnrelated;
+
+ //
+ // part 2, convert to match records
+ //
+ vector< UngappedMatchRecord* > match_record_list(seedml.size());
+ size_t component_count = 0;
+ bool all_components_overlap = false;
+
+ bool prev_overlaps = false;
+ uint mi_multiplicity = 0;
+ uint mi2_multiplicity = 0;
+ uint num_components = 0;
+ int overlap_size = 1;
+ int hit_match =0;
+
+ cout << "Total number of seed matches found: " << seedml.size() << endl;
+ vector< pair< int64, UngappedMatchRecord* > > seed_sort_list;
+ for( size_t mI = 0; mI < seedml.size(); ++mI )
+ {
+ UngappedMatchRecord tmp( seedml[mI]->SeqCount(), seedml[mI]->AlignmentLength() );
+ match_record_list[mI] = tmp.Copy();
+
+ for( size_t seqI = 0; seqI < seedml[mI]->SeqCount(); seqI++ )
+ {
+ match_record_list[mI]->SetStart( seqI, seedml[mI]->Start( seqI ) );
+ match_record_list[mI]->SetLength( seedml[mI]->Length( seqI ), seqI );
+ }
+ seed_sort_list.push_back(make_pair(match_record_list[mI]->LeftEnd(0), match_record_list[mI]));
+ component_count += seedml[mI]->SeqCount();
+ seedml[mI]->Free();
+ }
+ std::sort( seed_sort_list.begin(), seed_sort_list.end() );
+ // write seeds to file if requested
+
+ ofstream seed_out;
+ if ( seed_file.size() > 0)
+ seed_out.open(seed_file.c_str());
+ //
+ // part 3, create a match position lookup table
+ //
+ vector< pair< gnSeqI, MatchPositionEntry > > mplt_sort_list( component_count );
+ size_t compI = 0;
+ for( size_t mI = 0; mI < seed_sort_list.size(); ++mI )
+ {
+ UngappedMatchRecord* mr = seed_sort_list[mI].second;
+ if ( seed_file.size() > 0)
+ seed_out << *mr << endl;
+ for( size_t seqI = 0; seqI < mr->Multiplicity(); ++seqI )
+ mplt_sort_list[compI++] = make_pair( mr->LeftEnd( seqI ), make_pair( mr, seqI ) );
+ }
+ // pairs get ordered on the first element by default
+ std::sort( mplt_sort_list.begin(), mplt_sort_list.end() );
+ gnSeqI seq_length = seedml.seq_table[0]->length();
+ MatchPositionLookupTable match_pos_lookup_table( seq_length+1, make_pair( (UngappedMatchRecord*)NULL, 0 ) );
+ for( size_t i = 0; i < mplt_sort_list.size(); ++i )
+ {
+ //if ( seed_file.size() > 0)
+ // seed_out << (*(UngappedMatchRecord*)mplt_sort_list[i].second.first) << endl;
+ //cerr << mplt_sort_list[i].first << endl;
+ match_pos_lookup_table[ mplt_sort_list[i].first ] = mplt_sort_list[i].second;
+
+ }
+
+ //
+ // part 4, create a procrastination queue
+ //
+ ProcrastinationQueue procrastination_queue( match_record_list );
+
+ //
+ // part 5, create an (empty) Novel Subset Match Record list
+ //
+ vector< GappedMatchRecord* > novel_subset_list;
+
+ size_t superset_count = 0;
+ size_t chainable_count = 0;
+ size_t subset_count = 0;
+ size_t novel_subset_count = 0;
+
+ boost::dynamic_bitset<> left_lookups(seedml.seq_table[0]->length(), false);
+ boost::dynamic_bitset<> right_lookups(seedml.seq_table[0]->length(), false);
+
+ //
+ // part 6, extend all matches!
+ //
+ vector< GappedMatchRecord* > extended_matches; /**< The extended matches will be chains of UngappedMatchRecords */
+
+ //for extension
+ PairwiseScoringScheme pss = PairwiseScoringScheme(hoxd_matrix,-100,-20);
+
+ int curI = 0;
+ uint curr_extensions = 0;
+ uint max_extensions = 2;
+ while( procrastination_queue.end() > 0 )
+ {
+ int prevI = curI;
+ curI +=1;
+ if( (curI * 100) / procrastination_queue.size() != (prevI * 100) / procrastination_queue.size() )
+ {
+ cout << (curI * 100) / procrastination_queue.size() << "%..";
+ cout.flush();
+ }
+
+ // pop the next match off the heap
+ MatchRecord* umr = procrastination_queue.pop();
+ // if the match has been subsumed then skip it
+ if( umr->subsuming_match != NULL )
+ continue;
+ if( umr->dont_extend == true )
+ continue;
+
+// if( umr == (MatchRecord*)0x01335878 )
+// cout << "umr:\n" << *(UngappedMatchRecord*)umr << endl;
+
+ GappedMatchRecord* M_i = dynamic_cast<GappedMatchRecord*>(umr);
+ if( M_i == NULL )
+ {
+ // create a new gapped match record for M_i
+ GappedMatchRecord gmr( *(UngappedMatchRecord*)umr );
+ M_i = gmr.Copy();
+ umr->subsuming_match = M_i;
+ M_i->chained_matches.push_back( umr );
+ vector< size_t > component_map( M_i->SeqCount() );
+ for( size_t i = 0; i < component_map.size(); ++i )
+ component_map[i] = i;
+ M_i->chained_component_maps.push_back(component_map);
+ swap(umr->subsumption_component_map, component_map); // swap avoids reallocation
+ // update superset and subset links
+ for( int dI = 1; dI > -2; dI -= 2 )
+ {
+ MatchLink& ij_link = getSuperset(M_i,dI);
+ if( ij_link.superset != NULL )
+ {
+ ij_link.subset = M_i;
+ unlinkSuperset(umr,dI);
+ int parity = M_i->Orientation(0) == ij_link.superset->Orientation(ij_link.sub_to_super_map[0]) ? 1 : -1;
+ getSubsets(ij_link.superset,-dI*parity).push_back(ij_link);
+ }
+ vector< MatchLink >& subsets = getSubsets(M_i,dI);
+ for( size_t subI = 0; subI < subsets.size(); ++subI )
+ {
+ subsets[subI].superset = M_i;
+ int parity = M_i->Orientation(subsets[subI].sub_to_super_map[0]) == subsets[subI].subset->Orientation(0) ? 1 : -1;
+ getSuperset(subsets[subI].subset, -dI*parity).superset = M_i;
+ }
+ getSubsets(umr,dI).clear(); // so that validate works...
+ }
+ }
+ else
+ cerr << "castdebugme!!\n" << endl;
+
+ M_i->extended = true;
+ extended_matches.push_back( M_i );
+
+ // extend the match in each direction
+ // if a superset exists use that first
+ // otherwise create a neighborhood list
+ int direction = 1; // leftward == 1, rightward == -1, done == -3
+ //int direction = -1; // leftward == 1, rightward == -1, done == 3
+ int last_linked = 0; // stores the group type that was chained. 1 == superset, 2 == chainable, 0 == none
+ vector< NeighborhoodGroup > left_deferred_subsets;
+ vector< NeighborhoodGroup > right_deferred_subsets;
+ vector< NeighborhoodGroup > left_deferred_novel_subsets;
+ vector< NeighborhoodGroup > right_deferred_novel_subsets;
+
+ score_t score = 0;
+ vector< gnSequence* > seqtable( M_i->SeqCount(), seedml.seq_table[0] );
+ vector< string > alignment;
+ vector<score_t> scores;
+ bool extended = false;
+ while( direction > -2 )
+ {
+ last_linked = 0;
+
+ // check for superset
+ if( getSuperset(M_i, direction).superset != NULL )
+ supersetLinkExtension( M_i, direction, last_linked, left_deferred_subsets, right_deferred_subsets, chain );
+
+// else
+// A hack to allow our chaining to work without novel subsets would be to
+// perform an additional neighborhood list lookup after superset link
+// extension even if no chainables are found during link extension.
+ else
+ {
+ //
+ // perform a neighborhood list extension,
+ // looks for neighboring matches in the match position lookup table
+ //
+ vector< NeighborhoodGroup > superset_list;
+ vector< NeighborhoodGroup > chainable_list;
+ vector< NeighborhoodGroup > subset_list;
+ vector< NeighborhoodGroup > novel_subset_list;
+ //tjt: ok
+ neighborhoodListLookup( M_i, match_pos_lookup_table,
+ superset_list, chainable_list, subset_list, novel_subset_list,
+ direction, seed_size, w, left_lookups, right_lookups, NULL);
+
+ // tallies for debugging
+ superset_count += superset_list.size();
+ chainable_count += chainable_list.size();
+ subset_count += subset_list.size();
+
+ // now process each type of neighborhood group
+ // supersets are already done. happy chrismakwanzuhkkah
+ // then process chainable
+ processChainableMatches( M_i, chainable_list, direction, last_linked, find_novel_subsets, chain );
+
+ // defer subset processing
+ for( size_t gI = 0; gI < subset_list.size(); gI++ )
+ {
+ vector< NeighborhoodGroup >& cur_subset_list = selectList( left_deferred_subsets, right_deferred_subsets, direction );
+ cur_subset_list.push_back( subset_list[gI] );
+ }
+ // defer novel subset processing
+ vector< NeighborhoodGroup >& cur_novel_subset_list = selectList( left_deferred_novel_subsets, right_deferred_novel_subsets, direction );
+ cur_novel_subset_list.clear(); // we only process novel subsets on the very last extension
+ for( size_t gI = 0; gI < novel_subset_list.size(); gI++ )
+ cur_novel_subset_list.push_back( novel_subset_list[gI] );
+
+ } // end if no superset was found then do neighborhood list lookup
+ //if find_novel_subsets not enabled, we can avoid this hack? is this true?
+ if (!find_novel_subsets)
+ {
+ vector< NeighborhoodGroup > superset_list;
+ vector< NeighborhoodGroup > chainable_list;
+ vector< NeighborhoodGroup > subset_list;
+ vector< NeighborhoodGroup > novel_subset_list;
+ neighborhoodListLookup( M_i, match_pos_lookup_table,
+ superset_list, chainable_list, subset_list, novel_subset_list,
+ direction, seed_size, w, left_lookups, right_lookups, NULL);
+
+ // defer subset processing
+ for( size_t gI = 0; gI < subset_list.size(); gI++ )
+ {
+ vector< NeighborhoodGroup >& cur_subset_list = selectList( left_deferred_subsets, right_deferred_subsets, direction );
+ cur_subset_list.push_back( subset_list[gI] );
+ }
+ }
+ // if we didn't do a chaining or superset extension, try a gapped extension
+ if( last_linked == 0 )
+ {
+ double e = 2.71828182845904523536;
+ int rcode =FAILED;
+ bool extend_it = false;
+ //extend_length = 0;
+ vector<GappedMatchRecord*> novel_matches; // M_e will contain the extension
+ // only extend if two matches are chained if two-hits == true
+ // its fast enough now that printing to screen actually slows things down...
+ if( extend_chains && (!two_hits || (two_hits && M_i->chained_matches.size() > 1 )))
+ rcode = ExtendMatch(M_i, seqtable, hmm_params, w, direction, novel_matches, gap_open, gap_extend, extension_window);
+
+ if (rcode == FAILED || rcode == FIXME || novel_matches.size() == 0)
+ {
+ //end gapped extension whenever extension fails.
+ direction -=2;
+ //direction +=2;
+ continue;
+ }
+ else
+ {
+ for (size_t mI = 0; mI < novel_matches.size(); mI++ )
+ {
+ //if (novel_matches.at(mI)->Multiplicity() != M_i->Multiplicity() )
+ // continue;
+ GappedMatchRecord* M_e = novel_matches.at(mI);
+ M_e->extended = false;
+
+ if (M_e->Multiplicity() > M_i->Multiplicity())//what does this mean??
+ continue;
+ else if (M_e->Multiplicity() == M_i->Multiplicity())
+ {
+ //immediately chainable!
+ if (direction > 0 && mI == novel_matches.size()-1)
+ {
+
+ extend_it = true;
+ continue;
+ }
+ else if (direction < 0 && mI == 0)
+ {
+ extend_it = true;
+ continue;
+ }
+ }
+ vector< pair< gnSeqI, MatchPositionEntry > > mplt_sort_list( M_e->Multiplicity() );
+ vector< pair< gnSeqI, MatchPositionEntry > > final_mplt_sort_list;
+ size_t compI = 0;
+
+ for( size_t seqI = 0; seqI < M_e->Multiplicity(); ++seqI )
+ mplt_sort_list[compI++] = make_pair( M_e->LeftEnd( seqI ), make_pair( M_e, seqI ) );
+
+ // pairs get ordered on the first element by default
+ std::sort( mplt_sort_list.begin(), mplt_sort_list.end() );
+
+ //don't use novel match if it clobbers the existing left end in the MPLT
+ if (use_novel_matches )
+ {
+
+ bool clobbers_existing_match = false;
+ for( size_t i = 0; i < mplt_sort_list.size(); ++i)
+ {
+ if (match_pos_lookup_table[ mplt_sort_list[i].first ].first != NULL )
+ {
+ clobbers_existing_match = true;
+ break;
+ }
+ }
+ if (! clobbers_existing_match )
+ {
+ for( size_t i = 0; i < mplt_sort_list.size(); ++i)
+ match_pos_lookup_table[ mplt_sort_list[i].first ] = mplt_sort_list[i].second;
+ }
+
+ }
+ //now, during the subsequent call to neighborhoodListLookup(), we should
+ //find the novel homologous region and process it accordingly...
+ }
+ }
+ //update links appropriately, and we can take another round
+ //through the evil megaloop, possibly discovering additional chainable
+ //seeds or superset links.
+
+ // need to update links by looking for matches in the region that was just extended over
+ vector< NeighborhoodGroup > superset_list;
+ vector< NeighborhoodGroup > chainable_list;
+ vector< NeighborhoodGroup > subset_list;
+ vector< NeighborhoodGroup > novel_subset_list;
+
+ //if extend_it is true, it means that we can immediately extend
+ //M_i with the corresponding result from ExtendMatch()
+ if (extend_it)
+ {
+ M_i->extended = true;
+ //build a component map for the new record
+ vector< size_t > component_map( M_i->Multiplicity() );
+ for( size_t i = 0; i < component_map.size(); ++i )
+ component_map[i] = i;
+
+ GappedMatchRecord* M_t = NULL;
+ //leftward extension
+ if (direction > 0 )
+ M_t = novel_matches.back();
+ else
+ M_t = novel_matches.front();
+
+ neighborhoodListLookup( M_i, match_pos_lookup_table,
+ superset_list, chainable_list, subset_list, novel_subset_list,
+ direction, seed_size, w, left_lookups, right_lookups, M_t);
+
+ M_t->subsuming_match = M_i;
+ M_t->subsumption_component_map = component_map;
+ M_i->chained_matches.push_back( M_t );
+ M_i->chained_component_maps.push_back( component_map );
+ bool changed = extendRange(M_i, M_t, component_map);
+
+
+ }
+ else
+ {
+
+ if (!M_i->extended)
+ M_i->extended = false;
+ GappedMatchRecord* M_t = NULL;
+ if (direction > 0 )
+ M_t = novel_matches.front();
+ else
+ M_t = novel_matches.back();
+
+ //we can't extend M_i, but we can classify all of the novel
+ //homologous regions with respect to M_i
+ neighborhoodListLookup( M_i, match_pos_lookup_table,
+ superset_list, chainable_list, subset_list, novel_subset_list,
+ direction, seed_size, w, left_lookups, right_lookups,M_t);
+
+ }
+ extended = true;
+ // now process each type of neighborhood group
+ // if we have completely extended through a superset
+ // then we want to replace that part of the alignment with the superset
+ // if the superset continues beyond the end of at least one component, then
+ // we want to create a superset link for it, and process it during a link extension
+ if ( superset_list.size() > 0 && chain )
+ processSupersetMatches( M_i, superset_list, direction, last_linked, true );
+
+ // then process chainable
+ if ( chainable_list.size() > 0 )
+ processChainableMatches( M_i, chainable_list, direction, last_linked, find_novel_subsets, chain );
+
+ // defer subset processing
+ for( size_t gI = 0; gI < subset_list.size(); gI++ )
+ {
+ vector< NeighborhoodGroup >& cur_subset_list = selectList( left_deferred_subsets, right_deferred_subsets, direction );
+ cur_subset_list.push_back( subset_list[gI] );
+ }
+ // defer novel subset processing
+ vector< NeighborhoodGroup >& cur_novel_subset_list = selectList( left_deferred_novel_subsets, right_deferred_novel_subsets, direction );
+ cur_novel_subset_list.clear(); // only process novel subsets from the very last extension
+ for( size_t gI = 0; gI < novel_subset_list.size(); gI++ )
+ cur_novel_subset_list.push_back( novel_subset_list[gI] );
+
+ //just as before, if we didn't extend M_i, change directions and continue on
+ if (!extend_it )
+ {
+ direction -=2;
+ continue;
+ }
+
+ //otherwise, enable another round of gapped extension in this direction.
+ }
+ } // end loop over leftward and rightward extension
+
+ //
+ // finalize the alignment -- this resolves overlapping components into a single gapped alignment
+ //
+ //tjt: need to send finalize seq_table for muscle alignment
+ if( M_i == (GappedMatchRecord*)0x00d37364 )
+ cerr << "debugmult\n";
+
+ // finally process novel subset
+ for( int direction = 1; direction >-2; direction -= 2 )
+ {
+ vector< NeighborhoodGroup >& cur_novel_subset_list = selectList( left_deferred_novel_subsets, right_deferred_novel_subsets, direction );
+ processNovelSubsetMatches(M_i, cur_novel_subset_list, find_novel_subsets, procrastination_queue,
+ seedml.seq_table, direction, w, last_linked, novel_subset_count );
+ }
+
+ //tjt: make sure finalize only gets called once!
+ M_i->finalize(seedml.seq_table);
+
+ if( M_i->SeqCount() == 0)//what the hell?
+ continue;
+ //
+ // process deferred subsets
+ //
+ for( int direction = 1; direction >-2; direction -= 2 )
+ {
+ vector< NeighborhoodGroup >& subset_list = selectList( left_deferred_subsets, right_deferred_subsets, direction );
+ NeighborhoodGroupCompare ngc;
+ NeighborhoodGroupComponentCompare ngcc;
+ std::sort( subset_list.begin(), subset_list.end(), ngc );
+ bool prev_linked = false;
+ for( size_t sI = 0; sI < subset_list.size(); ++sI )
+ {
+ bool same_components = false;
+ if( sI > 0 )
+ same_components = ngcc.compare(subset_list[sI], subset_list[sI-1]) == 0;
+ prev_linked = same_components? prev_linked : false;
+
+ // check whether each of these ended up getting subsumed
+ bool subsumed;
+ bool partial;
+ classifySubset( M_i, subset_list[sI], subsumed, partial );
+ MatchRecord* M_j = subset_list[sI].get<0>();
+
+ if( M_j->subsuming_match != NULL )
+ {
+ // sometimes duplicate MatchRecord pointers can exist in the subset list when a subset gets found
+ // during a neighborhood list lookup but was already linked to a neighboring superset
+ // in that case, we just skip the second entry...
+ if(M_j->subsuming_match != M_i )
+ cerr << "Error processing M_i " << M_i << ": match " << M_j << " was already subsumed\n";
+ continue;
+ }
+
+ if( subsumed )
+ {
+ M_j->subsuming_match = M_i;
+ M_j->subsumption_component_map = subset_list[sI].get<1>();
+ unlinkSupersets(M_j);
+ continue;
+ }
+ if( partial )
+ {
+ // create a novel subset record, mark this one as subsumed
+ // just destroy it for now...
+ M_j->dont_extend = true;
+
+ unlinkSupersets(M_j);
+ for( size_t mjI = 0; mjI < M_j->Multiplicity(); ++mjI )
+ {
+ if( match_pos_lookup_table[M_j->LeftEnd(mjI)].first == M_j )
+ match_pos_lookup_table[M_j->LeftEnd(mjI)] = make_pair((MatchRecord*)NULL,0);
+ }
+
+ continue;
+ }
+
+ if( prev_linked )
+ {
+ // the previous subset has the same components as this one and was linked.
+ // we may consider this one an 'extra' if all components are further away
+ NeighborhoodGroup cur_group = subset_list[sI];
+ subset_list.erase(subset_list.begin() + sI, subset_list.begin() + sI + 1);
+ sI--;
+ size_t dI = 0;
+ if (subset_list[sI].get<2>().size() < cur_group.get<2>().size())
+ {
+ //debugme: why would this happen?
+ //cerr << "subset_list[" << sI << "].get<2>().size() < cur_group.get<2>().size()" << endl;
+ //cerr << subset_list[sI].get<2>().size() << " < " << cur_group.get<2>().size() << endl;
+ //genome::breakHere();
+ continue;
+ }
+ for( ; dI < cur_group.get<2>().size(); ++dI )
+ {
+ // if cur_group.get<2)()[dI] <= subset_list[sI].get<2>()[dI],
+ // component dI is closer than a component from the current subset
+ if( cur_group.get<2>()[dI] <= subset_list[sI].get<2>()[dI] )
+ break;
+ }
+ // all components were the same, yet further away, so consider this an 'extra' subset
+ if( dI == cur_group.get<2>().size() )
+ {
+ // include this in a list of extra subsets
+ boost::dynamic_bitset<> tmp_bs(M_i->Multiplicity());
+ getExtraSubsets( M_i, direction ).push_back( MatchLink( (MatchRecord*)M_i, M_j, tmp_bs, cur_group.get<1>() ) );
+ continue;
+ }
+ // else we've got a subset tie.
+ if(print_warnings)
+ cerr << "Subset tie, erasing M_j\n";
+
+ //tjt: why do we need to erase the subset? later this will mean that we can't chain the two tied subsets..
+ M_j->dont_extend = true;
+ unlinkSupersets(M_j);
+
+ for( size_t mjI = 0; mjI < M_j->Multiplicity(); ++mjI )
+ {
+ if( match_pos_lookup_table[M_j->LeftEnd(mjI)].first == M_j )
+ match_pos_lookup_table[M_j->LeftEnd(mjI)] = make_pair((MatchRecord*)NULL,0);
+ }
+
+ continue;
+ }
+
+ int parity = M_i->Orientation( subset_list[sI].get<1>()[0] ) == M_j->Orientation(0) ? 1 : -1;
+ // if we have the following case:
+ // --M_i--> --M_j-- <--M_i-- ... ... --M_i--> --M_j-- <--M_i--
+ // then M_j may already be linked to M_i but on the other side
+ if( getSuperset(M_j,direction*parity).superset == M_i )
+ continue;
+ unlinkSuperset( M_j, -direction*parity );
+ // it's outside, just link it in
+ // rebuild the superset component list
+ boost::dynamic_bitset<> comp_list(M_i->Multiplicity(), false);
+
+ for( size_t compI = 0; compI < subset_list[sI].get<1>().size(); ++compI )
+ {
+ //debugme: why do I need to check this first?
+ if ( subset_list[sI].get<1>()[compI] != (std::numeric_limits<size_t>::max)())
+ comp_list.set(subset_list[sI].get<1>()[compI]);
+ }
+ getSuperset(M_j,-direction*parity) = MatchLink( M_i, M_j, comp_list, subset_list[sI].get<1>() );
+ getSubsets(M_i,direction).push_back( getSuperset(M_j,-direction*parity));
+ //getExtraSubsets(M_i,direction).push_back( getSuperset(M_j,-direction*parity));
+ prev_linked = true;
+ }
+ subset_list.clear();
+ }
+ }
+ cout << "\n# of calls to MUSCLE: " << ccount << endl;
+ cout << "------------------------------" << endl;
+ cout << "superset count: " << superset_count << endl;
+ cout << "chainable count: " << chainable_count << endl;
+ cout << "subset count: " << subset_count << endl;
+ cout << "novel subset count: " << novel_subset_count << endl;
+ cout << "------------------------------" << endl;
+ //
+ // part 9, create a final list of local multiple alignments (already done in extended_matches)
+ //
+ vector< GappedMatchRecord* > &final = extended_matches;
+
+ // part 10, score matches
+
+ //create output stream
+ ostream* output;
+ ostream* output2;
+ ofstream score_out_file;
+ ofstream aln_out_file;
+ ofstream stats_out_file;
+ if(stat_file != "" && stat_file != "-")
+ stats_out_file.open( stat_file.c_str() );
+
+ if(outputfile == "" || outputfile == "-")
+ output = &cout;
+ else
+ {
+ aln_out_file.open( outputfile.c_str() );
+ output = &aln_out_file;
+ }
+ if(output2file == "" || output2file == "-")
+ output2 = &cout;
+ else
+ {
+ score_out_file.open( output2file.c_str() );
+ output2 = &score_out_file;
+ }
+ vector< GappedMatchRecord* > scored;
+ vector<score_t> scores_final;
+ score_t score_final = 0;
+ double e = 2.71828182845904523536;
+ vector< GappedMatchRecord* > filtered_final;
+ int finalsize = final.size();
+ uint alignment_count = 0;
+
+ cout << "->Computing Sum-of-Pairs score of all lmas..." << endl;
+ for( size_t fI = 0; fI < finalsize; fI++ )
+ {
+ vector<string> alignment;
+ vector< gnSequence* > seq_table( final[fI]->SeqCount(), seedml.seq_table[0] );
+ mems::GetAlignment(*final[fI], seq_table, alignment); // expects one seq_table entry per matching component
+ //send temporary output format to file if requested
+ if (alignment.at(0).size() >= min_repeat_length)
+ {
+ if(only_extended)
+ {
+ //we don't want it..
+ if ( alignment.at(0).size() <= seed_size )
+ continue;
+ }
+ score_final = 0;
+ computeSPScore( alignment, pss, scores_final, score_final);
+ //*output << "#procrastAlignment " << ++alignment_count << endl << *final.at(fI) << endl;
+ final[fI]->spscore = score_final;
+ scored.push_back(final[fI]);
+
+ }
+ else
+ continue;
+
+ }
+
+ if (!allow_redundant)
+ {
+ cout << "->Removing redudant lmas..." << endl;
+ }
+ //
+ // remove overlapping regions
+ //
+ // 1) create a vector of CompactMatchRecord* with one entry for each nucleotide in the input sequence.
+ //tjt: CompactMatchRecord is an attempt to reduce the space requirements for the method currently used to
+ //remove overlapping regions
+ vector< CompactMatchRecord* > match_record_nt(sequence.size());
+ for( size_t mI = 0; mI < match_record_nt.size(); ++mI )
+ {
+ CompactUngappedMatchRecord tmp( 1, 1 );
+ match_record_nt[mI] = tmp.Copy();
+ match_record_nt[mI]->SetStart( 0, mI );
+ match_record_nt[mI]->SetLength( 1, 0 );
+ match_record_nt[mI]->subsuming_match = NULL;
+ }
+
+ // 2) sort the result GappedMatchRecords
+ if (large_repeats)
+ std::sort( scored.begin(), scored.end(), score_by_length );
+ else if (small_repeats)
+ std::sort( scored.begin(), scored.end(), scorecmp );
+ else
+ std::sort( scored.begin(), scored.end(), score_by_sp );
+ for( size_t fI = 0; fI < scored.size(); fI++ )
+ {
+ //this shouldn't be the case, but let's be safe
+ if (scored.at(fI)->AlignmentLength() < 1)
+ continue;
+
+ //if user wants to remove all overlapping regions among lmas, let's do it!
+ if (!allow_redundant)
+ {
+
+ //for each match compontent in M_i
+ for ( size_t seqI = 0; seqI < scored.at(fI)->Multiplicity(); seqI++)
+ {
+ //if there is no match, we can't do a thing
+ if( scored.at(fI)->LeftEnd(seqI) == NO_MATCH )
+ continue;
+
+ //if left/right ends are good, set subsuming_match pointers
+ if (scored.at(fI)->LeftEnd(seqI) < 4000000000u && scored.at(fI)->RightEnd(seqI) < 4000000000u)
+ {
+ gnSeqI endI = scored.at(fI)->RightEnd(seqI);
+ gnSeqI startI = scored.at(fI)->LeftEnd(seqI);
+ for( ; startI < scored.at(fI)->RightEnd(seqI); startI++)
+ {
+ //3) Mark each entry in the MatchRecord* vector which corresponds to nucleotides contained within the current GMR.
+ //A pointer to the current GMR can be >stored in each entry
+ if ( match_record_nt.at(startI)->subsuming_match == NULL)
+ match_record_nt.at(startI)->subsuming_match = scored.at(fI);
+ }
+ }
+
+ size_t left_crop_amt = 0;
+ size_t right_crop_amt = 0;
+ gnSeqI startI = scored.at(fI)->LeftEnd(seqI);
+ //4) When a non-null entry is encountered in the vector, crop out that portion of the current GMR
+ while(match_record_nt.at(startI)->subsuming_match != NULL && match_record_nt.at(startI)->subsuming_match != scored.at(fI) && startI < scored.at(fI)->RightEnd(seqI) && scored.at(fI)->Length(seqI) < 4000000000u)
+ {
+ startI++;
+ left_crop_amt++;
+ }
+ if (left_crop_amt > 0)
+ {
+ if (left_crop_amt >= scored.at(fI)->Length(seqI))
+ scored.at(fI)->CropLeft( scored.at(fI)->Length(seqI)-1, seqI);
+ else
+ scored.at(fI)->CropLeft( left_crop_amt, seqI);
+ }
+ if (scored.at(fI)->LeftEnd(seqI) < 4000000000u && scored.at(fI)->RightEnd(seqI) < 4000000000u && scored.at(fI)->Length(seqI) < 4000000000u)
+ {
+ startI = scored.at(fI)->RightEnd(seqI)-1;
+ //4) When a non-null entry is encountered in the vector, crop out that portion of the current GMR
+ while(match_record_nt.at(startI)->subsuming_match != NULL && match_record_nt.at(startI)->subsuming_match != scored.at(fI) && startI >= scored.at(fI)->LeftEnd(seqI))
+ {
+ startI--;
+ right_crop_amt++;
+ }
+ }
+ if (right_crop_amt > 0)
+ {
+
+ if (right_crop_amt >= scored.at(fI)->Length(seqI))
+ scored.at(fI)->CropRight( scored.at(fI)->Length(seqI)-1, seqI);
+ else
+ scored.at(fI)->CropRight( right_crop_amt, seqI);
+ }
+ }
+ }
+ //if ( left_crop_amt == 0 && right_crop_amt == 0)
+ // filtered_final.push_back(scored.at(fI));
+
+ if (scored.at(fI)->AlignmentLength() >= min_repeat_length )
+ {
+ if(only_extended)
+ {
+ //we don't want it..
+ if ( scored.at(fI)->AlignmentLength() <= seed_size )
+ continue;
+ }
+ // yuck,recalculating sp score to update after removing overlapping regions..
+ // couldn't I just subtract from the original score??
+ vector<string> alignment;
+ vector< gnSequence* > seq_table( scored[fI]->SeqCount(), seedml.seq_table[0] );
+ mems::GetAlignment(*scored[fI], seq_table, alignment); // expects one seq_table entry per matching component
+ // 5) put all LMAs above min_repeat_length and min_spscore into final list of scored LMAs
+ score_final = 0;
+ computeSPScore( alignment, pss, scores_final, score_final);
+ scored.at(fI)->spscore = score_final;
+ // pass it through a tandem repeat filter, too
+ if ((scored.at(fI)->spscore > min_spscore && ( scored.at(fI)->tandem <= allow_tandem)))
+ filtered_final.push_back(scored.at(fI));
+ }
+
+
+ }
+ cout << "->Writing xmfa & xml output..." << endl;
+ std::sort( filtered_final.begin(), filtered_final.end(), scorecmp );
+ // write the output to xmfa
+ writeXmfa( seedml, filtered_final, xmfa_file );
+
+ // write the output to xml
+ writeXML( seedml, filtered_final, xml_file );
+
+ //
+ // part 11, report matches in scored order, by multiplicity then by spscore
+ //
+ output->setf(ios::fixed);
+ output->precision(0);
+
+
+ for( size_t sI = 0; sI < filtered_final.size(); ++sI )
+ {
+ *output << "#procrastAlignment " << sI+1 << endl << *filtered_final[sI] << endl;
+ *output << "Alignment length: " << filtered_final[sI]->AlignmentLength() << endl;
+ *output << "Score: " << filtered_final[sI]->spscore << endl;
+ }
+
+ ///report highest scoring lma for each multiplicity
+ cout << "->Calculating highest scoring lma for each multiplicity..." << endl;
+ stats_out_file.setf(ios::fixed);
+ stats_out_file.precision(0);
+ int prev_multiplicity = 0;
+ uint record_count = 0;
+ for( size_t tI = 0; tI < filtered_final.size(); ++tI )
+ { if (filtered_final[tI]->Multiplicity() != prev_multiplicity)
+ {
+ stats_out_file << "#" << record_count+1 << ": r= " << filtered_final[tI]->Multiplicity() << " l= " << filtered_final[tI]->AlignmentLength() << " s= " << filtered_final[tI]->spscore << endl;
+ prev_multiplicity = filtered_final[tI]->Multiplicity();
+ record_count++;
+ }
+ else
+ continue;
+ }
+ // clean up
+ cout << "->Cleaning up..." << endl;
+ for( size_t eI = 0; eI < match_record_list.size(); ++eI )
+ match_record_list[eI]->Free();
+ for( size_t eI = 0; eI < novel_subset_list.size(); ++eI )
+ if( novel_subset_list[eI]->subsuming_match != NULL )
+ novel_subset_list[eI]->Free();
+ for( size_t eI = 0; eI < extended_matches.size(); ++eI )
+ if( extended_matches[eI]->subsuming_match == NULL )
+ extended_matches[eI]->Free();
+
+ for( size_t seqI = 0; seqI < seedml.seq_table.size(); ++seqI )
+ delete seedml.seq_table[seqI];
+ for( size_t seqI = 0; seqI < seedml.sml_table.size(); ++seqI )
+ delete seedml.sml_table[seqI];
+
+ cout << "->Done!" << endl;
+ return 0;
+}
+
diff --git a/src/rootTrees.cpp b/src/rootTrees.cpp
new file mode 100644
index 0000000..80eef25
--- /dev/null
+++ b/src/rootTrees.cpp
@@ -0,0 +1,128 @@
+#include "libMems/PhyloTree.h"
+#include "libMems/TreeUtilities.h"
+#include <vector>
+#include <sstream>
+#include <algorithm>
+#include <utility>
+#include <fstream>
+
+using namespace std;
+
+typedef unsigned int uint;
+
+/**
+ * Iterative depth-first walk over the children lists, starting at
+ * subtree_nodeI; true as soon as query_nodeI is visited (the start node counts).
+ */
+bool containsNode( PhyloTree< TreeNode >& t, node_id_t subtree_nodeI, node_id_t query_nodeI )
+{
+    stack< node_id_t > pending;
+    pending.push( subtree_nodeI );
+    while( !pending.empty() )
+    {
+        const node_id_t visiting = pending.top();
+        pending.pop();
+        if( visiting == query_nodeI )
+            return true;
+        // a node without children adds nothing to the pending stack
+        for( size_t kidI = 0; kidI < t[visiting].children.size(); ++kidI )
+            pending.push( t[visiting].children[kidI] );
+    }
+    // every node reachable from subtree_nodeI was visited without a match
+    return false;
+}
+
+/**
+ * Re-root tree t at the internal node new_root. Throws a string literal when
+ * new_root has no children; returns at once when new_root is already the root.
+ */
+void rerootTree( PhyloTree< TreeNode >& t, node_id_t new_root )
+{
+    if( t[new_root].children.size() == 0 )
+        throw "Can't root on a leaf node";
+    if( new_root == t.root )
+        return;
+    // demote the old root: its child whose subtree holds new_root becomes its parent
+    for( uint childI = 0; childI < t[t.root].children.size(); childI++ ){
+        // copy the id first: std::remove reads its value by reference, so it must not alias the range
+        const node_id_t toward_new_root = t[t.root].children[childI];
+        if( containsNode( t, toward_new_root, new_root ) )
+        {
+            t[t.root].parents.push_back( toward_new_root );
+            std::vector<node_id_t>::iterator last = std::remove( t[t.root].children.begin(), t[t.root].children.end(), toward_new_root );
+            t[t.root].children.erase(last,t[t.root].children.end());
+            break;
+        }
+    }
+    // shake the tree out on the new root node
+    // NOTE(review): the new root's own parents list is not cleared anywhere below -- confirm intended
+    t.root = new_root;
+    t[t.root].children.insert( t[t.root].children.end(), t[t.root].parents.begin(), t[t.root].parents.end() );
+    stack<node_id_t> node_stack;
+    node_stack.push(t.root);
+    while( node_stack.size() > 0 )
+    {
+        // for each child of the current node: drop the current node from the child's
+        // children and parents lists, turn the child's remaining parents into children,
+        // make the current node its only parent, then queue the child for the same treatment
+        node_id_t cur_node = node_stack.top();
+        node_stack.pop();
+        for( uint childI = 0; childI < t[cur_node].children.size(); childI++ )
+        {
+            TreeNode& child_n = t[t[cur_node].children[childI]];
+            std::vector<node_id_t>::iterator last = std::remove( child_n.children.begin(), child_n.children.end(), cur_node );
+            child_n.children.erase(last,child_n.children.end());
+            last = std::remove( child_n.parents.begin(), child_n.parents.end(), cur_node );
+            child_n.parents.erase(last,child_n.parents.end());
+            child_n.children.insert( child_n.children.end(), child_n.parents.begin(), child_n.parents.end() );
+            child_n.parents.clear();
+            child_n.parents.push_back(cur_node);
+            node_stack.push(t[cur_node].children[childI]);
+        }
+    }
+}
+
+
+/** rootTrees <nexus input file> <nexus output file>: returns 0 on success, -1 on bad usage or an unopenable file. */
+int main( int argc, char* argv[] )
+{
+    if( argc < 3 )
+    {
+        cerr << "Usage: rootTrees <nexus input file> <nexus output file>\n";
+        return -1;  // without this, argv[1]/argv[2] below are read even when absent
+    }
+    string input_filename = argv[1];
+    string output_filename = argv[2];
+    ifstream input_file( input_filename.c_str() );
+    if( !input_file.is_open() )
+    {
+        cerr << "Error opening \"" << input_filename << "\"\n";
+        return -1;
+    }
+    ofstream output_file( output_filename.c_str() );
+    if( !output_file.is_open() )
+    {
+        cerr << "Error opening \"" << output_filename << "\"\n";
+        return -1;
+    }
+
+    uint tree_count = 0;
+    while( true )
+    {
+        PhyloTree< TreeNode > t;
+        t.readTree( input_file );
+        if( t.size() == 0 )
+            break;
+        // rooted_trees stays empty while the rootAtEachNode call is commented out, so nothing is written yet
+        vector< PhyloTree< TreeNode > > rooted_trees;
+//      rootAtEachNode( t, rooted_trees );
+        for( size_t treeI = 0; treeI < rooted_trees.size(); treeI++ )
+            rooted_trees[treeI].writeTree( output_file );
+        tree_count++;
+        if( tree_count % 100 == 0 )
+            cout << "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b";
+        cout << "Processed " << tree_count << " trees";
+    }
+    cerr << "Wrote rooted trees to \"" << output_filename << "\"\n";
+    return 0;
+}
\ No newline at end of file
diff --git a/src/scoreALU.cpp b/src/scoreALU.cpp
new file mode 100644
index 0000000..121acd6
--- /dev/null
+++ b/src/scoreALU.cpp
@@ -0,0 +1,729 @@
+/*******************************************************************************
+ * $Id: scoreAlignment.cpp,v 1.14 2004/02/28 00:01:31 darling Exp $
+ * This file is copyright 2002-2004 Aaron Darling. All rights reserved.
+ * Please see the file called COPYING for licensing, copying, and modification
+ * rights. Redistribution of this file, in whole or in part is prohibited
+ * without express permission.
+ ******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/MatchList.h"
+#include <string>
+#include <fstream>
+#include <sstream>
+#include <map>
+#include "libMems/IntervalList.h"
+#include "libGenome/gnFilter.h"
+#include <boost/program_options/cmdline.hpp>
+#include <boost/program_options.hpp>
+namespace po = boost::program_options;
+
+using namespace std;
+using namespace genome;
+using namespace mems;
+
+/// One row of the repeat annotation table parsed by ReadAluFile().
+/// All members are public and filled in field by field.
+class AluRecord
+{
+public:
+    /// Zeroes every numeric field; queryId, repeatId and repeatClass start
+    /// as "none" and strand as "+".
+    AluRecord();
+
+    // ---- match quality ----
+    int score;          ///< Smith-Waterman score of the match
+    float divergent;    ///< % substitutions in matching region compared to the consensus
+    float deleted;      ///< % deleted bp
+    float inserted;     ///< % inserted bp
+
+    // ---- placement in the query sequence ----
+    string queryId;     ///< name of query sequence
+    gnSeqI start;       ///< starting position of match in query sequence
+    gnSeqI end;         ///< ending position of match in query sequence
+    gnSeqI remaining;   ///< no. of bases in query sequence past the ending position of match
+    /// strand column; per the original note: match is with the Complement
+    /// of the consensus sequence in the database
+    string strand;
+
+    // ---- matching repeat and placement in its consensus ----
+    string repeatId;    ///< name of the matching interspersed repeat
+    string repeatClass; ///< class of the matching repeat
+    /// no. of bases in (complement of) the repeat consensus sequence prior to
+    /// beginning of the match (so 0 means that the match extended all the way
+    /// to the end of the repeat consensus sequence)
+    gnSeqI prior;
+    gnSeqI startDB;     ///< starting position of match in database sequence
+    gnSeqI endDB;       ///< ending position of match in database sequence
+
+    /// Length of the match on the query sequence, derived from start and
+    /// end (see the definition for the exact arithmetic).
+    gnSeqI length();
+};
+
+/** Span of the match on the query sequence, computed as |end| - |start|. */
+gnSeqI AluRecord::length(void)
+{
+    // both ends go through int64 before absolut(); the difference is converted back to gnSeqI
+    return static_cast<gnSeqI>( absolut(static_cast<int64>(end)) - absolut(static_cast<int64>(start)) );
+}
+/** Default record: numeric fields zero, ids "none", strand "+". */
+AluRecord::AluRecord()
+    : score( 0 ),
+      divergent( 0.0 ),
+      deleted( 0.0 ),
+      inserted( 0.0 ),
+      queryId( "none" ),
+      start( 0 ),
+      end( 0 ),
+      remaining( 0 ),
+      strand( "+" ),
+      repeatId( "none" ),
+      repeatClass( "none" ),
+      prior( 0 ),
+      startDB( 0 ),
+      endDB( 0 )
+{}
+/**
+ * Parse a whitespace-delimited repeat annotation table into AluRecords.
+ *
+ * The first three lines of in_stream are discarded as header. Every later
+ * non-blank line yields one AluRecord allocated with new and appended to
+ * alu_list (nothing here frees them), and adds that record's length() to lr.
+ * Finally the size of alu_list is printed to stdout.
+ *
+ * Fields are extracted in the order score, divergent, deleted, inserted,
+ * queryId, start, end, remaining, strand, repeatId, repeatClass; extraction
+ * failures are not checked.
+ *
+ * @param in_stream  table to read
+ * @param alu_list   receives one AluRecord* per data line
+ * @param lr         running total of record lengths; added to, not reset
+ */
+void ReadAluFile( istream& in_stream, vector<AluRecord*>& alu_list, gnSeqI& lr )
+{
+    string cur_line;
+    // 3 lines of header info
+    for( int headerI = 0; headerI < 3; ++headerI )
+        getline( in_stream, cur_line );
+
+    while( getline( in_stream, cur_line ) )
+    {
+        // a blank line holds no record; parsing it would only append a
+        // default-valued entry and inflate the reported count
+        if( cur_line.find_first_not_of( " \t\r\n\v\f" ) == string::npos )
+            continue;
+
+        // blank out the first '(' and the first ')' so that a parenthesised
+        // number extracts like a plain one (presumably the "remaining"
+        // column -- confirm against the input format)
+        string::size_type loc = cur_line.find("(");
+        if (loc != string::npos )
+            cur_line.replace(loc,1," ");
+        loc = cur_line.find(")");
+        if (loc != string::npos )
+            cur_line.replace(loc,1," ");
+
+        AluRecord* alu = new AluRecord();
+        stringstream parse_str( cur_line );
+        parse_str >> alu->score;
+        parse_str >> alu->divergent;
+        parse_str >> alu->deleted;
+        parse_str >> alu->inserted;
+        parse_str >> alu->queryId;
+        parse_str >> alu->start;
+        parse_str >> alu->end;
+        parse_str >> alu->remaining;
+        parse_str >> alu->strand;
+        parse_str >> alu->repeatId;
+        parse_str >> alu->repeatClass;
+        // the rest of the line (prior, startDB, endDB) is not needed;
+        // those fields keep their constructor defaults
+
+        alu_list.push_back(alu);
+        lr+= alu->length();
+    }
+    cout << "number of ALU records in file: " << alu_list.size() << endl;
+}
+
+/**
+ * program to score alignments
+ * reads in a "correct" alignment and a calculated alignment
+ * scores the calculated alignment based on the correct one
+ */
+int main( int argc, char* argv[] ){
+
+ string alignment_fname;
+ string alu_fname;
+
+
+ if( argc < 2 ){
+ cout << "scoreALU <procrastAligner alignment> <repeatmasker ALUs>\n";
+ return -1;
+ }
+ // Declare the supported options.
+
+ po::variables_map vm;
+ try {
+
+ po::options_description desc("Allowed options");
+ desc.add_options()
+ ("help", "get help message")
+ ("alignment", po::value<string>(&alignment_fname), "procrastAligner alignment")
+ ("alus", po::value<string>(&alu_fname), "repeatmasker ALUs")
+ ;
+
+
+ po::store(po::parse_command_line(argc, argv, desc), vm);
+ po::notify(vm);
+
+ if (vm.count("help")) {
+ cout << desc << "\n";
+ return 1;
+ }
+
+
+ }
+ catch(exception& e) {
+ cerr << "error: " << e.what() << "\n";
+ return 1;
+ }
+ catch(...) {
+ cerr << "Exception of unknown type!\n";
+ }
+
+
+
+ ifstream align_in;
+ align_in.open( alignment_fname.c_str() );
+ if( !align_in.is_open() ){
+ cerr << "Error opening " << alignment_fname << endl;
+ return -1;
+ }
+ ifstream alu_in;
+ alu_in.open( alu_fname.c_str() );
+ if( !alu_in.is_open() ){
+ cerr << "Error opening " << alu_fname << endl;
+ return -1;
+ }
+try{
+ cout << "Calclutating specificity and sensitivity of procrastAligner on dataset..." << endl;
+ IntervalList alignment;
+ vector<AluRecord*> alus;
+
+ //total length of all aligned repeats found by procrastAligner
+ gnSeqI lt = 0;
+ //total length of all alignments found by procrastAligner
+ gnSeqI ld = 0;
+ //total length of repeats found by repeatmasker
+ gnSeqI lr=0;
+ gnSeqI ln=0;
+ gnSeqI lp=0;
+ //total length of all regions found only in procrastAligner
+ gnSeqI lo=0;
+ //total length of repeats masked by both programs
+ gnSeqI lc = 0;
+ ReadAluFile( alu_in, alus, lr );
+ alu_in.close();
+ string cur_line;
+ uint seqI = 0;
+ //this will suffice for now, but should plan on using
+ //IntervalList::ReadStandardAlignment or equivalent
+ //to read in XMFA formatted output from procrastAligner
+ pair<int64,int64> pos;
+ vector< vector< pair<int64,int64> > > align_list;
+ vector< pair<int64,int64> > pos_list;
+ map<int64,bool> alncoverage;
+ map<int64,bool> coverage;
+ //list of maps, one for each alignment
+ vector< map<int64,bool> > totcoverage;
+ int64 ccount = 0;
+ while( getline( align_in, cur_line) )
+ {
+ vector< int64 > start_list;
+ getline( align_in, cur_line);
+ stringstream parse_str( cur_line );
+ int64 start = 0;
+ int64 end = 0;
+ int64 length = 0;
+ string aln_len_str;
+ parse_str >> aln_len_str;
+ while( parse_str >> start )
+ {
+ start_list.push_back(start);
+ }
+ getline( align_in, cur_line);
+ stringstream parse_string(cur_line);
+ //parse_str.( cur_line );
+ string lens;
+ parse_string >> lens;
+ uint region_count = 0;
+ while( parse_string >> length )
+ {
+ //cout << length << endl;
+ if ( region_count >= start_list.size() )
+ {
+ //something's wrong
+ cout << "alu data failed!" << endl;
+ break;
+ }
+ pos.first = start_list.at(region_count);
+ if (start_list.at(region_count) < 0 )
+ {
+ pos.second = start_list.at(region_count)-length;
+ //simply add up the alignment coverage in the map
+ for(int i = 0; i < length; i++)
+ {
+ alncoverage[pos.first-i] = true;
+ coverage[pos.first-i] = true;
+ ccount++;
+ }
+ }
+ else
+ {
+ pos.second = start_list.at(region_count)+length;
+ //for both strands
+ for(int i = 0; i < length; i++)
+ {
+ alncoverage[pos.first+i] = true;
+ coverage[pos.first+i] = true;
+ ccount++;
+ }
+ }
+ pos_list.push_back(pos);
+ region_count++;
+ }
+ totcoverage.push_back(alncoverage);
+ alncoverage.clear();
+ align_list.push_back(pos_list);
+ pos_list.clear();
+
+ }//end of read procrastAligner output hack
+ //alignment.ReadStandardAlignment( align_in );
+ align_in.close();
+ cout << "alu data processed!" << endl;
+ int aluhits = 0;
+ int matches = 0;
+ //a first attempt at generating the sensitivity & specificity of our method
+ //for comparison with zhang&waterman's eulerian path method...
+ //hopefully we pull out these ~290bp repeats in a nice chain in each case
+ //FIXME: is this ok?
+
+ map<int,bool> ignoreAlignment;
+ map<int64,bool> mergedCoverage;
+ map<int64,bool> aluCoverage;
+
+ //Total length of unaligned repeats(false positives?) found by procrastAligner
+ map<int64,bool> lpt;
+ map<int64,bool> lpn;
+
+ //Total length of regions found only in procrastAligner
+ map<int64,bool> lpo;
+
+
+ map<int,bool> hitlist;
+
+ map<int64,bool> specificity;
+
+ map< uint,pair<int,int> > best_borders;
+ map< uint,pair<int,int> > worst_borders;
+ int64 matchhits = 0;
+ int64 matchhitmult = 0;
+ cout << "checking which alus are aligned" << endl;
+ for ( int j = 0; j < align_list.size(); j++)
+ {
+ //if alufound in any component of curr alignment, consider 'aligned'
+ //if not, throw out to help our sr. specificity
+
+ //then, for each ALU, see if it is 'covered' by our procrastAlignments.
+ //if so, increase lc2 accordingly
+
+ //for each alignment returned by procrastAligner, highest multiplicity first
+ bool alufound = false;
+
+ //cout << "checking alignment #" << j << " for ALUs..." << endl;
+ for ( int i = 0; i < alus.size(); i++)
+ {
+
+ //lpt = 0;
+ if (alus.at(i)->strand == "+" )
+ {
+ for ( int a = 0; a < alus.at(i)->length(); a++)
+ {
+
+
+ //column in alignment coincides with an alu
+ if(totcoverage.at(j).find((alus.at(i)->start)+a) != totcoverage.at(j).end())
+ {
+
+ alufound = true;
+ //this column in sequence not accounted for
+ if(aluCoverage.find((alus.at(i)->start)+a) == aluCoverage.end())
+ {
+ lc+=1;
+
+ //now it is
+
+ }
+ hitlist[i] = true;
+ aluCoverage[(alus.at(i)->start)+a] = true;
+
+ }
+
+ }
+ }
+ else
+ {
+ for ( int a = 0; a < alus.at(i)->length(); a++)
+ {
+ if(totcoverage.at(j).find(-1*((alus.at(i)->start)+a)) != totcoverage.at(j).end())
+ {
+ if(aluCoverage.find(-1*((alus.at(i)->start)+a)) == aluCoverage.end())
+ {
+ lc+=1;
+
+
+ }
+ hitlist[i] = true;
+ aluCoverage[-1*((alus.at(i)->start)+a)] = true;
+ //lc+=1;
+ alufound =true;
+ }
+
+ }
+ }
+ }
+ if(!alufound)
+ {
+ ignoreAlignment[j] = true;
+ cout << "ignoring alignment " << j << endl;
+
+ //calculate regions only appearing in procrastAligner alignments
+ for(int k = 0; k < align_list.at(j).size();k++)
+ {
+ gnSeqI len = absolut((int64)align_list.at(j).at(k).second)-absolut((int64)align_list.at(j).at(k).first);
+ for(int n = 0; n<len;n++)
+ {
+ if(align_list.at(j).at(k).first<0)
+ lpo[align_list.at(j).at(k).first-n] = true;
+ else
+ lpo[align_list.at(j).at(k).first+n] = true;
+ }
+ }
+ }
+ else
+ {
+ //cout << "ALU was aligned!" << endl;
+ bool hit = false;
+ bool debug_pos = false;
+ bool inall = true;
+ uint rnum = 0;
+ for(int k = 0; k < align_list.at(j).size();k++)
+ {
+ gnSeqI len = absolut((int64)align_list.at(j).at(k).second)-absolut((int64)align_list.at(j).at(k).first);
+ for(int n = 0; n<len;n++)
+ {
+ if(align_list.at(j).at(k).first<0)
+ {
+ // j = lma #
+ // k = component #
+ // first,second = start,end pos
+ if(aluCoverage.find(align_list.at(j).at(k).first-n)!= aluCoverage.end())
+ {
+ //find which alu is hit
+ for ( int i = 0; i < alus.size(); i++)
+ {
+ //is this ok for reverse strand?
+ if( (abs((int)align_list.at(j).at(k).first) >= alus.at(i)->start) && (abs((int)align_list.at(j).at(k).first) < alus.at(i)->end )
+ || (abs((int)align_list.at(j).at(k).second) > alus.at(i)->start) && (abs((int)align_list.at(j).at(k).second) < alus.at(i)->end ) )
+ {
+ //the repeat #
+ if (rnum != i+1 && rnum != 0)
+ inall = false;
+ rnum = i+1;
+ break;
+ }
+ }
+ //current component of alignment pertains to alu
+ //spec.at(j).push_back(k)
+ lpn[align_list.at(j).at(k).first-n] = true;
+ hit = true;
+ }
+ //motif missed by procrastAligner
+ else
+ {
+ lpt[align_list.at(j).at(k).first-n] = true;
+ rnum = -1;
+ }
+ mergedCoverage[align_list.at(j).at(k).first-n] = true;
+ }
+ else
+ {
+ if(aluCoverage.find(align_list.at(j).at(k).first+n)!= aluCoverage.end())
+ {
+ //find out which alu is hit
+ for ( int i = 0; i < alus.size(); i++)
+ {
+
+ if( (abs((int)align_list.at(j).at(k).first) >= alus.at(i)->start) && (abs((int)align_list.at(j).at(k).first) < alus.at(i)->end )
+ || (abs((int)align_list.at(j).at(k).second) > alus.at(i)->start) && (abs((int)align_list.at(j).at(k).second) <= alus.at(i)->end ) )
+ {
+ //the repeat #
+ //cout << rnum << " " << i+1 << endl;
+ if (rnum != i+1 && rnum != 0)
+ inall = false;
+ rnum = i+1;
+ break;
+ }
+ }
+ //current component of alignment pertains to alu
+ lpn[align_list.at(j).at(k).first+n] = true;
+ hit = true;
+ }
+ //motif missed by procrastAligner
+ else
+ {
+ lpt[align_list.at(j).at(k).first+n] = true;
+ rnum = -1;
+ }
+ mergedCoverage[align_list.at(j).at(k).first+n] = true;
+ }
+ }
+ if (rnum <= 0)
+ inall = false;
+
+ if(hit)
+ {
+ matchhits+=1;
+
+ }
+ }
+ //punt: DONT need to first check if it hits all components!!
+ if (inall || 1)
+ {
+ for(int k = 0; k < align_list.at(j).size();k++)
+ {
+ gnSeqI len = absolut((int64)align_list.at(j).at(k).second)-absolut((int64)align_list.at(j).at(k).first);
+ uint rnum = 0;
+
+ if(align_list.at(j).at(k).first<0)
+ {
+ // j = lma #
+ // k = component #
+ // first,second = start,end pos
+
+ //find which alu is hit
+ for ( int i = 0; i < alus.size(); i++)
+ {
+ //is this ok for reverse strand?
+ if( (abs((int)align_list.at(j).at(k).first) >= alus.at(i)->start) && (abs((int)align_list.at(j).at(k).first) < alus.at(i)->end )
+ || (abs((int)align_list.at(j).at(k).second) > alus.at(i)->start) && (abs((int)align_list.at(j).at(k).second) <= alus.at(i)->end ) )
+ {
+ //the repeat #
+ rnum = i+1;
+ //find overlap
+ int leftend = 0;
+ int rightend = 0;
+ leftend = abs((int)alus.at(i)->start)-abs((int)align_list.at(j).at(k).first);
+ rightend = abs((int)alus.at(i)->end)-abs((int)align_list.at(j).at(k).second);
+ if (debug_pos && (abs(leftend)>500 || abs(rightend)>500))
+ {
+ cout << "alu\talignment" << endl;
+ cout << alus.at(i)->start << "\t" << align_list.at(j).at(k).first << endl;
+ cout << alus.at(i)->end << "\t" << align_list.at(j).at(k).second << endl;
+
+ }
+
+ if ( worst_borders.find( rnum ) != worst_borders.end() )
+ {
+ // if component has worse boundaries for this alu, record them
+ if ( abs((int)worst_borders[rnum].first) < abs((int)leftend) )
+ worst_borders[rnum].first = leftend;
+ if ( abs((int)worst_borders[rnum].second) < abs((int)rightend) )
+ worst_borders[rnum].second = rightend;
+ if ( abs((int)best_borders[rnum].first) > abs((int)leftend) )
+ best_borders[rnum].first = leftend;
+ if ( abs((int)best_borders[rnum].second) > abs((int)rightend) )
+ best_borders[rnum].second = rightend;
+ }
+ else
+ {
+ worst_borders[rnum] = make_pair(leftend,rightend);
+ best_borders[rnum] = make_pair(leftend,rightend);
+ }
+
+ break;
+ }
+ }
+ }
+ else
+ {
+ //find out which alu is hit
+ for ( int i = 0; i < alus.size(); i++)
+ {
+ //if( (abs((int)align_list.at(j).at(k).first) <= alus.at(i)->start) && (abs((int)align_list.at(j).at(k).second) >= alus.at(i)->end ) )
+ //if( (abs((int)align_list.at(j).at(k).first) >= alus.at(i)->start) && (abs((int)align_list.at(j).at(k).second) <= alus.at(i)->end ) )
+ //if( ((abs((int)align_list.at(j).at(k).first) >= alus.at(i)->start) && (abs((int)align_list.at(j).at(k).first) < alus.at(i)->end ) && (abs((int)align_list.at(j).at(k).second) > alus.at(i)->end) )
+ //|| ((abs((int)align_list.at(j).at(k).second) > alus.at(i)->start) && (abs((int)align_list.at(j).at(k).second) <= alus.at(i)->end ) && (abs((int)align_list.at(j).at(k).first) < alus.at(i)->start) ) )
+ if( (abs((int)align_list.at(j).at(k).first) >= alus.at(i)->start) && (abs((int)align_list.at(j).at(k).first) < alus.at(i)->end )
+ || (abs((int)align_list.at(j).at(k).second) > alus.at(i)->start) && (abs((int)align_list.at(j).at(k).second) <= alus.at(i)->end ) )
+ {
+ //the repeat #
+ rnum = i+1;
+ //find overlap
+ int leftend = 0;
+ int rightend = 0;
+
+ leftend = abs((int)alus.at(i)->start) -abs((int)align_list.at(j).at(k).first);
+ rightend = abs((int)alus.at(i)->end)-abs((int)align_list.at(j).at(k).second);
+
+ if (debug_pos && (abs(leftend)>500 || abs(rightend)>500))
+ {
+ cout << "alu\talignment" << endl;
+ cout << alus.at(i)->start << "\t" << align_list.at(j).at(k).first << endl;
+ cout << alus.at(i)->end << "\t" << align_list.at(j).at(k).second << endl;
+
+ }
+
+ if ( worst_borders.find( rnum ) != worst_borders.end() )
+ {
+ // if component has worse boundaries for this alu, record them
+ if ( abs((int)worst_borders[rnum].first) < abs((int)leftend) )
+ worst_borders[rnum].first = leftend;
+ if ( abs((int)worst_borders[rnum].second) < abs((int)rightend) )
+ worst_borders[rnum].second = rightend;
+
+ // if component has better boundaries for this alu, record them
+ if ( abs((int)best_borders[rnum].first) > abs((int)leftend) )
+ best_borders[rnum].first = leftend;
+ if ( abs((int)best_borders[rnum].second) > abs((int)rightend) )
+ best_borders[rnum].second = rightend;
+
+ }
+ else
+ {
+ worst_borders[rnum] = make_pair(leftend,rightend);
+ best_borders[rnum] = make_pair(leftend,rightend);
+ }
+
+ break;
+ }
+ }
+
+ }
+
+ }
+ }
+
+ }
+ alufound = false;
+ }
+ gnSequence empty_seq;
+ //this is the length of the repeats found by procrastAligner,
+ //with overlaps removed
+ //remember the alignments to ignore!
+ ofstream boundary_file;
+ alignment_fname.append(".boundary");
+ boundary_file.open(alignment_fname.c_str());
+ map< uint,pair<int,int> >::iterator iter;
+ uint avg_worst_left = 0;
+ uint avg_worst_right = 0;
+ uint avg_best_left = 0;
+ uint avg_best_right = 0;
+ for( iter = worst_borders.begin(); iter != worst_borders.end(); iter++ )
+ {
+ avg_worst_left += abs(iter->second.first);
+ avg_worst_right += abs(iter->second.second);
+ boundary_file << "worst boundaries for repeat copy #" << iter->first << "\t left: " << iter->second.first << "\t right: " << iter->second.second << endl;
+ }
+ for( iter = best_borders.begin(); iter != best_borders.end(); iter++ )
+ {
+ avg_best_left += abs( iter->second.first);
+ avg_best_right += abs(iter->second.second);
+ boundary_file << "best boundaries for repeat copy #" << iter->first << "\t left: " << iter->second.first << "\t right: " << iter->second.second << endl;
+ }
+
+ if (worst_borders.size() > 0 )
+ {
+ avg_worst_left /= worst_borders.size();
+ avg_worst_right /= worst_borders.size();
+ }
+ else
+ {
+ avg_worst_left = -1;
+ avg_worst_right = -1;
+
+ }
+ if ( best_borders.size() > 0)
+ {
+ avg_best_left /= best_borders.size();
+ avg_best_right /= best_borders.size();
+ }
+ else
+ {
+ avg_best_left = -1;
+ avg_best_right = -1;
+
+ }
+ boundary_file << "left best: \t" << avg_best_left << endl;
+ boundary_file << "right best: \t" << avg_best_right << endl;
+ boundary_file << "left worst: \t" << avg_worst_left << endl;
+ boundary_file << "right worst: \t" << avg_worst_right << endl;
+ boundary_file << "#" << endl;
+ boundary_file.close();
+
+ lt = mergedCoverage.size();
+ //lt2 = coverage.size();
+ ld = coverage.size();
+ lp = lpt.size();
+ ln = lpn.size();
+ lo = lpo.size();
+
+ //length of only ALUs hit by procrastAligner
+ gnSeqI hitlength =0;
+ for(int i =0; i< hitlist.size(); i++)
+ hitlength+= alus.at(i)->length();
+
+ cout << "\nprocrastAlignments processed: " << align_list.size() << endl;
+ cout << "matches processed: " << matches << endl;
+ cout << "Total ALUs found by repeatmasker: " << alus.size() << endl;
+ cout << "Total ALUs hit by procrastAligner: " << hitlist.size() << endl;
+ cout << "ALU hit percentage: " << (float)hitlist.size()/(float)alus.size() << endl;
+
+ //cout << aluCoverage.size() << endl;
+ cout << "\nTotal length of all repeats found by procrastAligner: " << ld << endl;
+ cout << "Total length of all regions found only in procrastAligner: " << lo << endl;
+ cout << "Total length of all (partially) aligned repeats found by procrastAligner: lt = " << lt << endl;
+ cout << "Total length of unaligned repeats(false positives?) found by procrastAligner: lp = " << lp << endl;
+ //cout << "Total length of ???: ln = " << ln << endl;
+ cout << "Total length of all repeats(ALU) found by repeatmasker: lr = " << lr << endl;
+ cout << "Total length of repeats(ALU) found by repeatmasker hit by procrastAligner: lh =" << hitlength << endl;
+ cout << "Total length of ALU repeats found by both methods: lc = " << lc << endl;
+
+ //cout << "Sensitivity: lc / lr = " << (double)(lc) / (double)(lr) << endl;
+ //cout << "Specificity: lc / lt = " << (double)(lc) / (double)(lt) << endl;
+
+ //score changes per Sunday email, focus on filtration
+ cout << "\nSensitivity-old: lc / lh = " << (double)(lc) / (double)(hitlength) << endl;
+ cout << "Specificity-old: lc / lt = " << (double)(lc) / (double)(lt) << endl;
+
+
+ cout << "\nSensitivity= " << (double)hitlist.size()/(double)alus.size() << endl;
+ cout << "Specificity= " << (double)matchhits/(double)matchhitmult << endl;
+
+
+ //TN = ltn
+ //TP = lc
+ //FN = lfn
+ //FP = lp
+}catch( gnException& gne ){
+ cerr << gne << endl;
+}catch( exception& e ){
+ cerr << e.what() << endl;
+}
+
+}
+
+
diff --git a/src/scoreAlignment.cpp b/src/scoreAlignment.cpp
new file mode 100644
index 0000000..782edcd
--- /dev/null
+++ b/src/scoreAlignment.cpp
@@ -0,0 +1,467 @@
+/*******************************************************************************
+ * $Id: scoreAlignment.cpp,v 1.14 2004/02/28 00:01:31 darling Exp $
+ * This file is copyright 2002-2004 Aaron Darling. All rights reserved.
+ * Please see the file called COPYING for licensing, copying, and modification
+ * rights. Redistribution of this file, in whole or in part is prohibited
+ * without express permission.
+ ******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/MatchList.h"
+#include <string>
+#include <fstream>
+#include <sstream>
+#include <map>
+#include "libMems/IntervalList.h"
+#include "libGenome/gnFilter.h"
+
+using namespace std;
+using namespace genome;
+using namespace mems;
+
+/**
+ * Ordering functor for closed intervals stored as (left, right) pairs.
+ * Whenever one interval fully contains the other, neither is reported as
+ * less than the other, i.e. the two compare as equivalent.  This is what
+ * allows a degenerate (point, point) interval to act as a query key.
+ * Any other pair of intervals is ordered by left end, then by right end.
+ */
+class IntervalCompare {
+public:
+	boolean operator()(const pair< gnSeqI, gnSeqI >& a, const pair< gnSeqI, gnSeqI >& b) const{
+		const bool a_covers_b = a.first <= b.first && b.second <= a.second;
+		const bool b_covers_a = b.first <= a.first && a.second <= b.second;
+		// containment in either direction counts as "equal" (query semantics)
+		if( a_covers_b || b_covers_a )
+			return false;
+		return a.first != b.first ? a.first < b.first : a.second < b.second;
+	}
+};
+
+/**
+ * Abstract interface for a collection of closed intervals [left, right]
+ * that can be queried for the intervals covering a given point.
+ */
+class IntervalMap {
+public:
+	/** Virtual destructor so implementations can be destroyed safely through an IntervalMap pointer. */
+	virtual ~IntervalMap() {}
+	/** Register the closed interval [left, right]. */
+	virtual void add( gnSeqI left, gnSeqI right ) = 0;
+	/**
+	 * Append to @p intervals the index of every registered interval that
+	 * covers @p point.  Existing contents of @p intervals are left in place.
+	 */
+	virtual void find( gnSeqI point, vector< uint >& intervals ) const = 0;
+};
+
+/**
+ * IntervalMap backed by a std::map keyed on (left, right) pairs and ordered
+ * by IntervalCompare.  Because IntervalCompare treats nested intervals as
+ * equivalent, an interval that contains, or is contained in, an already
+ * stored interval is not stored a second time.
+ */
+class TreeIntervalMap : public IntervalMap {
+public:
+	TreeIntervalMap() : add_count( 0 ) {}
+	virtual void add( gnSeqI left, gnSeqI right );
+	virtual void find( gnSeqI point, vector< uint >& intervals ) const;
+protected:
+	map< pair< gnSeqI, gnSeqI >, uint, IntervalCompare > iv_map;	/**< interval -> index of the add() call that stored it */
+	uint add_count;	/**< number of add() calls made so far */
+
+};
+
+/**
+ * Register the closed interval [left, right].
+ * The interval is tagged with the ordinal of this add() call, which is the
+ * same numbering VectorIntervalMap reports.  Numbering with iv_map.size()
+ * would fall behind the caller's numbering as soon as one insert is rejected
+ * as equivalent to an existing key, and every later index would be wrong.
+ */
+void TreeIntervalMap::add( gnSeqI left, gnSeqI right ) {
+	typedef map< pair< gnSeqI, gnSeqI >, uint, IntervalCompare > iv_map_t;
+	iv_map.insert( iv_map_t::value_type( pair< gnSeqI, gnSeqI >( left, right ), add_count ) );
+	// count the call even when the insert was rejected, so indices stay aligned with the caller
+	++add_count;
+}
+
+/**
+ * Append to @p intervals the index of every stored interval that compares
+ * equivalent to the degenerate interval (point, point), i.e. that covers
+ * @p point.  Existing contents of @p intervals are left in place.
+ */
+void TreeIntervalMap::find( gnSeqI point, vector< uint >& intervals ) const{
+	typedef map< pair< gnSeqI, gnSeqI >, uint, IntervalCompare > iv_map_t;
+	const pair< gnSeqI, gnSeqI > cur_loc( point, point );
+	iv_map_t::const_iterator ivmap_iter = iv_map.lower_bound( cur_loc );
+	const iv_map_t::const_iterator upper_iter = iv_map.upper_bound( cur_loc );
+	for( ; ivmap_iter != upper_iter; ++ivmap_iter ){
+		// keep only keys that are equivalent to the query under the map's ordering
+		if( !iv_map.key_comp()( cur_loc, ivmap_iter->first ) &&
+			!iv_map.key_comp()( ivmap_iter->first, cur_loc ) )
+			intervals.push_back( ivmap_iter->second );
+	}
+}
+
+class VectorIntervalMap : public IntervalMap { // IntervalMap that keeps its intervals in a plain vector, in insertion order
+public:
+ virtual void add( gnSeqI left, gnSeqI right ); // appends the interval [left, right]
+ virtual void find( gnSeqI point, vector< uint >& intervals ) const; // appends the vector index of every interval with first <= point <= second
+protected:
+ vector< pair< gnSeqI, gnSeqI > > iv_map; // stored (left, right) pairs; an interval's index is its position here
+};
+
+/** Append the closed interval [left, right]; its index is its position in the vector. */
+void VectorIntervalMap::add( gnSeqI left, gnSeqI right ) {
+	iv_map.push_back( pair< gnSeqI, gnSeqI >( left, right ) );
+}
+
+/**
+ * Linear scan: append to @p intervals the index of every stored interval
+ * whose bounds satisfy first <= point <= second.
+ */
+void VectorIntervalMap::find( gnSeqI point, vector< uint >& intervals ) const{
+	uint position = 0;
+	while( position < iv_map.size() ){
+		const pair< gnSeqI, gnSeqI >& candidate = iv_map[ position ];
+		const bool outside = point < candidate.first || candidate.second < point;
+		if( !outside )
+			intervals.push_back( position );
+		position++;
+	}
+}
+
+/**
+ * Program to score alignments.
+ * Reads a "correct" alignment and a calculated alignment (both through
+ * IntervalList::ReadStandardAlignment) and scores the calculated alignment
+ * against the correct one, one ordered pair of sequences at a time and one
+ * column of the correct alignment at a time.  The resulting ratios are
+ * printed to stdout.
+ *
+ * Usage: scoreAlignment <correct alignment> <calculated alignment> [evolved sequence file] [slagan]
+ *
+ * @param argc	number of command-line arguments
+ * @param argv	argv[1] correct alignment file, argv[2] calculated alignment
+ *		file, optional argv[3] evolved sequence file (turns on mismatch
+ *		debugging), optional argv[4] the literal "slagan" (turns on
+ *		SLAGAN mode)
+ * @return -1 on a usage error, when an input file cannot be opened, when the
+ *	correct alignment contains no intervals, or when an exception is
+ *	caught; 0 otherwise
+ */
+int main( int argc, char* argv[] ){
+
+	if( argc < 3 ){
+		cout << "scoreAlignment <correct alignment> <calculated alignment> [evolved sequence file] [slagan]\n";
+		return -1;
+	}
+
+	boolean debug_mismatches = false;	/**< turns on code to debug mismatches in evolved and aligned base pairs */
+	boolean slagan_mode = false;		/**< Set to true if scoring SLAGAN alignments */
+	string correct_fname = argv[ 1 ];
+	string calculated_fname = argv[ 2 ];
+	string evolved_fname;
+	if( argc > 3 ){
+		debug_mismatches = true;
+		evolved_fname = argv[ 3 ];
+	}
+	if( argc > 4 ){
+		string slagan = "slagan";
+		if( slagan == argv[ 4 ] )
+			slagan_mode = true;
+	}
+	ifstream correct_in;
+	correct_in.open( correct_fname.c_str() );
+	if( !correct_in.is_open() ){
+		cerr << "Error opening " << correct_fname << endl;
+		return -1;
+	}
+	ifstream calculated_in;
+	calculated_in.open( calculated_fname.c_str() );
+	if( !calculated_in.is_open() ){
+		cerr << "Error opening " << calculated_fname << endl;
+		return -1;
+	}
+try{
+	IntervalList correct_ivs;
+	IntervalList calculated_ivs;
+	correct_ivs.ReadStandardAlignment( correct_in );
+	correct_in.close();
+	calculated_ivs.ReadStandardAlignment( calculated_in );
+	calculated_in.close();
+	// the sequence count is taken from the first correct interval, so there must be one
+	if( correct_ivs.size() == 0 ){
+		cerr << "Error: " << correct_fname << " contains no alignment intervals" << endl;
+		return -1;
+	}
+	gnSequence empty_seq;
+	vector< gnSequence* > seq_table( correct_ivs[0].SeqCount(), &empty_seq );
+	uint seq_count = seq_table.size();
+	const gnFilter* comp_filter = gnFilter::DNAComplementFilter();
+
+	gnSequence evolved_gnseqs;
+	vector< string > evolved_seqs( seq_count );
+	if( debug_mismatches ){
+		evolved_gnseqs.LoadSource( evolved_fname );
+		for( uint i = 0; i < seq_count; i++ ){
+			evolved_seqs[ i ] = evolved_gnseqs.contig( i ).ToString();
+		}
+	}
+
+	/** A map of locations of each interval to the interval's array index */
+	// NOTE: these heap-allocated maps are never freed; they live until the process exits
+	vector< IntervalMap* > iv_map;
+	uint seqI = 0;
+	for( ; seqI < seq_count; seqI++ ){
+		if( seqI > 0 && slagan_mode ){
+			iv_map.push_back( new VectorIntervalMap() );
+		}else{
+			iv_map.push_back( new TreeIntervalMap() );
+		}
+
+		for( uint map_ivI = 0; map_ivI < calculated_ivs.size(); map_ivI++ ){
+			pair< gnSeqI, gnSeqI > cur_pos;
+			cur_pos.first = absolut( calculated_ivs[ map_ivI ].Start( seqI ) );
+			cur_pos.second = cur_pos.first + calculated_ivs[ map_ivI ].Length( seqI ) - 1;
+			iv_map[ seqI ]->add( cur_pos.first, cur_pos.second );
+		}
+	}
+
+	// now compare these alignments somehow (use the evil megaloop)
+	gnSeqI true_pos = 0;	/**< when a base is correctly aligned to an orthologous base */
+	gnSeqI true_neg = 0;	/**< when a base is correctly aligned to a gap */
+	gnSeqI false_pos = 0;	/**< when a base is wrongly aligned to another base */
+	gnSeqI false_neg = 0;	/**< when a base is wrongly aligned to a gap */
+	gnSeqI total = 0;
+	gnSeqI unaligned_fn = 0;	/**< tally for errors due to unaligned regions */
+	gnSeqI unaligned_tn = 0;
+
+	gnSeqI bad_context = 0;
+	gnSeqI multiple_intersection = 0;
+	gnSeqI no_j = 0;
+
+	for( uint cor_ivI = 0; cor_ivI < correct_ivs.size(); cor_ivI++ ){
+		uint calc_ivI = 0;
+		int64 calc_iv_lend = 0;
+		int64 calc_iv_lendJ = 0;
+		boolean parity_match = true;
+		gnAlignedSequences cor_gnas;
+		gnAlignedSequences calc_gnas;
+		correct_ivs[ cor_ivI ].GetAlignedSequences( cor_gnas, seq_table );
+
+		for( seqI = 0; seqI < seq_count; seqI++ ){
+			int64 cor_iv_lend = correct_ivs[ cor_ivI ].Start( seqI );
+			if( cor_iv_lend == NO_MATCH )
+				continue;	// not defined in seqI, skip it
+
+			for( uint seqJ = 0; seqJ < seq_count; seqJ++ ){
+				if( seqI == seqJ )
+					continue;
+
+				int64 cor_iv_lendJ = correct_ivs[ cor_ivI ].Start( seqJ );
+
+				/** base index for seqI in correct alignment */
+				int64 baseI = cor_iv_lend < 0 ? -correct_ivs[ cor_ivI ].Length( seqI ) + 1 : 0;
+				/** base index for seqJ in correct alignment */
+				int64 baseJ = cor_iv_lendJ < 0 ? -correct_ivs[ cor_ivI ].Length( seqJ ) + 1 : 0;
+				int64 calc_baseI = 0;	/**< The current base pair in sequence I of the calculated alignment */
+				int64 calc_baseJ = 0;	/**< The current base pair in sequence J of the calculated alignment */
+				int64 calc_colI = 0;	/**< The current column of the calculated alignment */
+				// update calc_* variables with the current seqI/seqJ pair
+				if( calc_ivI < calculated_ivs.size() && calc_iv_lend != 0 ){
+					calc_iv_lend = calculated_ivs[ calc_ivI ].Start( seqI );
+					calc_iv_lendJ = calculated_ivs[ calc_ivI ].Start( seqJ );
+					calc_baseI = calculated_ivs[ calc_ivI ].Start( seqI );
+					calc_baseJ = calculated_ivs[ calc_ivI ].Start( seqJ );
+					if( ( calc_iv_lend > 0 && cor_iv_lend > 0 ) || ( calc_iv_lend < 0 && cor_iv_lend < 0 ) ){
+						parity_match = true;
+						calc_baseI += calc_baseI < 0 ? -calculated_ivs[ calc_ivI ].Length( seqI ) + 1 : 0;
+						calc_baseJ += calc_baseJ < 0 ? -calculated_ivs[ calc_ivI ].Length( seqJ ) + 1 : 0;
+					}else{
+						parity_match = false;
+						calc_baseI += calc_baseI > 0 ? calculated_ivs[ calc_ivI ].Length( seqI ) - 1 : 0;
+						calc_baseJ += calc_baseJ > 0 ? calculated_ivs[ calc_ivI ].Length( seqJ ) - 1 : 0;
+					}
+					calc_colI = parity_match ? 0 : calc_gnas.alignedSeqsSize() - 1;
+					// scan calc_colI to the first actual residue
+					boolean saw_baseJ = false;
+					while( true ){
+						if( calc_colI < 0 || calc_colI >= calc_gnas.alignedSeqsSize() ){
+							cerr << "Error locating residue in alignment, calculated alignment is corrupt\n";
+							break;
+						}
+						if( calc_gnas.sequences[ seqI ][ calc_colI ] == '-' ){
+							if( calc_gnas.sequences[ seqJ ][ calc_colI ] != '-' ){
+								calc_baseJ += parity_match ? 1 : -1;
+								saw_baseJ = true;
+							}
+							calc_colI += parity_match ? 1 : -1;
+
+						}else
+							break;
+					}
+					// if seqJ still contains a gap in calc_baseJ we haven't actually seen calc_baseJ yet
+					if( !saw_baseJ && calc_gnas.sequences[ seqJ ][ calc_colI ] == '-' ){
+						calc_baseJ += parity_match ? -1 : 1;
+					}
+				}
+
+				for( gnSeqI colI = 0; colI < cor_gnas.alignedSeqsSize(); colI++ ){
+					if( cor_gnas.sequences[ seqI ][ colI ] == '-' ){
+						if( cor_gnas.sequences[ seqJ ][ colI ] != '-' )
+							baseJ++;
+						continue;
+					}else if( seqJ < seqI && ( cor_gnas.sequences[ seqJ ][ colI ] != '-' )){
+						// this one was already scored when seqI had the current value of seqJ
+						baseI++;
+						baseJ++;
+						continue;
+					}
+
+					total++;	/** this aligned pair counts towards the totals */
+
+					// calculate the actual base index in seqJ for the correct alignment
+					int64 cor_baseJ = cor_iv_lendJ + baseJ;
+
+					// check if the current correct alignment entry for seqI is in
+					// the current interval of the calculated alignment
+					// if not, scan through the calculated intervals until we find the right one
+					// also check whether cor_baseJ fits (for the benefit of shuffle-lagan)
+					if( calc_iv_lend == 0 || !(absolut( calc_iv_lend ) <= absolut( cor_iv_lend + baseI ) &&
+						absolut( cor_iv_lend + baseI ) < absolut( calc_iv_lend ) + calculated_ivs[ calc_ivI ].Length( seqI ) &&
+						absolut( calc_iv_lendJ ) <= absolut( cor_baseJ ) &&
+						absolut( cor_baseJ ) < absolut( calc_iv_lendJ ) + calculated_ivs[ calc_ivI ].Length( seqJ ) - 1 ) ){
+
+						boolean possibly_incorrect = false;
+						vector< uint > possible_ivsI, possible_ivsJ;
+						iv_map[ seqI ]->find( absolut( cor_iv_lend + baseI ), possible_ivsI );
+						iv_map[ seqJ ]->find( absolut( cor_baseJ ), possible_ivsJ );
+						calc_ivI = calculated_ivs.size();
+						if( possible_ivsI.size() == 0 )
+							no_j++;
+						// cor_baseJ is signed while the interval bounds below are
+						// absolute coordinates, so the sanity check compares magnitudes
+						const int64 abs_cor_baseJ = absolut( cor_baseJ );
+						// determine the intersection of possible_ivI and possible_ivJ
+						vector< uint > intersection;
+						uint pivI = 0;
+						for( ; pivI < possible_ivsI.size(); pivI++ ){
+							possibly_incorrect = true;
+							uint pivJ = 0;
+							for( ; pivJ < possible_ivsJ.size(); pivJ++ ){
+								int64 s = absolut( calculated_ivs[ possible_ivsJ[ pivJ ] ].Start( seqJ ) );
+								int64 e = s + calculated_ivs[ possible_ivsJ[ pivJ ] ].Length( seqJ ) - 1;
+								// a chained "s <= x <= e" compares a bool with e and is always
+								// true, so the range test is spelled out as two comparisons
+								if( !( s <= abs_cor_baseJ && abs_cor_baseJ <= e ) )
+									cerr << "cor_baseJ doesn't fit!\n";
+								if( possible_ivsI[ pivI ] == possible_ivsJ[ pivJ ] )
+									intersection.push_back( pivI );
+							}
+						}
+						if( intersection.size() > 0 ){
+							calc_ivI = possible_ivsI[ intersection[ 0 ] ];
+							calc_iv_lend = calculated_ivs[ calc_ivI ].Start( seqI );
+							calc_iv_lendJ = calculated_ivs[ calc_ivI ].Start( seqJ );
+						}
+						if( intersection.size() > 1 ){
+							multiple_intersection++;
+						}
+
+						// if we couldn't find baseI anywhere in the calculated alignment then treat
+						// it as aligned to a gap, otherwise
+						// update the gnAlignedSequences object for the new interval
+						if( calc_ivI < calculated_ivs.size() ){
+							calculated_ivs[ calc_ivI ].GetAlignedSequences( calc_gnas, seq_table );
+							calc_baseI = calc_iv_lend;
+							calc_baseJ = calc_iv_lendJ;
+							if( ( calc_iv_lend > 0 && cor_iv_lend > 0 ) || ( calc_iv_lend < 0 && cor_iv_lend < 0 ) ){
+								parity_match = true;
+								calc_baseI += calc_baseI < 0 ? -calculated_ivs[ calc_ivI ].Length( seqI ) + 1 : 0;
+								calc_baseJ += calc_baseJ < 0 ? -calculated_ivs[ calc_ivI ].Length( seqJ ) + 1 : 0;
+							}else{
+								parity_match = false;
+								calc_baseI += calc_baseI > 0 ? calculated_ivs[ calc_ivI ].Length( seqI ) - 1 : 0;
+								calc_baseJ += calc_baseJ > 0 ? calculated_ivs[ calc_ivI ].Length( seqJ ) - 1 : 0;
+							}
+							calc_colI = parity_match ? 0 : calc_gnas.alignedSeqsSize() - 1;
+							boolean saw_baseJ = false;
+							while( true ){
+								if( calc_colI < 0 || calc_colI >= calc_gnas.alignedSeqsSize() ){
+									cerr << "Error locating residue in alignment, calculated alignment is corrupt\n";
+									break;
+								}
+								if( calc_gnas.sequences[ seqI ][ calc_colI ] == '-' ){
+									if( calc_gnas.sequences[ seqJ ][ calc_colI ] != '-' ){
+										calc_baseJ += parity_match ? 1 : -1;
+										saw_baseJ = true;
+									}
+									calc_colI += parity_match ? 1 : -1;
+
+								}else
+									break;
+							}
+							// if seqJ still contains a gap in calc_baseJ we haven't actually seen calc_baseJ yet
+							if( !saw_baseJ && calc_gnas.sequences[ seqJ ][ calc_colI ] == '-' ){
+								calc_baseJ += parity_match ? -1 : 1;
+							}
+
+						}else{
+							if( possibly_incorrect ){
+								// aligned to the wrong context
+								bad_context++;
+								false_pos++;
+								if( cor_gnas.sequences[ seqJ ][ colI ] != '-' )
+									baseJ++;
+							}else if( cor_gnas.sequences[ seqJ ][ colI ] != '-' ){
+								// wrongly aligned to a gap
+								unaligned_fn++;
+								false_neg++;
+								baseJ++;
+							}else{
+								// correctly aligned to a gap
+								unaligned_tn++;
+								true_neg++;
+							}
+							baseI++;
+							calc_iv_lend = 0;	// reset calc_iv_lend
+							continue;
+						}
+					}
+
+					// distance (in seqI residues) from the current calculated position
+					// to the residue being scored
+					int64 diffI;
+					if( parity_match )
+						diffI = baseI + cor_iv_lend - calc_baseI;
+					else
+						diffI = baseI + cor_iv_lend + calc_baseI;
+
+					// walk the calculated alignment forward (or backward when the
+					// strands disagree), counting residues passed in seqI and seqJ
+					gnSeqI cbI = 0, cbJ = 0;
+					while( cbI < diffI ){
+						gnSeqI next_colI = parity_match ? calc_colI + 1 : calc_colI - 1;
+						if ( next_colI > 100000000 )
+							cerr << "bug?\n";
+						if( calc_gnas.sequences[ seqI ][ next_colI ] != '-' )
+							cbI++;
+						if( calc_gnas.sequences[ seqJ ][ next_colI ] != '-' )
+							cbJ++;
+						calc_colI += parity_match ? 1 : -1;
+					}
+
+					calc_baseI += parity_match ? cbI : -cbI;
+					calc_baseJ += parity_match ? cbJ : -cbJ;
+					// if cor_baseJ == calc_baseJ then this pair of sequences were correctly aligned!
+					// classify the correctness of the aligned pair
+					char cor_chI = cor_gnas.sequences[ seqI ][ colI ];
+					char cor_chJ = cor_gnas.sequences[ seqJ ][ colI ];
+					char calc_chI = calc_gnas.sequences[ seqI ][ calc_colI ];
+					char calc_chJ = calc_gnas.sequences[ seqJ ][ calc_colI ];
+					if( cor_iv_lend < 0 ){
+						cor_chI = comp_filter->Filter( cor_chI );
+					}
+					if( cor_iv_lendJ < 0 ){
+						cor_chJ = comp_filter->Filter( cor_chJ );
+					}
+					if( calc_iv_lend < 0 ){
+						calc_chI = comp_filter->Filter( calc_chI );
+					}
+					if( calc_iv_lendJ < 0 ){
+						calc_chJ = comp_filter->Filter( calc_chJ );
+					}
+					if( cor_chI != calc_chI && debug_mismatches ){
+						if( evolved_seqs[ seqI ][ absolut( calc_baseI ) - 1 ] == cor_chI ){
+							cerr << "The calculated alignment has incorrect base: " << calc_chI;
+							cerr << " instead of " << evolved_seqs[ seqI ][ absolut( calc_baseI ) - 1 ];
+							cerr << " at " << absolut( calc_baseI ) << " in sequence " << seqI << endl;
+						}else{
+							cerr << "The \"correct\" alignment has incorrect base: " << cor_chI;
+							cerr << " instead of " << evolved_seqs[ seqI ][ absolut( calc_baseI ) - 1 ];
+							cerr << " at " << absolut( calc_baseI ) << " in sequence " << seqI << endl;
+						}
+					}
+
+					if( calc_chJ != '-' ){
+						// make sure the calculated base actually matches the original sequence
+						if( debug_mismatches && calc_chJ != evolved_seqs[ seqJ ][ absolut( calc_baseJ ) - 1 ] ){
+							cerr << "The calculated alignment has incorrect base: " << calc_chJ;
+							cerr << " instead of " << evolved_seqs[ seqJ ][ absolut( calc_baseJ ) - 1 ];
+							cerr << " at " << absolut( calc_baseJ ) << " in sequence " << seqJ << endl;
+						}
+						if( cor_chJ != '-' &&
+							( ( parity_match && cor_baseJ == calc_baseJ ) ||
+							( !parity_match && cor_baseJ == -calc_baseJ ) ) ){
+							true_pos++;
+							// sanity check that the bases are really identical:
+							if( cor_chI != calc_chI || cor_chJ != calc_chJ )
+								cerr << "Calculated alignment contains a different base than the correct!\n";
+						}else if( cor_chJ == '-' )
+							false_neg++;
+						else
+							false_pos++;
+					}else{
+						if( cor_chJ == '-' )
+							true_neg++;
+						else
+							false_pos++;
+					}
+
+					if( cor_gnas.sequences[ seqJ ][ colI ] != '-' )
+						baseJ++;
+					baseI++;
+				}
+			}
+		}
+	}
+
+	cout << "Sensitivity: TP / TP + FN = " << (double)(true_pos) / (double)(true_pos + false_neg) << endl;
+	cout << "Specificity: TN / TN + FP = " << (double)(true_neg) / (double)(true_neg + false_pos) << endl;
+	cout << "TP + TN / total = " << (double)(true_pos + true_neg) / (double)(total) << endl;
+	cout << "FP + FN / total = " << (double)(false_pos + false_neg) / (double)(total) << endl;
+	cout << "unaligned error = " << (double)unaligned_fn / (double)total << endl;
+	cout << "bad_context = " << (double)bad_context / (double)total << endl;
+	cout << "multiple_intersection = " << (double)multiple_intersection / (double)total << endl;
+	cout << "no_j = " << (double)no_j / (double)total << endl;
+
+}catch( gnException& gne ){
+	cerr << gne << endl;
+	return -1;	// report failure like the other error paths do
+}catch( exception& e ){
+	cerr << e.what() << endl;
+	return -1;
+}
+
+	return 0;
+}
+
+
diff --git a/src/scoreProcrastAlignment.cpp b/src/scoreProcrastAlignment.cpp
new file mode 100644
index 0000000..3729bc3
--- /dev/null
+++ b/src/scoreProcrastAlignment.cpp
@@ -0,0 +1,458 @@
+/*******************************************************************************
+ * $Id: scoreAlignment.cpp,v 1.14 2004/02/28 00:01:31 darling Exp $
+ * This file is copyright 2002-2004 Aaron Darling. All rights reserved.
+ * Please see the file called COPYING for licensing, copying, and modification
+ * rights. Redistribution of this file, in whole or in part is prohibited
+ * without express permission.
+ ******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/CompactGappedAlignment.h"
+#include "libMems/MatchList.h"
+#include <string>
+#include <fstream>
+#include <sstream>
+#include <map>
+#include "libMems/IntervalList.h"
+#include "libGenome/gnFilter.h"
+#include <boost/program_options/cmdline.hpp>
+#include <boost/program_options.hpp>
+#include <boost/tuple/tuple.hpp>
+#include <algorithm>
+namespace po = boost::program_options;
+
+using namespace std;
+using namespace genome;
+using namespace mems;
+
+// basic data structures
+
+/**
+ * One column of a pairwise alignment: the coordinate of the column in each
+ * of the two sequences together with the character found at that column.
+ */
+struct aligned_coords_s {
+    int64 pos1;
+    int64 pos2;
+    char char1;
+    char char2;
+};
+typedef struct aligned_coords_s aligned_coords_t;
+
+
+/**
+ * Ordering for aligned_coords_t: primary key |pos1|, secondary key |pos2|.
+ * Only the magnitude of a coordinate is compared; its sign is ignored.
+ */
+class AlignedCoordSeqIComparator {
+public:
+    boolean operator()(const aligned_coords_t& a, const aligned_coords_t& b) const{
+        // the primary keys differ, so they alone decide the order
+        if( abs(a.pos1) != abs(b.pos1) )
+            return abs(a.pos1) < abs(b.pos1);
+        // same magnitude in the first sequence: fall back to the second
+        return abs(a.pos2) < abs(b.pos2);
+    }
+};
+
+void constructCoordList( uint seqI, uint seqJ, IntervalList& iv_list, vector< aligned_coords_t >& coord_list, vector< gnSequence* >& seq_table ){ /*
+ * Fills coord_list with one aligned_coords_t per alignment column of every
+ * interval in iv_list, restricted to the two sequences seqI and seqJ, and
+ * then sorts the list with AlignedCoordSeqIComparator.
+ *   seqI, seqJ  indices of the two sequences whose columns are extracted
+ *   iv_list     intervals to read; the first match of each interval must be
+ *               a GappedAlignment
+ *   coord_list  output; any previous contents are replaced
+ *   seq_table   passed through unchanged to GetAlignment()
+ * Throws a const char* message when the first match of an interval is not a
+ * GappedAlignment.
+ */
+
+ //
+ // pre-allocate the vector
+ //
+ gnSeqI ij_vec_size = 0;
+ for( int ivI = 0; ivI < iv_list.size(); ivI++ ){
+ ij_vec_size += iv_list[ivI].AlignmentLength();
+ }
+ coord_list = vector< aligned_coords_t >( ij_vec_size ); // sized to the summed AlignmentLength() of all intervals
+
+ //
+ // fill in the vector with all aligned pairs
+ //
+ gnSeqI vecI = 0; // current place in vector
+ for( int ivI = 0; ivI < iv_list.size(); ivI++ ){
+ GappedAlignment* aln;
+ aln = dynamic_cast< GappedAlignment* >( iv_list[ ivI ].GetMatches()[0] ); // only the first match of the interval is examined
+ if( aln == NULL ){
+ throw "Error: expecting interval to contain a single GappedAlignment";
+ }
+ int64 pos1 = aln->Start( seqI );
+ int64 pos2 = aln->Start( seqJ );
+
+ // if rev. comp then we're starting at the other (right) side
+ if( pos1 < 0 )
+ pos1 -= aln->Length( seqI ) - 1; // a negative start is pushed further negative; the ++ in the column loop then walks it back toward Start()
+ if( pos2 < 0 )
+ pos2 -= aln->Length( seqJ ) - 1; // same adjustment for the second sequence
+
+
+ const std::vector< std::string >& align_matrix = GetAlignment( *aln, seq_table );
+ for( gnSeqI colI = 0; colI < aln->Length(); colI++ ){
+ aligned_coords_t act;
+ act.char1 = align_matrix[ seqI ][ colI ];
+ act.char2 = align_matrix[ seqJ ][ colI ];
+ act.pos1 = act.char1 == '-' ? 0 : pos1; // 0 marks a gap in seqI at this column
+ act.pos2 = act.char2 == '-' ? 0 : pos2; // 0 marks a gap in seqJ at this column
+
+ coord_list[ vecI++ ] = act;
+
+ if( act.char1 != '-' ) // a coordinate advances only when its sequence has a character in this column
+ pos1++;
+ if( act.char2 != '-' )
+ pos2++;
+ }
+
+ }
+
+ //
+ // sort the vector on aligned position
+ //
+ AlignedCoordSeqIComparator acsc; // orders by |pos1|, then |pos2|
+ sort( coord_list.begin(), coord_list.end(), acsc );
+}
+
+
+const gnFilter* comp_filter = gnFilter::DNAComplementFilter(); // file-scope handle to the filter returned by gnFilter::DNAComplementFilter()
+
+/**
+ * Orders pair-like objects by their .first member only.  The .second member
+ * is ignored, so two pairs that differ only in .second compare equivalent.
+ */
+template< class PairType >
+class PairFirstComparator
+{
+public:
+    // const-qualified so the comparator can also be invoked on a const
+    // object or through a const reference
+    bool operator()( const PairType& a, const PairType& b ) const
+    {
+        return a.first < b.first;
+    }
+};
+
+/**
+ * Scores a procrastAligner result against a reference ("correct") alignment.
+ *
+ * For every pair of sequences, each pair of positions that is aligned in
+ * `correct` is looked up in the procrastAligner intervals.  The pair counts
+ * as a true positive when some interval contains both positions in two
+ * different components with the same relative orientation and either aligns
+ * them in the same column, or has no column between them in which both
+ * components are aligned (no intervening anchor).
+ *
+ * Prints the sum-of-pairs sensitivity, the match component PPV and the
+ * pairwise match component PPV to cout.
+ *
+ * @param correct          the reference alignment
+ * @param procrastinated   the intervals read from the procrastAligner output
+ * @param concat_sequence  the sequences; the cumulative contig lengths are
+ *                         used as the offset of each sequence in the
+ *                         coordinate space of the interval left/right ends
+ * @throws const char*     when the first match of a procrastAligner interval
+ *                         is not a CompactGappedAlignment
+ */
+void compareAlignmentsAceD( IntervalList& correct, IntervalList& procrastinated, gnSequence& concat_sequence )
+{
+    gnSeqI sp_truepos = 0;
+    gnSeqI sp_possible = 0;
+
+    uint seqI = 0;
+    uint seqJ = 0;
+    uint seq_count = concat_sequence.contigListLength();
+
+    // create a data structure that indicates the start offset in concatenated coordinates for a given sequence
+    vector< gnSeqI > concat_coords(seq_count+1, 0);
+    for( size_t seqI = 0; seqI < seq_count; ++seqI )
+    {
+        concat_coords[seqI+1] = concat_coords[seqI] + concat_sequence.contigLength(seqI);
+    }
+
+    // each tracker is (index of the interval in procrastinated, component of that interval);
+    // one list of trackers is kept for every position of the concatenated sequence.
+    typedef std::pair< size_t, uint > iv_tracker_t;
+    typedef vector< iv_tracker_t, boost::pool_allocator< iv_tracker_t > > tracker_vector_t;
+    typedef vector< tracker_vector_t, boost::pool_allocator< tracker_vector_t > > coord_iv_map_vector_t;
+    // create a map from sequence position to (interval,component) for the total length of the concat sequence
+    // use boost pool allocators since this never needs to get freed
+    coord_iv_map_vector_t* tmp = new coord_iv_map_vector_t( concat_coords.back() + 1 ); // heap allocate to avoid destruction when the stack frame is popped
+    coord_iv_map_vector_t& coord_iv_map = *tmp;
+    vector< size_t > coord_iv_counts( concat_coords.back() + 1, 0 );
+    // first count the number of ivs that contain each position so we know how much to allocate
+    for( size_t calcI = 0; calcI < procrastinated.size(); ++calcI )
+    {
+        Interval& iv = procrastinated[calcI];
+        for( size_t seqI = 0; seqI < iv.SeqCount(); ++seqI )
+        {
+            const gnSeqI lend = iv.LeftEnd(seqI);
+            if( lend == NO_MATCH )
+                continue;    // this shouldn't happen with procrastAligner output, but let's be safe
+            const gnSeqI rend = iv.RightEnd(seqI);
+            for( size_t posI = lend; posI <= rend; ++posI )
+                coord_iv_counts[posI]++;
+        }
+    }
+    // now allocate space for the map
+    for( size_t mapI = 0; mapI < coord_iv_map.size(); ++mapI )
+        coord_iv_map[mapI].resize( coord_iv_counts[mapI] );
+
+    std::fill( coord_iv_counts.begin(), coord_iv_counts.end(), 0 );    // recycle this storage to count the number added thus far
+
+    // finally, populate the map
+    for( size_t calcI = 0; calcI < procrastinated.size(); ++calcI )
+    {
+        Interval& iv = procrastinated[calcI];
+        for( size_t seqI = 0; seqI < iv.SeqCount(); ++seqI )
+        {
+            const gnSeqI lend = iv.LeftEnd(seqI);
+            if( lend == NO_MATCH )
+                continue;    // this shouldn't happen with procrastAligner output, but let's be safe
+            const gnSeqI rend = iv.RightEnd(seqI);
+            for( size_t posI = lend; posI <= rend; ++posI )
+            {
+                coord_iv_map[posI][coord_iv_counts[posI]] = make_pair( calcI, seqI );
+                coord_iv_counts[posI]++;
+            }
+        }
+    }
+
+    size_t all_component_count = 0;
+    size_t all_component_pair_count = 0;
+    size_t component_pair_count = 0;
+    // create a vector of bitsets for each iv to store whether their components were correctly aligned
+    vector< bitset_t > component_hits( procrastinated.size() );
+    // pairwise component hits are stored in a second bitset per interval
+    vector< bitset_t > pairwise_component_hits( procrastinated.size() );
+    for( size_t ivI = 0; ivI < component_hits.size(); ++ivI )
+    {
+        // let this be oversized (n*n bits) for easier indexing, but correct for it when calculating the PPV below:
+        // all_component_pair_count only counts the n*(n-1)/2 unordered pairs
+        component_pair_count = procrastinated[ivI].SeqCount() * procrastinated[ivI].SeqCount();
+        pairwise_component_hits[ivI].resize(component_pair_count, false);
+        component_hits[ivI].resize(procrastinated[ivI].SeqCount(), false);
+        all_component_pair_count += ( ( procrastinated[ivI].SeqCount() * (procrastinated[ivI].SeqCount() - 1) ) / 2 );
+        all_component_count += procrastinated[ivI].SeqCount();
+    }
+
+    // sort each vector of iv_tracker_t (by interval index, then component) so we can later do set intersections
+    for( size_t posI = 0; posI < coord_iv_map.size(); ++posI )
+        std::sort( coord_iv_map[posI].begin(), coord_iv_map[posI].end() );
+
+    tracker_vector_t intersect_buf1( all_component_count );    // storage for set intersections
+    tracker_vector_t intersect_buf2( all_component_count );    // storage for set intersections
+    PairFirstComparator< iv_tracker_t > pfc;
+
+    // now, for each pair of aligned positions in the correct alignment, determine whether they
+    // lie in a procrastAligner chain
+    size_t all_pair_count = (seq_count * (seq_count - 1)) / 2;
+    size_t pair_count = 0;
+
+    for( seqI = 0; seqI < seq_count; seqI++ )
+    {
+        for( seqJ = seqI+1; seqJ < seq_count; seqJ++ )
+        {
+            // progress report, printed whenever the integer percentage changes
+            size_t prev_count = pair_count;
+            pair_count++;
+            if( (pair_count * 100) / all_pair_count != (prev_count * 100) / all_pair_count )
+            {
+                cout << (pair_count * 100) / all_pair_count << "%..";
+                cout.flush();
+            }
+            vector< aligned_coords_t > cor;
+
+            //construct the coord list just for the correct alignment
+            vector< gnSequence* > seq_table( seq_count, (gnSequence*)NULL );
+            constructCoordList( seqI, seqJ, correct, cor, seq_table );
+
+            gnSeqI corI = 0;
+            // skip any gaps aligned to gaps
+            while( corI < cor.size() && cor[ corI ].pos1 == 0 )
+                corI++;
+
+            for( ; corI < cor.size(); corI++ )
+            {
+                if( cor[ corI ].pos1 != 0 && cor[ corI ].pos2 != 0)    // don't count positions aligned to gaps
+                    sp_possible++;
+                else
+                    continue;
+
+                // which positions do the correct pair have in the concatenated alignment space?
+                gnSeqI trans_pos1 = genome::absolut( cor[corI].pos1 ) + concat_coords[seqI];
+                gnSeqI trans_pos2 = genome::absolut( cor[corI].pos2 ) + concat_coords[seqJ];
+
+                // which chain(s) do these positions fall into?
+                // are any of them the same chain?
+                tracker_vector_t::iterator last_int1 = std::set_intersection(
+                    coord_iv_map[trans_pos1].begin(),coord_iv_map[trans_pos1].end(),
+                    coord_iv_map[trans_pos2].begin(),coord_iv_map[trans_pos2].end(),
+                    intersect_buf1.begin(), pfc );
+
+                if( last_int1 == intersect_buf1.begin() )
+                {
+                    // not contained in any chain.  false negative
+                }else{
+                    // make a list of pairs for each position
+                    // set_intersection always puts elements from the first set into the output buffer,
+                    // since the elements in the second set may have the same interval index but a different
+                    // match component, we want a list of those as well
+                    tracker_vector_t::iterator last_int2 = std::set_intersection(
+                        coord_iv_map[trans_pos2].begin(),coord_iv_map[trans_pos2].end(),
+                        coord_iv_map[trans_pos1].begin(),coord_iv_map[trans_pos1].end(),
+                        intersect_buf2.begin(), pfc );
+
+                    size_t pcount = last_int1 - intersect_buf1.begin();
+                    bool found = false;    // set this to true if at least one element has different match components
+                    for( size_t pI = 0; pI < pcount; ++pI )
+                    {
+                        // make sure they're not in the same component (probably a very rare occurrence)
+                        size_t component_1 = intersect_buf1[pI].second;
+                        size_t component_2 = intersect_buf2[pI].second;
+                        size_t ivI = intersect_buf1[pI].first;
+                        if( component_1 == component_2 )
+                            continue;    // no alignment here
+
+                        // make sure the relative orientations match
+                        bool cor_orient = (cor[corI].pos1 > 0) == (cor[corI].pos2 > 0);
+                        bool calc_orient = (procrastinated[ivI].Orientation(component_1) == procrastinated[ivI].Orientation(component_2));
+                        if( cor_orient != calc_orient )
+                            continue;    // calculated alignment has the wrong strand
+
+                        // make sure they're not aligned to something else...
+                        CompactGappedAlignment<>* cga = dynamic_cast< CompactGappedAlignment<>* >(procrastinated[ivI].GetMatches()[0]);
+                        if( cga == NULL )
+                            throw "Error: expecting procrastAligner interval to contain a CompactGappedAlignment";
+                        size_t col_1 = cga->SeqPosToColumn(component_1, trans_pos1);
+                        const vector< bitset_t >& aln_mat = cga->GetAlignment();
+                        // if they're not aligned, make sure they're in the same gap.
+                        // they might get aligned later if we were to actually align the procrastAligner chains
+                        // instead of just finding anchors.
+                        if( !aln_mat[component_2].test(col_1) )
+                        {
+                            // if we encounter any columns between col_1 and col_2 that have
+                            // component_1 and component_2 aligned then we wouldn't ever align
+                            // pos_1 and pos_2 without changing the anchoring
+                            size_t col_2 = cga->SeqPosToColumn(component_2, trans_pos2);
+                            size_t col_first = col_1;
+                            size_t col_last = col_2;
+                            // order the two columns so that the scan below runs left to right;
+                            // swapping on the opposite condition would leave the range empty
+                            // and the anchor check would never fire
+                            if( col_last < col_first )
+                                swap(col_first, col_last);
+                            size_t colI = col_first;
+                            for( ; colI <= col_last; ++colI )
+                            {
+                                if( aln_mat[component_1].test(colI) && aln_mat[component_2].test(colI) )
+                                    break;
+                            }
+                            if( colI <= col_last )
+                                continue;    // an anchor intervenes... bummer.
+                        }
+
+                        // mark these components as good
+                        found = true;
+                        component_hits[ivI].set( component_1 );
+                        component_hits[ivI].set( component_2 );
+
+                        // Always use the smallest component first
+                        if( component_2 < component_1 )
+                            swap(component_1, component_2);
+
+                        // row-major index into the n*n bitset gives a unique bit for each component pair
+                        size_t sig = component_1 * cga->SeqCount() + component_2;
+                        pairwise_component_hits[ivI].set( sig );
+                    }
+                    if( found )
+                        sp_truepos++;
+                }
+            }
+        }
+    }
+
+    cout << "\ndone!\n";
+    // yaaay!  we're done.  report score.
+    cout << "sp_truepos " << sp_truepos << endl;
+    cout << "sp_possible " << sp_possible << endl;
+    cout << "SP sensitivity: " << ((double)sp_truepos) / ((double)sp_possible) << endl;
+    double components_correct = 0;
+    double components_possible = 0;
+    for( size_t ivI = 0; ivI < component_hits.size(); ++ivI )
+    {
+        components_correct += component_hits[ivI].count();
+        components_possible += component_hits[ivI].size();
+    }
+    cout << "Match component PPV: " << components_correct / components_possible << endl;
+
+    double pairwise_components_correct = 0;
+    for( size_t ivI = 0; ivI < pairwise_component_hits.size(); ++ivI )
+    {
+        pairwise_components_correct += pairwise_component_hits[ivI].count();
+    }
+    cout << "Pairwise match component PPV: " << pairwise_components_correct / (double)all_component_pair_count << endl;
+}
+
+
+/**
+ * program to score alignments
+ * reads in a "correct" alignment and a procrastinated alignment
+ * scores the procrastinated alignment based on the correct one
+ *
+ * The three inputs are given as named options: --correct, --calculated and
+ * --sequence.  Returns 1 for --help or an option parsing error, -1 when no
+ * arguments are given or an alignment file cannot be opened.
+ */
+int main( int argc, char* argv[] )
+{
+
+    string correct_aln_fname;
+    string procrast_aln_fname;
+    string sequence_file;
+
+
+    if( argc < 2 ){
+        // only named options are parsed below, so advertise those
+        cout << "scoreProcrastAlignment --correct <correct alignment> --calculated <procrastAligner output> --sequence <FastA sequence file>\n";
+        return -1;
+    }
+    // Declare the supported options.
+
+    po::variables_map vm;
+    try {
+
+        po::options_description desc("Allowed options");
+        desc.add_options()
+            ("help", "get help message")
+            ("correct", po::value<string>(&correct_aln_fname), "correct Alignment(XMFA)")
+            ("calculated", po::value<string>(&procrast_aln_fname), "procrastAligner output")
+            ("sequence", po::value<string>(&sequence_file), "FastA sequence file")
+        ;
+
+
+        po::store(po::parse_command_line(argc, argv, desc), vm);
+        po::notify(vm);
+
+        if (vm.count("help")) {
+            cout << desc << "\n";
+            return 1;
+        }
+
+
+    }
+    catch(exception& e) {
+        cerr << "error: " << e.what() << "\n";
+        return 1;
+    }
+    catch(...) {
+        cerr << "Exception of unknown type!\n";
+        // the options are in an unknown state; do not carry on with them
+        return 1;
+    }
+
+
+
+    ifstream correct_in;
+    correct_in.open( correct_aln_fname.c_str() );
+    if( !correct_in.is_open() ){
+        cerr << "Error opening " << correct_aln_fname << endl;
+        return -1;
+    }
+    ifstream procrast_in;
+    procrast_in.open( procrast_aln_fname.c_str() );
+    if( !procrast_in.is_open() ){
+        cerr << "Error opening " << procrast_aln_fname << endl;
+        return -1;
+    }
+
+try{
+    IntervalList correct_ivs;
+    IntervalList procrast_ivs;
+    cout << "Reading correct alignment into interval list...";
+    correct_ivs.ReadStandardAlignment( correct_in );
+    cout << " finished" << endl;
+    correct_in.close();
+
+    cout << "Reading procrastAlignment into interval list...";
+    procrast_ivs.ReadStandardAlignmentCompact( procrast_in );
+    cout << " finished" << endl;
+    procrast_in.close();
+
+    // the --sequence file is expected to hold the unaligned sequences
+    gnSequence concat_sequence;
+    concat_sequence.LoadSource( sequence_file );
+    compareAlignmentsAceD( correct_ivs, procrast_ivs, concat_sequence );
+
+}catch( gnException& gne ){
+    cerr << gne << endl;
+}catch( exception& e ){
+    cerr << e.what() << endl;
+}catch( char const* c ){
+    // constructCoordList and compareAlignmentsAceD report errors as string literals
+    cerr << c << endl;
+}
+
+}
diff --git a/src/sortContigs.cpp b/src/sortContigs.cpp
new file mode 100644
index 0000000..333d646
--- /dev/null
+++ b/src/sortContigs.cpp
@@ -0,0 +1,181 @@
+#include "libMems/IntervalList.h"
+#include "libMems/Aligner.h"
+#include "libGenome/gnFASSource.h"
+#include <fstream>
+#include <string>
+#include <vector>
+#include <iomanip>
+
+using namespace std;
+using namespace genome;
+using namespace mems;
+
+
+/**
+ * sortContigs <Mauve Alignment> <reference sequence #>
+ *
+ * Reorders the contigs of every non-reference sequence so that they follow
+ * the order of the n-way LCBs along the reference sequence, appends the
+ * contigs the alignment did not place, and writes each reordered sequence
+ * to "<sequence file name>.reordered" in FastA format.
+ *
+ * Returns -1 on a usage error, an unreadable alignment, an out-of-range
+ * reference sequence number or an output file that cannot be opened.
+ */
+int main( int argc, char* argv[] )
+{
+    //
+    if( argc != 3 )
+    {
+        cerr << "Usage: sortContigs <Mauve Alignment> <reference sequence #>\n";
+        cerr << "Where <Mauve Alignment> is the .mln file generated by Mauve, NOT the .alignment\n";
+        cerr << "Sequences are numbered from 0 in the order they were input to Mauve\n";
+        cerr << "This program will write out a new reordered FastA file for each of the non-reference sequences\n";
+        return -1;
+    }
+    ifstream aln_file( argv[1] );
+    if( !aln_file.is_open() )
+    {
+        cerr << "Error opening \"" << argv[1] << "\"\n";
+        return -1;
+    }
+
+    uint ref_seqI = atoi( argv[2] );
+
+try{
+    IntervalList iv_list;
+    iv_list.ReadList( aln_file );
+    cerr << "Read " << argv[1] << endl;
+    LoadSequences(iv_list, &cout );
+
+    // remove all but the n-way intervals
+    IntervalList nway_iv_list;
+    for( uint ivI = 0; ivI < iv_list.size(); ivI++ )
+    {
+        // an interval is n-way when no sequence has a start of 0
+        uint def_seqI = 0;
+        for( ; def_seqI < iv_list.seq_table.size(); def_seqI++ )
+            if( iv_list[ivI].Start( def_seqI ) == 0 )
+                break;
+        if( def_seqI == iv_list.seq_table.size() )
+            nway_iv_list.push_back( iv_list[ivI] );
+    }
+    iv_list.erase(iv_list.begin(), iv_list.end() );
+    iv_list.insert( iv_list.end(), nway_iv_list.begin(), nway_iv_list.end() );
+
+    // compute LCB adjacencies
+    vector< int64 > weights = vector< int64 >( iv_list.size(), 1 );
+    vector< LCB > adjacencies;
+    cerr << "computeLCBAdjacencies\n";
+    computeLCBAdjacencies_v2( iv_list, weights, adjacencies );
+    uint seq_count = iv_list.seq_filename.size();
+
+    // the reference number comes straight from the command line and is used
+    // as an index below, so reject anything outside the alignment
+    if( ref_seqI >= seq_count || ref_seqI >= iv_list.seq_table.size() )
+    {
+        cerr << "Error: reference sequence # " << argv[2] << " is out of range, the alignment has " << seq_count << " sequences\n";
+        return -1;
+    }
+
+    vector< gnSequence* > new_seq_table = vector< gnSequence* >( seq_count );
+    for( uint seqI = 0; seqI < seq_count; seqI++ )
+        new_seq_table[seqI] = new gnSequence();
+    // the reference keeps its original sequence object
+    delete new_seq_table[ref_seqI];
+    new_seq_table[ref_seqI] = iv_list.seq_table[ref_seqI];
+
+    // start from the LCB that has no left neighbour in the reference
+    uint leftmost_lcb = 0;
+    for( ; leftmost_lcb < adjacencies.size(); leftmost_lcb++ )
+        if( adjacencies[ leftmost_lcb ].left_adjacency[ref_seqI] == -1 )
+            break;
+    uint adjI = leftmost_lcb;
+    vector< set< uint > > placed_contigs = vector< set< uint > >( iv_list.seq_table.size() );
+    cerr << "placing contigs\n";
+
+    while( adjI != -1 && adjI != -2 && adjI < adjacencies.size() )
+    {
+        for( uint seqI = 0; seqI < seq_count; seqI++ )
+        {
+            if( seqI == ref_seqI )
+                continue;
+            int64 lend = absolut(adjacencies[ adjI ].left_end[seqI] );
+            int64 rend = absolut(adjacencies[ adjI ].right_end[seqI] ) - 1;
+            bool cur_forward = (adjacencies[ adjI ].left_end[seqI] > 0);
+            bool ref_forward = (adjacencies[ adjI ].left_end[ref_seqI] > 0);
+            bool forward = cur_forward == ref_forward;
+            uint r_contig = 0, l_contig = 0;
+            bool contigs_found = true;
+            try{
+                l_contig = iv_list.seq_table[seqI]->contigIndexByBase( absolut(lend) );
+            }catch( gnException& gne )
+            {
+                cerr << gne << endl;
+                cerr << "Thrown while getting contig for base lend: " << absolut(lend) << endl;
+                contigs_found = false;
+            }
+            try{
+                r_contig = iv_list.seq_table[seqI]->contigIndexByBase( absolut(rend) );
+            }catch( gnException& gne )
+            {
+                cerr << gne << endl;
+                cerr << "Thrown while getting contig for base rend: " << absolut(rend) << endl;
+                contigs_found = false;
+            }
+            // without both contig indices there is no range to place for this LCB
+            if( !contigs_found )
+                continue;
+
+            uint first_contig = forward? l_contig : r_contig;
+            uint last_contig = forward? r_contig : l_contig;
+            // work with 1-based indices so the descending unsigned loop can terminate
+            first_contig++;
+            last_contig++;
+            try{
+                for( uint contigI = first_contig; forward? (contigI <= last_contig) : (contigI >= last_contig); (forward? contigI++ : contigI--) )
+                {
+                    // place these if they haven't already been placed
+                    set< uint >::iterator p_iter = placed_contigs[seqI].find(contigI-1);
+                    if( p_iter != placed_contigs[seqI].end() )
+                        continue;    // already placed this contig
+                    bool appended = true;
+                    try{
+                        (*new_seq_table[ seqI ]) += iv_list.seq_table[seqI]->contig(contigI-1);
+                    }catch( gnException& gne ){
+                        cerr << gne << endl;
+                        cerr << "Thrown while accessing seq " << seqI << " contig " << contigI-1<< endl;
+                        appended = false;
+                    }
+                    // only flip the contig that was just appended; after a failed
+                    // append the last contig in the list is an earlier one
+                    if( appended && !forward )
+                        new_seq_table[seqI]->setReverseComplement( true, new_seq_table[seqI]->contigListLength()-1 );
+                    placed_contigs[seqI].insert( contigI-1 );
+                }
+            }catch( gnException& gne ){
+                cerr << gne << endl;
+                cerr << "Thrown while adding contigs in the range: " << first_contig << " to " << last_contig << endl;
+            }
+        }
+        adjI = adjacencies[ adjI ].right_adjacency[ref_seqI];
+    }
+    cerr << "adding unplaced contigs\n";
+
+    // add any remaining contigs that the alignment didn't place
+    for( uint seqI = 0; seqI < seq_count; seqI++ )
+    {
+        if( seqI == ref_seqI )
+            continue;
+        for( uint contigI = 0; contigI < iv_list.seq_table[seqI]->contigListLength(); contigI++ )
+        {
+            // place this contig if it hasn't already been placed
+            set< uint >::iterator p_iter = placed_contigs[seqI].find(contigI);
+            if( p_iter != placed_contigs[seqI].end() )
+                continue;    // already placed this contig
+            (*new_seq_table[ seqI ]) += iv_list.seq_table[seqI]->contig(contigI);
+            placed_contigs[seqI].insert( contigI );
+        }
+    }
+
+    cerr << "writing reordered sequence\n";
+    for( uint seqI = 0; seqI < seq_count; seqI++ )
+    {
+        for( uint contigI = 0; contigI < new_seq_table[seqI]->contigListSize(); contigI++ )
+        {
+            // new name: zero-padded index, "_", first word of the old name, then the strand
+            string name = new_seq_table[seqI]->contigName( contigI );
+            stringstream ss( name );
+            string new_name;
+            ss >> new_name;
+            stringstream new_ss;
+            int fillsize = ceil(log((double)new_seq_table[seqI]->contigListSize())/log(10.0));
+            new_ss << setfill('0') << setw(fillsize);
+            new_ss << contigI << "_" << new_name;
+            if( new_seq_table[seqI]->isReverseComplement(contigI) )
+                new_ss << "-";
+            else
+                new_ss << "+";
+            new_seq_table[seqI]->setContigName( contigI, new_ss.str() );
+        }
+        if( seqI == ref_seqI )
+            continue;    // reference sequence didn't change
+        string o_filename = iv_list.seq_filename[ seqI ] + ".reordered";
+        ofstream out_file( o_filename.c_str() );
+        if( !out_file.is_open() )
+        {
+            cerr << "Error opening \"" << o_filename << "\"\n";
+            return -1;
+        }
+        gnFASSource::Write( *new_seq_table[seqI], out_file, false, false );
+    }
+
+}catch(gnException& gne){
+    cerr << gne << endl;
+}
+}
diff --git a/src/stripGapColumns.cpp b/src/stripGapColumns.cpp
new file mode 100644
index 0000000..cde0499
--- /dev/null
+++ b/src/stripGapColumns.cpp
@@ -0,0 +1,74 @@
+#include "libMems/IntervalList.h"
+#include "libMems/MatchList.h"
+#include "libMems/GappedAlignment.h"
+#include <fstream>
+#include <string>
+#include <vector>
+
+using namespace std;
+using namespace genome;
+using namespace mems;
+
+int main( int argc, char* argv[] )
+{
+    // stripGapColumns <input XMFA> <output XMFA>
+    // Copies every interval of the input alignment to the output, keeping only
+    // the columns in which no sequence has a gap character.
+    // Returns -1 on a usage error or unreadable input, -2 when the output
+    // cannot be opened, 0 otherwise.
+    if( argc != 3 )
+    {
+        cerr << "Usage: stripGapColumns <input XMFA> <output XMFA>\n";
+        return -1;
+    }
+
+    ifstream aln_infile( argv[1] );
+    if( !aln_infile.is_open() )
+    {
+        cerr << "Error opening \"" << argv[1] << "\"\n";
+        return -1;
+    }
+
+    IntervalList iv_list;
+    iv_list.ReadStandardAlignment( aln_infile );
+    LoadSequences( iv_list, &cout );
+
+    // the output list refers to the same sequences as the input
+    IntervalList iv_outlist;
+    iv_outlist.seq_filename = iv_list.seq_filename;
+    iv_outlist.seq_table = iv_list.seq_table;
+
+    for( uint ivI = 0; ivI < iv_list.size(); ivI++ )
+    {
+        Interval& cur_iv = iv_list[ivI];
+        vector< string > alignment;
+        GetAlignment( cur_iv, iv_list.seq_table, alignment );
+
+        // one output row per sequence, grown column by column
+        vector< string > seq_align( cur_iv.SeqCount() );
+        for( gnSeqI colI = 0; colI < cur_iv.AlignmentLength(); colI++ )
+        {
+            // stop looking at the first gap found in this column
+            bool gap_free = true;
+            for( uint seqI = 0; gap_free && seqI < cur_iv.SeqCount(); seqI++ )
+                gap_free = ( alignment[seqI][colI] != '-' );
+            if( !gap_free )
+                continue;
+
+            for( uint seqI = 0; seqI < cur_iv.SeqCount(); seqI++ )
+                seq_align[seqI].push_back( alignment[seqI][colI] );
+        }
+
+        // wrap the surviving columns in a new alignment that keeps the
+        // start and length values of the source interval
+        GappedAlignment* new_ga = new GappedAlignment( seq_align.size(), seq_align[0].size() );
+        new_ga->SetAlignment( seq_align );
+        for( uint seqI = 0; seqI < cur_iv.SeqCount(); seqI++ )
+        {
+            new_ga->SetStart( seqI, cur_iv.Start( seqI ) );
+            new_ga->SetLength( cur_iv.Length( seqI ), seqI );
+        }
+
+        vector< AbstractMatch* > am_list( 1, new_ga );
+        Interval new_iv(am_list.begin(), am_list.end());
+        iv_outlist.push_back( new_iv );
+    }
+
+    ofstream iv_outfile( argv[2] );
+    if( !iv_outfile.is_open() )
+    {
+        cerr << "Error opening \"" << argv[2] << "\"\n" << endl;
+        return -2;
+    }
+
+    iv_outlist.WriteStandardAlignment( iv_outfile );
+    return 0;
+}
diff --git a/src/stripSubsetLCBs.cpp b/src/stripSubsetLCBs.cpp
new file mode 100644
index 0000000..dd4a362
--- /dev/null
+++ b/src/stripSubsetLCBs.cpp
@@ -0,0 +1,183 @@
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <string>
+#include <fstream>
+#include <vector>
+#include <algorithm>
+#include <iostream>
+#include <sstream>
+#include "libGenome/gnFilter.h"
+#include "libMems/IntervalList.h"
+#include "libMems/MatchList.h"
+#include "libMems/GappedAlignment.h"
+#include "libMems/Matrix.h"
+#include "libMems/MatchProjectionAdapter.h"
+#include "libMems/Aligner.h"
+#include "libGenome/gnFASSource.h"
+#include <boost/tuple/tuple.hpp>
+
+using namespace std;
+using namespace genome;
+using namespace mems;
+
+typedef boost::tuple< uint, gnSeqI, gnSeqI, vector< uint > > bbcol_t; // one parsed bbcols line: <0>, <1>, <2> hold its first three numbers, <3> collects every remaining number
+
+/**
+ * stripSubsetLCBs <input xmfa> <input bbcols> <output xmfa> [min LCB size] [min genomes] [randomly subsample to X kb]
+ *
+ * Cuts the backbone segments listed in the bbcols file out of the input
+ * alignment and writes those that are present in at least [min genomes]
+ * genomes (default: all sequences) and whose mean sequence length is at
+ * least [min LCB size].  When a subsample size is given, randomly drawn
+ * blocks are written until that many kb of alignment columns are reached.
+ *
+ * Returns a negative value when a file cannot be opened or an exception is
+ * caught.
+ */
+int main( int argc, char* argv[] )
+{
+    if( argc < 4 )
+    {
+        cerr << "Usage: stripSubsetLCBs <input xmfa> <input bbcols> <output xmfa> [min LCB size] [min genomes] [randomly subsample to X kb]\n";
+        return -1;
+    }
+    ifstream aln_in;
+    aln_in.open( argv[1] );
+    if( !aln_in.is_open() ){
+        cerr << "Error opening " << argv[1] << endl;
+        return -1;
+    }
+    ifstream bbcols_in;
+    bbcols_in.open( argv[2] );
+    if( !bbcols_in.is_open() )
+    {
+        cerr << "Error opening " << argv[2] << endl;
+        return -2;
+    }
+    ofstream aln_out;
+    aln_out.open( argv[3] );
+    if( !aln_out.is_open() ){
+        cerr << "Error writing to " << argv[3] << endl;
+        return -1;
+    }
+
+    size_t min_block_length = 0;
+    if(argc>=5){
+        min_block_length = atol(argv[4]);
+    }
+    // (size_t)-1 means "not given"; replaced by the sequence count below
+    size_t min_genome_count = -1;
+    if(argc>=6){
+        min_genome_count = atol(argv[5]);
+    }
+    size_t subsample_kb = 0;
+    if(argc>=7){
+        subsample_kb = atol(argv[6]);
+    }
+
+
+    try{
+        IntervalList input_ivs;
+        input_ivs.ReadStandardAlignment( aln_in );
+        aln_in.close();
+
+        LoadSequences( input_ivs, NULL );
+
+        // read the bbcols file
+        vector< bbcol_t > bbcols;
+        string cur_line;
+        while( getline( bbcols_in, cur_line ) )
+        {
+            stringstream line_str(cur_line);
+            size_t cur_token;
+            size_t tokenI = 0;
+            bbcol_t bbcol;
+            while( line_str >> cur_token )
+            {
+                switch(tokenI)
+                {
+                    case 0:
+                        bbcol.get<0>() = cur_token;
+                        break;
+                    case 1:
+                        bbcol.get<1>() = cur_token;
+                        break;
+                    case 2:
+                        bbcol.get<2>() = cur_token;
+                        break;
+                    default:
+                        bbcol.get<3>().push_back(cur_token);
+                        break;
+                }
+                tokenI++;
+            }
+            // a blank or truncated line carries no usable entry
+            if( tokenI < 3 )
+                continue;
+            bbcols.push_back(bbcol);
+        }
+        cout << "Read " << bbcols.size() << " backbone entries\n";
+
+        IntervalList output_ivs;
+        output_ivs.seq_table = input_ivs.seq_table;
+        output_ivs.seq_filename = input_ivs.seq_filename;
+/*        for( size_t i = 0; i < input_ivs.size(); ++i )
+        {
+            cout << "LCB " << i << " multiplicity: " << input_ivs[i].Multiplicity() << endl;
+            for( size_t seqI = 0; seqI < input_ivs.seq_table.size(); ++seqI )
+            {
+                cout << input_ivs[i].LeftEnd(seqI) << '\t' << input_ivs[i].RightEnd(seqI) << '\t';
+            }
+            cout << endl;
+            if( input_ivs[i].Multiplicity() == input_ivs.seq_table.size() )
+                output_ivs.push_back( input_ivs[i] );
+        }
+*/
+        cout << "seq_count is: " << input_ivs.seq_table.size() << endl;
+        if(min_genome_count==(size_t)-1) min_genome_count = input_ivs.seq_table.size();
+
+        for( size_t bbI = 0; bbI < bbcols.size(); bbI++ )
+        {
+            if( bbcols[bbI].get<3>().size() < min_genome_count )
+                continue;
+            // the interval index comes from the file; do not index past the alignment
+            if( bbcols[bbI].get<0>() >= input_ivs.size() )
+            {
+                cerr << "Skipping backbone entry " << bbI << ": interval " << bbcols[bbI].get<0>() << " does not exist in the alignment\n";
+                continue;
+            }
+            Interval* sub_iv = input_ivs[bbcols[bbI].get<0>()].Copy();
+            sub_iv->CropStart( bbcols[bbI].get<1>() - 1 );
+            sub_iv->CropEnd( sub_iv->Length() - bbcols[bbI].get<2>() );
+            // calculate mean length
+            size_t avglen = 0;
+            for(size_t seqI=0; seqI < sub_iv->SeqCount(); seqI++){
+                avglen += sub_iv->Length(seqI);
+            }
+            avglen /= sub_iv->SeqCount();
+            if(avglen >= min_block_length){
+                output_ivs.push_back( *sub_iv );
+            }
+            sub_iv->Free();
+        }
+        if(subsample_kb==0){
+            cout << "output_ivs.size() " << output_ivs.size() << endl;
+            output_ivs.WriteStandardAlignment( aln_out );
+        }else{
+            // draw distinct blocks at random until the requested amount of
+            // alignment columns is reached or every block has been taken;
+            // cur_kb grows only by the length of the blocks actually taken
+            set<size_t> sampled;
+            double cur_kb=0;
+            while( cur_kb < (double)subsample_kb && sampled.size() < output_ivs.size() ){
+                size_t block = rand()%output_ivs.size();
+                // insert().second is false when this block was drawn before
+                if( !sampled.insert(block).second ){
+                    continue;
+                }
+                cur_kb += (double)(output_ivs[block].AlignmentLength()) / 1000.0;
+            }
+            IntervalList new_ivs;
+            new_ivs.seq_table=output_ivs.seq_table;
+            new_ivs.seq_filename=output_ivs.seq_filename;
+            for(set<size_t>::iterator siter = sampled.begin(); siter != sampled.end(); siter++){
+                new_ivs.push_back(output_ivs[*siter]);
+            }
+            cout << "Writing " << cur_kb << " kb of alignment columns in " << new_ivs.size() << " blocks" << endl;
+            new_ivs.WriteStandardAlignment( aln_out );
+        }
+
+    }catch( gnException& gne ){
+        cerr << gne << endl;
+        return -1;
+    }catch( exception& e ){
+        cerr << e.what() << endl;
+        return -2;
+    }catch( char const* c ){
+        cerr << c << endl;
+        return -3;
+    }catch(...){
+        cerr << "Unhandled exception" << endl;
+        return -4;
+    }
+}
+
diff --git a/src/toEvoHighwayFormat.cpp b/src/toEvoHighwayFormat.cpp
new file mode 100644
index 0000000..1f2d725
--- /dev/null
+++ b/src/toEvoHighwayFormat.cpp
@@ -0,0 +1,148 @@
+#include "libMems/IntervalList.h"
+#include "libMems/Aligner.h"
+#include <fstream>
+#include <string>
+#include <vector>
+
+using namespace std;
+using namespace genome;
+using namespace mems;
+
+// Returns the index of the first entry of chr_lens that is greater than pos,
+// or chr_lens.size() when no entry is.
+int getChromosome( vector< int64 >& chr_lens, int64 pos )
+{
+    int chrI = 0;
+    // walk forward while the current entry does not exceed pos
+    while( chrI < chr_lens.size() && !( chr_lens[chrI] > pos ) )
+        chrI++;
+    return chrI;
+}
+
+// convert a number to a four letter base 26 number ('a' = 0 ... 'z' = 25),
+// most significant letter first, e.g. 0 -> "aaaa", 27 -> "aabb".
+// Only the four lowest base 26 digits are kept, so values of 26^4 (456976)
+// and above wrap around.
+string getAlphabetID( uint chromo_counter )
+{
+    string rval = "aaaa";
+    // fill from the rightmost letter; the index has to reach 0, otherwise the
+    // leading letter is never written and only three letters are usable
+    for( int charI = 3; charI >= 0 && chromo_counter > 0; charI-- )
+    {
+        int rem1 = chromo_counter % 26;
+        chromo_counter /= 26;
+        rval[charI] = (char)(rem1 + 'a');
+    }
+    return rval;
+}
+
+/**
+ * toEvoHighwayFormat <Mauve Alignment> <reference genome id> <genome 1 chr lengths>...<genome N chr lengths>
+ *
+ * For every non-reference genome, writes one tab separated row per LCB to
+ * cout: reference name, reference chromosome and interval, target
+ * chromosome plus a four letter block id, target interval, strand, target
+ * name and LCB id.  Each chr lengths file lists the chromosome lengths of
+ * one genome; they are accumulated into chromosome end coordinates.
+ *
+ * Returns -1 on a usage error, an unreadable alignment or inconsistent
+ * arguments, -2 when a chr lengths file is unreadable or empty.
+ */
+int main( int argc, char* argv[] )
+{
+    //
+    if( argc < 4 )
+    {
+        cerr << "Usage: toEvoHighwayFormat <Mauve Alignment> <reference genome id> <genome 1 chr lengths>...<genome N chr lengths>\n";
+        return -1;
+    }
+    ifstream aln_file( argv[1] );
+    if( !aln_file.is_open() )
+    {
+        cerr << "Error opening \"" << argv[1] << "\"\n";
+        return -1;
+    }
+    uint ref_id = atoi( argv[2] );
+
+    vector< vector< int64 > > chr_lens;
+    for( uint genomeI = 3; genomeI < argc; genomeI++ )
+    {
+        ifstream cur_file( argv[genomeI] );
+        if( !cur_file.is_open() )
+        {
+            cerr << "Error opening \"" << argv[genomeI] << "\"\n";
+            return -2;
+        }
+        // store cumulative lengths, i.e. the end coordinate of each chromosome
+        int64 cur_len = 0;
+        vector< int64 > len_vector;
+        while( cur_file >> cur_len )
+        {
+            if( len_vector.size() > 0 )
+                len_vector.push_back( cur_len + len_vector[ len_vector.size() - 1 ] );
+            else
+                len_vector.push_back( cur_len );
+        }
+        // every genome needs at least one chromosome; the code below indexes element 0
+        if( len_vector.empty() )
+        {
+            cerr << "Error: no chromosome lengths found in \"" << argv[genomeI] << "\"\n";
+            return -2;
+        }
+        chr_lens.push_back( len_vector );
+        cerr << "Read " << argv[genomeI] << ", " << len_vector.size() << " chromosomes covering " << len_vector[len_vector.size()-1] << " nt " << endl;
+    }
+try{
+    IntervalList iv_list;
+    iv_list.ReadList( aln_file );
+    cerr << "Read " << argv[1] << endl;
+    vector< int64 > weights = vector< int64 >( iv_list.size(), 1 );
+    vector< LCB > adjacencies;
+    cerr << "computeLCBAdjacencies\n";
+    computeLCBAdjacencies_v2( iv_list, weights, adjacencies );
+    uint seq_count = iv_list.seq_filename.size();
+
+    // chr_lens and the adjacency vectors are indexed by genome below, so the
+    // command line has to cover every genome of the alignment
+    if( chr_lens.size() < seq_count )
+    {
+        cerr << "Error: the alignment has " << seq_count << " genomes but only " << chr_lens.size() << " chr lengths files were given\n";
+        return -1;
+    }
+    if( ref_id >= seq_count )
+    {
+        cerr << "Error: reference genome id " << argv[2] << " is out of range, the alignment has " << seq_count << " genomes\n";
+        return -1;
+    }
+
+    for( uint seqI = 0; seqI < seq_count; seqI++ )
+    {
+        if( seqI == ref_id )
+            continue;
+        // start from the LCB that has no left neighbour in this genome
+        uint leftmost_lcb = 0;
+        for( ; leftmost_lcb < adjacencies.size(); leftmost_lcb++ )
+            if( adjacencies[ leftmost_lcb ].left_adjacency[seqI] == -1 )
+                break;
+        uint adjI = leftmost_lcb;
+        uint cur_chromosome = 0;
+        uint chromo_counter = 0;
+
+        while( adjI != -1 && adjI != -2 && adjI < adjacencies.size() )
+        {
+            // advance to the chromosome that contains this LCB; it may lie
+            // several chromosomes further on, and the index must stay inside
+            // the length table
+            while( cur_chromosome + 1 < chr_lens[seqI].size() &&
+                absolut(adjacencies[adjI].left_end[seqI]) > chr_lens[seqI][cur_chromosome] )
+            {
+                cur_chromosome++;
+                chromo_counter = 0;
+            }
+
+            // write out a row for an evo highway synteny block
+            // write ref name
+            cout << iv_list.seq_filename[ref_id];
+            // write ref chromosome
+            int ref_chr = getChromosome( chr_lens[ref_id], absolut(adjacencies[adjI].left_end[ref_id]) );
+            cout << '\t' << ref_chr + 1;
+
+            // write ref interval, relative to the start of its chromosome
+            if( ref_chr > 0 )
+            {
+                cout << '\t' << absolut(adjacencies[ adjI ].left_end[ref_id]) - chr_lens[ref_id][ref_chr - 1];
+                cout << '\t' << absolut(adjacencies[ adjI ].right_end[ref_id]) - chr_lens[ref_id][ref_chr - 1];
+            }else{
+                cout << '\t' << absolut(adjacencies[ adjI ].left_end[ref_id]);
+                cout << '\t' << absolut(adjacencies[ adjI ].right_end[ref_id]);
+            }
+
+            // write species chromosome
+            cout << '\t' << cur_chromosome + 1;
+            cout << getAlphabetID( chromo_counter );
+            // write species interval, relative to the start of its chromosome
+            if( cur_chromosome > 0 )
+            {
+                cout << '\t' << absolut(adjacencies[ adjI ].left_end[seqI]) - chr_lens[seqI][cur_chromosome - 1];
+                cout << '\t' << absolut(adjacencies[ adjI ].right_end[seqI]) - chr_lens[seqI][cur_chromosome - 1];
+            }else{
+                cout << '\t' << absolut(adjacencies[ adjI ].left_end[seqI]);
+                cout << '\t' << absolut(adjacencies[ adjI ].right_end[seqI]);
+            }
+            // write strand: -1 when the two genomes have opposite orientation, 1 otherwise
+            cout << '\t';
+            if( ( adjacencies[ adjI ].left_end[ref_id] > 0 && adjacencies[ adjI ].left_end[seqI] < 0 ) ||
+                ( adjacencies[ adjI ].left_end[ref_id] < 0 && adjacencies[ adjI ].left_end[seqI] > 0 ) )
+                cout << "-";
+            cout << 1;
+            // write target name
+            cout << '\t' << iv_list.seq_filename[seqI];
+            // write lcb id
+            cout << '\t' << adjacencies[adjI].lcb_id + 1 << endl;
+            adjI = adjacencies[adjI].right_adjacency[seqI];
+            chromo_counter++;
+        }
+    }
+}catch(gnException& gne){
+    cerr << gne << endl;
+}
+}
diff --git a/src/toGBKsequence.cpp b/src/toGBKsequence.cpp
new file mode 100644
index 0000000..f61977f
--- /dev/null
+++ b/src/toGBKsequence.cpp
@@ -0,0 +1,38 @@
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libGenome/gnSequence.h"
+#include "libGenome/gnGBKSource.h"
+#include "libGenome/gnStringHeader.h"
+
+using namespace genome;
+using namespace std;
+
+int main( int argc, char* argv[] ){
+ // Writes each contig of the input sequence to its own "<contig name>.gbk" file; argv[2] is required but not used.
+ if( argc != 3 ){
+ cout << argv[0] << " <input sequence> <output file>\n";
+ return -1; // stop here: argv[1] is read below and may not exist
+ }
+ gnSequence seq;
+ try{
+ seq.LoadSource( argv[1] );
+ cout << argv[1] << " has " << seq.contigListLength() << " contigs\n";
+ for( int contigI = 0; contigI < seq.contigListLength(); contigI++ ){
+ gnSequence contig = seq.contig( contigI );
+ string contig_name = seq.contigName( contigI );
+ cout << "contig " << contig_name << " has " << contig.length() << "b.p.\n";
+ // add all necessary headers: a LOCUS line built from the contig name
+ string locus_hdr = "LOCUS " + contig_name + " DNA CON 27-Jan-2005";
+ gnStringHeader* gnsh = new gnStringHeader( "LOCUS", locus_hdr );
+ contig.addHeader( 0, gnsh, 0 ); // NOTE(review): presumably addHeader takes ownership of gnsh - confirm
+ gnGBKSource::Write( contig, contig_name+".gbk" );
+ }
+// gnRAWSource::Write( seq, argv[2] );
+ }catch( gnException& gne ){
+ cerr << gne << endl;
+ return -1;
+ }
+ return 0;
+}
diff --git a/src/toGrimmFormat.cpp b/src/toGrimmFormat.cpp
new file mode 100644
index 0000000..b36475e
--- /dev/null
+++ b/src/toGrimmFormat.cpp
@@ -0,0 +1,84 @@
+#include "libMems/IntervalList.h"
+#include "libMems/Aligner.h"
+#include <fstream>
+#include <string>
+#include <vector>
+
+using namespace std;
+using namespace genome;
+using namespace mems;
+
+
+int main( int argc, char* argv[] )
+{
+ // Prints, for every genome of a Mauve alignment, its LCB order as signed block numbers with "$" closing each chromosome.
+ // argv[1] is the alignment; each further argument is one genome's file of chromosome lengths.
+ if( argc < 4 ){
+ cerr << "Usage: toGrimmFormat <Mauve Alignment> <genome 1 chr lengths>...<genome N chr lengths>\n";
+ return -1;
+ }
+ ifstream aln_file( argv[1] );
+ if( !aln_file.is_open() ){
+ cerr << "Error opening \"" << argv[1] << "\"\n";
+ return -1;
+ }
+ vector< vector< int64 > > chr_lens; // chr_lens[g][c] = cumulative end coordinate of chromosome c in genome g
+ for( int genomeI = 2; genomeI < argc; genomeI++ ){
+ ifstream cur_file( argv[genomeI] );
+ if( !cur_file.is_open() ){
+ cerr << "Error opening \"" << argv[genomeI] << "\"\n";
+ return -2;
+ }
+ int64 cur_len = 0;
+ vector< int64 > len_vector;
+ while( cur_file >> cur_len )
+ len_vector.push_back( len_vector.empty() ? cur_len : cur_len + len_vector.back() );
+ if( len_vector.empty() ){ // nothing parsed: the summary below and later lookups would read out of bounds
+ cerr << "Error: no chromosome lengths found in \"" << argv[genomeI] << "\"\n";
+ return -2;
+ }
+ chr_lens.push_back( len_vector );
+ cerr << "Read " << argv[genomeI] << ", " << len_vector.size() << " chromosomes covering " << len_vector.back() << " nt " << endl;
+ }
+try{
+ IntervalList iv_list;
+ iv_list.ReadList( aln_file );
+ cerr << "Read " << argv[1] << endl;
+ vector< int64 > weights = vector< int64 >( iv_list.size(), 1 );
+ vector< LCB > adjacencies;
+ cerr << "computeLCBAdjacencies\n";
+ computeLCBAdjacencies_v2( iv_list, weights, adjacencies );
+ uint seq_count = iv_list.seq_filename.size();
+ if( seq_count > chr_lens.size() ){ // chr_lens is indexed by genome below, so each genome needs its own table
+ cerr << "Error: alignment has " << seq_count << " genomes but only " << chr_lens.size() << " chromosome length files were given\n";
+ return -2;
+ }
+ for( uint seqI = 0; seqI < seq_count; seqI++ ){
+ cerr << "Analyzing seq " << seqI << endl;
+ cout << ">" << iv_list.seq_filename[seqI] << endl;
+ uint leftmost_lcb = 0;
+ for( ; leftmost_lcb < adjacencies.size(); leftmost_lcb++ )
+ if( adjacencies[ leftmost_lcb ].left_adjacency[seqI] == -1 )
+ break;
+ uint adjI = leftmost_lcb;
+ uint cur_chromosome = 0;
+ while( adjI != -1 && adjI != -2 && adjI < adjacencies.size() ){
+ // close the chromosome once an LCB starts beyond its end, but never step past the last one listed
+ if( cur_chromosome + 1 < chr_lens[seqI].size() && absolut(adjacencies[ adjI ].left_end[seqI]) > chr_lens[seqI][cur_chromosome] ){
+ cout << " $\n";
+ cur_chromosome++;
+ }else if( adjI != leftmost_lcb )
+ cout << " ";
+ if( adjacencies[ adjI ].left_end[seqI] < 0 )
+ cout << "-";
+ cout << adjacencies[ adjI ].lcb_id + 1;
+ adjI = adjacencies[ adjI ].right_adjacency[seqI];
+ }
+ cout << " $" << endl;
+ }
+}catch(gnException& gne){
+ cerr << gne << endl;
+ return -1;
+}
+ return 0;
+}
diff --git a/src/toMultiFastA.cpp b/src/toMultiFastA.cpp
new file mode 100644
index 0000000..3afaaa3
--- /dev/null
+++ b/src/toMultiFastA.cpp
@@ -0,0 +1,54 @@
+#include "libMems/Interval.h"
+#include "libMems/Islands.h"
+#include "libGenome/gnFASSource.h"
+
+using namespace std;
+using namespace genome;
+using namespace mems;
+
+int main( int argc, char* argv[] )
+{
+ // Writes every interval of the input alignment to its own multi-FastA file named "<output base name>.lcb_<index>".
+ IntervalList iv_list;
+ if( argc != 3 ){
+ cerr << "Usage: <input interval file> <output base name>";
+ return -1;
+ }
+ ifstream in_file( argv[1] );
+ if( !in_file.is_open() ){
+ cerr << "Error opening \"" << argv[1] << "\"\n";
+ return -1;
+ }
+try{
+ iv_list.ReadList( in_file );
+ LoadSequences(iv_list, NULL);
+ string base_name = argv[2];
+ cout << "Input alignment has " << iv_list.size() << " intervals\n";
+ for( uint lcbI = 0; lcbI < iv_list.size(); lcbI++ ){
+ gnAlignedSequences gnas;
+ iv_list[lcbI].GetAlignedSequences( gnas, iv_list.seq_table );
+ stringstream lcb_filename;
+ lcb_filename << base_name << ".lcb_" << lcbI;
+ ofstream out_file( lcb_filename.str().c_str() );
+ if( !out_file.is_open() ){
+ cerr << "Error opening \"" << lcb_filename.str() << "\"\n";
+ return -2;
+ }
+ // write a multi-FastA: one contig per aligned sequence, named by its index
+ gnSequence gns;
+ for( uint seqI = 0; seqI < gnas.sequences.size(); seqI++ ){
+ stringstream seq_name;
+ seq_name << seqI;
+// seq_name << "(" << iv_list[lcbI].Start(seqI) << "-" << iv_list[lcbI].Start(seqI) + iv_list[lcbI].Length(seqI) << ")";
+ gns += gnas.sequences[seqI];
+ gns.setContigName( gns.contigListSize()-1, seq_name.str() );
+ }
+ gnFASSource::Write( gns, out_file, false, false );
+ }
+}catch( gnException& gne ){
+ cerr << gne << endl; // report a library error instead of terminating on an uncaught exception
+ return -3;
+}
+ return 0;
+}
+
diff --git a/src/toRawSequence.cpp b/src/toRawSequence.cpp
new file mode 100644
index 0000000..36ab3dd
--- /dev/null
+++ b/src/toRawSequence.cpp
@@ -0,0 +1,27 @@
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libGenome/gnSequence.h"
+#include "libGenome/gnRAWSource.h"
+
+using namespace std;
+using namespace genome;
+
+
+int main( int argc, char* argv[] ){
+ if( argc != 3 ){ // exactly <input sequence> and <output file> are expected
+ cout << argv[0] << " <input sequence> <output file>\n";
+ return -1; // stop here: argv[1] and argv[2] are used below and may not exist
+ }
+ gnSequence seq;
+ try{ // load argv[1], report its length, then hand it to gnRAWSource::Write with argv[2]
+ seq.LoadSource( argv[1] );
+ cout << argv[1] << " is " << seq.length() << "b.p.\n";
+ gnRAWSource::Write( seq, argv[2] );
+ }catch( gnException& gne ){
+ cerr << gne << endl;
+ return -1;
+ }
+ return 0;
+}
diff --git a/src/transposeCoordinates.cpp b/src/transposeCoordinates.cpp
new file mode 100644
index 0000000..4dcc906
--- /dev/null
+++ b/src/transposeCoordinates.cpp
@@ -0,0 +1,71 @@
+/*******************************************************************************
+ * $Id: transposeCoordinates.cpp,v 1.1 2004/02/28 00:01:31 darling Exp $
+ * This file is copyright 2002-2004 Aaron Darling. All rights reserved.
+ * Please see the file called COPYING for licensing, copying, and modification
+ * rights. Redistribution of this file, in whole or in part is prohibited
+ * without express permission.
+ ******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <iostream>
+#include "libMems/Aligner.h"
+
+using namespace std;
+using namespace genome;
+using namespace mems;
+
+void print_usage( const char* pname ){ // writes the argument synopsis, labelled with `pname`, to stdout
+ std::cout << "Usage: " << pname << " <match list> <coordinates file> <sequence ID> <match list output>\n";
+}
+
+int main( int argc, const char* argv[] ){
+ // Reads a match list (argv[1]) and a list of coordinates (argv[2]), calls transposeMatches()
+ // with the sequence number given in argv[3], and writes the match list to argv[4].
+ if( argc != 5 ){
+ print_usage("transposeCoordinates");
+ return -1;
+ }
+ const char* const match_path = argv[1];
+ const char* const coord_path = argv[2];
+ const char* const output_path = argv[4];
+
+ ifstream match_file( match_path );
+ if( !match_file.is_open() ){
+ cerr << "Error opening \"" << match_path << "\"" << endl;
+ return -1;
+ }
+
+ ifstream coord_file( coord_path );
+ if( !coord_file.is_open() ){
+ cerr << "Error opening \"" << coord_path << "\"" << endl;
+ return -1;
+ }
+
+ int trans_seq = atoi( argv[3] );
+ // load the matches, then apply MultiplicityFilter with the number of sequence file names in the list
+ MatchList mlist;
+ ReadList( mlist, match_file );
+ mlist.MultiplicityFilter( mlist.seq_filename.size() );
+
+ // collect every integer that can be extracted from the coordinates file
+ vector< int64 > coord_list;
+ for( int64 coord; coord_file >> coord; )
+ coord_list.push_back( coord );
+ transposeMatches( mlist, trans_seq, coord_list );
+// for( uint ivI = 0; ivI < iv_list.size(); ivI++ ){
+// }
+
+ // the output file is only opened after the match list has been processed
+ ofstream match_out( output_path );
+ if( !match_out.is_open() ){
+ cerr << "Error opening \"" << output_path << "\"" << endl;
+ return -1;
+ }
+ WriteList( mlist, match_out );
+ return 0;
+}
+
+
diff --git a/src/unalign.cpp b/src/unalign.cpp
new file mode 100644
index 0000000..3aaf3df
--- /dev/null
+++ b/src/unalign.cpp
@@ -0,0 +1,91 @@
+#include "libMems/IntervalList.h"
+#include "libGenome/gnFASSource.h"
+#include "libMems/GappedAlignment.h"
+#include <algorithm>
+
+using namespace std;
+using namespace genome;
+using namespace mems;
+
+/**
+ * program to extract source sequences from an alignment
+ */
+int main( int argc, char* argv[] ){
+ // argv[1] is the XMFA alignment to read; argv[2] receives all gap-stripped sequences as one Multi-FastA file
+ if( argc < 3 ){
+ cout << "Sometimes you've got an alignment but you just can't seem to find the sequences that went into it." << endl;
+ cout << "unalign <input alignment xmfa> <output Multi-FastA>\n";
+ return -1;
+ }
+
+ string input_fname = argv[ 1 ];
+ string output_fname = argv[ 2 ];
+
+ ifstream alignment_in;
+ alignment_in.open( input_fname.c_str() );
+ if( !alignment_in.is_open() ){
+ cerr << "Error opening " << input_fname << endl;
+ return -1;
+ }
+
+ ofstream mfa_out;
+ mfa_out.open( output_fname.c_str() );
+ if( !mfa_out.is_open() ){
+ cerr << "Error opening " << output_fname << endl;
+ return -1;
+ }
+
+try{
+ IntervalList ivs;
+ cerr << "Reading " << input_fname << endl;
+ ivs.ReadStandardAlignment( alignment_in );
+ alignment_in.close();
+ if( ivs.size() == 0 ){
+ cerr << "Error! The alignment doesn't contain any intervals!\n";
+ return -1;
+ }
+ cerr << "Successfully read " << input_fname << endl;
+ cerr << "Removing gaps...\n";
+ uint seq_count = ivs[ 0 ].SeqCount();
+ gnSequence output_seq;
+ for( uint seqI = 0; seqI < seq_count; seqI++ ){
+ gnSequence cur_seq;
+ // re-sort the intervals with the start comparator for this sequence, then append its aligned rows in that order
+ AbstractMatchStartComparator<Interval> ivcomp(seqI);
+ sort( ivs.begin(), ivs.end(), ivcomp );
+ for( uint ivI = 0; ivI < ivs.size(); ivI++ ){
+ const vector< AbstractMatch* >& matches = ivs[ivI].GetMatches();
+ const vector< string >& alignment = GetAlignment(*((GappedAlignment*)matches[0]), vector<gnSequence*>(seq_count) );
+ cur_seq += alignment[seqI];
+ if(ivs[ivI].LeftEnd(seqI)<0) cur_seq.setReverseComplement(true, cur_seq.contigListLength()-1);
+ }
+ string strseq = cur_seq.ToString();
+ // strip gaps; the loop bound comes from the string that is actually indexed
+ string gapless_seq;
+ for( string::size_type charI = 0; charI < strseq.size(); charI++ ){
+ if( strseq[ charI ] != '-' )
+ gapless_seq += strseq[ charI ];
+ }
+ output_seq += gapless_seq;
+ // name and export the sequence only when a file name is recorded at this index
+ if( seqI < ivs.seq_filename.size() ){
+ output_seq.setContigName(seqI,ivs.seq_filename[seqI]);
+ gnSequence file_seq;
+ file_seq += gapless_seq;
+ gnFASSource::Write( file_seq, ivs.seq_filename[seqI] );
+ }
+ }
+ cerr << "Writing " << output_fname << endl;
+ gnFASSource::Write( output_seq, mfa_out );
+ return 0;
+}catch( gnException& gne ){
+ cerr << gne << endl;
+ return -2;
+}catch( exception& e ){
+ cerr << e.what() << endl;
+ return -3;
+}catch( char const* c ){
+ cerr << c << endl;
+ return -4;
+}
+
+}
diff --git a/src/uniqueMerCount.cpp b/src/uniqueMerCount.cpp
new file mode 100644
index 0000000..86d75a8
--- /dev/null
+++ b/src/uniqueMerCount.cpp
@@ -0,0 +1,41 @@
+/*******************************************************************************
+ * $Id: uniqueMerCount.cpp,v 1.1 2004/02/28 00:01:31 darling Exp $
+ * This file is copyright 2002-2004 Aaron Darling. All rights reserved.
+ * Please see the file called COPYING for licensing, copying, and modification
+ * rights. Redistribution of this file, in whole or in part is prohibited
+ * without express permission.
+ ******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "libMems/DNAFileSML.h"
+
+using namespace std;
+using namespace genome;
+using namespace mems;
+
+void print_usage( const char* pname ){ // writes the argument synopsis, labelled with `pname`, to stderr
+ std::cerr << "Usage: " << pname << " <Sorted Mer List>\n";
+}
+
+int main( int argc, const char* argv[] ){
+ // Loads the sorted mer list named by argv[1] and prints the value of its UniqueMerCount() on stdout.
+ if( argc != 2 ){
+ print_usage("uniqueMerCount");
+ return -1;
+ }
+ string sml_filename = argv[1];
+ // automatic storage, so the object is destroyed on every exit path instead of being left allocated
+ DNAFileSML file_sml;
+ try{
+ file_sml.LoadFile( sml_filename );
+ }catch( gnException& gne ){
+ cerr << gne << endl;
+ return -1;
+ }
+ cout << endl << file_sml.UniqueMerCount() << endl;
+ return 0;
+}
+
diff --git a/src/uniquifyTrees.cpp b/src/uniquifyTrees.cpp
new file mode 100644
index 0000000..b9515d1
--- /dev/null
+++ b/src/uniquifyTrees.cpp
@@ -0,0 +1,250 @@
+#include "libMems/PhyloTree.h"
+#include <vector>
+#include <sstream>
+#include <algorithm>
+#include <utility>
+#include <fstream>
+
+using namespace std;
+
+typedef unsigned int uint;
+
+bool taxonNameLessThan( string name1, string name2 )
+{
+ // Orders taxon labels by the integer each one starts with; both values default to 0 so a failed extraction cannot leave them indeterminate.
+ int n1 = 0;
+ int n2 = 0;
+ stringstream( name1 ) >> n1;
+ stringstream( name2 ) >> n2;
+ return n1 < n2;
+}
+
+template<class T, class S>
+void findAndErase( T& container, S& item )
+{
+ // Removes every element equal to `item`. Survivors are collected separately first, so `item` may refer to an element of `container`.
+ T survivors;
+ for( typename T::iterator pos = container.begin(); pos != container.end(); ++pos )
+ if( *pos != item ) survivors.push_back( *pos );
+ container = survivors;
+};
+
+/**
+ * Reports whether query_nodeI is subtree_nodeI itself or is reachable from it
+ * through children links. The search keeps the nodes still to be examined on
+ * an explicit stack instead of recursing.
+ */
+bool containsNode( PhyloTree< TreeNode >& t, node_id_t subtree_nodeI, node_id_t query_nodeI )
+{
+ stack< node_id_t > pending;
+ pending.push( subtree_nodeI );
+ while( !pending.empty() )
+ {
+ const node_id_t visiting = pending.top();
+ pending.pop();
+ if( visiting == query_nodeI )
+ return true;
+ // schedule every child of the visited node; a node without children adds nothing
+ for( size_t kid = 0; kid < t[visiting].children.size(); ++kid )
+ pending.push( t[visiting].children[kid] );
+ }
+ return false;
+}
+
+
+/** Re-root the tree at the internal node new_root by reversing the parent/child links that lead to it.
+ * Throws a C string if new_root has no children; returns without changes if new_root is already t.root. */
+void rerootTree( PhyloTree< TreeNode >& t, node_id_t new_root )
+{
+ // new root must be an internal node
+ if( t[new_root].children.size() == 0 )
+ throw "Can't root on a leaf node";
+ if( new_root == t.root )
+ return; // nothing to do: the tree is already rooted at new_root
+
+ // move the old root's child whose subtree contains new_root from its children list to its parents list
+ uint childI = 0;
+ for( ; childI < t[t.root].children.size(); childI++ ){
+ if( containsNode( t, t[t.root].children[childI], new_root ) )
+ {
+ t[t.root].parents.push_back( t[t.root].children[childI] );
+ findAndErase( t[t.root].children, t[t.root].children[childI] );
+ break;
+ }
+ }
+ // make new_root the root and append its current parents to its children
+ t.root = new_root;
+ t[t.root].children.insert( t[t.root].children.end(), t[t.root].parents.begin(), t[t.root].parents.end() );
+ // NOTE(review): nothing in this function clears the new root's own parents list directly - confirm that is intended
+ stack<node_id_t> node_stack;
+ node_stack.push(t.root);
+ while( node_stack.size() > 0 )
+ {
+ // For each child of the current node: remove the current node from the child's
+ // children and parents lists, append the child's remaining parents to its children,
+ // make the current node the child's only parent, and push the child so that
+ // its own children are processed the same way.
+ node_id_t cur_node = node_stack.top();
+ node_stack.pop();
+ for( uint childI = 0; childI < t[cur_node].children.size(); childI++ )
+ {
+ TreeNode& child_n = t[t[cur_node].children[childI]];
+ findAndErase( child_n.children, cur_node );
+ findAndErase( child_n.parents, cur_node );
+ child_n.children.insert( child_n.children.end(), child_n.parents.begin(), child_n.parents.end() );
+ child_n.parents.clear();
+ child_n.parents.push_back(cur_node);
+ node_stack.push(t[cur_node].children[childI]);
+ }
+ }
+}
+
+/**
+ * Searches the subtree rooted at nodeI (explicit-stack depth-first walk) for the
+ * leaf whose name orders first under taxonNameLessThan(), which compares the
+ * integers the names start with rather than the raw text.
+ * While the recorded name is still empty, any leaf visited becomes the candidate;
+ * afterwards a leaf replaces it only when it orders strictly earlier.
+ */
+node_id_t getRepresentativeTaxon( PhyloTree< TreeNode >& t, node_id_t nodeI )
+{
+ string best_name;
+ node_id_t best_node = nodeI;
+ // nodes still waiting to be examined
+ stack< node_id_t > pending;
+ pending.push( nodeI );
+ while( !pending.empty() )
+ {
+ const node_id_t visiting = pending.top();
+ pending.pop();
+ const size_t child_count = t[visiting].children.size();
+ if( child_count > 0 )
+ {
+ // internal node: descend into every child
+ for( size_t kid = 0; kid < child_count; ++kid )
+ pending.push( t[visiting].children[kid] );
+ continue;
+ }
+ // leaf: adopt it when nothing is recorded yet or when its name orders earlier
+ if( best_name == "" || taxonNameLessThan( t[visiting].name, best_name ) )
+ {
+ best_name = t[visiting].name;
+ best_node = visiting;
+ }
+ }
+ return best_node;
+}
+
+class TaxonNamePairComparator
+{
+public:
+ // Compares (taxon name, node id) pairs by name via taxonNameLessThan(); the id takes no part. Both sides use the same pair type.
+ bool operator()( const pair<string, node_id_t>& p1, const pair<string, node_id_t>& p2 ) const {
+ return taxonNameLessThan( p1.first, p2.first );
+ }
+};
+
+void sortTaxa( PhyloTree< TreeNode >& t )
+{
+ // For every node that has children, reorder them by the name of each child's representative leaf.
+ for( node_id_t nodeI = 0; nodeI < t.size(); nodeI++ )
+ {
+ const size_t child_count = t[nodeI].children.size();
+ if( child_count == 0 )
+ continue; // a node without children has nothing to reorder
+ // pair each child's representative name with the child's current position
+ vector< pair<string, node_id_t> > keyed( child_count );
+ for( size_t slot = 0; slot < child_count; ++slot )
+ {
+ const node_id_t leaf = getRepresentativeTaxon( t, t[nodeI].children[ slot ] );
+ keyed[ slot ] = make_pair( t[leaf].name, slot );
+ }
+ sort( keyed.begin(), keyed.end(), TaxonNamePairComparator() );
+ // rebuild the child list by reading the old positions in sorted order
+ vector< node_id_t > reordered;
+ for( size_t slot = 0; slot < keyed.size(); ++slot )
+ reordered.push_back( t[nodeI].children[ keyed[slot].second ] );
+ t[nodeI].children = reordered;
+ }
+}
+
+/**
+ * Subtracts 1 from every node label that parses as a positive integer, turning 1-based
+ * taxon numbers into 0-based ones. Empty, non-numeric and zero labels are left untouched.
+ */
+void relabelTaxaToStartWithZero( PhyloTree< TreeNode >& t )
+{
+ for( node_id_t nodeI = 0; nodeI < t.size(); nodeI++ )
+ {
+ if( t[nodeI].name == "" )
+ continue;
+ stringstream name_str( t[nodeI].name );
+ uint number = 0;
+ if( !( name_str >> number ) || number == 0 ) // decrementing either case would wrap around to a huge value
+ continue;
+ stringstream new_name_str;
+ new_name_str << ( number - 1 );
+ t[nodeI].name = new_name_str.str();
+ }
+}
+
+int main( int argc, char* argv[] )
+{
+ if( argc < 3 )
+ {
+ cerr << "Usage: uniquifyTrees <nexus input file> <nexus output file>\n";
+ cerr << "All trees in the input file must have the same number of taxa and the same taxon labels\n";
+ return -1; // argv[1] and argv[2] are read right below, so stop on bad usage
+ }
+ string input_filename = argv[1];
+ string output_filename = argv[2];
+ ifstream input_file( input_filename.c_str() );
+ if( !input_file.is_open() )
+ {
+ cerr << "Error opening \"" << input_filename << "\"\n";
+ return -1;
+ }
+ ofstream output_file( output_filename.c_str() );
+ if( !output_file.is_open() )
+ {
+ cerr << "Error opening \"" << output_filename << "\"\n";
+ return -1;
+ }
+ // Read trees until an empty one comes back; each is child-sorted, relabelled and stored as text.
+ size_t tree_sizes = 0;
+ vector< string > tree_list;
+ while( true )
+ {
+ PhyloTree< TreeNode > t;
+ t.readTree( input_file );
+ if( t.size() == 0 )
+ break;
+ if( tree_sizes == 0 )
+ tree_sizes = t.size();
+ if( t.size() != tree_sizes )
+ {
+ cerr << "Error: tree " << tree_list.size() + 1 << " has a different number of taxa\n";
+ return -2;
+ }
+ sortTaxa( t );
+ relabelTaxaToStartWithZero( t );
+ stringstream ss;
+ t.writeTree(ss);
+ tree_list.push_back(ss.str());
+ cout << "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b";
+ cout << "Read " << tree_list.size() << " trees";
+ }
+ cout << endl;
+ cout << "Writing unique trees to \"" << output_filename << "\"\n";
+ sort(tree_list.begin(), tree_list.end() );
+ size_t unique_count = 0;
+ for( size_t treeI = 0; treeI < tree_list.size(); treeI++ )
+ {
+ if( treeI > 0 && tree_list[treeI] == tree_list[treeI - 1] )
+ continue;
+ output_file << tree_list[treeI] << endl;
+ unique_count++;
+ }
+ cerr << "There are " << unique_count << " unique trees\n";
+ return 0;
+}
diff --git a/src/xmfa2maf.cpp b/src/xmfa2maf.cpp
new file mode 100644
index 0000000..9876c69
--- /dev/null
+++ b/src/xmfa2maf.cpp
@@ -0,0 +1,87 @@
+#include "libMems/IntervalList.h"
+#include "libMems/ProgressiveAligner.h"
+#include <fstream>
+
+using namespace mems;
+using namespace std;
+using namespace genome;
+
+int main(int argc, char* argv[] ){
+ // Reads the XMFA alignment argv[1], splits its intervals at contig starts, and writes one MAF "a" block per piece to argv[2].
+ if(argc != 3){
+ cerr << "Usage: xmfa2maf <xmfa input> <maf output>\n";
+ return -1;
+ }
+ ifstream ifile(argv[1]);
+ if(!ifile.is_open()){
+ cerr << "Error reading \"" << argv[1] << "\"\n";
+ return -2;
+ }
+ ofstream ofile(argv[2]);
+ if(!ofile.is_open()){
+ cerr << "Error writing to \"" << argv[2] << "\"\n";
+ return -2;
+ }
+try{
+ IntervalList xmfa;
+ xmfa.ReadStandardAlignment(ifile);
+ LoadSequences(xmfa, &cout);
+
+ // break alignments on chromosome boundaries
+ vector<AbstractMatch*> alignments;
+ for(int ivI=0; ivI < xmfa.size(); ivI++)
+ alignments.push_back( xmfa[ivI].Clone() );
+ // one list per sequence holding the start of every contig after the first
+ vector< vector< gnSeqI > > chromo_bounds( xmfa.seq_table.size() );
+ for(int seqI=0; seqI < xmfa.seq_table.size(); seqI++){
+ for(int cI=1; cI < xmfa.seq_table[seqI]->contigListSize(); cI++)
+ chromo_bounds[seqI].push_back( xmfa.seq_table[seqI]->contigStart(cI) );
+ SSC<AbstractMatch> msc( seqI );
+ sort( alignments.begin(), alignments.end(), msc );
+ AbstractMatchSeqManipulator amsm( seqI );
+ applyBreakpoints( chromo_bounds[seqI], alignments, amsm );
+ }
+
+ ofile << "##maf version=1 program=progressiveMauve\n";
+ for(int ivI=0; ivI < alignments.size(); ivI++ ){
+ ofile << "a\n";
+ vector<string> aln;
+ GetAlignment( *((Interval*)(alignments[ivI])), xmfa.seq_table, aln );
+ for( int seqI=0; seqI < xmfa.seq_filename.size(); seqI++ ){
+ if(alignments[ivI]->LeftEnd(seqI)==0)
+ continue; // sequence not defined in this block
+
+ // determine which contig this alignment is in
+ uint32 l_contigI, r_contigI;
+ gnSeqI l_baseI = alignments[ivI]->LeftEnd(seqI);
+ gnSeqI r_baseI = alignments[ivI]->RightEnd(seqI)-1;
+ xmfa.seq_table[seqI]->globalToLocal( l_contigI, l_baseI );
+ xmfa.seq_table[seqI]->globalToLocal( r_contigI, r_baseI );
+ string contig_name = xmfa.seq_table[seqI]->contigName( l_contigI );
+ if(l_contigI != r_contigI){
+ cerr << "interval " << ivI << " seq " << seqI << " left " << alignments[ivI]->LeftEnd(seqI) << " right " << alignments[ivI]->RightEnd(seqI) << endl;
+ cerr << "l_baseI " << l_baseI << " r_baseI " << r_baseI << " l_contigI " << l_contigI << " r_contigI " << r_contigI << " name " << contig_name << endl;
+ cerr << "Error, input alignment spans multiple contigs/chromosomes. Unable to translate to MAF\n";
+ return -1;
+ }
+ ofile << "s " << xmfa.seq_filename[seqI] << "." << contig_name;
+ ofile.flush();
+ // start column: contig-local left end minus one; reverse-strand rows are measured from the other end of the contig
+ int64 lend = l_baseI-1;
+ if(alignments[ivI]->Orientation(seqI) == AbstractMatch::reverse)
+ lend = xmfa.seq_table[seqI]->contigLength(l_contigI) - l_baseI - alignments[ivI]->Length(seqI) + 1;
+ ofile << " " << lend;
+ ofile << " " << alignments[ivI]->Length(seqI);
+ ofile << " " << (alignments[ivI]->Orientation(seqI) == AbstractMatch::reverse ? "-" : "+");
+ ofile << " " << xmfa.seq_table[seqI]->contigLength(l_contigI);
+ ofile << " " << aln[seqI] << endl;
+ }
+ ofile << endl;
+ }
+ ofile.close();
+}catch( gnException& gne ){
+ cerr << gne << endl; // report a library error instead of terminating on an uncaught exception
+ return -3;
+}
+ return 0;
+}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/mauvealigner.git
More information about the debian-med-commit
mailing list