[med-svn] [lagan] 05/08: New upstream version 2.0
Andreas Tille
tille at debian.org
Sat Dec 9 18:16:25 UTC 2017
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository lagan.
commit 1afdb838e2840e0812d4d7553b9559579d0bdde0
Author: Andreas Tille <tille at debian.org>
Date: Sat Dec 9 18:31:26 2017 +0100
New upstream version 2.0
---
Makefile | 5 +
Readmes/LICENSE | 341 ++++
Readmes/README.FIRST | 80 +
Readmes/README.chaos | 164 ++
Readmes/README.lagan | 74 +
Readmes/README.mlagan | 71 +
Readmes/README.shuffle | 83 +
Readmes/README.tools | 323 ++++
Utils.pm | 553 +++++++
anal_gloc.pl | 142 ++
blosum62.txt | 25 +
blosum62s.txt | 25 +
debian/changelog | 13 -
debian/compat | 1 -
debian/control | 24 -
debian/copyright | 32 -
debian/docs | 1 -
debian/install | 6 -
.../do_not_define_conflicting_getline.patch | 24 -
debian/patches/gcc-4.8.patch | 25 -
debian/patches/series | 2 -
debian/rules | 10 -
debian/source/format | 1 -
debian/upstream/metadata | 12 -
debian/watch | 8 -
lagan.pl | 242 +++
nucmatrix.txt | 9 +
rechaos.pl | 375 +++++
sample.fasta | 25 +
sample.params | 59 +
slagan-mfa.pl | 35 +
slagan.pl | 153 ++
src/.gdb_history | 2 +
src/Makefile | 54 +
src/Utils.pm | 553 +++++++
src/anal_gloc.pl | 142 ++
src/anchors.c | 279 ++++
src/ancseq.cpp | 720 +++++++++
src/ancseqrest.cpp | 324 ++++
src/cutmfa.cpp | 148 ++
src/diagmatrix.c | 344 ++++
src/diagmatrix.h | 54 +
src/faindex.cpp | 56 +
src/fchaos.c | 1254 +++++++++++++++
src/fchaos.h | 39 +
src/filebuffer.c | 199 +++
src/filebuffer.h | 36 +
src/global.c | 176 +++
src/global.h | 14 +
src/glocal/Makefile | 19 +
src/glocal/default.score | 5 +
src/glocal/glocal.cpp | 258 +++
src/glocal/glocal.h | 23 +
src/glocal/io.cpp | 293 ++++
src/glocal/io.h | 22 +
src/glocal/leftinfluence.cpp | 637 ++++++++
src/glocal/leftinfluence.h | 100 ++
src/glocal/rightinfluence.cpp | 203 +++
src/glocal/rightinfluence.h | 42 +
src/glocal/score.cpp | 225 +++
src/glocal/score.h | 39 +
src/glocal/structs.h | 92 ++
src/glocal/test.score | 5 +
src/lagan.pl | 242 +++
src/lagan2mfa.cpp | 95 ++
src/makecons.cpp | 220 +++
src/mempage.c | 55 +
src/mempage.h | 34 +
src/mlagan.c | 1095 +++++++++++++
src/multial.c | 1648 ++++++++++++++++++++
src/multial.h | 125 ++
src/order.c | 842 ++++++++++
src/order.h | 29 +
src/prolagan.c | 1115 +++++++++++++
src/rechaos.pl | 375 +++++
src/skiplist.c | 210 +++
src/skiplist.h | 29 +
src/slagan-mfa.pl | 35 +
src/slagan.pl | 172 ++
src/sortlist.c | 43 +
src/supermap.pl | 1622 +++++++++++++++++++
src/thrtrie.c | 330 ++++
src/thrtrie.h | 67 +
src/translate.c | 78 +
src/translate.h | 3 +
src/util.cpp | 68 +
src/utils/Glue.cpp | 493 ++++++
src/utils/MultiSequence.h | 124 ++
src/utils/Output.h | 20 +
src/utils/SafeVector.h | 44 +
src/utils/Sequence.h | 229 +++
src/utils/bin2bl.c | 187 +++
src/utils/bin2mf.c | 69 +
src/utils/cextract.c | 113 ++
src/utils/cmerge2.pl | 207 +++
src/utils/contigorder.c | 350 +++++
src/utils/cstat.c | 252 +++
src/utils/dotplot.cpp | 107 ++
src/utils/draft.pl | 267 ++++
src/utils/fa2xfa.c | 122 ++
src/utils/getbounds.c | 90 ++
src/utils/getcontigpos.c | 99 ++
src/utils/getlength.c | 47 +
src/utils/getoverlap.c | 33 +
src/utils/mextract.pl | 88 ++
src/utils/mf2bin.pl | 93 ++
src/utils/mpretty.pl | 263 ++++
src/utils/mproject.pl | 90 ++
src/utils/mrun.pl | 267 ++++
src/utils/mrunfile.pl | 111 ++
src/utils/mrunpairs.pl | 267 ++++
src/utils/mviz.pl | 222 +++
src/utils/overlay.c | 261 ++++
src/utils/rc.c | 71 +
src/utils/scorealign.c | 479 ++++++
src/utils/scorecontigs.c | 410 +++++
src/utils/seqmerge.c | 46 +
src/xmfa2mfa.pl | 65 +
supermap.pl | 1622 +++++++++++++++++++
test.score | 5 +
utils/Utils.pm | 553 +++++++
utils/cmerge2.pl | 207 +++
utils/draft.pl | 267 ++++
utils/flipchaos.pl | 13 +
utils/mextract.pl | 88 ++
utils/mf2bin.pl | 93 ++
utils/mpretty.pl | 263 ++++
utils/mproject.pl | 90 ++
utils/mrun.pl | 267 ++++
utils/mrunfile.pl | 111 ++
utils/mrunpairs.pl | 267 ++++
utils/msplit.pl | 90 ++
utils/mviz.pl | 222 +++
xmfa2mfa.pl | 65 +
134 files changed, 27482 insertions(+), 159 deletions(-)
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..fbbbe79
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,5 @@
+all:
+ (cd src; $(MAKE))
+clean:
+ rm -f chaos anchors order glocal utils/bin2bl mlagan utils/cstat utils/bin2mf utils/rc *~ utils/contigorder utils/getbounds utils/cextract utils/seqmerge utils/getlength utils/getoverlap utils/*~ utils/scorealign utils/scorecontigs mlagan.purify utils/getcontigpos utils/fa2xfa utils/Glue utils/dotplot utils/overlay
+ (cd src; $(MAKE) clean)
diff --git a/Readmes/LICENSE b/Readmes/LICENSE
new file mode 100644
index 0000000..35d36f9
--- /dev/null
+++ b/Readmes/LICENSE
@@ -0,0 +1,341 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
+ 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.) You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+ To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+ We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+ Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ GNU GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+ 1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+ 2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in
+ whole or in part contains or is derived from the Program or any
+ part thereof, to be licensed as a whole at no charge to all third
+ parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you provide
+ a warranty) and that users may redistribute the program under
+ these conditions, and telling the user how to view a copy of this
+ License. (Exception: if the Program itself is interactive but
+ does not normally print such an announcement, your work based on
+ the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of Sections
+ 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+ 5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+ 6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+ 7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation. If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+ 10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+ NO WARRANTY
+
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+ Gnomovision version 69, Copyright (C) year name of author
+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary. Here is a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+ `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+ <signature of Ty Coon>, 1 April 1989
+ Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs. If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library. If this is what you want to do, use the GNU Library General
+Public License instead of this License.
+
diff --git a/Readmes/README.FIRST b/Readmes/README.FIRST
new file mode 100644
index 0000000..7640d1b
--- /dev/null
+++ b/Readmes/README.FIRST
@@ -0,0 +1,80 @@
+README.first for LAGAN Toolkit (Limited Area Global Alignment of Nucleotides) v2.0
+Author: Michael Brudno (brudno at cs.toronto.edu) 09/14/2006
+
+LAGAN was developed by
+Michael Brudno, Chuong Do, Sanket Malde, Michael F Kim and Serafim Batzoglou of
+the Dept of Computer Science at Stanford University, with assistance from many
+other people. See http://lagan.stanford.edu or contact lagan at cs.stanford.edu
+for more information.
+
+0. Availability + Legalese
+ The source code of this version of LAGAN is freely available to all users
+under the GNU Public License (GPL). See the file LICENSE in this directory for more
+information. You can download the LAGAN sources from http://lagan.stanford.edu
+If you use LAGAN regularly please consider contacting lagan at cs.stanford.edu
+to be placed on a mailing list to be contacted about any updates and bug-fixes.
+If you use LAGAN in a published result please see
+http://lagan.stanford.edu/cite.html for the latest citation information.
+
+I. Installation
+ To install LAGAN you need to copy the source files to your local
+computer, untar/ungzip them, and run "make". I am assuming you have a
+reasonably modern installation of gcc and perl. The sequence of commands should
+be:
+
+% gunzip lagan.tar.gz
+% tar xvf lagan.tar
+% make
+
+This will create the executable files chaos, anchors, order, mlagan,
+glocal, prolagan as well as many tools in the utils directory.
+
+You may also need to go into all the .pl files, and change the first line
+to call your perl interpreter. You must also specify an environment
+variable $LAGAN_DIR to point to the directory where you installed LAGAN.
+
+Because LAGAN uses no system-dependent or implementation dependent
+libraries it should compile on all platforms and ANSI C compilers.
+We use it on a Linux box. Please tell us if you have trouble compiling/running
+LAGAN tools on your favorite platform, we have found that most of these problems
+are easily resolved.
+
+II Description
+LAGAN toolkit is a set of tools for local, global, and multiple alignment of DNA
+sequences. Please see our website (http://lagan.stanford.edu) for publications
+describing LAGAN and its components.
+
+The 4 main parts of LAGAN, each documented in its own README file are:
+
+1. CHAOS local alignment tool
+2. LAGAN pairwise global alignment tool
+3. MLAGAN multiple global alignment tool.
+4. Shuffle-LAGAN pairwise glocal alignment (with the SuperMap chaining addition)
+
+There are also numerous utilities and scripts, mainly in the utils subdirectory.
+Some of these are documented in the README.tools file. Of particular interest
+may be scorealign, that can score a LAGAN or MLAGAN alignment, and the series
+of "m" tools: mproject, mextract, mpretty, mrunfile for running mlagan and
+parsing its output.
+
+
+III Repeat Masking
+
+LAGAN, MLAGAN, and Shuffle-LAGAN can use masking information to improve
+the quality of the alignment. If you are trying to align sequence seq1.fa
+and seq2.fa you should create the files seq1.fa.masked and seq2.fa.masked
+which should have repeats masked to Ns. LAGAN, M-LAGAN and S-LAGAN
+will know to look for these files when aligning. CHAOS doesn't recognise
+repeat information, you should just use it on the masked files if this is
+appropriate.
+
+IV Changes from previous version
+
+0.9 -> 1.0: Several bug fixes, alignment parameters are now in the nucmatrix.txt file
+1.0 -> 1.1: Several bug fixes, Fastreject now clips at intersection rather than union.
+1.1 -> 1.2: A few bug fixes, Shuffle-LAGAN added.
+1.2 -> 1.21: A few bug fixes, sped up shuffle-lagan, code is now GPLed
+1.21-> 2.0: A few minor (and a couple of major) bug fixes. MLAGAN no longer
+requires a tree, and takes a substitution matrix as an argument, added
+supermap chaining (and a new implementation of glocal chaining), updated
+to align up to 63 sequences.
\ No newline at end of file
diff --git a/Readmes/README.chaos b/Readmes/README.chaos
new file mode 100644
index 0000000..9ddc80b
--- /dev/null
+++ b/Readmes/README.chaos
@@ -0,0 +1,164 @@
+README for CHAOS (CHAins Of Score) version 0.933 10/22/2003
+Author: Michael Brudno (brudno at cs.stanford.edu)
+
+0. Availability + Legalese
+ The source code of this version of CHAOS is freely available to all users
+under the GNU Public License (GPL). See the file LICENSE in this directory for more
+information. You can download it from http://www.stanford.edu/~brudno/chaos/
+ If you use CHAOS regularly please consider contacting brudno at cs.stanford.edu to be
+placed on a mailing list to be contacted about any updates and bug-fixes.
+If you use CHAOS in a published result please cite:
+
+Michael Brudno and Burkhard Morgenstern. "Fast and sensitive alignment of
+large genomic sequences" Proceedings of the IEEE Computer Society
+Bioinformatics Conference (CSB) 2002 pp. 138-47
+
+I. Installation
+ To install CHAOS you need to copy the source files to your local
+computer, untar/ungzip them, and run "make". I am assuming you have a
+reasonably modern installation of gcc. The sequence of commands should be:
+
+% gunzip chaos.tar.gz
+% tar xvf chaos.tar
+% make
+
+This will create the executable files "chaos" and "anchors". This
+distribution also includes the program ancs4dialign.pl, a perl script for
+connecting CHAOS with DIALIGN. Both these tools are described in section V.
+
+
+Because CHAOS uses no system-dependent or implementation dependent libraries
+it should compile on all platforms and ANSI C compilers. If you have problems
+compiling the sources please e-mail the author. You will need to also set the
+environment variable LAGAN_DIR to the directory where you installed CHAOS. In
+c-shell this can be done by executing
+
+% setenv LAGAN_DIR `pwd`
+
+on the prompt. For other shells the command differs.
+
+II Description
+ CHAOS is a heuristic local alignment tool optimized for non-coding
+regions of the genome. The main idea behind the algorithm lies in the
+chaining together of similar regions, or seeds. A seed is a pair of k-long
+words with at least n identical base pairs (bp). A seed k1 can then be
+chained to the seed k2 whenever the indices of k1 in both sequences are
+higher than the indices of k2, and k1 and k2 are "near" each other, with
+"near" defined by both a distance and a gap criterion. The final score of a
+chain is the total number of matching bp in it. There is no explicit gap
+penalty for matching seeds which are separated by an unequal number of
+bases in the two sequences.
+
+III Usage
+1. Input Parameters
+ The main inputs are two fasta files. The first should contain a single
+query sequence, while the second can be a database of several sequences.
+These are followed by any number of command line options. This list is partial,
+(run chaos without args for the full list):
+
+nucmatrix.txt -- This file has the substitution matrix used by lagan and the gap
+penalties. The gap penalties are on the line immediately after the matrix,
+the first number is the gap open, the second the gap continue.
+
+blosum62s.txt -- This file has a (scaled) version of the blosum62 matrix and
+appropriate gap parameters.
+
+-p = Peptide sequence [default genomic]
+Whether the input is a peptide or genomic sequence. For peptide sequences
+we call "similar" letters equal. In the default configuration we have
+"PCMH[DE][KR][NQ][ST][ILV][FYW][AG]X*", where letters in the same brackets
+are considered equal. Currently this is not user-settable, but as usual if
+you really want to be able to change this e-mail me.
+
+-v = Verbose mode [default brief]
+Displays the Smith-Waterman alignments of the resulting conserved regions.
+
+-b = Both strands [default forward-only]
+Add this if you are interested in similarities on both strands of the DNA.
+Meaningless if used with -p.
+
+-t = Translated [default off]
+Makes the 6 translated frames of the sequences and compares them,
+forward against forward, backward against backward (all against all if -b
+specified).
+
+-wl # = Word Length [default 10 for genomic, 4 for peptide]
+The length of the seed (k in the description above).
+
+-nd # = Number of Degeneracy [default 1 for genomic, 0 for peptide]
+Amount of degeneracy allowed in the seed (k-n in the description above).
+
+-co # = score CutOff [default 25]
+Scores above this cutoff are shown.
+
+-rsc $ = reScoring cutoff [default 0]
+After the alignments are found they are rescored using a fast Smith-Waterman like
+algorithm. This lets you set the rescoring cutoff, to see only the high confidence hits.
+Scores around 2500 and greater are indicative of strong homology. One common use of this
+is to set -co to something small, and control only the S-W quality of alignments.
+
+-lb # = LookBack distance [default 20 for genomic, 8 for peptide]
+How far away two seeds are allowed to be so that they are chained.
+
+-gl # = maximum Gap Length [default 5]
+Maximum sized gap allowed between two seeds if they are to be chained.
+
+-version = prints the version number
+
+2. Usage notes/suggestions
+The part of the algorithm which usually takes longest is chaining. So if
+it is too slow, try increasing the wl parameter, decreasing the -nd
+parameter or both. If you do so, you probably need to adjust the -co or -rsc
+parameters so that the results you get are meaningful. The -ext parameter seems
+to be very effective, we strongly suggest it.
+
+IV Description of Algorithm
+1. Seed Location
+
+Seeds are found by first indexing the query sequence in a "threaded trie"
+of height k. In a trie every node corresponds to some [part of a] word. In
+a threaded trie, every node has a back pointer to the node which
+corresponds to the same word without its first letter. We start by
+inserting into the threaded trie all of the k-mers of the query sequence.
+Then we do a "walk" using the database sequence, where starting at the
+root, for every letter if the current node has a child corresponding to
+this letter we go down to it, and if it does not we follow back pointers
+until it does, or we hit the root. If degeneracy is allowed, we just allow
+multiple current nodes, which correspond to the possible degenerate words.
+
+2. Search Space and Chaining
+
+The seeds seen over the course of the past -lb basepairs are stored in a
+skip list, indexed by the difference of its indices in the two sequences
+(diagonal number). For each seed we do a range query in the skip list,
+finding the possible hits with which it can be chained. The highest
+scoring chain is picked, and it can then be further extended by future
+hits.
+
+V anchors and ancs4dialign
+
+Anchors is a small C program, that given a list of CHAOS local alignments
+resolves them into a strictly increasing list of anchors using an algorithm
+based on the Longest increasing subsequence problem. The anchors given out
+by the program can be used to anchor any global aligner that supports an
+external anchors file, e.g. LAGAN or dialign. For Dialign we include an extra
+script, ancs4dialign, written by Burkhard Morgenstern that given a
+multi-fasta file with several sequences will create a .anc file that dialign
+will use if given the -anc option.
+
+VI Future Work
+
+I am interested in further extending CHAOS. However with most such features I
+will be user driven: if you want a specific feature, ask me. This way I'll
+spend less time working on things no one will ever use. One issue which is
+of particular interest is placing statistical confidence estimates on the
+chains. If you are interested in helping me work on CHAOS please contact me,
+I am open to collaborations in this area.
+
++-----------------------------------------------------------------+
+| Michael Brudno | 260S Clark Center |
+| PhD Candidate | (650) 725-6094 |
+| Dept. of Computer Science | brudno at cs.stanford.edu |
+| Stanford University | http://www.stanford.edu/~brudno |
++-----------------------------------------------------------------+
+
diff --git a/Readmes/README.lagan b/Readmes/README.lagan
new file mode 100644
index 0000000..fd903e9
--- /dev/null
+++ b/Readmes/README.lagan
@@ -0,0 +1,74 @@
+NOTE: Pairwise lagan has not changed in the 2.0 release
+
+README.lagan for LAGAN aligner (Limited Area Global Alignment of Nucleotides) v1.1
+Author: Michael Brudno (brudno at cs.stanford.edu) 04/02/2003
+
+LAGAN was developed by
+Michael Brudno, Chuong Do, Michael F Kim and Serafim Batzoglou of the Dept of
+Computer Science at Stanford University, with assistance from many other people.
+See http://lagan.stanford.edu or contact lagan at cs.stanford.edu
+for more information.
+
+I Description
+
+LAGAN is a global alignment tool. It does a Needleman-Wunsch alignment in a
+limited area of the matrix, determined during an anchoring phase.
+
+The algorithm consists of 3 main parts, each documented in its own README file:
+
+1. Generation of local alignments, using the CHAOS local alignment tool
+2. Finding a monotonically increasing set of anchors from these local alignments,
+using the anchors program.
+3. Doing global alignment in a limited area of the NW matrix given the set of
+anchors (order tool).
+
+lagan.pl is the main executable that calls the three steps.
+
+II Usage
+
+1. Input
+Lagan requires two fasta files (first two arguments), reads gap and
+substitution parameters from the nucmatrix.txt file and takes several
+optional command line options.
+
+nucmatrix.txt -- This file has the substitution matrix used by lagan and the gap
+penalties. The gap penalties are on the line immediately after the matrix,
+the first number is the gap open, the second the gap continue.
+
+-chaos "string" [default none]
+The contents of string will be passed as arguments to CHAOS. See the CHAOS readme
+for details.
+
+-order "string" [default none]
+The contents of string will be passed as arguments to order.
+
+-recurfl "list of k-tuplets" [default: "(12,0,25,0),(13,1,25,0),(8,1,30,0)(7,1,30,0)"]
+A list of (wordlength, number of degeneracies, score cutoff, rescoring cutoff) k-tuplets to be
+used in the recursive anchoring. See README.chaos for the meaning of these numbers.
+
+-translate [default off]
+Use translated anchoring (homology done on the amino acid level). This is useful
+for distant (human/chicken, human/fish, and the like) comparisons.
+
+-bin [default off]
+print the output in binary format, for use by the bin2bl tool, or VISTA
+
+-mfa [default off]
+print the output in Multi-FASTA format, for use by many standard tools
+
+-rc [default off]
+reverse-complement the second sequence before doing the alignment
+
+-fastreject
+Abandon the alignment if the homology looks weak. Currently tuned for human/mouse
+distance, or closer. Please contact the authors for more details on this option.
+
+2. Output
+
+The output by default is in a blast like format, but you can use the -mfa or -bin
+options to save the results in multi-fasta, or binary format respectively. The
+binary format is a compact representation accepted by VISTA. There are some converters
+between the formats in the utils directory (see README.tools)
+
+
+
diff --git a/Readmes/README.mlagan b/Readmes/README.mlagan
new file mode 100644
index 0000000..ecbaba9
--- /dev/null
+++ b/Readmes/README.mlagan
@@ -0,0 +1,71 @@
+README.mlagan for MLAGAN multiple aligner v2.0
+Author: Michael Brudno (brudno at cs.toronto.edu) Updated 09/14/2006
+
+LAGAN was developed by
+Michael Brudno, Chuong Do, Michael F Kim, Mukund Sundararajan and Serafim
+Batzoglou of the Dept of Computer Science at Stanford University,
+with assistance from many other people.
+See http://lagan.stanford.edu or contact lagan at cs.stanford.edu
+for more information.
+
+I Description
+
+MLAGAN is a multiple global alignment tool. It does a Needleman-Wunsch alignment in a
+limited area of the matrix, determined during an anchoring phase.
+
+The algorithm consists of 3 main parts, each documented in its own README file:
+
+1. Generation of ordered local alignments (anchors) between all pairs of sequences,
+using the CHAOS local alignment tool and anchors program
+2. Doing progressive global alignment, guided by a phylogenetic tree, in a
+limited area of the NW matrix given the set of anchors.
+
+mlagan is the main executable.
+
+II Usage
+
+1. Input
+Mlagan requires two or more fasta files (first arguments), optionally
+takes a -tree argument specifying a phylogenetic tree, reads gap and
+substitution parameters from nucmatrix.txt file (or another optionally
+provided file) and takes several optional command line options:
+
+nucmatrix.txt -- This file has the substitution matrix used by lagan and the gap
+penalties. The gap penalties are on the line immediately after the matrix,
+the first number is the gap open, the second the gap continue.
+
+-tree "string"
+You need to specify a phylogenetic tree for the sequences. This must be a pairwise tree,
+with parentheses specifying nodes. Here are a few examples:
+"(human (mouse rat))"
+"((human mouse)(fugu zebrafish))"
+The name of each sequence must be specified somewhere on the fasta line of the input sequence:
+>g324325|Homo sapiens human
+ACTGG....
+Either "Homo" or "sapiens" or "human" are valid names to call the sequence.
+
+-translate [default off]
+Use translated anchoring (homology done on the amino acid level). This is useful
+for distant (human/chicken, human/fish, and the like) comparisons.
+
+-fastreject [default off]
+Abandon the alignment if the homology looks weak. Currently tuned for
+human/mouse distance, or closer. Please contact the authors for more
+details on this option.
+
+-out filename [default standard out]
+Output the alignment to filename, rather than standard out.
+
+2. Output
+
+The output by default is in Multi-FASTA format. You can use the mpretty tool in the
+utils directory to view a human-friendly version.
+
+3. Prolagan
+Prolagan is the pairwise progressive step of mlagan. It should be run just
+like mlagan, but with two additional arguments, -pro1 and -pro2 which are files
+with profiles (alignments) which should be aligned together. Note that all
+sequences (and the tree) must still be given to prolagan. This program is useful
+if you have two alignments already and want to just align them, instead of
+realigning all sequences.
+
diff --git a/Readmes/README.shuffle b/Readmes/README.shuffle
new file mode 100644
index 0000000..3fa5fd4
--- /dev/null
+++ b/Readmes/README.shuffle
@@ -0,0 +1,83 @@
+Shuffle-LAGAN with SuperMap README
+Michael Brudno, brudno at cs.toronto.edu
+
+0. Overview
+This directory contains the code for Shuffle-LAGAN, a glocal alignment tool described
+in Brudno, Malde, Poliakov, Do, Couronee, Dubchak & Batzoglou "Glocal alignment: Finding
+rearrangements during alignment", ISMB 2003 Proceedings (see
+http://lagan.stanford.edu/cite.html for detailed citation information).
+It is also
+distributed with the SuperMap chaining algorithm, which is currently
+unpublished.
+
+1. Installation
+
+If you received Shuffle-LAGAN as part of the LAGAN toolkit it is installed
+automatically with the rest of the package. The code assumes $LAGAN_DIR
+has been set.
+
+
+2. Running
+Just give it two sequences and let it roll:
+
+#slagan.pl seq1.fa seq2.fa
+
+3. Input
+
+The input sequences should be in FASTA format. You should provide a
+.masked file for each of the sequences (see README.FIRST) Output will be
+in XMFA format, described below.
+
+
+4. Output
+The overall result is three files, a .chaos file with the local
+alignments in the chaos format, a .mon file with the 1-monotonic chain
+(see http://lagan.stanford.edu/manual.html for what this is) and a .xmfa
+file with the actual alignments in the XMFA format.
+
+A. XMFA Format
+
+The format is based on Multi-FASTA, but allows for several multiple local alignments to be
+stored in a file. It is as follows:
+
+> seq_num:start1-end1 +/- comments (sequence name, etc.)
+AC-TG-NAC--TG
+AC-TG-NACTGTG
+...
+> seq_num:startN-endN +/- comments (sequence name, etc.)
+AC-TG-NAC--TG
+AC-TG-NACTGTG
+...
+= (line starting with an "=" separates different alignments, and can have any comments)
+> seq_num:start1-end1 +/- comments (sequence name, etc.)
+AC-TG-NAC--TG
+AC-TG-NACTGTG
+...
+> seq_num:startN-endN +/- comments (sequence name, etc.)
+AC-TG-NAC--TG
+AC-TG-NACTGTG
+...
+
+5. Parameters
+Will be described for the next release. E-mail the author for details.
+
+6. Utilities
+
+The utilities directory ($LAGAN_DIR/utils) has 2 programs which may be of
+use to Shuffle-LAGAN users:
+
+A. Glue
+Given a Shuffle-LAGAN alignment in XMFA format it glues together a "fake"
+second sequence and builds a single pairwise alignment in multi-fasta
+format. This can then be visualized using VISTA, or used in other ways
+(e.g. you can get several of these "fake" sequences and use MLAGAN to do
+multiple alignment).
+
+B. dotplot
+Given a list of local alignments in the format of the monotonic file
+(.mon) it builds a series of gnuplot commands that build a dotplot of the
+local alignments. Useful for seeing which rearrangements were found.
+
+This README will be extended in the future.
+Please send questions to Michael Brudno, brudno at cs.stanford.edu
+
diff --git a/Readmes/README.tools b/Readmes/README.tools
new file mode 100644
index 0000000..1a6b7fa
--- /dev/null
+++ b/Readmes/README.tools
@@ -0,0 +1,323 @@
+LAGAN tools README (Authors: Michael Brudno, Michael F. Kim & Chuong Do)
+lagan at cs.stanford.edu 04/02/2003
+
+This document describes how to use LAGAN associated wrappers and tools.
+
+Both mrun.pl and mrunpairs.pl are wrappers to mlagan. The only
+difference is that mrunpairs.pl generates a set of pairwise
+alignments, whereas mrun.pl does the standard multiple alignment.
+Both of these tools use a helper script mextract.pl to parse out the
+individual sequence files from a Multi-FASTA file.
+
+Having run MLAGAN, we can visualize the output on a nucleotide level
+in a "pretty" format using mpretty.pl. We can also project the
+multiple sequence alignment into any number of its constituent
+sequences, using mproject.pl. We provide a tool (mviz.pl) which will
+take a multiple alignment in Multi-FASTA form and create a VISTA plot.
+
+Using the parameter file, you can completely specify the parameters to
+an mlagan job. We provide a sample file (sample.params) with more
+information on how to use the various parameters.
+
+Sequence names are always taken to be the first white-space terminated
+string after the ">" in a FASTA or Multi-FASTA file, e.g.:
+
+>sample1 This is the first sample sequence.
+ACGT...
+
+>sample2 This is the second sample sequence.
+ACGT...
+
+Here the sequence names would be sample1 and sample2.
+
+
+The scorealign tool scores an alignment (multiple or pairwise in MFA format). The rc script
+reverse-complements a sequence, and the bin2mf, mf2bin.pl and bin2bl scripts convert between the
+various output formats.
+
+mrunfile.pl
+-----------
+Usage:
+mrunfile.pl filename [-pairwise] [-vista]
+
+Required Parameter:
+filename : name of the parameter file (e.g. sample.params)
+
+Optional parameters:
+-pairwise : generates a set of pairwise alignments
+-vista : creates a VISTA plot using the output
+
+Example:
+mrunfile.pl sample.params -vista
+
+This would run MLAGAN using the parameters in sample.params and
+generate a VISTA plot at the end.
+
+Uses:
+mrun.pl or mrunpairs.pl
+
+
+mrun.pl
+-------
+Usage:
+mrun.pl filename -tree "(tree...)"
+
+Required parameters:
+filename : name of the Multi-FASTA file with the sequences to align.
+-tree "(tree)" : a fully parenthesized phylogenetic tree over the
+sequence names.
+
+Optional parameters:
+[base sequence name [sequence pairs]] : For projection into pairs for
+VISTA output, you may wish to specify a base sequence and specific
+pairs of sequences to have projected. If you do not specify sequence
+pairs, then all possible pairings to the base sequence will be
+generated. If you do not specify a base sequence, the default base
+sequence is the first sequence in the multi-FASTA input.
+
+other MLAGAN parameters:
+-nested : runs iterative improvement in a nested fashion
+-postir : incorporates the final improvement phase
+-lazy : uses lazy mode for anchor generation
+-verbose : give verbose output
+-translate : do translated comparisons
+-out "filename": outputs to filename
+-version : prints version info
+
+other VISTA parameters:
+(see VISTA plotfile definition for more info)
+per sequence pair:
+--regmin # (default: 75)
+--regmax # (default: 100)
+--min # (default: 50)
+per plotfile:
+--bases # (default: 10000)
+--tickdist # (default: 2000)
+--resolution # (default: 25)
+--window # (default: 40)
+--numwindows # (default: 4)
+
+Example:
+mrun.pl sample.fasta -tree "(sample1 (sample2 sample3))"
+
+This will run mlagan on the sequences in sample.fasta with the
+phylogenetic tree specified above.
+
+
+Uses:
+mextract.pl to parse out the constituent sequences into individual
+FASTA files for use by mlagan. Also uses mextract.pl with -masked
+option for parsing out .masked multi-FASTA files.
+
+
+mrunpairs.pl
+------------
+Usage:
+mrunpairs.pl filename
+
+Required parameter:
+filename : multi-FASTA file.
+
+Optional parameters:
+(same as mrun.pl optional parameters, see above)
+
+Example:
+mrunpairs.pl sample.fasta sample1 sample1 sample2 sample1 sample3
+
+This will generate the pairs (sample1 sample2), (sample1 sample3),
+using sample1 as a base sequence (for VISTA plots).
+
+
+Uses:
+mextract.pl to parse out the constituent sequences into individual
+FASTA files for use by mlagan. Also uses mextract.pl with -masked
+option for parsing out .masked multi-FASTA files.
+
+
+mpretty.pl
+----------
+Usage:
+mpretty.pl filename
+
+Required parameter:
+filename : Multi-FASTA file to view.
+
+Optional parameters:
+-linelen value : number of bases to display per line
+ (min: 10, default: 50)
+-interval value : frequency of markers
+ (min: 10, default: 10, none: 0)
+-labellen value : length of the sequence label
+ (min: 5, default: 5, none: 0)
+-start value : position to start from (>=1)
+-end value : position to end from (>=start position)
+-base sequence_name : sequence name on which to base start/end positions.
+-nocounts : turn off sequence position counts
+
+
+Example:
+mpretty.pl sample.fasta -nocounts -interval 0 -linelen 72
+
+This will print out the contents of sample.fasta without sequence
+position counters, without interval markers and at 72 bases per line,
+with the sequence labels on each line at their default length.
+Because of the way the labels are printed, this will cause each line
+to have length 80 characters.
+
+mpretty.pl sample.fasta -start 101 -end 150
+
+This will print out the contents of sample.fasta from positions 101 to
+positions 150 in the alignment, inclusive.
+
+mpretty.pl sample.fasta -start 131 -end 140 -base sample1_aligned
+
+This will print out the contents of sample.fasta from position 131 to
+position 140 relative to the sequence sample1_aligned.
+
+
+mextract.pl
+-----------
+Usage:
+mextract.pl filename [-masked]
+
+Required parameter:
+filename : Multi-FASTA file to extract sequences from.
+
+Optional parameter:
+-masked : For dealing with masked Multi-FASTA files.
+
+Example:
+mextract.pl sample.fasta
+
+This will extract the contents of sample.fasta (e.g. sample1, sample2,
+sample3) and put them into files:
+sample_sample1.fa
+sample_sample2.fa
+sample_sample3.fa
+
+Masked Example:
+mextract.pl sample.fasta.masked -masked
+
+This will extract the contents of sample.fasta.masked (e.g. sample1, sample2,
+sample3) and put them into files:
+sample_sample1.fa.masked
+sample_sample2.fa.masked
+sample_sample3.fa.masked
+
+For use with rechaos.pl in anchoring.
+
+
+mproject.pl
+-----------
+Usage:
+mproject.pl filename seqname1 [seqname2 ... ]
+
+Required parameters:
+filename : Multi-FASTA file to extract sequences from.
+and at least one sequence name.
+
+Example:
+mproject.pl sample.out sample1 sample2
+
+In this example, sample.out is the resulting alignment of a number of
+sequences -- including sample1 and sample2. This script will project
+the multiple alignment into the pair sample1 and sample2.
+
+
+mviz.pl
+-------
+Usage:
+mviz.pl data_file param_file [plotfile]
+
+Required parameters:
+data_file : Multi-FASTA file to visualize using VISTA
+ (this must be the first argument)
+param_file : Parameter file (same format as used in other scripts)
+ (this must be the second argument)
+
+Optional parameter:
+plotfile : VISTA plotfile (if specified, must be specified third)
+ Script will use this plotfile instead of automatically
+ generated one.
+
+Example:
+mviz.pl sample.out sample.params sample.plotfile
+
+This will generate a VISTA plot using the data in sample.out, the
+settings in sample.params, but with sample.plotfile as the given
+plotfile.
+
+Uses:
+RunVista
+
+
+scorealign
+----------
+Usage:
+scorealign mfa_alignment %cutoff [-regions]
+Optional parameters:
+regions: Print the high scoring regions in the alignment.
+
+Example:
+scorealign alignment.mfa 80
+
+This will return the score of the alignment in the file
+"alignment.mfa" that meets an 80% threshold.
+
+scorealign
+----------
+Usage:
+scorealign mfa_alignment %cutoff [-regions]
+Optional parameters:
+regions: Print the high scoring regions in the alignment.
+
+Example:
+scorealign alignment.mfa 80
+
+This will return the score of the alignment in the file
+"alignment.mfa" that meets an 80% threshold.
+
+mf2bin.pl
+---------
+Usage:
+mf2bin.pl inputfile [-out outputfile]
+
+Required parameter:
+inputfile : Multi-FASTA file with two sequences to convert to bin.
+
+Optional parameter:
+-out outputfile : Put bin output to outputfile.
+
+Example:
+mf2bin.pl sample1_sample2.fa -out sample1_sample2.bin
+
+This will take the file sample1_sample2.fa (which contains the
+alignment or projection of a larger alignment of sample1 and sample2)
+and pack it into VISTA binary format and output the result to
+sample1_sample2.bin.
+
+
+bin2mf
+------
+Usage:
+bin2mf { - | alignment_file}
+
+Example
+bin2mf align.bin > align.mfa
+cat align.bin | bin2mf - > align.mfa
+
+This will convert the binary file in align.bin into multi-fasta format,
+and save it as align.mfa.
+
+bin2bl
+------
+Usage:
+bin2bl { - | alignment_file}
+
+Example
+bin2bl align.bin > align.bl
+cat align.bin | bin2bl - > align.bl
+
+This will convert the binary file in align.bin into BLAST-like format,
+and save it as align.bl.
+
diff --git a/Utils.pm b/Utils.pm
new file mode 100644
index 0000000..e4e7214
--- /dev/null
+++ b/Utils.pm
@@ -0,0 +1,553 @@
+#!/usr/bin/env perl
+
+package Utils;
+require 5.000;
+
+use strict;
+use Exporter;
+use Cwd;
+use IO::File;
+use POSIX qw(setsid);
+use Sys::Syslog qw(:DEFAULT setlogsock);
+
+sub Trim( @ );
+sub Lock_File( $ ; $ $ $ );
+sub Unlock_File( $ );
+sub Write_Log( $ $ ; $ $ );
+sub Parse_Filename( $ );
+sub Get_Abs_Path( $ );
+sub Expand_Path( $ );
+sub Get_Random_Key( ; $ );
+sub Hex2Ascii( $ );
+sub Ascii2Hex( $ );
+sub Get_Config_Record( $ $ );
+sub Round( $ );
+sub Set_Log( $ $ );
+sub Log( $ $ );
+sub Min( $ $ );
+sub Max( $ $ );
+sub Reg_Diff( $ $ ; $ $ $ $ $ );
+sub Reg_Rem_Overlap( $ ; $ $ $ );
+sub Reg_Sort( $ ; $ $ $ );
+sub Reg_Intersect( $ $ ; $ $ $ $ $ );
+sub Reg_Merge( $ ; $ $ $ );
+
+# Package globals.  @ISA/@EXPORT drive Exporter; the remaining variables are
+# shared state read and written by the helpers in this module.
+use vars qw(@ISA @EXPORT $VERSION $JOB $Error $Syslog $Facility $Msg_Prefix);
+
+# NOTE: the "@" sigils on the next two assignments had been rewritten to
+# " at " by mail-archive address obfuscation, which is not valid Perl.
+@ISA = qw(Exporter);
+@EXPORT = qw(Trim Lock_File Unlock_File Write_Log Parse_Filename
+ Get_Abs_Path Expand_Path Hex2Ascii Ascii2Hex Get_Config_Record
+ Get_Random_Key Round Set_Log Log Min Max Reg_Diff Reg_Rem_Overlap
+ Reg_Sort Reg_Intersect Reg_Merge redirect_err2log openlogs safe_glob
+ daemon wr_log wr_err start_watcher confirm $JOB);
+
+# $VERSION is extracted from the RCS/CVS keyword string.
+my $Id = '$Id: Utils.pm,v 1.21 2005/01/07 23:08:59 poliakov Exp $';
+($VERSION) = ($Id =~ /,v\s+(\d+\S+)/o);
+# Exported pattern (as a string) with four capture groups.
+$JOB = '^(\S+)\@(\S+?)_(\d{4})(?:_(.+)|)$';
+
+$Error = 0;            # last error message set by the functions below
+$Syslog = 0;           # set to 1 by openlogs(); makes wr_log() use syslog
+$Facility = "user";    # syslog facility used by stderr2log()/openlogs()
+$Msg_Prefix = undef;   # optional code ref; its result is prepended to messages
+
+my $E_FORK = "cannot fork";
+my @LOG_FILE = ();     # log targets registered through Set_Log()
+my %Locks = ();        # lock files currently held by this process
+
+# Strip leading and trailing whitespace from every argument, in place.
+# The elements of @_ alias the caller's variables, so the caller's scalars
+# are modified directly.
+sub Trim( @ ) {
+    foreach my $arg (@_) {
+        $arg =~ s/^\s+//;
+        $arg =~ s/\s+$//;
+    }
+}
+
+# Acquire a lock for $file by exclusively creating "$file.lock".
+#
+# Arguments:
+#   $file      - path of the file to protect (must be non-empty and must not
+#                end in "/")
+#   $retry     - when true (default 1), keep retrying while the lock exists
+#   $timeout   - seconds to keep retrying (default 1200); a value <= 0
+#                disables the time limit
+#   $max_mtime - an existing lock file older than this many seconds is
+#                removed as stale (default: half of $timeout; 0 disables)
+#
+# Returns 1 on success, or when this process already holds the lock (in which
+# case $Error is set to "Already locked").  Returns 0 on failure with the
+# reason in $Error.
+sub Lock_File( $ ; $ $ $ ) {
+    my ($file, $retry, $timeout, $max_mtime) = @_;
+    my ($lock_fh, $start_time, $mtime);
+
+    if (!$file || ($file =~ /\/$/o)) {
+        $Error = "Invalid filename";
+        return 0;
+    }
+    $file = Get_Abs_Path("$file.lock");
+    if (exists($Locks{$file})) { $Error = "Already locked"; return 1; }
+    # The lock file is created next to the target, so its directory must be
+    # writable.
+    if (!-w (Parse_Filename($file))[0]) {
+        $Error = "Permission denied";
+        return 0;
+    }
+    if (!defined($retry)) { $retry = 1; }
+    if (!defined($timeout)) { $timeout = 1200; }
+    if (!defined($max_mtime)) {
+        $max_mtime = ($timeout > 0) ? int($timeout / 2) : 0;
+    }
+    $start_time = time();
+    LOCK: {
+        if (!($lock_fh = IO::File->new($file, O_RDWR|O_CREAT|O_EXCL))) {
+            if (!$retry || (($timeout > 0) && ((time() - $start_time) > $timeout))) {
+                $Error = "Locked by someone else";
+                return 0;
+            }
+            if ($max_mtime > 0) {
+                $mtime = (stat($file))[9];
+                if ($mtime && ((time() - $mtime) > $max_mtime)) { unlink($file); }
+            }
+            # Pause briefly before the next attempt so that waiting for a
+            # lock does not spin at full CPU.
+            select(undef, undef, undef, 0.1);
+            redo LOCK;
+        }
+    }
+    $lock_fh->close();
+    $Locks{$file} = 1;
+    return 1;
+}
+
+# Release a lock previously taken with Lock_File() by removing
+# "$file.lock".  Returns 1 on success; returns 0 and sets $Error when the
+# name is empty, the lock is not held by this process, or the lock file
+# cannot be removed.
+sub Unlock_File( $ ) {
+    my ($file) = @_;
+
+    unless ($file) {
+        $Error = "Invalid filename";
+        return 0;
+    }
+    my $lock_path = Get_Abs_Path("$file.lock");
+    unless (exists($Locks{$lock_path})) {
+        $Error = "Not locked";
+        return 0;
+    }
+    unless (unlink($lock_path)) {
+        $Error = "Cannot unlock";
+        return 0;
+    }
+    delete($Locks{$lock_path});
+    return 1;
+}
+
+{
+    # Determine the short host name (text before the first dot) once, at
+    # load time, by running uname(1) from the first standard location that
+    # has it.
+    my $Uname;
+    foreach my $dir ('/bin', '/sbin', '/usr/bin', '/usr/sbin') {
+        -x "$dir/uname" and $Uname = "$dir/uname", last;
+    }
+    my $Host = $Uname ? `$Uname -n` : 'localhost';
+    defined($Host) or $Host = '';
+    chomp($Host);
+    ($Host) = ($Host =~ /^([^\.]+)(\..*)?$/);
+    # An empty or failed uname result would leave $Host undefined; use the
+    # same fallback as when uname is not available.
+    defined($Host) or $Host = 'localhost';
+
+# Append one line of the form "DATE HOST NAME[PID]: MSG" to a log.
+#
+# Arguments:
+#   $log_file - either something that resolves to an already-open filehandle
+#               through *{$log_file}{IO} (written to directly, no locking,
+#               left open) or a file path (locked with Lock_File(), opened
+#               for append, closed and unlocked afterwards).  The path
+#               "/dev/null" is a no-op that returns 1.
+#   $msg      - message text; one trailing newline is removed
+#   $name     - program name for the line (default $0)
+#   $pid      - process id for the line (default $$)
+#
+# Returns true on success and false on failure; on an I/O failure $Error is
+# set from $!.
+sub Write_Log( $ $ ; $ $ ) {
+    no strict "refs";
+    my ($log_file, $msg, $name, $pid) = @_;
+    my $error = 0;
+    my $date;
+    local *LOG;
+
+    if (!defined($log_file) || !defined($msg)) { return 0; }
+    if (*{$log_file}{IO}) {
+        *LOG = *{$log_file}{IO};
+    } elsif ($log_file eq '/dev/null') {
+        return 1;
+    } else {
+        if (!Lock_File($log_file)) { return 0; }
+        # Three-argument open: the file name is taken literally instead of
+        # being parsed together with the mode.
+        if (!open(LOG, ">>", $log_file)) { $error = 1; }
+    }
+    if (!$error) {
+        chomp($msg);
+        $date = localtime(time());
+        if (!$name) { $name = $0; }
+        if (!$pid) { $pid = $$; }
+        if (!print LOG "$date $Host $name\[$pid\]: $msg\n") { $error = 1; }
+        if (!*{$log_file}{IO}) { close(LOG); }
+    }
+    if ($error && $!) { $Error = "$!"; }
+    # Only paths were locked above, so only paths are unlocked here.
+    if (!*{$log_file}{IO}) { Unlock_File($log_file); }
+    return !$error;
+}}
+
+# Split a path at its last "/" and return (directory, file).
+# The directory part keeps its trailing slash and is "" when the path has no
+# slash at all.  An undefined argument yields an empty list.
+sub Parse_Filename( $ ) {
+    my ($name) = @_;
+
+    return () unless defined($name);
+    # rindex() gives -1 when there is no slash, which makes the cut point 0:
+    # empty directory, whole name as the file.
+    my $cut = rindex($name, "/") + 1;
+    return (substr($name, 0, $cut), substr($name, $cut));
+}
+
+# Expand a leading "~" or "~user" in $path to a home directory.
+# "~user" is looked up with getpwnam(); a bare "~" uses $HOME, then $LOGDIR,
+# then the passwd entry of the effective uid.  Paths that do not start with
+# "~" (and false values) are returned unchanged, as is the path when no home
+# directory can be determined.
+sub Expand_Path( $ ) {
+    my ($path) = @_;
+
+    return $path unless $path && ($path =~ /^~/o);
+    $path =~ /^~([^\/]*)(.*)$/o;
+    my ($user, $rest) = ($1, $2);
+    my $home_dir;
+    if ($user) {
+        $home_dir = (getpwnam($user))[7];
+    } else {
+        $home_dir = $ENV{"HOME"} || $ENV{"LOGDIR"} || (getpwuid($>))[7];
+    }
+    $path = "$home_dir$rest" if defined($home_dir);
+    return $path;
+}
+
+# Turn $path into a normalized absolute path.
+# Steps: expand a leading "~" via Expand_Path(), prefix the current working
+# directory when the path is relative, collapse runs of "/", then remove
+# "/./" and "/../" components textually (the file system is not consulted).
+# An undefined argument is returned as is.
+sub Get_Abs_Path( $ ) {
+    my ($path) = @_;
+
+    defined($path) or return $path;
+    $path = Expand_Path($path);
+    $path =~ /^\//o or $path = getcwd() . "/$path";
+    $path =~ s(/{2,})(/)g;
+
+# get rid of "/./"
+
+    while ($path =~ /^(.*?)\/\.(?:|\/(.*))$/o) {
+        # Test the capture with defined(), not truthiness, so that a
+        # remainder of "0" (a file or directory named 0) is kept.
+        $path = "$1/" . (defined($2) ? $2 : "");
+    }
+
+# get rid of "/../"
+
+    while ($path =~ /^(((?:.*?\/)*?)[^\/]+){0,1}?\/\.\.(?:|\/(.*))$/o) {
+        # $1 is the parent path plus the component being cancelled, $2 the
+        # parent path alone; $3 is whatever follows the "..".
+        $path = ($1 ? $2 : "/") . (defined($3) ? $3 : "");
+    }
+    return $path;
+}
+
+{
+    # Alphabet for generated keys; the random generator is seeded once when
+    # the module is loaded.
+    my @Chars = ("A" .. "Z", "a" .. "z", 0 .. 9);
+    srand();
+
+# Return a random alphanumeric string of $len characters.
+# $len must be an integer between 2 and 1024; any other value (including
+# no argument) falls back to 8.
+sub Get_Random_Key( ; $ ) {
+    my ($len) = @_;
+
+    if (!defined($len) || ($len !~ /^\d+$/o) || ($len < 2) || ($len > 1024)) {
+        $len = 8;
+    }
+    # Previously the count was hard-coded to 8, so a validated $len was
+    # silently ignored.
+    return join("", @Chars[map { int(rand(@Chars)) } (1 .. $len)]);
+}}
+
+# Decode every "%XX" escape (two hex digits) in $str to the character with
+# that code.  False values (undef, "", "0") are returned untouched.
+sub Hex2Ascii( $ ) {
+    my ($str) = @_;
+
+    return $str unless $str;
+    $str =~ s/%([a-fA-F0-9]{2})/pack("C", hex($1))/eg;
+    return $str;
+}
+
+{
+    # Characters that Ascii2Hex() replaces, each mapped to its "%XX" escape.
+    # The tab entry used to be "%29", which is the code for ")" and decoded
+    # to the wrong character; a tab is 0x09.
+    my $a2h = {
+        "\t" => "%09",
+        "+" => "%2B",
+        "," => "%2C",
+        "." => "%2E",
+        ";" => "%3B",
+        "/" => "%2F",
+        "?" => "%3F",
+        ":" => "%3A",
+        "@" => "%40",
+        "=" => "%3D",
+        "&" => "%26",
+        " " => "%20",
+        "<" => "%3C",
+        ">" => "%3E",
+        "\"" => "%22",
+        "%" => "%25",
+        "#" => "%23",
+        "[" => "%5B",
+        "]" => "%5D",
+        "{" => "%7B",
+        "}" => "%7D",
+        "|" => "%7C",
+        "\\" => "%5C",
+        "^" => "%5E",
+        "~" => "%7E",
+        "`" => "%60"};
+
+# Replace each character of $str that appears in the table above with its
+# "%XX" escape; all other characters are copied unchanged.  False values
+# (undef, "", "0") are returned as is.
+sub Ascii2Hex( $ ) {
+    my ($str) = @_;
+
+    if (!$str) { return $str; }
+    return join("",
+                map { exists($a2h->{$_}) ? $a2h->{$_} : $_ } split(//, $str));
+}}
+
+# Read record $rec from the registry file $conf_file and return the list
+# (dir, users, log, max_down, grace_period).
+# "dir" and "users" are required; "log" may be missing (returned as "").
+# All three are passed through Expand_Path() and must be absolute paths.
+# "max_down" and "grace_period" default to 0 unless they consist of digits
+# only.  On any failure an empty list is returned and $Error is set.
+sub Get_Config_Record( $ $ ) {
+    my ($conf_file, $rec) = @_;
+    my @result = ();
+
+    my $db = Registry->New($conf_file, "r", 1);
+    if (!$db) {
+        $Error = "$Registry::Error";
+        return ();
+    }
+    unless ($db->Record_Exists($rec)) {
+        $Error = qq("$rec" record not found);
+        return ();
+    }
+
+    # Path-valued fields.
+    foreach my $field (qw(dir users log)) {
+        my $value = Expand_Path($db->Get_Val($rec, $field));
+        if ($value) {
+            if ($value !~ /^\//o) {
+                $Error = qq("$field" field of "$rec" record should be absolute path);
+                return ();
+            }
+        } elsif ($field eq "log") {
+            $value = "";
+        } else {
+            $Error = qq("$field" field of "$rec" record is missing);
+            return ();
+        }
+        push(@result, $value);
+    }
+
+    # Numeric fields.
+    foreach my $field (qw(max_down grace_period)) {
+        my $value = $db->Get_Val($rec, $field);
+        if (!$value || ($value !~ /^\d+$/o)) {
+            $value = 0;
+        }
+        push(@result, $value);
+    }
+    return @result;
+}
+
+# Round $num to the nearest integer, with halves rounded up.
+# floor() is used instead of int(): int() truncates toward zero, which made
+# negative inputs round the wrong way (e.g. -2.7 became -2).  Results for
+# non-negative inputs are unchanged.
+sub Round( $ ) {
+    my ($num) = @_;
+
+    return POSIX::floor($num + 0.5);
+}
+
+# Write $msg to the log registered under slot $log_num (see Set_Log).
+# Nothing is written when the slot number is undefined, negative, or has no
+# log registered.
+sub Log( $ $ ) {
+    my ($log_num, $msg) = @_;
+
+    my $target = defined($log_num) && ($log_num >= 0) && $LOG_FILE[$log_num];
+    return $target && Write_Log($LOG_FILE[$log_num], $msg);
+}
+
+# Register $file as the log target for slot $log_num, for later use by
+# Log().  The call is ignored when the slot number is undefined or negative,
+# or when $file is false.
+sub Set_Log( $ $ ) {
+    my ($log_num, $file) = @_;
+
+    my $valid = defined($log_num) && ($log_num >= 0) && $file;
+    return $valid && ($LOG_FILE[$log_num] = $file);
+}
+
+# Return the numerically smaller of two values (the second when equal).
+sub Min( $ $ ) {
+    my ($i, $j) = @_;
+
+    return $j unless $i < $j;
+    return $i;
+}
+
+# Return the numerically larger of two values (the second when equal).
+sub Max( $ $ ) {
+    my ($i, $j) = @_;
+
+    return $j unless $i > $j;
+    return $i;
+}
+
+# Subtract the regions in $regs2 from every region in $regs1.
+#
+# Both arguments are references to arrays of regions; a region is an array
+# reference whose start and end values live at indices $s1/$e1 (for $regs1)
+# and $s2/$e2 (for $regs2).  Start indices default to 0, end indices to 1.
+# Every surviving piece is a shallow copy of the $regs1 region it came from,
+# with only its start and end fields adjusted.  When $strict is true, pieces
+# whose start is not less than their end are dropped.
+#
+# Returns a reference to a new array of pieces, or $regs1 itself when either
+# list argument is false.
+# NOTE(review): the inner loop stops at the first $regs2 region that starts
+# after the current end, which presumably relies on $regs2 being sorted by
+# start position -- confirm against callers.
+sub Reg_Diff( $ $ ; $ $ $ $ $ ) {
+ my ($regs1, $regs2, $strict, $s1, $e1, $s2, $e2) = @_;
+ my (@new_regs, $start, $end, $new_reg);
+
+ $regs1 && $regs2 or return $regs1;
+ $s1 ||= 0;
+ defined($e1) or $e1 = 1;
+ $s2 ||= 0;
+ defined($e2) or $e2 = 1;
+ for (my $i = 0; $i < @$regs1; ++$i) {
+ # [$start, $end] is the part of region $i that has not been removed yet.
+ $start = $$regs1[$i][$s1];
+ $end = $$regs1[$i][$e1];
+ for (my $j = 0; $j < @$regs2; ++$j) {
+ # Region $j begins after the remainder: stop scanning.
+ $$regs2[$j][$s2] > $end and last;
+ # Region $j ends before the remainder: no overlap, try the next one.
+ $$regs2[$j][$e2] < $start and next;
+ # Region $j covers the whole remainder: nothing survives.
+ if (($$regs2[$j][$s2] <= $start) && ($$regs2[$j][$e2] >= $end)) {
+ undef($start), last;
+ }
+ # Region $j covers the tail of the remainder: cut the tail off.
+ if (($$regs2[$j][$s2] > $start) && ($$regs2[$j][$e2] >= $end)) {
+ $end = $$regs2[$j][$s2] - 1, last;
+ }
+ # Region $j covers the head of the remainder: cut the head off.
+ if (($$regs2[$j][$s2] <= $start) && ($$regs2[$j][$e2] < $end)) {
+ $start = $$regs2[$j][$e2] + 1, next;
+ }
+ # Region $j lies strictly inside the remainder: emit the piece in front
+ # of it and continue with what follows it.
+ ($start < ($$regs2[$j][$s2] - 1)) || !$strict and
+ $new_reg = [@{$$regs1[$i]}],
+ $$new_reg[$s1] = $start,
+ $$new_reg[$e1] = $$regs2[$j][$s2] - 1,
+ push(@new_regs, $new_reg);
+ $start = $$regs2[$j][$e2] + 1;
+ }
+ # Skip regions that were removed entirely, then emit whatever is left.
+ !defined($start) || ($start > $end) and next;
+ ($start < $end) || !$strict and
+ $new_reg = [@{$$regs1[$i]}],
+ $$new_reg[$s1] = $start,
+ $$new_reg[$e1] = $end,
+ push(@new_regs, $new_reg);
+ }
+ return \@new_regs;
+}
+
+# Remove overlaps between consecutive regions of a list.
+#
+# $regs is a reference to an array of regions (array references); $s and $e
+# are the indices of the start and end values inside a region (defaults 0
+# and 1).  The function works on shallow copies, so the regions in $regs are
+# not modified.  When a region starts at or before the end of the previous
+# one, it is either dropped (if it also ends at or before that end) or its
+# start is moved to just past that end.  With $strict true, regions whose
+# start is not less than their end are removed as well.
+#
+# Returns a reference to the new array, or $regs itself when it is false.
+# NOTE(review): only neighbouring entries are compared, so the input is
+# presumably expected to be sorted by start (see Reg_Sort) -- confirm.
+sub Reg_Rem_Overlap( $ ; $ $ $ ) {
+ my ($regs, $strict, $s, $e) = @_;
+ my (@new_regs);
+
+ $regs or return $regs;
+ $s ||= 0;
+ defined($e) or $e = 1;
+ # Copy each region so that the caller's data stays untouched.
+ for (my $i = 0; $i < @$regs; ++$i) { push(@new_regs, [@{$$regs[$i]}]); }
+ for (my $i = 0; $i < @new_regs; ++$i) {
+ if (($i < $#new_regs) && ($new_regs[$i + 1][$s] <= $new_regs[$i][$e])) {
+ # The next region is fully contained in this one: delete it and
+ # re-examine the current index (--$i undoes the loop's ++$i).
+ $new_regs[$i + 1][$e] <= $new_regs[$i][$e] and
+ splice(@new_regs, $i + 1, 1),
+ --$i, next;
+ # Partial overlap: start the next region right after this one ends.
+ $new_regs[$i + 1][$s] = $new_regs[$i][$e] + 1;
+ }
+ # Keep the region unless strict mode is on and start >= end.
+ ($new_regs[$i][$s] < $new_regs[$i][$e]) || !$strict and next;
+ splice(@new_regs, $i, 1);
+ --$i;
+ }
+ return \@new_regs;
+}
+
+# Return a reference to a new array holding the regions of $regs sorted
+# numerically by start and then by end.  $s and $e are the indices of the
+# start and end values inside each region (defaults 0 and 1); a true $rev
+# reverses the order.  A false $regs is returned unchanged.
+sub Reg_Sort( $ ; $ $ $ ) {
+    my ($regs, $rev, $s, $e) = @_;
+
+    return $regs unless $regs;
+    $s = 0 unless $s;
+    $e = 1 unless defined($e);
+
+    # Compare two regions: start first, end as the tie-breaker.
+    my $by_position = sub {
+        my ($x, $y) = @_;
+        return ($$x[$s] <=> $$y[$s]) || ($$x[$e] <=> $$y[$e]);
+    };
+    my @new_regs = $rev
+        ? (sort { $by_position->($b, $a) } @$regs)
+        : (sort { $by_position->($a, $b) } @$regs);
+    return \@new_regs;
+}
+
+# Intersect two region lists.
+# The result is computed as $regs1 minus ($regs1 minus $regs2), using
+# Reg_Diff() for both subtractions.  $s1/$e1 and $s2/$e2 are the start/end
+# indices inside the regions of each list (defaults 0 and 1) and $strict is
+# handed on to Reg_Diff().  Returns undef when either list is false.
+sub Reg_Intersect( $ $ ; $ $ $ $ $ ) {
+    my ($regs1, $regs2, $strict, $s1, $e1, $s2, $e2) = @_;
+
+    return undef unless $regs1 && $regs2;
+    $s1 = 0 unless $s1;
+    $e1 = 1 unless defined($e1);
+    $s2 = 0 unless $s2;
+    $e2 = 1 unless defined($e2);
+
+    # Parts of $regs1 that are NOT covered by $regs2 ...
+    my $uncovered = Reg_Diff($regs1, $regs2, $strict, $s1, $e1, $s2, $e2);
+    # ... removed from $regs1 leave exactly the covered parts.  The
+    # uncovered pieces use the field layout of $regs1.
+    return Reg_Diff($regs1, $uncovered, $strict, $s1, $e1, $s1, $e1);
+}
+
+# Join directly adjacent regions of a list.
+#
+# $regs is a reference to an array of regions (array references); $s and $e
+# are the indices of the start and end values inside a region (defaults 0
+# and 1).  The function works on shallow copies, so the regions in $regs are
+# not modified.  Two consecutive regions are joined when the second starts
+# exactly one past the end of the first; the joined region keeps the first
+# region's other fields and takes the second region's end.  With $strict
+# true, regions whose start is not less than their end are removed
+# afterwards.
+#
+# Returns a reference to the new array, or $regs itself when it is false.
+sub Reg_Merge( $ ; $ $ $ ) {
+ my ($regs, $strict, $s, $e) = @_;
+ my (@new_regs);
+
+ $regs or return $regs;
+ $s ||= 0;
+ defined($e) or $e = 1;
+ # Copy each region so that the caller's data stays untouched.
+ for (my $i = 0; $i < @$regs; ++$i) { push(@new_regs, [@{$$regs[$i]}]); }
+ # Pass 1: absorb the next region while it is adjacent.  --$i undoes the
+ # loop's ++$i so the grown region is compared with its new neighbour.
+ for (my $i = 0; $i < @new_regs; ++$i) {
+ ($i < $#new_regs) &&
+ ($new_regs[$i + 1][$s] == ($new_regs[$i][$e] + 1)) and
+ $new_regs[$i][$e] = $new_regs[$i + 1][$e],
+ splice(@new_regs, $i + 1, 1),
+ --$i, next;
+ }
+ # Pass 2: in strict mode drop regions with start >= end.
+ for (my $i = 0; $i < @new_regs; ++$i) {
+ ($new_regs[$i][$s] < $new_regs[$i][$e]) || !$strict and next;
+ splice(@new_regs, $i, 1);
+ --$i;
+ }
+ return \@new_regs;
+}
+
+# List the entries of directory $dir (default ".") whose names match the
+# regular expression $regexp (default ".*").  No shell is involved.
+# Returns the matching names in list context and their count in scalar
+# context; returns nothing when the directory cannot be opened.
+sub safe_glob {
+    my ($regexp, $dir) = @_;
+    local (*DIR);
+
+    $dir = "." unless $dir;
+    $regexp = ".*" unless $regexp;
+    opendir(DIR, $dir) or return;
+    my @matches = grep { $_ =~ /$regexp/ } readdir(DIR);
+    closedir(DIR);
+    return scalar(@matches) unless wantarray();
+    return @matches;
+}
+
+# Send STDERR to the system log through stderr2log().
+# $facility, when given, replaces the module-wide $Facility first.  A
+# missing or empty value now leaves the current facility in place (as
+# openlogs() already does) instead of overwriting it with undef, which
+# produced an invalid "-p .err" priority for logger.
+sub redirect_err2log {
+    my ($facility) = @_;
+
+    $facility and $Facility = $facility;
+    stderr2log();
+}
+
+# Reopen STDERR as a pipe to logger(1), using priority "$Facility.err" and a
+# tag made of the program name and process id, and make the handle
+# unbuffered.
+sub stderr2log {
+    open(STDERR, "> /dev/null");
+    open(STDERR, "| logger -p $Facility.err -t '$0\[$$\]'");
+
+    # Turn on autoflush for STDERR, then restore the previously selected
+    # output handle.
+    my $previous = select(STDERR);
+    $| = 1;
+    select($previous);
+}
+
+# Switch the process to syslog-based logging: redirect STDERR with
+# stderr2log(), open a syslog connection over the "unix" socket tagged with
+# the program name and pid, and set $Syslog so that wr_log() uses syslog.
+# $facility, when true, replaces the module-wide $Facility.
+sub openlogs {
+    my ($facility) = @_;
+
+    if ($facility) {
+        $Facility = $facility;
+    }
+    stderr2log();
+    setlogsock("unix");
+    openlog($0, "pid", $Facility);
+    $Syslog = 1;
+}
+
+# Detach the current program from its parent and run it in the background.
+# The parent process exits with status 0.  The child starts a new session,
+# closes STDIN, points STDOUT at /dev/null and sets up logging through
+# openlogs($facility).  A failed fork is reported with wr_err() and is
+# fatal.
+sub daemon {
+    my ($facility) = @_;
+
+    my $pid = fork();
+    if (!defined($pid)) {
+        wr_err("$E_FORK: $!");
+        die;
+    }
+    if ($pid) {
+        # Parent: its work is done.
+        exit(0);
+    }
+
+    # Child.
+    setsid();
+    close(STDIN);
+    close(STDOUT);
+    open(STDOUT, "> /dev/null");
+    openlogs($facility);
+}
+
+# Fork a helper process that runs the code reference $watcher.
+# The parent returns immediately.  The child starts a new session, closes
+# STDIN, points STDOUT at /dev/null, appends "_watcher" to its program name,
+# sets up logging through openlogs($facility) and then calls
+# $watcher with the parent's pid followed by @params.  A failed fork is
+# reported with wr_err() and is fatal.
+# NOTE(review): if $watcher returns, the child returns from this function
+# into the caller's code -- presumably the watcher is expected to exit on
+# its own; confirm.
+sub start_watcher {
+    my ($watcher, $facility, @params) = @_;
+
+    my $parent = $$;
+    my $pid = fork();
+    if (!defined($pid)) {
+        wr_err("$E_FORK: $!");
+        die;
+    }
+    if ($pid) {
+        # Parent: carry on with normal work.
+        return;
+    }
+
+    # Child.
+    setsid();
+    close(STDIN);
+    close(STDOUT);
+    open(STDOUT, "> /dev/null");
+    $0 .= "_watcher";
+    openlogs($facility);
+    $watcher->($parent, @params);
+}
+
+# Emit an informational message.
+# One trailing newline is removed from the message and, when $Msg_Prefix
+# holds a code reference, its result is put in front.  The message goes to
+# syslog at level "info" when $Syslog is set, otherwise to STDOUT followed
+# by a newline.
+sub wr_log {
+    my $msg = shift;
+
+    chomp($msg);
+    # &$Msg_Prefix (no parentheses) is deliberate: the callback sees the
+    # remaining arguments of this call.
+    my $prefix = $Msg_Prefix ? &$Msg_Prefix : "";
+    $msg = $prefix . $msg;
+    if (!$Syslog) {
+        print "$msg\n";
+    } else {
+        syslog("info", "%s", $msg);
+    }
+}
+
+# Print an error message to STDERR and return 1.
+# One trailing newline is removed from the message and a single newline is
+# added back; when $Msg_Prefix holds a code reference, whatever it returns
+# is printed first.
+sub wr_err {
+    my $msg = shift;
+
+    chomp($msg);
+    # &$Msg_Prefix (no parentheses) is deliberate: the callback sees the
+    # remaining arguments of this call.  It is evaluated in list context.
+    my @output = ($Msg_Prefix ? &$Msg_Prefix : "");
+    push(@output, "$msg\n");
+    print STDERR @output;
+    return 1;
+}
+
+# Print the prompt $msg, read one line from STDIN and return 1 when the
+# answer is "y" or "yes" (case-insensitive), 0 otherwise.
+# End of input (no line available) counts as "no" instead of running chomp
+# and the pattern match on an undefined value.
+sub confirm {
+    my ($msg) = @_;
+    my ($ans);
+
+    print $msg;
+    $ans = <STDIN>;
+    defined($ans) or return 0;
+    chomp($ans);
+    return ($ans =~ /^(y|yes)$/io) ? 1 : 0;
+}
+
+# At program exit, remove every lock file this process still holds.
+END {
+    unlink($_) foreach keys(%Locks);
+}
+
+1;
diff --git a/anal_gloc.pl b/anal_gloc.pl
new file mode 100755
index 0000000..644d952
--- /dev/null
+++ b/anal_gloc.pl
@@ -0,0 +1,142 @@
+#!/usr/bin/env perl
+
+$savname1 = ""; # header pair of the previous record, compared against each new pair
+$savname2 = "";
+$skip = 0; # set when a record repeats the previous header pair
+$endblock = 0; # per-line flags: +1 strand change, +2 jump in the second-sequence coordinate
+$score = 0; # running score of the region being accumulated
+$strand = ""; # strand of the current region ("" until the first anchor line is seen)
+$initstrnd; # no-op statement; the variable is first assigned inside the main loop
+$s1s = 999999999;
+$s2s = 999999999;
+$first = 1; # true until the first anchor line of a record has been accepted
+$plus_sc = 0; # summed region scores per strand
+$minus_sc = 0;
+
+
+while ($line = <STDIN>) { # one pass over STDIN: ">" header pairs followed by anchor lines
+
+ if ($line =~ /^>/) {
+ if (!$first) { # a previous record has an open region: print it and its summary
+ if ($strand eq "+") {
+ print STDOUT " Region [$s1s $reg1s][$s2s $reg2s] $score $strand\n";
+ }
+ else {
+ print STDOUT " Region [$s1s $reg1s][$reg2s $s2s] $score $strand\n";
+ }
+
+ if ($strand ne $initstrnd) {
+ print STDOUT "INV\n" # last region is on a different strand than the record's first one
+ }
+ if ($strand eq "+") { $plus_sc += $score; }
+ else { $minus_sc += $score; }
+ if ($plus_sc > $minus_sc) {
+ print STDOUT "Main score (+) $plus_sc; Inverted $minus_sc\n";
+ }
+ else {
+ print STDOUT "Main score (-) $minus_sc; Inverted $plus_sc\n";
+ }
+ $plus_sc = 0; # reset the per-record state
+ $minus_sc = 0;
+ $score = 0;
+ $s1s = 999999999;
+ $s2s = 999999999;
+ $strand = "";
+ }
+ $first = 1;
+ $name1 = $line;
+ chomp $name1;
+ $line = <STDIN>; # a header line must be followed directly by a second header line
+ if ($line !~ /^>/) {
+ print STDERR "Expecting a name, but got $line";
+ exit (1);
+ }
+ $name2 = $line;
+ chomp $name2;
+ $inblock = 1; # not read anywhere else in this script
+ $skip = 0;
+ if (($name1 eq $savname1) && ($name2 eq $savname2)) {
+ $skip = 1; # same header pair as the previous record: its anchor lines are ignored
+ }
+ else { print STDOUT "$name1 $name2\n"; }
+
+ $savname1 = $name1;
+ $savname2 = $name2;
+ }
+ elsif (!$skip) {
+ $endblock = 0;
+ $line =~ /\((\d+) (\d+)\)=\((\d+) (\d+)\) ([0-9\.]*) (.) (.*)/; # "(a b)=(c d) score strand rest"; the match result is not checked
+ if ($1 == 0 || $3 == 0) {
+ next; # skip lines whose first or third coordinate is 0
+ }
+# print STDOUT "strand $strand $s2s $4\n";
+ if (($strand eq "+") && ($6 eq "+") && ($s2s + 20 < $4) ) {
+ $endblock += 2; # same strand, but the fourth coordinate lies more than 20 past $s2s
+ }
+ if (($strand eq "-") && ($6 eq "-") && ($s2s > $4 + 20) ) {
+ $endblock += 2; # mirror-image test for the "-" strand
+ }
+ if ($strand eq "") { $strand = $6; }
+ if ($6 ne $strand) {
+ $endblock += 1; # strand differs from the current region
+ }
+
+ if (!$endblock) { # line extends the current region
+ $s2s = $3;
+ $s1s = $1;
+ $s1e = $2;
+ $s2e = $4;
+ $score += $5;
+ if ($first) { # first accepted line of the record fixes the region's other bounds
+ print STDOUT " ";
+ print STDOUT " ";
+ $initstrnd = $strand;
+ $reg1s = $2;
+ $reg2s = $4;
+ $first = 0;
+ }
+ }
+ else { # line ends the current region: print it and start a new one from this line
+ if ($strand eq "+") {
+ print STDOUT " Region [$s1s $reg1s][$s2s $reg2s] $score $strand\n";
+ }
+ else {
+ print STDOUT " Region [$s1s $reg1s][$reg2s $s2s] $score $strand\n";
+ }
+
+ if ($strand eq "+") { $plus_sc += $score; }
+ else { $minus_sc += $score; }
+
+ if ($endblock %2) { print STDOUT "INV "; } # odd value: the strand-change flag is set
+ else {print STDOUT " "; }
+ if ($endblock > 1) { print STDOUT "TRL "; } # value above 1: the coordinate-jump flag is set
+ else {print STDOUT " "; }
+ $s2s = $3;
+ $s1s = $1;
+ $s1e = $2;
+ $s2e = $4;
+ $reg1s = $s1e;
+ $reg2s = $s2e;
+ $score = $5;
+ $strand = $6;
+ # print STDOUT "strand $strand\n";
+ }
+ }
+}
+unless ($first) { # a region is still open at end of input: print it and add its score
+ if ($strand ne "+") {
+ print STDOUT " Region [$s1s $reg1s][$reg2s $s2s] $score $strand\n";
+ $minus_sc += $score;
+ }
+ else {
+ print STDOUT " Region [$s1s $reg1s][$s2s $reg2s] $score $strand\n";
+ $plus_sc += $score;
+ }
+}
+
+if ($plus_sc <= $minus_sc) { # summary for the last record; ties are reported as (-)
+ print STDOUT "Main score (-) $minus_sc; Inverted $plus_sc\n";
+}
+else {
+ print STDOUT "Main score (+) $plus_sc; Inverted $minus_sc\n";
+}
diff --git a/blosum62.txt b/blosum62.txt
new file mode 100644
index 0000000..15aa43e
--- /dev/null
+++ b/blosum62.txt
@@ -0,0 +1,25 @@
+ A R N D C Q E G H I L K M F P S T W Y V B Z X *
+A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -4
+R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -4
+N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 3 0 -1 -4
+D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 1 -1 -4
+C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4
+Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 3 -1 -4
+E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4
+G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 -4
+H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0 -1 -4
+I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 -3 -1 -4
+L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 -3 -1 -4
+K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 1 -1 -4
+M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 -1 -1 -4
+F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 -3 -1 -4
+P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -1 -2 -4
+S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0 0 -4
+T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 0 -4
+W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -3 -2 -4
+Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -2 -1 -4
+V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1 -4
+B -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -4
+Z -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4
+X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1 -4
+* -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1
diff --git a/blosum62s.txt b/blosum62s.txt
new file mode 100644
index 0000000..0ae216d
--- /dev/null
+++ b/blosum62s.txt
@@ -0,0 +1,25 @@
+ A R N D C Q E G H I L K M F P S T W Y V B Z X *
+A 223 -55 -111 -111 0 -55 -55 0 -111 -55 -55 -55 -55 -111 -55 55 0 -167 -111 0 -111 -55 0 -223
+R -55 278 0 -111 -167 55 0 -111 0 -167 -111 111 -55 -167 -111 -55 -55 -167 -111 -167 -55 0 -55 -223
+N -111 0 334 55 -167 0 0 0 55 -167 -167 0 -111 -167 -111 55 0 -223 -111 -167 167 0 -55 -223
+D -111 -111 55 334 -167 0 111 -55 -55 -167 -223 -55 -167 -167 -55 0 -55 -223 -167 -167 223 55 -55 -223
+C 0 -167 -167 -167 502 -167 -223 -167 -167 -55 -55 -167 -55 -111 -167 -55 -55 -111 -111 -55 -167 -167 -111 -223
+Q -55 55 0 0 -167 278 111 -111 0 -167 -111 55 0 -167 -55 0 -55 -111 -55 -111 0 167 -55 -223
+E -55 0 0 111 -223 111 278 -111 0 -167 -167 55 -111 -167 -55 0 -55 -167 -111 -111 55 223 -55 -223
+G 0 -111 0 -55 -167 -111 -111 334 -111 -223 -223 -111 -167 -167 -111 0 -111 -111 -167 -167 -55 -111 -55 -223
+H -111 0 55 -55 -167 0 0 -111 446 -167 -167 -55 -111 -55 -111 -55 -111 -111 111 -167 0 0 -55 -223
+I -55 -167 -167 -167 -55 -167 -167 -223 -167 223 111 -167 55 0 -167 -111 -55 -167 -55 167 -167 -167 -55 -223
+L -55 -111 -167 -223 -55 -111 -167 -223 -167 111 223 -111 111 0 -167 -111 -55 -111 -55 55 -223 -167 -55 -223
+K -55 111 0 -55 -167 55 55 -111 -55 -167 -111 278 -55 -167 -55 0 -55 -167 -111 -111 0 55 -55 -223
+M -55 -55 -111 -167 -55 0 -111 -167 -111 55 111 -55 278 0 -111 -55 -55 -55 -55 55 -167 -55 -55 -223
+F -111 -167 -167 -167 -111 -167 -167 -167 -55 0 0 -167 0 334 -223 -111 -111 55 167 -55 -167 -167 -55 -223
+P -55 -111 -111 -55 -167 -55 -55 -111 -111 -167 -167 -55 -111 -223 390 -55 -55 -223 -167 -111 -111 -55 -111 -223
+S 55 -55 55 0 -55 0 0 0 -55 -111 -111 0 -55 -111 -55 223 55 -167 -111 -111 0 0 0 -223
+T 0 -55 0 -55 -55 -55 -55 -111 -111 -55 -55 -55 -55 -111 -55 55 278 -111 -111 0 -55 -55 0 -223
+W -167 -167 -223 -223 -111 -111 -167 -111 -111 -167 -111 -167 -55 55 -223 -167 -111 613 111 -167 -223 -167 -111 -223
+Y -111 -111 -111 -167 -111 -55 -111 -167 111 -55 -55 -111 -55 167 -167 -111 -111 111 390 -55 -167 -111 -55 -223
+V 0 -167 -167 -167 -55 -111 -111 -167 -167 167 55 -111 55 -55 -111 -111 0 -167 -55 223 -167 -111 -55 -223
+B -111 -55 167 223 -167 0 55 -55 0 -167 -223 0 -167 -167 -111 0 -55 -223 -167 -167 223 55 -55 -223
+Z -55 0 0 55 -167 167 223 -111 0 -167 -167 55 -55 -167 -55 0 -55 -167 -111 -111 55 223 -55 -223
+X 0 -55 -55 -55 -111 -55 -55 -55 -55 -55 -55 -55 -55 -55 -111 0 0 -111 -55 -55 -55 -55 -55 -223
+* -223 -223 -223 -223 -223 -223 -223 -223 -223 -223 -223 -223 -223 -223 -223 -223 -223 -223 -223 -223 -223 -223 -223 55
diff --git a/debian/changelog b/debian/changelog
deleted file mode 100644
index ed0f0d0..0000000
--- a/debian/changelog
+++ /dev/null
@@ -1,13 +0,0 @@
-lagan (2.0-1) UNRELEASED; urgency=low
-
- * Initial release
- * TODO:
- - fix installation location possibly wrapper script featuring
- some $lagandir path variable (/usr/lib/lagan ???)
- - manpages
- - permissions
- - hardening
- - testing
-
-
- -- Andreas Tille <tille at debian.org> Fri, 15 Nov 2013 10:31:20 +0100
diff --git a/debian/compat b/debian/compat
deleted file mode 100644
index ec63514..0000000
--- a/debian/compat
+++ /dev/null
@@ -1 +0,0 @@
-9
diff --git a/debian/control b/debian/control
deleted file mode 100644
index 7b911f5..0000000
--- a/debian/control
+++ /dev/null
@@ -1,24 +0,0 @@
-Source: lagan
-Maintainer: Debian Med Packaging Team <debian-med-packaging at lists.alioth.debian.org>
-Uploaders: Andreas Tille <tille at debian.org>
-Section: science
-Priority: optional
-Build-Depends: debhelper (>= 9),
- libboost1.54-dev
-Standards-Version: 3.9.4
-Vcs-Browser: http://anonscm.debian.org/viewvc/debian-med/trunk/packages/lagan/trunk/
-Vcs-Svn: svn://anonscm.debian.org/debian-med/trunk/packages/lagan/trunk/
-Homepage: http://lagan.stanford.edu/lagan_web/index.shtml
-
-Package: lagan
-Architecture: any
-Depends: ${shlibs:Depends},
- ${misc:Depends}
-Description: highly parametrizable pairwise global alignment program
- Lagan takes local alignments generated by CHAOS as anchors, and limits
- the search area of the Needleman-Wunsch algorithm around these anchors.
- .
- Multi-LAGAN is a generalization of the pairwise algorithm to multiple
- sequence alignment. M-LAGAN performs progressive pairwise alignments,
- guided by a user-specified phylogenetic tree. Alignments are aligned to
- other alignments using the sum-of-pairs metric.
diff --git a/debian/copyright b/debian/copyright
deleted file mode 100644
index 8c60067..0000000
--- a/debian/copyright
+++ /dev/null
@@ -1,32 +0,0 @@
-Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
-Upstream-Contact: Michael Brudno <brudno at cs.toronto.edu>
-Source: http://lagan.stanford.edu/lagan_web/lagan20.tar.gz
-Files-Excluded:
- prolagan
- src/glocal/glocal
-
-Files: *
-Copyright: © 2003-2006 Michael Brudno, Chuong Do, et. al.
-License: GPLv2+
-
-Files: debian/*
-Copyright: © 2012-2013 Andreas Tille <tille at debian.org>
-License: GPLv2+
-
-License: GPLv2+
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
- .
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
- .
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- .
- On Debian systems you can find the full text of the GPL 2+ license
- at /usr/share/common-licenses/GPL-2.
diff --git a/debian/docs b/debian/docs
deleted file mode 100644
index b3b3c2b..0000000
--- a/debian/docs
+++ /dev/null
@@ -1 +0,0 @@
-Readmes/R*
diff --git a/debian/install b/debian/install
deleted file mode 100644
index bd4082e..0000000
--- a/debian/install
+++ /dev/null
@@ -1,6 +0,0 @@
-anchors usr/bin
-chaos usr/bin
-glocal usr/bin
-mlagan usr/bin
-order usr/bin
-utils usr/lib/lagan
diff --git a/debian/patches/do_not_define_conflicting_getline.patch b/debian/patches/do_not_define_conflicting_getline.patch
deleted file mode 100644
index 075753a..0000000
--- a/debian/patches/do_not_define_conflicting_getline.patch
+++ /dev/null
@@ -1,24 +0,0 @@
-Author: Andreas Tille <tille at debian.org>
-LastChanged: Fri, 15 Nov 2013 10:31:20 +0100
-Description: Prevent conflicting getline by simply renaming it
-
---- a/src/anchors.c
-+++ b/src/anchors.c
-@@ -225,7 +225,7 @@ char* rolltonum(char* str) {
- return &str[i];
- }
-
--int getline(FILE* infile, hll* tt) {
-+int anchors_getline(FILE* infile, hll* tt) {
- char temp[1024];
- char* help;
- int z, h;
-@@ -248,7 +248,7 @@ hll* parseCHAOS(FILE* infile, int* totnu
- *totnum = 0;
- while(!feof(infile)) {
- tt = (hll*) malloc(sizeof(hll));
-- while (!feof(infile) && !getline(infile, tt))
-+ while (!feof(infile) && !anchors_getline(infile, tt))
- ;
- if (feof(infile)) break;
- if (gapfreechunks) {
diff --git a/debian/patches/gcc-4.8.patch b/debian/patches/gcc-4.8.patch
deleted file mode 100644
index 3d7ed78..0000000
--- a/debian/patches/gcc-4.8.patch
+++ /dev/null
@@ -1,25 +0,0 @@
-Author: Andreas Tille <tille at debian.org>
-LastChanged: Fri, 15 Nov 2013 10:31:20 +0100
-Description: Fix some includes to build using gcc-4.8
-
---- a/src/utils/Glue.cpp
-+++ b/src/utils/Glue.cpp
-@@ -6,6 +6,7 @@
- #include <fstream>
- #include <iostream>
- #include <algorithm>
-+#include <string.h>
-
- #define NUCLEOTIDE_MATRIX_FILE "nucmatrix.txt"
- #define MAX_LINE_LENGTH 1024
---- a/src/glocal/score.cpp
-+++ b/src/glocal/score.cpp
-@@ -2,7 +2,7 @@
- #include<score.h>
- #include<leftinfluence.h>
- #include<rightinfluence.h>
--#include<fstream.h>
-+#include<fstream>
-
- extern vector<class Score*> scoreFunctions[1<<(UPSTRANDBITS+DOWNSTRANDBITS+RELPOSBITS)];
-
diff --git a/debian/patches/series b/debian/patches/series
deleted file mode 100644
index 8b30535..0000000
--- a/debian/patches/series
+++ /dev/null
@@ -1,2 +0,0 @@
-do_not_define_conflicting_getline.patch
-gcc-4.8.patch
diff --git a/debian/rules b/debian/rules
deleted file mode 100755
index dc081e1..0000000
--- a/debian/rules
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/usr/bin/make -f
-
-# DH_VERBOSE := 1
-
-%:
- dh $@
-
-get-orig-source:
- mkdir -p ../tarballs
- uscan --verbose --force-download --repack-compression xz --destdir=../tarballs
diff --git a/debian/source/format b/debian/source/format
deleted file mode 100644
index 163aaf8..0000000
--- a/debian/source/format
+++ /dev/null
@@ -1 +0,0 @@
-3.0 (quilt)
diff --git a/debian/upstream/metadata b/debian/upstream/metadata
deleted file mode 100644
index 7de6f91..0000000
--- a/debian/upstream/metadata
+++ /dev/null
@@ -1,12 +0,0 @@
-Reference:
- Author: Michael Brudno and Chuong Do and Gregory Cooper and Michael F. Kim and Eugene Davydov and Eric D. Green and Arend Sidow and Serafim Batzoglou
- Title: "LAGAN and Multi-LAGAN: efficient tools for large-scale multiple alignment of genomic DNA"
- Journal: Genome Research
- Year: 2003
- Volume: 13
- Number: 4
- Pages: 721-31
- DOI: 10.1101/gr.926603
- PMID: 12654723
- URL: http://genome.cshlp.org/content/13/4/721
- eprint: http://genome.cshlp.org/content/13/4/721.full.pdf+html
diff --git a/debian/watch b/debian/watch
deleted file mode 100644
index 5ec7260..0000000
--- a/debian/watch
+++ /dev/null
@@ -1,8 +0,0 @@
-version=3
-
-# normally this would be something like:
-# echo 20 |sed 's/\(.\)/\1\./g;s/.$//'
-# but somehow this does not work here
-
-opts=uversionmangle=s/2/2\./\
- http://lagan.stanford.edu/lagan_web/citing.shtml lagan(.*)\.tar\.gz
diff --git a/lagan.pl b/lagan.pl
new file mode 100755
index 0000000..19e53fa
--- /dev/null
+++ b/lagan.pl
@@ -0,0 +1,242 @@
+#!/usr/bin/env perl
+
+$lagandir = $ENV{LAGAN_DIR}; # every helper program below is run from this directory
+$consrate = 45; # rates handed to utils/scorealign further down
+$consupperrate = 65;
+
+if (@ARGV < 2) { # two sequence files are mandatory
+ print ("usage:\n lagan seqfile1 seqfile2 [-chaos \"chaos flags\"] [-order \"order flags\"] [-recurse \"(wl1,nd1,co1,rsc1),(wl2,nd2,co2,rsc2),...\"] [-bin] [-mfa] [-out \"filename\"] [-lazy] [-maskedonly] [-debug] [-usebounds] [-rc] [-translate] [-draft] [-info] [-fastreject]\n");
+ exit(1);
+}
+
+$firstName = $ARGV[0];
+$secondName = $ARGV[1];
+$rcFlag = 0; # set by -rc
+$arglist = ""; # extra arguments passed on to rechaos.pl
+$contigflag = 0; # set by -usebounds
+$infofile = 0; # counter: incremented by -info and -out, later increased by $okformat
+$okformat = 0; # set when an explicit output format (-bin/-mfa/-xmfa/-fastreject) was chosen
+$binfile = 0;
+$infofilename = "alignment";
+$direction = "+"; # becomes "-" with -rc
+$gfc = " -gfc ";
+$rundraft = 0;
+$draftparams = ""; # arguments passed on to draft.pl
+$dofastreject = 0;
+$doxmfa = 0;
+$filename = ""; # output file given with -out
+$format = "";
+
+for ($i = 2; $i < @ARGV; $i++) { # options follow the two sequence files; options with values consume them via ++$i
+ if ($ARGV[$i] =~ /-order/) {
+ $orderfl = $ARGV[++$i];
+ }
+ elsif ($ARGV[$i] =~ /-bin/) {
+ $orderfl = $orderfl." -bin";
+ $binfile = 1;
+ $okformat = 1;
+ }
+ elsif ($ARGV[$i] =~ /-info/) {
+ $infofile++;
+ }
+ elsif ($ARGV[$i] =~ /-mfa/) {
+ $orderfl = $orderfl." -mfa";
+ $okformat = 1;
+ }
+ elsif ($ARGV[$i] =~ /-xmfa/) {
+ $orderfl = $orderfl." -xmfa";
+ $doxmfa = 1;
+ $okformat = 1;
+ }
+ elsif ($ARGV[$i] =~ /-out/) {
+ $filename = $ARGV[++$i];
+ $infofile++;
+ $infofilename = $ARGV[$i];
+ }
+ elsif (($ARGV[$i] =~ /-gs/) || ($ARGV[$i] =~ /-gc/) || ($ARGV[$i] =~ /-mt/) || ($ARGV[$i] =~ /-ms/) || ($ARGV[$i] =~ /-bw/)){
+ $orderfl = $orderfl." ".$ARGV[$i]; # pass the flag and its single value through to order
+ $orderfl = $orderfl." ".$ARGV[++$i];
+ }
+ elsif ($ARGV[$i] =~ /-s1/) {
+ $orderfl = $orderfl." -s1 $ARGV[++$i]"; # -s1 takes two values
+ $orderfl = $orderfl." ".$ARGV[++$i];
+ }
+ elsif ($ARGV[$i] =~ /-maskedonly/) {
+ $arglist = $arglist." -maskedonly";
+ }
+ elsif ($ARGV[$i] =~ /-translate/) {
+ $arglist = $arglist." -translate";
+ $draftparams = $draftparams." -translate";
+ }
+ elsif ($ARGV[$i] =~ /-fastreject/) {
+ $arglist = $arglist." -fastreject";
+ $dofastreject = 1;
+ $doxmfa = 1;
+ $okformat = 1;
+ }
+ elsif ($ARGV[$i] =~ /-draftreject/) {
+ $draftparams = $draftparams." -fastreject";
+ }
+ elsif ($ARGV[$i] =~ /-gap/) {
+ $arglist = $arglist." -gap ".$ARGV[++$i]; # -gap takes two values
+ $arglist = $arglist." ".$ARGV[++$i];
+ }
+ elsif ($ARGV[$i] =~ /-recurse/) {
+ $arglist = $arglist." -recurse \"".$ARGV[++$i]."\"";
+ }
+ elsif ($ARGV[$i] =~ /-chaos/) {
+ $arglist = $arglist." -chaos \"".$ARGV[++$i]."\"";
+ }
+ elsif ($ARGV[$i] =~ /-usebounds/) {
+ $contigflag = 1;
+ }
+ elsif ($ARGV[$i] =~ /-rc/) { # reverse-complement the second sequence (and its .masked twin, if any)
+ `$lagandir/utils/rc < $ARGV[1] > $ARGV[1].rc`;
+ if ($?) { exit(1); }
+ $secondName = "$ARGV[1].rc";
+ if (-e "$ARGV[1].masked") {
+ `$lagandir/utils/rc < $ARGV[1].masked > $ARGV[1].rc.masked`;
+ if ($?) { exit(1);}
+ }
+ $rcFlag = 1;
+ $direction = "-";
+ }
+ elsif ($ARGV[$i] =~ /-draft$/){ # end-anchored: an unanchored /-draft/ also matched -draftskipfr and hid its branch
+ $rundraft = 1;
+ }
+ elsif ($ARGV[$i] =~ /-cons/){
+ $draftparams = $draftparams." -cons $ARGV[++$i]"; # ++$i consumes the value; "$++i" never advanced $i
+ }
+ elsif ($ARGV[$i] =~ /-draftskipfr/){
+ $draftparams = $draftparams." -skipfr $ARGV[++$i]"; # same index fix as for -cons
+ }
+ elsif ($ARGV[$i] =~ /-lazy/){
+ $draftparams = $draftparams." -cons $ARGV[$++i]"; # NOTE(review): looks like a copy of the -cons branch; usage lists -lazy as a bare flag, so left unchanged - confirm intent
+ }
+
+ else {
+ print "Invalid option for lagan: $ARGV[$i]";
+ exit(1);
+ }
+}
+
+$arglist = $arglist." -ext "; # -ext is always passed to rechaos.pl
+
+if ($rundraft){ # -draft: run draft.pl first and align against its merged_seq.fa
+ `$lagandir/draft.pl $firstName $secondName $draftparams`;
+ if ($?) { exit(1);}
+ $secondName = "merged_seq.fa";
+}
+
+# print STDERR "perl $lagandir/rechaos.pl $firstName $secondName $gfc $arglist > $$.anchs.final\n";
+`perl $lagandir/rechaos.pl $firstName $secondName $gfc $arglist > $$.anchs.final`;
+
+$ex_val = $? >> 8; # exit code of rechaos.pl
+if ($ex_val == 3) { exit(0); } # exit code 3 ends this script with status 0
+
+if ($ex_val) { exit(1); }
+if ($contigflag){ # -usebounds: let utils/getbounds derive the bounds handed to order
+ @bounds = `$lagandir/utils/getbounds $$.anchs.final $firstName $secondName`;
+ if ($?) { exit(1); }
+ chomp $bounds[0];
+ print STDERR ("Aligning with bounds: $bounds[0]\n");
+ print `$lagandir/order $firstName $secondName $bounds[0] $orderfl -anc $$.anchs.final`;
+ if ($?) { exit(1); }
+}
+else {
+ if ($dofastreject){
+ if (!$filename) {
+ print STDERR "-fastreject requires -out filename!\n";
+ exit(1);
+ }
+ open(SFILE, "$$.anchs.final"); # result of open is not checked
+ @anchors = <SFILE>;
+ close(SFILE);
+
+ $anchors[0] =~ /\((\d+) (\d+)\)=\((\d+) (\d+)\) (.*)/; # first anchor line supplies the end bounds
+ $end1 = $1 - 1;
+ $end2 = $3 - 1;
+ $anchors[@anchors - 1] =~ /\((\d+) (\d+)\)=\((\d+) (\d+)\) (.*)/; # last anchor line supplies the start bounds
+ $start1 = $2 + 1;
+ $start2 = $4 + 1;
+ $bounds = "-s1 $start1 $end1 -s2 $start2 $end2 ";
+
+ @anchors = 0; # leaves a one-element list (0), not an empty array
+ $orderfl = $bounds.$orderfl." -xmfa";
+ }
+ if (!$okformat) {
+ $format = "-bin"; # no format requested: order writes binary, converted with bin2bl below
+ }
+
+ `$lagandir/order $firstName $secondName $format -out $$.align $orderfl -anc $$.anchs.final`;
+ if ($?) { exit(1); }
+
+ if (!$okformat) {
+ if ($filename) {
+ `$lagandir/utils/bin2bl $$.align > $filename`;
+ }
+ else {
+ print `$lagandir/utils/bin2bl $$.align`;
+ }
+ }
+ else { # explicit format: copy the alignment to the -out file or to STDOUT
+ if ($filename) {
+ `cat $$.align > $filename`;
+ }
+ else {
+ print `cat $$.align`;
+ }
+ }
+ if ($dofastreject){ # replace the output file with scorealign's -cropxmfa result
+ `$lagandir/utils/scorealign $filename $consrate -ibounds -cropxmfa > $$.temp`;
+ if ($?) { exit(1); }
+ `mv $$.temp $filename`;
+ }
+}
+
+$infofile += $okformat; # counter reaches 3 e.g. when -info, -out and an explicit format were all given
+if ($infofile == 3){
+ open (INFOFILE, ">$infofilename.info");
+ if ($binfile){ # binary output: scorealign below reads an .mfa conversion instead
+ `$lagandir/utils/bin2mf $infofilename > $infofilename.mfa`;
+ if ($?) { exit(1); }
+ $infofilename = $infofilename.".mfa";
+ }
+ @temp = `head $secondName`;
+ if ($?) { exit(1); }
+ chomp $temp[0]; $temp[0] = substr $temp[0], 1; # first line of the second sequence, minus its first character
+ print INFOFILE "$temp[0]\n";
+
+ $len = `$lagandir/utils/getlength $secondName`; chomp $len;
+ if ($?) { exit(2); }
+ $first = $last = $first2 = $last2 = -1; # bounds stay -1 unless the score test below passes
+
+ $score = `$lagandir/utils/scorealign $infofilename $consupperrate`; chomp $score;
+ if ($?) { exit(3); }
+ if ($score > 0){
+ $score = `$lagandir/utils/scorealign $infofilename $consrate`; chomp $score;
+ if ($?) { exit(4); }
+ @temp = `$lagandir/utils/scorealign $infofilename $consrate -bounds 0`;
+ if ($?) { exit(5); }
+ $temp[0] =~ /(.*) (.*)/;
+ $first = $1; $last = $2;
+
+ @temp = `$lagandir/utils/scorealign $infofilename $consrate -bounds 1`;
+ if ($?) { exit(6); }
+ $temp[0] =~ /(.*) (.*)/;
+ $first2 = $1; $last2 = $2;
+ }
+
+ print INFOFILE "1 $first $last 1 $len 0 0 $direction $score $first2 $last2\n";
+
+ close (INFOFILE);
+# `$lagandir/utils/rm $infofilename` if ($binfile);
+}
+
+`rm $secondName` if ($rcFlag); # flag set by -rc is $rcFlag; the former "$rcflag" was never set, so the file was never removed
+`rm $$.*`; # remove this run's pid-prefixed temporary files
+if ($?) { exit(1); }
+
+exit(0);
+
+
diff --git a/nucmatrix.txt b/nucmatrix.txt
new file mode 100644
index 0000000..93cf9b1
--- /dev/null
+++ b/nucmatrix.txt
@@ -0,0 +1,9 @@
+ A C G T . N
+A 91 -114 -31 -123 0 -43
+C -114 100 -125 -31 0 -43
+G -31 -125 100 -114 0 -43
+T -123 -31 -114 91 0 -43
+. 0 0 0 0 0 0
+N -43 -43 -43 -43 0 -43
+
+-400 -25
diff --git a/rechaos.pl b/rechaos.pl
new file mode 100755
index 0000000..9a6a062
--- /dev/null
+++ b/rechaos.pl
@@ -0,0 +1,375 @@
+#!/usr/bin/env perl
+
+$lagandir = $ENV{LAGAN_DIR}; # helper programs (chaos, anchors, utils/getlength) are run from here
+
+# Status
+# -- extension problems
+
+if (@ARGV < 2) { # two sequence files are mandatory
+ print ("usage:\n rechaos seqfile1 seqfile2 [-chaos \"chaos flags\"] [-recurse \"(wl1,nd1,co1),(wl2,nd2,co2),...\"] [-out \"filename\"] [-lazy] [-maskedonly] [-debug] [-translate] [-fastreject]\n");
+ exit(1);
+}
+
+#$recurfl = "(12,0,25,0)x,(13,1,30,0)x,(8,1,30,0)x,(7,1,30,0)x";
+$recurfl = "(12,0,25,0)x,(13,1,30,0)x,(4,0,4,3000)xt,(8,1,30,0)x,(7,1,30,0)x"; # default levels: (wordlen,degeneracy,cutoff,extcutoff) plus tail letters
+#$recurfl = "(12,0,10,200)x,(12,0,10,150)x,(3,0,10,150)xt,(8,0,10,150)x,(12,0,25,0),(13,1,30,0),(3,0,30,0)t,(8,1,30,0),(7,1,25,0)";
+$minbox = 10; # minimum gap area and side length for a gap to be searched again
+$minside = 5;
+$seq1 = $ARGV[0];
+$seq2 = $ARGV[1];
+$tofile = 0;
+$masker = 1;
+$lazycheck = 0;
+$fastreject = 0;
+$frminlevel = 0; # fast reject is applied only on levels $frminlevel..$frmaxlevel
+$frmaxlevel = 3;
+@frseq1 = (150000, 50000, 30000, 15000); # per-level gap-length limits used by fast reject (indexed by level)
+@frseq2 = (150000, 50000, 30000, 15000);
+#@frseq1 = (70000, 60000, 60000, 20000);
+#@frseq2 = (70000, 60000, 60000, 20000);
+$sentinelleft = 1.1; # scores that mark the artificial left/right boundary anchors
+$sentinelright = 1.2;
+$gfc = " ";
+$dounmasked = 1;
+$filename = "";
+$debug = 0;
+$anchparams = ""; # extra arguments for the anchors program
+$translate = 0;
+
+sub max { # larger of two numbers (the second one when they compare equal)
+ my ($x, $y) = @_;
+ return $y unless ($x > $y);
+ return $x;
+}
+
+sub min { # smaller of two numbers (the second one when they compare equal)
+ my ($x, $y) = @_;
+ return $y unless ($x < $y);
+ return $x;
+}
+
+$i = 2; # options start after the two sequence files
+while ($i < @ARGV) {
+ if ($ARGV[$i] =~ /-chaos/) { # plain literal: in the former /-\chaos/ the "\ch" meant Ctrl-H, so -chaos never matched
+ $chaosfl = $chaosfl." ".$ARGV[++$i];
+ }
+ elsif ($ARGV[$i] =~ /-ext/) {
+ $chaosfl = $chaosfl." -ext ";
+ }
+ elsif ($ARGV[$i] =~ /-recurse/) {
+ $recurfl = $ARGV[++$i];
+ }
+ elsif ($ARGV[$i] =~ /-lazy/) {
+ $lazycheck = 1;
+ }
+ elsif ($ARGV[$i] =~ /-nomask/) {
+ $masker = 0;
+ }
+ elsif ($ARGV[$i] =~ /-out/) {
+ $tofile = 1;
+ $filename = $ARGV[++$i];
+ }
+ elsif ($ARGV[$i] =~ /-maskedonly/) {
+ $dounmasked = 0;
+ }
+ elsif ($ARGV[$i] =~ /-fastreject/) {
+ $fastreject = 1;
+ }
+ elsif ($ARGV[$i] =~ /-debug/) {
+ $debug = 1;
+ }
+ elsif ($ARGV[$i] =~ /-translate/) {
+ $translate = 1;
+ }
+ elsif ($ARGV[$i] =~ /-gfc/) {
+ $gfc = " -gfc ";
+ }
+ elsif ($ARGV[$i] =~ /-gap/){
+ $anchparams = $anchparams." -gap ".$ARGV[++$i]; # -gap takes two values
+ $anchparams = $anchparams." ".$ARGV[++$i];
+ }
+ else {
+ die ("Unrecognized option $ARGV[$i]\n");
+ }
+ $i++;
+}
+
+if ($lazycheck) { # -lazy: do nothing when the -out file already exists
+ if (-f $filename) {
+ print STDERR "Output file already exists, lazy mode exit!\n";
+ exit (0);
+ }
+}
+
+$extracase1 = 0;
+$extracase2 = 0;
+if (-e "$seq1.masked") { $extra1 = $seq1; $seq1 = "$seq1.masked"; $extracase1 = 1; } # use the .masked file first, remember the original
+if (-e "$seq2.masked") { $extra2 = $seq2; $seq2 = "$seq2.masked"; $extracase2 = 1; }
+if (! $dounmasked){ $extracase1 = 0; $extracase2 = 0; } # -maskedonly: no later switch back to the unmasked files
+
+#open(SEQ1, "$seq1");
+#open(SEQ2, "$seq2");
+
+#$line1 = <SEQ1>;
+#while ($line1 = <SEQ1>) {
+# chomp $line1;
+# $seq1len += length($line1);
+#}
+#
+#$line2 = <SEQ2>;
+#while ($line2 = <SEQ2>) {
+# chomp $line2;
+# $seq2len += length($line2);
+#}
+
+$seq1len = `$lagandir/utils/getlength $seq1`; chomp $seq1len;
+$seq2len = `$lagandir/utils/getlength $seq2`; chomp $seq2len;
+
+$b1[0] = $b2[0] = 1; # initial search region: both sequences, position 1 to their length
+$e1[0] = $seq1len;
+$e2[0] = $seq2len;
+
+$cumanchs = 0;
+
+$clipleft1 = 0; # current clip bounds, written into the sentinel boundary anchors
+$clipleft2 = 0;
+$clipright1 = $seq1len + 1;
+$clipright2 = $seq2len + 1;
+$app_str = ""; # hits carried over and appended to the next chaos output
+
+$i = 0; # recursion level counter
+while (1) { # one pass per entry of $recurfl, plus one pass after switching to the unmasked files
+ $goodanchs = 0;
+ $totalanchs = 0;
+
+ $stillmore = ($recurfl =~ /\((\d+)\,(\d+)\,(\d+)\,(\d+)\)(\w*)(.*)/);
+ if (! $stillmore) { # levels exhausted: switch to the unmasked sequences once, otherwise stop
+ if ($extracase1 || $extracase2) {
+ if ($extracase1) { $seq1 = $extra1; $extracase1 = 0; }
+ if ($extracase2) { $seq2 = $extra2; $extracase2 = 0; }
+ }
+ else {
+ last;
+ }
+ }
+ else {
+ $wordlen = $1;
+ $degeneracy = $2;
+ $cutoff = $3;
+ $extcutoff = $4;
+ $tail = $5;
+
+ $extraparams = "";
+ $extraparams = "-t ".$extraparams if ((index ($tail, "t") != -1) && ($translate));
+ $extraparams = $extraparams." -rsc $extcutoff" if (index ($tail, "x") != -1);
+ }
+
+ $recurfl = $6; # remainder of the level list
+ next if ((index ($tail, "t") != -1) && (!$translate)); # "t" levels run only with -translate
+
+ print STDERR "Using $seq1 $seq2 ($wordlen, $degeneracy, $cutoff, $extcutoff) $tail\n";
+
+# PRINT OUT LIST OF REGIONS TO ALIGN
+
+ open (PFILE, ">$$.anchs.pairs");
+ for ($j = 0; $j < @b1; $j++) {
+ print PFILE "-s1 $b1[$j] $e1[$j] -s2 $b2[$j] $e2[$j]\n";
+ }
+ close (PFILE);
+
+# print STDERR "PAIRS hits\n";
+# print STDERR `cat $$.anchs.pairs`;
+# print STDERR "-----------------\n";
+# print STDERR `cat $$.anchs.pairs`;
+# print STDERR "-----------------\n";
+# print STDERR "$lagandir/chaos $seq1 $seq2 -wl $wordlen -nd $degeneracy -co $cutoff $extraparams $gfc $chaosfl -pairs $$.anchs.pairs > $$.anchtemp";
+
+# PERFORM THE ALIGNMENTS USING CHAOS
+
+ $saver = "$lagandir/chaos $seq1 $seq2 $extraparams -wl $wordlen -nd $degeneracy -co $cutoff $gfc $chaosfl -pairs $$.anchs.pairs > $$.anchtemp";
+ `$lagandir/chaos $seq1 $seq2 $extraparams -wl $wordlen -nd $degeneracy -co $cutoff $gfc $chaosfl -pairs $$.anchs.pairs > $$.anchtemp`;
+ if ($?) {
+ print STDERR "$saver\n"; # on failure, show the command line that was run
+ exit(1);
+ }
+
+# ADD IN BOUNDARIES
+
+ $stillmore = ($recurfl =~ /\((\d+)\,(\d+)\,(\d+)\,(\d+)\)(\w*)(.*)/);
+ if ($fastreject || $stillmore || $extracase1 || $extracase2){
+ $temp1 = $seq1len + 1;
+ $temp2 = $seq2len + 1;
+ $app_str = $app_str."seq1 0 $clipleft1; seq2 0 $clipleft2; score=$sentinelleft (+)\n";
+ $app_str = $app_str."seq1 $clipright1 $temp1; seq2 $clipright2 $temp2; score=$sentinelright (+)\n";
+ }
+
+# APPEND HITS FROM $app_str TO LOCAL ALIGNMENT LIST
+
+ open (OFILE, ">>$$.anchtemp");
+ print OFILE $app_str;
+ close (OFILE);
+
+# `wc $$.anchtemp` =~ /(\d+)/x;
+# $totalanchs = $totalanchs + $1;
+# print STDERR "CHAOS hits\n";
+# print STDERR `cat $$.anchtemp`;
+
+# FIND MAXIMAL-SCORING CONSISTENT CHAIN
+
+ `$lagandir/anchors $$.anchtemp $gfc $anchparams | sort -n -k 2 > $$.anchs.sorted`; # -k 2 replaces the obsolete "+1" key form, which newer sort(1) reads as a file name
+ if ($?) { exit(1); }
+
+# IF WE'RE DONE, THEN QUIT!
+
+ $stillmore = ($recurfl =~ /\((\d+)\,(\d+)\,(\d+)\,(\d+)\)(\w*)(.*)/);
+ if (!$stillmore && !$extracase1 && !$extracase2) {
+ last;
+ }
+
+# `wc $$.anchs` =~ /(\d+)/x;
+# print STDERR "ANCHS hits\n";
+# print STDERR `cat $$.anchs.sorted`;
+# $goodanchs = $goodanchs + $1;
+
+# if ($?) { exit(1); }
+
+# READ SORTED ANCHORS TO @anchors
+
+ open(SFILE, "$$.anchs.sorted");
+ @anchors = <SFILE>;
+ close(SFILE);
+
+ @b1new = 0; # each of these becomes the one-element list (0), not an empty array
+ @b2new = 0;
+ @e1new = 0;
+ @e2new = 0;
+ @scores = 0;
+
+ $app_str = "";
+
+ # FOR EACH UNALIGNED REGION
+
+ $area = 0;
+ $maxarea = 0;
+ $k = 0; # number of regions queued for the next level
+
+ for ($m = 0; $m < @anchors; $m++){
+
+ # SAVE OLD ANCHORS (SKIP FIRST AND LAST FAKE ANCHORS)
+
+ if ($m >= 1 && $m < @anchors - 1){
+ $anchors[$m] =~ /\((\d+) (\d+)\)=\((\d+) (\d+)\) (.*)/;
+ $score = $5; chomp $score;
+ $app_str = $app_str."seq1 $1 $2; seq2 $3 $4; score=$score (+)\n";
+ }
+
+ if ($m == 0){ next; }
+
+ # DETERMINE REGION BOUNDARIES
+
+ $anchors[$m-1] =~ /\((\d+) (\d+)\)=\((\d+) (\d+)\) (.*)/;
+ $gap1begin = $2 + 1;
+ $gap2begin = $4 + 1;
+ $prevanchorscore = $5; chomp $prevanchorscore;
+
+ $anchors[$m] =~ /\((\d+) (\d+)\)=\((\d+) (\d+)\) (.*)/;
+ $gap1end = $1 - 1;
+ $gap2end = $3 - 1;
+ $nextanchorscore = $5; chomp $nextanchorscore;
+
+ # CHECK IF RECURSION NEEDED
+
+ $boxarea = ($gap1end - $gap1begin + 1) * ($gap2end - $gap2begin + 1);
+ $area = $area + $boxarea;
+ $maxarea = $boxarea if ($boxarea > $maxarea);
+
+ if ($boxarea >= $minbox && ($gap1end - $gap1begin + 1) > $minside &&
+ ($gap2end - $gap2begin + 1) > $minside ){
+
+ # FAST REJECT
+
+ if ($fastreject && ($i >= $frminlevel) && ($i <= $frmaxlevel)){
+
+ # SKIP MARKED ENDS OF ALIGNMENT
+
+ if ($nextanchorscore == $sentinelleft ||
+ $prevanchorscore == $sentinelright){
+ next;
+ }
+
+ # TRIM NEW ENDS OF ALIGNMENT
+
+ if ($prevanchorscore == $sentinelleft){
+# if ($boxarea > $frseq1[$i] * $frseq2[$i]){
+ if (($gap1end - $gap1begin > $frseq1[$i]) ||
+ ($gap2end - $gap2begin > $frseq2[$i])){
+ if (@anchors == 2){ exit(3); } # only the two sentinel anchors are present: exit with code 3
+ $clipleft1 = max ($gap1begin-1, $gap1end - $frseq1[$i]);
+ $clipleft2 = max ($gap2begin-1, $gap2end - $frseq2[$i]);
+ $gap1begin = $clipleft1 + 1;
+ $gap2begin = $clipleft2 + 1;
+ }
+ }
+ elsif ($nextanchorscore == $sentinelright){
+# if ($boxarea > $frseq1[$i] * $frseq2[$i]){
+ if (($gap1end - $gap1begin > $frseq1[$i]) ||
+ ($gap2end - $gap2begin > $frseq2[$i])){
+ if (@anchors == 2){ exit(3); }
+ $clipright1 = min ($gap1end+1, $gap1begin + $frseq1[$i]);
+ $clipright2 = min ($gap2end+1, $gap2begin + $frseq2[$i]);
+ $gap1end = $clipright1 - 1;
+ $gap2end = $clipright2 - 1;
+ }
+ }
+ }
+
+ # ADD REGION
+
+ if ($gap1begin < $gap1end && $gap2begin < $gap2end){
+ $b1new[$k] = $gap1begin;
+ $b2new[$k] = $gap2begin;
+ $e1new[$k] = $gap1end;
+ $e2new[$k] = $gap2end;
+ $k++;
+ }
+ }
+ }
+
+ @b1 = @b1new; # regions found on this level become the search list of the next one
+ @b2 = @b2new;
+ @e1 = @e1new;
+ @e2 = @e2new;
+ if ($debug) {
+ print STDERR "Level $i Summary:\n";
+ print STDERR " Using $seq1 $seq2 ($wordlen, $degeneracy, $cutoff)\n";
+ if ($totalanchs == 0) {
+ $percentage = 0;
+ }
+ else {
+ $percentage = $goodanchs / $totalanchs * 100.0;
+ }
+ print STDERR " $goodanchs good out of $totalanchs total anchors ($percentage%)\n";
+ $area = $area / 1000000;
+ $maxarea = $maxarea / 1000000;
+ print STDERR " Total area left = $area (max = $maxarea)\n";
+ }
+ $cumanchs = $cumanchs + $goodanchs;
+ $i++;
+}
+
+$res = `sort -nr -k 2 $$.anchs.sorted`; # reverse numeric order from field 2; -k 2 replaces the obsolete "+1" key form
+if ($?) { exit(1); }
+
+`rm $$.*`; # remove this run's pid-prefixed temporary files
+
+if($tofile) { # -out: write the anchors to $filename instead of STDOUT
+ open(OUTFILE, ">$filename");
+ print OUTFILE "$res";
+ close OUTFILE;
+}
+else {
+ print "$res";
+}
+
+print STDERR "$cumanchs cumulative anchors\n"
+
diff --git a/sample.fasta b/sample.fasta
new file mode 100644
index 0000000..5329f81
--- /dev/null
+++ b/sample.fasta
@@ -0,0 +1,25 @@
+>sample1
+GGCATGTCCAGAAAATCCAAGTGCCTCTTCCTCTTGATCTTCTCCAACGATGTCCAGA
+AAATCCAAGTGCCTCATTCCTCTTGATCTTCTCCAGGCATGTCCAGAAAATCCAAGTG
+CCTCTTCCTCTCTGATCTTCTCCTCGGTTGGTCCAGAAAATCCAAGTGCCTCTTCCTC
+TTGATCTTCTCCAGAAATGTCCAGAAAATCCAAGTAGCCTCTTCCTCTTGATCGGCTC
+CAGAAATGTCCAGAAAAATCCAAGTGCCTCTTCCTCTTGATCGGCTCCATAAATGTCC
+AGAAAATCCAACGTGCCTCTTCCTCTTGATCGGCTCCAGAAATGTCCAGAAATATCCA
+AGTGCCTCTTCCTCTTGATCGGCTCCTTA
+>sample2
+CGATCCCAAATCCAAGTGCCTCAGAGTCTACTTGATCTTCAATTCAGATCCCAAATCC
+AAGTGCCTCAGAGTCTACTTGAATCTTCTATCGGGTCCCAAATCCAAGTGCCTCAGAG
+TCTACTTGATCTTCTCTCTCGATCCCATATCCAAGTGCCTCCTAGAGTCTACTTGATC
+TTCTCGATAACCAAAATCCAAGTGCCTCAGAGTCTACTTCACTCTTCTCGACTAACCC
+AAATCCAAGTGCCTCAGATGAGTCTACTTCCTCTTCTCATAACTCAAATCCAAGTGCC
+TCAGAGTCTAACTTCCTCTTCTCGAATAACCCAAATCCAAGTGCCTCAGAGTGTCTAC
+TTCCTCTTCTCG
+>sample3
+TACCCAAATCCAAGTGCCTCAGCGTCTAATAAAACAAGTCTTGATCTTCAACTCCTCC
+CAAATCCAAGTGCAACCTCAGCCGCTAATAAAAAGTCTTGATCTTCTCGCGTCCGGCA
+AATCCAAGTGCCTCAGCGCTAATAAAAAGTCTTGATCTTCTCGGGAGTCCCAAATCCA
+AGTGCCTCAGCGCTAATAAAAAGTCTTGATCTTCTCGGAGGAACAACAAATCCAAGTG
+CCTCAAGCGCTAATAAAAAGTCCCGATCTTCTCGTGACAATACAAATCCAAGTGCCTC
+AGCGCTAATAAAAAGTCCCGATCTTCTCCCGTGTAAACAAATCCAAGTGCCTCAGCGC
+TAATAAAAAGTCCCGATCTTCTCTGGTAACACAACAAATCCAAGTCACGCCTCAGATA
+CGCTAATAAAAAGTCCCGATCTTCTC
diff --git a/sample.params b/sample.params
new file mode 100644
index 0000000..2f09885
--- /dev/null
+++ b/sample.params
@@ -0,0 +1,59 @@
+# This is a comment
+# The first thing in the file must be the Multi-FASTA file
+
+sample.fasta
+
+# The rest of the parameters can be in any order.
+# Compound parameters must be on one line.
+
+
+# Base sequence MUST appear before sequence pairs
+
+# Optional base sequence (default is the first sequence in file).
+sample1
+
+# Optional pairs
+sample1 sample2
+sample1 sample3
+sample2 sample3
+
+
+# MLAGAN parameters
+# preceded by "-"
+
+# Phylogenetic tree specification (required).
+#-tree "(...)"
+
+-tree "((sample1 sample2) sample3)"
+
+# Lazy evaluation...
+#-lazy
+
+# Nested Iterative Refinement
+#-nested
+
+# Turning on post-alignment Iterative Refinement
+#-postir
+
+# VISTA parameters (defaults)
+# preceded by "--"
+
+# ALIGN REGION MIN (75), REGION MAX (100), MIN (50)
+#--regmin 75
+#--regmax 100
+#--min 50
+
+# BASES (10000)
+#--bases 10000
+
+# TICK DISTANCE (2000)
+#--tickdist 1000
+
+# RESOLUTION (25)
+#--resolution 25
+
+# WINDOW (70)
+#--window 70
+
+# NUM WINDOWS (4)
+#--numwindows 4
diff --git a/slagan-mfa.pl b/slagan-mfa.pl
new file mode 100755
index 0000000..ba15fa3
--- /dev/null
+++ b/slagan-mfa.pl
@@ -0,0 +1,35 @@
+#!/usr/bin/perl
+# Wrapper: runs slagan.pl on two sequence files and feeds its captured output to xmfa2mfa.pl.
+use strict;
+# Reduce $0 to the part after the last "/" so messages show only the script name.
+$0 = rindex($0, "/") > -1 ? substr($0, rindex($0, "/")+1) : $0;
+
+die("$0: LAGAN_DIR not defined. Stopped") unless defined $ENV{"LAGAN_DIR"};
+my $LAGAN_DIR = $ENV{LAGAN_DIR};
+
+my ($outfile, $base);
+# NOTE(review): each pattern is tested against ONE element of @ARGV, so -out/-base are only recognised when flag and value arrive inside the same argument; the matched text is then cut out of that element.
+foreach my $arg (@ARGV) {
+ if ($arg =~ /-out\s+([^\s]+)/) {
+ $outfile = $1;
+ $arg =~ s/-out\s+([^\s]+)//;
+ } elsif ($arg =~ /-base[\s\=]+([^\s]+)/) {
+ $base = $1;
+ $arg =~ s/-base[\s\=]+([^\s]+)//;
+ die("$0: Invalid base parameter (expected 1 or 2). Stopped") unless $base eq "1" or $base eq "2";
+ }
+}
+
+if (@ARGV < 2) {
+ print ("Usage:\n$0 seqfile1 seqfile2 [-glocal \"glocal flags\"] [-chaos \"chaos flags\"] [-order \"order flags\"] [-recurse \"(wl1,nd1,co1),(wl2,nd2,co2),...\"] [-mfa] [-out \"filename\"] [-maskedonly] [-debug] [-translate] [-fastreject]\n");
+ exit(1);
+}
+# The remaining arguments are re-joined with single spaces and passed to slagan.pl through the shell.
+my $args = join(" ", @ARGV);
+system($LAGAN_DIR."/slagan.pl $args > slagan.pl.out");
+die("$0: slagan.pl returned error $?. Stopped") if $?;
+# xmfa2mfa.pl gets "2" only when -base 2 was parsed above, otherwise "1"; its output goes to $outfile if one was parsed, else to stdout.
+system($LAGAN_DIR."/xmfa2mfa.pl ".($base eq "2" ? "2" : "1")." < slagan.pl.out ".($outfile ? "> $outfile" : ""));
+die("$0: xmfa2mfa.pl returned error $?. Stopped") if $?;
+# Remove the intermediate file (not reached when either step above dies).
+unlink "slagan.pl.out";
diff --git a/slagan.pl b/slagan.pl
new file mode 100755
index 0000000..66dfb4b
--- /dev/null
+++ b/slagan.pl
@@ -0,0 +1,153 @@
+#!/usr/bin/perl -w
+
+use strict;
+
+my $lagandir = $ENV{LAGAN_DIR};
+
+if (@ARGV < 2) {
+ print ("Usage:\n slagan.pl seqfile1 seqfile2 [-glocal \"glocal flags\"] [-chaos \"chaos flags\"] [-order \"order flags\"] [-recurse \"(wl1,nd1,co1),(wl2,nd2,co2),...\"] [-mfa] [-out \"filename\"] [-maskedonly] [-debug] [-translate] [-fastreject]\n");
+ exit(1);
+}
+
+my ($seq1, $firstName) = ($ARGV[0], $ARGV[0]);
+die("$0: File not found: $seq1. Stopped") unless -f $seq1;
+my ($seq2, $secondName) = ($ARGV[1], $ARGV[1]);
+die("$0: File not found: $seq2. Stopped") unless -f $seq2;
+
+my ($extra1, $extra2) =(0, 0);
+if (-e "$seq1.masked") { $seq1 = "$seq1.masked"; $extra1 = 1;}
+if (-e "$seq2.masked") { $seq2 = "$seq2.masked"; $extra2 = 1;}
+
+
+my ($outName1, $outName2) = ($ARGV[0], $ARGV[1]);
+$outName1 =~ s/^.*\///;
+$outName1 =~ s/\..*//;
+$outName2 =~ s/^.*\///;
+$outName2 =~ s/\..*//;
+
+
+my $max_ext = 25000;
+my $ext_mul = 1;
+my $arglist = "";
+my $glocal_fl = " -gapopen 0,1000,2000,2000 -gapcont 0.2,0.06,0.06,0.06 -dist 0,1.0,2.5,2.5";
+my $chaos_fl = " -wl 11 -nd 1 -co 10 -ext -rsc 2250 -b";
+my $lagan_fl = "";
+my $supermap_fl = "-glocal_out=${outName1}_$outName2.out.glocal";
+my $outfile = 0;
+my $fastrej = 0;
+my $lazy = 0;
+
+for (my $i = 2; $i < @ARGV; $i++) {
+ if ($ARGV[$i] =~ /-glocal_fl/) {
+ $glocal_fl = $ARGV[++$i];
+ } elsif ($ARGV[$i] =~ /-chaos_fl/) {
+ $chaos_fl = $ARGV[++$i];
+ } elsif ($ARGV[$i] =~ /-lagan_fl/) {
+ $lagan_fl = $ARGV[++$i];
+ } elsif ($ARGV[$i] =~ /-max_ext/) {
+ $max_ext = $ARGV[++$i];
+ } elsif ($ARGV[$i] =~ /-ext_mul/) {
+ $ext_mul = $ARGV[++$i];
+ } elsif ($ARGV[$i] =~ /-out/) {
+ $outfile = $ARGV[++$i];
+ if (-e "$outfile") { system("rm $outfile") and exit(1); }
+ } elsif ($ARGV[$i] =~ /-order/) {
+ $arglist = $arglist." -order $ARGV[++$i]";
+ } elsif (($ARGV[$i] =~ /-gs/) || ($ARGV[$i] =~ /-gc/) || ($ARGV[$i] =~ /-mt/) || ($ARGV[$i] =~ /-ms/) || ($ARGV[$i] =~ /-bw/)) {
+ $arglist = $arglist." ".$ARGV[$i];
+ $arglist = $arglist." ".$ARGV[++$i];
+ } elsif ($ARGV[$i] =~ /-ext/) {
+ $arglist = $arglist." -ext $ARGV[++$i]";
+ } elsif ($ARGV[$i] =~ /-maskedonly/) {
+ $arglist = $arglist." -maskedonly";
+ } elsif ($ARGV[$i] =~ /-lazy/) {
+ $lazy = 1;
+ } elsif ($ARGV[$i] =~ /-translate/) {
+ $arglist = $arglist." -translate";
+ } elsif ($ARGV[$i] =~ /-fastreject/) {
+ $fastrej = 1;
+# $arglist = $arglist." -fastreject";
+ } elsif ($ARGV[$i] =~ /-recurse/) {
+ $arglist = $arglist." -recurse \"".$ARGV[++$i]."\"";
+ } elsif ($ARGV[$i] =~ /-chaos/) {
+ $chaos_fl = $chaos_fl." ".$ARGV[++$i];
+ } else {
+ die("$0: Invalid option for rlagan: $ARGV[$i]");
+ }
+}
+
+my $seq1len = `$lagandir/utils/getlength $firstName`;
+my $seq2len = `$lagandir/utils/getlength $secondName`;
+chomp $seq1len;
+chomp $seq2len;
+
+if ($lazy && -e "${outName1}_$outName2.chaos") {
+ `cp ${outName1}_$outName2.chaos chaos.$$`;
+}
+else {
+ `$lagandir/chaos $seq1 $seq2 $chaos_fl > chaos.$$`;
+ if ($?) { exit(1); }
+ `cat chaos.$$ > ${outName1}_$outName2.chaos`;
+}
+open(FH, "> seq1len"); print FH $firstName." ".$seq1len."\n"; close FH;
+open(FH, "> seq2len"); print FH $secondName." ".$seq2len."\n"; close FH;
+my $supermap_outfile = "${outName1}_$outName2.out.smap";
+my $supermap_inv = "$lagandir/supermap.pl -sizes1=seq1len -sizes2=seq2len $supermap_fl chaos.$$ -no_clust_run -f -out=$supermap_outfile 1>&2";
+
+#print $supermap_inv."\n";
+system($supermap_inv);
+
+open(FH, "< $supermap_outfile") or die("$0: Cannot read $supermap_outfile: $!. Stopped");    # a missing file used to be reported as "no regions"
+my @regs = <FH>;    # one supermap region record per line
+close FH;
+die("$0: Supermap generated no regions. Stopped") unless scalar @regs;
+unlink "seq1len"; unlink "seq2len"; # unlink $supermap_outfile;
+
+
+for (my $k = 0; $k < @regs; $k++) {
+    # Align one supermap region per record. A line that does not parse is skipped: after a failed match $2..$8 keep the values of an earlier successful match.
+    unless ($regs[$k] =~ /^([^\s]+)\s([\d]+)\s([\d]+)\s\s\s([^\s]+)\s([\d]+)\s([\d]+)\s(\+|\-)\s\((DM|M1|M2),\s([\d]+)\saligns\)$/o) { print STDERR "$0: skipping unparsable supermap line: $regs[$k]"; next; }
+    my ($startreg1, $endreg1, $startreg2, $endreg2, $strand, $type) = ($2, $3, $5, $6, $7, $8);
+    my $rcf = "";    # extra fa2xfa flag for the second sequence: "-rc" on the minus strand
+
+    if ($strand eq "+") {
+        $rcf = ""
+    } else {
+        $rcf = "-rc";
+    }
+
+#print "$lagandir/utils/fa2xfa $firstName $startreg1 $endreg1 1 > seq1$k.$$\n";
+    `$lagandir/utils/fa2xfa $firstName $startreg1 $endreg1 1 > seq1$k.$$\n`;
+#print "$lagandir/utils/fa2xfa $secondName $startreg2 $endreg2 2 $rcf > seq2$k.$$\n";
+    `$lagandir/utils/fa2xfa $secondName $startreg2 $endreg2 2 $rcf > seq2$k.$$\n`;
+    if ($extra1) { `$lagandir/utils/fa2xfa $seq1 $startreg1 $endreg1 1 > seq1$k.$$.masked\n`; }
+    if ($extra2) { `$lagandir/utils/fa2xfa $seq2 $startreg2 $endreg2 2 $rcf > seq2$k.$$.masked\n`; }
+
+#print "$lagandir/lagan.pl seq1$k.$$ seq2$k.$$ $arglist $lagan_fl -mfa -out lagan.$k.$$\n";
+    `$lagandir/lagan.pl seq1$k.$$ seq2$k.$$ $arglist $lagan_fl -mfa -out lagan.$k.$$\n`;
+
+    my $suff = "";    # shell redirection appended to the output commands when -out was given
+    if ($outfile) { $suff = " >> $outfile"; }
+    if (-e "lagan.$k.$$") {
+        if ($fastrej) {
+#print "$lagandir/utils/scorealign lagan.$k.$$ 45 -cropxmfa -ibounds $suff\n";
+            print `$lagandir/utils/scorealign lagan.$k.$$ 45 -cropxmfa -ibounds $suff`;
+        } else {
+#print "$lagandir/utils/scorealign lagan.$k.$$ 45 -ibounds\n";
+            my $sc = `$lagandir/utils/scorealign lagan.$k.$$ 45 -ibounds`;
+            chomp($sc);
+            if ($sc) {
+                print `cat lagan.$k.$$ $suff`;
+                print `echo \"=$sc $type\n\" $suff`;
+            }
+        }
+    }
+}
+
+####`cat out.$$ > ${outName1}_$outName2.mon`;
+unlink(glob("*.$$"));
+if ($extra1 || $extra2) { `rm *.$$.masked`; }
+exit(0);
+
+
+# out: .chaos .mon->.smap .xmfa
diff --git a/src/.gdb_history b/src/.gdb_history
new file mode 100644
index 0000000..58afc78
--- /dev/null
+++ b/src/.gdb_history
@@ -0,0 +1,2 @@
+run
+quit
diff --git a/src/Makefile b/src/Makefile
new file mode 100644
index 0000000..7f6b6fd
--- /dev/null
+++ b/src/Makefile
@@ -0,0 +1,54 @@
+CC = gcc $(CFLAGS)
+CPP = g++ $(CFLAGS)
+CFLAGS = -O3 # -Wall -W
+TRGT_DIR = ..
+
+all: ../anchors ../chaos ../order ../mlagan ../prolagan ../utils/bin2mf ../utils/bin2bl ../utils/cextract ../utils/cstat ../utils/contigorder ../utils/getbounds ../utils/getlength ../utils/getoverlap ../utils/rc ../utils/seqmerge ../utils/scorealign ../utils/scorecontigs ../utils/getcontigpos ../utils/fa2xfa ../utils/Glue ../utils/dotplot ../utils/overlay
+ (cd glocal; $(MAKE))
+clean:
+ rm -f *.o *~ utils/*~ mlagan.purify core
+ (cd glocal; $(MAKE) clean)
+../anchors: anchors.c skiplist.c
+ $(CC) -o $(TRGT_DIR)/anchors anchors.c skiplist.c
+../chaos: fchaos.c thrtrie.c skiplist.c global.c translate.c mempage.c filebuffer.c
+ $(CC) -o $(TRGT_DIR)/chaos fchaos.c thrtrie.c skiplist.c global.c translate.c filebuffer.c -lm -DCHAOS__FLAG
+../order: order.c diagmatrix.c filebuffer.c
+ $(CC) -o $(TRGT_DIR)/order order.c diagmatrix.c filebuffer.c
+../mlagan: mlagan.c diagmatrix.c multial.c skiplist.c filebuffer.c
+ $(CC) -o $(TRGT_DIR)/mlagan mlagan.c multial.c diagmatrix.c skiplist.c filebuffer.c -lm -DMULTIAL__FLAG
+../prolagan: prolagan.c diagmatrix.c multial.c skiplist.c filebuffer.c
+ $(CC) -o $(TRGT_DIR)/prolagan prolagan.c multial.c diagmatrix.c skiplist.c filebuffer.c -lm -DMULTIAL__FLAG
+../utils/bin2mf: utils/bin2mf.c
+ $(CC) -o $(TRGT_DIR)/utils/bin2mf utils/bin2mf.c
+../utils/bin2bl: utils/bin2bl.c
+ $(CC) -o $(TRGT_DIR)/utils/bin2bl utils/bin2bl.c
+../utils/cextract: utils/cextract.c
+ $(CC) -o $(TRGT_DIR)/utils/cextract utils/cextract.c
+../utils/cstat: utils/cstat.c
+ $(CC) -o $(TRGT_DIR)/utils/cstat utils/cstat.c
+../utils/contigorder: utils/contigorder.c
+ $(CC) -o $(TRGT_DIR)/utils/contigorder utils/contigorder.c
+../utils/getbounds: utils/getbounds.c
+ $(CC) -o $(TRGT_DIR)/utils/getbounds utils/getbounds.c
+../utils/getcontigpos: utils/getcontigpos.c
+ $(CC) -o $(TRGT_DIR)/utils/getcontigpos utils/getcontigpos.c
+../utils/getlength: utils/getlength.c
+ $(CC) -o $(TRGT_DIR)/utils/getlength utils/getlength.c
+../utils/getoverlap: utils/getoverlap.c
+ $(CC) -o $(TRGT_DIR)/utils/getoverlap utils/getoverlap.c
+../utils/rc: utils/rc.c
+ $(CC) -o $(TRGT_DIR)/utils/rc utils/rc.c
+../utils/seqmerge: utils/seqmerge.c
+ $(CC) -o $(TRGT_DIR)/utils/seqmerge utils/seqmerge.c
+../utils/scorealign: utils/scorealign.c
+ $(CC) -o $(TRGT_DIR)/utils/scorealign utils/scorealign.c -lm
+../utils/scorecontigs: utils/scorecontigs.c
+ $(CC) -o $(TRGT_DIR)/utils/scorecontigs utils/scorecontigs.c -lm
+../utils/fa2xfa: utils/fa2xfa.c
+ $(CC) -o $(TRGT_DIR)/utils/fa2xfa utils/fa2xfa.c
+../utils/overlay: utils/overlay.c
+ $(CC) -o $(TRGT_DIR)/utils/overlay utils/overlay.c
+../utils/Glue: utils/Glue.cpp
+ $(CPP) -o $(TRGT_DIR)/utils/Glue utils/Glue.cpp
+../utils/dotplot: utils/dotplot.cpp
+ $(CPP) -o $(TRGT_DIR)/utils/dotplot utils/dotplot.cpp
diff --git a/src/Utils.pm b/src/Utils.pm
new file mode 100644
index 0000000..e4e7214
--- /dev/null
+++ b/src/Utils.pm
@@ -0,0 +1,553 @@
+#!/usr/bin/env perl
+
+package Utils;
+require 5.000;
+
+use strict;
+use Exporter;
+use Cwd;
+use IO::File;
+use POSIX qw(setsid);
+use Sys::Syslog qw(:DEFAULT setlogsock);
+
+sub Trim( @ );
+sub Lock_File( $ ; $ $ $ );
+sub Unlock_File( $ );
+sub Write_Log( $ $ ; $ $ );
+sub Parse_Filename( $ );
+sub Get_Abs_Path( $ );
+sub Expand_Path( $ );
+sub Get_Random_Key( ; $ );
+sub Hex2Ascii( $ );
+sub Ascii2Hex( $ );
+sub Get_Config_Record( $ $ );
+sub Round( $ );
+sub Set_Log( $ $ );
+sub Log( $ $ );
+sub Min( $ $ );
+sub Max( $ $ );
+sub Reg_Diff( $ $ ; $ $ $ $ $ );
+sub Reg_Rem_Overlap( $ ; $ $ $ );
+sub Reg_Sort( $ ; $ $ $ );
+sub Reg_Intersect( $ $ ; $ $ $ $ $ );
+sub Reg_Merge( $ ; $ $ $ );
+
+use vars qw(@ISA @EXPORT $VERSION $JOB $Error $Syslog $Facility $Msg_Prefix);
+
+@ISA = qw(Exporter);    # import() is inherited from Exporter
+@EXPORT = qw(Trim Lock_File Unlock_File Write_Log Parse_Filename
+ Get_Abs_Path Expand_Path Hex2Ascii Ascii2Hex Get_Config_Record
+ Get_Random_Key Round Set_Log Log Min Max Reg_Diff Reg_Rem_Overlap
+ Reg_Sort Reg_Intersect Reg_Merge redirect_err2log openlogs safe_glob
+ daemon wr_log wr_err start_watcher confirm $JOB);
+
+my $Id = '$Id: Utils.pm,v 1.21 2005/01/07 23:08:59 poliakov Exp $';
+($VERSION) = ($Id =~ /,v\s+(\d+\S+)/o);
+$JOB = '^(\S+)\@(\S+?)_(\d{4})(?:_(.+)|)$';
+
+$Error = 0;
+$Syslog = 0;
+$Facility = "user";
+$Msg_Prefix = undef;
+
+my $E_FORK = "cannot fork";
+my @LOG_FILE = ();
+my %Locks = ();
+
+# Strip leading and trailing whitespace from each argument in place (@_ aliases the caller's variables).
+sub Trim( @ ) {
+    foreach my $arg (@_) {
+        $arg =~ s/^\s+//; $arg =~ s/\s+$//;
+    }
+}
+
+# Create "<file>.lock" atomically (O_CREAT|O_EXCL) and record it in %Locks; returns 1 on success, 0 with $Error set on failure.
+# $retry (default 1) keeps trying while the lock file exists; $timeout (default 1200 s, <= 0 disables the limit) bounds the wait;
+# a lock file older than $max_mtime seconds (default: half of a positive $timeout, else 0 = never) is removed as stale.
+sub Lock_File( $ ; $ $ $ ) {
+    my ($file, $retry, $timeout, $max_mtime) = @_;
+    my ($lock_fh, $start_time, $mtime);
+    if (!$file || ($file =~ /\/$/o)) {
+        $Error = "Invalid filename";
+        return 0;
+    }
+    $file = Get_Abs_Path("$file.lock");
+    # A lock this process already holds counts as success (note that $Error is still set).
+    if (exists($Locks{$file})) { $Error = "Already locked"; return 1; }
+    if (!-w (Parse_Filename($file))[0]) {
+        $Error = "Permission denied";
+        return 0;
+    }
+    if (!defined($retry)) { $retry = 1; }
+    if (!defined($timeout)) { $timeout = 1200; }
+    if (!defined($max_mtime)) { $max_mtime = ($timeout > 0) ? int($timeout / 2) : 0; }
+    $start_time = time();
+    LOCK: {
+        if (!($lock_fh = IO::File->new($file, O_RDWR|O_CREAT|O_EXCL))) {
+            if (!$retry || (($timeout > 0) && ((time() - $start_time) > $timeout))) {
+                $Error = "Locked by someone else";
+                return 0;
+            }
+            $mtime = ($max_mtime > 0) ? (stat($file))[9] : 0;
+            if ($mtime && ((time() - $mtime) > $max_mtime)) { unlink($file); redo LOCK; }
+            select(undef, undef, undef, 0.1);    # pause 0.1 s before the next attempt instead of spinning on the CPU
+            redo LOCK;
+        }
+    }
+    $lock_fh->close();
+    $Locks{$file} = 1;
+    return 1;
+}
+
+# Release a lock taken by Lock_File; returns 1 on success, or 0 with the reason in $Error.
+sub Unlock_File( $ ) {
+    my $target = shift;
+    $Error = "Invalid filename", return 0 unless $target;
+    my $lock_path = Get_Abs_Path("$target.lock");
+    $Error = "Not locked", return 0 unless exists($Locks{$lock_path});
+    $Error = "Cannot unlock", return 0 unless unlink($lock_path);
+    delete($Locks{$lock_path});
+    return 1;
+}
+
+{
+ my $Uname;
+ foreach my $dir ('/bin', '/sbin', '/usr/bin', '/usr/sbin') {
+ -x "$dir/uname" and $Uname = "$dir/uname", last;
+ }
+ my $Host = $Uname ? `$Uname -n` : 'localhost';
+ chomp($Host);
+ ($Host) = ($Host =~ /^([^\.]+)(\..*)?$/);
+
+sub Write_Log( $ $ ; $ $ ) {
+ no strict "refs";
+ my ($log_file, $msg, $name, $pid) = @_;
+ my $error = 0;
+ my $date;
+ local *LOG;
+
+ if (!defined($log_file) || !defined($msg)) { return 0; }
+ if (*{$log_file}{IO}) {
+ *LOG = *{$log_file}{IO};
+ } elsif ($log_file eq '/dev/null') {
+ return 1;
+ } else {
+ if (!Lock_File($log_file)) { return 0; }
+ if (!open(LOG, ">> $log_file")) { $error = 1; }
+ }
+ if (!$error) {
+ chomp($msg);
+ $date = localtime(time());
+ if (!$name) { $name = $0; }
+ if (!$pid) { $pid = $$; }
+ if (!print LOG "$date $Host $name\[$pid\]: $msg\n") { $error = 1; }
+ if (!*{$log_file}{IO}) { close(LOG); }
+ }
+ if ($error && $!) { $Error = "$!"; }
+ if (!*{$log_file}{IO}) { Unlock_File($log_file); }
+ return !$error;
+}}
+
+sub Parse_Filename( $ ) {
+ my ($name) = @_;
+ my ($last_slash_pos, $dir, $file);
+
+ if (!defined($name)) { return (); }
+ $last_slash_pos = rindex($name, "/");
+ if ($last_slash_pos >= 0) {
+ $dir = substr($name, 0, $last_slash_pos + 1);
+ $file = substr($name, $last_slash_pos + 1);
+ } else {
+ $dir = "";
+ $file = $name;
+ }
+ return ($dir, $file);
+}
+
+sub Expand_Path( $ ) {
+ my ($path) = @_;
+ my $home_dir;
+
+ $path && ($path =~ /^~/o) or return $path;
+ $path =~ /^~([^\/]*)(.*)$/o;
+ $home_dir = $1 ? (getpwnam($1))[7] :
+ ($ENV{"HOME"} || $ENV{"LOGDIR"} || (getpwuid($>))[7]);
+ defined($home_dir) and $path = "$home_dir$2";
+ return $path;
+}
+
+sub Get_Abs_Path( $ ) {
+ my ($path) = @_;
+
+ defined($path) or return $path;
+ $path = Expand_Path($path);
+ $path =~ /^\//o or $path = getcwd() . "/$path";
+ $path =~ s(/{2,})(/)g;
+
+# get rid of "/./"
+
+ while ($path =~ /^(.*?)\/\.(?:|\/(.*))$/o) {
+ $path = "$1/" . ($2 ? $2 : "");
+ }
+
+# get rid of "/../"
+
+ while ($path =~ /^(((?:.*?\/)*?)[^\/]+){0,1}?\/\.\.(?:|\/(.*))$/o) {
+ $path = ($1 ? $2 : "/") . ($3 ? $3 : "");
+ }
+ return $path;
+}
+
+{
+    # Alphabet for generated keys; srand() is called once when this block runs at module load.
+    my @Chars = ("A" .. "Z", "a" .. "z", 0 .. 9);
+    srand();
+# Return a random alphanumeric key of $len characters; $len falls back to 8 unless it is an integer in 2..1024.
+sub Get_Random_Key( ; $ ) {
+    my ($len) = @_;
+    if (!defined($len) || ($len !~ /^\d+$/o) || ($len < 2) || ($len > 1024)) {
+        $len = 8;
+    }
+    return join("", @Chars[map { rand @Chars } (1 .. $len)]);    # was (1 .. 8), which ignored $len
+}}
+
+sub Hex2Ascii( $ ) {
+ my ($str) = @_;
+
+ if ($str) { $str =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg; }
+ return $str;
+}
+
+{
+    my $a2h = {    # character => percent-encoded replacement used by Ascii2Hex
+        "\t" => "%09",    # tab is 0x09; the table used to say %29, which Hex2Ascii decodes to ")"
+        "+" => "%2B",
+        "," => "%2C",
+        "." => "%2E",
+        ";" => "%3B",
+        "/" => "%2F",
+        "?" => "%3F",
+        ":" => "%3A",
+        "@" => "%40",
+        "=" => "%3D",
+        "&" => "%26",
+        " " => "%20",
+        "<" => "%3C",
+        ">" => "%3E",
+        "\"" => "%22",
+        "%" => "%25",
+        "#" => "%23",
+        "[" => "%5B",
+        "]" => "%5D",
+        "{" => "%7B",
+        "}" => "%7D",
+        "|" => "%7C",
+        "\\" => "%5C",
+        "^" => "%5E",
+        "~" => "%7E",
+        "`" => "%60"};
+# Percent-encode every character of $str that has an entry in the table above; all other characters pass through unchanged.
+sub Ascii2Hex( $ ) {
+    my ($str) = @_;
+    my $new_str = "";
+    # False inputs (undef, "", "0") are returned untouched.
+    if (!$str) { return $str; }
+    foreach my $char (split(//, $str)) {
+        if (exists($a2h->{$char})) { $char = $a2h->{$char}; }
+        $new_str .= $char;
+    }
+    return $new_str;
+}}
+
+sub Get_Config_Record( $ $ ) {
+ my ($conf_file, $rec) = @_;
+ my ($db, $field, $value);
+ my @result = ();
+
+ if (!($db = Registry->New($conf_file, "r", 1))) {
+ $Error = "$Registry::Error", return ();
+ }
+ if (!$db->Record_Exists($rec)) {
+ $Error = qq("$rec" record not found);
+ return ();
+ }
+ foreach my $field (qw(dir users log)) {
+ if (!($value = Expand_Path($db->Get_Val($rec, $field)))) {
+ if ($field eq "log") {
+ $value = "";
+ } else {
+ $Error = qq("$field" field of "$rec" record is missing), return ();
+ }
+ } elsif ($value !~ /^\//o) {
+ $Error = qq("$field" field of "$rec" record should be absolute path);
+ return ();
+ }
+ push(@result, $value);
+ }
+ foreach my $field (qw(max_down grace_period)) {
+ if (!($value = $db->Get_Val($rec, $field)) ||
+ ($value !~ /^\d+$/o)) {
+ $value = 0;
+ }
+ push(@result, $value);
+ }
+ return @result;
+}
+
+sub Round( $ ) {
+ my ($num) = @_;
+
+ return int($num + 0.5);
+}
+
+sub Log( $ $ ) {
+ my ($log_num, $msg) = @_;
+
+ (defined($log_num) && ($log_num >= 0) && $LOG_FILE[$log_num]) and
+ Write_Log($LOG_FILE[$log_num], $msg);
+}
+
+sub Set_Log( $ $ ) {
+ my ($log_num, $file) = @_;
+
+ (defined($log_num) && ($log_num >= 0) && $file) and
+ $LOG_FILE[$log_num] = $file;
+}
+
+sub Min( $ $ ) {    # numeric minimum; the second argument wins ties
+    my ($first, $second) = @_;
+    if ($first < $second) { return $first; }
+    return $second;
+}
+
+sub Max( $ $ ) {    # numeric maximum; the second argument wins ties
+    my ($first, $second) = @_;
+    if ($first > $second) { return $first; }
+    return $second;
+}
+
+sub Reg_Diff( $ $ ; $ $ $ $ $ ) {
+ my ($regs1, $regs2, $strict, $s1, $e1, $s2, $e2) = @_;
+ my (@new_regs, $start, $end, $new_reg);
+
+ $regs1 && $regs2 or return $regs1;
+ $s1 ||= 0;
+ defined($e1) or $e1 = 1;
+ $s2 ||= 0;
+ defined($e2) or $e2 = 1;
+ for (my $i = 0; $i < @$regs1; ++$i) {
+ $start = $$regs1[$i][$s1];
+ $end = $$regs1[$i][$e1];
+ for (my $j = 0; $j < @$regs2; ++$j) {
+ $$regs2[$j][$s2] > $end and last;
+ $$regs2[$j][$e2] < $start and next;
+ if (($$regs2[$j][$s2] <= $start) && ($$regs2[$j][$e2] >= $end)) {
+ undef($start), last;
+ }
+ if (($$regs2[$j][$s2] > $start) && ($$regs2[$j][$e2] >= $end)) {
+ $end = $$regs2[$j][$s2] - 1, last;
+ }
+ if (($$regs2[$j][$s2] <= $start) && ($$regs2[$j][$e2] < $end)) {
+ $start = $$regs2[$j][$e2] + 1, next;
+ }
+ ($start < ($$regs2[$j][$s2] - 1)) || !$strict and
+ $new_reg = [@{$$regs1[$i]}],
+ $$new_reg[$s1] = $start,
+ $$new_reg[$e1] = $$regs2[$j][$s2] - 1,
+ push(@new_regs, $new_reg);
+ $start = $$regs2[$j][$e2] + 1;
+ }
+ !defined($start) || ($start > $end) and next;
+ ($start < $end) || !$strict and
+ $new_reg = [@{$$regs1[$i]}],
+ $$new_reg[$s1] = $start,
+ $$new_reg[$e1] = $end,
+ push(@new_regs, $new_reg);
+ }
+ return \@new_regs;
+}
+
+sub Reg_Rem_Overlap( $ ; $ $ $ ) {
+ my ($regs, $strict, $s, $e) = @_;
+ my (@new_regs);
+
+ $regs or return $regs;
+ $s ||= 0;
+ defined($e) or $e = 1;
+ for (my $i = 0; $i < @$regs; ++$i) { push(@new_regs, [@{$$regs[$i]}]); }
+ for (my $i = 0; $i < @new_regs; ++$i) {
+ if (($i < $#new_regs) && ($new_regs[$i + 1][$s] <= $new_regs[$i][$e])) {
+ $new_regs[$i + 1][$e] <= $new_regs[$i][$e] and
+ splice(@new_regs, $i + 1, 1),
+ --$i, next;
+ $new_regs[$i + 1][$s] = $new_regs[$i][$e] + 1;
+ }
+ ($new_regs[$i][$s] < $new_regs[$i][$e]) || !$strict and next;
+ splice(@new_regs, $i, 1);
+ --$i;
+ }
+ return \@new_regs;
+}
+
+sub Reg_Sort( $ ; $ $ $ ) {
+ my ($regs, $rev, $s, $e) = @_;
+ my (@new_regs);
+
+ $regs or return $regs;
+ $s ||= 0;
+ defined($e) or $e = 1;
+ if ($rev) {
+ @new_regs = sort { ($$b[$s] <=> $$a[$s]) || ($$b[$e] <=> $$a[$e]) } @$regs;
+ } else {
+ @new_regs = sort { ($$a[$s] <=> $$b[$s]) || ($$a[$e] <=> $$b[$e]) } @$regs;
+ }
+ return \@new_regs;
+}
+
+sub Reg_Intersect( $ $ ; $ $ $ $ $ ) {
+ my ($regs1, $regs2, $strict, $s1, $e1, $s2, $e2) = @_;
+
+ $regs1 && $regs2 or return undef;
+ $s1 ||= 0;
+ defined($e1) or $e1 = 1;
+ $s2 ||= 0;
+ defined($e2) or $e2 = 1;
+ return Reg_Diff($regs1, Reg_Diff($regs1, $regs2, $strict, $s1, $e1,
+ $s2, $e2), $strict, $s1, $e1, $s1, $e1);
+}
+
+sub Reg_Merge( $ ; $ $ $ ) {
+ my ($regs, $strict, $s, $e) = @_;
+ my (@new_regs);
+
+ $regs or return $regs;
+ $s ||= 0;
+ defined($e) or $e = 1;
+ for (my $i = 0; $i < @$regs; ++$i) { push(@new_regs, [@{$$regs[$i]}]); }
+ for (my $i = 0; $i < @new_regs; ++$i) {
+ ($i < $#new_regs) &&
+ ($new_regs[$i + 1][$s] == ($new_regs[$i][$e] + 1)) and
+ $new_regs[$i][$e] = $new_regs[$i + 1][$e],
+ splice(@new_regs, $i + 1, 1),
+ --$i, next;
+ }
+ for (my $i = 0; $i < @new_regs; ++$i) {
+ ($new_regs[$i][$s] < $new_regs[$i][$e]) || !$strict and next;
+ splice(@new_regs, $i, 1);
+ --$i;
+ }
+ return \@new_regs;
+}
+
+sub safe_glob {
+ my ($regexp, $dir) = @_;
+ my (@files);
+ local (*DIR);
+
+ $dir ||= ".";
+ $regexp ||= ".*";
+ opendir(DIR, $dir) or return;
+ @files = grep { /$regexp/ } readdir(DIR);
+ closedir(DIR);
+ return wantarray() ? @files : scalar(@files);
+}
+
+sub redirect_err2log {
+ my ($facility) = @_;
+
+ $Facility = $facility;
+ stderr2log();
+}
+
+sub stderr2log {
+ my ($oldfh);
+
+ open(STDERR, "> /dev/null");
+ open(STDERR, "| logger -p $Facility.err -t '$0\[$$\]'");
+ $oldfh = select(STDERR); $| = 1; select($oldfh);
+}
+
+sub openlogs {
+ my ($facility) = @_;
+
+ $facility and $Facility = $facility;
+ stderr2log();
+ setlogsock("unix");
+ openlog($0, "pid", $Facility);
+ $Syslog = 1;
+}
+
+sub daemon {
+ my ($facility) = @_;
+ my ($pid);
+
+ if ($pid = fork()) {
+ exit(0);
+ } elsif (!defined($pid)) {
+ wr_err("$E_FORK: $!");
+ die;
+ } else {
+ setsid();
+ close(STDIN);
+ close(STDOUT);
+ open(STDOUT, "> /dev/null");
+ openlogs($facility);
+ }
+}
+
+sub start_watcher {
+ my ($watcher, $facility, @params) = @_;
+ my ($pid, $parent);
+
+ $parent = $$;
+ if ($pid = fork()) {
+ return;
+ } elsif (!defined($pid)) {
+ wr_err("$E_FORK: $!");
+ die;
+ } else {
+ setsid();
+ close(STDIN);
+ close(STDOUT);
+ open(STDOUT, "> /dev/null");
+ $0 .= "_watcher";
+ openlogs($facility);
+ &$watcher($parent, @params);
+ }
+}
+
+sub wr_log {
+ my $msg = shift;
+
+ chomp($msg);
+ $msg = ( $Msg_Prefix ? &$Msg_Prefix : "") . $msg;
+ if ($Syslog) {
+ syslog("info", "%s", $msg);
+ } else {
+ print "$msg\n";
+ }
+}
+
+sub wr_err {
+ my $msg = shift;
+
+ chomp($msg);
+ print STDERR (( $Msg_Prefix ? &$Msg_Prefix : ""), "$msg\n");
+ return 1;
+}
+
+# Print $msg as a prompt, read one line from STDIN; returns 1 for "y"/"yes" (any letter case), else 0.
+sub confirm {
+    my $prompt = shift;
+    print $prompt;
+    my $reply = <STDIN>;
+    chomp($reply);
+    return 1 if $reply =~ /^(y|yes)$/io;
+    return 0;
+}
+
+END {
+ foreach my $lock (keys(%Locks)) { unlink($lock); }
+}
+
+1;
diff --git a/src/anal_gloc.pl b/src/anal_gloc.pl
new file mode 100755
index 0000000..644d952
--- /dev/null
+++ b/src/anal_gloc.pl
@@ -0,0 +1,142 @@
+#!/usr/bin/env perl
+
+$savname1 = "";
+$savname2 = "";
+$skip = 0;
+$endblock = 0;
+$score = 0;
+$strand = "";
+$initstrnd;
+$s1s = 999999999;
+$s2s = 999999999;
+$first = 1;
+$plus_sc = 0;
+$minus_sc = 0;
+
+
+while ($line = <STDIN>) {
+
+ if ($line =~ /^>/) {
+ if (!$first) {
+ if ($strand eq "+") {
+ print STDOUT " Region [$s1s $reg1s][$s2s $reg2s] $score $strand\n";
+ }
+ else {
+ print STDOUT " Region [$s1s $reg1s][$reg2s $s2s] $score $strand\n";
+ }
+
+ if ($strand ne $initstrnd) {
+ print STDOUT "INV\n"
+ }
+ if ($strand eq "+") { $plus_sc += $score; }
+ else { $minus_sc += $score; }
+ if ($plus_sc > $minus_sc) {
+ print STDOUT "Main score (+) $plus_sc; Inverted $minus_sc\n";
+ }
+ else {
+ print STDOUT "Main score (-) $minus_sc; Inverted $plus_sc\n";
+ }
+ $plus_sc = 0;
+ $minus_sc = 0;
+ $score = 0;
+ $s1s = 999999999;
+ $s2s = 999999999;
+ $strand = "";
+ }
+ $first = 1;
+ $name1 = $line;
+ chomp $name1;
+ $line = <STDIN>;
+ if ($line !~ /^>/) {
+ print STDERR "Expecting a name, but got $line";
+ exit (1);
+ }
+ $name2 = $line;
+ chomp $name2;
+ $inblock = 1;
+ $skip = 0;
+ if (($name1 eq $savname1) && ($name2 eq $savname2)) {
+ $skip = 1;
+ }
+ else { print STDOUT "$name1 $name2\n"; }
+
+ $savname1 = $name1;
+ $savname2 = $name2;
+ }
+ elsif (!$skip) {
+ $endblock = 0;
+ $line =~ /\((\d+) (\d+)\)=\((\d+) (\d+)\) ([0-9\.]*) (.) (.*)/;
+ if ($1 == 0 || $3 == 0) {
+ next;
+ }
+# print STDOUT "strand $strand $s2s $4\n";
+ if (($strand eq "+") && ($6 eq "+") && ($s2s + 20 < $4) ) {
+ $endblock += 2;
+ }
+ if (($strand eq "-") && ($6 eq "-") && ($s2s > $4 + 20) ) {
+ $endblock += 2;
+ }
+ if ($strand eq "") { $strand = $6; }
+ if ($6 ne $strand) {
+ $endblock += 1;
+ }
+
+ if (!$endblock) {
+ $s2s = $3;
+ $s1s = $1;
+ $s1e = $2;
+ $s2e = $4;
+ $score += $5;
+ if ($first) {
+ print STDOUT " ";
+ print STDOUT " ";
+ $initstrnd = $strand;
+ $reg1s = $2;
+ $reg2s = $4;
+ $first = 0;
+ }
+ }
+ else {
+ if ($strand eq "+") {
+ print STDOUT " Region [$s1s $reg1s][$s2s $reg2s] $score $strand\n";
+ }
+ else {
+ print STDOUT " Region [$s1s $reg1s][$reg2s $s2s] $score $strand\n";
+ }
+
+ if ($strand eq "+") { $plus_sc += $score; }
+ else { $minus_sc += $score; }
+
+ if ($endblock %2) { print STDOUT "INV "; }
+ else {print STDOUT " "; }
+ if ($endblock > 1) { print STDOUT "TRL "; }
+ else {print STDOUT " "; }
+ $s2s = $3;
+ $s1s = $1;
+ $s1e = $2;
+ $s2e = $4;
+ $reg1s = $s1e;
+ $reg2s = $s2e;
+ $score = $5;
+ $strand = $6;
+ # print STDOUT "strand $strand\n";
+ }
+ }
+}
+if (!$first){
+ if ($strand eq "+") {
+ print STDOUT " Region [$s1s $reg1s][$s2s $reg2s] $score $strand\n";
+ }
+ else {
+ print STDOUT " Region [$s1s $reg1s][$reg2s $s2s] $score $strand\n";
+ }
+ if ($strand eq "+") { $plus_sc += $score; }
+ else { $minus_sc += $score; }
+}
+
+if ($plus_sc > $minus_sc) {
+ print STDOUT "Main score (+) $plus_sc; Inverted $minus_sc\n";
+}
+else {
+ print STDOUT "Main score (-) $minus_sc; Inverted $plus_sc\n";
+}
diff --git a/src/anchors.c b/src/anchors.c
new file mode 100644
index 0000000..44919de
--- /dev/null
+++ b/src/anchors.c
@@ -0,0 +1,279 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include "skiplist.h"
+
+typedef struct GapFreeChunkList {
+ int x;
+ int y;
+ int length;
+ int score;
+ struct GapFreeChunkList *next;
+} gfc;
+
+typedef struct HitLocationList {
+ int seq1start;
+ int seq2start;
+ int seq1end;
+ int seq2end;
+ float score;
+ struct HitLocationList *next;
+ struct HitLocationList *bkptr;
+ gfc* first;
+ gfc* last;
+ float scoreSoFar;
+} hll;
+
+typedef struct hllpointer {
+ int number;
+ char isstart;
+ hll* myhll;
+} hptr;
+
+char seq1name[255];
+char seq2name[255];
+
+float gapopen =0, gapcont=0;
+int gapfreechunks = 0;
+hll* parseCHAOS(FILE* infile, int* numhits);
+hll* findBestChain(hptr* myarr, int arrsize);
+void doOutput(hll* mylist);
+hll* sortList(hll* mylist);
+
+
+/* qsort comparator for the sweep events built in main(): ascending by
+   coordinate; at equal coordinates a start event sorts before an end event.
+   Two events of the same kind at the same coordinate now compare equal; the
+   previous version answered -1 (or 1) for both argument orders, which breaks
+   the consistency qsort() requires of its comparator. */
+static int hptrcomp (const void *p1, const void *p2) {
+    const hptr *a = (const hptr*)p1;
+    const hptr *b = (const hptr*)p2;
+    if (a->number > b->number)
+        return 1;
+    if (a->number < b->number)
+        return -1;
+    return (b->isstart != 0) - (a->isstart != 0);
+}
+
+/* Entry point: read CHAOS hits from a file (argv[1]) or stdin, chain them and print the best chain. Returns 0 on success, 1 on bad usage, 2 if the file cannot be opened. */
+int main(int argc, char** argv){
+    FILE* inf;
+    hll* mylist, *temp, *best;
+    int numhits, i=2;   /* index of the first option when argv[1] names the input file */
+    hptr* myptrs;
+    if (argc < 1 || argc > 6) {
+        printf("usage: anchors [filename] [-gap # #]\n");
+        printf("For -gap the first # is the gap open penalty, the second the gap continue");
+        return 1;
+    }
+    if (argc == 1 || argv[1][0] == '-') {   /* only a leading '-' marks an option; strchr() also caught file names merely containing one */
+        i = 1;
+        inf = stdin;
+    }
+    else if (!(inf = fopen(argv[1],"r"))) {
+        printf("couldn't open input file\n");
+        return 2;
+    }
+    while (i < argc) {
+        if (!strcmp(argv[i], "-gap") && i + 2 < argc) {
+            sscanf(argv[i+1],"%f",&gapopen);
+            sscanf(argv[i+2],"%f",&gapcont);
+            i += 3;
+        }
+        else if (!strcmp(argv[i], "-gfc")) {
+            gapfreechunks = 1;
+            i += 1;
+        }
+        else {   /* unknown option, or -gap without both values: the old loop never advanced i here and spun forever */
+            printf("usage: anchors [filename] [-gap # #]\n");
+            return 1;
+        }
+    }
+    initLib();
+    mylist = parseCHAOS(inf, &numhits);
+    if (!numhits) return 0;
+    myptrs = (hptr*) malloc (sizeof(hptr) * numhits *2);
+    i = 0;
+    for (temp = mylist; temp; temp = temp->next, i += 2) {   /* two sweep events per hit: its seq1 start and its seq1 end */
+        myptrs[i].number = temp->seq1start;
+        myptrs[i].isstart = 1;
+        myptrs[i].myhll = temp;
+        myptrs[i+1].number = temp->seq1end;
+        myptrs[i+1].isstart = 0;
+        myptrs[i+1].myhll = temp;
+    }
+    qsort(myptrs, numhits*2, sizeof(hptr), hptrcomp);
+    best = findBestChain(myptrs, numhits*2);
+    doOutput(best);
+    return 0;
+}
+
+int whRulez(hll* one, hll* two) {
+ float gapdiff = ((float)(two->seq2end - one->seq2end)) * gapcont;
+ return two->scoreSoFar-one->scoreSoFar-gapdiff > 0;
+}
+
+float gapPen(hll* next, hll* prev) {
+ float j= ((float)(next->seq2start-prev->seq2end))*gapcont + gapopen;
+ // printf("%d (%f)*(%f) %f gap\n", next->seq2start-prev->seq2end, ((float)(next->seq2start-prev->seq2end)),gapcont,j);
+ return j;
+}
+
+/* Sweep the sorted start/end events and return the hit that ends the highest-scoring chain, or NULL when no chain scores above -1 (bestptr used to be read uninitialised then). */
+hll* findBestChain(hptr* array, int arrsize) {
+    sklst* skipper = makeSkLst();
+    sle* help, *bestptr = 0;
+    float best = -1;
+    int i;
+    for (i = 0; i < arrsize; i++) {
+        if (array[i].isstart) {
+            help = SLfind(skipper, array[i].myhll->seq2start);
+            if (help->myelem &&
+                (gapPen(array[i].myhll, ((hll*)help->myelem)) + ((hll*)help->myelem)->scoreSoFar) > 0) {
+                array[i].myhll->bkptr = help->myelem;
+                array[i].myhll->scoreSoFar = ((hll*)help->myelem)->scoreSoFar + array[i].myhll->score + gapPen(array[i].myhll, ((hll*)help->myelem));
+            }
+            else {
+                array[i].myhll->bkptr = 0;
+                array[i].myhll->scoreSoFar = array[i].myhll->score;
+            }
+        }
+        else {
+            help = SLfind(skipper, array[i].myhll->seq2end);
+            /* do not insert this hit when whRulez() says the entry found for its seq2end already beats it */
+            if (help->myelem && whRulez(array[i].myhll,((hll*)help->myelem)))
+                continue;
+            SLinsertAfter(skipper, help, array[i].myhll->seq2end, array[i].myhll);
+            help = help->next[0];
+            /* remove following entries that, per whRulez(), do not beat the hit just inserted */
+            while (help->next[0] && !whRulez(((hll*)help->myelem), ((hll*)help->next[0]->myelem)))
+                SLremove(skipper, help->next[0]);
+        }
+    }
+    help = skipper->sentinel->next[0];
+    while (help) {
+        if (((hll*)help->myelem)->scoreSoFar > best) {
+            best = ((hll*)help->myelem)->scoreSoFar;
+            bestptr = help;
+        }
+        help = help->next[0];
+    }
+    /* doOutput() prints nothing for a NULL chain end */
+    return bestptr ? (hll*)bestptr->myelem : 0;
+}
+
+/* Print the chain ending at `best`, following bkptr links, one line per hit:
+     (seq2start seq2end)=(seq1start seq1end) score
+   When -gfc is set and a hit carries chunk data, each gap-free chunk of the
+   hit is printed in the same layout instead of the hit itself.
+   The unused locals (len, chl, i, bestscore) and the loop that only counted
+   the chain length were removed; the output is unchanged. */
+void doOutput(hll* best) {
+    hll *temp;
+    gfc* tmpgf;
+
+    for (temp = best; temp; temp = temp->bkptr) {
+        /* whole-hit line */
+        if (!gapfreechunks || !temp->first) {
+            printf("(%d %d)=",temp->seq2start, temp->seq2end);
+            printf("(%d %d) %f\n",temp->seq1start, temp->seq1end, temp->score);
+        }
+        else {
+            /* one line per gap-free chunk, walking the list from temp->first */
+            for (tmpgf = temp->first; tmpgf ; tmpgf = tmpgf->next) {
+                printf("(%d %d)=(%d %d) %d\n", tmpgf->y, tmpgf->y + tmpgf->length-1, tmpgf->x, tmpgf->x + tmpgf->length-1,
+                       tmpgf->score);
+            }
+        }
+    }
+}
+
+char* rolltonum(char* str) {
+ char *got1=0, *got2=0;
+ int in=0, i=0;
+ while (1) {
+ if (str[i] == 0) {
+ break;
+ }
+ if (str[i] == ';' && got1 && got2){
+ return got1;
+ }
+ if (isdigit(str[i])) {
+ if (!in && (!i || isspace(str[i-1]))) {
+ if (got1)
+ got2 = &str[i];
+ else
+ got1 = &str[i];
+ in = 1;
+ }
+ }
+ else if (in && (isspace(str[i]))) {
+ if (got2) {
+ got1 = got2; got2=0; in = 0;
+ }
+ in = 0;
+ }
+
+ else {
+ in = 0;
+ got1=got2=0;
+ }
+ i++;
+ }
+ return &str[i];
+}
+
+/* Parse the next line of CHAOS output into tt: returns 1 for a hit line, 0 for any other line, -1 when input is exhausted.
+   Renamed from getline() and made static: POSIX <stdio.h> declares an incompatible getline(); parseCHAOS below is its only caller in this file. */
+static int readHitLine(FILE* infile, hll* tt) {
+    char temp[1024];
+    char* help;
+    int z, h = 0;
+    if (!fgets(temp, sizeof(temp), infile))   /* EOF or read error: temp holds no new line to parse */
+        return -1;
+    help = rolltonum(temp);
+    z = sscanf(help, "%d %d;%n", &tt->seq2start, &tt->seq2end, &h);
+    if (z<2) return 0;
+    help = rolltonum(help+h);
+    if (sscanf(help,"%d %d; score = %f (%*c)\n", &tt->seq1start, &tt->seq1end,&tt->score)<3)
+        return 0;
+    return 1;
+}
+
+/* Read all hits from infile into a linked list (most recently read hit first) and store their number in *totnum.
+   With -gfc, the "y x length score" rows that follow a hit become its gap-free chunk list (first/last). */
+hll* parseCHAOS(FILE* infile, int* totnum) {
+    hll *myres=0, *tt;
+    gfc* temp;
+    int got;
+    *totnum = 0;
+    for (;;) {
+        tt = (hll*) malloc(sizeof(hll));
+        while ((got = readHitLine(infile, tt)) == 0) ;   /* skip lines that are not hits */
+        /* Input exhausted: free the spare node (it used to leak). A final hit without a trailing newline is no longer dropped. */
+        if (got < 0) { free(tt); break; }
+        if (gapfreechunks) {
+            tt->first = tt->last = temp = (gfc*) malloc(sizeof (gfc));
+            temp->next = 0;
+            while (fscanf(infile, "%d %d %d %d", &temp->y, &temp->x, &temp->length, &temp->score) == 4){
+                tt->first = temp;
+                temp = (gfc*) malloc(sizeof (gfc));
+                temp->next = tt->first;
+            }
+            if (temp == tt->last) tt->first = tt->last = 0;   /* no chunk rows were read; test before freeing temp */
+            free(temp);
+        }
+        tt->next = myres;
+        tt->bkptr = 0;
+        tt->scoreSoFar = 0;
+        (*totnum)++;
+        myres = tt;
+    }
+    return myres;
+}
+
+
+
+
+
diff --git a/src/ancseq.cpp b/src/ancseq.cpp
new file mode 100644
index 0000000..9e3a93f
--- /dev/null
+++ b/src/ancseq.cpp
@@ -0,0 +1,720 @@
+/**
+ * @file
+ * Compiles ancestor FASTA file using ancestor generation script.
+ *
+ * Arguments:
+ *
+ * -i filename : ancestor generation script <br>
+ * -g genome genomeindex : genome index, genomeindex refers to 2 files: genomeindex.ind and genomeindex.seq <br>
+ * -a alignmentindex : alignment index, alignmentindex refers to 2 files: alignmentindex.ind and alignmentindex.seq <br>
+ * -o filename : output -- ancestor fasta file
+ *
+ * Ancestor generation script example:
+ *
+ * [TODO]
+ *
+ * Comment: [TODO].
+ *
+ *
+ * @author Mikhail Soloviev
+ * @date 31.03.2006
+ * @version 1.0
+ *
+ */
+
+#include <iostream>
+#include <string>
+#include <vector>
+#include <map>
+
+#include <stdlib.h>
+#include <stdio.h>
+
+using namespace std;
+
+#include "util.cpp"
+#include "faindex.cpp"
+
+#define fastaRowLength 50
+
+// Writes the reverse complement of seq[0..size-1] into rev[0..size-1].
+// Each base is complemented with comp(); seq and rev must not overlap.
+void revComp(char* seq,char* rev,long size) {
+    for (long k=0;k<size;k++) {
+        rev[size-1-k]=comp(seq[k]);
+    }
+}
+
+// Appends one FASTA record to out: a ">header" line followed by the
+// contents of the file at path, broken into rows of at most
+// fastaRowLength characters.
+//
+// out    : open output stream
+// header : text written after '>'
+// path   : file holding the raw sequence data (opened via openFile)
+void appendSeq(FILE *out,string header,string path) {
+    fprintf(out,">%s\n",header.c_str());
+    char buf[fastaRowLength+1];
+    FILE *in=openFile(path,"r");
+    // fgets stores at most size-1 characters, so the full buffer size must
+    // be passed to obtain rows of fastaRowLength characters.
+    while (fgets(buf,sizeof(buf),in)!=NULL) {
+        if (buf[0]!='\0') fprintf(out,"%s\n",buf);
+    }
+    fclose(in);
+}
+
+typedef char* pchar;
+typedef FILE* pfile;
+typedef pfile* ppfile;
+
+// Pair of cut positions; filled by calcCutRangeLetter from
+// calcCutStartLetter (start) and calcCutEndLetter (end).
+struct Range {
+ int start;
+ int end;
+};
+
+// Position of one sequence interval, as read from the alignment map
+// (loadAlignMap) or produced by the write* functions.
+struct AlignLocation {
+ string org; // organism id; loadAlignMap uses "0" for the alignment itself
+ string name; // sequence name/id
+ int start; // interval start; callers subtract 1 to get a 0-based offset
+ int end; // interval end (inclusive: length is end-start+1)
+ char strand; // compared against '+' by the callers
+};
+
+// One alignment: its id plus the locations of the participating
+// sequences, keyed by organism id.
+struct AlignMap {
+ string id;
+ map<string,AlignLocation> location; // string: orgId
+ char strand; // loadAlignMap copies the strand of the third location here
+};
+
+map<string,AlignMap> alignMap; // string: alignId
+
+// Loads the alignment map file at path into the global alignMap.
+//
+// Every line holds three location records of the form
+//   <org> <name> <start> <end> <strand>
+// The first record describes the alignment itself (its name is the
+// alignment id and it is stored under organism key "0"); the other two
+// describe the aligned sequences and are stored under their organism ids.
+// Lines that do not contain all 15 fields are skipped with a warning
+// instead of being stored with stale or uninitialised values.
+void loadAlignMap(string path) {
+    char line[2000];
+    char id[1000];
+    char name1[1000];
+    char name2[1000];
+    char org0[1000];
+    char org1[1000];
+    char org2[1000];
+
+    FILE *in=openFile(path,"r");
+    while (fgets(line,sizeof(line),in)!=NULL) {
+        AlignLocation loc0;
+        AlignLocation loc1;
+        AlignLocation loc2;
+        // %999s keeps every token inside its 1000-byte buffer
+        int fields=sscanf(line,"%999s %999s %d %d %c %999s %999s %d %d %c %999s %999s %d %d %c",
+            org0,id,&loc0.start,&loc0.end,&loc0.strand,
+            org1,name1,&loc1.start,&loc1.end,&loc1.strand,
+            org2,name2,&loc2.start,&loc2.end,&loc2.strand);
+        if (fields!=15) {
+            // fields<=0 means a blank line; anything else is malformed
+            if (fields>0) fprintf(stderr,"WARNING: Skipping malformed alignment map line: %s",line);
+            continue;
+        }
+        // org0 is parsed but not used: the alignment is always keyed "0"
+        loc0.org="0";
+        loc1.org=org1;
+        loc2.org=org2;
+        loc0.name=id;
+        loc1.name=name1;
+        loc2.name=name2;
+        AlignMap aMap;
+        aMap.id=id;
+        aMap.strand=loc2.strand;
+        aMap.location[loc0.org]=loc0;
+        aMap.location[loc1.org]=loc1;
+        aMap.location[loc2.org]=loc2;
+        alignMap[aMap.id]=aMap;
+    }
+    fclose(in);
+}
+
+// direct cut calculation: genome -> align, receives relative coord., returns absolute coord.
+
+// Returns the index in seq[start..end] of the relCut-th non-gap letter
+// counted from start.  Returns start when relCut is 0 or when the range
+// holds fewer than relCut letters.
+int calcCutStartLetter(char* seq,int start,int end,int relCut) {
+    if (relCut!=0) {
+        int letters=0;
+        int pos=start;
+        while (pos<=end) {
+            if (seq[pos]!='-' && ++letters==relCut) return pos;
+            pos++;
+        }
+    }
+    return start;
+}
+
+// Returns the index in seq[start..end] of the relCut-th non-gap letter
+// counted backwards from end.  Returns end when relCut is 0 or when the
+// range holds fewer than relCut letters.
+int calcCutEndLetter(char* seq,int start,int end,int relCut) {
+    if (relCut!=0) {
+        int letters=0;
+        int pos=end;
+        while (pos>=start) {
+            if (seq[pos]!='-' && ++letters==relCut) return pos;
+            pos--;
+        }
+    }
+    return end;
+}
+
+// reverse cut calculation: align -> genome, receives absolute coord., returns relative coord.
+
+// Counts the non-gap letters of seq at indices start..end that lie
+// before absCut.  Returns 0 when absCut is 0.
+int revCalcCutStartLetter(char* seq,int start,int end,int absCut) {
+    if (absCut==0) return 0;
+    int letters=0;
+    int pos=start;
+    while (pos<=end && pos<absCut) {
+        if (seq[pos]!='-') letters++;
+        pos++;
+    }
+    return letters;
+}
+
+// Counts the non-gap letters of seq at indices start..end that lie
+// behind absCut.  Returns 0 when absCut is 0.
+int revCalcCutEndLetter(char* seq,int start,int end,int absCut) {
+    if (absCut==0) return 0;
+    int letters=0;
+    int pos=end;
+    while (pos>=start && pos>absCut) {
+        if (seq[pos]!='-') letters++;
+        pos--;
+    }
+    return letters;
+}
+
+// Reads length bytes starting at byte offset of seq into a freshly
+// malloc'ed buffer (not NUL-terminated) and returns it; the caller owns
+// the buffer and releases it with free().
+//
+// A non-positive length yields an empty, still free()-able buffer.
+// When fewer bytes than requested can be read, a warning is printed and
+// the rest of the buffer is filled with the gap character '-' instead of
+// being left uninitialised.  Exits when the buffer cannot be allocated.
+char* readSeqBuf(FILE *seq,long offset,int length) {
+    size_t want=(length>0)?(size_t)length:0;
+    // malloc(0) may return NULL, so always ask for at least one byte
+    char* buf=(char*)malloc(want>0?want:1);
+    if (buf==NULL) {
+        fprintf(stderr,"ERROR: Out of memory reading %d bytes of sequence\n",length);
+        exit(1);
+    }
+    size_t got=0;
+    if (fseek(seq,offset,SEEK_SET)==0) got=fread(buf,sizeof(char),want,seq);
+    if (got<want) {
+        fprintf(stderr,"WARNING: Short read at offset %ld: wanted %d bytes, got %lu\n",
+            offset,length,(unsigned long)got);
+        for (size_t k=got;k<want;k++) buf[k]='-';
+    }
+    return buf;
+}
+
+// Writes length bytes of buf to out, either as they are (sameStrand
+// non-zero) or reverse-complemented via revComp.  Takes ownership of
+// buf and frees it in both cases.
+void writeSeqBuf(FILE *out,char* buf,int length,int sameStrand) {
+    if (!sameStrand) {
+        char* flipped=(char*)malloc(length*sizeof(char));
+        revComp(buf,flipped,length);
+        fwrite(flipped,sizeof(char),length,out);
+        free(flipped);
+        free(buf);
+        return;
+    }
+    fwrite(buf,sizeof(char),length,out);
+    free(buf);
+}
+
+// Copies length bytes found at byte offset of seq to out; the bytes are
+// reverse-complemented when sameStrand is zero (see writeSeqBuf, which
+// also releases the intermediate buffer).
+void writeSeq(FILE *out,FILE *seq,long offset,int length,int sameStrand) {
+    writeSeqBuf(out,readSeqBuf(seq,offset,length),length,sameStrand);
+}
+
+
+/*OLD
+void writeSeqCut(FILE *out,FILE *seq,long offset,int length,int sameStrand,int cutStart,int cutEnd) {
+ offset+=cutStart;
+ length-=cutStart+cutEnd;
+ writeSeq(out,seq,offset,length,sameStrand);
+}
+*/
+
+/*OLD
+Range writeSeqCutLetter(FILE *out,FILE *seq,long offset,int length,int sameStrand,int cutStart,int cutEnd) {
+ char* buf=readSeqBuf(seq,offset,length);
+ cutStart=cutStartLetter(buf,length,cutStart);
+ cutEnd=cutEndLetter(buf,length,cutEnd);
+ length-=cutStart+cutEnd;
+ memmove(buf,&buf[cutStart],length);
+ writeSeqBuf(out,buf,length,sameStrand);
+ Range r;
+ r.start=cutStart;
+ r.end=cutEnd;
+ return r;
+}
+*/
+
+map<string,FaIndex> genomeIndex;
+
+// Opens the genome data file genomePath+".seq", loads all records of
+// genomePath+".ind" and registers the resulting index in the global
+// genomeIndex under genomeName.  protoNumber is the textual prototype
+// count and is stored as an integer in the index.
+void openGenomeIndex(string genomeName,string protoNumber,string genomePath) {
+    FaIndex idx;
+    idx.id=genomeName;
+    idx.proto=atoi(protoNumber.c_str());
+    idx.file=openFile(genomePath+".seq","r+");
+    FILE *indexFile=openFile(genomePath+".ind","r");
+    for (;;) {
+        if (feof(indexFile)) break;
+        FaRecord rec=readIndexRecord(indexFile);
+        // records without an id are not stored
+        if (rec.id.size()==0) continue;
+        idx.record[rec.id]=rec;
+    }
+    fclose(indexFile);
+    genomeIndex[idx.id]=idx;
+}
+
+// Writes the interval start..end of genome sequence seqName to the
+// output streams, one prototype per stream: record "seqName:p" goes to
+// out[p-1] for p = 1..orgProto.  The data is reverse-complemented unless
+// strand is '+'.
+//
+// Returns the location that was written; its strand is always '+'
+// (marked "TODO check" by the original author).
+AlignLocation writeGenomeSeq(pfile out[],string orgName,int orgProto,string seqName,int start,int end,char strand) {
+    FILE *seqFile=genomeIndex[orgName].file;
+    for (int slot=0;slot<orgProto;slot++) {
+        string recId=seqName+":"+itoa(slot+1);
+        FaRecord rec=genomeIndex[orgName].record[recId];
+        // start is 1-based, record offsets are 0-based
+        writeSeq(out[slot],seqFile,rec.offset+start-1,end-start+1,strand=='+');
+    }
+    AlignLocation written;
+    written.org=orgName;
+    written.name=seqName;
+    written.start=start;
+    written.end=end;
+    // TODO check
+    written.strand='+';
+    return written;
+}
+
+// Writes end-start+1 gap characters ('-') to each of the streams
+// out[0..orgProto-1].  Nothing is written when the interval is empty
+// (end < start); previously that case passed a non-positive size to
+// malloc/memset.
+//
+// Returns the location describing the gap; its strand is always '+'
+// (marked "TODO check" by the original author).
+AlignLocation writeGenomeGap(pfile out[],string orgName,int orgProto,string seqName,int start,int end) {
+    int size=end-start+1;
+    if (size>0) {
+        char* buf=(char*)malloc(size*sizeof(char));
+        if (buf==NULL) {
+            fprintf(stderr,"ERROR: Out of memory writing gap of %d characters\n",size);
+            exit(1);
+        }
+        memset(buf,'-',size);
+        for (int p=1;p<=orgProto;p++) {
+            fwrite(buf,sizeof(char),size,out[p-1]);
+        }
+        free(buf);
+    }
+    AlignLocation loc;
+    loc.org=orgName;
+    loc.name=seqName;
+    loc.start=start;
+    loc.end=end;
+    // TODO check
+    loc.strand='+';
+    return loc;
+}
+
+FaIndex alignIndex;
+
+// Opens the alignment data file path+".seq" and loads all records of
+// path+".ind" into the global alignIndex.
+void openAlignIndex(string path) {
+    alignIndex.file=openFile(path+".seq","r+");
+    FILE *indexFile=openFile(path+".ind","r");
+    for (;;) {
+        if (feof(indexFile)) break;
+        FaRecord rec=readIndexRecord(indexFile);
+        // records without an id are not stored
+        if (rec.id.size()==0) continue;
+        alignIndex.record[rec.id]=rec;
+    }
+    fclose(indexFile);
+}
+
+// Writes the rows of alignment alignId to the output streams: rows
+// 1..proto1 go to out1[0..proto1-1] and rows proto1+1..proto1+proto2 go
+// to out2[0..proto2-1].  Only the column interval recorded under
+// organism key "0" of the alignment map is written.  Rows are
+// reverse-complemented when strand differs from the strand stored for
+// orgName.
+//
+// Returns the number of columns written per row.
+int writeAlignSeq(pfile out1[],int proto1,pfile out2[],int proto2,string alignId,string orgName,char strand) {
+    FILE *seqFile=alignIndex.file;
+    AlignLocation orgLoc=alignMap[alignId].location[orgName];
+    AlignLocation alignLoc=alignMap[alignId].location["0"];
+    int firstCol=alignLoc.start-1;
+    int columns=alignLoc.end-alignLoc.start+1;
+    FaRecord rec;
+    for (int slot=0;slot<proto1;slot++) {
+        rec=alignIndex.record[alignId+":"+itoa(slot+1)];
+        writeSeq(out1[slot],seqFile,rec.offset+firstCol,columns,strand==orgLoc.strand);
+    }
+    for (int slot=0;slot<proto2;slot++) {
+        rec=alignIndex.record[alignId+":"+itoa(proto1+slot+1)];
+        writeSeq(out2[slot],seqFile,rec.offset+firstCol,columns,strand==orgLoc.strand);
+    }
+    return columns;
+}
+
+/* not used anymore
+AlignLocation writeAlignSeqCut(FILE *out,string alignId,string orgIndex,string orgName,char strand,int cutAlignStart,int cutAlignEnd) {
+ FILE *seq=alignIndex.file;
+ FaRecord ind=alignIndex.record[alignId+":"+orgIndex];
+ AlignLocation loc=alignMap[alignId].location[orgName];
+ writeSeqCut(out,seq,ind.offset,ind.length,strand==loc.strand,cutAlignStart,cutAlignEnd);
+
+ // TODO -- find it via cutAlignStart,cutAlignEnd -- !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+ //loc.start+=cutStart;
+ //loc.end-=cutEnd;
+ return loc;
+}
+*/
+
+// TODO check implementation when start implementing overlapping, compare with writeAlignSeq
+
+/* OLD
+AlignLocation writeAlignSeqCutLetterAlign(FILE *out,string alignId,string orgIndex,string orgName,char strand,int cutAlignStart,int cutAlignEnd) {
+ FILE *seq=alignIndex.file;
+ FaRecord ind=alignIndex.record[alignId+":"+orgIndex];
+ AlignLocation loc=alignMap[alignId].location[orgName];
+
+ // TODO -- optimize by excluding double reading the same sequence
+
+ writeSeqCut(out,seq,ind.offset,ind.length,strand==loc.strand,cutAlignStart,cutAlignEnd);
+ char* buf=readSeqBuf(seq,ind.offset,ind.length);
+ loc.start+=reCutStartLetter(buf,ind.length,cutAlignStart);
+ loc.end-=reCutEndLetter(buf,ind.length,cutAlignEnd);
+ free(buf);
+ return loc;
+}
+*/
+
+// TODO check implementation when start implementing overlapping, compare with writeAlignSeq
+
+/* OLD
+AlignLocation writeAlignSeqCutLetter(FILE *out,string alignId,string orgIndex,string orgName,char strand,int cutStart,int cutEnd,int& cutAlignStart,int& cutAlignEnd) {
+ FILE *seq=alignIndex.file;
+ FaRecord ind=alignIndex.record[alignId+":"+orgIndex];
+ AlignLocation loc=alignMap[alignId].location[orgName];
+ Range r=writeSeqCutLetter(out,seq,ind.offset,ind.length,strand==loc.strand,cutStart,cutEnd);
+ cutAlignStart=r.start;
+ cutAlignEnd=r.end;
+ loc.start+=cutStart;
+ loc.end-=cutEnd;
+ return loc;
+}
+*/
+
+// Converts cut lengths given in letters into positions inside
+// seqBuf[start..end]: the start position comes from calcCutStartLetter,
+// the end position from calcCutEndLetter.
+Range calcCutRangeLetter(char* seqBuf,int start,int end,int cutStartLength,int cutEndLength) {
+    int cutFrom=calcCutStartLetter(seqBuf,start,end,cutStartLength);
+    int cutTo=calcCutEndLetter(seqBuf,start,end,cutEndLength);
+    Range cut;
+    cut.start=cutFrom;
+    cut.end=cutTo;
+    return cut;
+}
+
+// Builds a merged row for rows protoStart..protoEnd of alignment
+// alignId: it starts as a copy of the first row, and every non-gap
+// character of a later row overwrites the character at the same column.
+//
+// Returns a malloc'ed buffer as long as the first row (caller frees), or
+// NULL when protoStart > protoEnd.  Later rows that are longer than the
+// first one are only merged up to the length of the first row, so that
+// the buffer is never overrun.
+char* makeCons(string alignId,int protoStart,int protoEnd) {
+    FILE *seqFile=alignIndex.file;
+    char* cons=NULL;
+    int consLength=0;
+    for (int p=protoStart;p<=protoEnd;p++) {
+        string recId=alignId+":"+itoa(p);
+        FaRecord ind=alignIndex.record[recId];
+        char* buf=readSeqBuf(seqFile,ind.offset,ind.length);
+        if (p==protoStart) {
+            // the first row becomes the result buffer itself
+            cons=buf;
+            consLength=ind.length;
+        }
+        else {
+            int shared=(ind.length<consLength)?ind.length:consLength;
+            for (int i=0;i<shared;i++) if (buf[i]!='-') cons[i]=buf[i];
+            free(buf);
+        }
+    }
+    return cons;
+}
+
+// Writes rows protoStart..protoEnd of alignment alignId with cutStart
+// letters removed at the beginning and cutEnd letters removed at the
+// end.  The column positions of the cut are derived from the merged row
+// built by makeCons and are passed back through cutAlignStart and
+// cutAlignEnd.  Row p is written to out[p-1]; rows are
+// reverse-complemented when strand differs from the strand stored for
+// orgName.
+//
+// Returns the stored location of orgName with start advanced by
+// cutStart and end reduced by cutEnd.
+//
+// NOTE(review): out is indexed with p-1 although p starts at
+// protoStart; confirm this matches the arrays handed in by main.
+AlignLocation writeAlignSeqCutLetter(pfile out[],int protoStart,int protoEnd,string alignId,string orgName,char strand,int cutStart,int cutEnd,int& cutAlignStart,int& cutAlignEnd) {
+    FILE *seqFile=alignIndex.file;
+    AlignLocation orgLoc=alignMap[alignId].location[orgName];
+    AlignLocation alignLoc=alignMap[alignId].location["0"];
+    int firstCol=alignLoc.start-1;
+    int lastCol=alignLoc.end-1;
+    char* merged=makeCons(alignId,protoStart,protoEnd);
+    Range cut=calcCutRangeLetter(merged,firstCol,lastCol,cutStart,cutEnd);
+    int row=protoStart;
+    while (row<=protoEnd) {
+        FaRecord rec=alignIndex.record[alignId+":"+itoa(row)];
+        writeSeq(out[row-1],seqFile,rec.offset+cut.start,(cut.end-cut.start+1),strand==orgLoc.strand);
+        row++;
+    }
+    cutAlignStart=cut.start;
+    cutAlignEnd=cut.end;
+    free(merged);
+    orgLoc.start+=cutStart;
+    orgLoc.end-=cutEnd;
+    return orgLoc;
+}
+
+// Writes columns cutAlignStart..cutAlignEnd of rows protoStart..protoEnd
+// of alignment alignId; row p is written to out[p-1].  Rows are
+// reverse-complemented when strand differs from the strand stored for
+// orgName.
+//
+// Returns the stored location of orgName, with start advanced and end
+// reduced by the number of letters of the merged row (makeCons) that
+// fall before cutAlignStart and behind cutAlignEnd respectively.
+//
+// NOTE(review): out is indexed with p-1 although p starts at
+// protoStart; confirm this matches the arrays handed in by main.
+AlignLocation writeAlignSeqCutLetterAlign(pfile out[],int protoStart,int protoEnd,string alignId,string orgName,char strand,int cutAlignStart,int cutAlignEnd) {
+    FILE *seqFile=alignIndex.file;
+    AlignLocation orgLoc=alignMap[alignId].location[orgName];
+    AlignLocation alignLoc=alignMap[alignId].location["0"];
+    int firstCol=alignLoc.start-1;
+    int lastCol=alignLoc.end-1;
+    char* merged=makeCons(alignId,protoStart,protoEnd);
+    int row=protoStart;
+    while (row<=protoEnd) {
+        FaRecord rec=alignIndex.record[alignId+":"+itoa(row)];
+        writeSeq(out[row-1],seqFile,rec.offset+cutAlignStart,(cutAlignEnd-cutAlignStart+1),strand==orgLoc.strand);
+        row++;
+    }
+    int lettersBefore=revCalcCutStartLetter(merged,firstCol,lastCol,cutAlignStart);
+    int lettersAfter=revCalcCutEndLetter(merged,firstCol,lastCol,cutAlignEnd);
+    free(merged);
+    orgLoc.start+=lettersBefore;
+    orgLoc.end-=lettersAfter;
+    return orgLoc;
+}
+
+// One line of the generation script as parsed by loadCommand.  Which
+// fields are filled depends on the operation; the others keep whatever
+// loadCommand initialised them to.
+struct Command {
+ char operation; // first character of the line: 'g', 's', 'o', 'd' or 'e'
+ string orgName;
+ string seqName; // only read for 'g'
+ string alignId1; // read for 's', 'o' and 'd'
+ string alignId2; // read for 'o' and 'd'
+ int start; // only read for 'g'
+ int end; // only read for 'g'
+ int over1; // only read for 'o'
+ int over2; // only read for 'o'
+ char strand;
+};
+
+vector<Command> command;
+
+// Parses the generation script at path and appends one Command per
+// non-empty line to the global command vector.
+//
+// Line formats (selected by the first character):
+//   g <org> <seq> <start> <end> <strand>
+//   s <alignId> <org> <strand>
+//   o <alignId1> <alignId2> <org> <strand> <over1> <over2>
+//   d <alignId1> <alignId2> <org> <strand>
+//   e
+// Lines starting with any other character are stored with that
+// character as operation and empty/zero fields.
+void loadCommand(string path) {
+    char line[1000];
+    char orgName[100];
+    char seqName[100];
+    char alignId1[100];
+    char alignId2[100];
+    char operation;
+
+    FILE *in=openFile(path,"r");
+    while (fgets(line,sizeof(line),in)!=NULL) {
+        if (line[0]=='\0') continue;
+        Command com;
+        operation=' ';
+        // Reset the scratch strings for every line.  (The original wrote
+        // to index 100, one past the end of each 100-byte array, which
+        // neither cleared the strings nor stayed inside the arrays.)
+        orgName[0]='\0';
+        seqName[0]='\0';
+        alignId1[0]='\0';
+        alignId2[0]='\0';
+        com.start=0;
+        com.end=0;
+        com.strand='+';
+        com.over1=0;
+        com.over2=0;
+        sscanf(line,"%c ",&operation);
+        // %99s keeps every token inside its 100-byte buffer
+        if (operation=='g') {
+            sscanf(line,"%c %99s %99s %d %d %c",&operation,orgName,seqName,&com.start,&com.end,&com.strand);
+        }
+        else if (operation=='s') {
+            sscanf(line,"%c %99s %99s %c",&operation,alignId1,orgName,&com.strand);
+        }
+        else if (operation=='o') {
+            sscanf(line,"%c %99s %99s %99s %c %d %d",&operation,alignId1,alignId2,orgName,&com.strand,&com.over1,&com.over2);
+        }
+        else if (operation=='d') {
+            sscanf(line,"%c %99s %99s %99s %c",&operation,alignId1,alignId2,orgName,&com.strand);
+        }
+        // 'e' carries no arguments
+        com.operation=operation;
+        com.orgName=orgName;
+        com.seqName=seqName;
+        com.alignId1=alignId1;
+        com.alignId2=alignId2;
+        command.push_back(com);
+    }
+    fclose(in);
+}
+
+// Prints loc as "<org> <name> <start> <end> <strand>" without a
+// trailing separator or newline.
+void writeChunkLocation(FILE* blockChunk,AlignLocation loc) {
+    const char* orgText=loc.org.c_str();
+    const char* nameText=loc.name.c_str();
+    fprintf(blockChunk,"%s %s %d %d %c",orgText,nameText,loc.start,loc.end,loc.strand);
+}
+
+// Prints one block-chunk line: the locations stored in chunk for
+// org[0], org[1] and org[2], separated by single spaces and terminated
+// by a newline.
+void writeChunk(FILE* blockChunk,AlignMap chunk,string org[]) {
+    for (int k=0;k<3;k++) {
+        writeChunkLocation(blockChunk,chunk.location[org[k]]);
+        fputs(k<2?" ":"\n",blockChunk);
+    }
+}
+
+// Opens size temporary files for writing and stores them in
+// tmp[0..size-1]; file k is named outPath.<offset+k>.tmp.
+void openTmp(pfile tmp[],string outPath,int size,int offset) {
+    int k=0;
+    while (k<size) {
+        string tmpPath=outPath+"."+itoa(offset+k)+".tmp";
+        tmp[k]=openFile(tmpPath,"w");
+        k++;
+    }
+}
+
+// Closes the streams tmp[0..size-1].
+void closeTmp(pfile tmp[],int size) {
+    int k=0;
+    while (k<size) {
+        fclose(tmp[k]);
+        k++;
+    }
+}
+
+// Entry point: assembles the ancestor FASTA file.
+//
+// Options read here:
+//   -o  output FASTA file (also the prefix of the temporary files)
+//   -b  block chunk mapping file to write
+//   -g1 <name> <proto> <indexpath>  first genome
+//   -g2 <name> <proto> <indexpath>  second genome
+//   -c  alignment map file
+//   -a  alignment index path
+//   -i  generation script
+//
+// The script commands are executed in order.  Sequence data is collected
+// in one temporary file per prototype row; an 'e' command copies the
+// temporary files into the output as one FASTA block and starts the
+// next block.
+int main (int argc,char* argv[]) {
+
+    string org[3];
+    string ancOrg;
+
+    map<string,ppfile> outtmp;
+    map<string,string> other;
+    map<string,string> orgIndex;
+    map<string,int> proto;
+    map<string,int> protoStart;
+
+    AlignMap chunk;
+    string header;
+    int block=1;
+    int multi=0;
+    int start=0;
+    int end=0;
+    int ancProto=0;
+    int ancEnd=0;
+
+    int cutAlignStart=0;
+    int cutAlignEnd=0;
+
+    string outPath=getArg("-o",argc,argv);
+    FILE* out=openFile(outPath,"w");
+
+    FILE* blockChunk=openFile(getArg("-b",argc,argv),"w");
+
+    org[1]=getArg("-g1",argc,argv);
+    org[2]=getArg("-g2",argc,argv);
+
+    proto[org[1]]=atoi(getArgAt("-g1",2,argc,argv).c_str());
+    proto[org[2]]=atoi(getArgAt("-g2",2,argc,argv).c_str());
+
+    protoStart[org[1]]=1;
+    protoStart[org[2]]=proto[org[1]]+1;
+
+    ancProto=proto[org[1]]+proto[org[2]];
+
+    loadAlignMap(getArg("-c",argc,argv));
+
+    openAlignIndex(getArg("-a",argc,argv));
+
+    openGenomeIndex(getArgAt("-g1",1,argc,argv),getArgAt("-g1",2,argc,argv),getArgAt("-g1",3,argc,argv));
+    openGenomeIndex(getArgAt("-g2",1,argc,argv),getArgAt("-g2",2,argc,argv),getArgAt("-g2",3,argc,argv));
+
+    ancOrg=org[1]+"_"+org[2];
+    org[0]=ancOrg;
+
+    chunk.location[org[0]].org=org[0];
+    chunk.location[org[1]].org=org[1];
+    chunk.location[org[2]].org=org[2];
+
+    header=ancOrg+"-anc"+itoa(block);
+    chunk.location[org[0]].name=header;
+    chunk.location[org[0]].start=0;
+    chunk.location[org[0]].end=0;
+
+    other[org[1]]=org[2];
+    other[org[2]]=org[1];
+
+    orgIndex[org[1]]="1";
+    orgIndex[org[2]]="2";
+
+    const int n1=proto[org[1]];
+    const int n2=proto[org[2]];
+
+    // Standard containers instead of variable-length arrays (not part of
+    // C++); at least one slot so that &v[0] is always valid.
+    vector<pfile> tmp1(n1>0?n1:1);
+    vector<pfile> tmp2(n2>0?n2:1);
+    outtmp[org[1]]=&tmp1[0];
+    outtmp[org[2]]=&tmp2[0];
+
+    openTmp(outtmp[org[1]],outPath,n1,1);
+    openTmp(outtmp[org[2]],outPath,n2,n1+1);
+
+    loadCommand(getArg("-i",argc,argv));
+
+    // TODO: check and implement if necessary linking between s,d,o,g
+    // in the same block, currently only d & o is linked
+
+    for (size_t i=0;i<command.size();i++) {
+        Command com=command[i];
+
+        if (com.operation=='g') {
+            const string self=com.orgName;
+            const string peer=other[self];
+            multi=0;
+            chunk.location[org[0]].start=ancEnd+1;
+            ancEnd+=com.end-com.start+1;
+            chunk.location[org[0]].end=ancEnd;
+            chunk.location[self]=writeGenomeSeq(outtmp[self],self,proto[self],com.seqName,com.start,com.end,com.strand);
+            chunk.location[peer]=writeGenomeGap(outtmp[peer],peer,proto[peer],"-",com.start,com.end);
+            writeChunk(blockChunk,chunk,org);
+        }
+        else if (com.operation=='s') {
+            const string self=com.orgName;
+            const string peer=other[self];
+            multi=0;
+            chunk.location[org[0]].start=ancEnd+1;
+            ancEnd+=writeAlignSeq(outtmp[org[1]],n1,outtmp[org[2]],n2,com.alignId1,self,com.strand);
+            chunk.location[org[0]].end=ancEnd;
+            chunk.location[self]=alignMap[com.alignId1].location[self];
+            chunk.location[peer]=alignMap[com.alignId1].location[peer];
+            writeChunk(blockChunk,chunk,org);
+        }
+        else if (com.operation=='d') {
+            const string self=com.orgName;
+            const string peer=other[self];
+            if (multi==0) multi=1; else multi=2;
+
+            // align. 1 (only for the first command of a chain)
+            if (multi==1) {
+                chunk.location[org[0]].start=ancEnd+1;
+                ancEnd+=writeAlignSeq(outtmp[org[1]],n1,outtmp[org[2]],n2,com.alignId1,self,com.strand);
+                chunk.location[org[0]].end=ancEnd;
+                chunk.location[self]=alignMap[com.alignId1].location[self];
+                chunk.location[peer]=alignMap[com.alignId1].location[peer];
+                writeChunk(blockChunk,chunk,org);
+            }
+            // genome between
+            AlignLocation loc1=alignMap[com.alignId1].location[self];
+            AlignLocation loc2=alignMap[com.alignId2].location[self];
+            // TODO check possible overlap
+            if (com.strand=='+') {
+                // NOTE(review): the '-' branch uses end+1; "end-1" here may
+                // be a typo for "end+1" -- left unchanged, confirm.
+                start=loc1.end-1;
+                end=loc2.start-1;
+            }
+            else {
+                start=loc2.end+1;
+                end=loc1.start-1;
+            }
+            // TODO -- currently it is assumed that seqName in the 1st and 2nd align. are the same -- check it !!!
+            // see also the equivalent line below
+            if (start<end) {
+                chunk.location[org[0]].start=ancEnd+1;
+                ancEnd+=end-start+1;
+                chunk.location[org[0]].end=ancEnd;
+                chunk.location[self]=writeGenomeSeq(outtmp[self],self,proto[self],loc1.name,start,end,com.strand);
+                chunk.location[peer]=writeGenomeGap(outtmp[peer],peer,proto[peer],"-",start,end);
+                writeChunk(blockChunk,chunk,org);
+            }
+            else {
+                printf("Warning: No gap between alignments %s and %s in %s (%d to %d)\n",
+                    com.alignId1.c_str(),com.alignId2.c_str(),com.orgName.c_str(),start,end);
+            }
+            // align. 2
+            chunk.location[org[0]].start=ancEnd+1;
+            ancEnd+=writeAlignSeq(outtmp[org[1]],n1,outtmp[org[2]],n2,com.alignId2,self,com.strand);
+            chunk.location[org[0]].end=ancEnd;
+            chunk.location[self]=alignMap[com.alignId2].location[self];
+            chunk.location[peer]=alignMap[com.alignId2].location[peer];
+            writeChunk(blockChunk,chunk,org);
+        }
+        // overlapping
+        else if (com.operation=='o') {
+            const string self=com.orgName;
+            const string peer=other[self];
+            const bool plus=(com.strand=='+');
+            if (multi==0) multi=1; else multi=2;
+            // Overlap of the following command; 0 when this is the last
+            // command (the original read past the end of the vector).
+            const int nextOver1=(i+1<command.size())?command[i+1].over1:0;
+
+            // align. 1 (only for the first command of a chain): the overlap
+            // is cut from the end on '+' and from the start otherwise
+            if (multi==1) {
+                const int cutS=plus?0:com.over1;
+                const int cutE=plus?com.over1:0;
+                chunk.location[self]=writeAlignSeqCutLetter(outtmp[self],protoStart[self],proto[self],com.alignId1,self,com.strand,cutS,cutE,cutAlignStart,cutAlignEnd);
+                chunk.location[peer]=writeAlignSeqCutLetterAlign(outtmp[peer],protoStart[peer],proto[peer],com.alignId1,peer,com.strand,cutAlignStart,cutAlignEnd);
+                writeChunk(blockChunk,chunk,org);
+            }
+            // genome between
+            AlignLocation loc1=alignMap[com.alignId1].location[self];
+            AlignLocation loc2=alignMap[com.alignId2].location[self];
+            // TODO check possible overlap
+            if (plus) {
+                start=loc1.end-com.over1+1;
+                end=loc2.start+com.over2-1;
+            }
+            else {
+                start=loc2.end-com.over2+1;
+                end=loc1.start+com.over1-1;
+            }
+            // TODO -- see TODO above
+            if (start<end) {
+                chunk.location[self]=writeGenomeSeq(outtmp[self],self,proto[self],loc1.name,start,end,com.strand);
+                chunk.location[peer]=writeGenomeGap(outtmp[peer],peer,proto[peer],"-",start,end);
+                writeChunk(blockChunk,chunk,org);
+            }
+            else {
+                printf("Warning: No gap between alignments %s and %s in %s (%d to %d)\n",
+                    com.alignId1.c_str(),com.alignId2.c_str(),com.orgName.c_str(),start,end);
+            }
+            // align. 2: cut this command's overlap on one side and the next
+            // command's overlap on the other
+            {
+                const int cutS=plus?com.over2:nextOver1;
+                const int cutE=plus?nextOver1:com.over2;
+                chunk.location[self]=writeAlignSeqCutLetter(outtmp[self],protoStart[self],proto[self],com.alignId2,self,com.strand,cutS,cutE,cutAlignStart,cutAlignEnd);
+                chunk.location[peer]=writeAlignSeqCutLetterAlign(outtmp[peer],protoStart[peer],proto[peer],com.alignId2,peer,com.strand,cutAlignStart,cutAlignEnd);
+                writeChunk(blockChunk,chunk,org);
+            }
+        }
+        else if (com.operation=='e') {
+            // end of block: flush the temporary files into the output and
+            // start over with empty ones
+            multi=0;
+            closeTmp(outtmp[org[1]],n1);
+            closeTmp(outtmp[org[2]],n2);
+            for (int p=1;p<=n1;p++) appendSeq(out,header,outPath+"."+itoa(p)+".tmp");
+            for (int p=1;p<=n2;p++) appendSeq(out,header,outPath+"."+itoa(n1+p)+".tmp");
+            openTmp(outtmp[org[1]],outPath,n1,1);
+            openTmp(outtmp[org[2]],outPath,n2,n1+1);
+            block++;
+            header=ancOrg+"-anc"+itoa(block);
+            chunk.location[org[0]].name=header;
+            chunk.location[org[0]].start=0;
+            chunk.location[org[0]].end=0;
+            ancEnd=0;
+        }
+    }
+    closeTmp(outtmp[org[1]],n1);
+    closeTmp(outtmp[org[2]],n2);
+    fclose(out);
+    fclose(blockChunk);
+    return 0;
+}
diff --git a/src/ancseqrest.cpp b/src/ancseqrest.cpp
new file mode 100644
index 0000000..30cffcc
--- /dev/null
+++ b/src/ancseqrest.cpp
@@ -0,0 +1,324 @@
+/**
+ * @file
+ * Adds not aligned genome areas to ancestor FASTA file.
+ *
+ * Arguments:
+ *
+ * -b filename : block chunk mapping <br>
+ * -g genomeindex : genome index, it refers to 2 files: genomeindex.ind and genomeindex.seq <br>
+ * -n {1|2} : which genome is taken (1st or 2nd) from block chunk mapping <br>
+ * -p proto : number of original species in genome
+ * -o filename : ancestor fasta file, output sequence data to be appended here
+ *
+ * Block chunk mapping example:
+ *
+ * [TODO]
+ *
+ * Comment: [TODO].
+ *
+ *
+ * @author Mikhail Soloviev
+ * @date 23.05.2006
+ * @version 1.0
+ *
+ */
+
+#include <iostream>
+#include <string>
+#include <vector>
+#include <map>
+
+#include <stdlib.h>
+#include <stdio.h>
+
+using namespace std;
+
+#define fastaRowLength 50
+typedef char* pchar;
+
+pchar seqData[100];
+char seqStrand;
+
+// Returns the decimal text representation of i.
+string itoa(int i) {
+    char digits[20];
+    sprintf(digits,"%d",i);
+    string text=digits;
+    return text;
+}
+
+FILE* openFile(string path,char* mode) {
+ FILE *f=fopen(path.c_str(),mode);
+ if (f==NULL) {
+ fprintf(stderr,"ERROR: Failed open file: %s\n",path.c_str());
+ exit(1);
+ }
+ return f;
+}
+
+// Returns 1 when key occurs among the command line arguments, else 0.
+//
+// key is taken as const char* so that string literals can be passed
+// without the deprecated literal-to-char* conversion.
+int isArg(const char* key,int argc, char* argv[]) {
+    for (int i=0;i<argc;i++) {
+        if (strcmp(key,argv[i])==0) return 1;
+    }
+    return 0;
+}
+
+// Returns the argument that directly follows option key.  Prints an
+// error to stderr and terminates the program with status 1 when key is
+// missing or is the last argument.
+//
+// key is taken as const char* so that string literals can be passed
+// without the deprecated literal-to-char* conversion.
+string getArg(const char* key,int argc, char* argv[]) {
+    for (int i=0;i<argc;i++) {
+        if (strcmp(key,argv[i])==0 && i<argc-1) return argv[i+1];
+    }
+    fprintf(stderr,"ERROR: Parameter for option '%s' not specified\n",key);
+    exit(1);
+    return ""; // not reached; keeps compilers without noreturn knowledge quiet
+}
+
+// Returns the argument located index positions behind option key
+// (index 1 is the argument directly following key).  Prints an error to
+// stderr and terminates the program with status 1 when key is missing or
+// the requested position lies outside the argument list.
+//
+// key is taken as const char* so that string literals can be passed
+// without the deprecated literal-to-char* conversion.
+string getArgAt(const char* key,int index,int argc, char* argv[]) {
+    for (int i=0;i<argc;i++) {
+        // i+index>=0 guards against a negative index reaching before argv[0]
+        if (strcmp(key,argv[i])==0 && i+index>=0 && i<argc-index) return argv[i+index];
+    }
+    fprintf(stderr,"ERROR: Parameter for option '%s' not specified\n",key);
+    exit(1);
+    return ""; // not reached; keeps compilers without noreturn knowledge quiet
+}
+
+// Interval inside the current sequence as handed between nextRange and
+// writeSeqRest.  nextRange treats start/end as 1-based; a start of 0
+// (see noNext) signals that no further range exists.
+struct Range {
+ int start;
+ int end;
+ char strand;
+};
+
+// One "<genome> <name> <start> <end> <strand>" group of a block chunk
+// mapping line (see loadChunkMap).
+struct Location {
+ string genome;
+ string name; // sequence name/id
+ int start; // fillRange subtracts 1, i.e. positions are 1-based
+ int end;
+ char strand;
+};
+
+// One line of the block chunk mapping: the three location groups in
+// the order in which they appear on the line.  main selects
+// location[genomeNumber].
+struct ChunkMap {
+ //int blockId;
+ Location location[3];
+};
+
+vector<ChunkMap> chunkMap;
+
+// Loads the block chunk mapping file at path into the global chunkMap.
+//
+// Every line holds three groups of the form
+//   <genome> <name> <start> <end> <strand>
+// Lines that do not contain all 15 fields are skipped with a warning
+// instead of being stored with uninitialised values.
+void loadChunkMap(string path) {
+    char line[2000];
+    char genome0[1000];
+    char genome1[1000];
+    char genome2[1000];
+    char name0[1000];
+    char name1[1000];
+    char name2[1000];
+    FILE *in=openFile(path,"r");
+    while (fgets(line,sizeof(line),in)!=NULL) {
+        ChunkMap chunk;
+        // %999s keeps every token inside its 1000-byte buffer
+        int fields=sscanf(line,"%999s %999s %d %d %c %999s %999s %d %d %c %999s %999s %d %d %c",
+            genome0,name0,&chunk.location[0].start,&chunk.location[0].end,&chunk.location[0].strand,
+            genome1,name1,&chunk.location[1].start,&chunk.location[1].end,&chunk.location[1].strand,
+            genome2,name2,&chunk.location[2].start,&chunk.location[2].end,&chunk.location[2].strand);
+        if (fields!=15) {
+            // fields<=0 means a blank line; anything else is malformed
+            if (fields>0) fprintf(stderr,"WARNING: Skipping malformed chunk map line: %s",line);
+            continue;
+        }
+        chunk.location[0].genome=genome0;
+        chunk.location[1].genome=genome1;
+        chunk.location[2].genome=genome2;
+        chunk.location[0].name=name0;
+        chunk.location[1].name=name1;
+        chunk.location[2].name=name2;
+        chunkMap.push_back(chunk);
+    }
+    fclose(in);
+}
+
+// Writes one FASTA record per prototype row p = protoStart..protoEnd:
+// a ">header" line followed by positions start..end (1-based,
+// inclusive) of seqData[p], wrapped after fastaRowLength characters.
+void writeChunkSeq(FILE *out,string header,int start,int end,int protoStart,int protoEnd) {
+    const int first=start-1;
+    const int last=end-1;
+    for (int row=protoStart;row<=protoEnd;row++) {
+        fprintf(out,">%s\n",header.c_str());
+        int column=0;
+        for (int pos=first;pos<=last;pos++) {
+            fputc(seqData[row][pos],out);
+            if (++column==fastaRowLength) {
+                column=0;
+                fputc('\n',out);
+            }
+        }
+        // finish a partially filled last row
+        if (column>0) fputc('\n',out);
+    }
+}
+
+// Writes proto FASTA records that consist only of gap characters: each
+// is a ">header" line followed by end-start+1 '-' characters, wrapped
+// after fastaRowLength characters.
+void writeChunkGap(FILE *out,string header,int start,int end,int proto) {
+    const int first=start-1;
+    const int last=end-1;
+    for (int row=1;row<=proto;row++) {
+        fprintf(out,">%s\n",header.c_str());
+        int column=0;
+        for (int pos=first;pos<=last;pos++) {
+            fputc('-',out);
+            if (++column==fastaRowLength) {
+                column=0;
+                fputc('\n',out);
+            }
+        }
+        // finish a partially filled last row
+        if (column>0) fputc('\n',out);
+    }
+}
+
+// Sentinel returned by nextRange when no further range exists
+// (recognised by start==0).
+Range noNext={0,0,'+'};
+
+// Finds the next run of positions in seqData[1] that are not marked
+// with '*', searching behind the range prev.
+//
+// seqSize : number of valid characters in seqData[1]
+// prev    : previously returned range (1-based), or noNext to start at
+//           the beginning of the sequence
+//
+// Returns the run as a 1-based inclusive range with strand '+', or
+// noNext when the rest of the sequence is marked or exhausted.  The
+// scan never looks at or returns positions beyond seqSize (the original
+// read one character past the end and could return end==seqSize+1).
+Range nextRange(int seqSize,Range prev) {
+    // prev.end is 1-based, so as a 0-based index it is the first
+    // position behind prev
+    int first=prev.end;
+    while (first<seqSize && seqData[1][first]=='*') first++;
+    if (first>=seqSize) return noNext;
+
+    int last=first;
+    while (last+1<seqSize && seqData[1][last+1]!='*') last++;
+
+    Range next;
+    next.start=first+1;
+    next.end=last+1;
+    next.strand='+';
+    return next;
+}
+
+// Marks positions start..end (1-based, inclusive) of the rows
+// seqData[1..proto] with '*'.
+void fillRange(int start,int end,int proto) {
+    const int first=start-1;
+    const int last=end-1;
+    for (int row=1;row<=proto;row++) {
+        char* data=seqData[row];
+        for (int pos=first;pos<=last;pos++) {
+            data[pos]='*';
+        }
+    }
+}
+
+// Writes every range of the current sequence that nextRange still
+// reports (i.e. that was not marked by fillRange) as a new ancestor
+// block, then frees seqData[1..proto1+proto2].
+//
+// For each range, block is incremented and a record named
+// "<ancestor>-ancrest-<genomeNumber>-<block>" is written: the sequence
+// rows of the selected genome and gap rows for the other genome, in
+// genome order.  A matching line is appended to the chunk stream.
+void writeSeqRest(FILE *out,FILE *chunk,string ancestor,int seqSize,int& block,int genomeNumber,string descSeqName,int proto1,int proto2,string desc1,string desc2) {
+    Range range=noNext;
+    for (;;) {
+        range=nextRange(seqSize,range);
+        if (range.start==0) break;
+        block++;
+        string ancSeqName=ancestor+"-ancrest-"+itoa(genomeNumber)+"-"+itoa(block);
+        const int rangeLength=range.end-range.start+1;
+        if (genomeNumber!=1) {
+            writeChunkGap(out,ancSeqName,range.start,range.end,proto1);
+            writeChunkSeq(out,ancSeqName,range.start,range.end,1,proto2);
+            fprintf(chunk,"%s %s %d %d %c %s %s %d %d %c %s %s %d %d %c\n",
+                ancestor.c_str(),ancSeqName.c_str(),1,rangeLength,'+',
+                desc1.c_str(),"-",0,0,'+',
+                desc2.c_str(),descSeqName.c_str(),range.start,range.end,seqStrand);
+        }
+        else {
+            writeChunkSeq(out,ancSeqName,range.start,range.end,1,proto1);
+            writeChunkGap(out,ancSeqName,range.start,range.end,proto2);
+            fprintf(chunk,"%s %s %d %d %c %s %s %d %d %c %s %s %d %d %c\n",
+                ancestor.c_str(),ancSeqName.c_str(),1,rangeLength,'+',
+                desc1.c_str(),descSeqName.c_str(),range.start,range.end,seqStrand,
+                desc2.c_str(),"-",0,0,'+');
+        }
+    }
+    const int rows=proto1+proto2;
+    for (int row=1;row<=rows;row++) {
+        free(seqData[row]);
+    }
+}
+
+// One entry of a ".ind" index file (see readIndexRecord).
+struct FaRecord {
+ string id; // record id; readGenomeSeq looks records up as "<seqName>:<n>"
+ long offset; // passed to fseek on the ".seq" file by readSeqBuf
+ int length; // number of bytes read from the ".seq" file
+};
+
+// An opened sequence index: the ".seq" data stream plus the records
+// loaded from the ".ind" file (see openGenomeIndex).
+struct FaIndex {
+ string id;
+ FILE* file; // stream of the ".seq" file
+ map<string,FaRecord> record; // keyed by record id
+};
+
+// Reads one "<id> <offset> <length>" line from the index stream ind.
+//
+// Returns the parsed record.  At end of file, or when the line does not
+// contain all three fields, a record with an empty id and zero
+// offset/length is returned; callers skip records with an empty id.
+FaRecord readIndexRecord(FILE *ind) {
+    FaRecord record;
+    record.id="";
+    record.offset=0;
+    record.length=0;
+    char line[2000];
+    char id[200];
+    id[0]='\0';
+    if (fgets(line,sizeof(line),ind)!=NULL) {
+        // %199s keeps the id inside its 200-byte buffer
+        if (sscanf(line,"%199s %ld %d",id,&record.offset,&record.length)==3) {
+            record.id=id;
+        }
+        else {
+            // discard partially parsed numbers
+            record.offset=0;
+            record.length=0;
+        }
+    }
+    return record;
+}
+
+FaIndex genomeIndex;
+
+// Opens the genome data file genomePath+".seq" and loads all records of
+// genomePath+".ind" into the global genomeIndex.
+void openGenomeIndex(string genomePath) {
+    genomeIndex.file=openFile(genomePath+".seq","r+");
+    FILE *indexFile=openFile(genomePath+".ind","r");
+    for (;;) {
+        if (feof(indexFile)) break;
+        FaRecord rec=readIndexRecord(indexFile);
+        // records without an id are not stored
+        if (rec.id.size()==0) continue;
+        genomeIndex.record[rec.id]=rec;
+    }
+    fclose(indexFile);
+}
+
+// Reads length bytes starting at byte offset of seq into a freshly
+// malloc'ed buffer (not NUL-terminated) and returns it; the caller owns
+// the buffer and releases it with free().
+//
+// A non-positive length yields an empty, still free()-able buffer.
+// When fewer bytes than requested can be read, a warning is printed and
+// the rest of the buffer is filled with the gap character '-' instead of
+// being left uninitialised.  Exits when the buffer cannot be allocated.
+char* readSeqBuf(FILE *seq,long offset,int length) {
+    size_t want=(length>0)?(size_t)length:0;
+    // malloc(0) may return NULL, so always ask for at least one byte
+    char* buf=(char*)malloc(want>0?want:1);
+    if (buf==NULL) {
+        fprintf(stderr,"ERROR: Out of memory reading %d bytes of sequence\n",length);
+        exit(1);
+    }
+    size_t got=0;
+    if (fseek(seq,offset,SEEK_SET)==0) got=fread(buf,sizeof(char),want,seq);
+    if (got<want) {
+        fprintf(stderr,"WARNING: Short read at offset %ld: wanted %d bytes, got %lu\n",
+            offset,length,(unsigned long)got);
+        for (size_t k=got;k<want;k++) buf[k]='-';
+    }
+    return buf;
+}
+
+// Loads the records "<seqName>:1" .. "<seqName>:<proto>" from the genome
+// index into seqData[1..proto].  seqSize receives the length of the
+// record read last; it is left unchanged when proto < 1.
+void readGenomeSeq(string seqName,int& seqSize,int proto) {
+    FILE *seqFile=genomeIndex.file;
+    int row=1;
+    while (row<=proto) {
+        FaRecord rec=genomeIndex.record[seqName+":"+itoa(row)];
+        seqSize=rec.length;
+        seqData[row]=readSeqBuf(seqFile,rec.offset,rec.length);
+        row++;
+    }
+}
+
+// Entry point: appends the genome areas that are not covered by any
+// chunk of the block chunk mapping to the ancestor data.
+//
+// Options read here:
+//   -o  output FASTA file
+//   -c  chunk mapping file to write for the added blocks
+//   -b  block chunk mapping to read
+//   -g  genome index path (".ind"/".seq" are appended)
+//   -n  which genome of the mapping is processed: 1 or 2
+//   -p1 number of prototype rows of the first genome
+//   -p2 number of prototype rows of the second genome
+//   -a  ancestor name
+//   -d1 name of the first descendant
+//   -d2 name of the second descendant
+int main (int argc,char* argv[]) {
+
+    int block=0;
+    string seqName="";
+    string ancestor="";
+    string desc1="";
+    string desc2="";
+    int seqSize=0;
+    int proto=1;
+    int proto1=1;
+    int proto2=1;
+    int genomeNumber=1;
+    int first=1;
+
+    FILE* out=openFile(getArg("-o",argc,argv),"w");
+    FILE* chunk=openFile(getArg("-c",argc,argv),"w");
+    loadChunkMap(getArg("-b",argc,argv));
+    openGenomeIndex(getArg("-g",argc,argv));
+    genomeNumber=atoi(getArg("-n",argc,argv).c_str());
+    proto1=atoi(getArg("-p1",argc,argv).c_str());
+    proto2=atoi(getArg("-p2",argc,argv).c_str());
+    ancestor=getArg("-a",argc,argv);
+    desc1=getArg("-d1",argc,argv);
+    desc2=getArg("-d2",argc,argv);
+
+    // genomeNumber indexes ChunkMap::location; only 1 and 2 are genomes
+    if (genomeNumber!=1 && genomeNumber!=2) {
+        fprintf(stderr,"ERROR: Option '-n' must be 1 or 2\n");
+        exit(1);
+    }
+    // rows 1..proto1+proto2 of seqData are used and freed
+    const int seqDataSlots=(int)(sizeof(seqData)/sizeof(seqData[0]));
+    if (proto1<0 || proto2<0 || proto1+proto2>=seqDataSlots) {
+        fprintf(stderr,"ERROR: Options '-p1' and '-p2' must be non-negative and sum to less than %d\n",seqDataSlots);
+        exit(1);
+    }
+
+    proto=genomeNumber==1?proto1:proto2;
+
+    for (size_t i=0;i<chunkMap.size();i++) {
+        Location loc=chunkMap[i].location[genomeNumber];
+        seqStrand=loc.strand;
+        if (loc.name=="-") continue;
+        if (loc.name!=seqName) {
+            // a new sequence starts: flush what is left of the previous one
+            if (!first) writeSeqRest(out,chunk,ancestor,seqSize,block,genomeNumber,seqName,proto1,proto2,desc1,desc2);
+            else first=0;
+            seqName=loc.name;
+            seqStrand=loc.strand;
+            readGenomeSeq(seqName,seqSize,proto);
+        }
+        // mark the area covered by this chunk
+        fillRange(loc.start,loc.end,proto);
+    }
+    writeSeqRest(out,chunk,ancestor,seqSize,block,genomeNumber,seqName,proto1,proto2,desc1,desc2);
+
+    fclose(out);
+    fclose(chunk);
+    return 0;
+}
diff --git a/src/cutmfa.cpp b/src/cutmfa.cpp
new file mode 100644
index 0000000..dd41536
--- /dev/null
+++ b/src/cutmfa.cpp
@@ -0,0 +1,148 @@
+/**
+ * @file
+ * Cuts Multi-FASTA file into parts using coordinate ranges
+ * produced by supermap.
+ *
+ * Arguments:
+ *
+ * -i filename : input fasta file (containing only 1 sequence) <br>
+ * -o filename : output fasta file <br>
+ * -c filename : alignments' coordinate ranges (supermap output data) <br>
+ * -s number : take prototype organism sequences starting with number <br>
+ * -e number : take prototype organism sequences ending with number <br>
+ * -u number : which alignment coordinate range to use -- first or second,
+ * correspondingly number can be 1 or 2 <br>
+ * -g {0|1} : allow gaps <br>
+ * -m {0|1} : pass every output letter through mask() <br>
+ *
+ * Alignments' coordinate range example:
+ *
+ * mouse-ENm001 1 12433 rat-ENm001 400 28619 + (DM, 13 aligns) <br>
+ * mouse-ENm001 7001 14975 rat-ENm001 1 15303 + (M1, 1 aligns) <br>
+ * mouse-ENm001 12872 51014 rat-ENm001 6891 71164 + (DM, 106 aligns)
+ *
+ * Comment: Only the first 7 fields (up to and including the strand) are read, the rest can be anything.
+ *
+ * Resulting output example:
+ *
+ * >mouse-ENm001 <br>
+ * GGACTCGTCGCAGTGCCTTGT <br>
+ * TTTACTGTGCACTTCGCCTGG <br>
+ * ACTGTCTACGCCATGCTTGAT <br>
+ *
+ * Comment: FASTA header contains sequence name (mouse-ENm001).
+ *
+ * @author Mikhail Soloviev
+ * @date 05.04.2006
+ * @version 1.0
+ *
+ */
+
+#include <iostream>
+#include <string>
+#include <vector>
+#include <map>
+
+#include <stdlib.h>
+#include <stdio.h>
+
+using namespace std;
+
+// TODO refactor in classes and normal make project
+
+#include "util.cpp"
+#include "faindex.cpp"
+
+FaIndex faIndex;
+
+/**
+ * Writes seq[start..end] (1-based, inclusive) to `out` in forward
+ * orientation, breaking lines every fastaRowLength letters.
+ *
+ * @param gapped  when 0, '-' characters are skipped
+ * @param masked  when non-zero, each letter is passed through mask()
+ */
+void writeSeqDirect(FILE *out,char* seq,int start,int end,int gapped,int masked) {
+    int column=0;
+    for (int pos=start-1;pos<=end-1;pos++) {
+        if (!gapped && seq[pos]=='-') continue;
+        fputc(masked?mask(seq[pos]):seq[pos],out);
+        column++;
+        if (column==fastaRowLength) {
+            column=0;
+            fputc('\n',out);
+        }
+    }
+    // terminate a partially filled last row
+    if (column>0) fputc('\n',out);
+}
+
+/**
+ * Writes seq[start..end] (1-based, inclusive) to `out` from the last
+ * position to the first, passing each letter through comp(), and breaking
+ * lines every fastaRowLength letters.
+ *
+ * @param gapped  when 0, '-' characters are skipped
+ * @param masked  when non-zero, each complemented letter also goes through mask()
+ */
+void writeSeqRevComp(FILE *out,char* seq,int start,int end,int gapped,int masked) {
+    int column=0;
+    for (int pos=end-1;pos>=start-1;pos--) {
+        if (!gapped && seq[pos]=='-') continue;
+        fputc(masked?mask(comp(seq[pos])):comp(seq[pos]),out);
+        column++;
+        if (column==fastaRowLength) {
+            column=0;
+            fputc('\n',out);
+        }
+    }
+    // terminate a partially filled last row
+    if (column>0) fputc('\n',out);
+}
+
+/**
+ * Dispatches to writeSeqDirect() when `direct` is non-zero and to
+ * writeSeqRevComp() otherwise; all other arguments are forwarded unchanged.
+ */
+void writeSeq(FILE *out,char* seq,int start,int end,int direct,int gapped,int masked) {
+    if (!direct) {
+        writeSeqRevComp(out,seq,start,end,gapped,masked);
+        return;
+    }
+    writeSeqDirect(out,seq,start,end,gapped,masked);
+}
+
+/**
+ * Entry point. For every coordinate line of the -c file and every
+ * prototype number in [-s, -e], looks up "<name>:<n>" in the FASTA index
+ * given by -i and writes a FASTA record with the requested range to -o.
+ *
+ * Lines with fewer than 6 parsable fields, ids missing from the index and
+ * ranges outside the stored sequence are skipped with a message on stderr
+ * instead of being processed with stale or out-of-bounds data.
+ */
+int main (int argc,char* argv[]) {
+    char buf[bufSize];
+    char name[bufSize];
+    int start;
+    int end;
+    char name2[bufSize];
+    int start2;
+    int end2;
+    char strand;
+
+    int gapped=1;
+    int useOrg=1;
+    int protoStart=1;
+    int protoEnd=1;
+    int masked=0;
+
+    string id;
+    char* seq;
+
+    FILE *out=openFile(getArg("-o",argc,argv),"w");
+    FILE *in=openFile(getArg("-c",argc,argv),"r");
+
+    readFaIndex(faIndex,getArg("-i",argc,argv));
+    useOrg=atoi(getArg("-u",argc,argv).c_str());
+    gapped=atoi(getArg("-g",argc,argv).c_str());
+    protoStart=atoi(getArg("-s",argc,argv).c_str());
+    protoEnd=atoi(getArg("-e",argc,argv).c_str());
+    masked=atoi(getArg("-m",argc,argv).c_str());
+
+    while (!feof(in)) {
+        buf[0]='\0';
+        fgets(buf,bufSize,in);
+        if (strlen(buf)==0) continue;
+        // default strand for lines that stop after the 6 coordinate fields
+        strand='+';
+        int fields=sscanf(buf,"%s %d %d %s %d %d %c ",name,&start,&end,name2,&start2,&end2,&strand);
+        if (fields<=0) continue;    // blank line
+        if (fields<6) {
+            fprintf(stderr,"cutmfa: skipping malformed coordinate line\n");
+            continue;
+        }
+        if (useOrg==2) {
+            strcpy(name,name2);
+            start=start2;
+            end=end2;
+        }
+        for (int n=protoStart;n<=protoEnd;n++) {
+            id=name;
+            id=id+":"+itoa(n);
+            // find() does not insert an empty record for an unknown id
+            map<string,FaRecord>::iterator rec=faIndex.record.find(id);
+            if (rec==faIndex.record.end()) {
+                fprintf(stderr,"cutmfa: sequence %s not found in index\n",id.c_str());
+                continue;
+            }
+            if (start<1 || end>rec->second.length) {
+                fprintf(stderr,"cutmfa: range %d-%d outside sequence %s\n",start,end,id.c_str());
+                continue;
+            }
+            seq=getFaIndexSeq(faIndex,id);
+            if (seq==NULL) continue;
+            fprintf(out,">%s\n",name);
+            // NOTE(review): the forward writer is selected only for
+            // useOrg==2 with strand '-'; every other case is written
+            // reverse-complemented. Confirm this is intended.
+            writeSeq(out,seq,start,end,(useOrg==2 && strand=='-'),gapped,masked);
+            free(seq);
+        }
+    }
+    fclose(in);
+    fclose(out);
+    return 0;
+}
diff --git a/src/diagmatrix.c b/src/diagmatrix.c
new file mode 100644
index 0000000..ce2076c
--- /dev/null
+++ b/src/diagmatrix.c
@@ -0,0 +1,344 @@
+#ifndef __DIAGMATRIX_C
+#define __DIAGMATRIX_C
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <limits.h>
+#include "diagmatrix.h"
+
+#define MAX2(x,y) ( (x) >= (y) ? (x) : (y) )
+#define MIN2(x,y) ( (x) <= (y) ? (x) : (y) )
+
+alel dummy;
+
+#ifdef MULTIAL__FLAG
+extern int *freed, freedsize, freedcap;
+extern align *freedptr;
+#endif
+
+/*
+ * Allocates a diagonal matrix descriptor for a d1 x d2 problem.
+ *
+ * The four per-diagonal arrays get d1+d2+1 entries; diagindex, diagstart
+ * and diagend start at -1 and isneck at 0. The element, pointer and neck
+ * storage is left empty (NULL / -1); DMinitDiag() sets it up.
+ * Also initialises the shared out-of-range element `dummy`.
+ *
+ * Returns the new matrix, or 0 if any allocation fails (nothing is leaked).
+ */
+dmat* makeDM(int d1, int d2) {
+    int i, j;
+    int ndiag = d1+d2+1;
+    /* calloc so that every pointer field has a defined value */
+    dmat* trgt = (dmat*)calloc(1, sizeof(dmat));
+    if (!trgt)
+        return 0;
+    trgt->d1 = d1;
+    trgt->d2 = d2;
+
+    trgt->diagindex = (int*) calloc(ndiag, sizeof(int));
+    trgt->diagstart = (int*) calloc(ndiag, sizeof(int));
+    trgt->diagend = (int*) calloc(ndiag, sizeof(int));
+    trgt->isneck = (int*) calloc(ndiag, sizeof(int));
+    if (!trgt->diagindex || !trgt->diagstart || !trgt->diagend || !trgt->isneck) {
+        free(trgt->diagindex);
+        free(trgt->diagstart);
+        free(trgt->diagend);
+        free(trgt->isneck);
+        free(trgt);
+        return 0;
+    }
+    for (i=0; i < ndiag; i++) {
+        trgt->diagindex[i] = trgt->diagstart[i] = trgt->diagend[i] = -1;
+        trgt->isneck[i] = 0;
+    }
+    trgt->numelems = 0;
+    trgt->currdiag = 0;
+    trgt->currneck = 0;
+
+    /* same empty state DMinitDiag() establishes, so freeDM() is safe early */
+    trgt->myptrs = 0;
+    for (i=0; i < NACT; i++)
+        trgt->myelems[i] = 0;
+    for (i=0; i < 2; i++) {
+        for (j=0; j<3; j++)
+            trgt->myneck[i][j] = 0;
+        trgt->neckdiag[i] = -1;
+    }
+
+    dummy.M = dummy.N = dummy.O = INT_MIN+(1<<28);
+    return trgt;
+}
+
+
+/*
+ * Releases a matrix created by makeDM(): the active diagonals, the packed
+ * pointer array, the neck arrays and the per-diagonal index arrays.
+ * Passing 0 is a no-op.
+ *
+ * The align objects referenced from the neck arrays are NOT freed here
+ * (the freeAlign() call was already disabled in this function); only the
+ * arrays that hold the pointers are released.
+ */
+void freeDM(dmat* trgt) {
+    int i, j;
+
+    if (!trgt)
+        return;
+
+    for (i=0; i< NACT; i++) {
+        free (trgt->myelems[i]);
+    }
+    /* neck arrays are allocated by DMnextNecks() and owned by the matrix */
+    for (i=0; i < 2; i++) {
+        for (j=0; j<3; j++) {
+            free (trgt->myneck[i][j]);
+        }
+    }
+    free(trgt->myptrs);
+    free(trgt->diagindex);
+    free(trgt->diagstart);
+    free(trgt->diagend);
+    free(trgt->isneck);
+    free(trgt);
+
+}
+
+/*
+ * Records the band limits of every diagonal and partitions the diagonals
+ * into blocks separated by "necks".
+ *
+ * starts[i] / ends[i] give the first and last element of diagonal i; they
+ * are copied into diagstart / diagend, and diagindex[i] receives the
+ * running cell count j within the current block.
+ * A neck is declared at diagonal i when the previous width was not smaller
+ * than the current one (ko >= k) and `cond` holds for the next width kf.
+ * At a neck the cell count of the block just closed is stored in isneck[]
+ * of the previous neck (or, for the first block, used to size myptrs) and
+ * j restarts at k + ko.
+ * Finally the active diagonals and neck slots are reset and the total
+ * cell count is reported on stderr.
+ *
+ * NOTE(review): after the loop i == d1+d2, so starts[] and ends[] are read
+ * at index d1+d2 -- callers must supply at least d1+d2+1 entries.
+ */
+void DMinitDiag(dmat* trgt, int* starts, int* ends) {
+ int i, sav = 0;
+ long long int j = 0, ts = 0;
+ /* k = width of the current diagonal, ko = previous width, kf = next width */
+ int k = ends[1]-starts[1]+1, ko=-1, kf;
+ int ctr=0, cond=0;
+
+ for (i=1; i < trgt->d1+trgt->d2; i++) {
+ trgt->diagindex[i] = j;
+ trgt->diagstart[i] = starts[i];
+ trgt->diagend[i] = ends[i];
+ kf = (i == trgt->d1+trgt->d2-1)? -1 : ends[i+1]-starts[i+1]+1;
+
+ j += k;
+ /* ctr counts diagonals since the last neck */
+ cond = (k < kf) || (k <= kf && ctr >= 1000 && k <= 200);
+ if ((ko >= k) && cond) {
+ ctr = 0;
+ // printf("neck %d\n",i);
+
+ if (sav) {
+ trgt->isneck[sav] = j;
+ }
+ else {
+ /* first block: two 4-bit pointers are packed per byte */
+ trgt->myptrs = (char*) calloc (j/2+1, sizeof(char));
+ }
+ ts += j;
+ j = k + ko;
+ sav = i;
+ }
+ ctr++;
+ ko = k;
+ k = kf;
+ }
+ trgt->diagindex[i] = j;
+ trgt->diagstart[i] = starts[i];
+ trgt->diagend[i] = ends[i];
+ if (sav)
+ trgt->isneck[sav] = j;
+ else
+ trgt->myptrs = (char*) calloc (j/2+1, sizeof(char));
+ trgt->numelems = j;
+ trgt->currdiag = 0;
+ ts += j;
+ for (i=0; i < NACT; i++)
+ trgt->myelems[i] = 0;
+ for (i=0; i < 2; i++) {
+ for (j=0; j<3; j++)
+ trgt->myneck[i][j] = 0;
+ trgt->neckdiag[i] = -1;
+ }
+ fprintf(stderr,"Total size = %lld * 10^6\n", ts/1000000);
+}
+
+/*
+ * Returns the element array of diagonal dn and reports its width in *size
+ * and the (x, y) coordinates of its first element in *startx / *starty.
+ */
+alel* DMgetDiagStart(dmat* trgt, int dn, int* size, int* startx, int* starty) {
+    alel* cells = trgt->myelems[dn%NACT];
+    int first = trgt->diagstart[dn];
+
+    *size = trgt->diagend[dn] - first + 1;
+
+    if (dn >= trgt->d2) {
+        *startx = dn - trgt->d2 + first + 1;
+        *starty = trgt->d2 - first;
+    }
+    else {
+        *startx = first + 1;
+        *starty = dn - first;
+    }
+    return cells;
+}
+
+/*
+ * Returns the 4-bit back pointer stored for cell (x, y), or -1 when the
+ * cell lies outside the matrix or outside the band of its diagonal.
+ * Two pointers share one byte of myptrs: even slots use the high nibble,
+ * odd slots the low nibble.
+ */
+char DMgetPtr(dmat* trgt, int x, int y) {
+    int diag = x+y-1;
+    int pos = (diag < trgt->d2)? (x-1): trgt->d2-y;
+    int slot, packed;
+
+    if (diag <= 0 || diag >= trgt->d1+trgt->d2)
+        return -1;
+    if (pos < trgt->diagstart[diag] || pos > trgt->diagend[diag])
+        return -1;
+
+    slot = trgt->diagindex[diag] + pos-trgt->diagstart[diag];
+    packed = trgt->myptrs[slot >> 1];
+    if ((slot & 1) == 0)
+        packed = packed >> 4;
+    return packed & 0xf;
+}
+
+/*
+ * Stores the low 4 bits of ptr as the back pointer of cell (x, y).
+ * Even slots go to the high nibble of their byte, odd slots to the low
+ * nibble. Cells outside the matrix or band are reported on stderr and
+ * left untouched.
+ */
+void DMsetPtr(dmat* trgt, char ptr, int x, int y) {
+    int diag = x+y-1;
+    int pos = (diag < trgt->d2)? (x-1): trgt->d2-y;
+    int slot;
+    char* cell;
+
+    if (diag <= 0 || diag >= trgt->d1+trgt->d2 ||
+        pos < trgt->diagstart[diag] || pos > trgt->diagend[diag]){
+        fprintf(stderr,"range error!!!\n");
+        return;
+    }
+
+    slot = trgt->diagindex[diag] + pos-trgt->diagstart[diag];
+    cell = &trgt->myptrs[slot >> 1];
+    if (slot & 1)
+        *cell = (char)(*cell & 0xf0) | (char)(ptr & 0x0f);
+    else
+        *cell = (char)(*cell & 0x0f) | (char)(ptr << 4);
+
+}
+
+/*
+ * Returns the score element of cell (x, y), or the shared `dummy` element
+ * when the cell is outside the matrix or outside the band of its diagonal.
+ */
+alel* DMgetElem(dmat* trgt, int x, int y) {
+    register int diag = x+y-1;
+    register int pos = (diag < trgt->d2)? (x-1): trgt->d2-y;
+
+    if (diag <= 0 || diag >= trgt->d1+trgt->d2)
+        return &dummy;
+    if (pos < trgt->diagstart[diag] || pos > trgt->diagend[diag])
+        return &dummy;
+
+    return (trgt->myelems[diag % NACT] + pos-trgt->diagstart[diag]);
+}
+
+/*
+ * Like DMgetElem(), but when `prev` is a real element (not `dummy`) the
+ * result is simply prev + 1, skipping the address computation.
+ * Out-of-range cells still yield `dummy`.
+ */
+alel* DMgetElem2(dmat* trgt, int x, int y, alel* prev) {
+    register int diag = x+y-1;
+    register int pos = (diag < trgt->d2)? (x-1): trgt->d2-y;
+
+    if (diag <= 0 || diag >= trgt->d1+trgt->d2)
+        return &dummy;
+    if (pos < trgt->diagstart[diag] || pos > trgt->diagend[diag])
+        return &dummy;
+
+    if (prev == &dummy)
+        return (trgt->myelems[diag % NACT] + pos-trgt->diagstart[diag]);
+    return prev + 1;
+}
+
+/*
+ * Copies *tbi into the score element of cell (x, y) and stores ptr as the
+ * cell's back pointer.
+ *
+ * Uses the same cell addressing as DMgetElem()/DMsetPtr(): the element
+ * index on diagonals below d2 is x-1, and the back pointer goes through
+ * DMsetPtr() because myptrs packs two 4-bit pointers per byte.
+ * Cells outside the matrix or band print "Dummy" on stderr and are ignored.
+ */
+void DMsetElem(dmat* trgt, alel* tbi, int x, int y, char ptr) {
+    int dn = x+y-1;
+    int elem = (dn < trgt->d2)? (x-1): trgt->d2-y;
+    if (dn <= 0 || dn >= trgt->d1+trgt->d2 ||
+        elem < trgt->diagstart[dn] || elem > trgt->diagend[dn]) {
+        fprintf(stderr,"Dummy\n");
+        return;
+    }
+    *(trgt->myelems[dn%NACT]+elem-trgt->diagstart[dn]) = *tbi;
+    DMsetPtr(trgt, ptr, x, y);
+}
+
+/*
+ * Advances to the next diagonal: replaces the element array in the slot
+ * the new diagonal will use and, if the current diagonal is a neck,
+ * starts a fresh pointer block (filled with -1) whose index restarts at
+ * the previous diagonal.
+ *
+ * Returns non-zero when the new current diagonal is itself a neck.
+ */
+char DMnextDiag(dmat* trgt) {
+    int i;
+    int cur = trgt->currdiag;
+    int next = cur+1;
+    int width = trgt->diagend[next] - trgt->diagstart[next] + 1;
+
+    free(trgt->myelems[next%NACT]);
+    trgt->myelems[next%NACT] = (alel*) calloc(width, sizeof(alel));
+
+    if (trgt->isneck[cur]) {
+        int nbytes = (trgt->isneck[cur]+1)/2+1;
+        char* fresh = (char*) calloc (nbytes, sizeof(char));
+        for (i=0; i< nbytes; i++)
+            fresh[i] = -1;
+        free(trgt->myptrs);
+        trgt->myptrs = fresh;
+        /* the new block starts with the previous diagonal */
+        trgt->diagindex[cur-1] = 0;
+        trgt->diagindex[cur] = (trgt->diagend[cur-1] -
+                                trgt->diagstart[cur-1] + 1);
+    }
+
+    trgt->currdiag = next;
+    return trgt->isneck[next] != 0;
+}
+
+/*
+ * Subtracts norm from one score. The original value is kept whenever the
+ * subtraction would move it in the wrong direction (presumably a guard
+ * against wrap-around -- the intermediate is an int, as before).
+ */
+static long DMrenormScore(long val, int norm) {
+    int shifted = val - norm;
+    return (norm > 0)? MIN2(val, shifted): MAX2(val, shifted);
+}
+
+/*
+ * Starts a new neck at diagonal `diag`.
+ *
+ * 1. Frees every non-dirty alignment held in the neck slot about to be
+ *    reused and replaces its three pointer arrays with zeroed arrays
+ *    sized for diagonals diag and diag-1.
+ * 2. Takes norm = the largest M score on the current diagonal and
+ *    subtracts it from the M, N and O scores of the NACT active diagonals.
+ *
+ * Fix: the N and O scores are now rescaled against their own previous
+ * value; the "norm <= 0" branch used to compare against M.
+ *
+ * Returns norm.
+ */
+int DMnextNecks(dmat* trgt, int diag) {
+    int size = trgt->diagend[diag]-trgt->diagstart[diag]+1 +
+        trgt->diagend[diag-1]-trgt->diagstart[diag-1]+1;
+    int slot = trgt->currneck%2;
+    int olddiag = trgt->neckdiag[slot];
+    int prevsize = (olddiag>0)?trgt->diagend[olddiag]-trgt->diagstart[olddiag]+1 +
+        trgt->diagend[olddiag-1]-trgt->diagstart[olddiag-1]+1 : 0;
+    int i, j;
+    int norm=0;
+    alel* cell;
+
+    /* drop alignments of the recycled slot that nobody marked dirty */
+    for (i=0; i < prevsize; i++) {
+        for (j=0; j<3; j++) {
+            if ((trgt->myneck[slot][j])[i] &&
+                !(trgt->myneck[slot][j])[i]->dirty){
+                freeAlign(trgt->myneck[slot][j][i]);
+                trgt->myneck[slot][j][i] = 0;
+            }
+        }
+    }
+    for (j=0; j<3; j++) {
+        free (trgt->myneck[slot][j]);
+        trgt->myneck[slot][j] = (align**) calloc (size, sizeof (align*));
+        for (i=0; i< size; i++)
+            (trgt->myneck[slot][j])[i] = 0;
+    }
+    trgt->neckdiag[slot] = diag;
+
+    /* norm = best M score on the current diagonal */
+    size = trgt->diagend[trgt->currdiag] - trgt->diagstart[trgt->currdiag]+1;
+    norm = trgt->myelems[(trgt->currdiag)%NACT][0].M;
+    for (j=1; j<size; j++) {
+        norm = MAX2 (trgt->myelems[(trgt->currdiag)%NACT][j].M , norm);
+    }
+
+    /* rescale all three scores on every active diagonal */
+    for (i=0; i < NACT; i++) {
+        size = trgt->diagend[trgt->currdiag-i] - trgt->diagstart[trgt->currdiag-i]+1;
+        for (j=0; j<size; j++) {
+            cell = &trgt->myelems[(trgt->currdiag-i)%NACT][j];
+            cell->M = DMrenormScore(cell->M, norm);
+            cell->N = DMrenormScore(cell->N, norm);
+            cell->O = DMrenormScore(cell->O, norm);
+        }
+    }
+
+    trgt->currneck++;
+    return norm;
+}
+
+
+/*
+ * Returns alignment pointer number `which` stored for cell (x, y) in the
+ * neck slot selected by currneck%2, or 0 when the cell is out of range.
+ * The slot covers its neck diagonal followed by the diagonal before it;
+ * any other diagonal is reported on stderr and yields 0.
+ */
+align* DMgetNeck(dmat* trgt, int x, int y, int which) {
+    int dn = x + y - 1;
+    int elem = (dn < trgt->d2)? (x-1): trgt->d2-y;
+    int slot = trgt->currneck%2;
+    int skip;
+
+    if (dn <= 0 || dn >= trgt->d1+trgt->d2)
+        return 0;
+    if (elem < trgt->diagstart[dn] || elem > trgt->diagend[dn])
+        return 0;
+
+    if (trgt->neckdiag[slot] == dn)
+        return *(trgt->myneck[slot][which] + elem-trgt->diagstart[dn]);
+
+    if (trgt->neckdiag[slot] == dn+1) {
+        /* entries of the neck diagonal itself come first */
+        skip = trgt->diagend[dn+1]-trgt->diagstart[dn+1]+1;
+        return *(trgt->myneck[slot][which] + elem-trgt->diagstart[dn] + skip);
+    }
+
+    fprintf(stderr, "Some dumb error: %d/%d %d %d\n", dn, trgt->d1+trgt->d2-1, trgt->neckdiag[(trgt->currneck-1)%2], trgt->currneck);
+    return 0;
+}
+
+/*
+ * Stores myal as alignment pointer number `which` for cell (x, y) in the
+ * neck slot selected by (currneck-1)%2. The slot covers its neck diagonal
+ * followed by the diagonal before it; every failure is reported on
+ * stderr and nothing is stored.
+ */
+void DMsetNeck(dmat* trgt, align* myal, int x, int y, int which) {
+    int dn = x + y - 1;
+    int elem = (dn < trgt->d2)? (x-1): trgt->d2-y;
+    int skip;
+
+    if (dn <= 0 || dn >= trgt->d1+trgt->d2) {
+        fprintf(stderr, "setNeck failed at %d, %d\n", x,y);
+        return;
+    }
+    if (elem < trgt->diagstart[dn] || elem > trgt->diagend[dn]){
+        fprintf(stderr, "setNeck failed2 at %d, %d\n", x,y);
+        return;
+    }
+
+    if (trgt->neckdiag[(trgt->currneck-1)%2] == dn) {
+        *(trgt->myneck[(trgt->currneck-1)%2][which] + elem-trgt->diagstart[dn]) = myal;
+        return;
+    }
+    if (trgt->neckdiag[(trgt->currneck-1)%2] == dn+1) {
+        /* entries of the neck diagonal itself come first */
+        skip = trgt->diagend[dn+1]-trgt->diagstart[dn+1]+1;
+        *(trgt->myneck[(trgt->currneck-1)%2][which] + elem-trgt->diagstart[dn] + skip)=myal;
+        return;
+    }
+    fprintf(stderr, "Some dumb error2: %d %d %d\n", dn, trgt->neckdiag[(trgt->currneck)%2], trgt->currneck);
+}
+
+#endif
diff --git a/src/diagmatrix.h b/src/diagmatrix.h
new file mode 100644
index 0000000..9452a0a
--- /dev/null
+++ b/src/diagmatrix.h
@@ -0,0 +1,54 @@
+#ifndef __DIAGMATRIX_H
+#define __DIAGMATRIX_H
+
+#ifdef MULTIAL__FLAG
+#include "multial.h"
+#else
+#include "order.h"
+#endif
+
+#define Mmask 0x3
+#define Nmask 0x4
+#define Omask 0x8
+#define NACT 3
+
+/* One matrix cell: three scores named M, N and O. Out-of-range lookups
+ * return a shared dummy cell whose three scores are INT_MIN+(1<<28). */
+typedef struct AlignElement {
+ long int M;
+ long int N;
+ long int O;
+} alel;
+
+/* Banded matrix stored diagonal by diagonal; cell (x, y) lies on diagonal
+ * x+y-1 and valid diagonals are 1 .. d1+d2-1. */
+typedef struct diagmatrix {
+ int d1;
+ int d2;
+ int* diagindex; /* slot of each diagonal's first cell inside the packed myptrs block */
+ int* diagstart; /* the elem on which the "cross-section" starts*/
+ int* diagend; /* the elem on which the "cross-section" ends */
+ int* isneck; /* if so, give size of next block, 0 ow */
+ int numelems;
+ int elemsize;
+ char* myptrs; /* back pointers, two 4-bit values per byte */
+ alel* myelems[NACT]; /* NACT(3) diags active at a time, slot = diagonal % NACT */
+ int currdiag; /*current diagonal */
+ int rangelow;
+ int currneck; /* number of necks started so far; selects the neck slot (mod 2) */
+ align** myneck[2][3]; /* The past 2 necks, 3 ptrs for each */
+ int neckdiag[2]; /* diagonal number at which each stored neck was taken, -1 if none */
+} dmat;
+
+
+dmat* makeDM(int d1, int d2);
+void freeDM(dmat* trgt);
+void DMinitDiag(dmat* trgt, int* starts, int* ends);
+alel* DMgetElem(dmat* trgt, int x, int y);
+alel* DMgetElem2(dmat* trgt, int x, int y, alel* prev);
+char DMgetPtr(dmat* trgt, int x, int y);
+void DMsetPtr(dmat* trgt, char ptr, int x, int y);
+align* DMgetNeck(dmat* trgt, int x, int y, int which);
+void DMsetNeck(dmat* trgt, align* myal, int x, int y, int which);
+alel* DMgetDiagStart(dmat* trgt, int dn, int* size, int* startx, int* starty);
+void DMsetElem(dmat* trgt, alel* elem, int x, int y, char ptr);
+char DMnextDiag(dmat* trgt);
+int DMnextNecks(dmat* trgt, int diag);
+
+#endif
diff --git a/src/faindex.cpp b/src/faindex.cpp
new file mode 100644
index 0000000..6411b28
--- /dev/null
+++ b/src/faindex.cpp
@@ -0,0 +1,56 @@
+// One line of the ".ind" index file: where a sequence lives in the ".seq" file.
+struct FaRecord {
+ string id;   // record id (first field of the index line)
+ long offset; // byte offset passed to fseek() on the sequence file
+ int length;  // number of bytes read for this record
+};
+
+// In-memory FASTA index: the open ".seq" file plus its records keyed by id.
+struct FaIndex {
+ string id;
+ int proto;
+ FILE* file;                  // sequence file opened by readFaIndex()
+ map<string,FaRecord> record; // id -> location of the sequence in `file`
+};
+
+/**
+ * Reads one "<id> <offset> <length>" line from the index file.
+ *
+ * @param ind  open index file
+ * @return the parsed record; the id is empty (and offset/length are 0)
+ *         at end of file or when the line does not contain all three
+ *         fields. Ids are limited to 99 characters.
+ */
+FaRecord readIndexRecord(FILE *ind) {
+    FaRecord record;
+    record.id="";
+    record.offset=0;
+    record.length=0;
+    char line[1000];
+    char id[100];
+    if (fgets(line,sizeof(line),ind)==NULL) return record;
+    // %99s keeps an over-long id from overrunning id[100]
+    if (sscanf(line,"%99s %ld %d",id,&record.offset,&record.length)==3) {
+        record.id=id;
+    }
+    else {
+        // a partial match may have filled some fields; discard them
+        record.offset=0;
+        record.length=0;
+    }
+    return record;
+}
+
+/**
+ * Opens "<path>.seq" as the index's sequence file and loads every record
+ * with a non-empty id from "<path>.ind" into faIndex.record.
+ */
+void readFaIndex(FaIndex& faIndex,string path) {
+    faIndex.file=openFile(path+".seq","r+");
+    FILE *index=openFile(path+".ind","r");
+    while (!feof(index)) {
+        FaRecord entry=readIndexRecord(index);
+        if (entry.id.empty()) continue;
+        faIndex.record[entry.id]=entry;
+    }
+    fclose(index);
+}
+
+/**
+ * Returns the raw bytes of sequence `seqId` from the indexed file.
+ *
+ * @return malloc'ed buffer of record.length bytes (NOT NUL-terminated)
+ *         that the caller must free(); NULL only if the allocation fails.
+ *         An unknown id is treated as an empty record, as before, but is
+ *         no longer inserted into the index. Bytes that cannot be read
+ *         are set to 0.
+ */
+char* getFaIndexSeq(FaIndex& faIndex,string seqId) {
+    long offset=0;
+    int length=0;
+    // find() instead of operator[] so lookups never grow the index
+    map<string,FaRecord>::const_iterator hit=faIndex.record.find(seqId);
+    if (hit!=faIndex.record.end()) {
+        offset=hit->second.offset;
+        length=hit->second.length;
+    }
+    if (length<0) length=0;
+    char* seq=(char*)malloc((length>0?length:1)*sizeof(char));
+    if (seq==NULL) return NULL;
+    size_t got=0;
+    if (fseek(faIndex.file,offset,SEEK_SET)==0) {
+        got=fread(seq,sizeof(char),length,faIndex.file);
+    }
+    for (size_t i=got;i<(size_t)length;i++) seq[i]=0;
+    return seq;
+}
+
+/**
+ * Returns the bytes of record "<seqId>:<protoIndex>"; the lookup and the
+ * read are done by getFaIndexSeq(). The caller frees the buffer.
+ */
+char* getMFaIndexSeq(FaIndex& faIndex,string seqId,int protoIndex) {
+    char suffix[20];
+    sprintf(suffix,"%d",protoIndex);
+    return getFaIndexSeq(faIndex,seqId+":"+suffix);
+}
diff --git a/src/fchaos.c b/src/fchaos.c
new file mode 100644
index 0000000..3527a3e
--- /dev/null
+++ b/src/fchaos.c
@@ -0,0 +1,1254 @@
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+#include <math.h>
+#include <assert.h>
+#include "fchaos.h"
+#include "skiplist.h"
+#include "thrtrie.h"
+#include "global.h"
+#include "translate.h"
+#include "filebuffer.h"
+
+#define VER_NUM "0.932"
+#define BLOSUM_FILE "blosum62s.txt"
+#define BLOSUM_FILE_SIZE 24
+#define NUC_FILE "nucmatrix.txt"
+#define NUC_FILE_SIZE 6
+
+/* Every argument and every expansion is fully parenthesised so the macros
+ * can be used with compound arguments and inside larger expressions. */
+#define MAX2(x,y) ( (x) >= (y) ? (x) : (y) )
+#define MIN2(x,y) ( (x) <= (y) ? (x) : (y) )
+#define ABS(x) ( ((x) >= (0)) ? (x) : (-(x)) )
+#define WEQ2(x,y,a) (((x)==(a))? 0: ((y)==(a))? 1:-1)
+#define MIN(A,B) ( ((A)>(B)) ? (B) : (A) )
+#define MAX(A,B) ( ((A)>(B)) ? (A) : (B) )
+
+/* Payload of a skip-list entry: a word-hit list plus the index of one of
+ * its locations (used to index the list's per-location arrays). */
+typedef struct SeqMatch {
+ LList* myll;
+ int offset;
+} match;
+
+extern int indeces[256];
+
+
+void remElem(LList* tbf, int i);
+
+int verbose = 0;
+int wordlen = 10;
+int ndegen = 1;
+int cutoff = 25;
+int lookback = 20;
+int gapfreechunks = 0;
+int mgaplen = 5;
+int gappenc = -1;
+int gappeno = 0 ;
+int both = 0;
+int translated = 0;
+int s1start = 0;
+int s1end = 0;
+int s2start = 0;
+int s2end = 0;
+
+int extend = 0;
+int reScoreCutoff = 0;
+
+//int matchsco = 12;
+//int mismatchsco = -8;
+
+int gappenstart = -1500;
+int gappenext = -50;
+int dropcutoff = 1500;
+
+int substmatrix[256][256];
+
+
+hll* allhits = 0;
+sklst* mylist;
+int gapstart=20;
+int gapcont=1;
+char* alpha = "ATCGN";
+char* triealpha = "ATCG";
+char* protalpha = "PCMH[DE][KR][NQ][ST][ILV][FYW][AG]X*";
+char* prottriealpha = "PCMH[DE][KR][NQ][ST][ILV][FYW][AG]";
+char direction;
+
+FILE* pairfile = 0;
+
+
+/*
+ * Returns the upper-case complement of nucleotide c (A<->T, C<->G, N->N),
+ * accepting either case. Any other letter prints an error line on stdout
+ * and yields -1.
+ */
+char comp(char c) {
+    if (c == 'a' || c == 'A') return 'T';
+    if (c == 't' || c == 'T') return 'A';
+    if (c == 'c' || c == 'C') return 'G';
+    if (c == 'g' || c == 'G') return 'C';
+    if (c == 'n' || c == 'N') return 'N';
+    printf("ERROR, Bad letter to RC: %c\n",c);
+    return -1;
+}
+
+/*
+ * Reverse-complements the NUL-terminated string a in place: the two ends
+ * are swapped through comp() while walking inwards, and the middle letter
+ * of an odd-length string is complemented on its own.
+ */
+void revComplement(char* a) {
+    int length = strlen(a);
+    int lo = 0;
+    int hi = length-1;
+    char saved;
+
+    while (lo < hi) {
+        saved = a[lo];
+        a[lo] = comp(a[hi]);
+        a[hi] = comp(saved);
+        lo++;
+        hi--;
+    }
+    if (lo == hi)
+        a[lo] = comp(a[lo]);
+}
+
+/*
+ * Releases a sequence: its name, the buffer rptr points to, and the
+ * struct itself. `lets` is not freed separately. Passing 0 is a no-op.
+ */
+void freeSeq (seq* tbf) {
+    if (!tbf)
+        return;
+    free(tbf->name);
+    free(tbf->rptr);
+    free(tbf);
+}
+
+/*
+ * Frees one hit node together with its chain of gap-free chunks.
+ * The hit's `next` link is not followed.
+ */
+void freeHLL (hll* tbf) {
+    gfc *chunk;
+    gfc *after;
+    for (chunk = tbf->first; chunk; chunk = after) {
+        after = chunk->next;
+        free (chunk);
+    }
+    free (tbf);
+}
+
+/*
+ * Prints every hit of the list `res` on stdout and frees each node with
+ * freeHLL() once it has been printed, so the list is consumed.
+ *
+ * res    head of the hit list
+ * query  first sequence
+ * dbase  second sequence
+ * len    used in the '-' direction to report seq2 coordinates as
+ *        len - coordinate
+ *
+ * When s1start / s2start are set, coordinates are shifted back to the
+ * full sequence and query->lets / dbase->lets are temporarily pointed at
+ * rptr; both are restored before returning.
+ * With `verbose` each hit is followed by the alignment from global() /
+ * printalign(); with `gapfreechunks` by one line per chunk with non-zero
+ * length.
+ *
+ * NOTE(review): the align returned by global() is not released here --
+ * confirm whether printalign() owns it.
+ */
+void printHLL(hll* res, seq* query, seq* dbase, int len) {
+ hll* temp;
+ align* myal;
+ gfc* tmpgf;
+ int currx, curry;
+ char *qptr = query->lets, *dptr = dbase->lets;
+ if (direction == '+') {
+ while (res) {
+ if (s1start > 0) {
+ res->seq1start += (s1start-1);
+ res->seq1end += (s1start-1);
+ query->lets = query->rptr;
+ }
+ if (s2start > 0) {
+ res->seq2start += (s2start-1);
+ res->seq2end += (s2start-1);
+ dbase->lets = dbase->rptr;
+ }
+ printf("%s %d %d; %s %d %d; score = %f (%c)\n", query->name,
+ res->seq1start+1, res->seq1end+1,
+ dbase->name, res->seq2start+1, res->seq2end+1,
+ res->score,direction);
+ if (verbose) {
+ myal = global(query->lets, res->seq1start,
+ res->seq1end, dbase->lets, res->seq2start,
+ res->seq2end, gapstart, gapcont);
+ printalign(query->lets, res->seq1start,
+ res->seq1end, dbase->lets, res->seq2start,
+ res->seq2end, myal);
+ }
+ if (gapfreechunks) {
+ currx = res->seq1start+1;
+ curry = res->seq2start+1;
+ tmpgf = res->first;
+ while (tmpgf) {
+ if (tmpgf->length) {
+ printf ("%d %d %d %d\n", currx, curry, tmpgf->length, tmpgf->score);
+ currx += tmpgf->length;
+ curry += tmpgf->length;
+ }
+ tmpgf = tmpgf->next;
+ if (!tmpgf)
+ break;
+ /* a positive offset advances seq2, any other offset advances seq1 */
+ if (tmpgf->offset > 0) {
+ curry += tmpgf->offset;
+ }
+ else {
+ currx -= tmpgf->offset;
+ }
+ }
+ }
+ temp = res;
+ res = res->next;
+ freeHLL(temp);
+ }
+ }
+ else {
+ while (res) {
+ if (s1start > 0) {
+ res->seq1start += (s1start-1);
+ res->seq1end += (s1start-1);
+ query->lets = query->rptr;
+ }
+ /* NOTE(review): unlike the '+' branch, dbase->lets is not switched to rptr here */
+ if (s2start > 0) {
+ res->seq2start += (len-s2end);
+ res->seq2end += (len-s2end);
+ }
+
+ printf("%s %d %d; %s %d %d; score = %f (%c)\n", query->name,
+ res->seq1start+1, res->seq1end+1,
+ dbase->name, len-(res->seq2start), len - (res->seq2end),
+ res->score, direction);
+ if (verbose) {
+ myal = global(query->lets, res->seq1start,
+ res->seq1end, dbase->lets,
+ res->seq2start, res->seq2end, gapstart, gapcont);
+ printalign(query->lets, res->seq1start,
+ res->seq1end, dbase->lets,
+ res->seq2start, res->seq2end, myal);
+ }
+ if (gapfreechunks) {
+ currx = res->seq1start+1;
+ curry = len - res->seq2start;
+ tmpgf = res->first;
+ while (tmpgf) {
+ if (tmpgf->length) {
+ printf ("%d %d %d %d \n", currx, curry, tmpgf->length, tmpgf->score);
+ currx += tmpgf->length;
+ curry -= tmpgf->length;
+ }
+ tmpgf = tmpgf->next;
+ if (!tmpgf)
+ break;
+ if (tmpgf->offset < 0) {
+ currx -= tmpgf->offset;
+ }
+ else {
+ curry -= tmpgf->offset;
+ }
+ }
+ }
+ temp = res;
+ res = res->next;
+ freeHLL(temp);
+ }
+ }
+ /* undo the temporary lets redirection */
+ query->lets=qptr;
+ dbase->lets = dptr;
+}
+
+
+/* Debug helper: dumps every hit of the list on stderr, one per line. */
+void printList (hll *ptr){
+    for (; ptr; ptr = ptr->next)
+        fprintf (stderr, "(%d %d)=(%d %d) %f\n", ptr->seq1start, ptr->seq1end, ptr->seq2start, ptr->seq2end, ptr->score);
+}
+
+/*
+ * Ordering predicate for hits: non-zero when list1 goes before list2,
+ * i.e. it starts earlier in seq1 or, on equal starts, ends later.
+ */
+int compare (hll *list1, hll *list2){
+    if (list1->seq1start != list2->seq1start)
+        return list1->seq1start < list2->seq1start;
+    return list1->seq1end > list2->seq1end;
+}
+
+/*
+ * Merges two hit lists into one by relinking their nodes. At every step
+ * the head of list1 is taken when compare() prefers it over the head of
+ * list2, otherwise the head of list2. Returns the merged head.
+ */
+hll* merge2(hll* list1, hll* list2) {
+    hll *head = 0;
+    hll **tail = &head;
+    hll *node;
+
+    if (!list1) return list2;
+    if (!list2) return list1;
+
+    while (list1 || list2) {
+        if (list1 && (!list2 || compare (list1, list2))){
+            node = list1;
+            list1 = list1->next;
+        }
+        else {
+            node = list2;
+            list2 = list2->next;
+        }
+        /* append the chosen node to the output list */
+        *tail = node;
+        tail = &node->next;
+    }
+    *tail = 0;
+    return head;
+}
+
+/*
+ * Returns the node just before the midpoint of a non-empty list: one
+ * cursor advances two nodes for every single step of the result.
+ */
+hll* findmiddle(hll* mylist) {
+    hll* slow = mylist;
+    hll* fast;
+    for (fast = mylist->next; fast && fast->next; fast = fast->next->next)
+        slow = slow->next;
+    return slow;
+}
+
+/*
+ * Merge sort on a hit list: splits it after findmiddle(), sorts both
+ * halves recursively and joins them with merge2(). Returns the new head.
+ */
+hll* sortList(hll* mylist) {
+    hll* lastOfFirstHalf;
+    hll* secondHalf;
+
+    if (!mylist)
+        return mylist;
+    if (!mylist->next)
+        return mylist;
+
+    lastOfFirstHalf = findmiddle(mylist);
+    secondHalf = lastOfFirstHalf->next;
+    lastOfFirstHalf->next = 0;
+
+    mylist = sortList(mylist);
+    secondHalf = sortList(secondHalf);
+    return merge2(mylist,secondHalf);
+}
+
+/* Non-zero when the seq2 interval of s lies entirely inside that of f. */
+int duplicates(hll* f, hll* s) {
+    return !(s->seq2start < f->seq2start || s->seq2end > f->seq2end);
+}
+
+/*
+ * Removes hits that duplicate, or can be merged into, an earlier hit.
+ *
+ * Pass 1 scans, for every hit, the following hits while their seq2start
+ * is >= the hit's seq2end. The list is then sorted with sortList() and
+ * pass 2 scans while the following seq1start is <= the hit's seq1end.
+ * A follower is unlinked and freed when duplicates() or mergeOverlap()
+ * reports success. Returns the head of the sorted, pruned list.
+ */
+hll* removeDups(hll* allhits, seq* seq1, seq* seq2) {
+    hll *anchor, *keep, *cand;
+
+    for (anchor = allhits; anchor; anchor = anchor->next){
+        keep = anchor;
+        /* cand is always the node right after the last one kept */
+        while ((cand = keep->next) && (cand->seq2start >= anchor->seq2end)){
+            if (duplicates (anchor, cand) || mergeOverlap (anchor, cand, seq1, seq2)){
+                keep->next = cand->next;
+                freeHLL (cand);
+            }
+            else {
+                keep = cand;
+            }
+        }
+    }
+
+    allhits = sortList (allhits);
+    for (anchor = allhits; anchor; anchor = anchor->next){
+        keep = anchor;
+        while ((cand = keep->next) && (cand->seq1start <= anchor->seq1end)){
+            if (duplicates (anchor, cand) || mergeOverlap (anchor, cand, seq1, seq2)){
+                keep->next = cand->next;
+                freeHLL (cand);
+            }
+            else {
+                keep = cand;
+            }
+        }
+    }
+
+    return allhits;
+}
+
+
+/*
+ * Restricts a sequence buffer to the 1-based inclusive range
+ * [start, end]; a start <= 0 means "no restriction". The range is clamped
+ * to the letters actually read so the terminator is never written past
+ * the buffer. Returns the first letter of the range and updates *count.
+ */
+static char* clipSeqRange(char* letters, int* count, int start, int end) {
+    if (start <= 0)
+        return letters;
+    if (end > *count) end = *count;
+    if (end < 0) end = 0;
+    if (start > end+1) start = end+1;
+    letters[end] = 0;
+    *count = end-start+1;
+    return &letters[start-1];
+}
+
+/*
+ * Reads the next FASTA record from input.
+ *
+ * input   open FASTA file, positioned at a '>' header line
+ * seqnum  1 selects the s1start/s1end sub-range, anything else
+ *         s2start/s2end
+ *
+ * Letters are upper-cased; letters not in `alpha` become 'N' with a
+ * warning on stderr; whitespace is skipped. Reading stops at the next '>'
+ * (pushed back) or at end of file.
+ *
+ * Returns a new seq (name, rptr = full buffer, lets/numlets = selected
+ * range), or 0 when there is nothing left to read. Exits with status 1
+ * if the record does not start with '>' or memory runs out.
+ */
+seq* readfile(FILE* input, int seqnum) {
+    char* res;
+    char* grown;
+    char* eol;
+    int ressize = 1, numread=0;
+    char temp[256];
+    seq* myseq;
+    int currchar;   /* int so that EOF is distinct from every data byte */
+
+    if (feof(input))
+        return 0;
+    if (!fgets(temp, 255, input))
+        return 0;
+    if (temp[0] != '>') {
+        fprintf(stderr, "File is not in FASTA format!!\n");
+        exit(1);
+    }
+
+    /* allocate only once we know there is a record, so nothing leaks above */
+    res = (char*) malloc(sizeof(char));
+    myseq = (seq*) malloc(sizeof(seq));
+    if (!res || !myseq) {
+        fprintf(stderr, "Out of memory\n");
+        exit(1);
+    }
+    myseq->name = (char*) malloc((strlen(temp))*sizeof(char));
+    if (!myseq->name) {
+        fprintf(stderr, "Out of memory\n");
+        exit(1);
+    }
+    strcpy(myseq->name, temp+1);
+    eol = strchr(myseq->name, '\n');
+    if (eol) {
+        *eol = 0;
+    }
+    else {
+        /* header longer than the buffer: drop the rest of that line */
+        do {
+            currchar = fgetc(input);
+        } while (currchar != '\n' && currchar != EOF);
+    }
+
+    currchar = fgetc(input);
+    while ((currchar != '>') && (currchar != EOF)) {
+        if (!isspace(currchar)) {
+            currchar = toupper(currchar);
+            if (!strchr(alpha, currchar)) {
+                fprintf(stderr, "WARNING %c converted to N\n", currchar);
+                currchar = 'N';
+            }
+            res[numread++] = currchar;
+            if (numread >= ressize) {
+                grown = (char*)realloc(res, sizeof(char)*(ressize*=2));
+                if (!grown) {
+                    fprintf(stderr, "Out of memory\n");
+                    exit(1);
+                }
+                res = grown;
+            }
+        }
+        currchar = fgetc(input);
+    }
+    if (currchar == '>')
+        ungetc(currchar, input);
+    res[numread]=0;
+
+    myseq->rptr = res;
+    if (seqnum == 1)
+        res = clipSeqRange(res, &numread, s1start, s1end);
+    else
+        res = clipSeqRange(res, &numread, s2start, s2end);
+    myseq->lets = res;
+    myseq->numlets = numread;
+    return myseq;
+}
+
+/* Returns 1 if elem occurs among the first `size` entries of arr, else 0. */
+int isin (char* arr, int size, int elem) {
+    int i;
+    for (i = size-1; i >= 0; i--) {
+        if (arr[i] == elem)
+            return 1;
+    }
+    return 0;
+}
+
+/*
+ * Scores the extension of a chain from location off1 of `first` to
+ * location off2 of `second`.
+ *
+ * diff1 is supplied by the caller; diff2 is the distance between the two
+ * stored locations. Returns -1 when diff2 is not inside (0, lookback).
+ * Otherwise the result is a gap cost (abs(gap)*gappenc + gappeno) plus
+ * baseval times either the word score (wordlen - degleft, when both
+ * distances are at least wordlen) or the smaller of the two distances.
+ *
+ * TODO (kept from the original): take the positions of the degenerate
+ * letters (degloc) into account for overlapping words.
+ */
+int chain(LList* second, int off2, LList* first, int off1, int diff1, int gap, float baseval) {
+    int diff2 = second->myloc->locs[off2] - first->myloc->locs[off1];
+    int wordscore = wordlen-second->degleft;
+    int gapcost = abs(gap)*gappenc + gappeno;
+
+    if (diff2 <= 0 || diff2 >= lookback)
+        return -1;
+
+    if (diff1 >= wordlen && diff2 >= wordlen)
+        return wordscore*baseval+gapcost;
+
+    /* overlapping words: only the smaller distance counts */
+    return MIN2(diff1, diff2)*baseval+gapcost;
+}
+
+int tc =0;
+int wc = 0;
+
+/*
+ * Finds the best earlier hit that location `offset` of `curr` can be
+ * chained to, and initialises the per-location data of `curr`.
+ *
+ * curr      word-hit list being processed
+ * position  value stored for seq2 coordinates of this hit
+ * offset    index of the location inside curr->myloc->locs
+ * baseval   per-letter score weight passed on to chain()
+ *
+ * The skip list `mylist` is searched around the key
+ * position - locs[offset], within mgaplen on either side. Every entry in
+ * that window is scored with chain(); the entry with the highest
+ * accumulated score wins. curr->mysles[offset] remembers a skip-list
+ * position near the key. Entries flagged toberemoved are dropped with
+ * remElem() on the way.
+ *
+ * With a predecessor, curr inherits its start points and hit index lists
+ * and appends its own coordinates; without one it starts a new chain
+ * scored wordlen - degleft.
+ */
+inline void findPrev(LList* curr, int position, int offset, float baseval) {
+ int j,k;
+ LList* temp;
+ sle* iterator;
+ float bestscore = 0;
+ LList* bestelem = 0;
+ int bestoffset = -1;
+ int doneset = 0;
+ int tempscore, myscore = wordlen - curr->degleft;
+
+ /* tc counts calls, wc counts calls that found a predecessor */
+ tc++;
+ iterator = SLfind(mylist, position-curr->myloc->locs[offset]-mgaplen+1);
+ if (iterator) {
+ curr->mysles[offset] = iterator;
+ }
+ if (iterator &&
+ iterator->index <= position-curr->myloc->locs[offset]-mgaplen) {
+ iterator = iterator->next[0];
+ }
+
+ if (iterator && (iterator->index < position-curr->myloc->locs[offset])) {
+ curr->mysles[offset] = iterator;
+ }
+
+ while (iterator &&
+ (iterator->index < position-curr->myloc->locs[offset]+mgaplen)) {
+ if (iterator->next[0] && (iterator->index < position-curr->myloc->locs[offset]) &&
+ (iterator->next[0]->index >= position-curr->myloc->locs[offset])) {
+ curr->mysles[offset] = iterator;
+ }
+ temp = ((match*)iterator->myelem)->myll;
+ k = ((match*)iterator->myelem)->offset;
+ j = position-temp->location;
+ tempscore = chain(curr, offset, temp, k,j, iterator->index - position+curr->myloc->locs[offset], baseval);
+ if (tempscore > 0) {
+ if (temp->scores[k]+tempscore > bestscore) {
+ bestscore = temp->scores[k]+tempscore;
+ bestelem = temp;
+ bestoffset=k;
+ }
+ else {
+ /* NOTE(review): a chainable but non-best candidate gets its score reset to -1 */
+ temp->scores[k] = -1;
+ }
+ }
+ /* printf("it = %x next = %x\n", iterator, iterator->next[0]); */
+ /* advance before the current entry is possibly removed */
+ iterator = iterator->next[0];
+ if (temp->toberemoved[k]) {
+ remElem(temp, k);
+ temp->mysles[k] = 0;
+ }
+ }
+ if (bestelem) {
+ wc++;
+ curr->scores[offset] = bestscore;
+ /* printf("offs = %d, numlocs = %d\n",offset, curr->myloc->numlocs);*/
+ curr->seq1startpnt[offset] = bestelem->seq1startpnt[bestoffset];
+ curr->seq2startpnt[offset] = bestelem->seq2startpnt[bestoffset];
+ /* copy the predecessor's index lists and append this hit */
+ curr->myhits[offset].inds1 = (int*) malloc (sizeof(int)*(bestelem->myhits[bestoffset].numind+1));
+ curr->myhits[offset].inds2 = (int*) malloc (sizeof(int)*(bestelem->myhits[bestoffset].numind+1));
+ curr->myhits[offset].numind = bestelem->myhits[bestoffset].numind+1;
+
+ memcpy (curr->myhits[offset].inds2, bestelem->myhits[bestoffset].inds2,
+ bestelem->myhits[bestoffset].numind*sizeof(int));
+ memcpy (curr->myhits[offset].inds1, bestelem->myhits[bestoffset].inds1,
+ bestelem->myhits[bestoffset].numind*sizeof(int));
+ curr->myhits[offset].inds2[bestelem->myhits[bestoffset].numind] = position;
+ curr->myhits[offset].inds1[bestelem->myhits[bestoffset].numind] =
+ (int) curr->myloc->locs[offset];
+
+ }
+ else {
+ curr->scores[offset] = myscore;
+ curr->seq2startpnt[offset] = position;
+ curr->seq1startpnt[offset] = (int)curr->myloc->locs[offset];
+ curr->myhits[offset].inds1 = (int*) malloc (sizeof(int));
+ curr->myhits[offset].inds2 = (int*) malloc (sizeof(int));
+ curr->myhits[offset].inds2[0] = position;
+ curr->myhits[offset].inds1[0] = (int)curr->myloc->locs[offset];
+ curr->myhits[offset].numind = 1;
+ }
+}
+
+/*
+ * Allocates the per-location arrays of curr (one entry per location in
+ * curr->myloc), clears each entry and runs findPrev() on it.
+ * The end-point arrays are allocated but not initialised here.
+ */
+void connectToPrev(LList* curr, int index, float baseval) {
+    int slot;
+    int count = curr->myloc->numlocs;
+
+    curr->scores = (float*) malloc(sizeof(float) * count);
+    curr->myhits = (phits*) malloc(sizeof(phits) * count);
+    curr->toberemoved = (char*) malloc(sizeof(char) * count);
+    curr->seq1startpnt = (int*) malloc(sizeof(int) * count);
+    curr->seq2startpnt = (int*) malloc(sizeof(int) * count);
+    curr->seq1endpnt = (int*) malloc(sizeof(int) * count);
+    curr->seq2endpnt = (int*) malloc(sizeof(int) * count);
+    curr->mysles = (sle**) malloc(sizeof(sle*) * count);
+
+    slot = 0;
+    while (slot < curr->myloc->numlocs) {
+        curr->toberemoved[slot] = 0;
+        curr->myhits[slot].numind = 0;
+        curr->scores[slot] = 0;
+        curr->seq1startpnt[slot] = 0;
+        curr->seq2startpnt[slot] = 0;
+        curr->mysles[slot] = 0;
+        findPrev(curr,index,slot,baseval);
+        slot++;
+    }
+}
+
+/*
+ * Main scan: walks dbase->lets one position at a time, looks up the
+ * words at that position with getNextWords(), chains every location of
+ * every word to earlier hits (connectToPrev) and inserts one skip-list
+ * entry per location, keyed by location minus the stored seq1 location.
+ * Word lists that are `lookback` positions old are handed to
+ * savenfreeLList(); the remaining ones are flushed after the scan.
+ *
+ * root   root passed to getNextWords()
+ * query  first sequence (its numsiglets feeds the score weight)
+ * dbase  second sequence, scanned until its terminating 0
+ *
+ * Always returns 0.
+ */
+int doAlgo(TNode* root, seq* query, seq* dbase) {
+ char* currword = dbase->lets;
+ LList** LListArr = (LList**) malloc(sizeof(LList*) * dbase->numlets);
+ LList* temp;
+ match* mattemp;
+ int i = 0, j;
+ float bestscore=-1, baseval;
+ int bestqueryloc=-1, bestdbaseloc=-1, numhits;
+ while (*currword) {
+
+ if (!(i%10000)) {
+ // fprintf(stderr,"WORKING %d\n",i);
+ }
+ if (*currword == '.') {
+ /*TODO */
+ }
+ LListArr[i] = temp = getNextWords(root, currword++, ndegen);
+
+ /*****/
+ /* numhits = 1 + total number of locations over all words at this position */
+ numhits = 1;
+ while (temp){
+ numhits += temp->myloc->numlocs;
+ temp = temp->next;
+ }
+ /* per-letter weight: log(numsiglets / numhits) / wordlen */
+ baseval = (float) log ((double) query->numsiglets / (double) numhits) / (float) wordlen;
+ temp = LListArr[i];
+ /*****/
+
+ while (temp) {
+ temp->location = i-wordlen+1;
+ connectToPrev(temp, temp->location, baseval);
+ for (j = 0; j < temp->myloc->numlocs; j++) {
+ mattemp = (match*) malloc (sizeof(match));
+ mattemp->myll = temp;
+ mattemp->offset = j;
+ /* reuse the skip-list position findPrev() remembered, if any */
+ if (temp->mysles[j])
+ temp->mysles[j] = SLinsertAfter(mylist, temp->mysles[j], temp->location-(int)temp->myloc->locs[j], mattemp);
+ else
+ temp->mysles[j] = SLinsert(mylist, temp->location-(int)temp->myloc->locs[j], mattemp);
+ }
+ temp = temp->next;
+ }
+ if (i-lookback >= 0) {
+ LListArr[i-lookback] = savenfreeLList(LListArr[i-lookback], query, dbase);
+ }
+ i++;
+ }
+ /* flush the word lists still inside the lookback window */
+ j = (i-lookback>=0)?i-lookback:0;
+ for ( ; j < i; j++) {
+ LListArr[j] = savenfreeLList(LListArr[j], query,dbase);
+ }
+ cleanJobQueue();
+ free(LListArr);
+ // fprintf(stderr, "%d chained of %d\n", wc , tc);
+ return 0;
+}
+
+/*
+ * Returns the next non-whitespace character of file, or 0 when the end
+ * of the file (or a read error) is reached first.
+ *
+ * The character is held in an int so that EOF can be told apart from
+ * data and is never handed back as a letter.
+ */
+char getLetter (FILE *file){
+    int ch;
+
+    while ((ch = fgetc (file)) != EOF){
+        if (!isspace (ch)) return ch;
+    }
+    return 0;
+}
+
+/*
+ * Loads a substitution matrix from $LAGAN_DIR/<filename> into the global
+ * substmatrix.
+ *
+ * filename  matrix file name, relative to the LAGAN_DIR directory
+ * size      number of symbols; the file holds `size` symbol letters
+ *           followed by `size` rows of one letter plus `size` integers
+ *
+ * Two optional trailing integers are read into gappenstart and gappenext;
+ * afterwards gappenext is doubled and gappenstart is set to the same
+ * value, as before. Values missing from the file keep their previous
+ * setting.
+ *
+ * Exits with status 1 when LAGAN_DIR is unset, the path does not fit the
+ * buffer, the file cannot be opened or memory runs out.
+ */
+void readSubstMatrix (char *filename, int size){
+    FILE *file;
+    char line[1024], *symbs;
+    const char *dir = getenv ("LAGAN_DIR");
+    int i, j;
+
+    if (!dir) {
+        fprintf (stderr, "LAGAN_DIR environment variable is not set\n");
+        exit (1);
+    }
+    if (snprintf (line, sizeof (line), "%s/%s", dir, filename) >= (int) sizeof (line)) {
+        fprintf (stderr, "Path to %s is too long\n", filename);
+        exit (1);
+    }
+    file = fopen (line, "r");
+    if (!file) {
+        fprintf (stderr, "Cannot open %s\n", line);
+        exit (1);
+    }
+
+    for (i = 0; i < 256; i++){
+        for (j = 0; j < 256; j++){
+            substmatrix[i][j] = 0;
+        }
+    }
+
+    symbs = (char *) malloc (sizeof (char) * size);
+    if (!symbs) {
+        fprintf (stderr, "Out of memory\n");
+        exit (1);
+    }
+    for (i = 0; i < size; i++) symbs[i] = getLetter (file);
+    for (i = 0; i < size; i++){
+        getLetter (file);   /* skip the row label */
+        for (j = 0; j < size; j++){
+            fscanf (file, "%d", &(substmatrix[(unsigned char) symbs[i]][(unsigned char) symbs[j]]));
+        }
+    }
+    free (symbs);
+
+    /* optional in the file: a failed read leaves the value untouched */
+    fscanf (file, "%d", &gappenstart);
+    fscanf (file, "%d", &gappenext);
+    gappenstart = (gappenext *= 2);
+
+    fclose (file);
+}
+
+/* Returns the value following option argv[*i] and advances *i past it;
+   terminates with a message when the command line ends before the value. */
+static char* ppNextArg(int argc, char** argv, int* i) {
+  if (*i + 1 >= argc) {
+    fprintf(stderr, "option %s requires an argument\n", argv[*i]);
+    exit(1);
+  }
+  return argv[++(*i)];
+}
+
+/* True when arg equals either of the two accepted spellings of an option. */
+static int ppIsOpt(const char* arg, const char* lower, const char* upper) {
+  return !strcmp(arg, lower) || !strcmp(arg, upper);
+}
+
+/*
+ * Parses the options that follow the two file names (argv[3] onwards) into
+ * the program's global settings. Options are accepted in all-lowercase or
+ * all-uppercase form; unrecognised arguments are ignored. An option whose
+ * value is missing, or a pairs file that cannot be opened, ends the program.
+ */
+void paramParse(int argc, char** argv) {
+  int i;
+
+  for (i = 3; i < argc; i++) {
+    const char* opt = argv[i];
+
+    if (ppIsOpt(opt, "-p", "-P")) {
+      alpha = protalpha;
+      triealpha = prottriealpha;
+      wordlen = 4;
+      lookback = 8;
+      // dropcutoff = 50;
+      readSubstMatrix (BLOSUM_FILE, BLOSUM_FILE_SIZE);
+    }
+    else if (ppIsOpt(opt, "-v", "-V")) {
+      verbose = 1;
+    }
+    else if (ppIsOpt(opt, "-b", "-B")) {
+      both = 1;
+    }
+    else if (ppIsOpt(opt, "-t", "-T")) {
+      translated = 1;
+      triealpha = prottriealpha;
+      wordlen = 4;
+      mgaplen = 3;
+      lookback = 8;
+      // dropcutoff = 50;
+      readSubstMatrix (BLOSUM_FILE, BLOSUM_FILE_SIZE);
+    }
+    else if (ppIsOpt(opt, "-rsc", "-RSC")) {
+      reScoreCutoff = atoi(ppNextArg(argc, argv, &i));
+    }
+    else if (ppIsOpt(opt, "-gfc", "-GFC")) {
+      gapfreechunks = 1;
+    }
+    else if (ppIsOpt(opt, "-ext", "-EXT")) {
+      extend = 1;
+    }
+    else if (ppIsOpt(opt, "-wl", "-WL")) {
+      wordlen = atoi(ppNextArg(argc, argv, &i));
+    }
+    else if (ppIsOpt(opt, "-nd", "-ND")) {
+      ndegen = atoi(ppNextArg(argc, argv, &i));
+    }
+    else if (ppIsOpt(opt, "-co", "-CO")) {
+      cutoff = atoi(ppNextArg(argc, argv, &i));
+    }
+    else if (ppIsOpt(opt, "-lb", "-LB")) {
+      lookback = atoi(ppNextArg(argc, argv, &i));
+    }
+    else if (ppIsOpt(opt, "-gl", "-GL")) {
+      mgaplen = atoi(ppNextArg(argc, argv, &i));
+    }
+    else if (ppIsOpt(opt, "-gs", "-GS")) {
+      gappeno = atoi(ppNextArg(argc, argv, &i));
+    }
+    else if (ppIsOpt(opt, "-gc", "-GC")) {
+      gappenc = atoi(ppNextArg(argc, argv, &i));
+    }
+    else if (ppIsOpt(opt, "-s1", "-S1")) {
+      s1start = atoi(ppNextArg(argc, argv, &i));
+      s1end = atoi(ppNextArg(argc, argv, &i));
+    }
+    else if (ppIsOpt(opt, "-s2", "-S2")) {
+      s2start = atoi(ppNextArg(argc, argv, &i));
+      s2end = atoi(ppNextArg(argc, argv, &i));
+    }
+    else if (ppIsOpt(opt, "-pairs", "-PAIRS")) {
+      const char* path = ppNextArg(argc, argv, &i);
+      if (!(pairfile = fopen(path,"r"))) {
+        printf("couldnt open pairs file %s\n",path);
+        exit (2);
+      }
+    }
+  }
+
+  /* NOTE(review): this also runs after "-p", replacing the BLOSUM matrix that
+     option just loaded with the nucleotide one — confirm that is intended. */
+  if (!translated) readSubstMatrix (NUC_FILE, NUC_FILE_SIZE);
+
+}
+
+/* Prints the command-line synopsis and the list of options to stdout. */
+void usage() {
+  printf("usage: \nchaos queryfile dbasefile [options]\n\n");
+  printf("Options:\n");
+  printf("-p = Peptide sequence [default genomic]\n");
+  printf("-v = Verbose mode [default brief]\n");
+  printf("-b = Both strands [default forward-only]\n");
+  printf("-t = Translated [default off]\n");
+  printf("-ext = do BLAST-like extension with given cutoff [default off]\n");
+  printf("-wl # = Word Length [default 10 for genomic, 4 for peptide]\n");
+  printf("-nd # = Number of Degeneracy [default 1 for genomic, 0 for peptide]\n");
+  printf("-co # = score CutOff [default 25]\n");
+  printf("-rsc # = Rescoring cutoff [default 0]\n");
+  printf("-lb # = LookBack distance [default 20 for genomic, 8 for peptide]\n");
+  printf("-gl # = maximum Gap Length [default 5 for genomic, 3 for peptide]\n");
+  printf("-gs # = Gap Start penalty [default 0]\n");
+  printf("-gc # = Gap Continue penalty [default -1]\n");
+  printf("-s1 # # = use the given substring of the query [default whole]\n");
+  printf("-s2 # # = use the given substring of the dbase [default whole]\n");
+  printf("-pairs pairfile = read \"-s1 # # -s2 # #\" from pairfile [default off]\n\t[This is not fully functional!!!]\n");
+  printf("-version = prints the version of this CHAOS\n");
+}
+
+void rc(seq* dbase) { /* Passes dbase->lets to revComplement(); main calls this before its '-' strand pass, so the buffer is presumably reverse-complemented in place (confirm in revComplement). */
+ revComplement(dbase->lets);
+}
+
+
+int paircnt = 0;
+
+char savs[2];
+int savlocs[2] = {-1,-1};
+
+/*
+ * Advances to the next usable "-s1 a b -s2 c d" record of the global
+ * `pairfile` and narrows currquery/currdbase to those 1-based, inclusive
+ * windows of their underlying buffers (rptr).
+ *
+ * A window is cut by overwriting the character just past it with a NUL; the
+ * overwritten characters are remembered in savs/savlocs and put back at the
+ * start of the next call. When no further record can be read, pairfile is
+ * closed and set to 0, which callers use as the end-of-pairs signal.
+ */
+void procPairs(seq* currquery, seq* currdbase) {
+  /* Undo the cut made by the previous call. */
+  if (savlocs[0]>=0)
+    currquery->rptr[savlocs[0]] = savs[0];
+  if (savlocs[1]>=0)
+    currdbase->rptr[savlocs[1]] = savs[1];
+
+  do {
+    if (fscanf(pairfile, "-s1 %d %d -s2 %d %d\n", &s1start, &s1end, &s2start, &s2end) < 4) {
+      fclose(pairfile);   /* release the stream instead of just dropping the pointer */
+      pairfile = 0;
+      return;
+    }
+    currquery->numlets = s1end-s1start+1;
+    currdbase->numlets = s2end-s2start+1;
+    /* NOTE(review): a record is skipped only when BOTH windows are shorter
+       than wordlen+1; confirm "&&" rather than "||" is intended. */
+  } while (currquery->numlets < wordlen+1 && currdbase->numlets < wordlen+1);
+
+  /* NOTE(review): the window bounds are not validated against the length of
+     the sequences — a bad record indexes outside rptr. */
+  savlocs[0] = s1end;
+  savs[0] = currquery->rptr[s1end];
+  currquery->rptr[s1end] = 0;
+  currquery->lets = &(currquery->rptr[s1start-1]);
+  currquery->numlets = s1end-s1start+1;
+
+  savlocs[1] = s2end;
+  savs[1] = currdbase->rptr[s2end];
+  currdbase->rptr[s2end] = 0;
+  currdbase->lets = &(currdbase->rptr[s2start-1]);
+  currdbase->numlets = s2end-s2start+1;
+
+  paircnt++;
+  if (paircnt%20 ==19)
+    fprintf(stderr, "done with %d\n", paircnt);
+}
+
+/*
+ * Rewrites the start/end coordinates of every hit in the list `myhits`
+ * according to the reading frames frseq1 and frseq2 (0..5).
+ *
+ * Frames 0-2: coordinate*3 + frame.
+ * Frames 3-5: start and end trade places and are measured back from the
+ * sequence length: (len - coordinate)*3 + frame%3.
+ */
+void transloc(hll* myhits, int frseq1, int frseq2, int seq1len, int seq2len) {
+  hll* hit;
+  int newend;
+
+  for (hit = myhits; hit; hit = hit->next) {
+    /* first sequence */
+    if (frseq1 > 2) {
+      newend = (seq1len - hit->seq1start)*3 + frseq1%3;
+      hit->seq1start = (seq1len - hit->seq1end)*3 + frseq1%3;
+      hit->seq1end = newend;
+    }
+    else {
+      hit->seq1start = hit->seq1start*3 + frseq1;
+      hit->seq1end = hit->seq1end*3 + frseq1;
+    }
+
+    /* second sequence */
+    if (frseq2 > 2) {
+      newend = (seq2len - hit->seq2start)*3 + frseq2%3;
+      hit->seq2start = (seq2len - hit->seq2end)*3 + frseq2%3;
+      hit->seq2end = newend;
+    }
+    else {
+      hit->seq2start = hit->seq2start*3 + frseq2;
+      hit->seq2end = hit->seq2end*3 + frseq2;
+    }
+  }
+}
+
+/*
+ * Translated-mode driver: builds a trie for each of the six transSeq() frames
+ * of the query and runs doAlgo() against the frames of every dbase sequence.
+ * With direction '+', frames 0-2 are paired with 0-2 and 3-5 with 3-5; when
+ * `both` is set, a second pass with direction '-' pairs 0-2 with 3-5 and
+ * vice versa. In pairs mode the whole procedure repeats for each record
+ * delivered by procPairs(). Closes both file buffers before returning.
+ */
+void doTranslated(FileBuffer query, FileBuffer dbase) {
+  seq *currquery, *currdbase, *temp = 0;
+  seq *queryframes[6], *dbaseframes[6];
+  TNode *roots[6];
+  int i, j;
+  currquery = FileRead(query, s1start, s1end, VER_FCHAOS);
+  currdbase = FileRead(dbase, s2start, s2end, VER_FCHAOS);
+
+  /* FileRead returns 0 for an input that holds no sequence; everything below
+     would dereference that pointer. */
+  if (!currquery || !currdbase) {
+    fprintf(stderr, "no sequence found in %s\n", (currquery ? dbase : query)->filename);
+    FileClose (query);
+    FileClose (dbase);
+    return;
+  }
+
+  if (pairfile) {
+    procPairs(currquery, currdbase);
+    if (!pairfile) {
+      FileClose (query);
+      FileClose (dbase);
+      return;
+    }
+  }
+  do {
+    for (i = 0; i < 6; i++) {
+      queryframes[i] = transSeq(currquery,i);
+      roots[i] = makeTrie(wordlen, triealpha);
+      insertString(roots[i],queryframes[i]->lets);
+    }
+    mylist = makeSkLst();
+
+    while (currdbase) {
+      /* NOTE(review): the six dbase frames are never released here — confirm
+         whether freeSeq() may be applied to them once they have been used. */
+      for (i = 0; i < 6; i++) {
+        dbaseframes[i] = transSeq(currdbase,i);
+      }
+      direction = '+';
+      for (i=0; i < 6; i++)
+        for (j=(i/3)*3; j < (i/3+1)*3; j++) {
+          // fprintf(stderr, "1DOING FRAME %d AGAINST %d\n",i,j);
+          doAlgo(roots[i], queryframes[i], dbaseframes[j]);
+          /****/
+          allhits = removeDups(allhits, queryframes[i], dbaseframes[j]);
+          transloc(allhits, i, j, queryframes[i]->numlets, dbaseframes[j]->numlets);
+          printHLL(allhits, queryframes[i], dbaseframes[j], currdbase->numlets);
+          allhits = 0;
+        }
+      if (both) {
+        direction = '-';
+        for (i=0; i < 6; i++)
+          for (j=(i>2)?0:3; j < ((i>2)?3:6); j++) {
+            // fprintf(stderr, "2DOING FRAME %d AGAINST %d\n",i,j);
+            doAlgo(roots[i], queryframes[i], dbaseframes[j]);
+            /****/
+            allhits = removeDups(allhits, queryframes[i], dbaseframes[j]);
+            transloc(allhits, i, j, queryframes[i]->numlets, dbaseframes[j]->numlets);
+            printHLL(allhits, queryframes[i], dbaseframes[j], currdbase->numlets);
+            allhits = 0;
+          }
+      }
+      /* Pairs mode keeps the sequence for the next window; otherwise it is
+         released and never referenced again. */
+      if (pairfile)
+        temp = currdbase;
+      else
+        freeSeq(currdbase);
+      currdbase = FileRead(dbase, s2start, s2end, VER_FCHAOS);
+    }
+    currdbase = temp;
+    if (pairfile) {
+      procPairs(currquery, currdbase);
+      for (i=0; i < 6; i++) {
+        freeSeq(queryframes[i]);
+        freeTrie(roots[i]);
+      }
+    }
+  } while (pairfile);
+
+  FileClose (query);
+  FileClose (dbase);
+}
+
+/*
+ * Entry point: chaos queryfile dbasefile [options].
+ *
+ * Returns 1 after printing the usage text when too few arguments are given,
+ * 2 when an input file cannot be opened or holds no sequence, 0 otherwise.
+ * "-version" as the only argument prints the version and exits with 0.
+ */
+int main(int argc, char** argv) {
+  FileBuffer query;
+  FileBuffer dbase;
+
+  seq *currquery, *currdbase, *temp = 0;
+  TNode* root;
+
+  if (argc < 3) {
+    if (argc == 2)
+      if (!strcmp(argv[1], "-version") || !strcmp(argv[1], "-Version")) {
+        printf("CHAOS version %s\n", VER_NUM);
+        exit(0);
+      }
+    usage();
+    return 1;
+  }
+  if (!(query = FileOpen(argv[1]))) {
+    printf("couldnt open query file %s\n",argv[1]);
+    usage();
+    return 2;
+  }
+  if (!(dbase = FileOpen(argv[2]))) {
+    printf("couldnt open dbase file %s\n",argv[2]);
+    usage();
+    return 2;
+  }
+  paramParse(argc, argv);
+  initLib();
+
+  if (translated) {
+    doTranslated(query, dbase);
+    return 0;
+  }
+
+  currquery = FileRead(query, s1start, s1end, VER_FCHAOS);
+  currdbase = FileRead(dbase, s2start, s2end, VER_FCHAOS);
+
+  /* FileRead returns 0 for an input that holds no sequence; everything below
+     would dereference that pointer. */
+  if (!currquery || !currdbase) {
+    fprintf(stderr, "no sequence found in %s\n", currquery ? argv[2] : argv[1]);
+    FileClose (query);
+    FileClose (dbase);
+    return 2;
+  }
+
+  if (pairfile) {
+    procPairs(currquery, currdbase);
+    if (!pairfile) {
+      FileClose (query);
+      FileClose (dbase);
+      return 0;
+    }
+  }
+
+  do {
+    root = makeTrie(wordlen, triealpha);
+    mylist = makeSkLst();
+    insertString(root,currquery->lets);
+
+    while (currdbase) {
+      direction = '+';
+      doAlgo(root, currquery, currdbase);
+      /***/
+      allhits = removeDups(allhits, currquery, currdbase);
+      printHLL(allhits, currquery, currdbase, currdbase->numlets);
+      allhits = 0;
+      if (both) {
+        direction = '-';
+        /* NOTE(review): in pairs mode currdbase is reused for the next
+           window, but rc() is not undone after this pass — confirm. */
+        rc(currdbase);
+        doAlgo(root, currquery, currdbase);
+        /****/
+        allhits = removeDups(allhits, currquery, currdbase);
+        printHLL(allhits, currquery, currdbase, currdbase->numlets);
+        allhits = 0;
+      }
+      /* Pairs mode keeps the sequence for the next window; otherwise it is
+         released and never referenced again. */
+      if (pairfile)
+        temp = currdbase;
+      else
+        freeSeq(currdbase);
+      currdbase = FileRead(dbase, s2start, s2end, VER_FCHAOS);
+    }
+    currdbase = temp;
+    if (pairfile) {
+      procPairs(currquery, currdbase);
+      freeTrie(root);
+    }
+  } while (pairfile);
+
+  FileClose (query);
+  FileClose (dbase);
+  return 0;
+
+}
+
+/*
+ * Records entry `index` of `final` as a new hit at the head of the global
+ * `allhits` list. The hit takes the score and the four end points from
+ * `final`, and `first`/`last` as its list of gap-free chunks.
+ */
+void saveScore(LList* final, int index, gfc* first, gfc* last) {
+
+  hll* myhit = (hll*) malloc(sizeof(hll));
+  assert (myhit);
+
+  myhit->score = final->scores[index];
+  myhit->seq1start = final->seq1startpnt[index];
+  myhit->seq2start = final->seq2startpnt[index];
+  myhit->seq1end = final->seq1endpnt[index];
+  myhit->seq2end = final->seq2endpnt[index];
+  myhit->first = first;
+  myhit->last = last;
+  /* malloc leaves this field indeterminate; zero it so that any later read
+     is at least deterministic. */
+  myhit->dirty = 0;
+  myhit->next = allhits;
+  allhits = myhit;
+}
+
+void remElem(LList* tbf, int i) { /* Frees the payload (myelem) of the skip-list element recorded for location i of tbf, then removes that element from the global list `mylist`. tbf->mysles[i] must be non-null. */
+ free(tbf->mysles[i]->myelem);
+ SLremove(mylist, tbf->mysles[i]);
+}
+
+inline int CHmatchscore(unsigned char a, unsigned char b) { /*
+ * Score for aligning symbol a with symbol b: a direct lookup in the global
+ * substmatrix table. The block commented out below is a disabled rule-based
+ * alternative.
+ * NOTE(review): plain `inline` without `static` or `extern` has different
+ * linkage rules under GNU89 and C99 — confirm how this file is compiled.
+ */
+ return substmatrix[a][b];
+ /*
+ if (translated)
+ return substmatrix[a][b];
+ if (a == 'N' || b == 'N' || a == 'X' || b == 'X')
+ return 0;
+ if ((a == '*' || b == '*') && a != b)
+ return -50;
+ if (indeces[a] == indeces[b])
+ return matchsco;
+ return mismatchsco;
+ */
+}
+
+/*
+ * Ungapped extension from position (s1i, s2i) in direction `dir` (+1 or -1).
+ * Steps outward one pair at a time, accumulating CHmatchscore(), until the
+ * running score has dropped `dropcutoff` or more below its peak or either
+ * sequence ends. Returns the number of steps at which the peak was reached
+ * (0 when no step improved on the starting score).
+ */
+int extendBLAST(int s1i, int s2i, char* s1, char* s2, int s1l, int s2l, int dir) {
+  int peak = 0, peakloc = 0, currscore = 0, i, p1, p2;
+
+  for (i = 1; peak - currscore < dropcutoff; i++) {
+    p1 = s1i + dir*i;
+    p2 = s2i + dir*i;
+    /* Range checks come first, so s1[p1] and s2[p2] are only read for
+       indices inside the sequences. */
+    if (p1 < 0 || p2 < 0 || p1 >= s1l || p2 >= s2l || !s1[p1] || !s2[p2])
+      break;
+    currscore += CHmatchscore (s1[p1], s2[p2]);
+    if (currscore > peak) {
+      peak = currscore;
+      peakloc = i;
+    }
+  }
+  return peakloc;
+}
+
+int extendMerge(int s1l, int s2l, int s1r, int s2r, char* s1, char* s2, int* dir) { /*
+ * Chooses where to switch diagonals between a left anchor (s1l,s2l) and a
+ * right anchor (s1r,s2r). With length = MIN2(s1r-s1l, s2r-s2l), it returns
+ * the i in [0,length) that maximises
+ *   (score of the i pairs following the left anchor on its diagonal)
+ * + (score of the length-i-1 pairs preceding the right anchor on its diagonal).
+ * Returns 0 when length <= 0. *dir is always set from the WEQ2 macro.
+ */
+
+ int length, i;
+ int *s1arr, *s2arr, bestscore=-9999999, bestloc=0;
+
+ // HACK
+ if (s1l < 0){ int err = -s1l; s1l += err; s2l += err; } /* shift both left coordinates so that neither is negative */
+ if (s2l < 0){ int err = -s2l; s1l += err; s2l += err; }
+
+ length = MIN2(s1r-s1l, s2r-s2l);
+
+ // fprintf(stderr,"extmerge (%d %d) (%d %d)\n", s1l, s2l, s1r, s2r);
+ *dir = WEQ2(s1r-s1l, s2r-s2l, length); //0 vertical, 1 horizontal
+ if (length <= 0)
+ return 0;
+ s1arr = (int*) malloc (sizeof(int) * (length+1)); /* s1arr[i]: cumulative score of the first i pairs after the left anchor */
+ s2arr = (int*) malloc (sizeof(int) * (length+1)); /* s2arr[k]: cumulative score of the last length-k pairs before the right anchor */
+ s1arr[0] = s2arr[length] = 0;
+ for (i = 1; i <= length; i++) {
+ s1arr[i] = s1arr[i-1] + CHmatchscore(s1[s1l+i], s2[s2l+i]);
+ s2arr[length-i] = s2arr[length-i+1] + CHmatchscore(s1[s1r-i], s2[s2r-i]);
+ }
+ for (i = 0; i < length; i++) { /* strict '>' keeps the smallest i among equal scores */
+ if (s1arr[i]+s2arr[i+1] > bestscore) {
+ bestscore = s1arr[i]+s2arr[i+1];
+ bestloc = i;
+ }
+ }
+ // fprintf(stderr, "extMer score = %d\n", bestscore);
+ free (s1arr);
+ free (s2arr);
+ return bestloc;
+}
+
+/*
+ * Sums CHmatchscore() over the `len` aligned pairs that start at s1[s1l] and
+ * s2[s2l]. A negative start coordinate is first moved to 0, shifting the
+ * other coordinate by the same amount and shortening len accordingly.
+ * Returns 0 when no pairs remain.
+ */
+int reScore(int s1l, int s2l, int len, char* s1, char* s2) {
+  int shift, remaining, sum = 0;
+
+  // HACK
+  if (s1l < 0) {
+    shift = -s1l;
+    s1l += shift; s2l += shift; len -= shift;
+  }
+  if (s2l < 0) {
+    shift = -s2l;
+    s1l += shift; s2l += shift; len -= shift;
+  }
+
+  for (remaining = len; remaining > 0; remaining--, s1l++, s2l++)
+    sum += CHmatchscore(s1[s1l], s2[s2l]);
+
+  return sum;
+}
+
+
+void reScoreHit(LList* tbf, int index, char* s1, char* s2, int s1l, int s2l, gfc **frstgf, gfc **mygf) { /*
+ * Re-scores the chain of word hits stored in tbf->myhits[index] against the
+ * sequences s1/s2 and builds the matching list of gap-free chunks. On return
+ * *frstgf points at the first chunk and *mygf at the last one (whose next is
+ * 0); tbf->scores[index] and the four start/end points of `index` are
+ * overwritten. When the global `extend` is set, both ends of the chain are
+ * first extended with extendBLAST().
+ */
+ int totscore = 0, myscore;
+ int ts1, ts2, te1, te2;
+ int i=0, temp=0, offset, dir;
+
+
+ if (extend) {
+ temp = extendBLAST(tbf->myhits[index].inds1[i], tbf->myhits[index].inds2[i],
+ s1, s2, s1l, s2l, -1);
+ }
+
+ tbf->seq1startpnt[index] = ts1 = tbf->myhits[index].inds1[i] - temp; /* start of the first chunk, moved left by the extension */
+ tbf->seq2startpnt[index] = ts2 = tbf->myhits[index].inds2[i] - temp;
+ *frstgf = *mygf = (gfc*) malloc (sizeof (gfc));
+ (*frstgf)->offset = 0;
+
+ for (i = 0; i < tbf->myhits[index].numind-1; i++) {
+ if (!(offset = ((tbf->myhits[index].inds1[i]-tbf->myhits[index].inds2[i]) -
+ (tbf->myhits[index].inds1[i+1]-tbf->myhits[index].inds2[i+1])))) { /* offset 0: both word hits lie on the same diagonal, the current chunk continues */
+
+ continue;
+ }
+ else {
+
+
+ temp = extendMerge(tbf->myhits[index].inds1[i]+wordlen-1,
+ tbf->myhits[index].inds2[i]+wordlen-1,
+ tbf->myhits[index].inds1[i+1],
+ tbf->myhits[index].inds2[i+1], s1, s2, &dir);
+ te1 = tbf->myhits[index].inds1[i] + wordlen - 1 + temp; /* the current chunk ends `temp` positions past the end of word hit i */
+ te2 = tbf->myhits[index].inds2[i] + wordlen - 1 + temp;
+
+ myscore = reScore(ts1, ts2, te1-ts1+1, s1, s2);
+ totscore += myscore;
+ totscore += (gappenstart + gappenext * ABS(offset)); /* gap term for the diagonal change */
+ (*mygf)->length = te1-ts1+1;
+ (*mygf)->score = myscore;
+ (*mygf)->next = (gfc*) malloc (sizeof (gfc));
+ (*mygf) = (*mygf)->next;
+ (*mygf)->offset = offset;
+
+ if (dir) { /* the next chunk starts after the gap: dir selects which sequence skips ABS(offset) positions */
+ ts1 = te1+ABS(offset)+1;
+ ts2 = te2+1;
+ }
+ else {
+ ts2 = te2+ABS(offset)+1;
+ ts1 = te1+1;
+ }
+ }
+ }
+ temp = 0;
+ if (extend) {
+ temp = extendBLAST(tbf->myhits[index].inds1[i]+wordlen-1,
+ tbf->myhits[index].inds2[i]+wordlen-1, s1, s2, s1l, s2l, 1);
+ }
+ myscore = reScore(ts1, ts2, tbf->myhits[index].inds1[i]+wordlen-ts1+temp, s1, s2); /* i is now the last word hit of the chain */
+ (*mygf)->length = tbf->myhits[index].inds1[i]+wordlen-ts1+temp;
+ (*mygf)->score = myscore;
+ (*mygf)->next = 0;
+ totscore += myscore;
+ tbf->scores[index] = totscore;
+ tbf->seq1endpnt[index] = tbf->myhits[index].inds1[i]+wordlen-1 + temp;
+ tbf->seq2endpnt[index] = tbf->myhits[index].inds2[i]+wordlen-1 + temp;
+}
+
+
+/*
+ * Saves the qualifying hits of every node in the list `tbf` and frees the
+ * whole list. An entry whose chained score exceeds `cutoff` is re-scored with
+ * reScoreHit(); if the new score exceeds `reScoreCutoff` it is stored through
+ * saveScore(). Always returns 0 (the now empty list).
+ */
+LList* savenfreeLList(LList* tbf, seq* seq1, seq* seq2) {
+  int i;
+  LList* next;
+  gfc *first, *last, *chunk;
+
+  /* Iterative walk: no stack frame per list node. */
+  while (tbf) {
+    for (i=0; i < tbf->myloc->numlocs; i++) {
+      if (tbf->scores[i] > cutoff) {
+        tbf->seq1endpnt[i] = (int) tbf->myloc->locs[i] + wordlen - 1;
+        tbf->seq2endpnt[i] = tbf->location +wordlen - 1;
+        reScoreHit(tbf, i, seq1->lets, seq2->lets, seq1->numlets, seq2->numlets, &first, &last);
+        if (tbf->scores[i] > reScoreCutoff){
+          saveScore(tbf,i, first, last);   /* the saved hit now owns the chunk list */
+        }
+        else {
+          /* Rejected after re-scoring: release the chunk list reScoreHit built. */
+          while (first) {
+            chunk = first->next;
+            free (first);
+            first = chunk;
+          }
+        }
+      }
+    }
+    for (i=0; i < tbf->myloc->numlocs; i++) {
+      if (tbf->mysles[i]) {
+        remElem(tbf,i);
+      }
+      free (tbf->myhits[i].inds1);
+      free (tbf->myhits[i].inds2);
+    }
+
+    next = tbf->next;
+
+    free (tbf->myhits);
+    free (tbf->scores);
+    free (tbf->mysles);
+    free (tbf->seq1startpnt);
+    free (tbf->seq2startpnt);
+    free (tbf->seq1endpnt);
+    free (tbf->seq2endpnt);
+    free (tbf->toberemoved);
+    free (tbf);
+    tbf = next;
+  }
+  return 0;
+}
+
+int mergeOverlap(hll* h1, hll* h2, seq* seq1, seq* seq2) { /*
+ * Tries to merge hit h2 into hit h1. Returns 1 on success, after h1's score,
+ * end coordinates and chunk list have been updated to cover both hits and
+ * h2's chunk list has been detached; returns 0, with neither hit modified,
+ * when the hits do not qualify or merging would not beat both original scores.
+ */
+ int offset, myscore, nextscore, newscore, bestloc, dir, gappen;
+ int s1l, s2l, s1r, s2r, s1n, s2n;
+
+ // return 0;
+ // fprintf (stderr, "(%d %d) (%d %d)", h1->seq1end, h1->seq2end, h2->seq1start, h2->seq2start);
+
+ if ((h1->seq2end < h2->seq2start) && (h1->seq1end < h2->seq1start)) { /* h1 ends before h2 starts in both sequences: not handled here */
+ // fprintf (stderr, " no\n");
+ return 0;
+ }
+
+ offset = (h1->seq1end-h1->seq2end) - (h2->seq1start-h2->seq2start); /* diagonal difference between the end of h1 and the start of h2 */
+ if (ABS(offset) > mgaplen)
+ return 0;
+ gappen = gappenstart + gappenext * ABS(offset);
+
+ if ((-gappen) > h1-> score || (-gappen) > h2->score) { /* the gap term alone would outweigh one of the hits */
+ // fprintf (stderr, " gap\n");
+ return 0;
+ }
+ s1l = h1->seq1end - h1->last->length; /* left edge: h1's end moved back by the length of h1's last chunk */
+ s2l = h1->seq2end - h1->last->length;
+ s1r = h2->seq1start + h2->first->length; /* right edge: h2's start moved forward by the length of h2's first chunk */
+ s2r = h2->seq2start + h2->first->length;
+
+ if (s1r <= s1l || s2r <= s2l) {
+ // fprintf (stderr, " swap\n");
+ return 0;
+ }
+ if (offset) { /* different diagonals: choose a switch point and re-score the two chunks around it */
+ bestloc = extendMerge(s1l, s2l, s1r, s2r, seq1->lets, seq2->lets, &dir);
+ myscore = reScore(s1l, s2l, bestloc, seq1->lets, seq2->lets);
+ if (dir) {
+ s1n = s1l + bestloc + ABS(offset)+1;
+ s2n = s2l + bestloc + 1;
+ }
+ else {
+ s2n = s2l + bestloc + ABS(offset)+1;
+ s1n = s1l + bestloc + 1;
+ }
+ nextscore = reScore(s1n, s2n, s2r - s2n, seq1->lets, seq2->lets);
+ // fprintf (stderr, " %d %d %d\n", bestloc, myscore, nextscore);
+ // fprintf (stderr, "a %d %d %d\n", s1l, s1n, s1r);
+ newscore = h1->score + h2->score - (h2->first->score - nextscore) - (h1->last->score - myscore) + gappen; /* replace the two old chunk scores by the re-scored ones and add the gap term */
+ if (newscore < h1-> score || newscore < h2->score) {
+ // fprintf (stderr, " score1\n");
+ return 0;
+ }
+ h1->score = newscore;
+ h1->last->length = bestloc;
+
+ h2->first->score = nextscore;
+ h2->first->offset = offset;
+ h2->first->length = s2r - s2n;
+ h1->last->score = myscore;
+ h1->last->next = h2->first; /* h2's whole chunk list is appended to h1 */
+ if (h1->last->next)
+ h1->last = h2->last;
+ h2->first = 0;
+ }
+ else { /* same diagonal: h1's last chunk is stretched to cover h2's first chunk */
+ myscore = reScore(s1l, s2l, s1r-s1l, seq1->lets, seq2->lets);
+ newscore = h1->score + h2->score - (h1->last->score - myscore) + gappen;
+ if (newscore < h1-> score || newscore < h2->score) {
+ // fprintf (stderr, " score2\n");
+ return 0;
+ }
+ h1->score = newscore;
+ h1->last->score = myscore;
+ h1->last->next = h2->first->next; /* only the chunks after h2's first one are taken over */
+ h1->last->length = s1r - s1l;
+ if (h1->last->next)
+ h1->last = h2->last;
+ h2->first->next = 0;
+ }
+ h1->seq2end = h2->seq2end;
+ h1->seq1end = h2->seq1end;
+ return 1;
+}
diff --git a/src/fchaos.h b/src/fchaos.h
new file mode 100644
index 0000000..92053c1
--- /dev/null
+++ b/src/fchaos.h
@@ -0,0 +1,39 @@
+#ifndef __FCHAOS_H
+#define __FCHAOS_H
+
+typedef struct GapFreeChunkList { /* One ungapped stretch of a hit; a hit's chunks form a singly linked list. */
+ int offset; /* diagonal shift relative to the preceding chunk (0 for the first chunk) */
+ int length; /* number of aligned positions in this chunk */
+ int score; /* score of this chunk alone */
+ struct GapFreeChunkList *next; /* following chunk, or 0 at the end of the list */
+} gfc;
+
+typedef struct HitLocationList { /* One reported hit; hits form a singly linked list. */
+ int seq1start; /* first aligned position in sequence 1 */
+ int seq2start; /* first aligned position in sequence 2 */
+ int seq1end; /* last aligned position in sequence 1 */
+ int seq2end; /* last aligned position in sequence 2 */
+ float score; /* total score of the hit */
+ gfc* first; /* first gap-free chunk of the hit */
+ gfc* last; /* last gap-free chunk of the hit */
+ struct HitLocationList *next; /* following hit, or 0 */
+ char dirty; /* NOTE(review): meaning not visible in this header — confirm against its users */
+} hll;
+
+
+
+
+typedef struct Sequence { /* One sequence as produced by FileRead(). */
+ char* lets; /* start of the active, NUL-terminated letters; may point inside the buffer held by rptr */
+ int numlets, numsiglets; /* number of active letters; numsiglets is that number minus the count of 'N' characters read */
+ int leftbound, rightbound; /* requested start/finish; FileRead fills these in for VER_MLAGAN only */
+ char* name; /* FASTA header line without the leading '>' */
+ char* rptr; /* start of the allocated letter buffer */
+} seq;
+
+
+
+hll* fchaos(int argc, char** argv);
+int mergeOverlap(hll* h1, hll* h2, seq* seq1, seq* seq2);
+
+#endif
diff --git a/src/filebuffer.c b/src/filebuffer.c
new file mode 100644
index 0000000..7dfce8c
--- /dev/null
+++ b/src/filebuffer.c
@@ -0,0 +1,199 @@
+#include "filebuffer.h"
+#include <assert.h>
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef CHAOS__FLAG
+char* alphabet = "ATCGNPCMHDEKRQSILVFYWX*";
+#else
+char* alphabet = "ATCGN-.";
+#endif
+
+/*
+ * Opens `path` for reading and wraps it in a freshly allocated FileBuffer
+ * with an empty read window. Returns NULL when the file cannot be opened or
+ * memory is exhausted. The path pointer is stored, not copied, so it has to
+ * outlive the buffer. Release the result with FileClose().
+ */
+FileBuffer FileOpen (const char *path){
+  FileBuffer buf;
+  FILE *data = fopen (path, "r");
+  if (!data) return NULL;
+  buf = (FileBuffer) malloc (sizeof (struct FileBufferImplementation));
+  if (!buf){
+    fclose (data);   /* do not leak the stream when the allocation fails */
+    return NULL;
+  }
+  buf->filename = (char*) path;
+  buf->head = NULL;
+  buf->tail = NULL;
+  buf->startpos = 0; //100000000;
+  buf->endpos = 100000000; //0;
+  buf->data = data;
+  return buf;
+}
+
+void FileUpdate (FileBuffer buf){ /* Refills the buffer from the file once every buffered byte has been consumed (head has reached tail); otherwise does nothing. */
+ if (buf->head >= buf->tail){
+ buf->tail = buf->buffer + fread (buf->buffer, sizeof(char), BUFFER_SIZE, buf->data); /* tail ends one past the last byte read; it equals buffer when nothing was read */
+ buf->head = buf->buffer;
+ }
+}
+
+int FileEOF (FileBuffer buf){ /* Returns non-zero when no buffered byte is left and the underlying stream has reported end of file. May refill the buffer as a side effect. */
+ FileUpdate (buf);
+ return buf->head >= buf->tail && feof (buf->data);
+}
+
+/*
+ * Copies characters from `buf` into `buffer` up to and including the next
+ * newline, reading at most `length` characters.
+ *
+ * The result is NUL-terminated whenever there is room: after the newline, or
+ * after the last character when the input ends first. When `length`
+ * characters are consumed without room to spare, no terminator is written
+ * (as before).
+ */
+void FileGetS (char *buffer, int length, FileBuffer buf){
+  int a;
+
+  for (a = 0; a < length && !FileEOF (buf); a++){
+    buffer[a] = FilePeekC (buf);
+    buf->head++;
+    if (a + 1 < length && buffer[a] == '\n'){
+      buffer[a + 1] = '\0';
+      return;
+    }
+  }
+  /* The input ended before a newline: terminate what was read instead of
+     leaving the caller with an unterminated buffer. */
+  if (a < length) buffer[a] = '\0';
+}
+
+/*
+ * Reads the next line from `buf` into a freshly malloc'ed, NUL-terminated
+ * string with the newline removed. A final line without a trailing newline
+ * is returned as well; at end of input the result is an empty string.
+ * The caller frees the result.
+ */
+char *FileGetLine (FileBuffer buf){
+  int a = 0, length = 1;
+  char *buffer = (char *) malloc (1 * sizeof(char));
+  assert (buffer);
+
+  while (!FileEOF (buf)){
+    buffer[a] = FilePeekC (buf);
+    buf->head++;
+    if (buffer[a] == '\n') break;
+    a++;
+    if (a == length){
+      buffer = (char *) realloc (buffer, (length *= 2) * sizeof(char));
+      assert (buffer);
+    }
+  }
+
+  /* a < length always holds here, so the slot exists. Writing the terminator
+     unconditionally also covers input that ends without a newline. */
+  buffer[a] = '\0';
+  return buffer;
+}
+
+void FilePopC (FileBuffer buf){ /* Consumes one buffered character by advancing head; performs no refill and no end-of-data check. */
+ buf->head++;
+}
+
+char FilePeekC (FileBuffer buf){ /* Returns the next unread character without consuming it, refilling the buffer first when it is exhausted. Callers should test FileEOF() beforehand: at end of input *head is not a valid character. */
+ FileUpdate (buf);
+ return *(buf->head);
+ // return buf->buffer[buf->pos];
+}
+
+void FileClose (FileBuffer buf){ /* Closes the underlying stream and frees the buffer object; buf must be non-NULL and must not be used afterwards. */
+ fclose (buf->data);
+ free (buf);
+}
+
+/*
+ * Reads the next FASTA record from `buf` and returns it as a new seq, or 0
+ * when the input is exhausted. Exits if the record does not start with '>'.
+ *
+ * Letters are upper-cased; anything outside `alphabet` becomes 'N'; blanks,
+ * tabs, vertical tabs, carriage returns and newlines are skipped. Reading
+ * stops at the next '>' or at end of file.
+ *
+ * start/finish select a 1-based window; the pair (1, 0) means "use
+ * buf->startpos / buf->endpos". `version` selects the layout:
+ *   VER_FCHAOS - plain letters; windowed when start > 0.
+ *   VER_ORDER  - one NUL is placed in front of the letters.
+ *   VER_MLAGAN - one 'N' is placed in front; leftbound/rightbound are set.
+ * A `finish` beyond the data that was read is clamped to its end.
+ */
+seq* FileRead (FileBuffer buf, int start, int finish, int version){
+  char* res;
+  int ressize = 16, numread = 0, i, numNs = 0;
+  char *tempname, temp[256], currchar, *curr, *resend;
+  seq* myseq;
+
+  /* Test for end of input before allocating anything, so that this return
+     path leaks nothing. */
+  if (FileEOF(buf))
+    return 0;
+
+  if (start == 1 && finish == 0) {
+    start = buf->startpos;
+    finish = buf->endpos;
+    if (start == 0)
+      start = 1;
+  }
+
+  tempname = FileGetLine (buf);
+  if (tempname[0] != '>') {
+    fprintf(stderr, "File is not in FASTA format!!\n");
+    exit(1);
+  }
+
+  myseq = (seq*) malloc(sizeof(seq)); assert (myseq);
+  res = (char*) malloc(sizeof(char) * ressize); assert (res);
+
+  /* strlen(tempname) bytes hold the header minus its '>' plus the NUL. */
+  myseq->name = (char*) malloc((strlen(tempname))*sizeof(char));
+  assert (myseq->name);
+  strcpy(myseq->name, tempname+1);
+  if (strchr(myseq->name, '\n'))
+    *(char *)(strchr(myseq->name, '\n')) = 0;
+
+  free (tempname);
+
+  /* Translation table, indexed by the unsigned value of each input byte. */
+  for (i = 0; i < 256; i++){
+    int up = toupper (i);
+    temp[i] = (strchr (alphabet, up) != 0) ? (char) up : 'N';
+  }
+
+  FileUpdate (buf);
+  curr = res;
+
+  if (version == VER_ORDER || version == VER_MLAGAN){
+    /* One leading slot in front of the letters; the initial capacity is large
+       enough that the first letter after it still fits. */
+    *curr++ = (version == VER_ORDER) ? 0 : 'N';
+  }
+  resend = res + ressize;
+
+  while (buf->head < buf->tail || !feof (buf->data)){
+
+    while (buf->head < buf->tail){
+      currchar = *(buf->head);
+      if (currchar == '>') goto outer;
+      if (currchar != ' ' && currchar != '\n' && currchar != '\r' &&
+          currchar != '\t' && currchar != '\v') {
+        if (currchar == 'N') numNs++;
+        /* unsigned char: a byte >= 0x80 must not become a negative index */
+        *curr++ = temp[(unsigned char) currchar];
+        if (curr >= resend) {
+          numread = curr - res;
+          res = (char *) realloc (res, sizeof(char) * (ressize *= 2));
+          assert (res);
+          curr = res + numread;
+          resend = res + ressize;
+        }
+      }
+      buf->head++;
+    }
+
+    buf->tail = buf->buffer + fread (buf->buffer, sizeof(char), BUFFER_SIZE, buf->data);
+    buf->head = buf->buffer;
+  }
+
+ outer:
+  numread = curr - res;
+  res[numread]=0;   /* curr < resend holds after every write, so this is in range */
+  myseq->rptr = res;
+
+  if (version == VER_FCHAOS){
+    if (start > 0) {
+      if (finish > numread) finish = numread;   /* never cut past the data */
+      res[finish] = 0;
+      res = &res[start-1];
+      numread = finish-start+1;
+    }
+    myseq->numlets = numread;
+  }
+  else if (version == VER_ORDER){
+    if (start > 0){
+      if (finish > numread - 1) finish = numread - 1;   /* numread counts the leading slot */
+      res = &res[start-1];
+      res[0] = 0;
+      res[finish-start+2] = 0;
+      numread = finish-start+2;
+    }
+    myseq->numlets = numread-1;
+  }
+  else if (version == VER_MLAGAN){
+    if (start > 0 || finish > 0) {
+      if (finish > numread) finish = numread;   /* never cut past the data */
+      res[finish] = 0;
+      res = &res[start-1];
+      numread = finish-start+1;
+    }
+    myseq->numlets = numread;
+    myseq->leftbound = start;
+    myseq->rightbound = finish;
+  }
+  myseq->numsiglets = numread - numNs;
+  myseq->lets = res;
+  return myseq;
+}
diff --git a/src/filebuffer.h b/src/filebuffer.h
new file mode 100644
index 0000000..38c6ba4
--- /dev/null
+++ b/src/filebuffer.h
@@ -0,0 +1,36 @@
+#ifndef __FILEBUFFER_H
+#define __FILEBUFFER_H
+
+#include <stdio.h>
+
+#ifndef MULTIAL__FLAG
+#include "fchaos.h"
+#else
+#include "multial.h"
+#endif
+
+#define BUFFER_SIZE 1048576
+#define VER_FCHAOS 0
+#define VER_ORDER 1
+#define VER_MLAGAN 2
+
+struct FileBufferImplementation { /* State of one buffered input file; handled through the FileBuffer pointer type. */
+ FILE *data; /* underlying stream */
+ char* filename; /* path exactly as passed to FileOpen (pointer stored, not copied) */
+ char buffer[BUFFER_SIZE]; /* raw bytes most recently read from the stream */
+ char *head, *tail; /* next unread byte / one past the last valid byte of buffer */
+ int startpos, endpos; /* window FileRead uses when it is called with start == 1 and finish == 0 */
+ // int pos, len;
+};
+
+typedef struct FileBufferImplementation *FileBuffer;
+
+FileBuffer FileOpen (const char *path);
+int FileEOF (FileBuffer buf);
+void FileGetS (char *buffer, int length, FileBuffer buf);
+char FilePeekC (FileBuffer buf);
+void FilePopC (FileBuffer buf);
+void FileClose (FileBuffer buf);
+seq* FileRead (FileBuffer buf, int start, int end, int version);
+
+#endif
diff --git a/src/global.c b/src/global.c
new file mode 100644
index 0000000..21f3890
--- /dev/null
+++ b/src/global.c
@@ -0,0 +1,176 @@
+#include "global.h"
+#include <stdlib.h>
+#include <stdio.h>
+
+extern int indeces[256];
+
+#define MAX2(x,y) ( (x) >= (y) ? (x) : (y) )
+#define MAX3(x,y,z) MAX2(MAX2(x,y),z)
+
+
+/*
+ * Returns 1 when letters a and b map to the same entry of the global
+ * `indeces` table, 0 otherwise.
+ */
+int ismatch(char a, char b) {
+  /* Index by the unsigned value: plain char may be signed, and a negative
+     subscript would read in front of the 256-entry table. */
+  return indeces[(unsigned char) a] == indeces[(unsigned char) b];
+}
+
+/* Substitution score used by global(): 4 for identical characters, -3 otherwise. */
+int matchscore (char a, char b) {
+  return (a != b) ? -3 : 4;
+}
+
+/* Reverses the first `length` characters of `a` in place; a length below 2 leaves it untouched. */
+void reverse (char* a, int length) {
+  char *lo, *hi, held;
+
+  if (length < 2) return;
+  /* Swap from both ends towards the middle. */
+  for (lo = a, hi = a + length - 1; lo < hi; lo++, hi--) {
+    held = *lo;
+    *lo = *hi;
+    *hi = held;
+  }
+}
+
+align* global(char* seq1, int start1, int end1, char* seq2, int start2,
+ int end2, int gapopen, int gapext) { /*
+ * Aligns seq1[start1..end1] with seq2[start2..end2] (both ranges inclusive)
+ * using matchscore() and separate gap-open and gap-extension penalties.
+ * Three score tables of (end1-start1+1) x (end2-start2+1) ints are filled
+ * along anti-diagonals: M is updated with a diagonal step, N from cell
+ * (i-1,j) and O from cell (i,j-1). The traceback stores one code per
+ * alignment column: INSERTION, DELETION, or the result of ismatch().
+ * Returns a malloc'ed align; its algn array is malloc'ed too and is not
+ * freed here. No allocation is checked.
+ */
+
+ int mm = end2 - start2 + 1, score; /* mm: row length of the flattened tables; score is never used */
+ int i,j,k,c, temp, lastdiag=0;
+ int* M = (int*) malloc (sizeof(int) * (end1-start1+1) * (end2 - start2+1));
+ int* N = (int*) malloc (sizeof(int) * (end1-start1+1) * (end2 - start2+1));
+ int* O = (int*) malloc (sizeof(int) * (end1-start1+1) * (end2 - start2+1));
+ align* result = (align*) malloc (sizeof(align));
+ char* almt = (char*) malloc ( sizeof(char) * ((end1-start1)+(end2-start2)+2));
+
+ M[mm*0+0] = matchscore(seq1[start1],seq2[start2]);
+ N[mm*0+0] = -1*gapopen;
+ O[mm*0+0] = -1*gapopen;
+ for (i = 1; i <= end1-start1; i++) { /* first column */
+ O[mm*i+0] = O[mm*(i-1)+0]-gapext;
+ N[mm*i+0] = 0;
+ M[mm*i+0] = O[mm*(i-1)+0]+matchscore(seq1[start1+i],seq2[start2]);
+ }
+ for (j = 1; j <= end2-start2; j++) { /* first row */
+ N[mm*0+j] = N[mm*0 + (j-1)]-gapext;
+ O[mm*0+j] = 0;
+ M[mm*0+j] = N[mm*0+(j-1)]+matchscore(seq1[start1],seq2[start2+j]);
+ }
+ for ( k = 2; k <= end1-start1; k++) { /* anti-diagonals i+j = k that start in the first column */
+ for (i = k-1, j = 1; (i > 0) && (j <= end2-start2); i--, j++) {
+ N[mm*i + j] = MAX2(M[mm*(i-1)+j] - gapopen, N[mm*(i-1)+j] - gapext);
+ O[mm*i + j] = MAX2(M[mm*i+(j-1)] - gapopen, O[mm*i+(j-1)] - gapext);
+ M[mm*i + j] = MAX3(M[mm*(i-1)+(j-1)],N[mm*(i-1)+(j-1)],O[mm*(i-1)+(j-1)]) +
+ matchscore(seq1[start1+i], seq2[start2+j]);
+ }
+ }
+ for ( k = 1; k <= end2-start2; k++) { /* remaining anti-diagonals, which start in the last row */
+ for (j = k, i = end1-start1; (i>0) && (j <= end2-start2); j++, i--) {
+ N[mm*i + j] = MAX2(M[mm*(i-1)+j] - gapopen, N[mm*(i-1)+j] - gapext);
+ O[mm*i + j] = MAX2(M[mm*i+(j-1)] - gapopen, O[mm*i+(j-1)] - gapext);
+ M[mm*i + j] = MAX3(M[mm*(i-1)+(j-1)],N[mm*(i-1)+(j-1)],O[mm*(i-1)+(j-1)]) +
+ matchscore(seq1[start1+i], seq2[start2+j]);
+ }
+ }
+ i = end1-start1;
+ j = end2-start2;
+ c = 0;
+ result->score = MAX3 ( M[mm*(i)+(j)],
+ N[mm*(i)+(j)],
+ O[mm*(i)+(j)]);
+
+ while(i >= 0 && j >= 0) { /* traceback from the bottom-right cell; columns are produced in reverse order */
+ if (!i) { /* first row reached: pair the first seq1 letter, then emit DELETION for the rest of seq2 */
+ almt[c++] = ismatch(seq1[start1], seq2[start2+j]);
+ for ( j = j -1; j >=0; j--,c++) {
+ lastdiag = 0;
+ almt[c] = DELETION;
+ }
+ }
+ else if (!j) { /* first column reached: pair the first seq2 letter, then emit INSERTION for the rest of seq1 */
+ almt[c++] = ismatch(seq1[start1+i], seq2[start2]);
+ for ( i = i -1; i >=0; i--,c++) {
+ almt[c] = INSERTION;
+ lastdiag = 0;
+ }
+ }
+ else {
+ if (!lastdiag) { /* the previous traceback step was not diagonal: adjust this cell's three scores before comparing them */
+ M[mm*i+j] = M[mm*i+j] - gapopen;
+ N[mm*i+j] = N[mm*i+j] - gapext;
+ O[mm*i+j] = O[mm*i+j] - gapext;
+ }
+
+ temp = MAX3 ( M[mm*(i)+(j)],
+ N[mm*(i)+(j)],
+ O[mm*(i)+(j)]);
+ if (temp == N[mm*(i)+(j)]) { /* on ties N is preferred over O, and O over M */
+ lastdiag = 0;
+ almt[c++] = INSERTION;
+ i--;
+ }
+ else if (temp == O[mm*(i)+(j)]) {
+ lastdiag = 0;
+ almt[c++] = DELETION;
+ j--;
+ }
+ else if (temp == M[mm*(i)+(j)]) {
+ lastdiag = 1;
+ almt[c++] = ismatch(seq1[start1+i], seq2[start2+j]);
+ i--; j--;
+ }
+ }
+ }
+ free(M);
+ free(N);
+ free(O);
+ result->algnlen = c;
+ reverse(almt,c); /* put the columns into left-to-right order */
+ result->algn = almt;
+ return result;
+}
+
+/*
+ * Prints `myalign` to stdout in blocks of 60 columns: a line of seq1 letters
+ * ('-' where the column is a DELETION), a line marking matches with ':', and
+ * a line of seq2 letters ('-' where the column is an INSERTION). A summary
+ * line with the score and match/gap counts follows. Gaps and letters are
+ * only counted from the first match onwards. end1 and end2 are not used.
+ * Always returns 0.
+ */
+int printalign(char* seq1, int start1, int end1, char* seq2, int start2,
+               int end2, align* myalign) {
+  int s1=start1, s2=start2, c, k;
+  int nm=0, nga=0, ngb=0, nlets=0;
+  int hasst=0;
+  for (c = 0; c < myalign->algnlen; c = c + 60) {
+    for (k = c; (k < (c + 60)) && (k < myalign->algnlen); k++) {
+      if (myalign->algn[k] != DELETION)
+        printf("%c", seq1[s1++]);
+      else {
+        printf("-");
+        if (hasst)
+          nga++;
+      }
+    }
+    printf("\n");
+    for (k = c; (k < (c + 60)) && (k < myalign->algnlen); k++) {
+      if (myalign->algn[k] == 1) {
+        printf(":");
+        nm++;
+        nlets++;
+        hasst = 1;
+      }
+      else {
+        printf(" ");
+        if (hasst) nlets++;
+      }
+    }
+    printf("\n");
+    for (k = c; (k < (c + 60)) && (k < myalign->algnlen); k++) {
+      if (myalign->algn[k] != INSERTION)
+        printf("%c", seq2[s2++]);
+      else {
+        printf("-");
+        if (hasst)
+          ngb++;
+      }
+    }
+    printf("\n\n");
+  }
+  printf("score = %d, nmatches = %d, nga=%d, ngb=%d nletters=%d, perc = %f\n",
+         myalign->score,nm,nga,ngb,nlets,(float)nm/(float)nlets);
+  printf("\n");
+  /* The function is declared int: return a defined value instead of falling
+     off the end. */
+  return 0;
+}
+
+
+
+
diff --git a/src/global.h b/src/global.h
new file mode 100644
index 0000000..3038edb
--- /dev/null
+++ b/src/global.h
@@ -0,0 +1,14 @@
+#ifndef __GLOBAL_H
+#define __GLOBAL_H
+
+/* Column codes stored in align.algn, next to the 0/1 results of ismatch(). */
+#define INSERTION 2
+#define DELETION 3
+
+/* Result of global(): the alignment score and one code per alignment column. */
+typedef struct align_res {
+  int score;
+  int algnlen;   /* number of entries in algn */
+  char* algn;    /* malloc'ed by global() */
+} align;
+
+/* Aligns seq1[start1..end1] with seq2[start2..end2] (inclusive ranges). */
+align* global(char* seq1, int start1, int end1, char* seq2, int start2, int end2,
+              int gapstart, int gapcont);
+
+/* Prints myalign in 60-column blocks followed by a summary line. */
+int printalign(char* seq1, int start1, int end1, char* seq2, int start2, int end2,
+               align* myalign);
+
+#endif
diff --git a/src/glocal/Makefile b/src/glocal/Makefile
new file mode 100755
index 0000000..ce1421a
--- /dev/null
+++ b/src/glocal/Makefile
@@ -0,0 +1,19 @@
+CC = g++
+OPTFLAGS =
+CFLAGS = $(OPTFLAGS) -O3
+CLINKER = g++
+# LIBDIR = -L/usr/local/lib
+MLIB = -lm
+INCDIR = -I./
+TRGT_DIR = ../..
+TRGT = glocal
+OBJECTS = glocal.o io.o rightinfluence.o leftinfluence.o score.o
+
+.cpp.o:
+ $(CC) -Wno-deprecated $(CFLAGS) $(INCDIR) -c $*.cpp
+
+$(TRGT): $(OBJECTS)
+ $(CLINKER) $(OPTFLAGS) $(OBJECTS) -o $(TRGT_DIR)/$(TRGT) $(MLIB)
+
+clean :
+ rm -f *.o ./*~ *~ core
diff --git a/src/glocal/default.score b/src/glocal/default.score
new file mode 100755
index 0000000..da01ef8
--- /dev/null
+++ b/src/glocal/default.score
@@ -0,0 +1,5 @@
+{+R+;-L-}{0 0.02 0 0;40000 0 0 0}
+{+R-;-L+}{3000 0.02 0.1 0;40000 0 0 0}
+{-R+;+L-}{7000 0.02 0.5 0;40000 0 0 0}
+{+L+;-R-}{7000 0.02 0.5 0;40000 0 0 0}
+{+U+;+U-;-U+;-U-}{30000 0 0 0}
diff --git a/src/glocal/glocal.cpp b/src/glocal/glocal.cpp
new file mode 100755
index 0000000..4b0258c
--- /dev/null
+++ b/src/glocal/glocal.cpp
@@ -0,0 +1,258 @@
+#include<score.h>
+#include<glocal.h>
+#include<algorithm>
+// Ordering predicate: f1 sorts before f2 when it starts earlier in sequence 1.
+bool seq1StartCompare(const Fragment &f1, const Fragment &f2) {
+    return f2.seq1Start > f1.seq1Start;
+}
+
+//vectors that would be needed globally
+vector<Fragment> fragments; // every fragment kept by readInput()
+vector<Point>startPoints; // filled and sorted by createPointLists()
+vector<Point>endPoints; // filled and sorted by createPointLists()
+long long int numFragments; // value returned by readInput(), set in main()
+InterPoint inter; // pending intersection points; main() always looks at inter.begin()
+
+
+/*SLAGANCHANGE This has to change*/
+
+RI RI_regions[1<<(UPSTRANDBITS+DOWNSTRANDBITS+RELPOSBITS)]; // one RI structure per (down strand, up strand, relative position) case
+LI LI_regions[1<<(UPSTRANDBITS+DOWNSTRANDBITS+RELPOSBITS)]; // one LI structure per case, indexed the same way
+
+vector<class Score*> scoreFunctions[1<<(UPSTRANDBITS+DOWNSTRANDBITS+RELPOSBITS)]; // presumably filled by initScoreFunctionPointers() -- TODO confirm in score.cpp
+
+Name allNames; // seq2 contig name -> largest coordinate (findAllNames), later -> base offset (decideContigBase)
+
+
+extern Fragment LI_dummy; // sentinel fragment defined in leftinfluence.cpp
+Fragment * unrelatedFrag; // &LI_dummy at first, then the best-scoring fragment committed by endPointHandler()
+
+Fragment *max_score_index; // start-processed fragment with the highest totalScore so far
+float max_score; // totalScore of max_score_index
+// Entry point. argv[1] is the fragment file given to readInput(); argv[2] is given to
+// initScoreFunctionPointers(). Start, end and intersection points are then processed in
+// increasing seq1 order and the best chain found is printed with printChain().
+// Returns 1 when an argument is missing, 0 otherwise.
+int main(int argc, char **argv) {
+    long long int nextEndRow,nextStartRow, nextInterPointRow;
+    long long int i;
+    Point intersectionPoint;
+
+    // argv[1] and argv[2] are dereferenced below, so both must be present.
+    if (argc < 3) { fprintf(stderr, "Usage: glocal <fragments-file> <score-file>\n"); return 1; }
+    numFragments = readInput(argv[1]);
+    // Without fragments the point lists stay empty and endPoints[0] below would be out of bounds.
+    if (numFragments <= 0) { return 0; }
+
+    findAllNames( numFragments);
+    decideContigBase();
+    storeIterators(numFragments);
+
+    initScoreFunctionPointers(argv[2]);
+    unrelatedFrag = &LI_dummy;
+
+    /*SLAGANCHANGE need a LI, RI pointer array and init */
+    /*SLAGANCHANGE:: Need score function init */
+    if (DEBUG) { fprintf(stderr,"Numfrg::%lld",numFragments); }
+    max_score_index=NULL;
+    max_score =-INF;
+
+    // Set when startPointHandler() reports INF; the loop then stops after the next end row.
+    long long int break_flag =0;
+
+    createPointLists(numFragments);
+
+    //The initial Row upto which startPointHandler goes
+    nextEndRow = endPoints[0].seq1;
+    nextStartRow = startPoints[0].seq1;
+
+    for (i=0;i<1<<TOTALSHIFT;i++) {
+        initRI(&RI_regions[i],i);
+        InitLI(&LI_regions[i],i);
+    }
+    if (DEBUG) { fprintf(stderr,"The number of regions was %lld",i); }
+
+    while (1) {
+        // Row of the earliest pending intersection point, or INF when there is none.
+        if (inter.begin()==inter.end()) {
+            nextInterPointRow = INF;
+            if (DEBUG) { fprintf(stderr,"\nORHERE"); }
+        } else {
+            intersectionPoint = (inter.begin())->first;
+            nextInterPointRow = intersectionPoint.seq1;
+            if (DEBUG) { fprintf(stderr,"\nHERE"); }
+        }
+
+        if (nextStartRow <= nextEndRow) {
+            // Start row first, unless an intersection point lies on or before it.
+            if (nextStartRow<nextInterPointRow) {
+                nextStartRow=startPointHandler();
+                if (nextStartRow == INF) {
+                    break_flag = 1;
+                }
+            } else {
+                intersectionPointHandler();
+            }
+        } else {
+            // End row first, unless an intersection point lies on or before it.
+            if (nextEndRow<nextInterPointRow) {
+                nextEndRow=endPointHandler();
+                if (break_flag == 1) {
+                    break;
+                }
+            } else {
+                intersectionPointHandler();
+            }
+        }
+    }
+
+    if (DEBUG) { fprintf(stderr,"\nMAX CHAIN\n"); }
+    printChain(max_score_index);
+    return 0;
+}
+
+
+// Processes every start point on the current seq1 row (the row of the first unprocessed start
+// point): each one is scored against the LI/RI owners of both upstream strands and against
+// unrelatedFrag, and max_score / max_score_index are updated. Progress is kept in a static cursor.
+// Returns the seq1 of the next unprocessed start point, or INF once all 2*numFragments are done.
+long long int startPointHandler() {
+    static long long int current=0;
+    const long long int row = startPoints[current].seq1;
+    if (DEBUG) { fprintf(stderr,"\nStart PointHandler"); }
+    // The first point is on `row` by construction, so the row test can sit at the bottom.
+    do {
+        Fragment *frag = startPoints[current].frag;
+        Fragment *owner;
+        long long int upStrand, possibleCase;
+        long long int downStrand = frag->strand;
+        long long int relPos = (startPoints[current].seq2 > 0) ? RIGHT : LEFT;
+
+        // Upstream fragment on the positive strand: LI owner first, then RI owner.
+        upStrand = POSITIVE;
+        possibleCase = downStrand << DOWNSTRANDSHIFT | upStrand << UPSTRANDSHIFT | relPos << RELPOSSHIFT;
+
+        owner = LILookUpOwnerStart(&LI_regions[possibleCase], frag);
+        fragmentSetScore(frag, owner, &LI_regions[possibleCase], NULL, FALSE);
+
+        owner = lookUpOwnerStart(&RI_regions[possibleCase], frag);
+        fragmentSetScore(frag, owner, NULL, &RI_regions[possibleCase], TRUE);
+
+        // Upstream fragment on the negative strand: RI owner first, then LI owner.
+        upStrand = NEGATIVE;
+        possibleCase = downStrand << DOWNSTRANDSHIFT | upStrand << UPSTRANDSHIFT | relPos << RELPOSSHIFT;
+
+        owner = lookUpOwnerStart(&RI_regions[possibleCase], frag);
+        fragmentSetScore(frag, owner, NULL, &RI_regions[possibleCase], TRUE);
+        if (DEBUG) { fprintf(stderr, "HI1"); }
+
+        owner = LILookUpOwnerStart(&LI_regions[possibleCase], frag);
+        fragmentSetScore(frag, owner, &LI_regions[possibleCase], NULL, FALSE);
+        if (DEBUG) { fprintf(stderr, "HI2"); }
+
+        // Last candidate: unrelatedFrag, with no LI/RI structure attached.
+        fragmentSetScore(frag, unrelatedFrag, NULL, NULL, 3);
+        if (DEBUG) { fprintf(stderr, "HI3"); }
+
+        if (frag->back == NULL) {
+            if (DEBUG) { fprintf(stderr, "\n The fragment did not chain!"); }
+        } else if (DEBUG) {
+            fprintf(stderr, "Score for the current fragment is::%f", frag->totalScore);
+            fprintf(stderr, "Score for the owner fragment is::%f", frag->back->totalScore);
+        }
+
+        if (max_score < frag->totalScore) {
+            max_score = frag->totalScore;
+            max_score_index = frag;
+        }
+
+        ++current;
+        if (DEBUG) { fprintf(stderr,"\ncurrent fragment is %lld",current); }
+
+        // Every fragment contributes two start points (see createPointLists).
+        if (current>=2*numFragments) {
+            return INF;
+        }
+    } while (startPoints[current].seq1 == row);
+
+    return startPoints[current].seq1;
+}
+
+
+// Commits every end point on the current seq1 row (the row of the first unprocessed end point):
+// the fragment is committed to the RI and LI structures of the four (downStrand, relPos) cases
+// for its own strand, and unrelatedFrag is replaced by it when its totalScore is higher.
+// Among end points sharing both seq1 and seq2 only the one with the highest totalScore is committed.
+// Returns the seq1 of the next unprocessed end point, or INF when none is left.
+long long int endPointHandler() {
+    static long long int current=0;
+    // createPointLists() pushes ONE end point per fragment (only start points get two), so valid
+    // indices stop at endPoints.size()-1; the previous 2*numFragments bound and the unguarded
+    // endPoints[current] reads after the last row went past the end of the vector.
+    const long long int numEndPoints = (long long int) endPoints.size();
+
+    if (current >= numEndPoints) { return INF; }
+    long long int current_seq1= endPoints[current].seq1;
+    if (DEBUG) { fprintf(stderr,"\nEnd PointHandler"); }
+
+    /*SLAGANCHANGE:: There is going to be a commit to 4 structures depending on the strand, loop with continue*/
+    /*SLAGANCHANGE:: find the best scoring fragment in the current row and update the best so far at the end*/
+
+    while (current < numEndPoints && endPoints[current].seq1 == current_seq1) {
+        long long int upStrand, downStrand, relPos, possibleCase;
+
+        //MUKFIXME: This sends the highest scoring one into the leftinfluence machinery
+        // Bubble the best-scoring fragment of a run of identical points into the run's last slot.
+        while (current < numEndPoints-1 && (endPoints[current].seq1 == endPoints[current+1].seq1) && (endPoints[current+1].seq2 == endPoints[current].seq2)) {
+            if ((endPoints[current].frag->totalScore) > (endPoints[current+1].frag->totalScore)) {
+                Fragment * temp;
+
+                temp=endPoints[current+1].frag;
+                endPoints[current+1].frag=endPoints[current].frag;
+                endPoints[current].frag =temp;
+            }
+            current++;
+        }
+
+        upStrand = endPoints[current].frag->strand;
+
+        // This works because POSITIVE and NEGATIVE are 0 and 1
+        // This works because LEFT and RIGHT are 0 and 1
+        for (downStrand=0;downStrand<2;downStrand++) {
+            for (relPos=0;relPos<2;relPos++) {
+                possibleCase = downStrand << DOWNSTRANDSHIFT | upStrand <<UPSTRANDSHIFT | relPos<< RELPOSSHIFT;
+
+                RICommitEndPoint(&RI_regions[possibleCase],endPoints[current].frag);
+                LICommitPoint(&LI_regions[possibleCase],endPoints[current].frag);
+            }
+        }
+
+        if (endPoints[current].frag->totalScore > unrelatedFrag->totalScore)
+            unrelatedFrag = endPoints[current].frag;
+
+        current++;
+    }
+
+    if (current >= numEndPoints) { return INF; }
+    return endPoints[current].seq1;
+}
+
+
+// Handles all pending intersection points that lie on the earliest pending seq1 row.
+// Does nothing when no intersection point is pending.
+void intersectionPointHandler() {
+    // inter.begin() may only be dereferenced while the container is non-empty.
+    if (inter.begin() == inter.end()) { return; }
+
+    // Row being processed. The old loop compared against `curr`, a Point that was never
+    // assigned, and dereferenced inter.begin() even after the last point had been handled.
+    const long long int row = (inter.begin()->first).seq1;
+
+    if (DEBUG) { fprintf(stderr,"\nIntersection PointHandler"); }
+
+    do {
+        // printState(&LI_regions[0]);
+        HandleOneIntersectionPoint();
+        //printState(&LI_regions[0]);
+    } while (inter.begin() != inter.end() && (inter.begin()->first).seq1 == row);
+}
diff --git a/src/glocal/glocal.h b/src/glocal/glocal.h
new file mode 100755
index 0000000..b99c878
--- /dev/null
+++ b/src/glocal/glocal.h
@@ -0,0 +1,23 @@
+#ifndef GLOCAL
+#define GLOCAL
+// Compile-time switch: a non-zero value enables the fprintf(stderr, ...) tracing guarded by if (DEBUG).
+#define DEBUG 1
+// The long long limits below are defined only when LLONG_MAX is not already defined.
+#ifndef LLONG_MAX
+// limits.h entries from ISO C99
+#define LLONG_MAX 9223372036854775807LL
+#define LLONG_MIN (-LLONG_MAX - 1LL)
+#endif
+// Project headers are included with <...>; the Makefile passes -I./ so that they resolve locally.
+#include<structs.h>
+#include<io.h>
+#include<rightinfluence.h>
+#include<leftinfluence.h>
+#include<score.h>
+// Handlers driven by the main loop in glocal.cpp.
+long long int startPointHandler(); // returns the seq1 of the next start row, or INF when none is left
+long long int endPointHandler(); // returns the seq1 of the next end row
+float fragmentSetScore(Fragment * current,Fragment *owner); // NOTE(review): glocal.cpp calls fragmentSetScore with five arguments -- confirm this two-argument form is still defined
+void intersectionPointHandler(); // processes pending intersection points from `inter`
+
+#endif
diff --git a/src/glocal/io.cpp b/src/glocal/io.cpp
new file mode 100755
index 0000000..c5d301a
--- /dev/null
+++ b/src/glocal/io.cpp
@@ -0,0 +1,293 @@
+#include<structs.h>
+#include<glocal.h>
+#include<io.h>
+#include<algorithm>
+
+extern vector <Fragment> fragments;
+extern vector <Point> startPoints;
+extern vector <Point> endPoints;
+extern Name allNames;
+
+// Sort order for Point lists: ascending seq1, ties broken by ascending seq2.
+bool PointCompare(const Point &f1, const Point &f2) {
+    // Different rows: seq1 alone decides.
+    if (f1.seq1 != f2.seq1) {
+        return f1.seq1 < f2.seq1;
+    }
+    // Same row: fall back to seq2.
+    return f1.seq2 < f2.seq2;
+}
+
+
+// Scans str for two whitespace-separated numbers immediately followed by ';' (the "start end;"
+// part that getline() parses next) and returns a pointer to the first of the two numbers.
+// When no such pair exists, returns a pointer to the terminating NUL of str.
+char* rolltonum(char* str) {
+    char *got1 = 0, *got2 = 0;   // starts of the last two candidate numbers
+    long long int in = 0, i = 0; // in: a candidate number has started and not yet ended
+    for (; str[i] != 0; i++) {
+        // <ctype.h> functions are undefined for negative arguments, so classify as unsigned char.
+        const unsigned char ch = (unsigned char) str[i];
+        if (ch == ';' && got1 && got2) { return got1; }
+        if (isdigit(ch)) {
+            // A number only starts at the beginning of the string or right after whitespace.
+            if (!in && (!i || isspace((unsigned char) str[i-1]))) {
+                if (got1) {
+                    got2 = &str[i];
+                } else {
+                    got1 = &str[i];
+                }
+                in = 1;
+            }
+        } else if (in && isspace(ch)) {
+            // A number ended without ';': the second candidate, if any, becomes the first.
+            if (got2) { got1 = got2; got2 = 0; }
+            in = 0;
+        } else {
+            got1 = got2 = NULL;
+        }
+    }
+    return &str[i];
+}
+
+
+// Reads one line from infile and parses it into *tt: first token -> seq1Name, "<start> <end>;" -> seq1start/seq1end,
+// next token -> seq2Name, "<start> <end>; score = <f> (<c>)" -> seq2start/seq2end/score/strand. Returns 1 on success, 0 otherwise.
+// NOTE(review): %s copies the names unbounded; the sizes of tt->seq1Name / tt->seq2Name are declared elsewhere.
+long long int getline(FILE *infile, hll *tt) {
+    char temp[1024];
+    char* help;
+    long long int z;
+    int h = 0;
+    // At end of file (or on a read error) fgets leaves temp unset; parsing it would read
+    // uninitialised memory, so report "nothing parsed" instead.
+    if (!fgets(temp, 1024, infile)) { return 0; }
+    sscanf(temp, "%s", tt->seq1Name);
+    help = rolltonum(temp);
+    z = sscanf(help, "%lld %lld;%n", &tt->seq1start, &tt->seq1end, &h);
+    if (z < 2) { return 0; }
+
+    sscanf(help+h, "%s", tt->seq2Name);
+    help = rolltonum(help + h);
+
+    // Three converted fields are enough: a line without the "(<c>)" part is still accepted.
+    return (sscanf(help, "%lld %lld; score = %f (%c)\n", &tt->seq2start, &tt->seq2end, &tt->score, &tt->strand) < 3) ? 0 : 1;
+}
+
+
+// Prints one fragment on stdout. A NULL pointer prints "done"; a fragment whose score is -1
+// (the value InitLI gives the LI_dummy sentinel) prints nothing.
+void printFragment ( Fragment * curfrag ) {
+    if (!curfrag) {
+        printf("done");
+        return;
+    }
+    if (curfrag->score == -1) {
+        return;
+    }
+
+    // seq2 coordinates are printed with the fragment's contig base subtracted again.
+    const char strandSign = (curfrag->strand==POSITIVE) ? '+' : '-';
+
+    // TODO: remove space after s2 and check supermap sorts and regexes
+    printf("(%lld %lld)=(%lld %lld) %f %c [%f] s1:%s s2: %s\n",
+        curfrag->seq1Start, curfrag->seq1End,
+        curfrag->seq2Start-curfrag->base, curfrag->seq2End-curfrag->base,
+        curfrag->score, strandSign, curfrag->totalScore,
+        curfrag->seq1Name, curfrag->seq2Name
+    );
+}
+
+// Prints fragments[0 .. numFragments-1] with printFragment().
+void printAllFragments(long long int numFragments) {
+    long long int idx = 0;
+    while (idx < numFragments) {
+        printFragment(&fragments[idx]);
+        ++idx;
+    }
+}
+
+
+// Prints the chain ending at `current`: the fragment itself, then each predecessor reached
+// through the `back` links, until a NULL link. Always returns 0.
+long long int printChain(Fragment *current) {
+    for (Fragment *node = current; node != NULL; node = node->back) {
+        printFragment(node);
+    }
+    return 0;
+}
+
+
+// Exchanges the two long long values that a and b point to.
+void swap(long long int *a, long long int *b) {
+    const long long int first = *a;
+    *a = *b;
+    *b = first;
+}
+
+
+// Builds a Fragment from one parsed input record: coordinates, names, score and strand are
+// copied, the chain state is reset (back = NULL, totalScore = -1, deleted = FALSE), and the
+// seq1 interval is normalised so that seq1Start <= seq1End. The seq2 interval is left as read.
+Fragment createFragment(hll *temp) {
+    Fragment frag;
+
+    // Coordinates exactly as parsed.
+    frag.seq1Start = temp->seq1start;
+    frag.seq1End = temp->seq1end;
+    frag.seq2Start = temp->seq2start;
+    frag.seq2End = temp->seq2end;
+
+    strcpy(frag.seq1Name, temp->seq1Name);
+    strcpy(frag.seq2Name, temp->seq2Name);
+
+    // Anything other than '+' counts as the negative strand.
+    frag.strand = (temp->strand == '+') ? POSITIVE : NEGATIVE;
+
+    frag.score = temp->score;
+
+    // A fresh fragment is not part of any chain yet.
+    frag.back = NULL;
+    frag.totalScore = -1;
+    frag.deleted = FALSE;
+
+    // Only the seq1 interval is reordered.
+    if (frag.seq1End < frag.seq1Start) {
+        swap(&(frag.seq1Start), &(frag.seq1End));
+    }
+
+    return frag;
+}
+
+
+// reads the input file and returns the number of fragments read.
+// Lines that getline() rejects and fragments scoring below CUTOFF are skipped.
+// NOTE(review): open failures are reported on stdout with exit status 0; left unchanged because callers may depend on it.
+long long int readInput(char * fileName) {
+    hll tempInput;
+    FILE * fp;
+    long long int i=0;
+    char line[1024];
+    int next;
+    unsigned long long int line_count = 0;
+
+    fp = fopen(fileName, "r");
+    if (!fp) {
+        printf("SLAGAN: Error: Could not open file '%s'\n", fileName);
+        exit(0);
+    } else if (feof(fp)) {
+        printf("SLAGAN: Error: Empty file %s\n", fileName);
+        exit(0);
+    }
+
+    // Count the number of lines in the file
+    while (fgets(line, 1023, fp)) {
+        line_count++;
+    }
+    rewind(fp);
+    fragments.reserve(line_count);
+
+    // Peek one character before every getline() call. Testing feof() after the read, as before,
+    // threw away a final line that has no trailing newline (fgets sets EOF while still returning it).
+    while ((next = fgetc(fp)) != EOF) {
+        ungetc(next, fp);
+        if (!getline(fp, &tempInput)) { continue; }
+
+        // ignoring the low scoring fragments ?
+        if (tempInput.score < CUTOFF ) { continue; }
+
+        fragments.push_back(createFragment(&tempInput));
+        i++;
+    }
+    fclose(fp); // the handle used to stay open for the rest of the run
+    return i;
+}
+
+
+// Fills startPoints/endPoints from fragments[0 .. numFragments-1] and sorts both with PointCompare.
+// Each fragment yields start points (seq1Start, seq2Start) and (seq1Start, -seq2Start) and end point (seq1End, seq2End).
+void createPointLists(long long int numFragments) {
+    Point startPoint, endPoint;
+    //SLAGANCHANGE:: Push -seq2,seq1 on the start list as well.
+    for (long long int idx = 0; idx < numFragments; ++idx) {
+        Fragment *owner = &fragments[idx];
+        startPoint.frag = owner;
+        startPoint.seq1 = owner->seq1Start;
+        startPoint.seq2 = owner->seq2Start;
+        startPoints.push_back(startPoint);
+        // Second copy of the same start point with seq2 negated.
+        startPoint.seq2 = -owner->seq2Start;
+        startPoints.push_back(startPoint);
+        endPoint.frag = owner;
+        endPoint.seq1 = owner->seq1End;
+        endPoint.seq2 = owner->seq2End;
+        endPoints.push_back(endPoint);
+    }
+    sort(startPoints.begin(), startPoints.end(), PointCompare);
+    sort(endPoints.begin(), endPoints.end(), PointCompare);
+}
+// Debug helper: prints every start point, then every end point, to stdout.
+// numFragments is kept for interface compatibility only: createPointLists() stores two start
+// points per fragment, so bounding the first loop by numFragments printed just half of them.
+void printPointLists(long long int numFragments) {
+    (void) numFragments;
+    printf("StartPoint lists:\n");
+    for (size_t i = 0; i < startPoints.size(); i++) {
+        printf(" seq1 :%lld seq2:%lld \n", startPoints[i].seq1, startPoints[i].seq2);
+    }
+
+    printf("EndPoint lists:\n");
+    for (size_t i = 0; i < endPoints.size(); i++) {
+        printf(" seq1 :%lld seq2:%lld \n", endPoints[i].seq1, endPoints[i].seq2);
+    }
+    printf("End lists");
+}
+
+
+// Records in allNames, for every seq2 contig name seen in fragments[0 .. numFragments-1], the
+// largest seq2 coordinate (start or end) of any fragment on that contig.
+void findAllNames(long long int numFragments) {
+    long long int numContigs=0;
+
+    for (long long int idx = 0; idx < numFragments; ++idx) {
+        // Larger of the two seq2 coordinates of this fragment.
+        long long int size = fragments[idx].seq2End;
+        if (fragments[idx].seq2Start > fragments[idx].seq2End) {
+            size = fragments[idx].seq2Start;
+        }
+        Name::iterator currName = allNames.find(fragments[idx].seq2Name);
+        if (currName == allNames.end()) {
+            // First fragment seen on this contig.
+            allNames[fragments[idx].seq2Name] = size;
+            numContigs ++;
+        } else if (currName->second < size) {
+            currName->second = size;
+        }
+    }
+    if (DEBUG) { fprintf(stderr, "The number of contigs is %lld",numContigs); }
+}
+
+// Replaces each contig's extent in allNames by its base offset: contigs are laid out one after
+// another in map order, each followed by a gap of 10.
+void decideContigBase() {
+    long long int nextBase = 0;
+    Name::iterator it = allNames.begin();
+    while (it != allNames.end()) {
+        const long long int extent = it->second;
+        it->second = nextBase;
+        nextBase += (10 + extent);
+        it++;
+    }
+}
+
+// Caches each fragment's allNames entry in nameIter/base and shifts its seq2 coordinates by
+// that contig base.
+void storeIterators(long long int numFragments) {
+    for (long long int idx = 0; idx < numFragments; ++idx) {
+        Fragment &frag = fragments[idx];
+        frag.nameIter = allNames.find(frag.seq2Name);
+        frag.base = (frag.nameIter)->second;
+        frag.seq2Start += (frag.nameIter)->second;
+        frag.seq2End += (frag.nameIter)->second;
+    }
+}
diff --git a/src/glocal/io.h b/src/glocal/io.h
new file mode 100755
index 0000000..d1d87b1
--- /dev/null
+++ b/src/glocal/io.h
@@ -0,0 +1,22 @@
+#ifndef IO
+#define IO
+
+#include<stdio.h>
+#include<vector>
+#include<map>
+#include<stdlib.h>
+#include<ctype.h>
+#include<structs.h>
+
+// Functions implemented in io.cpp.
+long long int printChain(Fragment *current); // prints a chain by following the back links; returns 0
+long long int readInput(char * fileName); // returns the number of fragments read
+void printAllFragments( long long int numFragments); // prints fragments[0 .. numFragments-1]
+void createPointLists(long long int numFragments); // fills and sorts startPoints / endPoints
+void printPointLists(long long int numFragments); // dumps the point lists to stdout
+void printFragment ( Fragment * curfrag ); // prints one fragment, "done" for NULL
+void findAllNames(long long int numFragments); // collects the seq2 contig names into allNames
+void storeIterators(long long int numFragments); // adds the contig base to every fragment's seq2 coordinates
+void decideContigBase(); // turns the contig extents in allNames into base offsets
+
+#endif
diff --git a/src/glocal/leftinfluence.cpp b/src/glocal/leftinfluence.cpp
new file mode 100755
index 0000000..41f8f7b
--- /dev/null
+++ b/src/glocal/leftinfluence.cpp
@@ -0,0 +1,637 @@
+#include<leftinfluence.h>
+
+Fragment LI_dummy;
+
+// Returns the owner-list iterator of the region containing (seq1, seq2): the nearest column
+// boundary strictly below seq2 competes with the nearest diagonal boundary at or below
+// seq2 - seq1. Returns o.end() when no column boundary lies strictly below seq2.
+Owner::iterator LILookUpOwnerIterator(LI * LeftInfluence, long long int seq1, long long int seq2) {
+    CBound::iterator col = (LeftInfluence->c).lower_bound(seq2);
+
+    // Also covers the empty map, where lower_bound() yields end() == begin().
+    if (col == (LeftInfluence->c).begin()) {
+        return (LeftInfluence->o).end();
+    }
+    --col;
+
+    DBound::iterator diag = (LeftInfluence->d).upper_bound(seq2 - seq1);
+
+    // No diagonal boundary at or below this point: the column's owner is returned.
+    if (diag == (LeftInfluence->d).begin()) {
+        return col->second;
+    }
+    --diag;
+
+    // Column key minus diagonal key is compared with seq1 to pick between the two owners.
+    if ((col->first - diag->first) > seq1) {
+        return col->second;
+    }
+
+    return diag->second;
+}
+
+
+// Owner fragment of the region containing current's end point (seq1End, getSeq2End());
+// &LI_dummy when the lookup finds no owner.
+Fragment * LILookUpOwnerEnd(LI * LeftInfluence,Fragment * current) {
+    const Owner::iterator hit = LILookUpOwnerIterator(LeftInfluence, current->seq1End, current->getSeq2End(LeftInfluence->reflectFlag));
+    if (hit != (LeftInfluence->o).end()) {
+        return *hit;
+    }
+    return &LI_dummy;
+}
+
+
+// Owner fragment of the region containing current's start point (seq1Start, getSeq2Start());
+// &LI_dummy when the lookup finds no owner.
+Fragment * LILookUpOwnerStart(LI * LeftInfluence, Fragment * current) {
+    const Owner::iterator hit = LILookUpOwnerIterator(LeftInfluence, current->seq1Start, current->getSeq2Start(LeftInfluence->reflectFlag));
+    if (hit != (LeftInfluence->o).end()) {
+        return *hit;
+    }
+    return &LI_dummy;
+}
+
+
+// Returns the last column boundary strictly below seq2, or c.end() when there is none
+// (including when no column boundary exists at all). seq1 is not used.
+CBound::iterator LICColumn(LI * LeftInfluence, long long int /* seq1 */, long long int seq2) {
+    CBound &cols = LeftInfluence->c;
+    CBound::iterator col = cols.lower_bound(seq2);
+
+    //should not decrement, also means that the point is before all the column boundaries.
+    //FIX #2 if(citer == (LeftInfluence->c).begin())
+
+    if (cols.begin() == cols.end() || col == cols.begin()) {
+        return cols.end();
+    }
+
+    --col;
+    return col;
+}
+
+
+// Fragment owning the column boundary found by LICColumn(), or &LI_dummy when there is none.
+Fragment * LICOwner(LI * LeftInfluence, long long int seq1, long long int seq2) {
+    const CBound::iterator col = LICColumn(LeftInfluence, seq1, seq2);
+
+    if (col != (LeftInfluence->c).end()) {
+        // The boundary stores an iterator into the owner list.
+        return *(col->second);
+    }
+    return &LI_dummy;
+}
+
+
+// Fragment owning the diagonal boundary found by LIDDiagonal(), or &LI_dummy when there is none.
+Fragment * LIDOwner(LI * LeftInfluence, long long int seq1, long long int seq2) {
+    const DBound::iterator diag = LIDDiagonal(LeftInfluence, seq1, seq2);
+
+    if (diag != (LeftInfluence->d).end()) {
+        // The boundary stores an iterator into the owner list.
+        return *(diag->second);
+    }
+    return &LI_dummy;
+}
+
+
+// Returns the last diagonal boundary at or below seq2 - seq1, or d.end() when there is none
+// (including when no diagonal boundary exists at all).
+DBound::iterator LIDDiagonal(LI * LeftInfluence, long long int seq1, long long int seq2) {
+    DBound &diags = LeftInfluence->d;
+    DBound::iterator diag = diags.upper_bound(seq2-seq1);
+
+    if (diags.begin() == diags.end() || diag == diags.begin()) {
+        return diags.end();
+    }
+
+    --diag;
+    return diag;
+}
+
+
+// Score of chaining `current` after the LI owner of its start point: -1 when that owner is
+// the sentinel (score == -1), otherwise scoreAll(owner, current, scoreIndex).
+// this function should never get called with the LI dummy
+// can the scores become negative and how do we handle this?
+float LILookUpScore(LI * LeftInfluence, Fragment * current) {
+    Fragment * owner = LILookUpOwnerStart(LeftInfluence, current);
+
+    if (!owner) {
+        fprintf(stderr,"Owner NULL in call LILookUpScore");
+        exit(0);
+    }
+    //MUKCHECK
+    if (owner->score == -1) {
+        return -1;
+    }
+    return scoreAll(owner,current,LeftInfluence->scoreIndex);
+}
+
+
+// Initialises one LI structure for case `scoreIndex`: reflectFlag is TRUE exactly when the
+// relative-position bit of scoreIndex equals LEFT, and the owner list receives &LI_dummy at its front.
+// Also (re)sets the shared LI_dummy sentinel: score -1, totalScore 0, no back link.
+void InitLI(LI * LeftInfluence, long long int scoreIndex) {
+    const long long int relPosBit = (scoreIndex >> RELPOSSHIFT) & 1;
+
+    LeftInfluence->scoreIndex = scoreIndex;
+    LeftInfluence->reflectFlag = (relPosBit == LEFT) ? TRUE : FALSE;
+
+    LI_dummy.back = NULL;
+    LI_dummy.totalScore = 0;
+    LI_dummy.score = -1;
+
+    //there will be a list of structures to insert this into
+    (LeftInfluence->o).insert((LeftInfluence->o).begin(), &LI_dummy);
+}
+
+
+// Compares two fragments by their scoreAll() towards a probe fragment placed just past both:
+// returns TRUE when `first` scores at least as well as `second`, FALSE otherwise.
+// A sentinel (score == -1) always loses; `first` is tested first, so two sentinels give FALSE.
+long long int LI_Winner(LI * LeftInfluence, Fragment * first, Fragment * second) {
+    Fragment probe;
+    if (first->score == -1) { return FALSE; }
+    if (second->score == -1) { return TRUE; }
+
+    // Probe start: 2 past the larger seq1End, 1 past the larger getSeq2End().
+    probe.seq1Start = max(first->seq1End, second->seq1End) + 2;
+    probe.seq2Start = max(first->getSeq2End(LeftInfluence->reflectFlag), second->getSeq2End(LeftInfluence->reflectFlag)) + 1;
+    // The probe takes the nameIter of the fragment with the larger getSeq2End() (second on ties).
+    if (first->getSeq2End(LeftInfluence->reflectFlag) > second->getSeq2End(LeftInfluence->reflectFlag)) {
+        probe.nameIter = first->nameIter;
+    } else {
+        probe.nameIter = second->nameIter;
+    }
+    if (scoreAll(first, &probe, LeftInfluence->scoreIndex) >= scoreAll(second, &probe, LeftInfluence->scoreIndex)) {
+        return TRUE;
+    }
+    return FALSE;
+}
+
+
+// Tries to make `current` an owner starting at its end point. Returns FALSE when the present
+// owner of that point still wins (LI_Winner); otherwise forwards to LI_CommitColumnOwner or
+// LI_CommitDiagonalOwner and returns their result.
+long long int LICommitPoint(LI * LeftInfluence, Fragment * current) {
+    Owner::iterator ownerIter;
+    CBound::iterator citer;
+    DBound::iterator diter;
+    Fragment * owner;
+    long long int colFlag = FALSE;
+
+    // Everything is looked up at current's end point.
+    ownerIter = LILookUpOwnerIterator(LeftInfluence, current->seq1End, current->getSeq2End(LeftInfluence->reflectFlag));
+    citer = LICColumn(LeftInfluence, current->seq1End, current->getSeq2End(LeftInfluence->reflectFlag));
+    diter = LIDDiagonal(LeftInfluence, current->seq1End, current->getSeq2End(LeftInfluence->reflectFlag));
+    owner = LILookUpOwnerEnd(LeftInfluence, current);
+
+    // The point counts as column-owned when either boundary is missing, or when the column
+    // boundary's owner is the very owner found for the point.
+    if (citer == (LeftInfluence->c).end() || diter == (LeftInfluence->d).end()) {
+        colFlag = TRUE;
+    } else if (citer->second == ownerIter) {
+        colFlag = TRUE;
+    }
+
+    // The present owner keeps the region.
+    if (LI_Winner(LeftInfluence, owner, current)) {
+        return FALSE;
+    }
+
+    // Hand over to the matching commit routine.
+    if (!colFlag) {
+        return LI_CommitDiagonalOwner(LeftInfluence, current, owner);
+    }
+
+    return LI_CommitColumnOwner(LeftInfluence, current, owner);
+}
+
+// Inserts curfrag into the owner list directly after position `current`; returns its iterator.
+Owner::iterator LI_OwnerInsertAfter(LI * LeftInfluence, Owner::iterator current, Fragment * curfrag) {
+    Owner::iterator successor = current;
+    return (LeftInfluence->o).insert(++successor, curfrag);
+}
+// Commits `current` at its end point when LICommitPoint found that point to be owned through a diagonal boundary; `owner` is the fragment owning the point now.
+// Returns TRUE after updating the boundaries, FALSE/false when a neighbouring owner still beats `current`; exits when no column boundary precedes the point.
+long long int LI_CommitDiagonalOwner(LI * LeftInfluence, Fragment * current, Fragment * owner) {
+ CBound::iterator current_column, next_column;
+ DBound::iterator current_diagonal, prevDiag;
+ DInter::iterator current_diag_inter, my_diag_inter, prevDiagInter;
+ CInter::iterator my_col_inter, next_column_inter, colInter;
+
+ Owner::iterator own, tempowner;
+
+ //searching for the next column to switch on
+ current_column = LICColumn(LeftInfluence, current->seq1End, current->getSeq2End(LeftInfluence->reflectFlag));
+ current_diagonal = LIDDiagonal(LeftInfluence, current->seq1End, current->getSeq2End(LeftInfluence->reflectFlag));
+ current_diag_inter = (LeftInfluence->di).find(current_diagonal->first); // dereferences current_diagonal: LICommitPoint only routes here when LIDDiagonal() found one
+ own = LILookUpOwnerIterator(LeftInfluence, current->seq1End, current->getSeq2End(LeftInfluence->reflectFlag));
+
+ //this implies that the point is before all the cbounds:: THIS CANT HAPPEN!!
+
+ if (current_column == (LeftInfluence->c).end()) {
+ //FIX#7
+ fprintf(stderr, "\n diagonal owner, but no column before it");
+ exit(0);
+ } else {
+ next_column = current_column;
+ next_column++;
+ }
+
+ //2cases
+ if (next_column == (LeftInfluence->c).end() || next_column->first > current->getSeq2End(LeftInfluence->reflectFlag)) { // no column boundary between the current column and the point's seq2
+ if (current_diagonal->first < current->getSeq2End(LeftInfluence->reflectFlag) - current->seq1End) { // the point lies strictly above the current diagonal
+ if (DEBUG) { fprintf(stderr, "In Diagonal Commit::FIRSTCASE"); }
+ // Insert current after the diagonal's owner and open a column boundary at its seq2 end.
+ tempowner = LI_OwnerInsertAfter(LeftInfluence, current_diagonal->second, current);
+ (LeftInfluence->c)[current->getSeq2End(LeftInfluence->reflectFlag)] = tempowner;
+ (LeftInfluence->ci)[current->getSeq2End(LeftInfluence->reflectFlag)] = inter.end(); // inter.end() marks "no pending intersection"
+ my_col_inter = (LeftInfluence->ci).find(current->getSeq2End(LeftInfluence->reflectFlag));
+ // Re-insert the previous owner after current and open a diagonal boundary owned by that copy.
+ tempowner = LI_OwnerInsertAfter(LeftInfluence, tempowner, owner);
+
+ (LeftInfluence->d)[current->getSeq2End(LeftInfluence->reflectFlag) - current->seq1End] = tempowner;
+ (LeftInfluence->di)[current->getSeq2End(LeftInfluence->reflectFlag) - current->seq1End] = inter.end();
+ my_diag_inter = (LeftInfluence->di).find(current->getSeq2End(LeftInfluence->reflectFlag)-current->seq1End);
+ // The next column's pending intersection moves to the new diagonal when it was shared with the old one, or is created when absent.
+ if (next_column!= (LeftInfluence->c).end()) {
+ next_column_inter = (LeftInfluence->ci).find(next_column->first);
+
+ if (next_column_inter->second == current_diag_inter->second && current_diag_inter->second!=inter.end()) {
+ DeleteIntersectionPoint(next_column_inter->second, next_column_inter, current_diag_inter);
+ CreateIntersectionPoint(LeftInfluence, next_column->first,
+ current->getSeq2End(LeftInfluence->reflectFlag) - current->seq1End,
+ next_column_inter, my_diag_inter);
+ } else if (next_column_inter->second == inter.end()) {
+ CreateIntersectionPoint(LeftInfluence, next_column->first,
+ current->getSeq2End(LeftInfluence->reflectFlag) - current->seq1End,
+ next_column_inter, my_diag_inter);
+ }
+ }
+ // The new column boundary gets an intersection point with the old diagonal.
+ CreateIntersectionPoint(LeftInfluence, current->getSeq2End(LeftInfluence->reflectFlag),
+ current_diagonal->first, my_col_inter, current_diag_inter);
+ } else {
+ if (DEBUG) { fprintf(stderr, "\n In Diagonal Commit:SECONDCASE"); }
+
+ //There will be a previous owner as this is a diagonal case
+ own = LILookUpOwnerIterator(LeftInfluence, current->seq1End, current->getSeq2End(LeftInfluence->reflectFlag));
+ own--;
+ // The owner just before the point's owner must not beat current.
+ if (LI_Winner(LeftInfluence, *own, current)) {
+ return FALSE;
+ }
+
+ own++;
+ tempowner = (LeftInfluence->o).insert(own, current); // current goes directly before the point's owner
+ (LeftInfluence->c)[current->getSeq2End(LeftInfluence->reflectFlag)] = tempowner;
+ (LeftInfluence->ci)[current->getSeq2End(LeftInfluence->reflectFlag)] = inter.end();
+ colInter = (LeftInfluence->ci).find(current->getSeq2End(LeftInfluence->reflectFlag));
+
+ //There is no diagonal here
+
+ //intersection Point Handling
+ // check is the previous intersection Point exists, if it does check if the flag is off in which
+ //case insert an intersection Point into Intersect and Handle flags appropriately
+
+ //There is a problem here
+ //FIX #7 #4 major fix
+ if (current_diagonal != (LeftInfluence->d).begin()) {
+ prevDiag = current_diagonal;
+ prevDiag--;
+
+ prevDiagInter = (LeftInfluence->di).find(prevDiag->first);
+ if (prevDiagInter->second == inter.end()) {
+ CreateIntersectionPoint(LeftInfluence, current->getSeq2End(LeftInfluence->reflectFlag),
+ prevDiag->first, colInter, prevDiagInter);
+ }
+ }
+ }
+ } else {
+ if (DEBUG) { fprintf(stderr, "\n In Diagonal Commit:THIRDCASE"); }
+ if (LI_Winner(LeftInfluence, *(next_column->second), current)) { return false; }
+ // Insert current directly before the next column's owner.
+ tempowner = (LeftInfluence->o).insert(next_column->second, current);
+
+ //He does the intersection point processing with lower priority!!?
+ //This might mean that the diagonal entry already exists, also this might mean that
+ //The intersection point processing removes the entry?!
+ // The new diagonal boundary keeps pointing at the next column's previous owner.
+ (LeftInfluence->d)[current->getSeq2End(LeftInfluence->reflectFlag) - current->seq1End] = next_column->second;
+ (LeftInfluence->di)[current->getSeq2End(LeftInfluence->reflectFlag) - current->seq1End] = inter.end();
+ my_diag_inter = (LeftInfluence->di).find(current->getSeq2End(LeftInfluence->reflectFlag) - current->seq1End);
+
+ next_column->second = tempowner; // the existing column boundary is now owned by current
+
+ //checking if the next column exists
+ next_column++;
+
+ if (next_column!= (LeftInfluence->c).end()) {
+ next_column_inter =(LeftInfluence->ci).find(next_column->first);
+
+ if (next_column_inter->second == inter.end()) {
+ CreateIntersectionPoint(LeftInfluence, next_column->first,
+ current->getSeq2End(LeftInfluence->reflectFlag) - current->seq1End,
+ next_column_inter, my_diag_inter);
+ }
+ }
+ }
+ return TRUE;
+}
+// Commits `current` at its end point when LICommitPoint found that point to be column-owned (or a boundary to be missing); `owner` is the fragment owning the point now.
+// Returns TRUE after updating the boundaries, FALSE when the owner of the next column boundary still beats `current`.
+long long int LI_CommitColumnOwner(LI * LeftInfluence, Fragment * current, Fragment * owner) {
+ CBound::iterator current_column, next_column;
+ CInter::iterator nextColInter, colInter;
+ DInter::iterator diagInter;
+ Owner::iterator tempowner;
+
+ current_column= LICColumn(LeftInfluence, current->seq1End, current->getSeq2End(LeftInfluence->reflectFlag));
+ // No column boundary exists yet: create the first column/diagonal pair and stop.
+ if ((LeftInfluence->c).end() == (LeftInfluence->c).begin()) {
+ //Init has already put in one fragment
+ tempowner = LI_OwnerInsertAfter(LeftInfluence, (LeftInfluence->o).begin(), current);
+ (LeftInfluence->c)[current->getSeq2End(LeftInfluence->reflectFlag)] = tempowner;
+ (LeftInfluence->ci)[current->getSeq2End(LeftInfluence->reflectFlag)] = inter.end(); // inter.end() marks "no pending intersection"
+
+ //FIX #5 FIRST MAJOR FIX
+ tempowner = LI_OwnerInsertAfter(LeftInfluence, tempowner, &LI_dummy); // the region past the new diagonal goes back to the sentinel
+ (LeftInfluence->d)[current->getSeq2End(LeftInfluence->reflectFlag) - current->seq1End] = tempowner;
+ (LeftInfluence->di)[current->getSeq2End(LeftInfluence->reflectFlag) - current->seq1End] = inter.end();
+ return TRUE;
+ }
+
+ // If the current_column is the end , that means that we are before all the column boundaries
+ //as the other case has been taken care of above
+
+ if (current_column == (LeftInfluence->c).end()) {
+ next_column = (LeftInfluence->c).begin();
+ } else {
+ next_column = current_column;
+ next_column++;
+ }
+
+ // Either the case that the column boundary is that last column boundary or that the next column is after the current point
+
+ if (next_column == (LeftInfluence->c).end() || next_column->first > current->getSeq2End(LeftInfluence->reflectFlag)) {
+ if (DEBUG) { fprintf(stderr, "\nColCommit::FIRSTCASE"); }
+ // this means that the next column is not the first column
+ if (current_column != (LeftInfluence->c).end()) {
+ tempowner = LI_OwnerInsertAfter(LeftInfluence, current_column->second, current);
+ } else {
+ // this means that the next column is the first column
+ tempowner = LI_OwnerInsertAfter(LeftInfluence, (LeftInfluence->o).begin(), current);
+ }
+ // New column boundary at current's seq2 end, owned by current.
+ (LeftInfluence->c)[current->getSeq2End(LeftInfluence->reflectFlag)] = tempowner;
+ (LeftInfluence->ci)[current->getSeq2End(LeftInfluence->reflectFlag)] = inter.end();
+ //This is inefficient
+ colInter = (LeftInfluence->ci).find(current->getSeq2End(LeftInfluence->reflectFlag));
+ tempowner = LI_OwnerInsertAfter(LeftInfluence, tempowner, owner); // the previous owner is re-inserted after current and owns the new diagonal
+ (LeftInfluence->d)[current->getSeq2End(LeftInfluence->reflectFlag) - current->seq1End] = tempowner;
+ (LeftInfluence->di)[current->getSeq2End(LeftInfluence->reflectFlag)-current->seq1End] = inter.end();
+
+ //This is inefficient
+ diagInter = (LeftInfluence->di).find(current->getSeq2End(LeftInfluence->reflectFlag) - current->seq1End);
+
+ //if there is a next column then there is an issue of an intersection point
+ if (next_column != (LeftInfluence->c).end()) {
+ nextColInter = (LeftInfluence->ci).find(next_column->first);
+
+ if (nextColInter->second == inter.end()) {
+ CreateIntersectionPoint(LeftInfluence, next_column->first,
+ current->getSeq2End(LeftInfluence->reflectFlag) - current->seq1End, nextColInter, diagInter);
+ }
+ }
+ } else {
+ if (DEBUG) { fprintf(stderr, "\nColCommit::SECONDCASE"); }
+ // The next column boundary is not past the point: its owner must not beat current.
+ if (LI_Winner(LeftInfluence, *(next_column->second), current)) {
+ return FALSE;
+ }
+
+ tempowner = (LeftInfluence->o).insert(next_column->second, current); // current goes directly before the next column's owner
+ (LeftInfluence->d)[current->getSeq2End(LeftInfluence->reflectFlag) - current->seq1End] = next_column->second;
+ //FIX #6 SECOND MAJOR FIX
+ (LeftInfluence->di)[current->getSeq2End(LeftInfluence->reflectFlag) - current->seq1End] = inter.end();
+
+ //I dont think that i need this
+ diagInter = (LeftInfluence->di).find(current->getSeq2End(LeftInfluence->reflectFlag) - current->seq1End);
+ colInter = (LeftInfluence->ci).find(current->getSeq2End(LeftInfluence->reflectFlag)); // looked up but not used again in this branch
+ next_column->second = tempowner; // the existing column boundary is now owned by current
+
+ //intersection Point handling
+ next_column++;
+ if (next_column != (LeftInfluence->c).end()) {
+ nextColInter = (LeftInfluence->ci).find(next_column->first);
+
+ if (nextColInter->second == inter.end()) {
+ CreateIntersectionPoint(LeftInfluence, next_column->first,
+ current->getSeq2End(LeftInfluence->reflectFlag) - current->seq1End, nextColInter, diagInter);
+ }
+ }
+ }
+ return TRUE;
+}
+
+
+// Registers a pending intersection between column boundary `col` and diagonal boundary `diag`
+// at the point (col - diag, col), and stores the new entry in both boundaries' slots.
+void CreateIntersectionPoint(LI * LeftInfluence, long long int col, long long int diag, CInter::iterator colInter, DInter::iterator diagInter) {
+    Point where;
+    where.seq2 = col;
+    where.seq1 = col - diag;
+
+    // The LI structure is stored with the point; HandleOneIntersectionPoint() reads it back.
+    InterPoint::iterator created = inter.insert(pair<Point,LI*>(where, LeftInfluence));
+
+    diagInter->second = created;
+    colInter->second = created;
+}
+
+// Removes a pending intersection point and marks both boundary slots as empty (inter.end()).
+void DeleteIntersectionPoint(InterPoint::iterator tobeerased, CInter::iterator colInter, DInter::iterator diagInter) {
+    inter.erase(tobeerased);
+    diagInter->second = inter.end();
+    colInter->second = inter.end();
+}
+
+
+// handles one intersection point that is at the head of inter
+// Processes the intersection point at the head of the global `inter` multimap.
+//
+// The head names a column (seq2) and a diagonal (seq2 - seq1) of one LI.  The
+// owner stored on that diagonal (delOwner) sits between its list neighbours
+// leftOwner and rightOwner:
+//   * if LI_Winner(leftOwner, rightOwner) holds, the column boundary is
+//     removed and the diagonal continues, now pointing at the column's owner
+//     entry;
+//   * otherwise the diagonal boundary is removed and the column continues.
+// The surviving boundary is then paired with its neighbour (next column, or
+// previous diagonal) in a fresh intersection point, provided that neighbour
+// exists and is not already part of a pending intersection.  Finally delOwner
+// is erased from the owner list and the handled point is erased from `inter`.
+//
+// Inconsistent state is reported on stderr and ends the process with exit(0),
+// matching the existing error paths of this function.
+// NOTE(review): exiting with status 0 on an internal error hides the failure
+// from calling scripts -- confirm whether that is intended.
+void HandleOneIntersectionPoint() {
+  InterPoint::iterator head = inter.begin();
+
+  // find the three owners that are involved.
+  LI * LeftInfluence = head->second;
+
+  CBound::iterator col = (LeftInfluence->c).find((head->first).seq2);
+
+  if (col == (LeftInfluence->c).end()) {
+    fprintf(stderr, "\nIn HandleOneIntersectionPoint::The column does not exist. Point is %lld %lld", (head->first).seq1, (head->first).seq2);
+    exit(0);
+  }
+
+  CInter::iterator colInter = (LeftInfluence->ci).find(col->first);
+  DBound::iterator diag = (LeftInfluence->d).find((head->first).seq2 - (head->first).seq1);
+
+  if (DEBUG) { fprintf(stderr, "\nIn HandleOneIntersectionPoint::The intersection point that is being handled: %lld %lld", (head->first).seq1, (head->first).seq2); }
+
+  if (diag == (LeftInfluence->d).end()) {
+    fprintf(stderr, "\nIn HandleOneIntersectionPoint::The diagonal does not exist Point is %lld %lld", (head->first).seq1, (head->first).seq2);
+    exit(0);
+  }
+
+  DInter::iterator diagInter = (LeftInfluence->di).find(diag->first);
+  Owner::iterator delOwner = diag->second;
+
+  Owner::iterator leftOwner = delOwner;
+  leftOwner--;
+  Owner::iterator rightOwner = delOwner;
+  rightOwner++;
+
+  if (*leftOwner == *rightOwner) {
+    fprintf(stderr, "\nIn HandleOneIter:: The leftOwner is the same as the right owner");
+    exit(0);
+  }
+
+  if (LI_Winner(LeftInfluence, *leftOwner, *rightOwner)) {
+    // the diagonal continues
+    if (DEBUG) { fprintf(stderr, "\nIn HandleOneIter:: Diagonal continues"); }
+    diag->second = col->second;
+
+    CBound::iterator nextCol = col;
+    nextCol++;
+    // Only look at nextCol's key once we know it is not end(); the lookup has
+    // to happen before `col` is erased, as in the original statement order.
+    const bool hasNextCol = (nextCol != (LeftInfluence->c).end());
+    CInter::iterator nextColInter = (LeftInfluence->ci).end();
+    if (hasNextCol) {
+      nextColInter = (LeftInfluence->ci).find(nextCol->first);
+    }
+
+    (LeftInfluence->c).erase(col);
+    //FIX #8 MAJOR FIX
+    (LeftInfluence->ci).erase(colInter);
+
+    // The diagonal is detached from the handled point in every case; it is
+    // re-attached below only if a new intersection is created.
+    diagInter->second = inter.end();
+
+    if (hasNextCol) {
+      if (nextColInter == (LeftInfluence->ci).end()) {
+        fprintf(stderr, "\nIn HandleOneIter:No col inter corresponding to nextCol: %lld", nextCol->first);
+        exit(0);
+      }
+
+      if (nextColInter->second == inter.end()) {
+        // the column is not involved in an intersection
+        CreateIntersectionPoint(LeftInfluence, nextCol->first, diag->first, nextColInter, diagInter);
+      }
+    }
+  } else {
+    if (DEBUG) { fprintf(stderr, "\nIn HandleOneIter Column continues %f %f %f", (*delOwner)->score, (*leftOwner)->score, (*rightOwner)->score); }
+
+    // Decrementing begin() is undefined, so establish first whether a
+    // previous diagonal exists at all.
+    const bool hasPrevDiag = (diag != (LeftInfluence->d).begin());
+    DBound::iterator prevDiag = (LeftInfluence->d).end();
+    DInter::iterator prevDiagInter = (LeftInfluence->di).end();
+    if (hasPrevDiag) {
+      prevDiag = diag;
+      prevDiag--;
+      prevDiagInter = (LeftInfluence->di).find(prevDiag->first);
+    }
+
+    (LeftInfluence->d).erase(diag);
+    (LeftInfluence->di).erase(diagInter);
+
+    if (hasPrevDiag && prevDiagInter == (LeftInfluence->di).end()) {
+      fprintf(stderr, "\nIn HandleOneIter:No diag inter corresponding to PrevDiag: %lld", prevDiag->first);
+      exit(0);
+    }
+
+    // The column is detached from the handled point in every case; it is
+    // re-attached below only if a new intersection is created.
+    colInter->second = inter.end();
+
+    if (hasPrevDiag && prevDiagInter->second == inter.end()) {
+      // the diagonal is not involved in an intersection
+      CreateIntersectionPoint(LeftInfluence, col->first, prevDiag->first, colInter, prevDiagInter);
+    }
+  }
+
+  //delete the owner
+  (LeftInfluence->o).erase(delOwner);
+
+  inter.erase(inter.begin());
+}
+
+
+// Writes every diagonal key of LeftInfluence->d to stderr and returns how many
+// keys were written.
+//
+// NOTE(review): the guard returns early (no output, result 0) when DEBUG is
+// non-zero, so output only appears when DEBUG is off.  That looks inverted
+// but is left unchanged because the callers are not visible here.
+long long int printDBound(LI * LeftInfluence) {
+  if (DEBUG) { return 0; }
+
+  long long int diagCount = 0;
+  fprintf(stderr, "\nThe DBound is ::");
+
+  for (DBound::iterator i = (LeftInfluence->d).begin(); i != (LeftInfluence->d).end(); ++i) {
+    fprintf(stderr, "%lld ", i->first);
+    diagCount++;
+  }
+
+  // The terminator used to be the two characters '/' 'n'; a newline was meant.
+  fprintf(stderr, "Dbound Done\n");
+  return diagCount;
+}
+
+
+// Writes every column key of LeftInfluence->c to stderr and returns how many
+// keys were written.
+//
+// NOTE(review): the guard returns early (no output, result 0) when DEBUG is
+// non-zero, so output only appears when DEBUG is off.  That looks inverted
+// but is left unchanged because the callers are not visible here.
+long long int printCBound(LI * LeftInfluence) {
+  if (DEBUG) { return 0; }
+
+  long long int colCount = 0;
+  fprintf(stderr, "\nThe CBound is ::");
+
+  for (CBound::iterator i = (LeftInfluence->c).begin(); i != (LeftInfluence->c).end(); ++i) {
+    fprintf(stderr, "%lld ", i->first);
+    colCount++;
+  }
+
+  // The terminator used to be the two characters '/' 'n'; a newline was meant.
+  fprintf(stderr, "Cbound Done\n");
+  return colCount;
+}
+
+
+// Writes the score of every fragment in the owner list LeftInfluence->o to
+// stderr and returns the number of owners.
+//
+// NOTE(review): the guard returns early (no output, result 0) when DEBUG is
+// non-zero, so output only appears when DEBUG is off.  That looks inverted
+// but is left unchanged because the callers are not visible here.
+long long int printOwners(LI * LeftInfluence) {
+  if (DEBUG) { return 0; }
+
+  long long int ownerCount = 0;
+  fprintf(stderr, "\nThe Owner is ::");
+
+  for (Owner::iterator i = (LeftInfluence->o).begin(); i != (LeftInfluence->o).end(); ++i) {
+    ownerCount++;
+    fprintf(stderr, "%f ", (*i)->score);
+  }
+
+  // The terminator used to be the two characters '/' 'n'; a newline was meant.
+  fprintf(stderr, "Owners Done\n");
+  return ownerCount;
+}
+
+
+// Dumps the owner list, the column boundaries, the diagonal boundaries and
+// the global intersection queue, in that order, to stderr.  Does nothing when
+// DEBUG is non-zero (same guard as the individual print helpers).
+void printState(LI * LeftInfluence) {
+  if (DEBUG) { return; }
+
+  fprintf(stderr, "\nCurrent State:\n");
+
+  // The counts returned by the helpers are not needed here.
+  printOwners(LeftInfluence);
+  printCBound(LeftInfluence);
+  printDBound(LeftInfluence);
+  interPointPrint();
+}
+
+
+// Writes the (seq1, seq2) key of every entry of the global `inter` multimap
+// to stderr, in container order.
+//
+// NOTE(review): the guard returns early when DEBUG is non-zero, so output only
+// appears when DEBUG is off.  That looks inverted but is left unchanged
+// because the callers are not visible here.
+void interPointPrint() {
+  if (DEBUG) { return; }
+
+  fprintf(stderr, "\nThe Inter is ::");
+  for (InterPoint::iterator i = inter.begin(); i != inter.end(); ++i) {
+    fprintf(stderr, "%lld %lld ", (i->first).seq1, (i->first).seq2);
+  }
+
+  // The terminator used to be the two characters '/' 'n'; a newline was meant.
+  fprintf(stderr, "Inter Done\n");
+}
diff --git a/src/glocal/leftinfluence.h b/src/glocal/leftinfluence.h
new file mode 100755
index 0000000..77229ff
--- /dev/null
+++ b/src/glocal/leftinfluence.h
@@ -0,0 +1,100 @@
+#ifndef LEFTINFLUENCE
+#define LEFTINFLUENCE
+
+#include<structs.h>
+#include<score.h>
+
+struct LI;
+
+
+// Strict "less than" ordering on long long keys; used as the comparator of
+// the CBound/DBound/CInter/DInter maps.
+struct longlongCompare2
+{
+  bool operator()(long long int p1, long long int p2) const
+  {
+    return p1 < p2;
+  }
+};
+
+
+// Lexicographic ordering of Points: by seq1 first, then by seq2.  The frag
+// member does not take part in the comparison.
+struct paircomp
+{
+  bool operator()(const Point p1, const Point p2) const
+  {
+    if (p1.seq1 != p2.seq1) {
+      return p1.seq1 < p2.seq1;
+    }
+    return p1.seq2 < p2.seq2;
+  }
+};
+
+
+
+
+typedef list<Fragment*> Owner;
+typedef map <long long int ,Owner::iterator,longlongCompare2> CBound;
+
+typedef multimap <Point ,struct LI *,paircomp> InterPoint;
+
+typedef map <long long int ,InterPoint::iterator,longlongCompare2> CInter;
+typedef map <long long int,Owner::iterator,longlongCompare2> DBound;
+
+typedef map <long long int,InterPoint::iterator,longlongCompare2> DInter;
+
+
+
+typedef struct LI // "Left influence" state for one score case: an owner list plus column/diagonal boundaries that point into it.
+{
+ Owner o; // list of Fragment* owners; the iterators stored in c and d point into this list
+ CBound c; // column key -> owner iterator
+ DBound d; // diagonal key -> owner iterator (HandleOneIntersectionPoint looks diagonals up as seq2 - seq1)
+ CInter ci; // column key -> iterator into the global `inter` multimap; inter.end() means no pending intersection
+ DInter di; // diagonal key -> iterator into `inter`, same convention as ci
+ long long int scoreIndex; // score-case index; fragmentSetScore passes it to scoreAll
+ long long int reflectFlag; // passed to Fragment::getSeq2End/getSeq2Start, which negate seq2 when it equals TRUE
+
+
+}LI;
+
+
+extern InterPoint inter;
+
+
+
+
+
+Owner::iterator LILookUpOwnerIterator(LI* LeftInfluence,long long int seq1,long long int seq2) ;
+Fragment * LILookUpOwnerStart(LI* LeftInfluence,Fragment *current);
+Fragment * LILookUpOwnerEnd(LI* LeftInfluence,Fragment *current);
+CBound::iterator LICColumn(LI* LeftInfluence,long long int seq1, long long int seq2);
+Fragment *LICOwner(LI* LeftInfluence,long long int seq1, long long int seq2);
+Fragment *LIDOwner(LI* LeftInfluence,long long int seq1, long long int seq2);
+DBound::iterator LIDDiagonal(LI* LeftInfluence,long long int seq1, long long int seq2);
+float LILookUpScore(LI *LeftInfluence,Fragment *current);
+void InitLI(LI* LeftInfluence, long long int scoreIndex);
+long long int LI_Winner(LI* LeftInfluence,Fragment * first,Fragment * second);
+long long int LICommitPoint(LI *LeftInfluence,Fragment *current);
+Owner::iterator LI_OwnerInsertAfter(LI* LeftInfluence,Owner::iterator current,Fragment * curfrag);
+long long int LI_CommitDiagonalOwner(LI* LeftInfluence,Fragment *current,Fragment *owner);
+long long int LI_CommitColumnOwner(LI* LeftInfluence,Fragment *current,Fragment *owner);
+void CreateIntersectionPoint(LI* LeftInfluence,long long int col,long long int diag,CInter::iterator colInter,DInter::iterator diagInter);
+void DeleteIntersectionPoint(InterPoint::iterator tobeerased,CInter::iterator colInter,DInter::iterator diagInter);
+void HandleOneIntersectionPoint();
+
+long long int printDBound(LI * LeftInfluence);
+long long int printOwners(LI * LeftInfluence);
+long long int printCBound(LI * LeftInfluence);
+void printState(LI* LeftInfluence);
+void interPointPrint();
+
+
+
+#endif
diff --git a/src/glocal/rightinfluence.cpp b/src/glocal/rightinfluence.cpp
new file mode 100755
index 0000000..57b0c67
--- /dev/null
+++ b/src/glocal/rightinfluence.cpp
@@ -0,0 +1,203 @@
+#include <rightinfluence.h>
+
+Fragment origin, end;
+
+// Sets the first default owner of the whole region
+// Initialises a right-influence structure for the score case `scoreIndex`.
+//
+// reflectFlag is set to TRUE when bit 0 of the relative-position field of
+// scoreIndex equals LEFT, and to FALSE otherwise.  The active map is seeded
+// with two file-level sentinel fragments: `origin` at key -INF and `end` at key
+// +INF.  RIWinner recognises them by their scores (-1 and -2).
+//
+// The sentinels are written as ::origin / ::end on purpose: structs.h pulls in
+// `using namespace std`, and from C++11 on an unqualified `end` is ambiguous
+// with std::end.
+void initRI(RI *RightInfluence, long long int scoreIndex) {
+  RightInfluence->scoreIndex = scoreIndex;
+  RightInfluence->reflectFlag = (((scoreIndex >> RELPOSSHIFT) & 1) == LEFT) ? TRUE : FALSE;
+
+  // will lose to anyone
+  ::origin.seq1End = 0;   ::origin.seq2End = 0;
+  ::origin.seq1Start = 0; ::origin.seq2Start = 0;
+
+  // hack to aid winner selection: RIWinner tests for exactly these scores
+  ::origin.score = -1;
+  ::end.score = -2;
+  ::origin.totalScore = ::end.totalScore = 0;
+
+  // will win against anyone
+  ::end.seq1End = 0;   ::end.seq2End = 0;
+  ::end.seq1Start = 0; ::end.seq2Start = 0;
+
+  ::origin.back = NULL;
+
+  RightInfluence->act[-INF] = &::origin;
+  RightInfluence->act[+INF] = &::end;
+}
+
+
+// Finds the owner in the current right influence region and returns the score using the appropriate score function
+// Scores `current` against the owner of the region its start point falls in,
+// using the score case stored in the right-influence structure.
+float lookUpScore(RI * RightInfluence, Fragment * current) {
+  Fragment * regionOwner = lookUpOwnerStart(RightInfluence, current);
+  return scoreAll(regionOwner, current, RightInfluence->scoreIndex);
+}
+
+
+// Returns the owner of the region
+// Returns the active owner for the diagonal of `current`'s start point: the
+// entry with the greatest key that is not above that diagonal.
+Fragment * lookUpOwnerStart(RI * RightInfluence, Fragment * current) {
+  const long long int startDiag = current->getSeq2Start(RightInfluence->reflectFlag) - current->seq1Start;
+
+  // upper_bound gives the first key above the diagonal; step back one entry.
+  Active::iterator region = RightInfluence->act.upper_bound(startDiag);
+  --region;
+
+  return region->second;
+}
+
+
+// Returns the active owner for the diagonal of `current`'s end point: the
+// entry with the greatest key that is not above that diagonal.
+Fragment * lookUpOwnerEnd(RI * RightInfluence, Fragment * current) {
+  const long long int endDiag = current->getSeq2End(RightInfluence->reflectFlag) - current->seq1End;
+
+  // upper_bound gives the first key above the diagonal; step back one entry.
+  Active::iterator region = RightInfluence->act.upper_bound(endDiag);
+  --region;
+
+  return region->second;
+}
+
+
+// Returns true if the first argument is the winner in their common region
+// Returns TRUE when `first` beats `second` in their common region, FALSE
+// otherwise.
+//
+// The sentinels are decided by score alone: the origin (score -1) always
+// loses and the end marker (score -2) always wins.  Real fragments are
+// compared by scoring both against a probe fragment that starts just past the
+// larger of the two end coordinates.
+long long int RIWinner(RI * RightInfluence, Fragment * first, Fragment * second) {
+  // first is the origin, or second is the end marker: first loses
+  if (first->score == -1 || second->score == -2) { return FALSE; }
+
+  // first is the end marker, or second is the origin: first wins
+  if (second->score == -1 || first->score == -2) { return TRUE; }
+
+  const long long int firstEnd2 = first->getSeq2End(RightInfluence->reflectFlag);
+  const long long int secondEnd2 = second->getSeq2End(RightInfluence->reflectFlag);
+
+  Fragment probe;
+  probe.seq1Start = Mymax(first->seq1End, second->seq1End) + 1;
+  probe.seq2Start = Mymax(firstEnd2, secondEnd2) + 2;
+  // The probe takes the sequence name of whichever fragment ends later in seq2.
+  probe.nameIter = (firstEnd2 > secondEnd2) ? first->nameIter : second->nameIter;
+
+  const float firstScore = scoreAll(first, &probe, RightInfluence->scoreIndex);
+  const float secondScore = scoreAll(second, &probe, RightInfluence->scoreIndex);
+
+  return (firstScore > secondScore) ? TRUE : FALSE;
+}
+
+
+// Tries to make `current` an active owner, keyed by the diagonal of its end
+// point.
+//
+// Returns 0 and changes nothing when the present owner of that diagonal beats
+// `current`.  Otherwise every following active owner that `current` beats is
+// removed from the map, `current` is inserted, and 1 is returned.
+//
+// A trailing scan over the active map that was guarded by the
+// NEGATIVE/NEGATIVE/LEFT score case has been dropped: it only advanced local
+// iterators and broke out (its assert was commented out), so it had no
+// observable effect apart from an O(n) walk per call.
+long long int RICommitEndPoint(RI * RightInfluence, Fragment * current) {
+  Fragment * owner = lookUpOwnerEnd(RightInfluence, current);
+
+  if (RIWinner(RightInfluence, owner, current)) { return 0; }
+
+  owner = nextOnActive(RightInfluence, owner);
+
+  // nextOnActive returns NULL past the last entry.  With the +INF sentinel in
+  // place the loop stops before that, but do not rely on it.
+  while (owner != NULL && RIWinner(RightInfluence, current, owner)) {
+    Fragment * beaten = owner;
+    // Fetch the successor first: it is found through beaten's key.
+    owner = nextOnActive(RightInfluence, beaten);
+    RightInfluence->act.erase(beaten->getSeq2End(RightInfluence->reflectFlag) - beaten->seq1End);
+  }
+
+  //inserting into the list of active owners
+  RightInfluence->act[current->getSeq2End(RightInfluence->reflectFlag) - current->seq1End] = current;
+
+  return 1;
+}
+
+
+// Diagonal of the fragment's end point: (possibly reflected) seq2End minus
+// seq1End.
+long long int diagonal(Fragment * current, RI * RightInfluence) {
+  const long long int end2 = current->getSeq2End(RightInfluence->reflectFlag);
+  const long long int end1 = current->seq1End;
+  return end2 - end1;
+}
+
+
+// Returns the successor on the active list
+// Returns the fragment stored under the first active key strictly greater
+// than `current`'s key, or NULL when there is none.
+//
+// The key is the end-point diagonal, except for the two sentinels, which are
+// recognised by score: -1 maps to -INF and -2 maps to INF.
+Fragment * nextOnActive(RI * RightInfluence, Fragment * current) {
+  long long int key;
+
+  //MUKMOD start
+  if (current->score == -2) {
+    key = INF;
+  } else if (current->score == -1) {
+    key = -INF;
+  } else {
+    key = current->getSeq2End(RightInfluence->reflectFlag) - current->seq1End;
+  }
+  //MUKMOD end
+
+  Active::iterator successor = RightInfluence->act.upper_bound(key);
+  if (successor == RightInfluence->act.end()) {
+    return NULL;
+  }
+  return successor->second;
+}
+
+
+// Prints every entry of the active map (key, score, total score) to stdout
+// and returns the number of entries.
+long long int printActive(RI * RightInfluence) {
+  long long int entries = 0;
+
+  fprintf(stdout, "Active RI:\n");
+  Active::iterator it = RightInfluence->act.begin();
+  while (it != RightInfluence->act.end()) {
+    Fragment * frag = it->second;
+    fprintf(stdout, " %lld", it->first);
+    fprintf(stdout, ":sc=%f:totsc=%f;", frag->score, frag->totalScore);
+    ++entries;
+    ++it;
+  }
+  fprintf(stdout, "\n");
+
+  return entries;
+}
diff --git a/src/glocal/rightinfluence.h b/src/glocal/rightinfluence.h
new file mode 100755
index 0000000..117434e
--- /dev/null
+++ b/src/glocal/rightinfluence.h
@@ -0,0 +1,42 @@
+#ifndef RIGHTINFLUENCE
+#define RIGHTINFLUENCE
+
+#include<structs.h>
+#include<io.h>
+#include<score.h>
+
+
+// Strict "less than" ordering on long long keys; comparator of the Active map.
+struct longlongCompare {
+  bool operator()(long long int p1, long long int p2) const {
+    return p1 < p2;
+  }
+};
+
+
+typedef map<const long long int , Fragment*,longlongCompare> Active;
+
+typedef struct RI { // "Right influence" state for one score case.
+ //List of active regions
+ Active act; // diagonal key -> owning fragment; initRI seeds it with sentinels at -INF and +INF
+ long long int scoreIndex; // score-case index handed to scoreAll
+ long long int reflectFlag; // TRUE/FALSE as set by initRI; passed to Fragment::getSeq2End/getSeq2Start
+} RI;
+
+
+void initRI(RI *RightInfluence,long long int scoreIndex);
+float lookUpScore(RI * RightInfluence,Fragment *current);
+Fragment* lookUpOwnerEnd(RI * RightInfluence,Fragment *current);
+Fragment* lookUpOwnerStart(RI * RightInfluence,Fragment *current);
+long long int RIWinner(RI *RightInfluence,Fragment *first,Fragment * second);
+//long long int processRowofEndPoints(RI *RightInfluence,long long int firstIndex);
+long long int diagonal(Fragment * current,RI * RightInfluence);
+Fragment * nextOnActive(RI* RightInfluence,Fragment * current);
+long long int printActive(RI * RightInfluence);
+long long int RICommitEndPoint(RI *RightInfluence,Fragment *current);
+
+
+#endif
diff --git a/src/glocal/score.cpp b/src/glocal/score.cpp
new file mode 100755
index 0000000..24525e7
--- /dev/null
+++ b/src/glocal/score.cpp
@@ -0,0 +1,225 @@
+#include<structs.h>
+#include<score.h>
+#include<leftinfluence.h>
+#include<rightinfluence.h>
+#include<fstream.h>
+
+extern vector<class Score*> scoreFunctions[1<<(UPSTRANDBITS+DOWNSTRANDBITS+RELPOSBITS)];
+
+
+// Score of chaining `down` after `up` under this score function.
+//
+// The gaps between up's end and down's start are measured in both sequences.
+// The smaller gap, the larger gap and their difference are each charged with
+// their own constant, the opening constant is subtracted, and up's total
+// score is added.
+float Score::getScore(Fragment *up, Fragment * down) {
+  const long long int gap1 = Myabs((up->seq1End) - (down->seq1Start));
+  const long long int gap2 = Myabs((up->seq2End) - (down->seq2Start));
+
+  const long long int shorter = Mymin(gap1, gap2);
+  const long long int longer = Mymax(gap1, gap2);
+  const long long int offDiagonal = longer - shorter;
+
+  // Term order is kept as-is so the float result is unchanged.
+  return shorter*(-minConstant) + longer* (-maxConstant) + offDiagonal *(-diagConstant) -openConstant +up->totalScore;
+}
+
+
+// Stores the four penalty constants of a score function.
+ScoreInterface::ScoreInterface (float iopenConstant, float iminConstant, float imaxConstant, float idiagConstant)
+  : openConstant(iopenConstant),
+    minConstant(iminConstant),
+    maxConstant(imaxConstant),
+    diagConstant(idiagConstant) {
+}
+
+
+// Forwards the four constants, in the same order, to ScoreInterface.
+Score::Score (float iopenConstant, float iminConstant, float imaxConstant, float idiagConstant)
+  : ScoreInterface(iopenConstant, iminConstant, imaxConstant, idiagConstant) {
+}
+
+
+// Reads the score file line by line and hands each line to
+// createScoreFunctionObjects.  Reading stops at the first empty line, which
+// is also what a read at end of file leaves in the buffer.  If the file
+// cannot be opened a message is printed and the process exits with status 0.
+void initScoreFunctionPointers(char * scoreFileName) {
+  char line[255];
+  ifstream SFP;
+
+  SFP.open(scoreFileName);
+  if (!SFP.good()) {
+    printf("The score file is invalid");
+    exit(0);
+  }
+
+  SFP.getline(line, 255);
+  while (line[0] != '\0') {
+    createScoreFunctionObjects(line);
+    SFP.getline(line, 255);
+  }
+}
+
+// Reports a malformed score-file line on stderr and terminates the process.
+static void scoreLineError(const char * what, const char * line) {
+  fprintf(stderr, "\n Malformed line in score file (%s): %s\n", what, line);
+  exit(1);
+}
+
+
+// Parses one score-file line of the form
+//     {case;case;...}{open diag min max;open diag min max;...}
+// and registers every listed score function under every listed case.
+//
+// Each case is three characters "<up><relpos><down>" that charToCase turns
+// into one index into scoreFunctions.  Each parameter group is four floats;
+// note the argument order of the Score constructor: the 3rd and 4th floats
+// are its min and max constants and the 2nd float is its diag constant.
+//
+// A line without any brace is ignored.  Any other deviation from the layout
+// (wrong number of braces, more than MAXCASES cases or MAXOBJECTS parameter
+// groups, a group that does not parse) is reported and ends the process;
+// such lines previously wrote outside the fixed-size local arrays or used
+// uninitialised values.
+void createScoreFunctionObjects(char * line) {
+  long long int i;
+  long long int j;
+  long long int rem[4];
+  // One slot more than the number of groups: n groups have n+1 delimiters.
+  long long int remCases[MAXCASES + 1], remObjects[MAXOBJECTS + 1];
+  long long int numCases;
+  long long int numObjects;
+  long long int cases[MAXCASES];
+  float objects[MAXOBJECTS][4];
+  char updir, downdir, relpos;
+  Score * SFObjects[MAXOBJECTS];
+  const long long int length = (long long int) strlen(line);
+
+  // Locate the four braces that delimit the two groups.
+  j = 0;
+  for (i = 0; i < length; i++) {
+    if (line[i] == '{' || line[i] == '}') {
+      if (j >= 4) { scoreLineError("more than two {...} groups", line); }
+      rem[j++] = i;
+    }
+  }
+  if (j == 0) { return; }
+  if (j != 4) { scoreLineError("expected {cases}{parameters}", line); }
+
+  //forming cases
+  numCases = 0;
+  for (i = rem[0]; i <= rem[1]; i++) {
+    if (line[i] == '{' || line[i] == '}' || line[i] == ';') {
+      if (numCases > MAXCASES) { scoreLineError("too many cases", line); }
+      remCases[numCases++] = i;
+    }
+  }
+  numCases--;
+
+  for (i = 0; i < numCases; i++) {
+    // The file order is up, relative position, down.
+    if (sscanf(&line[remCases[i]+1], "%c %c %c", &updir, &relpos, &downdir) != 3) {
+      scoreLineError("incomplete case", line);
+    }
+    if (DEBUG) { fprintf(stderr,"\n%c %c %c",updir,downdir,relpos); }
+    cases[i] = charToCase(updir)<<UPSTRANDSHIFT | charToCase(downdir)<<DOWNSTRANDSHIFT | charToCase(relpos)<<RELPOSSHIFT;
+  }
+
+  numObjects = 0;
+  for (i = rem[2]; i <= rem[3]; i++) {
+    if (line[i] == '{' || line[i] == '}' || line[i] == ';') {
+      if (numObjects > MAXOBJECTS) { scoreLineError("too many parameter groups", line); }
+      remObjects[numObjects++] = i;
+    }
+  }
+  numObjects--;
+
+  for (i = 0; i < numObjects; i++) {
+    if (sscanf(&line[remObjects[i]+1], "%f %f %f %f", &objects[i][0], &objects[i][1], &objects[i][2], &objects[i][3]) != 4) {
+      scoreLineError("parameter group needs four numbers", line);
+    }
+    if (DEBUG) { fprintf(stderr,"\t%f %f %f %f\n",objects[i][0],objects[i][1],objects[i][2],objects[i][3]); }
+    SFObjects[i] = new Score(objects[i][0], objects[i][2], objects[i][3], objects[i][1]);
+  }
+
+  // Every case shares the same Score objects.
+  for (i = 0; i < numCases; i++) {
+    for (j = 0; j < numObjects; j++) {
+      scoreFunctions[cases[i]].push_back(SFObjects[j]);
+    }
+  }
+}
+
+
+// Maps one character of a score-file case to its numeric code: '+' and '-'
+// are strands, 'R', 'L' and 'U' are relative positions.  Any other character
+// is reported on stderr and ends the process with status 0.
+long long int charToCase(char in) {
+  if (in == '+') { return POSITIVE; }
+  if (in == '-') { return NEGATIVE; }
+  if (in == 'R') { return RIGHT; }
+  if (in == 'L') { return LEFT; }
+  if (in == 'U') { return UNRELATED; }
+
+  fprintf(stderr,"\n Unrecognisable character in score file");
+  exit(0);
+}
+
+
+// Best score of chaining `down` after `up` over all score functions that are
+// registered for case `ret_case`.
+//
+// Fragments whose nameIter differ can only be linked under the UNRELATED
+// case; for any other case NEGINF is returned at once.  When no registered
+// function yields more than NEGINF, a notice is printed to stdout and NEGINF
+// is returned.
+float scoreAll(Fragment * up, Fragment * down, long long int ret_case) {
+  //MUKCHECK HOPE THIS WORKS
+  if (up->nameIter != down->nameIter && ret_case >> RELPOSSHIFT != UNRELATED) {
+    return NEGINF;
+  }
+
+// TODO TODO TODO
+  float best = NEGINF;
+  vector<class Score*> & candidates = scoreFunctions[ret_case];
+
+  for (unsigned long long int k = 0; k < candidates.size(); k++) {
+    const float candidate = candidates[k]->getScore(up, down);
+    if (candidate > best) {
+      best = candidate;
+    }
+  }
+
+  if (best == NEGINF) {
+    printf("Score function case not handled::%lld\n",ret_case);
+  }
+  return best;
+}
+
+
+// Larger of the two arguments.
+long long int Mymax(long long int a, long long int b) {
+  if (a < b) {
+    return b;
+  }
+  return a;
+}
+
+
+// Smaller of the two arguments.
+long long int Mymin(long long int a,long long int b) {
+  if (a > b) {
+    return b;
+  }
+  return a;
+}
+
+
+// Absolute value of the argument.
+long long int Myabs(long long int a) {
+  if (a < 0) {
+    return -a;
+  }
+  return a;
+}
+
+
+// Offers `owner` as predecessor of `current` and returns current's resulting
+// total score.
+//
+// rightInfluenceFlag selects the score case: 3 builds an UNRELATED case from
+// the two strands, TRUE uses RightInfluence->scoreIndex, anything else uses
+// LeftInfluence->scoreIndex.  Only the structure selected that way is
+// dereferenced.
+//
+// If the link cannot be scored (NEGINF), `owner` is still recorded as back
+// pointer when current's total is not positive, and the total becomes
+// current's own score.  Otherwise the link is taken only when it improves
+// current's total.
+float fragmentSetScore(Fragment * current, Fragment *owner, LI *LeftInfluence, RI * RightInfluence, long long int rightInfluenceFlag) {
+  /*SLAGANCHANGE change call to the score based on the Leftinfluence, this has to be passed i guess*/
+  long long int scoreCase;
+
+  switch (rightInfluenceFlag) {
+    case 3:
+      scoreCase = current->strand << DOWNSTRANDSHIFT | owner->strand <<UPSTRANDSHIFT | UNRELATED<< RELPOSSHIFT;
+      break;
+    case TRUE:
+      scoreCase = RightInfluence->scoreIndex;
+      break;
+    default:
+      scoreCase = LeftInfluence->scoreIndex;
+      break;
+  }
+
+  const float linkScore = scoreAll(owner, current, scoreCase);
+
+  if (linkScore == NEGINF) { // TODO
+    if (current->totalScore <= 0) {
+      current->totalScore = current->score;
+      current->back = owner;
+    }
+  } else if (linkScore + current->score > current->totalScore) {
+    current->totalScore = linkScore + current->score;
+    current->back = owner;
+  }
+
+  return current->totalScore;
+}
diff --git a/src/glocal/score.h b/src/glocal/score.h
new file mode 100755
index 0000000..f1420d5
--- /dev/null
+++ b/src/glocal/score.h
@@ -0,0 +1,39 @@
+#ifndef SCORE
+#define SCORE
+
+#include <structs.h>
+#include <glocal.h>
+
+#define MAXCASES 20
+#define MAXOBJECTS 10
+
+struct LI;
+struct RI;
+
+class ScoreInterface { // Holds the four constants of a score function; meant to be used through a derived class.
+ protected: // everything below, including the constructor, is reachable from derived classes only
+ float openConstant,minConstant,maxConstant,diagConstant; // used by Score::getScore as opening, min-gap, max-gap and diagonal-difference constants
+
+ ScoreInterface (float iopenConstant , float iminConstant ,float imaxConstant,float idiagConstant); // stores the four arguments in the members above
+ float getScore(Fragment *up, Fragment * down){return -1;}; // placeholder returning -1; not virtual, so Score::getScore hides it rather than overriding it
+};
+
+
+class Score :public ScoreInterface { // Concrete score function; instances are stored in the scoreFunctions vectors.
+ public:
+ Score(float iopenConstant , float iminConstant ,float imaxConstant,float idiagConstant); // forwards all four constants to ScoreInterface
+
+ float getScore(Fragment *up, Fragment * down); // score of chaining `down` after `up`; defined in score.cpp
+};
+
+
+void initScoreFunctionPointers(char *scoreFileName);
+void createScoreFunctionObjects(char * line);
+long long int charToCase(char in);
+float scoreAll(Fragment *up,Fragment *down, long long int ret_case);
+long long int Myabs(long long int a);
+long long int Mymin(long long int a,long long int b);
+long long int Mymax(long long int a,long long int b);
+float fragmentSetScore(Fragment * current,Fragment *owner,LI *LeftInfluence, RI * RightInfluence,long long int rightInfluenceFlag);
+
+#endif
diff --git a/src/glocal/structs.h b/src/glocal/structs.h
new file mode 100755
index 0000000..cce0483
--- /dev/null
+++ b/src/glocal/structs.h
@@ -0,0 +1,92 @@
+#ifndef STRUCTS
+#define STRUCTS
+
+//general defines
+#include <stdio.h>
+#include <vector>
+#include <map>
+#include <set>
+#include <stdlib.h>
+#include <iostream>
+#include <limits.h>
+#include <list>
+#include <string.h>
+
+using namespace std;
+
+#define RIGHT 0
+#define LEFT 1
+#define UNRELATED 2
+
+#define NEGINF LLONG_MIN
+
+// Widths, in bits, of the three fields packed into a score-case index.
+#define UPSTRANDBITS 3
+#define DOWNSTRANDBITS 3
+#define RELPOSBITS 3
+
+
+// Bit offsets of those fields.  The sums are parenthesised so that the macros
+// expand to a single operand in any expression (for example `x * RELPOSSHIFT`
+// or `a >> RELPOSSHIFT != b`), not only next to shift operators.
+#define UPSTRANDSHIFT 0
+#define DOWNSTRANDSHIFT UPSTRANDBITS
+#define RELPOSSHIFT (UPSTRANDBITS + DOWNSTRANDBITS)
+#define TOTALSHIFT (UPSTRANDBITS + DOWNSTRANDBITS + RELPOSBITS)
+
+#define POSITIVE 1
+#define NEGATIVE 0
+#define CUTOFF 0
+
+#define TRUE 1
+#define FALSE 0
+
+#define INF LLONG_MAX
+#define MIN LLONG_MIN
+#define NAMESIZE 100
+
+
+// Orders C strings by content (strcmp) instead of by pointer value;
+// comparator of the Name map.
+struct ltstr {
+  bool operator() (const char* s1, const char* s2) const {
+    const int order = strcmp(s1, s2);
+    return order < 0;
+  }
+};
+
+
+typedef map<const char*,long long int ,ltstr> Name;
+
+
+typedef struct Fragment { // One local alignment hit, plus the chaining state attached to it.
+ long long int seq1Start,seq2Start,seq1End,seq2End; // start/end coordinates of the hit in the two sequences
+ char strand; // shifted into the score-case index by fragmentSetScore
+ float score; // the fragment's own score; initRI uses -1 and -2 to mark its sentinel fragments
+ float totalScore; // score of the best chain ending here, as maintained by fragmentSetScore
+ struct Fragment *back; // predecessor chosen by fragmentSetScore
+ char deleted; // not referenced in the code visible in this patch section
+ char seq1Name[NAMESIZE];
+ Name::iterator nameIter; // scoreAll refuses to link fragments whose nameIter differ unless the case is UNRELATED
+ char seq2Name[NAMESIZE];
+ long long int base; // not referenced in the code visible in this patch section
+ long long int getSeq2End(long long int reflectFlag){ return this->seq2End*((reflectFlag == TRUE)?(-1): 1);}; // seq2End, negated when reflectFlag == TRUE
+ long long int getSeq2Start(long long int reflectFlag){return this->seq2Start*((reflectFlag == TRUE)?(-1): 1);}; // seq2Start, negated when reflectFlag == TRUE
+} Fragment;
+
+
+typedef struct HitLocationList { // Linked-list node describing one hit; not used by the code visible in this patch section, so the notes below are read off the field names -- TODO confirm against its users.
+ long long int seq1start;
+ long long int seq2start;
+ long long int seq1end;
+ long long int seq2end;
+ float score;
+ char strand;
+ struct HitLocationList *next; // presumably the next node of the list
+ struct HitLocationList *bkptr; // presumably a back pointer to a predecessor hit
+ float scoreSoFar; // presumably an accumulated score
+ char seq1Name[NAMESIZE];
+ char seq2Name[NAMESIZE];
+} hll;
+
+
+
+typedef struct Point { // Coordinate pair used as key of the InterPoint multimap.
+ long long int seq1,seq2; // paircomp orders points by seq1 first, then by seq2
+ Fragment *frag; // takes no part in paircomp's ordering
+} Point;
+
+#endif
diff --git a/src/glocal/test.score b/src/glocal/test.score
new file mode 100755
index 0000000..da01ef8
--- /dev/null
+++ b/src/glocal/test.score
@@ -0,0 +1,5 @@
+{+R+;-L-}{0 0.02 0 0;40000 0 0 0}
+{+R-;-L+}{3000 0.02 0.1 0;40000 0 0 0}
+{-R+;+L-}{7000 0.02 0.5 0;40000 0 0 0}
+{+L+;-R-}{7000 0.02 0.5 0;40000 0 0 0}
+{+U+;+U-;-U+;-U-}{30000 0 0 0}
diff --git a/src/lagan.pl b/src/lagan.pl
new file mode 100755
index 0000000..19e53fa
--- /dev/null
+++ b/src/lagan.pl
@@ -0,0 +1,242 @@
+#!/usr/bin/env perl
+
+$lagandir = $ENV{LAGAN_DIR}; # installation directory; every helper program below is started from here
+$consrate = 45; # second argument of most utils/scorealign calls below
+$consupperrate = 65; # scorealign argument used for the first check when the .info file is written
+
+if (@ARGV < 2) { # the two sequence files are mandatory
+ print ("usage:\n lagan seqfile1 seqfile2 [-chaos \"chaos flags\"] [-order \"order flags\"] [-recurse \"(wl1,nd1,co1,rsc1),(wl2,nd2,co2,rsc2),...\"] [-bin] [-mfa] [-out \"filename\"] [-lazy] [-maskedonly] [-debug] [-usebounds] [-rc] [-translate] [-draft] [-info] [-fastreject]\n");
+ exit(1);
+}
+
+$firstName = $ARGV[0];
+$secondName = $ARGV[1]; # replaced by the .rc file under -rc and by merged_seq.fa under -draft
+$rcFlag = 0; # set to 1 by -rc
+$arglist = ""; # options accumulated for rechaos.pl
+$contigflag = 0; # set to 1 by -usebounds
+$infofile = 0; # counter: +1 for -info, +1 for -out, later + $okformat; the .info file is written only when it equals 3
+$okformat = 0; # set to 1 by -bin, -mfa, -xmfa and -fastreject
+$binfile = 0; # set to 1 by -bin
+$infofilename = "alignment"; # overwritten with the -out file name
+$direction = "+"; # becomes "-" under -rc; written into the .info line
+$gfc = " -gfc "; # always passed to rechaos.pl
+$rundraft = 0; # set to 1 by -draft
+$draftparams = ""; # options accumulated for draft.pl
+$dofastreject = 0; # set to 1 by -fastreject
+$doxmfa = 0;
+$filename = ""; # output file given with -out; empty means print to stdout
+$format = ""; # set to "-bin" later when no output format was requested
+
+# Parse the optional arguments that follow the two sequence files.
+#
+# Options are recognised by unanchored pattern match, exactly as before, so the
+# order of the tests matters: an option whose name contains another option's
+# name must be tested first.  "-xmfa" therefore precedes "-mfa", and
+# "-draftreject" / "-draftskipfr" precede "-draft"; previously "-xmfa" was
+# swallowed by the "-mfa" test and "-draftskipfr" by the "-draft" test.
+#
+# Options that take a value consume the following element(s) of @ARGV with
+# ++$i.  ("$ARGV[$++i]", as it used to be written for -cons and -draftskipfr,
+# does not advance $i: Perl reads "$+" there, the last-paren-match variable.)
+for ($i = 2; $i < @ARGV; $i++) {
+    if ($ARGV[$i] =~ /-order/) {
+        $orderfl = $ARGV[++$i];
+    }
+    elsif ($ARGV[$i] =~ /-bin/) {
+        $orderfl = $orderfl." -bin";
+        $binfile = 1;
+        $okformat = 1;
+    }
+    elsif ($ARGV[$i] =~ /-info/) {
+        $infofile++;
+    }
+    elsif ($ARGV[$i] =~ /-xmfa/) {
+        $orderfl = $orderfl." -xmfa";
+        $doxmfa = 1;
+        $okformat = 1;
+    }
+    elsif ($ARGV[$i] =~ /-mfa/) {
+        $orderfl = $orderfl." -mfa";
+        $okformat = 1;
+    }
+    elsif ($ARGV[$i] =~ /-out/) {
+        $filename = $ARGV[++$i];
+        $infofile++;
+        $infofilename = $ARGV[$i];
+    }
+    elsif (($ARGV[$i] =~ /-gs/) || ($ARGV[$i] =~ /-gc/) || ($ARGV[$i] =~ /-mt/) || ($ARGV[$i] =~ /-ms/) || ($ARGV[$i] =~ /-bw/)){
+        # passed through to order together with its value
+        $orderfl = $orderfl." ".$ARGV[$i];
+        $orderfl = $orderfl." ".$ARGV[++$i];
+    }
+    elsif ($ARGV[$i] =~ /-s1/) {
+        # takes two values
+        $orderfl = $orderfl." -s1 $ARGV[++$i]";
+        $orderfl = $orderfl." ".$ARGV[++$i];
+    }
+    elsif ($ARGV[$i] =~ /-maskedonly/) {
+        $arglist = $arglist." -maskedonly";
+    }
+    elsif ($ARGV[$i] =~ /-translate/) {
+        $arglist = $arglist." -translate";
+        $draftparams = $draftparams." -translate";
+    }
+    elsif ($ARGV[$i] =~ /-fastreject/) {
+        $arglist = $arglist." -fastreject";
+        $dofastreject = 1;
+        $doxmfa = 1;
+        $okformat = 1;
+    }
+    elsif ($ARGV[$i] =~ /-draftreject/) {
+        $draftparams = $draftparams." -fastreject";
+    }
+    elsif ($ARGV[$i] =~ /-gap/) {
+        # takes two values
+        $arglist = $arglist." -gap ".$ARGV[++$i];
+        $arglist = $arglist." ".$ARGV[++$i];
+    }
+    elsif ($ARGV[$i] =~ /-recurse/) {
+        $arglist = $arglist." -recurse \"".$ARGV[++$i]."\"";
+    }
+    elsif ($ARGV[$i] =~ /-chaos/) {
+        $arglist = $arglist." -chaos \"".$ARGV[++$i]."\"";
+    }
+    elsif ($ARGV[$i] =~ /-usebounds/) {
+        $contigflag = 1;
+    }
+    elsif ($ARGV[$i] =~ /-rc/) {
+        # Align against the reverse complement of the second sequence (and
+        # of its .masked companion, when one exists).
+        `$lagandir/utils/rc < $ARGV[1] > $ARGV[1].rc`;
+        if ($?) { exit(1); }
+        $secondName = "$ARGV[1].rc";
+        if (-e "$ARGV[1].masked") {
+            `$lagandir/utils/rc < $ARGV[1].masked > $ARGV[1].rc.masked`;
+            if ($?) { exit(1);}
+        }
+        $rcFlag = 1;
+        $direction = "-";
+    }
+    elsif ($ARGV[$i] =~ /-draftskipfr/){
+        $draftparams = $draftparams." -skipfr ".$ARGV[++$i];
+    }
+    elsif ($ARGV[$i] =~ /-draft/){
+        $rundraft = 1;
+    }
+    elsif ($ARGV[$i] =~ /-cons/){
+        $draftparams = $draftparams." -cons ".$ARGV[++$i];
+    }
+    elsif ($ARGV[$i] =~ /-lazy/){
+        # The usage text lists -lazy as a plain switch.  The handler used to
+        # be a copy of the -cons handler and appended " -cons <seqfile1>".
+        # NOTE(review): forwarding the switch to draft.pl is the presumed
+        # intent -- confirm that draft.pl accepts -lazy.
+        $draftparams = $draftparams." -lazy";
+    }
+
+    else {
+        print "Invalid option for lagan: $ARGV[$i]";
+        exit(1);
+    }
+}
+
+$arglist = $arglist." -ext "; # always appended to the rechaos.pl options
+
+if ($rundraft){ # -draft: run draft.pl first and align against the file it leaves behind
+ `$lagandir/draft.pl $firstName $secondName $draftparams`;
+ if ($?) { exit(1);}
+ $secondName = "merged_seq.fa";
+}
+
+# print STDERR "perl $lagandir/rechaos.pl $firstName $secondName $gfc $arglist > $$.anchs.final\n";
+`perl $lagandir/rechaos.pl $firstName $secondName $gfc $arglist > $$.anchs.final`; # anchors go to a temp file named after this process id
+
+$ex_val = $? >> 8; # exit status of rechaos.pl
+if ($ex_val == 3) { exit(0); } # status 3 from rechaos.pl ends the run quietly with success
+
+if ($ex_val) { exit(1); }
+if ($contigflag){ # -usebounds: let utils/getbounds choose the region, then print order's output directly
+ @bounds = `$lagandir/utils/getbounds $$.anchs.final $firstName $secondName`;
+ if ($?) { exit(1); }
+ chomp $bounds[0];
+ print STDERR ("Aligning with bounds: $bounds[0]\n");
+ print `$lagandir/order $firstName $secondName $bounds[0] $orderfl -anc $$.anchs.final`;
+ if ($?) { exit(1); }
+}
+else {
+ if ($dofastreject){
+ if (!$filename) {
+ print STDERR "-fastreject requires -out filename!\n";
+ exit(1);
+ }
+ open(SFILE, "$$.anchs.final"); # NOTE(review): result of open is not checked
+ @anchors = <SFILE>;
+ close(SFILE);
+
+ $anchors[0] =~ /\((\d+) (\d+)\)=\((\d+) (\d+)\) (.*)/; # NOTE(review): match result unchecked; an empty or malformed anchor file leaves $1..$4 stale
+ $end1 = $1 - 1; # the first anchor line supplies the two end bounds
+ $end2 = $3 - 1;
+ $anchors[@anchors - 1] =~ /\((\d+) (\d+)\)=\((\d+) (\d+)\) (.*)/;
+ $start1 = $2 + 1; # the last anchor line supplies the two start bounds
+ $start2 = $4 + 1;
+ $bounds = "-s1 $start1 $end1 -s2 $start2 $end2 ";
+
+ @anchors = 0; # leaves a one-element list (0), not an empty array
+ $orderfl = $bounds.$orderfl." -xmfa";
+ }
+ if (!$okformat) { # no format requested: have order write binary and convert it below
+ $format = "-bin";
+ }
+
+ `$lagandir/order $firstName $secondName $format -out $$.align $orderfl -anc $$.anchs.final`;
+ if ($?) { exit(1); }
+
+ if (!$okformat) { # default output: convert the binary alignment with bin2bl
+ if ($filename) {
+ `$lagandir/utils/bin2bl $$.align > $filename`;
+ }
+ else {
+ print `$lagandir/utils/bin2bl $$.align`;
+ }
+ }
+ else { # a format was requested: hand the alignment over unchanged
+ if ($filename) {
+ `cat $$.align > $filename`;
+ }
+ else {
+ print `cat $$.align`;
+ }
+ }
+ if ($dofastreject){ # post-process the output file in place with scorealign
+ `$lagandir/utils/scorealign $filename $consrate -ibounds -cropxmfa > $$.temp`;
+ if ($?) { exit(1); }
+ `mv $$.temp $filename`;
+ }
+}
+
+$infofile += $okformat;
+if ($infofile == 3){ # reached only with -info, -out and an explicit output format together
+ open (INFOFILE, ">$infofilename.info"); # NOTE(review): result of open is not checked
+ if ($binfile){ # binary output: make an .mfa copy for scorealign to read
+ `$lagandir/utils/bin2mf $infofilename > $infofilename.mfa`;
+ if ($?) { exit(1); }
+ $infofilename = $infofilename.".mfa";
+ }
+ @temp = `head $secondName`;
+ if ($?) { exit(1); }
+ chomp $temp[0]; $temp[0] = substr $temp[0], 1; # first line of the second sequence file without its first character (presumably the FASTA '>')
+ print INFOFILE "$temp[0]\n";
+
+ $len = `$lagandir/utils/getlength $secondName`; chomp $len;
+ if ($?) { exit(2); } # the exit codes 2..6 below tell the failing helper calls apart
+ $first = $last = $first2 = $last2 = -1; # -1 is what gets reported when no bounds are computed
+
+ $score = `$lagandir/utils/scorealign $infofilename $consupperrate`; chomp $score;
+ if ($?) { exit(3); }
+ if ($score > 0){ # only then re-score with $consrate and fetch the bounds
+ $score = `$lagandir/utils/scorealign $infofilename $consrate`; chomp $score;
+ if ($?) { exit(4); }
+ @temp = `$lagandir/utils/scorealign $infofilename $consrate -bounds 0`;
+ if ($?) { exit(5); }
+ $temp[0] =~ /(.*) (.*)/; # NOTE(review): match result unchecked
+ $first = $1; $last = $2;
+
+ @temp = `$lagandir/utils/scorealign $infofilename $consrate -bounds 1`;
+ if ($?) { exit(6); }
+ $temp[0] =~ /(.*) (.*)/; # NOTE(review): match result unchecked
+ $first2 = $1; $last2 = $2;
+ }
+
+ print INFOFILE "1 $first $last 1 $len 0 0 $direction $score $first2 $last2\n";
+
+ close (INFOFILE);
+# `$lagandir/utils/rm $infofilename` if ($binfile);
+}
+
+# Clean up.  The flag set by -rc is $rcFlag; the test used to read $rcflag,
+# which is never assigned, so the reverse-complement file was never removed.
+`rm $secondName` if ($rcFlag);
+# Remove this run's temporary files, which are all named after the process id.
+`rm $$.*`;
+if ($?) { exit(1); }
+
+exit(0);
+
+
diff --git a/src/lagan2mfa.cpp b/src/lagan2mfa.cpp
new file mode 100644
index 0000000..4050fb7
--- /dev/null
+++ b/src/lagan2mfa.cpp
@@ -0,0 +1,95 @@
+#include <iostream>
+#include <string>
+#include <vector>
+#include <map>
+
+#include <stdlib.h>
+#include <stdio.h>
+
+using namespace std;
+
+// TODO refactor in classes and normal make project
+
+#include "util.cpp"
+#include "faindex.cpp"
+
+FaIndex faIndex;
+
+void writeSeq(FILE *f,char* seq,int start,int end) {
+ // Writes seq[start-1 .. end-1] (start and end are 1-based, inclusive) in rows of fastaRowLength letters.
+ int col=0;
+ for (int pos=start-1;pos<end;pos++) {
+ fputc(seq[pos],f);
+ // Break the row as soon as it is full.
+ if (++col==fastaRowLength) {
+ col=0;
+ fputc('\n',f);
+ }
+ }
+ // A partially filled last row still gets its newline.
+ if (col>0) fputc('\n',f);
+}
+
+
+int main (int argc,char* argv[]) {
+ char buf[bufSize];
+ // Every input row carries three (organism, name, start, end, strand) groups; only group 0 is rewritten.
+ char org0[1000];
+ char name0[1000];
+ int start0;
+ int end0;
+ char strand0;
+
+ char org1[1000];
+ char name1[1000];
+ int start1;
+ int end1;
+ char strand1;
+
+ char org2[1000];
+ char name2[1000];
+ int start2;
+ int end2;
+ char strand2;
+
+ int proto=1;
+
+ string id;
+ string name;
+ char* seq;
+ // Options: -o fasta output, -c rewritten rows, -m input rows, -p copies per row, -i index for readFaIndex.
+ FILE *out=openFile(getArg("-o",argc,argv),"w");
+ FILE *chunk=openFile(getArg("-c",argc,argv),"w");
+ FILE *in=openFile(getArg("-m",argc,argv),"r");
+ proto=atoi(getArg("-p",argc,argv).c_str());
+ readFaIndex(faIndex,getArg("-i",argc,argv));
+ // Loop on fgets itself: it ends at EOF and also on a read error, where a feof() test would spin forever.
+ while (fgets(buf,bufSize,in)!=NULL) {
+ if (strlen(buf)==0) continue;
+
+ // %999s keeps each token inside its 1000-byte buffer. A row that does not yield all 15 fields is
+ // skipped rather than processed with stale values; start0<1 would make writeSeq index before seq.
+ int got=sscanf(buf,"%999s %999s %d %d %c %999s %999s %d %d %c %999s %999s %d %d %c",
+ org0,name0,&start0,&end0,&strand0,org1,name1,&start1,&end1,&strand1,org2,name2,&start2,&end2,&strand2);
+ if (got!=15 || start0<1) { if (got>0) fprintf(stderr,"WARNING: skipping malformed row: %s",buf); continue; }
+ name=org0;
+ name=name+"-anc"+name0;
+ // One fasta record per copy n=1..proto, fetched from the index under the id "<name0>:<n>".
+ for (int n=1;n<=proto;n++) {
+ id=name0;
+ id=id+":"+itoa(n);
+ seq=getFaIndexSeq(faIndex,id);
+ fprintf(out,">%s\n",name.c_str());
+ writeSeq(out,seq,start0,end0);
+ free(seq);
+ }
+ end0=end0-start0+1; // group 0 now describes the extracted piece: positions 1..length
+ start0=1;
+
+ fprintf(chunk,"%s %s %d %d %c %s %s %d %d %c %s %s %d %d %c\n",org0,name.c_str(),start0,end0,strand0,org1,name1,start1,end1,strand1,org2,name2,start2,end2,strand2);
+ }
+ fclose(in);
+ fclose(out);
+ fclose(chunk);
+ return 0;
+}
diff --git a/src/makecons.cpp b/src/makecons.cpp
new file mode 100644
index 0000000..19beb8b
--- /dev/null
+++ b/src/makecons.cpp
@@ -0,0 +1,220 @@
+/**
+ * @file
+ *
+ * [TODO]
+ *
+ * @author Mikhail Soloviev
+ * @date 31.03.2006
+ * @version 1.0
+ *
+ */
+
+//#include <iostream>
+//#include <string>
+
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <time.h>
+
+using namespace std;
+
+#define fastaRowLength 50
+#define bufSize 2000
+
+typedef char* pchar;
+
+int isArg(char* key,int argc, char* argv[]) {
+ // Returns 1 when some argv entry (argv[0] included) equals key exactly, else 0.
+ int found=0;
+ for (int n=0;n<argc && !found;n++) found=(strcmp(key,argv[n])==0);
+ return found;
+}
+
+char* getArg(char* key,int argc, char* argv[]) {
+ // An option's value is the argument right after its key, so a key in last place has none.
+ for (int n=0;n+1<argc;n++) if (!strcmp(key,argv[n])) return argv[n+1];
+ // Key missing (or without a value): report and terminate.
+ fprintf(stderr,"ERROR: Parameter for option '%s' not specified\n",key);
+ exit(1);
+ return NULL;
+}
+
+int trim(char* s) { // strips trailing '\n' and '\r' in place and returns the remaining length
+ char* end=s+strlen(s);
+ while (end>s && (end[-1]=='\n' || end[-1]=='\r')) *--end='\0';
+ return (int)(end-s);
+}
+
+FILE* openFile(char* path,char* mode) { // fopen wrapper that terminates with exit(1) instead of returning NULL
+ FILE *f=fopen(path,mode);
+ if (f==NULL) {
+ fprintf(stderr,"ERROR: Failed open file: %s\n",path); // diagnostics go to stderr, as in getArg
+ exit(1);
+ }
+ return f;
+}
+
+char* loadSeq(FILE *f,char* annot,int& seqLen) { // reads sequence rows from f up to the next '>' header or EOF
+ char* seq=NULL; // stays NULL when no sequence row is seen
+ char buf[bufSize];
+ int bufLen=0;
+ seqLen=0;
+ while (!feof(f)) {
+ buf[0]='\0'; // so that a failed fgets reads as an empty row
+ fgets(buf,bufSize,f);
+ bufLen=trim(buf);
+ if (bufLen>0) { // rows that are empty after trimming are ignored
+ if (buf[0]=='>') {
+ strcpy(annot,buf); // hand the header that ended this sequence back to the caller
+ break;
+ }
+ else {
+ if (seqLen==0) seq=(char*)malloc(sizeof(char)*bufLen);
+ else seq=(char*)realloc(seq,sizeof(char)*(seqLen+bufLen));
+ memcpy(&seq[seqLen],buf,bufLen); // append the row; the result is not NUL-terminated
+ seqLen+=bufLen;
+ }
+ }
+ }
+ return seq; // heap buffer holding seqLen letters (NULL if none); main() releases it with free()
+}
+
+void writeSeq(FILE *f,char* seq,int len) {
+ // Prints len letters of seq, fastaRowLength per row; a shorter last row is newline-terminated too.
+ int col=0;
+ for (int i=0;i<len;i++) {
+ fputc(seq[i],f);
+ if (++col==fastaRowLength) {
+ col=0;
+ fputc('\n',f);
+ }
+ }
+ if (col>0) fputc('\n',f);
+}
+
+/*
+char* makeCons(char* seq1,char* seq2,int len) {
+ char* cons=seq1;
+ char ch=' ';
+ for (int i=0;i<len;i++,seq1++,seq2++) {
+ if (*seq1=='-') {
+ *seq1=*seq2;
+ }
+ else if (toupper(*seq1)=='N') {
+ if (*seq2!='-') *seq1=*seq2;
+ }
+ else if (toupper(*seq1)==toupper(*seq2)) {
+ if (islower(*seq1)) *seq1=*seq2;
+ }
+ else {
+ ch=(rand()&1)?*seq1:*seq2;
+ if (isupper(*seq1) || isupper(*seq2)) *seq1=toupper(ch); else *seq1=ch;
+ }
+ }
+ return cons;
+}
+*/
+
+/*
+void makeCons(char seq1[],char seq2[],char cons[],int len) {
+ for (int i=0;i<len;i++) {
+ if (seq1[i]=='-') {
+ cons[i]=seq2[i];
+ }
+ else if (seq2[i]=='-') {
+ cons[i]=seq1[i];
+ }
+ else if (toupper(seq1[i])=='N') {
+ cons[i]=seq2[i];
+ }
+ else if (toupper(seq2[i])=='N') {
+ cons[i]=seq1[i];
+ }
+ else if (toupper(seq1[i])==toupper(seq2[i])) {
+ cons[i]=isupper(seq1[i])?seq1[i]:seq2[i];
+ }
+ else {
+ cons[i]=(rand()&1)?seq1[i]:seq2[i];
+ if (isupper(seq1[i]) || isupper(seq2[i])) cons[i]=toupper(cons[i]);
+ }
+ }
+}
+*/
+
+char dna[]={'N','A','C','G','T'};
+// Base codes 1..4 index dna[] as A, C, G, T; code 0 ('N') never takes part in the vote.
+int findMaxLetter(int count[],char* letter) {
+ int best=0, found=0;
+ for (int code=1;code<5;code++) best=(count[code]>best)?count[code]:best;
+ // Every base that reaches the highest count is stored in letter[], in A,C,G,T order.
+ for (int code=1;code<5;code++) if (count[code]==best) letter[found++]=dna[code];
+ return found;
+}
+
+char makeConsLetter(char letter[],int proto) {
+ // letter[0..proto-1] are base codes used as indexes into count[] (makeCons supplies 0..4).
+ int count[5]={0,0,0,0,0};
+ char maxLetter[5];
+ for (int i=0;i<proto;i++) count[letter[i]]++;
+
+ // No A, C, G or T among the inputs: the consensus is unknown.
+ if (!(count[1] || count[2] || count[3] || count[4])) return 'N';
+
+ // A single winner is returned as is; a tie is settled by rand().
+ int maxNumber=findMaxLetter(count,maxLetter);
+ if (maxNumber==1) return maxLetter[0];
+ return maxLetter[rand()%maxNumber];
+}
+
+void makeCons(char cons[],char** seq,int proto,int len) {
+ char letter[proto]; // per-column base codes, one per sequence: 1=A 2=C 3=G 4=T, 0=anything else
+ for (int i=0;i<len;i++) {
+ for (int j=0;j<proto;j++) {
+ switch (toupper((unsigned char)seq[j][i])) { // cast: toupper is undefined for negative char values
+ case 'A': letter[j]=1; break;
+ case 'C': letter[j]=2; break;
+ case 'G': letter[j]=3; break;
+ case 'T': letter[j]=4; break;
+ default: letter[j]=0; break;
+ }
+ }
+ cons[i]=makeConsLetter(letter,proto); // consensus letter for column i
+ }
+}
+
+int main (int argc,char* argv[]) {
+ // Options read below: -i input multi-fasta, -o output file, -p number of sequences merged per record.
+ pchar seq[100]; // one slot per sequence of a group, hence the upper bound on -p
+ pchar cons=NULL;
+ int len=0;
+ int proto=0;
+ char annot[2000];
+ char nextAnnot[2000];
+ // Seed rand(), which makeConsLetter uses to settle ties.
+ srand((int)time(NULL));
+ int minLen=0;
+ FILE *out=openFile(getArg("-o",argc,argv),"w");
+ FILE *in=openFile(getArg("-i",argc,argv),"r");
+ proto=atoi(getArg("-p",argc,argv));
+ if (proto<1 || proto>100) { fprintf(stderr,"ERROR: -p must be between 1 and 100\n"); exit(1); }
+ free(loadSeq(in,annot,len)); // consume the first header into annot; letters found before it are dropped
+ // Each pass reads proto sequences and writes their consensus under the header that preceded them.
+ while (!feof(in)) {
+ for (int i=0;i<proto;i++) { seq[i]=loadSeq(in,nextAnnot,len); if (i==0 || len<minLen) minLen=len; }
+ len=minLen; // never read any sequence of the group past the shortest one
+ cons=(char*)malloc(sizeof(char)*len);
+ makeCons(cons,seq,proto,len);
+
+ fprintf(out,"%s\n",annot);
+ writeSeq(out,cons,len);
+
+ strcpy(annot,nextAnnot);
+ for (int i=0;i<proto;i++) free(seq[i]);
+ free(cons);
+ }
+ fclose(in);
+ fclose(out);
+ return 0;
+}
diff --git a/src/mempage.c b/src/mempage.c
new file mode 100644
index 0000000..78aee42
--- /dev/null
+++ b/src/mempage.c
@@ -0,0 +1,55 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#define MINPAGESIZE 1000000
+
+typedef struct MemoryPage {
+ char* memory; // start of this page's buffer
+ int size; // capacity of the buffer in bytes
+ int used; // bytes already handed out by MPmalloc
+ struct MemoryPage* next; // page that was current before this one (singly linked list)
+} mpage;
+
+
+mpage* globalpage = 0; // current page; MPmalloc allocates from it, initMP pushes a new one in front
+
+void initMP(int pagesize) {
+ mpage* newpage; // becomes the new head of the page list, with at least MINPAGESIZE bytes
+ if (pagesize < MINPAGESIZE)
+ pagesize = MINPAGESIZE;
+ // Fail loudly: a NULL page would otherwise be dereferenced here or by the next MPmalloc.
+ newpage = (mpage*) malloc(sizeof(mpage));
+ if (!newpage || !(newpage->memory = (char*) malloc (pagesize))) { fprintf(stderr, "mempage: out of memory\n"); exit(1); }
+ newpage->next = globalpage;
+ globalpage = newpage;
+ globalpage->used = 0;
+ globalpage->size = pagesize;
+}
+
+void* MPmalloc(int size) {
+ void* tbr; // the next `size` unused bytes of the current page
+ if (!globalpage || globalpage->size - globalpage->used < size) {
+ initMP(size); // first use, or not enough room left: start a new page that fits the request
+ }
+ tbr = globalpage->memory+ globalpage->used; // NOTE(review): no alignment padding between allocations
+ globalpage->used += size;
+ return tbr;
+}
+
+void* MPallfree() {
+ mpage *n;
+ for ( ; globalpage; globalpage = n) { // release every page: buffer first, then its header
+ n = globalpage->next;
+ free (globalpage->memory);
+ free (globalpage);
+ }
+ initMP(0); // leave one empty page of MINPAGESIZE bytes ready for use
+ return 0; // nothing meaningful to hand back; keeps the declared void* result defined
+}
+
+void* MPrealloc(void* prevptr, int prevsize, int newsize) {
+ void* tbr = MPmalloc(newsize);
+ // Copy only what fits: shrinking must not write past the new block. The old block is not reclaimed.
+ if (prevptr && prevsize > 0 && newsize > 0) memcpy(tbr, prevptr, prevsize < newsize ? prevsize : newsize);
+ return tbr;
+}
diff --git a/src/mempage.h b/src/mempage.h
new file mode 100644
index 0000000..ef19c7c
--- /dev/null
+++ b/src/mempage.h
@@ -0,0 +1,34 @@
+#define MINPAGESIZE 256
+
+typedef struct MemoryPage {
+ void* memory; // the page buffer, (re)sized with realloc by the functions below
+ int size; // capacity of the buffer in bytes
+ int used; // bytes already handed out
+ struct MemoryPage* next; // not used by the functions in this header
+} mpage;
+
+
+mpage globalpage;
+
+void* initMP() { // NOTE(review): declared void* but no value is returned
+ globalpage.memory = realloc (globalpage.memory, MINPAGESIZE); // (re)size the single page to MINPAGESIZE bytes
+ globalpage.used = 0;
+ globalpage.size = MINPAGESIZE;
+}
+
+void* MPmalloc(int size) {
+ void* tbr; // grow until the request fits; the previous test was inverted and never grew the page
+ while (globalpage.size - globalpage.used < size)
+ globalpage.memory = realloc (globalpage.memory, (globalpage.size = globalpage.size ? globalpage.size * 2 : MINPAGESIZE));
+ tbr = (char*) globalpage.memory + globalpage.used; // NOTE(review): realloc may move the page, invalidating earlier results
+ globalpage.used += size;
+ return tbr;
+}
+
+void* MPallfree() { // NOTE(review): declared void* but no value is returned
+ globalpage.memory = realloc (globalpage.memory, MINPAGESIZE); // shrink the page back to MINPAGESIZE bytes
+ globalpage.used = 0; // everything handed out so far is considered released
+ globalpage.size = MINPAGESIZE;
+}
+
+
diff --git a/src/mlagan.c b/src/mlagan.c
new file mode 100644
index 0000000..1a62a95
--- /dev/null
+++ b/src/mlagan.c
@@ -0,0 +1,1095 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <ctype.h>
+#include <math.h>
+
+#include "skiplist.h"
+#include "multial.h"
+#include "filebuffer.h"
+
+#define VER_NUM "2.0"
+#define MIN2(x,y) ( (x) >= (y) ? (y) : (x) )
+#define MAX2(x,y) ( (x) >= (y) ? (x) : (y) )
+
+// Global variables
+
+static int nested = 0;
+static int postir = 0;
+static int lazy = 0;
+static int notree = 1;
+static int verbose = 0;
+static int numseqs = 0;
+static int itertimes = 1;
+static int cutoffmatch = 12;
+static int translate = 0;
+static int extend = 1;
+static int fastreject = 0;
+static int gapfreechunks = 0;
+
+static align *simaligns[MAX_SEQ];
+static char* lagan_dir;
+
+static int hptrcomp (const void *p1, const void *p2) {
+ int i = ((hptr*)p1)->number;
+ int j = ((hptr*)p2)->number;
+ int it = ((hptr*)p1)->isstart;
+ int jt = ((hptr*)p2)->isstart;
+ if (i > j)
+ return (1);
+ if (i < j)
+ return (-1);
+ // Same position: starts sort before ends; two events of one kind tie, so cmp(a,b) == -cmp(b,a) holds.
+ if (!it == !jt)
+ return 0;
+ return it ? -1 : 1;
+}
+
+
+void usage(void) { // prints the command-line synopsis and the option list to stdout
+ printf("mlagan seqfile_1 seqfile_2 [... seqfile_%d] [-parameters]\n\n",
+ MAX_SEQ);
+ printf("-nested : runs improvement in a nested fashion\n");
+ printf("-postir : incorporates the final improvement phase\n");
+ printf("-lazy : uses lazy mode\n");
+ printf("-translate : use translated anchors\n");
+ // printf("-ext : extend the anchors\n"); This is now default
+ printf("-fastreject : use fast rejection (tuned for human/mouse or closer)\n");
+ // printf("-gfc : find gap free chunks as anchors\n"); This is currently broken
+ printf("-verbose : give debug output\n");
+ printf("-tree \"(...)\" : runs with given phylogenetic tree\n");
+ printf("-out \"filename\": outputs to filename\n");
+ printf("-nucmatrixfile \"filename\": uses given substitution matrix instead of $LAGAN_DIR/nucmatrix.txt\n");
+ printf("-version : prints version info\n"); // -gapperseq, -overlap and -glwidth are parsed by parseParameters but not listed
+}
+
+seq* readfile(FILE* input) {
+ int seqstart=0;
+ int seqend=0;
+ char* res = (char*) malloc(sizeof(char)*2);
+ int ressize = 2, numread=1; //N at 1st letter
+ char temp[256];
+ seq* myseq = (seq*) malloc(sizeof(seq));
+ int currchar; // int, not char: the EOF returned by fgetc must stay distinguishable from data bytes
+ // Reads one FASTA record. Returns 0 when no header line can be read; exits on a non-FASTA header.
+ res[0] = 'N';
+ if (feof(input) || !fgets(temp, 255, input)) {
+ free(res); free(myseq); return 0; // nothing left to read: release the buffers allocated above
+ }
+ if (temp[0] != '>') {
+ fprintf(stderr, "File is not in FASTA format!!\n");
+ exit(1);
+ }
+
+ myseq->name = (char*) malloc((strlen(temp))*sizeof(char));
+ strcpy(myseq->name, temp+1);
+ myseq->name[strcspn(myseq->name, "\n")] = 0; // cut at the newline if there is one (strchr could return NULL)
+ currchar = fgetc(input);
+ while ((currchar != '>') && (currchar != EOF)) {
+ if (!isspace(currchar)) {
+ currchar = toupper(currchar);
+ if (!strchr(alpha, currchar)) {
+ fprintf(stderr, "Warning: %c converted to 'N'\n", currchar);
+ currchar = 'N';
+ }
+ res[numread++] = currchar;
+ if (numread >= ressize) {
+ res=(char*)realloc(res, sizeof(char)*(ressize*=2));
+ }
+ }
+ currchar = fgetc(input);
+ }
+ if (currchar == '>')
+ ungetc(currchar, input);
+ res[numread]=0;
+ myseq->rptr = res;
+
+ if (seqstart > 0) {
+ res = &res[seqstart-1];
+ res[seqend-seqstart+1] = 0;
+ numread = seqend-seqstart+1;
+ }
+
+ myseq->lets = res;
+ myseq->numlets = numread;
+ // numlets counts the placeholder 'N' stored in res[0] as well.
+ return myseq;
+}
+
+int starts_with(char *str, char *word) {
+ int res; // strcmp-style result: 0 when the first whitespace-delimited token of str equals word
+ char *first_word = (char *)malloc((strlen(str) + 1) * sizeof(char));
+ first_word[0] = 0; // sscanf stores nothing when str is empty or blank
+ sscanf(str, "%s", first_word);
+ res = strcmp(word, first_word);
+ free(first_word); // this buffer used to leak on every call
+ return res;
+}
+
+align* findAlignByName(align *aligns[], char *name) {
+ int i=0;
+ // Substring match (strstr) against the name of each alignment's first sequence; the first hit wins.
+ while(i<numseqs) {
+ if (strstr(aligns[i]->seqs[0]->name, name)) {
+ return(aligns[i]);
+ }
+ i++;
+ }
+ fprintf(stderr, "alignment not found for: %s", name);
+ exit(2); // none of the numseqs entries matched: the program stops here
+ return NULL;
+}
+
+int kk = 0;
+
+void printHLL(hll *myres) {
+ fprintf(stderr, "into %d\n", ++kk); // kk counts the calls made so far
+ fflush(stderr);
+ while(myres) {
+ // One line per list element: "(seq1start seq1end)=(seq2start seq2end) score".
+ fprintf(stderr, "(%d %d)=(%d %d) %f\n",
+ myres->seq1start, myres->seq1end,
+ myres->seq2start, myres->seq2end, myres->score);
+ fflush(stderr);
+ myres=myres->next;
+ }
+}
+
+hll* getAnchsFromFile(char *fname, FileBuffer f1, FileBuffer f2) {
+ FILE *ancfile;
+ hll *myres = 0, *tt = 0, *first = 0;
+ char buff[256];
+ int i=0, j=0;
+ // Reads "<fname>.anchors". Anchors are prepended while reading, so the list is in reverse file order.
+ // With fastreject, the list head and tail tighten the f1/f2 start/end positions and are then removed.
+
+ snprintf(buff, sizeof(buff), "%s.anchors", fname);
+ ancfile=fopen(buff, "r");
+ if(ancfile==NULL) {
+ fprintf(stderr, "anchor file not found:: %s.anchors\n",
+ fname);
+ exit(2);
+ }
+
+ while (!feof(ancfile)) {
+ if (!fgets(buff, 256, ancfile)) {
+ break;
+ }
+ tt = (hll*) malloc(sizeof(hll));
+ if (sscanf(buff, "(%d %d)=(%d %d) %f", &tt->seq1start, &tt->seq1end,
+ &tt->seq2start, &tt->seq2end, &tt->score) != 5) { free(tt); continue; } // skip lines that are not anchors
+ tt->next = myres;
+ i++;
+ myres = tt;
+ }
+ if (fastreject && myres && myres->next) { // needs both end entries; shorter lists are returned untouched
+ f1->startpos = MAX2(f1->startpos, myres->seq1end);
+ f2->startpos = MAX2(f2->startpos, myres->seq2end);
+ for (first = tt = myres; tt->next->next; tt = tt->next) j++; // tt stops on the element before the tail
+ f1->endpos = MIN2(f1->endpos, tt->next->seq1start);
+ f2->endpos = MIN2(f2->endpos, tt->next->seq2start);
+ // Drop head and tail and release them; with only those two entries nothing remains.
+ myres = (tt == first) ? 0 : first->next;
+ free(tt->next);
+ tt->next = 0;
+ free(first);
+ }
+ fprintf(stderr,"read %d anchs\n", i);
+ fclose(ancfile);
+ return myres;
+}
+
+
+
+hll* generateAnchors( FileBuffer a1, FileBuffer a2) {
+ char buff[4096];
+ char fname[248]; // "<fname>.anchors" must still fit the 256-byte buffer used by getAnchsFromFile
+ char *name1, *name2;
+ char *endpnt;
+ int diff1, diff2;
+ int retstat;
+ const int maxpart = (sizeof(fname) - 1) / 2; // cap for each half of the generated name
+
+ // Runs rechaos.pl on the two files and loads the anchors it writes to "<fname>.anchors".
+ // With fastreject set, exit status 3 from rechaos gives a 0 result; any other failure is fatal.
+ name1 = strrchr (a1->filename, '/');
+ if (!name1) name1 = a1->filename;
+ else name1++;
+ name2 = strrchr (a2->filename, '/');
+ if (!name2) name2 = a2->filename;
+ else name2++;
+ endpnt = strchr ( name1, '.');
+ diff1 = (endpnt)? endpnt - name1: strlen(name1);
+ endpnt = strchr ( name2, '.');
+ diff2 = (endpnt)? endpnt - name2: strlen(name2);
+ if (diff1 > maxpart) diff1 = maxpart; // long base names are cut instead of overflowing fname
+ if (diff2 > maxpart) diff2 = maxpart;
+ strncpy (fname, name1, diff1);
+ strncpy (fname+diff1, name2, diff2);
+ fname[diff1+diff2] = 0;
+ retstat = snprintf(buff, sizeof(buff), "%s/rechaos.pl %s %s -out %s.anchors %s %s %s %s %s\n",
+ lagan_dir,
+ a1->filename,
+ a2->filename,
+ fname,
+ (extend ? "-ext" : ""),
+ (translate ? "-translate" : ""),
+ (fastreject ? "-fastreject" : ""),
+ (gapfreechunks ? "-gfc" : ""),
+ (lazy ? "-lazy" : ""));
+ if (retstat < 0 || retstat >= (int) sizeof(buff)) { fprintf (stderr, "rechaos command line too long\n"); exit (1); }
+ retstat = system(buff);
+ retstat = (retstat == -1 || (retstat & 0x7f)) ? -1 : ((retstat >> 8) & 0xff); // death by signal is a failure, not status 0
+ if (fastreject && (retstat == 3)) {
+ return 0;
+ }
+ else if (retstat) {
+ fprintf (stderr, "Error from rechaos\n");
+ exit (1);
+ }
+ return getAnchsFromFile(fname, a1, a2);
+}
+
+
+void printFASTASeq(FILE *outfile, seq *myseq) {
+ int i;
+ // A NULL outfile means stdout.
+ if (!outfile)
+ outfile = stdout;
+
+ fprintf(outfile, ">%s\n", myseq->name);
+ // The numlets letters are taken from rptr and written on a single line.
+ for(i=0; i<myseq->numlets; i++)
+ fprintf(outfile, "%c", myseq->rptr[i]);
+ // End the record.
+ fprintf(outfile, "\n");
+
+ if (outfile!=stdout) fclose(outfile); // any file other than stdout is closed here
+}
+
+
+hll* findBestChain(hptr* array, int arrsize) { // array holds a start and an end event per hit, already sorted by the caller
+ sklst* skipper = makeSkLst(); // finished hits, keyed by their seq2end
+ sle* help;
+ int i;
+ hll* t;
+ for (i = 0; i < arrsize; i++) {
+ if (array[i].isstart) {
+ help = SLfind(skipper, array[i].myhll->seq2start); // presumably the entry with the largest key not above seq2start - confirm in skiplist
+ if (help->myelem) {
+ array[i].myhll->bkptr = help->myelem; // chain this hit after the one that was found
+ array[i].myhll->scoreSoFar = ((hll*)help->myelem)->scoreSoFar + array[i].myhll->score;
+ }
+ else {
+ array[i].myhll->bkptr = 0; // no predecessor: this hit starts a chain
+ array[i].myhll->scoreSoFar = array[i].myhll->score;
+ }
+ }
+ else {
+ help = SLfind(skipper, array[i].myhll->seq2end);
+ if (help->myelem && (array[i].myhll->scoreSoFar <= ((hll*)help->myelem)->scoreSoFar))
+ continue; // the entry found already scores at least as much: do not insert this hit
+ SLinsertAfter(skipper, help, array[i].myhll->seq2end, array[i].myhll);
+ help = help->next[0]; // step to the entry just inserted
+ while (help->next[0] &&
+ ((hll*)help->myelem)->scoreSoFar >= ((hll*)help->next[0]->myelem)->scoreSoFar)
+ SLremove(skipper, help->next[0]); // drop following entries that do not score more than the new one
+ }
+ }
+ t= (hll*)SLgetLast(skipper)->myelem; // the last entry left in the list is the end of the returned chain
+ delSkLst(skipper);
+ return t; // follow bkptr from here to walk the chain backwards
+}
+
+
+hll* remakeHLL(hll* bestPtr) {
+ // Follows the bkptr links from bestPtr and relinks the visited nodes through ->next in the
+ // opposite order, marking each one dirty; the returned head is the last node reached.
+ hll *head = 0;
+ hll *node = bestPtr;
+ while (node) {
+ node->next = head;
+ node->dirty = 1;
+ head = node;
+ node = node->bkptr;
+ }
+ return head;
+}
+
+
+hll* reanchorHLL(hll* mylist) {
+ // Keeps only the hits on the chain chosen by findBestChain, frees the others, returns the chain via ->next.
+ hll *temp, *best, *t2;
+ int numhits=0, i=0;
+ hptr* myptrs;
+
+ temp=mylist;
+ while (temp) { numhits++; temp->dirty = 1; temp=temp->next; }
+ // Two events per hit: its seq1start (isstart=1) and its seq1end (isstart=0).
+ myptrs = (hptr*) malloc (sizeof(hptr) * numhits *2);
+ for (temp = mylist; temp; temp = temp->next) {
+ myptrs[i].number = temp->seq1start;
+ myptrs[i].isstart = 1;
+ myptrs[i].myhll = temp;
+ myptrs[i+1].number = temp->seq1end;
+ myptrs[i+1].isstart = 0;
+ myptrs[i+1].myhll = temp;
+ i = i+2;
+ }
+ qsort(myptrs, numhits*2, sizeof(hptr), hptrcomp);
+ best = findBestChain(myptrs, numhits*2);
+ temp=best;
+ while (temp) { temp->dirty = 0; temp=temp->bkptr; } // un-mark the hits that belong to the chain
+ temp=mylist;
+ while (temp) { t2 = temp; temp=temp->next; if (t2->dirty) free(t2); } // everything still marked is discarded
+
+ best = remakeHLL(best);
+ // remakeHLL relinked the chain through ->next (and set dirty to 1 again on its nodes).
+ // The event array is only needed for the sort and the chaining above.
+ free (myptrs);
+ return best;
+}
+
+
+void orderAligns(align *a1, align *a2,
+ align **first, align **second,
+ int *index, int *hllindex) {
+ // Sorts the pair by ->index:
+ // *first / *index receive the alignment with the smaller index,
+ // *second / *hllindex the one with the larger index.
+ // On equal indexes a1 stays first.
+ int idx1 = a1->index;
+ int idx2 = a2->index;
+ int swapped = (idx1 > idx2);
+
+ *first = swapped ? a2 : a1;
+ *second = swapped ? a1 : a2;
+
+ *index = swapped ? idx2 : idx1;
+ *hllindex = swapped ? idx1 : idx2;
+
+ // Nothing is returned: the four output
+ // parameters carry the result.
+}
+
+
+void doRemapHLLs(align *aligns[], align *uni, int *index, int hllindex) {
+ int i, mapi, done=0;
+ // Passes every anchor list that involves *index or hllindex (except the list between them) to remapHLLs with uni.
+ // take all hlls into first, and into the second and remap them
+ // The loop body runs with mapi == *index and then with mapi == hllindex (once if they are equal).
+ for(mapi=*index; !done; mapi=hllindex) {
+ // Lists stored under a smaller index i, i.e. aligns[i]->hlls[mapi].
+ for (i=0; i<mapi; i++) {
+ if (aligns[i]->hlls[mapi] != NULL && i != *index) {
+ // remap them into i
+ // fprintf(stderr, "\n called1 %d %d(%d)\n", i, mapi, *index);
+ aligns[i]->hlls[mapi] = remapHLLs(aligns[i]->hlls[mapi],
+ 1, uni,
+ (mapi!=*index));
+ }
+ }
+ for (i=mapi+1; i<numseqs; i++) {
+ if (aligns[mapi]->hlls[i] != NULL && i != hllindex) {
+ // remap them into first or second
+ // fprintf(stderr, "\n called2 %d %d(%d)\n", mapi, i,*index);
+ aligns[mapi]->hlls[i] = remapHLLs(aligns[mapi]->hlls[i],
+ 0, uni,
+ (mapi!=*index));
+ }
+ }
+ if (mapi==hllindex) done=1;
+ }
+
+ // free memory? what's that?
+ // aligns[*index] = result;
+ // aligns[hllindex] = result;
+
+
+}
+
+void doReanchorHLLs(align *aligns[],
+ int *index, int hllindex) {
+ int i;
+ // Folds every anchor list that involves hllindex into the matching list of *index, then clears the old slot.
+ // for each pair of hlls from (i to first) and (i to second)
+ // Case i < *index: both lists are stored under i.
+ for(i=0; i<*index; i++) {
+ aligns[i]->hlls[*index] =
+ reanchorHLL(mergeHLLs(aligns[i]->hlls[*index], 0,
+ aligns[i]->hlls[hllindex], 0));
+
+ // if (verbose) {
+ // printf("aligns[%d]->hlls[%d]\n",i ,*index);
+ // printHLL(aligns[i]->hlls[*index]);
+ // }
+ aligns[i]->hlls[hllindex] = 0;
+ }
+ for(i=*index+1; i<hllindex; i++) { // case *index < i < hllindex: the second list is stored under i (last mergeHLLs argument is 1)
+ aligns[*index]->hlls[i] =
+ reanchorHLL(mergeHLLs(aligns[*index]->hlls[i], 0,
+ aligns[i]->hlls[hllindex], 1));
+ // if (verbose) {
+ // printf("aligns[%d]->hlls[%d]\n",*index ,i);
+ // printHLL(aligns[*index]->hlls[i]);
+ // }
+ aligns[i]->hlls[hllindex] = 0;
+ }
+ for(i=hllindex+1; i<numseqs; i++) { // case i > hllindex: both lists are stored under the merged indexes
+ aligns[*index]->hlls[i] =
+ reanchorHLL(mergeHLLs(aligns[*index]->hlls[i], 0,
+ aligns[hllindex]->hlls[i], 0));
+ // if (verbose) {
+ // printf("aligns[%d]->hlls[%d]\n", *index, i);
+ // printHLL(aligns[*index]->hlls[i]);
+ // }
+ aligns[hllindex]->hlls[i] = 0;
+ }
+}
+
+
+align* processAlign(align *aligns[], align *a1, align *a2, int *index) {
+ int hllindex;
+ align *first, *second, *result, *uni;
+ // After this call *index holds the smaller ->index of the two inputs and hllindex the larger.
+ orderAligns(a1, a2, &first, &second, index, &hllindex);
+
+ // if (verbose
+ // printHLL(aligns[first->index]->hlls[hllindex]);
+ // Align the pair using the anchors stored between them; the result takes over the smaller index.
+ result = makeAlign(first, second, aligns[first->index]->hlls[hllindex], &uni);
+ result->index = *index;
+ // Those anchors are no longer needed once the pair is aligned.
+ freeHLLs(aligns[first->index]->hlls[hllindex]);
+ aligns[first->index]->hlls[hllindex] = 0;
+
+ // Update the remaining anchor lists that involve either of the two indexes.
+ doRemapHLLs(aligns, uni, index, hllindex);
+
+ doReanchorHLLs(aligns, index, hllindex);
+
+ // if the constituent alignments were not simple alignments, free them
+ freeAlign(uni); uni = 0;
+ if (first->numseq > 1){ freeAlign(first); first = 0; }
+ if (second->numseq > 1){ freeAlign(second); second = 0; }
+
+ return(result);
+}
+
+
+align* iterativeImprovement (align *current, align *rpntree[], int length) { // rpntree and length are not used in the body
+ int converged = 0;
+ int i=0, oldscore, cutoff;
+ seq *removed;
+ align *readd, *old, *new;
+ hll* anchs, *tt;
+ if (current->numseq <= 2)
+ return current; // alignments of one or two sequences are returned unchanged
+ // Otherwise the alignment passed in is freed below and a new one is returned.
+
+ cutoff = cutoffmatch * 100;
+ fprintf(stderr, "cutoff = %d\n", cutoff);
+ while (!converged) {
+
+ // Throw out a sequence. Calling code in multial.
+ removed = current->seqs[0];
+ new = findAlignByName(simaligns, removed->name);
+ old = current;
+ anchs = getAnchsFromAlign(current, 0, cutoff);
+ current = removeSeq(current, 0);
+ free (old);
+
+ // Re-align this thrown-out sequence to the remaining alignment.
+
+ current = makeAlign (current, new, anchs, &old);
+ if (verbose) {
+ printf("improved:\n");
+ printHLL(anchs);
+ printTextAlign(stdout, current);
+ }
+ while (anchs) {
+ tt = anchs;
+ anchs = anchs->next;
+ free (tt);
+ }
+ free (old);
+
+ i++;
+ if (i==numseqs*itertimes) converged = 1; // "converged" is a fixed budget of numseqs*itertimes rounds
+ }
+ return current;
+}
+
+
+
+int treeToRPN(char *treestr, align *stack[MAX_SEQ*2], int *depth) {
+ // Appends the postfix form of one "( ... )" group to stack; returns how many chars of treestr it used.
+ int i=0; int j, k;
+ char buffer[256];
+ // Names push their alignment; a closed group only advances *depth and leaves that slot as it is.
+ while (treestr[i] && treestr[i]!='(') { i++; }
+ if (treestr[i]) i++; // step over the '('; a string without one ends in the error at the bottom
+ while ((treestr[i] != ')') && (treestr[i] != '\0')) {
+ // Nested groups are handled recursively and report how far they read.
+
+
+ if (treestr[i]=='(') {
+ i += treeToRPN(treestr+i, stack, depth);
+ }
+ else if (isalnum((unsigned char)treestr[i])) {
+ k = 0;
+ if (*depth >= MAX_SEQ*2) { fprintf(stderr, "ERROR parsing tree, more than %d entries", MAX_SEQ*2); exit(1); }
+ while(treestr[i] && (!isspace((unsigned char)treestr[i])) && (treestr[i]!='(') && (treestr[i]!=')')) {
+ if (k < (int)sizeof(buffer)-1) buffer[k++] = treestr[i]; // over-long names are cut to fit buffer
+ i++;
+ }
+ buffer[k] = 0;
+ stack[(*depth)++]=findAlignByName(simaligns, buffer);
+ }
+ else if (treestr[i]==')')
+ // (*depth)++;
+ break;
+ else { i++; }
+
+ }
+ if (treestr[i]==')') {
+ if (*depth >= MAX_SEQ*2) { fprintf(stderr, "ERROR parsing tree, more than %d entries", MAX_SEQ*2); exit(1); }
+ (*depth)++; //null is '+'
+ return i+1;
+ }
+ // Only the end of the string can reach this point: the closing ')' is missing.
+ fprintf(stderr, "ERROR parsing tree, depth %d, %d chars read", *depth, i);
+ exit(1);
+ return 0; // not reached
+}
+
+align* procStack(align* rpntree[MAX_SEQ*2], int length, align *myaligns[]) {
+ align* stack[MAX_SEQ]; // operand stack for the postfix list: an align* is pushed, a NULL entry merges the top two
+ int i = 0, sp = 0;
+ int index=0;
+ while (i < length) {
+ if (rpntree[i]) {
+ if (sp >= MAX_SEQ) { fprintf(stderr, "ERROR: tree has too many operands\n"); exit(1); }
+ stack[sp++] = rpntree[i];
+ }
+ else {
+ if (sp < 2) { fprintf(stderr, "ERROR: tree group with fewer than two members\n"); exit(1); }
+ stack[sp-2] = processAlign(myaligns, stack[sp-2], stack[sp-1], &index);
+ stack[--sp] = 0;
+ if(verbose) printTextAlign(stdout, stack[sp-1]);
+ }
+ if (nested) {
+ int keep = stack[sp-1]->index; // iterativeImprovement may free its argument and return a replacement,
+ stack[sp-1] = iterativeImprovement(stack[sp-1], rpntree, i); // so the result has to be stored back
+ stack[sp-1]->index = keep; // NOTE(review): assumes the replacement should keep the same index - confirm against makeAlign
+ }
+ i++;
+ }
+ return stack[sp-1];
+}
+
+
+char* buildTree (align *simalign[], float distances[MAX_SEQ][MAX_SEQ]) {
+ char *names[MAX_SEQ];
+ int namelens[MAX_SEQ];
+ float max;
+ int mli, mlj;
+ int i, j;
+ char *result = 0, *temp; // result stays 0 until a first pair has been joined
+
+ // Repeatedly joins the pair with the largest entry in distances[][] into "(a b)" until none is left.
+
+ for (i=0; i< numseqs; i++) {
+ namelens[i] = strlen(simalign[i]->seqs[0]->name);
+ names[i] = (char*) malloc ((namelens[i]+1) * sizeof (char));
+ sscanf (simalign[i]->seqs[0]->name,"%s",names[i]);
+ }
+
+ do {
+ max = -1;
+ for (i=0; i<(numseqs-1); i++) {
+ for (j=i+1; j<numseqs; j++) {
+ if (distances[i][j] > max) {
+ max = distances[i][j];
+ mli = i;
+ mlj = j;
+ }
+ }
+ }
+ if (max < 0)
+ break;
+ // fprintf (stderr, "join! %d %d (score %f)\n", mli, mlj, distances[mli][mlj]);
+ temp = (char*) malloc ((namelens[mli] + namelens[mlj] +4)* sizeof(char));
+ sprintf(temp, "(%s %s)", names[mli], names[mlj]);
+
+ // fprintf (stderr, "%d(%d)+%d(%d)+3=%d(really %d)\n", namelens[mli],strlen(names[mli]),
+ // namelens[mlj], strlen(names[mlj]), strlen(temp), namelens[mli]+namelens[mlj]+3);
+
+ // fprintf (stderr, "malloc gave %x\n", temp);
+ // fprintf (stderr, "new = %s\n", temp);
+ // fprintf (stderr, "done free1 %x\n", names[mli]);
+ free (names[mli]);
+ // fprintf (stderr, "done free2 %x\n", names[mlj]);
+ free (names[mlj]);
+ names[mlj] = 0;
+ names[mli] = result = temp;
+ namelens[mli] = namelens[mli] + namelens[mlj] + 3;
+ distances[mli][mlj] = -1;
+ // fprintf (stderr, "done concat\n");
+ for (i=0; i < mli; i++) {
+ // fprintf (stderr, "h1\n");
+ if (distances[i][mli] >= 0)
+ distances[i][mli] = (distances[i][mli] + distances[i][mlj]) / 2;
+ distances[i][mlj] = -1;
+ }
+ for (i=mli+1; i < mlj; i++) {
+ // fprintf (stderr, "h2\n");
+ if (distances[mli][i] >= 0)
+ distances[mli][i] = (distances[mli][i] + distances[i][mlj]) / 2;
+ distances[i][mlj] = -1;
+ }
+ for (i=mlj+1; i < numseqs; i++) {
+ // fprintf (stderr, "h3\n");
+ if (distances[mli][i] >= 0)
+ distances[mli][i] = (distances[mli][i] + distances[mlj][i]) / 2;
+ distances[mlj][i] = -1;
+ }
+ // fprintf (stderr, "end of loop\n");
+ } while (max >= 0);
+ if (!result) { fprintf (stderr, "Could not build a tree: no pair of sequences to join\n"); exit (1); }
+ for (i=0; i< numseqs; i++) {
+ if (names[i] != result)
+ free (names[i]);
+ }
+ fprintf (stderr, "We built the tree: \"%s\"\n", result);
+ return result;
+}
+
+
+char* graphCollapsal (align *simaligns[]) {
+ float distances[MAX_SEQ][MAX_SEQ];
+ int i, j;
+ float sum = 0, length = 0;
+ float score = 0, count = 0;
+ hll* temp;
+ // Fills distances[i][j] (i<j) from the anchor lists and lets buildTree turn the matrix into a tree string.
+ for (i=0; i< MAX_SEQ; i++)
+ for (j=0; j< MAX_SEQ; j++)
+ distances[i][j] = -1;
+ // Per pair: total anchor score divided by the total seq1 span of its anchors, 0 when there is nothing to use.
+ for (i=0; i<(numseqs-1); i++) {
+ for (j=i+1; j<numseqs; j++) {
+ sum = 0; count = 0;
+ length = 0; score = 0;
+ temp = simaligns[i]->hlls[j];
+ while (temp) {
+ sum += temp->score;
+ length += (temp->seq1end - temp->seq1start);
+ score += temp->score/(temp->seq1end - temp->seq1start); // only feeds the commented-out formula below
+ count += 1;
+ temp = temp->next;
+ }
+ if (count != 0 && sum > 0) {
+ //distances[i][j] = score/count;
+ distances[i][j] = sum/length; // NOTE(review): length is 0 when every anchor has seq1end == seq1start
+ //MIN2(simaligns[i]->seqs[0]->numsiglets, simaligns[j]->seqs[0]->numsiglets);
+ fprintf (stderr, "Similarity %s and %s = %f\n",
+ simaligns[i]->seqs[0]->name, simaligns[j]->seqs[0]->name, distances[i][j]);
+ }
+ else
+ distances[i][j] = 0;
+ }
+ }
+ return buildTree (simaligns, distances);
+}
+
+int parseParameters(int argc, char** argv, FileBuffer *files, char **treestr) {
+ // Leading non-option arguments are opened as sequence files; the rest are options. Returns 0 on success.
+ int i=1;
+
+ if (argc < 3) {
+ if (argc == 2)
+ if (!strcmp(argv[1], "-version") || !strcmp(argv[1], "-Version")) {
+ fprintf(stderr, "MLAGAN version %s\n", VER_NUM);
+ exit(0);
+ }
+ usage();
+ return 1;
+ }
+ while((argv[i][0]!='-')) {
+
+ // Read in sequence files.
+ if (numseqs >= MAX_SEQ) { fprintf(stderr, "too many sequence files (at most %d)\n", MAX_SEQ); return 2; }
+ // printf("sequence %d: %s\n", i, argv[i]);
+
+ if (!(files[numseqs++] = FileOpen(argv[i]))) {
+ fprintf(stderr, "couldnt open dbase file %s\n",argv[i]);
+ usage();
+ return 2;
+ }
+
+ // seqs[numseqs] = FileRead(seqfile, 0, 0, VER_MLAGAN);
+ // seqs[numseqs]->filename = argv[i];
+ // numseqs++;
+
+
+ if(++i>=argc) break;
+ }
+
+ // printf("\n");
+
+ while (i<argc) {
+ // NOTE(review): the test below can never fire. The strcmp results are OR-ed and at most one of them
+ // printf("parameters: %s\n", argv[i]);
+ // can be 0, so the negated chain is always false and unknown options are silently accepted.
+ if (!(strcmp(argv[i], "-nested") ||
+ strcmp(argv[i], "-nopost") ||
+ strcmp(argv[i], "-postir") ||
+ strcmp(argv[i], "-fastreject") ||
+ strcmp(argv[i], "-gfc") ||
+ strcmp(argv[i], "-lazy") ||
+ strcmp(argv[i], "-verbose") ||
+ strcmp(argv[i], "-out") ||
+ strcmp(argv[i], "-translate") ||
+ strcmp(argv[i], "-ext") || strcmp(argv[i], "-scorematrix") ||
+ strcmp(argv[i], "-match") || strcmp(argv[i], "-mismatch") ||
+ strcmp(argv[i], "-gapstart") || strcmp(argv[i], "-gapend") ||
+ strcmp(argv[i], "-gapcont") || strcmp(argv[i], "-gapperseq") ||
+ strcmp(argv[i], "-overlap") || strcmp(argv[i], "-glwidth") ||
+ strcmp(argv[i], "-tree"))) {
+ fprintf(stderr, "unrecognized parameter: %s\n", argv[i]);
+ usage();
+ return 1;
+ }
+ if (!strcmp(argv[i], "-nested")) {
+ nested = 1;
+ }
+
+ if (!strcmp(argv[i], "-translate")) {
+ translate = 1;
+ }
+
+ if (!strcmp(argv[i], "-ext")) { //default, do not use
+ extend = 1;
+ }
+
+
+ if (!strcmp(argv[i], "-verbose")) {
+ verbose = 1;
+ }
+
+ if (!strcmp(argv[i], "-postir")) {
+ postir = 1;
+ }
+ if (!strcmp(argv[i], "-lazy")) {
+ lazy = 1;
+ }
+ if (!strcmp(argv[i], "-fastreject")) {
+ fastreject = 1;
+ }
+ if (!strcmp(argv[i], "-gfc")) { //Broken, do not use
+ gapfreechunks = 1;
+ }
+
+ if (!strcmp(argv[i], "-out")) {
+ i++;
+ if ((i>=argc) || (argv[i][0]=='-')) {
+ fprintf(stderr, "missing parameter specification for [-out].\n");
+ return 1;
+ }
+ fprintf(stderr, "outputting to: %s\n", argv[i]);
+ outfile = fopen(argv[i], "w");
+ if (outfile==NULL) {
+ fprintf(stderr, "error with output file...\n");
+ exit(2);
+ }
+ }
+
+ if (!strcmp(argv[i], "-tree")) {
+ i++;
+ if ((i>=argc) || (argv[i][0]=='-')) {
+ fprintf(stderr, "missing parameter specification for [-tree].\n");
+ return 1;
+ }
+ notree = 0;
+ *treestr = argv[i];
+ fprintf(stderr, "using given phylogenetic tree:\n%s\n", *treestr);
+ }
+
+ if (!strcmp(argv[i], "-gapperseq")) {
+ i++;
+ if (i>=argc) {
+ fprintf(stderr, "missing parameter specification for [-gapperseq].\n");
+ return 1;
+ }
+ gapperseq = atoi(argv[i]);
+ fprintf(stderr, "using gapperseq score: %d\n", gapperseq);
+ }
+ if (!strcmp(argv[i], "-overlap")) {
+ i++;
+ if (i>=argc) {
+ fprintf(stderr, "missing parameter specification for [-overlap].\n");
+ return 1;
+ }
+ overlap = atoi(argv[i]);
+ fprintf(stderr, "using overlap value: %d\n", overlap);
+ }
+ if (!strcmp(argv[i], "-glwidth")) {
+ i++;
+ if (i>=argc) {
+ fprintf(stderr, "missing parameter specification for [-glwidth].\n");
+ return 1;
+ }
+ glwidth = atoi(argv[i]);
+ fprintf(stderr, "using glwidth value: %d\n", glwidth);
+ }
+
+ if (!strcmp(argv[i], "-nucmatrixfile")) {
+ i++;
+ if (i>=argc) {
+ fprintf(stderr, "missing parameter specification for [-nucmatrixfile].\n");
+ return 1;
+ }
+ nucmatrixfile = argv[i];
+ fprintf(stderr, "using nucmatrixfile value: %s\n", nucmatrixfile);
+ }
+
+ i++;
+ }
+
+ // setScores(gapstart, gapcont, gapend, gapperseq, overlap, glwidth);
+
+ return 0;
+}
+
+hll* updateAnchorPos(hll* myhll, FileBuffer f1, FileBuffer f2) {
+ hll *res, *temp, *prev=0;
+ res = myhll;
+ fprintf (stderr, "Updating anchs...\n");
+ for ( ; myhll; myhll = myhll->next) { // shift every anchor so that startpos becomes position 1
+ myhll->seq1start -= (f1->startpos-1);
+ myhll->seq1end -= (f1->startpos-1);
+ myhll->seq2start -= (f2->startpos-1);
+ myhll->seq2end -= (f2->startpos-1);
+ }
+ while (res && (res->seq1start < 0 || res->seq2start < 0)) {
+ // Leading anchors that now start before the region are released.
+ temp = res;
+ // fprintf(stderr, "Tossed %d %d(%d %d)\n", temp->seq1end, temp->seq2end,
+ // f1->endpos, f2->endpos);
+ res = res->next;
+ free(temp);
+ }
+ temp = res;
+ while (temp && temp->seq1end < (f1->endpos-f1->startpos) && temp->seq2end < (f2->endpos-f2->startpos)) {
+ // Walk over the anchors that end inside the region; prev is the last one kept.
+ // fprintf(stderr, "Kept %d %d(%d %d)\n", temp->seq1end, temp->seq2end,
+ // f1->endpos-f1->startpos, f2->endpos-f2->startpos);
+ prev = temp;
+ temp = temp->next;
+ }
+ if (prev) {
+ temp = prev;
+ prev = prev->next;
+ temp->next = 0;
+ }
+ else if (temp == res) {
+ prev = res; // nothing is kept: hand the whole list to the free loop below instead of leaking it
+ res = 0;
+ }
+ else {
+ return res;
+ }
+ while ( prev ) {
+ // Release everything that was cut off.
+ // fprintf(stderr, "Tossed %d %d(%d %d)\n", temp->seq1end, temp->seq2end,
+ // f1->endpos, f2->endpos);
+ temp = prev;
+ prev = prev->next;
+ free(temp);
+ }
+ return res;
+}
+
+int connectedGraph(hll* graph[MAX_SEQ][MAX_SEQ], int numseqs) {
+ int M[MAX_SEQ][MAX_SEQ];
+ int i, j, k;
+ for (i = 0; i < numseqs; i++) M[i][i] = 1; // the diagonal used to be read uninitialised (M[0][0] decides the result)
+ for (i = 0; i < numseqs - 1; i++){
+ for (j = i + 1; j < numseqs; j++){
+ M[i][j] = M[j][i] = (graph[i][j] != NULL);
+ }
+ }
+ // Transitive closure: i reaches j whenever both are linked through k.
+ for (k = 0; k < numseqs; k++)
+ for (i = 0; i < numseqs; i++)
+ for (j = 0; j < numseqs; j++)
+ if (M[i][k] && M[k][j]) M[i][j] = 1;
+ // Connected means sequence 0 reaches every sequence.
+ k = 1;
+ for (i = 0; k && i < numseqs; i++)
+ k = M[0][i];
+
+ return k;
+}
+
+
+int main(int argc, char** argv) {
+ FileBuffer seqfile;
+ seq **seqs;
+ int i = 1, j = 1, x, y;
+ char command[256];
+
+ char *treestr = NULL;
+ align *stack[MAX_SEQ*2];
+ align *final;
+ align *myaligns[MAX_SEQ];
+ hll* table[MAX_SEQ][MAX_SEQ];
+ FileBuffer files[MAX_SEQ];
+ // Steps: parse arguments, get anchors for every pair, build or take the tree, align along it, print.
+ outfile = stdout;
+ lagan_dir = getenv ("LAGAN_DIR");
+ if (!lagan_dir) {
+ fprintf(stderr, "Environment variable LAGAN_DIR not set\n");
+ exit(1);
+ }
+
+ buildcache();
+ initLib();
+
+ seqs = (seq**) malloc((argc-1)*sizeof(seq*));
+
+ if (parseParameters(argc, argv, files, &treestr)) return 1;
+ if (numseqs < 2) { fprintf(stderr, "need at least two sequence files\n"); usage(); return 1; }
+ for (i=0; i<(numseqs-1); i++) {
+ for (j=i+1; j<numseqs; j++) {
+ table[i][j] = generateAnchors(files[i], files[j]);
+ }
+ }
+
+ if (fastreject && !connectedGraph(table, numseqs)) {
+ if (outfile != stdout)
+ fclose (outfile);
+ exit (0);
+ }
+
+ if (fastreject) {
+ for (i=0; i<numseqs; i++) {
+ for (j=i+1; j<numseqs; j++) {
+ if (table[i][j])
+ table[i][j] = updateAnchorPos(table[i][j], files[i], files[j]);
+ else
+ fprintf (stderr, "hmm\n");
+ }
+ }
+ }
+ // updateAnchorPos may have emptied some lists, so connectivity is checked a second time.
+ if (fastreject && !connectedGraph(table, numseqs)) {
+ if (outfile != stdout)
+ fclose (outfile);
+ exit (0);
+ }
+
+ gapstart += gapcont;
+
+
+ // Take all sequences and make simple alignments
+
+ for (i=0; i<numseqs; i++) {
+ if (fastreject) {
+ if (files[i]->startpos > files[i]->endpos) {
+ if (outfile != stdout)
+ fclose (outfile);
+ exit (0);
+ }
+ seqs[i] = FileRead(files[i], 1, 0, VER_MLAGAN);
+
+
+
+ }
+ else
+ seqs[i] = FileRead(files[i], 0, 0, VER_MLAGAN);
+ seqs[i]->index = i+1;
+ myaligns[i]=simaligns[i]=mkSimAlign(seqs[i]);
+ simaligns[i]->index = i;
+ }
+
+
+ // Attach the pairwise anchors computed above to the simple alignments.
+
+ for (i=0; i<(numseqs-1); i++) {
+ for (j=i+1; j<numseqs; j++) {
+ simaligns[i]->hlls[j]=table[i][j];
+ }
+ }
+
+ // printf("\n");
+
+ for (i=0; i<MAX_SEQ*2; i++) {
+ stack[i] = NULL;
+ }
+
+ /*
+ for (i=0; i<(numseqs-1); i++) {
+ for (j=i+1; j<numseqs; j++) {
+ printf("Sanity Check: simaligns[%d]->hlls[%d].score=%g\n",
+ i,j,
+ simaligns[i]->hlls[j]==NULL ? 0 : simaligns[i]->hlls[j]->score);
+ }
+ }
+ */
+
+ fprintf(stderr, "\n****************************\n");
+ fprintf(stderr, "gs: %d; ge: %d;\n", gapstart, gapend);
+ fprintf(stderr, "gc: %d; gp: %d\n", gapcont, gapperseq);
+ //fprintf(stderr, "match: %d; mismatch: %d\n", match, mismatch);
+ fprintf(stderr, "overlap: %d; glwidth: %d\n", overlap, glwidth);
+ fprintf(stderr, "\n****************************\n");
+
+ if (notree) {
+ treestr = graphCollapsal(myaligns);
+ }
+
+ //REMOVE the next line once debugged!!!
+ // exit(2);
+ //End of remove
+
+ i = 0;
+ treeToRPN(treestr, stack, &i);
+
+ final = procStack(stack, i, myaligns);
+
+
+ if (postir) {
+ final = iterativeImprovement(final, stack, i);
+ }
+
+ // Output end result.
+ fprintf(stderr, "final alignment... \n");
+ if (fastreject) {
+ printXMFAAlign(outfile, final);
+ }
+ else {
+ printFASTAAlign(outfile, final);
+ }
+ if (outfile != stdout) fclose (outfile);
+
+
+ fprintf(stderr, "mlagan -- end.\n");
+ return 0;
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/multial.c b/src/multial.c
new file mode 100644
index 0000000..39be819
--- /dev/null
+++ b/src/multial.c
@@ -0,0 +1,1648 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <math.h>
+#include <assert.h>
+#include "diagmatrix.h"
+#include "multial.h"
+
+#define INSERTION 1 /* column code written by getChain: only bit 0 set */
+#define DELETION 2 /* column code written by getChain: only bit 1 set */
+#define BOTH 3 /* column code written by getChain: bits 0 and 1 set */
+
+#define MISMATCH_CUTOFF 8 /* doShapes skips anchors whose two spans differ in length by more than this */
+#define ANCHOR_LENGTH_CUTOFF 10 /* remapHLLs drops remapped anchors shorter than this */
+#define ANCHOR_SCORE_CUTOFF 1500 /* remapHLLs crops anchor pieces scoring below this */
+
+#define MAX_SQ_SIZE (100 * (1 << 20)) /* mkSquare splits rectangles whose area exceeds this */
+#define BIG_SQ_WIDTH 20
+
+#define CONS_FRAC 0.6 /* mkConsensus: fraction of sequences that must share a base */
+
+#define MIN2(x,y) ( (x) >= (y) ? (y) : (x) ) /* NOTE: arguments are evaluated more than once */
+#define MAX2(x,y) ( (x) >= (y) ? (x) : (y) )
+#define MAX3(x,y,z) MAX2(MAX2(x,y),z)
+#define MIN3(x,y,z) MIN2(MIN2(x,y),z)
+#define PROD(x,y) ( (x) * (y) )
+
+#define WEQ2(x,y,a) (((x)==(a))? 0: ((y)==(a))? 1:-1) /* index of the argument equal to a, or -1 */
+#define WEQ3(x,y,z,a) (((x)==(a))? 0: ((y)==(a))? 1: ((z)==(a))? 2:-1)
+
+char* alpha = "ATCG.N"; /* letters are mapped to indices with strchr(alpha, c) - alpha */
+char* nucmatrixfile = 0; /* when non-null, readSubstMatrix opens this path instead of $LAGAN_DIR/<name> */
+
+int s1start = 0;
+int s1end = 0;
+int s2start = 0;
+int s2end = 0;
+//int match = 18;
+//int mismatch = -8;
+int gapstart = -50; /* overwritten by readSubstMatrix; temporarily scaled inside makeAlign */
+int gapend = -50; /* overwritten by readSubstMatrix; temporarily scaled inside makeAlign */
+int gapcont = -5; /* overwritten by readSubstMatrix; temporarily scaled inside makeAlign */
+int gapperseq = -1;
+int overlap = 0;
+int glwidth= 15; /* band half-width used by doShapes and by mkSquare when splitting */
+char dobin = 0;
+
+float factor, offset;
+int logs[MAX_SEQ*MAX_SEQ];
+
+FILE* outfile;
+
+static int substmatrix[256][256]; /* substitution scores indexed by two byte values; filled by readSubstMatrix */
+static int matchcache[1 << 24], gapcache[1 << 24]; /* filled by buildcache; index packs four counts at shifts 0, 6, 12 and 18 */
+int *freed = 0, freedsize, freedcap; /* getChain registers each chain here; makeAlign frees those still flagged 0 */
+align **freedptr; /* chain pointers, parallel to freed[] */
+
+int normf; /* set by saveNeck from DMnextNecks; printed in getChain's PROBLEM message */
+int normprev; /* previous value of normf, saved by saveNeck */
+
+/* Returns 1 when the two characters are identical and 0 otherwise. */
+inline int ismatch(char a, char b) {
+    if (a != b)
+        return 0;
+    return 1;
+}
+
+/*
+ * Returns 1 when bit `seqn` of the column word ali->algn[loc] is clear
+ * (the sequence has a gap in that column) and 0 when the bit is set.
+ */
+inline int isGap(align* ali, int seqn, int loc) {
+    return ((ali->algn[loc] >> seqn) & 1) ? 0 : 1;
+}
+
+/*
+ * Scores column `loc` of `ali` from the point of view of one sequence.
+ *
+ *   which  0..3     : the sequence has base alpha[which] in this column
+ *   which  CNTS_GS  : the sequence opens a gap in this column
+ *   which  CNTS_GC  : the sequence continues a gap in this column
+ *   anything else   : scored as 0 (callers pass CNTS_LEN for letters that
+ *                     are not one of the four bases)
+ *
+ * The original fell off the end of the function for the "anything else"
+ * case, which is undefined behaviour for a non-void function whose result is
+ * used; a negative `which` also indexed cnts[] out of bounds.
+ */
+inline int scoreLocal(int which, align* ali, int loc) {
+    int i, lets = 0;
+
+    /* number of A/T/C/G letters in this column */
+    for (i = 0; i < 4; i++)
+        lets += ali->cnts[i][loc];
+
+    if (which >= 0 && which < 4)
+        return (ali->cnts[which][loc]-1) * 100 + (lets - ali->cnts[which][loc]) * -70 +
+            ali->cnts[CNTS_GS][loc] * gapstart + ali->cnts[CNTS_GC][loc] * gapcont;
+    if (which == CNTS_GS)
+        return lets * gapstart;
+    if (which == CNTS_GC)
+        /* NOTE(review): by precedence this is lets + (cnts[CNTS_GS] * gapcont);
+         * (lets + cnts[CNTS_GS]) * gapcont may have been intended.  Left
+         * unchanged to preserve scoring -- confirm with the authors. */
+        return lets+ali->cnts[CNTS_GS][loc] * gapcont;
+
+    return 0;
+}
+
+/*
+ * Reverses the singly linked list `tbr` in place by re-pointing each node's
+ * `next` field, and returns the new head (the former tail), or 0 for an
+ * empty list.
+ */
+inline hll* reverseHLL(hll* tbr) {
+    hll *head = 0, *node, *rest;
+
+    for (node = tbr; node; node = rest) {
+        rest = node->next;
+        node->next = head;
+        head = node;
+    }
+    return head;
+}
+
+/*
+ * Walks the columns of `current`, accumulating scoreLocal() for sequence
+ * `seqnum`, and emits an anchor every time the running score exceeds
+ * `cutoff` (the running score is then reset to 0; it is also clamped at 0
+ * from below).  Each anchor records, as both start and end, the number of
+ * columns seen so far that contain some other sequence (seq1*) and the
+ * number of letters of `seqnum` consumed so far (seq2*).
+ *
+ * Returns the anchors in column order (the list is built in reverse and
+ * flipped with reverseHLL), or 0 when none were produced.
+ *
+ * Fixes relative to the original:
+ *   - the "all other sequences" mask is built with a 64-bit shift, so it is
+ *     correct for seqnum >= 31 (algn[] holds long long words);
+ *   - a letter that is not in `alpha` no longer turns a NULL strchr() result
+ *     into a garbage index; it is treated like '.'/'N' (CNTS_LEN);
+ *   - unused locals removed.
+ */
+hll* getAnchsFromAlign(align* current, int seqnum, int cutoff) {
+    int i=0, j, newj=0;
+    int currscore=0;
+    hll *res = 0, *temp = (hll*) malloc (sizeof(hll));
+    int which;
+    long long int mask = ~(1LL << seqnum);
+    char ingap = 0;
+    char *hit;
+
+    assert (temp);
+
+    for (j = 0; j < current->algnlen; j++) {
+        if (!isGap(current, seqnum, j)) {
+            ingap = 0;
+            hit = strchr(alpha, current->seqs[seqnum]->lets[i]);
+            which = hit ? (int)(hit - alpha) : CNTS_LEN;
+            /* only the four bases are scored individually */
+            which = (which>3)?CNTS_LEN:which;
+            i++;
+        }
+        else {
+            if (ingap)
+                which = CNTS_GC;
+            else {
+                ingap = 1;
+                which = CNTS_GS;
+            }
+        }
+
+        currscore += scoreLocal(which, current, j);
+
+        if (currscore > cutoff) {
+            temp->score = currscore;
+            temp->seq1end = newj; temp->seq2start = i;
+            temp->seq2end = i; temp->seq1start = newj;
+            currscore = 0;
+            temp->next = res; res = temp;
+            temp = (hll*) malloc (sizeof(hll));
+            assert (temp);
+        }
+        if (currscore < 0)
+            currscore = 0;
+        /* count columns in which at least one other sequence has a letter */
+        if (current->algn[j]&mask)
+            newj++;
+    }
+
+    if (currscore > cutoff) {
+        temp->score = currscore;
+        temp->seq1end = newj; temp->seq2start = i;
+        temp->seq2end = i; temp->seq1start = newj;
+        temp->next = res; res = temp;
+    }
+    else free(temp);
+    return reverseHLL(res);
+}
+
+int cons_cnt = 0;
+
+
+/*
+ * Builds a new sequence named "Consensus_<n>" (n is the incremented global
+ * cons_cnt) with one letter per column of `ali`.  A column gets base
+ * alpha[j] (j in 0..3) when that base's count reaches numseq * CONS_FRAC,
+ * the highest such j winning; otherwise it gets 'N'.  Only name, numlets,
+ * lets and rptr of the result are set here.
+ */
+seq* mkConsensus(align* ali) {
+    int col, base;
+    double needed;
+    seq* cons = (seq*) malloc (sizeof(seq));
+
+    assert (cons);
+    cons->name = (char*) malloc(sizeof(char)*64);
+    assert (cons->name);
+    sprintf(cons->name, "Consensus_%d", ++cons_cnt);
+
+    cons->numlets = ali->algnlen;
+    cons->lets = (char*) malloc (sizeof(char) * cons->numlets);
+    cons->rptr = cons->lets;
+    assert (cons->lets);
+
+    needed = ((float)ali->numseq) * CONS_FRAC;
+    for (col = 0; col < cons->numlets; col++) {
+        cons->lets[col] = 'N';
+        /* scan from the highest index down: the first hit is the one the
+         * ascending scan would have kept last */
+        for (base = 3; base >= 0; base--) {
+            if (ali->cnts[base][col] >= needed) {
+                cons->lets[col] = alpha[base];
+                break;
+            }
+        }
+    }
+    return cons;
+}
+
+/* Reverses the first `length` elements of `a` in place. */
+inline void reverse (long long int* a, int length) {
+    int lo = 0, hi = length - 1;
+
+    while (lo < hi) {
+        long long int held = a[lo];
+        a[lo] = a[hi];
+        a[hi] = held;
+        lo++;
+        hi--;
+    }
+}
+
+
+/*
+ * Merges two alignments according to the pairwise column chain `uni`
+ * (bit 0 set: take the next column of ali1; bit 1 set: take the next column
+ * of ali2, shifted left by ali1->numseq).  Returns a newly allocated align
+ * holding the combined column words, the concatenated sequence list and
+ * freshly counted per-column statistics (A/T/C/G, gap start, gap continue,
+ * gap end).  Columns 0..algnlen inclusive are filled.
+ */
+align* unifyAlign(align* ali1, align* ali2, align* uni){
+    char *rows[MAX_SEQ];
+    int col, s, c, pos, base;
+    int next1 = 0, next2 = 0;
+    char ch;
+    long long int word;
+    align *merged = (align*) malloc(sizeof(align));
+
+    assert (merged);
+    merged->score = uni->score;
+    merged->numseq = ali1->numseq + ali2->numseq;
+    merged->algnlen = uni->algnlen;
+    merged->nextalign = 0;
+    merged->dirty = 0;
+
+    /* allocate the column words and the per-column counters */
+    merged->algn = (long long int*) malloc ((merged->algnlen+1) * sizeof (long long int));
+    assert (merged->algn);
+    merged->algn[0] = 0;
+    for (c = 0; c < CNTS_LEN; c++){
+        merged->cnts[c] = (char*) malloc((merged->algnlen+1) * sizeof(char));
+        assert (merged->cnts[c]);
+    }
+
+    /* build each merged column word from the two inputs */
+    for (col = 0; col <= merged->algnlen; col++){
+        for (c = 0; c < CNTS_LEN; c++)
+            merged->cnts[c][col] = 0;
+        word = 0;
+        if (!isGap(uni, 0, col))
+            word |= ali1->algn[next1++];
+        if (!isGap(uni, 1, col))
+            word |= (ali2->algn[next2++] << ali1->numseq);
+        merged->algn[col] = word;
+    }
+
+    /* spell every sequence out as a gapped row of characters */
+    for (s = 0; s < merged->numseq; s++){
+        if (s < ali1->numseq)
+            merged->seqs[s] = ali1->seqs[s];
+        else
+            merged->seqs[s] = ali2->seqs[s - ali1->numseq];
+
+        rows[s] = (char *) malloc (sizeof (char) * (merged->algnlen + 1));
+        assert (rows[s]);
+        rows[s][0] = 0;
+        pos = 0;
+        for (col = 0; col <= merged->algnlen; col++){
+            if (isGap (merged, s, col))
+                rows[s][col] = '-';
+            else
+                rows[s][col] = merged->seqs[s]->lets[pos++];
+        }
+    }
+
+    /* count letters and gap events column by column */
+    for (col = 0; col <= merged->algnlen; col++){
+        for (s = 0; s < merged->numseq; s++){
+            ch = rows[s][col];
+
+            if (ch == '-'){
+                if (col > 0 && rows[s][col-1] == '-')
+                    merged->cnts[CNTS_GC][col]++;
+                else
+                    merged->cnts[CNTS_GS][col]++;
+                continue;
+            }
+
+            if (ch == 'A') base = CNTS_A;
+            else if (ch == 'T') base = CNTS_T;
+            else if (ch == 'C') base = CNTS_C;
+            else if (ch == 'G') base = CNTS_G;
+            else continue;        /* any other letter is not counted */
+
+            merged->cnts[base][col]++;
+            /* a base right after a gap closes it (only tested from column 2 on) */
+            if (col > 1 && rows[s][col-1] == '-')
+                merged->cnts[CNTS_GE][col]++;
+        }
+    }
+
+    for (s = 0; s < merged->numseq; s++)
+        free (rows[s]);
+
+    return merged;
+}
+
+
+align* getChain(dmat* mydm, int x, int y, int j) { /* Traces the pointer bytes
+ * stored in `mydm` backwards from cell (x, y), starting in state `j`, and
+ * returns a newly allocated align whose algn[] holds one BOTH / INSERTION /
+ * DELETION code per step, in forward order.  The trace stops when x or y
+ * reaches 0, or when the low pointer bits equal 0x3; in the latter case the
+ * result is linked through nextalign to the align returned by DMgetNeck and
+ * that align's dirty counter is incremented.  Every result is registered in
+ * the global freed[] / freedptr[] tables.  score, numseq and seqs of the
+ * result are not set here. */
+ int temp;
+ align *res = (align*) malloc (sizeof(align)), *help;
+ long long int* almt = (long long int*) malloc ( sizeof(long long int));
+ int i=0, almtsize = 1, which, inrun = j;
+ char zz = DMgetPtr(mydm, x, y);
+ assert (res);
+ assert (almt);
+
+ for (i=0; i<CNTS_LEN; i++)
+ res->cnts[i] = 0; // chain fragments carry no count arrays
+ i = 0;
+
+ ///////////////
+ res->dirty = 0;
+ res->nextalign = 0;
+ res->algn = 0;
+ res->algnlen = 0;
+
+ res->num = freedsize; // slot of this chain in freed[] / freedptr[]
+ freed[freedsize] = 0;
+ freedptr[freedsize] = res;
+ if (++freedsize == freedcap){ // tables full: double their capacity
+ freedcap *= 2;
+ freed = (int *) realloc (freed, sizeof (int) * freedcap); // NOTE(review): realloc result is not checked
+ freedptr = (align **) realloc (freedptr, sizeof (align *) * freedcap);
+ }
+
+ do {
+ // printf("I am at %d,%d %x\n", x,y, zz);
+ which = zz & Mmask;
+
+ if (which == 0x3) { // pointer value 3: continue in a previously saved neck chain
+ help = DMgetNeck(mydm, x, y, inrun);
+ if (!help) {
+ if (i > 2)
+ fprintf (stderr, "PROBLEM %d %d after %d (norm %d, %d)\n", x, y,i, normf, normprev);
+ free(almt);
+ res->algn = 0; // NOTE(review): algnlen is set to i below while algn is NULL
+ res->algnlen = i;
+ return res;
+ }
+ /* if (! help->nextalign)
+ fprintf (stderr, "check %d %d after %d\n", x, y,i);
+ */
+ help->dirty++;
+ res->nextalign = help;
+ break;
+ }
+
+
+ if (inrun == 1 && (zz & Nmask)) // state 1 and its continue bit is set: stay in state 1
+ which = 1;
+ else if (inrun == 2 && (zz & Omask)) // state 2 and its continue bit is set: stay in state 2
+ which = 2;
+ else
+ which = 0;
+
+
+ /*
+ if (inrun == 1) {
+ if (zz & Nmask) {
+ which = 1;
+ }
+ }
+ else if (inrun == 2) {
+ if (zz & Omask) {
+ which = 2;
+ }
+ }
+ */
+
+ if (which == 0) {
+ inrun = zz & Mmask; // next state comes from the stored pointer bits
+ almt[i++] = BOTH;
+ zz = DMgetPtr(mydm,--x,--y); // step diagonally
+ }
+
+ else if (which == 1) { /*N*/
+ inrun = 1;
+ almt[i++] = INSERTION;
+ zz = DMgetPtr(mydm, --x, y); // step back in x only
+ }
+
+ else if (which == 2) {
+ inrun = 2;
+ almt[i++] = DELETION;
+ zz = DMgetPtr(mydm, x, --y); // step back in y only
+ }
+ else
+ printf("a really dumb error %d\n", i);
+
+ if (i >= almtsize) { // grow the step buffer geometrically
+ almt = realloc (almt, sizeof(long long int)* (almtsize *= 2)); // NOTE(review): realloc result is not checked
+ }
+ // printf ("retrace %d %d after %d\n", x, y,i);
+
+ } while (x > 0 && y > 0);
+ reverse(almt, i); // steps were collected back-to-front
+
+ // fprintf(stderr, "getChain done at %d %d after %d\n", x , y , i);
+ // printf("gotChain\n");
+ res->algn = almt;
+ res->algnlen = i;
+ // printf("done w it\n");
+ return res;
+}
+
+
+/*
+ * Stores traceback chains for every cell of diagonals `neckdiag` and
+ * `neckdiag - 1`: for each cell and each of the three states, the chain from
+ * getChain is handed to DMsetNeck.  The diagonal `neckdiag` is processed
+ * first.  Also rolls normf into normprev and refreshes normf from
+ * DMnextNecks.
+ */
+void saveNeck(dmat* mydm, int neckdiag) {
+    int prevlen, curlen, px, py, cx, cy;
+    int left, state;
+    align* chain;
+
+    /* look up both diagonals before the neck bookkeeping is advanced */
+    DMgetDiagStart(mydm, neckdiag-1, &prevlen, &px, &py);
+    DMgetDiagStart(mydm, neckdiag, &curlen, &cx, &cy);
+
+    // printf("saving neck %d\n", neckdiag);
+    normprev = normf;
+    normf = DMnextNecks(mydm, neckdiag);
+
+    for (left = curlen; left > 0; left--) {
+        for (state = 0; state < 3; state++) {
+            chain = getChain(mydm, cx, cy, state);
+            DMsetNeck(mydm, chain, cx, cy, state);
+        }
+        cx++;
+        cy--;
+    }
+
+    for (left = prevlen; left > 0; left--) {
+        for (state = 0; state < 3; state++) {
+            chain = getChain(mydm, px, py, state);
+            DMsetNeck(mydm, chain, px, py, state);
+        }
+        px++;
+        py--;
+    }
+}
+
+/*
+ * Flattens the chain a -> a->nextalign -> ... into `a` itself.  The pieces
+ * are copied into one new buffer back to front (the head's columns end up
+ * last), followed by a terminating 0 word.  a->algn is replaced (the old
+ * buffer is freed), a->algnlen becomes the total length and a->nextalign is
+ * cleared.  The other pieces of the chain are not freed here.
+ */
+void joinAligns (align* a) {
+    align *piece;
+    long long int *joined, *cursor;
+    int total = 0;
+
+    for (piece = a; piece; piece = piece->nextalign)
+        total += piece->algnlen;
+
+    joined = malloc ((total+1)*sizeof(long long int));
+    assert (joined);
+    joined[total] = 0;
+
+    /* fill from the end of the buffer towards the front */
+    cursor = joined + total;
+    for (piece = a; piece; piece = piece->nextalign) {
+        cursor -= piece->algnlen;
+        memcpy(cursor, piece->algn, piece->algnlen*sizeof(long long int));
+    }
+
+    free (a->algn);
+    a->algn = joined;
+    a->algnlen = total;
+    a->nextalign = 0;
+}
+
+/*
+ * Gap score for one column: for each of gap-continue, gap-start and gap-end,
+ * the smaller of (count, numseq - count) is multiplied by the matching
+ * global penalty (gapcont, gapstart, gapend) and the three products are
+ * summed.
+ */
+inline int scoreGap(int numgs, int numgc, int numge, int numseq) {
+    int ncont = MIN2(numgc, numseq-numgc);
+    int nstart = MIN2(numgs, numseq-numgs);
+    int nend = MIN2(numge, numseq-numge);
+
+    return (ncont * gapcont) + (nstart * gapstart) + (nend * gapend);
+}
+
+/*
+ * Debug helper: prints to stderr the matchcache entries for every
+ * combination of four counts in 0..2, with the last count varying fastest.
+ */
+void printcache(){
+    int combo;
+
+    /* 3^4 = 81 combinations, enumerated as base-3 digits a b c d */
+    for (combo = 0; combo < 81; combo++){
+        int a = combo / 27;
+        int b = (combo / 9) % 3;
+        int c = (combo / 3) % 3;
+        int d = combo % 3;
+
+        fprintf (stderr, "%d %d %d %d -- %d\n", a, b, c, d, matchcache[a | (b << 6) | (c << 12) | (d << 18)]);
+    }
+}
+
+/*
+ * Returns the next non-whitespace character read from `file`.
+ * Reaching end of file (or a read error) before such a character is found
+ * is treated as a fatal format error: assert(0).
+ *
+ * The original tested feof() before reading and stored fgetc()'s result in
+ * a char, so at end of file it returned (char)EOF as if it were a letter
+ * instead of reaching the assert.  It also passed a possibly negative char
+ * to isspace().  The result is now kept in an int, compared with EOF, and
+ * cast to unsigned char for isspace().
+ */
+char getLetter (FILE *file){
+    int c;
+
+    while ((c = fgetc (file)) != EOF){
+        if (!isspace ((unsigned char) c)){
+            // fprintf (stderr, "LETTER READ: \"%c\"\n", c);
+            return (char) c;
+        }
+    }
+
+    assert (0);
+    return 0;
+}
+
+int readit = 0;
+
+/*
+ * Loads the substitution matrix and gap penalties, once per process (later
+ * calls return immediately because of the global `readit` flag).
+ *
+ * The file is `nucmatrixfile` when that global is set, otherwise
+ * "$LAGAN_DIR/<filename>".  Expected layout: `size` symbols, then `size`
+ * rows each starting with the row symbol followed by `size` integers, then
+ * two integers: gap-open and gap-continue.
+ *
+ * Side effects: clears and fills substmatrix; sets the globals gapcont,
+ * gapend (half of the gap-open value) and gapstart (the remainder).
+ *
+ * Fixes relative to the original:
+ *   - a missing LAGAN_DIR is reported instead of passing NULL to "%s";
+ *   - the path is built with snprintf and checked for truncation;
+ *   - every fscanf result is checked, so a short or malformed file can no
+ *     longer store an uninitialised value in the matrix or leave the gap
+ *     penalties untouched.
+ */
+void readSubstMatrix (char *filename, int size, int substmatrix[256][256]){
+    FILE *file;
+    char line[1024];
+    unsigned char *symbs, ch;
+    const char *dir;
+    int i, j, k, got;
+
+    if (readit) return;
+    readit = 1;
+
+    if (!nucmatrixfile) {
+        dir = getenv ("LAGAN_DIR");
+        if (!dir) {
+            fprintf (stderr, "LAGAN_DIR is not set; cannot locate %s\n", filename);
+            exit (1);
+        }
+        got = snprintf (line, sizeof (line), "%s/%s", dir, filename);
+        if (got < 0 || got >= (int) sizeof (line)) {
+            fprintf (stderr, "Path to %s is too long\n", filename);
+            exit (1);
+        }
+        file = fopen (line, "r"); assert (file);
+    }
+    else {
+        file = fopen (nucmatrixfile, "r"); assert (file);
+    }
+
+    for (i = 0; i < 256; i++){
+        for (j = 0; j < 256; j++){
+            substmatrix[i][j] = 0;
+        }
+    }
+
+    symbs = (unsigned char *) malloc (sizeof (unsigned char) * size); assert (symbs);
+    /* header row: the column symbols */
+    for (i = 0; i < size; i++) symbs[i] = (unsigned char) getLetter (file);
+    for (i = 0; i < size; i++){
+        /* each row must start with the matching symbol */
+        ch = getLetter (file);
+        assert (ch == symbs[i]);
+        for (j = 0; j < size; j++){
+            got = fscanf (file, "%d", &k);
+            assert (got == 1);
+            // fprintf (stderr, "NUMBER READ: %d\n", k);
+            substmatrix[(int) symbs[i]][(int) symbs[j]] = k;
+            assert ((int) symbs[i] > 0);
+            assert ((int) symbs[j] > 0);
+        }
+    }
+
+    got = fscanf (file, "%d", &gapstart);
+    assert (got == 1);
+    got = fscanf (file, "%d", &gapcont);
+    assert (got == 1);
+    // fprintf (stderr, "GAP SCORES: %d %d\n", gapstart, gapcont);
+    /* split the gap-open penalty between gap start and gap end */
+    gapend = gapstart / 2;
+    gapstart -= gapend;
+
+    free (symbs);
+    fclose (file);
+}
+
+/* Looks up the substitution score for the byte pair (a, b). */
+inline int chmatchscore (unsigned char a, unsigned char b, int substmatrix[256][256]) {
+    const int *row = substmatrix[a];
+
+    return row[b];
+}
+
+/*
+ * Fills the two lookup tables used by the column scorers.
+ *
+ * matchcache[nA | nT<<6 | nC<<12 | nG<<18] is the sum-of-pairs substitution
+ * score of a column holding nA/nT/nC/nG copies of each base, for all counts
+ * in 0..MAX_SEQ.  gapcache[gs | gc<<6 | ge<<12 | ns<<18] is
+ * scoreGap(gs, gc, ge, ns) over the same range.  The substitution matrix is
+ * loaded first through readSubstMatrix.
+ */
+void buildcache (){
+    char *bases = "ATCG";
+    int pair[4][4];
+    int cnt[4];
+    int limit = MAX_SEQ;
+    int p, q, total, key;
+    int gs, gc, ge, ns;
+
+    readSubstMatrix (NUC_FILE, NUC_FILE_SIZE, substmatrix);
+
+    /* the pairwise scores do not change inside the loops below */
+    for (p = 0; p < 4; p++)
+        for (q = 0; q < 4; q++)
+            pair[p][q] = chmatchscore ((unsigned char) bases[p], (unsigned char) bases[q], substmatrix);
+
+    for (cnt[0] = 0; cnt[0] <= limit; cnt[0]++){          // A
+        for (cnt[1] = 0; cnt[1] <= limit; cnt[1]++){      // T
+            for (cnt[2] = 0; cnt[2] <= limit; cnt[2]++){  // C
+                for (cnt[3] = 0; cnt[3] <= limit; cnt[3]++){ // G
+                    total = 0;
+                    for (p = 0; p < 4; p++){
+                        /* pairs of identical bases */
+                        total += cnt[p] * (cnt[p] - 1) / 2 * pair[p][p];
+                        /* pairs of different bases */
+                        for (q = p + 1; q < 4; q++)
+                            total += cnt[p] * cnt[q] * pair[p][q];
+                    }
+                    key = cnt[0] | (cnt[1] << 6) | (cnt[2] << 12) | (cnt[3] << 18);
+                    matchcache[key] = total;
+                }
+            }
+        }
+    }
+
+    for (gs = 0; gs <= limit; gs++){
+        for (gc = 0; gc <= limit; gc++){
+            for (ge = 0; ge <= limit; ge++){
+                for (ns = 0; ns <= limit; ns++){
+                    key = gs | (gc << 6) | (ge << 12) | (ns << 18);
+                    gapcache[key] = scoreGap (gs, gc, ge, ns);
+                }
+            }
+        }
+    }
+}
+
+/*
+ * Range check for a cache index component: returns y unchanged when it lies
+ * in 0..MAX_SEQ, otherwise reports the value on stderr and asserts.
+ */
+inline int v (int y){
+    if (y < 0 || y > MAX_SEQ) {
+        fprintf(stderr, "Got %d in v\n", y);
+        assert (0);
+        return 0;
+    }
+    return y;
+}
+
+/*
+ * Score for placing column `ai` of `a` against column `bi` of `b`: the
+ * matchcache entry for the summed base counts plus the gapcache entry for
+ * the summed gap-start / gap-continue / gap-end counts, with the fourth
+ * gapcache field being the total sequence count minus the summed CNTS_CB
+ * counts.
+ */
+inline int matchscore (align*a, int ai, align *b, int bi){
+    int n0 = v(a->cnts[0][ai] + b->cnts[0][bi]);
+    int n1 = v(a->cnts[1][ai] + b->cnts[1][bi]);
+    int n2 = v(a->cnts[2][ai] + b->cnts[2][bi]);
+    int n3 = v(a->cnts[3][ai] + b->cnts[3][bi]);
+    int gs = v(a->cnts[CNTS_GS][ai] + b->cnts[CNTS_GS][bi]);
+    int gc = v(a->cnts[CNTS_GC][ai] + b->cnts[CNTS_GC][bi]);
+    int ge = v(a->cnts[CNTS_GE][ai] + b->cnts[CNTS_GE][bi]);
+    int ns = v(a->numseq + b->numseq - (a->cnts[CNTS_CB][ai] + b->cnts[CNTS_CB][bi]));
+
+    return matchcache[n0 | (n1 << 6) | (n2 << 12) | (n3 << 18)] +
+           gapcache[gs | (gc << 6) | (ge << 12) | (ns << 18)];
+}
+
+/*
+ * Returns the matchcache entry for the base counts of column `ow` of
+ * `other` alone.  `oppnum` is not used.
+ */
+inline int scoreOpp (align *other, int ow, int oppnum){
+    int n0 = v(other->cnts[0][ow]);
+    int n1 = v(other->cnts[1][ow]);
+    int n2 = v(other->cnts[2][ow]);
+    int n3 = v(other->cnts[3][ow]);
+
+    return matchcache[n0 | (n1 << 6) | (n2 << 12) | (n3 << 18)];
+}
+
+/*
+ * gapcache lookup that uses only the gap-end field (the summed CNTS_GE
+ * counts of both columns) and the sequence-count field (total sequences
+ * minus the summed CNTS_CB counts).
+ */
+inline int endGap0 (align* a, int ai, align* b, int bi){
+    int ge = v(a->cnts[CNTS_GE][ai]+b->cnts[CNTS_GE][bi]);
+    int ns = v(a->numseq + b->numseq-(b->cnts[CNTS_CB][bi]+a->cnts[CNTS_CB][ai]));
+
+    return gapcache[(ge << 12) | (ns << 18)];
+}
+
+/*
+ * Like endGap0, but the gap-end field is the number of sequences of `b`
+ * that are neither starting nor continuing a gap at `bi`, plus the CNTS_GE
+ * count of `a` at `ai`.
+ */
+inline int endGap1 (align* a, int ai, align* b, int bi){
+    int bletters = b->numseq - b->cnts[CNTS_GS][bi] - b->cnts[CNTS_GC][bi];
+    int ge = v(bletters + a->cnts[CNTS_GE][ai]);
+    int ns = v(a->numseq + b->numseq - (b->cnts[CNTS_CB][bi]+a->cnts[CNTS_CB][ai]));
+
+    return gapcache[(ge << 12) | (ns << 18)];
+}
+
+/*
+ * Like endGap0, but the gap-end field is the number of sequences of `a`
+ * that are neither starting nor continuing a gap at `ai`, plus the CNTS_GE
+ * count of `b` at `bi`.
+ */
+inline int endGap2 (align* a, int ai, align* b, int bi){
+    int aletters = a->numseq - a->cnts[CNTS_GS][ai] - a->cnts[CNTS_GC][ai];
+    int ge = v(aletters + b->cnts[CNTS_GE][bi]);
+    int ns = v(a->numseq + b->numseq - (b->cnts[CNTS_CB][bi]+a->cnts[CNTS_CB][ai]));
+
+    return gapcache[(ge << 12) | (ns << 18)];
+}
+
+/*
+ * Score for continuing a gap across all of `ali` opposite column `ow` of
+ * `other`: a gapcache lookup in which every sequence of `ali` is added to
+ * the gap-continue count of the other column, plus the precomputed
+ * sopp[ow].
+ */
+inline int contGap(align* ali, int myw, align* other, int ow, int *sopp) {
+    int gs = v(other->cnts[CNTS_GS][ow]);
+    int gc = v(ali->numseq + other->cnts[CNTS_GC][ow]);
+    int ge = v(other->cnts[CNTS_GE][ow]);
+    int ns = v(ali->numseq + other->numseq - (ali->cnts[CNTS_CB][myw] + other->cnts[CNTS_CB][ow]));
+
+    return gapcache[gs | (gc << 6) | (ge << 12) | (ns << 18)] + sopp[ow];
+}
+
+/*
+ * Score for opening a gap across `ali` at its column `w`, opposite column
+ * `ow` of `other`.  Sequences of `ali` counted under CNTS_GC or CNTS_GE at
+ * `w` go into the gap-continue field; the remaining ones (minus CNTS_CB) go
+ * into the gap-start field.  `sopp` and `desc` are not used.
+ */
+inline int openGap(align* ali, int w, align* other, int ow, int *sopp, char *desc) {
+    int inrun, gs, gc, ge, ns;
+
+    inrun = ali->cnts[CNTS_GC][w] + ali->cnts[CNTS_GE][w];
+    /**
+     * Watch out for running off end of array.
+     */
+    // if (w < ali->algnlen) inrun += ali->cnts[CNTS_GS][w+1];
+
+    gs = v(ali->numseq - (inrun + ali->cnts[CNTS_CB][w]) + other->cnts[CNTS_GS][ow]);
+    gc = v(inrun + other->cnts[CNTS_GC][ow]);
+    ge = v(other->cnts[CNTS_GE][ow]);
+    ns = v(ali->numseq+other->numseq - (ali->cnts[CNTS_CB][w]+other->cnts[CNTS_CB][ow]));
+
+    return gapcache[gs | (gc << 6) | (ge << 12) | (ns << 18)];
+}
+
+
+void mkBarrel(int s1, int s2, int e1, int e2, int width, int *dn, int dt, int* starts, int *ends, dmat* mydm) { /*
+ * Writes starts[d] / ends[d] for every diagonal d from *dn up to dt-1 as a
+ * band of half-width `width` around a running position `elem`, which is
+ * stepped along a line whose slope is (e2-s2)/(e1-s1).  The first loop only
+ * advances `elem` from diagonal s1+s2-1 up to *dn without writing anything.
+ * *dn is left equal to dt (when it was smaller).  Bounds are clamped to
+ * [0, dlen-1], where dlen is derived from mydm->d1 and mydm->d2.
+ * NOTE(review): the exact geometric meaning of elem/cloc/fl is not evident
+ * from this function alone -- confirm against diagmatrix.c. */
+ int sd = s1+s2-1, dlen;
+ int elem = (sd < mydm->d2)? s1: mydm->d2-s2;
+ int incr;
+ double fl = 0;
+ double slope = (double)(e2-s2)/(double)(e1-s1); // may divide by zero; both zero-denominator cases are overridden just below
+ double cloc = elem;
+
+ if ((e2-s2 == 0) && (e1-s1 == 0))
+ slope = 1;
+ else if (e1-s1 == 0)
+ slope = 100000;
+ // // printf("dt = %d\n", dt);
+ // printf("BA: %d, %d to %d, %d %f\n", s1,s2,e1,e2,slope);
+ for ( ; sd <(*dn); sd++) { // catch up to the first diagonal to be written
+ if (fl>=slope || (int)(cloc) == (int)(cloc+slope)) {
+ cloc+=slope;
+ fl -= slope;
+ }
+ else {
+ elem--;
+ fl++;
+ }
+ if (sd <= mydm->d2)
+ elem++;
+ }
+ fl = 0;
+ for ( ; *dn < dt; (*dn)++) {
+ // // printf("dn =%d ", *dn);
+ if (fl>=slope || (int)(cloc) == (int)(cloc+slope)) {
+ cloc+=slope;
+ fl -= slope;
+ }
+ else {
+ elem -=1;
+ fl++;
+ }
+ if (*dn <= mydm->d2)
+ elem++;
+
+ if (*dn < MIN2(mydm->d2, mydm->d1)) // dlen is chosen from *dn, d1 and d2 in three ranges
+ dlen = *dn;
+ else if (*dn < MAX2(mydm->d2, mydm->d1))
+ dlen = MIN2(mydm->d2, mydm->d1);
+ else
+ dlen = mydm->d2 + mydm->d1 - *dn;
+ starts[*dn] = MAX2(elem - width, 0); // clamp the band to the diagonal
+ ends[*dn] = MIN2(elem+width, dlen-1);
+ }
+}
+
+
+
+void mkSquare(int s1, int s2, int e1, int e2, int *dn, int dt, int* starts, int *ends, dmat* mydm) { /*
+ * Writes starts[d] / ends[d] for diagonals *dn .. dt-1 from the rectangle
+ * (s1,s2)-(e1,e2), leaving *dn equal to dt.  Before that it walks backwards
+ * from diagonal *dn-1, widening the entries already stored there (MIN of the
+ * starts, MAX of the ends), and stops at the first diagonal where the
+ * rectangle's own span is 5 or less.  A rectangle whose area exceeds
+ * MAX_SQ_SIZE is handled as two recursive calls on halves that overlap by
+ * glwidth on each side of the midpoint.
+ * NOTE(review): the back-fill reads starts[dn2] / ends[dn2]; doAncs
+ * allocates both arrays without initialising them, so the first read may
+ * see indeterminate values -- confirm.  The back-fill also has no explicit
+ * lower bound on dn2. */
+ int dists[2], dlen;
+ long long int size = ((long long int)e1-(long long int)s1)
+ * ((long long int)e2-(long long int)s2);
+ int dn2;
+ int eval, sval;
+
+ if (size > MAX_SQ_SIZE) {
+ fprintf (stderr, "SQUARE TOO BIG: %d,%d to %d,%d\n", s1, e1,s2,e2); // note: arguments are printed in the order s1, e1, s2, e2
+ mkSquare(s1, s2, (s1+e1)/2+glwidth, (s2+e2)/2+glwidth, dn, (*dn+dt)/2, starts, ends, mydm);
+ mkSquare((s1+e1)/2-glwidth, (s2+e2)/2-glwidth, e1, e2, dn, dt, starts, ends, mydm);
+ return;
+ }
+ // // printf("dt = %d\n", dt);
+ // // printf("SQ: %d, %d to %d, %d\n", s1,s2,e1,e2);
+
+ // fill in part before square
+ dn2 = *dn - 1;
+ while (1){
+ if (dn2 < mydm->d2) {
+ dists[0] = s1-1;
+ dists[1] = dn2 - e2;
+ }
+ else {
+ dists[0] = mydm->d2 - e2;
+ dists[1] = s1 - (dn2 - mydm->d2)-1;
+ }
+ starts[dn2] = MIN2(starts[dn2], sval = MAX3(dists[0], dists[1],0)); // sval is assigned inside a macro argument that MIN2 may expand twice
+
+ if (dn2 < mydm->d2) {
+ dists[0] = e1-1;
+ dists[1] = dn2 - s2;
+ }
+ else {
+ dists[0] = mydm->d2 - s2;
+ dists[1] = e1 - (dn2-mydm->d2)-1;
+ }
+ if (dn2 < MIN2(mydm->d2, mydm->d1))
+ dlen = dn2;
+ else if (dn2 < MAX2(mydm->d2, mydm->d1))
+ dlen = MIN2(mydm->d2, mydm->d1);
+ else
+ dlen = mydm->d2 + mydm->d1 - dn2;
+ ends[dn2] = MAX2(ends[dn2], eval = MIN3(dists[0], dists[1],dlen-1));
+ if (eval - sval <= 5) break; // break after fill in
+ dn2--;
+ }
+
+ for ( ; *dn < dt; (*dn)++) { // forward fill: same formulas, stored without merging
+ // // printf("square dn = %d\n", *dn);
+ if (*dn < mydm->d2) {
+ dists[0] = s1-1;
+ dists[1] = *dn - e2;
+ }
+ else {
+ dists[0] = mydm->d2 - e2;
+ dists[1] = s1 - (*dn - mydm->d2)-1;
+ }
+ starts[*dn] = MAX3(dists[0], dists[1],0);
+
+ if (*dn < mydm->d2) {
+ dists[0] = e1-1;
+ dists[1] = *dn - s2;
+ }
+ else {
+ dists[0] = mydm->d2 - s2;
+ dists[1] = e1 - (*dn-mydm->d2)-1;
+ }
+ if (*dn < MIN2(mydm->d2, mydm->d1))
+ dlen = *dn;
+ else if (*dn < MAX2(mydm->d2, mydm->d1))
+ dlen = MIN2(mydm->d2, mydm->d1);
+ else
+ dlen = mydm->d2 + mydm->d1 - *dn;
+ ends[*dn] = MIN3(dists[0], dists[1],dlen-1);
+ }
+}
+
+/*
+ * Fills starts[] / ends[] for the whole matrix from the anchor list `myres`.
+ * For every usable anchor a rectangle (mkSquare) is laid from the end of the
+ * previous anchor to the start of this one, widened by
+ * MAX2(overlap, glwidth), followed by a band (mkBarrel) along the anchor
+ * itself.  A final rectangle runs from the last anchor to (d1, d2).
+ *
+ * An anchor is skipped when any coordinate is below 1 or not below the
+ * matrix dimension, when a start is not below its end, or when its two
+ * spans differ in length by more than MISMATCH_CUTOFF.
+ */
+void doShapes(hll* myres, dmat* mydm, int* starts, int *ends) {
+    int width = glwidth;
+    int margin = MAX2(overlap, width);
+    int prev1 = margin + 1, prev2 = margin + 1;
+    int cur1, cur2;
+    int dn = 1, dt;
+
+    for ( ; myres; myres = myres->next) {
+        /* discard anchors that are out of range, empty or too skewed */
+        if (myres->seq1start < 1 || myres->seq2start < 1 ||
+            myres->seq1end < 1 || myres->seq2end < 1)
+            continue;
+        if (myres->seq1start >= mydm->d1 || myres->seq2start >= mydm->d2)
+            continue;
+        if (myres->seq1start >= myres->seq1end || myres->seq2start >= myres->seq2end)
+            continue;
+        if (myres->seq1end >= mydm->d1 || myres->seq2end >= mydm->d2)
+            continue;
+        if (abs((myres->seq1end-myres->seq1start) -
+                (myres->seq2end-myres->seq2start)) > MISMATCH_CUTOFF)
+            continue;
+
+        /* between hits */
+        cur1 = myres->seq1start;
+        cur2 = myres->seq2start;
+        dt = cur1 + cur2 - 1 + overlap;
+        mkSquare(prev1-margin, prev2-margin,
+                 cur1+margin, cur2+margin,
+                 &dn, dt, starts, ends, mydm);
+
+        /* within a hit */
+        prev1 = myres->seq1end;
+        prev2 = myres->seq2end;
+        dt = prev1 + prev2 - 1 - overlap;
+        mkBarrel(cur1, cur2, prev1, prev2, width, &dn, dt, starts, ends, mydm);
+    }
+
+    /* from the last hit to the far corner */
+    cur1 = mydm->d1;
+    cur2 = mydm->d2;
+    dt = cur1 + cur2;
+    mkSquare(prev1-margin, prev2-margin, cur1, cur2, &dn, dt, starts, ends, mydm);
+}
+
+
+/*
+ * Computes the per-diagonal band limits from the anchors (doShapes) into two
+ * temporary arrays of algnlen1 + algnlen2 + 2 ints, passes them to
+ * DMinitDiag and releases them.
+ */
+void doAncs(dmat* mydm, align* ali1, align* ali2, hll* ancs) {
+    int ndiag = ali1->algnlen + ali2->algnlen+2;
+    int *lo, *hi;
+
+    lo = (int*) malloc(sizeof(int)*ndiag);
+    assert (lo);
+    hi = (int*) malloc(sizeof(int)*ndiag);
+    assert (hi);
+
+    doShapes(ancs, mydm, lo, hi);
+    DMinitDiag(mydm, lo, hi);
+
+    free(lo);
+    free(hi);
+}
+
+
+align* doNW(dmat* mydm, align* ali1, align* ali2) { /* Dynamic-programming
+ * alignment of ali1 against ali2 over the diagonals of `mydm`.  Each cell
+ * keeps three scores (M, N, O); a pointer byte per cell records which
+ * predecessor state gave the best M (low bits 0/1/2) and whether N (bit 4)
+ * or O (bit 8) was reached by continuing a gap rather than opening one.
+ * On diagonals flagged by DMnextDiag, saveNeck stores partial chains.
+ * Returns the chain traced back from (d1, d2) in state 0, flattened with
+ * joinAligns, with score set to the best of M/N/O of the last cell visited.
+ * Calls buildcache() on every invocation. */
+ int i, j;
+ int x, y, size;
+ int gapstartN = 0, gapstartO = 0;
+ int gapcontN, gapcontO;
+ int gapend[3]; // NOTE: shadows the global gapend and is never used
+ int tt, prevgap;
+ alel *curr, *pasts0, *pasts1, *pasts2;
+ align* a, *b;
+ char rh, ptr=0, isneck;
+ int ndiags = mydm->d1 + mydm->d2 -1;
+ int *sopp1, *sopp2;
+ int numNecks =0, oldneck =0;
+ register int s1, s2, s3, z1, z2,z3;
+
+ // int M[20][20][6];
+
+
+ isneck = DMnextDiag(mydm);
+ curr = DMgetDiagStart(mydm, 1, &size, &x, &y);
+ curr->N = curr->O = 0; // first cell of diagonal 1 starts with all three scores at 0
+ curr->M = 0;
+ DMsetPtr(mydm, 0, 1, 1);
+
+ buildcache();
+
+ sopp1 = (int*) malloc (sizeof (int) * (ali1->algnlen+1));
+ sopp2 = (int*) malloc (sizeof (int) * (ali2->algnlen+1));
+ assert (sopp1); assert (sopp2);
+
+ for (i = 0; i < ali1->algnlen; i++) sopp1[i] = scoreOpp (ali1, i, 0); // per-column score of each alignment on its own
+ for (i = 0; i < ali2->algnlen; i++) sopp2[i] = scoreOpp (ali2, i, 0); // NOTE(review): entry [algnlen] of both arrays is allocated but not set
+
+ /*fprintf (stderr, "Checking diagonals...\n");
+ for (i = ndiags - 50; i <= ndiags; i++){
+ DMgetDiagStart (mydm, i, &size, &x, &y); */
+
+ // fprintf (stderr, "ndiag = %d (%d %d)\n", ndiags, ali1->algnlen, ali2->algnlen);
+
+ for (i = 2; i <= ndiags; i++) {
+ isneck = DMnextDiag(mydm);
+ if (!(i%10000))
+ fprintf(stderr, "WORKING %d/%d\n", i/10000,ndiags/10000 );
+
+ curr = DMgetDiagStart(mydm, i, &size, &x, &y);
+ pasts2 = DMgetElem(mydm, x-1, y); // neighbour at (x-1, y)
+ pasts1 = DMgetElem(mydm, x-1, y-1); // neighbour at (x-1, y-1)
+
+ for (j = 0; j < size; j++) {
+ gapstartN = openGap(ali2, y, ali1, x, sopp1, "gapstartN");
+ gapstartO = openGap(ali1, x, ali2, y, sopp2, "gapstartO");
+
+ gapcontN = contGap(ali2, y, ali1, x-1, sopp1);
+ gapcontO = contGap(ali1, x, ali2, y-1, sopp2);
+
+ pasts0 = pasts2; // keep the (x-1, y) element before moving on
+ pasts2 = DMgetElem2(mydm, x, y-1, pasts2); // neighbour at (x, y-1)
+
+ curr->M = matchscore (ali1, x - 1, ali2, y - 1);
+
+ z1 = pasts1->M + endGap0 (ali1, x - 1, ali2, y - 1);
+ z2 = pasts1->N + endGap1 (ali1, x - 1, ali2, y - 1);
+ z3 = pasts1->O + endGap2 (ali1, x - 1, ali2, y - 1);
+
+ if (z1 >= z2){
+ if (z1 >= z3){ curr->M += z1; ptr = 0; }// + endGap0 (ali1, x - 0, ali2, y - 0); }
+ else { curr->M += z3; ptr = 2; }// + endGap2 (ali1, x - 0, ali2, y - 0); }
+ }
+ else {
+ if (z2 >= z3){ curr->M += z2; ptr = 1; } // + endGap1 (ali1, x - 0, ali2, y - 0); }
+ else { curr->M += z3; ptr = 2; } // + endGap2 (ali1, x - 0, ali2, y - 0); }
+ }
+
+ s2 = pasts0->N + gapcontN;
+ s3 = pasts2->O + gapcontO;
+
+ s1 = curr->M + gapstartN; // opening a gap is scored from this cell's own M
+ if (s1 >= s2){ curr->N = s1; }
+ else { curr->N = s2; ptr |= 4; }
+ s1 = curr->M + gapstartO;
+ if (s1 >= s3){ curr->O = s1; }
+ else { curr->O = s3; ptr |= 8; }
+
+ DMsetPtr(mydm, ptr, x, y);
+
+ curr++; x++; y--; // next cell along the diagonal
+
+ pasts1 = DMgetElem2(mydm, x-1, y-1, pasts1);
+ }
+ if (isneck) {
+ numNecks++;
+ saveNeck(mydm, i);
+ oldneck = i;
+ }
+ }
+
+ free (sopp1);
+ free (sopp2);
+
+ mydm->currneck++;
+ a = getChain(mydm, mydm->d1, mydm->d2, 0);
+ curr--; // step back to the last cell that was filled
+ a->score = MAX3(curr->M, curr->N, curr->O);
+ freed[a->num] = 1; // flag the result so makeAlign's cleanup loop skips it
+ joinAligns(a);
+
+
+
+ // fprintf(stderr, "done NW\n");
+ return a;
+}
+
+/*
+ * Aligns two alignments.  The global gap penalties are multiplied by
+ * (total number of sequences - 1) for the duration of the call and restored
+ * before returning.  The pairwise column chain produced by doNW is stored
+ * in *uni, and the merged alignment built by unifyAlign is returned.  All
+ * intermediate chains registered by getChain and not flagged as kept are
+ * released here.
+ */
+align* makeAlign(align* ali1, align* ali2, hll* anchors, align **uni) {
+    int savedstart = gapstart, savedcont = gapcont, savedend = gapend;
+    int scale = ali1->numseq + ali2->numseq - 1;
+    int slot;
+    dmat *matrix;
+    align *merged;
+
+    matrix = makeDM(ali1->algnlen, ali2->algnlen);
+
+    gapstart *= scale;
+    gapend *= scale;
+    gapcont *= scale;
+    fprintf (stderr, "gs ge gc %d %d %d\n", gapstart, gapend, gapcont);
+
+    doAncs(matrix, ali1, ali2, anchors);
+
+    /* fresh bookkeeping tables for the chains getChain will create */
+    freedsize = 0;
+    freedcap = 1;
+    freed = (int *) malloc (sizeof (int) * freedcap);
+    freedptr = (align **) malloc (sizeof (align *) * freedcap);
+    assert (freed);
+    assert (freedptr);
+
+    *uni = doNW(matrix, ali1, ali2);
+    merged = unifyAlign(ali1, ali2, *uni);
+    freeDM(matrix);
+
+    /* release every chain that was not flagged, newest first */
+    for (slot = freedsize; slot-- > 0; ) {
+        if (freed[slot])
+            continue;
+        freeAlign (freedptr[slot]);
+        freedptr[slot] = 0;
+    }
+    free (freed);
+    free (freedptr);
+    freed = 0;
+
+    gapstart = savedstart;
+    gapend = savedend;
+    gapcont = savedcont;
+
+    return merged;
+}
+
+/*
+ * Wraps a single sequence in a one-sequence alignment: every column word is
+ * 1 (the sequence is present), followed by a terminating 0 word, and the
+ * per-column counters are filled from the letters.  A letter with index
+ * k < 5 in `alpha` increments cnts[k]; a column that follows a letter of
+ * index 4 ('.') additionally increments cnts[4].
+ *
+ * Fix relative to the original: a letter that does not occur in `alpha`
+ * made strchr() return NULL, and the resulting pointer difference was used
+ * as a counter index (out-of-bounds write whenever it compared below 5).
+ * Such letters are now treated like 'N' (index 5, not counted).
+ */
+align* mkSimAlign(seq* seq1) {
+    const int unknown = 5;  /* index of 'N' in alpha: not counted */
+    int i,j,k,oldk=-1;
+    char *hit;
+    align* res = (align*) malloc( sizeof(align));
+    assert (res);
+
+    res->score = 0;
+    res->nextalign = 0;
+    res->dirty = 0;
+    res->numseq = 1;
+    res->algnlen = seq1->numlets;
+    res->seqs[0] = seq1;
+
+    /**
+     * Evidence that you need one more character.
+     */
+    res->algn = (long long int*) malloc((res->algnlen+1) * sizeof(long long int));
+    assert (res->algn);
+    for (j=0; j<CNTS_LEN; j++){
+        res->cnts[j] = (char*) malloc((res->algnlen+1) * sizeof(char));
+        assert (res->cnts[j]);
+    }
+    for (i=0; i< res->algnlen;i++) {
+        for (j=0; j<CNTS_LEN; j++)
+            res->cnts[j][i] = 0;
+        res->algn[i] = 1;
+        hit = strchr(alpha,seq1->lets[i]);
+        k = hit ? (int)(hit - alpha) : unknown;
+        if (k<5)
+            res->cnts[k][i]++;
+        if (oldk == 4)
+            res->cnts[4][i]++;
+        oldk = k;
+    }
+    /* terminating column: all counters and the column word are zero */
+    for (j=0; j<CNTS_LEN; j++)
+        res->cnts[j][i] = 0;
+    res->algn[i] = 0;
+    return res;
+}
+
+
+/*
+ * Returns a new alignment equal to `ali` with sequence `seqnum` removed.
+ * Columns that held only the removed sequence are dropped; the remaining
+ * column words are compacted (bits above `seqnum` move down by one) and the
+ * per-column counters are adjusted for the removed letter or gap.  After
+ * the first dropped column, the gap counters of every following column are
+ * recomputed from the new column words.
+ *
+ * Fixes relative to the original:
+ *   - algn[] and cnts[] are allocated with algnlen + 1 entries; the original
+ *     allocated algnlen and then wrote the terminating entry at index j,
+ *     which equals algnlen when no column is dropped (heap overflow);
+ *   - the bit masks and the compacted word are long long, matching algn[];
+ *     with int they were truncated for alignments of more than 31 sequences;
+ *   - nextalign and dirty are initialised, as the other constructors do;
+ *   - allocations are checked, a letter missing from `alpha` no longer
+ *     produces a garbage counter index, and the terminating column word is
+ *     set to 0.
+ */
+align* removeSeq(align* ali, int seqnum) {
+    const int unknown = 5;  /* index of 'N' in alpha: not counted */
+    int i,j, k, n, p, flag = 0;
+    long long int bit = (1LL << seqnum);
+    long long int mask = bit - 1, resint;
+    char *hit;
+    align* res = (align*) malloc(sizeof(align));
+
+    assert (res);
+    res->score = 0;
+    res->nextalign = 0;
+    res->dirty = 0;
+    res->numseq = ali->numseq-1;
+    for (i=0; i< seqnum; i++)
+        res->seqs[i] = ali->seqs[i];
+    for (i++; i< ali->numseq; i++)
+        res->seqs[i-1] = ali->seqs[i];
+
+    res->algn = (long long int*) malloc((ali->algnlen+1) * sizeof(long long int));
+    assert (res->algn);
+    for (j=0; j<CNTS_LEN; j++) {
+        res->cnts[j] = (char*) malloc((ali->algnlen+1) * sizeof(char));
+        assert (res->cnts[j]);
+    }
+
+    for (i=0, j=0, n=0; i < ali->algnlen; i++) {
+        /* drop bit `seqnum` and shift the higher bits down by one */
+        resint = (ali->algn[i] & mask) | ((ali->algn[i] & ~(mask|bit)) >> 1);
+        if (resint) {
+            for (k=0; k<CNTS_LEN; k++)
+                res->cnts[k][j] = ali->cnts[k][i];
+            res->algn[j] = resint;
+            if (!isGap(ali, seqnum, i)) {
+                hit = strchr(alpha,ali->seqs[seqnum]->lets[n]);
+                k = hit ? (int)(hit - alpha) : unknown;
+                if (k<5)
+                    res->cnts[k][j]--;
+                if (i && isGap(ali, seqnum, i-1))
+                    res->cnts[CNTS_GE][j]--;
+                n++;
+            }
+            else {
+                if (i && isGap(ali, seqnum, i-1))
+                    res->cnts[CNTS_GC][j]--;
+                else
+                    res->cnts[CNTS_GS][j]--;
+            }
+            if (flag) {
+                /* a column was dropped earlier: recount the gap events */
+                res->cnts[CNTS_GS][j] = 0;
+                res->cnts[CNTS_GC][j] = 0;
+                res->cnts[CNTS_GE][j] = 0;
+                for (p = 0; p < res->numseq; p++) {
+                    if (j<=1 || isGap(res, p, j-1)) {
+                        if (!isGap(res, p, j))
+                            res->cnts[CNTS_GE][j]++;
+                        else
+                            res->cnts[CNTS_GC][j]++;
+                    }
+                    else {
+                        if (j && isGap(res, p, j))
+                            res->cnts[CNTS_GS][j]++;
+                    }
+                }
+            }
+            j++;
+        }
+        else { n++; flag = 1;}   /* column held only the removed sequence */
+    }
+
+    res->algnlen = j;
+
+    /* terminating column */
+    for (i=0; i<CNTS_LEN; i++)
+        res->cnts[i][j] = 0;
+    res->algn[j] = 0;
+
+    // printf("%d squished to %d\n", ali->algnlen, res->algnlen);
+    return res;
+}
+
+
+/*
+ * Returns a new alignment equal to `ali` without the first sequence whose
+ * name equals `name` (see removeSeq), or 0 when no sequence has that name.
+ *
+ * Fixes relative to the original: the function had no return statement, so
+ * the alignment built by removeSeq was discarded and the caller received an
+ * indeterminate pointer; the name search was also unbounded and ran past
+ * the sequence list when the name was absent.
+ */
+align* removeSeqByName(align* ali, char *name) {
+    int i;
+
+    for (i = 0; i < ali->numseq; i++) {
+        if (!strcmp(ali->seqs[i]->name, name))
+            return removeSeq(ali, i);
+    }
+    return 0;
+}
+
+/*
+ * Returns the index of `trgt` in ali->seqs[].
+ *
+ * The search is now limited to the first ali->numseq entries; the original
+ * kept reading past the end of the list when `trgt` was not part of the
+ * alignment.  A missing target trips an assert (and yields -1 when asserts
+ * are compiled out).  The unused local was removed.
+ */
+int getSeqNum(align* ali, seq* trgt) {
+    int i;
+
+    for (i = 0; i < ali->numseq; i++) {
+        if (ali->seqs[i] == trgt)
+            return i;
+    }
+    assert (0);
+    return -1;
+}
+
+
+/*
+ * Exchanges the seq1 and seq2 coordinates (start and end) of every anchor
+ * in the list, in place.
+ */
+void swapHLL(hll* h1) {
+    hll *node;
+    int oldstart, oldend;
+
+    for (node = h1; node; node = node->next) {
+        oldstart = node->seq1start;
+        oldend = node->seq1end;
+        node->seq1start = node->seq2start;
+        node->seq1end = node->seq2end;
+        node->seq2start = oldstart;
+        node->seq2end = oldend;
+    }
+}
+
+
+/*
+ * Counts the columns 0..algnlen-1 of `aln` in which sequence `seqnum` has a
+ * letter, i.e. is not a gap.
+ */
+int countpos (align* aln, int seqnum){
+    int col, letters = 0;
+
+    for (col = aln->algnlen - 1; col >= 0; col--)
+        letters += isGap (aln, seqnum, col) ? 0 : 1;
+    return letters;
+}
+
+hll* remapHLLs(hll* anchs, int which, align* aln, int seqnum) { /* Rewrites the
+ * coordinates of one side of every anchor in `anchs` (seq1start/seq1end when
+ * `which` is 0, seq2start/seq2end otherwise) from letter positions in
+ * sequence `seqnum` to column indices of `aln`.  The columns are walked
+ * while counting the letters of `seqnum` (mybp); whenever the count reaches
+ * the value being searched for, that field is overwritten with the column
+ * index.  When a gap column falls inside an anchor, the anchor is split
+ * there: score and the other side's length are divided in proportion to the
+ * part already covered; a covered part scoring below ANCHOR_SCORE_CUTOFF is
+ * cropped instead of split.  Finished anchors shorter than
+ * ANCHOR_LENGTH_CUTOFF on the remapped side are removed.  Nodes are freed
+ * and allocated as needed; the (possibly new) list head is returned.
+ * Returns `anchs` unchanged when it is null.
+ * NOTE(review): the column loop runs i = 1 .. algnlen inclusive and calls
+ * isGap(aln, seqnum, i), so column 0 is never examined and the terminating
+ * entry algn[algnlen] is -- confirm that this 1-based convention is
+ * intended. */
+ int mybp, i, *searchint, stmybp, mylen, olen, osize;
+ hll *wlist = anchs, *temp, *prev;
+ float scale;
+ char isfrst=1;
+
+ // fprintf (stderr, "which=%d\n", which);
+ //
+ // fprintf (stderr, "This is a list of the entries before going into remapHLLs:\n");
+ // printHLL (anchs);
+
+ if (!anchs)
+ return anchs;
+
+ mylen = countpos (aln, seqnum);
+ // olen = countpos (aln, !seqnum);
+
+ // fprintf (stderr, "Here is some information about the alignment:\n");
+ // fprintf (stderr, " alignment length = %d\n", aln->algnlen);
+ // fprintf (stderr, " number of positions in sequence to remap = %d\n", mylen);
+ // fprintf (stderr, " number of positions in other sequence = %d\n", olen);
+
+ prev = NULL;
+ for (temp = wlist; temp; temp = temp->next){ // clamp starts to 1 and the remapped side's end to mylen, then sanity-check
+ if (temp->seq1start < 1) temp->seq1start = 1;
+ if (temp->seq2start < 1) temp->seq2start = 1;
+ if (!which && temp->seq1end > mylen) temp->seq1end = mylen;
+ else if (which && temp->seq2end > mylen) temp->seq2end = mylen;
+
+ if (temp->seq1start > temp->seq1end) {
+ fprintf(stderr, "1 (%d %d)(%d %d)", temp->seq1start, temp->seq1end, temp->seq2start, temp->seq2end);
+ assert(0);
+ }
+
+ if (temp->seq2start > temp->seq2end) {
+ fprintf(stderr, "2 (%d %d)(%d %d)", temp->seq1start, temp->seq1end, temp->seq2start, temp->seq2end);
+ assert(0);
+ }
+ }
+
+ wlist = (hll*)malloc(sizeof(hll)); assert (wlist); // dummy head node so that prev->next is always valid
+ wlist->next = anchs;
+ prev = wlist;
+
+ mybp = stmybp = 0;
+ searchint = (!which)?&(anchs->seq1start):&(anchs->seq2start); // field whose value is searched for next
+
+ for (i=1; i<=aln->algnlen; i++) {
+ if (isGap(aln,seqnum,i)){
+ if (isfrst) continue; // gap outside an anchor: nothing to do
+
+ scale = (!which) ?
+ ((anchs->seq1end == stmybp) ? 0 : (float)(mybp - stmybp) / (float)(anchs->seq1end - stmybp)) :
+ ((anchs->seq2end == stmybp) ? 0 : (float)(mybp - stmybp) / (float)(anchs->seq2end - stmybp));
+ osize = (!which) ?
+ (int)((anchs->seq2end - anchs->seq2start) * scale) :
+ (int)((anchs->seq1end - anchs->seq1start) * scale);
+ assert (osize >= 0);
+
+ if (//mybp - stmybp < ANCHOR_LENGTH_CUTOFF || osize < ANCHOR_LENGTH_CUTOFF ||
+ anchs->score * scale < ANCHOR_SCORE_CUTOFF){
+
+ // fprintf (stderr, "1. The region from %d to %d was cropped.\n", stmybp, mybp);
+
+ if (!which){
+ anchs->score -= anchs->score * scale;
+ anchs->seq1start = mybp+1;
+ anchs->seq2start = anchs->seq2start + osize + 1;
+ isfrst = 1;
+ searchint = &(anchs->seq1start);
+ }
+ else {
+ anchs->score -= anchs->score * scale;
+ anchs->seq1start = anchs->seq1start + osize + 1;
+ anchs->seq2start = mybp+1;
+ isfrst = 1;
+ searchint = &(anchs->seq2start);
+ }
+
+ if (anchs->seq1start >= anchs->seq1end || anchs->seq2start >= anchs->seq2end){ // nothing left of this anchor: unlink and free it
+ // fprintf (stderr, "6. The region from %d to %d was thrown away.\n", stmybp, mybp);
+ temp = anchs;
+ prev->next = anchs->next;
+ anchs = anchs->next;
+ free (temp);
+ if (!anchs) break;
+ searchint = (!which)?&(anchs->seq1start):&(anchs->seq2start);
+ }
+ continue;
+ }
+
+ temp = (hll*) malloc(sizeof(hll)); assert (temp); // split: temp becomes the part after the gap
+ temp->next = anchs->next;
+ anchs->next = temp;
+ temp->seq1end = anchs->seq1end;
+ temp->seq2end = anchs->seq2end;
+
+
+ // fprintf (stderr, "2. A new region from %d to %d was created.\n", stmybp, mybp);
+ //fprintf (stderr, "Currently looking at (%d %d)=(%d %d)\n", anchs->seq1start, anchs->seq1end, anchs->seq2start, anchs->seq2end);
+
+
+ if (!which){
+ temp->score = anchs->score * scale;
+ anchs->score -= temp->score;
+ anchs->seq1end = i;
+ anchs->seq2end = anchs->seq2start + osize;
+ temp->seq1start = mybp+1;
+ temp->seq2start = anchs->seq2end + 1;
+ isfrst = 1;
+ searchint=&(temp->seq1start);
+ }
+ else {
+ temp->score = anchs->score * scale;
+ anchs->score -= temp->score;
+ anchs->seq1end = anchs->seq1start + osize;
+ anchs->seq2end = i;
+ temp->seq1start = anchs->seq1end + 1;
+ temp->seq2start = mybp+1;
+ isfrst = 1;
+ searchint=&(temp->seq2start);
+ }
+ assert (anchs->seq1start <= anchs->seq1end);
+ assert (anchs->seq2start <= anchs->seq2end);
+ prev = anchs;
+ anchs = temp;
+
+ if (anchs->seq1start >= anchs->seq1end || anchs->seq2start >= anchs->seq2end){
+ // fprintf (stderr, "5. The region from %d to %d was thrown away.\n", stmybp, mybp);
+ temp = anchs;
+ prev->next = anchs->next;
+ anchs = anchs->next;
+ free (temp);
+ if (!anchs) break;
+ searchint = (!which)?&(anchs->seq1start):&(anchs->seq2start);
+ }
+
+ // fprintf (stderr, "Now, I am looking for %d, isfrst=%d (%d %d).\n", *searchint, isfrst, temp->seq1start, temp->seq1end);
+ // fprintf (stderr, "Currently, we are position %d in the sequence.\n", mybp);
+ continue;
+ }
+ mybp++; // one more letter of seqnum consumed
+ if (mybp==*searchint){
+ if (isfrst) { // reached the anchor's start: store the column and look for its end
+ *searchint = i;
+ searchint = (!which)?&(anchs->seq1end):&(anchs->seq2end);
+ stmybp = mybp;
+ isfrst = !isfrst;
+ // fprintf (stderr, "2) Now, I am looking for %d, isfrst=%d.\n", *searchint, isfrst);
+ // fprintf (stderr, "Currently, we are position %d in the sequence.\n", mybp);
+ }
+ }
+ if (mybp==*searchint){ // tested again: start and end can be the same position
+ if (!isfrst){
+ *searchint = i;
+
+ assert (anchs->seq1start <= anchs->seq1end);
+ assert (anchs->seq2start <= anchs->seq2end);
+
+ if (which == 0 && anchs->seq1end - anchs->seq1start < ANCHOR_LENGTH_CUTOFF ||
+ which == 1 && anchs->seq2end - anchs->seq2start < ANCHOR_LENGTH_CUTOFF){
+ // fprintf (stderr, "4. The region from %d to %d was thrown away.\n", stmybp, mybp);
+ temp = anchs;
+ prev->next = anchs->next;
+ anchs = anchs->next;
+ free (temp);
+ }
+ else {
+ // fprintf (stderr, "3. The region from %d to %d was saved.\n", stmybp, mybp);
+ prev = anchs;
+ anchs = anchs->next;
+ }
+ if (!anchs)
+ break;
+ searchint = (!which)?&(anchs->seq1start):&(anchs->seq2start);
+
+ isfrst = !isfrst;
+ // fprintf (stderr, "Now, I am looking for %d, isfrst=%d.\n", *searchint, isfrst);
+ // fprintf (stderr, "Currently, we are position %d in the sequence.\n", mybp);
+ }
+ }
+ }
+
+ // fprintf (stderr, "By the end, I have reached mybp=%d, stmybp=%d.\n", mybp, stmybp);
+ // fprintf (stderr, " number of positions in sequence to remap = %d\n", mylen);
+ // fprintf (stderr, " number of positions in other sequence = %d\n", olen);
+
+ temp = wlist; // drop the dummy head
+ wlist = wlist->next;
+ free (temp);
+
+ for (temp = wlist; temp; temp = temp->next){
+ // fprintf (stderr, "(%d %d)=(%d %d) %f\n", temp->seq1start, temp->seq1end, temp->seq2start, temp->seq2end, temp->score);
+ assert (temp->seq1start <= temp->seq1end);
+ assert (temp->seq2start <= temp->seq2end);
+ assert (temp->seq1start >= 0);
+ assert (temp->seq2start >= 0);
+ assert (temp->seq1end >= 0);
+ assert (temp->seq2end >= 0);
+ }
+
+ return wlist;
+}
+
+
+int hllIntersection(hll *h1, hll *h2) {
+  /* Size of the overlap between two hits: the overlap length is measured
+   * separately on sequence 1 and on sequence 2 and the smaller of the two
+   * is returned.  A missing hit, or hits that do not overlap on one of
+   * the sequences, give 0. */
+  int lo, hi;
+  int common1 = 0, common2 = 0;
+
+  if (h1 == 0 || h2 == 0)
+    return 0;
+
+  /* overlap on sequence 1 */
+  lo = MAX2(h1->seq1start, h2->seq1start);
+  hi = MIN2(h1->seq1end, h2->seq1end);
+  if (lo < hi)
+    common1 = hi - lo;
+
+  /* overlap on sequence 2 */
+  lo = MAX2(h1->seq2start, h2->seq2start);
+  hi = MIN2(h1->seq2end, h2->seq2end);
+  if (lo < hi)
+    common2 = hi - lo;
+
+  return MIN2(common1, common2);
+}
+
+int hllUnion(hll *h1, hll *h2) {
+  /* Size of the region spanned by two hits: the span is measured on each
+   * sequence from the smaller start to the larger end, and the larger of
+   * the two spans is returned.  With a single hit its own larger extent is
+   * returned; with no hit at all the result is 0. */
+  int lo, hi;
+  int span1, span2;
+  hll *only;
+
+  if (!h1 || !h2) {
+    only = h1 ? h1 : h2;
+    if (!only)
+      return 0;
+    return MAX2(only->seq1end - only->seq1start,
+                only->seq2end - only->seq2start);
+  }
+
+  lo = MIN2(h1->seq1start, h2->seq1start);
+  hi = MAX2(h1->seq1end, h2->seq1end);
+  span1 = (lo < hi) ? hi - lo : 0;
+
+  lo = MIN2(h1->seq2start, h2->seq2start);
+  hi = MAX2(h1->seq2end, h2->seq2end);
+  span2 = (lo < hi) ? hi - lo : 0;
+
+  return MAX2(span1, span2);
+}
+
+
+hll* hllJoin(hll *h1, hll *h2, int score) {
+  /* Allocate a new hit covering both h1 and h2 on each sequence (smallest
+   * start to largest end) and give it the supplied score.  The node is
+   * returned unlinked (next == 0); the caller owns it and must free it.
+   * Both h1 and h2 must be non-null. */
+  hll *res = malloc (sizeof(hll));
+  assert (res);
+
+  res->seq1start = MIN2(h1->seq1start, h2->seq1start);
+  res->seq1end = MAX2(h1->seq1end, h2->seq1end);
+
+  res->seq2start = MIN2(h1->seq2start, h2->seq2start);
+  res->seq2end = MAX2(h1->seq2end, h2->seq2end);
+  res->score = score;
+  /* never hand out a node whose link field is uninitialised */
+  res->next = 0;
+
+  return res;
+}
+
+
+int minHLL(hll *h1, hll *h2){
+  /* Non-zero when h2 ends on sequence 1 no later than h1 does (ties go
+   * to h2); zero when h1 ends strictly earlier. */
+  return (h2->seq1end <= h1->seq1end);
+}
+
+
+float scoreMerge(hll* h1, hll *h2) {
+  /* Score of merging two hits: the sum of both scores scaled by
+   * intersection/union (see hllIntersection and hllUnion).  Both hits
+   * must be non-null.  An empty union gives 0 rather than NaN. */
+  float i, u;
+  i = hllIntersection(h1, h2);
+  u = hllUnion(h1, h2);
+
+  if (u <= 0)
+    return 0;
+
+  return (h1->score + h2->score)*(i/u);
+}
+
+
+void printSeqsNames(align *a) {
+  /* Print the names of all sequences of the alignment on one line of
+   * stdout, wrapped in parentheses and separated by single spaces. */
+  int idx = 0;
+
+  printf("( ");
+  while (idx < a->numseq) {
+    printf("%s ", a->seqs[idx]->name);
+    idx++;
+  }
+  printf(")\n");
+}
+
+
+void printMyHLL(hll *myres) { // Debug helper for dumping a hit list; the whole body is commented out, so calling it currently does nothing.
+ /*
+ while(myres) {
+
+ printf("***: (%d %d)=(%d %d)\n",
+ myres->seq1start, myres->seq1end,
+ myres->seq2start, myres->seq2end);
+
+ myres=myres->next;
+ }
+ */
+}
+
+hll* mergeHLLs(hll* anchs1, int wh1, hll* anchs2, int wh2) {
+  /* Merge two hit lists into one.  A list whose wh flag is set is first
+   * passed through swapHLL.  The lists are then consumed front to back:
+   * at each step the head that minHLL selects is moved onto the front of
+   * the result, and whenever the two heads intersect and the merged score
+   * is at least as large as both individual scores a joined hit is pushed
+   * as well.  Because nodes are pushed on the front, the result is in the
+   * reverse of consumption order.  The input nodes are relinked, not
+   * copied.  Passing the same list twice returns it unchanged (after any
+   * requested swaps). */
+  int mscore;
+  hll *res = 0, *temp;
+  hll **from;
+
+  if (wh1) swapHLL(anchs1);
+  if (wh2) swapHLL(anchs2);
+
+  if (anchs1 == anchs2)
+    return anchs1;
+
+  while (anchs1 && anchs2) {
+    if (hllIntersection(anchs1, anchs2)) {
+      mscore = scoreMerge(anchs1, anchs2);
+      if (MAX3(anchs1->score, anchs2->score, mscore) == mscore) {
+        temp = hllJoin(anchs1, anchs2, mscore);
+        temp->next = res;
+        res = temp;
+      }
+    }
+
+    /* pop the selected head and push it onto the result */
+    from = minHLL(anchs1, anchs2) ? &anchs2 : &anchs1;
+    temp = (*from)->next;
+    (*from)->next = res;
+    res = *from;
+    *from = temp;
+  }
+
+  /* at most one of the lists still has nodes; push them all */
+  for (; anchs1; anchs1 = temp) {
+    temp = anchs1->next;
+    anchs1->next = res;
+    res = anchs1;
+  }
+  for (; anchs2; anchs2 = temp) {
+    temp = anchs2->next;
+    anchs2->next = res;
+    res = anchs2;
+  }
+  return res;
+}
+
+int printTextAlign(FILE* outfile, align* myalign) {
+  /* Write the alignment as text in blocks of 60 columns.  Each block has
+   * one row per sequence (letter, or '-' where the sequence has a gap in
+   * that column) followed by one row of digits (value modulo 10) for each
+   * cnts[] array from index 4 up to CNTS_LEN-1.  Columns are numbered from
+   * 1, as are the letters of each sequence.  A null outfile means stdout.
+   * Returns 0. */
+  int c, k, i;
+  int* inds = (int*) malloc (sizeof(int)* myalign->numseq);
+  assert (inds || myalign->numseq <= 0);
+  if (!outfile)
+    outfile = stdout;
+
+  /* inds[i] is the index of the next unprinted letter of sequence i */
+  for (i=0; i< myalign->numseq; i++) {
+    inds[i] = 1;
+  }
+
+  // fprintf(outfile, "ALIGNMENT LENGTH=%d\n\n", myalign->algnlen);
+
+  for (c = 1; c < myalign->algnlen; c = c + 60) {
+
+    for (i=0; i< myalign->numseq; i++) {
+      for (k = c; (k < (c + 60)) && (k < myalign->algnlen); k++) {
+        /* algn[] holds long long masks and there may be up to MAX_SEQ (63)
+         * sequences, so the bit test must be done in 64-bit arithmetic */
+        if (myalign->algn[k] & (1LL<<i))
+          fprintf(outfile, "%c", myalign->seqs[i]->lets[inds[i]++]);
+        else
+          fprintf(outfile,"-");
+      }
+      fprintf(outfile,"\n");
+    }
+
+    for (i=4; i < CNTS_LEN; i++) {
+      for (k = c; (k < (c + 60)) && (k < myalign->algnlen); k++) {
+        fprintf(outfile, "%d", myalign->cnts[i][k] % 10 );
+      }
+      fprintf(outfile,"\n");
+    }
+
+    /*
+    fprintf(outfile,"\n");
+    for (k=c;(k < (c + 60)) && (k < myalign->algnlen); k++) {
+      fprintf(outfile, "%d", k/100);
+    }
+    fprintf(outfile,"\n");
+    for (k=c;(k < (c + 60)) && (k < myalign->algnlen); k++) {
+      fprintf(outfile, "%d", (k/10)%10);
+    }
+    fprintf(outfile,"\n");
+    for (k=c;(k < (c + 60)) && (k < myalign->algnlen); k++) {
+      fprintf(outfile, "%d", k%10);
+    }
+    fprintf(outfile,"\n");
+    */
+
+    fprintf(outfile,"\n\n");
+  }
+
+  fprintf(outfile,"\n");
+  free(inds);
+  return 0;
+}
+
+int printFASTAAlign(FILE* outfile, align* myalign) {
+  /* Write the alignment in multi-FASTA form: for every sequence a
+   * ">name" header followed by its aligned row ('-' for gaps) wrapped at
+   * 60 columns.  Columns and letters are numbered from 1.  A null outfile
+   * means stdout.  Returns 0. */
+  int c, k, i;
+  int* inds = (int*) malloc (sizeof(int)* myalign->numseq);
+  assert (inds || myalign->numseq <= 0);
+  if (!outfile)
+    outfile = stdout;
+
+  /* inds[i] is the index of the next unprinted letter of sequence i */
+  for (i=0; i< myalign->numseq; i++) {
+    inds[i] = 1;
+  }
+
+  for (i=0; i< myalign->numseq; i++) {
+    fprintf(outfile, ">%s\n", myalign->seqs[i]->name);
+    for (c = 1; c < myalign->algnlen; c = c + 60) {
+      for (k = c; (k < (c + 60)) && (k < myalign->algnlen); k++) {
+        /* 64-bit shift: algn[] is long long and i may exceed 30 */
+        if (myalign->algn[k] & (1LL<<i))
+          fprintf(outfile, "%c", myalign->seqs[i]->lets[inds[i]++]);
+        else
+          fprintf(outfile,"-");
+      }
+      fprintf(outfile,"\n");
+    }
+  }
+  fprintf(outfile,"\n");
+
+  free (inds);
+  return 0;
+}
+
+int printXMFAAlign(FILE* outfile, align* myalign) {
+  /* Write the alignment in XMFA form: for every sequence a header
+   * ">index:leftbound-(rightbound-1) + name", then its aligned row ('-'
+   * for gaps) wrapped at 60 columns, then an empty line.  Columns and
+   * letters are numbered from 1.  A null outfile means stdout.
+   * Returns 0. */
+  int c, k, i;
+  int* inds = (int*) malloc (sizeof(int)* myalign->numseq);
+  assert (inds || myalign->numseq <= 0);
+  if (!outfile)
+    outfile = stdout;
+
+  /* inds[i] is the index of the next unprinted letter of sequence i */
+  for (i=0; i< myalign->numseq; i++) {
+    inds[i] = 1;
+  }
+
+  for (i=0; i< myalign->numseq; i++) {
+    fprintf(outfile, ">%d:%d-%d + %s\n", myalign->seqs[i]->index, myalign->seqs[i]->leftbound,
+            myalign->seqs[i]->rightbound-1, myalign->seqs[i]->name);
+    for (c = 1; c < myalign->algnlen; c = c + 60) {
+      for (k = c; (k < (c + 60)) && (k < myalign->algnlen); k++) {
+        /* 64-bit shift: algn[] is long long and i may exceed 30 */
+        if (myalign->algn[k] & (1LL<<i))
+          fprintf(outfile, "%c", myalign->seqs[i]->lets[inds[i]++]);
+        else
+          fprintf(outfile,"-");
+      }
+      fprintf(outfile,"\n");
+    }
+    fprintf(outfile,"\n");
+  }
+
+  free (inds);
+  return 0;
+}
+
+
+
+
+void freeHLLs(hll *myHLL) {
+  /* Free every node of a hit list; a null list is accepted. */
+  hll *following;
+
+  for (; myHLL; myHLL = following) {
+    following = myHLL->next;   /* read the link before the node goes away */
+    free (myHLL);
+  }
+}
+
+
+void freeSequence(seq *mySeq) { // Release a sequence record: its rptr buffer, its name and the struct itself.
+ free(mySeq->rptr);
+ free(mySeq->name);
+ // NOTE(review): this comment used to say rptr must not be freed, contradicting the free() above; lets is the field left alone here, presumably because it points into rptr's buffer - confirm against the reader that fills the struct
+ // filename is not allocated, do not free
+ free(mySeq);
+}
+
+void freeAlign(align *myAlign) { // Release one alignment node (mask array, cnts arrays, struct) and drop its reference on the next node in the chain.
+ int i;
+ // if (freed[myAlign->num]) {
+ // fprintf (stderr, "Something very wrong... %d/%d", myAlign->num, freedsize);
+ // }
+ assert (myAlign->dirty != 23); // 23 is stored in dirty below just before the node is freed, so seeing it here means the node was already released
+
+ if (myAlign->nextalign) {
+ myAlign->nextalign->dirty--; // dirty is used as a reference count on the successor
+ if (!myAlign->nextalign->dirty){
+ freeAlign(myAlign->nextalign); // count reached zero: release the successor recursively
+ }
+ }
+ myAlign->nextalign = 0;
+ myAlign->dirty = 23; // mark this node as freed (see the assert above)
+
+ if (myAlign->algn){
+ free(myAlign->algn);
+ myAlign->algn = (long long int *) 0;
+ }
+
+ for (i=0; i<CNTS_LEN; i++) {
+ if (myAlign->cnts[i]){
+ free(myAlign->cnts[i]);
+ myAlign->cnts[i] = (char *) 0;
+ }
+ }
+
+ // sequences not freed
+ // HLLs not freed
+ if (freed) // freed is defined outside this function; when it is set, the slot for this alignment's num is flagged
+ freed[myAlign->num] = 1;
+ free(myAlign);
+}
+
+/*
+void setScores(int gapstartV, int gapcontV, int gapendV, int gapperseqV, int overlapV, int glwidthV) {
+ gapstart = gapstartV;
+ gapcont = gapcontV;
+ gapend = gapendV;
+ gapperseq = gapperseqV;
+ overlap = overlapV;
+ glwidth = glwidthV;
+ }*/
+
+
+
diff --git a/src/multial.h b/src/multial.h
new file mode 100644
index 0000000..48e7058
--- /dev/null
+++ b/src/multial.h
@@ -0,0 +1,125 @@
+#ifndef __MULTIAL_H
+#define __MULTIAL_H
+
+
+#include <stdio.h>
+
+#define NUC_FILE "nucmatrix.txt"
+#define NUC_FILE_SIZE 6
+
+#define MAX_SEQ 63
+#define CNTS_LEN 8
+#define CNTS_A 0
+#define CNTS_T 1
+#define CNTS_C 2
+#define CNTS_G 3
+#define CNTS_CB 4
+#define CNTS_GS 5
+#define CNTS_GC 6
+#define CNTS_GE 7
+
+
+typedef struct HitLocationList { // One hit/anchor: a range on sequence 1 paired with a range on sequence 2; nodes are chained through next.
+ int seq1start; // start of the range on sequence 1
+ int seq2start; // start of the range on sequence 2
+ int seq1end; // end of the range on sequence 1 (the list code asserts seq1start <= seq1end)
+ int seq2end; // end of the range on sequence 2 (the list code asserts seq2start <= seq2end)
+ float score; // score of the hit
+ struct HitLocationList *next; // next node of the list, 0 at the end
+ struct HitLocationList *bkptr; // presumably a back pointer used while chaining hits - not used in the code visible here, confirm
+ float scoreSoFar; // presumably the accumulated score of a chain ending at this hit - confirm against the chaining code
+ char dirty; // NOTE(review): meaning not evident from this header
+} hll;
+
+typedef struct hllpointer { // Sortable reference to a hit; prolagan.c's hptrcomp orders these by number and, on equal numbers, by isstart.
+ int number; // primary sort key
+ char isstart; // non-zero entries sort before zero entries with the same number; presumably marks the start (vs. end) of the hit - confirm
+ hll* myhll; // the hit this entry refers to
+} hptr;
+
+typedef struct Sequence { // One input sequence.
+ char* lets; // the letters; the alignment printers start reading at index 1
+ int numlets, numsiglets; // numlets: number of letters; numsiglets: presumably the number of significant letters - confirm
+ char* name; // sequence name, freed by freeSequence
+ char* rptr; // freed by freeSequence; presumably the base of the buffer lets points into - confirm
+ char* filename; // not freed by freeSequence (its comment says it is not allocated)
+ int leftbound, rightbound; // printed in the XMFA header as leftbound-(rightbound-1)
+ int index; // sequence number printed in the XMFA header
+} seq;
+
+typedef struct align_res { // A multiple alignment of up to MAX_SEQ sequences.
+ int num; // slot of this alignment in the freed[] bookkeeping array used by freeAlign
+ int index;
+ int score;
+ int algnlen; // number of entries in algn; the printers walk columns 1 .. algnlen-1
+ int numseq; // number of valid entries in seqs
+ seq* seqs[MAX_SEQ]; // the aligned sequences (not freed by freeAlign)
+ long long int* algn; // one bit mask per column: bit i set means sequence i has a letter there, clear means a gap
+ char* cnts[CNTS_LEN]; // per-column count arrays, indexed by the CNTS_* constants
+ hll* hlls[MAX_SEQ]; // hit lists (not freed by freeAlign); presumably one per sequence - confirm
+ int dirty; // used by freeAlign as a reference count from predecessors; the value 23 marks a freed node
+ struct align_res* nextalign; // successor in a chain of alignments, or 0
+} align;
+
+
+seq* mkConsensus(align* ali);
+align* mkSimAlign(seq* seq1);
+align* makeAlign(align* ali1, align* ali2, hll* anchors, align **uni);
+align* removeSeq(align* ali, int seqnum);
+void swapHLL(hll* arg);
+hll* remapHLLs(hll* anchs, int which, align* aln, int seqnum);
+hll* mergeHLLs(hll* anchs1, int wh1, hll* anchs2, int wh2);
+hll* getAnchsFromAlign(align* current, int seqnum, int cutoff);
+int getSeqNum(align* ali, seq* trgt);
+int printTextAlign(FILE *, align* myalign);
+int printFASTAAlign(FILE *, align* myalign);
+void printSeqsNames(align *a);
+void buildcache();
+
+void freeHLLs(hll *myHLL);
+void freeSequence(seq *mySequence);
+void freeAlign(align *myAlign);
+
+void setScores(int gapperseqV, int overlapV, int glwidthV);
+
+extern char* alpha;
+
+extern int s1start;
+extern int s1end;
+extern int s2start;
+extern int s2end;
+//int match;
+//int mismatch;
+extern int gapstart;
+extern int gapend;
+extern int gapcont;
+extern int gapperseq;
+extern int overlap;
+extern int glwidth;
+extern char dobin;
+extern char* nucmatrixfile;
+
+extern float factor, offset;
+extern int logs[MAX_SEQ*MAX_SEQ];
+
+extern FILE* outfile;
+
+#endif
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/order.c b/src/order.c
new file mode 100644
index 0000000..da12f73
--- /dev/null
+++ b/src/order.c
@@ -0,0 +1,842 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <assert.h>
+#include "diagmatrix.h"
+#include "filebuffer.h"
+
+#define NUC_FILE "nucmatrix.txt"
+#define NUC_FILE_SIZE 6
+
+#define MAX_SQ_SIZE (500 * (1 << 20))
+#define BIG_SQ_WIDTH 20
+
+#define VER_NUM "1.1"
+
+#define INSERTION 2
+#define DELETION 3
+
+#define ISCB(c) ((c)=='.')
+
+#define MIN2(x,y) ( (x) >= (y) ? (y) : (x) )
+#define MAX2(x,y) ( (x) >= (y) ? (x) : (y) )
+#define MAX3(x,y,z) MAX2(MAX2(x,y),z)
+
+#define WEQ2(x,y,a) ((x==a)? 0: (y==a)? 1:-1)
+#define WEQ3(x,y,z,a) ((x==a)? 0: (y==a)? 1: (z==a)? 2:-1)
+
+align* makeAlign(dmat* mydm, char* seq1, char* seq2);
+
+
+char* alpha = "ATCGN.";
+
+int s1start = 0;
+int s1end = 0;
+int s2start = 0;
+int s2end = 0;
+int gapstart = -1500;
+int gapcont = -50;
+//int match =12;
+//int mismatch = -8;
+int overlap = 0;
+int glwidth= 15;
+char dobin = 0;
+char domfa = 0;
+char doxmfa = 0;
+FILE* ancfile = 0;
+FILE* outfile;
+
+int substmatrix[256][256];
+
+
+seq* readfile(FILE* input, int seqnum) {
+  /* Read one FASTA record from input.
+   *
+   * The header line must start with '>'; the rest of it (without the
+   * newline) becomes the sequence name.  Letters are upper-cased, white
+   * space is dropped, and letters outside the global alphabet `alpha` are
+   * reported and stored as 'N'.  Letters are stored from index 1 (index 0
+   * holds a 0 byte).  Reading stops before the next '>' or at end of file.
+   *
+   * seqnum 1 uses the globals s1start/s1end, any other value s2start/s2end:
+   * a positive start selects that sub-range, otherwise the globals are set
+   * to 1 and the number of stored positions.
+   *
+   * Returns a newly allocated seq (rptr is the base of the letter buffer,
+   * lets points into it), or 0 when there is nothing left to read.  Exits
+   * if the input is not FASTA or the requested sub-range is out of bounds. */
+  char *res, *grown, *nl;
+  int ressize = 2, numread = 1;
+  char temp[256];
+  seq* myseq;
+  int currchar;                 /* int, so that EOF is distinguishable */
+  int *rstart, *rend;
+
+  if (feof(input))
+    return 0;
+  if (!fgets(temp, 255, input))
+    return 0;
+  if (temp[0] != '>') {
+    fprintf(stderr, "File is not in FASTA format!!\n");
+    exit(1);
+  }
+
+  myseq = (seq*) malloc(sizeof(seq)); assert (myseq);
+  res = (char*) malloc(sizeof(char)*ressize); assert (res);
+  /* strlen(temp) bytes hold temp+1 plus its terminator */
+  myseq->name = (char*) malloc((strlen(temp))*sizeof(char)); assert (myseq->name);
+  strcpy(myseq->name, temp+1);
+  nl = strchr(myseq->name, '\n');
+  if (nl) {
+    *nl = 0;
+  }
+  else {
+    /* header longer than the buffer: skip its tail so that it is not
+     * mistaken for sequence data */
+    while ((currchar = fgetc(input)) != EOF && currchar != '\n')
+      ;
+  }
+
+  res[0] = 0;
+  currchar = fgetc(input);
+  while ((currchar != '>') && (currchar != EOF)) {
+    if (!isspace(currchar)) {
+      currchar = toupper(currchar);
+      if (!strchr(alpha, currchar)) {
+        fprintf(stderr, "WARNING %c converted to 'N'\n", currchar);
+        currchar = 'N';
+      }
+      res[numread++] = (char) currchar;
+      if (numread >= ressize) {
+        grown = (char*) realloc(res, sizeof(char)*(ressize*=2));
+        assert (grown);
+        res = grown;
+      }
+    }
+    currchar = fgetc(input);
+  }
+  if (currchar == '>')
+    ungetc(currchar, input);
+  res[numread]=0;
+  myseq->rptr = res;
+
+  rstart = (seqnum == 1) ? &s1start : &s2start;
+  rend = (seqnum == 1) ? &s1end : &s2end;
+  if (*rstart > 0) {
+    /* the terminator below is written at original index *rend, which must
+     * lie inside what was read */
+    if (*rend < *rstart || *rend > numread) {
+      fprintf(stderr, "requested range %d-%d is outside the sequence\n", *rstart, *rend);
+      exit(1);
+    }
+    res = &res[*rstart-1];
+    res[*rend-*rstart+1] = 0;
+    numread = *rend-*rstart+1;
+  }
+  else {
+    *rstart = 1;
+    *rend = numread;
+  }
+
+  myseq->lets = res;
+  myseq->numlets = numread-1;
+  // printf("red %d lets\n",numread);
+  return myseq;
+}
+
+char getLetter (FILE *file){
+  /* Return the next non-whitespace character of file, or 0 when the end
+   * of the file is reached first. */
+  int ch;   /* int, so that EOF is distinguishable from data */
+
+  while ((ch = fgetc (file)) != EOF){
+    if (!isspace (ch)) return (char) ch;
+  }
+  return 0;
+}
+
+void readSubstMatrix (char *filename, int size){
+  /* Load the substitution matrix from $LAGAN_DIR/filename into the global
+   * substmatrix and the two gap penalties that follow it into gapstart and
+   * gapcont.
+   *
+   * Expected layout: `size` symbols, then `size` rows each made of one
+   * symbol followed by `size` integers, then two integers (gap start, gap
+   * continue).  Entries for symbols not in the file are 0.
+   *
+   * Prints a message and exits if LAGAN_DIR is unset or the file cannot be
+   * opened or parsed. */
+  FILE *file;
+  char line[1024], *symbs;
+  const char *dir = getenv ("LAGAN_DIR");
+  int i, j;
+
+  if (!dir) {
+    fprintf (stderr, "LAGAN_DIR environment variable is not set\n");
+    exit (1);
+  }
+  if (snprintf (line, sizeof (line), "%s/%s", dir, filename) >= (int) sizeof (line)) {
+    fprintf (stderr, "path to %s is too long\n", filename);
+    exit (1);
+  }
+  file = fopen (line, "r");
+  if (!file) {
+    fprintf (stderr, "couldnt open substitution matrix file %s\n", line);
+    exit (1);
+  }
+
+  for (i = 0; i < 256; i++){
+    for (j = 0; j < 256; j++){
+      substmatrix[i][j] = 0;
+    }
+  }
+
+  symbs = (char *) malloc (sizeof (char) * size); assert (symbs || size <= 0);
+  for (i = 0; i < size; i++) symbs[i] = getLetter (file);
+  for (i = 0; i < size; i++){
+    getLetter (file);   /* skip the row label */
+    for (j = 0; j < size; j++){
+      if (fscanf (file, "%d", &(substmatrix[(unsigned char) symbs[i]][(unsigned char) symbs[j]])) != 1) {
+        fprintf (stderr, "malformed substitution matrix file %s\n", line);
+        exit (1);
+      }
+    }
+  }
+
+  if (fscanf (file, "%d", &gapstart) != 1 || fscanf (file, "%d", &gapcont) != 1) {
+    fprintf (stderr, "missing gap penalties in %s\n", line);
+    exit (1);
+  }
+
+  free (symbs);
+  fclose (file);
+}
+
+/* Return the argument following argv[*i] and advance *i to it; exit with a
+ * message naming option opt when the command line ends first. */
+static char* paramValue(int argc, char** argv, int *i, const char *opt) {
+  if (*i + 1 >= argc) {
+    fprintf(stderr, "missing value for option %s\n", opt);
+    exit(1);
+  }
+  *i += 1;
+  return argv[*i];
+}
+
+void paramParse(int argc, char** argv) {
+  /* Parse the options that follow the two sequence file names (argv[3]
+   * onwards) into the corresponding globals, then load the substitution
+   * matrix.  Each option is accepted in all-lower or all-upper case;
+   * unknown options are ignored.  Exits if an option is missing its value
+   * or a file given with -anc/-out cannot be opened. */
+  int i;
+  char *opt;
+
+  for (i = 3; i < argc; i++) {
+    opt = argv[i];
+    if (!strcmp(opt, "-gs") || !strcmp(opt, "-GS")) {
+      gapstart = atoi(paramValue(argc, argv, &i, opt));
+    }
+    else if (!strcmp(opt, "-gc") || !strcmp(opt, "-GC")) {
+      gapcont = atoi(paramValue(argc, argv, &i, opt));
+    }
+    else if (!strcmp(opt, "-bin") || !strcmp(opt, "-BIN")) {
+      dobin =1;
+    }
+    else if (!strcmp(opt, "-mfa") || !strcmp(opt, "-MFA")) {
+      domfa =1;
+    }
+    else if (!strcmp(opt, "-xmfa") || !strcmp(opt, "-XMFA")) {
+      doxmfa =1;
+    }
+    /* else if (!strcmp(argv[i], "-mt") || !strcmp(argv[i], "-MT")) {
+      match = atoi(argv[++i]);
+    }
+    else if (!strcmp(argv[i], "-ms") || !strcmp(argv[i], "-MS")) {
+      mismatch = atoi(argv[++i]);
+    }*/
+    else if (!strcmp(opt, "-bw") || !strcmp(opt, "-BW")) {
+      glwidth = atoi(paramValue(argc, argv, &i, opt));
+    }
+    else if (!strcmp(opt, "-s1") || !strcmp(opt, "-S1")) {
+      s1start = atoi(paramValue(argc, argv, &i, opt));
+      s1end = atoi(paramValue(argc, argv, &i, opt));
+    }
+    else if (!strcmp(opt, "-s2") || !strcmp(opt, "-S2")) {
+      s2start = atoi(paramValue(argc, argv, &i, opt));
+      s2end = atoi(paramValue(argc, argv, &i, opt));
+    }
+    else if (!strcmp(opt, "-anc") || !strcmp(opt, "-ANC")) {
+      if (!(ancfile = fopen(paramValue(argc, argv, &i, opt),"r"))) {
+        printf("couldnt open anchors file %s\n",argv[i]);
+        exit(2);
+      }
+    }
+    else if (!strcmp(opt, "-out") || !strcmp(opt, "-OUT")) {
+      if (!(outfile = fopen(paramValue(argc, argv, &i, opt),"w"))) {
+        printf("couldnt open output file %s\n",argv[i]);
+        exit(2);
+      }
+    }
+  }
+
+  /* NOTE(review): readSubstMatrix also stores gapstart and gapcont from the
+   * matrix file, and it runs after the loop above, so values given with
+   * -gs/-gc are overwritten.  Order kept as is; confirm whether the command
+   * line was meant to take precedence. */
+  readSubstMatrix (NUC_FILE, NUC_FILE_SIZE);
+}
+
+void usage() {
+  /* Print the command-line synopsis and the option list to stdout. */
+  printf("usage: \norder seq1file seq2file [options]\n\n");
+  printf("Options:\n");
+  printf("-gs # = Gap Start [default -100]\n");
+  printf("-gc # = Gap Continue [default -2]\n");
+  /* printf("-mt # = MaTch [default 12]\n");
+  printf("-ms # = MiSmatch [default -8]\n");*/
+  printf("-bw # = Barrel Width around conserved regions [default 15]\n");
+  printf("-anc anchorfile = specify an anchorfile to use [default no file]\n");
+  printf("-out outfile = write output to outfile [default screen]\n");
+  printf("-bin = write output in BINary format [default text]\n");
+  printf("-mfa = write output in MultiFAsta format [default text]\n");
+  /* -xmfa is accepted by paramParse but was missing from this list */
+  printf("-xmfa = write output in XMFA format [default text]\n");
+  printf("-s1 # # = use the given substring of the query [default whole]\n");
+  printf("-s2 # # = use the given substring of the dbase [default whole]\n");
+  printf("-version = prints the version of this ORDER\n");
+}
+
+hll* readAncFile(seq* seq1, seq* seq2) {
+  /* Read anchors from the global ancfile, one "(s1 e1)=(s2 e2) score" per
+   * line, and return them as a list with the last line read first.
+   *
+   * An anchor is kept only if it lies inside the selected window of each
+   * sequence (s1start..s1end, s2start..s2end; 0/0 means no window) and is
+   * not entirely before position 1 or entirely past the sequence.  Kept
+   * anchors are shifted to window-relative coordinates and clamped to
+   * 1..numlets.  Lines that do not parse and rejected anchors are dropped
+   * and their node is released.  The number of kept anchors is reported
+   * on stderr. */
+  hll *myres = 0, *tt;
+  char buff[256];
+  int i=0;
+  int inwin1, inwin2, keep;
+
+  while (fgets(buff, 256, ancfile)) {
+    tt = (hll*) malloc(sizeof(hll)); assert (tt);
+    if (sscanf(buff, "(%d %d)=(%d %d) %*f", &tt->seq1start, &tt->seq1end,
+               &tt->seq2start, &tt->seq2end) != 4) {
+      /* not an anchor line: the coordinates would be garbage */
+      free (tt);
+      continue;
+    }
+
+    inwin1 = (tt->seq1start >= s1start && tt->seq1end <= s1end) || (s1start == 0 && s1end == 0);
+    inwin2 = (tt->seq2start >= s2start && tt->seq2end <= s2end) || (s2start == 0 && s2end == 0);
+    keep = inwin1 && inwin2;
+
+    if (keep && tt->seq1start <= 0 && tt->seq1end <= 0) keep = 0;
+    if (keep && tt->seq2start <= 0 && tt->seq2end <= 0) keep = 0;
+    if (keep && tt->seq1start > s1start + seq1->numlets && tt->seq1end > s1start + seq1->numlets) keep = 0;
+    if (keep && tt->seq2start > s2start + seq2->numlets && tt->seq2end > s2start + seq2->numlets) keep = 0;
+
+    if (!keep) {
+      free (tt);
+      continue;
+    }
+
+    if (s1start > 0){
+      tt->seq1start = MAX2 (tt->seq1start - s1start + 1, 1);
+      tt->seq1end = MIN2 (tt->seq1end - s1start + 1, s1end);
+    }
+    if (s2start > 0){
+      tt->seq2start = MAX2 (tt->seq2start - s2start + 1, 1);
+      tt->seq2end = MIN2 (tt->seq2end - s2start + 1, s2end);
+    }
+
+    tt->seq1start = MAX2 (tt->seq1start, 1);
+    tt->seq2start = MAX2 (tt->seq2start, 1);
+    tt->seq1end = MIN2 (tt->seq1end, seq1->numlets);
+    tt->seq2end = MIN2 (tt->seq2end, seq2->numlets);
+
+    tt->next = myres;
+    i++;
+    myres = tt;
+  }
+  fprintf(stderr,"read %d anchs\n", i);
+  return myres;
+}
+
+void mkBarrel(int s1, int s2, int e1, int e2, int width, int *dn, int dt, int* starts, int *ends, dmat* mydm) { // For diagonals *dn .. dt-1, store in starts[]/ends[] a band of +-width cells around the line from (s1,s2) to (e1,e2); *dn is left at dt.
+ int sd = s1+s2-1, dlen; // sd: number of the diagonal on which the hit starts
+ int elem = (sd < mydm->d2)? s1: mydm->d2-s2; // position of the band centre within the current diagonal
+ int incr; // unused
+ double fl = 0;
+ double slope = (double)(e2-s2)/(double)(e1-s1); // NOTE(review): e1 == s1 with e2 != s2 divides by zero (infinite slope); only the 0/0 case is handled below
+ double cloc = elem;
+
+ if ((e2-s2 == 0) && (e1-s1 == 0))
+ slope = 1;
+ // printf("dt = %d\n", dt);
+ // printf("BA: %d, %d to %d, %d %f\n", s1,s2,e1,e2,slope);
+ for ( ; sd <(*dn); sd++) { // catch-up: replay the centre-tracking steps for diagonals before *dn without writing starts/ends
+ if (fl>=slope || (int)(cloc) == (int)(cloc+slope)) {
+ cloc+=slope;
+ fl -= slope;
+ }
+ else {
+ elem--;
+ fl++;
+ }
+ if (sd <= mydm->d2)
+ elem++;
+ }
+ fl = 0;
+ for ( ; *dn < dt; (*dn)++) { // same stepping as above, now recording the band for each diagonal
+ // printf("dn =%d ", *dn);
+ if (fl>=slope || (int)(cloc) == (int)(cloc+slope)) {
+ cloc+=slope;
+ fl -= slope;
+ }
+ else {
+ elem -=1;
+ fl++;
+ }
+ if (*dn <= mydm->d2)
+ elem++;
+
+ if (*dn < MIN2(mydm->d2, mydm->d1)) // dlen: used below as the upper limit (dlen-1) for ends[]; takes one of three forms depending on where *dn lies relative to d1 and d2
+ dlen = *dn;
+ else if (*dn < MAX2(mydm->d2, mydm->d1))
+ dlen = MIN2(mydm->d2, mydm->d1);
+ else
+ dlen = mydm->d2 + mydm->d1 - *dn;
+// if (*dn < 0 || *dn >= 34939) fprintf (stderr, "%d %d\n", *dn, dt);
+ starts[*dn] = MAX2(elem - width, 0); // band clamped to 0 .. dlen-1
+ ends[*dn] = MIN2(elem+width, dlen-1);
+ // printf("BARREL %d %d %d\n",*dn,starts[*dn],ends[*dn]);
+ }
+}
+
+void mkSquare(int s1, int s2, int e1, int e2, int *dn, int dt, int* starts, int *ends, dmat* mydm) { // For diagonals *dn .. dt-1, store in starts[]/ends[] the range computed from the rectangle (s1,s2)-(e1,e2); *dn is left at dt.
+ int dists[2];
+ long long int size = ((long long int)e1-(long long int)s1)
+ * ((long long int)e2-(long long int)s2); // rectangle area, computed in 64 bits
+ // printf("dt = %d\n", dt);
+ // printf("SQ: %d, %d to %d, %d\n", s1,s2,e1,e2);
+ if (size > MAX_SQ_SIZE) { // too large: recurse on two rectangles that meet at the midpoint, each extended by glwidth past it
+ fprintf (stderr, "SQUARE TOO BIG: %d,%d to %d,%d\n", s1, e1,s2,e2);
+ mkSquare(s1, s2, (s1+e1)/2+glwidth, (s2+e2)/2+glwidth, dn, (*dn+dt)/2, starts, ends, mydm); // first half of the diagonals
+ mkSquare((s1+e1)/2-glwidth, (s2+e2)/2-glwidth, e1, e2, dn, dt, starts, ends, mydm); // remaining diagonals (*dn was advanced by the call above)
+ return;
+ }
+ for ( ; *dn < dt; (*dn)++) {
+ // printf("square dn = %d\n", *dn);
+ if (*dn < mydm->d2) { // the formulas differ for diagonal numbers below d2 and from d2 onwards
+ dists[0] = s1-1;
+ dists[1] = *dn - e2;
+ }
+ else {
+ dists[0] = mydm->d2 - e2;
+ dists[1] = s1 - (*dn - mydm->d2)-1;
+ }
+// if (*dn < 0 || *dn >= 34939) fprintf (stderr, "%d\n", *dn);
+ starts[*dn] = MAX2(dists[0], dists[1]); // start: the larger of the two candidate bounds
+
+ if (*dn < mydm->d2) {
+ dists[0] = e1-1;
+ dists[1] = *dn - s2;
+ }
+ else {
+ dists[0] = mydm->d2 - s2;
+ dists[1] = e1 - (*dn-mydm->d2)-1;
+ }
+ ends[*dn] = MIN2(dists[0], dists[1]); // end: the smaller of the two candidate bounds
+ // printf("SQUARE %d %d %d\n",*dn, starts[*dn],ends[*dn]);
+ }
+}
+
+void doShapes(hll* myres, dmat* mydm, int* starts, int *ends) {
+  /* Fill starts[]/ends[] for every diagonal of the matrix: a square
+   * (mkSquare) for the stretch between the previous hit and the next one,
+   * a barrel (mkBarrel) along each hit, and a final square from the last
+   * hit to the matrix corner (d1,d2).  Squares are enlarged by
+   * max(overlap, glwidth) around the hits. */
+  const int pad = MAX2(overlap, glwidth);
+  int prev1 = pad + 1, prev2 = pad + 1;   /* end of the previous hit */
+  int cur1, cur2;                         /* start of the current hit */
+  int dn = 1, dt;
+  hll *hit;
+
+  for (hit = myres; hit; hit = hit->next) {
+    /* between hits */
+    cur1 = hit->seq1start;
+    cur2 = hit->seq2start;
+    dt = cur1 + cur2 - 1 + overlap;
+    mkSquare(prev1 - pad, prev2 - pad, cur1 + pad, cur2 + pad,
+             &dn, dt, starts, ends, mydm);
+
+    /* within a hit */
+    prev1 = hit->seq1end;
+    prev2 = hit->seq2end;
+    dt = prev1 + prev2 - 1 - overlap;
+    mkBarrel(cur1, cur2, prev1, prev2, glwidth, &dn, dt, starts, ends, mydm);
+  }
+
+  /* from the last hit to the end of both sequences */
+  cur1 = mydm->d1;
+  cur2 = mydm->d2;
+  dt = cur1 + cur2;
+  mkSquare(prev1 - pad, prev2 - pad, cur1, cur2, &dn, dt, starts, ends, mydm);
+}
+
+
+void parseAncs(dmat* mydm, seq* seq1, seq* seq2) {
+  /* Compute the per-diagonal start/end bounds from the anchors (read from
+   * ancfile when one was given, none otherwise) and hand them to
+   * DMinitDiag.  The bound arrays and the anchor list are temporary and
+   * released before returning. */
+  int nbounds = seq1->numlets + seq2->numlets + 2;
+  int *starts = (int*) malloc(sizeof(int)*nbounds);
+  int *ends = (int*) malloc(sizeof(int)*nbounds);
+  hll *myres = 0, *following;
+
+  assert (starts && ends);
+  if (ancfile) {
+    myres = readAncFile(seq1, seq2);
+  }
+  // printf("khe0\n");
+  doShapes(myres, mydm, starts, ends);
+  // printf("khe1\n");
+  DMinitDiag(mydm, starts,ends);
+  // printf("khe2\n");
+  free(starts);
+  free(ends);
+
+  /* the anchors are only needed by doShapes */
+  while (myres) {
+    following = myres->next;
+    free(myres);
+    myres = following;
+  }
+}
+
+void doAlign(dmat* mydm, seq* seq1, seq* seq2) {
+  /* Align the two sequences and print the result in the format selected
+   * on the command line; -xmfa takes precedence over -mfa, which takes
+   * precedence over -bin, and plain text is the fallback. */
+  align *result = (align*) makeAlign(mydm, seq1->lets, seq2->lets);
+
+  if (doxmfa) {
+    printXMFAAlign(seq1->lets, seq2->lets, result, seq1->name, seq2->name);
+    return;
+  }
+  if (domfa) {
+    printMFAAlign(seq1->lets, seq2->lets, result, seq1->name, seq2->name);
+    return;
+  }
+  if (dobin) {
+    printBinAlign(seq1->lets, seq2->lets, result);
+    return;
+  }
+  printTextAlign(seq1->lets, seq2->lets, result);
+}
+
+int main(int argc, char** argv) { // Entry point: order seq1file seq2file [options]. Returns 0 on success, 1 on a usage error, 2 if a sequence file cannot be opened.
+ FileBuffer fseq1, fseq2;
+ seq *seq1, *seq2;
+ dmat* mydm;
+ if (argc < 3) {
+ if (argc == 2) // -version is only recognised as the sole argument
+ if (!strcmp(argv[1], "-version") || !strcmp(argv[1], "-Version")) {
+ printf("ORDER version %s\n", VER_NUM);
+ exit(0);
+ }
+ usage();
+ return 1;
+ }
+ if (!(fseq1 = FileOpen(argv[1]))) {
+ printf("couldnt open query file %s\n",argv[1]);
+ usage();
+ return 2;
+ }
+ if (!(fseq2 = FileOpen(argv[2]))) {
+ printf("couldnt open dbase file %s\n",argv[2]);
+ usage();
+ return 2;
+ }
+ outfile = stdout; // default destination; paramParse replaces it when -out is given
+ paramParse(argc, argv);
+ seq1 = FileRead(fseq1, s1start, s1end, VER_ORDER);
+ seq2 = FileRead(fseq2, s2start, s2end, VER_ORDER);
+ if (s1start == s1end && s1end == 0) { // no -s1 range given: use the whole sequence
+ s1start = 1;
+ s1end = seq1->numlets;
+ }
+ if (s2start == s2end && s2end == 0) { // no -s2 range given: use the whole sequence
+ s2start = 1;
+ s2end = seq2->numlets;
+ }
+ mydm = makeDM(seq1->numlets+1, seq2->numlets+1); // matrix dimensions are one larger than the sequence lengths
+ parseAncs(mydm, seq1, seq2);
+ doAlign(mydm, seq1, seq2);
+ return 0;
+}
+
+
+/* Return 1 when the two letters are identical, 0 otherwise.
+ * static: a plain `inline` definition provides no external definition
+ * under C99 and later, so a call that is not inlined would fail to link. */
+static inline int ismatch(char a, char b) {
+  return a == b;
+}
+
+/* Return the substitution score for aligning letter a with letter b,
+ * looked up in the global substmatrix (filled by readSubstMatrix).
+ * static: a plain `inline` definition provides no external definition
+ * under C99 and later, so a call that is not inlined would fail to link. */
+static inline int matchscore (unsigned char a, unsigned char b) {
+  return substmatrix[a][b];
+  /*
+
+  if (!a || !b)
+    return 0;
+  if (a == 'N' || b == 'N')
+    return 0;
+  if (a == b)
+    return match;
+  return mismatch;
+  */
+}
+
+void reverse (char* a, int length) {
+  /* Reverse the first `length` bytes of a in place; lengths below 2 leave
+   * the buffer untouched. */
+  int left = 0;
+  int right = length - 1;
+  char held;
+
+  while (left < right) {
+    held = a[left];
+    a[left] = a[right];
+    a[right] = held;
+    left++;
+    right--;
+  }
+}
+
+align* getChain(dmat* mydm, char* seq1, char* seq2, int x, int y, int inrun) {
+  /* Trace back from cell (x,y) through the pointers stored in mydm and
+   * return the path as a newly allocated align.
+   *
+   * inrun is the state the traceback starts in: 0 none, 1 inside an
+   * insertion run, 2 inside a deletion run.  Each step appends one code to
+   * the path: the ismatch() result (0/1) for a diagonal step, INSERTION
+   * for a step in x, DELETION for a step in y.  The walk stops when x or y
+   * reaches 0, or when a cell marked 0x3 is met; there the alignment
+   * stored by DMgetNeck is linked as nextalign.  The codes are collected
+   * end-first and reversed before returning.
+   *
+   * If such a marked cell has no stored alignment, the result is returned
+   * empty (algn == 0, algnlen == 0) and the partial path is discarded. */
+  align *res = (align*) malloc (sizeof(align)), *help;
+  char* almt = (char*) malloc ( sizeof(char));
+  int i=0, almtsize = 1, which;
+  char zz;
+
+  assert (res && almt);
+  zz = DMgetPtr(mydm, x, y);
+
+  res->dirty = 0;
+  res->nextalign = 0;
+  res->algn = 0;
+  res->algnlen = 0;
+
+  do {
+    // printf("I am at %d,%d %x\n", x,y, zz);
+    which = zz & Mmask;
+
+    if (which == 0x3) {
+      help = DMgetNeck(mydm, x, y,inrun);
+      if (!help) {
+        /* the partial path is not returned, so do not leak it */
+        free (almt);
+        return res;
+      }
+      help->dirty = 1;
+      res->nextalign = help;
+      break;
+    }
+
+    /* while inside a gap run, the run-continuation bit overrides the
+     * direction stored for the cell */
+    if (inrun == 1) {
+      if (zz & Nmask) {
+        which = 1;
+      }
+    }
+    else if (inrun == 2) {
+      if (zz & Omask) {
+        which = 2;
+      }
+    }
+
+    if (which == 0) {
+      inrun = 0;
+      almt[i++] = ismatch(seq1[x-1], seq2[y-1]);
+      zz = DMgetPtr(mydm,--x,--y);
+    }
+
+    else if (which == 1) { /*N*/
+      inrun = 1;
+      almt[i++] = INSERTION;
+      zz = DMgetPtr(mydm, --x, y);
+    }
+
+    else if (which == 2) {
+      inrun = 2;
+      almt[i++] = DELETION;
+      zz = DMgetPtr(mydm, x, --y);
+    }
+    else
+      printf("a really dumb error %d\n", i);
+
+    if (i >= almtsize) {
+      almt = (char *) realloc (almt, sizeof(char)* (almtsize *= 2));
+      assert (almt);
+    }
+
+  } while (x > 0 && y > 0);
+
+
+  // printf("gotChain\n");
+  reverse(almt, i);
+  res->algn = almt;
+  res->algnlen = i;
+  // printf("done w it\n");
+  return res;
+}
+
+/* Trace back and store the alignment for each of the three states of
+ * `count` consecutive cells of one diagonal, starting at (x,y) and moving
+ * by (+1,-1) per cell. */
+static void saveNeckCells(dmat* mydm, char* seq1, char* seq2, int count, int x, int y) {
+  int cell, state;
+
+  for (cell = 0; cell < count; cell++) {
+    for (state = 0; state < 3; state++) {
+      DMsetNeck(mydm, getChain(mydm, seq1, seq2, x, y, state), x, y, state);
+    }
+    x++;
+    y--;
+  }
+}
+
+void saveNeck(dmat* mydm, char* seq1, char* seq2, int neckdiag) {
+  /* Store, for every cell of diagonal neckdiag and then of diagonal
+   * neckdiag-1, the traceback from that cell in each of the three states
+   * (via getChain and DMsetNeck).  DMnextNecks is called once before any
+   * traceback is stored. */
+  int lowsize, lowx, lowy;     /* diagonal neckdiag-1 */
+  int highsize, highx, highy;  /* diagonal neckdiag */
+
+  /* only the size and start coordinates are needed from these calls */
+  (void) DMgetDiagStart(mydm, neckdiag-1, &lowsize, &lowx, &lowy);
+  (void) DMgetDiagStart(mydm, neckdiag, &highsize, &highx, &highy);
+
+  DMnextNecks(mydm, neckdiag);
+  saveNeckCells(mydm, seq1, seq2, highsize, highx, highy);
+  saveNeckCells(mydm, seq1, seq2, lowsize, lowx, lowy);
+}
+
+void freeAlign(align* t) {
+  /* Release one alignment segment and its code array.  Segments linked
+   * through nextalign are not touched.  A null pointer is ignored. */
+  if (!t)
+    return;
+  free(t->algn);
+  free(t);
+}
+
+void joinAligns (align* a) {
+  /* Collapse a chain of alignment segments into its head.
+   *
+   * The segments linked through nextalign are concatenated into a single
+   * code array in reverse chain order (the head's codes end up last),
+   * which replaces a->algn; a->algnlen becomes the total length.  All
+   * other segments are freed and the head is left with nextalign == 0. */
+  align *t, *rest;
+  char *joined;
+  int totsize = 0, filled = 0;
+
+  for (t = a; t; t = t->nextalign) {
+    totsize += t->algnlen;
+  }
+  joined = (char *) malloc (totsize*sizeof(*joined));
+  assert (joined || totsize <= 0);
+
+  /* fill the buffer from its end towards its start */
+  for (t = a; t; t = t->nextalign) {
+    filled += t->algnlen;
+    if (t->algnlen > 0)   /* empty segments may have algn == 0 */
+      memcpy(joined + totsize - filled, t->algn, t->algnlen*sizeof(*joined));
+  }
+  free (a->algn);
+  a->algn = joined;
+  a->algnlen = totsize;
+
+  /* detach the tail before freeing it so the head keeps no dangling link */
+  rest = a->nextalign;
+  a->nextalign = 0;
+  while (rest) {
+    t = rest;
+    rest = rest->nextalign;
+    freeAlign(t);
+  }
+}
+
+align* makeAlign(dmat* mydm, char* seq1, char* seq2) { // Fill the matrix diagonal by diagonal with three scores per cell (M, N, O), record a traceback pointer per cell, then trace back from (d1,d2) and return the joined alignment with its score.
+ int i, j;
+ int x, y, size;
+ alel *curr, *pasts0, *pasts1, *pasts2;
+ align* a;
+ char isneck;
+ int ndiags = mydm->d1 + mydm->d2 -1;
+ register int s1, s2, s3;
+ register char ptr;
+
+ isneck = DMnextDiag(mydm);
+ curr = DMgetDiagStart(mydm, 1, &size, &x, &y);
+ curr->N = curr->O = gapstart; // first cell: match score 0, both gap states cost one gap opening
+ curr->M = 0;
+ DMsetPtr(mydm, 0, 1, 1);
+ // printf("[%d %d]=%d %d %d\n",x,y,curr->M, curr->N, curr->O);
+ for (i = 2; i <= ndiags; i++) {
+ isneck = DMnextDiag(mydm);
+ if (!(i%10000))
+ fprintf(stderr, "WORKING %d/%d\n", i/10000, ndiags/10000); // progress report every 10000 diagonals
+ curr = DMgetDiagStart(mydm, i, &size, &x, &y);
+
+ pasts2 = DMgetElem(mydm, x-1, y);
+ pasts1 = DMgetElem(mydm, x-1, y-1);
+ for (j = 0; j < size; j++) {
+
+ /***************************************************/
+ pasts0 = pasts2; // pasts0: cell (x-1,y); pasts1: cell (x-1,y-1); pasts2 (set on the next line): cell (x,y-1)
+ pasts2 = DMgetElem2(mydm, x, y-1, pasts2);
+
+ s1 = pasts1->M;
+ s2 = pasts1->N + ((ISCB(seq2[y-1]))?0:gapcont); // the '.' character (ISCB) makes the gap penalty 0
+ s3 = pasts1->O + ((ISCB(seq1[x-1]))?0:gapcont);
+ curr->M = matchscore (seq1[x-1], seq2[y-1]); // M = substitution score + best of the three diagonal-predecessor candidates
+ if (s1 >= s2){
+ if (s1 >= s3){ curr->M += s1; /*ptr = 0;*/ }
+ else { curr->M += s3; /*ptr = 2;*/ }
+ }
+ else {
+ if (s2 >= s3){ curr->M += s2; /*ptr = 1;*/ }
+ else { curr->M += s3; /*ptr = 2;*/ }
+ }
+
+ s1 = curr->M + ((ISCB(seq2[y-1]))?0:gapstart); // N: open a gap from M here, or continue the N run of cell (x-1,y)
+ s2 = pasts0->N + ((ISCB(seq2[y-1]))?0:gapcont);
+ if (s1 >= s2){ curr->N = s1; ptr = 0; }
+ else { curr->N = s2; ptr = 4; } // bit value 4: N continues a run (presumably what getChain tests with Nmask - confirm)
+
+ s1 = curr->M + ((ISCB(seq1[x-1]))?0:gapstart); // O: open a gap from M here, or continue the O run of cell (x,y-1)
+ s2 = pasts2->O + ((ISCB(seq1[x-1]))?0:gapcont);
+ if (s1 >= s2){ curr->O = s1; }
+ else { curr->O = s2; ptr |= 8; } // bit value 8: O continues a run (presumably what getChain tests with Omask - confirm)
+
+ s1 = curr->M;
+ s2 = curr->N;
+ s3 = curr->O;
+ if (curr->M >= curr->N){ // low bits of ptr: 0 when M is the best state, 1 for N, 2 for O
+ if (curr->M < curr->O)
+ ptr |= 2;
+ }
+ else {
+ if (curr->N >= curr->O)
+ ptr |= 1;
+ else
+ ptr |= 2;
+ }
+ //ptr |= WEQ3(curr->M, curr->N, curr->O, MAX3(curr->M, curr->N, curr->O));
+ //ptr = ptr | (WEQ2(curr->M+gapstart, pasts0->N+gapcont, curr->N) << 2);
+ //ptr = ptr | (WEQ2(curr->M+gapstart, pasts0->O+gapcont, curr->O) << 3);
+ /***************************************************/
+ /*
+ curr->M = MAX3(pasts[1]->M, pasts[1]->N+gapcont, pasts[1]->O+gapcont);
+ curr->M += matchscore(seq1[x-1], seq2[y-1]);
+ curr->N = MAX2(curr->M+gapstart, pasts[0]->N+gapcont);
+ curr->O = MAX2(curr->M+gapstart, pasts[2]->O+gapcont);
+ ptr = WEQ3(curr->M, curr->N, curr->O, MAX3(curr->M, curr->N, curr->O));
+ ptr = ptr | (WEQ2(curr->M+gapstart, pasts[0]->N+gapcont, curr->N) << 2);
+ ptr = ptr | (WEQ2(curr->M+gapstart, pasts[0]->O+gapcont, curr->O) << 3);
+ */
+
+ DMsetPtr(mydm, ptr, x, y);
+ curr++; x++; y--; // next cell of the same diagonal
+
+ pasts1 = DMgetElem2(mydm, x-1, y-1, pasts1);
+ }
+ if ((i < ndiags - 2) && isneck) { // at a diagonal flagged by DMnextDiag (except the last few), store the tracebacks so far
+ saveNeck(mydm, seq1, seq2, i);
+ }
+ }
+ mydm->currneck++;
+ a = getChain(mydm, seq1, seq2, mydm->d1, mydm->d2, 0);
+ curr--; // step back to the last cell that was filled
+ a->score = MAX3(curr->M, curr->N, curr->O);
+ // printf("here! %d\n", a);
+ joinAligns(a); // merge the stored segments into one alignment
+ return a;
+}
+
+int printBinAlign(char* seq1, char* seq2, align* myalign) {
+  /* Write the alignment to the global outfile as one byte per column
+   * (columns 1 .. algnlen-1): the high nibble encodes the letter of seq1,
+   * the low nibble the letter of seq2, with 0 for a gap or '.', A=1, C=2,
+   * T=3, G=4, N=5, and -1 for any other letter.  Letters are read from
+   * index 1 of each sequence.  outfile is closed afterwards.
+   * Returns 0. */
+  int s1=1, s2=1, c;
+  char lets[256];
+  char left, right;
+  // fprintf(stderr,"kuku\n");
+  for (c = 0; c < 256; c++)
+    lets[c] = -1;
+  lets['A'] = 1; lets['C'] = 2; lets['T'] = 3; lets['G'] = 4; lets['N'] = 5; lets['.'] = 0;
+  for (c = 1; c < myalign->algnlen; c++) {
+    left=right=0;
+    /* index through unsigned char: plain char may be negative */
+    if (myalign->algn[c] != DELETION)
+      left = lets[(unsigned char) seq1[s1++]];
+    if (myalign->algn[c] != INSERTION)
+      right = lets[(unsigned char) seq2[s2++]];
+    right = right | (left << 4);
+    putc(right, outfile);
+  }
+  fclose(outfile);
+  return 0;
+}
+
+int printTextAlign(char* seq1, char* seq2, align* myalign) {
+  /* Write the pairwise alignment to the global outfile in blocks of 60
+   * columns: the seq1 row ('-' where seq1 has a gap), a row with ':' under
+   * matching columns, and the seq2 row, followed by a summary line with
+   * the score, number of matches, gaps in each row, number of columns
+   * counted and the match fraction.  Gaps and columns are only counted
+   * from the first match onwards.  Columns and letters are numbered from
+   * 1.  Returns 0. */
+  int s1=1, s2=1, c, k;
+  int nm=0, nga=0, ngb=0, nlets=0;
+  int hasst=0;   /* set once the first match has been printed */
+  float perc;
+
+  for (c = 1; c < myalign->algnlen; c = c + 60) {
+    for (k = c; (k < (c + 60)) && (k < myalign->algnlen); k++) {
+      if (myalign->algn[k] != DELETION)
+        fprintf(outfile, "%c", seq1[s1++]);
+      else {
+        fprintf(outfile,"-");
+        if (hasst)
+          nga++;
+      }
+    }
+    fprintf(outfile,"\n");
+    for (k = c; (k < (c + 60)) && (k < myalign->algnlen); k++) {
+      if (myalign->algn[k] == 1) {
+        fprintf(outfile, ":");
+        nm++;
+        nlets++;
+        hasst = 1;
+      }
+      else {
+        fprintf(outfile, " ");
+        if (hasst) nlets++;
+      }
+    }
+    fprintf(outfile, "\n");
+    for (k = c; (k < (c + 60)) && (k < myalign->algnlen); k++) {
+      if (myalign->algn[k] != INSERTION)
+        fprintf(outfile, "%c", seq2[s2++]);
+      else {
+        fprintf(outfile, "-");
+        if (hasst)
+          ngb++;
+      }
+    }
+    fprintf(outfile, "\n\n");
+  }
+
+  /* no counted column means no match either: report 0 instead of 0/0 */
+  perc = nlets ? (float)nm/(float)nlets : 0.0f;
+  fprintf(outfile,"score = %d, nmatches = %d, nga=%d, ngb=%d nletters=%d, perc = %f\n",
+          myalign->score,nm,nga,ngb,nlets,perc);
+  fprintf(outfile,"\n");
+  return 0;
+}
+
+int printMFAAlign(char* seq1, char* seq2, align* myalign, char* n1, char* n2) {
+  /* Write the pairwise alignment to the global outfile in multi-FASTA
+   * form: ">n1" and the aligned seq1 row, then ">n2" and the aligned seq2
+   * row, each wrapped at 60 columns with '-' for gaps.  Columns and
+   * letters are numbered from 1.  Returns 0. */
+  int s1=1, s2=1, c, k;
+
+  fprintf(outfile,">%s\n", n1);
+  for (c = 1; c < myalign->algnlen; c = c + 60) {
+    for (k = c; (k < (c + 60)) && (k < myalign->algnlen); k++) {
+      if (myalign->algn[k] != DELETION)
+        fprintf(outfile, "%c", seq1[s1++]);
+      else
+        fprintf(outfile,"-");
+    }
+    fprintf(outfile,"\n");
+  }
+
+  fprintf(outfile,">%s\n", n2);
+  for (c = 1; c < myalign->algnlen; c = c + 60) {
+    for (k = c; (k < (c + 60)) && (k < myalign->algnlen); k++) {
+      if (myalign->algn[k] != INSERTION)
+        fprintf(outfile, "%c", seq2[s2++]);
+      else
+        fprintf(outfile, "-");
+    }
+    fprintf(outfile, "\n");
+  }
+  return 0;
+}
+
+int printXMFAAlign(char* seq1, char* seq2, align* myalign, char* n1, char* n2) {
+  /* Write the pairwise alignment to the global outfile in XMFA form:
+   * ">1:s1start-s1end + n1" and the aligned seq1 row, then
+   * ">2:s2start-s2end + n2" and the aligned seq2 row, each wrapped at 60
+   * columns with '-' for gaps.  Columns and letters are numbered from 1.
+   * Returns 0. */
+  int s1=1, s2=1, c, k;
+
+  fprintf(outfile,">1:%d-%d + %s\n", s1start, s1end, n1);
+  for (c = 1; c < myalign->algnlen; c = c + 60) {
+    for (k = c; (k < (c + 60)) && (k < myalign->algnlen); k++) {
+      if (myalign->algn[k] != DELETION)
+        fprintf(outfile, "%c", seq1[s1++]);
+      else
+        fprintf(outfile,"-");
+    }
+    fprintf(outfile,"\n");
+  }
+
+  fprintf(outfile,">2:%d-%d + %s\n", s2start, s2end, n2);
+  for (c = 1; c < myalign->algnlen; c = c + 60) {
+    for (k = c; (k < (c + 60)) && (k < myalign->algnlen); k++) {
+      if (myalign->algn[k] != INSERTION)
+        fprintf(outfile, "%c", seq2[s2++]);
+      else
+        fprintf(outfile, "-");
+    }
+    fprintf(outfile, "\n");
+  }
+  return 0;
+}
+
+
+
+
+
+
+
+
+
diff --git a/src/order.h b/src/order.h
new file mode 100644
index 0000000..dee9dad
--- /dev/null
+++ b/src/order.h
@@ -0,0 +1,29 @@
+#ifndef ORDER__H
+#define ORDER__H
+
+#include "fchaos.h"
+
+ /* One pairwise alignment result, passed to the print*Align routines. */
+ typedef struct align_res {
+ /* alignment score */
+ int score;
+ /* number of entries in algn */
+ int algnlen;
+ /* per-column codes of the alignment */
+ /* NOTE(review): presumably indexed from 1 and holding values such as */
+ /* DELETION / INSERTION, as the printing code suggests -- confirm */
+ char* algn;
+ /* NOTE(review): presumably links to a following alignment -- confirm */
+ struct align_res *nextalign;
+ /* NOTE(review): meaning not visible from this header -- confirm */
+ int nextloc;
+ /* NOTE(review): flag whose meaning is not visible from this header */
+ char dirty;
+ } align;
+
+
+//align* makeAlign(dmat* mydm, char* seq1, char* seq2);
+int printAlign(char* seq1, char* seq2, align* myalign);
+void freeAlign(align* t);
+int printBinAlign(char* seq1, char* seq2, align* myalign);
+int printTextAlign(char* seq1, char* seq2, align* myalign);
+
+#endif
+
+
+
+
+
+
+
diff --git a/src/prolagan.c b/src/prolagan.c
new file mode 100644
index 0000000..293f9d4
--- /dev/null
+++ b/src/prolagan.c
@@ -0,0 +1,1115 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <ctype.h>
+#include <math.h>
+
+#include "skiplist.h"
+#include "multial.h"
+#include "filebuffer.h"
+
+#define VER_NUM "1.1"
+#define MIN2(x,y) ( (x) >= (y) ? (y) : (x) )
+#define MAX2(x,y) ( (x) >= (y) ? (x) : (y) )
+
+ // Global variables
+ // Most of these are command-line switches filled in by parseParameters().
+
+ // set by -nested
+ static int nested = 0;
+ // set by -postir
+ static int postir = 0;
+ // set by -lazy; forwarded to rechaos.pl as -lazy
+ static int lazy = 0;
+ // stays 1 until a tree is supplied with -tree
+ static int notree = 1;
+ // set by -verbose; enables debug dumps of anchors and alignments
+ static int verbose = 0;
+ // number of sequence files opened from the command line
+ static int numseqs = 0;
+ // iterativeImprovement() runs numseqs*itertimes rounds
+ static int itertimes = 1;
+ // iterativeImprovement() uses cutoffmatch*100 as its anchor cutoff
+ static int cutoffmatch = 12;
+ // set by -translate; forwarded to rechaos.pl as -translate
+ static int translate = 0;
+ // on by default; forwarded to rechaos.pl as -ext
+ static int extend = 1;
+ // set by -fastreject; forwarded to rechaos.pl and selects XMFA output
+ static int fastreject = 0;
+ // set by -gfc (marked as broken in usage()); forwarded to rechaos.pl
+ static int gapfreechunks = 0;
+
+ // one single-sequence alignment per input sequence, indexed by input order
+ static align *simaligns[MAX_SEQ];
+ // value of the LAGAN_DIR environment variable, read in main()
+ static char* lagan_dir;
+
+ // the two profiles loaded with -pro1 / -pro2 (0 until loaded)
+ static align *profile1 = 0;
+ static align *profile2 = 0;
+
+ /*
+  * qsort() comparator for hptr events.
+  *
+  * Events are ordered by ascending `number`.  When two events share the same
+  * number, a start event sorts before an end event.  Two events with the same
+  * number and the same kind compare equal: the previous version returned -1
+  * (or 1) for both argument orders in that case, which breaks the consistency
+  * qsort() requires of its comparison function.
+  */
+ static int hptrcomp (const void *p1, const void *p2) {
+     const hptr *a = (const hptr*)p1;
+     const hptr *b = (const hptr*)p2;
+
+     if (a->number > b->number)
+         return 1;
+     if (a->number < b->number)
+         return -1;
+     if (a->isstart && !b->isstart)
+         return -1;
+     if (!a->isstart && b->isstart)
+         return 1;
+     return 0;
+ }
+
+
+ /* Print the command-line synopsis and the list of options to stdout. */
+ void usage(void) {
+     printf("mlagan seqfile_1 seqfile_2 [... seqfile_%d] [-parameters]\n\n",
+            MAX_SEQ);
+     /* "-ext" is not listed: anchor extension is now the default.      */
+     /* "-gfc" is not listed: gap-free chunks are currently broken.     */
+     fputs("-lazy : uses lazy mode\n"
+           "-translate : use translated anchors\n"
+           "-fastreject : use fast rejection (tuned for human/mouse or closer)\n"
+           "-verbose : give debug output\n"
+           "-tree \"(...)\" : runs with given phylogenetic tree\n"
+           "-out \"filename\": outputs to filename\n"
+           "-version : prints version info\n",
+           stdout);
+ }
+
+ /*
+  * Read one FASTA record from `input`.
+  *
+  * The header line must start with '>'; otherwise the program exits with
+  * status 1.  Sequence letters are upper-cased, whitespace is skipped and any
+  * letter not present in the global `alpha` is replaced by 'N' with a warning
+  * on stderr.  Reading stops at the next '>' (which is pushed back) or at EOF.
+  *
+  * Returns a freshly allocated seq whose `lets`/`rptr` buffer starts with a
+  * padding 'N' at index 0 and is NUL-terminated, and whose `numlets` counts
+  * that padding letter.  Returns 0 when no header line can be read.
+  */
+ seq* readfile(FILE* input) {
+     char header[256];
+     char *res, *grown, *eol;
+     int ressize = 2, numread = 1;   /* index 0 holds the padding 'N' */
+     int currchar;                   /* int, so EOF is distinguishable from data */
+     seq* myseq;
+
+     /* Nothing is allocated before the header is read, so this path is leak-free. */
+     if (!fgets(header, sizeof(header), input))
+         return 0;
+     if (header[0] != '>') {
+         fprintf(stderr, "File is not in FASTA format!!\n");
+         exit(1);
+     }
+
+     eol = strchr(header, '\n');
+     if (eol) {
+         *eol = 0;
+     }
+     else {
+         /* Header longer than the buffer (or unterminated): drop the rest of it. */
+         while ((currchar = fgetc(input)) != EOF && currchar != '\n')
+             ;
+     }
+
+     myseq = (seq*) malloc(sizeof(seq));
+     res = (char*) malloc(sizeof(char) * ressize);
+     assert(myseq && res);
+     res[0] = 'N';
+
+     /* strlen(header) == strlen(header + 1) + 1, i.e. exactly name + NUL. */
+     myseq->name = (char*) malloc(strlen(header) * sizeof(char));
+     assert(myseq->name);
+     strcpy(myseq->name, header + 1);
+
+     while (((currchar = fgetc(input)) != '>') && (currchar != EOF)) {
+         if (isspace(currchar))
+             continue;
+         currchar = toupper(currchar);
+         if (!strchr(alpha, currchar)) {
+             fprintf(stderr, "Warning: %c converted to 'N'\n", currchar);
+             currchar = 'N';
+         }
+         res[numread++] = (char) currchar;
+         if (numread >= ressize) {
+             /* Double the buffer; this always leaves room for the final NUL. */
+             grown = (char*) realloc(res, sizeof(char) * (ressize *= 2));
+             assert(grown);
+             res = grown;
+         }
+     }
+     if (currchar == '>')
+         ungetc(currchar, input);
+     res[numread] = 0;
+
+     myseq->rptr = res;
+     myseq->lets = res;
+     myseq->numlets = numread;
+     return myseq;
+ }
+
+ /*
+  * Return non-zero when the first whitespace-delimited token of `str` is
+  * exactly `word`.
+  *
+  * Leading whitespace in `str` is skipped, as sscanf("%s") did in the previous
+  * version.  This version allocates nothing (the old scratch buffer was never
+  * freed) and is well defined when `str` contains no token at all: the first
+  * token is then the empty string.
+  */
+ int starts_with(char *str, char *word) {
+     size_t len = 0;
+
+     while (isspace((unsigned char) *str))
+         str++;
+     while (str[len] && !isspace((unsigned char) str[len]))
+         len++;
+
+     return (strlen(word) == len) && (strncmp(str, word, len) == 0);
+ }
+
+ /*
+  * Look through the first `numseqs` entries of `aligns` for the alignment
+  * whose first sequence name has `name` as its first token.
+  * Exits with status 2 when no entry matches.
+  */
+ align* findAlignByName(align *aligns[], char *name) {
+     int idx;
+
+     for (idx = 0; idx < numseqs; idx++) {
+         if (starts_with(aligns[idx]->seqs[0]->name, name))
+             return aligns[idx];
+     }
+
+     fprintf(stderr, "alignment not found for: %s", name);
+     exit(2);
+     return NULL;
+ }
+
+int kk = 0;
+
+// Profile stuff start
+
+// replaces the sequence of same name with replacer, returning which was
+// replaced or -1 if none.
+
+ /*
+  * Find the sequence in profile `res` whose name equals replacer->name,
+  * replace that slot with `replacer`, and return its index.
+  *
+  * Returns -1 when no sequence matches, or when `res` is NULL (a profile that
+  * was never loaded), so callers can report the problem instead of crashing.
+  */
+ int getSeqNumber(align* res, seq* replacer) {
+     int i;
+
+     if (!res)
+         return -1;
+     for (i = 0; i < res->numseq; i++) {
+         if (!strcmp(res->seqs[i]->name, replacer->name)) {
+             res->seqs[i] = replacer;
+             return i;
+         }
+     }
+     return -1;
+ }
+
+ /*
+  * Append the gapped sequence `seqwgaps` to profile `res` as sequence number
+  * res->numseq and update the per-column statistics for columns 1..algnlen-1:
+  *
+  *   - a letter found among the first four symbols of `alpha` bumps the
+  *     matching res->cnts[k] counter;
+  *   - any non-gap letter sets this sequence's bit in res->algn[i];
+  *   - CNTS_GE is bumped where a letter follows a gap, CNTS_GS where a gap
+  *     follows a letter, CNTS_GC where a gap follows a gap.
+  *
+  * Fixes over the previous version: a letter that is absent from `alpha` no
+  * longer turns a NULL strchr() result into an array index, and the presence
+  * bit is computed in long long so sequence numbers >= 31 do not overflow int.
+  */
+ void appendAlignProfile(align *res, seq* seqwgaps) {
+     int i;
+     char *hit;
+
+     res->seqs[res->numseq] = seqwgaps;
+     for (i = 1; i < res->algnlen; i++) {
+         if (seqwgaps->lets[i] != '-') {
+             hit = strchr(alpha, seqwgaps->lets[i]);
+             if (hit && (hit - alpha) < 4) {
+                 res->cnts[hit - alpha][i]++;
+             }
+             res->algn[i] |= (1LL << res->numseq);
+             if (seqwgaps->lets[i-1] == '-')
+                 res->cnts[CNTS_GE][i]++;
+         }
+         else {
+             /* Gap column: the sequence's bit in algn[i] stays clear. */
+             if (seqwgaps->lets[i-1] != '-')
+                 res->cnts[CNTS_GS][i]++;
+             else
+                 res->cnts[CNTS_GC][i]++;
+         }
+     }
+     res->numseq++;
+ }
+
+ // Build a profile (multi-sequence align) from a file of gapped sequences.
+ // Every sequence returned by FileRead() is appended with
+ // appendAlignProfile(); all sequences must have the same length, otherwise
+ // the program exits with status 1.  The returned profile starts with
+ // index 32, which main() lowers to the smallest input index of its members.
+ // NOTE(review): if the file yields no sequence, algnlen stays -1 and
+ // algn/cnts are never allocated -- confirm callers cannot hit this.
+ align* readProfile(FileBuffer with_gaps) {
+ int i,j;
+ seq* myseq;
+ align* res = (align*) malloc (sizeof(align));
+ res->score = 0;
+ res->nextalign = 0;
+ res->dirty = 0;
+ res->numseq = 0;
+ res->algnlen = -1;
+ res->index = 32;
+
+ while ( myseq = FileRead( with_gaps,0,0,VER_MLAGAN )) {
+ // fprintf(stdout, "seq: %s\n", myseq->lets);
+ // First sequence: its length fixes the profile length; allocate and
+ // zero the column mask and all counters.
+ if (res->algnlen < 0) {
+ res->algnlen = myseq->numlets;
+ res->algn = (long long int*) malloc((res->algnlen+1) * sizeof(long long int));
+ assert (res->algn);
+ for (j=0; j<CNTS_LEN; j++) {
+ res->cnts[j] = (char*) malloc((res->algnlen+1) * sizeof(char));
+ assert (res->cnts[j]);
+ }
+ for (i=0; i<= res->algnlen;i++) {
+ for (j=0; j<CNTS_LEN; j++)
+ res->cnts[j][i] = 0;
+ res->algn[i] = 0;
+ }
+ }
+ if ( res->algnlen != myseq->numlets) {
+ fprintf (stderr, "Lengths screwed up!!!\n");
+ exit(1);
+ }
+ appendAlignProfile(res, myseq);
+ }
+ if (verbose) {
+ fprintf(stdout, "LOADED RES\n");
+ printTextAlign(stdout,res);
+ }
+ return res;
+ }
+
+
+// Profile stuff end
+
+
+ /*
+  * Debug dump of an anchor list to stderr.  Prints a running call counter
+  * (global kk) followed by one "(s1 e1)=(s2 e2) score" line per anchor,
+  * flushing stderr after every line.
+  */
+ void printHLL(hll *myres) {
+     hll *cur;
+
+     fprintf(stderr, "into %d\n", ++kk);
+     fflush(stderr);
+     for (cur = myres; cur; cur = cur->next) {
+         fprintf(stderr, "(%d %d)=(%d %d) %f\n",
+                 cur->seq1start, cur->seq1end,
+                 cur->seq2start, cur->seq2end, cur->score);
+         fflush(stderr);
+     }
+ }
+
+ /*
+  * Load the anchors stored in "<fname>.anchors".
+  *
+  * Each well-formed line "(s1 e1)=(s2 e2) score" becomes one hll node; nodes
+  * are pushed on the front of the list, so the result is in reverse file
+  * order.  Lines that do not parse are skipped instead of producing a node
+  * with uninitialised fields.  Exits with status 2 when the file cannot be
+  * opened or its name does not fit the path buffer.
+  *
+  * With -fastreject the first and last list entries are boundary anchors:
+  * they tighten f1/f2 startpos and endpos, then are removed from the list and
+  * freed.  A list with fewer than two entries has no such pair; it is
+  * released and NULL is returned (this used to dereference NULL).  A list
+  * holding only the two boundary anchors now yields NULL as well.
+  *
+  * Returns the anchor list, possibly NULL.
+  */
+ hll* getAnchsFromFile(char *fname, FileBuffer f1, FileBuffer f2) {
+     FILE *ancfile;
+     hll *myres = 0, *tt = 0, *head, *last;
+     char buff[256];
+     int i = 0;
+
+     if (snprintf(buff, sizeof(buff), "%s.anchors", fname) >= (int) sizeof(buff)) {
+         fprintf(stderr, "anchor file name too long:: %s.anchors\n", fname);
+         exit(2);
+     }
+     ancfile = fopen(buff, "r");
+     if (ancfile == NULL) {
+         fprintf(stderr, "anchor file not found:: %s.anchors\n",
+                 fname);
+         exit(2);
+     }
+
+     while (fgets(buff, sizeof(buff), ancfile)) {
+         tt = (hll*) malloc(sizeof(hll));
+         assert(tt);
+         if (sscanf(buff, "(%d %d)=(%d %d) %f", &tt->seq1start, &tt->seq1end,
+                    &tt->seq2start, &tt->seq2end, &tt->score) != 5) {
+             free(tt);
+             continue;
+         }
+         tt->next = myres;
+         myres = tt;
+         i++;
+     }
+     fclose(ancfile);
+
+     if (fastreject) {
+         if (!myres || !myres->next) {
+             /* No boundary pair to strip: treat as "no usable anchors". */
+             free(myres);
+             myres = 0;
+         }
+         else {
+             head = myres;
+             f1->startpos = MAX2(f1->startpos, head->seq1end);
+             f2->startpos = MAX2(f2->startpos, head->seq2end);
+
+             /* Stop on the node just before the last one. */
+             for (tt = head; tt->next->next; tt = tt->next)
+                 ;
+             last = tt->next;
+             f1->endpos = MIN2(f1->endpos, last->seq1start);
+             f2->endpos = MIN2(f2->endpos, last->seq2start);
+
+             if (tt == head) {
+                 /* Only the two boundary anchors were present. */
+                 myres = 0;
+             }
+             else {
+                 myres = head->next;
+                 tt->next = 0;
+             }
+             free(head);
+             free(last);
+         }
+     }
+     fprintf(stderr,"read %d anchs\n", i);
+     return myres;
+ }
+
+
+
+ /*
+  * Run rechaos.pl on the two sequence files and load the anchors it writes.
+  *
+  * The anchor file prefix is the base name of a1 up to its first '.', followed
+  * by the base name of a2 up to its first '.'.  rechaos.pl is invoked from
+  * LAGAN_DIR with the -ext/-translate/-fastreject/-gfc/-lazy switches that
+  * mirror the current global settings.
+  *
+  * Returns the anchor list read by getAnchsFromFile(), or 0 when -fastreject
+  * is on and rechaos.pl exits with status 3.  Any other non-zero status, or
+  * file names too long for the buffers, terminates the program with status 1.
+  *
+  * NOTE(review): the file names are interpolated unquoted into a shell
+  * command line; names containing spaces or shell metacharacters are not
+  * safe here.
+  */
+ hll* generateAnchors( FileBuffer a1, FileBuffer a2) {
+     /* 240 leaves room for the ".anchors" suffix in the 256-byte path
+        buffer that getAnchsFromFile() uses. */
+     char fname[240];
+     char *cmd;
+     char *name1, *name2;
+     char *endpnt;
+     size_t diff1, diff2, cmdsize;
+     hll* res;
+     int retstat;
+
+     name1 = strrchr (a1->filename, '/');
+     if (!name1) name1 = a1->filename;
+     else name1++;
+     name2 = strrchr (a2->filename, '/');
+     if (!name2) name2 = a2->filename;
+     else name2++;
+
+     endpnt = strchr (name1, '.');
+     diff1 = (endpnt) ? (size_t)(endpnt - name1) : strlen(name1);
+     endpnt = strchr (name2, '.');
+     diff2 = (endpnt) ? (size_t)(endpnt - name2) : strlen(name2);
+     if (diff1 + diff2 >= sizeof(fname)) {
+         fprintf (stderr, "Sequence file names too long: %s %s\n", name1, name2);
+         exit (1);
+     }
+     memcpy (fname, name1, diff1);
+     memcpy (fname + diff1, name2, diff2);
+     fname[diff1+diff2] = 0;
+
+     /* Fixed text plus all five switches is well under 128 characters. */
+     cmdsize = strlen(lagan_dir) + strlen(a1->filename) + strlen(a2->filename)
+               + strlen(fname) + 128;
+     cmd = (char*) malloc (cmdsize);
+     assert (cmd);
+     snprintf(cmd, cmdsize, "%s/rechaos.pl %s %s -out %s.anchors %s %s %s %s %s\n",
+              lagan_dir,
+              a1->filename,
+              a2->filename,
+              fname,
+              (extend ? "-ext" : ""),
+              (translate ? "-translate" : ""),
+              (fastreject ? "-fastreject" : ""),
+              (gapfreechunks ? "-gfc" : ""),
+              (lazy ? "-lazy" : ""));
+
+     retstat = system(cmd) >> 8;
+     free (cmd);
+     if (fastreject && (retstat == 3)) {
+         return 0;
+     }
+     else if (retstat) {
+         fprintf (stderr, "Error from rechaos\n");
+         exit (1);
+     }
+     res = getAnchsFromFile(fname, a1, a2);
+     return res;
+ }
+
+
+ /*
+  * Write `myseq` as one FASTA record: a ">name" line followed by the first
+  * numlets characters of rptr on a single line.  A NULL `outfile` means
+  * stdout; any other stream is closed before returning.
+  */
+ void printFASTASeq(FILE *outfile, seq *myseq) {
+     FILE *dest = outfile ? outfile : stdout;
+     int pos = 0;
+
+     fprintf(dest, ">%s\n", myseq->name);
+     while (pos < myseq->numlets)
+         fputc(myseq->rptr[pos++], dest);
+     fputc('\n', dest);
+
+     if (dest != stdout)
+         fclose(dest);
+ }
+
+
+ // Sweep over the start/end events in `array` (sorted by the caller with
+ // hptrcomp) and chain the anchors.  A skip list keyed by seq2 position
+ // holds the anchors closed so far.
+ // Returns the myelem of the skip list's last entry; following bkptr from
+ // it walks the chosen chain backwards.
+ // NOTE(review): presumably SLfind() returns the entry with the greatest key
+ // not exceeding the argument (a sentinel with myelem == 0 when there is
+ // none) -- confirm against skiplist.h.
+ hll* findBestChain(hptr* array, int arrsize) {
+ sklst* skipper = makeSkLst();
+ sle* help;
+ int i;
+ hll* t;
+ for (i = 0; i < arrsize; i++) {
+ if (array[i].isstart) {
+ // Start event: link to the entry found for seq2start, if any, and
+ // accumulate its score.
+ help = SLfind(skipper, array[i].myhll->seq2start);
+ if (help->myelem) {
+ array[i].myhll->bkptr = help->myelem;
+ array[i].myhll->scoreSoFar = ((hll*)help->myelem)->scoreSoFar + array[i].myhll->score;
+ }
+ else {
+ array[i].myhll->bkptr = 0;
+ array[i].myhll->scoreSoFar = array[i].myhll->score;
+ }
+ }
+ else {
+ // End event: skip this anchor unless it scores higher than the entry
+ // found for seq2end.
+ help = SLfind(skipper, array[i].myhll->seq2end);
+ if (help->myelem && (array[i].myhll->scoreSoFar <= ((hll*)help->myelem)->scoreSoFar))
+ continue;
+ SLinsertAfter(skipper, help, array[i].myhll->seq2end, array[i].myhll);
+ help = help->next[0];
+ // Remove following entries whose score does not exceed the new one.
+ while (help->next[0] &&
+ ((hll*)help->myelem)->scoreSoFar >= ((hll*)help->next[0]->myelem)->scoreSoFar)
+ SLremove(skipper, help->next[0]);
+ }
+ }
+ // NOTE(review): assumes SLgetLast() never returns NULL, even for an empty
+ // list -- confirm.
+ t= (hll*)SLgetLast(skipper)->myelem;
+ delSkLst(skipper);
+ return t;
+ }
+
+
+ /*
+  * Convert a chain linked backwards through bkptr (starting at bestPtr) into
+  * a forward list linked through next.  Every node on the chain gets
+  * dirty = 1.  Returns the head of the forward list, i.e. the node that was
+  * last on the bkptr chain, or 0 for an empty chain.
+  */
+ hll* remakeHLL(hll* bestPtr) {
+     hll *forward = 0;
+     hll *node = bestPtr;
+
+     while (node) {
+         node->next = forward;
+         node->dirty = 1;
+         forward = node;
+         node = node->bkptr;
+     }
+     return forward;
+ }
+
+
+ /*
+  * Reduce an anchor list to the chain selected by findBestChain().
+  *
+  * Each anchor contributes a start event (seq1start) and an end event
+  * (seq1end); the events are sorted with hptrcomp and handed to
+  * findBestChain().  Anchors that are not on the resulting chain are freed.
+  * Returns the surviving anchors as a forward list built by remakeHLL().
+  */
+ hll* reanchorHLL(hll* mylist) {
+     hll *node, *chain, *doomed;
+     hptr *events;
+     int count = 0, slot = 0;
+
+     /* Count the anchors and mark all of them as candidates for deletion. */
+     for (node = mylist; node; node = node->next) {
+         count++;
+         node->dirty = 1;
+     }
+
+     events = (hptr*) malloc (sizeof(hptr) * count * 2);
+     for (node = mylist; node; node = node->next, slot += 2) {
+         events[slot].number = node->seq1start;
+         events[slot].isstart = 1;
+         events[slot].myhll = node;
+         events[slot+1].number = node->seq1end;
+         events[slot+1].isstart = 0;
+         events[slot+1].myhll = node;
+     }
+     qsort(events, count*2, sizeof(hptr), hptrcomp);
+     chain = findBestChain(events, count*2);
+
+     /* Anchors on the chain are kept; everything still dirty is released. */
+     for (node = chain; node; node = node->bkptr)
+         node->dirty = 0;
+     node = mylist;
+     while (node) {
+         doomed = node;
+         node = node->next;
+         if (doomed->dirty)
+             free(doomed);
+     }
+
+     chain = remakeHLL(chain);
+     free (events);
+     return chain;
+ }
+
+
+ /*
+  * Order two alignments by their `index` field.
+  *
+  * *first / *index receive the alignment with the smaller index and that
+  * index; *second / *hllindex receive the other alignment and its index.
+  * When both indices are equal, a1 is reported as first.
+  */
+ void orderAligns(align *a1, align *a2,
+                  align **first, align **second,
+                  int *index, int *hllindex) {
+     int idx1 = a1->index;
+     int idx2 = a2->index;
+     int swapped = (idx1 > idx2);
+
+     *first    = swapped ? a2 : a1;
+     *second   = swapped ? a1 : a2;
+     *index    = swapped ? idx2 : idx1;
+     *hllindex = swapped ? idx1 : idx2;
+ }
+
+
+ // Remap every anchor list that touches alignment *index or alignment
+ // hllindex through the alignment `uni`, via remapHLLs().
+ // The outer loop runs once with mapi == *index and once more with
+ // mapi == hllindex (a single pass if the two are equal).  The last argument
+ // of remapHLLs() is 0 on the *index pass and 1 on the hllindex pass.
+ // NOTE(review): the meaning of remapHLLs()'s second and last arguments is
+ // not visible in this file -- confirm against multial.
+ void doRemapHLLs(align *aligns[], align *uni, int *index, int hllindex) {
+ int i, mapi, done=0;
+
+ // take all hlls into first, and into the second and remap them
+
+ for(mapi=*index; !done; mapi=hllindex) {
+
+ // Lists stored under a smaller index i, pointing at mapi.
+ for (i=0; i<mapi; i++) {
+ if (aligns[i]->hlls[mapi] != NULL && i != *index) {
+ // remap them into i
+ // fprintf(stderr, "\n called1 %d %d(%d)\n", i, mapi, *index);
+ aligns[i]->hlls[mapi] = remapHLLs(aligns[i]->hlls[mapi],
+ 1, uni,
+ (mapi!=*index));
+ }
+ }
+ // Lists stored under mapi, pointing at a larger index i.
+ for (i=mapi+1; i<numseqs; i++) {
+ if (aligns[mapi]->hlls[i] != NULL && i != hllindex) {
+ // remap them into first or second
+ // fprintf(stderr, "\n called2 %d %d(%d)\n", mapi, i,*index);
+ aligns[mapi]->hlls[i] = remapHLLs(aligns[mapi]->hlls[i],
+ 0, uni,
+ (mapi!=*index));
+ }
+ }
+ if (mapi==hllindex) done=1;
+ }
+
+ // free memory? what's that?
+ // aligns[*index] = result;
+ // aligns[hllindex] = result;
+
+
+ }
+
+ // After alignments *index and hllindex have been combined, fold every
+ // anchor list that involved hllindex into the matching list for *index.
+ // For each other sequence i the two lists are merged with mergeHLLs(), the
+ // result is reduced with reanchorHLL() and stored under *index; the slot
+ // that involved hllindex is then cleared.  The three loops cover
+ // i < *index, *index < i < hllindex, and i > hllindex, because a list for a
+ // pair is stored under the smaller of the two indices.
+ // NOTE(review): the 0/1 arguments of mergeHLLs() presumably say whether a
+ // list must be flipped before merging -- confirm against multial.
+ void doReanchorHLLs(align *aligns[],
+ int *index, int hllindex) {
+ int i;
+
+ // for each pair of hlls from (i to first) and (i to second)
+
+ for(i=0; i<*index; i++) {
+ aligns[i]->hlls[*index] =
+ reanchorHLL(mergeHLLs(aligns[i]->hlls[*index], 0,
+ aligns[i]->hlls[hllindex], 0));
+
+ // if (verbose) {
+ // printf("aligns[%d]->hlls[%d]\n",i ,*index);
+ // printHLL(aligns[i]->hlls[*index]);
+ // }
+ aligns[i]->hlls[hllindex] = 0;
+ }
+ for(i=*index+1; i<hllindex; i++) {
+ aligns[*index]->hlls[i] =
+ reanchorHLL(mergeHLLs(aligns[*index]->hlls[i], 0,
+ aligns[i]->hlls[hllindex], 1));
+ // if (verbose) {
+ // printf("aligns[%d]->hlls[%d]\n",*index ,i);
+ // printHLL(aligns[*index]->hlls[i]);
+ // }
+ aligns[i]->hlls[hllindex] = 0;
+ }
+ for(i=hllindex+1; i<numseqs; i++) {
+ aligns[*index]->hlls[i] =
+ reanchorHLL(mergeHLLs(aligns[*index]->hlls[i], 0,
+ aligns[hllindex]->hlls[i], 0));
+ // if (verbose) {
+ // printf("aligns[%d]->hlls[%d]\n", *index, i);
+ // printHLL(aligns[*index]->hlls[i]);
+ // }
+ aligns[hllindex]->hlls[i] = 0;
+ }
+ }
+
+
+ /*
+  * Combine the anchor bookkeeping of two alignments without aligning them.
+  *
+  * Allocates a placeholder align (score -1, algnlen -1, numseq equal to the
+  * sum of both inputs), orders a1/a2 by index, stores the smaller index in
+  * *index and in the placeholder, and merges the anchor lists of the pair via
+  * doReanchorHLLs().  Returns the placeholder.
+  */
+ align* processAnchors(align *aligns[], align *a1, align *a2, int *index) {
+     align *lower, *upper;
+     int partner;
+     align *merged = (align*) malloc(sizeof(align));
+
+     assert (merged);
+     merged->score = -1;
+     merged->numseq = a1->numseq + a2->numseq;
+     merged->algnlen = -1;
+     merged->nextalign = 0;
+     merged->dirty = 0;
+
+     orderAligns(a1, a2, &lower, &upper, index, &partner);
+
+     if (verbose)
+         printHLL(aligns[lower->index]->hlls[partner]);
+
+     merged->index = *index;
+
+     doReanchorHLLs(aligns, index, partner);
+
+     fprintf(stderr,"done reanchor, leaving processAnchors\n");
+     return merged;
+ }
+
+ /*
+  * Align two alignments using the anchors stored for the pair.
+  *
+  * a1/a2 are ordered by index; the anchor list is taken from
+  * aligns[<smaller index>]->hlls[<larger index>] and handed to makeAlign().
+  * The smaller index is written to *index and into the result, which is
+  * returned.  Progress messages go to stderr.
+  */
+ align* processAlign(align *aligns[], align *a1, align *a2, int *index) {
+     align *lower, *upper, *merged, *uni;
+     int partner;
+
+     fprintf(stderr, "into processalign\n");
+
+     orderAligns(a1, a2, &lower, &upper, index, &partner);
+
+     if (verbose)
+         printHLL(aligns[lower->index]->hlls[partner]);
+
+     fprintf(stderr, "about to make\n");
+     merged = makeAlign(lower, upper, aligns[lower->index]->hlls[partner], &uni);
+     fprintf(stderr, "done make\n");
+
+     merged->index = *index;
+     return merged;
+ }
+
+
+ /*
+  * Iteratively refine a multiple alignment.
+  *
+  * Alignments with two or fewer sequences are returned unchanged.  Otherwise
+  * the loop runs until the round counter equals numseqs * itertimes; each
+  * round takes sequence 0 out of the alignment, derives anchors for it from
+  * the current alignment (cutoff = cutoffmatch * 100) and re-aligns its
+  * single-sequence alignment against the remainder with makeAlign().
+  *
+  * rpntree and length are not used.  Returns the refined alignment.
+  */
+ align* iterativeImprovement (align *current, align *rpntree[], int length) {
+     int rounds = 0, cutoff;
+     seq *removed;
+     align *single, *stale;
+     hll *anchs, *dead;
+
+     if (current->numseq <= 2)
+         return current;
+
+     cutoff = cutoffmatch * 100;
+     fprintf(stderr, "cutoff = %d\n", cutoff);
+
+     do {
+         /* Throw out a sequence. Calling code in multial. */
+         removed = current->seqs[0];
+         single = findAlignByName(simaligns, removed->name);
+         stale = current;
+         anchs = getAnchsFromAlign(current, 0, cutoff);
+         current = removeSeq(current, 0);
+         free (stale);
+
+         /* Re-align the thrown-out sequence to the remaining alignment. */
+         current = makeAlign (current, single, anchs, &stale);
+         if (verbose) {
+             printf("improved:\n");
+             printHLL(anchs);
+             printTextAlign(stdout, current);
+         }
+
+         while (anchs) {
+             dead = anchs;
+             anchs = anchs->next;
+             free (dead);
+         }
+         free (stale);
+
+         rounds++;
+     } while (rounds != numseqs*itertimes);
+
+     return current;
+ }
+
+
+
+ /*
+  * Parse one parenthesised group of the tree string into reverse Polish
+  * notation.
+  *
+  * Starting at the first '(' in `treestr`, every sequence name found is looked
+  * up with findAlignByName() and pushed on `stack`; nested groups are parsed
+  * recursively.  At the closing ')' one slot is skipped: a NULL stack entry
+  * stands for the "combine" operator.  *depth is the number of slots used.
+  *
+  * Returns the number of characters consumed, including the closing ')'.
+  * Exits with status 1 on a malformed tree: no '(' present, string ends
+  * before ')', a name longer than 255 characters, or more than MAX_SEQ*2
+  * stack slots.  The first, third and fourth cases previously read or wrote
+  * out of bounds.
+  */
+ int treeToRPN(char *treestr, align *stack[MAX_SEQ*2], int *depth) {
+     int i = 0;
+     int k;
+     char buffer[256];
+
+     while (treestr[i] != '(') {
+         if (treestr[i] == '\0') {
+             fprintf(stderr, "ERROR parsing tree, no '(' found\n");
+             exit(1);
+         }
+         i++;
+     }
+     i++;
+
+     while ((treestr[i] != ')') && (treestr[i] != '\0')) {
+         if (treestr[i] == '(') {
+             i += treeToRPN(treestr+i, stack, depth);
+         }
+         else if (isalnum((unsigned char) treestr[i])) {
+             /* Collect the name and push its alignment. */
+             k = 0;
+             while (treestr[i] != '\0' && !isspace((unsigned char) treestr[i]) &&
+                    (treestr[i] != '(') && (treestr[i] != ')')) {
+                 if (k >= (int) sizeof(buffer) - 1) {
+                     fprintf(stderr, "ERROR parsing tree, sequence name too long\n");
+                     exit(1);
+                 }
+                 buffer[k++] = treestr[i++];
+             }
+             buffer[k] = 0;
+             if (*depth >= MAX_SEQ*2) {
+                 fprintf(stderr, "ERROR parsing tree, too many nodes\n");
+                 exit(1);
+             }
+             stack[(*depth)++] = findAlignByName(simaligns, buffer);
+         }
+         else {
+             i++;
+         }
+     }
+
+     if (treestr[i] == '\0') {
+         fprintf(stderr, "ERROR parsing tree, depth %d, %d chars read", *depth, i);
+         exit(1);
+     }
+
+     /* treestr[i] == ')' here. */
+     if (*depth >= MAX_SEQ*2) {
+         fprintf(stderr, "ERROR parsing tree, too many nodes\n");
+         exit(1);
+     }
+     (*depth)++; //null is '+'
+     return i+1;
+ }
+
+ // Evaluate the reverse-Polish tree produced by treeToRPN().
+ // Non-NULL entries are pushed on a local stack; a NULL entry is an operator
+ // that replaces the two top entries by processAnchors() of them.  The last
+ // entry is handled separately: it must be an operator, and it aligns
+ // profile1 with profile2 via processAlign() rather than the stack operands.
+ // Returns the entry left on top of the stack.
+ // NOTE(review): nothing guards against sp < 2 at an operator, so a
+ // malformed tree underflows the stack -- confirm inputs are validated.
+ align* procStack(align* rpntree[MAX_SEQ*2], int length, align *myaligns[]) {
+ align* stack[MAX_SEQ];
+ int i = 0, sp = 0;
+ int index=0;
+
+ while (i < (length-1)) {
+
+ if (rpntree[i]) {
+ stack[sp++] = rpntree[i];
+ }
+ else {
+ stack[sp-2] = processAnchors(myaligns, stack[sp-2], stack[sp-1], &index);
+ stack[--sp] = 0;
+ // if(verbose) printTextAlign(stdout, stack[sp-1]);
+ }
+ i++;
+ }
+ // The final RPN entry is expected to be the root operator.
+ if (rpntree[i]) {
+ fprintf(stderr,"Unexpeceted error\n");
+ }
+ else {
+ stack[sp-2] = processAlign(myaligns, profile1, profile2, &index);
+ stack[--sp] = 0;
+ if(verbose) printTextAlign(stdout, stack[sp-1]);
+ }
+
+ return stack[sp-1];
+ }
+
+
+ /*
+  * Placeholder for tree-less operation: it is not implemented, so the user is
+  * asked for a tree and the program exits with status 1.
+  */
+ void graphCollapsal (align *simaligns[]) {
+     /* for now... */
+     fputs("Please specify a phylogenetic tree, using [-tree]\n", stderr);
+     exit(1);
+ }
+
+ /*
+  * Parse the command line.
+  *
+  * Leading arguments that do not start with '-' are sequence files: each is
+  * opened with FileOpen() into files[] and counted in the global numseqs.
+  * The remaining arguments are options; they set the matching globals, open
+  * the output file (-out), hand back the tree string (-tree) and load the two
+  * profiles (-pro1, -pro2).
+  *
+  * Returns 0 on success, 1 on a usage error and 2 when a sequence or profile
+  * file cannot be opened or too many sequence files are given.  A lone
+  * "-version" prints the version and exits with status 0.
+  *
+  * Changes over the previous version:
+  *   - the unrecognised-option test could never fire (it required the
+  *     argument to equal every known option at once); unknown options are
+  *     now rejected.  Arguments that look like numbers are still ignored, so
+  *     values following options that have no handler here (-match,
+  *     -mismatch, -gapstart, -gapend, -gapcont) stay accepted as before;
+  *   - at most MAX_SEQ sequence files are accepted;
+  *   - a profile that cannot be opened is reported instead of being read.
+  */
+ int parseParameters(int argc, char** argv, FileBuffer *files, char **treestr) {
+     static const char *known[] = {
+         "-nested", "-nopost", "-postir", "-fastreject", "-gfc", "-lazy",
+         "-verbose", "-out", "-translate", "-ext", "-match", "-mismatch",
+         "-pro1", "-pro2", "-gapstart", "-gapend", "-gapcont", "-gapperseq",
+         "-overlap", "-glwidth", "-tree", "-version", "-Version"
+     };
+     int i = 1;
+     int recognized;
+     size_t n;
+     FileBuffer fb;
+
+     if (argc < 3) {
+         if (argc == 2)
+             if (!strcmp(argv[1], "-version") || !strcmp(argv[1], "-Version")) {
+                 fprintf(stderr, "PROLAGAN version %s\n", VER_NUM);
+                 exit(0);
+             }
+         usage();
+         return 1;
+     }
+
+     /* Read in sequence files */
+     while ((i < argc) && (argv[i][0] != '-')) {
+         if (numseqs >= MAX_SEQ) {
+             fprintf(stderr, "too many sequence files (maximum %d)\n", MAX_SEQ);
+             usage();
+             return 2;
+         }
+         if (!(files[numseqs++] = FileOpen(argv[i]))) {
+             fprintf(stderr, "couldnt open dbase file %s\n",argv[i]);
+             usage();
+             return 2;
+         }
+         i++;
+     }
+
+     while (i < argc) {
+
+         /* Only dash-words are checked; "-5" or "-.5" is a value, not an option. */
+         if ((argv[i][0] == '-') && !isdigit((unsigned char) argv[i][1]) &&
+             (argv[i][1] != '.')) {
+             recognized = 0;
+             for (n = 0; n < sizeof(known) / sizeof(known[0]); n++) {
+                 if (!strcmp(argv[i], known[n])) {
+                     recognized = 1;
+                     break;
+                 }
+             }
+             if (!recognized) {
+                 fprintf(stderr, "unrecognized parameter: %s\n", argv[i]);
+                 usage();
+                 return 1;
+             }
+         }
+
+         if (!strcmp(argv[i], "-nested")) {
+             nested = 1;
+         }
+
+         if (!strcmp(argv[i], "-translate")) {
+             translate = 1;
+         }
+
+         if (!strcmp(argv[i], "-ext")) { //default, do not use
+             extend = 1;
+         }
+
+         if (!strcmp(argv[i], "-verbose")) {
+             verbose = 1;
+         }
+
+         if (!strcmp(argv[i], "-postir")) {
+             postir = 1;
+         }
+         if (!strcmp(argv[i], "-lazy")) {
+             lazy = 1;
+         }
+         if (!strcmp(argv[i], "-fastreject")) {
+             fastreject = 1;
+         }
+         if (!strcmp(argv[i], "-gfc")) { //Broken, do not use
+             gapfreechunks = 1;
+         }
+
+         if (!strcmp(argv[i], "-out")) {
+             i++;
+             if ((i>=argc) || (argv[i][0]=='-')) {
+                 fprintf(stderr, "missing parameter specification for [-out].\n");
+                 return 1;
+             }
+             fprintf(stderr, "outputting to: %s\n", argv[i]);
+             outfile = fopen(argv[i], "w");
+             if (outfile==NULL) {
+                 fprintf(stderr, "error with output file...\n");
+                 exit(2);
+             }
+         }
+
+         if (!strcmp(argv[i], "-tree")) {
+             i++;
+             if ((i>=argc) || (argv[i][0]=='-')) {
+                 fprintf(stderr, "missing parameter specification for [-tree].\n");
+                 return 1;
+             }
+             notree = 0;
+             *treestr = argv[i];
+             fprintf(stderr, "using given phylogenetic tree:\n%s\n", *treestr);
+         }
+
+         if (!strcmp(argv[i], "-gapperseq")) {
+             i++;
+             if (i>=argc) {
+                 fprintf(stderr, "missing parameter specification for [-gapperseq].\n");
+                 return 1;
+             }
+             gapperseq = atoi(argv[i]);
+             fprintf(stderr, "using gapperseq score: %d\n", gapperseq);
+         }
+         if (!strcmp(argv[i], "-overlap")) {
+             i++;
+             if (i>=argc) {
+                 fprintf(stderr, "missing parameter specification for [-overlap].\n");
+                 return 1;
+             }
+             overlap = atoi(argv[i]);
+             fprintf(stderr, "using overlap value: %d\n", overlap);
+         }
+         if (!strcmp(argv[i], "-glwidth")) {
+             i++;
+             if (i>=argc) {
+                 fprintf(stderr, "missing parameter specification for [-glwidth].\n");
+                 return 1;
+             }
+             glwidth = atoi(argv[i]);
+             fprintf(stderr, "using glwidth value: %d\n", glwidth);
+         }
+
+         if (!strcmp(argv[i], "-pro1")) {
+             i++;
+             if (i>=argc) {
+                 fprintf(stderr, "missing filename for [-pro1].\n");
+                 return 1;
+             }
+             fb = FileOpen (argv[i]);
+             if (!fb) {
+                 fprintf(stderr, "couldnt open profile file %s\n", argv[i]);
+                 return 2;
+             }
+             profile1 = readProfile(fb);
+             fprintf(stderr, "Profile1 is: %s\n", argv[i]);
+         }
+
+         if (!strcmp(argv[i], "-pro2")) {
+             i++;
+             if (i>=argc) {
+                 fprintf(stderr, "missing filename for [-pro2].\n");
+                 return 1;
+             }
+             fb = FileOpen (argv[i]);
+             if (!fb) {
+                 fprintf(stderr, "couldnt open profile file %s\n", argv[i]);
+                 return 2;
+             }
+             profile2 = readProfile(fb);
+             fprintf(stderr, "Profile2 is: %s\n", argv[i]);
+         }
+
+         i++;
+     }
+
+     // setScores(gapstart, gapcont, gapend, gapperseq, overlap, glwidth);
+
+     return 0;
+ }
+
+ // Shift all anchors of `myhll` into the coordinate frame of the clipped
+ // sequences (subtracting startpos-1 of f1 / f2) and drop the anchors that
+ // fall outside the clipped ranges.  Returns the head of the trimmed list,
+ // which may be 0.
+ hll* updateAnchorPos(hll* myhll, FileBuffer f1, FileBuffer f2) {
+ hll *res, *temp, *prev=0;
+ res = myhll;
+ fprintf (stderr, "Updating anchs...\n");
+ for ( ; myhll; myhll = myhll->next) {
+ myhll->seq1start -= (f1->startpos-1);
+ myhll->seq1end -= (f1->startpos-1);
+ myhll->seq2start -= (f2->startpos-1);
+ myhll->seq2end -= (f2->startpos-1);
+ }
+ // Free leading anchors that now start before the clipped region.
+ while (res && (res->seq1start < 0 || res->seq2start < 0)) {
+ // fprintf (stderr, "first..\n");
+ temp = res;
+ // fprintf(stderr, "Tossed %d %d(%d %d)\n", temp->seq1end, temp->seq2end,
+ // f1->endpos, f2->endpos);
+ res = res->next;
+ free(temp);
+ }
+ // Walk over the anchors that end inside both clipped ranges; prev ends up
+ // on the last anchor to keep.
+ temp = res;
+ while (temp && temp->seq1end < (f1->endpos-f1->startpos) && temp->seq2end < (f2->endpos-f2->startpos)) {
+ // fprintf (stderr, "second...\n");
+ // fprintf(stderr, "Kept %d %d(%d %d)\n", temp->seq1end, temp->seq2end,
+ // f1->endpos-f1->startpos, f2->endpos-f2->startpos);
+ prev = temp;
+ temp = temp->next;
+ }
+ if (prev) {
+ // Cut the list after the last kept anchor; prev now heads the tail to free.
+ temp = prev;
+ prev = prev->next;
+ temp->next = 0;
+ }
+ else if (temp == res) {
+ // Nothing was kept.
+ // NOTE(review): when res is non-NULL here its nodes are dropped without
+ // being freed, and the final else below can never be reached because
+ // temp still equals res whenever prev is 0 -- confirm intent.
+ res = 0;
+ }
+ else {
+ // fprintf (stderr, "returning %d\n", res);
+ return res;
+ }
+ // Free the tail that was cut off.
+ while ( prev ) {
+ // fprintf (stderr, "third...\n");
+ // fprintf(stderr, "Tossed %d %d(%d %d)\n", temp->seq1end, temp->seq2end,
+ // f1->endpos, f2->endpos);
+ temp = prev;
+ prev = prev->next;
+ free(temp);
+ }
+ return res;
+ }
+
+ /*
+  * Decide whether the anchor graph over `numseqs` sequences is connected.
+  *
+  * Sequences i < j are adjacent when graph[i][j] is non-NULL.  The
+  * reachability matrix is closed transitively (Floyd-Warshall style) and the
+  * graph is connected when every sequence is reachable from sequence 0.
+  *
+  * Returns 1 when connected (also for numseqs == 0), otherwise 0.
+  *
+  * The diagonal is now initialised to 1: the previous version left M[i][i]
+  * unset and then read it, both in the closure and in the final M[0][0]
+  * check.
+  */
+ int connectedGraph(hll* graph[MAX_SEQ][MAX_SEQ], int numseqs) {
+     int M[MAX_SEQ][MAX_SEQ];
+     int i, j, k;
+
+     for (i = 0; i < numseqs; i++) {
+         M[i][i] = 1;   /* every sequence reaches itself */
+         for (j = i + 1; j < numseqs; j++) {
+             M[i][j] = M[j][i] = (graph[i][j] != NULL);
+         }
+     }
+
+     for (k = 0; k < numseqs; k++)
+         for (i = 0; i < numseqs; i++)
+             for (j = 0; j < numseqs; j++)
+                 if (M[i][k] && M[k][j]) M[i][j] = 1;
+
+     k = 1;
+     for (i = 0; k && i < numseqs; i++)
+         k = M[0][i];
+
+     return k;
+ }
+
+
+ /*
+  * prolagan entry point: align two profiles (pre-aligned groups of
+  * sequences) to each other.
+  *
+  *  1. Read LAGAN_DIR, parse the command line and load the two profiles.
+  *  2. Read every input sequence and record which profile contains it; a
+  *     sequence must belong to exactly one profile.
+  *  3. For every pair made of one sequence from each profile, generate
+  *     anchors with rechaos.pl and remap them into profile coordinates.
+  *  4. Convert the tree to RPN, merge anchors along it and align the two
+  *     profiles (procStack).
+  *  5. Print the result (XMFA with -fastreject, FASTA otherwise).
+  *
+  * Returns 0 on success, 1 on a command-line error; several failures exit
+  * directly with a non-zero status.
+  *
+  * Changes over the previous version: the second remap in the "profile-2
+  * sequence comes first" branch indexed pro1lst with j (the profile-2 loop
+  * counter) instead of i; missing -pro1/-pro2 profiles and unreadable
+  * sequences are reported instead of dereferencing NULL.
+  */
+ int main(int argc, char** argv) {
+     seq **seqs;
+     int i = 1, j = 1, x, y;
+     int p1, p2;
+     int pro1cnt=0, pro2cnt=0;
+     int pro1lst[MAX_SEQ], pro2lst[MAX_SEQ];
+     int pro1ptr[MAX_SEQ], pro2ptr[MAX_SEQ];
+
+     char *treestr = NULL;
+     align *stack[MAX_SEQ*2];
+     align *final;
+     align *myaligns[MAX_SEQ];
+     FileBuffer files[MAX_SEQ];
+
+     outfile = stdout;
+     lagan_dir = getenv ("LAGAN_DIR");
+     if (!lagan_dir) {
+         fprintf(stderr, "Environment variable LAGAN_DIR not set\n");
+         exit(1);
+     }
+
+     buildcache();
+     initLib();
+
+     seqs = (seq**) malloc((argc-1)*sizeof(seq*));
+     assert (seqs);
+
+     if (parseParameters(argc, argv, files, &treestr)) return 1;
+
+     if (!profile1 || !profile2) {
+         fprintf(stderr, "Both profiles must be given, using [-pro1] and [-pro2]\n");
+         return 1;
+     }
+
+     gapstart += gapcont;
+
+     // Take all sequences and make simple alignments
+
+     for (i=0; i<numseqs; i++) {
+         seqs[i] = FileRead(files[i], 0, 0, VER_MLAGAN);
+         if (!seqs[i]) {
+             fprintf(stderr, "No sequence could be read from input file %d\n", i+1);
+             exit(1);
+         }
+         seqs[i]->index = i+1;
+         myaligns[i]=simaligns[i]=mkSimAlign(seqs[i]);
+         simaligns[i]->index = i;
+         x = getSeqNumber(profile1, seqs[i]);
+         y = getSeqNumber(profile2, seqs[i]);
+         if (x < 0 && y < 0) {
+             fprintf(stderr, "Sequence %s not found in either profile!!!\n", seqs[i]->name);
+             exit(1);
+         }
+         if (x >= 0 && y >= 0) {
+             fprintf(stderr, "Sequence %s found in both profiles!!!\n", seqs[i]->name);
+             exit(1);
+         }
+         if (x >= 0) {
+             fprintf(stderr, "Sequence %s[%d/%d] in 1st profile\n", seqs[i]->name, i, numseqs);
+             // A profile's index is the smallest input index among its members.
+             if (profile1->index > i) {
+                 profile1->index = i;
+             }
+             pro1lst[pro1cnt++] = i;
+             pro1ptr[i] = x;
+             pro2ptr[i] = -1;
+         }
+         if (y >= 0) {
+             fprintf(stderr, "Sequence %s[%d/%d] in 2nd profile\n", seqs[i]->name, i, numseqs);
+             if (profile2->index > i) {
+                 profile2->index = i;
+             }
+             pro2lst[pro2cnt++] = i;
+             pro1ptr[i] = -1;
+             pro2ptr[i] = y;
+         }
+     }
+
+     // Find all pairwise anchors.
+     fprintf(stderr,"pro1cnt = %d, pro2cnt = %d\n", pro1cnt, pro2cnt);
+     for (i=0; i<(numseqs-1); i++) {
+         for (j=i+1; j<numseqs; j++) {
+             simaligns[i]->hlls[j]=0;
+         }
+     }
+     for (i=0; i< pro1cnt; i++) {
+         for (j=0; j< pro2cnt; j++) {
+             p1 = pro1lst[i];   // input index of the profile-1 sequence
+             p2 = pro2lst[j];   // input index of the profile-2 sequence
+             // Anchors for a pair are stored under the smaller input index.
+             if (p1 < p2) {
+                 simaligns[p1]->hlls[p2] = generateAnchors(files[p1], files[p2]);
+                 simaligns[p1]->hlls[p2] = remapHLLs(simaligns[p1]->hlls[p2],
+                                                     0, profile1, pro1ptr[p1]);
+                 simaligns[p1]->hlls[p2] = remapHLLs(simaligns[p1]->hlls[p2],
+                                                     1, profile2, pro2ptr[p2]);
+             }
+             else {
+                 simaligns[p2]->hlls[p1] = generateAnchors(files[p2], files[p1]);
+                 simaligns[p2]->hlls[p1] = remapHLLs(simaligns[p2]->hlls[p1],
+                                                     0, profile2, pro2ptr[p2]);
+                 simaligns[p2]->hlls[p1] = remapHLLs(simaligns[p2]->hlls[p1],
+                                                     1, profile1, pro1ptr[p1]);
+             }
+         }
+     }
+
+     for (i=0; i<MAX_SEQ*2; i++) {
+         stack[i] = NULL;
+     }
+
+     // Processall closest pairs
+
+     if (notree) { // Not yet implemented
+         graphCollapsal(myaligns);
+     }
+     else {
+
+         fprintf(stderr, "\n****************************\n");
+         fprintf(stderr, "gs: %d; ge: %d;\n", gapstart, gapend);
+         fprintf(stderr, "gc: %d; gp: %d\n", gapcont, gapperseq);
+         //fprintf(stderr, "match: %d; mismatch: %d\n", match, mismatch);
+         fprintf(stderr, "overlap: %d; glwidth: %d\n", overlap, glwidth);
+         fprintf(stderr, "\n****************************\n");
+
+         i = 0;
+         treeToRPN(treestr, stack, &i);
+         final = procStack(stack, i, myaligns);
+     }
+
+     // Ouput end result.
+     fprintf(stderr, "final alignment... \n");
+     if (fastreject) {
+         printXMFAAlign(outfile, final);
+     }
+     else {
+         printFASTAAlign(outfile, final);
+     }
+     if (outfile != stdout) fclose (outfile);
+
+     fprintf(stderr, "mlagan -- end.\n");
+     return 0;
+ }
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/rechaos.pl b/src/rechaos.pl
new file mode 100755
index 0000000..9a6a062
--- /dev/null
+++ b/src/rechaos.pl
@@ -0,0 +1,375 @@
+#!/usr/bin/env perl
+
+ # Directory holding the LAGAN binaries (chaos, anchors, utils/getlength).
+ $lagandir = $ENV{LAGAN_DIR};
+
+ # Status
+ # -- extension problems
+
+ if (@ARGV < 2) {
+     print ("usage:\n rechaos seqfile1 seqfile2 [-chaos \"chaos flags\"] [-recurse \"(wl1,nd1,co1),(wl2,nd2,co2),...\"] [-out \"filename\"] [-lazy] [-maskedonly] [-debug] [-translate] [-fastreject]\n");
+     exit(1);
+ }
+
+ # Recursion schedule: one "(wordlen,degeneracy,cutoff,extcutoff)" tuple per
+ # CHAOS pass, optionally followed by the letters "x" and/or "t".
+ #$recurfl = "(12,0,25,0)x,(13,1,30,0)x,(8,1,30,0)x,(7,1,30,0)x";
+ $recurfl = "(12,0,25,0)x,(13,1,30,0)x,(4,0,4,3000)xt,(8,1,30,0)x,(7,1,30,0)x";
+ #$recurfl = "(12,0,10,200)x,(12,0,10,150)x,(3,0,10,150)xt,(8,0,10,150)x,(12,0,25,0),(13,1,30,0),(3,0,30,0)t,(8,1,30,0),(7,1,25,0)";
+ $minbox = 10;
+ $minside = 5;
+ $seq1 = $ARGV[0];
+ $seq2 = $ARGV[1];
+ $tofile = 0;
+ $masker = 1;
+ $lazycheck = 0;
+ $fastreject = 0;
+ $frminlevel = 0;
+ $frmaxlevel = 3;
+ # The two array assignments below had lost their '@' sigil (it had been
+ # turned into the word "at"), which is a syntax error; restored here.
+ @frseq1 = (150000, 50000, 30000, 15000);
+ @frseq2 = (150000, 50000, 30000, 15000);
+ #@frseq1 = (70000, 60000, 60000, 20000);
+ #@frseq2 = (70000, 60000, 60000, 20000);
+ # Scores given to the artificial left / right boundary anchors.
+ $sentinelleft = 1.1;
+ $sentinelright = 1.2;
+ $gfc = " ";
+ $dounmasked = 1;
+ $filename = "";
+ $debug = 0;
+ $anchparams = "";
+ $translate = 0;
+
+ # Return the larger of two numbers (the second one when they are equal).
+ sub max {
+     my ($x, $y) = @_;
+     return ($x > $y) ? $x : $y;
+ }
+
+ # Return the smaller of two numbers (the second one when they are equal).
+ sub min {
+     my ($x, $y) = @_;
+     return ($x < $y) ? $x : $y;
+ }
+
+ # Parse the options that follow the two sequence file names.
+ # Options are recognised by unanchored pattern match, tested in the order
+ # below; an argument matching none of them aborts the script.
+ $i = 2;
+ while ($i < @ARGV) {
+     # Was /-\chaos/: "\ch" is the control character Ctrl-H inside a pattern,
+     # so the -chaos option could never match.
+     if ($ARGV[$i] =~ /-chaos/) {
+         # extra flags handed to chaos verbatim
+         $chaosfl = $chaosfl." ".$ARGV[++$i];
+     }
+     elsif ($ARGV[$i] =~ /-ext/) {
+         $chaosfl = $chaosfl." -ext ";
+     }
+     elsif ($ARGV[$i] =~ /-recurse/) {
+         # replaces the default recursion schedule
+         $recurfl = $ARGV[++$i];
+     }
+     elsif ($ARGV[$i] =~ /-lazy/) {
+         $lazycheck = 1;
+     }
+     elsif ($ARGV[$i] =~ /-nomask/) {
+         $masker = 0;
+     }
+     elsif ($ARGV[$i] =~ /-out/) {
+         $tofile = 1;
+         $filename = $ARGV[++$i];
+     }
+     elsif ($ARGV[$i] =~ /-maskedonly/) {
+         $dounmasked = 0;
+     }
+     elsif ($ARGV[$i] =~ /-fastreject/) {
+         $fastreject = 1;
+     }
+     elsif ($ARGV[$i] =~ /-debug/) {
+         $debug = 1;
+     }
+     elsif ($ARGV[$i] =~ /-translate/) {
+         $translate = 1;
+     }
+     elsif ($ARGV[$i] =~ /-gfc/) {
+         $gfc = " -gfc ";
+     }
+     elsif ($ARGV[$i] =~ /-gap/){
+         # -gap takes two values; both are passed on to the anchors program
+         $anchparams = $anchparams." -gap ".$ARGV[++$i];
+         $anchparams = $anchparams." ".$ARGV[++$i];
+     }
+     else {
+         die ("Unrecognized option $ARGV[$i]\n");
+     }
+     $i++;
+ }
+
+ # Lazy mode: if the output file is already there, do nothing.
+ if ($lazycheck) {
+ if (-f $filename) {
+ print STDERR "Output file already exists, lazy mode exit!\n";
+ exit (0);
+ }
+ }
+
+ # If a "<file>.masked" companion exists, start with the masked sequence and
+ # keep the unmasked name in $extra1/$extra2 for a later pass; -maskedonly
+ # (which clears $dounmasked) disables that later pass.
+ $extracase1 = 0;
+ $extracase2 = 0;
+ if (-e "$seq1.masked") { $extra1 = $seq1; $seq1 = "$seq1.masked"; $extracase1 = 1; }
+ if (-e "$seq2.masked") { $extra2 = $seq2; $seq2 = "$seq2.masked"; $extracase2 = 1; }
+ if (! $dounmasked){ $extracase1 = 0; $extracase2 = 0; }
+
+ #open(SEQ1, "$seq1");
+ #open(SEQ2, "$seq2");
+
+ #$line1 = <SEQ1>;
+ #while ($line1 = <SEQ1>) {
+ # chomp $line1;
+ # $seq1len += length($line1);
+ #}
+ #
+ #$line2 = <SEQ2>;
+ #while ($line2 = <SEQ2>) {
+ # chomp $line2;
+ # $seq2len += length($line2);
+ #}
+
+ # Sequence lengths come from the external getlength utility.
+ # NOTE(review): the file names are interpolated into a shell command
+ # unquoted -- names with spaces or shell metacharacters will misbehave.
+ $seq1len = `$lagandir/utils/getlength $seq1`; chomp $seq1len;
+ $seq2len = `$lagandir/utils/getlength $seq2`; chomp $seq2len;
+
+ # The first region to search is the whole of both sequences (1-based).
+ $b1[0] = $b2[0] = 1;
+ $e1[0] = $seq1len;
+ $e2[0] = $seq2len;
+
+ $cumanchs = 0;
+
+ # Coordinates used for the artificial boundary anchors: just before the
+ # first base and just after the last one.
+ $clipleft1 = 0;
+ $clipleft2 = 0;
+ $clipright1 = $seq1len + 1;
+ $clipright2 = $seq2len + 1;
+ $app_str = "";
+
+$i = 0;
+while (1) {
+ $goodanchs = 0;
+ $totalanchs = 0;
+
+ $stillmore = ($recurfl =~ /\((\d+)\,(\d+)\,(\d+)\,(\d+)\)(\w*)(.*)/);
+ if (! $stillmore) {
+ if ($extracase1 || $extracase2) {
+ if ($extracase1) { $seq1 = $extra1; $extracase1 = 0; }
+ if ($extracase2) { $seq2 = $extra2; $extracase2 = 0; }
+ }
+ else {
+ last;
+ }
+ }
+ else {
+ $wordlen = $1;
+ $degeneracy = $2;
+ $cutoff = $3;
+ $extcutoff = $4;
+ $tail = $5;
+
+ $extraparams = "";
+ $extraparams = "-t ".$extraparams if ((index ($tail, "t") != -1) && ($translate));
+ $extraparams = $extraparams." -rsc $extcutoff" if (index ($tail, "x") != -1);
+ }
+
+ $recurfl = $6;
+ next if ((index ($tail, "t") != -1) && (!$translate));
+
+ print STDERR "Using $seq1 $seq2 ($wordlen, $degeneracy, $cutoff, $extcutoff) $tail\n";
+
+# PRINT OUT LIST OF REGIONS TO ALIGN
+
+ open (PFILE, ">$$.anchs.pairs");
+ for ($j = 0; $j < @b1; $j++) {
+ print PFILE "-s1 $b1[$j] $e1[$j] -s2 $b2[$j] $e2[$j]\n";
+ }
+ close (PFILE);
+
+# print STDERR "PAIRS hits\n";
+# print STDERR `cat $$.anchs.pairs`;
+# print STDERR "-----------------\n";
+# print STDERR `cat $$.anchs.pairs`;
+# print STDERR "-----------------\n";
+# print STDERR "$lagandir/chaos $seq1 $seq2 -wl $wordlen -nd $degeneracy -co $cutoff $extraparams $gfc $chaosfl -pairs $$.anchs.pairs > $$.anchtemp";
+
+# PERFORM THE ALIGNMENTS USING CHAOS
+
+ $saver = "$lagandir/chaos $seq1 $seq2 $extraparams -wl $wordlen -nd $degeneracy -co $cutoff $gfc $chaosfl -pairs $$.anchs.pairs > $$.anchtemp";
+ `$lagandir/chaos $seq1 $seq2 $extraparams -wl $wordlen -nd $degeneracy -co $cutoff $gfc $chaosfl -pairs $$.anchs.pairs > $$.anchtemp`;
+ if ($?) {
+ print STDERR "$saver\n";
+ exit(1);
+ }
+
+# ADD IN BOUNDARIES
+
+ $stillmore = ($recurfl =~ /\((\d+)\,(\d+)\,(\d+)\,(\d+)\)(\w*)(.*)/);
+ if ($fastreject || $stillmore || $extracase1 || $extracase2){
+ $temp1 = $seq1len + 1;
+ $temp2 = $seq2len + 1;
+ $app_str = $app_str."seq1 0 $clipleft1; seq2 0 $clipleft2; score=$sentinelleft (+)\n";
+ $app_str = $app_str."seq1 $clipright1 $temp1; seq2 $clipright2 $temp2; score=$sentinelright (+)\n";
+ }
+
+# APPEND HITS FROM $app_str TO LOCAL ALIGNMENT LIST
+
+ open (OFILE, ">>$$.anchtemp");
+ print OFILE $app_str;
+ close (OFILE);
+
+# `wc $$.anchtemp` =~ /(\d+)/x;
+# $totalanchs = $totalanchs + $1;
+# print STDERR "CHAOS hits\n";
+# print STDERR `cat $$.anchtemp`;
+
+# FIND MAXIMAL-SCORING CONSISTENT CHAIN
+
+ `$lagandir/anchors $$.anchtemp $gfc $anchparams | sort -n +1 > $$.anchs.sorted`;
+ if ($?) { exit(1); }
+
+# IF WE'RE DONE, THEN QUIT!
+
+ $stillmore = ($recurfl =~ /\((\d+)\,(\d+)\,(\d+)\,(\d+)\)(\w*)(.*)/);
+ if (!$stillmore && !$extracase1 && !$extracase2) {
+ last;
+ }
+
+# `wc $$.anchs` =~ /(\d+)/x;
+# print STDERR "ANCHS hits\n";
+# print STDERR `cat $$.anchs.sorted`;
+# $goodanchs = $goodanchs + $1;
+
+# if ($?) { exit(1); }
+
+# READ SORTED ANCHORS TO @anchors
+
+ open(SFILE, "$$.anchs.sorted");
+ @anchors = <SFILE>;
+ close(SFILE);
+
+ @b1new = 0;
+ @b2new = 0;
+ @e1new = 0;
+ @e2new = 0;
+ @scores = 0;
+
+ $app_str = "";
+
+ # FOR EACH UNALIGNED REGION
+
+ $area = 0;
+ $maxarea = 0;
+ $k = 0;
+
+ for ($m = 0; $m < @anchors; $m++){
+
+ # SAVE OLD ANCHORS (SKIP FIRST AND LAST FAKE ANCHORS)
+
+ if ($m >= 1 && $m < @anchors - 1){
+ $anchors[$m] =~ /\((\d+) (\d+)\)=\((\d+) (\d+)\) (.*)/;
+ $score = $5; chomp $score;
+ $app_str = $app_str."seq1 $1 $2; seq2 $3 $4; score=$score (+)\n";
+ }
+
+ if ($m == 0){ next; }
+
+ # DETERMINE REGION BOUNDARIES
+
+ $anchors[$m-1] =~ /\((\d+) (\d+)\)=\((\d+) (\d+)\) (.*)/;
+ $gap1begin = $2 + 1;
+ $gap2begin = $4 + 1;
+ $prevanchorscore = $5; chomp $prevanchorscore;
+
+ $anchors[$m] =~ /\((\d+) (\d+)\)=\((\d+) (\d+)\) (.*)/;
+ $gap1end = $1 - 1;
+ $gap2end = $3 - 1;
+ $nextanchorscore = $5; chomp $nextanchorscore;
+
+ # CHECK IF RECURSION NEEDED
+
+ $boxarea = ($gap1end - $gap1begin + 1) * ($gap2end - $gap2begin + 1);
+ $area = $area + $boxarea;
+ $maxarea = $boxarea if ($boxarea > $maxarea);
+
+ if ($boxarea >= $minbox && ($gap1end - $gap1begin + 1) > $minside &&
+ ($gap2end - $gap2begin + 1) > $minside ){
+
+ # FAST REJECT
+
+ if ($fastreject && ($i >= $frminlevel) && ($i <= $frmaxlevel)){
+
+ # SKIP MARKED ENDS OF ALIGNMENT
+
+ if ($nextanchorscore == $sentinelleft ||
+ $prevanchorscore == $sentinelright){
+ next;
+ }
+
+ # TRIM NEW ENDS OF ALIGNMENT
+
+ if ($prevanchorscore == $sentinelleft){
+# if ($boxarea > $frseq1[$i] * $frseq2[$i]){
+ if (($gap1end - $gap1begin > $frseq1[$i]) ||
+ ($gap2end - $gap2begin > $frseq2[$i])){
+ if (@anchors == 2){ exit(3); }
+ $clipleft1 = max ($gap1begin-1, $gap1end - $frseq1[$i]);
+ $clipleft2 = max ($gap2begin-1, $gap2end - $frseq2[$i]);
+ $gap1begin = $clipleft1 + 1;
+ $gap2begin = $clipleft2 + 1;
+ }
+ }
+ elsif ($nextanchorscore == $sentinelright){
+# if ($boxarea > $frseq1[$i] * $frseq2[$i]){
+ if (($gap1end - $gap1begin > $frseq1[$i]) ||
+ ($gap2end - $gap2begin > $frseq2[$i])){
+ if (@anchors == 2){ exit(3); }
+ $clipright1 = min ($gap1end+1, $gap1begin + $frseq1[$i]);
+ $clipright2 = min ($gap2end+1, $gap2begin + $frseq2[$i]);
+ $gap1end = $clipright1 - 1;
+ $gap2end = $clipright2 - 1;
+ }
+ }
+ }
+
+ # ADD REGION
+
+ if ($gap1begin < $gap1end && $gap2begin < $gap2end){
+ $b1new[$k] = $gap1begin;
+ $b2new[$k] = $gap2begin;
+ $e1new[$k] = $gap1end;
+ $e2new[$k] = $gap2end;
+ $k++;
+ }
+ }
+ }
+
+ @b1 = @b1new;
+ @b2 = @b2new;
+ @e1 = @e1new;
+ @e2 = @e2new;
+ if ($debug) {
+ print STDERR "Level $i Summary:\n";
+ print STDERR " Using $seq1 $seq2 ($wordlen, $degeneracy, $cutoff)\n";
+ if ($totalanchs == 0) {
+ $percentage = 0;
+ }
+ else {
+ $percentage = $goodanchs / $totalanchs * 100.0;
+ }
+ print STDERR " $goodanchs good out of $totalanchs total anchors ($percentage%)\n";
+ $area = $area / 1000000;
+ $maxarea = $maxarea / 1000000;
+ print STDERR " Total area left = $area (max = $maxarea)\n";
+ }
+ $cumanchs = $cumanchs + $goodanchs;
+ $i++;
+}
+
+$res = `sort -nr +1 $$.anchs.sorted`;
+if ($?) { exit(1); }
+
+`rm $$.*`;
+
+if($tofile) {
+ open(OUTFILE, ">$filename");
+ print OUTFILE "$res";
+ close OUTFILE;
+}
+else {
+ print "$res";
+}
+
+print STDERR "$cumanchs cumulative anchors\n"
+
diff --git a/src/skiplist.c b/src/skiplist.c
new file mode 100644
index 0000000..ef738bb
--- /dev/null
+++ b/src/skiplist.c
@@ -0,0 +1,210 @@
+#include <stdlib.h>
+#include <limits.h>
+#include <stdio.h>
+#include "skiplist.h"
+#include <time.h>
+#include <assert.h>
+
+
+char init = 0;
+
+void printSLE(sle* tbp) { /* debug helper: prints an element's index and payload address */
+  printf(" %d %p\n", tbp->index, tbp->myelem); /* %p: handing a pointer to %x is undefined */
+}
+
+/* Picks the number of levels for a new element: starts at 1 and grows by one
+   for each consecutive set low-order bit of a random word, capped at MAX_LISTS. */
+int makeLevel() {
+  unsigned int bits = lrand48();
+  int level;
+  for (level = 1; (bits & 1) && (level < MAX_LISTS); level++) {
+    bits = bits >> 1;
+  }
+  return level;
+}
+
+void initLib() { /* seeds the lrand48 generator from the clock and marks the library as initialized */
+  srand48(time(0));
+  init = 1;
+}
+
+/* Makes a new, empty skip list. initLib() must have been called first;
+   exits with status 2 if it was not or if memory cannot be allocated. */
+sklst* makeSkLst() {
+  sklst* res;
+  /* check before allocating so that the failure path leaves nothing behind */
+  if (!init) { fprintf(stderr, "Skip Lists not initialized\n"); exit(2); }
+  res = (sklst*) malloc (sizeof(sklst));
+  if (!res) { fprintf(stderr, "Skip Lists: out of memory\n"); exit(2); }
+  res->sentinel = mksle(MAX_LISTS, INT_MIN, 0); /* head element: smallest key, every level */
+  res->maxlevel = 1;
+  return res;
+}
+
+/* Frees every element of the list, sentinel included; the sklst struct itself is not freed here. */
+void delSkLst(sklst* trgt) {
+  sle *cur = trgt->sentinel;
+  sle *following;
+  for ( ; cur; cur = following) {
+    following = cur->next[0];  /* read the link before the node is freed */
+    delSLE(cur);
+  }
+}
+
+/* Consistency check of the bottom level: aborts via assert unless indexes are
+   non-decreasing and every element's prev[0] points at its predecessor. */
+void chklst2(sklst* trgt) {
+  sle* node;
+  for (node = trgt->sentinel; node->next[0]; node = node->next[0]) {
+    sle* succ = node->next[0];
+    assert(node->index <= succ->index);
+    assert(node == succ->prev[0]);
+  }
+}
+
+/* Walks level 0 from the sentinel and asserts that indexes never decrease
+   and that each prev[0] link mirrors the next[0] link leading to it. */
+void chklst(sklst* trgt) {
+  sle* behind = trgt->sentinel;
+  sle* ahead;
+  for (ahead = behind->next[0]; ahead; behind = ahead, ahead = ahead->next[0]) {
+    assert(behind->index <= ahead->index);
+    assert(behind == ahead->prev[0]);
+  }
+}
+
+sle* SLinsertAfter(sklst* trgt, sle* prev, int index, void* elem) { /* creates an element for (index, elem), links it in right after prev, returns it */
+ int i;
+ sle *tbe;
+ int lc = makeLevel(); /* random number of levels for the new element */
+ if (lc > trgt->maxlevel) {
+ trgt->maxlevel = lc; /* the list now contains a taller element */
+ }
+ tbe = mksle(lc, index, elem);
+ for (i = 0; i < tbe->linkcnt; i++) { /* splice tbe into the chain of every level it has */
+ tbe->prev[i] = prev;
+ if (prev->next[i]) {
+ prev->next[i]->prev[i] = tbe;
+ }
+ tbe->next[i] = prev->next[i];
+ prev->next[i] = tbe;
+ while (prev && i >= prev->linkcnt-1) /* back up along level i to the nearest element that also has level i+1 */
+ prev = prev->prev[i];
+ /* NOTE(review): prev can end up null here; the next pass would then dereference it -- confirm the sentinel's MAX_LISTS levels always prevent that */
+ }
+ return tbe;
+}
+
+/* Inserts elem under index, placing it right after the element SLfind returns for index. */
+sle* SLinsert(sklst* trgt, int index, void* elem) {
+  sle* anchor = SLfind(trgt, index);
+  return SLinsertAfter(trgt, anchor, index, elem);
+}
+
+/* Removes tbr from the list and destroys it. The unlinking is performed only
+   when trgt is non-null; the element is freed in either case. */
+void SLremove(sklst* trgt, sle* tbr) {
+  int lvl;
+  for (lvl = 0; trgt && lvl < tbr->linkcnt; lvl++) {
+    sle* before = tbr->prev[lvl];
+    sle* after = tbr->next[lvl];
+    if (before) before->next[lvl] = after;
+    if (after) after->prev[lvl] = before;
+  }
+  delSLE(tbr);
+}
+
+
+/* Returns the last element of the list (the sentinel when the list is empty).
+   No pointer to the last element is kept because this is rarely needed;
+   instead the end is reached by running to the end of each level in turn,
+   from the highest level in use down to level 0. */
+
+sle* SLgetLast(sklst* trgt) {
+  sle* last = trgt->sentinel;
+  int lvl = trgt->maxlevel;
+  while (lvl-- > 0) {
+    while (last->next[lvl]) {
+      last = last->next[lvl];
+    }
+  }
+  return last;
+}
+
+/* Same as SLfind below, but suited to targets near the beginning of the
+   list: the search first climbs from level 0 for as long as the next
+   element does not pass index, then works its way down as SLfind does. */
+sle* SLlowFind(sklst* trgt, int index) {
+  int i;
+  sle* currpivot = trgt->sentinel;
+  for (i = 0; i < trgt->maxlevel-1; i++) {
+    if (!currpivot->next[i] || currpivot->next[i]->index > index)
+      break;
+    currpivot = currpivot->next[i];
+    /* The new pivot may have no level i+1; stop climbing instead of reading
+       next[i+1] past the end of its link array on the next pass. */
+    if (currpivot->linkcnt <= i+1) break;
+  }
+  /* Descend: on each level advance to the first index >= target, step back. */
+  for ( ; i >= 0; i--) {
+    while (currpivot->index < index) {
+      if (!currpivot->next[i])
+        goto cont;   /* ran off the end of this level: stay put, go down */
+      currpivot = currpivot->next[i];
+    }
+    currpivot = currpivot->prev[i];
+    cont: {}
+  }
+  return currpivot;
+}
+
+/*gets the elem with the next lowest index, i.e. the last one whose index is below index. NOTE(review): when nothing is lower the walk appears to end on the sentinel rather than 0 -- confirm */
+sle* SLfind(sklst* trgt, int index) {
+ int i;
+ sle* currpivot = trgt->sentinel;
+ i = trgt->maxlevel-1; /* start on the highest level in use */
+ for ( ; i >= 0; i--) {
+
+ while (currpivot->index < index) { /* run forward on level i while still below the target */
+ if (!currpivot->next[i]) {
+ goto cont; /* end of this level: keep the pivot and drop one level */
+ }
+ currpivot = currpivot->next[i];
+ }
+ currpivot = currpivot->prev[i]; /* the pivot reached or passed the target: step back one element on this level */
+ cont: {}
+ }
+ return currpivot;
+
+}
+
+sle* mksle(int linkcnt, int index, void* myelem) { /* new element with linkcnt levels, all links null; exits with 2 when out of memory */
+  int i;
+  sle* res = (sle*)malloc (sizeof(sle));
+  if (!res) { fprintf(stderr, "Skip Lists: out of memory\n"); exit(2); }
+  res->next = (sle**) malloc(linkcnt*sizeof(sle*));
+  res->prev = (sle**) malloc(linkcnt*sizeof(sle*));
+  if (linkcnt > 0 && (!res->next || !res->prev)) { fprintf(stderr, "Skip Lists: out of memory\n"); exit(2); }
+  res->linkcnt = linkcnt;
+  res->index = index;
+  res->myelem = myelem;
+  for (i = 0; i < linkcnt; i++)
+    res->next[i] = res->prev[i] = 0;
+  return res;
+}
+
+void delSLE(sle* tbd) { /* releases both link arrays, then the element itself; the payload is not touched */
+  free(tbd->prev);
+  free(tbd->next);
+  free(tbd);
+}
+
+
+
+
+
+
+
+
+
+
diff --git a/src/skiplist.h b/src/skiplist.h
new file mode 100644
index 0000000..ad41a9c
--- /dev/null
+++ b/src/skiplist.h
@@ -0,0 +1,29 @@
+#ifndef SKIPLIST_H
+#define SKIPLIST_H
+#define MAX_LISTS 32   /* largest number of levels an element can have */
+
+typedef struct skiplistelem {
+  struct skiplistelem** next;   /* next[i]: following element on level i, or null */
+  struct skiplistelem** prev;   /* prev[i]: preceding element on level i, or null */
+  int linkcnt;                  /* number of levels, i.e. length of next and prev */
+  int index;                    /* key the list is ordered by */
+  void* myelem;                 /* caller's payload; never dereferenced by the list */
+} sle;
+typedef struct skiplist {
+  sle* sentinel;                /* head element with MAX_LISTS levels and index INT_MIN */
+  int maxlevel;                 /* largest level count handed out so far */
+} sklst;
+
+
+void initLib();                 /* must be called once before makeSkLst */
+sklst* makeSkLst();
+void chklst(sklst* trgt);
+void delSkLst(sklst* trgt);
+sle* SLinsertAfter(sklst* trgt, sle* prev, int index, void* elem);
+sle* SLinsert(sklst* trgt, int index, void* elem);
+sle* SLgetLast(sklst* trgt);
+void SLremove(sklst* trgt, sle* tbr);
+sle* SLfind(sklst* trgt, int index);
+sle* SLlowFind(sklst* trgt, int index);
+sle* mksle(int linkcnt, int index, void* myelem);
+void delSLE(sle* tbd);
+#endif /* SKIPLIST_H */
diff --git a/src/slagan-mfa.pl b/src/slagan-mfa.pl
new file mode 100755
index 0000000..ba15fa3
--- /dev/null
+++ b/src/slagan-mfa.pl
@@ -0,0 +1,35 @@
+#!/usr/bin/perl
+
+use strict;
+
+$0 = rindex($0, "/") > -1 ? substr($0, rindex($0, "/")+1) : $0; # keep only the script's base name for use in messages
+
+die("$0: LAGAN_DIR not defined. Stopped") unless defined $ENV{"LAGAN_DIR"};
+my $LAGAN_DIR = $ENV{LAGAN_DIR}; # directory from which slagan.pl and xmfa2mfa.pl are run
+
+my ($outfile, $base); # values of the -out and -base options, when present
+
+foreach my $arg (@ARGV) { # $arg aliases the @ARGV element, so the s/// below strip the option from @ARGV itself
+ if ($arg =~ /-out\s+([^\s]+)/) { # matches only when the option and its value arrive inside one argument
+ $outfile = $1;
+ $arg =~ s/-out\s+([^\s]+)//;
+ } elsif ($arg =~ /-base[\s\=]+([^\s]+)/) { # accepts "-base 1" as well as "-base=1"
+ $base = $1;
+ $arg =~ s/-base[\s\=]+([^\s]+)//;
+ die("$0: Invalid base parameter (expected 1 or 2). Stopped") unless $base eq "1" or $base eq "2";
+ }
+}
+
+if (@ARGV < 2) {
+ print ("Usage:\n$0 seqfile1 seqfile2 [-glocal \"glocal flags\"] [-chaos \"chaos flags\"] [-order \"order flags\"] [-recurse \"(wl1,nd1,co1),(wl2,nd2,co2),...\"] [-mfa] [-out \"filename\"] [-maskedonly] [-debug] [-translate] [-fastreject]\n");
+ exit(1);
+}
+
+my $args = join(" ", @ARGV); # remaining arguments are handed to slagan.pl through the shell
+system($LAGAN_DIR."/slagan.pl $args > slagan.pl.out"); # output is captured in a fixed-name file in the current directory
+die("$0: slagan.pl returned error $?. Stopped") if $?;
+
+system($LAGAN_DIR."/xmfa2mfa.pl ".($base eq "2" ? "2" : "1")." < slagan.pl.out ".($outfile ? "> $outfile" : "")); # base defaults to 1; result goes to $outfile when set, otherwise to stdout
+die("$0: xmfa2mfa.pl returned error $?. Stopped") if $?;
+
+unlink "slagan.pl.out"; # reached only on success; the die calls above leave the file behind
diff --git a/src/slagan.pl b/src/slagan.pl
new file mode 100755
index 0000000..6aed1e2
--- /dev/null
+++ b/src/slagan.pl
@@ -0,0 +1,172 @@
+#!/usr/bin/perl -w
+
+use strict;
+
+my $lagandir = $ENV{LAGAN_DIR};   # directory holding chaos, lagan.pl, supermap.pl and utils/
+
+if (@ARGV < 2) {
+    print ("Usage:\n slagan.pl seqfile1 seqfile2 [-glocal \"glocal flags\"] [-chaos \"chaos flags\"] [-order \"order flags\"] [-recurse \"(wl1,nd1,co1),(wl2,nd2,co2),...\"] [-mfa] [-out \"filename\"] [-maskedonly] [-debug] [-translate] [-fastreject]\n");
+    exit(1);
+}
+die("$0: LAGAN_DIR not defined. Stopped") unless defined $lagandir;   # every external tool below is run from that directory
+my ($seq1, $firstName) = ($ARGV[0], $ARGV[0]);   # $seq1 may be switched to the masked file; $firstName keeps the original
+die("$0: File not found: $seq1. Stopped") unless -f $seq1;
+my ($seq2, $secondName) = ($ARGV[1], $ARGV[1]);
+die("$0: File not found: $seq2. Stopped") unless -f $seq2;
+
+my ($extra1, $extra2) =(0, 0);   # set when a ".masked" companion file replaces the input for the CHAOS run
+if (-e "$seq1.masked") { $seq1 = "$seq1.masked"; $extra1 = 1;}
+if (-e "$seq2.masked") { $seq2 = "$seq2.masked"; $extra2 = 1;}
+
+my $max_ext = 25000;   # parsed from -max_ext; only referenced by disabled code further down
+my $ext_mul = 1;       # parsed from -ext_mul; only referenced by disabled code further down
+my $arglist = "";      # options collected for each lagan.pl run
+my $glocal_fl = " -gapopen 0,1000,2000,2000 -gapcont 0.2,0.06,0.06,0.06 -dist 0,1.0,2.5,2.5";   # the glocal call that used it is commented out
+my $chaos_fl = " -wl 11 -nd 1 -co 10 -ext -rsc 2250 -b";   # flags of the initial CHAOS run
+my $lagan_fl = "";     # extra flags appended to each lagan.pl run
+my $supermap_fl = "-glocal_out=slagan.out.glocal";
+my $outfile = 0;       # 0 means print the alignments to stdout
+my $fastrej = 0;
+
+for (my $i = 2; $i < @ARGV; $i++) {   # options follow the two sequence files; the patterns are unanchored, so branch order matters
+    if ($ARGV[$i] =~ /-glocal_fl/) {
+        $glocal_fl = $ARGV[++$i];
+    } elsif ($ARGV[$i] =~ /-chaos_fl/) {
+        $chaos_fl = $ARGV[++$i];
+    } elsif ($ARGV[$i] =~ /-lagan_fl/) {
+        $lagan_fl = $ARGV[++$i];
+    } elsif ($ARGV[$i] =~ /-max_ext/) {   # must precede the -ext branch, whose pattern also matches this option
+        $max_ext = $ARGV[++$i];
+    } elsif ($ARGV[$i] =~ /-ext_mul/) {
+        $ext_mul = $ARGV[++$i];
+    } elsif ($ARGV[$i] =~ /-out/) {
+        $outfile = $ARGV[++$i];
+        if (-e "$outfile") { unlink($outfile) or exit(1); }   # results are appended with >>, so start clean; unlink avoids passing the name through a shell
+    } elsif ($ARGV[$i] =~ /-order/) {
+        $arglist = $arglist." -order $ARGV[++$i]";
+    } elsif (($ARGV[$i] =~ /-gs/) || ($ARGV[$i] =~ /-gc/) || ($ARGV[$i] =~ /-mt/) || ($ARGV[$i] =~ /-ms/) || ($ARGV[$i] =~ /-bw/)) {
+        $arglist = $arglist." ".$ARGV[$i];     # option name ...
+        $arglist = $arglist." ".$ARGV[++$i];   # ... and its value are passed through to lagan.pl
+    } elsif ($ARGV[$i] =~ /-ext/) {
+        $arglist = $arglist." -ext $ARGV[++$i]";
+    } elsif ($ARGV[$i] =~ /-maskedonly/) {
+        $arglist = $arglist." -maskedonly";
+    } elsif ($ARGV[$i] =~ /-translate/) {
+        $arglist = $arglist." -translate";
+    } elsif ($ARGV[$i] =~ /-fastreject/) {
+        $fastrej = 1;   # handled here via scorealign -cropxmfa, not forwarded to lagan.pl
+#        $arglist = $arglist." -fastreject";
+    } elsif ($ARGV[$i] =~ /-recurse/) {
+        $arglist = $arglist." -recurse \"".$ARGV[++$i]."\"";
+    } elsif ($ARGV[$i] =~ /-chaos/) {   # reached only when -chaos_fl above did not match
+        $chaos_fl = $ARGV[++$i];
+    } else {
+        die("$0: Invalid option for rlagan: $ARGV[$i]");
+    }
+}
+
+my $seq1len = `$lagandir/utils/getlength $firstName`; # lengths are taken from the original (unmasked) files
+my $seq2len = `$lagandir/utils/getlength $secondName`;
+chomp $seq1len;
+chomp $seq2len;
+
+`$lagandir/chaos $seq1 $seq2 $chaos_fl > chaos.$$`; # hits between the two (possibly masked) inputs, kept in a per-process file
+if ($?) { exit(1); }
+
+#`$lagandir/glocal chaos.$$ $glocal_fl > out.$$`;
+#@regs = `$lagandir/anal_gloc.pl < out.$$`;
+#print @regs;
+
+open(FH, "> seq1len"); print FH $firstName." ".$seq1len."\n"; close FH; # "name size" files for supermap's -sizes1/-sizes2, written to the current directory
+open(FH, "> seq2len"); print FH $secondName." ".$seq2len."\n"; close FH;
+my $supermap_outfile = "slagan.out.smap";
+my $supermap_inv = "$lagandir/supermap.pl -sizes1=seq1len -sizes2=seq2len $supermap_fl chaos.$$ -no_clust_run -f -out=$supermap_outfile 1>&2"; # supermap's stdout is redirected to stderr
+#print $supermap_inv."\n";
+system($supermap_inv); # exit status is not checked; a failure shows up as an empty region list below
+
+open(FH, "< $supermap_outfile"); # open is unchecked; a missing file likewise ends in the die below
+my @regs = <FH>; # one region per line
+die("$0: Supermap generated no regions. Stopped") unless scalar @regs;
+close FH;
+unlink "seq1len"; unlink "seq2len"; # unlink $supermap_outfile;
+
+#$prevend1 = $seq1len;
+#$prevend2 = $seq2len;
+#$nextstart1 = 1;
+#$nextstart2 = 1;
+
+for (my $k = 0; $k < @regs; $k++) {   # one LAGAN run per supermap region
+    # Expected line: "name1 start1 end1   name2 start2 end2 strand (type, N aligns)"; anything else is skipped, since the captures would be undefined
+    unless ($regs[$k] =~ /^([^\s]+)\s([\d]+)\s([\d]+)\s\s\s([^\s]+)\s([\d]+)\s([\d]+)\s(\+|\-)\s\((DM|M1|M2),\s([\d]+)\saligns\)$/o) { warn("$0: Skipping unparsable supermap line: $regs[$k]"); next; }
+    my ($startreg1, $endreg1, $startreg2, $endreg2, $strand, $type) = ($2, $3, $5, $6, $7, $8);
+
+=head1
+ $regs[$k] =~ /.* Region \[(\d+) (\d+)\]\[(\d+) (\d+)\] (.*) (.)/;
+ $startreg1 = $1; $endreg1 = $2; $startreg2 = $3; $endreg2 = $4;
+ $strand = $6;
+ if ($k+2 < @regs) {
+ $regs[$k+1] =~ /.* Region \[(\d+) (\d+)\]\[(\d+) (\d+)\] (.*) (.)/;
+ $nextstart1 = $2;
+ } else {
+ $nextstart1 = 1;
+ }
+ $y1 = $prevend1-$endreg1;
+ $y2 = $startreg1-$nextstart1;
+ $expandback = ($max_ext < $y1)? $max_ext:$prevend1-$endreg1;
+ $expandforw = ($max_ext < $y2)? $max_ext:$startreg1-$nextstart1;
+ $prevend1 = $startreg1;
+ $startreg1 = $startreg1 - $expandforw;
+ $endreg1 = $endreg1 + $expandback;
+=cut
+
+    my $rcf = "";   # extra fa2xfa argument for the second sequence: "-rc" on the minus strand
+    if ($strand eq "+") {
+# $endreg2 = ($endreg2 + $expandback * $ext_mul > $prevend2)? $prevend2:($endreg2 + $expandback * $ext_mul);
+# $startreg2 = ($startreg2 - $expandforw * $ext_mul < $nextstart2)? $nextstart2:($startreg2 - $expandforw * $ext_mul);
+    } else {
+        $rcf = "-rc";
+# $endreg2 = ($endreg2 + $expandforw * $ext_mul > $prevend2)? $prevend2:($endreg2 + $expandforw * $ext_mul);
+# $startreg2 = ($startreg2 - $expandback * $ext_mul < $nextstart2)? $nextstart2:($startreg2 - $expandback * $ext_mul);
+    }
+
+#print "$lagandir/utils/fa2xfa $firstName $startreg1 $endreg1 1 > seq1$k.$$\n";
+    `$lagandir/utils/fa2xfa $firstName $startreg1 $endreg1 1 > seq1$k.$$\n`;
+#print "$lagandir/utils/fa2xfa $secondName $startreg2 $endreg2 2 $rcf > seq2$k.$$\n";
+    `$lagandir/utils/fa2xfa $secondName $startreg2 $endreg2 2 $rcf > seq2$k.$$\n`;
+# if ($extra1) { `$lagandir/utils/fa2xfa $seq1 $startreg1 $endreg1 1 > seq1$k.$$.masked\n`; }
+# if ($extra2) { `$lagandir/utils/fa2xfa $seq2 $startreg2 $endreg2 2 $rcf > seq2$k.$$.masked\n`; }
+#print "$lagandir/lagan.pl seq1$k.$$ seq2$k.$$ $arglist $lagan_fl -mfa -out lagan.$k.$$\n";
+    `$lagandir/lagan.pl seq1$k.$$ seq2$k.$$ $arglist $lagan_fl -mfa -out lagan.$k.$$\n`;
+
+    my $suff = "";   # shell redirection appended to the commands below when -out was given
+    if ($outfile) { $suff = " >> $outfile"; }
+    if (-e "lagan.$k.$$") {   # a missing file means lagan.pl produced no alignment for this region
+        if ($fastrej) {
+#print "$lagandir/utils/scorealign lagan.$k.$$ 45 -cropxmfa -ibounds $suff\n";
+            print `$lagandir/utils/scorealign lagan.$k.$$ 45 -cropxmfa -ibounds $suff`;
+        } else {
+#print "$lagandir/utils/scorealign lagan.$k.$$ 45 -ibounds\n";
+            my $sc = `$lagandir/utils/scorealign lagan.$k.$$ 45 -ibounds`;
+            chomp($sc);
+            if ($sc) {   # emit the alignment only when scorealign returned something
+                print `cat lagan.$k.$$ $suff`;
+                print `echo \"=$sc $type\n\" $suff`;   # "=" trailer line carrying the score output and region type
+            }
+        }
+    }
+}
+
+my ($outName1, $outName2) = ($ARGV[0], $ARGV[1]);   # base names of the inputs: directory and everything from the first dot removed
+$outName1 =~ s/^.*\///;
+$outName1 =~ s/\..*//;
+$outName2 =~ s/^.*\///;
+$outName2 =~ s/\..*//;
+
+`cat chaos.$$ > ${outName1}_$outName2.chaos`;   # keep the CHAOS hits under a name derived from the inputs
+####`cat out.$$ > ${outName1}_$outName2.mon`;
+unlink(glob("*.$$"));   # per-process temporary files
+unlink(glob("*.$$.masked")) if ($extra1 || $extra2);   # normally matches nothing (creation is disabled above); unlike rm, an empty match prints no error
+exit(0);
+
+
+# out: .chaos .mon->.smap .xmfa
diff --git a/src/sortlist.c b/src/sortlist.c
new file mode 100644
index 0000000..f1b4111
--- /dev/null
+++ b/src/sortlist.c
@@ -0,0 +1,43 @@
+/* Merges two lists that are each sorted by ascending seq1start into a single
+   ascending list and returns its head. Either argument may be null. The
+   nodes are relinked in place; on equal keys list1's node comes first. */
+hll* merge2(hll* list1, hll* list2) {
+  hll* head = 0;        /* first node of the merged list */
+  hll** tail = &head;   /* the link that receives the next node */
+  while (list1 && list2) {
+    hll** src = (list1->seq1start <= list2->seq1start) ? &list1 : &list2;
+    *tail = *src;
+    tail = &(*src)->next;
+    *src = (*src)->next;
+  }
+  *tail = list1 ? list1 : list2;   /* one input is exhausted: append the rest of the other */
+  return head;
+}
+
+/* Returns the node at position n/2 (counting from 0) of an n-node list, or
+   null for an empty list; for two or more nodes this is never the head. */
+hll* findmiddle(hll* mylist) {
+  hll* other = mylist;
+  while (other && other->next) {
+    other = other->next->next;
+    mylist = mylist->next;
+  }
+  return mylist;
+}
+
+/* Merge sort by ascending seq1start. Relinks the nodes and returns the new head. */
+hll* sortList(hll* mylist) {
+  hll* premid;
+  hll* mid;
+  if (!mylist || !mylist->next)
+    return mylist;
+
+  mid = findmiddle(mylist);
+  /* find the node just before mid so the list can be cut into two non-empty halves */
+  for (premid = mylist; premid->next != mid; premid = premid->next) {}
+  premid->next = 0;
+  mylist = sortList(mylist);
+  mid = sortList(mid);
+  return merge2(mylist, mid);
+}
+
diff --git a/src/supermap.pl b/src/supermap.pl
new file mode 100755
index 0000000..78296e9
--- /dev/null
+++ b/src/supermap.pl
@@ -0,0 +1,1622 @@
+#!/usr/bin/perl
+
+# Supermap: Piecewise monotonic alignment map generator for Shuffle-LAGAN
+# Author: Andrey Kislyuk (kislyuk at ocf.berkeley.edu)
+
+package Supermap;
+require 5.005;
+my ($VERSION) = ('$Id: supermap.pl,v 1.50 2005/06/15 22:40:04 kislyuk Exp $' =~ /,v\s+(\d+\S+)/o); # version number pulled out of the CVS Id string
+
+# Default constant values (several can be overridden by command line options in init)
+my $overlap_factor = 0.8; # Aligns will be discarded if another align overlaps them by this factor or more in both seqs and has the same orientation
+my $max_asym = 10; # Chains will be formed only if the resulting region's lengths differ by at most this factor
+my $min_seq_score; # All aligns for sequences with this total score will be discarded. See getMinSeqScore
+my $max_expand_len = 30000; # Aligns will be expanded or contracted on both sides on both strands by this amount up to the total length below
+my $expand_factor = 4; # When one of an align's sequences is constrained in its expansion by a neighbor/start/end, the other one will be expanded by this times more than the first one
+my $max_chainlen = 1500000; # Aligns will not be joined if the total length on either strand exceeds this. Set 0 to disable (no chain length limit)
+my $max_job_size = 50000; # Maximum job size, in blat hits, for chunking when running glocal in parallel
+my $erode_align = 15; # Amount by which to erode the coords of each align loaded (to avoid overlap problems when chaining)
+my ($c1, $c2, $c3, $c4) = (100, 50, 400, 25); # BLAT->CHAOS score conversion parameters
+#my $max_dist_y = 10000; # Join x-monotonic into same single-chain only if at most that apart in y-species.
+my $default_lagan_dir = "/home/genome/glocal"; # used when LAGAN_DIR is not set in the environment
+my $glocal_name = (0 ? "SLAGAN" : "glocal"); # executable name appended to the LAGAN directory; the constant 0 selects "glocal"
+
+use Getopt::Long;
+use File::Path;
+use File::Copy;
+use Cwd;
+use IPC::Open2;
+use IO::Handle;
+#use Carp;
+use strict;
+use warnings;
+no warnings "uninitialized";
+
+sub main(); # forward declarations with prototypes: the subs are defined after the top-level code that calls them
+sub init();
+sub getSeqSizes($$$);
+sub prepareHits();
+sub runSLAGAN();
+sub reprintInputHits($$$);
+sub processResults();
+sub removeSLAGANOutput();
+sub seqBelowMinScore($);
+sub alignHashID($);
+sub printChainToTemp($$$$);
+sub chainBase1Hits($$);
+sub chainBase2Hits($);
+sub load2MHashes($);
+sub loadBase2Hashes($);
+sub postProcessRegions();
+sub workerRun($$$$);
+sub dequeueClustJobs($);
+sub get_all_seqs($$);
+sub isBLAT($);
+sub useIf($$); # predeclared so that init can call it without parentheses
+sub writeSizes($$);
+sub getMinSeqScore($);
+sub checkAlignCoords($);
+sub expandSeq1($$);
+sub expandSeq2($$);
+sub finalExpand($$);
+sub expSeq1Reg($$$$$);
+sub expSeq2Reg($$$$$);
+sub finalExpReg($$$$$);
+
+# array index constants (presumably the field positions inside one alignment record -- confirm against the loaders)
+use constant START1 => 0; use constant END1 => 1;
+use constant START2 => 2; use constant END2 => 3;
+use constant SEQ1 => 4; use constant SEQ2 => 5;
+use constant ORIENT => 6; use constant ORIGIN => 7;
+use constant SCORE => 8; use constant TOTSC => 9;
+use constant HASHID => 10; use constant FLIPPED=> 11;
+use constant CHALO1 => 12; use constant CHAHI1 => 13;
+use constant CHALO2 => 14; use constant CHAHI2 => 15;
+use constant CHALO1E=> 16; use constant CHAHI1E=> 17;
+use constant CHALO2E=> 18; use constant CHAHI2E=> 19;
+#use constant PREV1 => 8; use constant NEXT1 => 9;
+#use constant PREV2 => 10; use constant NEXT2 => 11;
+#use constant OSTART1=> 12; use constant OEND1 => 13;
+#use constant OSTART2=> 14; use constant OEND2 => 15;
+
+$SIG{'INT'} = $SIG{'QUIT'} = $SIG{'HUP'} = $SIG{'TRAP'} = $SIG{'ABRT'} = $SIG{'STOP'} = $SIG{'TERM'} = \&dequeueClustJobs; # one shared handler for all of these signals
+
+my ($debug, $quiet, $outfile, $proflip, $skip, $no_pid, $input_glob, $input_dir, # file-scope state, mostly filled in by init
+ $server, $db, $gen1, $gen2, $gen1sizefile, $gen2sizefile, $write_sizes1, $write_sizes2,
+ $score_file, $cfg, $cfg_file, $sizes1, $sizes2, $dbh, $tmp_dir, $tmp_prefix, $nodelete,
+ $clust_run_pid, $print_chains, $no_aligntotals, $no_clust_run, $num_jobs, $input_is_blat,
+ $force_overwrite, $print_csv, $using_GP, $slagan_params, $tmp_existed, $print_stats, $lagan_dir, $glocal_out_logfile);
+my (@input_files);
+my (%offsets1, %offsets2, %aligns1, %aligns2, %flipped_aligns);
+
+my $supermapexec = $0; my $mycwd = getcwd(); $supermapexec =~ s/^\./$mycwd/ unless $supermapexec =~ /^\.\./; $supermapexec = $mycwd."/".$supermapexec if $supermapexec =~ /^\.\./; # make a "./" or "../" invocation path absolute
+die("$0: Problem resolving my name, \'$supermapexec\' is not a file") unless -f $supermapexec or $ARGV[0] eq "worker";
+$0 = rindex($0, "/") > -1 ? substr($0, rindex($0, "/")+1) : $0; # from here on $0 is just the base name
+
+$lagan_dir = $ENV{"LAGAN_DIR"} if defined $ENV{"LAGAN_DIR"};
+$lagan_dir = $ENV{"LAGAN_DIR"} = $default_lagan_dir unless defined $ENV{"LAGAN_DIR"};
+$lagan_dir =~ s/^\.\./$mycwd\/\.\./; # resolve a leading ".." ...
+$lagan_dir =~ s/^\./$mycwd\//; # ... or a leading "." against the current directory
+$ENV{"LAGAN_DIR"} = $lagan_dir; # child processes see the resolved path
+print STDERR "$0: Warning: LAGAN_DIR=$lagan_dir is not a valid directory\n" unless -d $lagan_dir;
+push @INC, $lagan_dir; # lets modules be loaded from the LAGAN directory at run time
+
+my $SLAGAN = $lagan_dir."/".$glocal_name;
+my $error_file = "./$0.$$.error.log";
+my $default_score_file = $lagan_dir."/test.score";
+my $default_outfile = "$0.out";
+my $worker_tmp_dir = "/tmp/$0.$$.worker/"; # The directory where workers store their intermediate files (two workers should not use the same directory)
+
+my $usage = "
+-infile=file \t Name of input file containing all hits for the two genomes
+-outfile=file \t Output filename (default: $default_outfile)
+-gen1=id \t First genome ID (must exist in the GPDB)
+-gen2=id \t Second genome ID (must exist in the GPDB)
+-sizes1=file \t File with sequence sizes for first genome
+-sizes2=file \t File with sequence sizes for second genome
+-bacteria \t Rearrange circular DNA to find a better alignment map
+-server=hostname GPDB server (default: lemur)
+-db=dbname \t GPDB name (default: GP)
+-config=file \t GPDB config file (default: ~/.gprc)
+-score=file \t Score file for SLAGAN (default: $default_score_file)
+-glocal_out=file \t Save intermediate GLOCAL alignment hits to this file
+-no_clust_run \t Run CPU/memory intensive jobs locally, not on the GP cluster
+-tmp_dir=dir \t Working directory (default: /tmp/$0.pid)
+-f \t\t Overwrite output file without prompting if it exists
+-v \t\t Verbose mode
+-q \t\t Quiet mode
+-k \t\t Keep all temporary files
+-expand_length=N Maximum length by which to expand alignments (default: $max_expand_len)
+-max_length=N \t Maximum length for any alignment chain in either strand
+\t\t (default: $max_chainlen)
+-min_seq_score=N Sequences with total align score below this threshold will be
+\t\t discarded (default: U penalty in SLAGAN score file)
+-max_job_size=N Threshold, in hits, for splitting workload into separate jobs
+\t\t for clust_run (default: $max_job_size)
+-c1, c2, c3, c4=N: Score factors for BLAT->CHAOS conversion
+\t\t (default: $c1, $c2, $c3, $c4)
+
+Options may be abbreviated.
+Input file format is BLAT or CHAOS. Sequence names should not contain spaces.
+Alignments with negative scores are discarded.
+Sequence size file format, one sequence per line: seq_name seq_size
+";
+
+exit(main()); # main's return value becomes the process exit status
+
+# ___ Subroutines _______________
+
+sub main() {
+    # Worker mode (running SLAGAN in distributed mode): process one job and leave
+    if ($ARGV[0] eq "worker") { workerRun($ARGV[1], $ARGV[2], $ARGV[3], $ARGV[4]); exit(0); }
+    init();
+    print("$0: Retrieving sequence info...\n") unless $quiet;
+    $sizes1 = getSeqSizes($dbh, $gen1, $gen1sizefile);
+    if (defined $write_sizes1) { writeSizes($sizes1, $write_sizes1); exit(0); }
+    $sizes2 = getSeqSizes($dbh, $gen2, $gen2sizefile);
+    if (defined $write_sizes2) { writeSizes($sizes2, $write_sizes2); exit(0); }
+    my ($count1, $count2) = (scalar(keys(%$sizes1)), scalar(keys(%$sizes2)));   # number of sequences per genome
+    die("$0: No sequence size data found. Stopped") if ($count1 < 1 or $count2 < 1);
+    die("$0: Flip mode is only applicable for two single-sequence organisms. Stopped") unless (not $proflip or ($count1 == 1 and $count2 == 1));
+
+    # Pipeline: sort and separate the alignments and run SLAGAN on them; chain the
+    # SLAGAN alignments into a supermonotonic chain and save the intermediate
+    # results; then load the results back, expand the regions and print them
+    prepareHits();
+    runSLAGAN();
+    my @chain_results = processResults();   # called in list context; the values are not used here
+    postProcessRegions();
+
+    if (not $quiet) {
+        print "$0: Output written to $outfile\n";
+        print "$0: Intermediate files kept in $tmp_dir\n" if $nodelete;
+    }
+    rmdir $tmp_dir if not ($tmp_existed or $nodelete);
+
+    return 0;
+}
+
+
+# Startup tasks: set the locale, load optional modules, parse and validate options, prepare the working directory, input list and output file
+sub init() {
+    $ENV{"LC_ALL"} = "C"; # Things may misbehave if locale is set to UTF-8 (set in %ENV so that child processes inherit it)
+
+    # Berkeley Genome Pipeline functionality is used if corresponding Perl modules are found in @INC
+    foreach my $dir (@INC) {
+        $using_GP = 1 if -f $dir."/GPDBI.pm" and -f $dir."/GPutils.pm";
+    }
+
+    useIf $using_GP, "GPDBI";
+    useIf $using_GP, "GPutils";
+    useIf 1, "Utils";
+# useIf 1, "Desoverlap";
+
+    die("$0: GetOptions failed to retrieve options. Check the input options. Usage:".$usage) unless
+    GetOptions(
+        "server=s" => \$server,
+        "gen1=s" => \$gen1,
+        "gen2=s" => \$gen2,
+        "sizes1=s" => \$gen1sizefile,
+        "sizes2=s" => \$gen2sizefile,
+        "blatfile=s" => \$input_glob,
+        "infile=s" => \$input_glob,
+        "outfile=s" => \$outfile,
+        "glocal_out=s" => \$glocal_out_logfile,
+        "bacteria" => \$proflip,
+        "server=s" => \$server,
+        "db=s" => \$db,
+        "config=s" => \$cfg_file,
+        "tmp_dir=s" => \$tmp_dir,
+        "skip" => \$skip,
+        "no_pid" => \$no_pid,
+        "no_clust_run" => \$no_clust_run,
+        "print_chains" => \$print_chains,
+        "print_stats" => \$print_stats,
+        "no_aligntotals"=> \$no_aligntotals,
+        "print_csv" => \$print_csv,
+        "max_job_size=i"=> \$max_job_size, # takes a number, as the usage text says; as a bare flag it became 1 and always failed the size check below
+        "max_length=i" => \$max_chainlen,
+        "expand_length=i"=>\$max_expand_len,
+        "min_seq_score=i"=>\$min_seq_score,
+        "max_asym=i" => \$max_asym,
+        "overlap_factor"=> \$overlap_factor, # NOTE(review): declared as a flag, so giving it sets the factor to 1 -- "=f" may have been intended
+        "score=s" => \$score_file,
+        "c1=i" => \$c1,
+        "c2=i" => \$c2,
+        "c3=i" => \$c3,
+        "c4=i" => \$c4,
+        "slagan_params" => \$slagan_params,
+        "write_sizes1=s"=> \$write_sizes1,
+        "write_sizes2=s"=> \$write_sizes2,
+        "keep" => \$nodelete,
+        "f" => \$force_overwrite,
+        "v" => \$debug,
+        "q" => \$quiet
+    );
+
+    undef $quiet if $debug; # verbose mode overrides quiet mode
+    my @uinfo = getpwuid($>);
+    print("$0: Version ".$VERSION." started ".localtime()." by ".$uinfo[0]."\n") unless $quiet;
+    $tmp_prefix = $0.($no_pid ? "" : ".".$$);
+
+    unless ($no_clust_run) { # cluster operation is also switched off when clust_run is not on the PATH
+        $no_clust_run = `which clust_run 2> /dev/null`; $no_clust_run = not $no_clust_run;
+        print("$0: clust_run not found - cluster operation disabled\n") if $no_clust_run and not $quiet;
+    }
+
+    if ($tmp_dir) {
+        $tmp_existed = 1 if -d $tmp_dir; # a pre-existing directory is not removed at the end of main
+        mkdir $tmp_dir unless -d $tmp_dir;
+        $tmp_dir .= "/" unless $tmp_dir =~ /\/\Z/; # file names are appended directly, so make sure of a trailing slash
+    } else {
+        $tmp_dir = "/tmp/".$tmp_prefix;
+        mkdir $tmp_dir;
+        $tmp_dir .= "/";
+    }
+    die("$0: No write permissions in working directory $tmp_dir. Stopped") unless -w $tmp_dir;
+    die("$0: Genome IDs or size files not specified. Usage:".$usage) unless ($gen1 or $gen1sizefile) and ($gen2 or $gen2sizefile);
+    die("$0: '-gen' options are invalid because GPDB is not available. Use '-sizes'. Stopped") if (($gen1 or $gen2) and not $using_GP);
+    die("$0: Sequence size file $gen1sizefile not found. Stopped") unless -f $gen1sizefile or $gen1;
+    die("$0: Sequence size file $gen2sizefile not found. Stopped") unless -f $gen2sizefile or $gen2;
+    die("$0: Maximum job size too small, must exceed 10000 hits. Stopped") if $max_job_size < 10000;
+    die("$0: Overlap factor must be between 0 and 1. Stopped") if $overlap_factor < 0 or $overlap_factor > 1;
+    print("$0: SLAGAN score file not specified, using default $default_score_file\n") unless $score_file or $quiet;
+    print("$0: Output file not specified, using default $default_outfile\n") unless $outfile or $quiet;
+
+    # Check input file or glob
+    if (defined $input_glob) { # -infile/-blatfile: the name is anchored and handed to Utils::safe_glob together with its directory
+        if ($input_glob =~ /\//) { ($input_dir, $input_glob) = ($input_glob =~ /\A(.*\/)([^\/]+)\Z/); }
+        $input_glob .= "\$" unless $input_glob =~ /\$$/;
+        $input_glob = "^".$input_glob unless $input_glob =~ /^\^/;
+        @input_files = Utils::safe_glob($input_glob, $input_dir);
+    } elsif (@ARGV > 0) { # otherwise the remaining arguments are the input files
+        foreach my $file (@ARGV) {
+            if ($file =~ /\//) { ($input_dir, $file) = ($file =~ /\A(.*\/)([^\/]+)\Z/); }
+            push @input_files, $file;
+        }
+    } else { # TODO: split stdin for >2GB input
+        open(FH, "> $tmp_dir$tmp_prefix.in") or die("$0: Cannot open $tmp_dir$tmp_prefix.in for writing: $!"); # spool stdin into the working directory
+        print FH while <STDIN>;
+        close FH;
+        push @input_files, "$tmp_prefix.in";
+        $input_dir = $tmp_dir;
+    }
+    unless ($input_dir =~ /\A\//) { $input_dir = $mycwd."/".$input_dir; }
+    die("$0: No input files matching \"$input_dir$input_glob\" found. Stopped") unless @input_files > 0;
+    print "$0: ".@input_files." input file(s)\n" if $debug; # the array in scalar context gives the number of files
+
+    # Check output file
+    $outfile = $default_outfile unless $outfile;
+    if (-f $outfile and not $force_overwrite and -t STDERR) { # ask only when STDERR is a terminal
+        print STDERR "$0: $outfile exists. Overwrite? (y/N, '-f' to force) ";
+        my $overwrite = <STDIN>; chomp $overwrite;
+        (print("Move \"$outfile\" or use option '-f'.\n"), exit(1)) unless ($overwrite eq "Y" or $overwrite eq "y" or $overwrite eq "yes");
+    }
+    open(FH, "> ".$outfile) or die("$0: Cannot open $outfile for writing: $!"); # this truncates the file and proves it is writable
+    close FH;
+
+    # Check SLAGAN score file
+    $score_file = $default_score_file unless $score_file;
+    unless ($score_file =~ /\A\//) { $score_file = $mycwd."/".$score_file; }
+    $max_expand_len += $erode_align; # the erosion applied at load time is added to the expansion length
+    die("$0: max_length cannot be less than 0. Stopped") if $max_chainlen < 0;
+    $max_chainlen = 1000000000 if $max_chainlen == 0; # 0 means no chain length limit
+    $max_chainlen -= 2*$max_expand_len;
+    # SLAGAN output for a given sequence will be discarded if the total score for the sequence is below this threshold. Default value is the SLAGAN unrelated gap penalty.
+    $min_seq_score = getMinSeqScore($score_file) unless defined $min_seq_score;
+
+    # Connect to GPDB
+    if ($using_GP) {
+        $GPutils::Error = "";
+        $cfg = read_gp_config(Get_Abs_Path($cfg_file)) or die($GPutils::Error);
+        $server ||= $cfg->Get_Val("DB", "server"); # command line values win over the config file
+        $db ||= $cfg->Get_Val("DB", "main_db");
+        $dbh = GPDBI->connect($server, 0, $db, undef, undef, "gp_cgi", undef, {PrintError => 0, RaiseError => 1});
+    }
+}
+
+
+# Load sequence names and sizes either from GPDB or from file
+#
+# Arguments:
+#   $dbh           - database handle, handed to get_all_seqs() when $dataset is set
+#   $dataset       - when true, sizes are fetched with get_all_seqs($dbh, $dataset)
+#   $gen_size_file - path of a whitespace-separated "name size" file, one sequence per line
+# Returns whatever get_all_seqs() returns in dataset mode; otherwise a reference
+# to a hash mapping sequence name to size.
+# Dies if the size file cannot be opened or a line lacks a name or a size.
+sub getSeqSizes($$$) {
+    my ($dbh, $dataset, $gen_size_file) = @_;
+    return get_all_seqs($dbh, $dataset) if $dataset;
+
+    # A lexical handle keeps this sub from clobbering the package-wide FH handle
+    # (the other subs in this file localise it), and the three-argument open
+    # keeps mode characters in the file name inert.
+    my %sizes;
+    open(my $size_fh, "<", $gen_size_file) or die("$0: Could not open file $gen_size_file for reading: ".$!);
+    while (<$size_fh>) {
+        chomp;
+        my ($seq, $size) = split;
+        die("$0: Invalid format in file $gen_size_file") unless $seq and $size;
+        $sizes{$seq} = $size;
+    }
+    close $size_fh;
+    return \%sizes;
+}
+
+
+# Convert BLAT to CHAOS if necessary
+# Flip hits on circular sequence if necessary
+#
+# Takes no arguments; works on the file-level settings $input_dir, @input_files,
+# $tmp_dir, $proflip, $erode_align, $sizes1 and $sizes2.
+# For every input file a "<file>.chaos" is created in $tmp_dir: BLAT input is
+# converted with awk (scores weighted by $c1..$c4), anything else is symlinked.
+# With $proflip set, the hits of the first input file are additionally rewritten
+# to "<file>.flipped.chaos", and every flipped hit is recorded in %flipped_aligns
+# under its alignHashID() so that the chainers can flip it back later.
+sub prepareHits() {
+    my ($cur_align);
+    local (*FH, *OUT1);
+
+    print "$0: Preparing files...\n" unless $quiet;
+    $input_is_blat = 1 if isBLAT($input_dir.$input_files[0]);
+
+    if ($input_is_blat) {
+        foreach my $file (@input_files) {
+            system('awk \'{$13=($13+$15)?$13:1; print $1,$2,$3";",$5,$6,$7"; '.
+                'score = "' . $c1 . '*$8-' . $c2 . '*$9-' . $c3 . '*($12+$14)-' . $c4 .
+                '*log($13+$15),"("$4")"}\''.
+                "< $input_dir$file > $tmp_dir$file.chaos");
+        }
+    } else {
+        foreach my $file (@input_files) {
+            system('ln -s "'.$input_dir.$file.'" "'.$tmp_dir.$file.'.chaos"');
+        }
+    }
+
+    if ($proflip) {
+        open(FH, "< ".$tmp_dir.$input_files[0].".chaos") or die("$0: Could not open file ".$tmp_dir.$input_files[0].".chaos for reading: ".$!);
+        open(OUT1, "> ".$tmp_dir.$input_files[0].".flipped.chaos") or die("$0: Could not open file ".$tmp_dir.$input_files[0].".flipped.chaos for writing: ".$!);
+
+        my (@seq1s, @seq1e, @seq2s, @seq2e, @scores, @orientations);
+        my ($seq1center, $seq2center, $seq1median, $seq2median);
+        my $i = 0;
+        while (<FH>) {
+            # Skip lines that do not parse as a CHAOS hit; without this guard the
+            # capture variables would still hold the values of an earlier match.
+            next unless /\A[\s]*.*\s([\d]+)\s([\d]+)\;\s.*\s([\d]+)\s([\d]+)\;\sscore\s\=\s([e\d\.\+\-]+)\s\(([\+\-]+)\)/;
+            ($seq1s[$i], $seq1e[$i], $seq2s[$i], $seq2e[$i], $scores[$i], $orientations[$i]) = ($1, $2, $3, $4, $5, $6);
+            # Normalise both intervals so that start <= end
+            if ($seq1s[$i] > $seq1e[$i]) { my $j = $seq1s[$i]; $seq1s[$i] = $seq1e[$i]; $seq1e[$i] = $j; }
+            if ($seq2s[$i] > $seq2e[$i]) { my $j = $seq2s[$i]; $seq2s[$i] = $seq2e[$i]; $seq2e[$i] = $j; }
+            $i++;
+        }
+
+        # For each interval pair,
+        # if the seq1 interval midpoint is greater than the seq1 center, and the corresponding interval midpoint in seq2 is less than the seq2 center,
+        # OR if the seq1 interval midpoint is less than the seq1 center, and the corresponding interval midpoint in seq2 is greater than the seq2 center,
+        #   set start of interval in seq2 to 2*seq2center - previous end of interval
+        #   set end of interval in seq2 to 2*seq2center - previous start of interval
+        #   flip the orientation (+/-)
+        $seq1center = $$sizes1{(keys(%$sizes1))[0]} / 2;
+        $seq2center = $$sizes2{(keys(%$sizes2))[0]} / 2;
+        my $flip_counter = 0;
+        foreach $i (0..@seq1s-1) {
+            $seq1median = ($seq1s[$i] + $seq1e[$i]) / 2;
+            $seq2median = ($seq2s[$i] + $seq2e[$i]) / 2;
+            if (($seq1median > $seq1center and $seq2median < $seq2center)
+                or ($seq1median < $seq1center and $seq2median > $seq2center)) {
+                my $j = $seq2s[$i];
+                $seq2s[$i] = (2 * $seq2center) - $seq2e[$i];
+                $seq2e[$i] = (2 * $seq2center) - $j;
+                if ($orientations[$i] eq "+") { $orientations[$i] = "-"; } else { $orientations[$i] = "+"; }
+
+                # Remember the flipped hit, with the same erosion runSLAGAN() applies
+                # to its coordinates, so the chainers can look it up by hash ID.
+                $cur_align = [];
+                $$cur_align[START1] = $seq1s[$i]; $$cur_align[START2] = $seq2s[$i];
+                $$cur_align[END1] = $seq1e[$i]; $$cur_align[END2] = $seq2e[$i];
+                $$cur_align[SCORE] = $scores[$i]; $$cur_align[ORIENT] = $orientations[$i];
+                $$cur_align[SEQ1] = (keys(%$sizes1))[0]; $$cur_align[SEQ2] = (keys(%$sizes2))[0];
+                $$cur_align[START1] += $erode_align; $$cur_align[END1] -= $erode_align;
+                $$cur_align[START2] += $erode_align; $$cur_align[END2] -= $erode_align;
+                $flipped_aligns{alignHashID($cur_align)} = $cur_align;
+                $flip_counter++;
+            }
+            print OUT1 "seq1 ".$seq1s[$i]." ".$seq1e[$i]."; seq2 ".$seq2s[$i]." ".$seq2e[$i]."; score = ".$scores[$i]." (".$orientations[$i].")\n";
+        }
+        close FH; close OUT1;
+        print "$0: Single-sequence flip mode: ".($flip_counter+0)." hits flipped\n" if $debug;
+    }
+}
+
+
+# Sort all input hits with external sort processes, then write the hits for each sequence into a file
+# Run SLAGAN on each of these files, via worker instances either on the cluster or sequentially
+#
+# Takes no arguments and returns nothing useful; works on file-level settings
+# ($tmp_dir, @input_files, $proflip, $erode_align, $sizes1, $sizes2, $score_file,
+# $SLAGAN, $supermapexec, $no_clust_run, $nodelete, ...).
+# Steps, in order:
+#   1. pipe every "<file>[.flipped].chaos" through a sort on seq1 name and start1;
+#   2. parse each sorted hit, drop hits with score <= 0, fix up coordinates, erode;
+#   3. pipe the surviving hits into two further sorts (keyed on seq1 and on seq2);
+#   4. let reprintInputHits() split both sorted streams into per-sequence files
+#      and JOB<n>.tar archives;
+#   5. write CLUSTER_JOB_PARAMS and run the jobs locally or through clust_run;
+#   6. unpack the JOB<n>.results.tar archives and remove intermediate files.
+sub runSLAGAN() {
+ my ($clust_run_invoke, $num_jobs, $sort_pid1, $sort_pid2, $sort_pid3, $one_seq_mode,
+ $cur_align, $next_align, $curlen1, $curlen2, $nextlen1, $nextlen2, $overlap1, $overlap2, $dump_count);
+ local (*RH1, *WH1, *RH2, *WH2, *RH3, *WH3, *IN, *DUPES);
+# my $filter = Desoverlap->new($overlap_factor, $debug);
+
+ print "$0: Sorting input hits...\n" if $debug;
+ open(DUPES, "> supermap.duplicates") if $debug;
+
+ # One-sequence mode: both genomes consist of exactly one sequence
+ $one_seq_mode = 1 if (keys(%$sizes1) == 1 and keys(%$sizes2) == 1);
+
+ # Three external sort processes, each driven through its own read/write handle pair
+ $sort_pid1 = open2(\*RH1, \*WH1, "sort --key=1,1 --key=2,2n"); # pre-scan
+ $sort_pid2 = open2(\*RH2, \*WH2, "sort --key=1,1 --key=2,2n"); # gen1base
+ $sort_pid3 = open2(\*RH3, \*WH3, "sort --key=4,4 --key=5,5n"); # gen2base
+
+ # Sort input on seq1
+ # NOTE(review): the open of IN is not checked; a missing file is silently skipped
+ foreach my $file (@input_files) {
+ open(IN, "< $tmp_dir$file".($proflip?".flipped":"").".chaos");
+ print WH1 while <IN>;
+ close IN;
+ }
+ close WH1;
+
+ # Scan input, check if start2, end2 are ascending for sorting, erode alignments
+ while (<RH1>) {
+ # NOTE(review): the match result is not checked before $1..$8 are used
+ /\A[\s]*(.*)\s([\d]+)\s([\d]+)\;\s(.*)\s([\d]+)\s([\d]+)\;\sscore\s\=\s([e\d\.\+\-]+)\s\(([\+\-]+)\)/o;
+
+ $next_align=[];
+ ($$next_align[SEQ1], $$next_align[START1], $$next_align[END1], $$next_align[SEQ2], $$next_align[START2], $$next_align[END2], $$next_align[SCORE], $$next_align[ORIENT])
+ = ($1, $2, $3, $4, $5, $6, $7, $8);
+ # Hits with a non-positive score are dropped
+ next if $$next_align[SCORE] <= 0;
+ # In one-sequence mode the names in the hit are replaced by the single known sequence names
+ if ($one_seq_mode) { $$next_align[SEQ1] = (keys(%$sizes1))[0]; $$next_align[SEQ2] = (keys(%$sizes2))[0]; }
+ checkAlignCoords($next_align);
+
+ # Shrink both intervals by $erode_align at each end, unless either interval is too short for that
+ unless ($$next_align[END1]-$$next_align[START1] <= $erode_align*2 or $$next_align[END2]-$$next_align[START2] <= $erode_align*2) {
+ $$next_align[START1] += $erode_align; $$next_align[END1] -= $erode_align;
+ $$next_align[START2] += $erode_align; $$next_align[END2] -= $erode_align;
+ }
+
+ # The POD section below (=head1 ... =cut) keeps the old pairwise overlap scan in the file but disabled
+=head1
+ # Overlap scan
+ if ($$next_align[START1] <= $$cur_align[END1] and $$next_align[END1] >= $$cur_align[START1] # overlap in seq1
+ and $$next_align[START2] <= $$cur_align[END2] and $$next_align[END2] >= $$cur_align[START2] # overlap in seq2
+ and $$cur_align[SEQ1] eq $$next_align[SEQ1] and $$cur_align[SEQ2] eq $$next_align[SEQ2]
+ and $$cur_align[ORIENT] eq $$next_align[ORIENT]) {
+ ($curlen1, $curlen2, $nextlen1, $nextlen2)
+ = ($$cur_align[END1] - $$cur_align[START1] + 1, $$cur_align[END2] - $$cur_align[START2] + 1,
+ $$next_align[END1] - $$next_align[START1] + 1, $$next_align[END2] - $$next_align[START2] + 1);
+
+ if ($$next_align[START1] <= $$cur_align[START1] and $$next_align[END1] >= $$cur_align[END1]) {
+ $overlap1 = $$cur_align[END1] - $$cur_align[START1] + 1; # next covers cur
+ } elsif ($$next_align[START1] <= $$cur_align[START1]) {
+ $overlap1 = $$next_align[END1] - $$cur_align[START1] + 1; # next is to the left
+ } elsif ($$next_align[END1] >= $$cur_align[END1]) {
+ $overlap1 = $$cur_align[END1] - $$next_align[START1] + 1; # next is to the right
+ } else {
+ $overlap1 = $$next_align[END1] - $$next_align[START1] + 1; # cur covers next
+ }
+ if ($$next_align[START2] <= $$cur_align[START2] and $$next_align[END2] >= $$cur_align[END2]) {
+ $overlap2 = $$cur_align[END2] - $$cur_align[START2] + 1;
+ } elsif ($$next_align[START2] <= $$cur_align[START2]) {
+ $overlap2 = $$next_align[END2] - $$cur_align[START2] + 1;
+ } elsif ($$next_align[END2] >= $$cur_align[END2]) {
+ $overlap2 = $$cur_align[END2] - $$next_align[START2] + 1;
+ } else {
+ $overlap2 = $$next_align[END2] - $$next_align[START2] + 1;
+ }
+ die("$0: Bad internal state") if $overlap1 < 0 or $overlap2 < 0;
+
+ if (($overlap1 / $curlen1 > $overlap_factor) and ($overlap2 / $curlen2 > $overlap_factor)
+ and $$cur_align[SCORE] <= $$next_align[SCORE]) {
+ $dump_count++;
+ print DUPES "Cur: (".$$cur_align[START1]."-".$$cur_align[END1].")(".$$cur_align[START2]."-".$$cur_align[END2].") ".$$cur_align[SCORE]." over with (".$$next_align[START1]."-".$$next_align[END1].")(".$$next_align[START2]."-".$$next_align[END2].") ".$$next_align[SCORE]."\n" if $debug;
+ $cur_align = $next_align; next; # discard current align
+ } elsif (($overlap1 / $nextlen1 > $overlap_factor) and ($overlap2 / $nextlen2 > $overlap_factor)
+ and $$cur_align[SCORE] >= $$next_align[SCORE]) {
+ $dump_count++;
+ print DUPES "Nxt: (".$$next_align[START1]."-".$$next_align[END1].")(".$$next_align[START2]."-".$$next_align[END2].") ".$$next_align[SCORE]." over with (".$$cur_align[START1]."-".$$cur_align[END1].")(".$$cur_align[START2]."-".$$cur_align[END2].") ".$$cur_align[SCORE]."\n" if $debug;
+ next; # discard next align
+ }
+ }
+=cut
+ # With the filter disabled, this loop runs once per hit and sends it unchanged to both sorts
+ foreach my $cur_align ($next_align){ # (@{$filter->put($next_align)}) {
+ print WH2 $$cur_align[SEQ1]." ".$$cur_align[START1]." ".$$cur_align[END1]."; ".$$cur_align[SEQ2]." ".$$cur_align[START2]." ".$$cur_align[END2]."; "."score = ".$$cur_align[SCORE]." (".$$cur_align[ORIENT].")\n";
+ print WH3 $$cur_align[SEQ1]." ".$$cur_align[START1]." ".$$cur_align[END1]."; ".$$cur_align[SEQ2]." ".$$cur_align[START2]." ".$$cur_align[END2]."; "."score = ".$$cur_align[SCORE]." (".$$cur_align[ORIENT].")\n";
+ }
+
+# print WH2 $$cur_align[SEQ1]." ".$$cur_align[START1]." ".$$cur_align[END1]."; ".$$cur_align[SEQ2]." ".$$cur_align[START2]." ".$$cur_align[END2]."; "."score = ".$$cur_align[SCORE]." (".$$cur_align[ORIENT].")\n" if @$cur_align;
+# print WH3 $$cur_align[SEQ1]." ".$$cur_align[START1]." ".$$cur_align[END1]."; ".$$cur_align[SEQ2]." ".$$cur_align[START2]." ".$$cur_align[END2]."; "."score = ".$$cur_align[SCORE]." (".$$cur_align[ORIENT].")\n" if @$cur_align;
+# $cur_align = $next_align;
+ }
+# $filter->printAll();
+ # Flush alignments remaining in filter buffer
+# foreach my $cur_align (@{$filter->getBuffer()}) {
+# print WH2 $$cur_align[SEQ1]." ".$$cur_align[START1]." ".$$cur_align[END1]."; ".$$cur_align[SEQ2]." ".$$cur_align[START2]." ".$$cur_align[END2]."; "."score = ".$$cur_align[SCORE]." (".$$cur_align[ORIENT].")\n" if $cur_align != 0;
+# print WH3 $$cur_align[SEQ1]." ".$$cur_align[START1]." ".$$cur_align[END1]."; ".$$cur_align[SEQ2]." ".$$cur_align[START2]." ".$$cur_align[END2]."; "."score = ".$$cur_align[SCORE]." (".$$cur_align[ORIENT].")\n" if $cur_align != 0;
+# }
+
+ close RH1; waitpid $sort_pid1, 0;
+
+ # Closing each write handle lets that sort emit its output, which reprintInputHits() then
+ # splits into per-sequence files and job archives. Job ids start at 1 for base genome 1,
+ # and base genome 2 continues from the id returned by the first call.
+ close WH2;
+ $num_jobs = reprintInputHits(1, 1, \*RH2);
+ close RH2; waitpid $sort_pid2, 0;
+
+ close WH3;
+ $num_jobs = reprintInputHits(2, $num_jobs, \*RH3);
+ close RH3; waitpid $sort_pid3, 0;
+
+ close DUPES if defined fileno DUPES;
+# print STDERR "$0: Warning: ".$filter->{dump_count}." near duplicate alignments discarded (overlap factor $overlap_factor)\n" if $filter->{dump_count} and not $quiet;
+
+ # One parameter line per job; $num_jobs is the next unused job id, so jobs run 1..$num_jobs-1
+ # NOTE(review): FH is not in the local() list above, so this uses the package-wide FH handle
+ open(FH, "> ".$tmp_dir."CLUSTER_JOB_PARAMS") or die;
+ foreach my $i (1..$num_jobs-1) {
+ print FH "worker JOB".$i.".tar ".$score_file." ".$SLAGAN.($debug ? " -v" : "");
+ print FH " << JOB$i.tar > CLUSTER_JOB_MESSAGES.$i >> CLUSTER_JOB_ERRMSG.$i" unless $no_clust_run;
+ print FH "\n";
+ }
+ close FH;
+
+ if ($no_clust_run) {
+ # Local mode: run $supermapexec once per parameter line, from inside $tmp_dir
+ open(FH, "< ".$tmp_dir."CLUSTER_JOB_PARAMS") or die;
+ print "$0: Running ".($num_jobs-1)." SLAGAN jobs locally...\n" unless $quiet;
+ while (<FH>) {
+ chomp;
+ print("Job $.: \"$0 $_\"\n") if $debug;
+ system("cd $tmp_dir; $supermapexec ".$_);
+ }
+ close FH;
+ } else {
+ # Cluster mode: hand the parameter file to clust_run and wait for it to finish
+ $clust_run_invoke = "clust_run -program=".$supermapexec." -parameters=".$tmp_dir."CLUSTER_JOB_PARAMS -init_dir=$tmp_dir -wait";
+ print "$0: Running ".($num_jobs-1)." distributed SLAGAN jobs with clust_run...\n" unless $quiet;
+ print "$0: \"$clust_run_invoke\"\n" if $debug;
+
+ # $clust_run_pid is not declared in this sub; it is set while the child runs and cleared afterwards
+ if ($clust_run_pid = fork()) { # I am the parent
+ waitpid($clust_run_pid, 0);
+ } elsif (not defined $clust_run_pid) {
+ die("$0: Could not fork");
+ } else { # I am the child
+ die("$0: Could not exec \"$clust_run_invoke\"") unless exec($clust_run_invoke);
+ }
+ undef $clust_run_pid;
+ }
+
+ # Unpack each job's results into $tmp_dir, then drop the archives and job logs unless $nodelete is set
+ foreach my $i (1..$num_jobs-1) {
+ system("cd $tmp_dir; tar -xf ".$tmp_dir."JOB".$i.".results.tar");
+ unlink $tmp_dir."JOB".$i.".tar" unless $nodelete;
+ unlink $tmp_dir."JOB".$i.".results.tar" unless $nodelete;
+ unlink $tmp_dir."CLUSTER_JOB_MESSAGES.$i" unless $nodelete;
+ unlink $tmp_dir."CLUSTER_JOB_ERRMSG.$i" unless $nodelete;
+ }
+
+ unlink "$tmp_dir$input_glob.chaos" unless $nodelete;
+ unlink $tmp_dir."CLUSTER_JOB_PARAMS" unless $nodelete;
+
+ foreach my $file (@input_files) {
+ unlink $tmp_dir.$file.".chaos" unless $nodelete;
+ }
+}
+
+
+# Print one hit to $FH in CHAOS format, with the base genome's interval first.
+#   $base_gen - 1 or 2: which genome's sequence leads the output line
+#   $align    - alignment array reference (SEQ1, START1, END1, SEQ2, START2, END2, SCORE, ORIENT)
+#   $FH       - output handle
+# For "-" hits the start and end of the non-base interval are written in swapped order.
+# Dies on any other base genome / orientation combination.
+sub reprintInputHit($$$) {
+    my ($base_gen, $align, $FH) = @_;
+    my $strand = $$align[ORIENT];
+
+    unless (($base_gen == 1 or $base_gen == 2) and ($strand eq "+" or $strand eq "-")) {
+        die("$0: Bad internal state from hit ".$$align[SEQ1]." ".$$align[START1]." ".$$align[END1]."; ".$$align[SEQ2]." ".$$align[START2]." ".$$align[END2]."; "."score = ".$$align[SCORE]." (".$$align[ORIENT].")");
+    }
+
+    # Pick which side leads the line
+    my ($lead_seq, $lead_from, $lead_to, $peer_seq, $peer_from, $peer_to);
+    if ($base_gen == 1) {
+        ($lead_seq, $lead_from, $lead_to) = ($$align[SEQ1], $$align[START1], $$align[END1]);
+        ($peer_seq, $peer_from, $peer_to) = ($$align[SEQ2], $$align[START2], $$align[END2]);
+    } else {
+        ($lead_seq, $lead_from, $lead_to) = ($$align[SEQ2], $$align[START2], $$align[END2]);
+        ($peer_seq, $peer_from, $peer_to) = ($$align[SEQ1], $$align[START1], $$align[END1]);
+    }
+
+    # Reverse-strand hits list the non-base interval end first
+    ($peer_from, $peer_to) = ($peer_to, $peer_from) if $strand eq "-";
+
+    print $FH $lead_seq." ".$lead_from." ".$lead_to."; ".$peer_seq." ".$peer_from." ".$peer_to."; "."score = ".$$align[SCORE]." (".$strand.")\n";
+}
+
+
+# Append the files named by the keys of %$seq_list to JOB<job_id>.tar in $tmp_dir,
+# then delete them unless $nodelete is set.
+#   $job_id   - number used in the archive name
+#   $seq_list - hash reference whose keys are the paths of the files to archive
+# Dies if the tar pipeline cannot be started; warns if it exits abnormally.
+sub writeJobFile($$) {
+    my ($job_id, $seq_list) = @_;
+    local *LIST;
+    my @files = sort alnum keys(%$seq_list);
+
+    open(LIST, "| cd $tmp_dir; xargs tar --append --file=".$tmp_dir."JOB".$job_id.".tar")
+        or die("$0: Could not start tar for JOB".$job_id.".tar: ".$!);
+    foreach my $file (@files) {
+        # tar runs inside $tmp_dir, so it is given the bare file name. A path
+        # without a slash is passed through as is instead of reusing a stale $1.
+        my $name = ($file =~ /\/([^\/]+)$/) ? $1 : $file;
+        print LIST $name." ";
+    }
+    # Closing a pipe waits for the child; a false result means it failed.
+    close LIST or warn("$0: Warning: building JOB".$job_id.".tar failed (exit status ".$?.")");
+
+    unless ($nodelete) {
+        unlink $_ foreach @files;
+    }
+}
+
+
+# Separate input into files based on sequence name and reverse order in gen2base hits
+#
+# Arguments:
+#   $base_gen - 1 or 2: which genome's sequence name decides the output file
+#   $job_id   - first job number to use for the JOB<n>.tar archives
+#   $RH       - handle delivering the sorted hits, one CHAOS line each
+# Returns the next unused job number.
+# Side effects: writes "<first input file>.gen<N>base.<seq>.chaos" files in $tmp_dir,
+# packs them into job archives through writeJobFile(), and replaces the file-level
+# $sizes1 (or $sizes2) with a hash holding only the sequences seen in the hits.
+sub reprintInputHits($$$) {
+ my ($base_gen, $job_id, $RH) = @_;
+ my ($one_seq_mode, $line_count, $prev_seq, $cur_seq, $cur_align);
+ my (%cur_seq_list, %pruned_sizes);
+ local (*OUT, *LIST);
+
+ # NOTE(review): $one_seq_mode is set here but not read again in this sub
+ $one_seq_mode = 1 if (keys(%$sizes1) == 1 and keys(%$sizes2) == 1);
+
+ print "$0: Reprinting hits (base genome $base_gen)..." if $debug;
+
+ $line_count = 0;
+ while (<$RH>) {
+ # NOTE(review): the match result is not checked before $1..$8 are used
+ /\A[\s]*(.*)\s([\d]+)\s([\d]+)\;\s(.*)\s([\d]+)\s([\d]+)\;\sscore\s\=\s([e\d\.\+\-]+)\s\(([\+\-]+)\)/o;
+
+ $cur_align=[];
+ ($$cur_align[SEQ1], $$cur_align[START1], $$cur_align[END1], $$cur_align[SEQ2], $$cur_align[START2], $$cur_align[END2], $$cur_align[SCORE], $$cur_align[ORIENT])
+ = ($1, $2, $3, $4, $5, $6, $7, $8);
+
+ $cur_seq = ($base_gen == 1 ? $$cur_align[SEQ1] : $$cur_align[SEQ2]);
+
+ # A change of base sequence name starts a new output file
+ if ($cur_seq ne $prev_seq) {
+ $pruned_sizes{$cur_seq} = ($base_gen == 1 ? $$sizes1{$cur_seq} : $$sizes2{$cur_seq});
+ print " ".$cur_seq if $debug;
+ close OUT if defined fileno OUT;
+ open(OUT, "> ".$tmp_dir.$input_files[0].".gen".$base_gen."base.".$cur_seq.".chaos") or die("$0: Could not open file ".$tmp_dir.$input_files[0].".gen".$base_gen."base.".$cur_seq.".chaos for writing: ".$!);
+ # Jobs are only closed at sequence boundaries: once the files collected so far
+ # hold more than $max_job_size lines they are archived and a new job is started
+ if ($line_count > $max_job_size) {
+ writeJobFile($job_id, \%cur_seq_list);
+ undef %cur_seq_list; $line_count = 0; $job_id++;
+ }
+ # The file just opened is added only after that check, so it belongs to the current job
+ $cur_seq_list{$tmp_dir.$input_files[0].".gen".$base_gen."base.".$cur_seq.".chaos"} = 1;
+ }
+ reprintInputHit($base_gen, $cur_align, \*OUT) if @$cur_align;
+
+ $prev_seq = $cur_seq;
+# $cur_align = $next_align;
+ $line_count++;
+ }
+
+# reprintInputHit($base_gen, $next_align, \*OUT) if @$next_align;
+ # Archive whatever is left as the final job
+ # NOTE(review): OUT is still open while writeJobFile() archives the last file
+ writeJobFile($job_id, \%cur_seq_list);
+ $job_id++;
+
+ close OUT;
+ print "\n" if $debug;
+ $sizes1 = \%pruned_sizes if $base_gen == 1;
+ $sizes2 = \%pruned_sizes if $base_gen == 2;
+ return $job_id;
+}
+
+
+# Decide whether a sequence's SLAGAN output falls below the score cutoff.
+#   $line - one line of SLAGAN output, of the form
+#           "(s e)=(s e) score orient [total] s1:... s2:..."
+# Returns true when the bracketed total score is less than $min_seq_score.
+# Dies when the line does not have that form.
+sub seqBelowMinScore($) {
+    my ($line) = @_;
+    # Take the captures from the match's own return list: after a failed match
+    # $2 could otherwise still hold a value left by an earlier successful match.
+    my (undef, $total_score) = ($line =~ /\A[\s]*\([\d]+\s[\d]+\)\=\([\d]+\s[\d]+\)\s([\d\.\-]+)\s[\+\-]+\s\[([\d\.\-]+)\][\s]*s1\:.*[\s]*s2\:.*\n\Z/);
+    die("$0: Unable to extract score values from SLAGAN output:\n$line") if not defined $total_score;
+    return ($total_score < $min_seq_score);
+}
+
+# Collect the per-sequence SLAGAN outputs ("*.chaos.glocal-out") and turn them into chains.
+# Takes no arguments. Works in four passes, each through an external sort:
+#   1. gen2base outputs are regrouped into one "sorted-gen1" file per name found after "s2:";
+#   2. gen1base outputs are sorted and fed to chainBase1Hits(), which also emits
+#      the hash IDs of 2M alignments;
+#   3. those hash IDs are sorted and split into one "hashesDM.gen2" file per sequence;
+#   4. gen2base outputs are sorted and fed to chainBase2Hits().
+# Sequences whose first output line scores below $min_seq_score are skipped.
+# Finally removeSLAGANOutput() is called and a warning reports the number of dropped sequences.
+sub processResults() {
+ my ($cur_seq, $input_prefix, $dropped_seqs, $sort_pid, $sort_pid2);
+ local (*RH, *WH, *IN, *OUT, *hashesDM_RH, *hashesDM_WH);
+ print "$0: Loading SLAGAN output...\n" unless $quiet;
+ open(GLOCAL_OUT_LOG, "> ".$glocal_out_logfile) if $glocal_out_logfile;
+
+ # Sort gen2base aligns on seq1, then seq2, then start2, then print them to separate files, one file per gen1 seq
+ # These files will be loaded on demand when scanning gen1base aligns (chainBase1Hits())
+ $sort_pid = open2(\*RH, \*WH, "sort --key=9,9 --key=7,7 --key=1.2,1n"); # input is base 2, key is 9 because a space is expected between s2: and seq2name
+ $input_prefix = $tmp_dir.$input_files[0].".gen2base";
+ foreach my $seq (sort alnum keys(%$sizes2)) {
+ # A sequence without an output file is removed from the size table
+ open(IN, "< $input_prefix.$seq.chaos.glocal-out") or (delete($$sizes2{$seq}), next);
+ my $line = <IN>;
+ die("$0: Empty SLAGAN output file $input_prefix.$seq.chaos.glocal-out, check corresponding job logs. Stopped") unless $line;
+ # NOTE(review): $1 in the message below is not set by seqBelowMinScore()'s match (captures are
+ # scoped to that sub), so the printed score is probably not the one that was tested - confirm
+ # NOTE(review): this "next" leaves IN open until the next open() reuses the handle
+ if (seqBelowMinScore($line)) { print "$0: Discarding file $input_prefix.$seq.chaos.glocal-out - score too low ($1<$min_seq_score)\n" if $debug; next; }
+ seek IN, 0, 0; # back to start
+ print WH while <IN>;
+ close IN;
+ }
+ close WH or die("$0: Error executing sort");
+ while (<RH>) {
+ # The name after "s2:" selects the output file
+ /\ss2\:[\s]*([^\s]+)[\s]*\n\Z/;
+ if ($1 ne $cur_seq or not defined $cur_seq) {
+ next unless $1;
+ close OUT if defined fileno OUT;
+ $cur_seq = $1;
+ open(OUT, "> $input_prefix.sorted-gen1.$cur_seq.chaos.glocal-out") or die("$0: Could not open file $input_prefix.sorted-gen1.$cur_seq.chaos.glocal-out for writing: ".$!);
+ }
+ print OUT $_;
+ }
+ close RH; close OUT if defined fileno OUT;
+ waitpid $sort_pid, 0;
+
+ # Sort gen1base aligns on seq1, then start1
+ $sort_pid = open2(\*RH, \*WH, "sort --key=7,7 --key=1.2,1n"); # input is base 1
+ $input_prefix = $tmp_dir.$input_files[0].".gen1base";
+ foreach my $seq (sort alnum keys(%$sizes1)) {
+ open(IN, "< $input_prefix.$seq.chaos.glocal-out") or (delete($$sizes1{$seq}), next);
+ my $line = <IN>;
+ if (seqBelowMinScore($line)) { $dropped_seqs++; print "$0: Discarding file $input_prefix.$seq.chaos.glocal-out - score too low ($1<$min_seq_score)\n" if $debug; next; }
+ seek IN, 0, 0; # back to start
+ print WH while <IN>;
+ # Optionally keep a copy of the raw gen1base output in the log file
+ if ($glocal_out_logfile) { seek IN, 0, 0; print GLOCAL_OUT_LOG while <IN>; }
+ close IN;
+ unlink "$input_prefix.$seq.chaos.glocal-out" unless $nodelete;
+ }
+ unlink $input_prefix.".chaos" unless $nodelete;
+ close WH or die("$0: Error executing sort");
+
+ # Feed the gen1base aligns to the 2M/1M1 chain scanner (chainBase1Hits())
+ # The hashesDM handle is used to write 2M aligns' hashes to be sorted in seq2 order
+ print "$0: Generating supermonotonic map...\n" unless $quiet;
+ $sort_pid2 = open2(\*hashesDM_RH, \*hashesDM_WH, "sort --key=2,2");
+ chainBase1Hits(*RH, *hashesDM_WH);
+ close RH;
+ waitpid $sort_pid, 0;
+ close hashesDM_WH or die("$0: Error executing sort");
+
+ # Print sorted 2M aligns' hashes, one file per gen2 seq
+ undef $cur_seq;
+ while(<hashesDM_RH>) {
+ my $line = $_;
+ # The second whitespace-separated field is the gen2 sequence name
+ $line =~ /\A[^\s]+\s([^\s]+)\s[^\s]+\n\Z/;
+ if ($1 ne $cur_seq or not defined $cur_seq) {
+ close OUT if defined fileno OUT;
+ $cur_seq = $1;
+ open(OUT, "> $tmp_dir".$input_files[0].".hashesDM.gen2.$cur_seq") or die("$0: Could not open file $tmp_dir".$input_files[0].".hashesDM.gen2.$cur_seq for writing: ".$!);
+ }
+ print OUT $line;
+ }
+ close hashesDM_RH;
+ waitpid $sort_pid2, 0;
+
+ # Sort gen2base aligns on seq2, then start2
+ $sort_pid = open2(\*RH, \*WH, "sort --key=7,7 --key=1.2,1n"); # input is base 2
+ $input_prefix = $tmp_dir.$input_files[0].".gen2base";
+ foreach my $seq (sort alnum keys(%$sizes2)) {
+ open(IN, "< $input_prefix.$seq.chaos.glocal-out") or next;
+ my $line = <IN>;
+ if (seqBelowMinScore($line)) { $dropped_seqs++; print "$0: Discarding file $input_prefix.$seq.chaos.glocal-out - score too low ($1<$min_seq_score)\n" if $debug; next; }
+ seek IN, 0, 0; # back to start
+ print WH while <IN>;
+ close IN;
+ unlink "$input_prefix.$seq.chaos.glocal-out" unless $nodelete;
+ }
+ unlink $input_prefix.".chaos" unless $nodelete;
+ close WH or die("$0: Error executing sort");
+
+ # Feed the gen2base aligns to the 1M2 chain scanner (chainBase2Hits())
+ chainBase2Hits(*RH);
+ close RH;
+ waitpid $sort_pid, 0;
+
+ close GLOCAL_OUT_LOG if defined fileno GLOCAL_OUT_LOG;
+
+ removeSLAGANOutput();
+ print STDERR "$0: Warning: Alignments for $dropped_seqs sequences discarded due to total score below cutoff ($min_seq_score)\n" if $dropped_seqs and not $quiet;
+}
+
+
+# Delete the per-sequence SLAGAN output files of both base genomes and the
+# matching ".chaos" files (skipped when $nodelete is set), then try to remove
+# the temporary directory itself.
+sub removeSLAGANOutput() {
+    unless ($nodelete) {
+        my $gen1_prefix = $tmp_dir.$input_files[0].".gen1base";
+        unlink "$gen1_prefix.$_.chaos.glocal-out" foreach sort alnum keys(%$sizes1);
+        unlink $gen1_prefix.".chaos";
+
+        my $gen2_prefix = $tmp_dir.$input_files[0].".gen2base";
+        unlink "$gen2_prefix.$_.chaos.glocal-out" foreach sort alnum keys(%$sizes2);
+        unlink $gen2_prefix.".chaos";
+    }
+
+    # rmdir is attempted in either case
+    rmdir $tmp_dir;
+}
+
+
+# Build the string key that identifies an alignment:
+# "<seq1>:<start1>-<end1>=<seq2>:<start2>-<end2>".
+# (An earlier numeric formula, 23*START1 + 41*START2 + 61*END1 + 83*END2, is no longer used.)
+sub alignHashID($) {
+    my ($align) = @_;
+    my $side1 = $$align[SEQ1].":".$$align[START1]."-".$$align[END1];
+    my $side2 = $$align[SEQ2].":".$$align[START2]."-".$$align[END2];
+    return $side1."=".$side2;
+}
+
+
+# The chain writer lags the chainer by two chains because the full contents of neighboring chains must be known.
+#
+# Writes one line describing $cur_chain to $FH: the ORIGIN of its first alignment,
+# the number of alignments, then coordinates, sequence names, orientation and score
+# of the first and of the last alignment. With $print_chains set, the coordinates of
+# every alignment in the chain are appended.
+# Does nothing when $cur_chain is undefined. $prev_chain and $next_chain are accepted but not used.
+sub printChainToTemp($$$$) {
+    my ($FH, $prev_chain, $cur_chain, $next_chain) = @_;
+    if (not defined $cur_chain) { return; }
+
+    my $origin = ${$$cur_chain[0]}[ORIGIN];
+    my $head = $$cur_chain[0];
+    my $tail = $$cur_chain[@$cur_chain-1];
+
+    # Same field layout for the first and the last alignment of the chain
+    my @summary = ($origin, scalar(@$cur_chain));
+    foreach my $end_align ($head, $tail) {
+        push(@summary,
+            $$end_align[START1], $$end_align[END1], $$end_align[START2], $$end_align[END2],
+            $$end_align[SEQ1], $$end_align[SEQ2], $$end_align[ORIENT], $$end_align[SCORE]);
+    }
+    print $FH join(" ", @summary);
+
+    if ($print_chains) {
+        foreach my $member (@$cur_chain) {
+            print $FH " ".join(" ", $$member[START1], $$member[END1], $$member[START2], $$member[END2]);
+        }
+    }
+    print $FH "\n";
+}
+
+
+# Chain the gen1base SLAGAN alignments into 2M and 1M1 chains.
+#   $FH       - handle delivering the sorted SLAGAN output lines, i.e. (start1 end1)=(start2 end2)...
+#   $hashesDM - handle that receives "seq1<TAB>seq2<TAB>hash ID" for every 2M alignment
+# For each seq1 the matching gen2base hashes are loaded with loadBase2Hashes(); an
+# alignment found there gets ORIGIN 2 (2M), any other gets ORIGIN 1 (1M1).
+# Chains are written with printChainToTemp() to "<first input file>.2MM1.<seq1>" in $tmp_dir.
+# Dies if a chain file cannot be opened; warns about the number of unusable alignments.
+sub chainBase1Hits($$) {
+    my ($FH, $hashesDM) = @_;
+    local *OUT;
+    my ($cur_align, $prev_align, $cur_chain, $prev_chain, $pre_prev_chain, $chain_start_2M, $chain_start_1M1,
+        $cur_seq, $align_peers, $flip_counter);
+    my @bad_aligns; my %base2peers;
+
+    while (<$FH>) {
+        /\A[\s]*\(([\d]+)\s([\d]+)\)\=\(([\d]+)\s([\d]+)\)\s([\d\.\-]+)\s([\+\-]+)\s\[([\d\.\-]+)\][\s]*s1\:(.*)[\s]*s2\:(.*)\n\Z/;
+
+        next if ($1==$2); # skip null alignments
+        (push(@bad_aligns, $_), next) unless $1 and $2 and $3 and $4 and $5 and $6;
+
+        $cur_align = [];
+        ($$cur_align[START1], $$cur_align[END1], $$cur_align[START2], $$cur_align[END2], $$cur_align[SCORE], $$cur_align[ORIENT], $$cur_align[TOTSC], $$cur_align[SEQ1], $$cur_align[SEQ2])
+            = ($1, $2, $3, $4, $5, $6, $7, $8, $9);
+        $$cur_align[SEQ1] =~ s/^\s+//; $$cur_align[SEQ1] =~ s/\s+$//;
+        $$cur_align[SEQ2] =~ s/^\s+//; $$cur_align[SEQ2] =~ s/\s+$//;
+#warn("Seen: ".$_) if $$cur_align[SEQ1] eq "AC002301.1";
+        checkAlignCoords($cur_align);
+
+        # Undo the flip applied in prepareHits(): mirror the seq2 interval around
+        # the sequence center again and restore the orientation
+        if ($proflip and defined $flipped_aligns{alignHashID($cur_align)}) {
+            my $seq2center = $$sizes2{(keys(%$sizes2))[0]} / 2;
+            my $j = $$cur_align[START2];
+            $$cur_align[START2] = (2 * $seq2center) - $$cur_align[END2];
+            $$cur_align[END2] = (2 * $seq2center) - $j;
+            if ($$cur_align[ORIENT] eq "+") { $$cur_align[ORIENT] = "-"; } else { $$cur_align[ORIENT] = "+"; }
+            $$cur_align[FLIPPED]=1;
+            $flip_counter++;
+        }
+
+        $$cur_align[HASHID] = alignHashID($cur_align);
+
+        # New seq1: flush the two pending chains, reset the chainer, switch output file
+        if ($$cur_align[SEQ1] ne $cur_seq) {
+#warn("Handling seq trans") if $prev_align and $$prev_align[SEQ1] eq "AC002301.1";
+            printChainToTemp(*OUT, $pre_prev_chain, $prev_chain, $cur_chain);# unless defined $cur_seq;
+            printChainToTemp(*OUT, $prev_chain, $cur_chain, undef);# unless defined $cur_seq;
+
+            undef $chain_start_2M; undef $chain_start_1M1; undef $prev_align;
+            undef $pre_prev_chain; undef $prev_chain; undef $cur_chain;
+            $cur_seq = $$cur_align[SEQ1];
+            %base2peers = %{loadBase2Hashes($tmp_dir.$input_files[0].".gen2base.sorted-gen1.$cur_seq.chaos.glocal-out")};
+            close OUT if defined fileno OUT;
+            # Fail loudly: printing to an unopened handle would silently lose every chain of this sequence
+            open(OUT, "> ".$tmp_dir.$input_files[0].".2MM1.$cur_seq") or die("$0: Could not open file ".$tmp_dir.$input_files[0].".2MM1.$cur_seq for writing: ".$!);
+        }
+
+        $align_peers = $base2peers{$$cur_align[HASHID]};
+        $$cur_align[ORIGIN] = defined($align_peers) ? 2 : 1;
+
+        if ($chain_start_2M and defined $align_peers and defined $prev_align # continue open 2M chain
+            and (($$cur_align[ORIENT] eq "+" and $$cur_align[START2] > $$prev_align[END2]
+            and $$prev_align[HASHID] eq $$align_peers[0])
+            or ($$cur_align[ORIENT] eq "-" and $$cur_align[END2] < $$prev_align[START2]
+            and $$prev_align[HASHID] eq $$align_peers[1])
+            or ($$cur_align[FLIPPED] and ($$cur_align[ORIENT] eq "+" and $$cur_align[START2] < $$prev_align[END2]
+            and $$prev_align[HASHID] eq $$align_peers[0])
+            or ($$cur_align[ORIENT] eq "-" and $$cur_align[END2] > $$prev_align[START2]
+            and $$prev_align[HASHID] eq $$align_peers[1])))
+            and $$cur_align[ORIENT] eq $$prev_align[ORIENT]
+            and $$cur_align[FLIPPED] eq $$prev_align[FLIPPED]
+            and $$cur_align[SEQ2] eq $$prev_align[SEQ2]
+            and ($$cur_align[START1] > $$prev_align[END1] or ($$cur_align[FLIPPED] and $$cur_align[START1] > $$prev_align[END1]))
+            and abs($$cur_align[END1] - $$chain_start_2M[START1]) < $max_chainlen
+            and abs($$cur_align[END2] - $$chain_start_2M[START2]) < $max_chainlen
+#and abs($$cur_align[END1] - $$chain_start_2M[START1])/abs($$cur_align[END2] - $$chain_start_2M[START2]) < $max_asym
+#and abs($$cur_align[END2] - $$chain_start_2M[START2])/abs($$cur_align[END1] - $$chain_start_2M[START1]) < $max_asym
+            ) {
+            push(@$cur_chain, $cur_align);
+            print $hashesDM $$cur_align[SEQ1]."\t".$$cur_align[SEQ2]."\t".$$cur_align[HASHID]."\n";
+        } elsif (defined $align_peers) { # start new 2M chain
+            printChainToTemp(*OUT, $pre_prev_chain, $prev_chain, $cur_chain);
+            $chain_start_2M = $cur_align; undef $chain_start_1M1;
+            $pre_prev_chain = $prev_chain; $prev_chain = $cur_chain;
+            $cur_chain = [$cur_align];
+            print $hashesDM $$cur_align[SEQ1]."\t".$$cur_align[SEQ2]."\t".$$cur_align[HASHID]."\n";
+        } elsif ($chain_start_1M1 and defined $prev_align # continue open 1M1 chain
+            and ((($$cur_align[ORIENT] eq "+" and $$cur_align[START2] > $$prev_align[END2])
+            or ($$cur_align[ORIENT] eq "-" and $$cur_align[END2] < $$prev_align[START2]))
+            or ($$cur_align[FLIPPED] and (($$cur_align[ORIENT] eq "+" and $$cur_align[START2] < $$prev_align[END2])
+            or ($$cur_align[ORIENT] eq "-" and $$cur_align[END2] > $$prev_align[START2]))))
+            and $$cur_align[ORIENT] eq $$prev_align[ORIENT]
+            and $$cur_align[FLIPPED] eq $$prev_align[FLIPPED]
+            and $$cur_align[SEQ2] eq $$prev_align[SEQ2]
+            and ($$cur_align[START1] > $$prev_align[END1] or ($$cur_align[FLIPPED] and $$cur_align[START1] > $$prev_align[END1]))
+            and abs($$cur_align[END1] - $$chain_start_1M1[START1]) < $max_chainlen
+            and abs($$cur_align[END2] - $$chain_start_1M1[START2]) < $max_chainlen
+#and abs($$cur_align[END1] - $$chain_start_1M1[START1])/abs($$cur_align[END2] - $$chain_start_1M1[START2]) < $max_asym
+#and abs($$cur_align[END2] - $$chain_start_1M1[START2])/abs($$cur_align[END1] - $$chain_start_1M1[START1]) < $max_asym
+            ) {
+            push(@$cur_chain, $cur_align);
+        } else { # start new 1M1 chain
+            printChainToTemp(*OUT, $pre_prev_chain, $prev_chain, $cur_chain);
+            $chain_start_1M1 = $cur_align; undef $chain_start_2M;
+            $pre_prev_chain = $prev_chain; $prev_chain = $cur_chain;
+            $cur_chain = [$cur_align];
+        }
+        $prev_align = $cur_align;
+    }
+    # Flush the two chains the writer is still lagging behind
+    printChainToTemp(*OUT, $pre_prev_chain, $prev_chain, $cur_chain);
+    printChainToTemp(*OUT, $prev_chain, $cur_chain, undef);
+    print "$0: Single-sequence flip mode: ".($flip_counter+0)." gen1base hits backflipped\n" if $debug and $proflip;
+    warn "$0: Warning: ".@bad_aligns." bad SLAGAN alignments discarded" if @bad_aligns > 0;
+}
+
+
+# Input is base 2, i.e. (start2 end2)=(start1 end1)...
+#
+# Chain the gen2base SLAGAN alignments into 1M2 chains.
+#   $FH - handle delivering the sorted SLAGAN output lines
+# For each seq2 the hash IDs of 2M alignments are loaded with load2MHashes(); an
+# alignment found there gets ORIGIN 2 and only closes the open 1M2 chain, any other
+# gets ORIGIN 3 (1M2). Only chains whose first alignment has ORIGIN 3 are written,
+# with printChainToTemp(), to "<first input file>.M2.<seq2>" in $tmp_dir.
+# Dies if a chain file cannot be opened; warns about the number of unusable alignments.
+sub chainBase2Hits($) {
+    my ($FH) = @_;
+    local *OUT;
+    my ($cur_align, $prev_align, $cur_chain, $prev_chain, $pre_prev_chain, $chain_start_2M, $chain_start_1M2,
+        $cur_seq, $flip_counter);
+    my @bad_aligns; my %aligns2M;
+
+    while(<$FH>) {
+        /\A[\s]*\(([\d]+)\s([\d]+)\)\=\(([\d]+)\s([\d]+)\)\s([\d\.\-]+)\s([\+\-]+)\s\[([\d\.\-]+)\][\s]*s1\:(.*)[\s]*s2\:(.*)\n\Z/;
+
+        next if ($1==$2); # skip null alignments
+        (push(@bad_aligns, $_), next) unless $1 and $2 and $3 and $4 and $5 and $6;
+
+        # The base genome is 2 here, so the first interval and the "s1:" name belong to seq2
+        $cur_align = [];
+        ($$cur_align[START2], $$cur_align[END2], $$cur_align[START1], $$cur_align[END1], $$cur_align[SCORE], $$cur_align[ORIENT], $$cur_align[TOTSC], $$cur_align[SEQ2], $$cur_align[SEQ1])
+            = ($1, $2, $3, $4, $5, $6, $7, $8, $9);
+        $$cur_align[SEQ1] =~ s/^\s+//; $$cur_align[SEQ1] =~ s/\s+$//;
+        $$cur_align[SEQ2] =~ s/^\s+//; $$cur_align[SEQ2] =~ s/\s+$//;
+        checkAlignCoords($cur_align);
+
+        # Undo the flip applied in prepareHits(): mirror the seq2 interval around
+        # the sequence center again and restore the orientation
+        if ($proflip and defined $flipped_aligns{alignHashID($cur_align)}) {
+            my $seq2center = $$sizes2{(keys(%$sizes2))[0]} / 2;
+            my $j = $$cur_align[START2];
+            $$cur_align[START2] = (2 * $seq2center) - $$cur_align[END2];
+            $$cur_align[END2] = (2 * $seq2center) - $j;
+            if ($$cur_align[ORIENT] eq "+") { $$cur_align[ORIENT] = "-"; } else { $$cur_align[ORIENT] = "+"; }
+            $$cur_align[FLIPPED] = 1;
+            $flip_counter++;
+        }
+
+        $$cur_align[HASHID] = alignHashID($cur_align);
+
+        # New seq2: flush pending 1M2 chains, reset the chainer, switch output file
+        if ($$cur_align[SEQ2] ne $cur_seq) {
+            printChainToTemp(*OUT, $pre_prev_chain, $prev_chain, $cur_chain) if $$prev_chain[0][ORIGIN] == 3;# and not defined $cur_seq;
+            printChainToTemp(*OUT, $prev_chain, $cur_chain, undef) if $$cur_chain[0][ORIGIN] == 3;# and not defined $cur_seq;
+            undef $chain_start_1M2; undef $prev_align;
+            undef $pre_prev_chain; undef $prev_chain; undef $cur_chain;
+            $cur_seq = $$cur_align[SEQ2];
+            %aligns2M = %{load2MHashes($tmp_dir.$input_files[0].".hashesDM.gen2.$cur_seq")};
+            close OUT if defined fileno OUT;
+            # Fail loudly: printing to an unopened handle would silently lose every chain of this sequence
+            open(OUT, "> ".$tmp_dir.$input_files[0].".M2.$cur_seq") or die("$0: Could not open file ".$tmp_dir.$input_files[0].".M2.$cur_seq for writing: ".$!);
+        }
+        $$cur_align[ORIGIN] = defined($aligns2M{$$cur_align[HASHID]}) ? 2 : 3;
+
+        if (defined $aligns2M{$$cur_align[HASHID]}) { # align is 2M
+            printChainToTemp(*OUT, $pre_prev_chain, $prev_chain, $cur_chain) if $$prev_chain[0][ORIGIN] == 3;
+            undef $chain_start_1M2; # close 1M2 chain
+            $chain_start_2M = $cur_align;
+            $pre_prev_chain = $prev_chain; $prev_chain = $cur_chain;
+            $cur_chain = [$cur_align];
+        } elsif ($chain_start_1M2 # continue open 1M2 chain
+            and ((($$cur_align[ORIENT] eq "+" and $$cur_align[START1] > $$prev_align[END1])
+            or ($$cur_align[ORIENT] eq "-" and $$cur_align[END1] < $$prev_align[START1]))
+            or ($$cur_align[FLIPPED] and (($$cur_align[ORIENT] eq "+" and $$cur_align[START1] < $$prev_align[END1])
+            or ($$cur_align[ORIENT] eq "-" and $$cur_align[END1] > $$prev_align[START1]))))
+            and $$cur_align[ORIENT] eq $$prev_align[ORIENT]
+            and $$cur_align[SEQ1] eq $$prev_align[SEQ1]
+            and $$cur_align[FLIPPED] == $$prev_align[FLIPPED]
+            and ($$cur_align[START2] > $$prev_align[END2] or ($$cur_align[FLIPPED] and $$cur_align[START2] < $$prev_align[END2]))
+            and abs($$cur_align[END1] - $$chain_start_1M2[START1]) < $max_chainlen
+            and abs($$cur_align[END2] - $$chain_start_1M2[START2]) < $max_chainlen
+#and abs($$cur_align[END1] - $$chain_start_1M2[START1])/abs($$cur_align[END2] - $$chain_start_1M2[START2]) < $max_asym
+#and abs($$cur_align[END2] - $$chain_start_1M2[START2])/abs($$cur_align[END1] - $$chain_start_1M2[START1]) < $max_asym
+            ) {
+            push(@$cur_chain, $cur_align);
+        } else { # start new 1M2 chain
+            printChainToTemp(*OUT, $pre_prev_chain, $prev_chain, $cur_chain) if $$prev_chain[0][ORIGIN] == 3;
+            $chain_start_1M2 = $cur_align;
+            $pre_prev_chain = $prev_chain; $prev_chain = $cur_chain;
+            $cur_chain = [$cur_align];
+        }
+        $prev_align = $cur_align;
+    }
+    # Flush the two chains the writer is still lagging behind
+    printChainToTemp(*OUT, $pre_prev_chain, $prev_chain, $cur_chain) if $$prev_chain[0][ORIGIN] == 3;
+    printChainToTemp(*OUT, $prev_chain, $cur_chain, undef) if $$cur_chain[0][ORIGIN] == 3;
+    print "$0: Single-sequence flip mode: ".($flip_counter+0)." gen2base hits backflipped\n" if $debug and $proflip;
+    warn "$0: Warning: ".@bad_aligns." bad SLAGAN alignments discarded" if @bad_aligns > 0;
+}
+
+
+# Input: file with lines of the form "seq1 seq2 hash" (seq2 should be the same per file)
+# Output: hash(key->align hash ID, value->1). Input file is deleted.
+ sub load2MHashes($) {
+     # Read a file of tab-separated "seq1 seq2 hash" lines and return a
+     # reference to a hash keyed by the third column (value 1 for every key).
+     # Returns a reference to an empty hash if the file cannot be opened.
+     # The input file is removed afterwards unless $nodelete is set.
+     my ($file) = @_;
+     my %hashes;
+     local *FH;
+     open(FH, "< $file") or return {};
+     while (<FH>) {
+         # Skip lines that do not have the expected shape.  After a failed
+         # match $1 is not reset, so using it unconditionally would register
+         # a stale or undefined key and could raise a bogus collision warning.
+         /\A[^\s]+\t[^\s]+\t([^\s]+)\n\Z/ or next;
+         warn("Hash collision in \"$_\" vs. \"".$hashes{$1}."\"") if defined $hashes{$1};
+         $hashes{$1} = 1;
+     }
+     close FH;
+     unlink $file unless $nodelete;
+     return \%hashes;
+ }
+
+
+# Input: file with gen2base alignments which should have the same seq1 ordered by start2 or not exist
+# Output: hash(key->align hash ID, value->[prev align hash ID, next align hash ID]). Input file is deleted.
+# Input is base 2, i.e. (start2 end2)=(start1 end1)...
+ sub loadBase2Hashes($) {
+     # Read gen2base alignments "(start2 end2)=(start1 end1) ... s1:<seq2> s2:<seq1>"
+     # and return a reference to a hash mapping each alignment's hash ID to
+     # [hash ID of the previous alignment (1 if there is none),
+     #  hash ID of the next alignment (undef for the last one)].
+     # Returns a reference to an empty hash if the file cannot be opened.
+     # The input file is removed afterwards unless $nodelete is set.
+     my ($file) = @_;
+     my ($prev_align, $cur_align, $next_align);
+     my %hashes;
+     local *FH;
+     open(FH, "< $file") or return {};
+     while (<FH>) { # Scan 1 line ahead because the next align must also be seen
+         # Skip unparsable lines: after a failed match $1..$6 are not reset, so
+         # using them would duplicate the previous alignment (or create an
+         # all-undef one) and corrupt the prev/next links.
+         /\A[\s]*\(([\d]+)\s([\d]+)\)\=\(([\d]+)\s([\d]+)\)\s.*s1\:(.*)[\s]*s2\:(.*)/ or next;
+
+         $next_align = [];
+         # Hits are gen2base
+         ($$next_align[START2], $$next_align[END2], $$next_align[START1], $$next_align[END1], $$next_align[SEQ2], $$next_align[SEQ1]) = ($1, $2, $3, $4, $5, $6);
+         checkAlignCoords($next_align);
+         $$next_align[SEQ1] =~ s/^\s+//; $$next_align[SEQ1] =~ s/\s+$//;
+         $$next_align[SEQ2] =~ s/^\s+//; $$next_align[SEQ2] =~ s/\s+$//;
+         $$next_align[HASHID] = alignHashID($next_align);
+         warn("LB2H: Hash collision in \"$_\"") if defined $cur_align and defined $hashes{$$cur_align[HASHID]};
+         # $next_align was just created, so its ID is always available here.
+         $hashes{$$cur_align[HASHID]} =
+             [$prev_align ? $$prev_align[HASHID] : 1,
+              $$next_align[HASHID]] if $cur_align;
+         $prev_align = $cur_align; $cur_align = $next_align;
+     }
+     # The final alignment has no successor.
+     $hashes{$$cur_align[HASHID]} = [$prev_align ? $$prev_align[HASHID] : 1, undef] if $cur_align;
+     close FH;
+     unlink $file unless $nodelete;
+     return \%hashes;
+ }
+
+
+# Load chained regions and expand them according to the expansion rules, then print them out and display some chain statistics
+ sub postProcessRegions() {
+ # Driver for the three expansion passes.  Chain records from the per-sequence
+ # temporary files are pushed through three external "sort" processes; between
+ # the sorts the records are rewritten by expandSeq1, expandSeq2 and
+ # finalExpand, and the last pass writes to $outfile.
+ local (*IN, *OUT, *RH1, *WH1, *RH2, *WH2, *RH3, *WH3);
+ # NOTE(review): apart from the three $sort_pid variables, the lexicals declared
+ # on the next two lines are not used anywhere in this sub.
+ my ($first_align, $last_align, $type, $num_aligns, $sort_pid1, $sort_pid2, $sort_pid3);
+ my (@line, @min_lengths, @max_lengths, @means, @pos_counts, @neg_counts);
+
+ $sort_pid1 = open2(\*RH1, \*WH1, "sort --key=7,7 --key=3,3n"); # sort on seq1, start1
+ $sort_pid2 = open2(\*RH2, \*WH2, "sort --key=8,8 --key=5,5n"); # sort on seq2, start2
+ $sort_pid3 = open2(\*RH3, \*WH3, "sort --key=7,7 --key=3,3n"); # sort on seq1, start1
+# open(WH1, "> ".$outfile) or die("$0: Could not open output file $outfile for writing: ".$!);
+
+ open(OUT, "> ".$outfile) or die("$0: Could not open output file $outfile for writing: ".$!);
+# open(OUT, "| sort --key=1,1 --key=2,2n > ".$outfile) or die("$0: Could not open output file $outfile for writing: ".$!);
+ # Feed every existing ".2MM1.<seq>" temp file (one per key of %$sizes1) to the first sort.
+ foreach my $seq (sort alnum keys %$sizes1) {
+ open(IN, "< ".$tmp_dir.$input_files[0].".2MM1.$seq") or next;
+ print WH1 while <IN>;
+ close IN;
+ unlink $tmp_dir.$input_files[0].".2MM1.$seq" unless $nodelete;
+ }
+
+ # The ".M2.<seq>" temp files (one per key of %$sizes2) go to the same first sort.
+ foreach my $seq (sort alnum keys %$sizes2) {
+ open(IN, "< ".$tmp_dir.$input_files[0].".M2.$seq") or next;
+ print WH1 while <IN>;
+ close IN;
+ unlink $tmp_dir.$input_files[0].".M2.$seq" unless $nodelete;
+ }
+
+ # Each write handle is closed before its sort's output is read; the sort
+ # child is reaped once its read handle has been closed.
+ close WH1;
+ expandSeq1(\*RH1, \*WH2);
+ close RH1; waitpid $sort_pid1, 0;
+ close WH2;
+ expandSeq2(\*RH2, \*WH3);
+ close RH2; waitpid $sort_pid2, 0;
+ close WH3;
+ finalExpand(\*RH3, \*OUT);
+ close RH3; waitpid $sort_pid3, 0;
+ close OUT;
+ }
+
+
+# Input: chains ordered by seq1, start1
+# Output: chains expanded on seq1
+ sub expandSeq1($$) {
+     # Read chain records from $RH and write them to $WH with four extra
+     # columns (CHALO1 CHAHI1 CHALO1E CHAHI1E) inserted after the 18 fixed
+     # columns.  Records whose first field is 3 (M2) are passed through with
+     # the placeholder "0 0 0 0".  All other records are handed to expSeq1Reg
+     # together with their neighbours, one record behind the read position.
+     my ($RH, $WH) = @_;
+     my ($first_align, $last_align, $type, $num_aligns, $cur_seq,
+         $prev_chain, $cur_chain, $next_chain);
+     my (@line);
+
+     while (<$RH>) {
+         chomp; @line = split;
+
+         # skip M2 regions
+         if ($line[0] == 3) {
+             # Localised so the caller's output field separator is restored
+             # when this block is left, instead of being reset to undef.
+             local $, = " ";
+             print $WH @line[0..17]; print $WH " 0 0 0 0 "; print $WH @line[18..$#line]; print $WH "\n";
+             next;
+         }
+
+         $prev_chain = $cur_chain;
+         $cur_chain = $next_chain;
+
+         $first_align = []; $last_align = [];
+         ($type, $num_aligns, $$first_align[START1], $$first_align[END1], $$first_align[START2], $$first_align[END2],
+          $$first_align[SEQ1], $$first_align[SEQ2],$$first_align[ORIENT], $$first_align[SCORE],
+          $$last_align[START1], $$last_align[END1], $$last_align[START2], $$last_align[END2],
+          $$last_align[SEQ1], $$last_align[SEQ2], $$last_align[ORIENT], $$last_align[SCORE]) = @line;
+
+         # Extent of the chain on seq1: lowest start and highest end of its two boundary alignments.
+         $$first_align[CHALO1] = ($$first_align[START1] < $$last_align[START1] ? $$first_align[START1] : $$last_align[START1]);
+         $$first_align[CHAHI1] = ($$first_align[END1] > $$last_align[END1] ? $$first_align[END1] : $$last_align[END1]);
+
+         my @saved_line = @line;
+         $next_chain = [$first_align, $last_align, $type, $num_aligns, \@saved_line];
+         next unless defined $cur_chain;
+
+         expSeq1Reg($WH, $prev_chain, $cur_chain, $next_chain, $cur_seq);
+# TODO
+# if ($cur_seq ne $$first_align[SEQ1]) {
+# undef $cur_chain;
+# $cur_seq = $$first_align[SEQ1];
+# }
+     }
+     # Flush the last pending chain.  Without the guard, an input with no
+     # non-M2 records would emit one record built entirely from undefined values.
+     expSeq1Reg($WH, $cur_chain, $next_chain, undef, $cur_seq) if defined $next_chain;
+ }
+
+
+ sub expSeq1Reg($$$$$) {
+ # Compute the expanded seq1 interval (CHALO1E..CHAHI1E) of $cur_chain from the
+ # gaps to $prev_chain and $next_chain, then print the chain record to $WH with
+ # CHALO1, CHAHI1, CHALO1E and CHAHI1E appended after the 18 fixed fields.
+ # Each chain is [first_align, last_align, type, num_aligns, \@original_fields].
+ my ($WH, $prev_chain, $cur_chain, $next_chain, $cur_seq) = @_;
+ my ($preexpand1, $postexpand1);
+
+ # Gap to the previous chain (or to coordinate 0), capped at $max_expand_len.
+ # A negative gap is also replaced by $max_expand_len.
+ $preexpand1 = $$cur_chain[0][CHALO1] - (defined $prev_chain ? $$prev_chain[0][CHAHI1] : 0);
+ $preexpand1 = $max_expand_len if $preexpand1 > $max_expand_len;
+#$preexpand1 = 0 if $preexpand1 < 0;
+ $preexpand1 = $max_expand_len if $preexpand1 < 0; # !!!
+ # Gap to the next chain; when $next_chain is undef the difference is negative
+ # and therefore also ends up as $max_expand_len.
+ $postexpand1 = $$next_chain[0][CHALO1] - $$cur_chain[0][CHAHI1];
+ $postexpand1 = $max_expand_len if $postexpand1 > $max_expand_len;
+#$postexpand1 = 0 if $postexpand1 < 0;
+ $postexpand1 = $max_expand_len if $postexpand1 < 0;
+#$postexpand1 = 0 if defined $prev_chain and $$prev_chain[0][CHAHI1] > $$cur_chain[0][CHAHI1]; # don't expand if covered by another align
+ # Clamp the expanded interval to [1, size of the seq1 sequence].
+ $$cur_chain[0][CHALO1E] = $$cur_chain[0][CHALO1] - $preexpand1;
+ $$cur_chain[0][CHALO1E] = 1 if $$cur_chain[0][CHALO1E] < 1;
+ $$cur_chain[0][CHAHI1E] = $$cur_chain[0][CHAHI1] + $postexpand1;
+ $$cur_chain[0][CHAHI1E] = $$sizes1{$$cur_chain[0][SEQ1]} if $$cur_chain[0][CHAHI1E] > $$sizes1{$$cur_chain[0][SEQ1]};
+
+ # NOTE(review): when $cur_seq arrives undefined it is set to the chain's own
+ # SEQ1 here, so the correction branch below is then never taken.
+ $cur_seq = $$cur_chain[0][SEQ1] if not defined $cur_seq;
+ if ($cur_seq ne $$cur_chain[0][SEQ1]) { # Correct upper expansion
+ $$cur_chain[0][CHAHI1E] = $$cur_chain[0][CHAHI1] + $max_expand_len;
+ $$cur_chain[0][CHAHI1E] = $$sizes1{$$cur_chain[0][SEQ1]} if $$cur_chain[0][CHAHI1E] > $$sizes1{$$cur_chain[0][SEQ1]};
+ }
+
+ print $WH $$cur_chain[2]." ".$$cur_chain[3]." ".
+ $$cur_chain[0][START1]." ".$$cur_chain[0][END1]." ".$$cur_chain[0][START2]." ".$$cur_chain[0][END2]." ".
+ $$cur_chain[0][SEQ1]." ".$$cur_chain[0][SEQ2]." ".$$cur_chain[0][ORIENT]." ".$$cur_chain[0][SCORE]." ".
+ $$cur_chain[1][START1]." ".$$cur_chain[1][END1]." ".$$cur_chain[1][START2]." ".$$cur_chain[1][END2]." ".
+ $$cur_chain[1][SEQ1]." ".$$cur_chain[1][SEQ2]." ".$$cur_chain[1][ORIENT]." ".$$cur_chain[1][SCORE]." ".
+ $$cur_chain[0][CHALO1]." ".$$cur_chain[0][CHAHI1]." ".$$cur_chain[0][CHALO1E]." ".$$cur_chain[0][CHAHI1E];
+
+ # Re-emit the trailing fields of the original record (index 18 onwards) in
+ # groups of four.  The first group is printed even if those fields are absent.
+ if ($print_chains) {
+ my $i = 18;
+ while (1) {
+ print $WH " ".${$$cur_chain[4]}[$i]." ".${$$cur_chain[4]}[$i+1]." ".${$$cur_chain[4]}[$i+2]." ".${$$cur_chain[4]}[$i+3];
+ last if @{$$cur_chain[4]} <= $i+4;
+ $i+=4;
+ }
+ }
+ print $WH "\n";
+ }
+
+
+# Input: chains ordered by seq2, start2
+# Output: chains expanded on seq1 and seq2 (final output)
+ sub expandSeq2($$) {
+     # Read chain records (already carrying the four seq1 columns) from $RH and
+     # write them to $WH with four more columns (CHALO2 CHAHI2 CHALO2E CHAHI2E)
+     # inserted after the first 22.  Records whose first field is 1 (M1) are
+     # passed through with the placeholder "0 0 0 0".  All other records are
+     # handed to expSeq2Reg together with their neighbours, one record behind
+     # the read position.
+     my ($RH, $WH) = @_;
+     my ($first_align, $last_align, $type, $num_aligns, $cur_seq,
+         $prev_chain, $cur_chain, $next_chain);
+     my (@line);
+
+     while (<$RH>) {
+         chomp; @line = split;
+
+         # skip M1 regions
+         if ($line[0] == 1) {
+             # Localised so the caller's output field separator is restored
+             # when this block is left, instead of being reset to undef.
+             local $, = " ";
+             print $WH @line[0..21]; print $WH " 0 0 0 0 "; print $WH @line[22..$#line]; print $WH "\n";
+             next;
+         }
+
+         $prev_chain = $cur_chain;
+         $cur_chain = $next_chain;
+
+         $first_align = []; $last_align = [];
+         ($type, $num_aligns, $$first_align[START1], $$first_align[END1], $$first_align[START2], $$first_align[END2],
+          $$first_align[SEQ1], $$first_align[SEQ2],$$first_align[ORIENT], $$first_align[SCORE],
+          $$last_align[START1], $$last_align[END1], $$last_align[START2], $$last_align[END2],
+          $$last_align[SEQ1], $$last_align[SEQ2], $$last_align[ORIENT], $$last_align[SCORE],
+          $$first_align[CHALO1], $$first_align[CHAHI1], $$first_align[CHALO1E], $$first_align[CHAHI1E]) = @line;
+
+         # Extent of the chain on seq2: lowest start and highest end of its two boundary alignments.
+         $$first_align[CHALO2] = ($$first_align[START2] < $$last_align[START2] ? $$first_align[START2] : $$last_align[START2]);
+         $$first_align[CHAHI2] = ($$first_align[END2] > $$last_align[END2] ? $$first_align[END2] : $$last_align[END2]);
+
+         my @saved_line = @line;
+         $next_chain = [$first_align, $last_align, $type, $num_aligns, \@saved_line];
+
+         next unless defined $cur_chain;
+         expSeq2Reg($WH, $prev_chain, $cur_chain, $next_chain, $cur_seq);
+# if ($cur_seq ne $$first_align[SEQ2]) {
+# undef $cur_chain;
+# $cur_seq = $$first_align[SEQ2];
+# }
+     }
+     # Flush the last pending chain.  Without the guard, an input with no
+     # non-M1 records would emit one record built entirely from undefined values.
+     expSeq2Reg($WH, $cur_chain, $next_chain, undef, $cur_seq) if defined $next_chain;
+ }
+
+
+ sub expSeq2Reg($$$$$) {
+ # Compute the expanded seq2 interval (CHALO2E..CHAHI2E) of $cur_chain from the
+ # gaps to its neighbours, re-balance the seq1 expansion against it using
+ # $expand_factor, and print the record to $WH with the four seq1 and four seq2
+ # range columns appended after the 18 fixed fields.
+ # Each chain is [first_align, last_align, type, num_aligns, \@original_fields].
+ my ($WH, $prev_chain, $cur_chain, $next_chain, $cur_seq) = @_;
+ my ($preexpand1, $postexpand1, $preexpand2, $postexpand2);
+
+ # Recover the seq1 expansion amounts already stored in the record.
+ $preexpand1 = $$cur_chain[0][CHALO1] - $$cur_chain[0][CHALO1E];
+ $postexpand1 = $$cur_chain[0][CHAHI1E] - $$cur_chain[0][CHAHI1];
+
+ # Lower seq2 expansion: gap to the previous chain, limited to
+ # $expand_factor times the seq1 expansion (except for type 3) and to
+ # $max_expand_len; a negative gap becomes $max_expand_len.
+ $preexpand2 = $$cur_chain[0][CHALO2] - (defined $prev_chain ? $$prev_chain[0][CHAHI2] : 0);
+ $preexpand2 = $preexpand1 * $expand_factor if $preexpand2 > $preexpand1 * $expand_factor and $$cur_chain[2] != 3;
+ $preexpand2 = $max_expand_len if $preexpand2 > $max_expand_len;
+#$preexpand2 = 0 if $preexpand2 < 0;
+ $preexpand2 = $max_expand_len if $preexpand2 < 0;
+ $preexpand1 = $preexpand2 * $expand_factor if $preexpand1 > $preexpand2 * $expand_factor and $$cur_chain[2] != 3;
+ $preexpand1 = $max_expand_len if $preexpand1 > $max_expand_len;
+
+ # Upper seq2 expansion, same rules against the next chain.
+ $postexpand2 = $$next_chain[0][CHALO2] - $$cur_chain[0][CHAHI2];
+ $postexpand2 = $postexpand1 * $expand_factor if $postexpand2 > $postexpand1 * $expand_factor and $$cur_chain[2] != 3;
+ $postexpand2 = $max_expand_len if $postexpand2 > $max_expand_len;
+#$postexpand2 = 0 if $postexpand2 < 0;
+ $postexpand2 = $max_expand_len if $postexpand2 < 0;
+ $postexpand1 = $postexpand2 * $expand_factor if $postexpand1 > $postexpand2 * $expand_factor and $$cur_chain[2] != 3;
+ $postexpand1 = $max_expand_len if $postexpand1 > $max_expand_len;
+
+ # Clamp the seq1 interval to [1, size of the seq1 sequence].
+ $$cur_chain[0][CHALO1E] = $$cur_chain[0][CHALO1] - $preexpand1;
+ $$cur_chain[0][CHALO1E] = 1 if $$cur_chain[0][CHALO1E] < 1;
+ $$cur_chain[0][CHAHI1E] = $$cur_chain[0][CHAHI1] + $postexpand1;
+ $$cur_chain[0][CHAHI1E] = $$sizes1{$$cur_chain[0][SEQ1]} if $$cur_chain[0][CHAHI1E] > $$sizes1{$$cur_chain[0][SEQ1]};
+
+ # Clamp the seq2 interval to [1, size of the seq2 sequence].
+ $$cur_chain[0][CHALO2E] = $$cur_chain[0][CHALO2] - $preexpand2;
+ $$cur_chain[0][CHALO2E] = 1 if $$cur_chain[0][CHALO2E] < 1;
+ $$cur_chain[0][CHAHI2E] = $$cur_chain[0][CHAHI2] + $postexpand2;
+ $$cur_chain[0][CHAHI2E] = $$sizes2{$$cur_chain[0][SEQ2]} if $$cur_chain[0][CHAHI2E] > $$sizes2{$$cur_chain[0][SEQ2]};
+ # NOTE(review): expandSeq2 never assigns the $cur_seq it passes in, so this
+ # comparison is against undef and the branch appears to run for every chain
+ # with a defined SEQ2, overriding the neighbour-based upper bound above.
+ # Confirm whether that is intended.
+ if ($cur_seq ne $$cur_chain[0][SEQ2]) { # Correct upper expansion
+ $postexpand2 = $postexpand1 * $expand_factor;
+ $postexpand2 = $max_expand_len if $postexpand2 > $max_expand_len;
+ $postexpand2 = 0 if $postexpand2 < 0;
+ $$cur_chain[0][CHAHI2E] = $$cur_chain[0][CHAHI2] + $postexpand2;
+ $$cur_chain[0][CHAHI2E] = $$sizes2{$$cur_chain[0][SEQ2]} if $$cur_chain[0][CHAHI2E] > $$sizes2{$$cur_chain[0][SEQ2]};
+ }
+
+ print $WH $$cur_chain[2]." ".$$cur_chain[3]." ".
+ $$cur_chain[0][START1]." ".$$cur_chain[0][END1]." ".$$cur_chain[0][START2]." ".$$cur_chain[0][END2]." ".
+ $$cur_chain[0][SEQ1]." ".$$cur_chain[0][SEQ2]." ".$$cur_chain[0][ORIENT]." ".$$cur_chain[0][SCORE]." ".
+ $$cur_chain[1][START1]." ".$$cur_chain[1][END1]." ".$$cur_chain[1][START2]." ".$$cur_chain[1][END2]." ".
+ $$cur_chain[1][SEQ1]." ".$$cur_chain[1][SEQ2]." ".$$cur_chain[1][ORIENT]." ".$$cur_chain[1][SCORE]." ".
+ $$cur_chain[0][CHALO1]." ".$$cur_chain[0][CHAHI1]." ".$$cur_chain[0][CHALO1E]." ".$$cur_chain[0][CHAHI1E]." ".
+ $$cur_chain[0][CHALO2]." ".$$cur_chain[0][CHAHI2]." ".$$cur_chain[0][CHALO2E]." ".$$cur_chain[0][CHAHI2E];
+ # Re-emit the trailing fields of the original record (index 22 onwards) in
+ # groups of four.  The first group is printed even if those fields are absent.
+ if ($print_chains) {
+ my $i = 22;
+ while (1) {
+ print $WH " ".${$$cur_chain[4]}[$i]." ".${$$cur_chain[4]}[$i+1]." ".${$$cur_chain[4]}[$i+2]." ".${$$cur_chain[4]}[$i+3];
+ last if @{$$cur_chain[4]} <= $i+4;
+ $i+=4;
+ }
+ }
+ print $WH "\n";
+ }
+
+
+ sub finalExpReg($$$$$) {
+ # Finish the expansion of one chain and print its final output line to $WH:
+ # "seq1 lo1 hi1 seq2 lo2 hi2 orient", optionally followed by a type/count
+ # summary and the individual alignment coordinates.
+ # $prev_chain, $next_chain and $cur_seq are accepted but not used here.
+ my ($WH, $prev_chain, $cur_chain, $next_chain, $cur_seq) = @_;
+ my ($preexpand1, $postexpand1, $preexpand2, $postexpand2);
+ # Type 1 and type 3 chains were expanded on one sequence only; derive the
+ # other sequence's expansion as $expand_factor times it, capped at
+ # $max_expand_len and clamped to [1, sequence size].  Other types are
+ # printed with the ranges they already carry.
+ if ($$cur_chain[2] == 1) { # M1: expand in seq1 on seq2 expands * factor only
+ $preexpand1 = $$cur_chain[0][CHALO1] - $$cur_chain[0][CHALO1E];
+ $preexpand2 = $preexpand1 * $expand_factor;
+ $preexpand2 = $max_expand_len if $preexpand2 > $max_expand_len;
+ $postexpand1 = $$cur_chain[0][CHAHI1E] - $$cur_chain[0][CHAHI1];
+ $postexpand2 = $postexpand1 * $expand_factor;
+ $postexpand2 = $max_expand_len if $postexpand2 > $max_expand_len;
+ $$cur_chain[0][CHALO2E] = $$cur_chain[0][CHALO2] - $preexpand2;
+ $$cur_chain[0][CHALO2E] = 1 if $$cur_chain[0][CHALO2E] < 1;
+ $$cur_chain[0][CHAHI2E] = $$cur_chain[0][CHAHI2] + $postexpand2;
+ $$cur_chain[0][CHAHI2E] = $$sizes2{$$cur_chain[0][SEQ2]} if $$cur_chain[0][CHAHI2E] > $$sizes2{$$cur_chain[0][SEQ2]};
+ } elsif ($$cur_chain[2] == 3) { # M2: expand in seq2 on seq1 expands * factor only
+ $preexpand2 = $$cur_chain[0][CHALO2] - $$cur_chain[0][CHALO2E];
+ $preexpand1 = $preexpand2 * $expand_factor;
+ $preexpand1 = $max_expand_len if $preexpand1 > $max_expand_len;
+ $postexpand2 = $$cur_chain[0][CHAHI2E] - $$cur_chain[0][CHAHI2];
+ $postexpand1 = $postexpand2 * $expand_factor;
+ $postexpand1 = $max_expand_len if $postexpand1 > $max_expand_len;
+ $$cur_chain[0][CHALO1E] = $$cur_chain[0][CHALO1] - $preexpand1;
+ $$cur_chain[0][CHALO1E] = 1 if $$cur_chain[0][CHALO1E] < 1;
+ $$cur_chain[0][CHAHI1E] = $$cur_chain[0][CHAHI1] + $postexpand1;
+ $$cur_chain[0][CHAHI1E] = $$sizes1{$$cur_chain[0][SEQ1]} if $$cur_chain[0][CHAHI1E] > $$sizes1{$$cur_chain[0][SEQ1]};
+ }
+
+ print $WH $$cur_chain[0][SEQ1]." ".$$cur_chain[0][CHALO1E]." ".$$cur_chain[0][CHAHI1E]." ".
+ $$cur_chain[0][SEQ2]." ".$$cur_chain[0][CHALO2E]." ".$$cur_chain[0][CHAHI2E]." ".$$cur_chain[0][ORIENT];
+ # Type label: 1 => M1, 2 => DM, anything else => M2.
+ print $WH " (".($$cur_chain[2]==1?"M1, ":$$cur_chain[2]==2?"DM, ":"M2, ").$$cur_chain[3]." aligns)" unless $no_aligntotals;
+ # Trailing fields of the original record (index 26 onwards) are printed in
+ # groups of four as "[a-b=c-d]".  The first group is printed even if those
+ # fields are absent.
+ if ($print_chains) {
+ my $i = 26;
+ while (1) {
+ print $WH " [".${$$cur_chain[4]}[$i]."-".${$$cur_chain[4]}[$i+1]."=".${$$cur_chain[4]}[$i+2]."-".${$$cur_chain[4]}[$i+3]."]";
+ last if @{$$cur_chain[4]} <= $i+4;
+ $i+=4;
+ }
+ }
+ print $WH "\n";
+ }
+
+
+ sub finalExpand($$) {
+     # Read fully annotated chain records from $RH, print each one in final
+     # form to $WH via finalExpReg, and, when $debug or $print_stats is set,
+     # print per-type chain statistics to STDOUT afterwards.
+     my ($RH, $WH) = @_;
+     my ($first_align, $last_align, $type, $num_aligns, $cur_seq,
+         $prev_chain, $cur_chain, $next_chain);
+     my %stats;
+     my (@line);
+
+     # Record one emitted chain in %stats.  The type, orientation and alignment
+     # count are all taken from the chain that was actually printed; previously
+     # the type and count of the record just read were combined with the
+     # orientation of the record before it, and the last chain was never counted.
+     my $tally = sub {
+         my ($chain) = @_;
+         return unless $debug or $print_stats;
+         my $label = $$chain[2] == 1 ? "M1" : $$chain[2] == 2 ? "DM" : "M2";
+         my $count = $$chain[3];
+         $stats{$label.($$chain[0][ORIENT] eq "+" ? "+" : "-")}++;
+         $stats{$label."min"} = $count if not defined $stats{$label."min"} or $stats{$label."min"} > $count;
+         $stats{$label."max"} = $count if not defined $stats{$label."max"} or $stats{$label."max"} < $count;
+         $stats{$label."mean"} += $count;
+     };
+
+     while (<$RH>) {
+         chomp; @line = split;
+
+         $prev_chain = $cur_chain;
+         $cur_chain = $next_chain;
+
+         $first_align = []; $last_align = [];
+         ($type, $num_aligns, $$first_align[START1], $$first_align[END1], $$first_align[START2], $$first_align[END2],
+          $$first_align[SEQ1], $$first_align[SEQ2],$$first_align[ORIENT], $$first_align[SCORE],
+          $$last_align[START1], $$last_align[END1], $$last_align[START2], $$last_align[END2],
+          $$last_align[SEQ1], $$last_align[SEQ2], $$last_align[ORIENT], $$last_align[SCORE],
+          $$first_align[CHALO1], $$first_align[CHAHI1], $$first_align[CHALO1E], $$first_align[CHAHI1E],
+          $$first_align[CHALO2], $$first_align[CHAHI2], $$first_align[CHALO2E], $$first_align[CHAHI2E]) = @line;
+
+         # Records that skipped one of the earlier passes carry "0 0 0 0" for
+         # that sequence; rebuild the unexpanded extent from the boundary alignments.
+         if ($type == 1) {
+             $$first_align[CHALO2] = ($$first_align[START2] < $$last_align[START2] ? $$first_align[START2] : $$last_align[START2]);
+             $$first_align[CHAHI2] = ($$first_align[END2] > $$last_align[END2] ? $$first_align[END2] : $$last_align[END2]);
+         } elsif ($type == 3) {
+             $$first_align[CHALO1] = ($$first_align[START1] < $$last_align[START1] ? $$first_align[START1] : $$last_align[START1]);
+             $$first_align[CHAHI1] = ($$first_align[END1] > $$last_align[END1] ? $$first_align[END1] : $$last_align[END1]);
+         }
+
+         my @saved_line = @line;
+         $next_chain = [$first_align, $last_align, $type, $num_aligns, \@saved_line];
+
+         next unless defined $cur_chain;
+
+         finalExpReg($WH, $prev_chain, $cur_chain, $next_chain, $cur_seq);
+         $tally->($cur_chain);
+
+         if ($cur_seq ne $$first_align[SEQ2]) {
+             undef $cur_chain;
+             $cur_seq = $$first_align[SEQ2];
+         }
+     }
+     # Flush the last pending chain.  Without the guard, empty input would emit
+     # one output line built entirely from undefined values.
+     if (defined $next_chain) {
+         finalExpReg($WH, $cur_chain, $next_chain, undef, $cur_seq);
+         $tally->($next_chain);
+     }
+
+     if ($debug or $print_stats) {
+         foreach my $i ("DM", "M1", "M2") {
+             $stats{$i."mean"} /= ($stats{$i."+"} + $stats{$i."-"}) unless ($stats{$i."+"} + $stats{$i."-"} == 0);
+             print $i.": ".($stats{$i."+"} + $stats{$i."-"})." chains (".$stats{$i."+"}."+, ".$stats{$i."-"}."-); ".
+                 "length min ".$stats{$i."min"}.", avg ".$stats{$i."mean"}.", max ".$stats{$i."max"}."\n";
+         }
+     }
+ }
+
+
+# Called only in a "$0 worker" invocation
+ sub workerRun($$$$) {
+     # Worker entry point: unpack the job tarball $tar_file into a private
+     # temporary directory, run $SLAGAN on every extracted file with
+     # $score_file, append the "*glocal-out" results to
+     # "<job>.results.tar" in the starting directory, then remove the
+     # temporary directory and any core files.
+     my ($tar_file, $score_file, $SLAGAN, $debug) = @_;
+     my ($tmp_dir, $io_dir) = ($worker_tmp_dir, getcwd);
+     local *FH;
+
+     mkdir($tmp_dir) or die("$0 (worker): Could not create directory $tmp_dir: ".$!);
+
+     copy($score_file, $tmp_dir);
+     # Reduce the score file to its last path component.  A substitution is
+     # used so that a name without any "/" is kept as is, instead of picking up
+     # a stale $1 from a failed match.
+     $score_file =~ s/.*\///;
+     $score_file = $tmp_dir.$score_file;
+
+     print("$0 (worker): Version ".$VERSION." started ".localtime()."\n") if $debug;
+     print("$0 (worker): Jobfile=$tar_file, scorefile=$score_file, tmpdir=$tmp_dir, iodir=$io_dir, SLAGAN=$SLAGAN\n") if $debug;
+
+     move($io_dir."/".$tar_file, $tmp_dir);
+     my @files = `cd $tmp_dir; tar -xvf $tar_file` or warn("$0 (worker): Error extracting $tar_file");
+     foreach my $file (@files) {
+         chomp $file;
+#print "$SLAGAN $tmp_dir$file $score_file > $tmp_dir$file.glocal-out 2> $tmp_dir$file.glocal-err\n";
+         system("$SLAGAN $tmp_dir$file $score_file ".
+                "> $tmp_dir$file.glocal-out ".
+                "2> $tmp_dir$file.glocal-err");
+     }
+
+     # Drop the ".tar" suffix.  Done as a substitution so a name without the
+     # suffix is left unchanged rather than replaced by a stale $1.
+     $tar_file =~ s/\.tar$//;
+     open(FH, "| cd $tmp_dir; xargs tar --append --file=$io_dir/$tar_file.results.tar");
+     foreach my $file (glob("$tmp_dir/*glocal-out")) {
+         # Only the base name is passed on, since tar runs inside $tmp_dir.
+         print FH $1." " if $file =~ /\/([^\/]+)$/;
+     }
+     close FH;
+
+     rmtree $tmp_dir;
+     # Count core files in the current directory as a crash indicator.
+     opendir(DIR, ".");
+     if (my @x = grep(/core\./,readdir(DIR))) {
+         warn("$0 (worker): WARNING: $SLAGAN crashed ".scalar(@x)." times");
+     }
+     closedir DIR;
+     unlink(glob("core.*")) unless $nodelete;
+ }
+
+
+# Interrupt handler
+ sub dequeueClustJobs($) {
+     # Signal handler: announce the signal, forward SIGQUIT to the clust_run
+     # child if there is one, remove job files unless debugging or $nodelete
+     # is set, and exit with status 1.
+     my ($signal_name) = @_;
+     print "\n$0: Received SIG".$signal_name.". Cleaning up... ";
+
+     # send SIGQUIT to clust_run so it can dequeue cluster jobs
+     kill "QUIT", $clust_run_pid if $clust_run_pid;
+
+     if (not ($debug or $nodelete)) {
+         print "Removing job files...";
+         foreach my $job (1..$num_jobs-1) {
+             foreach my $name ("JOB".$job.".tar",
+                               "JOB".$job.".results.tar",
+                               "CLUSTER_JOB_MESSAGES.$job",
+                               "CLUSTER_JOB_ERRMSG.$job") {
+                 unlink $tmp_dir.$name;
+             }
+         }
+
+         unlink "$tmp_dir$input_glob.chaos";
+         unlink $tmp_dir."CLUSTER_JOB_PARAMS";
+         if ($ARGV[0] eq "worker") {
+             rmtree($tmp_dir);
+         }
+     }
+     print "\n";
+     exit(1);
+ }
+
+
+# Retrieve sequence length data from GPDB
+ sub get_all_seqs($$) {
+     # Build and return a reference to a hash of sequence name => size for
+     # $genome, using the database handle $dbh.  Chromosome sizes are looked up
+     # only when the data set has an annotation database; contig sizes are
+     # always added.
+     my ($dbh, $genome) = @_;
+     my ($dset, $annot_db, $family);
+     my %sizes;
+
+     # A purely numeric genome is a data set id; anything else is resolved
+     # through get_family_dset.
+     if ($genome =~ /^\d+$/o) {
+         ($dset, $annot_db, $family) = ($genome + 0, ($dbh->get_data_set($genome))[4,14]);
+     }
+     else {
+         ($dset, $annot_db, $family) = ($dbh->get_family_dset($genome))[0,4,14];
+     }
+     print "$0: Genome $genome, dataset $dset, annotation db \"$annot_db\", family \"$family\"\n" if $debug;
+
+     if ($annot_db) {
+         my $chroms = $dbh->get_chroms(($dbh->get_data_set($dset))[2]);
+         foreach my $chrom (@$chroms) {
+             next unless $$chrom[1] == 1;
+             my $name = "chr".$$chrom[2];
+             my ($chr_id, $chr_type, $ctg_id, $size) =
+                 $dbh->find_seq($name, $dset, $annot_db);
+             $sizes{$name} = $size if $chr_id;
+         }
+     }
+
+     my $ctgs = $dbh->selectcol("SELECT name FROM dset$dset\_contigs " .
+         "WHERE name is not null and name != ? group by name", undef, "");
+     $sizes{$_} = $dbh->get_contig_size($dset, $_) foreach @$ctgs;
+     return \%sizes;
+ }
+
+
+ sub alnum {
+     # sort() comparator on $a and $b.  The common non-digit prefix is skipped;
+     # if both remainders start with digits they are compared numerically and
+     # then by the text that follows, otherwise the whole strings are compared
+     # with cmp.
+     my ($len_a, $len_b) = (length($a), length($b));
+     my $pos = 0;
+     while ($pos < $len_a and $pos < $len_b) {
+         my $ch_a = substr($a, $pos, 1);
+         my $ch_b = substr($b, $pos, 1);
+         last if $ch_a =~ /^\d/o or $ch_b =~ /^\d/o or $ch_a ne $ch_b;
+         $pos++;
+     }
+
+     my $rest_a = "";
+     my $rest_b = "";
+     $rest_a = substr($a, $pos) if $pos < $len_a;
+     $rest_b = substr($b, $pos) if $pos < $len_b;
+
+     my ($num_a, $tail_a) = ($rest_a =~ /^(\d+)(.*)$/o);
+     my ($num_b, $tail_b) = ($rest_b =~ /^(\d+)(.*)$/o);
+     if (defined($num_a) && defined($num_b)) {
+         return (($num_a <=> $num_b) || ($tail_a cmp $tail_b));
+     }
+     return ($a cmp $b);
+ }
+
+
+ sub isBLAT($) {
+     # Inspect the first line of $file: return 0 when it looks like
+     # "name n n; name n n; score...", 1 when it starts with
+     # "name n n name ", and die for anything else or an unreadable file.
+     my ($file) = @_;
+     local *FH;
+     open(FH, "< ".$file) or die("$0: Cannot open input file $file: ".$!);
+     my $first_line = <FH>;
+     close FH;
+
+     return 0 if $first_line =~ /\A.+\s[\d]+\s[\d]+\;\s.+\s[\d]+\s[\d]+\;\sscore/;
+     return 1 if $first_line =~ /\A[^\s]+\s[\d]+\s[\d]+\s[^\s]+\s/;
+     die("$0: Unknown input format in $file. Stopped");
+ }
+
+
+ sub getMinSeqScore($) {
+     # Scan the SLAGAN score file $file and return the first number of the
+     # second brace group on the last line that matches the "{+U+;...}{N ...}"
+     # pattern.  Dies if the file cannot be opened or no non-zero score is found.
+     my ($file) = @_;
+     my $score; local *FH;
+     open(FH, "< ".$file) or die("$0: Could not open SLAGAN scorefile $file: $!");
+     while (<FH>) {
+         # sample line: {+U+;+U-;-U+;-U-}{70000 0 0 0}
+         # $1 is only consulted when this line actually matched; otherwise it
+         # may still hold a capture from an unrelated earlier match.
+         $score = $1 if /\{\+U\+\;.+\}.*\{(\d+)\s.+\}/ and $1;
+     }
+     close FH;
+     die("$0: Could not determine min_seq_score from SLAGAN scorefile $file. Stopped") unless $score;
+     print "$0: min_seq_score: $score\n" if $debug;
+     return $score;
+ }
+
+
+ sub writeSizes($$) {
+     # Write the name => size hash referenced by $sizes to $outfile, one
+     # "name<TAB>size" line per key, in alnum order.
+     # The $sizes argument is used; the previous body ignored it and always
+     # wrote the global %$sizes1.
+     my ($sizes, $outfile) = @_; local *FH;
+     open(FH, "> ".$outfile) or die("$0: Could not open file $outfile for writing: ".$!);
+     foreach my $key (sort alnum keys %$sizes) {
+         print FH $key."\t".$$sizes{$key}."\n";
+     }
+     close FH;
+ }
+
+
+# Borrowed from if.pm to enable standalone conditional module loading on earlier versions of Perl
+ sub useIf($$) {
+ # Conditionally load a module: useIf(CONDITION, "Package::Name", ...).
+ # Does nothing when CONDITION is false.  Otherwise the package's file is
+ # required and, if the package can 'import', control is transferred to that
+ # import method.
+ my $method = 'import';
+ return unless shift; # CONDITION
+
+ # After the shift, @_ starts with the package name.
+ my $package = $_[0];
+ (my $file = $package.".pm") =~ s!::!/!g;
+ require $file;
+ my $method_entry_point = $package->can($method);
+ # goto &sub hands the current @_ (package name first) to import.
+ goto &$method_entry_point if $method_entry_point;
+ }
+
+
+ sub checkAlignCoords($) {
+     # Normalise an alignment in place so that START1 <= END1 and
+     # START2 <= END2, swapping each pair when it is reversed.
+     my ($align) = @_;
+     if ($$align[START1] > $$align[END1]) {
+         @$align[START1, END1] = @$align[END1, START1];
+     }
+     if ($$align[START2] > $$align[END2]) {
+         @$align[START2, END2] = @$align[END2, START2];
+     }
+
+# if ($$cur_align[OSTART1] > $$cur_align[OEND1]) { my $i = $$cur_align[OSTART1]; $$cur_align[OSTART1] = $$cur_align[OEND1]; $$cur_align[OEND1] = $i; }
+# if ($$cur_align[OSTART2] > $$cur_align[OEND2]) { my $i = $$cur_align[OSTART2]; $$cur_align[OSTART2] = $$cur_align[OEND2]; $$cur_align[OEND2] = $i; }
+ }
+
+
+=head1 NAME
+
+Supermap: Piecewise monotonic alignment map generator for shuffle-lagan
+
+=head1 SYNOPSIS
+
+supermap.pl (gen2=id | sizes2=filename) (gen1=id | sizes1=filename)
+[-infile=<file>] [-outfile=<file>] [-bacteria] [-score=filename] [-f]
+[file1 file2 ...]
+
+=head1 EXAMPLES
+
+supermap.pl -sizes1=human.sizes -sizes2=mouse.sizes hm.chr*.chaos
+
+=head1 DESCRIPTION
+
+Supermap is a whole-genome alignment map generator. It is an extension to the
+Shuffle-LAGAN suite (Brudno et al., 2003). Supermap removes the asymmetry between
+the query genomes by running multiple SLAGAN passes and combining them into a full
+two-genome alignment.
+
+To run Supermap without the Berkeley Genome Pipeline functionality, you will need
+sequence length files for each of the genomes. Each file should contain one sequence
+length entry per line, of the form "sequence_name sequence_length".
+
+In the CHAOS output format (this program's input), negative orientation always means second pair of coords is inverted.
+In this program's output, negative orientation does not invert coordinates (coordinate pairs are always ascending).
+
+Run supermap.pl with no arguments to see a further description.
+
+The terms "hit" and "anchor" usually refer to local alignments produced by CHAOS or another program.
+The term "chain" refers to an extended union of a number of these local alignments.
+
+=head1 DEPENDENCIES
+
+Supermap depends on Utils.pm, SLAGAN, and a number of Unix utilities.
+
+To use the Berkeley Genome Pipeline and cluster functionality, Supermap needs
+GPutils.pm, GPDBI.pm, and clust_run.
+
+=head1 LIMITATIONS
+
+Supermap is designed to allow the manipulation of large datasets in a reasonable memory footprint.
+To do this, it allows multiple files on input and keeps most of its intermediate data in small temporary files.
+However, one current limitation is that the alignments for any sequence in either genome must fit into the largest
+addressable file size (typically 2GB), and the output alignments must also fit in that size (the remainder will be truncated).
+
+=head1 BUGS
+
+=head1 TODO
+
+TODO: bacteria description, examples, other input formats
+TODO: installer routine
+TODO: discuss input glob parameters
+TODO: local multithreading
+TODO: ignore escaped slashes when splitting dir/file (copy Alex)
+TODO: check for ++ etc in SLAGAN out
+TODO: .supermaprc file for score files, etc
+TODO: hazelton.lbl.gov/bugzilla for supermap
+
+=head1 AUTHOR
+
+Andrey Kislyuk L<mailto:kislyuk at ocf.berkeley.edu>.
+
+=cut
diff --git a/src/thrtrie.c b/src/thrtrie.c
new file mode 100644
index 0000000..a21fbde
--- /dev/null
+++ b/src/thrtrie.c
@@ -0,0 +1,330 @@
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include "skiplist.h"
+#include "thrtrie.h"
+#include <assert.h>
+int triealphasize=0;
+int nnodes=0;
+
+
+#define DEBUG 1
+#define JQ_SIZE 1024
+#include "mempage.c"
+
+TJob* jobqueue=0;
+int jqsize = 1;
+int numjobs = 0;
+
+ /*
+  * Build the character -> letter-index table `indeces` from an alphabet
+  * description and set `triealphasize` to the number of distinct indices.
+  * Characters listed between '[' and ']' share one index; every other
+  * character gets the next index of its own.  Characters not mentioned map
+  * to -1.
+  */
+ void makeAlpha(char* alpha) {
+   int i;
+   int isin = 0;
+   for (i=0; i < 256; i++)
+     indeces[i] = -1;
+   i = 0;
+   while (*alpha) {
+     if (!isin && *alpha == '[')
+       isin = 1;
+     else if (isin && *alpha == ']') {
+       isin = 0;
+       i++;
+     }
+     /* Index through unsigned char: plain char may be signed, and a
+        negative value would write before the start of the table. */
+     else if (isin)
+       indeces[(unsigned char)*alpha] = i;
+
+     else indeces [(unsigned char)*alpha] = i++;
+     alpha++;
+   }
+   triealphasize = i;
+ }
+
+ /*
+  * Return the letter index assigned to c by makeAlpha, or -1 if c is not in
+  * the alphabet.  The index is taken through unsigned char so that a char
+  * with the high bit set cannot read before the start of the table.
+  */
+ int lookup(char c) {
+   return indeces[(unsigned char)c];
+ }
+
+
+ /*
+  * Initialise the memory pool and the alphabet table, allocate the job queue
+  * on first use, and return a fresh root node of the given height.
+  */
+ TNode* makeTrie(int height, char* alphabet) {
+   initMP(0);
+   makeAlpha(alphabet);
+   if (jobqueue == 0) {
+     jobqueue = (TJob*) malloc(sizeof(TJob));
+   }
+   return makeNode(height);
+ }
+
+ /* Intentionally empty; its only visible use is in the commented-out body of freeTrie. */
+ void junker (TNode** m){
+
+ }
+
+int tccc = 0;
+
+ /*
+  * Release the trie by calling MPallfree(); trgt itself is not inspected.
+  * The per-node recursive free below is kept only as a comment.
+  */
+ void freeTrie (TNode* trgt) {
+ /*
+ int i;
+ if (trgt->height) {
+ for (i = 0; i < triealphasize; i++)
+ if (trgt->kids.ptrs[i])
+ freeTrie(trgt->kids.ptrs[i]);
+ junker (trgt->kids.ptrs);
+ }
+ else
+ free(trgt->kids.locator.locs);
+ free (trgt);
+ */
+ MPallfree();
+ }
+
+ /*
+  * Allocate a trie node from the memory pool.  A node of height 0 is a leaf
+  * holding an (initially empty) location list with room for two entries;
+  * any other node gets `triealphasize` child pointers, all cleared.
+  */
+ TNode* makeNode(int height) {
+   TNode* node = (TNode*) MPmalloc(sizeof(TNode));
+   int k;
+
+   node->height = height;
+   if (height == 0) {
+     node->kids.locator.numlocs = 0;
+     node->kids.locator.locs = (int*)MPmalloc(sizeof(int)*2);
+     node->kids.locator.locssize = 2;
+     return node;
+   }
+
+   node->kids.ptrs = (TNode**) MPmalloc(sizeof(TNode*)*triealphasize);
+   k = 0;
+   while (k < triealphasize) {
+     node->kids.ptrs[k] = 0;
+     k++;
+   }
+   return node;
+ }
+
+ /*
+  * Append `word` to the location list and double the list's capacity as soon
+  * as it becomes full.  Always returns 0.
+  */
+ int insertLoc (int word, locs* locator) {
+   locator->locs[locator->numlocs] = word;
+   locator->numlocs += 1;
+
+   if (locator->numlocs < locator->locssize)
+     return 0;
+
+   locator->locs = (int*) MPrealloc (locator->locs,
+                                     sizeof(int)*locator->locssize,
+                                     sizeof(int)*locator->locssize*2);
+   locator->locssize *= 2;
+   return 0;
+ }
+
+
+ /*
+  * Walk down from currnode along the letters of `word`, creating missing
+  * child nodes, and record the word's offset from strbeg in the leaf that is
+  * reached.  Returns 1 without recording anything if a character outside the
+  * alphabet is met; otherwise returns insertLoc's result.
+  */
+ int insertWordHelp(TNode* currnode, char* word, char* strbeg, int height,int wordlen) {
+   TNode* node = currnode;
+   int remaining = height;
+   int letter;
+
+   while (remaining != 0) {
+     letter = lookup(word[wordlen-remaining]);
+     if (letter < 0)
+       return 1;
+     if (!node->kids.ptrs[letter])
+       node->kids.ptrs[letter] = makeNode(remaining-1);
+     node = node->kids.ptrs[letter];
+     remaining--;
+   }
+   return insertLoc((int)(word-strbeg), &(node->kids.locator));
+ }
+
+ /* Insert the word starting at `word`; its length is the height of currnode. */
+ int insertWord(TNode* currnode, char* word, char* strbeg) {
+   int wordlen = currnode->height;
+   return insertWordHelp(currnode, word, strbeg, wordlen, wordlen);
+ }
+
+ /*
+  * Join two lists: list a is attached after the last element of list b and b
+  * is returned.  If either list is empty the other one is returned as is.
+  */
+ LList* appendLList(LList* a , LList* b) {
+   LList* tail;
+
+   if (!a)
+     return b;
+   if (!b)
+     return a;
+
+   tail = b;
+   while (tail->next)
+     tail = tail->next;
+   tail->next = a;
+   return b;
+ }
+
+/*no longer works */
+ /* make iterative??? */
+/*
+LList* lookupZZZWord(TNode* currnode, char* word, int ndegen) {
+ int letter,i;
+ LList *temp, *help, *res=0;
+ int height = currnode->height;
+ if (!currnode || ndegen < 0)
+ return 0;
+ if (!currnode->height) {
+ res = (LList*) malloc (sizeof(LList));
+ res->myloc = &currnode->kids.locator;
+ res->degleft = 0;
+ res->next = 0;
+ return res;
+ }
+ letter = lookup(word[currnode->height-1]);
+ if (letter >=0 && currnode->kids.ptrs[letter]) {
+ temp = lookupZZZWord(currnode->kids.ptrs[letter], word, ndegen);
+ res = appendLList(res, temp);
+ }
+ for (i=0; i < triealphasize; i++) {
+ if (ndegen > 0 && i != letter) {
+ if (currnode->kids.ptrs[i]) {
+ temp = lookupZZZWord(currnode->kids.ptrs[i], word, ndegen-1);
+ help = temp;
+ while (help != 0) {
+ help->degloc[help->degleft++] = currnode->height;
+ help = help->next;
+ }
+ res = appendLList(res, temp);
+ }
+ }
+ }
+ return res;
+ }*/
+
+ /*
+  * Insert every position of the NUL-terminated string `word` into the trie
+  * rooted at `root`, and set back-pointers (backptr) as it goes.
+  * NOTE(review): the first insertWord/word++ pair runs before any NUL check,
+  * so an empty string looks like it would step past the terminator; confirm
+  * callers never pass one.  The local `j` is unused.
+  */
+ void insertString(TNode* root, char* word) {
+ char* begin = word;
+ int i, j, wordlen = root->height, letprev, letcurr;
+ TNode* prev, *curr;
+ insertWord(root, word, begin);
+ word++;
+ root->backptr = root;
+ while (*word) {
+ curr = prev = root;
+ insertWord(root, word, begin);
+ /* prev follows the letters starting one character earlier (word[i-1]),
+ curr follows the letters starting at word; each node on the prev
+ path gets its backptr set to the curr node one level above it. */
+ for (i=0; i < wordlen; i++) {
+ letprev = lookup(word[i-1]);
+ letcurr = lookup(word[i]);
+ if (letprev >= 0)
+ prev = prev->kids.ptrs[letprev];
+ else break;
+ prev->backptr = curr;
+ if (letcurr >= 0)
+ curr = curr->kids.ptrs[letcurr];
+ else break;
+ }
+ word++;
+ }
+ /* The root's child for the final character points back to the root. */
+ letcurr = lookup(*(word-1));
+ if (letcurr >=0)
+ root->kids.ptrs[letcurr]->backptr = root;
+ }
+
+ /*
+  * Append a job for node tn to the job queue.  The new job inherits the
+  * degenerate-position list of jobqueue[oldindex] when oldindex >= 0, and
+  * thisdeg is added to that list when it is non-null.  The queue's capacity
+  * is doubled as soon as it becomes full.
+  */
+ void addjob(TNode* tn, char *thisdeg, char dirty, int oldindex) {
+   TJob* job = &jobqueue[numjobs];
+   int k;
+
+   job->mynode = tn;
+   job->dirty = dirty;
+   job->numdeg = 0;
+   if (oldindex >= 0) {
+     job->numdeg = jobqueue[oldindex].numdeg;
+     for (k = 0; k < jobqueue[oldindex].numdeg; k++)
+       job->degloc[k] = jobqueue[oldindex].degloc[k];
+   }
+   if (thisdeg != 0) {
+     job->degloc[job->numdeg] = thisdeg;
+     job->numdeg++;
+   }
+
+   numjobs++;
+   if (jqsize == numjobs) {
+     jqsize *= 2;
+     jobqueue = (TJob*)realloc(jobqueue, sizeof(TJob)*jqsize);
+   }
+ }
+
+ /* Empty the job queue by resetting the job count; the allocated storage is kept. */
+ void cleanJobQueue() {
+ numjobs = 0;
+ }
+
+
+ /* Remove job i by overwriting it with the last job in the queue. */
+ void remjob(int i) {
+   numjobs -= 1;
+   jobqueue[i] = jobqueue[numjobs];
+ }
+
+ /*
+  * Allocate a single-element LList for the leaf that job tj is on.  The
+  * element points at the leaf's location list and carries the job's
+  * degenerate positions, each stored as (word - position).  `offset` is not
+  * used.
+  */
+ LList* makeLList(TJob* tj, char* word, int offset) {
+   LList* item = (LList*) malloc (sizeof(LList));
+   int k = 0;
+
+   item->myloc = &(tj->mynode->kids.locator);
+   item->degleft = tj->numdeg;
+   while (k < tj->numdeg) {
+     item->degloc[k] = (char *)(word - tj->degloc[k]);
+     k++;
+   }
+   item->next = 0;
+   return item;
+ }
+
+ /*
+  * Advance every job in the queue by the character *word and return a linked
+  * list with one LList entry for each job that reaches a leaf (height 0)
+  * node.  Jobs that still have degenerate positions to spare (ndegen minus
+  * their numdeg is positive) spawn a new job per existing child; the others
+  * follow only the child for the current letter.  The `dirty` flag separates
+  * jobs already handled in this call from those still pending.
+  */
+ LList* getNextWords (TNode* currnode, char* word, int ndegen) {
+ int i, j;
+ int height = currnode->height;
+ int letter = lookup(*word);
+ int mynjobs;
+ char mydirty;
+ char myflags;
+ char first = 0;
+ LList* res=0, *temp;
+
+ // -1 --> 0 (second param)
+ if (letter >= 0 && numjobs == 0) /*new string*/
+ addjob(currnode, 0, 0, -1);
+ /* NOTE(review): jobqueue[0] is read here even when the queue is empty. */
+ mydirty = jobqueue[0].dirty;
+ mynjobs = numjobs; /* need my own copy so that I don't go over inserted things */
+ for (i = 0; i < mynjobs; i++) {
+ /* Arithmetically this is -(1 << triealphasize): presumably a bit mask with
+ the low triealphasize bits clear, one "visited" bit per letter. */
+ myflags = - 1 - (1 << triealphasize)+1;
+ first = 0;
+ // printf("jqdl = %d, w = %d, mnh = %d\n", jobqueue[i].degloc[0],(int)word, jobqueue[i].mynode->height);
+ /* Drop the job when its oldest degenerate position lies before
+ word - (height - node height). */
+ if (jobqueue[i].numdeg > 0 && ((char *) jobqueue[i].degloc[0] < word - (height -jobqueue[i].mynode->height))) {
+ remjob(i);
+ if (jobqueue[i].dirty == mydirty) {
+ mynjobs--;
+ i--;
+ }
+ continue;
+ }
+ /* Try the job's node; on each further round fall back along backptr,
+ until a node that is its own backptr is reached. */
+ do {
+ if (!jobqueue[i].mynode) {
+ remjob(i);
+ if (jobqueue[i].dirty == mydirty) {
+ mynjobs--;
+ i--; /* need this if the guy I moved in the old place is in my pass */
+ }
+ break;
+ }
+ if (jobqueue[i].mynode->height == 0 || first) {
+ jobqueue[i].mynode = jobqueue[i].mynode->backptr;
+ }
+ first = 1;
+ if (ndegen - jobqueue[i].numdeg > 0) {
+ /* Degeneracy budget left: spawn a job for every child not yet
+ flagged; a mismatching letter records `word` as a degenerate
+ position. */
+ for (j = 0; j < triealphasize; j++) {
+ if (!(myflags & (1<< j)) && jobqueue[i].mynode->kids.ptrs[j]) {
+ // changed -1 --> 0
+ addjob(jobqueue[i].mynode->kids.ptrs[j], (j==letter)?0:word, !mydirty,i);
+ if (jobqueue[i].mynode->height == 1) {
+ temp = makeLList(&jobqueue[numjobs-1], word, j);
+ temp->next = res;
+ res = temp;
+ }
+ myflags = myflags | (1 << j);
+ }
+ }
+ }
+
+ else {
+ /* No budget left: only an exact match on the current letter moves
+ the job forward, and it is marked as handled in this call. */
+ if (letter >= 0 && jobqueue[i].mynode->kids.ptrs[letter]) {
+ jobqueue[i].mynode = jobqueue[i].mynode->kids.ptrs[letter];
+ jobqueue[i].dirty = !mydirty;
+ if (jobqueue[i].mynode->height == 0) {
+ temp = makeLList(&jobqueue[i], word, letter);
+ temp->next = res;
+ res = temp;
+ }
+ myflags = -1;
+ }
+ }
+ if (myflags == -1) {
+ break;
+ }
+ } while(jobqueue[i].mynode != jobqueue[i].mynode->backptr);
+ /* A job still carrying this call's starting flag was not advanced; remove it. */
+ if (jobqueue[i].dirty == mydirty) {
+ remjob(i);
+ if (jobqueue[i].dirty == mydirty) {
+ mynjobs--;
+ i--; /* need this if the guy I moved in the old place is in my pass */
+ }
+ }
+ }
+ return res;
+ }
+
+
+
+
+
+
diff --git a/src/thrtrie.h b/src/thrtrie.h
new file mode 100644
index 0000000..83ff1db
--- /dev/null
+++ b/src/thrtrie.h
@@ -0,0 +1,67 @@
+ #include "fchaos.h"
+ #define MAX_DEGEN 2 /* capacity of the degloc[] arrays in LList and TJob */
+
+
+ int indeces[256]; /* NOTE(review): this is a definition inside a header that has no include guard; presumably only one translation unit includes it -- confirm */
+
+ typedef struct PrevHits { /* two int arrays plus a count; presumably numind is the length of both -- confirm */
+ int* inds1;
+ int* inds2;
+ int numind;
+ } phits;
+
+ typedef struct Locator { /* int array with two size fields */
+ int* locs;
+ int numlocs; /* presumably the number of entries in use -- confirm */
+ int locssize; /* presumably the allocated capacity -- confirm */
+ } locs;
+
+ typedef struct LocatorList { /* node of a singly linked list (linked through next) */
+ locs* myloc;
+ int degleft;
+ char* degloc[MAX_DEGEN];
+ struct LocatorList* next; /* following node; getNextWords prepends new nodes by setting this */
+
+ /* Stuff below is for chaining */
+ int location;
+ char* toberemoved;
+ float* scores;
+ int* seq1startpnt;
+ int* seq2startpnt;
+ int* seq1endpnt;
+ int* seq2endpnt;
+ phits* myhits;
+ sle** mysles; /* sle is not declared in this header; presumably it comes from fchaos.h */
+ } LList;
+
+ typedef struct TrieNode {
+ union children { /* a node holds either child pointers or a locator, never both */
+ struct TrieNode** ptrs; /* child array; getNextWords indexes it by letter number */
+ locs locator; /* presumably used by the lowest-level nodes instead of ptrs -- confirm */
+ } kids;
+ struct TrieNode* backptr; /* added for threading */
+ int height; /* getNextWords emits a hit when a node of height 0 is reached */
+ } TNode;
+
+ typedef struct TrieJob { /* one entry of the job queue walked by getNextWords */
+ TNode* mynode; /* trie node the job currently sits on */
+ int numdeg; /* number of degloc[] slots in use (compared against ndegen) */
+ char *degloc[MAX_DEGEN];
+ char dirty; /* pass marker; getNextWords compares it with the marker of the current pass */
+ } TJob;
+
+ LList* appendLList(LList* a , LList* b);
+ LList* savenfreeLList (LList* tbf, seq* seq1, seq* seq2);
+ TNode* makeTrie(int height, char* alphabet);
+ void freeTrie (TNode* root);
+ TNode* makeNode(int height);
+ int insertWord(TNode* root, char* word, char* strbeg);
+ LList* lookupWord(TNode* currnode, char* word, int ndegen);
+
+ /* above this line are things for all tries */
+
+ /*this is for threaded stuff */
+ void cleanJobQueue();
+ LList* getNextWords(TNode* root, char* word, int ndegen);
+ void insertString(TNode* root, char* tbi);
+
diff --git a/src/translate.c b/src/translate.c
new file mode 100644
index 0000000..3fb638b
--- /dev/null
+++ b/src/translate.c
@@ -0,0 +1,78 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "fchaos.h"
+#include "translate.h"
+#include "assert.h"
+
+ /*
+  * Translate one codon into its one-letter amino-acid code.
+  *
+  * dnaword  points at the first of three nucleotides (either case).
+  * revcomp  zero: read dnaword[0..2] as written;
+  *          non-zero: read dnaword[2], [1], [0] and complement each base,
+  *          i.e. translate the reverse complement of the same three letters.
+  *
+  * Returns the amino-acid letter ('*' for a stop codon), or 'X' as soon as an
+  * 'n'/'N' is met. Any other character prints a diagnostic and exits with
+  * status 1.
+  */
+ char toPeptide (char* dnaword, char revcomp) {
+   int i, sum = 0, mask = 0;
+   char letter;
+   const char *table =
+ "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF";
+   if (revcomp) mask = 3; /* x^3 maps 0<->3 and 1<->2: reverses order and complements */
+   for (i = 0; i < 3; i++) {
+     /* forward: offsets 0,1,2; reverse: (i^3)-1 gives offsets 2,1,0 */
+     letter = dnaword[(i^mask)-!!revcomp];
+     sum *= 4;
+     switch (letter) {
+     case 'a': case 'A': sum += (0^mask); break;
+     case 'c': case 'C': sum += (1^mask); break;
+     case 'g': case 'G': sum += (2^mask); break;
+     case 't': case 'T': sum += (3^mask); break;
+     case 'n': case 'N': return 'X';
+     default:
+       /* report the character that was actually examined */
+       fprintf(stderr, "%d = %c: bad letter in sequence\n", i, letter);
+       exit(1);
+     }
+   }
+   return table[sum];
+ }
+
+
+ /*
+  * Translate a nucleotide sequence into peptides for one reading frame.
+  *
+  * frame 0-2 reads the forward strand starting at offset frame; frame 3-5
+  * reads the reverse complement with offset frame % 3, taking codons from the
+  * end of theseq->lets backwards. An out-of-range frame prints a message and
+  * exits with status 1.
+  *
+  * Returns a newly malloc'ed seq. Only numlets, numsiglets (numlets minus the
+  * number of 'X' residues), lets/rptr (the NUL-terminated peptide string) and
+  * name ("<name>_f<+|-><offset>") are filled in here.
+  */
+ seq* transSeq(seq* theseq, int frame) {
+   char* res;
+   seq* resseq = (seq*) malloc(sizeof(seq));
+   char revcomp = 0;
+   int i, npep, numXs = 0;
+
+   assert (resseq);
+
+   if (frame < 0 || frame > 5) {
+     /* the range accepted above is 0-5; the message now says so */
+     fprintf(stderr, "Valid frame numbers are 0-5\n");
+     exit(1);
+   }
+   if (frame > 2) revcomp = 1;
+
+   frame = frame % 3;
+   npep = (theseq->numlets-frame)/3;
+   resseq->numlets = npep;
+
+   res = (char*) malloc((npep+1)* sizeof(char));
+   assert (res);
+
+   res[npep] = 0;
+   for (i = 0; i < npep; i++) {
+     res[i] = (!revcomp)?toPeptide(&theseq->lets[i*3+frame],0)
+                        :toPeptide(&theseq->lets[theseq->numlets-3*(i+1)-frame],1);
+     if (res[i] == 'X') numXs++;
+   }
+
+   resseq->numsiglets = resseq->numlets - numXs;
+   resseq->rptr = resseq->lets = res;
+   /* "_f" + sign + one digit (frame is 0-2 here) + NUL = 5 extra bytes */
+   resseq->name = (char*) malloc(strlen(theseq->name)+5);
+   assert (resseq->name);
+   sprintf(resseq->name, "%s_f%c%d", theseq->name, (revcomp)?'-':'+', frame);
+   return resseq;
+ }
+
+/*
+int main(int argc, char** argv) {
+ printf("%s\n", transSeq(argv[1], strlen(argv[1]), 0));
+ printf("%s\n", transSeq(argv[1], strlen(argv[1]), 1));
+ printf("%s\n", transSeq(argv[1], strlen(argv[1]), 2));
+ printf("%s\n", transSeq(argv[1], strlen(argv[1]), 3));
+ printf("%s\n", transSeq(argv[1], strlen(argv[1]), 4));
+ printf("%s\n", transSeq(argv[1], strlen(argv[1]), 5));
+}
+*/
diff --git a/src/translate.h b/src/translate.h
new file mode 100644
index 0000000..3d73d1d
--- /dev/null
+++ b/src/translate.h
@@ -0,0 +1,3 @@
+
+seq* transSeq(seq*, int);
+char toPeptide (char* dnaword, char revcomp);
diff --git a/src/util.cpp b/src/util.cpp
new file mode 100644
index 0000000..74217ce
--- /dev/null
+++ b/src/util.cpp
@@ -0,0 +1,68 @@
+#define fastaRowLength 50
+#define bufSize 2000
+
+ // Strip trailing '\n' and '\r' characters from s in place.
+ // Returns the length of s after stripping.
+ int trim(char* s) {
+   int len = strlen(s);
+   for (; len > 0; len--) {
+     const char last = s[len-1];
+     if (last != '\n' && last != '\r') break;
+     s[len-1] = '\0';
+   }
+   return len;
+ }
+
+ // Format i as a decimal string.
+ string itoa(int i) {
+   char digits[20];
+   sprintf(digits, "%d", i);
+   return string(digits);
+ }
+
+ // Open path with the given fopen mode.
+ // On failure prints an error to stderr and exits with status 1, so the
+ // returned handle is never NULL.
+ FILE* openFile(string path,char* mode) {
+   FILE *handle = fopen(path.c_str(), mode);
+   if (handle != NULL) {
+     return handle;
+   }
+   fprintf(stderr,"ERROR: Failed open file: %s\n",path.c_str());
+   exit(1);
+   return NULL; // not reached
+ }
+
+ // Return 1 if key equals any entry of argv[0..argc-1], otherwise 0.
+ int isArg(char* key,int argc, char* argv[]) {
+   int pos = 0;
+   while (pos < argc && strcmp(key, argv[pos]) != 0) {
+     pos++;
+   }
+   return (pos < argc) ? 1 : 0;
+ }
+
+ // Return the argument that follows the first occurrence of key which is not
+ // the last entry of argv. If there is none, print an error and exit(1).
+ string getArg(char* key,int argc, char* argv[]) {
+   const int lastWithValue = argc - 1;
+   for (int pos = 0; pos < lastWithValue; pos++) {
+     if (strcmp(key, argv[pos]) != 0) continue;
+     return argv[pos+1];
+   }
+   fprintf(stderr,"ERROR: Parameter for option '%s' not specified\n",key);
+   exit(1);
+   return "";
+ }
+
+ // Return the argument located index positions after the first occurrence of
+ // key that still has such an argument inside argv. If there is none, print an
+ // error and exit(1).
+ string getArgAt(char* key,int index,int argc, char* argv[]) {
+   for (int pos = 0; pos < argc; pos++) {
+     const bool matches = (strcmp(key, argv[pos]) == 0);
+     const bool inRange = (pos < argc - index);
+     if (matches && inRange) {
+       return argv[pos + index];
+     }
+   }
+   fprintf(stderr,"ERROR: Parameter for option '%s' not specified\n",key);
+   exit(1);
+   return "";
+ }
+
+ // Complement a nucleotide letter, preserving case: A<->T, C<->G, N stays N.
+ // Any other character is returned unchanged.
+ char comp(char c) {
+   static const char from[] = "ATCGNatcgn";
+   static const char to[]   = "TAGCNtagcn";
+   for (int k = 0; from[k] != '\0'; k++) {
+     if (c == from[k]) return to[k];
+   }
+   return c;
+ }
+
+ // Replace a lowercase letter by 'N'; return any other character unchanged.
+ char mask(char c) {
+   // <ctype> functions require a value representable as unsigned char (or
+   // EOF); a negative plain char would be undefined behaviour.
+   return islower((unsigned char) c)?'N':c;
+ }
diff --git a/src/utils/Glue.cpp b/src/utils/Glue.cpp
new file mode 100644
index 0000000..3121e4e
--- /dev/null
+++ b/src/utils/Glue.cpp
@@ -0,0 +1,493 @@
+#include "MultiSequence.h"
+#include "SafeVector.h"
+#include "Output.h"
+#include <math.h>
+#include <assert.h>
+#include <fstream>
+#include <iostream>
+#include <algorithm>
+
+#define NUCLEOTIDE_MATRIX_FILE "nucmatrix.txt"
+#define MAX_LINE_LENGTH 1024
+#define CONS_RATE 70
+#define INF 2000000000
+#define CNTG_BRK_N 50
+
+typedef SafeVector<int> vi;
+typedef SafeVector<vi> vvi;
+typedef SafeVector<vvi> vvvi;
+
+MultiSequence seqs;
+vvi matchScore (256, vi (256, 0));
+vvi dad, score;
+int gapopen, gapcont;
+int NCtoNC = 0, NCtoCN = -1000, CNtoNC = -1000, CNtoCN = 0;
+
+ // Report a malformed or unreadable score matrix file and terminate.
+ static void scoreMatrixError (const char *path, const char *what){
+   fprintf (stderr, "Error: %s in score matrix file %s\n", what, path);
+   exit (1);
+ }
+
+ // Load the substitution matrix and gap penalties from $LAGAN_DIR/<filename>.
+ //
+ // File layout as parsed here: a first line listing the alphabet letters
+ // (whitespace ignored); then one row per letter consisting of the row letter
+ // followed by one integer per alphabet letter; finally two integers that are
+ // stored in gapopen and gapcont. Row scores go to matchScore[row][column].
+ //
+ // Exits with status 1 if LAGAN_DIR is unset, the path does not fit the
+ // buffer, the file cannot be opened, or its contents are incomplete.
+ void readScoreMatrix (char *filename){
+   FILE *file;
+   int i, j, k, numlets = 0;
+   char lets[256], line[1024], path[1024], rowlet[2];
+   char *lagan_dir;
+
+   lagan_dir = getenv ("LAGAN_DIR");
+   if (!lagan_dir){
+     fprintf (stderr, "Error: $LAGAN_DIR not set.\n");
+     exit (1);
+   }
+
+   // bounded formatting: LAGAN_DIR is user controlled and may be long
+   if (snprintf (path, sizeof (path), "%s/%s", lagan_dir, filename) >= (int) sizeof (path)){
+     fprintf (stderr, "Error: path to %s is too long.\n", filename);
+     exit (1);
+   }
+   fprintf (stderr, "%s\n", path);
+
+   file = fopen (path, "r");
+   if (!file) scoreMatrixError (path, "cannot open; no data read");
+
+   if (!fgets (line, sizeof (line), file)) scoreMatrixError (path, "missing alphabet line");
+   for (i = 0; line[i] != '\0' && numlets < (int) sizeof (lets); i++){
+     if (!isspace ((unsigned char) line[i])){
+       lets[numlets++] = line[i];
+     }
+   }
+
+   for (i = 0; i < numlets; i++){
+     if (fscanf (file, "%1s", rowlet) != 1) scoreMatrixError (path, "missing row letter");
+     for (j = 0; j < numlets; j++){
+       if (fscanf (file, "%d", &k) != 1) scoreMatrixError (path, "missing score");
+       matchScore[(unsigned char) rowlet[0]][(unsigned char) lets[j]] = k;
+     }
+   }
+
+   if (fscanf (file, "%d%d", &gapopen, &gapcont) != 2)
+     scoreMatrixError (path, "missing gap penalties");
+   fclose (file);
+ }
+
+ // Fill matchScore, gapopen and gapcont.
+ //
+ // The whole matchScore table is zeroed first. With cons_rate == 0 the values
+ // are then read from NUCLEOTIDE_MATRIX_FILE via readScoreMatrix. Otherwise
+ // cons_rate is taken as a percentage p and log-odds scores are derived:
+ // match = log(p/0.25), mismatch = log((1-p)/0.75), both scaled by 100 for the
+ // A/T/C/G entries, with gapopen = -750*match and gapcont = -25*match.
+ void calculateScoreMatrix (int cons_rate){
+   const char *alpha = "ATCG";
+
+   for (int row = 0; row < 256; row++){
+     for (int col = 0; col < 256; col++){
+       matchScore[row][col] = 0;
+     }
+   }
+
+   if (cons_rate == 0){
+     readScoreMatrix (NUCLEOTIDE_MATRIX_FILE);
+     return;
+   }
+
+   const double p_ij = (double) cons_rate / 100.0;
+   const double match = log (p_ij / 0.25);
+   const double mismatch = log ((1 - p_ij) / 0.75);
+   const int matchPts = (int)(match * 100);
+   const int mismatchPts = (int)(mismatch * 100);
+   const int numAlpha = (int) strlen (alpha);
+
+   for (int a = 0; a < numAlpha; a++){
+     for (int b = 0; b < numAlpha; b++){
+       int pts = mismatchPts;
+       if (a == b) pts = matchPts;
+       matchScore[(unsigned char) alpha[a]][(unsigned char) alpha[b]] = pts;
+     }
+   }
+   gapopen = (int)(-match * 750);
+   gapcont = (int)(-match * 25);
+ }
+
+#define NUM_STATES 2
+#define NC 0
+#define CN 1
+
+ // Store the larger of score1/score2 in score and the matching back pointer
+ // in ptr. A tie selects the first candidate.
+ void chooseBestOfTwo (int score1, int score2, int ptr1, int ptr2,
+                       int &score, int &ptr){
+   const bool firstWins = !(score1 < score2);
+   score = firstWins ? score1 : score2;
+   ptr = firstWins ? ptr1 : ptr2;
+ }
+
+ // Store the larger of score1/score2 in score.
+ void chooseBestOfTwo (int score1, int score2, int &score){
+   score = (score1 < score2) ? score2 : score1;
+ }
+
+ // Score one alignment column holding characters c and d.
+ //
+ // Two gaps score 0 and leave isGap untouched. Exactly one gap scores gapcont
+ // when isGap is already set, otherwise gapopen (and sets isGap). Two non-gap
+ // characters clear isGap and score matchScore[c][d].
+ int scorePosition (char c, char d, int &isGap){
+   const bool gapC = (c == '-');
+   const bool gapD = (d == '-');
+
+   if (gapC && gapD) return 0;
+
+   if (!gapC && !gapD){
+     isGap = 0;
+     return matchScore[(unsigned char) c][(unsigned char) d];
+   }
+
+   if (isGap) return gapcont;
+   isGap = 1;
+   return gapopen;
+ }
+
+ int rescoreRegion (Sequence &seq1, Sequence &seq2, int begin, int end){ // re-run the NC/CN recurrence over columns begin..end only; returns the larger of the two final state scores
+ SafeVector<char>::iterator lets1 = seq1.getIterator();
+ SafeVector<char>::iterator lets2 = seq2.getIterator();
+
+ lets1 += begin - 1; // the loop pre-increments, so park one slot before column begin
+ lets2 += begin - 1;
+ int isGap = 0;
+
+ for (int i = 0; i < NUM_STATES; i++) score[i][begin-1] = dad[i][begin-1] = 0; // both states start from 0 just before the region; NOTE(review): nothing visible in this file sizes score/dad -- confirm they are resized before this runs
+
+ for (int i = begin; i <= end; i++){
+ chooseBestOfTwo (score[NC][i-1] + NCtoNC, score[CN][i-1] + CNtoNC, score[NC][i]); // best way to be in state NC at column i
+ chooseBestOfTwo (score[NC][i-1] + NCtoCN, score[CN][i-1] + CNtoCN, score[CN][i]); // best way to be in state CN at column i
+ score[CN][i] += scorePosition (*(++lets1), *(++lets2), isGap); // only state CN collects the per-column score
+ }
+
+ chooseBestOfTwo (score[NC][end], score[CN][end], isGap); // isGap is reused as the output slot for the final maximum
+ return isGap;
+ }
+
+ void getNucLabels (Sequence &seq1, Sequence &seq2, vi &nucLabels){ // label each column 1..seqLen with state NC (0) or CN (1) using the two-state recurrence plus a traceback through dad
+ SafeVector<char>::iterator lets1 = seq1.getIterator();
+ SafeVector<char>::iterator lets2 = seq2.getIterator();
+ int seqLen = seq1.getLength(); // seq2 is walked over the same number of columns; assumes both have this length -- TODO confirm
+ int isGap = 0;
+
+ nucLabels = vi (seqLen+1, 0); // index 0 is unused; labels live in 1..seqLen
+
+ for (int i = 0; i < NUM_STATES; i++) score[i][0] = dad[i][0] = 0; // NOTE(review): nothing visible in this file sizes score/dad -- confirm they are resized before this runs
+
+ for (int i = 1; i <= seqLen; i++){
+ chooseBestOfTwo (score[NC][i-1] + NCtoNC, score[CN][i-1] + CNtoNC, NC, CN, score[NC][i], dad[NC][i]); // dad[NC][i] = state at column i-1 that leads best into NC at i
+ chooseBestOfTwo (score[NC][i-1] + NCtoCN, score[CN][i-1] + CNtoCN, NC, CN, score[CN][i], dad[CN][i]); // dad[CN][i] = state at column i-1 that leads best into CN at i
+ score[CN][i] += scorePosition (*(++lets1), *(++lets2), isGap); // only state CN collects the per-column score
+ }
+
+ chooseBestOfTwo (score[NC][seqLen], score[CN][seqLen], NC, CN, isGap, nucLabels[seqLen]); // isGap only receives the (unused) best final score; the label of the last column is what matters
+ for (int i = seqLen - 1; i >= 1; i--){
+ nucLabels[i] = dad[nucLabels[i+1]][i]; // NOTE(review): dad[s][i+1] is the entry that was filled as "predecessor of state s at column i+1"; this reads column i instead -- confirm whether that is intended
+ }
+ }
+
+ // Count the non-gap characters of sequence number seq in alignment columns
+ // 1..pos, i.e. convert an alignment column into a sequence coordinate.
+ int getSeqCoord (int seq, int pos){
+   SafeVector<char>::iterator cur = seqs[seq].getIterator();
+   int nongap = 0;
+   int col = 0;
+
+   while (col < pos){
+     ++cur;
+     ++col;
+     if (*cur != '-') ++nongap;
+   }
+
+   return nongap;
+ }
+
+ // Print "<id>:<first>-<last> " for sequence seq, where first/last are the
+ // sequence coordinates of alignment columns begin and end.
+ void printCoordinates (int seq, int begin, int end){
+   const int first = getSeqCoord (seq, begin);
+   const int last = getSeqCoord (seq, end);
+   cout << seqs[seq].getID() << ":";
+   cout << first << "-" << last << " ";
+ }
+
+ // Print the coordinates of columns begin..end for every sequence followed by
+ // the sum of rescoreRegion over all sequence pairs; returns that sum.
+ int printRegion (int begin, int end){
+   const int numSeqs = seqs.getNumSeqs();
+   int total = 0;
+
+   for (int a = 0; a < numSeqs; a++){
+     printCoordinates (a, begin, end);
+     int b = a + 1;
+     while (b < numSeqs){
+       total += rescoreRegion (seqs[a], seqs[b], begin, end);
+       b++;
+     }
+   }
+   cout << total << endl;
+   return total;
+ }
+
+ void scoreAlign (){ // find maximal runs of columns labelled 1 in every sequence pair, print each run via printRegion, then print the summed score
+ int numSeqs = seqs.getNumSeqs();
+ int seqLen = seqs[0].getLength();
+ vvvi nucLabels (numSeqs, vvi (numSeqs, vi())); // nucLabels[i][j] is filled only for i < j
+
+ for (int i = 0; i < numSeqs; i++){
+ for (int j = i+1; j < numSeqs; j++){
+ getNucLabels (seqs[i], seqs[j], nucLabels[i][j]);
+ }
+ }
+
+ int begin = -1, end = -1, score = 0; // begin == -1 means "not inside a run"; this local score hides the file-level score table
+ for (int i = 1; i <= seqLen+1; i++){ // one step past the last column so that a trailing run is flushed
+
+ int conserved = 1;
+ if (i == seqLen+1)
+ conserved = 0;
+ else {
+ for (int j = 0; conserved && j < numSeqs; j++) // stops at the first pair whose label for column i is 0
+ for (int k = j+1; conserved && k < numSeqs; k++)
+ conserved = nucLabels[j][k][i];
+ }
+
+ if (conserved){
+ if (begin == -1)
+ begin = i; // a run starts here
+ }
+ else {
+ if (begin != -1){
+ end = i-1; // the run ended on the previous column
+ score += printRegion (begin, end);
+ begin = end = -1;
+ }
+ }
+ }
+
+ cout << "= score=" << score << endl;
+ }
+
+ // Count the ASCII letters (A-Z, a-z) contained in data.
+ int countLets (SafeVector<char> &data){
+   const int total = (int) data.size();
+   int letters = 0;
+   for (int pos = 0; pos < total; pos++){
+     const char ch = data[pos];
+     const bool upper = (ch >= 'A' && ch <= 'Z');
+     const bool lower = (ch >= 'a' && ch <= 'z');
+     if (upper || lower) letters++;
+   }
+   return letters;
+ }
+
+ int findSplit (SafeVector<char> &data1, SafeVector<char> &data2, int overlap, // choose how many of the overlap letters to take from one side: returns j in 0..overlap maximising score1[j] + score2[overlap-j]
+ SafeVector<char> &data1a, SafeVector<char> &data2a){
+
+ int offs1 = data1.size(), num1 = 0;
+ for (int i = (int) data1.size() - 1; i >= 0; i--){ // scan data1 backwards for the position of its overlap-th letter from the end
+ if (overlap == 0) break;
+ if (isalpha(data1[i])) num1++;
+ if (num1 == overlap){
+ offs1 = i;
+ break;
+ }
+ }
+
+ int offs2 = 0;
+ num1 = 0;
+ for (int i = 0; i < (int) data2.size(); i++){ // scan data2 forwards for the position of its overlap-th letter
+ if (overlap == 0) break;
+ if (isalpha(data2[i])) num1++;
+ if (num1 == overlap){
+ offs2 = i;
+ break;
+ }
+ }
+
+ SafeVector<int> score1 (overlap+1, 0); // score1[c]: running score after c letters of data1, walking forwards from offs1
+ SafeVector<int> score2 (overlap+1, 0); // score2[c]: running score after c letters of data2, walking backwards from offs2
+
+ int score = 0;
+ for (int ct = 0,i=0; ct < overlap;i++){ // NOTE(review): if data1 holds fewer than overlap letters, offs1 stays at data1.size() and this reads past the end -- confirm callers guarantee enough letters
+ if (isalpha(data1[i+offs1])) ct++;
+ score += (data1[i+offs1] == data1a[i+offs1]) ? 18 : -8; // +18 where the column agrees with its partner sequence, -8 otherwise
+ score1[ct] = score;
+ }
+
+ score = 0;
+ for (int ct = 0,i=0; ct < overlap;i++){ // NOTE(review): same concern, offs2-i can go below 0 if data2 has too few letters
+ if (isalpha(data2[offs2-i])) ct++;
+ score += (data2[offs2-i] == data2a[offs2-i]) ? 18 : -8;
+ score2[ct] = score;
+ }
+
+ int j = 0, best = -1000000;
+ for (int i = 0; i <= overlap; i++){ // strict '>' keeps the smallest i among equal sums
+ if (score1[i] + score2[overlap-i] > best){
+ best = score1[i] + score2[overlap-i];
+ j = i;
+ }
+ }
+
+ // fprintf (stderr, "0 <= %d <= %d\n", j, overlap);
+
+ return j;
+ }
+
+ // Remove a prefix from data1 and the same number of leading elements from
+ // data2; returns how many elements were removed.
+ //
+ // inAlign == true : exactly num elements are removed.
+ // inAlign == false: elements are removed up to and including the num-th
+ //                   alphabetic element of data1 (nothing if there is none).
+ template<class T>
+ int chopLeft (SafeVector<T> &data1, SafeVector<T> &data2, int num, bool inAlign){
+   int last = -1; // index of the last element to drop
+
+   if (inAlign){
+     last = num - 1;
+   }
+   else if (num != 0){
+     int seen = 0;
+     for (int pos = 0; pos < (int) data1.size(); pos++){
+       if (isalpha(data1[pos])) seen++;
+       if (seen == num){
+         last = pos;
+         break;
+       }
+     }
+   }
+
+   const int chopped = last + 1;
+
+   // slide the surviving elements down to the front
+   int dst = 0;
+   for (int src = chopped; src < (int) data1.size(); src++, dst++){
+     data1[dst] = data1[src];
+     data2[dst] = data2[src];
+   }
+
+   data1.resize ((int) data1.size() - chopped);
+   data2.resize ((int) data2.size() - chopped);
+
+   return chopped;
+ }
+
+ // Remove a suffix from data1 and truncate data2 to the same length; returns
+ // how many elements were removed from data1.
+ //
+ // inAlign == true : exactly num elements are removed.
+ // inAlign == false: everything from the num-th alphabetic element of data1
+ //                   (counted from the end) onwards is removed; nothing if
+ //                   there is no such element.
+ template<class T>
+ int chopRight (SafeVector<T> &data1, SafeVector<T> &data2, int num, bool inAlign){
+   int keep = data1.size(); // number of leading elements that survive
+
+   if (inAlign){
+     keep = data1.size() - num;
+   }
+   else if (num != 0){
+     int seen = 0;
+     int pos = (int) data1.size() - 1;
+     while (pos >= 0){
+       if (isalpha(data1[pos])) seen++;
+       if (seen == num){
+         keep = pos;
+         break;
+       }
+       pos--;
+     }
+   }
+
+   const int removed = (int) data1.size() - keep;
+   data1.resize (keep);
+   data2.resize (keep);
+
+   return removed;
+ }
+
+ // Return a new vector holding the elements of data1 followed by those of
+ // data2; neither argument is modified.
+ template<class T>
+ SafeVector<T> merge (SafeVector<T> &data1, SafeVector<T> &data2){
+   SafeVector<T> joined;
+   joined.insert (joined.end(), data1.begin(), data1.end());
+   joined.insert (joined.end(), data2.begin(), data2.end());
+   return joined;
+ }
+
+ int main (int argc, char **argv){ // glue consecutive pairwise alignments from argv[1] into one merged pair; a per-alignment report goes to argv[2] (or stderr) and the merged sequences to stdout
+ FILE* outfile;
+
+ if (argc < 2 || argc > 3){
+ cerr << "Usage: Glue align.mfa \n" << endl;
+ exit (1);
+ }
+
+ if (argc == 3) { // optional second argument: file that receives the report
+ if (!(outfile = fopen (argv[2], "w"))) {
+ fprintf (stderr, "couldn't open %s for writing\n", argv[2]);
+ exit(1);
+ }
+
+ }
+ else outfile = stderr;
+
+ // calculateScoreMatrix (CONS_RATE);
+
+ SafeVector<char> merged1, merged2; // merged alignment rows built so far
+ SafeVector<char> strand; // strand[k] = strand character of the second sequence of alignment k
+ SafeVector<int> merged1label, merged2label; // per column: number of the alignment it came from, 0 for padding
+ int begin1 = 1, end1 = 1; // begin1 = start coordinate of the first sequence in the merged result so far
+
+ ifstream data (argv[1]);
+ int alignNum = 0;
+ strand.push_back ('?'); // nothing for alignNum 0
+
+ while (true){
+
+ seqs = MultiSequence();
+ seqs.addRawFromMFA (data);
+
+ if (seqs.getNumSeqs() != 2) break; // stop unless exactly two sequences were read for this alignment
+ alignNum++;
+
+ strand.push_back (seqs[1].getStrand());
+
+ if (alignNum == 1){ // the first alignment simply seeds the merged result
+ begin1 = seqs[0].getStartCoord();
+ end1 = seqs[0].getEndCoord();
+ merged1 = seqs[0].getData(); merged1label = SafeVector<int>((int) merged1.size(), 1);
+ merged2 = seqs[1].getData(); merged2label = SafeVector<int>((int) merged2.size(), 1);
+ continue; // NOTE(review): this skips the '=' separator handling at the bottom of the loop -- presumably the Sequence reader copes with it; confirm
+ }
+
+ int b1 = seqs[0].getStartCoord();
+ int e1 = seqs[0].getEndCoord();
+
+ SafeVector<char> seqs0;
+ SafeVector<char> seqs1;
+
+ seqs0 = seqs[0].getData();
+ seqs1 = seqs[1].getData();
+
+ SafeVector<int> seqs0label((int) seqs0.size(), alignNum);
+ SafeVector<int> seqs1label((int) seqs1.size(), alignNum);
+
+ int overlap = e1 - begin1 + 1; // > 0: the new alignment ends inside the merged region; < 0: there is a gap of -overlap positions
+
+ if (overlap > 0){
+ int numLeft = findSplit (seqs0, merged1, overlap, seqs1, merged2); // how many of the overlapping letters to drop from the merged side
+ int numRight = overlap - numLeft; // the remainder is dropped from the end of the new alignment
+
+ int choppedLeft = chopLeft (merged1, merged2, numLeft, false);
+ int choppedRight = chopRight (seqs0, seqs1, numRight, false);
+
+ chopLeft (merged1label, merged2label, choppedLeft, true); // keep the label vectors in step with the data vectors
+ chopRight (seqs0label, seqs1label, choppedRight, true);
+ }
+ else if (overlap < 0){ // pad the gap between the two alignments with 'N' columns labelled 0
+ SafeVector<char> temp1 (-overlap, 'N');
+ SafeVector<char> temp2 (-overlap, 'N');
+ merged1 = merge (temp1, merged1);
+ merged2 = merge (temp2, merged2);
+
+ SafeVector<int> temp1label (-overlap, 0);
+ SafeVector<int> temp2label (-overlap, 0);
+
+ merged1label = merge (temp1label, merged1label);
+ merged2label = merge (temp2label, merged2label);
+ }
+
+ merged1 = merge (seqs0, merged1); // the new alignment is placed in front of what has been merged so far
+ merged2 = merge (seqs1, merged2);
+
+ merged1label = merge (seqs0label, merged1label);
+ merged2label = merge (seqs1label, merged2label);
+
+ //seqs[0].writeXMFAHeader(cerr);
+
+ begin1 = b1;
+
+ if (data.eof()) break;
+ if (data.peek() == '=') data.ignore (MAX_LINE_LENGTH, '\n'); // skip the '=' separator line between alignments
+ if (data.eof()) break;
+ }
+
+ SafeVector<char> temp1 (begin1 - 1, 'N'); // pad the front so that the first row starts at coordinate 1
+ SafeVector<char> temp2 (begin1 - 1, '-');
+
+ for (int i = 0; i < min ((int) temp2.size(), CNTG_BRK_N); i++) // the first CNTG_BRK_N padding columns of the second row become 'N' instead of '-'
+ temp2[i] = 'N';
+
+ merged1 = merge (temp1, merged1);
+ merged2 = merge (temp2, merged2);
+
+ SafeVector<int> temp1label (begin1 - 1, 0);
+ SafeVector<int> temp2label (begin1 - 1, 0);
+ merged1label = merge (temp1label, merged1label);
+ merged2label = merge (temp2label, merged2label);
+
+ for (int i = 1; i <= alignNum; i++){ // report, per input alignment, the letter ranges it still covers in each merged row
+ int min1 = INF, max1 = 0, min2 = INF, max2 = 0;
+ int pos1 = 0, pos2 = 0; // running letter counts in merged1 / merged2
+ for (int j = 0; j < (int) merged1label.size(); j++){
+ if (isalpha(merged1[j])) pos1++;
+ if (isalpha(merged2[j])) pos2++;
+
+ if (merged1label[j] == i){
+ min1 = min (min1, pos1);
+ max1 = max (max1, pos1);
+ }
+ if (merged2label[j] == i){
+ min2 = min (min2, pos2);
+ max2 = max (max2, pos2);
+ }
+ }
+
+ //[FASTA line for this contig in the original sequence file]
+ //n baseFrom baseTo mergedFrom mergedTo startChop endChop {+,-} score secFrom secTo
+ fprintf (outfile, "Align %d\n", i);
+ if (min1 == INF) // no column carries label i any more
+ fprintf (outfile, "%d was cropped completely.\n", i);
+ else
+ fprintf (outfile, "%d %d %d 0 0 0 0 %c 0 %d %d\n", i, min1, max1, strand[i], min2, max2);
+ }
+
+ printMFA (cout, merged1, string ("first"), 60);
+ printMFA (cout, merged2, string ("second"), 60);
+ }
diff --git a/src/utils/MultiSequence.h b/src/utils/MultiSequence.h
new file mode 100644
index 0000000..61e38ae
--- /dev/null
+++ b/src/utils/MultiSequence.h
@@ -0,0 +1,124 @@
+// MultiSequence.h
+// ---------------
+// Multiple sequence class
+
+#ifndef MULTISEQUENCE_H
+#define MULTISEQUENCE_H
+
+#include <vector>
+#include <string>
+#include <fstream>
+#include <stdio.h>
+#include "Sequence.h"
+#include "SafeVector.h"
+
+using namespace std;
+
+ // Ordered collection of Sequence objects with an optional letter cache.
+ class MultiSequence {
+  private:
+   SafeVector<Sequence> sequences;   // stored sequences, in insertion order
+   SafeVector<char> cache;           // letter cache filled by buildCache()
+   bool cacheEnabled;                // true once buildCache() has run
+
+  public:
+
+   // Start empty, without a letter cache.
+   MultiSequence (): cacheEnabled (false) {}
+
+   // Build the letter cache; may be called only once.
+   // Layout: entry [j * numSeqs + i] holds letter j (1-based) of sequence i;
+   // the first numSeqs entries (j == 0) are filled with '@'.
+   // The length of sequence 0 is used for every sequence.
+   void buildCache (){
+     assert (!cacheEnabled);
+     cacheEnabled = true;
+
+     const int length = sequences[0].getLength();
+     const int numSeqs = getNumSeqs();
+
+     cache.resize ((length + 1) * numSeqs, (char) 0);
+     for (int row = 0; row < numSeqs; row++){
+       Sequence &seq = (*this)[row];
+       cache[row] = '@';
+       int slot = row;
+       for (int col = 1; col <= length; col++){
+         slot += numSeqs;
+         cache[slot] = seq[col];
+       }
+     }
+   }
+
+   // Iterator to the start of the letter cache; buildCache() must have run.
+   SafeVector<char>::iterator getCache (){
+     assert (cacheEnabled);
+     return cache.begin();
+   }
+
+   // Append a copy of sequence.
+   void addSequence (Sequence &sequence){
+     sequences.push_back (sequence);
+   }
+
+   // Open the named MFA file and append every sequence that can be read
+   // from it, stopping at the first one that fails.
+   void addRawFromMFA (const string& filename){
+     ifstream infile (filename.c_str());
+     addRawFromMFA (infile);   // asserts that the stream opened, then reads
+     infile.close();
+   }
+
+   // Append every sequence that can be read from the already-open stream,
+   // stopping at the first one that fails.
+   void addRawFromMFA (ifstream &infile){
+     assert (!infile.fail());
+
+     for (;;){
+       Sequence seq (infile);
+       if (seq.fail()) return;
+       sequences.push_back (seq);
+     }
+   }
+
+   // Write all sequences to outfile, numColumns letters per line.
+   void writeToXMFA (ostream &outfile, int numColumns) const {
+     SafeVector<Sequence>::const_iterator it = sequences.begin();
+     while (it != sequences.end()){
+       it->writeToXMFA (outfile, numColumns);
+       ++it;
+     }
+   }
+
+   // Bounds-asserted access to sequence number index.
+   Sequence& operator[] (int index){
+     const int count = (int) sequences.size();
+     assert (index >= 0 && index < count);
+     return sequences[index];
+   }
+
+   // Bounds-asserted read-only access to sequence number index.
+   const Sequence& operator[] (int index) const {
+     const int count = (int) sequences.size();
+     assert (index >= 0 && index < count);
+     return sequences[index];
+   }
+
+   // Number of stored sequences.
+   const int getNumSeqs() const {
+     return (int) sequences.size();
+   }
+ };
+
+#endif
diff --git a/src/utils/Output.h b/src/utils/Output.h
new file mode 100644
index 0000000..847ee98
--- /dev/null
+++ b/src/utils/Output.h
@@ -0,0 +1,20 @@
+#ifndef OUTPUT_H
+#define OUTPUT_H
+
+ // Print data in MFA format: a ">comment" header line followed by the
+ // characters of data in their stored order (nothing is reversed), with a
+ // line break after every numColumns characters and a final line break if the
+ // last line is incomplete. numColumns == 0 means "do not wrap" (it used to
+ // divide by zero).
+ void printMFA (ostream &outfile, SafeVector<char> &data, string comment, int numColumns){
+
+   const int total = (int) data.size();
+
+   outfile << ">" << comment << endl;
+   for (int i = 0; i < total; i++){
+     outfile << data[i];
+     if (numColumns != 0 && (i + 1) % numColumns == 0) outfile << endl;
+   }
+
+   // terminate the last line unless the loop has just done so
+   const bool openLine = (numColumns == 0) ? (total != 0)
+                                           : (total % numColumns != 0);
+   if (openLine) outfile << endl;
+ }
+
+
+#endif
diff --git a/src/utils/SafeVector.h b/src/utils/SafeVector.h
new file mode 100644
index 0000000..2d85063
--- /dev/null
+++ b/src/utils/SafeVector.h
@@ -0,0 +1,44 @@
+// SafeVector.h
+// ------------
+// Class for array bounds checking.
+
+// define ENABLE_CHECKS in order to enable array bounds checking.
+
+#ifndef SAFEVECTOR_H
+#define SAFEVECTOR_H
+
+#include <assert.h>
+#include <vector>
+
+using namespace std;
+
+// class derived from the STL std::vector
+ // std::vector with optional bounds checking on operator[].
+ // Without ENABLE_CHECKS this behaves exactly like std::vector<TYPE>; with it,
+ // an out-of-range index trips an assert.
+ template<class TYPE>
+ class SafeVector : public std::vector<TYPE>{
+ public:
+
+   // constructors mirroring the std::vector ones used by the code base
+   SafeVector () {}
+   SafeVector (size_t size) : std::vector<TYPE>(size) {}
+   SafeVector (size_t size, const TYPE &value) : std::vector<TYPE>(size, value) {}
+   SafeVector (const SafeVector &source) : std::vector<TYPE>(source) {}
+
+ #ifdef ENABLE_CHECKS
+
+   // [] array bounds checking
+   // size() lives in a dependent base class, so it has to be reached through
+   // this-> to be found by standard-conforming compilers; index is unsigned,
+   // so only the upper bound needs testing.
+   TYPE &operator[](size_t index){
+     assert (index < this->size());
+     return std::vector<TYPE>::operator[] (index);
+   }
+
+   // [] const array bounds checking
+   const TYPE &operator[] (size_t index) const {
+     assert (index < this->size());
+     return std::vector<TYPE>::operator[] (index) ;
+   }
+
+ #endif
+
+ };
+
+#endif
diff --git a/src/utils/Sequence.h b/src/utils/Sequence.h
new file mode 100644
index 0000000..6bf7584
--- /dev/null
+++ b/src/utils/Sequence.h
@@ -0,0 +1,229 @@
+// Sequence.h
+// ----------
+// Class file to hold a sequence object.
+
+#ifndef SEQUENCE_H
+#define SEQUENCE_H
+
+#include <string>
+#include "SafeVector.h"
+
+using namespace std;
+
+ // One sequence of an MFA/XMFA file. Letters are stored 1-based: data[0] is a
+ // dummy slot and the real characters occupy data[1..length].
+ class Sequence {
+
+  private:
+
+   // Read header of MFA/XMFA file.
+   // Skips empty lines, then expects a line starting with '>'. Returns false
+   // at end of input or if that line is not a header. isXMFA is set to true
+   // when the header parses as ">id:start-end strand [comment]" and to false
+   // for a plain MFA header (whose text becomes the comment).
+   bool readHeader (ifstream &infile, bool &isXMFA){
+     string header;
+
+     while (true){
+
+       // check to make sure that the there is more data in the file
+       if (infile.fail() || infile.eof()) return false;
+
+       // get new header line
+       getline (infile, header);
+
+       // check that header line is not empty
+       if (header.length() != 0) break;
+     }
+
+     // check for appropriate header
+     if (header[0] != '>') return false;
+
+     // attempt to read XMFA format
+     isXMFA = true;
+     char buffer[1024];
+     // the field width keeps an over-long comment word from overrunning buffer
+     int numread = sscanf (header.c_str(), ">%d:%d-%d %c %1023s", &id, &startCoord, &endCoord, &direction, buffer);
+
+     // if basic requirements for XMFA not met, then MFA file
+     if (numread < 4){
+       comment = header.substr(1);
+       isXMFA = false;
+     }
+
+     // basic requirements for XMFA met, no comments
+     else if (numread < 5)
+       comment = "";
+
+     // otherwise full XMFA format
+     else
+       comment = buffer;
+
+     return true;
+   }
+
+  protected:
+
+   SafeVector<char> data;   // character data for the sequence
+   bool isValid;            // is the sequence valid?
+   int length;              // length of the sequence
+   int id;                  // sequence ID (for XMFA)
+   int startCoord;          // sequence position of first character
+   int endCoord;            // sequence position of last character
+   char direction;          // + or -
+   string comment;          // comments
+
+  public:
+
+   // Default constructor: a valid one-character placeholder sequence.
+   Sequence (){
+     isValid = true;
+     length = 1;
+     data.resize (1, ' ');
+     id = 0;                // was left uninitialised; getID() would read garbage
+     startCoord = 1; endCoord = 1;
+     direction = '+';
+   }
+
+   // Constructor. Reads in a sequence from the input file.
+   // On failure (no header, or no letters after it) the object is left
+   // invalid; check with fail().
+   Sequence (ifstream &infile){
+
+     bool isXMFA = true;
+
+     // sequence starts out not valid; give every field a defined value so
+     // that a failed object can still be copied safely
+     isValid = false;
+     length = 0;
+     id = 0;
+     startCoord = 1; endCoord = 1;
+     direction = '+';
+
+     // check to make sure that the header is read first
+     if (readHeader (infile, isXMFA)){
+
+       // put in a dummy character to fill the zero position
+       data.push_back ('@');
+
+       // read in character data
+       char ch;
+
+       // loop until no more character data or end of sequence found
+       while (infile.get(ch)){
+
+         // check to make sure that the end of a section is not reached
+         if (ch == '>' || ch == '='){
+           infile.unget();
+           break;
+         }
+
+         // check for white space
+         if (ch == ' ' || ch == '\f' || ch == '\n' || ch == '\r' || ch == '\t' || ch == '\v') continue;
+
+         // convert lowercase letters to uppercase
+         if (ch >= 'a' && ch <= 'z') ch = ch - 'a' + 'A';
+
+         // check that characters are letters OR contig breaks OR gaps
+         assert ((ch >= 'A' && ch <= 'Z') || ch == '.' || ch == '-');
+
+         // add character to list
+         data.push_back (ch);
+       }
+
+       // check to see if any data was read
+       if (data.size() > 1){
+
+         // if so, the sequence is valid, and compute the length
+         isValid = true;
+         length = data.size() - 1;
+
+         // if the sequence is not originally XMFA
+         if (!isXMFA){
+
+           // assign it some temporary values for XMFA format
+           id = 0;
+           startCoord = 1;
+           endCoord = length;
+           direction = '+';
+         }
+       }
+     }
+
+     // some sanity checks
+     if (isValid){
+       assert (id >= 0);
+       assert (startCoord >= 0);
+       assert (endCoord >= 0);
+       assert (startCoord <= endCoord);
+       assert (direction == '+' || direction == '-');
+       assert (length > 0);
+     }
+   }
+
+   // Constructor. Gets sequence from array data.
+   // data must already contain the dummy slot at index 0. The given comment
+   // is stored as is (the old body ended with an assignment that only cleared
+   // the by-value parameter, which had no effect and has been dropped).
+   Sequence (SafeVector<char> data, string comment) : data(data), comment(comment) {
+     length = data.size() - 1;
+     id = 0;
+     startCoord = 1;
+     endCoord = length;
+     direction = '+';
+     isValid = true;
+
+     assert (length > 0);
+   }
+
+   // Return a copy of the letters without the dummy slot (0-based).
+   SafeVector<char> getData (){
+     SafeVector<char> temp;
+     for (int i = 1; i <= length; i++) temp.push_back (data[i]);
+     return temp;
+   }
+
+   const string getComment () const {
+     return comment;
+   }
+
+   // Grow the sequence to num letters, padding with ' ' and moving endCoord;
+   // a smaller or equal num is ignored.
+   void setLength (int num){
+     if (num > length){
+       length = num;
+       endCoord = length;
+       data.resize(length+1, ' ');
+     }
+   }
+
+   // Iterator to the dummy slot; pre-increment to reach letter 1.
+   SafeVector<char>::iterator getIterator (){
+     return data.begin();
+   }
+
+   // 1-based letter access.
+   const char operator[] (int index) const {
+     assert (index >= 1 && index <= length);
+     return data[index];
+   }
+
+   // Used to check for sequence validity after construction.
+   const bool fail () const { return !isValid; }
+
+   // Return the length of the sequence.
+   const int getLength () const { assert (isValid); return length; }
+   const char getStrand () const { assert (isValid); return direction; }
+
+   const int getStartCoord () const { assert (isValid); return startCoord; }
+   const int getEndCoord () const { assert (isValid); return endCoord; }
+
+   // Print XMFA header only.
+   void writeXMFAHeader (ostream &outfile) const {
+     assert (isValid);
+     outfile << '>' << id << ':' << startCoord << '-' << endCoord << ' ' << direction << ' ' << comment << endl;
+   }
+
+   // Return sequence ID.
+   const int getID () const { assert (isValid); return id; }
+
+   // Set sequence ID.
+   void setID (int id) { assert (isValid); this->id = id; }
+
+   // Writes the sequence with a plain ">comment" header, numColumns letters
+   // per line.
+   void writeToXMFA (ostream &outfile, int numColumns) const {
+
+     assert (isValid);
+
+     // print header (comment only)
+     outfile << ">" << comment << endl;
+     // outfile << '>' << id << ':' << startCoord << '-' << endCoord << ' ' << direction << ' ' << comment << endl;
+
+     // print character data
+     for (int i = 1; i <= length; ++i){
+       outfile << data[i];
+       if (i % numColumns == 0) outfile << endl;
+     }
+     if (length % numColumns != 0) outfile << endl;
+   }
+ };
+
+#endif
diff --git a/src/utils/bin2bl.c b/src/utils/bin2bl.c
new file mode 100644
index 0000000..47ed5be
--- /dev/null
+++ b/src/utils/bin2bl.c
@@ -0,0 +1,187 @@
+#include <stdio.h>
+#include <string.h>
+
+void Add_Tick(char *line, int count, int length);
+void Print_Lines(char *line1, char *line2, char *ticks1, char *ticks2,
+ char *match);
+int Usage(void);
+
+char MyName[1024];
+
+ int main(int argc, char **argv) { // print a packed two-sequence alignment as 60-column text blocks; with -pga, SNP positions are wrapped in HTML markup
+ FILE *infile = NULL;
+ FILE *snp_file = NULL;
+ char *slash;
+ int fields, start = -1, end = -1, bp, base1, base2; // start/end: optional range given in seq1 base numbers
+ int base1_count = 0; // non-gap bases of seq1 seen so far
+ int base2_count = 0; // non-gap bases of seq2 seen so far
+ int start2 = 0; // seq2 base number at the first printed column
+ int end2 = 0; // seq2 base number at the last printed column
+ int tick1_done = 0; // set once the tick for the current base1_count has been written
+ int tick2_done = 0;
+ int width = 60;
+ int length = 0; // columns currently buffered in line2/match
+ int html_length = 0; // characters currently buffered in line1 (longer than length when markup is present)
+ int snp_pos = -1; // most recent position read from the SNP file
+ int param1 = 1; // index of the alignment-file argument (2 after -pga)
+ char bases[] = {'-', 'A', 'C', 'T', 'G', 'N'}; // decoding table for one 4-bit code; NOTE(review): codes 6..15 would index past the end -- confirm the input never contains them
+ char line1[1024];
+ char line2[80];
+ char match[80];
+ char ticks1[80] = "";
+ char ticks2[80] = "";
+ char snp_fname[1024] = "";
+ char font_start[80] = "<b><font color=red ";
+ char font_end[] = "</font></b>";
+ char status_start[] = "onmouseover=\"window.status='SNP: ";
+ char status_end[] = "'\" onmouseout=\"window.status=''\">";
+ char dash[] = " - ";
+ char snp_bases[2]; // the two characters read after a SNP position
+
+ // remove the directory name from the program pathname
+
+ if (((slash = strrchr(argv[0], '/')) != NULL) ||
+ ((slash = strrchr(argv[0], '\\')) != NULL))
+ strcpy(MyName, slash + 1);
+ else
+ strcpy(MyName, argv[0]);
+
+ // parse my command line and open input file(s)
+
+ if (argc < 2) return Usage();
+ if (argv[1][0] == '-')
+ if (strcasecmp(argv[1], "-pga") == 0)
+ ++param1;
+ else if (strcmp(argv[1], "-") != 0) // this else pairs with the strcasecmp test: any other option except a lone "-" is rejected
+ return Usage();
+ if ((argc <= param1) || // no alignment file argument
+ ((strcmp(argv[param1], "-") != 0) &&
+ ((infile = fopen(argv[param1], "r")) == NULL)) || // named file cannot be opened
+ ((argc > (param1 + 1)) &&
+ (((fields = sscanf(argv[param1 + 1], "%d", &start)) != 1) ||
+ (start <= 0))) || // start must be a positive integer
+ ((argc > (param1 + 2)) &&
+ (((fields = sscanf(argv[param1 + 2], "%d", &end)) != 1) ||
+ (start > end)))) // end must be an integer not below start
+ return Usage();
+ if (infile == NULL)
+ infile = stdin; // "-" was given: read the alignment from standard input
+ else if (param1 > 1) { // -pga with a real file: look for SNP.txt in the same directory
+ if (((slash = strrchr(argv[param1], '/')) != NULL) ||
+ ((slash = strrchr(argv[param1], '\\')) != NULL)) {
+ strncpy(snp_fname, argv[param1], slash - argv[param1] + 1);
+ snp_fname[slash - argv[param1] + 1] = '\0';
+ }
+ strcat(snp_fname, "SNP.txt");
+ snp_file = fopen(snp_fname, "r"); // may stay NULL; SNP markup is then simply skipped
+ }
+ while (!feof(infile)) {
+ if ((bp = getc(infile)) == EOF) { // get next char
+ if (!ferror(infile)) {
+ end2 = base2_count; // clean end of input: the loop condition ends the loop
+ continue;
+ }
+ perror("Error reading file"); // stop if an error is found
+ return 1;
+ }
+ // decode bp char
+ base1 = bp >> 4; // high nibble: seq1 code
+ base2 = bp & 0xf; // low nibble: seq2 code
+ if (base1 != 0) { // code 0 is a gap and does not advance the base counter
+ ++base1_count;
+ tick1_done = 0;
+ }
+ if (base2 != 0) {
+ ++base2_count;
+ tick2_done = 0;
+ }
+ if (base1_count < start) continue; // not yet inside the requested range
+ if (snp_file != NULL) {
+ while (base1_count > snp_pos) { // advance to the first SNP record at or after the current base
+ if ((fields = fscanf(snp_file, "%d %2c", &snp_pos, snp_bases)) == 2)
+ continue;
+ fclose(snp_file); // no more usable records
+ snp_file = NULL;
+ break;
+ }
+ }
+ if (start2 == 0) { // first printed column: remember where seq2 starts
+ start2 = base2_count;
+ if (base2 == 0) ++start2;
+ }
+ if (base1_count != snp_pos) { // ordinary column: append the plain base
+ line1[html_length] = bases[base1];
+ line1[html_length + 1] = 0;
+ ++html_length;
+ } else { // SNP column: append markup + "x - y" status text + the base + closing markup
+ strcpy(line1 + html_length, font_start);
+ strcat(line1, status_start);
+ html_length = strlen(line1);
+ line1[html_length] = snp_bases[0];
+ strcpy(line1 + html_length + 1, dash);
+ line1[html_length + strlen(dash) + 1] = snp_bases[1];
+ strcpy(line1 + html_length + strlen(dash) + 2, status_end);
+ html_length = strlen(line1);
+ line1[html_length] = bases[base1];
+ strcpy(line1 + html_length + 1, font_end);
+ html_length = strlen(line1);
+ }
+ line2[length] = bases[base2];
+ line2[length + 1] = 0;
+ match[length] = ((base1 == base2) && (base1 != 5)) ? '|' : ' '; // equal codes match, except code 5 ('N')
+ match[length + 1] = 0;
+ ++length;
+ if ((tick1_done == 0) && ((base1_count % 10) == 0) && (base1_count > 0)) { // label every 10th base once
+ Add_Tick(ticks1, base1_count, length);
+ tick1_done = 1;
+ }
+ if ((tick2_done == 0) && ((base2_count % 10) == 0) && (base2_count > 0)) {
+ Add_Tick(ticks2, base2_count, length);
+ tick2_done = 1;
+ }
+ if (length == 60) { // a full block: print it and start over (the literal 60 is used here rather than width)
+ Print_Lines(line1, line2, ticks1, ticks2, match);
+ length = 0;
+ html_length = 0;
+ }
+ if (base1_count == end) { // reached the requested end of the range
+ end2 = base2_count;
+ break;
+ }
+ }
+ if (length != 0) // flush the last, partial block
+ Print_Lines(line1, line2, ticks1, ticks2, match);
+ fclose(infile);
+ if (param1 > 1) // -pga mode also reports the seq2 range that was covered
+ printf("start2=%d\nend2=%d\n", start2, end2);
+ return 0;
+ }
+
+ /*
+  * Append the decimal text of count to line, padded with spaces on the left
+  * so that the number ends at column length + 9. If line is already too long
+  * for that, nothing is appended.
+  */
+ void Add_Tick(char *line, int count, int length) {
+   char tick[20];
+   int space;
+
+   sprintf(tick, "%d", count);
+   space = length + 9 - strlen(line) - strlen(tick);
+   if (space <= 0)
+     return;
+   for (; space > 0; --space)
+     strcat(line, " ");
+   strcat(line, tick);
+ }
+
+ /*
+  * Print one alignment block (tick line, seq1, match line, seq2, tick line)
+  * to stdout, then reset all five buffers to empty strings.
+  */
+ void Print_Lines(char *line1, char *line2, char *ticks1, char *ticks2,
+ char *match) {
+   printf("\n%s\nseq1 %s\n %s\nseq2 %s\n%s\n",
+          ticks1, line1, match, line2, ticks2);
+   match[0] = 0;
+   ticks2[0] = 0;
+   ticks1[0] = 0;
+   line2[0] = 0;
+   line1[0] = 0;
+ }
+
+ int Usage() { /* print the command-line synopsis to stderr; returns 1 so callers can write "return Usage();" */
+ fprintf(stderr, " \
+ Usage: %s [-pga] { - | alignment_file } [start [end]]\n",
+ MyName); /* MyName is filled in by main from argv[0] */
+ return 1;
+ }
diff --git a/src/utils/bin2mf.c b/src/utils/bin2mf.c
new file mode 100644
index 0000000..dbdffce
--- /dev/null
+++ b/src/utils/bin2mf.c
@@ -0,0 +1,69 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+void Add_Tick(char *line, int count, int length);
+void Print_Lines(char *line1, char *line2, char *ticks1, char *ticks2,
+ char *match);
+int Usage(void);
+
+char MyName[1024];
+
+// Decode a packed pairwise alignment into two FASTA records on stdout.
+//
+// Every input byte carries one alignment column: the high nibble is the code
+// for sequence 1 and the low nibble the code for sequence 2, both indexing
+// the table {'-','A','C','T','G','N'}.  Codes outside the table are emitted
+// as 'N' instead of reading past the end of the table.
+//
+// argv[1] is the alignment file, or "-" for stdin.  Returns 0 on success and
+// 1 (via Usage or after an allocation failure) otherwise.
+int main(int argc, char **argv) {
+    FILE *infile = NULL;
+    char bases[] = {'-', 'A', 'C', 'T', 'G', 'N'};
+    const int numbases = (int) sizeof(bases);
+    char *seq1, *seq2, *grown;
+    int seqsize = 1, numread = 0;
+    int bp, base1, base2, i;
+
+    // remember the program name so that Usage() has something to print
+    if (argc > 0 && argv[0] != NULL) {
+        strncpy(MyName, argv[0], sizeof(MyName) - 1);
+        MyName[sizeof(MyName) - 1] = 0;
+    }
+
+    // parse my command line and open input file(s)
+    if (argc < 2) return Usage();
+
+    if ((strcmp(argv[1], "-") != 0) &&
+        ((infile = fopen(argv[1], "rb")) == NULL))
+        return Usage();
+
+    if (infile == NULL)
+        infile = stdin;
+
+    seq1 = (char*) malloc(sizeof(char) * seqsize);
+    seq2 = (char*) malloc(sizeof(char) * seqsize);
+    if (seq1 == NULL || seq2 == NULL) {
+        fprintf(stderr, "%s: out of memory\n", MyName);
+        return 1;
+    }
+
+    while ((bp = getc(infile)) != EOF) {
+        // decode bp char
+        base1 = bp >> 4;
+        base2 = bp & 0xf;
+        seq1[numread] = (base1 < numbases) ? bases[base1] : 'N';
+        seq2[numread] = (base2 < numbases) ? bases[base2] : 'N';
+        numread++;
+
+        // double both buffers once they are full
+        if (numread >= seqsize) {
+            grown = (char*) realloc(seq1, sizeof(char) * (seqsize * 2));
+            if (grown == NULL) {
+                fprintf(stderr, "%s: out of memory\n", MyName);
+                return 1;
+            }
+            seq1 = grown;
+            grown = (char*) realloc(seq2, sizeof(char) * (seqsize * 2));
+            if (grown == NULL) {
+                fprintf(stderr, "%s: out of memory\n", MyName);
+                return 1;
+            }
+            seq2 = grown;
+            seqsize *= 2;
+        }
+    }
+
+    // 60 residues per output line
+    printf(">seq1");
+    for (i = 0; i < numread; i++) {
+        if (!(i%60))
+            printf("\n");
+        printf("%c", seq1[i]);
+    }
+    printf("\n>seq2");
+    for (i = 0; i < numread; i++) {
+        if (!(i%60))
+            printf("\n");
+        printf("%c", seq2[i]);
+    }
+
+    free(seq1);
+    free(seq2);
+    if (infile != stdin)
+        fclose(infile);
+
+    return 0;
+}
+
+// Print the command-line synopsis to stderr, substituting the global MyName
+// for the program name.  Always returns 1, which main() passes on as its
+// exit status.  (The original message ended in an unbalanced ']'.)
+int Usage() {
+    fprintf(stderr, " \
+Usage: %s { - | alignment_file }\n",
+            MyName);
+    return 1;
+}
diff --git a/src/utils/cextract.c b/src/utils/cextract.c
new file mode 100644
index 0000000..4ad7054
--- /dev/null
+++ b/src/utils/cextract.c
@@ -0,0 +1,113 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <assert.h>
+
+int begin, finish, seqIdx, seqExt, seqlen, numseqs, seqlen2, numseqs2;
+char name[1024], name2[1024], **seqs, **seqs2;
+
+// Count the residues of the first record of a FASTA file.
+//
+// The first line (at most 1023 characters of it) is taken as the header and
+// skipped; after that every letter, '.' or '-' is counted until the next '>'
+// or the end of the file.  readfile() tokenises the file the same way, so
+// the two must be kept in step.
+//
+// Exits with status 1 when the file cannot be opened.
+int getLength (char *filename){
+    FILE *file;
+    char buffer[1024];
+    int ch;                     // int, so that EOF is distinguishable from data
+    int length = 0;
+
+    file = fopen (filename, "r");
+    if (!file){
+        fprintf (stderr, "cextract: Error opening file: %s\n", filename);
+        exit (1);
+    }
+
+    // an empty file has no header and therefore no residues
+    if (fgets (buffer, 1024, file)){
+        while ((ch = fgetc (file)) != EOF){
+            if (ch == '>') break;
+            if (isalpha (ch) || ch == '.' || ch == '-') length++;
+        }
+    }
+    fclose (file);
+
+    return length;
+}
+
+// Load every record of a multi-FASTA alignment into memory.
+//
+//   filename  file to read
+//   seqlen    out: residues per record, as measured by getLength() on the
+//             first record; every record must have exactly this many
+//   numseqs   out: number of records read
+//   name      out: header line of the first record (including its newline);
+//             the caller supplies a buffer of at least 1024 bytes
+//   seqs      out: array of *numseqs rows of *seqlen characters each; the
+//             rows are NOT NUL-terminated
+//
+// Only letters, '.' and '-' are stored.  A record of the wrong length trips
+// an assert.  Exits with status 1 when the file cannot be opened.
+void readfile (char *filename, int *seqlen, int *numseqs, char *name, char ***seqs){
+    FILE *file;
+    char buffer[1024];
+    int ch = EOF;               // int, so that EOF is distinguishable from data
+    int i, rowsize;
+
+    *numseqs = 0;
+    *seqlen = getLength (filename);
+    rowsize = (*seqlen > 0) ? *seqlen : 1;   // never ask malloc for 0 bytes
+    strcpy (name, "");
+    *seqs = (char **) malloc (sizeof (char *) * 1); assert (*seqs);
+    (*seqs)[0] = (char *) malloc (sizeof (char) * rowsize); assert ((*seqs)[0]);
+
+    file = fopen (filename, "r");
+    if (!file){
+        fprintf (stderr, "cextract: Error opening file: %s\n", filename);
+        exit (1);
+    }
+
+    while (!feof (file)){
+        i = 0;
+        // header line; stop at end of file instead of copying a stale or
+        // uninitialised buffer into name
+        if (!fgets (buffer, 1024, file)) break;
+        if (strlen (name) == 0) strcpy (name, buffer);
+        if (feof (file)) break;
+        (*numseqs)++;
+        if (*numseqs > 1){
+            *seqs = (char **) realloc (*seqs, sizeof (char *) * (*numseqs)); assert (*seqs);
+            (*seqs)[*numseqs - 1] = (char *) malloc (sizeof (char) * rowsize); assert ((*seqs)[*numseqs - 1]);
+        }
+
+        // residues up to the next header or the end of the file
+        while ((ch = fgetc (file)) != EOF){
+            if (ch == '>') break;
+            if (isalpha (ch) || ch == '.' || ch == '-'){
+                assert (i < (*seqlen));
+                (*seqs)[*numseqs - 1][i] = ch;
+                i++;
+            }
+        }
+        // hand the '>' back so that the next fgets sees a complete header
+        if (ch == '>') ungetc (ch, file);
+        assert (i == *seqlen);
+    }
+    fclose (file);
+}
+
+// Print the extracted region as one FASTA record on stdout.
+//
+// Walks the alignment columns while counting the letters of sequence seqIdx;
+// columns whose count lies in [begin, finish] and where sequence seqExt has
+// a letter or '.' are emitted, 60 characters per line.  When a substitute
+// file was loaded (seqlen2 != 0) the character is taken from seqs2[0] at the
+// same residue offset instead of from seqs[seqExt].
+void print (void){
+    int col, refPos = 0, extPos = 0, emitted = 0;
+    char extCh;
+
+    assert (seqIdx >= 0 && seqIdx < numseqs);
+    assert (seqExt >= 0 && seqExt < numseqs);
+
+    printf ("%s", name);
+    for (col = 0; refPos <= finish && col < seqlen; col++){
+        if (isalpha (seqs[seqIdx][col])) refPos++;
+
+        extCh = seqs[seqExt][col];
+        if (!(isalpha (extCh) || extCh == '.')) continue;
+
+        assert (seqlen2 == 0 || extPos < seqlen2);
+        if (refPos >= begin && refPos <= finish){
+            if (seqlen2 == 0)
+                printf ("%c", extCh);
+            else
+                printf ("%c", seqs2[0][extPos]);
+            emitted++;
+            if (emitted % 60 == 0) printf ("\n");
+        }
+        extPos++;
+    }
+    // finish a partially filled last line
+    if (emitted % 60 != 0) printf ("\n");
+}
+
+// cextract multi_fasta_file begin end seqidx seqextract [-subst fasta_file]
+//
+// Loads the alignment, optionally a substitute sequence file, and prints the
+// requested region through print().  Exits with status 1 on a malformed
+// command line.
+int main (int argc, char** argv){
+    int i;
+
+    if (argc != 6 && !(argc == 8 && strcmp (argv[6], "-subst") == 0)){
+        fprintf (stderr, "Usage:\n\ncextract multi_fasta_file begin end seqidx seqextract\n");
+        exit (1);
+    }
+
+    begin = atoi (argv[2]);
+    finish = atoi (argv[3]);    // the original strdup'ed argv[3] here and leaked the copy
+    seqIdx = atoi (argv[4]);
+    seqExt = atoi (argv[5]);
+    seqlen2 = 0;
+
+    readfile (argv[1], &seqlen, &numseqs, name, &seqs);
+    if (argc == 8) readfile (argv[7], &seqlen2, &numseqs2, name2, &seqs2);
+    print ();
+
+    for (i = 0; i < numseqs; i++) free (seqs[i]);
+    free (seqs);
+    if (argc == 8){
+        for (i = 0; i < numseqs2; i++) free (seqs2[i]);
+        free (seqs2);
+    }
+
+    return 0;
+}
+
+
+
+
+
diff --git a/src/utils/cmerge2.pl b/src/utils/cmerge2.pl
new file mode 100755
index 0000000..dc98edd
--- /dev/null
+++ b/src/utils/cmerge2.pl
@@ -0,0 +1,207 @@
+#!/usr/bin/env perl
+use File::Basename;
+
+# Installation directory of the LAGAN tools, and the name of the working
+# sub-directory (defaults to this process id, replaced by -skipfr).
+$lagandir = $ENV{LAGAN_DIR};
+$pid = $$;
+
+# process arguments
+# Four positional arguments plus at most "-nocrop" and "-skipfr pid".  The
+# original test used "&&" and could therefore never fire.
+if (@ARGV < 4 || @ARGV > 7) {
+    print STDERR ("usage:\n cmerge seqfile mfafile draftfile outfile [-nocrop] [-skipfr pid]\n");
+    exit(1);
+}
+$nocrop = 0;
+for ($i = 4; $i < @ARGV; $i++) {
+    if ($ARGV[$i] =~ /-nocrop/){
+        $nocrop = 1;
+    }
+    elsif ($ARGV[$i] =~ /-skipfr/){
+        # the option consumes the following argument as the directory name
+        if ($i + 1 >= @ARGV) {
+            print STDERR "cmerge: -skipfr requires a pid argument\n";
+            exit(1);
+        }
+        $skipfr = 1;
+        $pid = $ARGV[++$i];
+        chomp $pid;
+    }
+    else {
+        print STDERR "Bad arg to cmerge: $ARGV[$i]\n";
+        exit(1);
+    }
+}
+
+# Everything below works inside ./$pid, which this script never creates, so
+# it only proceeds when -skipfr named that directory.
+if (!$skipfr) {
+    print STDERR "cmerge: -skipfr pid is required\n";
+    exit(1);
+}
+# working directory: <current directory>/<pid>
+$newdir = `pwd`;
+chomp $newdir;
+$newdir = "$newdir/$pid";
+
+# The log is appended to; the merge info file is started afresh.  Stop right
+# away when either cannot be opened instead of silently writing nowhere.
+open (LOGFILE, '>>', "$newdir/log")
+    or die "cmerge: cannot open $newdir/log: $!\n";
+open (INFOFILE, '>', "$newdir/minfo")
+    or die "cmerge: cannot open $newdir/minfo: $!\n";
+
+print STDERR ("\n");
+print STDERR ("Computing Contig Overlaps\n");
+print STDERR ("-------------------------\n");
+
+print LOGFILE ("\n");
+print LOGFILE ("Computing Contig Overlaps\n");
+print LOGFILE ("-------------------------\n");
+
+# initialize merged file (and its masked twin) with just a FASTA header
+open (OFILE, '>', $ARGV[3])
+    or die "cmerge: cannot open $ARGV[3]: $!\n";
+print OFILE (">merged\n");
+close (OFILE);
+`cp $ARGV[3] $ARGV[3].masked`;
+
+# initialize padding file: the spacer placed between unconnected contigs
+open (OFILE, '>', "$newdir/padding")
+    or die "cmerge: cannot open $newdir/padding: $!\n";
+print OFILE (">padding\n");
+print OFILE ("NNNNNNNNNNNNNNNNNNNN.NNNNNNNNNNNNNNNNNNNN\n");
+close (OFILE);
+$padlength = `$lagandir/utils/getlength $newdir/padding`; chomp $padlength;
+
+# other initialization
+$totlength = `$lagandir/utils/getlength $ARGV[0]`;
+chomp $totlength;
+$mergedEnd = 0;
+
+# read contig list
+# Each entry of the draft file has the form
+#   NAME.mfa --> (b e) score=S, offset=(o1 o2), index=N
+# The first three lines of the file are not entries and are skipped.
+$numContigs = 0;
+@list = `cat $ARGV[2]`;
+
+for ($i = 3; $i < @list; $i++){
+    @fields = $list[$i] =~ /(.*)\.mfa --\> \((\d+) (\d+)\) score=(\d+), offset=\((\d+) (\d+)\), index=(\d+)/;
+    if (@fields != 7){
+        # without this check a non-matching line (a blank one, say) would be
+        # filled in from the captures of the previous match
+        print STDERR "cmerge: skipping unparsable draft line: $list[$i]";
+        next;
+    }
+
+    $idx = $numContigs;
+    ($filenames[$idx], $seq1Begin[$idx], $seq1End[$idx], $score[$idx],
+     $s1shifts[$idx], $s2shifts[$idx], $num[$idx]) = @fields;
+
+    # translate both ends of the range on sequence 1 into positions on the
+    # contig: remove the sequence-1 shift, look the position up in the contig
+    # alignment, then add the sequence-2 shift
+    $temp = $seq1Begin[$idx] - $s1shifts[$idx];
+    $seq2Begin[$idx] = `$lagandir/utils/getcontigpos $filenames[$idx].mfa $temp`; chomp $seq2Begin[$idx];
+    $seq2Begin[$idx] += $s2shifts[$idx];
+
+    $temp = $seq1End[$idx] - $s1shifts[$idx];
+    $seq2End[$idx] = `$lagandir/utils/getcontigpos $filenames[$idx].mfa $temp`; chomp $seq2End[$idx];
+    $seq2End[$idx] += $s2shifts[$idx];
+
+    print STDERR "$filenames[$idx].mfa --> $seq1Begin[$idx] $seq1End[$idx] $score[$idx] $s1shifts[$idx] $s2shifts[$idx] $num[$idx] $seq2Begin[$idx] $seq2End[$idx]\n";
+
+    $numContigs++;
+}
+
+# extract contigs
+# Split the contig file argument into name and directory, and rebuild the
+# working directory path <current directory>/<pid>.
+$contigfile = basename ($ARGV[1]);
+$contigdir = dirname ($ARGV[1]);
+$newdir = `pwd`;
+chomp $newdir;
+$newdir = "$newdir/$pid";
+
+# start out merged file with only padding
+# (seqmerge cannot write onto one of its inputs, hence the move to *.new;
+# the same is done for the masked copy)
+`mv $ARGV[3] $ARGV[3].new`;
+`$lagandir/utils/seqmerge $ARGV[3].new $newdir/padding > $ARGV[3]`;
+`mv $ARGV[3].masked $ARGV[3].masked.new`;
+`$lagandir/utils/seqmerge $ARGV[3].masked.new $newdir/padding > $ARGV[3].masked`;
+# bookkeeping for the first contig: its start position in the merged
+# sequence and the number of bases chopped from its start
+$contigStart[0] = 1;
+$startChop[0] = 0;
+
+# "current" holds the contig (plain and masked) that has not been written
+# to the merged file yet
+`cp $filenames[0] $newdir/current`;
+`cp $filenames[0].masked $newdir/current.masked`;
+
+# merge contigs
+# For every contig after the first: anchor it against the pending contig in
+# $newdir/current, then either flush the pending contig followed by padding
+# (no overlap) or flush only the part that precedes the overlap and carry on
+# with the new contig from the overlap onwards.  One info record is written
+# per flushed contig.
+for ($i = 1; $i < $numContigs; $i++){
+    `$lagandir/rechaos.pl $newdir/current $filenames[$i] -recurse \"(12,0,40,0)x\" -maskedonly > $newdir/currentanchs`;
+
+    # find the overlap: begin/end in the pending contig, begin/end in the new one
+    `$lagandir/utils/getoverlap $newdir/currentanchs` =~ /(-?\d+) (-?\d+) (-?\d+) (-?\d+)/;
+    $rangebegin1 = $1;
+    $rangeend1 = $2;
+    $rangebegin2 = $3;
+    $rangeend2 = $4;
+
+    $thislength = `$lagandir/utils/getlength $filenames[$i-1]`; chomp $thislength;
+    $nextlength = `$lagandir/utils/getlength $filenames[$i]`; chomp $nextlength;
+
+    # if no overlap, flush the buffer
+    if ($rangebegin1 == -1 && $rangeend1 == -1){
+
+        print STDERR "No overlap found...\n";
+
+        `mv $ARGV[3] $ARGV[3].new`;
+        `$lagandir/utils/seqmerge $ARGV[3].new $newdir/current $newdir/padding > $ARGV[3]`;
+        `cp $filenames[$i] $newdir/current`;
+
+        `mv $ARGV[3].masked $ARGV[3].masked.new`;
+        `$lagandir/utils/seqmerge $ARGV[3].masked.new $newdir/current.masked $newdir/padding > $ARGV[3].masked`;
+        `cp $filenames[$i].masked $newdir/current.masked`;
+
+        $contigEnd[$i-1] = $contigStart[$i-1] + $thislength - 1;
+        $contigStart[$i] = $contigEnd[$i-1] + $padlength + 1;
+        $endChop[$i-1] = 0;
+        $startChop[$i] = 0;
+    }
+    else {
+        print STDERR "Overlap detected!\n";
+
+        # extract the overlapped region > overlap
+        $j = $rangebegin1 - 1;
+
+        # Nothing is written when the overlap starts at the first base, so
+        # the length must not be carried over from an earlier iteration.
+        $overlaplength = 0;
+
+        if ($j > 0){
+            `$lagandir/utils/cextract $newdir/current 1 $j 0 0 > $newdir/overlap`;
+            `$lagandir/utils/cextract $newdir/current.masked 1 $j 0 0 > $newdir/overlap.masked`;
+            $overlaplength = `$lagandir/utils/getlength $newdir/overlap`; chomp $overlaplength;
+
+            `mv $ARGV[3] $ARGV[3].new`;
+            `$lagandir/utils/seqmerge $ARGV[3].new $newdir/overlap > $ARGV[3]`;
+            `mv $ARGV[3].masked $ARGV[3].masked.new`;
+            `$lagandir/utils/seqmerge $ARGV[3].masked.new $newdir/overlap.masked > $ARGV[3].masked`;
+        }
+
+        # extract the nonoverlapped region > current
+        `$lagandir/utils/cextract $filenames[$i] $rangebegin2 $nextlength 0 0 > $newdir/current`;
+        `$lagandir/utils/cextract $filenames[$i].masked $rangebegin2 $nextlength 0 0 > $newdir/current.masked`;
+
+        $contigEnd[$i-1] = $contigStart[$i-1] + $overlaplength - 1;
+        $contigStart[$i] = $contigEnd[$i-1] + 1;
+        $endChop[$i-1] = $thislength - $rangeend1;
+        $startChop[$i] = $rangebegin2 - 1;
+    }
+
+    # a file name containing ".rc" marks a reverse-complemented contig
+    if (index ($filenames[$i-1], ".rc") == -1) { $direction = "+"; } else { $direction = "-"; }
+    # first line of the contig file without its leading ">"
+    @temp = `head $filenames[$i-1]`;
+    chomp $temp[0]; $temp[0] = substr $temp[0], 1;
+
+    print INFOFILE "$temp[0]\n";
+    print INFOFILE "$num[$i-1] $seq1Begin[$i-1] $seq1End[$i-1] $contigStart[$i-1] $contigEnd[$i-1] $startChop[$i-1] $endChop[$i-1] $direction $score[$i-1] $seq2Begin[$i-1] $seq2End[$i-1]\n";
+
+}
+
+# Flush the last pending contig, followed by padding, into the merged file
+# and its masked twin, then write the info record for that contig.
+$thislength = `$lagandir/utils/getlength $filenames[$numContigs - 1]`; chomp $thislength;
+$contigEnd[$numContigs - 1] = $contigStart[$numContigs - 1] + $thislength - 1;
+$endChop[$numContigs - 1] = 0;
+
+`mv $ARGV[3] $ARGV[3].new`;
+`$lagandir/utils/seqmerge $ARGV[3].new $newdir/current $newdir/padding > $ARGV[3]`;
+`mv $ARGV[3].masked $ARGV[3].masked.new`;
+`$lagandir/utils/seqmerge $ARGV[3].masked.new $newdir/current.masked $newdir/padding > $ARGV[3].masked`;
+
+# a file name containing ".rc" marks a reverse-complemented contig
+if (index ($filenames[$numContigs - 1], ".rc") == -1) { $direction = "+"; } else { $direction = "-"; }
+# first line of the contig file without its leading ">"
+# (the array sigil on this line had been mangled to " at temp")
+@temp = `head $filenames[$numContigs - 1]`;
+chomp $temp[0]; $temp[0] = substr $temp[0], 1;
+print INFOFILE "$temp[0]\n";
+print INFOFILE "$num[$numContigs - 1] $seq1Begin[$numContigs - 1] $seq1End[$numContigs - 1] $contigStart[$numContigs - 1] $contigEnd[$numContigs - 1] $startChop[$numContigs - 1] $endChop[$numContigs - 1] $direction $score[$numContigs - 1] $seq2Begin[$numContigs - 1] $seq2End[$numContigs - 1]\n";
+
+
+print STDERR "Merging complete!\n\n";
+print LOGFILE "Merging complete!\n\n";
+
+# 1. write getoverlap() -- given a set of chaos hits, find the beginning and end in both seqs
+# 2. implement contigStart, contigStop -- positions of the contig begins/ends in the merged draft sequence
+# 3. startChop, endChop -- number chopped from each end
+# 4. secFrom, secTo -- pos in the chopped contig sequence
diff --git a/src/utils/contigorder.c b/src/utils/contigorder.c
new file mode 100644
index 0000000..ee18332
--- /dev/null
+++ b/src/utils/contigorder.c
@@ -0,0 +1,350 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#define MAX_CELLS ((long long int) 100000000)
+#define MAX_TIME ((long long int) 100000 * (long long int) 100000)
+
+int failed = 0;
+
+// Scan a range file and report its dimensions.
+//
+// Expected layout:
+//   numContigs = N
+//   seqLen = L
+//   (b e) s_1 ... s_N        one line per hit
+//
+//   filename    file to scan
+//   numContigs  out: N from the header
+//   seqLen      out: L from the header (0 when that line is missing)
+//   numHits     out: number of lines that begin with a "(b e)" pair
+//
+// Lines that do not begin with such a pair are skipped.  Exits with status 1
+// when the file cannot be opened or has no numContigs header.
+void getFileInfo (char *filename, int *numContigs, int *seqLen, int *numHits){
+    FILE *file;
+    int dummy, i, ch, matched;
+
+    if (!(file = fopen (filename, "r"))){
+        fprintf (stderr, "contigorder: Error opening file: %s\n", filename);
+        exit (1);
+    }
+
+    if (fscanf (file, "numContigs = %d\n", numContigs) != 1){
+        fprintf (stderr, "contigorder: Missing numContigs header in file: %s\n", filename);
+        exit (1);
+    }
+    *seqLen = 0;
+    fscanf (file, "seqLen = %d\n", seqLen);
+
+    *numHits = 0;
+    while (!feof (file)){
+        matched = fscanf (file, "(%d %d)", &dummy, &dummy);
+        if (matched == EOF) break;
+        if (matched == 2){
+            // step over the per-contig scores
+            for (i = 0; i < *numContigs; i++){
+                if (fscanf (file, "%d", &dummy) != 1) break;
+            }
+            (*numHits)++;
+        }
+        // drop the rest of the line; stopping at EOF as well keeps a missing
+        // final newline from hanging the loop
+        do {
+            ch = fgetc (file);
+        } while (ch != '\n' && ch != EOF);
+    }
+
+    fclose (file);
+}
+
+// Read the hit table of a range file.
+//
+//   filename    file to read (same layout as scanned by getFileInfo)
+//   numContigs  number of score columns per hit line
+//   seqLen      not used here
+//   numHits     number of hit lines to read
+//   score       out: numHits x numContigs matrix, score[hit][contig];
+//               entries that are absent from the file stay 0
+//   ranges      out: numHits x 2 matrix holding the "(b e)" pair of each hit
+//
+// Lines that do not begin with a "(b e)" pair are skipped.  Exits with
+// status 1 when the file cannot be opened.
+void getScores (char *filename, int numContigs, int seqLen, int numHits, int ***score, int ***ranges){
+    FILE *file;
+    int i, j, ch, matched;
+    int rows = (numHits > 0) ? numHits : 1;   // never ask malloc for 0 bytes
+
+    *score = (int **) malloc (sizeof (int *) * rows);
+    assert (*score);
+    *ranges = (int **) malloc (sizeof (int *) * rows);
+    assert (*ranges);
+    for (i = 0; i < numHits; i++){
+        (*score)[i] = (int *) calloc (numContigs, sizeof (int));
+        assert ((*score)[i]);
+        (*ranges)[i] = (int *) calloc (2, sizeof (int));
+        assert ((*ranges)[i]);
+    }
+
+    if (!(file = fopen (filename, "r"))){
+        fprintf (stderr, "contigorder: Error opening file: %s\n", filename);
+        exit (1);
+    }
+
+    // the two header lines are parsed but their values discarded
+    fscanf (file, "numContigs = %*d\n");
+    fscanf (file, "seqLen = %*d\n");
+
+    i = 0;
+    while (!feof (file) && i < numHits){
+        matched = fscanf (file, "(%d %d)", &((*ranges)[i][0]), &((*ranges)[i][1]));
+        if (matched == EOF) break;
+        if (matched == 2){
+            for (j = 0; j < numContigs; j++){
+                if (fscanf (file, "%d", &((*score)[i][j])) != 1) break;
+            }
+            i++;
+        }
+        // drop the rest of the line; stopping at EOF as well keeps a missing
+        // final newline from hanging the loop
+        do {
+            ch = fgetc (file);
+        } while (ch != '\n' && ch != EOF);
+    }
+
+    fclose (file);
+}
+
+// Label contig `here` with groupNum and, recursively, every still unlabelled
+// contig whose hit interval [first, last] intersects the interval of a
+// contig already reached.  Contigs with first == -1 are never touched.
+void floodfill (int *labels, int *first, int *last, int numContigs, int here, int groupNum){
+    int other;
+
+    labels[here] = groupNum;
+    for (other = 0; other < numContigs; other++){
+        if (other == here) continue;
+        if (labels[other] != -1 || first[other] == -1) continue;
+
+        // the two intervals intersect
+        if (first[here] <= last[other] && last[here] >= first[other]){
+            floodfill (labels, first, last, numContigs, other, groupNum);
+        }
+    }
+}
+
+// Partition the contigs into groups of overlapping hit intervals.
+//
+// For every contig the first and last hit with a positive score delimit its
+// interval.  Contigs whose intervals are connected through intersections
+// receive the same group number; groups are numbered from 0 in order of
+// their lowest contig index.  A contig without any positive score keeps the
+// label -1.
+//
+// Returns a freshly allocated array of numContigs labels; the caller frees it.
+int *getLabels (int **score, int numContigs, int numHits){
+    int *labels, *first, *last;
+    int contig, hit, nextGroup = 0;
+
+    labels = (int *) calloc (numContigs, sizeof (int)); assert (labels);
+    first = (int *) calloc (numContigs, sizeof (int)); assert (first);
+    last = (int *) calloc (numContigs, sizeof (int)); assert (last);
+
+    for (contig = 0; contig < numContigs; contig++){
+        labels[contig] = -1;
+        first[contig] = -1;
+        for (hit = 0; hit < numHits; hit++){
+            if (score[hit][contig] <= 0) continue;
+            if (first[contig] == -1) first[contig] = hit;
+            last[contig] = hit;
+        }
+    }
+
+    for (contig = 0; contig < numContigs; contig++){
+        if (labels[contig] != -1 || first[contig] == -1) continue;
+        floodfill (labels, first, last, numContigs, contig, nextGroup);
+        nextGroup++;
+    }
+
+    free (first);
+    free (last);
+    return labels;
+}
+
+// Enumerate the candidate hit ranges for a group of contigs.
+//
+// A range is a pair of hit indices (lo, hi) with lo <= hi for which at least
+// one contig listed in cols scores above 0 in both hit lo and hit hi.
+// Entry 0 of the result is a sentinel range (-1, -1).
+//
+//   first, last  out: freshly allocated arrays holding lo and hi of every
+//                range; the caller frees them
+//
+// Returns the number of entries, sentinel included.
+int makeRanges (int **score, int numHits, int *cols, int numCols, int **first, int **last){
+    int hi, lo, c, shared;
+    int total = 1, filled = 1;
+
+    // pass 1: count the ranges so that the arrays can be sized exactly
+    for (hi = 0; hi < numHits; hi++){
+        for (lo = 0; lo <= hi; lo++){
+            shared = 0;
+            for (c = 0; c < numCols && !shared; c++){
+                shared = (score[hi][cols[c]] > 0) && (score[lo][cols[c]] > 0);
+            }
+            if (shared) total++;
+        }
+    }
+
+    *first = (int *) calloc (total, sizeof (int)); assert (*first);
+    *last = (int *) calloc (total, sizeof (int)); assert (*last);
+
+    // initial range
+    (*first)[0] = -1;
+    (*last)[0] = -1;
+
+    // pass 2: record them in the same order
+    for (hi = 0; hi < numHits; hi++){
+        for (lo = 0; lo <= hi; lo++){
+            shared = 0;
+            for (c = 0; c < numCols && !shared; c++){
+                shared = (score[hi][cols[c]] > 0) && (score[lo][cols[c]] > 0);
+            }
+            if (!shared) continue;
+            (*first)[filled] = lo;
+            (*last)[filled] = hi;
+            filled++;
+        }
+    }
+
+    return filled;
+}
+
+// Tabulate the score of every contig on every range.
+//
+// scoreOf[c][r] is the sum of score[hit][cols[c]] over the hits
+// first[r]..last[r]; the sentinel range r == 0 always scores 0.
+//
+// Returns a freshly allocated numCols x numRanges matrix; the caller frees
+// each row and then the matrix.
+int **calcRangeScores (int **score, int *cols, int numCols, int *first, int *last, int numRanges){
+    int c, r, hit, sum, **scoreOf;
+
+    scoreOf = (int **) malloc (sizeof (int *) * numCols); assert (scoreOf);
+    for (c = 0; c < numCols; c++){
+        scoreOf[c] = (int *) malloc (sizeof (int) * numRanges); assert (scoreOf[c]);
+
+        for (r = 0; r < numRanges; r++){
+            sum = 0;
+            if (r > 0){
+                for (hit = first[r]; hit <= last[r]; hit++){
+                    sum += score[hit][cols[c]];
+                }
+            }
+            scoreOf[c][r] = sum;
+        }
+    }
+
+    return scoreOf;
+}
+
+// Choose, for one group of contigs, which contig goes on which hit range so
+// that the summed score is maximal and the chosen ranges do not overlap.
+//
+//   score       score[hit][contig]
+//   numContigs  not used here
+//   numHits     number of rows of score
+//   cols        the contigs of this group
+//   numCols     number of entries in cols
+//   ranges      ranges[hit] = coordinate pair of that hit
+//   results     out: one row per placed contig, appended at *resultCtr:
+//               {contig, start of first hit, end of last hit, score}
+//   resultCtr   in/out: next free row of results
+//
+// Dynamic programme over (set of contigs as a bit mask, range): best[set][r]
+// is the best total for placing exactly `set` with the last contig on range
+// r, where the previous range must end before r begins.  When the table
+// would exceed MAX_CELLS or MAX_TIME, "ordering failed" is printed, the
+// global `failed` is set and nothing is appended.
+void solveOrder (int **score, int numContigs, int numHits, int *cols, int numCols, int **ranges,
+                 int **results, int *resultCtr){
+    int i, j, k, l, m;
+    int numStates, numRanges;
+    int **best, *first, *last, newScore, **scoreOf;
+    // bestState starts at 0 so that the traceback is skipped when no
+    // placement scores above 0 (both were read uninitialised before)
+    int bestScore = 0, bestState = 0, bestRange = 0, newBest, addedScore;
+    int *stateList, *rangeList, *scoreList;
+    int work, totwork;
+
+    // 1 << numCols is undefined once it reaches the sign bit; a group that
+    // wide is far beyond MAX_CELLS anyway
+    if (numCols >= (int) (sizeof (int) * 8) - 1){
+        fprintf (stderr, "ordering failed, retrying... (numCols = %d)\n", numCols);
+        printf ("ordering failed\n");
+        failed = 1;
+        return;
+    }
+    numStates = 1 << numCols;
+
+    numRanges = makeRanges (score, numHits, cols, numCols, &first, &last);
+
+    if ((long long int) numRanges * (long long int) numStates > MAX_CELLS ||
+        (long long int) numRanges * (long long int) numStates * (long long int) numCols * (long long int) numRanges > MAX_TIME){
+        fprintf (stderr, "ordering failed, retrying... (numRanges = %d, numStates = %d)\n", numRanges, numStates);
+        printf ("ordering failed\n");
+        failed = 1;
+        free (first);
+        free (last);
+        return;
+    }
+
+    // calloc leaves the whole table at 0, which is the required value for
+    // the empty set (row 0) and for the sentinel range (column 0)
+    best = (int **) malloc (sizeof (int *) * numStates); assert (best);
+    for (i = 0; i < numStates; i++){
+        best[i] = (int *) calloc (numRanges, sizeof (int)); assert (best[i]);
+    }
+
+    scoreOf = calcRangeScores (score, cols, numCols, first, last, numRanges);
+
+    // -- DP solution ---------------
+
+    work = 0;
+    totwork = (numRanges - 1) * (numStates - 1);
+
+    // search over all state transitions
+    for (i = 1; i < numRanges; i++){
+        for (j = 1; j < numStates; j++){
+            newBest = -1;
+
+            // compute best previous state: take contig k out of set j and
+            // try every range l that ends before range i begins
+            for (k = 0; k < numCols; k++) if (j & (1 << k)){
+                m = j - (1 << k);
+                addedScore = scoreOf[k][i];
+                for (l = 0; l < numRanges; l++) if (last[l] < first[i]){
+                    newScore = best[m][l] + addedScore;
+                    if (newScore > newBest){
+                        newBest = newScore;
+                    }
+                }
+            }
+
+            best[j][i] = newBest;
+
+            if (best[j][i] > bestScore){
+                bestScore = best[j][i];
+
+                bestState = j;
+                bestRange = i;
+            }
+            work++;
+            if ((work % 100000) == 0){
+                fprintf (stderr, "WORKING %d/%d\n", work, totwork);
+            }
+        }
+    }
+
+    // -- Compute traceback ---------
+
+    l = 0;
+    stateList = (int *) calloc (numCols, sizeof (int)); assert (stateList);
+    rangeList = (int *) calloc (numCols, sizeof (int)); assert (rangeList);
+    scoreList = (int *) calloc (numCols, sizeof (int)); assert (scoreList);
+
+    while (bestState != 0){
+
+        // k stays 1 until the predecessor that produced the stored score is found
+        k = 1;
+        for (i = 0; k && i < numCols; i++) if (bestState & (1 << i)){
+            m = bestState - (1 << i);
+            for (j = 0; k && j < numRanges; j++) if (last[j] < first[bestRange]){
+                newScore = best[m][j] + scoreOf[i][bestRange];
+                if (newScore == best[bestState][bestRange]){
+                    stateList[l] = cols[i];
+                    rangeList[l] = bestRange;
+                    scoreList[l] = scoreOf[i][bestRange];
+                    l++;
+                    bestState = m;
+                    bestRange = j;
+                    k = 0;
+                }
+            }
+        }
+        // cannot happen with a consistent table; stop rather than spin
+        if (k) break;
+    }
+
+    // -- Report traceback ----------
+
+    // the traceback runs backwards, so emit it in reverse
+    for (i = l - 1; i >= 0; i--){
+        results[*resultCtr][0] = stateList[i];
+        results[*resultCtr][1] = ranges[first[rangeList[i]]][0];
+        results[*resultCtr][2] = ranges[last[rangeList[i]]][1];
+        results[*resultCtr][3] = scoreList[i];
+        (*resultCtr)++;
+    }
+
+    for (i = 0; i < numCols; i++) free (scoreOf[i]);
+    free (scoreOf);
+    for (i = 0; i < numStates; i++) free (best[i]);
+    free (best);
+    free (first);
+    free (last);
+    free (stateList);
+    free (rangeList);
+    free (scoreList);
+}
+
+// qsort comparator for an array of int* rows: orders the rows by ascending
+// value of their element [1].
+int compFn (const void *a, const void *b){
+    const int *rowA = *(int **) a;
+    const int *rowB = *(int **) b;
+
+    return rowA[1] - rowB[1];
+}
+
+// Order the contigs group by group and print the result.
+//
+// The contigs are partitioned with getLabels(); each group is handed to
+// solveOrder() in ascending group number until a group turns out empty or
+// an ordering fails.  Unless an ordering failed, the collected placements
+// are sorted by their start coordinate and printed one per line as
+// "contig --> (start end) score".
+void findGroups (int numContigs, int seqLen, int numHits, int **score, int **ranges){
+    int *labels, *columns, **results;
+    int group, count, i, resultCtr = 0;
+
+    labels = getLabels (score, numContigs, numHits);
+    columns = (int *) malloc (sizeof (int) * numContigs); assert (columns);
+    results = (int **) malloc (sizeof (int *) * numContigs); assert (results);
+    for (i = 0; i < numContigs; i++){
+        results[i] = (int *) calloc (4, sizeof (int)); assert (results[i]);
+    }
+
+    for (group = 0; !failed; group++){
+        // collect the members of this group
+        count = 0;
+        for (i = 0; i < numContigs; i++){
+            if (labels[i] == group)
+                columns[count++] = i;
+        }
+        if (count == 0) break;
+        solveOrder (score, numContigs, numHits, columns, count, ranges, results, &resultCtr);
+    }
+
+    if (!failed){
+        qsort (results, resultCtr, sizeof (int *), compFn);
+        for (i = 0; i < resultCtr; i++){
+            printf ("%d --> (%d %d) %d\n", results[i][0], results[i][1], results[i][2], results[i][3]);
+        }
+    }
+
+    for (i = 0; i < numContigs; i++) free (results[i]);
+    free (results);
+    free (labels);
+    free (columns);
+}
+
+// contigorder rangefile
+//
+// Reads the range file twice (once for its dimensions, once for its
+// contents), orders the contigs and prints the ordering.
+int main (int argc, char **argv){
+    int numContigs, seqLen, numHits, hit;
+    int **score, **ranges;
+    char *rangefile;
+
+    if (argc != 2){
+        fprintf (stderr, "Usage:\ncontigorder rangefile\n");
+        exit (1);
+    }
+    rangefile = argv[1];
+
+    getFileInfo (rangefile, &numContigs, &seqLen, &numHits);
+    getScores (rangefile, numContigs, seqLen, numHits, &score, &ranges);
+    findGroups (numContigs, seqLen, numHits, score, ranges);
+
+    for (hit = 0; hit < numHits; hit++){
+        free (score[hit]);
+        free (ranges[hit]);
+    }
+    free (score);
+    free (ranges);
+
+    return 0;
+}
+
diff --git a/src/utils/cstat.c b/src/utils/cstat.c
new file mode 100644
index 0000000..9555f6f
--- /dev/null
+++ b/src/utils/cstat.c
@@ -0,0 +1,252 @@
+#include <assert.h>
+#include <ctype.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define MAX_SEQ 31
+#define MAX(a,b) ((a)>(b)?(a):(b))
+#define MIN(a,b) ((a)<(b)?(a):(b))
+
+#define CNTS_LEN 6
+#define CNTS_A 0
+#define CNTS_T 1
+#define CNTS_C 2
+#define CNTS_G 3
+#define CNTS_N 4
+#define CNTS_GAP 5
+
+double logs[MAX_SEQ+1];
+double maxentr;
+char* alpha = "ATCGN-";
+int s1shift = 0, s2shift = 0;
+
+// A pair of ints.  NOTE(review): presumably a start/end position pair; it is
+// not referenced by the code visible in this file section -- confirm before
+// relying on it.
+typedef struct pair_ints {
+ int s;
+ int e;
+} pair;
+
+// A multiple alignment held column-wise, as filled in by readMultial/readseq.
+typedef struct align_res {
+ // name of each sequence: its FASTA header line without the leading '>'
+ char* names[MAX_SEQ];
+ // number of alignment columns
+ int algnlen;
+ // number of sequences read
+ int numseq;
+ // one bit mask per column: bit j is set when sequence j has a character
+ // other than the gap '-' in that column
+ int* algn;
+ // cnts[CNTS_x][column]: how many sequences show letter x in that column
+ char* cnts[CNTS_LEN];
+} align;
+
+// Count the non-blank characters of the first FASTA record in `input`.
+//
+// The first line (at most 254 characters of it) must start with '>' and is
+// skipped; everything up to the next '>' or the end of the file is counted,
+// white space excepted.  The stream is rewound afterwards.
+//
+// Returns the count, or 0 when the stream is empty or already at its end.
+// Exits with status 1 when the first line is not a FASTA header.
+int cntlets(FILE* input) {
+    int numread=0;
+    char temp[256];
+    int currchar = '~';         // int: a char cannot represent EOF portably
+
+    if (feof(input))
+        return 0;
+    // nothing to read at all: do not inspect an unset buffer
+    if (!fgets(temp, 255, input))
+        return 0;
+    if (temp[0] != '>') {
+        fprintf(stderr, "File is not in FASTA format!!\n");
+        exit(1);
+    }
+    while ((currchar != '>') && (currchar != EOF)) {
+        currchar = fgetc(input);
+        // the terminating '>' or EOF is counted too and taken off below
+        if (currchar == EOF || !isspace(currchar)) {
+            numread++;
+        }
+    }
+    rewind(input);
+    return numread-1;
+}
+
+// Read the next FASTA record from `input` into column-wise alignment `myal`.
+//
+//   seqnum    index of this sequence: selects names[seqnum] and the bit it
+//             sets in algn[]; must be below MAX_SEQ
+//   checksum  number of columns every record must have; algn[] and cnts[][]
+//             are assumed to hold exactly that many entries
+//
+// Characters are upper-cased and anything outside "ATCGN-" is treated as
+// 'N'.  For every column the matching cnts[] counter is incremented and,
+// unless the character is a gap, bit seqnum of algn[] is set.
+//
+// Returns 1 after reading a record and 0 when the input is exhausted.  Exits
+// with status 1 when the record does not start with '>', when there are too
+// many sequences, or when its length differs from checksum.
+int readseq(FILE* input, align* myal, int seqnum, int checksum) {
+    int numread=0, help;
+    char temp[256];
+    char *eol;
+    int currchar;               // int: a char cannot represent EOF portably
+
+    if (feof(input))
+        return 0;
+    if (!fgets(temp, 255, input))
+        return 0;
+    if (temp[0] != '>') {
+        fprintf(stderr, "File is not in FASTA format!!\n");
+        exit(1);
+    }
+    // names[] has MAX_SEQ slots and algn[] one bit per sequence
+    if (seqnum >= MAX_SEQ) {
+        fprintf(stderr, "Too many sequences (at most %d supported)!!\n", MAX_SEQ);
+        exit(1);
+    }
+    // strlen(temp) bytes hold the header without its '>' plus the terminator
+    myal->names[seqnum] = (char*) malloc((strlen(temp))*sizeof(char));
+    assert(myal->names[seqnum]);
+    strcpy(myal->names[seqnum], temp+1);
+    // an over-long header or a last line without newline has none to strip
+    if ((eol = strchr(myal->names[seqnum], '\n')) != NULL)
+        *eol = 0;
+
+    currchar = fgetc(input);
+    while (numread <= checksum &&(currchar != '>') && (currchar != EOF)) {
+        if (!isspace(currchar)) {
+            currchar = toupper(currchar);
+            // strchr also "finds" the string terminator, so reject NUL explicitly
+            if (currchar == 0 || !strchr(alpha, currchar)) {
+                currchar = 'N';
+            }
+            // One character beyond checksum is still consumed so that the
+            // length check below can fire, but it is not stored: column
+            // `checksum` lies outside the arrays.
+            if (numread < checksum) {
+                help = strchr(alpha, currchar)-alpha;
+                myal->cnts[help][numread]++;
+                if (help != CNTS_GAP) {
+                    myal->algn[numread] |= (1 << seqnum);
+                }
+            }
+            numread++;
+        }
+        currchar = fgetc(input);
+    }
+    // hand the '>' back so that the next call sees a complete header
+    if (currchar == '>')
+        ungetc(currchar, input);
+    if (numread != checksum) {
+        fprintf(stderr, "Sequence (%s) of different lengths (%d v. %d)!!\n",
+                myal->names[seqnum], numread, checksum);
+        exit(1);
+    }
+    return 1;
+}
+
+
+// Read a whole multi-FASTA alignment into a freshly allocated align.
+//
+// The length of the first record fixes the number of columns; every further
+// record is read with readseq(), which insists on the same length.  The
+// per-column arrays start out zeroed.  The caller owns the result.
+align* readMultial(FILE* alfile) {
+    int letcnt = cntlets(alfile), i, j;
+    size_t cells = (letcnt > 0) ? (size_t) letcnt : 1;   // never allocate 0 elements
+    align* res = (align*) malloc (sizeof(align));
+
+    assert (res);
+    res->algn = (int*) calloc (cells, sizeof(int));
+    assert (res->algn);
+    for (j=0; j<CNTS_LEN; j++) {
+        res->cnts[j] = (char*) calloc (cells, sizeof(char));
+        assert (res->cnts[j]);
+    }
+
+    // read records until readseq reports the end of the input
+    i = 0;
+    while (readseq(alfile, res, i, letcnt))
+        i++;
+
+    res->numseq = i;
+    res->algnlen = letcnt;
+    return res;
+}
+
+// Number of sequence pairs that show the same nucleotide in column i: for
+// each of A, T, C and G, n sequences carrying that letter contribute
+// n * (n - 1) / 2 pairs.  N and gap characters never count as a match.
+//
+// Declared static: a plain C99 `inline` definition provides no external
+// definition, so a call the compiler does not inline fails to link.
+static inline int getScore (align* a, int i){
+    return
+        ((a->cnts[0][i] * (a->cnts[0][i] - 1)) +
+         (a->cnts[1][i] * (a->cnts[1][i] - 1)) +
+         (a->cnts[2][i] * (a->cnts[2][i] - 1)) +
+         (a->cnts[3][i] * (a->cnts[3][i] - 1))) / 2;
+}
+
+// Advance the column cursor *i up to trgt and, for every column passed,
+// add one to pos[j] for each sequence j that has a non-gap character there.
+// Does nothing when *i is already at or beyond trgt.
+void skipto (align *myal, int trgt, int *i, int* pos){
+    int seq, mask;
+
+    for (; *i < trgt; (*i)++){
+        mask = myal->algn[*i];
+        for (seq = 0; seq < myal->numseq; seq++){
+            if ((mask & (1 << seq)) > 0)
+                pos[seq]++;
+        }
+    }
+}
+
+// Print every conserved region, one per line, as
+//   (first last) --> (start end), (start end)...
+// where the first pair is in alignment columns and each following pair is
+// in residue positions of one sequence, obtained by counting non-gap
+// characters up to the region's first and last column.
+//
+// For exactly two sequences the global shifts are added (s1shift to the
+// column pair and to sequence 0, s2shift to sequence 1).  For any other
+// number of sequences the positions of sequence 0 are printed once per
+// sequence and no shift is applied; the original author marked that branch
+// as a hack that cannot handle multiple sequences.
+void print (align *myal, int *first, int *last, int len){
+    int *start, *end, r, seq, s = 0, e = 0;
+    int nseq = myal->numseq;
+
+    start = (int *) malloc (sizeof (int) * myal->numseq); assert (start);
+    end = (int *) malloc (sizeof (int) * myal->numseq); assert (end);
+
+    for (seq = 0; seq < nseq; seq++){
+        start[seq] = 0;
+        end[seq] = 0;
+    }
+
+    for (r = 0; r < len; r++){
+        // both cursors only move forward, so regions must come in column order
+        skipto (myal, first[r], &s, start);
+        skipto (myal, last[r], &e, end);
+
+        printf ("(%d %d) --> ", first[r] + s1shift, last[r] + s1shift);
+        if (nseq == 2){
+            printf ("(%d %d)%s", start[0] + s1shift, end[0] + s1shift, ", ");
+            printf ("(%d %d)%s", start[1] + s2shift, end[1] + s2shift, "\n");
+        }
+        else {
+            for (seq = 0; seq < nseq; seq++){
+                printf ("(%d %d)%s", start[0], end[0], (seq == nseq - 1) ? "\n" : ", ");
+            }
+        }
+    }
+
+    free (start);
+    free (end);
+}
+
+// Find the regions of the alignment whose sliding-window score reaches the
+// cutoff, print the summed region length (last - first per region) on one
+// line and then the regions themselves through print().
+//
+//   cutoff  percentage of the maximum window score, which is
+//           window * number of sequence pairs
+//   window  window width in columns; clipped to the alignment length
+void analyze (align *myal, int cutoff, int window){
+ int *first, *last, size = 1, len = 0, i, score, count = 0;
+ int runstart = -1, numpairs = myal->numseq * (myal->numseq - 1) / 2;
+
+ window = MIN (window, myal->algnlen);
+ // first[]/last[] hold the regions found so far: len are in use, size are allocated
+ first = (int *) malloc (size * sizeof (int)); assert (first);
+ last = (int *) malloc (size * sizeof (int)); assert (last);
+
+ // score of the window that starts at column 0
+ score = 0;
+ for (i = 0; i < window; i++)
+ score += getScore (myal, i);
+
+ // runstart is the first column of the run in progress, or -1 when none is open
+ if (score * 100 >= window * numpairs * cutoff) runstart = 0;
+ for (i = 1; i <= myal->algnlen - window; i++){
+ // slide by one column: add the column entering on the right, drop the one leaving on the left
+ score += getScore (myal, i + window - 1) - getScore (myal, i - 1);
+
+ if (score * 100 >= window * numpairs * cutoff){
+ if (runstart == -1){
+ // a new run that begins inside the previous region re-opens that region instead
+ if (len > 0 && last[len - 1] >= i)
+ runstart = first[--len];
+ else
+ runstart = i;
+ }
+ }
+ else if (runstart >= 0){
+ // the window fell below the cutoff: close the run at the last column of this window
+ first[len] = runstart;
+ last[len++] = i + window - 1;
+ runstart = -1;
+
+ // keep one free slot at all times, doubling the arrays when full
+ if (len == size){
+ size *= 2;
+
+ first = (int *) realloc (first, sizeof (int) * size); assert (first);
+ last = (int *) realloc (last, sizeof (int) * size); assert (last);
+ }
+ }
+ }
+
+ // a run still open at the end extends to the last column
+ if (runstart >= 0){
+ first[len] = runstart;
+ last[len++] = myal->algnlen - 1;
+ }
+
+ for (i = 0; i < len; i++){
+ count += last[i] - first[i];
+ }
+
+ printf ("%d\n", count);
+ print (myal, first, last, len);
+
+ free (first);
+ free (last);
+}
+
+// cstat multi_fasta_file cutoff window_size [-shift s1shift s2shift]
+//
+// Reads the alignment and prints its conserved regions.  With seven
+// arguments, arguments 5 and 6 set the global shifts that print() adds to
+// the reported positions.  Returns 0 on success and 2 when the alignment
+// file cannot be opened; exits with status 1 on a malformed command line.
+int main(int argc, char** argv) {
+    FILE *alignfile;
+    align* myal;
+    int window;
+
+    if (argc != 4 && argc != 7) {
+        fprintf(stderr, "usage:\ncstat multi_fasta_file cutoff window_size [-shift s1shift s2shift]\n");
+        exit(1);
+    }
+    // a negative window would make analyze() index before the score arrays
+    window = atoi (argv[3]);
+    if (window < 0) {
+        fprintf(stderr, "cstat: window_size must not be negative\n");
+        exit(1);
+    }
+    if (!(alignfile = fopen(argv[1],"r"))) {
+        fprintf(stderr, "couldnt open alignment file %s\n",argv[1]);
+        return 2;
+    }
+
+    if (argc == 7){
+        s1shift = atoi (argv[5]);
+        s2shift = atoi (argv[6]);
+    }
+
+    myal = readMultial(alignfile);
+    fclose(alignfile);
+    analyze (myal, atoi (argv[2]), window);
+
+    return 0;
+}
diff --git a/src/utils/dotplot.cpp b/src/utils/dotplot.cpp
new file mode 100644
index 0000000..70be83a
--- /dev/null
+++ b/src/utils/dotplot.cpp
@@ -0,0 +1,107 @@
+#include <stdlib.h>
+#include <stdio.h>
+
+// dotplot anchfile [name1 [name2]]
+//
+// Turns an anchor file, whose lines have the form "(b1 e1)=(b2 e2) score",
+// into a gnuplot script on stdout.  Anchors with b2 <= 0 are ignored.
+// Forward anchors (b2 < e2) are drawn with line style 1 and reverse anchors
+// (b2 > e2) with line style 2; every segment is lengthened by 20% of its
+// extent at both ends, separately along each axis.  The plot is finally
+// re-rendered into the PostScript file "sin.ps".
+//
+// The file is read three times: to find the axis limits, to draw the forward
+// anchors and to draw the reverse anchors.
+int main (int argc, char **argv){
+    FILE *file;
+    int s1b, s1e, s2b, s2e, maxa = 0, maxb = 0;
+    float score;
+    char buffer[105];
+    char* name1 = NULL;
+    char* name2 = NULL;
+    char dummy[] = "unknown";
+    int PAD, PAD2;
+
+    if (argc < 2){
+        fprintf (stderr, "Usage: dotplot anchfile [name1 [name2]] \n");
+        exit(1);
+    }
+
+    if (argc > 2) name1 = argv[2];
+    if (argc > 3) name2 = argv[3];
+    if (name1 == NULL) name1 = dummy;
+    if (name2 == NULL) name2 = dummy;
+
+    file = fopen (argv[1], "r");
+    if (file == NULL){
+        fprintf (stderr, "dotplot: Error opening file: %s\n", argv[1]);
+        exit (1);
+    }
+
+    // pass 1: largest coordinate on each axis
+    while (!feof (file)){
+        if (fscanf (file,
+                    "(%d %d)=(%d %d) %f", &s1b, &s1e, &s2b, &s2e, &score) == 5 &&
+            s2b > 0){
+            if (s1b > maxa) maxa = s1b;
+            if (s1e > maxa) maxa = s1e;
+            if (s2b > maxb) maxb = s2b;
+            if (s2e > maxb) maxb = s2e;
+        }
+        // discard whatever is left of the line
+        fgets (buffer, 105, file);
+    }
+
+    printf ("set nokey\n");
+    printf ("set xlabel \"%s\"\n", name1);
+    printf ("set ylabel \"%s\"\n", name2);
+    printf ("set title \"Dotplot: %s vs. %s\"\n", name1, name2);
+    printf ("set style line 1 linetype 3 linewidth 3\n");
+    printf ("set style line 2 linetype 1 linewidth 4\n");
+
+    // pass 2: forward anchors
+    rewind (file);
+    while (!feof (file)){
+        if (fscanf (file,
+                    "(%d %d)=(%d %d) %f", &s1b, &s1e, &s2b, &s2e, &score) == 5 && s2b > 0){
+            if (s2b < s2e){
+                // draw forward aligns
+                PAD = (s1e-s1b)* 2/10;
+                PAD2 = (s2e-s2b)* 2/10;
+                printf ("set arrow from %d,%d to %d,%d nohead ls 1\n",
+                        s1b-PAD, s2b-PAD2, s1e+PAD, s2e+PAD2);
+            }
+        }
+        fgets (buffer, 105, file);
+    }
+
+    // pass 3: reverse anchors
+    rewind (file);
+    while (!feof (file)){
+        if (fscanf (file,
+                    "(%d %d)=(%d %d) %f", &s1b, &s1e, &s2b, &s2e, &score) == 5 && s2b > 0){
+            if (s2b > s2e){
+                // draw rev aligns
+                // x is padded with PAD and y with PAD2, as for forward
+                // anchors, so that lengthening keeps the slope; the original
+                // used PAD2 on both axes and left PAD unused
+                PAD = (s1e-s1b)* 2/10;
+                PAD2 = (s2b-s2e)* 2/10;
+                printf ("set arrow from %d,%d to %d,%d nohead ls 2\n",
+                        s1b-PAD, s2b+PAD2,
+                        s1e+PAD, s2e-PAD2);
+            }
+        }
+        fgets (buffer, 105, file);
+    }
+
+    // leave a 10% margin beyond the largest coordinate on each axis
+    printf ("plot [1:%d][1:%d] -1\n", maxa * 11/10, maxb*11/10);
+    printf ("set terminal postscript enhanced color\n");
+    printf ("set output \"sin.ps\"\n");
+    printf ("replot\n");
+
+    fclose (file);
+    return 0;
+}
diff --git a/src/utils/draft.pl b/src/utils/draft.pl
new file mode 100755
index 0000000..4bda5cf
--- /dev/null
+++ b/src/utils/draft.pl
@@ -0,0 +1,267 @@
+#!/usr/bin/env perl
+use File::Basename;
+
+$lazyflag = 0;
+$lagandir = $ENV{LAGAN_DIR};
+$recurfl = "-recurse \"(12,0,30,0)x,(13,1,30,0)x,(3,0,30,0)xt,(8,1,30,0)x,(7,1,30,0)x,(7,1,15,0)x\"";
+$laganparams = "-maskedonly ";
+$anchgapstart = -5;
+$anchgapcont = -0.2;
+$usebounds = 1;
+
+$startingrate = 65;
+$rateinc = 1;
+$frlevel = "";
+$pid = "mergedir";
+
+if (@ARGV < 2) {
+ if ((@ARGV == 1) && ($ARGV[0] =~ /-version/)){
+ print STDERR "DRAFT version 0.1\n";
+ exit (0);
+ }
+ else {
+ print STDERR ("Usage:\n\ndraft.pl SEQFILE MFAFILE [-cons RATE] [-translate] [-version]\n");
+ exit (1);
+ }
+}
+
+$arglist = "";
+$skipfr = 0;
+for ($i = 2; $i < @ARGV; $i++) {
+ if ($ARGV[$i] =~ /-recurse/){
+ $recurfl = " -recurse \"".$ARGV[++$i]."\"";
+ }
+ elsif ($ARGV[$i] =~ /-skipfr/){
+ $skipfr = 1;
+ $pid = $ARGV[++$i];
+ chomp $pid;
+ }
+ elsif ($ARGV[$i] =~ /-translate/){
+ $recurfl = $recurfl." -translate";
+ }
+ elsif ($ARGV[$i] =~ /-cons/){
+ $startingrate = $ARGV[++$i];
+ chomp $startingrate;
+ }
+ elsif ($ARGV[$i] =~ /-lazy/){
+ $lazyflag = 1;
+ }
+ elsif ($ARGV[$i] =~ /-fastreject/){
+ $frarg = " -fastreject $frlevel";
+ }
+ else {
+ print STDERR "Bad arg to draft: $ARGV[$i]";
+ }
+}
+
+$arglist = "$arglist $recurfl -usebounds $laganparams $frarg";
+
+# create new directory
+$newdir = `pwd`;
+chomp $newdir;
+$newdir = "$newdir/$pid";
+`mkdir $newdir` if (!(-e $newdir));
+
+open (LOGFILE, ">$newdir/log");
+
+print STDERR ("\n");
+print STDERR ("Finding Contig Alignments\n");
+print STDERR ("-------------------------\n");
+
+print LOGFILE ("\n");
+print LOGFILE ("Finding Contig Alignments\n");
+print LOGFILE ("-------------------------\n");
+
+# Extract the contigs of the draft multi-FASTA file (second argument).
+# The file is copied into the working directory and split by mextract.pl;
+# every line mextract.pl prints is kept in @contigs as one contig file name.
+$contigfile = basename ($ARGV[1]);
+$contigdir = dirname ($ARGV[1]);
+
+`cp $ARGV[1] $newdir`;
+@contigs = `perl $lagandir/mextract.pl $newdir/$contigfile`;
+if ($?) { exit(1);}
+
+# Run the rc utility on every contig, writing "<contig>.rc" next to it.
+# Any failing command aborts the whole run with exit status 1.
+for ($i = 0; $i < @contigs; $i++){
+    chomp $contigs[$i];
+    `$lagandir/utils/rc < $contigs[$i] > $contigs[$i].rc`;
+    if ($?) { exit(1); }
+}
+
+# extract masked contigs
+$maskedname = $ARGV[1].".masked";
+
+if (-e $maskedname){
+ $maskedcontigfile = basename ($maskedname);
+ `cp $maskedname $newdir`;
+ @maskedcontigs = `perl $lagandir/mextract.pl $newdir/$maskedcontigfile -masked`;
+ if ($?) { exit(1);}
+ for ($i = 0; $i < @maskedcontigs; $i++){
+ chomp $maskedcontigs[$i];
+ `$lagandir/utils/rc < $maskedcontigs[$i] > $contigs[$i].rc.masked`;
+ if ($?) { exit(1); }
+ }
+}
+
+# create file storing name of contig stats
+open (LFILE, ">$newdir/filenames") if (!$lazyflag);
+$num = 0;
+
+for ($i = 0; $i < @contigs; $i++){
+ chomp $contigs[$i];
+ $skip1 = $skip2 = 0;
+ # make alignments
+ if (!$lazyflag || !(-e "$contigs[$i].mfa")){
+ $execute = "perl $lagandir/lagan.pl $ARGV[0] $contigs[$i] -mfa $arglist -out $contigs[$i].mfa";
+ $execute = $execute." -gap $anchgapstart $anchgapcont" if ($usebounds);
+ `$execute`;
+ $ex_val = $? >> 8;
+ if (!(-e "$contigs[$i].mfa")) { $skip1 = 1; }
+ elsif ($?) { exit(1);}
+
+ if (!$skip1 && $usebounds){
+ # compute bounds
+ @bounds = `$lagandir/utils/getbounds anchs.final $ARGV[0] $contigs[$i]`;
+ if ($?) { exit(1);}
+ $bounds[0] =~ /-s1 (\d+) (\d+) -s2 (\d+) (\d+)/;
+ $s1shift = $1 - 1;
+ $s2shift = $3 - 1;
+ }
+ `rm anchs.final`;
+ }
+
+ if (!$lazyflag || !(-e "$contigs[$i].rc.mfa")){
+ $execute = "perl $lagandir/lagan.pl $ARGV[0] $contigs[$i].rc -mfa $arglist -out $contigs[$i].rc.mfa";
+ $execute = $execute." -gap $anchgapstart $anchgapcont" if ($usebounds);
+ `$execute`;
+ $ex_val = $? >> 8;
+ if (!(-e "$contigs[$i].rc.mfa")) { $skip2 = 1; }
+ elsif ($?) { exit(1);}
+ if (!$skip2 && $usebounds){
+ # compute bounds
+ @bounds = `$lagandir/utils/getbounds anchs.final $ARGV[0] $contigs[$i].rc`;
+ if ($?) { exit(1);}
+ $bounds[0] =~ /-s1 (\d+) (\d+) -s2 (\d+) (\d+)/;
+ $s1rcshift = $1 - 1;
+ $s2rcshift = $3 - 1;
+ }
+ `rm anchs.final`;
+ }
+
+ if ($skip1) {
+ $fscore = 0;
+ }
+ else {
+ $fscore = `$lagandir/utils/scorealign $contigs[$i].mfa $startingrate`; chomp $fscore;
+ if ($?) { exit(1);}
+ }
+ if ($skip2) {
+ $bscore = 0;
+ }
+ else {
+ $bscore = `$lagandir/utils/scorealign $contigs[$i].rc.mfa $startingrate`; chomp $bscore;
+ if ($?) { exit(1);}
+ }
+ # pick strand
+
+# print LFILE "$s1shift $contigs[$i].mfa\n" if (!$lazyflag);
+# print LFILE "$s1rcshift $contigs[$i].rc.mfa\n" if (!$lazyflag);
+
+# if (0){
+ if ($fscore > 0 || $bscore > 0){
+ $j = $i + 1;
+ if ($fscore > $bscore){
+ print STDERR ("(+) direction preferred for Contig \"$contigs[$i]\": $fscore > $bscore\n");
+ print LOGFILE ("(+) direction preferred for Contig \"$contigs[$i]\": $fscore > $bscore\n");
+ print LFILE "$j $s1shift $s2shift $contigs[$i].mfa\n" if (!$lazyflag);
+ print STDERR "$j $s1shift $s2shift $contigs[$i].mfa\n" if (!$lazyflag);
+ }
+ elsif ($bscore > $fscore){
+ print STDERR ("(-) direction preferred for Contig \"$contigs[$i]\": $fscore < $bscore\n");
+ print LOGFILE ("(-) direction preferred for Contig \"$contigs[$i]\": $fscore < $bscore\n");
+ print LFILE "$j $s1rcshift $s2rcshift $contigs[$i].rc.mfa\n" if (!$lazyflag);
+ print STDERR "$j $s1rcshift $s2rcshift $contigs[$i].rc.mfa\n" if (!$lazyflag);
+ }
+ }
+# }
+ else {
+ print STDERR ("Contig \"$contigs[$i]\" could not be matched: $fscore, $bscore\n");
+ print LOGFILE ("Contig \"$contigs[$i]\" could not be matched: $fscore, $bscore\n");
+ }
+}
+close (LFILE);
+
+print STDERR ("\n");
+print STDERR ("Computing Contig Ordering\n");
+print STDERR ("-------------------------\n\n");
+
+print LOGFILE ("\n");
+print LOGFILE ("Computing Contig Ordering\n");
+print LOGFILE ("-------------------------\n\n");
+
+$foundorder = 0;
+
+for ($cutoff = $startingrate; !$foundorder && ($cutoff < 100); $cutoff += $rateinc){
+ `$lagandir/utils/scorecontigs /$newdir/filenames $ARGV[0] $newdir/contignames $cutoff > $newdir/ranges`;
+ if ($?) { exit(1);}
+ @list = `cat $newdir/ranges`;
+ $list[0] =~ /numContigs = (\d+)/;
+ next if ($1 == 0);
+
+ `$lagandir/utils/contigorder $newdir/ranges > $newdir/corder`;
+ if ($?) { exit(1);}
+ @list = `cat $newdir/corder`;
+ chomp $list[0];
+ $foundorder = 1 if ($list[0] ne "ordering failed");
+}
+
+if ($foundorder){
+ open (OFILE, ">$newdir/draft");
+ print OFILE ("Draft Ordering\n");
+ print OFILE ("--------------\n\n");
+
+ @contignames = `cat $newdir/contignames`;
+ for ($i = 0; $i < @contignames; $i++){
+ $contignames[$i] =~ /(\d+) (\d+) (\d+) (.*)/;
+ $num[$i] = $1; chomp $num[$i];
+ $s1shifts[$i] = $2; chomp $s1shifts[$i];
+ $s2shifts[$i] = $3; chomp $s2shifts[$i];
+ $filenames[$i] = $4; chomp $filenames[$i];
+ }
+
+ @list = `cat $newdir/corder`;
+ for ($i = 0; $i < @list; $i++){
+ $list[$i] =~ /(\d+) --\> \((\d+) (\d+)\) (.*)/;
+ $score = $4; chomp $score;
+ print OFILE ("$filenames[$1] --> ($2 $3) score=$score, offset=($s1shifts[$1] $s2shifts[$1]), index=$num[$1]\n");
+ }
+ close (OFILE);
+
+ print STDERR `cat $newdir/draft`;
+ print LOGFILE `cat $newdir/draft`;
+ close (LOGFILE);
+}
+else {
+ print STDERR "Could not compute ordering.";
+ print LOGFILE "Could not compute ordering.";
+ close (LOGFILE);
+ exit (0);
+}
+
+$filename1 = $ARGV[0];
+$filename2 = "$newdir/$contigfile";
+
+`$lagandir/cmerge2.pl $filename1 $filename2 $newdir/draft $filename2.merged -skipfr $pid`;
+if ($?) { exit(1); }
+
+print STDERR "EXECUTE $lagandir/cmerge2.pl $filename1 $filename2 $newdir/draft $filename2.merged -skipfr $pid\n";
+
+`cp $filename2.merged merged_seq.fa`;
+`cp $filename2.merged.masked merged_seq.fa.masked`;
+`cp $newdir/minfo minfo`;
+`cp $newdir/ranges ranges`;
+`cp $newdir/log log`;
+
+print STDERR ("\n");
+print STDERR ("Computing Final Alignment\n");
+print STDERR ("-------------------------\n\n");
+
+# `rm -rf $newdir`;
+
diff --git a/src/utils/fa2xfa.c b/src/utils/fa2xfa.c
new file mode 100644
index 0000000..8d3c3a1
--- /dev/null
+++ b/src/utils/fa2xfa.c
@@ -0,0 +1,122 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <assert.h>
+
+int begin, finish, seqIdx, seqExt, seqlen, numseqs, seqlen2, numseqs2;
+int rcflag = 0;
+char name[1024], name2[1024], **seqs, **seqs2;
+
+/* Return the letter to print for sequence letter `a`.
+ *
+ * With the global rcflag clear the letter is passed through untouched.
+ * With rcflag set, A<->T and C<->G are swapped and N stays N; the result
+ * is always upper case.  Any other letter is reported on stderr and ends
+ * the program with exit status 2.
+ */
+char comp(char a) {
+  char swapped = '\0';
+
+  if (!rcflag) return a;
+
+  if (a == 'A' || a == 'a') swapped = 'T';
+  else if (a == 'T' || a == 't') swapped = 'A';
+  else if (a == 'C' || a == 'c') swapped = 'G';
+  else if (a == 'G' || a == 'g') swapped = 'C';
+  else if (a == 'N' || a == 'n') swapped = 'N';
+
+  if (swapped != '\0') return swapped;
+
+  fprintf (stderr, "bad letter to RC %c\n",a);
+  exit(2);
+}
+
+/* Count the residues of the first record in the FASTA file `filename`.
+ *
+ * The first line (at most 1023 characters) is skipped as the header.
+ * Letters, '.' and '-' are counted until the next '>' or the end of the
+ * file.  Aborts through assert() when the file cannot be opened.
+ */
+int getLength (char *filename){
+  FILE *file;
+  char buffer[1024];
+  int ch;   /* int, so that EOF stays distinct from every data byte */
+  int length = 0;
+
+  file = fopen (filename, "r"); assert (file);
+  fgets (buffer, 1024, file);   /* skip the header line */
+
+  /* Stop on EOF itself rather than on feof(): a read error also returns
+     EOF but never sets the end-of-file flag, which made the old loop spin. */
+  while ((ch = fgetc (file)) != EOF){
+    if (ch == '>') break;
+    if (((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z')) || ch == '.' || ch == '-') length++;
+  }
+  fclose (file);
+
+  return length;
+}
+
+/* Load every record of the multi-FASTA file `filename`.
+ *
+ *   seqlen   receives the residue count of the first record (getLength)
+ *   numseqs  receives the number of records read
+ *   name     receives the first header line exactly as read by fgets
+ *   seqs     receives a malloc'ed array of numseqs rows; each row holds
+ *            *seqlen upper-cased characters and is NOT NUL-terminated
+ *
+ * Every record must contain exactly *seqlen letters / '.' / '-'; anything
+ * else trips an assert.  The caller frees the rows and the array.
+ */
+void readfile (char *filename, int *seqlen, int *numseqs, char *name, char ***seqs){
+  FILE *file;
+  char buffer[1024];
+  int ch, i;
+
+  *numseqs = 0;
+  *seqlen = getLength (filename);
+  strcpy (name, "");
+  *seqs = (char **) malloc (sizeof (char *) * 1); assert (*seqs);
+  /* one spare byte so that an empty record never asks for malloc(0) */
+  (*seqs)[0] = (char *) malloc (sizeof (char) * (*seqlen + 1)); assert ((*seqs)[0]);
+
+  file = fopen (filename, "r"); assert (file);
+  while (!feof (file)){
+    i = 0;
+    /* header line; stop on EOF or read error instead of using a stale buffer */
+    if (fgets (buffer, 1024, file) == NULL) break;
+    if (strlen (name) == 0) strcpy (name, buffer);
+    if (feof (file)) break;
+    (*numseqs)++;
+    if (*numseqs > 1){
+      *seqs = (char **) realloc (*seqs, sizeof (char *) * (*numseqs)); assert (*seqs);
+      (*seqs)[*numseqs - 1] = (char *) malloc (sizeof (char) * (*seqlen + 1)); assert ((*seqs)[*numseqs - 1]);
+    }
+
+    while ((ch = fgetc (file)) != EOF){
+      if (ch == '>') break;
+      ch = toupper(ch);
+      if (((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z')) || (ch == '.') || (ch == '-')){
+        /* a record longer than the first one must not run off the row */
+        assert (i < (*seqlen));
+        (*seqs)[*numseqs - 1][i] = (char) ch;
+        i++;
+      }
+    }
+    if (ch == '>') ungetc (ch, file);   /* leave the next header for fgets */
+    assert (i == *seqlen);
+  }
+  fclose (file);
+}
+
+/* Write columns begin..finish-1 (0-based) of sequence seqExt to stdout as
+ * one FASTA record, 60 letters per line.
+ *
+ * The header is ">seqIdx:begin+1-finish STRAND NAME", where STRAND is '-'
+ * when rcflag is set and '+' otherwise, and NAME is the global `name` with
+ * its first character replaced by a blank.  With rcflag set the columns are
+ * emitted in reverse order and passed through comp().
+ */
+void print (void){
+  int i, written = 0;
+
+  assert (seqExt >= 0 && seqExt < numseqs);
+  /* a non-empty range has to lie inside the sequence that was read */
+  assert (begin >= finish || (begin >= 0 && finish <= seqlen));
+  name[0] = ' ';
+
+  printf (">%d:%d-%d %c %s", seqIdx, begin+1, finish, (rcflag)?'-':'+', name);
+
+  for (i = begin; i < finish; i++) {
+    printf ("%c", comp(seqs[seqExt][(rcflag)?(finish+begin-i-1):i]));
+    written++;
+    if (written % 60 == 0) printf ("\n");
+  }
+  if (written % 60 != 0) printf ("\n");
+}
+
+/* fa2xfa fasta_file begin end seqid [-rc]
+ *
+ * Prints positions begin..end (1-based, inclusive) of the first sequence of
+ * fasta_file as a single record tagged with seqid; -rc selects the
+ * reverse-complement output of print()/comp().
+ */
+int main (int argc, char** argv){
+  int i;
+
+  if (argc != 5 && !(argc == 6 && strcmp (argv[5], "-rc") == 0)){
+    fprintf (stderr, "Usage:\n\nfa2xfa fasta_file begin end seqid [-rc]\n");
+    exit (1);
+  }
+
+  seqExt = 0;
+  begin = atoi (argv[2])-1;   /* stored 0-based */
+  finish = atoi (argv[3]);    /* no strdup needed: atoi does not modify its argument */
+  seqIdx = atoi (argv[4]);
+  if (argc == 6)
+    rcflag = 1;
+  seqlen2 = 0;
+
+  readfile (argv[1], &seqlen, &numseqs, name, &seqs);
+
+  print ();
+
+  for (i = 0; i < numseqs; i++) free (seqs[i]);
+  free (seqs);
+  return 0;
+}
+
+
+
+
+
diff --git a/src/utils/getbounds.c b/src/utils/getbounds.c
new file mode 100644
index 0000000..a281763
--- /dev/null
+++ b/src/utils/getbounds.c
@@ -0,0 +1,90 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <ctype.h>
+#include <string.h>
+
+#define EXPAND 2
+
+/* Larger of two ints.  `static` gives the inline function internal linkage,
+   so a definition exists even when the compiler does not inline the call. */
+static inline int max (int a, int b){ if (a > b) return a; return b; }
+/* Smaller of two ints.  `static` gives the inline function internal linkage,
+   so a definition exists even when the compiler does not inline the call. */
+static inline int min (int a, int b){ if (a < b) return a; return b; }
+
+/* Count the residues of the first record in the FASTA file `filename`.
+ *
+ * The first line (at most 1023 characters) is skipped as the header.
+ * Letters and '.' are counted until the next '>' or the end of the file.
+ * Aborts through assert() when the file cannot be opened.
+ */
+int getLength (char *filename){
+  FILE *file;
+  char buffer[1024];
+  int ch;   /* int: keeps EOF distinct and is a valid argument for isalpha() */
+  int length = 0;
+
+  file = fopen (filename, "r"); assert (file);
+  fgets (buffer, 1024, file);   /* skip the header line */
+
+  /* Stop on EOF itself rather than on feof(): a read error also returns
+     EOF but never sets the end-of-file flag, which made the old loop spin. */
+  while ((ch = fgetc (file)) != EOF){
+    if (ch == '>') break;
+    if (isalpha (ch) || ch == '.') length++;
+  }
+  fclose (file);
+
+  return length;
+}
+
+/* Read one "(s1b s1e)=(s2b s2e) score" anchor line from `file`.
+ * Returns 1 when a complete anchor was parsed, 0 otherwise.  If the failed
+ * parse consumed no input at all (the next call would fail in exactly the
+ * same way), the rest of the line is discarded so the caller's loop always
+ * makes progress. */
+static int readAnchor (FILE *file, int *s1b, int *s1e, int *s2b, int *s2e){
+  float score;
+  long before = ftell (file);
+  int ch;
+
+  if (fscanf (file, "(%d %d)=(%d %d) %f\n", s1b, s1e, s2b, s2e, &score) == 5)
+    return 1;
+  if (ftell (file) == before)
+    while ((ch = fgetc (file)) != EOF && ch != '\n');
+  return 0;
+}
+
+/* getbounds anchfile seqfile1 seqfile2
+ *
+ * Prints "-s1 B E -s2 1 len2".  B..E is the union, over all anchors, of the
+ * anchor's interval on sequence 1 widened to the left by EXPAND times the
+ * part of sequence 2 lying before the first anchor and to the right by
+ * EXPAND times the part lying after the last anchor, clamped to [1, len1].
+ * The -s2 range is always the whole of sequence 2.  Without any anchor the
+ * whole of sequence 1 is reported as well.
+ */
+int main (int argc, char **argv){
+  FILE *file;
+  int s1b, s1e, s2b, s2e, i;
+  int S1B, S1E, len1, len2;
+  int m2b, m2e;
+
+  if (argc != 4){
+    fprintf (stderr, "Usage:\n\ngetbounds anchfile seqfile1 seqfile2\n");
+    exit (1);
+  }
+
+  file = fopen (argv[1], "r"); assert (file);
+  len1 = getLength (argv[2]);
+  len2 = getLength (argv[3]);
+
+  /* pass 1: extent of the anchors on sequence 2 */
+  m2b = 1000000000;
+  m2e = -1000000000;
+  while (!feof (file)){
+    if (readAnchor (file, &s1b, &s1e, &s2b, &s2e)){
+      m2b = min (m2b, s2b);
+      m2e = max (m2e, s2e);
+    }
+  }
+  m2e = len2 - m2e;   /* now the length of the tail after the last anchor */
+  rewind (file);
+
+  /* pass 2: widen every anchor on sequence 1 and take the union */
+  i = 0;
+  while (!feof (file)){
+    if (readAnchor (file, &s1b, &s1e, &s2b, &s2e)){
+      if (i == 0){
+        S1B = max (s1b - m2b * EXPAND, 1);
+        S1E = min (s1e + m2e * EXPAND, len1);
+        i = 1;
+      }
+      else {
+        S1B = min (S1B, max (s1b - m2b * EXPAND, 1));
+        S1E = max (S1E, min (s1e + m2e * EXPAND, len1));
+      }
+    }
+  }
+  if (i == 0){
+    S1B = 1;
+    S1E = len1;
+  }
+
+  printf ("-s1 %d %d -s2 %d %d\n", S1B, S1E, 1, len2);
+
+  fclose (file);
+  return 0;
+}
+
diff --git a/src/utils/getcontigpos.c b/src/utils/getcontigpos.c
new file mode 100644
index 0000000..713314b
--- /dev/null
+++ b/src/utils/getcontigpos.c
@@ -0,0 +1,99 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <assert.h>
+
+int begin, finish, seqIdx, seqExt, seqlen, numseqs, seqlen2, numseqs2;
+char name[1024], name2[1024], **seqs, **seqs2;
+
+/* Count the columns of the first record in the FASTA file `filename`.
+ *
+ * The first line (at most 1023 characters) is skipped as the header.
+ * Letters, '.' and '-' are counted until the next '>' or the end of the
+ * file.  Aborts through assert() when the file cannot be opened.
+ */
+int getLength (char *filename){
+  FILE *file;
+  char buffer[1024];
+  int ch;   /* int: keeps EOF distinct and is a valid argument for isalpha() */
+  int length = 0;
+
+  file = fopen (filename, "r"); assert (file);
+  fgets (buffer, 1024, file);   /* skip the header line */
+
+  /* Stop on EOF itself rather than on feof(): a read error also returns
+     EOF but never sets the end-of-file flag, which made the old loop spin. */
+  while ((ch = fgetc (file)) != EOF){
+    if (ch == '>') break;
+    if (isalpha (ch) || ch == '.' || ch == '-') length++;
+  }
+  fclose (file);
+
+  return length;
+}
+
+/* Load every record of the multi-FASTA file `filename`.
+ *
+ *   seqlen   receives the column count of the first record (getLength)
+ *   numseqs  receives the number of records read
+ *   name     receives the first header line exactly as read by fgets
+ *   seqs     receives a malloc'ed array of numseqs rows; each row holds
+ *            *seqlen characters and is NOT NUL-terminated
+ *
+ * Every record must contain exactly *seqlen letters / '.' / '-'; anything
+ * else trips an assert.  The caller frees the rows and the array.
+ */
+void readfile (char *filename, int *seqlen, int *numseqs, char *name, char ***seqs){
+  FILE *file;
+  char buffer[1024];
+  int ch, i;
+
+  *numseqs = 0;
+  *seqlen = getLength (filename);
+  strcpy (name, "");
+  *seqs = (char **) malloc (sizeof (char *) * 1); assert (*seqs);
+  /* one spare byte so that an empty record never asks for malloc(0) */
+  (*seqs)[0] = (char *) malloc (sizeof (char) * (*seqlen + 1)); assert ((*seqs)[0]);
+
+  file = fopen (filename, "r"); assert (file);
+  while (!feof (file)){
+    i = 0;
+    /* header line; stop on EOF or read error instead of using a stale buffer */
+    if (fgets (buffer, 1024, file) == NULL) break;
+    if (strlen (name) == 0) strcpy (name, buffer);
+    if (feof (file)) break;
+    (*numseqs)++;
+    if (*numseqs > 1){
+      *seqs = (char **) realloc (*seqs, sizeof (char *) * (*numseqs)); assert (*seqs);
+      (*seqs)[*numseqs - 1] = (char *) malloc (sizeof (char) * (*seqlen + 1)); assert ((*seqs)[*numseqs - 1]);
+    }
+
+    /* ch is an int in 0..255 here, which is what isalpha() requires */
+    while ((ch = fgetc (file)) != EOF){
+      if (ch == '>') break;
+      if (isalpha (ch) || ch == '.' || ch == '-'){
+        assert (i < (*seqlen));
+        (*seqs)[*numseqs - 1][i] = (char) ch;
+        i++;
+      }
+    }
+    if (ch == '>') ungetc (ch, file);   /* leave the next header for fgets */
+    assert (i == *seqlen);
+  }
+  fclose (file);
+}
+
+/* Walk the first two sequences column by column, counting letters in each
+ * (pos for sequence 0, pos2 for sequence 1).  When the count for sequence 0
+ * reaches the global `finish`, print the count reached in sequence 1.
+ * Nothing is printed when sequence 0 holds fewer than `finish` letters.
+ */
+void print (void){
+  int i = 0, pos = 0, pos2 = 0;
+
+  /* both rows are read below, so the file must hold two sequences */
+  assert (numseqs >= 2);
+
+  while (pos <= finish && i < seqlen){
+    if (isalpha ((unsigned char) seqs[0][i])) pos++;
+    if (isalpha ((unsigned char) seqs[1][i])) pos2++;
+    if (pos == finish){
+      printf ("%d\n", pos2);
+      break;
+    }
+    i++;
+  }
+}
+
+/* getcontigpos multi_fasta_file finished_index
+ *
+ * Reads the alignment and prints, via print(), the letter count reached in
+ * the second sequence at the column where the first sequence reaches its
+ * finished_index-th letter.
+ */
+int main (int argc, char** argv){
+  int i;
+
+  /* both arguments are used below; argc is never 0 in practice */
+  if (argc != 3){
+    fprintf (stderr, "Usage:\n\ngetcontigpos multi_fasta_file finished_index\n");
+    exit (1);
+  }
+
+  finish = atoi (argv[2]);   /* no strdup needed: atoi does not modify its argument */
+
+  readfile (argv[1], &seqlen, &numseqs, name, &seqs);
+  print ();
+
+  for (i = 0; i < numseqs; i++) free (seqs[i]);
+  free (seqs);
+  return 0;
+}
+
+
+
+
+
diff --git a/src/utils/getlength.c b/src/utils/getlength.c
new file mode 100644
index 0000000..7b65032
--- /dev/null
+++ b/src/utils/getlength.c
@@ -0,0 +1,47 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <string.h>
+#include <assert.h>
+
+#define BUF_SIZE 1024
+
+/* getlength seqfile
+ *
+ * Prints the number of letters, '.' and '-' characters found between the
+ * first line of seqfile (skipped as the header) and the next '>' or the
+ * end of the file.
+ */
+int main (int argc, char **argv){
+  FILE *file;
+  char buffer[BUF_SIZE], ch;
+  int length = 0, done = 0;
+  size_t i, nread;
+
+  if (argc != 2){
+    fprintf (stderr, "Usage:\n\ngetlength seqfile\n");
+    exit (1);
+  }
+
+  file = fopen (argv[1], "r"); assert (file);
+  fgets (buffer, BUF_SIZE, file);   /* skip the header line */
+
+  /* Loop on what fread() actually delivers: a read error returns 0 without
+     ever setting the end-of-file flag, which made a feof() loop spin. */
+  while (!done && (nread = fread (buffer, 1, BUF_SIZE, file)) > 0){
+    for (i = 0; i < nread; i++){
+      ch = buffer[i];
+      if (ch == '>'){
+        done = 1;
+        break;
+      }
+      if (((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z')) || ch == '.' || ch == '-')
+        length++;
+    }
+  }
+  fclose (file);
+
+  printf ("%d\n", length);
+  return 0;
+}
+
+
+
+
+
+
+
+
+
+
diff --git a/src/utils/getoverlap.c b/src/utils/getoverlap.c
new file mode 100644
index 0000000..4fa6274
--- /dev/null
+++ b/src/utils/getoverlap.c
@@ -0,0 +1,33 @@
+#include <stdio.h>
+#include <assert.h>
+
+#define INTMAX (100000000)
+#define INTMIN (-INTMAX)
+
+/* Larger of a and b (b when they are equal). */
+int max (int a, int b){ return (a > b) ? a : b; }
+/* Smaller of a and b (b when they are equal). */
+int min (int a, int b){ return (a < b) ? a : b; }
+
+/* getoverlap anchfile
+ *
+ * Reads "(a b)=(c d) score" lines and prints the smallest a, largest b,
+ * smallest c and largest d seen, or "-1 -1 -1 -1" when no line parsed.
+ */
+int main (int argc, char **argv){
+  FILE *file;
+  int seq1begin = INTMAX, seq1end = INTMIN, seq2begin = INTMAX, seq2end = INTMIN;
+  int a, b, c, d, e = 0;
+  int ch;
+  long before;
+
+  if (argc != 2){
+    fprintf (stderr, "Usage:\n\ngetoverlap anchfile\n");
+    return 1;
+  }
+
+  file = fopen (argv[1], "r"); assert (file);
+
+  while (!feof (file)){
+    before = ftell (file);
+    if (fscanf (file, "(%d %d)=(%d %d) %*f\n", &a, &b, &c, &d) == 4){
+      seq1begin = min (seq1begin, a);
+      seq1end = max (seq1end, b);
+      seq2begin = min (seq2begin, c);
+      seq2end = max (seq2end, d);
+      e++;
+    }
+    else if (ftell (file) == before){
+      /* nothing was consumed, so retrying would fail identically forever:
+         drop the rest of this line to guarantee progress */
+      while ((ch = fgetc (file)) != EOF && ch != '\n');
+    }
+  }
+
+  fclose (file);
+
+  if (!e)
+    printf ("-1 -1 -1 -1\n");
+  else
+    printf ("%d %d %d %d\n", seq1begin, seq1end, seq2begin, seq2end);
+  return 0;
+}
diff --git a/src/utils/mextract.pl b/src/utils/mextract.pl
new file mode 100755
index 0000000..b609109
--- /dev/null
+++ b/src/utils/mextract.pl
@@ -0,0 +1,88 @@
+#!/usr/bin/env perl
+
+# mextract.pl -- split a Multi-FASTA file into one FASTA file per sequence.
+#
+# usage: mextract.pl filename [-masked]
+#
+# Each sequence goes to "<prefix>_<name>.fa" ("...fa.masked" with -masked).
+# <prefix> is the input name cut at its last "." (cut once more for masked
+# input); <name> is the header text up to the first space, minus one
+# trailing comma.  The name of every file created is printed on stdout, one
+# per line.  Sequence lines are written back to back without line breaks.
+
+use strict;
+use warnings;
+
+# Derive the sequence name from a header line (leading ">" included).
+sub contig_name {
+    my ($header) = @_;
+    my $name = substr($header, 1);
+    $name =~ s/ .*//s;    # keep only the text before the first space
+    $name =~ s/,\z//;     # drop a single trailing comma
+    return $name;
+}
+
+# Announce and create the output file of one sequence, write its header
+# line and return the open handle.
+sub start_contig {
+    my ($prefix, $name, $suffix) = @_;
+    my $fname = "$prefix\_$name.$suffix";
+    print "$fname\n";
+    open(my $out, '>', $fname) or die "Could not create $fname.\n\n";
+    print {$out} ">$name\n";
+    return $out;
+}
+
+if (@ARGV < 1) {
+    print "usage:\n mextract.pl filename [-masked]\n";
+    exit(1);
+}
+
+my $filename = $ARGV[0];
+my $masked = (@ARGV == 2 && $ARGV[1] eq "-masked") ? 1 : 0;
+
+open(my $fasta, '<', $filename) or die "Could not open $filename.\n\n";
+
+my $prefix = substr($filename, 0, rindex($filename, "."));
+if ($masked || index($filename, ".masked") != -1) {
+    $prefix = substr($filename, 0, rindex($prefix, "."));
+}
+my $suffix = $masked ? "fa.masked" : "fa";
+
+# Skip anything before the first header.  Reaching the end of the file
+# first means the input holds no FASTA record at all.
+my $line = <$fasta>;
+while (defined $line && substr($line, 0, 1) ne ">") {
+    $line = <$fasta>;
+}
+if (!defined $line) {
+    print "$filename is NOT a Multi-FASTA file...\n";
+    exit(1);
+}
+chomp $line;
+
+my $out = start_contig($prefix, contig_name($line), $suffix);
+
+while ($line = <$fasta>) {
+    chomp $line;
+    if (substr($line, 0, 1) eq ">") {
+        close $out or die "Could not finish writing: $!\n\n";
+        $out = start_contig($prefix, contig_name($line), $suffix);
+    } else {
+        print {$out} $line;
+    }
+}
+
+close $out or die "Could not finish writing: $!\n\n";
diff --git a/src/utils/mf2bin.pl b/src/utils/mf2bin.pl
new file mode 100755
index 0000000..6e5105c
--- /dev/null
+++ b/src/utils/mf2bin.pl
@@ -0,0 +1,93 @@
+#!/usr/bin/env perl
+
+# defaults
+# constants
+
+# usage notes
+
+if (@ARGV < 1) {
+ print ("usage:\n mf2bin.pl inputfile [-out outputfile] \n");
+ exit(1);
+}
+
+# parse parameters
+
+$tofile = 0;
+for ($i=1; $i<@ARGV; $i++) {
+ if ($ARGV[$i] eq "-out") {
+ $tofile = 1;
+ $outfilename = $ARGV[++$i];
+ }
+}
+
+if ($tofile) {
+ open(OUTFILE, ">$outfilename");
+}
+
+# read in Multi-FASTA file
+
+$infilename = $ARGV[0];
+open(FASTAFILE, "$infilename") || die "Could not open $infilename.\n\n";
+$line = <FASTAFILE>;
+chomp $line;
+
+# @keys holds the sequence names in file order, %list maps a name to its
+# index, and @seqs collects the sequence lines per index.
+$i=0;
+%list=();
+@seqs=(());
+
+# The very first line has to be a header; the sequence name is the first
+# run of word characters after the ">".
+if (substr($line, 0, 1) eq ">") {
+    $_ = substr($line, 1);
+    /\w+/g;
+    $keys[$i] = $&;
+    $list{$keys[$i]}=$i;
+} else {
+    print ("$infilename is NOT a Multi-FASTA file...\n");
+    exit(1);
+}
+
+while ($line = <FASTAFILE>) {
+ chomp $line;
+ if (substr($line, 0, 1) eq ">") {
+ $i++;
+ $_ = substr($line, 1);
+ /\w+/g;
+ @keys[$i] = $&;
+ $list{@keys[$i]}=$i;
+ push @seqs, ();
+ } else {
+ push @{$seqs[$i]}, "$line";
+ }
+}
+
+$i=0;
+for $row (@seqs) {
+ @strs[$i++] = join "", @$row;
+}
+
+if (@keys != 2) {
+ print ("mpack needs two FASTA sequences\n");
+ exit(1);
+}
+
+
+# pack bin
+# format from Alex Poliakov's glass2bin.pl script
+
+%base_code = ('-' => 0, 'A' => 1, 'C' => 2, 'T' => 3, 'G' => 4, 'N' => 5,
+ 'a' => 1, 'c' => 2, 't' => 3, 'g' => 4, 'n' => 5);
+$l = length @strs[0]; # $l--;
+$s1 = reverse(@strs[0]);
+$s2 = reverse(@strs[1]);
+
+
+for ($i=0; $i<$l; $i++) {
+ if ($tofile) {
+ print OUTFILE pack("H2",
+ $base_code{chop($s1)} . $base_code{chop($s2)});
+ } else {
+ print pack("H2",
+ $base_code{chop($s1)} . $base_code{chop($s2)});
+ }
+}
+
+
diff --git a/src/utils/mpretty.pl b/src/utils/mpretty.pl
new file mode 100755
index 0000000..a090f03
--- /dev/null
+++ b/src/utils/mpretty.pl
@@ -0,0 +1,263 @@
+#!/usr/bin/env perl
+
+# defaults
+
+$linelen = 50;
+$interval = 10;
+$labellen = 5;
+$uselabels = 1;
+$useintervals = 1;
+$usecounts = 1;
+$usebase = 0;
+$liststart = 1;
+$listend = 0;
+$usestart = 0;
+$useend = 0;
+
+# constants
+
+$minlinelen = 10;
+$mininterval = 10;
+$minlabellen = 3;
+
+
+# usage notes
+
+# Without a file name, print the option summary (with the configured
+# minimum and default values) and stop.
+if (@ARGV < 1) {
+    print ("usage:\n mpretty.pl filename\n");
+    print ("options:\n");
+    print (" -linelen value\n");
+    print (" (min: $minlinelen, default: $linelen)\n");
+    print (" -interval value\n");
+    print (" (min: $mininterval, default: $interval, none: 0)\n");
+    print (" -labellen value\n");
+    # the enforced minimum is $minlabellen, not the default $labellen
+    print (" (min: $minlabellen, default: $labellen, none: 0)\n");
+    print (" -base sequence_name\n");
+    print (" (if used, must specify a sequence on which to base counting\n");
+    print (" -start value\n");
+    print (" (if used, must specify a start coordinate (>=1)\n");
+    print (" -end value\n");
+    print (" (if used, must specify an end coordinate (>=start)\n");
+    print (" -nocounts\n");
+    exit(1);
+}
+
+
+# parse parameters
+
+for ($i=1; $i<@ARGV; $i++) {
+ if ($ARGV[$i] eq "-nocounts") {
+ $usecounts = 0;
+ }
+ if ($ARGV[$i] eq "-linelen") {
+ $linelen = $ARGV[++$i];
+ if ($linelen < $minlinelen) {
+ $linelen = $minlinelen;
+ }
+ }
+ if ($ARGV[$i] eq "-interval") {
+ $interval = $ARGV[++$i];
+ if ($interval <= 0) {
+ $useintervals = 0;
+ }
+ if ($interval < $mininterval) {
+ $interval = $mininterval;
+ }
+ }
+ if ($ARGV[$i] eq "-labellen") {
+ $labellen = $ARGV[++$i];
+ if ($labellen <= 0) {
+ $uselabels = 0;
+ }
+ if ($labellen < $minlabellen) {
+ $labellen = $minlabellen;
+ }
+ }
+ if ($ARGV[$i] eq "-base") {
+ $baseseq = $ARGV[++$i];
+ $usebase = 1;
+ }
+ if ($ARGV[$i] eq "-start") {
+ $usestart = 1;
+ $liststart = $ARGV[++$i];
+ }
+ if ($ARGV[$i] eq "-end") {
+ $useend = 1;
+ $listend = $ARGV[++$i];
+ }
+}
+
+# preprocessing for labels
+
+if ($uselabels) {
+ $labtail = "";
+ for ($i=0; $i<$labellen; $i++) {
+ $labtail="$labtail ";
+ }
+}
+
+if (($usestart && ($liststart<1)) || ($useend && ($listend<$liststart))) {
+ die "Invalid range specified: [$liststart, $listend].\n\n";
+}
+
+# read in Multi-FASTA file
+
+$filename = $ARGV[0];
+open(FASTAFILE, "$filename") || die "Could not open $filename.\n\n";
+
+# Skip anything before the first header line.  Reaching the end of the
+# file first means there is no FASTA record at all; without the defined()
+# test the loop would never terminate on such a file.
+$line = <FASTAFILE>;
+while (defined($line) && substr($line, 0, 1) ne ">") {
+    $line = <FASTAFILE>;
+}
+if (!defined($line)) {
+    print ("$filename is NOT a Multi-FASTA file...\n");
+    exit(1);
+}
+chomp $line;
+
+# @keys: names in file order, %list: name -> index, @count: running letter
+# count per sequence, @label: name padded/cut to $labellen characters.
+$i=0;
+%list=();
+@seqs=(());
+
+# $line is a header here; the name is its first run of word characters.
+$_ = substr($line, 1);
+/\w+/g;
+$keys[$i] = $&;
+$count[$i]=0;
+$label[$i] = substr("$keys[$i]$labtail", 0, $labellen);
+$list{$keys[$i]}=$i;
+
+while ($line = <FASTAFILE>) {
+ chomp $line;
+ if (substr($line, 0, 1) eq ">") {
+ $i++;
+ $_ = substr($line, 1);
+ /\w+/g;
+ @keys[$i] = $&;
+ @count[$i]=0;
+ @label[$i] = substr("@keys[$i]$labtail", 0, $labellen);
+ $list{@keys[$i]}=$i;
+ push @seqs, ();
+ } else {
+ push @{$seqs[$i]}, "$line";
+ }
+}
+
+$i=0;
+$maxlen = 0;
+for $row (@seqs) {
+ @strs[$i++] = join "", @$row;
+ $templen = length @strs[$i-1];
+ if ($templen > $maxlen) {
+ $maxlen = $templen;
+ }
+}
+
+$foundseq=0;
+if ($usebase) {
+ foreach $s (@keys) {
+ $foundseq = ($s eq $baseseq) || $foundseq;
+ }
+if (!$foundseq) { die "Could not find Base Sequence: <$baseseq>\n\n"; }
+}
+
+# preprocessing for counts
+
+if ($usecounts) {
+ foreach $s (@keys) {
+ $_ = @strs[$list{$s}];
+ $ls = tr/ATCGNatcgn/ATGCNatcgn/;
+ @tot[$list{$s}] = $ls;
+ }
+}
+
+# length of sequence display: without -end, or with an end past the
+# alignment, show everything up to the longest sequence
+$l=$maxlen;
+if ((!$listend) || ($listend>$maxlen)) {
+    $listend = $maxlen;
+}
+
+# end the message with newlines like the other die calls of this script
+# (it used to end in two backspace characters)
+if ($maxlen < $liststart) { die "Starting out of bounds...\n\n"; }
+
+
+if ($usebase) {
+
+# find base sequence position
+
+ $i=0;
+ $j=0;
+ while ($j<$liststart) {
+ if (substr(@strs[$list{$baseseq}], $i, 1) ne "-") {
+ $j++;
+ }
+ $i++;
+ }
+ $liststart = $i;
+ while ($j<$listend) {
+ if (substr(@strs[$list{$baseseq}], $i, 1) ne "-") {
+ $j++;
+ }
+ $i++;
+ }
+ $listend = $i;
+}
+
+# pretty print
+
+if ($usecounts) {
+ foreach $s (@keys) {
+ $_ = substr(@strs[$list{$s}], 0, $liststart-1);
+ $lc = tr/ATCGN/ATGCN/;
+ @count[$list{$s}]+=$lc;
+ }
+}
+
+for ($i=$liststart-1; $i<$listend; $i+=$linelen) {
+ if ($listend-$i<$linelen) { $linelen = $listend-$i;}
+ foreach $s (@keys) {
+ if ($uselabels) {
+ print "@label[$list{$s}] : ";
+ }
+ $p = substr(@strs[$list{$s}], $i, $linelen);
+ print "$p";
+
+ if ($usecounts) {
+ $_ = $p;
+ $lc = tr/ATCGN/ATGCN/;
+ @count[$list{$s}]+=$lc;
+ print " @ @count[$list{$s}]/@tot[$list{$s}]";
+ }
+
+ print "\n";
+ }
+
+ if ($useintervals) {
+ if ($uselabels) {
+ print "$labtail = ";
+ }
+ for ($j=$i+1; $j<=$i+$linelen && $j<=$l; $j+=$interval) {
+ $ct = "$j";
+ print $ct;
+ for ($k=0; $k<($interval-(length $ct)); $k++) {
+ print " ";
+ }
+ }
+ print "\n";
+ }
+ print "\n";
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/utils/mproject.pl b/src/utils/mproject.pl
new file mode 100755
index 0000000..1fef41e
--- /dev/null
+++ b/src/utils/mproject.pl
@@ -0,0 +1,90 @@
+#!/usr/bin/env perl
+
+if (@ARGV < 2) {
+ print ("usage:\n mproject.pl filename seqname1 [seqname2 ... ]\n");
+ exit(1);
+}
+
+$filename = $ARGV[0];
+
+$i = 1;
+while ($i < @ARGV) {
+ @targets[$i-1] = $ARGV[$i];
+ $i++;
+}
+
+open(FASTAFILE, "$filename") || die "Could not open $filename.\n\n";
+
+$line = <FASTAFILE>;
+chomp $line;
+
+# @keys holds the sequence names in file order, %list maps a name to its
+# index, and @seqs collects the sequence lines per index.
+$i=0;
+%list=();
+@seqs=(());
+
+# The very first line has to be a header; the sequence name is the first
+# run of word characters after the ">".
+if (substr($line, 0, 1) eq ">") {
+    $_ = substr($line, 1);
+    /\w+/g;
+    $keys[$i] = $&;
+    $list{$keys[$i]}=$i;
+} else {
+    print ("$filename is NOT a Multi-FASTA file...\n");
+    exit(1);
+}
+
+while ($line = <FASTAFILE>) {
+ chomp $line;
+ if (substr($line, 0, 1) eq ">") {
+ $i++;
+ $_ = substr($line,1);
+ /\w+/g;
+ @keys[$i] = $&;
+ $list{@keys[$i]}=$i;
+ push @seqs, ();
+ } else {
+ push @{$seqs[$i]}, "$line";
+ }
+}
+
+$i=0;
+for $row (@seqs) {
+ @strs[$i++] = join "", @$row;
+}
+
+$seqlen = length $strs[0];
+# $seqlen--;
+
+# Mark the columns to drop: $isgap[$i] stays 1 only when every target
+# sequence has "-" in column $i.
+for ($i=0; $i<$seqlen; $i++) {
+    $isgap[$i] = 1;
+    foreach $s (@targets) {
+        if (substr($strs[$list{$s}], $i, 1) ne "-") {
+            $isgap[$i] = 0;
+            last;    # one non-gap letter settles the column
+        }
+    }
+}
+
+foreach $s (@targets) {
+ print ">@keys[$list{$s}]\n";
+ $j=0;
+ for ($i=0; $i<$seqlen; $i++) {
+ if(!@isgap[$i]) {
+ print substr(@strs[$list{$s}], $i, 1);
+ $j++;
+ if (($j % 60) == 0) {
+ print "\n";
+ }
+ }
+ }
+ print "\n";
+}
+
+
+
+
+
+
+
+
+
+
diff --git a/src/utils/mrun.pl b/src/utils/mrun.pl
new file mode 100755
index 0000000..ca34ce1
--- /dev/null
+++ b/src/utils/mrun.pl
@@ -0,0 +1,267 @@
+#!/usr/bin/env perl
+
+# This script requires the environment variables:
+# LAGAN_DIR and VISTA_DIR
+
+# VISTA .plotfile defaults
+
+($lagandir = $ENV{LAGAN_DIR}) or die "LAGAN_DIR not set";
+
+$paregmin = 75;
+$paregmax = 100;
+$pamin = 50;
+
+$pbases = 10000;
+$ptickdist = 2000;
+$presolution = 25;
+$pwindow = 40;
+$pnumwindows = 4;
+
+
+if (@ARGV < 1) {
+ print ("usage:\n mrun.pl filename -tree \"(tree...)\"\n");
+ print ("options: [base sequence name [sequence pairs]]\n");
+ print ("default: [base sequence name = first sequence]\n");
+ print ("other MLAGAN parameters...\n");
+ print ("other VISTA parameters...\n");
+ exit(1);
+}
+
+$filename = $ARGV[0];
+
+$i = 1;
+$j = 0;
+$k = 0;
+$l = 0;
+$treespec = 0;
+while ($i < @ARGV) {
+ if ($ARGV[$i] eq "-tree") {
+ @params[$j] = "-tree";
+ @params[++$j] = "\"$ARGV[++$i]\"";
+ $_ = @params[$j];
+ $topen = tr/"\("/"\("/;
+ $tclose = tr/"\)"/"\)"/;
+ $treespec = ($topen == $tclose);
+ } else {
+ if (substr($ARGV[$i],0,1) eq "-") {
+ if (substr($ARGV[$i],0,2) eq "--") {
+ @vparams[$l++] = $ARGV[$i++];
+ @vparams[$l++] = $ARGV[$i];
+ } else {
+ $j++;
+ @params[$j] = $ARGV[$i];
+ if ((@params[$j] eq "-gapstart") ||
+ (@params[$j] eq "-gapend") ||
+ (@params[$j] eq "-gapcont") ||
+ (@params[$j] eq "-gapperseq") ||
+ (@params[$j] eq "-match") ||
+ (@params[$j] eq "-mismatch") ||
+ (@params[$j] eq "-overlap") ||
+ (@params[$j] eq "-translate") ||
+ (@params[$j] eq "-gfc") ||
+ (@params[$j] eq "-ext") ||
+ (@params[$j] eq "-glwidth")) {
+ @params[++$j] = $ARGV[++$i];
+ }
+ }
+ } else {
+ @targets[$k++] = $ARGV[$i];
+ }
+ }
+ $i++;
+}
+
+for ($i=0; $i<@vparams; $i+=2) {
+ if (@vparams[$i] eq "--regmin") { $paregmin = @vparams[$i+1]; }
+ elsif (@vparams[$i] eq "--regmax") { $paregmax = @vparams[$i+1]; }
+ elsif (@vparams[$i] eq "--min") { $pamin = @vparams[$i+1]; }
+ elsif (@vparams[$i] eq "--bases") { $pbases = @vparams[$i+1]; }
+ elsif (@vparams[$i] eq "--tickdist") { $ptickdist = @vparams[$i+1]; }
+ elsif (@vparams[$i] eq "--resolution") { $presolution = @vparams[$i+1]; }
+ elsif (@vparams[$i] eq "--window") { $pwindow = @vparams[$i+1]; }
+ elsif (@vparams[$i] eq "--numwindows") { $pnumwindows = @vparams[$i+1]; }
+}
+
+if (!$treespec) {
+ print ("Must specify valid phylogenetic tree...\n");
+ exit(1);
+}
+
+if ($lagandir eq "") {
+ print ("Must specify environment variable LAGAN_DIR\n");
+ exit(1);
+}
+
+$mextstr = "$lagandir/utils/mextract.pl $filename";
+print "$mextstr\n";
+if(!`$mextstr`) { print "\nMulti-FASTA extraction failure...\n"; exit(1); }
+
+if (-e "$filename.masked") {
+ $mextstr = "$lagandir/utils/mextract.pl $filename.masked -masked";
+ print "$mextstr\n";
+ if(!`$mextstr`) {
+ print "\nMasked Multi-FASTA extraction failure...\n";
+ exit(1);
+ }
+}
+open(FASTAFILE, "$filename") || die "Could not open $filename.\n\n";
+
+# Skip anything before the first header line.  Reaching the end of the
+# file first means there is no FASTA record at all; without the defined()
+# test the loop would never terminate on such a file.
+$line = <FASTAFILE>;
+while (defined($line) && substr($line, 0, 1) ne ">") {
+    $line = <FASTAFILE>;
+}
+if (!defined($line)) {
+    print ("$filename is NOT a Multi-FASTA file...\n");
+    exit(1);
+}
+chomp $line;
+
+$i=0;
+%list=();
+
+# $line is a header here; the name is its first run of word characters.
+# Without sequence names on the command line, the first sequence of the
+# file becomes the base sequence.
+$_ = substr($line, 1);
+/\w+/g;
+$keys[$i] = $&;
+$list{$keys[$i]}=$i;
+if (@targets == 0) {
+    $targets[0] = $keys[$i];
+    print "Setting Base Sequence: $targets[0]\n";
+}
+
+while ($line = <FASTAFILE>) {
+ chomp $line;
+ if (substr($line, 0, 1) eq ">") {
+ $i++;
+ $_ = substr($line, 1);
+ /\w+/g;
+ @keys[$i] = $&;
+ $list{@keys[$i]}=$i;
+ }
+}
+
+$prefix = substr $filename, 0, (rindex $filename, ".");
+$prefix = "$prefix\_";
+
+foreach $s (@keys) {
+ @fnames[$list{$s}] = "$prefix$keys[$list{$s}].fa";
+}
+
+if ((@targets > 1)) {
+ if (@targets %2 != 1) {
+ $c = @targets;
+ print ("$c sequences: ");
+ print ("Must specify single base sequence\n");
+ print (" OR base sequence and pairs of sequences.\n");
+ exit(1);
+ }
+}
+
+$mfiles = "";
+foreach $s (@fnames) {
+ $mfiles = "$mfiles $s";
+}
+
+$mparams = "";
+foreach $s (@params) {
+ $mparams = "$mparams $s";
+}
+
+$mlagan = "$lagandir/mlagan$mfiles$mparams > $prefix.out";
+print STDERR "\n$mlagan\n\n";
+if(`$mlagan`) { print "\n\n"; exit(1); }
+
+$i=0;
+if (@targets == 1) {
+ foreach $s (@keys) {
+ if ($s ne @targets[0]) {
+ @targets[++$i] = @targets[0];
+ @targets[++$i] = $s;
+ }
+ }
+
+}
+
+# For every (base, other) pair in @targets[1..]: project the two aligned
+# sequences out of the mlagan output with mproject.pl, then pack the
+# projection with mf2bin.pl.  The resulting .bin names are kept in @bins.
+$prjhead = "$lagandir/utils/mproject.pl $prefix.out";
+$binhead = "$lagandir/utils/mf2bin.pl";
+$j=0;
+for($i=1; $i<@targets; $i+=2) {
+    $outprefix = "$prefix$targets[$i]\_$targets[$i+1]";
+    $pargs = "$targets[$i]_aligned $targets[$i+1]_aligned";
+    $pstr = "$prjhead $pargs > $outprefix.prj";
+    print "$pstr\n";
+    if(`$pstr`) { print "\nprojection failure...\n"; exit(1); }
+    $bstr = "$binhead $outprefix.prj -out $outprefix.bin";
+    print "$bstr\n";
+    if(`$bstr`) { print "\npacking failure...\n"; exit(1); }
+    $bins[$j++] = "$outprefix.bin";
+    print "\n";
+}
+
+# Collapse @targets to the distinct sequence names (hash keys, so in no
+# particular order); they are listed in the plot file's SEQUENCES line.
+%distinct=();
+foreach $s (@targets) {
+    $distinct{$s} = 0;
+}
+
+@dseqs = keys %distinct;
+
+$plotfile = "$prefix.plotfile";
+open (PLOTFILE, ">$plotfile");
+
+print PLOTFILE "TITLE $prefix.fa - mlagan\n\n";
+print PLOTFILE "OUTPUT $prefix.pdf\n\n";
+
+print PLOTFILE "SEQUENCES ";
+foreach $s (@dseqs) {
+ print PLOTFILE "$s ";
+}
+print PLOTFILE "\n\n";
+
+$i=1;
+foreach $s (@bins) {
+ print PLOTFILE "ALIGN $s BINARY\n";
+ print PLOTFILE " SEQUENCES @targets[$i] @targets[$i+1]\n";
+ print PLOTFILE " REGIONS $paregmin $paregmax\n";
+ print PLOTFILE " MIN $pamin\n";
+ print PLOTFILE "END\n\n";
+ $i+=2;
+}
+
+print "touch $prefix.ann\n\n";
+`touch $prefix.ann`;
+
+print PLOTFILE "GENES $prefix.ann\n\n";
+print PLOTFILE "LEGEND on\n\n";
+print PLOTFILE "COORDINATE @targets[0]\n\n";
+print PLOTFILE "PAPER letter\n\n";
+print PLOTFILE "BASES $pbases\n\n";
+print PLOTFILE "TICK_DIST $ptickdist\n\n";
+print PLOTFILE "RESOLUTION $presolution\n\n";
+print PLOTFILE "WINDOW $pwindow\n\n";
+print PLOTFILE "NUM_WINDOWS $pnumwindows\n\n";
+
+#$vistadir = `echo \$VISTA_DIR`;
+#chomp $vistadir;
+
+#if ($vistadir eq "") {
+# print ("Must specify environment variable VISTA_DIR\n");
+# exit(1);
+#}
+
+#$vistastr = "$vistadir/RunVista $plotfile";
+#print "$vistastr\n";
+#if (!`$vistastr`) { print "\nVISTA failure...\n"; exit(1); }
+
+print "\n\nmrun.pl -- end.\n\n";
+
+
+
+
+
+
+
+
+
+
diff --git a/src/utils/mrunfile.pl b/src/utils/mrunfile.pl
new file mode 100755
index 0000000..2a20397
--- /dev/null
+++ b/src/utils/mrunfile.pl
@@ -0,0 +1,111 @@
+#!/usr/bin/env perl
+
+# mrunfile.pl -- run mrun.pl (or mrunpairs.pl with -pairwise) from a parameter
+# file, optionally followed by RunVista on the resulting plotfile (-vista).
+#
+# Parameter file layout (blank lines and lines starting with "#" are skipped):
+#   first remaining line    name of the sequence file
+#   lines starting "--"     VISTA options (appended last on the command line)
+#   lines starting "-"      MLAGAN options
+#   every other line        sequence names
+#
+# This script requires the environment variables:
+# LAGAN_DIR and VISTA_DIR (the latter only when -vista is given)
+
+use strict;
+use warnings;
+
+if (@ARGV < 1) {
+    print "usage:\n mrunfile.pl filename [-pairwise] [-vista]\n\n";
+    exit(1);
+}
+
+my $lagandir = $ENV{LAGAN_DIR} or die "LAGAN_DIR not set";
+
+my ($filename, @flags) = @ARGV;
+open(my $paramfh, '<', $filename) or die "Could not open $filename.\n\n";
+
+my $pairwise = grep { $_ eq "-pairwise" } @flags;
+my $dovista  = grep { $_ eq "-vista" } @flags;
+
+my $seqfile;
+my (@params, @vparams, @seqs);
+while (my $line = <$paramfh>) {
+    chomp $line;
+    next if $line eq "" || substr($line, 0, 1) eq "#";
+    if (!defined $seqfile) {
+        $seqfile = $line;
+    } elsif (substr($line, 0, 2) eq "--") {
+        push @vparams, $line;
+    } elsif (substr($line, 0, 1) eq "-") {
+        push @params, $line;
+    } else {
+        push @seqs, $line;
+    }
+}
+close($paramfh);
+
+die "No sequence file named in $filename.\n\n" unless defined $seqfile;
+
+my $mexecs = $pairwise ? "mrunpairs.pl" : "mrun.pl";
+
+my $mstr = join(" ", "$lagandir/utils/$mexecs", $seqfile, @params, @seqs, @vparams);
+
+print "$mstr\n";
+my $moutput = `$mstr`;    # the child's stdout is captured and not shown
+
+if ($dovista) {
+
+    # The plotfile name is derived from the sequence file name with its
+    # extension removed, so the cut position must come from $seqfile
+    # (not from the parameter file name).
+    my $prefix = substr($seqfile, 0, rindex($seqfile, ".")) . "_";
+    $prefix .= "pairwise_" if $pairwise;
+
+    my $plotfile = "$prefix.plotfile";
+
+    my $vistadir = $ENV{VISTA_DIR} or die "VISTA_DIR not set";
+
+    my $vistastr = "$vistadir/RunVista $plotfile";
+    print "$vistastr\n";
+    # empty output from RunVista is treated as failure
+    if (!`$vistastr`) { print "\nVISTA failure...\n"; exit(1); }
+
+}
+
+print "\nmrunfile.pl -- end.\n\n";
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/utils/mrunpairs.pl b/src/utils/mrunpairs.pl
new file mode 100755
index 0000000..f5fa2be
--- /dev/null
+++ b/src/utils/mrunpairs.pl
@@ -0,0 +1,267 @@
+#!/usr/bin/env perl
+
+# This script requires the environment variables:
+# LAGAN_DIR and VISTA_DIR
+
+# VISTA .plotfile defaults
+
+($lagandir = $ENV{LAGAN_DIR}) or die "LAGAN_DIR not set";
+
+$paregmin = 75;
+$paregmax = 100;
+$pamin = 50;
+
+$pbases = 10000;
+$ptickdist = 2000;
+$presolution = 25;
+$pwindow = 40;
+$pnumwindows = 4;
+
+
+if (@ARGV < 1) {
+ print ("usage:\n mrunpairs.pl filename\n");
+ print ("options: [base sequence name [sequence pairs]]\n");
+ print ("default: [base sequence name = first sequence]\n");
+ print ("other MLAGAN parameters...\n");
+ print ("other VISTA parameters...\n");
+ exit(1);
+}
+
+$filename = $ARGV[0];
+
+$i = 1;
+$j = 0;
+$k = 0;
+$l = 0;
+$treespec = 0;
+while ($i < @ARGV) {
+ if ($ARGV[$i] eq "-tree") {
+ $treepos = $j+1;
+ @params[$j] = "-tree";
+ @params[++$j] = "\"$ARGV[++$i]\"";
+ $_ = @params[$j];
+ $topen = tr/"\("/"\("/;
+ $tclose = tr/"\)"/"\)"/;
+ $treespec = ($topen == $tclose);
+ } else {
+ if (substr($ARGV[$i],0,1) eq "-") {
+ if (substr($ARGV[$i],0,2) eq "--") {
+ @vparams[$l++] = $ARGV[$i++];
+ @vparams[$l++] = $ARGV[$i];
+ } else {
+ $j++;
+ @params[$j] = $ARGV[$i];
+ if ((@params[$j] eq "-gapstart") ||
+ (@params[$j] eq "-gapend") ||
+ (@params[$j] eq "-gapcont") ||
+ (@params[$j] eq "-gapperseq") ||
+ (@params[$j] eq "-match") ||
+ (@params[$j] eq "-mismatch") ||
+ (@params[$j] eq "-overlap") ||
+ (@params[$j] eq "-glwidth")) {
+ @params[++$j] = $ARGV[++$i];
+ }
+ }
+ } else {
+ @targets[$k++] = $ARGV[$i];
+ }
+ }
+ $i++;
+}
+
+for ($i=0; $i<@vparams; $i+=2) {
+ if (@vparams[$i] eq "--regmin") { $paregmin = @vparams[$i+1]; }
+ elsif (@vparams[$i] eq "--regmax") { $paregmax = @vparams[$i+1]; }
+ elsif (@vparams[$i] eq "--min") { $pamin = @vparams[$i+1]; }
+ elsif (@vparams[$i] eq "--bases") { $pbases = @vparams[$i+1]; }
+ elsif (@vparams[$i] eq "--tickdist") { $ptickdist = @vparams[$i+1]; }
+ elsif (@vparams[$i] eq "--resolution") { $presolution = @vparams[$i+1]; }
+ elsif (@vparams[$i] eq "--window") { $pwindow = @vparams[$i+1]; }
+ elsif (@vparams[$i] eq "--numwindows") { $pnumwindows = @vparams[$i+1]; }
+}
+
+if (!$treespec) {
+ $j++;
+ $treepos = $j+1;
+ @params[$j] = "-tree";
+ @params[++$j] = "\"()\"";
+}
+
+if ($lagandir eq "") {
+ print ("Must specify environment variable LAGAN_DIR\n");
+ exit(1);
+}
+
+$mextstr = "$lagandir/mextract.pl $filename";
+print "$mextstr\n";
+if(!`$mextstr`) { print "\nMulti-FASTA extraction failure...\n"; exit(1); }
+
+if (-e "$filename.masked") {
+ $mextstr = "$lagandir/mextract.pl $filename.masked -masked";
+ print "$mextstr\n";
+ if(!`$mextstr`) {
+ print "\nMasked Multi-FASTA extraction failure...\n";
+ exit(1);
+ }
+}
+
+open(FASTAFILE, "$filename") || die "Could not open $filename.\n\n";
+
+# Skip ahead to the first FASTA header line.  The scan also stops at end of
+# file: without the defined() test a file with no ">" line would loop forever.
+$line = <FASTAFILE>;
+while (defined($line) && substr($line, 0, 1) ne ">") {
+    $line = <FASTAFILE>;
+}
+$line = "" unless defined($line);
+chomp $line;
+
+$i=0;
+%list=();
+
+if (substr($line, 0, 1) eq ">") {
+ $_ = substr($line, 1);
+ /\w+/g;
+ @keys[$i] = $&;
+ $list{@keys[$i]}=$i;
+ if (@targets == 0) {
+ @targets[0] = @keys[$i];
+ print "Setting Base Sequence: @targets[0]\n";
+ }
+} else {
+ print ("$filename is NOT a Multi-FASTA file...\n");
+ exit(1);
+}
+
+while ($line = <FASTAFILE>) {
+ chomp $line;
+ if (substr($line, 0, 1) eq ">") {
+ $i++;
+ $_ = substr($line, 1);
+ /\w+/g;
+ @keys[$i] = $&;
+ $list{@keys[$i]}=$i;
+ }
+}
+
+$fprefix = substr $filename, 0, (rindex $filename, ".");
+$prefix = "$fprefix\_";
+$pprefix = "$fprefix\_pairwise\_";
+
+foreach $s (@keys) {
+ @fnames[$list{$s}] = "$prefix$keys[$list{$s}].fa";
+}
+
+
+if ((@targets > 1)) {
+ if (@targets %2 != 1) {
+ $c = @targets;
+ print ("$c sequences: ");
+ print ("Must specify single base sequence\n");
+ print (" OR base sequence and pairs of sequences.\n");
+ exit(1);
+ }
+}
+
+$i=0;
+if (@targets == 1) {
+ foreach $s (@keys) {
+ if ($s ne @targets[0]) {
+ @targets[++$i] = @targets[0];
+ @targets[++$i] = $s;
+ }
+ }
+
+}
+
+# Align and pack each sequence pair; pairs occupy @targets[1..] two at a time.
+$j=0;
+for($i=1; $i<@targets; $i+=2) {
+
+    # Output files are named <pprefix><seq1>_<seq2>.{out,bin}.
+    $outprefix = "$pprefix$targets[$i]\_$targets[$i+1]";
+
+    $mfiles = " @fnames[$list{@targets[$i]}] @fnames[$list{@targets[$i+1]}]";
+
+    # the tree argument is rewritten for every pair
+    @params[$treepos]="\"(@targets[$i] @targets[$i+1])\"";
+
+    $mparams = "";
+    foreach $s (@params) {
+        $mparams = "$mparams $s";
+    }
+
+    $mlagan = "$lagandir/mlagan$mfiles$mparams > $outprefix.out";
+    print "\n$mlagan\n\n";
+    # stdout goes to the .out file, so any captured text is treated as failure
+    if(`$mlagan`) { print "\n\n"; exit(1); }
+
+    $binhead = "$lagandir/mpack.pl";
+    $bstr = "$binhead $outprefix.out -out $outprefix.bin";
+    print "$bstr\n";
+    if(`$bstr`) { print "\npacking failure...\n"; exit(1); }
+    @bins[$j++] = "$outprefix.bin";
+    print "\n";
+
+}
+
+
+# Reduce @targets to the set of distinct sequence names.
+%distinct=();
+foreach $s (@targets) {
+    $distinct{$s} = 0;
+}
+
+@dseqs = keys %distinct;
+
+
+$plotfile = "$pprefix.plotfile";
+open (PLOTFILE, ">$plotfile");
+
+print PLOTFILE "TITLE $prefix.fa - mlagan\n\n";
+print PLOTFILE "OUTPUT $pprefix.pdf\n\n";
+
+print PLOTFILE "SEQUENCES ";
+foreach $s (@dseqs) {
+ print PLOTFILE "$s ";
+}
+print PLOTFILE "\n\n";
+
+$i=1;
+foreach $s (@bins) {
+ print PLOTFILE "ALIGN $s BINARY\n";
+ print PLOTFILE " SEQUENCES @targets[$i] @targets[$i+1]\n";
+ print PLOTFILE " REGIONS $paregmin $paregmax\n";
+ print PLOTFILE " MIN $pamin\n";
+ print PLOTFILE "END\n\n";
+ $i+=2;
+}
+
+print "touch $prefix.ann\n\n";
+`touch $prefix.ann`;
+
+print PLOTFILE "GENES $prefix.ann\n\n";
+print PLOTFILE "LEGEND on\n\n";
+print PLOTFILE "COORDINATE @targets[0]\n\n";
+print PLOTFILE "PAPER letter\n\n";
+print PLOTFILE "BASES $pbases\n\n";
+print PLOTFILE "TICK_DIST $ptickdist\n\n";
+print PLOTFILE "RESOLUTION $presolution\n\n";
+print PLOTFILE "WINDOW $pwindow\n\n";
+print PLOTFILE "NUM_WINDOWS $pnumwindows\n\n";
+
+#$vistadir = `echo \$VISTA_DIR`;
+#chomp $vistadir;
+
+#if ($vistadir eq "") {
+# print ("Must specify environment variable VISTA_DIR\n");
+# exit(1);
+#}
+
+#$vistastr = "$vistadir/RunVista $plotfile";
+#print "$vistastr\n";
+#if (!`$vistastr`) { print "\nVISTA failure...\n"; exit(1); }
+
+
+print "\n\nmrunpairs.pl -- end.\n\n";
+
+
+
+
+
diff --git a/src/utils/mviz.pl b/src/utils/mviz.pl
new file mode 100755
index 0000000..121f21a
--- /dev/null
+++ b/src/utils/mviz.pl
@@ -0,0 +1,222 @@
+#!/usr/bin/env perl
+
+
+# This script requires the environment variables:
+# LAGAN_DIR and VISTA_DIR
+
+($lagandir = $ENV{LAGAN_DIR}) or die "LAGAN_DIR not set";
+
+$paregmin = 75;
+$paregmax = 100;
+$pamin = 50;
+
+$pbases = 10000;
+$ptickdist = 2000;
+$presolution = 25;
+$pwindow = 40;
+$pnumwindows = 4;
+
+
+if (@ARGV < 2) {
+ print ("usage:\n mviz.pl data_file param_file [plotfile]\n\n");
+ exit(1);
+}
+
+$pfspec = 0;
+if (@ARGV==3) {
+ $pfspec = 1;
+ $plotfile=@ARGV[2];
+ print "Using VISTA plotfile: $plotfile\n";
+}
+
+
+$filename = $ARGV[1];
+open(PARAMFILE, "$filename") || die "Could not open $filename.\n\n";
+
+$i=0;
+$j=0;
+$k=0;
+$filespec = 0;
+while ($line = <PARAMFILE>) {
+ chomp $line;
+ if ((substr($line, 0, 1) ne "#") && ($line ne "")) {
+ if (!$filespec) {
+ $seqfile = $line;
+ $filespec = 1;
+ } elsif (substr($line,0,1) eq "-") {
+ if (substr($line,0,2) eq "--") {
+ @vparams[$j++] = $line;
+ } else {
+ @params[$i++] = $line;
+ }
+ } else {
+ @targets[$k++] = $line;
+ }
+ }
+}
+
+$seqfile = @ARGV[0];
+
+if ($lagandir eq "") {
+ print ("Must specify environment variable LAGAN_DIR\n");
+ exit(1);
+}
+
+for ($i=0; $i<@vparams; $i+=2) {
+ if (@vparams[$i] eq "--regmin") { $paregmin = @vparams[$i+1]; }
+ elsif (@vparams[$i] eq "--regmax") { $paregmax = @vparams[$i+1]; }
+ elsif (@vparams[$i] eq "--min") { $pamin = @vparams[$i+1]; }
+ elsif (@vparams[$i] eq "--bases") { $pbases = @vparams[$i+1]; }
+ elsif (@vparams[$i] eq "--tickdist") { $ptickdist = @vparams[$i+1]; }
+ elsif (@vparams[$i] eq "--resolution") { $presolution = @vparams[$i+1]; }
+ elsif (@vparams[$i] eq "--window") { $pwindow = @vparams[$i+1]; }
+ elsif (@vparams[$i] eq "--numwindows") { $pnumwindows = @vparams[$i+1]; }
+}
+
+open(FASTAFILE, "$seqfile") || die "Could not open $seqfile.\n\n";
+
+$prefix = substr $seqfile, 0, (rindex $seqfile, ".");
+if (substr($prefix, -1, 1) ne "_") {$prefix = "$prefix\_";}
+
+# Skip ahead to the first FASTA header line.  The scan also stops at end of
+# file: without the defined() test a file with no ">" line would loop forever.
+$line = <FASTAFILE>;
+while (defined($line) && substr($line, 0, 1) ne ">") {
+    $line = <FASTAFILE>;
+}
+$line = "" unless defined($line);
+chomp $line;
+
+$i=0;
+%list=();
+
+if (substr($line, 0, 1) eq ">") {
+    @keys[$i] = substr($line, 1);
+
+    $list{@keys[$i]}=$i;
+
+    # with no targets in the parameter file, the first sequence is the base
+    if (@targets == 0) {
+        @targets[0] = @keys[$i];
+        print "Setting Base Sequence: @targets[0]\n";
+    }
+} else {
+    # name the sequence file that was read; $filename is the parameter file here
+    print ("$seqfile is NOT a Multi-FASTA file...\n");
+    exit(1);
+}
+
+while ($line = <FASTAFILE>) {
+ chomp $line;
+
+ if (substr($line, 0, 1) eq ">") {
+ $i++;
+ @keys[$i] = substr($line, 1);
+
+ $list{@keys[$i]}=$i;
+ }
+}
+
+if ((@targets > 1)) {
+
+ $j=0;
+ for ($i=1; $i<@targets; $i++) {
+ $_ = @targets[$i];
+ @bp[$j++]=/\w+/g;
+ $_=$&;
+ @bp[$j++]=/\w+/g;
+ }
+ $j=1;
+ foreach $s (@bp) {
+ @targets[$j++]=$s;
+ }
+ if (@targets %2 != 1) {
+ $c = @targets;
+ print ("$c sequences: ");
+ print ("Must specify single base sequence\n");
+ print (" OR base sequence and pairs of sequences.\n");
+ exit(1);
+ }
+}
+
+$i=0;
+if (@targets == 1) {
+ foreach $s (@keys) {
+ $s = substr $s, 0, (rindex $s, "_aligned");
+ if ($s ne @targets[0]) {
+ @targets[++$i] = @targets[0];
+ @targets[++$i] = $s;
+ }
+ }
+}
+
+print "TARGETS:\n";foreach $s (@targets) { print "\"$s\"\n"; }
+
+# Project and pack each sequence pair; pairs occupy @targets[1..] two at a time.
+$prjhead = "$lagandir/utils/mproject.pl $seqfile";
+$binhead = "$lagandir/utils/mf2bin.pl";
+$j=0;
+for($i=1; $i<@targets; $i+=2) {
+    # Output files are named <prefix><seq1>_<seq2>.{prj,bin}.
+    $outprefix = "$prefix$targets[$i]\_$targets[$i+1]";
+    $pargs = "$targets[$i]_aligned $targets[$i+1]_aligned";
+    $pstr = "$prjhead $pargs > $outprefix.prj";
+    print "$pstr\n";
+    # stdout is redirected to the .prj file, so any captured text signals a problem
+    if(`$pstr`) { print "\nprojection failure...\n"; exit(1); }
+    $bstr = "$binhead $outprefix.prj -out $outprefix.bin";
+    print "$bstr\n";
+    if(`$bstr`) { print "\npacking failure...\n"; exit(1); }
+    @bins[$j++] = "$outprefix.bin";
+    print "\n";
+}
+
+# Reduce @targets to the set of distinct sequence names.
+%distinct=();
+foreach $s (@targets) {
+    $distinct{$s} = 0;
+}
+
+@dseqs = keys %distinct;
+
+if (!$pfspec) {
+
+ $plotfile = "$prefix.plotfile";
+ open (PLOTFILE, ">$plotfile");
+
+ print PLOTFILE "TITLE $prefix.fa - mlagan\n\n";
+ print PLOTFILE "OUTPUT $prefix.pdf\n\n";
+
+ print PLOTFILE "SEQUENCES ";
+ foreach $s (@dseqs) {
+ print PLOTFILE "$s ";
+ }
+ print PLOTFILE "\n\n";
+
+ $i=1;
+ foreach $s (@bins) {
+ print PLOTFILE "ALIGN $s BINARY\n";
+ print PLOTFILE " SEQUENCES @targets[$i] @targets[$i+1]\n";
+ print PLOTFILE " REGIONS $paregmin $paregmax\n";
+ print PLOTFILE " MIN $pamin\n";
+ print PLOTFILE "END\n\n";
+ $i+=2;
+ }
+
+ print "touch $prefix.ann\n\n";
+ `touch $prefix.ann`;
+
+ print PLOTFILE "GENES $prefix.ann\n\n";
+ print PLOTFILE "LEGEND on\n\n";
+ print PLOTFILE "COORDINATE @targets[0]\n\n";
+ print PLOTFILE "PAPER letter\n\n";
+ print PLOTFILE "BASES $pbases\n\n";
+ print PLOTFILE "TICK_DIST $ptickdist\n\n";
+ print PLOTFILE "RESOLUTION $presolution\n\n";
+ print PLOTFILE "WINDOW $pwindow\n\n";
+ print PLOTFILE "NUM_WINDOWS $pnumwindows\n\n";
+
+}
+
+($vistadir = $ENV{VISTA_DIR}) or die "VISTA_DIR not set";
+
+$vistastr = "$vistadir/RunVista $plotfile";
+print "$vistastr\n";
+if (!`$vistastr`) { print "\nVISTA failure...\n"; exit(1); }
+
+print "\n\nmviz.pl -- end.\n\n";
+
+
diff --git a/src/utils/overlay.c b/src/utils/overlay.c
new file mode 100644
index 0000000..76637be
--- /dev/null
+++ b/src/utils/overlay.c
@@ -0,0 +1,261 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include <string.h>
+#include <ctype.h>
+
+#define MAX_SEQS 63
+// Minimum of two, three or four values.  Each expansion is wrapped in its own
+// parentheses so it can be used inside a larger expression (1 + MIN2(a,b)).
+// Arguments may be evaluated more than once; keep them free of side effects.
+#define MIN2(y,z) (((y)<(z))?(y):(z))
+#define MIN3(x,y,z) MIN2((x),MIN2((y),(z)))
+#define MIN4(w,x,y,z) MIN2((w),MIN3((x),(y),(z)))
+
+
+// Newick: (((One:0.2,Two:0.3):0.3,(Three:0.5,Four:0.3):0.2):0.3,Five:0.7):0.0;
+
+// Takes a tree in newick format, builds an internal "tree" structure
+// generates calls to other programs with correct weights
+
+
+
+typedef struct sequence {
+ char* seqname;
+ char* aligned;
+ char* overlay;
+ int alignlen;
+ int overlaylen;
+ int mynum;
+} seq;
+
+
+seq* allseqs[MAX_SEQS];
+int numseqs;
+
+
+char* dna_alpha = "ACGT";
+char* valid_alpha = "ACGTN-";
+char* DNA_PRINT;
+char* DNA_LET;
+char* NUM_ONES;
+
+// Allocate and fill the three 16-entry tables indexed by a 4-bit base mask
+// (A=1, C=2, G=4, T=8):
+//   DNA_LET   index 0..3 of a single base, -1 for every other mask
+//   DNA_PRINT one-letter code printed for the mask
+//   NUM_ONES  number of bits set in the mask
+void init_consts() {
+  static const char print_codes[] = "NACMGRSVTWYHKDBX";
+  int mask, bit, ones;
+
+  DNA_LET = (char*) malloc (sizeof(char) * 0x10);
+  DNA_PRINT = (char*) malloc (sizeof(char) * 0x10);
+  NUM_ONES = (char*) malloc (sizeof(char) * 0x10);
+
+  for (mask = 0; mask < 0x10; mask++) {
+    ones = 0;
+    for (bit = 1; bit < 0x10; bit <<= 1) {
+      if (mask & bit)
+        ones++;
+    }
+    NUM_ONES[mask] = ones;
+    DNA_PRINT[mask] = print_codes[mask];
+    DNA_LET[mask] = -1;
+  }
+
+  // only the four single-base masks map to a letter index
+  DNA_LET[1] = 0;
+  DNA_LET[2] = 1;
+  DNA_LET[4] = 2;
+  DNA_LET[8] = 3;
+}
+
+
+// Allocate an empty sequence record: no name, no data, zero lengths and
+// mynum = -1.  Exits with status 2 when memory cannot be obtained.
+seq* mk_seq() {
+  seq* res = (seq*)malloc(sizeof(seq));
+  if (!res) {
+    fprintf(stderr, "OUT OF MEMORY\n");
+    exit(2);
+  }
+  res->seqname = 0;
+  res->aligned = 0;
+  res->overlay = 0;
+  res->alignlen = 0;
+  res->overlaylen = 0;
+  res->mynum = -1;
+  return res;
+}
+
+// Read one FASTA record from input.
+//   target >= 0  store the record in allseqs[target] (header line and aligned text)
+//   target <  0  attach the residues as the overlay of the first loaded
+//                sequence whose name begins with this record's header line;
+//                exits with status 2 when no such sequence exists
+// Whitespace is dropped; characters outside valid_alpha (compared in upper
+// case) are stored as 'N'.  Returns 1 when another record follows, else 0.
+int read_align(FILE* input, int target) {
+  char* res;
+  char* grown;
+  int i, ressize = 1, numread=0;
+  char temp[1024];
+  int currchar;   // int, so that EOF differs from every data byte
+
+  if (target >= MAX_SEQS) {
+    fprintf(stderr, "TOO MANY SEQUENCES (max %d)\n", MAX_SEQS);
+    exit (2);
+  }
+
+  // fgets fails on an empty or exhausted stream; feof() alone cannot tell
+  // before the first read has been attempted
+  if (!fgets(temp, 255, input)) {
+    fprintf(stderr, "2COULDN'T READ ALIGNMENT\n");
+    exit (2);
+  }
+  if (temp[0] != '>') {
+    fprintf(stderr, "File is not in FASTA format!!\n");
+    exit(1);
+  }
+  // strip the newline if present (a long or final line may lack one)
+  temp[strcspn(temp, "\n")] = 0;
+
+  res = (char*) malloc(sizeof(char)*ressize);
+  if (!res) {
+    fprintf(stderr, "OUT OF MEMORY\n");
+    exit (2);
+  }
+
+  while (((currchar = fgetc(input)) != '>') && (currchar != EOF)) {
+    if (isspace(currchar))
+      continue;
+    if (!strchr(valid_alpha, toupper(currchar)))
+      currchar = 'N';
+    res[numread++] = (char) currchar;
+    if (numread >= ressize) {
+      grown = (char*) realloc(res, sizeof(char)*(ressize*=2));
+      if (!grown) {
+        fprintf(stderr, "OUT OF MEMORY\n");
+        exit (2);
+      }
+      res = grown;
+    }
+  }
+
+  if (target >= 0) {
+    allseqs[target]->seqname = (char*) malloc (strlen(temp)+1);
+    if (!allseqs[target]->seqname) {
+      fprintf(stderr, "OUT OF MEMORY\n");
+      exit (2);
+    }
+    strcpy(allseqs[target]->seqname, temp);
+    allseqs[target]->aligned = res;
+    allseqs[target]->alignlen = numread;
+  }
+  else {
+    for (i = 0; i < numseqs; i++) {
+      if (!strncmp(allseqs[i]->seqname, temp, strlen(temp))) {
+        free(allseqs[i]->overlay);   // drop an earlier overlay for the same name
+        allseqs[i]->overlay = res;
+        allseqs[i]->overlaylen = numread;
+        break;
+      }
+    }
+
+    if (i == numseqs) {
+      fprintf(stderr, "seq %s not found!\n", temp);
+      exit(2);
+    }
+  }
+
+  if (currchar == '>') {
+    // leave the next record's marker for the following call
+    ungetc(currchar, input);
+    return 1;
+  }
+  return 0;
+}
+
+// Load every record of the alignment file into allseqs[0..], counting them
+// in numseqs.  Exits with status 2 when the file cannot be opened or holds
+// more than MAX_SEQS records.
+void read_align_file (char* filename) {
+
+  FILE* input;
+  int more;
+
+  if (!(input = fopen (filename, "r"))) {
+    fprintf(stderr, "COULDN'T OPEN ALIGNMENT\n");
+    exit (2);
+  }
+  do {
+    if (numseqs >= MAX_SEQS) {
+      fprintf(stderr, "TOO MANY SEQUENCES IN ALIGNMENT (max %d)\n", MAX_SEQS);
+      exit (2);
+    }
+    more = read_align(input,numseqs++);
+  } while (more);
+  fclose(input);
+}
+
+
+// Read the overlay sequences from every file named in argv[2..argc-1]; each
+// record is attached to the alignment row with the matching name by
+// read_align(input, -1).  Exits with status 2 when a file cannot be opened.
+void read_sequences(int argc, char**argv) {
+  FILE* input;
+  int i;
+
+  for (i=2; i < argc; i++) {
+    if (!(input = fopen (argv[i], "r"))) {
+      fprintf(stderr, "COULDN'T OPEN SEQ %d %s\n",i,argv[i]);
+      exit (2);
+    }
+
+    // same bookkeeping as before, but never past the end of allseqs
+    if (i-1 < MAX_SEQS)
+      allseqs[i-1]->mynum = i-1;
+
+    while (read_align(input,-1))
+      ;
+    fclose(input);
+  }
+}
+// Replace the non-gap characters of alignment row w, in order, with the
+// characters of its overlay sequence.  A row without an overlay is left
+// unchanged; if the overlay is shorter than the row's residue count the
+// remaining residues are kept.  The residue count and the overlay length are
+// reported on stderr so that a mismatch is visible.
+void overlayseq(int w) {
+  seq* s = allseqs[w];
+  int pos=0, i;
+
+  if (!s->overlay) {
+    fprintf(stderr, "no overlay for %s, alignment left unchanged\n",
+            s->seqname ? s->seqname : "(unnamed)");
+    return;
+  }
+  for (i = 0; i < s->alignlen; i++) {
+    if (s->aligned[i] != '-') {
+      if (pos < s->overlaylen)
+        s->aligned[i] = s->overlay[pos];
+      pos++;
+    }
+  }
+  fprintf(stderr, "check %d == %d\n",pos,s->overlaylen);
+}
+
+
+// Apply overlayseq to every loaded alignment row, in index order.
+void overlay() {
+  int row = 0;
+  while (row < numseqs) {
+    overlayseq(row);
+    row++;
+  }
+}
+
+// Write every alignment row to stdout: the stored header line, then the
+// aligned text broken into lines of 60 characters.
+void printAlign() {
+  int row, col;
+  const seq* cur;
+
+  for (row = 0; row < numseqs; row++) {
+    cur = allseqs[row];
+    fprintf(stdout, "%s", cur->seqname);
+    for (col = 0; col < cur->alignlen; col++) {
+      // a line break precedes columns 0, 60, 120, ...
+      if (col % 60 == 0)
+        putc('\n', stdout);
+      putc(cur->aligned[col], stdout);
+    }
+    putc('\n', stdout);
+  }
+}
+
+
+// overlay align.mfa seq1 [seq2].... > newalign.mfa
+// Loads the alignment, reads the overlay sequences, substitutes them into the
+// aligned rows and prints the resulting alignment on stdout.
+int main(int argc, char** argv) {
+  int i;
+
+  if (argc < 3) {
+    fprintf(stderr, "Usage: overlay align.mfa seq1 [seq2].... > newalign.mfa\n");
+    exit(2);
+  }
+  numseqs = 0;
+  init_consts();
+
+  // pre-allocate one empty record per possible sequence
+  for (i=0; i < MAX_SEQS; i++) {
+    allseqs[i] = mk_seq();
+  }
+
+  read_align_file(argv[1]);
+  read_sequences(argc, argv);
+  overlay();
+  printAlign();
+  return 0;
+}
diff --git a/src/utils/rc.c b/src/utils/rc.c
new file mode 100644
index 0000000..a1ca3b7
--- /dev/null
+++ b/src/utils/rc.c
@@ -0,0 +1,71 @@
+#include <stdio.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <string.h>
+
+char* alpha = "ATCGN";
+
+typedef struct Sequence {
+ char* lets;
+ int numlets;
+ char* name;
+ char* rptr;
+} seq;
+
+// Complement of a nucleotide letter, preserving case: A<->T, C<->G, N->N.
+// Any other character is returned unchanged.
+char comp(char c) {
+  static const char from[] = "ATCGNatcgn";
+  static const char to[]   = "TAGCNtagcn";
+  const char* hit;
+
+  // strchr would match the terminator for '\0', so handle it first
+  if (c == '\0')
+    return c;
+  hit = strchr(from, c);
+  return hit ? to[hit - from] : c;
+}
+
+// Reverse-complement the first FASTA record read from stdin: the header line
+// is echoed, then the complemented sequence is printed back to front, 60
+// characters per line.  Empty input produces no output.
+int main (int argc, char **argv){
+  char* res;
+  char* grown;
+  int ressize = 1, numread = 0, i;
+  char temp[256];
+  int currchar;   // int, so that EOF differs from every data byte
+
+  if (!fgets(temp, 255, stdin))
+    return 0;
+  if (temp[0] != '>') {
+    fprintf(stderr, "File is not in FASTA format!!\n");
+    exit(1);
+  }
+  // strip the newline if present (a long or final line may lack one)
+  temp[strcspn(temp, "\n")] = 0;
+  printf ("%s\n", temp);
+
+  res = (char*) malloc(sizeof(char) * ressize);
+  if (!res) {
+    fprintf(stderr, "Out of memory\n");
+    exit(1);
+  }
+
+  while (((currchar = fgetc(stdin)) != '>') && (currchar != EOF)) {
+    if (isspace(currchar))
+      continue;
+    res[numread++] = comp ((char) currchar);
+    if (numread >= ressize) {
+      grown = (char*) realloc(res, sizeof(char)*(ressize*=2));
+      if (!grown) {
+        fprintf(stderr, "Out of memory\n");
+        exit(1);
+      }
+      res = grown;
+    }
+  }
+
+  // emit from the last stored character to the first
+  i = 0;
+  while (--numread >= 0){
+    putchar (res[numread]);
+    i++;
+    if (i % 60 == 0){
+      putchar ('\n');
+      i = 0;
+    }
+  }
+  if (i != 0) putchar ('\n');
+  free (res);
+  return 0;
+}
diff --git a/src/utils/scorealign.c b/src/utils/scorealign.c
new file mode 100644
index 0000000..3ce40f2
--- /dev/null
+++ b/src/utils/scorealign.c
@@ -0,0 +1,479 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <assert.h>
+#include <math.h>
+
+#define NUCLEOTIDE_MATRIX_FILE "nucmatrix.txt"
+#define COLUMNS 60
+
+int cons_rate = 0;
+int doibounds = 0, doubounds = 0, leftbound, rightbound, pairseqlen;
+int doregions = 0, docropxmfa = 0;
+char **seqs;
+int *seqid, *seqstart, *seqend;
+char *seqdir, **seqcomment;
+int numseqs, seqlen = -1;
+int matchscore[256][256];
+int gapopen = -1500, gapcont = -50;
+
+// Smaller of two ints.  Declared static: a plain C99 `inline` definition
+// emits no external symbol, so a call that is not inlined would fail to link.
+static inline int min (int a, int b){
+  return (a < b) ? a : b;
+}
+
+// Larger of two ints.  Declared static: a plain C99 `inline` definition
+// emits no external symbol, so a call that is not inlined would fail to link.
+static inline int max (int a, int b){
+  return (a > b) ? a : b;
+}
+
+// Score of one alignment column: 0 for gap/gap, gapcont when exactly one side
+// is a gap, otherwise the matchscore table entry for the two characters.
+// Declared static: a plain C99 `inline` definition emits no external symbol,
+// so a call that is not inlined would fail to link.
+static inline int scoreMatch (char c, char d){
+  if (c == '-' && d == '-') return 0;
+  if (c == '-' || d == '-') return gapcont;
+  return matchscore[(unsigned char) c][(unsigned char) d];
+}
+
+int conv2seqcoords (int pos, int i, int j){
+ int alignpos = -1, pairpos = -1;
+
+ while (pairpos < pos && alignpos < seqlen){
+ alignpos++;
+ if (seqs[i][alignpos] != '-' || seqs[j][alignpos] != '-') pairpos++;
+ if (alignpos >= seqlen){
+ printf ("%d %d %d %d", pairpos, pos, alignpos, seqlen);
+ }
+ assert (alignpos < seqlen);
+ }
+
+ return alignpos+1;
+}
+
+#define CN 0
+#define NC 1
+
+int scorePair (char *seq1, char *seq2, int seqindex1, int seqindex2){
+ int score[2][2];
+ char *dad[2], *state;
+ int i, j, CNscore, NCscore, left = pairseqlen, right = 1;
+
+ for (i = 0; i < 2; i++){
+ dad[i] = (char *) malloc (sizeof (char) * pairseqlen); assert (dad[i]);
+ dad[i][0] = -1;
+ score[i][0] = 0;
+ }
+ state = (char *) malloc (sizeof (char) * pairseqlen); assert (state);
+
+ j = 0;
+ for (i = 0; i < pairseqlen; i++){
+ CNscore = score[CN][j];
+ NCscore = score[NC][j] + gapopen;
+ if (CNscore > NCscore){ score[CN][!j] = CNscore; dad[CN][i] = CN; }
+ else { score[CN][!j] = NCscore; dad[CN][i] = NC; }
+ score[CN][!j] += scoreMatch (seq1[i], seq2[i]);
+
+ CNscore = score[CN][j] + gapopen;
+ NCscore = score[NC][j];
+ if (CNscore > NCscore){ score[NC][!j] = CNscore; dad[NC][i] = CN; }
+ else { score[NC][!j] = NCscore; dad[NC][i] = NC; }
+
+ j = !j;
+ }
+
+ i = pairseqlen - 1;
+ j = (score[CN][j] > score[NC][j]) ? CN : NC;
+
+ while (i >= 0){
+ state[i] = j;
+ assert (j == CN || j == NC);
+ j = dad[j][i];
+ i--;
+ }
+
+ j = 0;
+ CNscore = 0;
+ for (i = 0; i < pairseqlen; i++){
+ if (state[i] == CN){
+ if (!CNscore){
+ CNscore = 1;
+ if (doregions) printf ("Conserved region: %d ", i+1);
+ left = min (left, i+1);
+ }
+ else if (i == pairseqlen - 1){
+ if (doregions) printf ("%d\n", i+1);
+ right = max (right, i+1);
+ }
+ j++;
+ }
+ else if (CNscore){
+ CNscore = 0;
+ if (doregions) printf ("%d\n", i);
+ right = max (right, i);
+ }
+ }
+
+ if (j > 0){
+ left = conv2seqcoords(left-1, seqindex1, seqindex2);
+ right = conv2seqcoords(right-1, seqindex1, seqindex2);
+
+ if (doibounds){
+ leftbound = max (leftbound, left);
+ rightbound = min (rightbound, right);
+ }
+ else if (doubounds){
+ leftbound = min (leftbound, left);
+ rightbound = max (rightbound, right);
+ }
+ }
+ else {
+ leftbound = 1;
+ rightbound = seqlen;
+ }
+
+ for (i = 0; i < 2; i++) free (dad[i]);
+ free (state);
+
+ return j;
+}
+
+// Copy the columns of the pair (orig1, orig2) into (dest1, dest2), dropping
+// every column in which both rows hold a gap.  *length is the number of
+// columns on entry and the number of columns kept on return.
+void project (char *orig1, char *orig2, char *dest1, char *dest2, int *length){
+  const int total = *length;
+  int src, kept = 0;
+
+  for (src = 0; src < total; src++){
+    if (orig1[src] == '-' && orig2[src] == '-')
+      continue;
+    dest1[kept] = orig1[src];
+    dest2[kept] = orig2[src];
+    kept++;
+  }
+  *length = kept;
+}
+
+// Number of non-gap characters of sequence i in columns [0, pos).
+int countleft (int pos, int i){
+  const char *row = seqs[i];
+  int col = 0, residues = 0;
+
+  while (col < pos){
+    if (row[col] != '-') residues++;
+    col++;
+  }
+
+  return residues;
+}
+
+// Number of non-gap characters of sequence i in columns (pos, seqlen-1].
+int countright (int pos, int i){
+  const char *row = seqs[i];
+  int col = seqlen - 1, residues = 0;
+
+  while (col > pos){
+    if (row[col] != '-') residues++;
+    col--;
+  }
+
+  return residues;
+}
+
+// Print the alignment columns leftbound..rightbound (1-based, inclusive) of
+// every sequence in XMFA form, followed by a "= score=" line.  Nothing is
+// printed when leftbound > rightbound.  When the input carried no XMFA
+// coordinates (seqid[0] == -1) default ids, ranges and directions are filled in.
+void printXMFA (int score){
+  int i, j, k;
+  size_t clen;
+
+  if (leftbound > rightbound) {
+    return;
+  }
+
+  if (seqid[0] == -1){
+    for (i = 0; i < numseqs; i++){
+      seqid[i] = i+1;
+      seqstart[i] = 1;
+      seqend[i] = countleft (seqlen, i);
+      seqdir[i] = '+';
+      strcpy (seqcomment[i], "");
+    }
+  }
+
+  for (i = 0; i < numseqs; i++){
+    // strip one trailing newline; an empty comment has no last character
+    clen = strlen (seqcomment[i]);
+    if (clen > 0 && seqcomment[i][clen - 1] == '\n')
+      seqcomment[i][clen - 1] = '\0';
+
+    printf (">%d:%d-%d %c %s\n", seqid[i],
+            seqstart[i] + countleft (leftbound-1, i), seqend[i] - countright(rightbound-1, i),
+            seqdir[i], seqcomment[i]);
+
+    k = 0;
+    for (j = leftbound - 1; j <= rightbound - 1; j++){
+      printf ("%c", seqs[i][j]);
+      k++;
+      if (k % COLUMNS == 0) printf("\n");
+    }
+    if (k % COLUMNS != 0) printf("\n");
+  }
+  printf ("= score=%d\n", score);
+}
+
+// Sum scorePair over every pair of sequences (each pair is first projected to
+// drop gap/gap columns) and report the result:
+//   -regions                      nothing further is printed here
+//   bounds requested, -cropxmfa   the cropped alignment via printXMFA
+//   bounds requested              "score=... start=... end=..."
+//   otherwise                     the bare score
+void scoreAlign (){
+  int a, b;
+  int total = 0;
+  char *rowa, *rowb;
+
+  for (a = 0; a + 1 < numseqs; a++){
+    for (b = a + 1; b < numseqs; b++){
+      pairseqlen = seqlen;
+      rowa = (char *) malloc (sizeof (char) * seqlen); assert (rowa);
+      rowb = (char *) malloc (sizeof (char) * seqlen); assert (rowb);
+      project (seqs[a], seqs[b], rowa, rowb, &pairseqlen);
+      total += scorePair (rowa, rowb, a, b);
+      free (rowa);
+      free (rowb);
+    }
+  }
+
+  if (doregions){
+    return;
+  }
+  if (!(doibounds || doubounds)){
+    printf ("%d\n", total);
+    return;
+  }
+  if (docropxmfa){
+    printXMFA (total);
+  }
+  else {
+    printf ("score=%d start=%d end=%d\n", total, leftbound, rightbound);
+  }
+}
+
+// Nonzero when ch is an accepted alignment symbol: A, C, G, T, N, '.' or '-'.
+// Declared static: a plain C99 `inline` definition emits no external symbol,
+// so a call that is not inlined would fail to link.
+static inline int issymbol (char ch){
+  return ch == 'A' || ch == 'C' || ch == 'G' || ch == 'T' || ch == 'N' || ch == '.' || ch == '-';
+}
+
+void extractXMFAinfo (char *line, int *si, int *ss, int *se, char *sd, char **sc){
+ int numread;
+
+ *sc = malloc (sizeof (char) * 1024);
+ numread = sscanf (line, ">%d:%d-%d %c %s", si, ss, se, sd, *sc);
+
+ if (numread < 4){
+ *si = *ss = *se = -1;
+ *sd = '~';
+ strcpy (*sc, "");
+ }
+ else if (numread < 5){
+ strcpy (*sc, "");
+ }
+}
+
+// Read one sequence record from file.
+// The header line is parsed by extractXMFAinfo into *si, *ss, *se, *sd and a
+// freshly allocated *sc.  The following characters are upper-cased and those
+// accepted by issymbol are collected until the next '>' or '=' (which is
+// pushed back) or end of file.  The first record read fixes seqlen; later
+// records must have the same length.
+// Returns the malloc'ed symbol buffer (not NUL-terminated), or NULL at end of
+// input, on a "=" separator line, or when the record holds no symbols; in
+// those cases *sc is NULL.
+char *getSequence (FILE *file, int *si, int *ss, int *se, char *sd, char **sc){
+  int charsread = 0;
+  int bufsize = 1;
+  char *buffer;
+  int ch;   // int, so that EOF differs from every data byte
+  char line[1024];
+
+  *sc = NULL;
+
+  // fgets fails on an exhausted stream even when feof() was not yet set
+  if (!fgets (line, 1024, file)) return NULL;
+  if (line[0] == '='){
+    return NULL;
+  }
+
+  extractXMFAinfo (line, si, ss, se, sd, sc);
+
+  buffer = (char *) malloc (sizeof (char) * bufsize); assert (buffer);
+
+  while ((ch = fgetc (file)) != EOF){
+    ch = toupper (ch);
+
+    if (ch == '>' || ch == '='){
+      ungetc (ch, file);
+      break;
+    }
+
+    if (!issymbol ((char) ch))
+      continue;
+
+    buffer[charsread++] = (char) ch;
+    if (charsread == bufsize){
+      bufsize *= 2;
+      buffer = (char *) realloc (buffer, sizeof (char) * bufsize); assert (buffer);
+    }
+  }
+
+  if (charsread == 0){
+    free (buffer);
+    free (*sc);   // the caller only takes ownership on success
+    *sc = NULL;
+    return NULL;
+  }
+
+  if (seqlen == -1)
+    seqlen = charsread;
+  else {
+    assert (seqlen == charsread);
+  }
+
+  return buffer;
+}
+
+int getSequences (FILE *file){
+ char *newseq, sd, *sc;
+ int i, si, ss, se;
+
+ seqlen = -1;
+ numseqs = 0;
+
+ seqs = (char **) malloc (sizeof (char *) * 0);
+ seqid = (int *) malloc (sizeof (int) * 0);
+ seqstart = (int *) malloc (sizeof (int) * 0);
+ seqend = (int *) malloc (sizeof (int) * 0);
+ seqdir = (char *) malloc (sizeof (char) * 0);
+ seqcomment = (char **) malloc (sizeof (char *) * 0);
+
+ while (newseq = getSequence (file, &si, &ss, &se, &sd, &sc)){
+ numseqs++;
+
+ seqs = (char **) realloc (seqs, sizeof (char *) * numseqs);
+ seqid = (int *) realloc (seqid, sizeof (int) * numseqs);
+ seqstart = (int *) realloc (seqstart, sizeof (int) * numseqs);
+ seqend = (int *) realloc (seqend, sizeof (int) * numseqs);
+ seqdir = (char *) realloc (seqdir, sizeof (char) * numseqs);
+ seqcomment = (char **) realloc (seqcomment, sizeof (char *) * numseqs);
+
+ seqs[numseqs - 1] = newseq;
+ seqid[numseqs - 1] = si;
+ seqstart[numseqs - 1] = ss;
+ seqend[numseqs - 1] = se;
+ seqdir[numseqs - 1] = sd;
+ seqcomment[numseqs - 1] = sc;
+ }
+
+ if (numseqs > 0) return 1;
+
+ free (seqs);
+ free (seqid);
+ free (seqstart);
+ free (seqend);
+ free (seqdir);
+ free (seqcomment);
+
+ return 0;
+}
+
+int processSequences (FILE *file){
+ int i, j;
+
+ if (getSequences (file)){
+ if (doibounds){
+ leftbound = 0;
+ rightbound = 1000000000;
+ }
+ else if (doubounds){
+ leftbound = 1000000000;
+ rightbound = 0;
+ }
+
+ scoreAlign();
+
+ for (i = 0; i < numseqs; i++) free (seqs[i]);
+ free (seqs);
+ free (seqid);
+ free (seqstart);
+ free (seqend);
+ free (seqdir);
+ for (i = 0; i < numseqs; i++) free (seqcomment[i]);
+ free (seqcomment);
+
+ return 1;
+ }
+ return 0;
+}
+
+void calculateScoreMatrix(){
+ char *alpha = "ATCG";
+ int i, j;
+
+ double p_ij = (double) cons_rate / 100.0;
+ double match = log (p_ij / 0.25);
+ double mismatch = log ((1 - p_ij) / 0.75);
+
+ for (i = 0; i < strlen (alpha); i++){
+ for (j = 0; j < strlen (alpha); j++){
+ matchscore[(unsigned char) alpha[i]][(unsigned char) alpha[j]] =
+ (i == j) ? (int)(match * 100) : (int)(mismatch * 100);
+ }
+ }
+ gapopen = (int)(-40 * match * 100);
+}
+
+// Load the substitution matrix $LAGAN_DIR/<filename> into matchscore.
+// File layout as read here: a first line listing the column letters, then one
+// row per letter (row letter followed by one integer per column), then
+// optionally two integers for gapopen and gapcont (left unchanged if absent).
+// Exits with status 1 when LAGAN_DIR is unset or the file cannot be opened
+// or parsed.
+void readScoreMatrix (char *filename){
+  FILE *file;
+  int i, j, k, numlets = 0;
+  char lets[256], line[1024], row[2];
+  char *lagan_dir;
+
+  lagan_dir = getenv ("LAGAN_DIR");
+  if (!lagan_dir){
+    fprintf (stderr, "Error: $LAGAN_DIR not set.\n");
+    exit (1);
+  }
+
+  // bounded formatting: LAGAN_DIR comes from the environment and may be long
+  if (snprintf (line, sizeof (line), "%s/%s", lagan_dir, filename) >= (int) sizeof (line)){
+    fprintf (stderr, "Error: path to %s is too long.\n", filename);
+    exit (1);
+  }
+  fprintf (stderr, "%s\n", line);
+
+  file = fopen (line, "r");
+  if (!file){
+    fprintf (stderr, "Error: could not open %s\n", line);
+    exit (1);
+  }
+
+  if (!fgets (line, 1024, file)){
+    fprintf (stderr, "Error: %s is empty.\n", filename);
+    exit (1);
+  }
+  for (i = 0; line[i] != '\0' && numlets < (int) sizeof (lets); i++){
+    if (!isspace ((unsigned char) line[i])){
+      lets[numlets++] = line[i];
+    }
+  }
+
+  for (i = 0; i < numlets; i++){
+    if (fscanf (file, "%1s", row) != 1){
+      fprintf (stderr, "Error: %s is truncated.\n", filename);
+      exit (1);
+    }
+    for (j = 0; j < numlets; j++){
+      if (fscanf (file, "%d", &k) != 1){
+        fprintf (stderr, "Error: bad score in %s.\n", filename);
+        exit (1);
+      }
+      matchscore[(unsigned char) row[0]][(unsigned char) lets[j]] = k;
+    }
+  }
+
+  // gap penalties keep their current values when the line is missing
+  fscanf (file, "%d%d", &gapopen, &gapcont);
+  fclose (file);
+}
+
+// Score every alignment block of filename.  The score table is cleared and
+// then either computed from cons_rate (cons_rate >= 0) or read from
+// NUCLEOTIDE_MATRIX_FILE.  Exits with status 1 when the file cannot be opened.
+void processFile (char *filename){
+  FILE *file;
+  int i, j;
+
+  for (i = 0; i < 256; i++)
+    for (j = 0; j < 256; j++)
+      matchscore[i][j] = 0;
+
+  if (cons_rate >= 0)
+    calculateScoreMatrix();
+  else
+    readScoreMatrix (NUCLEOTIDE_MATRIX_FILE);
+
+  // report the failure explicitly: an assert disappears under NDEBUG
+  file = fopen (filename, "r");
+  if (!file){
+    fprintf (stderr, "Error: could not open %s\n", filename);
+    exit (1);
+  }
+  while (!feof (file)){
+    processSequences (file);
+  }
+  fclose (file);
+}
+
+// scorealign mfa_file cons_rate [-regions] [-ibounds | -ubounds [-cropxmfa]]
+// Parses the options into the global flags and scores the given file.
+// -cropxmfa is only accepted together with -ibounds or -ubounds.
+int main (int argc, char **argv){
+  int i;
+
+  if (argc < 3 || argc > 6){
+    fprintf (stderr, "Usage: scorealign mfa_file cons_rate [-regions] [-ibounds | -ubounds [-cropxmfa]]\n");
+    exit (1);
+  }
+
+  cons_rate = atoi (argv[2]);
+  for (i = 3; i < argc; i++){
+    if (strcmp (argv[i], "-cropxmfa") == 0)
+      docropxmfa = 1;
+    else if (strcmp (argv[i], "-ibounds") == 0)
+      doibounds = 1;
+    else if (strcmp (argv[i], "-ubounds") == 0)
+      doubounds = 1;
+    else if (strcmp (argv[i], "-regions") == 0)
+      doregions = 1;
+  }
+
+  // a user error, so report it instead of asserting (asserts vanish under NDEBUG)
+  if (docropxmfa && !(doibounds || doubounds)){
+    fprintf (stderr, "Error: -cropxmfa requires -ibounds or -ubounds.\n");
+    exit (1);
+  }
+
+  processFile (argv[1]);
+  return 0;
+}
diff --git a/src/utils/scorecontigs.c b/src/utils/scorecontigs.c
new file mode 100644
index 0000000..6eebb51
--- /dev/null
+++ b/src/utils/scorecontigs.c
@@ -0,0 +1,410 @@
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <string.h>
+ #include <ctype.h>
+ #include <math.h>
+ #include <assert.h>
+
+ // Capacity of the per-sequence arrays in `align` and of the contig tables in main().
+ #define MAX_SEQ 1024
+ #define MAX(a,b) ((a)>(b)?(a):(b))
+ #define MIN(a,b) ((a)<(b)?(a):(b))
+
+ // Indices into a per-letter count table.
+ // NOTE(review): none of the CNTS_* names is referenced in this file as shown.
+ #define CNTS_LEN 6
+ #define CNTS_A 0
+ #define CNTS_T 1
+ #define CNTS_C 2
+ #define CNTS_G 3
+ #define CNTS_N 4
+ #define CNTS_GAP 5
+
+ // NOTE(review): getstate() returns bare integers (0 mismatch, 1 match, 2 gap,
+ // 3 'N') that do not follow these STATE_* values - confirm before using them.
+ #define STATE_NULL 0
+ #define STATE_MATCH 1
+ #define STATE_MISMATCH 2
+ #define STATE_GAP 3
+ // Number of gap lengths whose penalty is precomputed in cache[] by analyze().
+ #define CACHE_SIZE 1000
+
+ // Per-column scores (mismatch / match / gap) for model state 0 and state 1,
+ // and the penalties for switching into state 0 / state 1. Assigned in main()
+ // and read by getranges().
+ int PEN_0_MIS, PEN_0_MTC, PEN_0_GAP;
+ int PEN_1_MIS, PEN_1_MTC, PEN_1_GAP;
+ int PEN_TO_0, PEN_TO_1;
+
+ // Characters readseq() stores unchanged; anything else is stored as 'N'.
+ char* alpha = "ATCGN-.";
+ // Scoring parameters used by scorealign() and scoregap().
+ double scoreMatch = 12;
+ double scoreMismatch = -4;
+ double scoreGapOpen = -80;
+ // cache[n] holds scoregap(n); filled by analyze().
+ double cache[CACHE_SIZE];
+
+ // A multiple alignment: numseqs rows, each algnlen columns long.
+ // data[i] is NOT NUL-terminated (readseq allocates exactly the column count).
+ typedef struct align_res {
+ char *names[MAX_SEQ];
+ int algnlen;
+ int numseqs;
+ char *data[MAX_SEQ];
+ } align;
+
+ // Per-position scores for one contig: score[0..seqlen-1].
+ typedef struct rangelist_res {
+ int seqlen;
+ int *score;
+ } rangelist;
+
+ // cntlets: count the symbols of the FIRST record of a FASTA stream.
+ //
+ //   input        stream positioned anywhere; it is rewound before and after
+ //   lettersonly  non-zero: count alphabetic characters only
+ //                zero:     count every non-whitespace character
+ //
+ // Returns the count, or 0 for an empty stream. Exits with a message when
+ // the first line does not start with '>'.
+ int cntlets(FILE* input, int lettersonly) {
+   int numread = 0;
+   char temp[1024];
+   int currchar;   // int, not char, so that EOF is distinguishable from data
+
+   rewind (input);
+   // fgets() fails on an empty stream; the buffer would otherwise be read
+   // uninitialised by the '>' test below.
+   if (!fgets(temp, 1024, input))
+     return 0;
+   if (temp[0] != '>') {
+     fprintf(stderr, "File is not in FASTA format!!\n");
+     exit(1);
+   }
+
+   // Count up to the next record header or the end of the stream.
+   while ((currchar = fgetc(input)) != EOF && currchar != '>') {
+     if (!isspace(currchar)) {
+       if (!lettersonly || isalpha (currchar)){
+         numread++;
+       }
+     }
+   }
+
+   rewind(input);
+   return numread;
+ }
+
+ // readseq: read the next FASTA record from `input` into slot res->numseqs
+ // of `res` (the caller increments numseqs afterwards).
+ //
+ // The record name is the header line without '>' and newline. Sequence
+ // characters are upper-cased; anything not in `alpha` is stored as 'N'.
+ // The record must contain exactly res->algnlen symbols, otherwise the
+ // program exits with a message.
+ //
+ // Returns 1 when a record was stored, 0 at end of input.
+ int readseq (FILE *input, align *res){
+   int numread = 0, currchar;
+   char temp[1024], *write, *eol;
+
+   if (feof (input)) return 0;
+   if (!fgets (temp, 1024, input)) return 0;
+   if (temp[0] != '>'){
+     fprintf (stderr, "scorecontigs: File is not in FASTA format!!\n");
+     exit (1);
+   }
+   if (res->numseqs >= MAX_SEQ){
+     fprintf (stderr, "scorecontigs: too many sequences (maximum is %d)\n", MAX_SEQ);
+     exit (1);
+   }
+
+   // strlen(temp) bytes hold temp+1 plus its terminator.
+   res->names[res->numseqs] = (char*) malloc((strlen(temp))*sizeof(char));
+   assert (res->names[res->numseqs]);
+   strcpy(res->names[res->numseqs], temp+1);
+   // The newline is absent when the header is longer than the buffer or the
+   // file ends without one.
+   eol = strchr(res->names[res->numseqs], '\n');
+   if (eol) *eol = 0;
+
+   // One spare byte: the loop below deliberately reads one symbol past
+   // algnlen so that an over-long sequence is detected by the length check.
+   write = res->data[res->numseqs] = (char *) malloc (sizeof (char) * (res->algnlen + 1));
+   assert (write);
+
+   currchar = fgetc (input);
+   while (numread <= res->algnlen && currchar != '>' && currchar != EOF){
+     if (!isspace (currchar)){
+       currchar = toupper (currchar);
+       if (!strchr(alpha, currchar)) currchar = 'N';
+       write[numread++] = (char) currchar;
+     }
+     currchar = fgetc (input);
+   }
+
+   // Leave the next record's header for the following call.
+   if (currchar == '>'){
+     ungetc (currchar, input);
+   }
+
+   if (numread != res->algnlen) {
+     fprintf (stderr, "Sequence (%s) of different lengths (%d v. %d)!!\n",
+              res->names[res->numseqs], numread, res->algnlen);
+     exit(1);
+   }
+   return 1;
+ }
+
+ // readMultial: load a pairwise alignment from a multi-FASTA file.
+ // The alignment length is taken from the first record; every record is then
+ // read with readseq(). Exactly two sequences are expected (asserted).
+ // Exits with a message when the file cannot be opened.
+ align *readMultial (char *filename){
+   align *aln;
+   FILE *in = fopen (filename, "r");
+
+   if (in == NULL){
+     fprintf (stderr, "scorecontigs: couldn't open alignment file: %s\n", filename);
+     exit (1);
+   }
+
+   aln = (align *) malloc (sizeof (align));
+   assert (aln);
+   aln->numseqs = 0;
+   aln->algnlen = cntlets (in, 0);
+
+   for (;;){
+     if (!readseq (in, aln)) break;
+     aln->numseqs++;
+   }
+
+   assert (aln->numseqs == 2);
+   fclose (in);
+   return aln;
+ }
+
+ // getstate: classify one alignment column.
+ // Returns 2 when either symbol is a gap ('-'), 3 when either is 'N',
+ // otherwise 1 for a match and 0 for a mismatch.
+ // `static`: a plain C99 `inline` definition provides no external symbol, so
+ // a call the compiler chooses not to inline fails at link time.
+ static inline int getstate (char c, char d){
+   if (c == '-' || d == '-') return 2;
+   if (c == 'N' || d == 'N') return 3;
+   return c == d;
+ }
+
+ // freealign: release an alignment returned by readMultial().
+ static void freealign (align *al){
+   int s;
+
+   for (s = 0; s < al->numseqs; s++){
+     free (al->names[s]);
+     free (al->data[s]);
+   }
+   free (al);
+ }
+
+ // getranges: find the conserved regions of the pairwise alignment stored in
+ // `filename` and return per-position scores for its first sequence.
+ //
+ // A two-state dynamic program labels each column as state 0 or state 1,
+ // using the PEN_0_*/PEN_1_* column scores and the PEN_TO_* switch penalties;
+ // the first six columns are forced to score 0. Each maximal run of state-1
+ // columns gets its summed PEN_1_* score divided by the number of letters of
+ // sequence 0 in the run, and that average is stored for each of those
+ // letters. Positions outside such runs score 0.
+ //
+ //   offs  unused (it only appears in a commented-out debug line)
+ //
+ // Returns a malloc'ed rangelist (caller frees r->score and r), or NULL when
+ // the alignment is empty or no column ended up in state 1.
+ rangelist *getranges (char *filename, int offs){
+   FILE *file;
+   align *myal = readMultial (filename);
+   rangelist *r = (rangelist *) malloc (sizeof (rangelist));
+   int *scores[2], i, j, k, l, m, state, from0, from1, herescore;
+   int *states, len, used, tot;
+   char *traceback[2];
+
+   (void) offs;
+   assert (r);
+
+   file = fopen (filename, "r"); assert (file);
+   r->seqlen = cntlets (file, 1);
+   len = cntlets (file, 0);
+   fclose (file);
+
+   // Nothing to label; also avoids indexing states[len - 1] with len == 0.
+   if (len <= 0){
+     freealign (myal);
+     free (r);
+     return NULL;
+   }
+
+   for (i = 0; i < 2; i++){
+     scores[i] = (int *) malloc (sizeof (int) * len); assert (scores[i]);
+     traceback[i] = (char *) malloc (sizeof (char) * len); assert (traceback[i]);
+   }
+
+   for (i = 0; i < len; i++){
+     assert (i >= 0 && i < myal->algnlen);
+     state = getstate (myal->data[0][i], myal->data[1][i]);
+
+     if (i <= 5){
+       scores[0][i] = scores[1][i] = 0;
+       traceback[0][i] = traceback[1][i] = 0;
+     }
+     else {
+
+       // go to state 0
+       herescore = (state == 0 ? PEN_0_MIS : (state == 1 ? PEN_0_MTC : (state == 2 ? PEN_0_GAP : 0)));
+       from0 = scores[0][i-1] + herescore;
+       from1 = scores[1][i-1] + herescore + PEN_TO_0;
+       if (from0 > from1){ scores[0][i] = from0; traceback[0][i] = 0; }
+       else { scores[0][i] = from1; traceback[0][i] = 1; }
+
+       // go to state 1
+       herescore = (state == 0 ? PEN_1_MIS : (state == 1 ? PEN_1_MTC : (state == 2 ? PEN_1_GAP : 0)));
+       from0 = scores[0][i-1] + herescore + PEN_TO_1;
+       from1 = scores[1][i-1] + herescore;
+       if (from0 > from1){ scores[1][i] = from0; traceback[1][i] = 0; }
+       else { scores[1][i] = from1; traceback[1][i] = 1; }
+     }
+   }
+
+   // Trace the best state path back from the last column.
+   states = (int *) malloc (sizeof (int) * len); assert (states);
+   states[len - 1] = (scores[0][len - 1] > scores[1][len - 1]) ? 0 : 1;
+   for (i = len - 2; i >= 0; i--) states[i] = traceback[states[i+1]][i+1];
+
+   // Zero-filled, and never zero-sized, so every position has a defined score.
+   r->score = (int *) calloc (r->seqlen > 0 ? r->seqlen : 1, sizeof (int)); assert (r->score);
+
+   // k indexes letters of sequence 0. Writes are bounded by seqlen because
+   // readseq() maps unknown symbols to 'N', so the stored row can contain
+   // more letters than cntlets() counted in the raw file.
+   k = tot = used = 0;
+   for (i = 0; i < len; i++){
+
+     if (!states[i]){
+       if (isalpha ((unsigned char) myal->data[0][i])){
+         if (k < r->seqlen) r->score[k] = 0;
+         k++;
+       }
+       continue;
+     }
+
+     used = 1;
+     herescore = l = 0;
+
+     for (j = i; j < len && states[j]; j++){
+       if (isalpha ((unsigned char) myal->data[0][j])) l++;
+       state = getstate (myal->data[0][j], myal->data[1][j]);
+       herescore += (state == 0 ? PEN_1_MIS : (state == 1 ? PEN_1_MTC : (state == 2 ? PEN_1_GAP : 0)));
+     }
+     tot += herescore;
+
+     // A run made only of gaps in sequence 0 has no letters to score;
+     // dividing by l == 0 would crash.
+     if (l > 0){
+       herescore /= l;
+
+       // fprintf (stderr, "%s: (%d %d) %d %d\n", filename, k + offs, k + l + offs, herescore, r->seqlen);
+       for (m = k; m < k + l && m < r->seqlen; m++) r->score[m] = herescore;
+
+       k += l;
+     }
+     i = j - 1;
+   }
+
+   // printf ("%d\n", tot);
+
+   free (states);
+
+   for (i = 0; i < 2; i++){
+     free (scores[i]);
+     free (traceback[i]);
+   }
+   freealign (myal);
+
+   if (!used){
+     free (r->score);
+     free (r);
+     return NULL;
+   }
+
+   return r;
+ }
+
+ // getdata: score of contig j at position i of the base sequence.
+ // offs[j] is subtracted from i; positions outside the contig give 0.
+ // `static` so that the C99 inline definition can always be linked.
+ static inline int getdata (rangelist **ranges, int *offs, int j, int i){
+   i -= offs[j];
+   if (i >= 0 && i < ranges[j]->seqlen)
+     return ranges[j]->score[i];
+   return 0;
+ }
+
+
+ // match: 1 when positions i and j have the same zero / non-zero pattern
+ // across all contigs, 0 otherwise.
+ // `static` so that the C99 inline definition can always be linked.
+ static inline int match (rangelist **ranges, int numContigs, int i, int j, int *offs){
+   int k;
+   for (k = 0; k < numContigs; k++)
+     if ((getdata (ranges, offs, k, i) != 0) != (getdata (ranges, offs, k, j) != 0)) return 0;
+   return 1;
+ }
+
+ // allzeroes: 1 when every contig scores 0 at position pos, 0 otherwise.
+ // `static` so that the C99 inline definition can always be linked.
+ static inline int allzeroes (rangelist **ranges, int numContigs, int pos, int *offs){
+   int i;
+
+   for (i = 0; i < numContigs; i++)
+     if (getdata (ranges, offs, i, pos) != 0) return 0;
+   return 1;
+ }
+
+ // print: write one output row, "(start end)" followed by one score per contig.
+ // `static` so that the C99 inline definition can always be linked.
+ static inline void print (int start, int end, int *score, int numContigs){
+   int j;
+
+   printf ("(%7d %7d)", start, end);
+   for (j = 0; j < numContigs; j++) printf (" %7d", score[j]);
+   printf ("\n");
+ }
+
+ // printRanges: print the base sequence as consecutive intervals in which
+ // the set of contigs with a non-zero score stays the same.
+ //
+ // Output: a "numContigs = .." line, a "seqLen = .." line, then one row per
+ // interval (see print()) holding the summed score of each contig over that
+ // interval. Leading and intermediate all-zero stretches get rows of zeros.
+ //
+ //   ranges      per-contig score tables
+ //   numContigs  number of entries in ranges / offs
+ //   seqLen      length of the base sequence
+ //   offs        start offset of each contig on the base sequence
+ void printRanges (rangelist **ranges, int numContigs, int seqLen, int *offs){
+   int i, j, start = 0, end;
+   int *score = (int *) malloc (sizeof (int) * numContigs);
+
+   assert (score);
+
+   printf ("numContigs = %d\n", numContigs);
+   printf ("seqLen = %d\n", seqLen);
+
+   // Leading stretch in which no contig has a score.
+   for (i = 0; i < numContigs; i++) score[i] = 0;
+   for (i = 0; i <= seqLen; i++)
+     if (!allzeroes (ranges, numContigs, i, offs)) break;
+   if (i > 0) print (0, i - 1, score, numContigs);
+
+   start = end = i;
+   while (i <= seqLen){
+     if (i != seqLen && match (ranges, numContigs, start, i, offs)){
+       // Same pattern as the interval start: extend and accumulate.
+       end = i;
+       for (j = 0; j < numContigs; j++){
+         score[j] += getdata (ranges, offs, j, i);
+       }
+     }
+     else if (i == seqLen || !allzeroes (ranges, numContigs, i, offs)){
+       // Pattern changed to another non-empty one (or the sequence ended):
+       // flush the interval, then any all-zero stretch skipped since `end`.
+       print (start, end, score, numContigs);
+       for (j = 0; j < numContigs; j++) score[j] = 0;
+       if (end < i - 1) print (end + 1, i - 1, score, numContigs);
+       start = end = i;
+     }
+     i++;
+   }
+
+   free (score);
+ }
+
+ // scoregap: penalty for a gap of gaplen columns.
+ // 0 for an empty gap, otherwise (log10(gaplen) + 1) * scoreGapOpen.
+ // `static` so that the C99 inline definition can always be linked.
+ static inline double scoregap (int gaplen){
+   if (gaplen == 0) return 0;
+   //return (gaplen - 1) * -1 - 50;
+   return (log (gaplen) / log (10) + 1) * scoreGapOpen;
+ }
+
+ // scorealign: best local score of rows a and b of an existing alignment.
+ //
+ // Columns are walked left to right. The running score gains scoreMatch or
+ // scoreMismatch per aligned pair and the gap penalty for the gap run that
+ // preceded the pair; it is reset to 0 whenever it turns negative. The
+ // highest running score seen is returned.
+ //
+ // cache[] must already hold scoregap(n) for n < CACHE_SIZE (analyze() fills it).
+ // A gap run that reaches the end of the alignment is never charged.
+ double scorealign (align *myal, int a, int b){
+ int i, gaplen = 0;
+ double score = 0;
+ double best = 0;
+ char c, d;
+
+
+ // compensate for lagan bug
+ // (the first 10 columns are not scored)
+ for (i = 10; i < myal->algnlen; i++){
+ c = myal->data[a][i]; d = myal->data[b][i];
+ // columns that are gaps in both rows do not count at all
+ if (c == '-' && d == '-') continue;
+ if (c == '-' || d == '-') gaplen++;
+ else {
+ // NOTE(review): gaplen grows by at most one per column and i starts at
+ // 10, so gaplen can never equal i here; this test looks always true.
+ // Confirm what the "leading gap" exemption was meant to compare.
+ if (gaplen != i){
+ if (gaplen < CACHE_SIZE)
+ score += cache[gaplen];
+ else
+ score += scoregap (gaplen);
+ }
+ gaplen = 0;
+ if (c == d) score += scoreMatch;
+ else score += scoreMismatch;
+ if (score > best) best = score;
+ if (score < 0) score = 0;
+ }
+ }
+
+ return best;
+ }
+
+ // analyze: print the sum of scorealign() over every pair of sequences in
+ // the alignment, truncated to an integer. Fills the gap-penalty cache first.
+ void analyze (align *myal){
+   double total = 0;
+   int first, second, gap;
+
+   gap = 0;
+   while (gap < CACHE_SIZE){
+     cache[gap] = scoregap (gap);
+     gap++;
+   }
+
+   first = 0;
+   while (first < myal->numseqs){
+     second = first + 1;
+     while (second < myal->numseqs){
+       total += scorealign (myal, first, second);
+       second++;
+     }
+     first++;
+   }
+
+   printf ("%d\n", (int) total);
+ }
+
+ // Entry point of scorecontigs.
+ //   argv[1]  contig list: lines of "num offs1 offs2 alignment_file"
+ //   argv[2]  FASTA file of the base sequence (only its length is used)
+ //   argv[3]  output: the subset of the contig list that produced ranges
+ //   argv[4]  cons_rate, from which the mismatch penalty is derived
+ // The per-interval scores are printed on stdout by printRanges().
+ int main(int argc, char** argv) {
+   FILE *filelist, *cfile;
+   // static: about 1 MiB of tables does not belong on the stack
+   static char contignames[MAX_SEQ][1024];
+   static rangelist *ranges[MAX_SEQ];
+   static int offs1[MAX_SEQ], offs2[MAX_SEQ], off[MAX_SEQ], num[MAX_SEQ];
+   int numseqs, i, j, cons_rate;
+
+   if (argc != 5) {
+     fprintf(stderr, "Usage:\n\nscorecontigs file_list fasta_file contig_list cons_rate\n");
+     exit (1);
+   }
+
+   cons_rate = atoi (argv[4]);
+   if (cons_rate == 101){
+     // 101 - cons_rate is the divisor below
+     fprintf (stderr, "scorecontigs: cons_rate must not be 101.\n");
+     exit (1);
+   }
+
+   PEN_1_MIS = -(25 * cons_rate) / (101 - cons_rate);
+   PEN_1_MTC = 25;
+   PEN_1_GAP = PEN_1_MIS / 2;
+   PEN_0_MIS = 0;
+   PEN_0_MTC = 0;
+   PEN_0_GAP = 0;
+   PEN_TO_0 = -250; //-300;
+   PEN_TO_1 = -350; //-400;
+
+   if (!(filelist = fopen (argv[1], "r"))) {
+     fprintf(stderr, "scorecontigs: Couldn't open alignment file: %s\n", argv[1]);
+     exit (1);
+   }
+
+   // Stop at the first entry that does not parse: fscanf() leaves the
+   // offending text unread, so looping until feof() would never terminate.
+   // The name field is width-limited to its 1024-byte buffer.
+   numseqs = 0;
+   while (numseqs < MAX_SEQ &&
+          fscanf (filelist, "%d %d %d %1023s", &(num[numseqs]), &(offs1[numseqs]),
+                  &(offs2[numseqs]), contignames[numseqs]) == 4){
+     numseqs++;
+   }
+   if (numseqs == MAX_SEQ && fgetc (filelist) != EOF){
+     fprintf (stderr, "scorecontigs: Warning: only the first %d contigs are used.\n", MAX_SEQ);
+   }
+   fclose (filelist);
+
+   if (numseqs == 0){
+     fprintf (stderr, "scorecontigs: No contigs found.\n");
+     exit (1);
+   }
+
+   if (!(cfile = fopen (argv[3], "w"))){
+     fprintf (stderr, "scorecontigs: Couldn't open contig list for writing: %s\n", argv[3]);
+     exit (1);
+   }
+   // Keep only the contigs for which getranges() found a conserved region.
+   j = 0;
+   for (i = 0; i < numseqs; i++){
+     ranges[j] = getranges (contignames[i], offs1[i]);
+     if (ranges[j]){
+       fprintf (cfile, "%d %d %d %s\n", num[i], offs1[i], offs2[i], contignames[i]);
+       off[j] = offs1[i];
+       j++;
+     }
+   }
+   fclose (cfile);
+
+   if (!(filelist = fopen (argv[2], "r"))){
+     fprintf (stderr, "scorecontigs: Couldn't open fasta file: %s\n", argv[2]);
+     exit (1);
+   }
+   printRanges (ranges, j, cntlets (filelist, 1), off);
+   fclose (filelist);
+
+   for (i = 0; i < j; i++){
+     free (ranges[i]->score);
+     free (ranges[i]);
+   }
+   return 0;
+ }
diff --git a/src/utils/seqmerge.c b/src/utils/seqmerge.c
new file mode 100644
index 0000000..4eb5ae9
--- /dev/null
+++ b/src/utils/seqmerge.c
@@ -0,0 +1,46 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <assert.h>
+
+ // seqmerge: concatenate the first record of each FASTA file given on the
+ // command line into one record on stdout.
+ //
+ // The header of the first file is printed unchanged. From each file the
+ // letters, '.' and '-' up to the next '>' (or end of file) are appended,
+ // wrapped at 60 symbols per line.
+ int main (int argc, char** argv){
+   FILE *file;
+   int i, ch, written = 0;   // ch is int so that EOF is not mistaken for data
+   char buffer[1024];
+
+   if (argc == 1){
+     fprintf (stderr, "Usage:\n\nseqmerge fasta_file1 fasta_file2 ...\n");
+     exit (1);
+   }
+
+   for (i = 1; i < argc; i++){
+     file = fopen (argv[i], "r");
+     if (!file){
+       fprintf (stderr, "seqmerge: couldn't open file: %s\n", argv[i]);
+       exit (1);
+     }
+     // Skip the header line; only the first file's header is echoed.
+     if (fgets (buffer, 1024, file) && i == 1) printf ("%s", buffer);
+
+     while ((ch = fgetc (file)) != EOF){
+       if (ch == '>') break;
+       if (isalpha (ch) || ch == '.' || ch == '-'){
+         printf ("%c", ch);
+         written++;
+         if (written % 60 == 0) printf ("\n");
+       }
+     }
+     fclose (file);
+   }
+   // Terminate a partially filled last line. The original `written ^ 60 != 0`
+   // parsed as `written ^ (60 != 0)` and so added a blank line after every
+   // complete final line.
+   if (written % 60 != 0) printf ("\n");
+   return 0;
+ }
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/xmfa2mfa.pl b/src/xmfa2mfa.pl
new file mode 100755
index 0000000..5dd1391
--- /dev/null
+++ b/src/xmfa2mfa.pl
@@ -0,0 +1,65 @@
+ #!/usr/bin/perl
+
+ # xmfa2mfa.pl: filter an XMFA stream read from STDIN and pass it through Glue.
+ #
+ # Usage: xmfa2mfa.pl <1|2> < input.xmfa
+ #
+ # Records are delimited by a line that starts with "=" and ends in DM, M1 or
+ # M2. DM records and the records of the selected base genome (M1 for "1",
+ # M2 for "2") are kept. In M2 mode the two sequences of each record are
+ # swapped, and reversed when they lie on the "-" strand. The result is fed
+ # to $LAGAN_DIR/utils/Glue, whose stdout and stderr are forwarded.
+
+ use strict;
+ use warnings;
+ use File::Temp qw(tempdir);
+
+ $0 = rindex($0, "/") > -1 ? substr($0, rindex($0, "/")+1) : $0;
+
+ # Quote a string for use as one word in a /bin/sh command line.
+ sub shell_quote {
+   my ($word) = @_;
+   $word =~ s/'/'\\''/g;
+   return "'".$word."'";
+ }
+
+ my $base = defined $ARGV[0] ? $ARGV[0] : "";
+ my $mode = $base eq "1" ? "M1"
+          : $base eq "2" ? "M2"
+          : die("$0: Invalid base genome argument (expected 1 or 2)");
+
+ die("$0: LAGAN_DIR not defined. Stopped") unless defined $ENV{"LAGAN_DIR"};
+
+ # Split STDIN into records and keep the wanted ones.
+ my (@lines, @filt_lines);
+ my $record = "";
+ while (my $line_in = <STDIN>) {
+   $record .= $line_in;
+   if ($line_in =~ /^\=.*(DM|M1|M2)$/) {
+     my $type = $1;
+     push @lines, $record if $type eq "DM" or $type eq $mode;
+     $record = "";
+   }
+ }
+
+ foreach my $line (@lines) {
+   if ($mode eq "M2") {
+     # A failed match used to go unnoticed and reuse the captures of the
+     # previous record; treat it as the input error it is.
+     my ($head1, $strand1, $seq1, $head2, $strand2, $seq2, $foot) =
+       ($line =~ /(\>[^\s\n]+\s([\+\-])[^\n]+)\n(.+)\n(\>[^\s\n]+\s([\+\-])[^\n]+)\n(.+)\n(\=.+?)\n/s)
+       or die("$0: Malformed XMFA record:\n$line");
+
+     die("$0: Strands differ within one record ($strand1 vs $strand2):\n$line")
+       if $strand1 ne $strand2;
+     if ($strand1 eq "-") {
+       $seq1 =~ s/\n//g;
+       $seq2 =~ s/\n//g;
+       $seq1 = reverse($seq1);
+       $seq2 = reverse($seq2);
+       $seq1 =~ s/(.{80})/$1\n/g;
+       $seq2 =~ s/(.{80})/$1\n/g;
+     }
+     $line = $head2."\n".$seq2."\n".$head1."\n".$seq1."\n".$foot."\n";
+   }
+   push @filt_lines, $line;
+ }
+
+ # Work in a private scratch directory (removed at exit) so that concurrent
+ # runs do not overwrite each other's tmp.xmfa / glue.out / glue.err.
+ my $scratch = tempdir(CLEANUP => 1);
+ my ($xmfa_file, $out_file, $err_file) = map { $scratch."/".$_ } qw(tmp.xmfa glue.out glue.err);
+
+ open(my $xmfa_fh, ">", $xmfa_file) or die("$0: Cannot write $xmfa_file: $!");
+ print $xmfa_fh @filt_lines;
+ close($xmfa_fh) or die("$0: Cannot write $xmfa_file: $!");
+
+ my $status = system(shell_quote($ENV{"LAGAN_DIR"}."/utils/Glue")." ".shell_quote($xmfa_file).
+                     " > ".shell_quote($out_file)." 2> ".shell_quote($err_file));
+
+ # Forward whatever Glue produced: stdout first, then stderr.
+ my (@glue_out, @glue_err);
+ if (open(my $out_fh, "<", $out_file)) { @glue_out = <$out_fh>; close $out_fh; }
+ if (open(my $err_fh, "<", $err_file)) { @glue_err = <$err_fh>; close $err_fh; }
+
+ print STDOUT @glue_out;
+ print STDERR @glue_err;
+ print STDERR "$0: Warning: Glue exited with status ".($status >> 8)."\n" if $status != 0;
diff --git a/supermap.pl b/supermap.pl
new file mode 100755
index 0000000..c27b6a1
--- /dev/null
+++ b/supermap.pl
@@ -0,0 +1,1622 @@
+#!/usr/bin/perl
+
+# Supermap: Piecewise monotonic alignment map generator for Shuffle-LAGAN
+# Author: Andrey Kislyuk (kislyuk at ocf.berkeley.edu)
+
+package Supermap;
+require 5.005;
+my ($VERSION) = ('$Id: supermap.pl,v 1.50 2005/06/15 22:40:04 kislyuk Exp $' =~ /,v\s+(\d+\S+)/o);
+
+# Default constant values
+my $overlap_factor = 0.8; # Aligns will be discarded if another align overlaps them by this factor or more in both seqs and has the same orientation
+my $max_asym = 10; # Chains will be formed only if the resulting region's lengths differ by at most this factor
+my $min_seq_score; # All aligns for sequences with this total score will be discarded. See getMinSeqScore
+my $max_expand_len = 30000; # Aligns will be expanded or contracted on both sides on both strands by this amount up to the total length below
+my $expand_factor = 4; # When one of an align's sequences is constrained in its expansion by a neighbor/start/end, the other one will be expanded by this times more than the first one
+my $max_chainlen = 1500000; # Aligns will not be joined if the total length on either strand exceeds this. Set 0 to disable (no chain length limit)
+my $max_job_size = 50000; # Maximum job size, in blat hits, for chunking when running glocal in parallel
+my $erode_align = 15; # Amount by which to erode the coords of each align loaded (to avoid overlap problems when chaining)
+my ($c1, $c2, $c3, $c4) = (100, 50, 400, 25); # BLAT->CHAOS score conversion parameters
+#my $max_dist_y = 10000; # Join x-monotonic into same single-chain only if at most that apart in y-species.
+my $default_lagan_dir = "/home/genome/glocal";
+my $glocal_name = (0 ? "SLAGAN" : "glocal");
+
+use Getopt::Long;
+use File::Path;
+use File::Copy;
+use Cwd;
+use IPC::Open2;
+use IO::Handle;
+#use Carp;
+use strict;
+use warnings;
+no warnings "uninitialized";
+
+sub main();
+sub init();
+sub getSeqSizes($$$);
+sub prepareHits();
+sub runSLAGAN();
+sub reprintInputHits($$$);
+sub processResults();
+sub removeSLAGANOutput();
+sub seqBelowMinScore($);
+sub alignHashID($);
+sub printChainToTemp($$$$);
+sub chainBase1Hits($$);
+sub chainBase2Hits($);
+sub load2MHashes($);
+sub loadBase2Hashes($);
+sub postProcessRegions();
+sub workerRun($$$$);
+sub dequeueClustJobs($);
+sub get_all_seqs($$);
+sub isBLAT($);
+sub useIf($$);
+sub writeSizes($$);
+sub getMinSeqScore($);
+sub checkAlignCoords($);
+sub expandSeq1($$);
+sub expandSeq2($$);
+sub finalExpand($$);
+sub expSeq1Reg($$$$$);
+sub expSeq2Reg($$$$$);
+sub finalExpReg($$$$$);
+
+# array index constants
+use constant START1 => 0; use constant END1 => 1;
+use constant START2 => 2; use constant END2 => 3;
+use constant SEQ1 => 4; use constant SEQ2 => 5;
+use constant ORIENT => 6; use constant ORIGIN => 7;
+use constant SCORE => 8; use constant TOTSC => 9;
+use constant HASHID => 10; use constant FLIPPED=> 11;
+use constant CHALO1 => 12; use constant CHAHI1 => 13;
+use constant CHALO2 => 14; use constant CHAHI2 => 15;
+use constant CHALO1E=> 16; use constant CHAHI1E=> 17;
+use constant CHALO2E=> 18; use constant CHAHI2E=> 19;
+#use constant PREV1 => 8; use constant NEXT1 => 9;
+#use constant PREV2 => 10; use constant NEXT2 => 11;
+#use constant OSTART1=> 12; use constant OEND1 => 13;
+#use constant OSTART2=> 14; use constant OEND2 => 15;
+
+ # Run dequeueClustJobs when the script is interrupted or terminated.
+ # NOTE(review): SIGSTOP cannot be caught, so the 'STOP' entry presumably has no effect.
+ $SIG{'INT'} = $SIG{'QUIT'} = $SIG{'HUP'} = $SIG{'TRAP'} = $SIG{'ABRT'} = $SIG{'STOP'} = $SIG{'TERM'} = \&dequeueClustJobs;
+
+ # Command-line options and shared state; most are filled in by init().
+ my ($debug, $quiet, $outfile, $proflip, $skip, $no_pid, $input_glob, $input_dir,
+ $server, $db, $gen1, $gen2, $gen1sizefile, $gen2sizefile, $write_sizes1, $write_sizes2,
+ $score_file, $cfg, $cfg_file, $sizes1, $sizes2, $dbh, $tmp_dir, $tmp_prefix, $nodelete,
+ $clust_run_pid, $print_chains, $no_aligntotals, $no_clust_run, $num_jobs, $input_is_blat,
+ $force_overwrite, $print_csv, $using_GP, $slagan_params, $tmp_existed, $print_stats, $lagan_dir, $glocal_out_logfile);
+ my (@input_files);
+ my (%offsets1, %offsets2, %aligns1, %aligns2, %flipped_aligns);
+
+ # Turn the path this script was started with into an absolute one: a leading
+ # "." is replaced by the current directory, a leading ".." gets the current
+ # directory prepended. The check is skipped in worker mode.
+ my $supermapexec = $0; my $mycwd = getcwd(); $supermapexec =~ s/^\./$mycwd/ unless $supermapexec =~ /^\.\./; $supermapexec = $mycwd."/".$supermapexec if $supermapexec =~ /^\.\./;
+ die("$0: Problem resolving my name, \'$supermapexec\' is not a file") unless -f $supermapexec or $ARGV[0] eq "worker";
+ # From here on $0 is the bare script name, used as a prefix in messages and file names.
+ $0 = rindex($0, "/") > -1 ? substr($0, rindex($0, "/")+1) : $0;
+
+ # LAGAN_DIR comes from the environment, else the built-in default; a relative
+ # value is anchored at the current directory and exported again for children.
+ $lagan_dir = $ENV{"LAGAN_DIR"} if defined $ENV{"LAGAN_DIR"};
+ $lagan_dir = $ENV{"LAGAN_DIR"} = $default_lagan_dir unless defined $ENV{"LAGAN_DIR"};
+ $lagan_dir =~ s/^\.\./$mycwd\/\.\./;
+ $lagan_dir =~ s/^\./$mycwd\//;
+ $ENV{"LAGAN_DIR"} = $lagan_dir;
+ print STDERR "$0: Warning: LAGAN_DIR=$lagan_dir is not a valid directory\n" unless -d $lagan_dir;
+ # init() loads the Utils module at run time, so the LAGAN directory must be searchable.
+ push @INC, $lagan_dir;
+
+ # Paths derived from the LAGAN directory and the script name / process id.
+ my $SLAGAN = $lagan_dir."/".$glocal_name;
+ my $error_file = "./$0.$$.error.log";
+ my $default_score_file = $lagan_dir."/test.score";
+ my $default_outfile = "$0.out";
+ my $worker_tmp_dir = "/tmp/$0.$$.worker/"; # The directory where workers store their intermediate files (two workers should not use the same directory)
+
+my $usage = "
+-infile=file \t Name of input file containing all hits for the two genomes
+-outfile=file \t Output filename (default: $default_outfile)
+-gen1=id \t First genome ID (must exist in the GPDB)
+-gen2=id \t Second genome ID (must exist in the GPDB)
+-sizes1=file \t File with sequence sizes for first genome
+-sizes2=file \t File with sequence sizes for second genome
+-bacteria \t Rearrange circular DNA to find a better alignment map
+-server=hostname GPDB server (default: lemur)
+-db=dbname \t GPDB name (default: GP)
+-config=file \t GPDB config file (default: ~/.gprc)
+-score=file \t Score file for SLAGAN (default: $default_score_file)
+-glocal_out=file \t Save intermediate GLOCAL alignment hits to this file
+-no_clust_run \t Run CPU/memory intensive jobs locally, not on the GP cluster
+-tmp_dir=dir \t Working directory (default: /tmp/$0.pid)
+-f \t\t Overwrite output file without prompting if it exists
+-v \t\t Verbose mode
+-q \t\t Quiet mode
+-k \t\t Keep all temporary files
+-expand_length=N Maximum length by which to expand alignments (default: $max_expand_len)
+-max_length=N \t Maximum length for any alignment chain in either strand
+\t\t (default: $max_chainlen)
+-min_seq_score=N Sequences with total align score below this threshold will be
+\t\t discarded (default: U penalty in SLAGAN score file)
+-max_job_size=N Threshold, in hits, for splitting workload into separate jobs
+\t\t for clust_run (default: $max_job_size)
+-c1, c2, c3, c4=N: Score factors for BLAT->CHAOS conversion
+\t\t (default: $c1, $c2, $c3, $c4)
+
+Options may be abbreviated.
+Input file format is BLAT or CHAOS. Sequence names should not contain spaces.
+Alignments with negative scores are discarded.
+Sequence size file format, one sequence per line: seq_name seq_size
+";
+
+exit(main());
+
+# ___ Subroutines _______________
+
+ # Top-level driver. In worker mode ("worker" as first argument) it only runs
+ # one SLAGAN job and exits. Otherwise: read options, load the sequence sizes,
+ # prepare and align the hits, chain the results and write the final regions.
+ # Returns 0 on success.
+ sub main() {
+   # Running SLAGAN in distributed mode
+   if ($ARGV[0] eq "worker") {
+     workerRun($ARGV[1], $ARGV[2], $ARGV[3], $ARGV[4]);
+     exit(0);
+   }
+
+   init();
+
+   unless ($quiet) {
+     print("$0: Retrieving sequence info...\n");
+   }
+
+   # With -write_sizes1 / -write_sizes2 the sizes are only dumped to a file.
+   $sizes1 = getSeqSizes($dbh, $gen1, $gen1sizefile);
+   if (defined $write_sizes1) {
+     writeSizes($sizes1, $write_sizes1);
+     exit(0);
+   }
+   $sizes2 = getSeqSizes($dbh, $gen2, $gen2sizefile);
+   if (defined $write_sizes2) {
+     writeSizes($sizes2, $write_sizes2);
+     exit(0);
+   }
+
+   unless (keys(%$sizes1) >= 1 and keys(%$sizes2) >= 1) {
+     die("$0: No sequence size data found. Stopped");
+   }
+   if ($proflip and not (keys(%$sizes1) == 1 and keys(%$sizes2) == 1)) {
+     die("$0: Flip mode is only applicable for two single-sequence organisms. Stopped");
+   }
+
+   # Sort and separate the alignments, run SLAGAN on them
+   prepareHits();
+   runSLAGAN();
+
+   # Chain SLAGAN alignments into supermonotonic chain and save the intermediate results
+   my @chain_counts = processResults();
+
+   # Load the results back and expand regions, then print them
+   postProcessRegions();
+
+   unless ($quiet) {
+     print "$0: Output written to $outfile\n";
+     print "$0: Intermediate files kept in $tmp_dir\n" if $nodelete;
+   }
+   unless ($tmp_existed or $nodelete) {
+     rmdir $tmp_dir;
+   }
+
+   return 0;
+ }
+
+
+ # Startup tasks: parse and validate the command line, set up the working
+ # directory, locate the input file(s), check the output and score files and,
+ # when the Genome Pipeline modules are available, connect to the GPDB.
+ # All results are stored in the file-level variables; nothing is returned.
+ sub init() {
+   # Things may misbehave if locale is set to UTF-8. The value is set in our
+   # own environment so that child processes (sort, awk, ...) inherit it;
+   # running "export" through system() only affected a throw-away shell.
+   $ENV{"LC_ALL"} = "C";
+
+   # Berkeley Genome Pipeline functionality is used if corresponding Perl modules are found in @INC
+   foreach my $dir (@INC) {
+     $using_GP = 1 if -f $dir."/GPDBI.pm" and -f $dir."/GPutils.pm";
+   }
+
+   useIf $using_GP, "GPDBI";
+   useIf $using_GP, "GPutils";
+   useIf 1, "Utils";
+ # useIf 1, "Desoverlap";
+
+   # max_job_size and overlap_factor take a value (they are range-checked
+   # below); without a type spec they were plain flags that stored 1.
+   # NOTE(review): slagan_params is still a plain flag - confirm whether it
+   # was meant to take a string.
+   die("$0: GetOptions failed to retrieve options. Check the input options. Usage:".$usage) unless
+     GetOptions(
+       "server=s" => \$server,
+       "gen1=s" => \$gen1,
+       "gen2=s" => \$gen2,
+       "sizes1=s" => \$gen1sizefile,
+       "sizes2=s" => \$gen2sizefile,
+       "blatfile=s" => \$input_glob,
+       "infile=s" => \$input_glob,
+       "outfile=s" => \$outfile,
+       "glocal_out=s" => \$glocal_out_logfile,
+       "bacteria" => \$proflip,
+       "db=s" => \$db,
+       "config=s" => \$cfg_file,
+       "tmp_dir=s" => \$tmp_dir,
+       "skip" => \$skip,
+       "no_pid" => \$no_pid,
+       "no_clust_run" => \$no_clust_run,
+       "print_chains" => \$print_chains,
+       "print_stats" => \$print_stats,
+       "no_aligntotals"=> \$no_aligntotals,
+       "print_csv" => \$print_csv,
+       "max_job_size=i"=> \$max_job_size,
+       "max_length=i" => \$max_chainlen,
+       "expand_length=i"=>\$max_expand_len,
+       "min_seq_score=i"=>\$min_seq_score,
+       "max_asym=i" => \$max_asym,
+       "overlap_factor=f"=> \$overlap_factor,
+       "score=s" => \$score_file,
+       "c1=i" => \$c1,
+       "c2=i" => \$c2,
+       "c3=i" => \$c3,
+       "c4=i" => \$c4,
+       "slagan_params" => \$slagan_params,
+       "write_sizes1=s"=> \$write_sizes1,
+       "write_sizes2=s"=> \$write_sizes2,
+       "keep" => \$nodelete,
+       "f" => \$force_overwrite,
+       "v" => \$debug,
+       "q" => \$quiet
+     );
+
+   undef $quiet if $debug;
+   my @uinfo = getpwuid($>);
+   print("$0: Version ".$VERSION." started ".localtime()." by ".$uinfo[0]."\n") unless $quiet;
+   $tmp_prefix = $0.($no_pid ? "" : ".".$$);
+
+   # Cluster operation needs the clust_run tool on the PATH.
+   unless ($no_clust_run) {
+     $no_clust_run = `which clust_run 2> /dev/null`; $no_clust_run = not $no_clust_run;
+     print("$0: clust_run not found - cluster operation disabled\n") if $no_clust_run and not $quiet;
+   }
+
+   # Working directory; $tmp_dir always ends in "/" afterwards.
+   if ($tmp_dir) {
+     $tmp_existed = 1 if -d $tmp_dir;
+     mkdir $tmp_dir unless -d $tmp_dir;
+     $tmp_dir .= "/" unless $tmp_dir =~ /\/\Z/;
+   } else {
+     $tmp_dir = "/tmp/".$tmp_prefix;
+     mkdir $tmp_dir;
+     $tmp_dir .= "/";
+   }
+   die("$0: No write permissions in working directory $tmp_dir. Stopped") unless -w $tmp_dir;
+   die("$0: Genome IDs or size files not specified. Usage:".$usage) unless ($gen1 or $gen1sizefile) and ($gen2 or $gen2sizefile);
+   die("$0: '-gen' options are invalid because GPDB is not available. Use '-sizes'. Stopped") if (($gen1 or $gen2) and not $using_GP);
+   die("$0: Sequence size file $gen1sizefile not found. Stopped") unless -f $gen1sizefile or $gen1;
+   die("$0: Sequence size file $gen2sizefile not found. Stopped") unless -f $gen2sizefile or $gen2;
+   die("$0: Maximum job size too small, must exceed 10000 hits. Stopped") if $max_job_size < 10000;
+   die("$0: Overlap factor must be between 0 and 1. Stopped") if $overlap_factor < 0 or $overlap_factor > 1;
+   print("$0: SLAGAN score file not specified, using default $default_score_file\n") unless $score_file or $quiet;
+   print("$0: Output file not specified, using default $default_outfile\n") unless $outfile or $quiet;
+
+   # Check input file or glob
+   if (defined $input_glob) {
+     # -infile: the file part is turned into an anchored pattern.
+     if ($input_glob =~ /\//) { ($input_dir, $input_glob) = ($input_glob =~ /\A(.*\/)([^\/]+)\Z/); }
+     $input_glob .= "\$" unless $input_glob =~ /\$$/;
+     $input_glob = "^".$input_glob unless $input_glob =~ /^\^/;
+     @input_files = Utils::safe_glob($input_glob, $input_dir);
+   } elsif (@ARGV > 0) {
+     # Remaining arguments are input files.
+     foreach my $file (@ARGV) {
+       if ($file =~ /\//) { ($input_dir, $file) = ($file =~ /\A(.*\/)([^\/]+)\Z/); }
+       push @input_files, $file;
+     }
+   } else { # TODO: split stdin for >2GB input
+     # No file given: spool STDIN into the working directory.
+     open(FH, "> $tmp_dir$tmp_prefix.in") or die("$0: Cannot open $tmp_dir$tmp_prefix.in for writing: $!");
+     print FH while <STDIN>;
+     close FH;
+     push @input_files, "$tmp_prefix.in";
+     $input_dir = $tmp_dir;
+   }
+   unless ($input_dir =~ /\A\//) { $input_dir = $mycwd."/".$input_dir; }
+   die("$0: No input files matching \"$input_dir$input_glob\" found. Stopped") unless @input_files > 0;
+   print "$0: ".scalar(@input_files)." input file(s)\n" if $debug;
+
+   # Check output file
+   $outfile = $default_outfile unless $outfile;
+   if (-f $outfile and not $force_overwrite and -t STDERR) {
+     print STDERR "$0: $outfile exists. Overwrite? (y/N, '-f' to force) ";
+     my $overwrite = <STDIN>; chomp $overwrite;
+     (print("Move \"$outfile\" or use option '-f'.\n"), exit(1)) unless ($overwrite eq "Y" or $overwrite eq "y" or $overwrite eq "yes");
+   }
+   open(FH, "> ".$outfile) or die("$0: Cannot open $outfile for writing: $!");
+   close FH;
+
+   # Check SLAGAN score file
+   $score_file = $default_score_file unless $score_file;
+   unless ($score_file =~ /\A\//) { $score_file = $mycwd."/".$score_file; }
+   # The eroded margin is given back on expansion; the chain length limit
+   # leaves room for expanding on both sides.
+   $max_expand_len += $erode_align;
+   die("$0: max_length cannot be less than 0. Stopped") if $max_chainlen < 0;
+   $max_chainlen = 1000000000 if $max_chainlen == 0;
+   $max_chainlen -= 2*$max_expand_len;
+   # SLAGAN output for a given sequence will be discarded if the total score for the sequence is below this threshold. Default value is the SLAGAN unrelated gap penalty.
+   $min_seq_score = getMinSeqScore($score_file) unless defined $min_seq_score;
+
+   # Connect to GPDB
+   if ($using_GP) {
+     $GPutils::Error = "";
+     $cfg = read_gp_config(Get_Abs_Path($cfg_file)) or die($GPutils::Error);
+     $server ||= $cfg->Get_Val("DB", "server");
+     $db ||= $cfg->Get_Val("DB", "main_db");
+     $dbh = GPDBI->connect($server, 0, $db, undef, undef, "gp_cgi", undef, {PrintError => 0, RaiseError => 1});
+   }
+ }
+
+
+ # Load sequence names and sizes either from GPDB or from file
+ #
+ #   $dbh            database handle, passed on to get_all_seqs
+ #   $dataset        genome ID; when true, the sizes come from get_all_seqs
+ #   $gen_size_file  otherwise: file with one "seq_name seq_size" per line
+ #
+ # Returns a reference to a hash mapping sequence name to size when reading
+ # from file. Dies when the file cannot be opened or a line lacks a name or
+ # a non-zero size.
+ sub getSeqSizes($$$) {
+   my ($dbh, $dataset, $gen_size_file) = @_;
+   return get_all_seqs($dbh, $dataset) if $dataset;
+
+   my %sizes;
+   # Three-argument open with a lexical handle: the user-supplied name cannot
+   # be read as an open mode, and neither the global FH nor the caller's $_
+   # is disturbed.
+   open(my $size_fh, "<", $gen_size_file) or die("$0: Could not open file $gen_size_file for reading: ".$!);
+   while (my $entry = <$size_fh>) {
+     chomp $entry;
+     my ($seq, $size) = split(' ', $entry);
+     # "defined" rather than truth, so that a sequence named "0" is accepted.
+     die("$0: Invalid format in file $gen_size_file") unless defined $seq and $size;
+     $sizes{$seq} = $size;
+   }
+   close $size_fh;
+   return \%sizes;
+ }
+
+
+ # Convert BLAT to CHAOS if necessary
+ # Flip hits on circular sequence if necessary
+ #
+ # For every input file a "<file>.chaos" is created in the working directory:
+ # converted with awk when the first input file looks like BLAT output,
+ # otherwise a symbolic link to the input. In flip mode (-bacteria) the hits
+ # of the first input file are additionally rewritten to
+ # "<file>.flipped.chaos", and every flipped hit is recorded in
+ # %flipped_aligns under its alignHashID.
+ sub prepareHits() {
+   my ($cur_align);
+   local (*FH, *OUT1);
+
+   print "$0: Preparing files...\n" unless $quiet;
+   $input_is_blat = 1 if isBLAT($input_dir.$input_files[0]);
+
+   if ($input_is_blat) {
+     foreach my $file (@input_files) {
+       # File names are quoted so that blanks in a path do not split the redirection.
+       system('awk \'{$13=($13+$15)?$13:1; print $1,$2,$3";",$5,$6,$7"; '.
+              'score = "' . $c1 . '*$8-' . $c2 . '*$9-' . $c3 . '*($12+$14)-' . $c4 .
+              '*log($13+$15),"("$4")"}\''.
+              " < \"$input_dir$file\" > \"$tmp_dir$file.chaos\"");
+     }
+   } else {
+     foreach my $file (@input_files) {
+       system('ln -s "'.$input_dir.$file.'" "'.$tmp_dir.$file.'.chaos"');
+     }
+   }
+
+   if ($proflip) {
+     open(FH, "< ".$tmp_dir.$input_files[0].".chaos") or die("$0: Could not open file ".$tmp_dir.$input_files[0].".chaos for reading: ".$!);
+     open(OUT1, "> ".$tmp_dir.$input_files[0].".flipped.chaos") or die("$0: Could not open file ".$tmp_dir.$input_files[0].".flipped.chaos for writing: ".$!);
+
+     my (@seq1s, @seq1e, @seq2s, @seq2e, @scores, @orientations, @seqn1, @seqn2);
+     my ($seq1center, $seq2center, $seq1median, $seq2median);
+     my $i = 0;
+     while (<FH>) {
+       # Lines that are not CHAOS hits are skipped; without the check they
+       # silently repeated the values captured from the previous hit.
+       next unless /\A[\s]*.*\s([\d]+)\s([\d]+)\;\s.*\s([\d]+)\s([\d]+)\;\sscore\s\=\s([e\d\.\+\-]+)\s\(([\+\-]+)\)/;
+ #     ($seqn1[$i], $seq1s[$i], $seq1e[$i], $seqn2[$i], $seq2s[$i], $seq2e[$i], $scores[$i], $orientations[$i]) = ($1, $2, $3, $4, $5, $6, $7, $8);
+       ($seq1s[$i], $seq1e[$i], $seq2s[$i], $seq2e[$i], $scores[$i], $orientations[$i]) = ($1, $2, $3, $4, $5, $6);
+       # Keep start <= end in both sequences.
+       if ($seq1s[$i] > $seq1e[$i]) { my $j = $seq1s[$i]; $seq1s[$i] = $seq1e[$i]; $seq1e[$i] = $j; }
+       if ($seq2s[$i] > $seq2e[$i]) { my $j = $seq2s[$i]; $seq2s[$i] = $seq2e[$i]; $seq2e[$i] = $j; }
+       $i++;
+     }
+
+     # A hit is flipped when its midpoint lies in opposite halves of the two
+     # sequences (above the center in one, below it in the other). Flipping
+     # mirrors the seq2 interval around the seq2 center
+     #   (new start = 2*center - old end, new end = 2*center - old start)
+     # and inverts the orientation (+/-). The seq1 interval is left as is.
+     $seq1center = $$sizes1{(keys(%$sizes1))[0]} / 2;
+     $seq2center = $$sizes2{(keys(%$sizes2))[0]} / 2;
+     my $flip_counter = 0;
+     foreach $i (0..$#seq1s) {
+       $seq1median = ($seq1s[$i] + $seq1e[$i]) / 2;
+       $seq2median = ($seq2s[$i] + $seq2e[$i]) / 2;
+       if (($seq1median > $seq1center and $seq2median < $seq2center)
+           or ($seq1median < $seq1center and $seq2median > $seq2center)) {
+         my $j = $seq2s[$i];
+         $seq2s[$i] = (2 * $seq2center) - $seq2e[$i];
+         $seq2e[$i] = (2 * $seq2center) - $j;
+         if ($orientations[$i] eq "+") { $orientations[$i] = "-"; } else { $orientations[$i] = "+"; }
+         # Remember the flipped hit, with the same erosion runSLAGAN applies,
+         # so that it can be looked up by its hash ID.
+         $cur_align = [];
+         $$cur_align[START1] = $seq1s[$i]; $$cur_align[START2] = $seq2s[$i];
+         $$cur_align[END1] = $seq1e[$i]; $$cur_align[END2] = $seq2e[$i];
+         $$cur_align[SCORE] = $scores[$i]; $$cur_align[ORIENT] = $orientations[$i];
+         $$cur_align[SEQ1] = (keys(%$sizes1))[0]; $$cur_align[SEQ2] = (keys(%$sizes2))[0];
+         $$cur_align[START1] += $erode_align; $$cur_align[END1] -= $erode_align;
+         $$cur_align[START2] += $erode_align; $$cur_align[END2] -= $erode_align;
+         $flipped_aligns{alignHashID($cur_align)} = $cur_align;
+         $flip_counter++;
+       }
+       print OUT1 "seq1 ".$seq1s[$i]." ".$seq1e[$i]."; seq2 ".$seq2s[$i]." ".$seq2e[$i]."; score = ".$scores[$i]." (".$orientations[$i].")\n";
+     }
+     close FH; close OUT1;
+     print "$0: Single-sequence flip mode: ".($flip_counter+0)." hits flipped\n" if $debug;
+   }
+ }
+
+
+# Pipe all input hits through external sort processes, then write the hits for each sequence into a file (see reprintInputHits())
+# Run SLAGAN on the resulting job archives, via worker instances either on the cluster (clust_run) or sequentially (system())
+sub runSLAGAN() {
+ my ($clust_run_invoke, $num_jobs, $sort_pid1, $sort_pid2, $sort_pid3, $one_seq_mode,
+ $cur_align, $next_align, $curlen1, $curlen2, $nextlen1, $nextlen2, $overlap1, $overlap2, $dump_count);
+ local (*RH1, *WH1, *RH2, *WH2, *RH3, *WH3, *IN, *DUPES);
+# my $filter = Desoverlap->new($overlap_factor, $debug);
+
+ print "$0: Sorting input hits...\n" if $debug;
+ open(DUPES, "> supermap.duplicates") if $debug; # in this sub only the disabled overlap scan below writes to DUPES
+
+ $one_seq_mode = 1 if (keys(%$sizes1) == 1 and keys(%$sizes2) == 1); # exactly one sequence on each side
+
+ $sort_pid1 = open2(\*RH1, \*WH1, "sort -k 1,1 -k 2,2n"); # pre-scan
+ $sort_pid2 = open2(\*RH2, \*WH2, "sort -k 1,1 -k 2,2n"); # gen1base
+ $sort_pid3 = open2(\*RH3, \*WH3, "sort -k 4,4 -k 5,5n"); # gen2base
+
+ # Sort input on seq1
+ foreach my $file (@input_files) {
+ open(IN, "< $tmp_dir$file".($proflip?".flipped":"").".chaos"); # NOTE(review): open() result is not checked
+ print WH1 while <IN>;
+ close IN;
+ }
+ close WH1; # end of input for the pre-scan sort
+
+ # Scan input, check if start2, end2 are ascending for sorting, erode alignments
+ while (<RH1>) {
+ /\A[\s]*(.*)\s([\d]+)\s([\d]+)\;\s(.*)\s([\d]+)\s([\d]+)\;\sscore\s\=\s([e\d\.\+\-]+)\s\(([\+\-]+)\)/o; # NOTE(review): match result is not checked
+
+ $next_align=[];
+ ($$next_align[SEQ1], $$next_align[START1], $$next_align[END1], $$next_align[SEQ2], $$next_align[START2], $$next_align[END2], $$next_align[SCORE], $$next_align[ORIENT])
+ = ($1, $2, $3, $4, $5, $6, $7, $8);
+ next if $$next_align[SCORE] <= 0; # hits with a non-positive score are dropped
+ if ($one_seq_mode) { $$next_align[SEQ1] = (keys(%$sizes1))[0]; $$next_align[SEQ2] = (keys(%$sizes2))[0]; }
+ checkAlignCoords($next_align);
+
+ unless ($$next_align[END1]-$$next_align[START1] <= $erode_align*2 or $$next_align[END2]-$$next_align[START2] <= $erode_align*2) { # erode only if both spans exceed 2*$erode_align
+ $$next_align[START1] += $erode_align; $$next_align[END1] -= $erode_align;
+ $$next_align[START2] += $erode_align; $$next_align[END2] -= $erode_align;
+ }
+
+=head1
+ # Overlap scan
+ if ($$next_align[START1] <= $$cur_align[END1] and $$next_align[END1] >= $$cur_align[START1] # overlap in seq1
+ and $$next_align[START2] <= $$cur_align[END2] and $$next_align[END2] >= $$cur_align[START2] # overlap in seq2
+ and $$cur_align[SEQ1] eq $$next_align[SEQ1] and $$cur_align[SEQ2] eq $$next_align[SEQ2]
+ and $$cur_align[ORIENT] eq $$next_align[ORIENT]) {
+ ($curlen1, $curlen2, $nextlen1, $nextlen2)
+ = ($$cur_align[END1] - $$cur_align[START1] + 1, $$cur_align[END2] - $$cur_align[START2] + 1,
+ $$next_align[END1] - $$next_align[START1] + 1, $$next_align[END2] - $$next_align[START2] + 1);
+
+ if ($$next_align[START1] <= $$cur_align[START1] and $$next_align[END1] >= $$cur_align[END1]) {
+ $overlap1 = $$cur_align[END1] - $$cur_align[START1] + 1; # next covers cur
+ } elsif ($$next_align[START1] <= $$cur_align[START1]) {
+ $overlap1 = $$next_align[END1] - $$cur_align[START1] + 1; # next is to the left
+ } elsif ($$next_align[END1] >= $$cur_align[END1]) {
+ $overlap1 = $$cur_align[END1] - $$next_align[START1] + 1; # next is to the right
+ } else {
+ $overlap1 = $$next_align[END1] - $$next_align[START1] + 1; # cur covers next
+ }
+ if ($$next_align[START2] <= $$cur_align[START2] and $$next_align[END2] >= $$cur_align[END2]) {
+ $overlap2 = $$cur_align[END2] - $$cur_align[START2] + 1;
+ } elsif ($$next_align[START2] <= $$cur_align[START2]) {
+ $overlap2 = $$next_align[END2] - $$cur_align[START2] + 1;
+ } elsif ($$next_align[END2] >= $$cur_align[END2]) {
+ $overlap2 = $$cur_align[END2] - $$next_align[START2] + 1;
+ } else {
+ $overlap2 = $$next_align[END2] - $$next_align[START2] + 1;
+ }
+ die("$0: Bad internal state") if $overlap1 < 0 or $overlap2 < 0;
+
+ if (($overlap1 / $curlen1 > $overlap_factor) and ($overlap2 / $curlen2 > $overlap_factor)
+ and $$cur_align[SCORE] <= $$next_align[SCORE]) {
+ $dump_count++;
+ print DUPES "Cur: (".$$cur_align[START1]."-".$$cur_align[END1].")(".$$cur_align[START2]."-".$$cur_align[END2].") ".$$cur_align[SCORE]." over with (".$$next_align[START1]."-".$$next_align[END1].")(".$$next_align[START2]."-".$$next_align[END2].") ".$$next_align[SCORE]."\n" if $debug;
+ $cur_align = $next_align; next; # discard current align
+ } elsif (($overlap1 / $nextlen1 > $overlap_factor) and ($overlap2 / $nextlen2 > $overlap_factor)
+ and $$cur_align[SCORE] >= $$next_align[SCORE]) {
+ $dump_count++;
+ print DUPES "Nxt: (".$$next_align[START1]."-".$$next_align[END1].")(".$$next_align[START2]."-".$$next_align[END2].") ".$$next_align[SCORE]." over with (".$$cur_align[START1]."-".$$cur_align[END1].")(".$$cur_align[START2]."-".$$cur_align[END2].") ".$$cur_align[SCORE]."\n" if $debug;
+ next; # discard next align
+ }
+ }
+=cut
+ foreach my $cur_align ($next_align){ # (@{$filter->put($next_align)}) { -- with the filter disabled, every surviving hit goes to both sorts
+ print WH2 $$cur_align[SEQ1]." ".$$cur_align[START1]." ".$$cur_align[END1]."; ".$$cur_align[SEQ2]." ".$$cur_align[START2]." ".$$cur_align[END2]."; "."score = ".$$cur_align[SCORE]." (".$$cur_align[ORIENT].")\n";
+ print WH3 $$cur_align[SEQ1]." ".$$cur_align[START1]." ".$$cur_align[END1]."; ".$$cur_align[SEQ2]." ".$$cur_align[START2]." ".$$cur_align[END2]."; "."score = ".$$cur_align[SCORE]." (".$$cur_align[ORIENT].")\n";
+ }
+
+# print WH2 $$cur_align[SEQ1]." ".$$cur_align[START1]." ".$$cur_align[END1]."; ".$$cur_align[SEQ2]." ".$$cur_align[START2]." ".$$cur_align[END2]."; "."score = ".$$cur_align[SCORE]." (".$$cur_align[ORIENT].")\n" if @$cur_align;
+# print WH3 $$cur_align[SEQ1]." ".$$cur_align[START1]." ".$$cur_align[END1]."; ".$$cur_align[SEQ2]." ".$$cur_align[START2]." ".$$cur_align[END2]."; "."score = ".$$cur_align[SCORE]." (".$$cur_align[ORIENT].")\n" if @$cur_align;
+# $cur_align = $next_align;
+ }
+# $filter->printAll();
+ # Flush alignments remaining in filter buffer
+# foreach my $cur_align (@{$filter->getBuffer()}) {
+# print WH2 $$cur_align[SEQ1]." ".$$cur_align[START1]." ".$$cur_align[END1]."; ".$$cur_align[SEQ2]." ".$$cur_align[START2]." ".$$cur_align[END2]."; "."score = ".$$cur_align[SCORE]." (".$$cur_align[ORIENT].")\n" if $cur_align != 0;
+# print WH3 $$cur_align[SEQ1]." ".$$cur_align[START1]." ".$$cur_align[END1]."; ".$$cur_align[SEQ2]." ".$$cur_align[START2]." ".$$cur_align[END2]."; "."score = ".$$cur_align[SCORE]." (".$$cur_align[ORIENT].")\n" if $cur_align != 0;
+# }
+
+ close RH1; waitpid $sort_pid1, 0; # reap the pre-scan sort
+
+ close WH2;
+ $num_jobs = reprintInputHits(1, 1, \*RH2); # gen1base pass; job ids start at 1
+ close RH2; waitpid $sort_pid2, 0;
+
+ close WH3;
+ $num_jobs = reprintInputHits(2, $num_jobs, \*RH3); # gen2base pass continues the job numbering
+ close RH3; waitpid $sort_pid3, 0;
+
+ close DUPES if defined fileno DUPES;
+# print STDERR "$0: Warning: ".$filter->{dump_count}." near duplicate alignments discarded (overlap factor $overlap_factor)\n" if $filter->{dump_count} and not $quiet;
+
+ open(FH, "> ".$tmp_dir."CLUSTER_JOB_PARAMS") or die;
+ foreach my $i (1..$num_jobs-1) { # reprintInputHits() returns the next unused id, so jobs are 1..$num_jobs-1
+ print FH "worker JOB".$i.".tar ".$score_file." ".$SLAGAN.($debug ? " -v" : "");
+ print FH " << JOB$i.tar > CLUSTER_JOB_MESSAGES.$i >> CLUSTER_JOB_ERRMSG.$i" unless $no_clust_run; # redirections are only written for clust_run
+ print FH "\n";
+ }
+ close FH;
+
+ if ($no_clust_run) {
+ open(FH, "< ".$tmp_dir."CLUSTER_JOB_PARAMS") or die;
+ print "$0: Running ".($num_jobs-1)." SLAGAN jobs locally...\n" unless $quiet;
+ while (<FH>) {
+ chomp;
+ print("Job $.: \"$0 $_\"\n") if $debug;
+ system("cd $tmp_dir; $supermapexec ".$_); # one parameter line per job, run through the shell inside $tmp_dir
+ }
+ close FH;
+ } else {
+ $clust_run_invoke = "clust_run -program=".$supermapexec." -parameters=".$tmp_dir."CLUSTER_JOB_PARAMS -init_dir=$tmp_dir -wait";
+ print "$0: Running ".($num_jobs-1)." distributed SLAGAN jobs with clust_run...\n" unless $quiet;
+ print "$0: \"$clust_run_invoke\"\n" if $debug;
+
+ if ($clust_run_pid = fork()) { # I am the parent
+ waitpid($clust_run_pid, 0);
+ } elsif (not defined $clust_run_pid) {
+ die("$0: Could not fork");
+ } else { # I am the child
+ die("$0: Could not exec \"$clust_run_invoke\"") unless exec($clust_run_invoke);
+ }
+ undef $clust_run_pid; # $clust_run_pid is not declared in this sub
+ }
+
+ foreach my $i (1..$num_jobs-1) {
+ system("cd $tmp_dir; tar -xf ".$tmp_dir."JOB".$i.".results.tar"); # unpack the results of job $i into $tmp_dir
+ unlink $tmp_dir."JOB".$i.".tar" unless $nodelete;
+ unlink $tmp_dir."JOB".$i.".results.tar" unless $nodelete;
+ unlink $tmp_dir."CLUSTER_JOB_MESSAGES.$i" unless $nodelete;
+ unlink $tmp_dir."CLUSTER_JOB_ERRMSG.$i" unless $nodelete;
+ }
+
+ unlink "$tmp_dir$input_glob.chaos" unless $nodelete;
+ unlink $tmp_dir."CLUSTER_JOB_PARAMS" unless $nodelete;
+
+ foreach my $file (@input_files) {
+ unlink $tmp_dir.$file.".chaos" unless $nodelete;
+ }
+}
+
+
+sub reprintInputHit($$$) {
+ # Print one hit to $out with the base genome's fields first, in the "seq start end; seq start end; score = S (O)" format
+ my ($base_gen, $hit, $out) = @_;
+ my $is_plus = ($$hit[ORIENT] eq "+");
+ # Anything other than base genome 1 or 2 combined with strand "+" or "-" is an internal error
+ unless (($base_gen == 1 or $base_gen == 2) and ($is_plus or $$hit[ORIENT] eq "-")) {
+  die("$0: Bad internal state from hit ".$$hit[SEQ1]." ".$$hit[START1]." ".$$hit[END1]."; ".$$hit[SEQ2]." ".$$hit[START2]." ".$$hit[END2]."; "."score = ".$$hit[SCORE]." (".$$hit[ORIENT].")");
+ }
+ # The non-base genome's start and end are printed in swapped order on the "-" strand
+ my @base = ($base_gen == 1) ? @$hit[SEQ1, START1, END1] : @$hit[SEQ2, START2, END2];
+ my @other = ($base_gen == 1) ? @$hit[SEQ2, START2, END2] : @$hit[SEQ1, START1, END1];
+ @other[1, 2] = @other[2, 1] unless $is_plus;
+ print {$out} $base[0]." ".$base[1]." ".$base[2]."; ".$other[0]." ".$other[1]." ".$other[2]."; "."score = ".$$hit[SCORE]." (".$$hit[ORIENT].")\n";
+}
+
+
+sub writeJobFile($$) { # Append the files named by the keys of %$seq_list to ${tmp_dir}JOB<job_id>.tar, then delete them unless $nodelete
+ my ($job_id, $seq_list) = @_;
+ my @files = sort alnum keys(%$seq_list); # sorted once, used for both archiving and cleanup
+ local *LIST;
+ # xargs receives the bare file names (relative to $tmp_dir) on its standard input
+ open(LIST, "| cd $tmp_dir; xargs tar --append --file=".$tmp_dir."JOB".$job_id.".tar") or die("$0: Could not start tar for job $job_id: ".$!);
+ foreach my $file (@files) { my $name = ($file =~ /\/([^\/]+)$/) ? $1 : $file; print LIST $name." "; } # fall back to the full name if there is no directory part
+ close LIST or warn("$0: Warning: archiving of job $job_id may have failed (exit status $?)\n");
+ foreach my $file (@files) { unlink $file unless $nodelete; }
+}
+
+
+# Separate input into files based on sequence name and reverse order in gen2base hits.
+# Reads sorted hit lines from $RH, writes one file per base-genome sequence, groups those files into JOB<id>.tar archives
+# via writeJobFile(), replaces $sizes1 or $sizes2 with the sizes of the sequences actually seen, and returns the next free job id.
+sub reprintInputHits($$$) {
+ my ($base_gen, $job_id, $RH) = @_;
+ my ($line_count, $prev_seq, $cur_seq, $cur_align, $out_name);
+ my (%cur_seq_list, %pruned_sizes);
+ local *OUT;
+
+ print "$0: Reprinting hits (base genome $base_gen)..." if $debug;
+
+ $line_count = 0;
+ while (<$RH>) {
+  /\A[\s]*(.*)\s([\d]+)\s([\d]+)\;\s(.*)\s([\d]+)\s([\d]+)\;\sscore\s\=\s([e\d\.\+\-]+)\s\(([\+\-]+)\)/o;
+
+  $cur_align=[];
+  ($$cur_align[SEQ1], $$cur_align[START1], $$cur_align[END1], $$cur_align[SEQ2], $$cur_align[START2], $$cur_align[END2], $$cur_align[SCORE], $$cur_align[ORIENT])
+  = ($1, $2, $3, $4, $5, $6, $7, $8);
+
+  $cur_seq = ($base_gen == 1 ? $$cur_align[SEQ1] : $$cur_align[SEQ2]);
+
+  if ($cur_seq ne $prev_seq) { # first hit of a new base sequence: switch to its output file
+   $out_name = $tmp_dir.$input_files[0].".gen".$base_gen."base.".$cur_seq.".chaos";
+   $pruned_sizes{$cur_seq} = ($base_gen == 1 ? $$sizes1{$cur_seq} : $$sizes2{$cur_seq});
+   print " ".$cur_seq if $debug;
+   close OUT if defined fileno OUT;
+   open(OUT, "> ".$out_name) or die("$0: Could not open file ".$out_name." for writing: ".$!);
+   if ($line_count > $max_job_size) { # job is full: archive the files collected so far and start the next job
+    writeJobFile($job_id, \%cur_seq_list);
+    undef %cur_seq_list; $line_count = 0; $job_id++;
+   }
+   $cur_seq_list{$out_name} = 1;
+  }
+  reprintInputHit($base_gen, $cur_align, \*OUT) if @$cur_align;
+
+  $prev_seq = $cur_seq;
+  $line_count++;
+ }
+
+ # Archive whatever was collected for the last job
+ writeJobFile($job_id, \%cur_seq_list);
+ $job_id++;
+
+ close OUT;
+ print "\n" if $debug;
+ $sizes1 = \%pruned_sizes if $base_gen == 1;
+ $sizes2 = \%pruned_sizes if $base_gen == 2;
+ return $job_id;
+}
+
+
+sub seqBelowMinScore($) { # True if the bracketed score on a SLAGAN output line is below $min_seq_score; dies if the line cannot be parsed
+ my ($line) = @_;
+ # List-context match: a failed match yields an empty list, so a capture left over from an earlier match can never be used
+ my (undef, $total) = ($line =~ /\A[\s]*\([\d]+\s[\d]+\)\=\([\d]+\s[\d]+\)\s([\d\.\-]+)\s[\+\-]+\s\[([\d\.\-]+)\][\s]*s1\:.*[\s]*s2\:.*\n\Z/);
+ die("$0: Unable to extract score values from SLAGAN output:\n$line") if not defined $total;
+ return ($total < $min_seq_score);
+}
+
+sub processResults() { # Sorts the per-sequence SLAGAN outputs with external sort processes and feeds them to chainBase1Hits()/chainBase2Hits()
+ my ($cur_seq, $input_prefix, $dropped_seqs, $sort_pid, $sort_pid2);
+ local (*RH, *WH, *IN, *OUT, *hashesDM_RH, *hashesDM_WH);
+ print "$0: Loading SLAGAN output...\n" unless $quiet;
+ open(GLOCAL_OUT_LOG, "> ".$glocal_out_logfile) if $glocal_out_logfile; # optional log; open() result is not checked
+
+ # Sort gen2base aligns on seq1, then seq2, then start2, then print them to separate files, one file per gen1 seq
+ # These files will be loaded on demand when scanning gen1base aligns (chainBase1Hits())
+ $sort_pid = open2(\*RH, \*WH, "sort -k 9,9 -k 7,7 -k 1.2,1n"); # input is base 2, key is 9 because a space is expected between s2: and seq2name
+ $input_prefix = $tmp_dir.$input_files[0].".gen2base";
+ foreach my $seq (sort alnum keys(%$sizes2)) {
+ open(IN, "< $input_prefix.$seq.chaos.glocal-out") or (delete($$sizes2{$seq}), next); # no output file: forget this sequence
+ my $line = <IN>;
+ die("$0: Empty SLAGAN output file $input_prefix.$seq.chaos.glocal-out, check corresponding job logs. Stopped") unless $line;
+ if (seqBelowMinScore($line)) { print "$0: Discarding file $input_prefix.$seq.chaos.glocal-out - score too low ($1<$min_seq_score)\n" if $debug; next; } # NOTE(review): IN is not closed on this path, and $1 is presumably not set by the match inside seqBelowMinScore() - confirm
+ seek IN, 0, 0; # back to start
+ print WH while <IN>;
+ close IN; # the file is kept here; the gen2base pass further down opens it again
+ }
+ close WH or die("$0: Error executing sort");
+ while (<RH>) {
+ /\ss2\:[\s]*([^\s]+)[\s]*\n\Z/;
+ if ($1 ne $cur_seq or not defined $cur_seq) { # name after "s2:" changed: switch output file
+ next unless $1;
+ close OUT if defined fileno OUT;
+ $cur_seq = $1;
+ open(OUT, "> $input_prefix.sorted-gen1.$cur_seq.chaos.glocal-out") or die("$0: Could not open file $input_prefix.sorted-gen1.$cur_seq.chaos.glocal-out for writing: ".$!);
+ }
+ print OUT $_;
+ }
+ close RH; close OUT if defined fileno OUT;
+ waitpid $sort_pid, 0;
+
+ # Sort gen1base aligns on seq1, then start1
+ $sort_pid = open2(\*RH, \*WH, "sort -k 7,7 -k 1.2,1n"); # input is base 1
+ $input_prefix = $tmp_dir.$input_files[0].".gen1base";
+ foreach my $seq (sort alnum keys(%$sizes1)) {
+ open(IN, "< $input_prefix.$seq.chaos.glocal-out") or (delete($$sizes1{$seq}), next);
+ my $line = <IN>;
+ if (seqBelowMinScore($line)) { $dropped_seqs++; print "$0: Discarding file $input_prefix.$seq.chaos.glocal-out - score too low ($1<$min_seq_score)\n" if $debug; next; } # NOTE(review): same unclosed IN / $1 concern as in the first loop
+ seek IN, 0, 0; # back to start
+ print WH while <IN>;
+ if ($glocal_out_logfile) { seek IN, 0, 0; print GLOCAL_OUT_LOG while <IN>; } # copy the same lines into the optional log
+ close IN;
+ unlink "$input_prefix.$seq.chaos.glocal-out" unless $nodelete;
+ }
+ unlink $input_prefix.".chaos" unless $nodelete;
+ close WH or die("$0: Error executing sort");
+
+ # Feed the gen1base aligns to the 2M/1M1 chain scanner (chainBase1Hits())
+ # The hashesDM handle is used to write 2M aligns' hashes to be sorted in seq2 order
+ print "$0: Generating supermonotonic map...\n" unless $quiet;
+ $sort_pid2 = open2(\*hashesDM_RH, \*hashesDM_WH, "sort -k 2,2");
+ chainBase1Hits(*RH, *hashesDM_WH);
+ close RH;
+ waitpid $sort_pid, 0;
+ close hashesDM_WH or die("$0: Error executing sort");
+
+ # Print sorted 2M aligns' hashes, one file per gen2 seq
+ undef $cur_seq;
+ while(<hashesDM_RH>) {
+ my $line = $_;
+ $line =~ /\A[^\s]+\s([^\s]+)\s[^\s]+\n\Z/; # $1 is the second column, written as SEQ2 by chainBase1Hits()
+ if ($1 ne $cur_seq or not defined $cur_seq) {
+ close OUT if defined fileno OUT;
+ $cur_seq = $1;
+ open(OUT, "> $tmp_dir".$input_files[0].".hashesDM.gen2.$cur_seq") or die("$0: Could not open file $tmp_dir".$input_files[0].".hashesDM.gen2.$cur_seq for writing: ".$!);
+ }
+ print OUT $line;
+ }
+ close hashesDM_RH;
+ waitpid $sort_pid2, 0;
+
+ # Sort gen2base aligns on seq2, then start2
+ $sort_pid = open2(\*RH, \*WH, "sort -k 7,7 -k 1.2,1n"); # input is base 2
+ $input_prefix = $tmp_dir.$input_files[0].".gen2base";
+ foreach my $seq (sort alnum keys(%$sizes2)) {
+ open(IN, "< $input_prefix.$seq.chaos.glocal-out") or next;
+ my $line = <IN>;
+ if (seqBelowMinScore($line)) { $dropped_seqs++; print "$0: Discarding file $input_prefix.$seq.chaos.glocal-out - score too low ($1<$min_seq_score)\n" if $debug; next; } # NOTE(review): same unclosed IN / $1 concern as in the first loop
+ seek IN, 0, 0; # back to start
+ print WH while <IN>;
+ close IN;
+ unlink "$input_prefix.$seq.chaos.glocal-out" unless $nodelete;
+ }
+ unlink $input_prefix.".chaos" unless $nodelete;
+ close WH or die("$0: Error executing sort");
+
+ # Feed the gen2base aligns to the 1M2 chain scanner (chainBase2Hits())
+ chainBase2Hits(*RH);
+ close RH;
+ waitpid $sort_pid, 0;
+
+ close GLOCAL_OUT_LOG if defined fileno GLOCAL_OUT_LOG;
+
+ removeSLAGANOutput();
+ print STDERR "$0: Warning: Alignments for $dropped_seqs sequences discarded due to total score below cutoff ($min_seq_score)\n" if $dropped_seqs and not $quiet;
+}
+
+
+sub removeSLAGANOutput() {
+ # Delete the per-sequence SLAGAN outputs and the combined .chaos file of both base genomes (all kept if $nodelete is set)
+ foreach my $gen ([".gen1base", $sizes1], [".gen2base", $sizes2]) {
+  my ($suffix, $sizes) = @$gen;
+  my $stem = $tmp_dir.$input_files[0].$suffix;
+  next if $nodelete;
+  unlink "$stem.$_.chaos.glocal-out" foreach (sort alnum keys(%$sizes));
+  unlink $stem.".chaos";
+ }
+ rmdir $tmp_dir; # attempted regardless of $nodelete
+}
+
+
+sub alignHashID($) {
+ my $hit = $_[0];
+ # Textual key of the form "seq1:start1-end1=seq2:start2-end2", built from the alignment's names and coordinates
+ return sprintf("%s:%s-%s=%s:%s-%s", @$hit[SEQ1, START1, END1, SEQ2, START2, END2]);
+}
+
+
+# The chain writer lags the chainer by two chains because the full contents of neighboring chains must be known.
+# Writes one line for $cur_chain to $FH: origin, number of aligns, then the fields of the chain's first and last align.
+sub printChainToTemp($$$$) {
+ my ($FH, $prev_chain, $cur_chain, $next_chain) = @_;
+ defined $cur_chain or return;
+
+ my @ends = @$cur_chain[0, -1]; # first and last alignment of the chain
+ my $record = $ends[0][ORIGIN]." ".scalar(@$cur_chain);
+ foreach my $align (@ends) {
+  $record .= " ".$_ foreach (@$align[START1, END1, START2, END2, SEQ1, SEQ2, ORIENT, SCORE]);
+ }
+ print $FH $record;
+
+ # With $print_chains set, the coordinates of every alignment in the chain are appended to the same line
+ if ($print_chains) {
+  print $FH " ".join(" ", @$_[START1, END1, START2, END2]) foreach (@$cur_chain);
+ }
+ print $FH "\n";
+}
+
+
+sub chainBase1Hits($$) { # Scans gen1base SLAGAN hits from $FH, groups them into 2M/1M1 chains written to per-sequence ".2MM1.<seq>" files, and prints each 2M hit's hash ID to $hashesDM
+ my ($FH, $hashesDM) = @_;
+ local *OUT;
+ my ($cur_align, $prev_align, $cur_chain, $prev_chain, $pre_prev_chain, $chain_start_2M, $chain_start_1M1,
+ $cur_seq, $align_peers, $flip_counter);
+ my @bad_aligns; my %base2peers;
+
+ while (<$FH>) {
+ /\A[\s]*\(([\d]+)\s([\d]+)\)\=\(([\d]+)\s([\d]+)\)\s([\d\.\-]+)\s([\+\-]+)\s\[([\d\.\-]+)\][\s]*s1\:(.*)[\s]*s2\:(.*)\n\Z/;
+
+ next if ($1==$2); # skip null alignments
+ (push(@bad_aligns, $_), next) unless $1 and $2 and $3 and $4 and $5 and $6; # lines with a missing or zero field are collected as bad
+
+ $cur_align = [];
+ ($$cur_align[START1], $$cur_align[END1], $$cur_align[START2], $$cur_align[END2], $$cur_align[SCORE], $$cur_align[ORIENT], $$cur_align[TOTSC], $$cur_align[SEQ1], $$cur_align[SEQ2])
+ = ($1, $2, $3, $4, $5, $6, $7, $8, $9);
+ $$cur_align[SEQ1] =~ s/^\s+//; $$cur_align[SEQ1] =~ s/\s+$//;
+ $$cur_align[SEQ2] =~ s/^\s+//; $$cur_align[SEQ2] =~ s/\s+$//;
+#warn("Seen: ".$_) if $$cur_align[SEQ1] eq "AC002301.1";
+ checkAlignCoords($cur_align);
+
+ if ($proflip and defined $flipped_aligns{alignHashID($cur_align)}) { # hit was recorded in %flipped_aligns: mirror its seq2 coordinates and invert the strand
+ my $seq2center = $$sizes2{(keys(%$sizes2))[0]} / 2;
+ my $j = $$cur_align[START2];
+ $$cur_align[START2] = (2 * $seq2center) - $$cur_align[END2];
+ $$cur_align[END2] = (2 * $seq2center) - $j;
+ if ($$cur_align[ORIENT] eq "+") { $$cur_align[ORIENT] = "-"; } else { $$cur_align[ORIENT] = "+"; }
+ $$cur_align[FLIPPED]=1;
+ $flip_counter++;
+ }
+
+ $$cur_align[HASHID] = alignHashID($cur_align);
+
+ if ($$cur_align[SEQ1] ne $cur_seq) { # first hit of a new seq1: flush the pending chains and switch files
+#warn("Handling seq trans") if $prev_align and $$prev_align[SEQ1] eq "AC002301.1";
+printChainToTemp(*OUT, $pre_prev_chain, $prev_chain, $cur_chain);# unless defined $cur_seq;
+printChainToTemp(*OUT, $prev_chain, $cur_chain, undef);# unless defined $cur_seq;
+
+ undef $chain_start_2M; undef $chain_start_1M1; undef $prev_align;
+ undef $pre_prev_chain; undef $prev_chain; undef $cur_chain;
+ $cur_seq = $$cur_align[SEQ1];
+ %base2peers = %{loadBase2Hashes($tmp_dir.$input_files[0].".gen2base.sorted-gen1.$cur_seq.chaos.glocal-out")};
+ close OUT if defined fileno OUT;
+ open(OUT, "> ".$tmp_dir.$input_files[0].".2MM1.$cur_seq"); # NOTE(review): open() result is not checked
+ }
+
+ $align_peers = $base2peers{$$cur_align[HASHID]};
+ $$cur_align[ORIGIN] = defined($align_peers) ? 2 : 1; # 2 = hash ID also found in the gen2base file loaded above, 1 = not found there
+
+ if ($chain_start_2M and defined $align_peers and defined $prev_align # continue open 2M chain
+ and (($$cur_align[ORIENT] eq "+" and $$cur_align[START2] > $$prev_align[END2]
+ and $$prev_align[HASHID] eq $$align_peers[0])
+ or ($$cur_align[ORIENT] eq "-" and $$cur_align[END2] < $$prev_align[START2]
+ and $$prev_align[HASHID] eq $$align_peers[1])
+ or ($$cur_align[FLIPPED] and ($$cur_align[ORIENT] eq "+" and $$cur_align[START2] < $$prev_align[END2]
+ and $$prev_align[HASHID] eq $$align_peers[0])
+ or ($$cur_align[ORIENT] eq "-" and $$cur_align[END2] > $$prev_align[START2]
+ and $$prev_align[HASHID] eq $$align_peers[1])))
+ and $$cur_align[ORIENT] eq $$prev_align[ORIENT]
+ and $$cur_align[FLIPPED] eq $$prev_align[FLIPPED]
+ and $$cur_align[SEQ2] eq $$prev_align[SEQ2]
+ and ($$cur_align[START1] > $$prev_align[END1] or ($$cur_align[FLIPPED] and $$cur_align[START1] > $$prev_align[END1])) # NOTE(review): both alternatives test START1 > END1, so the FLIPPED clause is redundant - confirm intent
+ and abs($$cur_align[END1] - $$chain_start_2M[START1]) < $max_chainlen
+ and abs($$cur_align[END2] - $$chain_start_2M[START2]) < $max_chainlen
+#and abs($$cur_align[END1] - $$chain_start_2M[START1])/abs($$cur_align[END2] - $$chain_start_2M[START2]) < $max_asym
+#and abs($$cur_align[END2] - $$chain_start_2M[START2])/abs($$cur_align[END1] - $$chain_start_2M[START1]) < $max_asym
+ ) {
+ push(@$cur_chain, $cur_align);
+ print $hashesDM $$cur_align[SEQ1]."\t".$$cur_align[SEQ2]."\t".$$cur_align[HASHID]."\n";
+ } elsif (defined $align_peers) { # start new 2M chain
+ printChainToTemp(*OUT, $pre_prev_chain, $prev_chain, $cur_chain);
+ $chain_start_2M = $cur_align; undef $chain_start_1M1;
+ $pre_prev_chain = $prev_chain; $prev_chain = $cur_chain;
+ $cur_chain = [$cur_align];
+ print $hashesDM $$cur_align[SEQ1]."\t".$$cur_align[SEQ2]."\t".$$cur_align[HASHID]."\n";
+ } elsif ($chain_start_1M1 and defined $prev_align # continue open 1M1 chain
+ and ((($$cur_align[ORIENT] eq "+" and $$cur_align[START2] > $$prev_align[END2])
+ or ($$cur_align[ORIENT] eq "-" and $$cur_align[END2] < $$prev_align[START2]))
+ or ($$cur_align[FLIPPED] and (($$cur_align[ORIENT] eq "+" and $$cur_align[START2] < $$prev_align[END2])
+ or ($$cur_align[ORIENT] eq "-" and $$cur_align[END2] > $$prev_align[START2]))))
+ and $$cur_align[ORIENT] eq $$prev_align[ORIENT]
+ and $$cur_align[FLIPPED] eq $$prev_align[FLIPPED]
+ and $$cur_align[SEQ2] eq $$prev_align[SEQ2]
+ and ($$cur_align[START1] > $$prev_align[END1] or ($$cur_align[FLIPPED] and $$cur_align[START1] > $$prev_align[END1]))
+ and abs($$cur_align[END1] - $$chain_start_1M1[START1]) < $max_chainlen
+ and abs($$cur_align[END2] - $$chain_start_1M1[START2]) < $max_chainlen
+#and abs($$cur_align[END1] - $$chain_start_1M1[START1])/abs($$cur_align[END2] - $$chain_start_1M1[START2]) < $max_asym
+#and abs($$cur_align[END2] - $$chain_start_1M1[START2])/abs($$cur_align[END1] - $$chain_start_1M1[START1]) < $max_asym
+ ) {
+ push(@$cur_chain, $cur_align);
+ } else { # start new 1M1 chain
+ printChainToTemp(*OUT, $pre_prev_chain, $prev_chain, $cur_chain);
+ $chain_start_1M1 = $cur_align; undef $chain_start_2M;
+ $pre_prev_chain = $prev_chain; $prev_chain = $cur_chain;
+ $cur_chain = [$cur_align];
+ }
+ $prev_align = $cur_align;
+ }
+ printChainToTemp(*OUT, $pre_prev_chain, $prev_chain, $cur_chain);
+ printChainToTemp(*OUT, $prev_chain, $cur_chain, undef);
+ print "$0: Single-sequence flip mode: ".($flip_counter+0)." gen1base hits backflipped\n" if $debug and $proflip;
+ warn "$0: Warning: ".@bad_aligns." bad SLAGAN alignments discarded" if @bad_aligns > 0; # the array in string concatenation yields its element count
+}
+
+
+# Input is base 2, i.e. (start2 end2)=(start1 end1)...
+sub chainBase2Hits($) { # Scans gen2base SLAGAN hits from $FH and writes the chains whose first hit has origin 3 (hash ID not in the 2M list) to per-sequence ".M2.<seq>" files
+ my ($FH) = @_;
+ local *OUT;
+ my ($cur_align, $prev_align, $cur_chain, $prev_chain, $pre_prev_chain, $chain_start_2M, $chain_start_1M2,
+ $cur_seq, $align_is_2M, $flip_counter);
+ my @bad_aligns; my %aligns2M;
+
+ while(<$FH>) {
+ /\A[\s]*\(([\d]+)\s([\d]+)\)\=\(([\d]+)\s([\d]+)\)\s([\d\.\-]+)\s([\+\-]+)\s\[([\d\.\-]+)\][\s]*s1\:(.*)[\s]*s2\:(.*)\n\Z/;
+
+ next if ($1==$2); # skip null alignments
+ (push(@bad_aligns, $_), next) unless $1 and $2 and $3 and $4 and $5 and $6; # lines with a missing or zero field are collected as bad
+
+ $cur_align = [];
+ ($$cur_align[START2], $$cur_align[END2], $$cur_align[START1], $$cur_align[END1], $$cur_align[SCORE], $$cur_align[ORIENT], $$cur_align[TOTSC], $$cur_align[SEQ2], $$cur_align[SEQ1])
+ = ($1, $2, $3, $4, $5, $6, $7, $8, $9);
+ $$cur_align[SEQ1] =~ s/^\s+//; $$cur_align[SEQ1] =~ s/\s+$//;
+ $$cur_align[SEQ2] =~ s/^\s+//; $$cur_align[SEQ2] =~ s/\s+$//;
+ checkAlignCoords($cur_align);
+
+ if ($proflip and defined $flipped_aligns{alignHashID($cur_align)}) { # hit was recorded in %flipped_aligns: mirror its seq2 coordinates and invert the strand
+ my $seq2center = $$sizes2{(keys(%$sizes2))[0]} / 2;
+ my $j = $$cur_align[START2];
+ $$cur_align[START2] = (2 * $seq2center) - $$cur_align[END2];
+ $$cur_align[END2] = (2 * $seq2center) - $j;
+ if ($$cur_align[ORIENT] eq "+") { $$cur_align[ORIENT] = "-"; } else { $$cur_align[ORIENT] = "+"; }
+ $$cur_align[FLIPPED] = 1;
+ $flip_counter++;
+ }
+
+ $$cur_align[HASHID] = alignHashID($cur_align);
+
+ if ($$cur_align[SEQ2] ne $cur_seq) { # first hit of a new seq2: flush the pending chains and switch files
+ printChainToTemp(*OUT, $pre_prev_chain, $prev_chain, $cur_chain) if $$prev_chain[0][ORIGIN] == 3;# and not defined $cur_seq;
+ printChainToTemp(*OUT, $prev_chain, $cur_chain, undef) if $$cur_chain[0][ORIGIN] == 3;# and not defined $cur_seq;
+ undef $chain_start_1M2; undef $prev_align;
+ undef $pre_prev_chain; undef $prev_chain; undef $cur_chain;
+ $cur_seq = $$cur_align[SEQ2];
+ %aligns2M = %{load2MHashes($tmp_dir.$input_files[0].".hashesDM.gen2.$cur_seq")};
+ close OUT if defined fileno OUT;
+ open(OUT, "> ".$tmp_dir.$input_files[0].".M2.$cur_seq"); # NOTE(review): open() result is not checked
+ }
+ $$cur_align[ORIGIN] = defined($aligns2M{$$cur_align[HASHID]}) ? 2 : 3; # 2 = hash ID listed in the hashesDM file for this seq2, 3 = not listed
+
+ if (defined $aligns2M{$$cur_align[HASHID]}) { # align is 2M
+ my $prev_ch_last_al = $prev_chain ? $$prev_chain[scalar(@$prev_chain)-1] : []; # NOTE(review): $prev_ch_last_al is never read
+ printChainToTemp(*OUT, $pre_prev_chain, $prev_chain, $cur_chain) if $$prev_chain[0][ORIGIN] == 3;
+ undef $chain_start_1M2; # close 1M2 chain
+ $chain_start_2M = $cur_align;
+ $pre_prev_chain = $prev_chain; $prev_chain = $cur_chain;
+ $cur_chain = [$cur_align];
+ } elsif ($chain_start_1M2 # continue open 1M2 chain
+ and ((($$cur_align[ORIENT] eq "+" and $$cur_align[START1] > $$prev_align[END1])
+ or ($$cur_align[ORIENT] eq "-" and $$cur_align[END1] < $$prev_align[START1]))
+ or ($$cur_align[FLIPPED] and (($$cur_align[ORIENT] eq "+" and $$cur_align[START1] < $$prev_align[END1])
+ or ($$cur_align[ORIENT] eq "-" and $$cur_align[END1] > $$prev_align[START1]))))
+ and $$cur_align[ORIENT] eq $$prev_align[ORIENT]
+ and $$cur_align[SEQ1] eq $$prev_align[SEQ1]
+ and $$cur_align[FLIPPED] == $$prev_align[FLIPPED]
+ and ($$cur_align[START2] > $$prev_align[END2] or ($$cur_align[FLIPPED] and $$cur_align[START2] < $$prev_align[END2]))
+ and abs($$cur_align[END1] - $$chain_start_1M2[START1]) < $max_chainlen
+ and abs($$cur_align[END2] - $$chain_start_1M2[START2]) < $max_chainlen
+#and abs($$cur_align[END1] - $$chain_start_1M2[START1])/abs($$cur_align[END2] - $$chain_start_1M2[START2]) < $max_asym
+#and abs($$cur_align[END2] - $$chain_start_1M2[START2])/abs($$cur_align[END1] - $$chain_start_1M2[START1]) < $max_asym
+ ) {
+ push(@$cur_chain, $cur_align);
+ } else { # start new 1M2 chain
+ my $prev_ch_last_al = $prev_chain ? $$prev_chain[scalar(@$prev_chain)-1] : [];
+ printChainToTemp(*OUT, $pre_prev_chain, $prev_chain, $cur_chain) if $$prev_chain[0][ORIGIN] == 3;
+ $chain_start_1M2 = $cur_align;
+ $pre_prev_chain = $prev_chain; $prev_chain = $cur_chain;
+ $cur_chain = [$cur_align];
+ }
+ $prev_align = $cur_align;
+ }
+ my $prev_ch_last_al = $prev_chain ? $$prev_chain[scalar(@$prev_chain)-1] : [];
+ printChainToTemp(*OUT, $pre_prev_chain, $prev_chain, $cur_chain) if $$prev_chain[0][ORIGIN] == 3;
+ printChainToTemp(*OUT, $prev_chain, $cur_chain, undef) if $$cur_chain[0][ORIGIN] == 3;
+ print "$0: Single-sequence flip mode: ".($flip_counter+0)." gen2base hits backflipped\n" if $debug and $proflip;
+ warn "$0: Warning: ".@bad_aligns." bad SLAGAN alignments discarded" if @bad_aligns > 0; # the array in string concatenation yields its element count
+}
+
+
+# Input: path of a file whose lines are three tab-separated columns, the third being an align hash ID
+# Output: reference to a hash mapping each hash ID to 1 (empty if the file cannot be opened). The file is deleted unless $nodelete.
+sub load2MHashes($) {
+ my $path = shift;
+ my %seen;
+ local *FH;
+ return {} unless open(FH, "< $path");
+ while (<FH>) {
+  /\A[^\s]+\t[^\s]+\t([^\s]+)\n\Z/;
+  warn("Hash collision in \"$_\" vs. \"".$seen{$1}."\"") if defined $seen{$1};
+  $seen{$1} = 1;
+ }
+ close FH;
+ $nodelete or unlink $path;
+ return \%seen;
+}
+
+
+# Input: file with gen2base alignments which should have the same seq1 ordered by start2 or not exist
+# Output: hash(key->align hash ID, value->[hash ID of the preceding line's align or 1, hash ID of the following line's align or undef])
+# Input is base 2, i.e. (start2 end2)=(start1 end1)... The input file is deleted unless $nodelete is set.
+sub loadBase2Hashes($) {
+ my $path = shift;
+ my ($before, $held, $incoming, %peers);
+ local *FH;
+ return {} unless open(FH, "< $path");
+ while (<FH>) { # One line of lookahead: an entry is stored only once the line after it has been read
+  /\A[\s]*\(([\d]+)\s([\d]+)\)\=\(([\d]+)\s([\d]+)\)\s.*s1\:(.*)[\s]*s2\:(.*)/;
+
+  # Hits are gen2base: the first coordinate pair and the s1: name are stored as the genome 2 fields
+  $incoming = [];
+  @$incoming[START2, END2, START1, END1, SEQ2, SEQ1] = ($1, $2, $3, $4, $5, $6);
+  checkAlignCoords($incoming);
+  foreach my $name (@$incoming[SEQ1, SEQ2]) { $name =~ s/^\s+//; $name =~ s/\s+$//; }
+  $$incoming[HASHID] = alignHashID($incoming);
+  if ($held) {
+   warn("LB2H: Hash collision in \"$_\"") if defined $peers{$$held[HASHID]};
+   $peers{$$held[HASHID]} = [$before ? $$before[HASHID] : 1, $$incoming[HASHID]];
+  }
+  ($before, $held) = ($held, $incoming);
+ }
+
+ # The last alignment read has no successor
+ $peers{$$held[HASHID]} = [$before ? $$before[HASHID] : 1, undef] if $held;
+ close FH;
+ $nodelete or unlink $path;
+ return \%peers;
+}
+
+
+# Load the chain files written by the chain scanners, run them through expandSeq1(), expandSeq2() and finalExpand(), and write the result to $outfile
+sub postProcessRegions() {
+ local (*IN, *OUT, *RH1, *WH1, *RH2, *WH2, *RH3, *WH3);
+ # One external sort in front of each of the three stages
+ my $pid_stage1 = open2(\*RH1, \*WH1, "sort -k 7,7 -k 3,3n"); # sort on seq1, start1
+ my $pid_stage2 = open2(\*RH2, \*WH2, "sort -k 8,8 -k 5,5n"); # sort on seq2, start2
+ my $pid_stage3 = open2(\*RH3, \*WH3, "sort -k 7,7 -k 3,3n"); # sort on seq1, start1
+
+ open(OUT, "> ".$outfile) or die("$0: Could not open output file $outfile for writing: ".$!);
+
+ # Pour every chain file (".2MM1." per gen1 sequence, then ".M2." per gen2 sequence) into the first sort
+ foreach my $set ([".2MM1.", $sizes1], [".M2.", $sizes2]) {
+  my ($tag, $sizes) = @$set;
+  foreach my $seq (sort alnum keys %$sizes) {
+   my $chain_file = $tmp_dir.$input_files[0].$tag.$seq;
+   open(IN, "< ".$chain_file) or next;
+   print WH1 while <IN>;
+   close IN;
+   unlink $chain_file unless $nodelete;
+  }
+ }
+ # Stage 1: sorted on seq1 -> expandSeq1 -> second sort
+ close WH1;
+ expandSeq1(\*RH1, \*WH2);
+ close RH1;
+ waitpid $pid_stage1, 0;
+ # Stage 2: sorted on seq2 -> expandSeq2 -> third sort
+ close WH2;
+ expandSeq2(\*RH2, \*WH3);
+ close RH2;
+ waitpid $pid_stage2, 0;
+ # Stage 3: sorted on seq1 again -> finalExpand -> output file
+ close WH3;
+ finalExpand(\*RH3, \*OUT);
+ close RH3;
+ waitpid $pid_stage3, 0;
+ close OUT;
+}
+
+
+# Input: chains ordered by seq1, start1
+# Output: chains expanded on seq1
+sub expandSeq1($$) {
+ my ($RH, $WH) = @_;
+ my ($first_align, $last_align, $type, $num_aligns, $cur_seq, $prev_chain, $cur_chain, $next_chain);
+ my (@line);
+
+ # A window of three chains (previous, current, next) is kept; a chain is handed to expSeq1Reg() once the chain after it has been read
+ while (<$RH>) {
+  chomp; @line = split;
+
+  # M2 regions (type 3) are passed through as read, with the literal columns "0 0 0 0" inserted after field 18
+  if ($line[0] == 3) {
+   # Printed as one pre-joined string, so the global output field separator $, is neither needed nor reset to undef
+   print {$WH} join(" ", @line[0..17])." 0 0 0 0 ".join(" ", @line[18..$#line])."\n"; next;
+  }
+
+  $prev_chain = $cur_chain;
+  $cur_chain = $next_chain;
+
+  $first_align = []; $last_align = [];
+  ($type, $num_aligns, $$first_align[START1], $$first_align[END1], $$first_align[START2], $$first_align[END2],
+  $$first_align[SEQ1], $$first_align[SEQ2],$$first_align[ORIENT], $$first_align[SCORE],
+  $$last_align[START1], $$last_align[END1], $$last_align[START2], $$last_align[END2],
+  $$last_align[SEQ1], $$last_align[SEQ2], $$last_align[ORIENT], $$last_align[SCORE]) = @line;
+
+  $$first_align[CHALO1] = ($$first_align[START1] < $$last_align[START1] ? $$first_align[START1] : $$last_align[START1]); # lowest seq1 start of the chain's two end aligns
+  $$first_align[CHAHI1] = ($$first_align[END1] > $$last_align[END1] ? $$first_align[END1] : $$last_align[END1]); # highest seq1 end of the chain's two end aligns
+
+  my @saved_line = @line;
+  $next_chain = [$first_align, $last_align, $type, $num_aligns, \@saved_line];
+  next unless defined $cur_chain;
+
+  expSeq1Reg($WH, $prev_chain, $cur_chain, $next_chain, $cur_seq);
+# TODO
+# if ($cur_seq ne $$first_align[SEQ1]) {
+# undef $cur_chain;
+# $cur_seq = $$first_align[SEQ1];
+# }
+ }
+ expSeq1Reg($WH, $cur_chain, $next_chain, undef, $cur_seq); # last chain read; NOTE(review): also called when no chain was read at all - confirm expSeq1Reg() copes with undef
+}
+
+
+# Compute the expanded seq1 interval of $cur_chain and print its record to $WH.
+#   $WH         - output handle
+#   $prev_chain - chain preceding $cur_chain, or undef
+#   $cur_chain  - chain to expand: [first_align, last_align, type, num_aligns, raw fields]
+#   $next_chain - chain following $cur_chain, or undef
+#   $cur_seq    - current sequence name, or undef (then taken from $cur_chain)
+# Reads the file-level $max_expand_len, $sizes1 and $print_chains.
+sub expSeq1Reg($$$$$) {
+ my ($WH, $prev_chain, $cur_chain, $next_chain, $cur_seq) = @_;
+ my ($preexpand1, $postexpand1);
+
+ # gap to the previous chain (or to position 0), capped at $max_expand_len
+ $preexpand1 = $$cur_chain[0][CHALO1] - (defined $prev_chain ? $$prev_chain[0][CHAHI1] : 0);
+ $preexpand1 = $max_expand_len if $preexpand1 > $max_expand_len;
+#$preexpand1 = 0 if $preexpand1 < 0;
+ # a negative gap is also replaced by the maximum expansion
+ $preexpand1 = $max_expand_len if $preexpand1 < 0; # !!!
+ # gap to the next chain; with no next chain the difference is negative and
+ # therefore ends up as $max_expand_len as well
+ $postexpand1 = $$next_chain[0][CHALO1] - $$cur_chain[0][CHAHI1];
+ $postexpand1 = $max_expand_len if $postexpand1 > $max_expand_len;
+#$postexpand1 = 0 if $postexpand1 < 0;
+ $postexpand1 = $max_expand_len if $postexpand1 < 0;
+#$postexpand1 = 0 if defined $prev_chain and $$prev_chain[0][CHAHI1] > $$cur_chain[0][CHAHI1]; # don't expand if covered by another align
+ # expanded interval, clamped to [1, length of the seq1 sequence]
+ $$cur_chain[0][CHALO1E] = $$cur_chain[0][CHALO1] - $preexpand1;
+ $$cur_chain[0][CHALO1E] = 1 if $$cur_chain[0][CHALO1E] < 1;
+ $$cur_chain[0][CHAHI1E] = $$cur_chain[0][CHAHI1] + $postexpand1;
+ $$cur_chain[0][CHAHI1E] = $$sizes1{$$cur_chain[0][SEQ1]} if $$cur_chain[0][CHAHI1E] > $$sizes1{$$cur_chain[0][SEQ1]};
+
+ # NOTE(review): $cur_seq is a local copy; when the caller passes undef it is
+ # set to this chain's own sequence, so the branch below cannot run then.
+ $cur_seq = $$cur_chain[0][SEQ1] if not defined $cur_seq;
+ if ($cur_seq ne $$cur_chain[0][SEQ1]) { # Correct upper expansion
+ $$cur_chain[0][CHAHI1E] = $$cur_chain[0][CHAHI1] + $max_expand_len;
+ $$cur_chain[0][CHAHI1E] = $$sizes1{$$cur_chain[0][SEQ1]} if $$cur_chain[0][CHAHI1E] > $$sizes1{$$cur_chain[0][SEQ1]};
+ }
+
+ # record layout: type, num_aligns, 8 fields of the first alignment, 8 fields
+ # of the last alignment, then CHALO1 CHAHI1 CHALO1E CHAHI1E
+ print $WH $$cur_chain[2]." ".$$cur_chain[3]." ".
+ $$cur_chain[0][START1]." ".$$cur_chain[0][END1]." ".$$cur_chain[0][START2]." ".$$cur_chain[0][END2]." ".
+ $$cur_chain[0][SEQ1]." ".$$cur_chain[0][SEQ2]." ".$$cur_chain[0][ORIENT]." ".$$cur_chain[0][SCORE]." ".
+ $$cur_chain[1][START1]." ".$$cur_chain[1][END1]." ".$$cur_chain[1][START2]." ".$$cur_chain[1][END2]." ".
+ $$cur_chain[1][SEQ1]." ".$$cur_chain[1][SEQ2]." ".$$cur_chain[1][ORIENT]." ".$$cur_chain[1][SCORE]." ".
+ $$cur_chain[0][CHALO1]." ".$$cur_chain[0][CHAHI1]." ".$$cur_chain[0][CHALO1E]." ".$$cur_chain[0][CHAHI1E];
+
+ # optionally copy the remaining raw input fields (index 18 onward), four at a time;
+ # at least one group of four is always printed
+ if ($print_chains) {
+ my $i = 18;
+ while (1) {
+ print $WH " ".${$$cur_chain[4]}[$i]." ".${$$cur_chain[4]}[$i+1]." ".${$$cur_chain[4]}[$i+2]." ".${$$cur_chain[4]}[$i+3];
+ last if @{$$cur_chain[4]} <= $i+4;
+ $i+=4;
+ }
+ }
+ print $WH "\n";
+}
+
+
+# Input: chains ordered by seq2, start2
+# Output: chains expanded on seq1 and seq2 (final output)
+# Stream chain records (already carrying the seq1 columns) from $RH and write
+# each one to $WH with its seq2 extent and expanded seq2 extent appended
+# (the work is done by expSeq2Reg).
+#   $RH - read handle; one whitespace-separated chain record per line
+#   $WH - write handle receiving the annotated records
+# A record is written only after its successor has been read, so that
+# expSeq2Reg can look at both the previous and the next chain.
+sub expandSeq2($$) {
+    my ($RH, $WH) = @_;
+    my ($first_align, $last_align, $type, $num_aligns,
+        $cur_seq, $prev_chain, $cur_chain, $next_chain);
+    my (@line);
+
+    while (<$RH>) {
+        chomp; @line = split;
+
+        # skip M1 regions: pass them through with four zero placeholder
+        # columns inserted after field 21 (built with join so that the
+        # global output separator $, is never touched)
+        if ($line[0] == 1) {
+            print $WH join(" ", @line[0..21])." 0 0 0 0 ".join(" ", @line[22..$#line])."\n";
+            next;
+        }
+
+        $prev_chain = $cur_chain;
+        $cur_chain = $next_chain;
+
+        $first_align = []; $last_align = [];
+        ($type, $num_aligns, $$first_align[START1], $$first_align[END1], $$first_align[START2], $$first_align[END2],
+         $$first_align[SEQ1], $$first_align[SEQ2],$$first_align[ORIENT], $$first_align[SCORE],
+         $$last_align[START1], $$last_align[END1], $$last_align[START2], $$last_align[END2],
+         $$last_align[SEQ1], $$last_align[SEQ2], $$last_align[ORIENT], $$last_align[SCORE],
+         $$first_align[CHALO1], $$first_align[CHAHI1], $$first_align[CHALO1E], $$first_align[CHAHI1E]) = @line;
+
+        # seq2 extent of the chain: smaller start and larger end of its two alignments
+        $$first_align[CHALO2] = ($$first_align[START2] < $$last_align[START2] ? $$first_align[START2] : $$last_align[START2]);
+        $$first_align[CHAHI2] = ($$first_align[END2] > $$last_align[END2] ? $$first_align[END2] : $$last_align[END2]);
+
+        # keep a private copy of the raw fields; @line is reused every iteration
+        my @saved_line = @line;
+        $next_chain = [$first_align, $last_align, $type, $num_aligns, \@saved_line];
+
+        next unless defined $cur_chain;
+        expSeq2Reg($WH, $prev_chain, $cur_chain, $next_chain, $cur_seq);
+#        if ($cur_seq ne $$first_align[SEQ2]) {
+#            undef $cur_chain;
+#            $cur_seq = $$first_align[SEQ2];
+#        }
+    }
+
+    # Flush the last chain read. When the input held no expandable chain at
+    # all there is nothing to flush; calling expSeq2Reg anyway would emit a
+    # record made of empty fields into the next pipeline stage.
+    expSeq2Reg($WH, $cur_chain, $next_chain, undef, $cur_seq) if defined $next_chain;
+}
+
+
+# Compute the expanded seq2 interval of $cur_chain, re-balance the seq1
+# expansion against it, and print the record to $WH.
+#   $WH         - output handle
+#   $prev_chain - chain preceding $cur_chain, or undef
+#   $cur_chain  - chain to expand: [first_align, last_align, type, num_aligns, raw fields]
+#   $next_chain - chain following $cur_chain, or undef
+#   $cur_seq    - current sequence name as supplied by the caller
+# Reads the file-level $max_expand_len, $expand_factor, $sizes1, $sizes2 and
+# $print_chains.
+sub expSeq2Reg($$$$$) {
+ my ($WH, $prev_chain, $cur_chain, $next_chain, $cur_seq) = @_;
+ my ($preexpand1, $postexpand1, $preexpand2, $postexpand2);
+
+ # recover the seq1 expansion amounts from the columns already on the record
+ $preexpand1 = $$cur_chain[0][CHALO1] - $$cur_chain[0][CHALO1E];
+ $postexpand1 = $$cur_chain[0][CHAHI1E] - $$cur_chain[0][CHAHI1];
+
+ # lower seq2 expansion: gap to the previous chain, limited to
+ # $expand_factor times the seq1 expansion (except for type 3 chains) and to
+ # $max_expand_len; a negative gap becomes $max_expand_len
+ $preexpand2 = $$cur_chain[0][CHALO2] - (defined $prev_chain ? $$prev_chain[0][CHAHI2] : 0);
+ $preexpand2 = $preexpand1 * $expand_factor if $preexpand2 > $preexpand1 * $expand_factor and $$cur_chain[2] != 3;
+ $preexpand2 = $max_expand_len if $preexpand2 > $max_expand_len;
+#$preexpand2 = 0 if $preexpand2 < 0;
+ $preexpand2 = $max_expand_len if $preexpand2 < 0;
+ # then limit the seq1 expansion symmetrically against the seq2 one
+ $preexpand1 = $preexpand2 * $expand_factor if $preexpand1 > $preexpand2 * $expand_factor and $$cur_chain[2] != 3;
+ $preexpand1 = $max_expand_len if $preexpand1 > $max_expand_len;
+
+ # upper seq2 expansion, same scheme using the next chain
+ $postexpand2 = $$next_chain[0][CHALO2] - $$cur_chain[0][CHAHI2];
+ $postexpand2 = $postexpand1 * $expand_factor if $postexpand2 > $postexpand1 * $expand_factor and $$cur_chain[2] != 3;
+ $postexpand2 = $max_expand_len if $postexpand2 > $max_expand_len;
+#$postexpand2 = 0 if $postexpand2 < 0;
+ $postexpand2 = $max_expand_len if $postexpand2 < 0;
+ $postexpand1 = $postexpand2 * $expand_factor if $postexpand1 > $postexpand2 * $expand_factor and $$cur_chain[2] != 3;
+ $postexpand1 = $max_expand_len if $postexpand1 > $max_expand_len;
+
+ # expanded seq1 interval, clamped to [1, length of the seq1 sequence]
+ $$cur_chain[0][CHALO1E] = $$cur_chain[0][CHALO1] - $preexpand1;
+ $$cur_chain[0][CHALO1E] = 1 if $$cur_chain[0][CHALO1E] < 1;
+ $$cur_chain[0][CHAHI1E] = $$cur_chain[0][CHAHI1] + $postexpand1;
+ $$cur_chain[0][CHAHI1E] = $$sizes1{$$cur_chain[0][SEQ1]} if $$cur_chain[0][CHAHI1E] > $$sizes1{$$cur_chain[0][SEQ1]};
+
+ # expanded seq2 interval, clamped to [1, length of the seq2 sequence]
+ $$cur_chain[0][CHALO2E] = $$cur_chain[0][CHALO2] - $preexpand2;
+ $$cur_chain[0][CHALO2E] = 1 if $$cur_chain[0][CHALO2E] < 1;
+ $$cur_chain[0][CHAHI2E] = $$cur_chain[0][CHAHI2] + $postexpand2;
+ $$cur_chain[0][CHAHI2E] = $$sizes2{$$cur_chain[0][SEQ2]} if $$cur_chain[0][CHAHI2E] > $$sizes2{$$cur_chain[0][SEQ2]};
+ # NOTE(review): $cur_seq is not defaulted to the chain's own sequence here
+ # (expSeq1Reg does that); if the caller passes undef, this branch runs for
+ # every chain and overrides CHAHI2E -- confirm this is intended.
+ if ($cur_seq ne $$cur_chain[0][SEQ2]) { # Correct upper expansion
+ $postexpand2 = $postexpand1 * $expand_factor;
+ $postexpand2 = $max_expand_len if $postexpand2 > $max_expand_len;
+ $postexpand2 = 0 if $postexpand2 < 0;
+ $$cur_chain[0][CHAHI2E] = $$cur_chain[0][CHAHI2] + $postexpand2;
+ $$cur_chain[0][CHAHI2E] = $$sizes2{$$cur_chain[0][SEQ2]} if $$cur_chain[0][CHAHI2E] > $$sizes2{$$cur_chain[0][SEQ2]};
+ }
+
+ # record layout: type, num_aligns, 8 fields of the first alignment, 8 fields
+ # of the last alignment, the four seq1 columns, then the four seq2 columns
+ print $WH $$cur_chain[2]." ".$$cur_chain[3]." ".
+ $$cur_chain[0][START1]." ".$$cur_chain[0][END1]." ".$$cur_chain[0][START2]." ".$$cur_chain[0][END2]." ".
+ $$cur_chain[0][SEQ1]." ".$$cur_chain[0][SEQ2]." ".$$cur_chain[0][ORIENT]." ".$$cur_chain[0][SCORE]." ".
+ $$cur_chain[1][START1]." ".$$cur_chain[1][END1]." ".$$cur_chain[1][START2]." ".$$cur_chain[1][END2]." ".
+ $$cur_chain[1][SEQ1]." ".$$cur_chain[1][SEQ2]." ".$$cur_chain[1][ORIENT]." ".$$cur_chain[1][SCORE]." ".
+ $$cur_chain[0][CHALO1]." ".$$cur_chain[0][CHAHI1]." ".$$cur_chain[0][CHALO1E]." ".$$cur_chain[0][CHAHI1E]." ".
+ $$cur_chain[0][CHALO2]." ".$$cur_chain[0][CHAHI2]." ".$$cur_chain[0][CHALO2E]." ".$$cur_chain[0][CHAHI2E];
+ # optionally copy the remaining raw input fields (index 22 onward), four at a time;
+ # at least one group of four is always printed
+ if ($print_chains) {
+ my $i = 22;
+ while (1) {
+ print $WH " ".${$$cur_chain[4]}[$i]." ".${$$cur_chain[4]}[$i+1]." ".${$$cur_chain[4]}[$i+2]." ".${$$cur_chain[4]}[$i+3];
+ last if @{$$cur_chain[4]} <= $i+4;
+ $i+=4;
+ }
+ }
+ print $WH "\n";
+}
+
+
+# Finish the expansion of $cur_chain and print its final output line to $WH.
+# For type 1 (M1) chains the seq2 expansion is derived from the seq1 one; for
+# type 3 (M2) chains the seq1 expansion is derived from the seq2 one; other
+# types are printed with the expanded columns they already carry.
+#   $WH         - output handle
+#   $cur_chain  - chain to print: [first_align, last_align, type, num_aligns, raw fields]
+#   $prev_chain, $next_chain, $cur_seq - accepted but not used in this sub
+# Reads the file-level $expand_factor, $max_expand_len, $sizes1, $sizes2,
+# $no_aligntotals and $print_chains.
+sub finalExpReg($$$$$) {
+ my ($WH, $prev_chain, $cur_chain, $next_chain, $cur_seq) = @_;
+ my ($preexpand1, $postexpand1, $preexpand2, $postexpand2);
+ if ($$cur_chain[2] == 1) { # M1: expand in seq1 on seq2 expands * factor only
+ # seq2 expansion = seq1 expansion * $expand_factor, capped at $max_expand_len
+ $preexpand1 = $$cur_chain[0][CHALO1] - $$cur_chain[0][CHALO1E];
+ $preexpand2 = $preexpand1 * $expand_factor;
+ $preexpand2 = $max_expand_len if $preexpand2 > $max_expand_len;
+ $postexpand1 = $$cur_chain[0][CHAHI1E] - $$cur_chain[0][CHAHI1];
+ $postexpand2 = $postexpand1 * $expand_factor;
+ $postexpand2 = $max_expand_len if $postexpand2 > $max_expand_len;
+ # clamp the seq2 interval to [1, length of the seq2 sequence]
+ $$cur_chain[0][CHALO2E] = $$cur_chain[0][CHALO2] - $preexpand2;
+ $$cur_chain[0][CHALO2E] = 1 if $$cur_chain[0][CHALO2E] < 1;
+ $$cur_chain[0][CHAHI2E] = $$cur_chain[0][CHAHI2] + $postexpand2;
+ $$cur_chain[0][CHAHI2E] = $$sizes2{$$cur_chain[0][SEQ2]} if $$cur_chain[0][CHAHI2E] > $$sizes2{$$cur_chain[0][SEQ2]};
+ } elsif ($$cur_chain[2] == 3) { # M2: expand in seq2 on seq1 expands * factor only
+ # seq1 expansion = seq2 expansion * $expand_factor, capped at $max_expand_len
+ $preexpand2 = $$cur_chain[0][CHALO2] - $$cur_chain[0][CHALO2E];
+ $preexpand1 = $preexpand2 * $expand_factor;
+ $preexpand1 = $max_expand_len if $preexpand1 > $max_expand_len;
+ $postexpand2 = $$cur_chain[0][CHAHI2E] - $$cur_chain[0][CHAHI2];
+ $postexpand1 = $postexpand2 * $expand_factor;
+ $postexpand1 = $max_expand_len if $postexpand1 > $max_expand_len;
+ # clamp the seq1 interval to [1, length of the seq1 sequence]
+ $$cur_chain[0][CHALO1E] = $$cur_chain[0][CHALO1] - $preexpand1;
+ $$cur_chain[0][CHALO1E] = 1 if $$cur_chain[0][CHALO1E] < 1;
+ $$cur_chain[0][CHAHI1E] = $$cur_chain[0][CHAHI1] + $postexpand1;
+ $$cur_chain[0][CHAHI1E] = $$sizes1{$$cur_chain[0][SEQ1]} if $$cur_chain[0][CHAHI1E] > $$sizes1{$$cur_chain[0][SEQ1]};
+ }
+
+ # output line: seq1 name and expanded interval, seq2 name and expanded
+ # interval, orientation
+ print $WH $$cur_chain[0][SEQ1]." ".$$cur_chain[0][CHALO1E]." ".$$cur_chain[0][CHAHI1E]." ".
+ $$cur_chain[0][SEQ2]." ".$$cur_chain[0][CHALO2E]." ".$$cur_chain[0][CHAHI2E]." ".$$cur_chain[0][ORIENT];
+ # chain type label (1 => M1, 2 => DM, anything else => M2) and alignment count
+ print $WH " (".($$cur_chain[2]==1?"M1, ":$$cur_chain[2]==2?"DM, ":"M2, ").$$cur_chain[3]." aligns)" unless $no_aligntotals;
+ # optionally list the raw fields from index 26 onward as [a-b=c-d] groups;
+ # at least one group is always printed
+ if ($print_chains) {
+ my $i = 26;
+ while (1) {
+ print $WH " [".${$$cur_chain[4]}[$i]."-".${$$cur_chain[4]}[$i+1]."=".${$$cur_chain[4]}[$i+2]."-".${$$cur_chain[4]}[$i+3]."]";
+ last if @{$$cur_chain[4]} <= $i+4;
+ $i+=4;
+ }
+ }
+ print $WH "\n";
+}
+
+
+# Read fully annotated chain records from $RH, write one final output line
+# per chain to $WH (via finalExpReg) and, when $debug or $print_stats is set,
+# print per-type chain statistics to STDOUT.
+#   $RH - read handle; one whitespace-separated chain record per line
+#   $WH - write handle for the final output
+# The statistics describe exactly the chains that were written: type,
+# alignment count and orientation are all taken from the written chain, and
+# the last chain is counted too.
+sub finalExpand($$) {
+    my ($RH, $WH) = @_;
+    my ($first_align, $last_align, $type, $num_aligns,
+        $cur_seq, $prev_chain, $cur_chain, $next_chain);
+    my %stats;
+    my (@line);
+
+    # Add one written chain to %stats under its type label.
+    my $tally = sub {
+        my ($chain) = @_;
+        my $count = $$chain[3];
+        my $label = $$chain[2] == 1 ? "M1" : $$chain[2] == 2 ? "DM" : "M2";
+        $stats{$label.($$chain[0][ORIENT] eq "+" ? "+" : "-")}++;
+        $stats{$label."min"} = $count if not defined $stats{$label."min"} or $stats{$label."min"} > $count;
+        $stats{$label."max"} = $count if not defined $stats{$label."max"} or $stats{$label."max"} < $count;
+        $stats{$label."mean"} += $count;
+    };
+
+    while (<$RH>) {
+        chomp; @line = split;
+
+        $prev_chain = $cur_chain;
+        $cur_chain = $next_chain;
+
+        $first_align = []; $last_align = [];
+        ($type, $num_aligns, $$first_align[START1], $$first_align[END1], $$first_align[START2], $$first_align[END2],
+         $$first_align[SEQ1], $$first_align[SEQ2],$$first_align[ORIENT], $$first_align[SCORE],
+         $$last_align[START1], $$last_align[END1], $$last_align[START2], $$last_align[END2],
+         $$last_align[SEQ1], $$last_align[SEQ2], $$last_align[ORIENT], $$last_align[SCORE],
+         $$first_align[CHALO1], $$first_align[CHAHI1], $$first_align[CHALO1E], $$first_align[CHAHI1E],
+         $$first_align[CHALO2], $$first_align[CHAHI2], $$first_align[CHALO2E], $$first_align[CHAHI2E]) = @line;
+
+        # The side that was passed through with zero placeholders has no
+        # extent yet; derive it from the two alignments.
+        if ($type == 1) {
+            $$first_align[CHALO2] = ($$first_align[START2] < $$last_align[START2] ? $$first_align[START2] : $$last_align[START2]);
+            $$first_align[CHAHI2] = ($$first_align[END2] > $$last_align[END2] ? $$first_align[END2] : $$last_align[END2]);
+        } elsif ($type == 3) {
+            $$first_align[CHALO1] = ($$first_align[START1] < $$last_align[START1] ? $$first_align[START1] : $$last_align[START1]);
+            $$first_align[CHAHI1] = ($$first_align[END1] > $$last_align[END1] ? $$first_align[END1] : $$last_align[END1]);
+        }
+
+        # keep a private copy of the raw fields; @line is reused every iteration
+        my @saved_line = @line;
+        $next_chain = [$first_align, $last_align, $type, $num_aligns, \@saved_line];
+
+        next unless defined $cur_chain;
+
+        finalExpReg($WH, $prev_chain, $cur_chain, $next_chain, $cur_seq);
+        $tally->($cur_chain) if $debug or $print_stats;
+
+        # on a change of seq2 name, forget the chain just written so that it
+        # does not become $prev_chain of the next one
+        if ($cur_seq ne $$first_align[SEQ2]) {
+            undef $cur_chain;
+            $cur_seq = $$first_align[SEQ2];
+        }
+    }
+
+    # Flush the last chain read; with empty input there is nothing to write.
+    if (defined $next_chain) {
+        finalExpReg($WH, $cur_chain, $next_chain, undef, $cur_seq);
+        $tally->($next_chain) if $debug or $print_stats;
+    }
+
+    if ($debug or $print_stats) {
+        foreach my $i ("DM", "M1", "M2") {
+            $stats{$i."mean"} /= ($stats{$i."+"} + $stats{$i."-"}) unless ($stats{$i."+"} + $stats{$i."-"} == 0);
+            print $i.": ".($stats{$i."+"} + $stats{$i."-"})." chains (".$stats{$i."+"}."+, ".$stats{$i."-"}."-); ".
+                "length min ".$stats{$i."min"}.", avg ".$stats{$i."mean"}.", max ".$stats{$i."max"}."\n";
+        }
+    }
+}
+
+
+# Called only in a "$0 worker" invocation
+# Worker entry point: unpack a job tarball into a private temporary directory,
+# run $SLAGAN on every extracted file, and append the *.glocal-out results to
+# "<job>.results.tar" in the directory the worker was started from.
+#   $tar_file   - job tarball name, relative to the current directory
+#   $score_file - path of the score file handed to $SLAGAN
+#   $SLAGAN     - command used to run SLAGAN
+#   $debug      - true to print progress information
+# Reads the file-level $worker_tmp_dir, $VERSION and $nodelete.
+sub workerRun($$$$) {
+    my ($tar_file, $score_file, $SLAGAN, $debug) = @_;
+    my ($tmp_dir, $io_dir) = ($worker_tmp_dir, getcwd);
+    local *FH;
+
+    mkdir($tmp_dir) or die("$0 (worker): Could not create directory $tmp_dir: ".$!);
+
+    # Work on a private copy of the score file. The base name is captured
+    # directly from the match, so a path without any "/" works as well and a
+    # failed match can never leave a stale $1 behind.
+    copy($score_file, $tmp_dir);
+    my ($score_base) = ($score_file =~ /([^\/]+)$/);
+    $score_file = $tmp_dir.$score_base if defined $score_base;
+
+    print("$0 (worker): Version ".$VERSION." started ".localtime()."\n") if $debug;
+    print("$0 (worker): Jobfile=$tar_file, scorefile=$score_file, tmpdir=$tmp_dir, iodir=$io_dir, SLAGAN=$SLAGAN\n") if $debug;
+
+    move($io_dir."/".$tar_file, $tmp_dir);
+    my @files = `cd $tmp_dir; tar -xvf $tar_file` or warn("$0 (worker): Error extracting $tar_file");
+    foreach my $file (@files) {
+        chomp $file;
+#print "$SLAGAN $tmp_dir$file $score_file > $tmp_dir$file.glocal-out 2> $tmp_dir$file.glocal-err\n";
+        system("$SLAGAN $tmp_dir$file $score_file ".
+               "> $tmp_dir$file.glocal-out ".
+               "2> $tmp_dir$file.glocal-err");
+    }
+
+    # job name = tarball name without its ".tar" suffix
+    $tar_file =~ s/\.tar$//;
+    if (open(FH, "| cd $tmp_dir; xargs tar --append --file=$io_dir/$tar_file.results.tar")) {
+        foreach my $file (glob("$tmp_dir/*glocal-out")) { $file =~ /\/([^\/]+)$/; print FH $1." "; }
+        close FH;
+    } else {
+        warn("$0 (worker): Could not start tar to collect results: ".$!);
+    }
+
+    rmtree $tmp_dir;
+    # core files in the working directory are counted as SLAGAN crashes
+    opendir(DIR, "."); if (my @x = grep(/core\./,readdir(DIR))) { warn("$0 (worker): WARNING: $SLAGAN crashed ".scalar(@x)." times"); } closedir DIR;
+    unlink(glob("core.*")) unless $nodelete;
+}
+
+
+# Interrupt handler
+# Signal handler: announce the signal, ask clust_run (if running) to dequeue
+# its cluster jobs, remove the per-job files unless $debug or $nodelete is
+# set, and exit with status 1.
+#   $_[0] - name of the received signal (without the "SIG" prefix)
+sub dequeueClustJobs($) {
+    my ($signal_name) = @_;
+    print "\n$0: Received SIG".$signal_name.". Cleaning up... ";
+
+    # send SIGQUIT to clust_run so it can dequeue cluster jobs
+    kill("QUIT", $clust_run_pid) if $clust_run_pid;
+
+    if (not ($debug or $nodelete)) {
+        print "Removing job files...";
+        foreach my $job (1..$num_jobs-1) {
+            foreach my $name ("JOB".$job.".tar", "JOB".$job.".results.tar",
+                              "CLUSTER_JOB_MESSAGES.$job", "CLUSTER_JOB_ERRMSG.$job") {
+                unlink $tmp_dir.$name;
+            }
+        }
+
+        unlink "$tmp_dir$input_glob.chaos";
+        unlink $tmp_dir."CLUSTER_JOB_PARAMS";
+        # a worker owns its whole temporary directory
+        if ($ARGV[0] eq "worker") {
+            rmtree($tmp_dir);
+        }
+    }
+    print "\n";
+    exit(1);
+}
+
+
+# Retrieve sequence length data from GPDB
+# Build a hash of sequence name => sequence length for one genome.
+#   $dbh    - database handle object providing get_data_set, get_family_dset,
+#             get_chroms, find_seq, selectcol and get_contig_size
+#   $genome - an all-digit dataset id, or a name passed to get_family_dset
+# Returns a reference to the hash. Prints a summary line when the file-level
+# $debug is set.
+sub get_all_seqs($$) {
+ my ($dbh, $genome) = @_;
+ my ($dset, $annot_db, $family, $check_chroms, %sizes, $chroms, @real_chroms,
+ $ctgs);
+
+ # all-digit argument: it is the dataset id itself and elements 4 and 14 of
+ # get_data_set() supply the rest; otherwise elements 0, 4 and 14 of
+ # get_family_dset() supply all three values
+ ($dset, $annot_db, $family) = ($genome =~ /^\d+$/o) ?
+ ($genome + 0, ($dbh->get_data_set($genome))[4,14]) :
+ ($dbh->get_family_dset($genome))[0,4,14];
+ print "$0: Genome $genome, dataset $dset, annotation db \"$annot_db\", family \"$family\"\n" if $debug;
+ # chromosomes are only looked up when an annotation db is present
+ $annot_db and $check_chroms = 1;
+ if ($check_chroms) {
+ $chroms = $dbh->get_chroms(($dbh->get_data_set($dset))[2]);
+ foreach my $chrom (@$chroms) {
+ # only entries whose element 1 equals 1 are used
+ # NOTE(review): the meaning of that flag is not visible here -- confirm
+ $$chrom[1] == 1 or next;
+ my $name = "chr$$chrom[2]";
+ my ($chr_id, $chr_type, $ctg_id, $size) =
+ $dbh->find_seq($name, $dset, $annot_db);
+ # record the size only when find_seq returned a true id
+ $chr_id and $sizes{$name} = $size;
+ }
+ }
+ # every distinct, non-empty contig name of the dataset
+ $ctgs = $dbh->selectcol("SELECT name FROM dset$dset\_contigs " .
+ "WHERE name is not null and name != ? group by name", undef, "");
+ foreach my $ctg (@$ctgs) {
+ $sizes{$ctg} = $dbh->get_contig_size($dset, $ctg);
+ }
+ return \%sizes;
+}
+
+
+# Sort comparator (works on the sort variables $a and $b) that orders names
+# such as "chr2" before "chr10": after the longest common prefix that
+# contains no digit, two numeric parts are compared as numbers and the text
+# after them as strings; in every other case the whole names are compared
+# as strings.
+sub alnum {
+    my ($len_a, $len_b) = (length($a), length($b));
+
+    # advance to the first position that holds a digit or a difference
+    my $pos = 0;
+    while (($pos < $len_a) && ($pos < $len_b)) {
+        my $char_a = substr($a, $pos, 1);
+        my $char_b = substr($b, $pos, 1);
+        last if ($char_a =~ /^\d/o) || ($char_b =~ /^\d/o) || ($char_a ne $char_b);
+        ++$pos;
+    }
+
+    my $tail_a = ($pos < $len_a) ? substr($a, $pos) : "";
+    my $tail_b = ($pos < $len_b) ? substr($b, $pos) : "";
+    my ($num_a, $rest_a) = ($tail_a =~ /^(\d+)(.*)$/o);
+    my ($num_b, $rest_b) = ($tail_b =~ /^(\d+)(.*)$/o);
+
+    if (defined($num_a) && defined($num_b)) {
+        return (($num_a <=> $num_b) || ($rest_a cmp $rest_b));
+    }
+    return ($a cmp $b);
+}
+
+
+# Guess the format of an input file from its first line.
+#   $file - path of the input file
+# Returns 0 when the line has the "name n n; name n n; score" layout, 1 when
+# it starts with "name n n name "; dies for anything else or when the file
+# cannot be opened.
+sub isBLAT($) {
+    my ($file) = @_;
+
+    open(my $in, "< ".$file) or die("$0: Cannot open input file $file: ".$!);
+    my $first_line = <$in>;
+    close $in;
+
+    return 0 if $first_line =~ /\A.+\s[\d]+\s[\d]+\;\s.+\s[\d]+\s[\d]+\;\sscore/;
+    return 1 if $first_line =~ /\A[^\s]+\s[\d]+\s[\d]+\s[^\s]+\s/;
+    die("$0: Unknown input format in $file. Stopped");
+}
+
+
+# Extract min_seq_score from a SLAGAN score file: the first number in the
+# second brace group of the "{+U+;...}{...}" line. When several lines match,
+# the last one wins.
+#   $file - path of the score file
+# Returns the score; dies when the file cannot be opened or holds no usable
+# (non-zero) score. Prints the score when the file-level $debug is set.
+sub getMinSeqScore($) {
+    my ($file) = @_;
+    my $score; local *FH;
+    open(FH, "< ".$file) or die("$0: Could not open SLAGAN scorefile $file: $!");
+    while (<FH>) {
+        # sample line: {+U+;+U-;-U+;-U-}{70000 0 0 0}
+        # $1 is only consulted when this very match succeeded; after a failed
+        # match it would still hold the capture of some earlier, unrelated match
+        $score = $1 if /\{\+U\+\;.+\}.*\{(\d+)\s.+\}/ and $1;
+    }
+    close FH;
+    die("$0: Could not determine min_seq_score from SLAGAN scorefile $file. Stopped") unless $score;
+    print "$0: min_seq_score: $score\n" if $debug;
+    return $score;
+}
+
+
+# Write a sequence-length table to a file, one "name<TAB>length" line per
+# sequence, ordered with the alnum comparator.
+#   $sizes   - reference to a hash of sequence name => length
+#   $outfile - path of the file to create
+# Dies when the file cannot be opened for writing.
+sub writeSizes($$) {
+    my ($sizes, $outfile) = @_; local *FH;
+    open(FH, "> ".$outfile) or die("$0: Could not open file $outfile for writing: ".$!);
+    # iterate over the table that was passed in, not over the file-level
+    # $sizes1, so that the sub also works for any other size table
+    foreach my $key (sort alnum keys %$sizes) {
+        print FH $key."\t".$$sizes{$key}."\n";
+    }
+    close FH;
+}
+
+
+# Borrowed from if.pm to enable standalone conditional module loading on earlier versions of Perl
+# Conditionally load a module and call its import method.
+#   first argument  - condition; nothing happens when it is false
+#   second argument - package name, e.g. "Foo::Bar"
+#   further arguments are handed on to the import method
+sub useIf($$) {
+ my $method = 'import';
+ return unless shift; # CONDITION
+
+ # after the shift above, @_ starts with the package name
+ my $package = $_[0];
+ # turn "Foo::Bar" into the file name "Foo/Bar.pm"
+ (my $file = $package.".pm") =~ s!::!/!g;
+ require $file;
+ my $method_entry_point = $package->can($method);
+ # goto &sub replaces this call frame and passes the current @_ unchanged
+ goto &$method_entry_point if $method_entry_point;
+}
+
+
+# Normalise an alignment in place so that on both sequences the start
+# coordinate is not greater than the end coordinate.
+#   $_[0] - reference to the alignment array (indexed by START1, END1, ...)
+sub checkAlignCoords($) {
+    my ($align) = @_;
+
+    # each entry names the (start, end) slots of one sequence
+    foreach my $slots ([START1, END1], [START2, END2]) {
+        my ($from, $to) = @$slots;
+        next unless $$align[$from] > $$align[$to];
+        @$align[$from, $to] = @$align[$to, $from];
+    }
+
+#    if ($$cur_align[OSTART1] > $$cur_align[OEND1]) { my $i = $$cur_align[OSTART1]; $$cur_align[OSTART1] = $$cur_align[OEND1]; $$cur_align[OEND1] = $i; }
+#    if ($$cur_align[OSTART2] > $$cur_align[OEND2]) { my $i = $$cur_align[OSTART2]; $$cur_align[OSTART2] = $$cur_align[OEND2]; $$cur_align[OEND2] = $i; }
+}
+
+
+=head1 NAME
+
+Supermap: Piecewise monotonic alignment map generator for shuffle-lagan
+
+=head1 SYNOPSIS
+
+supermap.pl (gen2=id | sizes2=filename) (gen1=id | sizes1=filename)
+[-infile=<file>] [-outfile=<file>] [-bacteria] [-score=filename] [-f]
+[file1 file2 ...]
+
+=head1 EXAMPLES
+
+supermap.pl -sizes1=human.sizes -sizes2=mouse.sizes hm.chr*.chaos
+
+=head1 DESCRIPTION
+
+Supermap is a whole-genome alignment map generator. It is an extension to the
+Shuffle-LAGAN suite (Brudno et al., 2003). Supermap removes the asymmetry between
+the query genomes by running multiple SLAGAN passes and combining them into a full
+two-genome alignment.
+
+To run Supermap without the Berkeley Genome Pipeline functionality, you will need
+sequence length files for each of the genomes. Each file should contain one sequence
+length entry per line, of the form "sequence_name sequence_length".
+
+In the CHAOS output format (this program's input), negative orientation always means that the second pair of coordinates is inverted.
+In this program's output, negative orientation does not invert coordinates (coordinate pairs are always ascending).
+
+Run supermap.pl with no arguments to see a further description.
+
+The terms "hit" and "anchor" usually refer to local alignments produced by CHAOS or another program.
+The term "chain" refers to an extended union of a number of these local alignments.
+
+=head1 DEPENDENCIES
+
+Supermap depends on Utils.pm, SLAGAN, and a number of Unix utilities.
+
+To use the Berkeley Genome Pipeline and cluster functionality, Supermap needs
+GPutils.pm, GPDBI.pm, and clust_run.
+
+=head1 LIMITATIONS
+
+Supermap is designed to allow the manipulation of large datasets in a reasonable memory footprint.
+To do this, it allows multiple files on input and keeps most of its intermediate data in small temporary files.
+However, one current limitation is that the alignments for any sequence in either genome must fit into the largest
+addressable file size (typically 2GB), and the output alignments must also fit in that size (the remainder will be truncated).
+
+=head1 BUGS
+
+=head1 TODO
+
+TODO: bacteria description, examples, other input formats
+TODO: installer routine
+TODO: discuss input glob parameters
+TODO: local multithreading
+TODO: ignore escaped slashes when splitting dir/file (copy Alex)
+TODO: check for ++ etc in SLAGAN out
+TODO: .supermaprc file for score files, etc
+TODO: hazelton.lbl.gov/bugzilla for supermap
+
+=head1 AUTHOR
+
+Andrey Kislyuk L<mailto:kislyuk at ocf.berkeley.edu>.
+
+=cut
diff --git a/test.score b/test.score
new file mode 100644
index 0000000..e59ea9f
--- /dev/null
+++ b/test.score
@@ -0,0 +1,5 @@
+{+R+;-L-}{0 0.02 0 0;5000 0 0 0}
+{+R-;-L+}{200 0 0.1 0.02;5000 0 0 0}
+{-R+;+L-}{3000 0 0.5 0.02;5000 0 0 0}
+{+L+;-R-}{3000 0 0.5 0.02;5000 0 0 0}
+{+U+;+U-;-U+;-U-}{5000 0 0 0}
diff --git a/utils/Utils.pm b/utils/Utils.pm
new file mode 100644
index 0000000..e4e7214
--- /dev/null
+++ b/utils/Utils.pm
@@ -0,0 +1,553 @@
+#!/usr/bin/env perl
+
+package Utils;
+require 5.000;
+
+use strict;
+use Exporter;
+use Cwd;
+use IO::File;
+use POSIX qw(setsid);
+use Sys::Syslog qw(:DEFAULT setlogsock);
+
+# Forward declarations, so that the prototypes are known wherever the
+# functions are called further down in this file.
+sub Trim( @ );
+sub Lock_File( $ ; $ $ $ );
+sub Unlock_File( $ );
+sub Write_Log( $ $ ; $ $ );
+sub Parse_Filename( $ );
+sub Get_Abs_Path( $ );
+sub Expand_Path( $ );
+sub Get_Random_Key( ; $ );
+sub Hex2Ascii( $ );
+sub Ascii2Hex( $ );
+sub Get_Config_Record( $ $ );
+sub Round( $ );
+sub Set_Log( $ $ );
+sub Log( $ $ );
+sub Min( $ $ );
+sub Max( $ $ );
+sub Reg_Diff( $ $ ; $ $ $ $ $ );
+sub Reg_Rem_Overlap( $ ; $ $ $ );
+sub Reg_Sort( $ ; $ $ $ );
+sub Reg_Intersect( $ $ ; $ $ $ $ $ );
+sub Reg_Merge( $ ; $ $ $ );
+
+use vars qw(@ISA @EXPORT $VERSION $JOB $Error $Syslog $Facility $Msg_Prefix);
+
+# Names exported into every package that uses this module.
+@ISA = qw(Exporter);
+@EXPORT = qw(Trim Lock_File Unlock_File Write_Log Parse_Filename
+ Get_Abs_Path Expand_Path Hex2Ascii Ascii2Hex Get_Config_Record
+ Get_Random_Key Round Set_Log Log Min Max Reg_Diff Reg_Rem_Overlap
+ Reg_Sort Reg_Intersect Reg_Merge redirect_err2log openlogs safe_glob
+ daemon wr_log wr_err start_watcher confirm $JOB);
+
+# $VERSION is taken from the revision number inside the RCS id string.
+my $Id = '$Id: Utils.pm,v 1.21 2005/01/07 23:08:59 poliakov Exp $';
+($VERSION) = ($Id =~ /,v\s+(\d+\S+)/o);
+# Exported regular expression (as a string) with four capture groups.
+$JOB = '^(\S+)\@(\S+?)_(\d{4})(?:_(.+)|)$';
+
+$Error = 0;          # last error message set by the functions in this module
+$Syslog = 0;         # when true, wr_log sends its messages to syslog
+$Facility = "user";  # syslog facility used by stderr2log and openlogs
+$Msg_Prefix = undef; # optional code ref; its result prefixes wr_log/wr_err messages
+
+my $E_FORK = "cannot fork";
+my @LOG_FILE = ();   # log targets registered with Set_Log, indexed by log number
+my %Locks = ();      # lock files currently held by this process
+
+# Strip leading and trailing whitespace from every argument, in place (the
+# caller's own variables are modified).
+sub Trim( @ ) {
+    foreach my $item (@_) {
+        $item =~ s/^\s+//;
+        $item =~ s/\s+$//;
+    }
+}
+
+# Acquire an advisory lock on $file by exclusively creating "$file.lock".
+#   $file      - file to lock (must not be empty or end in "/")
+#   $retry     - keep trying while the lock is held elsewhere (default 1)
+#   $timeout   - seconds after which retrying gives up (default 1200);
+#                a value <= 0 disables the time limit
+#   $max_mtime - age in seconds after which an existing lock file is removed
+#                (default: half of $timeout, or 0 when $timeout <= 0);
+#                a value <= 0 disables the removal
+# Returns 1 on success, and also when this process already holds the lock;
+# returns 0 with $Error set otherwise.
+sub Lock_File( $ ; $ $ $ ) {
+ my ($file, $retry, $timeout, $max_mtime) = @_;
+ my ($lock_fh, $start_time, $mtime);
+
+ if (!$file || ($file =~ /\/$/o)) {
+ $Error = "Invalid filename";
+ return 0;
+ }
+ # from here on $file is the absolute path of the lock file itself
+ $file = Get_Abs_Path("$file.lock");
+ if (exists($Locks{$file})) { $Error = "Already locked"; return 1; }
+ # the directory that will hold the lock file must be writable
+ if (!-w (Parse_Filename($file))[0]) {
+ $Error = "Permission denied";
+ return 0;
+ }
+ if (!defined($retry)) { $retry = 1; }
+ if (!defined($timeout)) { $timeout = 1200; }
+ if (!defined($max_mtime)) {
+ $max_mtime = ($timeout > 0) ? int($timeout / 2) : 0;
+ }
+ $start_time = time();
+ LOCK: {
+ # O_CREAT|O_EXCL: the open only succeeds when the lock file does not exist yet
+ if (!($lock_fh = IO::File->new($file, O_RDWR|O_CREAT|O_EXCL))) {
+ if (!$retry || (($timeout > 0) && ((time() - $start_time) > $timeout))) {
+ $Error = "Locked by someone else";
+ return 0;
+ }
+ # remove a lock file that is older than $max_mtime seconds
+ if ($max_mtime > 0) {
+ $mtime = (stat($file))[9];
+ if ($mtime && ((time() - $mtime) > $max_mtime)) { unlink($file); }
+ }
+ # retry immediately; there is no sleep between attempts
+ redo LOCK;
+ }
+ }
+ $lock_fh->close();
+ $Locks{$file} = 1;
+ return 1;
+}
+
+# Release a lock taken with Lock_File by deleting "$file.lock".
+#   $file - the file that was locked
+# Returns 1 on success; returns 0 with $Error set when the name is empty,
+# the lock is not held by this process, or the lock file cannot be removed.
+sub Unlock_File( $ ) {
+    my ($file) = @_;
+
+    unless ($file) {
+        $Error = "Invalid filename";
+        return 0;
+    }
+
+    my $lock_path = Get_Abs_Path("$file.lock");
+    unless (exists($Locks{$lock_path})) {
+        $Error = "Not locked";
+        return 0;
+    }
+    unless (unlink($lock_path)) {
+        $Error = "Cannot unlock";
+        return 0;
+    }
+
+    delete($Locks{$lock_path});
+    return 1;
+}
+
+# The enclosing block determines the short host name once and keeps it
+# private to Write_Log.
+{
+ my $Uname;
+ # use the first executable uname found in the usual system directories
+ foreach my $dir ('/bin', '/sbin', '/usr/bin', '/usr/sbin') {
+ -x "$dir/uname" and $Uname = "$dir/uname", last;
+ }
+ my $Host = $Uname ? `$Uname -n` : 'localhost';
+ chomp($Host);
+ # keep only the part before the first dot
+ ($Host) = ($Host =~ /^([^\.]+)(\..*)?$/);
+
+# Append one line "<date> <host> <name>[<pid>]: <msg>" to a log.
+#   $log_file - an open filehandle (used as is), or a file name (the file is
+#               locked, appended to and unlocked); '/dev/null' is a no-op
+#   $msg      - message text; a trailing newline is removed
+#   $name     - program name to report (default $0)
+#   $pid      - process id to report (default $$)
+# Returns true on success. Returns 0 when an argument is missing or the file
+# cannot be locked, and false (with $Error set from $!) on an I/O error.
+sub Write_Log( $ $ ; $ $ ) {
+ no strict "refs";
+ my ($log_file, $msg, $name, $pid) = @_;
+ my $error = 0;
+ my $date;
+ local *LOG;
+
+ if (!defined($log_file) || !defined($msg)) { return 0; }
+ # an IO slot in the glob means $log_file is (or names) an open filehandle
+ if (*{$log_file}{IO}) {
+ *LOG = *{$log_file}{IO};
+ } elsif ($log_file eq '/dev/null') {
+ return 1;
+ } else {
+ if (!Lock_File($log_file)) { return 0; }
+ if (!open(LOG, ">> $log_file")) { $error = 1; }
+ }
+ if (!$error) {
+ chomp($msg);
+ $date = localtime(time());
+ if (!$name) { $name = $0; }
+ if (!$pid) { $pid = $$; }
+ if (!print LOG "$date $Host $name\[$pid\]: $msg\n") { $error = 1; }
+ # only close what was opened here, never the caller's handle
+ if (!*{$log_file}{IO}) { close(LOG); }
+ }
+ if ($error && $!) { $Error = "$!"; }
+ # release the lock taken above, also after a failed open or print
+ if (!*{$log_file}{IO}) { Unlock_File($log_file); }
+ return !$error;
+}}
+
+# Split a path at its last "/".
+#   $name - the path
+# Returns (directory, file): the directory keeps its trailing "/" and is ""
+# when the path contains no "/". Returns the empty list for undef.
+sub Parse_Filename( $ ) {
+    my ($name) = @_;
+
+    return () unless defined($name);
+
+    # greedy optional directory part up to the last "/", then the rest
+    my ($dir, $file) = ($name =~ m{\A(.*/)?([^/]*)\z}s);
+    $dir = "" unless defined($dir);
+    return ($dir, $file);
+}
+
+# Expand a leading "~" or "~user" to the matching home directory.
+#   $path - the path to expand
+# Returns the expanded path. The path is returned unchanged when it is
+# empty, does not start with "~", or no home directory can be found.
+sub Expand_Path( $ ) {
+    my ($path) = @_;
+
+    return $path unless $path && ($path =~ /^~/o);
+
+    my ($user, $rest) = ($path =~ /^~([^\/]*)(.*)$/o);
+    my $home_dir;
+    if ($user) {
+        $home_dir = (getpwnam($user))[7];
+    } else {
+        # own home: environment first, then the password database
+        $home_dir = $ENV{"HOME"} || $ENV{"LOGDIR"} || (getpwuid($>))[7];
+    }
+
+    $path = "$home_dir$rest" if defined($home_dir);
+    return $path;
+}
+
+# Turn a path into a normalised absolute path: "~" is expanded, a relative
+# path is prefixed with the current directory, repeated slashes are
+# collapsed and "." / ".." components are resolved textually (the file
+# system is not consulted, so symbolic links are not followed).
+#   $path - the path; undef is returned unchanged
+sub Get_Abs_Path( $ ) {
+ my ($path) = @_;
+
+ defined($path) or return $path;
+ $path = Expand_Path($path);
+ # anything that does not start with "/" is relative to the current directory
+ $path =~ /^\//o or $path = getcwd() . "/$path";
+ # collapse runs of slashes into one
+ $path =~ s(/{2,})(/)g;
+
+# get rid of "/./"
+
+ # each pass removes one "/." component (in the middle or at the end)
+ while ($path =~ /^(.*?)\/\.(?:|\/(.*))$/o) {
+ $path = "$1/" . ($2 ? $2 : "");
+ }
+
+# get rid of "/../"
+
+ # each pass removes one "/.." together with the component before it; a
+ # ".." directly under the root collapses to "/"
+ while ($path =~ /^(((?:.*?\/)*?)[^\/]+){0,1}?\/\.\.(?:|\/(.*))$/o) {
+ $path = ($1 ? $2 : "/") . ($3 ? $3 : "");
+ }
+ return $path;
+}
+
+# The enclosing block keeps the alphabet private to Get_Random_Key and seeds
+# the random number generator once, when the module is loaded.
+{
+    my @Chars = ("A" .. "Z", "a" .. "z", 0 .. 9);
+    srand();
+
+# Return a random key made of ASCII letters and digits.
+#   $len - requested length, 2..1024; any other value (or none) gives 8
+# The key really has the requested length (it used to be 8 characters long
+# whatever length was asked for).
+sub Get_Random_Key( ; $ ) {
+    my ($len) = @_;
+
+    if (!defined($len) || ($len !~ /^\d+$/o) || ($len < 2) || ($len > 1024)) {
+        $len = 8;
+    }
+    # rand @Chars yields a fraction in [0, number of characters); used as an
+    # array index it is truncated to an integer
+    return join("", @Chars[map { rand @Chars } (1 .. $len)]);
+}}
+
+# Decode every "%XX" escape (two hex digits) in a string to the character
+# with that code.
+#   $str - the encoded string
+# Returns the decoded string; an empty or otherwise false value is returned
+# unchanged.
+sub Hex2Ascii( $ ) {
+    my ($str) = @_;
+
+    return $str unless $str;
+    $str =~ s/%([a-fA-F0-9][a-fA-F0-9])/chr(hex($1))/eg;
+    return $str;
+}
+
+# The enclosing block keeps the escape table private to Ascii2Hex.
+{
+    # character => "%XX" escape (XX = character code in hexadecimal)
+    my $a2h = {
+        "\t" => "%09",   # tab is code 0x09 ("%29" would decode to ")")
+        "+" => "%2B",
+        "," => "%2C",
+        "." => "%2E",
+        ";" => "%3B",
+        "/" => "%2F",
+        "?" => "%3F",
+        ":" => "%3A",
+        "@" => "%40",
+        "=" => "%3D",
+        "&" => "%26",
+        " " => "%20",
+        "<" => "%3C",
+        ">" => "%3E",
+        "\"" => "%22",
+        "%" => "%25",
+        "#" => "%23",
+        "[" => "%5B",
+        "]" => "%5D",
+        "{" => "%7B",
+        "}" => "%7D",
+        "|" => "%7C",
+        "\\" => "%5C",
+        "^" => "%5E",
+        "~" => "%7E",
+        "`" => "%60"};
+
+# Replace every character listed in the escape table by its "%XX" escape;
+# all other characters are copied unchanged.
+#   $str - the string to encode
+# Returns the encoded string; an empty or otherwise false value is returned
+# unchanged.
+sub Ascii2Hex( $ ) {
+    my ($str) = @_;
+    my $new_str = "";
+
+    if (!$str) { return $str; }
+    foreach my $char (split(//, $str)) {
+        if (exists($a2h->{$char})) { $char = $a2h->{$char}; }
+        $new_str .= $char;
+    }
+    return $new_str;
+}}
+
+# Read one record from a configuration file through the Registry class.
+#   $conf_file - configuration file handed to Registry->New
+#   $rec       - name of the record to read
+# Returns the list (dir, users, log, max_down, grace_period); returns the
+# empty list with $Error set when the file cannot be opened, the record does
+# not exist, or a path field is missing or not absolute.
+sub Get_Config_Record( $ $ ) {
+ my ($conf_file, $rec) = @_;
+ my ($db, $field, $value);
+ my @result = ();
+
+ # NOTE(review): the meaning of the "r" and 1 arguments is defined by the
+ # Registry class, which is not part of this file -- presumably read-only
+ if (!($db = Registry->New($conf_file, "r", 1))) {
+ $Error = "$Registry::Error", return ();
+ }
+ if (!$db->Record_Exists($rec)) {
+ $Error = qq("$rec" record not found);
+ return ();
+ }
+ # path fields: "dir" and "users" are required, "log" may be empty;
+ # a value that is present must be an absolute path (after "~" expansion)
+ foreach my $field (qw(dir users log)) {
+ if (!($value = Expand_Path($db->Get_Val($rec, $field)))) {
+ if ($field eq "log") {
+ $value = "";
+ } else {
+ $Error = qq("$field" field of "$rec" record is missing), return ();
+ }
+ } elsif ($value !~ /^\//o) {
+ $Error = qq("$field" field of "$rec" record should be absolute path);
+ return ();
+ }
+ push(@result, $value);
+ }
+ # numeric fields: a missing or non-numeric value counts as 0
+ foreach my $field (qw(max_down grace_period)) {
+ if (!($value = $db->Get_Val($rec, $field)) ||
+ ($value !~ /^\d+$/o)) {
+ $value = 0;
+ }
+ push(@result, $value);
+ }
+ return @result;
+}
+
+# Round a number to the nearest integer; halves are rounded up.
+#   $num - the number to round
+# floor() is used instead of int() because int() truncates towards zero and
+# therefore rounded negative numbers wrongly (int(-1.2 + 0.5) is 0, not -1).
+# Results for non-negative numbers are unchanged.
+sub Round( $ ) {
+    my ($num) = @_;
+
+    return POSIX::floor($num + 0.5);
+}
+
+# Write a message to the log registered under a log number (see Set_Log).
+#   $log_num - number of the log
+#   $msg     - message text
+# Nothing is written when the number is undefined, negative, or has no log
+# registered; otherwise the result of Write_Log is returned.
+sub Log( $ $ ) {
+    my ($log_num, $msg) = @_;
+
+    my $registered = defined($log_num) && ($log_num >= 0) && $LOG_FILE[$log_num];
+    return $registered && Write_Log($LOG_FILE[$log_num], $msg);
+}
+
+# Register a log target under a log number for later use by Log.
+#   $log_num - number of the log (must be defined and not negative)
+#   $file    - log target (must be a true value)
+# Nothing is registered when either argument is unusable.
+sub Set_Log( $ $ ) {
+    my ($log_num, $file) = @_;
+
+    my $usable = defined($log_num) && ($log_num >= 0) && $file;
+    return $usable && ($LOG_FILE[$log_num] = $file);
+}
+
+# Return the numerically smaller of two values (the second one on a tie).
+sub Min( $ $ ) {
+    my ($first, $second) = @_;
+
+    return $first if $first < $second;
+    return $second;
+}
+
+# Return the numerically larger of two values (the second one on a tie).
+sub Max( $ $ ) {
+    my ($first, $second) = @_;
+
+    return $first if $first > $second;
+    return $second;
+}
+
+# Subtract one list of regions from another.
+#   $regs1  - ref to a list of regions (array refs) to cut pieces out of
+#   $regs2  - ref to a list of regions to remove
+#   $strict - when true, pieces whose start equals their end are dropped
+#   $s1,$e1 - indices of start and end inside a $regs1 region (default 0, 1)
+#   $s2,$e2 - indices of start and end inside a $regs2 region (default 0, 1)
+# Returns a ref to a new list; every piece is a copy of its $regs1 region
+# with adjusted start and end. $regs1 itself is returned when either list is
+# false.
+# NOTE(review): the early "last" below presumes that $regs2 is sorted by
+# start and free of overlaps -- confirm against the callers.
+sub Reg_Diff( $ $ ; $ $ $ $ $ ) {
+ my ($regs1, $regs2, $strict, $s1, $e1, $s2, $e2) = @_;
+ my (@new_regs, $start, $end, $new_reg);
+
+ $regs1 && $regs2 or return $regs1;
+ $s1 ||= 0;
+ defined($e1) or $e1 = 1;
+ $s2 ||= 0;
+ defined($e2) or $e2 = 1;
+ for (my $i = 0; $i < @$regs1; ++$i) {
+ # [$start, $end] is the part of region $i that is still uncovered
+ $start = $$regs1[$i][$s1];
+ $end = $$regs1[$i][$e1];
+ for (my $j = 0; $j < @$regs2; ++$j) {
+ # region $j lies entirely behind the remainder: stop scanning
+ $$regs2[$j][$s2] > $end and last;
+ # region $j lies entirely before the remainder: try the next one
+ $$regs2[$j][$e2] < $start and next;
+ # region $j covers the whole remainder: nothing is left
+ if (($$regs2[$j][$s2] <= $start) && ($$regs2[$j][$e2] >= $end)) {
+ undef($start), last;
+ }
+ # region $j covers the tail: cut the remainder short
+ if (($$regs2[$j][$s2] > $start) && ($$regs2[$j][$e2] >= $end)) {
+ $end = $$regs2[$j][$s2] - 1, last;
+ }
+ # region $j covers the head: move the start behind it
+ if (($$regs2[$j][$s2] <= $start) && ($$regs2[$j][$e2] < $end)) {
+ $start = $$regs2[$j][$e2] + 1, next;
+ }
+ # region $j lies strictly inside: emit the piece before it and
+ # continue with what is left behind it
+ ($start < ($$regs2[$j][$s2] - 1)) || !$strict and
+ $new_reg = [@{$$regs1[$i]}],
+ $$new_reg[$s1] = $start,
+ $$new_reg[$e1] = $$regs2[$j][$s2] - 1,
+ push(@new_regs, $new_reg);
+ $start = $$regs2[$j][$e2] + 1;
+ }
+ # emit whatever is left of region $i
+ !defined($start) || ($start > $end) and next;
+ ($start < $end) || !$strict and
+ $new_reg = [@{$$regs1[$i]}],
+ $$new_reg[$s1] = $start,
+ $$new_reg[$e1] = $end,
+ push(@new_regs, $new_reg);
+ }
+ return \@new_regs;
+}
+
+# Remove overlaps between neighbouring regions of a list.
+#   $regs   - ref to a list of regions (array refs)
+#   $strict - when true, regions whose start is not below their end are dropped
+#   $s, $e  - indices of start and end inside a region (default 0, 1)
+# Returns a ref to a new list of copied regions; the input is not modified.
+# A false $regs is returned unchanged.
+# NOTE(review): only neighbours are compared, so the input is presumably
+# expected to be sorted by start -- confirm against the callers.
+sub Reg_Rem_Overlap( $ ; $ $ $ ) {
+ my ($regs, $strict, $s, $e) = @_;
+ my (@new_regs);
+
+ $regs or return $regs;
+ $s ||= 0;
+ defined($e) or $e = 1;
+ # work on copies of the regions
+ for (my $i = 0; $i < @$regs; ++$i) { push(@new_regs, [@{$$regs[$i]}]); }
+ for (my $i = 0; $i < @new_regs; ++$i) {
+ if (($i < $#new_regs) && ($new_regs[$i + 1][$s] <= $new_regs[$i][$e])) {
+ # the next region ends inside this one: drop it and look at
+ # region $i again (--$i undoes the loop increment)
+ $new_regs[$i + 1][$e] <= $new_regs[$i][$e] and
+ splice(@new_regs, $i + 1, 1),
+ --$i, next;
+ # otherwise let the next region start right behind this one
+ $new_regs[$i + 1][$s] = $new_regs[$i][$e] + 1;
+ }
+ # keep the region unless $strict is set and start is not below end
+ ($new_regs[$i][$s] < $new_regs[$i][$e]) || !$strict and next;
+ splice(@new_regs, $i, 1);
+ --$i;
+ }
+ return \@new_regs;
+}
+
+# Sort a list of regions by start, then by end.
+#   $regs  - ref to a list of regions (array refs)
+#   $rev   - true for descending order
+#   $s, $e - indices of start and end inside a region (default 0, 1)
+# Returns a ref to a new list holding the same region refs in sorted order.
+# A false $regs is returned unchanged.
+sub Reg_Sort( $ ; $ $ $ ) {
+    my ($regs, $rev, $s, $e) = @_;
+
+    return $regs unless $regs;
+    $s = 0 unless $s;
+    $e = 1 unless defined($e);
+
+    # flipping the sign of the ascending comparison gives the descending one
+    my $direction = $rev ? -1 : 1;
+    my @ordered = sort {
+        $direction * (($$a[$s] <=> $$b[$s]) || ($$a[$e] <=> $$b[$e]))
+    } @$regs;
+    return \@ordered;
+}
+
+# Intersect two lists of regions: first subtract $regs2 from $regs1, then
+# subtract that remainder from $regs1 again.
+#   $regs1, $regs2 - refs to lists of regions (array refs)
+#   $strict        - handed on to Reg_Diff
+#   $s1, $e1       - indices of start and end inside a $regs1 region (default 0, 1)
+#   $s2, $e2       - indices of start and end inside a $regs2 region (default 0, 1)
+# Returns a ref to a new list, or undef when either list is false.
+sub Reg_Intersect( $ $ ; $ $ $ $ $ ) {
+    my ($regs1, $regs2, $strict, $s1, $e1, $s2, $e2) = @_;
+
+    return undef unless $regs1 && $regs2;
+    $s1 = 0 unless $s1;
+    $e1 = 1 unless defined($e1);
+    $s2 = 0 unless $s2;
+    $e2 = 1 unless defined($e2);
+
+    # parts of $regs1 that are NOT covered by $regs2 ...
+    my $uncovered = Reg_Diff($regs1, $regs2, $strict, $s1, $e1, $s2, $e2);
+    # ... taken away from $regs1 leave the covered parts
+    return Reg_Diff($regs1, $uncovered, $strict, $s1, $e1, $s1, $e1);
+}
+
+# Merge regions that directly follow each other.
+#   $regs   - ref to a list of regions (array refs)
+#   $strict - when true, regions whose start is not below their end are dropped
+#   $s, $e  - indices of start and end inside a region (default 0, 1)
+# Two neighbours are merged only when the second one starts exactly one
+# position after the first one ends; overlapping regions are left alone.
+# Returns a ref to a new list of copied regions; the input is not modified.
+# A false $regs is returned unchanged.
+sub Reg_Merge( $ ; $ $ $ ) {
+ my ($regs, $strict, $s, $e) = @_;
+ my (@new_regs);
+
+ $regs or return $regs;
+ $s ||= 0;
+ defined($e) or $e = 1;
+ # work on copies of the regions
+ for (my $i = 0; $i < @$regs; ++$i) { push(@new_regs, [@{$$regs[$i]}]); }
+ # pass 1: swallow a directly following neighbour, then look at the
+ # enlarged region again (--$i undoes the loop increment)
+ for (my $i = 0; $i < @new_regs; ++$i) {
+ ($i < $#new_regs) &&
+ ($new_regs[$i + 1][$s] == ($new_regs[$i][$e] + 1)) and
+ $new_regs[$i][$e] = $new_regs[$i + 1][$e],
+ splice(@new_regs, $i + 1, 1),
+ --$i, next;
+ }
+ # pass 2: with $strict, drop regions whose start is not below their end
+ for (my $i = 0; $i < @new_regs; ++$i) {
+ ($new_regs[$i][$s] < $new_regs[$i][$e]) || !$strict and next;
+ splice(@new_regs, $i, 1);
+ --$i;
+ }
+ return \@new_regs;
+}
+
+# List the entries of a directory whose names match a regular expression,
+# without involving the shell.
+#   $regexp - pattern to match the names against (default ".*")
+#   $dir    - directory to read (default ".")
+# Returns the matching names in list context and their number in scalar
+# context; returns nothing when the directory cannot be opened.
+sub safe_glob {
+    my ($regexp, $dir) = @_;
+
+    $dir = "." unless $dir;
+    $regexp = ".*" unless $regexp;
+
+    opendir(my $dh, $dir) or return;
+    my @matches = grep { /$regexp/ } readdir($dh);
+    closedir($dh);
+
+    return @matches if wantarray();
+    return scalar(@matches);
+}
+
+# Set the syslog facility and send STDERR to the system log (see stderr2log).
+#   $_[0] - facility name; stored in $Facility as given
+sub redirect_err2log {
+    $Facility = $_[0];
+    stderr2log();
+}
+
+# Reopen STDERR as a pipe to logger(1), so that everything written to it is
+# logged with priority "$Facility.err" and tagged "<program>[<pid>]".
+# STDERR is made unbuffered.
+sub stderr2log {
+    open(STDERR, "> /dev/null");
+    open(STDERR, "| logger -p $Facility.err -t '$0\[$$\]'");
+
+    # switch autoflush on for STDERR, then restore the selected handle
+    my $selected = select(STDERR);
+    $| = 1;
+    select($selected);
+}
+
+# Route this process's messages to syslog: STDERR goes through logger(1)
+# and wr_log switches to syslog().
+#   $_[0] - optional facility name; replaces $Facility when true
+sub openlogs {
+    my ($facility) = @_;
+
+    $Facility = $facility if $facility;
+    stderr2log();
+    setlogsock("unix");
+    openlog($0, "pid", $Facility);
+    $Syslog = 1;
+}
+
+# Turn the calling process into a daemon: the parent exits, the child starts
+# a new session, detaches STDIN/STDOUT and logs through syslog.
+#   $_[0] - optional syslog facility, handed to openlogs
+# Dies (after reporting through wr_err) when fork fails.
+sub daemon {
+    my ($facility) = @_;
+
+    my $pid = fork();
+    if (!defined($pid)) {
+        wr_err("$E_FORK: $!");
+        die;
+    }
+    # parent: done
+    exit(0) if $pid;
+
+    # child
+    setsid();
+    close(STDIN);
+    close(STDOUT);
+    open(STDOUT, "> /dev/null");
+    openlogs($facility);
+}
+
+# Fork a detached "watcher" child that runs a callback; the parent returns
+# at once.
+#   $watcher  - code ref, called in the child as $watcher->(parent pid, @params)
+#   $facility - optional syslog facility, handed to openlogs
+#   @params   - further arguments for the callback
+# The child runs in its own session with STDIN/STDOUT detached and
+# "_watcher" appended to $0. Dies (after reporting through wr_err) when fork
+# fails.
+sub start_watcher {
+    my ($watcher, $facility, @params) = @_;
+
+    my $parent = $$;
+    my $pid = fork();
+    if (!defined($pid)) {
+        wr_err("$E_FORK: $!");
+        die;
+    }
+    # parent: nothing more to do
+    return if $pid;
+
+    # child
+    setsid();
+    close(STDIN);
+    close(STDOUT);
+    open(STDOUT, "> /dev/null");
+    $0 .= "_watcher";
+    openlogs($facility);
+    &$watcher($parent, @params);
+}
+
+# Write an informational message: to syslog when $Syslog is set, to STDOUT
+# otherwise. The message is prefixed with the result of $Msg_Prefix when
+# that code ref is set.
+#   first argument - message text; a trailing newline is removed
+sub wr_log {
+    my $text = shift;
+
+    chomp($text);
+    # &$Msg_Prefix (no parentheses) hands the remaining @_ to the prefix code
+    my $prefix = $Msg_Prefix ? &$Msg_Prefix : "";
+    $text = $prefix . $text;
+    if (!$Syslog) {
+        print "$text\n";
+    } else {
+        syslog("info", "%s", $text);
+    }
+}
+
+# Write an error message to STDERR, prefixed with the result of $Msg_Prefix
+# when that code ref is set.
+#   first argument - message text; a trailing newline is removed
+# Always returns 1.
+sub wr_err {
+    my $text = shift;
+
+    chomp($text);
+    # &$Msg_Prefix (no parentheses) hands the remaining @_ to the prefix code
+    my @prefix = $Msg_Prefix ? &$Msg_Prefix : "";
+    print STDERR (@prefix, "$text\n");
+    return 1;
+}
+
+# Print a prompt and read one line from STDIN.
+#   $_[0] - prompt text, printed as is
+# Returns 1 when the answer is "y" or "yes" (in any letter case), else 0.
+sub confirm {
+    my ($msg) = @_;
+
+    print $msg;
+    my $reply = <STDIN>;
+    chomp($reply);
+    return 1 if $reply =~ /^(y|yes)$/io;
+    return 0;
+}
+
+# At program exit, remove every lock file this process still holds.
+END {
+    unlink($_) for keys(%Locks);
+}
+
+1;
diff --git a/utils/cmerge2.pl b/utils/cmerge2.pl
new file mode 100755
index 0000000..dc98edd
--- /dev/null
+++ b/utils/cmerge2.pl
@@ -0,0 +1,207 @@
+#!/usr/bin/env perl
+use File::Basename;
+
+# Location of the LAGAN installation; the helper programs are run from there.
+$lagandir = $ENV{LAGAN_DIR};
+$pid = $$;
+
+# process arguments: four positional words, then optionally -nocrop (one
+# word) and -skipfr pid (two words), i.e. between 4 and 7 words in total.
+# (The original test joined the two bounds with &&, which can never be true.)
+if (@ARGV < 4 || @ARGV > 7) {
+    print STDERR ("usage:\n cmerge seqfile mfafile draftfile outfile [-nocrop] [-skipfr pid]\n");
+    exit(1);
+}
+$nocrop = 0;
+for ($i = 4; $i < @ARGV; $i++) {
+    if ($ARGV[$i] =~ /-nocrop/){
+        $nocrop = 1;
+    }
+    elsif ($ARGV[$i] =~ /-skipfr/){
+        # the word after -skipfr names the working directory (see $newdir below)
+        $skipfr = 1;
+        $pid = $ARGV[++$i];
+        chomp $pid;
+    }
+    else {
+        print STDERR "Bad arg to cmerge: $ARGV[$i]\n";
+        exit(1);
+    }
+}
+
+# Without -skipfr the script has always refused to run; say why instead of
+# exiting silently.
+if (!$skipfr) {
+    print STDERR "cmerge: -skipfr pid is required\n";
+    exit(1);
+}
+# Working directory: <current directory>/<pid>.
+$newdir = `pwd`;
+chomp $newdir;
+$newdir = "$newdir/$pid";
+
+# The log is appended to; minfo is created afresh.  Fail loudly when the
+# working directory is missing or unwritable rather than printing into
+# unopened handles.
+open (LOGFILE, ">>$newdir/log") or die "cmerge: cannot append to $newdir/log: $!\n";
+open (INFOFILE, ">$newdir/minfo") or die "cmerge: cannot create $newdir/minfo: $!\n";
+
+print STDERR ("\n");
+print STDERR ("Computing Contig Overlaps\n");
+print STDERR ("-------------------------\n");
+
+print LOGFILE ("\n");
+print LOGFILE ("Computing Contig Overlaps\n");
+print LOGFILE ("-------------------------\n");
+
+# initialize merged file (header only) and its masked twin
+open (OFILE, ">$ARGV[3]") or die "cmerge: cannot create $ARGV[3]: $!\n";
+print OFILE (">merged\n");
+close (OFILE);
+`cp $ARGV[3] $ARGV[3].masked`;
+
+# initialize padding file
+open (OFILE, ">$newdir/padding") or die "cmerge: cannot create $newdir/padding: $!\n";
+print OFILE (">padding\n");
+print OFILE ("NNNNNNNNNNNNNNNNNNNN.NNNNNNNNNNNNNNNNNNNN\n");
+close (OFILE);
+$padlength = `$lagandir/utils/getlength $newdir/padding`; chomp $padlength;
+
+# other initialization
+$totlength = `$lagandir/utils/getlength $ARGV[0]`;
+chomp $totlength;
+$mergedEnd = 0;
+
+# read contig list
+$numContigs = 0;
+@list = `cat $ARGV[2]`;
+
+# The first three lines of the draft file are skipped; every following line
+# is expected to look like
+#   NAME.mfa --> (BEGIN END) score=S, offset=(S1 S2), index=N
+# Lines that do not parse are skipped instead of being processed with stale
+# capture values.
+for ($i = 3; $i < @list; $i++){
+    my @fields = ($list[$i] =~ /(.*)\.mfa --\> \((\d+) (\d+)\) score=(\d+), offset=\((\d+) (\d+)\), index=(\d+)/);
+    if (!@fields) {
+        print STDERR "cmerge: skipping unparsable draft line: $list[$i]" if ($list[$i] =~ /\S/);
+        next;
+    }
+
+    # slot of this contig in the parallel arrays
+    my $k = $numContigs;
+    ($filenames[$k], $seq1Begin[$k], $seq1End[$k], $score[$k],
+     $s1shifts[$k], $s2shifts[$k], $num[$k]) = @fields;
+
+    # map both ends of the sequence-1 range through getcontigpos, then add
+    # the sequence-2 offset
+    $temp = $seq1Begin[$k] - $s1shifts[$k];
+    $seq2Begin[$k] = `$lagandir/utils/getcontigpos $filenames[$k].mfa $temp`; chomp $seq2Begin[$k];
+    $seq2Begin[$k] += $s2shifts[$k];
+
+    $temp = $seq1End[$k] - $s1shifts[$k];
+    $seq2End[$k] = `$lagandir/utils/getcontigpos $filenames[$k].mfa $temp`; chomp $seq2End[$k];
+    $seq2End[$k] += $s2shifts[$k];
+
+    print STDERR "$filenames[$k].mfa --> $seq1Begin[$k] $seq1End[$k] $score[$k] $s1shifts[$k] $s2shifts[$k] $num[$k] $seq2Begin[$k] $seq2End[$k]\n";
+
+    $numContigs++;
+}
+
+# extract contigs
+# File and directory part of the mfafile argument, and (again) the working
+# directory <current directory>/<pid>.
+$contigfile = basename ($ARGV[1]);
+$contigdir = dirname ($ARGV[1]);
+$newdir = `pwd`;
+chomp $newdir;
+$newdir = "$newdir/$pid";
+
+# start out merged file with only padding
+# Each step renames the current output to *.new and regenerates the output
+# from it with seqmerge; the same is done for the masked twin.
+`mv $ARGV[3] $ARGV[3].new`;
+`$lagandir/utils/seqmerge $ARGV[3].new $newdir/padding > $ARGV[3]`;
+`mv $ARGV[3].masked $ARGV[3].masked.new`;
+`$lagandir/utils/seqmerge $ARGV[3].masked.new $newdir/padding > $ARGV[3].masked`;
+# bookkeeping for the first contig
+$contigStart[0] = 1;
+$startChop[0] = 0;
+
+# "current" holds the contig data that has not been appended to the output yet
+`cp $filenames[0] $newdir/current`;
+`cp $filenames[0].masked $newdir/current.masked`;
+
+# merge contigs
+# For each contig after the first: anchor the pending data ("current")
+# against the next contig, ask getoverlap for the overlapping ranges, append
+# the appropriate part of "current" to the output, and make the remainder of
+# the next contig the new "current".  One minfo record is written for
+# contig $i-1 per iteration.
+for ($i = 1; $i < $numContigs; $i++){
+    `$lagandir/rechaos.pl $newdir/current $filenames[$i] -recurse \"(12,0,40,0)x\" -maskedonly > $newdir/currentanchs`;
+
+    # find the overlap; getoverlap is expected to print four integers, with
+    # -1 -1 for the first pair meaning "no overlap"
+    my @range = (`$lagandir/utils/getoverlap $newdir/currentanchs` =~ /(-?\d+) (-?\d+) (-?\d+) (-?\d+)/);
+    if (!@range) {
+        # never continue with capture values left over from an earlier match
+        print STDERR "Could not parse getoverlap output; assuming no overlap\n";
+        @range = (-1, -1, -1, -1);
+    }
+    ($rangebegin1, $rangeend1, $rangebegin2, $rangeend2) = @range;
+
+    $thislength = `$lagandir/utils/getlength $filenames[$i-1]`; chomp $thislength;
+    $nextlength = `$lagandir/utils/getlength $filenames[$i]`; chomp $nextlength;
+
+    # if no overlap, flush the buffer
+    if ($rangebegin1 == -1 && $rangeend1 == -1){
+
+        print STDERR "No overlap found...\n";
+
+        `mv $ARGV[3] $ARGV[3].new`;
+        `$lagandir/utils/seqmerge $ARGV[3].new $newdir/current $newdir/padding > $ARGV[3]`;
+        `cp $filenames[$i] $newdir/current`;
+
+        `mv $ARGV[3].masked $ARGV[3].masked.new`;
+        `$lagandir/utils/seqmerge $ARGV[3].masked.new $newdir/current.masked $newdir/padding > $ARGV[3].masked`;
+        `cp $filenames[$i].masked $newdir/current.masked`;
+
+        $contigEnd[$i-1] = $contigStart[$i-1] + $thislength - 1;
+        $contigStart[$i] = $contigEnd[$i-1] + $padlength + 1;
+        $endChop[$i-1] = 0;
+        $startChop[$i] = 0;
+    }
+    else {
+        print STDERR "Overlap detected!\n";
+
+        # extract the overlapped region > overlap
+        $j = $rangebegin1 - 1;
+
+        # Nothing is appended when the overlap starts at the first position;
+        # reset the length so that a value from a previous iteration is not
+        # reused in the bookkeeping below.
+        $overlaplength = 0;
+        if ($j > 0){
+            `$lagandir/utils/cextract $newdir/current 1 $j 0 0 > $newdir/overlap`;
+            `$lagandir/utils/cextract $newdir/current.masked 1 $j 0 0 > $newdir/overlap.masked`;
+            $overlaplength = `$lagandir/utils/getlength $newdir/overlap`; chomp $overlaplength;
+
+            `mv $ARGV[3] $ARGV[3].new`;
+            `$lagandir/utils/seqmerge $ARGV[3].new $newdir/overlap > $ARGV[3]`;
+            `mv $ARGV[3].masked $ARGV[3].masked.new`;
+            `$lagandir/utils/seqmerge $ARGV[3].masked.new $newdir/overlap.masked > $ARGV[3].masked`;
+        }
+
+        # extract the nonoverlapped region > current
+        `$lagandir/utils/cextract $filenames[$i] $rangebegin2 $nextlength 0 0 > $newdir/current`;
+        `$lagandir/utils/cextract $filenames[$i].masked $rangebegin2 $nextlength 0 0 > $newdir/current.masked`;
+
+        $contigEnd[$i-1] = $contigStart[$i-1] + $overlaplength - 1;
+        $contigStart[$i] = $contigEnd[$i-1] + 1;
+        $endChop[$i-1] = $thislength - $rangeend1;
+        $startChop[$i] = $rangebegin2 - 1;
+    }
+
+    # a file name containing ".rc" marks a reverse-complemented contig
+    if (index ($filenames[$i-1], ".rc") == -1) { $direction = "+"; } else { $direction = "-"; }
+    # first line of the contig file without its leading character (the ">")
+    @temp = `head $filenames[$i-1]`;
+    chomp $temp[0]; $temp[0] = substr $temp[0], 1;
+
+    print INFOFILE "$temp[0]\n";
+    print INFOFILE "$num[$i-1] $seq1Begin[$i-1] $seq1End[$i-1] $contigStart[$i-1] $contigEnd[$i-1] $startChop[$i-1] $endChop[$i-1] $direction $score[$i-1] $seq2Begin[$i-1] $seq2End[$i-1]\n";
+
+}
+
+# Flush the last contig: append what is left in "current" plus padding to
+# the output (and to its masked twin) and write the final minfo record.
+$thislength = `$lagandir/utils/getlength $filenames[$numContigs - 1]`; chomp $thislength;
+$contigEnd[$numContigs - 1] = $contigStart[$numContigs - 1] + $thislength - 1;
+$endChop[$numContigs - 1] = 0;
+
+`mv $ARGV[3] $ARGV[3].new`;
+`$lagandir/utils/seqmerge $ARGV[3].new $newdir/current $newdir/padding > $ARGV[3]`;
+`mv $ARGV[3].masked $ARGV[3].masked.new`;
+`$lagandir/utils/seqmerge $ARGV[3].masked.new $newdir/current.masked $newdir/padding > $ARGV[3].masked`;
+
+# a file name containing ".rc" marks a reverse-complemented contig
+if (index ($filenames[$numContigs - 1], ".rc") == -1) { $direction = "+"; } else { $direction = "-"; }
+# first line of the contig file without its leading character (the ">")
+@temp = `head $filenames[$numContigs - 1]`;
+chomp $temp[0]; $temp[0] = substr $temp[0], 1;
+print INFOFILE "$temp[0]\n";
+print INFOFILE "$num[$numContigs - 1] $seq1Begin[$numContigs - 1] $seq1End[$numContigs - 1] $contigStart[$numContigs - 1] $contigEnd[$numContigs - 1] $startChop[$numContigs - 1] $endChop[$numContigs - 1] $direction $score[$numContigs - 1] $seq2Begin[$numContigs - 1] $seq2End[$numContigs - 1]\n";
+
+
+print STDERR "Merging complete!\n\n";
+print LOGFILE "Merging complete!\n\n";
+
+# make sure buffered output reaches the files before the script ends
+close (INFOFILE);
+close (LOGFILE);
+
+# 1. write getoverlap() -- given a set of chaos hits, find the beginning and end in both seqs
+# 2. implement contigStart, contigStop -- positions of the contig begins/ends in the merged draft sequence
+# 3. startChop, endChop -- number chopped from each end
+# 4. secFrom, secTo -- pos in the chopped contig sequence
diff --git a/utils/draft.pl b/utils/draft.pl
new file mode 100755
index 0000000..4bda5cf
--- /dev/null
+++ b/utils/draft.pl
@@ -0,0 +1,267 @@
+#!/usr/bin/env perl
+use File::Basename;
+
+# Defaults; several of them can be overridden by the options parsed below.
+$lazyflag = 0;
+$lagandir = $ENV{LAGAN_DIR};
+$recurfl = "-recurse \"(12,0,30,0)x,(13,1,30,0)x,(3,0,30,0)xt,(8,1,30,0)x,(7,1,30,0)x,(7,1,15,0)x\"";
+$laganparams = "-maskedonly ";
+$anchgapstart = -5;
+$anchgapcont = -0.2;
+$usebounds = 1;
+
+$startingrate = 65;
+$rateinc = 1;
+$frlevel = "";
+$pid = "mergedir";
+
+# Fewer than two arguments: either a lone -version request or a usage error.
+if (@ARGV < 2) {
+    unless ((@ARGV == 1) && ($ARGV[0] =~ /-version/)) {
+        print STDERR ("Usage:\n\ndraft.pl SEQFILE MFAFILE [-cons RATE] [-translate] [-version]\n");
+        exit (1);
+    }
+    print STDERR "DRAFT version 0.1\n";
+    exit (0);
+}
+
+# Options start at the third argument.  The patterns are unanchored and are
+# tried in this order; options with a value consume the following word.
+$arglist = "";
+$skipfr = 0;
+for ($i = 2; $i < @ARGV; $i++) {
+    my $arg = $ARGV[$i];
+
+    if ($arg =~ /-recurse/){
+        $recurfl = " -recurse \"".$ARGV[++$i]."\"";
+    }
+    elsif ($arg =~ /-skipfr/){
+        $skipfr = 1;
+        $pid = $ARGV[++$i];
+        chomp $pid;
+    }
+    elsif ($arg =~ /-translate/){
+        $recurfl .= " -translate";
+    }
+    elsif ($arg =~ /-cons/){
+        $startingrate = $ARGV[++$i];
+        chomp $startingrate;
+    }
+    elsif ($arg =~ /-lazy/){
+        $lazyflag = 1;
+    }
+    elsif ($arg =~ /-fastreject/){
+        $frarg = " -fastreject $frlevel";
+    }
+    else {
+        print STDERR "Bad arg to draft: $ARGV[$i]";
+    }
+}
+
+# Option string handed to every lagan.pl run.
+$arglist = "$arglist $recurfl -usebounds $laganparams $frarg";
+
+# create new directory
+$newdir = `pwd`;
+chomp $newdir;
+$newdir = "$newdir/$pid";
+`mkdir $newdir` if (!(-e $newdir));
+
+# Everything below works inside $newdir, so an unwritable directory is fatal.
+open (LOGFILE, ">$newdir/log") or die "draft: cannot create $newdir/log: $!\n";
+
+print STDERR ("\n");
+print STDERR ("Finding Contig Alignments\n");
+print STDERR ("-------------------------\n");
+
+print LOGFILE ("\n");
+print LOGFILE ("Finding Contig Alignments\n");
+print LOGFILE ("-------------------------\n");
+
+# extract contigs;
+$contigfile = basename ($ARGV[1]);
+$contigdir = dirname ($ARGV[1]);
+
+# mextract.pl prints one output file name per contig; for each of them a
+# reverse-complemented copy "<name>.rc" is produced with utils/rc.
+`cp $ARGV[1] $newdir`;
+@contigs = `perl $lagandir/mextract.pl $newdir/$contigfile`;
+if ($?) { exit(1);}
+for ($i = 0; $i < @contigs; $i++){
+    chomp $contigs[$i];
+    `$lagandir/utils/rc < $contigs[$i] > $contigs[$i].rc`;
+    if ($?) { exit(1); }
+}
+
+# extract masked contigs
+$maskedname = $ARGV[1].".masked";
+
+if (-e $maskedname){
+    $maskedcontigfile = basename ($maskedname);
+    `cp $maskedname $newdir`;
+    @maskedcontigs = `perl $lagandir/mextract.pl $newdir/$maskedcontigfile -masked`;
+    if ($?) { exit(1);}
+    for ($i = 0; $i < @maskedcontigs; $i++){
+        chomp $maskedcontigs[$i];
+        # the reverse complement is named after the unmasked contig with the
+        # same index
+        `$lagandir/utils/rc < $maskedcontigs[$i] > $contigs[$i].rc.masked`;
+        if ($?) { exit(1); }
+    }
+}
+
+# Align $ARGV[0] against one contig file with lagan.pl, writing the alignment
+# to "<contig>.mfa".  Returns ($skip, $s1shift, $s2shift): $skip is 1 when
+# lagan.pl produced no output file; the shifts are taken from the getbounds
+# output (first "-s1" and "-s2" number, minus 1) when $usebounds is set.
+# Exits with status 1 when a helper fails or getbounds output cannot be parsed.
+sub align_contig {
+    my ($contig) = @_;
+    my ($skip, $shift1, $shift2) = (0, undef, undef);
+
+    my $execute = "perl $lagandir/lagan.pl $ARGV[0] $contig -mfa $arglist -out $contig.mfa";
+    $execute = $execute." -gap $anchgapstart $anchgapcont" if ($usebounds);
+    `$execute`;
+    if (!(-e "$contig.mfa")) { $skip = 1; }
+    elsif ($?) { exit(1); }
+
+    if (!$skip && $usebounds){
+        # compute bounds
+        my @bounds = `$lagandir/utils/getbounds anchs.final $ARGV[0] $contig`;
+        if ($?) { exit(1); }
+        if (defined($bounds[0]) && $bounds[0] =~ /-s1 (\d+) (\d+) -s2 (\d+) (\d+)/) {
+            ($shift1, $shift2) = ($1 - 1, $3 - 1);
+        }
+        else {
+            # do not continue with numbers left over from an earlier match
+            print STDERR "draft: cannot parse getbounds output for $contig\n";
+            exit(1);
+        }
+    }
+    `rm anchs.final`;
+    return ($skip, $shift1, $shift2);
+}
+
+# create file storing name of contig stats
+open (LFILE, ">$newdir/filenames") if (!$lazyflag);
+$num = 0;
+
+for ($i = 0; $i < @contigs; $i++){
+    chomp $contigs[$i];
+    $skip1 = $skip2 = 0;
+
+    # make alignments (with -lazy, existing alignment files are reused)
+    if (!$lazyflag || !(-e "$contigs[$i].mfa")){
+        ($skip1, $s1shift, $s2shift) = align_contig($contigs[$i]);
+    }
+    if (!$lazyflag || !(-e "$contigs[$i].rc.mfa")){
+        ($skip2, $s1rcshift, $s2rcshift) = align_contig("$contigs[$i].rc");
+    }
+
+    # score both strands; a missing alignment scores 0
+    if ($skip1) {
+        $fscore = 0;
+    }
+    else {
+        $fscore = `$lagandir/utils/scorealign $contigs[$i].mfa $startingrate`; chomp $fscore;
+        if ($?) { exit(1);}
+    }
+    if ($skip2) {
+        $bscore = 0;
+    }
+    else {
+        $bscore = `$lagandir/utils/scorealign $contigs[$i].rc.mfa $startingrate`; chomp $bscore;
+        if ($?) { exit(1);}
+    }
+
+    # pick strand
+    if ($fscore > 0 || $bscore > 0){
+        $j = $i + 1;
+        if ($fscore >= $bscore){
+            # Equal scores used to match neither branch, so the contig was
+            # dropped without any message; a tie now goes to the (+) strand.
+            my $rel = ($fscore > $bscore) ? ">" : "=";
+            print STDERR ("(+) direction preferred for Contig \"$contigs[$i]\": $fscore $rel $bscore\n");
+            print LOGFILE ("(+) direction preferred for Contig \"$contigs[$i]\": $fscore $rel $bscore\n");
+            print LFILE "$j $s1shift $s2shift $contigs[$i].mfa\n" if (!$lazyflag);
+            print STDERR "$j $s1shift $s2shift $contigs[$i].mfa\n" if (!$lazyflag);
+        }
+        else {
+            print STDERR ("(-) direction preferred for Contig \"$contigs[$i]\": $fscore < $bscore\n");
+            print LOGFILE ("(-) direction preferred for Contig \"$contigs[$i]\": $fscore < $bscore\n");
+            print LFILE "$j $s1rcshift $s2rcshift $contigs[$i].rc.mfa\n" if (!$lazyflag);
+            print STDERR "$j $s1rcshift $s2rcshift $contigs[$i].rc.mfa\n" if (!$lazyflag);
+        }
+    }
+    else {
+        print STDERR ("Contig \"$contigs[$i]\" could not be matched: $fscore, $bscore\n");
+        print LOGFILE ("Contig \"$contigs[$i]\" could not be matched: $fscore, $bscore\n");
+    }
+}
+close (LFILE);
+
+print STDERR ("\n");
+print STDERR ("Computing Contig Ordering\n");
+print STDERR ("-------------------------\n\n");
+
+print LOGFILE ("\n");
+print LOGFILE ("Computing Contig Ordering\n");
+print LOGFILE ("-------------------------\n\n");
+
+$foundorder = 0;
+
+# Raise the cutoff from $startingrate in steps of $rateinc (staying below
+# 100) until contigorder produces an ordering.
+for ($cutoff = $startingrate; !$foundorder && ($cutoff < 100); $cutoff += $rateinc){
+    `$lagandir/utils/scorecontigs $newdir/filenames $ARGV[0] $newdir/contignames $cutoff > $newdir/ranges`;
+    if ($?) { exit(1);}
+    @list = `cat $newdir/ranges`;
+    # Try the next cutoff when no contigs are reported; the match is checked
+    # explicitly so that a stale $1 from an earlier match is never used.
+    next unless (defined($list[0]) && $list[0] =~ /numContigs = (\d+)/ && $1 != 0);
+
+    `$lagandir/utils/contigorder $newdir/ranges > $newdir/corder`;
+    if ($?) { exit(1);}
+    @list = `cat $newdir/corder`;
+    chomp $list[0];
+    $foundorder = 1 if ($list[0] ne "ordering failed");
+}
+
+if ($foundorder){
+    open (OFILE, ">$newdir/draft") or die "draft: cannot create $newdir/draft: $!\n";
+    print OFILE ("Draft Ordering\n");
+    print OFILE ("--------------\n\n");
+
+    # contignames lines: "<index> <s1shift> <s2shift> <file name>"
+    @contignames = `cat $newdir/contignames`;
+    for ($i = 0; $i < @contignames; $i++){
+        next unless ($contignames[$i] =~ /(\d+) (\d+) (\d+) (.*)/);
+        ($num[$i], $s1shifts[$i], $s2shifts[$i], $filenames[$i]) = ($1, $2, $3, $4);
+    }
+
+    # corder lines: "<position in contignames> --> (<begin> <end>) <score>"
+    @list = `cat $newdir/corder`;
+    for ($i = 0; $i < @list; $i++){
+        next unless ($list[$i] =~ /(\d+) --\> \((\d+) (\d+)\) (.*)/);
+        $score = $4; chomp $score;
+        print OFILE ("$filenames[$1] --> ($2 $3) score=$score, offset=($s1shifts[$1] $s2shifts[$1]), index=$num[$1]\n");
+    }
+    close (OFILE);
+
+    print STDERR `cat $newdir/draft`;
+    print LOGFILE `cat $newdir/draft`;
+    close (LOGFILE);
+}
+else {
+    print STDERR "Could not compute ordering.\n";
+    print LOGFILE "Could not compute ordering.\n";
+    close (LOGFILE);
+    exit (0);
+}
+
+# Merge the ordered contigs with cmerge2.pl, then copy the results from the
+# working directory into the current directory.
+$filename1 = $ARGV[0];
+$filename2 = "$newdir/$contigfile";
+
+my $mergecmd = "$lagandir/cmerge2.pl $filename1 $filename2 $newdir/draft $filename2.merged -skipfr $pid";
+`$mergecmd`;
+if ($?) { exit(1); }
+
+# the command line is reported only after it has run successfully
+print STDERR "EXECUTE $mergecmd\n";
+
+`cp $filename2.merged merged_seq.fa`;
+`cp $filename2.merged.masked merged_seq.fa.masked`;
+foreach my $result ("minfo", "ranges", "log") {
+    `cp $newdir/$result $result`;
+}
+
+print STDERR ("\n");
+print STDERR ("Computing Final Alignment\n");
+print STDERR ("-------------------------\n\n");
+
+# `rm -rf $newdir`;
+
diff --git a/utils/flipchaos.pl b/utils/flipchaos.pl
new file mode 100644
index 0000000..f2ef9d7
--- /dev/null
+++ b/utils/flipchaos.pl
@@ -0,0 +1,13 @@
+#!/usr/bin/perl
+
+
+# Read hit lines of the form
+#   NAME1 B1 E1; NAME2 B2 E2; score = INT.FRAC (+|-)
+# from STDIN and print them with the two sequences exchanged.  For "-" hits
+# whose second range is not ascending, both ranges are additionally printed
+# with begin and end swapped.
+while ($line = <STDIN>) {
+    my @hit = ($line =~ /(.*)\s+([0-9]+)\s+([0-9]+);\s*(.*)\s+([0-9]+)\s+([0-9]+);\s*score\s* =\s*([0-9]*)\.?([0-9]*)\s*\(([+-])\)/);
+    if (!@hit) {
+        # A line that does not parse used to be printed from the previous
+        # line's captures; skip it instead (blank lines silently).
+        print STDERR "flipchaos: skipping unparsable line: $line" if ($line =~ /\S/);
+        next;
+    }
+    my ($name1, $beg1, $end1, $name2, $beg2, $end2, $whole, $frac, $strand) = @hit;
+
+    if ($strand eq "+" || $end2 > $beg2) {
+        print "$name2 $beg2 $end2; $name1 $beg1 $end1; score = $whole.$frac ($strand)\n";
+    }
+    else {
+        print "$name2 $end2 $beg2; $name1 $end1 $beg1; score = $whole.$frac ($strand)\n";
+    }
+
+}
diff --git a/utils/mextract.pl b/utils/mextract.pl
new file mode 100755
index 0000000..b609109
--- /dev/null
+++ b/utils/mextract.pl
@@ -0,0 +1,88 @@
+#!/usr/bin/env perl
+
+# mextract.pl -- split a Multi-FASTA file into one file per sequence, named
+# "<prefix>_<name>.fa" (or ".fa.masked" with -masked), and print each output
+# file name on STDOUT.
+if (@ARGV < 1) {
+    print ("usage:\n mextract.pl filename [-masked]\n");
+    exit(1);
+}
+
+$masked=0;
+$filename = $ARGV[0];
+if(@ARGV==2) {
+    if ($ARGV[1] eq "-masked") {
+        $masked = 1;
+    }
+}
+
+open(FASTAFILE, "$filename") || die "Could not open $filename.\n\n";
+# prefix = file name without its last extension; for masked input one more
+# extension is removed
+$prefix = substr $filename, 0, (rindex $filename, ".");
+if ($masked || index ($filename, ".masked") != -1) {
+    $prefix = substr $filename, 0, (rindex $prefix, ".");
+}
+
+$suffix = "fa";
+if ($masked) {
+    $suffix = "$suffix.masked";
+}
+
+# Derive the sequence name from a header line: drop the leading ">", keep
+# everything up to the first space, and strip one trailing comma.
+sub contig_name {
+    my ($header) = @_;
+    my $name = substr($header, 1);
+    my $space = index($name, " ");
+    $name = substr($name, 0, $space) if ($space != -1);
+    if (substr($name, length($name) - 1) eq ",") {
+        $name = substr($name, 0, length($name) - 1);
+    }
+    return $name;
+}
+
+# Open the output file for sequence $name, print the file name on STDOUT
+# and write the ">name" header line.
+sub start_contig {
+    my ($name) = @_;
+    $fname = "$prefix\_$name.$suffix";
+    open(OUTFILE, ">$fname") || die "Could not create $fname.\n\n";
+    print("$fname\n");
+    print OUTFILE ">$name\n";
+}
+
+# Skip everything before the first header line.  Reaching end of file here
+# means there is no header at all (the original loop never terminated in
+# that case).
+$line = <FASTAFILE>;
+while (defined($line) && substr($line, 0, 1) ne ">") {
+    $line = <FASTAFILE>;
+}
+if (!defined($line)) {
+    print ("$filename is NOT a Multi-FASTA file...\n");
+    exit(1);
+}
+chomp $line;
+start_contig(contig_name($line));
+
+while ($line = <FASTAFILE>) {
+    chomp $line;
+    if (substr($line, 0, 1) eq ">") {
+        close OUTFILE;
+        start_contig(contig_name($line));
+    } else {
+        # sequence lines are written back to back, without line breaks
+        print OUTFILE "$line";
+    }
+}
+
+close OUTFILE;
diff --git a/utils/mf2bin.pl b/utils/mf2bin.pl
new file mode 100755
index 0000000..6e5105c
--- /dev/null
+++ b/utils/mf2bin.pl
@@ -0,0 +1,93 @@
+#!/usr/bin/env perl
+
+# defaults
+# constants
+
+# usage notes
+
+# mf2bin.pl -- pack a two-sequence Multi-FASTA alignment into a binary
+# stream, one byte per column, written to STDOUT or to the -out file.
+if (@ARGV < 1) {
+    print ("usage:\n mf2bin.pl inputfile [-out outputfile] \n");
+    exit(1);
+}
+
+# parse parameters
+
+$tofile = 0;
+for ($i=1; $i<@ARGV; $i++) {
+    if ($ARGV[$i] eq "-out") {
+        $tofile = 1;
+        $outfilename = $ARGV[++$i];
+    }
+}
+
+if ($tofile) {
+    open(OUTFILE, ">$outfilename") || die "Could not open $outfilename for writing.\n\n";
+}
+
+# read in Multi-FASTA file
+
+$infilename = $ARGV[0];
+open(FASTAFILE, "$infilename") || die "Could not open $infilename.\n\n";
+$line = <FASTAFILE>;
+chomp $line;
+
+$i=0;
+%list=();
+@seqs=();
+
+# The first line must be a header; a sequence's key is the first run of word
+# characters after the ">".
+if (substr($line, 0, 1) eq ">") {
+    ($keys[$i]) = (substr($line, 1) =~ /(\w+)/);
+    $list{$keys[$i]}=$i;
+} else {
+    # (the original message interpolated $filename, which is never set here)
+    print ("$infilename is NOT a Multi-FASTA file...\n");
+    exit(1);
+}
+
+while ($line = <FASTAFILE>) {
+    chomp $line;
+    if (substr($line, 0, 1) eq ">") {
+        $i++;
+        ($keys[$i]) = (substr($line, 1) =~ /(\w+)/);
+        $list{$keys[$i]}=$i;
+    } else {
+        push @{$seqs[$i]}, "$line";
+    }
+}
+
+# join the lines of every sequence into one string
+$i=0;
+for $row (@seqs) {
+    $strs[$i++] = join "", @$row;
+}
+
+if (@keys != 2) {
+    print ("mpack needs two FASTA sequences\n");
+    exit(1);
+}
+
+
+# pack bin
+# format from Alex Poliakov's glass2bin.pl script
+
+%base_code = ('-' => 0, 'A' => 1, 'C' => 2, 'T' => 3, 'G' => 4, 'N' => 5,
+              'a' => 1, 'c' => 2, 't' => 3, 'g' => 4, 'n' => 5);
+$l = length $strs[0];
+# the strings are reversed so that chop() yields the columns left to right
+$s1 = reverse($strs[0]);
+$s2 = reverse($strs[1]);
+
+my $out = $tofile ? \*OUTFILE : \*STDOUT;
+
+# One byte per column: the two base codes form the two hex digits.
+# NOTE(review): characters missing from %base_code contribute no digit, so
+# the byte is then built from a single digit -- confirm such input cannot occur.
+for ($i=0; $i<$l; $i++) {
+    print {$out} pack("H2",
+        $base_code{chop($s1)} . $base_code{chop($s2)});
+}
+
+
diff --git a/utils/mpretty.pl b/utils/mpretty.pl
new file mode 100755
index 0000000..a090f03
--- /dev/null
+++ b/utils/mpretty.pl
@@ -0,0 +1,263 @@
+#!/usr/bin/env perl
+
+# defaults
+
+$linelen = 50;
+$interval = 10;
+$labellen = 5;
+$uselabels = 1;
+$useintervals = 1;
+$usecounts = 1;
+$usebase = 0;
+$liststart = 1;
+$listend = 0;
+$usestart = 0;
+$useend = 0;
+
+# constants (lower bounds applied to the corresponding options)
+
+$minlinelen = 10;
+$mininterval = 10;
+$minlabellen = 3;
+
+
+# usage notes
+
+if (@ARGV < 1) {
+    print ("usage:\n mpretty.pl filename\n");
+    print ("options:\n");
+    print (" -linelen value\n");
+    print (" (min: $minlinelen, default: $linelen)\n");
+    print (" -interval value\n");
+    print (" (min: $mininterval, default: $interval, none: 0)\n");
+    print (" -labellen value\n");
+    # the minimum is $minlabellen; the original printed the default twice
+    print (" (min: $minlabellen, default: $labellen, none: 0)\n");
+    print (" -base sequence_name\n");
+    print (" (if used, must specify a sequence on which to base counting\n");
+    print (" -start value\n");
+    print (" (if used, must specify a start coordinate (>=1)\n");
+    print (" -end value\n");
+    print (" (if used, must specify an end coordinate (>=start)\n");
+    print (" -nocounts\n");
+    exit(1);
+}
+
+
+# parse parameters
+# Every test is independent (no elsif): after an option has consumed its
+# value, the remaining tests see that value.
+
+for ($i=1; $i<@ARGV; $i++) {
+    $usecounts = 0 if ($ARGV[$i] eq "-nocounts");
+
+    if ($ARGV[$i] eq "-linelen") {
+        $linelen = $ARGV[++$i];
+        $linelen = $minlinelen if ($linelen < $minlinelen);
+    }
+    if ($ARGV[$i] eq "-interval") {
+        $interval = $ARGV[++$i];
+        # zero or less switches the ruler off
+        $useintervals = 0 if ($interval <= 0);
+        $interval = $mininterval if ($interval < $mininterval);
+    }
+    if ($ARGV[$i] eq "-labellen") {
+        $labellen = $ARGV[++$i];
+        # zero or less switches the labels off
+        $uselabels = 0 if ($labellen <= 0);
+        $labellen = $minlabellen if ($labellen < $minlabellen);
+    }
+    if ($ARGV[$i] eq "-base") {
+        $baseseq = $ARGV[++$i];
+        $usebase = 1;
+    }
+    if ($ARGV[$i] eq "-start") {
+        $usestart = 1;
+        $liststart = $ARGV[++$i];
+    }
+    if ($ARGV[$i] eq "-end") {
+        $useend = 1;
+        $listend = $ARGV[++$i];
+    }
+}
+
+# preprocessing for labels: $labtail is a run of blanks used to pad labels
+
+if ($uselabels) {
+    $labtail = "";
+    for ($i=0; $i<$labellen; $i++) {
+        $labtail .= " ";
+    }
+}
+
+die "Invalid range specified: [$liststart, $listend].\n\n"
+    if (($usestart && ($liststart<1)) || ($useend && ($listend<$liststart)));
+
+# read in Multi-FASTA file
+
+$filename = $ARGV[0];
+open(FASTAFILE, "$filename") || die "Could not open $filename.\n\n";
+
+# Skip everything before the first header line.  Reaching end of file here
+# means there is no header at all (the original loop never terminated in
+# that case).
+$line = <FASTAFILE>;
+while (defined($line) && substr($line, 0, 1) ne ">") {
+    $line = <FASTAFILE>;
+}
+if (!defined($line)) {
+    print ("$filename is NOT a Multi-FASTA file...\n");
+    exit(1);
+}
+
+# For sequence number $i: $keys[$i] is the first run of word characters of
+# its header, $label[$i] the key padded/cut to $labellen characters,
+# $count[$i] a running base count, and $seqs[$i] the list of its lines.
+$i=-1;
+%list=();
+@seqs=();
+
+while (defined($line)) {
+    chomp $line;
+    if (substr($line, 0, 1) eq ">") {
+        $i++;
+        ($keys[$i]) = (substr($line, 1) =~ /(\w+)/);
+        $count[$i]=0;
+        $label[$i] = substr("$keys[$i]$labtail", 0, $labellen);
+        $list{$keys[$i]}=$i;
+    } else {
+        push @{$seqs[$i]}, "$line";
+    }
+    $line = <FASTAFILE>;
+}
+
+# join the lines of every sequence and remember the longest length
+$i=0;
+$maxlen = 0;
+for $row (@seqs) {
+    $strs[$i++] = join "", @$row;
+    $templen = length $strs[$i-1];
+    if ($templen > $maxlen) {
+        $maxlen = $templen;
+    }
+}
+
+# -base must name one of the sequences just read
+$foundseq=0;
+if ($usebase) {
+    foreach $s (@keys) {
+        $foundseq = ($s eq $baseseq) || $foundseq;
+    }
+    if (!$foundseq) { die "Could not find Base Sequence: <$baseseq>\n\n"; }
+}
+
+# preprocessing for counts: total number of bases (either case) per sequence
+
+if ($usecounts) {
+    foreach $s (@keys) {
+        my $seq = $strs[$list{$s}];
+        $tot[$list{$s}] = ($seq =~ tr/ATCGNatcgn//);
+    }
+}
+
+# length of sequence display
+$l=$maxlen;
+if ((!$listend) || ($listend>$maxlen)) {
+    $listend = $maxlen;
+}
+
+# (the original message ended in two backspaces instead of newlines)
+if ($maxlen < $liststart) { die "Starting out of bounds...\n\n"; }
+
+
+if ($usebase) {
+
+# find base sequence position: translate start and end from positions in the
+# ungapped base sequence into alignment columns
+
+    $i=0;
+    $j=0;
+    while ($j<$liststart) {
+        if (substr($strs[$list{$baseseq}], $i, 1) ne "-") {
+            $j++;
+        }
+        $i++;
+    }
+    $liststart = $i;
+    while ($j<$listend) {
+        if (substr($strs[$list{$baseseq}], $i, 1) ne "-") {
+            $j++;
+        }
+        $i++;
+    }
+    $listend = $i;
+}
+
+# pretty print
+
+# Bases in front of the displayed range count towards the running totals.
+# Lower-case bases are counted as well, consistent with the totals above
+# (the original counted upper-case letters only here).
+if ($usecounts) {
+    foreach $s (@keys) {
+        my $head = substr($strs[$list{$s}], 0, $liststart-1);
+        $lc = ($head =~ tr/ATCGNatcgn//);
+        $count[$list{$s}]+=$lc;
+    }
+}
+
+for ($i=$liststart-1; $i<$listend; $i+=$linelen) {
+    # shorten the last chunk so that it ends at $listend
+    if ($listend-$i<$linelen) { $linelen = $listend-$i;}
+    foreach $s (@keys) {
+        if ($uselabels) {
+            print "$label[$list{$s}] : ";
+        }
+        $p = substr($strs[$list{$s}], $i, $linelen);
+        print "$p";
+
+        if ($usecounts) {
+            $lc = ($p =~ tr/ATCGNatcgn//);
+            $count[$list{$s}]+=$lc;
+            print " @ $count[$list{$s}]/$tot[$list{$s}]";
+        }
+
+        print "\n";
+    }
+
+    # ruler line: a column number every $interval columns, padded with blanks
+    if ($useintervals) {
+        if ($uselabels) {
+            print "$labtail = ";
+        }
+        for ($j=$i+1; $j<=$i+$linelen && $j<=$l; $j+=$interval) {
+            $ct = "$j";
+            print $ct;
+            for ($k=0; $k<($interval-(length $ct)); $k++) {
+                print " ";
+            }
+        }
+        print "\n";
+    }
+    print "\n";
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/utils/mproject.pl b/utils/mproject.pl
new file mode 100755
index 0000000..1fef41e
--- /dev/null
+++ b/utils/mproject.pl
@@ -0,0 +1,90 @@
+#!/usr/bin/env perl
+
+# mproject.pl -- project a multiple alignment onto the named sequences:
+# columns in which every named sequence has a gap are dropped, and the
+# remaining columns are printed as FASTA, 60 characters per line.
+if (@ARGV < 2) {
+    print ("usage:\n mproject.pl filename seqname1 [seqname2 ... ]\n");
+    exit(1);
+}
+
+$filename = $ARGV[0];
+
+# every further argument names a sequence to keep
+@targets = @ARGV[1 .. $#ARGV];
+
+open(FASTAFILE, "$filename") || die "Could not open $filename.\n\n";
+
+$line = <FASTAFILE>;
+chomp $line;
+
+$i=0;
+%list=();
+@seqs=();
+
+# The first line must be a header; a sequence's key is the first run of word
+# characters after the ">", and %list maps keys to sequence numbers.
+if (substr($line, 0, 1) eq ">") {
+    ($keys[$i]) = (substr($line, 1) =~ /(\w+)/);
+    $list{$keys[$i]}=$i;
+} else {
+    print ("$filename is NOT a Multi-FASTA file...\n");
+    exit(1);
+}
+
+while ($line = <FASTAFILE>) {
+    chomp $line;
+    if (substr($line, 0, 1) eq ">") {
+        $i++;
+        ($keys[$i]) = (substr($line, 1) =~ /(\w+)/);
+        $list{$keys[$i]}=$i;
+    } else {
+        push @{$seqs[$i]}, "$line";
+    }
+}
+
+# join the lines of every sequence into one string
+$i=0;
+for $row (@seqs) {
+    $strs[$i++] = join "", @$row;
+}
+
+$seqlen = length $strs[0];
+
+# a column is a gap column when all target sequences have "-" there
+for ($i=0; $i<$seqlen; $i++) {
+    $isgap[$i] = 1;
+    foreach $s (@targets) {
+        if (substr($strs[$list{$s}], $i, 1) ne "-") {
+            $isgap[$i] = 0;
+            # one non-gap character settles it ("break" is not a Perl loop exit)
+            last;
+        }
+    }
+}
+
+foreach $s (@targets) {
+    print ">$keys[$list{$s}]\n";
+    $j=0;
+    for ($i=0; $i<$seqlen; $i++) {
+        if(!$isgap[$i]) {
+            print substr($strs[$list{$s}], $i, 1);
+            $j++;
+            if (($j % 60) == 0) {
+                print "\n";
+            }
+        }
+    }
+    print "\n";
+}
+
+
+
+
+
+
+
+
+
+
diff --git a/utils/mrun.pl b/utils/mrun.pl
new file mode 100755
index 0000000..ca34ce1
--- /dev/null
+++ b/utils/mrun.pl
@@ -0,0 +1,267 @@
+#!/usr/bin/env perl
+
+# This script requires the environment variables:
+# LAGAN_DIR and VISTA_DIR
+
+# VISTA .plotfile defaults
+
+($lagandir = $ENV{LAGAN_DIR}) or die "LAGAN_DIR not set";
+
+# VISTA .plotfile defaults; each can be overridden with a "--name value" pair
+$paregmin = 75;
+$paregmax = 100;
+$pamin = 50;
+
+$pbases = 10000;
+$ptickdist = 2000;
+$presolution = 25;
+$pwindow = 40;
+$pnumwindows = 4;
+
+
+if (@ARGV < 1) {
+    print ("usage:\n mrun.pl filename -tree \"(tree...)\"\n");
+    print ("options: [base sequence name [sequence pairs]]\n");
+    print ("default: [base sequence name = first sequence]\n");
+    print ("other MLAGAN parameters...\n");
+    print ("other VISTA parameters...\n");
+    exit(1);
+}
+
+$filename = $ARGV[0];
+
+# MLAGAN options that are followed by a value
+my %takes_value = map { $_ => 1 } qw(-gapstart -gapend -gapcont -gapperseq
+    -match -mismatch -overlap -translate -gfc -ext -glwidth);
+
+# Sort the remaining arguments into
+#   @params  -- options passed on to mlagan ("-name [value]", incl. -tree)
+#   @vparams -- plotfile settings ("--name value")
+#   @targets -- sequence names
+# Values are appended with push; the original index arithmetic overwrote the
+# previous parameter whenever -tree was not the first option.
+@params = ();
+@vparams = ();
+@targets = ();
+$treespec = 0;
+$i = 1;
+while ($i < @ARGV) {
+    if ($ARGV[$i] eq "-tree") {
+        my $tree = "\"$ARGV[++$i]\"";
+        push @params, "-tree", $tree;
+        # the tree is accepted when its parentheses are balanced in number
+        $topen = ($tree =~ tr/(//);
+        $tclose = ($tree =~ tr/)//);
+        $treespec = ($topen == $tclose);
+    } elsif (substr($ARGV[$i],0,2) eq "--") {
+        push @vparams, $ARGV[$i], $ARGV[$i+1];
+        $i++;
+    } elsif (substr($ARGV[$i],0,1) eq "-") {
+        push @params, $ARGV[$i];
+        if ($takes_value{$ARGV[$i]}) {
+            push @params, $ARGV[++$i];
+        }
+    } else {
+        push @targets, $ARGV[$i];
+    }
+    $i++;
+}
+
+for ($i=0; $i<@vparams; $i+=2) {
+    if ($vparams[$i] eq "--regmin") { $paregmin = $vparams[$i+1]; }
+    elsif ($vparams[$i] eq "--regmax") { $paregmax = $vparams[$i+1]; }
+    elsif ($vparams[$i] eq "--min") { $pamin = $vparams[$i+1]; }
+    elsif ($vparams[$i] eq "--bases") { $pbases = $vparams[$i+1]; }
+    elsif ($vparams[$i] eq "--tickdist") { $ptickdist = $vparams[$i+1]; }
+    elsif ($vparams[$i] eq "--resolution") { $presolution = $vparams[$i+1]; }
+    elsif ($vparams[$i] eq "--window") { $pwindow = $vparams[$i+1]; }
+    elsif ($vparams[$i] eq "--numwindows") { $pnumwindows = $vparams[$i+1]; }
+}
+
+if (!$treespec) {
+    print ("Must specify valid phylogenetic tree...\n");
+    exit(1);
+}
+
+if ($lagandir eq "") {
+    print ("Must specify environment variable LAGAN_DIR\n");
+    exit(1);
+}
+
+# Split the input (and its ".masked" companion, when present) into one file
+# per sequence.  mextract.pl prints the names of the files it writes, so
+# empty output is treated as failure.
+$mextstr = "$lagandir/utils/mextract.pl $filename";
+print "$mextstr\n";
+if(!`$mextstr`) { print "\nMulti-FASTA extraction failure...\n"; exit(1); }
+
+if (-e "$filename.masked") {
+    $mextstr = "$lagandir/utils/mextract.pl $filename.masked -masked";
+    print "$mextstr\n";
+    if(!`$mextstr`) {
+        print "\nMasked Multi-FASTA extraction failure...\n";
+        exit(1);
+    }
+}
+open(FASTAFILE, "$filename") || die "Could not open $filename.\n\n";
+
+# Skip everything before the first header line.  Reaching end of file here
+# means there is no header at all (the original loop never terminated in
+# that case).
+$line = <FASTAFILE>;
+while (defined($line) && substr($line, 0, 1) ne ">") {
+    $line = <FASTAFILE>;
+}
+if (!defined($line)) {
+    print ("$filename is NOT a Multi-FASTA file...\n");
+    exit(1);
+}
+
+# Collect the sequence keys (first run of word characters of each header);
+# %list maps a key to its sequence number.
+$i=-1;
+%list=();
+
+while (defined($line)) {
+    chomp $line;
+    if (substr($line, 0, 1) eq ">") {
+        $i++;
+        ($keys[$i]) = (substr($line, 1) =~ /(\w+)/);
+        $list{$keys[$i]}=$i;
+        # without explicit targets the first sequence becomes the base
+        if (($i == 0) && (@targets == 0)) {
+            $targets[0] = $keys[$i];
+            print "Setting Base Sequence: $targets[0]\n";
+        }
+    }
+    $line = <FASTAFILE>;
+}
+
+# per-sequence file names: "<input name without last extension>_<key>.fa"
+$prefix = substr $filename, 0, (rindex $filename, ".");
+$prefix = "$prefix\_";
+
+foreach $s (@keys) {
+    $fnames[$list{$s}] = "$prefix$keys[$list{$s}].fa";
+}
+
+# Targets are either a single base sequence, or a base sequence followed by
+# pairs of sequences (an odd count).
+if ((@targets > 1)) {
+    if (@targets %2 != 1) {
+        $c = @targets;
+        print ("$c sequences: ");
+        print ("Must specify single base sequence\n");
+        print (" OR base sequence and pairs of sequences.\n");
+        exit(1);
+    }
+}
+
+# mlagan command line: all per-sequence files, then all mlagan parameters
+$mfiles = "";
+foreach $s (@fnames) {
+    $mfiles = "$mfiles $s";
+}
+
+$mparams = "";
+foreach $s (@params) {
+    $mparams = "$mparams $s";
+}
+
+$mlagan = "$lagandir/mlagan$mfiles$mparams > $prefix.out";
+print STDERR "\n$mlagan\n\n";
+# stdout is redirected to a file, so any captured output is treated as failure
+if(`$mlagan`) { print "\n\n"; exit(1); }
+
+# With only a base sequence given, pair it with every other sequence.
+$i=0;
+if (@targets == 1) {
+    foreach $s (@keys) {
+        if ($s ne $targets[0]) {
+            $targets[++$i] = $targets[0];
+            $targets[++$i] = $s;
+        }
+    }
+
+}
+
+# For every pair ($targets[$i], $targets[$i+1]): project the alignment onto
+# the pair with mproject.pl, then pack the projection with mf2bin.pl.
+$prjhead = "$lagandir/utils/mproject.pl $prefix.out";
+$binhead = "$lagandir/utils/mf2bin.pl";
+$j=0;
+for($i=1; $i<@targets; $i+=2) {
+    $outprefix = "$prefix$targets[$i]\_$targets[$i+1]";
+    $pargs = "$targets[$i]_aligned $targets[$i+1]_aligned";
+    $pstr = "$prjhead $pargs > $outprefix.prj";
+    print "$pstr\n";
+    if(`$pstr`) { print "\nprojection failure...\n"; exit(1); }
+    $bstr = "$binhead $outprefix.prj -out $outprefix.bin";
+    print "$bstr\n";
+    if(`$bstr`) { print "\npacking failure...\n"; exit(1); }
+    $bins[$j++] = "$outprefix.bin";
+    print "\n";
+}
+
+# distinct sequence names among the targets
+%distinct=();
+foreach $s (@targets) {
+    $distinct{$s} = 0;
+}
+
+@dseqs = keys %distinct;
+
+# Write the VISTA plotfile: one ALIGN section per packed pair, followed by
+# the global plot settings.
+$plotfile = "$prefix.plotfile";
+open (PLOTFILE, ">$plotfile") || die "Could not open $plotfile for writing.\n\n";
+
+print PLOTFILE "TITLE $prefix.fa - mlagan\n\n";
+print PLOTFILE "OUTPUT $prefix.pdf\n\n";
+
+print PLOTFILE "SEQUENCES ";
+foreach $s (@dseqs) {
+    print PLOTFILE "$s ";
+}
+print PLOTFILE "\n\n";
+
+# @bins is in pair order, so pair number n uses targets 2n-1 and 2n
+$i=1;
+foreach $s (@bins) {
+    print PLOTFILE "ALIGN $s BINARY\n";
+    print PLOTFILE " SEQUENCES $targets[$i] $targets[$i+1]\n";
+    print PLOTFILE " REGIONS $paregmin $paregmax\n";
+    print PLOTFILE " MIN $pamin\n";
+    print PLOTFILE "END\n\n";
+    $i+=2;
+}
+
+# the plotfile refers to an annotation file; make sure it exists
+print "touch $prefix.ann\n\n";
+`touch $prefix.ann`;
+
+print PLOTFILE "GENES $prefix.ann\n\n";
+print PLOTFILE "LEGEND on\n\n";
+print PLOTFILE "COORDINATE $targets[0]\n\n";
+print PLOTFILE "PAPER letter\n\n";
+print PLOTFILE "BASES $pbases\n\n";
+print PLOTFILE "TICK_DIST $ptickdist\n\n";
+print PLOTFILE "RESOLUTION $presolution\n\n";
+print PLOTFILE "WINDOW $pwindow\n\n";
+print PLOTFILE "NUM_WINDOWS $pnumwindows\n\n";
+close (PLOTFILE);
+
+#$vistadir = `echo \$VISTA_DIR`;
+#chomp $vistadir;
+
+#if ($vistadir eq "") {
+# print ("Must specify environment variable VISTA_DIR\n");
+# exit(1);
+#}
+
+#$vistastr = "$vistadir/RunVista $plotfile";
+#print "$vistastr\n";
+#if (!`$vistastr`) { print "\nVISTA failure...\n"; exit(1); }
+
+print "\n\nmrun.pl -- end.\n\n";
+
+
+
+
+
+
+
+
+
+
diff --git a/utils/mrunfile.pl b/utils/mrunfile.pl
new file mode 100755
index 0000000..2a20397
--- /dev/null
+++ b/utils/mrunfile.pl
@@ -0,0 +1,111 @@
+#!/usr/bin/env perl
+
+# This script requires the environment variables:
+# LAGAN_DIR and VISTA_DIR
+
+if (@ARGV < 1) {
+ print ("usage:\n mrunfile.pl filename [-pairwise] [-vista]\n\n");
+ exit(1);
+}
+
+($lagandir = $ENV{LAGAN_DIR}) or die "LAGAN_DIR not set";
+
+
+$filename = $ARGV[0];
+open(PARAMFILE, "$filename") || die "Could not open $filename.\n\n";
+
+$pairwise = 0;
+$dovista = 0;
+
+for ($l=1; $l<@ARGV; $l++) {
+ if ($ARGV[$l] eq "-pairwise") {
+ $pairwise = 1;
+ }
+ elsif ($ARGV[$l] eq "-vista") {
+ $dovista = 1;
+ }
+}
+
+$i=0;
+$j=0;
+$k=0;
+$filespec = 0;
+while ($line = <PARAMFILE>) {
+ chomp $line;
+ if ((substr($line, 0, 1) ne "#") && ($line ne "")) {
+ if (!$filespec) {
+ $seqfile = $line;
+ $filespec = 1;
+ } elsif (substr($line,0,1) eq "-") {
+ if (substr($line,0,2) eq "--") {
+ @vparams[$j++] = $line;
+ } else {
+ @params[$i++] = $line;
+ }
+ } else {
+ @seqs[$k++] = $line;
+ }
+ }
+}
+
+if ($lagandir eq "") {
+ print ("Must specify environment variable LAGAN_DIR\n");
+ exit(1);
+}
+
+if ($pairwise) {
+ $mexecs = "mrunpairs.pl";
+} else {
+ $mexecs = "mrun.pl";
+}
+
+$mstr = "$lagandir/utils/$mexecs $seqfile";
+
+foreach $s (@params) {
+ $mstr = "$mstr $s"
+}
+
+foreach $s (@seqs) {
+ $mstr = "$mstr $s"
+}
+
+foreach $s (@vparams) {
+ $mstr = "$mstr $s"
+}
+
+print "$mstr\n";
+`$mstr`;
+
+if($dovista) {
+
+ $prefix = substr $seqfile, 0, (rindex $filename, ".");
+ $prefix = "$prefix\_";
+
+ if ($pairwise) {
+ $prefix="$prefix\pairwise\_";
+ }
+
+ $plotfile = "$prefix.plotfile";
+
+ ($vistadir = $ENV{VISTA_DIR}) or die "VISTA_DIR not set";
+
+ $vistastr = "$vistadir/RunVista $plotfile";
+ print "$vistastr\n";
+ if (!`$vistastr`) { print "\nVISTA failure...\n"; exit(1); }
+
+}
+
+print "\nmrunfile.pl -- end.\n\n";
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/utils/mrunpairs.pl b/utils/mrunpairs.pl
new file mode 100755
index 0000000..f5fa2be
--- /dev/null
+++ b/utils/mrunpairs.pl
@@ -0,0 +1,267 @@
+#!/usr/bin/env perl
+
+# This script requires the environment variables:
+# LAGAN_DIR and VISTA_DIR
+
+# VISTA .plotfile defaults
+
+($lagandir = $ENV{LAGAN_DIR}) or die "LAGAN_DIR not set";
+
+$paregmin = 75;
+$paregmax = 100;
+$pamin = 50;
+
+$pbases = 10000;
+$ptickdist = 2000;
+$presolution = 25;
+$pwindow = 40;
+$pnumwindows = 4;
+
+
+if (@ARGV < 1) {
+ print ("usage:\n mrunpairs.pl filename\n");
+ print ("options: [base sequence name [sequence pairs]]\n");
+ print ("default: [base sequence name = first sequence]\n");
+ print ("other MLAGAN parameters...\n");
+ print ("other VISTA parameters...\n");
+ exit(1);
+}
+
+$filename = $ARGV[0];
+
+$i = 1;
+$j = 0;
+$k = 0;
+$l = 0;
+$treespec = 0;
+while ($i < @ARGV) {
+ if ($ARGV[$i] eq "-tree") {
+ $treepos = $j+1;
+ @params[$j] = "-tree";
+ @params[++$j] = "\"$ARGV[++$i]\"";
+ $_ = @params[$j];
+ $topen = tr/"\("/"\("/;
+ $tclose = tr/"\)"/"\)"/;
+ $treespec = ($topen == $tclose);
+ } else {
+ if (substr($ARGV[$i],0,1) eq "-") {
+ if (substr($ARGV[$i],0,2) eq "--") {
+ @vparams[$l++] = $ARGV[$i++];
+ @vparams[$l++] = $ARGV[$i];
+ } else {
+ $j++;
+ @params[$j] = $ARGV[$i];
+ if ((@params[$j] eq "-gapstart") ||
+ (@params[$j] eq "-gapend") ||
+ (@params[$j] eq "-gapcont") ||
+ (@params[$j] eq "-gapperseq") ||
+ (@params[$j] eq "-match") ||
+ (@params[$j] eq "-mismatch") ||
+ (@params[$j] eq "-overlap") ||
+ (@params[$j] eq "-glwidth")) {
+ @params[++$j] = $ARGV[++$i];
+ }
+ }
+ } else {
+ @targets[$k++] = $ARGV[$i];
+ }
+ }
+ $i++;
+}
+
+for ($i=0; $i<@vparams; $i+=2) {
+ if (@vparams[$i] eq "--regmin") { $paregmin = @vparams[$i+1]; }
+ elsif (@vparams[$i] eq "--regmax") { $paregmax = @vparams[$i+1]; }
+ elsif (@vparams[$i] eq "--min") { $pamin = @vparams[$i+1]; }
+ elsif (@vparams[$i] eq "--bases") { $pbases = @vparams[$i+1]; }
+ elsif (@vparams[$i] eq "--tickdist") { $ptickdist = @vparams[$i+1]; }
+ elsif (@vparams[$i] eq "--resolution") { $presolution = @vparams[$i+1]; }
+ elsif (@vparams[$i] eq "--window") { $pwindow = @vparams[$i+1]; }
+ elsif (@vparams[$i] eq "--numwindows") { $pnumwindows = @vparams[$i+1]; }
+}
+
+if (!$treespec) {
+ $j++;
+ $treepos = $j+1;
+ @params[$j] = "-tree";
+ @params[++$j] = "\"()\"";
+}
+
+if ($lagandir eq "") {
+ print ("Must specify environment variable LAGAN_DIR\n");
+ exit(1);
+}
+
+$mextstr = "$lagandir/mextract.pl $filename";
+print "$mextstr\n";
+if(!`$mextstr`) { print "\nMulti-FASTA extraction failure...\n"; exit(1); }
+
+if (-e "$filename.masked") {
+ $mextstr = "$lagandir/mextract.pl $filename.masked -masked";
+ print "$mextstr\n";
+ if(!`$mextstr`) {
+ print "\nMasked Multi-FASTA extraction failure...\n";
+ exit(1);
+ }
+}
+
+open(FASTAFILE, "$filename") || die "Could not open $filename.\n\n";
+
+$line = <FASTAFILE>;
+chomp $line;
+
+while (substr($line, 0, 1) ne ">") {
+ $line = <FASTAFILE>;
+ chomp $line;
+}
+
+$i=0;
+%list=();
+$i=0;
+%list=();
+
+if (substr($line, 0, 1) eq ">") {
+ $_ = substr($line, 1);
+ /\w+/g;
+ @keys[$i] = $&;
+ $list{@keys[$i]}=$i;
+ if (@targets == 0) {
+ @targets[0] = @keys[$i];
+ print "Setting Base Sequence: @targets[0]\n";
+ }
+} else {
+ print ("$filename is NOT a Multi-FASTA file...\n");
+ exit(1);
+}
+
+while ($line = <FASTAFILE>) {
+ chomp $line;
+ if (substr($line, 0, 1) eq ">") {
+ $i++;
+ $_ = substr($line, 1);
+ /\w+/g;
+ @keys[$i] = $&;
+ $list{@keys[$i]}=$i;
+ }
+}
+
+$fprefix = substr $filename, 0, (rindex $filename, ".");
+$prefix = "$fprefix\_";
+$pprefix = "$fprefix\_pairwise\_";
+
+foreach $s (@keys) {
+ @fnames[$list{$s}] = "$prefix$keys[$list{$s}].fa";
+}
+
+
+if ((@targets > 1)) {
+ if (@targets %2 != 1) {
+ $c = @targets;
+ print ("$c sequences: ");
+ print ("Must specify single base sequence\n");
+ print (" OR base sequence and pairs of sequences.\n");
+ exit(1);
+ }
+}
+
+$i=0;
+if (@targets == 1) {
+ foreach $s (@keys) {
+ if ($s ne @targets[0]) {
+ @targets[++$i] = @targets[0];
+ @targets[++$i] = $s;
+ }
+ }
+
+}
+
+$j=0;
+for($i=1; $i<@targets; $i+=2) {
+
+ $outprefix = "$pprefix@targets[$i]\_@targets[$i+1]";
+
+ $mfiles = " @fnames[$list{@targets[$i]}] @fnames[$list{@targets[$i+1]}]";
+
+ @params[$treepos]="\"(@targets[$i] @targets[$i+1])\"";
+
+ $mparams = "";
+ foreach $s (@params) {
+ $mparams = "$mparams $s";
+ }
+
+ $mlagan = "$lagandir/mlagan$mfiles$mparams > $outprefix.out";
+ print "\n$mlagan\n\n";
+ if(`$mlagan`) { print "\n\n"; exit(1); }
+
+ $binhead = "$lagandir/mpack.pl";
+ $bstr = "$binhead $outprefix.out -out $outprefix.bin";
+ print "$bstr\n";
+ if(`$bstr`) { print "\npacking failure...\n"; exit(1); }
+ @bins[$j++] = "$outprefix.bin";
+ print "\n";
+
+}
+
+
+%distinct=();
+foreach $s (@targets) {
+ $distinct{$s} = 0;
+}
+
+@dseqs = keys %distinct;
+
+
+$plotfile = "$pprefix.plotfile";
+open (PLOTFILE, ">$plotfile");
+
+print PLOTFILE "TITLE $prefix.fa - mlagan\n\n";
+print PLOTFILE "OUTPUT $pprefix.pdf\n\n";
+
+print PLOTFILE "SEQUENCES ";
+foreach $s (@dseqs) {
+ print PLOTFILE "$s ";
+}
+print PLOTFILE "\n\n";
+
+$i=1;
+foreach $s (@bins) {
+ print PLOTFILE "ALIGN $s BINARY\n";
+ print PLOTFILE " SEQUENCES @targets[$i] @targets[$i+1]\n";
+ print PLOTFILE " REGIONS $paregmin $paregmax\n";
+ print PLOTFILE " MIN $pamin\n";
+ print PLOTFILE "END\n\n";
+ $i+=2;
+}
+
+print "touch $prefix.ann\n\n";
+`touch $prefix.ann`;
+
+print PLOTFILE "GENES $prefix.ann\n\n";
+print PLOTFILE "LEGEND on\n\n";
+print PLOTFILE "COORDINATE @targets[0]\n\n";
+print PLOTFILE "PAPER letter\n\n";
+print PLOTFILE "BASES $pbases\n\n";
+print PLOTFILE "TICK_DIST $ptickdist\n\n";
+print PLOTFILE "RESOLUTION $presolution\n\n";
+print PLOTFILE "WINDOW $pwindow\n\n";
+print PLOTFILE "NUM_WINDOWS $pnumwindows\n\n";
+
+#$vistadir = `echo \$VISTA_DIR`;
+#chomp $vistadir;
+
+#if ($vistadir eq "") {
+# print ("Must specify environment variable VISTA_DIR\n");
+# exit(1);
+#}
+
+#$vistastr = "$vistadir/RunVista $plotfile";
+#print "$vistastr\n";
+#if (!`$vistastr`) { print "\nVISTA failure...\n"; exit(1); }
+
+
+print "\n\nmrunpairs.pl -- end.\n\n";
+
+
+
+
+
diff --git a/utils/msplit.pl b/utils/msplit.pl
new file mode 100755
index 0000000..d77334b
--- /dev/null
+++ b/utils/msplit.pl
@@ -0,0 +1,90 @@
+#!/usr/bin/env perl
+
+if (@ARGV < 1) {
+ print ("usage:\n msplit.pl filename [-masked]\n");
+ exit(1);
+}
+
+$masked=0;
+$filename = $ARGV[0];
+if(@ARGV==2) {
+ if ($ARGV[1] eq "-masked") {
+ $masked = 1;
+ }
+}
+
+open(FASTAFILE, "$filename") || die "Could not open $filename.\n\n";
+
+#$prefix = substr $filename, 0, (rindex $filename, ".");
+#if ($masked || index ($filename, ".masked") != -1) {
+# $prefix = substr $filename, 0, (rindex $prefix, ".");
+#}
+
+
+$line = <FASTAFILE>;
+chomp $line;
+
+while (substr($line, 0, 1) ne ">") {
+ $line = <FASTAFILE>;
+ chomp $line;
+}
+
+$suffix = "fa";
+if ($masked) {
+ $suffix = "$suffix.masked";
+}
+
+if (substr($line, 0, 1) eq ">") {
+ $name = substr($line, 1);
+ if (index ($name, " ") != -1){
+ $name = substr($name, 0, index ($name, " "));
+ }
+ if (substr ($name, length ($name) - 1) eq ","){
+ $name = substr($name, 0, length ($name) - 1);
+ }
+# $name = substr($line, 1);
+# $_ = substr($line, 1);
+# /\w+/g;
+# $name = $&;
+
+# substr($line, 1)." " =~ /(.+)[,]\s+/g;
+# $name = $1;
+
+ $fname = "$name.$suffix";
+ print("$fname\n");
+ open(OUTFILE, ">$fname");
+ print OUTFILE ">$name\n";
+} else {
+ print ("$filename is NOT a Multi-FASTA file...\n");
+ exit(1);
+}
+
+while ($line = <FASTAFILE>) {
+ chomp $line;
+ if (substr($line, 0, 1) eq ">") {
+ close OUTFILE;
+
+# substr($line, 1)." " =~ /(.+)[,]\s/g;
+# $name = $1;
+
+ $name = substr($line, 1);
+ if (index ($name, " ") != -1){
+ $name = substr($name, 0, index ($name, " "));
+ }
+ if (substr ($name, length ($name) - 1) eq ","){
+ $name = substr($name, 0, length ($name) - 1);
+ }
+# $_ = substr($line, 1);
+# /\w+/g;
+# $name = $&;
+
+ $fname = "$name.$suffix";
+ print("$fname\n");
+ open(OUTFILE, ">$fname");
+ print OUTFILE ">$name\n";
+ } else {
+ print OUTFILE "$line";
+ }
+}
+
+close OUTFILE;
diff --git a/utils/mviz.pl b/utils/mviz.pl
new file mode 100755
index 0000000..d402021
--- /dev/null
+++ b/utils/mviz.pl
@@ -0,0 +1,222 @@
+#!/usr/bin/env perl
+
+
+# This script requires the environment variables:
+# LAGAN_DIR and VISTA_DIR
+
+($lagandir = $ENV{LAGAN_DIR}) or die "LAGAN_DIR not set";
+
+$paregmin = 75;
+$paregmax = 100;
+$pamin = 50;
+
+$pbases = 10000;
+$ptickdist = 2000;
+$presolution = 25;
+$pwindow = 40;
+$pnumwindows = 4;
+
+
+if (@ARGV < 2) {
+ print ("usage:\n mviz.pl data_file param_file [plotfile]\n\n");
+ exit(1);
+}
+
+$pfspec = 0;
+if (@ARGV==3) {
+ $pfspec = 1;
+ $plotfile=@ARGV[2];
+ print "Using VISTA plotfile: $plotfile\n";
+}
+
+
+$filename = $ARGV[1];
+open(PARAMFILE, "$filename") || die "Could not open $filename.\n\n";
+
+$i=0;
+$j=0;
+$k=0;
+$filespec = 0;
+while ($line = <PARAMFILE>) {
+ chomp $line;
+ if ((substr($line, 0, 1) ne "#") && ($line ne "")) {
+ if (!$filespec) {
+ $seqfile = $line;
+ $filespec = 1;
+ } elsif (substr($line,0,1) eq "-") {
+ if (substr($line,0,2) eq "--") {
+ @vparams[$j++] = $line;
+ } else {
+ @params[$i++] = $line;
+ }
+ } else {
+ @targets[$k++] = $line;
+ }
+ }
+}
+
+$seqfile = @ARGV[0];
+
+if ($lagandir eq "") {
+ print ("Must specify environment variable LAGAN_DIR\n");
+ exit(1);
+}
+
+for ($i=0; $i<@vparams; $i+=2) {
+ if (@vparams[$i] eq "--regmin") { $paregmin = @vparams[$i+1]; }
+ elsif (@vparams[$i] eq "--regmax") { $paregmax = @vparams[$i+1]; }
+ elsif (@vparams[$i] eq "--min") { $pamin = @vparams[$i+1]; }
+ elsif (@vparams[$i] eq "--bases") { $pbases = @vparams[$i+1]; }
+ elsif (@vparams[$i] eq "--tickdist") { $ptickdist = @vparams[$i+1]; }
+ elsif (@vparams[$i] eq "--resolution") { $presolution = @vparams[$i+1]; }
+ elsif (@vparams[$i] eq "--window") { $pwindow = @vparams[$i+1]; }
+ elsif (@vparams[$i] eq "--numwindows") { $pnumwindows = @vparams[$i+1]; }
+}
+
+open(FASTAFILE, "$seqfile") || die "Could not open $seqfile.\n\n";
+
+$prefix = substr $seqfile, 0, (rindex $seqfile, ".");
+if (substr($prefix, -1, 1) ne "_") {$prefix = "$prefix\_";}
+
+$line = <FASTAFILE>;
+chomp $line;
+
+while (substr($line, 0, 1) ne ">") {
+ $line = <FASTAFILE>;
+ chomp $line;
+}
+
+$i=0;
+%list=();
+
+if (substr($line, 0, 1) eq ">") {
+ @keys[$i] = substr($line, 1);
+
+ $list{@keys[$i]}=$i;
+
+ if (@targets == 0) {
+ @targets[0] = @keys[$i];
+ print "Setting Base Sequence: @targets[0]\n";
+ }
+} else {
+ print ("$filename is NOT a Multi-FASTA file...\n");
+ exit(1);
+}
+
+while ($line = <FASTAFILE>) {
+ chomp $line;
+
+ if (substr($line, 0, 1) eq ">") {
+ $i++;
+ @keys[$i] = substr($line, 1);
+
+ $list{@keys[$i]}=$i;
+ }
+}
+
+if ((@targets > 1)) {
+
+ $j=0;
+ for ($i=1; $i<@targets; $i++) {
+ $_ = @targets[$i];
+ @bp[$j++]=/\w+/g;
+ $_=$&;
+ @bp[$j++]=/\w+/g;
+ }
+ $j=1;
+ foreach $s (@bp) {
+ @targets[$j++]=$s;
+ }
+ if (@targets %2 != 1) {
+ $c = @targets;
+ print ("$c sequences: ");
+ print ("Must specify single base sequence\n");
+ print (" OR base sequence and pairs of sequences.\n");
+ exit(1);
+ }
+}
+
+$i=0;
+if (@targets == 1) {
+ foreach $s (@keys) {
+# $s = substr $s, 0, (rindex $s, "_aligned");
+ if ($s ne @targets[0]) {
+ @targets[++$i] = @targets[0];
+ @targets[++$i] = $s;
+ }
+ }
+}
+
+print "TARGETS:\n";foreach $s (@targets) { print "\"$s\"\n"; }
+
+$prjhead = "$lagandir/utils/mproject.pl $seqfile";
+$binhead = "$lagandir/utils/mf2bin.pl";
+$j=0;
+for($i=1; $i<@targets; $i+=2) {
+ $outprefix = "$prefix@targets[$i]\_@targets[$i+1]";
+ $pargs = "$targets[$i] $targets[$i+1]";
+ $pstr = "$prjhead $pargs > $outprefix.prj";
+ print "$pstr\n";
+ if(`$pstr`) { print "\nprojection failure...\n"; exit(1); }
+ $bstr = "$binhead $outprefix.prj -out $outprefix.bin";
+ print "$bstr\n";
+ if(`$bstr`) { print "\npacking failure...\n"; exit(1); }
+ @bins[$j++] = "$outprefix.bin";
+ print "\n";
+}
+
+%distinct=();
+foreach $s (@targets) {
+ $distinct{$s} = 0;
+}
+
+@dseqs = keys %distinct;
+
+if (!$pfspec) {
+
+ $plotfile = "$prefix.plotfile";
+ open (PLOTFILE, ">$plotfile");
+
+ print PLOTFILE "TITLE $prefix.fa - mlagan\n\n";
+ print PLOTFILE "OUTPUT $prefix.pdf\n\n";
+
+ print PLOTFILE "SEQUENCES ";
+ foreach $s (@dseqs) {
+ print PLOTFILE "$s ";
+ }
+ print PLOTFILE "\n\n";
+
+ $i=1;
+ foreach $s (@bins) {
+ print PLOTFILE "ALIGN $s BINARY\n";
+ print PLOTFILE " SEQUENCES @targets[$i] @targets[$i+1]\n";
+ print PLOTFILE " REGIONS $paregmin $paregmax\n";
+ print PLOTFILE " MIN $pamin\n";
+ print PLOTFILE "END\n\n";
+ $i+=2;
+ }
+
+ print "touch $prefix.ann\n\n";
+ `touch $prefix.ann`;
+
+ print PLOTFILE "GENES $prefix.ann GFF\n\n";
+ print PLOTFILE "LEGEND on\n\n";
+ print PLOTFILE "COORDINATE @targets[0]\n\n";
+ print PLOTFILE "PAPER letter\n\n";
+ print PLOTFILE "BASES $pbases\n\n";
+ print PLOTFILE "TICK_DIST $ptickdist\n\n";
+ print PLOTFILE "RESOLUTION $presolution\n\n";
+ print PLOTFILE "WINDOW $pwindow\n\n";
+ print PLOTFILE "NUM_WINDOWS $pnumwindows\n\n";
+
+}
+
+($vistadir = $ENV{VISTA_DIR}) or die "VISTA_DIR not set";
+
+$vistastr = "$vistadir/RunVista $plotfile";
+print "$vistastr\n";
+if (!`$vistastr`) { print "\nVISTA failure...\n"; exit(1); }
+
+print "\n\nmviz.pl -- end.\n\n";
+
+
diff --git a/xmfa2mfa.pl b/xmfa2mfa.pl
new file mode 100755
index 0000000..5dd1391
--- /dev/null
+++ b/xmfa2mfa.pl
@@ -0,0 +1,65 @@
+#!/usr/bin/perl
+
+use strict;
+
+$0 = rindex($0, "/") > -1 ? substr($0, rindex($0, "/")+1) : $0;
+
+my (@lines, @filt_lines);
+my ($line, $line_in, $type);
+
+my $mode = ($ARGV[0] eq "1" ? "M1" : ($ARGV[0] eq "2" ? "M2" : die("$0: Invalid base genome argument (expected 1 or 2)")));
+
+die("$0: LAGAN_DIR not defined. Stopped") unless defined $ENV{"LAGAN_DIR"};
+
+while (<STDIN>) {
+ $line_in = $_;
+ if ($line_in =~ /^\=.*(DM|M1|M2)$/) {
+ $type = $1; $line .= $line_in;
+ $lines[$#lines+1] = $line if $type eq "DM" or $type eq $mode;
+ undef $line; undef $type;
+ } else {
+ $line .= $line_in;
+ }
+}
+
+foreach my $line (@lines) {
+ if ($mode eq "M2") {
+ $line =~ /(\>[^\s\n]+\s([\+\-])[^\n]+)\n(.+)\n(\>[^\s\n]+\s([\+\-])[^\n]+)\n(.+)\n(\=.+?)\n/s;
+# $line =~ /(\>[^\s\n]+\s([\+\-])[^\n]+)\n([^\n]+)\n(\>[^\s\n]+\s([\+\-])[^\n]+)\n([^\n]+)\n(\=.+?\n)/s;
+
+ my ($head1, $strand1, $seq1, $head2, $strand2, $seq2, $foot) = ($1, $2, $3, $4, $5, $6, $7);
+
+ die if $strand1 ne $strand2;
+ if ($strand1 eq "-") {
+ $seq1 =~ s/\n//g;
+ $seq2 =~ s/\n//g;
+ $seq1 = reverse($seq1);
+ $seq2 = reverse($seq2);
+ $seq1 =~ s/(.{80})/$1\n/g;
+ $seq2 =~ s/(.{80})/$1\n/g;
+ }
+ $line = $head2."\n".$seq2."\n".$head1."\n".$seq1."\n".$foot."\n";
+ }
+ push @filt_lines, $line;
+}
+
+open(OUT, "> tmp.xmfa");
+foreach my $line (@filt_lines) { print OUT $line; }
+close OUT;
+
+system($ENV{"LAGAN_DIR"}."/utils/Glue tmp.xmfa > glue.out 2> glue.err");
+
+open(IN, "< glue.out");
+my @glue_out = <IN>;
+close IN;
+
+open(IN, "< glue.err");
+my @glue_err = <IN>;
+close IN;
+
+unlink("tmp.xmfa");
+unlink("glue.out");
+unlink("glue.err");
+
+print STDOUT @glue_out;
+print STDERR @glue_err;
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/lagan.git
More information about the debian-med-commit
mailing list